def test_policy_iteration():
    r = -0.04
    m = mdp.MDP(
        states=problem_2.states,
        actions=problem_2.actions,
        reward={
            (3, 1): r, (3, 2): r, (3, 3): r, (3, 4): 1.0,
            (2, 1): r, (2, 3): r, (2, 4): -1.0,
            (1, 1): r, (1, 2): r, (1, 3): r, (1, 4): r
        },
        allowed_transitions=problem_2.allowed_transitions,
        p_action_given_desired_action=problem_2.p_action_given_desired_action,
        gamma=0.999999  # can't use 1.0
    )
    U, policy = m.policy_iteration()
    assert_optimal_policy(U, policy)

def mdpVI():
    r = float(sys.argv[1])
    m = mdp.MDP(
        states=problem_2.states,
        actions=problem_2.actions,
        reward={
            (3, 1): r, (3, 2): r, (3, 3): r, (3, 4): 1.0,
            (2, 1): r, (2, 3): r, (2, 4): -1.0,
            (1, 1): r, (1, 2): r, (1, 3): r, (1, 4): r
        },
        allowed_transitions=problem_2.allowed_transitions,
        p_action_given_desired_action=problem_2.p_action_given_desired_action,
        gamma=0.99999)
    U, policy = m.value_iteration(0.001)
    for row in (3, 2, 1):
        for col in (1, 2, 3, 4):
            state = (row, col)
            if state in policy:
                print('policy for state {} is {} with utility {}'.format(
                    state, policy[state], U[state]))

def train_both_players(iterations, rate, discount=0.99):
    print("Training for both the players ............")
    for _ in tqdm.trange(iterations):
        state = "123456789"
        first_states_list = []
        second_states_list = []
        for step in range(10):
            if state not in cache:
                cache[state] = mdp.MDP(state)
            if cache[state].is_terminal_state:
                first_states_list.append(state)
                second_states_list.append(state)
                break
            if step % 2 == 0:
                # First player move
                first_states_list.append(state)
                state = explore(state, 'f')
            else:
                # Second player move
                second_states_list.append(state)
                state = explore(state, 's')
        train_player_per_episode(first_states_list, 'f', rate, discount)
        train_player_per_episode(second_states_list, 's', rate, discount)

def test_policy_evaluation():
    m = mdp.MDP(
        states=two_by_two_states,
        actions=two_by_two_actions,
        reward=two_by_two_reward,
        allowed_transitions=two_by_two_allowed_transitions,
        p_action_given_desired_action=two_by_two_p_action_given_desired_action,
        gamma=0.3)
    # this policy only gets to the terminal state by accident
    policy = {
        (1, 1): {'d': (2, 1)},
        (1, 2): {'d': (2, 2)},
        (2, 1): {'u': (1, 1)},
        (2, 2): {}  # (2, 2) is a terminal state
    }
    U = {state: 0.0 for state in two_by_two_states}
    U_update = m.policy_evaluation(policy, U)
    assert -0.125 < U_update[(1, 1)] < -0.124
    assert 0.140 < U_update[(1, 2)] < 0.141
    assert -0.103 < U_update[(2, 1)] < -0.102
    assert U_update[(2, 2)] == 1.0

def test_random_policy():
    m = mdp.MDP(
        states=two_by_two_states,
        actions=two_by_two_actions,
        reward=two_by_two_reward,
        allowed_transitions=two_by_two_allowed_transitions,
        p_action_given_desired_action=two_by_two_p_action_given_desired_action,
        gamma=0.0)
    for _ in range(100):
        policy = m.get_random_policy()
        assert policy[(1, 1)] in [{'d': (2, 1)}, {'r': (1, 2)}]
        assert policy[(1, 2)] in [{'d': (2, 2)}, {'l': (1, 1)}]
        assert policy[(2, 1)] in [{'u': (1, 1)}, {'r': (2, 2)}]
        assert len(policy[(2, 2)]) == 0  # terminal state has no actions

def exploit(state, player_symbol):
    """Greedy move: return the action with the highest learned value."""
    if state not in cache:
        cache[state] = mdp.MDP(state)
    if player_symbol == 'f':
        values = cache[state].values_f
    elif player_symbol == 's':
        values = cache[state].values_s
    else:
        raise ValueError("player_symbol must be 'f' or 's'")
    max_val_index = values.index(max(values))
    return cache[state].actions[max_val_index]

def getTrueMDP(self):
    obsProbs = self.initOProbs()
    obsProbs[(0, self.LEFT_ACTION)][self.STAY_WITH_01_OUTCOME] = 1.0
    obsProbs[(0, self.RIGHT_ACTION)][self.STAY_WITH_0_OUTCOME] = 0.8
    obsProbs[(0, self.RIGHT_ACTION)][self.RIGHT_OUTCOME] = 0.2
    for i in range(1, self.NUM_STATES - 1):
        obsProbs[(i, self.RIGHT_ACTION)][self.LEFT_OUTCOME] = 0.2
        obsProbs[(i, self.RIGHT_ACTION)][self.RIGHT_OUTCOME] = 0.8
        obsProbs[(i, self.LEFT_ACTION)][self.LEFT_OUTCOME] = 1.0
    last = self.NUM_STATES - 1
    obsProbs[(last, self.LEFT_ACTION)][self.LEFT_OUTCOME] = 1.0
    obsProbs[(last, self.RIGHT_ACTION)][self.STAY_WITH_1_OUTCOME] = 0.8
    obsProbs[(last, self.RIGHT_ACTION)][self.LEFT_OUTCOME] = 0.2
    return mdp.MDP(self, obsProbs)

def train_second_player(num_of_iterations, rate):
    print("\nTraining second player, please wait .... \n")
    for _ in tqdm.trange(num_of_iterations):
        state = "123456789"
        states_list = []
        for step in range(10):
            if state not in cache:
                cache[state] = mdp.MDP(state)
            if cache[state].is_terminal_state:
                states_list.append(state)
                break
            if step % 2 == 0:
                # First player: random moves
                state = random_player(state, 'f')
            else:
                # Second player: exploration/exploitation trade-off
                states_list.append(state)
                state = explore(state, 's')
        train_player_per_episode(states_list, symbol='s',
                                 learning_rate=rate, discount_factor=0.99)

def test_evaluate_bellman_equation():
    m = mdp.MDP(
        states=two_by_two_states,
        actions=two_by_two_actions,
        reward=two_by_two_reward,
        allowed_transitions=two_by_two_allowed_transitions,
        p_action_given_desired_action=two_by_two_p_action_given_desired_action,
        gamma=0.3)
    U = {(1, 1): 0.0, (1, 2): 0.0, (2, 1): 0.0, (2, 2): 0.0}
    U_next = {state: 0.0 for state in m.states}

    # test a state with maximizing action moving to a non-terminal state
    (U_next[(1, 1)], maximizing_action) = m.evaluate_bellman_equation(U, (1, 1))
    assert U_next[(1, 1)] == (m.reward[(1, 1)] + m.gamma *
                              (0.8 * U[(2, 1)] + 0.1 * U[(1, 2)] + 0.1 * U[(1, 1)]))
    # assert maximizing_action == 'd'

    # test a state with maximizing action moving to a terminal state
    (U_next[(1, 2)], maximizing_action) = m.evaluate_bellman_equation(U, (1, 2))
    assert U_next[(1, 2)] == (m.reward[(1, 2)] + m.gamma *
                              (0.8 * U[(2, 2)] + 0.1 * U[(1, 1)] + 0.1 * U[(1, 2)]))
    # assert maximizing_action == 'd'

    # test another state with maximizing action moving to a terminal state
    (U_next[(2, 1)], maximizing_action) = m.evaluate_bellman_equation(U, (2, 1))
    assert U_next[(2, 1)] == (m.reward[(2, 1)] + m.gamma *
                              (0.8 * U[(2, 2)] + 0.1 * U[(1, 1)] + 0.1 * U[(2, 1)]))

    # test the terminal state
    (U_next[(2, 2)], maximizing_action) = m.evaluate_bellman_equation(U, (2, 2))
    assert maximizing_action is None
    assert U_next[(2, 2)] == 1.0

    U = U_next.copy()

    # after one sweep, test a state whose maximizing action moves to a
    # non-terminal state
    (U_next[(1, 1)], maximizing_action) = m.evaluate_bellman_equation(U, (1, 1))
    assert U_next[(1, 1)] == (m.reward[(1, 1)] + m.gamma *
                              (0.8 * U[(2, 1)] + 0.1 * U[(1, 2)] + 0.1 * U[(1, 1)]))
    assert maximizing_action == 'r'

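# For reference, the backup these tests exercise is the Bellman optimality
# equation (assuming evaluate_bellman_equation implements the standard form):
#     U'(s) = R(s) + gamma * max_a sum_{s'} P(s' | s, a) * U(s')
# where the 0.8/0.1/0.1 split over outcomes comes from the action noise model
# supplied via p_action_given_desired_action.
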
def getTrueMDP(self):
    obsProbs = self.initOProbs()
    walls = [(0, 1), (2, 1), (3, 2)]
    pit = (1, 3)
    for s, a in obsProbs:
        (x, y) = s
        for o in [self.NORTH_OUTCOME, self.SOUTH_OUTCOME,
                  self.EAST_OUTCOME, self.WEST_OUTCOME]:
            oreal = o
            nextState = self.getNextStateInner(x, y, o)
            if nextState == pit:
                oreal = self.FELL_PIT_OUTCOME
            if nextState in walls:
                oreal = self.HIT_WALL_OUTCOME
            # the desired direction occurs with probability 0.7,
            # each other direction with probability 0.1
            if o == a:
                obsProbs[(s, a)][oreal] += 0.7
            else:
                obsProbs[(s, a)][oreal] += 0.1
    return mdp.MDP(self, obsProbs)

def train_first_player(num_of_iterations, rate=0.01, discount_factor=0.99):
    """
    Plays tic-tac-toe games between the learning first player and a random
    second player. All the states produced within each game are stored in a
    list and forwarded to train_player_per_episode for the actual training.
    """
    print("\nTraining first player, please wait .... \n")
    for _ in tqdm.trange(num_of_iterations):
        state = "123456789"
        states_list = []
        for step in range(10):
            if state not in cache:
                cache[state] = mdp.MDP(state)
            if cache[state].is_terminal_state:
                states_list.append(state)
                break
            if step % 2 == 0:
                # First player: exploration/exploitation trade-off
                states_list.append(state)
                state = explore(state, 'f')
            else:
                # Random player move
                state = random_player(state, 's')
        train_player_per_episode(states_list, symbol='f',
                                 learning_rate=rate,
                                 discount_factor=discount_factor)

def test_get_resulting_action_list():
    m = mdp.MDP(
        states=problem_2.states,
        actions=problem_2.actions,
        reward={},
        allowed_transitions=problem_2.allowed_transitions,
        p_action_given_desired_action=problem_2.p_action_given_desired_action,
        gamma=0.0)
    # from state (1, 1) and desired action 'u' we can move
    #   up with probability 0.8
    #   stay (blocked left) with probability 0.1
    #   right with probability 0.1
    resulting_action_list = m.get_resulting_action_list((1, 1), 'u')
    assert len(resulting_action_list) == 3
    assert ('u', 0.8, (2, 1)) in resulting_action_list
    assert ('l', 0.1, (1, 1)) in resulting_action_list
    assert ('r', 0.1, (1, 2)) in resulting_action_list

    # from (2, 1) both lateral moves are blocked, so the agent stays put
    resulting_action_list = m.get_resulting_action_list((2, 1), 'u')
    assert len(resulting_action_list) == 3
    assert ('u', 0.8, (3, 1)) in resulting_action_list
    assert ('l', 0.1, (2, 1)) in resulting_action_list
    assert ('r', 0.1, (2, 1)) in resulting_action_list

def create_grid(x, y, terminals, discount):
    transitions = []
    states = []
    for j in range(y):
        for i in range(x):
            name = _name(i, j)
            states.append(name)
            reward = -1
            if (i, j) in terminals:
                reward = 0
            # moves off the edge leave the agent in place
            transitions.append((name, 'n', _name(i, max(j - 1, 0)), 1, reward))
            transitions.append((name, 'e', _name(min(i + 1, x - 1), j), 1, reward))
            transitions.append((name, 's', _name(i, min(j + 1, y - 1)), 1, reward))
            transitions.append((name, 'w', _name(max(i - 1, 0), j), 1, reward))
    return m.MDP(states, ['n', 'e', 's', 'w'], transitions,
                 {_name(t[0], t[1]) for t in terminals}, discount)

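# A minimal usage sketch: the classic 4x4 gridworld with terminal states in
# opposite corners (the specific arguments are illustrative, not taken from
# the source):
#   grid_mdp = create_grid(4, 4, terminals=[(0, 0), (3, 3)], discount=1.0)
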
def setUp(cls):
    cls.models = {
        'sysadmin': """
            computer(c1).
            computer(c2).
            computer(c3).

            connected(c1, [c2, c3]).
            connected(c2, [c1]).
            connected(c3, [c1]).

            accTotal([], A, A).
            accTotal([_|T], A, X) :- B is A + 1, accTotal(T, B, X).
            total(L, T) :- accTotal(L, 0, T).
            total_connected(C, T) :- connected(C, L), total(L, T).

            accAlive([], A, A).
            accAlive([H|T], A, X) :- running(H, 0), B is A + 1, accAlive(T, B, X).
            accAlive([H|T], A, X) :- not(running(H, 0)), B is A, accAlive(T, B, X).
            alive(L, A) :- accAlive(L, 0, A).
            total_running(C, R) :- connected(C, L), alive(L, R).

            state_fluent(running(C)) :- computer(C).

            action(reboot(none)).
            action(reboot(C)) :- computer(C).

            1.00::running(C, 1) :- reboot(C).
            0.05::running(C, 1) :- not(reboot(C)), not(running(C, 0)).
            P::running(C, 1) :- not(reboot(C)), running(C, 0),
                total_connected(C, T), total_running(C, R),
                P is 0.45 + 0.50 * R / T.

            utility(running(C, 0), 1.00) :- computer(C).
            utility(reboot(C), -0.75) :- computer(C).
            utility(reboot(none), 0.00).
        """
    }
    cls.mdp = mdp.MDP(cls.models['sysadmin'])
    cls.vi = vi.ValueIteration(cls.mdp)

def initialize():
    global board, player, graphics, mdp, valueIteration, qLearn, stateHandler
    if len(sys.argv) <= 2:
        print("error: not enough arguments provided. Provide level json file "
              "followed by string 'v' or 'q' for value iteration or "
              "q-learning respectively")
        exit()
    if "json" not in sys.argv[1]:
        print("error: level json argument must be first argument")
        exit()
    if sys.argv[2] != 'v' and sys.argv[2] != 'q':
        print("error: AI algorithm type not specified. Please use 'v' for "
              "value iteration or 'q' for q-learning")
        exit()
    rewardArgument = None
    livingReward = None
    iterations = None
    learningRate = None
    epsilon = None
    if sys.argv[2] == 'v':
        if len(sys.argv) != 6:
            print("Error: value iteration requires reward discount, living "
                  "reward, and number of iterations.\n"
                  "python main.py level.json v 0.8 -1 50")
            exit()
        valueIteration = True
        iterations = int(sys.argv[5])
    else:
        if len(sys.argv) != 7:
            print("Error: q-learning requires reward discount, living reward, "
                  "learning rate, and epsilon.\n"
                  "python main.py level.json q 0.8 -1 0.2 0.9")
            exit()
        valueIteration = False
        learningRate = float(sys.argv[5])
        epsilon = float(sys.argv[6])
    rewardArgument = float(sys.argv[3])
    livingReward = float(sys.argv[4])
    board = boardLibrary.Board(sys.argv[1])
    player = playerLibrary.Player(board.playerPosition[0],
                                  board.playerPosition[1])
    startingState = state.State((player.x, player.y),
                                [(key.x, key.y) for key in board.keys])
    if valueIteration:
        mdp = mdpLibrary.MDP(startingState, rewardArgument, livingReward,
                             iterations)
        qLearn = None
    else:
        qLearn = qLearningLibrary.QLearn(startingState, rewardArgument,
                                         livingReward, learningRate, epsilon)
        mdp = None
    graphics = graphicsLibrary.Graphics()

def plotCurve():
    thresholdCurves = getThresholdCurves()
    approachOptim = "optimistic"
    approachPess = "pessimistic"
    scenariosOptim = []
    scenariosPess = []
    mdp1 = mdp.MDP(thresholdCurves)
    # piecewise-constant removal strategy: 20 steps at the first threshold,
    # then 15 at the second, then 15 at the third
    seqOfAction = [5000, 12000, 1000]
    actions = []
    for j in range(20):
        actions.append(seqOfAction[0])
    for j in range(15):
        actions.append(seqOfAction[1])
    for j in range(15):
        actions.append(seqOfAction[2])
    statesOptim = []
    statesPess = []
    timeStep = 2
    debris0 = thresholdCurves[4].totDebris[0]
    totDebrisLevel = debris0
    state1O = mdp1.state(totDebrisLevel, START_YEAR)
    state1P = mdp1.state(totDebrisLevel, START_YEAR)
    statesOptim.append(debris0)
    statesPess.append(debris0)
    yr = 2017
    i = 0
    while (yr - START_YEAR) < 100:
        expLostAssets, totDebrisLevelO, expRemoved, targetThreshold = \
            mdp1.transitionOfStates(state1O, actions[i], approachOptim, timeStep)
        expLostAssets, totDebrisLevelP, expRemoved, targetThreshold = \
            mdp1.transitionOfStates(state1P, actions[i], approachPess, timeStep)
        state1O = mdp1.state(totDebrisLevelO, yr)
        state1P = mdp1.state(totDebrisLevelP, yr)
        statesOptim.append(state1O.totDebrisLevel)
        statesPess.append(state1P.totDebrisLevel)
        i += 1
        yr = (timeStep * i) + 2017
    scenariosOptim.append(statesOptim)
    scenariosPess.append(statesPess)
    plotDebEvol(scenariosOptim, scenariosPess, 1)

def learnStrategy():
    thresholdCurves = getThresholdCurves()
    mdp1 = mdp.MDP(thresholdCurves)
    debris0 = mdp1.thresholdCurves[0].totDebris[0]
    action0 = 3000
    finalStrategy = []
    finalDebrisLevel = []
    Q_table = qLearning.initQtable(debrisLevels, actionVector, years)
    print(len(Q_table))
    discRewardEvol = []
    for iteration in range(totIter):
        totDebrisLevel = debris0
        state = mdp1.state(totDebrisLevel, 2017)
        rewardList = []
        discReward = []
        targetThreshold = action0
        atBeginning = 1
        if iteration == totIter - 1:
            finalDebrisLevel.append(totDebrisLevel)
        i = 0
        yr = 2017
        while (yr - START_YEAR) < 100:
            stateApprox = mdp1.state(mdp.approximateState(state), state.year)
            # greedy on the last iteration, epsilon-greedy otherwise
            eps = 0 if iteration == totIter - 1 else epsilon
            action = qLearning.epsilon_greedy_strat(
                Q_table, stateApprox, targetThreshold, eps, atBeginning)
            atBeginning = 0
            expLostAssets, totDebrisLevel, expRemoved, targetThreshold = \
                mdp1.transitionOfStates(state, action, approach, TIME_STEP)
            state_next = mdp1.state(totDebrisLevel, yr + TIME_STEP)
            state_nextApprox = mdp1.state(mdp.approximateState(state_next),
                                          state_next.year)
            if infiniteRewSum:
                # last reward is an infinite sum of future discounted rewards
                # with constant values
                if (yr + timeHorizon % TIME_STEP) == finalYear:
                    averagedEndReward = 0
                    indx = 0
                    for thr in thresholdCurves:
                        if thr.threshold == action:
                            break
                        indx += 1
                    size = 0
                    for j in range(95, 100):
                        averagedEndReward += -(
                            thresholdCurves[indx].lostAssets[j] * C_L +
                            thresholdCurves[indx].removed[j] * C_R)
                        size += 1
                    averagedEndReward = averagedEndReward / size
                    rewardInf = averagedEndReward * (1 / (1 - rewardDiscountGamma))
                    reward = mdp1.getReward(expLostAssets, expRemoved) + rewardInf
                else:
                    reward = mdp1.getReward(expLostAssets, expRemoved)
            else:
                reward = mdp1.getReward(expLostAssets, expRemoved)
            discReward.append(pow(rewardDiscountGamma, i - 1) * reward)
            qLearning.update_Q(Q_table, alpha, rewardDiscountGamma, stateApprox,
                               state_nextApprox, action, reward)
            state = state_next
            rewardList.append(reward)
            if iteration == totIter - 1:
                finalStrategy.append(action)
                finalDebrisLevel.append(totDebrisLevel)
            i += 1
            yr = (TIME_STEP * i) + 2017
        discRewardEvol.append(sum(discReward))
    print("step in debris size: ", debrisLevelStep)
    print("Learned sequence of actions: ", finalStrategy)
    print("discounted reward is :", sum(discReward))
    print("imm reward is :", sum(rewardList))
    return finalStrategy

def learnStrategyMultiAgent():
    thresholdCurves = getThresholdCurves()
    mdp1 = mdp.MDP(thresholdCurves)
    debris0 = mdp1.thresholdCurves[0].totDebris[0]
    action0 = 5000
    finalStrategy = []
    finalStrategy_att = []
    finalDebrisLevel = []
    Q_table = qLearning.initQtable(debrisLevels, actionVector, years)
    Q_table_att = qLearning.initQtable(debrisLevels, actionVector, years)
    discRewardEvol = []
    immReward = []
    immReward_att = []
    for iteration in range(totIter):
        totDebrisLevel = debris0
        state = mdp1.state(totDebrisLevel, 2017)
        discReward = []
        discReward_att = []
        targetThreshold_def = action0
        targetThreshold_att = action0
        atBeginning = 1
        if iteration == totIter - 1:
            finalDebrisLevel.append(totDebrisLevel)
        i = 0
        yr = 2017
        while (yr - START_YEAR) < 100:
            stateApprox = mdp1.state(mdp.approximateState(state), state.year)
            # greedy on the last iteration, epsilon-greedy otherwise
            eps = 0 if iteration == totIter - 1 else epsilon
            action = qLearning.epsilon_greedy_strat(
                Q_table, stateApprox, targetThreshold_def, eps, atBeginning)
            action_att = qLearning.epsilon_greedy_strat(
                Q_table_att, stateApprox, targetThreshold_att, eps, atBeginning)
            atBeginning = 0
            expLostAssets_def, totDebrisLevel_def, expRemoved_def, targetThreshold_def = \
                mdp1.transitionOfStates(state, action, approach, TIME_STEP)
            expLostAssets_att, totDebrisLevel_att, expRemoved_att, targetThreshold_att = \
                mdp1.transitionOfStates(state, action_att, approach, TIME_STEP)
            expRemoved_total = expRemoved_def + expRemoved_att
            if expRemoved_total != 0:
                remProportional_def = expRemoved_def / expRemoved_total
                remProportional_att = expRemoved_att / expRemoved_total
            else:
                remProportional_def = 0
                remProportional_att = 0
            # the joint effect of both agents' removals determines the actual
            # transition
            curve_total = mdp1.findCurve(expRemoved_total, state, approach,
                                         TIME_STEP)
            action_joint = curve_total.threshold
            expLostAssets, totDebrisLevel, expRemoved, targetThreshold = \
                mdp1.transitionOfStates(state, action_joint, approach, TIME_STEP)
            expRemoved_def = expRemoved * remProportional_def
            expRemoved_att = expRemoved * remProportional_att
            state_next = mdp1.state(totDebrisLevel, yr + TIME_STEP)
            state_nextApprox = mdp1.state(mdp.approximateState(state_next),
                                          state_next.year)
            if infiniteRewSum:
                # last reward is an infinite sum of future discounted rewards
                # with constant values; this type of reward is not defined for
                # the attacker yet
                if (yr + timeHorizon % TIME_STEP) == finalYear:
                    averagedEndReward = 0
                    indx = 0
                    for thr in thresholdCurves:
                        if thr.threshold == action:
                            break
                        indx += 1
                    size = 0
                    for j in range(95, 100):
                        averagedEndReward += -(
                            share_IA * thresholdCurves[indx].lostAssets[j] +
                            ratioC_L_C_R * thresholdCurves[indx].removed[j])
                        size += 1
                    averagedEndReward = averagedEndReward / size
                    rewardInf = averagedEndReward * (1 / (1 - rewardDiscountGamma))
                    reward = mdp1.getReward_multiAgent(
                        expLostAssets, expRemoved_def) + rewardInf
                else:
                    reward = mdp1.getReward_multiAgent(expLostAssets,
                                                       expRemoved_def)
            else:
                reward = mdp1.getReward_multiAgent(expLostAssets, expRemoved_def)
            reward_att = mdp1.getReward_multiAgent_att(expLostAssets,
                                                       expRemoved_att)
            discReward.append(pow(rewardDiscountGamma, i - 1) * reward)
            discReward_att.append(pow(rewardDiscountGamma, i - 1) * reward_att)
            qLearning.update_Q(Q_table, alpha, rewardDiscountGamma, stateApprox,
                               state_nextApprox, action, reward)
            qLearning.update_Q(Q_table_att, alpha, rewardDiscountGamma,
                               stateApprox, state_nextApprox, action_att,
                               reward_att)
            state = state_next
            if iteration == totIter - 1:
                finalStrategy.append(action)
                finalStrategy_att.append(action_att)
                finalDebrisLevel.append(totDebrisLevel)
                immReward.append(reward)
                immReward_att.append(reward_att)
            i += 1
            yr = (TIME_STEP * i) + 2017
        discRewardEvol.append(sum(discReward))
    print()
    print("++++" * 20)
    print("share: ", share_IA, "ratio: ", ratioC_L_C_R)
    print("Q-learning params, alpha :", alpha, " epsilon: ", epsilon,
          " gamma: ", rewardDiscountGamma)
    print("step in debris size: ", debrisLevelStep)
    print("DEF: Learned sequence of actions: ", finalStrategy)
    print("ATT: Learned sequence of actions: ", finalStrategy_att)
    print("DEF: discounted reward is :", sum(discReward))
    print("ATT: discounted reward is :", sum(discReward_att))
    print("DEF: imm reward is :", sum(immReward))
    print("ATT: imm att reward is :", sum(immReward_att))
    print()
    print("Both tot reward is: ", sum(immReward) + sum(immReward_att))
    print("++++" * 20)
    print()
    return finalStrategy

def showFinalStrategy(finalStrategy):
    years = range(2017, 2017 + 100, TIME_STEP)
    thresholdCurves = getThresholdCurves()
    thresholds = [1000, 2000, 3000, 4000, 5000, 6000, 8000, 9000, 10000, 12000]
    thresholdsNames = ["learned", 1000, 2000, 3000, 4000, 5000, 6000, 8000,
                       9000, 10000, 12000]
    # compare the learned strategy against every fixed-threshold strategy
    strategies = [finalStrategy]
    for thr in thresholds:
        strategies.append([thr] * 52)
    mdp1 = mdp.MDP(thresholdCurves)
    actionsLabels = ["Q-learned strat", "above 1000", "above 2000",
                     "above 3000", "above 4000", "above 5000", "above 6000",
                     "above 8000", "above 9000", "above 10000", "no removal"]
    colorOfExper = ['g', 'g--', 'b--', 'k--', 'y--', 'm--', 'c--', 'k--',
                    'b--', 'g--', 'r--', 'y']
    ff = open(fileN, 'w')
    ix = 0
    for thr in range(len(strategies)):
        finalStrategy = strategies[thr]
        if thr == 0:
            debris0 = mdp1.thresholdCurves[2].totDebris[0]
        else:
            debris0 = mdp1.thresholdCurves[thr - 1].totDebris[0]
        totDebrisLevel = debris0
        state1 = mdp1.state(totDebrisLevel, 2017)
        immReward = []
        discReward = []
        finalDebrisLevel = [totDebrisLevel]
        totRemoved = []
        totLost = []
        i = 0
        yr = 2017
        while (yr - START_YEAR) < 100:
            action = finalStrategy[i]
            expLostAssets, totDebrisLevel, expRemoved, targetThreshold = \
                mdp1.transitionOfStates(state1, action, approach, TIME_STEP)
            totRemoved.append(expRemoved)
            totLost.append(expLostAssets)
            if infiniteRewSum:
                # last reward is an infinite sum of future discounted rewards
                # with constant values
                if (yr + timeHorizon % TIME_STEP) == finalYear:
                    averagedEndReward = 0
                    indx = 0
                    for thre in thresholdCurves:
                        if thre.threshold == action:
                            break
                        indx += 1
                    size = 0
                    for j in range(95, 100):
                        averagedEndReward += -(
                            thresholdCurves[indx].lostAssets[j] * C_L +
                            thresholdCurves[indx].removed[j] * C_R)
                        size += 1
                    averagedEndReward = averagedEndReward / size
                    rewardInf = averagedEndReward * (1 / (1 - rewardDiscountGamma))
                    reward = mdp1.getReward(expLostAssets, expRemoved) + rewardInf
                else:
                    reward = mdp1.getReward(expLostAssets, expRemoved)
            else:
                reward = mdp1.getReward(expLostAssets, expRemoved)
            immReward.append(reward)
            discReward.append(round(pow(rewardDiscountGamma, i - 1) * reward, 4))
            finalDebrisLevel.append(totDebrisLevel)
            i += 1
            yr = (TIME_STEP * i) + 2017
            state1 = mdp1.state(totDebrisLevel, yr)
        print()
        print("Sum of discounted reward for threshold ", thresholdsNames[ix],
              " is ", sum(discReward))
        print("Sum of immediate reward for threshold is ", sum(immReward))
        print("Discounted reward for threshold ", thresholdsNames[ix], " is ",
              np.cumsum(discReward)[::1])
        print("total removed: ", sum(totRemoved))
        print("total lost: ", sum(totLost))
        print()
        ff.write("%s\n" % finalStrategy)
        ff.write("%s\n" % sum(discReward))
        ff.write("%s\n" % np.cumsum(discReward))
        ff.write("\n")
        if ix == 0:
            plt.plot(years, finalDebrisLevel[0:-1], colorOfExper[ix],
                     label=actionsLabels[ix], linewidth=3)
        else:
            plt.plot(years, finalDebrisLevel[0:-1], colorOfExper[ix],
                     label=actionsLabels[ix], linewidth=1.5)
        ix += 1
    ff.close()
    pylab.xlim(2015, 2120)
    plt.ylabel('number of objects', fontsize=20)
    plt.xlabel('year', fontsize=20)
    plt.title(r'Discounted reward - C_R/C_L = 0.3, $\gamma$ = 0.95', fontsize=20)
    plt.grid()
    plt.legend(loc='upper left')
    plt.show()

def findOptimalSequence(attackerFixed):
    thresholdCurves = getThresholdCurves()
    thresholds = [1000, 2000, 3000, 4000, 5000, 6000, 8000, 9000, 10000, 12000]
    mdp1 = mdp.MDP(thresholdCurves)
    yr = 0
    seqAll = []

    def getSeq(seq, action, yr):
        # enumerate every strategy that moves at most one threshold level
        # (stay, up, or down) per time step
        if yr < 100:
            seq.append(action)
            indx = thresholds.index(action)
            yr += TIME_STEP
            getSeq(seq[:], action, yr)
            if action != 12000:
                getSeq(seq[:], thresholds[indx + 1], yr)
            if action != 1000:
                getSeq(seq[:], thresholds[indx - 1], yr)
        else:
            seqAll.append(seq)
        return seqAll

    stratsAll = []
    for thrs in thresholds:
        stratsAll.extend(getSeq([], thrs, yr))
    strats = stratsAll
    allDiscRews = []
    allRews = []
    allRews_att = []
    allRews_both = []
    action_att = attackerFixed
    for strat in strats:
        finalStrategy = strat
        debris0 = mdp1.thresholdCurves[2].totDebris[0]
        totDebrisLevel = debris0
        state1 = mdp1.state(totDebrisLevel, 2017)
        immReward = []
        immReward_att = []
        discReward = []
        finalDebrisLevel = [totDebrisLevel]
        i = 0
        yr = 2017
        while (yr - START_YEAR) < 100:
            action = finalStrategy[i]
            # defender and attacker both act; the joint removal determines
            # the actual transition
            expLostAssets_def, totDebrisLevel_def, expRemoved_def, targetThreshold = \
                mdp1.transitionOfStates(state1, action, approach, TIME_STEP)
            expLostAssets_att, totDebrisLevel_att, expRemoved_att, targetThreshold_att = \
                mdp1.transitionOfStates(state1, action_att, approach, TIME_STEP)
            expRemoved_total = expRemoved_def + expRemoved_att
            if expRemoved_total != 0:
                remProportional_def = expRemoved_def / expRemoved_total
                remProportional_att = expRemoved_att / expRemoved_total
            else:
                remProportional_def = 0
                remProportional_att = 0
            curve_total = mdp1.findCurve(expRemoved_total, state1, approach,
                                         TIME_STEP)
            action_joint = curve_total.threshold
            expLostAssets, totDebrisLevel, expRemoved, targetThreshold = \
                mdp1.transitionOfStates(state1, action_joint, approach, TIME_STEP)
            expRemoved_def = expRemoved * remProportional_def
            expRemoved_att = expRemoved * remProportional_att
            if infiniteRewSum:
                # last reward is an infinite sum of future discounted rewards
                # with constant values
                if (yr + timeHorizon % TIME_STEP) == finalYear:
                    averagedEndReward = 0
                    indx = 0
                    for thre in thresholdCurves:
                        if thre.threshold == action:
                            break
                        indx += 1
                    size = 0
                    for j in range(95, 100):
                        averagedEndReward += -(
                            thresholdCurves[indx].lostAssets[j] * C_L +
                            thresholdCurves[indx].removed[j] * C_R)
                        size += 1
                    averagedEndReward = averagedEndReward / size
                    rewardInf = averagedEndReward * (1 / (1 - rewardDiscountGamma))
                    reward = mdp1.getReward(expLostAssets, expRemoved) + rewardInf
                else:
                    reward = mdp1.getReward(expLostAssets, expRemoved)
            else:
                reward = mdp1.getReward_multiAgent(expLostAssets, expRemoved_def)
            reward_att = mdp1.getReward_multiAgent_att(expLostAssets,
                                                       expRemoved_att)
            immReward.append(reward)
            immReward_att.append(reward_att)
            discReward.append(round(pow(rewardDiscountGamma, i - 1) * reward, 4))
            finalDebrisLevel.append(totDebrisLevel)
            i += 1
            yr = (TIME_STEP * i) + 2017
            state1 = mdp1.state(totDebrisLevel, yr)
        allDiscRews.append(sum(discReward))
        allRews.append(sum(immReward))
        allRews_att.append(sum(immReward_att))
        allRews_both.append(sum(immReward) + sum(immReward_att))
    BR_defender = np.argmax(allRews)
    # among the strategies maximizing the joint reward, pick the one that is
    # best for the defender
    maxRewEnv = max(allRews_both)
    maxRewsEnv = [i for i, j in enumerate(allRews_both) if j == maxRewEnv]
    maxRewsEnv_maxRewDefPom = [allRews[i] for i in maxRewsEnv]
    maxRewsEnv_maxRewDef = maxRewsEnv[np.argmax(maxRewsEnv_maxRewDefPom)]
    print("ratio: ", ratioC_L_C_R, "opponent fixed: ", attackerFixed)
    print("def reward is: ", allRews[BR_defender])
    print("att reward is: ", allRews_att[BR_defender])
    print("tot reward both is: ", allRews[BR_defender] + allRews_att[BR_defender])
    print(strats[BR_defender])
    print()
    print("def reward is: ", allRews[maxRewsEnv_maxRewDef])
    print("att reward is: ", allRews_att[maxRewsEnv_maxRewDef])
    print("env reward both is: ", allRews[maxRewsEnv_maxRewDef] +
          allRews_att[maxRewsEnv_maxRewDef])
    print(strats[maxRewsEnv_maxRewDef])

def episodeStart(self):
    # Posterior sampling: draw one transition model from the posterior for
    # each (state, action) pair, then plan optimally in the sampled MDP.
    for state in self.env.getAllStates():
        for action in range(self.env.getNumActions()):
            self.samples[(state, action)] = \
                self.distribute[(state, action)].sample()
    self.mymdp = mdp.MDP(self.env, self.samples)
    self.policy = self.mymdp.computeOptimalPolicy()

def priors(theta_transition, theta_reward, env, discount):
    '''
    9 states in total and 4 actions: LEFT = 0, DOWN = 1, RIGHT = 2, UP = 3.
    4 types of states: start, goal (+100), frozen, and hole.

        # 'FrozenLakeEnv-v1'
        "SFF",
        "FFG",
        "FHF"

        # 'FrozenLakeEnv-v2'
        "SFF",
        "FFH",
        "FGF"
    '''
    action_space_size = env.action_space.n
    state_space_size = env.observation_space.n
    # Transition function [|A| x |S| x |S'|]
    ass = np.zeros([action_space_size, state_space_size, state_space_size])
    # Use a beta distribution as the conjugate prior to structure the model:
    # each action is split into three types of moves: intended, lateral,
    # and others.
    # Hyper-parameters for the beta prior:
    a_t = theta_transition
    b_t = (1 - theta_transition) / 2
    P = env.P
    states = list(range(state_space_size))
    for j in states:
        sa = P[j]
        for i in range(action_space_size):
            state_next = sa[i][0][1]
            ass[i][j][state_next] = a_t  # intended move
            if (state_next == 0) or (state_next == 8):
                ass[i][j][state_next] = ass[i][j][state_next] + b_t  # intended move
            if state_next <= 7:
                ass[i][j][state_next + 1] = b_t  # lateral move
            if state_next >= 1:
                ass[i][j][state_next - 1] = b_t  # lateral move
            if state_next == 4:
                ass[i][j] = np.where(ass[i][j] != 0, b_t, 0)
                ass[i][j][state_next] = a_t  # lateral move
            if state_next == j:
                if (state_next == 7) or (state_next == 5):
                    ass[i][j] = 0
                    ass[i][j][state_next] = 1 - (1e-5)  # intended move
            # candidate reward states 5 and 7 are absorbing for every action
            ass[i][7] = 0
            ass[i][7][7] = 1 - (1e-5)
            ass[i][5] = 0
            ass[i][5][5] = 1 - (1e-5)
    # reward function: |A| x |S|
    r = np.ones([action_space_size, state_space_size]) * 10
    # We account for uncertainty in the reward:
    a_r = theta_reward
    b_r = (1 - theta_reward) / 2
    r[:, 5] = 100 * a_r  # belief that this is the true reward location
    r[:, 7] = 100 * b_r  # belief that this is the wrong reward location
    mdp_env = mdp.MDP(ass, r, discount)
    return mdp_env

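# A minimal usage sketch (assumes the custom FrozenLake variants above are
# registered with Gym elsewhere in the project; the id and parameter values
# are illustrative):
#   env = gym.make('FrozenLakeEnv-v1')
#   prior_mdp = priors(theta_transition=0.8, theta_reward=0.9, env=env,
#                      discount=0.95)
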
import argparse

import mdp
import mdpSolver

if __name__ == '__main__':
    # parse command-line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--mdp", type=str)
    parser.add_argument("--algorithm", type=str)
    args = parser.parse_args()

    # generate mdp instance
    mdp_instance = mdp.MDP(args.mdp)

    # solve
    if args.algorithm == 'vi':
        mdpSolver.value_iteration(mdp_instance)
    elif args.algorithm == 'hpi':
        mdpSolver.howard_pi(mdp_instance)
    elif args.algorithm == 'lp':
        mdpSolver.lp(mdp_instance)
    else:
        print("unknown algorithm")
    print(mdp_instance.prettyPrint())

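# Example invocation (the script and spec-file names are hypothetical; the
# spec format is whatever mdp.MDP accepts):
#   python main.py --mdp gridworld.txt --algorithm vi
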
import math

import matplotlib.pyplot as plt
import numpy as np

import mdp

# %%
TORAD = math.pi / 180

'''
MDP PARAMETERS
'''
history_duration = 3
mdp_step = 1
time_step = 0.1
SP = -40 * TORAD
sailing_mdp = mdp.MDP(history_duration, mdp_step, time_step)

'''
WIND CONDITIONS
'''
mean = 45 * TORAD
std = 3 * TORAD
wind_samples = 10
WH = np.random.uniform(mean - std, mean + std, size=wind_samples)

'''
MDP INIT
'''
hdg0 = 2 * TORAD * np.ones(wind_samples)
state = sailing_mdp.initializeMDP(hdg0, WH)

'''
Generation of a simulation
'''

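# The script ends at the "Generation of a simulation" header above. A minimal
# continuation sketch, assuming the MDP exposes a step method (`transition` is
# a hypothetical name; check the mdp module for the real one):
#   for _ in range(int(mdp_step / time_step)):
#       WH = np.random.uniform(mean - std, mean + std, size=wind_samples)
#       state = sailing_mdp.transition(action, WH)
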
import numpy as np

import gridw
import mdp
import mdp_optimizer as mdpo
import utils

# student MDP: (state, action, next_state, probability, reward) transitions
transitions = [('c1', 'facebook', 'distraction', 1.0, -1),
               ('c1', 'study', 'c2', 1.0, -2),
               ('distraction', 'facebook', 'distraction', 1.0, -1),
               ('distraction', 'quit', 'c1', 1.0, 0),
               ('c2', 'study', 'c3', 1.0, -2),
               ('c2', 'sleep', 'rest', 1.0, 0),
               ('c3', 'study', 'rest', 1.0, 10),
               ('c3', 'pub', 'c1', 0.2, 1),
               ('c3', 'pub', 'c2', 0.4, 1),
               ('c3', 'pub', 'c3', 0.4, 1)]

test_mdp = mdp.MDP(['c1', 'distraction', 'c2', 'c3', 'rest'],
                   ['facebook', 'quit', 'study', 'sleep', 'pub'],
                   transitions, {'rest'}, 1)

# each row is one state's probability distribution over actions
test_policy = np.array([
    (0.5, 0.0, 0.5, 0.0, 0.0),
    (0.5, 0.5, 0.0, 0.0, 0.0),
    (0.0, 0.0, 0.5, 0.5, 0.0),
    (0.0, 0.0, 0.5, 0.0, 0.5),
    (0.0, 0.0, 0.0, 1.0, 0.0),
], dtype=float)

print(mdpo.first_pass_monte_carlo(test_mdp, test_policy, 10000, .01))
# print(mdpo.every_pass_monte_carlo(test_mdp, test_policy, 10000))
# print(mdpo.calc_value_func_dynamic(test_mdp, test_policy, 1000))
print(mdpo.temporal_difference(test_mdp, test_policy, 'c1', 0, .01, -1))