def test_policy_iteration():
    r = -0.04
    m = mdp.MDP(
        states=problem_2.states,
        actions=problem_2.actions,
        reward={
            (3, 1): r, (3, 2): r, (3, 3): r, (3, 4): 1.0,
            (2, 1): r, (2, 3): r, (2, 4): -1.0,
            (1, 1): r, (1, 2): r, (1, 3): r, (1, 4): r
        },
        allowed_transitions=problem_2.allowed_transitions,
        p_action_given_desired_action=problem_2.p_action_given_desired_action,
        gamma=0.999999  # can't use 1.0
    )
    U, policy = m.policy_iteration()
    assert_optimal_policy(U, policy)

def mdpVI():
    r = float(sys.argv[1])
    m = mdp.MDP(
        states=problem_2.states,
        actions=problem_2.actions,
        reward={
            (3, 1): r, (3, 2): r, (3, 3): r, (3, 4): 1.0,
            (2, 1): r, (2, 3): r, (2, 4): -1.0,
            (1, 1): r, (1, 2): r, (1, 3): r, (1, 4): r
        },
        allowed_transitions=problem_2.allowed_transitions,
        p_action_given_desired_action=problem_2.p_action_given_desired_action,
        gamma=0.99999)
    U, policy = m.value_iteration(0.001)
    for row in (3, 2, 1):
        for col in (1, 2, 3, 4):
            state = (row, col)
            if state in policy:
                print('policy for state {} is {} with utility {}'.format(
                    state, policy[state], U[state]))

def train_both_players(iterations, rate, discount=0.99):
    print("Training for both the players ............")
    for _ in tqdm.trange(iterations):
        state = "123456789"
        first_states_list = []
        second_states_list = []
        for step in range(10):
            if state not in cache:
                cache[state] = mdp.MDP(state)
            if cache[state].is_terminal_state:
                first_states_list.append(state)
                second_states_list.append(state)
                break
            if step % 2 == 0:
                # First player move
                first_states_list.append(state)
                state = explore(state, 'f')
            else:
                # Second player move
                second_states_list.append(state)
                state = explore(state, 's')
        train_player_per_episode(first_states_list, 'f', rate, discount)
        train_player_per_episode(second_states_list, 's', rate, discount)

def test_policy_evaluation():
    m = mdp.MDP(
        states=two_by_two_states,
        actions=two_by_two_actions,
        reward=two_by_two_reward,
        allowed_transitions=two_by_two_allowed_transitions,
        p_action_given_desired_action=two_by_two_p_action_given_desired_action,
        gamma=0.3)
    # this policy only gets to the terminal state by accident
    policy = {
        (1, 1): {'d': (2, 1)},
        (1, 2): {'d': (2, 2)},
        (2, 1): {'u': (1, 1)},
        (2, 2): {}  # (2, 2) is a terminal state
    }
    U = {state: 0.0 for state in two_by_two_states}
    U_update = m.policy_evaluation(policy, U)
    assert -0.125 < U_update[(1, 1)] < -0.124
    assert 0.140 < U_update[(1, 2)] < 0.141
    assert -0.103 < U_update[(2, 1)] < -0.102
    assert U_update[(2, 2)] == 1.0

def test_random_policy():
    m = mdp.MDP(
        states=two_by_two_states,
        actions=two_by_two_actions,
        reward=two_by_two_reward,
        allowed_transitions=two_by_two_allowed_transitions,
        p_action_given_desired_action=two_by_two_p_action_given_desired_action,
        gamma=0.0)
    for _ in range(100):
        policy = m.get_random_policy()
        assert policy[(1, 1)] in [{'d': (2, 1)}, {'r': (1, 2)}]
        assert policy[(1, 2)] in [{'d': (2, 2)}, {'l': (1, 1)}]
        assert policy[(2, 1)] in [{'u': (1, 1)}, {'r': (2, 2)}]
        assert len(policy[(2, 2)]) == 0  # terminal state has no actions

def exploit(state, player_symbol):
    """Greedy move: return the action with the highest learned value."""
    if state not in cache:
        cache[state] = mdp.MDP(state)
    if player_symbol == 'f':
        values = cache[state].values_f
    elif player_symbol == 's':
        values = cache[state].values_s
    else:
        raise ValueError("player_symbol must be 'f' or 's'")
    max_val_index = values.index(max(values))
    return cache[state].actions[max_val_index]

def getTrueMDP(self):
    obsProbs = self.initOProbs()
    obsProbs[(0, self.LEFT_ACTION)][self.STAY_WITH_01_OUTCOME] = 1.0
    obsProbs[(0, self.RIGHT_ACTION)][self.STAY_WITH_0_OUTCOME] = 0.8
    obsProbs[(0, self.RIGHT_ACTION)][self.RIGHT_OUTCOME] = 0.2
    for i in range(1, self.NUM_STATES - 1):
        obsProbs[(i, self.RIGHT_ACTION)][self.LEFT_OUTCOME] = 0.2
        obsProbs[(i, self.RIGHT_ACTION)][self.RIGHT_OUTCOME] = 0.8
        obsProbs[(i, self.LEFT_ACTION)][self.LEFT_OUTCOME] = 1.0
    last = self.NUM_STATES - 1
    obsProbs[(last, self.LEFT_ACTION)][self.LEFT_OUTCOME] = 1.0
    obsProbs[(last, self.RIGHT_ACTION)][self.STAY_WITH_1_OUTCOME] = 0.8
    obsProbs[(last, self.RIGHT_ACTION)][self.LEFT_OUTCOME] = 0.2
    return mdp.MDP(self, obsProbs)

def train_second_player(num_of_iterations, rate):
    print("\nTraining second player, please wait .... \n")
    for _ in tqdm.trange(num_of_iterations):
        state = "123456789"
        states_list = []
        for step in range(10):
            if state not in cache:
                cache[state] = mdp.MDP(state)
            if cache[state].is_terminal_state:
                states_list.append(state)
                break
            if step % 2 == 0:
                # First player: random moves
                state = random_player(state, 'f')
            else:
                # Second player: exploration/exploitation trade-off
                states_list.append(state)
                state = explore(state, 's')
        train_player_per_episode(states_list, symbol='s',
                                 learning_rate=rate, discount_factor=0.99)

def test_evaluate_bellman_equation():
    m = mdp.MDP(
        states=two_by_two_states,
        actions=two_by_two_actions,
        reward=two_by_two_reward,
        allowed_transitions=two_by_two_allowed_transitions,
        p_action_given_desired_action=two_by_two_p_action_given_desired_action,
        gamma=0.3)
    U = {(1, 1): 0.0, (1, 2): 0.0, (2, 1): 0.0, (2, 2): 0.0}
    U_next = {state: 0.0 for state in m.states}

    # test a state with maximizing action moving to a non-terminal state
    (U_next[(1, 1)], maximizing_action) = m.evaluate_bellman_equation(U, (1, 1))
    assert U_next[(1, 1)] == (m.reward[(1, 1)] + m.gamma *
                              (0.8 * U[(2, 1)] + 0.1 * U[(1, 2)] + 0.1 * U[(1, 1)]))
    # assert maximizing_action == 'd'

    # test a state with maximizing action moving to a terminal state
    (U_next[(1, 2)], maximizing_action) = m.evaluate_bellman_equation(U, (1, 2))
    assert U_next[(1, 2)] == (m.reward[(1, 2)] + m.gamma *
                              (0.8 * U[(2, 2)] + 0.1 * U[(1, 1)] + 0.1 * U[(1, 2)]))
    # assert maximizing_action == 'd'

    # test another state with maximizing action moving to a terminal state
    (U_next[(2, 1)], maximizing_action) = m.evaluate_bellman_equation(U, (2, 1))
    assert U_next[(2, 1)] == (m.reward[(2, 1)] + m.gamma *
                              (0.8 * U[(2, 2)] + 0.1 * U[(1, 1)] + 0.1 * U[(2, 1)]))

    # test the terminal state
    (U_next[(2, 2)], maximizing_action) = m.evaluate_bellman_equation(U, (2, 2))
    assert maximizing_action is None
    assert U_next[(2, 2)] == 1.0

    U = U_next.copy()

    # after one sweep, test a state whose maximizing action moves to a
    # non-terminal state
    (U_next[(1, 1)], maximizing_action) = m.evaluate_bellman_equation(U, (1, 1))
    assert U_next[(1, 1)] == (m.reward[(1, 1)] + m.gamma *
                              (0.8 * U[(2, 1)] + 0.1 * U[(1, 2)] + 0.1 * U[(1, 1)]))
    assert maximizing_action == 'r'

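# For reference, the backup these tests exercise is the Bellman optimality
# equation (assuming evaluate_bellman_equation implements the standard form):
#     U'(s) = R(s) + gamma * max_a sum_{s'} P(s' | s, a) * U(s')
# where the 0.8/0.1/0.1 split over outcomes comes from the action noise model
# supplied via p_action_given_desired_action.
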
def getTrueMDP(self):
    obsProbs = self.initOProbs()
    walls = [(0, 1), (2, 1), (3, 2)]
    pit = (1, 3)
    for s, a in obsProbs:
        (x, y) = s
        for o in [self.NORTH_OUTCOME, self.SOUTH_OUTCOME,
                  self.EAST_OUTCOME, self.WEST_OUTCOME]:
            oreal = o
            nextState = self.getNextStateInner(x, y, o)
            if nextState == pit:
                oreal = self.FELL_PIT_OUTCOME
            if nextState in walls:
                oreal = self.HIT_WALL_OUTCOME
            # the desired direction occurs with probability 0.7,
            # each other direction with probability 0.1
            if o == a:
                obsProbs[(s, a)][oreal] += 0.7
            else:
                obsProbs[(s, a)][oreal] += 0.1
    return mdp.MDP(self, obsProbs)

def train_first_player(num_of_iterations, rate=0.01, discount_factor=0.99):
    """
    Plays tic-tac-toe games between the learning first player and a random
    second player. All the states produced within each game are stored in a
    list and forwarded to train_player_per_episode for the actual training.
    """
    print("\nTraining first player, please wait .... \n")
    for _ in tqdm.trange(num_of_iterations):
        state = "123456789"
        states_list = []
        for step in range(10):
            if state not in cache:
                cache[state] = mdp.MDP(state)
            if cache[state].is_terminal_state:
                states_list.append(state)
                break
            if step % 2 == 0:
                # First player: exploration/exploitation trade-off
                states_list.append(state)
                state = explore(state, 'f')
            else:
                # Random player move
                state = random_player(state, 's')
        train_player_per_episode(states_list, symbol='f',
                                 learning_rate=rate,
                                 discount_factor=discount_factor)

def test_get_resulting_action_list():
    m = mdp.MDP(
        states=problem_2.states,
        actions=problem_2.actions,
        reward={},
        allowed_transitions=problem_2.allowed_transitions,
        p_action_given_desired_action=problem_2.p_action_given_desired_action,
        gamma=0.0)
    # from state (1, 1) and desired action 'u' we can move
    #   up with probability 0.8
    #   stay (blocked left) with probability 0.1
    #   right with probability 0.1
    resulting_action_list = m.get_resulting_action_list((1, 1), 'u')
    assert len(resulting_action_list) == 3
    assert ('u', 0.8, (2, 1)) in resulting_action_list
    assert ('l', 0.1, (1, 1)) in resulting_action_list
    assert ('r', 0.1, (1, 2)) in resulting_action_list

    # from (2, 1) both lateral moves are blocked, so the agent stays put
    resulting_action_list = m.get_resulting_action_list((2, 1), 'u')
    assert len(resulting_action_list) == 3
    assert ('u', 0.8, (3, 1)) in resulting_action_list
    assert ('l', 0.1, (2, 1)) in resulting_action_list
    assert ('r', 0.1, (2, 1)) in resulting_action_list

def create_grid(x, y, terminals, discount):
    transitions = []
    states = []
    for j in range(y):
        for i in range(x):
            name = _name(i, j)
            states.append(name)
            reward = -1
            if (i, j) in terminals:
                reward = 0
            # moves off the edge leave the agent in place
            transitions.append((name, 'n', _name(i, max(j - 1, 0)), 1, reward))
            transitions.append((name, 'e', _name(min(i + 1, x - 1), j), 1, reward))
            transitions.append((name, 's', _name(i, min(j + 1, y - 1)), 1, reward))
            transitions.append((name, 'w', _name(max(i - 1, 0), j), 1, reward))
    return m.MDP(states, ['n', 'e', 's', 'w'], transitions,
                 {_name(t[0], t[1]) for t in terminals}, discount)

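# A minimal usage sketch: the classic 4x4 gridworld with terminal states in
# opposite corners (the specific arguments are illustrative, not taken from
# the source):
#   grid_mdp = create_grid(4, 4, terminals=[(0, 0), (3, 3)], discount=1.0)
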
def setUp(cls):
    cls.models = {
        'sysadmin': """
            computer(c1).
            computer(c2).
            computer(c3).

            connected(c1, [c2, c3]).
            connected(c2, [c1]).
            connected(c3, [c1]).

            accTotal([], A, A).
            accTotal([_|T], A, X) :- B is A + 1, accTotal(T, B, X).
            total(L, T) :- accTotal(L, 0, T).
            total_connected(C, T) :- connected(C, L), total(L, T).

            accAlive([], A, A).
            accAlive([H|T], A, X) :- running(H, 0), B is A + 1, accAlive(T, B, X).
            accAlive([H|T], A, X) :- not(running(H, 0)), B is A, accAlive(T, B, X).
            alive(L, A) :- accAlive(L, 0, A).
            total_running(C, R) :- connected(C, L), alive(L, R).

            state_fluent(running(C)) :- computer(C).

            action(reboot(none)).
            action(reboot(C)) :- computer(C).

            1.00::running(C, 1) :- reboot(C).
            0.05::running(C, 1) :- not(reboot(C)), not(running(C, 0)).
            P::running(C, 1) :- not(reboot(C)), running(C, 0),
                total_connected(C, T), total_running(C, R),
                P is 0.45 + 0.50 * R / T.

            utility(running(C, 0), 1.00) :- computer(C).
            utility(reboot(C), -0.75) :- computer(C).
            utility(reboot(none), 0.00).
        """
    }
    cls.mdp = mdp.MDP(cls.models['sysadmin'])
    cls.vi = vi.ValueIteration(cls.mdp)

def initialize():
    global board, player, graphics, mdp, valueIteration, qLearn, stateHandler
    if len(sys.argv) <= 2:
        print("error: not enough arguments provided. Provide level json file "
              "followed by string 'v' or 'q' for value iteration or "
              "q-learning respectively")
        exit()
    if "json" not in sys.argv[1]:
        print("error: level json argument must be first argument")
        exit()
    if sys.argv[2] != 'v' and sys.argv[2] != 'q':
        print("error: AI algorithm type not specified. Please use 'v' for "
              "value iteration or 'q' for q-learning")
        exit()
    rewardArgument = None
    livingReward = None
    iterations = None
    learningRate = None
    epsilon = None
    if sys.argv[2] == 'v':
        if len(sys.argv) != 6:
            print("Error: value iteration requires reward discount, living "
                  "reward, and number of iterations.\n"
                  "python main.py level.json v 0.8 -1 50")
            exit()
        valueIteration = True
        iterations = int(sys.argv[5])
    else:
        if len(sys.argv) != 7:
            print("Error: q-learning requires reward discount, living reward, "
                  "learning rate, and epsilon.\n"
                  "python main.py level.json q 0.8 -1 0.2 0.9")
            exit()
        valueIteration = False
        learningRate = float(sys.argv[5])
        epsilon = float(sys.argv[6])
    rewardArgument = float(sys.argv[3])
    livingReward = float(sys.argv[4])
    board = boardLibrary.Board(sys.argv[1])
    player = playerLibrary.Player(board.playerPosition[0],
                                  board.playerPosition[1])
    startingState = state.State((player.x, player.y),
                                [(key.x, key.y) for key in board.keys])
    if valueIteration:
        mdp = mdpLibrary.MDP(startingState, rewardArgument, livingReward,
                             iterations)
        qLearn = None
    else:
        qLearn = qLearningLibrary.QLearn(startingState, rewardArgument,
                                         livingReward, learningRate, epsilon)
        mdp = None
    graphics = graphicsLibrary.Graphics()

def plotCurve():
    thresholdCurves = getThresholdCurves()
    approachOptim = "optimistic"
    approachPess = "pessimistic"
    scenariosOptim = []
    scenariosPess = []
    mdp1 = mdp.MDP(thresholdCurves)
    # piecewise-constant removal strategy: 20 steps at the first threshold,
    # then 15 at the second, then 15 at the third
    seqOfAction = [5000, 12000, 1000]
    actions = []
    for j in range(20):
        actions.append(seqOfAction[0])
    for j in range(15):
        actions.append(seqOfAction[1])
    for j in range(15):
        actions.append(seqOfAction[2])
    statesOptim = []
    statesPess = []
    timeStep = 2
    debris0 = thresholdCurves[4].totDebris[0]
    totDebrisLevel = debris0
    state1O = mdp1.state(totDebrisLevel, START_YEAR)
    state1P = mdp1.state(totDebrisLevel, START_YEAR)
    statesOptim.append(debris0)
    statesPess.append(debris0)
    yr = 2017
    i = 0
    while (yr - START_YEAR) < 100:
        expLostAssets, totDebrisLevelO, expRemoved, targetThreshold = \
            mdp1.transitionOfStates(state1O, actions[i], approachOptim, timeStep)
        expLostAssets, totDebrisLevelP, expRemoved, targetThreshold = \
            mdp1.transitionOfStates(state1P, actions[i], approachPess, timeStep)
        state1O = mdp1.state(totDebrisLevelO, yr)
        state1P = mdp1.state(totDebrisLevelP, yr)
        statesOptim.append(state1O.totDebrisLevel)
        statesPess.append(state1P.totDebrisLevel)
        i += 1
        yr = (timeStep * i) + 2017
    scenariosOptim.append(statesOptim)
    scenariosPess.append(statesPess)
    plotDebEvol(scenariosOptim, scenariosPess, 1)

def learnStrategy():
    thresholdCurves = getThresholdCurves()
    mdp1 = mdp.MDP(thresholdCurves)
    debris0 = mdp1.thresholdCurves[0].totDebris[0]
    action0 = 3000
    finalStrategy = []
    finalDebrisLevel = []
    Q_table = qLearning.initQtable(debrisLevels, actionVector, years)
    print(len(Q_table))
    discRewardEvol = []
    for iteration in range(totIter):
        totDebrisLevel = debris0
        state = mdp1.state(totDebrisLevel, 2017)
        rewardList = []
        discReward = []
        targetThreshold = action0
        atBeginning = 1
        if iteration == totIter - 1:
            finalDebrisLevel.append(totDebrisLevel)
        i = 0
        yr = 2017
        while (yr - START_YEAR) < 100:
            stateApprox = mdp1.state(mdp.approximateState(state), state.year)
            # greedy on the last iteration, epsilon-greedy otherwise
            eps = 0 if iteration == totIter - 1 else epsilon
            action = qLearning.epsilon_greedy_strat(
                Q_table, stateApprox, targetThreshold, eps, atBeginning)
            atBeginning = 0
            expLostAssets, totDebrisLevel, expRemoved, targetThreshold = \
                mdp1.transitionOfStates(state, action, approach, TIME_STEP)
            state_next = mdp1.state(totDebrisLevel, yr + TIME_STEP)
            state_nextApprox = mdp1.state(mdp.approximateState(state_next),
                                          state_next.year)
            if infiniteRewSum:
                # last reward is an infinite sum of future discounted rewards
                # with constant values
                if (yr + timeHorizon % TIME_STEP) == finalYear:
                    averagedEndReward = 0
                    indx = 0
                    for thr in thresholdCurves:
                        if thr.threshold == action:
                            break
                        indx += 1
                    size = 0
                    for j in range(95, 100):
                        averagedEndReward += -(
                            thresholdCurves[indx].lostAssets[j] * C_L +
                            thresholdCurves[indx].removed[j] * C_R)
                        size += 1
                    averagedEndReward = averagedEndReward / size
                    rewardInf = averagedEndReward * (1 / (1 - rewardDiscountGamma))
                    reward = mdp1.getReward(expLostAssets, expRemoved) + rewardInf
                else:
                    reward = mdp1.getReward(expLostAssets, expRemoved)
            else:
                reward = mdp1.getReward(expLostAssets, expRemoved)
            discReward.append(pow(rewardDiscountGamma, i - 1) * reward)
            qLearning.update_Q(Q_table, alpha, rewardDiscountGamma, stateApprox,
                               state_nextApprox, action, reward)
            state = state_next
            rewardList.append(reward)
            if iteration == totIter - 1:
                finalStrategy.append(action)
                finalDebrisLevel.append(totDebrisLevel)
            i += 1
            yr = (TIME_STEP * i) + 2017
        discRewardEvol.append(sum(discReward))
    print("step in debris size: ", debrisLevelStep)
    print("Learned sequence of actions: ", finalStrategy)
    print("discounted reward is :", sum(discReward))
    print("imm reward is :", sum(rewardList))
    return finalStrategy

def learnStrategyMultiAgent():
    thresholdCurves = getThresholdCurves()
    mdp1 = mdp.MDP(thresholdCurves)
    debris0 = mdp1.thresholdCurves[0].totDebris[0]
    action0 = 5000
    finalStrategy = []
    finalStrategy_att = []
    finalDebrisLevel = []
    Q_table = qLearning.initQtable(debrisLevels, actionVector, years)
    Q_table_att = qLearning.initQtable(debrisLevels, actionVector, years)
    discRewardEvol = []
    immReward = []
    immReward_att = []
    for iteration in range(totIter):
        totDebrisLevel = debris0
        state = mdp1.state(totDebrisLevel, 2017)
        discReward = []
        discReward_att = []
        targetThreshold_def = action0
        targetThreshold_att = action0
        atBeginning = 1
        if iteration == totIter - 1:
            finalDebrisLevel.append(totDebrisLevel)
        i = 0
        yr = 2017
        while (yr - START_YEAR) < 100:
            stateApprox = mdp1.state(mdp.approximateState(state), state.year)
            # greedy on the last iteration, epsilon-greedy otherwise
            eps = 0 if iteration == totIter - 1 else epsilon
            action = qLearning.epsilon_greedy_strat(
                Q_table, stateApprox, targetThreshold_def, eps, atBeginning)
            action_att = qLearning.epsilon_greedy_strat(
                Q_table_att, stateApprox, targetThreshold_att, eps, atBeginning)
            atBeginning = 0
            expLostAssets_def, totDebrisLevel_def, expRemoved_def, targetThreshold_def = \
                mdp1.transitionOfStates(state, action, approach, TIME_STEP)
            expLostAssets_att, totDebrisLevel_att, expRemoved_att, targetThreshold_att = \
                mdp1.transitionOfStates(state, action_att, approach, TIME_STEP)
            expRemoved_total = expRemoved_def + expRemoved_att
            if expRemoved_total != 0:
                remProportional_def = expRemoved_def / expRemoved_total
                remProportional_att = expRemoved_att / expRemoved_total
            else:
                remProportional_def = 0
                remProportional_att = 0
            # the joint effect of both agents' removals determines the actual
            # transition
            curve_total = mdp1.findCurve(expRemoved_total, state, approach,
                                         TIME_STEP)
            action_joint = curve_total.threshold
            expLostAssets, totDebrisLevel, expRemoved, targetThreshold = \
                mdp1.transitionOfStates(state, action_joint, approach, TIME_STEP)
            expRemoved_def = expRemoved * remProportional_def
            expRemoved_att = expRemoved * remProportional_att
            state_next = mdp1.state(totDebrisLevel, yr + TIME_STEP)
            state_nextApprox = mdp1.state(mdp.approximateState(state_next),
                                          state_next.year)
            if infiniteRewSum:
                # last reward is an infinite sum of future discounted rewards
                # with constant values; this type of reward is not defined for
                # the attacker yet
                if (yr + timeHorizon % TIME_STEP) == finalYear:
                    averagedEndReward = 0
                    indx = 0
                    for thr in thresholdCurves:
                        if thr.threshold == action:
                            break
                        indx += 1
                    size = 0
                    for j in range(95, 100):
                        averagedEndReward += -(
                            share_IA * thresholdCurves[indx].lostAssets[j] +
                            ratioC_L_C_R * thresholdCurves[indx].removed[j])
                        size += 1
                    averagedEndReward = averagedEndReward / size
                    rewardInf = averagedEndReward * (1 / (1 - rewardDiscountGamma))
                    reward = mdp1.getReward_multiAgent(
                        expLostAssets, expRemoved_def) + rewardInf
                else:
                    reward = mdp1.getReward_multiAgent(expLostAssets,
                                                       expRemoved_def)
            else:
                reward = mdp1.getReward_multiAgent(expLostAssets, expRemoved_def)
            reward_att = mdp1.getReward_multiAgent_att(expLostAssets,
                                                       expRemoved_att)
            discReward.append(pow(rewardDiscountGamma, i - 1) * reward)
            discReward_att.append(pow(rewardDiscountGamma, i - 1) * reward_att)
            qLearning.update_Q(Q_table, alpha, rewardDiscountGamma, stateApprox,
                               state_nextApprox, action, reward)
            qLearning.update_Q(Q_table_att, alpha, rewardDiscountGamma,
                               stateApprox, state_nextApprox, action_att,
                               reward_att)
            state = state_next
            if iteration == totIter - 1:
                finalStrategy.append(action)
                finalStrategy_att.append(action_att)
                finalDebrisLevel.append(totDebrisLevel)
                immReward.append(reward)
                immReward_att.append(reward_att)
            i += 1
            yr = (TIME_STEP * i) + 2017
        discRewardEvol.append(sum(discReward))
    print()
    print("++++" * 20)
    print("share: ", share_IA, "ratio: ", ratioC_L_C_R)
    print("Q-learning params, alpha :", alpha, " epsilon: ", epsilon,
          " gamma: ", rewardDiscountGamma)
    print("step in debris size: ", debrisLevelStep)
    print("DEF: Learned sequence of actions: ", finalStrategy)
    print("ATT: Learned sequence of actions: ", finalStrategy_att)
    print("DEF: discounted reward is :", sum(discReward))
    print("ATT: discounted reward is :", sum(discReward_att))
    print("DEF: imm reward is :", sum(immReward))
    print("ATT: imm att reward is :", sum(immReward_att))
    print()
    print("Both tot reward is: ", sum(immReward) + sum(immReward_att))
    print("++++" * 20)
    print()
    return finalStrategy

def showFinalStrategy(finalStrategy):
    years = range(2017, 2017 + 100, TIME_STEP)
    thresholdCurves = getThresholdCurves()
    thresholds = [1000, 2000, 3000, 4000, 5000, 6000, 8000, 9000, 10000, 12000]
    thresholdsNames = ["learned", 1000, 2000, 3000, 4000, 5000, 6000, 8000,
                       9000, 10000, 12000]
    # compare the learned strategy against every fixed-threshold strategy
    strategies = [finalStrategy]
    for thr in thresholds:
        strategies.append([thr] * 52)
    mdp1 = mdp.MDP(thresholdCurves)
    actionsLabels = ["Q-learned strat", "above 1000", "above 2000",
                     "above 3000", "above 4000", "above 5000", "above 6000",
                     "above 8000", "above 9000", "above 10000", "no removal"]
    colorOfExper = ['g', 'g--', 'b--', 'k--', 'y--', 'm--', 'c--', 'k--',
                    'b--', 'g--', 'r--', 'y']
    ff = open(fileN, 'w')
    ix = 0
    for thr in range(len(strategies)):
        finalStrategy = strategies[thr]
        if thr == 0:
            debris0 = mdp1.thresholdCurves[2].totDebris[0]
        else:
            debris0 = mdp1.thresholdCurves[thr - 1].totDebris[0]
        totDebrisLevel = debris0
        state1 = mdp1.state(totDebrisLevel, 2017)
        immReward = []
        discReward = []
        finalDebrisLevel = [totDebrisLevel]
        totRemoved = []
        totLost = []
        i = 0
        yr = 2017
        while (yr - START_YEAR) < 100:
            action = finalStrategy[i]
            expLostAssets, totDebrisLevel, expRemoved, targetThreshold = \
                mdp1.transitionOfStates(state1, action, approach, TIME_STEP)
            totRemoved.append(expRemoved)
            totLost.append(expLostAssets)
            if infiniteRewSum:
                # last reward is an infinite sum of future discounted rewards
                # with constant values
                if (yr + timeHorizon % TIME_STEP) == finalYear:
                    averagedEndReward = 0
                    indx = 0
                    for thre in thresholdCurves:
                        if thre.threshold == action:
                            break
                        indx += 1
                    size = 0
                    for j in range(95, 100):
                        averagedEndReward += -(
                            thresholdCurves[indx].lostAssets[j] * C_L +
                            thresholdCurves[indx].removed[j] * C_R)
                        size += 1
                    averagedEndReward = averagedEndReward / size
                    rewardInf = averagedEndReward * (1 / (1 - rewardDiscountGamma))
                    reward = mdp1.getReward(expLostAssets, expRemoved) + rewardInf
                else:
                    reward = mdp1.getReward(expLostAssets, expRemoved)
            else:
                reward = mdp1.getReward(expLostAssets, expRemoved)
            immReward.append(reward)
            discReward.append(round(pow(rewardDiscountGamma, i - 1) * reward, 4))
            finalDebrisLevel.append(totDebrisLevel)
            i += 1
            yr = (TIME_STEP * i) + 2017
            state1 = mdp1.state(totDebrisLevel, yr)
        print()
        print("Sum of discounted reward for threshold ", thresholdsNames[ix],
              " is ", sum(discReward))
        print("Sum of immediate reward for threshold is ", sum(immReward))
        print("Discounted reward for threshold ", thresholdsNames[ix], " is ",
              np.cumsum(discReward)[::1])
        print("total removed: ", sum(totRemoved))
        print("total lost: ", sum(totLost))
        print()
        ff.write("%s\n" % finalStrategy)
        ff.write("%s\n" % sum(discReward))
        ff.write("%s\n" % np.cumsum(discReward))
        ff.write("\n")
        if ix == 0:
            plt.plot(years, finalDebrisLevel[0:-1], colorOfExper[ix],
                     label=actionsLabels[ix], linewidth=3)
        else:
            plt.plot(years, finalDebrisLevel[0:-1], colorOfExper[ix],
                     label=actionsLabels[ix], linewidth=1.5)
        ix += 1
    ff.close()
    pylab.xlim(2015, 2120)
    plt.ylabel('number of objects', fontsize=20)
    plt.xlabel('year', fontsize=20)
    plt.title(r'Discounted reward - C_R/C_L = 0.3, $\gamma$ = 0.95', fontsize=20)
    plt.grid()
    plt.legend(loc='upper left')
    plt.show()

def findOptimalSequence(attackerFixed):
    thresholdCurves = getThresholdCurves()
    thresholds = [1000, 2000, 3000, 4000, 5000, 6000, 8000, 9000, 10000, 12000]
    mdp1 = mdp.MDP(thresholdCurves)
    yr = 0
    seqAll = []

    def getSeq(seq, action, yr):
        # enumerate every strategy that moves at most one threshold level
        # (stay, up, or down) per time step
        if yr < 100:
            seq.append(action)
            indx = thresholds.index(action)
            yr += TIME_STEP
            getSeq(seq[:], action, yr)
            if action != 12000:
                getSeq(seq[:], thresholds[indx + 1], yr)
            if action != 1000:
                getSeq(seq[:], thresholds[indx - 1], yr)
        else:
            seqAll.append(seq)
        return seqAll

    stratsAll = []
    for thrs in thresholds:
        stratsAll.extend(getSeq([], thrs, yr))
    strats = stratsAll
    allDiscRews = []
    allRews = []
    allRews_att = []
    allRews_both = []
    action_att = attackerFixed
    for strat in strats:
        finalStrategy = strat
        debris0 = mdp1.thresholdCurves[2].totDebris[0]
        totDebrisLevel = debris0
        state1 = mdp1.state(totDebrisLevel, 2017)
        immReward = []
        immReward_att = []
        discReward = []
        finalDebrisLevel = [totDebrisLevel]
        i = 0
        yr = 2017
        while (yr - START_YEAR) < 100:
            action = finalStrategy[i]
            # defender and attacker both act; the joint removal determines
            # the actual transition
            expLostAssets_def, totDebrisLevel_def, expRemoved_def, targetThreshold = \
                mdp1.transitionOfStates(state1, action, approach, TIME_STEP)
            expLostAssets_att, totDebrisLevel_att, expRemoved_att, targetThreshold_att = \
                mdp1.transitionOfStates(state1, action_att, approach, TIME_STEP)
            expRemoved_total = expRemoved_def + expRemoved_att
            if expRemoved_total != 0:
                remProportional_def = expRemoved_def / expRemoved_total
                remProportional_att = expRemoved_att / expRemoved_total
            else:
                remProportional_def = 0
                remProportional_att = 0
            curve_total = mdp1.findCurve(expRemoved_total, state1, approach,
                                         TIME_STEP)
            action_joint = curve_total.threshold
            expLostAssets, totDebrisLevel, expRemoved, targetThreshold = \
                mdp1.transitionOfStates(state1, action_joint, approach, TIME_STEP)
            expRemoved_def = expRemoved * remProportional_def
            expRemoved_att = expRemoved * remProportional_att
            if infiniteRewSum:
                # last reward is an infinite sum of future discounted rewards
                # with constant values
                if (yr + timeHorizon % TIME_STEP) == finalYear:
                    averagedEndReward = 0
                    indx = 0
                    for thre in thresholdCurves:
                        if thre.threshold == action:
                            break
                        indx += 1
                    size = 0
                    for j in range(95, 100):
                        averagedEndReward += -(
                            thresholdCurves[indx].lostAssets[j] * C_L +
                            thresholdCurves[indx].removed[j] * C_R)
                        size += 1
                    averagedEndReward = averagedEndReward / size
                    rewardInf = averagedEndReward * (1 / (1 - rewardDiscountGamma))
                    reward = mdp1.getReward(expLostAssets, expRemoved) + rewardInf
                else:
                    reward = mdp1.getReward(expLostAssets, expRemoved)
            else:
                reward = mdp1.getReward_multiAgent(expLostAssets, expRemoved_def)
            reward_att = mdp1.getReward_multiAgent_att(expLostAssets,
                                                       expRemoved_att)
            immReward.append(reward)
            immReward_att.append(reward_att)
            discReward.append(round(pow(rewardDiscountGamma, i - 1) * reward, 4))
            finalDebrisLevel.append(totDebrisLevel)
            i += 1
            yr = (TIME_STEP * i) + 2017
            state1 = mdp1.state(totDebrisLevel, yr)
        allDiscRews.append(sum(discReward))
        allRews.append(sum(immReward))
        allRews_att.append(sum(immReward_att))
        allRews_both.append(sum(immReward) + sum(immReward_att))
    BR_defender = np.argmax(allRews)
    # among the strategies maximizing the joint reward, pick the one that is
    # best for the defender
    maxRewEnv = max(allRews_both)
    maxRewsEnv = [i for i, j in enumerate(allRews_both) if j == maxRewEnv]
    maxRewsEnv_maxRewDefPom = [allRews[i] for i in maxRewsEnv]
    maxRewsEnv_maxRewDef = maxRewsEnv[np.argmax(maxRewsEnv_maxRewDefPom)]
    print("ratio: ", ratioC_L_C_R, "opponent fixed: ", attackerFixed)
    print("def reward is: ", allRews[BR_defender])
    print("att reward is: ", allRews_att[BR_defender])
    print("tot reward both is: ", allRews[BR_defender] + allRews_att[BR_defender])
    print(strats[BR_defender])
    print()
    print("def reward is: ", allRews[maxRewsEnv_maxRewDef])
    print("att reward is: ", allRews_att[maxRewsEnv_maxRewDef])
    print("env reward both is: ", allRews[maxRewsEnv_maxRewDef] +
          allRews_att[maxRewsEnv_maxRewDef])
    print(strats[maxRewsEnv_maxRewDef])

def episodeStart(self):
    # Posterior sampling: draw one transition model from the posterior for
    # each (state, action) pair, then plan optimally in the sampled MDP.
    for state in self.env.getAllStates():
        for action in range(self.env.getNumActions()):
            self.samples[(state, action)] = \
                self.distribute[(state, action)].sample()
    self.mymdp = mdp.MDP(self.env, self.samples)
    self.policy = self.mymdp.computeOptimalPolicy()

def priors(theta_transition, theta_reward, env, discount):
    '''
    9 states in total and 4 actions: LEFT = 0, DOWN = 1, RIGHT = 2, UP = 3.
    4 types of states: start, goal (+100), frozen, and hole.

        # 'FrozenLakeEnv-v1'
        "SFF",
        "FFG",
        "FHF"

        # 'FrozenLakeEnv-v2'
        "SFF",
        "FFH",
        "FGF"
    '''
    action_space_size = env.action_space.n
    state_space_size = env.observation_space.n
    # Transition function [|A| x |S| x |S'|]
    ass = np.zeros([action_space_size, state_space_size, state_space_size])
    # Use a beta distribution as the conjugate prior to structure the model:
    # each action is split into three types of moves: intended, lateral,
    # and others.
    # Hyper-parameters for the beta prior:
    a_t = theta_transition
    b_t = (1 - theta_transition) / 2
    P = env.P
    states = list(range(state_space_size))
    for j in states:
        sa = P[j]
        for i in range(action_space_size):
            state_next = sa[i][0][1]
            ass[i][j][state_next] = a_t  # intended move
            if (state_next == 0) or (state_next == 8):
                ass[i][j][state_next] = ass[i][j][state_next] + b_t  # intended move
            if state_next <= 7:
                ass[i][j][state_next + 1] = b_t  # lateral move
            if state_next >= 1:
                ass[i][j][state_next - 1] = b_t  # lateral move
            if state_next == 4:
                ass[i][j] = np.where(ass[i][j] != 0, b_t, 0)
                ass[i][j][state_next] = a_t  # lateral move
            if state_next == j:
                if (state_next == 7) or (state_next == 5):
                    ass[i][j] = 0
                    ass[i][j][state_next] = 1 - (1e-5)  # intended move
            # candidate reward states 5 and 7 are absorbing for every action
            ass[i][7] = 0
            ass[i][7][7] = 1 - (1e-5)
            ass[i][5] = 0
            ass[i][5][5] = 1 - (1e-5)
    # reward function: |A| x |S|
    r = np.ones([action_space_size, state_space_size]) * 10
    # We account for uncertainty in the reward:
    a_r = theta_reward
    b_r = (1 - theta_reward) / 2
    r[:, 5] = 100 * a_r  # belief that this is the true reward location
    r[:, 7] = 100 * b_r  # belief that this is the wrong reward location
    mdp_env = mdp.MDP(ass, r, discount)
    return mdp_env

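# A minimal usage sketch (assumes the custom FrozenLake variants above are
# registered with Gym elsewhere in the project; the id and parameter values
# are illustrative):
#   env = gym.make('FrozenLakeEnv-v1')
#   prior_mdp = priors(theta_transition=0.8, theta_reward=0.9, env=env,
#                      discount=0.95)
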
import argparse

import mdp
import mdpSolver

if __name__ == '__main__':
    # parse command-line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--mdp", type=str)
    parser.add_argument("--algorithm", type=str)
    args = parser.parse_args()

    # generate mdp instance
    mdp_instance = mdp.MDP(args.mdp)

    # solve
    if args.algorithm == 'vi':
        mdpSolver.value_iteration(mdp_instance)
    elif args.algorithm == 'hpi':
        mdpSolver.howard_pi(mdp_instance)
    elif args.algorithm == 'lp':
        mdpSolver.lp(mdp_instance)
    else:
        print("unknown algorithm")
    print(mdp_instance.prettyPrint())

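# Example invocation (the script and spec-file names are hypothetical; the
# spec format is whatever mdp.MDP accepts):
#   python main.py --mdp gridworld.txt --algorithm vi
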
import math

import matplotlib.pyplot as plt
import numpy as np

import mdp

# %%
TORAD = math.pi / 180

'''
MDP PARAMETERS
'''
history_duration = 3
mdp_step = 1
time_step = 0.1
SP = -40 * TORAD
sailing_mdp = mdp.MDP(history_duration, mdp_step, time_step)

'''
WIND CONDITIONS
'''
mean = 45 * TORAD
std = 3 * TORAD
wind_samples = 10
WH = np.random.uniform(mean - std, mean + std, size=wind_samples)

'''
MDP INIT
'''
hdg0 = 2 * TORAD * np.ones(wind_samples)
state = sailing_mdp.initializeMDP(hdg0, WH)

'''
Generation of a simulation
'''

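# The script ends at the "Generation of a simulation" header above. A minimal
# continuation sketch, assuming the MDP exposes a step method (`transition` is
# a hypothetical name; check the mdp module for the real one):
#   for _ in range(int(mdp_step / time_step)):
#       WH = np.random.uniform(mean - std, mean + std, size=wind_samples)
#       state = sailing_mdp.transition(action, WH)
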
import numpy as np

import gridw
import mdp
import mdp_optimizer as mdpo
import utils

# student MDP: (state, action, next_state, probability, reward) transitions
transitions = [('c1', 'facebook', 'distraction', 1.0, -1),
               ('c1', 'study', 'c2', 1.0, -2),
               ('distraction', 'facebook', 'distraction', 1.0, -1),
               ('distraction', 'quit', 'c1', 1.0, 0),
               ('c2', 'study', 'c3', 1.0, -2),
               ('c2', 'sleep', 'rest', 1.0, 0),
               ('c3', 'study', 'rest', 1.0, 10),
               ('c3', 'pub', 'c1', 0.2, 1),
               ('c3', 'pub', 'c2', 0.4, 1),
               ('c3', 'pub', 'c3', 0.4, 1)]

test_mdp = mdp.MDP(['c1', 'distraction', 'c2', 'c3', 'rest'],
                   ['facebook', 'quit', 'study', 'sleep', 'pub'],
                   transitions, {'rest'}, 1)

# each row is one state's probability distribution over actions
test_policy = np.array([
    (0.5, 0.0, 0.5, 0.0, 0.0),
    (0.5, 0.5, 0.0, 0.0, 0.0),
    (0.0, 0.0, 0.5, 0.5, 0.0),
    (0.0, 0.0, 0.5, 0.0, 0.5),
    (0.0, 0.0, 0.0, 1.0, 0.0),
], dtype=float)

print(mdpo.first_pass_monte_carlo(test_mdp, test_policy, 10000, .01))
# print(mdpo.every_pass_monte_carlo(test_mdp, test_policy, 10000))
# print(mdpo.calc_value_func_dynamic(test_mdp, test_policy, 1000))
print(mdpo.temporal_difference(test_mdp, test_policy, 'c1', 0, .01, -1))