def __init__(self, question, testDict):
    super(ApproximateQLearningTest, self).__init__(question, testDict)
    self.discount = float(testDict['discount'])
    self.grid = gridworld.Gridworld(parseGrid(testDict['grid']))
    if 'noise' in testDict:
        self.grid.setNoise(float(testDict['noise']))
    if 'livingReward' in testDict:
        self.grid.setLivingReward(float(testDict['livingReward']))
    self.env = gridworld.GridworldEnvironment(self.grid)
    self.epsilon = float(testDict['epsilon'])
    self.learningRate = float(testDict['learningRate'])
    self.extractor = 'IdentityExtractor'
    if 'extractor' in testDict:
        self.extractor = testDict['extractor']
    self.opts = {'actionFn': self.env.getPossibleActions,
                 'epsilon': self.epsilon,
                 'gamma': self.discount,
                 'alpha': self.learningRate}
    numExperiences = int(testDict['numExperiences'])
    maxPreExperiences = 10
    # list() so we can append below; a bare range() is not mutable in Python 3
    self.numsExperiencesForDisplay = list(range(min(numExperiences, maxPreExperiences)))
    self.testOutFile = testDict['test_out_file']
    if maxPreExperiences < numExperiences:
        self.numsExperiencesForDisplay.append(numExperiences)
def __init__(self, question, testDict):
    super(EpsilonGreedyTest, self).__init__(question, testDict)
    self.discount = float(testDict['discount'])
    self.grid = gridworld.Gridworld(parseGrid(testDict['grid']))
    if 'noise' in testDict:
        self.grid.setNoise(float(testDict['noise']))
    if 'livingReward' in testDict:
        self.grid.setLivingReward(float(testDict['livingReward']))
    self.env = gridworld.GridworldEnvironment(self.grid)
    self.epsilon = float(testDict['epsilon'])
    self.learningRate = float(testDict['learningRate'])
    self.numExperiences = int(testDict['numExperiences'])
    self.numIterations = int(testDict['iterations'])
    self.opts = {'actionFn': self.env.getPossibleActions,
                 'epsilon': self.epsilon,
                 'gamma': self.discount,
                 'alpha': self.learningRate}
def __init__(self, question, testDict):
    super(ValueIterationTest, self).__init__(question, testDict)
    self.discount = float(testDict['discount'])
    self.grid = gridworld.Gridworld(parseGrid(testDict['grid']))
    iterations = int(testDict['valueIterations'])
    if 'noise' in testDict:
        self.grid.setNoise(float(testDict['noise']))
    if 'livingReward' in testDict:
        self.grid.setLivingReward(float(testDict['livingReward']))
    maxPreIterations = 10
    # list() so we can append below; a bare range() is not mutable in Python 3
    self.numsIterationsForDisplay = list(range(min(iterations, maxPreIterations)))
    self.testOutFile = testDict['test_out_file']
    if maxPreIterations < iterations:
        self.numsIterationsForDisplay.append(iterations)
def __init__(self, question, testDict):
    super(QLearningTest, self).__init__(question, testDict)
    self.discount = float(testDict['discount'])
    self.grid = gridworld.Gridworld(parseGrid(testDict['grid']))
    if 'noise' in testDict:
        self.grid.setNoise(float(testDict['noise']))
    if 'livingReward' in testDict:
        self.grid.setLivingReward(float(testDict['livingReward']))
    self.env = gridworld.GridworldEnvironment(self.grid)
    self.epsilon = float(testDict['epsilon'])
    self.learningRate = float(testDict['learningRate'])
    self.opts = {'actionFn': self.env.getPossibleActions,
                 'epsilon': self.epsilon,
                 'gamma': self.discount,
                 'alpha': self.learningRate}
    numExperiences = int(testDict['numExperiences'])
    maxPreExperiences = 10
    self.numsExperiencesForDisplay = list(range(min(numExperiences, maxPreExperiences)))
    self.testOutFile = testDict['test_out_file']
    # derive the experience-log name from the test file path
    if sys.platform == 'win32':
        _, question_name, test_name = testDict['test_out_file'].split('\\')
    else:
        _, question_name, test_name = testDict['test_out_file'].split('/')
    self.experiences = Experiences(test_name.split('.')[0])
    if maxPreExperiences < numExperiences:
        self.numsExperiencesForDisplay.append(numExperiences)
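# For orientation, a minimal sketch of the kind of testDict these test
# constructors consume. The key names are exactly the lookups made above;
# the concrete values and the grid layout are illustrative assumptions, not
# taken from any real test file. Values arrive as strings and are cast with
# float()/int() by the constructors.
exampleTestDict = {
    'discount': '0.9',
    'grid': """
        _    _    _    1
        _    #    _   -1
        S    _    _    _
    """,
    'noise': '0.2',             # optional
    'livingReward': '0.0',      # optional
    'epsilon': '0.3',
    'learningRate': '0.5',
    'numExperiences': '30',
    'iterations': '100',        # EpsilonGreedyTest
    'valueIterations': '100',   # ValueIterationTest
    'test_out_file': 'test_cases/q1/example.test_output',
}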
def main(grid_size, discount, n_trajectories, learning_rate):
    wind = 0.3
    gw = gridworld.Gridworld(grid_size, wind, discount)
    ground_r = np.array([gw.reward(s) for s in range(gw.n_states)])
    r = Apprenticeship.irl(gw, n_trajectories, learning_rate)

    plt.subplot(1, 2, 1)
    plt.pcolor(ground_r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth reward")
    plt.subplot(1, 2, 2)
    plt.pcolor(r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Recovered reward")
    plt.show()
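# A minimal sketch of how main() above might be invoked; the argument
# values are illustrative assumptions, not taken from the source.
if __name__ == '__main__':
    main(grid_size=5, discount=0.9, n_trajectories=20, learning_rate=0.01)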
def lp_irl_gridworld(grid_size, discount):
    wind = 0.3
    traj_len = 3 * grid_size  # unused here; kept from the original
    gw = gridworld.Gridworld(grid_size, wind, discount)
    gt_reward = np.array([gw.reward(s) for s in range(gw.n_states)])
    policy = [gw.optimal_policy_deterministic(s) for s in range(gw.n_states)]
    r = lp_irl.compute_reward(gw.n_states, gw.n_actions,
                              gw.transition_probability, policy,
                              gw.discount, 1, 5)

    plt.subplot(1, 2, 1)
    plt.pcolor(gt_reward.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth reward")
    plt.subplot(1, 2, 2)
    # plot the recovered reward here, not the ground truth a second time
    plt.pcolor(r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Recovered reward")
    plt.show()
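# Likewise, a hypothetical invocation of the LP IRL demo above; the
# argument values are assumptions for illustration.
if __name__ == '__main__':
    lp_irl_gridworld(grid_size=10, discount=0.9)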
def main(grid_size, discount, n_trajectories, epochs, learning_rate):
    wind = 0.3
    trajectory_length = 3 * grid_size
    gw = gridworld.Gridworld(grid_size, wind, discount)

    #trajectories = gw.my_generate_trajectories(n_trajectories, trajectory_length, gw.optimal_policy)
    #trajectories = gw.my_generate_trajectories_some_without_goal(n_trajectories, trajectory_length, gw.optimal_policy)
    trajectories = gw.my_generate_trajectories_multiple(
        n_trajectories, trajectory_length, gw.optimal_policy)

    feature_matrix = gw.feature_matrix()
    #feature_matrix = gw.feature_matrix_goalVsOther()
    #feature_matrix = gw.feature_matrix_goalVsOtherTwo()
    #feature_matrix = gw.feature_matrix_goalVsOtherThree()

    # ground truth given by us, as we know which states are good vs bad
    ground_r = np.array([gw.reward(s) for s in range(gw.n_states)])

    # reward recovered using the IRL algorithm
    recovered_reward = maxent.irl(feature_matrix, gw.n_actions, discount,
                                  gw.transition_probability, trajectories,
                                  epochs, learning_rate)

    # standardise the recovered reward before plotting
    scaler = StandardScaler()
    standardised_reward = scaler.fit_transform(recovered_reward.reshape(-1, 1))
    #print(recovered_reward)
    #print(standardised_reward)

    plt.subplot(1, 2, 1)
    plt.pcolor(ground_r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth reward")
    plt.subplot(1, 2, 2)
    plt.pcolor(standardised_reward.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Recovered reward")
    plt.show()
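# A plausible driver for the MaxEnt demo above; the argument values are
# assumptions chosen to mirror the constants in the script fragment that
# follows, not taken from the original.
if __name__ == '__main__':
    main(grid_size=5, discount=0.01, n_trajectories=25, epochs=700,
         learning_rate=0.01)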
        l1=l1, l2=l2)  # tail of a truncated call; its opening lines are not shown
    scaler = StandardScaler()
    standardised_reward = scaler.fit_transform(recovered_reward.reshape(-1, 1))
    plot.plot(ground_r, standardised_reward, grid_size)


grid_size = 5
discount = 0.01
n_trajectories = 25
epochs = 700
learning_rate = 0.01
wind = 0.3
trajectory_length = 3 * grid_size

gw = gridworld.Gridworld(grid_size, wind, discount)
ground_r = np.array([gw.reward(s) for s in range(gw.n_states)])
feature_matrix = gw.feature_matrix()
#feature_matrix = gw.feature_matrix_goalVsOther()
#feature_matrix = gw.feature_matrix_goalVsOtherTwo()
#feature_matrix = gw.feature_matrix_goalVsOtherThree()
feature_space = feature_matrix.shape[1]

#trajectories = gw.my_generate_trajectories(n_trajectories, trajectory_length, gw.optimal_policy)
trajectories = gw.my_generate_trajectories_some_without_goal(
    n_trajectories, trajectory_length, gw.optimal_policy)
n_states, d_states = feature_matrix.shape
no_of_iterations = 20
structure = (3, 3)
def do_turn(self, ants):
    # track all moves, prevent collisions
    orders = {}

    def do_move_direction(loc, direction):
        # ants.destination takes care of wrapping around and returns the
        # destination; we then issue the move order.
        new_loc = ants.destination(loc, direction)
        # orders maps destination squares to the ants moving onto them
        if ants.unoccupied(new_loc) and new_loc not in orders:
            ants.issue_order((loc, direction))
            orders[new_loc] = loc
            return True
        else:
            return False

    targets = {}

    def do_move_location(loc, directions):
        # ants.direction takes a location and a destination and returns a
        # list of the closest directions "as the crow flies". If the target
        # is up and to the left it returns ['n', 'w'] and we try to move the
        # ant in one of the two directions; if the target is directly down
        # it returns ['s'], a list of one item.
        for direction in directions:
            if do_move_direction(loc, direction):
                #targets[dest] = loc
                return True
        return False

    # -------------------------------- starts from here --------------------------------
    self.turn = self.turn + 1

    # my hills
    for hill_loc in ants.my_hills():
        x, y = hill_loc
        self.grid[x][y] = self.MYHILL
        # The dummy entry doesn't need a from-location, so the value is just
        # None; this prevents ants from stepping on our own hill.
        orders[hill_loc] = None

    # enemy hills
    for hill_loc, hill_owner in ants.enemy_hills():
        hillrow, hillcol = hill_loc
        self.grid[hillrow][hillcol] = self.ENEMYHILL

    # land, water, food
    for i in range(ants.rows):
        for j in range(ants.cols):
            #if ((ants.visible((i,j))==True) or (self.grid[i][j]==(self.FOOD or self.ENEMYANTS or self.BOUNDARY2 or self.ENEMYANTS2))):
            #    self.grid[i][j] = ' '
            #    self.gridu[i][j] = 'v'
            if ants.visible((i, j)):
                self.gridu[i][j] = 'v'
                if self.grid[i][j] not in (self.MYHILL, self.MYHILL2,
                                           self.ENEMYHILL, self.WATER):
                    self.grid[i][j] = ' '
                elif self.grid[i][j] in (self.FOOD, self.ENEMYANTS,
                                         self.BOUNDARY2, self.ENEMYANTS2,
                                         self.MYANTS):
                    self.grid[i][j] = ' '
            if ants.map[i][j] == -3:
                self.grid[i][j] = self.FOOD
            elif ants.map[i][j] == -4:
                self.grid[i][j] = self.WATER
            # if I can't see my hill, retreat to it urgently
            if self.grid[i][j] == self.MYHILL:
                if not ants.visible((i, j)):
                    print >> sys.stderr, 'hill retreat', i, j
                    sys.stderr.flush()
                    self.grid[i][j] = self.MYHILL2
                else:
                    self.grid[i][j] = self.MYHILL
            if self.grid[i][j] == self.ENEMYHILL:
                print >> sys.stderr, 'hill attack!!!!!!!!!!!!!!!!!!', i, j
                sys.stderr.flush()

    # my ants: mark the centroid of my ants
    num_ants = 0
    sx = 0
    sy = 0
    for ant_loc in ants.my_ants():
        antrow, antcol = ant_loc
        sx = sx + antrow
        sy = sy + antcol
        #self.grid[antrow][antcol] = self.MYANTS
        num_ants = num_ants + 1
    sx = int(sx / num_ants)
    sy = int(sy / num_ants)
    self.grid[sx][sy] = self.MYANTS

    ## change MODE
    if num_ants >= 0:  ##(ants.rows * ants.cols / 200):
        self.BOUNDARY = self.BOUNDARY2
    else:
        self.BOUNDARY = ' '

    # enemy ants
    for enemy_loc, enemy_owner in ants.enemy_ants():
        enemyrow, enemycol = enemy_loc
        # TODO: if own ant concentration is good near the enemy ant
        # (enemy ant concentration in the area), then a positive reward
        self.grid[enemyrow][enemycol] = self.ENEMYANTS
        # if they're near my base, retreat to base
        for hill_loc in ants.my_hills():
            x, y = hill_loc
            if ants.distance(hill_loc, enemy_loc) < 9.0:
                self.grid[enemyrow][enemycol] = self.ENEMYANTS2
        # if I can surround them, attack. TODO: also check the enemy density
        surround = 0
        for ant_loc in ants.my_ants():
            antrow, antcol = ant_loc
            if ants.distance(ant_loc, enemy_loc) < 6.0:
                surround = surround + 1
        if surround >= 3:
            self.grid[enemyrow][enemycol] = self.ENEMYANTS2
            print >> sys.stderr, 'surrounded, attack: ', enemyrow, enemycol
            sys.stderr.flush()

    # boundary expanding: mark visible empty squares that border the fog
    for i in range(ants.rows):
        for j in range(ants.cols):
            if self.gridu[i][j] == 'v' and self.grid[i][j] == ' ':
                if (not ants.visible(ants.destination((i, j), 'n'))
                        or not ants.visible(ants.destination((i, j), 'e'))
                        or not ants.visible(ants.destination((i, j), 'w'))
                        or not ants.visible(ants.destination((i, j), 's'))):
                    self.grid[i][j] = self.BOUNDARY

    # ------------------------------------------------------------ VALUE ITERATION
    #opts = {'agent': 'value', 'discount': 0.9, 'iters': 200, 'noise': 0.01, 'livingReward': 0.0, 'epsilon': 0.0, 'pause': False, 'manual': False, 'quiet': True, 'episodes': 100, 'learningRate': 0.5, 'grid': 'BookGrid', 'gridSize': 150, 'speed': 1000.0, 'textDisplay': False}
    opts = {'livingReward': 0.0,
            'discount': 0.9,
            'iters': 300,
            'noise': 0.05,
            'epsilon': 0.0,
            'manual': False,
            'quiet': True,
            'agent': 'value',
            'pause': False,
            'episodes': 100,
            'learningRate': 0.5,
            'grid': 'BookGrid',
            'gridSize': 150,
            'speed': 1000.0,
            'textDisplay': False}
    mdp = gridworld.Gridworld(self.grid)
    mdp.setLivingReward(opts['livingReward'])
    mdp.setNoise(opts['noise'])
    env = gridworld.GridworldEnvironment(mdp)

    ###########################
    # GET THE AGENT
    ###########################
    # budget the solver so it answers before the turn-time limit
    #time_to_spare = (ants.turntime/1000.0) - (0.00064286 + 0.0000547619*num_ants + 0.0000065476*(num_ants*num_ants)) - 0.01
    if num_ants <= 60:
        time_to_spare = (ants.turntime / 1000.0) - 0.03
    else:
        time_to_spare = (ants.turntime / 1000.0) - (
            -0.003512 + 0.00047632 * num_ants
            - 0.00000105286 * (num_ants * num_ants)) - 0.005
    a = valueIterationAgents.ValueIterationAgent(ants.turn_start_time,
                                                 time_to_spare, mdp,
                                                 opts['discount'],
                                                 opts['iters'])

    #t1 = time.time()
    for ant_loc in ants.my_ants():
        antcol, antrow = ant_loc
        antcol = ants.rows - antcol - 1
        inverted_ant_loc = (antrow, antcol)
        # if all four Q-values tie, the policy is uninformative; move randomly
        if (a.getQValue(inverted_ant_loc, 'north')
                == a.getQValue(inverted_ant_loc, 'south')
                == a.getQValue(inverted_ant_loc, 'east')
                == a.getQValue(inverted_ant_loc, 'west')):
            direct = random.choice('sewn')
        elif a.getPolicy(inverted_ant_loc) == 'north':
            direct = 'n'
        elif a.getPolicy(inverted_ant_loc) == 'south':
            direct = 's'
        elif a.getPolicy(inverted_ant_loc) == 'east':
            direct = 'e'
        elif a.getPolicy(inverted_ant_loc) == 'west':
            direct = 'w'
        else:
            direct = random.choice('sewn')
        do_move_location(ant_loc, direct)
    #t2 = time.time() - t1

    print >> sys.stderr, 'turn: ', self.turn, 'ants :', num_ants, \
        'spare:', time_to_spare, 'time:', (time.time() - ants.turn_start_time)
    sys.stderr.flush()

    # unblock own hill
    for hill_loc in ants.my_hills():
        if hill_loc in ants.my_ants() and hill_loc not in orders.values():
            for direction in ('s', 'e', 'w', 'n'):
                if do_move_direction(hill_loc, direction):
                    break
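# For context, a sketch of how a bot like this is typically wired up in the
# Ants starter kit. The MyBot class name and the Ants.run() harness are
# assumptions based on the standard starter-kit layout; they do not appear
# in the excerpt above.
if __name__ == '__main__':
    try:
        Ants.run(MyBot())
    except KeyboardInterrupt:
        print('ctrl-c, leaving ...')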
    sol = solvers.lp(matrix(c), matrix(A), matrix(b))
    rewards = sol['x'][:n_states]
    rewards = utils.normalize(rewards) * R_MAX
    return rewards


if __name__ == '__main__':
    print("\n*** Gridworld: Value Iteration demo ***\n")

    # Create gridworld
    trans_prob = 0.7
    size_grid = 10
    gamma = 0.5
    gw = gridworld.Gridworld(size_grid, trans_prob)

    # Convert gridworld into finite discrete MDP format
    n_states, n_actions, p_trans, rewards, terminal_state_1d = gw.get_MDP_format()

    # Run the value iteration algorithm
    v_states = value_iteration.run_value_iteration(n_states, n_actions, p_trans,
                                                   rewards, terminal_state_1d,
                                                   gamma)
    v_states = np.reshape(v_states, gw.grid.shape, order='F')

    # Find the optimal policy
    policy_opt = value_iteration.get_optimal_policy(n_states, n_actions, p_trans,
                                                    rewards, terminal_state_1d,
                                                    gamma)
def main(grid_size, discount):
    """
    Run multi-agent linear programming inverse reinforcement learning on the
    gridworld MG. Plots the reward function.

    grid_size: Grid size. int.
    discount: MG discount factor. float.
    """
    play_num = 2
    gw = gridworld.Gridworld(play_num, grid_size, discount)
    act = np.array(gw.actions)
    policy_tu = [((0, 1), (1, 0)), ((0, 1), (0, 1)), ((0, 1), (1, 0)), ((0, 1), (0, -1)),
                 ((0, 1), (0, 1)), ((-1, 0), (0, 1)), ((-1, 0), (-1, 0)), ((-1, 0), (-1, 0)),
                 ((-1, 0), (0, 1)), ((-1, 0), (0, 1)), ((-1, 0), (0, 1)), ((-1, 0), (0, 1)),
                 ((-1, 0), (1, 0)), ((-1, 0), (-1, 0)), ((-1, 0), (1, 0)), ((-1, 0), (0, -1))]
    policy = np.zeros(gw.n_states, dtype=int)
    for i in range(gw.n_states):
        policy[i] = int(tu_action(act, policy_tu[i]))
    ground_r = np.array([gw.reward(s) for s in range(gw.n_states)])
    print(ground_r)
    r = linear_irl.irl(gw.n_players, gw.actions, gw.n_states, gw.n_actions,
                       gw.transition_probability, policy, gw.discount, 10, 0)
    print(r)

    # note: the hard-coded (4, 4) reshapes below assume grid_size == 4
    a = ground_r[:, 0].reshape((4, 4))
    b = ground_r[:, 1].reshape((4, 4))
    a_1 = r[:16].reshape((4, 4))
    b_1 = r[16:].reshape((4, 4))
    print(b, b_1)

    fig, axes = plt.subplots(nrows=1, ncols=2)
    im1 = axes.flat[0].imshow(a)
    axes.flat[0].set_xlabel("B's square", fontsize=13)
    axes.flat[0].set_ylabel("A's square", fontsize=13)
    axes.flat[0].set_title("A's reward", fontsize=13)
    im2 = axes.flat[1].imshow(b)
    axes.flat[1].set_xlabel("B's square", fontsize=13)
    axes.flat[1].set_title("B's reward", fontsize=13)
    fig.subplots_adjust(right=0.8)
    cbar_ax = fig.add_axes([0.85, 0.25, 0.05, 0.5])
    fig.colorbar(im1, cax=cbar_ax)
    plt.show()

    plt.subplot(2, 2, 3)
    plt.pcolormesh(a_1)
    plt.colorbar()
    plt.title("a_1")
    plt.subplot(2, 2, 4)
    plt.pcolormesh(b_1)
    plt.colorbar()
    plt.title("b_1")
    plt.show()
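# tu_action() is called above but not defined in this excerpt. A minimal
# sketch of what it plausibly does -- map a tuple of per-player actions to a
# single joint-action index -- under the assumptions that gw.actions is a
# list of (dx, dy) tuples and that the first player's action varies slowest:
def tu_action(act, action_tuple):
    # index of each player's action within the action array
    idx = [int(np.where((act == np.array(a)).all(axis=1))[0][0])
           for a in action_tuple]
    return idx[0] * len(act) + idx[1]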
def __init__(self, question, testDict):
    super(PolicyIterationTest, self).__init__(question, testDict)
    self.discount = float(testDict['discount'])
    self.grid = gridworld.Gridworld(parseGrid(testDict['grid']))
    if 'livingReward' in testDict:
        self.grid.setLivingReward(float(testDict['livingReward']))
def test_manual_sums_to_one(self):
    """Tests issue #1 on GitHub."""
    gw = gridworld.Gridworld(5, 0.3, 0.2)
    self.assertTrue(
        np.isclose(gw.transition_probability.sum(axis=2), 1).all())
def make_random_gridworld():
    grid_size = rn.randint(2, 15)
    wind = rn.uniform(0.0, 1.0)
    discount = rn.uniform(0.0, 1.0)
    return gridworld.Gridworld(grid_size, wind, discount)
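# A sketch of how make_random_gridworld() might back a property-style test,
# re-using the transition-probability invariant checked in the unittest
# method above; the trial count of 20 is an arbitrary assumption.
def test_random_gridworlds_sum_to_one(self):
    for _ in range(20):
        gw = make_random_gridworld()
        self.assertTrue(
            np.isclose(gw.transition_probability.sum(axis=2), 1).all())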