def main():
    env = GridWorld(3, 4)

    #Define the state matrix
    state_matrix = np.zeros((3,4))
    state_matrix[0, 3] = 1
    state_matrix[1, 3] = 1
    state_matrix[1, 1] = -1
    print("State Matrix:")
    print(state_matrix)

    #Define the reward matrix
    reward_matrix = np.full((3,4), -0.04)
    reward_matrix[0, 3] = 1
    reward_matrix[1, 3] = -1
    print("Reward Matrix:")
    print(reward_matrix)

    #Define the transition matrix
    transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1],
                                  [0.1, 0.8, 0.1, 0.0],
                                  [0.0, 0.1, 0.8, 0.1],
                                  [0.1, 0.0, 0.1, 0.8]])

    state_action_matrix = np.random.random((4,12))
    print("State-Action Matrix:")
    print(state_action_matrix)

    env.setStateMatrix(state_matrix)
    env.setRewardMatrix(reward_matrix)
    env.setTransitionMatrix(transition_matrix)

    utility_matrix = np.zeros((3,4))
    print("Utility Matrix:")
    print(utility_matrix)

    gamma = 0.999
    alpha = 0.001  #constant step size
    beta_matrix = np.zeros((4,12))
    tot_epoch = 300000
    print_epoch = 1000

    for epoch in range(tot_epoch):
        #Reset and return the first observation
        observation = env.reset(exploring_starts=True)
        for step in range(1000):
            #Estimating the action through Softmax
            col = observation[1] + (observation[0]*4)
            action_array = state_action_matrix[:, col]
            action_distribution = softmax(action_array)
            action = np.random.choice(4, 1, p=action_distribution)
            #To enable the beta parameter, enable the line below
            #and add beta_matrix=beta_matrix in the update actor function
            #beta_matrix[action,col] += 1  #increment the counter
            #Move one step in the environment and get obs and reward
            new_observation, reward, done = env.step(action)
            utility_matrix, delta = update_critic(utility_matrix, observation,
                                                  new_observation, reward,
                                                  alpha, gamma)
            state_action_matrix = update_actor(state_action_matrix, observation,
                                               action, delta, beta_matrix=None)
            observation = new_observation
            if done: break
        if(epoch % print_epoch == 0):
            print("")
            print("Utility matrix after " + str(epoch+1) + " iterations:")
            print(utility_matrix)
            print("")
            print("State-Action matrix after " + str(epoch+1) + " iterations:")
            print(state_action_matrix)

    #Time to check the utility matrix obtained
    print("Utility matrix after " + str(tot_epoch) + " iterations:")
    print(utility_matrix)
    print("State-Action matrix after " + str(tot_epoch) + " iterations:")
    print(state_action_matrix)
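# The actor-critic main() above calls softmax(), update_critic() and update_actor(),
# which are not shown in the snippet. Below is a minimal sketch of those helpers,
# following the standard TD actor-critic update; the exact bodies are assumptions
# and may differ from the original implementation.
import numpy as np

def softmax(x):
    """Numerically stable softmax over a 1D array of action preferences."""
    e = np.exp(x - np.amax(x))
    return e / np.sum(e)

def update_critic(utility_matrix, observation, new_observation, reward, alpha, gamma):
    """TD(0) critic update; returns the updated utilities and the TD error delta."""
    u = utility_matrix[observation[0], observation[1]]
    u_t1 = utility_matrix[new_observation[0], new_observation[1]]
    delta = reward + gamma * u_t1 - u
    utility_matrix[observation[0], observation[1]] += alpha * delta
    return utility_matrix, delta

def update_actor(state_action_matrix, observation, action, delta, beta_matrix=None):
    """Raise/lower the preference of the taken action in proportion to delta."""
    col = observation[1] + (observation[0] * 4)
    beta = 1.0 if beta_matrix is None else 1.0 / beta_matrix[action, col]
    state_action_matrix[action, col] += beta * delta
    return state_action_matrix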
def main():
    world = GridWorld()
    q_table = np.zeros([len(world.available_actions()), 7, 10])
    q_table = train(world, q_table)
    moves = evaluate(world, q_table)
    print('Moves: ' + str(moves))
    print('Steps: ' + str(len(moves)))
def init_or(): '''Init the OR boolean environment @return the environment gridworld object ''' env = GridWorld(5, 5) #Define the state matrix state_matrix = np.array([[1.0, 0.0, 0.0, 0.0, 1.0], [0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0, 1.0]]) #Define the index matrix index_matrix = np.array([[(4,0), (4,1), (4,2), (4,3), (4,4)], [(3,0), (3,1), (3,2), (3,3), (3,4)], [(2,0), (2,1), (2,2), (2,3), (2,4)], [(1,0), (1,1), (1,2), (1,3), (1,4)], [(0,0), (0,1), (0,2), (0,3), (0,4)]]) #Define the reward matrix reward_matrix = np.array([[1.0, 0.0, 0.0, 0.0, 1.0], [0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0], [-1.0, 0.0, 0.0, 0.0, 1.0]]) #Define the transition matrix transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1], [0.1, 0.8, 0.1, 0.0], [0.0, 0.1, 0.8, 0.1], [0.1, 0.0, 0.1, 0.8]]) env.setStateMatrix(state_matrix) env.setIndexMatrix(index_matrix) env.setRewardMatrix(reward_matrix) env.setTransitionMatrix(transition_matrix) return env, np.zeros((5,5))
def q_learning(q_tables, gamma=GAMMA, alpha=0.001, number_of_episodes=10000, max_step_number=1000): policy_list = [] # contains the final approximate optimal policy env = GridWorld() actions = ['up', 'down', 'left', 'right'] indexes_actions = {-4: 0, 4: 1, -1: 2, 1: 3} rewards = 0 for episode in range(number_of_episodes): obs = env.reset() number = 0 # the number of steps in one episode which is no more than max_step_number while True: # one episode action = epsilon_greedy(obs, q_tables) # action = A action_index = indexes_actions[action] next_obs, reward, done, _ = env.step( action) # next_obs = S', reward = R rewards += reward q_tables[obs][ action_index] = q_tables[obs][action_index] + alpha * ( reward + gamma * max(q_tables[next_obs]) - q_tables[obs][action_index]) obs = next_obs number += 1 if done == 1 or number == max_step_number: # reach final state or max number step break for row in range(len(q_tables)): policy_list.append(actions[np.argmax(q_tables[row])]) performance = rewards / number_of_episodes optimal_policy = np.array(policy_list).reshape(4, 4) return q_tables, optimal_policy, performance
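# epsilon_greedy() is assumed by the q_learning() snippet above but not shown.
# A minimal sketch consistent with how it is called there: it takes the current
# observation and the Q-table and returns one of the action offsets {-4, 4, -1, 1}
# (up, down, left, right on a 4x4 grid). EPSILON and ACTION_OFFSETS are assumptions.
import numpy as np

EPSILON = 0.1
ACTION_OFFSETS = [-4, 4, -1, 1]  # same index order as indexes_actions in q_learning()

def epsilon_greedy(obs, q_tables, epsilon=EPSILON):
    if np.random.uniform() < epsilon:
        return np.random.choice(ACTION_OFFSETS)           # explore
    return ACTION_OFFSETS[int(np.argmax(q_tables[obs]))]  # exploit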
def main(): env = GridWorld(3, 4) state_matrix = np.zeros((3, 4)) state_matrix[0, 3] = 1 state_matrix[1, 3] = 1 state_matrix[1, 1] = -1 print state_matrix reward_matrix = np.full((3, 4), -0.04) reward_matrix[0, 3] = 1 reward_matrix[1, 3] = -1 print reward_matrix transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1], [0.1, 0.8, 0.1, 0.0], [0.0, 0.1, 0.8, 0.1], [0.1, 0.0, 0.1, 0.8]]) state_action_matrix = np.random.random((4, 12)) print "State Action matrix" print state_action_matrix env.setStateMatrix(state_matrix) env.setRewardMatrix(reward_matrix) env.setTransitionMatrix(transition_matrix) utility_matrix = np.zeros((3, 4)) print "utility matrix" print utility_matrix gamma, alpha, tot_epoch, print_epoch = 0.999, 0.1, 30000, 1000 for epoch in range(tot_epoch): observation = env.reset(exploring_starts=True) for step in range(1000): col = observation[1] + (4 * observation[0]) # Sending Action to Environment action_array = state_action_matrix[:, col] action_distribution = softmax(action_array) action = np.random.choice(4, 1, p=action_distribution) new_observation, reward, done = env.step(action) # Update Critic utility_matrix, delta = update_critic(utility_matrix, alpha, observation, new_observation, reward, gamma) # Update Actor state_action_matrix = update_actor(state_action_matrix, observation, action, delta, beta_matrix=None) observation = new_observation if done: break print "final utility matrix" print utility_matrix print "final state action matrix" print state_action_matrix
def _run(FLAGS, model_cls): logger = logging.getLogger('Trainer_%s' % model_cls.__name__) logger.setLevel(logging.INFO) file_handler = logging.FileHandler('%s.log' % model_cls.__name__) file_handler.setLevel(logging.INFO) stream_handler = logging.StreamHandler() stream_handler.setLevel(logging.INFO) formatter = logging.Formatter('[%(asctime)s] ## %(message)s') file_handler.setFormatter(formatter) stream_handler.setFormatter(formatter) logger.addHandler(file_handler) logger.addHandler(stream_handler) hparams = tf.contrib.training.HParams(**COMMON_HPARAMS.values()) hparams.set_hparam('batch_size', FLAGS.bs) hparams.set_hparam('n_steps', FLAGS.stp) hparams.set_hparam('n_dims', FLAGS.dims) hparams.set_hparam('n_info_dims', FLAGS.info_dims) hparams.set_hparam('n_att_dims', FLAGS.att_dims) hparams.set_hparam('max_epochs', FLAGS.epochs) hparams.set_hparam('checkpoint', FLAGS.ckpt) hparams.set_hparam('n_heads', FLAGS.heads) hparams.set_hparam('n_selfatt_dims', FLAGS.selfatt_dims) assert hparams.n_dims == hparams.n_info_dims + hparams.n_att_dims, "`n_dims` should be equal to the sum of `n_info_dims` and `n_att_dims`" assert hparams.n_dims == hparams.n_heads * hparams.n_selfatt_dims, "`n_dims` should be equal to the product of `n_heads` and `n_selfatt_dims`" name_size = 'SZ%d-STP%d' % (FLAGS.sz, FLAGS.stp) config_size = Config(size=FLAGS.sz, max_steps=FLAGS.stp) for name_std, config_std in CONFIG_STDS.iteritems(): for name_drop, config_drop in CONFIG_DROPS.iteritems(): for name_direction, config_direction in CONFIG_DIRECTIONS.iteritems( ): config = Config() config.add('base', 'base', CONFIG_BASE) config.add('size', name_size, config_size) config.add('direction', name_direction, config_direction) config.add('drop', name_drop, config_drop) config.add('std', name_std, config_std) gridworld = GridWorld(name=config.get_name(), **config.get_kwargs()) for seed in GRIDWORLD_SEEDS: data_dir = '%s-SEED%d' % (config.get_name(), seed) gridworld.load(data_dir, seed=seed, splitting_seed=SPLITTING_SEED) dataset_name = config.get_name() for shuffling_seed in SHUFFLING_SEEDS: dataset = Dataset(dataset_name, os.path.join(BASE_DIR, data_dir), shuffling_seed=shuffling_seed) model = model_cls(dataset, hparams, gridworld, seed=MODEL_SEED) Trainer(model, logger)()
def __init__(self, **args):
    start = args.get('start', Windy.Start)
    goal = args.get('goal', Windy.Goal)
    GridWorld.__init__(self, Windy.Columns, Windy.Rows,
                       start=start, goal=goal)
    self.wind = args.get('wind', Windy.Wind)
def example_1():
    #example 1
    height = 6
    width = 2
    start = [5, 0]
    goals = ([5, 0])
    walls = ([2, 1], [2, 2], [2, 3], [3, 1], [3, 2], [3, 3])
    cliffs = ([1, 1], [1, 2], [1, 3])
    env = GridWorld(height, width, False, False, start, goals, walls, cliffs)
    env.render(mode='simple_render')
def example_3():
    #example 3
    height = 3
    width = 3
    start = [0, 0]
    goals = ([2, 2])
    walls = None
    cliffs = None
    env = GridWorld(height, width, False, False, start, goals, walls, cliffs)
    env.render(mode='simple_render')
def init_env(): '''Init the XOR boolean environment @return the environment gridworld object ''' env = GridWorld(5, 5) #Define the state matrix state_matrix = np.array([[1.0, 0.0, 0.0, 0.0, 1.0], [0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0, 1.0]]) #Define the index matrix index_matrix = np.array([[(4,0), (4,1), (4,2), (4,3), (4,4)], [(3,0), (3,1), (3,2), (3,3), (3,4)], [(2,0), (2,1), (2,2), (2,3), (2,4)], [(1,0), (1,1), (1,2), (1,3), (1,4)], [(0,0), (0,1), (0,2), (0,3), (0,4)]]) #Define the reward matrix reward_matrix = np.array([[1.0, 0.0, 0.0, 0.0, -1.0], [0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0], [-1.0, 0.0, 0.0, 0.0, 1.0]]) #Define the transition matrix transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1], [0.1, 0.8, 0.1, 0.0], [0.0, 0.1, 0.8, 0.1], [0.1, 0.0, 0.1, 0.8]]) env.setStateMatrix(state_matrix) env.setIndexMatrix(index_matrix) env.setRewardMatrix(reward_matrix) env.setTransitionMatrix(transition_matrix) return env
def test_move_dir(self):
    grid = '   \n   \n   '
    gw = GridWorld(grid)
    start = (1,1)
    # N, E, S, W
    tests = [(0, (1,0)), (1, (2,1)), (2, (1,2)), (3, (0,1))]
    for dir, end in tests:
        e, _, _ = gw.move_dir(start, dir)
        self.assertEqual(e, end)
def configure_gridworld() -> Tuple[Domain, Task]:
    domain = GridWorld(10, 7,
                       agent_x_start=0, agent_y_start=3,
                       wind=True,
                       wind_strengths=[0, 0, 0, 1, 1, 1, 2, 1, 1, 0],
                       stochasticity=stochasticity)
    domain.place_exit(7, 3)
    task = ReachExit(domain)
    return domain, task
def create_env():
    """
    Creates the environment for the experiments
    :return: the environment
    """
    # Create the environment as a 3x4 grid
    env = GridWorld(3, 4)
    # Define the state matrix
    state_matrix = np.zeros((3, 4))
    state_matrix[0, 3] = 1
    state_matrix[1, 3] = 1
    state_matrix[1, 1] = -1
    # Define the reward matrix
    # Every non-terminal state has a reward of -0.04
    reward_matrix = np.full((3, 4), -0.04)
    reward_matrix[0, 3] = 1
    reward_matrix[1, 3] = -1
    # Define the action (transition) probability matrix
    transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1],
                                  [0.1, 0.8, 0.1, 0.0],
                                  [0.0, 0.1, 0.8, 0.1],
                                  [0.1, 0.0, 0.1, 0.8]])
    # Configure and return the environment
    env.setStateMatrix(state_matrix)
    env.setRewardMatrix(reward_matrix)
    env.setTransitionMatrix(transition_matrix)
    return env
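# Possible usage of create_env(): roll out a few random steps in the returned
# environment. The reset()/step() signatures follow the other 3x4 GridWorld
# snippets in this collection; this is an illustrative sketch only.
import numpy as np

if __name__ == "__main__":
    env = create_env()
    observation = env.reset(exploring_starts=True)
    for step in range(10):
        action = np.random.randint(0, 4)
        observation, reward, done = env.step(action)
        print(step, observation, reward, done)
        if done:
            break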
def main(): env = GridWorld(3, 4) state_matrix = np.zeros((3, 4)) state_matrix[0, 3] = 1 state_matrix[1, 3] = 1 state_matrix[1, 1] = -1 reward_matrix = np.full((3,4), -0.04) reward_matrix[0, 3] = 1 reward_matrix[1, 3] = -1 transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1], [0.1, 0.8, 0.1, 0.0], [0.0, 0.1, 0.8, 0.1], [0.1, 0.0, 0.1, 0.8]]) policy_matrix = np.array([[1, 1, 1, -1], [0, np.NaN, 0, -1], [0, 3, 3, 3]]) env.setStateMatrix(state_matrix) env.setRewardMatrix(reward_matrix) env.setTransitionMatrix(transition_matrix) state_action_matrix = np.zeros((4,12)) visit_counter_matrix = np.zeros((4, 12)) utility_matrix = np.zeros((3, 4)) gamma, alpha, tot_epoch, print_epoch = 0.999, 0.001, 500000, 1000 for epoch in range(tot_epoch): epsilon = return_decayed_value(0.1, epoch, decay_step=100000) observation = env.reset(exploring_starts=True) is_starting = True for step in range(1000): action = return_epsilon_greedy_action(policy_matrix, observation, epsilon=0.1) if is_starting: action = np.random.randint(0, 4) is_starting = False new_observation, reward, done = env.step(action) new_action = int(policy_matrix[new_observation[0]][new_observation[1]]) state_action_matrix = update_state_action_matrix(state_action_matrix, reward, gamma, observation, new_observation, action, new_action, visit_counter_matrix) policy_matrix = update_policy(policy_matrix, state_action_matrix, observation) visit_counter_matrix = update_visit_counter(visit_counter_matrix, observation, action) observation = new_observation if done: break if epoch % print_epoch == 0: print "state action and policy matrices after %d iterations: " %(epoch) print state_action_matrix print "best policy after %d iterations: " %(epoch) print_policy(policy_matrix) print "##################################" print "final state action matrix: ", state_action_matrix print "final policy matrix: ", policy_matrix
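# return_epsilon_greedy_action() is assumed by the SARSA snippet above; a minimal
# sketch that follows the policy_matrix with probability 1-epsilon and otherwise
# picks a uniformly random action. The exact exploration scheme is an assumption.
import numpy as np

def return_epsilon_greedy_action(policy_matrix, observation, epsilon=0.1):
    tot_actions = 4
    greedy_action = int(policy_matrix[observation[0], observation[1]])
    if np.random.uniform(0, 1) <= (1 - epsilon):
        return greedy_action          # exploit the current policy
    return np.random.randint(0, tot_actions)  # explore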
def main(): env = GridWorld(3, 4) state_matrix = np.zeros((3, 4)) state_matrix[0, 3] = 1 state_matrix[1, 3] = 1 state_matrix[1, 1] = -1 print state_matrix reward_matrix = np.full((3, 4), -0.04) reward_matrix[0, 3] = 1 reward_matrix[1, 3] = -1 print reward_matrix transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1], [0.1, 0.8, 0.1, 0.0], [0.0, 0.1, 0.8, 0.1], [0.1, 0.0, 0.1, 0.8]]) policy_matrix = np.array([[1, 1, 1, -1], [0, np.NaN, 0, -1], [0, 3, 3, 3]]) trace_matrix = np.zeros((3, 4)) env.setStateMatrix(state_matrix) env.setRewardMatrix(reward_matrix) env.setTransitionMatrix(transition_matrix) utility_matrix = np.zeros((3, 4)) gamma, alpha, tot_epoch, print_epoch, lambda_ = 0.999, 0.1, 30000, 1000, 0.5 for epoch in range(tot_epoch): observation = env.reset(exploring_starts=True) for step in range(1000): action = policy_matrix[observation[0]][observation[1]] new_observation, reward, done = env.step(action) delta = reward + gamma * utility_matrix[new_observation[0]][ new_observation[1]] - utility_matrix[observation[0]][ observation[1]] trace_matrix[observation[0]][observation[1]] += 1 utility_matrix = update_utility_matrix(utility_matrix, alpha, delta, trace_matrix) trace_matrix = update_eligibility_matrix(trace_matrix, gamma, lambda_) observation = new_observation if done: break if epoch % print_epoch == 0: print "utility matrix after %d iterations: " % (epoch) print utility_matrix print "final utility matrix: ", utility_matrix
def question_3_1_a(printing=True): if printing: print( 'a) Develop a state graph representation for this search problem, and' ) print( 'develop a step() method for finding the next legal steps this problem,' ) print('i.e. for generating successor nodes (vertices).') print() obstacle_coords = [(2, 2), (2, 3), (2, 4), (2, 5), (2, 6), (3, 6), (4, 6), (5, 6), (6, 6), (7, 6), (7, 3), (7, 4), (7, 5)] env_config = { 'nrow': 9, 'ncol': 9, 'obstacle_coords': obstacle_coords, 'start_coord': (8, 0), 'goal_coord': (0, 8), 'cost_map': None } env = GridWorld(env_config) if printing: print([i for i in dir(env) if '__' not in i], '\n') for k, v in vars(env).items(): if k == 'cfg': continue if printing: print(k, v, '\n') return env
def test_parse(self):
    grid = ' #P\nG #'
    gw = GridWorld(grid)
    self.assertEqual(gw.grid[1][0], '#')
    self.assertEqual(gw.grid[2][0], 'P')
    self.assertEqual(gw.grid[0][1], 'G')
    self.assertEqual(gw.grid[1][1], ' ')
class AnnRunner(object): """Wraps up the gross reality of running a ``print'' using the printer simulation (controlled by a neural network)""" camera_size = 3 def __init__(self, ideal_grid_path, cell_size, units_per_cell=10): """Sets up all the pieces needed to perform a print with the simulated 3d printer (controlled by the neural network). Takes in a path to an a ``goal'' or ``ideal'' grid, and constructs the GridWorld based on the dimensions of that goal grid. Understands both a ``camera'', which observes the actual world (around the print head) and an ``ideal camera'' which observes the same location but based on the ``goal grid'' """ ideal_grid = Grid(path=ideal_grid_path, scale=cell_size) self.ideal_grid = ideal_grid self.gridworld = GridWorld(ideal_grid.width, ideal_grid.height, cell_size) self.gridworld.set_ideal_grid(ideal_grid) self.printer = Printer(10, 10, 9, self.gridworld, units_per_cell) #TODO: shouldn't be giving location values here when it's determined somewhere else. that smells a lot self.camera = Camera(self.gridworld.grid, self.printer, self.camera_size) self.ideal_camera = Camera(self.gridworld.ideal_grid, self.printer, self.camera_size) def run(self, n, iterations=10000): """Runs a simulated print run with the printer simulation (controlled by an ANN. Starts the printer in the location provided by the ideal grid spec """ #set the printer location to the starting postition as defined by the ideal_grid spec self.printer.set_position_on_grid(*self.gridworld.get_starting_position()) for i in xrange(iterations): self.printer.setPenDown() actual = self.camera.all_cell_values() ideal = self.ideal_camera.all_cell_values() pattern = [i - a for i,a in zip(actual, ideal)] result = n.propagate(pattern) result = [int(round(x)) for x in result] result = ''.join(map(str, result)) self.printer.set_printer_direction(self.get_velocity(result[:2]), self.get_velocity(result[2:])) self.printer.simulate() self.update() return (self.ideal_grid, self.gridworld.grid) def update(self): return def get_velocity(self, instruction): """Translates between the output of the neural network and direction instructions for the printer. leftright and updown are translated separately""" if instruction == "10": return -1 elif instruction == "01": return 1 else: return 0
def main(): env = GridWorld(3, 4) state_matrix = np.zeros((3, 4)) state_matrix[0, 3] = 1 state_matrix[1, 3] = 1 state_matrix[1, 1] = -1 print state_matrix reward_matrix = np.full((3,4), -0.04) reward_matrix[0, 3] = 1 reward_matrix[1, 3] = -1 print reward_matrix transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1], [0.1, 0.8, 0.1, 0.0], [0.0, 0.1, 0.8, 0.1], [0.1, 0.0, 0.1, 0.8]]) state_action_matrix = np.random.random((4, 12)) print "State Action matrix" print state_action_matrix env.setStateMatrix(state_matrix) env.setRewardMatrix(reward_matrix) env.setTransitionMatrix(transition_matrix) utility_matrix = np.zeros((3, 4)) print "utility matrix" print utility_matrix gamma, alpha, tot_epoch, print_epoch = 0.999, 0.1, 30000, 1000 for epoch in range(tot_epoch): observation = env.reset(exploring_starts=True) for step in range(1000): col = observation[1] + (4 * observation[0]) # Sending Action to Environment action_array = state_action_matrix[:, col] action_distribution = softmax(action_array) action = np.random.choice(4, 1, p=action_distribution) new_observation, reward, done = env.step(action) # Update Critic utility_matrix, delta = update_critic(utility_matrix, alpha, observation, new_observation, reward, gamma) # Update Actor state_action_matrix = update_actor(state_action_matrix, observation, action, delta, beta_matrix=None) observation = new_observation if done: break print "final utility matrix" print utility_matrix print "final state action matrix" print state_action_matrix
class GridWrapper(Game): def __init__(self, size, frame_stack_size=1, render=False): self.size = size self.new_game() def possible_actions(self): num_actions = 4 actions = [] for i in range(num_actions): new_action = [0] * num_actions new_action[i] = 1 actions.append(new_action) return actions def perform_action(self, action): action_idx = np.argmax(action) self.game.perform_action(action_idx) def get_state(self): state, reward, terminal = self.game.get_state() for i in range(len(state)): for j in range(len(state[i])): state[i][j] = [state[i][j]] return state, reward, terminal def get_score(self): _, _, terminal = self.game.get_state() score = 0 if terminal: score += 1 score -= self.game.actions_taken * 0 return score def goal_reached(self): return self.game.goal_reached def actions_taken(self): return self.game.actions_taken def new_game(self): self.game = GridWorld(self.size) self.min_moves = self.game.min_remaining_moves() def generate_states(self): return self.game.generate_states()
def load_gridworld(filename):
    grid = []
    with open(filename, newline='') as f:
        reader = csv.reader(f)
        for row in reader:
            grid_row = []
            for col in row:
                grid_row.append(int(col))
            grid.append(grid_row)
    return GridWorld(grid)
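# Hypothetical usage of load_gridworld(); the file name and cell encoding below
# are assumptions for illustration, not part of the original snippet. The CSV is
# expected to hold integer-coded cells, one grid row per line, e.g.:
#   0,0,0,1
#   0,1,0,1
#   0,0,0,0
world = load_gridworld('grid.csv')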
def __init__(self, ideal_grid): self.gridworld = GridWorld(ideal_grid.width, ideal_grid.height, ideal_grid.gridsize) self.gridworld.set_ideal_grid(ideal_grid) self.printer = VirtualPrinter(10, 10, 9, 1, pygame.color.Color("darkorange"), self.gridworld) self.camera = VisualCamera(self.gridworld, self.printer, 3) self.ideal_camera = Camera(self.gridworld.ideal_grid, self.printer, 3) #gui stuff pygame.init() width = self.gridworld.width() * self.gridworld.gridsize() height = self.gridworld.height() * self.gridworld.gridsize() self.window = pygame.display.set_mode((width, height))
def random_play(n_steps):
    #env from example_3
    height = 3
    width = 3
    start = [0, 0]
    goals = ([2, 2])
    walls = None
    cliffs = None
    env = GridWorld(height, width, False, False, start, goals, walls, cliffs)
    #random actions over n_steps:
    env.reset()
    for step in range(n_steps):
        action = env.action_space_sample()
        new_state, reward, done = env.step(action)
        print("Step:", step, ", Action:", action,
              ", New state:", env.get_obs(),
              ", Done:", done, ", Reward:", reward)
    env.render(mode='episode')
def main(args): if args.verbose: logging.basicConfig(level=logging.INFO) elif args.debug: logging.basicConfig(level=logging.DEBUG) # initializations gridworld = GridWorld(args.size, args.interval, args.obstacles, args.vision, args.phase) logging.info("Generated grid world!") logging.info("Visuals created") mc = MonteCarlo(gridworld, mode=args.method) logging.info("Initialized Monte Carlo method") mc.run()
def main(): env = GridWorld(3, 4) state_matrix = np.zeros((3, 4)) state_matrix[0, 3] = 1 state_matrix[1, 3] = 1 state_matrix[1, 1] = -1 print state_matrix reward_matrix = np.full((3,4), -0.04) reward_matrix[0, 3] = 1 reward_matrix[1, 3] = -1 print reward_matrix transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1], [0.1, 0.8, 0.1, 0.0], [0.0, 0.1, 0.8, 0.1], [0.1, 0.0, 0.1, 0.8]]) policy_matrix = np.array([[1, 1, 1, -1], [0, np.NaN, 0, -1], [0, 3, 3, 3]]) trace_matrix = np.zeros((3, 4)) env.setStateMatrix(state_matrix) env.setRewardMatrix(reward_matrix) env.setTransitionMatrix(transition_matrix) utility_matrix = np.zeros((3, 4)) gamma, alpha, tot_epoch, print_epoch, lambda_ = 0.999, 0.1, 30000, 1000, 0.5 for epoch in range(tot_epoch): observation = env.reset(exploring_starts=True) for step in range(1000): action = policy_matrix[observation[0]][observation[1]] new_observation, reward, done = env.step(action) delta = reward + gamma * utility_matrix[new_observation[0]][new_observation[1]] - utility_matrix[observation[0]][observation[1]] trace_matrix[observation[0]][observation[1]] += 1 utility_matrix = update_utility_matrix(utility_matrix, alpha, delta, trace_matrix) trace_matrix = update_eligibility_matrix(trace_matrix, gamma, lambda_) observation = new_observation if done: break if epoch % print_epoch == 0: print "utility matrix after %d iterations: " %(epoch) print utility_matrix print "final utility matrix: ", utility_matrix
def generate_data(FLAGS): name_size = 'SZ%d-STP%d' % (FLAGS.sz, FLAGS.stp) config_size = Config(size=FLAGS.sz, max_steps=FLAGS.stp) for name_std, config_std in CONFIG_STDS.iteritems(): for name_drop, config_drop in CONFIG_DROPS.iteritems(): for name_direction, config_direction in CONFIG_DIRECTIONS.iteritems( ): config = Config() config.add('base', 'base', CONFIG_BASE) config.add('size', name_size, config_size) config.add('direction', name_direction, config_direction) config.add('drop', name_drop, config_drop) config.add('std', name_std, config_std) gridworld = GridWorld(name=config.get_name(), **config.get_kwargs()) for seed in GRIDWORLD_SEEDS: data_dir = '%s-SEED%d' % (config.get_name(), seed) gridworld.generate(data_dir=data_dir, seed=seed, splitting_seed=SPLITTING_SEED)
def test_norm_wind(self):
    env = GridWorld()
    state = env.reset()
    for _ in range(4):
        state, _, _ = env.step(0)  # move right
    self.assertTrue(np.array_equal(state, np.array([4, 4])))
    for _ in range(2):
        state, _, _ = env.step(0)  # move right
    self.assertTrue(np.array_equal(state, np.array([6, 6])))
    state, _, _ = env.step(3)  # move down
    self.assertTrue(np.array_equal(state, np.array([6, 6])))
    for _ in range(5):
        state, _, _ = env.step(0)  # move right
    self.assertTrue(np.array_equal(state, np.array([9, 6])))
    for _ in range(4):
        state, _, _ = env.step(3)  # move down
    for _ in range(2):
        state, _, done = env.step(2)  # move left
    self.assertTrue(done)
def init_nand(bias=True): '''Init the boolean environment @return the environment gridworld object ''' env = GridWorld(5, 5) #Define the state matrix state_matrix = np.array([[1.0, 0.0, 0.0, 0.0, 1.0], [0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0, 1.0]]) #Define the index matrix index_matrix = np.array([[(4,0), (4,1), (4,2), (4,3), (4,4)], [(3,0), (3,1), (3,2), (3,3), (3,4)], [(2,0), (2,1), (2,2), (2,3), (2,4)], [(1,0), (1,1), (1,2), (1,3), (1,4)], [(0,0), (0,1), (0,2), (0,3), (0,4)]]) #Define the reward matrix reward_matrix = np.array([[1.0, 0.0, 0.0, 0.0, -1.0], [0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0, 1.0]]) #Define the transition matrix transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1], [0.1, 0.8, 0.1, 0.0], [0.0, 0.1, 0.8, 0.1], [0.1, 0.0, 0.1, 0.8]]) env.setStateMatrix(state_matrix) env.setIndexMatrix(index_matrix) env.setRewardMatrix(reward_matrix) env.setTransitionMatrix(transition_matrix) if bias: return env, np.random.uniform(-1, 1, 3) else: return env, np.random.uniform(-1, 1, 2)
def run_instance(param): # runs sim with given parameters for different controllers and different trials and writes to results directory # init environment if param.env_name in 'gridworld': env = GridWorld(param) elif param.env_name in 'citymap': env = CityMap(param) else: exit('env_name not recognized: ', param.env_name) # run sim for i_trial in range(param.n_trials): # init datasets if param.make_dataset_on: print(' making dataset...') train_dataset, test_dataset = datahandler.make_dataset(env) datahandler.write_dataset(env, train_dataset, test_dataset) print(' loading dataset...') datahandler.load_dataset(env) # initial condition s0 = env.get_s0() for controller_name in param.controller_names: controller = Controller(param,env,controller_name) # sim sim_result = sim(param,env,controller,s0) # write results case_count = len(glob.glob('../current_results/*')) + 1 results_dir = '../current_results/sim_result_{}'.format(case_count) datahandler.write_sim_result(sim_result, results_dir) return
def test_move(self):
    grid = ' #P\nG #'
    gw = GridWorld(grid, move_value=-1, die_value=-20, win_value=10)
    step_tests = [
        # move into wall
        ((0,0), (1,0), (0,0), -1, False),
        # move to free field
        ((0,0), (1,1), (1,1), -1, False),
        # move to goal
        ((0,0), (0,1), (0,1), 10, True),
        # die penalty
        ((0,0), (2,0), (2,0), -20, True),
        # out of bounds #1
        ((0,0), (-1,0), (0,0), -1, False),
        # out of bounds #2
        ((0,0), (10,0), (0,0), -1, False),
    ]
    for start, to, end, reward, is_terminal in step_tests:
        e, r, t = gw.move(start, to)
        self.assertEqual(e, end)
        self.assertEqual(r, reward)
        self.assertEqual(t, is_terminal)
def __init__(self, ideal_grid=None, ideal_grid_path=None): """ Set pygame stuff up for running the simulation.""" assert ideal_grid or ideal_grid_path, "must provide at least one ideal grid" self.gridworld = GridWorld(ideal_grid.width, ideal_grid.height, ideal_grid.gridsize) self.gridworld.set_ideal_grid(ideal_grid) self.printer = VirtualPrinter(10, 10, 9, 1, pygame.color.Color("darkorange"), self.gridworld) self.camera = VisualCamera(self.gridworld, self.printer, 3) self.ideal_camera = Camera(self.gridworld.ideal_grid, self.printer, 3) #gui stuff pygame.init() width = self.gridworld.width() * self.gridworld.gridsize() height = self.gridworld.height() * self.gridworld.gridsize() self.window = pygame.display.set_mode((width, height))
def __init__(self): """ Set pygame stuff up for running the simulation.""" pygame.init() grid = GridWorld(20, 20, 30) ideal_grid = Grid(20, 20, 30) ideal_grid.grid = [[1 if x <= 10 else 0 for x in range(20)] for _ in range(20)] grid.set_ideal_grid(ideal_grid) width = grid.width() * grid.gridsize() height = grid.height() * grid.gridsize() self.grid = grid self.window = pygame.display.set_mode((width, height)) self.printer = VirtualPrinter(10, 10, 9, 1, pygame.color.Color("darkorange"), grid) self.camera = VisualCamera(self.grid, self.printer, 3) self.grid.draw(self.window)
class AnnRunner: def __init__(self, ideal_grid): self.gridworld = GridWorld(ideal_grid.width, ideal_grid.height, ideal_grid.gridsize) self.gridworld.set_ideal_grid(ideal_grid) self.printer = VirtualPrinter(10, 10, 9, 1, pygame.color.Color("darkorange"), self.gridworld) self.camera = VisualCamera(self.gridworld, self.printer, 3) self.ideal_camera = Camera(self.gridworld.ideal_grid, self.printer, 3) #gui stuff pygame.init() width = self.gridworld.width() * self.gridworld.gridsize() height = self.gridworld.height() * self.gridworld.gridsize() self.window = pygame.display.set_mode((width, height)) def run(self, n): self.printer.position = Vector(270, 150) while True: self.printer.setPenDown() actual = self.camera.camera.all_cell_values() ideal = self.ideal_camera.all_cell_values() pattern = [i - a for i,a in zip(actual, ideal)] result = n.propagate(pattern) result = [int(round(x)) for x in result] result = ''.join(map(str, result)) self.printer.v = Vector(self.get_velocity(result[:2]), self.get_velocity(result[2:])) self.printer.simulate(1) self.redraw() pygame.display.update() def get_velocity(self, instruction): if instruction == "10": return -100 elif instruction == "01": return 100 else: return 0 def redraw(self): self.gridworld.draw(self.window) self.printer.draw(self.window) self.camera.draw(self.window)
def main():
    grid = ''
    with open("grid.lay", "r") as file:
        grid = file.read()
    eps = 0.2
    episodes = 10000

    random.seed(1)
    gw = GridWorld(grid)
    Q = SARSA(gw, episodes=episodes, eps=eps)
    # plotQ(Q, gw, f'SARSA after {episodes} episodes')
    plotPolicy(Q, gw, f'SARSA: greedy-policy after {episodes} episodes')

    random.seed(1)
    Q = QLearning(gw, episodes=episodes, eps=eps)
    # plotQ(Q, gw, f'Q-Learning after {episodes} episodes')
    plotPolicy(Q, gw, f'Q-Learning: greedy-policy after {episodes} episodes')
def plot_gridworld(n_rows=2, n_cols=3, figsize=(10, 6), eps=0, save_path='gridworld_demo.svg', seed=42, dtype='bool'): """Makes a picture of an expert trajectory :param n_rows: number of rows to put the grids in :param n_cols: number of columns to put the grids in :param figsize: figure size :param eps: probability of a random action por the expert :param save_path: path to save the result :param seed: random seed to set to numpy :param dtype: observation dtype. For checking that both dtypes work the same way """ total = n_rows * n_cols np.random.seed(seed) env = GridWorld(5, 5, 3, obs_dtype=dtype) env.reset() done = False grids = [env.render(mode='get_grid')] while not done: action = env.get_expert_action(eps=eps) _, _, done, _ = env.step(action) grids.append(env.render(mode='get_grid')) if total < len(grids): display_ind = np.linspace(0, len(grids) - 1, total, dtype=int) grids = [grids[i] for i in display_ind] fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=figsize) fig.suptitle('Example of an expert trajectory') for r in range(n_rows): for c in range(n_cols): ind = r * n_cols + c ax = axes[r, c] ax.set_axis_off() if ind < len(grids): grid = grids[ind] ax.imshow(grid) plt.savefig(save_path)
def first_visit_monte_carlo_evaluate(gamma=GAMMA, number_of_episodes=100000): env = GridWorld() policy = Get_Action() values = np.zeros(16) returns = {state: list() for state in range(16)} for episode in range(number_of_episodes): observations, _, rewards, _ = generate_one_episode(env, policy) observations.pop() # exclude the sT G = 0 for i, obs in enumerate(observations[::-1] ): # reverse the list observations and rewards G = gamma * G + rewards[::-1][i] if obs not in observations[::-1][i + 1:]: returns[obs].append(G) values[obs] = np.average(returns[obs]) values[15] = 0 if episode % 10000 == 0: print(f"In the No.{episode} the values are {values}.") return values
def main(): cost_map = [] cost_map.append([1, 1, 1, 5, 5, 5, 5, 1, None]) cost_map.append([1, 1, 1, 5, 5, 5, 5, 1, 1]) cost_map.append([1, 1, 10, 10, 10, 10, 10, 1, 1]) cost_map.append([1, 1, 1, 10, 10, 10, 10, 1, 1]) cost_map.append([1, 1, 1, 1, 1, 10, 10, 1, 1]) cost_map.append([1, 1, 1, 1, 1, 10, 10, 1, 1]) cost_map.append([1, 1, 1, 1, 10, 10, 10, 1, 1]) cost_map.append([1, 1, 1, 10, 10, 10, 10, 1, 1]) cost_map.append([0, 1, 1, 1, 1, 1, 1, 1, 1]) env_config = {'nrow': 9, 'ncol': 9, 'obstacle_coords': [], 'start_coord': (8, 0), 'goal_coord': (0, 8), 'cost_map': cost_map} env = GridWorld(env_config) start = env.start_state goal = env.goal_state question_3_2_a(start, goal, env) question_3_2_b(start, goal, env) question_3_2_c(start, goal, env) question_3_2_d_and_e(start, goal, env) question_3_2_f(start, goal, env)
class AnnRunner(object): camera_size = 3 def __init__(self, ideal_grid, units_per_cell=10): self.gridworld = GridWorld(ideal_grid.width, ideal_grid.height, ideal_grid.gridsize) self.gridworld.set_ideal_grid(ideal_grid) self.printer = Printer(10, 10, 9, 1, self.gridworld, units_per_cell) #TODO: shouldn't be giving location values here when it's determined somewhere else. that smells a lot self.camera = Camera(self.gridworld.grid, self.printer, self.camera_size) self.ideal_grid = self.gridworld.ideal_grid self.ideal_camera = Camera(self.gridworld.ideal_grid, self.printer, self.camera_size) width = self.gridworld.width() * self.gridworld.gridsize() height = self.gridworld.height() * self.gridworld.gridsize() def run(self, n, iterations=10000): self.printer.set_position_on_grid(self.ideal_grid.starting_point[0], self.ideal_grid.starting_point[1]) for i in xrange(iterations): self.printer.setPenDown() actual = self.camera.all_cell_values() ideal = self.ideal_camera.all_cell_values() pattern = [i - a for i,a in zip(actual, ideal)] result = n.propagate(pattern) result = [int(round(x)) for x in result] result = ''.join(map(str, result)) self.printer.v = Vector(self.get_velocity(result[:2]), self.get_velocity(result[2:])) self.printer.simulate() self.update() return (self.ideal_grid, self.gridworld.grid) def update(self): return def get_velocity(self, instruction): if instruction == "10": return -1 elif instruction == "01": return 1 else: return 0
class TestCamera(unittest.TestCase): def setUp(self): self.gridworld = GridWorld(20, 20, 10) self.printer = VirtualPrinter(0, 0, 10, 1, pygame.color.Color("darkorange"), self.gridworld) self.grid = self.gridworld.grid self.camera = Camera(self.grid, self.printer, 3) def test_camera_has_correct_values_at_init(self): self.assertIs(self.camera.grid, self.gridworld.grid) self.assertIs(self.camera.printer, self.printer) self.assertEqual(self.camera.n, 3) self.assertEqual(self.camera.cell_width, self.gridworld.gridsize()) def test_num_cells_in_view_isnt_wrong(self): #one cell self.printer.position = Vector(15, 15) self.assertEqual(self.camera.num_cells_in_view(Vector(1, 1)), 1) #two cell self.printer.position = Vector(15, 12) self.assertEqual(self.camera.num_cells_in_view(Vector(1, 1)), 2) #red cell self.printer.position = Vector(12, 12) self.assertEqual(self.camera.num_cells_in_view(Vector(1, 1)), 4) #blue cell def test_cells_in_view_independent_of_camera_size(self): local_camera = Camera(self.grid, self.printer, 4) self.printer.position = Vector(30, 30) self.assertEqual(local_camera.num_cells_in_view(Vector(1, 1)), 1) #two cell self.printer.position = Vector(30, 32) self.assertEqual(local_camera.num_cells_in_view(Vector(1, 1)), 2) #red cell self.printer.position = Vector(32, 32) self.assertEqual(local_camera.num_cells_in_view(Vector(1, 1)), 4) def test_cells_have_same_result_for_cells_in_view(self): #one cell self.printer.position = Vector(15, 15) self.assertEqual(self.camera.num_cells_in_view(Vector(3, 3)), 1) #two cell self.printer.position = Vector(15, 12) self.assertEqual(self.camera.num_cells_in_view(Vector(3, 3)), 2) #red cell self.printer.position = Vector(12, 12) self.assertEqual(self.camera.num_cells_in_view(Vector(3, 3)), 4) #blue cell def test_region_aligned(self): self.printer.position = Vector(15, 15) self.grid.set_loc_val(1, 1, 3) self.assertEqual(self.camera.percent_in_view(Vector(1, 1)), 3) def test_region_over_two_cells_horizontally_aligned(self): self.printer.position = Vector(15, 20) self.grid.set_loc_val(1, 1, 1) self.grid.set_loc_val(1, 2, 0) self.assertEqual(self.camera.percent_in_view(Vector(1, 1)), 0.5) def test_region_over_two_cells_vertically_aligned(self): self.printer.position = Vector(20, 15) self.grid.set_loc_val(1, 1, 1) self.grid.set_loc_val(2, 1, 0) self.assertEqual(self.camera.percent_in_view(Vector(1, 1)), 0.5) self.printer.position = Vector(22, 15) self.grid.set_loc_val(1, 1, 1) self.grid.set_loc_val(2, 1, 0) self.assertEqual(self.camera.percent_in_view(Vector(1, 1)), 0.7) def test_region_over_four_cells(self): self.printer.position = Vector(20, 20) self.grid.set_loc_val(1, 1, 1) self.grid.set_loc_val(2, 1, 0) self.grid.set_loc_val(1, 2, 0) self.grid.set_loc_val(2, 2, 0) self.assertEqual(self.camera.percent_in_view(Vector(1, 1)), 0.25) def test_region_over_four_cells_with_arbitrary_camera_size(self): localcamera = Camera(self.grid, self.printer, 5) self.printer.position = Vector(20, 20) self.grid.set_loc_val(1, 1, 1) self.grid.set_loc_val(2, 1, 0) self.grid.set_loc_val(1, 2, 0) self.grid.set_loc_val(2, 2, 0) self.assertEqual(self.camera.percent_in_view(Vector(1, 1)), 0.25)
def main(): env = GridWorld(3, 4) #Define the state matrix state_matrix = np.zeros((3,4)) state_matrix[0, 3] = 1 state_matrix[1, 3] = 1 state_matrix[1, 1] = -1 print("State Matrix:") print(state_matrix) #Define the reward matrix reward_matrix = np.full((3,4), -0.04) reward_matrix[0, 3] = 1 reward_matrix[1, 3] = -1 print("Reward Matrix:") print(reward_matrix) #Define the transition matrix transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1], [0.1, 0.8, 0.1, 0.0], [0.0, 0.1, 0.8, 0.1], [0.1, 0.0, 0.1, 0.8]]) #Define the policy matrix #This is the optimal policy for world with reward=-0.04 policy_matrix = np.array([[1, 1, 1, -1], [0, np.NaN, 0, -1], [0, 3, 3, 3]]) print("Policy Matrix:") print(policy_matrix) env.setStateMatrix(state_matrix) env.setRewardMatrix(reward_matrix) env.setTransitionMatrix(transition_matrix) utility_matrix = np.zeros((3,4)) #init with 1.0e-10 to avoid division by zero running_mean_matrix = np.full((3,4), 1.0e-10) gamma = 0.999 tot_epoch = 50000 print_epoch = 1000 for epoch in range(tot_epoch): #Starting a new episode episode_list = list() #Reset and return the first observation and reward observation = env.reset(exploring_starts=False) for _ in range(1000): #Take the action from the action matrix action = policy_matrix[observation[0], observation[1]] #Move one step in the environment and get obs and reward observation, reward, done = env.step(action) #Append the visit in the episode list episode_list.append((observation, reward)) if done: break #The episode is finished, now estimating the utilities counter = 0 #Checkup to identify if it is the first visit to a state checkup_matrix = np.zeros((3,4)) #This cycle is the implementation of First-Visit MC. #For each state stored in the episode list check if it #is the rist visit and then estimate the return. for visit in episode_list: observation = visit[0] row = observation[0] col = observation[1] reward = visit[1] if(checkup_matrix[row, col] == 0): return_value = get_return(episode_list[counter:], gamma) running_mean_matrix[row, col] += 1 utility_matrix[row, col] += return_value checkup_matrix[row, col] = 1 counter += 1 if(epoch % print_epoch == 0): print("") print("Utility matrix after " + str(epoch+1) + " iterations:") print(utility_matrix / running_mean_matrix) #Time to check the utility matrix obtained print("Utility matrix after " + str(tot_epoch) + " iterations:") print(utility_matrix / running_mean_matrix)
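# get_return() is used by the First-Visit MC snippet above but defined elsewhere
# in this collection (for state-action visits). A discounted-return helper that
# matches this snippet, where each episode entry is (observation, reward):
import numpy as np

def get_return(state_list, gamma):
    counter = 0
    return_value = 0
    for visit in state_list:
        reward = visit[1]  # entries here are (observation, reward)
        return_value += reward * np.power(gamma, counter)
        counter += 1
    return return_value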
def main(): env = GridWorld(3, 4) #Define the state matrix state_matrix = np.zeros((3,4)) state_matrix[0, 3] = 1 state_matrix[1, 3] = 1 state_matrix[1, 1] = -1 print("State Matrix:") print(state_matrix) #Define the reward matrix reward_matrix = np.full((3,4), -0.04) reward_matrix[0, 3] = 1 reward_matrix[1, 3] = -1 print("Reward Matrix:") print(reward_matrix) #Define the transition matrix transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1], [0.1, 0.8, 0.1, 0.0], [0.0, 0.1, 0.8, 0.1], [0.1, 0.0, 0.1, 0.8]]) #Random policy policy_matrix = np.random.randint(low=0, high=4, size=(3, 4)).astype(np.float32) policy_matrix[1,1] = np.NaN #NaN for the obstacle at (1,1) policy_matrix[0,3] = policy_matrix[1,3] = -1 #No action for the terminal states #Set the matrices in the world env.setStateMatrix(state_matrix) env.setRewardMatrix(reward_matrix) env.setTransitionMatrix(transition_matrix) state_action_matrix = np.random.random_sample((4,12)) # Q #init with 1.0e-10 to avoid division by zero running_mean_matrix = np.full((4,12), 1.0e-10) gamma = 0.999 tot_epoch = 500000 print_epoch = 3000 for epoch in range(tot_epoch): #Starting a new episode episode_list = list() #Reset and return the first observation and reward observation = env.reset(exploring_starts=True) #action = np.random.choice(4, 1) #action = policy_matrix[observation[0], observation[1]] #episode_list.append((observation, action, reward)) is_starting = True for _ in range(1000): #Take the action from the action matrix action = policy_matrix[observation[0], observation[1]] #If the episode just started then it is #necessary to choose a random action (exploring starts) if(is_starting): action = np.random.randint(0, 4) is_starting = False #Move one step in the environment and get obs and reward new_observation, reward, done = env.step(action) #Append the visit in the episode list episode_list.append((observation, action, reward)) observation = new_observation if done: break #The episode is finished, now estimating the utilities counter = 0 #Checkup to identify if it is the first visit to a state checkup_matrix = np.zeros((4,12)) #This cycle is the implementation of First-Visit MC. #For each state stored in the episode list check if it #is the rist visit and then estimate the return. for visit in episode_list: observation = visit[0] action = visit[1] col = observation[1] + (observation[0]*4) row = action if(checkup_matrix[row, col] == 0): return_value = get_return(episode_list[counter:], gamma) running_mean_matrix[row, col] += 1 state_action_matrix[row, col] += return_value checkup_matrix[row, col] = 1 counter += 1 #Policy Update policy_matrix = update_policy(episode_list, policy_matrix, state_action_matrix/running_mean_matrix) #Printing if(epoch % print_epoch == 0): print("") print("State-Action matrix after " + str(epoch+1) + " iterations:") print(state_action_matrix / running_mean_matrix) print("Policy matrix after " + str(epoch+1) + " iterations:") print(policy_matrix) print_policy(policy_matrix) #Time to check the utility matrix obtained print("Utility matrix after " + str(tot_epoch) + " iterations:") print(state_action_matrix / running_mean_matrix)
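# update_policy() and print_policy() are assumed by the MC control snippet above.
# A greedy policy-improvement sketch consistent with the update_policy_matrix()
# fragment elsewhere in this collection; print_policy() is only a convenience
# printer and is omitted here.
import numpy as np

def update_policy(episode_list, policy_matrix, state_action_matrix):
    """Make the policy greedy w.r.t. the MC estimate of Q for visited states."""
    for visit in episode_list:
        observation = visit[0]
        col = observation[1] + (observation[0] * 4)
        if policy_matrix[observation[0], observation[1]] != -1:
            policy_matrix[observation[0], observation[1]] = \
                np.argmax(state_action_matrix[:, col])
    return policy_matrix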
def main():
    tot_generations = 100
    tot_episodes = 100
    tot_steps = 14  #a good choice is: (world_rows+world_cols)*2
    population_size = 100
    elite_size = 10
    mutation_rate = 0.10
    gene_set = [0, 1, 2, 3]
    chromosome_size = 12
    #Define the world dimension
    world_rows = 3
    world_columns = 4
    env = GridWorld(world_rows, world_columns)
    mean_fitness_list = list()
    max_fitness_list = list()
    min_fitness_list = list()

    #Define the state matrix
    state_matrix = np.zeros((3,4))
    state_matrix[0, 3] = 1
    state_matrix[1, 3] = 1
    state_matrix[1, 1] = -1
    print("State Matrix:")
    print(state_matrix)

    #Define the reward matrix
    reward_matrix = np.full((3,4), -0.04)
    reward_matrix[0, 3] = 1
    reward_matrix[1, 3] = -1
    print("Reward Matrix:")
    print(reward_matrix)

    #Define the transition matrix
    transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1],
                                  [0.1, 0.8, 0.1, 0.0],
                                  [0.0, 0.1, 0.8, 0.1],
                                  [0.1, 0.0, 0.1, 0.8]])

    env.setStateMatrix(state_matrix)
    env.setRewardMatrix(reward_matrix)
    env.setTransitionMatrix(transition_matrix)

    #Init a random population
    population_matrix = return_random_population(population_size,
                                                 chromosome_size,
                                                 gene_set=gene_set)
    print("Population matrix shape: " + str(population_matrix.shape))

    #Main iteration loop
    for generation in range(tot_generations):
        #The fitness value for each individual is stored in np.array
        fitness_array = np.zeros((population_size))
        for chromosome_index in range(population_size):
            for episode in range(tot_episodes):
                #Reset and return the first observation
                observation = env.reset(exploring_starts=True)
                for step in range(tot_steps):
                    #Estimating the action for that state
                    col = observation[1] + (observation[0]*world_columns)
                    action = population_matrix[chromosome_index,:][col]
                    #Taking the action and observing the new state and reward
                    observation, reward, done = env.step(action)
                    #Accumulating the fitness for this individual
                    fitness_array[chromosome_index] += reward
                    if done: break

        #Printing and saving Fitness information
        max_fitness_list.append(np.amax(fitness_array))
        mean_fitness_list.append(np.mean(fitness_array))
        min_fitness_list.append(np.amin(fitness_array))
        print("Generation: " + str(generation+1))
        print("Fitness Mean: " + str(np.mean(fitness_array)))
        print("Fitness STD: " + str(np.std(fitness_array)))
        print("Fitness Max: " + str(np.amax(fitness_array)) +
              " at index " + str(np.argmax(fitness_array)))
        print("Fitness Min: " + str(np.amin(fitness_array)) +
              " at index " + str(np.argmin(fitness_array)))
        print("Optimal Policy:")
        print(" > > > * ^ # ^ * ^ < < <")
        for i in range(int(fitness_array.shape[0]/10)):
            print("Fitness " + str(i) + " ..... " + str(fitness_array[i]))
            print(return_chromosome_string(population_matrix[i,:]))
        print("")

        #Uncomment the following line to enable roulette wheel selection
        population_matrix, fitness_array = \
            return_roulette_selected_population(population_matrix,
                                                fitness_array,
                                                population_size)
        population_matrix, fitness_array = \
            return_best_worst_population(population_matrix, fitness_array)
        #Comment the following line if you enable the roulette wheel
        #population_matrix, fitness_array = \
        #    return_truncated_population(population_matrix,
        #                                fitness_array,
        #                                new_size=int(population_size/2))
        population_matrix = return_crossed_population(population_matrix,
                                                      population_size,
                                                      elite=elite_size)
        population_matrix = return_mutated_population(population_matrix,
                                                      gene_set=gene_set,
                                                      mutation_rate=mutation_rate,
                                                      elite=elite_size)

    #If you have matplotlib installed it saves an image of
    #the fitness/generation plot
    try:
        import matplotlib.pyplot as plt
        print("Using matplotlib to show the fitness/generation plot...")
        array = np.arange(1, tot_generations+1, dtype='int32')
        plt.plot(array, mean_fitness_list, color='red', marker='o',
                 markersize=6, markevery=10, label='Mean')
        plt.plot(array, max_fitness_list, color='blue', marker='^',
                 markersize=6, markevery=10, label='Max')
        #plt.plot(array, min_fitness_list, color='black', marker='v',
        #         markersize=6, markevery=10, label='Min')
        plt.legend(loc='upper center', bbox_to_anchor=(0.5, 1.05),
                   ncol=3, fancybox=True, shadow=True)
        #plt.xlim((0,tot_generations))
        #plt.ylim((-100,+100))
        plt.ylabel('Fitness', fontsize=15)
        plt.xlabel('Generation', fontsize=15)
        print("Saving the image in './fitness.jpg'...")
        plt.savefig("./fitness.jpg", dpi=500)
        #plt.show()
    except ImportError:
        print("Please install matplotlib if you want to see the fitness/generation plot.")
        pass  # module doesn't exist, deal with it.
def main(): env = GridWorld(3, 4) #Define the state matrix state_matrix = np.zeros((3,4)) state_matrix[0, 3] = 1 state_matrix[1, 3] = 1 state_matrix[1, 1] = -1 print("State Matrix:") print(state_matrix) #Define the reward matrix reward_matrix = np.full((3,4), -0.04) reward_matrix[0, 3] = 1 reward_matrix[1, 3] = -1 print("Reward Matrix:") print(reward_matrix) #Define the transition matrix transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1], [0.1, 0.8, 0.1, 0.0], [0.0, 0.1, 0.8, 0.1], [0.1, 0.0, 0.1, 0.8]]) #Define the policy matrix #This is the optimal policy for world with reward=-0.04 policy_matrix = np.array([[1, 1, 1, -1], [0, np.NaN, 0, -1], [0, 3, 3, 3]]) print("Policy Matrix:") print(policy_matrix) env.setStateMatrix(state_matrix) env.setRewardMatrix(reward_matrix) env.setTransitionMatrix(transition_matrix) utility_matrix = np.zeros((3,4)) gamma = 0.999 alpha = 0.1 #constant step size tot_epoch = 300000 print_epoch = 1000 for epoch in range(tot_epoch): #Reset and return the first observation observation = env.reset(exploring_starts=False) for step in range(1000): #Take the action from the action matrix action = policy_matrix[observation[0], observation[1]] #Move one step in the environment and get obs and reward new_observation, reward, done = env.step(action) utility_matrix = update_utility(utility_matrix, observation, new_observation, reward, alpha, gamma) observation = new_observation #print(utility_matrix) if done: break if(epoch % print_epoch == 0): print("") print("Utility matrix after " + str(epoch+1) + " iterations:") print(utility_matrix) #Time to check the utility matrix obtained print("Utility matrix after " + str(tot_epoch) + " iterations:") print(utility_matrix)
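# update_utility() is not shown in the TD(0) prediction snippet above. A minimal
# sketch of the usual TD(0) rule U(s) <- U(s) + alpha * (r + gamma*U(s') - U(s)),
# matching the call signature used there; the original body may differ in detail.
def update_utility(utility_matrix, observation, new_observation, reward, alpha, gamma):
    u = utility_matrix[observation[0], observation[1]]
    u_t1 = utility_matrix[new_observation[0], new_observation[1]]
    utility_matrix[observation[0], observation[1]] += alpha * (reward + gamma * u_t1 - u)
    return utility_matrix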
#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #SOFTWARE. #In this example I will use the class gridworld to generate a 3x4 world #in which the cleaning robot will move. In this example the robot follows #a policy which is optimal, reaching the terminal state (+1) with high probability. import numpy as np from gridworld import GridWorld env = GridWorld(3, 4) #Define the state matrix state_matrix = np.zeros((3,4)) state_matrix[0, 3] = 1 state_matrix[1, 3] = 1 state_matrix[1, 1] = -1 #Define the reward matrix reward_matrix = np.full((3,4), -0.04) reward_matrix[0, 3] = 1 reward_matrix[1, 3] = -1 #Define the transition matrix transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1], [0.1, 0.8, 0.1, 0.0],
hood = moore_neighborhood(radius=4, center=old_position) random.shuffle(hood) for location in hood: if agent.world.is_empty(location): return location return old_position def schedule(): for agent in myagents: move(agent) def run(maxiter): for ct in range(maxiter): myobserver.off() schedule() myobserver.on() #create a grid and a world based on this grid mygrid = TorusGrid(shape=params['world_shape']) myworld = GridWorld(topology=mygrid) #create agents, located in `myworld`, and then set display suggestions myagents = myworld.create_agents(AgentType=Agent, number=params['n_agents']) for agent in myagents: agent.display(shape='circle', fillcolor='red', shapesize=(0.25,0.25)) #add an observer myobserver = GridWorldGUI(myworld) #run the simulation by repeatedly executing the schedule run(maxiter=params['maxiter']) #dw
class Generator: movement_constant = 3 aquire_data = True def __init__(self, ideal_grid=None, ideal_grid_path=None): """ Set pygame stuff up for running the simulation.""" assert ideal_grid or ideal_grid_path, "must provide at least one ideal grid" self.gridworld = GridWorld(ideal_grid.width, ideal_grid.height, ideal_grid.gridsize) self.gridworld.set_ideal_grid(ideal_grid) self.printer = VirtualPrinter(10, 10, 9, 1, pygame.color.Color("darkorange"), self.gridworld) self.camera = VisualCamera(self.gridworld, self.printer, 3) self.ideal_camera = Camera(self.gridworld.ideal_grid, self.printer, 3) #gui stuff pygame.init() width = self.gridworld.width() * self.gridworld.gridsize() height = self.gridworld.height() * self.gridworld.gridsize() self.window = pygame.display.set_mode((width, height)) def generate(self, outputfile): inputs = [] outputs = [] self.printer.setPenDown() self.printer.v = Vector(0, 0) self.printer.position = Vector(270, 130) while self.aquire_data: actual = self.camera.camera.all_cell_values() ideal = self.ideal_camera.all_cell_values() inputs.append([i - a for a,i in zip(actual, ideal)]) outputs.append([self.printer.v.x, self.printer.v.y]) self.act_and_refresh() outputs = [[self.encode(x) + self.encode(y)] for x,y in outputs] self.aquire_data = True with open(outputfile, 'w') as output: writer = csv.writer(output) writer.writerow(camera_headers + output_headers) for inval, outval in zip(inputs, outputs): writer.writerow(inval + outval) def encode(self, velocity): if velocity >= 100: return "01" elif velocity <= -100: return "10" else: return "00" def act_and_refresh(self): self.act_on_key_input() self.printer.simulate(1) self.redraw() pygame.display.update() def act_on_key_input(self): for event in pygame.event.get(pygame.KEYUP): if event.key == pygame.K_p: self.print_all_camera_values() keys = pygame.key.get_pressed() if keys[pygame.K_LEFT]: self.printer.v = Vector(-100, 0) if keys[pygame.K_RIGHT]: self.printer.v = Vector(100, 0) if keys[pygame.K_UP]: self.printer.v = Vector(0, -100) if keys[pygame.K_DOWN]: self.printer.v = Vector(0, 100) if keys[pygame.K_SPACE]: self.printer.v = Vector(0, 0) if keys[pygame.K_q]: self.aquire_data = False def redraw(self): self.gridworld.draw(self.window) self.printer.draw(self.window) self.camera.draw(self.window)
def main(): env = GridWorld(3, 4) #Define the state matrix state_matrix = np.zeros((3,4)) state_matrix[0, 3] = 1 state_matrix[1, 3] = 1 state_matrix[1, 1] = -1 print("State Matrix:") print(state_matrix) #Define the reward matrix reward_matrix = np.full((3,4), -0.04) reward_matrix[0, 3] = 1 reward_matrix[1, 3] = -1 print("Reward Matrix:") print(reward_matrix) #Define the transition matrix transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1], [0.1, 0.8, 0.1, 0.0], [0.0, 0.1, 0.8, 0.1], [0.1, 0.0, 0.1, 0.8]]) #Define the policy matrix #This is the optimal policy for world with reward=-0.04 policy_matrix = np.array([[1, 1, 1, -1], [0, np.NaN, 0, -1], [0, 3, 3, 3]]) print("Policy Matrix:") print(policy_matrix) #Define and print the eligibility trace matrix trace_matrix = np.zeros((3,4)) print("Trace Matrix:") print(trace_matrix) env.setStateMatrix(state_matrix) env.setRewardMatrix(reward_matrix) env.setTransitionMatrix(transition_matrix) utility_matrix = np.zeros((3,4)) gamma = 0.999 #discount rate alpha = 0.1 #constant step size lambda_ = 0.5 #decaying factor tot_epoch = 300000 print_epoch = 100 for epoch in range(tot_epoch): #Reset and return the first observation observation = env.reset(exploring_starts=True) for step in range(1000): #Take the action from the action matrix action = policy_matrix[observation[0], observation[1]] #Move one step in the environment and get obs and reward new_observation, reward, done = env.step(action) #Estimate the error delta (Target - OldEstimate) delta = reward + gamma * utility_matrix[new_observation[0], new_observation[1]] - \ utility_matrix[observation[0], observation[1]] #Adding +1 in the trace matrix for the state visited trace_matrix[observation[0], observation[1]] += 1 #Update the utility matrix utility_matrix = update_utility(utility_matrix, trace_matrix, alpha, delta) #Update the trace matrix (decaying) trace_matrix = update_eligibility(trace_matrix, gamma, lambda_) observation = new_observation if done: break #return if(epoch % print_epoch == 0): print("") print("Utility matrix after " + str(epoch+1) + " iterations:") print(utility_matrix) print(trace_matrix) #Time to check the utility matrix obtained print("Utility matrix after " + str(tot_epoch) + " iterations:") print(utility_matrix)
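# The eligibility-trace helpers called above are not shown. A sketch of the
# accumulating-traces TD(lambda) versions, matching the call signatures used in
# the snippet; the exact bodies are assumptions.
def update_utility(utility_matrix, trace_matrix, alpha, delta):
    # every state moves towards the TD target in proportion to its trace
    utility_matrix += alpha * delta * trace_matrix
    return utility_matrix

def update_eligibility(trace_matrix, gamma, lambda_):
    # decay all traces after each step
    trace_matrix *= gamma * lambda_
    return trace_matrix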
#Function header reconstructed from context (only the body was present in this excerpt):
#it computes the discounted return of an episode, where each visit is an
#(observation, action, reward) tuple.
def get_return(state_list, gamma):
    counter, return_value = 0, 0
    for visit in state_list:
        return_value += visit[2] * np.power(gamma, counter)
        counter += 1
    return return_value

def update_policy_matrix(episode_list, policy_matrix, state_action_matrix):
    #Make the policy greedy with respect to the state-action values
    #for every state visited in the episode (terminal states, marked -1, are skipped)
    for visit in episode_list:
        observation = visit[0]
        col = observation[1] + (observation[0] * 4)
        if policy_matrix[observation[0], observation[1]] != -1:
            policy_matrix[observation[0], observation[1]] = np.argmax(state_action_matrix[:, col])
    return policy_matrix

if __name__ == "__main__":
    env = GridWorld(3, 4)
    state_matrix = np.zeros((3,4))
    state_matrix[0, 3] = 1
    state_matrix[1, 3] = 1
    state_matrix[1, 1] = -1
    reward_matrix = np.full((3, 4), -0.04)
    reward_matrix[0, 3] = 1
    reward_matrix[1, 3] = -1
    transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1],
                                  [0.1, 0.8, 0.1, 0.0],
                                  [0.0, 0.1, 0.8, 0.1],
                                  [0.1, 0.0, 0.1, 0.8]])
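For reference, get_return above treats each visit as an (observation, action, reward) tuple and discounts the reward by its position in the episode. A small worked example with arbitrary values:

#Three visits with rewards -0.04, -0.04, 1.0 and gamma = 0.999:
#the return is -0.04 + 0.999*(-0.04) + 0.999**2 * 1.0, roughly 0.918
episode = [((2, 0), 0, -0.04), ((1, 0), 0, -0.04), ((0, 0), 3, 1.0)]
print(get_return(episode, gamma=0.999))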
    #Tail of the learning routine (the function header is not part of this excerpt):
    #plot the per-episode returns against the running estimate and save the figure.
    plt.ylabel("Return")
    plt.ylim(-4, 1)
    plt.plot(returns)
    plt.plot(estimates)
    plt.legend(['Returns', 'Estimate'])
    pp = PdfPages('./plots/qplot.pdf')
    pp.savefig(fig)
    plt.close()
    pp.close()
    return returns, estimates, q

if __name__ == '__main__':
    env = GridWorld()
    mdp = GridWorld_MDP()
    U, pi, Ustart = policy_iteration(mdp, plot=True)
    print(pi)
    for x in range(env.num_states):
        print("{} : {}".format(env.state2loc[x], U[x]))
    print("_________________")
    vret, vest, v = td_learning(env, pi, gamma=1., alpha=0.1, episodes=2000, plot=True)
    for x in range(env.num_states):
        print("{} : {}".format(env.state2loc[x], v[x]))
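policy_iteration and td_learning are defined elsewhere in the project and are not shown here. As an illustrative sketch only (not the author's implementation), the core TD(0) prediction step that td_learning presumably performs for the fixed policy pi looks like this:

def td0_update(v, s, r, s_next, alpha, gamma):
    #Sketch of one TD(0) step for state-value prediction:
    #V(s) <- V(s) + alpha * (r + gamma * V(s') - V(s))
    v[s] += alpha * (r + gamma * v[s_next] - v[s])
    return v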
from gridworld import GridWorld

sim = GridWorld(60, 40, 10)
sim.set_cell(10, 10, (255, 255, 255))
sim.end
def train(cfg):
    # env = gym.make("FrozenLake-v0", is_slippery=False)  # 0 left, 1 down, 2 right, 3 up
    # env = FrozenLakeWapper(env)
    # env = gym.make("CliffWalking-v0")  # 0 up, 1 right, 2 down, 3 left
    # env = CliffWalkingWapper(env)
    gridmap = [
        'SFFFFH',
        'HHHFFH',
        'FFFFFH',
        'FFFFFH',
        'FHHHHH',
        'FFFFFG']
    env = GridWorld(gridmap)
    agent = QLearning(
        obs_dim=env.observation_space.n,
        action_dim=env.action_space.n,
        learning_rate=cfg.policy_lr,
        gamma=cfg.gamma,
        epsilon_start=cfg.epsilon_start,
        epsilon_end=cfg.epsilon_end,
        epsilon_decay=cfg.epsilon_decay)
    render = True  # whether to show the GUI
    rewards = []  # reward of every episode
    MA_rewards = []  # moving average of the rewards
    steps = []  # number of steps in every episode
    for i_episode in range(1, cfg.max_episodes+1):
        ep_reward = 0  # cumulative reward of this episode
        ep_steps = 0  # number of steps taken in this episode
        obs = env.reset()  # reset the environment and start a new episode
        while True:  # one episode
            action = agent.sample(obs)  # choose an action with the behaviour policy
            next_obs, reward, done, _ = env.step(action)  # interact with the environment
            # Q-learning update (does not need the next action)
            agent.learn(obs, action, reward, next_obs, done)
            obs = next_obs  # store the last observation
            ep_reward += reward
            ep_steps += 1  # count the steps
            if render:
                env.render()  # render a new frame
            if done:
                break
        steps.append(ep_steps)
        rewards.append(ep_reward)
        # moving average of the reward
        if i_episode == 1:
            MA_rewards.append(ep_reward)
        else:
            MA_rewards.append(0.9*MA_rewards[-1] + 0.1*ep_reward)
        print('Episode %s: steps = %s , reward = %.1f, explore = %.2f' % (
            i_episode, ep_steps, ep_reward, agent.epsilon))
        # render every 20 episodes to check the progress
        if i_episode % 20 == 0:
            render = True
        else:
            render = False
    agent.save()  # training finished, save the model
    output_path = os.path.dirname(__file__) + "/result/"
    # create the output folder if it does not exist
    if not os.path.exists(output_path):
        os.mkdir(output_path)
    np.save(output_path + "rewards_train.npy", rewards)
    np.save(output_path + "MA_rewards_train.npy", MA_rewards)
    np.save(output_path + "steps_train.npy", steps)
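The QLearning agent used by train() is not shown in this excerpt. A minimal sketch of a compatible tabular agent (the class name, fixed epsilon and save path are assumptions; the real class presumably decays epsilon from epsilon_start to epsilon_end):

import numpy as np

class QLearningSketch:
    """Sketch of a tabular Q-learning agent with the sample/learn/save interface used above."""
    def __init__(self, obs_dim, action_dim, learning_rate, gamma, epsilon=0.1):
        self.Q = np.zeros((obs_dim, action_dim))
        self.lr, self.gamma, self.epsilon = learning_rate, gamma, epsilon
        self.action_dim = action_dim

    def sample(self, obs):
        #epsilon-greedy behaviour policy
        if np.random.rand() < self.epsilon:
            return np.random.randint(self.action_dim)
        return int(np.argmax(self.Q[obs]))

    def learn(self, obs, action, reward, next_obs, done):
        #Q-learning target: r + gamma * max_a' Q(s', a'), or just r at terminal states
        target = reward if done else reward + self.gamma * np.max(self.Q[next_obs])
        self.Q[obs, action] += self.lr * (target - self.Q[obs, action])

    def save(self, path="q_table.npy"):  #path is an assumption
        np.save(path, self.Q)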
def main():
    env = GridWorld(3, 4)
    #Define the state matrix
    state_matrix = np.zeros((3,4))
    state_matrix[0, 3] = 1
    state_matrix[1, 3] = 1
    state_matrix[1, 1] = -1
    print("State Matrix:")
    print(state_matrix)
    #Define the reward matrix
    reward_matrix = np.full((3,4), -0.04)
    reward_matrix[0, 3] = 1
    reward_matrix[1, 3] = -1
    print("Reward Matrix:")
    print(reward_matrix)
    #Define the transition matrix
    transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1],
                                  [0.1, 0.8, 0.1, 0.0],
                                  [0.0, 0.1, 0.8, 0.1],
                                  [0.1, 0.0, 0.1, 0.8]])
    #Random policy
    policy_matrix = np.random.randint(low=0, high=4, size=(3, 4)).astype(np.float32)
    policy_matrix[1,1] = np.nan  #NaN for the obstacle at (1,1)
    policy_matrix[0,3] = policy_matrix[1,3] = -1  #No action for the terminal states
    print("Policy Matrix:")
    print(policy_matrix)
    print_policy(policy_matrix)
    #Adversarial exploration policy
    #exploratory_policy_matrix = np.array([[2, 3, 2, -1],
    #                                      [2, np.nan, 1, -1],
    #                                      [1, 1, 1, 0]])
    exploratory_policy_matrix = np.array([[1, 1, 1, -1],
                                          [0, np.nan, 0, -1],
                                          [0, 1, 0, 3]])
    print("Exploratory Policy Matrix:")
    print(exploratory_policy_matrix)
    print_policy(exploratory_policy_matrix)
    env.setStateMatrix(state_matrix)
    env.setRewardMatrix(reward_matrix)
    env.setTransitionMatrix(transition_matrix)
    state_action_matrix = np.zeros((4,12))
    visit_counter_matrix = np.zeros((4,12))
    gamma = 0.999
    alpha = 0.001  #constant step size
    tot_epoch = 5000000
    print_epoch = 1000
    for epoch in range(tot_epoch):
        #Reset and return the first observation
        observation = env.reset(exploring_starts=True)
        epsilon = return_decayed_value(0.1, epoch, decay_step=50000)
        is_starting = True
        for step in range(1000):
            #Take the action from the action matrix
            #action = policy_matrix[observation[0], observation[1]]
            #Take the action using epsilon-greedy
            #Note: a fixed epsilon=0.001 is passed here, while the decayed epsilon
            #computed above is only printed
            action = return_epsilon_greedy_action(exploratory_policy_matrix, observation, epsilon=0.001)
            if(is_starting):
                action = np.random.randint(0, 4)
                is_starting = False
            #Move one step in the environment and get obs and reward
            new_observation, reward, done = env.step(action)
            #Updating the state-action matrix
            state_action_matrix = update_state_action(state_action_matrix, visit_counter_matrix,
                                                      observation, new_observation,
                                                      action, reward, alpha, gamma)
            #Updating the policy
            policy_matrix = update_policy(policy_matrix, state_action_matrix, observation)
            #Increment the visit counter
            visit_counter_matrix = update_visit_counter(visit_counter_matrix, observation, action)
            observation = new_observation
            if done: break
        if(epoch % print_epoch == 0):
            print("")
            print("Epsilon: " + str(epsilon))
            print("State-Action matrix after " + str(epoch+1) + " iterations:")
            print(state_action_matrix)
            print("Policy matrix after " + str(epoch+1) + " iterations:")
            print_policy(policy_matrix)
    #Time to check the utility matrix obtained
    print("State-Action matrix after " + str(tot_epoch) + " iterations:")
    print(state_action_matrix)
    print("Policy matrix after " + str(tot_epoch) + " iterations:")
    print_policy(policy_matrix)
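return_decayed_value and return_epsilon_greedy_action are not part of this excerpt. A possible sketch of both helpers, with signatures inferred from the calls above:

import numpy as np

def return_decayed_value(starting_value, global_step, decay_step):
    #Sketch: exponential decay, one order of magnitude every decay_step steps
    return starting_value * np.power(0.1, global_step / decay_step)

def return_epsilon_greedy_action(policy_matrix, observation, epsilon=0.1):
    #Sketch: follow the policy with probability 1 - epsilon,
    #otherwise pick one of the four actions uniformly at random
    if np.random.uniform(0, 1) <= epsilon:
        return np.random.randint(0, 4)
    return int(policy_matrix[observation[0], observation[1]])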