def create_env():
    """Create the environment for the experiments.

    :return: the environment
    """
    # Create the environment as a 3x4 grid
    env = GridWorld(3, 4)

    # Define the state matrix
    state_matrix = np.zeros((3, 4))
    state_matrix[0, 3] = 1
    state_matrix[1, 3] = 1
    state_matrix[1, 1] = -1

    # Define the reward matrix:
    # every non-terminal state yields a reward of -0.04
    reward_matrix = np.full((3, 4), -0.04)
    reward_matrix[0, 3] = 1
    reward_matrix[1, 3] = -1

    # Define the action transition-probability matrix
    transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1],
                                  [0.1, 0.8, 0.1, 0.0],
                                  [0.0, 0.1, 0.8, 0.1],
                                  [0.1, 0.0, 0.1, 0.8]])

    # Configure and return the environment
    env.setStateMatrix(state_matrix)
    env.setRewardMatrix(reward_matrix)
    env.setTransitionMatrix(transition_matrix)
    return env

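# A minimal usage sketch for create_env(), assuming the API the related
# snippets in this section use: reset(exploring_starts=...) returns an
# observation, step(action) returns (observation, reward, done), and there
# are four integer actions. The random policy is for illustration only.
import numpy as np

env = create_env()
observation = env.reset(exploring_starts=True)
for step in range(1000):
    action = np.random.randint(4)  # random action, just to drive the loop
    observation, reward, done = env.step(action)
    if done:
        break
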
def test_parse(self):
    grid = ' #P\nG #'
    gw = GridWorld(grid)
    self.assertEqual(gw.grid[1][0], '#')
    self.assertEqual(gw.grid[2][0], 'P')
    self.assertEqual(gw.grid[0][1], 'G')
    self.assertEqual(gw.grid[1][1], ' ')

def init_or():
    '''Init the OR boolean environment

    @return the environment gridworld object
    '''
    env = GridWorld(5, 5)
    # Define the state matrix
    state_matrix = np.array([[1.0, 0.0, 0.0, 0.0, 1.0],
                             [0.0, 0.0, 0.0, 0.0, 0.0],
                             [0.0, 0.0, 0.0, 0.0, 0.0],
                             [0.0, 0.0, 0.0, 0.0, 0.0],
                             [1.0, 0.0, 0.0, 0.0, 1.0]])
    # Define the index matrix
    index_matrix = np.array([[(4,0), (4,1), (4,2), (4,3), (4,4)],
                             [(3,0), (3,1), (3,2), (3,3), (3,4)],
                             [(2,0), (2,1), (2,2), (2,3), (2,4)],
                             [(1,0), (1,1), (1,2), (1,3), (1,4)],
                             [(0,0), (0,1), (0,2), (0,3), (0,4)]])
    # Define the reward matrix
    reward_matrix = np.array([[1.0, 0.0, 0.0, 0.0, 1.0],
                              [0.0, 0.0, 0.0, 0.0, 0.0],
                              [0.0, 0.0, 0.0, 0.0, 0.0],
                              [0.0, 0.0, 0.0, 0.0, 0.0],
                              [-1.0, 0.0, 0.0, 0.0, 1.0]])
    # Define the transition matrix
    transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1],
                                  [0.1, 0.8, 0.1, 0.0],
                                  [0.0, 0.1, 0.8, 0.1],
                                  [0.1, 0.0, 0.1, 0.8]])
    env.setStateMatrix(state_matrix)
    env.setIndexMatrix(index_matrix)
    env.setRewardMatrix(reward_matrix)
    env.setTransitionMatrix(transition_matrix)
    return env, np.zeros((5, 5))

def main():
    world = GridWorld()
    q_table = np.zeros([len(world.available_actions()), 7, 10])
    q_table = train(world, q_table)
    moves = evaluate(world, q_table)
    print('Moves: ' + str(moves))
    print('Steps: ' + str(len(moves)))

def q_learning(q_tables, gamma=GAMMA, alpha=0.001,
               number_of_episodes=10000, max_step_number=1000):
    policy_list = []  # contains the final approximate optimal policy
    env = GridWorld()
    actions = ['up', 'down', 'left', 'right']
    indexes_actions = {-4: 0, 4: 1, -1: 2, 1: 3}
    rewards = 0
    for episode in range(number_of_episodes):
        obs = env.reset()
        number = 0  # steps taken in this episode, capped at max_step_number
        while True:  # one episode
            action = epsilon_greedy(obs, q_tables)  # action = A
            action_index = indexes_actions[action]
            next_obs, reward, done, _ = env.step(action)  # next_obs = S', reward = R
            rewards += reward
            q_tables[obs][action_index] = q_tables[obs][action_index] + alpha * (
                reward + gamma * max(q_tables[next_obs]) - q_tables[obs][action_index])
            obs = next_obs
            number += 1
            if done == 1 or number == max_step_number:  # terminal state or step cap reached
                break
    for row in range(len(q_tables)):
        policy_list.append(actions[np.argmax(q_tables[row])])
    performance = rewards / number_of_episodes
    optimal_policy = np.array(policy_list).reshape(4, 4)
    return q_tables, optimal_policy, performance

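# q_learning above calls an epsilon_greedy helper that is not shown. A hedged
# sketch of what it might look like: actions are encoded as state
# displacements on the 4x4 grid (-4 up, 4 down, -1 left, 1 right), matching
# the indexes_actions mapping above; EPSILON is an assumed module constant.
import random
import numpy as np

EPSILON = 0.1
DISPLACEMENTS = [-4, 4, -1, 1]  # up, down, left, right

def epsilon_greedy(obs, q_tables):
    if random.random() < EPSILON:
        return random.choice(DISPLACEMENTS)  # explore
    # exploit: pick the displacement whose Q-value is largest for this state
    return DISPLACEMENTS[int(np.argmax(q_tables[obs]))]
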
def question_3_1_a(printing=True):
    if printing:
        print('a) Develop a state graph representation for this search problem, and')
        print('develop a step() method for finding the next legal steps in this problem,')
        print('i.e. for generating successor nodes (vertices).')
        print()
    obstacle_coords = [(2, 2), (2, 3), (2, 4), (2, 5), (2, 6),
                       (3, 6), (4, 6), (5, 6), (6, 6),
                       (7, 6), (7, 3), (7, 4), (7, 5)]
    env_config = {
        'nrow': 9,
        'ncol': 9,
        'obstacle_coords': obstacle_coords,
        'start_coord': (8, 0),
        'goal_coord': (0, 8),
        'cost_map': None
    }
    env = GridWorld(env_config)
    if printing:
        print([i for i in dir(env) if '__' not in i], '\n')
    for k, v in vars(env).items():
        if k == 'cfg':
            continue
        if printing:
            print(k, v, '\n')
    return env

def _run(FLAGS, model_cls):
    logger = logging.getLogger('Trainer_%s' % model_cls.__name__)
    logger.setLevel(logging.INFO)
    file_handler = logging.FileHandler('%s.log' % model_cls.__name__)
    file_handler.setLevel(logging.INFO)
    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.INFO)
    formatter = logging.Formatter('[%(asctime)s] ## %(message)s')
    file_handler.setFormatter(formatter)
    stream_handler.setFormatter(formatter)
    logger.addHandler(file_handler)
    logger.addHandler(stream_handler)

    hparams = tf.contrib.training.HParams(**COMMON_HPARAMS.values())
    hparams.set_hparam('batch_size', FLAGS.bs)
    hparams.set_hparam('n_steps', FLAGS.stp)
    hparams.set_hparam('n_dims', FLAGS.dims)
    hparams.set_hparam('n_info_dims', FLAGS.info_dims)
    hparams.set_hparam('n_att_dims', FLAGS.att_dims)
    hparams.set_hparam('max_epochs', FLAGS.epochs)
    hparams.set_hparam('checkpoint', FLAGS.ckpt)
    hparams.set_hparam('n_heads', FLAGS.heads)
    hparams.set_hparam('n_selfatt_dims', FLAGS.selfatt_dims)
    assert hparams.n_dims == hparams.n_info_dims + hparams.n_att_dims, \
        "`n_dims` should be equal to the sum of `n_info_dims` and `n_att_dims`"
    assert hparams.n_dims == hparams.n_heads * hparams.n_selfatt_dims, \
        "`n_dims` should be equal to the product of `n_heads` and `n_selfatt_dims`"

    name_size = 'SZ%d-STP%d' % (FLAGS.sz, FLAGS.stp)
    config_size = Config(size=FLAGS.sz, max_steps=FLAGS.stp)
    for name_std, config_std in CONFIG_STDS.items():
        for name_drop, config_drop in CONFIG_DROPS.items():
            for name_direction, config_direction in CONFIG_DIRECTIONS.items():
                config = Config()
                config.add('base', 'base', CONFIG_BASE)
                config.add('size', name_size, config_size)
                config.add('direction', name_direction, config_direction)
                config.add('drop', name_drop, config_drop)
                config.add('std', name_std, config_std)
                gridworld = GridWorld(name=config.get_name(), **config.get_kwargs())
                for seed in GRIDWORLD_SEEDS:
                    data_dir = '%s-SEED%d' % (config.get_name(), seed)
                    gridworld.load(data_dir, seed=seed, splitting_seed=SPLITTING_SEED)
                    dataset_name = config.get_name()
                    for shuffling_seed in SHUFFLING_SEEDS:
                        dataset = Dataset(dataset_name,
                                          os.path.join(BASE_DIR, data_dir),
                                          shuffling_seed=shuffling_seed)
                        model = model_cls(dataset, hparams, gridworld, seed=MODEL_SEED)
                        Trainer(model, logger)()

def main():
    env = GridWorld(3, 4)
    state_matrix = np.zeros((3, 4))
    state_matrix[0, 3] = 1
    state_matrix[1, 3] = 1
    state_matrix[1, 1] = -1
    print(state_matrix)

    reward_matrix = np.full((3, 4), -0.04)
    reward_matrix[0, 3] = 1
    reward_matrix[1, 3] = -1
    print(reward_matrix)

    transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1],
                                  [0.1, 0.8, 0.1, 0.0],
                                  [0.0, 0.1, 0.8, 0.1],
                                  [0.1, 0.0, 0.1, 0.8]])
    state_action_matrix = np.random.random((4, 12))
    print("State Action matrix")
    print(state_action_matrix)

    env.setStateMatrix(state_matrix)
    env.setRewardMatrix(reward_matrix)
    env.setTransitionMatrix(transition_matrix)
    utility_matrix = np.zeros((3, 4))
    print("utility matrix")
    print(utility_matrix)

    gamma, alpha, tot_epoch, print_epoch = 0.999, 0.1, 30000, 1000
    for epoch in range(tot_epoch):
        observation = env.reset(exploring_starts=True)
        for step in range(1000):
            col = observation[1] + (4 * observation[0])
            # Send the action to the environment
            action_array = state_action_matrix[:, col]
            action_distribution = softmax(action_array)
            action = np.random.choice(4, 1, p=action_distribution)
            new_observation, reward, done = env.step(action)
            # Update the critic
            utility_matrix, delta = update_critic(utility_matrix, alpha,
                                                  observation, new_observation,
                                                  reward, gamma)
            # Update the actor
            state_action_matrix = update_actor(state_action_matrix, observation,
                                               action, delta, beta_matrix=None)
            observation = new_observation
            if done:
                break
    print("final utility matrix")
    print(utility_matrix)
    print("final state action matrix")
    print(state_action_matrix)

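# Hedged sketches of the three helpers the actor-critic loop above assumes;
# none of them is shown in this section. update_critic applies a TD(0) update
# to the state utilities and returns the TD error delta, which update_actor
# then uses to adjust the preference of the action actually taken.
import numpy as np

def softmax(x):
    z = x - np.max(x)  # subtract the max for numerical stability
    return np.exp(z) / np.sum(np.exp(z))

def update_critic(utility_matrix, alpha, observation, new_observation,
                  reward, gamma):
    u = utility_matrix[observation[0], observation[1]]
    u_next = utility_matrix[new_observation[0], new_observation[1]]
    delta = reward + gamma * u_next - u
    utility_matrix[observation[0], observation[1]] += alpha * delta
    return utility_matrix, delta

def update_actor(state_action_matrix, observation, action, delta,
                 beta_matrix=None):
    col = observation[1] + 4 * observation[0]  # same state indexing as main()
    beta = 1 if beta_matrix is None else beta_matrix[action, col]
    state_action_matrix[action, col] += beta * delta
    return state_action_matrix
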
def load_gridworld(filename):
    grid = []
    with open(filename, newline='') as f:
        reader = csv.reader(f)
        for row in reader:
            grid_row = []
            for col in row:
                grid_row.append(int(col))
            grid.append(grid_row)
    return GridWorld(grid)

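# A hypothetical usage example for load_gridworld(). The expected file is a
# plain CSV of integers; what the values encode (e.g. free vs. obstacle
# cells) depends on the GridWorld implementation, which is not shown here.
import csv

rows = [[0, 0, 0, 1],
        [0, 1, 0, 0],
        [0, 0, 0, 0]]
with open('grid.csv', 'w', newline='') as f:
    csv.writer(f).writerows(rows)

env = load_gridworld('grid.csv')
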
def example_1():
    # example 1
    height = 6
    width = 2
    start = [5, 0]
    goals = ([5, 0],)  # note the trailing comma: bare parentheses would not make a tuple
    walls = ([2, 1], [2, 2], [2, 3], [3, 1], [3, 2], [3, 3])
    cliffs = ([1, 1], [1, 2], [1, 3])
    env = GridWorld(height, width, False, False, start, goals, walls, cliffs)
    env.render(mode='simple_render')

def example_3():
    # example 3
    height = 3
    width = 3
    start = [0, 0]
    goals = ([2, 2],)  # one-element tuple; bare parentheses would not make a tuple
    walls = None
    cliffs = None
    env = GridWorld(height, width, False, False, start, goals, walls, cliffs)
    env.render(mode='simple_render')

def test_move_dir(self):
    grid = '   \n   \n   '  # 3x3 grid of empty cells
    gw = GridWorld(grid)
    start = (1, 1)
    # N, E, S, W
    tests = [(0, (1, 0)), (1, (2, 1)), (2, (1, 2)), (3, (0, 1))]
    for dir, end in tests:
        e, _, _ = gw.move_dir(start, dir)
        self.assertEqual(e, end)

def configure_gridworld() -> Tuple[Domain, Task]:
    domain = GridWorld(10, 7,
                       agent_x_start=0,
                       agent_y_start=3,
                       wind=True,
                       wind_strengths=[0, 0, 0, 1, 1, 1, 2, 1, 1, 0],
                       stochasticity=stochasticity)
    domain.place_exit(7, 3)
    task = ReachExit(domain)
    return domain, task

def main(args):
    if args.verbose:
        logging.basicConfig(level=logging.INFO)
    elif args.debug:
        logging.basicConfig(level=logging.DEBUG)

    # initializations
    gridworld = GridWorld(args.size, args.interval, args.obstacles,
                          args.vision, args.phase)
    logging.info("Generated grid world!")
    logging.info("Visuals created")
    mc = MonteCarlo(gridworld, mode=args.method)
    logging.info("Initialized Monte Carlo method")
    mc.run()

def main(args):
    os.makedirs(args.output_dir, exist_ok=True)
    for k in tqdm.trange(args.count):
        g = GridWorld(args.max_size, args.max_size)
        size = 2 * random.randint(args.min_size // 2, args.max_size // 2) + 1
        if size < args.max_size:
            pad = (args.max_size - size) // 2
            g._fill_rect(1, 1, pad, args.max_size)
            g._fill_rect(args.max_size - pad + 1, 1, args.max_size, args.max_size)
            g._fill_rect(1, 1, args.max_size, pad)
            g._fill_rect(1, args.max_size - pad + 1, args.max_size, args.max_size)
        else:
            pad = 0
        wall_count = random.randint(1, args.wall_count)
        for _ in range(wall_count):
            is_vert = random.random() > 0.5
            wall_coord = random.randint(2, size - 1)
            wall_len = random.randint(2, size - 2)
            wall_start = random.randint(1, size - wall_len)
            if is_vert:
                g.add_vertical_wall(pad + wall_coord, pad + wall_start,
                                    pad + wall_start + wall_len - 1)
            else:
                g.add_horizontal_wall(pad + wall_coord, pad + wall_start,
                                      pad + wall_start + wall_len - 1)
        connected_component = list(check_connectivity(g))
        random.shuffle(connected_component)
        start_count = random.randint(1, args.start_count)
        for _ in range(start_count):
            g.add_start(*connected_component.pop())
        goal_count = random.randint(1, args.goal_count)
        for _ in range(goal_count):
            g.add_goal(*connected_component.pop())
        trap_count = random.randint(0, args.trap_count)
        for _ in range(trap_count):
            g.add_trap(*connected_component.pop())
        g.save(os.path.join(
            args.output_dir,
            "grid{0:03d}_{1}x{1}_w{2}_s{3}_g{4}_t{5}.pkl".format(
                k, size, wall_count, start_count, goal_count, trap_count)))

def main():
    env = GridWorld(3, 4)
    state_matrix = np.zeros((3, 4))
    state_matrix[0, 3] = 1
    state_matrix[1, 3] = 1
    state_matrix[1, 1] = -1
    print(state_matrix)

    reward_matrix = np.full((3, 4), -0.04)
    reward_matrix[0, 3] = 1
    reward_matrix[1, 3] = -1
    print(reward_matrix)

    transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1],
                                  [0.1, 0.8, 0.1, 0.0],
                                  [0.0, 0.1, 0.8, 0.1],
                                  [0.1, 0.0, 0.1, 0.8]])
    policy_matrix = np.array([[1, 1, 1, -1],
                              [0, np.NaN, 0, -1],
                              [0, 3, 3, 3]])
    trace_matrix = np.zeros((3, 4))
    env.setStateMatrix(state_matrix)
    env.setRewardMatrix(reward_matrix)
    env.setTransitionMatrix(transition_matrix)
    utility_matrix = np.zeros((3, 4))

    gamma, alpha, tot_epoch, print_epoch, lambda_ = 0.999, 0.1, 30000, 1000, 0.5
    for epoch in range(tot_epoch):
        observation = env.reset(exploring_starts=True)
        for step in range(1000):
            action = policy_matrix[observation[0]][observation[1]]
            new_observation, reward, done = env.step(action)
            delta = reward \
                + gamma * utility_matrix[new_observation[0]][new_observation[1]] \
                - utility_matrix[observation[0]][observation[1]]
            trace_matrix[observation[0]][observation[1]] += 1
            utility_matrix = update_utility_matrix(utility_matrix, alpha,
                                                   delta, trace_matrix)
            trace_matrix = update_eligibility_matrix(trace_matrix, gamma, lambda_)
            observation = new_observation
            if done:
                break
        if epoch % print_epoch == 0:
            print("utility matrix after %d iterations:" % epoch)
            print(utility_matrix)
    print("final utility matrix:", utility_matrix)

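# The TD(lambda) loop above relies on two helpers that are not shown. A
# hedged sketch with accumulating eligibility traces: every state's utility
# moves by alpha * delta weighted by its trace, and all traces then decay by
# gamma * lambda after each step.
def update_utility_matrix(utility_matrix, alpha, delta, trace_matrix):
    utility_matrix += alpha * delta * trace_matrix
    return utility_matrix

def update_eligibility_matrix(trace_matrix, gamma, lambda_):
    trace_matrix *= gamma * lambda_
    return trace_matrix
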
def main():
    grid = ''
    with open("grid.lay", "r") as file:
        grid = file.read()
    eps = 0.2
    episodes = 10000

    random.seed(1)
    gw = GridWorld(grid)
    Q = SARSA(gw, episodes=episodes, eps=eps)
    # plotQ(Q, gw, f'SARSA after {episodes} episodes')
    plotPolicy(Q, gw, f'SARSA: greedy-policy after {episodes} episodes')

    random.seed(1)
    Q = QLearning(gw, episodes=episodes, eps=eps)
    # plotQ(Q, gw, f'Q-Learning after {episodes} episodes')
    plotPolicy(Q, gw, f'Q-Learning: greedy-policy after {episodes} episodes')

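# The SARSA and QLearning functions used above are not shown in this section.
# A hedged sketch of what tabular SARSA over this GridWorld might look like,
# assuming states are (x, y) tuples, the four directions are encoded 0-3 as
# in test_move_dir, and gw.move_dir(state, dir) returns
# (next_state, reward, terminal) as in the tests. The gw.start attribute and
# the alpha/gamma defaults are assumptions.
import random
from collections import defaultdict

def SARSA(gw, episodes=10000, eps=0.2, alpha=0.1, gamma=0.99):
    Q = defaultdict(float)

    def policy(s):
        # epsilon-greedy over the four directions
        if random.random() < eps:
            return random.randrange(4)
        return max(range(4), key=lambda a: Q[(s, a)])

    for _ in range(episodes):
        s = gw.start  # assumed start-state attribute
        a = policy(s)
        terminal = False
        while not terminal:
            s2, r, terminal = gw.move_dir(s, a)
            a2 = policy(s2)
            # on-policy update: bootstrap from the action actually taken next
            Q[(s, a)] += alpha * (r + gamma * Q[(s2, a2)] - Q[(s, a)])
            s, a = s2, a2
    return Q
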
def random_play(n_steps):
    # env from example_3
    height = 3
    width = 3
    start = [0, 0]
    goals = ([2, 2],)  # one-element tuple; bare parentheses would not make a tuple
    walls = None
    cliffs = None
    env = GridWorld(height, width, False, False, start, goals, walls, cliffs)

    # random actions over n_steps:
    env.reset()
    for step in range(n_steps):
        action = env.action_space_sample()
        new_state, reward, done = env.step(action)
        print("Step:", step, ", Action:", action,
              ", New state:", env.get_obs(),
              ", Done:", done, ", Reward:", reward)
    env.render(mode='episode')

def test_norm_wind(self):
    env = GridWorld()
    state = env.reset()
    for _ in range(4):
        state, _, _ = env.step(0)  # move right
    self.assertTrue(np.array_equal(state, np.array([4, 4])))
    for _ in range(2):
        state, _, _ = env.step(0)  # move right
    self.assertTrue(np.array_equal(state, np.array([6, 6])))
    state, _, _ = env.step(3)  # move down
    self.assertTrue(np.array_equal(state, np.array([6, 6])))
    for _ in range(5):
        state, _, _ = env.step(0)  # move right
    self.assertTrue(np.array_equal(state, np.array([9, 6])))
    for _ in range(4):
        state, _, _ = env.step(3)  # move down
    for _ in range(2):
        state, _, done = env.step(2)  # move left
    self.assertTrue(done)

def first_visit_monte_carlo_evaluate(gamma=GAMMA, number_of_episodes=100000):
    env = GridWorld()
    policy = Get_Action()
    values = np.zeros(16)
    returns = {state: list() for state in range(16)}
    for episode in range(number_of_episodes):
        observations, _, rewards, _ = generate_one_episode(env, policy)
        observations.pop()  # drop the terminal state s_T
        G = 0
        # walk the episode backwards, accumulating the discounted return
        for i, obs in enumerate(observations[::-1]):
            G = gamma * G + rewards[::-1][i]
            # first-visit check: count G only if obs does not occur earlier
            if obs not in observations[::-1][i + 1:]:
                returns[obs].append(G)
                values[obs] = np.average(returns[obs])
        values[15] = 0
        if episode % 10000 == 0:
            print(f"In episode No.{episode} the values are {values}.")
    return values

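# generate_one_episode and Get_Action are not shown above. A hedged sketch of
# the rollout helper, assuming the env API used in the related snippets
# (reset() -> obs, step(action) -> (obs, reward, done, info)) and that the
# policy object is callable on an observation.
def generate_one_episode(env, policy, max_steps=1000):
    observations, actions, rewards, infos = [], [], [], []
    obs = env.reset()
    observations.append(obs)
    for _ in range(max_steps):
        action = policy(obs)
        obs, reward, done, info = env.step(action)
        observations.append(obs)
        actions.append(action)
        rewards.append(reward)
        infos.append(info)
        if done:
            break
    return observations, actions, rewards, infos
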
def plot_gridworld(n_rows=2, n_cols=3, figsize=(10, 6), eps=0,
                   save_path='gridworld_demo.svg', seed=42, dtype='bool'):
    """Makes a picture of an expert trajectory.

    :param n_rows: number of rows to put the grids in
    :param n_cols: number of columns to put the grids in
    :param figsize: figure size
    :param eps: probability of a random action for the expert
    :param save_path: path to save the result
    :param seed: random seed to set to numpy
    :param dtype: observation dtype, for checking that both dtypes work the same way
    """
    total = n_rows * n_cols
    np.random.seed(seed)
    env = GridWorld(5, 5, 3, obs_dtype=dtype)
    env.reset()
    done = False
    grids = [env.render(mode='get_grid')]
    while not done:
        action = env.get_expert_action(eps=eps)
        _, _, done, _ = env.step(action)
        grids.append(env.render(mode='get_grid'))
    if total < len(grids):
        display_ind = np.linspace(0, len(grids) - 1, total, dtype=int)
        grids = [grids[i] for i in display_ind]
    fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=figsize)
    fig.suptitle('Example of an expert trajectory')
    for r in range(n_rows):
        for c in range(n_cols):
            ind = r * n_cols + c
            ax = axes[r, c]
            ax.set_axis_off()
            if ind < len(grids):
                grid = grids[ind]
                ax.imshow(grid)
    plt.savefig(save_path)

def generate_data(FLAGS):
    name_size = 'SZ%d-STP%d' % (FLAGS.sz, FLAGS.stp)
    config_size = Config(size=FLAGS.sz, max_steps=FLAGS.stp)
    for name_std, config_std in CONFIG_STDS.items():
        for name_drop, config_drop in CONFIG_DROPS.items():
            for name_direction, config_direction in CONFIG_DIRECTIONS.items():
                config = Config()
                config.add('base', 'base', CONFIG_BASE)
                config.add('size', name_size, config_size)
                config.add('direction', name_direction, config_direction)
                config.add('drop', name_drop, config_drop)
                config.add('std', name_std, config_std)
                gridworld = GridWorld(name=config.get_name(), **config.get_kwargs())
                for seed in GRIDWORLD_SEEDS:
                    data_dir = '%s-SEED%d' % (config.get_name(), seed)
                    gridworld.generate(data_dir=data_dir, seed=seed,
                                       splitting_seed=SPLITTING_SEED)

def main():
    cost_map = []
    cost_map.append([1, 1, 1, 5, 5, 5, 5, 1, None])
    cost_map.append([1, 1, 1, 5, 5, 5, 5, 1, 1])
    cost_map.append([1, 1, 10, 10, 10, 10, 10, 1, 1])
    cost_map.append([1, 1, 1, 10, 10, 10, 10, 1, 1])
    cost_map.append([1, 1, 1, 1, 1, 10, 10, 1, 1])
    cost_map.append([1, 1, 1, 1, 1, 10, 10, 1, 1])
    cost_map.append([1, 1, 1, 1, 10, 10, 10, 1, 1])
    cost_map.append([1, 1, 1, 10, 10, 10, 10, 1, 1])
    cost_map.append([0, 1, 1, 1, 1, 1, 1, 1, 1])
    env_config = {'nrow': 9,
                  'ncol': 9,
                  'obstacle_coords': [],
                  'start_coord': (8, 0),
                  'goal_coord': (0, 8),
                  'cost_map': cost_map}
    env = GridWorld(env_config)
    start = env.start_state
    goal = env.goal_state
    question_3_2_a(start, goal, env)
    question_3_2_b(start, goal, env)
    question_3_2_c(start, goal, env)
    question_3_2_d_and_e(start, goal, env)
    question_3_2_f(start, goal, env)

def run_instance(param):
    # Runs the sim with the given parameters for different controllers and
    # trials, and writes the results to the results directory.

    # init environment
    if param.env_name == 'gridworld':
        env = GridWorld(param)
    elif param.env_name == 'citymap':
        env = CityMap(param)
    else:
        exit('env_name not recognized: {}'.format(param.env_name))

    # run sim
    for i_trial in range(param.n_trials):

        # init datasets
        if param.make_dataset_on:
            print(' making dataset...')
            train_dataset, test_dataset = datahandler.make_dataset(env)
            datahandler.write_dataset(env, train_dataset, test_dataset)
        print(' loading dataset...')
        datahandler.load_dataset(env)

        # initial condition
        s0 = env.get_s0()

        for controller_name in param.controller_names:
            controller = Controller(param, env, controller_name)

            # sim
            sim_result = sim(param, env, controller, s0)

            # write results
            case_count = len(glob.glob('../current_results/*')) + 1
            results_dir = '../current_results/sim_result_{}'.format(case_count)
            datahandler.write_sim_result(sim_result, results_dir)

    return

def init_nand(bias=True):
    '''Init the NAND boolean environment

    @return the environment gridworld object
    '''
    env = GridWorld(5, 5)
    # Define the state matrix
    state_matrix = np.array([[1.0, 0.0, 0.0, 0.0, 1.0],
                             [0.0, 0.0, 0.0, 0.0, 0.0],
                             [0.0, 0.0, 0.0, 0.0, 0.0],
                             [0.0, 0.0, 0.0, 0.0, 0.0],
                             [1.0, 0.0, 0.0, 0.0, 1.0]])
    # Define the index matrix
    index_matrix = np.array([[(4,0), (4,1), (4,2), (4,3), (4,4)],
                             [(3,0), (3,1), (3,2), (3,3), (3,4)],
                             [(2,0), (2,1), (2,2), (2,3), (2,4)],
                             [(1,0), (1,1), (1,2), (1,3), (1,4)],
                             [(0,0), (0,1), (0,2), (0,3), (0,4)]])
    # Define the reward matrix
    reward_matrix = np.array([[1.0, 0.0, 0.0, 0.0, -1.0],
                              [0.0, 0.0, 0.0, 0.0, 0.0],
                              [0.0, 0.0, 0.0, 0.0, 0.0],
                              [0.0, 0.0, 0.0, 0.0, 0.0],
                              [1.0, 0.0, 0.0, 0.0, 1.0]])
    # Define the transition matrix
    transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1],
                                  [0.1, 0.8, 0.1, 0.0],
                                  [0.0, 0.1, 0.8, 0.1],
                                  [0.1, 0.0, 0.1, 0.8]])
    env.setStateMatrix(state_matrix)
    env.setIndexMatrix(index_matrix)
    env.setRewardMatrix(reward_matrix)
    env.setTransitionMatrix(transition_matrix)
    if bias:
        return env, np.random.uniform(-1, 1, 3)
    else:
        return env, np.random.uniform(-1, 1, 2)

def test_move(self):
    grid = ' #P\nG #'
    gw = GridWorld(grid, move_value=-1, die_value=-20, win_value=10)
    step_tests = [
        # move into wall
        ((0, 0), (1, 0), (0, 0), -1, False),
        # move to free field
        ((0, 0), (1, 1), (1, 1), -1, False),
        # move to goal
        ((0, 0), (0, 1), (0, 1), 10, True),
        # die penalty
        ((0, 0), (2, 0), (2, 0), -20, True),
        # out of bounds #1
        ((0, 0), (-1, 0), (0, 0), -1, False),
        # out of bounds #2
        ((0, 0), (10, 0), (0, 0), -1, False),
    ]
    for start, to, end, reward, is_terminal in step_tests:
        e, r, t = gw.move(start, to)
        self.assertEqual(e, end)
        self.assertEqual(r, reward)
        self.assertEqual(t, is_terminal)

def train(cfg):
    # env = gym.make("FrozenLake-v0", is_slippery=False)  # 0 left, 1 down, 2 right, 3 up
    # env = FrozenLakeWapper(env)
    # env = gym.make("CliffWalking-v0")  # 0 up, 1 right, 2 down, 3 left
    # env = CliffWalkingWapper(env)
    gridmap = [
        'SFFFFH',
        'HHHFFH',
        'FFFFFH',
        'FFFFFH',
        'FHHHHH',
        'FFFFFG']
    env = GridWorld(gridmap)
    agent = QLearning(
        obs_dim=env.observation_space.n,
        action_dim=env.action_space.n,
        learning_rate=cfg.policy_lr,
        gamma=cfg.gamma,
        epsilon_start=cfg.epsilon_start,
        epsilon_end=cfg.epsilon_end,
        epsilon_decay=cfg.epsilon_decay)
    render = True  # whether to show the GUI
    rewards = []  # rewards of all episodes
    MA_rewards = []  # moving average of the rewards
    steps = []  # step counts of all episodes
    for i_episode in range(1, cfg.max_episodes + 1):
        ep_reward = 0  # total reward of this episode
        ep_steps = 0  # number of steps taken in this episode
        obs = env.reset()  # reset the environment, i.e. start a new episode
        while True:
            action = agent.sample(obs)  # let the algorithm pick an action
            next_obs, reward, done, _ = env.step(action)  # one interaction with the environment
            # train the Q-learning agent; the next action is not needed
            agent.learn(obs, action, reward, next_obs, done)
            obs = next_obs  # carry the observation over to the next step
            ep_reward += reward
            ep_steps += 1  # count the steps
            if render:
                env.render()  # render a new frame
            if done:
                break
        steps.append(ep_steps)
        rewards.append(ep_reward)
        # compute the moving average of the reward
        if i_episode == 1:
            MA_rewards.append(ep_reward)
        else:
            MA_rewards.append(0.9 * MA_rewards[-1] + 0.1 * ep_reward)
        print('Episode %s: steps = %s , reward = %.1f, explore = %.2f' %
              (i_episode, ep_steps, ep_reward, agent.epsilon))
        # render every 20 episodes to check progress
        if i_episode % 20 == 0:
            render = True
        else:
            render = False
    agent.save()  # training finished, save the model
    output_path = os.path.dirname(__file__) + "/result/"
    # create the output folder if it does not exist
    if not os.path.exists(output_path):
        os.mkdir(output_path)
    np.save(output_path + "rewards_train.npy", rewards)
    np.save(output_path + "MA_rewards_train.npy", MA_rewards)
    np.save(output_path + "steps_train.npy", steps)

from gridworld import GridWorld

sim = GridWorld(60, 40, 10)
sim.set_cell(10, 10, (255, 255, 255))
sim.end

plt.ylabel("Return") plt.ylim(-4, 1) plt.plot(returns) plt.plot(estimates) plt.legend(['Returns', 'Estimate']) pp = PdfPages('./plots/qplot.pdf') pp.savefig(fig) plt.close() pp.close() return returns, estimates, q if __name__ == '__main__': env = GridWorld() mdp = GridWorld_MDP() U, pi, Ustart = policy_iteration(mdp, plot=True) print(pi) for x in range(env.num_states): print("{} : {}".format(env.state2loc[x], U[x])) print("_________________") vret, vest, v = td_learning(env, pi, gamma=1., alpha=0.1, episodes=2000, plot=True) for x in range(env.num_states): print("{} : {}".format(env.state2loc[x], v[x]))
def go(self):
    # Subscribers to know the position
    pos_sub = rospy.Subscriber("/base_pose_ground_truth", Odometry, self.pos_callback)
    anemo_sub = rospy.Subscriber("/Anemometer/WindSensor_reading", anemometer,
                                 self.wind_callback)
    rospy.wait_for_message("/Anemometer/WindSensor_reading", anemometer,
                           rospy.Duration(5.0))

    # Probability-map publishers
    prob_pub = rospy.Publisher("mapping_viz", OccupancyGrid, queue_size=10)
    prob_val = rospy.Publisher("max_prob_val", Float64, queue_size=10)

    # Choose between service and topic
    if self.use_service_for_gas:
        rospy.wait_for_service("odor_value")
        odor_req = rospy.ServiceProxy('odor_value', GasPosition)
    else:
        sensor_sub = rospy.Subscriber("/PID/Sensor_reading", gas_sensor,
                                      self.sensor_callback)
        rospy.wait_for_message("/PID/Sensor_reading", gas_sensor, rospy.Duration(4.0))

    # Get grid parameters
    grid = GridWorld()
    if self.adjust_tmax:
        self.got_initial_position = False
        self.x_prev = grid.xlims[1]

    # Algorithm-specific parameters: initialize the matrices
    alpha = (1.0 / grid.M) * np.ones(grid.M)
    Sij = np.zeros(grid.M)
    beta = np.zeros(grid.M)
    gamma = (1.0 / grid.M) * np.ones(grid.M)
    decimal_shifter = 1000

    # OccupancyGrid
    prob = OccupancyGrid()
    prob.info.height = grid.m
    prob.info.width = grid.n
    prob.info.resolution = grid.res
    prob.info.origin.position.x = 0
    prob.info.origin.position.y = 0
    prob.info.origin.position.z = 0
    prob.info.map_load_time = rospy.Time.now()
    prob.header.frame_id = self.fixed_frame
    prob.data = (alpha * 100).astype(np.int8).tolist()

    self.start_time = rospy.Time.now()
    self.K = -1
    r = rospy.Rate(2)  # might have to be changed later
    while not rospy.is_shutdown():
        if self.L is None:
            continue

        # Local copies, taken each timestep, of continuously changing values
        x_pos, y_pos = self.x, self.y
        K = self.K

        # Read the chemical concentration
        if self.use_service_for_gas:
            try:
                odor_res = odor_req(x_pos, y_pos, grid.height)
                gas_conc = odor_res.gas_conc[0]
            except rospy.ServiceException as e:
                rospy.logerr("[mapping.py] Odor service call failed %s" % e)
        else:
            gas_conc = self.gas_conc

        # Check whether a detection occurs
        detection = gas_conc > self._conc_epsilon
        if not self.adjust_wind_interval(x_pos, y_pos, detect=detection):
            continue
        rospy.loginfo("x,y = [%.2f,%.2f], Gas concentration: %.2f",
                      x_pos, y_pos, gas_conc)

        # Wind values from t_L to t_K, dropping the 'time' column of wind_history
        wind_data = np.delete(self.wind_history[self.L:K + 1], 2, 1).astype(float)
        Vx, Vy = np.sum(wind_data, 0)

        beta[:] = 0
        gamma[:] = 1
        for t0 in range(self.L, K):
            tl = self.wind_history[t0][2].to_sec()
            tk = self.wind_history[K][2].to_sec()
            deviation_x = math.sqrt(tk - tl) * grid.sx
            deviation_y = math.sqrt(tk - tl) * grid.sy
            for i in range(0, grid.M):
                deltax = x_pos - grid.xcell[i] - Vx
                deltay = y_pos - grid.ycell[i] - Vy
                Sij[i] = grid.res**2 * np.exp((-deltax**2) / (2 * deviation_x**2)) * \
                    np.exp((-deltay**2) / (2 * deviation_y**2)) / \
                    (2 * np.pi * deviation_x * deviation_y)
            try:
                Sij /= np.sum(Sij)
            except RuntimeWarning:
                rospy.logerr("All values of Sij = 0.\nsx and/or sy has to be changed")
            if detection:
                beta = beta + Sij
            else:
                gamma = gamma * (1 - grid.mu * Sij)

        if self.L != K:
            if detection:
                beta /= (K - self.L)
                alpha_k = grid.M * beta * alpha
            else:
                alpha_k = (grid.M / np.sum(gamma)) * gamma * alpha
            alpha_k = alpha_k / np.sum(alpha_k)
            alpha = alpha_k

        # The probability map is scaled up by a factor so the colors show in
        # Rviz; an occupancy map only supports integers in 0-100
        prob.data = (alpha * decimal_shifter).astype(np.int8).tolist()
        prob.data = [100 if x > 100 else x for x in prob.data]
        if self.verbose:
            if self.L != 0:
                rospy.loginfo("self.L = %d", self.L)
        prob_pub.publish(prob)
        prob_val.publish(Float64(np.max(alpha)))
        r.sleep()