def test_gridworld_q_learning():
    """Solve a 5x5 grid world with Q-learning, plot the maps, and replay the policy.

    The reward grid starts at -1 in every cell and is then passed through
    ``construct_goal_reward`` (value 10) and ``construct_human_radius_reward``
    (radius 2, value -10). ``q_learning`` is run with ``gamma=0.99`` on an
    environment created with ``action_success_rate=0.8`` and ``render=True``.
    After the reward and value maps are shown, a ``StochasticGreedyPolicy``
    drives the environment until it reports ``done``.
    """
    np.random.seed(0)  # fixed seed for NumPy's global random generator

    size = 5
    danger_radius = 2
    goals = np.array([[size - 1, size - 1]])
    humans = np.array([[size - 1, 0]])

    # Every cell starts at -1; the goal helper is applied first and the
    # human-radius helper second, exactly in that order.
    base_grid = -np.ones((size, size), dtype=float)
    reward_grid = construct_human_radius_reward(
        construct_goal_reward(base_grid, goals, 10),
        humans,
        danger_radius,
        -10,
    )

    env = GridWorld(
        dimensions=(size, size),
        init_pos=(0, 0),
        goal_pos=goals,
        reward_grid=reward_grid,
        human_pos=humans,
        action_success_rate=0.8,
        render=True,
    )

    learner = q_learning(env.transition, env.reward, gamma=0.99)
    learner.run()
    rollout_policy = StochasticGreedyPolicy(
        env.action_space(), learner, env.transition)

    # Reshape the flat reward / value vectors to the grid and transpose them
    # before plotting.
    reward_map = env.reward.reshape((size, size)).T
    value_map = np.asarray(learner.V).reshape((size, size)).T
    plot_grid_map(reward_map, "Reward", cmap=plt.cm.Reds)
    plot_grid_map(value_map, "Value Function", cmap=plt.cm.Blues)
    plt.show()

    # Roll out the policy, pausing 0.2 s between steps.
    observation, _, finished, _ = env.reset()
    while not finished:
        chosen = rollout_policy.get_action(observation)
        observation, _, finished, _ = env.step(chosen)
        time.sleep(0.2)
    env.close()
def test_gridworld_value_iteration():
    """Solve a 10x10 grid world with value iteration, plot it, and replay the policy.

    The reward grid starts at zero and is passed through
    ``construct_goal_reward`` (two goal cells, value 10) and
    ``construct_human_radius_reward`` (two human cells, radius 3, value -10).
    The environment is created with ``action_success_rate=1`` and
    ``render=True``. Reward, value and policy plots are shown, then an
    ``EpsGreedyPolicy`` drives the environment until it reports ``done``.
    """
    np.random.seed(0)  # fixed seed for NumPy's global random generator

    size = 10
    danger_radius = 3
    goals = np.array([[size - 1, size - 1], [size - 1, size - 2]])
    humans = np.array([[size // 2, size // 2], [size - 1, 0]])

    # Goal helper first, human-radius helper second, on an all-zero grid.
    blank_grid = np.zeros((size, size), dtype=float)
    reward_grid = construct_human_radius_reward(
        construct_goal_reward(blank_grid, goals, 10),
        humans,
        danger_radius,
        -10,
    )

    env = GridWorld(
        dimensions=(size, size),
        init_pos=(0, 0),
        goal_pos=goals,
        reward_grid=reward_grid,
        human_pos=humans,
        action_success_rate=1,
        render=True,
    )

    # NOTE(review): no explicit run() call is made on the solver here;
    # presumably value_iteration solves on construction -- confirm.
    solver = value_iteration(env.transition, env.reward, gamma=0.99)
    rollout_policy = EpsGreedyPolicy(env.action_space(), solver)

    # Reshape the flat reward / value vectors to the grid and transpose them
    # before plotting.
    reward_map = env.reward.reshape((size, size)).T
    value_map = np.asarray(solver.V).reshape((size, size)).T
    plot_grid_map(reward_map, "Reward", cmap=plt.cm.Reds)
    plot_grid_map(value_map, "Value Function", cmap=plt.cm.Blues)
    plot_policy(
        rollout_policy, (size, size), "Policy",
        values=value_map, cmap=plt.cm.Blues)
    plt.show()

    # Roll out the policy, pausing 0.2 s between steps.
    observation, _, finished, _ = env.reset()
    while not finished:
        chosen = rollout_policy.get_action(observation)
        observation, _, finished, _ = env.step(chosen)
        time.sleep(0.2)
    env.close()
def main(cfg):
    """Train a Q-learning agent in the grid world while drawing it with pygame.

    Args:
        cfg: Configuration object indexed with ``[]``. Keys read here:
            ``cfg["agent"]["epsilon"]``, ``cfg["agent"]["epsilon_decay_rate"]``,
            ``cfg["nb_episode"]``, ``cfg["save_interval"]`` and
            ``cfg["result_dir"]``.

    Whenever ``episode % save_interval == 0`` (so starting with episode 0) the
    agent is handed to ``save_result``. pygame is shut down when the function
    exits, including when an exception escapes.
    """
    pygame.init()
    try:
        # Font used for the on-screen step counter.
        sysfont = pygame.font.SysFont(None, 40)
        screen = pygame.display.set_mode(WINDOW_SIZE)
        pygame.display.set_caption("Grid World")
        clock = pygame.time.Clock()

        # Environment and the agent's initial observation (its start position).
        grid_env = GridWorld()
        ini_state = grid_env.start_pos
        agent = QLearningAgent(
            epsilon=cfg["agent"]["epsilon"],
            epsilon_decay_rate=cfg["agent"]["epsilon_decay_rate"],
            actions=np.arange(4),
            observation=ini_state)

        nb_episode = cfg["nb_episode"]
        save_interval = cfg["save_interval"]
        result_dir = cfg["result_dir"]
        # NOTE(review): with max_step fixed at 1, each episode performs at most
        # one environment step -- confirm this is intended.
        max_step = 1
        # Per-episode total reward. NOTE(review): collected but not read again
        # inside this function.
        rewards = []
        is_end_episode = False

        for episode in range(nb_episode):
            print("episode:", episode)
            episode_reward = []  # rewards received during this episode
            step = 0
            # Truthiness instead of `is False`: an identity test against False
            # fails for non-builtin booleans such as numpy.bool_.
            while not is_end_episode and step < max_step:
                # Let pygame process its internal events so the window keeps
                # responding while the loop runs at one frame per second.
                pygame.event.pump()

                action = agent.act()
                state, reward, is_end_episode = grid_env.step(action)
                agent.observe(state, reward)
                episode_reward.append(reward)

                # Redraw the world and the step counter.
                screen.fill(BLACK)
                draw_grid_world(grid_env.map, screen)
                step_str = sysfont.render("step:{}".format(step), False, WHITE)
                screen.blit(step_str, (500, 50))
                clock.tick(1)  # limit the loop to one iteration per second
                step += 1
                pygame.display.flip()

            # Total (not average) reward of the episode.
            rewards.append(np.sum(episode_reward))
            # Reset the environment and hand the agent its new start state.
            state = grid_env.reset()
            agent.observe(state)
            is_end_episode = False
            print("step:", step)
            if episode % save_interval == 0:
                save_result([agent], episode, result_dir)
    finally:
        # Always release pygame resources, even if training raised.
        pygame.quit()
# NOTE(review): whitespace-collapsed excerpt taken from inside a training loop.
# The names `i`, `episode`, `reward_list`, `Agent` and `env` are defined outside
# this view and the original indentation is lost, so nesting is not recoverable.
# Visible flow:
#   * read env.state and its one-hot encoding, start episode_reward at 0;
#   * inside `while True:` ask Agent.act for an action (passing the one-hot
#     observation and env.A[observation]), step the environment, store the
#     transition with Agent.record, and add the reward to episode_reward;
#   * under `if done:` the episode reward is appended to reward_list;
#     env.reset() and Agent.learn() follow (presumably in the same branch);
#   * when episode % 100 == 0 a line is printed with the mean of the last 10
#     entries of reward_list;
#   * the evaluation part walks every state in range(env.state_space), skips
#     states contained in env.w, calls Agent.act with test_mode=True and scores
#     1 if the action is among the non-zero indices of env.Optimal[test_state],
#     else 0; `fidelity` is the mean of those scores.
# NOTE(review): no `break` is visible for the `while True:` loop; the exit is
# presumably in a part of the file that was cut off -- confirm.
observation = env.state observation_oh = env.one_hot(env.state) episode_reward = 0 while True: action = Agent.act(observation_oh, env.A[observation]) state, reward, done = env.step(action) state_oh = env.one_hot(state) Agent.record(observation_oh, action, reward, state_oh, done) observation = state observation_oh = state_oh episode_reward += reward if done: reward_list.append(episode_reward) env.reset() Agent.learn() if episode % 100 == 0: print('Reward for agent %d, episode %d, is %f' % (i, episode, np.mean(reward_list[-10::]))) f_list = [] for test_state in range(env.state_space): if test_state in env.w: continue else: test_state_oh = env.one_hot(test_state) action = Agent.act(test_state_oh, env.A[test_state], test_mode=True) if action in np.where(env.Optimal[test_state])[0]: f_list.append(1) else: f_list.append(0) fidelity = np.mean(f_list)
# NOTE(review): whitespace-collapsed excerpt of a TensorFlow training script.
# `sess`, `saver`, `init_op`, `path`, `env`, `main_DQN`, `update_target` and the
# hyper-parameters are defined outside this view. The excerpt stops right
# after the "Save the experience to episode buffer" comment, and because the
# line breaks are lost, everything after the first `#` is currently comment
# text to the interpreter.
# Visible flow:
#   * when load_model is set, look up the checkpoint state under `path` and
#     restore it into the session;
#   * run init_op, call update_target(update_target_ops_2, sess) and create a
#     global ExperienceBuffer;
#   * for each of num_episodes + 1 episodes: create an episode buffer, reset
#     the environment and reshape the observation to a flat vector of 21168
#     values;
#   * step while steps_in_episode < max_episode_length, choosing
#     np.random.randint(0, 4) (an integer 0..3) when a uniform draw is below
#     random_threshold or while steps < pre_train_steps, and otherwise
#     evaluating main_DQN.action in the session on the current observation.
# NOTE(review): sess.run(init_op) appears after saver.restore(...); if init_op
# is a variable initializer it would overwrite the restored weights -- confirm.
if load_model == True: print('Loading Model...') ckpt = tf.train.get_checkpoint_state(path) saver.restore(sess,ckpt.model_checkpoint_path) sess.run(init_op) #Set the target network to be equal to the main network. update_target(update_target_ops_2, sess) global_experience_buffer = ExperienceBuffer() #create list to contain total rewards per episode total_reward_list = [] steps = 0 for i in range(num_episodes + 1): episode_buffer = ExperienceBuffer() observation = env.reset() observation = np.reshape(observation, [21168]) done = False total_reward_in_episode = 0 steps_in_episode = 0 while steps_in_episode < max_episode_length: # accumulate samples steps_in_episode += 1 steps += 1 if np.random.rand(1) < random_threshold or steps < pre_train_steps: action = np.random.randint(0, 4) else: action = sess.run(main_DQN.action, feed_dict={main_DQN.scalar_input: [observation]})[0] new_observation, reward, done = env.step(action) new_observation = np.reshape(new_observation, [21168]) #Save the experience to episode buffer
# Q-learning on the grid world: train for a fixed number of episodes, replay
# one more episode while printing each state and action, then plot the reward
# collected in every training episode.

# Environment, initial observation (the agent's start position), policy, agent.
grid_env = GridWorld()
ini_state = grid_env.start_pos
policy = EpsGreedyQPolicy(epsilon=.01)
agent = QLearningAgent(
    actions=np.arange(4),
    observation=ini_state,
    policy=policy,
)

nb_episode = 100  # number of training episodes
rewards = []      # total reward of each training episode
is_goal = False   # set by grid_env.step(); True ends the episode

for episode in range(nb_episode):
    episode_reward = []  # rewards received during this episode
    while not is_goal:
        action = agent.act()
        state, reward, is_goal = grid_env.step(action)
        agent.observe(state, reward)
        episode_reward.append(reward)
    # Store the episode's total (not average) reward.
    rewards.append(np.sum(episode_reward))
    # Reset the environment and put the agent back on the returned state.
    state = grid_env.reset()
    agent.observe(state)
    is_goal = False

# Test run (greedy actions).
# NOTE(review): the attribute is spelled "traning"; confirm it matches the name
# QLearningAgent actually reads, otherwise this only creates a new attribute.
agent.traning = False
while not is_goal:
    print("(y, x):{}".format(state))
    action = agent.act()
    print(action)
    state, reward, is_goal = grid_env.step(action)
    agent.observe(state, reward)

# Plot the per-episode reward.
plt.plot(np.arange(nb_episode), rewards)
plt.xlabel("episode")