def test_monitor_filename():
    with helpers.tempdir() as temp:
        env = gym.make('CartPole-v0')
        env = Monitor(env, directory=temp)
        env.close()

        manifests = glob.glob(os.path.join(temp, '*.manifest.*'))
        assert len(manifests) == 1
def test_video_callable_false_does_not_record():
    with helpers.tempdir() as temp:
        env = gym.make('CartPole-v0')
        env = Monitor(env, temp, video_callable=False)
        env.reset()
        env.close()
        results = monitoring.load_results(temp)
        assert len(results['videos']) == 0
def test_video_callable_records_videos():
    with helpers.tempdir() as temp:
        env = gym.make('CartPole-v0')
        env = Monitor(env, temp)
        env.reset()
        env.close()
        results = monitoring.load_results(temp)
        assert len(results['videos']) == 1, "Videos: {}".format(results['videos'])
def test_semisuper_succeeds():
    """Regression test. Ensure that this can write."""
    with helpers.tempdir() as temp:
        env = gym.make('SemisuperPendulumDecay-v0')
        env = Monitor(env, temp)
        env.reset()
        env.step(env.action_space.sample())
        env.close()
class GymEnvironment(Environment):

    def __init__(self, env_id, directory=None, force=True, monitor_video=0):
        super(GymEnvironment, self).__init__(env_id=env_id)
        self._env = gym.make(env_id)
        if directory:
            if monitor_video == 0:
                video_callable = False
            else:
                video_callable = (lambda x: x % monitor_video == 0)
            self._env = Monitor(self._env, directory,
                                video_callable=video_callable, force=force)

    def __str__(self):
        return 'OpenAIGym({})'.format(self._env_id)

    def close(self):
        if not self._closed:
            self._env.close()
            self._closed = True

    def reset(self, return_spec=True):
        self._reset()
        state = self._env.reset()
        if return_spec:
            return EnvSpec(action=None, state=None, reward=0, done=False,
                           next_state=state)
        return state

    def step(self, action, state, return_spec=True):
        self._step()
        if isinstance(action, (list, np.ndarray)):
            if isinstance(self._env.action_space, Discrete) or isinstance(action, (list, np.ndarray)):
                action = action[0]
        if isinstance(self._env.action_space, Box) and not isinstance(action, (list, np.ndarray)):
            action = list(action)
        next_state, reward, done, _ = self._env.step(action)
        if return_spec:
            return EnvSpec(action=action, state=state, reward=reward,
                           done=done, next_state=next_state)
        return next_state, reward, done

    @property
    def num_states(self):
        return self._env.observation_space.shape[0]

    @property
    def num_actions(self):
        if isinstance(self._env.action_space, Box):
            return self._env.action_space.shape[0]
        else:
            return self._env.action_space.n

    @property
    def is_continuous(self):
        return not isinstance(self._env.action_space, Discrete)
def cart_pole_with_qlearning():
    from gym.wrappers import Monitor
    env = gym.make('CartPole-v0')
    experiment_filename = './cartpole-experiment-1'
    env = Monitor(env, experiment_filename, force=True)
    observation = env.reset()

    goal_average_steps = 195
    max_number_of_steps = 200
    number_of_iterations_to_average = 100

    number_of_features = env.observation_space.shape[0]
    last_time_steps = np.ndarray(0)

    cart_position_bins = pd.cut([-2.4, 2.4], bins=10, retbins=True)[1][1:-1]
    pole_angle_bins = pd.cut([-2, 2], bins=10, retbins=True)[1][1:-1]
    cart_velocity_bins = pd.cut([-1, 1], bins=10, retbins=True)[1][1:-1]
    angle_rate_bins = pd.cut([-3.5, 3.5], bins=10, retbins=True)[1][1:-1]

    learner = QLearner(state_discretization=Binning([[-2.4, 2.4], [-2, 2], [-1., 1], [-3.5, 3.5]], [10] * 4),
                       discrete_actions=[i for i in range(env.action_space.n)],
                       alpha=0.2,
                       gamma=1,
                       random_action_rate=0.5,
                       random_action_decay_rate=0.99)

    for episode in range(50000):
        action = learner.set_initial_state(observation)

        for step in range(max_number_of_steps - 1):
            observation, reward, done, info = env.step(action)

            if done:
                reward = -200
                observation = env.reset()

            action = learner.move(observation, reward)

            if done:
                last_time_steps = np.append(last_time_steps, [int(step + 1)])
                if len(last_time_steps) > number_of_iterations_to_average:
                    last_time_steps = np.delete(last_time_steps, 0)
                break

        if last_time_steps.mean() > goal_average_steps:
            print("Goal reached!")
            print("Episodes before solve: ", episode + 1)
            print(u"Best 100-episode performance {} {} {}".format(last_time_steps.max(),
                                                                  chr(177),  # plus-minus sign
                                                                  last_time_steps.std()))
            break

    env.close()
def test_write_upon_reset_false():
    with helpers.tempdir() as temp:
        env = gym.make('CartPole-v0')
        env = Monitor(env, directory=temp, video_callable=False, write_upon_reset=False)
        env.reset()

        files = glob.glob(os.path.join(temp, '*'))
        assert not files, "Files: {}".format(files)

        env.close()
        files = glob.glob(os.path.join(temp, '*'))
        assert len(files) > 0
def evaluate(self, n_games=1, save_path="./records", use_monitor=True,
             record_video=True, verbose=True, t_max=100000):
    """Plays an entire game start to end, records the logs (and possibly mp4 video),
    and returns the reward.

    :param save_path: where to save the report
    :param record_video: if True, records mp4 video
    :return: total reward (scalar)
    """
    env = self.make_env()

    if not use_monitor and record_video:
        raise ValueError("Cannot record video without the gym monitor. "
                         "If you still want video, set use_monitor to True")

    if record_video:
        env = Monitor(env, save_path, force=True)
    elif use_monitor:
        env = Monitor(env, save_path, video_callable=lambda i: False, force=True)

    game_rewards = []
    for _ in range(n_games):
        # initial observation
        observation = env.reset()
        # initial memory
        prev_memories = [np.zeros((1,) + tuple(mem.output_shape[1:]),
                                  dtype=get_layer_dtype(mem))
                         for mem in self.agent.agent_states]

        t = 0
        total_reward = 0
        while True:
            res = self.agent_step(self.preprocess_observation(observation)[None, ...],
                                  *prev_memories)
            action, new_memories = res[0], res[1:]

            observation, reward, done, info = env.step(action[0])

            total_reward += reward
            prev_memories = new_memories

            if done or t >= t_max:
                if verbose:
                    print("Episode finished after {} timesteps with reward={}".format(
                        t + 1, total_reward))
                break
            t += 1

        game_rewards.append(total_reward)

    env.close()
    del env
    return game_rewards
def test_write_upon_reset_true():
    with helpers.tempdir() as temp:
        env = gym.make('CartPole-v0')

        # TODO: Fix Cartpole to not configure itself automatically
        # assert not env._configured
        env = Monitor(env, directory=temp, video_callable=False, write_upon_reset=True)
        env.configure()
        env.reset()

        files = glob.glob(os.path.join(temp, '*'))
        assert len(files) > 0, "Files: {}".format(files)

        env.close()
        files = glob.glob(os.path.join(temp, '*'))
        assert len(files) > 0
def test_steps_limit_restart():
    with helpers.tempdir() as temp:
        env = gym.make('test.StepsLimitCartpole-v0')
        env = Monitor(env, temp, video_callable=False)
        env.reset()

        # Episode has started
        _, _, done, info = env.step(env.action_space.sample())
        assert done == False

        # Limit reached, now we get a done signal and the env resets itself
        _, _, done, info = env.step(env.action_space.sample())
        assert done == True
        assert env.episode_id == 1

        env.close()
def test_only_complete_episodes_written():
    with helpers.tempdir() as temp:
        env = gym.make('CartPole-v0')
        env = Monitor(env, temp, video_callable=False)
        env.reset()
        d = False
        while not d:
            _, _, d, _ = env.step(env.action_space.sample())

        env.reset()
        env.step(env.action_space.sample())

        env.close()

        # Only 1 episode should be written
        results = monitoring.load_results(temp)
        assert len(results['episode_lengths']) == 1, \
            "Found {} episodes written; expecting 1".format(len(results['episode_lengths']))
def test_env_reuse():
    with helpers.tempdir() as temp:
        env = gym.make('Autoreset-v0')
        env = Monitor(env, temp)

        env.reset()

        _, _, done, _ = env.step(None)
        assert not done
        _, _, done, _ = env.step(None)
        assert done

        _, _, done, _ = env.step(None)
        assert not done
        _, _, done, _ = env.step(None)
        assert done

        env.close()
def test_no_monitor_reset_unless_done():
    def assert_reset_raises(env):
        errored = False
        try:
            env.reset()
        except error.Error:
            errored = True
        assert errored, "Env allowed a reset when it shouldn't have"

    with helpers.tempdir() as temp:
        # Make sure we can reset as we please without monitor
        env = gym.make('CartPole-v0')
        env.reset()
        env.step(env.action_space.sample())
        env.step(env.action_space.sample())
        env.reset()

        # can reset once as soon as we start
        env = Monitor(env, temp, video_callable=False)
        env.reset()

        # can reset multiple times in a row
        env.reset()
        env.reset()

        env.step(env.action_space.sample())
        env.step(env.action_space.sample())
        assert_reset_raises(env)

        # should allow resets after the episode is done
        d = False
        while not d:
            _, _, d, _ = env.step(env.action_space.sample())

        env.reset()
        env.reset()

        env.step(env.action_space.sample())
        assert_reset_raises(env)

        env.close()
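The Monitor tests above rely on a `helpers.tempdir()` context manager that is not shown in this section. A minimal sketch, assuming it only needs to hand each test a fresh scratch directory and clean it up afterwards (the real helper may differ):

```python
import contextlib
import shutil
import tempfile


@contextlib.contextmanager
def tempdir():
    """Yield a fresh temporary directory and remove it afterwards.

    Hypothetical stand-in for the helpers.tempdir used by the tests above.
    """
    path = tempfile.mkdtemp()
    try:
        yield path
    finally:
        shutil.rmtree(path, ignore_errors=True)
```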
def run(seed, episodes, batch_size, gamma, inverting_gradients, initial_memory_threshold, replay_memory_size, epsilon_steps, tau_actor, tau_actor_param, use_ornstein_noise, learning_rate_actor, learning_rate_actor_param, title, epsilon_final, clip_grad, beta, scale_actions, split, indexed, zero_index_gradients, action_input_layer, evaluation_episodes, multipass, weighted, average, random_weighted, update_ratio, save_freq, save_dir, layers): if save_freq > 0 and save_dir: save_dir = os.path.join(save_dir, title + "{}".format(str(seed))) os.makedirs(save_dir, exist_ok=True) env = make_env(scale_actions) dir = os.path.join(save_dir, title) env = Monitor(env, directory=os.path.join(dir, str(seed)), video_callable=False, write_upon_reset=False, force=True) # env.seed(seed) # doesn't work on HFO np.random.seed(seed) from agents.pdqn_nstep import PDQNNStepAgent from agents.pdqn_split_nstep import PDQNNStepSplitAgent from agents.pdqn_multipass_nstep import MultiPassPDQNNStepAgent assert not (split and multipass) agent_class = PDQNNStepAgent if split: agent_class = PDQNNStepSplitAgent elif multipass: agent_class = MultiPassPDQNNStepAgent assert action_input_layer >= 0 if action_input_layer > 0: assert split agent = agent_class( env.observation_space, env.action_space, actor_kwargs={ "hidden_layers": layers, 'action_input_layer': action_input_layer, 'activation': "leaky_relu", 'output_layer_init_std': 0.01 }, actor_param_kwargs={ "hidden_layers": layers, 'activation': "leaky_relu", 'output_layer_init_std': 0.01 }, batch_size=batch_size, learning_rate_actor=learning_rate_actor, # 0.0001 learning_rate_actor_param=learning_rate_actor_param, # 0.001 epsilon_steps=epsilon_steps, epsilon_final=epsilon_final, gamma=gamma, # 0.99 tau_actor=tau_actor, tau_actor_param=tau_actor_param, clip_grad=clip_grad, beta=beta, indexed=indexed, weighted=weighted, average=average, random_weighted=random_weighted, initial_memory_threshold=initial_memory_threshold, use_ornstein_noise=use_ornstein_noise, replay_memory_size=replay_memory_size, inverting_gradients=inverting_gradients, zero_index_gradients=zero_index_gradients, seed=seed) print(agent) network_trainable_parameters = sum(p.numel() for p in agent.actor.parameters() if p.requires_grad) network_trainable_parameters += sum( p.numel() for p in agent.actor_param.parameters() if p.requires_grad) print("Total Trainable Network Parameters: %d" % network_trainable_parameters) max_steps = 15000 total_reward = 0. returns = [] timesteps = [] goals = [] start_time_train = time.time() for i in range(episodes): if save_freq > 0 and save_dir and i % save_freq == 0: agent.save_models(os.path.join(save_dir, str(i))) info = {'status': "NOT_SET"} state = env.reset() state = np.array(state, dtype=np.float32, copy=False) act, act_param, all_action_parameters = agent.act(state) action = pad_action(act, act_param) episode_reward = 0. 
agent.start_episode() transitions = [] for j in range(max_steps): next_state, reward, terminal, info = env.step(action) next_state = np.array(next_state, dtype=np.float32, copy=False) # status = info['status'] # if status != 'IN_GAME': # print(status) next_act, next_act_param, next_all_action_parameters = agent.act( next_state) next_action = pad_action(next_act, next_act_param) transitions.append([ state, np.concatenate(([act], all_action_parameters.data)).ravel(), reward, next_state, np.concatenate( ([next_act], next_all_action_parameters.data)).ravel(), terminal ]) act, act_param, all_action_parameters = next_act, next_act_param, next_all_action_parameters action = next_action state = next_state episode_reward += reward #env.render() if terminal: break agent.end_episode() # calculate n-step returns n_step_returns = compute_n_step_returns(transitions, gamma) for t, nsr in zip(transitions, n_step_returns): t.append(nsr) agent.replay_memory.append(state=t[0], action=t[1], reward=t[2], next_state=t[3], next_action=t[4], terminal=t[5], time_steps=None, n_step_return=nsr) n_updates = int(update_ratio * j) for _ in range(n_updates): agent._optimize_td_loss() returns.append(episode_reward) timesteps.append(j) goals.append(info['status'] == 'GOAL') total_reward += episode_reward if i % 100 == 0: print('{0:5s} R:{1:.4f} r100:{2:.4f}'.format( str(i + 1), total_reward / (i + 1), np.array(returns[-100:]).mean())) end_time_train = time.time() if save_freq > 0 and save_dir: agent.save_models(os.path.join(save_dir, str(i))) returns = env.get_episode_rewards() np.save(os.path.join(dir, title + "{}".format(str(seed))), np.column_stack((returns, timesteps, goals))) if evaluation_episodes > 0: print("Evaluating agent over {} episodes".format(evaluation_episodes)) agent.epsilon_final = 0. agent.epsilon = 0. agent.noise = None agent.actor.eval() agent.actor_param.eval() start_time_eval = time.time() evaluation_results = evaluate( env, agent, evaluation_episodes) # returns, timesteps, goals end_time_eval = time.time() print("Ave. evaluation return =", sum(evaluation_results[:, 0]) / evaluation_results.shape[0]) print("Ave. timesteps =", sum(evaluation_results[:, 1]) / evaluation_results.shape[0]) goal_timesteps = evaluation_results[:, 1][evaluation_results[:, 2] == 1] if len(goal_timesteps) > 0: print("Ave. timesteps per goal =", sum(goal_timesteps) / evaluation_results.shape[0]) else: print("Ave. timesteps per goal =", sum(goal_timesteps) / evaluation_results.shape[0]) print("Ave. goal prob. =", sum(evaluation_results[:, 2]) / evaluation_results.shape[0]) np.save(os.path.join(dir, title + "{}e".format(str(seed))), evaluation_results) print("Evaluation time: %.2f seconds" % (end_time_eval - start_time_eval)) print("Training time: %.2f seconds" % (end_time_train - start_time_train)) print(agent) env.close()
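The episode loop above appends an n-step return to each stored transition via `compute_n_step_returns(transitions, gamma)`, which is not defined in this excerpt. A minimal sketch, assuming each transition stores the reward at index 2 (as built in the loop above) and that the n-step return here is simply the discounted sum of rewards from a step to the end of the collected trajectory:

```python
import numpy as np


def compute_n_step_returns(transitions, gamma):
    """Discounted return from each step to the end of the collected trajectory.

    Sketch only: assumes transition[2] is the reward and that the list covers a
    single episode (or truncated rollout), matching how `transitions` is built above.
    """
    returns = np.zeros(len(transitions), dtype=np.float32)
    running = 0.0
    for i in reversed(range(len(transitions))):
        running = transitions[i][2] + gamma * running
        returns[i] = running
    return returns
```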
def train(env, estimator, target_network, num_episodes=1000, replay_memory_size=500000, frame_history_len=4, save_every=10, update_every=1000, discount=0.99, epsilon_start=1.0, epsilon_end=0.1, epsilon_decay_steps=50000, batch_size=32, record_every=50): """ deep q learning algorithm :param env: openAI gym environment :param estimator: estimator model for predicting values :param target_network: :param num_episodes: number of episodes to run :param replay_memory_size: size of replay memory :param update_every: copy params from estimator into target estimator after this many steps :param discount: discount factor :param epsilon_start: starting epsilon value :param epsilon_end: ending epsilon value :param batch_size: 32 lol :param record_every: record a video every N episodes :return: """ # Load previous state here replay_memory = ReplayBuffer(replay_memory_size, frame_history_len) # epsilon delay schedule epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps) loss_func = nn.SmoothL1Loss() optimizer = torch.optim.Adam(estimator.parameters()) policy = make_epsilon_greedy_policy(estimator, len(VALID_ACTIONS)) env = Monitor(env, directory="./monitor", resume=True, video_callable=lambda count: count % record_every == 0) total_t = 0 pbar = tqdm(range(num_episodes)) pbar.set_description("ep: %d, er: %.2f, et: %d, tt: %d, exp_size: %d" % (0, 0.0, 0, 0, 0)) for ep in pbar: state = env.reset() # 210 x 160 x 4 state = process_state(state) # 94 x 94 x 3 episode_loss = 0 episode_reward = 0 episode_t = 0 for t in itertools.count(): epsilon = epsilons[min(total_t, epsilon_decay_steps - 1)] last_idx = replay_memory.store_frame(state) recent_observations = replay_memory.encode_recent_observation() action_dist = policy(recent_observations, epsilon) action_dist = action_dist.squeeze(0).numpy() action = np.random.choice(np.arange(len(action_dist)), p=action_dist) next_state, reward, done, _ = env.step(action) reward = max(-1.0, min(reward, 1.0)) episode_reward += reward replay_memory.store_effect(last_idx, action, reward, done) next_state = process_state(next_state) state = next_state if replay_memory.can_sample(batch_size): obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_memory.sample(batch_size) obs_batch = torch.from_numpy(obs_batch).float() obs_batch = obs_batch.to(device) act_batch = torch.from_numpy(act_batch).long().to(device) / 255.0 rew_batch = torch.from_numpy(rew_batch).to(device) next_obs_batch = torch.from_numpy(next_obs_batch).float().to(device) / 255.0 not_done_mask = torch.from_numpy(1 - done_mask).float().to(device) state_values = estimator(obs_batch) # b x VALID_ACTIONS state_action_values = torch.gather(state_values, 1, act_batch.unsqueeze(1)) # b x 1 next_state_values_max = target_network(next_obs_batch).detach().max(dim=1)[0] next_state_values = not_done_mask * next_state_values_max expected_q_value = (next_state_values * discount) + rew_batch # bellman_error = expected_q_value - state_action_values.squeeze(1) # # clipped_bellman_error = bellman_error.clamp(-1, 1) # # d_error = clipped_bellman_error * -1.0 loss = loss_func(state_action_values, expected_q_value.unsqueeze(1)) episode_loss += loss # state_action_values.backward(d_error.data.unsqueeze(1)) optimizer.zero_grad() loss.backward() optimizer.step() if done: break total_t += 1 episode_t = t pbar.set_description("ep: %d, el: %.5f, er: %.2f, et: %d, tt: %d, exp_size: %d" % (ep, episode_loss, episode_reward, episode_t, total_t, replay_memory.num_in_buffer)) if total_t % update_every == 0: 
copy_model_params(estimator, target_network) # save checkpoint if ep % save_every == 0: torch.save(estimator.state_dict(), './checkpoints/checkpoint.pt') env.close()
class PongAgent: def __init__(self, mode=None): self.env = wrap_dqn(gym.make('PongDeterministic-v4')) if mode == 'test': self.env = Monitor(self.env, './video', force=True, video_callable=lambda episode_id: True) self.num_actions = self.env.action_space.n self.dqn = DQN(self.num_actions) self.target_dqn = DQN(self.num_actions) if use_gpu: self.dqn.cuda() self.target_dqn.cuda() self.buffer = ReplayMemory(1000) self.gamma = 0.99 self.mse_loss = nn.MSELoss() self.optim = optim.RMSprop(self.dqn.parameters(), lr=0.01) self.out_dir = './model' self.writer = SummaryWriter() if not os.path.exists(self.out_dir): os.makedirs(self.out_dir) def to_var(self, x): x_var = Variable(x) if use_gpu: x_var = x_var.cuda() return x_var def predict_q_values(self, states): states = self.to_var(torch.from_numpy(states).float()) actions = self.dqn(states) return actions def predict_q_target_values(self, states): states = self.to_var(torch.from_numpy(states).float()) actions = self.target_dqn(states) return actions def select_action(self, state, epsilon): choice = np.random.choice([0, 1], p=(epsilon, (1 - epsilon))) if choice == 0: return np.random.choice(range(self.num_actions)) else: state = np.expand_dims(state, 0) actions = self.predict_q_values(state) return np.argmax(actions.data.cpu().numpy()) def update(self, predicts, targets, actions): targets = self.to_var( torch.unsqueeze(torch.from_numpy(targets).float(), -1)) actions = self.to_var( torch.unsqueeze(torch.from_numpy(actions).long(), -1)) affected_values = torch.gather(predicts, 1, actions) loss = self.mse_loss(affected_values, targets) self.optim.zero_grad() loss.backward() self.optim.step() def get_epsilon(self, total_steps, max_epsilon_steps, epsilon_start, epsilon_final): return max(epsilon_final, epsilon_start - total_steps / max_epsilon_steps) def sync_target_network(self): primary_params = list(self.dqn.parameters()) target_params = list(self.target_dqn.parameters()) for i in range(0, len(primary_params)): target_params[i].data[:] = primary_params[i].data[:] def calculate_q_targets(self, next_states, rewards, dones): dones_mask = (dones == 1) predicted_q_target_values = self.predict_q_target_values(next_states) next_max_q_values = np.max( predicted_q_target_values.data.cpu().numpy(), axis=1) next_max_q_values[ dones_mask] = 0 # no next max Q values if the game is over q_targets = rewards + self.gamma * next_max_q_values return q_targets def save_final_model(self): filename = '{}/final_model.pth'.format(self.out_dir) torch.save(self.dqn.state_dict(), filename) def save_model_during_training(self, episode): filename = '{}/current_model_{}.pth'.format(self.out_dir, episode) torch.save(self.dqn.state_dict(), filename) def load_model(self, filename): self.dqn.load_state_dict(torch.load(filename)) self.sync_target_network() def play(self, episodes): for i in range(1, episodes + 1): done = False state = self.env.reset() while not done: action = self.select_action( state, 0) # force to choose an action from the network state, reward, done, _ = self.env.step(action) # self.env.render() def close_env(self): self.env.close() def train(self, replay_buffer_fill_len, batch_size, episodes, max_epsilon_steps, epsilon_start, epsilon_final, sync_target_net_freq): start_time = time.time() print('Start training at: ' + time.asctime(time.localtime(start_time))) total_steps = 0 running_episode_reward = 0 # populate replay memory print('Populating replay buffer... 
') print('\n') state = self.env.reset() for i in range(replay_buffer_fill_len): action = self.select_action(state, 1) # force to choose a random action next_state, reward, done, _ = self.env.step(action) self.buffer.add(state, action, reward, done, next_state) state = next_state if done: self.env.reset() print('replay buffer populated with {} transitions, start training...'. format(self.buffer.count())) print('\n') # main loop - iterate over episodes for i in range(1, episodes + 1): # reset the environment done = False state = self.env.reset() # reset spisode reward and length episode_reward = 0 episode_length = 0 # play until it is possible while not done: # synchronize target network with estimation network in required frequence if (total_steps % sync_target_net_freq) == 0: self.sync_target_network() # calculate epsilon and select greedy action epsilon = self.get_epsilon(total_steps, max_epsilon_steps, epsilon_start, epsilon_final) action = self.select_action(state, epsilon) # execute action in the environment next_state, reward, done, _ = self.env.step(action) # store transition in replay memory self.buffer.add(state, action, reward, done, next_state) # sample random minibatch of transitions s_batch, a_batch, r_batch, d_batch, next_s_batch = self.buffer.sample( batch_size) # predict Q value using the estimation network predicted_values = self.predict_q_values(s_batch) # estimate Q value using the target network q_targets = self.calculate_q_targets(next_s_batch, r_batch, d_batch) # update weights in the estimation network self.update(predicted_values, q_targets, a_batch) # set the state for the next action selction and update counters and reward state = next_state total_steps += 1 episode_length += 1 episode_reward += reward self.writer.add_scalar('data/reward', reward, total_steps) self.writer.add_scalar('data/epsilon', epsilon, total_steps) running_episode_reward = running_episode_reward * 0.9 + 0.1 * episode_reward self.writer.add_scalar('data/episode_reward', episode_reward, i) self.writer.add_scalar('data/running_episode_reward', running_episode_reward, i) if (i % 30) == 0: print('global step: {}'.format(total_steps)) print('episode: {}'.format(i)) print('running reward: {}'.format( round(running_episode_reward, 2))) print('current epsilon: {}'.format(round(epsilon, 2))) print('episode_length: {}'.format(episode_length)) print('episode reward: {}'.format(episode_reward)) curr_time = time.time() print('current time: ' + time.asctime(time.localtime(curr_time))) print('running for: ' + str(datetime.timedelta(seconds=curr_time - start_time))) print('saving model after {} episodes...'.format(i)) print('\n') self.save_model_during_training(i) print('Finish training at: ' + time.asctime(time.localtime(start_time)))
class Environment(object):

    def __init__(self, game, record=False, width=84, height=84, seed=0):
        self.game = gym.make(game)
        self.game.seed(seed)

        if record:
            self.game = Monitor(self.game, './video', force=True)

        self.width = width
        self.height = height
        self._toTensor = T.Compose([T.ToPILImage(), T.ToTensor()])
        gym_ple

    def play_sample(self, mode: str = 'human'):
        observation = self.game.reset()

        while True:
            screen = self.game.render(mode=mode)
            if mode == 'rgb_array':
                screen = self.preprocess(screen)
            action = self.game.action_space.sample()
            observation, reward, done, info = self.game.step(action)
            if done:
                break
        self.game.close()

    def preprocess(self, screen):
        preprocessed: np.array = cv2.resize(screen, (self.height, self.width))  # resize to 84 x 84
        preprocessed = np.dot(preprocessed[..., :3], [0.299, 0.587, 0.114])  # convert to grayscale
        # preprocessed: np.array = preprocessed.transpose((2, 0, 1))  # reorder to (C, W, H)
        preprocessed: np.array = preprocessed.astype('float32') / 255.
        return preprocessed

    def init(self):
        """
        @return observation
        """
        return self.game.reset()

    def get_screen(self):
        screen = self.game.render('rgb_array')
        screen = self.preprocess(screen)
        return screen

    def step(self, action: int):
        observation, reward, done, info = self.game.step(action)
        return observation, reward, done, info

    def reset(self):
        """
        :return: observation array
        """
        observation = self.game.reset()
        observation = self.preprocess(observation)
        return observation

    @property
    def action_space(self):
        return self.game.action_space.n
def train(self, function, discount_factor, actor_learning_rate, learning_env, testing_env, total_observations, test_interval, total_number_of_testing_episodes, gym_training_logs_directory_path, gym_testing_logs_directory_path, actor_weights_saving_interval): """Train the agent function -- An instance of the class implemnenting the actor critic model discount_factor -- Quantifies how much the agent cares about future rewards while learning. Often referred to as gamma in the literature. actor_learning_rate -- Learning rate of the actor learning_env -- A Gym environment (wrapped or vanilla) used for learning testing_env -- A Gym environment (wrapped or vanilla) used for testing. total_observations -- Train till this observation number test_interval -- Test after this many observations total_number_of_testing_episodes -- Number of episodes to test the agent in every testing round gym_training_logs_directory_path - Directory to save automatic Gym logs related to training. We save the rewards for every learning episode. gym_testing_logs_directory_path - Directory to save automatic Gym logs related to testing. We save a video for the first test episode. actor_weights_saving_interval -- Save the actor weights (i.e. write to file) after this many episodes. """ # This keeps track of the number of observations made so far observation_number = 0 # Keep count of the episode number episode_number = 1 # The learning env should always be wrapped by the Monitor provided # by Gym. This lets us automatically save the rewards for every episode. learning_env = Monitor( learning_env, gym_training_logs_directory_path, # Don't want video recording during training, only during testing video_callable=False, # Write after every reset so that we don't lose data for # prematurely interrupted training runs write_upon_reset=True, ) while observation_number < total_observations: # initialize environment observation = learning_env.reset() total_rewards_obtained_in_this_episode = 0 action = function.get_action(observation) # Execute an episode while True: # take the action determined by the Softmax policy next_observation, reward, done, info = learning_env.step( action) # Determine the next action. This is required for the # model update. next_action = function.get_action(next_observation) # Update the model function.update_model( discount_factor, actor_learning_rate, observation, action, reward, done, next_observation, next_action, ) observation = next_observation action = next_action observation_number += 1 # Test the current performance after every test_interval if observation_number % test_interval == 0: # The testing env is also wrapped by a Monitor so that we # can take automatic videos during testing. We will take a # video for the very first testing episode. video_callable = lambda count: count == 0 # Since the environment is closed after every testing round, # the video for different testing round will end up having # the same name! To differentiate the videos, we pass # an unique uid parameter. 
monitored_testing_env = Monitor( testing_env, gym_testing_logs_directory_path, video_callable=video_callable, resume=True, uid=observation_number / test_interval) # Run the test average_reward = self.test( monitored_testing_env, total_number_of_episodes= total_number_of_testing_episodes, function=function, render=False) print( "[{0}] Episode number : {1}, Observation number : {2} " "Average reward (100 eps) : {3}".format( datetime.datetime.now(), episode_number, observation_number, average_reward)) total_rewards_obtained_in_this_episode += reward if done: episode_number += 1 # Save table to file at regular intervals if episode_number % actor_weights_saving_interval == 0: function.save() break print("[{0}] Episode number : {1}, Obervation number: {2}, " "Reward in this episode : {3}".format( datetime.datetime.now(), episode_number - 1, observation_number, total_rewards_obtained_in_this_episode, )) learning_env.close() # There's a bug in the Gym Monitor. The Monitor's close method does not # close the wrapped environment. This makes the script exit with an # error if the environment is being rendered at some point. To make # this error go away, we have to close the unwrapped testing # environment. The learning environment is not being rendered, so we # don't need to bother about that. testing_env.env.close()
def main_test(id):
    config(id)
    env = gym.make(id)
    env = env.unwrapped
    dqn = MyDQN(env)
    if id == 'CartPole-v0':
        T = 20000
    else:
        T = 2000
    count = 0
    train_result = []
    train_loss = []

    for i in range(2000):
        observation = env.reset()
        for j in range(T):
            action = dqn.action(observation, i)
            new_observation, reward, done, info = env.step(action)
            if id == 'CartPole-v0':
                r1 = (env.x_threshold - abs(new_observation[0])) / env.x_threshold - 0.8
                r2 = (env.theta_threshold_radians - abs(new_observation[2])) / env.theta_threshold_radians - 0.5
                reward = r1 + r2
                '''if j<2000: reward=-200'''
            elif done:
                reward = 100
            dqn.perceive(observation, action, reward, new_observation, done)
            observation = new_observation
            if done == False and j != T - 1:
                continue
            train_result.append(j)
            if id == 'CartPole-v0':
                if done or j == T - 1:
                    if j > 5000:
                        count += 1
                    else:
                        count = 0
                    print(i, j)
                    break
            elif id == 'MountainCar-v0':
                print(i, j)
                if done and j < 300:
                    count += 1
                else:
                    count = 0
                break
            else:
                print(i, j)
                if done and j < 300:
                    count += 1
                else:
                    count = 0
                break
        train_loss.append(dqn.get_loss() / train_result[-1])
        if id == 'CartPole-v0' and count >= 5:
            break
        if id != 'CartPole-v0' and count >= 200:
            break

    print(train_loss)
    print(train_result)
    plt.plot(train_loss)
    plt.xlabel("round")
    plt.ylabel("loss")
    plt.show()

    if id != 'CartPole-v0':
        train_result = -np.array(train_result)
    plt.plot(train_result)
    plt.xlabel("round")
    plt.ylabel("reward")
    plt.show()

    if RECORD:
        env = Monitor(env, './cartpole-experiment-0201', force=True)
    observation = env.reset()
    for j in range(T):
        # env.render()
        action = dqn.best_action(observation)
        observation, reward, done, info = env.step(action)
    env.close()

    result = []
    for i in range(200):
        observation = env.reset()
        for j in range(T):
            # env.render()
            action = dqn.best_action(observation)
            observation, reward, done, info = env.step(action)
            if done or j == T - 1:
                print("test", j + 1)
                result.append(j + 1)
                break
    result = np.array(result)
    if id != 'CartPole-v0':
        result = -result
    plt.plot(result)
    plt.xlabel("round")
    plt.ylabel("reward")
    plt.show()
    print("mean", np.mean(result))
    print("var", np.std(result))
    print("len", len(result))
class Environment(object): def __init__(self, game, record=False, width=64, height=64, seed=0,additional=12,activateAdditional=True,videoFolder="0/"): self.activateAdditional=activateAdditional self.game = gym.make(game) self.game.seed(seed) print("record",record) if record: print("record") self.game = Monitor(self.game, f'./videos/{videoFolder}', force=True) self.width = width self.height = height self.additional=additional self._toTensor = T.Compose([T.ToPILImage(), T.ToTensor()]) gym_ple def play_sample(self, mode: str = 'human'): observation = self.game.reset() while True: screen = self.game.render(mode=mode) if mode == 'rgb_array': screen,_ = self.preprocess(screen) action = self.game.action_space.sample() observation, reward, done, info = self.game.step(action) if done: break self.game.close() """ looks if the snake is able to go into the desired direction. If not, another guess is calculated. """ def desiredmove(self,head, nextstep): if head==0: if nextstep is not 3: return nextstep else: return 1 elif head==1: if nextstep is not 2: return nextstep else: return 0 elif head==2: if nextstep is not 1: return nextstep else: return 3 elif head==3: if nextstep is not 0: return nextstep else: return 2 else: #time.sleep(5) return -1 #returns position of food for snake def getfood(self,observation): for i in range(observation.shape[0]): #i is north->south for j in range(observation.shape[1]): #j is west->east if observation[i,j]==100: #if not food_found: #food_found=True #print("position of food is to the south from {} to {} and to the east from {} to {}".format(i,i+5,j,j+5)) foodlocation=(i,i+5,j,j+5) return foodlocation print("Error, couldn't find food!!!") print(np.array(observation)) return (-4,-4,-4,-4) """ gets position of the snakes head and the direction its facing """ def getsnake(self,observation): direction_counter=0 headlocation=(-20,-20)#-1 direction=-1 dirx=observation.shape[0]-1 diry=observation.shape[1]-1 for i in range(0,dirx+1): #i is north->south for j in range(0,diry+1): #j is west->east #0 means direction snake is looking at if observation[i,j]==0: direction_counter+=1 #only takes the second of the three direction pixel. If only one is visible that one is taken. if observation[max(0,i-2),j]==255: headlocation=(max(0,i-2),j) direction=3 if direction_counter > 1 : return (headlocation,direction) elif observation[min(i+2,dirx),j]==255: headlocation=(min(i+2,dirx),j) direction=0 if direction_counter > 1 : return (headlocation,direction) elif observation[i,max(0,j-2)]==255: headlocation=(i,max(0,j-2)) direction=2 if direction_counter > 1 : return (headlocation,direction) elif observation[i,min(j+2,diry)]==255: headlocation=(i,min(j+2,diry)) direction=1 if direction_counter > 1 : return (headlocation,direction) #else: #There was a problem, that sometimes the direction is displayed 3 pixel ahead of the snake instead of 2 pixel. 
elif observation[max(0,i-3),j]==255: headlocation=(max(0,i-3),j) direction=3 if direction_counter > 1 : return (headlocation,direction) elif observation[min(i+3,dirx),j]==255: headlocation=(min(i+3,dirx),j) direction=0 if direction_counter > 1 : return (headlocation,direction) elif observation[i,max(0,j-3)]==255: headlocation=(i,max(0,j-3)) direction=2 if direction_counter > 1 : return (headlocation,direction) elif observation[i,min(j+3,diry)]==255: headlocation=(i,min(j+3,diry)) direction=1 if direction_counter > 1 : return (headlocation,direction) else: pass #print("no direction could be found!!!!") return headlocation,direction """ returns the positions of food, snake head, distance and guesses for the next action """ def printpositions(self,observation): #init the output values for case they are not found in image food_found=False direction_counter=0 foodlocation=(-4,-4,-4,-4) headlocation=(-3,-3) direction=-1 foodlocation=self.getfood(observation) headlocation,direction=self.getsnake(observation) #get direction: newdirection=-1 #generates biggest distance to food distsouth1=foodlocation[0]-headlocation[0] distsouth2=foodlocation[1]-headlocation[0] disteast1=foodlocation[2]-headlocation[1] disteast2=foodlocation[3]-headlocation[1] distances=np.array([distsouth1,distsouth2,disteast1,disteast2]) bigindex=np.argmax(np.absolute(distances)) #goes into direction of biggest distance pos=distances[bigindex] if bigindex <=1: if pos>=0: newdirection=self.desiredmove(direction,3) else: newdirection=self.desiredmove(direction,0) else: if pos>=0: newdirection=self.desiredmove(direction,2) else: newdirection=self.desiredmove(direction,1) #didn't manage to add all inputs in an array in some nice way additional_states=np.append(np.append(np.append(foodlocation,headlocation),[direction,newdirection]),distances) #if no food is found, something is wrong -> set everything to -2 if (foodlocation[0]<0): additional_states.fill(-2) print("didn't found anything") return additional_states """ used to put background at 0 and have equally distant values """ def revaluescreen(self,element): if element<12: return 85 if element<50: return 0 if element<150: return 170 else: return 255 """ shapes the image in a way, we want (1 channel) and calculates additional features if required """ def preprocess(self, screen): #print(np.shape(screen)) preprocessed = screen[:, :,1] #print(preprocessed) #print(preprocessed) #print([[self.revaluescreen(e) for e in row] for row in screen[:, :,1]]) #for i in preprocessed #print(np.shape(preprocessed)) if self.activateAdditional: additional_states=self.printpositions(preprocessed)#np.zeros(12)# else: additional_states=np.zeros(12) preprocessed=np.array([[self.revaluescreen(e) for e in row] for row in screen[:, :,1]]) #print (preprocessed) preprocessed: np.array = preprocessed.astype('float32') / 255. return preprocessed,additional_states def init(self): """ @return observation """ return self.game.reset() def get_screen(self): screen = self.game.render('rgb_array') screen,additional_states = self.preprocess(screen) #print("additional_states",additional_states) return screen,additional_states def step(self, action: int): observation, reward, done, info = self.game.step(action) return observation, reward, done, info def reset(self): """ :return: observation array """ observation = self.game.reset() observation = self.preprocess(observation) return observation @property def action_space(self): return self.game.action_space.n
class Q: def __init__(self, max_ep, folder, constraints, goal): # constraints: the boundaries for s in Q^g(s, a) self.max_episode = max_ep self.log = Logger(folder) self.env = gym.make('PendulumGoal-v0') self.constraints = constraints self.start = None self.goal = goal self.q = np.zeros((ROWS, COLS, DPTS)) self.alpha_rec = np.ones((ROWS, COLS, DPTS)) self.lr = 1.0 self.gamma = 0.8 self.epsilon = 0.5 def __env_init_fn(self): self.env.reset() th = random.uniform(self.constraints[0][0], self.constraints[0][1]) thdot = random.uniform(self.constraints[1][0], self.constraints[1][1]) self.env.setup([np.array([th, thdot]), self.goal]) return np.array([math.cos(th), math.sin(th), thdot]) def __indexer(self, state, action=None): # state=(cos(th), sin(th), thdot) c, s, dot = state[0], state[1], state[2] theta = math.acos(math.fabs(c)) if c >= 0 and s >= 0: theta = theta elif c < 0 and s > 0: theta = PI - theta elif c < 0 and s < 0: theta = PI + theta else: theta = 2.0 * PI - theta row = int(round(math.degrees(theta)) / 360 * (ROWS-1)) col = int(round((dot + 8.0) / 16.0 * (COLS - 1))) if action is None: return (row, col) else: dph = 1 if action == 1.0 else 0 #int(round((action + 1.0)))#/ 4.0 * (DPTS -1))) return (row, col, dph) def __argmin(self, state): idx = self.__indexer(state) return np.argmin(self.q[idx[0], idx[1], :]) def __min(self, state): idx = self.__indexer(state) return np.min(self.q[idx[0], idx[1], :]) def __update(self, state, action, cost, next_state, done, next_action=None): idx = self.__indexer(state, action) self.lr = 1.0 / self.alpha_rec[idx] idx_next = self.__indexer(next_state, next_action) self.q[idx] = ((1 - self.lr) * self.q[idx] + self.lr * (cost + (0.0 if done else self.gamma * self.q[idx_next]))) self.alpha_rec[idx] += 1 def __select_act(self, state, explorefree=False): epsilon = (0.1 if explorefree else self.epsilon) best_act = self.__argmin(state) dice = random.randint(1, 1000) if dice > epsilon * 1000: return best_act * 2.0 - 1.0 else: idcs = [idx for idx in range(0, DPTS)] idcs.remove(best_act) return random.choice(idcs) * 2.0 - 1.0 def __decrease_eps(self): self.epsilon = max(self.epsilon - 0.001, 0.1) def __decrease_lr(self): self.lr = max(self.lr - 0.0002, 0.01) def run(self): self.log.log(Mode.STDOUT, 'Learning started.') total_cost = 0 episode = 0 cntr = 0 done = True state = None min_cost = 0 action = [0.0] avg_scs = 0 while episode < self.max_episode: if done or cntr % 250 == 0: episode += 1 state = self.__env_init_fn() action = self.__select_act(state) self.log.log(Mode.TRAIN_RET_F, [cntr, episode, total_cost]) total_cost = 0 nzeros = np.count_nonzero(self.q) self.__decrease_eps() self.__decrease_lr() if episode % 10 == 0: rtn, scs = self.evaluate() avg_scs += scs self.log.log(Mode.STD_LOG, str(episode) + ': ' + str(rtn) + ' cost: ' + str(min_cost) + ' scs: ' + str(scs) + ' nonzeros: ' + str(nzeros)) self.log.log(Mode.RET_F, [cntr, episode, rtn]) next_state, cost, done, inf = self.env.step([action]) next_action = self.__select_act(next_state) min_cost = inf['min_cost'] self.__update(state, action, cost, next_state, done, next_action) action = next_action cntr += 1 total_cost += cost state = next_state self.log.log(Mode.NUMPY, self.q) self.log.log(Mode.STDOUT, 'Learning finished. 
matrix was saved.') return avg_scs * 10 / self.max_episode def evaluate(self, video=False, show=False): total_cost = 0 cntr = 0 episode = 0 done = True success = -1 state = None orig_env = self.env if video: self.env = Monitor(orig_env, self.log.video_folder()) while episode < 20: if done: success += 1 if done or cntr > 2000: if video: print(str(episode) + ' ' + str(total_cost)) episode += 1 state = self.__env_init_fn() cntr = 0 action = self.__select_act(state, explorefree=True) state, cost, done, _ = self.env.step([action]) if show: self.env.render() total_cost += cost cntr += 1 self.env = orig_env return total_cost / 20.0, success def load_from_file(self): self.q = self.log.deserialize_numpy() def __del__(self): self.env.close()
def train( self, function, discount_factor, start_epsilon, end_epsilon, observation_number_when_epsilon_annealing_ends, replay_memory_size, learning_env, testing_env, total_observations, observation_number_when_training_starts, test_interval, total_number_of_testing_episodes, gym_training_logs_directory_path, gym_testing_logs_directory_path, ): """Train the agent function -- An instance of the class implemnenting the function approximation model discount_factor -- Quantifies how much the agent cares about future rewards while learning. Often referred to as gamma in the literature. start_epsilon -- Probability of random actions at start of training end_epsilon -- Probability of random actions at end of training observation_number_when_epsilon_annealing_ends -- Epsilon annealing ends when observation_number reaches this value replay_memory_size -- Replay memory contains at most this many experiences at any given point in training. When replay memory grows bigger than this, some of the earlier experiences are thrown away. learning_env -- A Gym environment (wrapped or vanilla) used for learning testing_env -- A Gym environment (wrapped or vanilla) used for testing. total_observations -- Train till this observation number observation_number_when_training_starts -- Training starts when observation_number reaches this value test_interval -- Test after this many observations total_number_of_testing_episodes -- Number of episodes to test the agent in every testing round gym_training_logs_directory_path - Directory to save automatic Gym logs related to training. We save the rewards for every learning episode. gym_testing_logs_directory_path - Directory to save automatic Gym logs related to testing. We save a video for the first test episode. weight_saving_interval -- Save the model weights (i.e. write to file) after this many observations. """ # This keeps track of the number of observations made so far observation_number = 0 # Keep count of the episode number episode_number = 1 # The learning env should always be wrapped by the Monitor provided # by Gym. This lets us automatically save the rewards for every episode. 
learning_env = Monitor( learning_env, gym_training_logs_directory_path, # Don't want video recording during training, only during testing video_callable=False, # Write after every reset so that we don't lose data for # prematurely interrupted training runs write_upon_reset=True, ) # To ensure that the replay memory never exceeds replay_memory_size, # we use a deque, which is a last in, first out type of data structure replay_memory = deque([], maxlen=replay_memory_size) while observation_number < total_observations: # initialize environment observation = learning_env.reset() total_rewards_obtained_in_this_episode = 0 # Execute an episode while True: # Determine the action according to the epsilon greedy policy epsilon = self.get_epsilon( start_epsilon, end_epsilon, observation_number, observation_number_when_epsilon_annealing_ends, ) action = function.get_action(observation, epsilon) # take the action determined by the epsilon-greedy policy next_observation, reward, done, info = learning_env.step( action) # Store experience in replay memory transition = { "observation": observation, "action": action, "reward": reward, "done": done, "next_observation": next_observation } replay_memory.append(transition) # # Update the model if observation_number > observation_number_when_training_starts: function.update_model(discount_factor, replay_memory) observation = next_observation observation_number += 1 # Test the current performance after every test_interval if observation_number % test_interval == 0: # The testing env is also wrapped by a Monitor so that we # can take automatic videos during testing. We will take a # video for the very first testing episode. video_callable = lambda count: count == 0 # Since the environment is closed after every testing round, # the video for different testing round will end up having # the same name! To differentiate the videos, we pass # an unique uid parameter. monitored_testing_env = Monitor( testing_env, gym_testing_logs_directory_path, video_callable=video_callable, resume=True, uid=observation_number / test_interval) # Run the test average_reward = self.test( monitored_testing_env, total_number_of_episodes= total_number_of_testing_episodes, function=function, epsilon=0.05, render=False) print( "[{0}] Episode number : {1}, Observation number : {2} " "Average reward (100 eps) : {3}".format( datetime.datetime.now(), episode_number, observation_number, average_reward)) total_rewards_obtained_in_this_episode += reward if done: episode_number += 1 break print("[{0}] Episode number : {1}, Obervation number: {2}, " "Reward in this episode : {3}, Epsilon : {4}".format( datetime.datetime.now(), episode_number - 1, observation_number, total_rewards_obtained_in_this_episode, epsilon)) learning_env.close() # There's a bug in the Gym Monitor. The Monitor's close method does not # close the wrapped environment. This makes the script exit with an # error if the environment is being rendered at some point. To make # this error go away, we have to close the unwrapped testing # environment. The learning environment is not being rendered, so we # don't need to bother about that. testing_env.env.close()
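The training loop above calls `self.get_epsilon(...)` to drive the epsilon-greedy policy, but the method is not shown in this excerpt. A minimal linear-annealing sketch consistent with the parameter names used in that call; this is an assumption about the actual implementation:

```python
def get_epsilon(self, start_epsilon, end_epsilon, observation_number,
                observation_number_when_epsilon_annealing_ends):
    """Linearly anneal epsilon from start_epsilon to end_epsilon, then hold.

    Sketch only; the real method may use a different schedule.
    """
    fraction = min(1.0, observation_number /
                   float(observation_number_when_epsilon_annealing_ends))
    return start_epsilon + fraction * (end_epsilon - start_epsilon)
```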
def deep_q_learning(sess, env, q_estimator, target_estimator, state_processor, num_episodes, experiment_dir, replay_memory_size=500000, replay_memory_init_size=50000, update_target_estimator_every=10000, discount_factor=0.99, epsilon_start=1.0, epsilon_end=0.1, epsilon_decay_steps=500000, batch_size=32, record_video_every=50): """ Q-Learning algorithm for off-policy TD control using Function Approximation. Finds the optimal greedy policy while following an epsilon-greedy policy. Args: sess: Tensorflow Session object env: OpenAI environment q_estimator: Estimator object used for the q values target_estimator: Estimator object used for the targets state_processor: A StateProcessor object num_episodes: Number of episodes to run for experiment_dir: Directory to save Tensorflow summaries in replay_memory_size: Size of the replay memory replay_memory_init_size: Number of random experiences to sampel when initializing the reply memory. update_target_estimator_every: Copy parameters from the Q estimator to the target estimator every N steps discount_factor: Gamma discount factor epsilon_start: Chance to sample a random action when taking an action. Epsilon is decayed over time and this is the start value epsilon_end: The final minimum value of epsilon after decaying is done epsilon_decay_steps: Number of steps to decay epsilon over batch_size: Size of batches to sample from the replay memory record_video_every: Record a video every N episodes Returns: An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards. """ Transition = namedtuple( "Transition", ["state", "action", "reward", "next_state", "done"]) # The replay memory replay_memory = [] # Keeps track of useful statistics stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes), episode_rewards=np.zeros(num_episodes)) # Create directories for checkpoints and summaries checkpoint_dir = os.path.join(experiment_dir, "checkpoints") checkpoint_path = os.path.join(checkpoint_dir, "model") monitor_path = os.path.join(experiment_dir, "monitor") if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) if not os.path.exists(monitor_path): os.makedirs(monitor_path) saver = tf.train.Saver() # Load a previous checkpoint if we find one latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir) if latest_checkpoint: print("Loading model checkpoint {}...\n".format(latest_checkpoint)) saver.restore(sess, latest_checkpoint) # Get the current time step total_t = sess.run(tf.contrib.framework.get_global_step()) # The epsilon decay schedule epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps) # The policy we're following policy = make_epsilon_greedy_policy(q_estimator, len(VALID_ACTIONS)) # Populate the replay memory with initial experience print("Populating replay memory...") ############################################################ # YOUR CODE 1 : Populate replay memory! 
# Hints : use function "populate_replay_buffer" # about 1 line code replay_memory = populate_replay_buffer(sess, env, state_processor, replay_memory_init_size, VALID_ACTIONS, Transition, policy) # Record videos env = Monitor(env, directory=monitor_path, resume=True, video_callable=lambda count: count % record_video_every == 0) for i_episode in range(num_episodes): # Save the current checkpoint saver.save(tf.get_default_session(), checkpoint_path) # Reset the environment state = env.reset() state = state_process(sess, state_processor, state) loss = None # One step in the environment for t in itertools.count(): # Epsilon for this time step epsilon = epsilons[min(total_t, epsilon_decay_steps - 1)] # Add epsilon to Tensorboard episode_summary = tf.Summary() episode_summary.value.add(simple_value=epsilon, tag="epsilon") q_estimator.summary_writer.add_summary(episode_summary, total_t) ########################################################### # YOUR CODE 2: Target network update # Hints : use function "copy_model_parameters" if total_t % update_target_estimator_every == 0: copy_model_parameters(sess, q_estimator, target_estimator) # Print out which step we're on, useful for debugging. print("\rStep {} ({}) @ Episode {}/{}, loss: {} Memory Len {} ". format(t, total_t, i_episode + 1, num_episodes, loss, len(replay_memory)), end="") sys.stdout.flush() ############################################## # YOUR CODE 3: Take a step in the environment # Hints 1 : be careful to use function 'state_process' to deal the RPG state # Hints 2 : you can see function "populate_replay_buffer()" # for detail about how to TAKE A STEP # about 2 or 3 line codes action = np.random.choice(len(VALID_ACTIONS), p=policy(sess, state, epsilon)) next_state, reward, done, _ = env.step(VALID_ACTIONS[action]) next_state = state_processor.process(sess, next_state) next_state = np.append(state[:, :, 1:], np.expand_dims(next_state, 2), axis=2) # If our replay memory is full, pop the first element if len(replay_memory) == replay_memory_size: replay_memory.pop(0) ############################# # YOUR CODE 4: Save transition to replay memory # Hints : you can see function 'populate_replay_buffer' for detail # about 1 or 2 line codes replay_memory.append( Transition(state, action, reward, next_state, done)) # Update statistics stats.episode_rewards[i_episode] += reward stats.episode_lengths[i_episode] = t ######################################################### # YOUR CODE 5: Sample a minibatch from the replay memory, # hints: can use function "random.sample( replay_memory, batch_size )" to get minibatch # about 1-2 lines codes minibatch = np.array(random.sample(replay_memory, batch_size)) state_batch, action_batch, reward_batch, next_state_batch, done_batch = map( np.array, zip(*minibatch)) ########################################################### # YOUR CODE 6: use minibatch sample to calculate q values and targets # Hints 1 : use function 'q_estimator.predict' to get q values # Hints 2 : use function 'target_estimator.predict' to get targets values # remember 'targets = reward + gamma * max q( s, a' )' # about 2 line codes q = target_estimator.predict(sess, next_state_batch) done_batch = np.invert(done_batch).astype(float) targets = reward_batch + done_batch * discount_factor * np.max( q, axis=1) ################################################ # YOUR CODE 7: Perform gradient descent update # hints : use function 'q_estimator.update' # about 1 line code loss = q_estimator.update(sess, state_batch, np.array(action_batch), targets) 
if done: break state = next_state total_t += 1 # Add summaries to tensorboard episode_summary = tf.Summary() episode_summary.value.add( simple_value=stats.episode_rewards[i_episode], node_name="episode_reward", tag="episode_reward") episode_summary.value.add( simple_value=stats.episode_lengths[i_episode], node_name="episode_length", tag="episode_length") q_estimator.summary_writer.add_summary(episode_summary, total_t) q_estimator.summary_writer.flush() yield total_t, plotting.EpisodeStats( episode_lengths=stats.episode_lengths[:i_episode + 1], episode_rewards=stats.episode_rewards[:i_episode + 1]) env.close() return stats
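`deep_q_learning` above is written as a generator that yields `(total_t, EpisodeStats)` after every episode. A minimal usage sketch, assuming the TF1 session, estimators, state processor, and environment are constructed elsewhere as in the surrounding code (the experiment directory name here is illustrative only):

```python
# Sketch only: q_estimator, target_estimator, state_processor and env are
# assumed to be built as in the surrounding code.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for total_t, stats in deep_q_learning(sess,
                                          env,
                                          q_estimator=q_estimator,
                                          target_estimator=target_estimator,
                                          state_processor=state_processor,
                                          num_episodes=1000,
                                          experiment_dir="./experiments/atari"):
        print("\nEpisode reward so far:", stats.episode_rewards[-1])
```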
class Runner: """Define runner for reinforcement learning update on Grid World.""" def __init__( self, policy: str, env_config: Dict[str, Any], agent_config: Dict[str, Any], n_episode: int = 10, max_length: int = 100, save_video: bool = True, save_dir: str = "./result", ) -> None: """Initialize.""" # learning settings self.env = CustomLavaEnv(**env_config) self.policy = policy self.agent = self.get_agent(policy, self.env, agent_config) self.env_config = env_config self.agent_config = agent_config self.n_episode = n_episode self.max_length = max_length self.episode_lengths: List[int] = [] self.episode_rewards: List[float] = [] # log setttings self.save_video = save_video if not os.path.exists(save_dir): os.mkdir(save_dir) self.save_dir = os.path.join(save_dir, policy) if not os.path.exists(self.save_dir): os.mkdir(self.save_dir) log_file = os.path.join(self.save_dir, "log.txt") # Delete old log if it exists. if os.path.exists(log_file): os.remove(log_file) logging.basicConfig( filename=log_file, format="%(asctime)s [%(levelname)s] %(message)s", datefmt="%Y/%m/%d %I:%M:%S", level=logging.INFO, ) logging.getLogger().addHandler(logging.StreamHandler()) logging.info("POLICY: %s", self.policy) logging.info("ENV CONFIG: %s", self.env_config) logging.info("AGENT CONFIG: %s", self.agent_config) logging.info("N_EPISODES: %s", self.n_episode) logging.info("MAX_LENGTH: %s", self.max_length) logging.info("") def get_agent(self, policy: str, env: gym.Env, agent_config: Dict[str, Any]) -> AbstractAgent: """Get agent with policy.""" if policy == "random": agent = RandomPolicy(env) elif policy == "pi": agent = PolicyIteration(env, agent_config) elif policy == "vi": agent = ValueIteration(env, agent_config) elif policy == "mc": agent = MCAgent(env, agent_config) elif policy == "sarsa": agent = SARSAAgent(env, agent_config) elif policy == "qlearning": agent = QLearningAgent(env, agent_config) else: raise NotImplementedError return agent def run(self) -> None: """Start Agent-Environment Interaction and update policy.""" for episode in range(self.n_episode): # Rewrap the env every episode to save all episode video. 
if self.save_video: save_dir = os.path.join(self.save_dir, "{}_{}".format(self.policy, episode)) self.env = Monitor(self.env, save_dir, force=True) if self.policy in ["pi", "vi"]: self.run_dynamic_programming() elif self.policy in ["mc"]: self.run_monte_carlo() elif self.policy in ["sarsa", "qlearning"]: self.run_temporal_difference() elif self.policy in ["random"]: self.run_random_agent() else: raise NotImplementedError logging.info( "Episode: %d | Episode Length: %d | Episode reward: %d", episode, self.episode_lengths[-1], self.episode_rewards[-1], ) self.agent.print_results() self.env.close() def run_random_agent(self) -> None: """Run single episode for random agent.""" done = False episode_reward = 0 obs = self.env.reset() for _step in range(self.max_length): cur_state = obs["pos"] action = self.agent.get_action(cur_state) obs, reward, done, _ = self.env.step(action) episode_reward += reward if done: break self.episode_lengths.append(_step + 1) self.episode_rewards.append(episode_reward) def run_dynamic_programming(self) -> None: """Run single episode and update DP methods.""" done = False episode_reward = 0 obs = self.env.reset() for _step in range(self.max_length): cur_state = obs["pos"] action = self.agent.get_action(cur_state) obs, reward, done, _ = self.env.step(action) episode_reward += reward if done: break update_info = dict(agent_pos=obs["pos"], reward_grid=obs["reward_grid"]) self.agent.update_policy(update_info) self.episode_lengths.append(_step + 1) self.episode_rewards.append(episode_reward) def run_monte_carlo(self) -> None: """Run single episode and update MC methods.""" done = False episode_reward = 0 obs = self.env.reset() transactions = [] for _step in range(self.max_length): cur_state = obs["pos"] action = self.agent.get_action(cur_state) obs, reward, done, _ = self.env.step(action) next_state = obs["pos"] transactions.append((cur_state, action, next_state, reward)) episode_reward += reward if done: break update_info = dict(transactions=transactions) self.agent.update_policy(update_info) self.episode_lengths.append(_step + 1) self.episode_rewards.append(episode_reward) def run_temporal_difference(self) -> None: """Run single episode and update TD methods.""" done = False episode_reward = 0 obs = self.env.reset() for _step in range(self.max_length): cur_state = obs["pos"] action = self.agent.get_action(cur_state) obs, reward, done, _ = self.env.step(action) next_state = obs["pos"] update_info = dict( state=cur_state, action=action, reward=reward, next_state=next_state, ) self.agent.update_policy(update_info) episode_reward += reward if done: break self.episode_lengths.append(_step + 1) self.episode_rewards.append(episode_reward)
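# A minimal sketch of a tabular Q-learning `update_policy` compatible with the
# `update_info` dict built in `run_temporal_difference` above. This is not the
# repo's QLearningAgent; the attribute names (q_table, alpha, gamma, n_actions)
# are assumptions, and the state (the agent position) is assumed hashable.
from collections import defaultdict

import numpy as np


class TabularQLearningSketch:
    def __init__(self, n_actions, alpha=0.1, gamma=0.99):
        self.alpha = alpha
        self.gamma = gamma
        self.q_table = defaultdict(lambda: np.zeros(n_actions))

    def update_policy(self, update_info):
        s = update_info["state"]
        a = update_info["action"]
        r = update_info["reward"]
        s_next = update_info["next_state"]
        # One-step Q-learning target: r + gamma * max_a' Q(s', a')
        td_target = r + self.gamma * np.max(self.q_table[s_next])
        self.q_table[s][a] += self.alpha * (td_target - self.q_table[s][a])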
def runExperiment(experiment): import numpy as np from collections import deque import gym from gym.wrappers import Monitor from agents.dqnagent import DQNAgent #environment parameters gym_id = experiment["gym_id"] sliding_window_solved_score = experiment["sliding_window_solved_score"] sliding_window_score_length = experiment["sliding_window_score_length"] env_seed = experiment["env_seed"] max_episode = experiment["max_episode"] env = gym.make(gym_id) env = Monitor(env, "{}".format(experiment['folder']), video_callable=False, force=True, resume=False, write_upon_reset=False, uid=None, mode=None) env.seed(env_seed) scores = deque() sw_scores = deque(maxlen=sliding_window_score_length) #agent parameters agent_seed = experiment["agent_seed"] activation = experiment["activation"] min_episode_before_acting = experiment["min_episode_before_acting"] epsilon = experiment["epsilon"] nb_hidden_layer = experiment["nb_hidden_layer"] layer_width = experiment["layer_width"] memory_length = experiment["memory_length"] batch_size = experiment["batch_size"] agent = DQNAgent(env.observation_space, env.action_space, agent_seed, min_episode_before_acting, activation, epsilon, layer_width, nb_hidden_layer, memory_length) current_episode = 0 while (len(sw_scores) == 0 or np.mean(sw_scores) < sliding_window_solved_score) and ( max_episode == None or current_episode < max_episode): state = env.reset() current_episode += 1 reward = 0 done = False episode_score = 0 while not done: action = agent.act(state) next_state, reward, done, _ = env.step(action) agent.remember(state, action, reward, next_state, done) state = next_state episode_score += reward # if np.mean(sw_scores) > 180: # env.render() if done: scores.append(episode_score) sw_scores.append(episode_score) print( 'Episode: {}\t Epsilon: {}\t Score: {}\t Mean Score:{}\t Sliding Score:{}\t' .format(current_episode, agent.epsilon, episode_score, np.mean(scores), np.mean(sw_scores))) agent.train(batch_size=batch_size) env.close()
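# Illustrative call to runExperiment above. The keys below are exactly the ones
# the function reads from the experiment dict; every value is a made-up example
# for a hypothetical CartPole run, not a recommended setting.
example_experiment = {
    "folder": "./results/cartpole_dqn",
    "gym_id": "CartPole-v1",
    "sliding_window_solved_score": 475,
    "sliding_window_score_length": 100,
    "env_seed": 0,
    "max_episode": 500,  # or None to run until the sliding window is solved
    "agent_seed": 0,
    "activation": "relu",
    "min_episode_before_acting": 10,
    "epsilon": 1.0,
    "nb_hidden_layer": 2,
    "layer_width": 64,
    "memory_length": 100000,
    "batch_size": 64,
}
# runExperiment(example_experiment)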
def train(self): """ The training loop. This runs a single episode. TODO: Implement the following as desired: 1. Storing transitions to the ReplayMemory 2. Updating the network at some frequency 3. Backing up the current parameters to a reference, target network """ # Initially perform some random walks and make a replay memory env = Monitor(self.env, self.monitor_dir, force=True) for episode in range(1000): done = False obs = env.reset() while not done: action = random.randint(0, env.action_space.n - 1) encoded_action = np.zeros(env.action_space.n) encoded_action[action] = 1 next_obs, reward, done, info = env.step(action) self.replay_memory.append( (obs, encoded_action, reward, next_obs, done)) obs = next_obs if len(self.replay_memory) > self.min_replay_size: self.replay_memory.popleft() sum_of_reward = 0 for episode in range(self.max_episode + 1): obs = env.reset() if self.change_eps == True: if self.eps_start > self.eps_mid: self.eps_start -= ( initial_eps - mid_eps ) / self.eps_decay # Linear decay of exploration elif self.eps_start > self.eps_end: self.eps_start -= (mid_eps - final_eps) / self.eps_decay_later else: self.eps_start = initial_eps done = False # self.num_steps += 1 # self.num_episodes += 1 reward_per_episode = 0 while not done: action = self.select_action(obs) next_obs, reward, done, info = env.step(action) self.train_network(obs, action, reward, next_obs, done) obs = next_obs reward_per_episode += reward sum_of_reward += reward_per_episode if episode % 100 == 0: avg_reward = sum_of_reward / 100 self.saver.save(self.sess, 'models/dqn-model') print("Avg reward: %s" % avg_reward) if avg_reward > 210: test_reward = 0 for i in range(self.sanity_epochs): obs = env.reset() done = False while not done: action = self.select_action(obs, evaluation_mode=True) next_obs, reward, done, info = env.step(action) test_reward += reward avg_test_reward = test_reward / self.sanity_epochs print("Episode: ", episode, "Average test reward: ", avg_test_reward) if avg_test_reward >= 200: env.close() break sum_of_reward = 0
monitor_dir = '/tmp/cartpole_exp1'
monitor = Monitor(env, monitor_dir, force=True)

sess.run(tf.global_variables_initializer())

b_obs, b_acts, b_rews = [], [], []

# for _ in range(eparams['ep_per_batch']):
obs, acts, rews = policy_rollout(env)
print('Episode steps: {}'.format(len(obs)))

b_obs.extend(obs)
b_acts.extend(acts)

advantages_rew = process_rewards(rews)
b_rews.extend(advantages_rew)

#29, 36
np.array(b_obs).shape
np.array(b_acts).shape
np.array(b_rews).shape

b_rews = (b_rews - np.mean(b_rews)) / (np.std(b_rews) + 1e-10)

train_step(b_obs, b_acts, b_rews)

monitor.close()
sess.close()
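# `process_rewards` is used above but not defined in this snippet. A common
# choice in a vanilla policy-gradient setup is the discounted reward-to-go;
# the sketch below is an assumption about its behavior (including the gamma
# value), not the original implementation.
import numpy as np


def process_rewards(rews, gamma=0.99):
    """Return the discounted sum of future rewards for every time step."""
    returns = np.zeros(len(rews), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(rews))):
        running = rews[t] + gamma * running
        returns[t] = running
    return returns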
def deep_q_learning(sess, env, q_estimator, target_estimator, state_processor, num_episodes, experiment_dir, replay_memory_size=500000, replay_memory_init_size=50000, update_target_estimator_every=10000, discount_factor=0.99, epsilon_start=1.0, epsilon_end=0.1, epsilon_decay_steps=500000, batch_size=32, record_video_every=50): """ Q-Learning algorithm for fff-policy TD control using Function Approximation. Finds the optimal greedy policy while following an epsilon-greedy policy. Args: sess: Tensorflow Session object env: OpenAI environment q_estimator: Estimator object used for the q values target_estimator: Estimator object used for the targets state_processor: A StateProcessor object num_episodes: Number of episodes to run for experiment_dir: Directory to save Tensorflow summaries in replay_memory_size: Size of the replay memory replay_memory_init_size: Number of random experiences to sampel when initializing the reply memory. update_target_estimator_every: Copy parameters from the Q estimator to the target estimator every N steps discount_factor: Lambda time discount factor epsilon_start: Chance to sample a random action when taking an action. Epsilon is decayed over time and this is the start value epsilon_end: The final minimum value of epsilon after decaying is done epsilon_decay_steps: Number of steps to decay epsilon over batch_size: Size of batches to sample from the replay memory record_video_every: Record a video every N episodes Returns: An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards. """ #print "q_learning starts" Transition = namedtuple( "Transition", ["state", "action", "reward", "next_state", "done"]) # The replay memory replay_memory = [] # Keeps track of useful statistics stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes), episode_rewards=np.zeros(num_episodes)) # Create directories for checkpoints and summaries checkpoint_dir = os.path.join(experiment_dir, "checkpoints") checkpoint_path = os.path.join(checkpoint_dir, "model") monitor_path = os.path.join(experiment_dir, "monitor") if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) if not os.path.exists(monitor_path): os.makedirs(monitor_path) saver = tf.train.Saver() # Load a previous checkpoint if we find one latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir) if latest_checkpoint: print("Loading model checkpoint {}...\n".format(latest_checkpoint)) saver.restore(sess, latest_checkpoint) # Get the current time step total_t = sess.run(tf.contrib.framework.get_global_step()) # The epsilon decay schedule epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps) # The policy we're following policy = make_epsilon_greedy_policy(q_estimator, len(VALID_ACTIONS)) # Populate the replay memory with initial experience print("Populating replay memory...") state = env.reset() state = state_processor.process(sess, state) state = np.stack([state] * 4, axis=2) for i in range(replay_memory_init_size): action_probs = policy(sess, state, epsilons[min(total_t, epsilon_decay_steps - 1)]) #print "action_probs is", action_probs action = np.random.choice(np.arange(len(action_probs)), p=action_probs) #print "action is", action next_state, reward, done, _ = env.step(VALID_ACTIONS[action]) next_state = state_processor.process(sess, next_state) #print "next state is", next_state.shape next_state = np.append(state[:, :, 1:], np.expand_dims(next_state, 2), axis=2) # (84,84) to (84,84,1) replay_memory.append( Transition(state, action, reward, next_state, 
done)) if done: state = env.reset() state = state_processor.process(sess, state) state = np.stack([state] * 4, axis=2) else: state = next_state # Record videos env = Monitor(env, directory=monitor_path, video_callable=lambda count: count % record_video_every == 0, resume=True) for i_episode in range(num_episodes): # Save the current checkpoint saver.save(tf.get_default_session(), checkpoint_path) # Reset the environment state = env.reset() state = state_processor.process(sess, state) state = np.stack([state] * 4, axis=2) loss = None # One step in the environment for t in itertools.count(): # Epsilon for this time step epsilon = epsilons[min(total_t, epsilon_decay_steps - 1)] # Add epsilon to Tensorboard episode_summary = tf.Summary() episode_summary.value.add(simple_value=epsilon, tag="epsilon") q_estimator.summary_writer.add_summary(episode_summary, total_t) # TODO: Maybe update the target estimator if total_t % update_target_estimator_every == 0: copy_model_parameters(sess, q_estimator, target_estimator) # Print out which step we're on, useful for debugging. #print("\rStep {} ({}) @ Episode {}/{}, loss: {}".format( # t, total_t, i_episode + 1, num_episodes, loss)) sys.stdout.flush() # Take a step in the environment # The policy we're following policy = make_epsilon_greedy_policy(target_estimator, len(VALID_ACTIONS)) action_probs = policy( sess, state, epsilons[min(total_t, epsilon_decay_steps - 1)]) action = np.random.choice(np.arange(len(action_probs)), p=action_probs) next_state, reward, done, _ = env.step(VALID_ACTIONS[action]) next_state = state_processor.process(sess, next_state) next_state = np.append(state[:, :, 1:], np.expand_dims(next_state, 2), axis=2) # If our replay memory is full, pop the first element if len(replay_memory) == replay_memory_size: replay_memory.pop(0) replay_memory.append( Transition(state, action, reward, next_state, done)) # Update statistics stats.episode_rewards[i_episode] += reward stats.episode_lengths[i_episode] = t # Sample a minibatch from the replay memory samples = random.sample(replay_memory, batch_size) states_batch, action_batch, reward_batch, next_states_batch, done_batch = map( np.array, zip(*samples)) # Calculate q values and targets q_values_next = target_estimator.predict(sess, next_states_batch) targets_batch = reward_batch + np.invert(done_batch).astype( np.float32) * discount_factor * np.amax(q_values_next, axis=1) # Perform gradient descent update states_batch = np.array(states_batch) loss = q_estimator.update(sess, states_batch, action_batch, targets_batch) if done: print("\rEpisode {}/{}, done, loss: {}".format( i_episode + 1, num_episodes, loss)) break state = next_state total_t += 1 # Add summaries to tensorboard episode_summary = tf.Summary() episode_summary.value.add( simple_value=stats.episode_rewards[i_episode], node_name="episode_reward", tag="episode_reward") episode_summary.value.add( simple_value=stats.episode_lengths[i_episode], node_name="episode_length", tag="episode_length") q_estimator.summary_writer.add_summary(episode_summary, total_t) q_estimator.summary_writer.flush() yield total_t, plotting.EpisodeStats( episode_lengths=stats.episode_lengths[:i_episode + 1], episode_rewards=stats.episode_rewards[:i_episode + 1]) env.close()
def main(argv=()): del argv # Unused. # Build an environment # Create and record episode - remove Monitor statement if recording not desired env = Monitor(gym.make('one-random-evader-v0'), './tmp/pursuit_evasion_infer_pursuer_vs_random_evader', force=True) #Reset state state = env.reset() #Initialize Agent Parameters #Get observed state space observed_state_space = env.get_observed_state_space() #Set initial state distribution initial_state_dist = [] initial_state = env.get_initial_state() for state in observed_state_space: if state == initial_state: initial_state_dist.append(1) else: initial_state_dist.append(0) #Get action space action_space = range(0, env.action_space.n) #Set action prior to uniform dist action_prior = [] for action in action_space: action_prior.append(1/len(action_space)) #Get reward function reward_function = env.get_reward_function() #Get transition function transition_function = env.get_transition_function() #Set max trajectory length max_trajectory_length = 11 #needs to be greater than shortest distance to evader for any meaningful inference #Create Agent agent = infer.DiceInferenceEngine(observed_state_space, action_space, initial_state_dist, action_prior, reward_function, transition_function, max_trajectory_length) print("\nAgent created.\n") #Set current observed state to initial state uncolored_obs = initial_state #Initialize actions list actions = [] print("\nInfering action " + str(0) + "\n") actions.append(dist.Categorical(torch.tensor(agent.next(uncolored_obs))).sample().item()) #Game Loop for t in range(0, 11): #Render env.render() #Delay to make video easier to watch #sleep(5) #Take action and get observations, rewards, termination from environment observation, reward, done, info = env.step(actions[t]) #If termination signal received, break out of loop if done: break #Pick next action based on agent's reasoning uncolored_obs = env.uncolor_board(observation) print("\nInfering action " + str(t + 1) + "\n") actions.append(dist.Categorical(torch.tensor(agent.next(uncolored_obs))).sample().item()) env.close()
class Environment(object): def __init__(self, game='FlappyBird-v0', record=False, width=84, height=84, seed=0): self.game = gym.make(game) self.game.seed(seed) if record: self.game = Monitor(self.game, './video', force=True) self.width = width self.height = height def play_sample(self, mode: str = 'human'): observation = self.game.reset() while True: screen = self.game.render(mode=mode) if mode == 'rgb_array': screen = self.preprocess(screen) action = self.game.action_space.sample() observation, reward, done, info = self.game.step(action) if done: break self.game.close() def preprocess(self, screen): # preprocessed = screen[:400, 40:] preprocessed = screen preprocessed = transform.resize(preprocessed, (self.height, self.width)) preprocessed = color.rgb2gray(preprocessed) preprocessed = preprocessed.astype('float32') / 255. return preprocessed def init(self): return self.game.reset() def get_screen(self): screen = self.game.render('rgb_array') screen = self.preprocess(screen) return screen def step(self, action: int): observation, reward, done, info = self.game.step(action) return observation, reward, done, info def reset(self): observation = self.game.reset() observation = self.preprocess(observation) return observation def close(self): self.game.close() @property def action_space(self): return self.game.action_space.n @property def observation_space(self): return self.game.observation_space
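# Example interaction with the Environment wrapper above, using only the
# methods it defines. 'FlappyBird-v0' is the class default and assumes the
# matching gym plugin (e.g. gym-ple) is installed and registered; any other
# registered env id works the same way.
import random

env = Environment(game='FlappyBird-v0', record=False)
obs = env.init()                      # raw reset, no preprocessing
for _ in range(100):
    frame = env.get_screen()          # preprocessed grayscale frame
    action = random.randrange(env.action_space)
    obs, reward, done, info = env.step(action)
    if done:
        obs = env.init()
env.close()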
def record_sessions(env_id, agent, n_actions):
    env = Monitor(gym.make(env_id), directory='videos', force=True)
    for _ in range(100):
        generate_agent_session(env, agent, n_actions)
    env.close()
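# `generate_agent_session` is not defined in this snippet. A minimal sketch of
# what such a rollout helper could look like, assuming the agent exposes a
# `predict_proba` method returning action probabilities; that interface and the
# t_max cap are assumptions, not taken from the original code.
import numpy as np


def generate_agent_session(env, agent, n_actions, t_max=1000):
    total_reward = 0.0
    state = env.reset()
    for _ in range(t_max):
        probs = agent.predict_proba([state])[0]        # assumed interface
        action = np.random.choice(n_actions, p=probs)
        state, reward, done, _ = env.step(action)
        total_reward += reward
        if done:
            break
    return total_reward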
def train():
    logger.configure()
    set_global_seeds(args.seed)
    directory = os.path.join(
        args.log_dir,
        '_'.join([args.env, datetime.datetime.now().strftime("%m%d%H%M")]))
    if not os.path.exists(directory):
        os.makedirs(directory)
    else:
        # Fail loudly instead of silently discarding the exception object.
        raise ValueError("The directory already exists...", directory)
    json.dump(vars(args),
              open(os.path.join(directory, 'learning_prop.json'), 'w'))

    env = make_atari(args.env)
    env = bench.Monitor(env, logger.get_dir())
    env = models.wrap_atari_dqn(env)
    nb_test_steps = args.nb_test_steps if args.nb_test_steps > 0 else None
    reload_path = args.reload_path if args.reload_path else None
    if args.record:
        env = Monitor(env, directory=directory)

    with tf.device(args.device):
        model = models.cnn_to_mlp(
            convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
            hiddens=[args.num_units] * args.num_layers,
            dueling=bool(args.dueling),
            init_mean=args.init_mean,
            init_sd=args.init_sd,
        )
        act, records = simple.learn(
            env,
            q_func=model,
            lr=args.learning_rate,
            lr_decay_factor=args.lr_decay_factor,
            lr_growth_factor=args.lr_growth_factor,
            max_timesteps=args.nb_train_steps,
            buffer_size=args.buffer_size,
            exploration_fraction=args.eps_fraction,
            exploration_final_eps=args.eps_min,
            train_freq=4,
            print_freq=1000,
            checkpoint_freq=int(args.nb_train_steps / 10),
            learning_starts=args.nb_warmup_steps,
            target_network_update_freq=args.target_update_freq,
            gamma=0.99,
            prioritized_replay=bool(args.prioritized),
            prioritized_replay_alpha=args.prioritized_replay_alpha,
            epoch_steps=args.nb_epoch_steps,
            alg=args.alg,
            noise=args.noise,
            gpu_memory=args.gpu_memory,
            varTH=args.varth,
            act_policy=args.act_policy,
            save_dir=directory,
            nb_test_steps=nb_test_steps,
            scope=args.scope,
            test_eps=args.test_eps,
            checkpoint_path=reload_path,
            init_t=args.init_t,
        )
        print("Saving model to model.pkl")
        act.save(os.path.join(directory, "model.pkl"))
        plot(records, directory)
    env.close()
def main(argv=None): try: options, args = getopt.getopt(sys.argv[1:], "s:x:b:u:mh", [ "step=", "max_eps=", "buffer_size=", "hidden_unit=","monitor", "help"]) except getopt.GetoptError as err: print(str(err)) print(usage.__doc__) sys.exit(1) GAME_NAME = 'CartPole-v1' AGENT_NAME = 'DQN-lr_1_e-3' MONITOR = False print_step = 10 max_eps = 500 buffer_size=1000000 hidden_unit = 16 lr=1e-3 print(options) for o, v in options: if o in ("-h", "--help"): print(usage.__doc__) sys.exit() elif o in ("-m", "--monitor"): MONITOR = True elif o in ("-s", "--step"): print_step = int(v) elif o in ("-x", "--max_eps"): max_eps = int(v) elif o in ("-b", "--buffer_size"): buffer_size = int(v) elif o in ("-u", "--hidden_unit"): hidden_unit = int(v) else: print(usage.__doc__) sys.exit() print('process game: %s\tusing agent: %s' % (GAME_NAME, AGENT_NAME)) # -------------------------------- loop for training ----------------------------- # preparing env output_dir = '%s/%s' % (GAME_NAME, AGENT_NAME) cmd = 'mkdir -p %s && mkdir -p %s/%s' % (GAME_NAME, GAME_NAME, AGENT_NAME) os.system(cmd) env = gym.make(GAME_NAME) if MONITOR: env = Monitor(env, directory=output_dir, force=True, video_callable=lambda ep: ep % 10 == 0, write_upon_reset=True, mode='training') env.seed(0) state_num = len(env.reset()) print(state_num) action_sample = env.action_space.sample() action_num = env.action_space.n if isinstance(action_sample, int) else len(action_sample) print('state_num: %d\taction_num: %d' % (state_num, action_num)) device = torch.device('cpu') agent = DQNAgent(state_num, action_num, buffer_size=buffer_size, batch_size=128, device=device, hidden_unit=hidden_unit, lr=lr) scores_window = deque(maxlen=print_step) # last 10 scores avg_scores = [] for i_episode in range(max_eps): score = 0 state = env.reset() while True: action = agent.choose_action(state) next_state, reward, done, _ = env.step(action) agent.step(state, action, reward, next_state, done) score += reward state = next_state if done: break scores_window.append(score) print('\rEpisode {}\tAverage Score: {:.2f} '.format( i_episode, np.mean(scores_window)), end="") if i_episode % print_step == 0: print('\rEpisode {}\tAverage Score: {:.2f}'.format( i_episode, np.mean(scores_window))) # save model agent.save_model_params(output_dir, i_episode) avg_scores.append(np.mean(scores_window)) sys.stdout.flush() env.close()
def main():
    # Play settings
    parser = argparse.ArgumentParser(description='A3C:Play')
    parser.add_argument(
        '--name',
        type=str,
        required=True,
        help="Experiment name. All outputs will be stored in checkpoints/[name]/")
    parser.add_argument('--model_name',
                        default='best_model',
                        help='Model to play with (default: best_model)')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        help='Random seed (default: 1)')
    parser.add_argument('--n_eps',
                        type=int,
                        default=100,
                        help='# of episodes (default: 100)')
    parser.add_argument('--gpu_id',
                        type=int,
                        default=0,
                        help='GPU id (default: 0)')
    parser.add_argument('--no_render',
                        action='store_true',
                        help='Do not render to screen (default: False)')
    parser.add_argument('--random',
                        action='store_true',
                        help='Act randomly (default: False)')
    parser.add_argument('--duration',
                        type=float,
                        default=5,
                        help='How long does the play last (default: 5 [min])')
    args = parser.parse_args()

    args.save_path = os.path.join('checkpoints', args.name)
    args.model_path = os.path.join(args.save_path, 'snapshots',
                                   '{}.pth'.format(args.model_name))
    args.gif_path = os.path.join(
        args.save_path, 'gifs',
        '{}_{}'.format(args.model_name,
                       datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")))

    with open(os.path.join(args.save_path, 'config')) as f:
        vargs = json.loads(''.join(f.readlines()))
    vargs.update(vars(args))
    args.__dict__ = vargs

    print('------------ Options -------------')
    for k, v in sorted(vars(args).items()):
        print('{}: {}'.format(k, v))
    print('-------------- End ----------------')

    if not os.path.isdir(args.gif_path):
        os.makedirs(args.gif_path)

    setproctitle('{}:play'.format(args.name))
    torch.manual_seed(args.seed)

    env = create_env(args.game_type, args.env_name, 'play', 1)
    env = Monitor(env, args.gif_path, force=True)
    env._max_episode_seconds = args.duration * 60
    env.seed(args.seed)

    model = ActorCriticLSTM(env.observation_space.shape[0], env.action_space.n)
    model.load_state_dict(torch.load(args.model_path))
    if args.gpu_id >= 0:
        with torch.cuda.device(args.gpu_id):
            model.cuda()
    model.eval()

    best_reward = None
    for eps in range(args.n_eps):
        model.reset()
        reward, _ = play_game(env,
                              model,
                              render=not args.no_render,
                              rand=args.random,
                              gpu_id=args.gpu_id)
        best_reward = reward if best_reward is None else max(best_reward, reward)
        print('EPS: {}/{}, Reward: {}'.format(eps + 1, args.n_eps, reward))
    env.close()

    if args.n_eps > 10:
        gym.upload(args.gif_path, api_key='sk_aQXs9Po5RUyv0ZDQnkZ2A')
    os.rename(args.gif_path, args.gif_path + '_' + str(best_reward))
def deep_q_learning(sess, env, q_estimator, target_estimator, state_processor, num_episodes, experiment_dir, replay_memory_size=500000, replay_memory_init_size=20000, update_target_estimator_every=10000, discount_factor=0.99, epsilon_start=1.0, epsilon_end=0.1, epsilon_decay_steps=500000, batch_size=32, record_video_every=50): Transition = namedtuple("Transition", ["state", "action", "reward", "next_state", "done"]) # The replay memory replay_memory = [] # Keeps track of useful statistics stats = plotting.EpisodeStats( episode_lengths=np.zeros(num_episodes), episode_rewards=np.zeros(num_episodes)) # Create directories for checkpoints and summaries checkpoint_dir = os.path.join(experiment_dir, "checkpoints") checkpoint_path = os.path.join(checkpoint_dir, "model") monitor_path = os.path.join(experiment_dir, "monitor") if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) if not os.path.exists(monitor_path): os.makedirs(monitor_path) saver = tf.train.Saver() # Load a previous checkpoint if we find one latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir) if latest_checkpoint: print("Loading model checkpoint {}...\n".format(latest_checkpoint)) saver.restore(sess, latest_checkpoint) # Get the current time step total_t = sess.run(tf.train.get_global_step()) # The epsilon decay schedule epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps) # The policy we're following policy = make_epsilon_greedy_policy( q_estimator, len(VALID_ACTIONS)) # Populate the replay memory with initial experience print("Populating replay memory...") ############################################################ # YOUR CODE 1 : Populate replay memory! # Hints : use function "populate_replay_buffer" # about 1 line code replay_memory = populate_replay_buffer(sess, env, state_processor, replay_memory_init_size, VALID_ACTIONS, Transition, policy) # Record videos env = Monitor(env, directory=monitor_path, resume=True, video_callable=lambda count: count % record_video_every == 0) for i_episode in range(num_episodes): # Save the current checkpoint saver.save(tf.get_default_session(), checkpoint_path) # Reset the environment state = env.reset() state = state_process(sess, state_processor, state) loss = None # One step in the environment for t in itertools.count(): # Epsilon for this time step epsilon = epsilons[min(total_t, epsilon_decay_steps - 1)] # Add epsilon to Tensorboard episode_summary = tf.Summary() episode_summary.value.add(simple_value=epsilon, tag="epsilon") q_estimator.summary_writer.add_summary(episode_summary, total_t) ########################################################### # YOUR CODE 2: Target network update # Hints : use function "copy_model_parameters" if total_t % update_target_estimator_every == 0: copy_model_parameters(sess, q_estimator, target_estimator) # Print out which step we're on, useful for debugging. 
print("\rStep {} ({}) @ Episode {}/{}, loss: {}".format( t, total_t, i_episode + 1, num_episodes, loss), end="") sys.stdout.flush() ############################################## # YOUR CODE 3: Take a step in the environment # Hints 1 : be careful to use function 'state_process' to deal the RPG state # Hints 2 : you can see function "populate_replay_buffer()" # for detail about how to TAKE A STEP # about 2 or 3 line codes action = np.random.choice(len(VALID_ACTIONS), p=policy(sess, state, epsilon)) next_state, reward, done, _ = env.step(VALID_ACTIONS[action]) next_state = state_processor.process(sess, next_state) next_state = np.append(state[:, :, 1:], np.expand_dims(next_state, 2), axis=2) # If our replay memory is full, pop the first element if len(replay_memory) == replay_memory_size: replay_memory.pop(0) ############################# # YOUR CODE 4: Save transition to replay memory # Hints : you can see function 'populate_replay_buffer' for detail # about 1 or 2 line codes replay_memory.append(Transition(state, action, reward, next_state, done)) # Update statistics stats.episode_rewards[i_episode] += reward stats.episode_lengths[i_episode] = t ######################################################### # YOUR CODE 5: Sample a minibatch from the replay memory, # hints: can use function "random.sample( replay_memory, batch_size )" to get minibatch # about 1-2 lines codes #minibatch = np.array(rd.sample(replay_memory, batch_size)) samples = rd.sample(replay_memory, batch_size) s, a, r, s_next, done_ = map(np.array, zip(*samples)) ########################################################### # YOUR CODE 6: use minibatch sample to calculate q values and targets # Hints 1 : use function 'q_estimator.predict' to get q values # Hints 2 : use function 'target_estimator.predict' to get targets values # remember 'target = reward + gamma * max q( s, a' )' # about 2 line codes q_eval_next = q_estimator.predict(sess, s_next) best_actions = np.argmax(q_eval_next, axis=1) q_target_next = target_estimator.predict(sess, s_next) q_targets = r + np.invert(done_).astype(np.float32) * discount_factor * q_target_next[np.arange(batch_size), best_actions] ################################################ # YOUR CODE 7: Perform gradient descent update # hints : use function 'q_estimator.update' # about 1 line code loss = q_estimator.update(sess, np.array(s), a, q_targets) if done: break state = next_state total_t += 1 # Add summaries to tensorboard episode_summary = tf.Summary() episode_summary.value.add(simple_value=stats.episode_rewards[i_episode], node_name="episode_reward", tag="episode_reward") episode_summary.value.add(simple_value=stats.episode_lengths[i_episode], node_name="episode_length", tag="episode_length") q_estimator.summary_writer.add_summary(episode_summary, total_t) q_estimator.summary_writer.flush() yield total_t, plotting.EpisodeStats( episode_lengths=stats.episode_lengths[:i_episode + 1], episode_rewards=stats.episode_rewards[:i_episode + 1]) env.close() return stats
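# Note on the "YOUR CODE 6" block above: unlike the vanilla DQN target used in
# the other deep_q_learning variant in this file
# (targets = r + gamma * max_a' Q_target(s', a')), this version computes the
# Double DQN target: the greedy action is selected with the online q_estimator
# and then evaluated with the target_estimator. Side by side, with the same
# variable names as above:
#
#   # vanilla DQN
#   q_next = target_estimator.predict(sess, s_next)
#   targets = r + np.invert(done_).astype(np.float32) * discount_factor * np.max(q_next, axis=1)
#
#   # Double DQN (as implemented above)
#   best_actions = np.argmax(q_estimator.predict(sess, s_next), axis=1)
#   q_target_next = target_estimator.predict(sess, s_next)
#   targets = r + np.invert(done_).astype(np.float32) * discount_factor * \
#             q_target_next[np.arange(batch_size), best_actions]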
def run_trial(args): # tries to get agent type agent_t = args.agent results_dir = '' if agent_t == AgentType.Testing: # tries to load config from provided results dir path results_dir = args.results if args.results is not None else \ get_agent_output_dir(DEFAULT_CONFIG, AgentType.Learning) config_file = join(results_dir, 'config.json') if not exists(results_dir) or not exists(config_file): raise ValueError('Could not load configuration from: {}.'.format(config_file)) config = EnvironmentConfiguration.load_json(config_file) # if testing, we want to force a seed different than training (diff. test environments) config.seed += 1 else: # tries to load env config from provided file path config_file = args.config config = DEFAULT_CONFIG if config_file is None or not exists(config_file) \ else EnvironmentConfiguration.load_json(config_file) # creates env helper helper = create_helper(config) # checks for provided output dir output_dir = args.output if args.output is not None else get_agent_output_dir(config, agent_t, args.trial) if not exists(output_dir): makedirs(output_dir) # saves / copies configs to file config.save_json(join(output_dir, 'config.json')) helper.save_state_features(join(output_dir, 'state_features.csv')) # register environment in Gym according to env config env_id = '{}-{}-v0'.format(config.gym_env_id, args.trial) helper.register_gym_environment(env_id, False, FPS, SHOW_SCORE_BAR) # create environment and monitor env = gym.make(env_id) # todo config.num_episodes = 100 video_callable = video_schedule(config, args.record) env = Monitor(env, directory=output_dir, force=True, video_callable=video_callable) # adds reference to monitor to allow for gym environments to update video frames if video_callable(0): env.env.monitor = env # initialize seeds (one for the environment, another for the agent) env.seed(config.seed + args.trial) agent_rng = np.random.RandomState(config.seed + args.trial) # creates the agent agent, exploration_strategy = create_agent(helper, agent_t, agent_rng) # if testing, loads tables from file (some will be filled by the agent during the interaction) if agent_t == AgentType.Testing: agent.load(results_dir, ) # runs episodes behavior_tracker = BehaviorTracker(config.num_episodes) recorded_episodes = [] for e in range(config.num_episodes): # checks whether to activate video monitoring env.env.monitor = env if video_callable(e) else None # reset environment old_obs = env.reset() old_s = helper.get_state_from_observation(old_obs, 0, False) if args.verbose: helper.update_stats_episode(e) exploration_strategy.update(e) t = 0 done = False while not done: # select action a = agent.act(old_s) # observe transition obs, r, done, _ = env.step(a) s = helper.get_state_from_observation(obs, r, done) r = helper.get_reward(old_s, a, r, s, done) # update agent and stats agent.update(old_s, a, r, s) behavior_tracker.add_sample(old_s, a) helper.update_stats(e, t, old_obs, obs, old_s, a, r, s) old_s = s old_obs = obs t += 1 # adds to recorded episodes list if video_callable(e): recorded_episodes.append(e) # signals new episode to tracker behavior_tracker.new_episode() # writes results to files agent.save(output_dir) behavior_tracker.save(output_dir) write_table_csv(recorded_episodes, join(output_dir, 'rec_episodes.csv')) helper.save_stats(join(output_dir, 'results'), CLEAR_RESULTS) print('\nResults of trial {} written to:\n\t\'{}\''.format(args.trial, output_dir)) env.close()
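# `video_schedule` is referenced above but not defined in this snippet. A
# plausible sketch, assuming it only decides which episodes get recorded:
# recording disabled yields a schedule that is always False, otherwise roughly
# one episode in ten is recorded. The spacing rule here is an assumption, not
# the original behavior.
def video_schedule(config, record):
    if not record:
        return lambda e: False
    interval = max(1, config.num_episodes // 10)
    return lambda e: e % interval == 0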
for i, act_name in enumerate(action_names): print('{:10.5f}'.format(agent.q[s][i]), end='\t') print() # checks if save feature was pressed if save_features: helper.save_features_image(obs_vec, join(output_dir, 'features.png')) save_features = False # checks if save environment was pressed if save_environment: save_image(env, join(output_dir, 'environment.png')) save_environment = False window_still_open = env.render() is not None old_s = s old_obs = obs t += 1 # signals new episode to tracker behavior_tracker.new_episode() e += 1 # writes results to files behavior_tracker.save(output_dir) helper.save_stats(join(output_dir, 'results')) print('\nResults written to:\n\t\'{}\''.format(output_dir)) env.close()
def train(self, table, discount_factor, start_epsilon, end_epsilon, learning_env, testing_env, total_observations, test_interval, total_number_of_testing_episodes, gym_training_logs_directory_path, gym_testing_logs_directory_path, table_saving_interval): """Train the GLIE Monte Carlo agent table -- Table storing Q values and visit numbers for state action pairs discount_factor -- Quantifies how much the agent cares about future rewards while learning. Often referred to as gamma in the literature. start_epsilon -- Probability of random actions at start of training end_epsilon -- Probability of random actions at end of training learning_env -- A Gym environment (wrapped or vanilla) used for learning testing_env -- A Gym environment (wrapped or vanilla) used for testing. total_observations -- Train till this observation number test_interval -- Test after this many observations total_number_of_testing_episodes -- Number of episodes to test the agent in every testing round gym_training_logs_directory_path - Directory to save automatic Gym logs related to training. We save the rewards for every learning episode. gym_testing_logs_directory_path - Directory to save automatic Gym logs related to testing. We save a video for the first test episode. table_saving_interval -- Save the table (i.e. write the table to file) after this many observations. """ # This keeps track of the number of observations made so far observation_number = 0 # Keep count of the episode number episode_number = 1 # The learning env should always be wrapped by the Monitor provided # by Gym. This lets us automatically save the rewards for every episode. learning_env = Monitor( learning_env, gym_training_logs_directory_path, # Don't want video recording during training, only during testing video_callable=False, # Write after every reset so that we don't lose data for # prematurely interrupted training runs write_upon_reset=True, ) while observation_number < total_observations: # Need a list to hold the relevant information for this episode. # This will be used in updating Q values. # structure : [ # { # "observation" : observation, # "action" : action, # "reward" : immediate reward # },... # ] episode_history = [] # initialize environment observation = learning_env.reset() total_rewards_obtained_in_this_episode = 0 # Execute an episode while True: epsilon = self.get_epsilon(start_epsilon, end_epsilon, observation_number, total_observations) # use the epsilon-greedy policy to choose an action action = self.get_action(learning_env, table, observation, epsilon) # take the action determined by the epsilon-greedy policy next_observation, reward, done, info = learning_env.step( action) # add the current state and resulting reward to history episode_history.append({ "observation": observation, "action": action, "reward": reward }) observation = next_observation observation_number += 1 # Test the current performance after every test_interval if observation_number % test_interval == 0: # The testing env is also wrapped by a Monitor so that we # can take automatic videos during testing. We will take a # video for the very first testing episode. video_callable = lambda count: count == 0 # Since the environment is closed after every testing round, # the video for different testing round will end up having # the same name! To differentiate the videos, we pass # an unique uid parameter. 
                    monitored_testing_env = Monitor(
                        testing_env,
                        gym_testing_logs_directory_path,
                        video_callable=video_callable,
                        resume=True,
                        uid=observation_number / test_interval)

                    # Run the test
                    average_reward = self.test(
                        monitored_testing_env,
                        total_number_of_episodes=total_number_of_testing_episodes,
                        table=table,
                        epsilon=0,
                        render=False)

                    print("[{0}] Episode number : {1}, Observation number : {2} "
                          "Average reward (100 eps) : {3}".format(
                              datetime.datetime.now(), episode_number,
                              observation_number, average_reward))

                total_rewards_obtained_in_this_episode += reward

                if done:
                    # episode has ended, update Q and N values
                    self.update_table(table, discount_factor, episode_history)
                    episode_number += 1
                    # save the table at regular intervals
                    if episode_number % table_saving_interval == 0:
                        table.save()
                    break

            print("[{0}] Episode number : {1}, Observation number: {2}, "
                  "Reward in this episode : {3}, Epsilon : {4}".format(
                      datetime.datetime.now(), episode_number - 1,
                      observation_number,
                      total_rewards_obtained_in_this_episode, epsilon))

        learning_env.close()

        # There's a bug in the Gym Monitor. The Monitor's close method does not
        # close the wrapped environment. This makes the script exit with an
        # error if the environment is being rendered at some point. To make
        # this error go away, we have to close the unwrapped testing
        # environment. The learning environment is not being rendered, so we
        # don't need to bother about that.
        testing_env.env.close()
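# A minimal sketch of what the `update_table(table, discount_factor,
# episode_history)` call above could do for GLIE Monte Carlo control. The real
# Table object (holding both Q values and visit counts) is not shown in this
# snippet, so the sketch decomposes it into two plain dicts; only the
# episode_history structure (observation/action/reward dicts) is taken from the
# training loop, and observations are assumed hashable.
def update_table_sketch(q_values, visit_counts, discount_factor, episode_history):
    g = 0.0
    # Walk the episode backwards, accumulating the discounted return.
    for step in reversed(episode_history):
        key = (step["observation"], step["action"])
        g = step["reward"] + discount_factor * g
        visit_counts[key] = visit_counts.get(key, 0) + 1
        # Incremental every-visit MC update with step size 1 / N(s, a).
        old_q = q_values.get(key, 0.0)
        q_values[key] = old_q + (g - old_q) / visit_counts[key]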
def evaluate(self, n_games=1, save_path="./records", use_monitor=True,
             record_video=True, verbose=True, t_max=100000):
    """Plays an entire game start to end, records the logs (and possibly mp4 video), returns reward.

    :param save_path: where to save the report
    :param record_video: if True, records mp4 video
    :return: total reward (scalar)
    """
    env = self.make_env()

    if not use_monitor and record_video:
        raise ValueError(
            "Cannot record video without gym monitor. "
            "If you still want video, set use_monitor to True")

    if record_video:
        env = Monitor(env, save_path, force=True)
    elif use_monitor:
        env = Monitor(env, save_path, video_callable=lambda i: False, force=True)

    game_rewards = []
    for _ in range(n_games):
        # initial observation
        observation = env.reset()
        # initial memory
        prev_memories = [
            np.zeros((1, ) + tuple(mem.output_shape[1:]),
                     dtype=get_layer_dtype(mem))
            for mem in self.agent.agent_states
        ]

        t = 0
        total_reward = 0
        while True:
            res = self.agent_step(
                self.preprocess_observation(observation)[None, ...],
                *prev_memories)
            action, new_memories = res[0], res[1:]

            observation, reward, done, info = env.step(action[0])
            total_reward += reward
            prev_memories = new_memories

            if done or t >= t_max:
                if verbose:
                    print("Episode finished after {} timesteps with reward={}"
                          .format(t + 1, total_reward))
                break
            t += 1

        game_rewards.append(total_reward)

    env.close()
    del env
    return game_rewards
def train(self, actor, critic, discount_factor, lambda_value, learning_env, testing_env, horizon, minibatch_size, epochs, total_observations, test_interval, total_number_of_testing_episodes, gym_training_logs_directory_path, gym_testing_logs_directory_path): """Train the PPO agent actor -- The actor instance critic -- The critic instance discount_factor -- Quantifies how much the agent cares about future rewards while learning. Often referred to as gamma in the literature. lambda_value -- The lambda in TD(lambda) learning_env -- A Gym environment (wrapped or vanilla) used for learning testing_env -- A Gym environment (wrapped or vanilla) used for testing. This may be different from learning_env. For example, we might be scaling the rewards in the learning environment. But we want to benchmark performance in a testing environment where the rewards are not scaled. horizon -- Number of experiences to collect before performing a training step. Must be a integer multiple of minibatch_size. minibatch_size -- Minibatch size for training the actor and critic. epochs -- Number of epochs of training on one set of experiences total_observations -- Train till this observation number test_interval -- Test after this many observations total_number_of_testing_episodes -- Number of episodes to test the agent in every testing round gym_training_logs_directory_path - Directory to save automatic Gym logs related to training. We save the rewards for every learning episode. gym_testing_logs_directory_path - Directory to save automatic Gym logs related to testing. We save a video for the first test episode. """ # We will fill training_samples with the agent's experience till it # reaches a size equal to horizon. Then we will train the actor # and critic on this data. After training is done, we will empty the # list and repeat the process for the next sequence of experiences. training_samples = [] # To make computing advantages and value function targets easier, we # put the experiences first in a different list # training_samples_this_episode. When the episode ends or horizon is # reached (whichever happens earlier), we compute advantages and # value function targets using this list. Then the list is emptied # and the data transfered to the other list training_samples. training_samples_this_episode = [] # This keeps track of the number of observations made so far observation_number = 0 # Keep count of the episode number episode_number = 1 # The learning env should always be wrapped by the Monitor provided # by Gym. This lets us automatically save the rewards for every episode. learning_env = Monitor( learning_env, gym_training_logs_directory_path, # Don't want video recording during training, only during testing video_callable=False, # Write after every reset so that we don't lose data for # prematurely interrupted training runs write_upon_reset=True, ) while observation_number < total_observations: # Start of an episode observation = learning_env.reset() # Predicted value for this observation value = critic.get_value(np.array([observation]))[0][0] total_rewards_obtained_in_this_episode = 0 while True: policy = actor.get_policies(np.array([observation]))[0] action = actor.get_actions(np.array([policy]))[0] # The actor may not keep the actions within the bounds accepted # by the environment. Therefore, we clip the action manually to # make it conform to the bounds. 
clipped_action = np.clip(action, learning_env.action_space.low, learning_env.action_space.high) next_observation, reward, done, info = ( learning_env.step(clipped_action)) # Predicted value of the next observation, necessary for # calculating TD error next_value = (critic.get_value(np.array( [next_observation]))[0][0] if not done else 0) experience = { "observation": observation, "next_observation": next_observation, "means": policy["means"], "vars": policy["vars"], "action": action, "clipped_action": clipped_action, "reward": reward, "terminal": done, "value": value, "next_value": next_value, } training_samples_this_episode.append(experience) observation = next_observation value = next_value observation_number += 1 # Test the current performance after every test_interval if observation_number % test_interval == 0: # The testing env is also wrapped by a Monitor so that we # can take automatic videos during testing. We will take a # video for the very first testing episode. video_callable = lambda count: count == 0 # Since the environment is closed after every testing round, # the video for different testing round will end up having # the same name! To differentiate the videos, we pass # an unique uid parameter. monitored_testing_env = Monitor( testing_env, gym_testing_logs_directory_path, video_callable=video_callable, resume=True, uid=observation_number / test_interval) # Run the test average_reward = self.test( monitored_testing_env, total_number_of_episodes= total_number_of_testing_episodes, actor=actor, render=False) print( "[{0}] Episode number : {1}, Observation number : {2} " "Average reward (100 eps) : {3}".format( datetime.datetime.now(), episode_number, observation_number, average_reward)) total_rewards_obtained_in_this_episode += reward ## Training starts here # If previous episodes ended quickly before we could reach the # horizon, these experiences have already been transfered to # training_samples. So, to get the total number of experiences # gathered since the last training step, we have sum up the # experiences gathered in this episode and the experiences # from previous episodes which have been transferred to # training_samples. number_of_experiences_since_last_training_step = ( len(training_samples_this_episode) + len(training_samples)) # If horizon is reached or the episode ended, we compute # advantages and value targets using # training_samples_this_episode. The experiences are then # transfered to the list training_samples. Finally # training_samples_this_episode is emptied to accomodate # further experiences. if (number_of_experiences_since_last_training_step == horizon or done): training_samples_this_episode_with_targets = ( self.compute_advantages_and_value_targets( training_samples_this_episode, discount_factor, lambda_value)) training_samples += ( training_samples_this_episode_with_targets) training_samples_this_episode = [] # If horizon is reached, we train the actor and critic on the # stored experiences. Then we forget about those experiences # by emptying training_samples. if number_of_experiences_since_last_training_step == horizon: self.perform_training_step(actor, critic, training_samples, minibatch_size, epochs) training_samples = [] # After a round of training, the actor and critic weights # have changed. So we use the updated model to compute the # value function instead of using the value function # predicted by the older models. 
                    value = critic.get_value(np.array([next_observation]))[0]

                # Start over when the episode ends
                if done:
                    episode_number += 1
                    break

            print("[{0}] Episode number : {1}, Observation number: {2}, "
                  "Reward in this episode : {3}".format(
                      datetime.datetime.now(), episode_number - 1,
                      observation_number,
                      total_rewards_obtained_in_this_episode))

        learning_env.close()

        # There's a bug in the Gym Monitor. The Monitor's close method does not
        # close the wrapped environment. This makes the script exit with an
        # error if the environment is being rendered at some point. To make
        # this error go away, we have to close the unwrapped testing
        # environment. The learning environment is not being rendered, so we
        # don't need to bother about that.
        testing_env.env.close()
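# A minimal sketch of the `compute_advantages_and_value_targets` step used in
# the PPO training loop above, written as generalized advantage estimation
# (GAE). The field names come from the experience dicts built in the loop;
# treating the value target as `advantage + value` and the output field names
# are assumptions, since the original implementation is not shown here.
def compute_advantages_and_value_targets_sketch(samples, discount_factor,
                                                lambda_value):
    advantage = 0.0
    # Walk the episode fragment backwards so each advantage can bootstrap on
    # the advantage of the following step.
    for sample in reversed(samples):
        non_terminal = 0.0 if sample["terminal"] else 1.0
        td_error = (sample["reward"]
                    + discount_factor * sample["next_value"] * non_terminal
                    - sample["value"])
        advantage = (td_error
                     + discount_factor * lambda_value * non_terminal * advantage)
        sample["advantage"] = advantage
        sample["value_target"] = advantage + sample["value"]
    return samples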
def run_trial(args): # tries to get agent type agent_t = args.agent if agent_t == AgentType.Testing: # tries to load a pre-trained agent configuration file config, results_dir = load_agent_config(args.results, args.trial) else: # tries to load env config from provided file path config_file = args.config_file_path config = args.default_frogger_config if config_file is None or not exists(config_file) \ else EnvironmentConfiguration.load_json(config_file) # creates env helper helper = create_helper(config) # checks for provided output dir output_dir = args.output if args.output is not None else \ get_agent_output_dir(config, agent_t, args.trial) if not exists(output_dir): makedirs(output_dir) # saves / copies configs to file config.save_json(join(output_dir, 'config.json')) helper.save_state_features(join(output_dir, 'state_features.csv')) # register environment in Gym according to env config env_id = '{}-{}-v0'.format(config.gym_env_id, args.trial) helper.register_gym_environment(env_id, False, args.fps, args.show_score_bar) # create environment and monitor env = gym.make(env_id) config.num_episodes = args.num_episodes video_callable = video_schedule(config, args.record) env = Monitor(env, directory=output_dir, force=True, video_callable=video_callable) # adds reference to monitor to allow for gym environments to update video frames if video_callable(0): env.env.monitor = env # initialize seeds (one for the environment, another for the agent) env.seed(config.seed + args.trial) agent_rng = np.random.RandomState(config.seed + args.trial) # creates the agent agent, exploration_strategy = create_agent(helper, agent_t, agent_rng) # if testing, loads tables from file (some will be filled by the agent during the interaction) if agent_t == AgentType.Testing: agent.load(results_dir) # runs episodes behavior_tracker = BehaviorTracker(config.num_episodes) recorded_episodes = [] for e in range(config.num_episodes): # checks whether to activate video monitoring env.env.monitor = env if video_callable(e) else None # reset environment old_obs = env.reset() old_s = helper.get_state_from_observation(old_obs, 0, False) if args.verbose: print(f'Episode: {e}') # helper.update_stats_episode(e) exploration_strategy.update(e) # update for learning agent t = 0 done = False while not done: # select action a = agent.act(old_s) # observe transition obs, r, done, _ = env.step(a) s = helper.get_state_from_observation(obs, r, done) r = helper.get_reward(old_s, a, r, s, done) # update agent and stats agent.update(old_s, a, r, s) behavior_tracker.add_sample(old_s, a) helper.update_stats(e, t, old_obs, obs, old_s, a, r, s) old_s = s old_obs = obs t += 1 # adds to recorded episodes list if video_callable(e): recorded_episodes.append(e) # signals new episode to tracker behavior_tracker.new_episode() # writes results to files agent.save(output_dir) behavior_tracker.save(output_dir) write_table_csv(recorded_episodes, join(output_dir, 'rec_episodes.csv')) helper.save_stats(join(output_dir, 'results'), args.clear_results) print('\nResults of trial {} written to:\n\t\'{}\''.format( args.trial, output_dir)) env.close()
class BaseAgent: def __init__(self, config): self.config = config self.env = config['env'] make_seed(config['seed']) self.env.seed(config['seed']) self.use_cuda = config['use_cuda'] self.gamma = config['gamma'] self.verbose = config['verbose'] self.max_episode_length = config['max_episode_length'] self.use_mean_baseline = config.get('use_mean_baseline', False) self.model = config['model'] # the optimizer used by PyTorch (Stochastic Gradient, Adagrad, Adam, etc.) self.optimizer = torch.optim.Adam(self.model.net.parameters(), lr=config['learning_rate']) self.monitor_env = Monitor(self.env, "./gym-results", force=True, video_callable=lambda episode: True) @abc.abstractmethod def _compute_returns(self, rewards): """Returns the cumulative discounted rewards at each time step Parameters ---------- rewards : array The array of rewards of one episode Returns ------- array The cumulative discounted rewards at each time step Example ------- for rewards=[1, 2, 3] this method outputs [1 + 2 * gamma + 3 * gamma**2, 2 + 3 * gamma, 3] """ raise NotImplementedError def sample_trajectories(self, n_trajectories): trajectories = [] for _ in range(n_trajectories): states = [ torch.from_numpy(self.env.reset()).type(self.model.dtype) ] actions = [] rewards = [] log_probs = [] done = False count = 0 # stop after self.max_episode_length steps, # otherwise episodes run for too long when # the agent is skilled enough while not done and count < self.max_episode_length: action = int(self.model.select_action(states[-1])) prob = self.model.forward(states[-1]) # clip prob away from 0 and 1 to avoid numerical issues when taking the log prob = torch.clamp(prob, np.finfo(np.float32).eps, 1 - np.finfo(np.float32).eps) log_prob = torch.log(prob) state, reward, done, _ = self.env.step(action) states.append(torch.from_numpy(state).type(self.model.dtype)) actions.append(action) rewards.append(reward) log_probs.append(log_prob) count += 1 trajectories.append({ 'states': states, 'actions': actions, 'rewards': rewards, 'log_probs': log_probs, }) return trajectories @abc.abstractmethod def optimize_model(self, n_trajectories): """Perform a gradient update using n_trajectories Parameters ---------- n_trajectories : int The number of trajectories used to approximate the expectation card(D) in the formula above Returns ------- array The cumulative discounted rewards of each trajectory """ raise NotImplementedError def train(self, n_trajectories, n_update): """Training method Parameters ---------- n_trajectories : int The number of trajectories used to approximate the expected gradient n_update : int The number of gradient updates """ rewards = [] for episode in range(n_update): rewards.append(self.optimize_model(n_trajectories)) if (episode + 1) % self.verbose == 0: rewards_np = np.array(rewards) # Print the reward stats averaged across all last self.verbose steps mean = rewards_np[-1 - self.verbose:-1].mean() std = rewards_np[-1 - self.verbose:-1].std() print( f'Episode {episode + 1}/{n_update}: rewards {round(mean, 2)} +/- {round(std, 2)}' ) # Plotting r = pd.DataFrame((itertools.chain(*(itertools.product([i], rewards[i]) for i in range(len(rewards))))), columns=['Epoch', 'Reward']) sns.lineplot(x="Epoch", y="Reward", data=r, ci='sd') def evaluate(self, render=False): """Evaluate the agent on a single trajectory """ observation = self.monitor_env.reset() observation = torch.tensor(observation, dtype=torch.float) reward_episode = 0 done = False while not done: action = self.model.select_action(observation) observation, reward, done, 
info = self.monitor_env.step( int(action)) observation = torch.tensor(observation, dtype=torch.float) reward_episode += reward self.monitor_env.close() if render: self.env.render() print(f'Reward: {reward_episode}')
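# A concrete sketch of the abstract `_compute_returns` described in the
# BaseAgent docstring above: for rewards=[1, 2, 3] it returns
# [1 + 2*gamma + 3*gamma**2, 2 + 3*gamma, 3]. Written as a standalone helper
# that a subclass's `_compute_returns` could delegate to; the function name is
# made up.
import numpy as np


def compute_returns_sketch(rewards, gamma):
    """Cumulative discounted rewards at each time step."""
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns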