def q_learning(sess, env, agent, num_episodes, max_time_per_episode,
               discount_factor=0.99, epsilon=0.4, epsilon_decay=.95,
               use_experience_replay=False, max_replay_buffer_size=4000,
               batch_size=128, target=None, tf_saver=None, save_path=None,
               save_interval=None):
    """
    Q-Learning algorithm for off-policy TD control using function approximation.
    Finds the optimal greedy policy while following an epsilon-greedy policy.
    Implements the options of online learning or experience replay, as well as
    target calculation by a target network, depending on the flags. You can reuse
    your Q-learning implementation of the last exercise.

    Args:
        env: PLE game
        agent: Action-value function estimator
        num_episodes: Number of episodes to run for.
        max_time_per_episode: Maximum number of time steps before an episode is terminated.
        discount_factor: Gamma, discount factor of future rewards.
        epsilon: Chance to sample a random action. Float between 0 and 1.
        epsilon_decay: Decay rate of the epsilon parameter.
        use_experience_replay: Indicator if experience replay should be used.
        batch_size: Number of samples per batch.
        target: Slowly updated target network to calculate the targets. Ignored if None.

    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """
    # Keeps track of useful statistics
    stats = EpisodeStats(episode_lengths=np.zeros(num_episodes),
                         episode_rewards=np.zeros(num_episodes))
    replay_buffer = ReplayBuffer(max_replay_buffer_size)
    action_set = env.getActionSet()

    for i_episode in range(num_episodes):

        # The policy we're following
        policy = make_epsilon_greedy_policy(agent.predict, len(action_set))

        # Print out which episode we're on, useful for debugging.
        # Also print the reward of the last episode.
        last_reward = stats.episode_rewards[i_episode - 1]
        avg_reward = np.mean(stats.episode_rewards[max(i_episode - 100, 0):i_episode])
        print("\rEpisode {}/{} ({}), avg reward: {}".format(
            i_episode + 1, num_episodes, last_reward, avg_reward), end="")
        # sys.stdout.flush()

        # Reset the current environment
        env.reset_game()
        state = list(env.getGameState())
        done = False
        loss = None

        # Iterate through steps
        for t in range(max_time_per_episode):
            if env.game_over():
                done = True

            # Update target network maybe
            if target:
                pass

            # Take a step with the epsilon-greedy policy
            action_probs = policy([state], epsilon)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            reward = env.act(action_set[action])
            next_state = list(env.getGameState())

            # Episode stats
            stats.episode_lengths[i_episode] = t
            # print(reward)
            stats.episode_rewards[i_episode] += reward

            if done:
                print("\rStep {} ({}) loss: {}\n".format(
                    t, max_time_per_episode, loss), end="")
                break

            if use_experience_replay:
                # Update replay buffer
                replay_buffer.add_transition(state, action, next_state, reward, done)

                # Sample minibatch from replay buffer
                batch_states, batch_actions, batch_next_states, batch_rewards, batch_dones = \
                    replay_buffer.next_batch(min(batch_size, replay_buffer.size()))
                batch_actions = list(zip(range(len(batch_actions)), batch_actions))

                # Calculate the TD targets for the batch. Use the "old" fixed parameters of the
                # target network if it is available, else the "old" parameters of the value
                # function estimate.
                batch_next_q_values = (target if target else agent.train_model).predict(
                    batch_next_states, None, None)
                batch_best_next_action = np.argmax(batch_next_q_values, axis=1)
                batch_td_target = [
                    batch_rewards[j] + discount_factor * batch_next_q_values[j][batch_best_next_action[j]]
                    for j in range(len(batch_states))
                ]

                # Update the Q value estimator parameters by minimizing the error between
                # the Q network predictions and the Q-learning targets.
                loss = agent.train(batch_states, batch_actions, batch_td_target)
            else:
                # Online update on the current transition only.
                next_q_values = (target if target else agent).predict(
                    [next_state], None, None)
                best_next_action = np.argmax(next_q_values, axis=1)
                td_target = reward + discount_factor * next_q_values[0][best_next_action[0]]
                loss = agent.train([state], [[0, action]], [td_target])

            if target:
                target.update()

            epsilon *= epsilon_decay
            state = next_state

        if save_interval and i_episode % save_interval == 0:
            tf_saver.save(sess, save_path, global_step=i_episode)

    return stats
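

# make_epsilon_greedy_policy, EpisodeStats and ReplayBuffer are assumed to be provided
# elsewhere in this repository. As a rough reference for the policy factory used above,
# the sketch below builds an epsilon-greedy action distribution from a Q-value estimator.
# It is a hypothetical helper (suffixed *_ref so it does not clash with the project's own
# implementation), and the estimator signature (a callable returning the Q-values of a
# batch of states) is an assumption.
def make_epsilon_greedy_policy_ref(estimator, n_actions):
    def policy_fn(states, epsilon):
        # Spread the exploration probability epsilon uniformly over all actions.
        action_probs = np.ones(n_actions, dtype=np.float32) * epsilon / n_actions
        # Put the remaining probability mass on the greedy action w.r.t. the Q estimate.
        q_values = np.asarray(estimator(states)).reshape(-1)
        best_action = int(np.argmax(q_values))
        action_probs[best_action] += 1.0 - epsilon
        return action_probs
    return policy_fn
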
def q_learning(q_network, env, test_env, seed, total_timesteps, log_interval, test_interval,
               show_interval, logdir, lr, max_grad_norm, units_per_hlayer, activ_fcn,
               gamma=0.95, epsilon=0.4, epsilon_decay=.95, buffer_size=4000, batch_size=128,
               trace_length=32, tau=0.99, update_interval=30, early_stop=False, keep_model=2,
               save_model=True, restore_model=False, save_traj=False):
    """
    Q-Learning algorithm for off-policy TD control using function approximation.
    Finds the optimal greedy policy while following an epsilon-greedy policy.
    Trains a DQNAgent with experience replay and a slowly updated target network;
    recurrent architectures are trained on sequences of length trace_length.

    Args:
        q_network: Q network architecture used by the DQNAgent.
        env: Training environment.
        test_env: Environment used for the periodic evaluation runs.
        seed: Random seed.
        total_timesteps: Total number of interactions with the environment.
        log_interval: Interval at which the agent logs training summaries.
        test_interval: Evaluate the model every test_interval gradient updates (0 disables testing).
        show_interval: Render every show_interval-th episode (0 disables rendering).
        logdir: Directory for checkpoints, summaries and result csv files.
        lr: Learning rate.
        max_grad_norm: Maximum norm used for gradient clipping.
        units_per_hlayer: Number of units per hidden layer of the Q network.
        activ_fcn: Activation function of the Q network.
        gamma: Discount factor of future rewards.
        epsilon: Chance to sample a random action. Float between 0 and 1.
        epsilon_decay: Decay rate of the epsilon parameter (applied at the end of each episode).
        buffer_size: Maximum size of the replay buffer.
        batch_size: Number of samples (or sequences) per training batch.
        trace_length: Length of the sampled sequences for recurrent architectures.
        tau: Update coefficient of the target network.
        update_interval: Perform a gradient update every update_interval environment steps.
        early_stop: Stop training early if the rolling mean return stays below a threshold.
        keep_model: Number of checkpoints to keep.
        save_model: Whether to save intermediate and final models.
        restore_model: Whether to restore a previously saved final model from logdir.
        save_traj: Whether to store the per-step reward trajectory.

    Returns:
        early_stopped: Whether training was stopped early.
        i_sample: Total number of interactions with the environment.
    """
    logger = logging.getLogger(__name__)
    # logger.info(datetime.time)
    tf.reset_default_graph()
    set_global_seeds(seed)

    # Params
    ob_space = env.observation_space
    ac_space = env.action_space
    nd, = ob_space.shape
    n_ac = ac_space.n

    # Create the learning agent and the replay buffer
    agent = DQNAgent(q_network=q_network, ob_space=ob_space, ac_space=ac_space, lr=lr,
                     max_grad_norm=max_grad_norm, units_per_hlayer=units_per_hlayer,
                     activ_fcn=activ_fcn, log_interval=log_interval, logdir=logdir,
                     batch_size=batch_size, trace_length=trace_length,
                     update_interval=update_interval, tau=tau, keep_model=keep_model)
    summary_writer = agent.get_summary_writer()
    result_path = os.path.join(logdir, 'train_results.csv')
    if save_traj:
        rew_traj = []
        rew_results_path = os.path.join(logdir, ('lr' + str(lr) + '_tracking_results.csv'))
    else:
        rew_results_path = None
    replay_buffer = ReplayBuffer(buffer_size)

    # Keeps track of useful statistics
    stats = {'episode_lengths': [], 'episode_rewards': []}

    if restore_model:
        for el in os.listdir(logdir):
            if 'final' in el and '.meta' in el:
                # Load the pre-trained model and set the network parameters
                logger.info('load %s' % os.path.join(logdir, el[:-5]))
                agent.load(os.path.join(logdir, el[:-5]))
                # Reset the global step parameter.
                agent.sess.run(agent.global_step.assign(0))

    # ------------------ TRAINING --------------------------------------------
    logger.info("Start Training")
    early_stopped = False
    i_episode, i_sample, i_train = 0, 0, 0
    ep_len, rew = 0, 0
    horizon = 100
    reward_window = deque(maxlen=horizon)
    avg_rm = deque(maxlen=30)
    nbatch = batch_size * trace_length
    return_threshold = -0.05  # 40

    # Reset env
    obs = env.reset()
    obs = normalize_obs(obs)
    done = False

    rnn_state0 = agent.step_initial_state
    if rnn_state0 is None:
        # With a feed-forward architecture we sample a batch of single transitions,
        # not a batch of sequences.
        trace_length = 1

    # Set the target network to be equal to the primary network
    agent.update_target(agent.target_ops)

    while i_sample < total_timesteps:
        # Epsilon-greedy action selection. The agent is stepped in both branches so that
        # the recurrent state is advanced even when a random action is taken.
        if np.random.rand(1) < epsilon:
            _, next_rnn_state = agent.step([obs], rnn_state0)
            action = np.random.randint(0, n_ac)
        else:
            AP, next_rnn_state = agent.step([obs], rnn_state0)
            action = AP[0]
        next_obs, reward, done, _ = env.step(action)
        next_obs = normalize_obs(next_obs)
        i_sample += 1

        # Render only every show_interval-th episode
        if show_interval != 0:
            if i_episode % show_interval == 0:
                env.render()

        ep_len += 1
        rew += reward
        reward_window.append(reward)

        # When the episode is done, add episode information to the tensorboard summary and stats
        if done:  # env.game_over():
            next_obs = list(np.zeros_like(next_obs, dtype=np.float32))
            stats['episode_lengths'].append(ep_len)
            stats['episode_rewards'].append(rew)

            if summary_writer is not None:
                summary = tf.Summary()
                summary.value.add(tag='envs/ep_return',
                                  simple_value=stats['episode_rewards'][i_episode])
                summary.value.add(tag="envs/ep_length",
                                  simple_value=stats['episode_lengths'][i_episode])
                summary_writer.add_summary(summary, i_episode)
                summary_writer.flush()

            if save_model and rew > return_threshold:
                return_threshold = rew
                logger.info('Save model at max reward %s' % return_threshold)
                agent.save('inter_model')

            i_episode += 1
            ep_len, rew = 0, 0

        # Update replay buffer
        replay_buffer.add_transition(obs, action, next_obs, reward, done)
        if save_traj:
            rew_traj.append(reward)

        # Update model parameters every update_interval steps, using recent and replayed experience.
        if replay_buffer.size() > nbatch and (i_sample % update_interval == 0):
            if env.spec._env_name == 'ContFlappyBird':
                # Track the rolling mean return over the last `horizon` steps.
                rm = sum(reward_window) / horizon
                if summary_writer is not None:
                    s_summary = tf.Summary()
                    s_summary.value.add(tag='envs/isample_return', simple_value=rm)
                    summary_writer.add_summary(s_summary, i_sample)
                    summary_writer.flush()

                if save_model and rm > return_threshold:
                    return_threshold = rm
                    logger.info('Save model at max rolling mean %s' % return_threshold)
                    agent.save('inter_model')
                avg_rm.append(rm)

            if early_stop:
                if (i_sample > 60000) and (i_sample <= (60000 + update_interval)):
                    if (sum(avg_rm) / 30) <= -0.88:
                        logger.info('Stopping training early due to low average return.')
                        early_stopped = True
                        break

            agent.update_target(agent.target_ops)

            # Reset the rnn state (history knowledge) before every training step
            rnn_state_train = agent.train_initial_state

            # Sample the training mini-batch from the replay buffer
            if rnn_state_train is not None:
                mb_obs, mb_actions, mb_next_obs, mb_rewards, _, batch_dones = \
                    replay_buffer.recent_and_next_batch_of_seq(batch_size, trace_length)
            else:
                mb_obs, mb_actions, mb_next_obs, mb_rewards, _, batch_dones = \
                    replay_buffer.recent_and_next_batch(batch_size)

            # Calculate the TD targets for the batch using the "old" fixed parameters
            # of the target network.
            # mb_next_obs = np.reshape(mb_next_obs, (-1, nd))
            mb_next_q_values, _ = agent.target_model.predict(mb_next_obs, rnn_state_train)
            mb_best_next_action = np.argmax(mb_next_q_values, axis=1)
            mb_td_target = [
                mb_rewards[j] + gamma * mb_next_q_values[j][mb_best_next_action[j]]
                for j in range(len(mb_rewards))
            ]

            # Update the Q value estimator parameters by minimizing the error between
            # the Q network predictions and the Q-learning targets.
            loss = agent.train(mb_obs, mb_actions, mb_td_target, rnn_state_train)
            i_train += 1

            # If test_interval > 0, the learned model is evaluated every test_interval gradient updates
            if test_interval > 0 and i_train > 0 and (i_train % test_interval == 0):
                ep_return = agent.test_run(test_env, n_eps=10, n_pipes=2000)
                with open(result_path, "a") as csvfile:
                    writer = csv.writer(csvfile)
                    ep_return[0:0] = [i_sample, i_train]
                    writer.writerow(ep_return)

        if done:
            # Reset the environment and decay the exploration rate
            next_obs = env.reset()
            next_obs = normalize_obs(next_obs)
            epsilon *= epsilon_decay

        obs = next_obs
        rnn_state0 = next_rnn_state

    # Save the final model when training is finished.
    if save_model:
        agent.save('final_model')
        logger.info('Finished Training. Saving final model.')

    if rew_results_path is not None:
        logger.info('Save reward trajectory to %s' % rew_results_path)
        with open(rew_results_path, "a") as csvfile:
            writer = csv.writer(csvfile)
            traj = np.asanyarray(rew_traj).reshape(-1).tolist()
            traj[0:0] = [np.mean(traj)]  # i_train, i_sample
            writer.writerow(traj)

    logger.info('*******************************************************')
    logger.info('Total number of interactions with the environment: %s' % i_sample)
    logger.info('Total number of parameter updates during training: %s' % i_train)
    logger.info('*******************************************************\n')

    return early_stopped, i_sample
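

# The target network above is refreshed via agent.update_target(agent.target_ops); how
# those ops are constructed is not part of this file. As a rough reference, a common
# TF1-style formulation soft-updates every target variable towards its online counterpart.
# The sketch below is hypothetical: it assumes the first half of the trainable variables
# belongs to the online network and the second half to the target network, and the exact
# mixing convention used by DQNAgent may differ.
def make_target_update_ops_ref(tf_vars, tau):
    n_online = len(tf_vars) // 2
    update_ops = []
    for i, online_var in enumerate(tf_vars[:n_online]):
        target_var = tf_vars[i + n_online]
        # target <- (1 - tau) * online + tau * target  (soft update; convention assumed)
        update_ops.append(target_var.assign((1.0 - tau) * online_var + tau * target_var))
    # The returned ops would then be executed once per update step, e.g. sess.run(update_ops).
    return update_ops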