import numpy as np
import torch


class Agent:
    """PPO agent with GAE for continuous-action Gym environments.

    Relies on the Actor and Critic classes defined elsewhere in this code.
    """

    def __init__(self, env, gamma, gae_lambda, batch_size, lr_rate,
                 ratio_clipping, epochs):
        self.env = env
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.action_bound = env.action_space.high[0]
        self.gamma = gamma
        self.gae_lambda = gae_lambda
        self.batch_size = batch_size
        self.epochs = epochs
        self.actor = Actor(self.state_dim, self.action_dim, self.action_bound,
                           lr_rate[0], ratio_clipping)
        self.critic = Critic(self.state_dim, lr_rate[1])
        self.save_epi_reward = []

    def gae_target(self, rewards, v_values, next_v_value, done):
        """Compute GAE advantages and the corresponding n-step TD targets."""
        n_step_targets = torch.zeros_like(rewards)
        gae = torch.zeros_like(rewards)
        gae_cumulative = 0.
        forward_val = 0.
        if not done:
            # Bootstrap from the value of the state following the batch
            forward_val = next_v_value
        for k in reversed(range(0, len(rewards))):
            delta = rewards[k] + self.gamma * forward_val - v_values[k]
            gae_cumulative = self.gamma * self.gae_lambda * gae_cumulative + delta
            gae[k] = gae_cumulative
            forward_val = v_values[k]
            n_step_targets[k] = gae[k] + v_values[k]
        return gae, n_step_targets

    def unpack_batch(self, batch):
        """Concatenate a list of (1, dim) tensors into one (batch, dim) tensor."""
        return torch.cat(batch, dim=0)

    def train(self, max_episode_num, save_path, save_names):
        batch_state, batch_action, batch_reward = [], [], []
        batch_log_old_policy_pdf = []

        for episode in range(max_episode_num):
            time, episode_reward, done = 0, 0, False
            state = self.env.reset()
            state = torch.from_numpy(state).type(torch.FloatTensor)

            while not done:
                # self.env.render()
                mu_old, std_old, action = self.actor.get_policy_action(state)
                action = np.array([action.item()])
                mu_old = np.array([mu_old.item()])
                std_old = np.array([std_old.item()])
                action = np.clip(action, -self.action_bound, self.action_bound)

                # Log-density of the action under the old Gaussian policy,
                # needed later for the PPO probability ratio
                var_old = std_old**2
                log_old_policy_pdf = -0.5 * (action - mu_old)**2 / var_old \
                    - 0.5 * np.log(var_old * 2 * np.pi)
                log_old_policy_pdf = np.sum(log_old_policy_pdf)

                next_state, reward, done, _ = self.env.step(action)
                next_state = torch.from_numpy(next_state).type(torch.FloatTensor)
                action = torch.from_numpy(action).type(torch.FloatTensor)
                reward = torch.FloatTensor([reward])
                log_old_policy_pdf = torch.FloatTensor([log_old_policy_pdf])

                state = state.view(1, self.state_dim)
                next_state = next_state.view(1, self.state_dim)
                action = action.view(1, self.action_dim)
                reward = reward.view(1, 1)
                log_old_policy_pdf = log_old_policy_pdf.view(1, 1)

                batch_state.append(state)
                batch_action.append(action)
                # Rescale the reward (Pendulum rewards lie roughly in [-16, 0])
                batch_reward.append((reward + 8) / 8)
                batch_log_old_policy_pdf.append(log_old_policy_pdf)

                # Keep collecting transitions until a full batch is available
                if len(batch_state) < self.batch_size:
                    state = next_state[0]
                    episode_reward += reward[0]
                    time += 1
                    continue

                states = self.unpack_batch(batch_state)
                actions = self.unpack_batch(batch_action)
                rewards = self.unpack_batch(batch_reward)
                log_old_policy_pdfs = self.unpack_batch(batch_log_old_policy_pdf)
                batch_state, batch_action, batch_reward = [], [], []
                batch_log_old_policy_pdf = []

                v_values = self.critic.get_value(states)
                next_v_value = self.critic.get_value(next_state)
                gaes, y_i = self.gae_target(rewards, v_values, next_v_value, done)

                # Several epochs of clipped-surrogate updates on the same batch
                for _ in range(self.epochs):
                    self.actor.update(states, actions, gaes, log_old_policy_pdfs)
                    self.critic.update(states, y_i)

                state = next_state[0]
                episode_reward += reward[0]
                time += 1

            self.save_epi_reward.append(episode_reward.item())
            # The [-20:] slice also handles the first 20 episodes correctly
            print('Episode:', episode + 1, 'Time:', time,
                  'Reward (avg of recent 20):',
                  np.mean(self.save_epi_reward[-20:]))

            if episode % 10 == 0:
                self.actor.save(save_path, save_names[0])
                self.critic.save(save_path, save_names[1])
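A minimal driver for the PyTorch PPO agent above; this sketch is not part of the original code. The environment name, hyperparameter values, and file names are illustrative assumptions; only the Agent constructor and train() signature come from the listing. Pendulum-v0 is assumed because the (reward + 8) / 8 rescaling and the old-style reset()/step() API fit that task.

import gym

env = gym.make('Pendulum-v0')  # old Gym API: reset() -> state, step() -> 4-tuple
agent = Agent(env,
              gamma=0.95,             # discount factor
              gae_lambda=0.9,         # GAE smoothing parameter
              batch_size=32,          # transitions collected per update
              lr_rate=(1e-4, 1e-3),   # (actor, critic) learning rates
              ratio_clipping=0.2,     # PPO clip range
              epochs=10)              # update epochs per batch
agent.train(max_episode_num=1000,
            save_path='./save_weights/',
            save_names=('ppo_actor.pt', 'ppo_critic.pt'))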
import itertools
import os

import numpy as np
import tensorflow as tf  # TensorFlow 1.x graph-mode API


class Agent:

    def __init__(self, sess, config, environment):
        # Store the session, config, and environment
        self.sess = sess
        self.config = config
        self.environment = environment

        self.init_dirs()
        self.init_cur_episode()
        self.init_global_step()
        self.init_summaries()

        # Build the graph, which contains two networks: Actor and Critic
        self.actor = Actor(sess, self.environment.n_actions,
                           self.environment.state_shape, config)
        self.critic = Critic(sess, self.environment.state_shape, config)

        # Initialize all variables
        self.init = tf.group(tf.global_variables_initializer(),
                             tf.local_variables_initializer())
        self.sess.run(self.init)

        self.saver = tf.train.Saver(max_to_keep=10)
        self.summary_writer = tf.summary.FileWriter(self.summary_dir,
                                                    self.sess.graph)

        if config.is_train and not config.cont_training:
            pass
        elif config.is_train and config.cont_training:
            self.load()
        elif config.is_play:
            self.load()
        else:
            raise Exception("Please set a proper mode for training or playing")

    def load(self):
        latest_checkpoint = tf.train.latest_checkpoint(self.checkpoint_dir)
        if latest_checkpoint:
            print("Loading model checkpoint {}...\n".format(latest_checkpoint))
            self.saver.restore(self.sess, latest_checkpoint)

    def save(self):
        self.saver.save(self.sess, self.checkpoint_dir, self.global_step_tensor)

    def init_dirs(self):
        # Create directories for checkpoints and summaries
        self.checkpoint_dir = os.path.join(self.config.experiment_dir,
                                           "checkpoints/")
        self.summary_dir = os.path.join(self.config.experiment_dir,
                                        "summaries/")

    def init_cur_episode(self):
        """Create a current-episode tensor so training progress survives restarts"""
        with tf.variable_scope('cur_episode'):
            self.cur_episode_tensor = tf.Variable(-1, trainable=False,
                                                  name='cur_episode')
            self.cur_episode_input = tf.placeholder('int32', None,
                                                    name='cur_episode_input')
            self.cur_episode_assign_op = self.cur_episode_tensor.assign(
                self.cur_episode_input)

    def init_global_step(self):
        """Create a global step variable as a reference to the number of iterations"""
        with tf.variable_scope('step'):
            self.global_step_tensor = tf.Variable(0, trainable=False,
                                                  name='global_step')
            self.global_step_input = tf.placeholder('int32', None,
                                                    name='global_step_input')
            self.global_step_assign_op = self.global_step_tensor.assign(
                self.global_step_input)

    def init_summaries(self):
        """Create the summary part of the graph"""
        with tf.variable_scope('summary'):
            self.summary_placeholders = {}
            self.summary_ops = {}
            self.scalar_summary_tags = [
                'episode.total_reward', 'episode.length',
                'evaluation.total_reward', 'evaluation.length', 'epsilon'
            ]
            for tag in self.scalar_summary_tags:
                self.summary_placeholders[tag] = tf.placeholder(
                    'float32', None, name=tag)
                self.summary_ops[tag] = tf.summary.scalar(
                    tag, self.summary_placeholders[tag])

    def add_summary(self, summaries_dict, step):
        """Write the given scalar summaries to TensorBoard"""
        summary_list = self.sess.run(
            [self.summary_ops[tag] for tag in summaries_dict.keys()], {
                self.summary_placeholders[tag]: value
                for tag, value in summaries_dict.items()
            })
        for summary in summary_list:
            self.summary_writer.add_summary(summary, step)
        self.summary_writer.flush()

    def take_action(self, state):
        """Sample an action from the actor's policy probabilities"""
        action_probs = self.actor.predict(state)
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        return action

    def observe(self, action):
        """Step the environment and return the new state, reward, and done flag"""
        return self.environment.step(action)

    def train_episodic(self):
        """Train the agent episode by episode"""
        for cur_episode in range(
                self.cur_episode_tensor.eval(self.sess) + 1,
                self.config.num_episodes, 1):

            # Save the current checkpoint
            self.save()

            # Update the current-episode tensor
            self.cur_episode_assign_op.eval(
                session=self.sess,
                feed_dict={
                    self.cur_episode_input:
                    self.cur_episode_tensor.eval(self.sess) + 1
                })

            state = self.environment.reset()
            total_reward = 0

            # Take steps in the environment until the terminal state of the episode
            for t in itertools.count():
                # Update the global step
                self.global_step_assign_op.eval(
                    session=self.sess,
                    feed_dict={
                        self.global_step_input:
                        self.global_step_tensor.eval(self.sess) + 1
                    })

                # Take an action
                action = self.take_action(state)
                next_state, reward, done = self.observe(
                    self.environment.valid_actions[action])

                # Calculate the TD target and TD error
                value_next = self.critic.predict(next_state)
                td_target = reward + self.config.discount_factor * value_next
                td_error = td_target - self.critic.predict(state)

                # Update the critic towards the TD target
                self.critic.update(state, td_target)

                # Update the actor, using the TD error as the advantage estimate
                # TODO: research the best advantage estimate
                self.actor.update(state, action, td_error)

                total_reward += reward

                if done:
                    # Terminal state: log episode summaries and exit
                    summaries_dict = {
                        'episode.total_reward': total_reward,
                        'episode.length': t
                    }
                    self.add_summary(summaries_dict,
                                     self.global_step_tensor.eval(self.sess))
                    break

                state = next_state

        print("Training Finished")
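A sketch of how this TensorFlow 1.x agent might be driven. Config and DiscreteEnv below are hypothetical stand-ins, not part of the original code: only the attributes the Agent actually reads (experiment_dir, the mode flags, num_episodes, discount_factor, n_actions, state_shape, valid_actions, reset(), step()) are assumed, and their values are illustrative.

import tensorflow as tf

class Config:
    # Hypothetical config; attribute names match what Agent reads above
    experiment_dir = './experiments/actor_critic'
    is_train = True
    cont_training = False
    is_play = False
    num_episodes = 5000
    discount_factor = 0.99

with tf.Session() as sess:
    environment = DiscreteEnv()  # hypothetical wrapper exposing n_actions,
                                 # state_shape, valid_actions, reset(), step()
    agent = Agent(sess, Config(), environment)
    agent.train_episodic()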