def train(self, sess, saver, summary_writer, progress_fd, model_path,
          batch_size=64, step=10, start_episode=0, train_episodes=1000,
          save_episodes=100, epsilon=0.3, apply_her=False, n_goals=10):
    total_rewards = []
    # Initialize the target critic from the online critic.
    sess.run([self.init_critic])
    for i_episode in tqdm(range(train_episodes), ncols=100):
        states, actions, returns, nexts, are_non_terminal, total_reward = \
            self.collect_trajectory()
        feed_dict = {self.states: states,
                     self.actions: actions,
                     self.rewards: returns,
                     self.nexts: nexts,
                     self.are_non_terminal: are_non_terminal,
                     self.training: True}
        total_rewards.append(total_reward)
        # Train the critic `step` times on this trajectory, then refresh the
        # target critic and take one actor update.
        for s in range(step):
            sess.run([self.critic_step], feed_dict=feed_dict)
        sess.run([self.update_critic])
        sess.run([self.actor_step], feed_dict=feed_dict)
        # summary_writer.add_summary(summary, global_step=self.global_step.eval())
        critic_loss = self.critic_loss.eval(feed_dict=feed_dict).mean()
        actor_loss = self.actor_loss.eval(feed_dict=feed_dict).mean()
        append_summary(progress_fd,
                       str(start_episode + i_episode)
                       + ",{0:.2f}".format(total_reward)
                       + ",{0:.4f}".format(actor_loss)
                       + ",{0:.4f}".format(critic_loss))
        if (i_episode + 1) % save_episodes == 0:
            saver.save(sess, model_path)
    return total_rewards
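# `append_summary` is called throughout for CSV-style progress logging but is not
# defined in this excerpt. A minimal sketch of what it might look like, assuming it
# only writes one row and flushes so interrupted runs still leave a readable log
# (hypothetical helper, not the repository's definition):
def append_summary(progress_fd, line):
    progress_fd.write(line + '\n')
    progress_fd.flush()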
def train(self, sess, saver, summary_writer, progress_fd, model_path,
          batch_size=64, step=10, start_episode=0, train_episodes=1000,
          save_episodes=100, epsilon=0.3, apply_her=False, n_goals=10):
    total_rewards = []
    # Initialize the target actor and critic from the online networks.
    sess.run([self.init_actor, self.init_critic])
    for i_episode in tqdm(range(train_episodes), ncols=100):
        total_reward = self.collect_trajectory(epsilon, apply_her, n_goals)
        append_summary(progress_fd,
                       str(start_episode + i_episode) + ',{0:.2f}'.format(total_reward))
        total_rewards.append(total_reward)
        # Draw `step` mini-batches from the replay buffer in one call.
        states, actions, rewards, nexts, are_non_terminal = \
            self.replay_memory.sample_batch(step * batch_size)
        for t in range(step):
            batch = slice(t * batch_size, (t + 1) * batch_size)
            sess.run([self.critic_step],
                     feed_dict={self.states: states[batch],
                                self.actions: actions[batch],
                                self.rewards: rewards[batch],
                                self.nexts: nexts[batch],
                                self.are_non_terminal: are_non_terminal[batch],
                                self.training: True})
            sess.run([self.actor_step],
                     feed_dict={self.states: states[batch],
                                self.training: True})
        # Soft-update the target networks once per episode.
        sess.run([self.update_actor, self.update_critic])
        # summary_writer.add_summary(summary, global_step=self.global_step.eval())
        if (i_episode + 1) % save_episodes == 0:
            saver.save(sess, model_path)
    return total_rewards
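# The loop above relies on `self.replay_memory.sample_batch`, which is not shown in
# this excerpt. A minimal sketch of such a buffer, assuming transitions are stored as
# (state, action, reward, next_state, is_non_terminal) tuples and sampled uniformly
# (hypothetical class; only the sample_batch signature mirrors the call above):
import random
from collections import deque

import numpy as np


class ReplayMemory:
    def __init__(self, capacity=int(1e6)):
        self.buffer = deque(maxlen=capacity)

    def append(self, state, action, reward, next_state, is_non_terminal):
        self.buffer.append((state, action, reward, next_state, is_non_terminal))

    def sample_batch(self, batch_size):
        # Uniform sampling without replacement, capped at the current buffer size.
        batch = random.sample(self.buffer, min(batch_size, len(self.buffer)))
        states, actions, rewards, nexts, are_non_terminal = map(np.asarray, zip(*batch))
        return states, actions, rewards, nexts, are_non_terminal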
def train(self, sess, saver, summary_writer, progress_fd, model_path,
          batch_size=64, step=10, start_episode=0, train_episodes=1000,
          save_episodes=100, epsilon=0.3, max_episode_len=25):
    total_rewards = []
    # Initialize each agent's target Q-network from its online network.
    sess.run([agent.init_qnetwork for agent in self.agents])
    for i_episode in tqdm(range(train_episodes), ncols=100):
        # Anneal epsilon over the first half of training.
        cur_epsilon = self.linear_decay_epsilon(i_episode, train_episodes * 0.5, epsilon)
        total_reward = self.collect_trajectory(cur_epsilon, max_episode_len)
        append_summary(progress_fd,
                       str(start_episode + i_episode) + ',{0:.2f}'.format(total_reward))
        total_rewards.append(total_reward)
        for agent in self.agents:
            states, actions, rewards, nexts, are_non_terminal = \
                agent.replay_memory.sample_batch(step * batch_size)
            for t in range(step):
                batch = slice(t * batch_size, (t + 1) * batch_size)
                sess.run([agent.step],
                         feed_dict={agent.states: states[batch],
                                    agent.actions: actions[batch],
                                    agent.rewards: rewards[batch],
                                    agent.nexts: nexts[batch],
                                    agent.are_non_terminal: are_non_terminal[batch],
                                    agent.training: True})
            sess.run([agent.update_qnetwork])
        # summary_writer.add_summary(summary, global_step=self.global_step.eval())
        if (i_episode + 1) % save_episodes == 0:
            saver.save(sess, model_path)
    return total_rewards
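# `linear_decay_epsilon` is referenced above but not defined in this excerpt. One
# plausible implementation, assuming epsilon is annealed linearly from 1.0 down to the
# configured floor over `decay_episodes` and held constant afterwards (hypothetical
# method on the same class, not the repository's definition):
def linear_decay_epsilon(self, i_episode, decay_episodes, min_epsilon):
    if i_episode >= decay_episodes:
        return min_epsilon
    frac = i_episode / float(decay_episodes)
    return 1.0 - frac * (1.0 - min_epsilon)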
def train(self, sess, saver, summary_writer, progress_fd, model_path,
          batch_size=64, step=10, start_episode=0, train_episodes=1000,
          save_episodes=100, epsilon=0.3, apply_her=False, n_goals=10,
          train_steps=-1):
    total_rewards = []
    n_step = 0
    for i_episode in tqdm(range(train_episodes), ncols=100):
        (states_mem, actions_mem, action_loglikelihood_mem,
         returns_mem, advantage_mem, epi_avg_reward) = self.collect_transitions(sess)
        # self.global_step.assign_add(1)
        # Run `step` epochs of shuffled mini-batch updates over the rollout.
        for s in range(step):
            perm = np.random.permutation(len(states_mem))
            for sample_id in range(0, len(perm), batch_size):
                batch = perm[sample_id:sample_id + batch_size]
                feed_dict = {self.states: states_mem[batch],
                             self.actions: actions_mem[batch],
                             self.action_loglikelihood: action_loglikelihood_mem[batch],
                             self.returns: returns_mem[batch],
                             self.advantages: advantage_mem[batch],
                             self.training: True}
                sess.run([self.actor_step, self.critic_step], feed_dict=feed_dict)
        n_step += len(states_mem)
        if epi_avg_reward is not None:
            append_summary(progress_fd,
                           str(start_episode + i_episode)
                           + ",{0:.2f}".format(epi_avg_reward)
                           + ",{}".format(n_step))
            total_rewards.append(epi_avg_reward)
        if (i_episode + 1) % save_episodes == 0:
            saver.save(sess, model_path)
    return total_rewards
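# `collect_transitions` is assumed to return flattened rollout arrays together with the
# discounted returns and advantages consumed above. A sketch of a generalized advantage
# estimation (GAE) computation that would produce such `returns_mem` / `advantage_mem`
# arrays, with illustrative gamma/lam defaults (hypothetical helper, not necessarily
# how this repository computes them):
import numpy as np


def compute_gae(rewards, values, last_value, are_non_terminal, gamma=0.99, lam=0.95):
    # `rewards`, `values`, `are_non_terminal` are 1-D NumPy arrays over one rollout;
    # `last_value` bootstraps the value of the state after the final step.
    T = len(rewards)
    advantages = np.zeros(T)
    gae = 0.0
    next_value = last_value
    for t in reversed(range(T)):
        # TD residual, masked at terminal steps.
        delta = rewards[t] + gamma * next_value * are_non_terminal[t] - values[t]
        gae = delta + gamma * lam * are_non_terminal[t] * gae
        advantages[t] = gae
        next_value = values[t]
    returns = advantages + values
    return returns, advantages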
def train(self, sess, saver, summary_writer, progress_fd, model_path, filter_path,
          batch_size=64, step=10, start_episode=0, train_episodes=1000,
          save_episodes=100, max_episode_len=25, **kargs):
    total_rewards = []
    n_step = 0
    for i_episode in tqdm(range(train_episodes), ncols=100):
        (states_mem, actions_mem, action_loglikelihood_mem,
         returns_mem, advantage_mem, epi_avg_reward) = \
            self.collect_transitions(sess, max_episode_len)
        # Run `step` epochs of shuffled mini-batch updates for all agents.
        for s in range(step):
            perm = np.random.permutation(len(states_mem))
            for sample_id in range(0, len(perm), batch_size):
                batch = perm[sample_id:sample_id + batch_size]
                feed_dict = {self.states: states_mem[batch],
                             self.actions: actions_mem[batch],
                             self.action_loglikelihood: action_loglikelihood_mem[batch],
                             self.returns: returns_mem[batch],
                             self.advantages: advantage_mem[batch],
                             self.training: True}
                sess.run(self.actor_step_list + self.critic_step_list, feed_dict=feed_dict)
        n_step += len(states_mem)
        append_summary(progress_fd,
                       str(start_episode + i_episode)
                       + ",{0:.2f}".format(epi_avg_reward)
                       + ",{}".format(n_step))
        total_rewards.append(epi_avg_reward)
        if (i_episode + 1) % save_episodes == 0:
            saver.save(sess, model_path)
            self.save_state_filter(filter_path)
    return total_rewards
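# `save_state_filter` / `load_state_filter` suggest an observation normalizer whose
# running statistics live outside the TensorFlow graph and are checkpointed separately.
# A minimal sketch, assuming a running mean/variance filter persisted with NumPy and a
# `filter_path` ending in '.npz' (hypothetical class; only the save/load round-trip
# mirrors the calls above):
import numpy as np


class StateFilter:
    def __init__(self, shape):
        self.n = 0
        self.mean = np.zeros(shape)
        self.var = np.ones(shape)

    def __call__(self, x):
        # Welford-style update of the running mean/variance, then normalize.
        self.n += 1
        delta = x - self.mean
        self.mean += delta / self.n
        self.var += (delta * (x - self.mean) - self.var) / self.n
        return (x - self.mean) / (np.sqrt(self.var) + 1e-8)

    def save(self, path):
        np.savez(path, n=self.n, mean=self.mean, var=self.var)

    def load(self, path):
        data = np.load(path)
        self.n, self.mean, self.var = int(data['n']), data['mean'], data['var']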
saver = tf.train.Saver()
summary_writer = tf.summary.FileWriter(log_path, graph=tf.get_default_graph())
with tf.Session(config=config) as sess:
    if args.eval or args.restore:
        saver.restore(sess, model_path)
        if not args.eval:
            # Resuming training: count the episodes already logged, then append.
            progress_fd = open(progress_file, 'r')
            start_episode = len(progress_fd.readlines()) - 1
            progress_fd.close()
            progress_fd = open(progress_file, 'a')
    else:
        # Fresh run: write the CSV header and initialize all variables
        # (restored runs keep their checkpointed weights).
        progress_fd = open(progress_file, 'w')
        append_summary(progress_fd, 'episode, avg-reward, n_step')
        progress_fd.flush()
        start_episode = 0
        tf.global_variables_initializer().run()
    if not args.eval:
        total_rewards = agent.train(sess, saver, summary_writer, progress_fd, model_path,
                                    batch_size=args.batch_size,
                                    step=args.step,
                                    train_episodes=args.train_episodes,
                                    start_episode=start_episode,
                                    save_episodes=args.save_episodes,
                                    epsilon=args.epsilon,
config = tf.ConfigProto(gpu_options=gpu_ops, allow_soft_placement=True)
saver = tf.train.Saver()
summary_writer = tf.summary.FileWriter(log_path, graph=tf.get_default_graph())
with tf.Session(config=config) as sess:
    if args.eval or args.restore:
        saver.restore(sess, model_path)
        agent.load_state_filter(filter_path)
        if not args.eval:
            # Resuming training: count the episodes already logged, then append.
            progress_fd = open(progress_file, 'r')
            start_episode = len(progress_fd.readlines()) - 1
            progress_fd.close()
            progress_fd = open(progress_file, 'a')
    else:
        # Fresh run: write the CSV header and initialize all variables
        # (restored runs keep their checkpointed weights).
        progress_fd = open(progress_file, 'w')
        append_summary(progress_fd, 'episode, first-agent-reward')
        progress_fd.flush()
        start_episode = 0
        tf.global_variables_initializer().run()
    if not args.eval:
        total_rewards = agent.train(sess, saver, summary_writer, progress_fd,
                                    model_path, filter_path,
                                    batch_size=args.batch_size,
                                    step=args.step,
                                    train_episodes=args.train_episodes,
                                    start_episode=start_episode,
                                    save_episodes=args.save_episodes,
                                    max_episode_len=args.max_episode_len)
        progress_fd.close()
        plot(os.path.join(args.plot_dir, args.model + '_' + args.env),
             np.array(total_rewards) + 1e-10)
        summary_writer.close()
    else:
        if args.benchmark:
            infos = []
            n_epi = 400
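# `plot` is called above to save a learning curve but is not defined in this excerpt.
# A minimal sketch, assuming it writes a per-episode reward curve (plus a smoothed
# running mean) to `<path>.png` with matplotlib (hypothetical helper, not the
# repository's definition):
import matplotlib
matplotlib.use('Agg')  # headless backend so plotting works on servers
import matplotlib.pyplot as plt
import numpy as np


def plot(path, rewards):
    rewards = np.asarray(rewards)
    plt.figure()
    plt.plot(rewards, alpha=0.4, label='episode reward')
    window = 10
    if len(rewards) >= window:
        smoothed = np.convolve(rewards, np.ones(window) / window, mode='valid')
        plt.plot(np.arange(window - 1, len(rewards)), smoothed, label='10-episode mean')
    plt.xlabel('episode')
    plt.ylabel('total reward')
    plt.legend()
    plt.savefig(path + '.png')
    plt.close()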