def _train(self, env, policy, pool):
    """Perform RL training.

    Args:
        env (`rllab.Env`): Environment used for training
        policy (`Policy`): Policy used for training
        pool (`PoolBase`): Sample pool to add samples to
    """
    self._init_training()
    self.sampler.initialize(env, policy, pool)

    evaluation_env = deep_clone(env) if self._eval_n_episodes else None

    with tf_utils.get_default_session().as_default():
        gt.rename_root('RLAlgorithm')
        gt.reset()
        gt.set_def_unique(False)

        for epoch in gt.timed_for(
                range(self._n_epochs + 1), save_itrs=True):
            logger.push_prefix('Epoch #%d | ' % epoch)

            for t in range(self._epoch_length):
                self.sampler.sample()
                if not self.sampler.batch_ready():
                    continue
                gt.stamp('sample')

                for i in range(self._n_train_repeat):
                    self._do_training(
                        iteration=t + epoch * self._epoch_length,
                        batch=self.sampler.random_batch())
                gt.stamp('train')

            self._evaluate(policy, evaluation_env)
            gt.stamp('eval')

            params = self.get_snapshot(epoch)
            logger.save_itr_params(epoch, params)

            time_itrs = gt.get_times().stamps.itrs
            time_eval = time_itrs['eval'][-1]
            time_total = gt.get_times().total
            time_train = time_itrs.get('train', [0])[-1]
            time_sample = time_itrs.get('sample', [0])[-1]

            logger.record_tabular('time-train', time_train)
            logger.record_tabular('time-eval', time_eval)
            logger.record_tabular('time-sample', time_sample)
            logger.record_tabular('time-total', time_total)
            logger.record_tabular('epoch', epoch)

            self.sampler.log_diagnostics()

            logger.dump_tabular(with_prefix=False)
            logger.pop_prefix()
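# A minimal, self-contained sketch of the gtimer pattern used above:
# timed_for() saves per-iteration times, stamp() marks named phases, and
# get_times().stamps.itrs holds the recorded intervals per phase. Phase
# names and workloads here are illustrative stand-ins, not the algorithm's.
import gtimer as gt

gt.reset()
gt.set_def_unique(False)
for epoch in gt.timed_for(range(3), save_itrs=True):
    sum(range(10000))   # stand-in for sampling work
    gt.stamp('sample')
    sum(range(20000))   # stand-in for training work
    gt.stamp('train')
itrs = gt.get_times().stamps.itrs
print('last train interval:', itrs['train'][-1])
print('total elapsed:', gt.get_times().total)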
def train(self):
    # This seems like a rather sequential method
    input_shapes = dims_to_shapes(self.input_dims)
    pool = ReplayBuffer(
        buffer_shapes=input_shapes,
        max_buffer_size=self.replay_pool_size,
    )
    self.start_worker()
    self.init_opt()
    itr = 0
    path_length = 0
    path_return = 0
    terminal = False
    observation = self.env.reset()

    sample_policy = pickle.loads(pickle.dumps(self.policy))

    for epoch in range(self.n_epochs):
        logger.push_prefix('epoch #%d | ' % epoch)
        logger.log("Training started")
        for epoch_itr in pyprind.prog_bar(range(self.epoch_length)):
            # Execute policy
            if terminal:  # or path_length > self.max_path_length:
                # Note that if the last time step ends an episode, the very
                # last state and observation will be ignored and not added
                # to the replay pool
                observation = self.env.reset()
                self.es.reset()
                sample_policy.reset()
                self.es_path_returns.append(path_return)
                path_length = 0
                path_return = 0
            action = self.es.get_action(
                itr, observation, policy=sample_policy)

            next_observation, reward, terminal, _ = self.env.step(action)
            path_length += 1
            path_return += reward

            if not terminal and path_length >= self.max_path_length:
                terminal = True
                # only include the terminal transition in this case if the
                # flag was set
                if self.include_horizon_terminal_transitions:
                    pool.add_transition(
                        observation=observation,
                        action=action,
                        reward=reward * self.scale_reward,
                        terminal=terminal,
                        next_observation=next_observation)
            else:
                pool.add_transition(
                    observation=observation,
                    action=action,
                    reward=reward * self.scale_reward,
                    terminal=terminal,
                    next_observation=next_observation)

            observation = next_observation

            if pool.size >= self.min_pool_size:
                for update_itr in range(self.n_updates_per_sample):
                    # Train policy
                    batch = pool.sample(self.batch_size)
                    self.do_training(itr, batch)
                sample_policy.set_param_values(
                    self.policy.get_param_values())

            itr += 1

        logger.log("Training finished")
        if pool.size >= self.min_pool_size:
            self.evaluate(epoch, pool)
        params = self.get_epoch_snapshot(epoch)
        logger.save_itr_params(epoch, params)
        logger.dump_tabular(with_prefix=False)
        logger.pop_prefix()
        if self.plot:
            self.update_plot()
            if self.pause_for_plot:
                input("Plotting evaluation run: Press Enter to "
                      "continue...")

    self.env.close()
    self.policy.terminate()
    self.plotter.close()
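# A NumPy sketch of the ring-buffer behavior assumed of `pool` above:
# add_transition() overwrites the oldest entries once the maximum size is
# reached, and sample() draws a uniform random batch. The class name and
# fields are illustrative; this is not garage's ReplayBuffer API.
import numpy as np

class MiniReplayBuffer:
    def __init__(self, obs_dim, act_dim, max_size):
        self.obs = np.zeros((max_size, obs_dim))
        self.act = np.zeros((max_size, act_dim))
        self.rew = np.zeros(max_size)
        self.max_size, self.ptr, self.size = max_size, 0, 0

    def add_transition(self, observation, action, reward):
        # Write at the current pointer, then advance it cyclically.
        self.obs[self.ptr] = observation
        self.act[self.ptr] = action
        self.rew[self.ptr] = reward
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample(self, batch_size):
        # Uniform random minibatch over the filled portion of the buffer.
        idx = np.random.randint(0, self.size, size=batch_size)
        return dict(observations=self.obs[idx],
                    actions=self.act[idx],
                    rewards=self.rew[idx])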
tabular_log_file = osp.join(log_dir, 'process.csv')
text_log_file = osp.join(log_dir, 'text.txt')
params_log_file = osp.join(log_dir, 'args.txt')

logger.set_snapshot_dir(log_dir)
logger.set_snapshot_mode(args.snapshot_mode)
logger.set_snapshot_gap(args.snapshot_gap)
logger.log_parameters_lite(params_log_file, args)
if trial > 0:
    old_log_dir = args.log_dir + '/' + str(trial - 1)
    logger.pop_prefix()
    logger.remove_text_output(osp.join(old_log_dir, 'text.txt'))
    logger.remove_tabular_output(osp.join(old_log_dir, 'process.csv'))
logger.add_text_output(text_log_file)
logger.add_tabular_output(tabular_log_file)
logger.push_prefix("[" + args.exp_name + '_trial ' + str(trial) + "]")

np.random.seed(trial)

# Instantiate the garage objects
top_paths = BPQ.BoundedPriorityQueue(top_k)
algo = MCTS(env=env,
            stress_test_num=stress_test_num,
            max_path_length=max_path_length,
            ec=ec,
            n_itr=args.n_itr,
            k=k,
            alpha=alpha,
            clear_nodes=True,
            log_interval=args.log_interval,
            top_paths=top_paths,
log_dir = args.log_dir

tabular_log_file = osp.join(log_dir, args.tabular_log_file)
text_log_file = osp.join(log_dir, args.text_log_file)
params_log_file = osp.join(log_dir, args.params_log_file)

logger.log_parameters_lite(params_log_file, args)
# logger.add_text_output(text_log_file)
logger.add_tabular_output(tabular_log_file)
prev_snapshot_dir = logger.get_snapshot_dir()
prev_mode = logger.get_snapshot_mode()
logger.set_snapshot_dir(log_dir)
logger.set_snapshot_mode(args.snapshot_mode)
logger.set_snapshot_gap(args.snapshot_gap)
logger.set_log_tabular_only(args.log_tabular_only)
logger.push_prefix("[%s] " % args.exp_name)

seed = 0
top_k = 10
max_path_length = 100

top_paths = BPQ.BoundedPriorityQueue(top_k)
np.random.seed(seed)
tf.set_random_seed(seed)
with tf.Session() as sess:
    # Create env
    data = joblib.load("../CartPole/ControlPolicy/itr_5.pkl")
    sut = data['policy']
    reward_function = ASTRewardS()
def train(self, sess=None):
    created_session = True if (sess is None) else False
    if sess is None:
        sess = tf.Session()
        sess.__enter__()

    sess.run(tf.global_variables_initializer())

    cur_std = self.sigma0
    cur_mean = self.policy.get_param_values()
    es = cma.CMAEvolutionStrategy(cur_mean, cur_std)

    parallel_sampler.populate_task(self.env, self.policy)
    if self.plot:
        self.plotter.init_plot(self.env, self.policy)

    itr = 0
    while itr < self.n_itr and not es.stop():
        if self.batch_size is None:
            # Sample from multivariate normal distribution.
            xs = es.ask()
            xs = np.asarray(xs)
            # For each sample, do a rollout.
            infos = (stateful_pool.singleton_pool.run_map(
                sample_return, [(x, self.max_path_length, self.discount)
                                for x in xs]))
        else:
            cum_len = 0
            infos = []
            xss = []
            done = False
            while not done:
                sbs = stateful_pool.singleton_pool.n_parallel * 2
                # Sample from multivariate normal distribution.
                # You want to ask for sbs samples here.
                xs = es.ask(sbs)
                xs = np.asarray(xs)
                xss.append(xs)
                sinfos = stateful_pool.singleton_pool.run_map(
                    sample_return,
                    [(x, self.max_path_length, self.discount) for x in xs])
                for info in sinfos:
                    infos.append(info)
                    cum_len += len(info['returns'])
                    if cum_len >= self.batch_size:
                        xs = np.concatenate(xss)
                        done = True
                        break

        # Evaluate fitness of samples (negative as it is a minimization
        # problem).
        fs = -np.array([info['returns'][0] for info in infos])
        # When batching, you could have generated too many samples compared
        # to the actual evaluations. So we cut it off in this case.
        xs = xs[:len(fs)]

        # Update CMA-ES params based on sample fitness.
        es.tell(xs, fs)

        logger.push_prefix('itr #%d | ' % itr)
        logger.record_tabular('Iteration', itr)
        logger.record_tabular('CurStdMean', np.mean(cur_std))
        undiscounted_returns = np.array(
            [info['undiscounted_return'] for info in infos])
        logger.record_tabular('AverageReturn',
                              np.mean(undiscounted_returns))
        logger.record_tabular('StdReturn', np.std(undiscounted_returns))
        logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
        logger.record_tabular('MinReturn', np.min(undiscounted_returns))
        logger.record_tabular('AverageDiscountedReturn', np.mean(fs))
        logger.record_tabular(
            'AvgTrajLen',
            np.mean([len(info['returns']) for info in infos]))
        self.policy.log_diagnostics(infos)
        logger.save_itr_params(
            itr, dict(
                itr=itr,
                policy=self.policy,
                env=self.env,
            ))
        logger.dump_tabular(with_prefix=False)
        if self.plot:
            self.plotter.update_plot(self.policy, self.max_path_length)
        logger.pop_prefix()
        # Update iteration.
        itr += 1

        # Showing policy from time to time
        if (self.play_every_itr is not None and self.play_every_itr > 0
                and itr % self.play_every_itr == 0):
            self.play_policy(env=self.env, policy=self.policy,
                             n_rollout=self.play_rollouts_num)

    # Set final params.
    self.policy.set_param_values(es.result()[0])
    parallel_sampler.terminate_task()
    self.plotter.close()
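# A small standalone ask/tell loop with the `cma` package, mirroring the
# structure above on a toy quadratic. The objective and hyperparameters are
# illustrative; note that in recent cma versions `es.result` is a property,
# while the older API used above calls it as `es.result()`.
import cma
import numpy as np

es = cma.CMAEvolutionStrategy(np.zeros(5), 0.5, {'maxiter': 50})
while not es.stop():
    xs = es.ask()                                   # sample candidates
    fs = [float(np.sum(np.square(x))) for x in xs]  # minimize ||x||^2
    es.tell(xs, fs)                                 # update the search distribution
print('best solution found:', es.result[0])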
def run_experiment(argv):
    default_log_dir = config.LOG_DIR
    now = datetime.datetime.now(dateutil.tz.tzlocal())

    # avoid name clashes when running distributed jobs
    rand_id = str(uuid.uuid4())[:5]
    timestamp = now.strftime('%Y_%m_%d_%H_%M_%S_%f_%Z')
    default_exp_name = 'experiment_%s_%s' % (timestamp, rand_id)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--n_parallel',
        type=int,
        default=1,
        help=("Number of parallel workers to perform rollouts. "
              "0 => don't start any workers"))
    parser.add_argument(
        '--exp_name',
        type=str,
        default=default_exp_name,
        help='Name of the experiment.')
    parser.add_argument(
        '--log_dir',
        type=str,
        default=None,
        help='Path to save the log and iteration snapshot.')
    parser.add_argument(
        '--snapshot_mode',
        type=str,
        default='all',
        help='Mode to save the snapshot. Can be either "all" '
             '(all iterations will be saved), "last" (only '
             'the last iteration will be saved), "gap" (every '
             '`snapshot_gap` iterations are saved), or "none" '
             '(do not save snapshots)')
    parser.add_argument(
        '--snapshot_gap',
        type=int,
        default=1,
        help='Gap between snapshot iterations.')
    parser.add_argument(
        '--tabular_log_file',
        type=str,
        default='progress.csv',
        help='Name of the tabular log file (in csv).')
    parser.add_argument(
        '--text_log_file',
        type=str,
        default='debug.log',
        help='Name of the text log file (in pure text).')
    parser.add_argument(
        '--tensorboard_step_key',
        type=str,
        default=None,
        help='Name of the step key in tensorboard_summary.')
    parser.add_argument(
        '--params_log_file',
        type=str,
        default='params.json',
        help='Name of the parameter log file (in json).')
    parser.add_argument(
        '--variant_log_file',
        type=str,
        default='variant.json',
        help='Name of the variant log file (in json).')
    parser.add_argument(
        '--resume_from',
        type=str,
        default=None,
        help='Name of the pickle file to resume experiment from.')
    parser.add_argument(
        '--plot',
        type=ast.literal_eval,
        default=False,
        help='Whether to plot the iteration results')
    parser.add_argument(
        '--log_tabular_only',
        type=ast.literal_eval,
        default=False,
        help='Print only the tabular log information (in a horizontal '
             'format)')
    parser.add_argument('--seed', type=int, help='Random seed for numpy')
    parser.add_argument(
        '--args_data', type=str, help='Pickled data for objects')
    parser.add_argument(
        '--variant_data',
        type=str,
        help='Pickled data for variant configuration')
    parser.add_argument(
        '--use_cloudpickle', type=ast.literal_eval, default=False)

    args = parser.parse_args(argv[1:])

    if args.seed is not None:
        set_seed(args.seed)

    # SIGINT is blocked for all processes created in parallel_sampler to
    # avoid the creation of sleeping and zombie processes.
    #
    # If the user interrupts run_experiment, there's a chance some processes
    # won't die due to a deadlock condition where one of the children in the
    # parallel sampler exits without releasing a lock once after it catches
    # SIGINT.
    #
    # Later the parent tries to acquire the same lock to proceed with its
    # cleanup, but it remains sleeping waiting for the lock to be released.
    # In the meantime, all the processes in the parallel sampler remain in
    # the zombie state since the parent cannot proceed with their clean up.
    with mask_signals([signal.SIGINT]):
        if args.n_parallel > 0:
            parallel_sampler.initialize(n_parallel=args.n_parallel)
            if args.seed is not None:
                parallel_sampler.set_seed(args.seed)

    if not args.plot:
        garage.plotter.Plotter.disable()
        garage.tf.plotter.Plotter.disable()

    if args.log_dir is None:
        log_dir = osp.join(default_log_dir, args.exp_name)
    else:
        log_dir = args.log_dir
    tabular_log_file = osp.join(log_dir, args.tabular_log_file)
    text_log_file = osp.join(log_dir, args.text_log_file)
    params_log_file = osp.join(log_dir, args.params_log_file)

    if args.variant_data is not None:
        variant_data = pickle.loads(base64.b64decode(args.variant_data))
        variant_log_file = osp.join(log_dir, args.variant_log_file)
        logger.log_variant(variant_log_file, variant_data)
    else:
        variant_data = None

    if not args.use_cloudpickle:
        logger.log_parameters_lite(params_log_file, args)

    logger.add_text_output(text_log_file)
    logger.add_tabular_output(tabular_log_file)
    logger.set_tensorboard_dir(log_dir)
    prev_snapshot_dir = logger.get_snapshot_dir()
    prev_mode = logger.get_snapshot_mode()
    logger.set_snapshot_dir(log_dir)
    logger.set_snapshot_mode(args.snapshot_mode)
    logger.set_snapshot_gap(args.snapshot_gap)
    logger.set_log_tabular_only(args.log_tabular_only)
    logger.set_tensorboard_step_key(args.tensorboard_step_key)
    logger.push_prefix("[%s] " % args.exp_name)

    if args.resume_from is not None:
        data = joblib.load(args.resume_from)
        assert 'algo' in data
        algo = data['algo']
        algo.train()
    else:
        # read from stdin
        if args.use_cloudpickle:
            import cloudpickle
            method_call = cloudpickle.loads(base64.b64decode(args.args_data))
            try:
                method_call(variant_data)
            except BaseException:
                children = garage.plotter.Plotter.get_plotters()
                children += garage.tf.plotter.Plotter.get_plotters()
                if args.n_parallel > 0:
                    children += [parallel_sampler]
                child_proc_shutdown(children)
                raise
        else:
            data = pickle.loads(base64.b64decode(args.args_data))
            maybe_iter = concretize(data)
            if is_iterable(maybe_iter):
                for _ in maybe_iter:
                    pass

    logger.set_snapshot_mode(prev_mode)
    logger.set_snapshot_dir(prev_snapshot_dir)
    logger.remove_tabular_output(tabular_log_file)
    logger.remove_text_output(text_log_file)
    logger.pop_prefix()
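# The --args_data path above receives a base64 string of a pickled callable.
# A minimal round trip of that encoding scheme, matching the decode performed
# in run_experiment; the function here is an illustrative stand-in.
import base64
import cloudpickle

def method_call(variant):
    print('running with variant:', variant)

# Encode as the launcher would, then decode as run_experiment does.
payload = base64.b64encode(cloudpickle.dumps(method_call)).decode()
restored = cloudpickle.loads(base64.b64decode(payload))
restored({'seed': 1})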
def train(self, sess=None):
    created_session = True if (sess is None) else False
    if sess is None:
        sess = tf.Session()
        sess.__enter__()
    sess.run(tf.global_variables_initializer())

    parallel_sampler.populate_task(self.env, self.policy)
    if self.plot:
        self.plotter.init_plot(self.env, self.policy)

    cur_std = self.init_std
    cur_mean = self.policy.get_param_values()
    # K = cur_mean.size

    n_best = max(1, int(self.n_samples * self.best_frac))

    for itr in range(self.n_itr):
        # sample around the current distribution
        extra_var_mult = max(1.0 - itr / self.extra_decay_time, 0)
        sample_std = np.sqrt(
            np.square(cur_std) +
            np.square(self.extra_std) * extra_var_mult)
        if self.batch_size is None:
            criterion = 'paths'
            threshold = self.n_samples
        else:
            criterion = 'samples'
            threshold = self.batch_size
        infos = stateful_pool.singleton_pool.run_collect(
            _worker_rollout_policy,
            threshold=threshold,
            args=(dict(
                cur_mean=cur_mean,
                sample_std=sample_std,
                max_path_length=self.max_path_length,
                discount=self.discount,
                criterion=criterion,
                n_evals=self.n_evals), ))
        xs = np.asarray([info[0] for info in infos])
        paths = [info[1] for info in infos]

        fs = np.array([path['returns'][0] for path in paths])
        print((xs.shape, fs.shape))
        best_inds = (-fs).argsort()[:n_best]
        best_xs = xs[best_inds]
        cur_mean = best_xs.mean(axis=0)
        cur_std = best_xs.std(axis=0)
        best_x = best_xs[0]

        logger.push_prefix('itr #%d | ' % itr)
        logger.record_tabular('Iteration', itr)
        logger.record_tabular('CurStdMean', np.mean(cur_std))
        undiscounted_returns = np.array(
            [path['undiscounted_return'] for path in paths])
        logger.record_tabular('AverageReturn',
                              np.mean(undiscounted_returns))
        logger.record_tabular('StdReturn', np.std(undiscounted_returns))
        logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
        logger.record_tabular('MinReturn', np.min(undiscounted_returns))
        logger.record_tabular('AverageDiscountedReturn', np.mean(fs))
        logger.record_tabular('NumTrajs', len(paths))
        # flatten paths for the case n_evals > 1
        paths = list(chain(*[d['full_paths'] for d in paths]))
        logger.record_tabular(
            'AvgTrajLen',
            np.mean([len(path['returns']) for path in paths]))

        self.policy.set_param_values(best_x)
        self.policy.log_diagnostics(paths)
        logger.save_itr_params(
            itr,
            dict(
                itr=itr,
                policy=self.policy,
                env=self.env,
                cur_mean=cur_mean,
                cur_std=cur_std,
            ))
        logger.dump_tabular(with_prefix=False)
        logger.pop_prefix()
        if self.plot:
            self.plotter.update_plot(self.policy, self.max_path_length)

        # Showing policy from time to time
        if (self.play_every_itr is not None and self.play_every_itr > 0
                and itr % self.play_every_itr == 0):
            self.play_policy(env=self.env, policy=self.policy,
                             n_rollout=self.play_rollouts_num)

    parallel_sampler.terminate_task()
    self.plotter.close()
    if created_session:
        sess.close()
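# The core cross-entropy update above, isolated: sample parameters around
# (cur_mean, sample_std), keep the best fraction, and refit the Gaussian to
# the elites. The objective, sizes, and function names are illustrative.
import numpy as np

def cem_step(f, cur_mean, cur_std, n_samples=100, best_frac=0.05):
    # Sample candidates from the current diagonal Gaussian.
    xs = cur_mean + cur_std * np.random.randn(n_samples, cur_mean.size)
    fs = np.array([f(x) for x in xs])
    n_best = max(1, int(n_samples * best_frac))
    best_xs = xs[(-fs).argsort()[:n_best]]   # elites: highest returns first
    return best_xs.mean(axis=0), best_xs.std(axis=0), best_xs[0]

mean, std = np.zeros(3), np.ones(3)
for _ in range(20):
    mean, std, best = cem_step(lambda x: -np.sum(x ** 2), mean, std)
print('converged mean:', mean)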
def run_experiment(argv):
    default_log_dir = config.LOG_DIR
    now = datetime.datetime.now(dateutil.tz.tzlocal())

    # avoid name clashes when running distributed jobs
    rand_id = str(uuid.uuid4())[:5]
    timestamp = now.strftime('%Y_%m_%d_%H_%M_%S_%f_%Z')
    default_exp_name = 'experiment_%s_%s' % (timestamp, rand_id)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--n_parallel',
        type=int,
        default=1,
        help=("Number of parallel workers to perform rollouts. "
              "0 => don't start any workers"))
    parser.add_argument(
        '--exp_name',
        type=str,
        default=default_exp_name,
        help='Name of the experiment.')
    parser.add_argument(
        '--log_dir',
        type=str,
        default=None,
        help='Path to save the log and iteration snapshot.')
    parser.add_argument(
        '--snapshot_mode',
        type=str,
        default='all',
        help='Mode to save the snapshot. Can be either "all" '
             '(all iterations will be saved), "last" (only '
             'the last iteration will be saved), "gap" (every '
             '`snapshot_gap` iterations are saved), or "none" '
             '(do not save snapshots)')
    parser.add_argument(
        '--snapshot_gap',
        type=int,
        default=1,
        help='Gap between snapshot iterations.')
    parser.add_argument(
        '--tabular_log_file',
        type=str,
        default='progress.csv',
        help='Name of the tabular log file (in csv).')
    parser.add_argument(
        '--text_log_file',
        type=str,
        default='debug.log',
        help='Name of the text log file (in pure text).')
    parser.add_argument(
        '--tensorboard_step_key',
        type=str,
        default=None,
        help='Name of the step key in tensorboard_summary.')
    parser.add_argument(
        '--params_log_file',
        type=str,
        default='params.json',
        help='Name of the parameter log file (in json).')
    parser.add_argument(
        '--variant_log_file',
        type=str,
        default='variant.json',
        help='Name of the variant log file (in json).')
    parser.add_argument(
        '--resume_from',
        type=str,
        default=None,
        help='Name of the pickle file to resume experiment from.')
    parser.add_argument(
        '--plot',
        type=ast.literal_eval,
        default=False,
        help='Whether to plot the iteration results')
    parser.add_argument(
        '--log_tabular_only',
        type=ast.literal_eval,
        default=False,
        help='Print only the tabular log information (in a horizontal '
             'format)')
    parser.add_argument('--seed', type=int, help='Random seed for numpy')
    parser.add_argument(
        '--args_data', type=str, help='Pickled data for stub objects')
    parser.add_argument(
        '--variant_data',
        type=str,
        help='Pickled data for variant configuration')
    parser.add_argument(
        '--use_cloudpickle', type=ast.literal_eval, default=False)

    args = parser.parse_args(argv[1:])

    assert os.environ.get("JOBLIB_START_METHOD", None) == "forkserver"

    if args.seed is not None:
        set_seed(args.seed)

    if args.n_parallel > 0:
        from garage.sampler import parallel_sampler
        parallel_sampler.initialize(n_parallel=args.n_parallel)
        if args.seed is not None:
            parallel_sampler.set_seed(args.seed)

    if not args.plot:
        garage.plotter.Plotter.disable()
        garage.tf.plotter.Plotter.disable()

    if args.log_dir is None:
        log_dir = osp.join(default_log_dir, args.exp_name)
    else:
        log_dir = args.log_dir
    tabular_log_file = osp.join(log_dir, args.tabular_log_file)
    text_log_file = osp.join(log_dir, args.text_log_file)
    params_log_file = osp.join(log_dir, args.params_log_file)

    if args.variant_data is not None:
        variant_data = pickle.loads(base64.b64decode(args.variant_data))
        variant_log_file = osp.join(log_dir, args.variant_log_file)
        logger.log_variant(variant_log_file, variant_data)
    else:
        variant_data = None

    if not args.use_cloudpickle:
        logger.log_parameters_lite(params_log_file, args)

    logger.add_text_output(text_log_file)
    logger.add_tabular_output(tabular_log_file)
    logger.set_tensorboard_dir(log_dir)
    prev_snapshot_dir = logger.get_snapshot_dir()
    prev_mode = logger.get_snapshot_mode()
    logger.set_snapshot_dir(log_dir)
    logger.set_snapshot_mode(args.snapshot_mode)
    logger.set_snapshot_gap(args.snapshot_gap)
    logger.set_log_tabular_only(args.log_tabular_only)
    logger.set_tensorboard_step_key(args.tensorboard_step_key)
    logger.push_prefix("[%s] " % args.exp_name)

    if args.resume_from is not None:
        data = joblib.load(args.resume_from)
        assert 'algo' in data
        algo = data['algo']
        algo.train()
    else:
        # read from stdin
        if args.use_cloudpickle:
            import cloudpickle
            method_call = cloudpickle.loads(base64.b64decode(args.args_data))
            try:
                method_call(variant_data)
            except BaseException:
                if args.n_parallel > 0:
                    parallel_sampler.terminate()
                raise
        else:
            data = pickle.loads(base64.b64decode(args.args_data))
            maybe_iter = concretize(data)
            if is_iterable(maybe_iter):
                for _ in maybe_iter:
                    pass

    logger.set_snapshot_mode(prev_mode)
    logger.set_snapshot_dir(prev_snapshot_dir)
    logger.remove_tabular_output(tabular_log_file)
    logger.remove_text_output(text_log_file)
    logger.pop_prefix()
def train(self, sess=None): """ Training process of DDPG algorithm. Args: sess: A TensorFlow session for executing ops. """ created_session = True if (sess is None) else False if sess is None: sess = tf.Session() sess.__enter__() # Start plotter if self.plot: self.plotter = Plotter(self.env, self.actor, sess) self.plotter.start() sess.run(tf.global_variables_initializer()) self.f_init_target() observation = self.env.reset() if self.es: self.es.reset() episode_reward = 0. episode_step = 0 episode_rewards = [] episode_steps = [] episode_actor_losses = [] episode_critic_losses = [] episodes = 0 epoch_ys = [] epoch_qs = [] for epoch in range(self.n_epochs): logger.push_prefix('epoch #%d | ' % epoch) logger.log("Training started") self.success_history.clear() for epoch_cycle in pyprind.prog_bar(range(self.n_epoch_cycles)): if self.use_her: successes = [] for rollout in range(self.n_rollout_steps): o = np.clip(observation["observation"], -self.clip_obs, self.clip_obs) g = np.clip(observation["desired_goal"], -self.clip_obs, self.clip_obs) obs_goal = np.concatenate((o, g), axis=-1) action = self.es.get_action(rollout, obs_goal, self.actor) next_observation, reward, terminal, info = self.env.step( # noqa: E501 action) if 'is_success' in info: successes.append([info["is_success"]]) episode_reward += reward episode_step += 1 info_dict = { "info_{}".format(key): info[key].reshape(1) for key in info.keys() } self.replay_buffer.add_transition( observation=observation['observation'], action=action, goal=observation['desired_goal'], achieved_goal=observation['achieved_goal'], **info_dict, ) observation = next_observation if rollout == self.n_rollout_steps - 1: self.replay_buffer.add_transition( observation=observation['observation'], achieved_goal=observation['achieved_goal']) episode_rewards.append(episode_reward) episode_steps.append(episode_step) episode_reward = 0. episode_step = 0 episodes += 1 observation = self.env.reset() if self.es: self.es.reset() successful = np.array(successes)[-1, :] success_rate = np.mean(successful) self.success_history.append(success_rate) for train_itr in range(self.n_train_steps): self.evaluate = True critic_loss, y, q, action_loss = self._learn() episode_actor_losses.append(action_loss) episode_critic_losses.append(critic_loss) epoch_ys.append(y) epoch_qs.append(q) self.f_update_target() else: for rollout in range(self.n_rollout_steps): action = self.es.get_action(rollout, observation, self.actor) assert action.shape == self.env.action_space.shape next_observation, reward, terminal, info = self.env.step( # noqa: E501 action) episode_reward += reward episode_step += 1 self.replay_buffer.add_transition( observation=observation, action=action, reward=reward * self.reward_scale, terminal=terminal, next_observation=next_observation, ) observation = next_observation if terminal or rollout == self.n_rollout_steps - 1: episode_rewards.append(episode_reward) episode_steps.append(episode_step) episode_reward = 0. 
                        episode_step = 0
                        episodes += 1

                        observation = self.env.reset()
                        if self.es:
                            self.es.reset()

                for train_itr in range(self.n_train_steps):
                    if self.replay_buffer.size >= self.min_buffer_size:
                        self.evaluate = True
                        critic_loss, y, q, action_loss = self._learn()

                        episode_actor_losses.append(action_loss)
                        episode_critic_losses.append(critic_loss)
                        epoch_ys.append(y)
                        epoch_qs.append(q)

        logger.log("Training finished")
        logger.log("Saving snapshot")
        itr = epoch * self.n_epoch_cycles + epoch_cycle
        params = self.get_itr_snapshot(itr)
        logger.save_itr_params(itr, params)
        logger.log("Saved")

        if self.evaluate:
            logger.record_tabular('Epoch', epoch)
            logger.record_tabular('Episodes', episodes)
            logger.record_tabular('AverageReturn',
                                  np.mean(episode_rewards))
            logger.record_tabular('StdReturn', np.std(episode_rewards))
            logger.record_tabular('Policy/AveragePolicyLoss',
                                  np.mean(episode_actor_losses))
            logger.record_tabular('QFunction/AverageQFunctionLoss',
                                  np.mean(episode_critic_losses))
            logger.record_tabular('QFunction/AverageQ', np.mean(epoch_qs))
            logger.record_tabular('QFunction/MaxQ', np.max(epoch_qs))
            logger.record_tabular('QFunction/AverageAbsQ',
                                  np.mean(np.abs(epoch_qs)))
            logger.record_tabular('QFunction/AverageY', np.mean(epoch_ys))
            logger.record_tabular('QFunction/MaxY', np.max(epoch_ys))
            logger.record_tabular('QFunction/AverageAbsY',
                                  np.mean(np.abs(epoch_ys)))
            if self.use_her:
                logger.record_tabular('AverageSuccessRate',
                                      np.mean(self.success_history))

        # Uncomment the following if you want to calculate the average
        # in each epoch; better to uncomment when self.use_her is True
        # episode_rewards = []
        # episode_actor_losses = []
        # episode_critic_losses = []
        # epoch_ys = []
        # epoch_qs = []

        logger.dump_tabular(with_prefix=False)
        logger.pop_prefix()
        if self.plot:
            self.plotter.update_plot(self.actor, self.n_rollout_steps)
            if self.pause_for_plot:
                input("Plotting evaluation run: Press Enter to "
                      "continue...")

    if self.plot:
        self.plotter.close()
    if created_session:
        sess.close()
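# A NumPy sketch of the Polyak averaging that f_update_target presumably
# performs on the target networks; the snippet above does not show its
# implementation, and tau is an assumed hyperparameter.
import numpy as np

def soft_update(target_params, source_params, tau=0.005):
    """Blend each target parameter a small step toward its source twin."""
    return [(1.0 - tau) * t + tau * s
            for t, s in zip(target_params, source_params)]

target = [np.zeros((2, 2)), np.zeros(2)]
source = [np.ones((2, 2)), np.ones(2)]
target = soft_update(target, source)  # targets move slightly toward sources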
def train(self, sess=None): """ Training process of DDPG algorithm. Args: sess: A TensorFlow session for executing ops. """ replay_buffer = self.opt_info["replay_buffer"] f_init_target = self.opt_info["f_init_target"] created_session = True if (sess is None) else False if sess is None: sess = tf.Session() sess.__enter__() # Start plotter if self.plot: self.plotter = Plotter(self.env, self.actor, sess) self.plotter.start() sess.run(tf.global_variables_initializer()) f_init_target() observation = self.env.reset() if self.es: self.es.reset() episode_reward = 0. episode_step = 0 episode_rewards = [] episode_steps = [] episode_actor_losses = [] episode_critic_losses = [] episodes = 0 epoch_ys = [] epoch_qs = [] for epoch in range(self.n_epochs): logger.push_prefix('epoch #%d | ' % epoch) logger.log("Training started") for epoch_cycle in pyprind.prog_bar(range(self.n_epoch_cycles)): for rollout in range(self.n_rollout_steps): action = self.es.get_action(rollout, observation, self.actor) assert action.shape == self.env.action_space.shape next_observation, reward, terminal, info = self.env.step( action) episode_reward += reward episode_step += 1 replay_buffer.add_transition(observation, action, reward * self.reward_scale, terminal, next_observation) observation = next_observation if terminal: episode_rewards.append(episode_reward) episode_steps.append(episode_step) episode_reward = 0. episode_step = 0 episodes += 1 observation = self.env.reset() if self.es: self.es.reset() for train_itr in range(self.n_train_steps): if replay_buffer.size >= self.min_buffer_size: critic_loss, y, q, action_loss = self._learn() episode_actor_losses.append(action_loss) episode_critic_losses.append(critic_loss) epoch_ys.append(y) epoch_qs.append(q) logger.log("Training finished") if replay_buffer.size >= self.min_buffer_size: logger.record_tabular('Epoch', epoch) logger.record_tabular('Episodes', episodes) logger.record_tabular('AverageReturn', np.mean(episode_rewards)) logger.record_tabular('StdReturn', np.std(episode_rewards)) logger.record_tabular('Policy/AveragePolicyLoss', np.mean(episode_actor_losses)) logger.record_tabular('QFunction/AverageQFunctionLoss', np.mean(episode_critic_losses)) logger.record_tabular('QFunction/AverageQ', np.mean(epoch_qs)) logger.record_tabular('QFunction/MaxQ', np.max(epoch_qs)) logger.record_tabular('QFunction/AverageAbsQ', np.mean(np.abs(epoch_qs))) logger.record_tabular('QFunction/AverageY', np.mean(epoch_ys)) logger.record_tabular('QFunction/MaxY', np.max(epoch_ys)) logger.record_tabular('QFunction/AverageAbsY', np.mean(np.abs(epoch_ys))) # Uncomment the following if you want to calculate the average # in each epoch # episode_rewards = [] # episode_actor_losses = [] # episode_critic_losses = [] # epoch_ys = [] # epoch_qs = [] logger.dump_tabular(with_prefix=False) logger.pop_prefix() if self.plot: self.plotter.update_plot(self.actor, self.n_rollout_steps) if self.pause_for_plot: input("Plotting evaluation run: Press Enter to " "continue...") if self.plot: self.plotter.shutdown() if created_session: sess.close()
log_dir = "Data/Train" tabular_log_file = osp.join(log_dir, "progress.csv") text_log_file = osp.join(log_dir, "debug.log") params_log_file = osp.join(log_dir, "params.json") pkl_file = osp.join(log_dir, "params.pkl") logger.add_text_output(text_log_file) logger.add_tabular_output(tabular_log_file) prev_snapshot_dir = logger.get_snapshot_dir() prev_mode = logger.get_snapshot_mode() logger.set_snapshot_dir(log_dir) logger.set_snapshot_mode("gap") logger.set_snapshot_gap(1) logger.set_log_tabular_only(False) logger.push_prefix("[%s] " % "Carpole-RL") env = TfEnv(CartPoleEnv(use_seed=False)) # env = TfEnv(GridWorldEnv()) policy = CategoricalMLPPolicy( name='protagonist', env_spec=env.spec, # The neural network policy should have two hidden layers, each with 32 hidden units. hidden_sizes=(32, 32)) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = TRPO( env=env, policy=policy,