def train(self):
    self.start_worker()
    self.init_opt()
    for itr in range(self.current_itr, self.n_itr):
        with logger.prefix('itr #%d | ' % itr):
            paths = self.sampler.obtain_samples(itr)
            samples_data = self.sampler.process_samples(itr, paths)
            self.log_diagnostics(paths)
            self.optimize_policy(itr, samples_data)
            logger.log("saving snapshot...")
            params = self.get_itr_snapshot(itr, samples_data)
            self.current_itr = itr + 1
            params["algo"] = self
            if self.store_paths:
                params["paths"] = samples_data["paths"]
            logger.save_itr_params(itr, params)
            logger.log("saved")
            logger.dump_tabular(with_prefix=False)
            if self.plot:
                self.update_plot()
                if self.pause_for_plot:
                    input("Plotting evaluation run: Press Enter to "
                          "continue...")
    self.shutdown_worker()
def populate_task(env, policy, dynamics):
    logger.log("Populating workers...")
    singleton_pool.run_each(
        _worker_populate_task,
        [(env, policy, dynamics)] * singleton_pool.n_parallel
    )
    logger.log("Populated")
def worker_init_envs(G, alloc, scope, env):
    logger.log("initializing environment on worker %d" % G.worker_id)
    if not hasattr(G, 'parallel_vec_envs'):
        G.parallel_vec_envs = dict()
        G.parallel_vec_env_template = dict()
    G.parallel_vec_envs[scope] = [(idx, pickle.loads(pickle.dumps(env))) for idx in alloc]
    G.parallel_vec_env_template[scope] = env
def populate_task(env, policy):
    logger.log("Populating workers...")
    singleton_pool.run_each(
        _worker_populate_task,
        [(pickle.dumps(env), pickle.dumps(policy))] * singleton_pool.n_parallel
    )
    logger.log("Populated")
def __init__(self, env_name, record_video=True, video_schedule=None, log_dir=None, record_log=True,
             force_reset=False):
    if log_dir is None:
        if logger.get_snapshot_dir() is None:
            logger.log("Warning: skipping Gym environment monitoring since snapshot_dir not configured.")
        else:
            log_dir = os.path.join(logger.get_snapshot_dir(), "gym_log")
    Serializable.quick_init(self, locals())

    env = gym.envs.make(env_name)
    self.env = env
    self.env_id = env.spec.id

    monitor_manager.logger.setLevel(logging.WARNING)

    assert not (not record_log and record_video)

    if log_dir is None or record_log is False:
        self.monitoring = False
    else:
        if not record_video:
            video_schedule = NoVideoSchedule()
        else:
            if video_schedule is None:
                video_schedule = CappedCubicVideoSchedule()
        self.env = gym.wrappers.Monitor(self.env, log_dir, video_callable=video_schedule, force=True)
        self.monitoring = True

    self._observation_space = convert_gym_space(env.observation_space)
    self._action_space = convert_gym_space(env.action_space)
    self._horizon = env.spec.timestep_limit
    self._log_dir = log_dir
    self._force_reset = force_reset
def train(self):
    self.start_worker()
    self.init_opt()
    rets = []
    for itr in range(self.start_itr, self.n_itr):
        with logger.prefix('itr #%d | ' % itr):
            paths = self.obtain_samples(itr)
            print(("BatchPolopt:train len(paths)", len(paths)))
            samples_data, total_returns_per_episode = self.process_samples(itr, paths)
            rets.append(total_returns_per_episode)
            self.log_diagnostics(paths)
            self.optimize_policy(itr, samples_data)
            logger.log("saving snapshot...")
            params = self.get_itr_snapshot(itr, samples_data)  # , **kwargs)
            if self.store_paths:
                params["paths"] = samples_data["paths"]
            logger.save_itr_params(itr, params)
            logger.log("saved")
            logger.dump_tabular(with_prefix=False)
            if self.plot:
                self.update_plot()
                if self.pause_for_plot:
                    input("Plotting evaluation run: Press Enter to "
                          "continue...")
    self.shutdown_worker()
    return rets
def optimize_policy(self, itr, samples_data):
    all_input_values = tuple(ext.extract(
        samples_data,
        "observations", "actions", "advantages"
    ))
    agent_infos = samples_data["agent_infos"]
    state_info_list = [agent_infos[k] for k in self.policy.state_info_keys]
    dist_info_list = [agent_infos[k] for k in self.policy.distribution.dist_info_keys]
    all_input_values += tuple(state_info_list) + tuple(dist_info_list)
    if self.policy.recurrent:
        all_input_values += (samples_data["valids"],)
    logger.log("Computing loss before")
    loss_before = self.optimizer.loss(all_input_values)
    logger.log("Computing KL before")
    mean_kl_before = self.optimizer.constraint_val(all_input_values)
    logger.log("Optimizing")
    self.optimizer.optimize(all_input_values)
    logger.log("Computing KL after")
    mean_kl = self.optimizer.constraint_val(all_input_values)
    logger.log("Computing loss after")
    loss_after = self.optimizer.loss(all_input_values)
    logger.record_tabular('LossBefore', loss_before)
    logger.record_tabular('LossAfter', loss_after)
    logger.record_tabular('MeanKLBefore', mean_kl_before)
    logger.record_tabular('MeanKL', mean_kl)
    logger.record_tabular('dLoss', loss_before - loss_after)
    return dict()
def optimize(self, inputs, extra_inputs=None, callback=None):
    if len(inputs) == 0:
        # Assumes that we should always sample mini-batches
        raise NotImplementedError

    f_loss = self._opt_fun["f_loss"]

    if extra_inputs is None:
        extra_inputs = tuple()

    last_loss = f_loss(*(tuple(inputs) + extra_inputs))

    start_time = time.time()

    dataset = BatchDataset(inputs, self._batch_size, extra_inputs=extra_inputs)

    sess = tf.get_default_session()
    for epoch in range(self._max_epochs):
        if self._verbose:
            logger.log("Epoch %d" % (epoch))
            progbar = pyprind.ProgBar(len(inputs[0]))

        for batch in dataset.iterate(update=True):
            if self._init_train_op is not None:
                sess.run(self._init_train_op, dict(list(zip(self._input_vars, batch))))
                self._init_train_op = None  # only use it once
            else:
                sess.run(self._train_op, dict(list(zip(self._input_vars, batch))))

            if self._verbose:
                progbar.update(len(batch[0]))

        if self._verbose:
            if progbar.active:
                progbar.stop()

        new_loss = f_loss(*(tuple(inputs) + extra_inputs))

        if self._verbose:
            logger.log("Epoch: %d | Loss: %f" % (epoch, new_loss))
        if self._callback or callback:
            elapsed = time.time() - start_time
            callback_args = dict(
                loss=new_loss,
                params=self._target.get_param_values(trainable=True) if self._target else None,
                itr=epoch,
                elapsed=elapsed,
            )
            if self._callback:
                self._callback(callback_args)
            if callback:
                callback(**callback_args)

        if abs(last_loss - new_loss) < self._tolerance:
            break
        last_loss = new_loss
def train(self):
    memory = ReplayMem(
        obs_dim=self.env.observation_space.flat_dim,
        act_dim=self.env.action_space.flat_dim,
        memory_size=self.memory_size)
    itr = 0
    path_length = 0
    path_return = 0
    end = False
    obs = self.env.reset()
    for epoch in range(self.n_epochs):
        logger.push_prefix("epoch #%d | " % epoch)
        logger.log("Training started")
        for epoch_itr in pyprind.prog_bar(range(self.epoch_length)):
            # run the policy
            if end:
                # reset the environment and strategy when an episode ends
                obs = self.env.reset()
                self.strategy.reset()
                # self.policy.reset()
                self.strategy_path_returns.append(path_return)
                path_length = 0
                path_return = 0
            # note action is sampled from the policy not the target policy
            act = self.strategy.get_action(obs, self.policy)
            nxt, rwd, end, _ = self.env.step(act)

            path_length += 1
            path_return += rwd

            if not end and path_length >= self.max_path_length:
                end = True
                if self.include_horizon_terminal:
                    memory.add_sample(obs, act, rwd, end)
            else:
                memory.add_sample(obs, act, rwd, end)

            obs = nxt

            if memory.size >= self.memory_start_size:
                for update_time in range(self.n_updates_per_sample):
                    batch = memory.get_batch(self.batch_size)
                    self.do_update(itr, batch)

            itr += 1

        logger.log("Training finished")
        if memory.size >= self.memory_start_size:
            self.evaluate(epoch, memory)
        logger.dump_tabular(with_prefix=False)
        logger.pop_prefix()
def evaluate(self, epoch, memory):
    if epoch == self.n_epochs - 1:
        logger.log("Collecting samples for evaluation")
        rewards = sample_rewards(env=self.env,
                                 policy=self.policy,
                                 eval_samples=self.eval_samples,
                                 max_path_length=self.max_path_length)
        average_discounted_return = np.mean(
            [discount_return(reward, self.discount) for reward in rewards])
        returns = [sum(reward) for reward in rewards]

    all_qs = np.concatenate(self.q_averages)
    all_ys = np.concatenate(self.y_averages)

    average_qfunc_loss = np.mean(self.qfunc_loss_averages)
    average_policy_loss = np.mean(self.policy_loss_averages)

    logger.record_tabular('Epoch', epoch)
    if epoch == self.n_epochs - 1:
        logger.record_tabular('AverageReturn', np.mean(returns))
        logger.record_tabular('StdReturn', np.std(returns))
        logger.record_tabular('MaxReturn', np.max(returns))
        logger.record_tabular('MinReturn', np.min(returns))
        logger.record_tabular('AverageDiscountedReturn', average_discounted_return)
    if len(self.strategy_path_returns) > 0:
        logger.record_tabular('AverageEsReturn', np.mean(self.strategy_path_returns))
        logger.record_tabular('StdEsReturn', np.std(self.strategy_path_returns))
        logger.record_tabular('MaxEsReturn', np.max(self.strategy_path_returns))
        logger.record_tabular('MinEsReturn', np.min(self.strategy_path_returns))
    logger.record_tabular('AverageQLoss', average_qfunc_loss)
    logger.record_tabular('AveragePolicyLoss', average_policy_loss)
    logger.record_tabular('AverageQ', np.mean(all_qs))
    logger.record_tabular('AverageAbsQ', np.mean(np.abs(all_qs)))
    logger.record_tabular('AverageY', np.mean(all_ys))
    logger.record_tabular('AverageAbsY', np.mean(np.abs(all_ys)))
    logger.record_tabular('AverageAbsQYDiff', np.mean(np.abs(all_qs - all_ys)))

    self.qfunc_loss_averages = []
    self.policy_loss_averages = []
    self.q_averages = []
    self.y_averages = []
    self.strategy_path_returns = []
def populate_task(env, policy, scope=None):
    logger.log("Populating workers...")
    if singleton_pool.n_parallel > 1:
        singleton_pool.run_each(
            _worker_populate_task,
            [(pickle.dumps(env), pickle.dumps(policy), scope)] * singleton_pool.n_parallel
        )
    else:
        # avoid unnecessary copying
        G = _get_scoped_G(singleton_pool.G, scope)
        G.env = env
        G.policy = policy
    logger.log("Populated")
def worker_run_reset(G, flags, scope):
    if not hasattr(G, 'parallel_vec_envs'):
        logger.log("on worker %d" % G.worker_id)
        import traceback
        for line in traceback.format_stack():
            logger.log(line)
        # log the stacktrace at least
        logger.log("oops")
        for k, v in G.__dict__.items():
            logger.log(str(k) + " : " + str(v))
        assert hasattr(G, 'parallel_vec_envs')

    assert scope in G.parallel_vec_envs
    N = len(G.parallel_vec_envs[scope])
    env_template = G.parallel_vec_env_template[scope]
    obs_dim = env_template.observation_space.flat_dim
    ret_arr = np.zeros((N, obs_dim))
    ids = []
    flat_obs = []
    reset_ids = []
    for itr_idx, (idx, env) in enumerate(G.parallel_vec_envs[scope]):
        flag = flags[idx]
        if flag:
            flat_obs.append(env.reset())
            reset_ids.append(itr_idx)
        ids.append(idx)
    if len(reset_ids) > 0:
        ret_arr[reset_ids] = env_template.observation_space.flatten_n(flat_obs)
    return ids, ret_arr
def advance_until_terminate(self):
    skip = self.get_skip_flag()
    n_skips = 0
    old_top = self._top
    new_top = (old_top + 1) % self._max_pool_size
    while skip and old_top != new_top and n_skips < self._max_skip_episode:
        n_skips += 1
        self.advance()
        while not self._initials[self._top]:
            self.advance()
        skip = self.get_skip_flag()
        new_top = self._top
    logger.log("add_sample, skipped %d episodes, top=%d->%d" % (
        n_skips, old_top, new_top))
def optimize_gen(self, inputs, extra_inputs=None, callback=None, yield_itr=None):

    if len(inputs) == 0:
        # Assumes that we should always sample mini-batches
        raise NotImplementedError

    f_opt = self._opt_fun["f_opt"]
    f_loss = self._opt_fun["f_loss"]

    if extra_inputs is None:
        extra_inputs = tuple()

    last_loss = f_loss(*(tuple(inputs) + extra_inputs))

    start_time = time.time()

    dataset = BatchDataset(
        inputs, self._batch_size, extra_inputs=extra_inputs
        # , randomized=self._randomized
    )

    itr = 0
    for epoch in pyprind.prog_bar(list(range(self._max_epochs))):
        for batch in dataset.iterate(update=True):
            f_opt(*batch)
            if yield_itr is not None and (itr % (yield_itr + 1)) == 0:
                yield
            itr += 1

        new_loss = f_loss(*(tuple(inputs) + extra_inputs))
        if self._verbose:
            logger.log("Epoch %d, loss %s" % (epoch, new_loss))

        if self._callback or callback:
            elapsed = time.time() - start_time
            callback_args = dict(
                loss=new_loss,
                params=self._target.get_param_values(trainable=True) if self._target else None,
                itr=epoch,
                elapsed=elapsed,
            )
            if self._callback:
                self._callback(callback_args)
            if callback:
                callback(**callback_args)

        if abs(last_loss - new_loss) < self._tolerance:
            break
        last_loss = new_loss
def optimize_policy(self, itr, all_samples_data):
    assert len(all_samples_data) == self.num_grad_updates + 1  # we collected the rollouts to compute the grads and then the test!

    if not self.use_maml:
        all_samples_data = [all_samples_data[0]]

    input_list = []
    for step in range(len(all_samples_data)):  # these are the gradient steps
        obs_list, action_list, adv_list = [], [], []
        for i in range(self.meta_batch_size):
            inputs = ext.extract(
                all_samples_data[step][i],
                "observations", "actions", "advantages"
            )
            obs_list.append(inputs[0])
            action_list.append(inputs[1])
            adv_list.append(inputs[2])
        input_list += obs_list + action_list + adv_list  # [ [obs_0], [act_0], [adv_0], [obs_1], ... ]

        if step == 0:  ##CF not used?
            init_inputs = input_list

    if self.use_maml:
        dist_info_list = []
        for i in range(self.meta_batch_size):
            agent_infos = all_samples_data[self.kl_constrain_step][i]['agent_infos']
            dist_info_list += [agent_infos[k] for k in self.policy.distribution.dist_info_keys]
        input_list += tuple(dist_info_list)
        logger.log("Computing KL before")
        mean_kl_before = self.optimizer.constraint_val(input_list)

    logger.log("Computing loss before")
    loss_before = self.optimizer.loss(input_list)
    logger.log("Optimizing")
    self.optimizer.optimize(input_list)
    logger.log("Computing loss after")
    loss_after = self.optimizer.loss(input_list)
    if self.use_maml:
        logger.log("Computing KL after")
        mean_kl = self.optimizer.constraint_val(input_list)
        logger.record_tabular('MeanKLBefore', mean_kl_before)  # this now won't be 0!
        logger.record_tabular('MeanKL', mean_kl)
    logger.record_tabular('LossBefore', loss_before)
    logger.record_tabular('LossAfter', loss_after)
    logger.record_tabular('dLoss', loss_before - loss_after)
    return dict()
def optimize_policy(self, itr, samples_data): logger.log("optimizing policy") inputs = ext.extract( samples_data, "observations", "actions", "advantages" ) if self.policy.recurrent: inputs += (samples_data["valids"],) agent_infos = samples_data["agent_infos"] dist_info_list = [agent_infos[k] for k in self.policy.distribution.dist_info_keys] loss_before = self.optimizer.loss(inputs) self.optimizer.optimize(inputs) loss_after = self.optimizer.loss(inputs) logger.record_tabular("LossBefore", loss_before) logger.record_tabular("LossAfter", loss_after) mean_kl, max_kl = self.opt_info['f_kl'](*(list(inputs) + dist_info_list)) logger.record_tabular('MeanKL', mean_kl) logger.record_tabular('MaxKL', max_kl)
def optimize(self, inputs, extra_inputs=None, callback=None):

    if len(inputs) == 0:
        # Assumes that we should always sample mini-batches
        raise NotImplementedError

    f_opt = self._opt_fun["f_opt"]
    f_loss = self._opt_fun["f_loss"]

    if extra_inputs is None:
        extra_inputs = tuple()

    last_loss = f_loss(*(tuple(inputs) + extra_inputs))

    start_time = time.time()

    dataset = BatchDataset(inputs, self._batch_size, extra_inputs=extra_inputs)

    for epoch in range(self._max_epochs):
        if self._verbose:
            logger.log("Epoch %d" % epoch)
        for batch in dataset.iterate(update=True):
            f_opt(*batch)

        new_loss = f_loss(*(tuple(inputs) + extra_inputs))

        if self._callback or callback:
            elapsed = time.time() - start_time
            callback_args = dict(
                loss=new_loss,
                params=self._target.get_param_values(trainable=True) if self._target else None,
                itr=epoch,
                elapsed=elapsed,
            )
            if self._callback:
                self._callback(callback_args)
            if callback:
                callback(**callback_args)

        if abs(last_loss - new_loss) < self._tolerance:
            break
        last_loss = new_loss
def train(self):
    with tf.Session() as sess:
        sess.run(tf.initialize_all_variables())
        self.start_worker()
        for itr in range(self.start_itr, self.n_itr):
            with logger.prefix('itr #%d | ' % itr):
                paths = self.obtain_samples(itr)
                samples_data = self.process_samples(itr, paths)
                self.log_diagnostics(paths)
                self.optimize_policy(itr, samples_data)
                logger.log("saving snapshot...")
                params = self.get_itr_snapshot(itr, samples_data)  # , **kwargs)
                if self.store_paths:
                    params["paths"] = samples_data["paths"]
                logger.save_itr_params(itr, params)
                logger.log("saved")
                logger.dump_tabular(with_prefix=False)
                if self.plot:
                    self.update_plot()
                    if self.pause_for_plot:
                        input("Plotting evaluation run: Press Enter to "
                              "continue...")
        self.shutdown_worker()
def __init__(self, env_name, record_video=False, video_schedule=None, log_dir=None, record_log=False,
             force_reset=True):
    if log_dir is None:
        if logger.get_snapshot_dir() is None:
            logger.log("Warning: skipping Gym environment monitoring since snapshot_dir not configured.")
        else:
            log_dir = os.path.join(logger.get_snapshot_dir(), "gym_log")
    Serializable.quick_init(self, locals())

    env = gym.envs.make(env_name)

    # HACK: Gets rid of the TimeLimit wrapper that sets 'done = True' when
    # the time limit specified for each environment has been passed and
    # therefore the environment is not Markovian (terminal condition depends
    # on time rather than state).
    env = env.env

    self.env = env
    self.env_id = env.spec.id

    assert not (not record_log and record_video)

    if log_dir is None or record_log is False:
        self.monitoring = False
    else:
        if not record_video:
            video_schedule = NoVideoSchedule()
        else:
            if video_schedule is None:
                video_schedule = CappedCubicVideoSchedule()
        self.env = gym.wrappers.Monitor(self.env, log_dir, video_callable=video_schedule, force=True)
        self.monitoring = True

    self._observation_space = convert_gym_space(env.observation_space)
    logger.log("observation space: {}".format(self._observation_space))
    self._action_space = convert_gym_space(env.action_space)
    logger.log("action space: {}".format(self._action_space))
    self._horizon = env.spec.tags['wrapper_config.TimeLimit.max_episode_steps']
    self._log_dir = log_dir
    self._force_reset = force_reset
def optimize(self, inputs, extra_inputs=None, subsample_grouped_inputs=None):

    prev_param = np.copy(self._target.get_param_values(trainable=True))
    inputs = tuple(inputs)
    if extra_inputs is None:
        extra_inputs = tuple()

    if self._subsample_factor < 1:
        if subsample_grouped_inputs is None:
            subsample_grouped_inputs = [inputs]
        subsample_inputs = tuple()
        for inputs_grouped in subsample_grouped_inputs:
            n_samples = len(inputs_grouped[0])
            inds = np.random.choice(
                n_samples, int(n_samples * self._subsample_factor), replace=False)
            subsample_inputs += tuple([x[inds] for x in inputs_grouped])
    else:
        subsample_inputs = inputs

    logger.log("Start CG optimization: #parameters: %d, #inputs: %d, #subsample_inputs: %d" % (
        len(prev_param), len(inputs[0]), len(subsample_inputs[0])))

    logger.log("computing loss before")
    loss_before = sliced_fun(self._opt_fun["f_loss"], self._num_slices)(inputs, extra_inputs)
    logger.log("performing update")

    logger.log("computing gradient")
    flat_g = sliced_fun(self._opt_fun["f_grad"], self._num_slices)(inputs, extra_inputs)
    logger.log("gradient computed")

    logger.log("computing descent direction")
    Hx = self._hvp_approach.build_eval(subsample_inputs + extra_inputs)

    descent_direction = krylov.cg(Hx, flat_g, cg_iters=self._cg_iters)

    initial_step_size = np.sqrt(
        2.0 * self._max_constraint_val *
        (1. / (descent_direction.dot(Hx(descent_direction)) + 1e-8))
    )
    if np.isnan(initial_step_size):
        initial_step_size = 1.
    flat_descent_step = initial_step_size * descent_direction

    logger.log("descent direction computed")

    n_iter = 0
    for n_iter, ratio in enumerate(self._backtrack_ratio ** np.arange(self._max_backtracks)):
        cur_step = ratio * flat_descent_step
        cur_param = prev_param - cur_step
        self._target.set_param_values(cur_param, trainable=True)
        loss, constraint_val = sliced_fun(
            self._opt_fun["f_loss_constraint"], self._num_slices)(inputs, extra_inputs)
        if self._debug_nan and np.isnan(constraint_val):
            import ipdb; ipdb.set_trace()
        if loss < loss_before and constraint_val <= self._max_constraint_val:
            break
    if (np.isnan(loss) or np.isnan(constraint_val) or loss >= loss_before or
            constraint_val >= self._max_constraint_val) and not self._accept_violation:
        logger.log("Line search condition violated. Rejecting the step!")
        if np.isnan(loss):
            logger.log("Violated because loss is NaN")
        if np.isnan(constraint_val):
            logger.log("Violated because constraint %s is NaN" % self._constraint_name)
        if loss >= loss_before:
            logger.log("Violated because loss not improving")
        if constraint_val >= self._max_constraint_val:
            logger.log("Violated because constraint %s is violated" % self._constraint_name)
        self._target.set_param_values(prev_param, trainable=True)
    logger.log("backtrack iters: %d" % n_iter)
    logger.log("computing loss after")
    logger.log("optimization finished")
from rllab.misc.instrument import to_local_command

filename = str(uuid.uuid4())

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('file', type=str,
                        help='path to the snapshot file')
    parser.add_argument('--log_dir', type=str, default=None,
                        help='path to the new log directory')
    # Look for params.json file
    args = parser.parse_args()
    parent_dir = os.path.dirname(os.path.realpath(args.file))
    json_file_path = os.path.join(parent_dir, "params.json")
    logger.log("Looking for params.json at %s..." % json_file_path)
    try:
        with open(json_file_path, "r") as f:
            params = json.load(f)
        # exclude certain parameters
        excluded = ['json_args']
        for k in excluded:
            if k in params:
                del params[k]
        for k, v in list(params.items()):
            if v is None:
                del params[k]
        if args.log_dir is not None:
            params['log_dir'] = args.log_dir
        params['resume_from'] = args.file
        command = to_local_command(params, script='scripts/run_experiment_lite.py')
def evaluate(self, epoch, pool): logger.log("Collecting samples for evaluation") paths = parallel_sampler.sample_paths( policy_params=self.policy.get_param_values(), max_samples=self.eval_samples, max_path_length=self.max_path_length, ) average_discounted_return = np.mean( [special.discount_return(path["rewards"], self.discount) for path in paths] ) returns = [sum(path["rewards"]) for path in paths] all_qs = np.concatenate(self.q_averages) all_ys = np.concatenate(self.y_averages) average_q_loss = np.mean(self.qf_loss_averages) average_policy_surr = np.mean(self.policy_surr_averages) average_action = np.mean(np.square(np.concatenate( [path["actions"] for path in paths] ))) policy_reg_param_norm = np.linalg.norm( self.policy.get_param_values(regularizable=True) ) qfun_reg_param_norm = np.linalg.norm( self.qf.get_param_values(regularizable=True) ) logger.record_tabular('Epoch', epoch) logger.record_tabular('Iteration', epoch) logger.record_tabular('AverageReturn', np.mean(returns)) logger.record_tabular('StdReturn', np.std(returns)) logger.record_tabular('MaxReturn', np.max(returns)) logger.record_tabular('MinReturn', np.min(returns)) if len(self.es_path_returns) > 0: logger.record_tabular('AverageEsReturn', np.mean(self.es_path_returns)) logger.record_tabular('StdEsReturn', np.std(self.es_path_returns)) logger.record_tabular('MaxEsReturn', np.max(self.es_path_returns)) logger.record_tabular('MinEsReturn', np.min(self.es_path_returns)) logger.record_tabular('AverageDiscountedReturn', average_discounted_return) logger.record_tabular('AverageQLoss', average_q_loss) logger.record_tabular('AveragePolicySurr', average_policy_surr) logger.record_tabular('AverageQ', np.mean(all_qs)) logger.record_tabular('AverageAbsQ', np.mean(np.abs(all_qs))) logger.record_tabular('AverageY', np.mean(all_ys)) logger.record_tabular('AverageAbsY', np.mean(np.abs(all_ys))) logger.record_tabular('AverageAbsQYDiff', np.mean(np.abs(all_qs - all_ys))) logger.record_tabular('AverageAction', average_action) logger.record_tabular('PolicyRegParamNorm', policy_reg_param_norm) logger.record_tabular('QFunRegParamNorm', qfun_reg_param_norm) self.env.log_diagnostics(paths) self.policy.log_diagnostics(paths) self.qf_loss_averages = [] self.policy_surr_averages = [] self.q_averages = [] self.y_averages = [] self.es_path_returns = []
def process_samples(self, itr, paths):
    advantage_baselines = []
    baselines = []
    returns = []
    if hasattr(self.algo.baseline, "predict_n"):
        all_path_baselines = self.algo.baseline.predict_n(paths)
    else:
        all_path_baselines = [
            self.algo.baseline.predict(path) for path in paths
        ]
    # all_path_advantages = [self.algo.extra_baseline.predict(path) for path in paths]
    for idx, path in enumerate(paths):
        path_baselines = np.append(all_path_baselines[idx], 0)
        # path_advantages = np.append(all_path_advantages[idx], 0)
        deltas = path["rewards"] + \
                 self.algo.discount * path_baselines[1:] - \
                 path_baselines[:-1]
        path["advantages"] = special.discount_cumsum(
            deltas, self.algo.discount * self.algo.gae_lambda)
        path["qvalues"] = path["advantages"] + path_baselines[:-1]
        path["returns"] = special.discount_cumsum(path["rewards"], self.algo.discount)
        # advantage_baselines.append(path_advantages[:-1])
        baselines.append(path_baselines[:-1])
        returns.append(path["returns"])

    ev = special.explained_variance_1d(np.concatenate(baselines),
                                       np.concatenate(returns))

    old_advantages_to_fit = tensor_utils.concat_tensor_list(
        [path["advantages"] for path in paths])
    logger.record_tabular("AdvantagesMean", old_advantages_to_fit.mean())

    if not self.algo.policy.recurrent:
        observations = tensor_utils.concat_tensor_list(
            [path["observations"] for path in paths])
        actions = tensor_utils.concat_tensor_list(
            [path["actions"] for path in paths])
        rewards = tensor_utils.concat_tensor_list(
            [path["rewards"] for path in paths])
        returns = tensor_utils.concat_tensor_list(
            [path["returns"] for path in paths])
        advantages = tensor_utils.concat_tensor_list(
            [path["advantages"] for path in paths])
        qvalues = tensor_utils.concat_tensor_list(
            [path["qvalues"] for path in paths])
        baselines_tensor = tensor_utils.concat_tensor_list(baselines)
        # baselines_advantage_tensor = tensor_utils.concat_tensor_list(advantage_baselines)
        env_infos = tensor_utils.concat_tensor_dict_list(
            [path["env_infos"] for path in paths])
        agent_infos = tensor_utils.concat_tensor_dict_list(
            [path["agent_infos"] for path in paths])
        etas = None

        if hasattr(self.algo, 'qprop') and self.algo.qprop:
            old_advantages = np.copy(advantages)
            old_advantages, _ = self.process_advantages(old_advantages)
            old_advantages_scale = np.abs(old_advantages).mean()
            logger.record_tabular("OldAdvantagesMSE", np.square(advantages).mean())
            logger.record_tabular("AbsLearnSignalOld", old_advantages_scale)
            logger.log("Qprop, subtracting control variate")
            advantages_bar = self.algo.get_control_variate(
                observations=observations, actions=actions)
            if self.algo.qprop_eta_option == 'ones':
                etas = np.ones_like(advantages)
            elif self.algo.qprop_eta_option == 'adapt1':  # conservative
                etas = (advantages * advantages_bar) > 0
                etas = etas.astype(advantages.dtype)
                logger.log("Qprop, etas: %d 1s, %d 0s" % (
                    (etas == 1).sum(), (etas == 0).sum()))
            elif self.algo.qprop_eta_option == 'adapt2':  # aggressive
                etas = np.sign(advantages * advantages_bar)
                etas = etas.astype(advantages.dtype)
                logger.log("Qprop, etas: %d 1s, %d -1s" % (
                    (etas == 1).sum(), (etas == -1).sum()))
            else:
                raise NotImplementedError(self.algo.qprop_eta_option)
            """
            logger.record_tabular("Before Advantages MSE", np.mean(np.square(advantages)))
            advantages -= baselines_advantage_tensor
            logger.record_tabular("After Advantages MSE", np.mean(np.square(advantages)))
            """
            advantages -= etas * advantages_bar
            logger.record_tabular("NewAdvantagesMSE", np.square(advantages).mean())
            advantages, adv_std = self.process_advantages(advantages)
            if self.algo.qprop_unbias:
                logger.log("Unbiasing Qprop estimator...")
                etas /= adv_std
            advantages_scale = np.abs(advantages).mean()
            logger.record_tabular("AbsLearnSignalNew", advantages_scale)
        else:
            advantages, _ = self.process_advantages(advantages)
            advantages_scale = np.abs(advantages).mean()
            logger.record_tabular("AbsLearnSignal", advantages_scale)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])

        undiscounted_returns = [sum(path["rewards"]) for path in paths]

        ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

        samples_data = dict(
            observations=observations,
            actions=actions,
            rewards=rewards,
            returns=returns,
            advantages=advantages,
            qvalues=qvalues,
            env_infos=env_infos,
            agent_infos=agent_infos,
            paths=paths,
            baselines=baselines_tensor,
            etas=etas,
        )
    else:
        max_path_length = max([len(path["advantages"]) for path in paths])

        # make all paths the same length (pad extra advantages with 0)
        obs = [path["observations"] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)

        if self.algo.center_adv:
            raw_adv = np.concatenate(
                [path["advantages"] for path in paths])
            adv_mean = np.mean(raw_adv)
            adv_std = np.std(raw_adv) + 1e-8
            adv = [(path["advantages"] - adv_mean) / adv_std for path in paths]
        else:
            adv = [path["advantages"] for path in paths]

        adv = np.asarray(
            [tensor_utils.pad_tensor(a, max_path_length) for a in adv])

        actions = [path["actions"] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)

        rewards = [path["rewards"] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

        returns = [path["returns"] for path in paths]
        returns = tensor_utils.pad_tensor_n(returns, max_path_length)

        agent_infos = [path["agent_infos"] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length) for p in agent_infos
        ])

        env_infos = [path["env_infos"] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos
        ])

        valids = [np.ones_like(path["returns"]) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        baselines_tensor = tensor_utils.pad_tensor_n(
            baselines, max_path_length)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])

        undiscounted_returns = [sum(path["rewards"]) for path in paths]

        ent = np.sum(
            self.algo.policy.distribution.entropy(agent_infos) * valids) / np.sum(valids)

        samples_data = dict(
            observations=obs,
            actions=actions,
            advantages=adv,
            rewards=rewards,
            returns=returns,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
            baselines=baselines_tensor,
        )

    logger.log("fitting baseline...")
    if hasattr(self.algo.baseline, 'fit_with_samples'):
        self.algo.baseline.fit_with_samples(paths, samples_data)
    else:
        self.algo.baseline.fit(paths)
    logger.log("fitted")
    # logger.log("evaluating fit baseline with another baseline...")
    # self.algo.extra_baseline.fit(old_advantages_to_fit, paths)
    # logger.log("fitted again")
    """
    logger.log("evaluating fit baseline with another baseline...")
    self.algo.extra_baseline.fit(baselines_tensor, paths)
    logger.log("fitted again")
    """

    logger.record_tabular('Iteration', itr)
    logger.record_tabular('AverageDiscountedReturn',
                          average_discounted_return)
    logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
    logger.record_tabular('ExplainedVariance', ev)
    logger.record_tabular('NumTrajs', len(paths))
    logger.record_tabular('Entropy', ent)
    logger.record_tabular('Perplexity', np.exp(ent))
    logger.record_tabular('StdReturn', np.std(undiscounted_returns))
    logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
    logger.record_tabular('MinReturn', np.min(undiscounted_returns))

    return samples_data
def train(self, sess=None):
    created_session = True if (sess is None) else False
    if sess is None:
        sess = tf.Session()
        sess.__enter__()

    sess.run(tf.global_variables_initializer())
    self.start_worker()
    start_time = time.time()
    for itr in range(self.start_itr, self.n_itr):
        itr_start_time = time.time()
        with logger.prefix('itr #%d | ' % itr):
            logger.log("Obtaining samples...")
            paths = self.obtain_samples(itr)
            logger.log("Processing samples...")
            samples_data = self.process_samples(itr, paths)
            logger.log("Logging diagnostics...")
            self.log_diagnostics(paths)
            logger.log("Optimizing policy...")
            self.optimize_policy(itr, samples_data)
            logger.log("Saving snapshot...")
            params = self.get_itr_snapshot(itr, samples_data)  # , **kwargs)
            if self.store_paths:
                params["paths"] = samples_data["paths"]
            logger.save_itr_params(itr, params)
            logger.log("Saved")
            logger.record_tabular('Time', time.time() - start_time)
            logger.record_tabular('ItrTime', time.time() - itr_start_time)
            logger.dump_tabular(with_prefix=False)
            if self.plot:
                rollout(self.env, self.policy, animated=True, max_path_length=self.max_path_length)
                if self.pause_for_plot:
                    input("Plotting evaluation run: Press Enter to "
                          "continue...")
    self.shutdown_worker()
    if created_session:
        sess.close()
def process_samples(self, itr, paths):
    baselines = []
    returns = []
    for path in paths:
        path_baselines = np.append(self.algo.baseline.predict(path), 0)
        deltas = path["rewards"] + \
                 self.algo.discount * path_baselines[1:] - \
                 path_baselines[:-1]
        path["advantages"] = special.discount_cumsum(
            deltas, self.algo.discount * self.algo.gae_lambda)
        path["returns"] = special.discount_cumsum(path["rewards"], self.algo.discount)
        baselines.append(path_baselines[:-1])
        returns.append(path["returns"])

    if not self.algo.policy.recurrent:
        observations = tensor_utils.concat_tensor_list([path["observations"] for path in paths])
        actions = tensor_utils.concat_tensor_list([path["actions"] for path in paths])
        rewards = tensor_utils.concat_tensor_list([path["rewards"] for path in paths])
        advantages = tensor_utils.concat_tensor_list([path["advantages"] for path in paths])
        env_infos = tensor_utils.concat_tensor_dict_list([path["env_infos"] for path in paths])
        agent_infos = tensor_utils.concat_tensor_dict_list([path["agent_infos"] for path in paths])

        if self.algo.center_adv:
            advantages = util.center_advantages(advantages)

        if self.algo.positive_adv:
            advantages = util.shift_advantages_to_positive(advantages)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])

        undiscounted_returns = [sum(path["rewards"]) for path in paths]

        ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

        ev = special.explained_variance_1d(
            np.concatenate(baselines),
            np.concatenate(returns)
        )

        samples_data = dict(
            observations=observations,
            actions=actions,
            rewards=rewards,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
            paths=paths,
        )
    else:
        max_path_length = max([len(path["advantages"]) for path in paths])

        # make all paths the same length (pad extra advantages with 0)
        obs = [path["observations"] for path in paths]
        obs = np.array([tensor_utils.pad_tensor(ob, max_path_length) for ob in obs])

        if self.algo.center_adv:
            raw_adv = np.concatenate([path["advantages"] for path in paths])
            adv_mean = np.mean(raw_adv)
            adv_std = np.std(raw_adv) + 1e-8
            adv = [(path["advantages"] - adv_mean) / adv_std for path in paths]
        else:
            adv = [path["advantages"] for path in paths]

        adv = np.array([tensor_utils.pad_tensor(a, max_path_length) for a in adv])

        actions = [path["actions"] for path in paths]
        actions = np.array([tensor_utils.pad_tensor(a, max_path_length) for a in actions])

        rewards = [path["rewards"] for path in paths]
        rewards = np.array([tensor_utils.pad_tensor(r, max_path_length) for r in rewards])

        agent_infos = [path["agent_infos"] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list(
            [tensor_utils.pad_tensor_dict(p, max_path_length) for p in agent_infos]
        )

        env_infos = [path["env_infos"] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list(
            [tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos]
        )

        valids = [np.ones_like(path["returns"]) for path in paths]
        valids = np.array([tensor_utils.pad_tensor(v, max_path_length) for v in valids])

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])

        undiscounted_returns = [sum(path["rewards"]) for path in paths]

        ent = np.sum(self.algo.policy.distribution.entropy(agent_infos) * valids) / np.sum(valids)

        ev = special.explained_variance_1d(
            np.concatenate(baselines),
            np.concatenate(returns)
        )

        samples_data = dict(
            observations=obs,
            actions=actions,
            advantages=adv,
            rewards=rewards,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
        )

    logger.log("fitting baseline...")
    self.algo.baseline.fit(paths)
    logger.log("fitted")

    logger.record_tabular('Iteration', itr)
    logger.record_tabular('AverageDiscountedReturn',
                          average_discounted_return)
    logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
    logger.record_tabular('ExplainedVariance', ev)
    logger.record_tabular('NumTrajs', len(paths))
    logger.record_tabular('Entropy', ent)
    logger.record_tabular('Perplexity', np.exp(ent))
    logger.record_tabular('StdReturn', np.std(undiscounted_returns))
    logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
    logger.record_tabular('MinReturn', np.min(undiscounted_returns))

    return samples_data
policy = GaussianMLPPolicy(
    name="policy",
    env_spec=env.spec,
    # The neural network policy has three hidden layers of 100, 50, and 25 hidden units.
    hidden_sizes=(100, 50, 25),
    hidden_nonlinearity=tf.nn.relu,
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

with tf.Session() as sess:
    for env_name, env in envs:
        logger.log("Training Policy on %s" % env_name)
        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=args.batch_size,
            max_path_length=env.horizon,
            n_itr=args.num_epochs,
            discount=0.99,
            step_size=args.step_size,
            optimizer=ConjugateGradientOptimizer(
                reg_coeff=args.reg_coeff,
                hvp_approach=FiniteDifferenceHvp(base_eps=args.reg_coeff))
        )
        custom_train(algo, sess=sess)
def train(self):
    # This seems like a rather sequential method
    pool = SimpleReplayPool(
        max_pool_size=self.replay_pool_size,
        observation_dim=self.env.observation_space.flat_dim,
        action_dim=self.env.action_space.flat_dim,
    )
    self.start_worker()

    self.init_opt()
    itr = 0
    path_length = 0
    path_return = 0
    terminal = False
    observation = self.env.reset()

    sample_policy = pickle.loads(pickle.dumps(self.policy))

    for epoch in range(self.n_epochs):
        logger.push_prefix('epoch #%d | ' % epoch)
        logger.log("Training started")
        for epoch_itr in pyprind.prog_bar(range(self.epoch_length)):
            # Execute policy
            if terminal:  # or path_length > self.max_path_length:
                # Note that if the last time step ends an episode, the very
                # last state and observation will be ignored and not added
                # to the replay pool
                observation = self.env.reset()
                self.es.reset()
                sample_policy.reset()
                self.es_path_returns.append(path_return)
                path_length = 0
                path_return = 0
            action = self.es.get_action(itr, observation, policy=sample_policy)  # qf=qf)

            next_observation, reward, terminal, _ = self.env.step(action)
            path_length += 1
            path_return += reward

            if not terminal and path_length >= self.max_path_length:
                terminal = True
                # only include the terminal transition in this case if the flag was set
                if self.include_horizon_terminal_transitions:
                    pool.add_sample(observation, action, reward * self.scale_reward, terminal)
            else:
                pool.add_sample(observation, action, reward * self.scale_reward, terminal)

            observation = next_observation

            if pool.size >= self.min_pool_size:
                for update_itr in range(self.n_updates_per_sample):
                    # Train policy
                    batch = pool.random_batch(self.batch_size)
                    self.do_training(itr, batch)
                sample_policy.set_param_values(self.policy.get_param_values())

            itr += 1

        logger.log("Training finished")
        if pool.size >= self.min_pool_size:
            self.evaluate(epoch, pool)
            params = self.get_epoch_snapshot(epoch)
            logger.save_itr_params(epoch, params)
        logger.dump_tabular(with_prefix=False)
        logger.pop_prefix()
        if self.plot:
            self.update_plot()
            if self.pause_for_plot:
                input("Plotting evaluation run: Press Enter to "
                      "continue...")
    self.env.terminate()
    self.policy.terminate()
def evaluate(self, epoch, pool): logger.log("Collecting samples for evaluation") paths = parallel_sampler.sample_paths( policy_params=self.policy.get_param_values(), max_samples=self.eval_samples, max_path_length=self.max_path_length, ) average_discounted_return = np.mean( [special.discount_return(path["rewards"], self.discount) for path in paths] ) returns = [sum(path["rewards"]) for path in paths] all_qs = np.concatenate(self.q_averages) all_ys = np.concatenate(self.y_averages) average_q_loss = np.mean(self.qf_loss_averages) average_policy_surr = np.mean(self.policy_surr_averages) average_action = np.mean(np.square(np.concatenate( [path["actions"] for path in paths] ))) policy_reg_param_norm = np.linalg.norm( self.policy.get_param_values(regularizable=True) ) qfun_reg_param_norm = np.linalg.norm( self.qf.get_param_values(regularizable=True) ) logger.record_tabular('Epoch', epoch) logger.record_tabular('AverageReturn', np.mean(returns)) logger.record_tabular('StdReturn', np.std(returns)) logger.record_tabular('MaxReturn', np.max(returns)) logger.record_tabular('MinReturn', np.min(returns)) if len(self.es_path_returns) > 0: logger.record_tabular('AverageEsReturn', np.mean(self.es_path_returns)) logger.record_tabular('StdEsReturn', np.std(self.es_path_returns)) logger.record_tabular('MaxEsReturn', np.max(self.es_path_returns)) logger.record_tabular('MinEsReturn', np.min(self.es_path_returns)) logger.record_tabular('AverageDiscountedReturn', average_discounted_return) logger.record_tabular('AverageQLoss', average_q_loss) logger.record_tabular('AveragePolicySurr', average_policy_surr) logger.record_tabular('AverageQ', np.mean(all_qs)) logger.record_tabular('AverageAbsQ', np.mean(np.abs(all_qs))) logger.record_tabular('AverageY', np.mean(all_ys)) logger.record_tabular('AverageAbsY', np.mean(np.abs(all_ys))) logger.record_tabular('AverageAbsQYDiff', np.mean(np.abs(all_qs - all_ys))) logger.record_tabular('AverageAction', average_action) logger.record_tabular('PolicyRegParamNorm', policy_reg_param_norm) logger.record_tabular('QFunRegParamNorm', qfun_reg_param_norm) self.env.log_diagnostics(paths) self.policy.log_diagnostics(paths) self.qf_loss_averages = [] self.policy_surr_averages = [] self.q_averages = [] self.y_averages = [] self.es_path_returns = []
def obtain_samples(self, itr):
    logger.log("Obtaining samples for iteration %d..." % itr)
    paths = []
    n_samples = 0
    obses = self.vec_env.reset()
    dones = np.asarray([True] * self.vec_env.num_envs)
    running_paths = [None] * self.vec_env.num_envs

    pbar = ProgBarCounter(self.algo.batch_size)
    policy_time = 0
    env_time = 0
    process_time = 0
    import time
    while n_samples < self.algo.batch_size:
        t = time.time()
        self.algo.policy.reset(dones)
        actions, agent_infos = self.algo.policy.get_actions(obses)
        policy_time += time.time() - t
        t = time.time()
        next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
        env_time += time.time() - t

        t = time.time()

        agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
        env_infos = tensor_utils.split_tensor_dict_list(env_infos)
        if env_infos is None:
            env_infos = [dict() for _ in range(self.vec_env.num_envs)]
        if agent_infos is None:
            agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
        for idx, observation, action, reward, env_info, agent_info, done in zip(
                itertools.count(), obses, actions, rewards, env_infos, agent_infos, dones):
            if running_paths[idx] is None:
                running_paths[idx] = dict(
                    observations=[],
                    actions=[],
                    rewards=[],
                    env_infos=[],
                    agent_infos=[],
                )
            running_paths[idx]["observations"].append(observation)
            running_paths[idx]["actions"].append(action)
            running_paths[idx]["rewards"].append(reward)
            running_paths[idx]["env_infos"].append(env_info)
            running_paths[idx]["agent_infos"].append(agent_info)
            if done:
                paths.append(dict(
                    observations=self.env_spec.observation_space.flatten_n(running_paths[idx]["observations"]),
                    actions=self.env_spec.action_space.flatten_n(running_paths[idx]["actions"]),
                    rewards=tensor_utils.stack_tensor_list(running_paths[idx]["rewards"]),
                    env_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]),
                    agent_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]),
                ))
                n_samples += len(running_paths[idx]["rewards"])
                running_paths[idx] = None
        process_time += time.time() - t
        pbar.inc(len(obses))
        obses = next_obses

    pbar.stop()

    logger.record_tabular("PolicyExecTime", policy_time)
    logger.record_tabular("EnvExecTime", env_time)
    logger.record_tabular("ProcessExecTime", process_time)

    return paths
def optimize_policy(self, itr, samples_data):
    # Init vars
    rewards = samples_data['rewards']
    actions = samples_data['actions']
    observations = samples_data['observations']

    agent_infos = samples_data["agent_infos"]
    state_info_list = [agent_infos[k] for k in self.policy.state_info_keys]
    dist_info_list = [agent_infos[k] for k in self.policy.distribution.dist_info_keys]
    if self.policy.recurrent:
        recurrent_vals = [samples_data["valids"]]
    else:
        recurrent_vals = []
    # Compute sample Bellman error.
    feat_diff = []
    for path in samples_data['paths']:
        feats = self._features(path)
        feats = np.vstack([feats, np.zeros(feats.shape[1])])
        feat_diff.append(feats[1:] - feats[:-1])
    if self.policy.recurrent:
        max_path_length = max([len(path["advantages"]) for path in samples_data["paths"]])
        # pad feature diffs
        feat_diff = np.array([tensor_utils.pad_tensor(fd, max_path_length) for fd in feat_diff])
    else:
        feat_diff = np.vstack(feat_diff)

    #################
    # Optimize dual #
    #################

    # Here we need to optimize dual through BFGS in order to obtain \eta
    # value. Initialize dual function g(\theta, v). \eta > 0
    # First eval delta_v
    f_dual = self.opt_info['f_dual']
    f_dual_grad = self.opt_info['f_dual_grad']

    # Set BFGS eval function
    def eval_dual(input):
        param_eta = input[0]
        param_v = input[1:]
        val = f_dual(*([rewards, feat_diff] + state_info_list + recurrent_vals + [param_eta, param_v]))
        return val.astype(np.float64)

    # Set BFGS gradient eval function
    def eval_dual_grad(input):
        param_eta = input[0]
        param_v = input[1:]
        grad = f_dual_grad(*([rewards, feat_diff] + state_info_list + recurrent_vals + [param_eta, param_v]))
        eta_grad = np.float(grad[0])
        v_grad = grad[1]
        return np.hstack([eta_grad, v_grad])

    # Initial BFGS parameter values.
    x0 = np.hstack([self.param_eta, self.param_v])

    # Set parameter boundaries: \eta>0, v unrestricted.
    bounds = [(-np.inf, np.inf) for _ in x0]
    bounds[0] = (0., np.inf)

    # Optimize through BFGS
    logger.log('optimizing dual')
    eta_before = x0[0]
    dual_before = eval_dual(x0)
    params_ast, _, _ = self.optimizer(
        func=eval_dual, x0=x0,
        fprime=eval_dual_grad,
        bounds=bounds,
        maxiter=self.max_opt_itr,
        disp=0
    )
    dual_after = eval_dual(params_ast)

    # Optimal values have been obtained
    self.param_eta = params_ast[0]
    self.param_v = params_ast[1:]

    ###################
    # Optimize policy #
    ###################
    cur_params = self.policy.get_param_values(trainable=True)
    f_loss = self.opt_info["f_loss"]
    f_loss_grad = self.opt_info['f_loss_grad']
    input = [rewards, observations, feat_diff,
             actions] + state_info_list + recurrent_vals + [self.param_eta, self.param_v]

    # Set loss eval function
    def eval_loss(params):
        self.policy.set_param_values(params, trainable=True)
        val = f_loss(*input)
        return val.astype(np.float64)

    # Set loss gradient eval function
    def eval_loss_grad(params):
        self.policy.set_param_values(params, trainable=True)
        grad = f_loss_grad(*input)
        flattened_grad = tensor_utils.flatten_tensors(list(map(np.asarray, grad)))
        return flattened_grad.astype(np.float64)

    loss_before = eval_loss(cur_params)
    logger.log('optimizing policy')
    params_ast, _, _ = self.optimizer(
        func=eval_loss, x0=cur_params,
        fprime=eval_loss_grad,
        disp=0,
        maxiter=self.max_opt_itr
    )
    loss_after = eval_loss(params_ast)

    f_kl = self.opt_info['f_kl']

    mean_kl = f_kl(*([observations, actions] + state_info_list + dist_info_list + recurrent_vals)).astype(
        np.float64)

    logger.log('eta %f -> %f' % (eta_before, self.param_eta))

    logger.record_tabular("LossBefore", loss_before)
    logger.record_tabular("LossAfter", loss_after)
    logger.record_tabular('DualBefore', dual_before)
    logger.record_tabular('DualAfter', dual_after)
    logger.record_tabular('MeanKL', mean_kl)
def train(self):
    with tf.Session() as sess:
        if self.load_policy is not None:
            import joblib
            self.policy = joblib.load(self.load_policy)['policy']
        self.init_opt()
        # initialize uninitialized vars (I know, it's ugly)
        uninit_vars = []
        for var in tf.all_variables():
            try:
                sess.run(var)
            except tf.errors.FailedPreconditionError:
                uninit_vars.append(var)
        sess.run(tf.initialize_variables(uninit_vars))
        # sess.run(tf.initialize_all_variables())
        self.start_worker()
        start_time = time.time()
        for itr in range(self.start_itr, self.n_itr):
            itr_start_time = time.time()
            with logger.prefix('itr #%d | ' % itr):
                logger.log("Obtaining samples...")
                paths = self.obtain_samples(itr)
                logger.log("Processing samples...")
                samples_data = self.process_samples(itr, paths)
                logger.log("Logging diagnostics...")
                self.log_diagnostics(paths)
                logger.log("Optimizing policy...")
                self.optimize_policy(itr, samples_data)
                # new_param_values = self.policy.get_variable_values(self.policy.all_params)
                logger.log("Saving snapshot...")
                params = self.get_itr_snapshot(itr, samples_data)  # , **kwargs)
                if self.store_paths:
                    params["paths"] = samples_data["paths"]
                logger.save_itr_params(itr, params)
                logger.log("Saved")
                logger.record_tabular('Time', time.time() - start_time)
                logger.record_tabular('ItrTime', time.time() - itr_start_time)

                # import pickle
                # with open('paths_itr'+str(itr)+'.pkl', 'wb') as f:
                #     pickle.dump(paths, f)

                # debugging
                """
                if itr % 1 == 0:
                    logger.log("Saving visualization of paths")
                    import matplotlib.pyplot as plt;
                    for ind in range(5):
                        plt.clf(); plt.hold(True)
                        points = paths[ind]['observations']
                        plt.plot(points[:,0], points[:,1], '-r', linewidth=2)
                        plt.xlim([-1.0, 1.0])
                        plt.ylim([-1.0, 1.0])
                        plt.legend(['path'])
                        plt.savefig('/home/cfinn/path'+str(ind)+'.png')
                """
                # end debugging

                logger.dump_tabular(with_prefix=False)
                if self.plot:
                    self.update_plot()
                    if self.pause_for_plot:
                        input("Plotting evaluation run: Press Enter to "
                              "continue...")
        self.shutdown_worker()
def optimize(self, inputs):

    inputs = tuple(inputs)

    try_penalty = np.clip(
        self._penalty, self._min_penalty, self._max_penalty)

    penalty_scale_factor = None
    f_opt = self._opt_fun["f_opt"]
    f_penalized_loss = self._opt_fun["f_penalized_loss"]

    def gen_f_opt(penalty):
        def f(flat_params):
            self._target.set_param_values(flat_params, trainable=True)
            return f_opt(*(inputs + (penalty,)))

        return f

    cur_params = self._target.get_param_values(trainable=True).astype('float64')
    opt_params = cur_params

    for penalty_itr in range(self._max_penalty_itr):
        logger.log('trying penalty=%.3f...' % try_penalty)

        itr_opt_params, _, _ = scipy.optimize.fmin_l_bfgs_b(
            func=gen_f_opt(try_penalty), x0=cur_params,
            maxiter=self._max_opt_itr
        )

        _, try_loss, try_constraint_val = f_penalized_loss(*(inputs + (try_penalty,)))

        logger.log('penalty %f => loss %f, %s %f' %
                   (try_penalty, try_loss, self._constraint_name, try_constraint_val))

        # Either constraint satisfied, or we are at the last iteration already and no alternative parameter
        # satisfies the constraint
        if try_constraint_val < self._max_constraint_val or \
                (penalty_itr == self._max_penalty_itr - 1 and opt_params is None):
            opt_params = itr_opt_params

        if not self._adapt_penalty:
            break

        # Decide scale factor on the first iteration, or if constraint violation yields numerical error
        if penalty_scale_factor is None or np.isnan(try_constraint_val):
            # Increase penalty if constraint violated, or if constraint term is NAN
            if try_constraint_val > self._max_constraint_val or np.isnan(try_constraint_val):
                penalty_scale_factor = self._increase_penalty_factor
            else:
                # Otherwise (i.e. constraint satisfied), shrink penalty
                penalty_scale_factor = self._decrease_penalty_factor
                opt_params = itr_opt_params
        else:
            if penalty_scale_factor > 1 and \
                    try_constraint_val <= self._max_constraint_val:
                break
            elif penalty_scale_factor < 1 and \
                    try_constraint_val >= self._max_constraint_val:
                break
        old_penalty = try_penalty
        try_penalty *= penalty_scale_factor
        try_penalty = np.clip(try_penalty, self._min_penalty, self._max_penalty)
        if try_penalty == old_penalty:
            break

    self._penalty = try_penalty
    self._target.set_param_values(opt_params, trainable=True)
def train(self):
    gc_dump_time = time.time()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # This seems like a rather sequential method
        pool = SimpleReplayPool(
            max_pool_size=self.replay_pool_size,
            observation_dim=self.env.observation_space.flat_dim,
            action_dim=self.env.action_space.flat_dim,
            replacement_prob=self.replacement_prob,
        )
        self.start_worker()

        self.init_opt()
        # This initializes the optimizer parameters
        sess.run(tf.global_variables_initializer())
        itr = 0
        path_length = 0
        path_return = 0
        terminal = False
        initial = False
        observation = self.env.reset()

        # with tf.variable_scope("sample_policy"):
        #     with suppress_params_loading():
        #         sample_policy = pickle.loads(pickle.dumps(self.policy))
        with tf.variable_scope("sample_policy"):
            sample_policy = Serializable.clone(self.policy)

        for epoch in range(self.n_epochs):
            logger.push_prefix('epoch #%d | ' % epoch)
            logger.log("Training started")
            train_qf_itr, train_policy_itr = 0, 0
            for epoch_itr in pyprind.prog_bar(range(self.epoch_length)):
                # Execute policy
                if terminal:  # or path_length > self.max_path_length:
                    # Note that if the last time step ends an episode, the very
                    # last state and observation will be ignored and not added
                    # to the replay pool
                    observation = self.env.reset()
                    self.es.reset()
                    sample_policy.reset()
                    self.es_path_returns.append(path_return)
                    path_length = 0
                    path_return = 0
                    initial = True
                else:
                    initial = False
                action = self.es.get_action(itr, observation, policy=sample_policy)  # qf=qf)

                next_observation, reward, terminal, _ = self.env.step(action)
                path_length += 1
                path_return += reward

                if not terminal and path_length >= self.max_path_length:
                    terminal = True
                    # only include the terminal transition in this case if the flag was set
                    if self.include_horizon_terminal_transitions:
                        pool.add_sample(observation, action, reward * self.scale_reward, terminal, initial)
                else:
                    pool.add_sample(observation, action, reward * self.scale_reward, terminal, initial)

                observation = next_observation

                if pool.size >= self.min_pool_size:
                    for update_itr in range(self.n_updates_per_sample):
                        # Train policy
                        batch = pool.random_batch(self.batch_size)
                        itrs = self.do_training(itr, batch)
                        train_qf_itr += itrs[0]
                        train_policy_itr += itrs[1]
                    sample_policy.set_param_values(self.policy.get_param_values())

                itr += 1
                if time.time() - gc_dump_time > 100:
                    gc.collect()
                    gc_dump_time = time.time()

            logger.log("Training finished")
            logger.log("Trained qf %d steps, policy %d steps" % (train_qf_itr, train_policy_itr))
            if pool.size >= self.min_pool_size:
                self.evaluate(epoch, pool)
                params = self.get_epoch_snapshot(epoch)
                logger.save_itr_params(epoch, params)
            logger.dump_tabular(with_prefix=False)
            logger.pop_prefix()
            if self.plot:
                self.update_plot()
                if self.pause_for_plot:
                    input("Plotting evaluation run: Press Enter to "
                          "continue...")
        self.env.terminate()
        self.policy.terminate()