def train_once(self, itr, paths):
    epoch = itr // self.n_samples
    i_sample = itr - epoch * self.n_samples

    tabular.record('Epoch', epoch)
    tabular.record('# Sample', i_sample)

    rtn = paths['average_return']
    self.all_returns.append(paths['average_return'])

    if (itr + 1) % self.n_samples == 0:
        avg_rtns = np.array(self.all_returns)
        self.es.tell(self.all_params, -avg_rtns)
        self.policy.set_param_values(self.es.result()[0])

        # Clear for next epoch
        rtn = max(self.all_returns)
        self.all_returns.clear()
        self.all_params = self.sample_params()

    self.cur_params = self.all_params[(i_sample + 1) % self.n_samples]
    self.policy.set_param_values(self.cur_params)

    logger.log(tabular)
    return rtn

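# A minimal, self-contained sketch (assuming the `cma` package is installed)
# of the ask/tell pattern that train_once drives above. Returns are treated
# as something to maximize, so they are negated before es.tell(), because
# CMA-ES minimizes its objective. The toy objective below is illustrative,
# not part of garage.
import cma
import numpy as np

def toy_return(params):
    # Stand-in for an episode's average return; higher is better.
    return -np.sum(np.square(params - 1.0))

es = cma.CMAEvolutionStrategy(np.zeros(5), 0.5, {'verbose': -9})
for _ in range(20):
    candidates = es.ask()                          # sample parameter vectors
    returns = np.array([toy_return(c) for c in candidates])
    es.tell(candidates, list(-returns))            # negate: maximize return
best_params = es.result[0]   # es.result()[0] in older versions of cma
print(best_params)           # close to the optimum at all-ones
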
def optimize_policy(self, itr, samples_data):
    policy_opt_input_values = self._policy_opt_input_values(samples_data)

    # Train policy network
    logger.log('Computing loss before')
    loss_before = self.optimizer.loss(policy_opt_input_values)
    logger.log('Computing KL before')
    policy_kl_before = self.f_policy_kl(*policy_opt_input_values)
    logger.log('Optimizing')
    self.optimizer.optimize(policy_opt_input_values)
    logger.log('Computing KL after')
    policy_kl = self.f_policy_kl(*policy_opt_input_values)
    logger.log('Computing loss after')
    loss_after = self.optimizer.loss(policy_opt_input_values)
    tabular.record('{}/LossBefore'.format(self.policy.name), loss_before)
    tabular.record('{}/LossAfter'.format(self.policy.name), loss_after)
    tabular.record('{}/dLoss'.format(self.policy.name),
                   loss_before - loss_after)
    tabular.record('{}/KLBefore'.format(self.policy.name),
                   policy_kl_before)
    tabular.record('{}/KL'.format(self.policy.name), policy_kl)
    pol_ent = self.f_policy_entropy(*policy_opt_input_values)
    tabular.record('{}/Entropy'.format(self.policy.name), np.mean(pol_ent))

    self._fit_baseline(samples_data)

def train_once(self, itr, paths):
    epoch = itr // self.n_samples
    i_sample = itr - epoch * self.n_samples

    tabular.record('Epoch', epoch)
    tabular.record('# Sample', i_sample)

    # -- Stage: Process path
    rtn = paths['average_return']
    self.all_returns.append(paths['average_return'])

    # -- Stage: Update policy distribution.
    if (itr + 1) % self.n_samples == 0:
        avg_rtns = np.array(self.all_returns)
        best_inds = np.argsort(-avg_rtns)[:self.n_best]
        best_params = np.array(self.all_params)[best_inds]

        # MLE of normal distribution
        self.cur_mean = best_params.mean(axis=0)
        self.cur_std = best_params.std(axis=0)
        self.policy.set_param_values(self.cur_mean)

        # Clear for next epoch
        rtn = max(self.all_returns)
        self.all_returns.clear()
        self.all_params.clear()

    # -- Stage: Generate a new policy for next path sampling
    self.cur_params = self.sample_params(itr)
    self.all_params.append(self.cur_params.copy())
    self.policy.set_param_values(self.cur_params)

    logger.log(tabular)
    return rtn

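# A self-contained numpy sketch of the cross-entropy update performed above:
# keep the top n_best parameter samples by return and refit a diagonal
# Gaussian to them (the MLE mean/std step in the code). All names and the
# quadratic objective here are illustrative only.
import numpy as np

rng = np.random.default_rng(0)
cur_mean, cur_std = np.zeros(3), np.ones(3)
n_samples, n_best = 20, 4

for _ in range(30):
    params = rng.normal(cur_mean, cur_std, size=(n_samples, 3))
    # Stand-in for per-sample average returns (maximized at all-ones).
    returns = -np.sum(np.square(params - 1.0), axis=1)
    best = params[np.argsort(-returns)[:n_best]]
    cur_mean, cur_std = best.mean(axis=0), best.std(axis=0)

print(cur_mean)   # approaches [1, 1, 1]
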
def optimize_policy(self, itr, samples_data):
    policy_opt_input_values = self._policy_opt_input_values(samples_data)

    # Train policy network
    logger.log("Computing loss before")
    loss_before = self.optimizer.loss(policy_opt_input_values)
    logger.log("Computing KL before")
    policy_kl_before = self.f_policy_kl(*policy_opt_input_values)
    logger.log("Optimizing")
    self.optimizer.optimize(policy_opt_input_values)
    logger.log("Computing KL after")
    policy_kl = self.f_policy_kl(*policy_opt_input_values)
    logger.log("Computing loss after")
    loss_after = self.optimizer.loss(policy_opt_input_values)
    tabular.record("{}/LossBefore".format(self.policy.name), loss_before)
    tabular.record("{}/LossAfter".format(self.policy.name), loss_after)
    tabular.record("{}/dLoss".format(self.policy.name),
                   loss_before - loss_after)
    tabular.record("{}/KLBefore".format(self.policy.name),
                   policy_kl_before)
    tabular.record("{}/KL".format(self.policy.name), policy_kl)
    pol_ent = self.f_policy_entropy(*policy_opt_input_values)
    tabular.record("{}/Entropy".format(self.policy.name), pol_ent)

    num_traj = self.batch_size // self.max_path_length
    actions = samples_data["actions"][:num_traj, ...]
    histogram = EmpiricalDistribution(actions)
    tabular.record("{}/Actions".format(self.policy.name), histogram)

    self._fit_baseline(samples_data)

    return self.get_itr_snapshot(itr, samples_data)

def worker_init_envs(g, alloc, scope, env):
    logger.log("initializing environment on worker %d" % g.worker_id)
    if not hasattr(g, 'parallel_vec_envs'):
        g.parallel_vec_envs = dict()
        g.parallel_vec_env_template = dict()
    g.parallel_vec_envs[scope] = [(idx, pickle.loads(pickle.dumps(env)))
                                  for idx in alloc]
    g.parallel_vec_env_template[scope] = env

def optimize(self, inputs, extra_inputs=None, callback=None):
    if not inputs:
        # Assumes that we should always sample mini-batches
        raise NotImplementedError

    f_loss = self._opt_fun['f_loss']

    if extra_inputs is None:
        extra_inputs = tuple()

    last_loss = f_loss(*(tuple(inputs) + extra_inputs))

    start_time = time.time()

    dataset = BatchDataset(inputs, self._batch_size,
                           extra_inputs=extra_inputs)

    sess = tf.get_default_session()

    for epoch in range(self._max_epochs):
        if self._verbose:
            logger.log('Epoch {}'.format(epoch))
            progbar = pyprind.ProgBar(len(inputs[0]))

        for batch in dataset.iterate(update=True):
            sess.run(self._train_op,
                     dict(list(zip(self._input_vars, batch))))
            if self._verbose:
                progbar.update(len(batch[0]))

        if self._verbose:
            if progbar.active:
                progbar.stop()

        new_loss = f_loss(*(tuple(inputs) + extra_inputs))

        if self._verbose:
            logger.log('Epoch: {} | Loss: {}'.format(epoch, new_loss))
        if self._callback or callback:
            elapsed = time.time() - start_time
            callback_args = dict(
                loss=new_loss,
                params=self._target.get_param_values(
                    trainable=True) if self._target else None,
                itr=epoch,
                elapsed=elapsed,
            )
            if self._callback:
                self._callback(callback_args)
            if callback:
                callback(**callback_args)

        if abs(last_loss - new_loss) < self._tolerance:
            break
        last_loss = new_loss

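# A small numpy-only sketch (a hypothetical helper, not garage's BatchDataset)
# of the mini-batch iteration the training loop above relies on: the inputs
# are a tuple of equally long arrays that must be shuffled and sliced
# together so corresponding entries stay aligned.
import numpy as np

def iterate_minibatches(inputs, batch_size, shuffle=True):
    n = len(inputs[0])
    order = np.random.permutation(n) if shuffle else np.arange(n)
    for start in range(0, n, batch_size):
        idx = order[start:start + batch_size]
        yield tuple(x[idx] for x in inputs)

# Usage: each batch keeps observations and actions aligned.
obs = np.arange(10).reshape(10, 1).astype(np.float64)
acts = np.arange(10).astype(np.float64)
for batch in iterate_minibatches((obs, acts), batch_size=4):
    print(batch[0].shape, batch[1].shape)
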
def populate_task(env, policy, scope=None):
    logger.log("Populating workers...")
    if singleton_pool.n_parallel > 1:
        singleton_pool.run_each(
            _worker_populate_task,
            [(pickle.dumps(env), pickle.dumps(policy), scope)] *
            singleton_pool.n_parallel)
    else:
        # avoid unnecessary copying
        g = _get_scoped_g(singleton_pool.G, scope)
        g.env = env
        g.policy = policy
    logger.log("Populated")

def train_once(self, itr, paths):
    epoch = itr // self.n_epoch_cycles

    self.episode_rewards.extend(paths['undiscounted_returns'])
    self.success_history.extend(paths['success_history'])
    last_average_return = np.mean(self.episode_rewards)
    self.log_diagnostics(paths)
    for train_itr in range(self.n_train_steps):
        if self.replay_buffer.n_transitions_stored >= self.min_buffer_size:  # noqa: E501
            self.evaluate = True
            qf_loss, y, q, policy_loss = self.optimize_policy(epoch, paths)

            self.episode_policy_losses.append(policy_loss)
            self.episode_qf_losses.append(qf_loss)
            self.epoch_ys.append(y)
            self.epoch_qs.append(q)

    if itr % self.n_epoch_cycles == 0:
        logger.log('Training finished')

        if self.evaluate:
            tabular.record('Epoch', epoch)
            tabular.record('AverageReturn', np.mean(self.episode_rewards))
            tabular.record('StdReturn', np.std(self.episode_rewards))
            tabular.record('Policy/AveragePolicyLoss',
                           np.mean(self.episode_policy_losses))
            tabular.record('QFunction/AverageQFunctionLoss',
                           np.mean(self.episode_qf_losses))
            tabular.record('QFunction/AverageQ', np.mean(self.epoch_qs))
            tabular.record('QFunction/MaxQ', np.max(self.epoch_qs))
            tabular.record('QFunction/AverageAbsQ',
                           np.mean(np.abs(self.epoch_qs)))
            tabular.record('QFunction/AverageY', np.mean(self.epoch_ys))
            tabular.record('QFunction/MaxY', np.max(self.epoch_ys))
            tabular.record('QFunction/AverageAbsY',
                           np.mean(np.abs(self.epoch_ys)))
            if self.input_include_goal:
                tabular.record('AverageSuccessRate',
                               np.mean(self.success_history))

        if not self.smooth_return:
            self.episode_rewards = []
            self.episode_policy_losses = []
            self.episode_qf_losses = []
            self.epoch_ys = []
            self.epoch_qs = []

        self.success_history.clear()
    return last_average_return

def worker_run_reset(g, flags, scope):
    if not hasattr(g, 'parallel_vec_envs'):
        logger.log("on worker %d" % g.worker_id)
        import traceback
        for line in traceback.format_stack():
            logger.log(line)
        # log the stacktrace at least
        logger.log("oops")
        for k, v in g.__dict__.items():
            logger.log(str(k) + " : " + str(v))
        assert hasattr(g, 'parallel_vec_envs')
    assert scope in g.parallel_vec_envs
    n = len(g.parallel_vec_envs[scope])
    env_template = g.parallel_vec_env_template[scope]
    obs_dim = env_template.observation_space.flat_dim
    ret_arr = np.zeros((n, obs_dim))
    ids = []
    flat_obs = []
    reset_ids = []
    for itr_idx, (idx, env) in enumerate(g.parallel_vec_envs[scope]):
        flag = flags[idx]
        if flag:
            flat_obs.append(env.reset())
            reset_ids.append(itr_idx)
        ids.append(idx)
    if reset_ids:
        ret_arr[reset_ids] = env_template.observation_space.flatten_n(
            flat_obs)
    return ids, ret_arr

def test_polopt_algo(self, algo_cls, env_cls, policy_cls, baseline_cls):
    logger.log('Testing {}, {}, {}'.format(algo_cls.__name__,
                                           env_cls.__name__,
                                           policy_cls.__name__))
    env = GarageEnv(env_cls())
    policy = policy_cls(env_spec=env)
    baseline = baseline_cls(env_spec=env)
    algo = algo_cls(
        env=env,
        policy=policy,
        baseline=baseline,
        **(algo_args.get(algo_cls, dict())))
    algo.train()
    assert not np.any(np.isnan(policy.get_param_values()))
    env.close()

def obtain_samples(self, itr, batch_size):
    """Obtain one batch of samples.

    Args:
        itr: Index of iteration (epoch).
        batch_size: Number of steps in batch.
            This is a hint that the sampler may or may not respect.

    Returns:
        One batch of samples.

    """
    if self.n_epoch_cycles == 1:
        logger.log('Obtaining samples...')
    return self.sampler.obtain_samples(itr, batch_size)

def save_snapshot(self, itr, paths=None):
    """Save snapshot of current batch.

    Args:
        itr: Index of iteration (epoch).
        paths: Batch of samples after preprocessed.

    """
    assert self.has_setup

    logger.log("Saving snapshot...")
    params = self.algo.get_itr_snapshot(itr)
    params['env'] = self.env
    if paths:
        params['paths'] = paths
    snapshotter.save_itr_params(itr, params)
    logger.log('Saved')

def setup(self, algo, env, sampler_cls=None, sampler_args=None):
    """Set up runner for algorithm and environment.

    This method saves algo and env within runner and creates a sampler.

    Note:
        After setup() is called all variables in session should have been
        initialized. setup() respects existing values in session so
        policy weights can be loaded before setup().

    Args:
        algo: An algorithm instance.
        env: An environment instance.
        sampler_cls: A sampler class.
        sampler_args: Arguments to be passed to sampler constructor.

    """
    self.algo = algo
    self.env = env
    self.policy = self.algo.policy

    if sampler_args is None:
        sampler_args = {}

    if sampler_cls is None:
        from garage.tf.algos.batch_polopt import BatchPolopt
        if isinstance(algo, BatchPolopt):
            if self.policy.vectorized:
                from garage.tf.samplers import OnPolicyVectorizedSampler
                sampler_cls = OnPolicyVectorizedSampler
            else:
                from garage.tf.samplers import BatchSampler
                sampler_cls = BatchSampler
        else:
            from garage.tf.samplers import OffPolicyVectorizedSampler
            sampler_cls = OffPolicyVectorizedSampler

    self.sampler = sampler_cls(algo, env, **sampler_args)

    self.initialize_tf_vars()
    logger.log(self.sess.graph)
    self.has_setup = True

def train(self):
    plotter = Plotter()
    if self.plot:
        plotter.init_plot(self.env, self.policy)
    self.start_worker()
    self.init_opt()
    for itr in range(self.current_itr, self.n_itr):
        with logger.prefix('itr #{} | '.format(itr)):
            paths = self.sampler.obtain_samples(itr)
            samples_data = self.sampler.process_samples(itr, paths)
            self.log_diagnostics(paths)
            self.optimize_policy(itr, samples_data)
            logger.log('Saving snapshot...')
            params = self.get_itr_snapshot(itr, samples_data)
            self.current_itr = itr + 1
            params['algo'] = self
            if self.store_paths:
                params['paths'] = samples_data['paths']
            snapshotter.save_itr_params(itr, params)
            logger.log('saved')
            logger.log(tabular)
            if self.plot:
                plotter.update_plot(self.policy, self.max_path_length)
                if self.pause_for_plot:
                    input('Plotting evaluation run: Press Enter to '
                          'continue...')
    plotter.close()
    self.shutdown_worker()

def _fit_baseline(self, samples_data):
    """Update baselines from samples."""
    policy_opt_input_values = self._policy_opt_input_values(samples_data)

    # Augment reward from baselines
    rewards_tensor = self.f_rewards(*policy_opt_input_values)
    returns_tensor = self.f_returns(*policy_opt_input_values)
    returns_tensor = np.squeeze(returns_tensor, -1)

    paths = samples_data['paths']
    valids = samples_data['valids']
    baselines = [path['baselines'] for path in paths]

    # Recompute parts of samples_data
    aug_rewards = []
    aug_returns = []
    for rew, ret, val, path in zip(rewards_tensor, returns_tensor, valids,
                                   paths):
        path['rewards'] = rew[val.astype(np.bool)]
        path['returns'] = ret[val.astype(np.bool)]
        aug_rewards.append(path['rewards'])
        aug_returns.append(path['returns'])
    aug_rewards = tensor_utils.concat_tensor_list(aug_rewards)
    aug_returns = tensor_utils.concat_tensor_list(aug_returns)
    samples_data['rewards'] = aug_rewards
    samples_data['returns'] = aug_returns

    # Calculate explained variance
    ev = special.explained_variance_1d(np.concatenate(baselines),
                                       aug_returns)
    tabular.record('{}/ExplainedVariance'.format(self.baseline.name), ev)

    # Fit baseline
    logger.log('Fitting baseline...')
    if hasattr(self.baseline, 'fit_with_samples'):
        self.baseline.fit_with_samples(paths, samples_data)
    else:
        self.baseline.fit(paths)

def log_diagnostics(self, pause_for_plot=False):
    """Log diagnostics.

    Args:
        pause_for_plot: Pause for plot.

    """
    logger.log('Time %.2f s' % (time.time() - self.start_time))
    logger.log('EpochTime %.2f s' % (time.time() - self.itr_start_time))
    logger.log(tabular)
    if self.plot:
        self.plotter.update_plot(self.policy, self.algo.max_path_length)
        if pause_for_plot:
            input('Plotting evaluation run: Press Enter to '
                  'continue...')

def train(self):
    address = ('localhost', 6000)
    conn = Client(address)
    try:
        plotter = Plotter()
        if self.plot:
            plotter.init_plot(self.env, self.policy)
        conn.send(ExpLifecycle.START)
        self.start_worker()
        self.init_opt()
        for itr in range(self.current_itr, self.n_itr):
            with logger.prefix('itr #{} | '.format(itr)):
                conn.send(ExpLifecycle.OBTAIN_SAMPLES)
                paths = self.sampler.obtain_samples(itr)
                conn.send(ExpLifecycle.PROCESS_SAMPLES)
                samples_data = self.sampler.process_samples(itr, paths)
                self.log_diagnostics(paths)
                conn.send(ExpLifecycle.OPTIMIZE_POLICY)
                self.optimize_policy(itr, samples_data)
                logger.log('saving snapshot...')
                params = self.get_itr_snapshot(itr, samples_data)
                self.current_itr = itr + 1
                params['algo'] = self
                if self.store_paths:
                    params['paths'] = samples_data['paths']
                snapshotter.save_itr_params(itr, params)
                logger.log('saved')
                logger.log(tabular)
                if self.plot:
                    conn.send(ExpLifecycle.UPDATE_PLOT)
                    plotter.update_plot(self.policy, self.max_path_length)
                    if self.pause_for_plot:
                        input('Plotting evaluation run: Press Enter to '
                              'continue...')

        conn.send(ExpLifecycle.SHUTDOWN)
        plotter.close()
        self.shutdown_worker()
    finally:
        conn.close()

def _worker_set_seed(_, seed):
    logger.log("Setting seed to %d" % seed)
    deterministic.set_seed(seed)

def optimize(self,
             inputs,
             extra_inputs=None,
             subsample_grouped_inputs=None,
             name=None):
    with tf.name_scope(
            name, 'optimize',
            values=[inputs, extra_inputs, subsample_grouped_inputs]):
        prev_param = np.copy(self._target.get_param_values(trainable=True))
        inputs = tuple(inputs)
        if extra_inputs is None:
            extra_inputs = tuple()

        if self._subsample_factor < 1:
            if subsample_grouped_inputs is None:
                subsample_grouped_inputs = [inputs]
            subsample_inputs = tuple()
            for inputs_grouped in subsample_grouped_inputs:
                n_samples = len(inputs_grouped[0])
                inds = np.random.choice(
                    n_samples,
                    int(n_samples * self._subsample_factor),
                    replace=False)
                subsample_inputs += tuple(
                    [x[inds] for x in inputs_grouped])
        else:
            subsample_inputs = inputs

        logger.log(('Start CG optimization: '
                    '#parameters: %d, #inputs: %d, #subsample_inputs: %d') %
                   (len(prev_param), len(inputs[0]),
                    len(subsample_inputs[0])))

        logger.log('computing loss before')
        loss_before = sliced_fun(self._opt_fun['f_loss'],
                                 self._num_slices)(inputs, extra_inputs)

        logger.log('performing update')

        logger.log('computing gradient')
        flat_g = sliced_fun(self._opt_fun['f_grad'],
                            self._num_slices)(inputs, extra_inputs)
        logger.log('gradient computed')

        logger.log('computing descent direction')
        hx = self._hvp_approach.build_eval(subsample_inputs + extra_inputs)

        descent_direction = krylov.cg(hx, flat_g, cg_iters=self._cg_iters)

        initial_step_size = np.sqrt(
            2.0 * self._max_constraint_val *
            (1. / (descent_direction.dot(hx(descent_direction)) + 1e-8)))
        if np.isnan(initial_step_size):
            initial_step_size = 1.
        flat_descent_step = initial_step_size * descent_direction

        logger.log('descent direction computed')

        n_iter = 0
        for n_iter, ratio in enumerate(self._backtrack_ratio**np.arange(
                self._max_backtracks)):  # yapf: disable
            cur_step = ratio * flat_descent_step
            cur_param = prev_param - cur_step
            self._target.set_param_values(cur_param, trainable=True)
            loss, constraint_val = sliced_fun(
                self._opt_fun['f_loss_constraint'],
                self._num_slices)(inputs, extra_inputs)
            if self._debug_nan and np.isnan(constraint_val):
                break
            if loss < loss_before and \
                    constraint_val <= self._max_constraint_val:
                break
        if ((np.isnan(loss) or np.isnan(constraint_val)
             or loss >= loss_before
             or constraint_val >= self._max_constraint_val)
                and not self._accept_violation):
            logger.log('Line search condition violated. Rejecting the step!')
            if np.isnan(loss):
                logger.log('Violated because loss is NaN')
            if np.isnan(constraint_val):
                logger.log('Violated because constraint %s is NaN' %
                           self._constraint_name)
            if loss >= loss_before:
                logger.log('Violated because loss not improving')
            if constraint_val >= self._max_constraint_val:
                logger.log('Violated because constraint %s is violated' %
                           self._constraint_name)
            self._target.set_param_values(prev_param, trainable=True)
        logger.log('backtrack iters: %d' % n_iter)
        logger.log('computing loss after')
        logger.log('optimization finished')

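# A minimal numpy sketch of the conjugate-gradient solve that krylov.cg
# performs above: it approximately solves H x = g using only Hessian-vector
# products (hx is a closure, as in the optimizer). This is an illustrative
# re-implementation, not garage's krylov module.
import numpy as np

def cg(hx, g, cg_iters=10, residual_tol=1e-10):
    x = np.zeros_like(g)
    r = g.copy()        # residual g - H x (x starts at zero)
    p = r.copy()        # search direction
    rdotr = r.dot(r)
    for _ in range(cg_iters):
        z = hx(p)
        alpha = rdotr / (p.dot(z) + 1e-8)
        x += alpha * p
        r -= alpha * z
        new_rdotr = r.dot(r)
        if new_rdotr < residual_tol:
            break
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
    return x

# Toy usage with an explicit SPD matrix standing in for the Hessian.
A = np.array([[4.0, 1.0], [1.0, 3.0]])
g = np.array([1.0, 2.0])
print(cg(lambda v: A.dot(v), g))   # close to np.linalg.solve(A, g)
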
def train_once(self, itr, paths):
    self.log_diagnostics(paths)
    logger.log('Optimizing policy...')
    self.optimize_policy(itr, paths)
    return paths['average_return']

def log_diagnostics(self, paths):
    logger.log('Logging diagnostics...')
    self.policy.log_diagnostics(paths)
    self.baseline.log_diagnostics(paths)

def train(self):
    parallel_sampler.populate_task(self.env, self.policy)
    if self.plot:
        self.plotter.init_plot(self.env, self.policy)

    cur_std = self.init_std
    cur_mean = self.policy.get_param_values()
    # K = cur_mean.size
    n_best = max(1, int(self.n_samples * self.best_frac))

    for itr in range(self.n_itr):
        # sample around the current distribution
        extra_var_mult = max(1.0 - itr / self.extra_decay_time, 0)
        sample_std = np.sqrt(
            np.square(cur_std) +
            np.square(self.extra_std) * extra_var_mult)
        if self.batch_size is None:
            criterion = 'paths'
            threshold = self.n_samples
        else:
            criterion = 'samples'
            threshold = self.batch_size
        infos = stateful_pool.singleton_pool.run_collect(
            _worker_rollout_policy,
            threshold=threshold,
            args=(dict(
                cur_mean=cur_mean,
                sample_std=sample_std,
                max_path_length=self.max_path_length,
                discount=self.discount,
                criterion=criterion,
                n_evals=self.n_evals), ))
        xs = np.asarray([info[0] for info in infos])
        paths = [info[1] for info in infos]

        fs = np.array([path['returns'][0] for path in paths])
        print((xs.shape, fs.shape))
        best_inds = (-fs).argsort()[:n_best]
        best_xs = xs[best_inds]
        cur_mean = best_xs.mean(axis=0)
        cur_std = best_xs.std(axis=0)
        best_x = best_xs[0]
        logger.push_prefix('itr #{} | '.format(itr))
        tabular.record('Iteration', itr)
        tabular.record('CurStdMean', np.mean(cur_std))
        undiscounted_returns = np.array(
            [path['undiscounted_return'] for path in paths])
        tabular.record('AverageReturn', np.mean(undiscounted_returns))
        tabular.record('StdReturn', np.std(undiscounted_returns))
        tabular.record('MaxReturn', np.max(undiscounted_returns))
        tabular.record('MinReturn', np.min(undiscounted_returns))
        tabular.record('AverageDiscountedReturn', np.mean(fs))
        tabular.record('NumTrajs', len(paths))
        # flatten paths for the case n_evals > 1
        paths = list(chain(*[d['full_paths'] for d in paths]))
        tabular.record('AvgTrajLen',
                       np.mean([len(path['returns']) for path in paths]))
        self.policy.set_param_values(best_x)
        self.policy.log_diagnostics(paths)
        snapshotter.save_itr_params(
            itr,
            dict(
                itr=itr,
                policy=self.policy,
                env=self.env,
                cur_mean=cur_mean,
                cur_std=cur_std,
            ))
        logger.log(tabular)
        logger.pop_prefix()
        if self.plot:
            self.plotter.update_plot(self.policy, self.max_path_length)
    parallel_sampler.terminate_task()
    self.plotter.close()

def train(self):
    cur_std = self.sigma0
    cur_mean = self.policy.get_param_values()
    es = cma.CMAEvolutionStrategy(cur_mean, cur_std)

    parallel_sampler.populate_task(self.env, self.policy)
    if self.plot:
        self.plotter.init_plot(self.env, self.policy)

    cur_std = self.sigma0
    cur_mean = self.policy.get_param_values()

    itr = 0
    while itr < self.n_itr and not es.stop():
        if self.batch_size is None:
            # Sample from multivariate normal distribution.
            xs = es.ask()
            xs = np.asarray(xs)
            # For each sample, do a rollout.
            infos = (stateful_pool.singleton_pool.run_map(
                sample_return,
                [(x, self.max_path_length, self.discount) for x in xs]))
        else:
            cum_len = 0
            infos = []
            xss = []
            done = False
            while not done:
                sbs = stateful_pool.singleton_pool.n_parallel * 2
                # Sample from multivariate normal distribution.
                # You want to ask for sbs samples here.
                xs = es.ask(sbs)
                xs = np.asarray(xs)
                xss.append(xs)
                sinfos = stateful_pool.singleton_pool.run_map(
                    sample_return,
                    [(x, self.max_path_length, self.discount) for x in xs])
                for info in sinfos:
                    infos.append(info)
                    cum_len += len(info['returns'])
                    if cum_len >= self.batch_size:
                        xs = np.concatenate(xss)
                        done = True
                        break

        # Evaluate fitness of samples (negated, as this is a minimization
        # problem).
        fs = -np.array([info['returns'][0] for info in infos])
        # When batching, we could have generated more samples than were
        # actually evaluated, so cut off the excess here.
        xs = xs[:len(fs)]

        # Update CMA-ES params based on sample fitness.
        es.tell(xs, fs)

        logger.push_prefix('itr #{} | '.format(itr))
        tabular.record('Iteration', itr)
        tabular.record('CurStdMean', np.mean(cur_std))
        undiscounted_returns = np.array(
            [info['undiscounted_return'] for info in infos])
        tabular.record('AverageReturn', np.mean(undiscounted_returns))
        tabular.record('StdReturn', np.std(undiscounted_returns))
        tabular.record('MaxReturn', np.max(undiscounted_returns))
        tabular.record('MinReturn', np.min(undiscounted_returns))
        tabular.record('AverageDiscountedReturn', np.mean(fs))
        tabular.record('AvgTrajLen',
                       np.mean([len(info['returns']) for info in infos]))

        self.policy.log_diagnostics(infos)
        snapshotter.save_itr_params(
            itr, dict(
                itr=itr,
                policy=self.policy,
                env=self.env,
            ))
        logger.log(tabular)
        if self.plot:
            self.plotter.update_plot(self.policy, self.max_path_length)
        logger.pop_prefix()
        # Update iteration.
        itr += 1

    # Set final params.
    self.policy.set_param_values(es.result()[0])
    parallel_sampler.terminate_task()
    self.plotter.close()

def train(self, sess=None):
    address = ("localhost", 6000)
    conn = Client(address)
    last_average_return = None
    try:
        created_session = True if (sess is None) else False
        if sess is None:
            sess = tf.Session()
            sess.__enter__()

        sess.run(tf.global_variables_initializer())
        conn.send(ExpLifecycle.START)
        self.start_worker(sess)
        start_time = time.time()

        for itr in range(self.start_itr, self.n_itr):
            itr_start_time = time.time()
            with logger.prefix('itr #%d | ' % itr):
                logger.log("Obtaining samples...")
                conn.send(ExpLifecycle.OBTAIN_SAMPLES)
                paths = self.obtain_samples(itr)
                logger.log("Processing samples...")
                conn.send(ExpLifecycle.PROCESS_SAMPLES)
                samples_data = self.process_samples(itr, paths)
                last_average_return = samples_data["average_return"]
                logger.log("Logging diagnostics...")
                self.log_diagnostics(paths)
                logger.log("Optimizing policy...")
                conn.send(ExpLifecycle.OPTIMIZE_POLICY)
                self.optimize_policy(itr, samples_data)
                logger.log("Saving snapshot...")
                params = self.get_itr_snapshot(itr)
                if self.store_paths:
                    params["paths"] = samples_data["paths"]
                snapshotter.save_itr_params(itr, params)
                logger.log("Saved")
                tabular.record('Time', time.time() - start_time)
                tabular.record('ItrTime', time.time() - itr_start_time)
                logger.log(tabular)
                if self.plot:
                    conn.send(ExpLifecycle.UPDATE_PLOT)
                    self.plotter.update_plot(self.policy,
                                             self.max_path_length)
                    if self.pause_for_plot:
                        input("Plotting evaluation run: Press Enter to "
                              "continue...")

        conn.send(ExpLifecycle.SHUTDOWN)
        self.shutdown_worker()
        if created_session:
            sess.close()
    finally:
        conn.close()

    return last_average_return

def process_samples(self, itr, paths):
    baselines = []
    returns = []

    if hasattr(self.algo.baseline, "predict_n"):
        all_path_baselines = self.algo.baseline.predict_n(paths)
    else:
        all_path_baselines = [
            self.algo.baseline.predict(path) for path in paths
        ]

    for idx, path in enumerate(paths):
        path_baselines = np.append(all_path_baselines[idx], 0)
        deltas = path["rewards"] + \
            self.algo.discount * path_baselines[1:] - \
            path_baselines[:-1]
        path["advantages"] = special.discount_cumsum(
            deltas, self.algo.discount * self.algo.gae_lambda)
        path["returns"] = special.discount_cumsum(path["rewards"],
                                                  self.algo.discount)
        baselines.append(path_baselines[:-1])
        returns.append(path["returns"])

    ev = special.explained_variance_1d(np.concatenate(baselines),
                                       np.concatenate(returns))

    if not self.algo.policy.recurrent:
        observations = tensor_utils.concat_tensor_list(
            [path["observations"] for path in paths])
        actions = tensor_utils.concat_tensor_list(
            [path["actions"] for path in paths])
        rewards = tensor_utils.concat_tensor_list(
            [path["rewards"] for path in paths])
        returns = tensor_utils.concat_tensor_list(
            [path["returns"] for path in paths])
        advantages = tensor_utils.concat_tensor_list(
            [path["advantages"] for path in paths])
        env_infos = tensor_utils.concat_tensor_dict_list(
            [path["env_infos"] for path in paths])
        agent_infos = tensor_utils.concat_tensor_dict_list(
            [path["agent_infos"] for path in paths])

        if self.algo.center_adv:
            advantages = utils.center_advantages(advantages)

        if self.algo.positive_adv:
            advantages = utils.shift_advantages_to_positive(advantages)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])

        undiscounted_returns = [sum(path["rewards"]) for path in paths]

        ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

        samples_data = dict(
            observations=observations,
            actions=actions,
            rewards=rewards,
            returns=returns,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
            paths=paths,
        )
    else:
        max_path_length = max([len(path["advantages"]) for path in paths])

        # make all paths the same length (pad extra advantages with 0)
        obs = [path["observations"] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)

        if self.algo.center_adv:
            raw_adv = np.concatenate(
                [path["advantages"] for path in paths])
            adv_mean = np.mean(raw_adv)
            adv_std = np.std(raw_adv) + 1e-8
            adv = [(path["advantages"] - adv_mean) / adv_std
                   for path in paths]
        else:
            adv = [path["advantages"] for path in paths]

        adv = np.asarray(
            [tensor_utils.pad_tensor(a, max_path_length) for a in adv])

        actions = [path["actions"] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)

        rewards = [path["rewards"] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

        returns = [path["returns"] for path in paths]
        returns = tensor_utils.pad_tensor_n(returns, max_path_length)

        agent_infos = [path["agent_infos"] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in agent_infos
        ])

        env_infos = [path["env_infos"] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in env_infos
        ])

        valids = [np.ones_like(path["returns"]) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])

        undiscounted_returns = [sum(path["rewards"]) for path in paths]

        ent = np.sum(
            self.algo.policy.distribution.entropy(agent_infos) *
            valids) / np.sum(valids)

        samples_data = dict(
            observations=obs,
            actions=actions,
            advantages=adv,
            rewards=rewards,
            returns=returns,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
        )

    logger.log("fitting baseline...")
    if hasattr(self.algo.baseline, 'fit_with_samples'):
        self.algo.baseline.fit_with_samples(paths, samples_data)
    else:
        self.algo.baseline.fit(paths)
    logger.log("fitted")

    tabular.record('Iteration', itr)
    tabular.record('AverageDiscountedReturn', average_discounted_return)
    tabular.record('AverageReturn', np.mean(undiscounted_returns))
    tabular.record('ExplainedVariance', ev)
    tabular.record('NumTrajs', len(paths))
    tabular.record('Entropy', ent)
    tabular.record('Perplexity', np.exp(ent))
    tabular.record('StdReturn', np.std(undiscounted_returns))
    tabular.record('MaxReturn', np.max(undiscounted_returns))
    tabular.record('MinReturn', np.min(undiscounted_returns))

    return samples_data

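# A short, self-contained sketch of the discounted cumulative sum used above
# for returns and GAE advantages. This mirrors the standard lfilter trick
# behind special.discount_cumsum; only scipy and numpy are assumed.
import numpy as np
from scipy import signal

def discount_cumsum(x, discount):
    # y[t] = x[t] + discount * x[t+1] + discount**2 * x[t+2] + ...
    return signal.lfilter([1], [1, -discount], x[::-1], axis=0)[::-1]

rewards = np.array([1.0, 0.0, 2.0])
print(discount_cumsum(rewards, 0.99))   # [2.9602, 1.98, 2.0]
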
def optimize(self, inputs, name=None):
    with tf.name_scope(name, "optimize", values=[inputs]):
        inputs = tuple(inputs)

        try_penalty = np.clip(self._penalty, self._min_penalty,
                              self._max_penalty)

        penalty_scale_factor = None
        f_opt = self._opt_fun["f_opt"]
        f_penalized_loss = self._opt_fun["f_penalized_loss"]

        def gen_f_opt(penalty):
            def f(flat_params):
                self._target.set_param_values(flat_params, trainable=True)
                return f_opt(*(inputs + (penalty, )))

            return f

        cur_params = self._target.get_param_values(
            trainable=True).astype('float64')
        opt_params = cur_params

        for penalty_itr in range(self._max_penalty_itr):
            logger.log('trying penalty=%.3f...' % try_penalty)

            itr_opt_params, _, _ = scipy.optimize.fmin_l_bfgs_b(
                func=gen_f_opt(try_penalty),
                x0=cur_params,
                maxiter=self._max_opt_itr)

            _, try_loss, try_constraint_val = f_penalized_loss(
                *(inputs + (try_penalty, )))

            logger.log('penalty %f => loss %f, %s %f' %
                       (try_penalty, try_loss, self._constraint_name,
                        try_constraint_val))

            # Either constraint satisfied, or we are at the last iteration
            # already and no alternative parameter satisfies the constraint
            if try_constraint_val < self._max_constraint_val or \
                    (penalty_itr == self._max_penalty_itr - 1
                     and opt_params is None):
                opt_params = itr_opt_params

            if not self._adapt_penalty:
                break

            # Decide scale factor on the first iteration, or if constraint
            # violation yields numerical error
            if (penalty_scale_factor is None
                    or np.isnan(try_constraint_val)):
                # Increase penalty if constraint violated, or if constraint
                # term is NAN
                if (try_constraint_val > self._max_constraint_val
                        or np.isnan(try_constraint_val)):
                    penalty_scale_factor = self._increase_penalty_factor
                else:
                    # Otherwise (i.e. constraint satisfied), shrink penalty
                    penalty_scale_factor = self._decrease_penalty_factor
                    opt_params = itr_opt_params
            else:
                if (penalty_scale_factor > 1
                        and try_constraint_val <=
                        self._max_constraint_val):
                    break
                elif (penalty_scale_factor < 1
                      and try_constraint_val >=
                      self._max_constraint_val):
                    break
            try_penalty *= penalty_scale_factor
            try_penalty = np.clip(try_penalty, self._min_penalty,
                                  self._max_penalty)
            self._penalty = try_penalty

        self._target.set_param_values(opt_params, trainable=True)

filename = str(uuid.uuid4())

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('file', type=str, help='path to the snapshot file')
    parser.add_argument('--log_dir', type=str, default=None,
                        help='path to the new log directory')
    # Look for params.json file
    args = parser.parse_args()
    parent_dir = os.path.dirname(os.path.realpath(args.file))
    json_file_path = os.path.join(parent_dir, 'params.json')
    logger.log('Looking for params.json at {}...'.format(json_file_path))
    try:
        with open(json_file_path, 'r') as f:
            params = json.load(f)
        # exclude certain parameters
        excluded = ['json_args']
        for k in excluded:
            if k in params:
                del params[k]
        for k, v in list(params.items()):
            if v is None:
                del params[k]
        if args.log_dir is not None:
            params['log_dir'] = args.log_dir
        params['resume_from'] = args.file
        command = to_local_command(

def optimize_policy(self, itr, samples_data):
    """Perform the policy optimization."""
    # Initial BFGS parameter values.
    x0 = np.hstack([self.param_eta, self.param_v])
    # Set parameter boundaries: \eta >= 1e-12, v unrestricted.
    bounds = [(-np.inf, np.inf) for _ in x0]
    bounds[0] = (1e-12, np.inf)

    # Optimize dual
    eta_before = self.param_eta
    logger.log('Computing dual before')
    self.feat_diff = self._features(samples_data)
    dual_opt_input_values = self._dual_opt_input_values(samples_data)
    dual_before = self.f_dual(*dual_opt_input_values)
    logger.log('Optimizing dual')

    def eval_dual(x):
        self.param_eta = x[0]
        self.param_v = x[1:]
        dual_opt_input_values = self._dual_opt_input_values(samples_data)
        return self.f_dual(*dual_opt_input_values)

    def eval_dual_grad(x):
        self.param_eta = x[0]
        self.param_v = x[1:]
        dual_opt_input_values = self._dual_opt_input_values(samples_data)
        grad = self.f_dual_grad(*dual_opt_input_values)
        eta_grad = np.float(grad[0])
        v_grad = grad[1]
        return np.hstack([eta_grad, v_grad])

    params_ast, _, _ = self.dual_optimizer(
        func=eval_dual,
        x0=x0,
        fprime=eval_dual_grad,
        bounds=bounds,
        **self.dual_optimizer_args,
    )

    logger.log('Computing dual after')
    self.param_eta, self.param_v = params_ast[0], params_ast[1:]
    dual_opt_input_values = self._dual_opt_input_values(samples_data)
    dual_after = self.f_dual(*dual_opt_input_values)

    # Optimize policy
    policy_opt_input_values = self._policy_opt_input_values(samples_data)
    logger.log('Computing policy loss before')
    loss_before = self.optimizer.loss(policy_opt_input_values)
    logger.log('Computing policy KL before')
    policy_kl_before = self.f_policy_kl(*policy_opt_input_values)
    logger.log('Optimizing policy')
    self.optimizer.optimize(policy_opt_input_values)
    logger.log('Computing policy KL')
    policy_kl = self.f_policy_kl(*policy_opt_input_values)
    logger.log('Computing policy loss after')
    loss_after = self.optimizer.loss(policy_opt_input_values)

    tabular.record('EtaBefore', eta_before)
    tabular.record('EtaAfter', self.param_eta)
    tabular.record('DualBefore', dual_before)
    tabular.record('DualAfter', dual_after)
    tabular.record('{}/LossBefore'.format(self.policy.name), loss_before)
    tabular.record('{}/LossAfter'.format(self.policy.name), loss_after)
    tabular.record('{}/dLoss'.format(self.policy.name),
                   loss_before - loss_after)
    tabular.record('{}/KLBefore'.format(self.policy.name),
                   policy_kl_before)
    tabular.record('{}/KL'.format(self.policy.name), policy_kl)