def train_once(self, itr, paths):
    epoch = itr // self.n_samples
    i_sample = itr - epoch * self.n_samples

    tabular.record('Epoch', epoch)
    tabular.record('# Sample', i_sample)

    rtn = paths['average_return']
    self.all_returns.append(paths['average_return'])

    if (itr + 1) % self.n_samples == 0:
        avg_rtns = np.array(self.all_returns)
        self.es.tell(self.all_params, -avg_rtns)
        self.policy.set_param_values(self.es.result()[0])

        # Clear for next epoch
        rtn = max(self.all_returns)
        self.all_returns.clear()
        self.all_params = self.sample_params()

    self.cur_params = self.all_params[(i_sample + 1) % self.n_samples]
    self.policy.set_param_values(self.cur_params)

    logger.log(tabular)
    return rtn
def train_once(self, itr, paths):
    epoch = itr // self.n_samples
    i_sample = itr - epoch * self.n_samples

    tabular.record('Epoch', epoch)
    tabular.record('# Sample', i_sample)

    # -- Stage: Process path
    rtn = paths['average_return']
    self.all_returns.append(paths['average_return'])

    # -- Stage: Update policy distribution.
    if (itr + 1) % self.n_samples == 0:
        avg_rtns = np.array(self.all_returns)
        best_inds = np.argsort(-avg_rtns)[:self.n_best]
        best_params = np.array(self.all_params)[best_inds]

        # MLE of normal distribution
        self.cur_mean = best_params.mean(axis=0)
        self.cur_std = best_params.std(axis=0)
        self.policy.set_param_values(self.cur_mean)

        # Clear for next epoch
        rtn = max(self.all_returns)
        self.all_returns.clear()
        self.all_params.clear()

    # -- Stage: Generate a new policy for next path sampling
    self.cur_params = self.sample_params(itr)
    self.all_params.append(self.cur_params.copy())
    self.policy.set_param_values(self.cur_params)

    logger.log(tabular)
    return rtn
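The elite-selection and Gaussian-MLE step above is the core of the cross-entropy method. A minimal, self-contained sketch of that update on a toy quadratic objective; all names here are illustrative and not part of the class above:

import numpy as np

def cem_minimal(f, dim, n_samples=50, n_best=10, n_itr=100, seed=0):
    # Iteratively fit a diagonal Gaussian to the best-performing samples.
    rng = np.random.RandomState(seed)
    mean, std = np.zeros(dim), np.ones(dim)
    for _ in range(n_itr):
        params = rng.normal(mean, std, size=(n_samples, dim))
        returns = np.array([f(p) for p in params])
        elites = params[np.argsort(-returns)[:n_best]]
        # MLE of a diagonal normal over the elite set
        mean, std = elites.mean(axis=0), elites.std(axis=0) + 1e-6
    return mean

# Example: maximize -||x||^2 (optimum at the origin).
print(cem_minimal(lambda x: -np.sum(x ** 2), dim=5))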
def train(self, sess=None):
    address = ("localhost", 6000)
    conn = Client(address)
    last_average_return = None
    try:
        created_session = sess is None
        if sess is None:
            sess = tf.Session()
            sess.__enter__()

        sess.run(tf.global_variables_initializer())
        conn.send(ExpLifecycle.START)
        self.start_worker(sess)
        start_time = time.time()
        for itr in range(self.start_itr, self.n_itr):
            itr_start_time = time.time()
            with logger.prefix('itr #%d | ' % itr):
                logger.log("Obtaining samples...")
                conn.send(ExpLifecycle.OBTAIN_SAMPLES)
                paths = self.obtain_samples(itr)

                logger.log("Processing samples...")
                conn.send(ExpLifecycle.PROCESS_SAMPLES)
                samples_data = self.process_samples(itr, paths)
                last_average_return = samples_data["average_return"]

                logger.log("Logging diagnostics...")
                self.log_diagnostics(paths)

                logger.log("Optimizing policy...")
                conn.send(ExpLifecycle.OPTIMIZE_POLICY)
                self.optimize_policy(itr, samples_data)

                logger.log("Saving snapshot...")
                params = self.get_itr_snapshot(itr)
                if self.store_paths:
                    params["paths"] = samples_data["paths"]
                snapshotter.save_itr_params(itr, params)
                logger.log("Saved")

                tabular.record('Time', time.time() - start_time)
                tabular.record('ItrTime', time.time() - itr_start_time)
                logger.log(tabular)

                if self.plot:
                    conn.send(ExpLifecycle.UPDATE_PLOT)
                    self.plotter.update_plot(self.policy,
                                             self.max_path_length)
                    if self.pause_for_plot:
                        input("Plotting evaluation run: Press Enter to "
                              "continue...")

        conn.send(ExpLifecycle.SHUTDOWN)
        self.shutdown_worker()
        if created_session:
            sess.close()
    finally:
        conn.close()
    return last_average_return
def fit(self, xs, ys):
    if self.normalize_inputs:
        # recompute normalizing constants for inputs
        new_mean = np.mean(xs, axis=0, keepdims=True)
        new_std = np.std(xs, axis=0, keepdims=True) + 1e-8
        tf.get_default_session().run(
            tf.group(
                tf.assign(self.x_mean_var, new_mean),
                tf.assign(self.x_std_var, new_std),
            ))
    if self.use_trust_region and self.first_optimized:
        old_p = self.f_p(xs)
        inputs = [xs, ys, old_p]
        optimizer = self.tr_optimizer
    else:
        inputs = [xs, ys]
        optimizer = self.optimizer
    loss_before = optimizer.loss(inputs)
    if self.name:
        prefix = self.name + "/"
    else:
        prefix = ""
    tabular.record(prefix + 'LossBefore', loss_before)
    optimizer.optimize(inputs)
    loss_after = optimizer.loss(inputs)
    tabular.record(prefix + 'LossAfter', loss_after)
    tabular.record(prefix + 'dLoss', loss_before - loss_after)
    self.first_optimized = True
def _fit_baseline(self, samples_data):
    """Update baselines from samples."""
    policy_opt_input_values = self._policy_opt_input_values(samples_data)

    # Augment reward from baselines
    rewards_tensor = self.f_rewards(*policy_opt_input_values)
    returns_tensor = self.f_returns(*policy_opt_input_values)
    returns_tensor = np.squeeze(returns_tensor, -1)

    paths = samples_data['paths']
    valids = samples_data['valids']
    baselines = [path['baselines'] for path in paths]

    # Recompute parts of samples_data
    aug_rewards = []
    aug_returns = []
    for rew, ret, val, path in zip(rewards_tensor, returns_tensor, valids,
                                   paths):
        path['rewards'] = rew[val.astype(np.bool)]
        path['returns'] = ret[val.astype(np.bool)]
        aug_rewards.append(path['rewards'])
        aug_returns.append(path['returns'])
    aug_rewards = tensor_utils.concat_tensor_list(aug_rewards)
    aug_returns = tensor_utils.concat_tensor_list(aug_returns)
    samples_data['rewards'] = aug_rewards
    samples_data['returns'] = aug_returns

    # Calculate explained variance
    ev = special.explained_variance_1d(np.concatenate(baselines),
                                       aug_returns)
    tabular.record('{}/ExplainedVariance'.format(self.baseline.name), ev)

    # Fit baseline
    logger.log('Fitting baseline...')
    if hasattr(self.baseline, 'fit_with_samples'):
        self.baseline.fit_with_samples(paths, samples_data)
    else:
        self.baseline.fit(paths)
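The explained-variance diagnostic recorded above is, in its usual one-dimensional form, 1 - Var(y - y_pred) / Var(y). A minimal NumPy sketch of that convention, assuming garage's special.explained_variance_1d follows it; the zero-variance handling below is an illustrative choice, not necessarily the library's:

import numpy as np

def explained_variance_1d(ypred, y):
    # 1.0 means the baseline predicts returns perfectly, 0.0 means it is no
    # better than predicting the mean, negative means it is worse than that.
    vary = np.var(y)
    if np.isclose(vary, 0):
        return 1.0 if np.isclose(np.var(ypred), 0) else 0.0
    return 1.0 - np.var(y - ypred) / vary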
def fit(self, xs, ys):
    if self._subsample_factor < 1:
        num_samples_tot = xs.shape[0]
        idx = np.random.randint(
            0, num_samples_tot,
            int(num_samples_tot * self._subsample_factor))
        xs, ys = xs[idx], ys[idx]

    sess = tf.get_default_session()
    if self._normalize_inputs:
        # recompute normalizing constants for inputs
        feed_dict = {
            self._x_mean_var_ph: np.mean(xs, axis=0, keepdims=True),
            self._x_std_var_ph: np.std(xs, axis=0, keepdims=True) + 1e-8,
        }
        sess.run([
            self._assign_x_mean,
            self._assign_x_std,
        ], feed_dict=feed_dict)  # yapf: disable
    if self._normalize_outputs:
        # recompute normalizing constants for outputs
        feed_dict = {
            self._y_mean_var_ph: np.mean(ys, axis=0, keepdims=True),
            self._y_std_var_ph: np.std(ys, axis=0, keepdims=True) + 1e-8,
        }
        sess.run([self._assign_y_mean, self._assign_y_std],
                 feed_dict=feed_dict)
    if self._use_trust_region:
        old_means, old_log_stds = self._f_pdists(xs)
        inputs = [xs, ys, old_means, old_log_stds]
    else:
        inputs = [xs, ys]
    loss_before = self._optimizer.loss(inputs)
    if self._name:
        prefix = self._name + "/"
    else:
        prefix = ""
    tabular.record(prefix + 'LossBefore', loss_before)
    self._optimizer.optimize(inputs)
    loss_after = self._optimizer.loss(inputs)
    tabular.record(prefix + 'LossAfter', loss_after)
    if self._use_trust_region:
        tabular.record(prefix + 'MeanKL',
                       self._optimizer.constraint_val(inputs))
    tabular.record(prefix + 'dLoss', loss_before - loss_after)
def fit(self, xs, ys):
    """Optimize the regressor based on the inputs."""
    if self._subsample_factor < 1:
        num_samples_tot = xs.shape[0]
        idx = np.random.randint(
            0, num_samples_tot,
            int(num_samples_tot * self._subsample_factor))
        xs, ys = xs[idx], ys[idx]

    sess = tf.get_default_session()
    if self._normalize_inputs:
        # recompute normalizing constants for inputs
        sess.run([
            tf.assign(self._x_mean_var,
                      np.mean(xs, axis=0, keepdims=True)),
            tf.assign(self._x_std_var,
                      np.std(xs, axis=0, keepdims=True) + 1e-8),
        ])
    if self._normalize_outputs:
        # recompute normalizing constants for outputs
        sess.run([
            tf.assign(self._y_mean_var,
                      np.mean(ys, axis=0, keepdims=True)),
            tf.assign(self._y_std_var,
                      np.std(ys, axis=0, keepdims=True) + 1e-8),
        ])
    if self._use_trust_region:
        old_means, old_log_stds = self._f_pdists(xs)
        inputs = [xs, ys, old_means, old_log_stds]
    else:
        inputs = [xs, ys]
    loss_before = self._optimizer.loss(inputs)
    if self._name:
        prefix = self._name + "/"
    else:
        prefix = ""
    tabular.record(prefix + 'LossBefore', loss_before)
    self._optimizer.optimize(inputs)
    loss_after = self._optimizer.loss(inputs)
    tabular.record(prefix + 'LossAfter', loss_after)
    if self._use_trust_region:
        tabular.record(prefix + 'MeanKL',
                       self._optimizer.constraint_val(inputs))
    tabular.record(prefix + 'dLoss', loss_before - loss_after)
def fit(self, xs, ys):
    """Fit with input data xs and label ys.

    Args:
        xs (numpy.ndarray): Input data.
        ys (numpy.ndarray): Label of input data.

    """
    if self._subsample_factor < 1:
        num_samples_tot = xs.shape[0]
        idx = np.random.randint(
            0, num_samples_tot,
            int(num_samples_tot * self._subsample_factor))
        xs, ys = xs[idx], ys[idx]

    if self._normalize_inputs:
        # recompute normalizing constants for inputs
        self.model.networks['default'].x_mean.load(
            np.mean(xs, axis=0, keepdims=True))
        self.model.networks['default'].x_std.load(
            np.std(xs, axis=0, keepdims=True) + 1e-8)
    if self._normalize_outputs:
        # recompute normalizing constants for outputs
        self.model.networks['default'].y_mean.load(
            np.mean(ys, axis=0, keepdims=True))
        self.model.networks['default'].y_std.load(
            np.std(ys, axis=0, keepdims=True) + 1e-8)
    if self._use_trust_region:
        old_means, old_log_stds = self._f_pdists(xs)
        inputs = [xs, ys, old_means, old_log_stds]
    else:
        inputs = [xs, ys]
    loss_before = self._optimizer.loss(inputs)
    tabular.record('{}/LossBefore'.format(self._name), loss_before)
    self._optimizer.optimize(inputs)
    loss_after = self._optimizer.loss(inputs)
    tabular.record('{}/LossAfter'.format(self._name), loss_after)
    if self._use_trust_region:
        tabular.record('{}/MeanKL'.format(self._name),
                       self._optimizer.constraint_val(inputs))
    tabular.record('{}/dLoss'.format(self._name),
                   loss_before - loss_after)
def fit(self, xs, ys):
    if self.normalize_inputs:
        # recompute normalizing constants for inputs
        new_mean = np.mean(xs, axis=0, keepdims=True)
        new_std = np.std(xs, axis=0, keepdims=True) + 1e-8
        tf.get_default_session().run(
            tf.group(
                tf.assign(self.x_mean_var, new_mean),
                tf.assign(self.x_std_var, new_std),
            ))
    inputs = [xs, ys]
    loss_before = self.optimizer.loss(inputs)
    if self.name:
        prefix = self.name + "/"
    else:
        prefix = ""
    tabular.record(prefix + 'LossBefore', loss_before)
    self.optimizer.optimize(inputs)
    loss_after = self.optimizer.loss(inputs)
    tabular.record(prefix + 'LossAfter', loss_after)
    tabular.record(prefix + 'dLoss', loss_before - loss_after)
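All of the fit variants above use the same whitening convention: per-dimension mean and standard deviation computed over the batch, with a small floor so constant dimensions do not divide by zero. A minimal sketch of that convention in plain NumPy; the helper name is illustrative, not part of the regressors above:

import numpy as np

def whiten(xs, eps=1e-8):
    # Per-dimension whitening; eps matches the 1e-8 floor used by the
    # regressors so constant input dimensions stay finite.
    mean = np.mean(xs, axis=0, keepdims=True)
    std = np.std(xs, axis=0, keepdims=True) + eps
    return (xs - mean) / std, mean, std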
def optimize_policy(self, itr, samples_data):
    policy_opt_input_values = self._policy_opt_input_values(samples_data)

    # Train policy network
    logger.log('Computing loss before')
    loss_before = self.optimizer.loss(policy_opt_input_values)
    logger.log('Computing KL before')
    policy_kl_before = self.f_policy_kl(*policy_opt_input_values)
    logger.log('Optimizing')
    self.optimizer.optimize(policy_opt_input_values)
    logger.log('Computing KL after')
    policy_kl = self.f_policy_kl(*policy_opt_input_values)
    logger.log('Computing loss after')
    loss_after = self.optimizer.loss(policy_opt_input_values)
    tabular.record('{}/LossBefore'.format(self.policy.name), loss_before)
    tabular.record('{}/LossAfter'.format(self.policy.name), loss_after)
    tabular.record('{}/dLoss'.format(self.policy.name),
                   loss_before - loss_after)
    tabular.record('{}/KLBefore'.format(self.policy.name),
                   policy_kl_before)
    tabular.record('{}/KL'.format(self.policy.name), policy_kl)
    pol_ent = self.f_policy_entropy(*policy_opt_input_values)
    tabular.record('{}/Entropy'.format(self.policy.name), np.mean(pol_ent))

    self._fit_baseline(samples_data)
def train(self):
    cur_std = self.sigma0
    cur_mean = self.policy.get_param_values()
    es = cma.CMAEvolutionStrategy(cur_mean, cur_std)

    parallel_sampler.populate_task(self.env, self.policy)
    if self.plot:
        self.plotter.init_plot(self.env, self.policy)

    cur_std = self.sigma0
    cur_mean = self.policy.get_param_values()

    itr = 0
    while itr < self.n_itr and not es.stop():
        if self.batch_size is None:
            # Sample from multivariate normal distribution.
            xs = es.ask()
            xs = np.asarray(xs)
            # For each sample, do a rollout.
            infos = (stateful_pool.singleton_pool.run_map(
                sample_return, [(x, self.max_path_length, self.discount)
                                for x in xs]))
        else:
            cum_len = 0
            infos = []
            xss = []
            done = False
            while not done:
                sbs = stateful_pool.singleton_pool.n_parallel * 2
                # Sample from multivariate normal distribution,
                # asking for sbs samples at a time.
                xs = es.ask(sbs)
                xs = np.asarray(xs)
                xss.append(xs)
                sinfos = stateful_pool.singleton_pool.run_map(
                    sample_return,
                    [(x, self.max_path_length, self.discount) for x in xs])
                for info in sinfos:
                    infos.append(info)
                    cum_len += len(info['returns'])
                    if cum_len >= self.batch_size:
                        xs = np.concatenate(xss)
                        done = True
                        break

        # Evaluate fitness of samples (negated, since CMA-ES minimizes).
        fs = -np.array([info['returns'][0] for info in infos])
        # When batching, more samples may have been generated than were
        # actually evaluated, so cut the extras off.
        xs = xs[:len(fs)]

        # Update CMA-ES params based on sample fitness.
        es.tell(xs, fs)

        logger.push_prefix('itr #{} | '.format(itr))
        tabular.record('Iteration', itr)
        tabular.record('CurStdMean', np.mean(cur_std))
        undiscounted_returns = np.array(
            [info['undiscounted_return'] for info in infos])
        tabular.record('AverageReturn', np.mean(undiscounted_returns))
        tabular.record('StdReturn', np.std(undiscounted_returns))
        tabular.record('MaxReturn', np.max(undiscounted_returns))
        tabular.record('MinReturn', np.min(undiscounted_returns))
        tabular.record('AverageDiscountedReturn', np.mean(fs))
        tabular.record('AvgTrajLen',
                       np.mean([len(info['returns']) for info in infos]))
        self.policy.log_diagnostics(infos)
        snapshotter.save_itr_params(
            itr, dict(
                itr=itr,
                policy=self.policy,
                env=self.env,
            ))
        logger.log(tabular)
        if self.plot:
            self.plotter.update_plot(self.policy, self.max_path_length)
        logger.pop_prefix()
        # Update iteration.
        itr += 1

    # Set final params.
    self.policy.set_param_values(es.result()[0])
    parallel_sampler.terminate_task()
    self.plotter.close()
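The loop above drives the cma package through its ask/tell protocol. A minimal usage sketch on a toy objective; the rollout stand-in is illustrative, and note that es.result is a method in the older cma releases this code targets but a property in newer ones:

import cma
import numpy as np

def rollout_return(params):
    # Stand-in for a policy rollout: higher is better.
    return -np.sum((params - 1.0) ** 2)

es = cma.CMAEvolutionStrategy(np.zeros(5), 0.5)
while not es.stop():
    xs = es.ask()                          # sample candidate parameter vectors
    fs = [-rollout_return(x) for x in xs]  # negate: cma minimizes fitness
    es.tell(xs, fs)                        # update the search distribution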
def process_samples(self, itr, paths):
    baselines = []
    returns = []

    if hasattr(self.algo.baseline, "predict_n"):
        all_path_baselines = self.algo.baseline.predict_n(paths)
    else:
        all_path_baselines = [
            self.algo.baseline.predict(path) for path in paths
        ]

    for idx, path in enumerate(paths):
        path_baselines = np.append(all_path_baselines[idx], 0)
        deltas = path["rewards"] + \
            self.algo.discount * path_baselines[1:] - \
            path_baselines[:-1]
        path["advantages"] = special.discount_cumsum(
            deltas, self.algo.discount * self.algo.gae_lambda)
        path["returns"] = special.discount_cumsum(path["rewards"],
                                                  self.algo.discount)
        baselines.append(path_baselines[:-1])
        returns.append(path["returns"])

    ev = special.explained_variance_1d(np.concatenate(baselines),
                                       np.concatenate(returns))

    if not self.algo.policy.recurrent:
        observations = tensor_utils.concat_tensor_list(
            [path["observations"] for path in paths])
        actions = tensor_utils.concat_tensor_list(
            [path["actions"] for path in paths])
        rewards = tensor_utils.concat_tensor_list(
            [path["rewards"] for path in paths])
        returns = tensor_utils.concat_tensor_list(
            [path["returns"] for path in paths])
        advantages = tensor_utils.concat_tensor_list(
            [path["advantages"] for path in paths])
        env_infos = tensor_utils.concat_tensor_dict_list(
            [path["env_infos"] for path in paths])
        agent_infos = tensor_utils.concat_tensor_dict_list(
            [path["agent_infos"] for path in paths])

        if self.algo.center_adv:
            advantages = utils.center_advantages(advantages)

        if self.algo.positive_adv:
            advantages = utils.shift_advantages_to_positive(advantages)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])

        undiscounted_returns = [sum(path["rewards"]) for path in paths]

        ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

        samples_data = dict(
            observations=observations,
            actions=actions,
            rewards=rewards,
            returns=returns,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
            paths=paths,
        )
    else:
        max_path_length = max([len(path["advantages"]) for path in paths])

        # make all paths the same length (pad extra advantages with 0)
        obs = [path["observations"] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)

        if self.algo.center_adv:
            raw_adv = np.concatenate(
                [path["advantages"] for path in paths])
            adv_mean = np.mean(raw_adv)
            adv_std = np.std(raw_adv) + 1e-8
            adv = [(path["advantages"] - adv_mean) / adv_std
                   for path in paths]
        else:
            adv = [path["advantages"] for path in paths]

        adv = np.asarray(
            [tensor_utils.pad_tensor(a, max_path_length) for a in adv])

        actions = [path["actions"] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)

        rewards = [path["rewards"] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

        returns = [path["returns"] for path in paths]
        returns = tensor_utils.pad_tensor_n(returns, max_path_length)

        agent_infos = [path["agent_infos"] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in agent_infos
        ])

        env_infos = [path["env_infos"] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in env_infos
        ])

        valids = [np.ones_like(path["returns"]) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])

        undiscounted_returns = [sum(path["rewards"]) for path in paths]

        ent = np.sum(
            self.algo.policy.distribution.entropy(agent_infos) *
            valids) / np.sum(valids)

        samples_data = dict(
            observations=obs,
            actions=actions,
            advantages=adv,
            rewards=rewards,
            returns=returns,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
        )

    logger.log("fitting baseline...")
    if hasattr(self.algo.baseline, 'fit_with_samples'):
        self.algo.baseline.fit_with_samples(paths, samples_data)
    else:
        self.algo.baseline.fit(paths)
    logger.log("fitted")

    tabular.record('Iteration', itr)
    tabular.record('AverageDiscountedReturn', average_discounted_return)
    tabular.record('AverageReturn', np.mean(undiscounted_returns))
    tabular.record('ExplainedVariance', ev)
    tabular.record('NumTrajs', len(paths))
    tabular.record('Entropy', ent)
    tabular.record('Perplexity', np.exp(ent))
    tabular.record('StdReturn', np.std(undiscounted_returns))
    tabular.record('MaxReturn', np.max(undiscounted_returns))
    tabular.record('MinReturn', np.min(undiscounted_returns))

    return samples_data
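The advantage computation in both process_samples variants is generalized advantage estimation: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t), discounted-summed with gamma * lambda. A minimal sketch of the discount_cumsum helper they rely on, with the signature assumed to match garage's special.discount_cumsum, and a toy single-path GAE example:

import numpy as np
import scipy.signal

def discount_cumsum(x, discount):
    # y[t] = x[t] + discount * x[t+1] + discount**2 * x[t+2] + ...
    return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1],
                                axis=0)[::-1]

# GAE for one path; path_baselines carries an extra terminal value of 0,
# mirroring np.append(all_path_baselines[idx], 0) above.
rewards = np.array([1.0, 1.0, 1.0])
path_baselines = np.array([0.5, 0.4, 0.3, 0.0])
gamma, lam = 0.99, 0.95
deltas = rewards + gamma * path_baselines[1:] - path_baselines[:-1]
advantages = discount_cumsum(deltas, gamma * lam)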
def process_samples(self, itr, paths):
    baselines = []
    returns = []

    max_path_length = self.algo.max_path_length

    if hasattr(self.algo.baseline, "predict_n"):
        all_path_baselines = self.algo.baseline.predict_n(paths)
    else:
        all_path_baselines = [
            self.algo.baseline.predict(path) for path in paths
        ]

    for idx, path in enumerate(paths):
        path_baselines = np.append(all_path_baselines[idx], 0)
        deltas = path["rewards"] \
            + self.algo.discount * path_baselines[1:] \
            - path_baselines[:-1]
        path["advantages"] = special.discount_cumsum(
            deltas, self.algo.discount * self.algo.gae_lambda)
        path["deltas"] = deltas

    for idx, path in enumerate(paths):
        # baselines
        path['baselines'] = all_path_baselines[idx]
        baselines.append(path['baselines'])

        # returns
        path["returns"] = special.discount_cumsum(path["rewards"],
                                                  self.algo.discount)
        returns.append(path["returns"])

    # make all paths the same length
    obs = [path["observations"] for path in paths]
    obs = tensor_utils.pad_tensor_n(obs, max_path_length)

    actions = [path["actions"] for path in paths]
    actions = tensor_utils.pad_tensor_n(actions, max_path_length)

    rewards = [path["rewards"] for path in paths]
    rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

    returns = [path["returns"] for path in paths]
    returns = tensor_utils.pad_tensor_n(returns, max_path_length)

    advantages = [path["advantages"] for path in paths]
    advantages = tensor_utils.pad_tensor_n(advantages, max_path_length)

    baselines = tensor_utils.pad_tensor_n(baselines, max_path_length)

    agent_infos = [path["agent_infos"] for path in paths]
    agent_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length)
        for p in agent_infos
    ])

    env_infos = [path["env_infos"] for path in paths]
    env_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos
    ])

    valids = [np.ones_like(path["returns"]) for path in paths]
    valids = tensor_utils.pad_tensor_n(valids, max_path_length)

    average_discounted_return = (np.mean(
        [path["returns"][0] for path in paths]))

    undiscounted_returns = [sum(path["rewards"]) for path in paths]
    self.eprewmean.extend(undiscounted_returns)

    ent = np.sum(
        self.algo.policy.distribution.entropy(agent_infos) *
        valids) / np.sum(valids)

    samples_data = dict(
        observations=obs,
        actions=actions,
        rewards=rewards,
        advantages=advantages,
        baselines=baselines,
        returns=returns,
        valids=valids,
        agent_infos=agent_infos,
        env_infos=env_infos,
        paths=paths,
        average_return=np.mean(undiscounted_returns),
    )

    tabular.record('Iteration', itr)
    tabular.record('AverageDiscountedReturn', average_discounted_return)
    tabular.record('AverageReturn', np.mean(undiscounted_returns))
    tabular.record('Extras/EpisodeRewardMean', np.mean(self.eprewmean))
    tabular.record('NumTrajs', len(paths))
    tabular.record('Entropy', ent)
    tabular.record('Perplexity', np.exp(ent))
    tabular.record('StdReturn', np.std(undiscounted_returns))
    tabular.record('MaxReturn', np.max(undiscounted_returns))
    tabular.record('MinReturn', np.min(undiscounted_returns))

    return samples_data
def log_diagnostics(self, paths):
    log_stds = paths["agent_infos"]["log_std"]
    tabular.record("{}/AverageStd".format(self.name),
                   np.mean(np.exp(log_stds)))
def log_diagnostics(self, paths):
    log_stds = paths["agent_infos"]["log_std"]
    tabular.record('AveragePolicyStd', np.mean(np.exp(log_stds)))
def train_once(self, itr, paths):
    epoch = itr // self.n_epoch_cycles

    self.episode_rewards.extend(paths['undiscounted_returns'])
    self.success_history.extend(paths['success_history'])
    last_average_return = np.mean(self.episode_rewards)
    self.log_diagnostics(paths)
    for train_itr in range(self.n_train_steps):
        if self.replay_buffer.n_transitions_stored >= self.min_buffer_size:  # noqa: E501
            self.evaluate = True
            qf_loss, y, q, policy_loss = self.optimize_policy(epoch, paths)

            self.episode_policy_losses.append(policy_loss)
            self.episode_qf_losses.append(qf_loss)
            self.epoch_ys.append(y)
            self.epoch_qs.append(q)

    if itr % self.n_epoch_cycles == 0:
        logger.log('Training finished')

        if self.evaluate:
            tabular.record('Epoch', epoch)
            tabular.record('AverageReturn', np.mean(self.episode_rewards))
            tabular.record('StdReturn', np.std(self.episode_rewards))
            tabular.record('Policy/AveragePolicyLoss',
                           np.mean(self.episode_policy_losses))
            tabular.record('QFunction/AverageQFunctionLoss',
                           np.mean(self.episode_qf_losses))
            tabular.record('QFunction/AverageQ', np.mean(self.epoch_qs))
            tabular.record('QFunction/MaxQ', np.max(self.epoch_qs))
            tabular.record('QFunction/AverageAbsQ',
                           np.mean(np.abs(self.epoch_qs)))
            tabular.record('QFunction/AverageY', np.mean(self.epoch_ys))
            tabular.record('QFunction/MaxY', np.max(self.epoch_ys))
            tabular.record('QFunction/AverageAbsY',
                           np.mean(np.abs(self.epoch_ys)))
            if self.input_include_goal:
                tabular.record('AverageSuccessRate',
                               np.mean(self.success_history))

        if not self.smooth_return:
            self.episode_rewards = []
            self.episode_policy_losses = []
            self.episode_qf_losses = []
            self.epoch_ys = []
            self.epoch_qs = []

        self.success_history.clear()

    return last_average_return
def train(self):
    parallel_sampler.populate_task(self.env, self.policy)
    if self.plot:
        self.plotter.init_plot(self.env, self.policy)

    cur_std = self.init_std
    cur_mean = self.policy.get_param_values()
    # K = cur_mean.size
    n_best = max(1, int(self.n_samples * self.best_frac))

    for itr in range(self.n_itr):
        # sample around the current distribution
        extra_var_mult = max(1.0 - itr / self.extra_decay_time, 0)
        sample_std = np.sqrt(
            np.square(cur_std) +
            np.square(self.extra_std) * extra_var_mult)
        if self.batch_size is None:
            criterion = 'paths'
            threshold = self.n_samples
        else:
            criterion = 'samples'
            threshold = self.batch_size
        infos = stateful_pool.singleton_pool.run_collect(
            _worker_rollout_policy,
            threshold=threshold,
            args=(dict(
                cur_mean=cur_mean,
                sample_std=sample_std,
                max_path_length=self.max_path_length,
                discount=self.discount,
                criterion=criterion,
                n_evals=self.n_evals), ))
        xs = np.asarray([info[0] for info in infos])
        paths = [info[1] for info in infos]

        fs = np.array([path['returns'][0] for path in paths])
        print((xs.shape, fs.shape))
        best_inds = (-fs).argsort()[:n_best]
        best_xs = xs[best_inds]
        cur_mean = best_xs.mean(axis=0)
        cur_std = best_xs.std(axis=0)
        best_x = best_xs[0]
        logger.push_prefix('itr #{} | '.format(itr))
        tabular.record('Iteration', itr)
        tabular.record('CurStdMean', np.mean(cur_std))
        undiscounted_returns = np.array(
            [path['undiscounted_return'] for path in paths])
        tabular.record('AverageReturn', np.mean(undiscounted_returns))
        tabular.record('StdReturn', np.std(undiscounted_returns))
        tabular.record('MaxReturn', np.max(undiscounted_returns))
        tabular.record('MinReturn', np.min(undiscounted_returns))
        tabular.record('AverageDiscountedReturn', np.mean(fs))
        tabular.record('NumTrajs', len(paths))
        # flatten paths for the case n_evals > 1
        paths = list(chain(*[d['full_paths'] for d in paths]))
        tabular.record('AvgTrajLen',
                       np.mean([len(path['returns']) for path in paths]))
        self.policy.set_param_values(best_x)
        self.policy.log_diagnostics(paths)
        snapshotter.save_itr_params(
            itr,
            dict(
                itr=itr,
                policy=self.policy,
                env=self.env,
                cur_mean=cur_mean,
                cur_std=cur_std,
            ))
        logger.log(tabular)
        logger.pop_prefix()
        if self.plot:
            self.plotter.update_plot(self.policy, self.max_path_length)

    parallel_sampler.terminate_task()
    self.plotter.close()
def optimize_policy(self, itr, samples_data):
    policy_opt_input_values = self._policy_opt_input_values(samples_data)

    # Train policy network
    logger.log("Computing loss before")
    loss_before = self.optimizer.loss(policy_opt_input_values)
    logger.log("Computing KL before")
    policy_kl_before = self.f_policy_kl(*policy_opt_input_values)
    logger.log("Optimizing")
    self.optimizer.optimize(policy_opt_input_values)
    logger.log("Computing KL after")
    policy_kl = self.f_policy_kl(*policy_opt_input_values)
    logger.log("Computing loss after")
    loss_after = self.optimizer.loss(policy_opt_input_values)
    tabular.record("{}/LossBefore".format(self.policy.name), loss_before)
    tabular.record("{}/LossAfter".format(self.policy.name), loss_after)
    tabular.record("{}/dLoss".format(self.policy.name),
                   loss_before - loss_after)
    tabular.record("{}/KLBefore".format(self.policy.name),
                   policy_kl_before)
    tabular.record("{}/KL".format(self.policy.name), policy_kl)

    pol_ent = self.f_policy_entropy(*policy_opt_input_values)
    tabular.record("{}/Entropy".format(self.policy.name), pol_ent)

    num_traj = self.batch_size // self.max_path_length
    actions = samples_data["actions"][:num_traj, ...]
    histogram = EmpiricalDistribution(actions)
    tabular.record("{}/Actions".format(self.policy.name), histogram)

    self._fit_baseline(samples_data)

    return self.get_itr_snapshot(itr, samples_data)
def optimize_policy(self, itr, samples_data):
    """Perform the policy optimization."""
    # Initial BFGS parameter values.
    x0 = np.hstack([self.param_eta, self.param_v])
    # Set parameter boundaries: \eta >= 1e-12, v unrestricted.
    bounds = [(-np.inf, np.inf) for _ in x0]
    bounds[0] = (1e-12, np.inf)

    # Optimize dual
    eta_before = self.param_eta
    logger.log('Computing dual before')
    self.feat_diff = self._features(samples_data)
    dual_opt_input_values = self._dual_opt_input_values(samples_data)
    dual_before = self.f_dual(*dual_opt_input_values)
    logger.log('Optimizing dual')

    def eval_dual(x):
        self.param_eta = x[0]
        self.param_v = x[1:]
        dual_opt_input_values = self._dual_opt_input_values(samples_data)
        return self.f_dual(*dual_opt_input_values)

    def eval_dual_grad(x):
        self.param_eta = x[0]
        self.param_v = x[1:]
        dual_opt_input_values = self._dual_opt_input_values(samples_data)
        grad = self.f_dual_grad(*dual_opt_input_values)
        eta_grad = np.float(grad[0])
        v_grad = grad[1]
        return np.hstack([eta_grad, v_grad])

    params_ast, _, _ = self.dual_optimizer(
        func=eval_dual,
        x0=x0,
        fprime=eval_dual_grad,
        bounds=bounds,
        **self.dual_optimizer_args,
    )

    logger.log('Computing dual after')
    self.param_eta, self.param_v = params_ast[0], params_ast[1:]
    dual_opt_input_values = self._dual_opt_input_values(samples_data)
    dual_after = self.f_dual(*dual_opt_input_values)

    # Optimize policy
    policy_opt_input_values = self._policy_opt_input_values(samples_data)
    logger.log('Computing policy loss before')
    loss_before = self.optimizer.loss(policy_opt_input_values)
    logger.log('Computing policy KL before')
    policy_kl_before = self.f_policy_kl(*policy_opt_input_values)
    logger.log('Optimizing policy')
    self.optimizer.optimize(policy_opt_input_values)
    logger.log('Computing policy KL')
    policy_kl = self.f_policy_kl(*policy_opt_input_values)
    logger.log('Computing policy loss after')
    loss_after = self.optimizer.loss(policy_opt_input_values)

    tabular.record('EtaBefore', eta_before)
    tabular.record('EtaAfter', self.param_eta)
    tabular.record('DualBefore', dual_before)
    tabular.record('DualAfter', dual_after)
    tabular.record('{}/LossBefore'.format(self.policy.name), loss_before)
    tabular.record('{}/LossAfter'.format(self.policy.name), loss_after)
    tabular.record('{}/dLoss'.format(self.policy.name),
                   loss_before - loss_after)
    tabular.record('{}/KLBefore'.format(self.policy.name),
                   policy_kl_before)
    tabular.record('{}/KL'.format(self.policy.name), policy_kl)
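For reference, the dual variable eta found above is what turns per-sample Bellman errors into the sample weights REPS uses for its weighted maximum-likelihood policy update, with w_i proportional to exp(delta_i / eta). A minimal NumPy sketch of that weighting; the names are illustrative and not attributes of the class above:

import numpy as np

def reps_weights(deltas, eta):
    # deltas are per-sample Bellman errors; subtract the max before
    # exponentiating for numerical stability, then normalize.
    z = (deltas - deltas.max()) / eta
    w = np.exp(z)
    return w / w.sum()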