def optimize_policy(self, itr, samples_data):
    all_input_values = tuple(
        ext.extract(samples_data, "observations", "actions", "advantages"))
    returns = ext.extract(samples_data, "returns")
    agent_infos = samples_data["agent_infos"]
    state_info_list = [agent_infos[k] for k in self.policy.state_info_keys]
    dist_info_list = [
        agent_infos[k] for k in self.policy.distribution.dist_info_keys
    ]
    all_input_values += tuple(state_info_list) + tuple(dist_info_list)
    if self.policy.recurrent:
        all_input_values += (samples_data["valids"], )
    logger.log("Computing loss before")
    loss_before = self.optimizer.loss(all_input_values)
    logger.log("Computing KL before")
    mean_kl_before = self.optimizer.constraint_val(all_input_values)
    logger.log("Optimizing")
    self.optimizer.optimize(all_input_values)
    logger.log("Computing KL after")
    mean_kl = self.optimizer.constraint_val(all_input_values)
    logger.log("Computing loss after")
    loss_after = self.optimizer.loss(all_input_values)
    logger.record_tabular('LossBefore', loss_before)
    logger.record_tabular('LossAfter', loss_after)
    logger.record_tabular('MeanKLBefore', mean_kl_before)
    logger.record_tabular('MeanKL', mean_kl)
    logger.record_tabular('dLoss', loss_before - loss_after)
    return dict()
def optimize_policy(self, itr, samples_data):
    assert len(samples_data['observations']) % self.period == 0
    # note that I have to do extra preprocessing to the advantages, and also create obs_var_sparse
    if self.use_skill_dependent_baseline:
        input_values = tuple(
            ext.extract(samples_data, "observations", "actions", "advantages",
                        "agent_infos", "skill_advantages"))
    else:
        input_values = tuple(
            ext.extract(samples_data, "observations", "actions", "advantages",
                        "agent_infos"))
    obs_raw = input_values[0].reshape(
        input_values[0].shape[0] // self.period, self.period,
        input_values[0].shape[1])
    obs_sparse = input_values[0].take(
        [i for i in range(0, input_values[0].shape[0], self.period)], axis=0)
    if not self.continuous_latent:
        advantage_sparse = input_values[2].reshape(
            [input_values[2].shape[0] // self.period, self.period])[:, 0]
        latents = input_values[3]['latents']
        latents_sparse = latents.take(
            [i for i in range(0, latents.shape[0], self.period)], axis=0)
        prob = np.array(list(input_values[3]['prob'].take(
            [i for i in range(0, latents.shape[0], self.period)], axis=0)),
                        dtype=np.float32)
    mean = input_values[3]['mean']
    log_std = input_values[3]['log_std']
    if self.use_skill_dependent_baseline:
        advantage_var = input_values[4]
    else:
        advantage_var = input_values[2]
    if self.freeze_skills and not self.freeze_manager:
        raise NotImplementedError
    elif self.freeze_manager and not self.freeze_skills:
        raise NotImplementedError
    else:
        assert (not self.freeze_manager) or (not self.freeze_skills)
        all_input_values = (obs_raw, obs_sparse, input_values[1],
                            advantage_var, mean, log_std)
    # todo: assign current parameters to old policy; does this work?
    # old_param_values = self.policy.get_param_values(trainable=True)
    # self.old_policy.set_param_values(old_param_values, trainable=True)
    # old_param_values = self.policy.get_param_values()
    # self.old_policy.set_param_values(old_param_values)
    loss_before = self.optimizer.loss(all_input_values)
    self.optimizer.optimize(all_input_values)
    loss_after = self.optimizer.loss(all_input_values)
    logger.record_tabular('LossBefore', loss_before)
    logger.record_tabular('LossAfter', loss_after)
    logger.record_tabular('dLoss', loss_before - loss_after)
    return dict()
def optimize_policy(self, itr, samples_data): logger.log('optimizing policy...') all_input_values = tuple( ext.extract(samples_data, "observations", "actions", "advantages", "weights")) if self.safety_constraint: all_input_values += tuple( ext.extract(samples_data, "safety_values")) self.safety_gradient_rescale.set_value( samples_data['safety_rescale']) logger.record_tabular('SafetyGradientRescale', self.safety_gradient_rescale.get_value()) agent_infos = samples_data["agent_infos"] state_info_list = [agent_infos[k] for k in self.policy.state_info_keys] dist_info_list = [ agent_infos[k] for k in self.policy.distribution.dist_info_keys ] all_input_values += tuple(state_info_list) + tuple(dist_info_list) if self.policy.recurrent: all_input_values += (samples_data["valids"], ) loss_before = self.optimizer.loss(all_input_values) if not (self.safety_constrained_optimizer): self.optimizer.optimize(all_input_values) else: threshold = max( self.safety_step_size - samples_data['safety_eval'], 0) if 'advantage' in self.safety_key: std_adv = np.std(samples_data["safety_values"]) logger.record_tabular('StdSafetyAdv', std_adv) threshold = max(threshold - self.robustness_coeff * std_adv, 0) if 'safety_offset' in samples_data: logger.record_tabular('SafetyOffset', samples_data['safety_offset']) self.optimizer.optimize( all_input_values, precomputed_eval=samples_data['safety_eval'], precomputed_threshold=threshold, diff_threshold=True) mean_kl, max_kl = self.opt_info['f_kl'](*all_input_values) loss_after = self.optimizer.loss(all_input_values) if self.entropy_regularize and not (self.entropy_coeff_decay == 1): current_entropy_coeff = self.entropy_beta.get_value( ) * self.entropy_coeff_decay self.entropy_beta.set_value(current_entropy_coeff) logger.record_tabular('EntropyCoeff', current_entropy_coeff) if self.learn_safety_tradeoff_coeff: delta = samples_data['safety_eval'] - self.safety_step_size self.safety_tradeoff_coeff += self.safety_tradeoff_coeff_lr * delta self.safety_tradeoff_coeff = max(0, self.safety_tradeoff_coeff)
def do_training(self, itr, batch, offpolicy_batch):
    obs, actions, rewards, next_obs, terminals = ext.extract(
        batch, "observations", "actions", "rewards", "next_observations",
        "terminals")
    obs_off, actions_off, rewards_off, next_obs_off, terminals_off = ext.extract(
        offpolicy_batch, "observations", "actions", "rewards",
        "next_observations", "terminals")
    # compute the on-policy y values
    target_qf = self.opt_info["target_qf"]
    target_policy = self.opt_info["target_policy"]
    next_actions, _ = target_policy.get_actions(next_obs)
    next_qvals = target_qf.get_qval(next_obs, next_actions)
    ys = rewards + (1. - terminals) * self.discount * next_qvals.reshape(-1)
    # compute the off-policy y values from the off-policy batch
    next_actions_off, _ = target_policy.get_actions(next_obs_off)
    next_qvals_off = target_qf.get_qval(next_obs_off, next_actions_off)
    ys_off = rewards_off + (
        1. - terminals_off) * self.discount * next_qvals_off.reshape(-1)
    f_train_qf = self.opt_info["f_train_qf"]
    f_train_policy = self.opt_info["f_train_policy"]
    qf_loss, qval, _ = f_train_qf(ys, obs, actions, ys_off, obs_off,
                                  actions_off, self.global_train_step)
    target_qf.set_param_values(target_qf.get_param_values() *
                               (1.0 - self.soft_target_tau) +
                               self.qf.get_param_values() *
                               self.soft_target_tau)
    self.qf_loss_averages.append(qf_loss)
    self.q_averages.append(qval)
    self.y_averages.append(ys)  # TODO: also add ys_off
    self.train_policy_itr += self.policy_updates_ratio
    train_policy_itr = 0
    while self.train_policy_itr > 0:
        policy_surr, _ = f_train_policy(obs, obs_off, self.global_train_step)
        target_policy.set_param_values(target_policy.get_param_values() *
                                       (1.0 - self.soft_target_tau) +
                                       self.policy.get_param_values() *
                                       self.soft_target_tau)
        self.policy_surr_averages.append(policy_surr)
        self.train_policy_itr -= 1
        train_policy_itr += 1
    return 1, train_policy_itr  # number of itrs qf, policy are trained
def optimize_policy(self, itr, samples_data): # print(len(samples_data['observations']), self.period) # assert len(samples_data['observations']) % self.period == 0 # note that I have to do extra preprocessing to the advantages, and also create obs_var_sparse if self.use_skill_dependent_baseline: input_values = tuple( ext.extract(samples_data, "observations", "actions", "advantages", "agent_infos", "skill_advantages")) else: input_values = tuple( ext.extract(samples_data, "observations", "actions", "advantages", "agent_infos")) time_remaining = input_values[3]['time_remaining'] resampled_period = input_values[3]['resampled_period'] obs_var = np.insert(input_values[0], self.policy.obs_robot_dim, time_remaining, axis=1) manager_obs_var = obs_var[resampled_period] action_var = input_values[1] manager_adv_var = input_values[2][resampled_period] latent_var = input_values[3]['latents'] latent_var_sparse = latent_var[resampled_period] mean = input_values[3]['mean'] log_std = input_values[3]['log_std'] prob = input_values[3]['prob'][resampled_period] if self.use_skill_dependent_baseline: skill_adv_var = input_values[4] all_input_values = (obs_var, manager_obs_var, action_var, manager_adv_var, skill_adv_var, latent_var, latent_var_sparse, mean, log_std, prob) else: skill_adv_var = input_values[2] all_input_values = (obs_var, manager_obs_var, action_var, manager_adv_var, skill_adv_var, latent_var, latent_var_sparse, mean, log_std, prob) # todo: assign current parameters to old policy; does this work? # old_param_values = self.policy.get_param_values() # self.old_policy.set_param_values(old_param_values) loss_before = self.optimizer.loss(all_input_values) self.optimizer.optimize(all_input_values) loss_after = self.optimizer.loss(all_input_values) logger.record_tabular('LossBefore', loss_before) logger.record_tabular('LossAfter', loss_after) logger.record_tabular('dLoss', loss_before - loss_after) return dict()
def optimize_policy( self, itr, samples_data ): # make that samples_data comes with latents: see train in batch_polopt all_input_values = tuple( ext.extract( # it will be in agent_infos!!! under key "latents" samples_data, "observations", "actions", "advantages")) agent_infos = samples_data["agent_infos"] all_input_values += ( agent_infos["latents"], ) # latents has already been processed and is the concat of all latents, but keeps key "latents" info_list = [ agent_infos[k] for k in self.policy.distribution.dist_info_keys ] # these are the mean and var used at rollout, corresponding to all_input_values += tuple( info_list) # old_dist_info_vars_list as symbolic var if self.policy.recurrent: all_input_values += (samples_data["valids"], ) loss_before = self.optimizer.loss(all_input_values) # this should always be 0. If it's not there is a problem. mean_kl_before = self.optimizer.constraint_val(all_input_values) logger.record_tabular('MeanKL_Before', mean_kl_before) with logger.prefix(' PolicyOptimize | '): self.optimizer.optimize(all_input_values) mean_kl = self.optimizer.constraint_val(all_input_values) loss_after = self.optimizer.loss(all_input_values) logger.record_tabular('LossAfter', loss_after) logger.record_tabular('MeanKL', mean_kl) logger.record_tabular('dLoss', loss_before - loss_after) return dict()
def optimize_global_policy(self, itr, all_samples_data): all_observations = np.concatenate([ samples_data['observations'] for samples_data in all_samples_data ]) all_actions = np.concatenate([ samples_data['agent_infos']['mean'] for samples_data in all_samples_data ]) num_itrs = 1 if itr % self.distillation_period != 0 else 30 for _ in range(num_itrs): self.center_optimizer.optimize([all_observations, all_actions]) paths = self.global_sampler.obtain_samples(itr) samples_data = self.global_sampler.process_samples(itr, paths) obs_values = tuple( ext.extract(samples_data, "observations", "actions", "advantages")) dist_info_list = [ samples_data["agent_infos"][k] for k in self.policy.distribution.dist_info_keys ] all_input_values = obs_values + tuple(dist_info_list) self.center_trpo_optimizer.optimize(all_input_values) self.env.log_diagnostics(paths)
def optimize_policy(self, itr, samples_data): all_input_values = tuple(ext.extract( samples_data, "observations", "actions", "advantages" )) agent_infos = samples_data["agent_infos"] state_info_list = [agent_infos[k] for k in self.policy.state_info_keys] dist_info_list = [agent_infos[k] for k in self.policy.distribution.dist_info_keys] all_input_values += tuple(state_info_list) + tuple(dist_info_list) if self.policy.recurrent: all_input_values += (samples_data["valids"],) logger.log("Computing loss before") loss_before = self.optimizer.loss(all_input_values) logger.log("Computing KL before") mean_kl_before = self.optimizer.constraint_val(all_input_values) logger.log("Optimizing") self.optimizer.optimize(all_input_values) logger.log("Computing KL after") mean_kl = self.optimizer.constraint_val(all_input_values) logger.log("Computing loss after") loss_after = self.optimizer.loss(all_input_values) logger.record_tabular('LossBefore', loss_before) logger.record_tabular('LossAfter', loss_after) logger.record_tabular('MeanKLBefore', mean_kl_before) logger.record_tabular('MeanKL', mean_kl) logger.record_tabular('dLoss', loss_before - loss_after) return dict()
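# A minimal numpy sketch (not part of this codebase) of the two quantities the
# optimizer evaluates in the functions above for a diagonal Gaussian policy: the
# surrogate loss -E[ratio * advantage] and the mean KL between the old and new
# action distributions. All names here are illustrative.
import numpy as np

def gaussian_log_prob(actions, mean, log_std):
    zs = (actions - mean) / np.exp(log_std)
    return (-0.5 * np.sum(zs ** 2, axis=-1)
            - np.sum(log_std, axis=-1)
            - 0.5 * actions.shape[-1] * np.log(2 * np.pi))

def surrogate_loss(actions, advantages, old_mean, old_log_std, new_mean, new_log_std):
    # Importance ratio between new and old policies, averaged against advantages.
    ratio = np.exp(gaussian_log_prob(actions, new_mean, new_log_std)
                   - gaussian_log_prob(actions, old_mean, old_log_std))
    return -np.mean(ratio * advantages)

def mean_kl(old_mean, old_log_std, new_mean, new_log_std):
    # KL(old || new) per dimension, summed over dims and averaged over samples.
    old_std, new_std = np.exp(old_log_std), np.exp(new_log_std)
    kl = (new_log_std - old_log_std
          + (old_std ** 2 + (old_mean - new_mean) ** 2) / (2.0 * new_std ** 2)
          - 0.5)
    return np.mean(np.sum(kl, axis=-1))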
def optimize_policy(self, itr, samples_data): inputs = ext.extract(samples_data, "observations", "actions", "advantages") agent_info = samples_data["agent_info"] state_info_list = [agent_info[k] for k in self.policy.state_info_keys] inputs += tuple(state_info_list) dist_info_list = [ agent_info[k] for k in self.policy.distribution.dist_info_keys ] loss_before = self.optimizer.loss(inputs) self.optimizer.optimize(inputs) loss_after = self.optimizer.loss(inputs) logger.record_tabular("LossBefore", loss_before) logger.record_tabular("LossAfter", loss_after) rewards = samples_data['rewards'] entropy_loss = np.mean(self.policy.distribution.entropy(agent_info)) self.log_summary(itr, loss_after, entropy_loss, np.mean(rewards), np.sum(rewards)) mean_kl, max_kl = self.opt_info['f_kl'](*(list(inputs) + dist_info_list)) logger.record_tabular('MeanKL', mean_kl) logger.record_tabular('MaxKL', max_kl) return dict()
def optimize_policy(self, itr, samples_data): logger.log('optimizing policy...') all_input_values = tuple( ext.extract(samples_data, "observations", "actions", "advantages", "weights")) agent_infos = samples_data["agent_infos"] state_info_list = [agent_infos[k] for k in self.policy.state_info_keys] dist_info_list = [ agent_infos[k] for k in self.policy.distribution.dist_info_keys ] all_input_values += tuple(state_info_list) + tuple(dist_info_list) if self.policy.recurrent: all_input_values += (samples_data["valids"], ) loss_before = self.optimizer.loss(all_input_values) self.optimizer.optimize(all_input_values) mean_kl, max_kl = self.opt_info['f_kl'](*all_input_values) loss_after = self.optimizer.loss(all_input_values) if self.entropy_regularize and not (self.entropy_coeff_decay == 1): current_entropy_coeff = self.entropy_beta.get_value( ) * self.entropy_coeff_decay self.entropy_beta.set_value(current_entropy_coeff) logger.record_tabular('EntropyCoeff', current_entropy_coeff) logger.record_tabular('Time', time.time() - self.start_time) logger.record_tabular('LossBefore', loss_before) logger.record_tabular('LossAfter', loss_after) logger.record_tabular('MeanKL', mean_kl) logger.record_tabular('MaxKL', max_kl) logger.record_tabular('dLoss', loss_before - loss_after) logger.log('optimization finished')
def optimize_policy(self, itr, samples_data): logger.log("optimizing policy") inputs = ext.extract( samples_data, "observations", "actions", "advantages" ## GAE R - V(s) ) agent_infos = samples_data["agent_infos"] state_info_list = [agent_infos[k] for k in self.policy.state_info_keys] dist_info_list = [ agent_infos[k] for k in self.policy.distribution.dist_info_keys ] inputs += tuple(state_info_list) + tuple(dist_info_list) if self.policy.recurrent: inputs += (samples_data["valids"], ) dist_info_list = [ agent_infos[k] for k in self.policy.distribution.dist_info_keys ] loss_before = self.optimizer.loss(inputs) ### For PPO this should be more than one step. self.optimizer.optimize(inputs) loss_after = self.optimizer.loss(inputs) logger.record_tabular("LossBefore", loss_before) logger.record_tabular("LossAfter", loss_after) mean_kl, max_kl, clip_frac, log_std = self.opt_info['f_kl']( *(list(inputs) + dist_info_list)) logger.record_tabular('MeanKL', mean_kl) logger.record_tabular('MaxKL', max_kl) logger.record_tabular("ClipFrac", clip_frac) logger.record_tabular("AvgStd", np.mean(np.exp(log_std))) if self.comet_logger: self.comet_logger.log_metric('ClipFrac', clip_frac)
def do_training(self, itr, batch): obs, actions, rewards, next_obs, terminals = ext.extract( batch, "observations", "actions", "rewards", "next_observations", "terminals") # compute the on-policy y values target_qf = self.opt_info_critic["target_qf"] next_actions, next_actions_dict = self.policy.get_actions(next_obs) if self.qprop_use_mean_action: next_actions = next_actions_dict["mean"] next_qvals = target_qf.get_qval(next_obs, next_actions) ys = rewards + (1. - terminals) * self.discount * next_qvals f_train_qf = self.opt_info_critic["f_train_qf"] qf_loss, qval, _ = f_train_qf(ys, obs, actions) target_qf.set_param_values(target_qf.get_param_values() * (1.0 - self.soft_target_tau) + self.qf.get_param_values() * self.soft_target_tau) self.qf_loss_averages.append(qf_loss) self.q_averages.append(qval) self.y_averages.append(ys)
def do_phi_training(self, itr, indices=None, samples_data=None): batch_samples = samples_data ''' dict( observations=samples_data["observations"][indices], actions=samples_data["actions"][indices], origin_advantages=samples_data["origin_advantages"][indices],) ''' inputs = ext.extract( batch_samples, "observations", "actions", "origin_advantages", "etas",) # the following code is useless # FIXME: write a better version of this agent_infos = samples_data["agent_infos"] state_info_list = [agent_infos[k] for k in self.policy.state_info_keys] inputs += tuple(state_info_list) #TODO: add recurrent if self.policy.recurrent: inputs += (samples_data["valid"], ) pf_outputs = self.opt_train_phi['f_train_pf'](*inputs) pf_loss = pf_outputs.pop(0) self.pf_loss_averages.append(pf_loss)
def optimize_policy(self, itr, all_samples_data, particle_idx): self.policy = self.policy_list[particle_idx] self.optimizer = self.optimizer_list[particle_idx] assert len(all_samples_data) == len(self.policy_list) assert len(all_samples_data[0]) == self.num_grad_updates + 1 input_list = [] for step in range(len( all_samples_data[0])): # these are the gradient steps for n in range(len(all_samples_data)): obs_list, action_list, adv_list = [], [], [] for i in range(self.meta_batch_size): inputs = ext.extract(all_samples_data[n][step][i], "observations", "actions", "advantages") obs_list.append(inputs[0]) action_list.append(inputs[1]) adv_list.append(inputs[2]) input_list += obs_list + action_list + adv_list # [ [obs_0], [act_0], [adv_0], [obs_1], ... ] dist_info_list = [] for i in range(self.meta_batch_size): obs_list = all_samples_data[particle_idx][ self.kl_constrain_step][i]['observations'] agent_infos = self.policy.get_mean_logstd(obs_list, self.meta_batch_size, i) dist_info_list += [ agent_infos[k] for k in self.policy.distribution.dist_info_keys ] input_list += tuple(dist_info_list) self.optimizer.optimize(input_list) return dict()
def optimize_policy(self, itr, all_samples_data, particle_idx): logger.log("optimizing policy") assert len(all_samples_data) == len(self.policy_list) assert len(all_samples_data[0]) == self.num_leader_grad_updates input_list = [] for step in range(len(all_samples_data[0])): for n in range(len(all_samples_data)): obs_list, action_list, adv_list = [], [], [] for i in range(self.meta_batch_size): inputs = ext.extract( all_samples_data[n][step][i], "observations", "actions", "advantages" ) obs_list.append(inputs[0]) action_list.append(inputs[1]) adv_list.append(inputs[2]) input_list += obs_list + action_list + adv_list if particle_idx == 0 and (self.n_particles > 1): sess = tf.get_default_session() global_h = sess.run(self.global_h,feed_dict=dict(list(zip(self.policy_list[0].input_list_for_grad, input_list)))) logger.record_tabular('global_h', global_h) self.optimizer_list[particle_idx].optimize(input_list)
def optimize_policy(self, itr, samples_data): inputs = ext.extract(samples_data, "observations", "actions", "target") self.optimizer.optimize(inputs) self.loss_after = self.optimizer.loss(inputs) return dict()
def optimize_policy(self, itr, samples_data): logger.log("optimizing policy") inputs = ext.extract(samples_data, "observations", "actions", "advantages") agent_infos = samples_data["agent_infos"] state_info_list = [agent_infos[k] for k in self.policy.state_info_keys] # state_info_keys is prev_action # so agent_infos should include prev_action # agent_infos from policy.get_action inputs += tuple(state_info_list) if self.policy.recurrent: inputs += (samples_data["valids"], ) dist_info_list = [ agent_infos[k] for k in self.policy.distribution.dist_info_keys ] loss_before = self.optimizer.loss(inputs) self.optimizer.optimize(inputs) loss_after = self.optimizer.loss(inputs) logger.record_tabular("LossBefore", loss_before) logger.record_tabular("LossAfter", loss_after) mean_kl, max_kl = self.opt_info['f_kl'](*(list(inputs) + dist_info_list)) logger.record_tabular('MeanKL', mean_kl) logger.record_tabular('MaxKL', max_kl)
def do_training(self, itr, batch): obs, actions, rewards, next_obs, terminals = ext.extract( batch, "observations", "actions", "rewards", "next_observations", "terminals") # compute the on-policy y values target_qf = self.opt_info["target_qf"] target_policy = self.opt_info["target_policy"] next_actions, _ = target_policy.get_actions(next_obs) next_qvals = target_qf.get_qval(next_obs, next_actions) ys = rewards + (1. - terminals) * self.discount * next_qvals f_train_qf = self.opt_info["f_train_qf"] f_train_policy = self.opt_info["f_train_policy"] qf_loss, qval = f_train_qf(ys, obs, actions) policy_surr = f_train_policy(obs) target_policy.set_param_values(target_policy.get_param_values() * (1.0 - self.soft_target_tau) + self.policy.get_param_values() * self.soft_target_tau) target_qf.set_param_values(target_qf.get_param_values() * (1.0 - self.soft_target_tau) + self.qf.get_param_values() * self.soft_target_tau) self.qf_loss_averages.append(qf_loss) self.policy_surr_averages.append(policy_surr) self.q_averages.append(qval) self.y_averages.append(ys)
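# A brief sketch (illustrative, not this repo's API) of the two updates used
# above: the one-step TD target y = r + (1 - terminal) * gamma * Q'(s', mu'(s')),
# and the Polyak-averaged ("soft") target-network update with rate tau.
import numpy as np

def td_targets(rewards, terminals, next_qvals, discount):
    # Bootstrap only from non-terminal next states.
    return rewards + (1.0 - terminals) * discount * next_qvals

def soft_update(target_params, source_params, tau):
    # target <- (1 - tau) * target + tau * source, applied to flat parameter vectors.
    return (1.0 - tau) * target_params + tau * source_params

# Example: with tau = 1e-3, the target network tracks the learned network slowly,
# which stabilizes the bootstrapped y values.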
def optimize_policy(self, itr, samples_data, paths): sortedPaths = paths # select a subset of paths for training. if len(sortedPaths) > POWERGradient.numSampledPaths: # select a random subset of paths for tranining. # selected_samples = random.sample(range(len(sortedPaths)), 10) # sortedPaths = [sortedPaths[x] for x in selected_samples] # select the subset of best paths for training. sortedPaths = sorted(paths, key=lambda path: np.sum(path["rewards"]), reverse=True) sortedPaths = sortedPaths[0:POWERGradient.numSampledPaths] processed_samples = self.sampler.process_samples(itr, sortedPaths) all_input_values = ext.extract( processed_samples, "observations", "actions", "path_rewards" ) polGrad = np.array([0.] * len(self.policy.get_param_values())) for obv, act, path_rew in zip(*all_input_values): polGrad = polGrad + path_rew * np.array(self.polLogGradFunc(obv, act)) polGrad = polGrad / POWERGradient.numSampledPaths # RMSProp update of policy parameters. self.gS = 0.9 * self.gS + 0.1 * (polGrad ** 2) newPolParams = self.policy.get_param_values() + self.step_size * polGrad / np.sqrt(self.gS + TINY) self.policy.set_param_values(newPolParams) return dict()
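# A small sketch (assumed, not taken from this class) of the RMSProp-style step
# used above: an exponential moving average of the squared gradient rescales the
# return-weighted score-function gradient before the ascent update.
import numpy as np

TINY = 1e-8

def rmsprop_step(params, grad, grad_sq_avg, step_size, decay=0.9):
    grad_sq_avg = decay * grad_sq_avg + (1.0 - decay) * grad ** 2
    new_params = params + step_size * grad / np.sqrt(grad_sq_avg + TINY)
    return new_params, grad_sq_avg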
def optimize_policy(self, itr, samples_data): logger.log("optimizing policy") inputs = ext.extract( samples_data, "observations", "actions", "advantages", "noises", "task_idxs" ) agent_infos = samples_data["agent_infos"] state_info_list = [agent_infos[k] for k in self.policy.state_info_keys] inputs += tuple(state_info_list) if self.policy.recurrent: inputs += (samples_data["valids"],) dist_info_list = [agent_infos[k] for k in self.policy.distribution.dist_info_keys] loss_before = self.optimizer.loss(inputs) curr_mean = sess.run(self.policy.all_params['latent_means']) curr_std = np.exp(sess.run(self.policy.all_params['latent_stds'])) import ipdb ipdb.set_trace() self.optimizer.optimize(inputs) curr_mean = sess.run(self.policy.all_params['latent_means']) curr_std = np.exp(sess.run(self.policy.all_params['latent_stds'])) import ipdb ipdb.set_trace() loss_after = self.optimizer.loss(inputs) logger.record_tabular("LossBefore", loss_before) logger.record_tabular("LossAfter", loss_after) mean_kl, max_kl = self.opt_info['f_kl'](*(list(inputs) + dist_info_list)) logger.record_tabular('MeanKL', mean_kl) logger.record_tabular('MaxKL', max_kl)
def MPC(self, num_samples): all_samples = [] paths = self.obtain_samples(0, self.goal) samples_data = {} for key in paths.keys(): # the keys are the tasks # don't log because this will spam the consol with every task. samples_data[key] = self.process_samples(0, paths[key], log=False) obs_list, action_list, adv_list = [], [], [] for i in range(num_samples): inputs = ext.extract(samples_data[i], 'observations', 'actions', 'advantages') # inputs_list.append(np.concatenate((inputs[0], inputs[1]), axis = 1).astype(np.float32)) obs_list.append(inputs[0]) action_list.append(inputs[1].reshape( [-1, 20, self.env.action_space.flat_dim])) adv_list.append(0) for i in range(num_samples): new_obs_list = [] for j in range(action_list[i].shape[0]): self.env.reset(init_state=obs_list[i][j]) action = np.clip(action_list[i][j], *self.env.action_space.bounds) _, reward, _, _ = self.env.step(action) adv_list[i] = adv_list[i] + reward new_obs_list.append( self.policy.get_state(obs_list[i][j], action_list[i][j])) obs_list[i] = new_obs_list index = np.argmax(adv_list) return action_list[index][0]
def optimize_policy(self, itr, samples_latent): logger.log("optimizing policy") # inputs = ext.extract(samples, # 'observations', 'actions', 'advantages', 'noises', 'task_idxs') # obs=inputs[0] # actions=inputs[1] # advantages=inputs[2] # noises=inputs[3] # task_idxs = inputs[4] latent_inputs = ext.extract(samples_latent, "advantages", "noises", "task_idxs") latent_advantages = latent_inputs[0] latent_noises = latent_inputs[1] latent_task_idxs = latent_inputs[2] sess = tf.get_default_session() means = sess.run( tf.gather(self.policy.all_params['latent_means'], latent_task_idxs)) logstds = sess.run( tf.gather(self.policy.all_params['latent_stds'], latent_task_idxs)) #import ipdb #ipdb.set_trace() zs = means + latent_noises * np.exp(logstds) # self.num_top = 10 # best_indices = advantages.argsort()[-self.num_top:][::-1] # good_noises = np.asarray([zs[ind] for ind in best_indices]) # inputs = [obs, actions, advantages, noises, task_idxs, latent_advantages, zs, latent_task_idxs] inputs = [latent_advantages, zs, latent_task_idxs] self.optimize(inputs, sess, itr)
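# A one-line reparameterization sketch (illustrative): the latent sample above is
# reconstructed deterministically from the stored noise, z = mean + eps * std, so
# gradients can flow through the per-task mean and log-std parameters.
import numpy as np

def reparameterize(mean, log_std, noise):
    return mean + noise * np.exp(log_std)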
def optimize_policy(self, itr, samples_data): logger.log("optimizing policy") inputs = ext.extract(samples_data, "observations", "actions", "advantages") agent_infos = samples_data["agent_infos"] state_info_list = [agent_infos[k] for k in self.policy.state_info_keys] inputs += tuple(state_info_list) if self.policy.recurrent: inputs += (samples_data["valids"], ) if self.qprop: inputs += (samples_data["etas"], ) logger.log("Using Qprop optimizer") optimizer = self.optimizer dist_info_list = [ agent_infos[k] for k in self.policy.distribution.dist_info_keys ] loss_before = optimizer.loss(inputs) gc.collect() optimizer.optimize(inputs) gc.collect() loss_after = optimizer.loss(inputs) logger.record_tabular("LossBefore", loss_before) logger.record_tabular("LossAfter", loss_after) mean_kl, max_kl = self.opt_info['f_kl'](*(list(inputs) + dist_info_list)) logger.record_tabular('MeanKL', mean_kl) logger.record_tabular('MaxKL', max_kl)
def optimize_policy(self, itr, samples_data): all_input_values = tuple( ext.extract(samples_data, "observations", "actions", "advantages")) agent_infos = samples_data["agent_infos"] state_info_list = [agent_infos[k] for k in self.policy.state_info_keys] dist_info_list = [ agent_infos[k] for k in self.policy.distribution.dist_info_keys ] all_input_values += tuple(state_info_list) + tuple(dist_info_list) if self.policy.recurrent: all_input_values += (samples_data["valids"], ) loss_before = self.optimizer.loss(all_input_values) mean_kl_before = self.optimizer.constraint_val(all_input_values) if (itr == 0): acceptViolation = True else: acceptViolation = False self.optimizer.optimize(all_input_values, acceptViolation=acceptViolation) mean_kl = self.optimizer.constraint_val(all_input_values) loss_after = self.optimizer.loss(all_input_values) logger.record_tabular('LossBefore', loss_before) logger.record_tabular('LossAfter', loss_after) logger.record_tabular('MeanKLBefore', mean_kl_before) logger.record_tabular('MeanKL', mean_kl) logger.record_tabular('dLoss', loss_before - loss_after) return dict()
def do_training(self, itr, batch): # Update Q Function obs, actions, rewards, next_obs, terminals = ext.extract( batch, "observations", "actions", "rewards", "next_observations", "terminals") next_actions, _ = self.target_policy.get_action(next_obs) next_qvals = self.target_qf.get_qval(next_obs, next_actions) rewards = rewards.reshape(-1, 1) terminals_mask = (1.0 - terminals).reshape(-1, 1) ys = rewards + terminals_mask * self.discount * next_qvals qf_loss = self.train_qf(ys, obs, actions) policy_surr = self.train_policy(obs) self.target_policy.set_param_values( self.target_policy.get_param_values() * (1 - self.soft_target_tau) + self.policy.get_param_values() * self.soft_target_tau) self.target_qf.set_param_values(self.target_qf.get_param_values() * (1 - self.soft_target_tau) + self.qf.get_param_values() * self.soft_target_tau) self.qf_loss_averages.append(qf_loss) self.policy_surr_averages.append(policy_surr)
def optimize_policy(self, itr, samples_data):
    assert len(samples_data['observations']) % self.period == 0
    # note that I have to do extra preprocessing to the advantages, and also create obs_var_sparse
    input_values = tuple(
        ext.extract(samples_data, "observations", "actions", "advantages"))
    obs_raw = input_values[0].reshape(
        input_values[0].shape[0] // self.period, self.period,
        input_values[0].shape[1])
    obs_sparse = input_values[0].take(
        [i for i in range(0, input_values[0].shape[0], self.period)], axis=0)
    advantage_sparse = np.sum(input_values[2].reshape(
        [input_values[2].shape[0] // self.period, self.period]),
                              axis=1)
    all_input_values = (obs_raw, obs_sparse, input_values[1],
                        advantage_sparse)
    loss_before = self.optimizer.loss(all_input_values)
    self.optimizer.optimize(all_input_values)
    loss_after = self.optimizer.loss(all_input_values)
    logger.record_tabular('LossBefore', loss_before)
    logger.record_tabular('LossAfter', loss_after)
    logger.record_tabular('dLoss', loss_before - loss_after)
    return dict()
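# A quick numpy illustration (hypothetical shapes) of the "sparse" views built
# above: taking every self.period-th row picks the observation seen by the
# high-level policy at the start of each period, and summing advantages within a
# period aggregates credit over the steps the chosen skill was active.
import numpy as np

period = 5
obs = np.arange(20 * 3).reshape(20, 3)           # 20 steps, obs dim 3
adv = np.ones(20)

obs_sparse = obs[::period]                        # same as obs.take(range(0, 20, period), axis=0)
adv_sparse = adv.reshape(-1, period).sum(axis=1)  # one aggregated advantage per period
assert obs_sparse.shape == (4, 3) and adv_sparse.shape == (4,)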
def do_training(self, itr, batch): obs, actions, rewards, next_obs, terminals = ext.extract( batch, "observations", "actions", "rewards", "next_observations", "terminals" ) # compute the on-policy y values target_qf = self.opt_info["target_qf"] target_policy = self.opt_info["target_policy"] next_actions, _ = target_policy.get_actions(next_obs) next_qvals = target_qf.get_qval(next_obs, next_actions) ys = rewards + (1. - terminals) * self.discount * next_qvals f_train_qf = self.opt_info["f_train_qf"] f_train_policy = self.opt_info["f_train_policy"] qf_loss, qval = f_train_qf(ys, obs, actions) policy_surr = f_train_policy(obs) target_policy.set_param_values( target_policy.get_param_values() * (1.0 - self.soft_target_tau) + self.policy.get_param_values() * self.soft_target_tau) target_qf.set_param_values( target_qf.get_param_values() * (1.0 - self.soft_target_tau) + self.qf.get_param_values() * self.soft_target_tau) self.qf_loss_averages.append(qf_loss) self.policy_surr_averages.append(policy_surr) self.q_averages.append(qval) self.y_averages.append(ys)
def optimize_policy(self, itr, samples_data):
    assert len(samples_data['observations']) % self.period == 0
    # note that I have to do extra preprocessing to the advantages, and also create obs_var_sparse
    if self.use_skill_dependent_baseline:
        input_values = tuple(
            ext.extract(samples_data, "observations", "actions", "advantages",
                        "agent_infos", "skill_advantages"))
    else:
        input_values = tuple(
            ext.extract(samples_data, "observations", "actions", "advantages",
                        "agent_infos"))
    obs_raw = input_values[0].reshape(
        input_values[0].shape[0] // self.period, self.period,
        input_values[0].shape[1])
    obs_sparse = input_values[0].take(
        [i for i in range(0, input_values[0].shape[0], self.period)], axis=0)
    advantage_sparse = input_values[2].reshape(
        [input_values[2].shape[0] // self.period, self.period])[:, 0]
    latents = input_values[3]['latents']
    latents_sparse = latents.take(
        [i for i in range(0, latents.shape[0], self.period)], axis=0)
    if self.use_skill_dependent_baseline:
        all_input_values = (obs_raw, obs_sparse, input_values[1],
                            input_values[4], advantage_sparse, latents,
                            latents_sparse)
    else:
        all_input_values = (obs_raw, obs_sparse, input_values[1],
                            input_values[2], advantage_sparse, latents,
                            latents_sparse)
    loss_before = self.optimizer.loss(all_input_values)
    self.optimizer.optimize(all_input_values)
    loss_after = self.optimizer.loss(all_input_values)
    logger.record_tabular('LossBefore', loss_before)
    logger.record_tabular('LossAfter', loss_after)
    logger.record_tabular('dLoss', loss_before - loss_after)
    return dict()
def __setstate__(self, d): super(ReplayPool, self).__setstate__(d) self.bottom, self.top, self.size, self.observations, self.actions, \ self.rewards, self.terminals, self.extras, self.rng = extract( d, "bottom", "top", "size", "observations", "actions", "rewards", "terminals", "extras", "rng" )
def optimize_policy(self, itr, samples_data): all_input_values = tuple( ext.extract(samples_data, "observations", "actions", "advantages")) agent_infos = samples_data["agent_infos"] state_info_list = [agent_infos[k] for k in self.policy.state_info_keys] dist_info_list = [ agent_infos[k] for k in self.policy.distribution.dist_info_keys ] all_input_values += tuple(state_info_list) + tuple(dist_info_list) aux_pred_data = self.aux_pred_pool.random_batch( int(self.pool_batch_size)) all_input_values += tuple([np.array(aux_pred_data['inputs'])]) + tuple( [aux_pred_data['outputs']]) if self.policy.recurrent: all_input_values += (samples_data["valids"], ) loss_before = self.optimizer.loss(all_input_values) mean_kl_before = self.optimizer.constraint_val(all_input_values) self.optimizer.optimize(all_input_values) pred_loss = self.policy.aux_loss(aux_pred_data['inputs'], aux_pred_data['outputs']) if itr == 0: self.optimize_aux_tasks(epoch=100) '''loss_after = self.optimizer.loss(all_input_values) param_before = np.copy(self.policy.get_param_values(trainable=True)) aux_net_param_before = np.copy(self.policy._aux_pred_network.get_param_values(trainable=True)) if itr == 0: auxstep_size = 0 self.optimize_aux_tasks(epoch=100) else: self.optimize_aux_tasks(1) policy_direction = self.policy.get_param_values(trainable=True) - param_before aux_net_direction = self.policy._aux_pred_network.get_param_values(trainable=True) - aux_net_param_before auxstep_size = 1 for line_step in range(20): self.policy.set_param_values(param_before + auxstep_size * policy_direction, trainable=True) temp_kl = self.optimizer.constraint_val(all_input_values) temp_loss = self.optimizer.loss(all_input_values) if temp_loss < loss_after+abs(loss_after)*0.001 and temp_kl < self.step_size: break auxstep_size *= 0.6 self.policy._aux_pred_network.set_param_values(aux_net_param_before + auxstep_size * aux_net_direction,trainable=True)''' mean_kl = self.optimizer.constraint_val(all_input_values) loss_after = self.optimizer.loss(all_input_values) logger.record_tabular('Prediction Loss', pred_loss) logger.record_tabular('LossBefore', loss_before) logger.record_tabular('LossAfter', loss_after) logger.record_tabular('MeanKLBefore', mean_kl_before) logger.record_tabular('MeanKL', mean_kl) logger.record_tabular('dLoss', loss_before - loss_after) return dict()
def optimize_policy(self, itr, all_samples_data, particle_idx): self.policy = self.policy_list[particle_idx] self.optimizer = self.optimizer_list[particle_idx] assert len(all_samples_data) == len(self.policy_list) assert len(all_samples_data[0]) == self.num_grad_updates + 1 input_list = [] for n in range(len(all_samples_data)): for step in range(len(all_samples_data[0]) - 1): # these are the gradient steps obs_list, action_list, adv_list = [], [], [] for i in range(self.meta_batch_size): inputs = ext.extract( all_samples_data[n][step][i], "observations", "actions", "advantages" ) obs_list.append(inputs[0]) action_list.append(inputs[1]) adv_list.append(inputs[2]) input_list += obs_list + action_list + adv_list # [ [obs_0], [act_0], [adv_0], [obs_1], ... ] if particle_idx == 0 and (self.n_particles > 1): sess = tf.get_default_session() global_h = sess.run(self.global_h,feed_dict=dict(list(zip(self.policy_list[0].input_list_for_grad, input_list)))) logger.record_tabular('global_h', global_h) obs_list, action_list, adv_list = [], [] , [] for i in range(self.meta_batch_size): inputs = ext.extract( all_samples_data[particle_idx][-1][i], "observations", "actions", "advantages" ) obs_list.append(inputs[0]) action_list.append(inputs[1]) adv_list.append(inputs[2]) input_list += obs_list + action_list + adv_list dist_info_list = [] for i in range(self.meta_batch_size): agent_infos = all_samples_data[particle_idx][self.kl_constrain_step][i]['agent_infos'] dist_info_list += [agent_infos[k] for k in self.policy.distribution.dist_info_keys] input_list += tuple(dist_info_list) self.optimizer.optimize(input_list) return dict()
def optimize_policy(self, itr, samples_data): # update the weight entropy input list ent_input = [] for i in range(1000): ent_input.append( np.concatenate([self.base, np.random.random(self.mp_dim)]).tolist()) self.ent_input = [np.array(ent_input)] all_input_values = tuple( ext.extract(samples_data, "observations", "actions", "advantages")) ooo = ext.extract(samples_data, "observations", "actions", "advantages") agent_infos = samples_data["agent_infos"] state_info_list = [agent_infos[k] for k in self.policy.state_info_keys] dist_info_list = [ agent_infos[k] for k in self.policy.distribution.dist_info_keys ] all_input_values += tuple(state_info_list) + tuple(dist_info_list) all_input_values += tuple(self.ent_input) if self.policy.recurrent: all_input_values += (samples_data["valids"], ) loss_before = self.optimizer.loss(all_input_values) mean_kl_before = self.optimizer.constraint_val(all_input_values) self.optimizer.optimize(all_input_values) ent_input = tuple(self.ent_input) blend_weight_entropy = self.policy._f_weightentropy(ent_input[0])[0] blend_choice_entropy = self.policy._f_choiceentropy(ent_input[0])[0] mean_kl = self.optimizer.constraint_val(all_input_values) loss_after = self.optimizer.loss(all_input_values) logger.record_tabular('Blend Weight Entropy', blend_weight_entropy) logger.record_tabular('Blend Choice Entropy', blend_choice_entropy) logger.record_tabular('LossBefore', loss_before) logger.record_tabular('LossAfter', loss_after) logger.record_tabular('MeanKLBefore', mean_kl_before) logger.record_tabular('MeanKL', mean_kl) logger.record_tabular('dLoss', loss_before - loss_after) return dict()
def train_from_paths(self, paths, sub_sample=None, path_percentile=[10,15,33,50,66,85,90]): if sub_sample != None: # Pick subset of paths whose returns are in the sub_sample percentile range path_returns = [sum(p["rewards"]) for p in paths] sub_range = [np.percentile(path_returns, sub_sample[i]) for i in range(2)] # Find paths which satisfy criteria idx = [i for i,ret in enumerate(path_returns) if sub_range[0]<=ret and ret<=sub_range[1]] chosen_paths = [paths[i] for i in idx] else: chosen_paths = paths self.baseline.fit(paths) # concatenate from all the trajectories observations = tensor_utils.concat_tensor_list([path["observations"] for path in chosen_paths]) actions = tensor_utils.concat_tensor_list([path["actions"] for path in chosen_paths]) rewards = tensor_utils.concat_tensor_list([path["rewards"] for path in chosen_paths]) advantages = tensor_utils.concat_tensor_list([path["advantages"] for path in chosen_paths]) env_infos = tensor_utils.concat_tensor_dict_list([path["env_infos"] for path in chosen_paths]) agent_infos = tensor_utils.concat_tensor_dict_list([path["agent_infos"] for path in chosen_paths]) samples_data = dict( observations=observations, actions=actions, rewards=rewards, advantages=advantages, env_infos=env_infos, agent_infos=agent_infos, ) all_input_values = tuple(ext.extract( samples_data, "observations", "actions", "advantages" )) agent_infos = samples_data["agent_infos"] state_info_list = [agent_infos[k] for k in self.policy.state_info_keys] dist_info_list = [agent_infos[k] for k in self.policy.distribution.dist_info_keys] all_input_values += tuple(state_info_list) + tuple(dist_info_list) # Take a step with optimizer self.optimizer.optimize(all_input_values) # cache return distributions for the paths path_returns = [sum(p["rewards"]) for p in paths] mean_return = np.mean(path_returns) std_return = np.std(path_returns) min_return = np.amin(path_returns) max_return = np.amax(path_returns) sub_mean = np.mean([sum(p["rewards"]) for p in chosen_paths]) base_stats = [mean_return, std_return, min_return, max_return, sub_mean] percetile_stats = [] for p in path_percentile: percetile_stats.append(np.percentile(path_returns, p)) return [base_stats, percetile_stats]
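# A compact sketch (illustrative) of the percentile filter used above: keep only
# paths whose return falls inside the [low, high] percentile band of the batch.
import numpy as np

def filter_paths_by_return(paths, low_pct, high_pct):
    returns = np.array([np.sum(p["rewards"]) for p in paths])
    lo, hi = np.percentile(returns, [low_pct, high_pct])
    return [p for p, r in zip(paths, returns) if lo <= r <= hi]

# e.g. filter_paths_by_return(paths, 50, 90) trains only on the better half of
# the batch while excluding the extreme top tail.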
def compute_updated_dists(self, samples): """ Compute fast gradients once and pull them out of tensorflow for sampling. """ num_tasks = len(samples) param_keys = self.all_params.keys() sess = tf.get_default_session() obs_list, action_list, adv_list = [], [], [] for i in range(num_tasks): inputs = ext.extract(samples[i], 'observations', 'actions', 'advantages') obs_list.append(inputs[0]) action_list.append(inputs[1]) adv_list.append(inputs[2]) inputs = obs_list + action_list + adv_list # To do a second update, replace self.all_params below with the params that were used to collect the policy. init_param_values = None if self.all_param_vals is not None: init_param_values = self.get_variable_values(self.all_params) step_size = self.step_size for i in range(num_tasks): if self.all_param_vals is not None: self.assign_params(self.all_params, self.all_param_vals[i]) if 'all_fast_params_tensor' not in dir(self): # make computation graph once self.all_fast_params_tensor = [] for i in range(num_tasks): gradients = dict(zip(param_keys, tf.gradients(self.surr_objs[i], [self.all_params[key] for key in param_keys]))) fast_params_tensor = dict(zip(param_keys, [self.all_params[key] - step_size*gradients[key] for key in param_keys])) self.all_fast_params_tensor.append(fast_params_tensor) # pull new param vals out of tensorflow, so gradient computation only done once self.all_param_vals = sess.run(self.all_fast_params_tensor, feed_dict=dict(list(zip(self.input_list_for_grad, inputs)))) if init_param_values is not None: self.assign_params(self.all_params, init_param_values) outputs = [] inputs = tf.split(0, num_tasks, self._l_obs) for i in range(num_tasks): # TODO - use a placeholder to feed in the params, so that we don't have to recompile every time. task_inp = inputs[i] info, _ = self.dist_info_sym(task_inp, dict(), all_params=self.all_param_vals[i], is_training=False) outputs.append([info['prob']]) self._cur_f_prob = tensor_utils.compile_function( inputs = [self._l_obs], outputs = outputs, )
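# A minimal sketch (not this policy's API) of the fast-weight construction above:
# one gradient step per task, theta_i' = theta - alpha * grad_theta L_i(theta),
# evaluated once in the graph and then pulled out as concrete parameter values.
def fast_adapted_params(params, grads, step_size):
    # params and grads are dicts keyed by parameter name.
    return {k: params[k] - step_size * grads[k] for k in params}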
def optimize_policy(self, itr, all_samples_data): assert len(all_samples_data) == self.num_grad_updates + 1 # we collected the rollouts to compute the grads and then the test! if not self.use_maml: all_samples_data = [all_samples_data[0]] input_list = [] for step in range(len(all_samples_data)): # these are the gradient steps obs_list, action_list, adv_list = [], [], [] for i in range(self.meta_batch_size): inputs = ext.extract( all_samples_data[step][i], "observations", "actions", "advantages" ) obs_list.append(inputs[0]) action_list.append(inputs[1]) adv_list.append(inputs[2]) input_list += obs_list + action_list + adv_list # [ [obs_0], [act_0], [adv_0], [obs_1], ... ] if step == 0: ##CF not used? init_inputs = input_list if self.use_maml: dist_info_list = [] for i in range(self.meta_batch_size): agent_infos = all_samples_data[self.kl_constrain_step][i]['agent_infos'] dist_info_list += [agent_infos[k] for k in self.policy.distribution.dist_info_keys] input_list += tuple(dist_info_list) logger.log("Computing KL before") mean_kl_before = self.optimizer.constraint_val(input_list) logger.log("Computing loss before") loss_before = self.optimizer.loss(input_list) logger.log("Optimizing") self.optimizer.optimize(input_list) logger.log("Computing loss after") loss_after = self.optimizer.loss(input_list) if self.use_maml: logger.log("Computing KL after") mean_kl = self.optimizer.constraint_val(input_list) logger.record_tabular('MeanKLBefore', mean_kl_before) # this now won't be 0! logger.record_tabular('MeanKL', mean_kl) logger.record_tabular('LossBefore', loss_before) logger.record_tabular('LossAfter', loss_after) logger.record_tabular('dLoss', loss_before - loss_after) return dict()
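# A small sketch (hypothetical data layout) of the flat input list assembled
# above: for each gradient step, the per-task observations, actions and
# advantages are appended as [obs_0..obs_K, act_0..act_K, adv_0..adv_K], so the
# optimizer's placeholder list must be ordered the same way.
def flatten_meta_inputs(all_samples_data, meta_batch_size):
    input_list = []
    for step_data in all_samples_data:          # one entry per gradient step
        obs_list, action_list, adv_list = [], [], []
        for i in range(meta_batch_size):
            obs_list.append(step_data[i]["observations"])
            action_list.append(step_data[i]["actions"])
            adv_list.append(step_data[i]["advantages"])
        input_list += obs_list + action_list + adv_list
    return input_list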
def optimize_policy(self, itr, samples_data): logger.log("optimizing policy") inputs = ext.extract( samples_data, "observations", "actions", "advantages" ) if self.policy.recurrent: inputs += (samples_data["valids"],) agent_infos = samples_data["agent_infos"] dist_info_list = [agent_infos[k] for k in self.policy.distribution.dist_info_keys] loss_before = self.optimizer.loss(inputs) self.optimizer.optimize(inputs) loss_after = self.optimizer.loss(inputs) logger.record_tabular("LossBefore", loss_before) logger.record_tabular("LossAfter", loss_after) mean_kl, max_kl = self.opt_info['f_kl'](*(list(inputs) + dist_info_list)) logger.record_tabular('MeanKL', mean_kl) logger.record_tabular('MaxKL', max_kl)
def train_from_paths(self, paths): self.baseline.fit(paths) # concatenate from all the trajectories observations = tensor_utils.concat_tensor_list([path["observations"] for path in paths]) actions = tensor_utils.concat_tensor_list([path["actions"] for path in paths]) rewards = tensor_utils.concat_tensor_list([path["rewards"] for path in paths]) advantages = tensor_utils.concat_tensor_list([path["advantages"] for path in paths]) env_infos = tensor_utils.concat_tensor_dict_list([path["env_infos"] for path in paths]) agent_infos = tensor_utils.concat_tensor_dict_list([path["agent_infos"] for path in paths]) samples_data = dict( observations=observations, actions=actions, rewards=rewards, advantages=advantages, env_infos=env_infos, agent_infos=agent_infos, ) all_input_values = tuple(ext.extract( samples_data, "observations", "actions", "advantages" )) agent_infos = samples_data["agent_infos"] state_info_list = [agent_infos[k] for k in self.policy.state_info_keys] dist_info_list = [agent_infos[k] for k in self.policy.distribution.dist_info_keys] all_input_values += tuple(state_info_list) # Take a step with optimizer self.optimizer.optimize(all_input_values) # cache return distributions for the paths path_returns = [sum(p["rewards"]) for p in paths] mean_return = np.mean(path_returns) std_return = np.std(path_returns) min_return = np.amin(path_returns) max_return = np.amax(path_returns) return (mean_return, std_return, min_return, max_return)