def optimize_policy(self, all_samples_data, log=True):
    """
    Performs MAML outer step

    Args:
        all_samples_data (list) : list of lists of lists of samples (each is a dict) split by gradient update and
                                  meta task
        log (bool) : whether to log statistics

    Returns:
        None
    """
    meta_op_input_dict = self._extract_input_dict_meta_op(all_samples_data, self._optimization_keys)

    logger.log("Computing KL before")
    mean_kl_before = self.optimizer.constraint_val(meta_op_input_dict)

    logger.log("Computing loss before")
    loss_before = self.optimizer.loss(meta_op_input_dict)

    logger.log("Optimizing")
    self.optimizer.optimize(meta_op_input_dict)

    logger.log("Computing loss after")
    loss_after = self.optimizer.loss(meta_op_input_dict)

    logger.log("Computing KL after")
    mean_kl = self.optimizer.constraint_val(meta_op_input_dict)

    if log:
        logger.logkv('MeanKLBefore', mean_kl_before)
        logger.logkv('MeanKL', mean_kl)
        logger.logkv('LossBefore', loss_before)
        logger.logkv('LossAfter', loss_after)
        logger.logkv('dLoss', loss_before - loss_after)
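The meta-update consumes `all_samples_data` with the nesting described in the docstring. A minimal sketch of one plausible layout consistent with that description (hypothetical keys and shapes; the exact structure comes from the sample processor and `self._optimization_keys`):

import numpy as np

num_inner_grad_steps = 1       # hypothetical sizes, for illustration only
meta_batch_size = 2
batch_size, obs_dim, act_dim = 5, 3, 2

def make_task_samples():
    # keys mirror the usual policy-gradient fields; the real keys come from the sample processor
    return dict(observations=np.zeros((batch_size, obs_dim)),
                actions=np.zeros((batch_size, act_dim)),
                advantages=np.zeros(batch_size))

# all_samples_data[grad_update][meta_task] is a dict of stacked sample arrays:
# one entry per adaptation step plus one post-adaptation entry
all_samples_data = [[make_task_samples() for _ in range(meta_batch_size)]
                    for _ in range(num_inner_grad_steps + 1)]
assert len(all_samples_data) == num_inner_grad_steps + 1
assert len(all_samples_data[0]) == meta_batch_size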
def optimize(self, input_val_dict):
    """
    Carries out the optimization step

    Args:
        input_val_dict (dict): dict containing the values to be fed into the computation graph

    Returns:
        (float) loss before optimization
    """
    sess = tf.get_default_session()
    feed_dict = self.create_feed_dict(input_val_dict)

    # Overload self._batch_size
    # dataset = MAMLBatchDataset(inputs, num_batches=self._batch_size, extra_inputs=extra_inputs,
    #                            meta_batch_size=self.meta_batch_size, num_grad_updates=self.num_grad_updates)
    # Todo: reimplement minibatches

    loss_before_opt = None
    for epoch in range(self._max_epochs):
        if self._verbose:
            logger.log("Epoch %d" % epoch)

        loss, _ = sess.run([self._loss, self._train_op], feed_dict)

        if loss_before_opt is None:  # `if not loss_before_opt` would also re-assign after a legitimate loss of 0.0
            loss_before_opt = loss

        # if self._verbose:
        #     logger.log("Epoch: %d | Loss: %f" % (epoch, new_loss))
        #
        # if abs(last_loss - new_loss) < self._tolerance:
        #     break
        # last_loss = new_loss

    return loss_before_opt
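The `Todo: reimplement minibatches` above refers to splitting the feed values into smaller batches per epoch instead of one full-batch `sess.run`. A minimal sketch of such a split, assuming every array in `input_val_dict` shares the same leading batch dimension (the helper name is hypothetical, not part of the repo):

import numpy as np

def iterate_minibatches(input_val_dict, num_batches, shuffle=True):
    """Yield dicts whose arrays are row-slices of the full feed values (hypothetical helper)."""
    n = len(next(iter(input_val_dict.values())))
    idx = np.random.permutation(n) if shuffle else np.arange(n)
    for split in np.array_split(idx, num_batches):
        yield {key: np.asarray(value)[split] for key, value in input_val_dict.items()}

# usage sketch: run one sess.run per minibatch instead of one full-batch update per epoch
values = dict(observations=np.arange(10).reshape(10, 1), advantages=np.ones(10))
for batch in iterate_minibatches(values, num_batches=2):
    assert len(batch['advantages']) == 5   # feed `batch` through create_feed_dict / sess.run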
def optimize(self, input_val_dict):
    """
    Carries out the optimization step

    Args:
        input_val_dict (dict): dict containing the values to be fed into the computation graph

    Returns:
        (float) loss before optimization
    """
    sess = tf.get_default_session()
    feed_dict = self.create_feed_dict(input_val_dict)

    loss_before_opt = None
    for epoch in range(self._max_epochs):
        if self._verbose:
            logger.log("Epoch %d" % epoch)

        loss, _ = sess.run([self._loss, self._train_op], feed_dict)

        if loss_before_opt is None:  # avoid re-assigning after a legitimate loss of exactly 0.0
            loss_before_opt = loss

    return loss_before_opt
def optimize_policy(self, samples_data, log=True):
    """
    Performs the (outer) policy optimization step

    Args:
        samples_data : processed sample data, as returned by the sample processor
        log (bool) : whether to log statistics

    Returns:
        None
    """
    input_dict = self._extract_input_dict(samples_data, self._optimization_keys, prefix='train')

    if log:
        logger.log("Optimizing")
    loss_before = self.optimizer.optimize(input_val_dict=input_dict)

    if log:
        logger.log("Computing statistics")
    loss_after = self.optimizer.loss(input_val_dict=input_dict)

    if log:
        logger.logkv('LossBefore', loss_before)
        logger.logkv('LossAfter', loss_after)
def compute_gradients(self, all_samples_data, log=True):
    """
    Evaluates the gradient tensors in self.gradients on the provided samples without applying an update

    Args:
        all_samples_data (list) : samples split by gradient update and meta task
        log (bool) : whether to log

    Returns:
        gradient values evaluated by the default session
    """
    meta_op_input_dict = self._extract_input_dict_meta_op(all_samples_data, self._optimization_keys)
    feed_dict = utils.create_feed_dict(placeholder_dict=self.meta_op_phs_dict,
                                       value_dict=meta_op_input_dict)
    if log:
        logger.log("compute gradients")
    gradients_values = tf.get_default_session().run(self.gradients, feed_dict=feed_dict)
    return gradients_values
def train(self):
    for i in range(1, self.eff + 1):
        with self.sess.as_default() as sess:
            logger.log("----------- Adaptation rollouts per meta-task = ", i, " -----------")
            # self.sampler.rollouts_per_meta_task = 10000
            self.sampler.update_batch_size(i)

            # initialize uninitialized vars (only initialize vars that were not loaded)
            uninit_vars = [var for var in tf.global_variables()
                           if not sess.run(tf.is_variable_initialized(var))]
            sess.run(tf.variables_initializer(uninit_vars))

            self.task = self.env.sample_tasks(self.sampler.meta_batch_size, is_eval=True)
            self.sampler.set_tasks(self.task)

            # logger.log("\n ---------------- Iteration %d ----------------" % itr)
            logger.log("Sampling set of tasks/goals for this meta-batch...")

            """ -------------------- Sampling --------------------------"""
            logger.log("Obtaining samples...")
            paths = self.sampler.obtain_samples(log=True, log_prefix='train-')

            """ ----------------- Processing Samples ---------------------"""
            logger.log("Processing samples...")
            samples_data = self.sample_processor.process_samples(paths, log='all', log_prefix='train-')
            self.log_diagnostics(sum(paths.values(), []), prefix='train-')

            # """ ------------------ Policy Update ---------------------"""
            # logger.log("Optimizing policy...")
            # # This needs to take all samples_data so that it can construct graph for meta-optimization.
            # time_optimization_step_start = time.time()
            # self.algo.optimize_policy(samples_data)

            """ ------------------- Logging Stuff --------------------------"""
            logger.logkv('n_timesteps', self.sampler.total_timesteps_sampled)

            # logger.log("Saving snapshot...")
            # params = self.get_itr_snapshot(itr)
            # logger.save_itr_params(itr, params)
            # logger.log("Saved")

            logger.dumpkvs()
            # if itr == 0:
            #     sess.graph.finalize()

    logger.log("Training finished")
    self.sess.close()
def optimize(self, input_val_dict):
    """
    Carries out the optimization step

    Args:
        input_val_dict (dict): dict containing the values to be fed into the computation graph

    Returns:
        (float) loss before optimization
    """
    sess = tf.get_default_session()
    batch_size, seq_len, *_ = list(input_val_dict.values())[0].shape

    loss_before_opt = None
    for epoch in range(self._max_epochs):
        hidden_batch = self._target.get_zero_state(batch_size)
        if self._verbose:
            logger.log("Epoch %d" % epoch)

        # run train op
        loss = []
        all_grads = []
        for i in range(0, seq_len, self._backprop_steps):
            n_i = i + self._backprop_steps
            feed_dict = dict([(self._input_ph_dict[key], input_val_dict[key][:, i:n_i])
                              for key in self._input_ph_dict.keys()])
            feed_dict[self._hidden_ph] = hidden_batch
            batch_loss, grads, hidden_batch = sess.run([self._loss, self._gradients_var, self._next_hidden_var],
                                                       feed_dict=feed_dict)
            loss.append(batch_loss)
            all_grads.append(grads)

        grads = [np.mean(grad, axis=0) for grad in zip(*all_grads)]
        feed_dict = dict(zip(self._gradients_ph, grads))
        _ = sess.run(self._train_op, feed_dict=feed_dict)

        if loss_before_opt is None:  # avoid re-assigning after a legitimate loss of exactly 0.0
            loss_before_opt = np.mean(loss)

        # if self._verbose:
        #     logger.log("Epoch: %d | Loss: %f" % (epoch, new_loss))
        #
        # if abs(last_loss - new_loss) < self._tolerance:
        #     break
        # last_loss = new_loss

    return loss_before_opt
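The loop above slices each sequence into windows of `self._backprop_steps`, threads the recurrent hidden state across windows, and averages the per-window gradients before a single `self._train_op` call. A small numpy-only illustration of that chunk-and-average step (dummy gradients, hypothetical shapes):

import numpy as np

seq_len, backprop_steps = 10, 4
grads_per_chunk = []                       # one entry per window; each entry is a list of per-variable gradients
for i in range(0, seq_len, backprop_steps):
    # pretend these came from sess.run on window [i, i + backprop_steps); two "variables" with fixed shapes
    grads_per_chunk.append([np.ones((3, 2)), np.ones(2)])

# average each variable's gradient across windows, mirroring
# grads = [np.mean(grad, axis=0) for grad in zip(*all_grads)]
avg_grads = [np.mean(per_var, axis=0) for per_var in zip(*grads_per_chunk)]
assert avg_grads[0].shape == (3, 2) and avg_grads[1].shape == (2,)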
def optimize_policy(self, all_samples_data, log=True):
    """
    Performs MAML outer step

    Args:
        all_samples_data (list) : list of lists of lists of samples (each is a dict) split by gradient update and
                                  meta task
        log (bool) : whether to log statistics

    Returns:
        None
    """
    meta_op_input_dict = self._extract_input_dict_meta_op(all_samples_data, self._optimization_keys)

    # add kl_coeffs / clip_eps to meta_op_input_dict
    meta_op_input_dict['inner_kl_coeff'] = self.inner_kl_coeff
    meta_op_input_dict['clip_eps'] = self.clip_eps

    if log:
        logger.log("Optimizing")
    loss_before = self.optimizer.optimize(input_val_dict=meta_op_input_dict)

    if log:
        logger.log("Computing statistics")
    loss_after, inner_kls, outer_kl = self.optimizer.compute_stats(input_val_dict=meta_op_input_dict)

    if self.adaptive_inner_kl_penalty:
        if log:
            logger.log("Updating inner KL loss coefficients")
        self.inner_kl_coeff = self.adapt_kl_coeff(self.inner_kl_coeff, inner_kls, self.target_inner_step)

    if log:
        logger.logkv('LossBefore', loss_before)
        logger.logkv('LossAfter', loss_after)
        logger.logkv('KLInner', np.mean(inner_kls))
        logger.logkv('KLCoeffInner', np.mean(self.inner_kl_coeff))
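`adapt_kl_coeff` is not shown in this snippet. As an assumption rather than the repo's exact rule, a common PPO-style adaptive penalty raises each coefficient when its measured inner KL overshoots `target_inner_step` and lowers it when it undershoots:

import numpy as np

def adapt_kl_coeff_sketch(kl_coeffs, kl_values, kl_target, factor=2.0, tol=1.5):
    """Hypothetical per-step adaptive KL penalty update (not the repo's exact rule)."""
    kl_coeffs = np.atleast_1d(np.asarray(kl_coeffs, dtype=np.float64))
    kl_values = np.atleast_1d(np.asarray(kl_values, dtype=np.float64))
    new_coeffs = kl_coeffs.copy()
    new_coeffs[kl_values > tol * kl_target] *= factor    # KL too large -> penalize harder
    new_coeffs[kl_values < kl_target / tol] /= factor    # KL too small -> relax the penalty
    return new_coeffs

print(adapt_kl_coeff_sketch(kl_coeffs=[1e-2, 1e-2], kl_values=[0.05, 0.001], kl_target=0.01))
# -> [0.02  0.005]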
def optimize_policy(self, all_samples_data, mod_samples_data, num_paths_per_rollout, log=True):
    """
    Performs MAML outer step

    Args:
        all_samples_data (list) : list of lists of lists of samples (each is a dict) split by gradient update and
                                  meta task
        mod_samples_data : extra sample data fed through self.policy.mod_input_var
        num_paths_per_rollout : value fed through self.policy.num_paths_var
        log (bool) : whether to log statistics

    Returns:
        None
    """
    meta_op_input_dict = self._extract_input_dict_meta_op(all_samples_data, self._optimization_keys)
    extra_feed_dict = {
        self.policy.mod_input_var: mod_samples_data,
        self.policy.num_paths_var: num_paths_per_rollout,
    }

    # add kl_coeffs / clip_eps to meta_op_input_dict
    meta_op_input_dict['inner_kl_coeff'] = self.inner_kl_coeff
    meta_op_input_dict['clip_eps'] = self.clip_eps

    if log:
        logger.log("Optimizing")
    loss_before, grad_norms = self.optimizer.optimize(input_val_dict=meta_op_input_dict,
                                                      extra_feed_dict=extra_feed_dict)

    if self.summary_writer is not None:
        for name, norm in grad_norms.items():
            tensorboard_util.log_scalar(self.summary_writer, 'grads/' + name, norm, self.log_step)
        self.log_step += 1

    if log:
        logger.log("Computing statistics")
    loss_after, inner_kls, outer_kl = self.optimizer.compute_stats(input_val_dict=meta_op_input_dict,
                                                                   extra_feed_dict=extra_feed_dict)

    if self.adaptive_inner_kl_penalty:
        if log:
            logger.log("Updating inner KL loss coefficients")
        self.inner_kl_coeff = self.adapt_kl_coeff(self.inner_kl_coeff, inner_kls, self.target_inner_step)

    if log:
        logger.logkv('LossBefore', loss_before)
        logger.logkv('LossAfter', loss_after)
        logger.logkv('KLInner', np.mean(inner_kls))
        logger.logkv('KLCoeffInner', np.mean(self.inner_kl_coeff))
def train(self):
    """
    Trains policy on env using algo

    Pseudocode::

        for itr in n_itr:
            for step in num_inner_grad_steps:
                sampler.sample()
                algo.compute_updated_dists()
            algo.optimize_policy()
            sampler.update_goals()
    """
    with self.sess.as_default() as sess:

        # initialize uninitialized vars (only initialize vars that were not loaded)
        uninit_vars = [var for var in tf.global_variables()
                       if not sess.run(tf.is_variable_initialized(var))]
        sess.run(tf.variables_initializer(uninit_vars))

        start_time = time.time()
        for itr in range(self.start_itr, self.n_itr):
            itr_start_time = time.time()
            logger.log("\n ---------------- Iteration %d ----------------" % itr)
            logger.log("Sampling set of tasks/goals for this meta-batch...")

            # self.sampler.update_tasks()
            self.policy.switch_to_pre_update()  # Switch to pre-update policy

            all_samples_data, all_paths = [], []
            list_sampling_time, list_inner_step_time, list_outer_step_time, list_proc_samples_time = [], [], [], []
            start_total_inner_time = time.time()
            for step in range(self.num_inner_grad_steps + 1):
                logger.log('** Step ' + str(step) + ' **')

                """ -------------------- Sampling --------------------------"""
                logger.log("Obtaining samples...")
                time_env_sampling_start = time.time()
                paths = self.sampler.obtain_samples(log=True, log_prefix='Step_%d-' % step)
                list_sampling_time.append(time.time() - time_env_sampling_start)
                all_paths.append(paths)

                """ ----------------- Processing Samples ---------------------"""
                logger.log("Processing samples...")
                time_proc_samples_start = time.time()
                samples_data = self.sample_processor.process_samples(paths, log='all', log_prefix='Step_%d-' % step)
                all_samples_data.append(samples_data)
                list_proc_samples_time.append(time.time() - time_proc_samples_start)

                self.log_diagnostics(sum(list(paths.values()), []), prefix='Step_%d-' % step)

                """ ------------------- Inner Policy Update --------------------"""
                time_inner_step_start = time.time()
                if step < self.num_inner_grad_steps:
                    logger.log("Computing inner policy updates...")
                    self.algo._adapt(samples_data)
                # train_writer = tf.summary.FileWriter('/home/ignasi/Desktop/meta_policy_search_graph',
                #                                      sess.graph)
                list_inner_step_time.append(time.time() - time_inner_step_start)
            total_inner_time = time.time() - start_total_inner_time

            time_maml_opt_start = time.time()

            """ ------------------ Outer Policy Update ---------------------"""
            logger.log("Optimizing policy...")
            # This needs to take all samples_data so that it can construct graph for meta-optimization.
            time_outer_step_start = time.time()
            self.algo.optimize_policy(all_samples_data)

            """ ------------------- Logging Stuff --------------------------"""
            logger.logkv('Itr', itr)
            logger.logkv('n_timesteps', self.sampler.total_timesteps_sampled)
            # writer.add_scalar(self.algo.name, self.sample_processor.AR, self.sampler.total_timesteps_sampled)

            logger.logkv('Time-OuterStep', time.time() - time_outer_step_start)
            logger.logkv('Time-TotalInner', total_inner_time)
            logger.logkv('Time-InnerStep', np.sum(list_inner_step_time))
            logger.logkv('Time-SampleProc', np.sum(list_proc_samples_time))
            logger.logkv('Time-Sampling', np.sum(list_sampling_time))

            logger.logkv('Time', time.time() - start_time)
            logger.logkv('ItrTime', time.time() - itr_start_time)
            logger.logkv('Time-MAMLSteps', time.time() - time_maml_opt_start)

            logger.log("Saving snapshot...")
            params = self.get_itr_snapshot(itr)
            logger.save_itr_params(itr, params)
            logger.log("Saved")

            logger.dumpkvs()

    logger.log("Training finished")
    self.sess.close()
def train(self):
    policy_0 = self.policy
    for i in [4, 3, 2, 1]:  # range(1, self.eff+1):
        print("On", i, "self.policy == policy_0: ", self.policy == policy_0)
        with self.sess.as_default() as sess:
            logger.log("----------- Adaptation rollouts per meta-task = ", i, " -----------")
            undiscounted_returns = []
            for j in range(0, self.env.NUM_EVAL, self.sampler.meta_batch_size):
                logger.log("---------Testing on task", j, "~", j + self.sampler.meta_batch_size - 1, "---------")

                # initialize uninitialized vars (only initialize vars that were not loaded)
                # uninit_vars = [var for var in tf.global_variables()
                #                if not sess.run(tf.is_variable_initialized(var))]
                # sess.run(tf.variables_initializer(uninit_vars))
                uninit_vars = [var for var in tf.global_variables()]
                sess.run(tf.variables_initializer(uninit_vars))

                logger.log("Sampling set of tasks/goals for this meta-batch...")
                self.sampler.update_tasks(test=True, start_from=j)  # sample from test split!
                self.policy.switch_to_pre_update()  # Switch to pre-update policy

                for step in range(self.num_inner_grad_steps + 1):
                    if step < self.num_inner_grad_steps:
                        self.sampler.update_batch_size_v2(i)
                        logger.log("On step-0: Obtaining samples...")
                    else:
                        self.sampler.update_batch_size(2)
                        logger.log("On step-1: Obtaining samples...")
                    paths = self.sampler.obtain_samples(log=False, test=True)  # log_prefix='test-Step_%d-' % step

                    logger.log("On Test: Processing Samples...")
                    samples_data = self.sample_processor.process_samples(paths, log=False)  # log='all', log_prefix='test-Step_%d-' % step
                    self.log_diagnostics(sum(list(paths.values()), []), prefix='test-Step_%d-' % step)

                    """ ------------------- Inner Policy Update / logging returns --------------------"""
                    if step < self.num_inner_grad_steps:
                        logger.log("On Test: Computing inner policy updates...")
                        self.algo._adapt(samples_data)
                    else:
                        paths = self.sample_processor.gao_paths(paths)
                        undiscounted_returns.extend([sum(path["rewards"]) for path in paths])

            test_average_return = np.mean(undiscounted_returns)
            logger.logkv('x', i)
            logger.logkv('return', test_average_return)
            logger.dumpkvs()
            logger.log("------Testing rollouts per meta-task = ", i, "finished------")
def main(config):
    set_seed(config['seed'])

    baseline = globals()[config['baseline']]()  # instantiate baseline

    env = globals()[config['env']]()  # instantiate env
    env = normalize(env)  # apply normalize wrapper to env

    policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),
        action_dim=np.prod(env.action_space.shape),
        meta_batch_size=config['meta_batch_size'],
        hidden_sizes=config['hidden_sizes'],
    )

    sampler = MetaSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=config['rollouts_per_meta_task'],  # This batch_size is confusing
        meta_batch_size=config['meta_batch_size'],
        max_path_length=config['max_path_length'],
        parallel=config['parallel'],
    )

    sample_processor = MetaSampleProcessor(
        baseline=baseline,
        discount=config['discount'],
        gae_lambda=config['gae_lambda'],
        normalize_adv=config['normalize_adv'],
    )

    algo = ProMP(
        policy=policy,
        inner_lr=config['inner_lr'],
        meta_batch_size=config['meta_batch_size'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
        learning_rate=config['learning_rate'],
        num_ppo_steps=config['num_promp_steps'],
        clip_eps=config['clip_eps'],
        target_inner_step=config['target_inner_step'],
        init_inner_kl_penalty=config['init_inner_kl_penalty'],
        adaptive_inner_kl_penalty=config['adaptive_inner_kl_penalty'],
    )

    gpu_config = tf.ConfigProto()
    gpu_config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=gpu_config)

    saver = tf.train.Saver(
        keep_checkpoint_every_n_hours=config['keep_checkpoint_every_n_hours'],
        max_to_keep=config['max_checkpoints_to_keep'])
    save_path = os.path.join(args.dump_path, 'model.ckpt')  # note: relies on the module-level `args` namespace, not `config`

    if config['restore_path'] is not None:
        logger.log('Restoring parameters from {}'.format(config['restore_path']))
        saver.restore(sess, config['restore_path'])
        logger.log('Restored')

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        saver=saver,
        save_path=save_path,
        save_steps=config['save_steps'],
        n_itr=config['n_itr'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
        sess=sess,
    )

    trainer.train()
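For reference, a hypothetical `config` dict covering the keys that `main(config)` reads; the values (and the baseline/env class names, which are looked up via `globals()`) are illustrative placeholders, not the repo's defaults:

config = {
    'seed': 1,
    'baseline': 'LinearFeatureBaseline',
    'env': 'HalfCheetahRandDirecEnv',
    'meta_batch_size': 40,
    'hidden_sizes': (64, 64),
    'rollouts_per_meta_task': 20,
    'max_path_length': 100,
    'parallel': True,
    'discount': 0.99,
    'gae_lambda': 1.0,
    'normalize_adv': True,
    'inner_lr': 0.1,
    'num_inner_grad_steps': 1,
    'learning_rate': 1e-3,
    'num_promp_steps': 5,
    'clip_eps': 0.3,
    'target_inner_step': 0.01,
    'init_inner_kl_penalty': 1e-2,
    'adaptive_inner_kl_penalty': False,
    'keep_checkpoint_every_n_hours': 1,
    'max_checkpoints_to_keep': 5,
    'restore_path': None,
    'save_steps': 10,
    'n_itr': 1000,
}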
def train(self):
    """
    Trains policy on env using algo

    Pseudocode::

        for itr in n_itr:
            for step in num_inner_grad_steps:
                sampler.sample()
                algo.compute_updated_dists()
            algo.optimize_policy()
            sampler.update_goals()
    """
    with self.sess.as_default() as sess:

        # initialize uninitialized vars (only initialize vars that were not loaded)
        uninit_vars = [var for var in tf.global_variables()
                       if not sess.run(tf.is_variable_initialized(var))]
        sess.run(tf.variables_initializer(uninit_vars))

        start_time = time.time()
        for itr in range(self.start_itr, self.n_itr):
            itr_start_time = time.time()
            logger.log("\n ---------------- Iteration %d ----------------" % itr)
            logger.log("Sampling set of tasks/goals for this meta-batch...")

            self.sampler.update_tasks()  # sample tasks!
            self.policy.switch_to_pre_update()  # Switch to pre-update policy

            all_samples_data, all_paths = [], []
            list_sampling_time, list_inner_step_time, list_outer_step_time, list_proc_samples_time = [], [], [], []
            start_total_inner_time = time.time()
            for step in range(self.num_inner_grad_steps + 1):
                logger.log('** Step ' + str(step) + ' **')

                """ -------------------- Sampling --------------------------"""
                logger.log("Obtaining samples...")
                time_env_sampling_start = time.time()
                '''
                if step == self.num_inner_grad_steps:
                    temp = self.sampler.batch_size
                    self.sampler.update_batch_size(2)
                    paths = self.sampler.obtain_samples(log=True, log_prefix='Step_%d-' % step)
                    self.sampler.update_batch_size(temp)
                else:
                    paths = self.sampler.obtain_samples(log=True, log_prefix='Step_%d-' % step)
                '''
                paths = self.sampler.obtain_samples(log=True, log_prefix='Step_%d-' % step)
                list_sampling_time.append(time.time() - time_env_sampling_start)
                all_paths.append(paths)

                """ ----------------- Processing Samples ---------------------"""
                logger.log("Processing samples...")
                time_proc_samples_start = time.time()
                samples_data = self.sample_processor.process_samples(paths, log='all', log_prefix='Step_%d-' % step)
                all_samples_data.append(samples_data)
                list_proc_samples_time.append(time.time() - time_proc_samples_start)

                self.log_diagnostics(sum(list(paths.values()), []), prefix='Step_%d-' % step)

                """ ------------------- Inner Policy Update --------------------"""
                time_inner_step_start = time.time()
                if step < self.num_inner_grad_steps:
                    logger.log("Computing inner policy updates...")
                    self.algo._adapt(samples_data)
                # train_writer = tf.summary.FileWriter('/home/ignasi/Desktop/meta_policy_search_graph',
                #                                      sess.graph)
                list_inner_step_time.append(time.time() - time_inner_step_start)
            total_inner_time = time.time() - start_total_inner_time

            time_maml_opt_start = time.time()

            """ ------------------ Outer Policy Update ---------------------"""
            logger.log("Optimizing policy...")
            # This needs to take all samples_data so that it can construct graph for meta-optimization.
            time_outer_step_start = time.time()
            self.algo.optimize_policy(all_samples_data)

            """ ------------------ Test-split Performance for logging ---------------------"""
            logger.log("Testing on test-tasks split for logging, rollout_per_task = 20...")
            undiscounted_returns = []
            for i in range(0, self.env.NUM_EVAL, self.sampler.meta_batch_size):
                self.sampler.update_tasks(test=True, start_from=i)  # sample from test split!
                self.policy.switch_to_pre_update()  # Switch to pre-update policy

                for step in range(self.num_inner_grad_steps + 1):
                    logger.log("On Test: Obtaining samples...")
                    paths = self.sampler.obtain_samples(log=False, test=True)  # log_prefix='test-Step_%d-' % step

                    logger.log("On Test: Processing Samples...")
                    samples_data = self.sample_processor.process_samples(paths, log=False)  # log='all', log_prefix='test-Step_%d-' % step
                    self.log_diagnostics(sum(list(paths.values()), []), prefix='test20-Step_%d-' % step)

                    """ ------------------- Inner Policy Update / logging returns --------------------"""
                    if step < self.num_inner_grad_steps:
                        logger.log("On Test: Computing inner policy updates...")
                        self.algo._adapt(samples_data)
                    else:
                        paths = self.sample_processor.gao_paths(paths)
                        undiscounted_returns.extend([sum(path["rewards"]) for path in paths])
            test_average_return = np.mean(undiscounted_returns)
            logger.logkv('test20-AverageReturn', test_average_return)

            logger.log("Testing on test-tasks split for logging, rollout_per_task = 2...")
            sampler_batch_size = self.sampler.batch_size
            self.sampler.update_batch_size(2)
            undiscounted_returns = []
            for i in range(0, self.env.NUM_EVAL, self.sampler.meta_batch_size):
                self.sampler.update_tasks(test=True, start_from=i)  # sample from test split!
                self.policy.switch_to_pre_update()  # Switch to pre-update policy

                for step in range(self.num_inner_grad_steps + 1):
                    logger.log("On Test: Obtaining samples...")
                    paths = self.sampler.obtain_samples(log=False, test=True)  # log_prefix='test-Step_%d-' % step

                    logger.log("On Test: Processing Samples...")
                    samples_data = self.sample_processor.process_samples(paths, log=False)  # log='all', log_prefix='test-Step_%d-' % step
                    self.log_diagnostics(sum(list(paths.values()), []), prefix='test-Step_%d-' % step)

                    """ ------------------- Inner Policy Update / logging returns --------------------"""
                    if step < self.num_inner_grad_steps:
                        logger.log("On Test: Computing inner policy updates...")
                        self.algo._adapt(samples_data)
                    else:
                        paths = self.sample_processor.gao_paths(paths)
                        undiscounted_returns.extend([sum(path["rewards"]) for path in paths])
            test_average_return = np.mean(undiscounted_returns)
            self.sampler.update_batch_size(sampler_batch_size)

            """ ------------------- Logging Stuff --------------------------"""
            logger.logkv('Itr', itr)
            logger.logkv('n_timesteps', self.sampler.total_timesteps_sampled)
            logger.logkv('test-AverageReturn', test_average_return)

            logger.logkv('Time-OuterStep', time.time() - time_outer_step_start)
            logger.logkv('Time-TotalInner', total_inner_time)
            logger.logkv('Time-InnerStep', np.sum(list_inner_step_time))
            logger.logkv('Time-SampleProc', np.sum(list_proc_samples_time))
            logger.logkv('Time-Sampling', np.sum(list_sampling_time))

            logger.logkv('Time', time.time() - start_time)
            logger.logkv('ItrTime', time.time() - itr_start_time)
            logger.logkv('Time-MAMLSteps', time.time() - time_maml_opt_start)

            logger.log("Saving snapshot...")
            params = self.get_itr_snapshot(itr)
            logger.save_itr_params(itr, params)
            logger.log("Saved")

            logger.dumpkvs()

    logger.log("Training finished")
    self.sess.close()
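Both test loops above score the adapted policy by its undiscounted return: the plain sum of rewards per path, averaged over all evaluation paths. A tiny self-contained illustration of that reduction with dummy paths (not sampler output):

import numpy as np

# dummy paths shaped like the dicts produced after gao_paths(): each has a "rewards" array
paths = [
    {"rewards": np.array([1.0, 0.5, 0.0])},
    {"rewards": np.array([0.0, 2.0])},
]
undiscounted_returns = [sum(path["rewards"]) for path in paths]
test_average_return = np.mean(undiscounted_returns)
print(undiscounted_returns, test_average_return)   # [1.5, 2.0] 1.75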
def train(self):
    """
    Trains policy on env using algo

    Pseudocode::

        for itr in n_itr:
            for step in num_inner_grad_steps:
                sampler.sample()
                algo.compute_updated_dists()
            algo.optimize_policy()
            sampler.update_goals()
    """
    with self.sess.as_default() as sess:

        # initialize uninitialized vars (only initialize vars that were not loaded)
        uninit_vars = [var for var in tf.global_variables()
                       if not sess.run(tf.is_variable_initialized(var))]
        sess.run(tf.variables_initializer(uninit_vars))

        n_timesteps = 0
        start_time = time.time()
        for itr in range(self.start_itr, self.n_itr):
            itr_start_time = time.time()
            logger.log("\n ---------------- Iteration %d ----------------" % itr)

            gradients = []
            for i in range(self.num_sapling_rounds):
                logger.log("\n ----- Sampling Round %d ---" % i)
                dry = i < self.num_sapling_rounds - 1

                if not dry:
                    self.sampler.update_tasks()
                self.policy.switch_to_pre_update()  # Switch to pre-update policy

                all_samples_data, all_paths = [], []
                for step in range(self.num_inner_grad_steps + 1):
                    logger.log('** Step ' + str(step) + ' **')

                    logger.log("Obtaining samples...")
                    paths = self.sampler.obtain_samples(log=True, log_prefix='Step_%d-' % step)
                    all_paths.append(paths)

                    logger.log("Processing samples...")
                    samples_data = self.sample_processor.process_samples(paths, log='all', log_prefix='Step_%d-' % step)
                    all_samples_data.append(samples_data)

                    if not dry:
                        self.log_diagnostics(sum(list(paths.values()), []), prefix='Step_%d-' % step)

                    if step < self.num_inner_grad_steps:
                        logger.log("Computing inner policy updates...")
                        self.algo._adapt(samples_data)

                """ compute gradients """
                gradients.append(self.algo.compute_gradients(all_samples_data))

                if not dry:
                    """ ------------ Compute and log gradient variance ------------"""
                    # compute variance of adaptation gradients
                    for step_id in range(self.num_inner_grad_steps):
                        meta_batch_size = len(gradients[0][0])
                        grad_std, grad_rstd = [], []
                        for task_id in range(meta_batch_size):
                            stacked_grads = np.stack([gradients[round_id][step_id][task_id]
                                                      for round_id in range(self.num_sapling_rounds)], axis=1)
                            std = np.std(stacked_grads, axis=1)
                            mean = np.abs(np.mean(stacked_grads, axis=1))
                            grad_std.append(np.mean(std))
                            grad_rstd.append(np.mean(std / mean))

                        logger.logkv('Step_%i-GradientMean' % step_id, np.mean(mean))  # was missing the `% step_id`
                        logger.logkv('Step_%i-GradientStd' % step_id, np.mean(grad_std))
                        logger.logkv('Step_%i-GradientRStd' % step_id, np.mean(grad_rstd))

                    # compute variance of meta gradients
                    stacked_grads = np.stack([gradients[round_id][self.num_inner_grad_steps]
                                              for round_id in range(self.num_sapling_rounds)], axis=1)
                    std = np.std(stacked_grads, axis=1)
                    mean = np.abs(np.mean(stacked_grads, axis=1))
                    meta_grad_std = np.mean(std)
                    meta_grad_rstd = np.mean(std / (mean + 1e-8))
                    meta_grad_rvar = np.mean(std ** 2 / (mean + 1e-8))

                    logger.logkv('Meta-GradientMean', np.mean(mean))
                    logger.logkv('Meta-GradientStd', meta_grad_std)
                    logger.logkv('Meta-GradientRStd', meta_grad_rstd)
                    logger.logkv('Meta-GradientRVariance', meta_grad_rvar)

                    # compute cosine dists
                    cosine_dists = cdist(np.transpose(stacked_grads),
                                         np.transpose(np.mean(stacked_grads, axis=1).reshape((-1, 1))),
                                         metric='cosine')
                    mean_abs_cos_dist = np.mean(np.abs(cosine_dists))
                    mean_squared_cosine_dists = np.mean(cosine_dists ** 2)
                    mean_squared_cosine_dists_sqrt = np.sqrt(mean_squared_cosine_dists)

                    logger.logkv('Meta-GradientCosAbs', mean_abs_cos_dist)
                    logger.logkv('Meta-GradientCosVar', mean_squared_cosine_dists)
                    logger.logkv('Meta-GradientCosStd', mean_squared_cosine_dists_sqrt)

                    """ ------------------ Outer Policy Update ---------------------"""
                    logger.log("Optimizing policy...")
                    # This needs to take all samples_data so that it can construct graph for meta-optimization.
                    self.algo.optimize_policy(all_samples_data)

            """ ------------------- Logging Stuff --------------------------"""
            n_timesteps += (self.num_inner_grad_steps + 1) * self.sampler.total_samples
            logger.logkv('n_timesteps', n_timesteps)

            logger.log("Saving snapshot...")
            params = self.get_itr_snapshot(itr)  # , **kwargs)
            logger.save_itr_params(itr, params)
            logger.log("Saved")

            logger.logkv('Itr', itr)
            logger.logkv('Time', time.time() - start_time)
            logger.logkv('ItrTime', time.time() - itr_start_time)

            logger.dumpkvs()

    logger.log("Training finished")
    self.sess.close()
def train(self):
    """
    Trains policy on env using algo

    Pseudocode:
        for itr in n_itr:
            for step in num_inner_grad_steps:
                sampler.sample()
                algo.compute_updated_dists()
            algo.optimize_policy()
            sampler.update_goals()
    """
    with self.sess.as_default() as sess:

        # initialize uninitialized vars (only initialize vars that were not loaded)
        uninit_vars = [var for var in tf.global_variables()
                       if not sess.run(tf.is_variable_initialized(var))]
        sess.run(tf.variables_initializer(uninit_vars))

        start_time = time.time()
        for itr in range(self.start_itr, self.n_itr):
            self.task = self.env.sample_tasks(self.sampler.meta_batch_size)
            self.sampler.set_tasks(self.task)
            itr_start_time = time.time()
            logger.log("\n ---------------- Iteration %d ----------------" % itr)
            logger.log("Sampling set of tasks/goals for this meta-batch...")

            """ -------------------- Sampling --------------------------"""
            logger.log("Obtaining samples...")
            time_env_sampling_start = time.time()
            paths = self.sampler.obtain_samples(log=True, log_prefix='train-')
            sampling_time = time.time() - time_env_sampling_start

            """ ----------------- Processing Samples ---------------------"""
            logger.log("Processing samples...")
            time_proc_samples_start = time.time()
            samples_data = self.sample_processor.process_samples(paths, log='all', log_prefix='train-')
            proc_samples_time = time.time() - time_proc_samples_start

            self.log_diagnostics(sum(paths.values(), []), prefix='train-')

            """ ------------------ Policy Update ---------------------"""
            logger.log("Optimizing policy...")
            # This needs to take all samples_data so that it can construct graph for meta-optimization.
            time_optimization_step_start = time.time()
            self.algo.optimize_policy(samples_data)

            """ ------------------ Test-split Performance for logging ---------------------"""
            logger.log("Testing on test-tasks split for logging...")
            sampler_batch_size = self.sampler.batch_size
            self.sampler.update_batch_size(3)
            undiscounted_returns = []
            for i in range(0, self.env.NUM_EVAL, self.sampler.meta_batch_size):
                # Caution: here i is effectively always 0, since self.meta_batch_size = 100 (when running on Linux)
                self.sampler.update_tasks(test=True, start_from=i)  # sample from test split!
                # self.policy.switch_to_pre_update()  # Switch to pre-update policy

                logger.log("On Test: Obtaining samples...")
                paths = self.sampler.obtain_samples(log=False, test=True)  # log_prefix='test-Step_%d-' % step

                logger.log("On Test: Processing Samples...")
                self.log_diagnostics(sum(list(paths.values()), []), prefix='test-')

                """ ------------------- Logging Returns --------------------"""
                paths = self.sample_processor.gao_paths(paths)
                undiscounted_returns.extend([sum(path["rewards"]) for path in paths])
            test_average_return = np.mean(undiscounted_returns)
            self.sampler.update_batch_size(sampler_batch_size)

            """ ------------------- Logging Stuff --------------------------"""
            logger.logkv('Itr', itr)
            logger.logkv('n_timesteps', self.sampler.total_timesteps_sampled)
            logger.logkv('test-AverageReturn', test_average_return)

            logger.logkv('Time-Optimization', time.time() - time_optimization_step_start)
            logger.logkv('Time-SampleProc', np.sum(proc_samples_time))
            logger.logkv('Time-Sampling', sampling_time)

            logger.logkv('Time', time.time() - start_time)
            logger.logkv('ItrTime', time.time() - itr_start_time)

            logger.log("Saving snapshot...")
            params = self.get_itr_snapshot(itr)
            logger.save_itr_params(itr, params)
            logger.log("Saved")

            logger.dumpkvs()
            if itr == 0:
                sess.graph.finalize()

    logger.log("Training finished")
    self.sess.close()
def optimize(self, input_val_dict):
    """
    Carries out the optimization step (conjugate gradient direction + backtracking line search)

    Args:
        input_val_dict (dict): dict containing the values to be fed into the computation graph
    """
    logger.log("Start CG optimization")

    logger.log("computing loss before")
    loss_before = self.loss(input_val_dict)

    logger.log("performing update")

    logger.log("computing gradient")
    gradient = self.gradient(input_val_dict)
    logger.log("gradient computed")

    logger.log("computing descent direction")
    Hx = self._hvp_approach.build_eval(input_val_dict)
    descent_direction = conjugate_gradients(Hx, gradient, cg_iters=self._cg_iters)

    initial_step_size = np.sqrt(2.0 * self._max_constraint_val *
                                (1. / (descent_direction.dot(Hx(descent_direction)) + 1e-8)))
    if np.isnan(initial_step_size):
        logger.log("Initial step size is NaN! Rejecting the step!")
        return

    initial_descent_step = initial_step_size * descent_direction
    logger.log("descent direction computed")

    prev_params = self._target.get_param_values()
    prev_params_values = _flatten_params(prev_params)

    loss, constraint_val, n_iter, violated = 0, 0, 0, False
    for n_iter, ratio in enumerate(self._backtrack_ratio ** np.arange(self._max_backtracks)):
        cur_step = ratio * initial_descent_step
        cur_params_values = prev_params_values - cur_step
        cur_params = _unflatten_params(cur_params_values, params_example=prev_params)
        self._target.set_params(cur_params)

        loss, constraint_val = self.loss(input_val_dict), self.constraint_val(input_val_dict)
        if loss < loss_before and constraint_val <= self._max_constraint_val:
            break

    """ ------------------- Logging Stuff -------------------------- """
    if np.isnan(loss):
        violated = True
        logger.log("Line search violated because loss is NaN")
    if np.isnan(constraint_val):
        violated = True
        logger.log("Line search violated because constraint %s is NaN" % self._constraint_name)
    if loss >= loss_before:
        violated = True
        logger.log("Line search violated because loss not improving")
    if constraint_val >= self._max_constraint_val:
        violated = True
        logger.log("Line search violated because constraint %s is violated" % self._constraint_name)

    if violated and not self._accept_violation:
        logger.log("Line search condition violated. Rejecting the step!")
        self._target.set_params(prev_params)

    logger.log("backtrack iters: %d" % n_iter)
    logger.log("computing loss after")
    logger.log("optimization finished")
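`conjugate_gradients` approximately solves H x = g for the natural-gradient direction, and `initial_step_size` rescales that direction d so that the quadratic constraint approximation 0.5 * d^T H d hits `self._max_constraint_val`. A minimal numpy sketch of both pieces, with a random SPD matrix standing in for the Hessian-vector product (not the repo's implementation):

import numpy as np

def conjugate_gradients_sketch(hvp, g, cg_iters=10, residual_tol=1e-10):
    """Plain CG for H x = g, where hvp(v) returns H @ v (sketch only)."""
    x = np.zeros_like(g)
    r = g.copy()            # residual g - H @ x, with x = 0
    p = g.copy()
    r_dot = r.dot(r)
    for _ in range(cg_iters):
        Hp = hvp(p)
        alpha = r_dot / (p.dot(Hp) + 1e-12)
        x += alpha * p
        r -= alpha * Hp
        new_r_dot = r.dot(r)
        if new_r_dot < residual_tol:
            break
        p = r + (new_r_dot / r_dot) * p
        r_dot = new_r_dot
    return x

rng = np.random.default_rng(0)
A = rng.normal(size=(5, 5))
H = A @ A.T + 5 * np.eye(5)            # symmetric positive definite stand-in for the Fisher matrix
g = rng.normal(size=5)

d = conjugate_gradients_sketch(lambda v: H @ v, g)
max_constraint_val = 0.01              # e.g. the KL step size delta
step_size = np.sqrt(2.0 * max_constraint_val / (d.dot(H @ d) + 1e-8))
# with this scaling, 0.5 * (step_size * d) @ H @ (step_size * d) is approximately max_constraint_val
print(np.allclose(H @ d, g, atol=1e-6), 0.5 * (step_size * d) @ H @ (step_size * d))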
    policy=policy,
    inner_lr=params['inner_lr'],
    meta_batch_size=params['meta_batch_size'],
    num_inner_grad_steps=params['num_inner_grad_steps'],
    learning_rate=params['learning_rate'],
    num_ppo_steps=params['num_promp_steps'],
    clip_eps=params['clip_eps'],
    target_inner_step=params['target_inner_step'],
    init_inner_kl_penalty=params['init_inner_kl_penalty'],
    adaptive_inner_kl_penalty=params['adaptive_inner_kl_penalty'],
)

saver = tf.train.Saver()
if args.restore_path is not None:
    logger.log('Restoring parameters from {}'.format(args.restore_path))
    saver.restore(sess, args.restore_path)
    logger.log('Restored')

uninit_vars = [var for var in tf.global_variables()
               if not sess.run(tf.is_variable_initialized(var))]
sess.run(tf.variables_initializer(uninit_vars))

wrapped_env = env
while hasattr(wrapped_env, '_wrapped_env'):
    wrapped_env = wrapped_env._wrapped_env
frame_skip = wrapped_env.frame_skip if hasattr(wrapped_env, 'frame_skip') else 1