def _train(self, env, policy, pool):
    """Perform RL training.

    Args:
        env (`rllab.Env`): Environment used for training
        policy (`Policy`): Policy used for training
        pool (`PoolBase`): Sample pool to add samples to
    """
    self._init_training()
    self.sampler.initialize(env, policy, pool)

    evaluation_env = deep_clone(env) if self._eval_n_episodes else None

    with tf_utils.get_default_session().as_default():
        gt.rename_root('RLAlgorithm')
        gt.reset()
        gt.set_def_unique(False)

        for epoch in gt.timed_for(
                range(self._n_epochs + 1), save_itrs=True):
            logger.push_prefix('Epoch #%d | ' % epoch)

            for t in range(self._epoch_length):
                self.sampler.sample()
                if not self.sampler.batch_ready():
                    continue
                gt.stamp('sample')

                for i in range(self._n_train_repeat):
                    self._do_training(
                        iteration=t + epoch * self._epoch_length,
                        batch=self.sampler.random_batch())
                gt.stamp('train')

            self._evaluate(policy, evaluation_env)
            gt.stamp('eval')

            params = self.get_snapshot(epoch)
            logger.save_itr_params(epoch, params)

            time_itrs = gt.get_times().stamps.itrs
            time_eval = time_itrs['eval'][-1]
            time_total = gt.get_times().total
            time_train = time_itrs.get('train', [0])[-1]
            time_sample = time_itrs.get('sample', [0])[-1]

            logger.record_tabular('time-train', time_train)
            logger.record_tabular('time-eval', time_eval)
            logger.record_tabular('time-sample', time_sample)
            logger.record_tabular('time-total', time_total)
            logger.record_tabular('epoch', epoch)

            self.sampler.log_diagnostics()
            logger.dump_tabular(with_prefix=False)
            logger.pop_prefix()
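
# NOTE (added sketch): the loops in this collection all use the `gtimer`
# package (imported as `gt`) to produce the 'time-sample' / 'time-train' /
# 'time-eval' diagnostics. The helper below is a minimal, self-contained
# illustration of that pattern, restricted to the gtimer calls already used
# above (`timed_for`, `stamp`, `get_times`); the sleeps are stand-ins for
# sampling and training work and are not part of any algorithm here.
def _gtimer_timing_sketch(n_epochs=3):
    import time
    import gtimer as gt

    gt.rename_root('RLAlgorithm')
    gt.reset()
    gt.set_def_unique(False)

    for epoch in gt.timed_for(range(n_epochs), save_itrs=True):
        time.sleep(0.01)   # stand-in for environment sampling
        gt.stamp('sample')
        time.sleep(0.02)   # stand-in for gradient updates
        gt.stamp('train')

    itrs = gt.get_times().stamps.itrs
    # last-iteration durations, mirroring the 'time-*' tabular entries above
    return itrs['sample'][-1], itrs['train'][-1], gt.get_times().total
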
def train(self): ''' meta-training loop ''' start_time = time.time() print("starting to pretrain") self.pretrain() print("done pretraining after time:", time.time() - start_time) params = self.get_epoch_snapshot(-1) logger.save_itr_params(-1, params) gt.reset() gt.set_def_unique(False) self._current_path_builder = PathBuilder() # at each iteration, we first collect data from tasks, perform meta-updates, then try to evaluate for it_ in gt.timed_for( range(self.num_iterations), save_itrs=True, ): self._start_epoch(it_) self.training_mode(True) if it_ == 0 and self.num_initial_steps > 0: print('collecting initial pool of data for train and eval') # temp for evaluating for task_idx in self.train_task_indices: if self.expl_data_collector: init_expl_paths = self.expl_data_collector.collect_new_paths( max_path_length=self.max_path_length, num_steps=self.num_initial_steps, discard_incomplete_paths=False, task_idx=task_idx, ) self.replay_buffer.add_paths(task_idx, init_expl_paths) if not self.use_rl_buffer_for_enc_buffer: self.enc_replay_buffer.add_paths( task_idx, init_expl_paths) self.expl_data_collector.end_epoch(-1) else: self.collect_exploration_data(self.num_initial_steps, 1, np.inf, task_idx) self.in_unsupervised_phase = ( it_ >= self.num_iterations_with_reward_supervision) if it_ == self.num_iterations_with_reward_supervision: self._transition_to_unsupervised() update_encoder_buffer = not ( self.in_unsupervised_phase and self.freeze_encoder_buffer_in_unsupervised_phase ) and not self.use_rl_buffer_for_enc_buffer clear_encoder_buffer = ( update_encoder_buffer and self.clear_encoder_buffer_before_every_update ) and not self.use_rl_buffer_for_enc_buffer # TODO: propogate unsupervised mode elegantly # Sample data from train tasks. for i in range(self.num_tasks_sample): if len(self.exploration_task_indices) == 0: # do no data collection break task_idx = np.random.choice(self.exploration_task_indices) if clear_encoder_buffer: self.enc_replay_buffer.task_buffers[task_idx].clear() # collect some trajectories with z ~ prior if self.num_steps_prior > 0: if self.expl_data_collector: # TODO: implement new_expl_paths = self.expl_data_collector.collect_new_paths( task_idx=task_idx, max_path_length=self.max_path_length, resample_latent_period=self. exploration_resample_latent_period, update_posterior_period=np.inf, num_steps=self.num_steps_prior, use_predicted_reward=self.in_unsupervised_phase, discard_incomplete_paths=False, ) self.replay_buffer.add_paths(task_idx, new_expl_paths) self._n_env_steps_total += sum( len(p['actions']) for p in new_expl_paths) self._n_rollouts_total += len(new_expl_paths) if update_encoder_buffer: self.enc_replay_buffer.add_paths( task_idx, new_expl_paths) else: self.collect_exploration_data( num_samples=self.num_steps_prior, resample_latent_period=self. exploration_resample_latent_period, update_posterior_period=np.inf, add_to_enc_buffer=update_encoder_buffer, use_predicted_reward=self.in_unsupervised_phase, task_idx=task_idx, # TODO: figure out if I want to replace this? # it's only used when `clear_encoder_buffer_before_every_update` is True # and when `freeze_encoder_buffer_in_unsupervised_phase` is False # and when we're in unsupervised phase ) # collect some trajectories with z ~ posterior if self.num_steps_posterior > 0: if self.expl_data_collector: # TODO: implement new_expl_paths = self.expl_data_collector.collect_new_paths( task_idx=task_idx, max_path_length=self.max_path_length, resample_latent_period=self. 
exploration_resample_latent_period, update_posterior_period=self.update_post_train, num_steps=self.num_steps_posterior, use_predicted_reward=self.in_unsupervised_phase, discard_incomplete_paths=False, ) self.replay_buffer.add_paths(task_idx, new_expl_paths) self._n_env_steps_total += sum( len(p['actions']) for p in new_expl_paths) self._n_rollouts_total += len(new_expl_paths) if update_encoder_buffer and not self.use_rl_buffer_for_enc_buffer: self.enc_replay_buffer.add_paths( task_idx, new_expl_paths) else: self.collect_exploration_data( num_samples=self.num_steps_posterior, resample_latent_period=self. exploration_resample_latent_period, update_posterior_period=self.update_post_train, add_to_enc_buffer=update_encoder_buffer, use_predicted_reward=self.in_unsupervised_phase, task_idx=task_idx, ) # even if encoder is trained only on samples from the prior, the policy needs to learn to handle z ~ posterior if self.num_extra_rl_steps_posterior > 0: # TODO: implement if self.expl_data_collector: new_expl_paths = self.expl_data_collector.collect_new_paths( task_idx=task_idx, max_path_length=self.max_path_length, resample_latent_period=self. exploration_resample_latent_period, update_posterior_period=self.update_post_train, num_steps=self.num_extra_rl_steps_posterior, use_predicted_reward=self.in_unsupervised_phase, discard_incomplete_paths=False, ) self.replay_buffer.add_paths(task_idx, new_expl_paths) self._n_env_steps_total += sum( len(p['actions']) for p in new_expl_paths) self._n_rollouts_total += len(new_expl_paths) if not self.use_rl_buffer_for_enc_buffer: self.enc_replay_buffer.add_paths( task_idx, new_expl_paths) else: add_to_enc_buffer = ( self.debug_enc_buffer_matches_rl_buffer and not self.use_rl_buffer_for_enc_buffer) self.collect_exploration_data( num_samples=self.num_extra_rl_steps_posterior, resample_latent_period=self. exploration_resample_latent_period, update_posterior_period=self.update_post_train, add_to_enc_buffer=add_to_enc_buffer, use_predicted_reward=self.in_unsupervised_phase, task_idx=task_idx, ) gt.stamp('sample') # Sample train tasks and compute gradient updates on parameters. for train_step in range(self.num_train_steps_per_itr): if self.use_meta_learning_buffer: batch = self.meta_replay_buffer.sample_meta_batch( rl_batch_size=self.batch_size, meta_batch_size=self.meta_batch, embedding_batch_size=self.embedding_batch_size, ) self.trainer.train(batch) else: indices = np.random.choice(self.train_task_indices, self.meta_batch) mb_size = self.embedding_mini_batch_size num_updates = self.embedding_batch_size // mb_size # sample context batch # context_batch = self.sample_context(indices) context_batch = self.enc_replay_buffer.sample_context( indices, self.embedding_batch_size) # zero out context and hidden encoder state # self.agent.clear_z(num_tasks=len(indices)) # do this in a loop so we can truncate backprop in the recurrent encoder for i in range(num_updates): if self._debug_use_ground_truth_context: context = context_batch else: context = context_batch[:, i * mb_size:i * mb_size + mb_size, :] # batch = self.sample_batch(indices) batch = self.replay_buffer.sample_batch( indices, self.batch_size) batch['context'] = context batch['task_indices'] = indices self.trainer.train(batch) self._n_train_steps_total += 1 # stop backprop # self.agent.detach_z() # train_data = self.replay_buffer.random_batch(self.batch_size) gt.stamp('train') self.training_mode(False) # eval self._try_to_eval(it_) self._end_epoch(it_)
def train(self):
    '''meta-training loop'''
    self.pretrain()
    params = self.get_epoch_snapshot(-1)
    logger.save_itr_params(-1, params)
    gt.reset()
    gt.set_def_unique(False)
    self._current_path_builder = PathBuilder()

    # at each iteration, we first collect data from tasks, perform
    # meta-updates, then try to evaluate
    for it_ in gt.timed_for(
            range(self.num_iterations),
            save_itrs=True,
    ):
        self._start_epoch(it_)
        self.training_mode(True)
        if it_ == 0:
            print('collecting initial pool of data for train and eval')
            # temp for evaluating
            for idx in self.train_tasks:
                print('idx:', idx)
                print('num initial steps:', self.num_initial_steps)
                self.task_idx = idx
                self.env.reset_task(idx)
                self.collect_data(self.num_initial_steps, 1, np.inf)

        # Sample data from train tasks.
        for i in range(self.num_tasks_sample):
            idx = np.random.randint(len(self.train_tasks))
            self.task_idx = idx
            self.env.reset_task(idx)
            self.enc_replay_buffer.task_buffers[idx].clear()

            # collect some trajectories with z ~ prior
            if self.num_steps_prior > 0:
                self.collect_data(self.num_steps_prior, 1, np.inf)
                # NOTE: the second argument to self.collect_data is how often
                # you reset the context
            # collect some trajectories with z ~ posterior
            if self.num_steps_posterior > 0:
                self.collect_data(self.num_steps_posterior, 1,
                                  self.update_post_train)
            # even if encoder is trained only on samples from the prior, the
            # policy needs to learn to handle z ~ posterior
            if self.num_extra_rl_steps_posterior > 0:
                self.collect_data(self.num_extra_rl_steps_posterior, 1,
                                  self.update_post_train,
                                  add_to_enc_buffer=False)

        # So self.task_idx is always ONE number and is used for data
        # collection, but during the actual training steps, you can have
        # multiple indices used at once.

        # Sample train tasks and compute gradient updates on parameters.
        for train_step in range(self.num_train_steps_per_itr):  # DEFAULT: num_train_steps_per_itr=2000
            indices = np.random.choice(self.train_tasks, self.meta_batch)  # DEFAULT: self.meta_batch=16
            self._do_training(indices)
            self._n_train_steps_total += 1
        gt.stamp('train')

        self.training_mode(False)

        # eval
        self._try_to_eval(it_)
        gt.stamp('eval')

        self._end_epoch()
def _train(self, env, policy, initial_exploration_policy, pool): """When training our policy expects an augmented observation.""" self._init_training(env, policy, pool) with self._sess.as_default(): observation = env.reset() policy.reset() log_p_z_episode = [] # Store log_p_z for this episode path_length = 0 path_return = 0 last_path_return = 0 max_path_return = -np.inf n_episodes = 0 if self._learn_p_z: log_p_z_list = [ deque(maxlen=self._max_path_length) for _ in range(self._num_skills) ] gt.rename_root('RLAlgorithm') gt.reset() gt.set_def_unique(False) for epoch in gt.timed_for(range(self._n_epochs + 1), save_itrs=True): logger.push_prefix('Epoch #%d | ' % epoch) path_length_list = [] z = self._sample_z() aug_obs = utils.concat_obs_z(observation, z, self._num_skills) for t in range(self._epoch_length): iteration = t + epoch * self._epoch_length action, _ = policy.get_action(aug_obs) if self._learn_p_z: (obs, _) = utils.split_aug_obs(aug_obs, self._num_skills) feed_dict = { self._discriminator._obs_pl: obs[None], self._discriminator._action_pl: action[None] } logits = tf_utils.get_default_session().run( self._discriminator._output_t, feed_dict)[0] log_p_z = np.log(utils._softmax(logits)[z]) if self._learn_p_z: log_p_z_list[z].append(log_p_z) next_ob, reward, terminal, info = env.step(action) aug_next_ob = utils.concat_obs_z(next_ob, z, self._num_skills) path_length += 1 path_return += reward self._pool.add_sample( aug_obs, action, reward, terminal, aug_next_ob, ) if terminal or path_length >= self._max_path_length: path_length_list.append(path_length) observation = env.reset() policy.reset() log_p_z_episode = [] path_length = 0 max_path_return = max(max_path_return, path_return) last_path_return = path_return path_return = 0 n_episodes += 1 else: aug_obs = aug_next_ob gt.stamp('sample') if self._pool.size >= self.sampler._min_pool_size: for i in range(self._n_train_repeat): batch = self._pool.random_batch( self.sampler._batch_size) self._do_training(iteration, batch) gt.stamp('train') if self._learn_p_z: print('learning p(z)') for z in range(self._num_skills): if log_p_z_list[z]: print( '\t skill = %d, min=%.2f, max=%.2f, mean=%.2f, len=%d' % (z, np.min( log_p_z_list[z]), np.max(log_p_z_list[z]), np.mean( log_p_z_list[z]), len(log_p_z_list[z]))) log_p_z = [ np.mean(log_p_z) if log_p_z else np.log(1.0 / self._num_skills) for log_p_z in log_p_z_list ] print('log_p_z: %s' % log_p_z) self._p_z = utils._softmax(log_p_z) # self._evaluate(epoch) params = self.get_snapshot(epoch) logger.save_itr_params(epoch, params) times_itrs = gt.get_times().stamps.itrs eval_time = times_itrs['eval'][-1] if epoch > 1 else 0 total_time = gt.get_times().total logger.record_tabular('time-train', times_itrs['train'][-1]) logger.record_tabular('time-eval', eval_time) logger.record_tabular('time-sample', times_itrs['sample'][-1]) logger.record_tabular('time-total', total_time) logger.record_tabular('epoch', epoch) logger.record_tabular('episodes', n_episodes) logger.record_tabular('max-path-return', max_path_return) logger.record_tabular('last-path-return', last_path_return) logger.record_tabular('pool-size', self._pool.size) logger.record_tabular('path-length', np.mean(path_length_list)) logger.dump_tabular(with_prefix=False) logger.pop_prefix() gt.stamp('eval') env.terminate()
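
# NOTE (added sketch): the DIAYN-style loops above rely on two helpers that
# are not shown in this file: `utils.concat_obs_z` (augment the observation
# with the active skill) and `utils._softmax` (normalize discriminator
# logits). The definitions below are an assumed reconstruction for
# illustration only -- one-hot skill concatenation and a numerically stable
# softmax -- and may differ from the repository's actual implementation.
import numpy as np


def concat_obs_z(obs, z, num_skills):
    """Append a one-hot encoding of skill index `z` to `obs` (assumed behavior)."""
    z_one_hot = np.zeros(num_skills)
    z_one_hot[z] = 1.0
    return np.hstack([obs, z_one_hot])


def _softmax(x):
    """Numerically stable softmax over a 1-D array (assumed behavior)."""
    x = np.asarray(x, dtype=np.float64)
    e = np.exp(x - np.max(x))
    return e / e.sum()


# Example: augmenting a 3-dimensional observation with skill 2 of 5 gives an
# 8-dimensional vector: concat_obs_z(np.zeros(3), z=2, num_skills=5)
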
def _train(self):
    """Return a generator that performs RL training.

    Args:
        env (`SoftlearningEnv`): Environment used for training.
        policy (`Policy`): Policy used for training
        initial_exploration_policy ('Policy'): Policy used for exploration
            If None, then all exploration is done using policy
        pool (`PoolBase`): Sample pool to add samples to
    """
    training_environment = self._training_environment
    evaluation_environment = self._evaluation_environment
    policy = self._policy
    pool = self._pool
    model_metrics = {}

    if not self._training_started:
        self._init_training()

    self.sampler.initialize(training_environment, policy, pool)

    gt.reset_root()
    gt.rename_root('RLAlgorithm')
    gt.set_def_unique(False)

    self._training_before_hook()

    for self._epoch in gt.timed_for(range(self._epoch, self._n_epochs)):
        if self._epoch % 200 == 0:
            #### model training
            print('[ MOPO ] log_dir: {} | ratio: {}'.format(
                self._log_dir, self._real_ratio))
            print('[ MOPO ] Training model at epoch {} | freq {} | timestep {} (total: {})'
                  .format(self._epoch, self._model_train_freq,
                          self._timestep, self._total_timestep))

            max_epochs = 1 if self._model.model_loaded else None
            model_train_metrics = self._train_model(
                batch_size=256,
                max_epochs=max_epochs,
                holdout_ratio=0.2,
                max_t=self._max_model_t)
            model_metrics.update(model_train_metrics)
            self._log_model()
            gt.stamp('epoch_train_model')
            ####

        self._epoch_before_hook()
        gt.stamp('epoch_before_hook')

        self._training_progress = Progress(
            self._epoch_length * self._n_train_repeat)
        start_samples = self.sampler._total_samples

        for timestep in count():
            self._timestep = timestep

            if (timestep >= self._epoch_length and self.ready_to_train):
                break

            self._timestep_before_hook()
            gt.stamp('timestep_before_hook')

            ## model rollouts
            if timestep % self._model_train_freq == 0 and self._real_ratio < 1.0:
                self._training_progress.pause()
                self._set_rollout_length()
                self._reallocate_model_pool()
                model_rollout_metrics = self._rollout_model(
                    rollout_batch_size=self._rollout_batch_size,
                    deterministic=self._deterministic)
                model_metrics.update(model_rollout_metrics)
                gt.stamp('epoch_rollout_model')
                self._training_progress.resume()

            ## train actor and critic
            if self.ready_to_train:
                self._do_training_repeats(timestep=timestep)
            gt.stamp('train')

            self._timestep_after_hook()
            gt.stamp('timestep_after_hook')

        training_paths = self.sampler.get_last_n_paths(
            math.ceil(self._epoch_length / self.sampler._max_path_length))

        evaluation_paths = self._evaluation_paths(policy,
                                                  evaluation_environment)
        gt.stamp('evaluation_paths')

        if evaluation_paths:
            evaluation_metrics = self._evaluate_rollouts(
                evaluation_paths, evaluation_environment)
            gt.stamp('evaluation_metrics')
        else:
            evaluation_metrics = {}

        gt.stamp('epoch_after_hook')

        sampler_diagnostics = self.sampler.get_diagnostics()

        diagnostics = self.get_diagnostics(
            iteration=self._total_timestep,
            batch=self._evaluation_batch(),
            training_paths=training_paths,
            evaluation_paths=evaluation_paths)

        time_diagnostics = gt.get_times().stamps.itrs

        diagnostics.update(
            OrderedDict((
                *((f'evaluation/{key}', evaluation_metrics[key])
                  for key in sorted(evaluation_metrics.keys())),
                *((f'times/{key}', time_diagnostics[key][-1])
                  for key in sorted(time_diagnostics.keys())),
                *((f'sampler/{key}', sampler_diagnostics[key])
                  for key in sorted(sampler_diagnostics.keys())),
                *((f'model/{key}', model_metrics[key])
                  for key in sorted(model_metrics.keys())),
                ('epoch', self._epoch),
                ('timestep', self._timestep),
                ('timesteps_total', self._total_timestep),
                ('train-steps', self._num_train_steps),
            )))

        if self._eval_render_mode is not None and hasattr(
                evaluation_environment, 'render_rollouts'):
            training_environment.render_rollouts(evaluation_paths)

        ## ensure we did not collect any more data
        assert self._pool.size == self._init_pool_size

        yield diagnostics

    epi_ret = self._rollout_model_for_eval(self._training_environment.reset)
    np.savetxt("EEepi_ret__fin.csv", epi_ret, delimiter=',')

    self.sampler.terminate()
    self._training_after_hook()
    self._training_progress.close()

    yield {'done': True, **diagnostics}
def _train(self, env, policy, pool, initial_exploration_policy=None): """Return a generator that performs RL training. Args: env (`SoftlearningEnv`): Environment used for training. policy (`Policy`): Policy used for training initial_exploration_policy ('Policy'): Policy used for exploration If None, then all exploration is done using policy pool (`PoolBase`): Sample pool to add samples to """ if not self._training_started: self._init_training() self._initial_exploration_hook(env, initial_exploration_policy, pool) self.sampler.initialize(env, policy, pool) evaluation_env = env.copy() if self._eval_n_episodes else None gt.reset_root() gt.rename_root('RLAlgorithm') gt.set_def_unique(False) self._training_before_hook() for self._epoch in gt.timed_for(range(self._epoch, self._n_epochs)): self._epoch_before_hook() gt.stamp('epoch_before_hook') start_samples = self.sampler._total_samples for i in count(): samples_now = self.sampler._total_samples self._timestep = samples_now - start_samples if (samples_now >= start_samples + self._epoch_length and self.ready_to_train): break self._timestep_before_hook() gt.stamp('timestep_before_hook') self._do_sampling(timestep=self._total_timestep) gt.stamp('sample') if self.ready_to_train: self._do_training_repeats(timestep=self._total_timestep) gt.stamp('train') self._timestep_after_hook() gt.stamp('timestep_after_hook') training_paths = self.sampler.get_last_n_paths( math.ceil(self._epoch_length / self.sampler._max_path_length)) gt.stamp('training_paths') evaluation_paths = self._evaluation_paths(policy, evaluation_env) gt.stamp('evaluation_paths') training_metrics = self._evaluate_rollouts(training_paths, env) gt.stamp('training_metrics') if evaluation_paths: evaluation_metrics = self._evaluate_rollouts( evaluation_paths, evaluation_env) gt.stamp('evaluation_metrics') else: evaluation_metrics = {} self._epoch_after_hook(training_paths) gt.stamp('epoch_after_hook') sampler_diagnostics = self.sampler.get_diagnostics() diagnostics = self.get_diagnostics( iteration=self._total_timestep, batch=self._evaluation_batch(), training_paths=training_paths, evaluation_paths=evaluation_paths) time_diagnostics = gt.get_times().stamps.itrs diagnostics.update( OrderedDict(( *((f'evaluation/{key}', evaluation_metrics[key]) for key in sorted(evaluation_metrics.keys())), *((f'training/{key}', training_metrics[key]) for key in sorted(training_metrics.keys())), *((f'times/{key}', time_diagnostics[key][-1]) for key in sorted(time_diagnostics.keys())), *((f'sampler/{key}', sampler_diagnostics[key]) for key in sorted(sampler_diagnostics.keys())), ('epoch', self._epoch), ('timestep', self._timestep), ('timesteps_total', self._total_timestep), ('train-steps', self._num_train_steps), #('orthogonal-constraints',self.get_penalty()), ))) try: diagnostics.update( OrderedDict( (('orthogonal-constraints', self.get_penalty()), ))) except: pass if self._eval_render_mode is not None and hasattr( evaluation_env, 'render_rollouts'): # TODO(hartikainen): Make this consistent such that there's no # need for the hasattr check. env.render_rollouts(evaluation_paths) yield diagnostics self.sampler.terminate() self._training_after_hook() yield {'done': True, **diagnostics}
def _train(self, env, policy, pool):
    """Perform RL training.

    Args:
        env (`rllab.Env`): Environment used for training
        policy (`Policy`): Policy used for training
        pool (`PoolBase`): Sample pool to add samples to
    """
    self._init_training(env, policy, pool)

    with self._sess.as_default():
        observation = env.reset()
        policy.reset()

        path_length = 0
        path_return = 0
        last_path_return = 0
        max_path_return = -np.inf
        n_episodes = 0

        gt.rename_root('RLAlgorithm')
        gt.reset()
        gt.set_def_unique(False)

        for epoch in gt.timed_for(
                range(self._n_epochs + 1), save_itrs=True):
            logger.push_prefix('Epoch #%d | ' % epoch)

            if self.iter_callback is not None:
                self.iter_callback(locals(), globals())

            for t in range(self._epoch_length):
                iteration = t + epoch * self._epoch_length

                action, _ = policy.get_action(observation)
                next_ob, reward, terminal, info = env.step(action)
                path_length += 1
                path_return += reward

                self.pool.add_sample(
                    observation,
                    action,
                    reward,
                    terminal,
                    next_ob,
                )

                if terminal or path_length >= self._max_path_length:
                    observation = env.reset()
                    policy.reset()
                    path_length = 0
                    max_path_return = max(max_path_return, path_return)
                    last_path_return = path_return
                    path_return = 0
                    n_episodes += 1
                else:
                    observation = next_ob

                gt.stamp('sample')

                if self.pool.size >= self._min_pool_size:
                    for i in range(self._n_train_repeat):
                        batch = self.pool.random_batch(self._batch_size)
                        self._do_training(iteration, batch)
                gt.stamp('train')

            self._evaluate(epoch)

            params = self.get_snapshot(epoch)
            logger.save_itr_params(epoch, params)
            times_itrs = gt.get_times().stamps.itrs

            eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
            total_time = gt.get_times().total
            logger.record_tabular('time-train', times_itrs['train'][-1])
            logger.record_tabular('time-eval', eval_time)
            logger.record_tabular('time-sample', times_itrs['sample'][-1])
            logger.record_tabular('time-total', total_time)
            logger.record_tabular('epoch', epoch)
            logger.record_tabular('episodes', n_episodes)
            logger.record_tabular('max-path-return', max_path_return)
            logger.record_tabular('last-path-return', last_path_return)
            logger.record_tabular('pool-size', self.pool.size)
            logger.dump_tabular(with_prefix=False)
            logger.pop_prefix()

            gt.stamp('eval')

        env.terminate()
def _train(self, env, policy, pool): """Perform RL training. Args: env (`rllab.Env`): Environment used for training policy (`Policy`): Policy used for training pool (`PoolBase`): Sample pool to add samples to """ self._init_training(env, policy, pool) with self._sess.as_default(): observation = env.reset() policy.reset() path_length = 0 path_return = 0 last_path_return = 0 max_path_return = -np.inf n_episodes = 0 gt.rename_root('RLAlgorithm') gt.reset() gt.set_def_unique(False) for epoch in gt.timed_for(range(self._n_epochs + 1), save_itrs=True): logger.push_prefix('Epoch #%d | ' % epoch) for t in range(self._epoch_length): iteration = t + epoch * self._epoch_length action, _ = policy.get_action(observation) next_ob, reward, terminal, info = env.step(action) path_length += 1 path_return += reward self.pool.add_sample( observation, action, reward, terminal, next_ob, ) if terminal or path_length >= self._max_path_length: observation = env.reset() policy.reset() path_length = 0 max_path_return = max(max_path_return, path_return) last_path_return = path_return path_return = 0 n_episodes += 1 else: observation = next_ob gt.stamp('sample') if self.pool.size >= self._min_pool_size: for i in range(self._n_train_repeat): batch = self.pool.random_batch(self._batch_size) self._do_training(iteration, batch) gt.stamp('train') self._evaluate(epoch) params = self.get_snapshot(epoch) logger.save_itr_params(epoch, params) times_itrs = gt.get_times().stamps.itrs eval_time = times_itrs['eval'][-1] if epoch > 1 else 0 total_time = gt.get_times().total logger.record_tabular('time-train', times_itrs['train'][-1]) logger.record_tabular('time-eval', eval_time) logger.record_tabular('time-sample', times_itrs['sample'][-1]) logger.record_tabular('time-total', total_time) logger.record_tabular('epoch', epoch) logger.record_tabular('episodes', n_episodes) logger.record_tabular('max-path-return', max_path_return) logger.record_tabular('last-path-return', last_path_return) logger.record_tabular('pool-size', self.pool.size) logger.dump_tabular(with_prefix=False) logger.pop_prefix() gt.stamp('eval') env.terminate()
def train(self, start_epoch=0):
    self.pretrain()
    self.training_mode(False)
    gt.reset()
    gt.set_def_unique(False)
    self.start_training(start_epoch=start_epoch)
def test(self, task, seed, iteration_num, render, load_iter=None, debug=False):
    set_seed(seed)
    iteration_num = int(iteration_num)

    gt.reset()
    gt.set_def_unique(False)

    start_iter = 0
    params = self.logger.load_params(load_iter)
    start_iter = self._set_params(params)
    self.theta.train()
    extra_data = self.logger.load_extra_data()
    self._set_extra_data(extra_data)

    rollout = []
    state = task.reset()
    self.controller.set_task(task)

    for i in gt.timed_for(range(start_iter, iteration_num), save_itrs=True):
        t = 0
        done = False
        reward_sum = 0
        state = task.reset()

        while not done:
            past_traj = [r[-self.M:] for r in rollout]
            if past_traj != []:
                for _ in range(self.adaptation_update_num):
                    loss = self._compute_adaptation_loss(self.theta, past_traj)
                    zero_grad(self.theta.parameters())
                    self._meta_update(loss)

            action = self.controller.plan(self.theta, state, None, debug)
            next_state, reward, done, _ = task.step(action)
            reward_sum += reward

            if render:
                task.render()

            if action.shape == ():
                action = [action]

            rollout = _aggregate_rollout(rollout, state, action, next_state)
            state = next_state
            t += 1

            if done:
                rollout = []
                state = task.reset()

        print('Iteration:', i, 'Reward:', reward_sum, 'Traj len:', t)
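
# NOTE (added sketch): `_aggregate_rollout` is not defined in this file. From
# the way `rollout` is consumed above (`past_traj = [r[-self.M:] for r in
# rollout]`), it appears to maintain per-field histories. The helper below is
# a hypothetical reconstruction under that assumption, for illustration only;
# the real implementation may differ.
def _aggregate_rollout(rollout, state, action, next_state):
    """Append one transition to per-field histories [states, actions, next_states]."""
    if not rollout:
        rollout = [[], [], []]
    states, actions, next_states = rollout
    states.append(state)
    actions.append(action)
    next_states.append(next_state)
    return [states, actions, next_states]
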
def _train(self, env, policy, initial_exploration_policy,
           sub_level_policies_paths, pool, g):
    """Perform RL training.

    Args:
        env (`rllab.Env`): Environment used for training
        policy (`Policy`): Policy used for training
        initial_exploration_policy ('Policy'): Policy used for exploration
            If None, then all exploration is done using policy
        pool (`PoolBase`): Sample pool to add samples to
    """
    '''self._init_training(env, policy, pool)
    if initial_exploration_policy is None:
        self.sampler.initialize(env, policy, sub_level_policies, pool)
        initial_exploration_done = True
    else:
        self.sampler.initialize(env, initial_exploration_policy, sub_level_policies, pool)
        initial_exploration_done = False'''

    with self._sess.as_default():
        gt.rename_root('RLAlgorithm')
        gt.reset()
        gt.set_def_unique(False)

        # loading low-level policies
        sub_level_policies = []
        for p in range(0, len(sub_level_policies_paths)):
            path = sub_level_policies_paths[p]
            if path[:2] == 'ik':
                with tf.variable_scope(str(p), reuse=False):
                    policy_snapshot = IK_Policy(path[2])
                    sub_level_policies.append(policy_snapshot)
            else:
                with tf.variable_scope(str(p), reuse=False):
                    policy_snapshot = joblib.load(sub_level_policies_paths[p])
                    sub_level_policies.append(policy_snapshot["policy"])

        self._init_training(env, policy, pool)
        if initial_exploration_policy is None:
            self.sampler.initialize(env, policy, sub_level_policies, pool, g,
                                    self.use_demos)
            initial_exploration_done = True
        else:
            self.sampler.initialize(env, initial_exploration_policy,
                                    sub_level_policies, pool, g,
                                    self.use_demos)
            initial_exploration_done = False

        if self.use_demos:
            print("Using demonstrations: {} steps".format(
                self.sampler.pool.demo_buffer._size))

        for epoch in gt.timed_for(range(self._n_epochs + 1),
                                  save_itrs=True,
                                  quick_print=True):
            logger.push_prefix('Epoch #%d | ' % epoch)

            for t in trange(self._epoch_length):
                # TODO.codeconsolidation: Add control interval to sampler
                if not initial_exploration_done:
                    if self._epoch_length * epoch >= self._n_initial_exploration_steps:
                        self.sampler.set_policy(policy)
                        initial_exploration_done = True
                self.sampler.sample(
                    initial_exploration_done,
                    g,
                )
                if not self.sampler.batch_ready():
                    continue
                gt.stamp('sample')

                for i in range(self._n_train_repeat):
                    self._do_training(iteration=t + epoch * self._epoch_length,
                                      batch=self.sampler.random_batch())
                gt.stamp('train')

            self._evaluate(epoch, initial_exploration_done,
                           sub_level_policies, g)

            params = self.get_snapshot(epoch)
            logger.save_itr_params(epoch, params)
            times_itrs = gt.get_times().stamps.itrs

            eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
            total_time = gt.get_times().total
            logger.record_tabular('time-train', times_itrs['train'][-1])
            logger.record_tabular('time-eval', eval_time)
            logger.record_tabular('time-sample', times_itrs['sample'][-1])
            logger.record_tabular('time-total', total_time)
            logger.record_tabular('epoch', epoch)

            self.sampler.log_diagnostics()
            logger.dump_tabular(with_prefix=False)
            logger.pop_prefix()

            gt.stamp('eval')

        self.sampler.terminate()
def _train(self, env, policy, pool): """When training our policy expects an augmented observation.""" self._init_training(env, policy, pool) with self._sess.as_default(): env._wrapped_env.env.initialize(seed_task=SEED_TASK) observation = env.reset() policy.reset() log_p_z_episode = [] # Store log_p_z for this episode path_length = 0 path_return = 0 last_path_return = 0 max_path_return = -np.inf n_episodes = 0 self.prev_n_episodes = 0 if self._learn_p_z: log_p_z_list = [ deque(maxlen=self._max_path_length) for _ in range(self._num_skills) ] gt.rename_root('RLAlgorithm') gt.reset() gt.set_def_unique(False) for epoch in gt.timed_for(range(self._n_epochs + 1), save_itrs=True): path_length_list = [] z = self._sample_z() aug_obs = utils.concat_obs_z(observation, z, self._num_skills) for t in range(self._epoch_length): iteration = t + epoch * self._epoch_length action, _ = policy.get_action(aug_obs) if self._learn_p_z: (obs, _) = utils.split_aug_obs(aug_obs, self._num_skills) feed_dict = { self._discriminator._obs_pl: obs[None], self._discriminator._action_pl: action[None] } logits = tf_utils.get_default_session().run( self._discriminator._output_t, feed_dict)[0] log_p_z = np.log(utils._softmax(logits)[z]) if self._learn_p_z: log_p_z_list[z].append(log_p_z) next_ob, reward, terminal, info = env.step(action) aug_next_ob = utils.concat_obs_z(next_ob, z, self._num_skills) path_length += 1 path_return += reward self._pool.add_sample( aug_obs, action, reward, terminal, aug_next_ob, ) if terminal or path_length >= self._max_path_length: path_length_list.append(path_length) # print("\n===RESET", epoch, n_episodes, "===", self._epoch_length, path_length, "===", # # env._wrapped_env.env.nstep_internal, # datetime.datetime.now()) env._wrapped_env.env.initialize(seed_task=SEED_TASK) observation = env.reset() policy.reset() log_p_z_episode = [] path_length = 0 max_path_return = max(max_path_return, path_return) last_path_return = path_return path_return = 0 n_episodes += 1 # EPOCH IS DONE epoch if not epoch % 10: logger.log("Epoch: {:4} | Episodes: {}".format( epoch, n_episodes), with_prefix=False) if not n_episodes % self.eval_freq or \ n_episodes >= EPISODE_LIMIT or \ epoch >= self._n_epochs: # is_final = epoch >= self._n_epochs \ # or n_episodes >= EPISODE_LIMIT self.sample_skills_to_bd(n_epoch=epoch, n_episodes=n_episodes) # Make snapshot params = self.get_snapshot(epoch) logger.save_itr_params(epoch, params) gt.stamp('behaviours') else: aug_obs = aug_next_ob gt.stamp('sample') if self._pool.size >= self._min_pool_size: for i in range(self._n_train_repeat): batch = self._pool.random_batch(self._batch_size) self._do_training(iteration, batch) gt.stamp('train') # Terminate after 1000000 episodes if n_episodes >= EPISODE_LIMIT: break else: continue break if self._learn_p_z: print('learning p(z)') for z in range(self._num_skills): if log_p_z_list[z]: print( '\t skill = %d, min=%.2f, max=%.2f, mean=%.2f, len=%d' % (z, np.min( log_p_z_list[z]), np.max(log_p_z_list[z]), np.mean( log_p_z_list[z]), len(log_p_z_list[z]))) log_p_z = [ np.mean(log_p_z) if log_p_z else np.log(1.0 / self._num_skills) for log_p_z in log_p_z_list ] print('log_p_z: %s' % log_p_z) self._p_z = utils._softmax(log_p_z) logger.push_prefix('Epoch #%d | ' % epoch) self._evaluate(epoch) params = self.get_snapshot(epoch) logger.save_itr_params(epoch, params) times_itrs = gt.get_times().stamps.itrs eval_time = times_itrs['eval'][-1] if epoch > 1 else 0 total_time = gt.get_times().total logger.record_tabular('time-train', times_itrs['train'][-1]) 
logger.record_tabular('time-eval', eval_time) logger.record_tabular('time-sample', times_itrs['sample'][-1]) logger.record_tabular('time-total', total_time) logger.record_tabular('epoch', epoch) logger.record_tabular('episodes', n_episodes) logger.record_tabular('max-path-return', max_path_return) logger.record_tabular('last-path-return', last_path_return) logger.record_tabular('pool-size', self._pool.size) logger.record_tabular('path-length', np.mean(path_length_list)) logger.dump_tabular(with_prefix=False) logger.pop_prefix() gt.stamp('eval') env.terminate()
def train(self):
    '''meta-training loop'''
    self.pretrain()
    params = self.get_epoch_snapshot(-1)
    logger.save_itr_params(-1, params)
    gt.reset()
    gt.set_def_unique(False)
    self._current_path_builder = PathBuilder()

    # at each iteration, we first collect data from tasks, perform
    # meta-updates, then try to evaluate
    for it_ in gt.timed_for(
            range(self.num_iterations),
            save_itrs=True,
    ):
        self._start_epoch(it_)
        self.training_mode(True)
        if it_ == 0:
            print('collecting initial pool of data for train and eval')
            # temp for evaluating
            # algorithm lines 3-9
            for idx in self.train_tasks:  # for each training task
                self.task_idx = idx  # switch the current task id
                self.env.reset_task(idx)  # reset the task
                self.collect_data(self.num_initial_steps, 1, np.inf)
                # collect num_initial_steps transitions before training;
                # z is updated after each one

        # Sample data from train tasks.
        for i in range(self.num_tasks_sample):  # randomly pick num_tasks_sample tasks (lines 3-10)
            idx = np.random.randint(len(self.train_tasks))
            self.task_idx = idx
            self.env.reset_task(idx)
            self.enc_replay_buffer.task_buffers[idx].clear()

            # collect some trajectories with z ~ prior
            if self.num_steps_prior > 0:  # how much context c to collect for inferring z from the prior
                self.collect_data(self.num_steps_prior, 1, np.inf)
            # collect some trajectories with z ~ posterior
            if self.num_steps_posterior > 0:  # how much context c to collect for inferring z from the posterior
                self.collect_data(self.num_steps_posterior, 1,
                                  self.update_post_train)
            # even if encoder is trained only on samples from the prior, the
            # policy needs to learn to handle z ~ posterior
            if self.num_extra_rl_steps_posterior > 0:  # extra data used only for RL training, not for the encoder
                self.collect_data(self.num_extra_rl_steps_posterior, 1,
                                  self.update_post_train,
                                  add_to_enc_buffer=False)

        # Sample train tasks and compute gradient updates on parameters.
        for train_step in range(self.num_train_steps_per_itr):  # gradient steps per iteration (line 11)
            indices = np.random.choice(self.train_tasks, self.meta_batch)
            # randomly pick meta_batch tasks from train_tasks (line 13)
            self._do_training(indices)  # lines 14-21
            self._n_train_steps_total += 1
        gt.stamp('train')

        self.training_mode(False)

        # eval
        self._try_to_eval(it_)
        gt.stamp('eval')

        self._end_epoch()
def _train(self): """Return a generator that performs RL training. Args: env (`SoftlearningEnv`): Environment used for training. policy (`Policy`): Policy used for training initial_exploration_policy ('Policy'): Policy used for exploration If None, then all exploration is done using policy pool (`PoolBase`): Sample pool to add samples to """ #### pool is e.g. simple_replay_pool training_environment = self._training_environment evaluation_environment = self._evaluation_environment policy = self._policy pool = self._pool model_metrics = {} #### init Qs for SAC if not self._training_started: self._init_training() #### perform some initial steps (gather samples) using initial policy ###### fills pool with _n_initial_exploration_steps samples self._initial_exploration_hook(training_environment, self._initial_exploration_policy, pool) #### set up sampler with train env and actual policy (may be different from initial exploration policy) ######## note: sampler is set up with the pool that may be already filled from initial exploration hook self.sampler.initialize(training_environment, policy, pool) #### reset gtimer (for coverage of project development) gt.reset_root() gt.rename_root('RLAlgorithm') gt.set_def_unique(False) #### not implemented, could train policy before hook self._training_before_hook() #### iterate over epochs, gt.timed_for to create loop with gt timestamps for self._epoch in gt.timed_for(range(self._epoch, self._n_epochs)): #### do something at beginning of epoch (in this case reset self._train_steps_this_epoch=0) self._epoch_before_hook() gt.stamp('epoch_before_hook') #### util class Progress, e.g. for plotting a progress bar ####### note: sampler may already contain samples in its pool from initial_exploration_hook or previous epochs self._training_progress = Progress(self._epoch_length * self._n_train_repeat / self._train_every_n_steps) start_samples = self.sampler._total_samples ### train for epoch_length ### for i in count(): #### _timestep is within an epoch samples_now = self.sampler._total_samples self._timestep = samples_now - start_samples #### check if you're at the end of an epoch to train if (samples_now >= start_samples + self._epoch_length and self.ready_to_train): break #### not implemented atm self._timestep_before_hook() gt.stamp('timestep_before_hook') #### start model rollout if self._timestep % self._model_train_freq == 0 and self._real_ratio < 1.0: self._training_progress.pause() print('[ MBPO ] log_dir: {} | ratio: {}'.format( self._log_dir, self._real_ratio)) print( '[ MBPO ] Training model at epoch {} | freq {} | timestep {} (total: {}) | epoch train steps: {} (total: {})' .format(self._epoch, self._model_train_freq, self._timestep, self._total_timestep, self._train_steps_this_epoch, self._num_train_steps)) #### train the model with input:(obs, act), outputs: (rew, delta_obs), inputs are divided into sets with holdout_ratio #@anyboby debug samples = self._pool.return_all_samples() self.fake_env.reset_model() model_train_metrics = self.fake_env.train( samples, batch_size=512, max_epochs=None, holdout_ratio=0.2, max_t=self._max_model_t) model_metrics.update(model_train_metrics) gt.stamp('epoch_train_model') #### rollout model env #### self._set_rollout_length() self._reallocate_model_pool( use_mjc_model_pool=self.use_mjc_state_model) model_rollout_metrics = self._rollout_model( rollout_batch_size=self._rollout_batch_size, deterministic=self._deterministic) model_metrics.update(model_rollout_metrics) ########################### 
gt.stamp('epoch_rollout_model') # self._visualize_model(self._evaluation_environment, self._total_timestep) self._training_progress.resume() ##### śampling from the real world ! ##### ##### _total_timestep % train_every_n_steps is checked inside _do_sampling self._do_sampling(timestep=self._total_timestep) gt.stamp('sample') ### n_train_repeat from config ### if self.ready_to_train: self._do_training_repeats(timestep=self._total_timestep) gt.stamp('train') self._timestep_after_hook() gt.stamp('timestep_after_hook') training_paths = self.sampler.get_last_n_paths( math.ceil(self._epoch_length / self.sampler._max_path_length)) gt.stamp('training_paths') evaluation_paths = self._evaluation_paths(policy, evaluation_environment) gt.stamp('evaluation_paths') training_metrics = self._evaluate_rollouts(training_paths, training_environment) gt.stamp('training_metrics') if evaluation_paths: evaluation_metrics = self._evaluate_rollouts( evaluation_paths, evaluation_environment) gt.stamp('evaluation_metrics') else: evaluation_metrics = {} self._epoch_after_hook(training_paths) gt.stamp('epoch_after_hook') sampler_diagnostics = self.sampler.get_diagnostics() diagnostics = self.get_diagnostics( iteration=self._total_timestep, batch=self._evaluation_batch(), training_paths=training_paths, evaluation_paths=evaluation_paths) time_diagnostics = gt.get_times().stamps.itrs diagnostics.update( OrderedDict(( *((f'evaluation/{key}', evaluation_metrics[key]) for key in sorted(evaluation_metrics.keys())), *((f'training/{key}', training_metrics[key]) for key in sorted(training_metrics.keys())), *((f'times/{key}', time_diagnostics[key][-1]) for key in sorted(time_diagnostics.keys())), *((f'sampler/{key}', sampler_diagnostics[key]) for key in sorted(sampler_diagnostics.keys())), *((f'model/{key}', model_metrics[key]) for key in sorted(model_metrics.keys())), ('epoch', self._epoch), ('timestep', self._timestep), ('timesteps_total', self._total_timestep), ('train-steps', self._num_train_steps), ))) if self._eval_render_mode is not None and hasattr( evaluation_environment, 'render_rollouts'): training_environment.render_rollouts(evaluation_paths) yield diagnostics self.sampler.terminate() self._training_after_hook() self._training_progress.close() ### this is where we yield the episode diagnostics to tune trial runner ### yield {'done': True, **diagnostics}
def _train(self, env, policy, pool, initial_exploration_policy=None): """Return a generator that performs RL training. Args: env (`SoftlearningEnv`): Environment used for training. policy (`Policy`): Policy used for training initial_exploration_policy ('Policy'): Policy used for exploration If None, then all exploration is done using policy pool (`PoolBase`): Sample pool to add samples to """ if not self._training_started: self._init_training() self._initial_exploration_hook(env, initial_exploration_policy, pool) self.sampler.initialize(env, policy, pool) evaluation_env = env.copy() if self._eval_n_episodes else None gt.reset_root() gt.rename_root('RLAlgorithm') gt.set_def_unique(False) self._training_before_hook() for self._epoch in gt.timed_for(range(self._epoch, self._n_epochs)): self._epoch_before_hook() gt.stamp('epoch_before_hook') start_samples = self.sampler._total_samples for i in count(): samples_now = self.sampler._total_samples self._timestep = samples_now - start_samples if (samples_now >= start_samples + self._epoch_length and self.ready_to_train): break self._timestep_before_hook() gt.stamp('timestep_before_hook') self._do_sampling(timestep=self._total_timestep) gt.stamp('sample') if self.ready_to_train: self._do_training_repeats(timestep=self._total_timestep) gt.stamp('train') self._timestep_after_hook() gt.stamp('timestep_after_hook') training_paths = self.sampler.get_last_n_paths( math.ceil(self._epoch_length / self.sampler._max_path_length)) gt.stamp('training_paths') evaluation_paths = self._evaluation_paths(policy, evaluation_env) gt.stamp('evaluation_paths') training_metrics = self._evaluate_rollouts(training_paths, env) gt.stamp('training_metrics') should_save_path = (self._path_save_frequency > 0 and self._epoch % self._path_save_frequency == 0) if should_save_path: import pickle for i, path in enumerate(training_paths): #path.pop('images') path_file_name = f'training_path_{self._epoch}_{i}.pkl' path_file_path = os.path.join(os.getcwd(), 'paths', path_file_name) if not os.path.exists(os.path.dirname(path_file_path)): os.makedirs(os.path.dirname(path_file_path)) with open(path_file_path, 'wb') as f: pickle.dump(path, f) if evaluation_paths: evaluation_metrics = self._evaluate_rollouts( evaluation_paths, evaluation_env) gt.stamp('evaluation_metrics') else: evaluation_metrics = {} self._epoch_after_hook(training_paths) gt.stamp('epoch_after_hook') sampler_diagnostics = self.sampler.get_diagnostics() diagnostics = self.get_diagnostics( iteration=self._total_timestep, batch=self._evaluation_batch(), training_paths=training_paths, evaluation_paths=evaluation_paths) time_diagnostics = gt.get_times().stamps.itrs diagnostics.update( OrderedDict(( *((f'evaluation/{key}', evaluation_metrics[key]) for key in sorted(evaluation_metrics.keys())), *((f'training/{key}', training_metrics[key]) for key in sorted(training_metrics.keys())), *((f'times/{key}', time_diagnostics[key][-1]) for key in sorted(time_diagnostics.keys())), *((f'sampler/{key}', sampler_diagnostics[key]) for key in sorted(sampler_diagnostics.keys())), ('epoch', self._epoch), ('timestep', self._timestep), ('timesteps_total', self._total_timestep), ('train-steps', self._num_train_steps), ))) if self._eval_render_mode is not None and hasattr( evaluation_env, 'render_rollouts'): # TODO(hartikainen): Make this consistent such that there's no # need for the hasattr check. env.render_rollouts(evaluation_paths) yield diagnostics self.sampler.terminate() self._training_after_hook()
def _train(self): """Return a generator that performs RL training. Args: env (`SoftlearningEnv`): Environment used for training. policy (`Policy`): Policy used for training initial_exploration_policy ('Policy'): Policy used for exploration If None, then all exploration is done using policy pool (`PoolBase`): Sample pool to add samples to """ training_environment = self._training_environment evaluation_environment = self._evaluation_environment policy = self._policy pool = self._pool if not self._training_started: self._init_training() self._initial_exploration_hook(training_environment, self._initial_exploration_policy, pool) self.sampler.initialize(training_environment, policy, pool) gt.reset_root() gt.rename_root('RLAlgorithm') gt.set_def_unique(False) self._training_before_hook() import time for self._epoch in gt.timed_for(range(self._epoch, self._n_epochs)): self._epoch_before_hook() gt.stamp('epoch_before_hook') start_samples = self.sampler._total_samples sample_times = [] for i in count(): samples_now = self.sampler._total_samples self._timestep = samples_now - start_samples if (samples_now >= start_samples + self._epoch_length and self.ready_to_train): break self._timestep_before_hook() gt.stamp('timestep_before_hook') t0 = time.time() self._do_sampling(timestep=self._total_timestep) gt.stamp('sample') sample_times.append(time.time() - t0) if self.ready_to_train: self._do_training_repeats(timestep=self._total_timestep) gt.stamp('train') self._timestep_after_hook() gt.stamp('timestep_after_hook') print("Average Sample Time: ", np.mean(np.array(sample_times))) training_paths = self._training_paths() # self.sampler.get_last_n_paths( # math.ceil(self._epoch_length / self.sampler._max_path_length)) gt.stamp('training_paths') evaluation_paths = self._evaluation_paths(policy, evaluation_environment) gt.stamp('evaluation_paths') training_metrics = self._evaluate_rollouts(training_paths, training_environment) gt.stamp('training_metrics') #should_save_path = ( # self._path_save_frequency > 0 # and self._epoch % self._path_save_frequency == 0) #if should_save_path: # import pickle # for i, path in enumerate(training_paths): # #path.pop('images') # path_file_name = f'training_path_{self._epoch}_{i}.pkl' # path_file_path = os.path.join( # os.getcwd(), 'paths', path_file_name) # if not os.path.exists(os.path.dirname(path_file_path)): # os.makedirs(os.path.dirname(path_file_path)) # with open(path_file_path, 'wb' ) as f: # pickle.dump(path, f) if evaluation_paths: evaluation_metrics = self._evaluate_rollouts( evaluation_paths, evaluation_environment) gt.stamp('evaluation_metrics') else: evaluation_metrics = {} self._epoch_after_hook(training_paths) gt.stamp('epoch_after_hook') sampler_diagnostics = self.sampler.get_diagnostics() diagnostics = self.get_diagnostics( iteration=self._total_timestep, batch=self._evaluation_batch(), training_paths=training_paths, evaluation_paths=evaluation_paths) time_diagnostics = gt.get_times().stamps.itrs diagnostics.update( OrderedDict(( *((f'evaluation/{key}', evaluation_metrics[key]) for key in sorted(evaluation_metrics.keys())), *((f'training/{key}', training_metrics[key]) for key in sorted(training_metrics.keys())), *((f'times/{key}', time_diagnostics[key][-1]) for key in sorted(time_diagnostics.keys())), *((f'sampler/{key}', sampler_diagnostics[key]) for key in sorted(sampler_diagnostics.keys())), ('epoch', self._epoch), ('timestep', self._timestep), ('timesteps_total', self._total_timestep), ('train-steps', self._num_train_steps), ))) if 
self._eval_render_kwargs and hasattr(evaluation_environment, 'render_rollouts'): # TODO: Make this consistent such that there's no # need for the hasattr check. training_environment.render_rollouts(evaluation_paths) yield diagnostics self.sampler.terminate() self._training_after_hook() del evaluation_paths yield {'done': True, **diagnostics}
def _train(self, env, policy, pool): """When training our policy expects an augmented observation.""" self._init_training(env, policy, pool) with self._sess.as_default(): # reset with goal goal = env.sample_goal() observation = env.reset(goal=goal) policy.reset() # sample z ~ p(z|g) z = self._embedding.get_z(goal=goal) path_length = 0 path_return = 0 last_path_return = 0 max_path_return = -np.inf n_episodes = 0 trajectory = [] z_indx = 0 gt.rename_root('RLAlgorithm') gt.reset() gt.set_def_unique(False) for epoch in gt.timed_for(range(self._n_epochs + 1), save_itrs=True): logger.push_prefix('Epoch #%d | ' % epoch) path_length_list = [] for t in range(self._epoch_length): iteration = t + epoch * self._epoch_length # flatten observation with given latent z aug_obs = np.concatenate((observation['observation'], z)) action, _ = policy.get_action(aug_obs) next_ob, reward, terminal, info = env.step(action) # assert all(next_ob['desired_goal'] == goal) assert reward == env.compute_reward( next_ob['achieved_goal'], next_ob['desired_goal'], info) path_length += 1 path_return += reward trajectory.append( (observation, action, reward, next_ob, terminal)) if terminal or path_length >= self._max_path_length: path_length_list.append(path_length) # add hindsight samples self._pool.add_hindsight_episode( episode=trajectory, embedding=self._embedding, latent=z, goal=goal, ) z_indx += 1 if z_indx >= self._n_latents: goal = env.sample_goal() z_indx = 0 z = self._embedding.get_z(goal=goal) observation = env.reset(goal=goal) policy.reset() path_length = 0 max_path_return = max(max_path_return, path_return) last_path_return = path_return path_return = 0 n_episodes += 1 trajectory = [] else: observation = next_ob gt.stamp('sample') if self._pool.size >= self._min_pool_size: for i in range(self._n_train_repeat): batch = self._pool.random_batch(self._batch_size) self._do_training(iteration, batch) gt.stamp('train') self._evaluate(epoch) params = self.get_snapshot(epoch) logger.save_itr_params(epoch, params) times_itrs = gt.get_times().stamps.itrs eval_time = times_itrs['eval'][-1] if epoch > 1 else 0 total_time = gt.get_times().total logger.record_tabular('time-train', times_itrs['train'][-1]) logger.record_tabular('time-eval', eval_time) logger.record_tabular('time-sample', times_itrs['sample'][-1]) logger.record_tabular('time-total', total_time) logger.record_tabular('epoch', epoch) logger.record_tabular('episodes', n_episodes) logger.record_tabular('steps', iteration) # also record total steps logger.record_tabular('max-path-return', max_path_return) logger.record_tabular('last-path-return', last_path_return) logger.record_tabular('pool-size', self._pool.size) logger.record_tabular('path-length', np.mean(path_length_list)) logger.dump_tabular(with_prefix=False) logger.pop_prefix() gt.stamp('eval') env.terminate()
def main(arglist): game_name = arglist.game_name # 'abs', 'one' reward_type = arglist.reward_type p = arglist.p agent_num = arglist.n u_range = 1. k = 0 print(arglist.aux, 'arglist.aux') model_names_setting = arglist.model_names_setting.split('_') model_names = [model_names_setting[0]] + [model_names_setting[1]] * (agent_num - 1) model_name = '_'.join(model_names) path_prefix = game_name if game_name == 'pbeauty': env = PBeautyGame(agent_num=agent_num, reward_type=reward_type, p=p) path_prefix = game_name + '-' + reward_type + '-' + str(p) elif 'matrix' in game_name: matrix_game_name = game_name.split('-')[-1] repeated = arglist.repeat max_step = arglist.max_path_length memory = arglist.memory env = MatrixGame(game=matrix_game_name, agent_num=agent_num, action_num=2, repeated=repeated, max_step=max_step, memory=memory, discrete_action=False, tuple_obs=False) path_prefix = '{}-{}-{}-{}'.format(game_name, repeated, max_step, memory) elif 'particle' in game_name: particle_game_name = game_name.split('-')[-1] env, agent_num, model_name, model_names = get_particle_game(particle_game_name, arglist) now = datetime.datetime.now() timestamp = now.strftime('%Y-%m-%d %H:%M:%S.%f %Z') if 'CG' in model_name: model_name = model_name + '-{}'.format(arglist.mu) if not arglist.aux: model_name = model_name + '-{}'.format(arglist.aux) suffix = '{}/{}/{}/{}'.format(path_prefix, agent_num, model_name, timestamp) print(suffix) logger.add_tabular_output('./log/{}.csv'.format(suffix)) snapshot_dir = './snapshot/{}'.format(suffix) policy_dir = './policy/{}'.format(suffix) os.makedirs(snapshot_dir, exist_ok=True) os.makedirs(policy_dir, exist_ok=True) logger.set_snapshot_dir(snapshot_dir) agents = [] M = arglist.hidden_size batch_size = arglist.batch_size sampler = MASampler(agent_num=agent_num, joint=True, max_path_length=30, min_pool_size=100, batch_size=batch_size) base_kwargs = { 'sampler': sampler, 'epoch_length': 1, 'n_epochs': arglist.max_steps, 'n_train_repeat': 1, 'eval_render': True, 'eval_n_episodes': 10 } with U.single_threaded_session(): for i, model_name in enumerate(model_names): if 'PR2AC' in model_name: k = int(model_name[-1]) g = False mu = arglist.mu if 'G' in model_name: g = True agent = pr2ac_agent(model_name, i, env, M, u_range, base_kwargs, k=k, g=g, mu=mu, game_name=game_name, aux=arglist.aux) elif model_name == 'MASQL': agent = masql_agent(model_name, i, env, M, u_range, base_kwargs, game_name=game_name) else: if model_name == 'DDPG': joint = False opponent_modelling = False elif model_name == 'MADDPG': joint = True opponent_modelling = False elif model_name == 'DDPG-OM' or model_name == 'DDPG-ToM': joint = True opponent_modelling = True agent = ddpg_agent(joint, opponent_modelling, model_names, i, env, M, u_range, base_kwargs, game_name=game_name) agents.append(agent) sampler.initialize(env, agents) for agent in agents: agent._init_training() gt.rename_root('MARLAlgorithm') gt.reset() gt.set_def_unique(False) initial_exploration_done = False # noise = .1 noise = 1. 
        alpha = .5
        for agent in agents:
            try:
                agent.policy.set_noise_level(noise)
            except:
                pass

        for epoch in gt.timed_for(range(base_kwargs['n_epochs'] + 1)):
            logger.push_prefix('Epoch #%d | ' % epoch)
            if epoch % 1 == 0:
                print(suffix)

            for t in range(base_kwargs['epoch_length']):
                # TODO.code consolidation: Add control interval to sampler
                if not initial_exploration_done:
                    if epoch >= 1000:
                        initial_exploration_done = True
                sampler.sample()
                # print('Sampling')
                if not initial_exploration_done:
                    continue
                gt.stamp('sample')
                # print('Sample Done')

                if epoch == base_kwargs['n_epochs']:
                    noise = 0.1
                    for agent in agents:
                        try:
                            agent.policy.set_noise_level(noise)
                        except:
                            pass
                    # alpha = .1
                if epoch > base_kwargs['n_epochs'] / 10:
                    noise = 0.1
                    for agent in agents:
                        try:
                            agent.policy.set_noise_level(noise)
                        except:
                            pass
                    # alpha = .1
                if epoch > base_kwargs['n_epochs'] / 5:
                    noise = 0.05
                    for agent in agents:
                        try:
                            agent.policy.set_noise_level(noise)
                        except:
                            pass
                if epoch > base_kwargs['n_epochs'] / 6:
                    noise = 0.01
                    for agent in agents:
                        try:
                            agent.policy.set_noise_level(noise)
                        except:
                            pass

                for j in range(base_kwargs['n_train_repeat']):
                    batch_n = []
                    recent_batch_n = []
                    indices = None
                    recent_indices = None
                    for i, agent in enumerate(agents):
                        if i == 0:
                            batch = agent.pool.random_batch(batch_size)
                            indices = agent.pool.indices
                            recent_indices = list(range(agent.pool._top - batch_size, agent.pool._top))
                        batch_n.append(agent.pool.random_batch_by_indices(indices))
                        recent_batch_n.append(agent.pool.random_batch_by_indices(recent_indices))
                    # print(len(batch_n))

                    target_next_actions_n = []
                    try:
                        for agent, batch in zip(agents, batch_n):
                            target_next_actions_n.append(
                                agent._target_policy.get_actions(batch['next_observations']))
                    except:
                        pass

                    opponent_actions_n = np.array([batch['actions'] for batch in batch_n])
                    recent_opponent_actions_n = np.array([batch['actions'] for batch in recent_batch_n])

                    ####### figure out
                    recent_opponent_observations_n = []
                    for batch in recent_batch_n:
                        recent_opponent_observations_n.append(batch['observations'])

                    current_actions = [
                        agents[i]._policy.get_actions(batch_n[i]['next_observations'])[0][0]
                        for i in range(agent_num)]
                    all_actions_k = []
                    for i, agent in enumerate(agents):
                        if isinstance(agent, MAVBAC):
                            if agent._k > 0:
                                batch_actions_k = agent._policy.get_all_actions(batch_n[i]['next_observations'])
                                actions_k = [a[0][0] for a in batch_actions_k]
                                all_actions_k.append(';'.join(list(map(str, actions_k))))
                    if len(all_actions_k) > 0:
                        with open('{}/all_actions.csv'.format(policy_dir), 'a') as f:
                            f.write(','.join(list(map(str, all_actions_k))) + '\n')
                    with open('{}/policy.csv'.format(policy_dir), 'a') as f:
                        f.write(','.join(list(map(str, current_actions))) + '\n')
                    # print('============')

                    for i, agent in enumerate(agents):
                        try:
                            batch_n[i]['next_actions'] = deepcopy(target_next_actions_n[i])
                        except:
                            pass
                        batch_n[i]['opponent_actions'] = np.reshape(
                            np.delete(deepcopy(opponent_actions_n), i, 0),
                            (-1, agent._opponent_action_dim))
                        if agent.joint:
                            if agent.opponent_modelling:
                                batch_n[i]['recent_opponent_observations'] = recent_opponent_observations_n[i]
                                batch_n[i]['recent_opponent_actions'] = np.reshape(
                                    np.delete(deepcopy(recent_opponent_actions_n), i, 0),
                                    (-1, agent._opponent_action_dim))
                                batch_n[i]['opponent_next_actions'] = agent.opponent_policy.get_actions(
                                    batch_n[i]['next_observations'])
                            else:
                                batch_n[i]['opponent_next_actions'] = np.reshape(
                                    np.delete(deepcopy(target_next_actions_n), i, 0),
                                    (-1, agent._opponent_action_dim))

                        if isinstance(agent, MAVBAC) or isinstance(agent, MASQL):
                            agent._do_training(iteration=t + epoch * agent._epoch_length,
                                               batch=batch_n[i], annealing=alpha)
                        else:
                            agent._do_training(iteration=t + epoch * agent._epoch_length,
                                               batch=batch_n[i])
                gt.stamp('train')

            # self._evaluate(epoch)
            # for agent in agents:
            #     params = agent.get_snapshot(epoch)
            #     logger.save_itr_params(epoch, params)
            # times_itrs = gt.get_times().stamps.itrs
            # eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
            # total_time = gt.get_times().total
            # logger.record_tabular('time-train', times_itrs['train'][-1])
            # logger.record_tabular('time-eval', eval_time)
            # logger.record_tabular('time-sample', times_itrs['sample'][-1])
            # logger.record_tabular('time-total', total_time)
            # logger.record_tabular('epoch', epoch)
            # sampler.log_diagnostics()
            # logger.dump_tabular(with_prefix=False)
            logger.pop_prefix()

        sampler.terminate()
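# Editor's note: a minimal sketch (not part of the original code) of a helper that
# factors out the repeated "set noise on every agent, skipping agents whose policy
# has no set_noise_level method" pattern used throughout main() and objective().
# The helper name is hypothetical.
def set_agents_noise_level(agents, noise):
    """Best-effort noise update for a list of agents."""
    for agent in agents:
        set_noise = getattr(agent.policy, 'set_noise_level', None)
        if callable(set_noise):
            set_noise(noise)

# usage sketch: set_agents_noise_level(agents, 0.05)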
def objective(arglist):
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # dynamically grow the memory used on the GPU
    sess = tf.Session(config=config)
    set_session(sess)

    game_name = arglist.game_name  # 'abs', 'one'
    reward_type = arglist.reward_type
    p = arglist.p
    agent_num = arglist.n
    u_range = 1.
    k = 0
    print(arglist.aux, 'arglist.aux')
    model_names_setting = arglist.model_names_setting.split('_')
    model_names = model_names_setting
    model_name = '_'.join(model_names)
    path_prefix = game_name
    if game_name == 'pbeauty':
        env = PBeautyGame(agent_num=agent_num, reward_type=reward_type, p=p)
        path_prefix = game_name + '-' + reward_type + '-' + str(p)
    elif 'matrix' in game_name:
        matrix_game_name = game_name.split('-')[-1]
        repeated = arglist.repeat
        max_step = arglist.max_path_length
        memory = arglist.memory
        env = MatrixGame(game=matrix_game_name, agent_num=agent_num,
                         action_num=2, repeated=repeated,
                         max_step=max_step, memory=memory,
                         discrete_action=False, tuple_obs=False)
        path_prefix = '{}-{}-{}-{}'.format(game_name, repeated, max_step, memory)
    elif 'diff' in game_name:
        diff_game_name = game_name.split('-')[-1]
        agent_num = 3
        s2 = arglist.s2
        x2 = arglist.x2
        y2 = arglist.y2
        con = arglist.con
        env = DifferentialGame(diff_game_name, agent_num, x2, y2, s2, con)
    elif 'particle' in game_name:
        particle_game_name = game_name.split('-')[-1]
        env, agent_num, model_name, model_names = get_particle_game(particle_game_name, arglist)

    now = datetime.datetime.now()
    timestamp = now.strftime('%Y-%m-%d %H:%M:%S.%f %Z')
    if 'CG' in model_name:
        model_name = model_name + '-{}'.format(arglist.mu)
    if not arglist.aux:
        model_name = model_name + '-{}'.format(arglist.aux)

    suffix = '{}/{}/{}/{}'.format(path_prefix, agent_num, model_name, timestamp)
    print(suffix)

    # logger.add_tabular_output('./log/{}.csv'.format(suffix))
    # snapshot_dir = './snapshot/{}'.format(suffix)
    # policy_dir = './policy/{}'.format(suffix)
    # os.makedirs(snapshot_dir, exist_ok=True)
    # os.makedirs(policy_dir, exist_ok=True)
    # logger.set_snapshot_dir(snapshot_dir)

    agents = []
    M = arglist.hidden_size
    batch_size = arglist.batch_size
    sampler = MASampler(agent_num=agent_num, joint=True, global_reward=arglist.global_reward,
                        max_path_length=25, min_pool_size=100, batch_size=batch_size)

    base_kwargs = {
        'sampler': sampler,
        'epoch_length': 1,
        'n_epochs': arglist.max_steps,
        'n_train_repeat': 1,
        'eval_render': True,
        'eval_n_episodes': 10,
    }

    _alpha = arglist.alpha
    lr = arglist.lr
    n_pars = arglist.n_pars
    result = 0.
    with U.single_threaded_session():
        for i, model_name in enumerate(model_names):
            if 'PR2AC' in model_name:
                k = int(model_name[-1])
                g = False
                mu = arglist.mu
                if 'G' in model_name:
                    g = True
                agent = pr2ac_agent(model_name, i, env, M, u_range, base_kwargs,
                                    lr=lr, n_pars=n_pars, k=k, g=g, mu=mu,
                                    game_name=game_name, aux=arglist.aux)
            elif model_name == 'MASQL':
                agent = masql_agent(model_name, i, env, M, u_range, base_kwargs,
                                    lr=lr, n_pars=n_pars, game_name=game_name)
            elif model_name == 'ROMMEO':
                agent = rom_agent(model_name, i, env, M, u_range, base_kwargs,
                                  game_name=game_name)
            else:
                if model_name == 'DDPG':
                    joint = False
                    opponent_modelling = False
                elif model_name == 'MADDPG':
                    joint = True
                    opponent_modelling = False
                elif model_name == 'DDPG-OM':
                    joint = True
                    opponent_modelling = True
                agent = ddpg_agent(joint, opponent_modelling, model_names, i, env, M,
                                   u_range, base_kwargs, lr=lr, game_name=game_name)
            agents.append(agent)

        sampler.initialize(env, agents)

        for agent in agents:
            agent._init_training()

        gt.rename_root('MARLAlgorithm')
        gt.reset()
        gt.set_def_unique(False)

        initial_exploration_done = False
        # noise = .1
        noise = .5
        for agent in agents:
            try:
                agent.policy.set_noise_level(noise)
            except:
                pass
        # alpha = .5

        for steps in gt.timed_for(range(base_kwargs['n_epochs'] + 1)):
            # import pdb; pdb.set_trace()
            # alpha = _alpha + np.exp(-0.1 * max(steps - 10, 0)) * 500.
            if steps < base_kwargs['n_epochs'] // 3:
                # alpha = _alpha
                alpha = _alpha + np.exp(-0.1 * max(steps - 10, 0)) * 500.
            elif steps < base_kwargs['n_epochs'] // 2:
                alpha = _alpha / 10
            else:
                alpha = .3
            tflog('alpha', alpha)
            print('alpha', alpha)
            # if steps > 100 and steps < 150:
            #     alpha = .1 - 0.099 * steps / 150
            # elif steps >= 150:
            #     alpha = 1e-3
            print('alpha', alpha)
            # logger.push_prefix('Epoch #%d | ' % steps)

            if steps % (25 * 1000) == 0:
                print(suffix)

            for t in range(base_kwargs['epoch_length']):
                # TODO.code consolidation: Add control interval to sampler
                if not initial_exploration_done:
                    # if steps >= 1000:
                    if steps >= 10:
                        initial_exploration_done = True
                sampler.sample()
                if not initial_exploration_done:
                    continue
                gt.stamp('sample')
                print('Sample Done')

                if steps == 10000:
                    noise = 0.1
                    for agent in agents:
                        try:
                            agent.policy.set_noise_level(noise)
                        except:
                            pass
                    # alpha = 10.
                # if steps == 2000:
                if steps > base_kwargs['n_epochs'] / 10:
                    noise = 0.1
                    for agent in agents:
                        try:
                            agent.policy.set_noise_level(noise)
                        except:
                            pass
                    # alpha = .1
                if steps > base_kwargs['n_epochs'] / 5:
                    noise = 0.05
                    for agent in agents:
                        try:
                            agent.policy.set_noise_level(noise)
                        except:
                            pass
                if steps > base_kwargs['n_epochs'] / 6:
                    noise = 0.01
                    for agent in agents:
                        try:
                            agent.policy.set_noise_level(noise)
                        except:
                            pass

                if steps % arglist.training_interval != 0:
                    continue

                for j in range(base_kwargs['n_train_repeat']):
                    batch_n = []
                    recent_batch_n = []
                    indices = None
                    recent_indices = None
                    for i, agent in enumerate(agents):
                        if i == 0:
                            batch = agent.pool.random_batch(batch_size)
                            indices = agent.pool.indices
                            recent_indices = list(range(agent.pool._top - batch_size, agent.pool._top))
                        batch_n.append(agent.pool.random_batch_by_indices(indices))
                        recent_batch_n.append(agent.pool.random_batch_by_indices(recent_indices))
                    # print(len(batch_n))

                    target_next_actions_n = []
                    # try:
                    all_obs = np.array(np.concatenate(
                        [batch['observations'] for batch in batch_n], axis=-1))
                    all_next_obs = np.array(np.concatenate(
                        [batch['next_observations'] for batch in batch_n], axis=-1))
                    # print(all_obs[0])
                    for batch in batch_n:
                        # print('making all obs')
                        batch['all_observations'] = deepcopy(all_obs)
                        batch['all_next_observations'] = deepcopy(all_next_obs)

                    opponent_current_actions_n = []
                    for agent, batch in zip(agents, batch_n):
                        target_next_actions_n.append(
                            agent.target_policy.get_actions(batch['next_observations']))
                        opponent_current_actions_n.append(
                            agent.policy.get_actions(batch['observations']))

                    for i, agent in enumerate(agents):
                        batch_n[i]['opponent_current_actions'] = np.reshape(
                            np.delete(deepcopy(opponent_current_actions_n), i, 0),
                            (-1, agent._opponent_action_dim))
                    # except:
                    #     pass

                    opponent_actions_n = np.array([batch['actions'] for batch in batch_n])
                    recent_opponent_actions_n = np.array([batch['actions'] for batch in recent_batch_n])

                    ####### figure out
                    recent_opponent_observations_n = []
                    for batch in recent_batch_n:
                        recent_opponent_observations_n.append(batch['observations'])

                    current_actions = [
                        agents[i].policy.get_actions(batch_n[i]['next_observations'])[0][0]
                        for i in range(agent_num)]
                    all_actions_k = []
                    for i, agent in enumerate(agents):
                        if isinstance(agent, MAVBAC):
                            if agent._k > 0:
                                batch_actions_k = agent.policy.get_all_actions(batch_n[i]['next_observations'])
                                actions_k = [a[0][0] for a in batch_actions_k]
                                all_actions_k.append(';'.join(list(map(str, actions_k))))
                    # if len(all_actions_k) > 0:
                    #     with open('{}/all_actions.csv'.format(policy_dir), 'a') as f:
                    #         f.write(','.join(list(map(str, all_actions_k))) + '\n')
                    # with open('{}/policy.csv'.format(policy_dir), 'a') as f:
                    #     f.write(','.join(list(map(str, current_actions))) + '\n')
                    # print('============')

                    for i, agent in enumerate(agents):
                        try:
                            batch_n[i]['next_actions'] = deepcopy(target_next_actions_n[i])
                        except:
                            pass
                        batch_n[i]['opponent_actions'] = np.reshape(
                            np.delete(deepcopy(opponent_actions_n), i, 0),
                            (-1, agent._opponent_action_dim))
                        if agent.joint:
                            if agent.opponent_modelling:
                                batch_n[i]['recent_opponent_observations'] = recent_opponent_observations_n[i]
                                batch_n[i]['recent_opponent_actions'] = np.reshape(
                                    np.delete(deepcopy(recent_opponent_actions_n), i, 0),
                                    (-1, agent._opponent_action_dim))
                                batch_n[i]['opponent_next_actions'] = agent.opponent_policy.get_actions(
                                    batch_n[i]['next_observations'])
                            else:
                                batch_n[i]['opponent_next_actions'] = np.reshape(
                                    np.delete(deepcopy(target_next_actions_n), i, 0),
                                    (-1, agent._opponent_action_dim))

                        if isinstance(agent, MAVBAC) or isinstance(agent, MASQL) or isinstance(agent, ROMMEO):
                            agent._do_training(iteration=t + steps * agent._epoch_length,
                                               batch=batch_n[i], annealing=alpha)
                        else:
                            agent._do_training(iteration=t + steps * agent._epoch_length,
                                               batch=batch_n[i])
                gt.stamp('train')

        result = sampler.terminate()
        clear_session()
        return result
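# Editor's note: a sketch (assumption, not original code) of the annealing schedule
# used inside objective() above, pulled out as a pure function so it can be
# unit-tested or plotted. The function name is hypothetical; the piecewise values
# mirror the inline logic over `steps`.
import numpy as np

def annealing_alpha(step, n_epochs, base_alpha):
    """Annealing coefficient passed to MAVBAC/MASQL/ROMMEO training."""
    if step < n_epochs // 3:
        return base_alpha + np.exp(-0.1 * max(step - 10, 0)) * 500.
    elif step < n_epochs // 2:
        return base_alpha / 10
    return .3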
def train(self, init_episode=0):
    if init_episode == 0:
        # Eval and log
        self.eval()
        self.log(write_table_header=True)

    gt.reset()
    gt.set_def_unique(False)

    expected_accum_rewards = np.zeros(self.total_episodes)

    episodes_iter = range(init_episode, self.total_episodes)
    if not logger.get_log_stdout():
        # Fancy iterable bar
        episodes_iter = tqdm.tqdm(episodes_iter)

    for it in gt.timed_for(episodes_iter, save_itrs=True):
        # Put models in training mode
        for model in self.trainable_models:
            model.train()

        obs = self.env.reset()
        rollout_steps = 0
        for step in range(self.train_steps):
            if self.render:
                self.env.render()

            interaction_info = interaction(
                self.env, self.policy, obs,
                device=self.torch_device,
                deterministic=False,
            )
            self.num_train_interactions += 1
            rollout_steps += 1
            gt.stamp('sample')

            # Add data to replay_buffer
            self.replay_buffer.add_sample(**interaction_info)

            # Only train when there are enough samples from buffer
            if self.replay_buffer.available_samples() > self.batch_size:
                for ii in range(self.optimization_steps):
                    self.learn()
            gt.stamp('train')

            # Reset environment if it is done
            if interaction_info['termination'] \
                    or rollout_steps > self.max_horizon:
                obs = self.env.reset()
                rollout_steps = 0
            else:
                obs = interaction_info['next_obs']

        # Evaluate current policy to check performance
        expected_accum_rewards[it] = self.eval()

        self.log()

        self.num_episodes += 1

    return expected_accum_rewards
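# Editor's note: a minimal stub (assumption, not original code) of the interaction()
# helper used in the loop above, reconstructed only from how its return value is
# consumed there ('termination' and 'next_obs' keys, remaining fields forwarded to
# replay_buffer.add_sample). The policy call signature and gym-style env.step are
# assumptions; field names other than the two consumed keys are illustrative.
import torch

def interaction(env, policy, obs, device='cpu', deterministic=False):
    """Take one environment step with `policy` and package it for a replay buffer."""
    with torch.no_grad():
        action = policy(torch.as_tensor(obs, dtype=torch.float32, device=device),
                        deterministic=deterministic)
    action = action.cpu().numpy()
    next_obs, reward, done, _ = env.step(action)
    return dict(obs=obs, action=action, reward=reward,
                termination=done, next_obs=next_obs)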
def _train(self, env, policy, pool, load=None):
    """Perform RL training.

    Args:
        env (`rllab.Env`): Environment used for training
        policy (`Policy`): Policy used for training
        pool (`PoolBase`): Sample pool to add samples to
        load (str, optional): Path of a TensorFlow checkpoint to restore before training
    """
    self._init_training()
    self.sampler.initialize(env, policy, pool)
    # evaluation_env = deep_clone(env) if self._eval_n_episodes else None

    with tf_utils.get_default_session().as_default() as sess:
        if load is not None:
            saver = tf.train.Saver()
            saver.restore(sess, load)
            print('pre-trained model restored ...')

        gt.rename_root('RLAlgorithm')
        gt.reset()
        gt.set_def_unique(False)

        for epoch in gt.timed_for(range(self._n_epochs + 1), save_itrs=True):
            logger.push_prefix('Epoch #%d | ' % epoch)

            for t in range(self._epoch_length):
                self.sampler.sample()
                if not self.sampler.batch_ready():
                    continue
                gt.stamp('sample')

                for i in range(self._n_train_repeat):
                    self._do_training(iteration=t + epoch * self._epoch_length,
                                      batch=self.sampler.random_batch())
                gt.stamp('train')

            if epoch % 1 == 0 or epoch >= ENV_PARAMS['n_epochs'] - 20:
                self._evaluate(policy, env)
                print('@ epoch %d : ' % epoch)
            # gt.stamp('eval')
            #
            # params = self.get_snapshot(epoch)
            # logger.save_itr_params(epoch, params)
            #
            # time_itrs = gt.get_times().stamps.itrs
            # time_eval = time_itrs['eval'][-1]
            # time_total = gt.get_times().total
            # time_train = time_itrs.get('train', [0])[-1]
            # time_sample = time_itrs.get('sample', [0])[-1]
            #
            # logger.record_tabular('time-train', time_train)
            # logger.record_tabular('time-eval', time_eval)
            # logger.record_tabular('time-sample', time_sample)
            # logger.record_tabular('time-total', time_total)
            # logger.record_tabular('epoch', epoch)

            self.sampler.log_diagnostics()
            logger.dump_tabular(with_prefix=False)
            logger.pop_prefix()
            # env.reset()

            if (epoch > ENV_PARAMS['n_epochs'] * 0 and epoch % 5 == 0) or epoch >= ENV_PARAMS['n_epochs'] - 100:
                saver = tf.train.Saver()
                saver.save(sess, save_path=save_path + '/model-' + str(epoch) + '.ckpt')
                print('Model saved ...')

        self.sampler.terminate()
def _train(self, env, policy, replay_buffer, sess):
    """Perform RL training.

    Parameters
    ----------
    env : gym.Env
        Environment used for training
    policy : Policy
        Policy used for training
    replay_buffer : ReplayBuffer
        Replay buffer to add samples to
    sess : tf.Session
        TensorFlow session the training ops run in
    """
    self._init_training()
    self.sampler.initialize(env, policy, replay_buffer)
    evaluation_env = deep_clone(env) if self._eval_n_episodes else None

    with sess.as_default():
        gt.rename_root('RLAlgorithm')
        gt.reset()
        gt.set_def_unique(False)

        for epoch in gt.timed_for(range(self._n_epochs + 1), save_itrs=True):
            logger.push_prefix('Epoch #%d | ' % epoch)

            for t in range(self._epoch_length):
                self.sampler.sample()
                if not self.sampler.batch_ready():
                    continue
                gt.stamp('sample')

                for i in range(self._n_train_repeat):
                    self._do_training(iteration=t + epoch * self._epoch_length,
                                      batch=self.sampler.random_batch())
                gt.stamp('train')

            self._evaluate(policy, evaluation_env)
            gt.stamp('eval')

            params = self.get_snapshot(epoch)
            logger.save_itr_params(epoch, params)

            time_itrs = gt.get_times().stamps.itrs
            time_eval = time_itrs['eval'][-1]
            time_total = gt.get_times().total
            time_train = time_itrs.get('train', [0])[-1]
            time_sample = time_itrs.get('sample', [0])[-1]

            logger.record_tabular('time-train', time_train)
            logger.record_tabular('time-eval', time_eval)
            logger.record_tabular('time-sample', time_sample)
            logger.record_tabular('time-total', time_total)
            logger.record_tabular('epoch', epoch)
            logger.record_time(time_total, time_sample)

            self.sampler.log_diagnostics()
            logger.save_stats()
            logger.dump_tabular(with_prefix=False)
            logger.pop_prefix()

        self.sampler.terminate()
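# Editor's note: a self-contained sketch of the gtimer bookkeeping that the _train()
# methods in this file rely on, using only the calls already present above
# (timed_for, stamp, get_times). The sleeps stand in for sampling and training work;
# the printed values correspond to the 'time-train' / 'time-total' tabular keys.
import time
import gtimer as gt

gt.reset()
gt.set_def_unique(False)
for epoch in gt.timed_for(range(3), save_itrs=True):
    time.sleep(0.01)              # stand-in for environment sampling
    gt.stamp('sample')
    time.sleep(0.01)              # stand-in for gradient updates
    gt.stamp('train')

itrs = gt.get_times().stamps.itrs
print(itrs['train'][-1])          # duration of the last 'train' stamp ('time-train')
print(gt.get_times().total)       # total wall-clock time ('time-total')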
def _train(self, env, policy, pool, qf=None, vf=None, saver=None, _ec=None, dynamic_ec=False):
    """Perform RL training.

    Args:
        env (`rllab.Env`): Environment used for training
        policy (`Policy`): Policy used for training
        pool (`PoolBase`): Sample pool to add samples to
    """
    self._init_training(env, policy, pool)
    self.sampler.initialize(env, policy, pool)

    if dynamic_ec:
        decrease_rate = _ec / self._n_epochs

    logger2 = mylogger2.get_logger()
    os.makedirs(os.path.join(logger2.log_dir, 'model'), exist_ok=logger2.exist_ok)
    optuna_break = False

    with self._sess.as_default():
        gt.rename_root('RLAlgorithm')
        gt.reset()
        gt.set_def_unique(False)

        for epoch in gt.timed_for(range(self._n_epochs), save_itrs=True):
            if optuna_break:
                continue
            # logger.push_prefix('Epoch #%d | ' % epoch)
            epoch_states = []
            kurtosis = []
            signed_variance = []

            for t in range(self._epoch_length):
                # TODO.codeconsolidation: Add control interval to sampler
                done, _n_episodes, obs, next_obs, info = self.sampler.sample()
                epoch_states.append(obs)

                state_importances = self.policy.calc_knack([obs])
                kurtosis.append(state_importances["kurtosis"][0])
                signed_variance.append(state_importances["signed_variance"][0])

                # be careful of batch_ready < epoch_length
                if not self.sampler.batch_ready():
                    continue
                gt.stamp('sample')

                for i in range(self._n_train_repeat):
                    self._do_training(iteration=t + epoch * self._epoch_length,
                                      batch=self.sampler.random_batch())
                gt.stamp('train')

            # evaluation
            if epoch % self._eval_n_frequency == 0:
                eval_average_return = self._evaluate(epoch)
                logger.record_tabular('eval_average_return', eval_average_return)
                if hasattr(self.policy, "optuna_trial"):
                    if self.policy.optuna_trial is not None:
                        # report intermediate value
                        self.policy.optuna_trial.report(eval_average_return, epoch)
                        if self.policy.optuna_trial.should_prune():
                            optuna_break = True
                            continue
                            # raise optuna.structs.TrialPruned()
            else:
                logger.record_tabular('eval_average_return', np.nan)
            gt.stamp('eval')

            # logging about time and step
            times_itrs = gt.get_times().stamps.itrs
            eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
            total_time = gt.get_times().total
            logger.record_tabular('time-train', times_itrs['train'][-1])
            logger.record_tabular('time-eval', eval_time)
            logger.record_tabular('time-sample', times_itrs['sample'][-1])
            logger.record_tabular('time-total', total_time)
            logger.record_tabular('epoch', epoch)
            logger.record_tabular('total_step', self.sampler._total_samples)
            logger.record_tabular('total_episode', self.sampler._n_episodes)

            # logging about array
            if hasattr(self.policy, "current_knack_thresh"):
                current_knack_thresh = self.policy.current_knack_thresh
                _ = self.policy.calc_and_update_knack(epoch_states)

            if logger2.save_array_flag:
                kwargs1 = {
                    'epoch': epoch,
                    'states': np.array(epoch_states),
                    'knack_kurtosis': np.array(kurtosis),
                    'signed_variance': np.array(signed_variance),
                }
                if hasattr(self.policy, "current_knack_thresh"):
                    kwargs1.update({'current_knack_thresh': current_knack_thresh})
                kwargs1.update(self.policy.get_q_params())
                logger2.add_array_data(kwargs1)

                if epoch % 10 == 0:
                    # TODO save only parameters
                    saver.save(self._sess, os.path.join(logger2.log_dir, 'model'))
                    gt.stamp("tf save")
            gt.stamp("calc knacks")

            if dynamic_ec:
                self._sess.run(tf.assign(_ec, _ec - decrease_rate))

            logger.dump_tabular()
            logger2.write()
            # print(gt.report())

        # finalize processing
        if optuna_break:
            return None
        if logger2.save_array_flag:
            saver.save(self._sess, os.path.join(logger2.log_dir, 'model'))
        self.sampler.terminate()
        return eval_average_return
def _train(self):
    """Return a generator that runs the standard RL loop."""
    training_environment = self._training_environment
    evaluation_environment = self._evaluation_environment
    policy = self._policy

    gt.reset_root()
    gt.rename_root('RLAlgorithm')
    gt.set_def_unique(False)

    self._training_before_hook()

    for self._epoch in gt.timed_for(range(self._epoch, self._n_epochs)):
        self._epoch_before_hook()
        gt.stamp('epoch_before_hook')

        update_diagnostics = []

        start_samples = self.sampler._total_samples
        for i in count():
            samples_now = self.sampler._total_samples
            self._timestep = samples_now - start_samples

            if (samples_now >= start_samples + self._epoch_length
                    and self.ready_to_train):
                break

            self._timestep_before_hook()
            gt.stamp('timestep_before_hook')

            self._do_sampling(timestep=self._total_timestep)
            gt.stamp('sample')

            if self.ready_to_train:
                repeat_diagnostics = self._do_training_repeats(timestep=self._total_timestep)
                if repeat_diagnostics is not None:
                    update_diagnostics.append(repeat_diagnostics)
                gt.stamp('train')

            self._timestep_after_hook()
            gt.stamp('timestep_after_hook')

        update_diagnostics = tree.map_structure(lambda *d: np.mean(d), *update_diagnostics)

        training_paths = self.sampler.get_last_n_paths(
            math.ceil(self._epoch_length / self.sampler._max_path_length))
        gt.stamp('training_paths')
        evaluation_paths = self._evaluation_paths(policy, evaluation_environment)
        gt.stamp('evaluation_paths')

        training_metrics = self._evaluate_rollouts(training_paths,
                                                   training_environment,
                                                   self._total_timestep,
                                                   evaluation_type='train')
        gt.stamp('training_metrics')
        if evaluation_paths:
            evaluation_metrics = self._evaluate_rollouts(evaluation_paths,
                                                         evaluation_environment,
                                                         self._total_timestep,
                                                         evaluation_type='evaluation')
            gt.stamp('evaluation_metrics')
        else:
            evaluation_metrics = {}

        self._epoch_after_hook(training_paths)
        gt.stamp('epoch_after_hook')

        sampler_diagnostics = self.sampler.get_diagnostics()

        diagnostics = self.get_diagnostics(iteration=self._total_timestep,
                                           batch=self._evaluation_batch(),
                                           training_paths=training_paths,
                                           evaluation_paths=evaluation_paths)

        time_diagnostics = {
            key: times[-1]
            for key, times in gt.get_times().stamps.itrs.items()
        }

        # TODO(hartikainen/tf2): Fix the naming of training/update
        # diagnostics/metrics
        diagnostics.update((
            ('evaluation', evaluation_metrics),
            ('training', training_metrics),
            ('update', update_diagnostics),
            ('times', time_diagnostics),
            ('sampler', sampler_diagnostics),
            ('epoch', self._epoch),
            ('timestep', self._timestep),
            ('total_timestep', self._total_timestep),
            ('num_train_steps', self._num_train_steps),
        ))

        if self._eval_render_kwargs and hasattr(evaluation_environment, 'render_rollouts'):
            # TODO(hartikainen): Make this consistent such that there's no
            # need for the hasattr check.
            training_environment.render_rollouts(evaluation_paths)

        yield diagnostics

    self.sampler.terminate()

    self._training_after_hook()

    yield {'done': True, **diagnostics}
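# Editor's note: a short usage sketch (assumption, not original code) of how a caller
# can drive the generator returned by the _train() method above: iterate the yielded
# diagnostics until the sentinel {'done': True, ...} payload appears. `algorithm` and
# `report` are placeholders.
def run_to_completion(algorithm, report=print):
    for diagnostics in algorithm._train():
        report(diagnostics)
        if diagnostics.get('done'):
            break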
def _train(self, env, policy, initial_exploration_policy, pool):
    """Perform RL training.

    Args:
        env (`rllab.Env`): Environment used for training
        policy (`Policy`): Policy used for training
        initial_exploration_policy ('Policy'): Policy used for exploration
            If None, then all exploration is done using policy
        pool (`PoolBase`): Sample pool to add samples to
    """
    self._init_training(env, policy, pool)

    if initial_exploration_policy is None:
        self.sampler.initialize(env, policy, pool)
        initial_exploration_done = True
    else:
        self.sampler.initialize(env, initial_exploration_policy, pool)
        initial_exploration_done = False

    with self._sess.as_default():
        gt.rename_root('RLAlgorithm')
        gt.reset()
        gt.set_def_unique(False)

        for epoch in gt.timed_for(range(self._n_epochs + 1), save_itrs=True):
            logger.push_prefix('Epoch #%d | ' % epoch)

            for t in range(self._epoch_length):
                # TODO.codeconsolidation: Add control interval to sampler
                if not initial_exploration_done:
                    if self._epoch_length * epoch >= self._n_initial_exploration_steps:
                        self.sampler.set_policy(policy)
                        initial_exploration_done = True
                self.sampler.sample()
                if not self.sampler.batch_ready():
                    continue
                gt.stamp('sample')

                for i in range(self._n_train_repeat):
                    self._do_training(iteration=t + epoch * self._epoch_length,
                                      batch=self.sampler.random_batch())
                gt.stamp('train')

            self._evaluate(epoch)

            params = self.get_snapshot(epoch)
            logger.save_itr_params(epoch, params)
            times_itrs = gt.get_times().stamps.itrs

            eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
            total_time = gt.get_times().total
            logger.record_tabular('time-train', times_itrs['train'][-1])
            logger.record_tabular('time-eval', eval_time)
            logger.record_tabular('time-sample', times_itrs['sample'][-1])
            logger.record_tabular('time-total', total_time)
            logger.record_tabular('epoch', epoch)

            self.sampler.log_diagnostics()

            logger.dump_tabular(with_prefix=False)
            logger.pop_prefix()

            gt.stamp('eval')

        self.sampler.terminate()
def _train(self):
    """Return a generator that performs RL training.

    Args:
        env (`SoftlearningEnv`): Environment used for training.
        policy (`Policy`): Policy used for training
        initial_exploration_policy ('Policy'): Policy used for exploration
            If None, then all exploration is done using policy
        pool (`PoolBase`): Sample pool to add samples to
    """
    training_environment = self._training_environment
    evaluation_environment = self._evaluation_environment
    policy = self._policy
    pool = self._pool
    model_metrics = {}

    if not self._training_started:
        self._init_training()
        self._initial_exploration_hook(training_environment,
                                       self._initial_exploration_policy, pool)

    self.sampler.initialize(training_environment, policy, pool)

    gt.reset_root()
    gt.rename_root('RLAlgorithm')
    gt.set_def_unique(False)

    self._training_before_hook()

    for self._epoch in gt.timed_for(range(self._epoch, self._n_epochs)):
        self._epoch_before_hook()
        gt.stamp('epoch_before_hook')

        self._training_progress = Progress(self._epoch_length * self._n_train_repeat)
        start_samples = self.sampler._total_samples
        for i in count():
            samples_now = self.sampler._total_samples
            self._timestep = samples_now - start_samples

            if (samples_now >= start_samples + self._epoch_length
                    and self.ready_to_train):
                break

            self._timestep_before_hook()
            gt.stamp('timestep_before_hook')

            if self._timestep % self._model_train_freq == 0 and self._real_ratio < 1.0:
                self._training_progress.pause()
                print('[ MEEE ] log_dir: {} | ratio: {}'.format(self._log_dir, self._real_ratio))
                print('[ MEEE ] Training model at epoch {} | freq {} | timestep {} (total: {}) | epoch train steps: {} (total: {})'
                      .format(self._epoch, self._model_train_freq, self._timestep,
                              self._total_timestep, self._train_steps_this_epoch,
                              self._num_train_steps))

                model_train_metrics = self._train_model(batch_size=256,
                                                        max_epochs=None,
                                                        holdout_ratio=0.2,
                                                        max_t=self._max_model_t)
                model_metrics.update(model_train_metrics)
                gt.stamp('epoch_train_model')

                self._set_rollout_length()
                self._reallocate_model_pool()
                model_rollout_metrics = self._rollout_model(
                    rollout_batch_size=self._rollout_batch_size,
                    deterministic=self._deterministic)
                model_metrics.update(model_rollout_metrics)
                gt.stamp('epoch_rollout_model')

                # self._visualize_model(self._evaluation_environment, self._total_timestep)
                self._training_progress.resume()

            # No UCB exploration
            # self._do_sampling(timestep=self._total_timestep)
            self._do_sampling(timestep=self._total_timestep, disturb=True,
                              fake_env=self.fake_env, Qs=self._Qs)
            # print("**exploration**")
            gt.stamp('sample')

            if self.ready_to_train:
                self._do_training_repeats(timestep=self._total_timestep)
            gt.stamp('train')

            self._timestep_after_hook()
            gt.stamp('timestep_after_hook')

        training_paths = self.sampler.get_last_n_paths(
            math.ceil(self._epoch_length / self.sampler._max_path_length))
        gt.stamp('training_paths')
        evaluation_paths = self._evaluation_paths(policy, evaluation_environment)
        gt.stamp('evaluation_paths')

        training_metrics = self._evaluate_rollouts(training_paths, training_environment)
        gt.stamp('training_metrics')
        if evaluation_paths:
            evaluation_metrics = self._evaluate_rollouts(evaluation_paths, evaluation_environment)
            gt.stamp('evaluation_metrics')
        else:
            evaluation_metrics = {}

        self._epoch_after_hook(training_paths)
        gt.stamp('epoch_after_hook')

        sampler_diagnostics = self.sampler.get_diagnostics()

        diagnostics = self.get_diagnostics(iteration=self._total_timestep,
                                           batch=self._evaluation_batch(),
                                           training_paths=training_paths,
                                           evaluation_paths=evaluation_paths)

        time_diagnostics = gt.get_times().stamps.itrs
        diagnostics.update(OrderedDict((
            *((f'evaluation/{key}', evaluation_metrics[key])
              for key in sorted(evaluation_metrics.keys())),
            *((f'training/{key}', training_metrics[key])
              for key in sorted(training_metrics.keys())),
            *((f'times/{key}', time_diagnostics[key][-1])
              for key in sorted(time_diagnostics.keys())),
            *((f'sampler/{key}', sampler_diagnostics[key])
              for key in sorted(sampler_diagnostics.keys())),
            *((f'model/{key}', model_metrics[key])
              for key in sorted(model_metrics.keys())),
            ('epoch', self._epoch),
            ('timestep', self._timestep),
            ('timesteps_total', self._total_timestep),
            ('train-steps', self._num_train_steps),
        )))

        if self._eval_render_mode is not None and hasattr(evaluation_environment, 'render_rollouts'):
            training_environment.render_rollouts(evaluation_paths)

        yield diagnostics

    self.sampler.terminate()

    self._training_after_hook()

    self._training_progress.close()

    yield {'done': True, **diagnostics}
def train(self):
    '''
    meta-training loop
    '''
    self.pretrain()
    params = self.get_epoch_snapshot(-1)
    logger.save_itr_params(-1, params)

    gt.reset()
    gt.set_def_unique(False)
    self._current_path_builder = PathBuilder()

    # at each iteration, we first collect data from tasks, perform meta-updates, then try to evaluate
    for it_ in gt.timed_for(
            range(self.num_iterations),
            save_itrs=True,
    ):
        self._start_epoch(it_)
        self.training_mode(True)

        # In offline PEARL, we do not need to collect data
        # self.replay_buffer = self.enc_replay_buffer from above initialization
        if it_ == 0:
            print('collecting initial pool of data for train and eval')
            # temp for evaluating
            for idx in self.train_tasks:  # [0, 1]
                self.task_idx = idx
                self.env.reset_task(idx)  # 0 for {'distributions': -1}, which is cheetah-mixed
                self.collect_data(self.num_initial_steps, 1, np.inf)

        # Sample data from train tasks.
        for i in range(self.num_tasks_sample):
            idx = np.random.randint(len(self.train_tasks))
            self.task_idx = idx
            self.env.reset_task(idx)
            self.enc_replay_buffer.task_buffers[idx].clear()

            # collect some trajectories with z ~ prior
            if self.num_steps_prior > 0:
                self.collect_data(self.num_steps_prior, 1, np.inf)
            # collect some trajectories with z ~ posterior
            if self.num_steps_posterior > 0:
                self.collect_data(self.num_steps_posterior, 1, self.update_post_train)
            # even if encoder is trained only on samples from the prior,
            # the policy needs to learn to handle z ~ posterior
            if self.num_extra_rl_steps_posterior > 0:
                self.collect_data(self.num_extra_rl_steps_posterior, 1,
                                  self.update_post_train, add_to_enc_buffer=False)

        # Sample train tasks and compute gradient updates on parameters.
        for train_step in range(self.num_train_steps_per_itr):
            indices = np.random.choice(self.train_tasks, self.meta_batch)
            self._do_training(indices)
            self._n_train_steps_total += 1
        gt.stamp('train')

        self.training_mode(False)

        # eval
        self._try_to_eval(it_)
        gt.stamp('eval')

        self._end_epoch()
def train(self):
    '''
    meta-training loop
    '''
    self.pretrain()
    params = self.get_epoch_snapshot(-1)
    logger.save_itr_params(-1, params)

    gt.reset()
    gt.set_def_unique(False)
    self._current_path_builder = PathBuilder()
    self.train_obs = self._start_new_rollout()

    # at each iteration, we first collect data from tasks, perform meta-updates, then try to evaluate
    for it_ in gt.timed_for(
            range(self.num_iterations),
            save_itrs=True,
    ):
        self._start_epoch(it_)
        self.training_mode(True)

        if it_ == 0:
            print('collecting initial pool of data for train and eval')
            # temp for evaluating
            for idx in self.train_tasks:
                self.task_idx = idx
                self.env.reset_task(idx)
                self.collect_data_sampling_from_prior(
                    num_samples=self.max_path_length * 10,
                    resample_z_every_n=self.max_path_length,
                    eval_task=False)
            """
            for idx in self.eval_tasks:
                self.task_idx = idx
                self.env.reset_task(idx)
                # TODO: make number of initial trajectories a parameter
                self.collect_data_sampling_from_prior(
                    num_samples=self.max_path_length * 20,
                    resample_z_every_n=self.max_path_length,
                    eval_task=True)
            """

        # Sample data from train tasks.
        for i in range(self.num_tasks_sample):
            idx = np.random.randint(len(self.train_tasks))
            self.task_idx = idx
            self.env.reset_task(idx)

            # TODO: there may be more permutations of sampling/adding to encoding buffer we may wish to try
            if self.train_embedding_source == 'initial_pool':
                # embeddings are computed using only the initial pool of data
                # sample data from posterior to train RL algorithm
                self.collect_data_from_task_posterior(
                    idx=idx,
                    num_samples=self.num_steps_per_task,
                    add_to_enc_buffer=False)
            elif self.train_embedding_source == 'posterior_only':
                self.collect_data_from_task_posterior(
                    idx=idx,
                    num_samples=self.num_steps_per_task,
                    eval_task=False,
                    add_to_enc_buffer=True)
            elif self.train_embedding_source == 'online_exploration_trajectories':
                # embeddings are computed using only data collected using the prior
                # sample data from posterior to train RL algorithm
                self.enc_replay_buffer.task_buffers[idx].clear()
                # resamples using current policy, conditioned on prior
                self.collect_data_sampling_from_prior(
                    num_samples=self.num_steps_per_task,
                    resample_z_every_n=self.max_path_length,
                    add_to_enc_buffer=True)
                self.collect_data_from_task_posterior(
                    idx=idx,
                    num_samples=self.num_steps_per_task,
                    add_to_enc_buffer=False)
            elif self.train_embedding_source == 'online_on_policy_trajectories':
                # sample from prior, then sample more from the posterior
                # embeddings computed from both prior and posterior data
                self.enc_replay_buffer.task_buffers[idx].clear()
                self.collect_data_online(
                    idx=idx,
                    num_samples=self.num_steps_per_task,
                    add_to_enc_buffer=True)
            else:
                raise Exception(
                    "Invalid option for computing train embedding {}".format(
                        self.train_embedding_source))

        # Sample train tasks and compute gradient updates on parameters.
        for train_step in range(self.num_train_steps_per_itr):
            indices = np.random.choice(self.train_tasks, self.meta_batch)
            self._do_training(indices)
            self._n_train_steps_total += 1
        gt.stamp('train')

        # self.training_mode(False)

        # eval
        self._try_to_eval(it_)
        gt.stamp('eval')

        self._end_epoch()