def _train(self, env, policy, pool): """Perform RL training. Args: env (`rllab.Env`): Environment used for training policy (`Policy`): Policy used for training pool (`PoolBase`): Sample pool to add samples to """ self._init_training() self.sampler.initialize(env, policy, pool) evaluation_env = deep_clone(env) if self._eval_n_episodes else None # TODO: use Ezpickle to deep_clone??? # evaluation_env = env with tf_utils.get_default_session().as_default(): gt.rename_root('RLAlgorithm') gt.reset() gt.set_def_unique(False) for epoch in gt.timed_for( range(self._n_epochs + 1), save_itrs=True): logger.push_prefix('Epoch #%d | ' % epoch) for t in range(self._epoch_length): self.sampler.sample() if not self.sampler.batch_ready(): continue gt.stamp('sample') for i in range(self._n_train_repeat): self._do_training( iteration=t + epoch * self._epoch_length, batch=self.sampler.random_batch()) gt.stamp('train') self._evaluate(policy, evaluation_env) gt.stamp('eval') params = self.get_snapshot(epoch) logger.save_itr_params(epoch, params) time_itrs = gt.get_times().stamps.itrs time_eval = time_itrs['eval'][-1] time_total = gt.get_times().total time_train = time_itrs.get('train', [0])[-1] time_sample = time_itrs.get('sample', [0])[-1] logger.record_tabular('time-train', time_train) logger.record_tabular('time-eval', time_eval) logger.record_tabular('time-sample', time_sample) logger.record_tabular('time-total', time_total) logger.record_tabular('epoch', epoch) self.sampler.log_diagnostics() logger.dump_tabular(with_prefix=False) logger.pop_prefix() self.sampler.terminate()
def _train(self, env, policy, pool): """Perform RL training. Args: env (`rllab.Env`): Environment used for training policy (`Policy`): Policy used for training pool (`PoolBase`): Sample pool to add samples to """ self._init_training() self.sampler.initialize(env, policy, pool) evaluation_env = deep_clone(env) if self._eval_n_episodes else None with tf_utils.get_default_session().as_default(): gt.rename_root('RLAlgorithm') gt.reset() gt.set_def_unique(False) for epoch in gt.timed_for( range(self._n_epochs + 1), save_itrs=True): logger.push_prefix('Epoch #%d | ' % epoch) for t in range(self._epoch_length): self.sampler.sample() if not self.sampler.batch_ready(): continue gt.stamp('sample') for i in range(self._n_train_repeat): self._do_training( iteration=t + epoch * self._epoch_length, batch=self.sampler.random_batch()) gt.stamp('train') self._evaluate(policy, evaluation_env) gt.stamp('eval') params = self.get_snapshot(epoch) logger.save_itr_params(epoch, params) time_itrs = gt.get_times().stamps.itrs time_eval = time_itrs['eval'][-1] time_total = gt.get_times().total time_train = time_itrs.get('train', [0])[-1] time_sample = time_itrs.get('sample', [0])[-1] logger.record_tabular('time-train', time_train) logger.record_tabular('time-eval', time_eval) logger.record_tabular('time-sample', time_sample) logger.record_tabular('time-total', time_total) logger.record_tabular('epoch', epoch) self.sampler.log_diagnostics() logger.dump_tabular(with_prefix=False) logger.pop_prefix()
def _train(self, env, policy, pool): """Perform RL training. Args: env (`rllab.Env`): Environment used for training policy (`Policy`): Policy used for training pool (`PoolBase`): Sample pool to add samples to """ self._init_training(env, policy, pool) self.sampler.initialize(env, policy, pool) with self._sess.as_default(): gt.rename_root('RLAlgorithm') gt.reset() gt.set_def_unique(False) for epoch in gt.timed_for(range(self._n_epochs + 1), save_itrs=True): logger.push_prefix('Epoch #%d | ' % epoch) for t in range(self._epoch_length): # TODO.codeconsolidation: Add control interval to sampler self.sampler.sample() if not self.sampler.batch_ready(): continue gt.stamp('sample') for i in range(self._n_train_repeat): self._do_training(iteration=t + epoch * self._epoch_length, batch=self.sampler.random_batch()) gt.stamp('train') self._evaluate(epoch) params = self.get_snapshot(epoch) logger.save_itr_params(epoch, params) times_itrs = gt.get_times().stamps.itrs eval_time = times_itrs['eval'][-1] if epoch > 1 else 0 total_time = gt.get_times().total logger.record_tabular('time-train', times_itrs['train'][-1]) logger.record_tabular('time-eval', eval_time) logger.record_tabular('time-sample', times_itrs['sample'][-1]) logger.record_tabular('time-total', total_time) logger.record_tabular('epoch', epoch) self.sampler.log_diagnostics() logger.dump_tabular(with_prefix=False) logger.pop_prefix() gt.stamp('eval') self.sampler.terminate()
def _train(self, env, policy, uniform_policy, pool):
    self._init_training(env, policy, pool)
    self.sampler.initialize(env, uniform_policy, pool)  # use uniform sampler initially

    with self._sess.as_default():
        gt.rename_root('RLAlgorithm')
        gt.reset()
        gt.set_def_unique(False)

        for epoch in gt.timed_for(range(self._n_epochs + 1),
                                  save_itrs=True):
            if self._epoch_length * epoch >= self._n_random_steps:
                self.sampler.set_policy(policy)

            logger.push_prefix('Epoch #%d | ' % epoch)

            for t in range(self._epoch_length):
                self.sampler.sample()
                if not self.sampler.batch_ready():
                    continue
                gt.stamp('sample')

                for i in range(self._n_train_repeat):
                    self._do_training(
                        itr=t + epoch * self._epoch_length,
                        batch=self.sampler.random_batch())
                gt.stamp('train')

            self._evaluate(epoch)

            params = self.get_snapshot(epoch)
            logger.save_itr_params(epoch, params)
            times_itrs = gt.get_times().stamps.itrs

            eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
            total_time = gt.get_times().total
            logger.record_tabular('time-train', times_itrs['train'][-1])
            logger.record_tabular('time-eval', eval_time)
            logger.record_tabular('time-sample', times_itrs['sample'][-1])
            logger.record_tabular('time-total', total_time)
            logger.record_tabular('epoch', epoch)

            self.sampler.log_diagnostics()

            logger.dump_tabular(with_prefix=False)
            logger.pop_prefix()

            gt.stamp('eval')

        self.sampler.terminate()
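# Hedged sketch (helper name is illustrative, not from the source): the warm-up
# switch used in the variant above, pulled out as a standalone function. The
# sampler collects with a uniform policy until `n_random_steps` environment
# steps have elapsed, then the learned policy takes over.
def maybe_switch_policy(sampler, policy, epoch, epoch_length, n_random_steps):
    """Hand the sampler the learned policy once random warm-up is over."""
    if epoch * epoch_length >= n_random_steps:
        sampler.set_policy(policy)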
def _train(self): """Return a generator that performs RL training. Args: env (`SoftlearningEnv`): Environment used for training. policy (`Policy`): Policy used for training initial_exploration_policy ('Policy'): Policy used for exploration If None, then all exploration is done using policy pool (`PoolBase`): Sample pool to add samples to """ #### pool is e.g. simple_replay_pool training_environment = self._training_environment evaluation_environment = self._evaluation_environment policy = self._policy pool = self._pool model_metrics = {} #### init Qs for SAC if not self._training_started: self._init_training() #### perform some initial steps (gather samples) using initial policy ###### fills pool with _n_initial_exploration_steps samples self._initial_exploration_hook(training_environment, self._initial_exploration_policy, pool) #### set up sampler with train env and actual policy (may be different from initial exploration policy) ######## note: sampler is set up with the pool that may be already filled from initial exploration hook self.sampler.initialize(training_environment, policy, pool) #### reset gtimer (for coverage of project development) gt.reset_root() gt.rename_root('RLAlgorithm') gt.set_def_unique(False) #### not implemented, could train policy before hook self._training_before_hook() #### iterate over epochs, gt.timed_for to create loop with gt timestamps for self._epoch in gt.timed_for(range(self._epoch, self._n_epochs)): #### do something at beginning of epoch (in this case reset self._train_steps_this_epoch=0) self._epoch_before_hook() gt.stamp('epoch_before_hook') #### util class Progress, e.g. for plotting a progress bar ####### note: sampler may already contain samples in its pool from initial_exploration_hook or previous epochs self._training_progress = Progress(self._epoch_length * self._n_train_repeat / self._train_every_n_steps) start_samples = self.sampler._total_samples ### train for epoch_length ### for i in count(): #### _timestep is within an epoch samples_now = self.sampler._total_samples self._timestep = samples_now - start_samples #### check if you're at the end of an epoch to train if (samples_now >= start_samples + self._epoch_length and self.ready_to_train): break #### not implemented atm self._timestep_before_hook() gt.stamp('timestep_before_hook') #### start model rollout if self._timestep % self._model_train_freq == 0 and self._real_ratio < 1.0: self._training_progress.pause() print('[ MBPO ] log_dir: {} | ratio: {}'.format( self._log_dir, self._real_ratio)) print( '[ MBPO ] Training model at epoch {} | freq {} | timestep {} (total: {}) | epoch train steps: {} (total: {})' .format(self._epoch, self._model_train_freq, self._timestep, self._total_timestep, self._train_steps_this_epoch, self._num_train_steps)) #### train the model with input:(obs, act), outputs: (rew, delta_obs), inputs are divided into sets with holdout_ratio #@anyboby debug samples = self._pool.return_all_samples() self.fake_env.reset_model() model_train_metrics = self.fake_env.train( samples, batch_size=512, max_epochs=None, holdout_ratio=0.2, max_t=self._max_model_t) model_metrics.update(model_train_metrics) gt.stamp('epoch_train_model') #### rollout model env #### self._set_rollout_length() self._reallocate_model_pool( use_mjc_model_pool=self.use_mjc_state_model) model_rollout_metrics = self._rollout_model( rollout_batch_size=self._rollout_batch_size, deterministic=self._deterministic) model_metrics.update(model_rollout_metrics) ########################### 
gt.stamp('epoch_rollout_model') # self._visualize_model(self._evaluation_environment, self._total_timestep) self._training_progress.resume() ##### śampling from the real world ! ##### ##### _total_timestep % train_every_n_steps is checked inside _do_sampling self._do_sampling(timestep=self._total_timestep) gt.stamp('sample') ### n_train_repeat from config ### if self.ready_to_train: self._do_training_repeats(timestep=self._total_timestep) gt.stamp('train') self._timestep_after_hook() gt.stamp('timestep_after_hook') training_paths = self.sampler.get_last_n_paths( math.ceil(self._epoch_length / self.sampler._max_path_length)) gt.stamp('training_paths') evaluation_paths = self._evaluation_paths(policy, evaluation_environment) gt.stamp('evaluation_paths') training_metrics = self._evaluate_rollouts(training_paths, training_environment) gt.stamp('training_metrics') if evaluation_paths: evaluation_metrics = self._evaluate_rollouts( evaluation_paths, evaluation_environment) gt.stamp('evaluation_metrics') else: evaluation_metrics = {} self._epoch_after_hook(training_paths) gt.stamp('epoch_after_hook') sampler_diagnostics = self.sampler.get_diagnostics() diagnostics = self.get_diagnostics( iteration=self._total_timestep, batch=self._evaluation_batch(), training_paths=training_paths, evaluation_paths=evaluation_paths) time_diagnostics = gt.get_times().stamps.itrs diagnostics.update( OrderedDict(( *((f'evaluation/{key}', evaluation_metrics[key]) for key in sorted(evaluation_metrics.keys())), *((f'training/{key}', training_metrics[key]) for key in sorted(training_metrics.keys())), *((f'times/{key}', time_diagnostics[key][-1]) for key in sorted(time_diagnostics.keys())), *((f'sampler/{key}', sampler_diagnostics[key]) for key in sorted(sampler_diagnostics.keys())), *((f'model/{key}', model_metrics[key]) for key in sorted(model_metrics.keys())), ('epoch', self._epoch), ('timestep', self._timestep), ('timesteps_total', self._total_timestep), ('train-steps', self._num_train_steps), ))) if self._eval_render_mode is not None and hasattr( evaluation_environment, 'render_rollouts'): training_environment.render_rollouts(evaluation_paths) yield diagnostics self.sampler.terminate() self._training_after_hook() self._training_progress.close() ### this is where we yield the episode diagnostics to tune trial runner ### yield {'done': True, **diagnostics}
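# Hedged sketch (the mock generator below is hypothetical, not the source's
# class): the generator-style _train variants above yield one diagnostics dict
# per epoch and finish with a {'done': True, ...} dict, so a driver can pull
# results incrementally rather than waiting for the whole run.
from collections import OrderedDict

def fake_train(n_epochs=3):
    """Stand-in for a generator-style _train()."""
    diagnostics = OrderedDict()
    for epoch in range(n_epochs):
        diagnostics = OrderedDict([('epoch', epoch),
                                   ('timesteps_total', (epoch + 1) * 1000)])
        yield diagnostics
    yield {'done': True, **diagnostics}

for diag in fake_train():
    print(diag)
    if diag.get('done'):
        break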
def _train(self): """Return a generator that performs RL training. Args: env (`SoftlearningEnv`): Environment used for training. policy (`Policy`): Policy used for training initial_exploration_policy ('Policy'): Policy used for exploration If None, then all exploration is done using policy pool (`PoolBase`): Sample pool to add samples to """ training_environment = self._training_environment evaluation_environment = self._evaluation_environment policy = self._policy pool = self._pool model_metrics = {} if not self._training_started: self._init_training() self._initial_exploration_hook(training_environment, self._initial_exploration_policy, pool) self.sampler.initialize(training_environment, policy, pool) gt.reset_root() gt.rename_root('RLAlgorithm') gt.set_def_unique(False) self._training_before_hook() for self._epoch in gt.timed_for(range(self._epoch, self._n_epochs)): self._epoch_before_hook() gt.stamp('epoch_before_hook') self._training_progress = Progress(self._epoch_length * self._n_train_repeat) start_samples = self.sampler._total_samples for i in count(): samples_now = self.sampler._total_samples self._timestep = samples_now - start_samples if (samples_now >= start_samples + self._epoch_length and self.ready_to_train): break self._timestep_before_hook() gt.stamp('timestep_before_hook') if self._timestep % self._model_train_freq == 0 and self._real_ratio < 1.0: self._training_progress.pause() print('[ MBPO ] log_dir: {} | ratio: {}'.format( self._log_dir, self._real_ratio)) print( '[ MBPO ] Training model at epoch {} | freq {} | timestep {} (total: {}) | epoch train steps: {} (total: {})' .format(self._epoch, self._model_train_freq, self._timestep, self._total_timestep, self._train_steps_this_epoch, self._num_train_steps)) model_train_metrics = self._train_model( batch_size=256, max_epochs=None, holdout_ratio=0.2, max_t=self._max_model_t) model_metrics.update(model_train_metrics) gt.stamp('epoch_train_model') self._set_rollout_length() if self._rollout_batch_size > 30000: factor = self._rollout_batch_size // 30000 + 1 mini_batch = self._rollout_batch_size // factor for i in range(factor): model_rollout_metrics = self._rollout_model( rollout_batch_size=mini_batch, deterministic=self._deterministic) else: model_rollout_metrics = self._rollout_model( rollout_batch_size=self._rollout_batch_size, deterministic=self._deterministic) model_metrics.update(model_rollout_metrics) gt.stamp('epoch_rollout_model') # self._visualize_model(self._evaluation_environment, self._total_timestep) self._training_progress.resume() self._do_sampling(timestep=self._total_timestep) gt.stamp('sample') if self.ready_to_train: self._do_training_repeats(timestep=self._total_timestep) gt.stamp('train') self._timestep_after_hook() gt.stamp('timestep_after_hook') training_paths = self.sampler.get_last_n_paths( math.ceil(self._epoch_length / self.sampler._max_path_length)) gt.stamp('training_paths') evaluation_paths = self._evaluation_paths(policy, evaluation_environment) gt.stamp('evaluation_paths') training_metrics = self._evaluate_rollouts(training_paths, training_environment) gt.stamp('training_metrics') if evaluation_paths: evaluation_metrics = self._evaluate_rollouts( evaluation_paths, evaluation_environment) gt.stamp('evaluation_metrics') else: evaluation_metrics = {} self._epoch_after_hook(training_paths) gt.stamp('epoch_after_hook') sampler_diagnostics = self.sampler.get_diagnostics() diagnostics = self.get_diagnostics( iteration=self._total_timestep, batch=self._evaluation_batch(), 
training_paths=training_paths, evaluation_paths=evaluation_paths) time_diagnostics = gt.get_times().stamps.itrs diagnostics.update( OrderedDict(( *((f'evaluation/{key}', evaluation_metrics[key]) for key in sorted(evaluation_metrics.keys())), *((f'training/{key}', training_metrics[key]) for key in sorted(training_metrics.keys())), *((f'times/{key}', time_diagnostics[key][-1]) for key in sorted(time_diagnostics.keys())), *((f'sampler/{key}', sampler_diagnostics[key]) for key in sorted(sampler_diagnostics.keys())), *((f'model/{key}', model_metrics[key]) for key in sorted(model_metrics.keys())), ('epoch', self._epoch), ('timestep', self._timestep), ('timesteps_total', self._total_timestep), ('train-steps', self._num_train_steps), ))) if self._eval_render_mode is not None and hasattr( evaluation_environment, 'render_rollouts'): training_environment.render_rollouts(evaluation_paths) yield diagnostics self.sampler.terminate() self._training_after_hook() self._training_progress.close() yield {'done': True, **diagnostics}
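# Hedged sketch: the chunking rule the variant above applies to very large
# model rollouts, isolated for clarity. A rollout batch above 30000 start
# states is split into roughly equal mini-batches so each _rollout_model call
# stays bounded in memory; as in the original, integer division means the
# chunks may sum to slightly less than the requested batch.
def split_rollout_batch(rollout_batch_size, limit=30000):
    """Return the list of mini-batch sizes used for one rollout pass."""
    if rollout_batch_size <= limit:
        return [rollout_batch_size]
    factor = rollout_batch_size // limit + 1
    mini_batch = rollout_batch_size // factor
    return [mini_batch] * factor

print(split_rollout_batch(100000))  # [25000, 25000, 25000, 25000]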
def _train(self): """Return a generator that performs RL training. Args: env (`SoftlearningEnv`): Environment used for training. policy (`Policy`): Policy used for training initial_exploration_policy ('Policy'): Policy used for exploration If None, then all exploration is done using policy pool (`PoolBase`): Sample pool to add samples to """ training_environment = self._training_environment evaluation_environment = self._evaluation_environment policy = self._policy pool = self._pool if not self._training_started: self._init_training() self._initial_exploration_hook(training_environment, self._initial_exploration_policy, pool) self.sampler.initialize(training_environment, policy, pool) gt.reset_root() gt.rename_root('RLAlgorithm') gt.set_def_unique(False) self._training_before_hook() import time for self._epoch in gt.timed_for(range(self._epoch, self._n_epochs)): self._epoch_before_hook() gt.stamp('epoch_before_hook') start_samples = self.sampler._total_samples sample_times = [] for i in count(): samples_now = self.sampler._total_samples self._timestep = samples_now - start_samples if (samples_now >= start_samples + self._epoch_length and self.ready_to_train): break self._timestep_before_hook() gt.stamp('timestep_before_hook') t0 = time.time() self._do_sampling(timestep=self._total_timestep) gt.stamp('sample') sample_times.append(time.time() - t0) if self.ready_to_train: self._do_training_repeats(timestep=self._total_timestep) gt.stamp('train') self._timestep_after_hook() gt.stamp('timestep_after_hook') print("Average Sample Time: ", np.mean(np.array(sample_times))) training_paths = self._training_paths() # self.sampler.get_last_n_paths( # math.ceil(self._epoch_length / self.sampler._max_path_length)) gt.stamp('training_paths') evaluation_paths = self._evaluation_paths(policy, evaluation_environment) gt.stamp('evaluation_paths') training_metrics = self._evaluate_rollouts(training_paths, training_environment) gt.stamp('training_metrics') #should_save_path = ( # self._path_save_frequency > 0 # and self._epoch % self._path_save_frequency == 0) #if should_save_path: # import pickle # for i, path in enumerate(training_paths): # #path.pop('images') # path_file_name = f'training_path_{self._epoch}_{i}.pkl' # path_file_path = os.path.join( # os.getcwd(), 'paths', path_file_name) # if not os.path.exists(os.path.dirname(path_file_path)): # os.makedirs(os.path.dirname(path_file_path)) # with open(path_file_path, 'wb' ) as f: # pickle.dump(path, f) if evaluation_paths: evaluation_metrics = self._evaluate_rollouts( evaluation_paths, evaluation_environment) gt.stamp('evaluation_metrics') else: evaluation_metrics = {} self._epoch_after_hook(training_paths) gt.stamp('epoch_after_hook') sampler_diagnostics = self.sampler.get_diagnostics() diagnostics = self.get_diagnostics( iteration=self._total_timestep, batch=self._evaluation_batch(), training_paths=training_paths, evaluation_paths=evaluation_paths) time_diagnostics = gt.get_times().stamps.itrs diagnostics.update( OrderedDict(( *((f'evaluation/{key}', evaluation_metrics[key]) for key in sorted(evaluation_metrics.keys())), *((f'training/{key}', training_metrics[key]) for key in sorted(training_metrics.keys())), *((f'times/{key}', time_diagnostics[key][-1]) for key in sorted(time_diagnostics.keys())), *((f'sampler/{key}', sampler_diagnostics[key]) for key in sorted(sampler_diagnostics.keys())), ('epoch', self._epoch), ('timestep', self._timestep), ('timesteps_total', self._total_timestep), ('train-steps', self._num_train_steps), ))) obs = 
self._pool.last_n_batch( self._pool.size)['observations']['state_observation'] plt.cla() plt.clf() plt.xlim(-20, 20) plt.ylim(-20, 20) plt.plot(obs[:, 0], obs[:, 1]) plt.savefig('traj_plot_%d.png' % (self._epoch)) if self._rnd_int_rew_coeff: errors = [] for i in np.arange(-20, 20, 0.5): error = [] for j in np.arange(-20, 20, 0.5): curr_pos = np.array([i, j]) err = self._session.run( self._rnd_errors, { self._placeholders['observations']['state_observation']: [curr_pos] })[0] error.append(err) errors.append(error) plt.cla() plt.clf() plt.imshow(np.asarray(errors)[:, :, 0]) plt.savefig('errors_%d.png' % (self._epoch)) if self._eval_render_kwargs and hasattr(evaluation_environment, 'render_rollouts'): # TODO(hartikainen): Make this consistent such that there's no # need for the hasattr check. training_environment.render_rollouts(evaluation_paths) yield diagnostics self.sampler.terminate() self._training_after_hook() del evaluation_paths yield {'done': True, **diagnostics}
def _train(self): """Return a generator that performs RL training. Args: env (`SoftlearningEnv`): Environment used for training. policy (`Policy`): Policy used for training initial_exploration_policy ('Policy'): Policy used for exploration If None, then all exploration is done using policy pool (`PoolBase`): Sample pool to add samples to """ #### pool is e.g. simple_replay_pool training_environment = self._training_environment evaluation_environment = self._evaluation_environment policy = self._policy pool = self._pool if not self._training_started: #### perform some initial steps (gather samples) using initial policy ###### fills pool with _n_initial_exploration_steps samples self._initial_exploration_hook(training_environment, self._policy, pool) #### set up sampler with train env and actual policy (may be different from initial exploration policy) ######## note: sampler is set up with the pool that may be already filled from initial exploration hook self.sampler.initialize(training_environment, policy, pool) self.model_sampler.initialize(self.fake_env, policy, self.model_pool) rollout_dkl_lim = self.model_sampler.compute_dynamics_dkl( obs_batch=self._pool.rand_batch_from_archive( 5000, fields=['observations'])['observations'], depth=self._rollout_schedule[2]) self.model_sampler.set_rollout_dkl(rollout_dkl_lim) self.initial_model_dkl = self.model_sampler.dyn_dkl #### reset gtimer (for coverage of project development) gt.reset_root() gt.rename_root('RLAlgorithm') gt.set_def_unique(False) self.policy_epoch = 0 ### count policy updates self.new_real_samples = 0 self.last_eval_step = 0 self.diag_counter = 0 running_diag = {} self.approx_model_batch = self.batch_size_policy - self.min_real_samples_per_epoch ### some size to start off #### not implemented, could train policy before hook self._training_before_hook() #### iterate over epochs, gt.timed_for to create loop with gt timestamps for self._epoch in gt.timed_for(range(self._epoch, self._n_epochs)): #### do something at beginning of epoch (in this case reset self._train_steps_this_epoch=0) self._epoch_before_hook() gt.stamp('epoch_before_hook') #### util class Progress, e.g. 
for plotting a progress bar ####### note: sampler may already contain samples in its pool from initial_exploration_hook or previous epochs self._training_progress = Progress(self._epoch_length * self._n_train_repeat / self._train_every_n_steps) samples_added = 0 #=====================================================================# # Rollout model # #=====================================================================# model_samples = None keep_rolling = True model_metrics = {} #### start model rollout if self._real_ratio < 1.0: #if self._timestep % self._model_train_freq == 0 and self._real_ratio < 1.0: #=====================================================================# # Model Rollouts # #=====================================================================# if self.rollout_mode == 'schedule': self._set_rollout_length() while keep_rolling: ep_b = self._pool.epoch_batch( batch_size=self._rollout_batch_size, epochs=self._pool.epochs_list, fields=['observations', 'pi_infos']) kls = np.clip(self._policy.compute_DKL( ep_b['observations'], ep_b['mu'], ep_b['log_std']), a_min=0, a_max=None) btz_dist = self._pool.boltz_dist(kls, alpha=self.policy_alpha) btz_b = self._pool.distributed_batch_from_archive( self._rollout_batch_size, btz_dist, fields=['observations', 'pi_infos']) start_states, mus, logstds = btz_b['observations'], btz_b[ 'mu'], btz_b['log_std'] btz_kl = np.clip(self._policy.compute_DKL( start_states, mus, logstds), a_min=0, a_max=None) self.model_sampler.reset(start_states) if self.rollout_mode == 'uncertainty': self.model_sampler.set_max_uncertainty( self.max_tddyn_err) for i in count(): # print(f'Model Sampling step Nr. {i+1}') _, _, _, info = self.model_sampler.sample( max_samples=int(self.approx_model_batch - samples_added)) if self.model_sampler._total_samples + samples_added >= .99 * self.approx_model_batch: keep_rolling = False break if info['alive_ratio'] <= 0.1: break ### diagnostics for rollout ### rollout_diagnostics = self.model_sampler.finish_all_paths() if self.rollout_mode == 'iv_gae': keep_rolling = self.model_pool.size + samples_added <= .99 * self.approx_model_batch ###################################################################### ### get model_samples, get() invokes the inverse variance rollouts ### model_samples_new, buffer_diagnostics_new = self.model_pool.get( ) model_samples = [ np.concatenate((o, n), axis=0) for o, n in zip(model_samples, model_samples_new) ] if model_samples else model_samples_new ###################################################################### ### diagnostics new_n_samples = len(model_samples_new[0]) + EPS diag_weight_old = samples_added / (new_n_samples + samples_added) diag_weight_new = new_n_samples / (new_n_samples + samples_added) model_metrics = update_dict(model_metrics, rollout_diagnostics, weight_a=diag_weight_old, weight_b=diag_weight_new) model_metrics = update_dict(model_metrics, buffer_diagnostics_new, weight_a=diag_weight_old, weight_b=diag_weight_new) ### run diagnostics on model data if buffer_diagnostics_new['poolm_batch_size'] > 0: model_data_diag = self._policy.run_diagnostics( model_samples_new) model_data_diag = { k + '_m': v for k, v in model_data_diag.items() } model_metrics = update_dict(model_metrics, model_data_diag, weight_a=diag_weight_old, weight_b=diag_weight_new) samples_added += new_n_samples model_metrics.update({'samples_added': samples_added}) ###################################################################### ## for debugging model_metrics.update({ 'cached_var': 
np.mean(self.fake_env._model.scaler_out.cached_var) }) model_metrics.update({ 'cached_mu': np.mean(self.fake_env._model.scaler_out.cached_mu) }) print(f'Rollouts finished') gt.stamp('epoch_rollout_model') #=====================================================================# # Sample # #=====================================================================# n_real_samples = self.model_sampler.dyn_dkl / self.initial_model_dkl * self.min_real_samples_per_epoch n_real_samples = max(n_real_samples, 1000) # n_real_samples = self.min_real_samples_per_epoch ### for ablation model_metrics.update({'n_real_samples': n_real_samples}) start_samples = self.sampler._total_samples ### train for epoch_length ### for i in count(): #### _timestep is within an epoch samples_now = self.sampler._total_samples self._timestep = samples_now - start_samples #### not implemented atm self._timestep_before_hook() gt.stamp('timestep_before_hook') ##### śampling from the real world ! ##### _, _, _, _ = self._do_sampling(timestep=self.policy_epoch) gt.stamp('sample') self._timestep_after_hook() gt.stamp('timestep_after_hook') if self.ready_to_train or self._timestep > n_real_samples: self.sampler.finish_all_paths(append_val=True, append_cval=True, reset_path=False) self.new_real_samples += self._timestep break #=====================================================================# # Train model # #=====================================================================# if self.new_real_samples > 2048 and self._real_ratio < 1.0: model_diag = self.train_model(min_epochs=1, max_epochs=10) self.new_real_samples = 0 model_metrics.update(model_diag) #=====================================================================# # Get Buffer Data # #=====================================================================# real_samples, buf_diag = self._pool.get() ### run diagnostics on real data policy_diag = self._policy.run_diagnostics(real_samples) policy_diag = {k + '_r': v for k, v in policy_diag.items()} model_metrics.update(policy_diag) model_metrics.update(buf_diag) #=====================================================================# # Update Policy # #=====================================================================# train_samples = [ np.concatenate((r, m), axis=0) for r, m in zip(real_samples, model_samples) ] if model_samples else real_samples self._policy.update_real_c(real_samples) self._policy.update_policy(train_samples) self._policy.update_critic( train_samples, train_vc=(train_samples[-3] > 0).any()) ### only train vc if there are any costs if self._real_ratio < 1.0: self.approx_model_batch = self.batch_size_policy - n_real_samples #self.model_sampler.dyn_dkl/self.initial_model_dkl * self.min_real_samples_per_epoch self.policy_epoch += 1 self.max_tddyn_err *= self.max_tddyn_err_decay #### log policy diagnostics self._policy.log() gt.stamp('train') #=====================================================================# # Log performance and stats # #=====================================================================# self.sampler.log() # write results to file, ray prints for us, so no need to print from logger logger_diagnostics = self.logger.dump_tabular( output_dir=self._log_dir, print_out=False) #=====================================================================# if self._total_timestep // self.eval_every_n_steps > self.last_eval_step: evaluation_paths = self._evaluation_paths( policy, evaluation_environment) gt.stamp('evaluation_paths') self.last_eval_step = self._total_timestep // self.eval_every_n_steps 
else: evaluation_paths = [] if evaluation_paths: evaluation_metrics = self._evaluate_rollouts( evaluation_paths, evaluation_environment) gt.stamp('evaluation_metrics') diag_obs_batch = np.concatenate(([ evaluation_paths[i]['observations'] for i in range(len(evaluation_paths)) ]), axis=0) else: evaluation_metrics = {} diag_obs_batch = [] gt.stamp('epoch_after_hook') new_diagnostics = {} time_diagnostics = gt.get_times().stamps.itrs # add diagnostics from logger new_diagnostics.update(logger_diagnostics) new_diagnostics.update( OrderedDict(( *((f'evaluation/{key}', evaluation_metrics[key]) for key in sorted(evaluation_metrics.keys())), *((f'times/{key}', time_diagnostics[key][-1]) for key in sorted(time_diagnostics.keys())), *((f'model/{key}', model_metrics[key]) for key in sorted(model_metrics.keys())), ))) if self._eval_render_mode is not None and hasattr( evaluation_environment, 'render_rollouts'): training_environment.render_rollouts(evaluation_paths) #### updateing and averaging old_ts_diag = running_diag.get('timestep', 0) new_ts_diag = self._total_timestep - self.diag_counter - old_ts_diag w_olddiag = old_ts_diag / (new_ts_diag + old_ts_diag) w_newdiag = new_ts_diag / (new_ts_diag + old_ts_diag) running_diag = update_dict(running_diag, new_diagnostics, weight_a=w_olddiag, weight_b=w_newdiag) running_diag.update({'timestep': new_ts_diag + old_ts_diag}) #### if new_ts_diag + old_ts_diag > self.eval_every_n_steps: running_diag.update({ 'epoch': self._epoch, 'timesteps_total': self._total_timestep, 'train-steps': self._num_train_steps, }) self.diag_counter = self._total_timestep diag = running_diag.copy() running_diag = {} yield diag if self._total_timestep >= self.n_env_interacts: self.sampler.terminate() self._training_after_hook() self._training_progress.close() print("###### DONE ######") yield {'done': True, **running_diag} break
def _train(self): """Return a generator that performs RL training. Args: env (`SoftlearningEnv`): Environment used for training. policy (`Policy`): Policy used for training pool (`PoolBase`): Sample pool to add samples to """ training_environment = self._training_environment evaluation_environment = self._evaluation_environment policy = self._policy gt.reset_root() gt.rename_root('RLAlgorithm') gt.set_def_unique(False) self._training_before_hook() for self._epoch in gt.timed_for(range(self._epoch, self._n_epochs)): self._epoch_before_hook() gt.stamp('epoch_before_hook') update_diagnostics = [] start_samples = self.sampler._total_samples for i in count(): samples_now = self.sampler._total_samples self._timestep = samples_now - start_samples if (samples_now >= start_samples + self._epoch_length and self.ready_to_train): break self._timestep_before_hook() gt.stamp('timestep_before_hook') self._do_sampling(timestep=self._total_timestep) gt.stamp('sample') if self.ready_to_train: update_diagnostics.append( self._do_training_repeats( timestep=self._total_timestep)) gt.stamp('train') self._timestep_after_hook() gt.stamp('timestep_after_hook') update_diagnostics = tree.map_structure(lambda *d: np.mean(d), *update_diagnostics) training_paths = self.sampler.get_last_n_paths( math.ceil(self._epoch_length / self.sampler._max_path_length)) gt.stamp('training_paths') evaluation_paths = self._evaluation_paths(policy, evaluation_environment) gt.stamp('evaluation_paths') training_metrics = self._evaluate_rollouts(training_paths, training_environment, self._total_timestep, evaluation_type='train') gt.stamp('training_metrics') if evaluation_paths: evaluation_metrics = self._evaluate_rollouts( evaluation_paths, evaluation_environment, self._total_timestep, evaluation_type='evaluation') gt.stamp('evaluation_metrics') else: evaluation_metrics = {} self._epoch_after_hook(training_paths) gt.stamp('epoch_after_hook') sampler_diagnostics = self.sampler.get_diagnostics() diagnostics = self.get_diagnostics( iteration=self._total_timestep, batch=self._evaluation_batch(), training_paths=training_paths, evaluation_paths=evaluation_paths) time_diagnostics = { key: times[-1] for key, times in gt.get_times().stamps.itrs.items() } # TODO(hartikainen/tf2): Fix the naming of training/update # diagnostics/metric diagnostics.update(( ('evaluation', evaluation_metrics), ('training', training_metrics), ('update', update_diagnostics), ('times', time_diagnostics), ('sampler', sampler_diagnostics), ('epoch', self._epoch), ('timestep', self._timestep), ('total_timestep', self._total_timestep), ('num_train_steps', self._num_train_steps), )) if self._eval_render_kwargs and hasattr(evaluation_environment, 'render_rollouts'): # TODO(hartikainen): Make this consistent such that there's no # need for the hasattr check. training_environment.render_rollouts(evaluation_paths) yield diagnostics self.sampler.terminate() self._training_after_hook() yield {'done': True, **diagnostics}
def _train(self, env, policy, pool): """Perform RL training. Args: env (`rllab.Env`): Environment used for training policy (`Policy`): Policy used for training initial_exploration_policy ('Policy'): Policy used for exploration If None, then all exploration is done using policy pool (`PoolBase`): Sample pool to add samples to """ self._init_training(env, policy, pool) # if initial_exploration_policy is None: self.sampler.initialize(env, policy, pool) #initial_exploration_done = False # else: # self.sampler.initialize(env, initial_exploration_policy, pool) # initial_exploration_done = False with self._sess.as_default(): gt.rename_root('RLAlgorithm') gt.reset() gt.set_def_unique(False) ############## Initialize buffer ##################### for i in range(self._n_initial_exploration_steps): self.sampler.sample() if i % 100 == 0: print(i) ###################################################### for epoch in gt.timed_for(range(self._n_epoch + 1), save_itrs=True): logger.push_prefix('Epoch #%d | ' % epoch ) for t in range(self._epoch_length): #for t in itertools.count(): #terminal = self.sampler.sample() self.sampler.sample() if not self.sampler.batch_ready(): continue gt.stamp('sample') # self._do_training( # iteration=t + epoch * self._epoch_length*168 + episode*168, # batch=self.sampler.random_batch()) self._do_training( iteration=t + epoch * self._epoch_length, batch=self.sampler.random_batch()) gt.stamp('train') #if terminal: # break self._evaluate(epoch) params = self.get_snapshot(epoch) logger.save_itr_params(epoch, params) times_itrs = gt.get_times().stamps.itrs eval_time = times_itrs['eval'][-1] if epoch > 1 else 0 total_time = gt.get_times().total logger.record_tabular('time-train', times_itrs['train'][-1]) logger.record_tabular('time-eval', eval_time) logger.record_tabular('time-sample', times_itrs['sample'][-1]) logger.record_tabular('time-total', total_time) logger.record_tabular('epoch', epoch) self.sampler.log_diagnostics() logger.dump_tabular(with_prefix=False) logger.pop_prefix() gt.stamp('eval') self.sampler.terminate()
def objective(arglist):
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # dynamically grow the memory used on the GPU
    sess = tf.Session(config=config)
    set_session(sess)

    game_name = arglist.game_name  # 'abs', 'one'
    reward_type = arglist.reward_type
    p = arglist.p
    agent_num = arglist.n
    u_range = 1.
    k = 0
    print(arglist.aux, 'arglist.aux')
    model_names_setting = arglist.model_names_setting.split('_')
    model_names = [model_names_setting[0]]
    model_name = '_'.join(model_names)
    path_prefix = game_name
    if game_name == 'pbeauty':
        env = PBeautyGame(agent_num=agent_num, reward_type=reward_type, p=p)
        path_prefix = game_name + '-' + reward_type + '-' + str(p)
    elif 'matrix' in game_name:
        matrix_game_name = game_name.split('-')[-1]
        repeated = arglist.repeat
        max_step = arglist.max_path_length
        memory = arglist.memory
        env = MatrixGame(game=matrix_game_name, agent_num=agent_num,
                         action_num=2, repeated=repeated,
                         max_step=max_step, memory=memory,
                         discrete_action=False, tuple_obs=False)
        path_prefix = '{}-{}-{}-{}'.format(game_name, repeated, max_step, memory)
    elif 'diff' in game_name:
        diff_game_name = game_name.split('-')[-1]
        s2 = arglist.s2
        x2 = arglist.x2
        y2 = arglist.y2
        con = arglist.con
        env = DifferentialGame(diff_game_name, 3, x2, y2, s2, con)
        agent_num = 1
    elif 'particle' in game_name:
        particle_game_name = game_name.split('-')[-1]
        env, agent_num, model_name, model_names = get_particle_game(
            particle_game_name, arglist)

    now = datetime.datetime.now()
    timestamp = now.strftime('%Y-%m-%d %H:%M:%S.%f %Z')
    if 'CG' in model_name:
        model_name = model_name + '-{}'.format(arglist.mu)
    if not arglist.aux:
        model_name = model_name + '-{}'.format(arglist.aux)

    suffix = '{}/{}/{}/{}'.format(path_prefix, agent_num, model_name, timestamp)

    print(suffix)
    # logger.add_tabular_output('./log/{}.csv'.format(suffix))
    # snapshot_dir = './snapshot/{}'.format(suffix)
    # policy_dir = './policy/{}'.format(suffix)
    # os.makedirs(snapshot_dir, exist_ok=True)
    # os.makedirs(policy_dir, exist_ok=True)
    # logger.set_snapshot_dir(snapshot_dir)

    agents = []
    M = arglist.hidden_size
    batch_size = arglist.batch_size

    sampler = SSampler(agent_num=agent_num, joint=True,
                       global_reward=arglist.global_reward,
                       max_path_length=25, min_pool_size=100,
                       batch_size=batch_size)

    base_kwargs = {
        'sampler': sampler,
        'epoch_length': 1,
        'n_epochs': arglist.max_steps,
        'n_train_repeat': 1,
        'eval_render': True,
        'eval_n_episodes': 10
    }

    _alpha = arglist.alpha
    lr = arglist.lr
    n_pars = arglist.n_pars
    nego_round = arglist.nego_round

    result = 0.

    with U.single_threaded_session():
        for i, model_name in enumerate(model_names):
            if 'PR2AC' in model_name:
                k = int(model_name[-1])
                g = False
                mu = arglist.mu
                if 'G' in model_name:
                    g = True
                agent = pr2ac_agent(model_name, i, env, M, u_range, base_kwargs,
                                    lr=lr, n_pars=n_pars, k=k, g=g, mu=mu,
                                    game_name=game_name, aux=arglist.aux)
            elif model_name == 'MASQL':
                agent = masql_agent(model_name, i, env, M, u_range, base_kwargs,
                                    lr=lr, n_pars=n_pars, game_name=game_name)
            elif model_name == 'JSQL':
                agent = jsql_ss_agent(model_name, i, env, M, u_range, base_kwargs,
                                      lr=lr, n_pars=n_pars, game_name=game_name)
            elif model_name == 'GPF':
                agent = gpf_ss_agent(model_name, i, env, M, u_range, base_kwargs,
                                     lr=lr, n_pars=n_pars, batch_size=batch_size,
                                     game_name=game_name)
            elif model_name == 'AGPF':
                agent = agpf_ss_agent(model_name, i, env, M, u_range, base_kwargs,
                                      lr=lr, n_pars=n_pars, batch_size=batch_size,
                                      game_name=game_name)
            elif model_name == 'CCF':
                agent = ccf_ss_agent(model_name, i, env, M, u_range, base_kwargs,
                                     nego_round=nego_round, lr=lr, n_pars=n_pars,
                                     batch_size=batch_size, game_name=game_name)
            elif model_name == 'ACCF':
                agent = accf_ss_agent(model_name, i, env, M, u_range, base_kwargs,
                                      nego_round=nego_round, lr=lr, n_pars=n_pars,
                                      batch_size=batch_size, game_name=game_name)
            elif model_name == 'ROMMEO':
                agent = rom_agent(model_name, i, env, M, u_range, base_kwargs,
                                  game_name=game_name)
            else:
                if model_name == 'DDPG':
                    joint = False
                    opponent_modelling = False
                elif model_name == 'MADDPG':
                    joint = True
                    opponent_modelling = False
                elif model_name == 'DDPG-OM':
                    joint = True
                    opponent_modelling = True
                agent = ddpg_agent(joint, opponent_modelling, model_names, i,
                                   env, M, u_range, base_kwargs, lr=lr,
                                   game_name=game_name)

            agents.append(agent)

        sampler.initialize(env, agents)

        for agent in agents:
            agent._init_training()

        gt.rename_root('MARLAlgorithm')
        gt.reset()
        gt.set_def_unique(False)

        initial_exploration_done = False
        # noise = .1
        noise = .5

        for agent in agents:
            try:
                agent.policy.set_noise_level(noise)
            except:
                pass

        # alpha = .5
        for steps in gt.timed_for(range(base_kwargs['n_epochs'] + 1)):
            # import pdb; pdb.set_trace()
            # if steps > 100 and steps < 150:
            #     alpha = .1 - 0.099 * steps / 150
            # elif steps >= 150:
            #     alpha = 1e-3
            if steps < base_kwargs['n_epochs'] // 3:
                # alpha = _alpha
                alpha = _alpha + np.exp(-0.1 * max(steps - 10, 0)) * 500.
            elif steps < base_kwargs['n_epochs'] // 2:
                alpha = _alpha / 10
            else:
                alpha = .3
            tflog('alpha', alpha)
            print('alpha', alpha)

            # logger.push_prefix('Epoch #%d | ' % steps)
            if steps % (25 * 1000) == 0:
                print(suffix)

            for t in range(base_kwargs['epoch_length']):
                # TODO.code consolidation: Add control interval to sampler
                if not initial_exploration_done:
                    # if steps >= 1000:
                    if steps >= 10:
                        initial_exploration_done = True
                sampler.sample()
                if not initial_exploration_done:
                    continue
                gt.stamp('sample')
                print('Sample Done')

                if steps == 10000:
                    noise = 0.1
                    for agent in agents:
                        try:
                            agent.policy.set_noise_level(noise)
                        except:
                            pass
                    # alpha = 10.
                # if steps == 2000:
                if steps > base_kwargs['n_epochs'] / 10:
                    noise = 0.1
                    for agent in agents:
                        try:
                            agent.policy.set_noise_level(noise)
                        except:
                            pass
                    # alpha = .1
                if steps > base_kwargs['n_epochs'] / 5:
                    noise = 0.05
                    for agent in agents:
                        try:
                            agent.policy.set_noise_level(noise)
                        except:
                            pass
                if steps > base_kwargs['n_epochs'] / 6:
                    noise = 0.01
                    for agent in agents:
                        try:
                            agent.policy.set_noise_level(noise)
                        except:
                            pass

                if steps % arglist.training_interval != 0:
                    continue

                for j in range(base_kwargs['n_train_repeat']):
                    batch_n = []
                    recent_batch_n = []
                    indices = None
                    recent_indices = None
                    for i, agent in enumerate(agents):
                        if i == 0:
                            batch = agent.pool.random_batch(batch_size)
                            indices = agent.pool.indices
                            recent_indices = list(
                                range(agent.pool._top - batch_size, agent.pool._top))
                        batch_n.append(agent.pool.random_batch_by_indices(indices))
                        recent_batch_n.append(
                            agent.pool.random_batch_by_indices(recent_indices))

                    # print(len(batch_n))
                    target_next_actions_n = []
                    # try:
                    all_obs = np.array(np.concatenate(
                        [batch['observations'] for batch in batch_n], axis=-1))
                    all_next_obs = np.array(np.concatenate(
                        [batch['next_observations'] for batch in batch_n], axis=-1))
                    # print(all_obs[0])
                    for batch in batch_n:
                        # print('making all obs')
                        batch['all_observations'] = deepcopy(all_obs)
                        batch['all_next_observations'] = deepcopy(all_next_obs)

                    # opponent_current_actions_n = []
                    # for agent, batch in zip(agents, batch_n):
                    #     target_next_actions_n.append(
                    #         agent.target_policy.get_actions(batch['next_observations']))
                    #     opponent_current_actions_n.append(
                    #         agent.policy.get_actions(batch['observations']))
                    # for i, agent in enumerate(agents):
                    #     batch_n[i]['opponent_current_actions'] = np.reshape(
                    #         np.delete(deepcopy(opponent_current_actions_n), i, 0),
                    #         (-1, agent._opponent_action_dim))
                    # except:
                    #     pass

                    # opponent_actions_n = np.array([batch['actions'] for batch in batch_n])
                    # recent_opponent_actions_n = np.array([batch['actions'] for batch in recent_batch_n])

                    ####### figure out
                    # recent_opponent_observations_n = []
                    # for batch in recent_batch_n:
                    #     recent_opponent_observations_n.append(batch['observations'])

                    # current_actions = [agents[i].policy.get_actions(
                    #     batch_n[i]['next_observations'])[0][0] for i in range(agent_num)]
                    # all_actions_k = []
                    # for i, agent in enumerate(agents):
                    #     if isinstance(agent, MAVBAC):
                    #         if agent._k > 0:
                    #             batch_actions_k = agent.policy.get_all_actions(
                    #                 batch_n[i]['next_observations'])
                    #             actions_k = [a[0][0] for a in batch_actions_k]
                    #             all_actions_k.append(';'.join(list(map(str, actions_k))))
                    # if len(all_actions_k) > 0:
                    #     with open('{}/all_actions.csv'.format(policy_dir), 'a') as f:
                    #         f.write(','.join(list(map(str, all_actions_k))) + '\n')
                    # with open('{}/policy.csv'.format(policy_dir), 'a') as f:
                    #     f.write(','.join(list(map(str, current_actions))) + '\n')

                    # print('============')
                    for i, agent in enumerate(agents):
                        # try:
                        #     batch_n[i]['next_actions'] = deepcopy(target_next_actions_n[i])
                        # except:
                        #     pass
                        # batch_n[i]['opponent_actions'] = np.reshape(
                        #     np.delete(deepcopy(opponent_actions_n), i, 0),
                        #     (-1, agent._opponent_action_dim))
                        # if agent.joint:
                        #     if agent.opponent_modelling:
                        #         batch_n[i]['recent_opponent_observations'] = recent_opponent_observations_n[i]
                        #         batch_n[i]['recent_opponent_actions'] = np.reshape(
                        #             np.delete(deepcopy(recent_opponent_actions_n), i, 0),
                        #             (-1, agent._opponent_action_dim))
                        #         batch_n[i]['opponent_next_actions'] = agent.opponent_policy.get_actions(
                        #             batch_n[i]['next_observations'])
                        #     else:
                        #         batch_n[i]['opponent_next_actions'] = np.reshape(
                        #             np.delete(deepcopy(target_next_actions_n), i, 0),
                        #             (-1, agent._opponent_action_dim))

                        # if isinstance(agent, MAVBAC) or isinstance(agent, MASQL) or isinstance(agent, ROMMEO):
                        agent._do_training(
                            iteration=t + steps * agent._epoch_length,
                            batch=batch_n[i], annealing=alpha)
                        # else:
                        #     agent._do_training(
                        #         iteration=t + steps * agent._epoch_length,
                        #         batch=batch_n[i])

                gt.stamp('train')

        result = sampler.terminate()
        print('res is', result)

    clear_session()

    with open('step_act.pickle', 'wb') as handle:
        pickle.dump(sampler.step_act_dict, handle)
    with open('step_rew.pickle', 'wb') as handle:
        pickle.dump(sampler.step_rew_dict, handle)

    return result
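# Hedged sketch: the temperature ("alpha") annealing schedule used in
# objective() above, isolated as a standalone function. `base_alpha`
# corresponds to the `_alpha` read from arglist; everything else mirrors the
# branching in the loop.
import numpy as np

def anneal_alpha(steps, n_epochs, base_alpha):
    """Return the annealed alpha for a given step count."""
    if steps < n_epochs // 3:
        # large exploration bonus that decays exponentially after step 10
        return base_alpha + np.exp(-0.1 * max(steps - 10, 0)) * 500.
    elif steps < n_epochs // 2:
        return base_alpha / 10
    return .3

print([round(anneal_alpha(s, 3000, 1.0), 3) for s in (0, 500, 1200, 2000)])
# -> [501.0, 1.0, 0.1, 0.3]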
def _train(self, env, policy, replay_buffer, sess):
    """Perform RL training.

    Parameters
    ----------
    env : gym.Env
        Environment used for training
    policy : Policy
        Policy used for training
    replay_buffer : ReplayBuffer
        Replay buffer to add samples to
    """
    self._init_training()
    self.sampler.initialize(env, policy, replay_buffer)

    evaluation_env = deep_clone(env) if self._eval_n_episodes else None

    with sess.as_default():
        gt.rename_root('RLAlgorithm')
        gt.reset()
        gt.set_def_unique(False)

        for epoch in gt.timed_for(range(self._n_epochs + 1), save_itrs=True):
            logger.push_prefix('Epoch #%d | ' % epoch)

            for t in range(self._epoch_length):
                self.sampler.sample()
                if not self.sampler.batch_ready():
                    continue
                gt.stamp('sample')

                for i in range(self._n_train_repeat):
                    self._do_training(
                        iteration=t + epoch * self._epoch_length,
                        batch=self.sampler.random_batch())
                gt.stamp('train')

            self._evaluate(policy, evaluation_env)
            gt.stamp('eval')

            params = self.get_snapshot(epoch)
            logger.save_itr_params(epoch, params)

            time_itrs = gt.get_times().stamps.itrs
            time_eval = time_itrs['eval'][-1]
            time_total = gt.get_times().total
            time_train = time_itrs.get('train', [0])[-1]
            time_sample = time_itrs.get('sample', [0])[-1]

            logger.record_tabular('time-train', time_train)
            logger.record_tabular('time-eval', time_eval)
            logger.record_tabular('time-sample', time_sample)
            logger.record_tabular('time-total', time_total)
            logger.record_tabular('epoch', epoch)

            logger.record_time(time_total, time_sample)

            self.sampler.log_diagnostics()

            logger.save_stats()

            logger.dump_tabular(with_prefix=False)
            logger.pop_prefix()

        self.sampler.terminate()
def main(arglist):
    game_name = arglist.game_name  # 'abs', 'one'
    reward_type = arglist.reward_type
    p = arglist.p
    agent_num = arglist.n
    u_range = 1.
    k = 0
    print(arglist.aux, 'arglist.aux')
    model_names_setting = arglist.model_names_setting.split('_')
    model_names = [model_names_setting[0]] + [model_names_setting[1]] * (agent_num - 1)
    model_name = '_'.join(model_names)
    path_prefix = game_name
    if game_name == 'pbeauty':
        env = PBeautyGame(agent_num=agent_num, reward_type=reward_type, p=p)
        path_prefix = game_name + '-' + reward_type + '-' + str(p)
    elif 'matrix' in game_name:
        matrix_game_name = game_name.split('-')[-1]
        repeated = arglist.repeat
        max_step = arglist.max_path_length
        memory = arglist.memory
        env = MatrixGame(game=matrix_game_name, agent_num=agent_num,
                         action_num=2, repeated=repeated,
                         max_step=max_step, memory=memory,
                         discrete_action=False, tuple_obs=False)
        path_prefix = '{}-{}-{}-{}'.format(game_name, repeated, max_step, memory)
    elif 'particle' in game_name:
        particle_game_name = game_name.split('-')[-1]
        env, agent_num, model_name, model_names = get_particle_game(
            particle_game_name, arglist)

    now = datetime.datetime.now()
    timestamp = now.strftime('%Y-%m-%d %H:%M:%S.%f %Z')
    if 'CG' in model_name:
        model_name = model_name + '-{}'.format(arglist.mu)
    if not arglist.aux:
        model_name = model_name + '-{}'.format(arglist.aux)

    suffix = '{}/{}/{}/{}'.format(path_prefix, agent_num, model_name, timestamp)

    print(suffix)
    logger.add_tabular_output('./log/{}.csv'.format(suffix))
    snapshot_dir = './snapshot/{}'.format(suffix)
    policy_dir = './policy/{}'.format(suffix)
    os.makedirs(snapshot_dir, exist_ok=True)
    os.makedirs(policy_dir, exist_ok=True)
    logger.set_snapshot_dir(snapshot_dir)

    agents = []
    M = arglist.hidden_size
    batch_size = arglist.batch_size
    sampler = MASampler(agent_num=agent_num, joint=True, max_path_length=30,
                        min_pool_size=100, batch_size=batch_size)

    base_kwargs = {
        'sampler': sampler,
        'epoch_length': 1,
        'n_epochs': arglist.max_steps,
        'n_train_repeat': 1,
        'eval_render': True,
        'eval_n_episodes': 10
    }

    with U.single_threaded_session():
        for i, model_name in enumerate(model_names):
            if 'PR2AC' in model_name:
                k = int(model_name[-1])
                g = False
                mu = arglist.mu
                if 'G' in model_name:
                    g = True
                agent = pr2ac_agent(model_name, i, env, M, u_range, base_kwargs,
                                    k=k, g=g, mu=mu, game_name=game_name,
                                    aux=arglist.aux)
            elif model_name == 'MASQL':
                agent = masql_agent(model_name, i, env, M, u_range, base_kwargs,
                                    game_name=game_name)
            else:
                if model_name == 'DDPG':
                    joint = False
                    opponent_modelling = False
                elif model_name == 'MADDPG':
                    joint = True
                    opponent_modelling = False
                elif model_name == 'DDPG-OM' or model_name == 'DDPG-ToM':
                    joint = True
                    opponent_modelling = True
                agent = ddpg_agent(joint, opponent_modelling, model_names, i,
                                   env, M, u_range, base_kwargs,
                                   game_name=game_name)

            agents.append(agent)

        sampler.initialize(env, agents)

        for agent in agents:
            agent._init_training()

        gt.rename_root('MARLAlgorithm')
        gt.reset()
        gt.set_def_unique(False)

        initial_exploration_done = False
        # noise = .1
        noise = 1.
        alpha = .5

        for agent in agents:
            try:
                agent.policy.set_noise_level(noise)
            except:
                pass

        for epoch in gt.timed_for(range(base_kwargs['n_epochs'] + 1)):
            logger.push_prefix('Epoch #%d | ' % epoch)
            if epoch % 1 == 0:
                print(suffix)

            for t in range(base_kwargs['epoch_length']):
                # TODO.code consolidation: Add control interval to sampler
                if not initial_exploration_done:
                    if epoch >= 1000:
                        initial_exploration_done = True
                sampler.sample()
                # print('Sampling')
                if not initial_exploration_done:
                    continue
                gt.stamp('sample')
                # print('Sample Done')

                if epoch == base_kwargs['n_epochs']:
                    noise = 0.1
                    for agent in agents:
                        try:
                            agent.policy.set_noise_level(noise)
                        except:
                            pass
                    # alpha = .1
                if epoch > base_kwargs['n_epochs'] / 10:
                    noise = 0.1
                    for agent in agents:
                        try:
                            agent.policy.set_noise_level(noise)
                        except:
                            pass
                    # alpha = .1
                if epoch > base_kwargs['n_epochs'] / 5:
                    noise = 0.05
                    for agent in agents:
                        try:
                            agent.policy.set_noise_level(noise)
                        except:
                            pass
                if epoch > base_kwargs['n_epochs'] / 6:
                    noise = 0.01
                    for agent in agents:
                        try:
                            agent.policy.set_noise_level(noise)
                        except:
                            pass

                for j in range(base_kwargs['n_train_repeat']):
                    batch_n = []
                    recent_batch_n = []
                    indices = None
                    recent_indices = None
                    for i, agent in enumerate(agents):
                        if i == 0:
                            batch = agent.pool.random_batch(batch_size)
                            indices = agent.pool.indices
                            recent_indices = list(
                                range(agent.pool._top - batch_size, agent.pool._top))
                        batch_n.append(agent.pool.random_batch_by_indices(indices))
                        recent_batch_n.append(
                            agent.pool.random_batch_by_indices(recent_indices))

                    # print(len(batch_n))
                    target_next_actions_n = []
                    try:
                        for agent, batch in zip(agents, batch_n):
                            target_next_actions_n.append(
                                agent._target_policy.get_actions(batch['next_observations']))
                    except:
                        pass

                    opponent_actions_n = np.array([batch['actions'] for batch in batch_n])
                    recent_opponent_actions_n = np.array([batch['actions'] for batch in recent_batch_n])

                    ####### figure out
                    recent_opponent_observations_n = []
                    for batch in recent_batch_n:
                        recent_opponent_observations_n.append(batch['observations'])

                    current_actions = [
                        agents[i]._policy.get_actions(batch_n[i]['next_observations'])[0][0]
                        for i in range(agent_num)]
                    all_actions_k = []
                    for i, agent in enumerate(agents):
                        if isinstance(agent, MAVBAC):
                            if agent._k > 0:
                                batch_actions_k = agent._policy.get_all_actions(
                                    batch_n[i]['next_observations'])
                                actions_k = [a[0][0] for a in batch_actions_k]
                                all_actions_k.append(';'.join(list(map(str, actions_k))))
                    if len(all_actions_k) > 0:
                        with open('{}/all_actions.csv'.format(policy_dir), 'a') as f:
                            f.write(','.join(list(map(str, all_actions_k))) + '\n')
                    with open('{}/policy.csv'.format(policy_dir), 'a') as f:
                        f.write(','.join(list(map(str, current_actions))) + '\n')

                    # print('============')
                    for i, agent in enumerate(agents):
                        try:
                            batch_n[i]['next_actions'] = deepcopy(target_next_actions_n[i])
                        except:
                            pass
                        batch_n[i]['opponent_actions'] = np.reshape(
                            np.delete(deepcopy(opponent_actions_n), i, 0),
                            (-1, agent._opponent_action_dim))

                        if agent.joint:
                            if agent.opponent_modelling:
                                batch_n[i]['recent_opponent_observations'] = recent_opponent_observations_n[i]
                                batch_n[i]['recent_opponent_actions'] = np.reshape(
                                    np.delete(deepcopy(recent_opponent_actions_n), i, 0),
                                    (-1, agent._opponent_action_dim))
                                batch_n[i]['opponent_next_actions'] = agent.opponent_policy.get_actions(
                                    batch_n[i]['next_observations'])
                            else:
                                batch_n[i]['opponent_next_actions'] = np.reshape(
                                    np.delete(deepcopy(target_next_actions_n), i, 0),
                                    (-1, agent._opponent_action_dim))

                        if isinstance(agent, MAVBAC) or isinstance(agent, MASQL):
                            agent._do_training(
                                iteration=t + epoch * agent._epoch_length,
                                batch=batch_n[i], annealing=alpha)
                        else:
                            agent._do_training(
                                iteration=t + epoch * agent._epoch_length,
                                batch=batch_n[i])

                gt.stamp('train')

            # self._evaluate(epoch)
            # for agent in agents:
            #     params = agent.get_snapshot(epoch)
            #     logger.save_itr_params(epoch, params)
            # times_itrs = gt.get_times().stamps.itrs
            #
            # eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
            # total_time = gt.get_times().total
            # logger.record_tabular('time-train', times_itrs['train'][-1])
            # logger.record_tabular('time-eval', eval_time)
            # logger.record_tabular('time-sample', times_itrs['sample'][-1])
            # logger.record_tabular('time-total', total_time)
            # logger.record_tabular('epoch', epoch)
            # sampler.log_diagnostics()
            # logger.dump_tabular(with_prefix=False)
            logger.pop_prefix()

        sampler.terminate()
def _train(self, env, policy, initial_exploration_policy,
           sub_level_policies_paths, pool, g):
    """Perform RL training.

    Args:
        env (`rllab.Env`): Environment used for training
        policy (`Policy`): Policy used for training
        initial_exploration_policy ('Policy'): Policy used for exploration
            If None, then all exploration is done using policy
        pool (`PoolBase`): Sample pool to add samples to
    """
    '''self._init_training(env, policy, pool)

    if initial_exploration_policy is None:
        self.sampler.initialize(env, policy, sub_level_policies, pool)
        initial_exploration_done = True
    else:
        self.sampler.initialize(env, initial_exploration_policy, sub_level_policies, pool)
        initial_exploration_done = False'''

    with self._sess.as_default():
        gt.rename_root('RLAlgorithm')
        gt.reset()
        gt.set_def_unique(False)

        # loading low-level policies
        sub_level_policies = []
        for p in range(0, len(sub_level_policies_paths)):
            with tf.variable_scope(str(p), reuse=False):
                policy_snapshot = joblib.load(sub_level_policies_paths[p])
                sub_level_policies.append(policy_snapshot["policy"])

        self._init_training(env, policy, pool)

        if initial_exploration_policy is None:
            self.sampler.initialize(env, policy, sub_level_policies, pool)
            initial_exploration_done = True
        else:
            self.sampler.initialize(env, initial_exploration_policy,
                                    sub_level_policies, pool)
            initial_exploration_done = False

        for epoch in gt.timed_for(range(self._n_epochs + 1), save_itrs=True):
            logger.push_prefix('Epoch #%d | ' % epoch)

            for t in range(self._epoch_length):
                # TODO.codeconsolidation: Add control interval to sampler
                if not initial_exploration_done:
                    if self._epoch_length * epoch >= self._n_initial_exploration_steps:
                        self.sampler.set_policy(policy)
                        initial_exploration_done = True
                self.sampler.sample(initial_exploration_done, g)
                if not self.sampler.batch_ready():
                    continue
                gt.stamp('sample')

                for i in range(self._n_train_repeat):
                    self._do_training(
                        iteration=t + epoch * self._epoch_length,
                        batch=self.sampler.random_batch())
                gt.stamp('train')

            self._evaluate(epoch, initial_exploration_done, sub_level_policies, g)

            params = self.get_snapshot(epoch)
            logger.save_itr_params(epoch, params)
            times_itrs = gt.get_times().stamps.itrs

            eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
            total_time = gt.get_times().total
            logger.record_tabular('time-train', times_itrs['train'][-1])
            logger.record_tabular('time-eval', eval_time)
            logger.record_tabular('time-sample', times_itrs['sample'][-1])
            logger.record_tabular('time-total', total_time)
            logger.record_tabular('epoch', epoch)

            self.sampler.log_diagnostics()

            logger.dump_tabular(with_prefix=False)
            logger.pop_prefix()

            gt.stamp('eval')

        self.sampler.terminate()
def _train(self, env, policy, pool): """When training our policy expects an augmented observation.""" self._init_training(env, policy, pool) with self._sess.as_default(): env._wrapped_env.env.initialize(seed_task=SEED_TASK) observation = env.reset() policy.reset() log_p_z_episode = [] # Store log_p_z for this episode path_length = 0 path_return = 0 last_path_return = 0 max_path_return = -np.inf n_episodes = 0 self.prev_n_episodes = 0 if self._learn_p_z: log_p_z_list = [ deque(maxlen=self._max_path_length) for _ in range(self._num_skills) ] gt.rename_root('RLAlgorithm') gt.reset() gt.set_def_unique(False) for epoch in gt.timed_for(range(self._n_epochs + 1), save_itrs=True): path_length_list = [] z = self._sample_z() aug_obs = utils.concat_obs_z(observation, z, self._num_skills) for t in range(self._epoch_length): iteration = t + epoch * self._epoch_length action, _ = policy.get_action(aug_obs) if self._learn_p_z: (obs, _) = utils.split_aug_obs(aug_obs, self._num_skills) feed_dict = { self._discriminator._obs_pl: obs[None], self._discriminator._action_pl: action[None] } logits = tf_utils.get_default_session().run( self._discriminator._output_t, feed_dict)[0] log_p_z = np.log(utils._softmax(logits)[z]) if self._learn_p_z: log_p_z_list[z].append(log_p_z) next_ob, reward, terminal, info = env.step(action) aug_next_ob = utils.concat_obs_z(next_ob, z, self._num_skills) path_length += 1 path_return += reward self._pool.add_sample( aug_obs, action, reward, terminal, aug_next_ob, ) if terminal or path_length >= self._max_path_length: path_length_list.append(path_length) # print("\n===RESET", epoch, n_episodes, "===", self._epoch_length, path_length, "===", # # env._wrapped_env.env.nstep_internal, # datetime.datetime.now()) env._wrapped_env.env.initialize(seed_task=SEED_TASK) observation = env.reset() policy.reset() log_p_z_episode = [] path_length = 0 max_path_return = max(max_path_return, path_return) last_path_return = path_return path_return = 0 n_episodes += 1 # EPOCH IS DONE epoch if not epoch % 10: logger.log("Epoch: {:4} | Episodes: {}".format( epoch, n_episodes), with_prefix=False) if not n_episodes % self.eval_freq or \ n_episodes >= EPISODE_LIMIT or \ epoch >= self._n_epochs: # is_final = epoch >= self._n_epochs \ # or n_episodes >= EPISODE_LIMIT self.sample_skills_to_bd(n_epoch=epoch, n_episodes=n_episodes) # Make snapshot params = self.get_snapshot(epoch) logger.save_itr_params(epoch, params) gt.stamp('behaviours') else: aug_obs = aug_next_ob gt.stamp('sample') if self._pool.size >= self._min_pool_size: for i in range(self._n_train_repeat): batch = self._pool.random_batch(self._batch_size) self._do_training(iteration, batch) gt.stamp('train') # Terminate after 1000000 episodes if n_episodes >= EPISODE_LIMIT: break else: continue break if self._learn_p_z: print('learning p(z)') for z in range(self._num_skills): if log_p_z_list[z]: print( '\t skill = %d, min=%.2f, max=%.2f, mean=%.2f, len=%d' % (z, np.min( log_p_z_list[z]), np.max(log_p_z_list[z]), np.mean( log_p_z_list[z]), len(log_p_z_list[z]))) log_p_z = [ np.mean(log_p_z) if log_p_z else np.log(1.0 / self._num_skills) for log_p_z in log_p_z_list ] print('log_p_z: %s' % log_p_z) self._p_z = utils._softmax(log_p_z) logger.push_prefix('Epoch #%d | ' % epoch) self._evaluate(epoch) params = self.get_snapshot(epoch) logger.save_itr_params(epoch, params) times_itrs = gt.get_times().stamps.itrs eval_time = times_itrs['eval'][-1] if epoch > 1 else 0 total_time = gt.get_times().total logger.record_tabular('time-train', times_itrs['train'][-1]) 
logger.record_tabular('time-eval', eval_time) logger.record_tabular('time-sample', times_itrs['sample'][-1]) logger.record_tabular('time-total', total_time) logger.record_tabular('epoch', epoch) logger.record_tabular('episodes', n_episodes) logger.record_tabular('max-path-return', max_path_return) logger.record_tabular('last-path-return', last_path_return) logger.record_tabular('pool-size', self._pool.size) logger.record_tabular('path-length', np.mean(path_length_list)) logger.dump_tabular(with_prefix=False) logger.pop_prefix() gt.stamp('eval') env.terminate()
def _train(self, env, policy, pool): """When training our policy expects an augmented observation.""" self._init_training(env, policy, pool) with self._sess.as_default(): observation = env.reset() policy.reset() log_p_z_episode = [] # Store log_p_z for this episode path_length = 0 path_return = 0 last_path_return = 0 max_path_return = -np.inf n_episodes = 0 if self._learn_p_z: log_p_z_list = [ deque(maxlen=self._max_path_length) for _ in range(self._num_skills) ] gt.rename_root('RLAlgorithm') gt.reset() gt.set_def_unique(False) for epoch in gt.timed_for(range(self._n_epochs + 1), save_itrs=True): logger.push_prefix('Epoch #%d | ' % epoch) path_length_list = [] z = self._sample_z() aug_obs = utils.concat_obs_z(observation, z, self._num_skills, concat_type=self.concat_type) for t in range(self._epoch_length): iteration = t + epoch * self._epoch_length action, _ = policy.get_action(aug_obs) if self._learn_p_z: (obs, _) = utils.split_aug_obs(aug_obs, self._num_skills) feed_dict = { self._discriminator._obs_pl: obs[None], self._discriminator._action_pl: action[None] } logits = tf_utils.get_default_session().run( self._discriminator._output_t, feed_dict)[0] log_p_z = np.log(utils._softmax(logits)[z]) if self._learn_p_z: log_p_z_list[z].append(log_p_z) next_ob, reward, terminal, info = env.step(action) aug_next_ob = utils.concat_obs_z( next_ob, z, self._num_skills, concat_type=self.concat_type) path_length += 1 path_return += reward self._pool.add_sample( aug_obs, action, reward, terminal, aug_next_ob, ) if terminal or path_length >= self._max_path_length: path_length_list.append(path_length) observation = env.reset() policy.reset() log_p_z_episode = [] path_length = 0 max_path_return = max(max_path_return, path_return) last_path_return = path_return path_return = 0 n_episodes += 1 else: aug_obs = aug_next_ob gt.stamp('sample') if self._pool.size >= self._min_pool_size: for i in range(self._n_train_repeat): batch = self._pool.random_batch(self._batch_size) self._do_training(iteration, batch) gt.stamp('train') if self._learn_p_z: print('learning p(z)') for z in range(self._num_skills): if log_p_z_list[z]: print( '\t skill = %d, min=%.2f, max=%.2f, mean=%.2f, len=%d' % (z, np.min( log_p_z_list[z]), np.max(log_p_z_list[z]), np.mean( log_p_z_list[z]), len(log_p_z_list[z]))) log_p_z = [ np.mean(log_p_z) if log_p_z else np.log(1.0 / self._num_skills) for log_p_z in log_p_z_list ] print('log_p_z: %s' % log_p_z) self._p_z = utils._softmax(log_p_z) self._evaluate(epoch) params = self.get_snapshot(epoch) logger.save_itr_params(epoch, params) times_itrs = gt.get_times().stamps.itrs eval_time = times_itrs['eval'][-1] if epoch > 1 else 0 total_time = gt.get_times().total logger.record_tabular('time-train', times_itrs['train'][-1]) logger.record_tabular('time-eval', eval_time) logger.record_tabular('time-sample', times_itrs['sample'][-1]) logger.record_tabular('time-total', total_time) logger.record_tabular('epoch', epoch) logger.record_tabular('episodes', n_episodes) logger.record_tabular('max-path-return', max_path_return) logger.record_tabular('last-path-return', last_path_return) logger.record_tabular('pool-size', self._pool.size) logger.record_tabular('path-length', np.mean(path_length_list)) logger.dump_tabular(with_prefix=False) logger.pop_prefix() gt.stamp('eval') env.terminate()
def _train(self, env, policy): self._init_training(env, policy) with self._sess.as_default(): observation = env.reset() policy.reset() itr = 0 path_length = 0 path_return = 0 gt.rename_root('online algo') gt.reset() gt.set_def_unique(False) for epoch in gt.timed_for(range(self._n_epochs), save_itrs=True): logger.push_prefix('Epoch #%d | ' % epoch) for t in range(self._epoch_length): # Sample next action and state. action, _ = policy.get_action(observation) gt.stamp('train: get actions') action.squeeze() if self._render: env.render() next_ob, raw_reward, terminal, info = env.step(action) reward = raw_reward * self._scale_reward path_length += 1 path_return += reward gt.stamp('train: simulation') # Add experience to replay pool. self._pool.add_sample(observation, action, reward, terminal, False) should_reset = (terminal or path_length >= self._max_path_length) if should_reset: # noinspection PyTypeChecker self._pool.add_sample(next_ob, np.zeros_like(action), np.zeros_like(reward), np.zeros_like(terminal), True) observation = env.reset() policy.reset() path_length = 0 path_return = 0 else: observation = next_ob gt.stamp('train: fill replay pool') # Train. if self._pool.size >= self._min_pool_size: self._do_training(itr) itr += 1 gt.stamp('train: updates') # Evaluate. self._evaluate(epoch) gt.stamp("test") # Log. params = self.get_epoch_snapshot(epoch) logger.save_itr_params(epoch, params) times_itrs = gt.get_times().stamps.itrs train_time = np.sum([ times_itrs[key][-1] for key in times_itrs.keys() if 'train: ' in key ]) eval_time = times_itrs["test"][-1] total_time = gt.get_times().total logger.record_tabular("time: train", train_time) logger.record_tabular("time: eval", eval_time) logger.record_tabular("time: total", total_time) logger.record_tabular("scale_reward", self._scale_reward) logger.record_tabular("epochs", epoch) logger.record_tabular("steps: all", itr) logger.dump_tabular(with_prefix=False) logger.pop_prefix() gt.stamp("logging") print( gt.report( include_itrs=False, format_options={'itr_name_width': 30}, )) env.terminate()
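# The variant above derives its logged timings from gtimer stamps (summing all
# keys that start with 'train: '). A tiny standalone example of how the gtimer
# calls used throughout this file fit together; the stamp names are arbitrary.
import time
import gtimer as gt

gt.rename_root('example')
gt.reset()
gt.set_def_unique(False)
for itr in gt.timed_for(range(3), save_itrs=True):
    time.sleep(0.01)
    gt.stamp('sample')   # records time elapsed since the previous stamp
    time.sleep(0.02)
    gt.stamp('train')

times_itrs = gt.get_times().stamps.itrs      # e.g. {'sample': [...], 'train': [...]}
print(times_itrs['train'][-1], gt.get_times().total)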
def _train(self): """Return a generator that performs RL training. Args: env (`SoftlearningEnv`): Environment used for training. policy (`Policy`): Policy used for training initial_exploration_policy ('Policy'): Policy used for exploration If None, then all exploration is done using policy pool (`PoolBase`): Sample pool to add samples to """ training_environment = self._training_environment evaluation_environment = self._evaluation_environment policy = self._policy pool = self._pool if not self._training_started: self._init_training() self._initial_exploration_hook(training_environment, self._initial_exploration_policy, pool) self.sampler.initialize(training_environment, policy, pool) gt.reset_root() gt.rename_root('RLAlgorithm') gt.set_def_unique(False) self._training_before_hook() for self._epoch in gt.timed_for(range(self._epoch, self._n_epochs)): print('starting epoch', self._epoch) self._epoch_before_hook() gt.stamp('epoch_before_hook') start_samples = self.sampler._total_samples print('start samples', start_samples) for i in count(): samples_now = self.sampler._total_samples # print('samples now', samples_now) self._timestep = samples_now - start_samples if (-samples_now + (start_samples + self._epoch_length)) % 100 == 0: print('samples needed', -samples_now + (start_samples + self._epoch_length)) if (samples_now >= start_samples + self._epoch_length and self.ready_to_train): break self._timestep_before_hook() gt.stamp('timestep_before_hook') self._do_sampling(timestep=self._total_timestep) gt.stamp('sample') if self.ready_to_train: self._do_training_repeats(timestep=self._total_timestep) gt.stamp('train') self._timestep_after_hook() gt.stamp('timestep_after_hook') print('after hook', self._epoch) training_paths = self.sampler.get_last_n_paths( math.ceil(self._epoch_length / self.sampler._max_path_length)) gt.stamp('training_paths') evaluation_paths = self._evaluation_paths(policy, evaluation_environment) gt.stamp('evaluation_paths') training_metrics = self._evaluate_rollouts(training_paths, training_environment) gt.stamp('training_metrics') should_save_path = (self._path_save_frequency > 0 and self._epoch % self._path_save_frequency == 0) if should_save_path: import pickle for i, path in enumerate(training_paths): #path.pop('images') path_file_name = f'training_path_{self._epoch}_{i}.pkl' path_file_path = os.path.join(os.getcwd(), 'paths', path_file_name) if not os.path.exists(os.path.dirname(path_file_path)): os.makedirs(os.path.dirname(path_file_path)) with open(path_file_path, 'wb') as f: pickle.dump(path, f) if evaluation_paths: evaluation_metrics = self._evaluate_rollouts( evaluation_paths, evaluation_environment) gt.stamp('evaluation_metrics') else: evaluation_metrics = {} self._epoch_after_hook(training_paths) gt.stamp('epoch_after_hook') sampler_diagnostics = self.sampler.get_diagnostics() diagnostics = self.get_diagnostics( iteration=self._total_timestep, batch=self._evaluation_batch(), training_paths=training_paths, evaluation_paths=evaluation_paths) time_diagnostics = gt.get_times().stamps.itrs diagnostics.update( OrderedDict(( *((f'evaluation/{key}', evaluation_metrics[key]) for key in sorted(evaluation_metrics.keys())), *((f'training/{key}', training_metrics[key]) for key in sorted(training_metrics.keys())), *((f'times/{key}', time_diagnostics[key][-1]) for key in sorted(time_diagnostics.keys())), *((f'sampler/{key}', sampler_diagnostics[key]) for key in sorted(sampler_diagnostics.keys())), ('epoch', self._epoch), ('timestep', self._timestep), ('timesteps_total', 
self._total_timestep), ('train-steps', self._num_train_steps), ))) if self._eval_render_mode is not None and hasattr( evaluation_environment, 'render_rollouts'): # TODO(hartikainen): Make this consistent such that there's no # need for the hasattr check. training_environment.render_rollouts(evaluation_paths) yield diagnostics self.sampler.terminate() self._training_after_hook() yield {'done': True, **diagnostics}
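# Unlike the earlier variants, this _train is a generator: it yields a
# diagnostics dict once per epoch and finally {'done': True, ...}. A minimal
# sketch of a driver loop that consumes it; the print is just a stand-in for
# whatever logger the surrounding framework uses.
def run_training(algorithm):
    for diagnostics in algorithm._train():
        if diagnostics.get('done', False):
            break
        print({key: diagnostics[key] for key in sorted(diagnostics)})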
def _train(self): """Return a generator that performs RL training. Args: env (`SoftlearningEnv`): Environment used for training. policy (`Policy`): Policy used for training initial_exploration_policy ('Policy'): Policy used for exploration If None, then all exploration is done using policy pool (`PoolBase`): Sample pool to add samples to """ training_environment = self._training_environment evaluation_environment = self._evaluation_environment # policy = self._policy pool = self._pool model_metrics = {} # if not self._training_started: self._init_training() # TODO: change policy to placeholder or a function def get_action(state, hidden, deterministic=False): return self.get_action_meta(state, hidden, deterministic) def make_init_hidden(batch_size=1): return self.make_init_hidden(batch_size) self.sampler.initialize(training_environment, (get_action, make_init_hidden), pool) gt.reset_root() gt.rename_root('RLAlgorithm') gt.set_def_unique(False) # self._training_before_hook() #### model training print('[ MOPO ] log_dir: {} | ratio: {}'.format( self._log_dir, self._real_ratio)) print( '[ MOPO ] Training model at epoch {} | freq {} | timestep {} (total: {})' .format(self._epoch, self._model_train_freq, self._timestep, self._total_timestep)) # train dynamics model offline max_epochs = 1 if self._model.model_loaded else None model_train_metrics = self._train_model(batch_size=256, max_epochs=max_epochs, holdout_ratio=0.2, max_t=self._max_model_t) model_metrics.update(model_train_metrics) self._log_model() gt.stamp('epoch_train_model') #### tester.time_step_holder.set_time(0) for self._epoch in gt.timed_for(range(self._epoch, self._n_epochs)): self._epoch_before_hook() gt.stamp('epoch_before_hook') self._training_progress = Progress(self._epoch_length * self._n_train_repeat) start_samples = self.sampler._total_samples training_logs = {} for timestep in count(): self._timestep = timestep if (timestep >= self._epoch_length and self.ready_to_train): break self._timestep_before_hook() gt.stamp('timestep_before_hook') ## model rollouts if timestep % self._model_train_freq == 0 and self._real_ratio < 1.0: self._training_progress.pause() self._set_rollout_length() self._reallocate_model_pool() model_rollout_metrics = self._rollout_model( rollout_batch_size=self._rollout_batch_size, deterministic=self._deterministic) model_metrics.update(model_rollout_metrics) gt.stamp('epoch_rollout_model') self._training_progress.resume() ## train actor and critic if self.ready_to_train: # print('[ DEBUG ]: ready to train at timestep: {}'.format(timestep)) training_logs = self._do_training_repeats( timestep=timestep) gt.stamp('train') self._timestep_after_hook() gt.stamp('timestep_after_hook') training_paths = self.sampler.get_last_n_paths( math.ceil(self._epoch_length / self.sampler._max_path_length)) # evaluate the polices evaluation_paths = self._evaluation_paths( (lambda _state, _hidden: get_action(_state, _hidden, True), make_init_hidden), evaluation_environment) gt.stamp('evaluation_paths') if evaluation_paths: evaluation_metrics = self._evaluate_rollouts( evaluation_paths, evaluation_environment) gt.stamp('evaluation_metrics') else: evaluation_metrics = {} gt.stamp('epoch_after_hook') sampler_diagnostics = self.sampler.get_diagnostics() diagnostics = self.get_diagnostics( iteration=self._total_timestep, batch=self._evaluation_batch(), training_paths=training_paths, evaluation_paths=evaluation_paths) time_diagnostics = gt.get_times().stamps.itrs diagnostics.update( OrderedDict( (*(('evaluation/{}'.format(key), 
evaluation_metrics[key]) for key in sorted(evaluation_metrics.keys())), *(('times/{}'.format(key), time_diagnostics[key][-1]) for key in sorted(time_diagnostics.keys())), *(('sampler/{}'.format(key), sampler_diagnostics[key]) for key in sorted(sampler_diagnostics.keys())), *(('model/{}'.format(key), model_metrics[key]) for key in sorted(model_metrics.keys())), ('epoch', self._epoch), ('timestep', self._timestep), ('timesteps_total', self._total_timestep), ('train-steps', self._num_train_steps), *(('training/{}'.format(key), training_logs[key]) for key in sorted(training_logs.keys()))))) diagnostics['perf/AverageReturn'] = diagnostics[ 'evaluation/return-average'] diagnostics['perf/AverageLength'] = diagnostics[ 'evaluation/episode-length-avg'] if not self.min_ret == self.max_ret: diagnostics['perf/NormalizedReturn'] = (diagnostics['perf/AverageReturn'] - self.min_ret) \ / (self.max_ret - self.min_ret) # diagnostics['keys/logp_pi'] = diagnostics['training/sac_pi/logp_pi'] if self._eval_render_mode is not None and hasattr( evaluation_environment, 'render_rollouts'): training_environment.render_rollouts(evaluation_paths) ## ensure we did not collect any more data assert self._pool.size == self._init_pool_size for k, v in diagnostics.items(): # print('[ DEBUG ] epoch: {} diagnostics k: {}, v: {}'.format(self._epoch, k, v)) self._writer.add_scalar(k, v, self._epoch) yield diagnostics self.sampler.terminate() self._training_after_hook() self._training_progress.close() yield {'done': True, **diagnostics}
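# The MOPO-style loops above keep a real replay pool alongside a pool of
# model-generated rollouts and mix them according to self._real_ratio. A
# minimal numpy sketch of that mixing, assuming both pools expose
# random_batch(n) returning dicts of equally keyed arrays (names illustrative).
import numpy as np

def mixed_batch(real_pool, model_pool, batch_size, real_ratio):
    """Draw `real_ratio` of the batch from real data, the rest from the model pool."""
    n_real = int(round(batch_size * real_ratio))
    real = real_pool.random_batch(n_real)
    fake = model_pool.random_batch(batch_size - n_real)
    return {key: np.concatenate([real[key], fake[key]], axis=0) for key in real}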
def _train(self, env, policy, pool): """When training our policy expects an augmented observation.""" self._init_training(env, policy, pool) with self._sess.as_default(): # reset with goal goal = env.sample_goal() observation = env.reset(goal=goal) policy.reset() # sample z ~ p(z|g) z = self._embedding.get_z(goal=goal) path_length = 0 path_return = 0 last_path_return = 0 max_path_return = -np.inf n_episodes = 0 trajectory = [] z_indx = 0 gt.rename_root('RLAlgorithm') gt.reset() gt.set_def_unique(False) for epoch in gt.timed_for(range(self._n_epochs + 1), save_itrs=True): logger.push_prefix('Epoch #%d | ' % epoch) path_length_list = [] for t in range(self._epoch_length): iteration = t + epoch * self._epoch_length # flatten observation with given latent z aug_obs = np.concatenate((observation['observation'], z)) action, _ = policy.get_action(aug_obs) next_ob, reward, terminal, info = env.step(action) # assert all(next_ob['desired_goal'] == goal) assert reward == env.compute_reward( next_ob['achieved_goal'], next_ob['desired_goal'], info) path_length += 1 path_return += reward trajectory.append( (observation, action, reward, next_ob, terminal)) if terminal or path_length >= self._max_path_length: path_length_list.append(path_length) # add hindsight samples self._pool.add_hindsight_episode( episode=trajectory, embedding=self._embedding, latent=z, goal=goal, ) z_indx += 1 if z_indx >= self._n_latents: goal = env.sample_goal() z_indx = 0 z = self._embedding.get_z(goal=goal) observation = env.reset(goal=goal) policy.reset() path_length = 0 max_path_return = max(max_path_return, path_return) last_path_return = path_return path_return = 0 n_episodes += 1 trajectory = [] else: observation = next_ob gt.stamp('sample') if self._pool.size >= self._min_pool_size: for i in range(self._n_train_repeat): batch = self._pool.random_batch(self._batch_size) self._do_training(iteration, batch) gt.stamp('train') self._evaluate(epoch) params = self.get_snapshot(epoch) logger.save_itr_params(epoch, params) times_itrs = gt.get_times().stamps.itrs eval_time = times_itrs['eval'][-1] if epoch > 1 else 0 total_time = gt.get_times().total logger.record_tabular('time-train', times_itrs['train'][-1]) logger.record_tabular('time-eval', eval_time) logger.record_tabular('time-sample', times_itrs['sample'][-1]) logger.record_tabular('time-total', total_time) logger.record_tabular('epoch', epoch) logger.record_tabular('episodes', n_episodes) logger.record_tabular('steps', iteration) # also record total steps logger.record_tabular('max-path-return', max_path_return) logger.record_tabular('last-path-return', last_path_return) logger.record_tabular('pool-size', self._pool.size) logger.record_tabular('path-length', np.mean(path_length_list)) logger.dump_tabular(with_prefix=False) logger.pop_prefix() gt.stamp('eval') env.terminate()
def _train(self, env, policy, pool): """Perform RL training. Args: env (`rllab.Env`): Environment used for training policy (`Policy`): Policy used for training pool (`PoolBase`): Sample pool to add samples to """ self._init_training(env, policy, pool) with self._sess.as_default(): observation = env.reset() policy.reset() path_length = 0 path_return = 0 last_path_return = 0 max_path_return = -np.inf n_episodes = 0 gt.rename_root('RLAlgorithm') gt.reset() gt.set_def_unique(False) for epoch in gt.timed_for( range(self._n_epochs), save_itrs=True): logger.push_prefix('Epoch #%d | ' % epoch) for t in range(self._epoch_length): iteration = t + epoch * self._epoch_length action, _ = policy.get_action(observation) next_ob, reward, terminal, info = env.step(action) path_length += 1 path_return += reward self.pool.add_sample( observation, action, reward, terminal, next_ob, ) if terminal or path_length >= self._max_path_length: observation = env.reset() policy.reset() path_length = 0 max_path_return = max(max_path_return, path_return) last_path_return = path_return path_return = 0 n_episodes += 1 else: observation = next_ob gt.stamp('sample') if self.pool.size >= self._min_pool_size: for i in range(self._n_train_repeat): batch = self.pool.random_batch(self._batch_size) self._do_training(iteration, batch) gt.stamp('train') self._evaluate(epoch) params = self.get_snapshot(epoch) if(epoch%20==0): logger.save_itr_params(epoch, params) times_itrs = gt.get_times().stamps.itrs eval_time = times_itrs['eval'][-1] if epoch > 1 else 0 total_time = gt.get_times().total logger.record_tabular('time-train', times_itrs['train'][-1]) logger.record_tabular('time-eval', eval_time) logger.record_tabular('time-sample', times_itrs['sample'][-1]) logger.record_tabular('time-total', total_time) logger.record_tabular('epoch', epoch) logger.record_tabular('episodes', n_episodes) logger.record_tabular('max-path-return', max_path_return) logger.record_tabular('last-path-return', last_path_return) logger.record_tabular('pool-size', self.pool.size) logger.dump_tabular(with_prefix=False) logger.pop_prefix() gt.stamp('eval') env.terminate() # logger. np.save(logger._snapshot_dir+'/reward_data.npy', self.reward)
def _train(self, env, policy, pool): """Perform RL training. Args: env (`rllab.Env`): Environment used for training policy (`Policy`): Policy used for training pool (`PoolBase`): Sample pool to add samples to """ self._init_training(env, policy, pool) with self._sess.as_default(): observation = env.reset() policy.reset() path_length = 0 path_return = 0 last_path_return = 0 max_path_return = -np.inf n_episodes = 0 gt.rename_root('RLAlgorithm') gt.reset() gt.set_def_unique(False) for epoch in gt.timed_for( range(self._n_epochs + 1), save_itrs=True): logger.push_prefix('Epoch #%d | ' % epoch) if self.iter_callback is not None: self.iter_callback(locals(), globals()) for t in range(self._epoch_length): iteration = t + epoch * self._epoch_length action, _ = policy.get_action(observation) next_ob, reward, terminal, info = env.step(action) path_length += 1 path_return += reward self.pool.add_sample( observation, action, reward, terminal, next_ob, ) if terminal or path_length >= self._max_path_length: observation = env.reset() policy.reset() path_length = 0 max_path_return = max(max_path_return, path_return) last_path_return = path_return path_return = 0 n_episodes += 1 else: observation = next_ob gt.stamp('sample') if self.pool.size >= self._min_pool_size: for i in range(self._n_train_repeat): batch = self.pool.random_batch(self._batch_size) self._do_training(iteration, batch) gt.stamp('train') self._evaluate(epoch) params = self.get_snapshot(epoch) logger.save_itr_params(epoch, params) times_itrs = gt.get_times().stamps.itrs eval_time = times_itrs['eval'][-1] if epoch > 1 else 0 total_time = gt.get_times().total logger.record_tabular('time-train', times_itrs['train'][-1]) logger.record_tabular('time-eval', eval_time) logger.record_tabular('time-sample', times_itrs['sample'][-1]) logger.record_tabular('time-total', total_time) logger.record_tabular('epoch', epoch) logger.record_tabular('episodes', n_episodes) logger.record_tabular('max-path-return', max_path_return) logger.record_tabular('last-path-return', last_path_return) logger.record_tabular('pool-size', self.pool.size) logger.dump_tabular(with_prefix=False) logger.pop_prefix() gt.stamp('eval') env.terminate()
def _train(self, env, policy, pool, load=None): """Perform RL training. Args: env (`rllab.Env`): Environment used for training policy (`Policy`): Policy used for training pool (`PoolBase`): Sample pool to add samples to """ self._init_training() self.sampler.initialize(env, policy, pool) # evaluation_env = deep_clone(env) if self._eval_n_episodes else None with tf_utils.get_default_session().as_default() as sess: if load is not None: saver = tf.train.Saver() saver.restore(sess, load) print('pre-trained model restored ...') gt.rename_root('RLAlgorithm') gt.reset() gt.set_def_unique(False) for epoch in gt.timed_for(range(self._n_epochs + 1), save_itrs=True): logger.push_prefix('Epoch #%d | ' % epoch) for t in range(self._epoch_length): self.sampler.sample() if not self.sampler.batch_ready(): continue gt.stamp('sample') for i in range(self._n_train_repeat): self._do_training(iteration=t + epoch * self._epoch_length, batch=self.sampler.random_batch()) gt.stamp('train') if epoch % 1 == 0 or epoch >= ENV_PARAMS['n_epochs'] - 20: self._evaluate(policy, env) print('@ epoch %d : ' % epoch) # gt.stamp('eval') # # params = self.get_snapshot(epoch) # logger.save_itr_params(epoch, params) # # time_itrs = gt.get_times().stamps.itrs # time_eval = time_itrs['eval'][-1] # time_total = gt.get_times().total # time_train = time_itrs.get('train', [0])[-1] # time_sample = time_itrs.get('sample', [0])[-1] # # logger.record_tabular('time-train', time_train) # logger.record_tabular('time-eval', time_eval) # logger.record_tabular('time-sample', time_sample) # logger.record_tabular('time-total', time_total) # logger.record_tabular('epoch', epoch) self.sampler.log_diagnostics() logger.dump_tabular(with_prefix=False) logger.pop_prefix() # env.reset() if (epoch > ENV_PARAMS['n_epochs'] * 0 and epoch % 5 == 0) or epoch >= ENV_PARAMS['n_epochs'] - 100: saver = tf.train.Saver() saver.save(sess, save_path=save_path + '/model-' + str(epoch) + '.ckpt') print('Model saved ...') self.sampler.terminate()
def _train(self, env, policy, pool, qf=None, vf=None, saver=None, _ec=None, dynamic_ec=False): """Perform RL training. Args: env (`rllab.Env`): Environment used for training policy (`Policy`): Policy used for training pool (`PoolBase`): Sample pool to add samples to """ self._init_training(env, policy, pool) self.sampler.initialize(env, policy, pool) if dynamic_ec: dicrese_rate = _ec / self._n_epochs logger2 = mylogger2.get_logger() os.makedirs(os.path.join(logger2.log_dir, 'model'), exist_ok=logger2.exist_ok) optuna_break = False with self._sess.as_default(): gt.rename_root('RLAlgorithm') gt.reset() gt.set_def_unique(False) for epoch in gt.timed_for(range(self._n_epochs), save_itrs=True): if optuna_break: continue # logger.push_prefix('Epoch #%d | ' % epoch) epoch_states = [] kurtosis = [] signed_variance = [] for t in range(self._epoch_length): # TODO.codeconsolidation: Add control interval to sampler done, _n_episodes, obs, next_obs, info = self.sampler.sample( ) epoch_states.append(obs) state_importances = self.policy.calc_knack([obs]) kurtosis.append(state_importances["kurtosis"][0]) signed_variance.append( state_importances["signed_variance"] [0]) # be careful of batch_ready < epoch_length if not self.sampler.batch_ready(): continue gt.stamp('sample') for i in range(self._n_train_repeat): self._do_training(iteration=t + epoch * self._epoch_length, batch=self.sampler.random_batch()) gt.stamp('train') # evaluation if epoch % self._eval_n_frequency == 0: eval_average_return = self._evaluate(epoch) logger.record_tabular('eval_average_return', eval_average_return) if hasattr(self.policy, "optuna_trial"): if self.policy.optuna_trial is not None: self.policy.optuna_trial.report( eval_average_return, epoch) # report intermediate_value if self.policy.optuna_trial.should_prune(): optuna_break = True continue # raise optuna.structs.TrialPruned() else: logger.record_tabular('eval_average_return', np.nan) gt.stamp('eval') # logging about time and step times_itrs = gt.get_times().stamps.itrs eval_time = times_itrs['eval'][-1] if epoch > 1 else 0 total_time = gt.get_times().total logger.record_tabular('time-train', times_itrs['train'][-1]) logger.record_tabular('time-eval', eval_time) logger.record_tabular('time-sample', times_itrs['sample'][-1]) logger.record_tabular('time-total', total_time) logger.record_tabular('epoch', epoch) logger.record_tabular('total_step', self.sampler._total_samples) logger.record_tabular('total_episode', self.sampler._n_episodes) # logging about array if hasattr(self.policy, "current_knack_thresh"): current_knack_thresh = self.policy.current_knack_thresh _ = self.policy.calc_and_update_knack(epoch_states) if logger2.save_array_flag: kwargs1 = { 'epoch': epoch, 'states': np.array(epoch_states), 'knack_kurtosis': np.array(kurtosis), 'signed_variance': np.array(signed_variance) } if hasattr(self.policy, "current_knack_thresh"): kwargs1.update( {'current_knack_thresh': current_knack_thresh}) kwargs1.update(self.policy.get_q_params()) logger2.add_array_data(kwargs1) if epoch % 10 == 0: # TODO save only parameters saver.save(self._sess, os.path.join(logger2.log_dir, 'model')) gt.stamp("tf save") gt.stamp("calc knacks") if dynamic_ec: self._sess.run(tf.assign(_ec, _ec - dicrese_rate)) logger.dump_tabular() logger2.write() # print(gt.report()) # finalize processing if optuna_break: return None if logger2.save_array_flag: saver.save(self._sess, os.path.join(logger2.log_dir, 'model')) self.sampler.terminate() return eval_average_return
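# The evaluation block above reports intermediate returns to an Optuna trial
# and stops early once the trial should be pruned (via the optuna_break flag
# rather than raising). A self-contained sketch of the same report/prune
# pattern with Optuna's public API; the objective below is a dummy.
import optuna

def objective(trial):
    lr = trial.suggest_float('lr', 1e-4, 1e-2)
    score = 0.0
    for epoch in range(20):
        score += lr                     # stand-in for eval_average_return
        trial.report(score, epoch)      # report an intermediate value
        if trial.should_prune():        # let the pruner stop unpromising trials
            raise optuna.TrialPruned()
    return score

study = optuna.create_study(direction='maximize', pruner=optuna.pruners.MedianPruner())
study.optimize(objective, n_trials=5)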
def _train(self): """Return a generator that performs RL training. Args: env (`SoftlearningEnv`): Environment used for training. policy (`Policy`): Policy used for training initial_exploration_policy ('Policy'): Policy used for exploration If None, then all exploration is done using policy pool (`PoolBase`): Sample pool to add samples to """ import gtimer as gt from itertools import count training_environment = self._training_environment evaluation_environment = self._evaluation_environment training_metrics = [0 for _ in range(self._num_goals)] if not self._training_started: self._init_training() for i in range(self._num_goals): self._initial_exploration_hook( training_environment, self._initial_exploration_policy, i) self._initialize_samplers() self._sample_count = 0 gt.reset_root() gt.rename_root('RLAlgorithm') gt.set_def_unique(False) print("starting_training") self._training_before_hook() import time for self._epoch in gt.timed_for(range(self._epoch, self._n_epochs)): self._epoch_before_hook() gt.stamp('epoch_before_hook') start_samples = sum([self._samplers[i]._total_samples for i in range(self._num_goals)]) sample_times = [] for i in count(): samples_now = sum([self._samplers[i]._total_samples for i in range(self._num_goals)]) self._timestep = samples_now - start_samples if samples_now >= start_samples + self._epoch_length and self.ready_to_train: break t0 = time.time() self._timestep_before_hook() gt.stamp('timestep_before_hook') self._do_sampling(timestep=self._total_timestep) gt.stamp('sample') sample_times.append(time.time() - t0) t0 = time.time() if self.ready_to_train: self._do_training_repeats(timestep=self._total_timestep) gt.stamp('train') # print("Train time: ", time.time() - t0) self._timestep_after_hook() gt.stamp('timestep_after_hook') # TODO diagnostics per goal print("Average Sample Time: ", np.mean(np.array(sample_times))) print("Step count", self._sample_count) training_paths_per_policy = self._training_paths() # self.sampler.get_last_n_paths( # math.ceil(self._epoch_length / self.sampler._max_path_length)) gt.stamp('training_paths') evaluation_paths_per_policy = self._evaluation_paths() gt.stamp('evaluation_paths') training_metrics_per_policy = self._evaluate_rollouts( training_paths_per_policy, training_environment) gt.stamp('training_metrics') if evaluation_paths_per_policy: evaluation_metrics_per_policy = self._evaluate_rollouts( evaluation_paths_per_policy, evaluation_environment) gt.stamp('evaluation_metrics') else: evaluation_metrics_per_policy = [{} for _ in range(self._num_goals)] self._epoch_after_hook(training_paths_per_policy) gt.stamp('epoch_after_hook') t0 = time.time() sampler_diagnostics_per_policy = [ self._samplers[i].get_diagnostics() for i in range(self._num_goals)] diagnostics = self.get_diagnostics( iteration=self._total_timestep, batches=self._evaluation_batches(), training_paths_per_policy=training_paths_per_policy, evaluation_paths_per_policy=evaluation_paths_per_policy) time_diagnostics = gt.get_times().stamps.itrs print("Basic diagnostics: ", time.time() - t0) print("Sample count: ", self._sample_count) diagnostics.update(OrderedDict(( *( (f'times/{key}', time_diagnostics[key][-1]) for key in sorted(time_diagnostics.keys()) ), ('epoch', self._epoch), ('timestep', self._timestep), ('timesteps_total', self._total_timestep), ('train-steps', self._num_train_steps), ))) print("Other basic diagnostics: ", time.time() - t0) for i, (evaluation_metrics, training_metrics, sampler_diagnostics) in ( enumerate(zip(evaluation_metrics_per_policy, 
training_metrics_per_policy, sampler_diagnostics_per_policy))): diagnostics.update(OrderedDict(( *( (f'evaluation_{i}/{key}', evaluation_metrics[key]) for key in sorted(evaluation_metrics.keys()) ), *( (f'training_{i}/{key}', training_metrics[key]) for key in sorted(training_metrics.keys()) ), *( (f'sampler_{i}/{key}', sampler_diagnostics[key]) for key in sorted(sampler_diagnostics.keys()) ), ))) # if self._eval_render_kwargs and hasattr( # evaluation_environment, 'render_rollouts'): # # TODO(hartikainen): Make this consistent such that there's no # # need for the hasattr check. # training_environment.render_rollouts(evaluation_paths) yield diagnostics print("Diagnostic time: ", time.time() - t0) for i in range(self._num_goals): self._samplers[i].terminate() self._training_after_hook() del evaluation_paths_per_policy yield {'done': True, **diagnostics}
def _train(self): """Return a generator that performs RL training. Args: env (`SoftlearningEnv`): Environment used for training. policy (`Policy`): Policy used for training initial_exploration_policy ('Policy'): Policy used for exploration If None, then all exploration is done using policy pool (`PoolBase`): Sample pool to add samples to """ training_environment = self._training_environment evaluation_environment = self._evaluation_environment policy = self._policy pool = self._pool model_metrics = {} if not self._training_started: self._init_training() self.sampler.initialize(training_environment, policy, pool) gt.reset_root() gt.rename_root('RLAlgorithm') gt.set_def_unique(False) self._training_before_hook() for self._epoch in gt.timed_for(range(self._epoch, self._n_epochs)): if self._epoch % 200 == 0: #### model training print('[ MOPO ] log_dir: {} | ratio: {}'.format( self._log_dir, self._real_ratio)) print( '[ MOPO ] Training model at epoch {} | freq {} | timestep {} (total: {})' .format(self._epoch, self._model_train_freq, self._timestep, self._total_timestep)) max_epochs = 1 if self._model.model_loaded else None model_train_metrics = self._train_model( batch_size=256, max_epochs=max_epochs, holdout_ratio=0.2, max_t=self._max_model_t) model_metrics.update(model_train_metrics) self._log_model() gt.stamp('epoch_train_model') #### self._epoch_before_hook() gt.stamp('epoch_before_hook') self._training_progress = Progress(self._epoch_length * self._n_train_repeat) start_samples = self.sampler._total_samples for timestep in count(): self._timestep = timestep if (timestep >= self._epoch_length and self.ready_to_train): break self._timestep_before_hook() gt.stamp('timestep_before_hook') ## model rollouts if timestep % self._model_train_freq == 0 and self._real_ratio < 1.0: self._training_progress.pause() self._set_rollout_length() self._reallocate_model_pool() model_rollout_metrics = self._rollout_model( rollout_batch_size=self._rollout_batch_size, deterministic=self._deterministic) model_metrics.update(model_rollout_metrics) gt.stamp('epoch_rollout_model') self._training_progress.resume() ## train actor and critic if self.ready_to_train: self._do_training_repeats(timestep=timestep) gt.stamp('train') self._timestep_after_hook() gt.stamp('timestep_after_hook') training_paths = self.sampler.get_last_n_paths( math.ceil(self._epoch_length / self.sampler._max_path_length)) evaluation_paths = self._evaluation_paths(policy, evaluation_environment) gt.stamp('evaluation_paths') if evaluation_paths: evaluation_metrics = self._evaluate_rollouts( evaluation_paths, evaluation_environment) gt.stamp('evaluation_metrics') else: evaluation_metrics = {} gt.stamp('epoch_after_hook') sampler_diagnostics = self.sampler.get_diagnostics() diagnostics = self.get_diagnostics( iteration=self._total_timestep, batch=self._evaluation_batch(), training_paths=training_paths, evaluation_paths=evaluation_paths) time_diagnostics = gt.get_times().stamps.itrs diagnostics.update( OrderedDict(( *((f'evaluation/{key}', evaluation_metrics[key]) for key in sorted(evaluation_metrics.keys())), *((f'times/{key}', time_diagnostics[key][-1]) for key in sorted(time_diagnostics.keys())), *((f'sampler/{key}', sampler_diagnostics[key]) for key in sorted(sampler_diagnostics.keys())), *((f'model/{key}', model_metrics[key]) for key in sorted(model_metrics.keys())), ('epoch', self._epoch), ('timestep', self._timestep), ('timesteps_total', self._total_timestep), ('train-steps', self._num_train_steps), ))) if self._eval_render_mode is not None 
and hasattr( evaluation_environment, 'render_rollouts'): training_environment.render_rollouts(evaluation_paths) ## ensure we did not collect any more data assert self._pool.size == self._init_pool_size yield diagnostics epi_ret = self._rollout_model_for_eval( self._training_environment.reset) np.savetxt("EEepi_ret__fin.csv", epi_ret, delimiter=',') self.sampler.terminate() self._training_after_hook() self._training_progress.close() yield {'done': True, **diagnostics}
def main(arglist): game_name = arglist.game_name # 'abs', 'one' reward_type = arglist.reward_type p = arglist.p agent_num = arglist.n u_range = 1. k = 0 print(arglist.aux, 'arglist.aux') model_names_setting = arglist.model_names_setting.split('_') model_names = [model_names_setting[0]] + [model_names_setting[1]] * (agent_num - 1) model_name = '_'.join(model_names) path_prefix = game_name if game_name == 'pbeauty': env = PBeautyGame(agent_num=agent_num, reward_type=reward_type, p=p) path_prefix = game_name + '-' + reward_type + '-' + str(p) elif 'matrix' in game_name: matrix_game_name = game_name.split('-')[-1] repeated = arglist.repeat max_step = arglist.max_path_length memory = arglist.memory env = MatrixGame(game=matrix_game_name, agent_num=agent_num, action_num=2, repeated=repeated, max_step=max_step, memory=memory, discrete_action=False, tuple_obs=False) path_prefix = '{}-{}-{}-{}'.format(game_name, repeated, max_step, memory) elif 'diff' in game_name: diff_game_name = game_name.split('-')[-1] agent_num = 2 env = DifferentialGame(diff_game_name, agent_num) elif 'particle' in game_name: particle_game_name = game_name.split('-')[-1] env, agent_num, model_name, model_names = get_particle_game(particle_game_name, arglist) elif 'alvi' in game_name: mat_scene = -1 # Create environment and scenario characteristics env, scn = make_env(arglist.scenario, arglist, arglist.benchmark, mat_scene=mat_scene) now = datetime.datetime.now() timestamp = now.strftime('%Y-%m-%d %H:%M:%S.%f %Z') if 'CG' in model_name: model_name = model_name + '-{}'.format(arglist.mu) if not arglist.aux: model_name = model_name + '-{}'.format(arglist.aux) suffix = '{}/{}/{}/{}'.format(path_prefix, agent_num, model_name, timestamp) print(suffix) logger.add_tabular_output('./log/{}.csv'.format(suffix)) snapshot_dir = './snapshot/{}'.format(suffix) policy_dir = './policy/{}'.format(suffix) os.makedirs(snapshot_dir, exist_ok=True) os.makedirs(policy_dir, exist_ok=True) logger.set_snapshot_dir(snapshot_dir) agents = [] M = arglist.hidden_size batch_size = arglist.batch_size sampler = MASampler(agent_num=agent_num, joint=True, global_reward=arglist.global_reward, max_path_length=25, min_pool_size=100, batch_size=batch_size) base_kwargs = { 'sampler': sampler, 'epoch_length': 1, 'n_epochs': arglist.max_steps, 'n_train_repeat': 1, 'eval_render': True, 'eval_n_episodes': 10 } with U.single_threaded_session(): for i, model_name in enumerate(model_names): if 'PR2AC' in model_name: k = int(model_name[-1]) g = False mu = arglist.mu if 'G' in model_name: g = True agent = pr2ac_agent(model_name, i, env, M, u_range, base_kwargs, k=k, g=g, mu=mu, game_name=game_name, aux=arglist.aux) elif model_name == 'MASQL': agent = masql_agent(model_name, i, env, M, u_range, base_kwargs, game_name=game_name) elif model_name == 'MASAC': agent = masac_agent(model_name, i, env, M, u_range, base_kwargs, game_name=game_name) elif model_name == 'ROMMEO': agent = rom_agent(model_name, i, env, M, u_range, base_kwargs, game_name=game_name) else: if model_name == 'DDPG': joint = False opponent_modelling = False elif model_name == 'MADDPG': joint = True opponent_modelling = False elif model_name == 'DDPG-OM': joint = True opponent_modelling = True agent = ddpg_agent(joint, opponent_modelling, model_names, i, env, M, u_range, base_kwargs, game_name=game_name) agents.append(agent) sampler.initialize(env, agents) for agent in agents: agent._init_training() gt.rename_root('MARLAlgorithm') gt.reset() gt.set_def_unique(False) initial_exploration_done = False # noise = .1 
noise = .5 alpha = .1 for agent in agents: try: agent.policy.set_noise_level(noise) except: pass # alpha = .5 for steps in gt.timed_for(range(base_kwargs['n_epochs'] + 1)): # alpha = .1 + np.exp(-0.1 * max(steps-10, 0)) * 500. logger.push_prefix('Epoch #%d | ' % steps) if steps % (25*1000) == 0: print(suffix) for t in range(base_kwargs['epoch_length']): # TODO.code consolidation: Add control interval to sampler if not initial_exploration_done: if steps >= 1000: initial_exploration_done = True sampler.sample() if not initial_exploration_done: continue gt.stamp('sample') print('Sample Done') if steps == 1000: noise = 0.1 for agent in agents: try: agent.policy.set_noise_level(noise) except: pass # alpha = 10. if steps == 2000: noise = 0.1 for agent in agents: try: agent.policy.set_noise_level(noise) except: pass # alpha = .1 if steps == 3000: noise = 0.05 for agent in agents: try: agent.policy.set_noise_level(noise) except: pass if steps > base_kwargs['n_epochs'] / 6: noise = 0.01 for agent in agents: try: agent.policy.set_noise_level(noise) except: pass if steps % arglist.training_interval != 0: continue for j in range(base_kwargs['n_train_repeat']): batch_n = [] recent_batch_n = [] indices = None receent_indices = None for i, agent in enumerate(agents): if i == 0: batch = agent.pool.random_batch(batch_size) indices = agent.pool.indices receent_indices = list(range(agent.pool._top-batch_size, agent.pool._top)) batch_n.append(agent.pool.random_batch_by_indices(indices)) recent_batch_n.append(agent.pool.random_batch_by_indices(receent_indices)) # print(len(batch_n)) target_next_actions_n = [] # try: all_obs = np.array(np.concatenate([batch['observations'] for batch in batch_n], axis=-1)) all_next_obs = np.array(np.concatenate([batch['next_observations'] for batch in batch_n], axis=-1)) # print(all_obs[0]) for batch in batch_n: # print('making all obs') batch['all_observations'] = deepcopy(all_obs) batch['all_next_observations'] = deepcopy(all_next_obs) opponent_current_actions_n = [] for agent, batch in zip(agents, batch_n): target_next_actions_n.append(agent.target_policy.get_actions(batch['next_observations'])) opponent_current_actions_n.append(agent.policy.get_actions(batch['observations'])) for i, agent in enumerate(agents): batch_n[i]['opponent_current_actions'] = np.reshape( np.delete(deepcopy(opponent_current_actions_n), i, 0), (-1, agent._opponent_action_dim)) # except: # pass opponent_actions_n = np.array([batch['actions'] for batch in batch_n]) recent_opponent_actions_n = np.array([batch['actions'] for batch in recent_batch_n]) ####### figure out recent_opponent_observations_n = [] for batch in recent_batch_n: recent_opponent_observations_n.append(batch['observations']) current_actions = [agents[i].policy.get_actions(batch_n[i]['next_observations'])[0][0] for i in range(agent_num)] all_actions_k = [] for i, agent in enumerate(agents): if isinstance(agent, MAVBAC): if agent._k > 0: batch_actions_k = agent.policy.get_all_actions(batch_n[i]['next_observations']) actions_k = [a[0][0] for a in batch_actions_k] all_actions_k.append(';'.join(list(map(str, actions_k)))) if len(all_actions_k) > 0: with open('{}/all_actions.csv'.format(policy_dir), 'a') as f: f.write(','.join(list(map(str, all_actions_k))) + '\n') with open('{}/policy.csv'.format(policy_dir), 'a') as f: f.write(','.join(list(map(str, current_actions)))+'\n') # print('============') for i, agent in enumerate(agents): try: batch_n[i]['next_actions'] = deepcopy(target_next_actions_n[i]) except: pass batch_n[i]['opponent_actions'] 
= np.reshape(np.delete(deepcopy(opponent_actions_n), i, 0), (-1, agent._opponent_action_dim)) if agent.joint: if agent.opponent_modelling: batch_n[i]['recent_opponent_observations'] = recent_opponent_observations_n[i] batch_n[i]['recent_opponent_actions'] = np.reshape(np.delete(deepcopy(recent_opponent_actions_n), i, 0), (-1, agent._opponent_action_dim)) batch_n[i]['opponent_next_actions'] = agent.opponent_policy.get_actions(batch_n[i]['next_observations']) else: batch_n[i]['opponent_next_actions'] = np.reshape(np.delete(deepcopy(target_next_actions_n), i, 0), (-1, agent._opponent_action_dim)) if isinstance(agent, MAVBAC) or isinstance(agent, MASQL) or isinstance(agent, ROMMEO): agent._do_training(iteration=t + steps * agent._epoch_length, batch=batch_n[i], annealing=alpha) else: agent._do_training(iteration=t + steps * agent._epoch_length, batch=batch_n[i]) gt.stamp('train') sampler.terminate()
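# Each per-agent batch above gets an 'opponent_actions' entry built with
# np.delete followed by np.reshape. A small illustration of that step for the
# two-agent case (shapes are made up; the opponent action dimension is the
# concatenated action dimension of all opponents).
import numpy as np

agent_num, batch_size, action_dim = 2, 4, 3
# actions_n[i] holds agent i's sampled actions, shape (batch_size, action_dim)
actions_n = np.random.randn(agent_num, batch_size, action_dim)

i = 0  # build agent 0's view of its opponent
opponent_action_dim = (agent_num - 1) * action_dim
opponent_actions = np.reshape(np.delete(actions_n, i, 0), (-1, opponent_action_dim))
print(opponent_actions.shape)   # (4, 3): one row of opponent actions per batch entry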
def train(self): """ CG: the function that conducts ensemble training. :return: """ # Set up parameters for the training process. self._n_epochs = self._base_ac_params['n_epochs'] self._epoch_length = self._base_ac_params['epoch_length'] self._n_train_repeat = self._base_ac_params['n_train_repeat'] self._n_initial_exploration_steps = self._base_ac_params[ 'n_initial_exploration_steps'] self._eval_render = self._base_ac_params['eval_render'] self._eval_n_episodes = self._base_ac_params['eval_n_episodes'] self._eval_deterministic = self._base_ac_params['eval_deterministic'] # Set up the evaluation environment. if self._eval_n_episodes > 0: with tf.variable_scope("low_level_policy", reuse=True): self._eval_env = deep_clone(self._env) # Set up the tensor flow session. self._sess = tf_utils.get_default_session() # Import required libraries for training. import random import math import operator import numpy as np # Initialize the sampler. alg_ins = random.choice(self._alg_instances) self._sampler.initialize(self._env, alg_ins[0].policy, self._pool) # Perform the training/evaluation process. num_episode = 0. with self._sess.as_default(): gt.rename_root('RLAlgorithm') gt.reset() gt.set_def_unique(False) for epoch in gt.timed_for(range(self._n_epochs + 1), save_itrs=True): logger.push_prefix('Epoch #%d | ' % epoch) for t in range(self._epoch_length): isEpisodeEnd = self._sampler.sample() # If an episode is ended, we need to update performance statistics for each AC instance and # pick randomly another AC instance for next episode of exploration. if isEpisodeEnd: num_episode = num_episode + 1. alg_ins[1] = 0.9 * alg_ins[ 1] + 0.1 * self._sampler._last_path_return alg_ins[2] = alg_ins[2] + 1. if self._use_ucb: # Select an algorithm instance based on UCB. selected = False for ains in self._alg_instances: if ains[2] < 1.: alg_ins = ains selected = True break else: ains[3] = ains[1] + math.sqrt( 2.0 * math.log(num_episode) / ains[2]) if not selected: alg_ins = max(self._alg_instances, key=operator.itemgetter(3)) else: # Select an algorithm instance uniformly at random. alg_ins = random.choice(self._alg_instances) self._sampler.set_policy(alg_ins[0].policy) if not self._sampler.batch_ready(): continue gt.stamp('sample') # Perform training over all AC instances. for i in range(self._n_train_repeat): batch = self._sampler.random_batch() for ains in self._alg_instances: ains[0]._do_training(iteration=t + epoch * self._epoch_length, batch=batch) gt.stamp('train') # Perform evaluation after one full epoch of training is completed. if self._eval_n_episodes < 1: continue if self._evaluation_strategy == 'ensemble': # Use a whole ensemble of AC instances for evaluation. paths = rollouts(self._eval_env, self, self._sampler._max_path_length, self._eval_n_episodes) elif self._evaluation_strategy == 'best-policy': # Choose the AC instance with the highest observed performance so far for evaluation. 
eval_alg_ins = max(self._alg_instances, key=operator.itemgetter(1)) with eval_alg_ins[0].policy.deterministic( self._eval_deterministic): paths = rollouts(self._eval_env, eval_alg_ins[0].policy, self._sampler._max_path_length, self._eval_n_episodes) else: paths = None if paths is not None: total_returns = [path['rewards'].sum() for path in paths] episode_lengths = [len(p['rewards']) for p in paths] logger.record_tabular('return-average', np.mean(total_returns)) logger.record_tabular('return-min', np.min(total_returns)) logger.record_tabular('return-max', np.max(total_returns)) logger.record_tabular('return-std', np.std(total_returns)) logger.record_tabular('episode-length-avg', np.mean(episode_lengths)) logger.record_tabular('episode-length-min', np.min(episode_lengths)) logger.record_tabular('episode-length-max', np.max(episode_lengths)) logger.record_tabular('episode-length-std', np.std(episode_lengths)) self._eval_env.log_diagnostics(paths) if self._eval_render: self._eval_env.render(paths) # Produce log info after each episode of training and evaluation. times_itrs = gt.get_times().stamps.itrs eval_time = times_itrs['eval'][-1] if epoch > 1 else 0 total_time = gt.get_times().total logger.record_tabular('time-train', times_itrs['train'][-1]) logger.record_tabular('time-eval', eval_time) logger.record_tabular('time-sample', times_itrs['sample'][-1]) logger.record_tabular('time-total', total_time) logger.record_tabular('epoch', epoch) self._sampler.log_diagnostics() logger.dump_tabular(with_prefix=False) logger.pop_prefix() gt.stamp('eval') # Terminate the sampler after the training process is completed. self._sampler.terminate()
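# The ensemble trainer above tracks [algorithm, avg_return, episode_count,
# ucb_score] per AC instance and picks the next exploration policy by UCB.
# A standalone sketch of that selection rule; the 4-element list layout
# mirrors self._alg_instances as used above.
import math

def select_instance_ucb(instances, num_episodes):
    # Try every instance at least once before comparing UCB scores.
    for instance in instances:
        if instance[2] < 1.0:
            return instance
    for instance in instances:
        instance[3] = instance[1] + math.sqrt(2.0 * math.log(num_episodes) / instance[2])
    return max(instances, key=lambda instance: instance[3])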
def _train(self, env, policy, pool, initial_exploration_policy=None): """Return a generator that performs RL training. Args: env (`SoftlearningEnv`): Environment used for training. policy (`Policy`): Policy used for training initial_exploration_policy ('Policy'): Policy used for exploration If None, then all exploration is done using policy pool (`PoolBase`): Sample pool to add samples to """ if not self._training_started: self._init_training() self._initial_exploration_hook(env, initial_exploration_policy, pool) self.sampler.initialize(env, policy, pool) evaluation_env = env.copy() if self._eval_n_episodes else None gt.reset_root() gt.rename_root('RLAlgorithm') gt.set_def_unique(False) self._training_before_hook() for self._epoch in gt.timed_for(range(self._epoch, self._n_epochs)): self._epoch_before_hook() gt.stamp('epoch_before_hook') start_samples = self.sampler._total_samples for i in count(): samples_now = self.sampler._total_samples self._timestep = samples_now - start_samples if samples_now >= start_samples + self._epoch_length: break self._timestep_before_hook() gt.stamp('timestep_before_hook') self._do_sampling(timestep=self._total_timestep) gt.stamp('sample') if self.ready_to_train: self._do_training_repeats(timestep=self._total_timestep) gt.stamp('train') self._timestep_after_hook() gt.stamp('timestep_after_hook') training_paths = self.sampler.get_last_n_paths( math.ceil(self._epoch_length / self.sampler._max_path_length)) gt.stamp('training_paths') evaluation_paths = self._evaluation_paths(policy, evaluation_env) gt.stamp('evaluation_paths') training_metrics = self._evaluate_rollouts(training_paths, env) gt.stamp('training_metrics') if evaluation_paths: evaluation_metrics = self._evaluate_rollouts( evaluation_paths, evaluation_env) gt.stamp('evaluation_metrics') else: evaluation_metrics = {} self._epoch_after_hook(training_paths) gt.stamp('epoch_after_hook') sampler_diagnostics = self.sampler.get_diagnostics() diagnostics = self.get_diagnostics( iteration=self._total_timestep, batch=self._evaluation_batch(), training_paths=training_paths, evaluation_paths=evaluation_paths) time_diagnostics = gt.get_times().stamps.itrs diagnostics.update( OrderedDict(( *((f'evaluation/{key}', evaluation_metrics[key]) for key in sorted(evaluation_metrics.keys())), *((f'training/{key}', training_metrics[key]) for key in sorted(training_metrics.keys())), *((f'times/{key}', time_diagnostics[key][-1]) for key in sorted(time_diagnostics.keys())), *((f'sampler/{key}', sampler_diagnostics[key]) for key in sorted(sampler_diagnostics.keys())), ('epoch', self._epoch), ('timestep', self._timestep), ('timesteps_total', self._total_timestep), ('train-steps', self._num_train_steps), ))) if self._eval_render_mode is not None and hasattr( evaluation_env, 'render_rollouts'): # TODO(hartikainen): Make this consistent such that there's no # need for the hasattr check. env.render_rollouts(evaluation_paths) yield diagnostics self.sampler.terminate() self._training_after_hook()
def _train(self): """Return a generator that runs the standard RL loop.""" training_environment = self._training_environment evaluation_environment = self._evaluation_environment policy = self._policy gt.reset_root() gt.rename_root('RLAlgorithm') gt.set_def_unique(False) self._training_before_hook() for self._epoch in gt.timed_for(range(self._epoch, self._n_epochs)): self._epoch_before_hook() gt.stamp('epoch_before_hook') update_diagnostics = [] start_samples = self.sampler._total_samples for i in count(): samples_now = self.sampler._total_samples self._timestep = samples_now - start_samples if (samples_now >= start_samples + self._epoch_length and self.ready_to_train): break self._timestep_before_hook() gt.stamp('timestep_before_hook') self._do_sampling(timestep=self._total_timestep) gt.stamp('sample') if self.ready_to_train: repeat_diagnostics = self._do_training_repeats( timestep=self._total_timestep) if repeat_diagnostics is not None: update_diagnostics.append(repeat_diagnostics) gt.stamp('train') self._timestep_after_hook() gt.stamp('timestep_after_hook') update_diagnostics = tree.map_structure(lambda *d: np.mean(d), *update_diagnostics) training_paths = self.sampler.get_last_n_paths( math.ceil(self._epoch_length / self.sampler._max_path_length)) gt.stamp('training_paths') """ evaluation_paths = self._evaluation_paths( policy, evaluation_environment) gt.stamp('evaluation_paths') training_metrics = self._evaluate_rollouts( training_paths, training_environment, self._total_timestep, evaluation_type='train') gt.stamp('training_metrics') if False: # evaluation_paths: evaluation_metrics = self._evaluate_rollouts( evaluation_paths, evaluation_environment, self._total_timestep, evaluation_type='evaluation') gt.stamp('evaluation_metrics') else: evaluation_metrics = {} """ self._epoch_after_hook(training_paths) gt.stamp('epoch_after_hook') sampler_diagnostics = self.sampler.get_diagnostics() diagnostics = self.get_diagnostics(iteration=self._total_timestep, batch=self._evaluation_batch(), training_paths=training_paths, evaluation_paths=None) time_diagnostics = { key: times[-1] for key, times in gt.get_times().stamps.itrs.items() } # TODO(hartikainen/tf2): Fix the naming of training/update # diagnostics/metric diagnostics.update(( ('evaluation', None), ('training', None), ('update', update_diagnostics), ('times', time_diagnostics), ('sampler', sampler_diagnostics), ('epoch', self._epoch), ('timestep', self._timestep), ('total_timestep', self._total_timestep), ('num_train_steps', self._num_train_steps), )) yield diagnostics self.sampler.terminate() self._training_after_hook() yield {'done': True, **diagnostics}