def setup(self,
          algo,
          env,
          sampler_cls=None,
          sampler_args=None,
          n_workers=psutil.cpu_count(logical=False),
          worker_class=None,
          worker_args=None):
    """Set up trainer and sessions for algorithm and environment.

    This method saves algo and env within trainer and creates a sampler,
    and initializes all uninitialized variables in the session.

    Note:
        After setup() is called, all variables in the session should have
        been initialized. setup() respects existing values in the session,
        so policy weights can be loaded before setup().

    Args:
        algo (RLAlgorithm): An algorithm instance.
        env (Environment): An environment instance.
        sampler_cls (type): A class which implements :class:`Sampler`.
        sampler_args (dict): Arguments to be passed to sampler constructor.
        n_workers (int): The number of workers the sampler should use.
        worker_class (type): Type of worker the sampler should use.
        worker_args (dict or None): Additional arguments that should be
            passed to the worker.

    """
    self.initialize_tf_vars()
    logger.log(self.sess.graph)
    super().setup(algo, env, sampler_cls, sampler_args, n_workers,
                  worker_class, worker_args)
def evaluate(self, algo, test_rollouts_per_task=None):
    """Evaluate the Meta-RL algorithm on the test tasks.

    Args:
        algo (metarl.np.algos.MetaRLAlgorithm): The algorithm to evaluate.
        test_rollouts_per_task (int or None): Number of rollouts per task.

    """
    if test_rollouts_per_task is None:
        test_rollouts_per_task = self._n_test_rollouts
    adapted_trajectories = []
    logger.log('Sampling for adaptation and meta-testing...')
    for env_up in self._test_task_sampler.sample(self._n_test_tasks):
        policy = algo.get_exploration_policy()
        traj = TrajectoryBatch.concatenate(*[
            self._test_sampler.obtain_samples(self._eval_itr, 1, policy,
                                              env_up)
            for _ in range(self._n_exploration_traj)
        ])
        adapted_policy = algo.adapt_policy(policy, traj)
        adapted_traj = self._test_sampler.obtain_samples(
            self._eval_itr, test_rollouts_per_task * self._max_path_length,
            adapted_policy)
        adapted_trajectories.append(adapted_traj)
    logger.log('Finished meta-testing...')
    with tabular.prefix(self._prefix + '/' if self._prefix else ''):
        log_multitask_performance(
            self._eval_itr,
            TrajectoryBatch.concatenate(*adapted_trajectories),
            getattr(algo, 'discount', 1.0),
            task_names=self._test_task_names)
    self._eval_itr += 1
def save(self, epoch, paths=None):
    """Save snapshot of current batch.

    Args:
        epoch (int): Index of iteration (epoch).
        paths (dict): Batch of samples after preprocessing. If None,
            no paths will be logged to the snapshot.

    """
    if not self.has_setup:
        raise Exception('Use setup() to setup runner before saving.')

    logger.log('Saving snapshot...')

    params = dict()
    # Save arguments
    params['setup_args'] = self._setup_args
    params['train_args'] = self.train_args
    # Save states
    params['env'] = self.env
    # TODO: add this back
    # params['algo'] = self.algo
    if paths:
        params['paths'] = paths
    params['last_epoch'] = epoch
    self._snapshotter.save_itr_params(epoch, params)

    logger.log('Saved')
def train_once(self, itr, paths):
    """Perform one step of policy optimization given one batch of samples.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        float: The average return of the epoch cycle.

    """
    paths = self.process_samples(itr, paths)

    epoch = itr // self.n_samples
    i_sample = itr - epoch * self.n_samples
    tabular.record('Epoch', epoch)
    tabular.record('# Sample', i_sample)

    # -- Stage: Process path
    rtn = paths['average_return']
    self.all_returns.append(paths['average_return'])

    # -- Stage: Update policy distribution.
    if (itr + 1) % self.n_samples == 0:
        avg_rtns = np.array(self.all_returns)
        best_inds = np.argsort(-avg_rtns)[:self.n_best]
        best_params = np.array(self.all_params)[best_inds]

        # MLE of normal distribution
        self.cur_mean = best_params.mean(axis=0)
        self.cur_std = best_params.std(axis=0)
        self.policy.set_param_values(self.cur_mean)

        # Clear for next epoch
        rtn = max(self.all_returns)
        self.all_returns.clear()
        self.all_params.clear()

    # -- Stage: Generate a new policy for next path sampling
    self.cur_params = self.sample_params(itr)
    self.all_params.append(self.cur_params.copy())
    self.policy.set_param_values(self.cur_params)

    logger.log(tabular)
    return rtn
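# --------------------------------------------------------------------------
# Illustrative sketch (not part of the library): the cross-entropy method
# update performed by train_once() above, written as a self-contained NumPy
# loop. Names such as `objective`, `n_samples`, and `n_best` are placeholders
# chosen for this example; the real algorithm evaluates each parameter vector
# by rolling out the policy and averaging returns.
# --------------------------------------------------------------------------
import numpy as np


def cem_sketch(objective, init_mean, n_epochs=10, n_samples=20, n_best=5,
               init_std=1.0):
    """Fit a diagonal Gaussian over parameters by refitting to the elites."""
    cur_mean = np.asarray(init_mean, dtype=float)
    cur_std = np.full_like(cur_mean, init_std)
    for _ in range(n_epochs):
        # Sample a population of parameter vectors from the current Gaussian.
        params = cur_mean + cur_std * np.random.randn(n_samples,
                                                      cur_mean.size)
        returns = np.array([objective(p) for p in params])
        # Keep the best-performing samples (the "elites").
        best_inds = np.argsort(-returns)[:n_best]
        elites = params[best_inds]
        # MLE of the normal distribution over the elites, as in train_once().
        cur_mean = elites.mean(axis=0)
        cur_std = elites.std(axis=0)
    return cur_mean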
def train_once(self, itr, paths):
    """Perform one step of policy optimization given one batch of samples.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        float: The average return of the epoch cycle.

    """
    paths = self.process_samples(itr, paths)

    epoch = itr // self.n_samples
    i_sample = itr - epoch * self.n_samples
    tabular.record('Epoch', epoch)
    tabular.record('# Sample', i_sample)

    rtn = paths['average_return']
    self.all_returns.append(paths['average_return'])

    if (itr + 1) % self.n_samples == 0:
        avg_rtns = np.array(self.all_returns)
        self.es.tell(self.all_params, -avg_rtns)
        self.policy.set_param_values(self.es.best.get()[0])

        # Clear for next epoch
        rtn = max(self.all_returns)
        self.all_returns.clear()
        self.all_params = self._sample_params()

    self.cur_params = self.all_params[(i_sample + 1) % self.n_samples]
    self.policy.set_param_values(self.cur_params)

    logger.log(tabular)
    return rtn
def _train_once(self, itr):
    """Perform one iteration of training.

    Args:
        itr (int): Iteration number.

    """
    for grad_step_timer in range(self._grad_steps_per_env_step):
        if (self._replay_buffer.n_transitions_stored >=
                self._min_buffer_size):
            # Sample from buffer
            samples = self._replay_buffer.sample_transitions(
                self._buffer_batch_size)
            samples = as_torch_dict(samples)

            # Optimize
            qf_loss, y, q, policy_loss = torch_to_np(
                self._optimize_policy(samples, grad_step_timer))

            self._episode_policy_losses.append(policy_loss)
            self._episode_qf_losses.append(qf_loss)
            self._epoch_ys.append(y)
            self._epoch_qs.append(q)

    if itr % self._steps_per_epoch == 0:
        logger.log('Training finished')
        epoch = itr // self._steps_per_epoch

        if (self._replay_buffer.n_transitions_stored >=
                self._min_buffer_size):
            tabular.record('Epoch', epoch)
            self._log_statistics()
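# --------------------------------------------------------------------------
# Illustrative sketch (not part of the library): a minimal uniform replay
# buffer with the two operations _train_once() above relies on -- storing
# transitions and sampling a random mini-batch once enough are stored. The
# class and field names are placeholders for this example.
# --------------------------------------------------------------------------
import numpy as np


class MinimalReplayBuffer:
    """Fixed-capacity ring buffer over keyword-style transitions."""

    def __init__(self, capacity):
        self._capacity = capacity
        self._storage = []
        self._next_idx = 0

    @property
    def n_transitions_stored(self):
        return len(self._storage)

    def add_transition(self, **transition):
        if len(self._storage) < self._capacity:
            self._storage.append(transition)
        else:
            # Overwrite the oldest transition once the buffer is full.
            self._storage[self._next_idx] = transition
        self._next_idx = (self._next_idx + 1) % self._capacity

    def sample_transitions(self, batch_size):
        idxs = np.random.randint(0, len(self._storage), size=batch_size)
        batch = [self._storage[i] for i in idxs]
        # Stack each field into an array keyed by name.
        return {key: np.array([t[key] for t in batch]) for key in batch[0]}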
def train_once(self, itr, paths):
    """Perform one iteration of training.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        float: Average undiscounted return over the stored episodes.

    """
    paths = self.process_samples(itr, paths)
    epoch = itr // self.n_epoch_cycles

    self.episode_rewards.extend([
        path for path, complete in zip(paths['undiscounted_returns'],
                                       paths['complete']) if complete
    ])
    self.success_history.extend([
        path for path, complete in zip(paths['success_history'],
                                       paths['complete']) if complete
    ])

    last_average_return = np.mean(self.episode_rewards)
    for _ in range(self.n_train_steps):
        if self.replay_buffer.n_transitions_stored >= self.min_buffer_size:
            self.evaluate = True
            samples = self.replay_buffer.sample(self.buffer_batch_size)
            qf_loss, y, q, policy_loss = torch_to_np(
                self.optimize_policy(itr, samples))

            self.episode_policy_losses.append(policy_loss)
            self.episode_qf_losses.append(qf_loss)
            self.epoch_ys.append(y)
            self.epoch_qs.append(q)

    if itr % self.n_epoch_cycles == 0:
        logger.log('Training finished')

        if self.evaluate:
            tabular.record('Epoch', epoch)
            tabular.record('AverageReturn', np.mean(self.episode_rewards))
            tabular.record('StdReturn', np.std(self.episode_rewards))
            tabular.record('Policy/AveragePolicyLoss',
                           np.mean(self.episode_policy_losses))
            tabular.record('QFunction/AverageQFunctionLoss',
                           np.mean(self.episode_qf_losses))
            tabular.record('QFunction/AverageQ', np.mean(self.epoch_qs))
            tabular.record('QFunction/MaxQ', np.max(self.epoch_qs))
            tabular.record('QFunction/AverageAbsQ',
                           np.mean(np.abs(self.epoch_qs)))
            tabular.record('QFunction/AverageY', np.mean(self.epoch_ys))
            tabular.record('QFunction/MaxY', np.max(self.epoch_ys))
            tabular.record('QFunction/AverageAbsY',
                           np.mean(np.abs(self.epoch_ys)))
            tabular.record('AverageSuccessRate',
                           np.mean(self.success_history))

        if not self.smooth_return:
            self.episode_rewards = []
            self.episode_policy_losses = []
            self.episode_qf_losses = []
            self.epoch_ys = []
            self.epoch_qs = []

        self.success_history.clear()

    return last_average_return
def optimize_policy(self, itr, samples_data):
    """Optimize the policy using the samples.

    Args:
        itr (int): Iteration number.
        samples_data (dict): Processed sample data.

    Returns:
        dict: Snapshot data for the current iteration.

    """
    policy_opt_input_values = self._policy_opt_input_values(samples_data)

    # Train policy network
    logger.log('Computing loss before')
    loss_before = self.optimizer.loss(policy_opt_input_values)
    logger.log('Computing KL before')
    policy_kl_before = self.f_policy_kl(*policy_opt_input_values)
    logger.log('Optimizing')
    self.optimizer.optimize(policy_opt_input_values)
    logger.log('Computing KL after')
    policy_kl = self.f_policy_kl(*policy_opt_input_values)
    logger.log('Computing loss after')
    loss_after = self.optimizer.loss(policy_opt_input_values)
    tabular.record('{}/LossBefore'.format(self.policy.name), loss_before)
    tabular.record('{}/LossAfter'.format(self.policy.name), loss_after)
    tabular.record('{}/dLoss'.format(self.policy.name),
                   loss_before - loss_after)
    tabular.record('{}/KLBefore'.format(self.policy.name),
                   policy_kl_before)
    tabular.record('{}/KL'.format(self.policy.name), policy_kl)
    pol_ent = self.f_policy_entropy(*policy_opt_input_values)
    tabular.record('{}/Entropy'.format(self.policy.name), pol_ent)

    num_traj = self.batch_size // self.max_path_length
    actions = samples_data['actions'][:num_traj, ...]
    histogram = Histogram(actions)
    tabular.record('{}/Actions'.format(self.policy.name), histogram)

    self._fit_baseline(samples_data)

    return self.get_itr_snapshot(itr, samples_data)
def optimize_policy(self, itr, samples_data):
    """Optimize policy.

    Args:
        itr (int): Iteration number.
        samples_data (dict): Processed sample data.
            See process_samples() for details.

    """
    policy_opt_input_values = self._policy_opt_input_values(samples_data)

    # Train policy network
    logger.log('Computing loss before')
    loss_before = self._optimizer.loss(policy_opt_input_values)
    logger.log('Computing KL before')
    policy_kl_before = self._f_policy_kl(*policy_opt_input_values)
    logger.log('Optimizing')
    self._optimizer.optimize(policy_opt_input_values)
    logger.log('Computing KL after')
    policy_kl = self._f_policy_kl(*policy_opt_input_values)
    logger.log('Computing loss after')
    loss_after = self._optimizer.loss(policy_opt_input_values)
    tabular.record('{}/LossBefore'.format(self.policy.name), loss_before)
    tabular.record('{}/LossAfter'.format(self.policy.name), loss_after)
    tabular.record('{}/dLoss'.format(self.policy.name),
                   loss_before - loss_after)
    tabular.record('{}/KLBefore'.format(self.policy.name),
                   policy_kl_before)
    tabular.record('{}/KL'.format(self.policy.name), policy_kl)
    pol_ent = self._f_policy_entropy(*policy_opt_input_values)
    tabular.record('{}/Entropy'.format(self.policy.name), np.mean(pol_ent))

    self._fit_baseline(samples_data)
def _fit_baseline_with_data(self, samples_data):
    """Update baselines from samples.

    Args:
        samples_data (dict): Processed sample data.
            See garage.tf.paths_to_tensors() for details.

    """
    policy_opt_input_values = self._policy_opt_input_values(samples_data)

    # Augment reward from baselines
    rewards_tensor = self._f_rewards(*policy_opt_input_values)
    returns_tensor = self._f_returns(*policy_opt_input_values)
    returns_tensor = np.squeeze(returns_tensor, -1)

    paths = samples_data['paths']
    valids = samples_data['valids']

    # Recompute parts of samples_data
    aug_rewards = []
    aug_returns = []
    for rew, ret, val, path in zip(rewards_tensor, returns_tensor, valids,
                                   paths):
        path['rewards'] = rew[val.astype(bool)]
        path['returns'] = ret[val.astype(bool)]
        aug_rewards.append(path['rewards'])
        aug_returns.append(path['returns'])
    samples_data['rewards'] = np_tensor_utils.pad_tensor_n(
        aug_rewards, self.max_path_length)
    samples_data['returns'] = np_tensor_utils.pad_tensor_n(
        aug_returns, self.max_path_length)

    # Fit baseline
    logger.log('Fitting baseline...')
    self._baseline.fit(paths)
def test_one_folder(self, meta_train_dir, itrs):
    """Run meta-testing on saved snapshots from one meta-training run.

    Args:
        meta_train_dir (str): Directory containing meta-training snapshots.
        itrs (list[int]): Snapshot iterations (epochs) to evaluate.

    """
    snapshot_config = SnapshotConfig(snapshot_dir=meta_train_dir,
                                     snapshot_mode='all',
                                     snapshot_gap=1)
    runner = LocalRunner(snapshot_config=snapshot_config)
    meta_sampler = AllSetTaskSampler(self.meta_task_cls)
    runner.restore(meta_train_dir)

    meta_evaluator = MetaEvaluator(
        runner,
        test_task_sampler=meta_sampler,
        max_path_length=self.max_path_length,
        n_test_tasks=meta_sampler.n_tasks,
        n_exploration_traj=self.adapt_rollout_per_task,
        prefix='')

    for itr in itrs:
        log_filename = os.path.join(meta_train_dir,
                                    'meta-test-itr_{}.csv'.format(itr))
        logger.add_output(CsvOutput(log_filename))
        logger.log('Writing into {}'.format(log_filename))

        runner.restore(meta_train_dir, from_epoch=itr)
        meta_evaluator.evaluate(runner._algo, self.test_rollout_per_task)
        tabular.record('Iteration', runner._stats.total_epoch)
        tabular.record('TotalEnvSteps', runner._stats.total_env_steps)
        logger.log(tabular)

        logger.dump_output_type(CsvOutput)
        logger.remove_output_type(CsvOutput)
def save(self, epoch, paths=None):
    """Save snapshot of current batch.

    Args:
        epoch (int): Index of iteration (epoch).
        paths (dict): Batch of samples after preprocessing. If None,
            no paths will be logged to the snapshot.

    """
    assert self.has_setup

    logger.log('Saving snapshot...')

    params = dict()
    # Save arguments
    params['setup_args'] = self.setup_args
    params['train_args'] = self.train_args
    # Save states
    params['env'] = self.env
    params['algo'] = self.algo
    if paths:
        params['paths'] = paths
    params['last_epoch'] = epoch
    snapshotter.save_itr_params(epoch, params)

    logger.log('Saved')
def _train_once(self, itr, episodes):
    """Perform one step of policy optimization given one batch of samples.

    Args:
        itr (int): Iteration number.
        episodes (EpisodeBatch): Batch of episodes.

    Returns:
        numpy.float64: Average return.

    """
    # -- Stage: Calculate and pad baselines
    obs = [
        self._baseline.predict({'observations': obs})
        for obs in episodes.observations_list
    ]
    baselines = episodes.pad_to_last(np.concatenate(obs))

    # -- Stage: Run and calculate performance of the algorithm
    undiscounted_returns = log_performance(itr,
                                           episodes,
                                           discount=self._discount)
    self._episode_reward_mean.extend(undiscounted_returns)
    tabular.record('Extras/EpisodeRewardMean',
                   np.mean(self._episode_reward_mean))

    logger.log('Optimizing policy...')
    self._optimize_policy(episodes, baselines)

    return np.mean(undiscounted_returns)
def save(self, epoch):
    """Save snapshot of current batch.

    Args:
        epoch (int): Epoch.

    Raises:
        NotSetupError: if save() is called before the runner is set up.

    """
    if not self._has_setup:
        raise NotSetupError('Use setup() to setup runner before saving.')

    logger.log('Saving snapshot...')

    params = dict()
    # Save arguments
    params['setup_args'] = self._setup_args
    params['train_args'] = self._train_args
    params['stats'] = self._stats
    # Save states
    params['env'] = self._env
    params['algo'] = self._algo
    params['n_workers'] = self._n_workers
    params['worker_class'] = self._worker_class
    params['worker_args'] = self._worker_args

    self._snapshotter.save_itr_params(epoch, params)

    logger.log('Saved')
def populate_task(env, policy, scope=None):
    """Set each worker's env and policy.

    Parameters
    ----------
    env : :py:class:`ast_toolbox.envs.ASTEnv`
        The environment.
    policy : :py:class:`garage.tf.policies.Policy`
        The policy.
    scope : str
        Scope for identifying the algorithm. Must be specified if running
        multiple algorithms simultaneously, each using different environments
        and policies.
    """
    logger.log('Populating workers...')
    if singleton_pool.n_parallel > 1:
        singleton_pool.run_each(
            _worker_populate_task,
            [(pickle.dumps(env), pickle.dumps(policy), scope)] *
            singleton_pool.n_parallel)
    else:
        # avoid unnecessary copying
        g = _get_scoped_g(singleton_pool.G, scope)
        g.env = env
        g.policy = policy
    logger.log('Populated')
def _optimize_policy(self, itr, episodes, baselines, embed_eps,
                     embed_ep_infos):
    """Optimize policy.

    Args:
        itr (int): Iteration number.
        episodes (EpisodeBatch): Batch of episodes.
        baselines (np.ndarray): Baseline predictions.
        embed_eps (np.ndarray): Embedding episodes.
        embed_ep_infos (dict): Embedding distribution information.

    """
    del itr

    policy_opt_input_values = self._policy_opt_input_values(
        episodes, baselines, embed_eps)
    inference_opt_input_values = self._inference_opt_input_values(
        episodes, embed_eps, embed_ep_infos)

    self._train_policy_and_encoder_networks(policy_opt_input_values)
    self._train_inference_network(inference_opt_input_values)

    # paths = samples_data['paths']
    fit_paths = self._evaluate(policy_opt_input_values, episodes,
                               baselines, embed_ep_infos)
    self._visualize_distribution()

    logger.log('Fitting baseline...')
    self._baseline.fit(fit_paths)

    self._old_policy.parameters = self.policy.parameters
    self._old_policy.encoder.model.parameters = (
        self.policy.encoder.model.parameters)
    self._old_inference.model.parameters = self._inference.model.parameters
def test_meta_evaluator():
    """Smoke-test MetaEvaluator end to end and check the logged CSV."""
    set_seed(100)
    tasks = SetTaskSampler(PointEnv, wrapper=set_length)
    max_episode_length = 200
    with tempfile.TemporaryDirectory() as log_dir_name:
        trainer = Trainer(
            SnapshotConfig(snapshot_dir=log_dir_name,
                           snapshot_mode='last',
                           snapshot_gap=1))
        env = PointEnv(max_episode_length=max_episode_length)
        algo = OptimalActionInference(env=env,
                                      max_episode_length=max_episode_length)
        trainer.setup(algo, env)
        meta_eval = MetaEvaluator(test_task_sampler=tasks, n_test_tasks=10)
        log_file = tempfile.NamedTemporaryFile()
        csv_output = CsvOutput(log_file.name)
        logger.add_output(csv_output)
        meta_eval.evaluate(algo)
        logger.log(tabular)
        meta_eval.evaluate(algo)
        logger.log(tabular)
        logger.dump_output_type(CsvOutput)
        logger.remove_output_type(CsvOutput)
        with open(log_file.name, 'r') as file:
            rows = list(csv.DictReader(file))
        assert len(rows) == 2
        assert float(
            rows[0]['MetaTest/__unnamed_task__/TerminationRate']) < 1.0
        assert float(rows[0]['MetaTest/__unnamed_task__/Iteration']) == 0
        assert (float(rows[0]['MetaTest/__unnamed_task__/MaxReturn']) >=
                float(rows[0]['MetaTest/__unnamed_task__/AverageReturn']))
        assert (float(rows[0]['MetaTest/__unnamed_task__/AverageReturn']) >=
                float(rows[0]['MetaTest/__unnamed_task__/MinReturn']))
        assert float(rows[1]['MetaTest/__unnamed_task__/Iteration']) == 1
def optimize_policy(self, itr, samples_data):
    """Optimize policy.

    Args:
        itr (int): Iteration number.
        samples_data (dict): Processed sample data.
            See process_samples() for details.

    """
    del itr

    policy_opt_input_values = self._policy_opt_input_values(samples_data)
    inference_opt_input_values = self._inference_opt_input_values(
        samples_data)

    self._train_policy_and_encoder_networks(policy_opt_input_values)
    self._train_inference_network(inference_opt_input_values)

    paths = samples_data['paths']
    self.evaluate(policy_opt_input_values, samples_data)
    self.visualize_distribution()

    logger.log('Fitting baseline...')
    self._baseline.fit(paths)

    self._old_policy.parameters = self.policy.parameters
    self._old_policy.encoder.model.parameters = (
        self.policy.encoder.model.parameters)
    self._old_inference.model.parameters = self._inference.model.parameters
def train_once(self, itr, paths):
    """Perform one step of policy optimization given one batch of samples.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        float: The average return of the processed samples.

    """
    paths = self.process_samples(itr, paths)
    self.log_diagnostics(paths)
    logger.log('Optimizing policy...')
    self.optimize_policy(itr, paths)
    return paths['average_return']
def optimize_policy(self, itr, samples_data):
    """Optimize policy.

    Args:
        itr (int): Iteration number.
        samples_data (dict): Processed sample data.
            See process_samples() for details.

    """
    policy_opt_input_values = self._policy_opt_input_values(samples_data)

    # Train policy network
    logger.log('Computing loss before')
    loss_before = self.optimizer.loss(policy_opt_input_values)
    logger.log('Computing KL before')
    policy_kl_before = self.f_policy_kl(*policy_opt_input_values)
    logger.log('Optimizing')
    self.optimizer.optimize(policy_opt_input_values)
    logger.log('Computing KL after')
    policy_kl = self.f_policy_kl(*policy_opt_input_values)
    logger.log('Computing loss after')
    loss_after = self.optimizer.loss(policy_opt_input_values)
    tabular.record('{}/LossBefore'.format(self.policy.name), loss_before)
    tabular.record('{}/LossAfter'.format(self.policy.name), loss_after)
    tabular.record('{}/dLoss'.format(self.policy.name),
                   loss_before - loss_after)
    tabular.record('{}/KLBefore'.format(self.policy.name),
                   policy_kl_before)
    tabular.record('{}/KL'.format(self.policy.name), policy_kl)
    pol_ent = self.f_policy_entropy(*policy_opt_input_values)
    tabular.record('{}/Entropy'.format(self.policy.name), np.mean(pol_ent))

    self._fit_baseline(samples_data)
def _fit_baseline_with_data(self, episodes, baselines):
    """Update baselines from samples.

    Args:
        episodes (EpisodeBatch): Batch of episodes.
        baselines (np.ndarray): Baseline predictions.

    Returns:
        np.ndarray: Augmented returns.

    """
    policy_opt_input_values = self._policy_opt_input_values(
        episodes, baselines)

    returns_tensor = self._f_returns(*policy_opt_input_values)
    returns_tensor = np.squeeze(returns_tensor, -1)

    paths = []
    valids = episodes.valids
    observations = episodes.padded_observations

    # Compute returns
    for ret, val, ob in zip(returns_tensor, valids, observations):
        returns = ret[val.astype(bool)]
        obs = ob[val.astype(bool)]
        paths.append(dict(observations=obs, returns=returns))

    # Fit baseline
    logger.log('Fitting baseline...')
    self._baseline.fit(paths)
    return returns_tensor
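# --------------------------------------------------------------------------
# Illustrative sketch (not part of the library): how a `valids` mask strips
# padding from per-episode tensors, as done in _fit_baseline_with_data()
# above. The arrays below are made up for this example.
# --------------------------------------------------------------------------
import numpy as np

# Two episodes padded to length 4; the second one only has 2 real steps.
padded_returns = np.array([[1.0, 0.9, 0.8, 0.7],
                           [2.0, 1.5, 0.0, 0.0]])
valids = np.array([[1, 1, 1, 1],
                   [1, 1, 0, 0]])

unpadded = [ret[val.astype(bool)]
            for ret, val in zip(padded_returns, valids)]
# unpadded[0] -> [1.0, 0.9, 0.8, 0.7]; unpadded[1] -> [2.0, 1.5]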
def train_once(self, itr, paths):
    """Perform one step of policy optimization given one batch of samples.

    Args:
        itr (int): Iteration number.
        paths (dict): Processed sample data, with key 'average_return'.

    Returns:
        float: The average return of the epoch cycle.

    """
    epoch = itr // self.n_samples
    i_sample = itr - epoch * self.n_samples
    tabular.record('Epoch', epoch)
    tabular.record('# Sample', i_sample)

    rtn = paths['average_return']
    self.all_returns.append(paths['average_return'])

    if (itr + 1) % self.n_samples == 0:
        avg_rtns = np.array(self.all_returns)
        self.es.tell(self.all_params, -avg_rtns)
        self.policy.set_param_values(self.es.result()[0])

        # Clear for next epoch
        rtn = max(self.all_returns)
        self.all_returns.clear()
        self.all_params = self.sample_params()

    self.cur_params = self.all_params[(i_sample + 1) % self.n_samples]
    self.policy.set_param_values(self.cur_params)

    logger.log(tabular)
    return rtn
def worker_init_envs(g, alloc, scope, env):
    """Initialize the environment copies allocated to this worker.

    Args:
        g: Worker globals object.
        alloc (list[int]): Indices of the vectorized environments allocated
            to this worker.
        scope (str): Scope identifying the algorithm.
        env: Environment instance to be copied for each allocated index.

    """
    logger.log('initializing environment on worker %d' % g.worker_id)
    if not hasattr(g, 'parallel_vec_envs'):
        g.parallel_vec_envs = dict()
        g.parallel_vec_env_template = dict()
    g.parallel_vec_envs[scope] = [(idx, pickle.loads(pickle.dumps(env)))
                                  for idx in alloc]
    g.parallel_vec_env_template[scope] = env
def evaluate(self, algo, test_rollouts_per_task=None):
    """Evaluate the Meta-RL algorithm on the test tasks.

    Args:
        algo (garage.np.algos.MetaRLAlgorithm): The algorithm to evaluate.
        test_rollouts_per_task (int or None): Number of rollouts per task.

    Returns:
        float: The average reward over all adapted trajectories.

    """
    if test_rollouts_per_task is None:
        test_rollouts_per_task = self._n_test_rollouts
    adapted_trajectories = []
    logger.log('Sampling for adaptation and meta-testing...')
    if self._test_sampler is None:
        self._test_sampler = self._sampler_class.from_worker_factory(
            WorkerFactory(seed=get_seed(),
                          max_path_length=self._max_path_length,
                          n_workers=1,
                          worker_class=self._worker_class,
                          worker_args=self._worker_args),
            agents=algo.get_exploration_policy(),
            envs=self._test_task_sampler.sample(1))
    for env_up in self._test_task_sampler.sample(self._n_test_tasks):
        policy = algo.get_exploration_policy()
        traj = self._trajectory_batch_class.concatenate(*[
            self._test_sampler.obtain_samples(self._eval_itr, 1, policy,
                                              env_up)
            for _ in range(self._n_exploration_traj)
        ])
        adapted_policy = algo.adapt_policy(policy, traj)
        adapted_traj = self._test_sampler.obtain_samples(
            self._eval_itr, test_rollouts_per_task * self._max_path_length,
            adapted_policy)
        adapted_trajectories.append(adapted_traj)
    logger.log('Finished meta-testing...')

    if self._test_task_names is not None:
        name_map = dict(enumerate(self._test_task_names))
    else:
        name_map = None

    with tabular.prefix(self._prefix + '/' if self._prefix else ''):
        log_multitask_performance(
            self._eval_itr,
            self._trajectory_batch_class.concatenate(
                *adapted_trajectories),
            getattr(algo, 'discount', 1.0),
            trajectory_class=self._trajectory_batch_class,
            name_map=name_map)
    self._eval_itr += 1

    if self._trajectory_batch_class == TrajectoryBatch:
        rewards = self._trajectory_batch_class.concatenate(
            *adapted_trajectories).rewards
    else:
        rewards = self._trajectory_batch_class.concatenate(
            *adapted_trajectories).env_rewards

    return sum(rewards) / len(rewards)
def optimize(self, inputs, extra_inputs=None, callback=None):
    """Run mini-batch gradient descent until the loss stops improving.

    Args:
        inputs (list): Input tensors to build mini-batches from.
        extra_inputs (tuple or None): Inputs passed to every batch without
            being split into mini-batches.
        callback (callable or None): Called after each epoch with the loss,
            parameters, epoch index, and elapsed time.

    """
    if not inputs:
        # Assumes that we should always sample mini-batches
        raise NotImplementedError

    f_loss = self._opt_fun['f_loss']

    if extra_inputs is None:
        extra_inputs = tuple()

    last_loss = f_loss(*(tuple(inputs) + extra_inputs))

    start_time = time.time()

    dataset = BatchDataset(inputs,
                           self._batch_size,
                           extra_inputs=extra_inputs)

    sess = tf.get_default_session()

    for epoch in range(self._max_epochs):
        if self._verbose:
            logger.log('Epoch {}'.format(epoch))
            progbar = pyprind.ProgBar(len(inputs[0]))

        for batch in dataset.iterate(update=True):
            sess.run(self._train_op,
                     dict(list(zip(self._input_vars, batch))))
            if self._verbose:
                progbar.update(len(batch[0]))

        if self._verbose:
            if progbar.active:
                progbar.stop()

        new_loss = f_loss(*(tuple(inputs) + extra_inputs))

        if self._verbose:
            logger.log('Epoch: {} | Loss: {}'.format(epoch, new_loss))
        if self._callback or callback:
            elapsed = time.time() - start_time
            callback_args = dict(
                loss=new_loss,
                params=self._target.get_param_values(trainable=True)
                if self._target else None,
                itr=epoch,
                elapsed=elapsed,
            )
            if self._callback:
                self._callback(callback_args)
            if callback:
                callback(**callback_args)

        if abs(last_loss - new_loss) < self._tolerance:
            break
        last_loss = new_loss
def train_once(self, itr, paths):
    """Perform one step of policy optimization given one batch of samples.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        float: The average return in last epoch cycle.

    """
    # -- Stage: Calculate baseline
    if hasattr(self._baseline, 'predict_n'):
        baseline_predictions = self._baseline.predict_n(paths)
    else:
        baseline_predictions = [
            self._baseline.predict(path) for path in paths
        ]

    # -- Stage: Pre-process samples based on collected paths
    samples_data = paths_to_tensors(paths, self.max_path_length,
                                    baseline_predictions, self._discount)

    # -- Stage: Run and calculate performance of the algorithm
    undiscounted_returns = log_performance(
        itr,
        TrajectoryBatch.from_trajectory_list(self._env_spec, paths),
        discount=self._discount)
    self._episode_reward_mean.extend(undiscounted_returns)
    tabular.record('Extras/EpisodeRewardMean',
                   np.mean(self._episode_reward_mean))
    samples_data['average_return'] = np.mean(undiscounted_returns)

    epoch = itr // self._n_samples
    i_sample = itr - epoch * self._n_samples
    tabular.record('Epoch', epoch)
    tabular.record('# Sample', i_sample)

    rtn = samples_data['average_return']
    self._all_returns.append(samples_data['average_return'])

    if (itr + 1) % self._n_samples == 0:
        avg_rtns = np.array(self._all_returns)
        self._es.tell(self._all_params, -avg_rtns)
        self.policy.set_param_values(self._es.best.get()[0])

        # Clear for next epoch
        rtn = max(self._all_returns)
        self._all_returns.clear()
        self._all_params = self._sample_params()

    self._cur_params = self._all_params[(i_sample + 1) % self._n_samples]
    self.policy.set_param_values(self._cur_params)

    logger.log(tabular)
    return rtn
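# --------------------------------------------------------------------------
# Illustrative sketch (not part of the library): the ask/tell loop that
# train_once() above drives incrementally, written against the standard
# interface of the `cma` package. `rollout_return` is a placeholder for
# evaluating a flat parameter vector by rolling out the policy; CMA-ES
# minimizes, so returns are negated before being passed to tell().
# --------------------------------------------------------------------------
import cma
import numpy as np


def cma_es_sketch(rollout_return, init_params, sigma0=1.0, popsize=20,
                  n_epochs=10):
    """Optimize a flat parameter vector with CMA-ES via ask/tell."""
    es = cma.CMAEvolutionStrategy(np.asarray(init_params, dtype=float),
                                  sigma0, {'popsize': popsize})
    for _ in range(n_epochs):
        # Ask for a population of candidate parameter vectors.
        all_params = es.ask()
        # Evaluate each candidate; negate because cma minimizes costs.
        avg_rtns = np.array([rollout_return(p) for p in all_params])
        es.tell(all_params, (-avg_rtns).tolist())
    # Best parameters found so far, analogous to es.best.get()[0] above.
    return es.result.xbest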
def _make_context(self, *args, **kwargs):
    """Make a context from the template information and variant args.

    Currently, all arguments should be keyword arguments.

    Args:
        args (list): Should be empty.
        kwargs (dict): Keyword arguments for the wrapped function. Will be
            logged to `variant.json`.

    Returns:
        ExperimentContext: The created experiment context.

    Raises:
        ValueError: If args is not empty.

    """
    if args:
        raise ValueError('garage.experiment currently only supports '
                         'keyword arguments')
    name = self.name
    if name is None:
        name = self.function.__name__
    if self.name_parameters:
        name = self._augment_name(name, kwargs)
    log_dir = self.log_dir
    if log_dir is None:
        log_dir = ('{data}/local/{prefix}/{name}'.format(
            data=os.path.join(os.getcwd(), 'data'),
            prefix=self.prefix,
            name=name))
    log_dir = _make_sequential_log_dir(log_dir)

    tabular_log_file = os.path.join(log_dir, 'progress.csv')
    text_log_file = os.path.join(log_dir, 'debug.log')
    variant_log_file = os.path.join(log_dir, 'variant.json')
    metadata_log_file = os.path.join(log_dir, 'metadata.json')

    dump_json(variant_log_file, kwargs)
    git_root_path, metadata = get_metadata()
    dump_json(metadata_log_file, metadata)
    if git_root_path and self.archive_launch_repo:
        make_launcher_archive(git_root_path=git_root_path, log_dir=log_dir)

    logger.add_output(dowel.TextOutput(text_log_file))
    logger.add_output(dowel.CsvOutput(tabular_log_file))
    logger.add_output(
        dowel.TensorBoardOutput(log_dir, x_axis='TotalEnvSteps'))
    logger.add_output(dowel.StdOutput())

    logger.push_prefix('[{}] '.format(name))
    logger.log('Logging to {}'.format(log_dir))

    return ExperimentContext(snapshot_dir=log_dir,
                             snapshot_mode=self.snapshot_mode,
                             snapshot_gap=self.snapshot_gap)
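# --------------------------------------------------------------------------
# Illustrative sketch (not part of the library): a minimal dowel logging
# setup of the kind _make_context() performs, followed by one record/dump
# cycle. The directory and metric names are placeholders for this example;
# only calls that appear in the functions above are used.
# --------------------------------------------------------------------------
import os

import dowel
from dowel import logger, tabular


def logging_sketch(log_dir='data/local/example'):
    """Attach text, CSV, and stdout outputs, then log one tabular row."""
    os.makedirs(log_dir, exist_ok=True)
    logger.add_output(dowel.TextOutput(os.path.join(log_dir, 'debug.log')))
    logger.add_output(dowel.CsvOutput(os.path.join(log_dir, 'progress.csv')))
    logger.add_output(dowel.StdOutput())

    logger.log('Logging to {}'.format(log_dir))
    tabular.record('Epoch', 0)
    logger.log(tabular)

    # Flush buffered CSV rows and detach the output, as in the tests above.
    logger.dump_output_type(dowel.CsvOutput)
    logger.remove_output_type(dowel.CsvOutput)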
def log_diagnostics(self, paths):
    """Log diagnostic information.

    Args:
        paths (list[dict]): A list of collected paths.

    """
    logger.log('Logging diagnostics...')
    self.policy.log_diagnostics(paths)
    self._baseline.log_diagnostics(paths)
def _tasks_adapt_train_once(self):
    """Run one epoch of task-adaptation optimization over sampled tasks."""
    for idx in range(self._num_steps_per_epoch):
        indices = np.random.choice(range(self._num_train_tasks),
                                   self._meta_batch_size)
        kl_loss, value_loss, policy_loss = (
            self._tasks_adapt_optimize_policy(indices))
        if idx % BATCH_PRINT == 0:
            logger.log(
                'task adapt at batch {} with kl_loss {}, value loss {} '
                'and policy loss {}'.format(idx, kl_loss, value_loss,
                                            policy_loss))
def evaluate_arduino(datum, send_type=int, return_type=float):
    """Send a datum to the Arduino and parse the response.

    Args:
        datum: Value to send to the board.
        send_type (type): Type of the value being sent.
        return_type (type): Expected type of the returned value.

    Returns:
        The value returned by the board.

    """
    if send_type is int and return_type is float:
        return_val = evaluate_int_float(datum)
    elif send_type is bool and return_type is bool:
        return_val = evaluate_bool_bool(datum)
    else:
        raise NotImplementedError

    if send_type is int and not isinstance(return_val, return_type):
        # Log the unexpected response and drop into the debugger.
        logger.log(return_val)
        import pdb
        pdb.set_trace()
    return return_val