def log_diagnostics(self, paths, prefix):
    progs = [
        path["observations"][-1][-3] - path["observations"][0][-3]
        for path in paths
    ]
    logger.logkv(prefix + 'AverageForwardProgress', np.mean(progs))
    logger.logkv(prefix + 'MaxForwardProgress', np.max(progs))
    logger.logkv(prefix + 'MinForwardProgress', np.min(progs))
    logger.logkv(prefix + 'StdForwardProgress', np.std(progs))
def obtain_samples(self, log=False, log_prefix='', random=False):
    """
    Collect batch_size trajectories from each task

    Args:
        log (boolean): whether to log sampling times
        log_prefix (str): prefix for logger
        random (boolean): whether the actions are random

    Returns:
        (list): A list of dicts with the samples
    """

    # initial setup / preparation
    paths = []
    n_samples = 0
    num_envs = self.vec_env.num_envs
    running_paths = [
        _get_empty_running_paths_dict() for _ in range(num_envs)
    ]

    pbar = ProgBar(self.total_samples)
    policy_time, env_time = 0, 0

    policy = self.policy
    policy.reset(dones=[True] * self.vec_env.num_envs)

    # initial reset of meta_envs
    obses = np.asarray(self.vec_env.reset())

    while n_samples < self.total_samples:

        # execute policy
        t = time.time()
        if random:
            actions = np.stack(
                [self.env.action_space.sample() for _ in range(num_envs)],
                axis=0)
            agent_infos = {}
        else:
            a_bs = self.adapt_batch_size
            if a_bs is not None and len(running_paths[0]['observations']) > a_bs + 1:
                adapt_obs = [
                    np.stack(running_paths[idx]['observations'][-a_bs - 1:-1])
                    for idx in range(num_envs)
                ]
                adapt_act = [
                    np.stack(running_paths[idx]['actions'][-a_bs - 1:-1])
                    for idx in range(num_envs)
                ]
                adapt_next_obs = [
                    np.stack(running_paths[idx]['observations'][-a_bs:])
                    for idx in range(num_envs)
                ]
                policy.dynamics_model.switch_to_pre_adapt()
                policy.dynamics_model.adapt(adapt_obs, adapt_act, adapt_next_obs)
            actions, agent_infos = policy.get_actions(obses)
        policy_time += time.time() - t

        # step environments
        t = time.time()
        next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
        env_time += time.time() - t

        # stack agent_infos and, if no infos were provided (--> None), create empty dicts
        agent_infos, env_infos = self._handle_info_dicts(agent_infos, env_infos)

        new_samples = 0
        for idx, observation, action, reward, env_info, agent_info, done in zip(
                itertools.count(), obses, actions, rewards, env_infos,
                agent_infos, dones):
            # append new samples to running paths
            if isinstance(reward, np.ndarray):
                reward = reward[0]
            running_paths[idx]["observations"].append(observation)
            running_paths[idx]["actions"].append(action)
            running_paths[idx]["rewards"].append(reward)
            running_paths[idx]["dones"].append(done)
            running_paths[idx]["env_infos"].append(env_info)
            running_paths[idx]["agent_infos"].append(agent_info)

            # if running path is done, add it to paths and empty the running path
            if done:
                paths.append(
                    dict(
                        observations=np.asarray(running_paths[idx]["observations"]),
                        actions=np.asarray(running_paths[idx]["actions"]),
                        rewards=np.asarray(running_paths[idx]["rewards"]),
                        dones=np.asarray(running_paths[idx]["dones"]),
                        env_infos=utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]),
                        agent_infos=utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]),
                    ))
                new_samples += len(running_paths[idx]["rewards"])
                running_paths[idx] = _get_empty_running_paths_dict()

        pbar.update(self.vec_env.num_envs)
        n_samples += new_samples
        obses = next_obses
    pbar.stop()

    self.total_timesteps_sampled += self.total_samples
    if log:
        logger.logkv(log_prefix + "PolicyExecTime", policy_time)
        logger.logkv(log_prefix + "EnvExecTime", env_time)

    return paths
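# Both samplers in this excerpt build trajectories in `running_paths` buffers created
# by `_get_empty_running_paths_dict`, which is not shown here. A minimal sketch,
# consistent with the six per-step fields appended to in the sampling loops:

def _get_empty_running_paths_dict():
    """Return fresh, empty per-trajectory buffers for one environment."""
    return dict(observations=[], actions=[], rewards=[],
                dones=[], env_infos=[], agent_infos=[])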
def train(self):
    """
    Collects data and trains the dynamics model
    """
    with self.sess.as_default() as sess:
        # initialize uninitialized vars (only initialize vars that were not loaded)
        # uninit_vars = [var for var in tf.global_variables() if not sess.run(tf.is_variable_initialized(var))]
        sess.run(tf.initializers.global_variables())

        start_time = time.time()
        for itr in range(self.start_itr, self.n_itr):
            itr_start_time = time.time()
            logger.log("\n ---------------- Iteration %d ----------------" % itr)

            time_env_sampling_start = time.time()

            if self.initial_random_samples and itr == 0:
                logger.log("Obtaining random samples from the environment...")
                env_paths = self.sampler.obtain_samples(log=True, random=True, log_prefix='')
            else:
                logger.log("Obtaining samples from the environment using the policy...")
                env_paths = self.sampler.obtain_samples(log=True, log_prefix='')

            logger.record_tabular('Time-EnvSampling', time.time() - time_env_sampling_start)

            ''' -------------- Process the samples ---------------- '''
            logger.log("Processing environment samples...")

            time_env_samp_proc = time.time()
            samples_data = self.sample_processor.process_samples(env_paths, log=True)
            logger.record_tabular('Time-EnvSampleProc', time.time() - time_env_samp_proc)

            ''' --------------- Fit the dynamics model --------------- '''
            time_fit_start = time.time()
            logger.log("Training dynamics model for %i epochs ..." % self.dynamics_model_max_epochs)
            self.dynamics_model.fit(samples_data['observations'],
                                    samples_data['actions'],
                                    samples_data['next_observations'],
                                    epochs=self.dynamics_model_max_epochs,
                                    verbose=True,
                                    log_tabular=True)
            logger.record_tabular('Time-ModelFit', time.time() - time_fit_start)

            """ ------------------- Logging -------------------------- """
            logger.logkv('Itr', itr)
            logger.logkv('n_timesteps', self.sampler.total_timesteps_sampled)
            logger.logkv('Time', time.time() - start_time)
            logger.logkv('ItrTime', time.time() - itr_start_time)

            logger.log("Saving snapshot...")
            params = self.get_itr_snapshot(itr)
            self.log_diagnostics(env_paths, '')
            logger.save_itr_params(itr, params)
            logger.log("Saved")

            logger.dumpkvs()
            if itr == 1:
                sess.graph.finalize()

    logger.log("Training finished")
    self.sess.close()
def fit(self, obs, act, obs_next, epochs=1000, compute_normalization=True,
        valid_split_ratio=None, rolling_average_persitency=None,
        verbose=False, log_tabular=False):
    """Fit the dynamics model to the provided (obs, act, obs_next) transitions."""

    assert obs.ndim == 3 and obs.shape[2] == self.obs_space_dims
    assert obs_next.ndim == 3 and obs_next.shape[2] == self.obs_space_dims
    assert act.ndim == 3 and act.shape[2] == self.action_space_dims

    if valid_split_ratio is None:
        valid_split_ratio = self.valid_split_ratio
    if rolling_average_persitency is None:
        rolling_average_persitency = self.rolling_average_persitency

    assert 1 > valid_split_ratio >= 0

    sess = tf.get_default_session()

    if (self.normalization is None or compute_normalization) and self.normalize_input:
        self.compute_normalization(obs, act, obs_next)

    if self.normalize_input:
        # normalize data
        obs, act, delta = self._normalize_data(obs, act, obs_next)
        assert obs.ndim == act.ndim == obs_next.ndim == 3
    else:
        delta = obs_next - obs

    # split into train and validation set
    obs_train, act_train, delta_train, obs_test, act_test, delta_test = train_test_split(
        obs, act, delta, test_split_ratio=valid_split_ratio)

    if self._dataset_test is None:
        self._dataset_test = dict(obs=obs_test, act=act_test, delta=delta_test)
        self._dataset_train = dict(obs=obs_train, act=act_train, delta=delta_train)
    else:
        self._dataset_test['obs'] = np.concatenate([self._dataset_test['obs'], obs_test])
        self._dataset_test['act'] = np.concatenate([self._dataset_test['act'], act_test])
        self._dataset_test['delta'] = np.concatenate([self._dataset_test['delta'], delta_test])

        self._dataset_train['obs'] = np.concatenate([self._dataset_train['obs'], obs_train])
        self._dataset_train['act'] = np.concatenate([self._dataset_train['act'], act_train])
        self._dataset_train['delta'] = np.concatenate([self._dataset_train['delta'], delta_train])

    valid_loss_rolling_average = None
    epoch_times = []

    """ ------- Looping over training epochs ------- """
    num_steps_per_epoch = max(int(np.prod(self._dataset_train['obs'].shape[:2])
                                  / (self.meta_batch_size * self.batch_size * 2)), 1)
    num_steps_test = max(int(np.prod(self._dataset_test['obs'].shape[:2])
                             / (self.meta_batch_size * self.batch_size * 2)), 1)

    for epoch in range(epochs):

        # preparations for recording training stats
        pre_batch_losses = []
        post_batch_losses = []
        t0 = time.time()

        """ ------- Looping through the shuffled and batched dataset for one epoch ------- """
        for _ in range(num_steps_per_epoch):
            obs_batch, act_batch, delta_batch = self._get_batch(train=True)

            pre_batch_loss, post_batch_loss, _ = sess.run(
                [self.pre_loss, self.post_loss, self.train_op],
                feed_dict={self.obs_ph: obs_batch,
                           self.act_ph: act_batch,
                           self.delta_ph: delta_batch})

            pre_batch_losses.append(pre_batch_loss)
            post_batch_losses.append(post_batch_loss)

        valid_losses = []
        for _ in range(num_steps_test):
            obs_test, act_test, delta_test = self._get_batch(train=False)

            # compute validation loss
            feed_dict = {self.obs_ph: obs_test,
                         self.act_ph: act_test,
                         self.delta_ph: delta_test}
            valid_loss = sess.run(self.loss, feed_dict=feed_dict)
            valid_losses.append(valid_loss)

        valid_loss = np.mean(valid_losses)
        if valid_loss_rolling_average is None:
            valid_loss_rolling_average = 1.5 * valid_loss  # set initial rolling average higher to avoid stopping too early
            valid_loss_rolling_average_prev = 2 * valid_loss
            if valid_loss < 0:
                valid_loss_rolling_average = valid_loss / 1.5  # negative loss: scale the other way so the initial average is still higher
                valid_loss_rolling_average_prev = valid_loss / 2

        valid_loss_rolling_average = rolling_average_persitency * valid_loss_rolling_average \
                                     + (1.0 - rolling_average_persitency) * valid_loss

        epoch_times.append(time.time() - t0)

        if verbose:
            logger.log("Training DynamicsModel - finished epoch %i - "
                       "train loss: %.4f  valid loss: %.4f  valid_loss_mov_avg: %.4f  epoch time: %.2f"
                       % (epoch, np.mean(post_batch_losses), valid_loss,
                          valid_loss_rolling_average, time.time() - t0))

        if valid_loss_rolling_average_prev < valid_loss_rolling_average or epoch == epochs - 1:
            logger.log('Stopping model training since valid_loss_rolling_average stopped decreasing')
            break

        valid_loss_rolling_average_prev = valid_loss_rolling_average

    """ ------- Tabular Logging ------- """
    if log_tabular:
        logger.logkv('AvgModelEpochTime', np.mean(epoch_times))
        logger.logkv('Post-Loss', np.mean(post_batch_losses))
        logger.logkv('Pre-Loss', np.mean(pre_batch_losses))
        logger.logkv('Epochs', epoch)
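# Both `fit` methods delegate the train/validation split to `train_test_split`,
# which is not shown in this excerpt. A minimal sketch, assuming a random partition
# over the first (trajectory) axis of the (num_paths, path_length, dim) arrays;
# the original helper's shuffling scheme may differ:

def train_test_split(obs, act, delta, test_split_ratio=0.2):
    """Randomly split trajectories into a train set and a held-out test set."""
    assert obs.shape[0] == act.shape[0] == delta.shape[0]
    num_paths = obs.shape[0]
    idx = np.random.permutation(num_paths)
    num_test = int(num_paths * test_split_ratio)
    test_idx, train_idx = idx[:num_test], idx[num_test:]
    return (obs[train_idx], act[train_idx], delta[train_idx],
            obs[test_idx], act[test_idx], delta[test_idx])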
def _log_path_stats(self, paths, log=False, log_prefix=''):
    # compute log stats
    average_discounted_return = np.mean([path["returns"][0] for path in paths])
    undiscounted_returns = [sum(path["rewards"]) for path in paths]

    if log == 'reward':
        logger.logkv(log_prefix + 'AverageReturn', np.mean(undiscounted_returns))
    elif log == 'all' or log is True:
        logger.logkv(log_prefix + 'AverageDiscountedReturn', average_discounted_return)
        logger.logkv(log_prefix + 'AverageReturn', np.mean(undiscounted_returns))
        logger.logkv(log_prefix + 'NumTrajs', len(paths))
        logger.logkv(log_prefix + 'StdReturn', np.std(undiscounted_returns))
        logger.logkv(log_prefix + 'MaxReturn', np.max(undiscounted_returns))
        logger.logkv(log_prefix + 'MinReturn', np.min(undiscounted_returns))
def obtain_samples(self, log=False, log_prefix='', random=False):
    """
    Collect batch_size trajectories from each task

    Args:
        log (boolean): whether to log sampling times
        log_prefix (str): prefix for logger
        random (boolean): whether the actions are random

    Returns:
        (list): A list of path dicts with the samples
    """

    # initial setup / preparation
    paths = []
    n_samples = 0
    running_paths = _get_empty_running_paths_dict()

    pbar = ProgBar(self.total_samples)
    policy_time, env_time = 0, 0

    policy = self.policy
    policy.reset(dones=[True])

    # initial reset of the env
    obs = np.asarray(self.env.reset())

    ts = 0

    while n_samples < self.total_samples:

        # execute policy
        t = time.time()
        if random:
            action = self.env.action_space.sample()
            agent_info = {}
        else:
            action, agent_info = policy.get_action(obs)
            if action.ndim == 2:
                action = action[0]
        policy_time += time.time() - t

        # step environments
        t = time.time()
        next_obs, reward, done, env_info = self.env.step(action)

        ts += 1
        done = done or ts >= self.max_path_length
        if done:
            next_obs = self.env.reset()
            ts = 0

        env_time += time.time() - t

        new_samples = 0

        # append new samples to running paths
        if isinstance(reward, np.ndarray):
            reward = reward[0]
        running_paths["observations"].append(obs)
        running_paths["actions"].append(action)
        running_paths["rewards"].append(reward)
        running_paths["dones"].append(done)
        running_paths["env_infos"].append(env_info)
        running_paths["agent_infos"].append(agent_info)

        # if running path is done, add it to paths and empty the running path
        if done:
            paths.append(
                dict(
                    observations=np.asarray(running_paths["observations"]),
                    actions=np.asarray(running_paths["actions"]),
                    rewards=np.asarray(running_paths["rewards"]),
                    dones=np.asarray(running_paths["dones"]),
                    env_infos=utils.stack_tensor_dict_list(running_paths["env_infos"]),
                    agent_infos=utils.stack_tensor_dict_list(running_paths["agent_infos"]),
                ))
            new_samples += len(running_paths["rewards"])
            running_paths = _get_empty_running_paths_dict()

        pbar.update(new_samples)
        n_samples += new_samples
        obs = next_obs
    pbar.stop()

    self.total_timesteps_sampled += self.total_samples
    if log:
        logger.logkv(log_prefix + "PolicyExecTime", policy_time)
        logger.logkv(log_prefix + "EnvExecTime", env_time)

    return paths
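# When a path terminates, the samplers collapse the per-step info dicts with
# `utils.stack_tensor_dict_list`. A minimal sketch of the expected behavior; the
# original utility also handles nested dicts, which this simplified version omits:

def stack_tensor_dict_list(tensor_dict_list):
    """Stack a list of dicts, e.g. [{'a': 1}, {'a': 2}] -> {'a': array([1, 2])}."""
    keys = tensor_dict_list[0].keys() if tensor_dict_list else []
    return {k: np.asarray([d[k] for d in tensor_dict_list]) for k in keys}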
def fit(self, obs, act, obs_next, epochs=1000, compute_normalization=True,
        valid_split_ratio=None, rolling_average_persitency=None,
        verbose=False, log_tabular=False):
    """Fit the recurrent dynamics model to the provided (obs, act, obs_next) transitions."""

    assert obs.ndim == 3 and obs.shape[2] == self.obs_space_dims
    assert obs_next.ndim == 3 and obs_next.shape[2] == self.obs_space_dims
    assert act.ndim == 3 and act.shape[2] == self.action_space_dims

    if valid_split_ratio is None:
        valid_split_ratio = self.valid_split_ratio
    if rolling_average_persitency is None:
        rolling_average_persitency = self.rolling_average_persitency

    assert 1 > valid_split_ratio >= 0

    sess = tf.get_default_session()

    if (self.normalization is None or compute_normalization) and self.normalize_input:
        self.compute_normalization(obs, act, obs_next)

    if self.normalize_input:
        # normalize data
        obs, act, delta = self._normalize_data(obs, act, obs_next)
        assert obs.ndim == act.ndim == obs_next.ndim == 3
    else:
        delta = obs_next - obs

    # split into train and validation set
    obs_train, act_train, delta_train, obs_test, act_test, delta_test = train_test_split(
        obs, act, delta, test_split_ratio=valid_split_ratio)

    if self._dataset_test is None:
        self._dataset_test = dict(obs=obs_test, act=act_test, delta=delta_test)
        self._dataset_train = dict(obs=obs_train, act=act_train, delta=delta_train)
    else:
        self._dataset_test['obs'] = np.concatenate([self._dataset_test['obs'], obs_test])
        self._dataset_test['act'] = np.concatenate([self._dataset_test['act'], act_test])
        self._dataset_test['delta'] = np.concatenate([self._dataset_test['delta'], delta_test])

        self._dataset_train['obs'] = np.concatenate([self._dataset_train['obs'], obs_train])
        self._dataset_train['act'] = np.concatenate([self._dataset_train['act'], act_train])
        self._dataset_train['delta'] = np.concatenate([self._dataset_train['delta'], delta_train])

    # create data queue
    if self.next_batch is None:
        self.next_batch, self.iterator = self._data_input_fn(
            self._dataset_train['obs'],
            self._dataset_train['act'],
            self._dataset_train['delta'],
            batch_size=self.batch_size)

    valid_loss_rolling_average = None
    epoch_times = []

    """ ------- Looping over training epochs ------- """
    for epoch in range(epochs):

        # initialize data queue
        feed_dict = {
            self.obs_dataset_ph: self._dataset_train['obs'],
            self.act_dataset_ph: self._dataset_train['act'],
            self.delta_dataset_ph: self._dataset_train['delta']
        }
        sess.run(self.iterator.initializer, feed_dict=feed_dict)

        # preparations for recording training stats
        batch_losses = []

        """ ------- Looping through the shuffled and batched dataset for one epoch ------- """
        t0 = time.time()
        while True:
            try:
                obs_batch, act_batch, delta_batch = sess.run(self.next_batch)
                hidden_batch = self.get_initial_hidden(obs_batch.shape[0])
                seq_len = obs_batch.shape[1]

                # run train op: truncated backprop in chunks of backprop_steps,
                # carrying the hidden state across chunks and averaging the gradients
                all_grads = []
                for i in range(0, seq_len, self.backprop_steps):
                    end_i = i + self.backprop_steps
                    feed_dict = {
                        self.obs_ph: obs_batch[:, i:end_i, :],
                        self.act_ph: act_batch[:, i:end_i, :],
                        self.delta_ph: delta_batch[:, i:end_i, :]
                    }
                    hidden_feed_dict = dict(zip(self.hidden_state_ph, hidden_batch))
                    feed_dict.update(hidden_feed_dict)

                    batch_loss, grads, hidden_batch = sess.run(
                        [self.loss, self._gradients_vars, self.next_hidden_state_var],
                        feed_dict=feed_dict)
                    all_grads.append(grads)
                    batch_losses.append(batch_loss)

                grads = [np.mean(grad, axis=0) for grad in zip(*all_grads)]
                feed_dict = dict(zip(self._gradients_ph, grads))
                _ = sess.run(self.train_op, feed_dict=feed_dict)

            except tf.errors.OutOfRangeError:
                obs_test = self._dataset_test['obs']
                act_test = self._dataset_test['act']
                delta_test = self._dataset_test['delta']
                hidden_batch = self.get_initial_hidden(obs_test.shape[0])

                # compute validation loss
                feed_dict = {
                    self.obs_ph: obs_test,
                    self.act_ph: act_test,
                    self.delta_ph: delta_test,
                    self.hidden_state_ph: hidden_batch
                }
                valid_loss = sess.run(self.loss, feed_dict=feed_dict)

                if valid_loss_rolling_average is None:
                    valid_loss_rolling_average = 1.5 * valid_loss  # set initial rolling average higher to avoid stopping too early
                    valid_loss_rolling_average_prev = 2 * valid_loss
                    if valid_loss < 0:
                        valid_loss_rolling_average = valid_loss / 1.5  # negative loss: scale the other way so the initial average is still higher
                        valid_loss_rolling_average_prev = valid_loss / 2

                valid_loss_rolling_average = rolling_average_persitency * valid_loss_rolling_average \
                                             + (1.0 - rolling_average_persitency) * valid_loss

                epoch_times.append(time.time() - t0)
                if verbose:
                    logger.log(
                        "Training RNNDynamicsModel - finished epoch %i -- "
                        "train loss: %.4f  valid loss: %.4f  valid_loss_mov_avg: %.4f  epoch time: %.2f"
                        % (epoch, np.mean(batch_losses), valid_loss,
                           valid_loss_rolling_average, time.time() - t0))
                break

        if valid_loss_rolling_average_prev < valid_loss_rolling_average or epoch == epochs - 1:
            logger.log('Stopping model training since valid_loss_rolling_average stopped decreasing')
            break

        valid_loss_rolling_average_prev = valid_loss_rolling_average

    """ ------- Tabular Logging ------- """
    if log_tabular:
        logger.logkv('AvgModelEpochTime', np.mean(epoch_times))
        logger.logkv('Epochs', epoch)
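# `get_initial_hidden` is referenced above but not shown. A purely illustrative
# sketch, assuming the recurrent state starts at zeros and that `self.hidden_dim`
# (an assumed attribute) gives the state size per placeholder in
# `self.hidden_state_ph`; the actual cell type and state layout depend on the
# model definition:

def get_initial_hidden(self, batch_size):
    """Return zeroed initial hidden states, one array per hidden-state placeholder."""
    return [np.zeros((batch_size, self.hidden_dim)) for _ in self.hidden_state_ph]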