def mpi_average(value):
    """
    Calculate the average of the given values, using MPI.

    An empty input is replaced by ``[0.]``. Any input that is not a ``list``
    is wrapped in a one-element list before being converted with
    ``np.array`` and handed to ``mpi_moments``; the first element of the
    ``mpi_moments`` result is returned.

    :param value: (np.ndarray) the array; a bare scalar is also accepted
    :return: (float) the average
    """
    # Scalars (and 0-d arrays) have no length: probing them with len() raises
    # TypeError, which previously made the scalar-wrapping branch below
    # unreachable for exactly the inputs it was written for.
    try:
        is_empty = len(value) == 0
    except TypeError:
        is_empty = False

    if is_empty:
        value = [0.]
    if not isinstance(value, list):
        value = [value]
    return mpi_moments(np.array(value))[0]
def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="PPO1",
          reset_num_timesteps=True):
    """
    Run the PPO1 training loop: collect a trajectory segment, run several
    optimisation epochs over it, log statistics, and repeat until the
    timestep budget is reached or the callback requests a stop.

    :param total_timesteps: budget compared against the running timestep
        counter; the loop exits once the counter reaches it
    :param callback: forwarded to ``self._init_callback``; the returned object
        receives ``on_training_start`` / ``on_training_end`` and is handed to
        ``traj_segment_generator``
    :param log_interval: not read anywhere in this method body
    :param tb_log_name: forwarded to ``TensorboardWriter``
    :param reset_num_timesteps: forwarded to ``self._init_num_timesteps``
    :return: ``self``
    """
    new_tb_log = self._init_num_timesteps(reset_num_timesteps)
    callback = self._init_callback(callback)

    with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
            as writer:
        self._setup_learn()

        assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the PPO1 model must be " \
                                                           "an instance of common.policies.ActorCriticPolicy."

        with self.sess.as_default():
            self.adam.sync()
            # NOTE: locals() is handed to the callback, so the local variable
            # names in this method are visible to callback code.
            callback.on_training_start(locals(), globals())

            # Prepare for rollouts
            seg_gen = traj_segment_generator(self.policy_pi, self.env, self.timesteps_per_actorbatch,
                                             callback=callback)

            episodes_so_far = 0
            timesteps_so_far = 0
            iters_so_far = 0
            t_start = time.time()

            # rolling buffer for episode lengths
            len_buffer = deque(maxlen=100)
            # rolling buffer for episode rewards
            reward_buffer = deque(maxlen=100)

            while True:
                if timesteps_so_far >= total_timesteps:
                    break

                # Learning-rate multiplier: fixed at 1 for 'constant', decayed
                # towards 0 with the consumed share of the budget for 'linear'.
                if self.schedule == 'constant':
                    cur_lrmult = 1.0
                elif self.schedule == 'linear':
                    cur_lrmult = max(1.0 - float(timesteps_so_far) / total_timesteps, 0)
                else:
                    raise NotImplementedError

                logger.log("********** Iteration %i ************" % iters_so_far)

                seg = seg_gen.__next__()

                # Stop training early (triggered by the callback)
                if not seg.get('continue_training', True):  # pytype: disable=attribute-error
                    break

                # Called for its effect on ``seg``: the "adv" and "tdlamret"
                # entries are read from it right below.
                add_vtarg_and_adv(seg, self.gamma, self.lam)

                observations, actions = seg["observations"], seg["actions"]
                atarg, tdlamret = seg["adv"], seg["tdlamret"]

                # true_rew is the reward without discount
                if writer is not None:
                    total_episode_reward_logger(self.episode_reward,
                                                seg["true_rewards"].reshape((self.n_envs, -1)),
                                                seg["dones"].reshape((self.n_envs, -1)),
                                                writer, self.num_timesteps)

                # predicted value function before update
                vpredbefore = seg["vpred"]

                # standardized advantage function estimate
                # NOTE(review): no epsilon in the denominator, so a zero std
                # yields non-finite advantages - confirm this cannot happen.
                atarg = (atarg - atarg.mean()) / atarg.std()
                dataset = Dataset(dict(ob=observations, ac=actions, atarg=atarg, vtarg=tdlamret),
                                  shuffle=not self.policy.recurrent)
                # Fall back to a single full-size batch when no batch size is configured.
                optim_batchsize = self.optim_batchsize or observations.shape[0]

                # set old parameter values to new parameter values
                self.assign_old_eq_new(sess=self.sess)
                logger.log("Optimizing...")
                logger.log(fmt_row(13, self.loss_names))

                # Here we do a bunch of optimization epochs over the data
                for k in range(self.optim_epochs):
                    # list of tuples, each of which gives the loss for a minibatch
                    losses = []
                    for i, batch in enumerate(dataset.iterate_once(optim_batchsize)):
                        # Step index used to tag the tensorboard summaries.
                        steps = (self.num_timesteps +
                                 k * optim_batchsize +
                                 int(i * (optim_batchsize / len(dataset.data_map))))
                        if writer is not None:
                            # run loss backprop with summary, but once every 10 runs save the metadata
                            # (memory, compute time, ...)
                            if self.full_tensorboard_log and (1 + k) % 10 == 0:
                                run_options = tf.compat.v1.RunOptions(
                                    trace_level=tf.compat.v1.RunOptions.FULL_TRACE)
                                run_metadata = tf.compat.v1.RunMetadata()
                                summary, grad, *newlosses = self.lossandgrad(batch["ob"], batch["ob"], batch["ac"],
                                                                             batch["atarg"], batch["vtarg"],
                                                                             cur_lrmult, sess=self.sess,
                                                                             options=run_options,
                                                                             run_metadata=run_metadata)
                                writer.add_run_metadata(run_metadata, 'step%d' % steps)
                            else:
                                summary, grad, *newlosses = self.lossandgrad(batch["ob"], batch["ob"], batch["ac"],
                                                                             batch["atarg"], batch["vtarg"],
                                                                             cur_lrmult, sess=self.sess)
                            writer.add_summary(summary, steps)
                        else:
                            _, grad, *newlosses = self.lossandgrad(batch["ob"], batch["ob"], batch["ac"],
                                                                   batch["atarg"], batch["vtarg"], cur_lrmult,
                                                                   sess=self.sess)

                        self.adam.update(grad, self.optim_stepsize * cur_lrmult)
                        losses.append(newlosses)
                    logger.log(fmt_row(13, np.mean(losses, axis=0)))

                logger.log("Evaluating losses...")
                losses = []
                for batch in dataset.iterate_once(optim_batchsize):
                    newlosses = self.compute_losses(batch["ob"], batch["ob"], batch["ac"], batch["atarg"],
                                                    batch["vtarg"], cur_lrmult, sess=self.sess)
                    losses.append(newlosses)
                mean_losses, _, _ = mpi_moments(losses, axis=0)
                logger.log(fmt_row(13, mean_losses))
                for (loss_val, name) in zipsame(mean_losses, self.loss_names):
                    logger.record_tabular("loss_" + name, loss_val)
                logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))

                # local values
                lrlocal = (seg["ep_lens"], seg["ep_rets"])

                # list of tuples
                listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)
                lens, rews = map(flatten_lists, zip(*listoflrpairs))
                len_buffer.extend(lens)
                reward_buffer.extend(rews)
                # Only report rolling means once at least one episode was recorded.
                if len(len_buffer) > 0:
                    logger.record_tabular("EpLenMean", np.mean(len_buffer))
                    logger.record_tabular("EpRewMean", np.mean(reward_buffer))
                logger.record_tabular("EpThisIter", len(lens))
                episodes_so_far += len(lens)
                # Timesteps of this iteration, all-reduced over MPI.COMM_WORLD.
                current_it_timesteps = MPI.COMM_WORLD.allreduce(seg["total_timestep"])
                timesteps_so_far += current_it_timesteps
                self.num_timesteps += current_it_timesteps
                iters_so_far += 1
                logger.record_tabular("EpisodesSoFar", episodes_so_far)
                logger.record_tabular("TimestepsSoFar", self.num_timesteps)
                logger.record_tabular("TimeElapsed", time.time() - t_start)
                # Only rank 0 dumps the table, and only when verbose.
                if self.verbose >= 1 and MPI.COMM_WORLD.Get_rank() == 0:
                    logger.dump_tabular()
    callback.on_training_end()
    return self
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="PPO1",
          reset_num_timesteps=True):
    """
    Run the PPO1 training loop and optionally collect the visited trajectory.

    Each iteration pulls one segment from ``traj_segment_generator``, runs
    ``self.optim_epochs`` optimisation epochs over it and records statistics.
    When ``self.save_trajectory`` is truthy, the observations, hidden states,
    actions, rewards and done flags of every segment are accumulated and
    returned as a dictionary of stacked arrays.

    :param total_timesteps: timestep budget; a falsy value disables the budget
        check, so only the callback can end the loop
    :param callback: optional callable invoked as
        ``callback(locals(), globals())`` at the start of every iteration;
        training stops only when it returns exactly ``False``
    :param seed: forwarded to ``self._setup_learn``
    :param log_interval: not read anywhere in this method body
    :param tb_log_name: forwarded to ``TensorboardWriter``
    :param reset_num_timesteps: forwarded to ``self._init_num_timesteps``
    :return: tuple ``(self, trajectory_dic)``; ``trajectory_dic`` is ``None``
        unless ``self.save_trajectory`` is truthy and at least one segment was
        collected
    """
    new_tb_log = self._init_num_timesteps(reset_num_timesteps)

    with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
            as writer:
        self._setup_learn(seed)

        assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the PPO1 model must be " \
            "an instance of common.policies.ActorCriticPolicy({}).".format(self.policy)

        with self.sess.as_default():
            self.adam.sync()
            trajectory_dic = None

            # Prepare for rollouts
            seg_gen = traj_segment_generator(self.policy_pi, self.env, self.timesteps_per_actorbatch)

            episodes_so_far = 0
            timesteps_so_far = 0
            iters_so_far = 0
            t_start = time.time()

            # rolling buffer for episode lengths
            lenbuffer = deque(maxlen=100)
            # rolling buffer for episode rewards
            rewbuffer = deque(maxlen=100)

            self.episode_reward = np.zeros((self.n_envs, ))

            # Per-segment copies, stacked into arrays once training has finished.
            if self.save_trajectory:
                hidden_list = []
                obs_list = []
                act_list = []
                rwds_list = []
                dones_list = []

            while True:
                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    # NOTE: locals() exposes this method's local names to the callback, so they are
                    # deliberately left unchanged.
                    if callback(locals(), globals()) is False:
                        break

                if total_timesteps and timesteps_so_far >= total_timesteps:
                    break

                if self.schedule == 'constant':
                    cur_lrmult = 1.0
                elif self.schedule == 'linear':
                    # NOTE(review): divides by total_timesteps, so the 'linear' schedule cannot be
                    # combined with a falsy budget - confirm callers never do that.
                    cur_lrmult = max(1.0 - float(timesteps_so_far) / total_timesteps, 0)
                else:
                    raise NotImplementedError

                logger.log("********** Iteration %i %i************" % (iters_so_far, self.n_envs))

                seg = seg_gen.__next__()
                # Called for its effect on ``seg``: "adv" and "tdlamret" are read right below.
                add_vtarg_and_adv(seg, self.gamma, self.lam)

                obs_ph, hiddens_ph, action_ph, atarg, tdlamret = seg["ob"], seg["hiddens"], seg["ac"], \
                    seg["adv"], seg["tdlamret"]

                if self.save_trajectory:
                    rwds_ph, dones_ph = seg["rew"], seg["dones"]
                    # Copies are stored so later changes to the segment arrays cannot alter the record.
                    obs_list.append(obs_ph.copy())
                    hidden_list.append(hiddens_ph.copy())
                    act_list.append(action_ph.copy())
                    rwds_list.append(rwds_ph.copy())
                    dones_list.append(dones_ph.copy())

                # true_rew is the reward without discount
                if writer is not None:
                    self.episode_reward = total_episode_reward_logger(self.episode_reward,
                                                                      seg["true_rew"].reshape((self.n_envs, -1)),
                                                                      seg["dones"].reshape((self.n_envs, -1)),
                                                                      writer, self.num_timesteps)

                # predicted value function before update
                vpredbefore = seg["vpred"]

                # standardized advantage function estimate
                atarg = (atarg - atarg.mean()) / atarg.std()
                dataset = Dataset(dict(ob=obs_ph, ac=action_ph, atarg=atarg, vtarg=tdlamret),
                                  shuffle=not self.policy.recurrent)
                # Fall back to a single full-size batch when no batch size is configured.
                optim_batchsize = self.optim_batchsize or obs_ph.shape[0]

                # set old parameter values to new parameter values
                self.assign_old_eq_new(sess=self.sess)
                logger.log("Optimizing...")
                logger.log(fmt_row(13, self.loss_names))

                # Here we do a bunch of optimization epochs over the data
                for k in range(self.optim_epochs):
                    # list of tuples, each of which gives the loss for a minibatch
                    losses = []
                    for i, batch in enumerate(dataset.iterate_once(optim_batchsize)):
                        # Step index used to tag the tensorboard summaries.
                        steps = (self.num_timesteps +
                                 k * optim_batchsize +
                                 int(i * (optim_batchsize / len(dataset.data_map))))
                        if writer is not None:
                            # run loss backprop with summary, but once every 10 runs save the metadata
                            # (memory, compute time, ...)
                            if self.full_tensorboard_log and (1 + k) % 10 == 0:
                                run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                                run_metadata = tf.RunMetadata()
                                summary, grad, *newlosses = self.lossandgrad(batch["ob"], batch["ob"], batch["ac"],
                                                                             batch["atarg"], batch["vtarg"],
                                                                             cur_lrmult, sess=self.sess,
                                                                             options=run_options,
                                                                             run_metadata=run_metadata)
                                writer.add_run_metadata(run_metadata, 'step%d' % steps)
                            else:
                                summary, grad, *newlosses = self.lossandgrad(batch["ob"], batch["ob"], batch["ac"],
                                                                             batch["atarg"], batch["vtarg"],
                                                                             cur_lrmult, sess=self.sess)
                            writer.add_summary(summary, steps)
                        else:
                            _, grad, *newlosses = self.lossandgrad(batch["ob"], batch["ob"], batch["ac"],
                                                                   batch["atarg"], batch["vtarg"], cur_lrmult,
                                                                   sess=self.sess)

                        self.adam.update(grad, self.optim_stepsize * cur_lrmult)
                        losses.append(newlosses)
                    logger.log(fmt_row(13, np.mean(losses, axis=0)))

                logger.log("Evaluating losses...")
                losses = []
                for batch in dataset.iterate_once(optim_batchsize):
                    newlosses = self.compute_losses(batch["ob"], batch["ob"], batch["ac"], batch["atarg"],
                                                    batch["vtarg"], cur_lrmult, sess=self.sess)
                    losses.append(newlosses)
                mean_losses, _, _ = mpi_moments(losses, axis=0)
                logger.log(fmt_row(13, mean_losses))
                for (loss_val, name) in zipsame(mean_losses, self.loss_names):
                    logger.record_tabular("loss_" + name, loss_val)
                logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))

                # local values
                lrlocal = (seg["ep_lens"], seg["ep_rets"])

                # list of tuples
                listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)
                lens, rews = map(flatten_lists, zip(*listoflrpairs))
                lenbuffer.extend(lens)
                rewbuffer.extend(rews)
                # Only report rolling means once at least one episode was recorded.
                if len(lenbuffer) > 0:
                    logger.record_tabular("EpLenMean", np.mean(lenbuffer))
                    logger.record_tabular("EpRewMean", np.mean(rewbuffer))
                logger.record_tabular("EpThisIter", len(lens))
                episodes_so_far += len(lens)
                # Timesteps of this iteration, all-reduced over MPI.COMM_WORLD.
                current_it_timesteps = MPI.COMM_WORLD.allreduce(seg["total_timestep"])
                timesteps_so_far += current_it_timesteps
                self.num_timesteps += current_it_timesteps
                iters_so_far += 1
                logger.record_tabular("EpisodesSoFar", episodes_so_far)
                logger.record_tabular("TimestepsSoFar", self.num_timesteps)
                logger.record_tabular("TimeElapsed", time.time() - t_start)
                if self.verbose >= 1 and MPI.COMM_WORLD.Get_rank() == 0:
                    logger.dump_tabular()

            # Stack the recorded segments once training is over. np.vstack raises
            # ValueError on an empty sequence, so skip this when the loop exited
            # before any segment was collected (trajectory_dic then stays None).
            if self.save_trajectory and obs_list:
                all_obvs = np.vstack(obs_list)
                length = all_obvs.shape[0]
                print("Save trajectory...(length:{})".format(length))
                # The dictionary is returned to the caller; nothing is written to disk here.
                trajectory_dic = {
                    "all_obvs": all_obvs.reshape(length, -1),
                    "all_hiddens": np.vstack(hidden_list).reshape(length, -1),
                    "all_acts": np.vstack(act_list).reshape(length, -1),
                    "all_rwds": np.vstack(rwds_list).reshape(length, -1),
                    "all_dones": np.vstack(dones_list).reshape(length, -1),
                }

    return self, trajectory_dic
def update(self):
    """
    Run the optimisation epochs over the current rollout buffers and return
    a dictionary of statistics about this update.

    The rewards are optionally rescaled, advantages are computed (and
    optionally normalised), then ``self.nepochs`` passes over shuffled
    minibatches of environment indices are executed through the session
    returned by ``getsess()``.

    :return: (dict) statistics gathered during the update
    """
    # Rewards fed to the advantage computation: rescaled by the running
    # variance estimate when reward normalisation is on, a plain copy otherwise.
    if not self.normrew:
        rews = np.copy(self.rollout.buf_rews)
    else:
        rffs = np.array([self.rff.update(step_rews) for step_rews in self.rollout.buf_rews.T])
        rffs_mean, rffs_std, rffs_count = mpi_moments(rffs.ravel())
        self.rff_rms.update_from_moments(rffs_mean, rffs_std**2, rffs_count)
        rews = self.rollout.buf_rews / np.sqrt(self.rff_rms.var)

    self.calculate_advantages(rews=rews, use_news=self.use_news, gamma=self.gamma, lam=self.lam)

    info = {
        "advmean": self.buf_advs.mean(),
        "advstd": self.buf_advs.std(),
        "retmean": self.buf_rets.mean(),
        "retstd": self.buf_rets.std(),
        "vpredmean": self.rollout.buf_vpreds.mean(),
        "vpredstd": self.rollout.buf_vpreds.std(),
        "ev": explained_variance(self.rollout.buf_vpreds.ravel(), self.buf_rets.ravel()),
        "rew_mean": np.mean(self.rollout.buf_rews),
        "recent_best_ext_ret": self.rollout.current_max,
    }
    best_ext_ret = self.rollout.best_ext_ret
    if best_ext_ret is not None:
        info['best_ext_ret'] = best_ext_ret

    # normalize advantages
    if self.normadv:
        adv_mean, adv_std = get_mean_and_std(self.buf_advs)
        self.buf_advs = (self.buf_advs - adv_mean) / (adv_std + 1e-7)

    n_rows = self.nenvs * self.nsegs_per_env
    envsperbatch = max(1, n_rows // self.nminibatches)
    envinds = np.arange(n_rows)

    def split_segments(arr):
        """Fold the per-environment segments into the leading axis."""
        if self.nsegs_per_env != 1:
            shape = arr.shape
            return arr.reshape((shape[0] * self.nsegs_per_env, self.nsteps_per_seg) + shape[2:])
        return arr

    # (placeholder, buffer) pairs; each buffer is indexed by the minibatch rows.
    ph_buf = [
        (self.stochpol.ph_ac, split_segments(self.rollout.buf_acs)),
        (self.ph_rews, split_segments(self.rollout.buf_rews)),
        (self.ph_oldvpred, split_segments(self.rollout.buf_vpreds)),
        (self.ph_oldnlp, split_segments(self.rollout.buf_nlps)),
        (self.stochpol.ph_ob, split_segments(self.rollout.buf_obs)),
        (self.ph_ret, split_segments(self.buf_rets)),
        (self.ph_adv, split_segments(self.buf_advs)),
        (self.dynamics.last_ob,
         self.rollout.buf_obs_last.reshape([n_rows, 1, *self.ob_space.shape])),
    ]

    mblossvals = []
    for _ in range(self.nepochs):
        np.random.shuffle(envinds)
        for start in range(0, n_rows, envsperbatch):
            mbenvinds = envinds[start:start + envsperbatch]
            feed = {}
            for placeholder, buf in ph_buf:
                feed[placeholder] = buf[mbenvinds]
            feed[self.ph_lr] = self.lr
            feed[self.ph_cliprange] = self.cliprange
            # The last fetched value belongs to the train op and is dropped.
            fetched = getsess().run(self._losses + (self._train, ), feed)
            mblossvals.append(fetched[:-1])

    # Only the losses of the very first minibatch are reported.
    first_losses = mblossvals[0]
    reported = np.mean([first_losses], axis=0)
    for loss_name, loss_val in zip(self.loss_names, reported):
        info['opt_' + loss_name] = loss_val

    info["rank"] = MPI.COMM_WORLD.Get_rank()
    self.n_updates += 1
    info["n_updates"] = self.n_updates

    for stat_name, stat_vals in self.rollout.statlists.items():
        info[stat_name] = np.mean(stat_vals) if len(stat_vals) > 0 else 0
    info.update(self.rollout.stats)
    info.pop("states_visited", None)

    tnow = time.time()
    since_last = tnow - self.t_last_update
    info["ups"] = 1. / since_last
    info["total_secs"] = tnow - self.t_start
    info['tps'] = MPI.COMM_WORLD.Get_size() * self.rollout.nsteps * self.nenvs / since_last
    self.t_last_update = tnow

    return info
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100):
    """
    Run the PPO1 training loop: collect a trajectory segment, run several
    optimisation epochs over it, log statistics, and repeat until the
    timestep budget is reached.

    :param total_timesteps: timestep budget; a falsy value disables the budget
        check, in which case this loop has no exit condition of its own
    :param callback: optional callable invoked as
        ``callback(locals(), globals())`` at the start of every iteration;
        its return value is ignored
    :param seed: forwarded to ``self._setup_learn``
    :param log_interval: not read anywhere in this method body
    :return: ``self``
    """
    with SetVerbosity(self.verbose):
        self._setup_learn(seed)

        with self.sess.as_default():
            self.adam.sync()

            # Prepare for rollouts
            seg_gen = traj_segment_generator(self.policy_pi, self.env, self.timesteps_per_actorbatch)

            episodes_so_far = 0
            timesteps_so_far = 0
            iters_so_far = 0
            t_start = time.time()

            # rolling buffer for episode lengths
            lenbuffer = deque(maxlen=100)
            # rolling buffer for episode rewards
            rewbuffer = deque(maxlen=100)

            while True:
                # NOTE: locals() exposes this method's local names to the callback,
                # so they are deliberately left unchanged.
                if callback:
                    callback(locals(), globals())
                if total_timesteps and timesteps_so_far >= total_timesteps:
                    break

                if self.schedule == 'constant':
                    cur_lrmult = 1.0
                elif self.schedule == 'linear':
                    # NOTE(review): divides by total_timesteps, so the 'linear' schedule cannot be
                    # combined with a falsy budget - confirm callers never do that.
                    cur_lrmult = max(1.0 - float(timesteps_so_far) / total_timesteps, 0)
                else:
                    raise NotImplementedError

                logger.log("********** Iteration %i ************" % iters_so_far)

                seg = seg_gen.__next__()
                # Called for its effect on ``seg``: "adv" and "tdlamret" are read right below.
                add_vtarg_and_adv(seg, self.gamma, self.lam)

                obs_ph, action_ph, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]

                # predicted value function before update
                vpredbefore = seg["vpred"]

                # standardized advantage function estimate
                atarg = (atarg - atarg.mean()) / atarg.std()
                dataset = Dataset(dict(ob=obs_ph, ac=action_ph, atarg=atarg, vtarg=tdlamret),
                                  shuffle=not issubclass(self.policy, LstmPolicy))
                # Fall back to a single full-size batch when no batch size is configured.
                optim_batchsize = self.optim_batchsize or obs_ph.shape[0]

                # set old parameter values to new parameter values
                self.assign_old_eq_new(sess=self.sess)
                logger.log("Optimizing...")
                logger.log(fmt_row(13, self.loss_names))

                # Here we do a bunch of optimization epochs over the data
                for _ in range(self.optim_epochs):
                    # list of tuples, each of which gives the loss for a minibatch
                    losses = []
                    for batch in dataset.iterate_once(optim_batchsize):
                        *newlosses, grad = self.lossandgrad(batch["ob"], batch["ob"], batch["ac"],
                                                            batch["atarg"], batch["vtarg"], cur_lrmult,
                                                            sess=self.sess)
                        self.adam.update(grad, self.optim_stepsize * cur_lrmult)
                        losses.append(newlosses)
                    logger.log(fmt_row(13, np.mean(losses, axis=0)))

                logger.log("Evaluating losses...")
                losses = []
                for batch in dataset.iterate_once(optim_batchsize):
                    newlosses = self.compute_losses(batch["ob"], batch["ob"], batch["ac"], batch["atarg"],
                                                    batch["vtarg"], cur_lrmult, sess=self.sess)
                    losses.append(newlosses)
                mean_losses, _, _ = mpi_moments(losses, axis=0)
                logger.log(fmt_row(13, mean_losses))
                for (loss_val, name) in zipsame(mean_losses, self.loss_names):
                    logger.record_tabular("loss_" + name, loss_val)
                logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))

                # local values
                lrlocal = (seg["ep_lens"], seg["ep_rets"])

                # list of tuples
                listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)
                lens, rews = map(flatten_lists, zip(*listoflrpairs))
                lenbuffer.extend(lens)
                rewbuffer.extend(rews)
                # np.mean of an empty buffer is NaN and emits a RuntimeWarning, so
                # the rolling means are only recorded once an episode has finished.
                if len(lenbuffer) > 0:
                    logger.record_tabular("EpLenMean", np.mean(lenbuffer))
                    logger.record_tabular("EpRewMean", np.mean(rewbuffer))
                logger.record_tabular("EpThisIter", len(lens))
                episodes_so_far += len(lens)
                # NOTE(review): the segment's own "total_timestep" is added without any
                # MPI reduction - confirm this is the intended budget accounting.
                timesteps_so_far += seg["total_timestep"]
                iters_so_far += 1
                logger.record_tabular("EpisodesSoFar", episodes_so_far)
                logger.record_tabular("TimestepsSoFar", timesteps_so_far)
                logger.record_tabular("TimeElapsed", time.time() - t_start)
                # Only rank 0 dumps the table, and only when verbose.
                if self.verbose >= 1 and MPI.COMM_WORLD.Get_rank() == 0:
                    logger.dump_tabular()

    return self