def train(self):
    o = self.train_env.reset()
    first_tstart = time.perf_counter()
    for _epoch in range(self._epoch, self.total_epoch):
        tstart = time.perf_counter()
        for _t in range(self.nsteps):
            # Warm-up: act uniformly at random until start_steps, then use the policy.
            if self._t > self.start_steps:
                a = self.ac.act(np2tentor(o))
                a = action4env(a)
            else:
                a = np.concatenate([
                    self.train_env.action_space.sample().reshape(1, -1)
                    for _ in range(self.nenv)
                ], axis=0)
            o2, r, d, infos = self.train_env.step(a)
            self.buffer.store(o, a, r, o2, d)
            o = o2
            for info in infos:
                maybeepinfo = info.get('episode')
                if maybeepinfo:
                    logger.logkv_mean('eprewtrain', maybeepinfo['r'])
                    logger.logkv_mean('eplentrain', maybeepinfo['l'])
            self._t += 1
            # Off-policy updates: every update_every steps once update_after steps are collected.
            if self._t >= self.update_after and self._t % self.update_every == 0:
                self.update()
            if self._t > self.n_timesteps:
                break
        fps = int((_t + 1) / (time.perf_counter() - tstart))
        if (_epoch % self.log_freq == 0 or _epoch == self.total_epoch - 1):
            self.test_agent()
            logger.logkv('epoch', _epoch)
            logger.logkv('lr', self.optimizer.param_groups[0]['lr'])
            logger.logkv('timesteps', self._t)
            logger.logkv('fps', fps)
            logger.logkv('time_elapsed', time.perf_counter() - first_tstart)
            logger.dump_tabular()
        self._epoch = _epoch
        # self.save_model()
        self.lr_scheduler.step()
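# The loop above stores transitions with self.buffer.store(o, a, r, o2, d) and
# self.update() presumably samples minibatches from the same buffer. A minimal
# sketch of that replay-buffer interface, assuming flat observations and a
# single environment for simplicity (the vectorized loop above would store one
# row per env); class and method names here are illustrative assumptions, not
# the project's actual implementation.
import numpy as np


class ReplayBufferSketch:
    """Fixed-size FIFO store of (obs, act, rew, next_obs, done) transitions."""

    def __init__(self, obs_dim, act_dim, size):
        self.obs = np.zeros((size, obs_dim), dtype=np.float32)
        self.obs2 = np.zeros((size, obs_dim), dtype=np.float32)
        self.act = np.zeros((size, act_dim), dtype=np.float32)
        self.rew = np.zeros(size, dtype=np.float32)
        self.done = np.zeros(size, dtype=np.float32)
        self.ptr, self.size, self.max_size = 0, 0, size

    def store(self, o, a, r, o2, d):
        # Overwrite the oldest entry once the buffer is full.
        self.obs[self.ptr], self.act[self.ptr] = o, a
        self.rew[self.ptr], self.obs2[self.ptr], self.done[self.ptr] = r, o2, d
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample_batch(self, batch_size=256):
        idxs = np.random.randint(0, self.size, size=batch_size)
        return dict(obs=self.obs[idxs], act=self.act[idxs], rew=self.rew[idxs],
                    obs2=self.obs2[idxs], done=self.done[idxs])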
def log(self, rewards, aux_rewards, dones):
    dones = np.array(dones, dtype=int)
    for i, d in enumerate(self.logs['dones']):
        # Row 0 accumulates the episode being reported; row 1 the episode that
        # already started after env i finished.
        self.logs['ep_rew'][d, i] += rewards[i]  # the record is the reward rather than the aux_reward
        self.logs['aux_ep_rew'][d, i] += aux_rewards[i]
        if self.logs['dones'][i] + dones[i] == 2:
            # Env i finished a second episode before the slowest env finished its
            # first: promote the second episode into row 0 and reset row 1.
            self.logs['ep_rew'][0, i] = self.logs['ep_rew'][1, i]
            self.logs['aux_ep_rew'][0, i] = self.logs['aux_ep_rew'][1, i]
            self.logs['ep_rew'][1, i] = 0
            self.logs['aux_ep_rew'][1, i] = 0
    self.logs['eps'] += sum(dones)
    self.logs['dones'] = np.maximum(self.logs['dones'], dones)
    # Only report once every env has finished at least one episode.
    if sum(self.logs['dones']) < self.envs.num_envs:
        return
    # Split the per-env returns into per-task slices (self.maps maps a name to a number of envs).
    left = right = 0
    for key, value in self.maps.items():
        right += value
        self.logs[key + '_ep_rew'] = self.logs['ep_rew'][0][left:right]
        left = right
    self.logs['ep_rews'] = np.mean(self.logs['ep_rew'][0])
    self.logs['aux_ep_rews'] = np.mean(self.logs['aux_ep_rew'][0])
    self.logs['rew_best'] = max(self.logs['rew_best'], self.logs['ep_rews'])
    self.logs['aux_rew_best'] = max(self.logs['aux_rew_best'], self.logs['aux_ep_rews'])
    for key in self.maps:
        hasdata = len(self.logs[key + '_ep_rew']) > 0
        self.logs[key + '_ep_rews'] = np.mean(self.logs[key + '_ep_rew']) if hasdata else 0
        self.logs[key + '_rew_best'] = max(
            self.logs[key + '_rew_best'], self.logs[key + '_ep_rews']) if hasdata else 0
    # Snapshot the model when the first task's mean return exceeds the next_best threshold.
    if self.logs[list(self.maps)[0] + '_ep_rews'] > self.agent.next_best:
        if not self.args.distill_restore:
            self.agent.save_best()
            self.agent.next_best = self.logs[list(self.maps)[0] + '_ep_rews'] + self.agent.best_interval
            print('best snapshot saved')
    elapsed_time = time.time() - self.logs['start_time']
    frames = self.envs.num_envs * self.n_steps * self.logs['updates']
    logger.logkv('fps', int(frames / elapsed_time))
    logger.logkv('elapsed_time', int(elapsed_time))
    logger.logkv('n_eps', self.logs['eps'])
    logger.logkv('n_samples', frames)
    logger.logkv('n_updates', self.logs['updates'])
    logger.logkv('global_step', self.agent.get_global_step())
    logger.logkv('lr', self.agent.get_lr())
    logger.logkv('aux_ep_rew_best', self.logs['aux_rew_best'])
    logger.logkv('aux_ep_rew_max', np.max(self.logs['aux_ep_rew'][0]))
    logger.logkv('aux_ep_rew_mean', self.logs['aux_ep_rews'])
    i = 0
    for key in self.maps:
        pre = str(i) + '_' + key
        best_key = key + '_rew_best'
        ep_key = key + '_ep_rew'
        hasdata = len(self.logs[ep_key]) > 0
        logger.logkv(pre + '_rew_best', self.logs[best_key] if hasdata else '-')
        logger.logkv(pre + '_rew_max', np.max(self.logs[ep_key]) if hasdata else '-')
        logger.logkv(pre + '_rew_mean', np.mean(self.logs[ep_key]) if hasdata else '-')
        logger.logkv(pre + '_rew_std', np.std(self.logs[ep_key]) if hasdata else '-')
        logger.logkv(pre + '_rew_min', np.min(self.logs[ep_key]) if hasdata else '-')
        i += 1
    logger.dumpkvs()
    # Roll the bookkeeping forward: the in-progress episodes become the next report.
    self.logs['dones'] = np.zeros(self.envs.num_envs, dtype=int)
    self.logs['ep_rew'][0] = self.logs['ep_rew'][1]
    self.logs['ep_rew'][1] = np.zeros(self.envs.num_envs)
    self.logs['aux_ep_rew'][0] = self.logs['aux_ep_rew'][1]
    self.logs['aux_ep_rew'][1] = np.zeros(self.envs.num_envs)
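# log() above relies on a specific layout of self.logs: 'ep_rew' and
# 'aux_ep_rew' are 2 x num_envs arrays (row 0 = episode being reported,
# row 1 = episode already started after that env's done), 'dones' marks which
# envs have finished, and self.maps assigns a contiguous block of envs to each
# task name. A minimal sketch of the assumed initialization; the exact keys
# and defaults are assumptions, not the project's actual setup code.
def _init_logs(self):
    num_envs = self.envs.num_envs
    self.logs = {
        'start_time': time.time(),
        'updates': 0,
        'eps': 0,                                # finished episodes so far
        'dones': np.zeros(num_envs, dtype=int),  # 1 once env i finished its current episode
        'ep_rew': np.zeros((2, num_envs)),       # [reported episode, in-progress episode] returns
        'aux_ep_rew': np.zeros((2, num_envs)),
        'rew_best': -np.inf,
        'aux_rew_best': -np.inf,
    }
    for key in self.maps:                        # running best per task
        self.logs[key + '_rew_best'] = -np.inf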
def learn(self, total_timesteps, callback=None, log_interval=4,
          tb_log_name="SAC", reset_num_timesteps=True, replay_wrapper=None):
    self.mvs = self.env.mvs[0]
    self.dir = self.env.dir[0]
    self.server = self.env.server[0]
    new_tb_log = self._init_num_timesteps(reset_num_timesteps)
    callback = self._init_callback(callback)
    nupdates = total_timesteps // self.batch_size
    if replay_wrapper is not None:
        self.replay_buffer = replay_wrapper(self.replay_buffer)

    with SetVerbosity(self.verbose), TensorboardWriter(
            self.graph, self.tensorboard_log, tb_log_name, new_tb_log) as writer:
        self._setup_learn()

        # Transform to callable if needed
        self.learning_rate = get_schedule_fn(self.learning_rate)
        # Initial learning rate
        current_lr = self.learning_rate(1)

        start_time = time()
        episode_rewards = [0.0]
        episode_successes = []
        if self.action_noise is not None:
            self.action_noise.reset()
        # obs = self.env.reset()
        # Retrieve unnormalized observation for saving into the buffer
        if self._vec_normalize_env is not None:
            obs_ = self._vec_normalize_env.get_original_obs().squeeze()

        n_updates = 0
        infos_values = []

        callback.on_training_start(locals(), globals())
        callback.on_rollout_start()

        for update in range(nupdates):
            self._start()
            self.env.set_attr('id', [self.backend_proc.pid])
            obs = self.env.reset()
            rewards = []
            for step in range(self.batch_size):
                # Before training starts, randomly sample actions
                # from a uniform distribution for better exploration.
                # Afterwards, use the learned policy
                # if random_exploration is set to 0 (normal setting)
                if self.num_timesteps < self.learning_starts or np.random.rand() < self.random_exploration:
                    # actions sampled from action space are from range specific to the environment
                    # but algorithm operates on tanh-squashed actions therefore simple scaling is used
                    unscaled_action = self.env.action_space.sample()
                    action = scale_action(self.action_space, unscaled_action)
                else:
                    action = self.policy_tf.step(obs[None], deterministic=False).flatten()
                    # Add noise to the action (improve exploration,
                    # not needed in general)
                    if self.action_noise is not None:
                        action = np.clip(action + self.action_noise(), -1, 1)
                    # inferred actions need to be transformed to environment action_space before stepping
                    unscaled_action = unscale_action(self.action_space, action)

                assert action.shape == self.env.action_space.shape

                new_obs, reward, done, info = self.env.step(unscaled_action)
                self.num_timesteps += 1

                # Only stop training if return value is False, not when it is None. This is for backwards
                # compatibility with callbacks that have no return statement.
                callback.update_locals(locals())
                if callback.on_step() is False:
                    break

                # Store only the unnormalized version
                if self._vec_normalize_env is not None:
                    new_obs_ = self._vec_normalize_env.get_original_obs().squeeze()
                    reward_ = self._vec_normalize_env.get_original_reward().squeeze()
                else:
                    # Avoid changing the original ones
                    obs_, new_obs_, reward_ = obs, new_obs, reward

                # Store transition in the replay buffer.
                self.replay_buffer_add(obs_, action, reward_, new_obs_, done, info)
                obs = new_obs
                # Save the unnormalized observation
                if self._vec_normalize_env is not None:
                    obs_ = new_obs_

                # Retrieve reward and episode length if using Monitor wrapper
                rewards.append(reward)

                if writer is not None:
                    # Write reward per episode to tensorboard
                    ep_reward = np.array([reward_]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    tf_util.total_episode_reward_logger(
                        self.episode_reward, ep_reward, ep_done,
                        writer, self.num_timesteps)

                episode_rewards[-1] += reward_
                if done:
                    if self.action_noise is not None:
                        self.action_noise.reset()
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()
                    episode_rewards.append(0.0)
                    maybe_is_success = info.get('is_success')
                    if maybe_is_success is not None:
                        episode_successes.append(float(maybe_is_success))

                if len(episode_rewards[-101:-1]) == 0:
                    mean_reward = -np.inf
                else:
                    mean_reward = round(float(np.mean(episode_rewards[-101:-1])), 1)
                num_episodes = len(episode_rewards)

            stop_solver(self.backend_proc)
            delete_id(self.server, self.backend_proc.pid)
            self.ep_info_buf.append({'r': safe_mean(rewards)})

            # Display training infos
            fps = int(step * update / (time() - start_time))
            logger.logkv("episodes", update + 1)
            if len(self.ep_info_buf) > 0 and len(self.ep_info_buf[0]) > 0:
                logger.logkv('ep_rew_mean',
                             safe_mean([ep_info['r'] for ep_info in self.ep_info_buf]))
            logger.logkv("current_lr", current_lr)
            logger.logkv("fps", fps)
            logger.logkv('time_elapsed', int(time() - start_time))
            if len(infos_values) > 0:
                for (name, val) in zip(self.infos_names, infos_values):
                    logger.logkv(name, val)
            logger.logkv("total_timesteps", self.num_timesteps)
            logger.dumpkvs()
            # reset infos:
            infos_values = []

            # train
            callback.on_rollout_end()
            mb_infos_vals = []
            # Update policy, critics and target networks
            for grad_step in range(self.gradient_steps):
                # Break if the warmup phase is not over
                # or if there are not enough samples in the replay buffer
                if not self.replay_buffer.can_sample(self.batch_size) \
                        or self.num_timesteps < self.learning_starts:
                    break
                n_updates += 1
                # Compute current learning_rate
                frac = 1.0 - update / nupdates
                current_lr = self.learning_rate(frac)
                # Update policy and critics (q functions)
                mb_infos_vals.append(self._train_step(update, writer, current_lr))
                # Update target network
                self.sess.run(self.target_update_op)
            # Log losses and entropy, useful for monitor training
            if len(mb_infos_vals) > 0:
                infos_values = np.mean(mb_infos_vals, axis=0)

            callback.on_rollout_start()

        callback.on_training_end()
        return self
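# scale_action / unscale_action used in the rollout above convert between the
# environment's Box action range [low, high] and the [-1, 1] range of the
# tanh-squashed policy. A minimal sketch of that linear rescaling, assuming a
# Box action space; the actual helpers used by this codebase may differ.
def scale_action_sketch(action_space, action):
    # environment range -> [-1, 1]
    low, high = action_space.low, action_space.high
    return 2.0 * ((action - low) / (high - low)) - 1.0


def unscale_action_sketch(action_space, scaled_action):
    # [-1, 1] -> environment range
    low, high = action_space.low, action_space.high
    return low + 0.5 * (scaled_action + 1.0) * (high - low)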
def learn(self, total_timesteps, callback=None, log_interval=1,
          tb_log_name="PPO2", reset_num_timesteps=True):
    # Transform to callable if needed
    self.learning_rate = get_schedule_fn(self.learning_rate)
    self.cliprange = get_schedule_fn(self.cliprange)
    cliprange_vf = get_schedule_fn(self.cliprange_vf)

    new_tb_log = self._init_num_timesteps(reset_num_timesteps)
    callback = self._init_callback(callback)

    with SetVerbosity(self.verbose), TensorboardWriter(
            self.graph, self.tensorboard_log, tb_log_name, new_tb_log) as writer:
        self._setup_learn()

        t_first_start = time.time()
        n_updates = total_timesteps // self.n_batch

        callback.on_training_start(locals(), globals())

        for update in range(1, n_updates + 1):
            assert self.n_batch % self.nminibatches == 0, (
                "The number of minibatches (`nminibatches`) "
                "is not a factor of the total number of samples "
                "collected per rollout (`n_batch`), "
                "some samples won't be used.")
            batch_size = self.n_batch // self.nminibatches
            t_start = time.time()
            frac = 1.0 - (update - 1.0) / n_updates
            lr_now = self.learning_rate(frac)
            cliprange_now = self.cliprange(frac)
            cliprange_vf_now = cliprange_vf(frac)

            callback.on_rollout_start()
            # true_reward is the reward without discount
            rollout = self.runner.run(callback)
            # Unpack
            obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = rollout
            callback.on_rollout_end()

            # Early stopping due to the callback
            if not self.runner.continue_training:
                break

            self.ep_info_buf.extend(ep_infos)
            mb_loss_vals = []
            if states is None:  # nonrecurrent version
                update_fac = max(self.n_batch // self.nminibatches // self.noptepochs, 1)
                inds = np.arange(self.n_batch)
                for epoch_num in range(self.noptepochs):
                    np.random.shuffle(inds)
                    for start in range(0, self.n_batch, batch_size):
                        timestep = self.num_timesteps // update_fac + (
                            (epoch_num * self.n_batch + start) // batch_size)
                        end = start + batch_size
                        mbinds = inds[start:end]
                        slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                        mb_loss_vals.append(
                            self._train_step(lr_now, cliprange_now, *slices,
                                             writer=writer, update=timestep,
                                             cliprange_vf=cliprange_vf_now))
            else:  # recurrent version
                update_fac = max(self.n_batch // self.nminibatches // self.noptepochs // self.n_steps, 1)
                assert self.n_envs % self.nminibatches == 0
                env_indices = np.arange(self.n_envs)
                flat_indices = np.arange(self.n_envs * self.n_steps).reshape(self.n_envs, self.n_steps)
                envs_per_batch = batch_size // self.n_steps
                for epoch_num in range(self.noptepochs):
                    np.random.shuffle(env_indices)
                    for start in range(0, self.n_envs, envs_per_batch):
                        timestep = self.num_timesteps // update_fac + (
                            (epoch_num * self.n_envs + start) // envs_per_batch)
                        end = start + envs_per_batch
                        mb_env_inds = env_indices[start:end]
                        mb_flat_inds = flat_indices[mb_env_inds].ravel()
                        slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                        mb_states = states[mb_env_inds]
                        mb_loss_vals.append(
                            self._train_step(lr_now, cliprange_now, *slices,
                                             update=timestep, writer=writer,
                                             states=mb_states,
                                             cliprange_vf=cliprange_vf_now))

            loss_vals = np.mean(mb_loss_vals, axis=0)
            t_now = time.time()
            fps = int(self.n_batch / (t_now - t_start))

            if writer is not None:
                total_episode_reward_logger(
                    self.episode_reward,
                    true_reward.reshape((self.n_envs, self.n_steps)),
                    masks.reshape((self.n_envs, self.n_steps)),
                    writer, self.num_timesteps)

            if self.eval_env is not None:
                rollout = self.runner._run()
                obs, returns, masks, actions, values, neglogpacs, states, eval_ep_infos, true_reward = rollout
                self.eval_ep_info_buf.extend(eval_ep_infos)
                logger.logkv(
                    'eval_ep_reward_mean',
                    safe_mean([ep_info['r'] for ep_info in self.eval_ep_info_buf]))

            if self.verbose >= 1 and (update % log_interval == 0 or update == 1):
                explained_var = explained_variance(values, returns)
                logger.logkv("serial_timesteps", update * self.n_steps)
                logger.logkv("n_updates", update)
                logger.logkv("total_timesteps", self.num_timesteps)
                logger.logkv("fps", fps)
                logger.logkv("explained_variance", float(explained_var))
                if len(self.ep_info_buf) > 0 and len(self.ep_info_buf[0]) > 0:
                    logger.logkv('ep_reward_mean',
                                 safe_mean([ep_info['r'] for ep_info in self.ep_info_buf]))
                    if 'rc1' in self.ep_info_buf[0].keys():
                        logger.logkv('ep_reward_c1_mean',
                                     safe_mean([ep_info['rc1'] for ep_info in self.ep_info_buf]))
                        logger.logkv('ep_reward_c2_mean',
                                     safe_mean([ep_info['rc2'] for ep_info in self.ep_info_buf]))
                        logger.logkv('ep_reward_c3_mean',
                                     safe_mean([ep_info['rc3'] for ep_info in self.ep_info_buf]))
                logger.logkv('time_elapsed', t_start - t_first_start)
                for (loss_val, loss_name) in zip(loss_vals, self.loss_names):
                    logger.logkv(loss_name, loss_val)
                logger.dumpkvs()

        callback.on_training_end()
        return self
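# explained_variance logged above measures how much of the variance in the
# empirical returns the value function explains: 1 is a perfect fit, 0 is no
# better than predicting the mean, negative is worse. A minimal sketch of that
# computation, assuming 1-D numpy arrays; the project uses its own helper.
import numpy as np


def explained_variance_sketch(y_pred, y_true):
    var_y = np.var(y_true)
    return np.nan if var_y == 0 else 1.0 - np.var(y_true - y_pred) / var_y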
def train(self):
    first_tstart = time.perf_counter()
    for _epoch in range(self._epoch, self.total_epoch):
        tstart = time.perf_counter()
        frac = 1. - _epoch * 1. / self.total_epoch
        clip_ratio_now = self.clip_ratio(frac)
        if (_epoch % self.log_freq == 0 or _epoch == self.total_epoch - 1) and self.is_mpi_root:
            logger.log('Stepping environment...')
        # collect data
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = self.collect()
        # if eval_env is not None:
        #     eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run()  # pylint: disable=E0632
        if (_epoch % self.log_freq == 0 or _epoch == self.total_epoch - 1) and self.is_mpi_root:
            logger.log('done')
        self.epinfobuf.extend(epinfos)
        # if eval_env is not None:
        #     eval_epinfobuf.extend(eval_epinfos)
        self.update(obs, returns, masks, actions, values, neglogpacs, clip_ratio_now, states)
        self.lr_scheduler.step()
        fps = int(self.nbatch / (time.perf_counter() - tstart))
        if (_epoch % self.log_freq == 0 or _epoch == self.total_epoch - 1) and self.is_mpi_root:
            logger.logkv('epoch', _epoch)
            logger.logkv('lr', self.optimizer.param_groups[0]['lr'])
            logger.logkv('timesteps', (_epoch + 1) * self.nbatch)
            logger.logkv('fps', fps)
            logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in self.epinfobuf]))
            logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in self.epinfobuf]))
            logger.logkv('time_elapsed', time.perf_counter() - first_tstart)
            logger.dump_tabular()
        self._epoch = _epoch
        self.save_model()
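# The epoch loop above anneals hyperparameters by evaluating schedules on the
# remaining-training fraction: frac is 1.0 at the first epoch and approaches
# 0.0 at the last, so a schedule returning frac * initial_value decays
# linearly. A minimal sketch of such a schedule and of safemean, which must
# tolerate an empty epinfobuf; names are illustrative assumptions, the project
# defines its own versions.
import numpy as np


def linear_schedule_sketch(initial_value):
    def schedule(frac):
        # frac in (0, 1], fraction of training still to go
        return frac * initial_value
    return schedule


def safemean_sketch(xs):
    # Avoid numpy's empty-slice warning and make the empty case explicit.
    return np.nan if len(xs) == 0 else float(np.mean(xs))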