Example #1
    def train(self):

        o = self.train_env.reset()
        first_tstart = time.perf_counter()
        for _epoch in range(self._epoch, self.total_epoch):
            tstart = time.perf_counter()
            for _t in range(self.nsteps):

                # Warm-up: sample uniform random actions for the first
                # start_steps environment steps, then act with the policy.
                if self._t > self.start_steps:
                    a = self.ac.act(np2tentor(o))
                    a = action4env(a)
                else:
                    a = np.concatenate([
                        self.train_env.action_space.sample().reshape(1, -1)
                        for _ in range(self.nenv)
                    ], axis=0)
                o2, r, d, infos = self.train_env.step(a)
                self.buffer.store(o, a, r, o2, d)
                o = o2

                # Log per-episode returns/lengths reported by a Monitor-style wrapper.
                for info in infos:
                    maybeepinfo = info.get('episode')
                    if maybeepinfo:
                        logger.logkv_mean('eprewtrain', maybeepinfo['r'])
                        logger.logkv_mean('eplentrain', maybeepinfo['l'])

                self._t += 1
                # Start gradient updates after update_after steps, then update
                # every update_every environment steps.
                if self._t >= self.update_after and self._t % self.update_every == 0:
                    self.update()
                if self._t > self.n_timesteps:
                    break

            fps = int((_t + 1) / (time.perf_counter() - tstart))

            if (_epoch % self.log_freq == 0 or _epoch == self.total_epoch - 1):
                self.test_agent()
                logger.logkv('epoch', _epoch)
                logger.logkv('lr', self.optimizer.param_groups[0]['lr'])
                logger.logkv('timesteps', self._t)
                logger.logkv('fps', fps)
                logger.logkv('time_elapsed',
                             time.perf_counter() - first_tstart)
                logger.dump_tabular()
                self._epoch = _epoch
                # self.save_model()
            self.lr_scheduler.step()
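
Note: the loop above falls back to uniform random actions until self._t exceeds start_steps and only then queries the learned policy. Below is a minimal standalone sketch of that warm-up pattern; the names, shapes and the dummy policy are illustrative only and not taken from the snippet.

import numpy as np

N_ENVS, ACT_DIM, START_STEPS = 4, 2, 1000  # hypothetical sizes

def select_actions(t, policy, obs, action_low, action_high, rng):
    """Uniform random actions during warm-up, policy actions afterwards."""
    if t <= START_STEPS:
        # One uniformly sampled action row per parallel environment.
        return rng.uniform(action_low, action_high, size=(N_ENVS, ACT_DIM))
    return policy(obs)  # expected to return an (N_ENVS, ACT_DIM) array

rng = np.random.default_rng(0)
dummy_policy = lambda obs: np.zeros((N_ENVS, ACT_DIM))
a = select_actions(t=10, policy=dummy_policy, obs=None,
                   action_low=-1.0, action_high=1.0, rng=rng)
print(a.shape)  # (4, 2)
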
Example #2
    def log(self, rewards, aux_rewards, dones):
        dones = np.array(dones, dtype=int)
        # Row 0 of 'ep_rew' accumulates the episode to be reported; row 1
        # accumulates the episode already running in envs that have finished
        # one (d == 1).
        for i, d in enumerate(self.logs['dones']):
            # 'ep_rew' stores the raw reward, not the aux reward.
            self.logs['ep_rew'][d, i] += rewards[i]
            self.logs['aux_ep_rew'][d, i] += aux_rewards[i]
            # If env i finishes a second episode before the others finish
            # their first, keep the most recent one in the reporting slot.
            if self.logs['dones'][i] + dones[i] == 2:
                self.logs['ep_rew'][0, i] = self.logs['ep_rew'][1, i]
                self.logs['aux_ep_rew'][0, i] = self.logs['aux_ep_rew'][1, i]
                self.logs['ep_rew'][1, i] = 0
                self.logs['aux_ep_rew'][1, i] = 0

        self.logs['eps'] += sum(dones)
        self.logs['dones'] = np.maximum(self.logs['dones'], dones)
        # Dump stats only once every env has finished at least one episode.
        if sum(self.logs['dones']) < self.envs.num_envs:
            return

        # Slice the flat per-env reward vector into contiguous per-map blocks.
        left = right = 0
        for key, value in self.maps.items():
            right += value
            self.logs[key + '_ep_rew'] = self.logs['ep_rew'][0][left:right]
            left = right

        self.logs['ep_rews'] = np.mean(self.logs['ep_rew'][0])
        self.logs['aux_ep_rews'] = np.mean(self.logs['aux_ep_rew'][0])
        self.logs['rew_best'] = max(self.logs['rew_best'],
                                    self.logs['ep_rews'])
        self.logs['aux_rew_best'] = max(self.logs['aux_rew_best'],
                                        self.logs['aux_ep_rews'])

        for key in self.maps:
            hasdata = len(self.logs[key + '_ep_rew']) > 0
            self.logs[key + '_ep_rews'] = np.mean(
                self.logs[key + '_ep_rew']) if hasdata else 0
            self.logs[key + '_rew_best'] = max(
                self.logs[key + '_rew_best'],
                self.logs[key + '_ep_rews']) if hasdata else 0

        # Save a new best snapshot once the primary map's mean return exceeds
        # next_best, then raise the threshold by best_interval.
        first_map_rews = self.logs[list(self.maps)[0] + '_ep_rews']
        if first_map_rews > self.agent.next_best:
            if not self.args.distill_restore:
                self.agent.save_best()
            self.agent.next_best = first_map_rews + self.agent.best_interval
            print('best snapshot saved')

        elapsed_time = time.time() - self.logs['start_time']
        frames = self.envs.num_envs * self.n_steps * self.logs['updates']

        logger.logkv('fps', int(frames / elapsed_time))
        logger.logkv('elapsed_time', int(elapsed_time))
        logger.logkv('n_eps', self.logs['eps'])
        logger.logkv('n_samples', frames)
        logger.logkv('n_updates', self.logs['updates'])
        logger.logkv('global_step', self.agent.get_global_step())
        logger.logkv('lr', self.agent.get_lr())
        logger.logkv('aux_ep_rew_best', self.logs['aux_rew_best'])
        logger.logkv('aux_ep_rew_max', np.max(self.logs['aux_ep_rew'][0]))
        logger.logkv('aux_ep_rew_mean', self.logs['aux_ep_rews'])
        for i, key in enumerate(self.maps):
            pre = str(i) + '_' + key
            best_key = key + '_rew_best'
            ep_key = key + '_ep_rew'
            hasdata = len(self.logs[ep_key]) > 0
            logger.logkv(pre + '_rew_best',
                         self.logs[best_key] if hasdata else '-')
            logger.logkv(pre + '_rew_max',
                         np.max(self.logs[ep_key]) if hasdata else '-')
            logger.logkv(pre + '_rew_mean',
                         np.mean(self.logs[ep_key]) if hasdata else '-')
            logger.logkv(pre + '_rew_std',
                         np.std(self.logs[ep_key]) if hasdata else '-')
            logger.logkv(pre + '_rew_min',
                         np.min(self.logs[ep_key]) if hasdata else '-')
        logger.dumpkvs()

        # Reset for the next logging window: promote the in-progress episodes
        # (row 1) to the reporting slot (row 0) and clear row 1.
        self.logs['dones'] = np.zeros(self.envs.num_envs, dtype=int)
        self.logs['ep_rew'][0] = self.logs['ep_rew'][1]
        self.logs['ep_rew'][1] = np.zeros(self.envs.num_envs)
        self.logs['aux_ep_rew'][0] = self.logs['aux_ep_rew'][1]
        self.logs['aux_ep_rew'][1] = np.zeros(self.envs.num_envs)
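
Note: the left/right loop over self.maps above cuts the flat per-env episode-reward vector into contiguous blocks, one per named map. A small self-contained illustration of that slicing (map names and env counts here are made up):

import numpy as np

maps = {'map_a': 3, 'map_b': 2}                 # hypothetical map -> n_envs
ep_rew = np.array([1.0, 2.0, 3.0, 10.0, 20.0])  # one entry per parallel env

left = right = 0
per_map = {}
for key, n_envs in maps.items():
    right += n_envs
    per_map[key] = ep_rew[left:right]           # contiguous block for this map
    left = right

print({k: float(v.mean()) for k, v in per_map.items()})
# {'map_a': 2.0, 'map_b': 15.0}
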
Example #3
    def learn(self,
              total_timesteps,
              callback=None,
              log_interval=4,
              tb_log_name="SAC",
              reset_num_timesteps=True,
              replay_wrapper=None):

        self.mvs = self.env.mvs[0]
        self.dir = self.env.dir[0]
        self.server = self.env.server[0]

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)
        callback = self._init_callback(callback)

        nupdates = total_timesteps // self.batch_size

        if replay_wrapper is not None:
            self.replay_buffer = replay_wrapper(self.replay_buffer)

        with SetVerbosity(self.verbose), TensorboardWriter(
                self.graph, self.tensorboard_log, tb_log_name,
                new_tb_log) as writer:

            self._setup_learn()

            # Transform to callable if needed
            self.learning_rate = get_schedule_fn(self.learning_rate)
            # Initial learning rate
            current_lr = self.learning_rate(1)

            start_time = time()
            episode_rewards = [0.0]
            episode_successes = []
            if self.action_noise is not None:
                self.action_noise.reset()

            #obs = self.env.reset()

            # Retrieve unnormalized observation for saving into the buffer
            if self._vec_normalize_env is not None:
                obs_ = self._vec_normalize_env.get_original_obs().squeeze()

            n_updates = 0
            infos_values = []

            callback.on_training_start(locals(), globals())
            callback.on_rollout_start()

            for update in range(nupdates):

                self._start()
                self.env.set_attr('id', [self.backend_proc.pid])
                obs = self.env.reset()
                rewards = []

                for step in range(self.batch_size):

                    # Before training starts, randomly sample actions
                    # from a uniform distribution for better exploration.
                    # Afterwards, use the learned policy
                    # if random_exploration is set to 0 (normal setting)

                    if (self.num_timesteps < self.learning_starts
                            or np.random.rand() < self.random_exploration):

                        # Actions sampled from the action space are in the
                        # environment-specific range, but the algorithm works
                        # on tanh-squashed actions, so they are rescaled here.

                        unscaled_action = self.env.action_space.sample()
                        action = scale_action(self.action_space,
                                              unscaled_action)

                    else:

                        action = self.policy_tf.step(
                            obs[None], deterministic=False).flatten()

                        # Add noise to the action (improve exploration,
                        # not needed in general)

                        if self.action_noise is not None:
                            action = np.clip(action + self.action_noise(), -1,
                                             1)

                        # inferred actions need to be transformed to environment action_space before stepping

                        unscaled_action = unscale_action(
                            self.action_space, action)

                    assert action.shape == self.env.action_space.shape

                    new_obs, reward, done, info = self.env.step(
                        unscaled_action)

                    self.num_timesteps += 1

                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.

                    callback.update_locals(locals())
                    if callback.on_step() is False:
                        break

                    # Store only the unnormalized version

                    if self._vec_normalize_env is not None:

                        new_obs_ = self._vec_normalize_env.get_original_obs().squeeze()
                        reward_ = self._vec_normalize_env.get_original_reward().squeeze()

                    else:

                        # Avoid changing the original ones

                        obs_, new_obs_, reward_ = obs, new_obs, reward

                    # Store transition in the replay buffer.

                    self.replay_buffer_add(obs_, action, reward_, new_obs_,
                                           done, info)
                    obs = new_obs

                    # Save the unnormalized observation

                    if self._vec_normalize_env is not None:
                        obs_ = new_obs_

                    # Retrieve reward and episode length if using Monitor wrapper

                    rewards.append(reward)

                    if writer is not None:
                        # Write reward per episode to tensorboard
                        ep_reward = np.array([reward_]).reshape((1, -1))
                        ep_done = np.array([done]).reshape((1, -1))
                        tf_util.total_episode_reward_logger(
                            self.episode_reward, ep_reward, ep_done, writer,
                            self.num_timesteps)

                    episode_rewards[-1] += reward_
                    if done:
                        if self.action_noise is not None:
                            self.action_noise.reset()
                        if not isinstance(self.env, VecEnv):
                            obs = self.env.reset()
                        episode_rewards.append(0.0)

                        maybe_is_success = info.get('is_success')
                        if maybe_is_success is not None:
                            episode_successes.append(float(maybe_is_success))

                    if len(episode_rewards[-101:-1]) == 0:
                        mean_reward = -np.inf
                    else:
                        mean_reward = round(
                            float(np.mean(episode_rewards[-101:-1])), 1)

                    num_episodes = len(episode_rewards)

                stop_solver(self.backend_proc)
                delete_id(self.server, self.backend_proc.pid)

                self.ep_info_buf.append({'r': safe_mean(rewards)})

                # Display training infos

                fps = int((step + 1) * (update + 1) / (time() - start_time))
                logger.logkv("episodes", update + 1)
                if len(self.ep_info_buf) > 0 and len(self.ep_info_buf[0]) > 0:
                    logger.logkv(
                        'ep_rew_mean',
                        safe_mean(
                            [ep_info['r'] for ep_info in self.ep_info_buf]))
                logger.logkv("current_lr", current_lr)
                logger.logkv("fps", fps)
                logger.logkv('time_elapsed', int(time() - start_time))
                if len(infos_values) > 0:
                    for (name, val) in zip(self.infos_names, infos_values):
                        logger.logkv(name, val)
                logger.logkv("total_timesteps", self.num_timesteps)
                logger.dumpkvs()

                # reset infos:

                infos_values = []

                # train

                callback.on_rollout_end()

                mb_infos_vals = []

                # Update policy, critics and target networks

                for grad_step in range(self.gradient_steps):

                    # Break if the warmup phase is not over
                    # or if there are not enough samples in the replay buffer

                    if (not self.replay_buffer.can_sample(self.batch_size)
                            or self.num_timesteps < self.learning_starts):
                        break
                    n_updates += 1

                    # Compute current learning_rate

                    frac = 1.0 - update / nupdates
                    current_lr = self.learning_rate(frac)

                    # Update policy and critics (q functions)

                    mb_infos_vals.append(
                        self._train_step(update, writer, current_lr))

                    # Update target network

                    self.sess.run(self.target_update_op)

                # Log losses and entropy, useful for monitor training

                if len(mb_infos_vals) > 0:
                    infos_values = np.mean(mb_infos_vals, axis=0)

                callback.on_rollout_start()

            callback.on_training_end()

            return self
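
Note: the SAC loop above keeps tanh-squashed actions in [-1, 1] internally and converts to the environment's native range only when stepping. The sketch below shows the usual linear mapping for Box spaces, written over explicit low/high bounds rather than a space object; stable-baselines' scale_action/unscale_action behave like this, but treat the sketch as an approximation and check the version you use.

import numpy as np

def scale_action(low, high, unscaled):
    """Map an action from [low, high] to [-1, 1]."""
    return 2.0 * (unscaled - low) / (high - low) - 1.0

def unscale_action(low, high, scaled):
    """Map an action from [-1, 1] back to [low, high]."""
    return low + 0.5 * (scaled + 1.0) * (high - low)

low, high = np.array([-2.0, 0.0]), np.array([2.0, 1.0])
a = np.array([0.5, -1.0])                 # tanh-squashed action
env_a = unscale_action(low, high, a)      # -> array([1.0, 0.0])
assert np.allclose(scale_action(low, high, env_a), a)
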
Example #4
    def learn(self,
              total_timesteps,
              callback=None,
              log_interval=1,
              tb_log_name="PPO2",
              reset_num_timesteps=True):

        # Transform to callable if needed
        self.learning_rate = get_schedule_fn(self.learning_rate)
        self.cliprange = get_schedule_fn(self.cliprange)
        cliprange_vf = get_schedule_fn(self.cliprange_vf)

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)
        callback = self._init_callback(callback)

        with SetVerbosity(self.verbose), TensorboardWriter(
                self.graph, self.tensorboard_log, tb_log_name,
                new_tb_log) as writer:
            self._setup_learn()

            t_first_start = time.time()
            n_updates = total_timesteps // self.n_batch

            callback.on_training_start(locals(), globals())

            for update in range(1, n_updates + 1):

                assert self.n_batch % self.nminibatches == 0, (
                    "The number of minibatches (`nminibatches`) "
                    "is not a factor of the total number of samples "
                    "collected per rollout (`n_batch`), "
                    "some samples won't be used.")

                batch_size = self.n_batch // self.nminibatches
                t_start = time.time()
                frac = 1.0 - (update - 1.0) / n_updates
                lr_now = self.learning_rate(frac)
                cliprange_now = self.cliprange(frac)
                cliprange_vf_now = cliprange_vf(frac)

                callback.on_rollout_start()
                # true_reward is the reward without discount
                rollout = self.runner.run(callback)
                # Unpack
                obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = rollout
                callback.on_rollout_end()

                # Early stopping due to the callback
                if not self.runner.continue_training:
                    break

                self.ep_info_buf.extend(ep_infos)
                mb_loss_vals = []

                if states is None:  # nonrecurrent version
                    update_fac = max(
                        self.n_batch // self.nminibatches // self.noptepochs,
                        1)
                    inds = np.arange(self.n_batch)
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(inds)
                        for start in range(0, self.n_batch, batch_size):
                            timestep = self.num_timesteps // update_fac + (
                                (epoch_num * self.n_batch + start) //
                                batch_size)
                            end = start + batch_size
                            mbinds = inds[start:end]
                            slices = (arr[mbinds]
                                      for arr in (obs, returns, masks, actions,
                                                  values, neglogpacs))
                            mb_loss_vals.append(
                                self._train_step(
                                    lr_now,
                                    cliprange_now,
                                    *slices,
                                    writer=writer,
                                    update=timestep,
                                    cliprange_vf=cliprange_vf_now))
                else:  # recurrent version
                    update_fac = max(
                        self.n_batch // self.nminibatches // self.noptepochs //
                        self.n_steps, 1)
                    assert self.n_envs % self.nminibatches == 0
                    env_indices = np.arange(self.n_envs)
                    flat_indices = np.arange(self.n_envs *
                                             self.n_steps).reshape(
                                                 self.n_envs, self.n_steps)
                    envs_per_batch = batch_size // self.n_steps
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(env_indices)
                        for start in range(0, self.n_envs, envs_per_batch):
                            timestep = self.num_timesteps // update_fac + (
                                (epoch_num * self.n_envs + start) //
                                envs_per_batch)
                            end = start + envs_per_batch
                            mb_env_inds = env_indices[start:end]
                            mb_flat_inds = flat_indices[mb_env_inds].ravel()
                            slices = (arr[mb_flat_inds]
                                      for arr in (obs, returns, masks, actions,
                                                  values, neglogpacs))
                            mb_states = states[mb_env_inds]
                            mb_loss_vals.append(
                                self._train_step(
                                    lr_now,
                                    cliprange_now,
                                    *slices,
                                    update=timestep,
                                    writer=writer,
                                    states=mb_states,
                                    cliprange_vf=cliprange_vf_now))

                loss_vals = np.mean(mb_loss_vals, axis=0)
                t_now = time.time()
                fps = int(self.n_batch / (t_now - t_start))

                if writer is not None:
                    total_episode_reward_logger(
                        self.episode_reward,
                        true_reward.reshape((self.n_envs, self.n_steps)),
                        masks.reshape((self.n_envs, self.n_steps)), writer,
                        self.num_timesteps)

                if self.eval_env is not None:
                    rollout = self.runner._run()
                    obs, returns, masks, actions, values, neglogpacs, states, eval_ep_infos, true_reward = rollout
                    self.eval_ep_info_buf.extend(eval_ep_infos)
                    logger.logkv(
                        'eval_ep_reward_mean',
                        safe_mean([
                            ep_info['r'] for ep_info in self.eval_ep_info_buf
                        ]))

                if self.verbose >= 1 and (update % log_interval == 0
                                          or update == 1):
                    explained_var = explained_variance(values, returns)
                    logger.logkv("serial_timesteps", update * self.n_steps)
                    logger.logkv("n_updates", update)
                    logger.logkv("total_timesteps", self.num_timesteps)
                    logger.logkv("fps", fps)
                    logger.logkv("explained_variance", float(explained_var))
                    if len(self.ep_info_buf) > 0 and len(
                            self.ep_info_buf[0]) > 0:
                        logger.logkv(
                            'ep_reward_mean',
                            safe_mean([
                                ep_info['r'] for ep_info in self.ep_info_buf
                            ]))
                        if 'rc1' in self.ep_info_buf[0].keys():
                            logger.logkv(
                                'ep_reward_c1_mean',
                                safe_mean([
                                    ep_info['rc1']
                                    for ep_info in self.ep_info_buf
                                ]))
                            logger.logkv(
                                'ep_reward_c2_mean',
                                safe_mean([
                                    ep_info['rc2']
                                    for ep_info in self.ep_info_buf
                                ]))
                            logger.logkv(
                                'ep_reward_c3_mean',
                                safe_mean([
                                    ep_info['rc3']
                                    for ep_info in self.ep_info_buf
                                ]))
                    logger.logkv('time_elapsed', t_start - t_first_start)
                    for (loss_val, loss_name) in zip(loss_vals,
                                                     self.loss_names):
                        logger.logkv(loss_name, loss_val)
                    logger.dumpkvs()

            callback.on_training_end()
            return self
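
Note: in the PPO2 loop above, the learning rate and clip range are queried through schedule functions with frac running from 1.0 down toward 0.0 over training; get_schedule_fn wraps a plain number into a constant schedule and passes a callable through unchanged. A rough sketch of that pattern (the linear-decay callable is just an example, not the library's default):

def constant_schedule(value):
    return lambda frac: value

lr_schedule = lambda frac: 2.5e-4 * frac   # user-supplied linear decay
clip_schedule = constant_schedule(0.2)     # fixed clip range

n_updates = 4
for update in range(1, n_updates + 1):
    frac = 1.0 - (update - 1.0) / n_updates  # 1.0, 0.75, 0.5, 0.25
    lr_now, clip_now = lr_schedule(frac), clip_schedule(frac)
    print(update, lr_now, clip_now)
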
Example #5
    def train(self):
        first_tstart = time.perf_counter()
        for _epoch in range(self._epoch, self.total_epoch):
            tstart = time.perf_counter()
            frac = 1. - _epoch * 1. / self.total_epoch
            clip_ratio_now = self.clip_ratio(frac)
            # Log every log_freq epochs (and on the last epoch), MPI root only.
            is_log_epoch = ((_epoch % self.log_freq == 0
                             or _epoch == self.total_epoch - 1)
                            and self.is_mpi_root)
            if is_log_epoch:
                logger.log('Stepping environment...')

            # collect data
            obs, returns, masks, actions, values, neglogpacs, states, epinfos = self.collect()
            # if eval_env is not None:
            #     eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run()  # pylint: disable=E0632

            if is_log_epoch:
                logger.log('done')

            self.epinfobuf.extend(epinfos)
            # if eval_env is not None:
            #     eval_epinfobuf.extend(eval_epinfos)
            self.update(obs, returns, masks, actions, values, neglogpacs,
                        clip_ratio_now, states)
            self.lr_scheduler.step()
            fps = int(self.nbatch / (time.perf_counter() - tstart))
            if is_log_epoch:
                logger.logkv('epoch', _epoch)
                logger.logkv('lr', self.optimizer.param_groups[0]['lr'])
                logger.logkv('timesteps', (_epoch + 1) * self.nbatch)
                logger.logkv('fps', fps)
                logger.logkv(
                    'eprewmean',
                    safemean([epinfo['r'] for epinfo in self.epinfobuf]))
                logger.logkv(
                    'eplenmean',
                    safemean([epinfo['l'] for epinfo in self.epinfobuf]))
                logger.logkv('time_elapsed',
                             time.perf_counter() - first_tstart)
                logger.dump_tabular()
                self._epoch = _epoch
                self.save_model()
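
Note: safemean above averages Monitor-style episode infos collected in epinfobuf and has to cope with an empty buffer early in training. A minimal reproduction of that pattern (the safemean body is assumed here, matching the common baselines-style helper):

import numpy as np
from collections import deque

def safemean(xs):
    """Mean that returns NaN instead of warning on an empty list."""
    return np.nan if len(xs) == 0 else float(np.mean(xs))

epinfobuf = deque(maxlen=100)                    # bounded episode-info buffer
epinfobuf.extend([{'r': 10.0, 'l': 200}, {'r': 12.0, 'l': 180}])

print(safemean([epinfo['r'] for epinfo in epinfobuf]))  # 11.0
print(safemean([epinfo['l'] for epinfo in epinfobuf]))  # 190.0
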