Example #1
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=100,
              tb_log_name="A2C"):
        with SetVerbosity(self.verbose), TensorboardWriter(
                self.graph, self.tensorboard_log, tb_log_name) as writer:
            self._setup_learn(seed)

            self.learning_rate_schedule = Scheduler(
                initial_value=self.learning_rate,
                n_values=total_timesteps,
                schedule=self.lr_schedule)

            runner = A2CRunner(self.env,
                               self,
                               n_steps=self.n_steps,
                               gamma=self.gamma)
            self.episode_reward = np.zeros((self.n_envs, ))

            t_start = time.time()
            for update in range(1, total_timesteps // self.n_batch + 1):
                # true_reward is the reward without discount
                obs, states, rewards, masks, actions, values, true_reward = runner.run(
                )
                _, value_loss, policy_entropy = self._train_step(
                    obs, states, rewards, masks, actions, values, update,
                    writer)
                n_seconds = time.time() - t_start
                fps = int((update * self.n_batch) / n_seconds)

                if writer is not None:
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward,
                        true_reward.reshape((self.n_envs, self.n_steps)),
                        masks.reshape((self.n_envs, self.n_steps)), writer,
                        update * (self.n_batch + 1))

                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

                if self.verbose >= 1 and (update % log_interval == 0
                                          or update == 1):
                    explained_var = explained_variance(values, rewards)
                    logger.record_tabular("nupdates", update)
                    logger.record_tabular("total_timesteps",
                                          update * self.n_batch)
                    logger.record_tabular("fps", fps)
                    logger.record_tabular("policy_entropy",
                                          float(policy_entropy))
                    logger.record_tabular("value_loss", float(value_loss))
                    logger.record_tabular("explained_variance",
                                          float(explained_var))
                    logger.dump_tabular()

        return self
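
A quick usage sketch (hedged: it assumes stable-baselines 2.x and a Gym CartPole environment, neither of which appears in the snippet above) illustrating the callback contract this learn() loop relies on: returning False stops training, returning None keeps going.

from stable_baselines import A2C

def stop_after_n_updates(locals_, globals_, max_updates=50):
    # `update` is a local variable of learn(); see the training loop above.
    return locals_.get("update", 0) < max_updates  # becomes False once the budget is spent

model = A2C("MlpPolicy", "CartPole-v1", verbose=1)
model.learn(total_timesteps=25000, callback=stop_after_n_updates, log_interval=10)
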
Example #2
    def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="A2C",
              reset_num_timesteps=True):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn(seed)

            self.learning_rate_schedule = Scheduler(initial_value=self.learning_rate, n_values=total_timesteps,
                                                    schedule=self.lr_schedule)

            runner = A2CRunner(self.env, self, n_steps=self.n_steps, gamma=self.gamma)
            self.episode_reward = np.zeros((self.n_envs,))
            # Training stats (when using Monitor wrapper)
            ep_info_buf = deque(maxlen=100)

            t_start = time.time()
            for update in range(1, total_timesteps // self.n_batch + 1):
                # true_reward is the reward without discount
                obs, states, rewards, masks, actions, values, ep_infos, true_reward = runner.run()
                ep_info_buf.extend(ep_infos)

                _, value_loss, policy_entropy = self._train_step(obs, states, rewards, masks, actions, values,
                                                                 self.num_timesteps // self.n_batch, writer)
                n_seconds = time.time() - t_start
                fps = int((update * self.n_batch) / n_seconds)

                if writer is not None:
                    self.episode_reward = total_episode_reward_logger(self.episode_reward,
                                                                      true_reward.reshape((self.n_envs, self.n_steps)),
                                                                      masks.reshape((self.n_envs, self.n_steps)),
                                                                      writer, self.num_timesteps)

                self.num_timesteps += self.n_batch

                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

                if self.verbose >= 1 and (update % log_interval == 0 or update == 1):
                    explained_var = explained_variance(values, rewards)
                    logger.record_tabular("nupdates", update)
                    logger.record_tabular("total_timesteps", self.num_timesteps)
                    logger.record_tabular("fps", fps)
                    logger.record_tabular("policy_entropy", float(policy_entropy))
                    logger.record_tabular("value_loss", float(value_loss))
                    logger.record_tabular("explained_variance", float(explained_var))
                    if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                        logger.logkv('ep_reward_mean', safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                        logger.logkv('ep_len_mean', safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                    logger.dump_tabular()

        return self
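
A hedged sketch of how `ep_info_buf` above gets populated and how `reset_num_timesteps=False` lets a second learn() call continue the same timestep counter. It assumes stable-baselines 2.x; the environment id and log directory are placeholders.

import gym
from stable_baselines import A2C
from stable_baselines.bench import Monitor
from stable_baselines.common.vec_env import DummyVecEnv

# The Monitor wrapper is what produces the episode info dicts ('r', 'l') consumed above.
env = DummyVecEnv([lambda: Monitor(gym.make("CartPole-v1"), filename=None)])
model = A2C("MlpPolicy", env, verbose=1, tensorboard_log="./a2c_tb/")

model.learn(total_timesteps=10000)                             # first run: fresh TensorBoard log
model.learn(total_timesteps=10000, reset_num_timesteps=False)  # resume the same timestep counter
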
Example #3
  def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="SIL_A2C"):
    with SetVerbosity(self.verbose), \
         TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer:  # type: tf.summary.FileWriter
      self._setup_learn(seed)
      self.save_directory = Path(writer.get_logdir())

      self.learning_rate_schedule = Scheduler(initial_value=self.learning_rate, n_values=total_timesteps,
                                              schedule=self.lr_schedule)

      runner = SuccessorFeatureA2CRunner(self.env, self, n_steps=self.n_steps, gamma=self.gamma)
      self.episode_reward = np.zeros((self.n_envs,))

      t_start = time.time()
      for update in range(1, total_timesteps // self.n_batch + 1):
        # true_reward is the reward without discount
        obs, states, rewards, masks, actions, values, true_reward, raw_rewards, features, reward_bonuses = runner.run()
        _, value_loss, policy_entropy, sf_loss = self._train_step(obs, states, rewards, masks, actions, values, update,
                                                                  writer, features=features,
                                                                  rewards_bonuses=reward_bonuses)
        sil_loss, sil_adv, sil_samples, sil_nlogp = self._train_sil()
        n_seconds = time.time() - t_start
        fps = int((update * self.n_batch) / n_seconds)

        if writer is not None:
          self.episode_reward = total_episode_reward_logger(self.episode_reward,
                                                            raw_rewards.reshape((self.n_envs, self.n_steps)),
                                                            masks.reshape((self.n_envs, self.n_steps)),
                                                            writer, update * (self.n_batch + 1))
          summary = tf.Summary(value=[tf.Summary.Value(
            tag="episode_reward/best_reward", simple_value=self.sil.get_best_reward())])
          writer.add_summary(summary, update * (self.n_batch + 1))

        if callback is not None:
          callback(locals(), globals())

        if self.verbose >= 1 and (update % log_interval == 0 or update == 1):
          explained_var = explained_variance(values, rewards)
          logger.record_tabular("nupdates", update)
          logger.record_tabular("total_timesteps", update * self.n_batch)
          logger.record_tabular("fps", fps)
          logger.record_tabular("policy_entropy", float(policy_entropy))
          logger.record_tabular("value_loss", float(value_loss))
          logger.record_tabular('sf_loss', float(sf_loss))
          logger.record_tabular("explained_variance", float(explained_var))
          logger.record_tabular("best_episode_reward", float(self.sil.get_best_reward()))
          if self.sil_update > 0:
            logger.record_tabular("sil_num_episodes", float(self.sil.num_episodes()))
            logger.record_tabular("sil_valid_samples", float(sil_samples))
            logger.record_tabular("sil_steps", float(self.sil.num_steps()))
          logger.dump_tabular()

    return self
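
A minimal, standalone sketch of what the Scheduler used above is expected to do for a 'linear' schedule: each call returns a learning rate that decays from initial_value towards zero over n_values steps. This is an illustration only, not the stable-baselines implementation; the class name is made up.

class LinearLRScheduler:
    def __init__(self, initial_value, n_values):
        self.initial_value = initial_value
        self.n_values = n_values
        self.step = 0

    def value(self):
        # Fraction of training remaining, clipped at 0.
        frac = max(1.0 - self.step / float(self.n_values), 0.0)
        self.step += 1
        return self.initial_value * frac

sched = LinearLRScheduler(initial_value=7e-4, n_values=1000)
print([round(sched.value(), 6) for _ in range(3)])  # 0.0007, then slightly smaller values
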
Example #4
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=100,
              tb_log_name="DQN"):
        with SetVerbosity(self.verbose), TensorboardWriter(
                self.graph, self.tensorboard_log, tb_log_name) as writer:
            self._setup_learn(seed)

            # Create the replay buffer
            if self.prioritized_replay:
                self.replay_buffer = PrioritizedReplayBuffer(
                    self.buffer_size, alpha=self.prioritized_replay_alpha)
                if self.prioritized_replay_beta_iters is None:
                    prioritized_replay_beta_iters = total_timesteps
                else:
                    prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
                self.beta_schedule = LinearSchedule(
                    prioritized_replay_beta_iters,
                    initial_p=self.prioritized_replay_beta0,
                    final_p=1.0)
            else:
                self.replay_buffer = ReplayBuffer(self.buffer_size)
                self.beta_schedule = None
            # Create the schedule for exploration starting from 1.
            self.exploration = LinearSchedule(
                schedule_timesteps=int(self.exploration_fraction *
                                       total_timesteps),
                initial_p=1.0,
                final_p=self.exploration_final_eps)

            episode_rewards = [0.0]
            obs = self.env.reset()
            reset = True
            self.episode_reward = np.zeros((1, ))

            for step in range(total_timesteps):
                if callback is not None:
                    callback(locals(), globals())
                # Take action and update exploration to the newest value
                kwargs = {}
                if not self.param_noise:
                    update_eps = self.exploration.value(step)
                    update_param_noise_threshold = 0.
                else:
                    update_eps = 0.
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = \
                        -np.log(1. - self.exploration.value(step) +
                                self.exploration.value(step) / float(self.env.action_space.n))
                    kwargs['reset'] = reset
                    kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                    kwargs['update_param_noise_scale'] = True
                with self.sess.as_default():
                    action = self.act(np.array(obs)[None],
                                      update_eps=update_eps,
                                      **kwargs)[0]
                env_action = action
                reset = False
                new_obs, rew, done, _ = self.env.step(env_action)
                # Store transition in the replay buffer.
                self.replay_buffer.add(obs, action, rew, new_obs, float(done))
                obs = new_obs

                if writer is not None:
                    ep_rew = np.array([rew]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward, ep_rew, ep_done, writer, step)

                episode_rewards[-1] += rew
                if done:
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()
                    episode_rewards.append(0.0)
                    reset = True

                if step > self.learning_starts and step % self.train_freq == 0:
                    # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                    if self.prioritized_replay:
                        experience = self.replay_buffer.sample(
                            self.batch_size,
                            beta=self.beta_schedule.value(step))
                        (obses_t, actions, rewards, obses_tp1, dones, weights,
                         batch_idxes) = experience
                    else:
                        obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(
                            self.batch_size)
                        weights, batch_idxes = np.ones_like(rewards), None

                    if writer is not None:
                        # run loss backprop with summary, but once every 100 steps save the metadata
                        # (memory, compute time, ...)
                        if (1 + step) % 100 == 0:
                            run_options = tf.RunOptions(
                                trace_level=tf.RunOptions.FULL_TRACE)
                            run_metadata = tf.RunMetadata()
                            summary, td_errors = self._train_step(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                obses_tp1,
                                dones,
                                weights,
                                sess=self.sess,
                                options=run_options,
                                run_metadata=run_metadata)
                            writer.add_run_metadata(run_metadata,
                                                    'step%d' % step)
                        else:
                            summary, td_errors = self._train_step(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                obses_tp1,
                                dones,
                                weights,
                                sess=self.sess)
                        writer.add_summary(summary, step)
                    else:
                        _, td_errors = self._train_step(obses_t,
                                                        actions,
                                                        rewards,
                                                        obses_tp1,
                                                        obses_tp1,
                                                        dones,
                                                        weights,
                                                        sess=self.sess)

                    if self.prioritized_replay:
                        new_priorities = np.abs(
                            td_errors) + self.prioritized_replay_eps
                        self.replay_buffer.update_priorities(
                            batch_idxes, new_priorities)

                if step > self.learning_starts and step % self.target_network_update_freq == 0:
                    # Update target network periodically.
                    self.update_target(sess=self.sess)

                if len(episode_rewards[-101:-1]) == 0:
                    mean_100ep_reward = -np.inf
                else:
                    mean_100ep_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                if self.verbose >= 1 and done and log_interval is not None and len(
                        episode_rewards) % log_interval == 0:
                    logger.record_tabular("steps", step)
                    logger.record_tabular("episodes", num_episodes)
                    logger.record_tabular("mean 100 episode reward",
                                          mean_100ep_reward)
                    logger.record_tabular(
                        "% time spent exploring",
                        int(100 * self.exploration.value(step)))
                    logger.dump_tabular()

        return self
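
A standalone sketch of the epsilon annealing driven by the exploration LinearSchedule above (illustrative only; the helper name is made up): epsilon interpolates linearly from initial_p to final_p over schedule_timesteps and then stays at final_p.

def linear_epsilon(step, schedule_timesteps, initial_p=1.0, final_p=0.02):
    fraction = min(float(step) / schedule_timesteps, 1.0)
    return initial_p + fraction * (final_p - initial_p)

# With exploration_fraction = 0.1 and total_timesteps = 100000, epsilon reaches
# final_p after the first 10000 steps and is flat afterwards.
for step in (0, 5000, 10000, 50000):
    print(step, round(linear_epsilon(step, schedule_timesteps=10000), 3))
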
Example #5
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=100,
              tb_log_name="PPO2_SH"):
        with SetVerbosity(self.verbose), TensorboardWriter(
                self.graph, self.tensorboard_log, tb_log_name) as writer:
            self._setup_learn(seed)

            runner = Runner(env=self.env,
                            model=self,
                            n_steps=self.n_steps,
                            gamma=self.gamma,
                            lam=self.lam,
                            visualize=self.visualize,
                            snapshot_details=self.snapshot_details)
            self.episode_reward = np.zeros((self.n_envs, ))

            ep_info_buf = deque(maxlen=100)
            t_first_start = time.time()

            nupdates = total_timesteps // self.n_batch
            for update in range(nupdates + 1):
                assert self.n_batch % self.nminibatches == 0
                n_batch_train = self.n_batch // self.nminibatches
                t_start = time.time()
                frac = 1.0 - (update / (nupdates + 1))
                lr_now = self.learning_rate(frac)
                cliprangenow = self.cliprange(frac)
                # true_reward is the reward without discount
                obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run(
                )
                ep_info_buf.extend(ep_infos)
                mb_loss_vals = []
                if states is None:  # nonrecurrent version
                    inds = np.arange(self.n_batch)
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(inds)
                        for start in range(0, self.n_batch, n_batch_train):
                            timestep = (
                                (update * self.noptepochs * self.n_batch +
                                 epoch_num * self.n_batch + start) //
                                n_batch_train)
                            end = start + n_batch_train
                            mbinds = inds[start:end]
                            slices = (arr[mbinds]
                                      for arr in (obs, returns, masks, actions,
                                                  values, neglogpacs))
                            mb_loss_vals.append(
                                self._train_step(lr_now,
                                                 cliprangenow,
                                                 *slices,
                                                 writer=writer,
                                                 update=timestep))
                else:  # recurrent version
                    assert self.n_envs % self.nminibatches == 0
                    envinds = np.arange(self.n_envs)
                    flatinds = np.arange(self.n_envs * self.n_steps).reshape(
                        self.n_envs, self.n_steps)
                    envsperbatch = n_batch_train // self.n_steps
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(envinds)
                        for start in range(0, self.n_envs, envsperbatch):
                            timestep = (
                                (update * self.noptepochs * self.n_envs +
                                 epoch_num * self.n_envs + start) //
                                envsperbatch)
                            end = start + envsperbatch
                            mb_env_inds = envinds[start:end]
                            mb_flat_inds = flatinds[mb_env_inds].ravel()
                            slices = (arr[mb_flat_inds]
                                      for arr in (obs, returns, masks, actions,
                                                  values, neglogpacs))
                            mb_states = states[mb_env_inds]
                            mb_loss_vals.append(
                                self._train_step(lr_now,
                                                 cliprangenow,
                                                 *slices,
                                                 update=timestep,
                                                 writer=writer,
                                                 states=mb_states))

                loss_vals = np.mean(mb_loss_vals, axis=0)
                t_now = time.time()
                fps = int(self.n_batch / (t_now - t_start))

                if writer is not None:
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward,
                        true_reward.reshape((self.n_envs, self.n_steps)),
                        masks.reshape((self.n_envs, self.n_steps)), writer,
                        update * (self.n_batch + 1))

                all_env_episode_rewards = calculate_total_episode_reward(
                    self.episode_reward,
                    true_reward.reshape((self.n_envs, self.n_steps)),
                    masks.reshape((self.n_envs, self.n_steps)))
                average_episode_reward = safe_mean(all_env_episode_rewards)
                ep_info = {'r': average_episode_reward, 'l': np.nan}
                ep_info_buf.append(ep_info)

                if callback is not None:
                    callback(locals(), globals())

                if self.verbose >= 1 and (
                        (update + 1) % log_interval == 0 or update == 0):
                    explained_var = explained_variance(values, returns)
                    logger.logkv("serial_timesteps",
                                 (update + 1) * self.n_steps)
                    logger.logkv("nupdates", (update + 1))
                    logger.logkv("total_timesteps",
                                 (update + 1) * self.n_batch)
                    logger.logkv("fps", fps)
                    logger.logkv("explained_variance", float(explained_var))
                    logger.logkv(
                        'ep_rewmean',
                        safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                    logger.logkv(
                        'eplenmean',
                        safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv('time_elapsed', t_start - t_first_start)
                    for (loss_val, loss_name) in zip(loss_vals,
                                                     self.loss_names):
                        logger.logkv(loss_name, loss_val)
                    logger.dumpkvs()

            return self
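
An illustrative, standalone sketch of the minibatch scheme used in the non-recurrent branch above: the flat batch of n_batch transitions is shuffled once per epoch and split into nminibatches contiguous slices of the index array. The toy sizes below are arbitrary.

import numpy as np

n_batch, nminibatches, noptepochs = 8, 4, 2
n_batch_train = n_batch // nminibatches
data = np.arange(n_batch) * 10  # stand-in for obs/returns/actions/...

inds = np.arange(n_batch)
for epoch in range(noptepochs):
    np.random.shuffle(inds)
    for start in range(0, n_batch, n_batch_train):
        mbinds = inds[start:start + n_batch_train]
        minibatch = data[mbinds]  # this slice would feed one _train_step call
        print(epoch, mbinds, minibatch)
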
Example #6
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=100,
              tb_log_name="TRPO",
              reset_num_timesteps=True):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn(seed)

            with self.sess.as_default():
                seg_gen = traj_segment_generator(
                    self.policy_pi,
                    self.env,
                    self.timesteps_per_batch,
                    reward_giver=self.reward_giver,
                    gail=self.using_gail)

                episodes_so_far = 0
                timesteps_so_far = 0
                iters_so_far = 0
                t_start = time.time()
                len_buffer = deque(
                    maxlen=40)  # rolling buffer for episode lengths
                reward_buffer = deque(
                    maxlen=40)  # rolling buffer for episode rewards
                self.episode_reward = np.zeros((self.n_envs, ))

                true_reward_buffer = None
                if self.using_gail:
                    true_reward_buffer = deque(maxlen=40)

                    # Initialize dataloader
                    batchsize = self.timesteps_per_batch // self.d_step
                    self.expert_dataset.init_dataloader(batchsize)

                    #  Stats not used for now
                    # TODO: replace with normal tb logging
                    #  g_loss_stats = Stats(loss_names)
                    #  d_loss_stats = Stats(reward_giver.loss_name)
                    #  ep_stats = Stats(["True_rewards", "Rewards", "Episode_length"])

                while True:
                    if callback is not None:
                        # Only stop training if return value is False, not when it is None. This is for backwards
                        # compatibility with callbacks that have no return statement.
                        if callback(locals(), globals()) is False:
                            break
                    if total_timesteps and timesteps_so_far >= total_timesteps:
                        break

                    logger.log("********** Iteration %i ************" %
                               iters_so_far)

                    def fisher_vector_product(vec):
                        return self.allmean(
                            self.compute_fvp(
                                vec, *fvpargs,
                                sess=self.sess)) + self.cg_damping * vec

                    # ------------------ Update G ------------------
                    logger.log("Optimizing Policy...")
                    # g_step = 1 when not using GAIL
                    mean_losses = None
                    vpredbefore = None
                    tdlamret = None
                    observation = None
                    action = None
                    seg = None
                    for k in range(self.g_step):
                        with self.timed("sampling"):
                            seg = seg_gen.__next__()
                        add_vtarg_and_adv(seg, self.gamma, self.lam)
                        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
                        observation, action = seg["ob"], seg["ac"]
                        atarg, tdlamret = seg["adv"], seg["tdlamret"]
                        # predicted value function before the update
                        vpredbefore = seg["vpred"]
                        # standardized advantage function estimate
                        atarg = (atarg - atarg.mean()) / atarg.std()

                        # true_rew is the reward without discount
                        if writer is not None:
                            self.episode_reward = total_episode_reward_logger(
                                self.episode_reward, seg["true_rew"].reshape(
                                    (self.n_envs, -1)), seg["dones"].reshape(
                                        (self.n_envs, -1)), writer,
                                self.num_timesteps)

                        args = seg["ob"], seg["ob"], seg["ac"], atarg
                        fvpargs = [arr[::5] for arr in args]

                        self.assign_old_eq_new(sess=self.sess)

                        with self.timed("computegrad"):
                            steps = self.num_timesteps + (k + 1) * (
                                seg["total_timestep"] / self.g_step)
                            run_options = tf.RunOptions(
                                trace_level=tf.RunOptions.FULL_TRACE)
                            run_metadata = tf.RunMetadata(
                            ) if self.full_tensorboard_log else None
                            # run loss backprop with summary, and save the metadata (memory, compute time, ...)
                            if writer is not None:
                                summary, grad, *lossbefore = self.compute_lossandgrad(
                                    *args,
                                    tdlamret,
                                    sess=self.sess,
                                    options=run_options,
                                    run_metadata=run_metadata)
                                if self.full_tensorboard_log:
                                    writer.add_run_metadata(
                                        run_metadata, 'step%d' % steps)
                                writer.add_summary(summary, steps)
                            else:
                                _, grad, *lossbefore = self.compute_lossandgrad(
                                    *args,
                                    tdlamret,
                                    sess=self.sess,
                                    options=run_options,
                                    run_metadata=run_metadata)

                        lossbefore = self.allmean(np.array(lossbefore))
                        grad = self.allmean(grad)
                        if np.allclose(grad, 0):
                            logger.log("Got zero gradient. not updating")
                        else:
                            with self.timed("conjugate_gradient"):
                                stepdir = conjugate_gradient(
                                    fisher_vector_product,
                                    grad,
                                    cg_iters=self.cg_iters,
                                    verbose=self.rank == 0
                                    and self.verbose >= 1)
                            assert np.isfinite(stepdir).all()
                            shs = .5 * stepdir.dot(
                                fisher_vector_product(stepdir))
                            # abs(shs) to avoid taking square root of negative values
                            lagrange_multiplier = np.sqrt(
                                abs(shs) / self.max_kl)
                            # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
                            fullstep = stepdir / lagrange_multiplier
                            expectedimprove = grad.dot(fullstep)
                            surrbefore = lossbefore[0]
                            stepsize = 1.0
                            thbefore = self.get_flat()
                            thnew = None
                            for _ in range(10):
                                thnew = thbefore + fullstep * stepsize
                                self.set_from_flat(thnew)
                                mean_losses = surr, kl_loss, *_ = self.allmean(
                                    np.array(
                                        self.compute_losses(*args,
                                                            sess=self.sess)))
                                improve = surr - surrbefore
                                logger.log("Expected: %.3f Actual: %.3f" %
                                           (expectedimprove, improve))
                                if not np.isfinite(mean_losses).all():
                                    logger.log(
                                        "Got non-finite value of losses -- bad!"
                                    )
                                elif kl_loss > self.max_kl * 1.5:
                                    logger.log(
                                        "violated KL constraint. shrinking step."
                                    )
                                elif improve < 0:
                                    logger.log(
                                        "surrogate didn't improve. shrinking step."
                                    )
                                else:
                                    logger.log("Stepsize OK!")
                                    break
                                stepsize *= .5
                            else:
                                logger.log("couldn't compute a good step")
                                self.set_from_flat(thbefore)
                            if self.nworkers > 1 and iters_so_far % 20 == 0:
                                # list of tuples
                                paramsums = MPI.COMM_WORLD.allgather(
                                    (thnew.sum(), self.vfadam.getflat().sum()))
                                assert all(
                                    np.allclose(ps, paramsums[0])
                                    for ps in paramsums[1:])

                        with self.timed("vf"):
                            for _ in range(self.vf_iters):
                                # NOTE: for recurrent policies, use shuffle=False?
                                for (mbob, mbret) in dataset.iterbatches(
                                    (seg["ob"], seg["tdlamret"]),
                                        include_final_partial_batch=False,
                                        batch_size=128,
                                        shuffle=True):
                                    grad = self.allmean(
                                        self.compute_vflossandgrad(
                                            mbob, mbob, mbret, sess=self.sess))
                                    self.vfadam.update(grad, self.vf_stepsize)

                    for (loss_name, loss_val) in zip(self.loss_names,
                                                     mean_losses):
                        logger.record_tabular(loss_name, loss_val)

                    logger.record_tabular(
                        "explained_variance_tdlam_before",
                        explained_variance(vpredbefore, tdlamret))

                    if self.using_gail:
                        # ------------------ Update D ------------------
                        logger.log("Optimizing Discriminator...")
                        logger.log(fmt_row(13, self.reward_giver.loss_name))
                        assert len(observation) == self.timesteps_per_batch
                        batch_size = self.timesteps_per_batch // self.d_step

                        # NOTE: uses only the last g step for observation
                        # list of tuples, each of which gives the loss for a minibatch
                        d_losses = []
                        # NOTE: for recurrent policies, use shuffle=False?
                        for ob_batch, ac_batch in dataset.iterbatches(
                            (observation, action),
                                include_final_partial_batch=False,
                                batch_size=batch_size,
                                shuffle=True):
                            ob_expert, ac_expert = self.expert_dataset.get_next_batch(
                            )
                            # update running mean/std for reward_giver
                            if self.reward_giver.normalize:
                                self.reward_giver.obs_rms.update(
                                    np.concatenate((ob_batch, ob_expert), 0))

                            # Reshape actions if needed when using discrete actions
                            if isinstance(self.action_space,
                                          gym.spaces.Discrete):
                                if len(ac_batch.shape) == 2:
                                    ac_batch = ac_batch[:, 0]
                                if len(ac_expert.shape) == 2:
                                    ac_expert = ac_expert[:, 0]
                            *newlosses, grad = self.reward_giver.lossandgrad(
                                ob_batch, ac_batch, ob_expert, ac_expert)
                            self.d_adam.update(self.allmean(grad),
                                               self.d_stepsize)
                            d_losses.append(newlosses)
                        logger.log(fmt_row(13, np.mean(d_losses, axis=0)))

                        # lr: lengths and rewards
                        lr_local = (seg["ep_lens"], seg["ep_rets"],
                                    seg["ep_true_rets"])  # local values
                        list_lr_pairs = MPI.COMM_WORLD.allgather(
                            lr_local)  # list of tuples
                        lens, rews, true_rets = map(flatten_lists,
                                                    zip(*list_lr_pairs))
                        true_reward_buffer.extend(true_rets)
                    else:
                        # lr: lengths and rewards
                        lr_local = (seg["ep_lens"], seg["ep_rets"])  # local values
                        list_lr_pairs = MPI.COMM_WORLD.allgather(
                            lr_local)  # list of tuples
                        lens, rews = map(flatten_lists, zip(*list_lr_pairs))
                    len_buffer.extend(lens)
                    reward_buffer.extend(rews)

                    if len(len_buffer) > 0:
                        logger.record_tabular("EpLenMean", np.mean(len_buffer))
                        logger.record_tabular("EpRewMean",
                                              np.mean(reward_buffer))
                    if self.using_gail:
                        logger.record_tabular("EpTrueRewMean",
                                              np.mean(true_reward_buffer))
                    logger.record_tabular("EpThisIter", len(lens))
                    episodes_so_far += len(lens)
                    current_it_timesteps = MPI.COMM_WORLD.allreduce(
                        seg["total_timestep"])
                    timesteps_so_far += current_it_timesteps
                    self.num_timesteps += current_it_timesteps
                    iters_so_far += 1

                    logger.record_tabular("EpisodesSoFar", episodes_so_far)
                    logger.record_tabular("TimestepsSoFar", self.num_timesteps)
                    logger.record_tabular("TimeElapsed", time.time() - t_start)

                    if self.verbose >= 1 and self.rank == 0:
                        logger.dump_tabular()

        return self
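
A compact, standalone sketch of the role conjugate_gradient() plays in the TRPO step above: it solves F x = g for the step direction using only Fisher-vector products, never forming F explicitly. Here a small symmetric positive-definite matrix stands in for the (damped) Fisher matrix; this is an illustration, not the library code.

import numpy as np

def conjugate_gradient(fvp, b, cg_iters=10, residual_tol=1e-10):
    x = np.zeros_like(b)
    r = b.copy()          # residual b - F x (x starts at 0)
    p = b.copy()          # search direction
    rdotr = r.dot(r)
    for _ in range(cg_iters):
        z = fvp(p)
        alpha = rdotr / p.dot(z)
        x += alpha * p
        r -= alpha * z
        new_rdotr = r.dot(r)
        if new_rdotr < residual_tol:
            break
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
    return x

fisher = np.array([[4.0, 1.0], [1.0, 3.0]])       # stand-in for the damped Fisher matrix
grad = np.array([1.0, 2.0])
stepdir = conjugate_gradient(lambda v: fisher.dot(v), grad)
print(stepdir, fisher.dot(stepdir))               # fisher @ stepdir is approximately grad
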
Example #7
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=1,
              tb_log_name="PPO2",
              eval_every_n=5,
              reset_num_timesteps=True,
              record_video=False,
              log_dir=""):
        # Transform to callable if needed
        self.learning_rate = get_schedule_fn(self.learning_rate)
        self.cliprange = get_schedule_fn(self.cliprange)

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn(seed)

            runner = Runner(env=self.env,
                            model=self,
                            n_steps=self.n_steps,
                            gamma=self.gamma,
                            lam=self.lam)
            self.episode_reward = np.zeros((self.n_envs, ))

            ep_info_buf = deque(maxlen=100)
            t_first_start = time.time()

            nupdates = total_timesteps // self.n_batch

            for update in range(1, nupdates + 1):
                if update % eval_every_n == 1:
                    print("[RAISIM_GYM] Visualizing in RaiSimOgre")
                    obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = \
                        runner.run(test_mode=True, record_video=record_video, video_name=log_dir+"/"+str(update-1)+".mp4")
                    print("Average rewards in this test episode ",
                          ep_infos[0]['r'])
                    # tensorboard_log(logger, ep_infos, self.sess)

                assert self.n_batch % self.nminibatches == 0
                batch_size = self.n_batch // self.nminibatches
                t_start = time.time()
                frac = 1.0 - (update - 1.0) / nupdates
                lr_now = self.learning_rate(frac)
                cliprangenow = self.cliprange(frac)
                # true_reward is the reward without discount
                obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run(
                )
                ep_info_buf.extend(ep_infos)
                mb_loss_vals = []
                if states is None:  # nonrecurrent version
                    update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1
                    inds = np.arange(self.n_batch)
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(inds)
                        for start in range(0, self.n_batch, batch_size):
                            timestep = self.num_timesteps // update_fac + (
                                (self.noptepochs * self.n_batch + epoch_num *
                                 self.n_batch + start) // batch_size)
                            end = start + batch_size
                            mbinds = inds[start:end]
                            slices = (arr[mbinds]
                                      for arr in (obs, returns, masks, actions,
                                                  values, neglogpacs))
                            mb_loss_vals.append(
                                self._train_step(lr_now,
                                                 cliprangenow,
                                                 *slices,
                                                 writer=writer,
                                                 update=timestep))
                    self.num_timesteps += (self.n_batch * self.noptepochs
                                           ) // batch_size * update_fac
                else:  # recurrent version
                    update_fac = self.n_batch // self.nminibatches // self.noptepochs // self.n_steps + 1
                    assert self.n_envs % self.nminibatches == 0
                    env_indices = np.arange(self.n_envs)
                    flat_indices = np.arange(self.n_envs *
                                             self.n_steps).reshape(
                                                 self.n_envs, self.n_steps)
                    envs_per_batch = batch_size // self.n_steps
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(env_indices)
                        for start in range(0, self.n_envs, envs_per_batch):
                            timestep = self.num_timesteps // update_fac + (
                                (self.noptepochs * self.n_envs + epoch_num *
                                 self.n_envs + start) // envs_per_batch)
                            end = start + envs_per_batch
                            mb_env_inds = env_indices[start:end]
                            mb_flat_inds = flat_indices[mb_env_inds].ravel()
                            slices = (arr[mb_flat_inds]
                                      for arr in (obs, returns, masks, actions,
                                                  values, neglogpacs))
                            mb_states = states[mb_env_inds]
                            mb_loss_vals.append(
                                self._train_step(lr_now,
                                                 cliprangenow,
                                                 *slices,
                                                 update=timestep,
                                                 writer=writer,
                                                 states=mb_states))
                    self.num_timesteps += (self.n_envs * self.noptepochs
                                           ) // envs_per_batch * update_fac

                loss_vals = np.mean(mb_loss_vals, axis=0)
                t_now = time.time()
                fps = int(self.n_batch / (t_now - t_start))

                if writer is not None:
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward,
                        true_reward.reshape((self.n_envs, self.n_steps)),
                        masks.reshape((self.n_envs, self.n_steps)), writer,
                        self.num_timesteps)

                if self.verbose >= 1 and (update % log_interval == 0
                                          or update == 1):
                    explained_var = explained_variance(values, returns)
                    logger.logkv("serial_timesteps", update * self.n_steps)
                    logger.logkv("nupdates", update)
                    logger.logkv("total_timesteps", self.num_timesteps)
                    logger.logkv("fps", fps)
                    logger.logkv("explained_variance", float(explained_var))
                    if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                        logger.logkv(
                            'ep_reward_mean',
                            safe_mean(
                                [ep_info['r'] for ep_info in ep_info_buf]))
                        logger.logkv(
                            'ep_len_mean',
                            safe_mean(
                                [ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv('time_elapsed', t_start - t_first_start)
                    for (loss_val, loss_name) in zip(loss_vals,
                                                     self.loss_names):
                        logger.logkv(loss_name, loss_val)
                    logger.dumpkvs()

                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

            return self
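
A hedged sketch of the `frac`-based annealing used above: the learning rate and clip range are evaluated at every update with frac = fraction of training remaining. Passing a callable (as sketched below) gives a linearly decaying schedule, while a plain float would stay constant. Illustrative only; the helper name and constants are made up.

def linear_schedule(initial_value):
    # frac goes from 1.0 at the start of training towards 0.0 at the end
    return lambda frac: frac * initial_value

learning_rate = linear_schedule(2.5e-4)
cliprange = linear_schedule(0.2)

nupdates = 100
for update in (1, 50, 100):
    frac = 1.0 - (update - 1.0) / nupdates
    print(update, round(learning_rate(frac), 7), round(cliprange(frac), 4))
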
Example #8
File: dqn.py  Project: tu2id4n/my_pmm
    def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="DQN",
              reset_num_timesteps=True, replay_wrapper=None, save_interval=None, save_path=None):

        print('----------------------------------------------')
        print('|                 L E A R N                  |')
        print('----------------------------------------------')
        print("num timesteps = " + str(int(total_timesteps / 1000)) + 'k')
        print("save_interval = " + str(int(save_interval / 1000)) + 'k')
        print()
        k = 10
        save_interval_st = save_interval

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn()

            # Create the replay buffer
            if self.prioritized_replay:
                self.replay_buffer = PrioritizedReplayBuffer(self.buffer_size, alpha=self.prioritized_replay_alpha)
                if self.prioritized_replay_beta_iters is None:
                    prioritized_replay_beta_iters = total_timesteps
                else:
                    prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
                self.beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                                    initial_p=self.prioritized_replay_beta0,
                                                    final_p=1.0)
            else:
                self.replay_buffer = ReplayBuffer(self.buffer_size)
                self.beta_schedule = None

            if replay_wrapper is not None:
                assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER"
                self.replay_buffer = replay_wrapper(self.replay_buffer)  # upgrade the buffer via the wrapper (e.g. for HER)

            # Create the schedule for exploration starting from 1.
            self.exploration = LinearSchedule(schedule_timesteps=int(self.exploration_fraction * total_timesteps),
                                              initial_p=1.0,
                                              final_p=self.exploration_final_eps)

            episode_rewards = [0.0]
            episode_win_rates = [0.0]
            episode_successes = []
            obs, obs_nf = self.env.reset()
            reset = True
            self.episode_reward = np.zeros((1,))
            self.win_rate = np.zeros((1,))
            # print(obs_nf)
            """
            探索使用prune
            """
            prev2s = [None, None]

            def input_formate(obs):
                return obs.transpose((1, 2, 0))

            for _ in tqdm(range(total_timesteps)):
                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break
                # Take action and update exploration to the newest value
                kwargs = {}
                if not self.param_noise:
                    update_eps = self.exploration.value(self.num_timesteps)
                    update_param_noise_threshold = 0.
                else:
                    update_eps = 0.
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = \
                        -np.log(1. - self.exploration.value(self.num_timesteps) +
                                self.exploration.value(self.num_timesteps) / float(self.env.action_space.n))
                    kwargs['reset'] = reset
                    kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                    kwargs['update_param_noise_scale'] = True
                # tf.summary.scalar('update_eps', update_eps)
                with self.sess.as_default():
                    # Never explore via epsilon here (originally update_eps=update_eps)
                    action = self.act(np.array(input_formate(obs))[None], update_eps=-1, **kwargs)[0]
                    filter_action = random.randint(0, 5)
                    if type(obs_nf) == tuple:
                        obs_nf = obs_nf[0]
                    filter_action = feature_utils.get_modify_act(obs_nf, filter_action, prev2s, nokick=True)
                    filter_action = feature_utils.get_act_abs(obs_nf, filter_action, rang=8)
                    # Estimate the empirical distribution of filter_actions over 100 samples
                    fil_acts = []
                    for _ in range(100):
                        rand_act = random.randint(0, 5)
                        fil_act = feature_utils.get_modify_act(obs_nf, rand_act, prev2s, nokick=True)
                        fil_act = feature_utils.get_act_abs(obs_nf, fil_act, rang=8)
                        fil_acts.append(fil_act)
                    # print('fil', fil_acts)
                    # print()
                    fil_acts = np.eye(65)[fil_acts]
                    # print('eye', fil_acts)
                    # print()
                    fil_acts = fil_acts.sum(axis=0)
                    # print('sum', fil_acts)
                    # print()

                    if random.random() < update_eps:
                        action = filter_action

                env_action = action
                reset = False
                new_obs, rew, done, info, new_obs_nf = self.env.step(env_action)  # .ntc
                self.replay_buffer.add(input_formate(obs), action, rew, input_formate(new_obs), float(done), fil_acts)

                '''
                    HER
                '''
                self.temp_buffer.append((obs, action, rew, new_obs, float(done), fil_acts))
                if len(self.temp_buffer) >= self.temp_size:
                    for t in range(self.temp_size):
                        s, a, r, s_n, d, fa = self.temp_buffer[t]
                        for k in range(self.k):
                            _s = copy.deepcopy(s)
                            _a = a
                            _r = copy.deepcopy(r)
                            _s_n = copy.deepcopy(s_n)
                            future = np.random.randint(t, self.temp_size)
                            s_f, _a_f, _, _, _, _ = self.temp_buffer[future]
                            g_map = s_f[-2]
                            _s[-1] = g_map
                            # print(_s_n[-2][goal])
                            if (_s_n[-2] == g_map).all() or (
                                    (_s[-2] == _s[-1]).all() and _a_f == a == 64):  # check whether _s reaches the goal via action a
                                # if (_s[-2]) or g == 64:  # i.e. whether the agent stays in place
                                # print('HER')
                                _r = _r + 0.01
                            self.replay_buffer.add(input_formate(_s), a, _r, input_formate(_s_n), d, fa)
                    self.temp_buffer.clear()

                obs = new_obs
                obs_nf = new_obs_nf

                if writer is not None:
                    ep_rew = np.array([rew]).reshape((1, -1))
                    ep_win = np.array([info]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_rew, ep_done, writer,
                                                                      self.num_timesteps)
                    self.win_rate = total_rate_logger(self.win_rate, ep_win, ep_done, writer,
                                                      self.num_timesteps, name='win_rate')

                episode_rewards[-1] += rew
                episode_win_rates[-1] += info
                if done:
                    maybe_is_success = (rew > 0)  # info.get('is_success')  # .ntc
                    if maybe_is_success is not None:
                        episode_successes.append(float(maybe_is_success))
                    if not isinstance(self.env, VecEnv):
                        obs, obs_nf = self.env.reset()
                    episode_rewards.append(0.0)
                    episode_win_rates.append(0.0)
                    reset = True
                    prev2s = [None, None]

                # Do not train if the warmup phase is not over
                # or if there are not enough samples in the replay buffer
                can_sample = self.replay_buffer.can_sample(self.batch_size)
                if can_sample and self.num_timesteps > self.learning_starts \
                        and self.num_timesteps % self.train_freq == 0:
                    # print('Sampling ... ...', self.num_timesteps)
                    # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                    if self.prioritized_replay:
                        experience = self.replay_buffer.sample(self.batch_size,
                                                               beta=self.beta_schedule.value(self.num_timesteps))
                        (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
                    else:
                        obses_t, actions, rewards, obses_tp1, dones, filter_actions = self.replay_buffer.sample(
                            self.batch_size)
                        weights, batch_idxes = np.ones_like(rewards), None
                    # print(rewards.shape)
                    # print(dones.shape)
                    # print(actions.shape)
                    if writer is not None:
                        # run loss backprop with summary, but once every 100 steps save the metadata
                        # (memory, compute time, ...)
                        # print("fils", filter_actions)
                        # print("acts", actions)
                        # print('   Training ... ...')
                        if (1 + self.num_timesteps) % 100 == 0:
                            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                            run_metadata = tf.RunMetadata()
                            summary, td_errors, kl_errors = self._train_step(obses_t, actions, rewards, obses_tp1,
                                                                             obses_tp1,
                                                                             dones, weights, filter_actions,
                                                                             sess=self.sess, options=run_options,
                                                                             run_metadata=run_metadata)
                            writer.add_run_metadata(run_metadata, 'step%d' % self.num_timesteps)
                        else:
                            summary, td_errors, kl_errors = self._train_step(obses_t, actions, rewards, obses_tp1,
                                                                             obses_tp1,
                                                                             dones, weights, filter_actions,
                                                                             sess=self.sess)
                        writer.add_summary(summary, self.num_timesteps)
                    else:
                        _, td_errors, _ = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1,
                                                           dones, weights, filter_actions,
                                                           sess=self.sess)

                    if self.prioritized_replay:
                        new_priorities = np.abs(td_errors) + self.prioritized_replay_eps
                        self.replay_buffer.update_priorities(batch_idxes, new_priorities)

                if can_sample and self.num_timesteps > self.learning_starts and \
                        self.num_timesteps % self.target_network_update_freq == 0:
                    # Update target network periodically.
                    self.update_target(sess=self.sess)

                if len(episode_rewards[-101:-1]) == 0:
                    mean_100ep_reward = -np.inf
                else:
                    mean_100ep_reward = round(float(np.mean(episode_rewards[-101:-1])), 1)

                if len(episode_win_rates[-101:-1]) == 0:
                    mean_100ep_win_rate = -np.inf
                else:
                    mean_100ep_win_rate = round(float(np.mean(episode_win_rates[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                if self.verbose >= 1 and done and log_interval is not None and len(episode_rewards) % log_interval == 0:
                    logger.record_tabular("steps", self.num_timesteps)
                    logger.record_tabular("episodes", num_episodes)
                    if len(episode_successes) > 0:
                        logger.logkv("success rate", np.mean(episode_successes[-100:]))
                    logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                    logger.record_tabular("mean 100 win rate", mean_100ep_win_rate)
                    logger.record_tabular("% time spent exploring",
                                          int(100 * self.exploration.value(self.num_timesteps)))
                    logger.dump_tabular()

                # save interval
                if self.num_timesteps >= save_interval_st:
                    save_interval_st += save_interval
                    s_path = save_path + '_' + str(int(self.num_timesteps / 1000)) + 'k.zip'
                    self.save(save_path=s_path)

                self.num_timesteps += 1

        return self
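The temp-buffer block near the top of this example is a hindsight-style relabeling pass: each stored transition is re-inserted with a goal sampled from a later step of the same trajectory, and its reward is adjusted when that goal is reached. A minimal, self-contained sketch of the same idea (relabel_episode, goal_of and reward_fn are illustrative names, not taken from the original code):

import random
from typing import Callable, List, Tuple

# (obs, action, reward, next_obs, done)
Transition = Tuple[object, int, float, object, bool]

def relabel_episode(episode: List[Transition],
                    goal_of: Callable[[object], object],
                    reward_fn: Callable[[object, object], float],
                    k: int = 4) -> List[Transition]:
    """Hindsight relabeling with the 'future' strategy: for each step, sample k
    later states from the same episode and pretend their goals were the target."""
    relabeled = []
    for t, (obs, action, reward, next_obs, done) in enumerate(episode):
        relabeled.append((obs, action, reward, next_obs, done))  # keep the original transition
        for _ in range(k):
            future = random.randint(t, len(episode) - 1)
            new_goal = goal_of(episode[future][3])        # goal achieved later in the episode
            new_reward = reward_fn(next_obs, new_goal)    # recompute reward w.r.t. the new goal
            relabeled.append((obs, action, new_reward, next_obs, done))
    return relabeled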
예제 #9
0
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=100,
              tb_log_name="ACKTR",
              reset_num_timesteps=True):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn(seed)
            self.n_batch = self.n_envs * self.n_steps

            self.learning_rate_schedule = Scheduler(
                initial_value=self.learning_rate,
                n_values=total_timesteps,
                schedule=self.lr_schedule)

            # The FIFO queue of the q_runner thread is closed at the end of the learn function.
            # As a result, it needs to be redefined at every call
            with self.graph.as_default():
                with tf.variable_scope(
                        "kfac_apply",
                        reuse=self.trained,
                        custom_getter=tf_util.outer_scope_getter(
                            "kfac_apply")):
                    # Some of the variables are not in a scope when they are created,
                    # so we make a note of any previously uninitialized variables
                    tf_vars = tf.global_variables()
                    is_uninitialized = self.sess.run(
                        [tf.is_variable_initialized(var) for var in tf_vars])
                    old_uninitialized_vars = [
                        v for (v, f) in zip(tf_vars, is_uninitialized) if not f
                    ]

                    self.train_op, self.q_runner = self.optim.apply_gradients(
                        list(zip(self.grads_check, self.params)))

                    # then we check for new uninitialized variables and initialize them
                    tf_vars = tf.global_variables()
                    is_uninitialized = self.sess.run(
                        [tf.is_variable_initialized(var) for var in tf_vars])
                    new_uninitialized_vars = [
                        v for (v, f) in zip(tf_vars, is_uninitialized)
                        if not f and v not in old_uninitialized_vars
                    ]

                    if len(new_uninitialized_vars) != 0:
                        self.sess.run(
                            tf.variables_initializer(new_uninitialized_vars))

            self.trained = True

            # Use GAE
            if self.gae_lambda is not None:
                runner = PPO2Runner(env=self.env,
                                    model=self,
                                    n_steps=self.n_steps,
                                    gamma=self.gamma,
                                    lam=self.gae_lambda)
            else:
                runner = A2CRunner(self.env,
                                   self,
                                   n_steps=self.n_steps,
                                   gamma=self.gamma)

            self.episode_reward = np.zeros((self.n_envs, ))

            t_start = time.time()
            coord = tf.train.Coordinator()
            if self.q_runner is not None:
                enqueue_threads = self.q_runner.create_threads(self.sess,
                                                               coord=coord,
                                                               start=True)
            else:
                enqueue_threads = []

            # Training stats (when using Monitor wrapper)
            ep_info_buf = deque(maxlen=100)

            for update in range(1, total_timesteps // self.n_batch + 1):
                # true_reward is the reward without discount
                if isinstance(runner, PPO2Runner):
                    # We are using GAE
                    obs, returns, masks, actions, values, _, states, ep_infos, true_reward = runner.run(
                    )
                else:
                    obs, states, returns, masks, actions, values, ep_infos, true_reward = runner.run(
                    )

                ep_info_buf.extend(ep_infos)
                policy_loss, value_loss, policy_entropy = self._train_step(
                    obs, states, returns, masks, actions, values,
                    self.num_timesteps // (self.n_batch + 1), writer)
                n_seconds = time.time() - t_start
                fps = int((update * self.n_batch) / n_seconds)

                if writer is not None:
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward,
                        true_reward.reshape((self.n_envs, self.n_steps)),
                        masks.reshape((self.n_envs, self.n_steps)), writer,
                        self.num_timesteps)

                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

                if self.verbose >= 1 and (update % log_interval == 0
                                          or update == 1):
                    explained_var = explained_variance(values, returns)
                    logger.record_tabular("nupdates", update)
                    logger.record_tabular("total_timesteps",
                                          self.num_timesteps)
                    logger.record_tabular("fps", fps)
                    logger.record_tabular("policy_entropy",
                                          float(policy_entropy))
                    logger.record_tabular("policy_loss", float(policy_loss))
                    logger.record_tabular("value_loss", float(value_loss))
                    logger.record_tabular("explained_variance",
                                          float(explained_var))
                    if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                        logger.logkv(
                            'ep_reward_mean',
                            safe_mean(
                                [ep_info['r'] for ep_info in ep_info_buf]))
                        logger.logkv(
                            'ep_len_mean',
                            safe_mean(
                                [ep_info['l'] for ep_info in ep_info_buf]))
                    logger.dump_tabular()

                self.num_timesteps += self.n_batch + 1

            coord.request_stop()
            coord.join(enqueue_threads)

        return self
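The kfac_apply block above relies on a common TF 1.x pattern: record which global variables are uninitialized, build the optimizer op (which creates new slot variables), then initialize only the variables that appeared in between. A hedged sketch of that pattern in isolation, assuming the TF 1.x API used above; initialize_new_variables and build_fn are illustrative names:

import tensorflow as tf  # TF 1.x, matching the example above

def initialize_new_variables(sess, build_fn):
    """Run build_fn() (which may create variables, e.g. KFAC optimizer slots)
    and initialize only the variables that were not initialized beforehand."""
    tf_vars = tf.global_variables()
    flags = sess.run([tf.is_variable_initialized(v) for v in tf_vars])
    old_uninit = {v for v, initialized in zip(tf_vars, flags) if not initialized}

    result = build_fn()  # e.g. optim.apply_gradients(list(zip(grads, params)))

    tf_vars = tf.global_variables()
    flags = sess.run([tf.is_variable_initialized(v) for v in tf_vars])
    new_uninit = [v for v, initialized in zip(tf_vars, flags)
                  if not initialized and v not in old_uninit]
    if new_uninit:
        sess.run(tf.variables_initializer(new_uninit))
    return result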
예제 #10
0
    def learn(self,
              total_timesteps,
              env,
              callback=None,
              seed=None,
              log_interval=1,
              tb_log_name="PPO2",
              reset_num_timesteps=True,
              save_file="default"):
        # Transform to callable if needed
        self.learning_rate = get_schedule_fn(self.learning_rate)
        self.cliprange = get_schedule_fn(self.cliprange)

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn(seed)

            runner = OverideRunner(env=self.env,
                                   model=self,
                                   n_steps=self.n_steps,
                                   gamma=self.gamma,
                                   lam=self.lam)
            self.episode_reward = np.zeros((self.n_envs, ))

            ep_info_buf = deque(maxlen=100)
            t_first_start = time.time()

            nupdates = total_timesteps // self.n_batch

            print("No of updates: {}".format(nupdates))
            print("Total timesteps : {}".format(total_timesteps))
            print("Batch size: {}".format(self.n_batch))
            for update in range(1, nupdates + 1):
                assert self.n_batch % self.nminibatches == 0
                batch_size = self.n_batch // self.nminibatches
                t_start = time.time()
                frac = 1.0 - (update - 1.0) / nupdates
                # frac = 1.0
                lr_now = self.learning_rate(frac)
                cliprangenow = self.cliprange(frac)
                # true_reward is the reward without discount
                obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run(
                )
                ep_info_buf.extend(ep_infos)
                mb_loss_vals = []
                if states is None:  # nonrecurrent version
                    update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1
                    inds = np.arange(self.n_batch)
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(inds)
                        for start in range(0, self.n_batch, batch_size):
                            timestep = self.num_timesteps // update_fac + (
                                (self.noptepochs * self.n_batch + epoch_num *
                                 self.n_batch + start) // batch_size)
                            end = start + batch_size
                            mbinds = inds[start:end]
                            slices = (arr[mbinds]
                                      for arr in (obs, returns, masks, actions,
                                                  values, neglogpacs))
                            mb_loss_vals.append(
                                self._train_step(lr_now,
                                                 cliprangenow,
                                                 *slices,
                                                 writer=writer,
                                                 update=timestep))
                    self.num_timesteps += (self.n_batch * self.noptepochs
                                           ) // batch_size * update_fac
                    if (update * self.n_batch) % 8192 == 0:
                        self.save(save_file + str(update * self.n_batch))
                        # plot_policy_and_value_fns(self, update * self.n_batch, save_file.split('ppo2_me')[0] + 'policy_plots/')

                        # total_reward, success_episodes = self.test(env)
                        # env.logger.log_scalar('test/success_episodes', success_episodes, update * self.n_batch)
                        # env.logger.log_scalar('test/total_reward', total_reward, update * self.n_batch)
                        # total_rewards.append(total_reward)
                        # total_successes.append(success_episodes)
                else:  # recurrent version
                    update_fac = self.n_batch // self.nminibatches // self.noptepochs // self.n_steps + 1
                    assert self.n_envs % self.nminibatches == 0
                    env_indices = np.arange(self.n_envs)
                    flat_indices = np.arange(self.n_envs *
                                             self.n_steps).reshape(
                                                 self.n_envs, self.n_steps)
                    envs_per_batch = batch_size // self.n_steps
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(env_indices)
                        for start in range(0, self.n_envs, envs_per_batch):
                            timestep = self.num_timesteps // update_fac + (
                                (self.noptepochs * self.n_envs + epoch_num *
                                 self.n_envs + start) // envs_per_batch)
                            end = start + envs_per_batch
                            mb_env_inds = env_indices[start:end]
                            mb_flat_inds = flat_indices[mb_env_inds].ravel()
                            slices = (arr[mb_flat_inds]
                                      for arr in (obs, returns, masks, actions,
                                                  values, neglogpacs))
                            mb_states = states[mb_env_inds]
                            mb_loss_vals.append(
                                self._train_step(lr_now,
                                                 cliprangenow,
                                                 *slices,
                                                 update=timestep,
                                                 writer=writer,
                                                 states=mb_states))
                    self.num_timesteps += (self.n_envs * self.noptepochs
                                           ) // envs_per_batch * update_fac

                loss_vals = np.mean(mb_loss_vals, axis=0)
                t_now = time.time()
                fps = int(self.n_batch / (t_now - t_start))

                if writer is not None:
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward,
                        true_reward.reshape((self.n_envs, self.n_steps)),
                        masks.reshape((self.n_envs, self.n_steps)), writer,
                        self.num_timesteps)

                if self.verbose >= 1 and (update % log_interval == 0
                                          or update == 1):
                    explained_var = explained_variance(values, returns)
                    logger.logkv("serial_timesteps", update * self.n_steps)
                    logger.logkv("nupdates", update)
                    logger.logkv("total_timesteps", self.num_timesteps)
                    logger.logkv("fps", fps)
                    logger.logkv("explained_variance", float(explained_var))
                    if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                        logger.logkv(
                            'ep_reward_mean',
                            safe_mean(
                                [ep_info['r'] for ep_info in ep_info_buf]))
                        logger.logkv(
                            'ep_len_mean',
                            safe_mean(
                                [ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv('time_elapsed', t_start - t_first_start)
                    for (loss_val, loss_name) in zip(loss_vals,
                                                     self.loss_names):
                        logger.logkv(loss_name, loss_val)
                    logger.dumpkvs()

                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

            return self
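The nonrecurrent branch above shuffles flat batch indices once per epoch and walks over them in contiguous minibatch slices. A small sketch of that slicing logic on its own, assuming NumPy arrays with an equal first dimension; iterate_minibatches is an illustrative helper, not part of the library:

import numpy as np

def iterate_minibatches(n_batch, nminibatches, noptepochs, *arrays, rng=None):
    """Yield shuffled minibatch slices of `arrays` for `noptepochs` epochs,
    mirroring the nonrecurrent PPO update loop above."""
    rng = rng or np.random.default_rng()
    assert n_batch % nminibatches == 0
    batch_size = n_batch // nminibatches
    inds = np.arange(n_batch)
    for _ in range(noptepochs):
        rng.shuffle(inds)
        for start in range(0, n_batch, batch_size):
            mb_inds = inds[start:start + batch_size]
            yield tuple(arr[mb_inds] for arr in arrays)

# usage sketch:
# for obs_mb, ret_mb, act_mb in iterate_minibatches(2048, 4, 4, obs, returns, actions):
#     train_step(obs_mb, ret_mb, act_mb)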
예제 #11
0
File: ppo2_v2.py  Project: tu2id4n/my_pmm
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=1,
              tb_log_name="PPO2",
              reset_num_timesteps=True,
              save_interval=None,
              save_path=None,
              gamma=0.99,
              n_steps=128):
        print('----------------------------------------------')
        print('|                 L E A R N                  |')
        print('----------------------------------------------')

        print("num timesteps = " + str(int(total_timesteps / 1000000)) + 'm')
        # print("num_envs = ", self.num_envs)
        print("save_interval = " + str(int(save_interval / 1000)) + 'k')
        print()
        save_interval_st = save_interval
        self.gamma = gamma
        self.n_steps = n_steps
        # Transform to callable if needed
        self.learning_rate = get_schedule_fn(self.learning_rate)
        self.cliprange = get_schedule_fn(self.cliprange)
        cliprange_vf = get_schedule_fn(self.cliprange_vf)

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn()  # note: the seed argument was dropped here

            runner = Runner(env=self.env,
                            model=self,
                            n_steps=self.n_steps,
                            gamma=self.gamma,
                            lam=self.lam)
            hindsight_buffer = HindSightBuffer(self.n_steps, self.gamma,
                                               self.lam)
            self.episode_reward = np.zeros((self.n_envs, ))
            self.win_rate = np.zeros((self.n_envs, ))
            self.tie_rate = np.zeros((self.n_envs, ))
            self.loss_rate = np.zeros((self.n_envs, ))

            # ep_info_buf = deque(maxlen=100)
            t_first_start = time.time()

            n_updates = total_timesteps // self.n_batch  # self.n_batch = self.n_envs(8) * self.n_steps(128)
            for update in range(1, n_updates + 1):
                assert self.n_batch % self.nminibatches == 0  # self.nminibatches == 4
                batch_size = self.n_batch // self.nminibatches
                t_start = time.time()
                frac = 1.0 - (update - 1.0) / n_updates
                lr_now = self.learning_rate(frac)
                cliprange_now = self.cliprange(frac)
                cliprange_vf_now = cliprange_vf(frac)
                # true_reward is the reward without discount
                obs, returns, masks, actions, values, neglogpacs, states, true_reward, \
                win_rates, tie_rates, loss_rates, obs_nf = runner.run()
                self.num_timesteps += self.n_batch
                # ep_info_buf.extend(ep_infos)
                mb_loss_vals = []
                if states is None:  # nonrecurrent version
                    update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1
                    inds = np.arange(self.n_batch)
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(inds)
                        for start in range(0, self.n_batch, batch_size):
                            timestep = self.num_timesteps // update_fac + (
                                (self.noptepochs * self.n_batch + epoch_num *
                                 self.n_batch + start) // batch_size)
                            end = start + batch_size
                            mbinds = inds[start:end]
                            slices = (arr[mbinds]
                                      for arr in (obs, returns, masks, actions,
                                                  values, neglogpacs))
                            mb_loss_vals.append(
                                self._train_step(
                                    lr_now,
                                    cliprange_now,
                                    *slices,
                                    writer=writer,
                                    update=timestep,
                                    cliprange_vf=cliprange_vf_now))
                else:  # recurrent version
                    update_fac = self.n_batch // self.nminibatches // self.noptepochs // self.n_steps + 1
                    assert self.n_envs % self.nminibatches == 0
                    env_indices = np.arange(self.n_envs)
                    flat_indices = np.arange(self.n_envs *
                                             self.n_steps).reshape(
                                                 self.n_envs, self.n_steps)
                    envs_per_batch = batch_size // self.n_steps
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(env_indices)
                        for start in range(0, self.n_envs, envs_per_batch):
                            timestep = self.num_timesteps // update_fac + (
                                (self.noptepochs * self.n_envs + epoch_num *
                                 self.n_envs + start) // envs_per_batch)
                            end = start + envs_per_batch
                            mb_env_inds = env_indices[start:end]
                            mb_flat_inds = flat_indices[mb_env_inds].ravel()
                            slices = (arr[mb_flat_inds]
                                      for arr in (obs, returns, masks, actions,
                                                  values, neglogpacs))
                            mb_states = states[mb_env_inds]
                            mb_loss_vals.append(
                                self._train_step(
                                    lr_now,
                                    cliprange_now,
                                    *slices,
                                    update=timestep,
                                    writer=writer,
                                    states=mb_states,
                                    cliprange_vf=cliprange_vf_now))

                loss_vals = np.mean(mb_loss_vals, axis=0)
                t_now = time.time()
                fps = int(self.n_batch / (t_now - t_start))

                if writer is not None:
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward,
                        true_reward.reshape((self.n_envs, 2 * self.n_steps)),
                        masks.reshape((self.n_envs, 2 * self.n_steps)), writer,
                        self.num_timesteps)
                    self.win_rate = total_rate_logger(
                        self.win_rate,
                        win_rates.reshape((self.n_envs, self.n_steps)),
                        masks[:5120].reshape((self.n_envs, self.n_steps)),
                        writer,
                        self.num_timesteps,
                        name='win_rate')
                    self.tie_rate = total_rate_logger(
                        self.tie_rate,
                        tie_rates.reshape((self.n_envs, self.n_steps)),
                        masks[:5120].reshape((self.n_envs, self.n_steps)),
                        writer,
                        self.num_timesteps,
                        name='tie_rate')
                    self.loss_rate = total_rate_logger(
                        self.loss_rate,
                        loss_rates.reshape((self.n_envs, self.n_steps)),
                        masks[:5120].reshape((self.n_envs, self.n_steps)),
                        writer,
                        self.num_timesteps,
                        name='loss_rate')

                if self.verbose >= 1 and (update % log_interval == 0
                                          or update == 1):
                    explained_var = explained_variance(values, returns)
                    logger.logkv("serial_timesteps", update * self.n_steps)
                    logger.logkv("n_updates", update)
                    logger.logkv("total_timesteps", self.num_timesteps)
                    logger.logkv("fps", fps)
                    logger.logkv("explained_variance", float(explained_var))
                    # if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                    #     logger.logkv('ep_reward_mean', safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                    #     logger.logkv('ep_len_mean', safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv('time_elapsed', t_start - t_first_start)
                    for (loss_val, loss_name) in zip(loss_vals,
                                                     self.loss_names):
                        logger.logkv(loss_name, loss_val)
                    logger.dumpkvs()

                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

                # save interval
                if self.num_timesteps >= save_interval_st:
                    save_interval_st += save_interval
                    s_path = save_path + '_' + str(
                        int(self.num_timesteps / 1000)) + 'k.zip'
                    self.save(save_path=s_path)

            return self
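The "save interval" block above checkpoints the model whenever num_timesteps crosses a moving threshold. A sketch of the same bookkeeping as a standalone helper (maybe_checkpoint is an illustrative name; dividing by 1000 assumes the 'k' suffix means thousands of steps, as in the earlier example):

def maybe_checkpoint(model, num_timesteps, next_save_at, save_interval, save_path):
    """Save a checkpoint once num_timesteps reaches the next threshold and
    return the updated threshold; otherwise return it unchanged."""
    if save_interval is None or num_timesteps < next_save_at:
        return next_save_at
    s_path = "{}_{}k.zip".format(save_path, int(num_timesteps / 1000))
    model.save(save_path=s_path)
    return next_save_at + save_interval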
예제 #12
0
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=1,
              tb_log_name="SAC",
              print_freq=100):

        with TensorboardWriter(self.graph, self.tensorboard_log,
                               tb_log_name) as writer:

            self._setup_learn(seed)

            # Transform to callable if needed
            self.learning_rate = get_schedule_fn(self.learning_rate)

            start_time = time.time()
            episode_rewards = [0.0]
            is_teleop_env = hasattr(self.env, "wait_for_teleop_reset")
            # TeleopEnv
            if is_teleop_env:
                print("Waiting for teleop")
                obs = self.env.wait_for_teleop_reset()
                info = {"cte": 0.0}
            else:
                obs = self.env.reset()
                info = {"cte": 0.0}

            self.episode_reward = np.zeros((1, ))
            ep_info_buf = deque(maxlen=100)
            ep_len = 0
            self.n_updates = 0
            infos_values = []
            mb_infos_vals = []

            # ---------------------load the trained NN for safety signal

            tf_obs = tf.placeholder(tf.float32, shape=(1, 104))

            hidden1 = tf.layers.dense(tf_obs, 64, tf.nn.relu)
            hidden2 = tf.layers.dense(hidden1, 16, tf.nn.relu)
            output1 = tf.layers.dense(hidden2, 2)

            hidden3 = tf.layers.dense(tf_obs, 64, tf.nn.relu)
            hidden4 = tf.layers.dense(hidden3, 16, tf.nn.relu)
            output2 = tf.layers.dense(hidden4, 3)

            sess = tf.Session()
            saver = tf.train.Saver()
            saver.restore(sess,
                          "./saved_params/param03-level1-quad/safe_layer")

            # --------------------------------------------------------

            fr = open("dump_reward.txt", "w")
            fv = open("dump_violation.txt", "w")
            cum_reward = []
            num_vio = 0

            for step in range(total_timesteps):
                # Compute current learning_rate
                frac = 1.0 - step / total_timesteps
                current_lr = self.learning_rate(frac)

                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

                # Before training starts, randomly sample actions
                # from a uniform distribution for better exploration.
                # Afterwards, use the learned policy.
                if step < self.learning_starts:
                    action = self.env.action_space.sample()
                    # No need to rescale when sampling random action
                    rescaled_action = action
                else:
                    action = self.policy_tf.step(
                        obs[None], deterministic=False).flatten()
                    # Rescale from [-1, 1] to the correct bounds
                    rescaled_action = action * np.abs(self.action_space.low)

                assert action.shape == self.env.action_space.shape

                # ---------- use trained NN to revise the action
                if action[1] < 0:
                    action[1] *= -1
                print("h1, action ", action)
                proposed_action = action.copy()
                action_take = action.copy()
                proposed_action = np.asarray(proposed_action).reshape((1, 2))
                #print ("h2, proposed_action", proposed_action)

                #print("obs shape", obs.shape)
                v1 = sess.run(output1, {tf_obs: obs.reshape((1, 104))})

                v2 = sess.run(output2, {tf_obs: obs.reshape((1, 104))})
                q = [v2[0][0], 0.5 * v2[0][1], 0.5 * v2[0][1], v2[0][2]]
                q = np.reshape(q, (2, 2))

                x = cvx.Variable(1, 2)
                obj = cvx.sum_squares(x - proposed_action)
                cons = [info["cte"] + v1 * x.T + x * q * x.T <= 4.8, x[1] > 0]
                prob = cvx.Problem(cvx.Minimize(obj), cons)

                try:
                    qcqp = QCQP(prob)
                    qcqp.suggest(SDR)
                    f_cd, v_cd = qcqp.improve(COORD_DESCENT)
                    print(
                        "Coordinate descent: objective %.3f, violation %.3f" %
                        (f_cd, v_cd))

                    if v_cd == 0:
                        new_action = x.value
                        new_action = np.asarray(new_action).reshape((1, 2))
                        print("h5, action ", new_action)
                        action_take[0] = new_action[0][0]
                        action_take[1] = new_action[0][1]
                        new_obs, reward, done, new_info = self.env.step(
                            action_take)
                        action = action_take
                    else:
                        new_obs, reward, done, new_info = self.env.step(action)

                except:
                    new_obs, reward, done, new_info = self.env.step(action)
                # -----------------------------------------

                ep_len += 1

                if (len(cum_reward) == 10):
                    cum_reward.pop(0)
                cum_reward.append(reward)
                curr = 0.0
                for i in range(len(cum_reward)):
                    idx = len(cum_reward) - i - 1
                    curr += cum_reward[idx] * (0.99**i)
                fr.write("%f \n" % (curr))
                fv.write("%d \n" % (num_vio))

                if print_freq > 0 and ep_len % print_freq == 0 and ep_len > 0:
                    print("{} steps".format(ep_len))

                # Store transition in the replay buffer.
                self.replay_buffer.add(obs, action, reward, new_obs,
                                       float(done))
                obs = new_obs
                info = new_info

                # Retrieve reward and episode length if using Monitor wrapper
                maybe_ep_info = info.get('episode')
                if maybe_ep_info is not None:
                    ep_info_buf.extend([maybe_ep_info])

                if writer is not None:
                    # Write reward per episode to tensorboard
                    ep_reward = np.array([reward]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward, ep_reward, ep_done, writer, step)

                if ep_len > self.train_freq:
                    print("Additional training")
                    self.env.reset()
                    mb_infos_vals = self.optimize(step, writer, current_lr)
                    done = True

                episode_rewards[-1] += reward
                if done:
                    num_vio += 1
                    if not (isinstance(self.env, VecEnv) or is_teleop_env):
                        obs = self.env.reset()

                    print("Episode finished. Reward: {:.2f} {} Steps".format(
                        episode_rewards[-1], ep_len))
                    episode_rewards.append(0.0)
                    ep_len = 0
                    mb_infos_vals = self.optimize(step, writer, current_lr)

                    # Refresh obs when using TeleopEnv
                    if is_teleop_env:
                        print("Waiting for teleop")
                        obs = self.env.wait_for_teleop_reset()

                # Log losses and entropy, useful for monitor training
                if len(mb_infos_vals) > 0:
                    infos_values = np.mean(mb_infos_vals, axis=0)

                if len(episode_rewards[-101:-1]) == 0:
                    mean_reward = -np.inf
                else:
                    mean_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                if self.verbose >= 1 and done and log_interval is not None and len(
                        episode_rewards) % log_interval == 0:
                    fps = int(step / (time.time() - start_time))
                    logger.logkv("episodes", num_episodes)
                    logger.logkv("mean 100 episode reward", mean_reward)
                    logger.logkv(
                        'ep_rewmean',
                        safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                    logger.logkv(
                        'eplenmean',
                        safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv("n_updates", self.n_updates)
                    logger.logkv("current_lr", current_lr)
                    logger.logkv("fps", fps)
                    logger.logkv('time_elapsed',
                                 "{:.2f}".format(time.time() - start_time))
                    if len(infos_values) > 0:
                        for (name, val) in zip(self.infos_names, infos_values):
                            logger.logkv(name, val)
                    logger.logkv("total timesteps", step)
                    logger.dumpkvs()
                    # Reset infos:
                    infos_values = []
            if is_teleop_env:
                self.env.is_training = False
            # Use last batch
            print("Final optimization before saving")
            self.env.reset()
            mb_infos_vals = self.optimize(step, writer, current_lr)
        return self
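The QCQP step above nudges the proposed action toward the nearest point of an estimated safety set before it is executed. A hedged convex sketch of that projection using the cvxpy 1.x API (the original relies on the qcqp package to handle the nonconvex case); project_action is an illustrative name, v1 is assumed to be a length-2 vector and q a symmetric positive semidefinite 2x2 matrix:

import numpy as np
import cvxpy as cp

def project_action(proposed, cte, v1, q, limit=4.8):
    """Project a proposed 2-D action onto the safety set
    cte + v1 @ x + x' Q x <= limit, x[1] >= 0, by minimizing the squared
    distance to the proposal."""
    x = cp.Variable(2)
    objective = cp.Minimize(cp.sum_squares(x - np.asarray(proposed).flatten()))
    constraints = [cte + v1 @ x + cp.quad_form(x, q) <= limit, x[1] >= 0]
    prob = cp.Problem(objective, constraints)
    prob.solve()
    if prob.status in ("optimal", "optimal_inaccurate"):
        return np.asarray(x.value).flatten()
    return np.asarray(proposed).flatten()  # fall back to the unmodified action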
예제 #13
0
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=4,
              tb_log_name="SAC",
              reset_num_timesteps=True,
              replay_wrapper=None):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        if replay_wrapper is not None:
            self.replay_buffer = replay_wrapper(self.replay_buffer)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:

            self._setup_learn(seed)

            # Transform to callable if needed
            self.learning_rate = get_schedule_fn(self.learning_rate)
            # Initial learning rate
            current_lr = self.learning_rate(1)

            start_time = time.time()
            episode_rewards = [0.0]
            episode_successes = []
            if self.action_noise is not None:
                self.action_noise.reset()
            obs = self.env.reset()
            self.episode_reward = np.zeros((1, ))
            ep_info_buf = deque(maxlen=100)
            n_updates = 0
            infos_values = []

            tra_obs = []
            ep_count = 0
            selected_goal = None
            tra_count = 0
            for step in range(total_timesteps):
                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

                # Before training starts, randomly sample actions
                # from a uniform distribution for better exploration.
                # Afterwards, use the learned policy
                # if random_exploration is set to 0 (normal setting)
                if (self.num_timesteps < self.learning_starts
                        or np.random.rand() < self.random_exploration):
                    # No need to rescale when sampling random action
                    rescaled_action = action = self.env.action_space.sample()
                else:
                    action = self.policy_tf.step(
                        obs[None], deterministic=False).flatten()
                    # Add noise to the action (improve exploration,
                    # not needed in general)
                    if self.action_noise is not None:
                        action = np.clip(action + self.action_noise(), -1, 1)
                    # Rescale from [-1, 1] to the correct bounds
                    rescaled_action = action * np.abs(self.action_space.low)

                assert action.shape == self.env.action_space.shape
                new_obs, reward, done, info = self.env.step(rescaled_action)

                #################################################################
                # fit density model and update goal proposing model
                skew_explore_obs = obs.copy()
                if isinstance(self.env, HERGoalEnvWrapper):
                    skew_explore_obs_dict = self.env.convert_obs_to_dict(
                        skew_explore_obs)
                    skew_explore_obs = np.array(
                        [skew_explore_obs_dict['observation']])
                    tra_obs.append(skew_explore_obs[0])
                    if selected_goal is None:
                        selected_goal = np.array(
                            skew_explore_obs_dict['desired_goal'])
                else:
                    tra_obs.append(skew_explore_obs)

                self.skew_explore.update_history(skew_explore_obs, [done])
                if (step % self.goal_update_frequency == 0
                        and step != 0) or step == 2000:
                    logging.info('update buffer')
                    self.skew_explore.activate_buffer()
                #################################################################

                # Store transition in the replay buffer.
                self.replay_buffer.add(obs, action, reward, new_obs,
                                       float(done))
                obs = new_obs

                # Retrieve reward and episode length if using Monitor wrapper
                maybe_ep_info = info.get('episode')
                if maybe_ep_info is not None:
                    ep_info_buf.extend([maybe_ep_info])

                if writer is not None:
                    # Write reward per episode to tensorboard
                    ep_reward = np.array([reward]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward, ep_reward, ep_done, writer,
                        self.num_timesteps)

                if step % self.train_freq == 0:
                    mb_infos_vals = []
                    # Update policy, critics and target networks
                    for grad_step in range(self.gradient_steps):
                        if self.num_timesteps < self.batch_size or self.num_timesteps < self.learning_starts:
                            break
                        n_updates += 1
                        # Compute current learning_rate
                        frac = 1.0 - step / total_timesteps
                        current_lr = self.learning_rate(frac)
                        # Update policy and critics (q functions)
                        mb_infos_vals.append(
                            self._train_step(step, writer, current_lr))
                        # Update target network
                        if (step +
                                grad_step) % self.target_update_interval == 0:
                            # Update target network
                            self.sess.run(self.target_update_op)
                    # Log losses and entropy, useful for monitor training
                    if len(mb_infos_vals) > 0:
                        infos_values = np.mean(mb_infos_vals, axis=0)

                episode_rewards[-1] += reward
                if done:
                    self.plot_tra(tra_count, tra_obs, selected_goal)
                    tra_obs = []
                    selected_goal = None

                    if self.action_noise is not None:
                        self.action_noise.reset()
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()

                    ep_count += 1
                    episode_rewards.append(0.0)
                    maybe_is_success = info.get('is_success')
                    if maybe_is_success is not None:
                        episode_successes.append(float(maybe_is_success))

                    tra_count += 1
                    self.save(self.args.save_path + '/model')

                if len(episode_rewards[-101:-1]) == 0:
                    mean_reward = -np.inf
                else:
                    mean_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                self.num_timesteps += 1
                # Display training infos
                if self.verbose >= 1 and done and log_interval is not None and len(
                        episode_rewards) % log_interval == 0:
                    fps = int(step / (time.time() - start_time))
                    logger.logkv("episodes", num_episodes)
                    logger.logkv("mean 100 episode reward", mean_reward)
                    if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                        logger.logkv(
                            'ep_rewmean',
                            safe_mean(
                                [ep_info['r'] for ep_info in ep_info_buf]))
                        logger.logkv(
                            'eplenmean',
                            safe_mean(
                                [ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv("n_updates", n_updates)
                    logger.logkv("current_lr", current_lr)
                    logger.logkv("fps", fps)
                    logger.logkv('time_elapsed', int(time.time() - start_time))
                    if len(episode_successes) > 0:
                        logger.logkv("success rate",
                                     np.mean(episode_successes[-100:]))
                    if len(infos_values) > 0:
                        for (name, val) in zip(self.infos_names, infos_values):
                            logger.logkv(name, val)
                    logger.logkv("total timesteps", self.num_timesteps)
                    logger.dumpkvs()
                    # Reset infos:
                    infos_values = []
            return self
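Several of these examples rescale the squashed policy output with action * np.abs(self.action_space.low), which is only correct when the action bounds are symmetric around zero. A small sketch of a bound-agnostic mapping between the tanh range [-1, 1] and an arbitrary Box space (scale_action / unscale_action are illustrative names):

import numpy as np

def scale_action(action, low, high):
    """Map a squashed policy action in [-1, 1] to the environment bounds [low, high]."""
    return low + 0.5 * (np.asarray(action) + 1.0) * (high - low)

def unscale_action(scaled, low, high):
    """Inverse mapping, back to [-1, 1]."""
    return 2.0 * (np.asarray(scaled) - low) / (high - low) - 1.0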
예제 #14
0
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=4,
              tb_log_name="CLAC",
              reset_num_timesteps=True,
              randomization=0):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:

            self._setup_learn()

            # Transform to callable if needed
            self.learning_rate = get_schedule_fn(self.learning_rate)
            # Initial learning rate
            current_lr = self.learning_rate(1)

            start_time = time.time()
            episode_rewards = [0.0]
            learning_results = pd.DataFrame()
            obs = self.env.reset()
            self.episode_reward = np.zeros((1, ))
            ep_info_buf = deque(maxlen=100)
            n_updates = 0
            infos_values = []

            reward_data = pd.DataFrame()

            for step in range(total_timesteps):
                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

                # Before training starts, randomly sample actions
                # from a uniform distribution for better exploration.
                # Afterwards, use the learned policy.
                if self.num_timesteps < self.learning_starts:
                    if (isinstance(self.env.action_space, Discrete)):
                        action = []
                        for _ in range(self.env.action_space.n):
                            action.append(1 / self.env.action_space.n)
                        rescaled_action = self.env.action_space.sample()
                    else:
                        action = self.env.action_space.sample()
                        # No need to rescale when sampling random action
                        rescaled_action = action
                else:
                    if (isinstance(self.env.action_space, Discrete)):
                        actions = list(range(self.env.action_space.n))
                        action = self.policy_tf.step(
                            obs[None], deterministic=False).flatten()
                        rescaled_action = np.random.choice(actions,
                                                           1,
                                                           p=action)[0]
                    else:
                        action = self.policy_tf.step(
                            obs[None], deterministic=False).flatten()
                        # Rescale from [-1, 1] to the correct bounds
                        rescaled_action = action * np.abs(
                            self.action_space.low)

                if (not isinstance(self.env.action_space, Discrete)):
                    assert action.shape == self.env.action_space.shape

                # If coinrunner environment
                # rescaled_action = np.array(rescaled_action, ndmin=1)

                new_obs, reward, done, info = self.env.step(rescaled_action)

                act_mu, act_std = self.policy_tf.proba_step(obs[None])

                if (len(act_std) == 1):
                    act_std = act_std[0]

                #print("ACT MU FROM PROBA STEP", act_mu)
                #print("ACT STD FROM PROBA STEP", act_std)
                if self.num_timesteps > self.learning_starts:
                    # Only update marginal approximation after learning starts is completed
                    if (self.multivariate_mean is None):
                        self.multivariate_mean = act_mu
                    else:
                        previous_mean = self.multivariate_mean
                        self.multivariate_mean = (
                            (1 - self.learning_rate_phi) *
                            self.multivariate_mean) + (self.learning_rate_phi *
                                                       act_mu)
                    if (self.multivariate_cov is None):
                        self.multivariate_cov = np.diag(act_std)
                    else:
                        cov = (self.learning_rate_phi * np.diag(act_std) +
                               (1 - self.learning_rate_phi) *
                               self.multivariate_cov)
                        mom_1 = (self.learning_rate_phi *
                                 np.square(np.diag(act_mu))) + (
                                     (1 - self.learning_rate_phi) *
                                     np.square(np.diag(previous_mean)))
                        mom_2 = np.square((self.learning_rate_phi *
                                           np.diag(act_mu)) +
                                          (1 - self.learning_rate_phi) *
                                          np.diag(previous_mean))
                        self.multivariate_cov = cov + mom_1 - mom_2

                    # Update Beta parameter if coef_schedule is set
                    if (self.coef_schedule is not None
                            and self.mut_inf_coef > 1e-12):
                        # (1 - a) B + a(1/L()) # Loss based update schdule, for later

                        # Currently using linear schedule:
                        self.mut_inf_coef *= (1 - self.coef_schedule)
                    """if(self.num_timesteps % 1000 == 0):
                        print("updated mut_inf_coef: ", self.mut_inf_coef, " at time step ", self.num_timesteps)"""

                # Store transition in the replay buffer.
                #print("adding action to replay buffer: ", action)
                self.replay_buffer.add(obs, action, reward, new_obs,
                                       float(done))
                obs = new_obs

                # Retrieve reward and episode length if using Monitor wrapper
                # info = info[0]
                maybe_ep_info = info.get('episode')
                if maybe_ep_info is not None:
                    ep_info_buf.extend([maybe_ep_info])

                if writer is not None:
                    # Write reward per episode to tensorboard
                    ep_reward = np.array([reward]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward, ep_reward, ep_done, writer,
                        self.num_timesteps)

                if step % self.train_freq == 0:
                    mb_infos_vals = []
                    # Update policy, critics and target networks
                    for grad_step in range(self.gradient_steps):
                        if self.num_timesteps < self.batch_size or self.num_timesteps < self.learning_starts:
                            break
                        n_updates += 1
                        # Compute current learning_rate
                        frac = 1.0 - step / total_timesteps
                        current_lr = self.learning_rate(frac)
                        # Update policy and critics (q functions)
                        mb_infos_vals.append(
                            self._train_step(step, writer, current_lr))
                        # Update target network
                        if (step +
                                grad_step) % self.target_update_interval == 0:
                            # Update target network
                            self.sess.run(self.target_update_op)
                    # Log losses and entropy, useful for monitor training
                    if len(mb_infos_vals) > 0:
                        for mb_info_val in mb_infos_vals:
                            for mb_info in mb_info_val:
                                if mb_info is not None:
                                    infos_values.append(np.mean(mb_info))
                        #infos_values = np.mean(mb_infos_vals, axis=0)

                episode_rewards[-1] += reward
                if done:
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()

                        if (randomization == 1):
                            try:
                                for env in self.env.unwrapped.envs:
                                    env.randomize()
                            except:
                                print(
                                    "Trying to randomize an environment that is not set up for randomization, check environment file"
                                )
                                assert (False)

                        if (randomization == 2):
                            try:
                                for env in self.env.unwrapped.envs:
                                    env.randomize_extreme()
                            except:
                                print(
                                    "Trying to extremely randomize an environment that is not set up for randomization, check environment file"
                                )
                                assert (False)

                    env_name = self.env.unwrapped.envs[0].spec.id

                    # Log the mutual information coefficient ("auto" when it is learned)
                    mut_inf_coef = self.init_mut_inf_coef
                    if isinstance(self.mut_inf_coef, tf.Tensor) or np.isnan(
                            mut_inf_coef):
                        mut_inf_coef = "auto"
                    d = {
                        'Episode Reward': episode_rewards[-1],
                        'Coefficient': mut_inf_coef,
                        'Timestep': self.num_timesteps,
                        'Episode Number': len(episode_rewards) - 1,
                        'Env': env_name,
                        'Randomization': randomization,
                        'Model': "CLAC"
                    }
                    learning_results = pd.concat(
                        [learning_results, pd.DataFrame([d])],
                        ignore_index=True)

                    self.tf_logged_reward = episode_rewards[-1]

                    episode_rewards.append(0.0)

                if len(episode_rewards[-101:-1]) == 0:
                    mean_reward = -np.inf
                else:
                    mean_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                self.num_timesteps += 1
                # Display training infos
                if self.verbose >= 1 and done and log_interval is not None and len(
                        episode_rewards) % log_interval == 0:
                    fps = int(step / (time.time() - start_time))
                    logger.logkv("episodes", num_episodes)
                    logger.logkv("mean 100 episode reward", mean_reward)
                    logger.logkv(
                        'ep_rewmean',
                        safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                    logger.logkv(
                        'eplenmean',
                        safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv("n_updates", n_updates)
                    logger.logkv("current_lr", current_lr)
                    logger.logkv("fps", fps)
                    logger.logkv('time_elapsed', int(time.time() - start_time))
                    if len(infos_values) > 0:
                        for (name, val) in zip(self.infos_names, infos_values):
                            logger.logkv(name, val)
                    logger.logkv("total timesteps", self.num_timesteps)
                    logger.dumpkvs()
                    # Reset infos:
                    infos_values = []
            return (self, learning_results)
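
A minimal standalone sketch of the linear learning-rate annealing used in the training loop above, where the fraction of training remaining scales a constant base rate. The base rate and horizon below are illustrative values, not taken from the example.

def make_linear_schedule(base_lr):
    # Map the remaining-progress fraction (1.0 at the start, ~0.0 at the end)
    # to a learning rate.
    return lambda frac: base_lr * frac

total_timesteps = 10000                 # illustrative horizon
learning_rate = make_linear_schedule(3e-4)
for step in (0, 5000, 9999):
    frac = 1.0 - step / total_timesteps
    print(step, learning_rate(frac))
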
Example #15
0
    def run(self,
            total_timesteps,
            callback=None,
            seed=None,
            log_interval=4,
            tb_log_name="CLAC",
            reset_num_timesteps=True,
            randomization=0):

        start_time = time.time()
        episode_rewards = [0.0]
        learning_results = pd.DataFrame()
        obs = self.env.reset()
        self.episode_reward = np.zeros((1, ))
        ep_info_buf = deque(maxlen=100)
        n_updates = 0
        infos_values = []

        reward_data = pd.DataFrame()
        # No TensorBoard writer is opened in this standalone rollout
        writer = None

        for step in range(total_timesteps):
            if isinstance(self.env.action_space, Discrete):
                actions = list(range(self.env.action_space.n))
                action = self.policy_tf.step(obs[None],
                                             deterministic=False).flatten()
                rescaled_action = np.random.choice(actions, 1, p=action)[0]
            else:
                action = self.policy_tf.step(obs[None],
                                             deterministic=False).flatten()
                # Rescale from [-1, 1] to the correct bounds
                rescaled_action = action * np.abs(self.action_space.low)

            new_obs, reward, done, info = self.env.step(rescaled_action)

            act_mu, act_std = self.policy_tf.proba_step(obs[None])
            obs = new_obs

            # Retrieve reward and episode length if using Monitor wrapper
            maybe_ep_info = info.get('episode')
            if maybe_ep_info is not None:
                ep_info_buf.extend([maybe_ep_info])

            if writer is not None:
                # Write reward per episode to tensorboard
                ep_reward = np.array([reward]).reshape((1, -1))
                ep_done = np.array([done]).reshape((1, -1))
                self.episode_reward = total_episode_reward_logger(
                    self.episode_reward, ep_reward, ep_done, writer,
                    self.num_timesteps)

            episode_rewards[-1] += reward
            if done:
                if not isinstance(self.env, VecEnv):
                    obs = self.env.reset()

                    if randomization == 1:
                        try:
                            for env in self.env.unwrapped.envs:
                                env.randomize()
                        except AttributeError as exc:
                            raise RuntimeError(
                                "Trying to randomize an environment that is not "
                                "set up for randomization, check the environment file"
                            ) from exc

                    if randomization == 2:
                        try:
                            for env in self.env.unwrapped.envs:
                                env.randomize_extreme()
                        except AttributeError as exc:
                            raise RuntimeError(
                                "Trying to extremely randomize an environment that is "
                                "not set up for randomization, check the environment file"
                            ) from exc

                env_name = self.env.unwrapped.envs[0].spec.id

                # Log the mutual information coefficient ("auto" when it is learned)
                mut_inf_coef = self.init_mut_inf_coef
                if isinstance(self.mut_inf_coef, tf.Tensor) or np.isnan(
                        mut_inf_coef):
                    mut_inf_coef = "auto"
                d = {
                    'Episode Reward': episode_rewards[-1],
                    'Coefficient': mut_inf_coef,
                    'Timestep': self.num_timesteps,
                    'Episode Number': len(episode_rewards) - 1,
                    'Env': env_name,
                    'Randomization': randomization,
                    'Model': "CLAC"
                }
                learning_results = pd.concat(
                    [learning_results, pd.DataFrame([d])], ignore_index=True)

                self.tf_logged_reward = episode_rewards[-1]

                episode_rewards.append(0.0)

        return (self, learning_results)
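
A small sketch of the discrete-action branch above: the policy returns a probability vector over the actions and a single index is drawn from it. The probability vector below is made up for illustration.

import numpy as np

def sample_discrete_action(action_probs):
    # Draw one action index according to the policy's probability vector,
    # mirroring the np.random.choice call in the rollout above.
    actions = list(range(len(action_probs)))
    return np.random.choice(actions, 1, p=action_probs)[0]

print(sample_discrete_action(np.array([0.1, 0.2, 0.7])))
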
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=1,
              tb_log_name="SAC",
              print_freq=100):

        with TensorboardWriter(self.graph, self.tensorboard_log,
                               tb_log_name) as writer:

            self._setup_learn(seed)

            # Transform to callable if needed
            self.learning_rate = get_schedule_fn(self.learning_rate)

            start_time = time.time()
            episode_rewards = [0.0]
            is_teleop_env = hasattr(self.env, "wait_for_teleop_reset")
            # TeleopEnv
            if is_teleop_env:
                print("Waiting for teleop")
                obs = self.env.wait_for_teleop_reset()
            else:
                obs = self.env.reset()

            self.episode_reward = np.zeros((1, ))
            ep_info_buf = deque(maxlen=100)
            ep_len = 0
            self.n_updates = 0
            infos_values = []
            mb_infos_vals = []

            for step in range(total_timesteps):
                # Compute current learning_rate
                frac = 1.0 - step / total_timesteps
                current_lr = self.learning_rate(frac)

                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

                # Before training starts, randomly sample actions
                # from a uniform distribution for better exploration.
                # Afterwards, use the learned policy.
                if step < self.learning_starts:
                    action = self.env.action_space.sample()
                    # No need to rescale when sampling random action
                    rescaled_action = action
                else:
                    action = self.policy_tf.step(
                        obs[None], deterministic=False).flatten()
                    # Rescale from [-1, 1] to the correct bounds
                    rescaled_action = action * np.abs(self.action_space.low)

                assert action.shape == self.env.action_space.shape

                new_obs, reward, done, info = self.env.step(rescaled_action)
                ep_len += 1

                if print_freq > 0 and ep_len % print_freq == 0 and ep_len > 0:
                    print("{} steps".format(ep_len))

                # Store transition in the replay buffer.
                self.replay_buffer.add(obs, action, reward, new_obs,
                                       float(done))
                obs = new_obs

                # Retrieve reward and episode length if using Monitor wrapper
                maybe_ep_info = info.get('episode')
                if maybe_ep_info is not None:
                    ep_info_buf.extend([maybe_ep_info])

                if writer is not None:
                    # Write reward per episode to tensorboard
                    ep_reward = np.array([reward]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward, ep_reward, ep_done, writer, step)

                if ep_len > self.train_freq:
                    print("Additional training")
                    self.env.reset()
                    mb_infos_vals = self.optimize(step, writer, current_lr)
                    done = True

                episode_rewards[-1] += reward
                if done:
                    if not (isinstance(self.env, VecEnv) or is_teleop_env):
                        obs = self.env.reset()

                    print("Episode finished. Reward: {:.2f} {} Steps".format(
                        episode_rewards[-1], ep_len))
                    episode_rewards.append(0.0)
                    ep_len = 0
                    mb_infos_vals = self.optimize(step, writer, current_lr)

                    # Refresh obs when using TeleopEnv
                    if is_teleop_env:
                        print("Waiting for teleop")
                        obs = self.env.wait_for_teleop_reset()

                # Log losses and entropy, useful for monitoring training
                if len(mb_infos_vals) > 0:
                    infos_values = np.mean(mb_infos_vals, axis=0)

                if len(episode_rewards[-101:-1]) == 0:
                    mean_reward = -np.inf
                else:
                    mean_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                if self.verbose >= 1 and done and log_interval is not None and len(
                        episode_rewards) % log_interval == 0:
                    fps = int(step / (time.time() - start_time))
                    logger.logkv("episodes", num_episodes)
                    logger.logkv("mean 100 episode reward", mean_reward)
                    logger.logkv(
                        'ep_rewmean',
                        safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                    logger.logkv(
                        'eplenmean',
                        safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv("n_updates", self.n_updates)
                    logger.logkv("current_lr", current_lr)
                    logger.logkv("fps", fps)
                    logger.logkv('time_elapsed',
                                 "{:.2f}".format(time.time() - start_time))
                    if len(infos_values) > 0:
                        for (name, val) in zip(self.infos_names, infos_values):
                            logger.logkv(name, val)
                    logger.logkv("total timesteps", step)
                    logger.dumpkvs()
                    # Reset infos:
                    infos_values = []
            if is_teleop_env:
                self.env.is_training = False
            # Use last batch
            print("Final optimization before saving")
            self.env.reset()
            mb_infos_vals = self.optimize(step, writer, current_lr)
        return self
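
The continuous-action branches above rescale a squashed policy output in [-1, 1] by the magnitude of the action-space bound, which implicitly assumes symmetric bounds (the DDPG example further down asserts exactly that). A minimal sketch of that rescaling, with made-up bounds:

import numpy as np

def rescale_action(action, low, high):
    # Scale a policy output in [-1, 1] to the environment bounds,
    # assuming symmetric bounds (high == -low) as in the snippets above.
    assert np.allclose(np.abs(low), high)
    return action * np.abs(low)

# A policy output of 0.5 on a [-2, 2] action space becomes 1.0
print(rescale_action(np.array([0.5]), np.array([-2.0]), np.array([2.0])))
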
Example #17
0
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=100,
              tb_log_name="DQN",
              reset_num_timesteps=True,
              initial_p=1.0):

        self.actions_weights = []
        self.actions_container = []

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        cnt = 0
        ds_rewards = [[0, 0]]
        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn()

            # Create the replay buffer
            if self.prioritized_replay:
                self.replay_buffer = PrioritizedReplayBuffer(
                    self.buffer_size, alpha=self.prioritized_replay_alpha)
                if self.prioritized_replay_beta_iters is None:
                    prioritized_replay_beta_iters = total_timesteps
                else:
                    prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
                self.beta_schedule = LinearSchedule(
                    prioritized_replay_beta_iters,
                    initial_p=self.prioritized_replay_beta0,
                    final_p=1.0)
            else:
                self.replay_buffer = ReplayBuffer(self.buffer_size)
                self.beta_schedule = None
            # Create the schedule for exploration starting from 1.
            self.exploration = LinearSchedule(
                schedule_timesteps=int(self.exploration_fraction *
                                       total_timesteps),
                initial_p=initial_p,
                final_p=self.exploration_final_eps)

            episode_rewards = [0.0]
            obs = self.env.reset()

            reset = True
            self.episode_reward = np.zeros((1, ))

            for _ in range(total_timesteps):
                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break
                # Take action and update exploration to the newest value
                kwargs = {}
                if not self.param_noise:
                    update_eps = self.exploration.value(self.num_timesteps)
                    update_param_noise_threshold = 0.
                else:
                    update_eps = 0.
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = \
                        -np.log(1. - self.exploration.value(self.num_timesteps) +
                                self.exploration.value(self.num_timesteps) / float(self.env.action_space.n))
                    kwargs['reset'] = reset
                    kwargs[
                        'update_param_noise_threshold'] = update_param_noise_threshold
                    kwargs['update_param_noise_scale'] = True
                # Hierarchical step (start)

                obs, new_obs, rew, action, done, reset = self.hierarchical_step(
                    obs, ds_rewards, cnt, kwargs, update_eps)
                # Hierarchical step (end)

                if writer is not None:
                    ep_rew = np.array([rew]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward, ep_rew, ep_done, writer,
                        self.num_timesteps)

                episode_rewards[-1] += rew
                if done:
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()
                    episode_rewards.append(0.0)
                    reset = True

                if self.num_timesteps > self.learning_starts and self.num_timesteps % self.train_freq == 0:
                    # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                    if self.prioritized_replay:
                        experience = self.replay_buffer.sample(
                            self.batch_size,
                            beta=self.beta_schedule.value(self.num_timesteps))
                        (obses_t, actions, rewards, obses_tp1, dones, weights,
                         batch_idxes) = experience
                    else:
                        obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(
                            self.batch_size)
                        weights, batch_idxes = np.ones_like(rewards), None

                    if writer is not None:
                        # run loss backprop with summary, but once every 100 steps save the metadata
                        # (memory, compute time, ...)
                        if (1 + self.num_timesteps) % 100 == 0:
                            run_options = tf.RunOptions(
                                trace_level=tf.RunOptions.FULL_TRACE)
                            run_metadata = tf.RunMetadata()
                            summary, td_errors = self._train_step(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                obses_tp1,
                                dones,
                                weights,
                                sess=self.sess,
                                options=run_options,
                                run_metadata=run_metadata)
                            writer.add_run_metadata(
                                run_metadata, 'step%d' % self.num_timesteps)
                        else:
                            summary, td_errors = self._train_step(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                obses_tp1,
                                dones,
                                weights,
                                sess=self.sess)
                        writer.add_summary(summary, self.num_timesteps)
                    else:
                        _, td_errors = self._train_step(obses_t,
                                                        actions,
                                                        rewards,
                                                        obses_tp1,
                                                        obses_tp1,
                                                        dones,
                                                        weights,
                                                        sess=self.sess)

                    if self.prioritized_replay:
                        try:
                            new_priorities = np.array([
                                abs(x) for x in td_errors.tolist()
                            ]) + self.prioritized_replay_eps
                            self.replay_buffer.update_priorities(
                                batch_idxes, new_priorities)
                        except AssertionError:
                            print(td_errors)

                if self.num_timesteps > self.learning_starts and \
                        self.num_timesteps % self.target_network_update_freq == 0:
                    # Update target network periodically.
                    self.update_target(sess=self.sess)

                if len(episode_rewards[-101:-1]) == 0:
                    mean_100ep_reward = -np.inf
                else:
                    mean_100ep_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                if self.verbose >= 1 and done and log_interval is not None and len(
                        episode_rewards) % log_interval == 0:
                    logger.record_tabular("steps", self.num_timesteps)
                    logger.record_tabular("episodes", num_episodes)
                    logger.record_tabular("mean 100 episode reward",
                                          mean_100ep_reward)
                    logger.record_tabular(
                        "% time spent exploring",
                        int(100 * self.exploration.value(self.num_timesteps)))
                    logger.dump_tabular()

                self.num_timesteps += 1
        return self, ds_rewards
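
A short sketch of the two exploration quantities computed in the DQN example above: a linearly annealed epsilon and the parameter-noise threshold that makes the perturbed policy comparable to eps-greedy exploration (Plappert et al., 2017). The schedule parameters below are illustrative, not taken from the example.

import numpy as np

def linear_eps(step, schedule_timesteps, initial_p=1.0, final_p=0.02):
    # Linear interpolation from initial_p to final_p, then held constant.
    frac = min(float(step) / schedule_timesteps, 1.0)
    return initial_p + frac * (final_p - initial_p)

def param_noise_threshold(eps, n_actions):
    # Same formula as in the snippet: -log(1 - eps + eps / |A|)
    return -np.log(1.0 - eps + eps / float(n_actions))

eps = linear_eps(step=5000, schedule_timesteps=10000)
print(eps, param_noise_threshold(eps, n_actions=4))
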
Example #18
0
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=None,
              tb_log_name="DDPG",
              reset_num_timesteps=True,
              replay_wrapper=None):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        if replay_wrapper is not None:
            self.replay_buffer = replay_wrapper(self.replay_buffer)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:

            # a list of steps already logged to TensorBoard, used to avoid logging the same step number twice
            self.tb_seen_steps = []

            # rank = MPI.COMM_WORLD.Get_rank()
            # we assume symmetric actions.
            assert np.all(
                np.abs(self.env.action_space.low) ==
                self.env.action_space.high)
            if self.verbose >= 2:
                logger.log('Using agent with the following configuration:')
                logger.log(str(self.__dict__.items()))

            with self.sess.as_default(), self.graph.as_default():
                # Prepare everything.
                self._reset()
                obs = self.env.reset()
                eval_obs = None
                if self.eval_env is not None:
                    eval_obs = self.eval_env.reset()

                episode_rewards_deque = deque(maxlen=100)
                eval_episode_rewards_deque = deque(maxlen=100)
                self.episode_reward = np.zeros((1, ))

                episode_successes = []
                episode_rewards_all = []
                episode_steps_all = []
                episode_reward = 0.
                episode_step = 0
                total_steps = 0
                step_since_eval = 0
                total_episode_num = 0

                start_time = time.time()

                while True:
                    # Perform rollouts.
                    qs_this_rollout_period = []
                    actions_this_rollout_period = []
                    while True:
                        if total_steps >= total_timesteps:
                            return self

                        # Predict next action (uniform random warm-up for the
                        # first 10000 steps, then the policy with action noise).
                        if total_steps <= 10000:
                            action = self.env.action_space.sample()
                            q_value = 0
                        else:
                            action, q_value = self._policy(obs,
                                                           apply_noise=True,
                                                           compute_q=True)
                        assert action.shape == self.env.action_space.shape

                        rescaled_action = action * np.abs(
                            self.action_space.low)
                        new_obs, reward, done, info = self.env.step(
                            rescaled_action)

                        if writer is not None:
                            ep_rew = np.array([reward]).reshape((1, -1))
                            ep_done = np.array([done]).reshape((1, -1))
                            self.episode_reward = total_episode_reward_logger(
                                self.episode_reward, ep_rew, ep_done, writer,
                                self.num_timesteps)
                        total_steps += 1
                        self.num_timesteps += 1
                        episode_reward += reward
                        episode_step += 1
                        step_since_eval += 1

                        # Book-keeping.
                        actions_this_rollout_period.append(action)
                        qs_this_rollout_period.append(q_value)
                        self._store_transition(obs, action, reward, new_obs,
                                               done)
                        obs = new_obs

                        if done:
                            # Episode done.
                            episode_rewards_all.append(episode_reward)
                            episode_rewards_deque.append(episode_reward)
                            episode_steps_all.append(episode_step)
                            episode_reward = 0.
                            episode_step = 0
                            total_episode_num += 1

                            maybe_is_success = info.get('is_success')
                            if maybe_is_success is not None:
                                episode_successes.append(
                                    float(maybe_is_success))

                            self._reset()
                            if not isinstance(self.env, VecEnv):
                                obs = self.env.reset()
                            break

                    # Train.
                    actor_losses_this_train_period = []
                    critic_losses_this_train_period = []
                    last_episode_step = int(episode_steps_all[-1])
                    for t_train in range(last_episode_step):
                        # Not enough samples in the replay buffer
                        if not self.replay_buffer.can_sample(self.batch_size):
                            break

                        # offset the step counter to account for nb_train_steps
                        # differing from nb_rollout_steps

                        step = total_steps - last_episode_step + t_train

                        critic_loss, actor_loss = self._train_step(
                            step, writer, do_actor_update=t_train % 2 == 0)
                        critic_losses_this_train_period.append(critic_loss)
                        if actor_loss:
                            actor_losses_this_train_period.append(actor_loss)
                            self._update_target_net()

                    # Evaluate.
                    eval_episode_rewards = []
                    eval_qs = []
                    if self.eval_env is not None and step_since_eval >= self.eval_freq:
                        step_since_eval %= self.eval_freq
                        eval_episode_reward = 0.
                        eval_episode = 0
                        while eval_episode < 10:
                            eval_action, eval_q = self._policy(
                                eval_obs, apply_noise=False, compute_q=True)
                            eval_obs, eval_r, eval_done, _ = self.eval_env.step(
                                eval_action * np.abs(self.action_space.low))
                            eval_episode_reward += eval_r

                            eval_qs.append(eval_q)
                            if eval_done:
                                if not isinstance(self.env, VecEnv):
                                    eval_obs = self.eval_env.reset()
                                eval_episode_rewards.append(
                                    eval_episode_reward)
                                eval_episode_rewards_deque.append(
                                    eval_episode_reward)
                                eval_episode_reward = 0.
                                eval_episode += 1

                    if callback is not None:
                        # Only stop training if return value is False, not when it is None.
                        # This is for backwards compatibility with callbacks that have no return statement.
                        if callback(locals(), globals()) is False:
                            return self

                    # mpi_size = MPI.COMM_WORLD.Get_size()
                    # Log stats.
                    # XXX shouldn't call np.mean on variable length lists
                    duration = time.time() - start_time
                    stats = self._get_stats()
                    combined_stats = stats.copy()
                    combined_stats['rollout/return'] = episode_rewards_all[-1]
                    combined_stats['rollout/return_last_100'] = np.mean(
                        episode_rewards_deque)
                    combined_stats[
                        'rollout/episode_steps'] = episode_steps_all[-1]
                    combined_stats['debug/actions_mean'] = np.mean(
                        actions_this_rollout_period)
                    combined_stats['debug/actions_std'] = np.std(
                        actions_this_rollout_period)
                    combined_stats['debug/Q_mean'] = np.mean(
                        qs_this_rollout_period)
                    combined_stats['train/loss_actor'] = np.mean(
                        actor_losses_this_train_period)
                    combined_stats['train/loss_critic'] = np.mean(
                        critic_losses_this_train_period)
                    combined_stats['total/duration'] = duration
                    combined_stats['total/steps_per_second'] = float(
                        total_steps) / float(duration)
                    # Evaluation statistics.
                    if self.eval_env is not None and eval_episode_rewards:
                        combined_stats['eval/return'] = np.mean(
                            eval_episode_rewards)
                        combined_stats['eval/return_history'] = np.mean(
                            eval_episode_rewards_deque)
                        combined_stats['eval/Q'] = np.mean(eval_qs)
                        combined_stats['eval/episodes'] = len(
                            eval_episode_rewards)

                    def as_scalar(scalar):
                        """
                        check and return the input if it is a scalar, otherwise raise ValueError

                        :param scalar: (Any) the object to check
                        :return: (Number) the scalar if the input is a scalar
                        """
                        if isinstance(scalar, np.ndarray):
                            assert scalar.size == 1
                            return scalar[0]
                        elif np.isscalar(scalar):
                            return scalar
                        else:
                            raise ValueError('expected scalar, got %s' %
                                             scalar)

                    # combined_stats_sums = MPI.COMM_WORLD.allreduce(
                    #     np.array([as_scalar(x) for x in combined_stats.values()]))
                    # combined_stats = {k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums)}

                    # Total statistics.
                    combined_stats['total/episodes'] = total_episode_num
                    combined_stats['total/steps'] = total_steps

                    for key in sorted(combined_stats.keys()):
                        logger.record_tabular(key, combined_stats[key])
                    if len(episode_successes) > 0:
                        logger.logkv("success rate",
                                     np.mean(episode_successes[-100:]))
                    logger.dump_tabular()
                    logger.info('')
                    logdir = logger.get_dir()
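
The rolling statistics above ("rollout/return_last_100", and "mean 100 episode reward" in the earlier examples) come from a bounded deque of recent episode returns. A tiny sketch of that pattern, with made-up returns:

from collections import deque

import numpy as np

episode_rewards_deque = deque(maxlen=100)   # oldest entries are dropped automatically
for episode_return in (12.0, 7.5, 9.0):     # illustrative episode returns
    episode_rewards_deque.append(episode_return)
print("return_last_100:", np.mean(episode_rewards_deque))
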
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=1,
              tb_log_name="PPO2",
              reset_num_timesteps=True,
              vae=None):
        # Transform the learning rate and clip range into callables (done below)

        self.writer2 = tf.summary.FileWriter('/tmp/ppo/PPO_test',
                                             graph=self.graph)

        self.learning_rate = get_schedule_fn(self.learning_rate)
        self.cliprange = get_schedule_fn(self.cliprange)
        new_tb_log = self._init_num_timesteps(
            reset_num_timesteps=reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn(seed)

            runner = Runner(env=self.env,
                            model=self,
                            n_steps=self.n_steps,
                            gamma=self.gamma,
                            lam=self.lam)
            self.episode_reward = np.zeros((self.n_envs, ))

            ep_info_buf = deque(maxlen=100)
            t_first_start = time.time()

            nupdates = total_timesteps // self.n_batch

            for update in range(1, nupdates + 1):
                assert self.n_batch % self.nminibatches == 0
                batch_size = self.n_batch // self.nminibatches
                t_start = time.time()
                frac = 1.0 - (update - 1.0) / nupdates
                lr_now = self.learning_rate(frac)
                cliprangenow = self.cliprange(frac)

                obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run(
                )
                ep_info_buf.extend(ep_infos)
                mb_loss_vals = []

                if states is None:
                    update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1
                    inds = np.arange(self.n_batch)
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(inds)
                        for start in range(0, self.n_batch, batch_size):
                            timestep = self.num_timesteps // update_fac + (
                                (self.noptepochs * self.n_batch + epoch_num *
                                 self.n_batch + start) // batch_size)
                            end = start + batch_size
                            mbinds = inds[start:end]
                            slices = (arr[mbinds]
                                      for arr in (obs, returns, masks, actions,
                                                  values, neglogpacs))
                            mb_loss_vals.append(
                                self._train_step(lr_now,
                                                 cliprangenow,
                                                 *slices,
                                                 writer=writer,
                                                 update=timestep))

                        self.num_timesteps += (self.n_batch * self.noptepochs
                                               ) // batch_size * update_fac
                    """Optimize the VAE"""
                    time_start = time.time()
                    vae.optimize()
                    print("Time to optimize the VAE: ",
                          time.time() - time_start)

                loss_vals = np.mean(mb_loss_vals, axis=0)
                t_now = time.time()
                fps = int(self.n_batch / (t_now - t_start))
                if update % 2 == 0:
                    self.env.testing = True
                    ob = np.zeros((self.env.num_envs, ) +
                                  self.env.observation_space.shape)

                    ob[:] = self.env.reset()
                    total_reward_test = 0
                    print("TESTING")
                    for i in range(1000):

                        actions = self.step(ob)
                        ob[:], reward, _, _ = self.env.step(actions)
                        total_reward_test += reward
                    summary2 = tf.Summary(value=[
                        tf.Summary.Value(tag="episode_reward",
                                         simple_value=total_reward_test)
                    ])
                    self.writer2.add_summary(summary2, self.num_timesteps)
                else:
                    self.env.testing = False

                if writer is not None and not self.env.testing:
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward,
                        true_reward.reshape((self.n_envs, self.n_steps)),
                        masks.reshape((self.n_envs, self.n_steps)), writer,
                        self.num_timesteps)

                if self.verbose >= 1 and (update % log_interval == 0
                                          or update == 1):
                    explained_var = explained_variance(values, returns)
                    logger.logkv("serial_timesteps", update * self.n_steps)
                    logger.logkv("nupdates", update)
                    logger.logkv("total_timesteps", self.num_timesteps)
                    logger.logkv("fps", fps)
                    logger.logkv("explained_variance", float(explained_var))
                    logger.logkv(
                        'ep_rewmean',
                        safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                    logger.logkv(
                        'eplenmean',
                        safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv('time_elapsed', t_start - t_first_start)
                    for (loss_val, loss_name) in zip(loss_vals,
                                                     self.loss_names):
                        logger.logkv(loss_name, loss_val)
                    logger.dumpkvs()
            return self
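
A minimal sketch of logging a scalar with the TensorFlow 1.x summary API used above, assuming TF 1.x as these examples do; unlike the snippet, it passes an explicit global step so the points are ordered on the x-axis. The log directory and values are illustrative.

import tensorflow as tf

writer = tf.summary.FileWriter('/tmp/ppo/PPO_test')
summary = tf.Summary(value=[
    tf.Summary.Value(tag="episode_reward", simple_value=123.0)
])
writer.add_summary(summary, global_step=1000)
writer.flush()
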
Example #20
0
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=100,
              tb_log_name="A2C"):
        with SetVerbosity(self.verbose), TensorboardWriter(
                self.graph, self.tensorboard_log, tb_log_name) as writer:
            self._setup_learn(seed)

            self.learning_rate_schedule = Scheduler(
                initial_value=self.learning_rate,
                n_values=total_timesteps,
                schedule=self.lr_schedule)

            # Entropy coefficient should be large at the beginning of training
            self.ent_coef_schedule = LinearSchedule(
                schedule_timesteps=int(1e6), initial_p=0.1, final_p=0.01)

            runner = A2CRunner(self.env,
                               self,
                               n_steps=self.n_steps,
                               gamma=self.gamma)
            self.episode_reward = np.zeros((self.n_envs, ))

            t_start = time.time()
            for update in range(1, total_timesteps // self.n_batch + 1):
                # true_reward is the reward without discount
                obs, states, rewards, masks, actions, values, true_reward = runner.run(
                )
                _, value_loss, policy_entropy = self._train_step(
                    update * self.n_batch, obs, states, rewards, masks,
                    actions, values, update, writer)

                n_seconds = time.time() - t_start
                fps = int((update * self.n_batch) / n_seconds)

                if writer is not None:
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward,
                        true_reward.reshape((self.n_envs, self.n_steps)),
                        masks.reshape((self.n_envs, self.n_steps)), writer,
                        update * (self.n_batch + 1))

                if callback is not None:
                    callback(locals(), globals())

                if self.verbose >= 1 and (update % log_interval == 0
                                          or update == 1):
                    explained_var = explained_variance(values, rewards)
                    logger.record_tabular("nupdates", update)
                    logger.record_tabular("total_timesteps",
                                          update * self.n_batch)
                    logger.record_tabular("fps", fps)
                    logger.record_tabular("policy_entropy",
                                          float(policy_entropy))
                    logger.record_tabular("value_loss", float(value_loss))
                    logger.record_tabular("explained_variance",
                                          float(explained_var))
                    logger.dump_tabular()

        return self
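
A minimal re-implementation of the LinearSchedule used for the entropy coefficient above (illustrative only, not the library class), showing the decay from 0.1 to 0.01 over 1e6 timesteps:

class LinearSchedule:
    def __init__(self, schedule_timesteps, initial_p, final_p):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, step):
        # Linear interpolation between initial_p and final_p, clamped at the end.
        frac = min(float(step) / self.schedule_timesteps, 1.0)
        return self.initial_p + frac * (self.final_p - self.initial_p)

ent_coef_schedule = LinearSchedule(schedule_timesteps=int(1e6),
                                   initial_p=0.1, final_p=0.01)
print(ent_coef_schedule.value(0),
      ent_coef_schedule.value(500000),
      ent_coef_schedule.value(int(1e6)))
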
Example #21
0
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=100,
              tb_log_name="PPO1"):
        with SetVerbosity(self.verbose), TensorboardWriter(
                self.graph, self.tensorboard_log, tb_log_name) as writer:
            self._setup_learn(seed)

            assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the PPO1 model must be " \
                                                               "an instance of common.policies.ActorCriticPolicy."

            with self.sess.as_default():
                self.adam.sync()

                # Prepare for rollouts
                seg_gen = traj_segment_generator(self.policy_pi, self.env,
                                                 self.timesteps_per_actorbatch)

                episodes_so_far = 0
                timesteps_so_far = 0
                iters_so_far = 0
                t_start = time.time()

                # rolling buffer for episode lengths
                lenbuffer = deque(maxlen=100)
                # rolling buffer for episode rewards
                rewbuffer = deque(maxlen=100)

                self.episode_reward = np.zeros((self.n_envs, ))

                while True:
                    if callback:
                        callback(locals(), globals())
                    if total_timesteps and timesteps_so_far >= total_timesteps:
                        break

                    if self.schedule == 'constant':
                        cur_lrmult = 1.0
                    elif self.schedule == 'linear':
                        cur_lrmult = max(
                            1.0 - float(timesteps_so_far) / total_timesteps, 0)
                    else:
                        raise NotImplementedError

                    logger.log("********** Iteration %i ************" %
                               iters_so_far)

                    seg = seg_gen.__next__()
                    add_vtarg_and_adv(seg, self.gamma, self.lam)

                    # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
                    obs_ph, action_ph, atarg, tdlamret = seg["ob"], seg[
                        "ac"], seg["adv"], seg["tdlamret"]

                    # true_rew is the reward without discount
                    if writer is not None:
                        self.episode_reward = total_episode_reward_logger(
                            self.episode_reward, seg["true_rew"].reshape(
                                (self.n_envs, -1)), seg["dones"].reshape(
                                    (self.n_envs, -1)), writer,
                            timesteps_so_far)

                    # predicted value function before update
                    vpredbefore = seg["vpred"]

                    # standardized advantage function estimate
                    atarg = (atarg - atarg.mean()) / atarg.std()
                    dataset = Dataset(
                        dict(ob=obs_ph,
                             ac=action_ph,
                             atarg=atarg,
                             vtarg=tdlamret),
                        shuffle=not issubclass(self.policy, LstmPolicy))
                    optim_batchsize = self.optim_batchsize or obs_ph.shape[0]

                    # set old parameter values to new parameter values
                    self.assign_old_eq_new(sess=self.sess)
                    logger.log("Optimizing...")
                    logger.log(fmt_row(13, self.loss_names))

                    # Here we do a bunch of optimization epochs over the data
                    for k in range(self.optim_epochs):
                        # list of tuples, each of which gives the loss for a minibatch
                        losses = []
                        for i, batch in enumerate(
                                dataset.iterate_once(optim_batchsize)):
                            steps = (
                                timesteps_so_far + k * optim_batchsize +
                                int(i *
                                    (optim_batchsize / len(dataset.data_map))))
                            if writer is not None:
                                # run loss backprop with summary, but once every 10 runs save the metadata
                                # (memory, compute time, ...)
                                if (1 + k) % 10 == 0:
                                    run_options = tf.RunOptions(
                                        trace_level=tf.RunOptions.FULL_TRACE)
                                    run_metadata = tf.RunMetadata()
                                    summary, grad, *newlosses = self.lossandgrad(
                                        batch["ob"],
                                        batch["ob"],
                                        batch["ac"],
                                        batch["atarg"],
                                        batch["vtarg"],
                                        cur_lrmult,
                                        sess=self.sess,
                                        options=run_options,
                                        run_metadata=run_metadata)
                                    writer.add_run_metadata(
                                        run_metadata, 'step%d' % steps)
                                else:
                                    summary, grad, *newlosses = self.lossandgrad(
                                        batch["ob"],
                                        batch["ob"],
                                        batch["ac"],
                                        batch["atarg"],
                                        batch["vtarg"],
                                        cur_lrmult,
                                        sess=self.sess)
                                writer.add_summary(summary, steps)
                            else:
                                _, grad, *newlosses = self.lossandgrad(
                                    batch["ob"],
                                    batch["ob"],
                                    batch["ac"],
                                    batch["atarg"],
                                    batch["vtarg"],
                                    cur_lrmult,
                                    sess=self.sess)

                            self.adam.update(grad,
                                             self.optim_stepsize * cur_lrmult)
                            losses.append(newlosses)
                        logger.log(fmt_row(13, np.mean(losses, axis=0)))

                    logger.log("Evaluating losses...")
                    losses = []
                    for batch in dataset.iterate_once(optim_batchsize):
                        newlosses = self.compute_losses(batch["ob"],
                                                        batch["ob"],
                                                        batch["ac"],
                                                        batch["atarg"],
                                                        batch["vtarg"],
                                                        cur_lrmult,
                                                        sess=self.sess)
                        losses.append(newlosses)
                    mean_losses, _, _ = mpi_moments(losses, axis=0)
                    logger.log(fmt_row(13, mean_losses))
                    for (loss_val, name) in zipsame(mean_losses,
                                                    self.loss_names):
                        logger.record_tabular("loss_" + name, loss_val)
                    logger.record_tabular(
                        "ev_tdlam_before",
                        explained_variance(vpredbefore, tdlamret))

                    # local values
                    lrlocal = (seg["ep_lens"], seg["ep_rets"])

                    # list of tuples
                    listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)
                    lens, rews = map(flatten_lists, zip(*listoflrpairs))
                    lenbuffer.extend(lens)
                    rewbuffer.extend(rews)
                    logger.record_tabular("EpLenMean", np.mean(lenbuffer))
                    logger.record_tabular("EpRewMean", np.mean(rewbuffer))
                    logger.record_tabular("EpThisIter", len(lens))
                    episodes_so_far += len(lens)
                    timesteps_so_far += MPI.COMM_WORLD.allreduce(
                        seg["total_timestep"])
                    iters_so_far += 1
                    logger.record_tabular("EpisodesSoFar", episodes_so_far)
                    logger.record_tabular("TimestepsSoFar", timesteps_so_far)
                    logger.record_tabular("TimeElapsed", time.time() - t_start)
                    if self.verbose >= 1 and MPI.COMM_WORLD.Get_rank() == 0:
                        logger.dump_tabular()

        return self
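
A small sketch of the advantage standardization step above; the eps term is an extra numerical-safety guard that is not in the original snippet.

import numpy as np

def standardize_advantages(atarg, eps=1e-8):
    # Zero-mean, unit-std advantages, as in the PPO1 snippet above.
    return (atarg - atarg.mean()) / (atarg.std() + eps)

print(standardize_advantages(np.array([1.0, 2.0, 3.0, 4.0])))
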
Example #22
0
    def learn(self,
              total_timesteps,
              callback=None,
              log_interval=1,
              tb_log_name="PPO2",
              reset_num_timesteps=True):
        # Transform to callable if needed
        self.learning_rate = get_schedule_fn(self.learning_rate)
        self.cliprange = get_schedule_fn(self.cliprange)
        cliprange_vf = get_schedule_fn(self.cliprange_vf)

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)
        callback = self._init_callback(callback)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn()

            t_first_start = time.time()
            n_updates = total_timesteps // self.n_batch

            callback.on_training_start(locals(), globals())

            for update in range(1, n_updates + 1):
                assert self.n_batch % self.nminibatches == 0, (
                    "The number of minibatches (`nminibatches`) "
                    "is not a factor of the total number of samples "
                    "collected per rollout (`n_batch`), "
                    "some samples won't be used.")
                batch_size = self.n_batch // self.nminibatches
                t_start = time.time()
                frac = 1.0 - (update - 1.0) / n_updates
                lr_now = self.learning_rate(frac)
                cliprange_now = self.cliprange(frac)
                cliprange_vf_now = cliprange_vf(frac)

                callback.on_rollout_start()
                # true_reward is the reward without discount
                rollout = self.runner.run(callback)
                # Unpack
                obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = rollout

                callback.on_rollout_end()

                # Early stopping due to the callback
                if not self.runner.continue_training:
                    break

                self.ep_info_buf.extend(ep_infos)
                mb_loss_vals = []
                if states is None:  # nonrecurrent version
                    update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1
                    inds = np.arange(self.n_batch)
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(inds)
                        for start in range(0, self.n_batch, batch_size):
                            timestep = self.num_timesteps // update_fac + (
                                (self.noptepochs * self.n_batch + epoch_num *
                                 self.n_batch + start) // batch_size)
                            end = start + batch_size
                            mbinds = inds[start:end]
                            slices = (arr[mbinds]
                                      for arr in (obs, returns, masks, actions,
                                                  values, neglogpacs))
                            mb_loss_vals.append(
                                self._train_step(
                                    lr_now,
                                    cliprange_now,
                                    *slices,
                                    writer=writer,
                                    update=timestep,
                                    cliprange_vf=cliprange_vf_now))
                else:  # recurrent version
                    update_fac = self.n_batch // self.nminibatches // self.noptepochs // self.n_steps + 1
                    assert self.n_envs % self.nminibatches == 0
                    env_indices = np.arange(self.n_envs)
                    flat_indices = np.arange(self.n_envs *
                                             self.n_steps).reshape(
                                                 self.n_envs, self.n_steps)
                    envs_per_batch = batch_size // self.n_steps
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(env_indices)
                        for start in range(0, self.n_envs, envs_per_batch):
                            timestep = self.num_timesteps // update_fac + (
                                (self.noptepochs * self.n_envs + epoch_num *
                                 self.n_envs + start) // envs_per_batch)
                            end = start + envs_per_batch
                            mb_env_inds = env_indices[start:end]
                            mb_flat_inds = flat_indices[mb_env_inds].ravel()
                            slices = (arr[mb_flat_inds]
                                      for arr in (obs, returns, masks, actions,
                                                  values, neglogpacs))
                            mb_states = states[mb_env_inds]
                            mb_loss_vals.append(
                                self._train_step(
                                    lr_now,
                                    cliprange_now,
                                    *slices,
                                    update=timestep,
                                    writer=writer,
                                    states=mb_states,
                                    cliprange_vf=cliprange_vf_now))

                loss_vals = np.mean(mb_loss_vals, axis=0)
                t_now = time.time()
                fps = int(self.n_batch / (t_now - t_start))

                if writer is not None:
                    total_episode_reward_logger(
                        self.episode_reward,
                        true_reward.reshape((self.n_envs, self.n_steps)),
                        masks.reshape((self.n_envs, self.n_steps)), writer,
                        self.num_timesteps)

                if self.verbose >= 1 and (update % log_interval == 0
                                          or update == 1):
                    explained_var = explained_variance(values, returns)
                    logger.logkv("serial_timesteps", update * self.n_steps)
                    logger.logkv("n_updates", update)
                    logger.logkv("total_timesteps", self.num_timesteps)
                    logger.logkv("fps", fps)
                    logger.logkv("explained_variance", float(explained_var))
                    if len(self.ep_info_buf) > 0 and len(
                            self.ep_info_buf[0]) > 0:
                        logger.logkv(
                            'ep_reward_mean',
                            safe_mean([
                                ep_info['r'] for ep_info in self.ep_info_buf
                            ]))
                        logger.logkv(
                            'ep_len_mean',
                            safe_mean([
                                ep_info['l'] for ep_info in self.ep_info_buf
                            ]))
                    logger.logkv('time_elapsed', t_start - t_first_start)
                    for (loss_val, loss_name) in zip(loss_vals,
                                                     self.loss_names):
                        logger.logkv(loss_name, loss_val)
                    logger.dumpkvs()

                # Save the model every 5000 timesteps
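                # Note: this fires only when num_timesteps lands exactly on a multiple of 5000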
                if self.num_timesteps % 5000 == 0:
                    self.save(self.model_dir +
                              "/step_{}".format(self.num_timesteps))

            callback.on_training_end()
            return self
Example #23
0
File: idac.py  Project: zhougroup/IDAC
    def learn(self, total_timesteps, env_eval, callback=None, seed=None, path=None, dis_path=None, score_path=None,
              dis_eval_interval=100, log_interval=4, tb_log_name="SAC", reset_num_timesteps=True, replay_wrapper=None):

        self.eval_env = env_eval
        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        if replay_wrapper is not None:
            self.replay_buffer = replay_wrapper(self.replay_buffer)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:

            self._setup_learn(seed)

            # Transform to callable if needed
            self.learning_rate = get_schedule_fn(self.learning_rate)
            # Initial learning rate
            current_lr = self.learning_rate(1)

            start_time = time.time()
            episode_rewards = [0.0]
            episode_successes = []
            if self.action_noise is not None:
                self.action_noise.reset()
            obs = self.env.reset()
            self.episode_reward = np.zeros((1,))
            ep_info_buf = deque(maxlen=100)
            n_updates = 0
            infos_values = []

            dis_eval_array = []  # (total_step % eval_interval) x 2 x n_batch
            self.ep_length = 0

            for step in range(total_timesteps):
                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

                # Before training starts, randomly sample actions
                # from a uniform distribution for better exploration.
                # Afterwards, use the learned policy
                # if random_exploration is set to 0 (normal setting)
                if (self.num_timesteps < self.learning_starts
                    or np.random.rand() < self.random_exploration):
                    # No need to rescale when sampling random action
                    rescaled_action = action = self.env.action_space.sample()
                    noise = np.zeros(self.noise_dim)
                else:
                    noise = self.policy_tf.gen_noise(obs[None]).flatten()
                    action = self.policy_tf.step(obs[None], noise[None], deterministic=False).flatten()

                    # Add noise to the action (improve exploration,
                    # not needed in general)
                    if self.action_noise is not None:
                        action = np.clip(action + self.action_noise(), -1, 1)
                    # Rescale from [-1, 1] to the correct bounds
                    rescaled_action = action * np.abs(self.action_space.low)

                assert action.shape == self.env.action_space.shape

                new_obs, reward, done, info = self.env.step(rescaled_action)
                self.ep_length += 1

                # Store transition in the replay buffer.
                self.replay_buffer.add(obs, action, reward, new_obs, float(done), noise)

                episode_rewards[-1] += reward
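                # Treat reaching max_ep_length the same as a terminal state for resets and bookkeeping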
                reset_flag = done or self.ep_length >= self.max_ep_length
                if reset_flag:
                    if self.action_noise is not None:
                        self.action_noise.reset()
                    obs = self.env.reset()
                    episode_rewards.append(0.0)
                    self.ep_length = 0

                    maybe_is_success = info.get('is_success')
                    if maybe_is_success is not None:
                        episode_successes.append(float(maybe_is_success))
                else:
                    obs = new_obs

                # Retrieve reward and episode length if using Monitor wrapper
                maybe_ep_info = info.get('episode')
                if maybe_ep_info is not None:
                    ep_info_buf.extend([maybe_ep_info])

                if writer is not None:
                    # Write reward per episode to tensorboard
                    ep_reward = np.array([reward]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_reward,
                                                                      ep_done, writer, self.num_timesteps)

                if step % self.train_freq == 0:
                    mb_infos_vals = []
                    # Update policy, critics and target networks
                    for grad_step in range(self.gradient_steps):
                        # Break if the warmup phase is not over
                        # or if there are not enough samples in the replay buffer
                        if not self.replay_buffer.can_sample(self.batch_size) \
                           or self.num_timesteps < self.learning_starts:
                            break
                        n_updates += 1
                        # Compute current learning_rate
                        frac = 1.0 - step / total_timesteps
                        current_lr = self.learning_rate(frac)
                        # Update policy and critics (q functions)

                        mb_infos_vals.append(self._train_step(step, writer, current_lr, dis_eval_array,
                                                              dis_eval_interval, dis_path))
                        # Update target network
                        if (step + grad_step) % self.target_update_interval == 0:
                            # Update target network
                            self.sess.run(self.target_update_op)
                    # Log losses and entropy, useful for monitor training
                    if len(mb_infos_vals) > 0:
                        infos_values = np.mean(mb_infos_vals, axis=0)

                if self.num_timesteps % 2000 == 0:
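                    # Run six deterministic evaluation episodes on eval_env and
                    # append the mean return to score_path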
                    eval_ob = self.eval_env.reset()
                    eval_epi_rewards = 0
                    eval_epis = 0
                    eval_performance = []
                    eval_ep_step = 0
                    while True:
                        eval_noise = self.policy_tf.gen_noise(eval_ob[None]).flatten()
                        eval_action = self.policy_tf.step(eval_ob[None], eval_noise[None], deterministic=True).flatten()
                        eval_rescaled_action = eval_action * np.abs(self.action_space.low)
                        eval_new_obs, eval_reward, eval_done, eval_info = self.eval_env.step(eval_rescaled_action)
                        eval_epi_rewards += eval_reward
                        eval_ob = eval_new_obs
                        eval_ep_step += 1
                        if eval_done or eval_ep_step >= self.max_ep_length:
                            eval_ob = self.eval_env.reset()
                            eval_performance.append(eval_epi_rewards)
                            eval_epi_rewards = 0
                            eval_epis += 1
                            eval_ep_step = 0
                            if eval_epis > 5:
                                break
                    with open(score_path, 'a') as f2:
                        f2.write("%i %f\n" % (self.num_timesteps, np.mean(eval_performance)))

                if len(episode_rewards[-101:-1]) == 0:
                    mean_reward = -np.inf
                else:
                    mean_reward = round(float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                self.num_timesteps += 1
                # Display training infos
                if self.verbose >= 1 and reset_flag and log_interval is not None and len(episode_rewards) % log_interval == 0:
                    fps = int(step / (time.time() - start_time))
                    logger.logkv("episodes", num_episodes)
                    logger.logkv("mean 100 episode reward", mean_reward)
                    with open(path, 'a') as f1:
                        f1.write("%f " % step)
                        f1.write("%f " % mean_reward)
                    if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                        logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                        logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                        with open(path, 'a') as f1:
                            f1.write("%f " % safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                            f1.write("%f " % safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv("n_updates", n_updates)
                    with open(path, 'a') as f1:
                        f1.write("%f " % n_updates)
                    logger.logkv("current_lr", current_lr)
                    logger.logkv("fps", fps)
                    logger.logkv('time_elapsed', int(time.time() - start_time))
                    
                    if len(episode_successes) > 0:
                        logger.logkv("success rate", np.mean(episode_successes[-100:]))
                    if len(infos_values) > 0:
                        for (name, val) in zip(self.infos_names, infos_values):
                            logger.logkv(name, val)

                    logger.logkv("total timesteps", self.num_timesteps)
                    logger.dumpkvs()
                    # Reset infos:
                    infos_values = []

            return self
Example #24
0
    def learn(self,
              total_timesteps,
              callback=None,
              log_interval=100,
              tb_log_name="DQN",
              reset_num_timesteps=True,
              replay_wrapper=None):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn()

            # Create the replay buffer
            if self.prioritized_replay:
                self.replay_buffer = PrioritizedReplayBuffer(
                    self.buffer_size, alpha=self.prioritized_replay_alpha)
                if self.prioritized_replay_beta_iters is None:
                    prioritized_replay_beta_iters = total_timesteps
                else:
                    prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
                self.beta_schedule = LinearSchedule(
                    prioritized_replay_beta_iters,
                    initial_p=self.prioritized_replay_beta0,
                    final_p=1.0)
            else:
                self.replay_buffer = ReplayBuffer(self.buffer_size)
                self.beta_schedule = None

            if replay_wrapper is not None:
                assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER"
                self.replay_buffer = replay_wrapper(self.replay_buffer)

            # Create the schedule for exploration starting from 1.
            self.exploration = LinearSchedule(
                schedule_timesteps=int(self.exploration_fraction *
                                       total_timesteps),
                initial_p=self.exploration_initial_eps,
                final_p=self.exploration_final_eps)

            episode_rewards = [0.0]
            episode_successes = []
            obs = self.env.reset()
            reset = True

            ############################################################
            # MODIFICATION:
            # Track list of actions taken each episode. This is
            # intentionally not a set so that we can use np.isin.
            action_list = list()
            ############################################################

            for _ in range(total_timesteps):
                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break
                # Take action and update exploration to the newest value
                kwargs = {}
                if not self.param_noise:
                    update_eps = self.exploration.value(self.num_timesteps)
                    update_param_noise_threshold = 0.
                else:
                    update_eps = 0.
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = \
                        -np.log(1. - self.exploration.value(self.num_timesteps) +
                                self.exploration.value(self.num_timesteps) / float(self.env.action_space.n))
                    kwargs['reset'] = reset
                    kwargs[
                        'update_param_noise_threshold'] = update_param_noise_threshold
                    kwargs['update_param_noise_scale'] = True
                with self.sess.as_default():
                    ####################################################
                    # MODIFICATION:
                    # Rename variable from original, since it's now
                    # going to come back as an array due to the
                    # modified build_act function being used to
                    # construct everything.
                    action_arr = self.act(np.array(obs)[None],
                                          update_eps=update_eps,
                                          **kwargs)[0]
                    ####################################################
                    # ORIGINAL:
                    # action = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0]

                ########################################################
                # MODIFICATION:
                # Get the best action that has not yet been taken this
                # episode.
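                # np.isin flags actions already taken this episode; np.argmin on the
                # boolean array returns the index of the first False entry, i.e. the
                # highest-ranked action not yet taken (index 0 if all have been taken).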
                action = \
                    action_arr[np.argmin(np.isin(action_arr, action_list))]
                # Add this action to the list.
                action_list.append(action)
                ########################################################

                env_action = action
                reset = False
                new_obs, rew, done, info = self.env.step(env_action)
                # Store transition in the replay buffer.
                self.replay_buffer.add(obs, action, rew, new_obs, float(done))
                obs = new_obs

                if writer is not None:
                    ep_rew = np.array([rew]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    total_episode_reward_logger(self.episode_reward, ep_rew,
                                                ep_done, writer,
                                                self.num_timesteps)

                episode_rewards[-1] += rew
                if done:
                    ####################################################
                    # MODIFICATION:
                    # Clear the list.
                    action_list.clear()
                    ####################################################
                    maybe_is_success = info.get('is_success')
                    if maybe_is_success is not None:
                        episode_successes.append(float(maybe_is_success))
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()
                    episode_rewards.append(0.0)
                    reset = True

                # Do not train if the warmup phase is not over
                # or if there are not enough samples in the replay buffer
                can_sample = self.replay_buffer.can_sample(self.batch_size)
                if can_sample and self.num_timesteps > self.learning_starts \
                        and self.num_timesteps % self.train_freq == 0:
                    # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                    # pytype:disable=bad-unpacking
                    if self.prioritized_replay:
                        assert self.beta_schedule is not None, \
                               "BUG: should be LinearSchedule when self.prioritized_replay True"
                        experience = self.replay_buffer.sample(
                            self.batch_size,
                            beta=self.beta_schedule.value(self.num_timesteps))
                        (obses_t, actions, rewards, obses_tp1, dones, weights,
                         batch_idxes) = experience
                    else:
                        obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(
                            self.batch_size)
                        weights, batch_idxes = np.ones_like(rewards), None
                    # pytype:enable=bad-unpacking

                    if writer is not None:
                        # run loss backprop with summary, but once every 100 steps save the metadata
                        # (memory, compute time, ...)
                        if (1 + self.num_timesteps) % 100 == 0:
                            run_options = tf.RunOptions(
                                trace_level=tf.RunOptions.FULL_TRACE)
                            run_metadata = tf.RunMetadata()
                            summary, td_errors = self._train_step(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                obses_tp1,
                                dones,
                                weights,
                                sess=self.sess,
                                options=run_options,
                                run_metadata=run_metadata)
                            writer.add_run_metadata(
                                run_metadata, 'step%d' % self.num_timesteps)
                        else:
                            summary, td_errors = self._train_step(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                obses_tp1,
                                dones,
                                weights,
                                sess=self.sess)
                        writer.add_summary(summary, self.num_timesteps)
                    else:
                        _, td_errors = self._train_step(obses_t,
                                                        actions,
                                                        rewards,
                                                        obses_tp1,
                                                        obses_tp1,
                                                        dones,
                                                        weights,
                                                        sess=self.sess)

                    if self.prioritized_replay:
                        new_priorities = np.abs(
                            td_errors) + self.prioritized_replay_eps
                        assert isinstance(self.replay_buffer,
                                          PrioritizedReplayBuffer)
                        self.replay_buffer.update_priorities(
                            batch_idxes, new_priorities)

                if can_sample and self.num_timesteps > self.learning_starts and \
                        self.num_timesteps % self.target_network_update_freq == 0:
                    # Update target network periodically.
                    self.update_target(sess=self.sess)

                if len(episode_rewards[-101:-1]) == 0:
                    mean_100ep_reward = -np.inf
                else:
                    mean_100ep_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                if self.verbose >= 1 and done and log_interval is not None and len(
                        episode_rewards) % log_interval == 0:
                    logger.record_tabular("steps", self.num_timesteps)
                    logger.record_tabular("episodes", num_episodes)
                    if len(episode_successes) > 0:
                        logger.logkv("success rate",
                                     np.mean(episode_successes[-100:]))
                    logger.record_tabular("mean 100 episode reward",
                                          mean_100ep_reward)
                    logger.record_tabular(
                        "% time spent exploring",
                        int(100 * self.exploration.value(self.num_timesteps)))
                    logger.dump_tabular()

                self.num_timesteps += 1

        return self
Example #25
0
    def learn(self, total_timesteps, callback=None, seed=None,
              log_interval=4, tb_log_name="SAC", reset_num_timesteps=True):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:

            self._setup_learn(seed)

            # Transform to callable if needed
            self.learning_rate = get_schedule_fn(self.learning_rate)
            # Initial learning rate
            current_lr = self.learning_rate(1)

            start_time = time.time()
            episode_rewards = [0.0]
            obs = self.env.reset()
            self.episode_reward = np.zeros((1,))
            ep_info_buf = deque(maxlen=100)
            n_updates = 0
            infos_values = []

            obs = self.env.reset()
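            # Warm-up: take 128 random actions to seed self.iiayn's observation history
            # before the main training loop starts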
            for _ in range(128):
                action = self.env.action_space.sample()
                new_obs, reward, done, info = self.env.step(action)
                # print(new_obs)
                # self.env.render()
                self.iiayn.update_history([obs])
                obs = new_obs

            for step in range(total_timesteps):
                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

                # Before training starts, randomly sample actions
                # from a uniform distribution for better exploration.
                # Afterwards, use the learned policy.
                if self.num_timesteps < self.learning_starts:
                    action = self.env.action_space.sample()
                    # No need to rescale when sampling random action
                    rescaled_action = action
                else:
                    action = self.policy_tf.step(obs[None], deterministic=False).flatten()
                    # Rescale from [-1, 1] to the correct bounds
                    rescaled_action = action #* np.abs(self.action_space.low)

                assert action.shape == self.env.action_space.shape

                new_obs, reward, done, info = self.env.step(rescaled_action)

                print(step, action)
                # self.env.render()

                self.iiayn.update_history([obs])

                if step % 2048 == 0:
                    self.iiayn.activate_buffer()

                # Store transition in the replay buffer.
                self.replay_buffer.add(obs, action, reward, new_obs, float(done))
                obs = new_obs

                # Retrieve reward and episode length if using Monitor wrapper
                maybe_ep_info = info.get('episode')
                if maybe_ep_info is not None:
                    ep_info_buf.extend([maybe_ep_info])

                if writer is not None:
                    # Write reward per episode to tensorboard
                    ep_reward = np.array([reward]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_reward,
                                                                      ep_done, writer, self.num_timesteps)

                if step % self.train_freq == 0:
                    mb_infos_vals = []
                    # Update policy, critics and target networks
                    for grad_step in range(self.gradient_steps):
                        if self.num_timesteps < self.batch_size or self.num_timesteps < self.learning_starts:
                            break
                        n_updates += 1
                        # Compute current learning_rate
                        frac = 1.0 - step / total_timesteps
                        current_lr = self.learning_rate(frac)
                        # Update policy and critics (q functions)
                        mb_infos_vals.append(self._train_step(step, writer, current_lr))
                        # Update target network
                        if (step + grad_step) % self.target_update_interval == 0:
                            # Update target network
                            self.sess.run(self.target_update_op)
                    # Log losses and entropy, useful for monitor training
                    if len(mb_infos_vals) > 0:
                        infos_values = np.mean(mb_infos_vals, axis=0)

                episode_rewards[-1] += reward
                if done or step % 1024 == 0:
                    obs = self.env.reset()
                    # if not isinstance(self.env, VecEnv):
                    #     obs = self.env.reset()
                    episode_rewards.append(0.0)

                if len(episode_rewards[-101:-1]) == 0:
                    mean_reward = -np.inf
                else:
                    mean_reward = round(float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                self.num_timesteps += 1
                # Display training infos
                if self.verbose >= 1 and done and log_interval is not None and len(episode_rewards) % log_interval == 0:
                    fps = int(step / (time.time() - start_time))
                    logger.logkv("episodes", num_episodes)
                    logger.logkv("mean 100 episode reward", mean_reward)
                    logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                    logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv("n_updates", n_updates)
                    logger.logkv("current_lr", current_lr)
                    logger.logkv("fps", fps)
                    logger.logkv('time_elapsed', int(time.time() - start_time))
                    if len(infos_values) > 0:
                        for (name, val) in zip(self.infos_names, infos_values):
                            logger.logkv(name, val)
                    logger.logkv("total timesteps", self.num_timesteps)
                    logger.dumpkvs()
                    # Reset infos:
                    infos_values = []
            return self
Example #26
0
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=100,
              tb_log_name="ACER"):
        with SetVerbosity(self.verbose), TensorboardWriter(
                self.graph, self.tensorboard_log, tb_log_name) as writer:
            self._setup_learn(seed)

            self.learning_rate_schedule = Scheduler(
                initial_value=self.learning_rate,
                n_values=total_timesteps,
                schedule=self.lr_schedule)

            episode_stats = EpisodeStats(self.n_steps, self.n_envs)

            runner = _Runner(env=self.env, model=self, n_steps=self.n_steps)
            self.episode_reward = np.zeros((self.n_envs, ))
            if self.replay_ratio > 0:
                buffer = Buffer(env=self.env,
                                n_steps=self.n_steps,
                                size=self.buffer_size)
            else:
                buffer = None

            t_start = time.time()

            # n_batch samples, 1 on_policy call and multiple off-policy calls
            for steps in range(0, total_timesteps, self.n_batch):
                enc_obs, obs, actions, rewards, mus, dones, masks = runner.run(
                )
                episode_stats.feed(rewards, dones)

                if buffer is not None:
                    buffer.put(enc_obs, actions, rewards, mus, dones, masks)

                if writer is not None:
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward,
                        rewards.reshape((self.n_envs, self.n_steps)),
                        dones.reshape((self.n_envs, self.n_steps)), writer,
                        steps)

                # reshape stuff correctly
                obs = obs.reshape(runner.batch_ob_shape)
                actions = actions.reshape([runner.n_batch])
                rewards = rewards.reshape([runner.n_batch])
                mus = mus.reshape([runner.n_batch, runner.n_act])
                dones = dones.reshape([runner.n_batch])
                masks = masks.reshape([runner.batch_ob_shape[0]])

                names_ops, values_ops = self._train_step(
                    obs, actions, rewards, dones, mus, self.initial_state,
                    masks, steps, writer)

                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

                if self.verbose >= 1 and (int(steps / runner.n_batch) %
                                          log_interval == 0):
                    logger.record_tabular("total_timesteps", steps)
                    logger.record_tabular("fps",
                                          int(steps / (time.time() - t_start)))
                    # IMP: In EpisodicLife env, during training, we get done=True at each loss of life,
                    # not just at the terminal state. Thus, this is mean until end of life, not end of episode.
                    # For true episode rewards, see the monitor files in the log folder.
                    logger.record_tabular("mean_episode_length",
                                          episode_stats.mean_length())
                    logger.record_tabular("mean_episode_reward",
                                          episode_stats.mean_reward())
                    for name, val in zip(names_ops, values_ops):
                        logger.record_tabular(name, float(val))
                    logger.dump_tabular()

                if self.replay_ratio > 0 and buffer.has_atleast(
                        self.replay_start):
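                    # Number of off-policy (replay) updates is drawn from a Poisson
                    # distribution with mean replay_ratio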
                    samples_number = np.random.poisson(self.replay_ratio)
                    for _ in range(samples_number):
                        # get obs, actions, rewards, mus, dones from buffer.
                        obs, actions, rewards, mus, dones, masks = buffer.get()

                        # reshape stuff correctly
                        obs = obs.reshape(runner.batch_ob_shape)
                        actions = actions.reshape([runner.n_batch])
                        rewards = rewards.reshape([runner.n_batch])
                        mus = mus.reshape([runner.n_batch, runner.n_act])
                        dones = dones.reshape([runner.n_batch])
                        masks = masks.reshape([runner.batch_ob_shape[0]])

                        self._train_step(obs, actions, rewards, dones, mus,
                                         self.initial_state, masks, steps)

        return self
Example #27
0
    def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="DDPG", \
             reset_num_timesteps=True, replay_wrapper=None):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        if replay_wrapper is not None:
            self.replay_buffer = replay_wrapper(self.replay_buffer)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn(seed)

            # a list for tensorboard logging, to prevent logging with the same step number, if it already occured
            self.tb_seen_steps = []

            rank = MPI.COMM_WORLD.Get_rank()
            # we assume symmetric actions.
            assert np.all(
                np.abs(self.env.action_space.low) ==
                self.env.action_space.high)
            if self.verbose >= 2:
                logger.log('Using agent with the following configuration:')
                logger.log(str(self.__dict__.items()))

            eval_episode_rewards_history = deque(maxlen=100)
            episode_rewards_history = deque(maxlen=100)
            self.episode_reward = np.zeros((1, ))
            episode_successes = []
            with self.sess.as_default(), self.graph.as_default():
                # Prepare everything.
                self._reset()
                obs = self.env.reset()
                eval_obs = None
                if self.eval_env is not None:
                    eval_obs = self.eval_env.reset()
                episode_reward = 0.
                episode_step = 0
                episodes = 0
                step = 0
                total_steps = 0

                start_time = time.time()

                epoch_episode_rewards = []
                epoch_episode_steps = []
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                eval_episode_rewards = []
                eval_qs = []
                epoch_actions = []
                epoch_qs = []
                epoch_episodes = 0
                epoch = 0
                while True:
                    for _ in range(log_interval):
                        # Perform rollouts.
                        for _ in range(self.nb_rollout_steps):
                            if total_steps >= total_timesteps:
                                return self

                            # Predict next action.
                            action, q_value = self._policy(obs,
                                                           apply_noise=True,
                                                           compute_q=True)
                            assert action.shape == self.env.action_space.shape

                            # Execute next action.
                            if rank == 0 and self.render:
                                self.env.render()

                            # Randomly sample actions from a uniform distribution
                            # with a probabilty self.random_exploration (used in HER + DDPG)
                            if np.random.rand() < self.random_exploration:
                                rescaled_action = action = self.action_space.sample(
                                )
                            else:
                                rescaled_action = action * np.abs(
                                    self.action_space.low)

                            rescaled_action = np.where(action)[0][0]
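                            # NOTE: the line above discards the scaled action and appears to
                            # convert a one-hot action vector into a discrete index
                            # (the position of its first nonzero entry)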
                            new_obs, reward, done, info = self.env.step(
                                rescaled_action)

                            if writer is not None:
                                ep_rew = np.array([reward]).reshape((1, -1))
                                ep_done = np.array([done]).reshape((1, -1))
                                self.episode_reward = total_episode_reward_logger(
                                    self.episode_reward, ep_rew, ep_done,
                                    writer, self.num_timesteps)
                            step += 1
                            total_steps += 1
                            self.num_timesteps += 1
                            if rank == 0 and self.render:
                                self.env.render()
                            episode_reward += reward
                            episode_step += 1

                            # Book-keeping.
                            epoch_actions.append(action)
                            epoch_qs.append(q_value)
                            self._store_transition(obs, action, reward,
                                                   new_obs, done)
                            obs = new_obs
                            if callback is not None:
                                # Only stop training if return value is False, not when it is None.
                                # This is for backwards compatibility with callbacks that have no return statement.
                                if callback(locals(), globals()) is False:
                                    return self

                            if done:
                                # Episode done.
                                epoch_episode_rewards.append(episode_reward)
                                episode_rewards_history.append(episode_reward)
                                epoch_episode_steps.append(episode_step)
                                episode_reward = 0.
                                episode_step = 0
                                epoch_episodes += 1
                                episodes += 1

                                maybe_is_success = info.get('is_success')
                                if maybe_is_success is not None:
                                    episode_successes.append(
                                        float(maybe_is_success))

                                self._reset()
                                if not isinstance(self.env, VecEnv):
                                    obs = self.env.reset()

                        # Train.
                        epoch_actor_losses = []
                        epoch_critic_losses = []
                        epoch_adaptive_distances = []
                        for t_train in range(self.nb_train_steps):
                            # Not enough samples in the replay buffer
                            if not self.replay_buffer.can_sample(
                                    self.batch_size):
                                break

                            # Adapt param noise, if necessary.
                            if len(self.replay_buffer) >= self.batch_size and \
                                    t_train % self.param_noise_adaption_interval == 0:
                                distance = self._adapt_param_noise()
                                epoch_adaptive_distances.append(distance)

                            # weird equation to deal with the fact the nb_train_steps will be different
                            # to nb_rollout_steps
                            step = (int(t_train * (self.nb_rollout_steps /
                                                   self.nb_train_steps)) +
                                    self.num_timesteps - self.nb_rollout_steps)

                            critic_loss, actor_loss = self._train_step(
                                step, writer, log=t_train == 0)
                            epoch_critic_losses.append(critic_loss)
                            epoch_actor_losses.append(actor_loss)
                            self._update_target_net()

                        # Evaluate.
                        eval_episode_rewards = []
                        eval_qs = []
                        if self.eval_env is not None:
                            eval_episode_reward = 0.
                            for _ in range(self.nb_eval_steps):
                                if total_steps >= total_timesteps:
                                    return self

                                eval_action, eval_q = self._policy(
                                    eval_obs,
                                    apply_noise=False,
                                    compute_q=True)
                                eval_obs, eval_r, eval_done, _ = self.eval_env.step(
                                    eval_action *
                                    np.abs(self.action_space.low))
                                if self.render_eval:
                                    self.eval_env.render()
                                eval_episode_reward += eval_r

                                eval_qs.append(eval_q)
                                if eval_done:
                                    if not isinstance(self.env, VecEnv):
                                        eval_obs = self.eval_env.reset()
                                    eval_episode_rewards.append(
                                        eval_episode_reward)
                                    eval_episode_rewards_history.append(
                                        eval_episode_reward)
                                    eval_episode_reward = 0.

                    mpi_size = MPI.COMM_WORLD.Get_size()
                    # Log stats.
                    # XXX shouldn't call np.mean on variable length lists
                    duration = time.time() - start_time
                    stats = self._get_stats()
                    combined_stats = stats.copy()
                    combined_stats['rollout/return'] = np.mean(
                        epoch_episode_rewards)
                    combined_stats['rollout/return_history'] = np.mean(
                        episode_rewards_history)
                    combined_stats['rollout/episode_steps'] = np.mean(
                        epoch_episode_steps)
                    combined_stats['rollout/actions_mean'] = np.mean(
                        epoch_actions)
                    combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
                    combined_stats['train/loss_actor'] = np.mean(
                        epoch_actor_losses)
                    combined_stats['train/loss_critic'] = np.mean(
                        epoch_critic_losses)
                    if len(epoch_adaptive_distances) != 0:
                        combined_stats['train/param_noise_distance'] = np.mean(
                            epoch_adaptive_distances)
                    combined_stats['total/duration'] = duration
                    combined_stats['total/steps_per_second'] = float(
                        step) / float(duration)
                    combined_stats['total/episodes'] = episodes
                    combined_stats['rollout/episodes'] = epoch_episodes
                    combined_stats['rollout/actions_std'] = np.std(
                        epoch_actions)
                    # Evaluation statistics.
                    if self.eval_env is not None:
                        combined_stats['eval/return'] = np.mean(
                            eval_episode_rewards)
                        combined_stats['eval/return_history'] = np.mean(
                            eval_episode_rewards_history)
                        combined_stats['eval/Q'] = np.mean(eval_qs)
                        combined_stats['eval/episodes'] = len(
                            eval_episode_rewards)

                    def as_scalar(scalar):
                        """
                        Check and return the input if it is a scalar, otherwise raise a ValueError.

                        :param scalar: (Any) the object to check
                        :return: (Number) the value if `scalar` is a scalar
                        """
                        if isinstance(scalar, np.ndarray):
                            assert scalar.size == 1
                            return scalar[0]
                        elif np.isscalar(scalar):
                            return scalar
                        else:
                            raise ValueError('expected scalar, got %s' %
                                             scalar)

                    combined_stats_sums = MPI.COMM_WORLD.allreduce(
                        np.array(
                            [as_scalar(x) for x in combined_stats.values()]))
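                    # allreduce sums each scalar across MPI workers; dividing by
                    # mpi_size below yields the mean over workers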
                    combined_stats = {
                        k: v / mpi_size
                        for (k, v) in zip(combined_stats.keys(),
                                          combined_stats_sums)
                    }

                    # Total statistics.
                    combined_stats['total/epochs'] = epoch + 1
                    combined_stats['total/steps'] = step

                    for key in sorted(combined_stats.keys()):
                        logger.record_tabular(key, combined_stats[key])
                    if len(episode_successes) > 0:
                        logger.logkv("success rate",
                                     np.mean(episode_successes[-100:]))
                    logger.dump_tabular()
                    logger.info('')
                    logdir = logger.get_dir()
                    if rank == 0 and logdir:
                        if hasattr(self.env, 'get_state'):
                            with open(os.path.join(logdir, 'env_state.pkl'),
                                      'wb') as file_handler:
                                pickle.dump(self.env.get_state(), file_handler)
                        if self.eval_env and hasattr(self.eval_env,
                                                     'get_state'):
                            with open(
                                    os.path.join(logdir, 'eval_env_state.pkl'),
                                    'wb') as file_handler:
                                pickle.dump(self.eval_env.get_state(),
                                            file_handler)
Example #28
0
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=1,
              tb_log_name="PPO2",
              reset_num_timesteps=True):
        # Transform to callable if needed
        self.learning_rate = get_schedule_fn(self.learning_rate)
        self.cliprange = get_schedule_fn(self.cliprange)

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:

            self._setup_learn(seed)
            runner = Runner(env=self.env,
                            model=self,
                            n_steps=self.n_steps,
                            gamma=self.gamma,
                            lam=self.lam)
            self.episode_reward = np.zeros((self.n_envs, ))

            ep_info_buf = deque(maxlen=100)
            t_first_start = time.time()
            # define the victim_model
            if self.use_explanation:
                if self.pretrained_mimic:
                    exp_test = GradientExp(self.mimic_model)
                else:
                    exp_test = None
            else:
                exp_test = None
            nupdates = total_timesteps // self.n_batch

            obs_list = []
            act_list = []

            for update in range(1, nupdates + 1):
                assert self.n_batch % self.nminibatches == 0
                batch_size = self.n_batch // self.nminibatches
                t_start = time.time()
                frac = 1.0 - (update - 1.0) / nupdates
                lr_now = self.learning_rate(frac)
                cliprangenow = self.cliprange(frac)
                # true_reward is the reward without discount
                obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward, \
                obs_oppo, actions_oppo, o_next, o_opp_next, a_opp_next = runner.run()

                obs_opp_ph = obs_oppo
                action_oppo_ph = actions_oppo
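                # The opponent's observations/actions returned by the runner feed the
                # attention computation below and the extra minibatch slices (slices_hua)
                # passed to _train_step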
                if update % 100 == 0 and self.save_victim_traj:
                    obs_list.append(obs_oppo)
                    act_list.append(actions_oppo)

                # TODO: calculate the attention paid to the opponent
                attention = self.calculate_attention(obs_oppo=obs_opp_ph, action_oppo=action_oppo_ph, \
                                        exp_test=exp_test, black_box_att=self.black_box_att, exp_method=self.exp_method)

                is_stochastic = False
                ep_info_buf.extend(ep_infos)
                mb_loss_vals = []

                if states is None:  # nonrecurrent version
                    update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1
                    inds = np.arange(self.n_batch)
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(inds)
                        for start in range(0, self.n_batch, batch_size):
                            timestep = self.num_timesteps // update_fac + (
                                (self.noptepochs * self.n_batch + epoch_num *
                                 self.n_batch + start) // batch_size)
                            end = start + batch_size
                            mbinds = inds[start:end]
                            slices = (arr[mbinds]
                                      for arr in (obs, returns, masks, actions,
                                                  values, neglogpacs))
                            slices_hua = (arr[mbinds]
                                          for arr in (a_opp_next, o_opp_next,
                                                      attention))
                            mb_loss_vals.append(
                                self._train_step(lr_now,
                                                 cliprangenow,
                                                 *slices,
                                                 *slices_hua,
                                                 is_stochastic=is_stochastic,
                                                 ratio=self.mix_ratio,
                                                 writer=writer,
                                                 update=timestep))
                    self.num_timesteps += (self.n_batch * self.noptepochs
                                           ) // batch_size * update_fac
                else:  # recurrent version
                    update_fac = self.n_batch // self.nminibatches // self.noptepochs // self.n_steps + 1
                    assert self.n_envs % self.nminibatches == 0
                    env_indices = np.arange(self.n_envs)
                    flat_indices = np.arange(self.n_envs *
                                             self.n_steps).reshape(
                                                 self.n_envs, self.n_steps)
                    envs_per_batch = batch_size // self.n_steps
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(env_indices)
                        for start in range(0, self.n_envs, envs_per_batch):
                            timestep = self.num_timesteps // update_fac + (
                                (self.noptepochs * self.n_envs + epoch_num *
                                 self.n_envs + start) // envs_per_batch)
                            end = start + envs_per_batch
                            mb_env_inds = env_indices[start:end]
                            mb_flat_inds = flat_indices[mb_env_inds].ravel()
                            slices = (arr[mb_flat_inds]
                                      for arr in (obs, returns, masks, actions,
                                                  values, neglogpacs))
                            slices_hua = (arr[mb_flat_inds]
                                          for arr in (a_opp_next, o_opp_next,
                                                      is_stochastic,
                                                      attention))
                            mb_states = states[mb_env_inds]
                            mb_loss_vals.append(
                                self._train_step(lr_now,
                                                 cliprangenow,
                                                 *slices,
                                                 *slices_hua,
                                                 update=timestep,
                                                 writer=writer,
                                                 states=mb_states))
                    self.num_timesteps += (self.n_envs * self.noptepochs
                                           ) // envs_per_batch * update_fac

                loss_vals = np.mean(mb_loss_vals, axis=0)
                t_now = time.time()
                fps = int(self.n_batch / (t_now - t_start))

                if writer is not None:
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward,
                        true_reward.reshape((self.n_envs, self.n_steps)),
                        masks.reshape((self.n_envs, self.n_steps)), writer,
                        self.num_timesteps)

                if self.verbose >= 1 and (update % log_interval == 0
                                          or update == 1):
                    explained_var = explained_variance(values, returns)
                    logger.logkv("serial_timesteps", update * self.n_steps)
                    logger.logkv("nupdates", update)
                    logger.logkv("total_timesteps", self.num_timesteps)
                    logger.logkv("fps", fps)
                    logger.logkv("explained_variance", float(explained_var))

                    # print the attention weights
                    if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                        logger.logkv(
                            'ep_reward_mean',
                            safe_mean(
                                [ep_info['r'] for ep_info in ep_info_buf]))
                        logger.logkv(
                            'ep_len_mean',
                            safe_mean(
                                [ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv('time_elapsed', t_start - t_first_start)
                    for (loss_val, loss_name) in zip(loss_vals,
                                                     self.loss_names):
                        logger.logkv(loss_name, loss_val)
                    logger.dumpkvs()

                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

                model_file_name = "{0}agent_{1}.pkl".format(
                    self.model_saved_loc, update * self.n_batch)
                # Checkpoint the model every 1000 updates (the cadence is the
                # same whether or not black-box attack mode is enabled)
                if update % 1000 == 0:
                    print("Model saved at: {}".format(model_file_name))
                    self.save(model_file_name)

            obs_numpy = np.vstack(obs_list)
            act_numpy = np.vstack(act_list)

            with open('../saved/trajectory.pkl', 'ab+') as f:
                pkl.dump([obs_numpy, act_numpy], f, protocol=2)

            return self
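Because the trajectory file above is opened with 'ab+', every call appends another pickled [obs_numpy, act_numpy] record, so reading the file back means calling pkl.load repeatedly until EOFError. A minimal sketch under that assumption (load_trajectories is a hypothetical helper, not part of the class; it assumes the same relative path and record layout as the dump above):

    import pickle as pkl
    import numpy as np

    def load_trajectories(path="../saved/trajectory.pkl"):
        """Read back every appended [obs, act] record from the pickle file."""
        observations, actions = [], []
        with open(path, "rb") as f:
            while True:
                try:
                    obs, act = pkl.load(f)  # one record per pkl.dump call
                except EOFError:
                    break  # reached the end of the appended records
                observations.append(obs)
                actions.append(act)
        return np.vstack(observations), np.vstack(actions)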
Example #29
    def learn(self,
              total_timesteps,
              callback=None,
              log_interval=4,
              tb_log_name="SAC",
              reset_num_timesteps=True,
              replay_wrapper=None):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        if replay_wrapper is not None:
            self.replay_buffer = replay_wrapper(self.replay_buffer)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:

            self._setup_learn()

            # Transform to callable if needed
            self.learning_rate = get_schedule_fn(self.learning_rate)
            # Initial learning rate
            current_lr = self.learning_rate(1)

            start_time = time.time()
            episode_rewards = [0.0]
            episode_successes = []
            if self.action_noise is not None:
                self.action_noise.reset()
            obs = self.env.reset()
            self.episode_reward = np.zeros((1, ))
            ep_info_buf = deque(maxlen=100)
            n_updates = 0
            infos_values = []

            for step in range(total_timesteps):
                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

                # Before training starts, randomly sample actions
                # from a uniform distribution for better exploration.
                # Afterwards, use the learned policy
                # if random_exploration is set to 0 (normal setting)
                if self.num_timesteps < self.learning_starts or np.random.rand(
                ) < self.random_exploration:
                    # actions sampled from action space are from range specific to the environment
                    # but algorithm operates on tanh-squashed actions therefore simple scaling is used
                    unscaled_action = self.env.action_space.sample()
                    action = scale_action(self.action_space, unscaled_action)
                else:
                    action = self.policy_tf.step(
                        obs[None], deterministic=False).flatten()
                    # Add noise to the action (improve exploration,
                    # not needed in general)
                    if self.action_noise is not None:
                        action = np.clip(action + self.action_noise(), -1, 1)
                    # inferred actions need to be transformed to environment action_space before stepping
                    unscaled_action = unscale_action(self.action_space, action)

                assert action.shape == self.env.action_space.shape

                new_obs, reward, done, info = self.env.step(unscaled_action)

                # Store transition in the replay buffer.
                self.replay_buffer.add(obs, action, reward, new_obs,
                                       float(done))
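                # Note: the squashed (policy-space) `action` is stored here,
                # while the environment itself received `unscaled_action`.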
                obs = new_obs

                # Retrieve reward and episode length if using Monitor wrapper
                maybe_ep_info = info.get('episode')
                if maybe_ep_info is not None:
                    ep_info_buf.extend([maybe_ep_info])

                if writer is not None:
                    # Write reward per episode to tensorboard
                    ep_reward = np.array([reward]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward, ep_reward, ep_done, writer,
                        self.num_timesteps)

                if step % self.train_freq == 0:
                    mb_infos_vals = []
                    # Update policy, critics and target networks
                    for grad_step in range(self.gradient_steps):
                        # Break if the warmup phase is not over
                        # or if there are not enough samples in the replay buffer
                        if not self.replay_buffer.can_sample(self.batch_size) \
                           or self.num_timesteps < self.learning_starts:
                            break
                        n_updates += 1
                        # Compute current learning_rate
                        frac = 1.0 - step / total_timesteps
                        current_lr = self.learning_rate(frac)
                        # Update policy and critics (q functions)
                        mb_infos_vals.append(
                            self._train_step(step, writer, current_lr))
                        # Update the target network at the configured interval
                        if (step +
                                grad_step) % self.target_update_interval == 0:
                            self.sess.run(self.target_update_op)
                    # Log losses and entropy, useful for monitoring training
                    if len(mb_infos_vals) > 0:
                        infos_values = np.mean(mb_infos_vals, axis=0)

                episode_rewards[-1] += reward
                if done:
                    if self.action_noise is not None:
                        self.action_noise.reset()
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()
                    episode_rewards.append(0.0)

                    maybe_is_success = info.get('is_success')
                    if maybe_is_success is not None:
                        episode_successes.append(float(maybe_is_success))

                if len(episode_rewards[-101:-1]) == 0:
                    mean_reward = -np.inf
                else:
                    mean_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                self.num_timesteps += 1
                # Display training infos
                if self.verbose >= 1 and done and log_interval is not None and len(
                        episode_rewards) % log_interval == 0:
                    fps = int(step / (time.time() - start_time))
                    logger.logkv("episodes", num_episodes)
                    logger.logkv("mean 100 episode reward", mean_reward)
                    logger.logkv("episode reward", episode_rewards[-2])
                    if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                        logger.logkv(
                            'ep_rewmean',
                            safe_mean(
                                [ep_info['r'] for ep_info in ep_info_buf]))
                        logger.logkv(
                            'eplenmean',
                            safe_mean(
                                [ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv("n_updates", n_updates)
                    logger.logkv("current_lr", current_lr)
                    logger.logkv("fps", fps)
                    logger.logkv('time_elapsed', int(time.time() - start_time))
                    if len(episode_successes) > 0:
                        logger.logkv("success rate",
                                     np.mean(episode_successes[-100:]))
                    if len(infos_values) > 0:
                        for (name, val) in zip(self.infos_names, infos_values):
                            logger.logkv(name, val)
                    logger.logkv("total timesteps", self.num_timesteps)
                    logger.dumpkvs()
                    # Reset infos:
                    infos_values = []
            return self
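All of these learn loops call the callback as callback(locals(), globals()) and stop only on an explicit False return, so a callback can read any local of the training loop (for example self via locals_["self"] and its num_timesteps) to decide when to halt. A minimal sketch of such an early-stopping callback; the 50000-step limit is an arbitrary example, not taken from the source:

    def stop_after_n_timesteps(locals_, globals_):
        """Old-style callback: returning False stops training, None continues."""
        model = locals_["self"]  # the algorithm instance running learn()
        return model.num_timesteps < 50000  # False once the limit is reached

    # usage sketch: model.learn(total_timesteps=1000000, callback=stop_after_n_timesteps)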
Example #30
    def learn(self,
              total_timesteps,
              callback=None,
              log_interval=1,
              tb_log_name="PPO2",
              reset_num_timesteps=True):
        #print("BEHOOOOOLD A CHANGE!!!!")
        # Transform to callable if needed
        self.learning_rate = get_schedule_fn(self.learning_rate)
        self.cliprange = get_schedule_fn(self.cliprange)
        cliprange_vf = get_schedule_fn(self.cliprange_vf)

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn()

            t_first_start = time.time()

            n_updates = total_timesteps // self.n_batch
            counter = 0
            for update in range(1, n_updates + 1):
                assert self.n_batch % self.nminibatches == 0, (
                    "The number of minibatches (`nminibatches`) "
                    "is not a factor of the total number of samples "
                    "collected per rollout (`n_batch`), "
                    "some samples won't be used.")
                batch_size = self.n_batch // self.nminibatches
                t_start = time.time()
                frac = 1.0 - (update - 1.0) / n_updates
                lr_now = self.learning_rate(frac)
                cliprange_now = self.cliprange(frac)
                cliprange_vf_now = cliprange_vf(frac)
                # true_reward is the reward without discount
                obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = self.runner.run(
                )
                #*************************************************
                # Adversarial value perturbation: after a 1000-update
                # warm-up, scale predicted values by (1 - 2 * adversary),
                # ramping the coefficient from 0.1 up to a cap of 1.
                # `counter` is initialised to 0 above; advance it each update.
                counter += 1
                if counter < 1000:
                    self_adversary = 0  # no adversary during warm-up
                else:
                    self_adversary = min(0.0001 * counter, 1)  # capped at 1
                values = values - ((values * self_adversary) * 2)
                #*************************************************
                self.num_timesteps += self.n_batch
                self.ep_info_buf.extend(ep_infos)
                mb_loss_vals = []
                if states is None:  # nonrecurrent version
                    update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1
                    inds = np.arange(self.n_batch)
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(inds)
                        for start in range(0, self.n_batch, batch_size):
                            timestep = self.num_timesteps // update_fac + (
                                (self.noptepochs * self.n_batch + epoch_num *
                                 self.n_batch + start) // batch_size)
                            end = start + batch_size
                            mbinds = inds[start:end]
                            slices = (arr[mbinds]
                                      for arr in (obs, returns, masks, actions,
                                                  values, neglogpacs))
                            mb_loss_vals.append(
                                self._train_step(
                                    lr_now,
                                    cliprange_now,
                                    *slices,
                                    writer=writer,
                                    update=timestep,
                                    cliprange_vf=cliprange_vf_now))
                else:  # recurrent version
                    update_fac = self.n_batch // self.nminibatches // self.noptepochs // self.n_steps + 1
                    assert self.n_envs % self.nminibatches == 0
                    env_indices = np.arange(self.n_envs)
                    flat_indices = np.arange(self.n_envs *
                                             self.n_steps).reshape(
                                                 self.n_envs, self.n_steps)
                    envs_per_batch = batch_size // self.n_steps
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(env_indices)
                        for start in range(0, self.n_envs, envs_per_batch):
                            timestep = self.num_timesteps // update_fac + (
                                (self.noptepochs * self.n_envs + epoch_num *
                                 self.n_envs + start) // envs_per_batch)
                            end = start + envs_per_batch
                            mb_env_inds = env_indices[start:end]
                            mb_flat_inds = flat_indices[mb_env_inds].ravel()
                            slices = (arr[mb_flat_inds]
                                      for arr in (obs, returns, masks, actions,
                                                  values, neglogpacs))
                            mb_states = states[mb_env_inds]
                            mb_loss_vals.append(
                                self._train_step(
                                    lr_now,
                                    cliprange_now,
                                    *slices,
                                    update=timestep,
                                    writer=writer,
                                    states=mb_states,
                                    cliprange_vf=cliprange_vf_now))

                loss_vals = np.mean(mb_loss_vals, axis=0)
                t_now = time.time()
                fps = int(self.n_batch / (t_now - t_start))

                if writer is not None:
                    total_episode_reward_logger(
                        self.episode_reward,
                        true_reward.reshape((self.n_envs, self.n_steps)),
                        masks.reshape((self.n_envs, self.n_steps)), writer,
                        self.num_timesteps)

                if self.verbose >= 1 and (update % log_interval == 0
                                          or update == 1):
                    explained_var = explained_variance(values, returns)
                    logger.logkv("serial_timesteps", update * self.n_steps)
                    logger.logkv("n_updates", update)
                    logger.logkv("total_timesteps", self.num_timesteps)
                    logger.logkv("fps", fps)
                    logger.logkv("explained_variance", float(explained_var))
                    if len(self.ep_info_buf) > 0 and len(
                            self.ep_info_buf[0]) > 0:
                        logger.logkv(
                            'ep_reward_mean',
                            safe_mean([
                                ep_info['r'] for ep_info in self.ep_info_buf
                            ]))
                        logger.logkv(
                            'ep_len_mean',
                            safe_mean([
                                ep_info['l'] for ep_info in self.ep_info_buf
                            ]))
                    logger.logkv('time_elapsed', t_start - t_first_start)
                    for (loss_val, loss_name) in zip(loss_vals,
                                                     self.loss_names):
                        logger.logkv(loss_name, loss_val)
                    logger.dumpkvs()

                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

            return self
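For reference, the custom block delimited by #*** above rescales the predicted values as values - 2 * self_adversary * values, i.e. values * (1 - 2 * self_adversary): a coefficient of 0 leaves them untouched, 0.5 zeroes them out, and the cap of 1 flips their sign. A small numeric sketch of that mapping (standalone, outside the class):

    import numpy as np

    values = np.array([1.0, -2.0, 0.5])
    for adversary in (0.0, 0.5, 1.0):
        # Same formula as the training loop: values - 2 * adversary * values
        perturbed = values - ((values * adversary) * 2)
        print(adversary, perturbed)  # 0.0: unchanged, 0.5: zeroed, 1.0: sign flipped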