Example #1
    def learn(self,
              total_timesteps,
              callback=None,
              log_interval=100,
              tb_log_name="DQN",
              reset_num_timesteps=True,
              replay_wrapper=None):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)
        # callback = self._init_callback(callback)

        # with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
        #         as writer:
        self._setup_learn()

        # Create the replay buffer
        if self.prioritized_replay:
            self.replay_buffer = PrioritizedReplayBuffer(
                self.buffer_size, alpha=self.prioritized_replay_alpha)
            if self.prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = total_timesteps
            else:
                prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
            self.beta_schedule = LinearSchedule(
                prioritized_replay_beta_iters,
                initial_p=self.prioritized_replay_beta0,
                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(self.buffer_size)
            self.beta_schedule = None

        if replay_wrapper is not None:
            assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER"
            self.replay_buffer = replay_wrapper(self.replay_buffer)

        # Create the schedule for exploration starting from 1.
        self.exploration = LinearSchedule(
            schedule_timesteps=int(self.exploration_fraction *
                                   total_timesteps),
            initial_p=self.exploration_initial_eps,
            final_p=self.exploration_final_eps)

        episode_rewards = [[0.0] * self.num_agents]  #MA-MOD
        episode_successes = []

        #callback.on_training_start(locals(), globals())
        #callback.on_rollout_start()

        reset = True
        obs = self.env.reset()

        for _ in range(total_timesteps):
            # Take action and update exploration to the newest value
            kwargs = {}
            if not self.param_noise:
                update_eps = self.exploration.value(self.num_timesteps)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                # for detailed explanation.
                update_param_noise_threshold = \
                    -np.log(1. - self.exploration.value(self.num_timesteps) +
                            self.exploration.value(self.num_timesteps) / float(self.env.action_space.n))
                kwargs['reset'] = reset
                kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True

            with self.sess.as_default():
                env_action = []  # MA-MOD
                for i in range(self.num_agents):  # MA-MOD. This is fine for one policy.
                    action = self.act[i](
                        np.array(obs[i])[None],
                        update_eps=update_eps,
                        **kwargs
                    )[0]  # TODO: Is this the correct way to get the correct agent obs?
                    env_action.append(action)
            reset = False
            new_obs, rew, done, info = self.env.step(env_action)  # NOUPDATE - env.step should take a vector of actions
            # print("Agent 1", type(new_obs[0]))
            # for row in new_obs[0]:
            #     print(' '.join(str(int(item)) for item in row))

            # print("Agent 2", type(new_obs[1]))
            # for row in new_obs[1]:
            #     print(' '.join(str(int(item)) for item in row))
            '''
            Obs: x_me, x_opp --- agent 1. In env: x_1, x_2
            Obs: x_me, x_opp -- agent 2. In env: x_2, x_1
            Env: (n_agents, state_dim)
            '''

            self.num_timesteps += 1

            # Store transition in the replay buffer.
            # Loop for replay buffer -- either separate or joined. obs[agent_index], action[agent_index], reward[agent_index]
            # Joey: Does this look right to you?

            for num_agent in range(self.num_agents):
                self.replay_buffer.add(obs[num_agent], env_action[num_agent],
                                       rew[num_agent], new_obs[num_agent],
                                       float(done[num_agent]))
            obs = new_obs

            # if writer is not None:
            #     ep_rew = np.array([rew]).reshape((1, -1))
            #     ep_done = np.array([done]).reshape((1, -1))
            #     tf_util.total_episode_reward_logger(self.episode_reward, ep_rew, ep_done, writer,
            #                                         self.num_timesteps)

            # TODO: current episode_rewards is a list, make it a list of lists where each list is the reward for each agent in all timesteps
            #     append the newest reward to the end of each list for each agent
            if isinstance(done, list):
                done = np.array(done)
            for num_agent in range(self.num_agents):  # MA-MOD: accumulate per-agent reward every step
                episode_rewards[-1][num_agent] += rew[num_agent]
            if done.any():
                maybe_is_success = info.get('is_success')
                if maybe_is_success is not None:
                    episode_successes.append(float(maybe_is_success))
                if not isinstance(self.env, VecEnv):
                    obs = self.env.reset()
                episode_rewards.append([0.0] * self.num_agents)
                reset = True

            # Do not train if the warmup phase is not over
            # or if there are not enough samples in the replay buffer
            can_sample = self.replay_buffer.can_sample(self.batch_size)
            if can_sample and self.num_timesteps > self.learning_starts \
                    and self.num_timesteps % self.train_freq == 0:

                # callback.on_rollout_end()

                for i in range(self.num_agents):  # MA-MOD
                    # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                    # pytype:disable=bad-unpacking
                    if self.prioritized_replay:
                        assert self.beta_schedule is not None, \
                                "BUG: should be LinearSchedule when self.prioritized_replay True"
                        experience = self.replay_buffer.sample(
                            self.batch_size,
                            beta=self.beta_schedule.value(self.num_timesteps))
                        (obses_t, actions, rewards, obses_tp1, dones, weights,
                         batch_idxes) = experience
                    else:
                        obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(
                            self.batch_size)
                        weights, batch_idxes = np.ones_like(rewards), None
                    # pytype:enable=bad-unpacking

                    # if writer is not None:
                    #     # run loss backprop with summary, but once every 100 steps save the metadata
                    #     # (memory, compute time, ...)
                    #     if (1 + self.num_timesteps) % 100 == 0:
                    #         run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                    #         run_metadata = tf.RunMetadata()
                    #         summary, td_errors = self._train_step[i](obses_t, actions, rewards, obses_tp1, obses_tp1,
                    #                                               dones, weights, sess=self.sess, options=run_options,
                    #                                               run_metadata=run_metadata)
                    #         writer.add_run_metadata(run_metadata, 'step%d_agent%d' % (self.num_timesteps, i))
                    #     else:
                    #         summary, td_errors = self._train_step[i](obses_t, actions, rewards, obses_tp1, obses_tp1,
                    #                                               dones, weights, sess=self.sess)
                    #     writer.add_summary(summary, self.num_timesteps)
                    # else:
                    td_errors = self._train_step[i](obses_t,
                                                    actions,
                                                    rewards,
                                                    obses_tp1,
                                                    obses_tp1,
                                                    dones,
                                                    weights,
                                                    sess=self.sess)

                if self.prioritized_replay:  # NOUPDATE - not inside main agent for loop
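                    # Note: batch_idxes and td_errors here come from the last agent's
                    # sampled batch, so only those priorities are refreshed.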
                    new_priorities = np.abs(
                        td_errors) + self.prioritized_replay_eps  # NOUPDATE
                    assert isinstance(self.replay_buffer,
                                      PrioritizedReplayBuffer)
                    self.replay_buffer.update_priorities(
                        batch_idxes, new_priorities)

                # callback.on_rollout_start()

            if can_sample and self.num_timesteps > self.learning_starts and \
                    self.num_timesteps % self.target_network_update_freq == 0:
                # Update target network periodically.
                for i in range(self.num_agents):
                    self.update_target[i](sess=self.sess)  # MA-MOD

            if len(episode_rewards[-101:-1]) == 0:  # MA-MOD
                mean_100ep_reward = -np.inf * np.ones((self.num_agents, ))
            else:
                mean_100ep_reward = np.mean(episode_rewards[-101:-1], axis=0)

            # below is what's logged in terminal.
            num_episodes = len(episode_rewards)  #MA-MOD
            if self.verbose >= 1 and done.any() and log_interval is not None \
                    and len(episode_rewards) % log_interval == 0:  # MA-MOD
                logger.record_tabular("steps", self.num_timesteps)
                logger.record_tabular("episodes", num_episodes)
                if len(episode_successes) > 0:
                    logger.logkv("success rate",
                                 np.mean(episode_successes[-100:], axis=0))
                logger.record_tabular("mean 100 episode reward",
                                      mean_100ep_reward)
                logger.record_tabular(
                    "% time spent exploring",
                    int(100 * self.exploration.value(self.num_timesteps)))
                logger.dump_tabular()

        return self
Example #2
    def learn(self,
              total_timesteps,
              callback=None,
              log_interval=100,
              tb_log_name="DQN",
              reset_num_timesteps=True,
              replay_wrapper=None):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)
        callback = self._init_callback(callback)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn()

            # Create the replay buffer
            if self.prioritized_replay:
                self.replay_buffer = PrioritizedReplayBuffer(
                    self.buffer_size, alpha=self.prioritized_replay_alpha)
                if self.prioritized_replay_beta_iters is None:
                    prioritized_replay_beta_iters = total_timesteps
                else:
                    prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
                self.beta_schedule = LinearSchedule(
                    prioritized_replay_beta_iters,
                    initial_p=self.prioritized_replay_beta0,
                    final_p=1.0)
            else:
                self.replay_buffer = ReplayBuffer(self.buffer_size)
                self.beta_schedule = None

            if replay_wrapper is not None:
                assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER"
                self.replay_buffer = replay_wrapper(self.replay_buffer)

            # Create the schedule for exploration starting from 1.
            self.exploration = LinearSchedule(
                schedule_timesteps=int(self.exploration_fraction *
                                       total_timesteps),
                initial_p=self.exploration_initial_eps,
                final_p=self.exploration_final_eps)

            episode_rewards = [0.0]
            episode_successes = []

            callback.on_training_start(locals(), globals())
            callback.on_rollout_start()

            reset = True
            obs = self.env.reset()
            # Retrieve unnormalized observation for saving into the buffer
            if self._vec_normalize_env is not None:
                obs_ = self._vec_normalize_env.get_original_obs().squeeze()
            each_reward_episode = 0
            self.my_counter  # NOTE: bare attribute access, has no effect as written
            for _ in range(total_timesteps):
                # Take action and update exploration to the newest value
                kwargs = {}

                if not self.param_noise:
                    update_eps = self.exploration.value(self.num_timesteps)
                    update_param_noise_threshold = 0.
                else:
                    update_eps = 0.
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = \
                        -np.log(1. - self.exploration.value(self.num_timesteps) +
                                self.exploration.value(self.num_timesteps) / float(self.env.action_space.n))
                    kwargs['reset'] = reset
                    kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                    kwargs['update_param_noise_scale'] = True
                # with self.sess.as_default():
                #     action = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0]
                # env_action = action

                if self.my_counter % 10 == 1:
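                    # Custom modification: while fewer than 4 successes have been observed,
                    # periodically re-estimate the Q table with IMC() from the running Q/Nq
                    # statistics and the state/action features; the greedy action w.r.t.
                    # this prediction is taken below.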
                    if self.counter_success < 4:

                        self.Predicted_Q = IMC(self.Q, self.Nq,
                                               self.action_features,
                                               self.state_features, 1)

                if self.episode_num < 1000 and self.my_counter > 0:
                    a = self.StateHash(self.current_state)
                    env_action = np.argmax(self.Predicted_Q[:, a])
                    #print(env_action)

                if self.episode_num == 40:
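                    # If fewer than 4 successes were recorded within 40 episodes, reset all
                    # counters, Q statistics and logs and restart learning from scratch.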
                    if self.counter_success < 4:
                        print('init')
                        self.num_timesteps = 0
                        self.episode_num = 0
                        self.my_counter = 0
                        self.counter_success = 0
                        self.Q = np.zeros_like(self.Q)
                        self.Nq = np.ones_like(self.Nq)
                        self.Predicted_Q = np.zeros_like(self.Q)
                        self.temp_mem = []
                        self.episode_rewards = []
                        self.loss = []
                        each_reward_episode = 0

                if self.episode_num == 0:
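                    # During the very first episode, cycle through the discrete actions to
                    # seed exploration.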
                    if self.init_action < len(self.action_spac):
                        env_action = int(self.init_action)
                        self.init_action += 1
                    else:
                        self.init_action = 0
                        env_action = int(self.init_action)

                reset = False

                new_obs, rew, done, info = self.env.step(env_action)
                action = env_action
                #print(new_obs)
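                # Reward shaping: a position above 0.48 is treated as a success and
                # rewarded with a bonus (the threshold is environment-specific).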
                if new_obs[0] > 0.48:
                    rew = 10
                    self.counter_success += 1
                    print('****** reward ********')
                    #each_reward_episode+=10

                self.new_state = new_obs

                if self.my_counter > 1:
                    self.UpdateQ(self.current_state, self.new_state,
                                 env_action, rew)

                each_reward_episode += rew

                self.current_state = self.new_state
                if new_obs[0] > 0.48 or done:
                    if self.episode_num % 50 == 0:
                        print(self.episode_num, self.my_counter, '--> ',
                              each_reward_episode, new_obs[0], done)
                    self.episode_rewards.append(each_reward_episode)
                    each_reward_episode = 0
                    self.episode_num += 1

                self.num_timesteps += 1
                self.my_counter += 1
                # Stop training if return value is False
                callback.update_locals(locals())
                if callback.on_step() is False:
                    break

                # Store only the unnormalized version
                if self._vec_normalize_env is not None:
                    new_obs_ = self._vec_normalize_env.get_original_obs().squeeze()
                    reward_ = self._vec_normalize_env.get_original_reward().squeeze()
                else:
                    # Avoid changing the original ones
                    obs_, new_obs_, reward_ = obs, new_obs, rew
                # Store transition in the replay buffer.
                self.replay_buffer_add(obs_, action, reward_, new_obs_, done,
                                       info)
                obs = new_obs
                # Save the unnormalized observation
                if self._vec_normalize_env is not None:
                    obs_ = new_obs_

                if writer is not None:
                    ep_rew = np.array([reward_]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    tf_util.total_episode_reward_logger(
                        self.episode_reward, ep_rew, ep_done, writer,
                        self.num_timesteps)

                episode_rewards[-1] += reward_
                if done:
                    maybe_is_success = info.get('is_success')
                    if maybe_is_success is not None:
                        episode_successes.append(float(maybe_is_success))
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()
                    episode_rewards.append(0.0)
                    reset = True

                # Do not train if the warmup phase is not over
                # or if there are not enough samples in the replay buffer
                can_sample = self.replay_buffer.can_sample(self.batch_size)
                if can_sample and self.num_timesteps > self.learning_starts \
                        and self.num_timesteps % self.train_freq == 0:

                    callback.on_rollout_end()
                    # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                    # pytype:disable=bad-unpacking
                    if self.prioritized_replay:
                        assert self.beta_schedule is not None, \
                               "BUG: should be LinearSchedule when self.prioritized_replay True"
                        experience = self.replay_buffer.sample(
                            self.batch_size,
                            beta=self.beta_schedule.value(self.num_timesteps),
                            env=self._vec_normalize_env)
                        (obses_t, actions, rewards, obses_tp1, dones, weights,
                         batch_idxes) = experience
                    else:
                        obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(
                            self.batch_size, env=self._vec_normalize_env)
                        weights, batch_idxes = np.ones_like(rewards), None
                    # pytype:enable=bad-unpacking

                    if writer is not None:
                        # run loss backprop with summary, but once every 100 steps save the metadata
                        # (memory, compute time, ...)
                        if (1 + self.num_timesteps) % 100 == 0:
                            run_options = tf.RunOptions(
                                trace_level=tf.RunOptions.FULL_TRACE)
                            run_metadata = tf.RunMetadata()
                            summary, td_errors = self._train_step(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                obses_tp1,
                                dones,
                                weights,
                                sess=self.sess,
                                options=run_options,
                                run_metadata=run_metadata)
                            writer.add_run_metadata(
                                run_metadata, 'step%d' % self.num_timesteps)
                        else:
                            summary, td_errors = self._train_step(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                obses_tp1,
                                dones,
                                weights,
                                sess=self.sess)
                        writer.add_summary(summary, self.num_timesteps)
                    else:
                        _, td_errors = self._train_step(obses_t,
                                                        actions,
                                                        rewards,
                                                        obses_tp1,
                                                        obses_tp1,
                                                        dones,
                                                        weights,
                                                        sess=self.sess)

                    if self.prioritized_replay:
                        new_priorities = np.abs(
                            td_errors) + self.prioritized_replay_eps
                        assert isinstance(self.replay_buffer,
                                          PrioritizedReplayBuffer)
                        self.replay_buffer.update_priorities(
                            batch_idxes, new_priorities)

                    callback.on_rollout_start()

                if can_sample and self.num_timesteps > self.learning_starts and \
                        self.num_timesteps % self.target_network_update_freq == 0:
                    # Update target network periodically.
                    self.update_target(sess=self.sess)

                if len(episode_rewards[-101:-1]) == 0:
                    mean_100ep_reward = -np.inf
                else:
                    mean_100ep_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                if self.verbose >= 1 and done and log_interval is not None \
                        and len(episode_rewards) % log_interval == 0:
                    logger.record_tabular("steps", self.num_timesteps)
                    logger.record_tabular("episodes", num_episodes)
                    if len(episode_successes) > 0:
                        logger.logkv("success rate",
                                     np.mean(episode_successes[-100:]))
                    logger.record_tabular("mean 100 episode reward",
                                          mean_100ep_reward)
                    logger.record_tabular(
                        "% time spent exploring",
                        int(100 * self.exploration.value(self.num_timesteps)))
                    logger.dump_tabular()

        callback.on_training_end()
        return self
Example #3
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=100,
              tb_log_name="A2C",
              reset_num_timesteps=True):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn(seed)

            self.learning_rate_schedule = Scheduler(
                initial_value=self.learning_rate,
                n_values=total_timesteps,
                schedule=self.lr_schedule)

            runner = A2CRunner(self.env,
                               self,
                               n_steps=self.n_steps,
                               gamma=self.gamma)
            self.episode_reward = np.zeros((self.n_envs, ))
            # Training stats (when using Monitor wrapper)
            ep_info_buf = deque(maxlen=100)

            t_start = time.time()
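            # Each update collects n_steps transitions from each of the n_envs
            # environments (n_batch samples in total) and performs one gradient step.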
            for update in range(1, total_timesteps // self.n_batch + 1):
                # true_reward is the reward without discount
                obs, states, rewards, masks, actions, values, ep_infos, true_reward = runner.run()
                ep_info_buf.extend(ep_infos)
                _, value_loss, policy_entropy = self._train_step(
                    obs, states, rewards, masks, actions, values,
                    self.num_timesteps // (self.n_batch + 1), writer)
                n_seconds = time.time() - t_start
                fps = int((update * self.n_batch) / n_seconds)

                if writer is not None:
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward,
                        true_reward.reshape((self.n_envs, self.n_steps)),
                        masks.reshape((self.n_envs, self.n_steps)), writer,
                        self.num_timesteps)

                self.num_timesteps += self.n_batch + 1

                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

                if self.verbose >= 1 and (update % log_interval == 0
                                          or update == 1):
                    explained_var = explained_variance(values, rewards)
                    logger.record_tabular("nupdates", update)
                    logger.record_tabular("total_timesteps",
                                          self.num_timesteps)
                    logger.record_tabular("fps", fps)
                    logger.record_tabular("policy_entropy",
                                          float(policy_entropy))
                    logger.record_tabular("value_loss", float(value_loss))
                    logger.record_tabular("explained_variance",
                                          float(explained_var))
                    if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                        logger.logkv(
                            'ep_reward_mean',
                            safe_mean(
                                [ep_info['r'] for ep_info in ep_info_buf]))
                        logger.logkv(
                            'ep_len_mean',
                            safe_mean(
                                [ep_info['l'] for ep_info in ep_info_buf]))
                    logger.dump_tabular()

        return self
Example #4
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=1,
              tb_log_name="SAC",
              print_freq=100):

        with TensorboardWriter(self.graph, self.tensorboard_log,
                               tb_log_name) as writer:

            self._setup_learn(seed)

            # Transform to callable if needed
            self.learning_rate = get_schedule_fn(self.learning_rate)

            start_time = time.time()
            episode_rewards = [0.0]
            is_teleop_env = hasattr(self.env, "wait_for_teleop_reset")
            # TeleopEnv
            if is_teleop_env:
                print("Waiting for teleop")
                obs = self.env.wait_for_teleop_reset()
            else:
                obs = self.env.reset()

            self.episode_reward = np.zeros((1, ))
            ep_info_buf = deque(maxlen=100)
            ep_len = 0
            self.n_updates = 0
            infos_values = []
            mb_infos_vals = []

            for step in range(total_timesteps):
                # Compute current learning_rate
                frac = 1.0 - step / total_timesteps
                current_lr = self.learning_rate(frac)

                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

                # Before training starts, randomly sample actions
                # from a uniform distribution for better exploration.
                # Afterwards, use the learned policy.
                if step < self.learning_starts:
                    action = self.env.action_space.sample()
                    # No need to rescale when sampling random action
                    rescaled_action = action
                else:
                    action = self.policy_tf.step(
                        obs[None], deterministic=False).flatten()
                    # Rescale from [-1, 1] to the correct bounds
                    rescaled_action = action * np.abs(self.action_space.low)

                assert action.shape == self.env.action_space.shape

                new_obs, reward, done, info = self.env.step(rescaled_action)
                ep_len += 1

                if print_freq > 0 and ep_len % print_freq == 0 and ep_len > 0:
                    print("{} steps".format(ep_len))

                # Store transition in the replay buffer.
                self.replay_buffer.add(obs, action, reward, new_obs,
                                       float(done))
                obs = new_obs

                # Retrieve reward and episode length if using Monitor wrapper
                maybe_ep_info = info.get('episode')
                if maybe_ep_info is not None:
                    ep_info_buf.extend([maybe_ep_info])

                if writer is not None:
                    # Write reward per episode to tensorboard
                    ep_reward = np.array([reward]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward, ep_reward, ep_done, writer, step)

                if ep_len > self.train_freq:
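                    # Force an episode cut-off once it exceeds train_freq steps and run an
                    # extra optimization phase on the data collected so far.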
                    print("Additional training")
                    self.env.reset()
                    mb_infos_vals = self.optimize(step, writer, current_lr)
                    done = True

                episode_rewards[-1] += reward
                if done:
                    if not (isinstance(self.env, VecEnv) or is_teleop_env):
                        obs = self.env.reset()

                    print("Episode finished. Reward: {:.2f} {} Steps".format(
                        episode_rewards[-1], ep_len))
                    episode_rewards.append(0.0)
                    ep_len = 0
                    mb_infos_vals = self.optimize(step, writer, current_lr)

                    # Refresh obs when using TeleopEnv
                    if is_teleop_env:
                        print("Waiting for teleop")
                        obs = self.env.wait_for_teleop_reset()

                # Log losses and entropy, useful for monitor training
                if len(mb_infos_vals) > 0:
                    infos_values = np.mean(mb_infos_vals, axis=0)

                if len(episode_rewards[-101:-1]) == 0:
                    mean_reward = -np.inf
                else:
                    mean_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                if self.verbose >= 1 and done and log_interval is not None \
                        and len(episode_rewards) % log_interval == 0:
                    fps = int(step / (time.time() - start_time))
                    logger.logkv("episodes", num_episodes)
                    logger.logkv("mean 100 episode reward", mean_reward)
                    logger.logkv(
                        'ep_rewmean',
                        safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                    logger.logkv(
                        'eplenmean',
                        safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv("n_updates", self.n_updates)
                    logger.logkv("current_lr", current_lr)
                    logger.logkv("fps", fps)
                    logger.logkv('time_elapsed',
                                 "{:.2f}".format(time.time() - start_time))
                    if len(infos_values) > 0:
                        for (name, val) in zip(self.infos_names, infos_values):
                            logger.logkv(name, val)
                    logger.logkv("total timesteps", step)
                    logger.dumpkvs()
                    # Reset infos:
                    infos_values = []
            if is_teleop_env:
                self.env.is_training = False
            # Use last batch
            print("Final optimization before saving")
            self.env.reset()
            mb_infos_vals = self.optimize(step, writer, current_lr)
        return self
Example #5
    def learn(self,
              total_timesteps,
              callback=None,
              log_interval=4,
              tb_log_name="SAC",
              reset_num_timesteps=True,
              replay_wrapper=None):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)
        callback = self._init_callback(callback)

        if replay_wrapper is not None:
            self.replay_buffer = replay_wrapper(self.replay_buffer)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:

            self._setup_learn()

            # Transform to callable if needed
            self.learning_rate = get_schedule_fn(self.learning_rate)
            # Initial learning rate
            current_lr = self.learning_rate(1)

            start_time = time.time()
            episode_rewards = [0.0]
            episode_true_rewards = [0.0]

            episode_successor_features = [
                np.zeros(self.observation_space.shape[0])
            ]
            episode_successes = []
            if self.action_noise is not None:
                self.action_noise.reset()
            obs = self.env.reset()
            h_step = 0
            # Retrieve unnormalized observation for saving into the buffer
            if self._vec_normalize_env is not None:
                obs_ = self._vec_normalize_env.get_original_obs().squeeze()

            n_updates = 0
            infos_values = []

            callback.on_training_start(locals(), globals())
            callback.on_rollout_start()

            for step in range(total_timesteps):
                # Before training starts, randomly sample actions
                # from a uniform distribution for better exploration.
                # Afterwards, use the learned policy
                # if random_exploration is set to 0 (normal setting)
                if self.num_timesteps < self.learning_starts \
                        or np.random.rand() < self.random_exploration:
                    # actions sampled from action space are from range specific to the environment
                    # but algorithm operates on tanh-squashed actions therefore simple scaling is used
                    unscaled_action = self.env.action_space.sample()
                    action = scale_action(self.action_space, unscaled_action)
                else:
                    action = self.policy_tf.step(
                        obs[None], deterministic=False).flatten()
                    # Add noise to the action (improve exploration,
                    # not needed in general)
                    if self.action_noise is not None:
                        action = np.clip(action + self.action_noise(), -1, 1)
                    # inferred actions need to be transformed to environment action_space before stepping
                    unscaled_action = unscale_action(self.action_space, action)

                    assert action.shape == self.env.action_space.shape

                if self.using_mdal:
                    # reward = reward_giver.get_reward(observation, (step+1) * covariance_lambda)
                    # covariance_lambda = (step+1) / (step + 2) * covariance_lambda \
                    #                     + np.matmul(np.expand_dims(observation, axis=1), np.expand_dims(observation, axis=0))\
                    #                     / (step + 2)
                    reward = self.reward_giver.get_reward(obs)
                    new_obs, true_reward, done, info = self.env.step(
                        unscaled_action)
                else:
                    new_obs, reward, done, info = self.env.step(
                        unscaled_action)
                    true_reward = reward

                self.num_timesteps += 1

                # Only stop training if return value is False, not when it is None. This is for backwards
                # compatibility with callbacks that have no return statement.
                callback.update_locals(locals())
                if callback.on_step() is False:
                    break

                # Store only the unnormalized version
                if self._vec_normalize_env is not None:
                    new_obs_ = self._vec_normalize_env.get_original_obs().squeeze()
                    reward_ = self._vec_normalize_env.get_original_reward().squeeze()
                    # true_reward_ is logged below, so define it on this branch as well
                    true_reward_ = true_reward

                else:
                    # Avoid changing the original ones
                    obs_, new_obs_, reward_ = obs, new_obs, reward
                    true_reward_ = true_reward

                # Store transition in the replay buffer.
                self.replay_buffer_add(obs_, action, reward_, new_obs_, done,
                                       info)
                obs = new_obs
                # Save the unnormalized observation
                if self._vec_normalize_env is not None:
                    obs_ = new_obs_

                # Retrieve reward and episode length if using Monitor wrapper
                maybe_ep_info = info.get('episode')
                if maybe_ep_info is not None:
                    self.ep_info_buf.extend([maybe_ep_info])

                if writer is not None:
                    # Write reward per episode to tensorboard
                    ep_reward = np.array([reward_]).reshape((1, -1))
                    ep_true_reward = np.array([true_reward_]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    # tf_util.total_episode_reward_logger(self.episode_reward, ep_reward,
                    #                                     ep_done, writer, self.num_timesteps)
                    tf_util.total_episode_reward_logger(
                        self.episode_reward, ep_true_reward, ep_done, writer,
                        self.num_timesteps)

                if self.num_timesteps % self.train_freq == 0:
                    callback.on_rollout_end()

                    mb_infos_vals = []
                    # Update policy, critics and target networks
                    for grad_step in range(self.gradient_steps):
                        # Break if the warmup phase is not over
                        # or if there are not enough samples in the replay buffer
                        if not self.replay_buffer.can_sample(self.batch_size) \
                           or self.num_timesteps < self.learning_starts:
                            break
                        n_updates += 1
                        # Compute current learning_rate
                        frac = 1.0 - step / total_timesteps
                        current_lr = self.learning_rate(frac)
                        # Update policy and critics (q functions)
                        mb_infos_vals.append(
                            self._train_step(step, writer, current_lr))
                        # Update target network
                        if (step + grad_step) % self.target_update_interval == 0:
                            # Update target network
                            self.sess.run(self.target_update_op)
                    # Log losses and entropy, useful for monitor training
                    if len(mb_infos_vals) > 0:
                        infos_values = np.mean(mb_infos_vals, axis=0)

                # Update Rewards
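                # (MDAL) Periodically re-fit the learned reward from the mean successor
                # features of the most recent episodes.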
                if self.using_mdal and self.num_timesteps % self.update_reward_freq == 0 and self.num_timesteps > 1:
                    policy_successor_features = np.mean(
                        episode_successor_features[-11:-1], axis=0)
                    self.reward_giver.update_reward(policy_successor_features)

                    callback.on_rollout_start()

                episode_rewards[-1] += reward_
                episode_true_rewards[-1] += true_reward_

                if done:
                    if self.action_noise is not None:
                        self.action_noise.reset()
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()
                    episode_rewards.append(0.0)
                    episode_true_rewards.append(0.0)

                    episode_successor_features.append(
                        np.zeros(self.observation_space.shape[0]))
                    h_step = 0
                else:
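                    # Accumulate the discounted feature expectation (successor features)
                    # of the visited states for the current episode.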
                    episode_successor_features[-1] = np.add(
                        episode_successor_features[-1],
                        (1 - self.gamma) * (self.gamma**h_step) * obs_)
                    h_step += 1

                    maybe_is_success = info.get('is_success')
                    if maybe_is_success is not None:
                        episode_successes.append(float(maybe_is_success))

                if len(episode_rewards[-101:-1]) == 0:
                    mean_reward = -np.inf
                    mean_true_reward = -np.inf
                else:
                    mean_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)
                    mean_true_reward = round(
                        float(np.mean(episode_true_rewards[-101:-1])), 1)

                # substract 1 as we appended a new term just now
                num_episodes = len(episode_rewards) - 1
                # Display training infos
                if self.verbose >= 1 and done and log_interval is not None and num_episodes % log_interval == 0:
                    fps = int(step / (time.time() - start_time))
                    logger.logkv("episodes", num_episodes)
                    logger.logkv("mean 100 episode reward", mean_reward)
                    logger.logkv("mean 100 episode true reward",
                                 mean_true_reward)
                    if len(self.ep_info_buf) > 0 and len(
                            self.ep_info_buf[0]) > 0:
                        logger.logkv(
                            'ep_rewmean',
                            safe_mean([
                                ep_info['r'] for ep_info in self.ep_info_buf
                            ]))
                        logger.logkv(
                            'eplenmean',
                            safe_mean([
                                ep_info['l'] for ep_info in self.ep_info_buf
                            ]))
                    logger.logkv("n_updates", n_updates)
                    logger.logkv("current_lr", current_lr)
                    logger.logkv("fps", fps)
                    logger.logkv('time_elapsed', int(time.time() - start_time))
                    if len(episode_successes) > 0:
                        logger.logkv("success rate",
                                     np.mean(episode_successes[-100:]))
                    if len(infos_values) > 0:
                        for (name, val) in zip(self.infos_names, infos_values):
                            logger.logkv(name, val)
                    logger.logkv("total timesteps", self.num_timesteps)
                    logger.dumpkvs()
                    # Reset infos:
                    infos_values = []
            callback.on_training_end()
            return self
Example #6
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=4,
              tb_log_name="TD3",
              reset_num_timesteps=True,
              replay_wrapper=None):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)
        last_replay_update = 0

        if replay_wrapper is not None:
            self.replay_buffer = replay_wrapper(self.replay_buffer)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:

            self._setup_learn(seed)

            # Transform to callable if needed
            self.learning_rate = get_schedule_fn(self.learning_rate)
            # Initial learning rate
            current_lr = self.learning_rate(1)

            start_time = time.time()
            episode_rewards = [0.0]
            episode_successes = []
            if self.action_noise is not None:
                self.action_noise.reset()
            obs = self.env.reset()
            self.episode_reward = np.zeros((1, ))
            ep_info_buf = deque(maxlen=100)
            n_updates = 0
            infos_values = []
            self.active_sampling = False
            initial_step = self.num_timesteps

            if self.buffer_is_prioritized and \
                    ((replay_wrapper is not None and self.replay_buffer.replay_buffer.__name__ == "ReplayBuffer")
                     or (replay_wrapper is None and self.replay_buffer.__name__ == "ReplayBuffer")) \
                    and self.num_timesteps >= self.prioritization_starts:
                self._set_prioritized_buffer()

            for step in range(initial_step, total_timesteps):
                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

                # Before training starts, randomly sample actions
                # from a uniform distribution for better exploration.
                # Afterwards, use the learned policy
                # if random_exploration is set to 0 (normal setting)
                if (self.num_timesteps < self.learning_starts
                        or np.random.rand() < self.random_exploration):
                    # No need to rescale when sampling random action
                    rescaled_action = action = self.env.action_space.sample()
                else:
                    action = self.policy_tf.step(obs[None]).flatten()
                    # Add noise to the action, as the policy
                    # is deterministic, this is required for exploration
                    if self.action_noise is not None:
                        action = np.clip(action + self.action_noise(), -1, 1)
                    # Rescale from [-1, 1] to the correct bounds
                    rescaled_action = action * np.abs(self.action_space.low)

                assert action.shape == self.env.action_space.shape

                new_obs, reward, done, info = self.env.step(rescaled_action)

                # Store transition in the replay buffer.
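                # With time-aware bookkeeping, episodes ended purely by the step limit are
                # stored as non-terminal transitions.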
                self.replay_buffer.add(
                    obs, action, reward, new_obs,
                    float(done if not self.time_aware else done
                          and info["termination"] != "steps"))
                obs = new_obs

                if ((replay_wrapper is not None and self.replay_buffer.replay_buffer.__name__ == "RankPrioritizedReplayBuffer")\
                        or self.replay_buffer.__name__ == "RankPrioritizedReplayBuffer") and \
                        self.num_timesteps % self.buffer_size == 0:
                    self.replay_buffer.rebalance()

                # Retrieve reward and episode length if using Monitor wrapper
                maybe_ep_info = info.get('episode')
                if maybe_ep_info is not None:
                    ep_info_buf.extend([maybe_ep_info])

                if writer is not None:
                    # Write reward per episode to tensorboard
                    ep_reward = np.array([reward]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward, ep_reward, ep_done, writer,
                        self.num_timesteps)

                if step % self.train_freq == 0:
                    mb_infos_vals = []
                    # Update policy, critics and target networks
                    for grad_step in range(self.gradient_steps):
                        # Break if the warmup phase is not over
                        # or if there are not enough samples in the replay buffer
                        if not self.replay_buffer.can_sample(self.batch_size) \
                                or self.num_timesteps < self.learning_starts:
                            break
                        n_updates += 1
                        # Compute current learning_rate
                        frac = 1.0 - self.num_timesteps / total_timesteps
                        current_lr = self.learning_rate(frac)
                        # Update policy and critics (q functions)
                        # Note: the policy is updated less frequently than the Q functions
                        # this is controlled by the `policy_delay` parameter
                        step_writer = writer if grad_step % self.write_freq == 0 else None
                        mb_infos_vals.append(
                            self._train_step(step, step_writer, current_lr,
                                             (step + grad_step) %
                                             self.policy_delay == 0))

                    # Log losses and entropy, useful for monitor training
                    if len(mb_infos_vals) > 0:
                        infos_values = np.mean(mb_infos_vals, axis=0)

                episode_rewards[-1] += reward
                if done:
                    if isinstance(self.replay_buffer, DiscrepancyReplayBuffer) \
                            and n_updates - last_replay_update >= 5000:
                        self.replay_buffer.update_priorities()
                        last_replay_update = n_updates
                    if self.action_noise is not None:
                        self.action_noise.reset()
                    if not isinstance(self.env, VecEnv):
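                        # Active sampling: reset to the candidate initial state whose
                        # Q-value discrepancy is largest among a small random sample.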
                        if self.active_sampling:
                            sample_obs, sample_state = self.env.get_random_initial_states(25)
                            obs_discrepancies = self.policy_tf.get_q_discrepancy(sample_obs)
                            obs = self.env.reset(**sample_state[np.argmax(obs_discrepancies)])
                        else:
                            obs = self.env.reset()
                    episode_rewards.append(0.0)

                    maybe_is_success = info.get('is_success')
                    if maybe_is_success is not None:
                        episode_successes.append(float(maybe_is_success))

                if len(episode_rewards[-101:-1]) == 0:
                    mean_reward = -np.inf
                else:
                    mean_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                self.num_timesteps += 1

                if self.buffer_is_prioritized and \
                        ((replay_wrapper is not None and self.replay_buffer.replay_buffer.__name__ == "ReplayBuffer")
                         or (replay_wrapper is None and self.replay_buffer.__name__ == "ReplayBuffer"))\
                        and self.num_timesteps >= self.prioritization_starts:
                    self._set_prioritized_buffer()

                # Display training infos
                if self.verbose >= 1 and done and log_interval is not None \
                        and len(episode_rewards) % log_interval == 0:
                    fps = int(step / (time.time() - start_time))
                    logger.logkv("episodes", num_episodes)
                    logger.logkv("mean 100 episode reward", mean_reward)
                    if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                        logger.logkv(
                            'ep_rewmean',
                            safe_mean(
                                [ep_info['r'] for ep_info in ep_info_buf]))
                        logger.logkv(
                            'eplenmean',
                            safe_mean(
                                [ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv("n_updates", n_updates)
                    logger.logkv("current_lr", current_lr)
                    logger.logkv("fps", fps)
                    logger.logkv('time_elapsed', int(time.time() - start_time))
                    if len(episode_successes) > 0:
                        logger.logkv("success rate",
                                     np.mean(episode_successes[-100:]))
                    if len(infos_values) > 0:
                        for (name, val) in zip(self.infos_names, infos_values):
                            logger.logkv(name, val)
                    logger.logkv("total timesteps", self.num_timesteps)
                    logger.dumpkvs()
                    # Reset infos:
                    infos_values = []
            return self
Example #7
    def learn(self, total_timesteps, callback=None,
              log_interval=4, tb_log_name="SAC", reset_num_timesteps=True, replay_wrapper=None):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        if replay_wrapper is not None:
            self.replay_buffer = replay_wrapper(self.replay_buffer)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:

            self._setup_learn()

            # Transform to callable if needed
            self.learning_rate = get_schedule_fn(self.learning_rate)
            # Initial learning rate
            current_lr = self.learning_rate(1)

            start_time = time.time()
            episode_rewards = [0.0]
            episode_discrim_rewards = [0.0]
            episode_successes = []
            if self.action_noise is not None:
                self.action_noise.reset()
            obs = self.env.reset()
            self.episode_reward = np.zeros((1,))
            ep_info_buf = deque(maxlen=100)
            discrim_ac_buf = deque(maxlen=20)
            discrim_train_ac_buf = deque(maxlen=20)
            discrim_loss_buf = deque(maxlen=200)
            n_updates = 0
            infos_values = []
            discrim_loss_val = 0
            discrim_ac_val = 0
            current_discrim_weight = None
            current_discrim_lr = None

            for step in range(total_timesteps):
                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

                # Before training starts, randomly sample actions
                # from a uniform distribution for better exploration.
                # Afterwards, use the learned policy
                # if random_exploration is set to 0 (normal setting)
                if self.num_timesteps < self.learning_starts or np.random.rand() < self.random_exploration:
                    # actions sampled from action space are from range specific to the environment
                    # but algorithm operates on tanh-squashed actions therefore simple scaling is used
                    unscaled_action = self.env.action_space.sample()
                    action = scale_action(self.action_space, unscaled_action)
                else:
                    action = self.policy_tf.step(obs[None], deterministic=False).flatten()
                    # Add noise to the action (improve exploration,
                    # not needed in general)
                    if self.action_noise is not None:
                        action = np.clip(action + self.action_noise(), -1, 1)
                    # inferred actions need to be transformed to environment action_space before stepping
                    unscaled_action = unscale_action(self.action_space, action)

                assert action.shape == self.env.action_space.shape

                new_obs, reward, done, info = self.env.step(unscaled_action)
                skill = info['skill']
                agent_discrim_components, expert_discrim_components = [info['prev_agent_obs'].flat], [info['prev_expert_obs'].flat]
                if self.discrim_include_next_state:
                    agent_discrim_components.append(info['agent_obs'].flat)
                    expert_discrim_components.append(info['expert_obs'].flat)
                if self.discrim_include_skill:
                    agent_discrim_components.append(skill)
                    expert_discrim_components.append(skill)
                
                agent_discrim_obs = np.concatenate(agent_discrim_components, axis=0)
                expert_discrim_obs = np.concatenate(expert_discrim_components, axis=0)
                
                # Collect entire trajectories of data for the discrim test buffer vs. the regular discrim buffer
                if len(episode_rewards) % self.discrim_test_data_collection_freq == 0 and len(episode_rewards) >= self.discrim_test_data_collection_freq:
                # if step % self.discrim_test_data_collection_freq:
                    self.test_discrim_replay_buffer.add(agent_discrim_obs, None, None, np.array([0.95]), 0.0)
                    self.test_discrim_replay_buffer.add(expert_discrim_obs, None, None, np.array([0.05]), 0.0)
                else:
                    self.discrim_replay_buffer.add(agent_discrim_obs, None, None, np.array([0.95]), 0.0)
                    self.discrim_replay_buffer.add(expert_discrim_obs, None, None, np.array([0.05]), 0.0)

                # Store transition in the replay buffer.
                self.replay_buffer.add(np.concatenate((obs, agent_discrim_obs)), action, reward, new_obs, float(done))
                obs = new_obs

                # Retrieve reward and episode length if using Monitor wrapper
                maybe_ep_info = info.get('episode')
                if maybe_ep_info is not None:
                    ep_info_buf.extend([maybe_ep_info])

                if writer is not None:
                    # Write reward per episode to tensorboard
                    ep_reward = np.array([reward]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_reward,
                                                                      ep_done, writer, self.num_timesteps)

                if step % self.train_freq == 0:
                    mb_infos_vals = []
                    # Update policy, critics and target networks
                    for grad_step in range(self.gradient_steps):
                        # Break if the warmup phase is not over
                        # or if there are not enough samples in the replay buffer
                        if not self.replay_buffer.can_sample(self.batch_size) \
                           or self.num_timesteps < self.learning_starts:
                            break
                        n_updates += 1
                        # Compute current learning_rate
                        frac = 1.0 - step / total_timesteps
                        current_lr = self.learning_rate(frac)
                        current_discrim_weight = max(self.discrim_weight * frac, 0.1) if self.discrim_decay else self.discrim_weight
                        # Update policy and critics (q functions)
                        mb_infos_vals.append(self._train_step(step, writer, current_lr, current_discrim_weight))
                        # Update target network
                        if (step + grad_step) % self.target_update_interval == 0:
                            # Update target network
                            self.sess.run(self.target_update_op)
                    # Log losses and entropy, useful for monitoring training
                    if len(mb_infos_vals) > 0:
                        infos_values = np.mean(mb_infos_vals, axis=0)

                if step % self.discrim_train_freq == 0 and float(step) / total_timesteps < self.discrim_stop:
                    mb_discrim_loss = []
                    for grad_step in range(self.discrim_grad_steps):
                        if not self.discrim_replay_buffer.can_sample(self.discrim_batch_size):
                            break
                        current_discrim_lr = self.discrim_lr * max(0, 1.0 - float(step) / (self.discrim_stop * total_timesteps))
                        discrim_loss = self._discrim_train_step(step, current_discrim_lr)
                        mb_discrim_loss.append(discrim_loss)
                    
                    if len(mb_discrim_loss) > 0:
                        discrim_loss_val = np.mean(mb_discrim_loss)

                # Periodically assess the discriminator accuracy
                if step % self.discrim_test_data_collection_freq == 0 and self.test_discrim_replay_buffer.can_sample(self.discrim_batch_size):
                    discrim_ac_buf.append(self._get_discrim_ac())
                    discrim_train_ac_buf.append(self._get_discrim_ac(test=False))
                
                episode_rewards[-1] += reward
                cur_discrim_reward, cur_discrim_loss = self._get_discrim_reward(agent_discrim_obs)
                episode_discrim_rewards[-1] += cur_discrim_reward[0][0]
                discrim_loss_buf.append(cur_discrim_loss)
                if done:
                    if self.action_noise is not None:
                        self.action_noise.reset()
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()
                    episode_rewards.append(0.0)
                    episode_discrim_rewards.append(0.0)

                    maybe_is_success = info.get('is_success')
                    if maybe_is_success is not None:
                        episode_successes.append(float(maybe_is_success))

                if len(episode_rewards[-101:-1]) == 0:
                    mean_reward = -np.inf
                else:
                    mean_reward = round(float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                self.num_timesteps += 1
                # Display training infos
                if self.verbose >= 1 and done and log_interval is not None and len(episode_rewards) % log_interval == 0:
                    fps = int(step / (time.time() - start_time))
                    logger.logkv("episodes", num_episodes)
                    logger.logkv("mean 100 episode reward", mean_reward)
                    if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                        logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                        logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv("n_updates", n_updates)
                    logger.logkv("current_lr", current_lr)
                    logger.logkv("fps", fps)
                    logger.logkv('time_elapsed', int(time.time() - start_time))
                    if len(episode_successes) > 0:
                        logger.logkv("success rate", np.mean(episode_successes[-100:]))
                    if len(infos_values) > 0:
                        for (name, val) in zip(self.infos_names, infos_values):
                            logger.logkv(name, val)
                    logger.logkv("discrim train loss", discrim_loss_val)
                    if len(discrim_ac_buf) > 0:
                        logger.logkv("discrim test ac", safe_mean([ac for ac in discrim_ac_buf]))
                    if len(discrim_train_ac_buf) > 0:
                        logger.logkv("discrim train ac", safe_mean([ac for ac in discrim_train_ac_buf]))
                    if len(episode_discrim_rewards) > 50:
                        logger.logkv("mean 50 ep discrim rew", safe_mean(episode_discrim_rewards[-50:]))
                    if len(discrim_loss_buf) > 0:
                        logger.logkv("discirm pred loss", safe_mean([l for l in discrim_loss_buf]))
                    if current_discrim_weight:
                        logger.logkv("cur discrim weight", current_discrim_weight)
                    if current_discrim_lr:
                        logger.logkv("cur discrim lr", current_discrim_lr)
                    logger.logkv("total timesteps", self.num_timesteps)
                    logger.dumpkvs()
                    # Reset infos:
                    infos_values = []
            return self
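
The example above stores agent observations with a smoothed target of 0.95 and expert observations with 0.05 before the discriminator updates. A minimal numpy sketch of the label-smoothed binary cross-entropy that a step like _discrim_train_step would typically minimize (an illustration under that assumption, not the actual implementation):

import numpy as np

def smoothed_bce(pred, target, eps=1e-8):
    # pred: discriminator outputs in (0, 1); target: smoothed labels such as 0.95 / 0.05
    pred = np.clip(pred, eps, 1.0 - eps)
    return float(np.mean(-(target * np.log(pred) + (1.0 - target) * np.log(1.0 - pred))))

# Agent samples are labelled 0.95 and expert samples 0.05, mirroring the buffer adds above.
agent_pred, expert_pred = np.array([0.7, 0.9]), np.array([0.2, 0.1])
preds = np.concatenate([agent_pred, expert_pred])
targets = np.concatenate([np.full(2, 0.95), np.full(2, 0.05)])
print(round(smoothed_bce(preds, targets), 3))
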
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=100,
              tb_log_name="DQN",
              reset_num_timesteps=True,
              replay_wrapper=None):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn(seed)

            # Create the replay buffer
            if self.prioritized_replay:
                self.replay_buffer = PrioritizedReplayBuffer(
                    self.buffer_size, alpha=self.prioritized_replay_alpha)
                if self.prioritized_replay_beta_iters is None:
                    prioritized_replay_beta_iters = total_timesteps
                else:
                    prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
                self.beta_schedule = LinearSchedule(
                    prioritized_replay_beta_iters,
                    initial_p=self.prioritized_replay_beta0,
                    final_p=1.0)
            else:
                self.replay_buffer = ReplayBuffer(self.buffer_size)
                self.beta_schedule = None

            if replay_wrapper is not None:
                assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER"
                self.replay_buffer = replay_wrapper(self.replay_buffer)

            # Create the schedule for exploration starting from 1.
            self.exploration = LinearSchedule(
                schedule_timesteps=int(self.exploration_fraction *
                                       total_timesteps),
                initial_p=1.0,
                final_p=self.exploration_final_eps)

            episode_rewards = [0.0]
            episode_successes = []
            obs = self.env.reset()
            reset = True
            self.episode_reward = np.zeros((1, ))

            for _ in range(total_timesteps):
                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break
                # Take action and update exploration to the newest value
                kwargs = {}
                if not self.param_noise:
                    update_eps = self.exploration.value(self.num_timesteps)
                    update_param_noise_threshold = 0.
                else:
                    update_eps = 0.
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = \
                        -np.log(1. - self.exploration.value(self.num_timesteps) +
                                self.exploration.value(self.num_timesteps) / float(self.env.action_space.n))
                    kwargs['reset'] = reset
                    kwargs[
                        'update_param_noise_threshold'] = update_param_noise_threshold
                    kwargs['update_param_noise_scale'] = True
                with self.sess.as_default():
                    action = self.act(np.array(obs)[None],
                                      update_eps=update_eps,
                                      **kwargs)[0]
                env_action = action
                reset = False
                new_obs, rew, done, info = self.env.step(env_action)
                # Store transition in the replay buffer.
                self.replay_buffer.add(obs, action, rew, new_obs, float(done))
                obs = new_obs

                if writer is not None:
                    ep_rew = np.array([rew]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward, ep_rew, ep_done, writer,
                        self.num_timesteps)

                episode_rewards[-1] += rew
                if done:
                    maybe_is_success = info.get('is_success')
                    if maybe_is_success is not None:
                        episode_successes.append(float(maybe_is_success))
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()
                    episode_rewards.append(0.0)
                    reset = True

                # Do not train if the warmup phase is not over
                # or if there are not enough samples in the replay buffer
                can_sample = self.replay_buffer.can_sample(self.batch_size)
                if can_sample and self.num_timesteps > self.learning_starts \
                    and self.num_timesteps % self.train_freq == 0:
                    # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                    if self.prioritized_replay:
                        experience = self.replay_buffer.sample(
                            self.batch_size,
                            beta=self.beta_schedule.value(self.num_timesteps))
                        (obses_t, actions, rewards, obses_tp1, dones, weights,
                         batch_idxes) = experience
                    else:
                        obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(
                            self.batch_size)
                        weights, batch_idxes = np.ones_like(rewards), None

                    if writer is not None:
                        # run loss backprop with summary, but once every 100 steps save the metadata
                        # (memory, compute time, ...)
                        if (1 + self.num_timesteps) % 100 == 0:
                            run_options = tf.RunOptions(
                                trace_level=tf.RunOptions.FULL_TRACE)
                            run_metadata = tf.RunMetadata()
                            summary, td_errors = self._train_step(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                obses_tp1,
                                dones,
                                weights,
                                sess=self.sess,
                                options=run_options,
                                run_metadata=run_metadata)
                            writer.add_run_metadata(
                                run_metadata, 'step%d' % self.num_timesteps)
                        else:
                            summary, td_errors = self._train_step(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                obses_tp1,
                                dones,
                                weights,
                                sess=self.sess)
                        writer.add_summary(summary, self.num_timesteps)
                    else:
                        _, td_errors = self._train_step(obses_t,
                                                        actions,
                                                        rewards,
                                                        obses_tp1,
                                                        obses_tp1,
                                                        dones,
                                                        weights,
                                                        sess=self.sess)

                    if self.prioritized_replay:
                        new_priorities = np.abs(
                            td_errors) + self.prioritized_replay_eps
                        self.replay_buffer.update_priorities(
                            batch_idxes, new_priorities)

                if can_sample and self.num_timesteps > self.learning_starts and \
                        self.num_timesteps % self.target_network_update_freq == 0:
                    # Update target network periodically.
                    self.update_target(sess=self.sess)

                if len(episode_rewards[-101:-1]) == 0:
                    mean_100ep_reward = -np.inf
                else:
                    mean_100ep_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                if self.verbose >= 1 and done and log_interval is not None and len(
                        episode_rewards) % log_interval == 0:
                    logger.record_tabular("steps", self.num_timesteps)
                    logger.record_tabular("episodes", num_episodes)
                    if len(episode_successes) > 0:
                        logger.logkv("success rate",
                                     np.mean(episode_successes[-100:]))
                    logger.record_tabular("mean 100 episode reward",
                                          mean_100ep_reward)
                    logger.record_tabular(
                        "% time spent exploring",
                        int(100 * self.exploration.value(self.num_timesteps)))
                    logger.dump_tabular()

                self.num_timesteps += 1

        return self
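
The epsilon schedule above interpolates linearly from its initial value to exploration_final_eps over exploration_fraction * total_timesteps and then holds, and the param-noise branch converts that same epsilon into a KL threshold. A small self-contained sketch of both (a simplified stand-in for the LinearSchedule used here, not the library class):

import numpy as np

class LinearScheduleSketch:
    """Linearly interpolate from initial_p to final_p over schedule_timesteps, then hold final_p."""
    def __init__(self, schedule_timesteps, initial_p, final_p):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, t):
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)

def param_noise_threshold(eps, n_actions):
    # Threshold whose KL matches eps-greedy exploration with the given eps
    # (the same formula as in the param_noise branch of the loop above).
    return -np.log(1.0 - eps + eps / float(n_actions))

exploration = LinearScheduleSketch(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)
for t in (0, 5000, 10000, 20000):
    eps = exploration.value(t)
    print(t, round(eps, 3), round(param_noise_threshold(eps, n_actions=4), 3))
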
예제 #9
0
    def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_log_name="PPO2",
              reset_num_timesteps=True):
        # Transform to callable if needed
        self.learning_rate = get_schedule_fn(self.learning_rate)
        self.cliprange = get_schedule_fn(self.cliprange)
        cliprange_vf = get_schedule_fn(self.cliprange_vf)

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn(seed)

            runner = Runner(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.lam)
            self.episode_reward = np.zeros((self.n_envs,))

            ep_info_buf = deque(maxlen=100)
            t_first_start = time.time()

            n_updates = total_timesteps // self.n_batch
            start_decay = n_updates
            pp_sr_buf = deque(maxlen=5)
            for update in range(1, n_updates + 1):
                assert self.n_batch % self.nminibatches == 0
                batch_size = self.n_batch // self.nminibatches
                t_start = time.time()
                frac = 1.0 - (update - 1.0) / n_updates
                lr_now = self.learning_rate(frac)
                cliprange_now = self.cliprange(frac)
                cliprange_vf_now = cliprange_vf(frac)
                if self.curriculum:
                    if 'FetchStack' in self.env.get_attr('spec')[0].id:
                        # Stacking
                        pp_sr = pp_eval_model(self.eval_env, self)
                        pp_sr_buf.append(pp_sr)
                        print('Pick-and-place success rate', np.mean(pp_sr_buf))
                        if start_decay == n_updates and np.mean(pp_sr_buf) > 0.8:
                            start_decay = update
                        _ratio = np.clip(0.7 - 0.8 * (update - start_decay) / 380, 0.3, 0.7)  # from 0.7 to 0.3
                    elif 'FetchPushWallObstacle' in self.env.get_attr('spec')[0].id:
                        _ratio = max(1.0 - (update - 1.0) / n_updates, 0.0)
                    elif 'MasspointMaze-v3' in self.env.get_attr('spec')[0].id:
                        _ratio = 1.0 - (update - 1.0) / n_updates
                    else:
                        raise NotImplementedError
                    self.env.env_method('set_random_ratio', _ratio)
                    print('Set random_ratio to', self.env.get_attr('random_ratio')[0])
                # true_reward is the reward without discount
                obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run()
                self.num_timesteps += self.n_batch
                ep_info_buf.extend(ep_infos)
                mb_loss_vals = []
                if states is None:  # nonrecurrent version
                    update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1
                    inds = np.arange(self.n_batch)
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(inds)
                        for start in range(0, self.n_batch, batch_size):
                            timestep = self.num_timesteps // update_fac + ((self.noptepochs * self.n_batch + epoch_num *
                                                                            self.n_batch + start) // batch_size)
                            end = start + batch_size
                            mbinds = inds[start:end]
                            slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                            mb_loss_vals.append(self._train_step(lr_now, cliprange_now, *slices, writer=writer,
                                                                 update=timestep, cliprange_vf=cliprange_vf_now))
                else:  # recurrent version
                    update_fac = self.n_batch // self.nminibatches // self.noptepochs // self.n_steps + 1
                    assert self.n_envs % self.nminibatches == 0
                    env_indices = np.arange(self.n_envs)
                    flat_indices = np.arange(self.n_envs * self.n_steps).reshape(self.n_envs, self.n_steps)
                    envs_per_batch = batch_size // self.n_steps
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(env_indices)
                        for start in range(0, self.n_envs, envs_per_batch):
                            timestep = self.num_timesteps // update_fac + ((self.noptepochs * self.n_envs + epoch_num *
                                                                            self.n_envs + start) // envs_per_batch)
                            end = start + envs_per_batch
                            mb_env_inds = env_indices[start:end]
                            mb_flat_inds = flat_indices[mb_env_inds].ravel()
                            slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                            mb_states = states[mb_env_inds]
                            mb_loss_vals.append(self._train_step(lr_now, cliprange_now, *slices, update=timestep,
                                                                 writer=writer, states=mb_states,
                                                                 cliprange_vf=cliprange_vf_now))

                loss_vals = np.mean(mb_loss_vals, axis=0)
                t_now = time.time()
                fps = int(self.n_batch / (t_now - t_start))

                if writer is not None:
                    self.episode_reward = total_episode_reward_logger(self.episode_reward,
                                                                      true_reward.reshape((self.n_envs, self.n_steps)),
                                                                      masks.reshape((self.n_envs, self.n_steps)),
                                                                      writer, self.num_timesteps)

                if self.verbose >= 1 and (update % log_interval == 0 or update == 1):
                    explained_var = explained_variance(values, returns)
                    logger.logkv("serial_timesteps", update * self.n_steps)
                    logger.logkv("n_updates", update)
                    logger.logkv("total_timesteps", self.num_timesteps)
                    logger.logkv("fps", fps)
                    logger.logkv("explained_variance", float(explained_var))
                    if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                        logger.logkv('ep_reward_mean', safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                        logger.logkv('ep_len_mean', safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                        logger.logkv('ep_success_rate', safe_mean([ep_info['is_success'] for ep_info in ep_info_buf]))
                    logger.logkv('time_elapsed', t_start - t_first_start)
                    for (loss_val, loss_name) in zip(loss_vals, self.loss_names):
                        logger.logkv(loss_name, loss_val)
                    logger.dumpkvs()

                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

            return self
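
The non-recurrent PPO update above shuffles a flat index array every epoch and walks it in contiguous chunks of batch_size, slicing each rollout array with the same minibatch indices. A minimal numpy sketch of that slicing pattern (names are illustrative, not the actual _train_step):

import numpy as np

def iterate_minibatches(arrays, batch_size, n_epochs, rng=np.random):
    # arrays: tuple of equally long arrays (obs, returns, actions, ...);
    # yields aligned minibatches in a fresh random order every epoch.
    n_samples = len(arrays[0])
    inds = np.arange(n_samples)
    for _ in range(n_epochs):
        rng.shuffle(inds)
        for start in range(0, n_samples, batch_size):
            mbinds = inds[start:start + batch_size]
            yield tuple(arr[mbinds] for arr in arrays)

obs = np.arange(8).reshape(8, 1)
returns = np.arange(8, dtype=float)
for mb_obs, mb_returns in iterate_minibatches((obs, returns), batch_size=4, n_epochs=2):
    print(mb_obs.ravel(), mb_returns)
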
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=4,
              tb_log_name="SAC",
              reset_num_timesteps=True,
              replay_wrapper=None):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        if replay_wrapper is not None:
            self.replay_buffer = replay_wrapper(self.replay_buffer)
            if self.priority_buffer:
                self.replay_buffer.set_model(self)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:

            self._setup_learn(seed)
            self.env_id = self.env.env.get_attr('spec')[0].id

            # Transform to callable if needed
            self.learning_rate = get_schedule_fn(self.learning_rate)
            # Initial learning rate
            current_lr = self.learning_rate(1)

            start_time = time.time()
            store_time = 0.0
            step_time = 0.0
            train_time = 0.0
            episode_rewards = [[0.0] for _ in range(self.env.env.num_envs)]
            episode_successes = [[] for _ in range(self.env.env.num_envs)]
            if self.action_noise is not None:
                self.action_noise.reset()
            assert isinstance(self.env.env, VecEnv)
            self.episode_reward = np.zeros((1, ))
            ep_info_buf = deque(maxlen=100)
            n_updates = 0
            infos_values = []
            pp_sr_buf = deque(maxlen=5)
            stack_sr_buf = deque(maxlen=5)
            start_decay = total_timesteps
            if self.sequential and 'FetchStack' in self.env_id:
                current_max_nobject = 2
                self.env.env.env_method('set_task_array',
                                        [[(2, 0), (2, 1),
                                          (1, 0)]] * self.env.env.num_envs)
                print('Set task_array to ',
                      self.env.env.get_attr('task_array')[0])
                self.env.env.env_method('set_random_ratio',
                                        [0.7] * self.env.env.num_envs)
            obs = self.env.reset()
            print(obs.shape)
            for step in range(total_timesteps):
                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

                if self.curriculum and step % 3000 == 0:
                    if 'FetchStack' in self.env.env.get_attr('spec')[0].id:
                        # Stacking
                        pp_sr = eval_model(
                            self.eval_env,
                            self,
                            current_max_nobject if self.sequential else
                            self.env.env.get_attr('n_object')[0],
                            1.0,
                            init_on_table=(self.env.env.get_attr('spec')[0].id
                                           == 'FetchStack-v2'))
                        pp_sr_buf.append(pp_sr)
                        stack_sr = eval_model(
                            self.eval_env,
                            self,
                            current_max_nobject if self.sequential else
                            self.env.env.get_attr('n_object')[0],
                            0.0,
                            init_on_table=(self.env.env.get_attr('spec')[0].id
                                           == 'FetchStack-v2'))
                        stack_sr_buf.append(stack_sr)
                        print('Pick-and-place success rate',
                              np.mean(pp_sr_buf))
                        if self.sequential:
                            if self.env.env.get_attr('random_ratio')[
                                    0] > 0.5 and np.mean(pp_sr_buf) > 0.8:
                                _ratio = 0.3
                            elif self.env.env.get_attr('random_ratio')[0] < 0.5 \
                                    and current_max_nobject < self.env.env.get_attr('n_object')[0] \
                                    and np.mean(stack_sr_buf) > 1 / current_max_nobject:
                                _ratio = 0.7
                                current_max_nobject += 1
                                previous_task_array = self.env.env.get_attr(
                                    'task_array')[0]
                                self.env.env.env_method(
                                    'set_task_array', [
                                        previous_task_array +
                                        [(current_max_nobject, j)
                                         for j in range(current_max_nobject)]
                                    ] * self.env.env.num_envs)

                                print('Set task_array to',
                                      self.env.env.get_attr('task_array')[0])
                            else:
                                _ratio = self.env.env.get_attr(
                                    'random_ratio')[0]
                        else:
                            if start_decay == total_timesteps and np.mean(
                                    pp_sr_buf) > 0.8:
                                start_decay = step
                            _ratio = np.clip(0.7 - (step - start_decay) / 2e6,
                                             0.3, 0.7)  # from 0.7 to 0.3
                    elif 'FetchPushWallObstacle' in self.env_id:
                        _ratio = max(1.0 - step / total_timesteps, 0.0)
                    else:
                        raise NotImplementedError
                    self.env.env.env_method('set_random_ratio',
                                            [_ratio] * self.env.env.num_envs)
                    print('Set random_ratio to',
                          self.env.env.get_attr('random_ratio')[0])

                # Before training starts, randomly sample actions
                # from a uniform distribution for better exploration.
                # Afterwards, use the learned policy
                # if random_exploration is set to 0 (normal setting)
                if (self.num_timesteps < self.learning_starts
                        or np.random.rand() < self.random_exploration):
                    rescaled_action = np.stack([
                        self.env.action_space.sample()
                        for _ in range(self.env.env.num_envs)
                    ],
                                               axis=0)
                    action = rescaled_action
                else:
                    action = self.policy_tf.step(obs, deterministic=False)
                    # Add noise to the action (improve exploration,
                    # not needed in general)
                    if self.action_noise is not None:
                        action = np.clip(action + self.action_noise(), -1, 1)
                    # Rescale from [-1, 1] to the correct bounds
                    rescaled_action = action * np.abs(self.action_space.low)

                assert action.shape == (
                    self.env.env.num_envs, ) + self.env.action_space.shape

                step_time0 = time.time()
                new_obs, reward, done, info = self.env.step(rescaled_action)
                step_time += time.time() - step_time0

                next_obs = new_obs.copy()
                for idx, _done in enumerate(done):
                    if _done:
                        next_obs[idx] = self.env.convert_dict_to_obs(
                            info[idx]['terminal_observation'])

                # Store transition in the replay buffer.
                store_time0 = time.time()
                self.replay_buffer.add(obs, action, reward, next_obs, done)
                store_time += time.time() - store_time0
                obs = new_obs
                for idx, _done in enumerate(done):
                    episode_rewards[idx][-1] += reward[idx]
                    if _done:
                        episode_rewards[idx].append(0.0)
                        maybe_is_success = info[idx].get('is_success')
                        if maybe_is_success is not None:
                            episode_successes[idx].append(
                                float(maybe_is_success))

                # Retrieve reward and episode length if using Monitor wrapper
                for _info in info:
                    maybe_ep_info = _info.get('episode')
                    if maybe_ep_info is not None:
                        ep_info_buf.extend([maybe_ep_info])

                if writer is not None:
                    # Write reward per episode to tensorboard
                    ep_reward = np.reshape(reward, (self.env.env.num_envs, -1))
                    ep_done = np.reshape(done, (self.env.env.num_envs, -1))
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward, ep_reward, ep_done, writer,
                        self.num_timesteps)

                train_time0 = time.time()
                if step % self.train_freq == 0:
                    mb_infos_vals = []
                    # Update policy, critics and target networks
                    for grad_step in range(self.gradient_steps):
                        # Break if the warmup phase is not over
                        # or if there are not enough samples in the replay buffer
                        if not self.replay_buffer.can_sample(self.batch_size) \
                           or self.num_timesteps < self.learning_starts:
                            break
                        n_updates += 1
                        # Compute current learning_rate
                        frac = 1.0 - step / total_timesteps
                        current_lr = self.learning_rate(frac)
                        # Update policy and critics (q functions)
                        mb_infos_vals.append(
                            self._train_step(step, writer, current_lr))
                        # Update target network
                        if (step +
                                grad_step) % self.target_update_interval == 0:
                            # Update target network
                            self.sess.run(self.target_update_op)
                    # Log losses and entropy, useful for monitoring training
                    if len(mb_infos_vals) > 0:
                        infos_values = np.mean(mb_infos_vals, axis=0)

                train_time += time.time() - train_time0
                if len(episode_rewards[0][-101:-1]) == 0:
                    mean_reward = -np.inf
                else:
                    mean_reward = round(
                        float(
                            np.mean(
                                np.concatenate([
                                    episode_rewards[i][-101:-1]
                                    for i in range(self.env.env.num_envs)
                                ]))), 1)

                num_episodes = sum([
                    len(episode_rewards[i])
                    for i in range(len(episode_rewards))
                ])
                self.num_timesteps += self.env.env.num_envs
                # Display training infos
                if self.verbose >= 1 and done[
                        0] and log_interval is not None and len(
                            episode_rewards[0]) % (log_interval //
                                                   self.env.env.num_envs) == 0:
                    fps = int(self.num_timesteps / (time.time() - start_time))
                    logger.logkv("episodes", num_episodes)
                    logger.logkv("mean 100 episode reward", mean_reward)
                    if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                        logger.logkv(
                            'ep_rewmean',
                            safe_mean(
                                [ep_info['r'] for ep_info in ep_info_buf]))
                        logger.logkv(
                            'eplenmean',
                            safe_mean(
                                [ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv("n_updates", n_updates)
                    logger.logkv("current_lr", current_lr)
                    logger.logkv("fps", fps)
                    logger.logkv('time_elapsed', int(time.time() - start_time))
                    if len(episode_successes[0]) > 0:
                        logger.logkv(
                            "success rate",
                            np.mean(
                                np.concatenate([
                                    episode_successes[i][-100:]
                                    for i in range(self.env.env.num_envs)
                                ])))
                    if len(infos_values) > 0:
                        for (name, val) in zip(self.infos_names, infos_values):
                            logger.logkv(name, val)
                    logger.logkv("total timesteps", self.num_timesteps)
                    if hasattr(self.eval_env.unwrapped, 'random_ratio'):
                        logger.logkv("random_ratio",
                                     self.env.env.get_attr('random_ratio')[0])
                    logger.dumpkvs()
                    # Reset infos:
                    infos_values = []
            return self
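
These SAC listings move between the tanh-squashed [-1, 1] range the algorithm samples in and the environment's own action bounds (scale_action/unscale_action in the earlier example, and action * np.abs(self.action_space.low) above for symmetric bounds). A standalone sketch of the general affine mapping, which reduces to the symmetric case when low == -high (illustrative, not the library helpers):

import numpy as np

def to_unit_range(action, low, high):
    # Map an environment-space action into [-1, 1].
    return 2.0 * (action - low) / (high - low) - 1.0

def from_unit_range(action, low, high):
    # Map a [-1, 1] action back into [low, high]; for symmetric bounds
    # (low == -high) this reduces to action * np.abs(low), as used above.
    return low + 0.5 * (action + 1.0) * (high - low)

low, high = np.array([-2.0, 0.0]), np.array([2.0, 1.0])
a_env = np.array([1.0, 0.25])
a_unit = to_unit_range(a_env, low, high)
print(a_unit, from_unit_range(a_unit, low, high))
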
예제 #11
0
    def learn(self, total_timesteps, callback=None, log_interval=1, tb_log_name="Dual",
              reset_num_timesteps=True):
        # Transform to callable if needed
        self.learning_rate  = get_schedule_fn(self.learning_rate)
        self.cliprange      = get_schedule_fn(self.cliprange)
        cliprange_vf        = get_schedule_fn(self.cliprange_vf)

        new_tb_log   = self._init_num_timesteps(reset_num_timesteps)
        top_callback = SaveOnTopRewardCallback(check_freq=self.n_steps, logdir=self.tensorboard_log, models_num=self.models_num)
        callback.append(top_callback)
        callback     = self._init_callback(callback)

        with SetVerbosity(self.verbose), TensorboardWriter(self.models[0].graph, self.tensorboard_log, tb_log_name, new_tb_log) as writer:

            for model in self.models:
                model._setup_learn(self)

            t_first_start = time.time()
            n_updates     = total_timesteps // (self.n_envs * self.n_steps)

            callback.on_training_start(locals(), globals())

            for update in range(1, n_updates + 1):
                assert (self.n_envs * self.n_steps) % self.nminibatches == 0, ("The number of minibatches (`nminibatches`) is not a factor of the total number of samples collected per rollout (`n_batch`), some samples won't be used.")

                batch_size       = (self.n_envs * self.n_steps) // self.nminibatches
                t_start          = time.time()
                frac             = 1.0 - (update - 1.0) / n_updates
                lr_now           = self.learning_rate(frac)
                cliprange_now    = self.cliprange(frac)
                cliprange_vf_now = cliprange_vf(frac)

                callback.on_rollout_start()

                rollouts = self.runner.run(callback) #execute episode

                callback.on_rollout_end()

                # Early stopping due to the callback
                if not self.runner.continue_training:
                    break


                # Unpack
                i = 0
                steps_used = rollouts[-1]
                for rollout in rollouts[0]:
                    obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward, success_stages = rollout
                    model = self.models[i]
                    # calc = len(true_reward)
                    # model.n_batch = calc

                    if model.n_batch == 0:
                        pass  # this model collected no samples in this rollout, skip its update
                    else:
                        self.ep_info_buf.extend(ep_infos)
                        mb_loss_vals = []
                        if states is None:  # nonrecurrent version
                            update_fac = max(model.n_batch // self.nminibatches // self.noptepochs, 1)
                            inds = np.arange(len(obs))#np.arange(model.n_batch)
                            for epoch_num in range(self.noptepochs):
                                np.random.shuffle(inds)
                                for start in range(0, model.n_batch, batch_size):
                                    timestep = self.num_timesteps // update_fac + ((epoch_num * model.n_batch + start) // batch_size)
                                    end      = start + batch_size
                                    mbinds   = inds[start:end]
                                    if len(obs) > 1:
                                        slices   = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                                        mb_loss_vals.append(self._train_step(lr_now, cliprange_now, *slices, model=self.models[i], writer=writer, update=timestep, cliprange_vf=cliprange_vf_now))
                                    else:
                                        mb_loss_vals.append((0,0,0,0,0))
                            i+=1
                        else:
                            exit("does not support recurrent version")

                        loss_vals = np.mean(mb_loss_vals, axis=0)
                        t_now     = time.time()
                        fps       = int(model.n_batch / (t_now - t_start))

                        if writer is not None:
                            n_steps = model.n_batch
                            try:
                                total_episode_reward_logger(self.episode_reward, true_reward.reshape((self.n_envs, n_steps)), masks.reshape((self.n_envs, n_steps)), writer, self.num_timesteps)
                            except Exception:
                                print("Failed to log episode reward of shape {}".format(true_reward.shape))
                            summary = tf.Summary(value=[tf.Summary.Value(tag='episode_reward/Successful stages',
                                                                         simple_value=success_stages)])
                            writer.add_summary(summary, self.num_timesteps)
                            #@TODO plot in one graph:
                            for net_idx, val in enumerate(steps_used):  # separate index so the model counter i is not clobbered
                                summary = tf.Summary(value=[tf.Summary.Value(tag='episode_reward/Used steps net {}'.format(net_idx),
                                                                             simple_value=val)])
                                writer.add_summary(summary, self.num_timesteps)

                        if self.verbose >= 1 and (update % log_interval == 0 or update == 1):
                            explained_var = explained_variance(values, returns)
                            logger.logkv("serial_timesteps", update * self.n_steps)
                            logger.logkv("n_updates", update)
                            logger.logkv("total_timesteps", self.num_timesteps)
                            logger.logkv("fps", fps)
                            logger.logkv("Steps", steps_used)
                            logger.logkv("explained_variance", float(explained_var))
                            if len(self.ep_info_buf) > 0 and len(self.ep_info_buf[0]) > 0:
                                logger.logkv('ep_reward_mean', safe_mean([ep_info['r'] for ep_info in self.ep_info_buf]))
                                logger.logkv('ep_len_mean', safe_mean([ep_info['l'] for ep_info in self.ep_info_buf]))
                            logger.logkv('time_elapsed', t_start - t_first_start)
                            for (loss_val, loss_name) in zip(loss_vals, model.loss_names):
                                logger.logkv(loss_name, loss_val)
                            logger.dumpkvs()

            callback.on_training_end()
            return self
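
The logging above reports explained_variance(values, returns), i.e. how much of the variance of the empirical returns the value predictions account for: 1 is a perfect fit, 0 is no better than predicting the mean, negative is worse. A small numpy sketch assuming the usual definition:

import numpy as np

def explained_variance_sketch(y_pred, y_true):
    # 1 - Var[y_true - y_pred] / Var[y_true]; NaN when y_true has zero variance.
    var_y = np.var(y_true)
    return np.nan if var_y == 0 else 1.0 - np.var(y_true - y_pred) / var_y

returns = np.array([1.0, 2.0, 3.0, 4.0])
values = np.array([1.1, 1.9, 3.2, 3.8])
print(round(explained_variance_sketch(values, returns), 3))  # 0.98
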
예제 #12
0
    def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_log_name="PPO2"):
        # Transform to callable if needed
        self.learning_rate = get_schedule_fn(self.learning_rate)
        self.cliprange = get_schedule_fn(self.cliprange)

        with TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer:
            self._setup_learn(seed)
            # At this point self.n_steps = 256
            runner = Runner(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.lam)

            self.episode_reward = np.zeros((self.n_envs,))

            ep_info_buf = deque(maxlen=100)
            t_first_start = time.time()
            n_timesteps = 0
            # nupdates = total_timesteps // self.n_batch
            for timestep in range(1, total_timesteps + 1):
                assert self.n_batch % self.nminibatches == 0
                batch_size = self.n_batch // self.nminibatches
                t_start = time.time()
                frac = 1.0 - timestep / total_timesteps
                lr_now = self.learning_rate(frac)
                cliprangenow = self.cliprange(frac)
                # true_reward is the reward without discount
                obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run()
                self.n_steps = len(true_reward)
                n_timesteps += len(obs)
                #print("len(obs): {}, np.shape(obs): {}, np.shape(rewards): {}".format(len(obs), np.shape(obs), np.shape(returns)))
                #print("np.shape(actions): {}, np.shape(rewards): {}".format(np.shape(actions), np.shape(values)))
                ep_info_buf.extend(ep_infos)
                mb_loss_vals = []
                if states is None:  # nonrecurrent version
                    inds = np.arange(self.n_batch)
                    for epoch_num in range(self.noptepochs):  # repeat for noptepochs epochs
                        np.random.shuffle(inds)
                        for start in range(0, self.n_batch, batch_size):  # split the self.n_batch samples into batch_size chunks and pass each to _train_step
                            # timestep = ((update * self.noptepochs * self.n_batch + epoch_num * self.n_batch + start) //
                            #             batch_size)
                            end = start + batch_size
                            #print("batch_size: {}".format(batch_size))
                            mbinds = inds[start:end]
                            slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                            mb_loss_vals.append(self._train_step(lr_now, cliprangenow, *slices, writer=writer,
                                                                 update=n_timesteps))
                else:  # recurrent version
                    assert self.n_envs % self.nminibatches == 0
                    env_indices = np.arange(self.n_envs)
                    flat_indices = np.arange(self.n_envs * self.n_steps).reshape(self.n_envs, self.n_steps)
                    envs_per_batch = batch_size // self.n_steps
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(env_indices)
                        for start in range(0, self.n_envs, envs_per_batch):
                            # timestep = ((update * self.noptepochs * self.n_envs + epoch_num * self.n_envs + start) //
                            #             envs_per_batch)
                            end = start + envs_per_batch
                            mb_env_inds = env_indices[start:end]
                            mb_flat_inds = flat_indices[mb_env_inds].ravel()
                            slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                            mb_states = states[mb_env_inds]
                            mb_loss_vals.append(self._train_step(lr_now, cliprangenow, *slices, update=n_timesteps,
                                                                 writer=writer, states=mb_states))

                loss_vals = np.mean(mb_loss_vals, axis=0)
                t_now = time.time()
                fps = int(self.n_batch / (t_now - t_start))

                if writer is not None:
                    self.episode_reward = total_episode_reward_logger(self.episode_reward,
                                                                      true_reward.reshape((self.n_envs, self.n_steps)),
                                                                      masks.reshape((self.n_envs, self.n_steps)),
                                                                      writer, n_timesteps)

                if self.verbose >= 1 and (timestep % log_interval == 0 or timestep == 1):
                    explained_var = explained_variance(values, returns)
                    logger.logkv("total_timesteps", n_timesteps)
                    logger.logkv("fps", fps)
                    logger.logkv("explained_variance", float(explained_var))
                    logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                    logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv('time_elapsed', t_start - t_first_start)
                    for (loss_val, loss_name) in zip(loss_vals, self.loss_names):
                        logger.logkv(loss_name, loss_val)
                    logger.dumpkvs()

                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break
                if n_timesteps > total_timesteps:
                    break

            return self
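
get_schedule_fn in these listings turns a constant hyperparameter into a callable of the remaining-progress fraction, so learning_rate(frac) and cliprange(frac) can be called uniformly whether a float or a schedule function was configured. A simplified stand-in to illustrate the convention (not the library function itself):

def get_schedule_fn_sketch(value_schedule):
    # Constants become a function that ignores the progress fraction;
    # callables are returned unchanged.
    if callable(value_schedule):
        return value_schedule
    return lambda _frac: float(value_schedule)

learning_rate = get_schedule_fn_sketch(3e-4)                  # constant
cliprange = get_schedule_fn_sketch(lambda frac: 0.2 * frac)   # linearly annealed

total_timesteps = 1000
for timestep in (1, 500, 1000):
    frac = 1.0 - timestep / total_timesteps   # remaining-progress fraction, as in the loop above
    print(timestep, learning_rate(frac), round(cliprange(frac), 4))
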
예제 #13
0
    def run(self, num_steps, data_dir, policy_record=None):
        local_steps = int(num_steps / self.comm.Get_size())

        steps = 0

        # TODO: Add alpha annealing over num_steps

        while True:

            # sync weights
            self.b_agent_attack.sync_weights()
            self.b_agent_evade.sync_weights()
            self.b_agent_transit.sync_weights()
            self.m_agent.sync_weights()

            # create placeholders to store experience that we gather
            training_state = {
                "meta": [],
                "attack": [],
                "evade": [],
                "transit": []
            }
            training_action = {
                "meta": [],
                "attack": [],
                "evade": [],
                "transit": []
            }
            training_reward = {
                "meta": [],
                "attack": [],
                "evade": [],
                "transit": []
            }
            training_next_state = {
                "meta": [],
                "attack": [],
                "evade": [],
                "transit": []
            }
            training_done = {
                "meta": [],
                "attack": [],
                "evade": [],
                "transit": []
            }
            training_reward_sum_combined = 0  # keep track of combined reward over all episodes between training

            state = self.env.reset()
            reward_sum = {}
            done = False
            while not done:
                complete_action, distribution, beh_actions, label = self.m_agent.get_action(
                    state)

                next_state, reward, done, info = self.env.step(
                    complete_action, dst=distribution, label=label)

                # Aggregate reward throughout the episode
                if not reward_sum:
                    reward_sum = reward
                else:
                    reward_sum = {
                        k: reward_sum[k] + reward[k]
                        for (k, v) in reward.items()
                    }
                training_reward_sum_combined += reward["combined"]

                training_state["meta"].append(state)
                training_action["meta"].append(distribution)
                training_reward["meta"].append(reward["combined"])
                training_next_state["meta"].append(next_state)
                training_done["meta"].append(done)

                for idx, behavior in enumerate(['attack', 'evade', 'transit']):
                    training_state[behavior].append(state)
                    training_action[behavior].append(beh_actions[idx])
                    training_reward[behavior].append(reward[behavior])
                    training_next_state[behavior].append(next_state)
                    training_done[behavior].append(done)

                state = next_state

            # #now we have batches of data: compute the values and advantages
            # training_value = {"meta": None, "attack": None, "evade": None, "transit": None}
            # training_advantages = {"meta": None, "attack": None, "evade": None, "transit": None}

            # log tensorboard
            for key, value in reward_sum.items():
                logger.logkv(key, value)
            logger.dumpkvs()

            # compute advantages and values
            models = [
                self.b_agent_attack, self.b_agent_evade, self.b_agent_transit,
                self.m_agent
            ]
            for model in models:

                network = model.label

                states = training_state[network]
                actions = training_action[network]
                reward = training_reward[network]
                next_states = training_next_state[network]
                done = training_done[network]

                # Convert done bools to ints and invert
                done_int = np.invert(np.asarray(done)).astype(int)

                # Generalized advantage estimation (gets advantages to train on and value estimates)
                target, advantages = GAE(states,
                                         actions,
                                         reward,
                                         next_states,
                                         done_int,
                                         model.sample_value,
                                         T=128,
                                         y=0.99,
                                         lam=0.95,
                                         use_Q=False)

                # train this model
                dataset = Dataset(dict(ob=np.asarray(states),
                                       ac=np.asarray(actions),
                                       atarg=np.asarray(advantages),
                                       vtarg=np.asarray(target)),
                                  shuffle=True)

                for k in range(4):
                    for i, batch in enumerate(dataset.iterate_once(
                            len(states))):
                        model.train(batch["ob"], batch["ac"], batch["vtarg"],
                                    batch["atarg"], 1.0)

            print('FINISHED TRAINING EPISODE')
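
The GAE(...) helper called above is defined elsewhere; below is a rough, self-contained sketch of generalized advantage estimation with the same discount (y=0.99) and lam=0.95, using precomputed value estimates in place of the model.sample_value callable passed in the snippet:

import numpy as np

def gae_sketch(rewards, values, next_values, dones_inverted, gamma=0.99, lam=0.95):
    # Minimal generalized advantage estimation: dones_inverted[t] is 1 while the
    # episode continues and 0 at terminal steps, which cuts the bootstrap there.
    rewards = np.asarray(rewards, dtype=np.float64)
    values = np.asarray(values, dtype=np.float64)
    next_values = np.asarray(next_values, dtype=np.float64)
    dones_inverted = np.asarray(dones_inverted, dtype=np.float64)

    advantages = np.zeros_like(rewards)
    last_adv = 0.0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * next_values[t] * dones_inverted[t] - values[t]
        last_adv = delta + gamma * lam * dones_inverted[t] * last_adv
        advantages[t] = last_adv
    value_targets = advantages + values  # plays the role of "vtarg" in the Dataset above
    return value_targets, advantages
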
    def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="PPO2"):
        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer:
            self._setup_learn(seed)

            runner = Runner(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.lam)
            runner_resolution = Runner_resolution(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.lam)
            self.episode_reward = np.zeros((self.n_envs,))

            ep_info_buf = deque(maxlen=100)
            t_first_start = time.time()

            nupdates = total_timesteps // self.n_batch
            for update in range(nupdates + 1):
                assert self.n_batch % self.nminibatches == 0
                n_batch_train = self.n_batch // self.nminibatches
                t_start = time.time()
                frac = 1.0 - (update / (nupdates + 1))
                lr_now = self.learning_rate(frac)
                cliprangenow = self.cliprange(frac)
                # true_reward is the reward without discount


                action_omega = random.randint(1,21)



                for i in range(action_omega):
                    if i == 0:


                        obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run()
                        ep_info_buf.extend(ep_infos)
                        mb_loss_vals = []
                        if states is None:  # nonrecurrent version
                            inds = np.arange(self.n_batch)
                            for epoch_num in range(self.noptepochs):
                                np.random.shuffle(inds)
                                for start in range(0, self.n_batch, n_batch_train):
                                    timestep = ((update * self.noptepochs * self.n_batch + epoch_num * self.n_batch + start) //
                                                n_batch_train)
                                    end = start + n_batch_train
                                    mbinds = inds[start:end]
                                    slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                                    mb_loss_vals.append(self._train_step(lr_now, cliprangenow, *slices, writer=writer,
                                                                         update=timestep))
                        else:  # recurrent version
                            assert self.n_envs % self.nminibatches == 0
                            envinds = np.arange(self.n_envs)
                            flatinds = np.arange(self.n_envs * self.n_steps).reshape(self.n_envs, self.n_steps)
                            envsperbatch = n_batch_train // self.n_steps
                            for epoch_num in range(self.noptepochs):
                                np.random.shuffle(envinds)
                                for start in range(0, self.n_envs, envsperbatch):
                                    timestep = ((update * self.noptepochs * self.n_envs + epoch_num * self.n_envs + start) //
                                                envsperbatch)
                                    end = start + envsperbatch
                                    mb_env_inds = envinds[start:end]
                                    mb_flat_inds = flatinds[mb_env_inds].ravel()
                                    slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                                    mb_states = states[mb_env_inds]
                                    mb_loss_vals.append(self._train_step(lr_now, cliprangenow, *slices, update=timestep,
                                                                         writer=writer, states=mb_states))

                        loss_vals = np.mean(mb_loss_vals, axis=0)
                        t_now = time.time()
                        fps = int(self.n_batch / (t_now - t_start))

                        if writer is not None:
                            self.episode_reward = total_episode_reward_logger(self.episode_reward,
                                                                              true_reward.reshape((self.n_envs, self.n_steps)),
                                                                              masks.reshape((self.n_envs, self.n_steps)),
                                                                              writer, update * (self.n_batch + 1))

                        if callback is not None:
                            callback(locals(), globals())

                        if self.verbose >= 1 and ((update + 1) % (log_interval // 100) == 0 or update == 0):
                            explained_var = explained_variance(values, returns)
                            logger.logkv("serial_timesteps", (update + 1) * self.n_steps)
                            logger.logkv("nupdates", (update + 1))
                            logger.logkv("total_timesteps", (update + 1) * self.n_batch)
                            logger.logkv("fps", fps)
                            logger.logkv("explained_variance", float(explained_var))
                            logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                            logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                            logger.logkv('time_elapsed', t_start - t_first_start)
                            for (loss_val, loss_name) in zip(loss_vals, self.loss_names):
                                logger.logkv(loss_name, loss_val)
                            logger.dumpkvs()


                    elif i > 0:
                            obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner_resolution.run()
                            ep_info_buf.extend(ep_infos)
                            mb_loss_vals = []
                            print("omega",i)

                            if states is None:  # nonrecurrent version
                                inds = np.arange(self.n_batch)
                                for epoch_num in range(self.noptepochs):
                                    np.random.shuffle(inds)
                                    for start in range(0, self.n_batch, n_batch_train):
                                        timestep = ((update * self.noptepochs * self.n_batch + epoch_num * self.n_batch + start) //
                                                    n_batch_train)
                                        end = start + n_batch_train
                                        mbinds = inds[start:end]
                                        slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                                        mb_loss_vals.append(self._train_step(lr_now, cliprangenow, *slices, writer=writer,
                                                                             update=timestep))
                            else:  # recurrent version


                                assert self.n_envs % self.nminibatches == 0
                                envinds = np.arange(self.n_envs)
                                flatinds = np.arange(self.n_envs * self.n_steps).reshape(self.n_envs, self.n_steps)
                                envsperbatch = n_batch_train // self.n_steps
                                for epoch_num in range(self.noptepochs):
                                    np.random.shuffle(envinds)

                                    for start in range(0, self.n_envs, envsperbatch):
                                        timestep = ((update * self.noptepochs * self.n_envs + epoch_num * self.n_envs + start) //
                                                    envsperbatch)
                                        end = start + envsperbatch
                                        mb_env_inds = envinds[start:end]
                                        mb_flat_inds = flatinds[mb_env_inds].ravel()
                                        slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                                        mb_states = states[mb_env_inds]
                                        mb_loss_vals.append(self._train_step(lr_now, cliprangenow, *slices, update=timestep,
                                                                             writer=writer, states=mb_states))

                            loss_vals = np.mean(mb_loss_vals, axis=0)
                            t_now = time.time()
                            fps = int(self.n_batch / (t_now - t_start))

                            if writer is not None:
                                self.episode_reward = total_episode_reward_logger(self.episode_reward,
                                                                                  true_reward.reshape((self.n_envs, self.n_steps)),
                                                                                  masks.reshape((self.n_envs, self.n_steps)),
                                                                                  writer, update * (self.n_batch + 1))

                            if callback is not None:
                                callback(locals(), globals())

                            if self.verbose >= 1 and ((update + 1) % (log_interval // 100) == 0 or update == 0):
                                explained_var = explained_variance(values, returns)
                                logger.logkv("serial_timesteps", (update + 1) * self.n_steps)
                                logger.logkv("nupdates", (update + 1))
                                logger.logkv("total_timesteps", (update + 1) * self.n_batch)
                                logger.logkv("fps", fps)
                                logger.logkv("explained_variance", float(explained_var))
                                logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                                logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                                logger.logkv('time_elapsed', t_start - t_first_start)
                                for (loss_val, loss_name) in zip(loss_vals, self.loss_names):
                                    logger.logkv(loss_name, loss_val)
                                logger.dumpkvs()

            return self
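
In the recurrent branches above, flatinds[mb_env_inds].ravel() turns a choice of environments into row indices on the flattened (n_envs * n_steps) rollout, so each minibatch keeps whole per-environment sequences for the RNN state. A small standalone illustration of that indexing:

import numpy as np

n_envs, n_steps = 4, 3
flatinds = np.arange(n_envs * n_steps).reshape(n_envs, n_steps)
# Rows are environments, columns are timesteps:
# [[ 0  1  2]
#  [ 3  4  5]
#  [ 6  7  8]
#  [ 9 10 11]]
mb_env_inds = np.array([1, 3])             # pick two environments for this minibatch
mb_flat_inds = flatinds[mb_env_inds].ravel()
print(mb_flat_inds)                        # [ 3  4  5  9 10 11] -> contiguous sequence per env
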
Example #15
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=1,
              tb_log_name=None,
              reset_num_timesteps=True):
        # Transform to callable if needed
        self.learning_rate = get_schedule_fn(self.learning_rate)
        self.cliprange = get_schedule_fn(self.cliprange)
        cliprange_vf = get_schedule_fn(self.cliprange_vf)

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        tb_log_name = self.model_name

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn(seed)

            runner = Runner(env=self.env,
                            model=self,
                            n_steps=self.n_steps,
                            gamma=self.gamma,
                            lam=self.lam)
            self.episode_reward = np.zeros((self.n_envs, ))

            ep_info_buf = deque(maxlen=100)
            t_first_start = time.time()

            n_updates = total_timesteps // self.n_batch
            n_checkpoints = self.checkpoint_inc // self.n_batch
            for update in range(1, n_updates + 1):
                assert self.n_batch % self.nminibatches == 0
                batch_size = self.n_batch // self.nminibatches
                t_start = time.time()
                frac = 1.0 - (update - 1.0) / n_updates
                lr_now = self.learning_rate(frac)
                cliprange_now = self.cliprange(frac)
                cliprange_vf_now = cliprange_vf(frac)
                # true_reward is the reward without discount
                obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run(
                )
                self.num_timesteps += self.n_batch
                ep_info_buf.extend(ep_infos)
                mb_loss_vals = []
                if states is None:  # nonrecurrent version
                    update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1
                    inds = np.arange(self.n_batch)
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(inds)
                        for start in range(0, self.n_batch, batch_size):
                            timestep = self.num_timesteps // update_fac + (
                                (self.noptepochs * self.n_batch + epoch_num *
                                 self.n_batch + start) // batch_size)
                            end = start + batch_size
                            mbinds = inds[start:end]
                            slices = (arr[mbinds]
                                      for arr in (obs, returns, masks, actions,
                                                  values, neglogpacs))
                            mb_loss_vals.append(
                                self._train_step(
                                    lr_now,
                                    cliprange_now,
                                    *slices,
                                    writer=writer,
                                    update=timestep,
                                    cliprange_vf=cliprange_vf_now))
                else:  # recurrent version
                    update_fac = self.n_batch // self.nminibatches // self.noptepochs // self.n_steps + 1
                    assert self.n_envs % self.nminibatches == 0
                    env_indices = np.arange(self.n_envs)
                    flat_indices = np.arange(self.n_envs *
                                             self.n_steps).reshape(
                                                 self.n_envs, self.n_steps)
                    envs_per_batch = batch_size // self.n_steps
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(env_indices)
                        for start in range(0, self.n_envs, envs_per_batch):
                            timestep = self.num_timesteps // update_fac + (
                                (self.noptepochs * self.n_envs + epoch_num *
                                 self.n_envs + start) // envs_per_batch)
                            end = start + envs_per_batch
                            mb_env_inds = env_indices[start:end]
                            mb_flat_inds = flat_indices[mb_env_inds].ravel()
                            slices = (arr[mb_flat_inds]
                                      for arr in (obs, returns, masks, actions,
                                                  values, neglogpacs))
                            mb_states = states[mb_env_inds]
                            mb_loss_vals.append(
                                self._train_step(
                                    lr_now,
                                    cliprange_now,
                                    *slices,
                                    update=timestep,
                                    writer=writer,
                                    states=mb_states,
                                    cliprange_vf=cliprange_vf_now))

                loss_vals = np.mean(mb_loss_vals, axis=0)
                t_now = time.time()
                fps = int(self.n_batch / (t_now - t_start))

                if writer is not None:
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward,
                        true_reward.reshape((self.n_envs, self.n_steps)),
                        masks.reshape((self.n_envs, self.n_steps)), writer,
                        self.num_timesteps)

                if self.verbose >= 1 and (update % log_interval == 0
                                          or update == 1):
                    explained_var = explained_variance(values, returns)
                    logger.logkv("serial_timesteps", update * self.n_steps)
                    logger.logkv("n_updates", update)
                    logger.logkv("total_timesteps", self.num_timesteps)
                    logger.logkv("fps", fps)
                    logger.logkv("explained_variance", float(explained_var))
                    if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                        logger.logkv(
                            'ep_reward_mean',
                            safe_mean(
                                [ep_info['r'] for ep_info in ep_info_buf]))
                        logger.logkv(
                            'ep_len_mean',
                            safe_mean(
                                [ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv('time_elapsed', t_start - t_first_start)
                    for (loss_val, loss_name) in zip(loss_vals,
                                                     self.loss_names):
                        logger.logkv(loss_name, loss_val)
                    logger.dumpkvs()

                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

                for ep_info in ep_infos:
                    self.plot_l.append(ep_info['l'])
                    self.plot_r.append(ep_info['r'])
                    self.plot_t.append(ep_info['t'])

                if update % n_checkpoints == 0:
                    checkpoint = 'model-' + str(self.num_timesteps)
                    with open(self.checkpoint_log, mode='a') as employee_file:
                        employee_writer = csv.writer(employee_file)
                        employee_writer.writerow([checkpoint])
                    graph_name_csv = self.graph_name + '-' + str(
                        self.num_timesteps) + '.csv'
                    graph_name_sb = self.graph_name + '-' + str(
                        self.num_timesteps) + '.pkl'
                    self.save(graph_name_sb)
                    with open(graph_name_csv, mode='w') as employee_file:
                        employee_writer = csv.writer(employee_file,
                                                     delimiter=',',
                                                     quotechar='"',
                                                     quoting=csv.QUOTE_MINIMAL)
                        employee_writer.writerow(['r', 'l', 't'])
                        for i in range(len(self.plot_l)):
                            employee_writer.writerow([
                                self.plot_r[i], self.plot_l[i], self.plot_t[i]
                            ])
                    pydrive_util.upload_file(self.drive, graph_name_sb)
                    pydrive_util.upload_file(self.drive, graph_name_csv)
            return self
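
The annealing above is driven by frac = 1.0 - (update - 1.0) / n_updates, which walks linearly from 1.0 down toward 1/n_updates over training; the learning-rate and clip-range schedules are evaluated at that fraction each update. A small worked example, assuming a user-supplied callable schedule (passing a plain float to get_schedule_fn would instead produce a constant schedule):

n_updates = 10
learning_rate = lambda frac: 2.5e-4 * frac  # hypothetical callable schedule

for update in range(1, n_updates + 1):
    frac = 1.0 - (update - 1.0) / n_updates
    lr_now = learning_rate(frac)
    # update 1  -> frac = 1.0, lr_now = 2.5e-4
    # update 10 -> frac = 0.1, lr_now = 2.5e-5
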
Example #16
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=100,
              tb_log_name="DQN",
              reset_num_timesteps=True,
              replay_wrapper=None,
              learning_curve=False,
              test_t=None):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn(seed)

            # Create the replay buffer
            if self.prioritized_replay:
                self.replay_buffer = PrioritizedReplayBuffer(
                    self.buffer_size, alpha=self.prioritized_replay_alpha)
                if self.prioritized_replay_beta_iters is None:
                    prioritized_replay_beta_iters = total_timesteps
                else:
                    prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
                self.beta_schedule = LinearSchedule(
                    prioritized_replay_beta_iters,
                    initial_p=self.prioritized_replay_beta0,
                    final_p=1.0)
            else:
                self.replay_buffer = ReplayBuffer(self.buffer_size)
                self.beta_schedule = None

            if replay_wrapper is not None:
                assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER"
                self.replay_buffer = replay_wrapper(self.replay_buffer)

            # Create the schedule for exploration starting from 1.
            self.exploration = LinearSchedule(
                schedule_timesteps=int(self.exploration_fraction *
                                       total_timesteps),
                initial_p=1.0,
                final_p=self.exploration_final_eps)

            episode_rewards = [0.0]
            self.cumul_reward = [0.0]
            episode_successes = []
            obs = self.env.reset()
            reset = True
            self.episode_reward = np.zeros((1, ))

            # variables for test eval ##
            test_step = test_t * 3 if test_t is not None else None  # test_t defaults to None; only used when learning_curve is True
            test_results = {'sum': []}
            test_ts = []

            for _ in range(total_timesteps):

                ## Test eval period ##
                if learning_curve and _ % test_step == 0 and _ > 0:
                    print("--> Simulating test period")
                    self.env.reset()
                    test_r = 0.0
                    for i in range(test_t):
                        feasible_actions = AllocationEnv.get_feasible_actions(
                            obs["board_config"])
                        action_mask = AllocationEnv.get_action_mask(
                            feasible_actions, self.env.action_space.n)
                        action, _states = self.predict(obs, mask=action_mask)
                        action = AllocationEnv.check_action(
                            obs['board_config'], action)
                        obs, rewards, dones, info = self.env.step(action)
                        test_r += rewards

                    test_results["sum"].append(test_r)
                    test_ts.append(_)
                    self.env.reset()

                    # plot test eval progress
                    plt.plot(test_ts, test_results["sum"])
                    # plt.errorbar(iteration_cuts, results["mean"], yerr=results["std"], fmt='.k')
                    plt.xlabel("Iteration count")
                    plt.ylabel("Total (sum) test reward")
                    plt.savefig("figs/rl-learning-curve-{}.pdf".format(
                        cfg.vals['prj_name']))
                    plt.clf()
                    plt.close()

                    # write test eval progress
                    write_results = {}
                    for k, v in test_results.items():
                        write_results[k] = serialize_floats(v)

                    with open(
                            "output/rl-learning-curve-{}.json".format(
                                cfg.vals['prj_name']), 'w') as f:
                        json.dump(write_results, f)

                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break
                # Take action and update exploration to the newest value
                kwargs = {}
                if not self.param_noise:
                    update_eps = self.exploration.value(self.num_timesteps)
                    update_param_noise_threshold = 0.
                else:
                    update_eps = 0.
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = \
                        -np.log(1. - self.exploration.value(self.num_timesteps) +
                                self.exploration.value(self.num_timesteps) / float(self.env.action_space.n))
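                    # For example, with exploration eps = 0.1 and 4 discrete actions this gives
                    # -log(1 - 0.1 + 0.1 / 4) = -log(0.925) ≈ 0.078.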
                    kwargs['reset'] = reset
                    kwargs[
                        'update_param_noise_threshold'] = update_param_noise_threshold
                    kwargs['update_param_noise_scale'] = True

                feasible_actions = AllocationEnv.get_feasible_actions(
                    obs["board_config"])
                action_mask = AllocationEnv.get_action_mask(
                    feasible_actions, self.action_space.n)
                with self.sess.as_default():
                    action = self.act(State.get_vec_observation(obs)[None],
                                      update_eps=update_eps,
                                      **kwargs,
                                      mask=action_mask)[0]
                reset = False
                # CHECK IF ACTIONS IS FEASIBLE
                action = AllocationEnv.check_action(obs['board_config'],
                                                    action)
                env_action = action
                new_obs, rew, done, info = self.env.step(env_action)
                print("action: {} - reward: {} - eps: {:.4}".format(
                    action, rew, update_eps))
                print(new_obs['day_vec'])
                print(new_obs['board_config'])
                # Store transition in the replay buffer.
                self.replay_buffer.add(State.get_vec_observation(obs), action,
                                       rew, State.get_vec_observation(new_obs),
                                       float(done))
                obs = new_obs

                if writer is not None:
                    ep_rew = np.array([rew]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward, ep_rew, ep_done, writer,
                        self.num_timesteps)

                episode_rewards[-1] += rew
                self.cumul_reward.append(self.cumul_reward[-1] + rew)
                if done:
                    maybe_is_success = info.get('is_success')
                    if maybe_is_success is not None:
                        episode_successes.append(float(maybe_is_success))
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()
                    episode_rewards.append(0.0)
                    reset = True

                # Do not train if the warmup phase is not over
                # or if there are not enough samples in the replay buffer
                can_sample = self.replay_buffer.can_sample(self.batch_size)
                if can_sample and self.num_timesteps > self.learning_starts \
                    and self.num_timesteps % self.train_freq == 0:
                    # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                    if self.prioritized_replay:
                        experience = self.replay_buffer.sample(
                            self.batch_size,
                            beta=self.beta_schedule.value(self.num_timesteps))
                        (obses_t, actions, rewards, obses_tp1, dones, weights,
                         batch_idxes) = experience
                    else:
                        obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(
                            self.batch_size)
                        weights, batch_idxes = np.ones_like(rewards), None

                    if writer is not None:
                        # run loss backprop with summary, but once every 100 steps save the metadata
                        # (memory, compute time, ...)
                        if (1 + self.num_timesteps) % 100 == 0:
                            run_options = tf.RunOptions(
                                trace_level=tf.RunOptions.FULL_TRACE)
                            run_metadata = tf.RunMetadata()
                            summary, td_errors = self._train_step(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                obses_tp1,
                                dones,
                                weights,
                                sess=self.sess,
                                options=run_options,
                                run_metadata=run_metadata)
                            writer.add_run_metadata(
                                run_metadata, 'step%d' % self.num_timesteps)
                        else:
                            summary, td_errors = self._train_step(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                obses_tp1,
                                dones,
                                weights,
                                sess=self.sess)
                        writer.add_summary(summary, self.num_timesteps)
                    else:
                        _, td_errors = self._train_step(obses_t,
                                                        actions,
                                                        rewards,
                                                        obses_tp1,
                                                        obses_tp1,
                                                        dones,
                                                        weights,
                                                        sess=self.sess)

                    if self.prioritized_replay:
                        new_priorities = np.abs(
                            td_errors) + self.prioritized_replay_eps
                        self.replay_buffer.update_priorities(
                            batch_idxes, new_priorities)

                if can_sample and self.num_timesteps > self.learning_starts and \
                        self.num_timesteps % self.target_network_update_freq == 0:
                    # Update target network periodically.
                    self.update_target(sess=self.sess)

                if len(episode_rewards[-101:-1]) == 0:
                    mean_100ep_reward = -np.inf
                else:
                    mean_100ep_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                if self.verbose >= 1 and done and log_interval is not None and len(
                        episode_rewards) % log_interval == 0:
                    logger.record_tabular("steps", self.num_timesteps)
                    logger.record_tabular("episodes", num_episodes)
                    if len(episode_successes) > 0:
                        logger.logkv("success rate",
                                     np.mean(episode_successes[-100:]))
                    logger.record_tabular("mean 100 episode reward",
                                          mean_100ep_reward)
                    logger.record_tabular(
                        "% time spent exploring",
                        int(100 * self.exploration.value(self.num_timesteps)))
                    logger.dump_tabular()
                print('timestamp: {}'.format(self.num_timesteps), end='\r\n')
                self.num_timesteps += 1

        return self
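
The exploration schedule created near the top of this example anneals epsilon linearly over exploration_fraction * total_timesteps steps and then holds it at exploration_final_eps; self.exploration.value(self.num_timesteps) reads it each step. A minimal sketch of that schedule's semantics (not the library class itself):

def linear_schedule_value(step, schedule_timesteps, initial_p=1.0, final_p=0.02):
    # Fraction of the annealing window consumed so far, capped at 1.0.
    fraction = min(float(step) / schedule_timesteps, 1.0)
    return initial_p + fraction * (final_p - initial_p)

print(linear_schedule_value(0, 10000))      # 1.0
print(linear_schedule_value(5000, 10000))   # 0.51
print(linear_schedule_value(20000, 10000))  # 0.02
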
Example #17
def test_hard(tmpdir):
    logger.configure(tmpdir)

    # Part One: Test logging outside of the accumulating scope, and within scopes
    # with two different logging keys (including a repeat).

    sb_logger.logkv("no_context", 1)

    with logger.accumulate_means("disc"):
        sb_logger.logkv("C", 2)
        sb_logger.logkv("D", 2)
        sb_logger.dumpkvs()
        sb_logger.logkv("C", 4)
        sb_logger.dumpkvs()

    with logger.accumulate_means("gen"):
        sb_logger.logkv("E", 2)
        sb_logger.dumpkvs()
        sb_logger.logkv("E", 0)
        sb_logger.dumpkvs()

    with logger.accumulate_means("disc"):
        sb_logger.logkv("C", 3)
        sb_logger.dumpkvs()

    sb_logger.dumpkvs()  # Writes 1 mean each from "gen" and "disc".

    expect_raw_gen = {"raw/gen/E": [2, 0]}
    expect_raw_disc = {
        "raw/disc/C": [2, 4, 3],
        "raw/disc/D": [2, '', ''],
    }
    expect_default = {
        "mean/gen/E": [1],
        "mean/disc/C": [3],
        "mean/disc/D": [2],
        "no_context": [1],
    }

    _compare_csv_lines(osp.join(tmpdir, "progress.csv"), expect_default)
    _compare_csv_lines(osp.join(tmpdir, "raw", "gen", "progress.csv"),
                       expect_raw_gen)
    _compare_csv_lines(osp.join(tmpdir, "raw", "disc", "progress.csv"),
                       expect_raw_disc)

    # Part Two:
    # Check that we append to the same logs after the first dump to "means/*".

    with logger.accumulate_means("disc"):
        sb_logger.logkv("D", 100)
        sb_logger.dumpkvs()

    sb_logger.logkv("no_context", 2)

    sb_logger.dumpkvs()  # Writes 1 mean from "disc". "gen" is blank.

    expect_raw_gen = {"raw/gen/E": [2, 0]}
    expect_raw_disc = {
        "raw/disc/C": [2, 4, 3, ''],
        "raw/disc/D": [2, '', '', 100],
    }
    expect_default = {
        "mean/gen/E": [1, ''],
        "mean/disc/C": [3, ''],
        "mean/disc/D": [2, 100],
        "no_context": [1, 2],
    }

    _compare_csv_lines(osp.join(tmpdir, "progress.csv"), expect_default)
    _compare_csv_lines(osp.join(tmpdir, "raw", "gen", "progress.csv"),
                       expect_raw_gen)
    _compare_csv_lines(osp.join(tmpdir, "raw", "disc", "progress.csv"),
                       expect_raw_disc)
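
The _compare_csv_lines helper used by this test is not shown in the excerpt; the following is a hypothetical sketch of what it is expected to do (compare each expected column, row by row, against the written progress.csv), purely for illustration:

import csv

def _compare_csv_lines_sketch(csv_path, expected):
    # Hypothetical helper: every expected column must match the CSV rows in order.
    with open(csv_path) as f:
        rows = list(csv.DictReader(f))
    for key, expected_values in expected.items():
        actual = [row.get(key, '') for row in rows]
        assert actual == [str(v) for v in expected_values], (key, actual, expected_values)
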
Example #18
    def learn(self, total_timesteps, callback=None, seed=None,
              log_interval=2000, tb_log_name="SAC", reset_num_timesteps=True, replay_wrapper=None):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        if replay_wrapper is not None:
            self.replay_buffer = replay_wrapper(self.replay_buffer)

        with SetVerbosity(self.verbose):#, TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                #as writer:

            #self._setup_learn(seed)

            # Transform to callable if needed
            self.learning_rate = get_schedule_fn(self.learning_rate)
            # Initial learning rate
            current_lr = self.learning_rate(1)
            frac = 0
            t_k = 0

            start_time = time.time()
            episode_rewards = [0.0]
            episode_successes = []
            if self.action_noise is not None:
                self.action_noise.reset()
            obs = self.env.reset()
            self.episode_reward = np.zeros((1,))
            ep_info_buf = deque(maxlen=100)
            n_updates = 0
            infos_values = []
            #add = 0

            for step in range(total_timesteps):
                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

                # Before training starts, randomly sample actions
                # from a uniform distribution for better exploration.
                # Afterwards, use the learned policy
                # if random_exploration is set to 0 (normal setting)
                if (self.num_timesteps < self.learning_starts
                    or np.random.rand() < self.random_exploration):
                    # No need to rescale when sampling random action
                    rescaled_action = action = self.env.action_space.sample()
                else:
                    action = self.policy_tf.step(obs[None], deterministic=True).flatten()
                    # Add noise to the action (improve exploration,
                    # not needed in general)
                    if self.action_noise is not None:
                        action = np.clip(action + self.action_noise(), -1, 1)
                    # Rescale from [-1, 1] to the correct bounds
                    rescaled_action = action * np.abs(self.action_space.low)

                assert action.shape == self.env.action_space.shape

                #print("action", action)
                new_obs, reward, done, info = self.env.step(rescaled_action)
                #print("new obs", new_obs)

                # Store transition in the replay buffer.
                #if add < 1:
                    #for _ in range(self.batch_size):
                self.replay_buffer.add(obs, action, reward, new_obs, float(done))
                    #add += 1
                obs = new_obs

                # Retrieve reward and episode length if using Monitor wrapper
                maybe_ep_info = info.get('episode')
                if maybe_ep_info is not None:
                    ep_info_buf.extend([maybe_ep_info])

                #if writer is not None:
                    # Write reward per episode to tensorboard
                    #ep_reward = np.array([reward]).reshape((1, -1))
                    #ep_done = np.array([done]).reshape((1, -1))
                    #self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_reward,
                    #                                                  ep_done, writer, self.num_timesteps)

                if step % self.train_freq == 0:
                    mb_infos_vals = []
                    # Update policy, critics and target networks
                    for grad_step in range(1):
                        # Break if the warmup phase is not over
                        # or if there are not enough samples in the replay buffer
                        if not self.replay_buffer.can_sample(self.batch_size) \
                            or self.num_timesteps < self.learning_starts:
                            break
                        #print("training now........")
                        n_updates += 1
                        # Compute current learning_rate
                        t_k = self.klconst #* np.sqrt(step / total_timesteps)
                        frac = 1.0 - step / total_timesteps
                        current_lr = self.learning_rate(frac)
                        # Update policy and critics (q functions)
                        #t = time.process_time()
                        mb_infos_vals.append(self._train_step(step, current_lr, t_k))
                        #print("time taken", time.process_time() - t)
                        #print("n updates", n_updates)
                        #if n_updates > 10:
                        #    break
                        # Update target network
                        if (step + grad_step) % self.target_update_interval == 0:
                            # Update target network
                            self.sess.run(self.target_update_op)
                    # Log losses and entropy, useful for monitor training
                    if len(mb_infos_vals) > 0:
                        infos_values = np.mean(mb_infos_vals, axis=0)

                if step % self.gradient_steps == 0:
                    self.sess.run(self.assign_policy_op)

                episode_rewards[-1] += reward
                if done:
                    if self.action_noise is not None:
                        self.action_noise.reset()
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()
                    episode_rewards.append(0.0)

                    maybe_is_success = info.get('is_success')
                    if maybe_is_success is not None:
                        episode_successes.append(float(maybe_is_success))

                if len(episode_rewards[-101:-1]) == 0:
                    mean_reward = -np.inf
                else:
                    mean_reward = round(float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                self.num_timesteps += 1
                # Display training infos
                if self.verbose >= 1 and step % log_interval == 0: #done and log_interval is not None and len(episode_rewards) % log_interval == 0:
                    fps = int(step / (time.time() - start_time))
                    logger.logkv("episodes", num_episodes)
                    logger.logkv("mean 100 episode reward", mean_reward)
                    if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                        logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                        logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv("n_updates", n_updates)
                    logger.logkv("current_lr", current_lr)
                    logger.logkv("ent_coef", 1-frac)
                    logger.logkv("fps", fps)
                    logger.logkv('time_elapsed', int(time.time() - start_time))
                    if len(episode_successes) > 0:
                        logger.logkv("success rate", np.mean(episode_successes[-100:]))
                    if len(infos_values) > 0:
                        for (name, val) in zip(self.infos_names, infos_values):
                            logger.logkv(name, val)
                    logger.logkv("total timesteps", self.num_timesteps)
                    logger.logkv("frac", frac)
                    logger.logkv("t_k", t_k)
                    logger.logkv("steps", step)
                    logger.dumpkvs()
                    # Reset infos:
                    infos_values = []
            return self
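
Note that rescaled_action = action * np.abs(self.action_space.low) above only maps the policy's [-1, 1] output onto the environment bounds when the Box space is symmetric around zero (low = -high). A sketch of a more general rescaling for arbitrary bounds, for comparison:

import numpy as np

def rescale_action(action, low, high):
    # Map a policy output in [-1, 1] to [low, high] element-wise.
    action = np.clip(action, -1.0, 1.0)
    return low + 0.5 * (action + 1.0) * (high - low)

# With symmetric bounds this reduces to action * high == action * np.abs(low):
print(rescale_action(np.array([0.5]), low=np.array([-2.0]), high=np.array([2.0])))  # [1.]
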
Example #19
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=4,
              tb_log_name="SAC",
              reset_num_timesteps=True):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:

            self._setup_learn(seed)

            # Transform to callable if needed
            self.learning_rate = get_schedule_fn(self.learning_rate)
            # Initial learning rate
            current_lr = self.learning_rate(1)

            start_time = time.time()
            episode_rewards = [0.0]
            obs = self.env.reset()
            self.episode_reward = np.zeros((1, ))
            ep_info_buf = deque(maxlen=100)
            n_updates = 0
            infos_values = []

            for step in range(total_timesteps):
                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

                # Before training starts, randomly sample actions
                # from a uniform distribution for better exploration.
                # Afterwards, use the learned policy.
                if self.num_timesteps < self.learning_starts:
                    action = self.env.action_space.sample()
                    # No need to rescale when sampling random action
                    rescaled_action = action
                else:
                    action = self.policy_tf.step(
                        obs, deterministic=False).flatten()

                    # Rescale from [-1, 1] to the correct bounds
                    rescaled_action = action * np.abs(self.action_space.low)

                assert action.shape == self.env.action_space.shape

                new_obs, reward, done, info = self.env.step(rescaled_action)

                # Store transition in the replay buffer.
                self.replay_buffer.add(obs, action, reward, new_obs,
                                       float(done))
                obs = new_obs

                # Retrieve reward and episode length if using Monitor wrapper
                maybe_ep_info = info.get('episode')
                if maybe_ep_info is not None:
                    ep_info_buf.extend([maybe_ep_info])

                if writer is not None:
                    # Write reward per episode to tensorboard
                    ep_reward = np.array([reward]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward, ep_reward, ep_done, writer,
                        self.num_timesteps)

                if step % self.train_freq == 0:
                    mb_infos_vals = []
                    # Update policy, critics and target networks
                    for grad_step in range(self.gradient_steps):
                        if self.num_timesteps < self.batch_size or self.num_timesteps < self.learning_starts:
                            break
                        n_updates += 1
                        # Compute current learning_rate
                        frac = 1.0 - step / total_timesteps
                        current_lr = self.learning_rate(frac)
                        # Update policy and critics (q functions)
                        mb_infos_vals.append(
                            self._train_step(step, writer, current_lr))
                        # Update target network
                        if (step +
                                grad_step) % self.target_update_interval == 0:
                            # Update target network
                            self.sess.run(self.target_update_op)
                    # Log losses and entropy, useful for monitor training
                    if len(mb_infos_vals) > 0:
                        infos_values = np.mean(mb_infos_vals, axis=0)

                episode_rewards[-1] += reward
                if done:
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()
                    episode_rewards.append(0.0)

                if len(episode_rewards[-101:-1]) == 0:
                    mean_reward = -np.inf
                else:
                    mean_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                self.num_timesteps += 1
                # Display training infos
                if self.verbose >= 1 and done and log_interval is not None and len(
                        episode_rewards) % log_interval == 0:
                    fps = int(step / (time.time() - start_time))
                    logger.logkv("episodes", num_episodes)
                    logger.logkv("mean 100 episode reward", mean_reward)
                    logger.logkv(
                        'ep_rewmean',
                        safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                    logger.logkv(
                        'eplenmean',
                        safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv("n_updates", n_updates)
                    logger.logkv("current_lr", current_lr)
                    logger.logkv("fps", fps)
                    logger.logkv('time_elapsed', int(time.time() - start_time))
                    if len(infos_values) > 0:
                        for (name, val) in zip(self.infos_names, infos_values):
                            logger.logkv(name, val)
                    logger.logkv("total timesteps", self.num_timesteps)
                    logger.dumpkvs()
                    # Reset infos:
                    infos_values = []
            return self
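
self.sess.run(self.target_update_op) above runs the periodic target-network update; the op itself is constructed elsewhere in the class. As a rough numpy sketch of the soft (Polyak) update commonly used for SAC target parameters, under the assumption that the update op follows that convention:

import numpy as np

def soft_update(target_params, source_params, tau=0.005):
    # Polyak averaging: target <- (1 - tau) * target + tau * source, per parameter tensor.
    return [(1.0 - tau) * t + tau * s for t, s in zip(target_params, source_params)]

target = [np.zeros(3)]
source = [np.ones(3)]
target = soft_update(target, source)
print(target[0])  # [0.005 0.005 0.005]
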
Example #20
    def learn(self,
              total_timesteps,
              callback=None,
              log_interval=1,
              tb_log_name="PPO2",
              reset_num_timesteps=True):
        # Transform to callable if needed
        self.learning_rate = get_schedule_fn(self.learning_rate)
        self.cliprange = get_schedule_fn(self.cliprange)
        cliprange_vf = get_schedule_fn(self.cliprange_vf)

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)
        callback = self._init_callback(callback)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn()
            t_first_start = time.time()
            n_updates = total_timesteps // self.n_batch

            callback.on_training_start(locals(), globals())

            for update in range(1, n_updates + 1):
                assert self.n_batch % self.nminibatches == 0, (
                    "The number of minibatches (`nminibatches`) "
                    "is not a factor of the total number of samples "
                    "collected per rollout (`n_batch`), "
                    "some samples won't be used.")
                batch_size = self.n_batch // self.nminibatches
                t_start = time.time()
                frac = 1.0 - (update - 1.0) / n_updates
                lr_now = self.learning_rate(frac)
                cliprange_now = self.cliprange(frac)
                cliprange_vf_now = cliprange_vf(frac)

                callback.on_rollout_start()
                # true_reward is the reward without discount
                rollout = self.runner.run(callback)
                # Unpack
                obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = rollout

                # Early stopping due to the callback
                if not self.runner.continue_training:
                    break

                self.ep_info_buf.extend(ep_infos)
                mb_loss_vals = []
                if states is None:  # nonrecurrent version
                    update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1
                    inds = np.arange(self.n_batch)
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(inds)
                        for start in range(0, self.n_batch, batch_size):
                            timestep = self.num_timesteps // update_fac + (
                                (self.noptepochs * self.n_batch + epoch_num *
                                 self.n_batch + start) // batch_size)
                            end = start + batch_size
                            mbinds = inds[start:end]
                            slices = (arr[mbinds]
                                      for arr in (obs, returns, masks, actions,
                                                  values, neglogpacs))
                            mb_loss_vals.append(
                                self._train_step(
                                    lr_now,
                                    cliprange_now,
                                    *slices,
                                    writer=writer,
                                    update=timestep,
                                    cliprange_vf=cliprange_vf_now))
                else:  # recurrent version
                    update_fac = self.n_batch // self.nminibatches // self.noptepochs // self.n_steps + 1
                    assert self.n_envs % self.nminibatches == 0
                    env_indices = np.arange(self.n_envs)
                    flat_indices = np.arange(self.n_envs *
                                             self.n_steps).reshape(
                                                 self.n_envs, self.n_steps)
                    envs_per_batch = batch_size // self.n_steps
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(env_indices)
                        for start in range(0, self.n_envs, envs_per_batch):
                            timestep = self.num_timesteps // update_fac + (
                                (self.noptepochs * self.n_envs + epoch_num *
                                 self.n_envs + start) // envs_per_batch)
                            end = start + envs_per_batch
                            mb_env_inds = env_indices[start:end]
                            mb_flat_inds = flat_indices[mb_env_inds].ravel()
                            slices = (arr[mb_flat_inds]
                                      for arr in (obs, returns, masks, actions,
                                                  values, neglogpacs))
                            mb_states = states[mb_env_inds]
                            mb_loss_vals.append(
                                self._train_step(
                                    lr_now,
                                    cliprange_now,
                                    *slices,
                                    update=timestep,
                                    writer=writer,
                                    states=mb_states,
                                    cliprange_vf=cliprange_vf_now))

                loss_vals = np.mean(mb_loss_vals, axis=0)
                t_now = time.time()
                fps = int(self.n_batch / (t_now - t_start))

                if writer is not None:
                    total_episode_reward_logger(
                        self.episode_reward,
                        true_reward.reshape((self.n_envs, self.n_steps)),
                        masks.reshape((self.n_envs, self.n_steps)), writer,
                        self.num_timesteps)

                if self.verbose >= 1 and (update % log_interval == 0
                                          or update == 1):
                    explained_var = explained_variance(values, returns)
                    logger.logkv("serial_timesteps", update * self.n_steps)
                    logger.logkv("n_updates", update)
                    logger.logkv("total_timesteps", self.num_timesteps)
                    logger.logkv("fps", fps)
                    logger.logkv("explained_variance", float(explained_var))
                    if len(self.ep_info_buf) > 0 and len(
                            self.ep_info_buf[0]) > 0:
                        logger.logkv(
                            'ep_reward_mean',
                            safe_mean([
                                ep_info['r'] for ep_info in self.ep_info_buf
                            ]))
                        logger.logkv(
                            'ep_len_mean',
                            safe_mean([
                                ep_info['l'] for ep_info in self.ep_info_buf
                            ]))
                    logger.logkv('time_elapsed', t_start - t_first_start)
                    for (loss_val, loss_name) in zip(loss_vals,
                                                     self.loss_names):
                        logger.logkv(loss_name, loss_val)
                    logger.dumpkvs()

            callback.on_training_end()
            return self
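
A minimal usage sketch for a learn() with this PPO2-style minibatch/epoch loop, assuming the surrounding class is the standard stable-baselines PPO2; the environment id, hyperparameters and timestep budget below are placeholders.

import gym
from stable_baselines import PPO2
from stable_baselines.common.vec_env import DummyVecEnv

# Placeholder environment; any vectorized Gym env works here.
env = DummyVecEnv([lambda: gym.make("CartPole-v1")])
model = PPO2("MlpPolicy", env, n_steps=128, nminibatches=4, noptepochs=4, verbose=1)
# log_interval controls how often the logkv/dumpkvs block above prints its stats.
model.learn(total_timesteps=100000, log_interval=10)
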
Example #21
0
    def learn(self,
              total_timesteps,
              callback=None,
              log_interval=100,
              tb_log_name="A2C",
              reset_num_timesteps=True):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)
        callback = self._init_callback(callback)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn()
            self.learning_rate_schedule = Scheduler(
                initial_value=self.learning_rate,
                n_values=total_timesteps,
                schedule=self.lr_schedule)

            t_start = time.time()
            callback.on_training_start(locals(), globals())

            for update in range(1, total_timesteps // self.n_batch + 1):

                callback.on_rollout_start()
                # true_reward is the reward without discount
                rollout = self.runner.run(callback)
                # unpack
                obs, states, rewards, masks, actions, values, ep_infos, true_reward = rollout

                callback.on_rollout_end()

                # Early stopping due to the callback
                if not self.runner.continue_training:
                    break

                self.ep_info_buf.extend(ep_infos)
                _, value_loss, policy_entropy = self._train_step(
                    obs, states, rewards, masks, actions, values,
                    self.num_timesteps // self.n_batch, writer)
                n_seconds = time.time() - t_start
                fps = int((update * self.n_batch) / n_seconds)

                if writer is not None:
                    total_episode_reward_logger(
                        self.episode_reward,
                        true_reward.reshape((self.n_envs, self.n_steps)),
                        masks.reshape((self.n_envs, self.n_steps)), writer,
                        self.num_timesteps)

                if self.verbose >= 1 and (update % log_interval == 0
                                          or update == 1):
                    explained_var = explained_variance(values, rewards)
                    logger.record_tabular("nupdates", update)
                    logger.record_tabular("total_timesteps",
                                          self.num_timesteps)
                    logger.record_tabular("fps", fps)
                    logger.record_tabular("policy_entropy",
                                          float(policy_entropy))
                    logger.record_tabular("value_loss", float(value_loss))
                    logger.record_tabular("explained_variance",
                                          float(explained_var))
                    if len(self.ep_info_buf) > 0 and len(
                            self.ep_info_buf[0]) > 0:
                        logger.logkv(
                            'ep_reward_mean',
                            safe_mean([
                                ep_info['r'] for ep_info in self.ep_info_buf
                            ]))
                        logger.logkv(
                            'ep_len_mean',
                            safe_mean([
                                ep_info['l'] for ep_info in self.ep_info_buf
                            ]))
                    logger.dump_tabular()

        callback.on_training_end()
        return self
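
A hedged usage sketch for the A2C learn() above, assuming the standard stable-baselines A2C class; the env id, tensorboard directory and timestep budget are placeholders.

from stable_baselines import A2C

# Passing a tensorboard_log path is what makes `writer` non-None in the loop above.
model = A2C("MlpPolicy", "CartPole-v1", n_steps=5, verbose=1, tensorboard_log="./a2c_tensorboard/")
model.learn(total_timesteps=50000, log_interval=100, tb_log_name="A2C")
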
Example #22
0
    def learn(self,
              total_timesteps,
              callback=None,
              log_interval=4,
              tb_log_name="TD3",
              reset_num_timesteps=True,
              replay_wrapper=None):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)
        callback = self._init_callback(callback)

        if replay_wrapper is not None:
            self.replay_buffer = replay_wrapper(self.replay_buffer)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:

            self._setup_learn()

            # Transform to callable if needed
            self.learning_rate = get_schedule_fn(self.learning_rate)
            # Initial learning rate
            current_lr = self.learning_rate(1)

            start_time = time.time()
            episode_rewards = [0.0]
            episode_successes = []
            if self.action_noise is not None:
                self.action_noise.reset()
            obs = self.env.reset()
            n_updates = 0
            infos_values = []

            callback.on_training_start(locals(), globals())
            callback.on_rollout_start()

            for step in range(total_timesteps):
                # Before training starts, randomly sample actions
                # from a uniform distribution for better exploration.
                # Afterwards, use the learned policy
                # if random_exploration is set to 0 (normal setting)
                if self.num_timesteps < self.learning_starts or np.random.rand(
                ) < self.random_exploration:
                    # Actions sampled from the action space are in the environment-specific range,
                    # but the algorithm operates on tanh-squashed actions in [-1, 1], so rescale them
                    unscaled_action = self.env.action_space.sample()
                    action = scale_action(self.action_space, unscaled_action)
                else:
                    action = self.policy_tf.step(obs[None]).flatten()
                    # Add noise to the action; the policy is deterministic,
                    # so noise is required for exploration
                    if self.action_noise is not None:
                        action = np.clip(action + self.action_noise(), -1, 1)
                    # Rescale from [-1, 1] to the correct bounds
                    unscaled_action = unscale_action(self.action_space, action)

                assert action.shape == self.env.action_space.shape

                new_obs, reward, done, info = self.env.step(unscaled_action)

                self.num_timesteps += 1

                # Only stop training if return value is False, not when it is None. This is for backwards
                # compatibility with callbacks that have no return statement.
                if callback.on_step() is False:
                    break

                # Store transition in the replay buffer.
                self.replay_buffer.add(obs, action, reward, new_obs,
                                       float(done))
                obs = new_obs

                # Retrieve reward and episode length if using Monitor wrapper
                maybe_ep_info = info.get('episode')
                if maybe_ep_info is not None:
                    self.ep_info_buf.extend([maybe_ep_info])

                if writer is not None:
                    # Write reward per episode to tensorboard
                    ep_reward = np.array([reward]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    tf_util.total_episode_reward_logger(
                        self.episode_reward, ep_reward, ep_done, writer,
                        self.num_timesteps)

                if step % self.train_freq == 0:
                    callback.on_rollout_end()

                    mb_infos_vals = []
                    # Update policy, critics and target networks
                    for grad_step in range(self.gradient_steps):
                        # Break if the warmup phase is not over
                        # or if there are not enough samples in the replay buffer
                        if not self.replay_buffer.can_sample(self.batch_size) \
                                or self.num_timesteps < self.learning_starts:
                            break
                        n_updates += 1
                        # Compute current learning_rate
                        frac = 1.0 - step / total_timesteps
                        current_lr = self.learning_rate(frac)
                        # Update policy and critics (q functions)
                        # Note: the policy is updated less frequently than the Q functions
                        # this is controlled by the `policy_delay` parameter
                        mb_infos_vals.append(
                            self._train_step(step, writer, current_lr,
                                             (step + grad_step) %
                                             self.policy_delay == 0))

                    # Log losses and entropy, useful for monitoring training
                    if len(mb_infos_vals) > 0:
                        infos_values = np.mean(mb_infos_vals, axis=0)

                    callback.on_rollout_start()

                episode_rewards[-1] += reward
                if done:
                    if self.action_noise is not None:
                        self.action_noise.reset()
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()
                    episode_rewards.append(0.0)

                    maybe_is_success = info.get('is_success')
                    if maybe_is_success is not None:
                        episode_successes.append(float(maybe_is_success))

                if len(episode_rewards[-101:-1]) == 0:
                    mean_reward = -np.inf
                else:
                    mean_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                # Display training infos
                if self.verbose >= 1 and done and log_interval is not None and len(
                        episode_rewards) % log_interval == 0:
                    fps = int(step / (time.time() - start_time))
                    logger.logkv("episodes", num_episodes)
                    logger.logkv("mean 100 episode reward", mean_reward)
                    if len(self.ep_info_buf) > 0 and len(
                            self.ep_info_buf[0]) > 0:
                        logger.logkv(
                            'ep_rewmean',
                            safe_mean([
                                ep_info['r'] for ep_info in self.ep_info_buf
                            ]))
                        logger.logkv(
                            'eplenmean',
                            safe_mean([
                                ep_info['l'] for ep_info in self.ep_info_buf
                            ]))
                    logger.logkv("n_updates", n_updates)
                    logger.logkv("current_lr", current_lr)
                    logger.logkv("fps", fps)
                    logger.logkv('time_elapsed', int(time.time() - start_time))
                    if len(episode_successes) > 0:
                        logger.logkv("success rate",
                                     np.mean(episode_successes[-100:]))
                    if len(infos_values) > 0:
                        for (name, val) in zip(self.infos_names, infos_values):
                            logger.logkv(name, val)
                    logger.logkv("total timesteps", self.num_timesteps)
                    logger.dumpkvs()
                    # Reset infos:
                    infos_values = []

            callback.on_training_end()
            return self
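
A hedged usage sketch for the TD3 learn() above, assuming the standard stable-baselines TD3 class; the environment, noise scale and hyperparameters are placeholders chosen only to exercise the warmup, exploration-noise and gradient-step logic shown.

import gym
import numpy as np
from stable_baselines import TD3
from stable_baselines.td3.policies import MlpPolicy
from stable_baselines.common.noise import NormalActionNoise

env = gym.make("Pendulum-v0")  # placeholder continuous-action env
n_actions = env.action_space.shape[-1]
# Gaussian exploration noise; the policy itself is deterministic (see the comment above).
action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
model = TD3(MlpPolicy, env, action_noise=action_noise, learning_starts=1000,
            train_freq=100, gradient_steps=100, batch_size=128, verbose=1)
model.learn(total_timesteps=50000, log_interval=4)
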
Example #23
0
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=100,
              tb_log_name="ACKTR",
              reset_num_timesteps=True):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn(seed)
            self.n_batch = self.n_envs * self.n_steps

            self.learning_rate_schedule = Scheduler(
                initial_value=self.learning_rate,
                n_values=total_timesteps,
                schedule=self.lr_schedule)

            # The FIFO queue of the q_runner thread is closed at the end of the learn function.
            # As a result, it needs to be redefined at every call
            with self.graph.as_default():
                with tf.variable_scope(
                        "kfac_apply",
                        reuse=self.trained,
                        custom_getter=tf_util.outer_scope_getter(
                            "kfac_apply")):
                    # Some of the variables are not in a scope when they are created,
                    # so we make a note of any previously uninitialized variables
                    tf_vars = tf.global_variables()
                    is_uninitialized = self.sess.run(
                        [tf.is_variable_initialized(var) for var in tf_vars])
                    old_uninitialized_vars = [
                        v for (v, f) in zip(tf_vars, is_uninitialized) if not f
                    ]

                    self.train_op, self.q_runner = self.optim.apply_gradients(
                        list(zip(self.grads_check, self.params)))

                    # then we check for new uninitialized variables and initialize them
                    tf_vars = tf.global_variables()
                    is_uninitialized = self.sess.run(
                        [tf.is_variable_initialized(var) for var in tf_vars])
                    new_uninitialized_vars = [
                        v for (v, f) in zip(tf_vars, is_uninitialized)
                        if not f and v not in old_uninitialized_vars
                    ]

                    if len(new_uninitialized_vars) != 0:
                        self.sess.run(
                            tf.variables_initializer(new_uninitialized_vars))

            self.trained = True

            runner = A2CRunner(self.env,
                               self,
                               n_steps=self.n_steps,
                               gamma=self.gamma)
            self.episode_reward = np.zeros((self.n_envs, ))

            t_start = time.time()
            coord = tf.train.Coordinator()
            if self.q_runner is not None:
                enqueue_threads = self.q_runner.create_threads(self.sess,
                                                               coord=coord,
                                                               start=True)
            else:
                enqueue_threads = []

            # Training stats (when using Monitor wrapper)
            ep_info_buf = deque(maxlen=100)

            for update in range(1, total_timesteps // self.n_batch + 1):
                # true_reward is the reward without discount
                obs, states, rewards, masks, actions, values, ep_infos, true_reward = runner.run(
                )
                ep_info_buf.extend(ep_infos)
                policy_loss, value_loss, policy_entropy = self._train_step(
                    obs, states, rewards, masks, actions, values,
                    self.num_timesteps // (self.n_batch + 1), writer)
                n_seconds = time.time() - t_start
                fps = int((update * self.n_batch) / n_seconds)

                if writer is not None:
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward,
                        true_reward.reshape((self.n_envs, self.n_steps)),
                        masks.reshape((self.n_envs, self.n_steps)), writer,
                        self.num_timesteps)

                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

                if self.verbose >= 1 and (update % log_interval == 0
                                          or update == 1):
                    explained_var = explained_variance(values, rewards)
                    logger.record_tabular("nupdates", update)
                    logger.record_tabular("total_timesteps",
                                          self.num_timesteps)
                    logger.record_tabular("fps", fps)
                    logger.record_tabular("policy_entropy",
                                          float(policy_entropy))
                    logger.record_tabular("policy_loss", float(policy_loss))
                    logger.record_tabular("value_loss", float(value_loss))
                    logger.record_tabular("explained_variance",
                                          float(explained_var))
                    if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                        logger.logkv(
                            'ep_reward_mean',
                            safe_mean(
                                [ep_info['r'] for ep_info in ep_info_buf]))
                        logger.logkv(
                            'ep_len_mean',
                            safe_mean(
                                [ep_info['l'] for ep_info in ep_info_buf]))
                    logger.dump_tabular()

                self.num_timesteps += self.n_batch + 1

            coord.request_stop()
            coord.join(enqueue_threads)

        return self
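
A hedged usage sketch for this ACKTR learn(), assuming an older stable-baselines release whose learn() still accepts a seed argument as in the signature above; the env id and hyperparameters are placeholders.

import gym
from stable_baselines import ACKTR
from stable_baselines.common.vec_env import DummyVecEnv

env = DummyVecEnv([lambda: gym.make("CartPole-v1") for _ in range(4)])  # placeholder envs
model = ACKTR("MlpPolicy", env, n_steps=20, verbose=1)
# seed is forwarded to _setup_learn(seed) in this older API.
model.learn(total_timesteps=200000, seed=0, log_interval=100)
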
Example #24
0
    def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="ACKTR",
              reset_num_timesteps=True):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)
        callback = self._init_callback(callback)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn()
            self.n_batch = self.n_envs * self.n_steps

            self.learning_rate_schedule = Scheduler(initial_value=self.learning_rate, n_values=total_timesteps,
                                                    schedule=self.lr_schedule)

            # The FIFO queue of the q_runner thread is closed at the end of the learn function.
            # As a result, it needs to be redefined at every call
            with self.graph.as_default():
                with tf.compat.v1.variable_scope("kfac_apply", reuse=self.trained,
                                       custom_getter=tf_util.outer_scope_getter("kfac_apply")):
                    # Some of the variables are not in a scope when they are created,
                    # so we make a note of any previously uninitialized variables
                    tf_vars = tf.compat.v1.global_variables()
                    is_uninitialized = self.sess.run([tf.compat.v1.is_variable_initialized(var) for var in tf_vars])
                    old_uninitialized_vars = [v for (v, f) in zip(tf_vars, is_uninitialized) if not f]

                    self.train_op, self.q_runner = self.optim.apply_gradients(list(zip(self.grads_check, self.params)))

                    # then we check for new uninitialized variables and initialize them
                    tf_vars = tf.compat.v1.global_variables()
                    is_uninitialized = self.sess.run([tf.compat.v1.is_variable_initialized(var) for var in tf_vars])
                    new_uninitialized_vars = [v for (v, f) in zip(tf_vars, is_uninitialized)
                                              if not f and v not in old_uninitialized_vars]

                    if len(new_uninitialized_vars) != 0:
                        self.sess.run(tf.compat.v1.variables_initializer(new_uninitialized_vars))

            self.trained = True

            t_start = time.time()
            coord = tf.train.Coordinator()
            if self.q_runner is not None:
                enqueue_threads = self.q_runner.create_threads(self.sess, coord=coord, start=True)
            else:
                enqueue_threads = []

            callback.on_training_start(locals(), globals())

            for update in range(1, total_timesteps // self.n_batch + 1):

                callback.on_rollout_start()

                # pytype:disable=bad-unpacking
                # true_reward is the reward without discount
                if isinstance(self.runner, PPO2Runner):
                    # We are using GAE
                    rollout = self.runner.run(callback)
                    obs, returns, masks, actions, values, _, states, ep_infos, true_reward = rollout
                else:
                    rollout = self.runner.run(callback)
                    obs, states, returns, masks, actions, values, ep_infos, true_reward = rollout
                # pytype:enable=bad-unpacking
                callback.update_locals(locals())
                callback.on_rollout_end()

                # Early stopping due to the callback
                if not self.runner.continue_training:
                    break

                self.ep_info_buf.extend(ep_infos)
                policy_loss, value_loss, policy_entropy = self._train_step(obs, states, returns, masks, actions, values,
                                                                           self.num_timesteps // (self.n_batch + 1),
                                                                           writer)
                n_seconds = time.time() - t_start
                fps = int((update * self.n_batch) / n_seconds)

                if writer is not None:
                    total_episode_reward_logger(self.episode_reward,
                                                true_reward.reshape((self.n_envs, self.n_steps)),
                                                masks.reshape((self.n_envs, self.n_steps)),
                                                writer, self.num_timesteps)

                if self.verbose >= 1 and (update % log_interval == 0 or update == 1):
                    explained_var = explained_variance(values, returns)
                    logger.record_tabular("nupdates", update)
                    logger.record_tabular("total_timesteps", self.num_timesteps)
                    logger.record_tabular("fps", fps)
                    logger.record_tabular("policy_entropy", float(policy_entropy))
                    logger.record_tabular("policy_loss", float(policy_loss))
                    logger.record_tabular("value_loss", float(value_loss))
                    logger.record_tabular("explained_variance", float(explained_var))
                    if len(self.ep_info_buf) > 0 and len(self.ep_info_buf[0]) > 0:
                        logger.logkv('ep_reward_mean', safe_mean([ep_info['r'] for ep_info in self.ep_info_buf]))
                        logger.logkv('ep_len_mean', safe_mean([ep_info['l'] for ep_info in self.ep_info_buf]))
                    logger.dump_tabular()

            coord.request_stop()
            coord.join(enqueue_threads)

        callback.on_training_end()
        return self
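
This variant uses the newer callback API (on_training_start, on_rollout_start/end, on_step). A minimal sketch of a custom callback that triggers the early-stopping branch above, assuming stable-baselines' BaseCallback and a Monitor-wrapped env so ep_info_buf gets filled; the reward threshold is a placeholder.

import gym
from stable_baselines import ACKTR
from stable_baselines.bench import Monitor
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.common.callbacks import BaseCallback

class StopOnMeanReward(BaseCallback):
    """Stop training once the mean Monitor episode reward crosses a threshold."""

    def __init__(self, reward_threshold, verbose=0):
        super(StopOnMeanReward, self).__init__(verbose)
        self.reward_threshold = reward_threshold

    def _on_step(self):
        ep_infos = self.model.ep_info_buf
        if len(ep_infos) > 0:
            mean_reward = sum(info['r'] for info in ep_infos) / len(ep_infos)
            # Returning False requests early stopping; the loop above then sees
            # runner.continue_training == False and breaks.
            if mean_reward >= self.reward_threshold:
                return False
        return True

env = DummyVecEnv([lambda: Monitor(gym.make("CartPole-v1"), filename=None)])
model = ACKTR("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=100000, callback=StopOnMeanReward(reward_threshold=475.0))
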
Example #25
0
File: ppo2.py Project: NikitaRdn/raisimGym
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=1,
              tb_log_name="PPO2",
              eval_every_n=5,
              reset_num_timesteps=True,
              record_video=False,
              log_dir=""):

        # Variables used for periodic model saving;
        # the current iteration corresponds to the update counter
        _savediter = 0
        _counter = (200 // eval_every_n)

        # Transform to callable if needed
        self.learning_rate = get_schedule_fn(self.learning_rate)
        self.cliprange = get_schedule_fn(self.cliprange)

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn(seed)

            runner = Runner(env=self.env,
                            model=self,
                            n_steps=self.n_steps,
                            gamma=self.gamma,
                            lam=self.lam)
            self.episode_reward = np.zeros((self.n_envs, ))

            ep_info_buf = deque(maxlen=100)
            t_first_start = time.time()

            nupdates = total_timesteps // self.n_batch

            for update in range(1, nupdates + 1):
                # Run the update below; on keyboard interrupt, save the model and exit (see the except clause)
                try:

                    if update % eval_every_n == 1:
                        print("[RAISIM_GYM] Visualizing in RaiSimOgre")
                        obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = \
                            runner.run(test_mode=True, record_video=record_video, video_name=log_dir+"/"+str(update-1)+".mp4")
                        # print("Average rewards in this test episode ", ep_infos[0]['r'])
                        # model_name = log_dir + "_Iteration_{}".format(update-1)
                        # self.save(model_name)
                        print("Saving video " + log_dir + "/" +
                              str(update - 1) + ".mp4")
                        # tensorboard_log(logger, ep_infos, self.sess)

                    assert self.n_batch % self.nminibatches == 0
                    batch_size = self.n_batch // self.nminibatches
                    t_start = time.time()
                    frac = 1.0 - (update - 1.0) / nupdates
                    lr_now = self.learning_rate(frac)
                    cliprangenow = self.cliprange(frac)
                    # true_reward is the reward without discount
                    obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run(
                    )
                    ep_info_buf.extend(ep_infos)
                    mb_loss_vals = []
                    if states is None:  # nonrecurrent version
                        update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1
                        inds = np.arange(self.n_batch)
                        for epoch_num in range(self.noptepochs):
                            np.random.shuffle(inds)
                            for start in range(0, self.n_batch, batch_size):
                                timestep = self.num_timesteps // update_fac + (
                                    (self.noptepochs * self.n_batch +
                                     epoch_num * self.n_batch + start) //
                                    batch_size)
                                end = start + batch_size
                                mbinds = inds[start:end]
                                slices = (arr[mbinds]
                                          for arr in (obs, returns, masks,
                                                      actions, values,
                                                      neglogpacs))
                                mb_loss_vals.append(
                                    self._train_step(lr_now,
                                                     cliprangenow,
                                                     *slices,
                                                     writer=writer,
                                                     update=timestep))
                        self.num_timesteps += (self.n_batch * self.noptepochs
                                               ) // batch_size * update_fac
                    else:  # recurrent version
                        update_fac = self.n_batch // self.nminibatches // self.noptepochs // self.n_steps + 1
                        assert self.n_envs % self.nminibatches == 0
                        env_indices = np.arange(self.n_envs)
                        flat_indices = np.arange(self.n_envs *
                                                 self.n_steps).reshape(
                                                     self.n_envs, self.n_steps)
                        envs_per_batch = batch_size // self.n_steps
                        for epoch_num in range(self.noptepochs):
                            np.random.shuffle(env_indices)
                            for start in range(0, self.n_envs, envs_per_batch):
                                timestep = self.num_timesteps // update_fac + (
                                    (self.noptepochs * self.n_envs +
                                     epoch_num * self.n_envs + start) //
                                    envs_per_batch)
                                end = start + envs_per_batch
                                mb_env_inds = env_indices[start:end]
                                mb_flat_inds = flat_indices[mb_env_inds].ravel(
                                )
                                slices = (arr[mb_flat_inds]
                                          for arr in (obs, returns, masks,
                                                      actions, values,
                                                      neglogpacs))
                                mb_states = states[mb_env_inds]
                                mb_loss_vals.append(
                                    self._train_step(lr_now,
                                                     cliprangenow,
                                                     *slices,
                                                     update=timestep,
                                                     writer=writer,
                                                     states=mb_states))
                        self.num_timesteps += (self.n_envs * self.noptepochs
                                               ) // envs_per_batch * update_fac

                    loss_vals = np.mean(mb_loss_vals, axis=0)
                    t_now = time.time()
                    fps = int(self.n_batch / (t_now - t_start))

                    if writer is not None:
                        self.episode_reward = total_episode_reward_logger(
                            self.episode_reward,
                            true_reward.reshape((self.n_envs, self.n_steps)),
                            masks.reshape((self.n_envs, self.n_steps)), writer,
                            self.num_timesteps)

                    # With verbose >= 1, the logger output is printed to the terminal
                    if self.verbose >= 1 and (update % log_interval == 0
                                              or update == 1):
                        explained_var = explained_variance(values, returns)
                        logger.logkv("serial_timesteps", update * self.n_steps)
                        logger.logkv("nupdates", update)
                        logger.logkv("total_timesteps", self.num_timesteps)
                        logger.logkv("fps", fps)
                        logger.logkv("explained_variance",
                                     float(explained_var))
                        if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                            logger.logkv(
                                'ep_reward_mean',
                                safe_mean(
                                    [ep_info['r'] for ep_info in ep_info_buf]))
                            logger.logkv(
                                'ep_len_mean',
                                safe_mean(
                                    [ep_info['l'] for ep_info in ep_info_buf]))
                        logger.logkv('time_elapsed', t_start - t_first_start)
                        for (loss_val,
                             loss_name) in zip(loss_vals, self.loss_names):
                            logger.logkv(loss_name, loss_val)
                        logger.dumpkvs()

                    if callback is not None:
                        # Only stop training if return value is False, not when it is None. This is for backwards
                        # compatibility with callbacks that have no return statement.
                        if callback(locals(), globals()) is False:
                            break

                except KeyboardInterrupt:
                    print(
                        "Learning stopped by keyboard interrupt. Model parameters are saved.\n"
                    )
                    # Save the model parameters via the model instance before exiting
                    self.save(log_dir + "_Iteration_{}".format(update))
                    sys.exit()

            return self
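
The try/except above checkpoints the model on a keyboard interrupt. A hedged sketch of periodic checkpointing through the plain callable callback this example supports (invoked with learn()'s locals once per update, see the callback(locals(), globals()) call above); it assumes an older stable-baselines release that calls the callback this way, and the env id, path and interval are placeholders.

import gym
from stable_baselines import PPO2
from stable_baselines.common.vec_env import DummyVecEnv

def save_every_10_updates(locals_, globals_):
    """Legacy-style callback: receives learn()'s local variables; returning False stops training."""
    model_, update = locals_['self'], locals_['update']
    if update % 10 == 0:
        model_.save("ppo2_checkpoint_{}".format(update))
    return True

env = DummyVecEnv([lambda: gym.make("CartPole-v1")])
model = PPO2("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=100000, callback=save_every_10_updates)
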
Example #26
0
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=1,
              tb_log_name="PPO2",
              reset_num_timesteps=True):
        # Transform to callable if needed
        self.learning_rate = get_schedule_fn(self.learning_rate)
        self.cliprange = get_schedule_fn(self.cliprange)

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn(seed)

            runner = Runner(env=self.env,
                            model=self,
                            n_steps=self.n_steps,
                            gamma=self.gamma,
                            lam=self.lam)
            self.episode_reward = np.zeros((self.n_envs, ))

            ep_info_buf = deque(maxlen=100)
            t_first_start = time.time()

            nupdates = total_timesteps // self.n_batch
            for update in range(1, nupdates + 1):
                assert self.n_batch % self.nminibatches == 0
                batch_size = self.n_batch // self.nminibatches
                t_start = time.time()
                frac = 1.0 - (update - 1.0) / nupdates
                lr_now = self.learning_rate(frac)
                cliprangenow = self.cliprange(frac)
                # true_reward is the reward without discount
                obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run(
                )
                ep_info_buf.extend(ep_infos)
                mb_loss_vals = []
                if states is None:  # nonrecurrent version
                    update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1
                    inds = np.arange(self.n_batch)
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(inds)
                        for start in range(0, self.n_batch, batch_size):
                            timestep = self.num_timesteps // update_fac + (
                                (self.noptepochs * self.n_batch + epoch_num *
                                 self.n_batch + start) // batch_size)
                            end = start + batch_size
                            mbinds = inds[start:end]
                            slices = (arr[mbinds]
                                      for arr in (obs, returns, masks, actions,
                                                  values, neglogpacs))
                            mb_loss_vals.append(
                                self._train_step(lr_now,
                                                 cliprangenow,
                                                 *slices,
                                                 writer=writer,
                                                 update=timestep))
                    self.num_timesteps += (self.n_batch * self.noptepochs
                                           ) // batch_size * update_fac
                else:  # recurrent version
                    update_fac = self.n_batch // self.nminibatches // self.noptepochs // self.n_steps + 1
                    assert self.n_envs % self.nminibatches == 0
                    env_indices = np.arange(self.n_envs)
                    flat_indices = np.arange(self.n_envs *
                                             self.n_steps).reshape(
                                                 self.n_envs, self.n_steps)
                    envs_per_batch = batch_size // self.n_steps
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(env_indices)
                        for start in range(0, self.n_envs, envs_per_batch):
                            timestep = self.num_timesteps // update_fac + (
                                (self.noptepochs * self.n_envs + epoch_num *
                                 self.n_envs + start) // envs_per_batch)
                            end = start + envs_per_batch
                            mb_env_inds = env_indices[start:end]
                            mb_flat_inds = flat_indices[mb_env_inds].ravel()
                            slices = (arr[mb_flat_inds]
                                      for arr in (obs, returns, masks, actions,
                                                  values, neglogpacs))
                            mb_states = states[mb_env_inds]
                            mb_loss_vals.append(
                                self._train_step(lr_now,
                                                 cliprangenow,
                                                 *slices,
                                                 update=timestep,
                                                 writer=writer,
                                                 states=mb_states))
                    self.num_timesteps += (self.n_envs * self.noptepochs
                                           ) // envs_per_batch * update_fac

                loss_vals = np.mean(mb_loss_vals, axis=0)
                t_now = time.time()
                fps = int(self.n_batch / (t_now - t_start))

                if writer is not None:
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward,
                        true_reward.reshape((self.n_envs, self.n_steps)),
                        masks.reshape((self.n_envs, self.n_steps)), writer,
                        self.num_timesteps)

                if self.verbose >= 1 and (update % log_interval == 0
                                          or update == 1):
                    explained_var = explained_variance(values, returns)
                    logger.logkv("serial_timesteps", update * self.n_steps)
                    logger.logkv("nupdates", update)
                    logger.logkv("total_timesteps", self.num_timesteps)
                    logger.logkv("fps", fps)
                    logger.logkv("explained_variance", float(explained_var))
                    if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                        logger.logkv(
                            'ep_reward_mean',
                            safe_mean(
                                [ep_info['r'] for ep_info in ep_info_buf]))
                        logger.logkv(
                            'ep_len_mean',
                            safe_mean(
                                [ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv('time_elapsed', t_start - t_first_start)
                    for (loss_val, loss_name) in zip(loss_vals,
                                                     self.loss_names):
                        logger.logkv(loss_name, loss_val)
                    logger.dumpkvs()

                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

            return self
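
A brief usage sketch matching this older PPO2 signature, where learn() still takes seed and a plain callable callback; hedged, since it assumes an older stable-baselines release, and the env id and hyperparameters are placeholders.

import gym
from stable_baselines import PPO2
from stable_baselines.common.vec_env import DummyVecEnv

env = DummyVecEnv([lambda: gym.make("CartPole-v1")])
model = PPO2("MlpPolicy", env, n_steps=128, nminibatches=4, noptepochs=4, verbose=1)
# seed is forwarded to _setup_learn(seed); callback may be any callable(locals_, globals_).
model.learn(total_timesteps=100000, seed=0, log_interval=1, tb_log_name="PPO2")
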
Example #27
0
    def learn(
        self,
        total_timesteps,
        rollout_params,
        callback=None,
        log_interval=100,
        tb_log_name="DQN",
        reset_num_timesteps=True,
        replay_wrapper=None,
        train_plots=None,
        train_plots_path=None,
        rollout_timesteps=None,
    ):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)
        callback = self._init_callback(callback)

        self.rollout_values = [[] for i in range(len(rollout_params["perc"]))]

        with SetVerbosity(self.verbose), TensorboardWriter(
                self.graph, self.tensorboard_log, tb_log_name,
                new_tb_log) as writer:
            self._setup_learn()

            # Create the replay buffer
            if self.prioritized_replay:
                self.replay_buffer = PrioritizedReplayBuffer(
                    self.buffer_size, alpha=self.prioritized_replay_alpha)
                if self.prioritized_replay_beta_iters is None:
                    prioritized_replay_beta_iters = total_timesteps
                else:
                    prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
                self.beta_schedule = LinearSchedule(
                    prioritized_replay_beta_iters,
                    initial_p=self.prioritized_replay_beta0,
                    final_p=1.0,
                )
            else:
                self.replay_buffer = ReplayBuffer(self.buffer_size)
                self.beta_schedule = None

            if replay_wrapper is not None:
                assert (not self.prioritized_replay
                        ), "Prioritized replay buffer is not supported by HER"
                self.replay_buffer = replay_wrapper(self.replay_buffer)

            # Create the schedule for exploration starting from 1.
            self.exploration = LinearSchedule(
                schedule_timesteps=int(self.exploration_fraction *
                                       total_timesteps),
                initial_p=self.exploration_initial_eps,
                final_p=self.exploration_final_eps,
            )

            rewardPer1000T = []
            episode_rewards = [0.0]
            episode_successes = []

            callback.on_training_start(locals(), globals())
            callback.on_rollout_start()

            reset = True
            obs = self.env.reset()
            # Retrieve unnormalized observation for saving into the buffer
            if self._vec_normalize_env is not None:
                obs_ = self._vec_normalize_env.get_original_obs().squeeze()

            for _ in range(total_timesteps):
                if rollout_timesteps is None or total_timesteps - _ < rollout_timesteps:
                    if (_ % 1000 == 0 and _ != 0) or _ == total_timesteps - 1:
                        print("computing rollout values")
                        for k, l in enumerate(rollout_params["perc"]):
                            self.rollout_values[k].append(
                                testExpert(
                                    paths=[
                                        "Fish/Guppy/validationData/" + elem
                                        for elem in os.listdir(
                                            "Fish/Guppy/validationData")
                                    ],
                                    model=self,
                                    env=self.env,
                                    exp_turn_fraction=rollout_params[
                                        "exp_turn_fraction"],
                                    exp_speed=rollout_params["exp_min_dist"],
                                    perc=l,
                                    deterministic=True,
                                ))
                if _ % 1000 == 0 and _ != 0:
                    print("timestep", _, "finished")
                    obs = self.env.reset()
                if train_plots is not None:
                    if _ % train_plots == 0:
                        testModel_(self, train_plots_path, rollout_params, _)

                # Take action and update exploration to the newest value
                kwargs = {}
                if not self.param_noise:
                    update_eps = self.exploration.value(self.num_timesteps)
                    update_param_noise_threshold = 0.0
                else:
                    update_eps = 0.0
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = -np.log(
                        1.0 - self.exploration.value(self.num_timesteps) +
                        self.exploration.value(self.num_timesteps) /
                        float(self.env.action_space.n))
                    kwargs["reset"] = reset
                    kwargs[
                        "update_param_noise_threshold"] = update_param_noise_threshold
                    kwargs["update_param_noise_scale"] = True
                with self.sess.as_default():
                    action = self.act(np.array(obs)[None],
                                      update_eps=update_eps,
                                      **kwargs)[0]
                env_action = action

                # check if next state (after action) would be outside of fish tank (CLIPPING)
                env_state = self.env.get_state()
                turn_dist = self.env.action(env_action)
                global_turn = env_state[0][2] + turn_dist[0]
                coords = [
                    env_state[0][0] + turn_dist[1] * np.cos(global_turn),
                    env_state[0][1] + turn_dist[1] * np.sin(global_turn),
                ]
                changed = False
                if coords[0] < -0.49:
                    coords[0] = -0.47
                    changed = True
                elif coords[0] > 0.49:
                    coords[0] = 0.47
                    changed = True

                if coords[1] < -0.49:
                    coords[1] = -0.47
                    changed = True
                elif coords[1] > 0.49:
                    coords[1] = 0.47
                    changed = True

                if changed:
                    diff = coords - env_state[0, :2]
                    speed = np.linalg.norm(diff)
                    angles = np.arctan2(diff[1], diff[0])
                    turn = angles - env_state[0, 2]
                    turn = turn - 2 * np.pi if turn > np.pi else turn
                    turn = turn + 2 * np.pi if turn < -np.pi else turn

                    # convert to DQN output
                    dist_turn = np.abs(self.env.turn_rate_bins - turn)
                    dist_speed = np.abs(self.env.speed_bins - speed)

                    # convert to bins
                    turn = np.argmin(dist_turn, axis=0)
                    speed = np.argmin(dist_speed, axis=0)

                    env_action = action = turn * len(
                        self.env.speed_bins) + speed

                reset = False
                new_obs, rew, done, info = self.env.step(env_action)

                self.num_timesteps += 1

                # Stop training if return value is False
                if callback.on_step() is False:
                    break

                # Store only the unnormalized version
                if self._vec_normalize_env is not None:
                    new_obs_ = self._vec_normalize_env.get_original_obs(
                    ).squeeze()
                    reward_ = self._vec_normalize_env.get_original_reward(
                    ).squeeze()
                else:
                    # Avoid changing the original ones
                    obs_, new_obs_, reward_ = obs, new_obs, rew
                # Store the transition in the replay buffer with reward 0 (the true reward is still logged below)
                self.replay_buffer.add(obs_, action, 0, new_obs_, float(done))
                obs = new_obs
                # Save the unnormalized observation
                if self._vec_normalize_env is not None:
                    obs_ = new_obs_

                if writer is not None:
                    ep_rew = np.array([reward_]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    tf_util.total_episode_reward_logger(
                        self.episode_reward, ep_rew, ep_done, writer,
                        self.num_timesteps)

                episode_rewards[-1] += reward_
                if done:
                    maybe_is_success = info.get("is_success")
                    if maybe_is_success is not None:
                        episode_successes.append(float(maybe_is_success))
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()
                    episode_rewards.append(0.0)
                    reset = True

                # Do not train if the warmup phase is not over
                # or if there are not enough samples in the replay buffer
                can_sample = self.replay_buffer.can_sample(self.batch_size)
                if (can_sample and self.num_timesteps > self.learning_starts
                        and self.num_timesteps % self.train_freq == 0):

                    callback.on_rollout_end()
                    # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                    # pytype:disable=bad-unpacking
                    if self.prioritized_replay:
                        assert (
                            self.beta_schedule is not None
                        ), "BUG: should be LinearSchedule when self.prioritized_replay True"
                        experience = self.replay_buffer.sample(
                            self.batch_size,
                            beta=self.beta_schedule.value(self.num_timesteps),
                            env=self._vec_normalize_env,
                        )
                        (
                            obses_t,
                            actions,
                            rewards,
                            obses_tp1,
                            dones,
                            weights,
                            batch_idxes,
                        ) = experience
                    else:
                        (
                            obses_t,
                            actions,
                            rewards,
                            obses_tp1,
                            dones,
                        ) = self.replay_buffer.sample(
                            self.batch_size, env=self._vec_normalize_env)
                        # also sample from expert buffer
                        (
                            obses_t_exp,
                            actions_exp,
                            rewards_exp,
                            obses_tp1_exp,
                            dones_exp,
                        ) = self.expert_buffer.sample(
                            self.batch_size, env=self._vec_normalize_env)
                        weights, batch_idxes = np.ones_like(rewards), None
                        weights_exp, batch_idxes_exp = np.ones_like(
                            rewards_exp), None
                    # pytype:enable=bad-unpacking

                    if writer is not None:
                        # run loss backprop with summary, but once every 100 steps save the metadata
                        # (memory, compute time, ...)
                        if (1 + self.num_timesteps) % 100 == 0:
                            run_options = tf.RunOptions(
                                trace_level=tf.RunOptions.FULL_TRACE)
                            run_metadata = tf.RunMetadata()
                            summary, td_errors = self._train_step(
                                np.append(obses_t, obses_t_exp, axis=0),
                                np.append(actions,
                                          actions_exp.flatten(),
                                          axis=0),
                                np.append(rewards,
                                          rewards_exp.flatten(),
                                          axis=0),
                                np.append(obses_tp1, obses_tp1_exp, axis=0),
                                np.append(obses_tp1, obses_tp1_exp, axis=0),
                                np.append(dones.flatten(),
                                          dones_exp.flatten(),
                                          axis=0),
                                np.append(weights, weights_exp),
                                sess=self.sess,
                                options=run_options,
                                run_metadata=run_metadata,
                            )
                            writer.add_run_metadata(
                                run_metadata, "step%d" % self.num_timesteps)
                        else:
                            summary, td_errors = self._train_step(
                                np.append(obses_t, obses_t_exp, axis=0),
                                np.append(actions,
                                          actions_exp.flatten(),
                                          axis=0),
                                np.append(rewards,
                                          rewards_exp.flatten(),
                                          axis=0),
                                np.append(obses_tp1, obses_tp1_exp, axis=0),
                                np.append(obses_tp1, obses_tp1_exp, axis=0),
                                np.append(dones.flatten(),
                                          dones_exp.flatten(),
                                          axis=0),
                                np.append(weights, weights_exp),
                                sess=self.sess,
                            )
                        writer.add_summary(summary, self.num_timesteps)
                    else:
                        _, td_errors = self._train_step(
                            np.append(obses_t, obses_t_exp, axis=0),
                            np.append(actions, actions_exp.flatten(), axis=0),
                            np.append(rewards, rewards_exp.flatten(), axis=0),
                            np.append(obses_tp1, obses_tp1_exp, axis=0),
                            np.append(obses_tp1, obses_tp1_exp, axis=0),
                            np.append(dones.flatten(),
                                      dones_exp.flatten(),
                                      axis=0),
                            np.append(weights, weights_exp),
                            sess=self.sess,
                        )

                    if self.prioritized_replay:
                        new_priorities = np.abs(
                            td_errors) + self.prioritized_replay_eps
                        assert isinstance(self.replay_buffer,
                                          PrioritizedReplayBuffer)
                        self.replay_buffer.update_priorities(
                            batch_idxes, new_priorities)

                    callback.on_rollout_start()

                if (can_sample and self.num_timesteps > self.learning_starts
                        and self.num_timesteps %
                        self.target_network_update_freq == 0):
                    # Update target network periodically.
                    self.update_target(sess=self.sess)

                if len(episode_rewards[-101:-1]) == 0:
                    mean_100ep_reward = -np.inf
                else:
                    mean_100ep_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                if (self.verbose >= 1 and done and log_interval is not None
                        and len(episode_rewards) % log_interval == 0):
                    logger.record_tabular("steps", self.num_timesteps)
                    logger.record_tabular("episodes", num_episodes)
                    if len(episode_successes) > 0:
                        logger.logkv("success rate",
                                     np.mean(episode_successes[-100:]))
                    logger.record_tabular("mean 100 episode reward",
                                          mean_100ep_reward)
                    logger.record_tabular(
                        "% time spent exploring",
                        int(100 * self.exploration.value(self.num_timesteps)),
                    )
                    logger.dump_tabular()

        callback.on_training_end()
        return self
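
This customized DQN reads rollout_params through the keys "perc", "exp_turn_fraction" and "exp_min_dist" (see the testExpert calls above). A hypothetical dict with placeholder values, shown only to make the expected structure explicit:

# Placeholder values; only the key names are taken from the example above.
rollout_params = {
    "perc": [0, 25, 50, 75, 100],   # one rollout_values list is kept per entry
    "exp_turn_fraction": 4,
    "exp_min_dist": 0.07,           # passed to testExpert as exp_speed above
}
# model.learn(total_timesteps=200000, rollout_params=rollout_params, log_interval=100)
# would then drive the training loop above.
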