Example #1
    def doTestDDPG(self):
        np.random.seed(0)
        env = gym.make("Pendulum-v0")
        env.seed(0)
        ddpg_g = tf.Graph()
        with ddpg_g.as_default():
            tf.set_random_seed(123)
            agent = agents[DDPG_AGENT_CONFIG["type"]](
                env.observation_space,
                env.action_space,
                DDPG_AGENT_CONFIG,
                DDPG_MODEL_CONFIG,
                distributed_spec={})
        reward_window = WindowStat("reward", 25)
        obs, actions, rewards, next_obs, dones = [], [], [], [], []
        act_count = 0

        for i in range(200):
            ob = env.reset()
            done = False
            episode_reward = .0

            while not done:
                action, results = agent.act(
                    [ob], False, use_perturbed_action=False)
                act_count += 1
                next_ob, reward, done, info = env.step(action[0])
                obs.append(ob)
                actions.append(action[0])
                rewards.append(0.1 * reward)
                next_obs.append(next_ob)
                dones.append(done)
                if agent.ready_to_send:
                    agent.send_experience(
                        obs=obs,
                        actions=actions,
                        rewards=rewards,
                        dones=dones,
                        next_obs=next_obs)
                if agent.ready_to_receive:
                    batch_data = agent.receive_experience()
                    res = agent.learn(batch_data)

                    if DDPG_AGENT_CONFIG.get("prioritized_replay", False):
                        agent.update_priorities(
                            indexes=batch_data["indexes"],
                            td_error=res["td_error"])

                ob = next_ob
                episode_reward += reward
                if act_count % 1024 == 0:
                    print("timestep:", act_count, reward_window)

            agent.add_episode(1)
            reward_window.push(episode_reward)

        return reward_window.stats()["reward_mean"]
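
Every example in this listing tracks episode returns with WindowStat, whose implementation is not shown here. Below is a minimal sliding-window sketch of what such a helper might look like; the class name SlidingWindowStat and its fields are assumptions for illustration, not EasyRL's actual code.

from collections import deque

import numpy as np


class SlidingWindowStat(object):
    """Hypothetical stand-in for WindowStat: keeps the most recent
    `window_size` values pushed and reports simple summary statistics."""

    def __init__(self, name, window_size):
        self._name = name
        self._values = deque(maxlen=window_size)

    def push(self, value):
        self._values.append(value)

    def stats(self):
        values = np.asarray(self._values, dtype=np.float64)
        mean = float(values.mean()) if len(values) else 0.0
        return {
            "{}_count".format(self._name): len(values),
            "{}_mean".format(self._name): mean,
        }

    def __str__(self):
        return "{}: {}".format(self._name, self.stats())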
Example #2
    def doTestPPO(self):
        env = gym.make("CartPole-v0")
        env.seed(0)
        ppo_g = tf.Graph()
        with ppo_g.as_default():
            tf.set_random_seed(123)
            agent = agents[PPO_AGENT_CONFIG["type"]](
                env.observation_space,
                env.action_space,
                PPO_AGENT_CONFIG,
                PPO_MODEL_CONFIG,
                distributed_spec={})

        reward_window = WindowStat("reward", 25)
        obs, actions, rewards, next_obs, dones = [], [], [], [], []
        value_preds, logits = [], []
        act_count = 0

        for i in range(300):
            ob = env.reset()
            done = False
            episode_reward = .0

            while not done:
                action, results = agent.act([ob], False)
                next_ob, reward, done, info = env.step(action[0])
                act_count += 1

                obs.append(ob)
                actions.append(action[0])
                rewards.append(0.1 * reward)
                next_obs.append(next_ob)
                dones.append(done)

                logits.append(results["logits"][0])
                value_preds.append(results["value_preds"][0])
                if agent.ready_to_send:
                    agent.send_experience(
                        obs=obs,
                        actions=actions,
                        rewards=rewards,
                        dones=dones,
                        next_obs=next_obs,
                        value_preds=value_preds,
                        logits=logits)
                if agent.ready_to_receive:
                    batch_data = agent.receive_experience()
                    res = agent.learn(batch_data)

                ob = next_ob
                episode_reward += reward
                if act_count % 1024 == 0:
                    print("timestep:", act_count, reward_window)

            reward_window.push(episode_reward)

        return reward_window.stats()["reward_mean"]
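
In the PPO example above, value_preds and logits are forwarded with every transition via send_experience; the value predictions are typically consumed downstream for advantage estimation. The sketch below is plain generalized advantage estimation (GAE) under that assumption and is not necessarily how EasyRL computes advantages internally.

import numpy as np


def gae_advantages(rewards, value_preds, dones, gamma=0.99, lam=0.95):
    """Compute GAE advantages for one rollout of length T.

    rewards, value_preds and dones are 1-D sequences of equal length;
    for simplicity the final step is treated as terminal (no bootstrap
    from the value of the last next_obs).
    """
    T = len(rewards)
    advantages = np.zeros(T, dtype=np.float32)
    last_gae = 0.0
    for t in reversed(range(T)):
        if dones[t] or t + 1 == T:
            next_value = 0.0  # no bootstrap past a terminal/truncated step
        else:
            next_value = value_preds[t + 1]
        delta = rewards[t] + gamma * next_value - value_preds[t]
        last_gae = delta + gamma * lam * (0.0 if dones[t] else last_gae)
        advantages[t] = last_gae
    return advantages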
Example #3
File: buffer.py  Project: zhaoqiuye/EasyRL
class PrioritizedReplayBuffer(ReplayBuffer):
    def __init__(self, size, alpha):
        """Create Prioritized Replay buffer.

        Parameters
        ----------
        size: int
          Max number of transitions to store in the buffer. When the buffer
          overflows the old memories are dropped.
        alpha: float
          how much prioritization is used
          (0 - no prioritization, 1 - full prioritization)

        See Also
        --------
        ReplayBuffer.__init__
        """
        super(PrioritizedReplayBuffer, self).__init__(size)
        assert alpha > 0
        self._alpha = alpha

        it_capacity = 1
        while it_capacity < size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0
        self._prio_change_stats = WindowStat("reprio", 1000)

        self._debug_cost = 0

    def add(self, obs, actions, rewards, dones, next_obs, weights, **kwargs):
        """See ReplayBuffer.store_effect"""

        super(PrioritizedReplayBuffer, self).add(
            obs=obs,
            actions=actions,
            rewards=rewards,
            dones=dones,
            next_obs=next_obs)

        if weights is None:
            weights = self._max_priority
            constant_weight = weights**self._alpha
            for idx in self._cover_indices:
                self._it_sum[idx] = constant_weight
                self._it_min[idx] = constant_weight
        else:
            weights = np.power(weights, self._alpha)
            for n, idx in enumerate(self._cover_indices):
                self._it_sum[idx] = weights[n]
                self._it_min[idx] = weights[n]

    def _sample_proportional(self, batch_size):
        res = []
        sum_value = self._it_sum.sum(0, len(self))
        mass = np.random.random(size=batch_size) * sum_value
        for i in range(batch_size):
            # TODO(szymon): should we ensure no repeats?
            idx = self._it_sum.find_prefixsum_idx(mass[i])
            res.append(idx)
        return res

    def sample(self, batch_size, beta):
        """Sample a batch of experiences.

        Compared to ReplayBuffer.sample, this also returns the importance
        weights and the indexes of the sampled experiences.


        Parameters
        ----------
        batch_size: int
          How many transitions to sample.
        beta: float
          To what degree to use importance weights
          (0 - no corrections, 1 - full correction)

        Returns
        -------
        obs_batch: np.array
          batch of observations
        act_batch: np.array
          batch of actions executed given obs_batch
        rew_batch: np.array
          rewards received as results of executing act_batch
        next_obs_batch: np.array
          next set of observations seen after executing act_batch
        done_mask: np.array
          done_mask[i] = 1 if executing act_batch[i] resulted in
          the end of an episode and 0 otherwise.
        weights: np.array
          Array of shape (batch_size,) and dtype np.float32
          denoting importance weight of each sampled transition
        indexes: np.array
          Array of shape (batch_size,) and dtype np.int32
          indexes in the buffer of the sampled experiences
        """
        assert beta > 0
        self._num_sampled += batch_size

        start = time.time()
        idxes = self._sample_proportional(batch_size)
        self._debug_cost += time.time() - start

        sum_value = self._it_sum.sum()

        weights = []
        p_min = self._it_min.min() / sum_value
        max_weight = (p_min * len(self))**(-beta)

        for idx in idxes:
            p_sample = self._it_sum[idx] / sum_value
            weight = (p_sample * len(self))**(-beta)
            weights.append(weight / max_weight)
        weights = np.asarray(weights)
        encoded_sample = self._encode_sample(idxes)
        encoded_sample["weights"] = weights
        encoded_sample["indexes"] = idxes
        return encoded_sample

    def update_priorities(self, indexes, priorities):
        """Update priorities of sampled transitions.

        Sets the priority of the transition at index indexes[i] in the
        buffer to priorities[i].

        Parameters
        ----------
        indexes: [int]
          List of indexes of the sampled transitions
        priorities: [float]
          List of updated priorities corresponding to the
          transitions at the sampled indexes given by
          `indexes`.
        """
        assert len(indexes) == len(priorities)
        pvs = np.power(priorities, self._alpha).astype(np.float64)
        for idx, priority, pv in zip(indexes, priorities, pvs):
            assert priority > 0
            assert 0 <= idx < len(self)
            delta = pv - self._it_sum[idx]
            self._prio_change_stats.push(delta)
            self._it_sum[idx] = pv
            self._it_min[idx] = pv

        self._max_priority = max(self._max_priority, np.max(priorities))

    def stats(self, debug=False):
        parent = ReplayBuffer.stats(self, debug)
        if debug:
            parent.update(self._prio_change_stats.stats())
        return parent
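
A short usage sketch of the buffer above, showing how add, sample and update_priorities fit together. The import path, batch shapes and the fake TD errors are purely illustrative; new transitions enter with maximum priority and are later re-weighted by the learner.

import numpy as np

from buffer import PrioritizedReplayBuffer  # assumed import path for the buffer.py listed above

buf = PrioritizedReplayBuffer(size=1024, alpha=0.6)

# store a small batch of fake transitions; with weights=None they receive max priority
batch = 32
buf.add(
    obs=np.random.rand(batch, 4).astype(np.float32),
    actions=np.random.randint(0, 2, size=(batch, )),
    rewards=np.random.rand(batch).astype(np.float32),
    dones=np.zeros(batch, dtype=np.float32),
    next_obs=np.random.rand(batch, 4).astype(np.float32),
    weights=None)

# proportional sampling with importance-sampling correction (beta)
sample = buf.sample(batch_size=8, beta=0.4)
print(sample["weights"], sample["indexes"])

# after a learning step, re-prioritize the sampled transitions, e.g. by |td_error|
fake_td_error = np.abs(np.random.randn(8)) + 1e-6
buf.update_priorities(indexes=sample["indexes"], priorities=fake_td_error)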
Example #4
File: buffer.py  Project: zhaoqiuye/EasyRL
class ReplayBuffer(object):
    """Basic replay buffer.

    Support O(1) `add` and O(1) `sample` operations (w.r.t. each transition).
    The buffer is implemented as a fixed-length list where the index of insertion is reset to zero,
    once the list length is reached.
    """

    def __init__(self, size):
        """Create the replay buffer.

        Parameters
        ----------
        size: int
          Max number of transitions to store in the buffer. When the buffer
          overflows the old memories are dropped.
        """
        self._maxsize = size
        self._next_idx = 0
        self._hit_count = np.zeros(size)
        self._eviction_started = False
        self._num_added = 0
        self._num_sampled = 0
        self._evicted_hit_stats = WindowStat("evicted_hit", 1000)
        self._est_size_bytes = 0

        self._extra_fields = None

        self._first_add = True

    def __len__(self):
        return min(self._num_added, self._maxsize)

    def add(self,
            obs,
            actions,
            rewards,
            dones,
            next_obs=None,
            weights=None,
            **kwargs):

        batch_size = np.shape(rewards)[0]
        assert batch_size < self._maxsize, \
            "batch added to the buffer at once must be smaller than its capacity"

        truncated_size = min(batch_size, self._maxsize - self._next_idx)
        extra_size = max(0, batch_size - (self._maxsize - self._next_idx))

        if self._extra_fields is None:
            self._extra_fields = list(kwargs.keys())

        if self._first_add:
            self._obs = np.zeros(
                shape=((self._maxsize, ) + np.shape(obs)[1:]), dtype=obs.dtype)
            self._actions = np.zeros(
                shape=((self._maxsize, ) + np.shape(actions)[1:]),
                dtype=actions.dtype)
            self._rewards = np.zeros(shape=(self._maxsize, ), dtype=np.float32)

            if next_obs is not None:
                self._next_obs = np.zeros(
                    shape=((self._maxsize, ) + np.shape(next_obs)[1:]),
                    dtype=next_obs.dtype)

            if weights is not None:
                self._weights = np.zeros(
                    shape=((self._maxsize, )), dtype=np.float32)

            self._dones = np.zeros(shape=(self._maxsize, ), dtype=np.float32)

            self._extras = {
                name: np.zeros(
                    shape=((self._maxsize, ) + np.shape(kwargs[name])[1:]),
                    dtype=kwargs[name].dtype)
                for name in self._extra_fields
            }

            self._first_add = False

        self._num_added += batch_size

        #if self._num_added <= self._maxsize:
        #self._est_size_bytes += sum(sys.getsizeof(d) for d in data)

        self._obs[self._next_idx:self._next_idx +
                  truncated_size] = obs[:truncated_size]
        self._actions[self._next_idx:self._next_idx +
                      truncated_size] = actions[:truncated_size]
        self._rewards[self._next_idx:self._next_idx +
                      truncated_size] = rewards[:truncated_size]
        self._dones[self._next_idx:self._next_idx +
                    truncated_size] = dones[:truncated_size]

        if next_obs is not None:
            self._next_obs[self._next_idx:self._next_idx +
                           truncated_size] = next_obs[:truncated_size]
        if weights is not None:
            self._weights[self._next_idx:self._next_idx +
                          truncated_size] = weights[:truncated_size]

        for name in self._extras.keys():
            self._extras[name][self._next_idx:self._next_idx +
                               truncated_size] = kwargs[name][:truncated_size]

        if extra_size > 0:
            self._obs[:extra_size] = obs[truncated_size:]
            self._actions[:extra_size] = actions[truncated_size:]
            self._rewards[:extra_size] = rewards[truncated_size:]
            self._dones[:extra_size] = dones[truncated_size:]
            if next_obs is not None:
                self._next_obs[:extra_size] = next_obs[truncated_size:]
            if weights is not None:
                self._weights[:extra_size] = weights[truncated_size:]

            for name in self._extras.keys():
                self._extras[name][:extra_size] = kwargs[name][truncated_size:]

        if self._next_idx + batch_size >= self._maxsize:
            self._eviction_started = True
        self._cover_indices = [
            self._next_idx + i for i in range(truncated_size)
        ]
        if extra_size > 0:
            self._cover_indices += [i for i in range(extra_size)]
        self._next_idx = (self._next_idx + batch_size) % self._maxsize
        if self._eviction_started:
            for i in self._cover_indices:
                self._evicted_hit_stats.push(self._hit_count[i])
                self._hit_count[i] = 0

    def _encode_sample(self, idxes):
        idxes = np.asarray(idxes)

        obs = np.take(self._obs, indices=idxes, axis=0)
        actions = np.take(self._actions, indices=idxes, axis=0)
        rewards = np.take(self._rewards, indices=idxes, axis=0)
        next_obs = np.take(self._next_obs, indices=idxes, axis=0)
        dones = np.take(self._dones, indices=idxes, axis=0)

        batch_data = dict(
            obs=obs,
            actions=actions,
            rewards=rewards,
            dones=dones,
            next_obs=next_obs)

        return batch_data

    def sample(self, batch_size):
        """Sample a batch of experiences.

        Parameters
        ----------
        batch_size: int
            How many transitions to sample.

        Returns
        -------
        obs_batch: np.array
          batch of observations
        act_batch: np.array
          batch of actions executed given obs_batch
        rew_batch: np.array
          rewards received as results of executing act_batch
        next_obs_batch: np.array
          next set of observations seen after executing act_batch
        done_mask: np.array
          done_mask[i] = 1 if executing act_batch[i] resulted in
          the end of an episode and 0 otherwise.
        """
        idxes = np.random.randint(
            0, min(self._num_added, self._maxsize) - 1, size=(batch_size, ))
        self._num_sampled += batch_size
        return self._encode_sample(idxes)

    def stats(self, debug=False):
        data = {
            "added_count": self._num_added,
            "sampled_count": self._num_sampled,
            "est_size_bytes": self._est_size_bytes,
            "num_entries": len(self),
        }
        if debug:
            data.update(self._evicted_hit_stats.stats())
        return data
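
A small sketch of the circular insertion described in the class docstring: once `size` transitions have been stored, new batches wrap around and overwrite the oldest slots. The import path and the toy shapes below are illustrative only.

import numpy as np

from buffer import ReplayBuffer  # assumed import path for the buffer.py listed above

buf = ReplayBuffer(size=8)


def fake_batch(n, offset):
    # observations carry their insertion order so the overwrite is visible
    obs = np.arange(offset, offset + n, dtype=np.float32).reshape(n, 1)
    return dict(
        obs=obs,
        actions=np.zeros(n, dtype=np.int64),
        rewards=np.zeros(n, dtype=np.float32),
        dones=np.zeros(n, dtype=np.float32),
        next_obs=obs + 1)


buf.add(**fake_batch(6, offset=0))  # fills slots 0..5
buf.add(**fake_batch(6, offset=6))  # fills slots 6..7, then wraps and overwrites 0..3
print(len(buf))                     # 8 -> capped at the buffer capacity
print(buf._obs.ravel())             # [ 8.  9. 10. 11.  4.  5.  6.  7.]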
Example #5
    def doTestCkpt(self):
        trial_timestamp = time.strftime("%Y%m%d-%H%M%S")
        np.random.seed(0)
        env = gym.make("CartPole-v0")
        env.seed(0)
        dqn_g = tf.Graph()
        with dqn_g.as_default():
            tf.set_random_seed(123)
            agent = agents[DQN_AGENT_CONFIG["type"]](
                env.observation_space,
                env.action_space,
                DQN_AGENT_CONFIG,
                DQN_MODEL_CONFIG,
                checkpoint_dir="ckpt_dir_{}".format(trial_timestamp),
                distributed_spec={})
        reward_window = WindowStat("reward", 50)
        obs, actions, rewards, next_obs, dones = [], [], [], [], []
        act_count = 0

        for i in range(500):
            ob = env.reset()
            done = False
            episode_reward = .0

            while not done:
                action, results = agent.act([ob],
                                            deterministic=False,
                                            use_perturbed_action=False)

                next_ob, reward, done, info = env.step(action[0])
                act_count += 1

                obs.append(ob)
                actions.append(action[0])
                rewards.append(reward)
                next_obs.append(next_ob)
                dones.append(done)
                if agent.ready_to_send:
                    agent.send_experience(obs=obs,
                                          actions=actions,
                                          rewards=rewards,
                                          next_obs=next_obs,
                                          dones=dones)
                if agent.ready_to_receive:
                    batch_data = agent.receive_experience()
                    res = agent.learn(batch_data)

                    if DQN_AGENT_CONFIG.get("prioritized_replay", False):
                        agent.update_priorities(indexes=batch_data["indexes"],
                                                td_error=res["td_error"])

                ob = next_ob
                episode_reward += reward
                if act_count % 1024 == 0:
                    print("timestep:", act_count, reward_window)

            agent.add_episode(1)
            reward_window.push(episode_reward)

        prev_perf = reward_window.stats()["reward_mean"]
        print("Performance before saving is {}".format(prev_perf))

        new_dqn_g = tf.Graph()
        with new_dqn_g.as_default():
            agent = agents[DQN_AGENT_CONFIG["type"]](
                env.observation_space,
                env.action_space,
                DQN_AGENT_CONFIG,
                DQN_MODEL_CONFIG,
                checkpoint_dir="ckpt_dir_{}".format(trial_timestamp),
                distributed_spec={})
        reward_window = WindowStat("reward", 10)
        ob = env.reset()
        for i in range(10):
            ob = env.reset()
            done = False
            episode_reward = .0

            while not done:
                action, results = agent.act([ob],
                                            deterministic=True,
                                            use_perturbed_action=False)

                next_ob, reward, done, info = env.step(action[0])
                act_count += 1

                ob = next_ob
                episode_reward += reward

            agent.add_episode(1)
            reward_window.push(episode_reward)

        cur_perf = reward_window.stats()["reward_mean"]
        print("Performance after restore is {}".format(cur_perf))
        return prev_perf - cur_perf
Example #6
    def doTestSavedModel(self):
        trial_timestamp = time.strftime("%Y%m%d-%H%M%S")
        model_dir = "model_dir_{}".format(trial_timestamp)
        os.system("mkdir {}".format(model_dir))

        np.random.seed(0)
        env = gym.make("CartPole-v0")
        env.seed(0)
        dqn_g = tf.Graph()
        with dqn_g.as_default():
            tf.set_random_seed(123)
            agent = agents[DQN_AGENT_CONFIG["type"]](env.observation_space,
                                                     env.action_space,
                                                     DQN_AGENT_CONFIG,
                                                     DQN_MODEL_CONFIG,
                                                     export_dir=model_dir,
                                                     distributed_spec={})
        reward_window = WindowStat("reward", 50)
        obs, actions, rewards, next_obs, dones = [], [], [], [], []
        act_count = 0

        for i in range(500):
            ob = env.reset()
            done = False
            episode_reward = .0

            while not done:
                action, results = agent.act([ob],
                                            deterministic=False,
                                            use_perturbed_action=False)

                next_ob, reward, done, info = env.step(action[0])
                act_count += 1

                obs.append(ob)
                actions.append(action[0])
                rewards.append(reward)
                next_obs.append(next_ob)
                dones.append(done)
                if agent.ready_to_send:
                    agent.send_experience(obs=obs,
                                          actions=actions,
                                          rewards=rewards,
                                          next_obs=next_obs,
                                          dones=dones)
                if agent.ready_to_receive:
                    batch_data = agent.receive_experience()
                    res = agent.learn(batch_data)

                    if DQN_AGENT_CONFIG.get("prioritized_replay", False):
                        agent.update_priorities(indexes=batch_data["indexes"],
                                                td_error=res["td_error"])

                ob = next_ob
                episode_reward += reward
                if act_count % 1024 == 0:
                    print("timestep:", act_count, reward_window)

            agent.add_episode(1)
            reward_window.push(episode_reward)

        prev_perf = reward_window.stats()["reward_mean"]
        print("Performance before saving is {}".format(prev_perf))

        with tf.Session() as sess:
            path = model_dir
            MetaGraphDef = tf.saved_model.loader.load(
                sess, tags=[sm.tag_constants.SERVING], export_dir=path)

            # get SignatureDef protobuf
            SignatureDef_d = MetaGraphDef.signature_def
            SignatureDef = SignatureDef_d["predict_results"]

            # get inputs/outputs TensorInfo protobuf
            ph_inputs = {}
            for name, ts_info in SignatureDef.inputs.items():
                ph_inputs[name] = sm.utils.get_tensor_from_tensor_info(
                    ts_info, sess.graph)

            outputs = {}
            for name, ts_info in SignatureDef.outputs.items():
                outputs[name] = sm.utils.get_tensor_from_tensor_info(
                    ts_info, sess.graph)

            for name, ph in ph_inputs.items():
                print(name, ph)

            for name, ts in outputs.items():
                print(name, ts)

            reward_window = WindowStat("reward", 10)
            for i in range(10):
                ob = env.reset()
                done = False
                episode_reward = .0

                while not done:
                    action = sess.run(outputs["output_actions"],
                                      feed_dict={
                                          ph_inputs["obs_ph"]:
                                          [np.asarray(ob)],
                                          ph_inputs["deterministic_ph"]: True
                                      })
                    next_ob, reward, done, info = env.step(action[0])
                    episode_reward += reward
                    ob = next_ob

                reward_window.push(episode_reward)

        cur_perf = reward_window.stats()["reward_mean"]
        print("Performance after restore is {}".format(cur_perf))
        return prev_perf - cur_perf