Example #1
    def sample(self):
        """Evaluate the current policies and return a batch of experiences.

        Return:
            SampleBatch|MultiAgentBatch from evaluating the current policies.
        """

        batches = [self.sampler.get_data()]
        steps_so_far = batches[0].count

        # In truncate_episodes mode, never pull more than 1 batch per env.
        # This avoids over-running the target batch size.
        if self.batch_mode == "truncate_episodes":
            max_batches = self.num_envs
        else:
            max_batches = float("inf")

        while steps_so_far < self.batch_steps and len(batches) < max_batches:
            batch = self.sampler.get_data()
            steps_so_far += batch.count
            batches.append(batch)
        batches.extend(self.sampler.get_extra_batches())
        batch = batches[0].concat_samples(batches)

        if self.compress_observations:
            if isinstance(batch, MultiAgentBatch):
                for data in batch.policy_batches.values():
                    data["obs"] = [pack(o) for o in data["obs"]]
                    data["new_obs"] = [pack(o) for o in data["new_obs"]]
            else:
                batch["obs"] = [pack(o) for o in batch["obs"]]
                batch["new_obs"] = [pack(o) for o in batch["new_obs"]]

        return batch
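The compression step here (and in the examples below) applies pack() to each observation row. A minimal round-trip sketch, assuming pack and its counterpart unpack come from ray.rllib.utils.compression as in RLlib; the import path is an assumption, not shown in the example:

import numpy as np
from ray.rllib.utils.compression import pack, unpack  # assumed import path

obs = np.random.rand(84, 84, 3).astype(np.float32)
packed = pack(obs)           # compressed, base64-encoded byte string
restored = unpack(packed)    # recovers the original array
assert np.array_equal(obs, restored)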
Example #2
    def sample(self):
        """Evaluate the current policies and return a batch of experiences.

        Return:
            SampleBatch|MultiAgentBatch from evaluating the current policies.
        """

        batches = [self.sampler.get_data()]
        steps_so_far = batches[0].count
        while steps_so_far < self.batch_steps:
            batch = self.sampler.get_data()
            steps_so_far += batch.count
            batches.append(batch)
        batch = batches[0].concat_samples(batches)

        if self.compress_observations:
            if isinstance(batch, MultiAgentBatch):
                for data in batch.policy_batches.values():
                    data["obs"] = [pack(o) for o in data["obs"]]
                    data["new_obs"] = [pack(o) for o in data["new_obs"]]
            else:
                batch["obs"] = [pack(o) for o in batch["obs"]]
                batch["new_obs"] = [pack(o) for o in batch["new_obs"]]

        return batch
Example #3
    def compress(self, bulk=False, columns=frozenset(["obs", "new_obs"])):
        for key in columns:
            if key in self.data:
                if bulk:
                    self.data[key] = pack(self.data[key])
                else:
                    self.data[key] = np.array(
                        [pack(o) for o in self.data[key]])
Example #4
    def sample(self):
        """Evaluate the current policies and return a batch of experiences.

        Return:
            SampleBatch|MultiAgentBatch from evaluating the current policies.
        """

        if log_once("sample_start"):
            logger.info("Generating sample batch of size {}".format(
                self.sample_batch_size))

        batches = [self.input_reader.next()]
        steps_so_far = batches[0].count

        # In truncate_episodes mode, never pull more than 1 batch per env.
        # This avoids over-running the target batch size.
        if self.batch_mode == "truncate_episodes":
            max_batches = self.num_envs
        else:
            max_batches = float("inf")

        while steps_so_far < self.sample_batch_size and len(
                batches) < max_batches:
            batch = self.input_reader.next()
            steps_so_far += batch.count
            batches.append(batch)
        batch = batches[0].concat_samples(batches)

        if self.callbacks.get("on_sample_end"):
            self.callbacks["on_sample_end"]({
                "evaluator": self,
                "samples": batch
            })

        # Always do writes prior to compression for consistency and to allow
        # for better compression inside the writer.
        self.output_writer.write(batch)

        # Do off-policy estimation if needed
        if self.reward_estimators:
            for sub_batch in batch.split_by_episode():
                for estimator in self.reward_estimators:
                    estimator.process(sub_batch)

        if log_once("sample_end"):
            logger.info("Completed sample batch:\n\n{}\n".format(
                summarize(batch)))

        if self.compress_observations:
            if isinstance(batch, MultiAgentBatch):
                for data in batch.policy_batches.values():
                    data["obs"] = [pack(o) for o in data["obs"]]
                    data["new_obs"] = [pack(o) for o in data["new_obs"]]
            else:
                batch["obs"] = [pack(o) for o in batch["obs"]]
                batch["new_obs"] = [pack(o) for o in batch["new_obs"]]

        return batch
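The on_sample_end hook above is looked up in a plain dict and called with the evaluator and the freshly collected batch. A hedged sketch of a matching callback, inferred only from how the dict is built in this example:

def on_sample_end(info):
    # info["evaluator"] is the sampling worker, info["samples"] the new batch.
    print("collected {} steps".format(info["samples"].count))

callbacks = {"on_sample_end": on_sample_end}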
Example #5
    def _compress_in_place(path, value):
        # Walk the nested column path and pack the leaf value in place,
        # either as one bulk blob or element-wise along the batch dimension.
        if path[0] not in columns:
            return
        curr = self
        for i, p in enumerate(path):
            if i == len(path) - 1:
                if bulk:
                    curr[p] = pack(value)
                else:
                    curr[p] = np.array([pack(o) for o in value])
            curr = curr[p]
Example #6
    def sample(self):
        """Evaluate the current policies and return a batch of experiences.

        Return:
            SampleBatch|MultiAgentBatch from evaluating the current policies.
        """

        batches = [self.input_reader.next()]
        steps_so_far = batches[0].count

        # In truncate_episodes mode, never pull more than 1 batch per env.
        # This avoids over-running the target batch size.
        if self.batch_mode == "truncate_episodes":
            max_batches = self.num_envs
        else:
            max_batches = float("inf")

        while steps_so_far < self.sample_batch_size and len(
                batches) < max_batches:
            batch = self.input_reader.next()
            steps_so_far += batch.count
            batches.append(batch)
        batch = batches[0].concat_samples(batches)

        if self.callbacks.get("on_sample_end"):
            self.callbacks["on_sample_end"]({
                "evaluator": self,
                "samples": batch
            })

        # Always do writes prior to compression for consistency and to allow
        # for better compression inside the writer.
        self.output_writer.write(batch)

        # Do off-policy estimation if needed
        if self.reward_estimators:
            for sub_batch in batch.split_by_episode():
                for estimator in self.reward_estimators:
                    estimator.process(sub_batch)

        if self.compress_observations:
            if isinstance(batch, MultiAgentBatch):
                for data in batch.policy_batches.values():
                    data["obs"] = [pack(o) for o in data["obs"]]
                    data["new_obs"] = [pack(o) for o in data["new_obs"]]
            else:
                batch["obs"] = [pack(o) for o in batch["obs"]]
                batch["new_obs"] = [pack(o) for o in batch["new_obs"]]

        return batch
Example #7
    def sample(self):
        """Evaluate the current policies and return a batch of experiences.

        Return:
            SampleBatch from evaluating the current policies.
        """

        batch = self.policy_map["default"].postprocess_trajectory(
            self.sampler.get_data())

        if self.compress_observations:
            batch["obs"] = [pack(o) for o in batch["obs"]]
            batch["new_obs"] = [pack(o) for o in batch["new_obs"]]

        return batch
Example #8
    def compress(self,
                 bulk: bool = False,
                 columns: Set[str] = frozenset(["obs", "new_obs"])) -> None:
        """Compresses the data buffers (by column) in place.

        Args:
            bulk (bool): Whether to compress across the batch dimension (0)
                as well. If False will compress n separate list items, where n
                is the batch size.
            columns (Set[str]): The columns to compress. Default: Only
                compress the obs and new_obs columns.
        """
        for key in columns:
            if key in self.keys():
                if bulk:
                    self[key] = pack(self[key])
                else:
                    self[key] = np.array([pack(o) for o in self[key]])
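A hedged usage sketch for the compress() method above, assuming an RLlib-style SampleBatch; the import path and the decompress_if_needed counterpart are assumptions:

import numpy as np
from ray.rllib.policy.sample_batch import SampleBatch  # assumed import path

batch = SampleBatch({
    "obs": np.random.rand(32, 84, 84, 3).astype(np.float32),
    "new_obs": np.random.rand(32, 84, 84, 3).astype(np.float32),
})
batch.compress(bulk=False)      # packs each of the 32 rows separately
batch.decompress_if_needed()    # assumed inverse; restores the arrays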
Example #9
    def sample(self):
        """Evaluate the current policies and return a batch of experiences.

        Return:
            SampleBatch|MultiAgentBatch from evaluating the current policies.
        """

        batches = [self.input_reader.next()]
        steps_so_far = batches[0].count

        # In truncate_episodes mode, never pull more than 1 batch per env.
        # This avoids over-running the target batch size.
        if self.batch_mode == "truncate_episodes":
            max_batches = self.num_envs
        else:
            max_batches = float("inf")

        while steps_so_far < self.sample_batch_size and len(
                batches) < max_batches:
            batch = self.input_reader.next()
            steps_so_far += batch.count
            batches.append(batch)
        batch = batches[0].concat_samples(batches)

        if self.callbacks.get("on_sample_end"):
            self.callbacks["on_sample_end"]({
                "evaluator": self,
                "samples": batch
            })

        # Always do writes prior to compression for consistency and to allow
        # for better compression inside the writer.
        self.output_writer.write(batch)

        if self.compress_observations:
            if isinstance(batch, MultiAgentBatch):
                for data in batch.policy_batches.values():
                    data["obs"] = [pack(o) for o in data["obs"]]
                    data["new_obs"] = [pack(o) for o in data["new_obs"]]
            else:
                batch["obs"] = [pack(o) for o in batch["obs"]]
                batch["new_obs"] = [pack(o) for o in batch["new_obs"]]

        return batch
Example #10
    def sample(self):
        """Evaluate the current policies and return a batch of experiences.

        Return:
            SampleBatch from evaluating the current policies.
        """

        batches = [self.sampler.get_data()]
        steps_so_far = batches[0].count
        while steps_so_far < self.batch_steps:
            batch = self.sampler.get_data()
            steps_so_far += batch.count
            batches.append(batch)
        batch = SampleBatch.concat_samples(batches)

        if self.compress_observations:
            batch["obs"] = [pack(o) for o in batch["obs"]]
            batch["new_obs"] = [pack(o) for o in batch["new_obs"]]

        return batch
Example #11
    def sample(self):
        obs, actions, rewards, new_obs, dones = [], [], [], [], []
        for _ in range(
                self.config["sample_batch_size"] + self.config["n_step"] - 1):
            ob, act, rew, ob1, done = self._step(self.global_timestep)
            obs.append(ob)
            actions.append(act)
            rewards.append(rew)
            new_obs.append(ob1)
            dones.append(done)

        # N-step Q adjustments
        if self.config["n_step"] > 1:
            # Adjust for steps lost from truncation
            self.local_timestep -= (self.config["n_step"] - 1)
            adjust_nstep(self.config["n_step"], self.config["gamma"], obs,
                         actions, rewards, new_obs, dones)

        batch = SampleBatch({
            "obs": [pack(np.array(o)) for o in obs],
            "actions": actions,
            "rewards": rewards,
            "new_obs": [pack(np.array(o)) for o in new_obs],
            "dones": dones,
            "weights": np.ones_like(rewards)
        })
        assert (batch.count == self.config["sample_batch_size"])

        # Prioritize on the worker side
        if self.config["worker_side_prioritization"]:
            td_errors = self.ddpg_graph.compute_td_error(
                self.sess, obs, batch["actions"], batch["rewards"], new_obs,
                batch["dones"], batch["weights"])
            new_priorities = (
                np.abs(td_errors) + self.config["prioritized_replay_eps"])
            batch.data["weights"] = new_priorities

        return batch
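The adjust_nstep call above folds the next n_step - 1 rewards into each transition before the batch is built. The sketch below mirrors the intent of that helper as inferred from the surrounding code (including the final size assertion), not necessarily its exact implementation:

def nstep_rollup(n_step, gamma, obs, actions, rewards, new_obs, dones):
    # Assumes no terminal in the middle of the sampled slice.
    # Fold the next n_step - 1 discounted rewards into rewards[i] and point
    # new_obs[i] / dones[i] at the transition n steps ahead.
    for i in range(len(rewards)):
        for j in range(1, n_step):
            if i + j < len(rewards):
                new_obs[i] = new_obs[i + j]
                dones[i] = dones[i + j]
                rewards[i] += gamma**j * rewards[i + j]
    # The caller samples n_step - 1 extra transitions and then asserts the
    # final batch size, which suggests the trailing entries are dropped.
    cut = len(rewards) - n_step + 1
    for lst in (obs, actions, rewards, new_obs, dones):
        del lst[cut:]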
Example #12
    def sample(self):
        obs, actions, rewards, new_obs, dones = [], [], [], [], []
        for _ in range(
                self.config["sample_batch_size"] + self.config["n_step"] - 1):
            ob, act, rew, ob1, done = self._step(self.global_timestep)
            obs.append(ob)
            actions.append(act)
            rewards.append(rew)
            new_obs.append(ob1)
            dones.append(done)

        # N-step Q adjustments
        if self.config["n_step"] > 1:
            # Adjust for steps lost from truncation
            self.local_timestep -= (self.config["n_step"] - 1)
            adjust_nstep(
                self.config["n_step"], self.config["gamma"],
                obs, actions, rewards, new_obs, dones)

        batch = SampleBatch({
            "obs": [pack(np.array(o)) for o in obs], "actions": actions,
            "rewards": rewards,
            "new_obs": [pack(np.array(o)) for o in new_obs], "dones": dones,
            "weights": np.ones_like(rewards)})
        assert (batch.count == self.config["sample_batch_size"])

        # Prioritize on the worker side
        if self.config["worker_side_prioritization"]:
            td_errors = self.dqn_graph.compute_td_error(
                self.sess, obs, batch["actions"], batch["rewards"],
                new_obs, batch["dones"], batch["weights"])
            new_priorities = (
                np.abs(td_errors) + self.config["prioritized_replay_eps"])
            batch.data["weights"] = new_priorities

        return batch
Example #13
def _to_jsonable(v, compress):
    if compress:
        return str(pack(v))
    elif isinstance(v, np.ndarray):
        return v.tolist()
    return v
Example #14
def worker_rollout(ps, replay_buffer, opt, worker_index):
    agent = Actor(opt, job="worker")
    keys = agent.get_weights()[0]
    np.random.seed()
    rand_buff1 = np.random.choice(opt.num_buffers, 1)[0]

    random_steps = 0

    while True:
        # ------ env set up ------

        env = TradingEnv()
        # env = Wrapper(env, opt.action_repeat, opt.reward_scale)
        # ------ env set up end ------

        o_queue = deque([], maxlen=opt.Ln + 1)
        a_r_d_queue = deque([], maxlen=opt.Ln)

        o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
        ep_score, ep_target_bias = 0, 0

        if opt.model == "cnn":
            compressed_o = pack(o)
            o_queue.append((compressed_o,))
        else:
            o_queue.append((o,))

        t_queue = 1

        weights = ray.get(ps.pull.remote(keys))
        agent.set_weights(keys, weights)

        # for a_l_ratio control
        np.random.seed()
        rand_buff = np.random.choice(opt.num_buffers, 1)[0]
        last_learner_steps, last_actor_steps, _size = ray.get(replay_buffer[rand_buff].get_counts.remote())

        while True:

            # No random warm-up actions are needed when loading or recovering weights.
            if random_steps > opt.start_steps or opt.weights_file or opt.recover:
                a = agent.get_action(o, deterministic=False)
            else:
                a = env.action_space.sample()
                random_steps += 1
            # Step the env
            o2, r, d, info = env.step(a)

            ep_ret += r
            ep_score += info['score']
            ep_target_bias += info['target_bias']
            ep_len += 1

            # Ignore the "done" signal if it comes from hitting the time
            # horizon (that is, when it's an artificial terminal signal
            # that isn't based on the agent's state)
            # d = False if ep_len*opt.action_repeat >= opt.max_ep_len else d

            o = o2

            a_r_d_queue.append((a, r, d,))
            if opt.model == "cnn":
                compressed_o2 = pack(o2)
                o_queue.append((compressed_o2,))
            else:
                o_queue.append((o2,))

            # scheme 1:
            # TODO: consider also gating on t_queue % 2 == 0; storing every
            # step (modulo 1) appeared to give smaller Q values.
            if t_queue >= opt.Ln and t_queue % opt.save_freq == 0:
                replay_buffer[np.random.choice(opt.num_buffers, 1)[0]].store.remote(o_queue, a_r_d_queue, worker_index)

            t_queue += 1

            # End of episode. Training (ep_len times).
            # if d or (ep_len * opt.action_repeat >= opt.max_ep_len):
            if d or ep_len > opt.max_ep_len:
                sample_times, steps, _ = ray.get(replay_buffer[0].get_counts.remote())

                # print('rollout ep_len:', ep_len * opt.action_repeat, 'ep_score:', ep_score,
                #       'ep_target_bias:', ep_target_bias)

                if steps > opt.start_steps:
                    # update parameters every episode
                    weights = ray.get(ps.pull.remote(keys))
                    agent.set_weights(keys, weights)

                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

                t_queue = 1
                if opt.model == "cnn":
                    compressed_o = pack(o)
                    o_queue.append((compressed_o,))
                else:
                    o_queue.append((o,))

                # for a_l_ratio control
                learner_steps, actor_steps, _size = ray.get(replay_buffer[rand_buff].get_counts.remote())

                while (actor_steps - last_actor_steps) / (
                        learner_steps - last_learner_steps + 1) > opt.a_l_ratio and last_learner_steps > 0:
                    time.sleep(1)
                    learner_steps, actor_steps, _size = ray.get(replay_buffer[rand_buff].get_counts.remote())
Example #15
def _to_jsonable(v, compress: bool) -> Any:
    if compress and compression_supported():
        return str(pack(v))
    elif isinstance(v, np.ndarray):
        return v.tolist()
    return v
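Unlike the earlier _to_jsonable variants, this one also checks compression_supported(). A minimal sketch of such a guard, assuming pack() depends on an optional LZ4 backend; the helper below is illustrative, not the library's actual code:

def compression_supported() -> bool:
    # Illustrative check: report whether the optional LZ4 dependency
    # used by pack() can be imported.
    try:
        import lz4  # noqa: F401
        return True
    except ImportError:
        return False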
Example #16
def worker_rollout(ps, replay_buffer, opt, worker_index):

    agent = Actor(opt, job="worker")
    keys = agent.get_weights()[0]

    filling_steps = 0
    while True:
        # ------ env set up ------
        env = Wrapper(gym.make(opt.env_name), opt.obs_noise, opt.act_noise,
                      opt.reward_scale, 3)
        # ------ env set up end ------

        ################################## deques

        o_queue = deque([], maxlen=opt.Ln + 1)
        a_r_d_queue = deque([], maxlen=opt.Ln)

        ################################## deques

        o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        ################################## deques reset
        t_queue = 1
        if opt.model == "cnn":
            compressed_o = pack(o)
            o_queue.append((compressed_o, ))
        else:
            o_queue.append((o, ))

        ################################## deques reset

        weights = ray.get(ps.pull.remote(keys))
        agent.set_weights(keys, weights)

        while True:

            # No random warm-up actions are needed when loading weights from a file.
            if filling_steps > opt.start_steps or opt.weights_file:
                a = agent.get_action(o, deterministic=False)
            else:
                a = env.action_space.sample()
                filling_steps += 1
            # Step the env
            o2, r, d, _ = env.step(a)

            ep_ret += r
            ep_len += 1

            # Ignore the "done" signal if it comes from hitting the time
            # horizon (that is, when it's an artificial terminal signal
            # that isn't based on the agent's state)
            # d = False if ep_len*opt.action_repeat >= opt.max_ep_len else d

            o = o2

            #################################### deques store

            a_r_d_queue.append((
                a,
                r,
                d,
            ))
            if opt.model == "cnn":
                compressed_o2 = pack(o2)
                o_queue.append((compressed_o2, ))
            else:
                o_queue.append((o2, ))

            # scheme 1:
            # TODO: consider also gating on t_queue % 2 == 0; storing every
            # step (modulo 1) appeared to give smaller Q values.
            if t_queue >= opt.Ln and t_queue % opt.save_freq == 0:
                replay_buffer[np.random.choice(opt.num_buffers,
                                               1)[0]].store.remote(
                                                   o_queue, a_r_d_queue,
                                                   worker_index)

            t_queue += 1

            #################################### deques store

            # End of episode. Training (ep_len times).
            if d or (ep_len * opt.action_repeat >= opt.max_ep_len):
                # TODO
                sample_times, steps, _ = ray.get(
                    replay_buffer[0].get_counts.remote())

                print('rollout_ep_len:', ep_len * opt.action_repeat,
                      'rollout_ep_ret:', ep_ret)

                if steps > opt.start_steps:
                    # update parameters every episode
                    weights = ray.get(ps.pull.remote(keys))
                    agent.set_weights(keys, weights)

                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

                ################################## deques reset
                t_queue = 1
                if opt.model == "cnn":
                    compressed_o = pack(o)
                    o_queue.append((compressed_o, ))
                else:
                    o_queue.append((o, ))