Example #1
    def collect_evaluation(self, itr):
        assert self.max_trajectories == len(self.envs)
        traj_infos = [self.TrajInfoCls() for _ in range(len(self.envs))]
        completed_traj_infos = list()
        observations = list()
        for env in self.envs:
            observations.append(env.reset())
        observation = buffer_from_example(observations[0], len(self.envs))
        for b, o in enumerate(observations):
            observation[b] = o
        action = buffer_from_example(self.envs[0].action_space.null_value(),
                                     len(self.envs))
        reward = np.zeros(len(self.envs), dtype="float32")
        obs_pyt, act_pyt, rew_pyt = torchify_buffer(
            (observation, action, reward))
        self.agent.reset()
        self.agent.eval_mode(itr)
        live_envs = list(range(len(self.envs)))
        for t in range(self.max_T):
            act_pyt, agent_info = self.agent.step(obs_pyt, act_pyt, rew_pyt)
            action = numpify_buffer(act_pyt)

            b = 0
            # Not a for loop, since live_envs changes length over time.
            while b < len(live_envs):
                env_id = live_envs[b]
                o, r, d, env_info = self.envs[env_id].step(action[b])
                traj_infos[env_id].step(observation[b], action[b], r, d,
                                        agent_info[b], env_info)
                if getattr(env_info, "traj_done", d):
                    completed_traj_infos.append(
                        traj_infos[env_id].terminate(o))

                    observation = delete_ind_from_array(observation, b)
                    reward = delete_ind_from_array(reward, b)
                    action = delete_ind_from_array(action, b)
                    obs_pyt, act_pyt, rew_pyt = torchify_buffer(
                        (observation, action, reward))

                    del live_envs[b]
                    b -= 1  # live_envs[b] is now the next env, so go back one.
                else:
                    observation[b] = o
                    reward[b] = r

                b += 1

                if (self.max_trajectories is not None and
                        len(completed_traj_infos) >= self.max_trajectories):
                    logger.log("Evaluation reached max num trajectories "
                               f"({self.max_trajectories}).")
                    return completed_traj_infos

        if t == self.max_T - 1:
            logger.log("Evaluation reached max num time steps "
                       f"({self.max_T}).")
        return completed_traj_infos
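
Note: the helper ``delete_ind_from_array`` used above is not shown in this listing. A minimal sketch of the behavior the code relies on (a copying delete along the leading axis; not necessarily the library's own implementation) would be:

import numpy as np

def delete_ind_from_array(array, ind, axis=0):
    # Drop index `ind` along `axis`; the live-env buffers above shrink this way
    # each time a trajectory finishes.
    return np.delete(array, ind, axis=axis)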
Example #2
 def sample_batch(self, batch_T):
     """Can dynamically input length of sequences to return, by ``batch_T``,
     else if ``None`` will use interanlly set value.  Returns batch with
     leading dimensions ``[batch_T, batch_B]``.
     """
     if self.t > batch_T:
         return torchify_buffer(self.samples[0:int(batch_T)])
         # return torchify_buffer(self.samples[self.t-int(batch_T):self.t])
     else:
         return torchify_buffer(self.samples[:self.t])
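
Note: the docstring mentions falling back to an internally set value when ``batch_T`` is ``None``, but the body above does not handle that case. A minimal sketch of the fallback, assuming a hypothetical ``self.batch_T`` attribute holding the default length:

def sample_batch(self, batch_T=None):
    if batch_T is None:
        batch_T = self.batch_T  # Hypothetical attribute with the internally set length.
    if self.t > batch_T:
        return torchify_buffer(self.samples[0:int(batch_T)])
    return torchify_buffer(self.samples[:self.t])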
Example #3
    def _generate_stochastic_minibatches(self, replay_ratio):
        cum_sleep_length = 0
        with self.rw_lock:
            self._async_pull()
        if not self._buffer_full:
            print('buffer not yet filled')
            return

        for minibatch in range(self.T_target):
            indexes = np.random.choice(self.buffer_size, self.optim_batch_B)
            with self.rw_lock:  # Read lock.
                batch = self.samples[:, indexes]

            yield torchify_buffer(batch), torchify_buffer(
                self.samples_prev_rnn_state[indexes]), cum_sleep_length
Example #4
File: n_step.py    Project: nirbhayjm/rlpyt
    def extract_batch(self, T_idxs, B_idxs, T):
        """Return full sequence of each field in `agent_inputs` (e.g. `observation`),
        including all timesteps for the main sequence and for the target sequence in
        one array; many timesteps will likely overlap, so the algorithm can make
        sub-sequences by slicing on device, for reduced memory usage.

        Enforces that input `T_idxs` align with RNN state interval.

        Uses helper function ``extract_sequences()`` to retrieve samples of
        length ``T`` starting at locations ``[T_idxs,B_idxs]``, so returned
        data batch has leading dimensions ``[T,len(B_idxs)]``."""
        s, rsi = self.samples, self.rnn_state_interval
        if rsi > 1:
            assert np.all(np.asarray(T_idxs) % rsi == 0)
            init_rnn_state = self.samples_prev_rnn_state[T_idxs // rsi, B_idxs]
        elif rsi == 1:
            init_rnn_state = self.samples.prev_rnn_state[T_idxs, B_idxs]
        else:  # rsi == 0
            init_rnn_state = None
        batch = SamplesFromReplay(
            all_observation=self.extract_observation(T_idxs, B_idxs,
                                                     T + self.n_step_return),
            all_action=buffer_func(
                s.action, extract_sequences, T_idxs - 1, B_idxs,
                T + self.n_step_return),  # Starts at prev_action.
            all_reward=extract_sequences(
                s.reward, T_idxs - 1, B_idxs,
                T + self.n_step_return),  # Only prev_reward (agent + target).
            return_=extract_sequences(self.samples_return_, T_idxs, B_idxs, T),
            done=extract_sequences(s.done, T_idxs, B_idxs, T),
            done_n=extract_sequences(self.samples_done_n, T_idxs, B_idxs, T),
            init_rnn_state=init_rnn_state,  # (Same state for agent and target.)
        )
        # NOTE: Algo might need to make zero prev_action/prev_reward depending on done.
        return torchify_buffer(batch)
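
Note: to illustrate the docstring's point about overlapping sequences being sliced on device, here is a small stand-alone sketch; the shapes and names (``T``, ``n_step``, ``B``) are illustrative, not taken from the buffer above.

import torch

T, n_step, B = 16, 3, 8
all_obs = torch.arange((T + n_step) * B).reshape(T + n_step, B)  # Stand-in for all_observation.
agent_obs = all_obs[:T]                  # Main sequence.
target_obs = all_obs[n_step:n_step + T]  # Target sequence; overlaps the main one.
assert agent_obs.shape == target_obs.shape == (T, B)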
Example #5
def build_samples_buffer(agent,
                         env,
                         batch_spec,
                         bootstrap_value=False,
                         agent_shared=True,
                         env_shared=True,
                         subprocess=True,
                         examples=None):
    """Recommended to step/reset agent and env in subprocess, so it doesn't
    affect settings in master before forking workers (e.g. torch num_threads
    (MKL) may be set at first forward computation.)"""
    if examples is None:
        if subprocess:
            mgr = mp.Manager()
            examples = mgr.dict()  # Examples pickled back to master.
            w = mp.Process(target=get_example_outputs,
                           args=(agent, env, examples, subprocess))
            w.start()
            w.join()
        else:
            examples = dict()
            get_example_outputs(agent, env, examples)

    T, B = batch_spec
    all_action = buffer_from_example(examples["action"], (T + 1, B),
                                     agent_shared)
    action = all_action[1:]
    prev_action = all_action[:-1]  # Writing to action will populate prev_action.
    agent_info = buffer_from_example(examples["agent_info"], (T, B),
                                     agent_shared)
    agent_buffer = AgentSamples(
        action=action,
        prev_action=prev_action,
        agent_info=agent_info,
    )
    if bootstrap_value:
        bv = buffer_from_example(examples["agent_info"].value, (1, B),
                                 agent_shared)
        agent_buffer = AgentSamplesBsv(*agent_buffer, bootstrap_value=bv)

    observation = buffer_from_example(examples["observation"], (T, B),
                                      env_shared)
    all_reward = buffer_from_example(examples["reward"], (T + 1, B),
                                     env_shared)
    reward = all_reward[1:]
    prev_reward = all_reward[:-1]  # Writing to reward will populate prev_reward.
    done = buffer_from_example(examples["done"], (T, B), env_shared)
    env_info = buffer_from_example(examples["env_info"], (T, B), env_shared)
    env_buffer = EnvSamples(
        observation=observation,
        reward=reward,
        prev_reward=prev_reward,
        done=done,
        env_info=env_info,
    )
    samples_np = Samples(agent=agent_buffer, env=env_buffer)
    samples_pyt = torchify_buffer(samples_np)
    return samples_pyt, samples_np, examples
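
Note: the ``# Writing to action will populate prev_action`` trick works because the two names are overlapping views of one ``(T + 1, B)`` array. A small NumPy demonstration of the same idea:

import numpy as np

T, B = 4, 2
all_action = np.zeros((T + 1, B))
action, prev_action = all_action[1:], all_action[:-1]  # Overlapping views, offset by one step.
action[0] = 7.0                        # Writing the action taken at t=0 ...
assert np.all(prev_action[1] == 7.0)   # ... makes it appear as prev_action at t=1.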
Example #6
    def sample_batch(self, batch_B):
        while True:
            sampled_indices = False
            try:
                self._async_pull()  # Updates from writers.
                (T_idxs, B_idxs), priorities = self.priority_tree.sample(
                    batch_B, unique=self.unique)
                sampled_indices = True
                if self.rnn_state_interval > 1:
                    T_idxs = T_idxs * self.rnn_state_interval

                batch = self.extract_batch(T_idxs - 1, B_idxs,
                                           self.batch_T + 1)
                is_weights = (1. / (priorities + 1e-5))**self.beta
                is_weights /= max(is_weights)  # Normalize.
                is_weights = torchify_buffer(is_weights).float()

                batch = SamplesFromReplayPri(*batch, is_weights=is_weights)
                return self.sanitize_batch(batch)

            except Exception as e:
                print("FAILED TO LOAD BATCH")
                traceback.print_exc()
                if sampled_indices:
                    print("B_idxs:", B_idxs, flush=True)
                    print("T_idxs:", T_idxs, flush=True)
                    print("Batch_T:", self.batch_T, flush=True)
                    print("Buffer T:", self.T, flush=True)
Example #7
def get_example_outputs_single(agent, env, examples, subprocess=False):
    """For pre-batched environments"""
    if subprocess:  # i.e. in subprocess.
        import torch
        torch.set_num_threads(1)  # Some fix to prevent MKL hang.
    o = env.reset()
    a = env.action_space.sample()
    o, r, d, env_info = env.step(a)
    r = np.asarray(r, dtype="float32")  # Must match torch float dtype here.
    agent.reset()
    agent_inputs = torchify_buffer(AgentInputs(o, a, r))
    a, agent_info = agent.step(*agent_inputs)
    if "prev_rnn_state" in agent_info:
        # Agent leaves B dimension in, strip it: [B,N,H] --> [N,H]
        prev_rnn_state = agent_info.prev_rnn_state[0]
        agent_info_0 = agent_info.__class__(*(i[0] for i in agent_info))
        agent_info_0 = agent_info_0._replace(prev_rnn_state=prev_rnn_state)
    else:
        agent_info_0 = agent_info.__class__(*(i[0] for i in agent_info))
    env_info_0 = env_info.__class__(*(i[0] for i in env_info))
    examples["observation"] = o[0]
    examples["reward"] = r[0]
    examples["done"] = d[0]
    examples["env_info"] = env_info_0
    examples["action"] = a[0]  # OK to put torch tensor here, could numpify.
    examples["agent_info"] = agent_info_0
Example #8
def get_example_outputs(agent,
                        EnvCls,
                        env_kwargs,
                        examples,
                        subprocess=False,
                        env=None):
    """Do this in a sub-process to avoid setup conflict in master/workers (e.g.
    MKL)."""
    if subprocess:  # i.e. in subprocess.
        import torch
        torch.set_num_threads(1)  # Some fix to prevent MKL hang.
    if env is None:
        env = EnvCls(**env_kwargs)
        if not hasattr(env, 'spaces'):
            env = MVPWrapper(env)
    o = env.reset()
    a = env.action_space.sample()
    o, r, d, env_info = env.step(a)
    r = np.asarray(r, dtype="float32")  # Must match torch float dtype here.
    agent.reset()
    agent_inputs = torchify_buffer(AgentInputs(o, a, r))
    a, agent_info = agent.step(*agent_inputs)
    if "prev_rnn_state" in agent_info:
        # Agent leaves B dimension in, strip it: [B,N,H] --> [N,H]
        agent_info = agent_info._replace(
            prev_rnn_state=agent_info.prev_rnn_state[0])
    examples["observation"] = o
    examples["reward"] = r
    examples["done"] = d
    examples["env_info"] = env_info
    examples["action"] = a  # OK to put torch tensor here, could numpify.
    examples["agent_info"] = agent_info
Example #9
 def collect_evaluation(self, itr):
     traj_infos = [self.TrajInfoCls() for _ in range(len(self.envs))]
     observations = list()
     for env in self.envs:
         observations.append(env.reset())
     observation = buffer_from_example(observations[0], len(self.envs))
     for b, o in enumerate(observations):
         observation[b] = o
     action = buffer_from_example(self.envs[0].action_space.null_value(),
                                  len(self.envs))
     reward = np.zeros(len(self.envs), dtype="float32")
     obs_pyt, act_pyt, rew_pyt = torchify_buffer(
         (observation, action, reward))
     self.agent.reset()
     self.agent.eval_mode(itr)
     for t in range(self.max_T):
         act_pyt, agent_info = self.agent.step(obs_pyt, act_pyt, rew_pyt)
         action = numpify_buffer(act_pyt)
         for b, env in enumerate(self.envs):
             o, r, d, env_info = env.step(action[b])
             traj_infos[b].step(observation[b], action[b], r, d,
                                agent_info[b], env_info)
             if getattr(env_info, "traj_done", d):
                 self.traj_infos_queue.put(traj_infos[b].terminate(o))
                 traj_infos[b] = self.TrajInfoCls()
                 o = env.reset()
             if d:
                 action[b] = 0  # Next prev_action.
                 r = 0
                 self.agent.reset_one(idx=b)
             observation[b] = o
             reward[b] = r
         if self.sync.stop_eval.value:
             break
     self.traj_infos_queue.put(None)  # End sentinel.
Example #10
 def extract_batch(self, T_idxs, B_idxs, T):
     """Return full sequence of each field which encompasses all subsequences
     to be used, so algorithm can make sub-sequences by slicing on device,
     for reduced memory usage."""
     s, rsi = self.samples, self.rnn_state_interval
     if rsi > 1:
         assert np.all(np.asarray(T_idxs) % rsi == 0)
         init_rnn_state = self.samples_prev_rnn_state[T_idxs // rsi, B_idxs]
     elif rsi == 1:
         init_rnn_state = self.samples.prev_rnn_state[T_idxs, B_idxs]
     else:  # rsi == 0
         init_rnn_state = None
     batch = SamplesFromReplay(
         all_observation=self.extract_observation(T_idxs, B_idxs,
                                                  T + self.n_step_return),
         all_action=buffer_func(
             s.action, extract_sequences, T_idxs - 1, B_idxs,
             T + self.n_step_return),  # Starts at prev_action.
         all_reward=extract_sequences(
             s.reward, T_idxs - 1, B_idxs,
             T + self.n_step_return),  # Only prev_reward (agent + target).
         return_=extract_sequences(self.samples_return_, T_idxs, B_idxs, T),
         done=extract_sequences(s.done, T_idxs, B_idxs, T),
         done_n=extract_sequences(self.samples_done_n, T_idxs, B_idxs, T),
         init_rnn_state=init_rnn_state,  # (Same state for agent and target.)
     )
     # NOTE: Algo might need to make zero prev_action/prev_reward depending on done.
     return torchify_buffer(batch)
Example #11
def get_example_outputs(agent, env, examples, subprocess=False):
    """Do this in a sub-process to avoid setup conflict in master/workers (e.g.
    MKL)."""
    if subprocess:  # i.e. in subprocess.
        import torch
        torch.set_num_threads(1)  # Some fix to prevent MKL hang.
    o_reset = env.reset()
    a = env.action_space.sample()
    if a.shape == ():  # 'a' gets stored, but if it's array(3) you want step(3) for mario.
        action = int(a)
    else:
        action = a

    o, r, d, env_info = env.step(action)
    r = np.asarray(r, dtype="float32")  # Must match torch float dtype here.
    agent.reset()
    agent_inputs = torchify_buffer(AgentInputs(o, a, r))
    a, agent_info = agent.step(*agent_inputs)

    if "prev_rnn_state" in agent_info:
        # Agent leaves B dimension in, strip it: [B,N,H] --> [N,H]
        agent_info = agent_info._replace(
            prev_rnn_state=agent_info.prev_rnn_state[0])

    examples["observation"] = o_reset
    examples["reward"] = r
    examples["done"] = d
    examples["env_info"] = env_info
    examples["action"] = a  # OK to put torch tensor here, could numpify.
    examples["agent_info"] = agent_info
Example #12
    def _get_example_outputs(self):
        examples = dict()
        o = self.env.reset()
        a = np.stack(
            [self.env.action_space.sample() for _ in range(self.batch_spec.B)],
            axis=0)
        o, r, d, env_info = self.env.step(a)

        a = np.asarray(a[0])  # get first batch only
        o = o[0]  # get first batch only
        r = np.asarray(r[0], dtype="float32")  # Get first batch only; must match torch float dtype here.
        self.agent.reset()
        agent_inputs = torchify_buffer(AgentInputs(o, a, r))
        a, agent_info = self.agent.step(*agent_inputs)
        if "prev_rnn_state" in agent_info:
            # Agent leaves B dimension in, strip it: [B,N,H] --> [N,H]
            agent_info = agent_info._replace(
                prev_rnn_state=agent_info.prev_rnn_state[0])
        examples["observation"] = o
        examples["reward"] = r
        examples["done"] = d
        examples["env_info"] = env_info
        examples["action"] = a  # OK to put torch tensor here, could numpify.
        examples["agent_info"] = agent_info
        return examples
Example #13
    def collect_batch(self, agent_inputs, traj_infos, itr):
        # Numpy arrays can be written to from numpy arrays or torch tensors
        # (whereas torch tensors can only be written to from torch tensors).
        agent_buf, env_buf = self.samples_np.agent, self.samples_np.env
        completed_infos = list()
        observation, action, reward = agent_inputs
        b = np.where(self.done)[0]
        observation[b] = self.temp_observation[b]
        self.done[:] = False  # Did resets between batches.
        obs_pyt, act_pyt, rew_pyt = torchify_buffer(agent_inputs)
        agent_buf.prev_action[0] = action  # Leading prev_action.

        if env_buf.prev_reward[0].ndim > reward.ndim:
            reward = reward[:, None].repeat(env_buf.prev_reward[0].shape[-1],
                                            -1)
        env_buf.prev_reward[0] = reward

        self.agent.sample_mode(itr)
        for t in range(self.batch_T):
            env_buf.observation[t] = observation
            # Agent inputs and outputs are torch tensors.
            act_pyt, agent_info = self.agent.step(obs_pyt, act_pyt, rew_pyt)
            action = numpify_buffer(act_pyt)
            for b, env in enumerate(self.envs):
                if self.done[b]:
                    action[b] = 0  # Record blank.
                    reward[b] = 0
                    if agent_info:
                        agent_info[b] = 0
                    # Leave self.done[b] = True, record that.
                    continue
                # Environment inputs and outputs are numpy arrays.
                o, r, d, env_info = env.step(action[b])
                traj_infos[b].step(observation[b], action[b], r, d,
                                   agent_info[b], env_info)
                if getattr(env_info, "traj_done", d):
                    completed_infos.append(traj_infos[b].terminate(o))
                    traj_infos[b] = self.TrajInfoCls()
                    self.need_reset[b] = True
                if d:
                    self.temp_observation[b] = o
                    o = 0  # Record blank.
                observation[b] = o
                reward[b] = r
                self.done[b] = d
                if env_info:
                    env_buf.env_info[t, b] = env_info
            agent_buf.action[t] = action
            env_buf.reward[t] = reward
            env_buf.done[t] = self.done
            if agent_info:
                agent_buf.agent_info[t] = agent_info

        if "bootstrap_value" in agent_buf:
            # agent.value() should not advance rnn state.
            agent_buf.bootstrap_value[:] = self.agent.value(
                obs_pyt, act_pyt, rew_pyt)

        return AgentInputs(observation, action,
                           reward), traj_infos, completed_infos
Example #14
    def sample_batch(self, batch_B):
        while True:
            sampled_indices = False
            try:
                self._async_pull()  # Updates from writers.
                (T_idxs, B_idxs), priorities = self.priority_tree.sample(
                    batch_B, unique=self.unique)
                sampled_indices = True
                if self.rnn_state_interval > 1:
                    T_idxs = T_idxs * self.rnn_state_interval

                batch = self.extract_batch(T_idxs, B_idxs, self.batch_T)

            except Exception as _:
                print("FAILED TO LOAD BATCH")
                traceback.print_exc()
                if sampled_indices:
                    print("B_idxs:", B_idxs, flush=True)
                    print("T_idxs:", T_idxs, flush=True)
                    print("Batch_T:", self.batch_T, flush=True)
                    print("Buffer T:", self.T, flush=True)
                continue  # Retry; falling through here would use undefined locals.

            is_weights = (1. / (priorities + 1e-5)) ** self.beta
            is_weights /= max(is_weights)  # Normalize.
            is_weights = torchify_buffer(is_weights).float()

            elapsed_iters = self.t + self.T - T_idxs % self.T
            elapsed_samples = self.B*(elapsed_iters)
            values = torch.from_numpy(extract_sequences(self.samples.value, T_idxs, B_idxs, self.batch_T+self.n_step_return+1))
            batch = SamplesFromReplayPriExt(*batch,
                                            values=values,
                                            is_weights=is_weights,
                                            age=elapsed_samples)
            if self.batch_T > 1:
                batch = self.sanitize_batch(batch)
            return batch
Example #15
 def extract_batch(self, T_idxs, B_idxs):
     """From buffer locations `[T_idxs,B_idxs]`, extract data needed for
     training, including target values at `T_idxs + n_step_return`.  Returns
     namedarraytuple of torch tensors (see file for all fields).  Each tensor
     has leading batch dimension ``len(T_idxs)==len(B_idxs)``, but individual
     samples are drawn, so no leading time dimension."""
     s = self.samples
     target_T_idxs = (T_idxs + self.n_step_return) % self.T
     batch = SamplesFromReplay(
         agent_inputs=AgentInputs(
             observation=self.extract_observation(T_idxs, B_idxs),
             prev_action=s.action[T_idxs - 1, B_idxs],
             prev_reward=s.reward[T_idxs - 1, B_idxs],
         ),
         action=s.action[T_idxs, B_idxs],
         return_=self.samples_return_[T_idxs, B_idxs],
         done=self.samples.done[T_idxs, B_idxs],
         done_n=self.samples_done_n[T_idxs, B_idxs],
         target_inputs=AgentInputs(
             observation=self.extract_observation(target_T_idxs, B_idxs),
             prev_action=s.action[target_T_idxs - 1, B_idxs],
             prev_reward=s.reward[target_T_idxs - 1, B_idxs],
         ),
     )
      # target_... means what happened self.n_step_return timesteps later;
      # it serves as the target for predicting the n-step return.
     t_news = np.where(s.done[T_idxs - 1, B_idxs])[0]
     batch.agent_inputs.prev_action[t_news] = 0
     batch.agent_inputs.prev_reward[t_news] = 0
     return torchify_buffer(batch)
Example #16
def get_example_outputs(agent, env, examples, subprocess=False):
    """Do this in a sub-process to avoid setup conflict in master/workers (e.g. MKL).
    在一个重置的environment中(从头开始),随机采取一个action,把得到的observation,reward等数据记录下来,保存到输入的examples里返回。
    注意:虽然输入的examples看上去名字是复数,但实际上,返回的并不是在environment中走多步的结果,而仅仅是走一步产生的结果。这个变量命名不好,
    我认为叫"example"更合理。

    :param agent: 一个agent类的对象。
    :param env: 一个environment类的对象。
    :param examples: 同时作为input和output。输入的有可能是一个空的dict,输出的是经过填充过的内容。
    :param subprocess: 是否是在子进程中执行。
    :return: 没有返回值,但需要返回的数据放在了输入的examples变量中返回。
    """
    if subprocess:  # i.e. in subprocess.
        import torch
        torch.set_num_threads(1)  # Some fix to prevent MKL hang.
    o = env.reset()  # Reset the environment, starting from scratch.
    a = env.action_space.sample()  # Randomly pick an index from the action space.
    o, r, d, env_info = env.step(a)  # Step the environment with the chosen action (index).
    r = np.asarray(r, dtype="float32")  # Must match torch float dtype here; cast reward to float32.
    agent.reset()
    agent_inputs = torchify_buffer(AgentInputs(o, a, r))
    a, agent_info = agent.step(*agent_inputs)  # The star unpacks the agent_inputs tuple into the three arguments step() expects.
    if "prev_rnn_state" in agent_info:
        # Agent leaves B dimension in, strip it: [B,N,H] --> [N,H]
        agent_info = agent_info._replace(prev_rnn_state=agent_info.prev_rnn_state[0])
    examples["observation"] = o
    examples["reward"] = r  # 只含有一个数的NumPy array
    examples["done"] = d  # bool
    examples["env_info"] = env_info  # EnvInfo类型的对象
    examples["action"] = a  # OK to put torch tensor here, could numpify.
    examples["agent_info"] = agent_info
Example #17
 def collect_evaluation(self, itr):
     traj_infos = [self.TrajInfoCls() for _ in range(len(self.envs))]
     completed_traj_infos = list()
     observations = list()
     for env in self.envs:
         observations.append(env.reset())
     observation = buffer_from_example(observations[0], len(self.envs))
     action = buffer_from_example(
         self.envs[0].action_space.sample(null=True), len(self.envs))
     reward = np.zeros(len(self.envs), dtype="float32")
     obs_pyt, act_pyt, rew_pyt = torchify_buffer(
         (observation, action, reward))
     self.agent.reset()
     for t in range(self.max_T):
         act_pyt, agent_info = self.agent.step(obs_pyt, act_pyt, rew_pyt)
         action = numpify_buffer(act_pyt)
         for b, env in enumerate(self.envs):
             o, r, d, env_info = env.step(action[b])
             traj_infos[b].step(observation[b], action[b], r, d,
                                agent_info[b], env_info)
             if getattr(env_info, "traj_done", d):
                 completed_traj_infos.append(traj_infos[b].terminate(o))
                 traj_infos[b] = self.TrajInfoCls()
                 o = env.reset()
             if d:
                 action[b] = 0  # Prev_action for next step.
                 r = 0
                 self.agent.reset_one(idx=b)
             observation[b] = o
             reward[b] = r
         if (self.max_trajectories is not None
                 and len(completed_traj_infos) >= self.max_trajectories):
             break
     return completed_traj_infos
Example #18
 def extract_batch(self, T_idxs, B_idxs):
     batch = super().extract_batch(T_idxs, B_idxs)
     batch = SamplesFromReplayTL(
         *batch,
         timeout=self.samples.timeout[T_idxs, B_idxs],
         timeout_n=self.samples_timeout_n[T_idxs, B_idxs],
     )
     return torchify_buffer(batch)
Example #19
File: sampler.py    Project: zren96/rlpyt
def build_step_buffer(examples, B):
    step_bufs = {
        k: buffer_from_example(examples[k], B, share_memory=True)
        for k in ["observation", "action", "reward", "done", "agent_info"]
    }
    step_buffer_np = StepBuffer(**step_bufs)
    step_buffer_pyt = torchify_buffer(step_buffer_np)
    return step_buffer_pyt, step_buffer_np
Example #20
File: utils.py    Project: wwxFromTju/rlpyt
def build_step_buffer(examples, B):
    bufs = tuple(
        buffer_from_example(examples[k], B, shared_memory=True)
        for k in ["observation", "action", "reward", "done", "agent_info"])
    need_reset = buffer_from_example(examples["done"], B, shared_memory=True)
    step_buffer_np = StepBuffer(*bufs, need_reset)
    step_buffer_pyt = torchify_buffer(step_buffer_np)
    return step_buffer_pyt, step_buffer_np
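
Note: both variants assume a ``StepBuffer`` container defined elsewhere. A hedged sketch of such a declaration, assuming rlpyt's ``namedarraytuple`` factory and the field set of the second variant (which adds ``need_reset``):

from rlpyt.utils.collections import namedarraytuple

# Fields mirror the keys pulled from `examples` above.
StepBuffer = namedarraytuple(
    "StepBuffer",
    ["observation", "action", "reward", "done", "agent_info", "need_reset"])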
Example #21
 def extract_batch(self, T_idxs, B_idxs, T):
     s = self.samples
     batch = SamplesFromReplay(
         observation=self.extract_observation(T_idxs, B_idxs, T),
         action=buffer_func(s.action, extract_sequences, T_idxs, B_idxs, T),
         reward=extract_sequences(s.reward, T_idxs, B_idxs, T),
         done=extract_sequences(s.done, T_idxs, B_idxs, T),
     )
     return torchify_buffer(batch)
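
Note: ``extract_sequences`` itself does not appear in these snippets. An illustrative sketch (not the library's implementation) that gathers ``T`` consecutive steps per index pair, wrapping around the circular time axis of the buffer:

import numpy as np

def extract_sequences(array, T_idxs, B_idxs, T):
    # Result has leading dims [T, len(B_idxs)], matching the docstrings above.
    T_total = array.shape[0]
    return np.stack(
        [array[(t + np.arange(T)) % T_total, b] for t, b in zip(T_idxs, B_idxs)],
        axis=1)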
Example #22
 def sample_batch(self, batch_B):
     (T_idxs,
      B_idxs), priorities = self.priority_tree.sample(batch_B,
                                                      unique=self.unique)
     batch = self.extract_batch(T_idxs, B_idxs)
     is_weights = (1. / (priorities + EPS))**self.beta  # Unnormalized.
     is_weights /= max(is_weights)  # Normalize.
     is_weights = torchify_buffer(is_weights).float()
     return SamplesFromReplayPri(*batch, is_weights=is_weights)
Example #23
def build_samples_buffer(agent, env, batch_spec, bootstrap_value=False,
        agent_shared=True, env_shared=True, subprocess=True, examples=None):
    """Recommended to step/reset agent and env in subprocess, so it doesn't
    affect settings in master before forking workers (e.g. torch num_threads
    (MKL) may be set at first forward computation.)"""
    # import ipdb; ipdb.set_trace()
    if examples is None:
        if subprocess:
            mgr = mp.Manager()
            examples = mgr.dict()  # Examples pickled back to master.
            w = mp.Process(target=get_example_outputs,
                args=(agent, env, examples, subprocess))
            w.start()
            w.join()
        else:
            examples = dict()
            get_example_outputs(agent, env, examples)

    T, B = batch_spec
    all_action = buffer_from_example(examples["action"], (T + 1, B), agent_shared)
    action = all_action[1:]
    prev_action = all_action[:-1]  # Writing to action will populate prev_action.
    # import ipdb; ipdb.set_trace()
    agent_info = buffer_from_example(examples["agent_info"], (T, B), agent_shared)
    agent_buffer = AgentSamples(
        action=action,
        prev_action=prev_action,
        agent_info=agent_info,
    )
    if bootstrap_value:        
        if agent.dual_model:
            bv = buffer_from_example(examples["agent_info"].value, (1, B), agent_shared)
            int_bv = buffer_from_example(examples["agent_info"].value, (1, B), agent_shared)
            agent_buffer = AgentSamplesBsvTwin(*agent_buffer, bootstrap_value=bv, int_bootstrap_value=int_bv)
        else:
            bv = buffer_from_example(examples["agent_info"].value, (1, B), agent_shared)
            agent_buffer = AgentSamplesBsv(*agent_buffer, bootstrap_value=bv)

    observation = buffer_from_example(examples["observation"], (T, B), env_shared) # all zero arrays (except 0th index should equal o_reset)
    next_observation = buffer_from_example(examples["observation"], (T, B), env_shared) 
    all_reward = buffer_from_example(examples["reward"], (T + 1, B), env_shared) # all zero values
    reward = all_reward[1:]
    prev_reward = all_reward[:-1]  # Writing to reward will populate prev_reward.
    done = buffer_from_example(examples["done"], (T, B), env_shared)
    env_info = buffer_from_example(examples["env_info"], (T, B), env_shared)
    env_buffer = EnvSamples(
        observation=observation,
        next_observation=next_observation,
        prev_reward=prev_reward,
        reward=reward,
        done=done,
        env_info=env_info,
    )
    samples_np = Samples(agent=agent_buffer, env=env_buffer)
    samples_pyt = torchify_buffer(samples_np) # this links the two (changes to samples_np will reflect in samples_pyt)
    return samples_pyt, samples_np, examples
Example #24
    def _generate_deterministic_batches(self):
        # Replay ratio of 1 with deterministic batch selection.
        cum_sleep_length = 0
        for i in range(self.T_target):
            while True:
                with self.rw_lock:  # get read lock
                    self._async_pull()
                if self.t >= self.optim_batch_B * (i + 1) or self._buffer_full:
                    break
                time.sleep(self.sleep_length)
                cum_sleep_length += self.sleep_length if i > 0 else 0

            # batch is available
            indexes = np.arange(i * self.optim_batch_B,
                                (i + 1) * self.optim_batch_B)
            with self.rw_lock:  # Read lock.
                batch = self.samples[:, indexes]
            yield torchify_buffer(batch), torchify_buffer(
                self.samples_prev_rnn_state[indexes]), cum_sleep_length
Example #25
 def sample_batch(self, batch_B):
     (tree_T_idxs, B_idxs), priorities = self.priority_tree.sample(
         batch_B, unique=self.unique)
     if self.rnn_state_interval > 1:
         T_idxs = tree_T_idxs * self.rnn_state_interval
     else:
         T_idxs = tree_T_idxs  # No interval subsampling; use tree indices directly.
     batch = self.extract_batch(T_idxs, B_idxs, self.batch_T)
     is_weights = (1. / priorities) ** self.beta
     is_weights /= max(is_weights)  # Normalize.
     is_weights = torchify_buffer(is_weights).float()
     return SamplesFromReplayPri(*batch, is_weights=is_weights)
Example #26
    def obtain_samples(self, itr, mode='sample'):
        agent_buf, env_buf = self.samples_np.agent, self.samples_np.env

        # Reset agent inputs
        observation, action, reward = self.agent_inputs
        obs_pyt, act_pyt, rew_pyt = torchify_buffer(self.agent_inputs)
        action[:] = self.env.action_space.null_value()  # Reset agent inputs.
        reward[:] = 0

        # reset environment and agent
        observation[:] = self.env.reset()
        self.agent.reset()
        agent_buf.prev_action[0] = action  # Leading prev_action.
        env_buf.prev_reward[0] = reward

        # perform episode
        if mode == 'sample':
            self.agent.sample_mode(itr)
        elif mode == 'eval':
            self.agent.eval_mode(itr)
        traj_infos = [
            self.TrajInfoCls(**self.traj_info_kwargs)
            for _ in range(self.batch_spec.B)
        ]
        for t in range(self.batch_spec.T):
            env_buf.observation[t] = observation

            act_pyt, agent_info = self.agent.step(obs_pyt, act_pyt, rew_pyt)
            action = numpify_buffer(act_pyt)  # Numpy view of act_pyt (shared memory), for stepping the env.

            o, r, _, env_info = self.env.step(action)
            d = (t == self.batch_spec.T - 1)
            for b in range(self.batch_spec.B):
                traj_infos[b].step(observation[b], action[b], r[b], d,
                                   agent_info[b], env_info)
                if env_info:
                    env_buf.env_info[t, b] = env_info
            observation[:] = o
            reward[:] = r

            agent_buf.action[t] = action
            env_buf.reward[t] = reward
            if agent_info:
                agent_buf.agent_info[t] = agent_info

        if "bootstrap_value" in agent_buf:
            agent_buf.bootstrap_value[:] = self.agent.value(
                obs_pyt, act_pyt, rew_pyt)

        return self.samples_pyt, traj_infos
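
Note: on the ``numpify_buffer`` comment above, for CPU tensors the numpy array returned shares memory with the torch tensor, so the agent writes into ``act_pyt`` and the sampler reads the same values through ``action``. A tiny demonstration of that sharing:

import torch

t = torch.zeros(3)
a = t.numpy()        # For CPU tensors, .numpy() returns a view on the same storage.
t[0] = 1.0
assert a[0] == 1.0   # A write through the torch tensor is visible in the numpy view.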
Example #27
def build_samples_buffer(agent, env, batch_spec, bootstrap_value=False,
                         agent_shared=True, env_shared=True, subprocess=True, examples=None):
    """Recommended to step/reset agent and env in subprocess, so it doesn't
    affect settings in master before forking workers (e.g. torch num_threads
    (MKL) may be set at first forward computation.)

    :param agent: an Agent instance.
    :param env: an environment instance.
    :param batch_spec: a BatchSpec instance.
    """
    if examples is None:
        if subprocess:  # Spawn a subprocess.
            mgr = mp.Manager()  # The Manager module provides state shared across processes.
            examples = mgr.dict()  # Examples pickled back to master; a dict the subprocess can share.
            w = mp.Process(target=get_example_outputs,
                           args=(agent, env, examples, subprocess))  # Worker process runs the target function with these args.
            w.start()
            w.join()
        else:
            examples = dict()
            get_example_outputs(agent, env, examples)  # examples is updated inside get_example_outputs(), so there is no return value.

    T, B = batch_spec  # Number of time steps and number of environment instances.
    all_action = buffer_from_example(examples["action"], (T + 1, B), agent_shared)
    action = all_action[1:]
    prev_action = all_action[:-1]  # Writing to action will populate prev_action.
    agent_info = buffer_from_example(examples["agent_info"], (T, B), agent_shared)
    agent_buffer = AgentSamples(
        action=action,
        prev_action=prev_action,
        agent_info=agent_info,
    )
    if bootstrap_value:
        bv = buffer_from_example(examples["agent_info"].value, (1, B), agent_shared)
        agent_buffer = AgentSamplesBsv(*agent_buffer, bootstrap_value=bv)

    observation = buffer_from_example(examples["observation"], (T, B), env_shared)
    all_reward = buffer_from_example(examples["reward"], (T + 1, B), env_shared)
    reward = all_reward[1:]
    prev_reward = all_reward[:-1]  # Writing to reward will populate prev_reward.
    done = buffer_from_example(examples["done"], (T, B), env_shared)
    env_info = buffer_from_example(examples["env_info"], (T, B), env_shared)
    env_buffer = EnvSamples(
        observation=observation,
        reward=reward,
        prev_reward=prev_reward,
        done=done,
        env_info=env_info,
    )
    samples_np = Samples(agent=agent_buffer, env=env_buffer)
    samples_pyt = torchify_buffer(samples_np)
    return samples_pyt, samples_np, examples
Example #28
 def sample_batch(self, batch_B):
     """Returns batch with leading dimensions ``[self.batch_T, batch_B]``,
     with each sequence sampled randomly according to priority.
     (``self.batch_T`` should not be changed)."""
     (T_idxs, B_idxs), priorities = self.priority_tree.sample(
         batch_B, unique=self.unique)
     if self.rnn_state_interval > 1:
         T_idxs = T_idxs * self.rnn_state_interval
     batch = self.extract_batch(T_idxs, B_idxs, self.batch_T)
     is_weights = (1. / priorities) ** self.beta
     is_weights /= max(is_weights)  # Normalize.
     is_weights = torchify_buffer(is_weights).float()
     return SamplesFromReplayPri(*batch, is_weights=is_weights)
Example #29
    def policy(time_step):
        obs = dmc_wrapper.convertObservation(time_step.observation)
        reward = time_step.reward
        reward = np.asarray(reward) if reward is not None else reward

        obs_pyt, act_pyt, rew_pyt = torchify_buffer(
            (obs, get_prev_action(), reward))
        # obs_pyt, rew_pyt = torchify_buffer((obs, reward))

        act_pyt, agent_info = agent.step(obs_pyt.float(), act_pyt, rew_pyt)
        # prev_action = act_pyt

        return act_pyt
Example #30
 def sample_batch(self, batch_B):
     """Calls on the priority tree to generate random samples.  Returns
     samples data and normalized importance-sampling weights:
     ``is_weights=priorities ** -beta``
     """
     (T_idxs,
      B_idxs), priorities = self.priority_tree.sample(batch_B,
                                                      unique=self.unique)
     batch = self.extract_batch(T_idxs, B_idxs)
     is_weights = (1. / (priorities + EPS))**self.beta  # Unnormalized.
     is_weights /= max(is_weights)  # Normalize.
     is_weights = torchify_buffer(is_weights).float()
     return SamplesFromReplayPri(*batch, is_weights=is_weights)
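
Note: across the prioritized-replay examples above, the importance-sampling weights follow the same normalized form,

    w_i = (1 / (p_i + eps))^beta / max_j (1 / (p_j + eps))^beta,

which matches the docstring's shorthand ``is_weights = priorities ** -beta`` once the small eps term and the max-normalization are included.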