Example #1
def test_loss():
    batch_b = 2
    batch_t = 4
    stoch_state_dim = 3
    deter_state_dim = 4
    action_size = 3
    img_size = (3, 64, 64)  # TODO: figure out why atari games have 4 channels.

    dreamer = make_dreamer(action_size)

    # categorical action tensor
    action = torch.randint(action_size, (batch_t, batch_b))
    prev_action = torch.randn(batch_t, batch_b, action_size)
    observation = torch.randn(batch_t, batch_b, *img_size)
    env_reward = torch.randn(batch_t, batch_b, 1)
    prev_reward = torch.randn(batch_t, batch_b)
    done = torch.zeros(batch_t, batch_b, dtype=torch.bool)
    env_info = EnvInfo()
    prev_state = make_rssm_state(batch_t, batch_b, stoch_state_dim,
                                 deter_state_dim)
    agent_info = DreamerAgentInfo(prev_state=prev_state)
    agent_samples = AgentSamples(action=action,
                                 prev_action=prev_action,
                                 agent_info=agent_info)
    env_samples = EnvSamples(observation=observation,
                             reward=env_reward,
                             prev_reward=prev_reward,
                             done=done,
                             env_info=env_info)
    samples = Samples(agent=agent_samples, env=env_samples)
    loss = dreamer.loss(samples)

    # Check we have a single-element FloatTensor with a gradient
    assert isinstance(loss, torch.FloatTensor)
    assert loss.requires_grad
    assert loss.shape == ()

    # Check it still works if we pass in discrete actions
    num_actions = 6
    dreamer = make_dreamer(num_actions)
    action = torch.randint(0, num_actions, (batch_t, batch_b))
    prev_action = torch.randint(0, num_actions, (batch_t, batch_b))
    agent_samples = AgentSamples(action=action,
                                 prev_action=prev_action,
                                 agent_info=agent_info)
    env_samples = EnvSamples(observation=observation,
                             reward=env_reward,
                             prev_reward=prev_reward,
                             done=done,
                             env_info=env_info)
    samples = Samples(agent=agent_samples, env=env_samples)
    loss = dreamer.loss(samples)
    assert isinstance(loss, torch.FloatTensor)
    assert loss.requires_grad
    assert loss.shape == ()
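
The assertions above only check structural properties of the returned loss. A minimal, self-contained sketch of that property (not the Dreamer model itself, just plain PyTorch): any value computed from parameterized modules and reduced with .mean() is a 0-dim CPU FloatTensor that carries a gradient.

import torch

linear = torch.nn.Linear(4, 1)            # any module with learnable parameters
fake_inputs = torch.randn(8, 4)
loss = (linear(fake_inputs) ** 2).mean()  # reduce to a scalar

assert isinstance(loss, torch.FloatTensor)  # CPU float32 tensor
assert loss.requires_grad                   # gradient flows back to the parameters
assert loss.shape == ()                     # 0-dimensional
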
Example #2
def build_samples_buffer(agent,
                         env,
                         batch_spec,
                         bootstrap_value=False,
                         agent_shared=True,
                         env_shared=True,
                         subprocess=True,
                         examples=None):
    """Recommended to step/reset agent and env in subprocess, so it doesn't
    affect settings in master before forking workers (e.g. torch num_threads
    (MKL) may be set at first forward computation.)"""
    if examples is None:
        if subprocess:
            mgr = mp.Manager()
            examples = mgr.dict()  # Examples pickled back to master.
            w = mp.Process(target=get_example_outputs,
                           args=(agent, env, examples, subprocess))
            w.start()
            w.join()
        else:
            examples = dict()
            get_example_outputs(agent, env, examples)

    T, B = batch_spec
    all_action = buffer_from_example(examples["action"], (T + 1, B),
                                     agent_shared)
    action = all_action[1:]
    prev_action = all_action[:-1]  # Writing to action will populate prev_action.
    agent_info = buffer_from_example(examples["agent_info"], (T, B),
                                     agent_shared)
    agent_buffer = AgentSamples(
        action=action,
        prev_action=prev_action,
        agent_info=agent_info,
    )
    if bootstrap_value:
        bv = buffer_from_example(examples["agent_info"].value, (1, B),
                                 agent_shared)
        agent_buffer = AgentSamplesBsv(*agent_buffer, bootstrap_value=bv)

    observation = buffer_from_example(examples["observation"], (T, B),
                                      env_shared)
    all_reward = buffer_from_example(examples["reward"], (T + 1, B),
                                     env_shared)
    reward = all_reward[1:]
    prev_reward = all_reward[:-1]  # Writing to reward will populate prev_reward.
    done = buffer_from_example(examples["done"], (T, B), env_shared)
    env_info = buffer_from_example(examples["env_info"], (T, B), env_shared)
    env_buffer = EnvSamples(
        observation=observation,
        reward=reward,
        prev_reward=prev_reward,
        done=done,
        env_info=env_info,
    )
    samples_np = Samples(agent=agent_buffer, env=env_buffer)
    samples_pyt = torchify_buffer(samples_np)
    return samples_pyt, samples_np, examples
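
The "Writing to action will populate prev_action" comment relies on the two slices being overlapping views of the same (T + 1, B) array. A small numpy sketch of that trick (shapes are illustrative only):

import numpy as np

T, B = 4, 2
all_action = np.zeros((T + 1, B))
action, prev_action = all_action[1:], all_action[:-1]  # overlapping views, no copies

action[0] = 7.0                        # write the action taken at step t=0 ...
assert (prev_action[1] == 7.0).all()   # ... and it appears as the previous action at t=1
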
Example #3
def build_samples_buffer(agent, env, batch_spec, bootstrap_value=False,
        agent_shared=True, env_shared=True, subprocess=True, examples=None):
    """Recommended to step/reset agent and env in subprocess, so it doesn't
    affect settings in master before forking workers (e.g. torch num_threads
    (MKL) may be set at first forward computation.)"""
    if examples is None:
        if subprocess:
            mgr = mp.Manager()
            examples = mgr.dict()  # Examples pickled back to master.
            w = mp.Process(target=get_example_outputs,
                args=(agent, env, examples, subprocess))
            w.start()
            w.join()
        else:
            examples = dict()
            get_example_outputs(agent, env, examples)

    T, B = batch_spec
    all_action = buffer_from_example(examples["action"], (T + 1, B), agent_shared)
    action = all_action[1:]
    prev_action = all_action[:-1]  # Writing to action will populate prev_action.
    agent_info = buffer_from_example(examples["agent_info"], (T, B), agent_shared)
    agent_buffer = AgentSamples(
        action=action,
        prev_action=prev_action,
        agent_info=agent_info,
    )
    if bootstrap_value:        
        if agent.dual_model:
            bv = buffer_from_example(examples["agent_info"].value, (1, B), agent_shared)
            int_bv = buffer_from_example(examples["agent_info"].value, (1, B), agent_shared)
            agent_buffer = AgentSamplesBsvTwin(*agent_buffer, bootstrap_value=bv, int_bootstrap_value=int_bv)
        else:
            bv = buffer_from_example(examples["agent_info"].value, (1, B), agent_shared)
            agent_buffer = AgentSamplesBsv(*agent_buffer, bootstrap_value=bv)

    observation = buffer_from_example(examples["observation"], (T, B), env_shared) # all zero arrays (except 0th index should equal o_reset)
    next_observation = buffer_from_example(examples["observation"], (T, B), env_shared) 
    all_reward = buffer_from_example(examples["reward"], (T + 1, B), env_shared) # all zero values
    reward = all_reward[1:]
    prev_reward = all_reward[:-1]  # Writing to reward will populate prev_reward.
    done = buffer_from_example(examples["done"], (T, B), env_shared)
    env_info = buffer_from_example(examples["env_info"], (T, B), env_shared)
    env_buffer = EnvSamples(
        observation=observation,
        next_observation=next_observation,
        prev_reward=prev_reward,
        reward=reward,
        done=done,
        env_info=env_info,
    )
    samples_np = Samples(agent=agent_buffer, env=env_buffer)
    samples_pyt = torchify_buffer(samples_np) # this links the two (changes to samples_np will reflect in samples_pyt)
    return samples_pyt, samples_np, examples
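
The last comment notes that samples_np and samples_pyt are linked. A minimal sketch of that behaviour, assuming torchify_buffer wraps each leaf array with torch.from_numpy (which shares the underlying storage) rather than copying it:

import numpy as np
import torch

samples_np = np.zeros((3, 2), dtype=np.float32)
samples_pyt = torch.from_numpy(samples_np)  # shares memory with samples_np

samples_np[0, 0] = 1.0
assert samples_pyt[0, 0].item() == 1.0      # the write is visible through the torch view
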
Example #4
def build_samples_buffer(agent, env, batch_spec, bootstrap_value=False,
                         agent_shared=True, env_shared=True, subprocess=True, examples=None):
    """Recommended to step/reset agent and env in subprocess, so it doesn't
    affect settings in master before forking workers (e.g. torch num_threads
    (MKL) may be set at first forward computation.)

    :param agent: 一个Agent类的对象。
    :param env: 一个environment类的对象。
    :param batch_spec: 一个BatchSpec类的对象。
    """
    if examples is None:
        if subprocess:  # Spawn a child process.
            mgr = mp.Manager()  # Manager provides resources shared between processes.
            examples = mgr.dict()  # Examples pickled back to master; a dict the subprocess can write to.
            w = mp.Process(target=get_example_outputs,
                           args=(agent, env, examples, subprocess))  # Worker process running the target function with the given args.
            w.start()
            w.join()
        else:
            examples = dict()
            get_example_outputs(agent, env, examples)  # examples is updated in place, so nothing is returned.

    T, B = batch_spec  # Number of time steps and number of environment instances.
    all_action = buffer_from_example(examples["action"], (T + 1, B), agent_shared)
    action = all_action[1:]
    prev_action = all_action[:-1]  # Writing to action will populate prev_action.
    agent_info = buffer_from_example(examples["agent_info"], (T, B), agent_shared)
    agent_buffer = AgentSamples(
        action=action,
        prev_action=prev_action,
        agent_info=agent_info,
    )
    if bootstrap_value:
        bv = buffer_from_example(examples["agent_info"].value, (1, B), agent_shared)
        agent_buffer = AgentSamplesBsv(*agent_buffer, bootstrap_value=bv)

    observation = buffer_from_example(examples["observation"], (T, B), env_shared)
    all_reward = buffer_from_example(examples["reward"], (T + 1, B), env_shared)
    reward = all_reward[1:]
    prev_reward = all_reward[:-1]  # Writing to reward will populate prev_reward.
    done = buffer_from_example(examples["done"], (T, B), env_shared)
    env_info = buffer_from_example(examples["env_info"], (T, B), env_shared)
    env_buffer = EnvSamples(
        observation=observation,
        reward=reward,
        prev_reward=prev_reward,
        done=done,
        env_info=env_info,
    )
    samples_np = Samples(agent=agent_buffer, env=env_buffer)
    samples_pyt = torchify_buffer(samples_np)
    return samples_pyt, samples_np, examples
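
A stripped-down sketch of the subprocess pattern used when examples is None: the worker fills a Manager dict, which is sent back to the parent after join(), so the example step/reset never touches process-wide state (e.g. torch thread settings) in the master. The worker function below is a hypothetical stand-in, not the real get_example_outputs.

import multiprocessing as mp

def fill_examples(examples):
    examples["action"] = 0          # placeholder for an example agent output
    examples["observation"] = 0.0   # placeholder for an example env output

if __name__ == "__main__":
    mgr = mp.Manager()
    examples = mgr.dict()           # proxy dict shared with the child process
    w = mp.Process(target=fill_examples, args=(examples,))
    w.start()
    w.join()
    print(dict(examples))           # {'action': 0, 'observation': 0.0}
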
Example #5
def build_intrinsic_samples_buffer(agent,
                                   env,
                                   batch_spec,
                                   bootstrap_value=False,
                                   next_obs=False,
                                   agent_shared=True,
                                   env_shared=True,
                                   subprocess=True,
                                   examples=None):
    """
    Replaces ``build_samples_buffer`` to add additional buffer space for intrinsic bonus agents.
    If bootstrap_value=True, also adds space for int_bootstrap_value from intrinsic value head.
    If next_obs=True, also adds space for next observations (NOTE: This is memory intensive with
    raw pixel states, as it doubles the space to store images. Keep this as False unless the
    algorithm needs it).
    """
    if examples is None:
        if subprocess:
            mgr = mp.Manager()
            examples = mgr.dict()  # Examples pickled back to master.
            w = mp.Process(target=get_example_outputs,
                           args=(agent, env, examples, subprocess))
            w.start()
            w.join()
        else:
            examples = dict()
            get_example_outputs(agent, env, examples)

    T, B = batch_spec
    all_action = buffer_from_example(examples["action"], (T + 1, B),
                                     agent_shared)
    action = all_action[1:]
    prev_action = all_action[:-1]  # Writing to action will populate prev_action.
    agent_info = buffer_from_example(examples["agent_info"], (T, B),
                                     agent_shared)
    agent_buffer = AgentSamples(
        action=action,
        prev_action=prev_action,
        agent_info=agent_info,
    )
    if bootstrap_value:  # Added buffer space for intrinsic bootstrap value
        bv = buffer_from_example(examples["agent_info"].ext_value, (1, B),
                                 agent_shared)
        int_bv = buffer_from_example(examples["agent_info"].int_value, (1, B),
                                     agent_shared)
        agent_buffer = IntAgentSamplesBsv(*agent_buffer,
                                          bootstrap_value=bv,
                                          int_bootstrap_value=int_bv)

    observation = buffer_from_example(examples["observation"], (T, B),
                                      env_shared)
    all_reward = buffer_from_example(examples["reward"], (T + 1, B),
                                     env_shared)
    reward = all_reward[1:]
    prev_reward = all_reward[:-1]  # Writing to reward will populate prev_reward.
    done = buffer_from_example(examples["done"], (T, B), env_shared)
    env_info = buffer_from_example(examples["env_info"], (T, B), env_shared)

    if next_obs:  # Add buffer space for next obs, if specified
        next_observation = buffer_from_example(examples["observation"], (T, B),
                                               env_shared)
        env_buffer = EnvSamplesPlus(
            observation=observation,
            next_observation=next_observation,
            reward=reward,
            prev_reward=prev_reward,
            done=done,
            env_info=env_info,
        )
    else:
        env_buffer = EnvSamples(
            observation=observation,
            reward=reward,
            prev_reward=prev_reward,
            done=done,
            env_info=env_info,
        )

    samples_np = Samples(agent=agent_buffer, env=env_buffer)
    samples_pyt = torchify_buffer(samples_np)
    return samples_pyt, samples_np, examples
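
To make the docstring's memory warning concrete, a quick back-of-the-envelope check with assumed, illustrative Atari-style sizes: enabling next_obs duplicates the whole (T, B, C, H, W) image buffer.

import numpy as np

T, B, img_shape = 128, 16, (4, 84, 84)   # assumed, illustrative sizes
obs_bytes = T * B * int(np.prod(img_shape)) * np.dtype(np.uint8).itemsize
print(f"observation buffer: {obs_bytes / 2**20:.1f} MiB;"
      f" with next_obs=True: {2 * obs_bytes / 2**20:.1f} MiB")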