Example #1
def test_new_time_step(sample_data):
    s = TimeStep(**sample_data)
    assert s.env_spec is sample_data['env_spec']
    assert s.observation is sample_data['observation']
    assert s.action is sample_data['action']
    assert s.reward is sample_data['reward']
    assert s.terminal is sample_data['terminal']
    assert s.env_info is sample_data['env_info']
    assert s.agent_info is sample_data['agent_info']
    del s

    obs_space = akro.Box(low=-1, high=10, shape=(4, 3, 2), dtype=np.float32)
    act_space = akro.Box(low=-1, high=10, shape=(4, 2), dtype=np.float32)
    env_spec = EnvSpec(obs_space, act_space)
    sample_data['env_spec'] = env_spec
    obs_space = akro.Box(low=-1000,
                         high=1000,
                         shape=(4, 3, 2),
                         dtype=np.float32)
    act_space = akro.Box(low=-1000, high=1000, shape=(4, 2), dtype=np.float32)
    sample_data['observation'] = obs_space.sample()
    sample_data['next_observation'] = obs_space.sample()
    sample_data['action'] = act_space.sample()
    s = TimeStep(**sample_data)

    assert s.observation is sample_data['observation']
    assert s.next_observation is sample_data['next_observation']
    assert s.action is sample_data['action']
Example #2
    def get_env_spec(cls, env_spec, latent_dim, module):
        """Get environment specs of encoder with latent dimension.

        Args:
            env_spec (metarl.envs.EnvSpec): Environment specs.
            latent_dim (int): Latent dimension.
            module (str): Module to get environment specs for.

        Returns:
            metarl.envs.InOutSpec: Module environment specs with latent
                dimension.

        """
        obs_dim = int(np.prod(env_spec.observation_space.shape))
        action_dim = int(np.prod(env_spec.action_space.shape))
        if module == 'encoder':
            in_dim = obs_dim + action_dim + 1
            out_dim = latent_dim * 2
        elif module == 'vf':
            in_dim = obs_dim
            out_dim = latent_dim
        in_space = akro.Box(low=-1, high=1, shape=(in_dim, ), dtype=np.float32)
        out_space = akro.Box(low=-1,
                             high=1,
                             shape=(out_dim, ),
                             dtype=np.float32)
        if module == 'encoder':
            spec = InOutSpec(in_space, out_space)
        elif module == 'vf':
            spec = EnvSpec(in_space, out_space)

        return spec
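
A minimal usage sketch for the method above, assuming it is defined as a classmethod and that metarl's InOutSpec exposes input_space/output_space; MyAlgo is a hypothetical stand-in for the owning class and is not part of the excerpt.

import akro
import numpy as np
from metarl.envs import EnvSpec

# Hypothetical sketch -- MyAlgo stands in for the class that owns get_env_spec.
env_spec = EnvSpec(akro.Box(low=-1, high=1, shape=(4, ), dtype=np.float32),
                   akro.Box(low=-1, high=1, shape=(2, ), dtype=np.float32))
encoder_spec = MyAlgo.get_env_spec(env_spec, latent_dim=5, module='encoder')
# Encoder input is (obs, action, reward); its output holds a mean and a
# standard deviation per latent dimension.
assert encoder_spec.input_space.shape == (4 + 2 + 1, )
assert encoder_spec.output_space.shape == (5 * 2, )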
Example #3
def sample_data():
    # spaces
    obs_space = gym.spaces.Box(low=1,
                               high=10,
                               shape=(4, 3, 2),
                               dtype=np.float32)
    act_space = gym.spaces.MultiDiscrete([2, 5])
    env_spec = EnvSpec(obs_space, act_space)

    # generate data
    obs = obs_space.sample()
    next_obs = obs_space.sample()
    act = act_space.sample()
    rew = 10.0
    terms = False

    # env_infos
    env_infos = dict()
    env_infos['goal'] = np.array([[1, 1]])
    env_infos['TimeLimit.truncated'] = not terms

    # agent_infos
    agent_infos = dict()
    agent_infos['prev_action'] = act

    return {
        'env_spec': env_spec,
        'observation': obs,
        'next_observation': next_obs,
        'action': act,
        'reward': rew,
        'terminal': terms,
        'env_info': env_infos,
        'agent_info': agent_infos,
    }
Example #4
def test_log_multitask_performance_task_id():
    lengths = np.array([10, 5, 1, 1])
    batch = TrajectoryBatch(
        EnvSpec(akro.Box(np.array([0., 0., 0.]), np.array([1., 1., 1.])),
                akro.Box(np.array([-1., -1.]), np.array([0., 0.]))),
        observations=np.ones((sum(lengths), 3), dtype=np.float32),
        last_observations=np.ones((len(lengths), 3), dtype=np.float32),
        actions=np.zeros((sum(lengths), 2), dtype=np.float32),
        rewards=np.array([
            0.34026529, 0.58263177, 0.84307509, 0.97651095, 0.81723901,
            0.22631398, 0.03421301, 0.97515046, 0.64311832, 0.65068933,
            0.17657714, 0.04783857, 0.73904013, 0.41364329, 0.52235551,
            0.24203526, 0.43328910
        ]),
        terminals=np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1],
                           dtype=bool),
        env_infos={
            'success':
            np.array([0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1],
                     dtype=bool),
            'task_id':
            np.array([1] * 10 + [3] * 5 + [1] + [4])
        },
        agent_infos={},
        lengths=lengths)

    log_file = tempfile.NamedTemporaryFile()
    csv_output = dowel.CsvOutput(log_file.name)
    logger.add_output(csv_output)
    log_multitask_performance(7, batch, 0.8, {
        1: 'env1',
        3: 'env2',
        4: 'env3',
        5: 'env4'
    })
    logger.log(tabular)
    logger.dump_output_type(dowel.CsvOutput)
    with open(log_file.name, 'r') as file:
        rows = list(csv.DictReader(file))
    res = {k: float(r) for (k, r) in rows[0].items()}
    assert res['env1/Iteration'] == 7
    assert res['env2/Iteration'] == 7
    assert res['env3/Iteration'] == 7
    assert res['env4/Iteration'] == 7
    assert res['env1/NumTrajs'] == 2
    assert res['env2/NumTrajs'] == 1
    assert res['env3/NumTrajs'] == 1
    assert res['env4/NumTrajs'] == 0
    assert math.isclose(res['env1/SuccessRate'], 0.5)
    assert math.isclose(res['env2/SuccessRate'], 1.0)
    assert math.isclose(res['env3/SuccessRate'], 1.0)
    assert math.isnan(res['env4/SuccessRate'])
    assert math.isnan(res['env4/AverageReturn'])
Example #5
    def __init__(self, env, task_index, n_total_tasks):
        assert 0 <= task_index < n_total_tasks
        super().__init__(env)
        self._task_index = task_index
        self._n_total_tasks = n_total_tasks
        env_lb = self.env.observation_space.low
        env_ub = self.env.observation_space.high
        one_hot_ub = np.ones(self._n_total_tasks)
        one_hot_lb = np.zeros(self._n_total_tasks)
        self.observation_space = akro.Box(np.concatenate([env_lb, one_hot_lb]),
                                          np.concatenate([env_ub, one_hot_ub]))
        self.__spec = EnvSpec(action_space=self.action_space,
                              observation_space=self.observation_space)
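
For context, a self-contained illustration (not part of the wrapper above) of the one-hot task encoding that the concatenated observation_space is sized for:

import numpy as np

# Illustration only: task_index = 2 out of n_total_tasks = 4, appended as a
# one-hot suffix to a 2-dimensional raw observation.
raw_obs = np.array([0.3, -0.1])
one_hot = np.zeros(4)
one_hot[2] = 1.0
augmented_obs = np.concatenate([raw_obs, one_hot])
assert augmented_obs.shape == (6, )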
Example #6
def test_act_env_spec_mismatch_time_step(sample_data):
    with pytest.raises(ValueError,
                       match='action must conform to action_space'):
        sample_data['action'] = sample_data['action'][:-1]
        s = TimeStep(**sample_data)
        del s

    obs_space = akro.Box(low=1, high=10, shape=(4, 3, 2), dtype=np.float32)
    act_space = akro.Discrete(5)
    env_spec = EnvSpec(obs_space, act_space)
    sample_data['env_spec'] = env_spec

    with pytest.raises(ValueError,
                       match='action should have the same dimensionality'):
        sample_data['action'] = sample_data['action'][:-1]
        s = TimeStep(**sample_data)
        del s
Example #7
def test_obs_env_spec_mismatch_time_step(sample_data):
    with pytest.raises(ValueError,
                       match='observation must conform to observation_space'):
        sample_data['observation'] = sample_data['observation'][:, :, :1]
        s = TimeStep(**sample_data)
        del s

    obs_space = akro.Box(low=1, high=10, shape=(4, 5, 2), dtype=np.float32)
    act_space = gym.spaces.MultiDiscrete([2, 5])
    env_spec = EnvSpec(obs_space, act_space)
    sample_data['env_spec'] = env_spec

    with pytest.raises(
            ValueError,
            match='observation should have the same dimensionality'):
        sample_data['observation'] = sample_data['observation'][:, :, :1]
        s = TimeStep(**sample_data)
        del s
Example #8
def test_log_performance():
    lengths = np.array([10, 5, 1, 1])
    batch = TrajectoryBatch(
        EnvSpec(akro.Box(np.array([0., 0., 0.]), np.array([1., 1., 1.])),
                akro.Box(np.array([-1., -1.]), np.array([0., 0.]))),
        observations=np.ones((sum(lengths), 3), dtype=np.float32),
        last_observations=np.ones((len(lengths), 3), dtype=np.float32),
        actions=np.zeros((sum(lengths), 2), dtype=np.float32),
        rewards=np.array([
            0.34026529, 0.58263177, 0.84307509, 0.97651095, 0.81723901,
            0.22631398, 0.03421301, 0.97515046, 0.64311832, 0.65068933,
            0.17657714, 0.04783857, 0.73904013, 0.41364329, 0.52235551,
            0.24203526, 0.43328910
        ]),
        terminals=np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1],
                           dtype=bool),
        env_infos={
            'success':
            np.array([0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1],
                     dtype=bool)
        },
        agent_infos={},
        lengths=lengths)

    log_file = tempfile.NamedTemporaryFile()
    csv_output = dowel.CsvOutput(log_file.name)
    logger.add_output(csv_output)
    log_performance(7, batch, 0.8, prefix='test_log_performance')
    logger.log(tabular)
    logger.dump_output_type(dowel.CsvOutput)
    with open(log_file.name, 'r') as file:
        rows = list(csv.DictReader(file))
    res = {k: float(r) for (k, r) in rows[0].items()}
    assert res['test_log_performance/Iteration'] == 7
    assert res['test_log_performance/NumTrajs'] == 4
    assert math.isclose(res['test_log_performance/SuccessRate'], 0.75)
    assert math.isclose(res['test_log_performance/CompletionRate'], 0.5)
    assert math.isclose(res['test_log_performance/AverageDiscountedReturn'],
                        1.1131040640673113)
    assert math.isclose(res['test_log_performance/AverageReturn'],
                        2.1659965525)
    assert math.isclose(res['test_log_performance/StdReturn'],
                        2.354067152038576)
Example #9
def traj_data():
    # spaces
    obs_space = gym.spaces.Box(low=1,
                               high=np.inf,
                               shape=(4, 3, 2),
                               dtype=np.float32)
    act_space = gym.spaces.MultiDiscrete([2, 5])
    env_spec = EnvSpec(obs_space, act_space)

    # generate data
    lens = np.array([10, 20, 7, 25, 25, 40, 10, 5])
    n_t = lens.sum()
    obs = np.stack([obs_space.low] * n_t)
    last_obs = np.stack([obs_space.low] * len(lens))
    act = np.stack([[1, 3]] * n_t)
    rew = np.arange(n_t)
    terms = np.zeros(n_t, dtype=bool)  # np.bool is removed in modern NumPy
    terms[np.cumsum(lens) - 1] = True  # set terminal bits

    # env_infos
    env_infos = dict()
    env_infos['goal'] = np.stack([[1, 1]] * n_t)
    env_infos['foo'] = np.arange(n_t)

    # agent_infos
    agent_infos = dict()
    agent_infos['prev_action'] = act
    agent_infos['hidden'] = np.arange(n_t)

    return {
        'env_spec': env_spec,
        'observations': obs,
        'last_observations': last_obs,
        'actions': act,
        'rewards': rew,
        'terminals': terms,
        'env_infos': env_infos,
        'agent_infos': agent_infos,
        'lengths': lens,
    }
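
A likely way this fixture is consumed (a sketch only; the actual test body is not shown here): the returned dict unpacks directly into TrajectoryBatch's keyword arguments.

# Sketch: assumes traj_data is callable as a plain function here.
batch = TrajectoryBatch(**traj_data())
assert batch.observations.shape[0] == batch.lengths.sum()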
Example #10
    def augment_env_spec(cls, env_spec, latent_dim):
        """Augment environment by a size of latent dimension.

        Args:
            env_spec (metarl.envs.EnvSpec): Environment specs to be augmented.
            latent_dim (int): Latent dimension.

        Returns:
            metarl.envs.EnvSpec: Augmented environment specs.

        """
        obs_dim = int(np.prod(env_spec.observation_space.shape))
        action_dim = int(np.prod(env_spec.action_space.shape))
        aug_obs = akro.Box(low=-1,
                           high=1,
                           shape=(obs_dim + latent_dim, ),
                           dtype=np.float32)
        aug_act = akro.Box(low=-1,
                           high=1,
                           shape=(action_dim, ),
                           dtype=np.float32)
        return EnvSpec(aug_obs, aug_act)
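
A short usage sketch for augment_env_spec (MyAlgo is again a hypothetical stand-in for the owning class): the observation space grows by latent_dim while the action space keeps its size.

import akro
import numpy as np
from metarl.envs import EnvSpec

# Hypothetical sketch -- MyAlgo stands in for the class that owns
# augment_env_spec.
base_spec = EnvSpec(akro.Box(low=-1, high=1, shape=(4, ), dtype=np.float32),
                    akro.Box(low=-1, high=1, shape=(2, ), dtype=np.float32))
aug_spec = MyAlgo.augment_env_spec(base_spec, latent_dim=3)
assert aug_spec.observation_space.shape == (4 + 3, )
assert aug_spec.action_space.shape == (2, )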
Example #11
def run_task(snapshot_config, v, *_):
    """Run task.

    Args:
        snapshot_config (metarl.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.

        v (dict): Experiment variant parameters, unpacked into a
            SimpleNamespace at the start of the function.

        *_ (object): Ignored by this function.

    """
    v = SimpleNamespace(**v)

    task_names = sorted(v.tasks.keys())
    task_args = [v.tasks[t]['args'] for t in task_names]
    task_kwargs = [v.tasks[t]['kwargs'] for t in task_names]

    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        task_env_cls = PointEnv
        task_envs = [
            TfEnv(task_env_cls(*t_args, **t_kwargs))
            for t_args, t_kwargs in zip(task_args, task_kwargs)
        ]
        env = MultiEnvWrapper(
            task_envs,
            round_robin_strategy,
        )

        # Latent space and embedding specs
        # TODO(gh/10): this should probably be done in Embedding or Algo
        latent_lb = np.zeros(v.latent_length, )
        latent_ub = np.ones(v.latent_length, )
        latent_space = akro.Box(latent_lb, latent_ub)

        # trajectory space is (TRAJ_ENC_WINDOW, act_obs) where act_obs is a stacked
        # vector of flattened actions and observations
        act_lb, act_ub = env.action_space.bounds
        act_lb_flat = env.action_space.flatten(act_lb)
        act_ub_flat = env.action_space.flatten(act_ub)
        obs_lb, obs_ub = env.observation_space.bounds
        obs_lb_flat = env.observation_space.flatten(obs_lb)
        obs_ub_flat = env.observation_space.flatten(obs_ub)
        # act_obs_lb = np.concatenate([act_lb_flat, obs_lb_flat])
        # act_obs_ub = np.concatenate([act_ub_flat, obs_ub_flat])
        act_obs_lb = obs_lb_flat
        act_obs_ub = obs_ub_flat
        # act_obs_lb = act_lb_flat
        # act_obs_ub = act_ub_flat
        traj_lb = np.stack([act_obs_lb] * v.inference_window)
        traj_ub = np.stack([act_obs_ub] * v.inference_window)
        traj_space = akro.Box(traj_lb, traj_ub)

        task_embed_spec = EmbeddingSpec(env.task_space, latent_space)
        traj_embed_spec = EmbeddingSpec(traj_space, latent_space)
        task_obs_space = concat_spaces(env.task_space, env.observation_space)
        env_spec_embed = EnvSpec(task_obs_space, env.action_space)

        inference = GaussianMLPEmbedding(
            name='inference',
            embedding_spec=traj_embed_spec,
            hidden_sizes=(20,
                          10),  # was the same size as policy in Karol's paper
            std_share_network=True,
            init_std=2.0,
            output_nonlinearity=tf.nn.tanh,
            min_std=v.embedding_min_std,
        )

        # Embeddings
        task_embedding = GaussianMLPEmbedding(
            name='embedding',
            embedding_spec=task_embed_spec,
            hidden_sizes=(20, 20),
            std_share_network=True,
            init_std=v.embedding_init_std,
            max_std=v.embedding_max_std,
            output_nonlinearity=tf.nn.tanh,
            min_std=v.embedding_min_std,
        )

        # Multitask policy
        policy = GaussianMLPMultitaskPolicy(
            name='policy',
            env_spec=env.spec,
            task_space=env.task_space,
            embedding=task_embedding,
            hidden_sizes=(32, 16),
            std_share_network=True,
            max_std=v.policy_max_std,
            init_std=v.policy_init_std,
            min_std=v.policy_min_std,
        )

        baseline = MultiTaskLinearFeatureBaseline(env_spec=env.spec)

        algo = PPOTaskEmbedding(env_spec=env.spec,
                                policy=policy,
                                baseline=baseline,
                                inference=inference,
                                max_path_length=v.max_path_length,
                                discount=0.99,
                                lr_clip_range=0.2,
                                policy_ent_coeff=v.policy_ent_coeff,
                                embedding_ent_coeff=v.embedding_ent_coeff,
                                inference_ce_coeff=v.inference_ce_coeff,
                                entropy_method='max',
                                stop_entropy_gradient=True,
                                use_softplus_entropy=True,
                                optimizer_args=dict(
                                    batch_size=32,
                                    max_epochs=10,
                                ),
                                inference_optimizer_args=dict(
                                    batch_size=32,
                                    max_epochs=10,
                                ),
                                center_adv=True,
                                stop_ce_gradient=True)

        runner.setup(algo,
                     env,
                     sampler_cls=LocalSampler,
                     sampler_args=None,
                     worker_class=TaskEmbeddingWorker)
        runner.train(n_epochs=600, batch_size=v.batch_size, plot=False)
Example #12
    def __init__(self, env):
        super().__init__(env)
        action_space = akro.from_gym(self.env.action_space)
        observation_space = self._create_rl2_obs_space()
        self._spec = EnvSpec(action_space=action_space,
                             observation_space=observation_space)
Example #13
    def __init__(self, random=True):
        super().__init__(random)
        self.spec = EnvSpec(action_space=self.action_space,
                            observation_space=self.observation_space)

def test_methods():
    """Test PEARLWorker methods."""
    env_spec = MetaRLEnv(DummyBoxEnv())
    latent_dim = 5
    latent_space = akro.Box(low=-1,
                            high=1,
                            shape=(latent_dim, ),
                            dtype=np.float32)

    # add latent space to observation space to create a new space
    augmented_obs_space = akro.Tuple(
        (env_spec.observation_space, latent_space))
    augmented_env_spec = EnvSpec(augmented_obs_space, env_spec.action_space)

    obs_dim = int(np.prod(env_spec.observation_space.shape))
    action_dim = int(np.prod(env_spec.action_space.shape))
    reward_dim = 1
    encoder_input_dim = obs_dim + action_dim + reward_dim
    encoder_output_dim = latent_dim * 2
    encoder_hidden_sizes = (3, 2, encoder_output_dim)

    context_encoder = MLPEncoder(input_dim=encoder_input_dim,
                                 output_dim=encoder_output_dim,
                                 hidden_nonlinearity=None,
                                 hidden_sizes=encoder_hidden_sizes,
                                 hidden_w_init=nn.init.ones_,
                                 output_w_init=nn.init.ones_)

    policy = TanhGaussianMLPPolicy(env_spec=augmented_env_spec,
                                   hidden_sizes=(3, 5, 7),
                                   hidden_nonlinearity=F.relu,
                                   output_nonlinearity=None)

    context_policy = ContextConditionedPolicy(latent_dim=latent_dim,
                                              context_encoder=context_encoder,
                                              policy=policy,
                                              use_information_bottleneck=True,
                                              use_next_obs=False)

    max_path_length = 20
    worker1 = PEARLWorker(seed=1,
                          max_path_length=max_path_length,
                          worker_number=1)
    worker1.update_agent(context_policy)
    worker1.update_env(env_spec)
    rollouts = worker1.rollout()

    assert rollouts.observations.shape == (max_path_length, obs_dim)
    assert rollouts.actions.shape == (max_path_length, action_dim)
    assert rollouts.rewards.shape == (max_path_length, )

    worker2 = PEARLWorker(seed=1,
                          max_path_length=max_path_length,
                          worker_number=1,
                          deterministic=True,
                          accum_context=True)
    worker2.update_agent(context_policy)
    worker2.update_env(env_spec)
    rollouts = worker2.rollout()

    assert context_policy.context.shape == (1, max_path_length,
                                            encoder_input_dim)
    assert rollouts.observations.shape == (max_path_length, obs_dim)
    assert rollouts.actions.shape == (max_path_length, action_dim)
    assert rollouts.rewards.shape == (max_path_length, )