Example #1
    def observation(self, obs):
        # Debug output: max-x/y positions to watch exploration progress.
        if self.step_count == 0:
            if self.x_positions:
                # max_diff = max(
                #   np.sqrt((np.array(self.x_positions) - self.init_x) ** 2 + (
                #            np.array(self.y_positions) - self.init_y) ** 2))
                # print("After reset: max delta-x/y={}".format(max_diff))
                self.x_positions = []
                self.y_positions = []
            self.init_x = self.agent_pos[0]
            self.init_y = self.agent_pos[1]

        # Are we carrying the key?
        if self.carrying is not None:
            print("Carrying KEY!!")

        self.x_positions.append(self.agent_pos[0])
        self.y_positions.append(self.agent_pos[1])

        # One-hot the last dim into 11, 6, 3 one-hot vectors, then flatten.
        objects = one_hot(obs[:, :, 0], depth=11)
        colors = one_hot(obs[:, :, 1], depth=6)
        states = one_hot(obs[:, :, 2], depth=3)
        # Is the door we see open?
        for x in range(7):
            for y in range(7):
                if objects[x, y, 4] == 1.0 and states[x, y, 0] == 1.0:
                    print("Door OPEN!!")

        all_ = np.concatenate([objects, colors, states], -1)
        ret = np.reshape(all_, (-1, ))
        direction = one_hot(
            np.array(self.agent_dir), depth=4).astype(np.float32)
        return np.concatenate([ret, direction])
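Every example in this collection calls a `one_hot` helper (in the RLlib code these snippets come from, presumably the NumPy utility in `ray.rllib.utils.numpy`). As a minimal, illustrative stand-in, not the library's actual implementation, it can be sketched as:

import numpy as np

def one_hot(x, depth, dtype=np.float32):
    # One-hot encode the integer array `x` along a new last axis of size `depth`.
    x = np.asarray(x, dtype=np.int64)
    return np.eye(depth, dtype=dtype)[x]

# E.g. the 7x7 MiniGrid object channel above (11 object types):
# one_hot(obs[:, :, 0], depth=11).shape == (7, 7, 11)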
Example #2
    def test_multi_agent_sample_round_robin(self):
        ev = RolloutWorker(
            env_creator=lambda _: RoundRobinMultiAgent(5, increment_obs=True),
            policy_spec={
                "p0": PolicySpec(policy_class=MockPolicy),
            },
            policy_mapping_fn=lambda agent_id, episode, **kwargs: "p0",
            rollout_fragment_length=50,
        )
        batch = ev.sample()
        self.assertEqual(batch.count, 50)
        # Since agents are introduced into the env in round-robin fashion, some
        # of the env steps don't count as proper transitions.
        self.assertEqual(batch.policy_batches["p0"].count, 42)
        check(
            batch.policy_batches["p0"]["obs"][:10],
            one_hot(np.array([0, 1, 2, 3, 4] * 2), 10),
        )
        check(
            batch.policy_batches["p0"]["new_obs"][:10],
            one_hot(np.array([1, 2, 3, 4, 5] * 2), 10),
        )
        self.assertEqual(
            batch.policy_batches["p0"]["rewards"].tolist()[:10],
            [100, 100, 100, 100, 0] * 2,
        )
        self.assertEqual(
            batch.policy_batches["p0"]["dones"].tolist()[:10],
            [False, False, False, False, True] * 2,
        )
        self.assertEqual(
            batch.policy_batches["p0"]["t"].tolist()[:10],
            [4, 9, 14, 19, 24, 5, 10, 15, 20, 25],
        )
Example #3
    def test_multi_agent_complex_spaces(self):
        ModelCatalog.register_custom_model("dict_spy", DictSpyModel)
        ModelCatalog.register_custom_model("tuple_spy", TupleSpyModel)
        register_env("nested_ma", lambda _: NestedMultiAgentEnv())
        act_space = spaces.Discrete(2)
        pg = PGTrainer(env="nested_ma",
                       config={
                           "num_workers": 0,
                           "rollout_fragment_length": 5,
                           "train_batch_size": 5,
                           "multiagent": {
                               "policies": {
                                   "tuple_policy":
                                   (PGTFPolicy, TUPLE_SPACE, act_space, {
                                       "model": {
                                           "custom_model": "tuple_spy"
                                       }
                                   }),
                                   "dict_policy":
                                   (PGTFPolicy, DICT_SPACE, act_space, {
                                       "model": {
                                           "custom_model": "dict_spy"
                                       }
                                   }),
                               },
                               "policy_mapping_fn": lambda a: {
                                   "tuple_agent": "tuple_policy",
                                   "dict_agent": "dict_policy"
                               }[a],
                           },
                           "framework": "tf",
                       })
        # Skip first passes as they came from the TorchPolicy loss
        # initialization.
        TupleSpyModel.capture_index = DictSpyModel.capture_index = 0
        pg.train()

        for i in range(4):
            seen = pickle.loads(
                ray.experimental.internal_kv._internal_kv_get(
                    "d_spy_in_{}".format(i)))
            pos_i = DICT_SAMPLES[i]["sensors"]["position"].tolist()
            cam_i = DICT_SAMPLES[i]["sensors"]["front_cam"][0].tolist()
            task_i = one_hot(
                DICT_SAMPLES[i]["inner_state"]["job_status"]["task"], 5)
            self.assertEqual(seen[0][0].tolist(), pos_i)
            self.assertEqual(seen[1][0].tolist(), cam_i)
            check(seen[2][0], task_i)

        for i in range(4):
            seen = pickle.loads(
                ray.experimental.internal_kv._internal_kv_get(
                    "t_spy_in_{}".format(i)))
            pos_i = TUPLE_SAMPLES[i][0].tolist()
            cam_i = TUPLE_SAMPLES[i][1][0].tolist()
            task_i = one_hot(TUPLE_SAMPLES[i][2], 5)
            self.assertEqual(seen[0][0].tolist(), pos_i)
            self.assertEqual(seen[1][0].tolist(), cam_i)
            check(seen[2][0], task_i)
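Examples #3 through #6 all read back "spy" captures via `ray.experimental.internal_kv._internal_kv_get`. The spy models are assumed to store their forward-pass inputs under indexed keys (e.g. "d_spy_in_0") using the matching `_internal_kv_put` setter; a rough, illustrative sketch of that capture pattern follows (the function name and exact key handling are assumptions, not RLlib's actual spy-model code, and it requires an initialized Ray session):

import pickle
from ray.experimental.internal_kv import _internal_kv_put

capture_index = 0

def spy_capture(key_prefix, inputs):
    # Pickle one forward pass' (NumPy) inputs under an indexed key, e.g.
    # "d_spy_in_0", so a test can fetch them later with _internal_kv_get().
    global capture_index
    _internal_kv_put("{}_{}".format(key_prefix, capture_index),
                     pickle.dumps(inputs))
    capture_index += 1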
Example #4
    def test_py_torch_model(self):
        ModelCatalog.register_custom_model("composite", TorchSpyModel)
        register_env("nested", lambda _: NestedDictEnv())
        a2c = A2CTrainer(env="nested",
                         config={
                             "num_workers": 0,
                             "rollout_fragment_length": 5,
                             "train_batch_size": 5,
                             "model": {
                                 "custom_model": "composite",
                             },
                             "framework": "torch",
                         })

        # Skip first passes as they came from the TorchPolicy loss
        # initialization.
        TorchSpyModel.capture_index = 0
        a2c.train()

        # Check that the model sees the correct reconstructed observations
        for i in range(4):
            seen = pickle.loads(
                ray.experimental.internal_kv._internal_kv_get(
                    "torch_spy_in_{}".format(i)))

            pos_i = DICT_SAMPLES[i]["sensors"]["position"].tolist()
            cam_i = DICT_SAMPLES[i]["sensors"]["front_cam"][0].tolist()
            task_i = one_hot(
                DICT_SAMPLES[i]["inner_state"]["job_status"]["task"], 5)
            # Only look at the last entry (-1) in `seen` as we reset (re-use)
            # the ray-kv indices before training.
            self.assertEqual(seen[0][-1].tolist(), pos_i)
            self.assertEqual(seen[1][-1].tolist(), cam_i)
            check(seen[2][-1], task_i)
Example #5
    def do_test_nested_tuple(self, make_env):
        ModelCatalog.register_custom_model("composite2", TupleSpyModel)
        register_env("nested2", make_env)
        pg = PGTrainer(env="nested2",
                       config={
                           "num_workers": 0,
                           "rollout_fragment_length": 5,
                           "train_batch_size": 5,
                           "model": {
                               "custom_model": "composite2",
                           },
                           "framework": "tf",
                       })
        # Skip first passes as they came from the TorchPolicy loss
        # initialization.
        TupleSpyModel.capture_index = 0
        pg.train()

        # Check that the model sees the correct reconstructed observations
        for i in range(4):
            seen = pickle.loads(
                ray.experimental.internal_kv._internal_kv_get(
                    "t_spy_in_{}".format(i)))
            pos_i = TUPLE_SAMPLES[i][0].tolist()
            cam_i = TUPLE_SAMPLES[i][1][0].tolist()
            task_i = one_hot(TUPLE_SAMPLES[i][2], 5)
            self.assertEqual(seen[0][0].tolist(), pos_i)
            self.assertEqual(seen[1][0].tolist(), cam_i)
            check(seen[2][0], task_i)
Example #6
    def do_test_nested_dict(self, make_env, test_lstm=False):
        ModelCatalog.register_custom_model("composite", DictSpyModel)
        register_env("nested", make_env)
        pg = PGTrainer(env="nested",
                       config={
                           "num_workers": 0,
                           "rollout_fragment_length": 5,
                           "train_batch_size": 5,
                           "model": {
                               "custom_model": "composite",
                               "use_lstm": test_lstm,
                           },
                           "framework": "tf",
                       })
        # Skip first passes as they came from the TorchPolicy loss
        # initialization.
        DictSpyModel.capture_index = 0
        pg.train()

        # Check that the model sees the correct reconstructed observations
        for i in range(4):
            seen = pickle.loads(
                ray.experimental.internal_kv._internal_kv_get(
                    "d_spy_in_{}".format(i)))
            pos_i = DICT_SAMPLES[i]["sensors"]["position"].tolist()
            cam_i = DICT_SAMPLES[i]["sensors"]["front_cam"][0].tolist()
            task_i = one_hot(
                DICT_SAMPLES[i]["inner_state"]["job_status"]["task"], 5)
            self.assertEqual(seen[0][0].tolist(), pos_i)
            self.assertEqual(seen[1][0].tolist(), cam_i)
            check(seen[2][0], task_i)
Example #7
    def observation(self, obs):
        # Debug output: max-x/y positions to watch exploration progress.
        if self.step_count == 0:
            for _ in range(self.framestack):
                self.frame_buffer.append(np.zeros((self.single_frame_dim, )))
            if self.vector_index == 0:
                if self.x_positions:
                    max_diff = max(
                        np.sqrt((np.array(self.x_positions) - self.init_x)**2 +
                                (np.array(self.y_positions) - self.init_y)**2))
                    self.x_y_delta_buffer.append(max_diff)
                    print("100-average dist travelled={}".format(
                        np.mean(self.x_y_delta_buffer)))
                    self.x_positions = []
                    self.y_positions = []
                self.init_x = self.agent_pos[0]
                self.init_y = self.agent_pos[1]

        # Are we carrying the key?
        # if self.carrying is not None:
        #    print("Carrying KEY!!")

        self.x_positions.append(self.agent_pos[0])
        self.y_positions.append(self.agent_pos[1])

        # One-hot the last dim into 11, 6, 3 one-hot vectors, then flatten.
        objects = one_hot(obs[:, :, 0], depth=11)
        colors = one_hot(obs[:, :, 1], depth=6)
        states = one_hot(obs[:, :, 2], depth=3)
        # Is the door we see open?
        # for x in range(7):
        #    for y in range(7):
        #        if objects[x, y, 4] == 1.0 and states[x, y, 0] == 1.0:
        #            print("Door OPEN!!")

        all_ = np.concatenate([objects, colors, states], -1)
        all_flat = np.reshape(all_, (-1, ))
        direction = one_hot(np.array(self.agent_dir),
                            depth=4).astype(np.float32)
        single_frame = np.concatenate([all_flat, direction])
        self.frame_buffer.append(single_frame)
        return np.concatenate(self.frame_buffer)
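The wrapper above appends each flattened frame to `self.frame_buffer` and returns the concatenation of the whole buffer, which only works as frame-stacking if the buffer keeps a fixed length of `self.framestack`. A standalone sketch of that behavior using a `collections.deque` with `maxlen` (the buffer type and the sizes below are assumptions, based on the 7x7 grid one-hotted into 11+6+3 channels plus the 4-dim direction vector):

from collections import deque
import numpy as np

framestack = 4                                # assumed number of stacked frames
single_frame_dim = 7 * 7 * (11 + 6 + 3) + 4   # flattened one-hot grid + direction
# Fixed-length FIFO: appending a new frame silently drops the oldest one, so
# np.concatenate(frame_buffer) always yields exactly the last `framestack` frames.
frame_buffer = deque(
    [np.zeros((single_frame_dim,)) for _ in range(framestack)],
    maxlen=framestack)

frame_buffer.append(np.ones((single_frame_dim,)))  # newest observation frame
stacked = np.concatenate(frame_buffer)
assert stacked.shape == (framestack * single_frame_dim,)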
Example #8
    def test_simple_q_loss_function(self):
        """Tests the Simple-Q loss function results on all frameworks."""
        config = dqn.simple_q.SimpleQConfig().rollouts(num_rollout_workers=0)
        # Use very simple net (layer0=10 nodes, q-layer=2 nodes (2 actions)).
        config.training(model={
            "fcnet_hiddens": [10],
            "fcnet_activation": "linear",
        })

        for fw in framework_iterator(config):
            # Generate Trainer and get its default Policy object.
            trainer = dqn.SimpleQTrainer(config=config, env="CartPole-v0")
            policy = trainer.get_policy()
            # Batch of size=2.
            input_ = SampleBatch({
                SampleBatch.CUR_OBS:
                np.random.random(size=(2, 4)),
                SampleBatch.ACTIONS:
                np.array([0, 1]),
                SampleBatch.REWARDS:
                np.array([0.4, -1.23]),
                SampleBatch.DONES:
                np.array([False, False]),
                SampleBatch.NEXT_OBS:
                np.random.random(size=(2, 4)),
                SampleBatch.EPS_ID:
                np.array([1234, 1234]),
                SampleBatch.AGENT_INDEX:
                np.array([0, 0]),
                SampleBatch.ACTION_LOGP:
                np.array([-0.1, -0.1]),
                SampleBatch.ACTION_DIST_INPUTS:
                np.array([[0.1, 0.2], [-0.1, -0.2]]),
                SampleBatch.ACTION_PROB:
                np.array([0.1, 0.2]),
                "q_values":
                np.array([[0.1, 0.2], [0.2, 0.1]]),
            })
            # Get model vars for computing expected model outs (q-vals).
            # 0=layer-kernel; 1=layer-bias; 2=q-val-kernel; 3=q-val-bias
            vars = policy.get_weights()
            if isinstance(vars, dict):
                vars = list(vars.values())

            vars_t = policy.target_model.variables()
            if fw == "tf":
                vars_t = policy.get_session().run(vars_t)

            # Q(s,a) outputs.
            q_t = np.sum(
                one_hot(input_[SampleBatch.ACTIONS], 2) * fc(
                    fc(
                        input_[SampleBatch.CUR_OBS],
                        vars[0 if fw != "torch" else 2],
                        vars[1 if fw != "torch" else 3],
                        framework=fw,
                    ),
                    vars[2 if fw != "torch" else 0],
                    vars[3 if fw != "torch" else 1],
                    framework=fw,
                ),
                1,
            )
            # max[a'](Qtarget(s',a')) outputs.
            q_target_tp1 = np.max(
                fc(
                    fc(
                        input_[SampleBatch.NEXT_OBS],
                        vars_t[0 if fw != "torch" else 2],
                        vars_t[1 if fw != "torch" else 3],
                        framework=fw,
                    ),
                    vars_t[2 if fw != "torch" else 0],
                    vars_t[3 if fw != "torch" else 1],
                    framework=fw,
                ),
                1,
            )
            # TD-errors (Bellman equation).
            td_error = q_t - config.gamma * input_[
                SampleBatch.REWARDS] + q_target_tp1
            # Huber/Square loss on TD-error.
            expected_loss = huber_loss(td_error).mean()

            if fw == "torch":
                input_ = policy._lazy_tensor_dict(input_)
            # Get actual out and compare.
            if fw == "tf":
                out = policy.get_session().run(
                    policy._loss,
                    feed_dict=policy._get_loss_inputs_dict(input_,
                                                           shuffle=False),
                )
            else:
                out = (loss_torch if fw == "torch" else loss_tf)(policy,
                                                                 policy.model,
                                                                 None, input_)
            check(out, expected_loss, decimals=1)
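The expected loss above is built by hand: one-hot the taken actions to select Q(s, a), take the max target-network Q-value for the next observation, form the TD error, and average a Huber loss over the batch. A minimal sketch of the Huber (smooth-L1) loss that the `huber_loss` helper is assumed to compute (the test presumably imports RLlib's NumPy helper; the `delta=1.0` default is an assumption here):

import numpy as np

def huber_loss(x, delta=1.0):
    # Quadratic for |x| <= delta, linear beyond: 0.5 * x^2 vs delta * (|x| - 0.5 * delta).
    return np.where(
        np.abs(x) <= delta,
        0.5 * np.square(x),
        delta * (np.abs(x) - 0.5 * delta))

# The test's expected_loss is then just huber_loss(td_error).mean().
print(huber_loss(np.array([0.3, -2.0])))  # -> 0.045 and 1.5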
Example #9
def do_test_log_likelihood(run,
                           config,
                           prev_a=None,
                           continuous=False,
                           layer_key=("fc", (0, 4), ("_hidden_layers.0.",
                                                     "_logits.")),
                           logp_func=None):
    config = config.copy()
    # Run locally.
    config["num_workers"] = 0
    # Env setup.
    if continuous:
        env = "Pendulum-v0"
        obs_batch = preprocessed_obs_batch = np.array([[0.0, 0.1, -0.1]])
    else:
        env = "FrozenLake-v0"
        config["env_config"] = {"is_slippery": False, "map_name": "4x4"}
        obs_batch = np.array([0])
        preprocessed_obs_batch = one_hot(obs_batch, depth=16)

    prev_r = None if prev_a is None else np.array(0.0)

    # Test against all frameworks.
    for fw in framework_iterator(config):
        if run in [sac.SACTrainer] and fw == "tfe":
            continue

        trainer = run(config=config, env=env)

        policy = trainer.get_policy()
        vars = policy.get_weights()
        # Sample n actions, then roughly check their logp against their
        # counts.
        num_actions = 1000 if not continuous else 50
        actions = []
        for _ in range(num_actions):
            # Single action from single obs.
            actions.append(
                trainer.compute_action(obs_batch[0],
                                       prev_action=prev_a,
                                       prev_reward=prev_r,
                                       explore=True))

        # Test all taken actions for their log-likelihoods vs expected values.
        if continuous:
            for idx in range(num_actions):
                a = actions[idx]
                if fw != "torch":
                    if isinstance(vars, list):
                        expected_mean_logstd = fc(
                            fc(obs_batch, vars[layer_key[1][0]]),
                            vars[layer_key[1][1]])
                    else:
                        expected_mean_logstd = fc(
                            fc(
                                obs_batch,
                                vars["default_policy/{}_1/kernel".format(
                                    layer_key[0])]),
                            vars["default_policy/{}_out/kernel".format(
                                layer_key[0])])
                else:
                    expected_mean_logstd = fc(
                        fc(obs_batch,
                           vars["{}_model.0.weight".format(layer_key[2][0])],
                           framework=fw),
                        vars["{}_model.0.weight".format(layer_key[2][1])],
                        framework=fw)
                mean, log_std = np.split(expected_mean_logstd, 2, axis=-1)
                if logp_func is None:
                    expected_logp = np.log(norm.pdf(a, mean, np.exp(log_std)))
                else:
                    expected_logp = logp_func(mean, log_std, a)
                logp = policy.compute_log_likelihoods(
                    np.array([a]),
                    preprocessed_obs_batch,
                    prev_action_batch=np.array([prev_a]),
                    prev_reward_batch=np.array([prev_r]))
                check(logp, expected_logp[0], rtol=0.2)
        # Test all available actions for their logp values.
        else:
            for a in [0, 1, 2, 3]:
                count = actions.count(a)
                expected_prob = count / num_actions
                logp = policy.compute_log_likelihoods(
                    np.array([a]),
                    preprocessed_obs_batch,
                    prev_action_batch=np.array([prev_a]),
                    prev_reward_batch=np.array([prev_r]))
                check(np.exp(logp), expected_prob, atol=0.2)
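For the continuous branch, the expected log-likelihood is simply the log-density of a diagonal Gaussian whose mean and log-std are read off the policy network's output. A small standalone check of that identity (the numbers are made up; only the formula matters):

import numpy as np
from scipy.stats import norm

mean, log_std, a = np.array([0.2]), np.array([-0.5]), np.array([0.35])
# What the test computes via scipy:
expected_logp = np.log(norm.pdf(a, mean, np.exp(log_std)))
# Equivalent closed form (numerically safer than log(pdf) for tiny densities):
closed_form = (-0.5 * ((a - mean) / np.exp(log_std)) ** 2
               - log_std - 0.5 * np.log(2 * np.pi))
assert np.allclose(expected_logp, closed_form)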
Example #10
    def test_simple_q_loss_function(self):
        """Tests the Simple-Q loss function results on all frameworks."""
        config = dqn.SIMPLE_Q_DEFAULT_CONFIG.copy()
        # Run locally.
        config["num_workers"] = 0
        # Use very simple net (layer0=10 nodes, q-layer=2 nodes (2 actions)).
        config["model"]["fcnet_hiddens"] = [10]
        config["model"]["fcnet_activation"] = "linear"

        for fw in framework_iterator(config):
            # Generate Trainer and get its default Policy object.
            trainer = dqn.SimpleQTrainer(config=config, env="CartPole-v0")
            policy = trainer.get_policy()
            # Batch of size=2.
            input_ = {
                SampleBatch.CUR_OBS: np.random.random(size=(2, 4)),
                SampleBatch.ACTIONS: np.array([0, 1]),
                SampleBatch.REWARDS: np.array([0.4, -1.23]),
                SampleBatch.DONES: np.array([False, False]),
                SampleBatch.NEXT_OBS: np.random.random(size=(2, 4))
            }
            # Get model vars for computing expected model outs (q-vals).
            # 0=layer-kernel; 1=layer-bias; 2=q-val-kernel; 3=q-val-bias
            vars = policy.get_weights()
            if isinstance(vars, dict):
                vars = list(vars.values())
            vars_t = policy.target_q_func_vars
            if fw == "tf":
                vars_t = policy.get_session().run(vars_t)

            # Q(s,a) outputs.
            q_t = np.sum(
                one_hot(input_[SampleBatch.ACTIONS], 2) *
                fc(fc(input_[SampleBatch.CUR_OBS],
                      vars[0 if fw != "torch" else 2],
                      vars[1 if fw != "torch" else 3],
                      framework=fw),
                   vars[2 if fw != "torch" else 0],
                   vars[3 if fw != "torch" else 1],
                   framework=fw), 1)
            # max[a'](Qtarget(s',a')) outputs.
            q_target_tp1 = np.max(
                fc(fc(input_[SampleBatch.NEXT_OBS],
                      vars_t[0 if fw != "torch" else 2],
                      vars_t[1 if fw != "torch" else 3],
                      framework=fw),
                   vars_t[2 if fw != "torch" else 0],
                   vars_t[3 if fw != "torch" else 1],
                   framework=fw), 1)
            # TD-errors (Bellman equation).
            td_error = q_t - config["gamma"] * input_[SampleBatch.REWARDS] + \
                q_target_tp1
            # Huber/Square loss on TD-error.
            expected_loss = huber_loss(td_error).mean()

            if fw == "torch":
                input_ = policy._lazy_tensor_dict(input_)
            # Get actual out and compare.
            if fw == "tf":
                out = policy.get_session().run(
                    policy._loss,
                    feed_dict=policy._get_loss_inputs_dict(input_,
                                                           shuffle=False))
            else:
                out = (loss_torch if fw == "torch" else loss_tf)(policy,
                                                                 policy.model,
                                                                 None, input_)
            check(out, expected_loss, decimals=1)
Example #11
def test_log_likelihood(run,
                        config,
                        prev_a=None,
                        continuous=False,
                        layer_key=("fc", (0, 4)),
                        logp_func=None):
    config = config.copy()
    # Run locally.
    config["num_workers"] = 0
    # Env setup.
    if continuous:
        env = "Pendulum-v0"
        obs_batch = preprocessed_obs_batch = np.array([[0.0, 0.1, -0.1]])
    else:
        env = "FrozenLake-v0"
        config["env_config"] = {"is_slippery": False, "map_name": "4x4"}
        obs_batch = np.array([0])
        preprocessed_obs_batch = one_hot(obs_batch, depth=16)

    # Use Soft-Q for DQNs.
    if run is dqn.DQNTrainer:
        config["exploration_config"] = {"type": "SoftQ", "temperature": 0.5}

    prev_r = None if prev_a is None else np.array(0.0)

    # Test against all frameworks.
    for fw in ["tf", "eager", "torch"]:
        if run in [dqn.DQNTrainer, sac.SACTrainer] and fw == "torch":
            continue
        print("Testing {} with framework={}".format(run, fw))
        config["eager"] = True if fw == "eager" else False
        config["use_pytorch"] = True if fw == "torch" else False

        trainer = run(config=config, env=env)
        policy = trainer.get_policy()
        vars = policy.get_weights()
        # Sample n actions, then roughly check their logp against their
        # counts.
        num_actions = 500
        actions = []
        for _ in range(num_actions):
            # Single action from single obs.
            actions.append(
                trainer.compute_action(obs_batch[0],
                                       prev_action=prev_a,
                                       prev_reward=prev_r,
                                       explore=True))

        # Test 50 actions for their log-likelihoods vs expected values.
        if continuous:
            for idx in range(50):
                a = actions[idx]
                if fw == "tf" or fw == "eager":
                    if isinstance(vars, list):
                        expected_mean_logstd = fc(
                            fc(obs_batch, vars[layer_key[1][0]]),
                            vars[layer_key[1][1]])
                    else:
                        expected_mean_logstd = fc(
                            fc(
                                obs_batch,
                                vars["default_policy/{}_1/kernel".format(
                                    layer_key[0])]),
                            vars["default_policy/{}_out/kernel".format(
                                layer_key[0])])
                else:
                    expected_mean_logstd = fc(
                        fc(obs_batch,
                           vars["_hidden_layers.0._model.0.weight"]),
                        vars["_logits._model.0.weight"])
                mean, log_std = np.split(expected_mean_logstd, 2, axis=-1)
                if logp_func is None:
                    expected_logp = np.log(norm.pdf(a, mean, np.exp(log_std)))
                else:
                    expected_logp = logp_func(mean, log_std, a)
                logp = policy.compute_log_likelihoods(
                    np.array([a]),
                    preprocessed_obs_batch,
                    prev_action_batch=np.array([prev_a]),
                    prev_reward_batch=np.array([prev_r]))
                check(logp, expected_logp[0], rtol=0.2)
        # Test all available actions for their logp values.
        else:
            for a in [0, 1, 2, 3]:
                count = actions.count(a)
                expected_logp = np.log(count / num_actions)
                logp = policy.compute_log_likelihoods(
                    np.array([a]),
                    preprocessed_obs_batch,
                    prev_action_batch=np.array([prev_a]),
                    prev_reward_batch=np.array([prev_r]))
                check(logp, expected_logp, rtol=0.3)