Example #1
    def __init__(self, config: Dict[str, any], result_dir: str, cache_stats: CacheInformation):
        super().__init__(config, result_dir, cache_stats)
        # evaluation specific variables
        self.observation_seen = 0
        self.episode_reward = 0
        self.checkpoint_steps = config['checkpoint_steps']

        self._incomplete_experiences = TTLCache(InMemoryStorage())
        self._incomplete_experiences.expired_entry_callback(self._observe_expired_incomplete_experience)

        self.experimental_reward = config.get('experimental_reward', False)
        agent_config = config['agent_config']
        self.converter = CachingStrategyRLConverter()
        # action space: should cache: true or false
        # state space: [capacity (1), query key(1), query result set(num_indexes)]
        fields_in_state = len(CachingAgentSystemState.__slots__)
        self.agent = Agent.from_spec(agent_config,
                                     state_space=FloatBox(shape=(fields_in_state,)),
                                     action_space=IntBox(2))

        self.logger = logging.getLogger(__name__)
        name = 'rl_caching_strategy'
        self.reward_logger = create_file_logger(name=f'{name}_reward_logger', result_dir=self.result_dir)
        self.loss_logger = create_file_logger(name=f'{name}_loss_logger', result_dir=self.result_dir)
        self.observation_logger = create_file_logger(name=f'{name}_observation_logger', result_dir=self.result_dir)
        self.entry_hits_logger = create_file_logger(name=f'{name}_entry_hits_logger', result_dir=self.result_dir)

        self.key_vocab = Vocabulary()
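
The FloatBox shape above is derived from the number of __slots__ on the project-specific CachingAgentSystemState, so each slot becomes one float the agent observes. A rough, hypothetical stand-in (the field names below are invented for illustration; only the slots-to-vector idea mirrors the code above):

import numpy as np

class CachingAgentSystemStateSketch:
    """Hypothetical stand-in for CachingAgentSystemState; the real field names differ."""
    __slots__ = ['cache_utilization', 'key_token', 'ttl', 'hit_count']

    def to_numpy(self):
        # One float per slot -> matches FloatBox(shape=(len(__slots__),)).
        return np.array([getattr(self, s) for s in self.__slots__], dtype=np.float32)

fields_in_state = len(CachingAgentSystemStateSketch.__slots__)   # -> 4 in this sketch
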
Example #2
    def test_embedding_lookup_layer(self):
        # Input space for lookup indices (double indices for picking 2 rows per batch item).
        input_space = IntBox(shape=(2,), add_batch_rank=True)

        embedding = EmbeddingLookup(embed_dim=5, vocab_size=4, initializer_spec=np.array([
            [1.0, 2.0, 3.0, 4.0, 5.0],
            [6.0, 7.0, 8.0, 9.0, 10.0],
            [11.0, 12.0, 13.0, 14.0, 15.0],
            [16.0, 17.0, 18.0, 19.0, 20.0]
        ]))
        test = ComponentTest(component=embedding, input_spaces=dict(ids=input_space))

        # Pull a batch of 3 (2 vocabs each) from the embedding matrix.
        inputs = np.array(
            [[0, 1], [3, 2], [2, 1]]
        )

        expected = np.array([
            [
                [1.0, 2.0, 3.0, 4.0, 5.0],
                [6.0, 7.0, 8.0, 9.0, 10.0]
            ], [
                [16.0, 17.0, 18.0, 19.0, 20.0],
                [11.0, 12.0, 13.0, 14.0, 15.0]
            ], [
                [11.0, 12.0, 13.0, 14.0, 15.0],
                [6.0, 7.0, 8.0, 9.0, 10.0],
            ]
        ])
        test.test(("apply", inputs), expected_outputs=expected, decimals=5)
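
With an explicit initializer matrix, the expected lookup above is plain NumPy row indexing of that matrix; a quick stand-alone check of the same numbers:

import numpy as np

matrix = np.arange(1.0, 21.0).reshape(4, 5)    # the same 4x5 initializer as above
ids = np.array([[0, 1], [3, 2], [2, 1]])
lookup = matrix[ids]                           # shape (3, 2, 5): one 5-dim row per id
assert lookup.shape == (3, 2, 5)
assert np.allclose(lookup[1, 0], [16.0, 17.0, 18.0, 19.0, 20.0])
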
Example #3
    def test_keras_style_one_container_input_space(self):
        # Define one container input Space.
        input_space = Tuple(IntBox(3), FloatBox(shape=(4,)), add_batch_rank=True)

        # One-hot flatten the int tensor.
        flatten_layer_out = ReShape(flatten=True, flatten_categories=True)(input_space[0])
        # Run the float tensor through two dense layers.
        dense_1_out = DenseLayer(units=3, scope="d1")(input_space[1])
        dense_2_out = DenseLayer(units=5, scope="d2")(dense_1_out)
        # Concat everything.
        cat_out = ConcatLayer()(flatten_layer_out, dense_2_out)

        # Use the `outputs` arg so the network can trace the data flow back to the input space.
        # `inputs` is not needed here, as there is only a single input (the Tuple).
        neural_net = NeuralNetwork(outputs=cat_out)

        test = ComponentTest(component=neural_net, input_spaces=dict(inputs=input_space))

        var_dict = neural_net.variable_registry
        w1_value = test.read_variable_values(var_dict["neural-network/d1/dense/kernel"])
        b1_value = test.read_variable_values(var_dict["neural-network/d1/dense/bias"])
        w2_value = test.read_variable_values(var_dict["neural-network/d2/dense/kernel"])
        b2_value = test.read_variable_values(var_dict["neural-network/d2/dense/bias"])

        # Batch of size=n.
        input_ = input_space.sample(4)

        expected = np.concatenate([  # concat everything
            one_hot(input_[0]),  # int flattening
            dense_layer(dense_layer(input_[1], w1_value, b1_value), w2_value, b2_value)  # float -> 2 x dense
        ], axis=-1)
        out = test.test(("call", tuple([input_])), expected_outputs=expected)

        test.terminate()
Example #4
    def test_double_dqn_on_2x2_grid_world_single_action_to_container(self):
        """
        Tests how dqn solves a mapping of a single integer to multiple actions (as opposed to using container
        actions).
        """
        # ftj = forward + turn + jump
        env_spec = dict(world="2x2",
                        action_type="ftj",
                        state_representation="xy+orientation")
        agent_config = config_from_path(
            "configs/dqn_agent_for_2x2_gridworld_single_to_container.json")
        preprocessing_spec = agent_config.pop("preprocessing_spec")

        action_space = IntBox(0, 18)
        agent = DQNAgent.from_spec(agent_config,
                                   huber_loss=True,
                                   double_q=True,
                                   dueling_q=True,
                                   state_space=FloatBox(shape=(4, )),
                                   action_space=action_space,
                                   store_last_q_table=True)

        time_steps = 10000
        worker = SingleThreadedWorker(
            env_spec=lambda: GridWorld.from_spec(env_spec),
            agent=agent,
            preprocessing_spec=preprocessing_spec,
            worker_executes_preprocessing=True,
            render=False)
        results = worker.execute_timesteps(time_steps, use_exploration=True)
        print(results)
Example #5
    def test_multi_input_stream_neural_network_with_dict(self):
        # Space must contain batch dimension (otherwise, NNlayer will complain).
        input_space = Dict(
            a=FloatBox(shape=(3,)),
            b=IntBox(4, shape=()),
            add_batch_rank=True
        )

        multi_input_nn = MultiInputStreamNeuralNetwork(
            input_network_specs=dict(
                a=[],
                b=[{"type": "reshape", "flatten": True, "flatten_categories": True}]
            ),
            post_network_spec=[{"type": "dense", "units": 2}],
        )

        test = ComponentTest(component=multi_input_nn, input_spaces=dict(inputs=input_space))

        # Batch of size=n.
        nn_inputs = input_space.sample(5)

        global_scope = "multi-input-stream-nn/post-concat-nn/dense-layer/dense/"
        # Calculate output manually.
        var_dict = test.read_variable_values()

        b_flat = one_hot(nn_inputs["b"], depth=4)
        concat_out = np.concatenate((nn_inputs["a"], b_flat), axis=-1)
        expected = dense_layer(concat_out, var_dict[global_scope+"kernel"], var_dict[global_scope+"bias"])

        test.test(("call", nn_inputs), expected_outputs=expected)

        test.terminate()
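
Several of these examples verify outputs against one_hot and dense_layer helpers. A rough NumPy sketch of what such helpers compute -- assumed to mirror the library's test utilities, not copied from them:

import numpy as np

def one_hot_sketch(ids, depth):
    # Int tensor -> one-hot float tensor along a new last axis.
    return np.eye(depth, dtype=np.float32)[ids]

def dense_layer_sketch(x, kernel, bias):
    # Plain affine transform (no activation), as used in the manual checks.
    return np.matmul(x, kernel) + bias

print(one_hot_sketch(np.array([1, 3]), depth=4))   # [[0,1,0,0], [0,0,0,1]]
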
Example #6
    def test_memory_compilation(self):
        # Builds a memory and returns build stats.
        env = OpenAIGymEnv("Pong-v0",
                           frameskip=4,
                           max_num_noops=30,
                           episodic_life=True)

        record_space = Dict(states=env.state_space,
                            actions=env.action_space,
                            rewards=float,
                            terminals=BoolBox(),
                            add_batch_rank=True)
        input_spaces = dict(
            # insert: records
            records=record_space,
            # get_records: num_records
            num_records=int,
            # update_records: indices, update
            indices=IntBox(add_batch_rank=True),
            update=FloatBox(add_batch_rank=True))

        input_spaces.pop("num_records")
        memory = MemPrioritizedReplay(capacity=20000, )
        test = ComponentTest(component=memory,
                             input_spaces=input_spaces,
                             auto_build=False)
        return test.build()
Example #7
    def build_input_tokens(self):
        """
        Tokenizes vocabulary used for state representations for input to Q-network by 
        assigning integers to vocab words (query operators, query operands 
        i.e. attributes represented in workload)

        Exposed through self.system_spec and self.states_spec
        """

        self.system_spec["state_dim"] = self.input_sequence_size
        vocab = {}
        vocab_size = 0

        #
        # tokenize
        #

        # special tokens
        pad_token = 'pad'
        vocab[pad_token] = vocab_size
        vocab_size += 1

        ## state = ...
        ## ... query
        # operands
        for col in self.cols:
            vocab[col] = vocab_size
            vocab_size += 1

        # operators
        if self.include_default_operators:
            for op in self.query_ops:
                vocab[op] = vocab_size
                vocab_size += 1

        for op in self.query_selection_ops:
            vocab[op] = vocab_size
            vocab_size += 1

        ## ... + context TODO
        for col in self.cols:
            vocab[col + '_idx'] = vocab_size
            vocab_size += 1

        # delimits / demarcates compound indices
        idx_token = 'idx'
        vocab[idx_token] = vocab_size

        #
        # specific input schema, i.e. a vector of vocabulary tokens in Z^n, n=specified input size, to be embedded in embedding layer
        #
        self.states_spec = IntBox(low=0,
                                  high=vocab_size,
                                  shape=(self.input_sequence_size, ))
        self.system_spec['vocab'] = vocab
        self.system_spec['vocab_size'] = len(vocab)
        self.system_spec['index_token'] = idx_token
        self.system_spec['pad_token'] = pad_token
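
Under the vocabulary built above, each query becomes a fixed-length vector of token ids, padded with the 'pad' token up to input_sequence_size. A small illustration with a made-up vocabulary (the tokens below are hypothetical; only the padding scheme mirrors the code above):

vocab = {'pad': 0, 'col_a': 1, 'col_b': 2, 'SELECT': 3, 'WHERE': 4, '=': 5}
input_sequence_size = 10

query_tokens = ['SELECT', 'col_a', 'WHERE', 'col_b', '=']
ids = [vocab[t] for t in query_tokens]
ids += [vocab['pad']] * (input_sequence_size - len(ids))   # pad to fixed length
print(ids)   # [3, 1, 4, 2, 5, 0, 0, 0, 0, 0]
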
Example #8
    def test_specifiable_server(self):
        action_space = IntBox(2)
        state_space = FloatBox()
        env_spec = dict(type="random_env",
                        state_space=state_space,
                        action_space=action_space,
                        deterministic=True)
        # Create the server, but don't start it yet. This will be done fully automatically by the tf-Session.
        specifiable_server = SpecifiableServer(
            Environment, env_spec, dict(step_flow=[state_space, float, bool]),
            "terminate")

        # ret are ops now in the graph.
        ret1 = specifiable_server.step_flow(action_space.sample())
        ret2 = specifiable_server.step_flow(action_space.sample())

        # Check all 3 outputs of the Env step (next state, reward, terminal).
        self.assertEqual(ret1[0].shape, ())
        self.assertEqual(ret1[0].dtype, convert_dtype("float32"))
        self.assertEqual(ret1[1].shape, ())
        self.assertEqual(ret1[1].dtype, convert_dtype("float32"))
        self.assertEqual(ret1[2].shape, ())
        self.assertEqual(ret1[2].dtype, convert_dtype("bool"))
        self.assertEqual(ret2[0].shape, ())
        self.assertEqual(ret2[0].dtype, convert_dtype("float32"))
        self.assertEqual(ret2[1].shape, ())
        self.assertEqual(ret2[1].dtype, convert_dtype("float32"))
        self.assertEqual(ret2[2].shape, ())
        self.assertEqual(ret2[2].dtype, convert_dtype("bool"))

        # Start the session and run the op, then check its actual values.
        with tf.train.SingularMonitoredSession(
                hooks=[SpecifiableServerHook()]) as sess:
            out1 = sess.run(ret1)
            out2 = sess.run(ret2)

        # next state
        self.assertAlmostEqual(out1[0], 0.7713, places=4)
        self.assertAlmostEqual(out2[0], 0.7488, places=4)
        # reward
        self.assertAlmostEqual(out1[1], 0.0208, places=4)
        self.assertAlmostEqual(out2[1], 0.4985, places=4)
        # terminal
        self.assertTrue(out1[2] is np.bool_(False))
        self.assertTrue(out2[2] is np.bool_(False))
Example #9
    def test_policy_for_discrete_action_space(self):
        # state_space (NN is a simple single fc-layer relu network (2 units), random biases, random weights).
        state_space = FloatBox(shape=(4,), add_batch_rank=True)

        # action_space (5 possible actions).
        action_space = IntBox(5, add_batch_rank=True)

        policy = Policy(network_spec=config_from_path("configs/test_simple_nn.json"), action_space=action_space)
        test = ComponentTest(
            component=policy,
            input_spaces=dict(nn_input=state_space),
            action_space=action_space
        )
        policy_params = test.read_variable_values(policy.variable_registry)

        # Some NN inputs (4 input nodes, batch size=2).
        states = np.array([[-0.08, 0.4, -0.05, -0.55], [13.0, -14.0, 10.0, -16.0]])
        # Raw NN-output.
        expected_nn_output = np.matmul(states, policy_params["policy/test-network/hidden-layer/dense/kernel"])
        test.test(("get_nn_output", states), expected_outputs=dict(output=expected_nn_output), decimals=6)

        # Raw action layer output; Expected shape=(2,5): 2=batch, 5=action categories
        expected_action_layer_output = np.matmul(
            expected_nn_output, policy_params["policy/action-adapter/action-layer/dense/kernel"]
        )
        expected_action_layer_output = np.reshape(expected_action_layer_output, newshape=(2, 5))
        test.test(("get_action_layer_output", states), expected_outputs=dict(output=expected_action_layer_output),
                  decimals=5)

        expected_actions = np.argmax(expected_action_layer_output, axis=-1)
        test.test(("get_action", states), expected_outputs=dict(action=expected_actions, last_internal_states=None))

        # Logits, parameters (probs) and log-probs.
        expected_probabilities_output = softmax(expected_action_layer_output, axis=-1)
        test.test(("get_logits_parameters_log_probs", states, [0, 1, 2]), expected_outputs=dict(
            logits=expected_action_layer_output,
            parameters=expected_probabilities_output,
            log_probs=np.log(expected_probabilities_output)
        ), decimals=5)

        print("Probs: {}".format(expected_probabilities_output))

        # Deterministic sample.
        out = test.test(("get_deterministic_action", states), expected_outputs=None)
        self.assertTrue(out["action"].dtype == np.int32)
        self.assertTrue(out["action"].shape == (2,))

        # Stochastic sample.
        out = test.test(("get_stochastic_action", states), expected_outputs=None)
        self.assertTrue(out["action"].dtype == np.int32)
        self.assertTrue(out["action"].shape == (2,))

        # Distribution's entropy.
        out = test.test(("get_entropy", states), expected_outputs=None)
        self.assertTrue(out["entropy"].dtype == np.float32)
        self.assertTrue(out["entropy"].shape == (2,))
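
The deterministic and stochastic samples at the end of this test both come from the softmax over the action-layer output; a NumPy-only sketch of the two sampling modes (the softmax below is assumed to match the helper used in the checks above):

import numpy as np

def softmax_sketch(x, axis=-1):
    e = np.exp(x - np.max(x, axis=axis, keepdims=True))   # shift for numerical stability
    return e / np.sum(e, axis=axis, keepdims=True)

logits = np.array([[0.1, 2.0, -1.0, 0.5, 0.0]])
probs = softmax_sketch(logits)

deterministic = np.argmax(probs, axis=-1)                       # always picks index 1 here
stochastic = [np.random.choice(len(p), p=p) for p in probs]     # samples per batch item
print(deterministic, stochastic)
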
Example #10
    def test_slice_without_squeeze(self):
        slicer = Slice(squeeze=False)
        input_space = FloatBox(shape=(1, 4, 5), add_batch_rank=True)
        test = ComponentTest(component=slicer,
                             input_spaces=dict(inputs=input_space,
                                               start_index=IntBox(),
                                               end_index=IntBox()))

        # Batch of size=4.
        inputs = input_space.sample(size=4)
        expected = np.asarray(
            [inputs[1]])  # Add the not-squeezed rank back to expected.
        test.test(("slice", [inputs, 1, 2]), expected_outputs=expected)

        expected = inputs[0:2]
        test.test(("slice", [inputs, 0, 2]), expected_outputs=expected)

        expected = np.asarray([inputs[0]])
        test.test(("slice", [inputs, 0, 1]), expected_outputs=expected)
Example #11
    def test_slice_with_squeeze(self):
        slicer = Slice(squeeze=True)
        input_space = FloatBox(shape=(2, 2, 3), add_batch_rank=True, add_time_rank=True, time_major=True)
        test = ComponentTest(component=slicer, input_spaces=dict(
            preprocessing_inputs=input_space,
            start_index=IntBox(),
            end_index=IntBox()
        ))

        # Time-steps=3, Batch=5
        inputs = input_space.sample(size=(3, 5))
        expected = inputs[1]
        test.test(("slice", [inputs, 1, 2]), expected_outputs=expected)

        expected = inputs[0:2]
        test.test(("slice", [inputs, 0, 2]), expected_outputs=expected)

        expected = inputs[0]
        test.test(("slice", [inputs, 0, 1]), expected_outputs=expected)
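
The only difference between the squeeze=True and squeeze=False variants above is whether a length-1 slice keeps its rank; plain NumPy slicing shows the same behaviour:

import numpy as np

x = np.arange(3 * 5 * 2).reshape(3, 5, 2)   # e.g. time-major: 3 time-steps, batch of 5

kept = x[1:2]       # Slice(squeeze=False): rank kept, shape (1, 5, 2)
squeezed = x[1]     # Slice(squeeze=True):  rank dropped, shape (5, 2)
assert kept.shape == (1, 5, 2) and squeezed.shape == (5, 2)
assert np.array_equal(kept[0], squeezed)
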
Example #12
    def test_multi_input_stream_neural_network_with_tuple(self):
        # Space must contain batch dimension (otherwise, NNLayer will complain).
        input_space = Tuple(
            IntBox(3, shape=()),
            FloatBox(shape=(8,)),
            IntBox(4, shape=()),
            add_batch_rank=True
        )

        multi_input_nn = MultiInputStreamNeuralNetwork(
            input_network_specs=(
                [{"type": "reshape", "flatten": True, "flatten_categories": True}],  # intbox -> flatten
                [{"type": "dense", "units": 2}],  # floatbox -> dense
                [{"type": "reshape", "flatten": True, "flatten_categories": True}]  # intbox -> flatten
            ),
            post_network_spec=[{"type": "dense", "units": 3}],
        )

        test = ComponentTest(component=multi_input_nn, input_spaces=dict(inputs=input_space))

        # Batch of size=n.
        nn_inputs = input_space.sample(3)

        global_scope_pre = "multi-input-stream-nn/input-stream-nn-"
        global_scope_post = "multi-input-stream-nn/post-concat-nn/dense-layer/dense/"
        # Calculate output manually.
        var_dict = test.read_variable_values()

        flat_0 = one_hot(nn_inputs[0], depth=3)
        dense_1 = dense_layer(
            nn_inputs[1], var_dict[global_scope_pre+"1/dense-layer/dense/kernel"],
            var_dict[global_scope_pre+"1/dense-layer/dense/bias"]
        )
        flat_2 = one_hot(nn_inputs[2], depth=4)
        concat_out = np.concatenate((flat_0, dense_1, flat_2), axis=-1)
        expected = dense_layer(concat_out, var_dict[global_scope_post+"kernel"], var_dict[global_scope_post+"bias"])

        test.test(("call", tuple([nn_inputs])), expected_outputs=expected)

        test.terminate()
Example #13
    def test_container_actions(self):
        # Test container actions with embedding.

        vocab_size = 100
        embed_dim = 128
        # ID/state space.
        state_space = IntBox(vocab_size, shape=(10, ))
        # Container action space.
        actions_space = {}
        num_outputs = 3
        for i in range(3):
            actions_space['action_{}'.format(i)] = IntBox(low=0,
                                                          high=num_outputs)
        actions_space = Dict(actions_space)

        agent_config = config_from_path("configs/dqfd_container.json")
        agent_config["network_spec"] = [
            dict(type="embedding", embed_dim=embed_dim, vocab_size=vocab_size),
            dict(type="reshape", flatten=True),
            dict(type="dense",
                 units=embed_dim,
                 activation="relu",
                 scope="dense_1")
        ]
        agent = DQFDAgent.from_spec(agent_config,
                                    state_space=state_space,
                                    action_space=actions_space)
        terminals = BoolBox(add_batch_rank=True)
        rewards = FloatBox(add_batch_rank=True)

        agent.observe_demos(
            preprocessed_states=agent.preprocessed_state_space.with_batch_rank(
            ).sample(1),
            actions=actions_space.with_batch_rank().sample(1),
            rewards=rewards.sample(1),
            next_states=agent.preprocessed_state_space.with_batch_rank().
            sample(1),
            terminals=terminals.sample(1),
        )
Example #14
    def get_preprocessed_space(self, space):
        # TODO map of allowed conversions in utils?
        if isinstance(space, IntBox):
            if self.to_dtype == "float" or self.to_dtype == "float32" or self.to_dtype == "np.float"\
                    or self.to_dtype == "tf.float32" or self.to_dtype == "torch.float32":
                return FloatBox(shape=space.shape, low=space.low, high=space.high,
                                add_batch_rank=space.has_batch_rank, add_time_rank=space.has_time_rank)
            elif self.to_dtype == "bool":
                if space.low == 0 and space.high == 1:
                    return BoolBox(shape=space.shape, add_batch_rank=space.has_batch_rank,
                                   add_time_rank=space.has_time_rank)
                else:
                    raise RLGraphError("ERROR: Conversion from IntBox to BoolBox not allowed if low is not 0 and "
                                       "high is not 1.")
        elif isinstance(space, BoolBox):
            if self.to_dtype == "float" or self.to_dtype == "float32" or self.to_dtype == "np.float" \
                 or self.to_dtype == "tf.float32" or self.to_dtype == "torch.float32":
                return FloatBox(shape=space.shape, low=0.0, high=1.0,
                                add_batch_rank=space.has_batch_rank, add_time_rank=space.has_time_rank)
            elif self.to_dtype == "int" or self.to_dtype == "int32" or self.to_dtype  == "np.int32" or \
                    self.to_dtype == "tf.int32" or self.to_dtype == "torch.int32":
                return IntBox(shape=space.shape, low=0, high=1,
                              add_batch_rank=space.has_batch_rank, add_time_rank=space.has_time_rank)
        elif isinstance(space, FloatBox):
            if self.to_dtype == "int" or self.to_dtype == "int32" or self.to_dtype  == "np.int32" or \
                 self.to_dtype == "tf.int32" or self.to_dtype == "torch.int32":
                return IntBox(shape=space.shape, low=space.low, high=space.high,
                              add_batch_rank=space.has_batch_rank, add_time_rank=space.has_time_rank)

        # Wrong conversion.
        else:
            raise RLGraphError("ERROR: Space conversion from: {} to type {} not supported".format(
                space, self.to_dtype
            ))

        # No conversion.
        return space
Example #15
    def test_update_throughput(self):
        env = Environment.from_spec(self.env_spec)
        # TODO comment in for multi gpu
        # config_from_path("configs/multi_gpu_ray_apex_for_pong.json"),
        config = config_from_path("configs/ray_apex_for_pong.json")

        # Adjust to usable GPUs for test system.
        num_gpus = [1]
        for gpu_count in num_gpus:
            config["execution_spec"]["gpu_spec"]["num_gpus"] = gpu_count
            config["execution_spec"]["gpu_spec"]["per_process_gpu_memory_fraction"] = 1.0 / gpu_count

            agent = Agent.from_spec(
                # TODO replace with config from above
                config_from_path("configs/ray_apex_for_pong.json"),
                state_space=env.state_space,
                # Try with "reduced" action space (actually only 3 actions, up, down, no-op)
                action_space=env.action_space
            )

            batch_space = Dict(
                states=agent.preprocessed_state_space,
                actions=env.action_space,
                rewards=FloatBox(),
                next_states=agent.preprocessed_state_space,
                terminals=IntBox(low=0, high=1),
                importance_weights=FloatBox(),
                add_batch_rank=True
            )

            batch_size = 512 * gpu_count
            num_samples = 50
            samples = [batch_space.sample(batch_size) for _ in range(num_samples)]

            times = []
            throughputs = []
            for sample in samples:
                start = time.perf_counter()
                agent.update(sample)
                runtime = time.perf_counter() - start
                times.append(runtime)
                throughputs.append(batch_size / runtime)

            print("Throughput: {} samples / s ({}) for {} GPUs".format(np.mean(throughputs),
                                                                       np.std(throughputs), gpu_count))
Example #16
    def test_random_env(self):
        """
        Tests deterministic functionality of RandomEnv.
        """
        env = RandomEnv(state_space=FloatBox(shape=(2, 2)), action_space=IntBox(2), deterministic=True)

        # Simple test runs with fixed actions.
        s = env.reset()
        recursive_assert_almost_equal(s, np.array([[0.77132064, 0.02075195], [0.63364823, 0.74880388]]))
        s, r, t, _ = env.step(env.action_space.sample())
        recursive_assert_almost_equal(s, np.array([[0.1980629, 0.7605307], [0.1691108, 0.0883398]]))
        s, r, t, _ = env.step(env.action_space.sample())
        recursive_assert_almost_equal(r, np.array(0.7217553))
        s, r, t, _ = env.step(env.action_space.sample())
        self.assertEqual(t, False)
        s, r, t, _ = env.step(env.action_space.sample())
        recursive_assert_almost_equal(s, np.array([[0.4418332, 0.434014], [0.617767 , 0.5131382]]))
        s, r, t, _ = env.step(env.action_space.sample())
Example #17
    def build_output_tokens(self):
        """
        Tokenizes vocabulary used for action representations for output of Q-network

        Exposed through self.system_spec and self.actions_spec

        Recall action representation maps index field (a candidate index field) to a decision
        e.g. 
            suppose allow indices on up to 3 cols, allow indices to be ASC or DESC
            then the action is specified in [0,6], where 0 corresponds to noop, 
            {1,2} correspond to an ASC or DESC index on 1st query attribute, 
            {3,4} correspond to an ASC or DESC index on 2nd query attribute, etc.

            {0:1, 1:0, 2:0} is an action specifying an (ascending) index on the 1st query attribute,
            and noops for the 2 remaining allowed columns of the compound index.

        n.b. actions_spec comes from action branching architectures https://arxiv.org/abs/1711.08946
        TODO dig deeper into that

        """

        noop_idx = 0
        idxs = []
        self.actions_spec = {}

        # not sure whether ASC / DESC can be specified
        # see LIFT paper for this representation in particular
        n_outputs = 1 + self.max_fields_per_index  # 1 + 2 * self.max_fields_per_index
        for i in range(self.max_fields_per_index):
            idxs.append('index_column{}'.format(i))

            self.actions_spec['index_column{}'.format(i)] = IntBox(
                low=0, high=n_outputs)

        # ?
        self.actions_spec = Dict(self.actions_spec, add_batch_rank=True)

        self.system_spec['idxs'] = idxs
        self.system_spec['n_outputs'] = n_outputs
        self.system_spec['noop_idx'] = noop_idx
        self.system_spec['max_fields_per_index'] = self.max_fields_per_index
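
Decoding an action produced under this spec is the inverse mapping: each index_column{i} output either picks a query attribute (1-based, 0 = noop) for the compound index or does nothing. A tiny illustration with made-up attribute names, mirroring the non-combinatorial decoding loop in Example #24 below:

query_attrs = ['user_id', 'created_at', 'status']   # attributes of the current query
action = {'index_column0': 2, 'index_column1': 1, 'index_column2': 0}

index_columns = []
for i in range(len(query_attrs)):
    choice = action['index_column{}'.format(i)]
    if choice != 0:                                  # 0 = noop
        col = query_attrs[choice - 1]                # 1-based attribute pick
        if col not in index_columns:                 # no repeated columns
            index_columns.append(col)

print(index_columns)   # ['created_at', 'user_id']
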
Example #18
 def _prepare_loss_function_test(loss_function):
     test = ComponentTest(
         component=loss_function,
         input_spaces=dict(
             alpha=float,
             log_probs_next_sampled=FloatBox(shape=(1, ),
                                             add_batch_rank=True),
             q_values_next_sampled=Tuple(FloatBox(shape=(1, )),
                                         FloatBox(shape=(1, )),
                                         add_batch_rank=True),
             q_values=Tuple(FloatBox(shape=(1, )),
                            FloatBox(shape=(1, )),
                            add_batch_rank=True),
             log_probs_sampled=FloatBox(shape=(1, ), add_batch_rank=True),
             q_values_sampled=Tuple(FloatBox(shape=(1, )),
                                    FloatBox(shape=(1, )),
                                    add_batch_rank=True),
             rewards=FloatBox(add_batch_rank=True),
             terminals=BoolBox(add_batch_rank=True),
             loss_per_item=FloatBox(add_batch_rank=True)),
         action_space=IntBox(2, shape=(), add_batch_rank=True))
     return test
Example #19
    def __init__(self, config: Dict[str, any], result_dir: str,
                 cache_stats: CacheInformation):
        super().__init__(config, result_dir, cache_stats)
        # evaluation specific variables
        self.observation_seen = 0
        self.episode_reward = 0
        self.checkpoint_steps = config['checkpoint_steps']

        self._incomplete_experiences = TTLCache(InMemoryStorage())
        self._incomplete_experiences.expired_entry_callback(
            self._observe_expired_incomplete_experience)
        self.view_of_the_cache = {}  # type: Dict[str, Dict[str, any]]
        self._end_episode_observation = {
            ObservationType.Invalidate, ObservationType.Miss,
            ObservationType.Expiration
        }

        # TODO refactor into common RL interface for all strategies
        # Agent configuration (can be shared with others)
        agent_config = config['agent_config']
        fields_in_state = len(EvictionAgentSystemState.__slots__)
        self.converter = EvictionStrategyRLConverter(self.result_dir)

        # State: fields to observe in question
        # Action: to evict or not that key
        self.agent = Agent.from_spec(
            agent_config,
            state_space=FloatBox(shape=(fields_in_state, )),
            action_space=IntBox(low=0, high=2))

        self.logger = logging.getLogger(__name__)
        name = 'rl_eviction_strategy'
        self.reward_logger = create_file_logger(name=f'{name}_reward_logger',
                                                result_dir=self.result_dir)
        self.loss_logger = create_file_logger(name=f'{name}_loss_logger',
                                              result_dir=self.result_dir)
        self.observation_logger = create_file_logger(
            name=f'{name}_observation_logger', result_dir=self.result_dir)
        self.key_vocab = Vocabulary()
Example #20
    def test_prioritized_replay(self):
        """
        Tests individual and chunked insert and sampling performance of prioritized replay memory.
        """
        record_space = Dict(states=self.env.state_space,
                            actions=self.env.action_space,
                            reward=float,
                            terminals=BoolBox(),
                            add_batch_rank=True)
        input_spaces = dict(insert_records=record_space,
                            get_records=int,
                            update_records=[
                                IntBox(shape=(), add_batch_rank=True),
                                FloatBox(shape=(), add_batch_rank=True)
                            ])

        memory = PrioritizedReplay(capacity=self.capacity,
                                   next_states=True,
                                   alpha=self.alpha,
                                   beta=self.beta)
        test = ComponentTest(component=memory,
                             input_spaces=input_spaces,
                             enable_profiler=self.enable_profiler)

        records = [record_space.sample(size=1) for _ in range(self.inserts)]
        start = time.monotonic()
        for record in records:
            test.test(("insert_records", record), expected_outputs=None)
        end = time.monotonic() - start

        tp = len(records) / end
        print('#### Testing Prioritized Replay memory ####')
        print('Testing insert performance:')
        print(
            'Inserted {} separate records, throughput: {} records/s, total time: {} s'
            .format(len(records), tp, end))

        record_chunks = [
            record_space.sample(size=self.chunk_size)
            for _ in range(self.inserts)
        ]
        start = time.monotonic()
        for chunk in record_chunks:
            test.test(("insert_records", chunk), expected_outputs=None)
        end = time.monotonic() - start

        tp = len(record_chunks) * self.chunk_size / end
        print(
            'Inserted {} record chunks of size {}, throughput: {} records/s, total time: {} s'
            .format(len(record_chunks), self.chunk_size, tp, end))

        print('Testing sample performance:')
        start = time.monotonic()
        for _ in range(self.samples):
            test.test(("get_records", self.sample_batch_size),
                      expected_outputs=None)
        end = time.monotonic() - start
        tp = self.samples / end

        print(
            'Sampled {} batches of size {}, throughput: {} sample-ops/s, total time: {} s'
            .format(self.samples, self.sample_batch_size, tp, end))
Example #21
    def test_custom_margin_demos_with_container_actions(self):
        # Tests if using different margins per sample works.
        # Same state, but different actions whose margins encourage/discourage them.
        vocab_size = 100
        embed_dim = 8
        # ID/state space.
        state_space = IntBox(vocab_size, shape=(10,))
        # Container action space.
        actions_space = {}
        num_outputs = 3
        for i in range(3):
            actions_space['action_{}'.format(i)] = IntBox(
                low=0,
                high=num_outputs
            )
        actions_space = Dict(actions_space)

        agent_config = config_from_path("configs/dqfd_container.json")
        agent_config["network_spec"] = [
            dict(type="embedding", embed_dim=embed_dim, vocab_size=vocab_size),
            dict(type="reshape", flatten=True),
            dict(type="dense", units=embed_dim, activation="relu", scope="dense_1")
        ]
        agent = DQFDAgent.from_spec(
            agent_config,
            state_space=state_space,
            action_space=actions_space
        )
        terminals = BoolBox(add_batch_rank=True)
        rewards = FloatBox(add_batch_rank=True)

        # Create a set of demos.
        demo_states = agent.preprocessed_state_space.with_batch_rank().sample(2)
        # Same state.
        demo_states[1] = demo_states[0]
        demo_actions = actions_space.with_batch_rank().sample(2)

        for name, action in actions_space.items():
            demo_actions[name][0] = 0
            demo_actions[name][1] = 1

        demo_rewards = rewards.sample(2, fill_value=.0)
        # Both demo rewards are zero; the margins below carry the encourage/discourage signal.
        demo_rewards[0] = 0
        demo_rewards[1] = 0

        # One action is encouraged, one is discouraged.
        margins = np.asarray([0.5, -0.5])

        demo_next_states = agent.preprocessed_state_space.with_batch_rank().sample(2)
        demo_terminals = terminals.sample(2, fill_value=False)

        # When using margins, need to use external batch.
        batch = dict(
            states=demo_states,
            actions=demo_actions,
            rewards=demo_rewards,
            next_states=demo_next_states,
            importance_weights=np.ones_like(demo_rewards),
            terminals=demo_terminals,
        )
        # Fit demos with custom margins.
        for _ in range(10000):
            agent.update(batch=batch, update_from_demos=False, apply_demo_loss_to_batch=True, expert_margins=margins)

        # Evaluate the demo state -> should prefer the action with the positive margin.
        agent_actions = agent.get_action(np.array([demo_states[0]]), apply_preprocessing=False, use_exploration=False)
        print("learned action = ", agent_actions)
Example #22
    def __init__(self, file_name=None, worker_id=0, base_port=5005, seed=0, docker_training=False, no_graphics=False,
                 timeout_wait=30, train_mode=True, **kwargs):
        """
        Args:
            file_name (Optional[str]): Name of Unity environment binary.
            base_port (int): Port number to connect to Unity environment. `worker_id` increments on top of this.
            worker_id (int): Number to add to `base_port`. Used for asynchronous agent scenarios.
            docker_training (bool): Informs this class, whether the process is being run within a container.
                Default: False.
            no_graphics (bool): Whether to run the Unity simulator in no-graphics mode. Default: False.
            timeout_wait (int): Time (in seconds) to wait for connection from environment.
            train_mode (bool): Whether to run in training mode, speeding up the simulation. Default: True.
        """
        # First create the UnityMLAgentsEnvironment to get state and action spaces, then create RLgraph Environment
        # instance.
        self.mlagents_env = UnityEnvironment(
            file_name, worker_id, base_port, seed, docker_training, no_graphics
        )
        all_brain_info = self.mlagents_env.reset()
        # Get all possible information from AllBrainInfo.
        # TODO: Which scene do we pick?
        self.scene_key = next(iter(all_brain_info))
        first_brain_info = all_brain_info[self.scene_key]
        num_environments = len(first_brain_info.agents)

        state_space = {}
        if len(first_brain_info.vector_observations[0]) > 0:
            state_space["vector"] = get_space_from_op(first_brain_info.vector_observations[0])
            # TODO: This is a hack.
            if state_space["vector"].dtype == np.float64:
                state_space["vector"].dtype = np.float32
        if len(first_brain_info.visual_observations) > 0:
            state_space["visual"] = get_space_from_op(first_brain_info.visual_observations[0])
        if first_brain_info.text_observations[0]:
            state_space["text"] = get_space_from_op(first_brain_info.text_observations[0])

        if len(state_space) == 1:
            self.state_key = next(iter(state_space))
            state_space = state_space[self.state_key]
        else:
            self.state_key = None
            state_space = Dict(state_space)
        brain_params = next(iter(self.mlagents_env.brains.values()))
        if brain_params.vector_action_space_type == "discrete":
            highs = brain_params.vector_action_space_size
            # MultiDiscrete (Tuple(IntBox)).
            if any(h != highs[0] for h in highs):
                action_space = Tuple([IntBox(h) for h in highs])
            # Normal IntBox:
            else:
                action_space = IntBox(
                    low=np.zeros_like(highs, dtype=np.int32),
                    high=np.array(highs, dtype=np.int32),
                    shape=(len(highs),)
                )
        else:
            action_space = get_space_from_op(first_brain_info.action_masks[0])
        if action_space.dtype == np.float64:
            action_space.dtype = np.float32

        super(MLAgentsEnv, self).__init__(
            num_environments=num_environments, state_space=state_space, action_space=action_space, **kwargs
        )

        # Caches the last observation we made (after stepping or resetting).
        self.last_state = None
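
The discrete-action branch above reduces to a simple rule: equal branch sizes collapse into one vector-shaped IntBox, unequal sizes become a Tuple of IntBoxes. A short sketch of the two cases (assuming the spaces can be imported from rlgraph.spaces; the import path is an assumption):

import numpy as np
from rlgraph.spaces import IntBox, Tuple

# Unequal branch sizes -> MultiDiscrete as a Tuple of IntBoxes.
highs = [2, 3]
multi_discrete_space = Tuple([IntBox(h) for h in highs])

# Equal branch sizes -> a single IntBox with vector-shaped bounds.
highs = [3, 3, 3]
vector_int_space = IntBox(low=np.zeros_like(highs, dtype=np.int32),
                          high=np.array(highs, dtype=np.int32),
                          shape=(len(highs),))
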
Example #23
    def test_keras_style_complex_multi_stream_nn(self):
        # 3 inputs.
        input_spaces = [
            Dict({
                "img": FloatBox(shape=(6, 6, 3)),
                "int": IntBox(3)
            }, add_batch_rank=True, add_time_rank=True),
            FloatBox(shape=(2,), add_batch_rank=True),
            Tuple(IntBox(2), TextBox(), add_batch_rank=True, add_time_rank=True)
        ]

        # Same NN as in the test above, only using some of the sub-Spaces from the input spaces.
        # Tests whether this NN can automatically add the correct splitters.
        folded_text = ReShape(fold_time_rank=True)(input_spaces[2][1])
        # String layer will create batched AND time-ranked (individual words) hash outputs (int64).
        string_bucket_out, lengths = StringToHashBucket(num_hash_buckets=5)(folded_text)
        # Batched and time-ranked embedding output (floats) with embed dim=n.
        embedding_out = EmbeddingLookup(embed_dim=10, vocab_size=5)(string_bucket_out)
        # Pass embeddings through a text LSTM and use last output (reduce time-rank).
        string_lstm_out, _ = LSTMLayer(units=2, return_sequences=False, scope="lstm-layer-txt")(
            embedding_out, sequence_length=lengths
        )
        # Unfold to get original time-rank back.
        string_lstm_out_unfolded = ReShape(unfold_time_rank=True)(string_lstm_out, input_spaces[2][1])

        # Parallel image stream via 1 CNN layer plus dense.
        folded_img = ReShape(fold_time_rank=True, scope="img-fold")(input_spaces[0]["img"])
        cnn_out = Conv2DLayer(filters=1, kernel_size=2, strides=2)(folded_img)
        unfolded_cnn_out = ReShape(unfold_time_rank=True, scope="img-unfold")(cnn_out, input_spaces[0]["img"])
        unfolded_cnn_out_flattened = ReShape(flatten=True, scope="img-flat")(unfolded_cnn_out)
        dense_out = DenseLayer(units=2, scope="dense-0")(unfolded_cnn_out_flattened)

        # Concat everything.
        concat_out = ConcatLayer()(string_lstm_out_unfolded, dense_out)

        # LSTM output has batch+time.
        main_lstm_out, internal_states = LSTMLayer(units=2, scope="lstm-layer-main")(concat_out)

        dense1_after_lstm_out = DenseLayer(units=3, scope="dense-1")(main_lstm_out)
        dense2_after_lstm_out = DenseLayer(units=2, scope="dense-2")(dense1_after_lstm_out)
        dense3_after_lstm_out = DenseLayer(units=1, scope="dense-3")(dense2_after_lstm_out)

        # A NN with 3 outputs.
        neural_net = NeuralNetwork(inputs=input_spaces, outputs=[dense3_after_lstm_out, main_lstm_out, internal_states])

        test = ComponentTest(component=neural_net, input_spaces=dict(inputs=input_spaces))

        # Batch of size=n.
        sample_shape = (4, 2)
        input_ = [input_spaces[0].sample(sample_shape), input_spaces[1].sample(sample_shape[0]),
                  input_spaces[2].sample(sample_shape)]

        out = test.test(("call", tuple(input_)), expected_outputs=None)
        # Main output (Dense out after LSTM).
        self.assertTrue(out[0].shape == sample_shape + (1,))  # 1=1 unit in dense layer
        self.assertTrue(out[0].dtype == np.float32)
        # main-LSTM out.
        self.assertTrue(out[1].shape == sample_shape + (2,))  # 2=2 LSTM units
        self.assertTrue(out[1].dtype == np.float32)
        # main-LSTM internal-states.
        self.assertTrue(out[2][0].shape == sample_shape[:1] + (2,))  # 2=2 LSTM units
        self.assertTrue(out[2][0].dtype == np.float32)
        self.assertTrue(out[2][1].shape == sample_shape[:1] + (2,))  # 2=2 LSTM units
        self.assertTrue(out[2][1].dtype == np.float32)

        test.terminate()
Example #24
def run_dqn(exp, steps=25000, combinatorial=False):
    
    #
    # can't account for all configurations, but be sure agent is of a reasonably small size 
    # 

    vocab_size = 6
    state_size = 6

    #
    # queries, rewards for actions per query
    # 
    dqn_queries, _, actions = data(exp)
    repr_builder = RepresentationBuilder()
    get_query, get_reward = repr_builder.build_dqn(dqn_queries, actions, K=state_size, prob=0.67)

    
    #
    # agent
    #
    import json # config is a bit big to copy
    with open('/Users/jeremywelborn/rlautoindex/conf/dqn.json', 'r') as f:
        config = json.load(f)
    agent_config = config['agent']    

    # any further adjustments?
    agent_config['memory_spec']['type']='replay' 
    agent_config['exploration_spec']['epsilon_spec']['decay_spec']['num_timesteps'] = int(steps * .75)

    agent_config['network_spec'][0]['embed_dim'] = 64 # reduce capacity
    agent_config['network_spec'][2]['units'] = 64
    agent_config['network_spec'][0]['vocab_size'] = vocab_size


    # replicate representations defined in Schema
    state_spec = IntBox(low=0, high=vocab_size, shape=(state_size,))

    if not combinatorial:
        n_outputs = 1+3
        action_spec = {}
        for i in range(3):
            action_spec['candidate_index_column{}'.format(i)] = IntBox(low=0, high=n_outputs)
        action_spec = Dict(action_spec, add_batch_rank=True)
    else:
        perm_idx_2_perm = []
        for r in range(3+1): 
            perm_idx_2_perm.extend(itertools.permutations(range(3),r=r))
        perm_idx_2_perm = list(map(list, perm_idx_2_perm)) # [[], [0], [1], [2], [0, 1], [0, 2], [1, 0], [1, 2], [2, 0], [2, 1], [0, 1, 2], [0, 2, 1], [1, 0, 2], [1, 2, 0], [2, 0, 1], [2, 1, 0]]

        # action is a scalar corresponding to a particular permutation of query attributes
        action_spec = IntBox(low=0, high=len(perm_idx_2_perm))


    task_graph = TaskGraph()
    task = Task(agent_config, state_space=state_spec, action_space=action_spec)
    task_graph.add_task(task)
    task_graph.get_task("").unwrap().timesteps = 0
    controller = SystemController(None, None) # have to have for updates...
    controller.task_graph = task_graph
    controller.set_update_schedule(agent_config["update_spec"])

    print("params: {}".format(task.agent.graph_builder.num_trainable_parameters)) # TODO yikes

    #
    # train agent
    #
    step = 0; steps = steps 
    record = []
    running_avg_reward = deque(maxlen=1000)
    start = time.time()
    while step < steps:
        step += 1

        if step != 0 and step % 1000 == 0:
            print('running avg reward after {}/{} steps is {}'.format(step, steps, np.mean(running_avg_reward)))
            record.append((step, np.mean(running_avg_reward), time.time() - start))

        query_idx, query = get_query()
        
        agent_action = task_graph.act_task("", query, apply_preprocessing=True)
        
        # replicate representation conversions defined in Converter
        # hack - same as how query_cols are stored with query in actual training loop
        attr_tokens = [foo_token, bar_token, baz_token]
        n_attrs = len([attr_token for attr_token in query[:3] if attr_token in attr_tokens]) # count tokens that are column tokens
        
        if not combinatorial:
            action = []
            for key in ['candidate_index_column{}'.format(i) for i in range(3)]:
                action_val = agent_action[key][0]
                if action_val != 0: # if is not noop
                    if n_attrs > action_val - 1: # if is a valid action
                        col = query[:n_attrs][action_val - 1]
                        if col not in action:
                            action.append(col)

        else:
            action = []
            perm_idx = agent_action 
            perm = perm_idx_2_perm[perm_idx]
            
            if len(perm) == n_attrs: # ignore case like query==[foo], permutation of query==[1,2]
                for query_attr_idx in perm:
                    if n_attrs > query_attr_idx: # ignore case like query==[foo], permutation of query==[1] b/c there is only 0th attribute, not 1st attribute
                        col = query[:n_attrs][query_attr_idx]
                        # if col not in action: # no repeats in this representation
                        action.append(col)

        reward = get_reward(query_idx, action)        
        running_avg_reward.append(reward)

        # TODO what to do with s_t+1???
        task_graph.observe_task("", query, agent_action, [], reward, query, False)
        controller.update_if_necessary()

    return record
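
In the combinatorial branch, the scalar action indexes into the list of all permutations of up to 3 query-attribute positions, and decoding becomes a table lookup plus a length check. A compact stand-alone sketch of that mapping (the query attributes are made up):

import itertools

# All permutations of up to 3 attribute positions (0-based), 16 entries in total.
perm_idx_2_perm = []
for r in range(3 + 1):
    perm_idx_2_perm.extend(itertools.permutations(range(3), r=r))
perm_idx_2_perm = list(map(list, perm_idx_2_perm))

query_attrs = ['col_a', 'col_b']       # a hypothetical 2-attribute query
n_attrs = len(query_attrs)

perm_idx = 6                           # pretend the agent emitted this scalar action
perm = perm_idx_2_perm[perm_idx]       # -> [1, 0]
action = []
if len(perm) == n_attrs:               # only decode permutations of matching length
    for query_attr_idx in perm:
        if n_attrs > query_attr_idx:
            action.append(query_attrs[query_attr_idx])

print(perm, action)                    # [1, 0] ['col_b', 'col_a']
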
Example #25
class TestPythonPrioritizedReplay(unittest.TestCase):
    """
    Tests sampling and insertion behaviour of the mem_prioritized_replay module.
    """
    record_space = Dict(states=dict(state1=float, state2=float),
                        actions=dict(action1=float),
                        reward=float,
                        terminals=BoolBox(),
                        add_batch_rank=True)
    apex_space = Dict(states=FloatBox(shape=(4, )),
                      actions=FloatBox(shape=(2, )),
                      reward=float,
                      terminals=BoolBox(),
                      weights=FloatBox(),
                      add_batch_rank=True)

    memory_variables = ["size", "index", "max-priority"]

    capacity = 10
    alpha = 1.0
    beta = 1.0

    max_priority = 1.0

    input_spaces = dict(
        # insert: records
        records=record_space,
        # get_records: num_records
        num_records=int,
        # update_records: indices, update
        indices=IntBox(add_batch_rank=True),
        update=FloatBox(add_batch_rank=True))

    # TODO These methods are all graph fns now -> unify backend tests.
    def test_insert(self):
        """
        Simply tests insert op without checking internal logic.
        """
        memory = MemPrioritizedReplay(capacity=self.capacity,
                                      next_states=True,
                                      alpha=self.alpha,
                                      beta=self.beta)
        memory.create_variables(self.input_spaces)

        observation = memory.record_space_flat.sample(size=1)
        memory.insert_records(observation)

        # Test chunked insert
        observation = memory.record_space_flat.sample(size=5)
        memory.insert_records(observation)

        # Also test Apex version
        memory = ApexMemory(capacity=self.capacity,
                            alpha=self.alpha,
                            beta=self.beta)
        observation = self.apex_space.sample(size=5)
        for i in range_(5):
            memory.insert_records(
                (observation['states'][i], observation['actions'][i],
                 observation['reward'][i], observation['terminals'][i],
                 observation['states'][i], observation["weights"][i]))

    def test_update_records(self):
        """
        Tests update records logic.
        """
        memory = MemPrioritizedReplay(capacity=self.capacity, next_states=True)
        memory.create_variables(self.input_spaces)

        # Insert a few Elements.
        observation = memory.record_space_flat.sample(size=2)
        memory.insert_records(observation)

        # Fetch elements and their indices.
        num_records = 2
        batch = memory.get_records(num_records)
        indices = batch[1]
        self.assertEqual(num_records, len(indices))

        # Does not return anything.
        memory.update_records(indices, np.asarray([0.1, 0.2]))

        # Test apex memory.
        memory = ApexMemory(capacity=self.capacity,
                            alpha=self.alpha,
                            beta=self.beta)
        observation = self.apex_space.sample(size=5)
        for i in range_(5):
            memory.insert_records(
                (ray_compress(observation["states"][i]),
                 observation["actions"][i], observation["reward"][i],
                 observation["terminals"][i], observation["weights"][i]))

        # Fetch elements and their indices.
        num_records = 5
        batch = memory.get_records(num_records)
        indices = batch[1]
        self.assertEqual(num_records, len(indices))

        # Does not return anything
        memory.update_records(indices, np.random.uniform(size=10))

    def test_segment_tree_insert_values(self):
        """
        Tests if segment tree inserts into correct positions.
        """
        memory = MemPrioritizedReplay(capacity=self.capacity,
                                      next_states=True,
                                      alpha=self.alpha,
                                      beta=self.beta)
        memory.create_variables(self.input_spaces)

        priority_capacity = 1
        while priority_capacity < self.capacity:
            priority_capacity *= 2

        sum_segment_values = memory.merged_segment_tree.sum_segment_tree.values
        min_segment_values = memory.merged_segment_tree.min_segment_tree.values

        self.assertEqual(sum(sum_segment_values), 0)
        self.assertEqual(sum(min_segment_values), float('inf'))
        self.assertEqual(len(sum_segment_values), 2 * priority_capacity)
        self.assertEqual(len(min_segment_values), 2 * priority_capacity)

        # Insert 1 Element.
        observation = memory.record_space_flat.sample(size=1)
        memory.insert_records(observation)

        # Check insert positions
        # Initial insert is at priority capacity
        print(sum_segment_values)
        print(min_segment_values)
        start = priority_capacity

        while start >= 1:
            self.assertEqual(sum_segment_values[start], 1.0)
            self.assertEqual(min_segment_values[start], 1.0)
            start = int(start / 2)

        # Insert another Element.
        observation = memory.record_space_flat.sample(size=1)
        memory.insert_records(observation)

        # Index shifted 1
        start = priority_capacity + 1
        self.assertEqual(sum_segment_values[start], 1.0)
        self.assertEqual(min_segment_values[start], 1.0)
        start = int(start / 2)
        while start >= 1:
            # 1 + 1 is 2 on the segment.
            self.assertEqual(sum_segment_values[start], 2.0)
            # min is still 1.
            self.assertEqual(min_segment_values[start], 1.0)
            start = int(start / 2)

    def test_tree_insert(self):
        """
        Tests inserting into the segment tree and querying segments.
        """
        memory = ApexMemory(capacity=4)
        tree = memory.merged_segment_tree.sum_segment_tree
        tree.insert(2, 1.0)
        tree.insert(3, 3.0)
        assert np.isclose(tree.get_sum(), 4.0)
        assert np.isclose(tree.get_sum(0, 2), 0.0)
        assert np.isclose(tree.get_sum(0, 3), 1.0)
        assert np.isclose(tree.get_sum(2, 3), 1.0)
        assert np.isclose(tree.get_sum(2, -1), 1.0)
        assert np.isclose(tree.get_sum(2, 4), 4.0)

    def test_prefixsum_idx(self):
        """
        Tests fetching the index corresponding to a prefix sum.
        """
        memory = ApexMemory(capacity=4)
        tree = memory.merged_segment_tree.sum_segment_tree
        tree.insert(2, 1.0)
        tree.insert(3, 3.0)

        self.assertEqual(tree.index_of_prefixsum(0.0), 2)
        self.assertEqual(tree.index_of_prefixsum(0.5), 2)
        self.assertEqual(tree.index_of_prefixsum(0.99), 2)
        self.assertEqual(tree.index_of_prefixsum(1.01), 3)
        self.assertEqual(tree.index_of_prefixsum(3.0), 3)
        self.assertEqual(tree.index_of_prefixsum(4.0), 3)

        memory = ApexMemory(capacity=4)
        tree = memory.merged_segment_tree.sum_segment_tree
        tree.insert(0, 0.5)
        tree.insert(1, 1.0)
        tree.insert(2, 1.0)
        tree.insert(3, 3.0)
        self.assertEqual(tree.index_of_prefixsum(0.0), 0)
        self.assertEqual(tree.index_of_prefixsum(0.55), 1)
        self.assertEqual(tree.index_of_prefixsum(0.99), 1)
        self.assertEqual(tree.index_of_prefixsum(1.51), 2)
        self.assertEqual(tree.index_of_prefixsum(3.0), 3)
        self.assertEqual(tree.index_of_prefixsum(5.50), 3)
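
The segment-tree assertions above (range sums and index_of_prefixsum) can be reproduced with a very small pure-Python sum tree; the sketch below only illustrates the idea and is not the library's actual MergedSegmentTree:

class SumSegmentTreeSketch:
    """Minimal sum tree for illustration (capacity must be a power of two)."""
    def __init__(self, capacity):
        self.capacity = capacity
        self.values = [0.0] * (2 * capacity)   # node 1 is the root, leaves start at `capacity`

    def insert(self, index, value):
        i = self.capacity + index
        self.values[i] = value
        i //= 2
        while i >= 1:                          # propagate the new sum up to the root
            self.values[i] = self.values[2 * i] + self.values[2 * i + 1]
            i //= 2

    def get_sum(self, start=0, end=None):
        if end is None:
            end = self.capacity
        if end < 0:
            end += self.capacity
        return sum(self.values[self.capacity + start:self.capacity + end])

    def index_of_prefixsum(self, prefix):
        node = 1
        while node < self.capacity:            # descend: go left while the left sum covers `prefix`
            if self.values[2 * node] > prefix:
                node = 2 * node
            else:
                prefix -= self.values[2 * node]
                node = 2 * node + 1
        return node - self.capacity

tree = SumSegmentTreeSketch(capacity=4)
tree.insert(2, 1.0)
tree.insert(3, 3.0)
assert tree.get_sum() == 4.0 and tree.get_sum(0, 3) == 1.0
assert tree.index_of_prefixsum(0.99) == 2 and tree.index_of_prefixsum(1.01) == 3
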
Example #26
class TestEnvironmentStepper(unittest.TestCase):
    """
    Tests for the EnvironmentStepper Component using a simple RandomEnv.
    """
    deterministic_env_state_space = FloatBox(shape=(1, ))
    deterministic_env_action_space = IntBox(2)
    deterministic_action_probs_space = FloatBox(shape=(2, ),
                                                add_batch_rank=True)

    internal_states_space = Tuple(FloatBox(shape=(256, )),
                                  FloatBox(shape=(256, )),
                                  add_batch_rank=True)
    internal_states_space_test_lstm = Tuple(FloatBox(shape=(3, )),
                                            FloatBox(shape=(3, )),
                                            add_batch_rank=True)

    action_probs_space = FloatBox(shape=(4, ), add_batch_rank=True)

    time_steps = 500

    def test_environment_stepper_on_deterministic_env(self):
        preprocessor_spec = None
        network_spec = config_from_path("configs/test_simple_nn.json")
        exploration_spec = None
        actor_component = ActorComponent(
            preprocessor_spec,
            dict(network_spec=network_spec,
                 action_space=self.deterministic_env_action_space),
            exploration_spec)
        environment_stepper = EnvironmentStepper(
            environment_spec=dict(type="deterministic_env",
                                  steps_to_terminal=5),
            actor_component_spec=actor_component,
            state_space=self.deterministic_env_state_space,
            reward_space="float32",
            num_steps=3)

        test = ComponentTest(
            component=environment_stepper,
            action_space=self.deterministic_env_action_space,
        )

        # Reset the stepper.
        test.test("reset")

        # Step 3 times through the Env and collect results.
        expected = (
            None,
            (
                np.array([True, False, False, False]),  # t_
                np.array([[0.0], [1.0], [2.0], [3.0]]),  # s' (raw)
            ))
        test.test("step", expected_outputs=expected)

        # Step again; check whether the stitching of states etc. works.
        expected = (
            None,
            (
                np.array([False, False, True, False]),  # t_
                np.array([[3.0], [4.0], [0.0], [1.0]]),  # s' (raw)
            ))
        test.test("step", expected_outputs=expected)

        # Make sure we close the session (to shut down the Env on the server).
        test.terminate()
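
    # The `dense_layer` and `softmax` helpers used in the expected-value computations below are
    # assumed to behave like these minimal numpy sketches (the actual test utilities may differ,
    # e.g. in activation handling):
    #   def dense_layer(x, weights, biases):
    #       return np.matmul(x, weights) + biases
    #   def softmax(x):
    #       e = np.exp(x - np.max(x, axis=-1, keepdims=True))
    #       return e / e.sum(axis=-1, keepdims=True)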

    def test_environment_stepper_on_deterministic_env_with_returning_action_probs(
            self):
        preprocessor_spec = [dict(type="divide", divisor=2)]
        network_spec = config_from_path("configs/test_simple_nn.json")
        exploration_spec = None
        actor_component = ActorComponent(
            preprocessor_spec,
            dict(network_spec=network_spec,
                 action_space=self.deterministic_env_action_space),
            exploration_spec)
        environment_stepper = EnvironmentStepper(
            environment_spec=dict(type="deterministic_env",
                                  steps_to_terminal=6),
            actor_component_spec=actor_component,
            state_space=self.deterministic_env_state_space,
            reward_space="float32",
            add_action_probs=True,
            action_probs_space=self.deterministic_action_probs_space,
            num_steps=3)

        test = ComponentTest(
            component=environment_stepper,
            action_space=self.deterministic_env_action_space,
        )

        weights = test.read_variable_values(
            environment_stepper.actor_component.policy.variables)
        weights_hid = weights[
            "environment-stepper/actor-component/policy/test-network/hidden-layer/dense/kernel"]
        biases_hid = weights[
            "environment-stepper/actor-component/policy/test-network/hidden-layer/dense/bias"]
        weights_action = weights[
            "environment-stepper/actor-component/policy/action-adapter/action-layer/dense/kernel"]
        biases_action = weights[
            "environment-stepper/actor-component/policy/action-adapter/action-layer/dense/bias"]

        # Reset the stepper.
        test.test("reset")

        # Step 3 times through the Env and collect results.
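        # With the divide-by-2 preprocessor, raw states 0, 1, 2 reach the network as 0.0, 0.5, 1.0
        # (hence the inputs to `dense_layer` in the expected action probs below).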
        expected = (
            None,
            (
                # t_
                np.array([True, False, False, False]),
                # s' (raw)
                np.array([[0.0], [1.0], [2.0], [3.0]]),
                # action probs
                np.array([
                    [0.0, 0.0],  # <- init (no input gets sent through NN).
                    softmax(
                        dense_layer(
                            dense_layer(np.array([0.0]), weights_hid,
                                        biases_hid), weights_action,
                            biases_action)),
                    softmax(
                        dense_layer(
                            dense_layer(np.array([0.5]), weights_hid,
                                        biases_hid), weights_action,
                            biases_action)),
                    softmax(
                        dense_layer(
                            dense_layer(np.array([1.0]), weights_hid,
                                        biases_hid), weights_action,
                            biases_action))
                ])))
        test.test("step", expected_outputs=expected, decimals=3)

        # Step again; check whether the stitching of states etc. works.
        expected = (
            None,
            (
                np.array([False, False, False, True]),
                np.array([[3.0], [4.0], [5.0], [0.0]]),  # s' (raw)
                np.array([
                    [0.0, 0.0],  # <- init (no input gets sent through NN).
                    softmax(
                        dense_layer(
                            dense_layer(np.array([1.5]), weights_hid,
                                        biases_hid), weights_action,
                            biases_action)),
                    softmax(
                        dense_layer(
                            dense_layer(np.array([2.0]), weights_hid,
                                        biases_hid), weights_action,
                            biases_action)),
                    softmax(
                        dense_layer(
                            dense_layer(np.array([2.5]), weights_hid,
                                        biases_hid), weights_action,
                            biases_action))
                ])))
        test.test("step", expected_outputs=expected, decimals=3)

        # Make sure we close the session (to shut down the Env on the server).
        test.terminate()

    def test_environment_stepper_on_deterministic_env_with_action_probs_lstm(
            self):
        internal_states_space = Tuple(FloatBox(shape=(3, )),
                                      FloatBox(shape=(3, )))
        preprocessor_spec = [dict(type="multiply", factor=0.1)]
        network_spec = config_from_path("configs/test_lstm_nn.json")
        exploration_spec = None
        actor_component = ActorComponent(
            preprocessor_spec,
            dict(network_spec=network_spec,
                 action_space=self.deterministic_env_action_space),
            exploration_spec)
        environment_stepper = EnvironmentStepper(
            environment_spec=dict(type="deterministic_env",
                                  steps_to_terminal=3),
            actor_component_spec=actor_component,
            state_space=self.deterministic_env_state_space,
            reward_space="float32",
            internal_states_space=internal_states_space,
            add_action_probs=True,
            action_probs_space=self.deterministic_action_probs_space,
            num_steps=4,
        )

        test = ComponentTest(
            component=environment_stepper,
            action_space=self.deterministic_env_action_space,
        )

        weights = test.read_variable_values(
            environment_stepper.actor_component.policy.variables)
        weights_lstm = weights[
            "environment-stepper/actor-component/policy/test-lstm-network/"
            "lstm-layer/lstm-cell/kernel"]
        biases_lstm = weights[
            "environment-stepper/actor-component/policy/test-lstm-network/lstm-layer/lstm-cell/bias"]
        weights_action = weights[
            "environment-stepper/actor-component/policy/action-adapter/action-layer/dense/kernel"]
        biases_action = weights[
            "environment-stepper/actor-component/policy/action-adapter/action-layer/dense/bias"]

        # Reset the stepper.
        test.test("reset")

        # Step 4 times through the Env and collect results.
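        # With the multiply-by-0.1 preprocessor, raw states 0, 1, 2 reach the LSTM as 0.0, 0.1, 0.2;
        # after the terminal at step 3 the env resets, so step 4 feeds 0.0 again (see lstm_4).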
        lstm_1 = lstm_layer(np.array([[[0.0]]]), weights_lstm, biases_lstm)
        lstm_2 = lstm_layer(np.array([[[0.1]]]), weights_lstm, biases_lstm,
                            lstm_1[1])
        lstm_3 = lstm_layer(np.array([[[0.2]]]), weights_lstm, biases_lstm,
                            lstm_2[1])
        lstm_4 = lstm_layer(np.array([[[0.0]]]), weights_lstm, biases_lstm,
                            lstm_3[1])
        expected = (
            None,
            (
                np.array([True, False, False, True, False]),
                np.array([[0.0], [1.0], [2.0], [0.0], [1.0]]),  # s' (raw)
                np.array([
                    [0.0, 0.0],
                    softmax(
                        dense_layer(np.squeeze(lstm_1[0]), weights_action,
                                    biases_action)),
                    softmax(
                        dense_layer(np.squeeze(lstm_2[0]), weights_action,
                                    biases_action)),
                    softmax(
                        dense_layer(np.squeeze(lstm_3[0]), weights_action,
                                    biases_action)),
                    softmax(
                        dense_layer(np.squeeze(lstm_4[0]), weights_action,
                                    biases_action)),
                ]),  # action probs
                # internal states
                (np.squeeze(
                    np.array([[[0.0, 0.0, 0.0]], lstm_1[1][0], lstm_2[1][0],
                              lstm_3[1][0], lstm_4[1][0]])),
                 np.squeeze(
                     np.array([[[0.0, 0.0, 0.0]], lstm_1[1][1], lstm_2[1][1],
                               lstm_3[1][1], lstm_4[1][1]])))))
        test.test("step", expected_outputs=expected)

        # Make sure we close the session (to shut down the Env on the server).
        test.terminate()

    def test_environment_stepper_on_pong(self):
        environment_spec = dict(type="openai_gym",
                                gym_env="Pong-v0",
                                frameskip=4,
                                seed=10)
        dummy_env = Environment.from_spec(environment_spec)
        state_space = dummy_env.state_space
        action_space = dummy_env.action_space
        agent_config = config_from_path("configs/dqn_agent_for_pong.json")
        actor_component = ActorComponent(
            agent_config["preprocessing_spec"],
            dict(network_spec=agent_config["network_spec"],
                 action_adapter_spec=agent_config["action_adapter_spec"],
                 action_space=action_space), agent_config["exploration_spec"])
        environment_stepper = EnvironmentStepper(
            environment_spec=environment_spec,
            actor_component_spec=actor_component,
            state_space=state_space,
            reward_space="float",
            add_reward=True,
            num_steps=self.time_steps)

        test = ComponentTest(
            component=environment_stepper,
            action_space=action_space,
        )

        # Reset the stepper.
        test.test("reset")

        # Step `self.time_steps` (500) times through the Env and collect results.
        # 1st return value is the step-op (None); the 2nd is the tuple of collected step data
        # (terminals, (raw) next-states, rewards), each entry covering all steps.
        time_start = time.monotonic()
        out = test.test("step")
        time_end = time.monotonic()
        print("Done running {} steps in env-stepper env in {}sec.".format(
            environment_stepper.num_steps, time_end - time_start))

        # Check types of outputs.
        self.assertTrue(out[0] is None)
        self.assertTrue(isinstance(
            out[1], DataOpTuple))  # the step results as a tuple (see below)

        # Check types of single data.
        #self.assertTrue(out[1][0].dtype == np.float32)  # preprocessed states
        #self.assertTrue(out[1][0].min() >= 0.0)  # make sure we have pixels / 255
        #self.assertTrue(out[1][0].max() <= 1.0)
        #self.assertTrue(out[1][1].dtype == np.int32)  # actions
        #self.assertTrue(out[1][2].dtype == np.float32)  # rewards
        #self.assertTrue(out[1][3].dtype == np.float32)  # episode return
        self.assertTrue(out[1][0].dtype == np.bool_)  # next-state is terminal?
        self.assertTrue(
            out[1][1].dtype == np.uint8)  # next state (raw, not preprocessed)
        self.assertTrue(out[1][1].min() >= 0)  # make sure we have pixels
        self.assertTrue(out[1][1].max() <= 255)
        self.assertTrue(out[1][2].dtype == np.float32)  # rewards
        self.assertTrue(out[1][2].min() >= -1.0)  # -1.0 to 1.0
        self.assertTrue(out[1][2].max() <= 1.0)

        # Check whether episode returns match single rewards (including resetting after each terminal signal).
        #episode_returns = 0.0
        #for i in range(environment_stepper.num_steps):
        #    episode_returns += out[2][i]
        #    self.assertAlmostEqual(episode_returns, out[1][3][i])
        #    # Terminal: Reset accumulated episode-return before next step.
        #    if out[1][4][i] is np.bool_(True):
        #        episode_returns = 0.0

        # Make sure we close the session (to shut down the Env on the server).
        test.terminate()

    def test_compare_with_non_env_stepper(self):
        environment_spec = dict(type="openai_gym",
                                gym_env="Pong-v0",
                                frameskip=4,
                                seed=10)
        dummy_env = Environment.from_spec(environment_spec)
        state_space = dummy_env.state_space.with_batch_rank()
        action_space = dummy_env.action_space
        agent_config = config_from_path("configs/dqn_agent_for_pong.json")
        actor_component = ActorComponent(
            agent_config["preprocessing_spec"],
            dict(network_spec=agent_config["network_spec"],
                 action_adapter_spec=agent_config["action_adapter_spec"],
                 action_space=action_space), agent_config["exploration_spec"])
        test = ComponentTest(
            component=actor_component,
            input_spaces=dict(states=state_space),
            action_space=action_space,
        )
        s = dummy_env.reset()
        time_start = time.monotonic()
        for i in range(self.time_steps):
            out = test.test(
                ("get_preprocessed_state_and_action", np.array([s])))
            #preprocessed_s = out["preprocessed_state"]
            a = out["action"]
            # Act in env.
            s, r, t, _ = dummy_env.step(a[0])  # remove batch
            if t is True:
                s = dummy_env.reset()
        time_end = time.monotonic()
        print("Done running {} steps in bare-metal env in {}sec.".format(
            self.time_steps, time_end - time_start))
        test.terminate()

    def test_environment_stepper_on_deepmind_lab(self):
        try:
            from rlgraph.environments.deepmind_lab import DeepmindLabEnv
        except ImportError:
            print("DeepmindLab not installed: Skipping this test case.")
            return

        env_spec = dict(type="deepmind_lab",
                        level_id="seekavoid_arena_01",
                        observations=["RGB_INTERLEAVED"],
                        frameskip=4)
        dummy_env = Environment.from_spec(env_spec)
        state_space = dummy_env.state_space
        action_space = dummy_env.action_space
        actor_component = ActorComponent(
            # Preprocessor spec (only divide and flatten the image).
            [{
                "type": "divide",
                "divisor": 255
            }, {
                "type": "reshape",
                "flatten": True
            }],
            # Policy spec.
            dict(network_spec="../configs/test_lstm_nn.json",
                 action_space=action_space),
            # Exploration spec.
            Exploration(epsilon_spec=dict(decay_spec=dict(type="linear_decay",
                                                          from_=1.0,
                                                          to_=0.1,
                                                          start_timestep=0,
                                                          num_timesteps=100))))
        environment_stepper = EnvironmentStepper(
            environment_spec=env_spec,
            actor_component_spec=actor_component,
            state_space=state_space,
            reward_space="float32",
            internal_states_space=self.internal_states_space_test_lstm,
            num_steps=1000,
            # Add both prev-action and -reward into the state sent through the network.
            #add_previous_action_to_state=True,
            #add_previous_reward_to_state=True,
            add_action_probs=True,
            action_probs_space=FloatBox(shape=(9, ), add_batch_rank=True))

        test = ComponentTest(
            component=environment_stepper,
            action_space=action_space,
        )
        # Reset the stepper.
        test.test("reset")

        # Step `num_steps` times through the Env (repeated `steps` times below) and collect results.
        # 1st return value is the step-op (None); the 2nd is the tuple of collected step data
        # (terminals, (raw) next-states, action probs, internal states), each entry covering all steps.
        time_start = time.monotonic()
        steps = 10
        out = None
        for _ in range(steps):
            out = test.test("step")
        time_total = time.monotonic() - time_start
        print(
            "Done running {}x{} steps in Deepmind Lab env using IMPALA network in {}sec. ({} actions/sec)"
            .format(steps, environment_stepper.num_steps, time_total,
                    environment_stepper.num_steps * steps / time_total))

        # Check types of outputs.
        self.assertTrue(out[0] is None)
        self.assertTrue(isinstance(
            out[1], DataOpTuple))  # the step results as a tuple (see below)

        # Check types of single data.
        #self.assertTrue(out[0].dtype == np.float32)
        #self.assertTrue(out[0].min() >= 0.0)  # make sure we have pixels / 255
        #self.assertTrue(out[0].max() <= 1.0)
        #self.assertTrue(out[1].dtype == np.int32)  # actions
        #self.assertTrue(out[2].dtype == np.float32)  # rewards
        #self.assertTrue(out[0].dtype == np.float32)  # episode return
        self.assertTrue(out[1][0].dtype == np.bool_)  # next-state is terminal?
        self.assertTrue(
            out[1][1].dtype == np.uint8)  # next state (raw, not preprocessed)
        self.assertTrue(out[1][1].min() >= 0)  # make sure we have pixels
        self.assertTrue(out[1][1].max() <= 255)
        # action probs (test whether sum to one).
        #self.assertTrue(out[1][6].dtype == np.float32)
        #self.assertTrue(out[1][6].min() >= 0.0)
        #self.assertTrue(out[1][6].max() <= 1.0)
        #recursive_assert_almost_equal(out[1][6].sum(axis=-1, keepdims=False),
        #                              np.ones(shape=(environment_stepper.num_steps,)), decimals=4)
        # internal states (c- and h-state)
        self.assertTrue(out[1][3][0].dtype == np.float32)
        self.assertTrue(out[1][3][1].dtype == np.float32)
        self.assertTrue(out[1][3][0].shape == (environment_stepper.num_steps, 3))
        self.assertTrue(out[1][3][1].shape == (environment_stepper.num_steps, 3))

        # Check whether episode returns match single rewards (including terminal signals).
        #episode_returns = 0.0
        #for i in range(environment_stepper.num_steps):
        #    episode_returns += out[0][i]
        #    self.assertAlmostEqual(episode_returns, out[3][i])
        #    # Terminal: Reset for next step.
        #    if out[4][i] is np.bool_(True):
        #        episode_returns = 0.0

        test.terminate()
Example #27
0
class TestPrioritizedReplay(unittest.TestCase):
    """
    Tests sampling and insertion behaviour of the prioritized_replay module.
    """
    record_space = Dict(states=dict(state1=float, state2=float),
                        actions=dict(action1=float),
                        reward=float,
                        terminals=BoolBox(),
                        add_batch_rank=True)
    memory_variables = ["size", "index", "max-priority"]

    capacity = 10
    alpha = 1.0
    beta = 1.0

    max_priority = 1.0

    input_spaces = dict(
        # insert: records
        records=record_space,
        # get_records: num_records
        num_records=int,
        # update_records: indices, update
        indices=IntBox(add_batch_rank=True),
        update=FloatBox(add_batch_rank=True))
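
    # Background (standard prioritized replay; assumed to be what this component implements):
    # a record with priority p_i is sampled with probability P(i) = p_i**alpha / sum_k(p_k**alpha),
    # and its importance-sampling weight is w_i = (1 / (N * P(i)))**beta.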

    def test_insert(self):
        """
        Simply tests insert op without checking internal logic.
        """
        memory = PrioritizedReplay(capacity=self.capacity,
                                   alpha=self.alpha,
                                   beta=self.beta)
        test = ComponentTest(component=memory, input_spaces=self.input_spaces)

        observation = self.record_space.sample(size=1)
        test.test(("insert_records", observation), expected_outputs=None)

    def test_capacity(self):
        """
        Tests if insert correctly manages capacity.
        """
        memory = PrioritizedReplay(capacity=self.capacity,
                                   alpha=self.alpha,
                                   beta=self.beta)
        test = ComponentTest(component=memory, input_spaces=self.input_spaces)

        # Internal state variables.
        memory_variables = memory.get_variables(self.memory_variables,
                                                global_scope=False)
        buffer_size = memory_variables['size']
        buffer_index = memory_variables['index']
        max_priority = memory_variables['max-priority']

        size_value, index_value, max_priority_value = test.read_variable_values(
            buffer_size, buffer_index, max_priority)

        # Assert indices 0 before insert.
        self.assertEqual(size_value, 0)
        self.assertEqual(index_value, 0)
        self.assertEqual(max_priority_value, 1.0)

        # Insert one more element than capacity
        observation = self.record_space.sample(size=self.capacity + 1)
        test.test(("insert_records", observation), expected_outputs=None)

        size_value, index_value = test.read_variable_values(
            buffer_size, buffer_index)
        # Size should be equivalent to capacity when full.
        self.assertEqual(size_value, self.capacity)

        # After capacity + 1 inserts, the index wraps around (modulo capacity) to 1.
        self.assertEqual(index_value, 1)

    def test_batch_retrieve(self):
        """
        Tests if retrieval correctly manages capacity.
        """
        memory = PrioritizedReplay(capacity=self.capacity,
                                   alpha=self.alpha,
                                   beta=self.beta)
        test = ComponentTest(component=memory, input_spaces=self.input_spaces)

        # Insert 2 Elements.
        observation = non_terminal_records(self.record_space, 2)
        test.test(("insert_records", observation), expected_outputs=None)

        # Assert we can now fetch 2 elements.
        num_records = 2
        batch = test.test(("get_records", num_records), expected_outputs=None)
        records = batch[0]
        print('Result batch = {}'.format(records))
        self.assertEqual(2, len(records['terminals']))

        # We allow repeat indices in sampling.
        num_records = 5
        batch = test.test(("get_records", num_records), expected_outputs=None)
        records = batch[0]
        self.assertEqual(5, len(records['terminals']))

        # Now insert over capacity, note all elements here are non-terminal.
        observation = non_terminal_records(self.record_space, self.capacity)
        test.test(("insert_records", observation), expected_outputs=None)

        # Assert we can fetch exactly capacity elements.
        num_records = self.capacity
        batch = test.test(("get_records", num_records), expected_outputs=None)
        records = batch[0]
        self.assertEqual(self.capacity, len(records['terminals']))

    def test_update_records(self):
        """
        Tests update records logic.
        """
        memory = PrioritizedReplay(capacity=self.capacity)
        test = ComponentTest(component=memory, input_spaces=self.input_spaces)

        # Insert a few Elements.
        observation = non_terminal_records(self.record_space, 2)
        test.test(("insert_records", observation), expected_outputs=None)

        # Fetch elements and their indices.
        num_records = 2
        batch = test.test(("get_records", num_records), expected_outputs=None)
        indices = batch[1]
        self.assertEqual(num_records, len(indices))
        # Update the fetched records with new priority values; `update_records` does not return anything.
        input_params = [indices, np.asarray([0.1, 0.2])]
        test.test(("update_records", input_params), expected_outputs=None)

    def test_segment_tree_insert_values(self):
        """
        Tests if segment tree inserts into correct positions.
        """
        memory = PrioritizedReplay(capacity=self.capacity,
                                   alpha=self.alpha,
                                   beta=self.beta)
        test = ComponentTest(component=memory, input_spaces=self.input_spaces)
        priority_capacity = 1
        while priority_capacity < self.capacity:
            priority_capacity *= 2

        memory_variables = memory.get_variables(
            ["sum-segment-tree", "min-segment-tree"], global_scope=False)
        sum_segment_tree = memory_variables['sum-segment-tree']
        min_segment_tree = memory_variables['min-segment-tree']
        sum_segment_values, min_segment_values = test.read_variable_values(
            sum_segment_tree, min_segment_tree)

        self.assertEqual(sum(sum_segment_values), 0)
        self.assertEqual(sum(min_segment_values), float('inf'))
        self.assertEqual(len(sum_segment_values), 2 * priority_capacity)
        self.assertEqual(len(min_segment_values), 2 * priority_capacity)
        # Insert 1 Element.
        observation = non_terminal_records(self.record_space, 1)
        test.test(("insert_records", observation), expected_outputs=None)

        # Fetch segment tree.
        sum_segment_values, min_segment_values = test.read_variable_values(
            sum_segment_tree, min_segment_tree)

        # Check insert positions
        # Initial insert is at priority capacity
        print(sum_segment_values)
        print(min_segment_values)
        start = priority_capacity
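        # Tree layout (assumed): leaf i lives at array index priority_capacity + i, and each parent
        # at index p stores the aggregate (sum or min) of its children at 2*p and 2*p + 1, so
        # halving `start` below walks from the freshly inserted leaf up to the root at index 1.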

        while start >= 1:
            self.assertEqual(sum_segment_values[start], 1.0)
            self.assertEqual(min_segment_values[start], 1.0)
            start = int(start / 2)

        # Insert another Element.
        observation = non_terminal_records(self.record_space, 1)
        test.test(("insert_records", observation), expected_outputs=None)

        # Fetch segment tree.
        sum_segment_values, min_segment_values = test.read_variable_values(
            sum_segment_tree, min_segment_tree)
        print(sum_segment_values)
        print(min_segment_values)

        # Index shifted 1
        start = priority_capacity + 1
        self.assertEqual(sum_segment_values[start], 1.0)
        self.assertEqual(min_segment_values[start], 1.0)
        start = int(start / 2)
        while start >= 1:
            # 1 + 1 is 2 on the segment.
            self.assertEqual(sum_segment_values[start], 2.0)
            # min is still 1.
            self.assertEqual(min_segment_values[start], 1.0)
            start = int(start / 2)
Example #28
0
    def __init__(self,
                 world="4x4",
                 save_mode=False,
                 action_type="udlr",
                 reward_function="sparse",
                 state_representation="discrete"):
        """
        Args:
            world (Union[str,List[str]]): Either a string to map into `MAPS` or a list of strings describing the rows
                of the world (e.g. ["S ", " G"] for a two-row/two-column world with start and goal state).

            save_mode (bool): Whether to replace holes (H) with fire fields (F) via the safety switch
                below. Default: False.

            action_type (str): Which action space to use. Choose between "udlr" (up, down, left, right), which is a
                discrete action space, and "ftj" (forward + turn + jump), which is a container multi-discrete
                action space. "ftjb" is the same as "ftj", except that the sub-action "jump" is a boolean.

            reward_function (str): One of
                sparse: hole=-5, fire=-3, goal=1, all other steps=-0.1
                rich: hole=-100, fire=-10, goal=50, all other steps=-0.1

            state_representation (str):
                - "discrete": An int representing the field on the grid, 0 meaning the upper left field, 1 the one
                    below, etc..
                - "xy": The x and y grid position tuple.
                - "xy+orientation": The x and y grid position tuple plus the orientation (if any) as tuple of 2 values
                    of the actor.
                - "camera": A 3-channel image where each field in the grid-world is one pixel and the 3 channels are
                    used to indicate different items in the scene (walls, holes, the actor, etc..).
        """
        # Build our map.
        if isinstance(world, str):
            self.description = world
            world = self.MAPS[world]
        else:
            self.description = "custom-map"

        world = np.array(list(map(list, world)))
        # Apply safety switch.
        world[world == 'H'] = ("H" if not save_mode else "F")

        # `world` is a list of lists that needs to be indexed using y/x pairs (first row, then column).
        self.world = world
        self.n_row, self.n_col = self.world.shape
        (start_y, ), (start_x, ) = np.nonzero(self.world == "S")

        # Init pygame (if installed) for visualizations.
        if pygame is not None:
            self.pygame_field_size = 30
            pygame.init()
            self.pygame_agent = pygame.image.load(
                os.path.join(os.path.dirname(os.path.abspath(__file__)),
                             "images/agent.png"))
            # Create basic grid Surface for reusage.
            self.pygame_basic_surface = self.grid_to_surface()
            self.pygame_display_set = False

        # Figure out our state space.
        assert state_representation in [
            "discrete", "xy", "xy+orientation", "camera"
        ]
        self.state_representation = state_representation
        # Discrete states (single int from 0 to n).
        if self.state_representation == "discrete":
            state_space = IntBox(self.n_row * self.n_col)
        # x/y position (2 ints).
        elif self.state_representation == "xy":
            state_space = IntBox(low=(0, 0),
                                 high=(self.n_col, self.n_row),
                                 shape=(2, ))
        # x/y position + orientation (4 ints).
        elif self.state_representation == "xy+orientation":
            state_space = IntBox(low=(0, 0, 0, 0),
                                 high=(self.n_col, self.n_row, 1, 1))
        # Camera outputting a 2D color image of the world.
        else:
            state_space = IntBox(0, 255, shape=(self.n_row, self.n_col, 3))

        self.default_start_pos = self.get_discrete_pos(start_x, start_y)
        self.discrete_pos = self.default_start_pos

        assert reward_function in ["sparse",
                                   "rich"]  # TODO: "potential"-based reward
        self.reward_function = reward_function

        # Store the goal position for proximity calculations (for "potential" reward function).
        (self.goal_y, ), (self.goal_x, ) = np.nonzero(self.world == "G")

        # Specify the actual action spaces.
        self.action_type = action_type
        action_space = IntBox(4) if self.action_type == "udlr" else Dict(
            dict(forward=IntBox(3),
                 turn=IntBox(3),
                 jump=(IntBox(2) if self.action_type == "ftj" else BoolBox())))

        # Call the super's constructor.
        super(GridWorld, self).__init__(state_space=state_space,
                                        action_space=action_space)

        # Reset ourselves.
        self.state = None
        self.orientation = None  # int: 0, 90, 180, 270
        self.camera_pixels = None  # only used if state_representation == "camera"
        self.reward = None
        self.is_terminal = None
        self.reset(randomize=False)
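
    # Hypothetical usage sketch (the surrounding GridWorld class is not shown here; `reset()`,
    # `step()` and `action_space.sample()` are assumed from the rlgraph Environment interface):
    #   env = GridWorld(world="4x4", state_representation="discrete")
    #   state = env.reset()
    #   next_state, reward, terminal, info = env.step(env.action_space.sample())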