示例#1
0
    def test_string_to_hash_bucket_layer_with_different_ctor_params(self):
        """
        Runs a StringToHashBucket built with non-default constructor arguments on a batch of strings.

        The component uses "-" as delimiter, 20 hash buckets, the "strong" hash function and an int16 dtype.
        The expected bucket ids and token counts are handed to `ComponentTest.test` as expected outputs.
        """
        # The component's only input: a batched text space.
        text_space = TextBox(add_batch_rank=True)

        hasher = StringToHashBucket(
            delimiter="-", num_hash_buckets=20, hash_function="strong", dtype="int16"
        )
        component_test = ComponentTest(component=hasher, input_spaces={"text_inputs": text_space})

        # A batch of 4 strings whose tokens are separated by "-".
        batch = np.array(["text-A", "test-B", "text-C--D-and-E", "bla bla-D"])

        # NOTE: Going by the expected values below, different tokens may still share a bucket even with the
        # "strong" hash function (e.g. 'A' and 'C' -> 6, "bla bla" and 'D' -> 13). Positions past a string's
        # last token hold 18. The expected length of 5 for the third string implies that the empty token
        # produced by the double delimiter ("--") is not counted.
        bucket_rows = [
            [2, 6, 18, 18, 18],    # text      A  .  .   .
            [12, 7, 18, 18, 18],   # test      B  .  .   .
            [2, 6, 13, 19, 15],    # text      C  D  and E
            [13, 13, 18, 18, 18],  # "bla bla" D  .  .   .
        ]
        tokens_per_string = [2, 2, 5, 2]

        expected = (np.array(bucket_rows), np.array(tokens_per_string))
        component_test.test(("apply", batch), expected_outputs=expected)
示例#2
0
    def test_string_to_hash_bucket_layer(self):
        """
        Runs a StringToHashBucket with the "fast" hash function and 10 buckets on a batch of 3 strings.

        The expected bucket ids and token counts are handed to `ComponentTest.test` as expected outputs.
        """
        # The component's only input: a batched text space.
        text_space = TextBox(add_batch_rank=True)

        # Fast hashing into one of 10 possible buckets per word.
        hasher = StringToHashBucket(num_hash_buckets=10, hash_function="fast")
        component_test = ComponentTest(component=hasher, input_spaces={"text_inputs": text_space})

        # A batch of 3 strings (note the double space in the last one).
        batch = np.array([
            "text A",
            "test B",
            "text C  D and E",
        ])

        # NOTE: Some different words land in the same bucket (e.g. 'C' and 'and' -> 7, or 'text' and
        # [empty] -> 3). This can be avoided by 1) picking a larger `num_hash_buckets` or 2) using the
        # "strong" hash function.
        bucket_rows = [
            [3, 4, 3, 3, 3],  # text A .  .   .
            [6, 8, 3, 3, 3],  # test B .  .   .
            [3, 7, 5, 7, 2],  # text C D  and E
        ]
        tokens_per_string = [2, 2, 5]

        expected = (np.array(bucket_rows), np.array(tokens_per_string))
        component_test.test(("call", batch), expected_outputs=expected)
    def test_functional_api_multi_stream_nn(self):
        """
        Assembles a two-stream (text + image) network via the Keras-style functional API and checks the
        shapes and dtypes of its three outputs (final dense output, main LSTM output, main LSTM internal
        states) for one sampled input.
        """
        # Input Space of the network.
        input_space = Dict(
            {
                "img": FloatBox(shape=(6, 6, 3)),  # some RGB img
                "txt": TextBox()  # some text
            },
            add_batch_rank=True,
            add_time_rank=True)

        img, txt = ContainerSplitter("img", "txt")(input_space)
        # Complex NN assembly via our Keras-style functional API.
        # Fold text input into single batch rank.
        folded_text = ReShape(fold_time_rank=True)(txt)
        # String layer will create batched AND time-ranked (individual words) hash outputs (int64).
        string_bucket_out, lengths = StringToHashBucket(
            num_hash_buckets=5)(folded_text)
        # Batched and time-ranked embedding output (floats) with embed dim=n.
        embedding_out = EmbeddingLookup(embed_dim=10,
                                        vocab_size=5)(string_bucket_out)
        # Pass embeddings through a text LSTM and use last output (reduce time-rank).
        string_lstm_out, _ = LSTMLayer(units=2,
                                       return_sequences=False,
                                       scope="lstm-layer-txt")(
                                           embedding_out,
                                           sequence_length=lengths)
        # Unfold to get original time-rank back.
        string_lstm_out_unfolded = ReShape(unfold_time_rank=True)(
            string_lstm_out, txt)

        # Parallel image stream via 1 CNN layer plus dense.
        folded_img = ReShape(fold_time_rank=True, scope="img-fold")(img)
        cnn_out = Conv2DLayer(filters=1, kernel_size=2, strides=2)(folded_img)
        unfolded_cnn_out = ReShape(unfold_time_rank=True,
                                   scope="img-unfold")(cnn_out, img)
        unfolded_cnn_out_flattened = ReShape(
            flatten=True, scope="img-flat")(unfolded_cnn_out)
        dense_out = DenseLayer(units=2,
                               scope="dense-0")(unfolded_cnn_out_flattened)

        # Concat everything.
        concat_out = ConcatLayer()(string_lstm_out_unfolded, dense_out)

        # LSTM output has batch+time.
        main_lstm_out, internal_states = LSTMLayer(
            units=2, scope="lstm-layer-main")(concat_out)

        dense1_after_lstm_out = DenseLayer(units=3,
                                           scope="dense-1")(main_lstm_out)
        dense2_after_lstm_out = DenseLayer(
            units=2, scope="dense-2")(dense1_after_lstm_out)
        dense3_after_lstm_out = DenseLayer(
            units=1, scope="dense-3")(dense2_after_lstm_out)

        # A NN with 3 outputs.
        neural_net = NeuralNetwork(
            outputs=[dense3_after_lstm_out, main_lstm_out, internal_states])

        test = ComponentTest(component=neural_net,
                             input_spaces=dict(inputs=input_space))

        # Make sure the test is terminated even if the call or one of the assertions below fails.
        try:
            # Sample with shape (4, 2); presumably (batch, time), matching the space's two extra ranks.
            sample_shape = (4, 2)
            input_ = input_space.sample(sample_shape)

            out = test.test(("call", input_), expected_outputs=None)

            # `assertEqual` (rather than `assertTrue(a == b)`) reports both values on failure.
            # Main output (Dense out after LSTM).
            self.assertEqual(out[0].shape, sample_shape + (1, ))  # 1=1 unit in dense layer
            self.assertEqual(out[0].dtype, np.float32)
            # main-LSTM out.
            self.assertEqual(out[1].shape, sample_shape + (2, ))  # 2=2 LSTM units
            self.assertEqual(out[1].dtype, np.float32)
            # main-LSTM internal-states: two tensors, each without the second sample dimension.
            for idx in (0, 1):
                state = out[2][idx]
                self.assertEqual(state.shape, sample_shape[:1] + (2, ))  # 2=2 LSTM units
                self.assertEqual(state.dtype, np.float32)
        finally:
            test.terminate()
    def test_keras_style_complex_multi_stream_nn(self):
        """
        Assembles a two-stream (text + image) network from sub-Spaces of three separate input Spaces and
        checks the shapes and dtypes of its three outputs (final dense output, main LSTM output, main LSTM
        internal states) for one sampled input.
        """
        # 3 inputs.
        input_spaces = [
            Dict({
                "img": FloatBox(shape=(6, 6, 3)),
                "int": IntBox(3)
            }, add_batch_rank=True, add_time_rank=True),
            FloatBox(shape=(2,), add_batch_rank=True),
            Tuple(IntBox(2), TextBox(), add_batch_rank=True, add_time_rank=True)
        ]

        # Two-stream NN using only some of the sub-Spaces from the input spaces.
        # Tests whether this NN can add automatically the correct splitters.
        folded_text = ReShape(fold_time_rank=True)(input_spaces[2][1])
        # String layer will create batched AND time-ranked (individual words) hash outputs (int64).
        string_bucket_out, lengths = StringToHashBucket(num_hash_buckets=5)(folded_text)
        # Batched and time-ranked embedding output (floats) with embed dim=n.
        embedding_out = EmbeddingLookup(embed_dim=10, vocab_size=5)(string_bucket_out)
        # Pass embeddings through a text LSTM and use last output (reduce time-rank).
        string_lstm_out, _ = LSTMLayer(units=2, return_sequences=False, scope="lstm-layer-txt")(
            embedding_out, sequence_length=lengths
        )
        # Unfold to get original time-rank back.
        string_lstm_out_unfolded = ReShape(unfold_time_rank=True)(string_lstm_out, input_spaces[2][1])

        # Parallel image stream via 1 CNN layer plus dense.
        folded_img = ReShape(fold_time_rank=True, scope="img-fold")(input_spaces[0]["img"])
        cnn_out = Conv2DLayer(filters=1, kernel_size=2, strides=2)(folded_img)
        unfolded_cnn_out = ReShape(unfold_time_rank=True, scope="img-unfold")(cnn_out, input_spaces[0]["img"])
        unfolded_cnn_out_flattened = ReShape(flatten=True, scope="img-flat")(unfolded_cnn_out)
        dense_out = DenseLayer(units=2, scope="dense-0")(unfolded_cnn_out_flattened)

        # Concat everything.
        concat_out = ConcatLayer()(string_lstm_out_unfolded, dense_out)

        # LSTM output has batch+time.
        main_lstm_out, internal_states = LSTMLayer(units=2, scope="lstm-layer-main")(concat_out)

        dense1_after_lstm_out = DenseLayer(units=3, scope="dense-1")(main_lstm_out)
        dense2_after_lstm_out = DenseLayer(units=2, scope="dense-2")(dense1_after_lstm_out)
        dense3_after_lstm_out = DenseLayer(units=1, scope="dense-3")(dense2_after_lstm_out)

        # A NN with 3 outputs.
        neural_net = NeuralNetwork(inputs=input_spaces, outputs=[dense3_after_lstm_out, main_lstm_out, internal_states])

        test = ComponentTest(component=neural_net, input_spaces=dict(inputs=input_spaces))

        # Make sure the test is terminated even if the call or one of the assertions below fails.
        try:
            # Sample with shape (4, 2); the second input Space only has a batch rank, so it is sampled
            # with the first dimension only.
            sample_shape = (4, 2)
            input_ = [input_spaces[0].sample(sample_shape), input_spaces[1].sample(sample_shape[0]),
                      input_spaces[2].sample(sample_shape)]

            out = test.test(("call", tuple(input_)), expected_outputs=None)

            # `assertEqual` (rather than `assertTrue(a == b)`) reports both values on failure.
            # Main output (Dense out after LSTM).
            self.assertEqual(out[0].shape, sample_shape + (1,))  # 1=1 unit in dense layer
            self.assertEqual(out[0].dtype, np.float32)
            # main-LSTM out.
            self.assertEqual(out[1].shape, sample_shape + (2,))  # 2=2 LSTM units
            self.assertEqual(out[1].dtype, np.float32)
            # main-LSTM internal-states: two tensors, each without the second sample dimension.
            for idx in (0, 1):
                state = out[2][idx]
                self.assertEqual(state.shape, sample_shape[:1] + (2,))  # 2=2 LSTM units
                self.assertEqual(state.dtype, np.float32)
        finally:
            test.terminate()