Example #1
    def test_kafka_global_configuration(self):
        """Tests for KafkaDataset global configuration properties."""
        tf.compat.v1.disable_eager_execution()

        import tensorflow_io.kafka as kafka_io  # pylint: disable=wrong-import-position

        topics = tf.compat.v1.placeholder(tf.dtypes.string, shape=[None])
        num_epochs = tf.compat.v1.placeholder(tf.dtypes.int64, shape=[])
        cfg_list = ["debug=generic", "enable.auto.commit=false"]

        repeat_dataset = kafka_io.KafkaDataset(
            topics, group="test", eof=True,
            config_global=cfg_list).repeat(num_epochs)

        iterator = tf.compat.v1.data.Iterator.from_structure(
            repeat_dataset.output_types)
        init_op = iterator.make_initializer(repeat_dataset)
        get_next = iterator.get_next()

        with self.cached_session() as sess:
            sess.run(init_op,
                     feed_dict={
                         topics: ["test:0:0:4"],
                         num_epochs: 1
                     })
            for i in range(5):
                self.assertEqual(("D" + str(i)).encode(), sess.run(get_next))
            with self.assertRaises(tf.errors.OutOfRangeError):
                sess.run(get_next)
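
A side note on the subscription strings fed above: values such as "test:0:0:4" appear to encode topic:partition:start_offset:end_offset, with -1 meaning "read until end of partition"; this is inferred from the tests in this collection rather than taken from official documentation. A tiny, purely illustrative helper for building them:

def subscription(topic, partition=0, start=0, end=-1):
    # Builds a "topic:partition:start:end" string like the ones fed to `topics`
    # above (field meaning inferred from these tests, not guaranteed by the API).
    return "{}:{}:{}:{}".format(topic, partition, start, end)

# subscription("test", 0, 0, 4) == "test:0:0:4"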
Example #2
    def test_kafka_wrong_topic_configuration_failed(self):
        """Tests for KafkaDataset wrong topic configuration properties."""
        tf.compat.v1.disable_eager_execution()

        import tensorflow_io.kafka as kafka_io  # pylint: disable=wrong-import-position

        topics = tf.compat.v1.placeholder(tf.dtypes.string, shape=[None])
        num_epochs = tf.compat.v1.placeholder(tf.dtypes.int64, shape=[])

        # Add wrong configuration
        wrong_cfg = ["auto.offset.reset=arliest"]
        repeat_dataset = kafka_io.KafkaDataset(
            topics, group="test", eof=True,
            config_topic=wrong_cfg).repeat(num_epochs)

        iterator = tf.compat.v1.data.Iterator.from_structure(
            repeat_dataset.output_types)
        init_op = iterator.make_initializer(repeat_dataset)
        get_next = iterator.get_next()

        with self.cached_session() as sess:
            sess.run(init_op,
                     feed_dict={
                         topics: ["test:0:0:4"],
                         num_epochs: 1
                     })
            with self.assertRaises(tf.errors.InternalError):
                sess.run(get_next)
Example #3

def kafka_dataset(servers, topic, offset, schema, eof=True):
    print("Create: ", "{}:0:{}".format(topic, offset))
    dataset = kafka_io.KafkaDataset(["{}:0:{}".format(topic, offset, offset)], servers=servers,
                                    group="cardata-autoencoder", eof=eof, config_global=kafka_config)

    # remove kafka framing
    dataset = dataset.map(lambda e: tf.strings.substr(e, 5, -1))

    # deserialize avro
    dataset = dataset.map(
        lambda e: kafka_io.decode_avro(
            e, schema=schema, dtype=[
                tf.float64,
                tf.float64,
                tf.float64,
                tf.float64,
                tf.float64,
                tf.float64,
                tf.float64,
                tf.float64,
                tf.float64,
                tf.int32,
                tf.int32,
                tf.int32,
                tf.int32,
                tf.float64,
                tf.float64,
                tf.float64,
                tf.float64,
                tf.int32,
                tf.string]))
    return dataset
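
A possible call site for kafka_dataset(); the broker address, topic, schema file, and kafka_config contents below are assumptions for illustration only (the function reads kafka_config as a module-level global):

import tensorflow as tf
import tensorflow_io.kafka as kafka_io

kafka_config = ["enable.auto.commit=false"]   # assumed global consumed by kafka_dataset()

with open("cardata-v1.avsc") as f:            # assumed local copy of the Avro schema
    schema = f.read()

train_ds = kafka_dataset("localhost:9092", "cardata-v1", 0, schema).batch(32)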
Example #4
    def test_kafka_topic_configuration(self):
        """Tests for KafkaDataset topic configuration properties."""
        tf.compat.v1.disable_eager_execution()

        import tensorflow_io.kafka as kafka_io  # pylint: disable=wrong-import-position

        topics = tf.compat.v1.placeholder(tf.dtypes.string, shape=[None])
        num_epochs = tf.compat.v1.placeholder(tf.dtypes.int64, shape=[])
        cfg_list = ["auto.offset.reset=earliest"]

        repeat_dataset = kafka_io.KafkaDataset(
            topics, group="test", eof=True,
            config_topic=cfg_list).repeat(num_epochs)

        iterator = tf.compat.v1.data.Iterator.from_structure(
            repeat_dataset.output_types)
        init_op = iterator.make_initializer(repeat_dataset)
        get_next = iterator.get_next()

        with self.cached_session() as sess:
            # Use a wrong offset 100 here to make sure
            # configuration 'auto.offset.reset=earliest' works.
            sess.run(init_op,
                     feed_dict={
                         topics: ["test:0:100:-1"],
                         num_epochs: 1
                     })
            for i in range(5):
                self.assertEqual(("D" + str(i)).encode(), sess.run(get_next))
Example #5
    def test_kafka_dataset_save_and_restore(self):
        """Tests for KafkaDataset save and restore."""
        g = tf.Graph()
        with g.as_default():
            topics = tf.compat.v1.placeholder(dtypes.string, shape=[None])
            num_epochs = tf.compat.v1.placeholder(dtypes.int64, shape=[])

            repeat_dataset = kafka_io.KafkaDataset(
                topics, group="test", eof=True
            ).repeat(num_epochs)
            iterator = repeat_dataset.make_initializable_iterator()
            get_next = iterator.get_next()

            it = tf.data.experimental.make_saveable_from_iterator(iterator)
            g.add_to_collection(tf.compat.v1.GraphKeys.SAVEABLE_OBJECTS, it)
            saver = tf.compat.v1.train.Saver()

            model_file = "/tmp/test-kafka-model"
            with self.cached_session() as sess:
                sess.run(
                    iterator.initializer,
                    feed_dict={topics: ["test:0:0:4"], num_epochs: 1},
                )
                for i in range(3):
                    self.assertEqual(("D" + str(i)).encode(), sess.run(get_next))
                # Save current offset which is 2
                saver.save(sess, model_file, global_step=3)

            checkpoint_file = "/tmp/test-kafka-model-3"
            with self.cached_session() as sess:
                saver.restore(sess, checkpoint_file)
                # Restore current offset to 2
                for i in [2, 3]:
                    self.assertEqual(("D" + str(i)).encode(), sess.run(get_next))
Example #6
    def test_kafka_dataset_with_key(self):
        """Tests for KafkaDataset."""
        topics = tf.compat.v1.placeholder(dtypes.string, shape=[None])
        num_epochs = tf.compat.v1.placeholder(dtypes.int64, shape=[])
        batch_size = tf.compat.v1.placeholder(dtypes.int64, shape=[])

        repeat_dataset = kafka_io.KafkaDataset(
            topics, group="test", eof=True,
            message_key=True).repeat(num_epochs)
        batch_dataset = repeat_dataset.batch(batch_size)

        iterator = data.Iterator.from_structure(batch_dataset.output_types)
        init_op = iterator.make_initializer(repeat_dataset)
        get_next = iterator.get_next()

        with self.cached_session() as sess:
            # Basic test: read a limited number of keyed messages from the topic.
            sess.run(init_op,
                     feed_dict={
                         topics: ["key-test:0:0:4"],
                         num_epochs: 1
                     })
            for i in range(5):
                self.assertEqual(
                    (("D" + str(i)).encode(), ("K" + str(i % 2)).encode()),
                    sess.run(get_next),
                )
            with self.assertRaises(errors.OutOfRangeError):
                sess.run(get_next)
Example #7
    def test_kafka_dataset_with_offset(self):
        """Tests for KafkaDataset when reading non-keyed messages
        from a single-partitioned topic"""
        tf.compat.v1.disable_eager_execution()

        import tensorflow_io.kafka as kafka_io  # pylint: disable=wrong-import-position

        topics = tf.compat.v1.placeholder(tf.dtypes.string, shape=[None])
        num_epochs = tf.compat.v1.placeholder(tf.dtypes.int64, shape=[])
        batch_size = tf.compat.v1.placeholder(tf.dtypes.int64, shape=[])

        repeat_dataset = kafka_io.KafkaDataset(
            topics, group="test", eof=True,
            message_offset=True).repeat(num_epochs)
        batch_dataset = repeat_dataset.batch(batch_size)

        iterator = tf.compat.v1.data.Iterator.from_structure(
            batch_dataset.output_types)
        init_op = iterator.make_initializer(repeat_dataset)
        get_next = iterator.get_next()

        with self.cached_session() as sess:
            # Basic offset test: read a limited number of messages from the topic.
            sess.run(init_op,
                     feed_dict={
                         topics: ["offset-test:0:0:4"],
                         num_epochs: 1
                     })
            for i in range(5):
                self.assertEqual(
                    (("D" + str(i)).encode(), ("0:" + str(i)).encode()),
                    sess.run(get_next),
                )
            with self.assertRaises(tf.errors.OutOfRangeError):
                sess.run(get_next)
Example #8
def run():
    dataset = kafka_io.KafkaDataset([ TRAINING_TOPIC_NAME + ':0' ],
                                    servers=KAFKA_BOOTSTRAP,
                                    group="dalelane-tensorflow-train",
                                    config_global=[
                                        "api.version.request=true",
                                        "sasl.mechanisms=PLAIN",
                                        "security.protocol=sasl_ssl",
                                        "sasl.username=token",
                                        "sasl.password="******"ssl.ca.location=" + CERT
                                    ])

    dataset = dataset.map(deserialize).batch(1)

    # neural net definition taken from
    #  https://www.tensorflow.org/tutorials/keras/classification#set_up_the_layers

    model = keras.Sequential([
        keras.layers.Flatten(input_shape=(28, 28)),
        keras.layers.Dense(128, activation="relu"),
        keras.layers.Dense(10, activation="softmax")
    ])

    model.compile(optimizer="adam",
                loss="sparse_categorical_crossentropy",
                metrics=["accuracy"])

    model.fit(dataset, epochs=4, steps_per_epoch=1000)

    return model
Example #9
    def check(self, images, predictions):
        import tensorflow_io.kafka as kafka_io

        f = kafka_io.KafkaDataset(topics=[self._topic], group="test", eof=True)
        lines = list(f)
        assert np.all(lines == predictions)

        assert len(lines) == len(images)
Example #10
def test_kafka_output_sequence():
    """Test case based on fashion mnist tutorial"""
    fashion_mnist = tf.keras.datasets.fashion_mnist
    ((train_images, train_labels), (test_images,
                                    _)) = fashion_mnist.load_data()

    class_names = [
        'T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat', 'Sandal',
        'Shirt', 'Sneaker', 'Bag', 'Ankle boot'
    ]

    train_images = train_images / 255.0
    test_images = test_images / 255.0

    model = tf.keras.Sequential([
        tf.keras.layers.Flatten(input_shape=(28, 28)),
        tf.keras.layers.Dense(128, activation=tf.nn.relu),
        tf.keras.layers.Dense(10, activation=tf.nn.softmax)
    ])

    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    model.fit(train_images, train_labels, epochs=5)

    class OutputCallback(tf.keras.callbacks.Callback):
        """KafkaOutputCallback"""
        def __init__(self, batch_size, topic, servers):
            self._sequence = kafka_io.KafkaOutputSequence(topic=topic,
                                                          servers=servers)
            self._batch_size = batch_size

        def on_predict_batch_end(self, batch, logs=None):
            index = batch * self._batch_size
            for outputs in logs['outputs']:
                for output in outputs:
                    self._sequence.setitem(index,
                                           class_names[np.argmax(output)])
                    index += 1

        def flush(self):
            self._sequence.flush()

    channel = "e{}e".format(time.time())
    topic = "test_" + channel

    # By default batch size is 32
    output = OutputCallback(32, topic, "localhost")
    predictions = model.predict(test_images, callbacks=[output])
    output.flush()

    predictions = [class_names[v] for v in np.argmax(predictions, axis=1)]

    # Reading from `test_e(time)e` we should get the same result
    dataset = kafka_io.KafkaDataset(topics=[topic], group="test", eof=True)
    for entry, prediction in zip(dataset, predictions):
        assert entry.numpy() == prediction.encode()
Example #11
    def test_write_kafka(self):
        """test_write_kafka"""
        tf.compat.v1.disable_eager_execution()

        import tensorflow_io.kafka as kafka_io  # pylint: disable=wrong-import-position

        channel = "e{}e".format(time.time())

        # Start with reading test topic, replace `D` with `e(time)e`,
        # and write to test_e(time)e` topic.
        dataset = kafka_io.KafkaDataset(topics=["test:0:0:4"],
                                        group="test",
                                        eof=True)
        dataset = dataset.map(lambda x: kafka_io.write_kafka(
            tf.strings.regex_replace(x, "D", channel), topic="test_" + channel)
                              )
        iterator = dataset.make_initializable_iterator()
        init_op = iterator.initializer
        get_next = iterator.get_next()

        with self.cached_session() as sess:
            # Basic test: read a limited number of messages from the topic.
            sess.run(init_op)
            for i in range(5):
                self.assertEqual((channel + str(i)).encode(),
                                 sess.run(get_next))
            with self.assertRaises(tf.errors.OutOfRangeError):
                sess.run(get_next)

        # Reading from `test_e(time)e` we should get the same result
        dataset = kafka_io.KafkaDataset(topics=["test_" + channel],
                                        group="test",
                                        eof=True)
        iterator = dataset.make_initializable_iterator()
        init_op = iterator.initializer
        get_next = iterator.get_next()

        with self.cached_session() as sess:
            sess.run(init_op)
            for i in range(5):
                self.assertEqual((channel + str(i)).encode(),
                                 sess.run(get_next))
            with self.assertRaises(tf.errors.OutOfRangeError):
                sess.run(get_next)
Example #12
def test_avro_kafka_dataset():
    """test_avro_kafka_dataset"""
    schema = ('{"type":"record","name":"myrecord","fields":'
              '[{"name":"f1","type":"string"},{"name":"f2","type":"long"}]}"')
    dataset = kafka_io.KafkaDataset(["avro-test:0"],
                                    group="avro-test",
                                    eof=True)
    # remove kafka framing
    dataset = dataset.map(lambda e: tf.strings.substr(e, 5, -1))
    # deserialize avro
    dataset = dataset.map(lambda e: kafka_io.decode_avro(
        e, schema=schema, dtype=[tf.string, tf.int64]))
    entries = [(f1.numpy(), f2.numpy()) for (f1, f2) in dataset]
    np.all(entries == [('value1', 1), ('value2', 2), ('value3', 3)])
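
The tf.strings.substr(e, 5, -1) step above drops the first five bytes of every message. If the producer used a Confluent schema-registry serializer (an assumption about the producer, not something this test states), those bytes are a magic byte followed by a 4-byte big-endian schema ID. A plain-Python sketch of that framing:

import struct

def split_confluent_framing(message_bytes):
    # Assumed Confluent wire format: byte 0 is the magic byte (0x00),
    # bytes 1-4 are a big-endian schema ID, the remainder is the Avro payload.
    magic, schema_id = struct.unpack(">bI", message_bytes[:5])
    return schema_id, message_bytes[5:]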
Example #13
def get_train_data(boostrap_servers, kafka_topic, group, batch, decoder):
    """Obtains the data and labels for training from Kafka

    Args:
        boostrap_servers (str): list of bootstrap servers for the connection with Kafka
        kafka_topic (str): Kafka topic holding the training data
        group (str): Kafka consumer group
        batch (int): batch size for training
        decoder (class): decoder to decode the data

    Returns:
        train_kafka: training data and labels from Kafka
    """

    logging.info("Starts receiving training data from Kafka servers [%s] with topics [%s]", boostrap_servers,  kafka_topic)
    train_data = kafka_io.KafkaDataset([kafka_topic], servers=boostrap_servers, group=group, eof=True, message_key=True).map(lambda x, y: decoder.decode(x, y)).batch(batch)
    
    return train_data
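
A hypothetical call site for get_train_data(); the broker list, topic, group, and decoder class below are illustrative assumptions showing the expected interface (a decode(x, y) method that maps the raw message value and key to a (features, label) pair):

import tensorflow as tf

class RawDecoder:
    """Hypothetical decoder: message value -> float features, message key -> int label."""
    def decode(self, x, y):
        features = tf.io.decode_raw(x, tf.float32)
        label = tf.strings.to_number(y, out_type=tf.int32)
        return features, label

train_data = get_train_data("localhost:9092", "train-topic", "train-group", 32, RawDecoder())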
Example #14
def test_avro_kafka_dataset():
    """test_avro_kafka_dataset"""
    schema = ('{"type":"record","name":"myrecord","fields":['
              '{"name":"f1","type":"string"},'
              '{"name":"f2","type":"long"},'
              '{"name":"f3","type":["null","string"],"default":null}'
              "]}")
    dataset = kafka_io.KafkaDataset(["avro-test:0"],
                                    group="avro-test",
                                    eof=True)
    # remove kafka framing
    dataset = dataset.map(lambda e: tf.strings.substr(e, 5, -1))
    # deserialize avro
    dataset = dataset.map(lambda e: tfio.experimental.serialization.
                          decode_avro(e, schema=schema))
    entries = [(e["f1"], e["f2"], e["f3"]) for e in dataset]
    np.all(entries == [("value1", 1, ""), ("value2", 2, ""), ("value3", 3,
                                                              "")])
Example #15
def test_avro_kafka_dataset_with_resource():
  """test_avro_kafka_dataset_with_resource"""
  schema = ('{"type":"record","name":"myrecord","fields":['
            '{"name":"f1","type":"string"},'
            '{"name":"f2","type":"long"},'
            '{"name":"f3","type":["null","string"],"default":null}'
            ']}"')
  schema_resource = kafka_io.decode_avro_init(schema)
  dataset = kafka_io.KafkaDataset(
      ["avro-test:0"], group="avro-test", eof=True)
  # remove kafka framing
  dataset = dataset.map(lambda e: tf.strings.substr(e, 5, -1))
  # deserialize avro
  dataset = dataset.map(
      lambda e: kafka_io.decode_avro(
          e, schema=schema_resource, dtype=[tf.string, tf.int64, tf.string]))
  entries = [(f1.numpy(), f2.numpy(), f3.numpy()) for (f1, f2, f3) in dataset]
  np.all(entries == [('value1', 1, ''), ('value2', 2, ''), ('value3', 3, '')])
Example #16
    def test_kafka_wrong_global_configuration_failed(self):
        """Tests for KafkaDataset worng global configuration properties."""
        topics = tf.compat.v1.placeholder(dtypes.string, shape=[None])
        num_epochs = tf.compat.v1.placeholder(dtypes.int64, shape=[])

        # Add wrong configuration
        wrong_cfg = ["debug=al"]
        repeat_dataset = kafka_io.KafkaDataset(
            topics, group="test", eof=True, config_global=wrong_cfg
        ).repeat(num_epochs)

        iterator = data.Iterator.from_structure(repeat_dataset.output_types)
        init_op = iterator.make_initializer(repeat_dataset)
        get_next = iterator.get_next()

        with self.cached_session() as sess:
            sess.run(init_op, feed_dict={topics: ["test:0:0:4"], num_epochs: 1})
            with self.assertRaises(errors.InternalError):
                sess.run(get_next)
Example #17
    def test_kafka_dataset(self):
        """Tests for KafkaDataset."""
        topics = tensorflow.compat.v1.placeholder(dtypes.string, shape=[None])
        num_epochs = tensorflow.compat.v1.placeholder(dtypes.int64, shape=[])
        batch_size = tensorflow.compat.v1.placeholder(dtypes.int64, shape=[])

        repeat_dataset = kafka_io.KafkaDataset(topics, group="test",
                                               eof=True).repeat(num_epochs)
        batch_dataset = repeat_dataset.batch(batch_size)

        iterator = data.Iterator.from_structure(batch_dataset.output_types)
        init_op = iterator.make_initializer(repeat_dataset)
        init_batch_op = iterator.make_initializer(batch_dataset)
        get_next = iterator.get_next()

        with self.cached_session() as sess:
            # Basic test: read a limited number of messages from the topic.
            sess.run(init_op,
                     feed_dict={
                         topics: ["test:0:0:4"],
                         num_epochs: 1
                     })
            for i in range(5):
                self.assertEqual(("D" + str(i)).encode(), sess.run(get_next))
            with self.assertRaises(errors.OutOfRangeError):
                sess.run(get_next)

            # Basic test: read all the messages from the topic from offset 5.
            sess.run(init_op,
                     feed_dict={
                         topics: ["test:0:5:-1"],
                         num_epochs: 1
                     })
            for i in range(5):
                self.assertEqual(("D" + str(i + 5)).encode(),
                                 sess.run(get_next))
            with self.assertRaises(errors.OutOfRangeError):
                sess.run(get_next)

            # Basic test: read from different subscriptions of the same topic.
            sess.run(init_op,
                     feed_dict={
                         topics: ["test:0:0:4", "test:0:5:-1"],
                         num_epochs: 1
                     })
            for j in range(2):
                for i in range(5):
                    self.assertEqual(("D" + str(i + j * 5)).encode(),
                                     sess.run(get_next))
            with self.assertRaises(errors.OutOfRangeError):
                sess.run(get_next)

            # Test repeated iteration through both subscriptions.
            sess.run(init_op,
                     feed_dict={
                         topics: ["test:0:0:4", "test:0:5:-1"],
                         num_epochs: 10
                     })
            for _ in range(10):
                for j in range(2):
                    for i in range(5):
                        self.assertEqual(("D" + str(i + j * 5)).encode(),
                                         sess.run(get_next))
            with self.assertRaises(errors.OutOfRangeError):
                sess.run(get_next)

            # Test batched and repeated iteration through both subscriptions.
            sess.run(init_batch_op,
                     feed_dict={
                         topics: ["test:0:0:4", "test:0:5:-1"],
                         num_epochs: 10,
                         batch_size: 5
                     })
            for _ in range(10):
                self.assertAllEqual([("D" + str(i)).encode()
                                     for i in range(5)], sess.run(get_next))
                self.assertAllEqual([("D" + str(i + 5)).encode()
                                     for i in range(5)], sess.run(get_next))
Example #18
    def test_kafka_dataset(self):
        """Tests for KafkaDataset when reading non-keyed messages
        from a single-partitioned topic"""
        topics = tf.compat.v1.placeholder(dtypes.string, shape=[None])
        num_epochs = tf.compat.v1.placeholder(dtypes.int64, shape=[])
        batch_size = tf.compat.v1.placeholder(dtypes.int64, shape=[])

        repeat_dataset = kafka_io.KafkaDataset(topics, group="test", eof=True).repeat(
            num_epochs
        )
        batch_dataset = repeat_dataset.batch(batch_size)

        iterator = data.Iterator.from_structure(batch_dataset.output_types)
        init_op = iterator.make_initializer(repeat_dataset)
        init_batch_op = iterator.make_initializer(batch_dataset)
        get_next = iterator.get_next()

        with self.cached_session() as sess:
            # Basic test: read a limited number of messages from the topic.
            sess.run(init_op, feed_dict={topics: ["test:0:0:4"], num_epochs: 1})
            for i in range(5):
                self.assertEqual(("D" + str(i)).encode(), sess.run(get_next))
            with self.assertRaises(errors.OutOfRangeError):
                sess.run(get_next)

            # Basic test: read all the messages from the topic from offset 5.
            sess.run(init_op, feed_dict={topics: ["test:0:5:-1"], num_epochs: 1})
            for i in range(5):
                self.assertEqual(("D" + str(i + 5)).encode(), sess.run(get_next))
            with self.assertRaises(errors.OutOfRangeError):
                sess.run(get_next)

            # Basic test: read from different subscriptions of the same topic.
            sess.run(
                init_op,
                feed_dict={topics: ["test:0:0:4", "test:0:5:-1"], num_epochs: 1},
            )
            for j in range(2):
                for i in range(5):
                    self.assertEqual(
                        ("D" + str(i + j * 5)).encode(), sess.run(get_next)
                    )
            with self.assertRaises(errors.OutOfRangeError):
                sess.run(get_next)

            # Test repeated iteration through both subscriptions.
            sess.run(
                init_op,
                feed_dict={topics: ["test:0:0:4", "test:0:5:-1"], num_epochs: 10},
            )
            for _ in range(10):
                for j in range(2):
                    for i in range(5):
                        self.assertEqual(
                            ("D" + str(i + j * 5)).encode(), sess.run(get_next)
                        )
            with self.assertRaises(errors.OutOfRangeError):
                sess.run(get_next)

            # Test batched and repeated iteration through both subscriptions.
            sess.run(
                init_batch_op,
                feed_dict={
                    topics: ["test:0:0:4", "test:0:5:-1"],
                    num_epochs: 10,
                    batch_size: 5,
                },
            )
            for _ in range(10):
                self.assertAllEqual(
                    [("D" + str(i)).encode() for i in range(5)], sess.run(get_next)
                )
                self.assertAllEqual(
                    [("D" + str(i + 5)).encode() for i in range(5)], sess.run(get_next)
                )
Example #19
    def test_kafka_dataset_with_partitioned_key(self):
        """Tests for KafkaDataset when reading keyed-messages
        from a multi-partitioned topic"""
        topics = tf.compat.v1.placeholder(dtypes.string, shape=[None])
        num_epochs = tf.compat.v1.placeholder(dtypes.int64, shape=[])
        batch_size = tf.compat.v1.placeholder(dtypes.int64, shape=[])

        repeat_dataset = kafka_io.KafkaDataset(
            topics, group="test", eof=True, message_key=True
        ).repeat(num_epochs)
        batch_dataset = repeat_dataset.batch(batch_size)

        iterator = data.Iterator.from_structure(batch_dataset.output_types)
        init_op = iterator.make_initializer(repeat_dataset)
        init_batch_op = iterator.make_initializer(batch_dataset)
        get_next = iterator.get_next()

        with self.cached_session() as sess:
            # Basic test: read first 5 messages from the first partition of the topic.
            # NOTE: The key-to-partition mapping depends on the order in which the data
            # was written to Kafka. Please check kafka_test.sh for the sample data.

            sess.run(
                init_op,
                feed_dict={topics: ["key-partition-test:0:0:5"], num_epochs: 1},
            )
            for i in range(5):
                self.assertEqual(
                    (("D" + str(i * 2)).encode(), (b"K0")), sess.run(get_next),
                )
            with self.assertRaises(errors.OutOfRangeError):
                sess.run(get_next)

            # Basic test: read first 5 messages from the second partition of the topic.
            sess.run(
                init_op,
                feed_dict={topics: ["key-partition-test:1:0:5"], num_epochs: 1},
            )
            for i in range(5):
                self.assertEqual(
                    (("D" + str(i * 2 + 1)).encode(), (b"K1")), sess.run(get_next),
                )
            with self.assertRaises(errors.OutOfRangeError):
                sess.run(get_next)

            # Basic test: read from different subscriptions to the same topic.
            sess.run(
                init_op,
                feed_dict={
                    topics: ["key-partition-test:0:0:5", "key-partition-test:1:0:5"],
                    num_epochs: 1,
                },
            )
            for j in range(2):
                for i in range(5):
                    self.assertEqual(
                        (("D" + str(i * 2 + j)).encode(), ("K" + str(j)).encode()),
                        sess.run(get_next),
                    )
            with self.assertRaises(errors.OutOfRangeError):
                sess.run(get_next)

            # Test repeated iteration through both subscriptions.
            sess.run(
                init_op,
                feed_dict={
                    topics: ["key-partition-test:0:0:5", "key-partition-test:1:0:5"],
                    num_epochs: 10,
                },
            )
            for _ in range(10):
                for j in range(2):
                    for i in range(5):
                        self.assertEqual(
                            (("D" + str(i * 2 + j)).encode(), ("K" + str(j)).encode()),
                            sess.run(get_next),
                        )
            with self.assertRaises(errors.OutOfRangeError):
                sess.run(get_next)

            # Test batched and repeated iteration through both subscriptions.
            sess.run(
                init_batch_op,
                feed_dict={
                    topics: ["key-partition-test:0:0:5", "key-partition-test:1:0:5"],
                    num_epochs: 10,
                    batch_size: 5,
                },
            )
            for _ in range(10):
                for j in range(2):
                    self.assertAllEqual(
                        [
                            [("D" + str(i * 2 + j)).encode() for i in range(5)],
                            [("K" + str(j)).encode() for i in range(5)],
                        ],
                        sess.run(get_next),
                    )
Example #20

def func_x(x):
    # Decode image to (28, 28)
    x = tf.io.decode_raw(x, out_type=tf.uint8)
    x = tf.reshape(x, [28, 28])
    # Convert to float32 for tf.keras
    x = tf.image.convert_image_dtype(x, tf.float32)
    return x


def func_y(y):
    # Decode image to (,)
    y = tf.io.decode_raw(y, out_type=tf.uint8)
    y = tf.reshape(y, [])
    return y


train_images = kafka_io.KafkaDataset(['xx:0'], group='xx',
                                     eof=True).map(func_x)
train_labels = kafka_io.KafkaDataset(['yy:0'], group='yy',
                                     eof=True).map(func_y)
train_kafka = tf.data.Dataset.zip((train_images, train_labels)).batch(1)
print(train_kafka)

# 3. Keras model
model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),
    tf.keras.layers.Dense(128, activation=tf.nn.relu),
    tf.keras.layers.Dense(10, activation=tf.nn.softmax)
])
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
# default: steps_per_epoch=12000
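
The snippet is cut off before the training call; a plausible completion, with the epoch count a pure guess and steps_per_epoch taken from the trailing comment:

model.fit(train_kafka, epochs=1, steps_per_epoch=12000)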
Example #21
  def check(self, images, predictions):
    f = kafka_io.KafkaDataset(topics=[self._topic], group="test", eof=True)
    lines = [line for line in f]
    assert np.all(lines == predictions)

    assert len(lines) == len(images)
Example #22

import tensorflow_io.kafka as kafka_io
# local dependencies
import train_model
# config values
from config import EVENT_STREAMS_API_KEY, CERT
from config import KAFKA_BOOTSTRAP, TEST_STREAM_TOPIC_NAME

print("-->>> Train a machine learning model using training data from Kafka")
model = train_model.run()

print("-->>> Prepare a streaming dataset based on a Kafka topic")
dataset = kafka_io.KafkaDataset(
    [TEST_STREAM_TOPIC_NAME + ":0"],
    servers=KAFKA_BOOTSTRAP,
    group="dalelane-tensorflow-test",
    eof=False,
    config_global=[
        "api.version.request=true", "sasl.mechanisms=PLAIN",
        "security.protocol=sasl_ssl", "sasl.username=token",
        "sasl.password="******"ssl.ca.location=" + CERT
    ])
dataset = dataset.map(train_model.deserialize).batch(1)

print("-->>> Start classifying events received on the topic %s" %
      TEST_STREAM_TOPIC_NAME)
for image, label in dataset:
    prediction = model.predict(image)

    if prediction.argmax() == label[0]:
        print("---->>>> ✓ Event classified correctly")
    else:
        print("---->>>> ✘ Event INCORRECTLY classified")
Example #23

encoder = tf.keras.layers.Dense(encoding_dim, activation="tanh", activity_regularizer=tf.keras.regularizers.l1(learning_rate))(input_layer)
encoder = tf.keras.layers.Dense(hidden_dim, activation="relu")(encoder)
decoder = tf.keras.layers.Dense(hidden_dim, activation='tanh')(encoder)
decoder = tf.keras.layers.Dense(input_dim, activation='relu')(decoder)
autoencoder = tf.keras.models.Model(inputs=input_layer, outputs=decoder)


autoencoder.compile(metrics=['accuracy'],
                    loss='mean_squared_error',
                    optimizer='adam')

# NOTE: KafkaDataset processing
def process_csv(entry):
  # "Time","V1","V2","V3","V4","V5","V6","V7","V8","V9","V10","V11","V12","V13","V14","V15","V16","V17","V18","V19","V20","V21","V22","V23","V24","V25","V26","V27","V28","Amount","Class"
  return tf.io.decode_csv(entry, [[0.0], *[[0.0] for i in range(28)], [0.0], [""]])

creditcard_dataset = kafka_io.KafkaDataset(['creditcard:0'], group='creditcard', eof=True).batch(batch_size).map(process_csv)

def process_x_y(*entry):
  return (tf.stack(entry[0:30], 1), tf.strings.to_number(entry[30], out_type=tf.int32))

train_dataset = creditcard_dataset.map(process_x_y)
print(train_dataset)

# NOTE: model.fit()
# NOTE: "Time" and "Amount" are not transformed with the following yet
# df_norm['Time'] = StandardScaler().fit_transform(df_norm['Time'].values.reshape(-1, 1))
# df_norm['Amount'] = StandardScaler().fit_transform(df_norm['Amount'].values.reshape(-1, 1))
# Runtime transformation of the above, may require all data available which may defeat the purpose of "streaming" data
autoencoder.fit(train_dataset, epochs=nb_epoch)
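
The autoencoder snippet assumes input_layer and several hyperparameters are defined earlier; the values below are illustrative assumptions only, included to make the excerpt self-contained:

import tensorflow as tf

input_dim = 30                 # 30 numeric CSV columns ("Time", V1..V28, "Amount")
encoding_dim = 14
hidden_dim = 7
learning_rate = 1e-5           # used above as the L1 activity-regularization factor
batch_size = 256
nb_epoch = 10
input_layer = tf.keras.layers.Input(shape=(input_dim,))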
Example #24

import tensorflow as tf
import tensorflow_io.kafka as kafka_io

with open('cardata-v1.avsc') as f:
    schema = f.read()

dataset = kafka_io.KafkaDataset(["cardata-v1:0"], group="cardata-v1", eof=True)

# remove kafka framing
dataset = dataset.map(lambda e: tf.strings.substr(e, 5, -1))

# deserialize avro
dataset = dataset.map(lambda e: kafka_io.decode_avro(
    e,
    schema=schema,
    dtype=[
        tf.float32, tf.float32, tf.float32, tf.float32, tf.float32, tf.float32,
        tf.float32, tf.float32, tf.float32, tf.int32, tf.int32, tf.int32,
        tf.int32, tf.float32, tf.float32, tf.float32, tf.float32, tf.int32
    ]))


def normalize_fn(coolant_temp, intake_air_temp, intake_air_flow_speed,
                 battery_percentage, battery_voltage, current_draw, speed,
                 engine_vibration_amplitude, throttle_pos, tire_pressure_1_1,
                 tire_pressure_1_2, tire_pressure_2_1, tire_pressure_2_2,
                 accelerometer_1_1_value, accelerometer_1_2_value,
                 accelerometer_2_1_value, accelerometer_2_2_value,
                 control_unit_firmware):

    tire_pressure_1_1 = tf.cast(tire_pressure_1_1, tf.float32)
Example #25
    def test_kafka_dataset_with_key(self):
        """Tests for KafkaDataset when reading keyed-messages from a single-partitioned topic"""
        tf.compat.v1.disable_eager_execution()

        import tensorflow_io.kafka as kafka_io  # pylint: disable=wrong-import-position

        topics = tf.compat.v1.placeholder(tf.dtypes.string, shape=[None])
        num_epochs = tf.compat.v1.placeholder(tf.dtypes.int64, shape=[])
        batch_size = tf.compat.v1.placeholder(tf.dtypes.int64, shape=[])

        repeat_dataset = kafka_io.KafkaDataset(
            topics, group="test", eof=True,
            message_key=True).repeat(num_epochs)
        batch_dataset = repeat_dataset.batch(batch_size)

        iterator = tf.compat.v1.data.Iterator.from_structure(
            batch_dataset.output_types)
        init_op = iterator.make_initializer(repeat_dataset)
        init_batch_op = iterator.make_initializer(batch_dataset)
        get_next = iterator.get_next()

        with self.cached_session() as sess:
            # Basic test: read a limited number of keyed messages from the topic.
            sess.run(init_op,
                     feed_dict={
                         topics: ["key-test:0:0:4"],
                         num_epochs: 1
                     })
            for i in range(5):
                self.assertEqual(
                    (("D" + str(i)).encode(), ("K" + str(i % 2)).encode()),
                    sess.run(get_next),
                )
            with self.assertRaises(tf.errors.OutOfRangeError):
                sess.run(get_next)

            # Basic test: read all the keyed messages from the topic from offset 5.
            sess.run(init_op,
                     feed_dict={
                         topics: ["key-test:0:5:-1"],
                         num_epochs: 1
                     })
            for i in range(5):
                self.assertEqual(
                    (("D" + str(i + 5)).encode(), ("K" + str(
                        (i + 5) % 2)).encode()),
                    sess.run(get_next),
                )
            with self.assertRaises(tf.errors.OutOfRangeError):
                sess.run(get_next)

            # Basic test: read from different subscriptions of the same topic.
            sess.run(
                init_op,
                feed_dict={
                    topics: ["key-test:0:0:4", "key-test:0:5:-1"],
                    num_epochs: 1,
                },
            )
            for j in range(2):
                for i in range(5):
                    self.assertEqual(
                        (
                            ("D" + str(i + j * 5)).encode(),
                            ("K" + str((i + j * 5) % 2)).encode(),
                        ),
                        sess.run(get_next),
                    )
            with self.assertRaises(tf.errors.OutOfRangeError):
                sess.run(get_next)

            # Test repeated iteration through both subscriptions.
            sess.run(
                init_op,
                feed_dict={
                    topics: ["key-test:0:0:4", "key-test:0:5:-1"],
                    num_epochs: 10,
                },
            )
            for _ in range(10):
                for j in range(2):
                    for i in range(5):
                        self.assertEqual(
                            (
                                ("D" + str(i + j * 5)).encode(),
                                ("K" + str((i + j * 5) % 2)).encode(),
                            ),
                            sess.run(get_next),
                        )
            with self.assertRaises(tf.errors.OutOfRangeError):
                sess.run(get_next)

            # Test batched and repeated iteration through both subscriptions.
            sess.run(
                init_batch_op,
                feed_dict={
                    topics: ["key-test:0:0:4", "key-test:0:5:-1"],
                    num_epochs: 10,
                    batch_size: 5,
                },
            )
            for _ in range(10):
                self.assertAllEqual(
                    [
                        [("D" + str(i)).encode() for i in range(5)],
                        [("K" + str(i % 2)).encode() for i in range(5)],
                    ],
                    sess.run(get_next),
                )
                self.assertAllEqual(
                    [
                        [("D" + str(i + 5)).encode() for i in range(5)],
                        [("K" + str((i + 5) % 2)).encode() for i in range(5)],
                    ],
                    sess.run(get_next),
                )