Example #1
def word_count(input_path, output_path):
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_runtime_mode(RuntimeExecutionMode.BATCH)
    # write all the data to one file
    env.set_parallelism(1)

    # define the source
    if input_path is not None:
        ds = env.from_source(
            source=FileSource.for_record_stream_format(StreamFormat.text_line_format(),
                                                       input_path)
                             .process_static_file_set().build(),
            watermark_strategy=WatermarkStrategy.for_monotonous_timestamps(),
            source_name="file_source"
        )
    else:
        print("Executing word_count example with default input data set.")
        print("Use --input to specify file input.")
        ds = env.from_collection(word_count_data)

    def split(line):
        yield from line.split()

    # compute word count
    ds = ds.flat_map(split) \
           .map(lambda i: (i, 1), output_type=Types.TUPLE([Types.STRING(), Types.INT()])) \
           .key_by(lambda i: i[0]) \
           .reduce(lambda i, j: (i[0], i[1] + j[1]))

    # define the sink
    if output_path is not None:
        ds.sink_to(
            sink=FileSink.for_row_format(
                base_path=output_path,
                encoder=Encoder.simple_string_encoder())
            .with_output_file_config(
                OutputFileConfig.builder()
                .with_part_prefix("prefix")
                .with_part_suffix(".ext")
                .build())
            .with_rolling_policy(RollingPolicy.default_rolling_policy())
            .build()
        )
    else:
        print("Printing result to stdout. Use --output to specify output path.")
        ds.print()

    # submit for execution
    env.execute()
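
The function above depends on imports and a word_count_data collection that live elsewhere in the original script. A minimal sketch of that surrounding boilerplate, assuming the PyFlink 1.16 module layout and using placeholder sample sentences, might look like this:

import argparse
import sys

from pyflink.common import WatermarkStrategy, Encoder, Types
from pyflink.datastream import StreamExecutionEnvironment, RuntimeExecutionMode
from pyflink.datastream.connectors.file_system import (FileSource, StreamFormat, FileSink,
                                                       OutputFileConfig, RollingPolicy)

# placeholder sample data; the original example ships its own sentences
word_count_data = ["To be, or not to be, that is the question",
                   "Whether 'tis nobler in the mind to suffer"]

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', dest='input', required=False, help='Input file to process.')
    parser.add_argument('--output', dest='output', required=False, help='Output file to write to.')
    known_args, _ = parser.parse_known_args(sys.argv[1:])
    word_count(known_args.input, known_args.output)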
Example #2
    def test_cassandra_sink(self):
        type_info = Types.ROW([Types.STRING(), Types.INT()])
        ds = self.env.from_collection([('ab', 1), ('bdc', 2), ('cfgs', 3), ('deeefg', 4)],
                                      type_info=type_info)
        cassandra_sink_builder = CassandraSink.add_sink(ds)

        cassandra_sink = cassandra_sink_builder\
            .set_host('localhost', 9876) \
            .set_query('query') \
            .enable_ignore_null_fields() \
            .set_mapper_options(MapperOptions()
                                .ttl(1)
                                .timestamp(100)
                                .tracing(True)
                                .if_not_exists(False)
                                .consistency_level(ConsistencyLevel.ANY)
                                .save_null_fields(True)) \
            .set_max_concurrent_requests(1000) \
            .build()

        cassandra_sink.name('cassandra_sink').set_parallelism(3)

        plan = eval(self.env.get_execution_plan())
        self.assertEqual("Sink: cassandra_sink", plan['nodes'][1]['type'])
        self.assertEqual(3, plan['nodes'][1]['parallelism'])
Example #3
    def setUp(self):
        super(OneHotEncoderTest, self).setUp()
        self.train_data = self.t_env.from_data_stream(
            self.env.from_collection([
                (0.0,),
                (1.0,),
                (2.0,),
                (0.0,),
            ],
                type_info=Types.ROW_NAMED(
                    ['input'],
                    [Types.DOUBLE()])))

        self.predict_data = self.t_env.from_data_stream(
            self.env.from_collection([
                (0.0,),
                (1.0,),
                (2.0,),
            ],
                type_info=Types.ROW_NAMED(
                    ['input'],
                    [Types.DOUBLE()])))
        self.expected_data = {
            0.0: Vectors.sparse(2, [0], [1.0]),
            1.0: Vectors.sparse(2, [1], [1.0]),
            2.0: Vectors.sparse(2, [], [])
        }

        self.estimator = OneHotEncoder().set_input_cols('input').set_output_cols('output')
Example #4
        def field_assertion(field_info, csv_value, value, field_delimiter):
            row_info = Types.ROW([Types.STRING(), field_info, Types.STRING()])
            expected_csv = "BEGIN" + field_delimiter + csv_value + field_delimiter + "END\n"
            j_row.setField(1, value)

            csv_row_serialization_schema = CsvRowSerializationSchema.Builder(row_info)\
                .set_escape_character('*').set_quote_character('\'')\
                .set_array_element_delimiter(':').set_field_delimiter(';').build()
            csv_row_deserialization_schema = CsvRowDeserializationSchema.Builder(row_info)\
                .set_escape_character('*').set_quote_character('\'')\
                .set_array_element_delimiter(':').set_field_delimiter(';').build()
            csv_row_serialization_schema._j_serialization_schema.open(
                jvm.org.apache.flink.connector.testutils.formats.
                DummyInitializationContext())
            csv_row_deserialization_schema._j_deserialization_schema.open(
                jvm.org.apache.flink.connector.testutils.formats.
                DummyInitializationContext())

            serialized_bytes = csv_row_serialization_schema._j_serialization_schema.serialize(
                j_row)
            self.assertEqual(expected_csv,
                             str(serialized_bytes, encoding='utf-8'))

            j_deserialized_row = csv_row_deserialization_schema._j_deserialization_schema\
                .deserialize(expected_csv.encode("utf-8"))
            self.assertTrue(j_row.equals(j_deserialized_row))
Example #5
def write_to_es7(env):
    ELASTICSEARCH_SQL_CONNECTOR_PATH = \
        'file:///path/to/flink-sql-connector-elasticsearch7-1.16.0.jar'
    env.add_jars(ELASTICSEARCH_SQL_CONNECTOR_PATH)

    ds = env.from_collection([{
        'name': 'ada',
        'id': '1'
    }, {
        'name': 'luna',
        'id': '2'
    }],
                             type_info=Types.MAP(Types.STRING(),
                                                 Types.STRING()))

    es7_sink = Elasticsearch7SinkBuilder() \
        .set_emitter(ElasticsearchEmitter.static_index('foo', 'id')) \
        .set_hosts(['localhost:9200']) \
        .set_delivery_guarantee(DeliveryGuarantee.AT_LEAST_ONCE) \
        .set_bulk_flush_max_actions(1) \
        .set_bulk_flush_max_size_mb(2) \
        .set_bulk_flush_interval(1000) \
        .set_bulk_flush_backoff_strategy(FlushBackoffType.CONSTANT, 3, 3000) \
        .set_connection_username('foo') \
        .set_connection_password('bar') \
        .set_connection_path_prefix('foo-bar') \
        .set_connection_request_timeout(30000) \
        .set_connection_timeout(31000) \
        .set_socket_timeout(32000) \
        .build()

    ds.sink_to(es7_sink).name('es7 sink')

    env.execute()
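
A possible set of imports and a small driver for write_to_es7, assuming the PyFlink 1.16 module layout; the connector jar path inside the function is a placeholder and has to point at a real file:

from pyflink.common import Types
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.connectors import DeliveryGuarantee
from pyflink.datastream.connectors.elasticsearch import (Elasticsearch7SinkBuilder,
                                                         ElasticsearchEmitter, FlushBackoffType)

if __name__ == '__main__':
    env = StreamExecutionEnvironment.get_execution_environment()
    write_to_es7(env)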
Example #6
def write_to_kafka(env):
    ds = env.from_collection([(1, 'hi'), (2, 'hello'), (3, 'hi'), (4, 'hello'),
                              (5, 'hi'), (6, 'hello'), (6, 'hello')],
                             type_info=Types.ROW([Types.INT(),
                                                  Types.STRING()]))

    serialization_schema = AvroRowSerializationSchema(avro_schema_string="""
            {
                "type": "record",
                "name": "TestRecord",
                "fields": [
                    {"name": "id", "type": "int"},
                    {"name": "name", "type": "string"}
                ]
            }""")

    kafka_producer = FlinkKafkaProducer(
        topic='test_avro_topic',
        serialization_schema=serialization_schema,
        producer_config={
            'bootstrap.servers': 'localhost:9092',
            'group.id': 'test_group'
        })

    # note that the output type of ds must be RowTypeInfo
    ds.add_sink(kafka_producer)
    env.execute()
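
A hedged sketch of the imports and a driver for this Avro-to-Kafka producer, assuming the PyFlink 1.16 layout where AvroRowSerializationSchema lives in pyflink.datastream.formats.avro; the Kafka and Avro connector jars still need to be added with a real path:

from pyflink.common import Types
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.connectors.kafka import FlinkKafkaProducer
from pyflink.datastream.formats.avro import AvroRowSerializationSchema

if __name__ == '__main__':
    env = StreamExecutionEnvironment.get_execution_environment()
    # placeholder path: the Kafka connector jar (and the Avro format jar) must be available
    env.add_jars("file:///path/to/flink-sql-connector-kafka-1.16.0.jar")
    write_to_kafka(env)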
Example #7
    def test_es_sink_dynamic(self):
        ds = self.env.from_collection([{
            'name': 'ada',
            'id': '1'
        }, {
            'name': 'luna',
            'id': '2'
        }],
                                      type_info=Types.MAP(
                                          Types.STRING(), Types.STRING()))

        es_dynamic_index_sink = Elasticsearch7SinkBuilder() \
            .set_emitter(ElasticsearchEmitter.dynamic_index('name', 'id')) \
            .set_hosts(['localhost:9200']) \
            .build()

        j_emitter = get_field_value(es_dynamic_index_sink.get_java_function(),
                                    'emitter')
        self.assertTrue(
            is_instance_of(
                j_emitter,
                'org.apache.flink.connector.elasticsearch.sink.MapElasticsearchEmitter'
            ))

        ds.sink_to(es_dynamic_index_sink).name('es dynamic index sink')
Example #8
    def setUp(self):
        super(MinMaxScalerTest, self).setUp()
        self.train_data = self.t_env.from_data_stream(
            self.env.from_collection([
                (Vectors.dense([0.0, 3.0]), ),
                (Vectors.dense([2.1, 0.0]), ),
                (Vectors.dense([4.1, 5.1]), ),
                (Vectors.dense([6.1, 8.1]), ),
                (Vectors.dense([200., 400.]), ),
            ],
                                     type_info=Types.ROW_NAMED(
                                         ['input'], [DenseVectorTypeInfo()])))

        self.predict_data = self.t_env.from_data_stream(
            self.env.from_collection([
                (Vectors.dense([150.0, 90.0]), ),
                (Vectors.dense([50.0, 40.0]), ),
                (Vectors.dense([100.0, 50.0]), ),
            ],
                                     type_info=Types.ROW_NAMED(
                                         ['input'], [DenseVectorTypeInfo()])))
        self.expected_data = [
            Vectors.dense(0.25, 0.1),
            Vectors.dense(0.5, 0.125),
            Vectors.dense(0.75, 0.225)
        ]
Example #9
def pandas_udaf():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)

    # define the source with watermark definition
    ds = env.from_collection(
        collection=[
            (Instant.of_epoch_milli(1000), 'Alice', 110.1),
            (Instant.of_epoch_milli(4000), 'Bob', 30.2),
            (Instant.of_epoch_milli(3000), 'Alice', 20.0),
            (Instant.of_epoch_milli(2000), 'Bob', 53.1),
            (Instant.of_epoch_milli(5000), 'Alice', 13.1),
            (Instant.of_epoch_milli(3000), 'Bob', 3.1),
            (Instant.of_epoch_milli(7000), 'Bob', 16.1),
            (Instant.of_epoch_milli(10000), 'Alice', 20.1)
        ],
        type_info=Types.ROW([Types.INSTANT(), Types.STRING(), Types.FLOAT()]))

    table = t_env.from_data_stream(
        ds,
        Schema.new_builder()
              .column_by_expression("ts", "CAST(f0 AS TIMESTAMP_LTZ(3))")
              .column("f1", DataTypes.STRING())
              .column("f2", DataTypes.FLOAT())
              .watermark("ts", "ts - INTERVAL '3' SECOND")
              .build()
    ).alias("ts, name, price")

    # define the sink
    t_env.create_temporary_table(
        'sink',
        TableDescriptor.for_connector('print')
                       .schema(Schema.new_builder()
                               .column('name', DataTypes.STRING())
                               .column('total_price', DataTypes.FLOAT())
                               .column('w_start', DataTypes.TIMESTAMP_LTZ())
                               .column('w_end', DataTypes.TIMESTAMP_LTZ())
                               .build())
                       .build())

    @udaf(result_type=DataTypes.FLOAT(), func_type="pandas")
    def mean_udaf(v):
        return v.mean()

    # define the tumble window operation
    table = table.window(Tumble.over(lit(5).seconds).on(col("ts")).alias("w")) \
                 .group_by(table.name, col('w')) \
                 .select(table.name, mean_udaf(table.price), col("w").start, col("w").end)

    # submit for execution
    table.execute_insert('sink') \
         .wait()
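
A sketch of the imports this example relies on, assuming the PyFlink 1.16 module layout; the pandas UDAF additionally requires pandas and pyarrow to be installed:

from pyflink.common import Types
from pyflink.common.time import Instant
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import StreamTableEnvironment, DataTypes, TableDescriptor, Schema
from pyflink.table.expressions import lit, col
from pyflink.table.udf import udaf
from pyflink.table.window import Tumble

if __name__ == '__main__':
    pandas_udaf()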
Example #10
def over_window_demo():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)

    # define the source with watermark definition
    ds = env.from_collection(
        collection=[
            (Instant.of_epoch_milli(1000), 'Alice', 110.1),
            (Instant.of_epoch_milli(4000), 'Bob', 30.2),
            (Instant.of_epoch_milli(3000), 'Alice', 20.0),
            (Instant.of_epoch_milli(2000), 'Bob', 53.1),
            (Instant.of_epoch_milli(5000), 'Alice', 13.1),
            (Instant.of_epoch_milli(3000), 'Bob', 3.1),
            (Instant.of_epoch_milli(7000), 'Bob', 16.1),
            (Instant.of_epoch_milli(10000), 'Alice', 20.1)
        ],
        type_info=Types.ROW([Types.INSTANT(), Types.STRING(), Types.FLOAT()]))

    table = t_env.from_data_stream(
        ds,
        Schema.new_builder()
              .column_by_expression("ts", "CAST(f0 AS TIMESTAMP(3))")
              .column("f1", DataTypes.STRING())
              .column("f2", DataTypes.FLOAT())
              .watermark("ts", "ts - INTERVAL '3' SECOND")
              .build()
    ).alias("ts", "name", "price")

    # define the sink
    t_env.create_temporary_table(
        'sink',
        TableDescriptor.for_connector('print')
                       .schema(Schema.new_builder()
                               .column('name', DataTypes.STRING())
                               .column('total_price', DataTypes.FLOAT())
                               .build())
                       .build())

    # define the over window operation
    table = table.over_window(
        Over.partition_by(col("name"))
            .order_by(col("ts"))
            .preceding(row_interval(2))
            .following(CURRENT_ROW)
            .alias('w')) \
        .select(table.name, table.price.max.over(col('w')))

    # submit for execution
    table.execute_insert('sink') \
         .wait()
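
As with the previous example, a hedged sketch of the imports, assuming the PyFlink 1.16 module layout:

from pyflink.common import Types
from pyflink.common.time import Instant
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import StreamTableEnvironment, DataTypes, TableDescriptor, Schema
from pyflink.table.expressions import col, row_interval, CURRENT_ROW
from pyflink.table.window import Over

if __name__ == '__main__':
    over_window_demo()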
Example #11
    def test_csv_row_serialization_schema(self):
        jvm = get_gateway().jvm
        JRow = jvm.org.apache.flink.types.Row

        j_row = JRow(3)
        j_row.setField(0, "BEGIN")
        j_row.setField(2, "END")

        def field_assertion(field_info, csv_value, value, field_delimiter):
            row_info = Types.ROW([Types.STRING(), field_info, Types.STRING()])
            expected_csv = "BEGIN" + field_delimiter + csv_value + field_delimiter + "END\n"
            j_row.setField(1, value)

            csv_row_serialization_schema = CsvRowSerializationSchema.Builder(row_info)\
                .set_escape_character('*').set_quote_character('\'')\
                .set_array_element_delimiter(':').set_field_delimiter(';').build()
            csv_row_deserialization_schema = CsvRowDeserializationSchema.Builder(row_info)\
                .set_escape_character('*').set_quote_character('\'')\
                .set_array_element_delimiter(':').set_field_delimiter(';').build()
            csv_row_serialization_schema._j_serialization_schema.open(
                jvm.org.apache.flink.connector.testutils.formats.
                DummyInitializationContext())
            csv_row_deserialization_schema._j_deserialization_schema.open(
                jvm.org.apache.flink.connector.testutils.formats.
                DummyInitializationContext())

            serialized_bytes = csv_row_serialization_schema._j_serialization_schema.serialize(
                j_row)
            self.assertEqual(expected_csv,
                             str(serialized_bytes, encoding='utf-8'))

            j_deserialized_row = csv_row_deserialization_schema._j_deserialization_schema\
                .deserialize(expected_csv.encode("utf-8"))
            self.assertTrue(j_row.equals(j_deserialized_row))

        field_assertion(Types.STRING(), "'123''4**'", "123'4*", ";")
        field_assertion(Types.STRING(), "'a;b''c'", "a;b'c", ";")
        field_assertion(Types.INT(), "12", 12, ";")

        test_j_row = JRow(2)
        test_j_row.setField(0, "1")
        test_j_row.setField(1, "hello")

        field_assertion(Types.ROW([Types.STRING(),
                                   Types.STRING()]), "'1:hello'", test_j_row,
                        ";")
        test_j_row.setField(1, "hello world")
        field_assertion(Types.ROW([Types.STRING(),
                                   Types.STRING()]), "'1:hello world'",
                        test_j_row, ";")
        field_assertion(Types.STRING(), "null", "null", ";")
Example #12
def read_from_kafka(env):
    deserialization_schema = JsonRowDeserializationSchema.Builder() \
        .type_info(Types.ROW([Types.INT(), Types.STRING()])) \
        .build()

    kafka_consumer = FlinkKafkaConsumer(
        topics='test_csv_topic',
        deserialization_schema=deserialization_schema,
        properties={'bootstrap.servers': 'localhost:9092', 'group.id': 'test_group_1'}
    )
    kafka_consumer.set_start_from_earliest()

    env.add_source(kafka_consumer).print()
    env.execute()
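
A possible surrounding module for read_from_kafka, assuming the PyFlink 1.16 layout where JsonRowDeserializationSchema lives in pyflink.datastream.formats.json; the jar path below is a placeholder:

from pyflink.common import Types
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.connectors.kafka import FlinkKafkaConsumer
from pyflink.datastream.formats.json import JsonRowDeserializationSchema

if __name__ == '__main__':
    env = StreamExecutionEnvironment.get_execution_environment()
    # placeholder path: the Kafka connector jar must be available to the job
    env.add_jars("file:///path/to/flink-sql-connector-kafka-1.16.0.jar")
    read_from_kafka(env)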
Example #13
    def test_fewer_distinct_points_than_cluster(self):
        input = self.t_env.from_data_stream(
            self.env.from_collection(
                [
                    (Vectors.dense([0.0, 0.1]), ),
                    (Vectors.dense([0.0, 0.1]), ),
                    (Vectors.dense([0.0, 0.1]), ),
                ],
                type_info=Types.ROW_NAMED(['features'],
                                          [DenseVectorTypeInfo()])))

        kmeans = KMeans().set_k(2)
        model = kmeans.fit(input)
        output = model.transform(input)[0]
        results = [
            result for result in self.t_env.to_data_stream(
                output).execute_and_collect()
        ]
        field_names = output.get_schema().get_field_names()
        actual_groups = group_features_by_prediction(
            results, field_names.index(kmeans.features_col),
            field_names.index(kmeans.prediction_col))

        expected_groups = [{DenseVector([0.0, 0.1])}]

        self.assertEqual(actual_groups, expected_groups)
Example #14
    def _build_csv_job(self, schema):
        source = FileSource.for_record_stream_format(
            CsvReaderFormat.for_schema(schema), self.csv_file_name).build()
        ds = self.env.from_source(source, WatermarkStrategy.no_watermarks(),
                                  'csv-source')
        ds.map(PassThroughMapFunction(), output_type=Types.PICKLED_BYTE_ARRAY()) \
            .add_sink(self.test_sink)
Example #15
    def setUp(self):
        super(StandardScalerTest, self).setUp()
        self.dense_input = self.t_env.from_data_stream(
            self.env.from_collection([
                (Vectors.dense(-2.5, 9.0, 1.0), ),
                (Vectors.dense(1.4, -5.0, 1.0), ),
                (Vectors.dense(2.0, -1.0, -2.0), ),
            ],
                                     type_info=Types.ROW_NAMED(
                                         ['input'], [DenseVectorTypeInfo()])))

        self.expected_res_with_mean = [
            Vectors.dense(-2.8, 8.0, 1.0),
            Vectors.dense(1.1, -6.0, 1.0),
            Vectors.dense(1.7, -2.0, -2.0)
        ]

        self.expected_res_with_std = [
            Vectors.dense(-1.0231819, 1.2480754, 0.5773502),
            Vectors.dense(0.5729819, -0.6933752, 0.5773503),
            Vectors.dense(0.8185455, -0.1386750, -1.1547005)
        ]

        self.expected_res_with_mean_and_std = [
            Vectors.dense(-1.1459637, 1.1094004, 0.5773503),
            Vectors.dense(0.45020003, -0.8320503, 0.5773503),
            Vectors.dense(0.69576368, -0.2773501, -1.1547005)
        ]

        self.expected_mean = [0.3, 1.0, 0.0]
        self.expected_std = [2.4433583, 7.2111026, 1.7320508]
Example #16
def write_to_kafka(env):
    type_info = Types.ROW([Types.INT(), Types.STRING()])
    ds = env.from_collection([
        (1, 'hi'), (2, 'hello'), (3, 'hi'), (4, 'hello'), (5, 'hi'), (6, 'hello'), (6, 'hello')],
        type_info=type_info)

    serialization_schema = CsvRowSerializationSchema.Builder(type_info).build()
    kafka_producer = FlinkKafkaProducer(
        topic='test_csv_topic',
        serialization_schema=serialization_schema,
        producer_config={'bootstrap.servers': 'localhost:9092', 'group.id': 'test_group'}
    )

    # note that the output type of ds must be RowTypeInfo
    ds.add_sink(kafka_producer)
    env.execute()
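
A hedged sketch of the imports and a driver, assuming the PyFlink 1.16 layout where CsvRowSerializationSchema lives in pyflink.datastream.formats.csv; the jar path below is a placeholder:

from pyflink.common import Types
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.connectors.kafka import FlinkKafkaProducer
from pyflink.datastream.formats.csv import CsvRowSerializationSchema

if __name__ == '__main__':
    env = StreamExecutionEnvironment.get_execution_environment()
    # placeholder path: the Kafka connector jar must be available to the job
    env.add_jars("file:///path/to/flink-sql-connector-kafka-1.16.0.jar")
    write_to_kafka(env)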
Example #17
def basic_operations():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)

    # define the source
    ds = env.from_collection(collection=[
        (1,
         '{"name": "Flink", "tel": 123, "addr": {"country": "Germany", "city": "Berlin"}}'
         ),
        (2,
         '{"name": "hello", "tel": 135, "addr": {"country": "China", "city": "Shanghai"}}'
         ),
        (3,
         '{"name": "world", "tel": 124, "addr": {"country": "USA", "city": "NewYork"}}'
         ),
        (4,
         '{"name": "PyFlink", "tel": 32, "addr": {"country": "China", "city": "Hangzhou"}}'
         )
    ],
                             type_info=Types.ROW_NAMED(
                                 ["id", "info"],
                                 [Types.INT(), Types.STRING()]))

    # map
    def update_tel(data):
        # parse the json
        json_data = json.loads(data.info)
        json_data['tel'] += 1
        return data.id, json.dumps(json_data)

    show(ds.map(update_tel), env)
    # (1, '{"name": "Flink", "tel": 124, "addr": {"country": "Germany", "city": "Berlin"}}')
    # (2, '{"name": "hello", "tel": 136, "addr": {"country": "China", "city": "Shanghai"}}')
    # (3, '{"name": "world", "tel": 125, "addr": {"country": "USA", "city": "NewYork"}}')
    # (4, '{"name": "PyFlink", "tel": 33, "addr": {"country": "China", "city": "Hangzhou"}}')

    # filter
    show(ds.filter(lambda data: data.id == 1).map(update_tel), env)
    # (1, '{"name": "Flink", "tel": 124, "addr": {"country": "Germany", "city": "Berlin"}}')

    # key by
    show(
        ds.map(lambda data: (json.loads(data.info)['addr']['country'],
                             json.loads(data.info)['tel'])).key_by(
                                 lambda data: data[0]).sum(1), env)
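
The example calls a show helper that is defined elsewhere in the original script. A plausible stand-in, assuming show simply prints the stream and triggers execution, together with the imports the example needs:

import json

from pyflink.common import Types
from pyflink.datastream import StreamExecutionEnvironment


def show(ds, env):
    # plausible stand-in for the helper used above: print the stream and run the job
    ds.print()
    env.execute()


if __name__ == '__main__':
    basic_operations()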
Example #18
    def test_jdbc_sink(self):
        ds = self.env.from_collection([('ab', 1), ('bdc', 2), ('cfgs', 3),
                                       ('deeefg', 4)],
                                      type_info=Types.ROW(
                                          [Types.STRING(),
                                           Types.INT()]))
        jdbc_connection_options = JdbcConnectionOptions.JdbcConnectionOptionsBuilder()\
            .with_driver_name('com.mysql.jdbc.Driver')\
            .with_user_name('root')\
            .with_password('password')\
            .with_url('jdbc:mysql://server-name:server-port/database-name').build()

        jdbc_execution_options = JdbcExecutionOptions.builder().with_batch_interval_ms(2000)\
            .with_batch_size(100).with_max_retries(5).build()
        jdbc_sink = JdbcSink.sink("insert into test table", ds.get_type(),
                                  jdbc_connection_options,
                                  jdbc_execution_options)

        ds.add_sink(jdbc_sink).name('jdbc sink')
        plan = eval(self.env.get_execution_plan())
        self.assertEqual('Sink: jdbc sink', plan['nodes'][1]['type'])
        j_output_format = get_field_value(jdbc_sink.get_java_function(),
                                          'outputFormat')

        connection_options = JdbcConnectionOptions(
            get_field_value(
                get_field_value(j_output_format, 'connectionProvider'),
                'jdbcOptions'))
        self.assertEqual(jdbc_connection_options.get_db_url(),
                         connection_options.get_db_url())
        self.assertEqual(jdbc_connection_options.get_driver_name(),
                         connection_options.get_driver_name())
        self.assertEqual(jdbc_connection_options.get_password(),
                         connection_options.get_password())
        self.assertEqual(jdbc_connection_options.get_user_name(),
                         connection_options.get_user_name())

        exec_options = JdbcExecutionOptions(
            get_field_value(j_output_format, 'executionOptions'))
        self.assertEqual(jdbc_execution_options.get_batch_interval_ms(),
                         exec_options.get_batch_interval_ms())
        self.assertEqual(jdbc_execution_options.get_batch_size(),
                         exec_options.get_batch_size())
        self.assertEqual(jdbc_execution_options.get_max_retries(),
                         exec_options.get_max_retries())
Example #19
    def test_pulsar_source(self):
        TEST_OPTION_NAME = 'pulsar.source.enableAutoAcknowledgeMessage'
        pulsar_source = PulsarSource.builder() \
            .set_service_url('pulsar://localhost:6650') \
            .set_admin_url('http://localhost:8080') \
            .set_topics('ada') \
            .set_start_cursor(StartCursor.earliest()) \
            .set_unbounded_stop_cursor(StopCursor.never()) \
            .set_bounded_stop_cursor(StopCursor.at_publish_time(22)) \
            .set_subscription_name('ff') \
            .set_subscription_type(SubscriptionType.Exclusive) \
            .set_deserialization_schema(
                PulsarDeserializationSchema.flink_type_info(Types.STRING())) \
            .set_deserialization_schema(
                PulsarDeserializationSchema.flink_schema(SimpleStringSchema())) \
            .set_config(TEST_OPTION_NAME, True) \
            .set_properties({'pulsar.source.autoCommitCursorInterval': '1000'}) \
            .build()

        ds = self.env.from_source(
            source=pulsar_source,
            watermark_strategy=WatermarkStrategy.for_monotonous_timestamps(),
            source_name="pulsar source")
        ds.print()
        plan = eval(self.env.get_execution_plan())
        self.assertEqual('Source: pulsar source', plan['nodes'][0]['type'])

        configuration = get_field_value(pulsar_source.get_java_function(),
                                        "sourceConfiguration")
        self.assertEqual(
            configuration.getString(
                ConfigOptions.key('pulsar.client.serviceUrl').string_type().
                no_default_value()._j_config_option),
            'pulsar://localhost:6650')
        self.assertEqual(
            configuration.getString(
                ConfigOptions.key('pulsar.admin.adminUrl').string_type().
                no_default_value()._j_config_option), 'http://localhost:8080')
        self.assertEqual(
            configuration.getString(
                ConfigOptions.key('pulsar.consumer.subscriptionName').
                string_type().no_default_value()._j_config_option), 'ff')
        self.assertEqual(
            configuration.getString(
                ConfigOptions.key('pulsar.consumer.subscriptionType').
                string_type().no_default_value()._j_config_option),
            SubscriptionType.Exclusive.name)
        test_option = ConfigOptions.key(
            TEST_OPTION_NAME).boolean_type().no_default_value()
        self.assertEqual(
            configuration.getBoolean(test_option._j_config_option), True)
        self.assertEqual(
            configuration.getLong(
                ConfigOptions.key('pulsar.source.autoCommitCursorInterval').
                long_type().no_default_value()._j_config_option), 1000)
Example #20
    def setUp(self):
        super(LogisticRegressionTest, self).setUp()
        self.binomial_data_table = self.t_env.from_data_stream(
            self.env.from_collection(
                [
                    (Vectors.dense([1, 2, 3, 4]), 0., 1.),
                    (Vectors.dense([2, 2, 3, 4]), 0., 2.),
                    (Vectors.dense([3, 2, 3, 4]), 0., 3.),
                    (Vectors.dense([4, 2, 3, 4]), 0., 4.),
                    (Vectors.dense([5, 2, 3, 4]), 0., 5.),
                    (Vectors.dense([11, 2, 3, 4]), 1., 1.),
                    (Vectors.dense([12, 2, 3, 4]), 1., 2.),
                    (Vectors.dense([13, 2, 3, 4]), 1., 3.),
                    (Vectors.dense([14, 2, 3, 4]), 1., 4.),
                    (Vectors.dense([15, 2, 3, 4]), 1., 5.),
                ],
                type_info=Types.ROW_NAMED(
                    ['features', 'label', 'weight'],
                    [DenseVectorTypeInfo(),
                     Types.DOUBLE(),
                     Types.DOUBLE()])))
Example #21
    def setUp(self):
        super(StringIndexerTest, self).setUp()
        self.train_table = self.t_env.from_data_stream(
            self.env.from_collection([
                ('a', 1.0),
                ('b', 1.0),
                ('b', 2.0),
                ('c', 0.0),
                ('d', 2.0),
                ('a', 2.0),
                ('b', 2.0),
                ('b', -1.0),
                ('a', -1.0),
                ('c', -1.0),
            ],
                type_info=Types.ROW_NAMED(
                    ['input_col1', 'input_col2'],
                    [Types.STRING(), Types.DOUBLE()])))

        self.predict_table = self.t_env.from_data_stream(
            self.env.from_collection([
                ('a', 2.0),
                ('b', 1.0),
                ('e', 2.0),
            ],
                type_info=Types.ROW_NAMED(
                    ['input_col1', 'input_col2'],
                    [Types.STRING(), Types.DOUBLE()])))

        self.expected_alphabetic_asc_predict_data = [
            Row('a', 2.0, 0, 3),
            Row('b', 1.0, 1, 2),
            Row('e', 2.0, 4, 3)
        ]
Example #22
    def setUp(self):
        super(VectorAssemblerTest, self).setUp()
        self.input_data_table = self.t_env.from_data_stream(
            self.env.from_collection([
                (0, Vectors.dense(2.1, 3.1), 1.0, Vectors.sparse(
                    5, [3], [1.0])),
                (1, Vectors.dense(2.1, 3.1), 1.0,
                 Vectors.sparse(5, [1, 2, 3, 4], [1.0, 2.0, 3.0, 4.0])),
                (2, None, None, None),
            ],
                                     type_info=Types.ROW_NAMED(
                                         ['id', 'vec', 'num', 'sparse_vec'], [
                                             Types.INT(),
                                             DenseVectorTypeInfo(),
                                             Types.DOUBLE(),
                                             SparseVectorTypeInfo()
                                         ])))

        self.expected_output_data_1 = Vectors.sparse(8, [0, 1, 2, 6],
                                                     [2.1, 3.1, 1.0, 1.0])
        self.expected_output_data_2 = Vectors.dense(2.1, 3.1, 1.0, 0.0, 1.0,
                                                    2.0, 3.0, 4.0)
Example #23
    def setUp(self):
        super(NaiveBayesTest, self).setUp()
        self.env.set_parallelism(1)
        self.train_data = self.t_env.from_data_stream(
            self.env.from_collection(
                [
                    (Vectors.dense([0, 0.]), 11.),
                    (Vectors.dense([1, 0]), 10.),
                    (Vectors.dense([1, 1.]), 10.),
                ],
                type_info=Types.ROW_NAMED(
                    ['features', 'label'],
                    [DenseVectorTypeInfo(),
                     Types.DOUBLE()])))

        self.predict_data = self.t_env.from_data_stream(
            self.env.from_collection(
                [
                    (Vectors.dense([0, 1.]), ),
                    (Vectors.dense([0, 0.]), ),
                    (Vectors.dense([1, 0]), ),
                    (Vectors.dense([1, 1.]), ),
                ],
                type_info=Types.ROW_NAMED(['features'],
                                          [DenseVectorTypeInfo()])))

        self.expected_output = {
            Vectors.dense([0, 1.]): 11.,
            Vectors.dense([0, 0.]): 11.,
            Vectors.dense([1, 0.]): 10.,
            Vectors.dense([1, 1.]): 10.,
        }

        self.estimator = NaiveBayes() \
            .set_smoothing(1.0) \
            .set_features_col('features') \
            .set_label_col('label') \
            .set_prediction_col('prediction') \
            .set_model_type('multinomial')  # type: NaiveBayes
Example #24
    def setUp(self):
        super(KNNTest, self).setUp()
        self.train_data = self.t_env.from_data_stream(
            self.env.from_collection([
                (Vectors.dense([2.0, 3.0]), 1.0),
                (Vectors.dense([2.1, 3.1]), 1.0),
                (Vectors.dense([200.1, 300.1]), 2.0),
                (Vectors.dense([200.2, 300.2]), 2.0),
                (Vectors.dense([200.3, 300.3]), 2.0),
                (Vectors.dense([200.4, 300.4]), 2.0),
                (Vectors.dense([200.4, 300.4]), 2.0),
                (Vectors.dense([200.6, 300.6]), 2.0),
                (Vectors.dense([2.1, 3.1]), 1.0),
                (Vectors.dense([2.1, 3.1]), 1.0),
                (Vectors.dense([2.1, 3.1]), 1.0),
                (Vectors.dense([2.1, 3.1]), 1.0),
                (Vectors.dense([2.3, 3.2]), 1.0),
                (Vectors.dense([2.3, 3.2]), 1.0),
                (Vectors.dense([2.8, 3.2]), 3.0),
                (Vectors.dense([300., 3.2]), 4.0),
                (Vectors.dense([2.2, 3.2]), 1.0),
                (Vectors.dense([2.4, 3.2]), 5.0),
                (Vectors.dense([2.5, 3.2]), 5.0),
                (Vectors.dense([2.5, 3.2]), 5.0),
                (Vectors.dense([2.1, 3.1]), 1.0)
            ],
                type_info=Types.ROW_NAMED(
                    ['features', 'label'],
                    [DenseVectorTypeInfo(), Types.DOUBLE()])))

        self.predict_data = self.t_env.from_data_stream(
            self.env.from_collection([
                (Vectors.dense([4.0, 4.1]), 5.0),
                (Vectors.dense([300, 42]), 2.0),
            ],
                type_info=Types.ROW_NAMED(
                    ['features', 'label'],
                    [DenseVectorTypeInfo(), Types.DOUBLE()])))
Example #25
    def test_rabbitmq_connectors(self):
        connection_config = RMQConnectionConfig.Builder() \
            .set_host('localhost') \
            .set_port(5672) \
            .set_virtual_host('/') \
            .set_user_name('guest') \
            .set_password('guest') \
            .build()
        type_info = Types.ROW([Types.INT(), Types.STRING()])
        deserialization_schema = JsonRowDeserializationSchema.builder() \
            .type_info(type_info=type_info).build()

        rmq_source = RMQSource(
            connection_config, 'source_queue', True, deserialization_schema)
        self.assertEqual(
            get_field_value(rmq_source.get_java_function(), 'queueName'), 'source_queue')
        self.assertTrue(get_field_value(rmq_source.get_java_function(), 'usesCorrelationId'))

        serialization_schema = JsonRowSerializationSchema.builder().with_type_info(type_info) \
            .build()
        rmq_sink = RMQSink(connection_config, 'sink_queue', serialization_schema)
        self.assertEqual(
            get_field_value(rmq_sink.get_java_function(), 'queueName'), 'sink_queue')
Example #26
def write_to_es6_dynamic_index(env):
    ELASTICSEARCH_SQL_CONNECTOR_PATH = \
        'file:///path/to/flink-sql-connector-elasticsearch6-1.16.0.jar'
    env.add_jars(ELASTICSEARCH_SQL_CONNECTOR_PATH)

    ds = env.from_collection([{
        'name': 'ada',
        'id': '1'
    }, {
        'name': 'luna',
        'id': '2'
    }],
                             type_info=Types.MAP(Types.STRING(),
                                                 Types.STRING()))

    es_sink = Elasticsearch6SinkBuilder() \
        .set_emitter(ElasticsearchEmitter.dynamic_index('name', 'id', 'bar')) \
        .set_hosts(['localhost:9200']) \
        .build()

    ds.sink_to(es_sink).name('es6 dynamic index sink')

    env.execute()
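
A possible set of imports and a driver for write_to_es6_dynamic_index, assuming the PyFlink 1.16 module layout; the connector jar path inside the function is a placeholder:

from pyflink.common import Types
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.connectors.elasticsearch import Elasticsearch6SinkBuilder, ElasticsearchEmitter

if __name__ == '__main__':
    env = StreamExecutionEnvironment.get_execution_environment()
    write_to_es6_dynamic_index(env)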
Example #27
    def test_max_value_equals_min_value_but_predict_value_not_equals(self):
        train_data = self.t_env.from_data_stream(
            self.env.from_collection([
                (Vectors.dense([40.0, 80.0]), ),
            ],
                                     type_info=Types.ROW_NAMED(
                                         ['input'], [DenseVectorTypeInfo()])))

        predict_data = self.t_env.from_data_stream(
            self.env.from_collection([
                (Vectors.dense([30.0, 50.0]), ),
            ],
                                     type_info=Types.ROW_NAMED(
                                         ['input'], [DenseVectorTypeInfo()])))

        min_max_scaler = MinMaxScaler() \
            .set_min(0.0) \
            .set_max(10.0)

        model = min_max_scaler.fit(train_data)
        result = model.transform(predict_data)[0]
        self.verify_output_result(result, min_max_scaler.get_output_col(),
                                  result.get_schema().get_field_names(),
                                  [Vectors.dense(5.0, 5.0)])
Example #28
def mixing_use_of_datastream_and_table():
    # use StreamTableEnvironment instead of TableEnvironment when mixing use of table & datastream
    env = StreamExecutionEnvironment.get_execution_environment()
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)

    # define the source
    t_env.create_temporary_table(
        'source',
        TableDescriptor.for_connector('datagen').schema(
            Schema.new_builder().column('id', DataTypes.BIGINT()).column(
                'data',
                DataTypes.STRING()).build()).option("number-of-rows",
                                                    "10").build())

    # define the sink
    t_env.create_temporary_table(
        'sink',
        TableDescriptor.for_connector('print').schema(
            Schema.new_builder().column('a',
                                        DataTypes.BIGINT()).build()).build())

    @udf(result_type=DataTypes.BIGINT())
    def length(data):
        return len(data)

    # perform table api operations
    table = t_env.from_path("source")
    table = table.select(col('id'), length(col('data')))

    # convert table to datastream and perform datastream api operations
    ds = t_env.to_data_stream(table)
    ds = ds.map(lambda i: i[0] + i[1], output_type=Types.LONG())

    # convert datastream to table and perform table api operations as you want
    table = t_env.from_data_stream(
        ds,
        Schema.new_builder().column("f0", DataTypes.BIGINT()).build())

    # execute
    table.execute_insert('sink') \
         .wait()
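
A sketch of the imports this example relies on, assuming the PyFlink 1.16 module layout:

from pyflink.common import Types
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import StreamTableEnvironment, DataTypes, TableDescriptor, Schema
from pyflink.table.expressions import col
from pyflink.table.udf import udf

if __name__ == '__main__':
    mixing_use_of_datastream_and_table()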
Example #29
    def test_source_deprecated_method(self):
        test_option = ConfigOptions.key('pulsar.source.enableAutoAcknowledgeMessage') \
            .boolean_type().no_default_value()
        pulsar_source = PulsarSource.builder() \
            .set_service_url('pulsar://localhost:6650') \
            .set_admin_url('http://localhost:8080') \
            .set_topic_pattern('ada.*') \
            .set_deserialization_schema(
                PulsarDeserializationSchema.flink_type_info(Types.STRING())) \
            .set_unbounded_stop_cursor(StopCursor.at_publish_time(4444)) \
            .set_subscription_name('ff') \
            .set_config(test_option, True) \
            .set_properties({'pulsar.source.autoCommitCursorInterval': '1000'}) \
            .build()
        configuration = get_field_value(pulsar_source.get_java_function(),
                                        "sourceConfiguration")
        self.assertEqual(
            configuration.getBoolean(test_option._j_config_option), True)
        self.assertEqual(
            configuration.getLong(
                ConfigOptions.key('pulsar.source.autoCommitCursorInterval').
                long_type().no_default_value()._j_config_option), 1000)
Example #30
    def setUp(self):
        super(KMeansTest, self).setUp()
        self.data_table = self.t_env.from_data_stream(
            self.env.from_collection(
                [
                    (Vectors.dense([0.0, 0.0]), ),
                    (Vectors.dense([0.0, 0.3]), ),
                    (Vectors.dense([0.3, 3.0]), ),
                    (Vectors.dense([9.0, 0.0]), ),
                    (Vectors.dense([9.0, 0.6]), ),
                    (Vectors.dense([9.6, 0.0]), ),
                ],
                type_info=Types.ROW_NAMED(['features'],
                                          [DenseVectorTypeInfo()])))
        self.expected_groups = [{
            DenseVector([0.0, 0.3]),
            DenseVector([0.3, 3.0]),
            DenseVector([0.0, 0.0])
        }, {
            DenseVector([9.6, 0.0]),
            DenseVector([9.0, 0.0]),
            DenseVector([9.0, 0.6])
        }]