Exemplo n.º 1
0
    def __init__(self,
                 config,
                 default_key_schema=None,
                 default_value_schema=None,
                 schema_registry=None):
        """Set up the producer together with its Avro message serializer.

        Entries of ``config`` whose key starts with ``schema.registry`` go to
        the schema registry client (with the ``schema.registry.`` prefix
        removed); all remaining entries are passed to the parent constructor.
        ``config`` itself is not modified.

        :param dict config: combined producer and schema registry settings.
        :param default_key_schema: stored as the default key schema.
        :param default_value_schema: stored as the default value schema.
        :param schema_registry: optional ready-made registry client; when it
            is None a ``CachedSchemaRegistryClient`` is built from the
            registry settings.
        :raises ValueError: if ``schema_registry`` is passed together with a
            ``schema.registry.url`` setting.
        """
        registry_settings = {}
        producer_settings = {}
        for name, setting in config.items():
            if name.startswith("schema.registry"):
                registry_settings[name.replace("schema.registry.", "")] = setting
            else:
                producer_settings[name] = setting

        if registry_settings.get("basic.auth.credentials.source") == 'SASL_INHERIT':
            # Copy the sasl.* entries of the full config into the registry
            # settings, defaulting each one to an empty string.
            for sasl_key in ('sasl.mechanisms', 'sasl.username', 'sasl.password'):
                registry_settings[sasl_key] = config.get(sasl_key, '')

        if schema_registry is not None:
            if registry_settings.get("url") is not None:
                raise ValueError(
                    "Cannot pass schema_registry along with schema.registry.url config"
                )
        else:
            schema_registry = CachedSchemaRegistryClient(registry_settings)

        super(AvroProducer, self).__init__(producer_settings)
        self._serializer = MessageSerializer(schema_registry)
        self._key_schema = default_key_schema
        self._value_schema = default_value_schema
Exemplo n.º 2
0
 def setUp(self):
     """Prepare the mock registry client and the serializer under test."""
     # RecordSchema and PrimitiveSchema get a __hash__ so they are hashable
     for schema_cls in (schema.RecordSchema, schema.PrimitiveSchema):
         schema_cls.__hash__ = self.hash_func
     mock_client = MockSchemaRegistryClient()
     self.client = mock_client
     self.ms = MessageSerializer(mock_client)
Exemplo n.º 3
0
    def __init__(self,
                 config,
                 default_key_schema=None,
                 default_value_schema=None,
                 schema_registry=None,
                 subject_name_strategy=SubjectNameStrategy.RecordNameStrategy
                 ):
        """Set up the producer together with its Avro message serializer.

        Entries of ``config`` whose key starts with ``schema.registry`` go to
        the schema registry client (with the ``schema.registry.`` prefix
        removed); all remaining entries are passed to the parent constructor.
        ``config`` itself is not modified.

        :param dict config: combined producer and schema registry settings.
        :param default_key_schema: stored as the default key schema.
        :param default_value_schema: stored as the default value schema.
        :param schema_registry: optional ready-made registry client; when it
            is None a ``CachedSchemaRegistryClient`` is built from the
            registry settings.
        :param subject_name_strategy: forwarded to ``MessageSerializer``.
        :raises ValueError: if ``schema_registry`` is passed together with a
            ``schema.registry.url`` setting.
        """
        registry_settings = {}
        producer_settings = {}
        for name, setting in config.items():
            if name.startswith("schema.registry"):
                registry_settings[name.replace("schema.registry.", "")] = setting
            else:
                producer_settings[name] = setting

        if registry_settings.get("basic.auth.credentials.source") == 'SASL_INHERIT':
            # Prefer 'sasl.mechanism'; the plural 'sasl.mechanisms' is the
            # backward-compatible fallback.
            legacy_mechanism = config.get('sasl.mechanisms', '')
            registry_settings['sasl.mechanism'] = config.get('sasl.mechanism', legacy_mechanism)
            for sasl_key in ('sasl.username', 'sasl.password'):
                registry_settings[sasl_key] = config.get(sasl_key, '')
            registry_settings['auto.register.schemas'] = config.get('auto.register.schemas', True)

        if schema_registry is not None:
            if registry_settings.get("url") is not None:
                raise ValueError("Cannot pass schema_registry along with schema.registry.url config")
        else:
            schema_registry = CachedSchemaRegistryClient(registry_settings)

        super(AvroProducer, self).__init__(producer_settings)
        self._serializer = MessageSerializer(schema_registry, subject_name_strategy=subject_name_strategy)
        self._key_schema = default_key_schema
        self._value_schema = default_value_schema
Exemplo n.º 4
0
    def __init__(self,
                 config,
                 default_key_schema=None,
                 default_value_schema=None,
                 schema_registry=None):
        """Set up the producer together with its Avro message serializer.

        The ``schema.registry.url`` and ``schema.registry.ssl.*.location``
        entries are removed from ``config`` in place; what is left of
        ``config`` is passed to the parent constructor.

        :param dict config: producer settings plus schema registry entries.
        :param default_key_schema: stored as the default key schema.
        :param default_value_schema: stored as the default value schema.
        :param schema_registry: optional ready-made registry client.
        :raises ValueError: if neither ``schema_registry`` nor a
            ``schema.registry.url`` entry is given, or if both are given.
        """
        registry_url = config.pop("schema.registry.url", None)
        ca_path = config.pop("schema.registry.ssl.ca.location", None)
        cert_path = config.pop("schema.registry.ssl.certificate.location", None)
        key_path = config.pop("schema.registry.ssl.key.location", None)

        if schema_registry is not None:
            # A ready-made client and a URL are mutually exclusive.
            if registry_url is not None:
                raise ValueError(
                    "Cannot pass schema_registry along with schema.registry.url config"
                )
        elif registry_url is None:
            raise ValueError("Missing parameter: schema.registry.url")
        else:
            schema_registry = CachedSchemaRegistryClient(
                url=registry_url,
                ca_location=ca_path,
                cert_location=cert_path,
                key_location=key_path)

        super(AvroProducer, self).__init__(config)
        self._serializer = MessageSerializer(schema_registry)
        self._key_schema = default_key_schema
        self._value_schema = default_value_schema
def produce(config, topic, input_messages):
    """
    Serialize ``input_messages`` with the latest value schema registered for
    ``topic`` and send them to Kafka, then exit the process.

    Parameters
    ----------
        config: dict
            the config values handed to ``producer_config``
        topic: str
            topic the messages are published to
        input_messages: dict
            key/value messages; entries whose key fails ``validate_uuid4``
            are logged and skipped

    Raises
    ------
        ValueError
            if ``topic`` is None or ``input_messages`` is empty
        RuntimeError
            if the cluster reports no topics
        SystemExit
            always, with status 1, once processing has finished
    """
    if topic is None:
        logger.debug('Required topic field must be set')
        raise ValueError('Required topic field must be set')

    if len(input_messages) <= 0:
        logger.debug('Required data field must not be empty.')
        raise ValueError('Required data field must not be empty.')

    bootstrap_servers, schema_registry = producer_config(config)

    producer = Producer(bootstrap_servers)
    admin_client = AdminClient(bootstrap_servers)
    topics = admin_client.list_topics().topics
    # Just to show what's available
    print(topics)

    if not topics:
        print('Not Topics')
        raise RuntimeError()

    sr = CachedSchemaRegistryClient(schema_registry)
    ser = MessageSerializer(sr)
    # Only the schema itself is needed; id and version are ignored.
    _schema_id, schema, _version = sr.get_latest_schema(topic + "-value")
    if schema:
        print('In If Schema')
        for key, value in input_messages.items():
            if validate_uuid4(key):
                print('In validate in For loop')
                serialized_message = ser.encode_record_with_schema(
                    topic, schema, value)
                producer.produce(topic=topic,
                                 key=key,
                                 value=serialized_message,
                                 callback=acked)
                # Serve delivery callbacks without flushing per message,
                # which would limit throughput to the broker round trip time.
                producer.poll(1)
            else:
                print('In Else of For Loop')
                # A %s placeholder is required; passing the key without one
                # makes the logging call itself fail to format.
                logger.error('Invalid UUID String: %s', key)

        # Wait for queued messages to be delivered before the process exits.
        producer.flush()
    else:
        print('Schema not found for topic name: ', topic)
        print('In Else Schema')
    # NOTE(review): exits with status 1 on the success path as well; this may
    # have been meant for the schema-not-found branch only - confirm.
    sys.exit(1)
Exemplo n.º 6
0
    def __init__(self, config):
        """Set up the consumer and its Avro message serializer.

        ``schema.registry.url`` is removed from ``config`` in place and used
        to build the registry client; the rest of ``config`` is passed to the
        parent constructor.

        :param dict config: consumer settings including ``schema.registry.url``.
        :raises ValueError: if ``schema.registry.url`` is missing.
        """
        url_key = "schema.registry.url"
        if url_key not in config:
            raise ValueError("Missing parameter: schema.registry.url")
        registry_url = config.pop(url_key)

        super(AvroConsumer, self).__init__(config)
        registry_client = CachedSchemaRegistryClient(url=registry_url)
        self._serializer = MessageSerializer(registry_client)
Exemplo n.º 7
0
    def read_from_offset(self, offset=0, lang='json', schema=None):

        '''

        Kafka read message

        Read json and avro messages from partition 0 of ``self.topic``,
        starting at ``offset``, until the consumer times out.

        :param offset: offset to start reading from; converted with int().
        :param lang: 'avro' decodes each message through the schema registry
            and renders it as JSON text; any other value keeps the raw
            message value.
        :param schema: unused.
        :return: the last message read, or None if no message was received.

        '''
        log.debug("[KafkaDriver][read_from_offset] lang: " + str(lang))
        log.debug("[KafkaDriver][read_from_offset] offset: " + str(offset))

        def outputJSON(obj):

            '''

            Default JSON serializer.

            Turns datetime objects into an integer timestamp; every other
            object is returned unchanged.

            '''

            if isinstance(obj, datetime.datetime):
                # NOTE(review): "%s" is a platform-specific strftime
                # directive - confirm it is available on the target platform.
                return int(obj.strftime("%s%f")[:-3])
            return obj

        ret = None
        log.debug("[KafkaDriver][read_from_offset] read start: " + str(self.server))
        consumer = KafkaConsumer(bootstrap_servers=self.server + ':9092',
                                 auto_offset_reset='earliest',
                                 consumer_timeout_ms=1000)
        try:
            partition = TopicPartition(self.topic, 0)
            consumer.assign([partition])
            consumer.seek_to_end(partition)
            # Seek with the int-converted offset so a numeric string works too.
            consumer.seek(partition, int(offset))

            if lang == 'avro':
                # Build the registry client and serializer once rather than
                # once per message.
                schema_registry = CachedSchemaRegistryClient(url='http://' + self.schema_registry + ':8081')
                self._serializer = MessageSerializer(schema_registry)

            for msg in consumer:
                if lang == 'avro':
                    message = self._serializer.decode_message(msg.value)
                    message = json.dumps(message, indent=4, sort_keys=True, default=outputJSON)
                else:
                    message = msg.value
                ret = message
                log.debug("[KafkaDriver][read_from_offset] msg: " + str(message) + " msg.offset: " + str(msg.offset))
        finally:
            # Release the consumer even when seeking or decoding fails.
            consumer.close()
        log.debug("[KafkaDriver][read_from_offset] read end")
        return ret
Exemplo n.º 8
0
    def __init__(self, config, default_key_schema=None,
                 default_value_schema=None):
        """Set up the producer and its Avro message serializer.

        ``schema.registry.url`` is removed from ``config`` in place and used
        to build the registry client; the rest of ``config`` is passed to the
        parent constructor.

        :param dict config: producer settings including ``schema.registry.url``.
        :param default_key_schema: stored as the default key schema.
        :param default_value_schema: stored as the default value schema.
        :raises ValueError: if ``schema.registry.url`` is missing.
        """
        url_key = "schema.registry.url"
        if url_key not in config:
            raise ValueError("Missing parameter: schema.registry.url")
        registry_url = config.pop(url_key)

        super(AvroProducer, self).__init__(config)
        registry_client = CachedSchemaRegistryClient(url=registry_url)
        self._serializer = MessageSerializer(registry_client)
        self._key_schema = default_key_schema
        self._value_schema = default_value_schema
 def __init__(self,
              producer,
              schema_registry_url,
              default_key_schema=None,
              default_value_schema=None
              ):  # real signature unknown; restored from __doc__
     """Wrap ``producer`` and build a serializer for the given registry URL.

     :param producer: stored as ``self._producer``.
     :param schema_registry_url: URL given to ``CachedSchemaRegistryClient``.
     :param default_key_schema: stored as ``self.key_schema``.
     :param default_value_schema: stored as ``self.value_schema``.
     """
     self._producer = producer
     registry_client = CachedSchemaRegistryClient(url=schema_registry_url)
     self._serializer = MessageSerializer(registry_client)
     self.key_schema, self.value_schema = default_key_schema, default_value_schema
Exemplo n.º 10
0
 def __init__(self, schema_registry_url):
     """Private implementation class for Avro IO using the registry

     Creates the registry client, looks up the schema ``config.SCHEMA_ID``
     and builds the message serializer.

     :param schema_registry_url: URL given to ``CachedSchemaRegistryClient``.
     :raises ValueError: if creating the client, fetching the schema or
         building the serializer fails; the original exception is chained
         as the cause.
     """
     log.info(
         f"Using registry with schema_url/id {schema_registry_url}/{config.SCHEMA_ID}"
     )
     try:
         self.client = CachedSchemaRegistryClient(url=schema_registry_url)
         self.schema = self.client.get_by_id(config.SCHEMA_ID)
         self.serializer = MessageSerializer(self.client)
     except Exception as exc:
         # Catch Exception rather than everything so KeyboardInterrupt and
         # SystemExit still propagate, and keep the root cause attached.
         raise ValueError("Client id or schema id not found") from exc
Exemplo n.º 11
0
def consume(config, topic, handler):
    """
    Starts a consumer and calls the given handler for each consumed message.
    Assumes that keys are serialized as strings and values are serialized
    as Avro objects with their schemas stored in a Confluent Schema Registry.

    Entries of ``config`` starting with ``schema.registry`` configure the
    registry client (prefix removed); the others configure the consumer.
    Entries whose value is None are ignored and all other values are
    stripped of surrounding whitespace. ``auto.offset.reset`` defaults to
    'earliest' and ``group.id`` to 'sme_test'.

    The loop ends (and the consumer is closed) after the first failure while
    polling/decoding or inside ``handler``; message-level consumer errors
    are logged and skipped. A message without a key is passed to ``handler``
    with ``key`` set to None.
    """
    c_conf = {
        key: value.strip()
        for key, value in config.items()
        if not key.startswith("schema.registry") and value is not None
    }

    if "auto.offset.reset" in c_conf:
        print("offset provided")
    else:
        c_conf['auto.offset.reset'] = 'earliest'

    if "group.id" in c_conf:
        print("group id provided")
    else:
        c_conf['group.id'] = 'sme_test'

    # None values are skipped here too, matching the consumer settings above.
    sr_conf = {
        key.replace("schema.registry.", ""): value.strip()
        for key, value in config.items()
        if key.startswith("schema.registry") and value is not None
    }

    c = Consumer(c_conf)
    try:
        c.subscribe([topic])

        sr = CachedSchemaRegistryClient(sr_conf)
        ser = MessageSerializer(sr)

        while True:
            try:
                msg = c.poll(10)
                if msg is None:
                    print('No Messages')
                    continue
                if msg.error():
                    log.error("Consumer error: {}".format(msg.error()))
                    continue
                raw_key = msg.key()
                # A message without a key must not abort the whole loop.
                key = raw_key.decode('utf-8') if raw_key is not None else None
                value = ser.decode_message(msg.value(), is_key=False)
            except Exception as e:
                log.error("Message consumption failed: {}".format(e))
                break
            try:
                handler(key, value)
            except Exception as e:
                log.error("Message handler failed: {}".format(e))
                break
    finally:
        # Close the consumer on every exit path, including interrupts.
        c.close()
Exemplo n.º 12
0
    def __init__(self, config, schema_registry=None):
        """Set up the consumer and its Avro message serializer.

        ``schema.registry.url`` is removed from ``config`` in place; the rest
        of ``config`` is passed to the parent constructor.

        :param dict config: consumer settings, optionally with
            ``schema.registry.url``.
        :param schema_registry: optional ready-made registry client.
        :raises ValueError: if neither ``schema_registry`` nor a
            ``schema.registry.url`` entry is given, or if both are given.
        """
        registry_url = config.pop("schema.registry.url", None)

        if schema_registry is not None:
            # A ready-made client and a URL are mutually exclusive.
            if registry_url is not None:
                raise ValueError(
                    "Cannot pass schema_registry along with schema.registry.url config"
                )
        elif registry_url is None:
            raise ValueError("Missing parameter: schema.registry.url")
        else:
            schema_registry = CachedSchemaRegistryClient(url=registry_url)

        super(AvroConsumer, self).__init__(config)
        self._serializer = MessageSerializer(schema_registry)
Exemplo n.º 13
0
    def __init__(self,
                 config,
                 default_key_schema=None,
                 default_value_schema=None,
                 schema_registry=None):
        """Set up the producer with separate key and value serializers.

        ``schema.registry.url``, ``key.serializer`` and ``value.serializer``
        are removed from ``config`` in place. When a serializer entry is
        absent, a ``MessageSerializer`` bound to the registry client is used.

        :param dict config: producer settings plus the entries named above.
        :param default_key_schema: stored as the default key schema.
        :param default_value_schema: stored as the default value schema.
        :param schema_registry: optional ready-made registry client.
        :raises ValueError: if neither ``schema_registry`` nor a
            ``schema.registry.url`` entry is given, or if both are given.
        """
        schema_registry_url = config.pop("schema.registry.url", None)
        if schema_registry is None:
            if schema_registry_url is None:
                raise ValueError("Missing parameter: schema.registry.url")
            schema_registry = CachedSchemaRegistryClient(
                url=schema_registry_url)
        elif schema_registry_url is not None:
            raise ValueError(
                "Cannot pass schema_registry along with schema.registry.url config"
            )

        super(AvroProducer, self).__init__(config)
        # NOTE(review): the serializer entries are still in config when it is
        # handed to the parent constructor above - confirm the parent accepts
        # them, otherwise these pops belong before that call.
        self._key_serializer = config.pop("key.serializer",
                                          MessageSerializer(schema_registry))
        # Read "value.serializer" here; popping "key.serializer" a second
        # time could only ever yield the default.
        self._value_serializer = config.pop("value.serializer",
                                            MessageSerializer(schema_registry))
        self._key_schema = default_key_schema
        self._value_schema = default_value_schema
Exemplo n.º 14
0
    def __init__(self, config, schema_registry=None, reader_key_schema=None, reader_value_schema=None):
        """Set up the consumer together with its Avro message serializer.

        Entries of ``config`` whose key starts with ``schema.registry`` go to
        the schema registry client (with the ``schema.registry.`` prefix
        removed); all remaining entries are passed to the parent constructor.
        ``config`` itself is not modified.

        :param dict config: combined consumer and schema registry settings.
        :param schema_registry: optional ready-made registry client; when it
            is None a ``CachedSchemaRegistryClient`` is built from the
            registry settings.
        :param reader_key_schema: forwarded to ``MessageSerializer``.
        :param reader_value_schema: forwarded to ``MessageSerializer``.
        :raises ValueError: if ``schema_registry`` is passed together with a
            ``schema.registry.url`` setting.
        """
        registry_settings = {}
        consumer_settings = {}
        for name, setting in config.items():
            if name.startswith("schema.registry"):
                registry_settings[name.replace("schema.registry.", "")] = setting
            else:
                consumer_settings[name] = setting

        if registry_settings.get("basic.auth.credentials.source") == 'SASL_INHERIT':
            # Prefer 'sasl.mechanism'; the plural 'sasl.mechanisms' is the
            # backward-compatible fallback.
            legacy_mechanism = config.get('sasl.mechanisms', '')
            registry_settings['sasl.mechanism'] = config.get('sasl.mechanism', legacy_mechanism)
            for sasl_key in ('sasl.username', 'sasl.password'):
                registry_settings[sasl_key] = config.get(sasl_key, '')

        if schema_registry is not None:
            if registry_settings.get("url") is not None:
                raise ValueError("Cannot pass schema_registry along with schema.registry.url config")
        else:
            schema_registry = CachedSchemaRegistryClient(registry_settings)

        super(AvroConsumer, self).__init__(consumer_settings)
        self._serializer = MessageSerializer(schema_registry, reader_key_schema, reader_value_schema)
Exemplo n.º 15
0
def test_select(started_cluster):
    # type: (ClickHouseCluster) -> None
    """Insert three AvroConfluent-encoded records and read them back."""
    local_registry_url = 'http://localhost:{}'.format(
        started_cluster.schema_registry_port)
    serializer = MessageSerializer(
        CachedSchemaRegistryClient(local_registry_url))

    record_schema = avro.schema.make_avsc_object({
        'name': 'test_record',
        'type': 'record',
        'fields': [{'name': 'value', 'type': 'long'}],
    })

    # Concatenate the encoded records for values 0, 1 and 2.
    payload = b''.join(
        serializer.encode_record_with_schema('test_subject', record_schema,
                                             {'value': number})
        for number in range(3))

    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    settings = {
        'format_avro_schema_registry_url': "http://{}:{}".format(
            started_cluster.schema_registry_host, 8081),
    }

    run_query(instance,
              "create table avro_data(value Int64) engine = Memory()")
    run_query(instance, "insert into avro_data format AvroConfluent", payload,
              settings)
    stdout = run_query(instance, "select * from avro_data")

    rows = [line.split() for line in stdout.splitlines()]
    assert rows == [["0"], ["1"], ["2"]]
Exemplo n.º 16
0
 def setUp(self):
     """Create the mock registry client and the serializer under test."""
     mock_client = MockSchemaRegistryClient()
     self.client = mock_client
     self.ms = MessageSerializer(mock_client)
def test_kafka_destination_expression_partitioner_avro(sdc_builder,
                                                       sdc_executor, cluster,
                                                       confluent):
    """Check that AVRO records produced with the EXPRESSION partition strategy
    use the configured serializer.

    The Confluent serializer is set in the stage configuration and the same
    serializer decodes keys and values in the Kafka consumer used here, so
    the consumer cannot deserialize the records unless they were serialized
    in AVRO.
    """
    topic = get_random_string(string.ascii_letters, 10)
    logger.debug('Kafka topic name: %s', topic)
    subject = f'{topic}-value'

    expected_record = {'myLongField1': 'My Long Message'}

    # Pipeline: raw JSON source feeding the Kafka destination.
    pipeline_builder = sdc_builder.get_pipeline_builder()
    pipeline_builder.add_error_stage('Discard')

    origin = pipeline_builder.add_stage('Dev Raw Data Source')
    origin.set_attributes(stop_after_first_batch=True,
                          data_format='JSON',
                          raw_data=json.dumps(expected_record))

    kafka_target = pipeline_builder.add_stage(
        name='com_streamsets_pipeline_stage_destination_kafka_KafkaDTarget',
        library=cluster.kafka.standalone_stage_lib)
    # AVRO with a schema looked up by subject in the registry, plus the
    # expression partition strategy.
    kafka_target.set_attributes(topic=topic,
                                data_format='AVRO',
                                message_key_format='AVRO',
                                avro_schema_location='REGISTRY',
                                lookup_schema_by='SUBJECT',
                                schema_subject=subject,
                                include_schema=False,
                                partition_strategy='EXPRESSION',
                                partition_expression='${0}',
                                kafka_message_key='',
                                key_serializer='CONFLUENT',
                                value_serializer='CONFLUENT')

    origin >> kafka_target
    built_pipeline = pipeline_builder.build(
        title='Kafka Destination pipeline with Expression Partitioner')
    pipeline = built_pipeline.configure_for_environment(cluster, confluent)

    sdc_executor.add_pipeline(pipeline)

    # Build the record schema and register it under the topic's subject.
    string_type = avro.schema.PrimitiveSchema(avro.schema.STRING)
    field = avro.schema.Field(type=string_type,
                              name='myLongField1',
                              index=0,
                              has_default=False)
    value_schema = avro.schema.RecordSchema(name=f'value_{topic}',
                                            namespace=None,
                                            fields=[field],
                                            names=avro.schema.Names())
    confluent.schema_registry.register(subject, value_schema)

    # The consumer decodes keys and values through the Confluent serializer.
    serializer = MessageSerializer(confluent.schema_registry)
    decode_key = partial(serializer.decode_message, is_key=True)
    decode_value = partial(serializer.decode_message, is_key=False)
    consumer = cluster.kafka.consumer(
        consumer_timeout_ms=1000,
        auto_offset_reset='earliest',
        key_deserializer=decode_key,
        value_deserializer=decode_value)
    consumer.subscribe([topic])

    sdc_executor.start_pipeline(pipeline).wait_for_finished()

    msgs_received = list(consumer)

    assert 1 == len(msgs_received)
    assert [received.value for received in msgs_received] == [expected_record]
from confluent_kafka.avro.cached_schema_registry_client import CachedSchemaRegistryClient
import logging

# Topics the consumer subscribes to.
input_topics = ["avro-topic"]

# Settings passed to the Kafka Consumer below.
config = {
    'bootstrap.servers': 'localhost:9092',
    'group.id': 'phoenix-local-consumer-2',
    'auto.offset.reset': 'earliest'
}

# Settings passed to the schema registry client.
sr_config = {'url': 'http://localhost:8081', 'auto.register.schemas': False}

sr_client = CachedSchemaRegistryClient(sr_config)

# Serializer used by handle_msg to decode message values.
serializer = MessageSerializer(sr_client)

# Set the root logger to INFO so the logging.info calls below are not filtered out.
logging.getLogger().setLevel(logging.INFO)

consumer = Consumer(config)

consumer.subscribe(input_topics)
logging.info("Subscribed to the topic : {}".format(input_topics))


def handle_msg(msg):
    """Decode one consumed message and log where it was read from.

    The value is decoded with the module-level ``serializer`` and the key is
    decoded as UTF-8; the log line reports topic, partition and offset.
    """
    value = serializer.decode_message(msg.value())
    key = msg.key().decode("utf-8")
    location = (msg.topic(), msg.partition(), msg.offset())
    logging.info(
        "The record was read from the kafka topic {}, partition {}, offset {}".
        format(*location))
Exemplo n.º 19
0
# Announce script start on stdout.
print("Start: avro-python-producer")

# Settings passed to the Kafka Producer below.
props = {
    'client.id': "basic-python-producer",
    'bootstrap.servers': "localhost:9092",
}

# Name of the topic used by this script.
topic = "avro-python-producer-topic"

producer = Producer(props)

# Create the schema registry client for the local registry URL.
schema_registry = CachedSchemaRegistryClient("http://localhost:8081")

# Define the Avro serde, to be used to encode message values against the Avro schema.
avro_serde = MessageSerializer(schema_registry)

# convert json to avro schema
schema = avro.schema.Parse(
    json.dumps({
        "namespace":
        "test.value.avro",
        "type":
        "record",
        "name":
        "avroValue",
        "fields": [{
            "name": "name",
            "type": "string"
        }, {
            "name": "type",
Exemplo n.º 20
0
 def __init__(self, schema_registry_url):
     """Build the message serializer for the registry at the given URL.

     :param schema_registry_url: stored under the ``url`` key of the
         settings passed to ``CachedSchemaRegistryClient``.
     """
     registry_conf = {'url': schema_registry_url}
     registry_client = CachedSchemaRegistryClient(registry_conf)
     self._serializer = MessageSerializer(registry_client, None, None)
Exemplo n.º 21
0
 def __init__(self, topic, server='kafka1', schema_registry='kafka-schema-registry'):
     """Store the connection settings and build the Avro serializer.

     :param topic: stored as ``self.topic``.
     :param server: stored as ``self.server``.
     :param schema_registry: registry host name; the client URL is built as
         ``http://<schema_registry>:8081``.
     """
     self.topic = topic
     self.server = server
     self.schema_registry = schema_registry
     registry_url = 'http://' + self.schema_registry + ':8081'
     registry_client = CachedSchemaRegistryClient(url=registry_url)
     self.serializer = MessageSerializer(registry_client)
Exemplo n.º 22
0
from pyspark.sql import SQLContext, SparkSession

from pyspark.streaming import StreamingContext
from confluent_kafka.avro.cached_schema_registry_client import CachedSchemaRegistryClient
from confluent_kafka.avro.serializer.message_serializer import MessageSerializer

from pyspark.streaming.kafka import KafkaUtils

import json

# URL given to the schema registry client below.
var_schema_url = 'http://localhost:8081'
# Kafka parameters holding the broker list; not used in the visible lines.
var_kafka_parms_src = {"metadata.broker.list": 'localhost:9092'}

# Registry client and the serializer built on top of it.
schema_registry_client = CachedSchemaRegistryClient(var_schema_url)
serializer = MessageSerializer(schema_registry_client)

# Get or create the Spark session named 'Advertiser_stream' with master 'local[*]'.
spark = SparkSession.builder \
  .appName('Advertiser_stream') \
  .master('local[*]') \
  .getOrCreate()


def handler(message):
    """Print the types of the first two items of every collected record.

    ``message`` must provide ``collect()`` returning an iterable of
    indexable records; item 0 is treated as the key and item 1 as the value.
    """
    for record in message.collect():
        var_val_key, var_val_value = record[0], record[1]
        for item in (var_val_key, var_val_value):
            print(type(item))

    Triggered by poll() or flush().
    """
    if err is not None:
        print('Message delivery failed: {}'.format(err))
    else:
        print('Message delivered to {} [{}]'.format(msg.topic(),
                                                    msg.partition()))


# Create the topic TOPIC_NAME if it doesn't exist yet
admin = CustomAdmin(BROKER)
if not admin.topic_exists(TOPIC_NAME):
    admin.create_topics([TOPIC_NAME])

# serialize_avro is the bound encode_record_with_schema method of a
# MessageSerializer built on a registry client for SCHEMA_REGISTRY_URL
serialize_avro = MessageSerializer(
    CachedSchemaRegistryClient(SCHEMA_REGISTRY_URL)).encode_record_with_schema

# Define the value schema: a record with float fields "lat" and "lng"
value_schema = avro.loads("""
    {
        "namespace": "septa.bus.location",
        "name": "value",
        "type": "record",
        "fields": [
            {"name": "lat", "type": "float", "doc": "latitude"},
            {"name": "lng", "type": "float", "doc": "longitude"}
        ]
    }
""")

# Initialize producer