Example #1
# MAGIC %md
# MAGIC ### Set up the client for the Schema Registry

# COMMAND ----------

from confluent_kafka.schema_registry import SchemaRegistryClient

schema_registry_conf = {
    'url': schemaRegistryUrl,
    'basic.auth.user.info': '{}:{}'.format(confluentRegistryApiKey, confluentRegistrySecret)
}

schema_registry_client = SchemaRegistryClient(schema_registry_conf)

# COMMAND ----------

import pyspark.sql.functions as fn
from pyspark.sql.avro.functions import from_avro

keyRestResponseSchema = schema_registry_client.get_latest_version(
    confluentTopicName + "-key").schema
confluentKeySchema = keyRestResponseSchema.schema_str
valueRestResponseSchema = schema_registry_client.get_latest_version(
    confluentTopicName + "-value").schema
confluentValueSchema = valueRestResponseSchema.schema_str

# Set the option for how to fail - either stop on the first failure it finds (FAILFAST) or just set corrupt data to null (PERMISSIVE)
#fromAvroOptions = {"mode":"FAILFAST"}
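
# Hedged sketch (not part of the original notebook): one way the schemas and the
# fromAvroOptions above could be applied with from_avro. Assumes a streaming
# DataFrame `df` read from the Confluent topic whose key/value columns have already
# had the 5-byte Confluent wire-format prefix (magic byte + schema id) stripped.
fromAvroOptions = {"mode": "PERMISSIVE"}

decoded_df = df.select(
    from_avro(fn.col("key"), confluentKeySchema, fromAvroOptions).alias("decodedKey"),
    from_avro(fn.col("value"), confluentValueSchema, fromAvroOptions).alias("decodedValue"))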
def test_config_url_invalid():
    conf = {'url': 'htt://SchemaRegistry:65534'}
    with pytest.raises(ValueError) as e:
        SchemaRegistryClient(conf)
    assert e.match('Invalid url htt://SchemaRegistry:65534')
Example #3
def _get_schema(schema_registry_client: SchemaRegistryClient,
                topic: str) -> str:
    """Return a schema string from an AVRO server."""
    return schema_registry_client.get_latest_version(topic).schema.schema_str
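
# Minimal, hypothetical usage of the helper above. With the default
# TopicNameStrategy the subject is the topic name plus a "-key"/"-value" suffix;
# the registry URL and the "orders" topic below are assumptions.
client = SchemaRegistryClient({'url': 'http://localhost:8081'})
value_schema_str = _get_schema(client, 'orders-value')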
Example #4
 def get_schema_registry_client(self, conf):
     return SchemaRegistryClient(conf)
Example #5
#!/usr/bin/env python3

import json
import os
import pkgutil

from confluent_kafka.schema_registry import SchemaRegistryClient, Schema, SchemaReference

sr_conf = {'url':  os.environ.get('SCHEMA_REGISTRY', 'http://localhost:8081')}
client = SchemaRegistryClient(sr_conf)


def register(file, subject, references=[]):

    schema_bytes = pkgutil.get_data("jlab_jaws", file)

    json_dict = json.loads(schema_bytes)

    json_str = json.dumps(json_dict)

    unregistered_schema = Schema(json_str, 'AVRO', references)

    id = client.register_schema(subject, unregistered_schema)

    print('Successfully registered {} with id: {}'.format(subject, id))

    registered_schema = client.get_latest_version(subject)

    return registered_schema
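
# Hedged usage sketch (not part of the original script). SchemaReference is imported
# above for schemas that reference other registered schemas; the file paths and
# subject names below are hypothetical.
base = register("avro/Base.avsc", "base-value")
ref = SchemaReference(name="org.example.Base", subject="base-value", version=base.version)
register("avro/Child.avsc", "child-value", references=[ref])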

Example #6
def main():

    sr_conf = {'url': SCHEMA_REGISTRY_URL}
    schema_registry_client = SchemaRegistryClient(sr_conf)
    schema_str = """
    {
        "namespace": "io.confluent.ksql.avro_schemas",
        "name": "User",
        "type": "record",
        "fields":[
        {"name":"DATESTAMP","type":"string"},
        {"name":"TIMESTAMP","type":"string"},
        {"name":"MILLISEC","type":"string"},
        {"name":"LOGLEVEL","type":"string"},
        {"name":"REQUESTID","type":"string"},
        {"name":"RECORDFORMATVERSION","type":"string"},
        {"name":"SOURCEIP","type":"string"},
        {"name":"DNSDOMAIN","type":"string"},
        {"name":"MESSAGETYPE","type":"string"},
        {"name":"OPERATION","type":"string"},
        {"name":"AUTHUSER","type":"string"},
        {"name":"AUTHDOMAIN","type":"string"},
        {"name":"HTTPCODE","type":"string"},
        {"name":"SOURCEBYTES","type":"string"},
        {"name":"RESPONSEBYTES","type":"string"},
        {"name":"ELAPSEDTIME","type":"string"},
        {"name":"DOMAIN","type":"string"},
        {"name":"BUCKET","type":"string"},
        {"name":"OBJECT","type":"string"}
        ]
    }
    """

    avro_deserializer = AvroDeserializer(schema_str, schema_registry_client)
    string_deserializer = StringDeserializer('utf_8')

    consumer_conf = {
        'bootstrap.servers': bootstrap_servers,
        'key.deserializer': string_deserializer,
        'value.deserializer': avro_deserializer,
        'group.id': group,
        'auto.offset.reset': "earliest"
    }

    consumer = DeserializingConsumer(consumer_conf)
    consumer.subscribe([topic])

    while True:
        try:
            # SIGINT can't be handled when polling, limit timeout to 1 second.
            msg = consumer.poll(1.0)
            if msg is None:
                continue

            record = msg.value()
            if record is not None:
                if record['OPERATION'] == "POST" and record[
                        'DOMAIN'] != "%28none%29":
                    urllistraw = "http://" + record['DOMAIN'] + "/" + record[
                        'BUCKET'] + "/" + record['OBJECT']
                    urllist = urllistraw[:-1]
                    print(urllist)
                    r = requests.head(urllist)
                    print(r.headers)
                else:
                    continue
        except KeyboardInterrupt:
            break

    consumer.close()
Example #7
parser.add_argument('athena_table_name',
                    nargs='?',
                    help='The name of the table to create')
parser.add_argument(
    's3_location',
    nargs='?',
    help='S3 location of your database. Example: s3://bucket/folder/')
parser.add_argument('--partition',
                    nargs='+',
                    type=str,
                    help='partitions, can be specified multiple times.',
                    default=[])

args = parser.parse_args()

schema_registry = SchemaRegistryClient({"url": args.registry_url})
avro_schema_literal = schema_registry.get_latest_version(
    f"{args.avro_subject}").schema.schema_str

athena_schema, partition_schema = create_athena_schema_from_avro(
    avro_schema_literal, args.partition)

if partition_schema:
    partition_statement = f'\nPARTITIONED BY ({partition_schema})'
else:
    partition_statement = ''

print(f'''
CREATE DATABASE IF NOT EXISTS {args.athena_database};
''')
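
# Hedged sketch (the original script's table DDL is not shown above): one plausible
# way the generated column list, partition clause, and S3 location could be combined.
print(f'''
CREATE EXTERNAL TABLE IF NOT EXISTS {args.athena_database}.{args.athena_table_name} (
{athena_schema}
){partition_statement}
STORED AS AVRO
LOCATION '{args.s3_location}';
''')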
class KafkaAvroCDCConsumer:
    def __init__(self,
                 consumer_name,
                 topic_name="kafka-avro-producer",
                 groupID='KafkaAvroConsumer',
                 autocommit=True):

        # Consumer name for logging purposes
        self.logging_prefix = '[' + consumer_name + '][KafkaAvroCDCConsumer]'

        # Schema Registry configuration
        self.schema_registry_conf = EventBackboneConfig.getSchemaRegistryConf()
        # Schema Registry Client
        self.schema_registry_client = SchemaRegistryClient(
            self.schema_registry_conf)

        # Get Schema for the key
        self.schema_id_key = self.schema_registry_client.get_latest_version(
            EventBackboneConfig.getKeySubject()).schema_id
        # print('The Schema ID for the key is: {}'.format(self.schema_id_key))
        self.schema_key = self.schema_registry_client.get_schema(
            self.schema_id_key).schema_str
        print(self.logging_prefix +
              ' - Key Subject: {}'.format(EventBackboneConfig.getKeySubject()))
        print(self.logging_prefix + ' - Key Schema:')
        print(self.logging_prefix + ' - -----------')
        print(self.logging_prefix + ' - ' + self.schema_key + "\n")

        # Get Schema for the value
        self.schema_id_value = self.schema_registry_client.get_latest_version(
            EventBackboneConfig.getValueSubject()).schema_id
        # print('The Schema ID for the value is: {}'.format(self.schema_id_value))
        self.schema_value = self.schema_registry_client.get_schema(
            self.schema_id_value).schema_str
        print(self.logging_prefix + ' - Value Subject: {}'.format(
            EventBackboneConfig.getValueSubject()))
        print(self.logging_prefix + ' - Value Schema:')
        print(self.logging_prefix + ' - -------------\n')
        print(self.logging_prefix + ' - ' + self.schema_value + '\n')

        # Key Deserializer
        self.key_deserializer = AvroDeserializer(self.schema_key,
                                                 self.schema_registry_client)

        # Value Deserializer
        # Explicitly passing the schema to the AvroDeserializer is currently required; this may change in a future release.
        # See https://github.com/confluentinc/confluent-kafka-python/issues/834
        self.value_deserializer = AvroDeserializer(self.schema_value,
                                                   self.schema_registry_client)

        # Get the consumer configuration
        self.consumer_conf = EventBackboneConfig.getConsumerConfiguration(
            groupID, autocommit, self.key_deserializer,
            self.value_deserializer)

        # Create the consumer
        self.consumer = DeserializingConsumer(self.consumer_conf)

        # Print consumer configuration
        EventBackboneConfig.printConsumerConfiguration(
            self.logging_prefix, self.consumer_conf,
            self.schema_registry_conf['url'])

        # Subscribe to the topic
        self.consumer.subscribe([topic_name])

    def traceResponse(self, msg):
        print(
            self.logging_prefix +
            ' - New event received\n\tTopic: {}\n\tPartition: {}\n\tOffset: {}\n\tkey: {}\n\tvalue: {}'
            .format(msg.topic(), msg.partition(), msg.offset(), msg.key(),
                    msg.value()))

    # Polls for next event
    def pollNextEvent(self):
        # Poll for messages
        msg = self.consumer.poll(timeout=POLL_TIMEOUT)
        anEvent = {}
        # Validate the returned message
        if msg is None:
            print(self.logging_prefix +
                  ' - [INFO] - No new messages on the topic')
            return None
        elif msg.error():
            if ("PARTITION_EOF" in msg.error()):
                print(self.logging_prefix + ' - [INFO] - End of partition')
            else:
                print(self.logging_prefix +
                      ' - [ERROR] - Consumer error: {}'.format(msg.error()))
            return None
        else:
            # Print the message
            self.traceResponse(msg)
        return msg.value()

    # Polls for the next event but returns the raw event
    def pollNextRawEvent(self):
        records = self.consumer.poll(timeout=POLL_TIMEOUT)
        if records is None:
            return None
        if records.error():
            # Stop reading if we find end of partition in the error message
            if ("PARTITION_EOF" in records.error()):
                return None
            else:
                print(
                    self.logging_prefix +
                    ' - [ERROR] - Consumer error: {}'.format(records.error()))
                return None
        else:
            self.traceResponse(records)
        return records

    def commitEvent(self, event):
        self.consumer.commit(event)

    def close(self):
        self.consumer.close()
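
# Hedged usage sketch (not part of the original example); assumes EventBackboneConfig
# provides the registry/consumer settings read in __init__ above.
if __name__ == '__main__':
    consumer = KafkaAvroCDCConsumer('demo', topic_name='kafka-avro-producer', autocommit=False)
    try:
        while True:
            event = consumer.pollNextRawEvent()
            if event is not None:
                consumer.commitEvent(event)
    except KeyboardInterrupt:
        pass
    finally:
        consumer.close()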
def test_config_ssl_certificate_no_key():
    conf = {'url': TEST_URL,
            'ssl.certificate.location': '/ssl/certificates/client'}
    test_client = SchemaRegistryClient(conf)
    assert test_client._rest_client.session.cert == '/ssl/certificates/client'
def test_config_ssl_key_no_certificate():
    conf = {'url': TEST_URL,
            'ssl.key.location': '/ssl/keys/client'}
    with pytest.raises(ValueError, match="ssl.certificate.location required"
                                         " when configuring ssl.key.location"):
        SchemaRegistryClient(conf)
def test_config_url_trailing_slash():
    conf = {'url': 'http://SchemaRegistry:65534/'}
    test_client = SchemaRegistryClient(conf)
    assert test_client._rest_client.base_url == TEST_URL
def test_config_url_None():
    conf = {}
    with pytest.raises(ValueError, match="Missing required configuration"
                                         " property url"):
        SchemaRegistryClient(conf)
def test_config_url_invalid_type():
    conf = {'url': dict()}
    with pytest.raises(TypeError, match="url must be an instance of str,"
                                        " not <(.*)>$"):
        SchemaRegistryClient(conf)
Example #14
# For a complete example; see https://github.com/confluentinc/confluent-kafka-python/blob/master/examples/protobuf_producer.py

from uuid import uuid4

# Protobuf generated class; resides at ./meal_pb2.py
# Create it by running
# protoc -I=. --python_out=. ./meal.proto

import meal_pb2
from confluent_kafka import SerializingProducer
from confluent_kafka.serialization import StringSerializer
from confluent_kafka.schema_registry import SchemaRegistryClient
from confluent_kafka.schema_registry.protobuf import ProtobufSerializer

topic = 'MEAL_DELIVERY'
schema_registry_client = SchemaRegistryClient(
    {'url': 'http://schema-registry:8081'})
protobuf_serializer = ProtobufSerializer(meal_pb2.Meal, schema_registry_client)

producer_conf = {
    'bootstrap.servers': 'kafka:29092',
    'key.serializer': StringSerializer('utf_8'),
    'value.serializer': protobuf_serializer
}

producer = SerializingProducer(producer_conf)

producer.poll(0.0)

mybeer = meal_pb2.Meal.DrinkItems(drink_name="beer")
mywine = meal_pb2.Meal.DrinkItems(drink_name="wine")
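
# Hedged continuation (the original snippet stops after building the drink items).
# The 'name' and 'drink' fields on Meal are assumptions about meal.proto; adjust them
# to match your generated class.
meal = meal_pb2.Meal(name='pizza', drink=[mybeer, mywine])
producer.produce(topic=topic,
                 key=str(uuid4()),
                 value=meal,
                 on_delivery=lambda err, msg: print(err if err else 'Meal delivered'))
producer.flush()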
Example #15
def main(args):
    topic = args.topic
    outputtopic = args.outputtopic

    schema_enriched_event_str = EnrichedEventSchema
    schema_dict = ast.literal_eval(schema_enriched_event_str)
    schema_metrics = MetricSchema

    sr_conf = {'url': args.schema_registry}
    schema_registry_client = SchemaRegistryClient(sr_conf)
    string_deserializer = StringDeserializer('utf_8')

    avro_serializer = AvroSerializer(schema_metrics, schema_registry_client)
    producer_conf = {
        'bootstrap.servers': args.bootstrap_servers,
        'key.serializer': StringSerializer('utf_8'),
        'value.serializer': avro_serializer
    }

    producer = SerializingProducer(producer_conf)

    avro_deserializer = AvroDeserializer(schema_enriched_event_str,
                                         schema_registry_client)

    consumer_conf = {
        'bootstrap.servers': args.bootstrap_servers,
        'key.deserializer': string_deserializer,
        'value.deserializer': avro_deserializer,
        'group.id': args.group + str(random.Random()),
        'auto.offset.reset': "earliest"
    }

    consumer = DeserializingConsumer(consumer_conf)
    consumer.subscribe([topic])

    cluster = Cluster([args.host])
    session = cluster.connect("datascience")

    cluster.register_user_type('datascience', 'datafield', Datafield)

    client_influxdb = InfluxDBClient('35.181.155.182', 8086, "dbsaleh2")
    # client_influxdb = InfluxDBClient(url="http://35.181.155.182:8086 , "mydb")

    while True:
        try:
            # SIGINT can't be handled when polling, limit timeout to 1 second.
            start = time.time()
            msg = consumer.poll(1.0)
            if msg is None:
                continue

            evt = msg.value()

            query = f"""
            insert into eventenrich (
                        "eventId" ,
                        "dateTimeRef",
                        "nomenclatureEv",
                        "canal",
                        "media",
                        "schemaVersion",
                        "headerVersion",
                        "serveur",
                        "adresseIP",
                        "idTelematique",
                        "idPersonne",
                        "dateNaissance",
                        "paysResidence",
                        "paysNaissance",
                        "revenusAnnuel",
                        "csp",
                        "eventBC",
                        "eventContent"
                        )
                        VALUES (%s, %s, %s, %s,%s, %s, %s, %s,%s, %s, %s, %s,%s, %s, %s, %s,%s, %s)
                    """

            #eventBc = evt["EventBusinessContext"][0].replace("com.bnpparibas.dsibddf.event.","")
            eventBc = evt["eventBC"].replace("com.bnpparibas.dsibddf.event.",
                                             "")
            eventContent = evt["EventBusinessContext"][1]

            transformed_event = transform_enriched_event_to_cassandra_model(
                evt, eventBc, schema_dict, eventContent)

            insert_enriched_event_to_cassandra(transformed_event, session,
                                               query)

            elapsed_time = (time.time() - start)

        except Exception as e:
            print(f"Exception => {e}")
            continue

        query = 'SELECT * FROM metrics'
        result = client_influxdb.query(query, database="dbsaleh2")
        print(result)

        data = []

        print(elapsed_time)
        metrics = [{
            "measurement": "metrics",
            "fields": {
                "metricName": "hystorize",
                "timeforhystorize": elapsed_time
            }
        }]
        data.append(metrics)

        # client_influxdb.write_points("hystorize",elapsed_time, database="dbsaleh2")
        client_influxdb.write_points(metrics, database="dbsaleh2")
        producer.produce(topic=outputtopic,
                         value={
                             'metricName': "hystorize",
                             'time': elapsed_time
                         },
                         on_delivery=delivery_report)
        producer.flush()

    consumer.close()
Example #16
def get_schema_registry_client(url="http://localhost:8081"):
    return SchemaRegistryClient({"url": url})
Example #17
def send_record(args):
    if not any([args.record_value, args.record_file]):
        raise AttributeError(
            "--record-value or --record-file are not provided.")

    if args.schema_file is None:
        raise AttributeError("--schema-file is not provided.")

    if args.security_protocol and args.security_protocol.lower() not in [
            'plaintext', 'ssl'
    ]:
        raise AttributeError(
            "--security-protocol must be either plaintext or ssl.")

    schema_registry_client = SchemaRegistryClient(
        {'url': args.schema_registry})

    with open(args.schema_file, 'r') as file:
        schema = file.read()

    string_serializer = StringSerializer('utf-8')
    avro_serializer = AvroSerializer(schema, schema_registry_client)

    producer_config = {
        "bootstrap.servers": args.bootstrap_servers,
        'key.serializer': string_serializer,
        'value.serializer': avro_serializer,
    }

    security_protocol = (args.security_protocol or "plaintext").lower()

    if security_protocol == "ssl":
        if all([args.ssl_ca_location, args.ssl_cert_location,
                args.ssl_key_location]):
            producer_config.update({
                'security.protocol': security_protocol,
                'ssl.ca.location': args.ssl_ca_location,
                'ssl.key.location': args.ssl_key_location,
                'ssl.certificate.location': args.ssl_cert_location
            })
        else:
            # Only fail when ssl was requested without the required certificate files.
            raise AttributeError(
                "--security-protocol is ssl, please supply certificates.")

    producer = SerializingProducer(producer_config)

    key = args.record_key if args.record_key else str(uuid.uuid4())

    if args.record_file:
        with open(args.record_file, 'r') as f:
            data = f.readlines()
        for line in data:
            try:
                producer.produce(topic=args.topic,
                                 key=key,
                                 value=json.loads(line))
            except Exception as e:
                print(
                    f"Exception while producing record value - {line} to topic - {args.topic}: {e}"
                )
            else:
                print(
                    f"Successfully producing record value - {line} to topic - {args.topic}"
                )
    else:
        value = args.record_value

        try:
            producer.produce(topic=args.topic, key=key, value=value)
        except Exception as e:
            print(
                f"Exception while producing record value - {value} to topic - {args.topic}: {e}"
            )
        else:
            print(
                f"Successfully producing record value - {value} to topic - {args.topic}"
            )

    producer.flush()
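
# Hedged sketch of the command-line parsing this function expects (the original
# script's parser is not shown); the flag names are inferred from the args attributes
# used above, so treat them as assumptions.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Produce Avro records to a Kafka topic.')
    parser.add_argument('--topic', required=True)
    parser.add_argument('--bootstrap-servers', required=True)
    parser.add_argument('--schema-registry', required=True)
    parser.add_argument('--schema-file')
    parser.add_argument('--record-key')
    parser.add_argument('--record-value')
    parser.add_argument('--record-file')
    parser.add_argument('--security-protocol', default='plaintext')
    parser.add_argument('--ssl-ca-location')
    parser.add_argument('--ssl-cert-location')
    parser.add_argument('--ssl-key-location')

    send_record(parser.parse_args())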
    def __init__(self,
                 consumer_name,
                 topic_name="kafka-avro-producer",
                 groupID='KafkaAvroConsumer',
                 autocommit=True):

        # Consumer name for logging purposes
        self.logging_prefix = '[' + consumer_name + '][KafkaAvroCDCConsumer]'

        # Schema Registry configuration
        self.schema_registry_conf = EventBackboneConfig.getSchemaRegistryConf()
        # Schema Registry Client
        self.schema_registry_client = SchemaRegistryClient(
            self.schema_registry_conf)

        # Get Schema for the key
        self.schema_id_key = self.schema_registry_client.get_latest_version(
            EventBackboneConfig.getKeySubject()).schema_id
        # print('The Schema ID for the key is: {}'.format(self.schema_id_key))
        self.schema_key = self.schema_registry_client.get_schema(
            self.schema_id_key).schema_str
        print(self.logging_prefix +
              ' - Key Subject: {}'.format(EventBackboneConfig.getKeySubject()))
        print(self.logging_prefix + ' - Key Schema:')
        print(self.logging_prefix + ' - -----------')
        print(self.logging_prefix + ' - ' + self.schema_key + "\n")

        # Get Schema for the value
        self.schema_id_value = self.schema_registry_client.get_latest_version(
            EventBackboneConfig.getValueSubject()).schema_id
        # print('The Schema ID for the value is: {}'.format(self.schema_id_value))
        self.schema_value = self.schema_registry_client.get_schema(
            self.schema_id_value).schema_str
        print(self.logging_prefix + ' - Value Subject: {}'.format(
            EventBackboneConfig.getValueSubject()))
        print(self.logging_prefix + ' - Value Schema:')
        print(self.logging_prefix + ' - -------------\n')
        print(self.logging_prefix + ' - ' + self.schema_value + '\n')

        # Key Deserializer
        self.key_deserializer = AvroDeserializer(self.schema_key,
                                                 self.schema_registry_client)

        # Value Deserializer
        # Explicitly passing the schema to the AvroDeserializer is currently required; this may change in a future release.
        # See https://github.com/confluentinc/confluent-kafka-python/issues/834
        self.value_deserializer = AvroDeserializer(self.schema_value,
                                                   self.schema_registry_client)

        # Get the consumer configuration
        self.consumer_conf = EventBackboneConfig.getConsumerConfiguration(
            groupID, autocommit, self.key_deserializer,
            self.value_deserializer)

        # Create the consumer
        self.consumer = DeserializingConsumer(self.consumer_conf)

        # Print consumer configuration
        EventBackboneConfig.printConsumerConfiguration(
            self.logging_prefix, self.consumer_conf,
            self.schema_registry_conf['url'])

        # Subscribe to the topic
        self.consumer.subscribe([topic_name])
Example #19
class KafkaPC:
    def __init__(self, config_path, config_section):
        super(KafkaPC, self).__init__()

        self.in_topic = None
        self.out_topic = None
        self.in_schema = None
        self.out_schema = None

        self.read_config(config_path, config_section)
        self.connect_schema_registry()
        self.read_topics()
        self.create_serializer()
        self.create_deserializer()
        self.create_consumer()
        self.create_producer()

    def connect_schema_registry(self):

        if self.config.get("KAFKA_SCHEMA_REGISTRY_URL") is not None:
            sr_conf = {"url": self.config["KAFKA_SCHEMA_REGISTRY_URL"]}
            self.schema_registry = SchemaRegistryClient(sr_conf)
        else:
            raise ValueError("Need KAFKA_SCHEMA_REGISTRY_URL")

    def get_schema_from_registry(self, topic, suffix="-value"):
        response = None
        try:
            schema = self.schema_registry.get_latest_version(topic + suffix)
            response = schema.schema
        except Exception as e:
            print(f"Exception: {repr(e)}")
        return response

    def read_topics(self):

        if self.config.get("IN_TOPIC") and self.config.get("IN_GROUP"):
            self.in_topic = self.config["IN_TOPIC"]

            self.in_schema = {}
            for topic in self.in_topic:
                # try to get schema from registry
                schema = self.get_schema_from_registry(topic)
                # if no schema is found, a plain string deserializer is used instead (see create_deserializer below)
                if schema is None:
                    self.in_schema[topic] = None
                else:
                    self.in_schema[topic] = schema

        if self.config.get("OUT_TOPIC"):
            self.out_topic = list(self.config["OUT_TOPIC"].keys())
            self.out_schema = {}
            for topic, schema in self.config["OUT_TOPIC"].items():
                self.out_schema[topic] = self.read_avro_schema(schema)

    def create_serializer(self):
        self.serializer = {}
        if self.out_topic is not None:
            for topic in self.out_topic:
                schema_str = self.out_schema[topic].schema_str
                self.serializer[topic] = AvroSerializer(
                    schema_str, self.schema_registry)

    def create_deserializer(self):
        self.deserializer = {}
        if self.in_topic is not None:
            for topic in self.in_topic:
                if self.in_schema[topic] is None:
                    self.deserializer[topic] = StringDeserializer("utf_8")
                else:
                    schema_str = self.in_schema[topic].schema_str
                    self.deserializer[topic] = AvroDeserializer(
                        schema_str, self.schema_registry)

    def create_consumer(self):

        if self.config.get("IN_TOPIC") and self.config.get("IN_GROUP"):

            consumer_conf = {
                "bootstrap.servers": self.config["KAFKA_BROKER_URL"],
                "group.id": self.config["IN_GROUP"],
                "auto.offset.reset": "earliest",
            }

            self.consumer = Consumer(consumer_conf)
            self.consumer.subscribe(self.in_topic)

    def create_producer(self):
        if self.config.get("OUT_TOPIC"):
            producer_conf = {
                "bootstrap.servers": self.config["KAFKA_BROKER_URL"]
            }
            self.producer = Producer(producer_conf)

    def read_config(self, config_path, config_section):
        self.config = {}
        if config_path is not None and config_section is not None:
            config_section = config_section.replace(" ", "").split(",")
        else:
            raise ValueError(
                "Configuration requires config_path and config_section")
        try:
            with open(config_path, "r") as ymlfile:
                config = yaml.load(ymlfile, Loader=yaml.FullLoader)
                for section in config_section:
                    for key, value in config[section].items():
                        self.config[key] = value

        except Exception as e:
            print(f"Failed to read the config: {repr(e)}")
            sys.exit()

    def read_avro_schema(self, schema):

        with open(schema, "r") as f:
            schema_str = f.read()
        avro_schema_str = Schema(schema_str, "AVRO")

        return avro_schema_str

    def decode_msg(self, msg):

        try:
            topic = msg.topic()
            value = self.deserializer[topic](msg.value(), None)
            return value
        except Exception as e:
            print(f"Error decoding avro data: {repr(e)}")
            sys.exit()

    def send_msg(self, message, partition=0, topic=None):

        # if no topic is provided, the first topic in the list is used as default
        if topic is None:
            out_topic = self.out_topic[0]
        else:
            out_topic = topic

        # encode the data with the specified Avro out_schema
        ctx = SerializationContext(out_topic, MessageField.VALUE)
        ser_message = self.serializer[out_topic](message, ctx)

        try:
            self.producer.produce(topic=out_topic,
                                  value=ser_message,
                                  partition=partition)
        except Exception as e:
            print(f"Error sending data to Kafka: {repr(e)}")
Example #20
File: score.py  Project: MED-SALAH/lclfpy
def main(args):
    topic = args.topic
    outputtopic = args.outputtopic

    schema_enriched_event_str = EnrichedEventSchema
    schema_metrics = MetricSchema

    sr_conf = {'url': args.schema_registry}
    schema_registry_client = SchemaRegistryClient(sr_conf)
    string_deserializer = StringDeserializer('utf_8')

    avro_serializer = AvroSerializer(schema_metrics, schema_registry_client)
    producer_conf = {
        'bootstrap.servers': args.bootstrap_servers,
        'key.serializer': StringSerializer('utf_8'),
        'value.serializer': avro_serializer
    }

    producer = SerializingProducer(producer_conf)

    avro_deserializer = AvroDeserializer(schema_enriched_event_str,
                                         schema_registry_client)

    consumer_conf = {
        'bootstrap.servers': args.bootstrap_servers,
        'key.deserializer': string_deserializer,
        'value.deserializer': avro_deserializer,
        'group.id': args.group + str(random.Random()),
        'auto.offset.reset': "earliest"
    }

    consumer = DeserializingConsumer(consumer_conf)
    consumer.subscribe([topic])

    cluster = Cluster([args.host])
    session = cluster.connect("datascience")
    session.row_factory = dict_factory

    client_influxdb = InfluxDBClient('35.181.155.182', 8086, "dbsaleh2")

    while True:
        try:
            # SIGINT can't be handled when polling, limit timeout to 1 second.
            start = time.time()
            msg = consumer.poll(1.0)
            if msg is None:
                continue

            evt = msg.value()

            idPersonne = evt["idPersonne"]

            rows = session.execute(GET_ENRICHED_EVENT_QUERY, (idPersonne, ))
            if rows:
                # print(idPersonne, f"rows={rows.all().__len__()}")
                # stat_process(idPersonne, rows)
                # som = rec_process(rows,0,0)
                # print("some", som)

                # row["csp"] = get_value_column_enriched_data(row, "csp")
                # row["paysNaissance"] = get_value_column_enriched_data(row, "paysNaissance")
                #
                #
                # #get_value_column_event_content
                # row['appVersion'] = get_value_column_event_content(row, "appVersion")
                # row['montant'] = get_value_column_event_content(row, "montant")
                # row['androidID'] = get_value_column_event_content(row, "androidID")

                # del rows[0]['eventContent']

                elapsed_time = time.time() - start

                #producer.produce(topic=outputtopic, key=str(uuid4()), value={'metricName':"hystorize",'time':elapsed_time}, on_delivery=delivery_report)
                #producer.flush()

        except Exception:
            print('Exception')
            continue

        metrics = [{
            "measurement": "metrics",
            "fields": {
                "metricName": "score",
                "timeforscore": elapsed_time
            }
        }]
        print(elapsed_time)

        client_influxdb.write_points(metrics, database="dbsaleh2")
        producer.produce(topic=outputtopic,
                         value={
                             'metricName': "score",
                             'time': elapsed_time
                         },
                         on_delivery=delivery_report)
        producer.flush()

    consumer.close()
Example #21
File: serializer.py  Project: AbsaOSS/py2k
 def __init__(self, record: KafkaRecord, schema_registry_config: dict):
     self._record = record
     self._key_fields = record.key_fields
     self._key_included = record.include_key
     self._schema_registry_client = SchemaRegistryClient(
         schema_registry_config)
Example #22
File: test_io.py  Project: MED-SALAH/lclfpy
def main(args):
    topic = args.topic
    outputtopic = args.outputtopic

    schema_str = EventSchema
    schema_enriched_event_str = EnrichedEventSchema

    sr_conf = {'url': args.schema_registry}
    schema_registry_client = SchemaRegistryClient(sr_conf)

    avro_deserializer = AvroDeserializer(schema_str, schema_registry_client)
    string_deserializer = StringDeserializer('utf_8')

    avro_serializer = AvroSerializer(schema_enriched_event_str,
                                     schema_registry_client)

    consumer_conf = {
        'bootstrap.servers': args.bootstrap_servers,
        'key.deserializer': string_deserializer,
        'value.deserializer': avro_deserializer,
        'group.id': args.group + str(random.Random()),
        'auto.offset.reset': "earliest"
    }

    consumer = DeserializingConsumer(consumer_conf)
    consumer.subscribe([topic])

    cluster = Cluster([args.host])
    session = cluster.connect("datascience")
    session.row_factory = dict_factory

    producer_conf = {
        'bootstrap.servers': args.bootstrap_servers,
        'key.serializer': StringSerializer('utf_8'),
        'value.serializer': avro_serializer
    }

    producer = SerializingProducer(producer_conf)

    loop = asyncio.get_event_loop()

    while True:
        try:
            # SIGINT can't be handled when polling, limit timeout to 1 second.
            start = time.time()
            msg = consumer.poll(1.0)
            if msg is None:
                continue

            evt = msg.value()
            print("msg=>", evt)

            def enrich(evt):
                print("evt", evt)
                if evt is not None:
                    print("récupérer dans kafka")
                    row = session.execute(
                        GET_ENRICHED_DATA_QUERY,
                        (evt["EventHeader"]["acteurDeclencheur"]["idPersonne"],
                         )).one()

                    if row:
                        evt['EnrichedData'] = row
                        # evt['EventBusinessContext'] = evt["EventBusinessContext"][1]
                        EnrichedEvent = {
                            "eventId": evt["EventHeader"]["eventId"],
                            "dateTimeRef": evt["EventHeader"]["dateTimeRef"],
                            "nomenclatureEv": evt["EventHeader"]["nomenclatureEv"],
                            "canal": evt["EventHeader"]["canal"],
                            "media": evt["EventHeader"]["media"],
                            "schemaVersion": evt["EventHeader"]["schemaVersion"],
                            "headerVersion": evt["EventHeader"]["headerVersion"],
                            "serveur": evt["EventHeader"]["serveur"],
                            "adresseIP": evt["EventHeader"]["acteurDeclencheur"]["adresseIP"],
                            "idTelematique": evt["EventHeader"]["acteurDeclencheur"]["idTelematique"],
                            "idPersonne": evt["EventHeader"]["acteurDeclencheur"]["idPersonne"],
                            "dateNaissance": row["dateNaissance"],
                            "paysResidence": row["paysResidence"],
                            "paysNaissance": row["paysNaissance"],
                            "revenusAnnuel": row["revenusAnnuel"],
                            "csp": row["csp"],
                            "EventBusinessContext": evt["EventBusinessContext"]
                        }

                        producer.produce(topic=outputtopic,
                                         key=str(uuid4()),
                                         value=EnrichedEvent,
                                         on_delivery=delivery_report)
                        producer.flush()

            async_enrich = async_wrap(enrich)
            loop.run_until_complete(async_enrich(evt))

        except Exception:
            print('Exception')
            continue

    consumer.close()
Example #23
# For a complete example; see https://github.com/confluentinc/confluent-kafka-python/blob/master/examples/protobuf_producer.py

from uuid import uuid4

# Protobuf generated class; resides at ./meal_pb2.py
# Create it by running
# protoc -I=. --python_out=. ./meal.proto

import meal_pb2
from confluent_kafka import SerializingProducer
from confluent_kafka.serialization import StringSerializer
from confluent_kafka.schema_registry import SchemaRegistryClient
from confluent_kafka.schema_registry.protobuf import ProtobufSerializer

topic = 'DEMO_MEAL_PROTO'
schema_registry_client = SchemaRegistryClient({'url': 'http://localhost:8081'})
protobuf_serializer = ProtobufSerializer(meal_pb2.Meal, schema_registry_client)

producer_conf = {
    'bootstrap.servers': 'localhost:9092',
    'key.serializer': StringSerializer('utf_8'),
    'value.serializer': protobuf_serializer
}

producer = SerializingProducer(producer_conf)

producer.poll(0.0)

mybeer = meal_pb2.Meal.DrinkItems(drink_name="beer")
mywine = meal_pb2.Meal.DrinkItems(drink_name="wine")
Example #24
from enum import Enum

from confluent_kafka.admin import AdminClient
from confluent_kafka.schema_registry import SchemaRegistryClient


class Topics(Enum):
    TOPIC_1 = "topic1"
    TOPIC_2 = "topic2"


SCHEMA_REGISTRY_URL = "http://127.0.0.1:8081"
SCHEMA_REGISTRY_CLIENT = SchemaRegistryClient({'url': SCHEMA_REGISTRY_URL})
BOOTSTRAP_SERVERS = "127.0.0.1:9092"
NUM_PARTITIONS = 3
ADMIN_CLIENT = AdminClient({'bootstrap.servers': BOOTSTRAP_SERVERS})
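
# Hedged sketch (not in the original module): one way these constants could be used
# to create the topics at startup; replication_factor=1 is an assumption for a
# single-broker dev setup.
from confluent_kafka.admin import NewTopic

futures = ADMIN_CLIENT.create_topics([
    NewTopic(t.value, num_partitions=NUM_PARTITIONS, replication_factor=1)
    for t in Topics
])
for name, future in futures.items():
    try:
        future.result()  # raises on failure, returns None on success
        print(f"Created topic {name}")
    except Exception as e:
        print(f"Topic {name}: {e}")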
Example #25
def main(args):
    topic = args.topic

    schema_str = """
    {
      "$schema": "http://json-schema.org/draft-07/schema#",
      "title": "User",
      "description": "A Confluent Kafka Python User",
      "type": "object",
      "properties": {
        "name": {
          "description": "User's name",
          "type": "string"
        },
        "favorite_number": {
          "description": "User's favorite number",
          "type": "number",
          "exclusiveMinimum": 0
        },
        "favorite_color": {
          "description": "User's favorite color",
          "type": "string"
        }
      },
      "required": [ "name", "favorite_number", "favorite_color" ]
    }
    """
    schema_registry_conf = {'url': args.schema_registry}
    schema_registry_client = SchemaRegistryClient(schema_registry_conf)

    json_serializer = JSONSerializer(schema_str, schema_registry_client,
                                     user_to_dict)

    producer_conf = {
        'bootstrap.servers': args.bootstrap_servers,
        'key.serializer': StringSerializer('utf_8'),
        'value.serializer': json_serializer
    }

    producer = SerializingProducer(producer_conf)

    print("Producing user records to topic {}. ^C to exit.".format(topic))
    while True:
        # Serve on_delivery callbacks from previous calls to produce()
        producer.poll(0.0)
        try:
            user_name = input("Enter name: ")
            user_address = input("Enter address: ")
            user_favorite_number = int(input("Enter favorite number: "))
            user_favorite_color = input("Enter favorite color: ")
            user = User(name=user_name,
                        address=user_address,
                        favorite_color=user_favorite_color,
                        favorite_number=user_favorite_number)
            producer.produce(topic=topic,
                             key=str(uuid4()),
                             value=user,
                             on_delivery=delivery_report)
        except KeyboardInterrupt:
            break
        except ValueError:
            print("Invalid input, discarding record...")
            continue

    print("\nFlushing records...")
    producer.flush()
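
# Hedged sketch of the User class and user_to_dict callable referenced above (the
# original example defines them elsewhere in the script).
class User(object):
    def __init__(self, name, address, favorite_number, favorite_color):
        self.name = name
        self.address = address  # captured but intentionally not serialized
        self.favorite_number = favorite_number
        self.favorite_color = favorite_color


def user_to_dict(user, ctx):
    # ctx is the SerializationContext passed by the JSONSerializer; only the fields
    # declared in the JSON schema are returned.
    return dict(name=user.name,
                favorite_number=user.favorite_number,
                favorite_color=user.favorite_color)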
Example #26
    args = ccloud_lib.parse_args()
    config_file = args.config_file
    topic = args.topic
    conf = ccloud_lib.read_ccloud_config(config_file)

    # Create topic if needed
    ccloud_lib.create_topic(conf, topic)

    # for full list of configurations, see:
    #  https://docs.confluent.io/platform/current/clients/confluent-kafka-python/#schemaregistryclient
    schema_registry_conf = {
        'url': conf['schema.registry.url'],
        'basic.auth.user.info': conf['basic.auth.user.info']
    }

    schema_registry_client = SchemaRegistryClient(schema_registry_conf)

    name_avro_serializer = AvroSerializer(ccloud_lib.name_schema,
                                          schema_registry_client,
                                          ccloud_lib.Name.name_to_dict)
    count_avro_serializer = AvroSerializer(ccloud_lib.count_schema,
                                           schema_registry_client,
                                           ccloud_lib.Count.count_to_dict)

    # for full list of configurations, see:
    #  https://docs.confluent.io/platform/current/clients/confluent-kafka-python/#serializingproducer
    producer_conf = {
        'bootstrap.servers': conf['bootstrap.servers'],
        'sasl.mechanisms': conf['sasl.mechanisms'],
        'security.protocol': conf['security.protocol'],
        'sasl.username': conf['sasl.username'],
Example #27
class KafkaPC:
    def __init__(self, config_path, config_section):
        super(KafkaPC, self).__init__()

        self.in_topic = None
        self.out_topic = None
        self.in_schema = None
        self.out_schema = None

        self.read_config(config_path, config_section)
        self.connect_schema_registry()
        self.read_topics()
        self.create_topics_on_broker()
        self.register_schemas_in_registry()
        self.create_serializer()
        self.create_deserializer()
        self.create_consumer()
        self.create_producer()

    def connect_schema_registry(self):
        MAX_RETRIES = 3

        if self.config.get("KAFKA_SCHEMA_REGISTRY_URL") is not None:
            sr_conf = {"url": self.config["KAFKA_SCHEMA_REGISTRY_URL"]}

            retries = 0
            while retries < MAX_RETRIES:
                try:
                    self.schema_registry = SchemaRegistryClient(sr_conf)
                    print("Connected to Schema Registry")
                    break
                except Exception as e:
                    retries += 1
                    print(
                        f"Could not connect to Schema Registry, retry {retries}"
                    )
                    print({repr(e)})
                    sleep(5)
            if retries == MAX_RETRIES:
                raise ConnectionError("Could not connect to Schema Registry")
        else:
            raise ValueError("Need KAFKA_SCHEMA_REGISTRY_URL")

    def register_schemas_in_registry(self, suffix="-value"):
        MAX_RETRIES = 3

        for topic, schema in self.out_schema.items():
            subject = topic + suffix
            retries = 0
            while retries < MAX_RETRIES:
                try:
                    self.schema_registry.register_schema(subject_name=subject,
                                                         schema=schema)
                    print(f"Registered schema for topic {topic} in registry")
                    break
                except Exception as e:
                    retries += 1
                    print(
                        f"Could not register schema for topic {topic} in registry: {repr(e)}"
                    )
                    sleep(5)
            if retries == MAX_RETRIES:
                raise ConnectionError("Could not connect to Schema Registry")

    def create_topics_on_broker(self, partitions=1, replication=1):
        a = AdminClient({"bootstrap.servers": self.config["KAFKA_BROKER_URL"]})

        topic_set = set(self.out_topic)

        md = a.list_topics(timeout=10)
        broker_set = set(md.topics.keys())  # topic names, to compare against out_topic names
        diff_set = topic_set.difference(broker_set)
        new_topics = [
            NewTopic(topic,
                     num_partitions=partitions,
                     replication_factor=replication) for topic in diff_set
        ]

        fs = a.create_topics(new_topics)

        # Wait for operation to finish.
        # Timeouts are preferably controlled by passing request_timeout=15.0
        # to the create_topics() call.
        # All futures will finish at the same time.
        for topic, f in fs.items():
            try:
                f.result()  # The result itself is None
                print(f"Topic {topic} created on Broker")
            except Exception as e:
                print(f"Failed to create topic {topic} on Broker: {repr(e)}")

    def get_schema_from_registry(self, topic, suffix="-value"):
        response = None

        MAX_RETRIES = 3
        retries = 0
        while retries < MAX_RETRIES:

            try:
                schema = self.schema_registry.get_latest_version(topic +
                                                                 suffix)
                response = schema.schema
                print(f"Retrieved schema for topic {topic} from Registry")
                break
            except Exception as e:
                retries += 1
                print(f"Failed to get schema: {repr(e)}")
                sleep(3)
        return response

    def read_topics(self):

        if self.config.get("IN_TOPIC") and self.config.get("IN_GROUP"):
            self.in_topic = self.config["IN_TOPIC"]

            self.in_schema = {}
            for topic in self.in_topic:
                # try to get schema from registry
                schema = self.get_schema_from_registry(topic)
                # if no schema is found, a plain string deserializer is used instead (see create_deserializer below)
                if schema is None:
                    self.in_schema[topic] = None
                else:
                    self.in_schema[topic] = schema

        if self.config.get("OUT_TOPIC"):
            self.out_topic = list(self.config["OUT_TOPIC"].keys())
            self.out_schema = {}
            for topic, schema in self.config["OUT_TOPIC"].items():
                self.out_schema[topic] = self.read_avro_schema(schema)

    def create_serializer(self):
        self.serializer = {}
        if self.out_topic is not None:
            for topic in self.out_topic:
                schema_str = self.out_schema[topic].schema_str
                self.serializer[topic] = AvroSerializer(
                    schema_str, self.schema_registry)

    def create_deserializer(self):
        self.deserializer = {}
        if self.in_topic is not None:
            for topic in self.in_topic:
                if self.in_schema[topic] is None:
                    self.deserializer[topic] = StringDeserializer("utf_8")
                else:
                    schema_str = self.in_schema[topic].schema_str
                    self.deserializer[topic] = AvroDeserializer(
                        schema_str, self.schema_registry)

    def create_consumer(self):

        if self.config.get("IN_TOPIC") and self.config.get("IN_GROUP"):

            consumer_conf = {
                "bootstrap.servers": self.config["KAFKA_BROKER_URL"],
                "group.id": self.config["IN_GROUP"],
                "auto.offset.reset": "earliest",
            }

            self.consumer = Consumer(consumer_conf)
            self.consumer.subscribe(self.in_topic)

    def create_producer(self):
        if self.config.get("OUT_TOPIC"):
            producer_conf = {
                "bootstrap.servers": self.config["KAFKA_BROKER_URL"]
            }
            self.producer = Producer(producer_conf)

    def read_config(self, config_path, config_section):
        self.config = {}
        if config_path is not None and config_section is not None:
            config_section = config_section.replace(" ", "").split(",")
        else:
            raise ValueError(
                "Configuration requires config_path and config_section")
        try:
            with open(config_path, "r") as ymlfile:
                config = yaml.load(ymlfile, Loader=yaml.FullLoader)
                for section in config_section:
                    for key, value in config[section].items():
                        self.config[key] = value

        except Exception as e:
            print(f"Failed to read the config: {repr(e)}")
            sys.exit()

    def read_avro_schema(self, schema):

        with open(schema, "r") as f:
            schema_str = f.read()
        avro_schema_str = Schema(schema_str, "AVRO")

        return avro_schema_str

    def decode_msg(self, msg):

        try:
            topic = msg.topic()
            value = self.deserializer[topic](msg.value(), None)
            return value
        except Exception as e:
            print(f"Error decoding avro data: {repr(e)}")
            # sys.exit()

    def send_msg(self, message, partition=0, topic=None):

        # if no topic is provided, the first topic in the list is used as default
        if topic is None:
            out_topic = self.out_topic[0]
        else:
            out_topic = topic

        # encode the data with the specified Avro out_schema
        ctx = SerializationContext(out_topic, MessageField.VALUE)
        ser_message = self.serializer[out_topic](message, ctx)

        try:
            self.producer.produce(topic=out_topic,
                                  value=ser_message,
                                  partition=partition)
        except Exception as e:
            print(f"Error sending data to Kafka: {repr(e)}")
Example #28
def main(
    name: str,
    shutdown: multiprocessing.Value,
    request_queue: multiprocessing.Queue,
    config: Config
) -> None:
    """Execute tasks forever.

    This method is the entrypoint for the worker which executes the monitoring
    tasks. It is executed in a dedicated child process.
    """
    if config.verbose:
        logging.basicConfig(level=logging.INFO)
    log = logging.getLogger(name)
    log.info(f"Starting process {name}.")

    # SIGINT will be delivered to the whole process group. We'll need to ignore
    # it in the worker processes to give them the opportunity to finish any
    # pending work.
    signal.signal(signal.SIGINT, signal.SIG_IGN)

    schema_registry_client = SchemaRegistryClient({
        'url': config.schema_registry
    })
    avro_serializer = AvroSerializer(
        Report.SCHEMA,
        schema_registry_client,
        Report.asdict
    )

    producer = SerializingProducer({
        'client.id': name,
        'bootstrap.servers': config.bootstrap_servers,
        'key.serializer': StringSerializer('utf_8'),
        'security.protocol': 'SSL',
        'ssl.key.location': config.auth_key,
        'ssl.certificate.location': config.auth_cert,
        'ssl.ca.location': config.ca_cert,
        'value.serializer': avro_serializer,
    })
    err = _report_error(log)

    while not shutdown.value:
        producer.poll(0.0)
        try:
            now = datetime.now()
            req = request_queue.get(timeout=1)
            curl = pycurl.Curl()
            curl.setopt(pycurl.URL, req)
            curl.setopt(pycurl.CONNECTTIMEOUT, 30)
            curl.setopt(pycurl.TIMEOUT, 300)
            curl.setopt(pycurl.NOSIGNAL, 1)
            curl.setopt(pycurl.WRITEFUNCTION, len)
            try:
                curl.perform()
                report = Report(
                    timestamp=now.timestamp(),
                    url=req,
                    code=int(curl.getinfo(pycurl.RESPONSE_CODE)),
                    namelookup=curl.getinfo(pycurl.NAMELOOKUP_TIME),
                    connect=curl.getinfo(pycurl.CONNECT_TIME),
                    appconnect=curl.getinfo(pycurl.APPCONNECT_TIME),
                    pretransfer=curl.getinfo(pycurl.PRETRANSFER_TIME),
                    starttransfer=curl.getinfo(pycurl.STARTTRANSFER_TIME),
                    total=curl.getinfo(pycurl.TOTAL_TIME),
                )
                log.info(str(report))
                producer.produce(
                    topic=config.topic,
                    key=req,
                    value=report,
                    on_delivery=err
                )
            except TypeError:
                # It'll never work if we misconfigure PycURL.
                raise
            except pycurl.error as exc:
                # TODO: Record the failure in Kafka.
                log.warning(f"Failed to retrieve {req}", exc)
            # TODO: Handle exceptions from the Kafka Producer.
            finally:
                curl.close()
        except queue.Empty:
            log.debug("No request to process.")
    # Flush any results that haven't been committed yet.
    log.warning(f"Process {name} shutting down.")
    producer.flush()
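
# Hedged sketch of the Report record assumed above (defined elsewhere in the original
# project): a dataclass carrying the pycurl timings, an Avro schema string in
# Report.SCHEMA, and an asdict(report, ctx) helper used by the AvroSerializer.
from dataclasses import dataclass, asdict as dataclass_asdict

@dataclass
class Report:
    timestamp: float
    url: str
    code: int
    namelookup: float
    connect: float
    appconnect: float
    pretransfer: float
    starttransfer: float
    total: float

    # Avro schema string with one field per attribute above; the actual schema text
    # is omitted here.
    SCHEMA = "..."

    @staticmethod
    def asdict(report, ctx):
        return dataclass_asdict(report)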
def MyProducer(config):
    schema_str = """
    {
        "$schema": "http://json-schema.org/draft-04/schema#",
        "title": "covid",
        "type": "object",
        "properties": {
            "ID": {
                "type": "string"
            },
            "Country": {
                "type": "string"
            },
            "CountryCode": {
                "type": "string"
            },
            "Slug": {
                "type": "string"
            },
            "NewConfirmed": {
                "type": "integer"
            },
            "TotalConfirmed": {
                "type": "integer"
            },
            "NewDeaths": {
                "type": "integer"
            },
            "TotalDeaths": {
                "type": "integer"
            },
            "NewRecovered": {
                "type": "integer"
            },
            "TotalRecovered": {
                "type": "integer"
            },
            "Date": {
                "type": "string"
            },
            "Premium": {
                "type": "object"
            }
        },
        "required": [
            "ID",
            "Country",
            "CountryCode",
            "Slug",
            "NewConfirmed",
            "TotalConfirmed",
            "NewDeaths",
            "TotalDeaths",
            "NewRecovered",
            "TotalRecovered",
            "Date",
            "Premium"
        ]
    }
    """
    schema_registry_conf = {
        'url': config["schema.registry.url"],
        'basic.auth.user.info': config['basic.auth.user.info']
    }
    schema_registry_client = SchemaRegistryClient(schema_registry_conf)

    jsons = JSONSerializer(schema_str, schema_registry_client,
                           lambda f, ctx: f)

    producer_conf = {
        'bootstrap.servers': config['bootstrap.servers'],
        'key.serializer': StringSerializer('utf_8'),
        'value.serializer': jsons,
        'security.protocol': config["security.protocol"],
        "sasl.mechanisms": config["sasl.mechanisms"],
        "sasl.username": config["sasl.username"],
        "sasl.password": config["sasl.password"]
    }

    return SerializingProducer(producer_conf)
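
# Hedged usage sketch (not in the original): producing a single record with the
# serializer configured above. The 'covid' topic, the uuid4 key, and the record
# contents are hypothetical; config is the same dict passed to MyProducer.
from uuid import uuid4

producer = MyProducer(config)
producer.produce(topic='covid',
                 key=str(uuid4()),
                 value={'ID': '1', 'Country': 'Norway', 'CountryCode': 'NO',
                        'Slug': 'norway', 'NewConfirmed': 0, 'TotalConfirmed': 0,
                        'NewDeaths': 0, 'TotalDeaths': 0, 'NewRecovered': 0,
                        'TotalRecovered': 0, 'Date': '2021-01-01T00:00:00Z',
                        'Premium': {}})
producer.flush()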