# MAGIC %md
# MAGIC ### Set up the client for the Schema Registry

# COMMAND ----------

from confluent_kafka.schema_registry import SchemaRegistryClient

schema_registry_conf = {
    'url': schemaRegistryUrl,
    'basic.auth.user.info': '{}:{}'.format(confluentRegistryApiKey, confluentRegistrySecret)
}

schema_registry_client = SchemaRegistryClient(schema_registry_conf)

# COMMAND ----------

import pyspark.sql.functions as fn
from pyspark.sql.avro.functions import from_avro

keyRestResponseSchema = schema_registry_client.get_latest_version(
    confluentTopicName + "-key").schema
confluentKeySchema = keyRestResponseSchema.schema_str

valueRestResponseSchema = schema_registry_client.get_latest_version(
    confluentTopicName + "-value").schema
confluentValueSchema = valueRestResponseSchema.schema_str

# Set the option for how to fail - either stop on the first failure it finds (FAILFAST)
# or just set corrupt data to null (PERMISSIVE)
# fromAvroOptions = {"mode": "FAILFAST"}
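# The notebook cell above stops after fetching the key and value schema strings. A minimal
# sketch of how they could feed Spark's from_avro follows. Assumptions not in the original:
# a streaming DataFrame `df` read from the Confluent topic, the PERMISSIVE failure mode, and
# skipping the 5-byte Confluent wire-format header (magic byte + schema ID) before decoding.
fromAvroOptions = {"mode": "PERMISSIVE"}

decoded = (
    df
    .withColumn("decodedKey",
                from_avro(fn.expr("substring(key, 6, length(key) - 5)"),
                          confluentKeySchema, fromAvroOptions))
    .withColumn("decodedValue",
                from_avro(fn.expr("substring(value, 6, length(value) - 5)"),
                          confluentValueSchema, fromAvroOptions))
)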
def test_config_url_invalid():
    conf = {'url': 'htt://SchemaRegistry:65534'}
    with pytest.raises(ValueError) as e:
        SchemaRegistryClient(conf)
    assert e.match('Invalid url htt://SchemaRegistry:65534')
def _get_schema(schema_registry_client: SchemaRegistryClient, topic: str) -> str:
    """Return the latest schema string registered for the given subject in the Schema Registry."""
    return schema_registry_client.get_latest_version(topic).schema.schema_str
def get_schema_registry_client(self, conf):
    return SchemaRegistryClient(conf)
#!/usr/bin/env python3
import json
import os
import pkgutil

from confluent_kafka.schema_registry import SchemaRegistryClient, Schema, SchemaReference

sr_conf = {'url': os.environ.get('SCHEMA_REGISTRY', 'http://localhost:8081')}

client = SchemaRegistryClient(sr_conf)


def register(file, subject, references=[]):
    schema_bytes = pkgutil.get_data("jlab_jaws", file)
    json_dict = json.loads(schema_bytes)
    json_str = json.dumps(json_dict)
    unregistered_schema = Schema(json_str, 'AVRO', references)
    id = client.register_schema(subject, unregistered_schema)
    print('Successfully registered {} with id: {}'.format(subject, id))
    registered_schema = client.get_latest_version(subject)
    return registered_schema
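# Hypothetical usage of register() above, showing how a schema that references another
# registered schema might be registered. The file names, subjects, and the reference name
# are illustrative only; SchemaReference takes the name used inside the referencing schema,
# the subject the referenced schema was registered under, and the version to pin.
base = register("avro/AlarmClass.avsc", "alarm-class-value")

ref = SchemaReference("org.jlab.jaws.entity.AlarmClass",  # illustrative fully-qualified name
                      "alarm-class-value",                # subject of the referenced schema
                      base.version)                       # version returned by get_latest_version()
register("avro/AlarmInstance.avsc", "alarm-instance-value", [ref])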
def main():
    sr_conf = {'url': SCHEMA_REGISTRY_URL}
    schema_registry_client = SchemaRegistryClient(sr_conf)

    schema_str = """
    {
      "namespace": "io.confluent.ksql.avro_schemas",
      "name": "User",
      "type": "record",
      "fields": [
        {"name": "DATESTAMP", "type": "string"},
        {"name": "TIMESTAMP", "type": "string"},
        {"name": "MILLISEC", "type": "string"},
        {"name": "LOGLEVEL", "type": "string"},
        {"name": "REQUESTID", "type": "string"},
        {"name": "RECORDFORMATVERSION", "type": "string"},
        {"name": "SOURCEIP", "type": "string"},
        {"name": "DNSDOMAIN", "type": "string"},
        {"name": "MESSAGETYPE", "type": "string"},
        {"name": "OPERATION", "type": "string"},
        {"name": "AUTHUSER", "type": "string"},
        {"name": "AUTHDOMAIN", "type": "string"},
        {"name": "HTTPCODE", "type": "string"},
        {"name": "SOURCEBYTES", "type": "string"},
        {"name": "RESPONSEBYTES", "type": "string"},
        {"name": "ELAPSEDTIME", "type": "string"},
        {"name": "DOMAIN", "type": "string"},
        {"name": "BUCKET", "type": "string"},
        {"name": "OBJECT", "type": "string"}
      ]
    }
    """

    avro_deserializer = AvroDeserializer(schema_str, schema_registry_client)
    string_deserializer = StringDeserializer('utf_8')

    consumer_conf = {
        'bootstrap.servers': bootstrap_servers,
        'key.deserializer': string_deserializer,
        'value.deserializer': avro_deserializer,
        'group.id': group,
        'auto.offset.reset': "earliest"
    }

    consumer = DeserializingConsumer(consumer_conf)
    consumer.subscribe([topic])

    while True:
        try:
            # SIGINT can't be handled when polling, limit timeout to 1 second.
            msg = consumer.poll(1.0)
            if msg is None:
                continue

            record = msg.value()
            if record is not None:
                if record['OPERATION'] == "POST" and record['DOMAIN'] != "%28none%29":
                    urllistraw = "http://" + record['DOMAIN'] + "/" + record['BUCKET'] + "/" + record['OBJECT']
                    urllist = urllistraw[:-1]
                    print(urllist)
                    r = requests.head(urllist)
                    print(r.headers)
                else:
                    continue
        except KeyboardInterrupt:
            break

    consumer.close()
parser.add_argument('athena_table_name', nargs='?',
                    help='The name of the table to create')
parser.add_argument('s3_location', nargs='?',
                    help='S3 location of your database. Example: s3://bucket/folder/')
parser.add_argument('--partition', nargs='+', type=str,
                    help='partitions, can be specified multiple times.', default=[])
args = parser.parse_args()

schema_registry = SchemaRegistryClient({"url": args.registry_url})
avro_schema_literal = schema_registry.get_latest_version(
    f"{args.avro_subject}").schema.schema_str

athena_schema, partition_schema = create_athena_schema_from_avro(
    avro_schema_literal, args.partition)

if partition_schema:
    partition_statement = f'\nPARTITIONED BY ({partition_schema})'
else:
    partition_statement = ''

print(f'''
CREATE DATABASE IF NOT EXISTS {args.athena_database};
''')
class KafkaAvroCDCConsumer:

    def __init__(self, consumer_name, topic_name="kafka-avro-producer", groupID='KafkaAvroConsumer', autocommit=True):
        # Consumer name for logging purposes
        self.logging_prefix = '[' + consumer_name + '][KafkaAvroCDCConsumer]'
        # Schema Registry configuration
        self.schema_registry_conf = EventBackboneConfig.getSchemaRegistryConf()
        # Schema Registry Client
        self.schema_registry_client = SchemaRegistryClient(self.schema_registry_conf)

        # Get Schema for the key
        self.schema_id_key = self.schema_registry_client.get_latest_version(
            EventBackboneConfig.getKeySubject()).schema_id
        # print('The Schema ID for the key is: {}'.format(self.schema_id_key))
        self.schema_key = self.schema_registry_client.get_schema(self.schema_id_key).schema_str
        print(self.logging_prefix + ' - Key Subject: {}'.format(EventBackboneConfig.getKeySubject()))
        print(self.logging_prefix + ' - Key Schema:')
        print(self.logging_prefix + ' - -----------')
        print(self.logging_prefix + ' - ' + self.schema_key + "\n")

        # Get Schema for the value
        self.schema_id_value = self.schema_registry_client.get_latest_version(
            EventBackboneConfig.getValueSubject()).schema_id
        # print('The Schema ID for the value is: {}'.format(self.schema_id_value))
        self.schema_value = self.schema_registry_client.get_schema(self.schema_id_value).schema_str
        print(self.logging_prefix + ' - Value Subject: {}'.format(EventBackboneConfig.getValueSubject()))
        print(self.logging_prefix + ' - Value Schema:')
        print(self.logging_prefix + ' - -------------\n')
        print(self.logging_prefix + ' - ' + self.schema_value + '\n')

        # Key Deserializer
        self.key_deserializer = AvroDeserializer(self.schema_key, self.schema_registry_client)
        # Value Deserializer
        # Presenting the schema to the Avro Deserializer is needed at the moment. In the future it might change
        # https://github.com/confluentinc/confluent-kafka-python/issues/834
        self.value_deserializer = AvroDeserializer(self.schema_value, self.schema_registry_client)

        # Get the consumer configuration
        self.consumer_conf = EventBackboneConfig.getConsumerConfiguration(
            groupID, autocommit, self.key_deserializer, self.value_deserializer)
        # Create the consumer
        self.consumer = DeserializingConsumer(self.consumer_conf)
        # Print consumer configuration
        EventBackboneConfig.printConsumerConfiguration(
            self.logging_prefix, self.consumer_conf, self.schema_registry_conf['url'])
        # Subscribe to the topic
        self.consumer.subscribe([topic_name])

    def traceResponse(self, msg):
        print(self.logging_prefix +
              ' - New event received\n\tTopic: {}\n\tPartition: {}\n\tOffset: {}\n\tkey: {}\n\tvalue: {}'
              .format(msg.topic(), msg.partition(), msg.offset(), msg.key(), msg.value()))

    # Polls for next event
    def pollNextEvent(self):
        # Poll for messages
        msg = self.consumer.poll(timeout=POLL_TIMEOUT)
        anEvent = {}
        # Validate the returned message
        if msg is None:
            print(self.logging_prefix + ' - [INFO] - No new messages on the topic')
            return None
        elif msg.error():
            if ("PARTITION_EOF" in msg.error()):
                print(self.logging_prefix + ' - [INFO] - End of partition')
            else:
                print(self.logging_prefix + ' - [ERROR] - Consumer error: {}'.format(msg.error()))
            return None
        else:
            # Print the message
            self.traceResponse(msg)
            return msg.value()

    # Polls for the next event but returns the raw event
    def pollNextRawEvent(self):
        records = self.consumer.poll(timeout=POLL_TIMEOUT)
        if records is None:
            return None
        if records.error():
            # Stop reading if we find end of partition in the error message
            if ("PARTITION_EOF" in records.error()):
                return None
            else:
                print(self.logging_prefix + ' - [ERROR] - Consumer error: {}'.format(records.error()))
                return None
        else:
            self.traceResponse(records)
            return records

    def commitEvent(self, event):
        self.consumer.commit(event)

    def close(self):
        self.consumer.close()
def test_config_ssl_certificate_no_key():
    conf = {'url': TEST_URL,
            'ssl.certificate.location': '/ssl/certificates/client'}
    test_client = SchemaRegistryClient(conf)
    assert test_client._rest_client.session.cert == '/ssl/certificates/client'
def test_config_ssl_key_no_certificate():
    conf = {'url': TEST_URL,
            'ssl.key.location': '/ssl/keys/client'}
    with pytest.raises(ValueError, match="ssl.certificate.location required"
                                         " when configuring ssl.key.location"):
        SchemaRegistryClient(conf)
def test_config_url_trailing_slash():
    conf = {'url': 'http://SchemaRegistry:65534/'}
    test_client = SchemaRegistryClient(conf)
    assert test_client._rest_client.base_url == TEST_URL
def test_config_url_None():
    conf = {}
    with pytest.raises(ValueError, match="Missing required configuration"
                                         " property url"):
        SchemaRegistryClient(conf)
def test_config_url_invalid_type():
    conf = {'url': dict()}
    with pytest.raises(TypeError, match="url must be an instance of str,"
                                        " not <(.*)>$"):
        SchemaRegistryClient(conf)
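# The tests above exercise SchemaRegistryClient's configuration validation. For reference,
# a configuration that would pass those checks looks like the sketch below; all values are
# placeholders and only configuration keys that appear in this section are used.
schema_registry_conf = {
    'url': 'https://schema-registry.example.com:8081',      # required, must be a str with a valid scheme
    'basic.auth.user.info': 'API_KEY:API_SECRET',            # optional HTTP basic auth credentials
    'ssl.certificate.location': '/ssl/certificates/client',  # client certificate (may be set on its own)
    'ssl.key.location': '/ssl/keys/client',                  # private key; requires ssl.certificate.location
}
client = SchemaRegistryClient(schema_registry_conf)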
# For a complete example; see https://github.com/confluentinc/confluent-kafka-python/blob/master/examples/protobuf_producer.py
from uuid import uuid4

# Protobuf generated class; resides at ./meal_pb2.py
# Create it by running
# protoc -I=. --python_out=. ./meal.proto
import meal_pb2

from confluent_kafka import SerializingProducer
from confluent_kafka.serialization import StringSerializer
from confluent_kafka.schema_registry import SchemaRegistryClient
from confluent_kafka.schema_registry.protobuf import ProtobufSerializer

topic = 'MEAL_DELIVERY'

schema_registry_client = SchemaRegistryClient({'url': 'http://schema-registry:8081'})
protobuf_serializer = ProtobufSerializer(meal_pb2.Meal, schema_registry_client)

producer_conf = {
    'bootstrap.servers': 'kafka:29092',
    'key.serializer': StringSerializer('utf_8'),
    'value.serializer': protobuf_serializer
}

producer = SerializingProducer(producer_conf)
producer.poll(0.0)

mybeer = meal_pb2.Meal.DrinkItems(drink_name="beer")
mywine = meal_pb2.Meal.DrinkItems(drink_name="wine")
def main(args):
    topic = args.topic
    outputtopic = args.outputtopic

    schema_enriched_event_str = EnrichedEventSchema
    schema_dict = ast.literal_eval(schema_enriched_event_str)
    schema_metrics = MetricSchema

    sr_conf = {'url': args.schema_registry}
    schema_registry_client = SchemaRegistryClient(sr_conf)

    string_deserializer = StringDeserializer('utf_8')
    avro_serializer = AvroSerializer(schema_metrics, schema_registry_client)

    producer_conf = {
        'bootstrap.servers': args.bootstrap_servers,
        'key.serializer': StringSerializer('utf_8'),
        'value.serializer': avro_serializer
    }
    producer = SerializingProducer(producer_conf)

    avro_deserializer = AvroDeserializer(schema_enriched_event_str, schema_registry_client)

    consumer_conf = {
        'bootstrap.servers': args.bootstrap_servers,
        'key.deserializer': string_deserializer,
        'value.deserializer': avro_deserializer,
        'group.id': args.group + str(random.Random()),
        'auto.offset.reset': "earliest"
    }

    consumer = DeserializingConsumer(consumer_conf)
    consumer.subscribe([topic])

    cluster = Cluster([args.host])
    session = cluster.connect("datascience")
    cluster.register_user_type('datascience', 'datafield', Datafield)

    client_influxdb = InfluxDBClient('35.181.155.182', 8086, "dbsaleh2")
    # client_influxdb = InfluxDBClient(url="http://35.181.155.182:8086", "mydb")

    while True:
        try:
            # SIGINT can't be handled when polling, limit timeout to 1 second.
            start = time.time()
            msg = consumer.poll(1.0)
            if msg is None:
                continue

            evt = msg.value()

            query = f"""
            insert into eventenrich (
                "eventId", "dateTimeRef", "nomenclatureEv", "canal", "media",
                "schemaVersion", "headerVersion", "serveur", "adresseIP",
                "idTelematique", "idPersonne", "dateNaissance", "paysResidence",
                "paysNaissance", "revenusAnnuel", "csp", "eventBC", "eventContent"
            ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            """

            # eventBc = evt["EventBusinessContext"][0].replace("com.bnpparibas.dsibddf.event.", "")
            eventBc = evt["eventBC"].replace("com.bnpparibas.dsibddf.event.", "")
            eventContent = evt["EventBusinessContext"][1]

            transformed_event = transform_enriched_event_to_cassandra_model(
                evt, eventBc, schema_dict, eventContent)

            insert_enriched_event_to_cassandra(transformed_event, session, query)

            elapsed_time = (time.time() - start)

        except Exception as e:
            print(f"Exception => {e}")
            continue

        query = 'SELECT * FROM metrics'
        result = client_influxdb.query(query, database="dbsaleh2")
        print(result)

        data = []
        print(elapsed_time)

        metrics = [{
            "measurement": "metrics",
            "fields": {
                "metricName": "hystorize",
                "timeforhystorize": elapsed_time
            }
        }]
        data.append(metrics)

        # client_influxdb.write_points("hystorize", elapsed_time, database="dbsaleh2")
        client_influxdb.write_points(metrics, database="dbsaleh2")

        producer.produce(topic=outputtopic,
                         value={'metricName': "hystorize", 'time': elapsed_time},
                         on_delivery=delivery_report)
        producer.flush()

    consumer.close()
def get_schema_registry_client(url="http://localhost:8081"):
    return SchemaRegistryClient({"url": url})
def send_record(args):
    if not any([args.record_value, args.record_file]):
        raise AttributeError("--record-value or --record-file are not provided.")

    if args.schema_file is None:
        raise AttributeError("--schema-file is not provided.")

    if args.security_protocol and args.security_protocol.lower() not in ['plaintext', 'ssl']:
        raise AttributeError("--security-protocol must be either plaintext or ssl.")

    schema_registry_client = SchemaRegistryClient({'url': args.schema_registry})

    with open(args.schema_file, 'r') as file:
        schema = file.read()

    string_serializer = StringSerializer('utf-8')
    avro_serializer = AvroSerializer(schema, schema_registry_client)

    producer_config = {
        "bootstrap.servers": args.bootstrap_servers,
        'key.serializer': string_serializer,
        'value.serializer': avro_serializer,
    }

    # Default to plaintext when no protocol is given; only require certificates for ssl.
    security_protocol = (args.security_protocol or 'plaintext').lower()
    if security_protocol == "ssl":
        if all([args.ssl_ca_location, args.ssl_cert_location, args.ssl_key_location]):
            producer_config.update({
                'security.protocol': security_protocol,
                'ssl.ca.location': args.ssl_ca_location,
                'ssl.key.location': args.ssl_key_location,
                'ssl.certificate.location': args.ssl_cert_location
            })
        else:
            raise AttributeError("--security-protocol is ssl, please supply certificates.")

    producer = SerializingProducer(producer_config)

    key = args.record_key if args.record_key else str(uuid.uuid4())

    if args.record_file:
        with open(args.record_file, 'r') as f:
            data = f.readlines()
        for line in data:
            try:
                producer.produce(topic=args.topic, key=key, value=json.loads(line))
            except Exception as e:
                print(f"Exception while producing record value - {line} to topic - {args.topic}: {e}")
            else:
                print(f"Successfully produced record value - {line} to topic - {args.topic}")
    else:
        value = args.record_value
        try:
            producer.produce(topic=args.topic, key=key, value=value)
        except Exception as e:
            print(f"Exception while producing record value - {value} to topic - {args.topic}: {e}")
        else:
            print(f"Successfully produced record value - {value} to topic - {args.topic}")

    producer.flush()
class KafkaPC:
    def __init__(self, config_path, config_section):
        super(KafkaPC, self).__init__()

        self.in_topic = None
        self.out_topic = None
        self.in_schema = None
        self.out_schema = None

        self.read_config(config_path, config_section)
        self.connect_schema_registry()
        self.read_topics()
        self.create_serializer()
        self.create_deserializer()
        self.create_consumer()
        self.create_producer()

    def connect_schema_registry(self):
        if self.config.get("KAFKA_SCHEMA_REGISTRY_URL") is not None:
            sr_conf = {"url": self.config["KAFKA_SCHEMA_REGISTRY_URL"]}
            self.schema_registry = SchemaRegistryClient(sr_conf)
        else:
            raise ValueError("Need KAFKA_SCHEMA_REGISTRY_URL")

    def get_schema_from_registry(self, topic, suffix="-value"):
        response = None
        try:
            schema = self.schema_registry.get_latest_version(topic + suffix)
            response = schema.schema
        except Exception as e:
            print(f"Exception: {repr(e)}")
        return response

    def read_topics(self):
        if self.config.get("IN_TOPIC") and self.config.get("IN_GROUP"):
            self.in_topic = self.config["IN_TOPIC"]
            self.in_schema = {}
            for topic in self.in_topic:
                # try to get schema from registry
                schema = self.get_schema_from_registry(topic)
                # if no schema is found a simple string deserializer will be used, see line 87
                if schema is None:
                    self.in_schema[topic] = None
                else:
                    self.in_schema[topic] = schema

        if self.config.get("OUT_TOPIC"):
            self.out_topic = list(self.config["OUT_TOPIC"].keys())
            self.out_schema = {}
            for topic, schema in self.config["OUT_TOPIC"].items():
                self.out_schema[topic] = self.read_avro_schema(schema)

    def create_serializer(self):
        self.serializer = {}
        if self.out_topic is not None:
            for topic in self.out_topic:
                schema_str = self.out_schema[topic].schema_str
                self.serializer[topic] = AvroSerializer(schema_str, self.schema_registry)

    def create_deserializer(self):
        self.deserializer = {}
        if self.in_topic is not None:
            for topic in self.in_topic:
                if self.in_schema[topic] is None:
                    self.deserializer[topic] = StringDeserializer("utf_8")
                else:
                    schema_str = self.in_schema[topic].schema_str
                    self.deserializer[topic] = AvroDeserializer(schema_str, self.schema_registry)

    def create_consumer(self):
        if self.config.get("IN_TOPIC") and self.config.get("IN_GROUP"):
            consumer_conf = {
                "bootstrap.servers": self.config["KAFKA_BROKER_URL"],
                "group.id": self.config["IN_GROUP"],
                "auto.offset.reset": "earliest",
            }
            self.consumer = Consumer(consumer_conf)
            self.consumer.subscribe(self.in_topic)

    def create_producer(self):
        if self.config.get("OUT_TOPIC"):
            producer_conf = {"bootstrap.servers": self.config["KAFKA_BROKER_URL"]}
            self.producer = Producer(producer_conf)

    def read_config(self, config_path, config_section):
        self.config = {}
        if config_path is not None and config_section is not None:
            config_section = config_section.replace(" ", "").split(",")
        else:
            raise ValueError("Configuration requires config_path and config_section")
        try:
            with open(config_path, "r") as ymlfile:
                config = yaml.load(ymlfile, Loader=yaml.FullLoader)
                for section in config_section:
                    for key, value in config[section].items():
                        self.config[key] = value
        except Exception as e:
            print(f"Failed to read the config: {repr(e)}")
            sys.exit()

    def read_avro_schema(self, schema):
        with open(schema, "r") as f:
            schema_str = f.read()
        avro_schema_str = Schema(schema_str, "AVRO")
        return avro_schema_str

    def decode_msg(self, msg):
        try:
            topic = msg.topic()
            value = self.deserializer[topic](msg.value(), None)
            return value
        except Exception as e:
            print(f"Error decoding avro data: {repr(e)}")
            sys.exit()

    def send_msg(self, message, partition=0, topic=None):
        # if no topic is provided, the first topic in the list is used as default
        if topic is None:
            out_topic = self.out_topic[0]
        else:
            out_topic = topic

        # encode the data with the specified Avro out_schema
        ctx = SerializationContext(out_topic, MessageField.VALUE)
        ser_message = self.serializer[out_topic](message, ctx)

        try:
            self.producer.produce(topic=out_topic, value=ser_message, partition=partition)
        except Exception as e:
            print(f"Error sending data to Kafka: {repr(e)}")
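# Hypothetical driver loop for the KafkaPC wrapper above. The config file path, section
# name, and the message payload are made up for illustration; the payload must match the
# Avro schema configured for the first OUT_TOPIC.
pc = KafkaPC(config_path="config.yml", config_section="general")

while True:
    msg = pc.consumer.poll(1.0)        # plain confluent_kafka Consumer underneath
    if msg is None or msg.error():
        continue
    value = pc.decode_msg(msg)         # Avro- or string-deserialized, depending on the topic
    pc.send_msg({"result": value})     # serialized with the first OUT_TOPIC's Avro schema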
def main(args):
    topic = args.topic
    outputtopic = args.outputtopic

    schema_enriched_event_str = EnrichedEventSchema
    schema_metrics = MetricSchema

    sr_conf = {'url': args.schema_registry}
    schema_registry_client = SchemaRegistryClient(sr_conf)

    string_deserializer = StringDeserializer('utf_8')
    avro_serializer = AvroSerializer(schema_metrics, schema_registry_client)

    producer_conf = {
        'bootstrap.servers': args.bootstrap_servers,
        'key.serializer': StringSerializer('utf_8'),
        'value.serializer': avro_serializer
    }
    producer = SerializingProducer(producer_conf)

    avro_deserializer = AvroDeserializer(schema_enriched_event_str, schema_registry_client)

    consumer_conf = {
        'bootstrap.servers': args.bootstrap_servers,
        'key.deserializer': string_deserializer,
        'value.deserializer': avro_deserializer,
        'group.id': args.group + str(random.Random()),
        'auto.offset.reset': "earliest"
    }

    consumer = DeserializingConsumer(consumer_conf)
    consumer.subscribe([topic])

    cluster = Cluster([args.host])
    session = cluster.connect("datascience")
    session.row_factory = dict_factory

    client_influxdb = InfluxDBClient('35.181.155.182', 8086, "dbsaleh2")

    while True:
        try:
            # SIGINT can't be handled when polling, limit timeout to 1 second.
            start = time.time()
            msg = consumer.poll(1.0)
            if msg is None:
                continue

            evt = msg.value()
            idPersonne = evt["idPersonne"]

            rows = session.execute(GET_ENRICHED_EVENT_QUERY, (idPersonne, ))
            if rows:
                # print(idPersonne, f"rows={rows.all().__len__()}")
                # stat_process(idPersonne, rows)
                # som = rec_process(rows, 0, 0)
                # print("some", som)
                # row["csp"] = get_value_column_enriched_data(row, "csp")
                # row["paysNaissance"] = get_value_column_enriched_data(row, "paysNaissance")
                #
                # get_value_column_event_content
                # row['appVersion'] = get_value_column_event_content(row, "appVersion")
                # row['montant'] = get_value_column_event_content(row, "montant")
                # row['androidID'] = get_value_column_event_content(row, "androidID")
                # del rows[0]['eventContent']

                elapsed_time = time.time() - start
                # producer.produce(topic=outputtopic, key=str(uuid4()), value={'metricName': "hystorize", 'time': elapsed_time}, on_delivery=delivery_report)
                # producer.flush()
        except Exception:
            print('Exception')
            continue

        metrics = [{
            "measurement": "metrics",
            "fields": {
                "metricName": "score",
                "timeforscore": elapsed_time
            }
        }]

        print(elapsed_time)
        client_influxdb.write_points(metrics, database="dbsaleh2")

        producer.produce(topic=outputtopic,
                         value={'metricName': "score", 'time': elapsed_time},
                         on_delivery=delivery_report)
        producer.flush()

    consumer.close()
def __init__(self, record: KafkaRecord, schema_registry_config: dict):
    self._record = record
    self._key_fields = record.key_fields
    self._key_included = record.include_key
    self._schema_registry_client = SchemaRegistryClient(schema_registry_config)
def main(args):
    topic = args.topic
    outputtopic = args.outputtopic

    schema_str = EventSchema
    schema_enriched_event_str = EnrichedEventSchema

    sr_conf = {'url': args.schema_registry}
    schema_registry_client = SchemaRegistryClient(sr_conf)

    avro_deserializer = AvroDeserializer(schema_str, schema_registry_client)
    string_deserializer = StringDeserializer('utf_8')
    avro_serializer = AvroSerializer(schema_enriched_event_str, schema_registry_client)

    consumer_conf = {
        'bootstrap.servers': args.bootstrap_servers,
        'key.deserializer': string_deserializer,
        'value.deserializer': avro_deserializer,
        'group.id': args.group + str(random.Random()),
        'auto.offset.reset': "earliest"
    }

    consumer = DeserializingConsumer(consumer_conf)
    consumer.subscribe([topic])

    cluster = Cluster([args.host])
    session = cluster.connect("datascience")
    session.row_factory = dict_factory

    producer_conf = {
        'bootstrap.servers': args.bootstrap_servers,
        'key.serializer': StringSerializer('utf_8'),
        'value.serializer': avro_serializer
    }
    producer = SerializingProducer(producer_conf)

    loop = asyncio.get_event_loop()

    while True:
        try:
            # SIGINT can't be handled when polling, limit timeout to 1 second.
            start = time.time()
            msg = consumer.poll(1.0)
            if msg is None:
                continue

            evt = msg.value()
            print("msg=>", evt)

            def enrich(evt):
                print("evt", evt)
                if evt is not None:
                    print("retrieved from kafka")
                    row = session.execute(
                        GET_ENRICHED_DATA_QUERY,
                        (evt["EventHeader"]["acteurDeclencheur"]["idPersonne"], )).one()
                    if row:
                        evt['EnrichedData'] = row
                        # evt['EventBusinessContext'] = evt["EventBusinessContext"][1]
                        EnrichedEvent = {
                            "eventId": evt["EventHeader"]["eventId"],
                            "dateTimeRef": evt["EventHeader"]["dateTimeRef"],
                            "nomenclatureEv": evt["EventHeader"]["nomenclatureEv"],
                            "canal": evt["EventHeader"]["canal"],
                            "media": evt["EventHeader"]["media"],
                            "schemaVersion": evt["EventHeader"]["schemaVersion"],
                            "headerVersion": evt["EventHeader"]["headerVersion"],
                            "serveur": evt["EventHeader"]["serveur"],
                            "adresseIP": evt["EventHeader"]["acteurDeclencheur"]["adresseIP"],
                            "idTelematique": evt["EventHeader"]["acteurDeclencheur"]["idTelematique"],
                            "idPersonne": evt["EventHeader"]["acteurDeclencheur"]["idPersonne"],
                            "dateNaissance": row["dateNaissance"],
                            "paysResidence": row["paysResidence"],
                            "paysNaissance": row["paysNaissance"],
                            "revenusAnnuel": row["revenusAnnuel"],
                            "csp": row["csp"],
                            "EventBusinessContext": evt["EventBusinessContext"]
                        }

                        producer.produce(topic=outputtopic,
                                         key=str(uuid4()),
                                         value=EnrichedEvent,
                                         on_delivery=delivery_report)
                        producer.flush()

            async_enrich = async_wrap(enrich)
            loop.run_until_complete(async_enrich(evt))

        except Exception:
            print('Exception')
            continue

    consumer.close()
# For a complete example; see https://github.com/confluentinc/confluent-kafka-python/blob/master/examples/protobuf_producer.py
from uuid import uuid4

# Protobuf generated class; resides at ./meal_pb2.py
# Create it by running
# protoc -I=. --python_out=. ./meal.proto
import meal_pb2

from confluent_kafka import SerializingProducer
from confluent_kafka.serialization import StringSerializer
from confluent_kafka.schema_registry import SchemaRegistryClient
from confluent_kafka.schema_registry.protobuf import ProtobufSerializer

topic = 'DEMO_MEAL_PROTO'

schema_registry_client = SchemaRegistryClient({'url': 'http://localhost:8081'})
protobuf_serializer = ProtobufSerializer(meal_pb2.Meal, schema_registry_client)

producer_conf = {
    'bootstrap.servers': 'localhost:9092',
    'key.serializer': StringSerializer('utf_8'),
    'value.serializer': protobuf_serializer
}

producer = SerializingProducer(producer_conf)
producer.poll(0.0)

mybeer = meal_pb2.Meal.DrinkItems(drink_name="beer")
mywine = meal_pb2.Meal.DrinkItems(drink_name="wine")
from enum import Enum

from confluent_kafka.admin import AdminClient
from confluent_kafka.schema_registry import SchemaRegistryClient


class Topics(Enum):
    TOPIC_1 = "topic1"
    TOPIC_2 = "topic2"


SCHEMA_REGISTRY_URL = "http://127.0.0.1:8081"
SCHEMA_REGISTRY_CLIENT = SchemaRegistryClient({'url': SCHEMA_REGISTRY_URL})

BOOTSTRAP_SERVERS = "127.0.0.1:9092"
NUM_PARTITIONS = 3
ADMIN_CLIENT = AdminClient({'bootstrap.servers': BOOTSTRAP_SERVERS})
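# Illustrative use of the constants above: create every topic in the Topics enum with the
# AdminClient. NewTopic comes from confluent_kafka.admin; a replication factor of 1 is assumed.
from confluent_kafka.admin import NewTopic

futures = ADMIN_CLIENT.create_topics([
    NewTopic(t.value, num_partitions=NUM_PARTITIONS, replication_factor=1)
    for t in Topics
])
for topic, future in futures.items():
    try:
        future.result()  # raises if creation failed (e.g. the topic already exists)
        print(f"Created topic {topic}")
    except Exception as e:
        print(f"Failed to create topic {topic}: {e}")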
def main(args):
    topic = args.topic

    schema_str = """
    {
      "$schema": "http://json-schema.org/draft-07/schema#",
      "title": "User",
      "description": "A Confluent Kafka Python User",
      "type": "object",
      "properties": {
        "name": {
          "description": "User's name",
          "type": "string"
        },
        "favorite_number": {
          "description": "User's favorite number",
          "type": "number",
          "exclusiveMinimum": 0
        },
        "favorite_color": {
          "description": "User's favorite color",
          "type": "string"
        }
      },
      "required": ["name", "favorite_number", "favorite_color"]
    }
    """

    schema_registry_conf = {'url': args.schema_registry}
    schema_registry_client = SchemaRegistryClient(schema_registry_conf)

    json_serializer = JSONSerializer(schema_str, schema_registry_client, user_to_dict)

    producer_conf = {
        'bootstrap.servers': args.bootstrap_servers,
        'key.serializer': StringSerializer('utf_8'),
        'value.serializer': json_serializer
    }

    producer = SerializingProducer(producer_conf)

    print("Producing user records to topic {}. ^C to exit.".format(topic))
    while True:
        # Serve on_delivery callbacks from previous calls to produce()
        producer.poll(0.0)
        try:
            user_name = input("Enter name: ")
            user_address = input("Enter address: ")
            user_favorite_number = int(input("Enter favorite number: "))
            user_favorite_color = input("Enter favorite color: ")
            user = User(name=user_name,
                        address=user_address,
                        favorite_color=user_favorite_color,
                        favorite_number=user_favorite_number)
            producer.produce(topic=topic, key=str(uuid4()), value=user,
                             on_delivery=delivery_report)
        except KeyboardInterrupt:
            break
        except ValueError:
            print("Invalid input, discarding record...")
            continue

    print("\nFlushing records...")
    producer.flush()
args = ccloud_lib.parse_args()
config_file = args.config_file
topic = args.topic
conf = ccloud_lib.read_ccloud_config(config_file)

# Create topic if needed
ccloud_lib.create_topic(conf, topic)

# for full list of configurations, see:
# https://docs.confluent.io/platform/current/clients/confluent-kafka-python/#schemaregistryclient
schema_registry_conf = {
    'url': conf['schema.registry.url'],
    'basic.auth.user.info': conf['basic.auth.user.info']
}
schema_registry_client = SchemaRegistryClient(schema_registry_conf)

name_avro_serializer = AvroSerializer(ccloud_lib.name_schema,
                                      schema_registry_client,
                                      ccloud_lib.Name.name_to_dict)
count_avro_serializer = AvroSerializer(ccloud_lib.count_schema,
                                       schema_registry_client,
                                       ccloud_lib.Count.count_to_dict)

# for full list of configurations, see:
# https://docs.confluent.io/platform/current/clients/confluent-kafka-python/#serializingproducer
producer_conf = {
    'bootstrap.servers': conf['bootstrap.servers'],
    'sasl.mechanisms': conf['sasl.mechanisms'],
    'security.protocol': conf['security.protocol'],
    'sasl.username': conf['sasl.username'],
class KafkaPC:
    def __init__(self, config_path, config_section):
        super(KafkaPC, self).__init__()

        self.in_topic = None
        self.out_topic = None
        self.in_schema = None
        self.out_schema = None

        self.read_config(config_path, config_section)
        self.connect_schema_registry()
        self.read_topics()
        self.create_topics_on_broker()
        self.register_schemas_in_registry()
        self.create_serializer()
        self.create_deserializer()
        self.create_consumer()
        self.create_producer()

    def connect_schema_registry(self):
        MAX_RETRIES = 3
        if self.config.get("KAFKA_SCHEMA_REGISTRY_URL") is not None:
            sr_conf = {"url": self.config["KAFKA_SCHEMA_REGISTRY_URL"]}
            retries = 0
            while retries < MAX_RETRIES:
                try:
                    self.schema_registry = SchemaRegistryClient(sr_conf)
                    print("Connected to Schema Registry")
                    break
                except Exception as e:
                    retries += 1
                    print(f"Could not connect to Schema Registry, retry {retries}")
                    print({repr(e)})
                    sleep(5)
            if retries == MAX_RETRIES:
                raise ConnectionError("Could not connect to Schema Registry")
        else:
            raise ValueError("Need KAFKA_SCHEMA_REGISTRY_URL")

    def register_schemas_in_registry(self, suffix="-value"):
        MAX_RETRIES = 3
        for topic, schema in self.out_schema.items():
            subject = topic + suffix
            retries = 0
            while retries < MAX_RETRIES:
                try:
                    self.schema_registry.register_schema(subject_name=subject, schema=schema)
                    print(f"Registered schema for topic {topic} in registry")
                    break
                except Exception as e:
                    retries += 1
                    print(f"Could not register schema for topic {topic} in registry: {repr(e)}")
                    sleep(5)
            if retries == MAX_RETRIES:
                raise ConnectionError("Could not connect to Schema Registry")

    def create_topics_on_broker(self, partitions=1, replication=1):
        a = AdminClient({"bootstrap.servers": self.config["KAFKA_BROKER_URL"]})
        topic_set = set(self.out_topic)
        md = a.list_topics(timeout=10)
        broker_set = set(md.topics.values())
        diff_set = topic_set.difference(broker_set)
        new_topics = [
            NewTopic(topic, num_partitions=partitions, replication_factor=replication)
            for topic in diff_set
        ]
        fs = a.create_topics(new_topics)
        # Wait for operation to finish.
        # Timeouts are preferably controlled by passing request_timeout=15.0
        # to the create_topics() call.
        # All futures will finish at the same time.
        for topic, f in fs.items():
            try:
                f.result()  # The result itself is None
                print(f"Topic {topic} created on Broker")
            except Exception as e:
                print(f"Failed to create topic {topic} on Broker: {repr(e)}")

    def get_schema_from_registry(self, topic, suffix="-value"):
        response = None
        MAX_RETRIES = 3
        retries = 0
        while retries < MAX_RETRIES:
            try:
                schema = self.schema_registry.get_latest_version(topic + suffix)
                response = schema.schema
                print(f"Retrieved schema for topic {topic} from Registry")
                break
            except Exception as e:
                retries += 1
                print(f"Failed to get schema: {repr(e)}")
                sleep(3)
        return response

    def read_topics(self):
        if self.config.get("IN_TOPIC") and self.config.get("IN_GROUP"):
            self.in_topic = self.config["IN_TOPIC"]
            self.in_schema = {}
            for topic in self.in_topic:
                # try to get schema from registry
                schema = self.get_schema_from_registry(topic)
                # if no schema is found a simple string deserializer will be used, see line 87
                if schema is None:
                    self.in_schema[topic] = None
                else:
                    self.in_schema[topic] = schema

        if self.config.get("OUT_TOPIC"):
            self.out_topic = list(self.config["OUT_TOPIC"].keys())
            self.out_schema = {}
            for topic, schema in self.config["OUT_TOPIC"].items():
                self.out_schema[topic] = self.read_avro_schema(schema)

    def create_serializer(self):
        self.serializer = {}
        if self.out_topic is not None:
            for topic in self.out_topic:
                schema_str = self.out_schema[topic].schema_str
                self.serializer[topic] = AvroSerializer(schema_str, self.schema_registry)

    def create_deserializer(self):
        self.deserializer = {}
        if self.in_topic is not None:
            for topic in self.in_topic:
                if self.in_schema[topic] is None:
                    self.deserializer[topic] = StringDeserializer("utf_8")
                else:
                    schema_str = self.in_schema[topic].schema_str
                    self.deserializer[topic] = AvroDeserializer(schema_str, self.schema_registry)

    def create_consumer(self):
        if self.config.get("IN_TOPIC") and self.config.get("IN_GROUP"):
            consumer_conf = {
                "bootstrap.servers": self.config["KAFKA_BROKER_URL"],
                "group.id": self.config["IN_GROUP"],
                "auto.offset.reset": "earliest",
            }
            self.consumer = Consumer(consumer_conf)
            self.consumer.subscribe(self.in_topic)

    def create_producer(self):
        if self.config.get("OUT_TOPIC"):
            producer_conf = {"bootstrap.servers": self.config["KAFKA_BROKER_URL"]}
            self.producer = Producer(producer_conf)

    def read_config(self, config_path, config_section):
        self.config = {}
        if config_path is not None and config_section is not None:
            config_section = config_section.replace(" ", "").split(",")
        else:
            raise ValueError("Configuration requires config_path and config_section")
        try:
            with open(config_path, "r") as ymlfile:
                config = yaml.load(ymlfile, Loader=yaml.FullLoader)
                for section in config_section:
                    for key, value in config[section].items():
                        self.config[key] = value
        except Exception as e:
            print(f"Failed to read the config: {repr(e)}")
            sys.exit()

    def read_avro_schema(self, schema):
        with open(schema, "r") as f:
            schema_str = f.read()
        avro_schema_str = Schema(schema_str, "AVRO")
        return avro_schema_str

    def decode_msg(self, msg):
        try:
            topic = msg.topic()
            value = self.deserializer[topic](msg.value(), None)
            return value
        except Exception as e:
            print(f"Error decoding avro data: {repr(e)}")
            # sys.exit()

    def send_msg(self, message, partition=0, topic=None):
        # if no topic is provided, the first topic in the list is used as default
        if topic is None:
            out_topic = self.out_topic[0]
        else:
            out_topic = topic

        # encode the data with the specified Avro out_schema
        ctx = SerializationContext(out_topic, MessageField.VALUE)
        ser_message = self.serializer[out_topic](message, ctx)

        try:
            self.producer.produce(topic=out_topic, value=ser_message, partition=partition)
        except Exception as e:
            print(f"Error sending data to Kafka: {repr(e)}")
def main(
    name: str,
    shutdown: multiprocessing.Value,
    request_queue: multiprocessing.Queue,
    config: Config
) -> None:
    """Execute tasks forever.

    This method is the entrypoint for the worker which executes the
    monitoring tasks. It is executed in a dedicated child process.
    """
    if config.verbose:
        logging.basicConfig(level=logging.INFO)
    log = logging.getLogger(name)
    log.info(f"Starting process {name}.")

    # SIGINT will be delivered to the whole process group. We'll need to ignore
    # it in the worker processes to give them the opportunity to finish any
    # pending work.
    signal.signal(signal.SIGINT, signal.SIG_IGN)

    schema_registry_client = SchemaRegistryClient({'url': config.schema_registry})
    avro_serializer = AvroSerializer(Report.SCHEMA, schema_registry_client, Report.asdict)

    producer = SerializingProducer({
        'client.id': name,
        'bootstrap.servers': config.bootstrap_servers,
        'key.serializer': StringSerializer('utf_8'),
        'security.protocol': 'SSL',
        'ssl.key.location': config.auth_key,
        'ssl.certificate.location': config.auth_cert,
        'ssl.ca.location': config.ca_cert,
        'value.serializer': avro_serializer,
    })

    err = _report_error(log)

    while not shutdown.value:
        producer.poll(0.0)
        try:
            now = datetime.now()
            req = request_queue.get(timeout=1)
            curl = pycurl.Curl()
            curl.setopt(pycurl.URL, req)
            curl.setopt(pycurl.CONNECTTIMEOUT, 30)
            curl.setopt(pycurl.TIMEOUT, 300)
            curl.setopt(pycurl.NOSIGNAL, 1)
            curl.setopt(pycurl.WRITEFUNCTION, len)
            try:
                curl.perform()
                report = Report(
                    timestamp=now.timestamp(),
                    url=req,
                    code=int(curl.getinfo(pycurl.RESPONSE_CODE)),
                    namelookup=curl.getinfo(pycurl.NAMELOOKUP_TIME),
                    connect=curl.getinfo(pycurl.CONNECT_TIME),
                    appconnect=curl.getinfo(pycurl.APPCONNECT_TIME),
                    pretransfer=curl.getinfo(pycurl.PRETRANSFER_TIME),
                    starttransfer=curl.getinfo(pycurl.STARTTRANSFER_TIME),
                    total=curl.getinfo(pycurl.TOTAL_TIME),
                )
                log.info(str(report))
                producer.produce(topic=config.topic, key=req, value=report, on_delivery=err)
            except TypeError:
                # It'll never work if we misconfigure PycURL.
                raise
            except pycurl.error as exc:
                # TODO: Record the failure in Kafka.
                log.warning(f"Failed to retrieve {req}", exc)
            # TODO: Handle exceptions from the Kafka Producer.
            finally:
                curl.close()
        except queue.Empty:
            log.debug("No request to process.")

    # Flush any results that haven't been committed yet.
    log.warning(f"Process {name} shutting down.")
    producer.flush()
def MyProducer(config):
    schema_str = """
    {
      "$schema": "http://json-schema.org/draft-04/schema#",
      "title": "covid",
      "type": "object",
      "properties": {
        "ID": {"type": "string"},
        "Country": {"type": "string"},
        "CountryCode": {"type": "string"},
        "Slug": {"type": "string"},
        "NewConfirmed": {"type": "integer"},
        "TotalConfirmed": {"type": "integer"},
        "NewDeaths": {"type": "integer"},
        "TotalDeaths": {"type": "integer"},
        "NewRecovered": {"type": "integer"},
        "TotalRecovered": {"type": "integer"},
        "Date": {"type": "string"},
        "Premium": {"type": "object"}
      },
      "required": ["ID", "Country", "CountryCode", "Slug", "NewConfirmed",
                   "TotalConfirmed", "NewDeaths", "TotalDeaths", "NewRecovered",
                   "TotalRecovered", "Date", "Premium"]
    }
    """

    schema_registry_conf = {
        'url': config["schema.registry.url"],
        'basic.auth.user.info': config['basic.auth.user.info']
    }
    schema_registry_client = SchemaRegistryClient(schema_registry_conf)

    jsons = JSONSerializer(schema_str, schema_registry_client, lambda f, ctx: f)

    producer_conf = {
        'bootstrap.servers': config['bootstrap.servers'],
        'key.serializer': StringSerializer('utf_8'),
        'value.serializer': jsons,
        'security.protocol': config["security.protocol"],
        "sasl.mechanisms": config["sasl.mechanisms"],
        "sasl.username": config["sasl.username"],
        "sasl.password": config["sasl.password"]
    }

    return SerializingProducer(producer_conf)
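# Hypothetical usage of MyProducer(). The topic name and field values are illustrative,
# but the record layout follows the "covid" JSON schema defined above.
producer = MyProducer(config)

record = {
    "ID": "example-id", "Country": "France", "CountryCode": "FR", "Slug": "france",
    "NewConfirmed": 100, "TotalConfirmed": 100000,
    "NewDeaths": 1, "TotalDeaths": 1000,
    "NewRecovered": 50, "TotalRecovered": 90000,
    "Date": "2021-01-01T00:00:00Z", "Premium": {},
}
producer.produce(topic="covid-stats", key=record["ID"], value=record)
producer.flush()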