def main(args):
    """Read delimited lines from stdin and produce them as Kafka records.

    A line containing the delimiter becomes a key/value record; any other
    line is produced value-only.  Runs until interrupted with ^C.
    """
    topic = args.topic
    delimiter = args.delimiter
    producer = SerializingProducer(producer_config(args))

    print('Producing records to topic {}. ^C to exit.'.format(topic))
    while True:
        # Serve on_delivery callbacks from previous calls to produce()
        producer.poll(0.0)
        try:
            parts = input(">").split(delimiter)
            if len(parts) == 2:
                producer.produce(topic=topic, key=parts[0], value=parts[1],
                                 on_delivery=delivery_report)
            else:
                producer.produce(topic=topic, value=parts[0],
                                 on_delivery=delivery_report)
        except KeyboardInterrupt:
            break

    print('\nFlushing {} records...'.format(len(producer)))
    producer.flush()
def _commit(self) -> None:
    """
    Retrieves the current offset by calling
    :meth:`pyconnect.pyconnectsource.PyConnectSource.get_index` and publishes it
    to the offset topic that is defined in this sources
    :class:`pyconnect.config.SourceConfig` instance.
    """
    current_index = self.get_index()
    # Derive an Avro schema from the index value itself.
    value_serializer = AvroSerializer(
        schema_registry_client=self.schema_registry_client,
        schema_str=to_value_schema(current_index))

    conf = {
        "bootstrap.servers": self.config["bootstrap.servers"],
        "key.serializer": None,
        "value.serializer": value_serializer,
        **self.config["kafka_opts"],
        **self.config["kafka_producer_opts"],
    }
    offset_producer = SerializingProducer(conf)

    offset_producer.produce(key=None, value=current_index,
                            topic=self.config["offset_topic"])
    offset_producer.flush()
def write_to_kafka(bootstrap_servers, schema_registry_url, topic_name, data):
    """Publish one weather record to Kafka, Avro-encoded via Schema Registry.

    Args:
        bootstrap_servers: Kafka bootstrap server list.
        schema_registry_url: Confluent Schema Registry URL.
        topic_name: destination topic.
        data: dict with at least 'lat' and 'lon' keys; datetime values are
            converted to strings by DatetimeEncoder during the JSON round-trip.
    """
    print("Kafka Version : ", confluent_kafka.version(), confluent_kafka.libversion())

    schema_registry_conf = {'url': schema_registry_url}
    schema_registry_client = SchemaRegistryClient(schema_registry_conf)
    value_avro_serializer = AvroSerializer(schemas.weather_source_schema,
                                           schema_registry_client)
    string_serializer = StringSerializer('utf-8')

    conf = {'bootstrap.servers': bootstrap_servers,
            'client.id': socket.gethostname(),
            'on_delivery': delivery_report,
            'key.serializer': string_serializer,
            'value.serializer': value_avro_serializer}
    avroProducer = SerializingProducer(conf)

    # BUG FIX: datetime.date cannot be concatenated with str (TypeError);
    # convert today's date to a string before building the key.
    key = str(datetime.date.today()) + '~' + str(data['lat']) + '~' + str(data['lon'])

    # Round-trip through JSON so datetime values are normalized by DatetimeEncoder.
    message = json.dumps(data, cls=DatetimeEncoder)
    print("Key Type : ", type(key))
    print("Value Type : ", type(json.loads(message)))

    avroProducer.produce(topic=topic_name, key=key, value=json.loads(message))
    avroProducer.flush()
def main(args):
    """Interactively produce UTF-8 string records (key:value or value-only)
    over a SASL-authenticated connection."""
    topic = args.topic
    delimiter = args.delimiter

    serializer = StringSerializer('utf_8')
    producer_conf = {'bootstrap.servers': args.bootstrap_servers,
                     'key.serializer': serializer,
                     'value.serializer': serializer}
    producer_conf.update(sasl_conf(args))
    producer = SerializingProducer(producer_conf)

    print("Producing records to topic {}. ^C to exit.".format(topic))
    while True:
        # Serve on_delivery callbacks from previous calls to produce()
        producer.poll(0.0)
        try:
            fields = input(">").split(delimiter)
            if len(fields) == 2:
                producer.produce(topic=topic, key=fields[0], value=fields[1],
                                 on_delivery=delivery_report)
            else:
                producer.produce(topic=topic, value=fields[0],
                                 on_delivery=delivery_report)
        except KeyboardInterrupt:
            break

    print("\nFlushing {} records...".format(len(producer)))
    producer.flush()
def main(args):
    """Interactively collect User fields and produce Avro-serialized records."""
    topic = args.topic
    schema_str = """
    {
        "namespace": "confluent.io.examples.serialization.avro",
        "name": "User",
        "type": "record",
        "fields": [
            {"name": "name", "type": "string"},
            {"name": "favorite_number", "type": "int"},
            {"name": "favorite_color", "type": "string"}
        ]
    }
    """
    schema_registry_client = SchemaRegistryClient({'url': args.schema_registry})
    avro_serializer = AvroSerializer(schema_registry_client,
                                     schema_str,
                                     user_to_dict)

    producer = SerializingProducer({
        'bootstrap.servers': args.bootstrap_servers,
        'key.serializer': StringSerializer('utf_8'),
        'value.serializer': avro_serializer,
    })

    print("Producing user records to topic {}. ^C to exit.".format(topic))
    while True:
        # Serve on_delivery callbacks from previous calls to produce()
        producer.poll(0.0)
        try:
            # Prompts run left-to-right in the same order as the original.
            user = User(name=input("Enter name: "),
                        address=input("Enter address: "),
                        favorite_number=int(input("Enter favorite number: ")),
                        favorite_color=input("Enter favorite color: "))
            producer.produce(topic=topic, key=str(uuid4()), value=user,
                             on_delivery=delivery_report)
        except KeyboardInterrupt:
            break
        except ValueError:
            print("Invalid input, discarding record...")
            continue

    print("\nFlushing records...")
    producer.flush()
class ProtoKafkaProducer:
    """Kafka producer that serializes message values as Protobuf via the
    Confluent Schema Registry and flushes after every message."""

    def __init__(self, config_env):
        """
        Args:
            config_env: dict-like configuration providing topic, broker,
                buffering and schema-registry settings.
        """
        self.config = config_env
        self.topic_name = self.config["kafka_produce_topic"]
        conf = {
            'bootstrap.servers': self.config["bootstrap_servers"],
            'message.max.bytes': self.config["kafkaMaxMessageBytes"],
            'queue.buffering.max.ms': self.config["queue.buffering.max.ms"],
            'queue.buffering.max.messages': self.config["queue.buffering.max.messages"],
            'key.serializer': StringSerializer('utf_8'),
            'value.serializer': self.__protobuf_serializer()
        }
        self.producer = SerializingProducer(conf)

    def on_delivery(self, err, msg):
        """Per-message delivery callback."""
        # BUG FIX: print() does not interpolate '%s' the way logging does;
        # the old code printed the literal '%s' followed by the arguments.
        if err:
            print("Message failed delivery, error: %s" % err)
        else:
            print("Message delivered to %s on partition %s"
                  % (msg.topic(), msg.partition()))

    def __protobuf_serializer(self):
        """Build a ProtobufSerializer for the configured message type."""
        schema_registry_conf = {'url': self.config['schemaregistry.url']}
        schema_registry_client = SchemaRegistryClient(schema_registry_conf)
        _proto_conf = {
            'auto.register.schemas': self.config['auto.register.schemas'],
        }
        return ProtobufSerializer(self.config['proto_msg_type'],
                                  schema_registry_client,
                                  conf=_proto_conf)

    def produce(self, kafka_msg, kafka_key):
        """Produce one message (blocking flush); errors are printed, not raised."""
        try:
            self.producer.produce(topic=self.topic_name,
                                  value=kafka_msg,
                                  key=kafka_key,
                                  on_delivery=self.on_delivery)
            self.producer.flush()
        except Exception as e:
            print("Error during producing to kafka topic. Stacktrace is %s" % e)
class NewsScheduler(object):
    """Polls RSS feeds and enqueues newly-updated entries onto a Kafka topic.

    A per-spider TimeCheckpoint persists the last-seen feed update time so
    already-queued items are not re-produced across restarts.
    """

    def __init__(self, bootstrap_servers, rss_feeds, topic='crawl-queue',
                 time_checkpoint_fn_base='scheduler_checkpoint'):
        # rss_feeds maps feed URL -> spider name (see run_loop / process_feed).
        self.bootstrap_servers = bootstrap_servers
        self.topic = topic
        self.feeds = rss_feeds
        # One checkpoint file per spider, derived from the base filename.
        self.time_checkpoints = dict()
        for spider_name in self.feeds.values():
            fn = f'{time_checkpoint_fn_base}_{spider_name}.txt'
            fn = fn.replace('/', '_')  # we don't want / in our pathnames.
            self.time_checkpoints[spider_name] = TimeCheckpoint(fn=fn)
        producer_conf = {
            'bootstrap.servers': self.bootstrap_servers,
            'key.serializer': StringSerializer('utf_8'),
            'value.serializer': _json_serializer_wrapper
        }
        self.producer = SerializingProducer(producer_conf)

    def process_feed(self, feed_url, spider_name, flush=False):
        """Produce every feed entry newer than the spider's checkpoint.

        Args:
            feed_url: RSS feed URL to parse.
            spider_name: spider tag attached to each produced item.
            flush: when True, block until all produced messages are delivered.
        """
        log.info(f"Processing feed '{feed_url}' via topic '{self.topic}'.")
        rss_feed = feedparser.parse(feed_url)
        for item in rss_feed.entries:
            item['spider'] = spider_name
            item_updated_time = struct_time_to_datetime(item.updated_parsed)
            # Only items strictly newer than the checkpoint are queued.
            if item_updated_time > self.time_checkpoints[spider_name].checkpoint:
                log.info(f"New item: {item['title']}")
                self.producer.produce(topic=self.topic,
                                      key=str(uuid4()),
                                      value=dict(item))
        # NOTE(review): the checkpoint is advanced to the feed-level update
        # time even when no entries were produced — confirm this is intended.
        self.time_checkpoints[spider_name].checkpoint = struct_time_to_datetime(
            rss_feed.feed.updated_parsed)
        if flush:
            self.producer.flush()

    def run_loop(self, interval):
        # Cycle through all feeds forever, sleeping `interval` seconds
        # between consecutive feeds (not between full cycles).
        for feed, spider in itertools.cycle(self.feeds.items()):
            self.process_feed(feed, spider)
            time.sleep(interval)
def send_record(args):
    """Sends Record using a SerializingProducer & AvroSerializer.

    Args:
        args: parsed CLI args providing topic, schema_registry,
            bootstrap_servers, record_value (7 comma-separated fields) and an
            optional record_key.

    Raises:
        ValueError: when record_value does not contain exactly 7 fields
            (ValueError subclasses Exception, so existing callers still match).
    """
    topic = args.topic.rstrip()

    schema_registry_client = SchemaRegistryClient({'url': args.schema_registry})
    avro_serializer = AvroSerializer(schema_registry_client, DATA_SCHEMA, data_to_dict)

    producer = SerializingProducer({
        "bootstrap.servers": args.bootstrap_servers,
        "key.serializer": StringSerializer('utf_8'),
        "value.serializer": avro_serializer
    })

    split_incoming_data = args.record_value.split(',')
    if len(split_incoming_data) != 7:  # Data Format Check
        print('** Error: Insufficient Incoming Data: ', split_incoming_data)
        # BUG FIX: raise a specific, descriptive exception instead of bare Exception.
        raise ValueError('record_value must contain exactly 7 comma-separated fields')
    try:  # Data Format Check
        incoming_data = {
            'envId': int(split_incoming_data[0]),
            'whenCollected': str(split_incoming_data[1]),
            'timeLightOnMins': int(split_incoming_data[2]),
            'humidity': int(split_incoming_data[3]),
            'soilMoisture': int(split_incoming_data[4]),
            'temperature': int(split_incoming_data[5]),
            'waterConsumption': int(split_incoming_data[6])
        }
    except Exception as error:
        print('** Error Creating Dict of Data: ', error)
        # BUG FIX: previously the error was swallowed and a NameError on
        # `incoming_data` followed below; propagate the real failure instead.
        raise

    print(f'Producing data records to topic {topic}. ^C to exit.')
    producer.poll(1)
    try:
        key = args.record_key if args.record_key else str(uuid4())
        data_object = Data(incoming_data)
        print('\t-Producing Avro record. . .')
        producer.produce(topic=topic, key=key, value=data_object,
                         on_delivery=delivery_report)
    except ValueError:
        print('\t-Invalid input, discarding record. . .')

    print('\nFlushing records. . .')
    producer.flush()
class KafkaAvroProducer:
    """Publishes Avro-serialized events using configuration supplied by
    EventBackboneConfig; keys are plain UTF-8 strings."""

    def __init__(self, producer_name, value_schema, groupID='KafkaAvroProducer'):
        # Prefix used to tag all log output from this producer instance.
        self.logging_prefix = '[' + producer_name + '][KafkaAvroProducer]'
        # Schema Registry connection and client.
        self.schema_registry_conf = EventBackboneConfig.getSchemaRegistryConf()
        self.schema_registry_client = SchemaRegistryClient(self.schema_registry_conf)
        # Key serializer (UTF-8 string) and value serializer (Avro).
        self.key_serializer = StringSerializer('utf_8')
        self.value_serializer = AvroSerializer(value_schema,
                                               self.schema_registry_client)
        # Producer configuration comes entirely from EventBackboneConfig.
        self.producer_conf = EventBackboneConfig.getProducerConfiguration(
            groupID, self.key_serializer, self.value_serializer)
        EventBackboneConfig.printProducerConfiguration(
            self.logging_prefix, self.producer_conf,
            self.schema_registry_conf['url'])
        self.producer = SerializingProducer(self.producer_conf)

    def delivery_report(self, err, msg):
        """ Called once for each message produced to indicate delivery result.
            Triggered by poll() or flush(). """
        if err is not None:
            print('[KafkaAvroProducer] - [ERROR] - Message delivery failed: {}'.format(err))
        else:
            print('[KafkaAvroProducer] - Message delivered to {} [{}]'.format(
                msg.topic(), msg.partition()))

    def publishEvent(self, key, value, topicName='kafka-avro-producer'):
        """Produce one Avro message and block until it is delivered."""
        self.producer.produce(topic=topicName, value=value, key=key,
                              on_delivery=self.delivery_report)
        self.producer.flush()
class DatahubKafkaEmitter:
    """Emits MetadataChangeEvents to Kafka as Avro, keyed by snapshot URN."""

    def __init__(self, config: KafkaEmitterConfig):
        self.config = config

        registry_conf = {
            "url": self.config.connection.schema_registry_url,
            **self.config.connection.schema_registry_config,
        }
        registry_client = SchemaRegistryClient(registry_conf)

        def mce_to_dict(mce: MetadataChangeEvent,
                        ctx: SerializationContext) -> dict:
            # Convert the MCE to its tuple-encoded dict form for Avro.
            return mce.to_obj(tuples=True)

        serializer = AvroSerializer(
            schema_str=SCHEMA_JSON_STR,
            schema_registry_client=registry_client,
            to_dict=mce_to_dict,
        )

        self.producer = SerializingProducer({
            "bootstrap.servers": self.config.connection.bootstrap,
            "key.serializer": StringSerializer("utf_8"),
            "value.serializer": serializer,
            **self.config.connection.producer_config,
        })

    def emit_mce_async(
        self,
        mce: MetadataChangeEvent,
        callback: Callable[[Exception, str], None],
    ) -> None:
        """Queue one MCE; `callback` fires on delivery success or failure."""
        # Call poll to trigger any callbacks on success / failure of previous writes
        self.producer.poll(0)
        self.producer.produce(
            topic=self.config.topic,
            key=mce.proposedSnapshot.urn,
            value=mce,
            on_delivery=callback,
        )

    def flush(self) -> None:
        """Block until every queued message has been delivered."""
        self.producer.flush()
def main(args):
    """Interactively produce user-quote records with Avro key and value.

    Reads the key/value Avro schemas from disk, then loops reading fields
    from stdin until interrupted.
    """
    topic = args.topic

    # BUG FIX: open(...).read() leaked the file handles; use context managers
    # so both schema files are closed deterministically.
    with open('schema/KeySchema.avsc', "r") as key_schema_file:
        key_schema_str = key_schema_file.read()
    with open('schema/ValueSchema.avsc', "r") as value_schema_file:
        value_schema_str = value_schema_file.read()

    schema_registry_client = SchemaRegistryClient({'url': args.schema_registry})
    avro_key_serializer = AvroSerializer(key_schema_str, schema_registry_client,
                                         user_quote_key_to_dict)
    avro_value_serializer = AvroSerializer(value_schema_str, schema_registry_client,
                                           user_quote_value_to_dict)

    producer = SerializingProducer({'bootstrap.servers': args.bootstrap_servers,
                                    'key.serializer': avro_key_serializer,
                                    'value.serializer': avro_value_serializer})

    print("Producing user records to topic {}. ^C to exit.".format(topic))
    while True:
        # Serve on_delivery callbacks from previous calls to produce()
        producer.poll(0.0)
        try:
            user_id = input("Enter User ID: ")
            product_id = input("Enter Product ID: ")
            quoted_price = input("Enter price: ")
            quoted_quantity = int(input("Enter the desired quantity: "))
            user_note = input("Enter additional note: ")

            user_quote_key = UserQuoteKey(user_id=int(user_id))
            user_quote_value = UserQuoteValue(product_id=int(product_id),
                                              quoted_price=int(quoted_price),
                                              quoted_quantity=quoted_quantity,
                                              user_note=user_note)
            producer.produce(topic=topic,
                             key=user_quote_key,
                             value=user_quote_value,
                             on_delivery=delivery_report)
        except KeyboardInterrupt:
            break
        except ValueError:
            print("Invalid input, discarding record...")
            continue

    print("\nFlushing records...")
    producer.flush()
class VideoProducer:
    """Streams frames from a video reader into a Kafka topic for ~60 seconds,
    then reports messages-per-second throughput."""

    def __init__(self, topic='test', client_id='producer1',
                 bootstrap_servers='localhost:9092', video_reader=None):
        self.topic = topic
        self.video_reader = video_reader
        # The reader supplies its own value serializer for raw frames.
        self.kafka_producer = SerializingProducer({
            'bootstrap.servers': bootstrap_servers,
            'value.serializer': self.video_reader.serialize,
            'queue.buffering.max.messages': 500000
        })
        self.delivered_records = 0
        self.start_time = 0

    def acked(self, err, msg):
        """Delivery report handler called on successful or failed delivery of message"""
        if err is not None:
            print("Failed to deliver message: {}".format(err))
        else:
            self.delivered_records += 1

    def produce(self):
        """Read and produce frames until 60 s elapse or the reader goes offline."""
        start_time = time.time()
        while self.video_reader.online and time.time() - start_time < 60:
            # Serve delivery callbacks from earlier produce() calls.
            self.kafka_producer.poll(0.0)
            frame = self.video_reader.read()
            if frame is not None:
                self.kafka_producer.produce(topic=self.topic,
                                            value=frame,
                                            on_delivery=self.acked)

        print("\nFlushing records...")
        self.kafka_producer.flush()
        finished_time = time.time()
        print("MPS: {}".format(self.delivered_records / (finished_time - start_time)))
        self.video_reader.release()
class KafkaProducer:
    """Thin wrapper around SerializingProducer that retries while the local
    producer queue is full."""

    def __init__(self, topic, producer_config):
        self._topic = topic
        self._producer = SerializingProducer(producer_config.dict)

    def produce(self, record):
        """Queue `record` (Avro-dict key/value), retrying on BufferError."""
        while True:
            try:
                self._producer.produce(topic=self._topic,
                                       key=record.key_to_avro_dict(),
                                       value=record.value_to_avro_dict(),
                                       on_delivery=self._delivery_report)
                self._producer.poll(0)
            except BufferError as e:
                print(f'Failed to send on attempt {record}. '
                      f'Error received {str(e)}')
                # Give the background thread a moment to drain the queue.
                self._producer.poll(1)
            else:
                return

    def flush(self):
        """Flush pending messages if a producer was created."""
        if self._producer:
            self._producer.flush()

    @staticmethod
    def _delivery_report(err: KafkaError, msg: Message):
        """
        Reports the failure or success of a message delivery.

        Note: In the delivery report callback the Message.key() and
        Message.value() will be the binary format as encoded by any configured
        Serializers and not the same object that was passed to produce().
        If you wish to pass the original object(s) for key and value to
        delivery report callback we recommend a bound callback or lambda where
        you pass the objects along.

        Args:
            err ([KafkaError]): The error that occurred on None on success.
            msg ([Message]): The message that was produced or failed.
        """
        if err is not None:
            print(f"Delivery failed for record {msg.key()}: {err}")
def main(args):
    """Interactively produce Protobuf User records, pinned to partition 0."""
    topic = args.topic

    schema_registry_client = SchemaRegistryClient({'url': args.schema_registry})
    protobuf_serializer = ProtobufSerializer(user_pb2.User,
                                             schema_registry_client,
                                             {'use.deprecated.format': True})

    producer = SerializingProducer({
        'bootstrap.servers': args.bootstrap_servers,
        'key.serializer': StringSerializer('utf_8'),
        'value.serializer': protobuf_serializer,
    })

    print("Producing user records to topic {}. ^C to exit.".format(topic))
    while True:
        # Serve on_delivery callbacks from previous calls to produce()
        producer.poll(0.0)
        try:
            # Keyword arguments evaluate left-to-right, preserving prompt order.
            user = user_pb2.User(name=input("Enter name: "),
                                 favorite_number=int(input("Enter favorite number: ")),
                                 favorite_color=input("Enter favorite color: "))
            producer.produce(topic=topic, partition=0,
                             key=str(uuid4()), value=user,
                             on_delivery=delivery_report)
        except (KeyboardInterrupt, EOFError):
            break
        except ValueError:
            print("Invalid input, discarding record...")
            continue

    print("\nFlushing records...")
    producer.flush()
def produce(goal):
    """Produce `goal` protobuf records on one reusable producer.

    Args:
        goal: number of records to produce before the final flush.
    """
    count = 0
    reusableProducer = SerializingProducer(getConfigs())
    while count < goal:
        try:
            reusableProducer.produce(topic='myprototopic',
                                     key=str(uuid4()),
                                     value=generateRecord(),
                                     on_delivery=getReport)
            # Serve delivery callbacks from previous produce() calls.
            reusableProducer.poll(0.0)
            # BUG FIX: `count` was never incremented, so the loop never
            # terminated; count each successfully queued record.
            count += 1
        except KeyboardInterrupt:
            break
        except BufferError:
            sys.stderr.write(
                '%% Local producer queue is full (%d messages awaiting delivery): flushing...\n'
                % len(reusableProducer))
            reusableProducer.flush()

    print("Flushing one producer thread")
    reusableProducer.flush()
class Producer:
    """Minimal synchronous Kafka producer: every send() flushes immediately."""

    def __init__(self, bootstrap_servers: str, topic: str,
                 value_serializer=None, config=None):
        conf = {
            "bootstrap.servers": bootstrap_servers,
            "value.serializer": value_serializer,
        }
        if config:
            conf.update(config)
        self.producer = SerializingProducer(conf)
        self.topic = topic

    def send(self, key=None, value=None, on_delivery=default_callback):
        """Produce one record to the configured topic and wait for delivery."""
        self.producer.produce(self.topic,
                              key=key,
                              value=value,
                              on_delivery=on_delivery)
        self.producer.flush()
def produce(self, count: int):
    """Produce `count` serialized records and wait until all are acked."""

    def on_ack(err, msg):
        # Delivery reports must arrive error-free and in produce order.
        assert err is None
        assert msg is not None
        assert msg.offset() == self.acked
        self.logger.debug("Acked offset %d", msg.offset())
        self.acked += 1

    producer = SerializingProducer({
        'bootstrap.servers': self.brokers,
        'key.serializer': StringSerializer('utf_8'),
        'value.serializer': self._make_serializer()
    })

    self.logger.info("Producing %d %s records to topic %s",
                     count, self.schema_type.name, self.topic)
    for seq in range(count):
        # Keep the local queue bounded by draining callbacks when it grows.
        while len(producer) > 50000:
            producer.poll(0.1)
        producer.produce(topic=self.topic,
                         key=str(uuid4()),
                         value=self._make_payload(seq),
                         on_delivery=on_ack)
        self.produced += 1

    self.logger.info("Flushing records...")
    producer.flush()
    self.logger.info("Records flushed: %d", self.produced)

    # flush() returns when the queue is empty, but keep polling until every
    # delivery callback has actually run.
    while self.acked < count:
        producer.poll(0.01)
    self.logger.info("Records acked: %d", self.acked)
def test_producer(self):
    """Produce all configured test messages and assert each one was delivered."""
    # Read arguments and configurations and initialize
    producer_config = {
        'bootstrap.servers': self.conf['bootstrap.servers'],
        'key.serializer': self.key_avro_serializer,
        'value.serializer': self.value_avro_serializer
    }
    producer = SerializingProducer(producer_config)
    delivered_records = 0
    for text in self.test_messages:
        url = 'www.test.com'
        # Timestamp rendered in the scraper's local timezone (America/Denver).
        scraper_dt = datetime.now(pytz.timezone('America/Denver'))
        scraper_dt = scraper_dt.strftime("%Y/%m/%d %H:%M:%S %z")
        value_obj = google.Value(text=text, scraper_dt=scraper_dt)
        key_obj = google.Key(url=(url))
        producer.produce(topic=self.topic, key=key_obj, value=value_obj, on_delivery=kafka_utils.acked)
        # NOTE(review): poll() counts served events (delivery callbacks), not
        # confirmed successes — it assumes kafka_utils.acked surfaces delivery
        # errors; confirm that callback raises or flags on failure.
        delivered_records += producer.poll()
    producer.flush()
    assert delivered_records == len(self.test_messages)
def main(args):
    """Interactively produce Avro User records; the schema comes from
    User.avro_schema()."""
    topic = args.topic

    registry = SchemaRegistryClient({'url': args.schema_registry})
    avro_serializer = AvroSerializer(User.avro_schema(), registry, user_to_dict)

    producer = SerializingProducer({'bootstrap.servers': args.bootstrap_servers,
                                    'key.serializer': StringSerializer('utf_8'),
                                    'value.serializer': avro_serializer})

    print(f"Producing user records to topic {topic}. ^C to exit.")
    while True:
        # Serve delivery callbacks from earlier produce() calls.
        producer.poll(0.0)
        try:
            # Keyword arguments evaluate left-to-right, preserving prompt order.
            user = User(name=input("Enter name: "),
                        favorite_number=int(input("Enter favorite number: ")),
                        favorite_color=input("Enter favorite color: "))
            producer.produce(topic=topic, key=str(uuid4()), value=user,
                             on_delivery=delivery_report)
        except KeyboardInterrupt:
            break
        except ValueError:
            print("Invalid input, discarding record...")
            continue

    print("\nFlushing records...")
    producer.flush()
class KafkaLoggingHandler(logging.Handler):
    """
    This handler enables the user to forward logs to Kafka.

    Attributes:
        additional_fields (dict): extra fields attached to logs
        kafka_topic_name (str): topic name
        producer (kafka.KafkaProducer): producer object
    """

    # LogRecord attributes excluded from the forwarded payload. "created" is
    # still read below to derive the ISO-8601 "timestamp" field.
    __LOGGING_FILTER_FIELDS = [
        "msecs",
        "relativeCreated",
        "levelno",
        "created"
    ]

    def __init__(
        self,
        hosts_list,
        topic,
        security_protocol="SSL",
        ssl_cafile=None,
        extended_producer_config=None,
        additional_fields=None,
        log_preprocess=None,
        internal_logger_level="INFO",
        delivery_timeout=2,
    ):
        """
        Initialize the handler.

        Args:
            hosts_list: list of the Kafka hostnames
            topic: kafka consumer topic to where logs are forwarded
            security_protocol (str, optional): KafkaProducer security protocol
            ssl_cafile (None, optional): path to CA file
            extended_producer_config (None, optional): extra arguments to update
                confluent_kafka.SerializingProducer config
            additional_fields (None, optional): A dictionary with all the
                additional fields that you would like to add to the logs,
                such the application, environment, etc.
            log_preprocess (None/list, optional): list of functions, handler
                will send the following to Kafka
                ...preprocess[1](preprocess[0](raw_log))...
            internal_logger_level (str, optional): internal logger loglevel.
            delivery_timeout (int, optional): delivery timeout in seconds.

        Raises:
            KafkaLoggerException: in case of incorrect logger configuration
        """
        self._internal_logger = self._init_internal_logger(internal_logger_level)
        self.log_preprocess = log_preprocess or []
        self.additional_fields = additional_fields or {}
        # Every forwarded record also carries the emitting host's name and IP.
        self.additional_fields.update({
            "host": socket.gethostname(),
            "host_ip": socket.gethostbyname(socket.gethostname())
        })
        if security_protocol == "SSL" and ssl_cafile is None:
            raise KafkaLoggerException("SSL CA file isn't provided.")
        self.kafka_topic_name = topic
        self.delivery_timeout_sec = delivery_timeout
        extended_producer_config = extended_producer_config or {}
        producer_config = {
            "bootstrap.servers": hosts_list,
            "security.protocol": security_protocol,
            "ssl.ca.location": ssl_cafile,
            "key.serializer": StringSerializer("utf_8"),
            # Values are the dicts built by prepare_record_dict(), JSON-encoded.
            "value.serializer": lambda msg, _: json.dumps(msg).encode("utf-8"),
            "delivery.timeout.ms": self.delivery_timeout_sec * 1000,
            "error_cb": self.error_callback,
        }
        # User-supplied config wins over the defaults above.
        producer_config.update(extended_producer_config)
        self.producer = SerializingProducer(producer_config)
        logging.Handler.__init__(self)
        self._internal_logger.debug(
            f"KAFKA LOGGER INITIALIZED WITH CONFIG: {str(producer_config)}")

    @staticmethod
    def _init_internal_logger(level="INFO"):
        # Dedicated stderr logger for handler diagnostics; propagate=False so
        # its output can never recurse back into this Kafka handler.
        internal_handler = logging.StreamHandler(sys.stderr)
        internal_handler.setLevel(level)
        internal_handler.setFormatter(
            logging.Formatter(
                "[%(asctime)s] [%(process)s] [%(name)s] [%(levelname)s]: %(message)s"
            ))
        internal_logger = logging.getLogger("confluent_kafka_handler")
        internal_logger.addHandler(internal_handler)
        internal_logger.setLevel(level)
        internal_logger.propagate = False
        return internal_logger

    def prepare_record_dict(self, record):
        """
        Prepare a dictionary log item.

        Format a log record and extend dictionary with default values.

        Args:
            record (logging.LogRecord): log record

        Returns:
            dict: log item ready for Kafka
        """
        # use default formatting
        # Update the msg dict to include all of the message attributes
        self.format(record)

        # If there's an exception, let's convert it to a string
        if record.exc_info:
            record.msg = repr(record.msg)
            record.exc_info = repr(record.exc_info)

        # Append additional fields
        rec = self.additional_fields.copy()
        for key, value in record.__dict__.items():
            if key not in self.__LOGGING_FILTER_FIELDS:
                if key == "args":
                    # convert ALL argument to a str representation
                    # Elasticsearch supports number datatypes
                    # but it is not 1:1 - logging "inf" float
                    # causes _jsonparsefailure error in ELK
                    value = tuple(repr(arg) for arg in value)
                if key == "msg" and not isinstance(value, str):
                    # msg contains custom class object
                    # if there is no formatting in the logging call
                    value = str(value)
                rec[key] = "" if value is None else value
            # "created" is in the filter list, so this check runs at loop
            # level: it derives the forwarded "timestamp" field instead.
            if key == "created":
                # inspired by: cmanaha/python-elasticsearch-logger
                created_date = datetime.datetime.utcfromtimestamp(record.created)
                rec["timestamp"] = "{!s}.{:03d}Z".format(
                    created_date.strftime("%Y-%m-%dT%H:%M:%S"),
                    int(created_date.microsecond / 1000))

        # apply preprocessor(s)
        for preprocessor in self.log_preprocess:
            rec = preprocessor(rec)
        return rec

    def emit(self, record):
        """
        Prepare and send LogRecord to kafka topic

        Args:
            record: Logging message
        """
        record_dict = self.prepare_record_dict(record)
        try:
            self.producer.produce(self.kafka_topic_name,
                                  value=record_dict,
                                  on_delivery=self.error_callback)
            # Serve delivery callbacks without blocking the application.
            self.producer.poll(0)
        except BufferError:
            # Best-effort logging: drop the record rather than block.
            self._internal_logger.error(
                "Confluent kafka queue is full, logs will be lost.")

    def error_callback(self, err, msg=None):
        # Doubles as the producer-level error_cb and the per-message
        # delivery callback; errors go to the internal stderr logger.
        if err:
            self._internal_logger.error(err)
        if msg:
            self._internal_logger.debug(msg)

    def flush(self):
        """Flush pending messages with a bounded timeout."""
        # hasattr guard: __init__ may have raised before creating producer.
        if hasattr(self, "producer"):
            self.producer.flush(self.delivery_timeout_sec + 0.1)
class KafkaAvroProducer:
    """Avro producer configured entirely from environment variables
    (SCHEMA_REGISTRY_URL, KAFKA_BROKERS, KAFKA_USER, KAFKA_PASSWORD,
    KAFKA_CERT), targeting IBM Event Streams on prem, OpenShift or IBM Cloud."""

    def __init__(self, value_schema, groupID='KafkaAvroProducer'):
        # Schema Registry configuration
        self.schema_registry_conf = self.getSchemaRegistryConf()
        # Schema Registry Client
        self.schema_registry_client = SchemaRegistryClient(self.schema_registry_conf)
        # String Serializer for the key
        self.key_serializer = StringSerializer('utf_8')
        # Avro Serializer for the value
        self.value_serializer = AvroSerializer(value_schema,
                                               self.schema_registry_client)
        # Get the producer configuration
        self.producer_conf = self.getProducerConfiguration(groupID)
        # Create the producer
        self.producer = SerializingProducer(self.producer_conf)

    def getSchemaRegistryConf(self):
        """Build the Schema Registry client config from the environment.

        Returns:
            dict with 'url' (plus 'ssl.ca.location' when a CA cert file exists).
            Exits the process when SCHEMA_REGISTRY_URL is missing.
        """
        try:
            # For IBM Event Streams on IBM Cloud and on OpenShift, the Schema Registry URL is some sort of
            # https://KAFKA_USER:KAFKA_PASSWORD@SCHEMA_REGISTRY_URL
            # Make sure the SCHEMA_REGISTRY_URL you provide is in the form described above.
            url = os.environ['SCHEMA_REGISTRY_URL']
            # If we are talking to ES on prem, it uses an SSL self-signed certificate.
            # Therefore, we need the CA public certificate for the SSL connection to happen.
            if (os.path.isfile(os.getenv('KAFKA_CERT', '/certs/es-cert.pem'))):
                ssl = os.getenv('KAFKA_CERT', '/certs/es-cert.pem')
                return {'url': url, 'ssl.ca.location': ssl}
            return {'url': url}
        except KeyError:
            print(
                '[KafkaAvroProducer] - [ERROR] - There is no SCHEMA_REGISTRY_URL environment variable'
            )
            exit(1)

    def getProducerConfiguration(self, groupID):
        """Build the Kafka producer config from the environment.

        Exits the process when a required environment variable is missing.
        """
        try:
            options = {
                'bootstrap.servers': os.environ['KAFKA_BROKERS'],
                'group.id': groupID,
                'key.serializer': self.key_serializer,
                'value.serializer': self.value_serializer
            }
            if (os.getenv('KAFKA_PASSWORD', '') != ''):
                # Set security protocol common to ES on prem and on IBM Cloud
                options['security.protocol'] = 'SASL_SSL'
                # A 'token' user means IBM Cloud (PLAIN); anything else means
                # ES on OCP (SCRAM-SHA-512).
                if (os.getenv('KAFKA_USER', '') == 'token'):
                    options['sasl.mechanisms'] = 'PLAIN'
                else:
                    options['sasl.mechanisms'] = 'SCRAM-SHA-512'
                # Set the SASL username and password
                options['sasl.username'] = os.getenv('KAFKA_USER', '')
                options['sasl.password'] = os.getenv('KAFKA_PASSWORD', '')
            # If we are talking to ES on prem, it uses an SSL self-signed certificate.
            # Therefore, we need the CA public certificate for the SSL connection to happen.
            if (os.path.isfile(os.getenv('KAFKA_CERT', '/certs/es-cert.pem'))):
                options['ssl.ca.location'] = os.getenv('KAFKA_CERT',
                                                       '/certs/es-cert.pem')
            # Print out the producer configuration
            self.printProducerConfiguration(options)
            return options
        except KeyError as error:
            # BUG FIX: concatenating str + KeyError raised TypeError and
            # masked the real problem; stringify the exception first.
            print(
                '[KafkaAvroProducer] - [ERROR] - A required environment variable does not exist: '
                + str(error))
            exit(1)

    def printProducerConfiguration(self, options):
        """Print the producer configuration with the SASL password obfuscated."""
        print(
            "[KafkaAvroProducer] - This is the configuration for the producer:"
        )
        print(
            "[KafkaAvroProducer] - -------------------------------------------"
        )
        print('[KafkaAvroProducer] - Bootstrap Server: {}'.format(
            options['bootstrap.servers']))
        # Strip credentials embedded in the registry URL before printing.
        print('[KafkaAvroProducer] - Schema Registry url: {}'.format(
            self.schema_registry_conf['url'].split('@')[-1]))
        if (os.getenv('KAFKA_PASSWORD', '') != ''):
            # Obfuscate password
            if (len(options['sasl.password']) > 3):
                obfuscated_password = options['sasl.password'][
                    0] + "*****" + options['sasl.password'][
                    len(options['sasl.password']) - 1]
            else:
                obfuscated_password = "******"
            print('[KafkaAvroProducer] - Security Protocol: {}'.format(
                options['security.protocol']))
            print('[KafkaAvroProducer] - SASL Mechanism: {}'.format(
                options['sasl.mechanisms']))
            print('[KafkaAvroProducer] - SASL Username: {}'.format(
                options['sasl.username']))
            print('[KafkaAvroProducer] - SASL Password: {}'.format(
                obfuscated_password))
        if (os.path.isfile(os.getenv('KAFKA_CERT', '/certs/es-cert.pem'))):
            print('[KafkaAvroProducer] - SSL CA Location: {}'.format(
                options['ssl.ca.location']))
        print(
            "[KafkaAvroProducer] - -------------------------------------------"
        )

    def delivery_report(self, err, msg):
        """ Called once for each message produced to indicate delivery result.
            Triggered by poll() or flush(). """
        if err is not None:
            print(
                '[KafkaAvroProducer] - [ERROR] - Message delivery failed: {}'.
                format(err))
        else:
            print('[KafkaAvroProducer] - Message delivered to {} [{}]'.format(
                msg.topic(), msg.partition()))

    def publishEvent(self, key, value, topicName='kafka-avro-producer'):
        """Produce one Avro message and block until it is delivered."""
        self.producer.produce(topic=topicName,
                              value=value,
                              key=key,
                              on_delivery=self.delivery_report)
        self.producer.flush()
class TestConfluentProtobufProtobuf:
    """End-to-end check: protobuf-serialized records land in Snowflake intact."""

    def __init__(self, driver, nameSalt):
        self.driver = driver
        self.fileName = "travis_correct_confluent_protobuf_protobuf"
        self.topic = self.fileName + nameSalt

        # Build one SensorReading populated with boundary values for every
        # scalar field so the connector's type handling is fully exercised.
        reading = sensor_pb2.SensorReading()
        reading.dateTime = 1234
        reading.reading = 321.321
        reading.device.deviceID = "555-4321"
        reading.device.enabled = True
        reading.float_val = 4321.4321
        reading.int32_val = (1 << 31) - 1
        reading.sint32_val = (1 << 31) - 1
        reading.sint64_val = (1 << 63) - 1
        reading.uint32_val = (1 << 32) - 1
        reading.bytes_val = b'\xDE\xAD'
        reading.double_array_val.extend([1 / 3, 32.21, 434324321])
        reading.uint64_val = (1 << 64) - 1
        self.sensor = reading

        self.schema_registry_client = SchemaRegistryClient(
            {'url': driver.schemaRegistryAddress})
        self.keyProtobufSerializer = ProtobufSerializer(
            sensor_pb2.SensorReading, self.schema_registry_client)
        self.valueProtobufSerializer = ProtobufSerializer(
            sensor_pb2.SensorReading, self.schema_registry_client)
        self.protobufProducer = SerializingProducer({
            'bootstrap.servers': driver.kafkaAddress,
            'key.serializer': self.keyProtobufSerializer,
            'value.serializer': self.valueProtobufSerializer
        })

    def getConfigFileName(self):
        return self.fileName + ".json"

    def send(self):
        # Publish the same reading 100 times, keyed by itself.
        for _ in range(100):
            self.protobufProducer.produce(self.topic, self.sensor, self.sensor)
            self.protobufProducer.poll(0)
        self.protobufProducer.flush()

    def verify(self, round):
        count = self.driver.snowflake_conn.cursor().execute(
            "SELECT count(*) FROM {}".format(self.topic)).fetchone()[0]
        if count == 0:
            # Nothing landed yet — let the harness retry.
            raise RetryableError()
        elif count != 100:
            raise NonRetryableError(
                "Number of record in table is different from number of record sent"
            )

        # validate content of line 1
        row = self.driver.snowflake_conn.cursor().execute(
            "Select * from {} limit 1".format(self.topic)).fetchone()

        # "schema_id" is lost since they are using native avro converter
        goldMeta = r'{"CreateTime":\d*,"key":{"bytes_val":"3q0=","dateTime":1234,"device":' \
                   r'{"deviceID":"555-4321","enabled":true},"double_array_val":' \
                   r'[0.3333333333333333,32.21,4.343243210000000e+08],"float_val":4321.432,' \
                   r'"int32_val":2147483647,"reading":321.321,"sint32_val":2147483647,"sint64_val":9223372036854775807,' \
                   r'"uint32_val":4294967295,"uint64_val":-1},"offset":\d*,"partition":\d*,"topic":"travis_correct_confluent_protobuf_protobuf....."}'
        goldContent = r'{"bytes_val":"3q0=","dateTime":1234,"device":{"deviceID":"555-4321","enabled":true},"double_array_val":' \
                      r'[0.3333333333333333,32.21,4.343243210000000e+08],"float_val":4321.432,"int32_val":2147483647,' \
                      r'"reading":321.321,"sint32_val":2147483647,"sint64_val":9223372036854775807,"uint32_val":4294967295,"uint64_val":-1}'
        self.driver.regexMatchOneLine(row, goldMeta, goldContent)

        self.driver.verifyStageIsCleaned(self.topic)

    def clean(self):
        self.driver.cleanTableStagePipe(self.topic)
def main(args): topic = args.topic schema_str = EventSchema schema_registry_conf = {'url': args.schema_registry} schema_registry_client = SchemaRegistryClient(schema_registry_conf) avro_serializer = AvroSerializer(schema_str, schema_registry_client) producer_conf = { 'bootstrap.servers': args.bootstrap_servers, 'key.serializer': StringSerializer('utf_8'), 'value.serializer': avro_serializer } producer = SerializingProducer(producer_conf) list_type = [{ "grilleIdent": "Numero 123T", "codeRetourServiceMetier": "code 23432543", "referer": "1qsd", "browserVersion": "qsdqsd", "androidUDID": "qsdqsdqsd", "iosIDFA": "qdqsdqsd", "appVersion": "qsdqsdqsdqsd", "idTmx": "qsdqsdqsd" }, { "numeroCompteBeneficiaire": "Numero 123T", "codePaysResidence": "code 23432543", "codePaysResidenceIso": "code 23432543", "adresseBeneficiaire": "code 23432543", "nomCompletBeneficiaire": "code 23432543", "idListeBeneficiaire": "code 23432543", "idBeneficiaire": "code 23432543", "modeValidation": 34, "bicBeneficiaire": "code 23432543", "idTmx": "code 23432543" }] while True: x = random.choice([0, 1]) eventHeader = { "eventId": str(uuid4()), "dateTimeRef": 1589364605654, "nomenclatureEv": "Event Header", "canal": 1, "media": 2, "schemaVersion": "v0", "headerVersion": "v2", "serveur": "s1", "acteurDeclencheur": { "adresseIP": "127.0.0.1", "idTelematique": str(uuid4()), "idPersonne": "zahir" } } value = { "EventHeader": eventHeader, "EventBusinessContext": list_type[x] } print(value) producer.produce(topic=topic, key=str(uuid4()), value=value, on_delivery=delivery_report) producer.flush() time.sleep(0.1)
import meal_pb2 from confluent_kafka import SerializingProducer from confluent_kafka.serialization import StringSerializer from confluent_kafka.schema_registry import SchemaRegistryClient from confluent_kafka.schema_registry.protobuf import ProtobufSerializer topic = 'meal' schema_registry_client = SchemaRegistryClient({'url': 'http://t620.lan:8081'}) protobuf_serializer = ProtobufSerializer(meal_pb2.Meal, schema_registry_client) producer_conf = { 'bootstrap.servers': 't620.lan:9092', 'key.serializer': StringSerializer('utf_8'), 'value.serializer': protobuf_serializer } producer = SerializingProducer(producer_conf) producer.poll(0.0) mybeer = meal_pb2.Meal.DrinkItems(drink_name="beer") mywine = meal_pb2.Meal.DrinkItems(drink_name="wine") meal = meal_pb2.Meal(name='pizza', drink=[mybeer, mywine]) # Less meal (for testing kafkajs which seems to miss the drinks #meal = meal_pb2.Meal(name='pizza', drink=[]) producer.produce(topic=topic, key=str(uuid4()), value=meal) producer.flush()
def main(args): topic = args.topic schema_str = """ { "$schema": "http://json-schema.org/draft-07/schema#", "title": "User", "description": "A Confluent Kafka Python User", "type": "object", "properties": { "name": { "description": "User's name", "type": "string" }, "favorite_number": { "description": "User's favorite number", "type": "number", "exclusiveMinimum": 0 }, "favorite_color": { "description": "User's favorite color", "type": "string" } }, "required": [ "name", "favorite_number", "favorite_color" ] } """ schema_registry_conf = {'url': args.schema_registry} schema_registry_client = SchemaRegistryClient(schema_registry_conf) json_serializer = JSONSerializer(schema_registry_client, schema_str, user_to_dict) producer_conf = { 'bootstrap.servers': args.bootstrap_servers, 'key.serializer': StringSerializer('utf_8'), 'value.serializer': json_serializer } producer = SerializingProducer(producer_conf) print("Producing user records to topic {}. ^C to exit.".format(topic)) while True: # Serve on_delivery callbacks from previous calls to produce() producer.poll(0.0) try: user_name = input("Enter name: ") user_address = input("Enter address: ") user_favorite_number = int(input("Enter favorite number: ")) user_favorite_color = input("Enter favorite color: ") user = User(name=user_name, address=user_address, favorite_color=user_favorite_color, favorite_number=user_favorite_number) producer.produce(topic=topic, key=str(uuid4()), value=user, on_delivery=delivery_report) except KeyboardInterrupt: break except ValueError: print("Invalid input, discarding record...") continue print("\nFlushing records...") producer.flush()
class KafkaAvroProducer:
    """Avro message producer configured from environment variables.

    The key is serialized as a UTF-8 string and the value with an Avro
    serializer bound to the schema registry declared in ``config``.
    """

    def __init__(self, producer_name, value_schema, groupID = 'KafkaAvroProducer',
                 kafka_brokers = "", kafka_user = "", kafka_pwd = "",
                 kafka_cacert = "", kafka_sasl_mechanism = "", topic_name = ""):
        self.kafka_brokers = kafka_brokers
        self.kafka_user = kafka_user
        self.kafka_pwd = kafka_pwd
        self.kafka_sasl_mechanism = kafka_sasl_mechanism
        self.kafka_cacert = kafka_cacert
        self.topic_name = topic_name
        # Consumer name for logging purposes
        self.logging_prefix = '['+ producer_name + '][KafkaAvroProducer]'
        # Schema Registry configuration
        self.schema_registry_conf = {'url': config.SCHEMA_REGISTRY_URL}
        # Schema Registry Client
        self.schema_registry_client = SchemaRegistryClient(self.schema_registry_conf)
        # String Serializer for the key
        self.key_serializer = StringSerializer('utf_8')
        # Avro Serializer for the value (debug prints left intentionally)
        print(value_schema)
        print(type(value_schema))
        value_schema = value_schema.strip()
        self.value_serializer = AvroSerializer(value_schema, self.schema_registry_client)
        # Get the producer configuration
        self.producer_conf = self.getProducerConfiguration(groupID,
                                                           self.key_serializer,
                                                           self.value_serializer)
        # Create the producer
        self.producer = SerializingProducer(self.producer_conf)

    def delivery_report(self, err, msg):
        """ Called once for each message produced to indicate delivery result.
            Triggered by poll() or flush(). """
        if err is not None:
            print('[KafkaAvroProducer] - [ERROR] - Message delivery failed: {}'.format(err))
        else:
            print('[KafkaAvroProducer] - Message delivered to {} [{}]'.format(msg.topic(), msg.partition()))

    def publishEvent(self, key, value, topicName = 'kafka-avro-producer'):
        """Serialize and publish one (key, value) record, then flush."""
        # Produce the Avro message
        self.producer.produce(topic=topicName, value=value, key=key,
                              on_delivery=self.delivery_report)
        # Flush
        self.producer.flush()

    def getProducerConfiguration(self, groupID, key_serializer, value_serializer):
        """Build the producer config dict from environment variables.

        Returns an empty dict when the mandatory KAFKA_BROKERS variable is
        missing. SASL settings are added only when KAFKA_PASSWORD is set.
        """
        try:
            options = {
                'bootstrap.servers': os.environ['KAFKA_BROKERS'],
                'group.id': groupID,
                'key.serializer': key_serializer,
                'value.serializer': value_serializer
            }
            if os.getenv('KAFKA_PASSWORD', '') != '':
                # Set security protocol common to ES on prem and on IBM Cloud
                options['security.protocol'] = 'SASL_SSL'
                # Depending on the Kafka User, we will know whether we are talking
                # to ES on prem or on IBM Cloud. If we are connecting to ES on
                # IBM Cloud, the SASL mechanism is plain.
                if os.getenv('KAFKA_USER', '') == 'token':
                    options['sasl.mechanisms'] = 'PLAIN'
                # If we are connecting to ES on OCP, the SASL mechanism is scram-sha-512
                else:
                    options['sasl.mechanisms'] = 'SCRAM-SHA-512'
                # Set the SASL username and password
                options['sasl.username'] = os.getenv('KAFKA_USER', '')
                options['sasl.password'] = os.getenv('KAFKA_PASSWORD', '')
            # If we are talking to ES on prem, it uses an SSL self-signed certificate.
            # Therefore, we need the CA public certificate for the SSL connection to happen.
            if os.path.isfile(os.getenv('KAFKA_CERT', '/certs/es-cert.pem')):
                options['ssl.ca.location'] = os.getenv('KAFKA_CERT', '/certs/es-cert.pem')
            return options
        except KeyError as error:
            # BUG FIX: the original did "'...' + error" which raises TypeError
            # (cannot concatenate str and KeyError); format the exception instead
            # so the intended message is printed and {} is returned.
            print('[KafkaAvroProducer] - [ERROR] - A required environment variable does not exist: '
                  + str(error))
            return {}
class Broker:
    """Bridges a protobuf Kafka topic through a processor to an output topic.

    Work is fanned out to threads; an id ring (thread_queue) enforces that
    results are produced back in the order the messages were consumed.
    """

    def __init__(self, consumer_topic, producer_topic, client_id,
                 bootstrap_servers, consumer_proto_class, producer_proto_class,
                 processor, max_thread_calls):
        self.consumer_topic = consumer_topic
        self.producer_topic = producer_topic
        self.client_id = client_id
        self.bootstrap_servers = bootstrap_servers
        self.consumer_proto_class = consumer_proto_class
        self.producer_proto_class = producer_proto_class
        self.processor = processor
        self.max_thread_calls = max_thread_calls

        self.kafka_consumer = DeserializingConsumer({
            'bootstrap.servers': self.bootstrap_servers,
            'group.id': self.client_id,
            'auto.offset.reset': "earliest",
            'value.deserializer': self.derializer
        })
        self.kafka_consumer.subscribe([self.consumer_topic])

        self.kafka_producer = SerializingProducer({
            'bootstrap.servers': self.bootstrap_servers,
            'queue.buffering.max.messages': 500000,
            'value.serializer': self.serialize
        })

        # Ring of in-flight thread ids; newest on the left, next-to-produce on the right.
        self.thread_queue = deque(maxlen=self.max_thread_calls)
        self.latest_thread_queue_id = 1

    def derializer(self, bytes_message, _):
        # Decode raw bytes into an ImageInfo protobuf.
        decoded = image_pb2.ImageInfo()
        decoded.ParseFromString(bytes_message)
        return decoded

    def serialize(self, message, _):
        return message.SerializeToString()

    def get_thread_id(self):
        # Hand out ids 1..max_thread_calls cyclically.
        assigned = self.latest_thread_queue_id
        if assigned == self.max_thread_calls:
            self.latest_thread_queue_id = 1
        else:
            self.latest_thread_queue_id += 1
        return assigned

    def is_thread_queue_full(self):
        return len(self.thread_queue) == self.max_thread_calls

    def produce_when_ready(self, thread_id, message):
        # Busy-wait until this thread is at the tail (its turn to produce).
        while self.thread_queue[-1] != thread_id:
            logging.warning("Thread {} got stuck in queue".format(thread_id))
            # time.sleep(0.01)
        self.kafka_producer.poll(0.0)
        self.kafka_producer.produce(topic=self.producer_topic, value=message)
        self.thread_queue.pop()

    def call_processor(self, thread_id, value, start_time):
        processed = self.processor.process(value)
        self.produce_when_ready(thread_id, processed)
        # start_time is a millisecond Kafka timestamp; /1000 converts to seconds.
        logging.debug("Total time for thead" + str(thread_id) + " is " +
                      str(time.time() - start_time / 1000))

    def run(self):
        while True:
            try:
                if self.is_thread_queue_full():
                    logging.warning(
                        "Thread queue is full, waiting for previous threads to finished"
                    )
                    continue

                msg = self.kafka_consumer.poll(1.0)
                if msg is None or msg.value() is None:
                    logging.warning("No messages from kafka")
                    continue

                worker_id = self.get_thread_id()
                worker = threading.Thread(target=self.call_processor,
                                          args=(worker_id, msg.value(),
                                                msg.timestamp()[1]))
                self.thread_queue.appendleft(worker_id)
                worker.start()
            except KeyboardInterrupt:
                break

        self.kafka_consumer.close()
        self.kafka_producer.flush()
from confluent_kafka import SerializingProducer from confluent_kafka.serialization import IntegerSerializer, StringSerializer def callback(err, msg): if err is not None: print(f'Message deliver failed: {err}') else: print(f'Message delivered to {msg.topic()} [{msg.partition()}]') p = SerializingProducer({ 'bootstrap.servers': 'localhost:9092', 'key.serializer': IntegerSerializer(), 'value.serializer': StringSerializer() }) for i in range(100): polling_result = p.poll(0) if polling_result: print(f'Polling result: {polling_result}') p.produce('sample-topic', key=i, value=f'hello world {i}', on_delivery=callback) p.flush()
def main(args):
    """Consume enriched events, time a Cassandra lookup per event, and emit
    the elapsed time both to InfluxDB and to a metrics Kafka topic."""
    topic = args.topic
    outputtopic = args.outputtopic

    registry = SchemaRegistryClient({'url': args.schema_registry})

    # Producer emits Avro-encoded metric records.
    producer = SerializingProducer({
        'bootstrap.servers': args.bootstrap_servers,
        'key.serializer': StringSerializer('utf_8'),
        'value.serializer': AvroSerializer(MetricSchema, registry)
    })

    # Consumer reads Avro-encoded enriched events.
    consumer = DeserializingConsumer({
        'bootstrap.servers': args.bootstrap_servers,
        'key.deserializer': StringDeserializer('utf_8'),
        'value.deserializer': AvroDeserializer(EnrichedEventSchema, registry),
        'group.id': args.group + str(random.Random()),
        'auto.offset.reset': "earliest"
    })
    consumer.subscribe([topic])

    session = Cluster([args.host]).connect("datascience")
    session.row_factory = dict_factory

    client_influxdb = InfluxDBClient('35.181.155.182', 8086, "dbsaleh2")

    while True:
        try:
            # SIGINT can't be handled when polling, limit timeout to 1 second.
            start = time.time()
            msg = consumer.poll(1.0)
            if msg is None:
                continue

            evt = msg.value()
            idPersonne = evt["idPersonne"]
            rows = session.execute(GET_ENRICHED_EVENT_QUERY, (idPersonne, ))
            if rows:
                # Only the lookup duration is kept; row post-processing is
                # currently disabled.
                elapsed_time = time.time() - start
        except Exception:
            print('Exception')
            continue

        metrics = [{
            "measurement": "metrics",
            "fields": {
                "metricName": "score",
                "timeforscore": elapsed_time
            }
        }]
        print(elapsed_time)
        client_influxdb.write_points(metrics, database="dbsaleh2")

        producer.produce(topic=outputtopic,
                         value={
                             'metricName': "score",
                             'time': elapsed_time
                         },
                         on_delivery=delivery_report)
        producer.flush()

    consumer.close()
def main(
        name: str,
        shutdown: multiprocessing.Value,
        request_queue: multiprocessing.Queue,
        config: Config
) -> None:
    """Execute tasks forever.

    This method is the entrypoint for the worker which executes the
    monitoring tasks. It is executed in a dedicate child process.
    """
    if config.verbose:
        logging.basicConfig(level=logging.INFO)
    log = logging.getLogger(name)
    log.info(f"Starting process {name}.")

    # SIGINT will be delivered to the whole process group. We'll need to ignore
    # it in the worker processes to give them the opportunity to finish any
    # pending work.
    signal.signal(signal.SIGINT, signal.SIG_IGN)

    registry = SchemaRegistryClient({
        'url': config.schema_registry
    })
    report_serializer = AvroSerializer(
        Report.SCHEMA,
        registry,
        Report.asdict
    )
    producer = SerializingProducer({
        'client.id': name,
        'bootstrap.servers': config.bootstrap_servers,
        'key.serializer': StringSerializer('utf_8'),
        'security.protocol': 'SSL',
        'ssl.key.location': config.auth_key,
        'ssl.certificate.location': config.auth_cert,
        'ssl.ca.location': config.ca_cert,
        'value.serializer': report_serializer,
    })

    err = _report_error(log)
    while not shutdown.value:
        # Serve delivery callbacks for previously produced reports.
        producer.poll(0.0)
        try:
            now = datetime.now()
            req = request_queue.get(timeout=1)

            # Fetch the URL, discarding the body (WRITEFUNCTION=len) — only
            # the timing breakdown and status code are of interest.
            curl = pycurl.Curl()
            curl.setopt(pycurl.URL, req)
            curl.setopt(pycurl.CONNECTTIMEOUT, 30)
            curl.setopt(pycurl.TIMEOUT, 300)
            curl.setopt(pycurl.NOSIGNAL, 1)
            curl.setopt(pycurl.WRITEFUNCTION, len)
            try:
                curl.perform()
                report = Report(
                    timestamp=now.timestamp(),
                    url=req,
                    code=int(curl.getinfo(pycurl.RESPONSE_CODE)),
                    namelookup=curl.getinfo(pycurl.NAMELOOKUP_TIME),
                    connect=curl.getinfo(pycurl.CONNECT_TIME),
                    appconnect=curl.getinfo(pycurl.APPCONNECT_TIME),
                    pretransfer=curl.getinfo(pycurl.PRETRANSFER_TIME),
                    starttransfer=curl.getinfo(pycurl.STARTTRANSFER_TIME),
                    total=curl.getinfo(pycurl.TOTAL_TIME),
                )
                log.info(str(report))
                producer.produce(
                    topic=config.topic,
                    key=req,
                    value=report,
                    on_delivery=err
                )
            except TypeError:
                # It'll never work if we misconfigure PycURL.
                raise
            except pycurl.error as exc:
                # TODO: Record the failure in Kafka.
                log.warning(f"Failed to retrieve {req}", exc)
            # TODO: Handle exceptions from the Kafka Producer.
            finally:
                curl.close()
        except queue.Empty:
            log.debug("No request to process.")

    # Flush any results that haven't been committed yet.
    log.warning(f"Process {name} shutting down.")
    producer.flush()