def produce(config, topic, input_messages):
    """
    Initiate sending messages to Kafka: call the producer's produce method,
    passing in each input_messages key/value pair and a delivery callback.

    Parameters
    ----------
    topic: str
        topic to which the input messages are published
    input_messages: dict
        the key/value input messages
    config: dict
        the config values needed by the producer
    """
    if topic is None:
        logger.debug('Required topic field must be set')
        raise ValueError()
    if len(input_messages) <= 0:
        logger.debug('Required data field must not be empty.')
        raise ValueError()

    bootstrap_servers, schema_registry = producer_config(config)

    producer = Producer(bootstrap_servers)
    admin_client = AdminClient(bootstrap_servers)
    topics = admin_client.list_topics().topics  # just to show what's available
    print(topics)
    if not topics:
        print('No topics found')
        raise RuntimeError()

    sr = CachedSchemaRegistryClient(schema_registry)
    ser = MessageSerializer(sr)
    # get the latest registered schema for the topic's value subject
    id, schema, version = sr.get_latest_schema(topic + "-value")
    if schema:
        print('In If Schema')
        for key, value in input_messages.items():
            if validate_uuid4(key):
                print('In validate in For loop')
                serializedMessage = ser.encode_record_with_schema(topic, schema, value)
                producer.produce(topic=topic, key=key, value=serializedMessage, callback=acked)
                # producer.flush()  # bad idea: it limits throughput to the broker round-trip time
                producer.poll(1)
            else:
                print('In Else of For Loop')
                logger.error('Invalid UUID String: %s', key)
    else:
        print('Schema not found for topic name: ', topic)
        print('In Else Schema')
        sys.exit(1)
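# Two of the helpers referenced in produce() above (acked and validate_uuid4) are not
# shown in this snippet; below is a minimal, hypothetical sketch of what they might
# look like, assuming the standard confluent_kafka delivery-callback signature and
# canonical UUIDv4 message keys.
import uuid


def acked(err, msg):
    # delivery report callback invoked from producer.poll()/flush() for each message
    if err is not None:
        logger.error('Message delivery failed: %s', err)
    else:
        logger.debug('Message delivered to %s [%d] at offset %d',
                     msg.topic(), msg.partition(), msg.offset())


def validate_uuid4(uuid_string):
    # return True only if uuid_string is a canonical version-4 UUID string
    try:
        val = uuid.UUID(uuid_string, version=4)
    except (ValueError, TypeError):
        return False
    return str(val) == uuid_string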
class AvroMessageSerializer(object):
    def __init__(self, schema_subject, schema_registry_url):
        '''
        Create a new serializer object, which includes the remotely loaded
        schema object specified by schema_subject.
        Note: this constructor is not exception safe.
        '''
        self.schema_subject = schema_subject
        self.schema_registry_url = schema_registry_url
        self.schema_registry_client = CachedSchemaRegistryClient(url=self.schema_registry_url)
        self._load_schema()
        self.writer = SchemalessAvroRecordWriter(self.avro_schema)

    def _load_schema(self):
        try:
            schema_tuple = self.schema_registry_client.get_latest_schema(subject=self.schema_subject)
        except ValueError:
            raise ValueError('Schema subject ' + self.schema_subject + ' not found')
        if not schema_tuple[1]:
            raise ValueError('Schema subject ' + self.schema_subject + ' not found')
        self.schema_id = schema_tuple[0]
        self.avro_schema = schema_tuple[1].to_json()
        self.schema_version = schema_tuple[2]

    def kafka_avro_encode(self, record):
        with ContextBytesIO() as buf:
            # write the header
            # magic byte
            buf.write(struct.pack('b', _MAGIC_BYTE))
            # write the schema ID in network byte order (big-endian)
            buf.write(struct.pack('>I', self.schema_id))
            self.writer.write(buf, record)
            return buf.getvalue()
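# A hedged usage sketch for AvroMessageSerializer; the registry URL, subject name and
# record fields below are illustrative and must match an actually registered schema.
import struct

serializer = AvroMessageSerializer('user-value', 'http://localhost:8081')
payload = serializer.kafka_avro_encode({'id': 42, 'name': 'alice'})

# kafka_avro_encode() emits the Confluent wire format: one magic byte, the schema ID
# as a 4-byte big-endian integer, then the schemaless Avro body.
magic, schema_id = struct.unpack('>bI', payload[:5])
assert magic == _MAGIC_BYTE and schema_id == serializer.schema_id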
record_schema = avro.load(AVROLOADPATH)
producer = AvroProducer(conf, default_value_schema=record_schema)

try:
    producer.produce(topic=KAFKATOPIC, value=mce)
    producer.poll(0)
    sys.stdout.write('\n%s has been successfully produced!\n' % mce)
except ValueError as e:
    sys.stdout.write('Message serialization failed %s' % e)
producer.flush()

zk = KazooClient(ZOOKEEPER)
zk.start()
client = CachedSchemaRegistryClient(SCHEMAREGISTRY)
topics = zk.get_children("/brokers/topics")

for dataset_name in topics:
    if dataset_name.startswith('_'):
        continue
    topic = dataset_name + '-value'
    schema_id, schema, schema_version = client.get_latest_schema(topic)
    if schema_id is None:
        print(f"Skipping topic without schema: {topic}")
        continue
    print(topic)
    build_kafka_dataset_mce(dataset_name, str(schema), int(schema_version))

sys.exit(0)
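# The constants used above (conf, AVROLOADPATH, KAFKATOPIC, ZOOKEEPER, SCHEMAREGISTRY)
# and the mce payload are defined elsewhere in the original script; purely illustrative
# local-development values could look like this:
AVROLOADPATH = './MetadataChangeEvent.avsc'      # path to the Avro schema file
KAFKATOPIC = 'MetadataChangeEvent'
ZOOKEEPER = 'localhost:2181'
SCHEMAREGISTRY = 'http://localhost:8081'
conf = {
    'bootstrap.servers': 'localhost:9092',
    'schema.registry.url': SCHEMAREGISTRY,
}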
@udf(BinaryType())
def find_header_value(array, headerName):
    return next(x for x in array if x['key'] == headerName)['value']


schema_registry_client = CachedSchemaRegistryClient({
    "url": "https://psrc-4j1d2.westus2.azure.confluent.cloud",
    "basic.auth.credentials.source": "USER_INFO",
    "basic.auth.user.info": "2BEQE2KDNBJGDH2Y:8nixndjUyjXqTJoXnm3X3GwLZPz5F8umq74/g9ioG2mIi4lm0CWF1nUAf8deIFbP"
})

latest_id, latest_schema, latest_version = schema_registry_client.get_latest_schema(
    "transmissao-efetuada-value")

spark = SparkSession \
    .builder \
    .appName("CapturarEventosJob") \
    .master("local[*]") \
    .getOrCreate()

spark.sparkContext.setLogLevel('WARN')
spark.udf.register("find_header_value", find_header_value)

raw_data = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "pkc-epwny.eastus.azure.confluent.cloud:9092") \
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import col, expr, struct, lit, concat, array, date_format, current_timestamp
from pyspark.sql.avro.functions import to_avro
from confluent_kafka.avro.cached_schema_registry_client import CachedSchemaRegistryClient

schema_registry_client = CachedSchemaRegistryClient({
    "url": "https://psrc-4j1d2.westus2.azure.confluent.cloud",
    "basic.auth.credentials.source": "USER_INFO",
    "basic.auth.user.info": "2BEQE2KDNBJGDH2Y:8nixndjUyjXqTJoXnm3X3GwLZPz5F8umq74/g9ioG2mIi4lm0CWF1nUAf8deIFbP"
})

latest_id, latest_schema, latest_version = schema_registry_client.get_latest_schema(
    "relatorio-transmissao-value")

magic_byte = bytes([0x0])
id_bytes = (latest_id).to_bytes(4, byteorder='big')

spark = SparkSession \
    .builder \
    .appName("GerarRelatorioTransmissaoJob") \
    .master("local[*]") \
    .getOrCreate()

sqlContext = SQLContext(spark.sparkContext)
spark.sparkContext.setLogLevel('WARN')
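# to_avro() produces only the schemaless Avro body, so the Confluent wire-format
# header prepared above (magic_byte + id_bytes) has to be prepended before the rows
# are written to Kafka. A minimal sketch, assuming a DataFrame whose struct column
# "report" matches the registered schema (the DataFrame and column names are
# illustrative):
def to_confluent_wire_format(report_df):
    return report_df.select(
        concat(
            lit(magic_byte),                             # 0x00 wire-format marker
            lit(id_bytes),                               # schema ID, 4 bytes big-endian
            to_avro(col("report"), str(latest_schema))   # schemaless Avro body
        ).alias("value")
    )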
config = json.load(config_file)

Stats = namedtuple('Stats', [
    'time', 'ifcb_id', 'roi', 'name', 'classifier', 'prob',
    'classification_time', 'biovolume', 'carbon', 'hab'
])
ClassifierStats = namedtuple(
    'ClassifierStats', ['sample_name', 'prob', 'classifier', 'classification_time'])

schema_config = {'url': config['schema.registry.url'], 'ssl.ca.location': None}
# use CachedSchemaRegistryClient to fetch the schemas;
# keep a copy of the config because CachedSchemaRegistryClient consumes it
schema_config_copy = schema_config.copy()
cached_schema_client = CachedSchemaRegistryClient(schema_config)
key_schema = str(cached_schema_client.get_latest_schema('ifcb-stats-key')[1])
value_schema = str(
    cached_schema_client.get_latest_schema('ifcb-stats-value')[1])
key_schema = avro.loads(key_schema)
value_schema = avro.loads(value_schema)

producer = AvroProducer(
    {
        'bootstrap.servers': config['bootstrap.servers'],
        'schema.registry.url': config['schema.registry.url']
    },
    default_key_schema=key_schema,
    default_value_schema=value_schema)

app = faust.App(config['app_name'],
                broker=config['broker'],
import json
import logging
import os

from confluent_kafka.avro.cached_schema_registry_client import CachedSchemaRegistryClient
from confluent_kafka.avro.serializer.message_serializer import MessageSerializer

config = {
    'bootstrap.servers': 'localhost:9092',
    'client.id': 'phoenix-local-producer'
}
sr_config = {'url': 'http://localhost:8081', 'auto.register.schemas': False}

topic = 'avro-topic-1'
suffix = '-value'
subject = topic + suffix

sr_client = CachedSchemaRegistryClient(sr_config)
schema_details = sr_client.get_latest_schema(subject)
RECORD_SCHEMA = schema_details[1]
serializer = MessageSerializer(sr_client)


class AvroModel:
    def __init__(self, id, name):
        self.id = id
        self.name = name


data = AvroModel(None, None)
logging.getLogger().setLevel(logging.INFO)
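# A hedged sketch of how the pieces above could be wired together to publish one
# AvroModel record; the field mapping and the flush-per-message are illustrative.
from confluent_kafka import Producer


def send(record):
    producer = Producer(config)
    # serialize the record against the registry schema in the Confluent wire format
    payload = serializer.encode_record_with_schema(
        topic, RECORD_SCHEMA, {'id': record.id, 'name': record.name})
    producer.produce(topic=topic, value=payload)
    producer.flush()
    logging.info('Produced record %s to topic %s', record.id, topic)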