def test_produce_with_custom_registry(self):
    schema_registry = MockSchemaRegistryClient()
    value_schema = avro.load(os.path.join(avsc_dir, "basic_schema.avsc"))
    key_schema = avro.load(os.path.join(avsc_dir, "primitive_string.avsc"))
    producer = AvroProducer({}, schema_registry=schema_registry)
    producer.produce(topic='test', value={"name": 'abc"'},
                     value_schema=value_schema, key='mykey',
                     key_schema=key_schema)
def test_produce_primitive_string_key(self):
    value_schema = avro.load(os.path.join(avsc_dir, "basic_schema.avsc"))
    key_schema = avro.load(os.path.join(avsc_dir, "primitive_string.avsc"))
    producer = AvroProducer({'schema.registry.url': 'http://127.0.0.1:9001'})
    with self.assertRaises(ConnectionError):  # nonexistent schema registry
        producer.produce(topic='test', value={"name": 'abc"'},
                         value_schema=value_schema, key='mykey',
                         key_schema=key_schema)
def test_produce_with_empty_key_value_with_schema(self):
    key_schema = avro.load(os.path.join(avsc_dir, "primitive_string.avsc"))
    value_schema = avro.load(os.path.join(avsc_dir, "primitive_float.avsc"))
    schema_registry = MockSchemaRegistryClient()
    producer = AvroProducer({}, schema_registry=schema_registry,
                            default_key_schema=key_schema,
                            default_value_schema=value_schema)
    producer.produce(topic='test', value=0.0, key='')
def test_produce_with_empty_key_no_schema(self):
    value_schema = avro.load(os.path.join(avsc_dir, "primitive_float.avsc"))
    schema_registry = MockSchemaRegistryClient()
    producer = AvroProducer({}, schema_registry=schema_registry,
                            default_value_schema=value_schema)
    with self.assertRaises(KeySerializerError):
        producer.produce(topic='test', value=0.0, key='')
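For context on the four tests above: the .avsc fixtures they load are plain Avro schema JSON. A plausible sketch of their contents, assuming the conventional shapes these names suggest (the actual fixture files are not shown in this section):

primitive_float.avsc (assumed): {"type": "float"}
primitive_string.avsc (assumed): {"type": "string"}
basic_schema.avsc (assumed, a record with one string field to match the {"name": ...} payloads):
{"type": "record", "name": "basic", "fields": [{"name": "name", "type": "string"}]}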
def verify_schema_registry_client():
    from confluent_kafka import avro

    sr_conf = {'url': schema_registry_url}
    sr = avro.CachedSchemaRegistryClient(sr_conf)

    subject = str(uuid.uuid4())

    avsc_dir = os.path.join(os.path.dirname(__file__), os.pardir, 'avro')
    schema = avro.load(os.path.join(avsc_dir, "primitive_float.avsc"))

    schema_id = sr.register(subject, schema)
    assert schema == sr.get_by_id(schema_id)

    latest_id, latest_schema, latest_version = sr.get_latest_schema(subject)
    assert schema == latest_schema
    assert sr.get_version(subject, schema) == latest_version

    sr.update_compatibility("FULL", subject)
    assert sr.get_compatibility(subject) == "FULL"
    assert sr.test_compatibility(subject, schema)
    assert sr.delete_subject(subject) == [1]
class Station(Producer):
    """Defines a single station"""

    key_schema = avro.load(
        f"{Path(__file__).parents[0]}/schemas/arrival_key.json")
    value_schema = avro.load(
        f"{Path(__file__).parents[0]}/schemas/arrival_value.json")

    def __init__(self, station_id, name, color, direction_a=None, direction_b=None):
        self.name = name
        station_name = (
            self.name.lower()
            .replace("/", "_and_")
            .replace(" ", "_")
            .replace("-", "_")
            .replace("'", "")
        )

        # Topic sizing estimate based on
        # https://dattell.com/data-architecture-blog/kafka-optimization-how-many-partitions-are-needed/
        # With 8 stations and roughly 12 arrivals per hour, throughput is modest,
        # so one partition and one replica are a reasonable starting point.
        # TODO: Come up with a better topic name
        topic_name = "CTAProducersStation"
        super().__init__(
            topic_name,
            key_schema=Station.key_schema,
            value_schema=Station.value_schema,
            num_partitions=1,
            num_replicas=1,
        )

        self.station_id = int(station_id)
        self.color = color
        self.dir_a = direction_a
        self.dir_b = direction_b
        self.a_train = None
        self.b_train = None
        self.turnstile = Turnstile(self)
        self.line = color.name

    def run(self, train, direction, prev_station_id, prev_direction):
        """Simulates train arrivals at this station"""
        # Produce an arrival message to Kafka; log and continue on failure.
        try:
            self.producer.produce(
                topic=self.topic_name,
                key={"timestamp": self.time_millis()},
                value={
                    "station_id": self.station_id,
                    "train_id": train.train_id,
                    "direction": direction,
                    "line": self.line,
                    "train_status": train.status.name,
                    "prev_station_id": prev_station_id,
                    "prev_direction": prev_direction,
                },
            )
        except Exception as e:
            logger.info("arrival kafka integration incomplete - skipping")
            print(f"Exception while producing record value to topic - {self.topic_name}: {e}")

    def __str__(self):
        return "Station | {:^5} | {:<30} | Direction A: | {:^5} | departing to {:<30} | Direction B: | {:^5} | departing to {:<30} | ".format(
            self.station_id,
            self.name,
            self.a_train.train_id if self.a_train is not None else "---",
            self.dir_a.name if self.dir_a is not None else "---",
            self.b_train.train_id if self.b_train is not None else "---",
            self.dir_b.name if self.dir_b is not None else "---",
        )

    def __repr__(self):
        return str(self)

    def arrive_a(self, train, prev_station_id, prev_direction):
        """Denotes a train arrival at this station in the 'a' direction"""
        self.a_train = train
        self.run(train, "a", prev_station_id, prev_direction)

    def arrive_b(self, train, prev_station_id, prev_direction):
        """Denotes a train arrival at this station in the 'b' direction"""
        self.b_train = train
        self.run(train, "b", prev_station_id, prev_direction)

    def close(self):
        """Prepares the producer for exit by cleaning up the producer"""
        self.turnstile.close()
        super(Station, self).close()
def verify_avro_explicit_read_schema():
    """Verify that reading Avro with an explicit reader schema works."""
    from confluent_kafka import avro

    base_conf = {'bootstrap.servers': bootstrap_servers,
                 'error_cb': error_cb,
                 'schema.registry.url': schema_registry_url}

    consumer_conf = dict(base_conf, **{
        'group.id': 'test.py',
        'session.timeout.ms': 6000,
        'enable.auto.commit': False,
        'on_commit': print_commit_result,
        'auto.offset.reset': 'earliest',
        'schema.registry.url': schema_registry_url})

    avsc_dir = os.path.join(os.path.dirname(__file__), os.pardir, 'avro')
    writer_schema = avro.load(os.path.join(avsc_dir, "user_v1.avsc"))
    reader_schema = avro.load(os.path.join(avsc_dir, "user_v2.avsc"))

    user_value1 = {"name": " Rogers Nelson"}
    user_value2 = {"name": "Kenny Loggins"}

    combinations = [
        dict(key=user_value1, key_schema=writer_schema,
             value=user_value2, value_schema=writer_schema),
        dict(key=user_value2, key_schema=writer_schema,
             value=user_value1, value_schema=writer_schema),
    ]
    avro_topic = topic + str(uuid.uuid4())

    p = avro.AvroProducer(base_conf)
    for i, combo in enumerate(combinations):
        p.produce(topic=avro_topic, **combo)
    p.flush()

    c = avro.AvroConsumer(consumer_conf,
                          reader_key_schema=reader_schema,
                          reader_value_schema=reader_schema)
    c.subscribe([avro_topic])

    msgcount = 0
    while msgcount < len(combinations):
        msg = c.poll(1)

        if msg is None:
            continue
        if msg.error():
            print("Consumer error {}".format(msg.error()))
            continue

        msgcount += 1
        # Avro schema projection should return the two fields not present in the writer schema
        try:
            assert msg.key().get('favorite_number') == 42
            assert msg.key().get('favorite_color') == "purple"
            assert msg.value().get('favorite_number') == 42
            assert msg.value().get('favorite_color') == "purple"
            print("success: schema projection worked for explicit reader schema")
        except KeyError:
            raise confluent_kafka.avro.SerializerError(
                "Schema projection failed when setting reader schema.")
def verify_avro():
    from confluent_kafka import avro
    avsc_dir = os.path.join(os.path.dirname(__file__), os.pardir, 'tests', 'avro')

    # Producer config
    conf = {'bootstrap.servers': bootstrap_servers,
            'error_cb': error_cb,
            'api.version.request': api_version_request}

    # Create producer
    if schema_registry_url:
        conf['schema.registry.url'] = schema_registry_url
        p = avro.AvroProducer(conf)
    else:
        p = avro.AvroProducer(conf, schema_registry=InMemorySchemaRegistry())

    prim_float = avro.load(os.path.join(avsc_dir, "primitive_float.avsc"))
    prim_string = avro.load(os.path.join(avsc_dir, "primitive_string.avsc"))
    basic = avro.load(os.path.join(avsc_dir, "basic_schema.avsc"))
    str_value = 'abc'
    float_value = 32.0

    combinations = [
        dict(key=float_value, key_schema=prim_float),
        dict(value=float_value, value_schema=prim_float),
        dict(key={'name': 'abc'}, key_schema=basic),
        dict(value={'name': 'abc'}, value_schema=basic),
        dict(value={'name': 'abc'}, value_schema=basic, key=float_value, key_schema=prim_float),
        dict(value={'name': 'abc'}, value_schema=basic, key=str_value, key_schema=prim_string),
        dict(value=float_value, value_schema=prim_float, key={'name': 'abc'}, key_schema=basic),
        dict(value=float_value, value_schema=prim_float, key=str_value, key_schema=prim_string),
        dict(value=str_value, value_schema=prim_string, key={'name': 'abc'}, key_schema=basic),
        dict(value=str_value, value_schema=prim_string, key=float_value, key_schema=prim_float),
        # Verify identity check allows falsy object values (e.g., 0, empty string)
        # to be handled properly (issue #342)
        dict(value='', value_schema=prim_string, key=0.0, key_schema=prim_float),
        dict(value=0.0, value_schema=prim_float, key='', key_schema=prim_string),
    ]

    # Consumer config
    cons_conf = {'bootstrap.servers': bootstrap_servers,
                 'group.id': 'test.py',
                 'session.timeout.ms': 6000,
                 'enable.auto.commit': False,
                 'api.version.request': api_version_request,
                 'on_commit': print_commit_result,
                 'error_cb': error_cb,
                 'auto.offset.reset': 'earliest'}

    for i, combo in enumerate(combinations):
        combo['topic'] = str(uuid.uuid4())
        p.produce(**combo)
        p.poll(0)
        p.flush()

        # Create consumer
        conf = copy(cons_conf)
        if schema_registry_url:
            conf['schema.registry.url'] = schema_registry_url
            c = avro.AvroConsumer(conf)
        else:
            c = avro.AvroConsumer(conf, schema_registry=InMemorySchemaRegistry())
        c.subscribe([combo['topic']])

        while True:
            msg = c.poll(0)
            if msg is None:
                continue

            if msg.error():
                if msg.error().code() == confluent_kafka.KafkaError._PARTITION_EOF:
                    break
                else:
                    continue

            tstype, timestamp = msg.timestamp()
            print('%s[%d]@%d: key=%s, value=%s, tstype=%d, timestamp=%s' %
                  (msg.topic(), msg.partition(), msg.offset(),
                   msg.key(), msg.value(), tstype, timestamp))

            # omit empty Avro fields from payload for comparison
            record_key = msg.key()
            record_value = msg.value()
            if isinstance(msg.key(), dict):
                record_key = {k: v for k, v in msg.key().items() if v is not None}

            if isinstance(msg.value(), dict):
                record_value = {k: v for k, v in msg.value().items() if v is not None}

            assert combo.get('key') == record_key
            assert combo.get('value') == record_value

            c.commit(msg, asynchronous=False)

        # Close consumer
        c.close()
from time import sleep
import os
import atexit

from confluent_kafka import avro
from confluent_kafka.avro import AvroProducer

DRIVER_FILE_PREFIX = "./drivers/"
KAFKA_TOPIC = "driver-positions-pyavro"

# Load a driver id from an environment variable;
# if it isn't present, use "driver-3"
DRIVER_ID = os.getenv("DRIVER_ID", "driver-3")

print("Starting Python Avro producer.")

value_schema = avro.load("position_value.avsc")
key_schema = avro.load("position_key.avsc")

# Configure the location of the bootstrap server, Confluent interceptors,
# a partitioner compatible with Java, and the key/value schemas.
# See https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md
producer = AvroProducer(
    {
        'bootstrap.servers': 'kafka:9092',
        'plugin.library.paths': 'monitoring-interceptor',
        'partitioner': 'murmur2_random',
        'schema.registry.url': 'http://schema-registry:8081'
    },
    default_key_schema=key_schema,
    default_value_schema=value_schema)
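A minimal sketch of how this producer might then be driven. The field names in the key and value dicts are assumptions (only the .avsc file names appear above), so treat this as illustrative, not as the file's actual send loop:

# Hypothetical send loop; key/value field names are assumed, not read
# from position_key.avsc / position_value.avsc.
import time

try:
    while True:
        producer.produce(
            topic=KAFKA_TOPIC,
            key={"key": DRIVER_ID},                       # assumed key field
            value={"latitude": 51.5, "longitude": -0.1},  # assumed value fields
        )
        producer.poll(0)  # serve delivery callbacks
        time.sleep(1)
except KeyboardInterrupt:
    producer.flush()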
def test_schema_from_file(self):
    parsed = avro.load(data_gen.get_schema_path('adv_schema.avsc'))
    self.assertTrue(isinstance(parsed, schema.Schema))
class Station(Producer):
    """Defines a single station"""

    key_schema = avro.load(
        f"{Path(__file__).parents[0]}/schemas/arrival_key.json")
    value_schema = avro.load(
        f"{Path(__file__).parents[0]}/schemas/arrival_value.json")

    def __init__(self, station_id, name, color, direction_a=None, direction_b=None):
        self.name = name
        station_name = (
            self.name.lower()
            .replace("/", "_and_")
            .replace(" ", "_")
            .replace("-", "_")
            .replace("'", "")
        )

        # TODO: Come up with a better topic name
        topic_name = f"{station_name}"
        super().__init__(
            topic_name,
            key_schema=Station.key_schema,
            value_schema=Station.value_schema,
            num_partitions=3,
            num_replicas=1,
        )

        self.station_id = int(station_id)
        self.color = color
        self.dir_a = direction_a
        self.dir_b = direction_b
        self.a_train = None
        self.b_train = None
        self.turnstile = Turnstile(self)

    def run(self, train, direction, prev_station_id, prev_direction):
        """Simulates train arrivals at this station"""
        station_key = {"timestamp": self.time_millis()}
        station_val = {
            "station_id": self.station_id,
            "train_id": train.train_id,
            "direction": direction,
            "line": self.color,
            "train_status": train.status.name,
            "prev_station_id": prev_station_id,
            "prev_direction": prev_direction,
        }

        # Debug output before producing the arrival event
        print('Topic: ', self.topic_name)
        print('Producer: ', self.producer.produce)
        print('key_schema: ', self.key_schema)
        print('Key: ', station_key)
        print('value_schema: ', self.value_schema)
        print('Val: ', station_val)

        self.producer.produce(topic=self.topic_name,
                              key_schema=self.key_schema,
                              key=station_key,
                              value_schema=self.value_schema,
                              value=station_val)
        print('Stations issa go')

    def __str__(self):
        return "Station | {:^5} | {:<30} | Direction A: | {:^5} | departing to {:<30} | Direction B: | {:^5} | departing to {:<30} | ".format(
            self.station_id,
            self.name,
            self.a_train.train_id if self.a_train is not None else "---",
            self.dir_a.name if self.dir_a is not None else "---",
            self.b_train.train_id if self.b_train is not None else "---",
            self.dir_b.name if self.dir_b is not None else "---",
        )

    def __repr__(self):
        return str(self)

    def arrive_a(self, train, prev_station_id, prev_direction):
        """Denotes a train arrival at this station in the 'a' direction"""
        self.a_train = train
        self.run(train, "a", prev_station_id, prev_direction)

    def arrive_b(self, train, prev_station_id, prev_direction):
        """Denotes a train arrival at this station in the 'b' direction"""
        self.b_train = train
        self.run(train, "b", prev_station_id, prev_direction)

    def close(self):
        """Prepares the producer for exit by cleaning up the producer"""
        self.turnstile.close()
        super(Station, self).close()
def test_produce_value_and_key_schemas(self):
    value_schema = avro.load(os.path.join(avsc_dir, "basic_schema.avsc"))
    producer = AvroProducer({'schema.registry.url': 'http://127.0.0.1:9001'},
                            default_value_schema=value_schema,
                            default_key_schema=value_schema)
    with self.assertRaises(ConnectionError):  # nonexistent schema registry
        producer.produce(topic='test', value={"name": 'abc"'},
                         key={"name": 'abc"'})
class Station(Producer):
    """Defines a single station"""

    key_schema = avro.load(
        f"{Path(__file__).parents[0]}/schemas/arrival_key.json")
    value_schema = avro.load(
        f"{Path(__file__).parents[0]}/schemas/arrival_value.json")

    def __init__(self, station_id, name, color, direction_a=None, direction_b=None):
        self.name = name
        station_name = (
            self.name.lower()
            .replace("/", "_and_")
            .replace(" ", "_")
            .replace("-", "_")
            .replace("'", "")
        )

        # TODO: Come up with a better topic name
        topic_name = f"{station_name}"
        # Partition sizing, per
        # https://www.confluent.io/blog/how-choose-number-topics-partitions-kafka-cluster/:
        # with target throughput t, per-partition producer throughput p, and
        # per-partition consumer throughput c, choose at least max(t/p, t/c).
        # Example from the video: 3 producers and 5 consumers at 10 MB/s each
        # against a 100 MB/s target gives max(100/30, 100/50) ~= 3.3, so about
        # 4 partitions.
        super().__init__(
            topic_name,
            key_schema=Station.key_schema,
            value_schema=Station.value_schema,
            num_partitions=2,  # more partitions raise throughput but also latency
            num_replicas=1,    # replicas are shared between brokers
        )

        self.station_id = int(station_id)
        self.color = color
        self.dir_a = direction_a
        self.dir_b = direction_b
        self.a_train = None
        self.b_train = None
        self.turnstile = Turnstile(self)

    def run(self, train, direction, prev_station_id, prev_direction):
        """Simulates train arrivals at this station"""
        # Pair the arrival event with the Avro key and value schemas.
        # See train.py and line.py for the properties of the train and line instances.
        self.producer.produce(
            topic=self.topic_name,
            key={"timestamp": self.time_millis()},
            value={
                "station_id": self.station_id,
                "train_id": train.train_id,          # `self.train_id` in train.py
                "direction": direction,
                "line": self.color.name,             # `self.color.name` in line.py
                "train_status": train.status.name,   # `self.status.name` in train.py
                "prev_station_id": prev_station_id,
                "prev_direction": prev_direction,
            },
        )
        logger.info("producing arrival event to kafka is complete")

    def __str__(self):
        return "Station | {:^5} | {:<30} | Direction A: | {:^5} | departing to {:<30} | Direction B: | {:^5} | departing to {:<30} | ".format(
            self.station_id,
            self.name,
            self.a_train.train_id if self.a_train is not None else "---",
            self.dir_a.name if self.dir_a is not None else "---",
            self.b_train.train_id if self.b_train is not None else "---",
            self.dir_b.name if self.dir_b is not None else "---",
        )

    def __repr__(self):
        return str(self)

    def arrive_a(self, train, prev_station_id, prev_direction):
        """Denotes a train arrival at this station in the 'a' direction"""
        self.a_train = train
        self.run(train, "a", prev_station_id, prev_direction)

    def arrive_b(self, train, prev_station_id, prev_direction):
        """Denotes a train arrival at this station in the 'b' direction"""
        self.b_train = train
        self.run(train, "b", prev_station_id, prev_direction)

    def close(self):
        """Prepares the producer for exit by cleaning up the producer"""
        self.turnstile.close()
        super(Station, self).close()
class Station(Producer):
    """Defines a single station"""

    key_schema = avro.load(
        f"{Path(__file__).parents[0]}/schemas/arrival_key.json")
    value_schema = avro.load(
        f"{Path(__file__).parents[0]}/schemas/arrival_value.json")

    def __init__(self, station_id, name, color, direction_a=None, direction_b=None):
        self.name = name
        station_name = (
            self.name.lower()
            .replace("/", "_and_")
            .replace(" ", "_")
            .replace("-", "_")
            .replace("'", "")
        )
        topic_name = f"org.chicago.cta.station.arrivals.{station_name}"
        super().__init__(
            topic_name,
            key_schema=Station.key_schema,
            value_schema=Station.value_schema,
            num_partitions=5,
            num_replicas=3,
        )

        self.station_id = int(station_id)
        self.color = color
        self.dir_a = direction_a
        self.dir_b = direction_b
        self.a_train = None
        self.b_train = None
        self.turnstile = Turnstile(self)

    def run(self, train, direction, prev_station_id, prev_direction):
        """Simulates train arrivals at this station"""
        arrival_data = Arrival(self.station_id, direction, prev_station_id,
                               prev_direction, train.train_id,
                               train.status.name, self.color.name)
        print(f"Arrival data: {asdict(arrival_data)}")
        self.producer.produce(
            topic=self.topic_name,
            key={"timestamp": self.time_millis()},
            value={
                "station_id": self.station_id,
                "direction": direction,
                "prev_station_id": prev_station_id,
                "prev_direction": prev_direction,
                "train_id": train.train_id,
                "train_status": train.status.name,
                "line": self.color.name,  # TODO: Check if line config is right
            },
        )

    def __str__(self):
        return "Station | {:^5} | {:<30} | Direction A: | {:^5} | departing to {:<30} | Direction B: | {:^5} | departing to {:<30} | ".format(
            self.station_id,
            self.name,
            self.a_train.train_id if self.a_train is not None else "---",
            self.dir_a.name if self.dir_a is not None else "---",
            self.b_train.train_id if self.b_train is not None else "---",
            self.dir_b.name if self.dir_b is not None else "---",
        )

    def __repr__(self):
        return str(self)

    def arrive_a(self, train, prev_station_id, prev_direction):
        """Denotes a train arrival at this station in the 'a' direction"""
        self.a_train = train
        self.run(train, "a", prev_station_id, prev_direction)

    def arrive_b(self, train, prev_station_id, prev_direction):
        """Denotes a train arrival at this station in the 'b' direction"""
        self.b_train = train
        self.run(train, "b", prev_station_id, prev_direction)

    def close(self):
        """Prepares the producer for exit by cleaning up the producer"""
        self.turnstile.close()
        super(Station, self).close()
class Station(Producer):
    """Defines a single station"""

    key_schema = avro.load(
        f"{Path(__file__).parents[0]}/schemas/arrival_key.json")
    value_schema = avro.load(
        f"{Path(__file__).parents[0]}/schemas/arrival_value.json")

    def __init__(self, station_id, name, color, direction_a=None, direction_b=None):
        self.name = name
        station_name = (
            self.name.lower()
            .replace("/", "_and_")
            .replace(" ", "_")
            .replace("-", "_")
            .replace("'", "")
        )

        # Topic name, number of partitions, and number of replicas
        super().__init__(
            topic_name=f"org.chicago.cta.station.{station_name}.arrivals",
            key_schema=Station.key_schema,
            value_schema=Station.value_schema,
            num_partitions=5,
            num_replicas=1,
        )

        self.station_id = int(station_id)
        self.color = color
        self.dir_a = direction_a
        self.dir_b = direction_b
        self.a_train = None
        self.b_train = None
        self.turnstile = Turnstile(self)

    def run(self, train, direction, prev_station_id, prev_direction):
        """Simulates train arrivals at this station"""
        # Produce an arrival message to Kafka
        try:
            self.producer.produce(
                topic=self.topic_name,
                key={"timestamp": self.time_millis()},
                key_schema=self.key_schema,
                value_schema=self.value_schema,
                value={
                    "station_id": self.station_id,
                    "train_id": train.train_id,
                    "direction": direction,
                    "line": self.color.name,
                    "train_status": train.status.name,
                    "prev_station_id": prev_station_id,
                    "prev_direction": prev_direction,
                },
            )
        except Exception as e:
            logger.fatal(e)
            raise e

    def __str__(self):
        return "Station | {:^5} | {:<30} | Direction A: | {:^5} | departing to {:<30} | Direction B: | {:^5} | departing to {:<30} | ".format(
            self.station_id,
            self.name,
            self.a_train.train_id if self.a_train is not None else "---",
            self.dir_a.name if self.dir_a is not None else "---",
            self.b_train.train_id if self.b_train is not None else "---",
            self.dir_b.name if self.dir_b is not None else "---",
        )

    def __repr__(self):
        return str(self)

    def arrive_a(self, train, prev_station_id, prev_direction):
        """Denotes a train arrival at this station in the 'a' direction"""
        self.a_train = train
        self.run(train, "a", prev_station_id, prev_direction)

    def arrive_b(self, train, prev_station_id, prev_direction):
        """Denotes a train arrival at this station in the 'b' direction"""
        self.b_train = train
        self.run(train, "b", prev_station_id, prev_direction)

    def close(self):
        """Prepares the producer for exit by cleaning up the producer"""
        self.turnstile.close()
        super(Station, self).close()
class Station(Producer):
    """Defines a single station"""

    key_schema = avro.load(
        f"{Path(__file__).parents[0]}/schemas/arrival_key.json")
    value_schema = avro.load(
        f"{Path(__file__).parents[0]}/schemas/arrival_value.json")

    def __init__(self, station_id, name, color, direction_a=None, direction_b=None):
        self.name = name
        station_name = (
            self.name.lower()
            .replace("/", "_and_")
            .replace(" ", "_")
            .replace("-", "_")
            .replace("'", "")
        )

        topic_name = f"station_topic_{station_name}_{color.name}"
        super().__init__(
            topic_name,
            key_schema=Station.key_schema,
            value_schema=Station.value_schema,
            num_partitions=1,
            num_replicas=1,
        )

        self.station_id = int(station_id)
        self.color = color
        self.dir_a = direction_a
        self.dir_b = direction_b
        self.a_train = None
        self.b_train = None
        self.turnstile = Turnstile(self)

    def run(self, train, direction, prev_station_id, prev_direction):
        """Simulates train arrivals at this station"""
        logger.info(
            f"{self.topic_name}: train {train.train_id} arrived from direction "
            f"{direction} and prev_direction {prev_direction} "
            f"and prev_station_id {prev_station_id}")

        # Substitute defaults when there is no previous station/direction.
        if not prev_station_id:
            prev_station_id = 0
        if not prev_direction:
            prev_direction = 'None'

        self.producer.produce(
            topic=self.topic_name,
            key={"timestamp": self.time_millis()},
            value={
                "station_id": self.station_id,
                "train_id": train.train_id,
                "direction": direction,
                "line": self.color.name,
                "train_status": train.status.name,
                "prev_station_id": prev_station_id,
                "prev_direction": prev_direction,
            },
        )

    def __str__(self):
        return "Station | {:^5} | {:<30} | Direction A: | {:^5} | departing to {:<30} | Direction B: | {:^5} | departing to {:<30} | ".format(
            self.station_id,
            self.name,
            self.a_train.train_id if self.a_train is not None else "---",
            self.dir_a.name if self.dir_a is not None else "---",
            self.b_train.train_id if self.b_train is not None else "---",
            self.dir_b.name if self.dir_b is not None else "---",
        )

    def __repr__(self):
        return str(self)

    def arrive_a(self, train, prev_station_id, prev_direction):
        """Denotes a train arrival at this station in the 'a' direction"""
        self.a_train = train
        self.run(train, "a", prev_station_id, prev_direction)

    def arrive_b(self, train, prev_station_id, prev_direction):
        """Denotes a train arrival at this station in the 'b' direction"""
        self.b_train = train
        self.run(train, "b", prev_station_id, prev_direction)

    def close(self):
        """Prepares the producer for exit by cleaning up the producer"""
        self.turnstile.close()
        super(Station, self).close()
def run_avro_loop(producer_conf, consumer_conf):
    from confluent_kafka import avro
    avsc_dir = os.path.join(os.path.dirname(__file__), os.pardir, 'avro')

    p = avro.AvroProducer(producer_conf)

    prim_float = avro.load(os.path.join(avsc_dir, "primitive_float.avsc"))
    prim_string = avro.load(os.path.join(avsc_dir, "primitive_string.avsc"))
    basic = avro.load(os.path.join(avsc_dir, "basic_schema.avsc"))
    str_value = 'abc'
    float_value = 32.0

    combinations = [
        dict(key=float_value, key_schema=prim_float),
        dict(value=float_value, value_schema=prim_float),
        dict(key={'name': 'abc'}, key_schema=basic),
        dict(value={'name': 'abc'}, value_schema=basic),
        dict(value={'name': 'abc'}, value_schema=basic, key=float_value, key_schema=prim_float),
        dict(value={'name': 'abc'}, value_schema=basic, key=str_value, key_schema=prim_string),
        dict(value=float_value, value_schema=prim_float, key={'name': 'abc'}, key_schema=basic),
        dict(value=float_value, value_schema=prim_float, key=str_value, key_schema=prim_string),
        dict(value=str_value, value_schema=prim_string, key={'name': 'abc'}, key_schema=basic),
        dict(value=str_value, value_schema=prim_string, key=float_value, key_schema=prim_float),
        # Verify identity check allows falsy object values (e.g., 0, empty string)
        # to be handled properly (issue #342)
        dict(value='', value_schema=prim_string, key=0.0, key_schema=prim_float),
        dict(value=0.0, value_schema=prim_float, key='', key_schema=prim_string),
    ]

    for i, combo in enumerate(combinations):
        combo['topic'] = str(uuid.uuid4())
        combo['headers'] = [('index', str(i))]
        p.produce(**combo)
    p.flush()

    c = avro.AvroConsumer(consumer_conf)
    c.subscribe([t['topic'] for t in combinations])

    msgcount = 0
    while msgcount < len(combinations):
        msg = c.poll(1)
        if msg is None:
            continue
        if msg.error():
            print(msg.error())
            continue

        tstype, timestamp = msg.timestamp()
        print('%s[%d]@%d: key=%s, value=%s, tstype=%d, timestamp=%s' %
              (msg.topic(), msg.partition(), msg.offset(),
               msg.key(), msg.value(), tstype, timestamp))

        # omit empty Avro fields from payload for comparison
        record_key = msg.key()
        record_value = msg.value()
        index = int(dict(msg.headers())['index'])
        if isinstance(msg.key(), dict):
            record_key = {k: v for k, v in msg.key().items() if v is not None}

        if isinstance(msg.value(), dict):
            record_value = {k: v for k, v in msg.value().items() if v is not None}

        assert combinations[index].get('key') == record_key
        assert combinations[index].get('value') == record_value

        c.commit()
        msgcount += 1

    # Close consumer
    c.close()
        self.message = getattr(Timing_data, 'message')


if __name__ == "__main__":
    if len(sys.argv) != 5:
        sys.stderr.write('Usage: %s <bootstrap-brokers> <schema-registry-url> '
                         '<avro-topic> <json-topic>\n' % sys.argv[0])
        sys.exit(1)

    # config
    broker = sys.argv[1]
    schema_registry_url = sys.argv[2]

    # config avroProducer
    avrotopic = sys.argv[3]
    value_schema = avro.load('/home/silence/PycharmProjects/test/Avro/Timing.avsc')
    key_schema = avro.load('/home/silence/PycharmProjects/test/Avro/Id.avsc')
    avroconf = {
        'bootstrap.servers': broker,
        'schema.registry.url': schema_registry_url
    }

    # config jsonConsumer
    jsontopic = sys.argv[4]
    jsonconf = {'bootstrap.servers': broker}

    # create avroProducer
    avroProducer = AvroProducer(avroconf,
                                default_key_schema=key_schema,
                                default_value_schema=value_schema)
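A minimal sketch of how this producer could then emit a record, assuming Id.avsc describes a key with an "id" field and Timing.avsc a value with a "message" field (both field names are assumptions, not read from the schema files):

# Hypothetical produce call; field names are assumed.
avroProducer.produce(
    topic=avrotopic,
    key={"id": "timing-1"},
    value={"message": "example timing payload"},
)
avroProducer.flush()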
class KafkaStream(metaclass=IterateStream):
    CONFIG = {
        'start': {
            'group.id': 'groupid',
            'default.topic.config': {
                'auto.offset.reset': 'beginning',
                'auto.commit.enable': 'false'
            }
        },
        'end': {
            'group.id': 'groupid'
        }
    }

    OFFSETS = {
        'start': confluent_kafka.OFFSET_BEGINNING,
        'end': confluent_kafka.OFFSET_END
    }

    KEY_SCHEMA = avro.load(os.path.join(SCHEMAS, 'keyschema.avsc'))
    VALUE_SCHEMA = {
        'gdax': avro.load(os.path.join(SCHEMAS, 'gdax.avsc')),
        'reddit': avro.load(os.path.join(SCHEMAS, 'reddit.avsc')),
        'twitter': avro.load(os.path.join(SCHEMAS, 'twitter.avsc'))
    }

    @classmethod
    def producer(cls, topic='gdax'):
        ip = cls.determine_ip()
        # KEY_SCHEMA is a single schema; VALUE_SCHEMA is a dict keyed by topic
        # (the original subscripted KEY_SCHEMA instead, which would fail).
        return AvroProducer(
            {
                'bootstrap.servers': ip + ':9092',
                'schema.registry.url': 'http://' + ip + ':8081'
            },
            default_key_schema=cls.KEY_SCHEMA,
            default_value_schema=cls.VALUE_SCHEMA[topic])

    @classmethod
    def consumer(cls, topic='gdax', offset='start'):
        ip = cls.determine_ip()
        try:
            _offset = cls.OFFSETS[offset]
            _config = cls.CONFIG[offset]
        except KeyError:
            # A raw offset (e.g. an integer) falls back to the 'end' config.
            _config = cls.CONFIG['end']
            _offset = offset
        print(_offset, _config)

        cls.avro_consumer = AvroConsumer(
            dict(
                {
                    'bootstrap.servers': ip + ':9092',
                    'schema.registry.url': 'http://' + ip + ':8081'
                },
                **{
                    'group.id': str(uuid.uuid1()).split('-')[0],
                    'default.topic.config': {
                        'auto.offset.reset': 'beginning',
                        'auto.commit.enable': 'false'
                    }
                }))
        cls.avro_consumer.assign(
            [TopicPartition(topic, partition=0, offset=_offset)])
        return cls

    @staticmethod
    def determine_ip():
        try:
            return os.environ['KAFKA_SERVER_IP']
        except KeyError:
            return 'localhost'
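A short hypothetical usage sketch for this class, under its own assumptions (a broker at KAFKA_SERVER_IP:9092 and a schema registry at :8081):

# Hypothetical: read one message from the beginning of the 'gdax' topic.
consumer = KafkaStream.consumer('gdax', offset='start').avro_consumer
msg = consumer.poll(1.0)
if msg is not None and not msg.error():
    print(msg.key(), msg.value())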
class Station(Producer):
    """Defines a single station"""

    key_schema = avro.load(
        f"{Path(__file__).parents[0]}/schemas/arrival_key.json")
    value_schema = avro.load(
        f"{Path(__file__).parents[0]}/schemas/arrival_value.json")

    def __init__(self, station_id, name, color, direction_a=None, direction_b=None):
        self.name = name
        station_name = (
            self.name.lower()
            .replace("/", "_and_")
            .replace(" ", "_")
            .replace("-", "_")
            .replace("'", "")
        )

        # TODO: Come up with a better topic name
        topic_name = f"org.chicago.cta.station.arrivals.{station_name}"

        # Call super to instantiate the parent's vars as well, incl. self.producer
        super().__init__(
            topic_name,
            key_schema=Station.key_schema,
            value_schema=Station.value_schema,
            num_partitions=3,
            num_replicas=1,
        )

        self.station_id = int(station_id)
        self.color = color
        self.dir_a = direction_a
        self.dir_b = direction_b
        self.a_train = None
        self.b_train = None
        self.turnstile = Turnstile(self)

    def run(self, train, direction, prev_station_id, prev_direction):
        """Simulates train arrivals at this station"""
        # Schemas were already set at instance creation, so they are not
        # passed to produce() here.
        self.producer.produce(
            topic=self.topic_name,
            key={"timestamp": self.time_millis()},
            value={
                "station_id": self.station_id,
                "train_id": train.train_id,
                "direction": direction,
                "line": self.color.name,
                "train_status": train.status.name,
                "prev_station_id": prev_station_id,
                "prev_direction": prev_direction,
            },
        )

    def __str__(self):
        return "Station | {:^5} | {:<30} | Direction A: | {:^5} | departing to {:<30} | Direction B: | {:^5} | departing to {:<30} | ".format(
            self.station_id,
            self.name,
            self.a_train.train_id if self.a_train is not None else "---",
            self.dir_a.name if self.dir_a is not None else "---",
            self.b_train.train_id if self.b_train is not None else "---",
            self.dir_b.name if self.dir_b is not None else "---",
        )

    def __repr__(self):
        return str(self)

    def arrive_a(self, train, prev_station_id, prev_direction):
        """Denotes a train arrival at this station in the 'a' direction"""
        self.a_train = train
        self.run(train, "a", prev_station_id, prev_direction)

    def arrive_b(self, train, prev_station_id, prev_direction):
        """Denotes a train arrival at this station in the 'b' direction"""
        self.b_train = train
        self.run(train, "b", prev_station_id, prev_direction)

    def close(self):
        """Prepares the producer for exit by cleaning up the producer"""
        self.turnstile.close()
        super(Station, self).close()
def test_schema_load_parse_error(self):
    with pytest.raises(avro.ClientError) as excinfo:
        avro.load(data_gen.get_schema_path("invalid_scema.avsc"))
    assert 'Schema parse failed:' in str(excinfo.value)
class Turnstile(Producer):
    key_schema = avro.load(
        f"{Path(__file__).parents[0]}/schemas/turnstile_key.json")
    value_schema = avro.load(
        f"{Path(__file__).parents[0]}/schemas/turnstile_value.json")

    def __init__(self, station):
        """Create the Turnstile"""
        station_name = (
            station.name.lower()
            .replace("/", "_and_")
            .replace(" ", "_")
            .replace("-", "_")
            .replace("'", "")
        )

        # TODO: Come up with a better topic name
        super().__init__(
            f"{station_name}",
            key_schema=Turnstile.key_schema,
            value_schema=Turnstile.value_schema,
            num_partitions=3,
            num_replicas=1,
        )
        self.station = station
        self.turnstile_hardware = TurnstileHardware(station)

    def run(self, timestamp, time_step):
        """Simulates riders entering through the turnstile."""
        num_entries = self.turnstile_hardware.get_entries(timestamp, time_step)

        # Emit a message to the turnstile topic for the entries that were calculated.
        turnstile_key = {"timestamp": self.time_millis()}
        turnstile_val = {
            "station_id": self.station.station_id,
            "station_name": self.station.name,
            "line": self.station.color,
        }

        # Debug output before producing
        print('\n\n\n\nTopic: ', self.topic_name)
        print('Producer: ', self.producer.produce)
        print('key_schema: ', self.key_schema)
        print('Key: ', turnstile_key)
        print('value_schema: ', self.value_schema)
        print('Val: ', turnstile_val)

        self.producer.produce(topic=self.topic_name,
                              key_schema=self.key_schema,
                              key=turnstile_key,
                              value_schema=self.value_schema,
                              value=turnstile_val)
        print('Producers issa go!\n\n\n\n')
import json
import time

from confluent_kafka import avro
from confluent_kafka.avro import AvroProducer

MESSAGE_INTERVAL_SECONDS = 5
topic = "ingester"


def delivery_report(err, msg):
    """Called once for each message produced to indicate the delivery result.
    Triggered by poll() or flush()."""
    if err is not None:
        print('Message delivery failed: {}'.format(err))
    else:
        print('Message delivered to {} [{}]'.format(msg.topic(), msg.partition()))


conf = {
    "bootstrap.servers": "10.227.52.245:31090,10.227.52.246:31091,10.227.52.247:31092",
    "on_delivery": delivery_report,
    "schema.registry.url": "http://10.227.52.247:30553"
}

key_schema = avro.load("./schemas/{}-key.avsc".format(topic))
value_schema = avro.load("./schemas/{}-value.avsc".format(topic))

avroProducer = AvroProducer(conf,
                            default_key_schema=key_schema,
                            default_value_schema=value_schema)

with open('./data/example_ingest_messages.json', 'r') as json_file:
    messages = json.load(json_file)
    # Assumes the JSON file holds a list of [key, value] pairs;
    # for a JSON object, iterate over messages.items() instead.
    for key, value in messages:
        print(topic, key, value)
        avroProducer.produce(topic=topic, key=key, value=value)
        avroProducer.poll(0)
        time.sleep(MESSAGE_INTERVAL_SECONDS)

avroProducer.flush()
import asyncio
import random
from datetime import datetime

from confluent_kafka import avro
from confluent_kafka.avro import AvroProducer
import numpy as np
from scipy.stats import t, norm, multinomial

event_1_schema = avro.load('/Users/mbarak/projects/github/showcase/core/src/main/resources/Event.avsc')
key_schema = avro.load('/Users/mbarak/projects/github/showcase/core/src/main/resources/UserKey.avsc')
event_2_schema = avro.load('/Users/mbarak/projects/github/showcase/core/src/main/resources/Event2.avsc')

users = [i for i in range(100)]

event_1_producer = AvroProducer({'bootstrap.servers': 'localhost:9092',
                                 'schema.registry.url': 'http://localhost:8081'},
                                default_value_schema=event_1_schema,
                                default_key_schema=key_schema)
event_2_producer = AvroProducer({'bootstrap.servers': 'localhost:9092',
                                 'schema.registry.url': 'http://localhost:8081'},
                                default_value_schema=event_2_schema,
                                default_key_schema=key_schema)


async def generate_event_1(users, producer):
    n11 = norm(10, 2)
    n12 = norm(20, 5)
    rv = multinomial(1, [0.3, 0.2, 0.5])

    def gen_event(user):
        return (
            "user_%s" % user,
            {
                "userId": "user_%s" % user,
                "userValue1": round(n11.rvs() if user % 4 == 0 else n12.rvs(), 2),
                "userValue2": int(np.argmax(rv.rvs())),
                "timestamp": int((datetime.utcnow() - datetime(1970, 1, 1)).total_seconds() * 1000),
            },
        )  # the source snippet is truncated here; closing braces added for valid syntax
def verify_avro_https():
    from confluent_kafka import avro
    avsc_dir = os.path.join(os.path.dirname(__file__), os.pardir, 'tests', 'avro')

    # Producer config
    conf = {'bootstrap.servers': bootstrap_servers,
            'error_cb': error_cb,
            'api.version.request': api_version_request}
    conf.update(testconf.get('schema_registry_https', {}))

    p = avro.AvroProducer(conf)

    prim_float = avro.load(os.path.join(avsc_dir, "primitive_float.avsc"))
    prim_string = avro.load(os.path.join(avsc_dir, "primitive_string.avsc"))
    basic = avro.load(os.path.join(avsc_dir, "basic_schema.avsc"))
    str_value = 'abc'
    float_value = 32.0

    combinations = [
        dict(key=float_value, key_schema=prim_float),
        dict(value=float_value, value_schema=prim_float),
        dict(key={'name': 'abc'}, key_schema=basic),
        dict(value={'name': 'abc'}, value_schema=basic),
        dict(value={'name': 'abc'}, value_schema=basic, key=float_value, key_schema=prim_float),
        dict(value={'name': 'abc'}, value_schema=basic, key=str_value, key_schema=prim_string),
        dict(value=float_value, value_schema=prim_float, key={'name': 'abc'}, key_schema=basic),
        dict(value=float_value, value_schema=prim_float, key=str_value, key_schema=prim_string),
        dict(value=str_value, value_schema=prim_string, key={'name': 'abc'}, key_schema=basic),
        dict(value=str_value, value_schema=prim_string, key=float_value, key_schema=prim_float),
        # Verify identity check allows falsy object values (e.g., 0, empty string)
        # to be handled properly (issue #342)
        dict(value='', value_schema=prim_string, key=0.0, key_schema=prim_float),
        dict(value=0.0, value_schema=prim_float, key='', key_schema=prim_string),
    ]

    for i, combo in enumerate(combinations):
        combo['topic'] = str(uuid.uuid4())
        combo['headers'] = [('index', str(i))]
        p.produce(**combo)
    p.flush()

    conf = {'bootstrap.servers': bootstrap_servers,
            'group.id': generate_group_id(),
            'session.timeout.ms': 6000,
            'enable.auto.commit': False,
            'api.version.request': api_version_request,
            'on_commit': print_commit_result,
            'error_cb': error_cb,
            'auto.offset.reset': 'earliest'}
    conf.update(testconf.get('schema_registry_https', {}))

    c = avro.AvroConsumer(conf)
    c.subscribe([t['topic'] for t in combinations])

    msgcount = 0
    while msgcount < len(combinations):
        msg = c.poll(0)
        if msg is None or msg.error():
            continue

        tstype, timestamp = msg.timestamp()
        print('%s[%d]@%d: key=%s, value=%s, tstype=%d, timestamp=%s' %
              (msg.topic(), msg.partition(), msg.offset(),
               msg.key(), msg.value(), tstype, timestamp))

        # omit empty Avro fields from payload for comparison
        record_key = msg.key()
        record_value = msg.value()
        index = int(dict(msg.headers())['index'])
        if isinstance(msg.key(), dict):
            record_key = {k: v for k, v in msg.key().items() if v is not None}

        if isinstance(msg.value(), dict):
            record_value = {k: v for k, v in msg.value().items() if v is not None}

        assert combinations[index].get('key') == record_key
        assert combinations[index].get('value') == record_value

        c.commit()
        msgcount += 1

    # Close consumer
    c.close()
def load_avro_schema_from_file(
        key_schema_file: str, value_schema_file: str) -> Tuple[Schema, Schema]:
    key_schema = avro.load(key_schema_file)
    value_schema = avro.load(value_schema_file)
    return key_schema, value_schema
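A short usage sketch for this helper, wiring the loaded schemas into an AvroProducer. The file paths and broker/registry addresses are placeholders, not taken from the source:

from confluent_kafka.avro import AvroProducer

# Hypothetical paths and addresses.
key_schema, value_schema = load_avro_schema_from_file(
    "schemas/order_key.avsc", "schemas/order_value.avsc")

producer = AvroProducer(
    {'bootstrap.servers': 'localhost:9092',
     'schema.registry.url': 'http://localhost:8081'},
    default_key_schema=key_schema,
    default_value_schema=value_schema)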
#!/usr/bin/env python
# coding: utf-8

from bs4 import BeautifulSoup
import requests
import re
from time import sleep
import json

from confluent_kafka import avro
from confluent_kafka.avro import AvroProducer

url = input("Enter the URL of the video page: ")
topic_name = input("Enter the name of the topic: ")

value_schema = avro.load('schema/ValueSchema.avsc')
key_schema = avro.load('schema/KeySchema.avsc')

avroProducer = AvroProducer(
    {'message.max.bytes': '15728640',
     'bootstrap.servers': '127.0.0.1:9092',
     'schema.registry.url': 'http://127.0.0.1:8081'},
    default_key_schema=key_schema,
    default_value_schema=value_schema
)


def getM3U8_1(json_obj):
    data = json.loads(json_obj)
    return data.get("text"), data.get("video").get("video_url")


def getM3U8_2():
    ...  # truncated in the source
class Station(Producer):
    """Defines a single station"""

    key_schema = avro.load(
        f"{Path(__file__).parents[0]}/schemas/arrival_key.json")
    value_schema = avro.load(
        f"{Path(__file__).parents[0]}/schemas/arrival_value.json")

    def __init__(self, station_id, name, color, direction_a=None, direction_b=None):
        self.name = name
        station_name = (
            self.name.lower()
            .replace("/", "_and_")
            .replace(" ", "_")
            .replace("-", "_")
            .replace("'", "")
        )
        topic_name = f"{constants.STATION_TOPIC_PREFIX}.{station_name}"
        super().__init__(
            topic_name,
            key_schema=Station.key_schema,
            value_schema=Station.value_schema,
            num_partitions=1,
            num_replicas=1,
        )

        self.station_id = int(station_id)
        self.color = color.name
        self.dir_a = direction_a
        self.dir_b = direction_b
        self.a_train = None
        self.b_train = None
        self.turnstile = Turnstile(self)

    def run(self, train, direction, prev_station_id, prev_direction):
        """Simulates train arrivals at this station"""
        logger.info("arrival kafka integration")
        self.producer.produce(
            topic=self.topic_name,
            key={"timestamp": self.time_millis()},
            value={
                "station_id": self.station_id,
                "train_id": train.train_id,
                "direction": direction,
                "line": self.color,
                "train_status": train.status.name,
                "prev_station_id": prev_station_id,
                "prev_direction": prev_direction,
            },
        )

    def __str__(self):
        return "Station | {:^5} | {:<30} | Direction A: | {:^5} | departing to {:<30} | Direction B: | {:^5} | departing to {:<30} | ".format(
            self.station_id,
            self.name,
            self.a_train.train_id if self.a_train is not None else "---",
            self.dir_a.name if self.dir_a is not None else "---",
            self.b_train.train_id if self.b_train is not None else "---",
            self.dir_b.name if self.dir_b is not None else "---",
        )

    def __repr__(self):
        return str(self)

    def arrive_a(self, train, prev_station_id, prev_direction):
        """Denotes a train arrival at this station in the 'a' direction"""
        self.a_train = train
        self.run(train, "a", prev_station_id, prev_direction)

    def arrive_b(self, train, prev_station_id, prev_direction):
        """Denotes a train arrival at this station in the 'b' direction"""
        self.b_train = train
        self.run(train, "b", prev_station_id, prev_direction)

    def close(self):
        """Prepares the producer for exit by cleaning up the producer"""
        self.turnstile.close()
        super(Station, self).close()
class Turnstile(Producer):
    key_schema = avro.load(
        f"{Path(__file__).parents[0]}/schemas/turnstile_key.json")
    value_schema = avro.load(
        f"{Path(__file__).parents[0]}/schemas/turnstile_value.json")

    def __init__(self, station):
        """Create the Turnstile"""
        station_name = (
            station.name.lower()
            .replace("/", "_and_")
            .replace(" ", "_")
            .replace("-", "_")
            .replace("'", "")
        )

        # TODO: Come up with a better topic name
        super().__init__(
            "org.chicago.cta.turnstile",
            key_schema=Turnstile.key_schema,
            value_schema=Turnstile.value_schema,
            num_partitions=3,
            num_replicas=1)

        self.station = station
        self.turnstile_hardware = TurnstileHardware(station)

    def run(self, timestamp, time_step):
        """Simulates riders entering through the turnstile."""
        try:
            num_entries = self.turnstile_hardware.get_entries(timestamp, time_step)

            # Pair the turnstile events with the Avro key and value schemas.
            logger.info(" %s people entered this station %s ",
                        num_entries, self.station.name)
            # for _ in range(num_entries):
            self.producer.produce(
                topic=self.topic_name,
                key={"timestamp": self.time_millis()},
                value={
                    "station_id": self.station.station_id,
                    "station_name": self.station.name,
                    "line": self.station.color.name,
                },
            )
        except Exception as e:
            logger.info("Turnstile failed to write to topic {} with exception {}"
                        .format(self.topic_name, e))
            logger.info("schema : {}".format(Turnstile.value_schema))
            logger.info("value : {}, {}, {}".format(self.station.station_id,
                                                    self.station.name,
                                                    self.station.color.name))
class Station(Producer):
    """Defines a single station"""

    key_schema = avro.load(
        f"{Path(__file__).parents[0]}/schemas/arrival_key.json")
    value_schema = avro.load(
        f"{Path(__file__).parents[0]}/schemas/arrival_value.json")

    def __init__(self, station_id, name, color, direction_a=None, direction_b=None):
        self.name = name

        # TODO: Come up with a better topic name
        topic_name = 'station.arrivals.fav'
        super().__init__(
            topic_name,
            key_schema=Station.key_schema,
            value_schema=Station.value_schema,
            num_partitions=1,
            num_replicas=1,
        )

        self.station_id = int(station_id)
        self.color = color
        self.dir_a = direction_a
        self.dir_b = direction_b
        self.a_train = None
        self.b_train = None
        self.turnstile = Turnstile(self)

    def run(self, train, direction, prev_station_id, prev_direction):
        """Simulates train arrivals at this station"""
        # Produce an arrival message to Kafka
        self.producer.produce(
            topic=self.topic_name,
            key={"timestamp": self.time_millis()},
            value={
                'station_id': self.station_id,
                'train_id': train.train_id,
                'direction': direction,
                'train_status': train.status.name,
                'line': self.color.name,
                'prev_station_id': prev_station_id,
                'prev_direction': prev_direction
            },
        )

    def __str__(self):
        return "Station | {:^5} | {:<30} | Direction A: | {:^5} | departing to {:<30} | Direction B: | {:^5} | departing to {:<30} | ".format(
            self.station_id,
            self.name,
            self.a_train.train_id if self.a_train is not None else "---",
            self.dir_a.name if self.dir_a is not None else "---",
            self.b_train.train_id if self.b_train is not None else "---",
            self.dir_b.name if self.dir_b is not None else "---",
        )

    def __repr__(self):
        return str(self)

    def arrive_a(self, train, prev_station_id, prev_direction):
        """Denotes a train arrival at this station in the 'a' direction"""
        self.a_train = train
        self.run(train, "a", prev_station_id, prev_direction)

    def arrive_b(self, train, prev_station_id, prev_direction):
        """Denotes a train arrival at this station in the 'b' direction"""
        self.b_train = train
        self.run(train, "b", prev_station_id, prev_direction)

    def close(self):
        """Prepares the producer for exit by cleaning up the producer"""
        self.turnstile.close()
        super(Station, self).close()
class Station(Producer):
    """Defines a single station"""

    key_schema = avro.load(
        f"{Path(__file__).parents[0]}/schemas/arrival_key.json")
    value_schema = avro.load(
        f"{Path(__file__).parents[0]}/schemas/arrival_value.json")

    def __init__(self, station_id, name, color, direction_a=None, direction_b=None):
        self.name = name
        station_name = (
            self.name.lower()
            .replace("/", "_and_")
            .replace(" ", "_")
            .replace("-", "_")
            .replace("'", "")
        )

        topic_name = "org.chicago.cta.station.arrivals.v1"  # old "com.udacity.cta.station.arrival"
        super().__init__(
            topic_name,
            key_schema=Station.key_schema,
            value_schema=Station.value_schema,
            num_partitions=1,
            num_replicas=1,
        )

        self.station_id = int(station_id)
        self.color = color
        self.dir_a = direction_a
        self.dir_b = direction_b
        self.a_train = None
        self.b_train = None
        self.turnstile = Turnstile(self)

    def run(self, train, direction, prev_station_id, prev_direction):
        """Simulates train arrivals at this station"""
        logger.info("arrival kafka integration complete !!")
        self.producer.produce(
            topic=self.topic_name,
            key={"timestamp": self.time_millis()},
            value={
                'station_id': self.station_id,
                'train_id': train.train_id,
                'direction': direction,
                'line': self.color.name,
                'train_status': train.status.name,
                'prev_station_id': prev_station_id,
                'prev_direction': prev_direction,
            },
        )

    def __str__(self):
        return "Station | {:^5} | {:<30} | Direction A: | {:^5} | departing to {:<30} | Direction B: | {:^5} | departing to {:<30} | ".format(
            self.station_id,
            self.name,
            self.a_train.train_id if self.a_train is not None else "---",
            self.dir_a.name if self.dir_a is not None else "---",
            self.b_train.train_id if self.b_train is not None else "---",
            self.dir_b.name if self.dir_b is not None else "---",
        )

    def __repr__(self):
        return str(self)

    def arrive_a(self, train, prev_station_id, prev_direction):
        """Denotes a train arrival at this station in the 'a' direction"""
        self.a_train = train
        self.run(train, "a", prev_station_id, prev_direction)

    def arrive_b(self, train, prev_station_id, prev_direction):
        """Denotes a train arrival at this station in the 'b' direction"""
        self.b_train = train
        self.run(train, "b", prev_station_id, prev_direction)

    def close(self):
        """Prepares the producer for exit by cleaning up the producer"""
        self.turnstile.close()
        super(Station, self).close()
class Station(Producer):
    """Defines a single station"""

    key_schema = avro.load(
        f"{Path(__file__).parents[0]}/schemas/arrival_key.json")
    value_schema = avro.load(
        f"{Path(__file__).parents[0]}/schemas/arrival_value.json")

    def __init__(self, station_id, name, color, direction_a=None, direction_b=None):
        self.name = name
        station_name = (self.name.lower()
                        .replace("/", "_and_")
                        .replace(" ", "_")
                        .replace("-", "_")
                        .replace("'", ""))
        # ARRIVALS_TOPIC_PREFIX is a module-level constant; each station gets
        # its own arrivals topic
        topic_name = f"{ARRIVALS_TOPIC_PREFIX}{station_name}"
        super().__init__(
            topic_name,
            key_schema=Station.key_schema,
            value_schema=Station.value_schema,
            num_partitions=1,
            num_replicas=1,
        )

        self.station_id = int(station_id)
        self.color = color
        self.dir_a = direction_a
        self.dir_b = direction_b
        self.a_train = None
        self.b_train = None
        self.turnstile = Turnstile(self)

    def run(self, train, direction, prev_station_id, prev_direction):
        """Simulates train arrivals at this station"""
        # Note: this variant sends train.status.value rather than .name
        value = {
            "station_id": int(self.station_id),
            "train_id": str(train.train_id),
            "direction": str(direction),
            "line": str(self.color.name),
            "train_status": str(train.status.value),
            "prev_station_id": str(prev_station_id),
            "prev_direction": str(prev_direction),
        }
        self.producer.produce(
            topic=self.topic_name,
            key={"timestamp": self.time_millis()},
            value=value,
        )

    def __str__(self):
        return "Station | {:^5} | {:<30} | Direction A: | {:^5} | departing to {:<30} | Direction B: | {:^5} | departing to {:<30} | ".format(
            self.station_id,
            self.name,
            self.a_train.train_id if self.a_train is not None else "---",
            self.dir_a.name if self.dir_a is not None else "---",
            self.b_train.train_id if self.b_train is not None else "---",
            self.dir_b.name if self.dir_b is not None else "---",
        )

    def __repr__(self):
        return str(self)

    def arrive_a(self, train, prev_station_id, prev_direction):
        """Denotes a train arrival at this station in the 'a' direction"""
        self.a_train = train
        self.run(train, "a", prev_station_id, prev_direction)

    def arrive_b(self, train, prev_station_id, prev_direction):
        """Denotes a train arrival at this station in the 'b' direction"""
        self.b_train = train
        self.run(train, "b", prev_station_id, prev_direction)

    def close(self):
        """Prepares the producer for exit by cleaning up the producer"""
        self.turnstile.close()
        super(Station, self).close()
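All three Station variants above load schemas/arrival_key.json and schemas/arrival_value.json, but none of the snippets show those files. Below is a minimal sketch of what the value schema would need to contain, inferred only from the fields each run() produces; the record name, namespace, and exact field types are assumptions, not taken from the project.

from confluent_kafka import avro

# Hypothetical arrival value schema matching the produced fields. Types are
# guesses: train_status is a string because the variants send
# train.status.name / str(train.status.value).
ARRIVAL_VALUE_SCHEMA = avro.loads("""
{
  "type": "record",
  "name": "arrival_value",
  "namespace": "org.chicago.cta",
  "fields": [
    {"name": "station_id", "type": "int"},
    {"name": "train_id", "type": "string"},
    {"name": "direction", "type": "string"},
    {"name": "line", "type": "string"},
    {"name": "train_status", "type": "string"},
    {"name": "prev_station_id", "type": ["null", "int"], "default": null},
    {"name": "prev_direction", "type": ["null", "string"], "default": null}
  ]
}
""")

The prev_* fields are nullable unions because the first arrival at a station has no previous station or direction.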
class Turnstile(Producer):
    """Defines a turnstile in a train station."""

    key_schema = avro.load(
        f"{Path(__file__).parents[0]}/schemas/turnstile_key.json")
    value_schema = avro.load(
        f"{Path(__file__).parents[0]}/schemas/turnstile_value.json")
    curve_df = pd.read_csv(
        f"{Path(__file__).parents[1]}/data/ridership_curve.csv")
    seed_df = pd.read_csv(
        f"{Path(__file__).parents[1]}/data/ridership_seed.csv")

    def __init__(self, station_id: int, station_name: str, color: str):
        super().__init__(
            config["TOPIC"]["TURNSTILE"],
            key_schema=self.key_schema,
            value_schema=self.value_schema,
            num_partitions=1,
            num_replicas=1,
        )
        self._station_name = station_name
        self._station_id = station_id
        self._color = color
        self._metrics_df = self.seed_df[self.seed_df["station_id"] == station_id]
        self._weekday_ridership = int(
            round(self._metrics_df.iloc[0]["avg_weekday_rides"]))
        self._saturday_ridership = int(
            round(self._metrics_df.iloc[0]["avg_saturday_rides"]))
        self._sunday_ridership = int(
            round(self._metrics_df.iloc[0]["avg_sunday-holiday_rides"]))
        self._steps_per_hour = (
            float(config["PARAM"]["TIMER_UPDATE_TIME_INTERVAL"])
            / float(config["PARAM"]["CTA_LINE_UPDATE_INTERVAL"]))

    def _get_entries(self):
        """Returns the number of turnstile entries."""
        dow = timer.weekday
        # The original condition `dow >= 0 or dow < 5` was always true, so
        # every day used weekday ridership. Assuming Python's Monday=0
        # weekday indexing, Mon-Fri are 0-4 and Saturday is 5 (the original
        # tested dow == 6 for Saturday).
        if 0 <= dow < 5:
            num_riders = self._weekday_ridership
        elif dow == 5:
            num_riders = self._saturday_ridership
        else:
            num_riders = self._sunday_ridership
        hour_curve = self.curve_df[self.curve_df["hour"] == timer.hour]
        hour_ratio = hour_curve.iloc[0]["ridership_ratio"]
        num_entries = num_riders * hour_ratio / self._steps_per_hour
        num_entries *= random.uniform(0.8, 1.2)
        return round(num_entries)

    async def _produce(self):
        self._producer.produce(
            topic=self._topic_name,
            key={"timestamp": self.time_millis()},
            key_schema=self._key_schema,
            value={
                "station_id": self._station_id,
                "station_name": self._station_name,
                "line": self._color,
            },
            value_schema=self._value_schema,
        )

    async def run(self):
        """Override."""
        n_entries = self._get_entries()
        ret = asyncio.create_task(asyncio.sleep(0))
        if n_entries > 0:
            ret = asyncio.gather(
                *[asyncio.create_task(self._produce()) for _ in range(n_entries)])
            logger.debug(f"{n_entries} entries in {self._station_name}")
        return ret
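Unlike the other variants, this Turnstile pulls its topic name and timing parameters from a config object rather than hardcoding them. Assuming config is a standard configparser.ConfigParser (the file name and the values below are illustrative, not from the project), the loading code and INI file it appears to expect might look like:

import configparser

# Hypothetical config.ini consumed by the class above:
#
#   [TOPIC]
#   TURNSTILE = org.chicago.cta.turnstile
#
#   [PARAM]
#   TIMER_UPDATE_TIME_INTERVAL = 60
#   CTA_LINE_UPDATE_INTERVAL = 5
#
config = configparser.ConfigParser()
config.read("config.ini")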
class Turnstile(Producer):
    key_schema = avro.load(
        f"{Path(__file__).parents[0]}/schemas/turnstile_key.json")
    value_schema = avro.load(
        f"{Path(__file__).parents[0]}/schemas/turnstile_value.json")

    def __init__(self, station):
        """Create the Turnstile"""
        # Normalized station name (not used by the fixed topic below)
        station_name = (station.name.lower()
                        .replace("/", "_and_")
                        .replace(" ", "_")
                        .replace("-", "_")
                        .replace("'", ""))
        super().__init__(
            topic_name="org.chicago.cta.station.turnstile.v1",
            key_schema=Turnstile.key_schema,
            value_schema=Turnstile.value_schema,
            num_partitions=4,
            num_replicas=1,
        )
        self.station = station
        self.turnstile_hardware = TurnstileHardware(station)

    def run(self, timestamp, time_step):
        """Simulates riders entering through the turnstile."""
        num_entries = self.turnstile_hardware.get_entries(timestamp, time_step)
        logger.debug(
            "%s riders have entered station %s at %s",
            num_entries,
            self.station.name,
            timestamp.isoformat(),
        )
        # Emit one turnstile message per calculated entry
        for _ in range(num_entries):
            try:
                self.producer.produce(
                    topic=self.topic_name,
                    key={"timestamp": self.time_millis()},
                    value={
                        "station_id": self.station.station_id,
                        "station_name": self.station.name,
                        "line": self.station.color.name,
                    },
                )
            except Exception as e:
                logger.fatal(e)
                raise e
class Turnstile(Producer):
    key_schema = avro.load(
        f"{Path(__file__).parents[0]}/schemas/turnstile_key.json")
    value_schema = avro.load(
        f"{Path(__file__).parents[0]}/schemas/turnstile_value.json")

    def __init__(self, station):
        """Create the Turnstile"""
        station_name = (
            station.name.lower()
            .replace("/", "_and_")
            .replace(" ", "_")
            .replace("-", "_")
            .replace("'", "")
        )
        super().__init__(
            "org.chicago.cta.turnstile",  # TODO: Come up with a better topic name
            key_schema=Turnstile.key_schema,
            value_schema=Turnstile.value_schema,
            num_partitions=1,
            num_replicas=1,
        )
        self.station = station
        self.turnstile_hardware = TurnstileHardware(station)

    def run(self, timestamp, time_step):
        """Simulates riders entering through the turnstile."""
        num_entries = self.turnstile_hardware.get_entries(timestamp, time_step)
        logger.info(f"Emitting {num_entries} messages to turnstile topic {self.topic_name}")
        for _ in range(num_entries):
            self.producer.produce(
                topic=self.topic_name,
                key={"timestamp": self.time_millis()},
                value={
                    "station_id": self.station.station_id,
                    "station_name": self.station.name,
                    "line": self.station.color.name,
                },
                value_schema=self.value_schema,
                key_schema=self.key_schema,
            )
        logger.info(f"Turnstile info emitted for topic {self.topic_name}")
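The turnstile variants likewise assume schemas/turnstile_key.json and schemas/turnstile_value.json without showing them. A plausible sketch, inferred from the key and the three value fields they produce; the names, namespace, and types are assumptions:

from confluent_kafka import avro

# Hypothetical turnstile schemas matching the produced key/value fields.
TURNSTILE_KEY_SCHEMA = avro.loads("""
{
  "type": "record",
  "name": "turnstile_key",
  "namespace": "org.chicago.cta",
  "fields": [{"name": "timestamp", "type": "long"}]
}
""")

TURNSTILE_VALUE_SCHEMA = avro.loads("""
{
  "type": "record",
  "name": "turnstile_value",
  "namespace": "org.chicago.cta",
  "fields": [
    {"name": "station_id", "type": "int"},
    {"name": "station_name", "type": "string"},
    {"name": "line", "type": "string"}
  ]
}
""")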
from confluent_kafka import avro
from confluent_kafka.avro import AvroProducer
from lipsum import generate_words
import os
import random

SCHEMA_REGISTRY_URL = 'http://172.17.0.5:8081'
BOOTSTRAP_SERVERS = '172.17.0.4'
AVSC_DIR = os.path.dirname(os.path.realpath(__file__))
KEY_SCHEMA = avro.load(os.path.join(AVSC_DIR, 'primitive_string.avsc'))
VALUE_SCHEMA = avro.load(os.path.join(AVSC_DIR, 'basic_schema.avsc'))
TOPIC = 'avrotopic'
KEY = "mykey"

avroProducer = AvroProducer(
    {'bootstrap.servers': BOOTSTRAP_SERVERS,
     'schema.registry.url': SCHEMA_REGISTRY_URL},
    default_key_schema=KEY_SCHEMA,
    default_value_schema=VALUE_SCHEMA)

for i in range(100):  # xrange is Python 2 only
    value = {"name": generate_words(count=1),
             "surname": generate_words(count=2),
             "number": random.randint(0, 100)}
    print(value)  # the original used the Python 2 print statement
    # The original snippet was cut off after value=; using the KEY constant
    # defined above is the assumed completion.
    avroProducer.produce(topic=TOPIC, value=value, key=KEY)

avroProducer.flush()  # ensure all queued records are delivered before exit
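The script above fires off 100 records without checking delivery: produce() only enqueues locally, so short of the flush() at the end there is no confirmation the broker accepted anything. A minimal delivery-report sketch against the same AvroProducer; the callback name is illustrative, but the callback parameter itself is standard, since AvroProducer.produce forwards extra keyword arguments to the underlying Producer.produce:

def delivery_report(err, msg):
    # Invoked once per message from poll()/flush() with the delivery outcome
    if err is not None:
        print("Delivery failed: {}".format(err))
    else:
        print("Delivered to {} [{}] @ offset {}".format(
            msg.topic(), msg.partition(), msg.offset()))

avroProducer.produce(topic=TOPIC, value=value, key=KEY, callback=delivery_report)
avroProducer.poll(0)  # serve queued delivery callbacks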
def verify_avro():
    from confluent_kafka import avro
    avsc_dir = os.path.join(os.path.dirname(__file__), os.pardir, 'tests', 'avro')

    # Producer config
    conf = {'bootstrap.servers': bootstrap_servers,
            'error_cb': error_cb,
            'api.version.request': api_version_request,
            'default.topic.config': {'produce.offset.report': True}}

    # Create producer
    if schema_registry_url:
        conf['schema.registry.url'] = schema_registry_url
        p = avro.AvroProducer(conf)
    else:
        p = avro.AvroProducer(conf, schema_registry=InMemorySchemaRegistry())

    prim_float = avro.load(os.path.join(avsc_dir, "primitive_float.avsc"))
    prim_string = avro.load(os.path.join(avsc_dir, "primitive_string.avsc"))
    basic = avro.load(os.path.join(avsc_dir, "basic_schema.avsc"))
    str_value = 'abc'
    float_value = 32.0

    combinations = [
        dict(key=float_value, key_schema=prim_float),
        dict(value=float_value, value_schema=prim_float),
        dict(key={'name': 'abc'}, key_schema=basic),
        dict(value={'name': 'abc'}, value_schema=basic),
        dict(value={'name': 'abc'}, value_schema=basic, key=float_value, key_schema=prim_float),
        dict(value={'name': 'abc'}, value_schema=basic, key=str_value, key_schema=prim_string),
        dict(value=float_value, value_schema=prim_float, key={'name': 'abc'}, key_schema=basic),
        dict(value=float_value, value_schema=prim_float, key=str_value, key_schema=prim_string),
        dict(value=str_value, value_schema=prim_string, key={'name': 'abc'}, key_schema=basic),
        dict(value=str_value, value_schema=prim_string, key=float_value, key_schema=prim_float),
    ]

    # Consumer config
    cons_conf = {'bootstrap.servers': bootstrap_servers,
                 'group.id': 'test.py',
                 'session.timeout.ms': 6000,
                 'enable.auto.commit': False,
                 'api.version.request': api_version_request,
                 'on_commit': print_commit_result,
                 'error_cb': error_cb,
                 'default.topic.config': {'auto.offset.reset': 'earliest'}}

    for i, combo in enumerate(combinations):
        combo['topic'] = str(uuid.uuid4())
        p.produce(**combo)
        p.poll(0)
        p.flush()

        # Create consumer
        conf = copy(cons_conf)
        if schema_registry_url:
            conf['schema.registry.url'] = schema_registry_url
            c = avro.AvroConsumer(conf)
        else:
            c = avro.AvroConsumer(conf, schema_registry=InMemorySchemaRegistry())
        c.subscribe([combo['topic']])

        while True:
            msg = c.poll(0)
            if msg is None:
                continue
            if msg.error():
                if msg.error().code() == confluent_kafka.KafkaError._PARTITION_EOF:
                    break
                else:
                    continue

            tstype, timestamp = msg.timestamp()
            print('%s[%d]@%d: key=%s, value=%s, tstype=%d, timestamp=%s' %
                  (msg.topic(), msg.partition(), msg.offset(),
                   msg.key(), msg.value(), tstype, timestamp))
            # `async` is a reserved word in Python 3.7+; confluent-kafka
            # renamed the commit parameter to `asynchronous`
            c.commit(msg, asynchronous=False)

        # Close consumer
        c.close()
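verify_avro references error_cb and print_commit_result without defining them; in the integration-test module they would be small top-level helpers. A minimal sketch of what they might look like, following the confluent-kafka error_cb and on_commit callback contracts (the bodies are illustrative):

def error_cb(err):
    # Global error callback: invoked for client-level errors such as
    # all-brokers-down, not for per-message delivery failures
    print('Client error: {}'.format(err))

def print_commit_result(err, partitions):
    # on_commit callback: reports the outcome of each offset commit
    if err is not None:
        print('Commit failed: {}'.format(err))
    else:
        print('Committed: {}'.format(partitions))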