def consumer():
    # Earlier attempt with kafka.SimpleConsumer (deprecated):
    # consumer = SimpleConsumer(client, group=None,
    #                           topic=topic, partitions=[0, ],
    #                           auto_commit=False)
    # node_id = list(consumer.client.conns.keys())[0]
    # print(dir(consumer.client.conns[node_id]))
    # for i in consumer.get_messages(100):
    #     print(i.offset)
    # consumer.commit()

    # Earlier attempt with pykafka:
    # from pykafka import KafkaClient
    # client = KafkaClient(hosts="127.0.0.1:9092")
    # print(client.topics)
    # topic1 = client.topics[topic]
    # consumer = topic1.get_simple_consumer(auto_commit_enable=True)
    # for message in consumer:
    #     if message is not None:
    #         print(message.offset, message.value)

    connect_str = '127.0.0.1:9092'
    consumer = KafkaConsumer(topic,
                             group_id='my-group',
                             bootstrap_servers=[connect_str],
                             auto_offset_reset='largest',  # 'largest' or 'smallest'
                             auto_commit_enable=True,
                             auto_commit_interval_messages=1000)
    # Optionally specify offsets to start from:
    consumer.set_topic_partitions((topic, 2, 50032),)
    # kafka.set_topic_partitions("topic1", ("topic2", 2), {"topic3": 0})
    # A partition can only be consumed by one consumer in a group, so it is
    # best to specify explicitly which partitions to consume.
    # kafka.set_topic_partitions({("topic1", 0): 12, ("topic2", 1): 45})
    # print(consumer.topics)
    for message in consumer:
        print("%s:%d:%d: key=%s value=%s" %
              (message.topic, message.partition, message.offset,
               message.key, message.value))
        consumer.commit()
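# Note: the snippet above relies on the pre-1.0 kafka-python API
# (auto_offset_reset='largest', auto_commit_enable, set_topic_partitions).
# Below is a minimal sketch of the same flow against kafka-python >= 1.0,
# where partitions and offsets are set via assign() and seek(). The topic
# name and offset values are illustrative assumptions.
from kafka import KafkaConsumer, TopicPartition

def consumer_modern(topic='my-topic'):
    consumer = KafkaConsumer(
        bootstrap_servers=['127.0.0.1:9092'],
        group_id='my-group',
        auto_offset_reset='latest',      # replaces 'largest'
        enable_auto_commit=False,        # commit manually below
    )
    tp = TopicPartition(topic, 2)
    consumer.assign([tp])                # replaces set_topic_partitions(...)
    consumer.seek(tp, 50032)             # start from an explicit offset
    for message in consumer:
        print("%s:%d:%d: key=%s value=%s" %
              (message.topic, message.partition, message.offset,
               message.key, message.value))
        consumer.commit()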
def consume(args):
    schema = args.schema
    table = args.table
    assert schema in settings.SCHEMAS, 'schema must be in settings.SCHEMAS'
    assert table in settings.TABLES, 'table must be in settings.TABLES'
    group_id = f'{schema}.{table}'
    consumer = KafkaConsumer(
        bootstrap_servers=settings.KAFKA_SERVER,
        value_deserializer=lambda x: json.loads(x, object_hook=object_hook),
        key_deserializer=lambda x: x.decode() if x else None,
        enable_auto_commit=False,
        group_id=group_id,
        auto_offset_reset='earliest',
    )
    topic = settings.KAFKA_TOPIC
    partition = settings.PARTITIONS.get(group_id)
    consumer.assign([TopicPartition(topic, partition)])
    event_list = []
    logger.info(
        f'started consuming topic:{topic},partition:{partition},schema:{schema},table:{table}'
    )
    pk = reader.get_primary_key(schema, table)
    for msg in consumer:  # type:ConsumerRecord
        logger.debug(f'kafka msg:{msg}')
        event = msg.value
        event_list.append(event)
        len_event = len(event_list)
        if len_event == settings.INSERT_NUMS or (
                (int(time.time() * 10 ** 6) - event_list[0]['event_unixtime']) / 10 ** 6
                >= settings.INSERT_INTERVAL > 0):
            data_dict = {}
            tmp_data = []
            for items in event_list:
                action = items['action']
                action_core = items['action_core']
                data_dict.setdefault(table + schema + action + action_core, []).append(items)
            for k, v in data_dict.items():
                tmp_data.append(v)
            result = writer.insert_event(tmp_data, settings.SKIP_TYPE,
                                         settings.SKIP_DELETE_TB_NAME,
                                         schema, table, pk)
            if result:
                event_list = []
                consumer.commit()
                logger.info(f'committed {len_event} events!')
            else:
                logger.error('insert event error!')
                exit()
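# The consume() function above assumes a settings module providing at least
# the names it references. A hypothetical sketch of such a module, with
# placeholder values, purely to make the expected shape explicit:
# settings.py (hypothetical)
SCHEMAS = {'test'}                      # schemas allowed for --schema
TABLES = {'test_table'}                 # tables allowed for --table
KAFKA_SERVER = '127.0.0.1:9092'         # bootstrap server(s)
KAFKA_TOPIC = 'binlog-events'           # topic the producer writes to
PARTITIONS = {'test.test_table': 0}     # '{schema}.{table}' -> partition id
INSERT_NUMS = 20000                     # flush when this many events are buffered
INSERT_INTERVAL = 60                    # or when the oldest buffered event is this old (seconds)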
def commitTopic(topic, group, partition, commit_offset):
    try:
        print(
            '===================================================================================='
        )
        print('[commitTopic] : topic=' + topic + ', group=' + group +
              ', partition=' + str(partition) + ', commit_offset=' +
              str(commit_offset))
        consumer2 = KafkaConsumer(bootstrap_servers=tmpbootstrap_servers,
                                  enable_auto_commit=False,
                                  group_id=group)
        tp = TopicPartition(topic, partition)
        if int(commit_offset) > 0:
            consumer2.commit({tp: OffsetAndMetadata(commit_offset, None)})
    except Exception as ee:
        print('error when committing topic')
        print(str(ee))
    finally:
        print('commitTopic end')
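# A usage sketch: commit an explicit offset for partition 0 of a topic and
# read it back with committed(). Topic and group names are illustrative, and
# tmpbootstrap_servers is assumed to be defined at module level, as in the
# snippet above.
from kafka import KafkaConsumer, TopicPartition

def verify_commit():
    commitTopic('my-topic', 'my-group', 0, 12345)
    checker = KafkaConsumer(bootstrap_servers=tmpbootstrap_servers,
                            enable_auto_commit=False,
                            group_id='my-group')
    tp = TopicPartition('my-topic', 0)
    print('committed offset:', checker.committed(tp))  # expected: 12345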
def consume(args):
    schema = args.schema
    tables = args.tables
    skip_error = args.skip_error
    assert schema in settings.SCHEMAS, f'schema {schema} must be in settings.SCHEMAS'
    topic = settings.KAFKA_TOPIC
    tables_pk = {}
    partitions = []
    for table in tables.split(','):
        assert table in settings.TABLES, f'table {table} must be in settings.TABLES'
        partition = settings.PARTITIONS.get(f'{schema}.{table}')
        tp = TopicPartition(topic, partition)
        partitions.append(tp)
        tables_pk[table] = reader.get_primary_key(schema, table)

    group_id = f'{schema}.{tables}'
    consumer = KafkaConsumer(
        bootstrap_servers=settings.KAFKA_SERVER,
        value_deserializer=lambda x: json.loads(x, object_hook=object_hook),
        key_deserializer=lambda x: x.decode() if x else None,
        enable_auto_commit=False,
        group_id=group_id,
        auto_offset_reset='earliest',
    )
    consumer.assign(partitions)

    event_list = {}
    is_insert = False
    last_time = 0
    len_event = 0
    logger.info(
        f'started consuming topic:{topic},partitions:{partitions},schema:{schema},tables:{tables}'
    )
    for msg in consumer:  # type:ConsumerRecord
        logger.debug(f'kafka msg:{msg}')
        event = msg.value
        event_unixtime = event['event_unixtime'] / 10 ** 6
        table = event['table']
        schema = event['schema']
        event_list.setdefault(table, []).append(event)
        len_event += 1
        if last_time == 0:
            last_time = event_unixtime
        if len_event == settings.INSERT_NUMS:
            is_insert = True
        else:
            if event_unixtime - last_time >= settings.INSERT_INTERVAL > 0:
                is_insert = True
        if is_insert:
            data_dict = {}
            events_num = 0
            for table, items in event_list.items():
                for item in items:
                    action = item['action']
                    action_core = item['action_core']
                    data_dict.setdefault(table, {}).setdefault(
                        table + schema + action + action_core, []).append(item)
            for table, v in data_dict.items():
                tmp_data = []
                for k1, v1 in v.items():
                    events_num += len(v1)
                    tmp_data.append(v1)
                try:
                    result = writer.insert_event(tmp_data, schema, table,
                                                 tables_pk.get(table))
                    if not result:
                        logger.error('insert event error!')
                        if not skip_error:
                            exit()
                except Exception as e:
                    logger.error(f'insert event error! error:{e}')
                    if not skip_error:
                        exit()
            consumer.commit()
            logger.info(f'committed {events_num} events!')
            event_list = {}
            is_insert = False
            len_event = last_time = 0
def consume(args):
    schema = args.schema
    skip_error = args.skip_error
    auto_offset_reset = args.auto_offset_reset
    topic = settings.KAFKA_TOPIC
    tables_pk = {}
    tables = settings.SCHEMAS.get(schema)
    partitions = []
    for table in tables:
        partition = settings.PARTITIONS.get(schema)
        tp = TopicPartition(topic, partition)
        partitions.append(tp)
        tables_pk[table] = reader.get_primary_key(schema, table)

    consumer = KafkaConsumer(
        bootstrap_servers=settings.KAFKA_SERVER,
        value_deserializer=lambda x: json.loads(x, object_hook=object_hook),
        key_deserializer=lambda x: x.decode() if x else None,
        enable_auto_commit=False,
        group_id=schema,
        auto_offset_reset=auto_offset_reset,
    )
    consumer.assign(partitions)

    event_list = {}
    is_insert = False
    last_time = 0
    len_event = 0
    logger.info(
        f'started consuming topic:{topic},partitions:{partitions},schema:{schema},tables:{tables}'
    )
    for msg in consumer:  # type:ConsumerRecord
        logger.debug(f'kafka msg:{msg}')
        event = msg.value
        event_unixtime = event['event_unixtime'] / 10 ** 6
        table = event['table']
        schema = event['schema']
        action = event['action']
        if action == 'query':
            do_query = True
            query = event['values']['query']
        else:
            do_query = False
            query = None
        event_list.setdefault(table, []).append(event)
        len_event += 1
        if last_time == 0:
            last_time = event_unixtime
        if len_event == settings.INSERT_NUMS:
            is_insert = True
        else:
            if event_unixtime - last_time >= settings.INSERT_INTERVAL > 0:
                is_insert = True
        if is_insert or do_query:
            data_dict = {}
            events_num = 0
            for table, items in event_list.items():
                for item in items:
                    action = item['action']
                    action_core = item['action_core']
                    data_dict.setdefault(table, {}).setdefault(
                        table + schema + action + action_core, []).append(item)
            for table, v in data_dict.items():
                tmp_data = []
                for k1, v1 in v.items():
                    events_num += len(v1)
                    tmp_data.append(v1)
                try:
                    result = writer.insert_event(tmp_data, schema, table,
                                                 tables_pk.get(table))
                    if not result:
                        logger.error('insert event error!')
                        if not skip_error:
                            exit()
                    if settings.UI_ENABLE:
                        insert_into_redis('consumer', schema, table, len(v1))
                except Exception as e:
                    logger.error(f'insert event error! error:{e}')
                    if not skip_error:
                        exit()
            if do_query:
                try:
                    logger.info(f'execute query:{query}')
                    writer.execute(query)
                except Exception as e:
                    logger.error(f'execute query error! error:{e}')
                    if not skip_error:
                        exit()
            consumer.commit()
            logger.info(f'committed {events_num} events!')
            event_list = {}
            is_insert = False
            len_event = last_time = 0
class IBUSStreamingDownsamplingConsumer:
    LOG_FORMAT = "{} UTC_TS\t" \
                 "{}"

    def __init__(self, kafkaHost, kafkaPort, tcpHost, tcpPort, group_id,
                 topic, logTopic, interval):
        self.kafkaHost = kafkaHost
        self.kafkaPort = kafkaPort
        self.tcpHost = tcpHost
        self.tcpPort = tcpPort
        self.group_id = group_id
        self.topic = topic
        self.logTopic = logTopic
        self.interval = int(interval)
        self.consumer = KafkaConsumer(
            topic,
            bootstrap_servers=["{}:{}".format(kafkaHost, kafkaPort)],
            group_id=group_id,
            enable_auto_commit=False)
        self.producer = KafkaProducer(
            bootstrap_servers=["{}:{}".format(kafkaHost, kafkaPort)])
        self.tcpWriter = None

    def getTopicPartitions(self):
        # Ensures the local cache is updated with information about
        # partitions, offsets etc.
        self.consumer.topics()
        pids = self.consumer.partitions_for_topic(self.topic)
        tps = [TopicPartition(self.topic, pid) for pid in pids]
        return tps

    def getTopicPartitionsCommittedPositions(self):
        tps = self.getTopicPartitions()
        ret = [(tp, self.consumer.committed(tp)) for tp in tps]
        return ret

    async def tcp_server_handler(self, reader, writer):
        addr = str(writer.get_extra_info("socket").getpeername())
        if self.tcpWriter is not None:
            self.log("refused " + addr)
            writer.write(b"Connection limit reached; connection refused.")
            writer.close()
            return
        self.log("accepted " + addr)
        self.tcpWriter = writer

        t1 = asyncio.create_task(self.poll_from_Kafka(writer))
        try:
            while True:
                data = await reader.read(1)  # 1024*16 bytes
                if not data:
                    break
        except BrokenPipeError:
            """
            Catches "connection reset by peer" raised while we are sending the
            batched data, which is also when we cannot check the reader. The
            broken connection on the writer side will ultimately lead to a
            BrokenPipeError on the reader side, hence it is caught here.
            """
            pass
        finally:
            t1.cancel()
            self.log("closed " + addr)
            writer.close()
            self.tcpWriter = None

    async def poll_from_Kafka(self, writer):
        while True:
            prevPos = self.getTopicPartitionsCommittedPositions()
            polled = self.consumer.poll(timeout_ms=1000)
            records = [
                record.value for recordList in polled.values()
                for record in recordList
            ]
            try:
                for record in records:
                    writer.write(record)
                    await writer.drain()
            except ConnectionResetError:
                """
                This error is not thrown reliably. If a connection is broken
                and one tries to
                    writer.write(record)
                    await writer.drain()
                it may not manifest. It is thrown more often when one tries to
                repeatedly write to and drain a broken connection.
                """
                print("Last batch not fully sent, not committed.")
                for tp, pos in prevPos:
                    self.consumer.seek(tp, pos)
                break
            else:
                self.consumer.commit()
            await asyncio.sleep(self.interval)

    def log(self, msg):
        self.producer.send(
            self.logTopic,
            self.LOG_FORMAT.format(
                datetime.now().timestamp(),
                msg
            ).encode()
        )

    def cleanup(self):
        self.log("shutdown")
        self.consumer.close()
        self.producer.flush()
        self.producer.close()

    def run(self):
        self.log("running")
        asyncio.run(self._async_run())

    async def _async_run(self):
        tcpServer = await asyncio.start_server(self.tcp_server_handler,
                                               self.tcpHost, self.tcpPort)
        await tcpServer.serve_forever()
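# A minimal client sketch for the TCP side of the consumer above: connect to
# the server started by run() and print whatever downsampled records it
# forwards. Host and port values are illustrative assumptions.
import asyncio

async def tail_downsampled(host='127.0.0.1', port=9000):
    reader, writer = await asyncio.open_connection(host, port)
    try:
        while True:
            chunk = await reader.read(4096)   # raw record bytes from poll_from_Kafka
            if not chunk:
                break                         # server closed the connection
            print(chunk.decode(errors='replace'))
    finally:
        writer.close()
        await writer.wait_closed()

# asyncio.run(tail_downsampled())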
def consume(args):
    settings = Global.settings
    writer = Global.writer
    reader = Global.reader
    schema = args.schema
    skip_error = args.skip_error
    auto_offset_reset = args.auto_offset_reset
    offset = args.offset
    topic = settings.kafka_topic
    tables_pk = {}
    schema_table = settings.schema_table.get(schema)
    tables = schema_table.get("tables")
    for table in tables:
        tables_pk[table] = reader.get_primary_key(schema, table)

    consumer = KafkaConsumer(
        bootstrap_servers=settings.kafka_server,
        value_deserializer=lambda x: json.loads(x, object_hook=object_hook),
        key_deserializer=lambda x: x.decode() if x else None,
        enable_auto_commit=False,
        group_id=schema,
        auto_offset_reset=auto_offset_reset,
    )
    partition = schema_table.get("kafka_partition")
    topic_partition = TopicPartition(topic, partition)
    consumer.assign([topic_partition])
    if offset:
        consumer.seek(topic_partition, offset)

    event_list = {}
    is_insert = False
    last_time = 0
    len_event = 0
    logger.info(
        f"started consuming topic:{topic},partition:{partition},schema:{schema},tables:{tables}"
    )
    for msg in consumer:  # type:ConsumerRecord
        logger.debug(f"kafka msg:{msg}")
        event = msg.value
        event_unixtime = event["event_unixtime"] / 10 ** 6
        table = event["table"]
        schema = event["schema"]
        action = event["action"]
        if action == "query":
            alter_table = True
            query = event["values"]["query"]
        else:
            alter_table = False
            query = None
        event_list.setdefault(table, []).append(event)
        len_event += 1
        if last_time == 0:
            last_time = event_unixtime
        if len_event == settings.insert_num:
            is_insert = True
        else:
            if event_unixtime - last_time >= settings.insert_interval > 0:
                is_insert = True
        if is_insert or alter_table:
            data_dict = {}
            events_num = 0
            for table, items in event_list.items():
                for item in items:
                    action = item["action"]
                    action_core = item["action_core"]
                    data_dict.setdefault(table, {}).setdefault(
                        table + schema + action + action_core, []).append(item)
            for table, v in data_dict.items():
                tmp_data = []
                for k1, v1 in v.items():
                    events_num += len(v1)
                    tmp_data.append(v1)
                try:
                    result = writer.insert_event(tmp_data, schema, table,
                                                 tables_pk.get(table))
                    if not result:
                        logger.error("insert event error!")
                        if not skip_error:
                            exit()
                except Exception as e:
                    logger.error(f"insert event error! error:{e}")
                    if not skip_error:
                        exit()
            if alter_table:
                try:
                    logger.info(f"execute query:{query}")
                    writer.execute(query)
                except Exception as e:
                    logger.error(f"execute query error! error:{e}")
                    if not skip_error:
                        exit()
            consumer.commit()
            logger.info(f"committed {events_num} events!")
            event_list = {}
            is_insert = False
            len_event = last_time = 0
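# A small sketch of how the explicit --offset replay above interacts with the
# committed position: with assign() plus a group_id, iteration resumes from
# the last committed offset unless seek() overrides it. The helper name below
# is hypothetical; it only inspects a consumer set up like the one above.
def show_positions(consumer, topic_partition, offset=None):
    committed = consumer.committed(topic_partition)   # last committed offset, or None
    if offset is not None:
        consumer.seek(topic_partition, offset)        # explicit replay point wins
    print('committed:', committed,
          'next position:', consumer.position(topic_partition))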