def retrieve_partition_offset():
    """Print the current position of partition 0 of ``kafka-topic``.

    A consumer belonging to the ``kafka-group-id`` group is created against
    the ``servers`` name taken from the enclosing scope, the single
    partition is assigned manually, and the offset reported by
    ``position()`` is printed.
    """
    kafka_consumer = KafkaConsumer(
        group_id='kafka-group-id',
        bootstrap_servers=servers,
    )
    topic_partition = TopicPartition('kafka-topic', 0)
    kafka_consumer.assign([topic_partition])
    current_offset = kafka_consumer.position(topic_partition)
    print("starting offset is ", current_offset)
def read_messages():
    """Classify every Kafka topic by the age of its newest message.

    For each topic known to the brokers the end offsets of all partitions
    are fetched, and the single message just before the highest end offset
    is read.  Depending on that message the topic is counted in ``COUNT``
    as:

    * ``empty``     - every partition has an end offset of 0 (topic name is
      written to ``EMPTY``),
    * ``timestamp`` - the message carries a timestamp of -1 (written to
      ``ERROR``),
    * ``old``       - the message is 365 days old or more (written to
      ``OUTPUT`` with its local-time timestamp and age in days),
    * ``current``   - anything newer.

    A summary is printed at the end, the three report files are closed and
    any report that received no entries is deleted.

    Relies on names supplied by the enclosing module: ``ARG``, ``COUNT``,
    ``EMPTY``/``ERROR``/``OUTPUT`` and their ``*_FILE`` paths.
    """
    if ARG.SERVER:
        server_list = [ARG.SERVER + ':9092']
    else:
        server_list = [
            'kafka.int.janelia.org:9092',
            'kafka2.int.janelia.org:9092',
            'kafka3.int.janelia.org:9092',
        ]
    if not ARG.GROUP:
        ARG.GROUP = None
    # consumer_timeout_ms bounds the wait in the message loop below so a
    # topic whose last offset cannot be fetched does not block forever.
    consumer = KafkaConsumer(bootstrap_servers=server_list,
                             auto_offset_reset=ARG.OFFSET,
                             consumer_timeout_ms=int(5000))
    for topic in tqdm(sorted(consumer.topics())):
        COUNT['topics'] += 1
        parts = consumer.partitions_for_topic(topic)
        if not parts:
            continue
        end_offsets = consumer.end_offsets(
            [TopicPartition(topic, p) for p in parts])
        # Pick the partition holding the highest end offset.  The previous
        # code computed this partition but then always read partition 0,
        # seeking it to an offset that belongs to a different partition.
        newest_tp, maxoff = max(end_offsets.items(),
                                key=lambda item: item[1],
                                default=(None, 0))
        if not maxoff:
            EMPTY.write("%s\n" % (topic))
            COUNT['empty'] += 1
            continue
        consumer.assign([newest_tp])
        consumer.seek(newest_tp, maxoff - 1)
        # Only the first message delivered is inspected.
        for msg in consumer:
            if msg.timestamp == -1:
                ERROR.write("%s: %s\n" % (topic, msg))
                COUNT['timestamp'] += 1
                break
            # Kafka timestamps are in milliseconds.
            today = datetime.today()
            delta = (today - datetime.fromtimestamp(msg.timestamp / 1000)).days
            timestr = strftime("%Y-%m-%d %H:%M:%S %Z",
                               localtime(msg.timestamp / 1000))
            if delta >= 365:
                OUTPUT.write("%s\t%s\t%s\n" % (topic, timestr, delta))
                COUNT['old'] += 1
            else:
                COUNT['current'] += 1
            break
    print("Topics: %d" % (COUNT['topics']))
    print("Topics >= 1 year old: %d" % (COUNT['old']))
    print("Topics < 1 year old: %d" % (COUNT['current']))
    print("Empty topics: %d" % (COUNT['empty']))
    print("Topics missing timestamp: %d" % (COUNT['timestamp']))
    # Close each report and drop the ones that stayed empty.
    EMPTY.close()
    if not COUNT['empty']:
        remove(EMPTY_FILE)
    ERROR.close()
    if not COUNT['timestamp']:
        remove(ERROR_FILE)
    OUTPUT.close()
    if not COUNT['old']:
        remove(OUTPUT_FILE)
# @Software: PyCharm """ import time import json from kafka import KafkaConsumer, TopicPartition from hdfs import InsecureClient consumer = KafkaConsumer('kzmg_all_payment', bootstrap_servers=['172.23.11.150:9092']) # print (consumer.partitions_for_topic("kzmg_all_payment")) # 获取phone-game-userinfo主题的分区信息 print(consumer.topics()) # 获取主题列表 # print (consumer.subscription()) # 获取当前消费者订阅的主题 # print (consumer.assignment()) # 获取当前消费者topic、分区信息 # print (consumer.beginning_offsets(consumer.assignment())) # 获取当前消费者可消费的偏移量 # print(consumer.end_offsets(consumer.assignment())) consumer.seek(TopicPartition(topic=u'kzmg_all_payment', partition=0), 125000) num = consumer.end_offsets(consumer.assignment()).values()[0] print(num) i = 0 # t= '2018-05-22' # timeArray =time.strptime(t,'%Y-%m-%d') # timeStamp=int(time.mktime(timeArray)) # print(consumer.offsets_for_times({TopicPartition(topic='kzmg_all_payment', partition=0):timeStamp})) client = InsecureClient('http://lg-11-152.ko.cn:50070', user='******') print(dir(client)) filePath = '/user/kzcq/datatest/kzmg_payment.json' tag_list = [] # for message in consumer: # print ("%s:%d:%d: key=%s value=%s" % (message.topic, message.partition, # message.offset, message.key,
def hgssh():
    """Run the hgssh half of the vcsreplicator bootstrap procedure.

    Command-line arguments: the config file path, the ``hg`` executable to
    use, and an optional ``--workers`` count (defaults to the CPU count).

    The Kafka end offsets of every partition of the replication topic are
    recorded, ``hg replicatesync --bootstrap`` is run for every repository
    found under ``REPOS_DIR``, and the end offsets are recorded again.  A
    JSON document with the per-partition ``(start, end)`` offsets and the
    sorted wire paths of the repositories is printed to stdout.

    Raises:
        Exception: if any ``replicatesync`` invocation fails.
    """
    import argparse

    cli = argparse.ArgumentParser()
    cli.add_argument('config', help='Path to config file')
    cli.add_argument('hg', help='Path to hg executable for use in bootstrap process')
    cli.add_argument('--workers', help='Number of concurrent workers to use for publishing messages', type=int,
                     default=multiprocessing.cpu_count())
    args = cli.parse_args()

    config = Config(filename=args.config)
    topic = config.c.get('replicationproducer', 'topic')

    # This consumer is only used to look up partition offsets.
    consumer = KafkaConsumer(
        # set this so offsets are committed to Zookeeper
        api_version=(0, 8, 1),
        bootstrap_servers=config.c.get('replicationproducer', 'hosts'),
        # We don't actually commit but this is just for good measure
        enable_auto_commit=False,
    )

    partition_numbers = sorted(consumer.partitions_for_topic(topic))
    topicpartitions = [TopicPartition(topic, number) for number in partition_numbers]

    # Offsets before any bootstrap message has been produced.
    offsets_start = consumer.end_offsets(topicpartitions)
    logger.info('gathered initial Kafka offsets')

    # future -> repository it is syncing
    replicatesync_futures = {}

    with futures.ThreadPoolExecutor(args.workers) as executor:
        # Queue one `replicatesync` subprocess per repository on hg.mo.
        for repo in find_hg_repos(REPOS_DIR):
            command = [args.hg, '-R', repo, 'replicatesync', '--bootstrap']
            pending = executor.submit(subprocess.check_output, command)
            replicatesync_futures[pending] = repo

            logger.info('calling `replicatesync --bootstrap` on %s' % repo)

        # Wait for every subprocess; the first failure aborts the bootstrap.
        for finished in futures.as_completed(replicatesync_futures):
            repo = replicatesync_futures[finished]

            exc = finished.exception()
            if exc:
                logger.error('error occurred calling `replicatesync --bootstrap` on %s: %s' % (repo, exc))
                raise Exception('error triggering replication of Mercurial repo %s: %s' % (repo, exc))
            logger.info('called `replicatesync --bootstrap` on %s successfully' % repo)

    # Offsets after all bootstrap messages have been produced.
    offsets_end = consumer.end_offsets(topicpartitions)
    logger.info('gathered final Kafka offsets')

    # partition number -> (start, end) offsets of the bootstrap range
    offsets_combined = {}
    for tp in topicpartitions:
        offsets_combined[int(tp.partition)] = (offsets_start[tp], offsets_end[tp])

    # Repository paths are emitted in their wire representation so the
    # document can be consumed by ansible.
    wire_paths = sorted(
        config.get_replication_path_rewrite(repo)
        for repo in replicatesync_futures.values()
    )
    output = {
        'offsets': offsets_combined,
        'repositories': wire_paths,
    }

    print(json.dumps(output))
    logger.info('hgssh bootstrap process complete!')
from kafka import KafkaConsumer, TopicPartition
from config import *


def string_deserializer(x):
    """Decode a raw message payload as UTF-8 text."""
    return x.decode('utf-8')


#################### Using seek() ###################
# The consumer is created without a topic; partitions are attached
# explicitly through assign() so that seek() can be used on them.
consumer_settings = {
    'bootstrap_servers': [BOOTSTRAP_SERVERS],
    'group_id': 'some_consumer_group',
    'auto_offset_reset': 'latest',
    'value_deserializer': string_deserializer,
    'consumer_timeout_ms': 100000,
}
consumer = KafkaConsumer(**consumer_settings)

partition0, partition1, partition2 = (
    TopicPartition('string-topic', number) for number in range(3)
)
consumer.assign([partition0, partition1, partition2])

# Example: with every partition fully consumed and partition0 positioned at
# offset 54, rewinding it to 52 makes the loop below print two messages.
consumer.seek(partition0, 52)

for record in consumer:
    fields = (record.topic, record.partition, record.offset, record.key, record.value)
    print("Consumed[%s-%d] %d: key=%s value=%s" % fields)
def property_loop_start():
    """Consume ``saveProps`` partition 2 forever and record tag sightings.

    Each message value is parsed as JSON.  The hex-encoded tag stream found
    under ``services[0].properties.tags`` is decoded, a ``TagTrack`` row is
    created for every reported tag (the ``Tag`` itself is created on first
    sight and marked online), and every active, previously-online tag of
    the same active device that was *not* reported is marked offline and
    has its ``lost_signal`` callbacks run.

    Messages for unknown devices are skipped; messages raising
    ``TypeError``, ``KeyError`` or ``json.JSONDecodeError`` are reported as
    malformed and skipped.
    """
    from device.models import Device
    from tag.models import Tag, TagTrack
    from tag.services import run_callbacks

    brokers = ('124.70.129.107:9094', '124.70.193.90:9094', '124.70.217.193:9094 ')
    consumer = KafkaConsumer(bootstrap_servers=brokers)
    props_partition = TopicPartition(topic='saveProps', partition=2)
    consumer.assign([props_partition])

    for msg in consumer:
        try:
            print('save prop req')
            data = json.loads(msg.value)
            raw_tags = data['services'][0]['properties']['tags']
            print(raw_tags)

            # A missing tag stream means "no tags seen"; a stream that
            # fails to parse (None) causes the message to be skipped.
            tags = [] if raw_tags is None else parse_tags_byte_stream(bytes.fromhex(raw_tags))
            if tags is None:
                continue

            try:
                device = Device.objects.get(device_id=data['device_id'])
            except Device.DoesNotExist:
                print('Unknown device with ID: {}'.format(data['device_id']))
                continue

            event_time = parse(data['services'][0]['event_time'])

            detected_tags = []
            for tid, r1_id, r1_dist, r2_id, r2_dist, r3_id, r3_dist in tags:
                reader1 = get_reader(r1_id, device)
                reader2 = get_reader(r2_id, device)
                reader3 = get_reader(r3_id, device)
                seen_tag, _ = Tag.objects.get_or_create(
                    tid=tid,
                    device=device,
                    defaults={'name': 'TAG_' + str(tid)},
                )
                TagTrack.objects.create(
                    tag=seen_tag,
                    reader1=reader1, distance1=r1_dist,
                    reader2=reader2, distance2=r2_dist,
                    reader3=reader3, distance3=r3_dist,
                    created=event_time,
                )
                seen_tag.is_online = True
                seen_tag.save()
                detected_tags.append(seen_tag.id)

            # Presence check.
            # Rule: only active tags on an active device that were online
            # before this message are checked for presence.
            online_candidates = Tag.objects.filter(
                device=device,
                device__is_active=True,
                is_active=True,
                is_online=True,
            )
            invalid_tags = online_candidates.exclude(id__in=detected_tags)
            # Materialise the rows first: after update() the queryset no
            # longer matches them.
            lost_tags = list(invalid_tags.all())
            invalid_tags.update(is_online=False)
            for lost_tag in lost_tags:
                print('Callback run: Tag {}'.format(lost_tag))
                run_callbacks(lost_tag, 'lost_signal')

        except (TypeError, KeyError, json.JSONDecodeError):
            print('Malformed data: {}'.format(msg))
self._create_kafka_consumer() def reset_consumer_server_topic_and_partition(self, server, topic, partition): self.bootstrap_servers = server self.topic = topic self.partition = partition self._create_kafka_consumer() if __name__ == "__main__": config = { 'group_id': 'g_2', 'client_id': 12, 'topic': 'topic_yang', 'partition': 0, 'bootstrap_servers': '172.31.32.39:9092' } con = ConsumerOperate(config) con.set_consumer_timeout(20000) print( con.consumer.committed(TopicPartition(topic='test33333', partition=0))) for i in con.consumer: print("fetching") print(i.topic, i.offset) con.commit_offset() print( con.consumer.committed( TopicPartition(topic='topic_yang', partition=0)))
def test_seek(consumer):
    """Assign partition 2 of topic ``test`` to *consumer* and seek to offset 3."""
    target = TopicPartition(topic='test', partition=2)
    consumer.assign([target])
    consumer.seek(target, 3)
def create_topic_partition(topic: str):
    """Return the ``TopicPartition`` for partition 0 of *topic*."""
    first_partition = 0
    return TopicPartition(topic, first_partition)
def test_commit(consumer):
    """Commit offset 2 (metadata ``'xx'``) for ``test``/2 and print the result."""
    offsets = {
        TopicPartition(topic='test', partition=2): OffsetAndMetadata(2, 'xx'),
    }
    print(consumer.commit(offsets))
def test_position(consumer):
    """Assign partition 2 of topic ``test`` and print the consumer's position."""
    target = TopicPartition(topic='test', partition=2)
    consumer.assign([target])
    print(consumer.position(target))
def test_seek_to_end(consumer):
    """Seek partition 2 of topic ``test`` to its end and print the position.

    ``seek_to_end`` has no meaningful return value, so printing it (as this
    function used to) always showed ``None``.  The resulting position is
    printed instead, mirroring ``test_seek_to_beginning``.
    """
    partition = TopicPartition('test', 2)
    consumer.assign([partition])
    consumer.seek_to_end(partition)
    response = consumer.position(partition)
    print(response)
def test_seek_to_beginning(consumer):
    """Rewind partition 2 of topic ``test`` and print the resulting position."""
    target = TopicPartition(topic='test', partition=2)
    consumer.assign([target])
    consumer.seek_to_beginning(target)
    print(consumer.position(target))
import time from PIL import Image import datetime import os import cv2 import numpy as np # settings client = "192.168.100.100:9092" topic = 'video' path = "/home/mooc/videoexample/consumed/" #consumer = KafkaConsumer(client) consumer = KafkaConsumer(bootstrap_servers=client) total_time = time.time() for i in range(100): topic_mod = topic + str(i) partitions = TopicPartition(topic_mod, 0) consumer.assign([partitions]) #consumer = KafkaConsumer(client) consumer.seek_to_beginning() lastoffset = consumer.end_offsets([partitions])[partitions] path_mod = path + topic_mod + "/" if not os.path.exists(path_mod): os.makedirs(path_mod) print("topic name: " + topic_mod) print(lastoffset) for msg in consumer: start_time = time.time() array = np.frombuffer(msg.value, dtype=np.dtype('uint8')) img = cv2.imdecode(array, 1) #cv2.imshow('recv',img) #cv2.imwrite(mk_path+str(key)+'.jpg', img)
def consume(args):
    """Consume change events from Kafka and write them out in batches.

    ``args`` must provide ``schema`` (a member of ``settings.SCHEMAS``),
    ``tables`` (comma-separated names, each in ``settings.TABLES``) and
    ``skip_error``.

    One partition per table is looked up in ``settings.PARTITIONS`` under
    the key ``"<schema>.<table>"`` and assigned to the consumer.  Events
    are buffered per table and flushed through ``writer.insert_event`` when
    either ``settings.INSERT_NUMS`` events are buffered or the event time
    has advanced by at least ``settings.INSERT_INTERVAL`` (only when that
    interval is greater than 0).  Offsets are committed after each flush.

    When an insert fails (falsy result or exception) and ``skip_error`` is
    falsy, the process exits via ``exit()``.

    Raises:
        AssertionError: if the schema or any table is not configured.
    """
    schema = args.schema
    tables = args.tables
    skip_error = args.skip_error
    assert schema in settings.SCHEMAS, f'schema {schema} must in settings.SCHEMAS'
    topic = settings.KAFKA_TOPIC
    # table name -> primary key as returned by reader.get_primary_key
    tables_pk = {}
    partitions = []
    for table in tables.split(','):
        assert table in settings.TABLES, f'table {table} must in settings.TABLES'
        # NOTE(review): .get() returns None for an unconfigured table, which
        # would yield TopicPartition(topic, None) - confirm this cannot happen.
        partition = settings.PARTITIONS.get(f'{schema}.{table}')
        tp = TopicPartition(topic, partition)
        partitions.append(tp)
        tables_pk[table] = reader.get_primary_key(schema, table)

    # The consumer group is derived from the schema and the raw table list.
    group_id = f'{schema}.{tables}'
    consumer = KafkaConsumer(
        bootstrap_servers=settings.KAFKA_SERVER,
        value_deserializer=lambda x: json.loads(x, object_hook=object_hook),
        key_deserializer=lambda x: x.decode() if x else None,
        enable_auto_commit=False,
        group_id=group_id,
        auto_offset_reset='earliest',
    )
    consumer.assign(partitions)

    # Batching state: buffered events per table, flush flag, event time of
    # the first buffered event, and number of buffered events.
    event_list = {}
    is_insert = False
    last_time = 0
    len_event = 0
    logger.info(f'success consume topic:{topic},partitions:{partitions},schema:{schema},tables:{tables}')
    for msg in consumer:  # type:ConsumerRecord
        logger.debug(f'kafka msg:{msg}')
        event = msg.value
        # presumably event_unixtime is in microseconds - TODO confirm
        event_unixtime = event['event_unixtime'] / 10 ** 6
        table = event['table']
        # NOTE(review): this rebinds the outer `schema`; the value of the
        # most recent event is what the flush below uses for every item.
        schema = event['schema']
        event_list.setdefault(table, []).append(event)
        len_event += 1

        if last_time == 0:
            last_time = event_unixtime

        # Flush on batch size, or on elapsed event time when the interval
        # setting is positive.
        if len_event == settings.INSERT_NUMS:
            is_insert = True
        else:
            if event_unixtime - last_time >= settings.INSERT_INTERVAL > 0:
                is_insert = True
        if is_insert:
            # table -> {table+schema+action+action_core -> [events]}
            data_dict = {}
            events_num = 0
            for table, items in event_list.items():
                for item in items:
                    action = item['action']
                    action_core = item['action_core']
                    data_dict.setdefault(table, {}).setdefault(table + schema + action + action_core, []).append(item)
            for table, v in data_dict.items():
                tmp_data = []
                for k1, v1 in v.items():
                    events_num += len(v1)
                    tmp_data.append(v1)
                try:
                    result = writer.insert_event(tmp_data, schema, table, tables_pk.get(table))
                    if not result:
                        logger.error('insert event error!')
                        if not skip_error:
                            exit()
                except Exception as e:
                    logger.error(f'insert event error!,error:{e}')
                    if not skip_error:
                        exit()
            # Commit only after every table of the batch has been handed
            # to the writer, then reset the batching state.
            consumer.commit()
            logger.info(f'commit success {events_num} events!')
            event_list = {}
            is_insert = False
            len_event = last_time = 0
from json import loads
from kafka import KafkaConsumer, TopicPartition


def _decode_json(raw):
    """Decode a UTF-8 payload and parse it as JSON."""
    return loads(raw.decode('utf-8'))


consumer_options = dict(
    bootstrap_servers=['localhost:9092'],
    group_id='my-group',
    auto_offset_reset='latest',
    enable_auto_commit=False,
    value_deserializer=_decode_json,
)
consumer = KafkaConsumer(**consumer_options)

# Read a single, explicitly assigned partition starting from its end, so
# only messages produced from now on are printed.
tp = TopicPartition('measurements_node_2182', 0)
consumer.assign([tp])
consumer.seek_to_end(tp)

for record in consumer:
    print(record.value)
import logging
import sys

from kafka import KafkaConsumer, KafkaProducer, TopicPartition
from util.http_status_server import HttpHealthServer
from util.task_args import get_kafka_binder_brokers, get_input_channel, get_output_channel

# Route kafka-python's own log output to stdout at INFO level.
logger = logging.getLogger('kafka')
logger.addHandler(logging.StreamHandler(sys.stdout))
logger.setLevel(logging.INFO)
logger.warning("Test warning mesage logger 12345 helllo")

consumer = KafkaConsumer(bootstrap_servers=[get_kafka_binder_brokers()],
                         api_version=(0, 9),
                         group_id=None,
                         auto_offset_reset='latest')
producer = KafkaProducer(bootstrap_servers=[get_kafka_binder_brokers()],
                         api_version=(0, 9))

# Read partition 0 of the input topic from its current end, i.e. only
# messages produced after start-up are forwarded.
tp = TopicPartition("two-forty.input", 0)
consumer.assign([tp])
consumer.seek_to_end()

HttpHealthServer.run_thread()
counter = 0

while True:
    for message in consumer:
        # Forward the payload only.  The whole ConsumerRecord was passed
        # before, but without a value_serializer the producer accepts
        # nothing except bytes-like values (or None).
        producer.send("new_message", message.value)
# message.offset, message.key, # message.value.decode('utf-8'))) # i+=1 # if i>100: break """消费者(手动设置偏移量)""" # consumer = KafkaConsumer('phone-game-userinfo', bootstrap_servers=['172.23.11.150:9092']) consumer = KafkaConsumer('phone-game-userlogin-kong', bootstrap_servers=['172.23.11.150:9092']) print(consumer.partitions_for_topic( "phone-game-userinfo")) # 获取phone-game-userinfo主题的分区信息 print(consumer.topics()) # 获取主题列表 print(consumer.subscription()) # 获取当前消费者订阅的主题 print(consumer.assignment()) # 获取当前消费者topic、分区信息 print(consumer.beginning_offsets(consumer.assignment())) # 获取当前消费者可消费的偏移量 # consumer.seek(TopicPartition(topic=u'phone-game-userinfo', partition=0), 100875) # 重置偏移量,从第50个偏移量消费 consumer.seek(TopicPartition(topic=u'phone-game-userlogin-kong', partition=0), 1) print(consumer.end_offsets( consumer.assignment())) # Get the last offset for the given partitions print( consumer.end_offsets( [TopicPartition(topic='phone-game-userlogin-kong', partition=0)])) # 同上一句等价 t = '2018-05-10' timeArray = time.strptime(t, '%Y-%m-%d') timeStamp = int(time.mktime(timeArray)) print( consumer.offsets_for_times({ TopicPartition(topic='phone-game-userlogin-kong', partition=0): timeStamp }))
def main():
    """Run the live classification loop.

    The user is prompted for the architecture (``snn`` or ``cbs``), one
    Kafka consumer is created per topic in ``config.topic_list`` and a
    ``Classifier`` thread is started.  Then, until interrupted with
    Ctrl-C, a single example is read from Kafka, converted to a normalised
    array and queued for the classifier, after which every consumer is
    rewound by two messages.

    Exits with status 0 if no Kafka broker is reachable.
    """
    config = Configuration()

    # suppress debugging messages of tensorflow
    # os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

    # scalers fitted on the training data, used to normalise each example
    scalers = load_scalers(config)

    consumers = []
    limiting_consumer = None

    selection = ''
    while selection not in ['cbs', 'snn']:
        print(
            'Please select architecture that should be used. Type "snn" or "cbs"'
        )
        selection = input()
        print()

    print('Creating consumers ...\n')

    # The fabric simulation replays topics from their start; live
    # classification begins with the newest messages.
    if config.testing_using_fabric_sim:
        offset = 'earliest'
    else:
        offset = 'latest'

    def decode_json(m):
        return json.loads(m.decode('utf-8'))

    try:
        for topic in config.topic_list:
            topic_consumer = KafkaConsumer(
                topic,
                bootstrap_servers=config.get_connection(),
                value_deserializer=decode_json,
                auto_offset_reset=offset)

            # The consumer of the limiting topic determines the time
            # interval of an example.
            if topic == config.limiting_topic:
                limiting_consumer = topic_consumer

            consumers.append(topic_consumer)
    except errors.NoBrokersAvailable:
        print(
            'Configured kafka server is not available. Please check the connection or change the configuration.'
        )
        sys.exit(0)

    # The classifier runs in its own thread and pulls examples off a queue.
    print('\nCreating classifier ...')
    print('\nUsed model file:')
    print(config.directory_model_to_use, '\n')

    print('The classifier will use k=' + str(config.k_of_knn) + ' for the k-NN algorithm')
    print(
        'The mean similarity output is calculated on the basis of the k most similar cases'
    )
    print('The time span is the time between the end timestamp of the')
    print('interval and the current time right before the output.')
    print(
        'The total time is the time needed for the completely processing the example,'
    )
    print('including the time in the queue.\n')

    classifier = Classifier(config, selection)
    classifier.start()

    print('Waiting for data to classify ...\n')

    try:
        while True:
            start_time = time.perf_counter()

            # Lists of single messages making up one example.
            results = read_single_example(consumers, limiting_consumer, config)

            # Merge them into one dataframe, convert it to the array form
            # used as network input and normalise it.
            df = list_to_dataframe(results, config)
            example = normalise_dataframe(df.to_numpy(), scalers)

            # Queue element: the example, its first and last index, and
            # the time processing started.
            element = (example, df.index[0], df.index[-1], start_time)
            classifier.examples_to_classify.put(element)

            # Step every consumer back by two messages to shrink the time
            # intervals that would otherwise be left out.
            for idx, topic_consumer in enumerate(consumers):
                partition = TopicPartition(config.topic_list[idx], 0)
                last_offset = topic_consumer.position(partition)
                topic_consumer.seek(partition, max(last_offset - 2, 0))

    except KeyboardInterrupt:
        # Ask the classifier thread to stop.
        print('Exiting ...\n')
        classifier.stop = True
def Consumer(thread_name, topic, partition):
    # Worker loop for one topic partition: resume from the offset returned
    # by get_kafka(), poll batches, group the decoded rows by SQL statement,
    # store them with save_to_db() and then record/commit the offset.
    # The loop never returns normally; it leaves via sys.exit() when saving
    # fails or when update_offset() returns 0.
    #
    #   thread_name - used as the Kafka client_id and in log output
    #   topic, partition - the single partition this worker is assigned
    print(
        thread_name,
        "Starting\tDispose",
    )
    global is_dispose
    broker_list = '172.16.90.63:6667, 172.16.90.58:6667, 172.16.90.59:6667'
    # The bare string below is the author's (Chinese) notes on the consumer
    # settings fetch_min_bytes, fetch_max_wait_ms, fetch_max_bytes,
    # enable_auto_commit, max_poll_records and max_poll_interval_ms.
    ''' fetch_min_bytes(int) - 服务器为获取请求而返回的最小数据量,否则请等待 fetch_max_wait_ms(int) - 如果没有足够的数据立即满足fetch_min_bytes给出的要求,服务器在回应提取请求之前将阻塞的最大时间量(以毫秒为单位) fetch_max_bytes(int) - 服务器应为获取请求返回的最大数据量。这不是绝对最大值,如果获取的第一个非空分区中的第一条消息大于此值, 则仍将返回消息以确保消费者可以取得进展。注意:使用者并行执行对多个代理的提取,因此内存使用将取决于包含该主题分区的代理的数量。 支持的Kafka版本> = 0.10.1.0。默认值:52428800(50 MB)。 enable_auto_commit(bool) - 如果为True,则消费者的偏移量将在后台定期提交。默认值:True。 max_poll_records(int) - 单次调用中返回的最大记录数poll()。默认值:500 max_poll_interval_ms(int) - poll()使用使用者组管理时的调用之间的最大延迟 。这为消费者在获取更多记录之前可以闲置的时间量设置了上限。 如果 poll()在此超时到期之前未调用,则认为使用者失败,并且该组将重新平衡以便将分区重新分配给另一个成员。默认300000 '''
    consumer = KafkaConsumer(
        bootstrap_servers=broker_list,
        group_id="xiaofei",
        client_id=thread_name,
        # auto_offset_reset="smallest",
        enable_auto_commit=False,
        fetch_min_bytes=1024 * 1024,
        # fetch_max_bytes=1024 * 1024 * 1024 * 10,
        fetch_max_wait_ms=60000,
        request_timeout_ms=305000,
        # consumer_timeout_ms=1,
        # max_poll_records=5000,
        # max_poll_interval_ms=60000  (no such parameter)
    )
    # Offset previously stored for this partition (see update_offset below).
    dic = get_kafka(topic, partition)
    tp = TopicPartition(topic, partition)
    # print(thread_name, tp, dic['offset'])
    consumer.assign([tp])
    # Reposition the partition to the stored offset.
    consumer.seek(tp, dic['offset'])
    print("程序首次运行\t线程:", thread_name, "分区:", partition, "偏移量:", dic['offset'], "\t开始消费...")
    num = 0
    # end_offset = consumer.end_offsets([tp])[tp]
    # print(end_offset)
    while True:
        # SQL statement -> list of row tuples collected from this poll.
        args = OrderedDict()
        checkThread()
        msg = consumer.poll(timeout_ms=60000)
        # NOTE(review): the end offset is read *after* poll(); if messages
        # arrive in between, it is ahead of what this batch contains, and
        # it is the value stored and committed below - confirm intended.
        end_offset = consumer.end_offsets([tp])[tp]
        print('已保存的偏移量', consumer.committed(tp), '最新偏移量,', end_offset)
        # Test for a dying thread:
        # if thread_name=="Thread-1" and num==2:
        #     sys.exit()
        # Flush pending thread messages once, when is_dispose is set.
        if len(thread_msg) > 0 and is_dispose is True:
            is_dispose = False
            for msg_send in thread_msg:
                exp(msg_send)
                send_msg(msg_send)
            thread_msg.clear()
        if len(msg) > 0:
            print("线程:", thread_name, "分区:", partition, "最大偏移量:", end_offset, "有无数据,", len(msg))
            lines = 0
            for data in msg.values():
                for line in data:
                    lines += 1
                    # NOTE(review): eval() executes the message payload as
                    # Python code; unsafe unless every producer is trusted.
                    line = eval(line.value.decode('utf-8'))
                    value, log_name = get_line(col_dic, line)
                    sql = sql_dic[log_name]
                    if value is not None:
                        args.setdefault(sql, []).append(tuple(value))
            print(thread_name, "处理条数", lines)
            # Save the collected rows to the database.
            is_succeed = save_to_db(args, thread_name)
            if is_succeed:
                # Update the partition offset stored in the database.
                is_succeed1 = update_offset(topic, partition, end_offset)
                # Manually commit the offset to Kafka.
                consumer.commit(
                    offsets={tp: (OffsetAndMetadata(end_offset, None))})
                # print(thread_name,"to db suss",num+1)
                if is_succeed1 == 0:
                    sys.exit()
            else:
                sys.exit()
        else:
            pass
            # print(thread_name, 'no data')
            # time.sleep(60)
        num += 1
def hgweb():
    '''hgweb component of the vcsreplicator bootstrap procedure.

    Command-line arguments: the vcsreplicator config path, the path of the
    JSON document produced by the hgssh bootstrap step, and an optional
    ``--workers`` count (defaults to the CPU count).

    The JSON document supplies, per partition, the ``(start, end)`` offset
    range of the bootstrap messages plus the list of repositories to clone.
    All messages in those ranges are collected from Kafka, a clone is
    scheduled for each repository on its bootstrap ``hg-repo-sync-2``
    message, and any later messages for an already scheduled repository
    are processed once its clone has completed.

    Returns 1 after writing the collected errors to
    ``/repo/hg/hgweb_bootstrap_out.json`` if anything failed; otherwise
    falls off the end and returns ``None``.
    '''
    import argparse

    # Configure logging
    logger.setLevel(logging.INFO)
    handler = logging.StreamHandler(sys.stdout)
    handler.setLevel(logging.INFO)
    formatter = logging.Formatter('%(name)s %(message)s')
    # Render log timestamps in UTC.
    formatter.converter = time.gmtime
    handler.setFormatter(formatter)
    logger.addHandler(handler)

    # Parse CLI args
    parser = argparse.ArgumentParser()
    parser.add_argument('config', help='Path of config file to load')
    parser.add_argument('input', help='JSON data input (output from the hgssh bootstrap procedure) file path')
    parser.add_argument('--workers', help='Number of concurrent workers to use for performing clones', type=int,
                        default=multiprocessing.cpu_count())
    args = parser.parse_args()

    logger.info('reading hgssh JSON document')
    with open(args.input, 'r') as f:
        hgssh_data = json.loads(f.read())
        logger.info('JSON document read')

    # Convert the JSON keys to integers
    hgssh_data['offsets'] = {
        int(k): v
        for k, v in hgssh_data['offsets'].items()
    }

    config = Config(filename=args.config)

    consumer_config = {
        # set this so offsets are committed to Zookeeper
        'api_version': (0, 8, 1),
        'bootstrap_servers': config.c.get('consumer', 'hosts'),
        'client_id': config.c.get('consumer', 'client_id'),
        'enable_auto_commit': False,
        'group_id': config.c.get('consumer', 'group'),
        'max_partition_fetch_bytes': MAX_BUFFER_SIZE,
        'value_deserializer': value_deserializer,
    }

    topic = config.c.get('consumer', 'topic')

    topicpartitions = [
        TopicPartition(topic, partition)
        for partition, (start_offset, end_offset)
        in sorted(hgssh_data['offsets'].items())
        # there is no need to do an assignment if the length of the
        # bootstrap message range is 0
        if start_offset != end_offset
    ]

    consumer = KafkaConsumer(**consumer_config)

    # We will remove repos from this set as we replicate them
    # Once this is an empty set we are done
    repositories_to_clone = set(hgssh_data['repositories'])

    extra_messages = collections.defaultdict(collections.deque)  # maps repo names to extra processing messages
    clone_futures_repo_mapping = {}  # maps cloning futures to repo names
    extra_messages_futures_repo_mapping = {}  # maps extra messages futures to repo names

    # Overwrite default hglib path so handle_message_main and it's derivatives
    # use the correct virtualenv
    hglib.HGPATH = config.c.get('programs', 'hg')

    # Maps partitions to the list of messages within the bootstrap range
    aggregate_messages_by_topicpartition = {
        tp.partition: []
        for tp in topicpartitions
    }

    # Gather all the Kafka messages within the bootstrap range for each partition
    for topicpartition in topicpartitions:
        start_offset, end_offset = hgssh_data['offsets'][topicpartition.partition]

        # The recorded end offset is exclusive; make it the offset of the
        # last message that belongs to the range.
        end_offset -= 1

        # Assign the consumer to the next partition and move to the start offset
        logger.info('assigning the consumer to partition %s' % topicpartition.partition)
        consumer.assign([topicpartition])

        logger.info('seeking the consumer to offset %s' % start_offset)
        consumer.seek(topicpartition, start_offset)
        consumer.commit(offsets={
            topicpartition: OffsetAndMetadata(start_offset, '')
        })

        logger.info('partition %s of topic %s moved to offset %s' %
                    (topicpartition.partition, topicpartition.topic, start_offset))

        # Get all the messages we need to process from kafka
        for message in consumer:
            # Check if the message we are processing is within the range of accepted messages
            # If we are in the range, add this message to the list of messages on this partition
            # If we are at the end of the range, break from the loop and move on to the next partition
            if message.offset <= end_offset:
                aggregate_messages_by_topicpartition[message.partition].append(message)
                logger.info('message on partition %s, offset %s has been collected' % (message.partition, message.offset))

            # Commit the offset of the next message to read.
            consumer.commit(offsets={
                TopicPartition(topic, message.partition): OffsetAndMetadata(message.offset + 1, ''),
            })

            if message.offset >= end_offset:
                logger.info('finished retrieving messages on partition %s' % message.partition)
                break

    logger.info('finished retrieving messages from Kafka')

    # repo name -> list of error strings; a non-empty mapping means failure
    outputdata = collections.defaultdict(list)

    # Process the previously collected messages
    with futures.ThreadPoolExecutor(args.workers) as e:
        for partition, messages in sorted(aggregate_messages_by_topicpartition.items()):
            logger.info('processing messages for partition %s' % partition)
            for message in messages:
                payload = message.value

                # Ignore heartbeat messages
                if payload['name'] == 'heartbeat-1':
                    continue

                if payload['path'] in repositories_to_clone:
                    # If we have not yet replicated the repository for this message,
                    # of the repo sync message is not tagged with the bootstrap flag,
                    # move on to the next message. The assumed upcoming hg-repo-sync-2
                    # message will clone the data represented in this message anyways.
                    if payload['name'] != 'hg-repo-sync-2' or not payload['bootstrap']:
                        continue

                    logger.info('scheduled clone for %s' % payload['path'])

                    # Schedule the repo sync
                    clone_future = e.submit(clone_repo, config, payload['path'],
                                            payload['requirements'], payload['hgrc'],
                                            payload['heads'])

                    # Here we register the future against its repo name
                    clone_futures_repo_mapping[clone_future] = payload['path']

                    # Remove the repo from the set of repos
                    # which have not been scheduled to sync
                    repositories_to_clone.remove(payload['path'])
                else:
                    # If the repo is not in the list of repositories to clone,
                    # then we have already scheduled the repo sync and we will
                    # need to process this message once the sync completes.
                    extra_messages[payload['path']].append((config, payload))
                    logger.info('extra messages found for %s: %s total' %
                                (payload['path'], len(extra_messages[payload['path']]))
                                )

        # Anything still in the set never received its bootstrap sync message.
        if repositories_to_clone:
            logger.error('did not receive expected sync messages for %s' % repositories_to_clone)

            # Add errors to audit output
            for repo in repositories_to_clone:
                outputdata[repo].append('did not receive sync message')

        # Process clones
        remaining_clones = len(clone_futures_repo_mapping)
        for completed_future in futures.as_completed(clone_futures_repo_mapping):
            repo = clone_futures_repo_mapping[completed_future]

            exc = completed_future.exception()
            if exc:
                message = 'error triggering replication of Mercurial repo %s: %s' % (repo, str(exc))
                logger.error(message)

                # Add error to audit output
                outputdata[repo].append(message)
            else:
                logger.info('%s successfully cloned' % repo)

            remaining_clones -= 1

            logger.info('%s repositories remaining' % remaining_clones)

            # Schedule extra message processing if necessary
            if repo in extra_messages:
                logger.info('scheduling extra processing for %s' % repo)
                configs, payloads = zip(*extra_messages[repo])
                # NOTE(review): under Python 3 `map` is lazy, so this future
                # would complete without calling handle_message_main -
                # confirm which interpreter this runs on.
                future = e.submit(map, handle_message_main, configs, payloads)
                extra_messages_futures_repo_mapping[future] = repo

        # Process extra messages
        total_message_batches = len(extra_messages_futures_repo_mapping)
        for completed_future in futures.as_completed(extra_messages_futures_repo_mapping):
            repo = extra_messages_futures_repo_mapping[completed_future]

            exc = completed_future.exception()
            if exc:
                message = 'error processing extra messages for %s: %s' % (repo, str(exc))
                logger.error(message)

                # Add error to audit output
                outputdata[repo].append(message)
            else:
                logger.info('extra processing for %s completed successfully' % repo)

            total_message_batches -= 1
            logger.info('%s batches remaining' % total_message_batches)

    logger.info('%s bootstrap process complete' % config.c.get('consumer', 'group'))

    # If anything broke, dump the errors and set exit code 1
    if outputdata:
        with open('/repo/hg/hgweb_bootstrap_out.json', 'w') as f:
            f.write(json.dumps(outputdata))
        return 1
def getMsgData(topic, group, result, maxsize): try: saveResult = SaveDataResult() saveResult.guid = str(uuid.uuid4()) saveResult.CreateDate = datetime.datetime.now().strftime( "%Y-%m-%d %H:%M:%S") msgInfos = [] result.guid = saveResult.guid result.topic_messages = [] consumer = KafkaConsumer(bootstrap_servers=tmpbootstrap_servers, enable_auto_commit=False, group_id=group) # Get all partitions by topic par = consumer.partitions_for_topic(topic) now_count = 0 for p in par: tp = TopicPartition(topic, p) consumer.assign([tp]) print(tp) info = MsgPartitionInfo() # Get committed offset print('start to get committed offset.....') try: committed = consumer.committed(tp) or 0 except Exception, e_commit: print(str(e_commit)) # Move consumer to end to get the last position consumer.seek_to_end(tp) last_offset = consumer.position(tp) # Move consumer to beginning to get the first position consumer.seek_to_beginning() now_offset = consumer.position(tp) from_offset = committed if from_offset is None: from_offset = now_offset if from_offset < now_offset: from_offset = now_offset info.partition_ID = tp.partition info.get_last_offset = last_offset msgInfos.append(info) print("[%s] partition(%s) -> now:%s, last:%s, committed:%s" % (tp.topic, tp.partition, now_offset, last_offset, committed)) # Get msg from position to offset while (from_offset < last_offset) and (now_count < maxsize): consumer.seek(tp, from_offset) polldata = consumer.poll(100) from_offset += 1 now_count += 1 print('now_count=' + str(now_count)) result.topic_messages.append(polldata[tp][0].value) saveResult.MsgInfo = json.dumps(msgInfos, default=encode_MsgPartitionInfo, ensure_ascii=False) print(saveResult.MsgInfo) consumer.close() saveResult.message = "Success" saveResult.Code = 200 producer = KafkaProducer(bootstrap_servers=tmpbootstrap_servers) producer.send(topic + "_log", json.dumps(saveResult, default=encode_SaveDataResult)) producer.flush()
def main(input, output, consumer, sc, spark, es):
    """Collect hashtag messages up to a cutoff minute, archive them, index them.

    Steps, in order:
      1. Delete the 'speed-layer-twitter' Elasticsearch index.
      2. Read messages from ``consumer`` until one carries a ``datehashtag``
         whose minute-of-day reaches the cutoff (current minute + 1).
      3. Write the collected messages to HDFS as an Avro file at ``output``.
      4. Explode the hashtags with Spark and push every row through
         ``elastic``.

    Args:
        input: Only printed here.
        output: HDFS path of the Avro file (overwritten).
        consumer: Kafka consumer to iterate; must support ``commit(offsets=)``.
        sc: SparkContext used to parallelize the collected messages.
        spark: Not used by this function's body.
        es: Elasticsearch client, also forwarded to ``elastic``.
    """
    print(input)
    # Remove what the previous run indexed; 400/404 are ignored so a missing
    # index is not an error.
    es.indices.delete(index='speed-layer-twitter', ignore=[400, 404])
    print("suppression speed-layer avant commencement")

    # Sample the clock once so hour and minute come from the same instant.
    now = datetime.now()
    # NOTE(review): this is a minute-of-day value; at 23:59 it becomes 1440,
    # which no message can reach -- confirm the job never runs across
    # midnight.
    minutesMax = (now.hour * 60) + now.minute + 1
    list_hastag = []

    for message in consumer:
        print("%s:%d:%d: key=%s value=%s" %
              (message.topic, message.partition, message.offset, message.key,
               message.value))
        print("OFFSET : ", message.offset)
        valeur = json.loads(message.value.decode())
        print(valeur)
        datehashtag = valeur['datehashtag']
        hastags = valeur['hashtags']

        dateHas = datetime.strptime(datehashtag, '%Y/%m/%d %H:%M:%S')
        totalMinutesTopic = dateHas.hour * 60 + dateHas.minute
        print("heure: %d, minute: %d" % (dateHas.hour, dateHas.minute))
        print("minutetopic: %d, minutemax: %d" %
              (totalMinutesTopic, minutesMax))
        print(hastags, "à la date : ", datehashtag)

        if totalMinutesTopic >= minutesMax:
            # This message belongs to the next window. Commit its own offset
            # so the next run starts by re-reading it, and do NOT add it to
            # the current window (it used to be appended before this check
            # and was therefore processed twice).
            tp = TopicPartition(message.topic, message.partition)
            offsets = {tp: OffsetAndMetadata(message.offset, None)}
            consumer.commit(offsets=offsets)
            print("On stoppe la boucle")
            break

        list_hastag.append(valeur)

    # Archive the window to HDFS as Avro.
    schema = {
        "namespace": "ffo.hashtag",
        "type": "record",
        "name": "Node",
        "fields": [
            {"name": "datehashtag", "type": "string"},
            {"name": "timestamp", "type": "int"},
            {
                "name": "hashtags",
                "type": {"type": "array", "items": "string"},
                # An array field's default must be a JSON array, not {}.
                "default": [],
            },
        ],
    }
    hdfs_client = hdfs.InsecureClient("http://0.0.0:50070")
    with hdfs_client.write(output, overwrite=True) as avro_file:
        fastavro.writer(avro_file, schema, list_hastag)
    print("ok")

    if not list_hastag:
        # Nothing was collected; Spark cannot infer a schema from an empty
        # RDD, and there is nothing to index anyway.
        return

    # Explode the hashtag arrays with Spark, then index one row per hashtag.
    distData = sc.parallelize(list_hastag)
    df = distData.toDF()
    # Cache the dataframe so the select/collect below does not recompute it.
    df.persist()
    dt = df.select(explode(df.hashtags), df.datehashtag, df.timestamp)
    for f in dt.collect():
        print("==============enregistrement elasticsearch===================")
        print(f.col)
        print(f.datehashtag)
        print("==============enregistrement elasticsearch===================")
        elastic(f.timestamp, f.datehashtag, f.col, es)
def _import_partition_batch(neo_driver, labels, topic_partition, records):
    """Write one partition's polled records to Neo4j: nodes, then relationships.

    ``labels`` is the running list of labels that already have a uniqueness
    constraint; it is extended in place when new labels show up.
    """
    # Keep the first and last offset for debug
    first_offset = records[0].offset
    last_offset = records[-1].offset

    # Formatted messages used in micro-batches; the team name is the second
    # dot-separated component of the topic name.
    messages = validate_and_reformat_messages(
        team=topic_partition.topic.split(".")[1],
        records=[r.value for r in records],
    )

    # Create a uniqueness constraint for every label not seen before.
    new_labels = set(messages["nodes"].keys()) - set(labels)
    for lab in new_labels:
        cypher = "CREATE CONSTRAINT ON (l:`$LABEL$`) ASSERT l.name IS UNIQUE;".replace(
            "$LABEL$", lab)
        neo_driver.exec_cypher(cypher, {})
        # Append before logging so the message really shows the updated list.
        labels.append(lab)
        logger.info("[*] {} index created, indexes are now {}".format(
            lab, labels))

    start_batch_time = time.time()
    logger.info("[*] Consuming {} with offsets from #{} to #{}...".format(
        topic_partition.topic, first_offset, last_offset))

    # Nodes first, one query per label
    for label, nodes in messages["nodes"].items():
        cypher = NODE_TEMPLATE.replace("$LABEL$", label)
        start_query_time = time.time()
        _, summary = neo_driver.exec_cypher(cypher, {"nodes": nodes})
        logger.info("[nodes] {} done using {} messages in {}s : {}".format(
            label,
            len(nodes),
            round(time.time() - start_query_time, 3),
            summary.counters,
        ))

    # Then relationships, one query per (source, target) pair
    for source, targets in messages["rels"].items():
        for target, rels in targets.items():
            cypher = REL_TEMPLATE.replace("$SOURCE$", source)
            cypher = cypher.replace("$TARGET$", target)
            start_query_time = time.time()
            result, summary = neo_driver.exec_cypher(cypher, {"rels": rels})
            failures, stats = catch_relationship_validation_errors(result)
            if len(failures):
                logger.warning(
                    "[validation] {} relationship(s) failed validation"
                    .format(len(failures)))
                for f in failures:
                    logger.warning(
                        "[validation] relationship properties failed validation : {}"
                        .format(f))
            logger.info(
                "[rels] {} -> {} done using {} message(s) in {}s : {}".format(
                    source,
                    target,
                    len(rels),
                    round(time.time() - start_query_time, 3),
                    stats,
                ))

    logger.info("[*] Batch done in {} seconds".format(
        round(time.time() - start_batch_time, 3)))


def run_consumer(consumer_config, kafka_config, neo4j_config):
    """Consume the configured Kafka topics forever and load them into Neo4j.

    Partition 0 of every topic in ``consumer_config["kafka"]["topics"]`` is
    assigned manually. Each poll result is written to Neo4j and the consumed
    offsets are committed afterwards. A heartbeat is triggered whenever
    ``consumer_config["heartbeat"]["delay"]`` seconds have elapsed.

    Args:
        consumer_config: Mapping with "kafka" -> "topics" and
            "heartbeat" -> "delay".
        kafka_config: Keyword arguments for ``KafkaConsumer``.
        neo4j_config: Configuration passed to ``Neo4jBoltDriver``.

    This function does not return.
    """
    neo_driver = Neo4jBoltDriver(neo4j_config).connect()
    c = KafkaConsumer(**kafka_config)

    # First we need to retrieve the list of indexes
    index_records = neo_driver.session.read_transaction(
        lambda tx: tx.run("CALL db.indexes;"))
    # NOTE(review): the [10:-6] slice presumably strips a fixed prefix/suffix
    # from the index description to leave the label -- confirm against the
    # Neo4j version in use.
    labels = [description[10:-6] for description in index_records.value()]
    logger.info("[*] Initial indexes : {}".format(labels))

    # Log the Kafka settings without the password; pop() tolerates configs
    # that carry no SASL password at all.
    conf = {**kafka_config}
    conf.pop("sasl_plain_password", None)
    topics = consumer_config["kafka"]["topics"]
    logger.info("[*] Starting to consume topic {} : {}".format(topics, conf))

    tps = [TopicPartition(topic, 0) for topic in topics]
    c.assign(tps)
    logger.info("Consumer assigned to: {}".format([tp.topic for tp in tps]))

    last_heartbeat = 0
    while True:
        if int(time.time()
               ) - last_heartbeat >= consumer_config["heartbeat"]["delay"]:
            last_heartbeat = trigger_heartbeat(neo_driver)

        # NOTE(review): poll() is called with its default timeout; confirm
        # whether a blocking timeout is wanted here.
        batch_messages = c.poll()
        if not batch_messages:
            continue

        # One entry per assigned partition that returned records. All of
        # them must be processed, because commit() below commits the
        # consumed position of every assigned partition; handling only the
        # first entry would silently drop the others.
        for topic_partition, records in batch_messages.items():
            _import_partition_batch(neo_driver, labels, topic_partition,
                                    records)
        c.commit()
def on_partitions_assigned(self, assigned):
    """Print the new assignment, then rewind partition 0 of "Panda_Media" to offset 0.

    ``assigned`` is only printed; the partition that gets rewound is fixed.
    """
    print(assigned)
    rewind_target = TopicPartition("Panda_Media", 0)
    consumer.seek(rewind_target, 0)
logger = get_logger('notzam')
# Broker address comes from the environment; os.environ.get yields None when
# the variable is unset.
KAFKA_BROKER_URL = os.environ.get('KAFKA_BROKER_URL')


def home(request):
    """Render the 'ml_home.html' template."""
    return render(request, 'ml_home.html')


def model_summary(request: HttpRequest):
    """Render 'ml_model_summary.html' with the result of get_model_summary()."""
    return render(request, 'ml_model_summary.html', {'model_summary': get_model_summary()})


# Module-level consumers, created at import time: one per topic, each pinned
# to partition 0.
# NOTE(review): the poll(1) right after assign() presumably primes the
# consumer so later polls return data promptly -- confirm.
trained = consumer(KAFKA_BROKER_URL)
trained_partition = TopicPartition('trained', 0)
trained.assign([trained_partition])
trained.poll(1)

detected = consumer(KAFKA_BROKER_URL)
detected_partition = TopicPartition('detected', 0)
detected.assign([detected_partition])
detected.poll(1)

logger.info(KAFKA_BROKER_URL)


def training(request):
    # For AJAX requests, poll the 'trained' consumer and extract the record
    # value for its partition via _ext_record_value.
    # NOTE(review): if `request` is a Django HttpRequest, is_ajax() was
    # deprecated in Django 3.1 -- confirm the framework version.
    if request.is_ajax():
        msg = trained.poll(50)
        msg = _ext_record_value(msg, trained_partition)
from kafka import KafkaConsumer
from kafka import TopicPartition
import tornado
import json
import tornado.ioloop
import tornado.web
import tornado.websocket
import tornado.template
from config import KAFKA_BOOTSTRAP_SERVERS
from config import MESSAGE_BURST
import time
from random import randint

# One module-level consumer, created at import time and pinned to
# partition 0 of the 'hello-csv' topic.
consumer = KafkaConsumer(bootstrap_servers=KAFKA_BOOTSTRAP_SERVERS)
consumer.assign([TopicPartition('hello-csv', 0)])


class MainHandler(tornado.web.RequestHandler):
    """HTTP handler that serves the graph page."""

    def get(self):
        """Load the graph template relative to the working directory and write it out."""
        loader = tornado.template.Loader(".")
        self.write(loader.load("./resources/templates/graph.html").generate())


class WSHandler(tornado.websocket.WebSocketHandler):
    """WebSocket handler for the graph page."""

    def check_origin(self, origin):
        """Return True unconditionally, so no origin is rejected by this check."""
        return True

    def open(self):
        """Send an empty JSON object as soon as the socket opens."""
        self.write_message("{}")
        # for message in consumer:
        #     messageJson = json.loads(message.value)
def receiveFromKafka(mode, topic_override=None):
    """Consume a Heroku Kafka topic and forward push-notification events.

    Every received message is decoded as JSON. When it has a "payload" whose
    ``Action_Type__c`` is ``'PushNotification'``, the payload's
    ``message__c`` is sent to ``userid__c`` through
    ``notification.sendNotification``. The offset is committed after every
    message, including messages whose handling raised.

    Args:
        mode: ``"subscribe"`` subscribes to the topic name WITHOUT
            ``KAFKA_PREFIX``; ``"assign"`` reads partition 1 of the fully
            prefixed topic. Any other value leaves the consumer without a
            topic.
        topic_override: Topic name (without prefix) to use instead of
            ``KAFKA_TOPIC_READ``.
    """
    TOPIC = KAFKA_TOPIC_READ if topic_override is None else topic_override
    logger.info("Will use topic = {}".format(TOPIC))

    consumer = HerokuKafkaConsumer(
        # KAKFA_TOPIC,  # Optional: no topic needs to be passed here
        url=KAFKA_URL,  # Url string provided by heroku
        ssl_cert=KAFKA_CLIENT_CERT,  # Client cert string
        ssl_key=KAFKA_CLIENT_CERT_KEY,  # Client cert key string
        ssl_ca=KAFKA_TRUSTED_CERT,  # Client trusted cert string
        prefix=KAFKA_PREFIX,  # Prefix provided by heroku
        # 'earliest' replaces the deprecated alias 'smallest'.
        auto_offset_reset="earliest",
        max_poll_records=100,
        enable_auto_commit=True,
        auto_commit_interval_ms=100,
        group_id=KAFKA_GROUP_ID,
        api_version=(0, 9),
    )

    # subscribe() takes topic names without KAFKA_PREFIX, whereas assign()
    # needs the full topic name including the prefix.
    assigned_partition = 1
    tp = TopicPartition(KAFKA_PREFIX + TOPIC, assigned_partition)
    if mode == "subscribe":
        # Pass a real list: "(TOPIC)" is just the string itself, not a tuple.
        consumer.subscribe(topics=[TOPIC])
    elif mode == "assign":
        consumer.assign([tp])

    # Debug dump: current assignment, the topic's partitions, visible topics.
    for assignment in consumer.assignment():
        logger.debug(assignment)

    partitions = consumer.partitions_for_topic(KAFKA_PREFIX + TOPIC)
    if partitions:
        for partition_id in partitions:
            logger.debug("Partition=" + str(partition_id))

    topics = consumer.topics()
    if topics:
        for topic_name in topics:
            logger.debug("Topic:" + topic_name)

    logger.debug('waiting ..')

    # Listening to events works exactly as in kafka_python.
    for i, record in enumerate(consumer):
        try:
            logger.debug("%i %s:%d:%d: key=%s value=%s" %
                         (i, record.topic, record.partition, record.offset,
                          record.key, record.value))
            dictValue = ujson.loads(record.value)
            logger.debug(dictValue)

            # check value in the field Action Type
            if "payload" in dictValue:
                payload = dictValue['payload']
                if payload['Action_Type__c'] == 'PushNotification':
                    logger.info(
                        "about to send a BROWSER NOTIFICATION using PUSHER")
                    text = payload['message__c']
                    # NOTE(review): the original wrote a trailing comma here,
                    # so a 1-tuple (not a plain string) is what
                    # sendNotification has always received. Kept as-is and
                    # made explicit -- confirm which form it expects.
                    userid = (payload['userid__c'],)
                    notification.sendNotification(userid, text)

            # Disabled alternative (platform events carrying a 'channel'
            # key): 'host_accept_guest__e' and 'send_smss__e' sent an SMS
            # through blower, 'push_notification__e' sent a browser
            # notification, all reading dictValue['data']['payload'].

            consumer.commit()
        except Exception:
            # Best effort: record the failure through the module logger and
            # still commit, so one bad message cannot block the consumer.
            logger.exception("failed to process Kafka message")
            consumer.commit()
import json
import logging

logger = logging.getLogger(__name__)
# enable the debug logger if you want to see ALL of the lines
logging.basicConfig(level=logging.INFO)

# Creating Kafka Consumer
# Consumes Kafka messages
listBootstrapServer = ['127.0.0.1:9092']
consumer = KafkaConsumer(
    bootstrap_servers=listBootstrapServer,
    auto_offset_reset='earliest',
    # Message values are decoded as ASCII text and parsed as JSON.
    value_deserializer=lambda m: json.loads(m.decode('ascii')),
    # Passed to KafkaConsumer as consumer_timeout_ms (1,000,000 ms).
    consumer_timeout_ms=1000000,
    group_id='My-first-app')

# Assign: read partition 0 of the topic directly instead of subscribing.
topicPartition = TopicPartition('Learning_Kafka_1', 0)
consumer.assign([topicPartition])

# Seek: start reading at offset 40 of that partition.
consumer.seek(topicPartition, 40)

# Print every message received from that position onwards.
for message in consumer:
    print("topic=%s partition=%d offset=%d key=%s value=%s" %
          (message.topic, message.partition, message.offset, str(
              message.key), message.value))
info['platform-release']=platform.release() info['platform-version']=platform.version() info['architecture']=platform.machine() info['hostname']=socket.gethostname() info['ip-address']=socket.gethostbyname(socket.gethostname()) info['mac-address']=':'.join(re.findall('..', '%012x' % uuid.getnode())) info['processor']=platform.processor() info['ram']=str(round(psutil.virtual_memory().total / (1024.0 ** 3)))+" GB" return json.dumps(info) except Exception as e: logging.exception(e) print("Starting Consumer 2;") print(json.loads(getSystemInfo())) consumer = KafkaConsumer(bootstrap_servers="localhost:9093", client_id="number_consumer2", auto_offset_reset='earliest', enable_auto_commit=False, consumer_timeout_ms=1000) partition = TopicPartition('number', 0) consumer.assign([partition]) consumer.seek_to_beginning(partition) sum_numbers = 0 for msg in consumer: sum_numbers += int.from_bytes(msg.value, 'big') print("The sum of all numbers recorded is: {:i}", sum_numbers)