def main():
    happy_log_probs, sad_log_probs = readSentimentList('twitter_sentiment_list.csv')
    consumer = KafkaConsumer("tweets",
                             bootstrap_servers=["localhost:9092"],
                             auto_offset_reset='smallest')
    kafka = KafkaClient("localhost:9092")
    producer = SimpleProducer(kafka)
    topic = 'hashtag_sentiment'
    positive_tags = Counter()
    negative_tags = Counter()

    while True:
        for message in consumer.fetch_messages():
            txt = message.value
            txt = re.sub(r'[^\x00-\x7F]', ' ', txt)
            hashtags, sentiment = classifySentiment(txt, happy_log_probs, sad_log_probs)
            for hashtag in hashtags:
                if sentiment > 0:
                    positive_tags[hashtag] += 1
                else:
                    negative_tags[hashtag] += 1

        results = {}
        for key, val in positive_tags.most_common(20):
            results[key] = val
        producer.send_messages(topic, json.dumps(results))
        time.sleep(10)
def read_groups(self):
    self.log.info("Kafka consumer running")
    self.consumer = KafkaConsumer(
        CONSUMER_OFFSET_TOPIC,
        group_id='offset_monitoring_consumer',
        bootstrap_servers=self.kafka_config.broker_list,
        auto_offset_reset='smallest',
        auto_commit_enable=False,
        consumer_timeout_ms=10000,
    )
    self.log.info("Consumer ready")
    self.watermarks = self.get_current_watermarks()
    while not self.finished():
        try:
            message = self.consumer.next()
            max_offset = self.get_max_offset(message.partition)
            if message.offset >= max_offset - 1:
                self.finished_partitions.add(message.partition)
        except ConsumerTimeout:
            break
        except (
            FailedPayloadsError,
            KafkaUnavailableError,
            LeaderNotAvailableError,
            NotLeaderForPartitionError,
        ) as e:
            self.log.warning("Got %s, retrying", e.__class__.__name__)
        self.process_consumer_offset_message(message)
    return self.kafka_groups
def consumer():
    # consumer = SimpleConsumer(client, group=None,
    #                           topic=topic, partitions=[0, ],
    #                           auto_commit=False)
    # node_id = list(consumer.client.conns.keys())[0]
    # print dir(consumer.client.conns[node_id])
    # for i in consumer.get_messages(100):
    #     print i.offset
    # consumer.commit()

    # from pykafka import KafkaClient
    # client = KafkaClient(hosts="127.0.0.1:9092")
    # print client.topics
    # topic1 = client.topics[topic]
    # consumer = topic1.get_simple_consumer(auto_commit_enable=True, )
    # for message in consumer:
    #     if message is not None:
    #         print message.offset, message.value

    connect_str = '127.0.0.1:9092'
    consumer = KafkaConsumer(topic,
                             group_id='my-group',
                             bootstrap_servers=[connect_str],
                             auto_offset_reset='largest',  # 'largest' or 'smallest'
                             auto_commit_enable=True,
                             auto_commit_interval_messages=1000)
    # Optionally specify offsets to start from
    consumer.set_topic_partitions((topic, 2, 50032),)
    # kafka.set_topic_partitions("topic1", ("topic2", 2), {"topic3": 0})
    # A partition can only be consumed by one consumer in the group, so it is
    # best to specify explicitly which partitions to consume.
    # kafka.set_topic_partitions({ ("topic1", 0): 12, ("topic2", 1): 45 })
    # print consumer.topics
    for message in consumer:
        print("%s:%d:%d: key=%s value=%s" % (message.topic, message.partition,
                                             message.offset, message.key,
                                             message.value))
        consumer.commit()
def read_groups(self, partition=None):
    self.consumer = KafkaConsumer(
        group_id='offset_monitoring_consumer',
        bootstrap_servers=self.kafka_config.broker_list,
        auto_offset_reset='earliest',
        enable_auto_commit=False,
        consumer_timeout_ms=30000,
        fetch_max_wait_ms=2000,
        max_partition_fetch_bytes=10 * 1024 * 1024,  # 10MB
    )
    if partition is not None:
        self.active_partitions = {
            partition: TopicPartition(CONSUMER_OFFSET_TOPIC, partition),
        }
    else:
        self.active_partitions = {
            p: TopicPartition(CONSUMER_OFFSET_TOPIC, p)
            for p in self.consumer.partitions_for_topic(CONSUMER_OFFSET_TOPIC)
        }
    self.watermarks = self.get_current_watermarks(
        list(self.active_partitions.values()))
    # Active partitions are not empty. Remove the empty ones.
    self.active_partitions = {
        p: tp for p, tp in self.active_partitions.items()
        if tp.partition in self.watermarks and
        self.watermarks[tp.partition].highmark > 0 and
        self.watermarks[tp.partition].highmark > self.watermarks[tp.partition].lowmark
    }
    # Cannot consume if there are no active partitions
    if not self.active_partitions:
        return {}
    self.consumer.assign(list(self.active_partitions.values()))
    self.log.info("Consuming from %s", self.active_partitions)
    message_iterator = iter(self.consumer)
    while not self.finished():
        try:
            message = next(message_iterator)
        except StopIteration:
            continue
        # Stop when reaching the last message written to the
        # __consumer_offsets topic when KafkaGroupReader first started
        if message.offset >= self.watermarks[message.partition].highmark - 1:
            self.remove_partition_from_consumer(message.partition)
        self.process_consumer_offset_message(message)
    self._remove_unsubscribed_topics()
    return {
        group: topics.keys()
        for group, topics in six.iteritems(self._kafka_groups)
        if topics
    }
def CheckTopicExists(topic):
    consumer = KafkaConsumer(bootstrap_servers=tmpbootstrap_servers,
                             enable_auto_commit=False,
                             group_id='consumer')
    # Get all partitions for the topic; None means the topic does not exist.
    par = consumer.partitions_for_topic(topic)
    print(par)
    if par is None:
        return False
    return True
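# A minimal usage sketch for CheckTopicExists() above. The broker address and
# topic name are assumptions for illustration; tmpbootstrap_servers is assumed
# to be module-level state, as in the snippet.
tmpbootstrap_servers = ['localhost:9092']

if CheckTopicExists('orders'):
    print('topic "orders" exists, safe to consume')
else:
    print('topic "orders" not found, create it first')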
def invoke_kafka_consumer(self, p_str_broker_host, p_is_sasl):
    if p_is_sasl:
        # consumer = Consumer({
        #     'bootstrap.servers': config.BOOTSTRAP_SERVERS,
        #     'group.id': config.CONSUMER_GROUP,
        #     'enable.auto.commit': False,
        # })
        return KafkaConsumer(bootstrap_servers=p_str_broker_host,
                             security_protocol='SASL_PLAINTEXT',
                             sasl_mechanism='PLAIN',
                             sasl_plain_username='******',
                             sasl_plain_password='******')
    else:
        return KafkaConsumer(bootstrap_servers=p_str_broker_host)
class IndexedConsumer():
    """
    A simple consumer to retrieve messages from the input queue
    when it is time to send them
    """

    def __init__(self, input_topic, hosts):
        self.input_topic = input_topic
        self.consumer = KafkaConsumer(bootstrap_servers=hosts)

    def retrieve_event(self, event_reference):
        self.consumer.set_topic_partitions(
            (self.input_topic, event_reference.partition,
             event_reference.offset))
        message = self.consumer.next()
        event = ScheduledEvent.from_dict(json.loads(message.value))
        return event
def create_consumer(topics, brokers, group, max_bytes=1024 * 1024,
                    max_wait_ms=100):
    kafka = KafkaConsumer(*topics,
                          metadata_broker_list=brokers,
                          group_id=group,
                          fetch_message_max_bytes=max_bytes,
                          fetch_wait_max_ms=max_wait_ms)
    return kafka
def get_kafka_consumer(topic: str, consumer_group: str) -> KafkaConsumer:
    return KafkaConsumer(topic,
                         bootstrap_servers=['localhost:9092'],
                         auto_offset_reset='latest',
                         enable_auto_commit=True,
                         group_id=consumer_group,
                         value_deserializer=lambda x: loads(x.decode('utf-8')))
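# A usage sketch for the factory above; topic and group names are illustrative,
# and `loads` is assumed to be json.loads, matching the deserializer.
consumer = get_kafka_consumer('events', 'analytics-group')
for record in consumer:
    # record.value is already a dict thanks to the value_deserializer
    print(record.topic, record.partition, record.offset, record.value)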
def kafka_consumer_entrypoint():
    global g_config_kafka
    print("Kafka config: " + str(g_config_kafka))
    consumer = KafkaConsumer(
        g_config_kafka['topic_name'],
        # group_id must be passed as a keyword argument; a second positional
        # argument would be treated as another topic name.
        group_id=g_config_kafka['group_id'],
        bootstrap_servers=[g_config_kafka['bootstrap_server']],
        security_protocol="SSL",
        ssl_cafile=g_config_kafka['ssl_cafile'],
        ssl_keyfile=g_config_kafka['ssl_keyfile'],
        ssl_certfile=g_config_kafka['ssl_certfile'])
    for message in consumer:
        # Extract message.value:
        try:
            value = json.loads(message.value.decode('utf-8'))
            # Message is a JSON dictionary of the form:
            # { site_id: ..., status_code: ..., regex_results: ... }
            # print(";; debug: " + str(value))
            db_store_probe_results(value['site_id'], value['status_code'],
                                   value['regex_results'])
        except Exception:
            print("Unable to parse message from the kafka topic.")
            raise
    return True
def configure_internal_queues(self):
    """
    configures the internal queues used to hold references to events
    in the input queue
    """
    for i in range(self.number_of_queues):
        client = KafkaClient(hosts=self.kafka_hosts)
        queue_name = SCHEDULER_QUEUE_FORMAT.format(2**i)
        client.ensure_topic_exists(queue_name)
        indexed_consumer = IndexedConsumer(self.input_topic, self.kafka_hosts)
        queue_consumer = KafkaConsumer(
            queue_name,
            bootstrap_servers=self.kafka_hosts,
            group_id=queue_name,
            consumer_timeout_ms=2000,
            auto_commit_enable=False,
        )
        queue_producer = SimpleProducer(client)
        queue_duration = 2**i
        self.queues.append(
            InternalQueue(
                queue_consumer,
                indexed_consumer,
                queue_producer,
                self.number_of_queues,
                queue_duration,
            ))
def read_groups(self, partition=None):
    self.consumer = KafkaConsumer(
        group_id='offset_monitoring_consumer',
        bootstrap_servers=self.kafka_config.broker_list,
        auto_offset_reset='earliest',
        enable_auto_commit=False,
        consumer_timeout_ms=30000,
        fetch_max_wait_ms=2000,
        max_partition_fetch_bytes=10 * 1024 * 1024,  # 10MB
    )
    # Fetch metadata as partitions_for_topic only returns locally cached metadata
    # See https://github.com/dpkp/kafka-python/issues/1742
    self.consumer.topics()

    if partition is not None:
        self.active_partitions = {
            partition: TopicPartition(CONSUMER_OFFSET_TOPIC, partition),
        }
    else:
        self.active_partitions = {
            p: TopicPartition(CONSUMER_OFFSET_TOPIC, p)
            for p in self.consumer.partitions_for_topic(CONSUMER_OFFSET_TOPIC)
        }
    self.watermarks = self.get_current_watermarks(list(self.active_partitions.values()))
    # Active partitions are not empty. Remove the empty ones.
    self.active_partitions = {
        p: tp for p, tp in self.active_partitions.items()
        if tp.partition in self.watermarks and
        self.watermarks[tp.partition].highmark > 0 and
        self.watermarks[tp.partition].highmark > self.watermarks[tp.partition].lowmark
    }
    # Cannot consume if there are no active partitions
    if not self.active_partitions:
        return {}
    self.consumer.assign(list(self.active_partitions.values()))
    self.log.info("Consuming from %s", self.active_partitions)
    message_iterator = iter(self.consumer)
    while not self.finished():
        try:
            message = next(message_iterator)
        except StopIteration:
            continue
        # Stop when reaching the last message written to the
        # __consumer_offsets topic when KafkaGroupReader first started
        if message.offset >= self.watermarks[message.partition].highmark - 1:
            self.remove_partition_from_consumer(message.partition)
        self.process_consumer_offset_message(message)
    self._remove_unsubscribed_topics()
    return {
        group: topics.keys()
        for group, topics in six.iteritems(self._kafka_groups)
        if topics
    }
def __init__(self, topic, addr, auto_commit=False, auto_offset_reset="earliest"):
    """Initializes with Topic Name, Broker Address, and Consumer Settings"""
    self.consumer = KafkaConsumer(
        topic,
        bootstrap_servers=addr,
        value_deserializer=lambda m: json.loads(m.decode('ascii')),
        enable_auto_commit=auto_commit,
        auto_offset_reset=auto_offset_reset,
        # assumed broker version; kafka-python expects a real Kafka release
        # such as 0.10, and "0.1.0" is not one
        api_version=(0, 10, 0))
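# A minimal instantiation sketch: the __init__ above presumably belongs to a
# consumer wrapper class. "JsonConsumer" and the broker address are
# hypothetical names, not from the original snippet.
wrapper = JsonConsumer('events', ['localhost:9092'], auto_commit=True)
for record in wrapper.consumer:
    print(record.value)  # already a dict, via the ascii/JSON value_deserializer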
def __init__(self, kafkaHost, kafkaPort, tcpHost, tcpPort, group_id, topic,
             logTopic, interval):
    self.kafkaHost = kafkaHost
    self.kafkaPort = kafkaPort
    self.tcpHost = tcpHost
    self.tcpPort = tcpPort
    self.group_id = group_id
    self.topic = topic
    self.logTopic = logTopic
    self.interval = int(interval)
    self.consumer = KafkaConsumer(
        topic,
        bootstrap_servers=["{}:{}".format(kafkaHost, kafkaPort)],
        group_id=group_id,
        enable_auto_commit=False)
    self.producer = KafkaProducer(
        bootstrap_servers=["{}:{}".format(kafkaHost, kafkaPort)])
    self.tcpWriter = None
def __init__(self, kafka_host, kafka_port, tcp_host, tcp_port, topic,
             log_topic):
    self.kafka_host = kafka_host
    self.kafka_port = kafka_port
    self.tcp_host = tcp_host
    self.tcp_port = tcp_port
    self.topic = topic
    self.log_topic = log_topic
    self.consumer = KafkaConsumer(
        topic,
        bootstrap_servers=["{}:{}".format(kafka_host, kafka_port)],
        enable_auto_commit=False,
        max_poll_records=1024 * 1024,
        max_partition_fetch_bytes=1024 * 1024 * 100)
    self.producer = KafkaProducer(
        bootstrap_servers=["{}:{}".format(kafka_host, kafka_port)])
    self.connections = {}
    self.sample_end_time = self.get_end_time(time())
    self.lastPolled = []
def thread_main(topic):
    consumer = KafkaConsumer(topic,
                             group_id='kafka_monitor',
                             metadata_broker_list=broker_list)
    offset = consumer._offsets.fetch
    for part in offset:
        kafka_logsize.labels(topic=part[0], partition=part[1]).set(offset[part])
def kafka_input(collector, **options):
    group_id = options.pop("group_id", "hackathon")
    broker = options.pop("broker", os.getenv("KAFKA_BROKER", "").split(","))
    consumer = KafkaConsumer(collector,
                             metadata_broker_list=broker,
                             group_id=group_id,
                             auto_commit_enable=False)
    return {
        "collector": collector,
        "files": [KafkaInputBview(consumer, collector), kafka_iter(consumer)],
        "format": kafka_format
    }
def _create_kafka_consumer(self):
    consumer = KafkaConsumer(
        self._kafka_topic,
        bootstrap_servers=self._kafka_brokers,
        # How to behave when there is no initial offset in ZooKeeper, or the
        # offset is out of range; 'largest' discards old messages.
        auto_offset_reset=self._kafka_start_offset,
        # If True, the consumer syncs its offset to ZooKeeper after consuming,
        # so that after a failure a new consumer can pick up the latest offset
        # from ZooKeeper.
        enable_auto_commit=False,
        client_id=str(uuid.uuid1()),  # unique client id for this consumer
        group_id=self._kafka_group)
    return consumer
def initialize(self, stormconf, context):
    #self.words = itertools.cycle(['dog', 'cat',
    #                              'zebra', 'elephant'])
    #self.sentences = [
    #    "She advised him to take a long holiday, so he immediately quit work and took a trip around the world",
    #    "I was very glad to get a present from her",
    #    "He will be here in half an hour",
    #    "She saw him eating a sandwich",
    #]
    #self.sentences = itertools.cycle(self.sentences)
    self.consumer = KafkaConsumer(b'twitterstream',
                                  bootstrap_servers=['0.0.0.0:9092'])
def commitTopic(topic, group, partition, commit_offset):
    try:
        print('====================================================================================')
        print('[commitTopic] : topic=' + topic + ', group=' + group +
              ', partition=' + str(partition) +
              ', commit_offset=' + str(commit_offset))
        consumer2 = KafkaConsumer(bootstrap_servers=tmpbootstrap_servers,
                                  enable_auto_commit=False,
                                  group_id=group)
        tp = TopicPartition(topic, partition)
        if int(commit_offset) > 0:
            consumer2.commit({tp: OffsetAndMetadata(commit_offset, None)})
    except Exception as ee:
        print('error when commit Topic')
        print(str(ee))
    finally:
        print('commitTopic end')
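# A hedged companion sketch: after committing with commitTopic(), the stored
# offset can be read back with kafka-python's committed(). Topic, group, and
# offset values are illustrative; tmpbootstrap_servers is reused from above.
commitTopic('orders', 'my-group', 0, 42)  # commit offset 42 on partition 0
probe = KafkaConsumer(bootstrap_servers=tmpbootstrap_servers,
                      enable_auto_commit=False,
                      group_id='my-group')
print(probe.committed(TopicPartition('orders', 0)))  # expected: 42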
def configure_input_queue(self):
    """
    configures the input queue that other services can use to schedule
    an event to be delivered
    """
    client = KafkaClient(hosts=self.kafka_hosts)
    client.ensure_topic_exists(self.input_topic)
    indexed_consumer = IndexedConsumer(self.input_topic, self.kafka_hosts)
    queue_consumer = KafkaConsumer(self.input_topic,
                                   bootstrap_servers=self.kafka_hosts,
                                   group_id=CONSUMER_GROUP)
    queue_producer = SimpleProducer(KafkaClient(hosts=self.kafka_hosts))
    self.queues.append(
        InputQueue(queue_consumer, indexed_consumer, queue_producer,
                   self.number_of_queues))
def receive_message(self, cgroup_name):
    consumer = KafkaConsumer(TOPIC,
                             group_id=cgroup_name,
                             bootstrap_servers=[BOOTSTRAP_IP])
    try:
        for msg in consumer:
            msg = msg.value
            logger.info("consumer receive message %s" % msg)
            future = self.thread_pool.submit(self.msg_handler, msg)
            future.add_done_callback(self.callback_handler)
    except Exception:
        logger.error("consumer error")
        logger.error(traceback.format_exc())
    finally:
        self.thread_pool.shutdown(wait=True)
def run(self):
    consumer = KafkaConsumer(bootstrap_servers=self.bootstrap_servers,
                             auto_offset_reset='earliest',
                             group_id=self.group,
                             consumer_timeout_ms=1000)
    consumer.subscribe(self.topics)
    while not self.stop_event.is_set():
        for message in consumer:
            print(message)
            if self.stop_event.is_set():
                break
    consumer.close()
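# A driver sketch for the run() method above, assuming it lives in a
# threading.Thread subclass (here called "Consumer", a hypothetical name)
# whose __init__ sets stop_event, bootstrap_servers, group, and topics.
import time

consumer_thread = Consumer()
consumer_thread.start()
time.sleep(10)                    # let it consume for a while
consumer_thread.stop_event.set()  # run() exits its loop and closes the consumer
consumer_thread.join()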
def consume(args):
    schema = args.schema
    table = args.table
    assert schema in settings.SCHEMAS, 'schema must be in settings.SCHEMAS'
    assert table in settings.TABLES, 'table must be in settings.TABLES'
    group_id = f'{schema}.{table}'
    consumer = KafkaConsumer(
        bootstrap_servers=settings.KAFKA_SERVER,
        value_deserializer=lambda x: json.loads(x, object_hook=object_hook),
        key_deserializer=lambda x: x.decode() if x else None,
        enable_auto_commit=False,
        group_id=group_id,
        auto_offset_reset='earliest',
    )
    topic = settings.KAFKA_TOPIC
    partition = settings.PARTITIONS.get(group_id)
    consumer.assign([TopicPartition(topic, partition)])
    event_list = []
    logger.info(
        f'success consume topic:{topic},partition:{partition},schema:{schema},table:{table}'
    )
    pk = reader.get_primary_key(schema, table)
    for msg in consumer:  # type:ConsumerRecord
        logger.debug(f'kafka msg:{msg}')
        event = msg.value
        event_list.append(event)
        len_event = len(event_list)
        if len_event == settings.INSERT_NUMS or (
                (int(time.time() * 10**6) - event_list[0]['event_unixtime']) / 10**6
                >= settings.INSERT_INTERVAL > 0):
            data_dict = {}
            tmp_data = []
            for items in event_list:
                action = items['action']
                action_core = items['action_core']
                data_dict.setdefault(table + schema + action + action_core,
                                     []).append(items)
            for k, v in data_dict.items():
                tmp_data.append(v)
            result = writer.insert_event(tmp_data, settings.SKIP_TYPE,
                                         settings.SKIP_DELETE_TB_NAME,
                                         schema, table, pk)
            if result:
                event_list = []
                consumer.commit()
                logger.info(f'commit success {len_event} events!')
            else:
                logger.error('insert event error!')
                exit()
def run(self):
    avro_serde = AvroSerDe(AVRO_SCHEMA_STRING)
    client = KafkaClient('localhost:9092')
    consumer = KafkaConsumer(KAFKA_TOPIC,
                             group_id='my_group',
                             bootstrap_servers=['localhost:9092'])
    # Keep track of and print statistics.
    attempts = 0
    failures = 0
    failure_rate = 0.0
    for message in consumer:
        event = avro_serde.bytes_to_obj(message.value)
        print('--> ' + str(event))
        if event['op'] == 'login':
            attempts += 1
            if not event['success']:
                failures += 1
            failure_rate = float(failures) / attempts
            print('--> Event: ' + str(event))
            print('--> Failure Rate: ' + str(failure_rate))
class KafkaGroupReader:

    def __init__(self, kafka_config):
        self.log = logging.getLogger(__name__)
        self.kafka_config = kafka_config
        self.kafka_groups = defaultdict(set)
        self.finished_partitions = set()

    def read_groups(self):
        self.log.info("Kafka consumer running")
        self.consumer = KafkaConsumer(
            CONSUMER_OFFSET_TOPIC,
            group_id='offset_monitoring_consumer',
            bootstrap_servers=self.kafka_config.broker_list,
            auto_offset_reset='smallest',
            auto_commit_enable=False,
            consumer_timeout_ms=10000,
        )
        self.log.info("Consumer ready")
        self.watermarks = self.get_current_watermarks()
        while not self.finished():
            try:
                message = self.consumer.next()
                max_offset = self.get_max_offset(message.partition)
                if message.offset >= max_offset - 1:
                    self.finished_partitions.add(message.partition)
            except ConsumerTimeout:
                break
            except (
                FailedPayloadsError,
                KafkaUnavailableError,
                LeaderNotAvailableError,
                NotLeaderForPartitionError,
            ) as e:
                self.log.warning("Got %s, retrying", e.__class__.__name__)
            self.process_consumer_offset_message(message)
        return self.kafka_groups

    def parse_consumer_offset_message(self, message):
        key = bytearray(message.key)
        ((key_schema,), cur) = relative_unpack('>h', key, 0)
        if key_schema not in [0, 1]:
            raise InvalidMessageException()  # This is not an offset commit message
        (group, cur) = read_short_string(key, cur)
        (topic, cur) = read_short_string(key, cur)
        ((partition,), cur) = relative_unpack('>l', key, cur)
        if message.value:
            value = bytearray(message.value)
            ((value_schema,), cur) = relative_unpack('>h', value, 0)
            if value_schema not in [0, 1]:
                raise InvalidMessageException()  # Unrecognized message value
            ((offset,), cur) = relative_unpack('>q', value, cur)
        else:
            offset = None  # Offset was deleted
        return str(group), str(topic), partition, offset

    def process_consumer_offset_message(self, message):
        try:
            group, topic, partition, offset = self.parse_consumer_offset_message(message)
        except InvalidMessageException:
            return
        if offset:
            self.kafka_groups[group].add(topic)
        else:  # No offset means group deletion
            self.kafka_groups.pop(group, None)

    def get_current_watermarks(self):
        self.consumer._client.load_metadata_for_topics()
        offsets = get_topics_watermarks(
            self.consumer._client,
            [CONSUMER_OFFSET_TOPIC],
        )
        return {
            partition: offset
            for partition, offset in offsets[CONSUMER_OFFSET_TOPIC].iteritems()
            if offset.highmark > offset.lowmark
        }

    def get_max_offset(self, partition):
        return self.watermarks[partition].highmark

    def finished(self):
        return len(self.finished_partitions) >= len(self.watermarks)
parser = argparse.ArgumentParser()
parser.add_argument("collector")
parser.add_argument("--from-beginning", action="store_true")
parser.add_argument("--ripe-servers", default=",".join(RIPE_SERVERS))
parser.add_argument("--our-servers", default="localhost:9092")
args = parser.parse_args()

logging.basicConfig(level=logging.INFO)
start_http_server(4340 + PARTITIONS[args.collector])
logger.info("loading the stats server on %s", 4340 + PARTITIONS[args.collector])

consumer = KafkaConsumer("raw-{}".format(args.collector),
                         group_id='test_hackathon10',
                         bootstrap_servers=args.ripe_servers.split(","))
save_file = "offsets-{}".format(args.collector)
if args.from_beginning:
    logger.info("starting from scratch")
    offsets = {("raw-{}".format(args.collector), i): 0 for i in range(0, 10)}
    consumer.set_topic_partitions(offsets)
elif os.path.exists(save_file):
    with open(save_file, "r") as f:
        offsets = cPickle.load(f)
    logger.info("loading offsets from file: %s", offsets)
    consumer.set_topic_partitions(offsets)
else:
    logger.info("starting from last messages")
funcs.append(partial(annotate_if_roa, ro_rad_tree))

if args.irr_org_file is not None and args.irr_mnt_file:
    relations_dict = dict()
    fill_relation_struct(args.irr_org_file, relations_dict, "organisations")
    fill_relation_struct(args.irr_mnt_file, relations_dict, "maintainers")
    funcs.append(partial(annotate_if_relation, relations_dict))

if args.as_rel_file is not None and args.ppdc_ases_file is not None and args.as2org_file is not None:
    a, b, c, d = caida_filter_annaunce(args.as_rel_file, args.ppdc_ases_file,
                                       args.as2org_file)
    funcs.append(partial(is_legittimate, a, b, c, d))

if args.from_timestamp is None:
    consumer = KafkaConsumer("conflicts",
                             metadata_broker_list=args.our_servers.split(","),
                             group_id="detector",
                             auto_commit_enable=False)
    offset, = consumer.get_partition_offsets("conflicts",
                                             PARTITIONS[args.collector], -1, 1)
    consumer.set_topic_partitions({("conflicts", PARTITIONS[args.collector]): offset - 1})
    last_message = next(iter(consumer))
    last_data = json.loads(last_message.value)
    last_ts = last_data["timestamp"]
    logger.info("last detected event was at offset %s timestamp %s", offset, last_ts)
else:
    last_ts = args.from_timestamp

logger.info("detecting conflicts newer than %s", datetime.utcfromtimestamp(last_ts))
start_http_server(4240 + PARTITIONS[args.collector])
client = KafkaClient(args.our_servers.split(","))
import argparse

relations, childs, parents = caida_filter_annaunce("20160101.as-rel.txt",
                                                   "20160101.ppdc-ases.txt")
print(len(relations), len(childs), len(parents))

parser = argparse.ArgumentParser(
    description="get a feed of abnormal BGP conflicts")
parser.add_argument("--offset", type=int)
args = parser.parse_args()

logging.basicConfig(level=logging.INFO)
consumer = KafkaConsumer("hijacks",
                         bootstrap_servers=["comet-17-08.sdsc.edu:9092"],
                         group_id="client")
if args.offset is not None:
    topics = [("hijacks", i, args.offset) for i in PARTITIONS.values()]
    consumer.set_topic_partitions(*topics)

hijacks = 0
total = 0
for item in consumer:
    total += 1
    if is_legittimate(relations, childs, parents, json.loads(item.value)) == 0:
        hijacks += 1
        # print(item.value)
    if total == 10000:
        print(total, hijacks)
from kafka.client import KafkaClient
from kafka.consumer import KafkaConsumer
from kafka.producer import SimpleProducer
import numpy as np
from sklearn import svm
from sklearn.externals import joblib
import mysql.connector
from datetime import datetime
import json

client = KafkaClient("ip-172-31-28-55.ec2.internal:6667")
consumer = KafkaConsumer("shm",
                         metadata_broker_list=['ip-172-31-28-55.ec2.internal:6667'])
#consumer = KafkaConsumer("shm", metadata_broker_list=['ip-172-31-28-55.ec2.internal:6667'])

conn = mysql.connector.connect(
    user='******',
    password='******',
    host='iotshm-data.ck3sx5qm0blx.us-west-2.rds.amazonaws.com',
    database='iotshm')
cursor = conn.cursor()

#add_health = ("""INSERT IGNORE INTO iotshm.Health (sensor_id, timestamp, reading_type, healthy) VALUES (%s, %s, %s, %s)""")
add_magnitude = ("""INSERT IGNORE INTO iotshm.Magnitude (frequency, sensor_id, magnitude, reading_type, timestamp, healthy) VALUES(%s, %s, %s, %s, %s, %s)""")

# TODO add new classifier files and change file names
x_clf = joblib.load('xClf.pkl')
y_clf = joblib.load('xClf.pkl')
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('Basics').getOrCreate()

# import pyspark class Row from module sql
from pyspark.sql import *
from pyspark.sql.types import *
import tempfile

# ml
from pyspark.ml import Pipeline
from pyspark.ml import PipelineModel

# start a kafka consumer session
from kafka.consumer import KafkaConsumer

consumer = KafkaConsumer(
    "titanic",
    bootstrap_servers=['ip-172-31-12-218.us-east-2.compute.internal:6667'])
print('consumer launched')

testSchema = [
    "PassengerId", "Pclass", "Name", "Sex", "Age", "SibSp", "Parch",
    "Ticket", "Fare", "Cabin", "Embarked"
]

pipeline = Pipeline.load("/home/ubuntu/titanic/pipeline")
model = PipelineModel.load("/home/ubuntu/titanic/model")


def getTrain(msg):
    # put passenger info into dataframe
    # print msg
broker = SimpleClient(kafka)
lags = {}
zk = KazooClient(hosts=zookeepers, read_only=True)
zk.start()
logsize = 0
# topics = zk.get_children("/consumers/%s/owners" % (group))
topic = sys.argv[1]
data_need = sys.argv[2]
# for topic in topics:
if topic:
    logsize = 0
    # print topic
    partitions = broker.get_partition_ids_for_topic(topic)
    # print partitions
    consumer = KafkaConsumer(broker, group, str(topic))
    responses = broker.send_offset_fetch_request(
        group,
        [OffsetFetchRequestPayload(topic, p) for p in partitions],
        fail_on_error=True)
    # print responses
    latest_offset = 0
    for res in responses:
        if topic != "test":
            latest_offset += res[2]
    # print latest_offset
    for partition in partitions:
        log = "/consumers/%s/offsets/%s/%s" % (group, topic, partition)
        if zk.exists(log):
            data, stat = zk.get(log)
            logsize += int(data)
            # print logsize
class KafkaGroupReader:

    def __init__(self, kafka_config):
        self.log = logging.getLogger(__name__)
        self.kafka_config = kafka_config
        self._kafka_groups = defaultdict(lambda: defaultdict(dict))
        self.active_partitions = {}
        self._finished = False

    def read_group(self, group_id):
        partition_count = get_offset_topic_partition_count(self.kafka_config)
        partition = get_group_partition(group_id, partition_count)
        return self.read_groups(partition).get(group_id, [])

    def read_groups(self, partition=None):
        self.consumer = KafkaConsumer(
            group_id='offset_monitoring_consumer',
            bootstrap_servers=self.kafka_config.broker_list,
            auto_offset_reset='earliest',
            enable_auto_commit=False,
            consumer_timeout_ms=30000,
            fetch_max_wait_ms=2000,
            max_partition_fetch_bytes=10 * 1024 * 1024,  # 10MB
        )
        # Fetch metadata as partitions_for_topic only returns locally cached metadata
        # See https://github.com/dpkp/kafka-python/issues/1742
        self.consumer.topics()

        if partition is not None:
            self.active_partitions = {
                partition: TopicPartition(CONSUMER_OFFSET_TOPIC, partition),
            }
        else:
            self.active_partitions = {
                p: TopicPartition(CONSUMER_OFFSET_TOPIC, p)
                for p in self.consumer.partitions_for_topic(CONSUMER_OFFSET_TOPIC)
            }
        self.watermarks = self.get_current_watermarks(list(self.active_partitions.values()))
        # Active partitions are not empty. Remove the empty ones.
        self.active_partitions = {
            p: tp for p, tp in self.active_partitions.items()
            if tp.partition in self.watermarks and
            self.watermarks[tp.partition].highmark > 0 and
            self.watermarks[tp.partition].highmark > self.watermarks[tp.partition].lowmark
        }
        # Cannot consume if there are no active partitions
        if not self.active_partitions:
            return {}

        self.consumer.assign(list(self.active_partitions.values()))
        self.log.info("Consuming from %s", self.active_partitions)

        message_iterator = iter(self.consumer)

        while not self.finished():
            try:
                message = next(message_iterator)
            except StopIteration:
                continue
            # Stop when reaching the last message written to the
            # __consumer_offsets topic when KafkaGroupReader first started
            if message.offset >= self.watermarks[message.partition].highmark - 1:
                self.remove_partition_from_consumer(message.partition)
            self.process_consumer_offset_message(message)

        self._remove_unsubscribed_topics()

        return {
            group: topics.keys()
            for group, topics in six.iteritems(self._kafka_groups)
            if topics
        }

    def _remove_unsubscribed_topics(self):
        for group, topics in list(six.iteritems(self._kafka_groups)):
            for topic, partitions in list(six.iteritems(topics)):
                # If offsets for all partitions are 0, consider the topic as unsubscribed
                if not any(partitions.values()):
                    del self._kafka_groups[group][topic]
                    self.log.info("Removed group {group} topic {topic} from list of groups".format(group=group, topic=topic))

    def remove_partition_from_consumer(self, partition):
        deleted = self.active_partitions.pop(partition)
        # Terminate if there are no more partitions to consume
        if not self.active_partitions:
            self.log.info("Completed reading from all partitions")
            self._finished = True
            return
        # Reassign the remaining partitions to the consumer while saving the
        # position
        positions = [
            (p, self.consumer.position(p))
            for p in self.active_partitions.values()
        ]
        self.consumer.assign(list(self.active_partitions.values()))
        for topic_partition, position in positions:
            self.consumer.seek(topic_partition, position)
        self.log.info(
            "Completed reading from %s. Remaining partitions: %s",
            deleted,
            self.active_partitions,
        )

    def parse_consumer_offset_message(self, message):
        key = message.key
        ((key_schema,), cur) = relative_unpack(b'>h', key, 0)
        if key_schema not in [0, 1]:
            raise InvalidMessageException()  # This is not an offset commit message
        (group, cur) = read_short_string(key, cur)
        (topic, cur) = read_short_string(key, cur)
        ((partition,), cur) = relative_unpack(b'>l', key, cur)
        if message.value:
            value = message.value
            ((value_schema,), cur) = relative_unpack(b'>h', value, 0)
            if value_schema not in [0, 1]:
                raise InvalidMessageException()  # Unrecognized message value
            ((offset,), cur) = relative_unpack(b'>q', value, cur)
        else:
            offset = None  # Offset was deleted
        return group.decode(), topic.decode(), partition, offset

    def process_consumer_offset_message(self, message):
        try:
            group, topic, partition, offset = self.parse_consumer_offset_message(message)
        except InvalidMessageException:
            return

        if offset is not None:
            self._kafka_groups[group][topic][partition] = offset
            self.log.info(
                "Updated group {group} topic {topic} and updated offset in list of groups".format(
                    group=group,
                    topic=topic,
                ),
            )
        # TODO: check if we can ever find an offset commit message with message.value is None
        elif offset is None and group in self._kafka_groups and \
                topic in self._kafka_groups[group]:  # No offset means topic deletion
            del self._kafka_groups[group][topic]
            self.log.info("Removed group {group} topic {topic} from list of groups".format(group=group, topic=topic))

    def get_current_watermarks(self, partitions=None):
        client = KafkaToolClient(self.kafka_config.broker_list)
        client.load_metadata_for_topics(CONSUMER_OFFSET_TOPIC)
        offsets = get_topics_watermarks(
            client,
            [CONSUMER_OFFSET_TOPIC],
        )
        partitions_set = set(tp.partition for tp in partitions) if partitions else None
        return {
            part: offset
            for part, offset in six.iteritems(offsets[CONSUMER_OFFSET_TOPIC])
            if offset.highmark > offset.lowmark and
            (partitions is None or part in partitions_set)
        }

    def finished(self):
        return self._finished
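# A hedged usage sketch for KafkaGroupReader above. The shape of kafka_config
# is inferred from read_groups(), which only needs a broker_list attribute;
# KafkaConfig here is a hypothetical stand-in for the project's config object.
class KafkaConfig:
    broker_list = ['localhost:9092']

reader = KafkaGroupReader(KafkaConfig())
groups = reader.read_groups()  # {group_id: topics with committed offsets}
for group, topics in groups.items():
    print(group, sorted(topics))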
def consume(args):
    schema = args.schema
    tables = args.tables
    skip_error = args.skip_error
    assert schema in settings.SCHEMAS, f'schema {schema} must be in settings.SCHEMAS'
    topic = settings.KAFKA_TOPIC
    tables_pk = {}
    partitions = []
    for table in tables.split(','):
        assert table in settings.TABLES, f'table {table} must be in settings.TABLES'
        partition = settings.PARTITIONS.get(f'{schema}.{table}')
        tp = TopicPartition(topic, partition)
        partitions.append(tp)
        tables_pk[table] = reader.get_primary_key(schema, table)

    group_id = f'{schema}.{tables}'
    consumer = KafkaConsumer(
        bootstrap_servers=settings.KAFKA_SERVER,
        value_deserializer=lambda x: json.loads(x, object_hook=object_hook),
        key_deserializer=lambda x: x.decode() if x else None,
        enable_auto_commit=False,
        group_id=group_id,
        auto_offset_reset='earliest',
    )
    consumer.assign(partitions)

    event_list = {}
    is_insert = False
    last_time = 0
    len_event = 0
    logger.info(f'success consume topic:{topic},partitions:{partitions},schema:{schema},tables:{tables}')

    for msg in consumer:  # type:ConsumerRecord
        logger.debug(f'kafka msg:{msg}')
        event = msg.value
        event_unixtime = event['event_unixtime'] / 10 ** 6
        table = event['table']
        schema = event['schema']
        event_list.setdefault(table, []).append(event)
        len_event += 1

        if last_time == 0:
            last_time = event_unixtime

        if len_event == settings.INSERT_NUMS:
            is_insert = True
        else:
            if event_unixtime - last_time >= settings.INSERT_INTERVAL > 0:
                is_insert = True

        if is_insert:
            data_dict = {}
            events_num = 0
            for table, items in event_list.items():
                for item in items:
                    action = item['action']
                    action_core = item['action_core']
                    data_dict.setdefault(table, {}).setdefault(
                        table + schema + action + action_core, []).append(item)
            for table, v in data_dict.items():
                tmp_data = []
                for k1, v1 in v.items():
                    events_num += len(v1)
                    tmp_data.append(v1)
                try:
                    result = writer.insert_event(tmp_data, schema, table,
                                                 tables_pk.get(table))
                    if not result:
                        logger.error('insert event error!')
                        if not skip_error:
                            exit()
                except Exception as e:
                    logger.error(f'insert event error!,error:{e}')
                    if not skip_error:
                        exit()
            consumer.commit()
            logger.info(f'commit success {events_num} events!')
            event_list = {}
            is_insert = False
            len_event = last_time = 0
parser.add_argument("--our-servers", default=",".join(["comet-17-22.sdsc.edu:9092"])) parser.add_argument("--as-rel-file", help="TXT file containing AS relation") parser.add_argument("--ppdc-ases-file") args = parser.parse_args() collectors = COLLECTORS if len(args.collector) == 0 else args.collector logging.basicConfig(level=logging.INFO) topics = ["rib-{}".format(c) for c in collectors] logger.info("using topics %s", topics) consumer = KafkaConsumer(*topics, bootstrap_servers=args.our_servers.split(","), group_id="follower") if args.offset is not None: consumer.set_topic_partitions({(t, 0): args.offset for t in topics}) # setup filters filters = [] if args.anycast_file is not None: anycast = Radix() count = 0 with open(args.anycast_file, "r") as f: for prefix in f: if not prefix.startswith("#"): anycast.add(prefix.strip())
def main():
    logger = logging.getLogger(os.path.basename(__file__))

    # Setup Aiven SDK
    logger.info("Setting up Aiven SDK")
    client = AivenClient("https://api.aiven.io")
    client.set_auth_token(os.environ["AIVEN_TOKEN"])

    # Lookup the target service
    logger.info("Looking up the target Aiven Kafka Service")
    service = client.get_service(project=os.environ["AIVEN_PROJECT"],
                                 service=os.environ["AIVEN_SERVICE"])
    if not service:
        raise SystemExit("Failed to look up the target service")

    # Store credentials on disk. This is using the main access certificates (avnadmin).
    logger.info("Storing Aiven service access credentials")
    with open("client.crt", "w") as fh:
        fh.write(service["connection_info"]["kafka_access_cert"])
    with open("client.key", "w") as fh:
        fh.write(service["connection_info"]["kafka_access_key"])

    # Project CA certificate
    logger.info("Fetching project CA certificate")
    result = client.get_project_ca(project=os.environ["AIVEN_PROJECT"])
    with open("ca.crt", "w") as fh:
        fh.write(result["certificate"])

    # Initialize Kafka client
    kafka_client = KafkaConsumer(
        bootstrap_servers=service["service_uri"],
        security_protocol="SSL",
        ssl_cafile="ca.crt",
        ssl_certfile="client.crt",
        ssl_keyfile="client.key",
    )
    partitions = kafka_client.partitions_for_topic(os.environ["AIVEN_TOPIC"])
    tps = [
        TopicPartition(os.environ["AIVEN_TOPIC"], partition)
        for partition in partitions
    ]

    last_timestamp = time.monotonic()
    last_offsets = {}

    logger.info("Start result collection loop, break with CTRL-C")
    readings = []
    while True:
        delta = 0
        result = kafka_client.end_offsets(tps)
        timenow = time.monotonic()
        for tp, offset in result.items():
            if tp in last_offsets:
                delta += offset - last_offsets[tp]
            last_offsets[tp] = offset
        messages_per_second = int(delta / (timenow - last_timestamp))
        readings.append(messages_per_second)
        readings = readings[-30:]
        logger.info("%d messages/s, 30 sample average %d messages/s",
                    messages_per_second, sum(readings) / len(readings))
        last_timestamp = timenow
        time.sleep(2)
from kafka.consumer import KafkaConsumer
from json import loads
from mongoengine import *
from matilda.data_pipeline import object_model

consumer = KafkaConsumer(
    'numtest',  # kafka topic
    bootstrap_servers=['localhost:9092'],  # same as our producer
    # Controls where the consumer starts reading when there is no committed
    # offset (or the committed offset is invalid): 'earliest' starts from the
    # beginning of the log, 'latest' from the end.
    auto_offset_reset='earliest',
    enable_auto_commit=True,  # makes sure the consumer commits its read offset every interval
    # Join a consumer group for dynamic partition assignment and offset commits.
    # A consumer needs to be part of a consumer group for auto commit to work;
    # otherwise assign partitions manually, e.g.
    # consumer.assign([TopicPartition('foobar', 2)]); msg = next(consumer)
    group_id='my-group',
    # deserialize encoded values
    value_deserializer=lambda x: loads(x.decode('utf-8')))


def get_atlas_db_url(username, password, dbname):
    return f"mongodb+srv://{username}:{password}@cluster0.ptrie.mongodb.net/{dbname}?retryWrites=true&w=majority&" \
           f"ssl=true"


atlas_url = get_atlas_db_url(username='******', password='******', dbname='matilda-db')
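# A sketch of the manual-assignment path mentioned in the comments above:
# without a consumer group, offsets are not committed, so the start position
# is set by hand with assign()/seek(). Topic, partition, and offset values
# are illustrative.
from kafka import TopicPartition

manual_consumer = KafkaConsumer(
    bootstrap_servers=['localhost:9092'],
    value_deserializer=lambda x: loads(x.decode('utf-8')))
tp = TopicPartition('numtest', 0)
manual_consumer.assign([tp])   # no subscribe(), no group coordination
manual_consumer.seek(tp, 0)    # start from offset 0 on this partition
msg = next(manual_consumer)
print(msg.offset, msg.value)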