Example #1

import json

from kafka import KafkaProducer

import config  # project-local module that provides KAFKA_SERVERS


def main():

    producer = KafkaProducer(bootstrap_servers=config.KAFKA_SERVERS,
                             value_serializer=lambda v: json.dumps(v).encode())

    headers = [
        'CMTE_ID', 'AMNDT_IND', 'RPT_TP', 'TRANSACTION_PGI', 'IMAGE_NUM',
        'TRANSACTION_TP', 'ENTITY_TP', 'NAME', 'CITY', 'STATE', 'ZIP_CODE',
        'EMPLOYER', 'OCCUPATION', 'TRANSACTION_DT', 'TRANSACTION_AMT',
        'OTHER_ID', 'TRAN_ID', 'FILE_NUM', 'MEMO_CD', 'MEMO_TEXT', 'SUB_ID'
    ]

    with open('/home/ubuntu/2016/by_date/itcont_2016_10151005_20150726.txt'
              ) as f:
        data = f.readlines()

    data = [x.split("|") for x in data]

    # Send each row to the 'data' topic as a dict keyed by the column headers.
    for row in data:
        row = {h: x for h, x in zip(headers, row)}
        producer.send('data', row)

    producer.flush()
    producer.close()
Example #2

import logging

from kafka import KafkaProducer

class KafkaLoggingHandler(logging.Handler):
    def __init__(self, hosts_list, topic, **kwargs):
        logging.Handler.__init__(self)

        self.kafka_topic_name = topic
        # Forward any extra keyword arguments (e.g. security settings) to the producer.
        self.producer = KafkaProducer(bootstrap_servers=hosts_list, **kwargs)

    def emit(self, record):
        # Drop records from the kafka client's own loggers (kafka, kafka.conn, ...)
        # to avoid infinite recursion.
        if record.name.startswith('kafka'):
            return
        try:
            # Use the handler's configured formatter, then encode to bytes.
            msg = self.format(record)
            msg = msg.encode('utf-8')

            self.producer.send(self.kafka_topic_name, msg)
            self.producer.flush()
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception:
            self.handleError(record)

    def close(self):
        if self.producer is not None:
            self.producer.close()
        logging.Handler.close(self)
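
The handler above can be attached like any other logging handler. A minimal usage sketch, assuming a local broker and an 'app-logs' topic (both illustrative, not taken from the original example):

logger = logging.getLogger("my-app")
logger.setLevel(logging.INFO)

kafka_handler = KafkaLoggingHandler(hosts_list="localhost:9092", topic="app-logs")
kafka_handler.setFormatter(
    logging.Formatter("%(asctime)s %(levelname)s %(name)s %(message)s"))
logger.addHandler(kafka_handler)

logger.info("application started")  # delivered to the 'app-logs' Kafka topic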
Example #3
    def run(self):
        # Assumes `from kafka import KafkaProducer`, `import boto3` and `import time`,
        # and that `x`, `docs` and BUCKET_NAME are defined elsewhere in the module.
        producer = KafkaProducer(bootstrap_servers='DNS from master:9092')

        for i in x:
            # Send the value itself as bytes (the original b'i' sent the literal letter "i").
            producer.send('my_topic', str(i).encode())
            producer.flush()
            time.sleep(0.5)

        while not self.stop_event.is_set():
            # .Bucket() lives on the high-level resource API, not the low-level client.
            s3 = boto3.resource('s3')
            my_bucket = s3.Bucket(BUCKET_NAME)

            for datum in docs:
                producer.send('docs', datum)

            time.sleep(1)

        producer.close()
Example #4

import csv
import glob
import json

from kafka import KafkaProducer


def main():
    producer = KafkaProducer(bootstrap_servers=["localhost:9092"],
                             value_serializer=lambda v: json.dumps(v).encode())

    headers = ['CMTE_ID', 'AMNDT_IND', 'RPT_TP', 'TRANSACTION_PGI', 'IMAGE_NUM',
               'TRANSACTION_TP', 'ENTITY_TP', 'NAME', 'CITY', 'STATE', 'ZIP_CODE',
               'EMPLOYER', 'OCCUPATION', 'TRANSACTION_DT', 'TRANSACTION_AMT',
               'OTHER_ID', 'TRAN_ID', 'FILE_NUM', 'MEMO_CD', 'MEMO_TEXT', 'SUB_ID']
    split_counter = len(glob.glob('/home/ubuntu/manip_data/split_*'))

    # `file_number` is assumed to be defined elsewhere in the original module.
    for i in range(split_counter):
        with open('/home/ubuntu/manip_data/split_' + file_number(i)) as f:
            reader = csv.reader(f, delimiter='|')

            for row in reader:
                row = {h: x for h, x in zip(headers, row)}
                producer.send('datatwo', row)

    producer.flush()
    producer.close()
Example #5

import io

import avro.schema
from avro.io import BinaryEncoder, DatumWriter
from kafka import KafkaProducer

def send_avro_record_to_kafka(topic, value, bootstrap_servers,
                              avro_schema_json):
    value_schema = avro.schema.parse(avro_schema_json)

    producer = KafkaProducer(bootstrap_servers=bootstrap_servers)

    writer = DatumWriter(value_schema)
    bytes_writer = io.BytesIO()
    encoder = BinaryEncoder(bytes_writer)

    writer.write(value, encoder)

    try:
        producer.send(topic=topic, value=bytes_writer.getvalue())
    except Exception as e:
        print(
            f"Exception while producing record value - {value} to topic - {topic}: {e}"
        )
    else:
        print(
            f"Successfully produced record value - {value} to topic - {topic}"
        )

    producer.flush()
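
A minimal usage sketch for the helper above; the schema, topic and broker address are illustrative assumptions, not part of the original example:

USER_SCHEMA_JSON = """
{
  "type": "record",
  "name": "User",
  "fields": [
    {"name": "name", "type": "string"},
    {"name": "age", "type": "int"}
  ]
}
"""

send_avro_record_to_kafka(topic="users",
                          value={"name": "Alice", "age": 30},
                          bootstrap_servers="localhost:9092",
                          avro_schema_json=USER_SCHEMA_JSON)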
Example #6
class KafkaEventProducer:
    FLUSH_PER = 10000
    gobmodel = GOBModel()

    def __init__(self, catalogue: str, collection: str, logger):
        self.catalogue = catalogue
        self.collection = collection
        self.logger = logger
        self.gob_db_session = None
        self.db_session = None
        self.gob_db_base = None
        self.Event = None
        self.producer = None
        self.total_cnt = 0

        self._init_connections()

        self.event_builder = EventDataBuilder(self.gob_db_session,
                                              self.gob_db_base, catalogue,
                                              collection)

    def _get_tables_to_reflect(self):
        """Returns tables to reflect:
        - events
        - object table (e.g. gebieden_buurten)
        - relation tables (e.g. rel_gb_brt_gbd_wijk_ligt_in_wijk, ...)

        :return:
        """
        relations = get_relations_for_collection(self.gobmodel, self.catalogue,
                                                 self.collection)
        relation_tables = [
            self.gobmodel.get_table_name('rel', rel_table)
            for rel_table in relations.values()
        ]

        return [
            'events',
            self.gobmodel.get_table_name(self.catalogue, self.collection)
        ] + relation_tables

    def _init_gob_db_session(self):
        """Inits db session for gob db (to access events)

        :return:
        """

        engine = create_engine(URL(**GOB_DATABASE_CONFIG),
                               connect_args={'sslmode': 'require'})
        self.gob_db_session = Session(engine)
        meta = MetaData()
        meta.reflect(engine, only=self._get_tables_to_reflect())
        base = automap_base(metadata=meta)
        base.prepare()
        self.Event = base.classes.events
        self.gob_db_base = base
        self.logger.info("Initialised events storage")

    def _init_local_db_session(self):
        """Inits db session for local (gob_kafka) db

        :return:
        """
        engine = create_engine(URL(**DATABASE_CONFIG),
                               connect_args={'sslmode': 'require'})
        Base.metadata.bind = engine
        self.db_session = Session(engine)

    def _init_kafka(self):
        self.producer = KafkaProducer(
            **KAFKA_CONNECTION_CONFIG,
            # With retries enabled, max_in_flight must be 1 to preserve ordering of batches.
            max_in_flight_requests_per_connection=1,
            retries=3)
        self.logger.info("Initialised Kafka connection")

    def _init_connections(self):
        self._init_gob_db_session()
        self._init_local_db_session()
        self._init_kafka()

    def _get_last_event(self):
        last_event = self.db_session \
            .query(LastSentEvent) \
            .filter_by(catalogue=self.catalogue, collection=self.collection) \
            .first()

        return last_event

    def _get_last_eventid(self):
        last_event = self._get_last_event()
        return last_event.last_event if last_event else -1

    def _set_last_eventid(self, eventid: int):
        last_event = self._get_last_event()

        if last_event:
            last_event.last_event = eventid
        else:
            last_event = LastSentEvent(catalogue=self.catalogue,
                                       collection=self.collection,
                                       last_event=eventid)
            self.db_session.add(last_event)

        self.db_session.commit()

    def _get_events(self, min_eventid: int):
        return self.gob_db_session \
            .query(self.Event) \
            .yield_per(10000) \
            .filter(and_(self.Event.catalogue == self.catalogue, self.Event.entity == self.collection,
                         self.Event.eventid > min_eventid)) \
            .order_by(self.Event.eventid.asc())

    def _add_event(self, event):
        header = {
            'event_type': event.action,
            'event_id': event.eventid,
            'tid': event.tid,
            'catalog': event.catalogue,
            'collection': event.entity,
        }
        headers = [(k, _to_bytes(str(v)) if v else b'')
                   for k, v in header.items()]
        data = self.event_builder.build_event(event.tid)

        self.producer.send(KAFKA_TOPIC,
                           key=_to_bytes(header['tid']),
                           value=_to_bytes(json.dumps(data)),
                           headers=headers)

    def _flush(self, last_eventid: int):
        self.producer.flush(timeout=120)
        self._set_last_eventid(last_eventid)
        print(
            f"Flushed Kafka events. Total events: {self.total_cnt}. Last event id: {last_eventid}"
        )

    def produce(self):
        last_eventid = self._get_last_eventid()
        self.logger.info(f"Start producing. Last event was {last_eventid}")

        events = self._get_events(last_eventid)

        for event in events:
            self._add_event(event)

            self.total_cnt += 1
            last_eventid = event.eventid

            self.gob_db_session.expunge(event)

            if self.total_cnt % self.FLUSH_PER == 0:
                self._flush(last_eventid)

        self._flush(last_eventid)
        self.logger.info(f"Produced {self.total_cnt} Kafka events")
Example #7
import csv
import json
from time import sleep

from kafka.producer import KafkaProducer

# Stream rows of a CSV blacklist to the 'blacklist' topic, one record per second.
producer = KafkaProducer(
    bootstrap_servers='localhost:9092',
    value_serializer=lambda v: json.dumps(v).encode('utf-8'))

with open('blacklist.csv') as f:
    reader = csv.DictReader(f, delimiter=",")
    for row in reader:
        producer.send(topic='blacklist', value=row)
        producer.flush()
        sleep(1)

producer.close()
Example #8

import asyncio
from datetime import datetime
from time import mktime, strptime, time

from kafka import KafkaConsumer, KafkaProducer

# MESSAGE_SEGMENT_END_BYTE (the HL7 segment delimiter) is assumed to be defined
# elsewhere in the original module.

class AIMSDownsamplingTCPServerConsumer:
    LOG_FORMAT = "{} UTC_TS\t{}"
    INTERVAL = 60
    DELAY = 0
    MAX_CONNECTION = 32

    def __init__(self, kafka_host, kafka_port, tcp_host, tcp_port, topic,
                 log_topic):
        self.kafka_host = kafka_host
        self.kafka_port = kafka_port
        self.tcp_host = tcp_host
        self.tcp_port = tcp_port
        self.topic = topic
        self.log_topic = log_topic
        self.consumer = KafkaConsumer(
            topic,
            bootstrap_servers=["{}:{}".format(kafka_host, kafka_port)],
            enable_auto_commit=False,
            max_poll_records=1024 * 1024,
            max_partition_fetch_bytes=1024 * 1024 * 100)
        self.producer = KafkaProducer(
            bootstrap_servers=["{}:{}".format(kafka_host, kafka_port)])
        self.connections = {}
        self.sample_end_time = self.get_end_time(time())
        self.lastPolled = []

    def run(self):
        self.log("running")
        asyncio.run(self._async_run())

    async def _async_run(self):
        tcpServer = await asyncio.start_server(self.connection_handler,
                                               self.tcp_host, self.tcp_port)
        await asyncio.gather(tcpServer.serve_forever(), self.poll_from_kafka())

    async def connection_handler(self, reader, writer):
        addr = str(writer.get_extra_info("peername"))
        # A new connection, but we can accept no more
        if addr not in self.connections and \
            len(self.connections)>=self.MAX_CONNECTION:
            self.refuse_client(addr, writer)
            return
        # Add connection
        self.add_client(addr, writer)
        # Read data from connection
        remaining_data = b""
        try:
            while True:
                # Read one byte at a time just to detect when the peer closes the connection.
                data = await reader.read(1)
                if not data:
                    break
        except BrokenPipeError:
            # Catches "connection reset by peer" raised while we are sending batched
            # data, which is also when we cannot check the reader; the broken
            # connection on the writer side ultimately surfaces as a BrokenPipeError
            # on the reader side.
            pass
        finally:
            self.remove_client(addr)

    async def poll_from_kafka(self):
        polled = self.consumer.poll(timeout_ms=self.INTERVAL * 1000 / 2)
        self.lastPolled = polled
        while True:
            t = time()
            if t >= self.sample_end_time + self.DELAY:
                polled = self.consumer.poll(timeout_ms=self.INTERVAL * 1000 /
                                            2)
                lastPolled = self.lastPolled
                start_time = self.sample_end_time - self.INTERVAL
                end_time = self.sample_end_time
                self.lastPolled = polled
                self.sample_end_time = self.get_end_time(time())
                if len(self.connections) != 0:
                    # run on lastPolled first to hit the cache
                    parsed_records = self.get_parsed_records(lastPolled) + \
                                      self.get_parsed_records(polled)
                    parsed_records = list(
                        filter(
                            lambda rec: rec["observation_date_time"] is
                            not None, parsed_records))
                    ds_records = self.down_sample(parsed_records, start_time,
                                                  end_time)
                    messages = [rec["message"] for rec in ds_records]
                    for addr in self.connections.keys():
                        await self.send_or_ignore_message(addr, messages)
            await asyncio.sleep(0.1)

    def get_parsed_records(self, polled):
        # Create cache
        if "_get_parsed_records__polled" not in self.__dict__:
            self._get_parsed_records__polled = []
        if "_get_parsed_records__ret" not in self.__dict__:
            self._get_parsed_records__ret = []
        # Cache hit
        if self._get_parsed_records__polled == polled:
            return self._get_parsed_records__ret
        # Cache not hit
        self._get_parsed_records__polled = polled
        self._get_parsed_records__ret = []
        records = []
        for recordList in polled.values():
            records.extend([rec.value for rec in recordList])
        for rec in records:
            self._get_parsed_records__ret.append(self.parse_hl7(rec))
        return self._get_parsed_records__ret

    def parse_hl7(self, message):
        segments = message.decode(errors="ignore") \
                          .strip() \
                          .split(MESSAGE_SEGMENT_END_BYTE)
        location = None
        date_time = None
        observation_types = []
        observation_type = None
        for seg in segments:
            fields = seg.split('|')
            if fields[0] == "PV1":
                try:
                    location = fields[3]
                except IndexError:
                    pass
            if fields[0] == "OBR":
                try:
                    date_time = mktime(strptime(fields[7], "%Y%m%d%H%M%S"))
                except IndexError:
                    pass
            if fields[0] == "OBX":
                try:
                    observation_types.append(fields[13])
                except IndexError:
                    observation_types.append(None)
        observation_type_set = set(observation_types)
        if len(observation_type_set)==1 and \
            "APERIODIC" in observation_type_set:
            observation_type = "aperiodic"
        if len(observation_type_set)==1 and \
            None in observation_type_set:
            observation_type = "default"
        return {
            "assigned_patient_location": location,
            "observation_date_time": date_time,
            "observation_type": observation_type,
            "message": message
        }

    def down_sample(self, parsed_records, start_time, end_time):
        records = []
        tmp = {}
        sorted_records = sorted(parsed_records,
                                key=lambda rec: rec["observation_date_time"])
        for rec in sorted_records:
            date_time = rec["observation_date_time"]
            location = rec["assigned_patient_location"]
            observation_type = rec["observation_type"]
            message = rec["message"]
            if date_time<start_time or \
                date_time>=end_time:
                continue
            tmp[location] = tmp.get(location, {})
            tmp[location][observation_type] = rec
        for d in tmp.values():
            for rec in d.values():
                records.append(rec)
        return records

    def log(self, msg):
        self.producer.send(
            self.log_topic,
            self.LOG_FORMAT.format(datetime.now().timestamp(), msg).encode())

    def get_end_time(self, current_time):
        interval = self.INTERVAL
        return current_time - current_time % interval + interval

    async def send_or_ignore_message(self, addr, messages):
        writer = self.connections[addr]
        try:
            for msg in messages:
                writer.write(msg)
                await writer.drain()
        except ConnectionResetError:
            # This error is not raised reliably. A single writer.write(...) followed by
            # await writer.drain() on a broken connection may not trigger it; it shows up
            # more consistently after repeatedly writing to and draining that connection.
            self.remove_client(addr)

    def refuse_client(self, addr, writer):
        self.log("{} refused".format(addr))
        writer.close()

    def add_client(self, addr, writer):
        if addr not in self.connections:
            self.log("{} accepted".format(addr))
            self.connections[addr] = writer
        else:
            self.remove_client(addr)
            self.add_client(addr, writer)

    def remove_client(self, addr):
        if addr in self.connections:
            self.log("{} closed".format(addr))
            writer = self.connections.pop(addr)
            try:
                writer.close()
            except ConnectionResetError:
                pass

    def cleanup(self):
        self.log("shutdown")
        # Copy the keys: remove_client() mutates self.connections during iteration.
        for addr in list(self.connections):
            self.remove_client(addr)
        self.producer.flush()
        self.producer.close()
Example #9

import asyncio
from datetime import datetime

from kafka import KafkaConsumer, KafkaProducer, TopicPartition

class IBUSStreamingDownsamplingConsumer:
    LOG_FORMAT = "{} UTC_TS\t{}"

    def __init__(self, kafkaHost, kafkaPort, tcpHost, tcpPort, group_id, topic,
                 logTopic, interval):
        self.kafkaHost = kafkaHost
        self.kafkaPort = kafkaPort
        self.tcpHost = tcpHost
        self.tcpPort = tcpPort
        self.group_id = group_id
        self.topic = topic
        self.logTopic = logTopic
        self.interval = int(interval)
        self.consumer = KafkaConsumer(
            topic,
            bootstrap_servers=["{}:{}".format(kafkaHost, kafkaPort)],
            group_id=group_id,
            enable_auto_commit=False)
        self.producer = KafkaProducer(
            bootstrap_servers=["{}:{}".format(kafkaHost, kafkaPort)])
        self.tcpWriter = None

    def getTopicPartitions(self):
        # Calling topics() ensures the local cache is updated with information
        # about partitions, offsets etc.
        self.consumer.topics()
        pids = self.consumer.partitions_for_topic(self.topic)
        tps = [TopicPartition(self.topic, pid) for pid in pids]
        return tps

    def getTopicPartitionsCommittedPositions(self):
        tps = self.getTopicPartitions()
        ret = [(tp, self.consumer.committed(tp)) for tp in tps]
        return ret

    async def tcp_server_handler(self, reader, writer):
        addr = str(writer.get_extra_info("socket").getpeername())
        if self.tcpWriter is not None:
            self.log("refused " + addr)
            writer.write(b"Connection limit reached; connection refused.")
            writer.close()
            return
        self.log("accepted " + addr)
        self.tcpWriter = writer
        t1 = asyncio.create_task(self.poll_from_Kafka(writer))
        try:
            while True:
                # Read one byte at a time just to detect when the peer closes the connection.
                data = await reader.read(1)
                if not data:
                    break
        except BrokenPipeError:
            # Catches "connection reset by peer" raised while we are sending batched
            # data, which is also when we cannot check the reader; the broken
            # connection on the writer side ultimately surfaces as a BrokenPipeError
            # on the reader side.
            pass
        finally:
            t1.cancel()
            self.log("closed " + addr)
            writer.close()
            self.tcpWriter = None

    async def poll_from_Kafka(self, writer):
        while True:
            prevPos = self.getTopicPartitionsCommittedPositions()
            polled = self.consumer.poll(timeout_ms=1000)
            records = [
                record.value for recordList in polled.values()
                for record in recordList
            ]
            try:
                for record in records:
                    writer.write(record)
                    await writer.drain()
            except ConnectionResetError:
                # This error is not raised reliably. A single writer.write(record) followed
                # by await writer.drain() on a broken connection may not trigger it; it shows
                # up more consistently after repeatedly writing to and draining that connection.
                print("Last batch not fully sent, not committed.")
                for tp, pos in prevPos:
                    self.consumer.seek(tp, pos)
                break
            else:
                self.consumer.commit()
            await asyncio.sleep(self.interval)

    def log(self, msg):
        self.producer.send(
            self.logTopic,
            self.LOG_FORMAT.format(datetime.now().timestamp(), msg).encode())

    def cleanup(self):
        self.log("shutdown")
        self.consumer.close()
        self.producer.flush()
        self.producer.close()

    def run(self):
        self.log("running")
        asyncio.run(self._async_run())

    async def _async_run(self):
        tcpServer = await asyncio.start_server(self.tcp_server_handler,
                                               self.tcpHost, self.tcpPort)
        await tcpServer.serve_forever()
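
A minimal sketch of wiring the consumer above into a script; the host, port, topic and group names are illustrative assumptions:

if __name__ == "__main__":
    consumer = IBUSStreamingDownsamplingConsumer(
        kafkaHost="localhost", kafkaPort=9092,
        tcpHost="0.0.0.0", tcpPort=9000,
        group_id="downsampler", topic="ibus-data",
        logTopic="ibus-log", interval=5)
    try:
        consumer.run()
    finally:
        consumer.cleanup()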