def start_stream():
    # Define the streaming pipeline.
    # Note: the cudf engine currently supports only a flattened JSON message
    # format.
    source = Stream.from_kafka_batched(
        args.input_topic,
        consumer_conf,
        poll_interval=args.poll_interval,
        # The npartitions value varies based on the Kafka topic's partition
        # configuration.
        npartitions=1,
        asynchronous=True,
        dask=True,
        engine="cudf",
        max_batch_size=args.max_batch_size,
    )
    global output
    # If the benchmark arg is True, use streamz to compute the benchmark.
    if args.benchmark:
        print("Benchmark will be calculated")
        output = (
            source.map(inference)
            .map(lambda x: (x[0], x[1], int(round(time.time())), x[2]))
            .map(sink_to_kafka)
            .gather()
            .sink_to_list()
        )
    else:
        output = source.map(inference).map(sink_to_kafka).gather()
    source.start()
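
# Every pipeline in this file maps its results through a sink_to_kafka callable
# that is defined elsewhere. A minimal sketch of such a sink, assuming
# confluent_kafka and the script's args.broker / args.output_topic (the producer
# config and stringification below are illustrative assumptions, not the
# original code):
import confluent_kafka as ck

def sink_to_kafka(result):
    # Build the producer inside the call so the function stays picklable for
    # Dask workers; publish the processed result and pass it through unchanged
    # for any downstream streamz nodes.
    producer = ck.Producer({"bootstrap.servers": args.broker})
    producer.produce(args.output_topic, str(result))
    producer.flush()
    return result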
def _start_stream(self):
    # Define the streaming pipeline. Both engines share the same source
    # configuration; only the cudf engine flag differs.
    source_kwargs = dict(
        poll_interval=self.args.poll_interval,
        # The npartitions value varies based on the Kafka topic's partition
        # configuration.
        npartitions=self.kafka_conf["n_partitions"],
        asynchronous=True,
        dask=True,
        max_batch_size=self.args.max_batch_size,
    )
    if self.config["cudf_engine"]:
        source_kwargs["engine"] = "cudf"
    source = Stream.from_kafka_batched(
        self.kafka_conf["input_topic"],
        self.kafka_conf["consumer_conf"],
        **source_kwargs,
    )

    sink = self.config["sink"]
    global output
    # If the benchmark arg is True, use streamz to compute the benchmark.
    if self.args.benchmark:
        print("Benchmark will be calculated")
        output = (
            source.map(self.inference)
            .map(lambda x: (x[0], x[1], int(round(time.time())), x[2]))
            .map(self.sink_dict[sink])
            .gather()
            .sink_to_list()
        )
    else:
        output = source.map(self.inference).map(self.sink_dict[sink]).gather()
    source.start()
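
# _start_stream() resolves its sink with self.sink_dict[sink], so the class
# presumably maps each supported sink name to a handler method. A minimal
# sketch of that registration inside the class's __init__ (the sink names and
# handler methods below are assumptions, not the original code):
def __init__(self, config, kafka_conf, args):
    self.config = config
    self.kafka_conf = kafka_conf
    self.args = args
    # One callable per supported sink; _start_stream picks one by name.
    self.sink_dict = {
        "kafka": self.sink_to_kafka,
        "filesystem": self.sink_to_fs,
    }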
def start_stream():
    source = Stream.from_kafka_batched(
        args.input_topic,
        consumer_conf,
        poll_interval=args.poll_interval,
        # The npartitions value varies based on the Kafka topic's partition
        # configuration.
        npartitions=1,
        asynchronous=True,
        dask=True,
        max_batch_size=args.max_batch_size,
    )
    global output
    # If the benchmark arg is True, use streamz to compute the benchmark.
    if args.benchmark:
        print("Benchmark will be calculated")
        output = (
            source.map(inference)
            .map(lambda x: (x[0], x[1], int(round(time.time())), x[2]))
            .map(sink_to_kafka)
            .gather()
            .sink_to_list()
        )
    else:
        output = source.map(inference).map(sink_to_kafka).gather()
    source.start()
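
# inference() is defined elsewhere; judging by how the benchmark lambda above
# indexes its result, it returns a 3-tuple whose last element is a batch-start
# timestamp, later paired with the spliced-in processing-end timestamp. A
# minimal sketch under those assumptions (the per-worker model lookup and the
# predict() call are hypothetical):
import time
from dask.distributed import get_worker

def inference(messages):
    # Stamp the batch, score it with the model loaded by worker_init, and
    # return inputs, predictions, and the start timestamp.
    batch_start_time = int(round(time.time()))
    model = get_worker().data["model"]
    predictions = model.predict(messages)
    return (messages, predictions, batch_start_time)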
client.run(worker_init)

# Define the streaming pipeline.
consumer_conf = {
    "bootstrap.servers": args.broker,
    "group.id": args.group_id,
    "session.timeout.ms": 60000,
    "enable.partition.eof": "true",
    "auto.offset.reset": "earliest",
}
print("Consumer conf:", consumer_conf)

source = Stream.from_kafka_batched(
    args.input_topic,
    consumer_conf,
    poll_interval=args.poll_interval,
    npartitions=1,
    asynchronous=True,
    dask=True,
    max_batch_size=args.max_batch_size,
)

# If the benchmark arg is True, use streamz to compute the benchmark.
if args.benchmark:
    print("Benchmark will be calculated")
    output = (
        source.map(inference)
        .map(lambda x: (x[0], x[1], x[2], int(round(time.time())), x[3]))
        .map(sink_to_kafka)
        .gather()
        .sink_to_list()
    )
else:
    output = source.map(inference).map(sink_to_kafka).gather()

source.start()
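
# client.run(worker_init) executes worker_init once on every Dask worker,
# typically to load the model up front so inference() can find it. A minimal
# sketch, assuming a hypothetical load_model() helper and storage on the
# worker's data mapping:
from dask.distributed import get_worker

def worker_init():
    worker = get_worker()
    # load_model() is a placeholder for whatever framework-specific loading
    # the real pipeline performs (e.g. reading a checkpoint onto the GPU).
    worker.data["model"] = load_model()
    print("Model loaded on worker", worker.address)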
        # Flush the producer periodically while publishing the test messages.
        if i % 1000 == 0:
            producer.flush()

consumer = ck.Consumer(cconf)
tp = ck.TopicPartition(topic, 0, 0)

# Time a direct confluent_kafka drain of the whole topic.
t0 = time.time()
msg = consumer.poll(0)
while msg and msg.value():
    keep = msg
    msg = consumer.poll(0)
print('direct', time.time() - t0)

# Time the same topic through streamz's batched Kafka source; each batch
# reports whether it contained the final message, plus the current time.
print('batched', time.time())
stream = Stream.from_kafka_batched(topic, cconf, npartitions=n_parts,
                                   poll_interval=0.1)
stream.map(lambda batch: (any(int(msg) >= n_msg - 1 for msg in batch),
                          time.time())).sink(print)
stream.start()

# import dask.distributed
# client = dask.distributed.Client(processes=False)
#
# print('dask start', time.time())
# stream = Stream.from_kafka_batched(topic, cconf, npartitions=n_parts, dask=True)
# stream.map(lambda batch: (any(
#     int(msg) >= n_msg - 1 for msg in batch), time.time())).gather().sink(print)
# stream.start()
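
# The benchmark above uses cconf, topic, n_parts, and n_msg that are defined in
# the elided portion of the script. Plausible values for running it locally
# (every value here is an illustrative assumption, not the original code):
import confluent_kafka as ck

topic = "bench-topic"   # hypothetical topic name
n_parts = 4             # partitions on the topic; matches npartitions above
n_msg = 100_000         # integer-valued messages produced by the elided loop
cconf = {
    "bootstrap.servers": "localhost:9092",
    "group.id": "streamz-bench",
    "auto.offset.reset": "earliest",
}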
default="localhost:9092", help="Kafka broker") parser.add_argument("--input_topic", default="input", help="Input kafka topic") parser.add_argument("--output_topic", default="output", help="Output kafka topic") parser.add_argument("--group_id", default="streamz", help="Kafka group ID") args = parser.parse_args() cluster = LocalCUDACluster() client = Client(cluster) print(client) client.run(worker_init) # Define the streaming pipeline. consumer_conf = { 'bootstrap.servers': args.broker, 'group.id': args.group_id, 'session.timeout.ms': 60000 } source = Stream.from_kafka_batched(args.input_topic, consumer_conf, poll_interval='1s', npartitions=1, asynchronous=True, dask=False) inference = source.map(predict_batch) wel_parsing = inference.map(wel_parsing) alerts = wel_parsing.map(threshold_alert).map(sink_to_kafka) # Start the stream. source.start()