def test_score_stream():
    class MyEstimator(StreamEstimator):
        def partial_fit(self, X, y):
            pass

        def predict(self, X):
            pass

        def score(self, X, y):
            return 1

    n_rows = 20
    X_example, y_example = pd.DataFrame({
        'name': [None] * n_rows,
        'amount': [None] * n_rows
    }), pd.Series([])
    X_stream, y_stream = Stream(), Stream()
    X, y = DataFrame(X_stream, example=X_example), Series(y_stream, example=y_example)

    model = MyEstimator()
    score_stream = model.stream_score(X, y)
    score_list = list()
    score_stream.stream.sink(score_list.append)

    score_predicate = lambda: score_list == [1] * n_rows
    await_for(score_predicate, .1)

def test_tcp_async():
    port = 9876
    s = Source.from_tcp(port)
    out = s.sink_to_list()
    s.start()
    yield await_for(lambda: s.server is not None, 2, period=0.02)

    try:
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock.connect(("localhost", port))
        sock.send(b'data\n')
        sock.close()

        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock.connect(("localhost", port))
        sock.send(b'data\n')

        sock2 = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock2.connect(("localhost", port))
        sock2.send(b'data2\n')

        yield await_for(lambda: out == [b'data\n', b'data\n', b'data2\n'], 2,
                        period=0.01)
    finally:
        s.stop()
        sock.close()
        sock2.close()

def test_from_kafka_thread():
    j = random.randint(0, 10000)
    ARGS = {'bootstrap.servers': 'localhost:9092',
            'group.id': 'streamz-test%i' % j}
    with kafka_service() as kafka:
        kafka, TOPIC = kafka
        stream = Stream.from_kafka([TOPIC], ARGS)
        out = stream.sink_to_list()
        stream.start()

        for i in range(10):
            kafka.produce(TOPIC, b'value-%d' % i)
        kafka.flush()
        # it takes some time for messages to come back out of kafka
        yield await_for(lambda: len(out) == 10, 10, period=0.1)
        assert out[-1] == b'value-9'

        kafka.produce(TOPIC, b'final message')
        kafka.flush()
        yield await_for(lambda: out[-1] == b'final message', 10, period=0.1)

        stream._close_consumer()
        kafka.produce(TOPIC, b'lost message')
        kafka.flush()
        # absolute sleep here, since we expect output list *not* to change
        yield gen.sleep(1)
        assert out[-1] == b'final message'
        # closing an already-closed consumer should not raise
        stream._close_consumer()

def test_kafka_dask_checkpointing_sync_nodes(c, s, w1, w2):
    '''
    Tests whether Dask's scatter and gather work in conformity with
    the reference-counting checkpointing implementation.
    '''
    j1 = random.randint(0, 10000)
    ARGS1 = {'bootstrap.servers': 'localhost:9092',
             'group.id': 'streamz-test%i' % j1,
             'enable.auto.commit': False,
             'auto.offset.reset': 'earliest'}
    j2 = j1 + 1
    ARGS2 = {'bootstrap.servers': 'localhost:9092',
             'group.id': 'streamz-test%i' % j2,
             'enable.auto.commit': False,
             'auto.offset.reset': 'earliest'}

    with kafka_service() as kafka:
        kafka, TOPIC = kafka

        for i in range(10):
            kafka.produce(TOPIC, b'value-%d' % i)
        kafka.flush()

        stream1 = Stream.from_kafka_batched(TOPIC, ARGS1, asynchronous=True,
                                            dask=True)
        out1 = stream1.map(split).gather().filter(
            lambda x: x[-1] % 2 == 1).sink_to_list()
        stream1.start()
        yield await_for(lambda: any(out1) and out1[-1][-1] == 9, 10, period=0.2)
        stream1.upstream.stopped = True

        # same consumer group as stream1: offsets were already committed,
        # so no messages should be replayed
        stream2 = Stream.from_kafka_batched(TOPIC, ARGS1, asynchronous=True,
                                            dask=True)
        out2 = stream2.map(split).gather().filter(
            lambda x: x[-1] % 2 == 1).sink_to_list()
        stream2.start()
        time.sleep(5)
        assert len(out2) == 0
        stream2.upstream.stopped = True

        # fresh consumer group: the topic is read again from the beginning
        stream3 = Stream.from_kafka_batched(TOPIC, ARGS2, asynchronous=True,
                                            dask=True)
        out3 = stream3.map(split).gather().filter(
            lambda x: x[-1] % 2 == 1).sink_to_list()
        stream3.start()
        yield await_for(lambda: any(out3) and out3[-1][-1] == 9, 10, period=0.2)
        stream3.upstream.stopped = True

def test_from_file():
    with tmpfile() as fn:
        with open(fn, 'wt') as f:
            f.write('{"x": 1, "y": 2}\n')
            f.write('{"x": 2, "y": 2}\n')
            f.write('{"x": 3, "y": 2}\n')
            f.flush()

            source = Stream.from_textfile(fn, poll_interval=0.010,
                                          asynchronous=True, start=False)
            L = source.map(json.loads).pluck('x').sink_to_list()

            assert L == []

            source.start()
            yield await_for(lambda: len(L) == 3, timeout=5)
            assert L == [1, 2, 3]

            f.write('{"x": 4, "y": 2}\n')
            f.write('{"x": 5, "y": 2}\n')
            f.flush()

            start = time()
            while L != [1, 2, 3, 4, 5]:
                yield gen.sleep(0.01)
                assert time() < start + 2  # reads within 2s

            source = Stream.from_textfile(fn, poll_interval=0.010,
                                          asynchronous=True, start=False,
                                          from_end=True)
            L = source.map(json.loads).pluck('x').sink_to_list()
            source.start()
            yield gen.sleep(0.10)
            assert L == []

            f.write('{"x": 6, "y": 2}\n')
            f.write('{"x": 7, "y": 2}\n')
            f.flush()

            yield await_for(lambda: len(L) == 2, timeout=5)
            assert L == [6, 7]

def test_kafka_dask_batch(c, s, w1, w2):
    j = random.randint(0, 10000)
    ARGS = {'bootstrap.servers': 'localhost:9092',
            'group.id': 'streamz-test%i' % j}
    with kafka_service() as kafka:
        kafka, TOPIC = kafka
        stream = Stream.from_kafka_batched(TOPIC, ARGS, keys=True,
                                           asynchronous=True, dask=True)
        out = stream.gather().sink_to_list()
        stream.start()
        yield gen.sleep(5)  # this frees the loop while dask workers report in
        assert isinstance(stream, DaskStream)
        for i in range(10):
            kafka.produce(TOPIC, b'value-%d' % i)
        kafka.flush()
        yield await_for(lambda: any(out), 10, period=0.2)
        assert {'key': None, 'value': b'value-1'} in out[0]
        stream.stop()
        yield gen.sleep(0)
        stream.upstream.upstream.consumer.close()

def test_process():
    cmd = ["python", "-c", "for i in range(4): print(i)"]
    s = Source.from_process(cmd)
    out = s.sink_to_list()
    s.start()
    yield await_for(lambda: out == [b'0\n', b'1\n', b'2\n', b'3\n'], timeout=5)
    s.stop()

def test_from_file_end():
    with tmpfile() as fn:
        with open(fn, 'wt') as f:
            f.write('data1\n')
            f.flush()
            source = Stream.from_textfile(fn, poll_interval=0.010,
                                          start=False, from_end=True)
            out = source.sink_to_list()
            source.start()
            assert out == []
            yield await_for(lambda: source.started, 2, period=0.02)

            f.write('data2\n')
            f.flush()
            yield await_for(lambda: out == ['data2\n'], timeout=5, period=0.1)

def test_stream_predict():
    n_rows = 100
    X_example = pd.DataFrame({
        'name': [None] * n_rows,
        'amount': [None] * n_rows
    })
    X_stream = Stream()
    X = DataFrame(X_stream, example=X_example)

    model = MyStreamingEstimator()
    example_data = pd.Series(np.ones(X_example.shape[0]))
    pred_series = model.stream_predict(X, y_example=pd.Series(example_data))
    pred_df = model.stream_predict(X, y_example=pd.DataFrame(data=example_data))

    pred_series_list, pred_df_list = [], []
    pred_series.stream.sink(pred_series_list.append)
    pred_df.stream.sink(pred_df_list.append)

    n_fits = 10
    for i in range(n_fits):
        X_stream.emit(X_example)

    ctr_predicate = lambda: (model.predict_ctr == n_fits)
    target_predictions = np.ones((X_example.shape[0], n_fits))
    pred_series_predicate = \
        lambda: np.array_equal(np.concatenate(pred_series_list).reshape(-1),
                               target_predictions.reshape(-1))
    pred_df_predicate = \
        lambda: np.array_equal(np.concatenate(pred_df_list).reshape(-1),
                               target_predictions.reshape(-1))

    await_for(ctr_predicate, .1)
    await_for(pred_series_predicate, .1)
    await_for(pred_df_predicate, .1)

def test_process_str():
    cmd = 'python -c "for i in range(4): print(i)"'
    s = Source.from_process(cmd)
    if sys.platform != "win32":
        # don't know why - something with pytest and new processes
        policy = asyncio.get_event_loop_policy()
        watcher = asyncio.SafeChildWatcher()
        policy.set_child_watcher(watcher)
        watcher.attach_loop(s.loop.asyncio_loop)
    out = s.sink_to_list()
    s.start()
    yield await_for(lambda: out == [b'0\n', b'1\n', b'2\n', b'3\n'], timeout=5)
    s.stop()

def test_process():
    cmd = ["python", "-c", "for i in range(4): print(i, end='')"]
    s = Source.from_process(cmd, with_end=True)
    if sys.platform != "win32":
        # don't know why - something with pytest and new processes
        policy = asyncio.get_event_loop_policy()
        watcher = asyncio.SafeChildWatcher()
        policy.set_child_watcher(watcher)
        watcher.attach_loop(s.loop.asyncio_loop)
    out = s.sink_to_list()
    s.start()
    yield await_for(lambda: out == [b'0123'], timeout=5)
    s.stop()