def _tst_10_read_multi_csv_file_compress_with_crash(
    self, file_list: List[str], tag: str
) -> None:
    s = self.scheduler()
    module = CSVLoader(
        file_list, index_col=False, recovery_tag=tag, header=None, scheduler=s
    )
    self.assertTrue(module.result is None)
    sink = Sink(name="sink", scheduler=s)
    sink.input.inp = module.output.result
    sts = sleep_then_stop(s, 4)
    aio.run_gather(s.start(), sts)
    _close(module)
    s = self.scheduler(clean=True)
    module = CSVLoader(
        file_list,
        recovery=True,
        recovery_tag=tag,
        index_col=False,
        header=None,
        scheduler=s,
    )
    self.assertTrue(module.result is None)
    sink = Sink(name="sink", scheduler=s)
    sink.input.inp = module.output.result
    aio.run(s.start())
    self.assertEqual(len(module.table), 2000000)
def test_06_read_http_multi_csv_bz2_with_crash(self) -> None:
    self._http_srv = _HttpSrv()
    tag = self.get_tag()
    s = self.scheduler()
    url_list = [make_url("bigfile", ext=BZ2)] * 2
    module = CSVLoader(
        url_list, index_col=False, recovery_tag=tag, header=None, scheduler=s
    )
    self.assertTrue(module.result is None)
    sink = Sink(name="sink", scheduler=s)
    sink.input.inp = module.output.result
    sts = sleep_then_stop(s, 3)
    aio.run_gather(s.start(), sts)
    self._http_srv.restart()
    s = self.scheduler(clean=True)
    module = CSVLoader(
        url_list,
        recovery=True,
        recovery_tag=tag,
        index_col=False,
        header=None,
        scheduler=s,
    )
    self.assertTrue(module.result is None)
    sink = Sink(name="sink", scheduler=s)
    sink.input.inp = module.output.result
    aio.run(s.start())
    self.assertEqual(len(module.table), 2000000)
def test_01_read_http_csv_with_crash_and_counter(self) -> None:
    self._http_srv = _HttpSrv()
    tag = self.get_tag()
    s = self.scheduler()
    url = make_url("bigfile")
    module = CSVLoader(
        url, index_col=False, recovery_tag=tag, header=None, scheduler=s
    )
    self.assertTrue(module.result is None)
    sink = Sink(name="sink", scheduler=s)
    sink.input.inp = module.output.result
    sts = sleep_then_stop(s, 2)
    aio.run_gather(s.start(), sts)
    self._http_srv.restart()
    s = self.scheduler(clean=True)
    csv = CSVLoader(
        url,
        recovery=True,
        index_col=False,
        recovery_tag=tag,
        header=None,
        scheduler=s,
    )
    counter = Counter(scheduler=s)
    counter.input[0] = csv.output.result
    self.assertTrue(csv.result is None)
    sink = Sink(name="sink", scheduler=s)
    sink.input.inp = counter.output.result
    aio.run(s.start())
    self.assertEqual(len(csv.table), 1000000)
    self.assertEqual(counter.table["counter"].loc[0], 1000000)
def test_09_read_multi_csv_file_with_crash(self) -> None:
    s = self.scheduler()
    tag = "t9"
    file_list = [get_dataset("bigfile"), get_dataset("bigfile")]
    module = CSVLoader(
        file_list, index_col=False, recovery_tag=tag, header=None, scheduler=s
    )
    self.assertTrue(module.result is None)
    sink = Sink(name="sink", scheduler=s)
    sink.input.inp = module.output.result
    sts = sleep_then_stop(s, 3)
    aio.run_gather(s.start(), sts)
    _close(module)
    s = self.scheduler(clean=True)
    module = CSVLoader(
        file_list,
        recovery=True,
        recovery_tag=tag,
        index_col=False,
        header=None,
        scheduler=s,
    )
    self.assertTrue(module.result is None)
    sink = Sink(name="sink", scheduler=s)
    sink.input.inp = module.output.result
    aio.run(s.start())
    self.assertEqual(len(module.table), 2000000)
def test_pattern(self):
    s = self.scheduler()
    n_samples = 1_000
    centers = [(0.1, 0.3, 0.5), (0.7, 0.5, 3.3), (-0.4, -0.3, -11.1)]
    cols = ["A", "B", "C"]
    with s:
        data = BlobsTable(
            columns=cols,
            centers=centers,
            cluster_std=0.2,
            rows=n_samples,
            scheduler=s,
        )
        # ds = DataShape(scheduler=s)
        # ds.input.table = data.output.result
        factory = StatsFactory(input_module=data, scheduler=s)
        factory.create_dependent_modules(var_name="my_dyn_var")
        factory.input.table = data.output.result
        sink = Sink(scheduler=s)
        # sink.input.inp = ds.output.result
        sink.input.inp = factory.output.result

    async def fake_input_1(scheduler: Scheduler, rn: int) -> None:
        module = scheduler["my_dyn_var"]
        print("from input my_dyn_var")
        await module.from_input({"matrix": matrix_hist})

    s.on_loop(my_stop, 4)
    s.on_loop(fake_input_1, 3)
    aio.run(s.start())
def test_06_read_multiple_csv_bz2_crash_recovery(self) -> None:
    p = Process(target=run_throttled_server, args=(8000, 10**6))
    p.start()
    self._http_proc = p
    time.sleep(SLEEP)
    s = self.scheduler()
    filenames = Table(
        name="file_names",
        dshape="{filename: string}",
        data={
            "filename": [
                make_url("smallfile", ext=BZ2),
                make_url("smallfile", ext=BZ2),
            ]
        },
    )
    cst = Constant(table=filenames, scheduler=s)
    csv = CSVLoader(index_col=False, header=None, scheduler=s, timeout=0.01)
    csv.input.filenames = cst.output.result
    sink = Sink(name="sink", scheduler=s)
    sink.input.inp = csv.output.result
    aio.run(csv.start())
    _close(csv)
    self.assertEqual(len(csv.table), 60000)
def _tst_08_read_multi_csv_file_compress_no_crash(
    self, files: List[str]
) -> None:
    s = self.scheduler()
    module = CSVLoader(
        files, index_col=False, header=None, scheduler=s
    )  # , save_context=False)
    self.assertTrue(module.result is None)
    sink = Sink(name="sink", scheduler=s)
    sink.input.inp = module.output.result
    aio.run(s.start())
    self.assertEqual(len(module.table), 60000)
def test_read_csv(self) -> None:
    s = self.scheduler()
    module = CSVLoader(
        get_dataset("bigfile"), index_col=False, header=None, scheduler=s
    )
    self.assertTrue(module.result is None)
    sink = Sink(name="sink", scheduler=s)
    sink.input.inp = module.output.result
    aio.run(s.start())
    self.assertEqual(len(module.table), 1000000)
def test_dataflow_6_dynamic(self) -> None:
    s = self.scheduler()
    table = RandomTable(name="table", columns=["a"], throttle=1000, scheduler=s)
    sink = Sink(name="sink", scheduler=s)
    sink.input.inp = table.output.result
    prt = Print(name="prt", proc=self.terse, scheduler=s)
    prt.input.df = table.output.result
    prt2 = Print(name="prt2", proc=self.terse, scheduler=s)
    prt2.input.df = table.output.result
    s.commit()

    async def modify_1(scheduler: Scheduler, run_number: int) -> None:
        with s as dataflow:
            print("Checking module deletion")
            self.assertTrue(isinstance(dataflow, Dataflow))
            deps = dataflow.collateral_damage("prt2")
            self.assertEqual(deps, set(["prt2"]))
            deps = dataflow.collateral_damage("prt")
            self.assertEqual(deps, set(["prt"]))
            deps = dataflow.collateral_damage("prt", "prt2")
            self.assertEqual(deps, set(["prt", "prt2"]))
            dataflow.delete_modules("prt2")
        s.on_loop(modify_2, 5)

    async def modify_2(scheduler: Scheduler, run_number: Any) -> None:
        self.assertFalse("prt2" in scheduler)
        with s as dataflow:
            print("Checking more module deletion")
            deps = dataflow.collateral_damage("prt")
            self.assertEqual(deps, {"prt"})
            deps = dataflow.collateral_damage("prt", "sink")
            self.assertEqual(deps, {"prt", "sink", "table"})
            dataflow.delete_modules("prt")
        s.on_loop(modify_3, 5)

    async def modify_3(scheduler: Scheduler, run_number: int) -> None:
        self.assertFalse("prt" in scheduler)
        with s as dataflow:
            print("Checking even more module deletion")
            deps = dataflow.collateral_damage("sink")
            self.assertEqual(deps, {"sink", "table"})
            dataflow.delete_modules("sink", "table")

    async def stop_error(scheduler: Scheduler, run_number: int) -> None:
        self.assertFalse("Scheduler should have stopped")
        await scheduler.stop()

    s.on_loop(modify_1, 5)
    s.on_loop(stop_error, 100)
    aio.run(s.start())
def test_dataflow_9_errors(self) -> None:
    s = self.scheduler()
    table = RandomTable(
        name="table", columns=["a", "b", "c"], throttle=1000, scheduler=s
    )
    sink = Sink(name="sink", scheduler=s)
    sink.input.inp = table.output.result
    s.commit()

    # Start loading a dataset, then visualize it, then change the visualizations.
    async def modify_1(scheduler: Scheduler, run_number: int) -> None:
        print("Adding scatterplot_1")
        with scheduler as dataflow:
            dataflow1 = dataflow
            sp = MCScatterPlot(
                name="scatterplot_1",
                classes=[("Scatterplot", "a", "b")],
                approximate=True,
                scheduler=scheduler,
            )
            sp.create_dependent_modules(table, "result")
            print(f"Created scatterplot_1, groups: {dataflow.groups()}")
        with self.assertRaises(ProgressiveError):
            with scheduler as dataflow:
                self.assertIs(dataflow, dataflow1)
                prt = Print(name="print", proc=self.terse, scheduler=scheduler)
                # prt.input.df = table.output.result
                _ = prt
        scheduler.on_loop(modify_2, 3)  # Schedule the next activity

    async def modify_2(scheduler: Scheduler, run_number: int) -> None:
        print("Removing table")
        self.assertFalse("scatterplot_1" in scheduler)
        with scheduler as dataflow:
            print("Checking sink+table modules deletion")
            deps = dataflow.collateral_damage("sink", "print")
            print(f"collateral_damage('sink') = '{sorted(deps)}'")
            dataflow.delete_modules(*deps)

    async def stop_error(scheduler: Scheduler, run_number: int) -> None:
        self.assertFalse("Scheduler should have stopped")
        await scheduler.stop()

    s.on_loop(modify_1, 3)
    s.on_loop(stop_error, 10)
    aio.run(s.start())
    self.assertFalse("scatterplot_1" in s)
    self.assertFalse("print" in s)
def test_read_fake_csv(self) -> None:
    s = self.scheduler()
    module = CSVLoader(
        RandomBytesIO(cols=30, rows=1000000),
        index_col=False,
        header=None,
        scheduler=s,
    )
    self.assertTrue(module.result is None)
    sink = Sink(name="sink", scheduler=s)
    sink.input.inp = module.output.result
    aio.run(s.start())
    self.assertEqual(len(module.table), 1000000)
def test_04_read_http_multi_csv_bz2_no_crash(self) -> None:
    self._http_srv = _HttpSrv()
    s = self.scheduler()
    module = CSVLoader(
        [make_url("smallfile", ext=BZ2)] * 2,
        index_col=False,
        header=None,
        scheduler=s,
    )
    self.assertTrue(module.result is None)
    sink = Sink(name="sink", scheduler=s)
    sink.input.inp = module.output.result
    aio.run(s.start())
    self.assertEqual(len(module.table), 60000)
def test_07_read_multi_csv_file_no_crash(self) -> None:
    s = self.scheduler()
    module = CSVLoader(
        [get_dataset("smallfile"), get_dataset("smallfile")],
        index_col=False,
        header=None,
        scheduler=s,
    )
    self.assertTrue(module.result is None)
    sink = Sink(name="sink", scheduler=s)
    sink.input.inp = module.output.result
    aio.run(s.start())
    self.assertEqual(len(module.table), 60000)
def test_01_read_http_csv_with_crash(self) -> None:
    self._http_srv = _HttpSrv()
    tag = self.get_tag()
    s = self.scheduler()
    url = make_url("bigfile")
    module = CSVLoader(
        url, index_col=False, recovery_tag=tag, header=None, scheduler=s
    )
    self.assertTrue(module.result is None)
    sink = Sink(name="sink", scheduler=s)
    sink.input.inp = module.output.result
    sts = sleep_then_stop(s, 2)
    aio.run_gather(s.start(), sts)
    self._http_srv.restart()
    s = self.scheduler(clean=True)
    module = CSVLoader(
        url,
        recovery=True,
        recovery_tag=tag,
        index_col=False,
        header=None,
        scheduler=s,
    )
    self.assertTrue(module.result is None)
    sink = Sink(name="sink", scheduler=s)
    sink.input.inp = module.output.result
    aio.run(s.start())
    self.assertEqual(len(module.table), 1000000)
    # The recovered column must match the reference dataframe.
    col = module.table.loc[:, 0]
    assert col is not None
    arr1 = col.to_array().reshape(-1)
    arr2 = BIGFILE_DF.loc[:, 0].values
    self.assertTrue(np.allclose(arr1, arr2))
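# Note: the crash tests above all follow one scenario: run a loader for a few
# seconds, stop the scheduler to simulate a crash, then rebuild the loader with
# recovery=True on a clean scheduler and check the final row count. The sketch
# below factors that scenario into a helper for illustration only; the helper
# name, its parameters, and the fixed sink wiring are assumptions rather than
# part of the existing test API, and it relies solely on calls already used in
# this file (CSVLoader, Sink, sleep_then_stop, aio.run_gather, aio.run).
def _crash_then_recover_sketch(
    self, source: Any, tag: str, crash_after: float, expected_rows: int
) -> None:
    # First run: start loading, then stop mid-stream to simulate a crash.
    s = self.scheduler()
    module = CSVLoader(
        source, index_col=False, recovery_tag=tag, header=None, scheduler=s
    )
    sink = Sink(name="sink", scheduler=s)
    sink.input.inp = module.output.result
    sts = sleep_then_stop(s, crash_after)
    aio.run_gather(s.start(), sts)
    # Second run: reload with recovery=True on a clean scheduler and verify
    # that the table ends up with the expected number of rows.
    s = self.scheduler(clean=True)
    module = CSVLoader(
        source,
        recovery=True,
        recovery_tag=tag,
        index_col=False,
        header=None,
        scheduler=s,
    )
    sink = Sink(name="sink", scheduler=s)
    sink.input.inp = module.output.result
    aio.run(s.start())
    self.assertEqual(len(module.table), expected_rows)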
def test_01_read_http_csv_no_crash(self) -> None:
    p = Process(target=run_simple_server, args=())
    p.start()
    self._http_proc = p
    time.sleep(SLEEP)
    s = self.scheduler()
    module = CSVLoader(
        make_url("bigfile"), index_col=False, header=None, scheduler=s
    )
    self.assertTrue(module.result is None)
    sink = Sink(name="sink", scheduler=s)
    sink.input.inp = module.output.result
    aio.run(s.start())
    _close(module)
    self.assertEqual(len(module.table), 1000000)
def test_as_array(self) -> None:
    s = self.scheduler()
    module = CSVLoader(
        get_dataset("bigfile"),
        index_col=False,
        as_array="array",
        header=None,
        scheduler=s,
    )
    sink = Sink(name="sink", scheduler=s)
    sink.input.inp = module.output.result
    self.assertTrue(module.result is None)
    aio.run(s.start())
    table = module.table
    self.assertEqual(len(table), 1000000)
    self.assertEqual(table.columns, ["array"])
    self.assertEqual(table["array"].shape, (1000000, 30))
def test_read_multiple_csv(self) -> None:
    s = self.scheduler()
    filenames = Table(
        name="file_names",
        dshape="{filename: string}",
        data={"filename": [get_dataset("smallfile"), get_dataset("smallfile")]},
    )
    cst = Constant(table=filenames, scheduler=s)
    csv = CSVLoader(index_col=False, header=None, scheduler=s)
    csv.input.filenames = cst.output.result
    sink = Sink(name="sink", scheduler=s)
    sink.input.inp = csv.output.result
    aio.run(csv.start())
    self.assertEqual(len(csv.table), 60000)
def test_02_read_http_csv_crash_recovery(self) -> None:
    p = Process(target=run_throttled_server, args=(8000, 10**7))
    p.start()
    self._http_proc = p
    time.sleep(SLEEP)
    s = self.scheduler()
    module = CSVLoader(
        make_url("bigfile"), index_col=False, header=None, scheduler=s, timeout=0.01
    )
    self.assertTrue(module.result is None)
    sink = Sink(name="sink", scheduler=s)
    sink.input.inp = module.output.result
    aio.run(s.start())
    _close(module)
    # self.assertGreater(module.parser._recovery_cnt, 0)
    self.assertEqual(len(module.table), 1000000)
def test_read_multiple_fake_csv(self) -> None:
    s = self.scheduler()
    filenames = Table(
        name="file_names2",
        dshape="{filename: string}",
        data={
            "filename": [
                "buffer://fake1?cols=10&rows=30000",
                "buffer://fake2?cols=10&rows=30000",
            ]
        },
    )
    cst = Constant(table=filenames, scheduler=s)
    csv = CSVLoader(index_col=False, header=None, scheduler=s)
    csv.input.filenames = cst.output.result
    sink = Sink(name="sink", scheduler=s)
    sink.input.inp = csv.output.result
    aio.run(csv.start())
    self.assertEqual(len(csv.table), 60000)
def test_sf(self):
    np.random.seed(42)
    s = self.scheduler()
    random = RandomTable(3, rows=10_000, scheduler=s)
    sf = StatsFactory(input_module=random, scheduler=s)
    sf.create_dependent_modules(var_name="my_dyn_var")
    sf.input.table = random.output.result
    sink = Sink(scheduler=s)
    # sink.input.inp = random.output.result
    sink.input.inp = sf.output.result

    async def fake_input_1(scheduler: Scheduler, rn: int) -> None:
        module = scheduler["my_dyn_var"]
        print("from input my_dyn_var", "test_sf")
        await module.from_input({"matrix": matrix_max})

    s.on_loop(my_stop, 4)
    s.on_loop(fake_input_1, 3)
    aio.run(s.start())
    print(s.modules())
def test_as_array2(self) -> None:
    s = self.scheduler()
    module = CSVLoader(
        get_dataset("bigfile"),
        index_col=False,
        as_array={
            "firsthalf": ["_" + str(r) for r in range(13)],
            "secondhalf": ["_" + str(r) for r in range(13, 30)],
        },
        header=None,
        scheduler=s,
    )
    sink = Sink(name="sink", scheduler=s)
    sink.input.inp = module.output.result
    self.assertTrue(module.result is None)
    aio.run(s.start())
    table = module.table
    self.assertEqual(len(table), 1000000)
    self.assertEqual(table.columns, ["firsthalf", "secondhalf"])
    self.assertEqual(table["firsthalf"].shape, (1000000, 13))
    self.assertEqual(table["secondhalf"].shape, (1000000, 17))
def test_scheduler(self) -> None:
    with self.assertRaises(ProgressiveError):
        s = Scheduler(0)
    s = Scheduler()
    csv = CSVLoader(
        get_dataset("bigfile"),
        name="csv",
        index_col=False,
        header=None,
        scheduler=s,
    )
    self.assertIs(s["csv"], csv)
    sink = Sink(name="sink", scheduler=s)
    sink.input.inp = csv.output.result

    # allow csv to start
    check_running = False

    async def _is_running() -> None:
        nonlocal check_running
        check_running = csv.scheduler().is_running()

    aio.run_gather(s.start(), _is_running())
    self.assertTrue(check_running)

    def add_min(s: Scheduler, r: int) -> None:
        with s:
            m = Min(scheduler=s)
            m.input.table = csv.output.result
            prt = Print(proc=self.terse, scheduler=s)
            prt.input.df = m.output.result

    s.on_loop(add_min, 10)
    s.on_loop(self._stop, 20)
    self.assertIs(s["csv"], csv)
    json = s.to_json(short=False)
    self.assertFalse(json["is_running"])
    self.assertTrue(json["is_terminated"])
    html = s._repr_html_()
    self.assertTrue(len(html) != 0)
def test_as_array3(self) -> None:
    s = self.scheduler()
    try:
        module = CSVLoader(
            get_dataset("mnist_784"),
            index_col=False,
            as_array=lambda cols: {"array": [c for c in cols if c != "class"]},
            scheduler=s,
        )
        sink = Sink(name="sink", scheduler=s)
        sink.input.inp = module.output.result
        self.assertTrue(module.result is None)
        aio.run(s.start())
        table = module.table
        self.assertEqual(len(table), 70000)
        self.assertEqual(table.columns, ["array", "class"])
        self.assertEqual(table["array"].shape, (70000, 784))
        self.assertEqual(table["class"].shape, (70000,))
    except TimeoutError:
        print("Cannot download mnist")
def load_csv(self) -> None:
    module = CSVLoader(
        filepath_or_buffer=get_dataset("smallfile"),
        force_valid_ids=True,
        index_col=False,
        header=None,
        scheduler=self.scheduler_,
    )
    self.assertTrue(module.result is None)
    sink = Sink(name="sink", scheduler=self.scheduler_)
    sink.input.inp = module.output.result
    aio.run(self.scheduler_.start(persist=True))
    t = module.table
    self.assertFalse(t is None)
    self.assertEqual(len(t), 30000)
    df = pd.read_csv(
        filepath_or_buffer=get_dataset("smallfile"), index_col=False, header=None
    )
    for col in range(t.ncol):
        coldf = df[col]
        colt = t[col]
        self.assertTrue(np.all(coldf == colt.values))
def test_dataflow_0(self) -> None:
    scheduler = self.scheduler()
    saved_inputs = None
    saved_outputs = None
    with scheduler as dataflow:
        csv = CSVLoader(
            get_dataset("smallfile"),
            name="csv",
            index_col=False,
            header=None,
            scheduler=scheduler,
        )
        self.assertIs(scheduler["csv"], csv)
        self.assertEqual(
            dataflow.validate_module(csv),
            ['Output slot "result" missing in module "csv"'],
        )

        m = Min(name="min", scheduler=scheduler)
        self.assertIs(dataflow[m.name], m)
        self.assertEqual(
            dataflow.validate_module(m),
            [
                'Input slot "table" missing in module "min"',
                'Output slot "result" missing in module "min"',
            ],
        )

        prt = Print(proc=self.terse, name="print", scheduler=scheduler)
        self.assertIs(dataflow[prt.name], prt)
        self.assertEqual(
            dataflow.validate_module(prt),
            ['Input slot "df" missing in module "print"'],
        )

        m.input.table = csv.output.result
        prt.input.df = m.output.result
        self.assertEqual(len(dataflow), 3)
        self.assertEqual(dataflow.dir(), ["csv", "min", "print"])
        errors = dataflow.validate()
        self.assertEqual(errors, [])
        deps = dataflow.order_modules()
        self.assertEqual(deps, ["csv", m.name, prt.name])
        saved_inputs = dataflow.inputs
        saved_outputs = dataflow.outputs
        # dataflow.__exit__() is called here

    # print('Old modules:', end=' ')
    # pprint(scheduler._modules)
    # scheduler._update_modules()  # force modules in the main loop
    # print('New modules:', end=' ')
    # pprint(scheduler.modules())

    with scheduler as dataflow:
        # nothing should change when nothing is modified in dataflow
        self.assertEqual(len(dataflow), 3)
        deps = dataflow.order_modules()
        self.assertEqual(deps, ["csv", m.name, prt.name])
        self.assertEqual(dataflow.inputs, saved_inputs)
        self.assertEqual(dataflow.outputs, saved_outputs)

    # scheduler._update_modules()  # force modules in the main loop

    with scheduler as dataflow:
        sink = Sink(name="sink", scheduler=scheduler)
        sink.input.inp = m.output.result
        dataflow.delete_modules(prt)
        self.assertEqual(len(dataflow), 3)
        deps = dataflow.order_modules()
        self.assertEqual(deps, ["csv", m.name, "sink"])
        # pprint(dataflow.inputs)
        # pprint(dataflow.outputs)

    # print('Old modules:')
    # pprint(scheduler._new_modules)
    # scheduler._update_modules()  # force modules in the main loop
    # print('New modules:')
    # pprint(scheduler.modules())

    with scheduler as dataflow:
        self.assertEqual(len(dataflow), 3)
        deps = dataflow.order_modules()
        self.assertEqual(deps, ["csv", m.name, "sink"])
        prt = Print(proc=self.terse, name="print", scheduler=scheduler)
        self.assertIs(dataflow[prt.name], prt)
        self.assertEqual(
            dataflow.validate_module(prt),
            ['Input slot "df" missing in module "print"'],
        )
        prt.input.df = m.output.result
def test_dataflow_7_dynamic(self) -> None:
    s = self.scheduler()
    table = RandomTable(
        name="table", columns=["a", "b", "c"], throttle=1000, scheduler=s
    )
    sink = Sink(name="sink", scheduler=s)
    sink.input.inp = table.output.result
    s.commit()

    # Start loading a dataset, then visualize it, then change the visualizations.
    async def modify_1(scheduler: Scheduler, run_number: int) -> None:
        print("Adding scatterplot_1")
        with scheduler as dataflow:
            sp = MCScatterPlot(
                name="scatterplot_1",
                classes=[("Scatterplot", "a", "b")],
                approximate=True,
                scheduler=scheduler,
            )
            sp.create_dependent_modules(table, "result")
            print(f"Created scatterplot_1, groups: {dataflow.groups()}")
        scheduler.on_loop(modify_2, 10)  # Schedule the next activity

    async def modify_2(scheduler: Scheduler, run_number: int) -> None:
        print("Removing scatterplot_1")
        self.assertTrue("scatterplot_1" in scheduler)
        with scheduler as dataflow:
            print("Checking scatterplot_1 module deletion")
            deps = dataflow.collateral_damage("scatterplot_1")
            print(f"collateral_damage('scatterplot_1') = '{sorted(deps)}'")
            dataflow.delete_modules(*deps)
        scheduler.on_loop(modify_3, 10)

    async def modify_3(scheduler: Scheduler, run_number: int) -> None:
        print("Adding scatterplot_2")
        self.assertFalse("scatterplot_1" in scheduler)
        with scheduler:
            sp = MCScatterPlot(
                name="scatterplot_2",
                classes=[("Scatterplot", "a", "c")],
                approximate=True,
                scheduler=scheduler,
            )
            sp.create_dependent_modules(table, "result")
        scheduler.on_loop(modify_4, 10)  # Schedule the next activity

    async def modify_4(scheduler: Scheduler, run_number: int) -> None:
        print("Removing scatterplot_2")
        self.assertFalse("scatterplot_1" in scheduler)
        self.assertTrue("scatterplot_2" in scheduler)
        with scheduler as dataflow:
            print("Checking scatterplot module deletion")
            print("Checking scatterplot_2 module addition")
            deps = dataflow.collateral_damage("scatterplot_2")
            print(f"collateral_damage('scatterplot_2') = '{sorted(deps)}'")
            dataflow.delete_modules(*deps)
        s.on_loop(modify_5, 5)

    async def modify_5(scheduler: Scheduler, run_number: int) -> None:
        print("Removing table")
        self.assertFalse("scatterplot_1" in scheduler)
        self.assertFalse("scatterplot_2" in scheduler)
        with scheduler as dataflow:
            print("Checking sink+table modules deletion")
            deps = dataflow.collateral_damage("sink")
            print(f"collateral_damage('sink') = '{sorted(deps)}'")
            dataflow.delete_modules(*deps)

    async def stop_error(scheduler: Scheduler, run_number: int) -> None:
        self.assertFalse("Scheduler should have stopped")
        await scheduler.stop()

    s.on_loop(modify_1, 10)
    s.on_loop(stop_error, 100)
    aio.run(s.start())
    self.assertFalse("scatterplot_1" in s)
    self.assertFalse("scatterplot_2" in s)