def test_combine_first_nan(self):
    """CombineFirst must let non-NaN values from a later table override NaNs.

    cst2 supplies NaN for ymin/ymax, so the values from cst3 must win
    in the combined result.
    """
    s = self.scheduler()
    cst1 = Constant(Table(name='tcf_xmin_xmax_nan',
                          data=pd.DataFrame({'xmin': [1], 'xmax': [2]}),
                          create=True), scheduler=s)
    cst2 = Constant(Table(name='tcf_ymin_ymax_nan',
                          data=pd.DataFrame({'ymin': [np.nan], 'ymax': [np.nan]}),
                          create=True), scheduler=s)
    cst3 = Constant(Table(name='tcf_ymin_ymax2_nan',
                          data=pd.DataFrame({'ymin': [3], 'ymax': [4]}),
                          create=True), scheduler=s)
    cf = CombineFirst(scheduler=s)
    # All three tables feed the same multi-slot "table" input.
    cf.input.table = cst1.output.table
    cf.input.table = cst2.output.table
    cf.input.table = cst3.output.table
    pr = Print(proc=self.terse, scheduler=s)
    pr.input.df = cf.output.table
    s.start()
    s.join()
    last = cf.table().last().to_dict()
    # Separate assertions pinpoint the failing field, unlike a single
    # assertTrue over the whole conjunction.
    self.assertEqual(last['xmin'], 1)
    self.assertEqual(last['xmax'], 2)
    self.assertEqual(last['ymin'], 3)
    self.assertEqual(last['ymax'], 4)
def test_merge_simple(self):
    """Index-merge two one-row tables and check the combined last row."""
    s = self.scheduler()
    cst1 = Constant(Table(name=None,
                          data=pd.DataFrame({'xmin': [1], 'xmax': [2]})),
                    scheduler=s)
    cst2 = Constant(Table(name=None,
                          data=pd.DataFrame({'ymin': [3], 'ymax': [4]})),
                    scheduler=s)
    merge = Merge(left_index=True, right_index=True, scheduler=s)
    merge.input.table = cst1.output.table
    merge.input.table = cst2.output.table
    pr = Print(proc=self.terse, scheduler=s)
    pr.input.df = merge.output.table
    s.start()
    s.join()
    merge.trace_stats(max_runs=1)
    df = merge.table()
    last = df.loc[df.index[-1]]
    # Field-by-field assertions give precise failure messages.
    self.assertEqual(last['xmin'], 1)
    self.assertEqual(last['xmax'], 2)
    self.assertEqual(last['ymin'], 3)
    self.assertEqual(last['ymax'], 4)
def test_hist_index_min_max(self):
    "Test min_out and max_out on HistogramIndex"
    # Build a RangeQuery [0.3, 0.8] over column '_1' of a random table,
    # then check that the min_out/max_out outputs of the underlying
    # HistogramIndex track the true min/max of the source data.
    s = self.scheduler()
    random = RandomTable(2, rows=100000, scheduler=s)
    t_min = Table(name=None, dshape='{_1: float64}', data={'_1': [0.3]})
    min_value = Constant(table=t_min, scheduler=s)
    t_max = Table(name=None, dshape='{_1: float64}', data={'_1': [0.8]})
    max_value = Constant(table=t_max, scheduler=s)
    range_qry = RangeQuery(column='_1', scheduler=s)
    range_qry.create_dependent_modules(random, 'table',
                                       min_value=min_value,
                                       max_value=max_value)
    prt = Print(proc=self.terse, scheduler=s)
    prt.input.df = range_qry.output.table
    # Min/Max modules consume the index's dedicated min_out/max_out slots.
    hist_index = range_qry.hist_index
    min_ = Min(name='min_' + str(hash(hist_index)), scheduler=s)
    min_.input.table = hist_index.output.min_out
    prt2 = Print(proc=self.terse, scheduler=s)
    prt2.input.df = min_.output.table
    max_ = Max(name='max_' + str(hash(hist_index)), scheduler=s)
    max_.input.table = hist_index.output.max_out
    pr3 = Print(proc=self.terse, scheduler=s)
    pr3.input.df = max_.output.table
    s.start()
    s.join()
    # Compare against min/max computed directly on the source table.
    res1 = random.table().min()['_1']
    res2 = min_.table().last().to_dict()['_1']
    self.assertAlmostEqual(res1, res2)
    res1 = random.table().max()['_1']
    res2 = max_.table().last().to_dict()['_1']
    self.assertAlmostEqual(res1, res2)
def test_join_simple(self):
    """Join two single-row constant tables via Reduce/BinJoin expansion."""
    s = self.scheduler()
    cst1 = Constant(Table(name='test_join_simple_cst1',
                          data=pd.DataFrame({'xmin': [1], 'xmax': [2]}),
                          create=True), scheduler=s)
    cst2 = Constant(Table(name='test_join_simple_cst2',
                          data=pd.DataFrame({'ymin': [3], 'ymax': [4]}),
                          create=True), scheduler=s)
    # Reduce folds its inputs pairwise through BinJoin; expand() returns
    # the resulting join module.
    reduce_ = Reduce(BinJoin, "first", "second", "table", scheduler=s)
    reduce_.input.table = cst1.output.table
    reduce_.input.table = cst2.output.table
    join = reduce_.expand()
    pr = Print(proc=self.terse, scheduler=s)
    pr.input.df = join.output.table
    s.start()
    s.join()
    res = join.trace_stats(max_runs=1)
    print(res)
    df = join.table()
    last = df.loc[df.index[-1]]
    # Field-by-field assertions give precise failure messages.
    self.assertEqual(last['xmin'], 1)
    self.assertEqual(last['xmax'], 2)
    self.assertEqual(last['ymin'], 3)
    self.assertEqual(last['ymax'], 4)
def _2_csv_2_const_scenario(module: Module, s: Scheduler) -> Callable[[Scheduler, int], None]:
    """Wire *module*'s inputs a/b to two CSV loaders and c/d to two
    constant tables, returning a callback that stops the scheduler
    once the run number exceeds 10.
    """
    csv_a = CSVLoader(get_dataset("smallfile"), index_col=False, header=None,
                      scheduler=s)
    csv_b = CSVLoader(get_dataset("smallfile"), index_col=False, header=None,
                      scheduler=s)
    table_c = Table("const_c_2_csv_2_const_scenario", dshape="{a: int}",
                    create=True)
    const_c = Constant(table=table_c, scheduler=s)
    table_d = Table("const_d_2_csv_2_const_scenario", dshape="{a: int}",
                    create=True)
    const_d = Constant(table=table_d, scheduler=s)
    module.input.a = csv_a.output.result
    module.input.b = csv_b.output.result
    module.input.c = const_c.output.result
    module.input.d = const_d.output.result

    def _fun(s: Scheduler, r: int) -> None:
        # Stop the scheduler after ten runs.
        if r > 10:
            s.task_stop()

    return _fun
def test_intersection(self) -> None:
    """Intersect two Bisect selections (_1>0.3 and _1<0.8) and compare
    the result with the index computed directly by Table.eval."""
    s = self.scheduler()
    random = RandomTable(2, rows=100000, scheduler=s)
    t_min = Table(name=None, dshape="{_1: float64}", data={"_1": [0.3]})
    min_value = Constant(table=t_min, scheduler=s)
    t_max = Table(name=None, dshape="{_1: float64}", data={"_1": [0.8]})
    max_value = Constant(table=t_max, scheduler=s)
    hist_index = HistogramIndex(column="_1", scheduler=s)
    hist_index.create_dependent_modules(random, "result")
    # Lower-bound selection: rows with _1 > 0.3.
    bisect_min = Bisect(column="_1", op=">", hist_index=hist_index, scheduler=s)
    bisect_min.input[0] = hist_index.output.result
    # bisect_.input[0] = random.output.result
    bisect_min.input.limit = min_value.output.result
    # Upper-bound selection: rows with _1 < 0.8.
    bisect_max = Bisect(column="_1", op="<", hist_index=hist_index, scheduler=s)
    bisect_max.input[0] = hist_index.output.result
    # bisect_.input[0] = random.output.result
    bisect_max.input.limit = max_value.output.result
    inter = Intersection(scheduler=s)
    # Both selections feed the same multi-slot input.
    inter.input[0] = bisect_min.output.result
    inter.input[0] = bisect_max.output.result
    pr = Print(proc=self.terse, scheduler=s)
    pr.input[0] = inter.output.result
    aio.run(s.start())
    assert hist_index.input_module is not None
    # Ground truth: evaluate the predicate directly on the source table.
    idx = (hist_index.input_module.output["result"].data().eval(
        "(_1>0.3)&(_1<0.8)", result_object="index"))
    self.assertEqual(inter.table.index, bitmap(idx))
def test_intersection(self):
    # Intersect two Bisect selections (_1>0.3 and _1<0.8) and compare
    # the result with the index computed directly by Table.eval.
    s = self.scheduler()
    random = RandomTable(2, rows=100000, scheduler=s)
    t_min = Table(name=None, dshape='{_1: float64}', data={'_1': [0.3]})
    min_value = Constant(table=t_min, scheduler=s)
    t_max = Table(name=None, dshape='{_1: float64}', data={'_1': [0.8]})
    max_value = Constant(table=t_max, scheduler=s)
    hist_index = HistogramIndex(column='_1', scheduler=s)
    hist_index.create_dependent_modules(random, 'table')
    # Lower-bound selection: rows with _1 > 0.3.
    bisect_min = Bisect(column='_1', op='>', hist_index=hist_index,
                        scheduler=s)
    bisect_min.input.table = hist_index.output.table
    #bisect_.input.table = random.output.table
    bisect_min.input.limit = min_value.output.table
    # Upper-bound selection: rows with _1 < 0.8.
    bisect_max = Bisect(column='_1', op='<', hist_index=hist_index,
                        scheduler=s)
    bisect_max.input.table = hist_index.output.table
    #bisect_.input.table = random.output.table
    bisect_max.input.limit = max_value.output.table
    inter = Intersection(scheduler=s)
    # Both selections feed the same multi-slot input.
    inter.input.table = bisect_min.output.table
    inter.input.table = bisect_max.output.table
    pr = Print(proc=self.terse, scheduler=s)
    pr.input.df = inter.output.table
    s.start()
    s.join()
    # Ground truth: evaluate the predicate directly on the source table.
    idx = hist_index.input_module.output['table'].data().eval(
        '(_1>0.3)&(_1<0.8)', result_object='index')
    self.assertEqual(inter.table().selection, bitmap(idx))
def test_last_row_simple(self):
    """Join two single-row tables and check the combined last row."""
    s = self.scheduler()
    t1 = Table(name=get_random_name("cst1"), data={'xmin': [1], 'xmax': [2]})
    t2 = Table(name=get_random_name("cst2"), data={'ymin': [3], 'ymax': [4]})
    cst1 = Constant(t1, scheduler=s)
    cst2 = Constant(t2, scheduler=s)
    join = Join(scheduler=s)
    join.input.table = cst1.output.table
    join.input.table = cst2.output.table
    pr = Print(proc=self.terse, scheduler=s)
    pr.input.df = join.output.table
    s.start()
    s.join()
    last = join.table().last()
    # Field-by-field assertions give precise failure messages.
    self.assertEqual(last['xmin'], 1)
    self.assertEqual(last['xmax'], 2)
    self.assertEqual(last['ymin'], 3)
    self.assertEqual(last['ymax'], 4)
def test_hist_index_min_max(self) -> None:
    "Test min_out and max_out on HistogramIndex"
    # Build a RangeQuery [0.3, 0.8] over column '_1' of a random table,
    # then check that the min_out/max_out outputs of the underlying
    # HistogramIndex track the true min/max of the source data.
    s = self.scheduler()
    with s:
        random = RandomTable(2, rows=100000, scheduler=s)
        t_min = PsDict({"_1": 0.3})
        min_value = Constant(table=t_min, scheduler=s)
        t_max = PsDict({"_1": 0.8})
        max_value = Constant(table=t_max, scheduler=s)
        range_qry = RangeQuery(column="_1", scheduler=s)
        range_qry.create_dependent_modules(
            random, "result", min_value=min_value, max_value=max_value
        )
        prt = Print(proc=self.terse, scheduler=s)
        prt.input[0] = range_qry.output.result
        # Min/Max modules consume the index's min_out/max_out slots.
        hist_index = range_qry.hist_index
        assert hist_index is not None
        min_ = Min(name="min_" + str(hash(hist_index)), scheduler=s)
        min_.input[0] = hist_index.output.min_out
        prt2 = Print(proc=self.terse, scheduler=s)
        prt2.input[0] = min_.output.result
        max_ = Max(name="max_" + str(hash(hist_index)), scheduler=s)
        max_.input[0] = hist_index.output.max_out
        pr3 = Print(proc=self.terse, scheduler=s)
        pr3.input[0] = max_.output.result
    aio.run(s.start())
    # Compare against min/max computed directly on the source table.
    res1 = cast(float, random.table.min()["_1"])
    res2 = cast(float, min_.psdict["_1"])
    self.assertAlmostEqual(res1, res2)
    res1 = cast(float, random.table.max()["_1"])
    res2 = cast(float, max_.psdict["_1"])
    self.assertAlmostEqual(res1, res2)
def test_merge_simple(self) -> None:
    """Index-merge two single-row constant tables and validate the row."""
    sched = self.scheduler()
    left = Constant(
        Table(name=None, data=pd.DataFrame({"xmin": [1], "xmax": [2]})),
        scheduler=sched,
    )
    right = Constant(
        Table(name=None, data=pd.DataFrame({"ymin": [3], "ymax": [4]})),
        scheduler=sched,
    )
    merge = Merge(left_index=True, right_index=True, scheduler=sched)
    merge.input[0] = left.output.result
    merge.input[0] = right.output.result
    printer = Print(proc=self.terse, scheduler=sched)
    printer.input[0] = merge.output.result
    aio.run(sched.start())
    _ = merge.trace_stats(max_runs=1)
    # pd.set_option('display.expand_frame_repr', False)
    # print(res)
    table = merge.table
    last = table.loc[table.index[-1]]
    assert last is not None
    self.assertTrue(
        last["xmin"] == 1 and last["xmax"] == 2
        and last["ymin"] == 3 and last["ymax"] == 4
    )
def test_join_simple(self) -> None:
    """Three-way join of single-row tables built by Reduce.expand over
    BinJoin; the joined row must carry all six columns."""
    s = self.scheduler()
    cst1 = Constant(
        Table(
            name="test_join_simple_cst1",
            data=pd.DataFrame({"xmin": [1], "xmax": [2]}),
            create=True,
        ),
        scheduler=s,
    )
    cst2 = Constant(
        Table(
            name="test_join_simple_cst2",
            data=pd.DataFrame({"ymin": [3], "ymax": [4]}),
            create=True,
        ),
        scheduler=s,
    )
    cst3 = Constant(
        Table(
            name="test_join_simple_cst3",
            data=pd.DataFrame({"zmin": [5], "zmax": [6]}),
            create=True,
        ),
        scheduler=s,
    )
    # join=Join(scheduler=s)
    # reduce_ = Reduce(BinJoin, "first", "second", "table", scheduler=s)
    # reduce_.input[0] = cst1.output.result
    # reduce_.input[0] = cst2.output.result
    # reduce_.input[0] = cst3.output.result
    # join = reduce_.expand()
    # Reduce.expand folds the three outputs pairwise through BinJoin
    # and returns the final join module.
    join = Reduce.expand(
        BinJoin,
        "first",
        "second",
        "result",
        [cst1.output.result, cst2.output.result, cst3.output.result],
        scheduler=s,
    )
    pr = Print(proc=self.terse, scheduler=s)
    pr.input[0] = join.output.result
    aio.run(s.start())
    res = join.trace_stats(max_runs=1)
    print(res)
    df = join.table
    last = df.loc[df.index[-1]]
    assert last is not None
    self.assertTrue(
        last["xmin"] == 1
        and last["xmax"] == 2
        and last["ymin"] == 3
        and last["ymax"] == 4
        and last["zmin"] == 5
        and last["zmax"] == 6
    )
def _impl_stirred_tst_percentiles_rq(self, accuracy: float, **kw: Any) -> None:
    """Check Percentiles (25/50/75) over a RangeQuery whose input is
    stirred (mutated) while running; results must match numpy.percentile
    on the final data within the requested *accuracy*.

    Extra keyword arguments are forwarded to the Stirrer.
    """
    s = self.scheduler()
    with s:
        random = RandomTable(2, rows=10000, scheduler=s)
        # The stirrer perturbs the table during the run to exercise
        # incremental updates.
        stirrer = Stirrer(update_column="_2", fixed_step_size=1000,
                          scheduler=s, **kw)
        stirrer.input[0] = random.output.result
        t_min = PsDict({"_1": 0.3})
        min_value = Constant(table=t_min, scheduler=s)
        t_max = PsDict({"_1": 0.8})
        max_value = Constant(table=t_max, scheduler=s)
        range_qry = RangeQuery(column="_1", scheduler=s)
        range_qry.create_dependent_modules(stirrer, "result",
                                           min_value=min_value,
                                           max_value=max_value)
        hist_index = range_qry.hist_index
        assert hist_index
        t_percentiles = PsDict({"_25": 25.0, "_50": 50.0, "_75": 75.0})
        which_percentiles = Constant(table=t_percentiles, scheduler=s)
        percentiles = Percentiles(accuracy=accuracy, scheduler=s)
        percentiles.input[0] = range_qry.output.result
        percentiles.input.percentiles = which_percentiles.output.result
        percentiles.input.hist = hist_index.output.result
        prt = Print(proc=self.terse, scheduler=s)
        prt.input[0] = percentiles.output.result
    aio.run(s.start())
    pdict = notNone(percentiles.table.last()).to_dict()
    # Reference values computed directly with numpy on the query result.
    v = range_qry.table["_1"].values
    p25 = np.percentile(v, 25.0)  # type: ignore
    p50 = np.percentile(v, 50.0)  # type: ignore
    p75 = np.percentile(v, 75.0)  # type: ignore
    print(
        "TSV=> accuracy: ",
        accuracy,
        " 25:",
        p25,
        pdict["_25"],
        " 50:",
        p50,
        pdict["_50"],
        " 75:",
        p75,
        pdict["_75"],
    )
    self.assertAlmostEqual(p25, pdict["_25"], delta=0.01)
    self.assertAlmostEqual(p50, pdict["_50"], delta=0.01)
    self.assertAlmostEqual(p75, pdict["_75"], delta=0.01)
def _query_min_max_impl(self, random, t_min, t_max, s):
    """Build a RangeQuery on column '_1' bounded by *t_min*/*t_max* and
    attach a Print sink to each of its table/min/max outputs."""
    low = Constant(table=t_min, scheduler=s)
    high = Constant(table=t_max, scheduler=s)
    query = RangeQuery(column='_1', scheduler=s)
    query.create_dependent_modules(random, 'table',
                                   min_value=low,
                                   max_value=high)
    # One Print sink per output slot, as in the original wiring.
    for slot in (query.output.table, query.output.min, query.output.max):
        sink = Print(proc=self.terse, scheduler=s)
        sink.input.df = slot
    return query
def test_bisect2(self) -> None:
    """Bisect selection (_1 > 0.5) over a stirred table that deletes rows
    while running; the selection must match a direct Table.eval."""
    s = self.scheduler()
    random = RandomTable(2, rows=100_000, scheduler=s)
    stirrer = Stirrer(
        update_column="_1",
        delete_rows=100,
        # update_rows=5,
        # fixed_step_size=100,
        scheduler=s,
    )
    stirrer.input[0] = random.output.result
    # NOTE(review): the dshape declares 'value' as string but the data holds
    # the float 0.5 -- looks inconsistent; confirm Table coerces as intended.
    t = Table(name=None, dshape="{value: string}", data={"value": [0.5]})
    min_value = Constant(table=t, scheduler=s)
    hist_index = HistogramIndex(column="_1", scheduler=s)
    hist_index.create_dependent_modules(stirrer, "result")
    bisect_ = Bisect(column="_1", op=">", hist_index=hist_index, scheduler=s)
    bisect_.input[0] = hist_index.output.result
    # bisect_.input[0] = random.output.result
    bisect_.input.limit = min_value.output.result
    pr = Print(proc=self.terse, scheduler=s)
    pr.input[0] = bisect_.output.result
    aio.run(s.start())
    # Ground truth computed directly on the stirred table.
    idx = stirrer.table.eval("_1>0.5", result_object="index")
    self.assertEqual(bisect_.table.index, bitmap(idx))
def _impl_tst_percentiles(self, accuracy):
    """Check Percentiles (25/50/75) over a random table against
    numpy.percentile, within the requested *accuracy*."""
    s = self.scheduler()
    random = RandomTable(2, rows=10000, scheduler=s)
    hist_index = HistogramIndex(column='_1', scheduler=s)
    hist_index.input.table = random.output.table
    # Constant table naming which percentiles to compute.
    t_percentiles = Table(
        name=None,
        dshape='{_25: float64, _50: float64, _75: float64}',
        data={'_25': [25.0], '_50': [50.0], '_75': [75.0]})
    which_percentiles = Constant(table=t_percentiles, scheduler=s)
    percentiles = Percentiles(hist_index, accuracy=accuracy, scheduler=s)
    percentiles.input.table = random.output.table
    percentiles.input.percentiles = which_percentiles.output.table
    prt = Print(proc=self.terse, scheduler=s)
    prt.input.df = percentiles.output.table
    s.start()
    s.join()
    pdict = percentiles.table().last().to_dict()
    # Reference values computed directly with numpy.
    v = random.table()['_1'].values
    p25 = np.percentile(v, 25.0)
    p50 = np.percentile(v, 50.0)
    p75 = np.percentile(v, 75.0)
    print("Table=> accuracy: ", accuracy,
          " 25:", p25, pdict['_25'],
          " 50:", p50, pdict['_50'],
          " 75:", p75, pdict['_75'])
    self.assertAlmostEqual(p25, pdict['_25'], delta=0.01)
    self.assertAlmostEqual(p50, pdict['_50'], delta=0.01)
    self.assertAlmostEqual(p75, pdict['_75'], delta=0.01)
def _query_min_max_impl(
    self, random: RandomTable, t_min: PsDict, t_max: PsDict, s: Scheduler
) -> RangeQuery:
    """Build a RangeQuery on column '_1' bounded by *t_min*/*t_max* and
    attach a Print sink to each of its result/min/max outputs."""
    low = Constant(table=t_min, scheduler=s)
    high = Constant(table=t_max, scheduler=s)
    query = RangeQuery(column="_1", scheduler=s)
    query.create_dependent_modules(
        random, "result", min_value=low, max_value=high
    )
    # One Print sink per output slot, as in the original wiring.
    for slot in (query.output.result, query.output.min, query.output.max):
        sink = Print(proc=self.terse, scheduler=s)
        sink.input[0] = slot
    return query
def test_06_read_multiple_csv_bz2_crash_recovery(self) -> None:
    """Read two bz2 CSVs from a throttled HTTP server with a tiny timeout,
    forcing retries/recovery; all 60000 rows must still arrive."""
    # The throttled server limits bandwidth so the loader hits its timeout.
    p = Process(target=run_throttled_server, args=(8000, 10**6))
    p.start()
    self._http_proc = p
    time.sleep(SLEEP)  # give the server time to come up
    s = self.scheduler()
    filenames = Table(
        name="file_names",
        dshape="{filename: string}",
        data={
            "filename": [
                make_url("smallfile", ext=BZ2),
                make_url("smallfile", ext=BZ2),
            ]
        },
    )
    cst = Constant(table=filenames, scheduler=s)
    # timeout=0.01 is deliberately tiny to trigger the recovery path.
    csv = CSVLoader(index_col=False, header=None, scheduler=s, timeout=0.01)
    csv.input.filenames = cst.output.result
    sink = Sink(name="sink", scheduler=s)
    sink.input.inp = csv.output.result
    # NOTE(review): starts the loader module rather than s.start() --
    # confirm this is the intended entry point here.
    aio.run(csv.start())
    _close(csv)
    self.assertEqual(len(csv.table), 60000)
def _4_const_scenario(module: Module, s: Scheduler) -> Callable[[Scheduler, int], None]:
    """Feed all four inputs of *module* from Constant modules sharing a
    single table, returning a callback that stops the scheduler once the
    run number exceeds 10."""
    shared = Table("const_4_scenario", dshape="{a: int}", create=True)
    # Four independent Constant modules over the same backing table.
    c_a, c_b, c_c, c_d = (Constant(table=shared, scheduler=s) for _ in range(4))
    module.input.a = c_a.output.result
    module.input.b = c_b.output.result
    module.input.c = c_c.output.result
    module.input.d = c_d.output.result

    def _stop(s: Scheduler, r: int) -> None:
        if r > 10:
            s.task_stop()

    return _stop
def _impl_tst_percentiles_rq(self, accuracy):
    """Check Percentiles (25/50/75) computed over a RangeQuery [0.3, 0.8]
    against numpy.percentile on the query result, within *accuracy*."""
    s = self.scheduler()
    random = RandomTable(2, rows=10000, scheduler=s)
    t_min = Table(name=None, dshape='{_1: float64}', data={'_1': [0.3]})
    min_value = Constant(table=t_min, scheduler=s)
    t_max = Table(name=None, dshape='{_1: float64}', data={'_1': [0.8]})
    max_value = Constant(table=t_max, scheduler=s)
    range_qry = RangeQuery(column='_1', scheduler=s)
    range_qry.create_dependent_modules(random, 'table',
                                       min_value=min_value,
                                       max_value=max_value)
    # Reuse the histogram index built by the range query.
    hist_index = range_qry.hist_index
    t_percentiles = Table(
        name=None,
        dshape='{_25: float64, _50: float64, _75: float64}',
        data={'_25': [25.0], '_50': [50.0], '_75': [75.0]})
    which_percentiles = Constant(table=t_percentiles, scheduler=s)
    percentiles = Percentiles(hist_index, accuracy=accuracy, scheduler=s)
    percentiles.input.table = range_qry.output.table
    percentiles.input.percentiles = which_percentiles.output.table
    prt = Print(proc=self.terse, scheduler=s)
    prt.input.df = percentiles.output.table
    s.start()
    s.join()
    pdict = percentiles.table().last().to_dict()
    # Reference values computed directly with numpy on the query result.
    v = range_qry.table()['_1'].values
    p25 = np.percentile(v, 25.0)
    p50 = np.percentile(v, 50.0)
    p75 = np.percentile(v, 75.0)
    print("TSV=> accuracy: ", accuracy,
          " 25:", p25, pdict['_25'],
          " 50:", p50, pdict['_50'],
          " 75:", p75, pdict['_75'])
    self.assertAlmostEqual(p25, pdict['_25'], delta=0.01)
    self.assertAlmostEqual(p50, pdict['_50'], delta=0.01)
    self.assertAlmostEqual(p75, pdict['_75'], delta=0.01)
def test_range_query(self):
    "Run tests of the RangeQuery module"
    # Query [0.3, 0.8] over column '_1'; the resulting selection must
    # equal the index computed directly with Table.eval.
    s = self.scheduler()
    random = RandomTable(2, rows=1000, scheduler=s)
    t_min = Table(name=None, dshape='{_1: float64}', data={'_1': [0.3]})
    min_value = Constant(table=t_min, scheduler=s)
    t_max = Table(name=None, dshape='{_1: float64}', data={'_1': [0.8]})
    max_value = Constant(table=t_max, scheduler=s)
    range_qry = RangeQuery(column='_1', scheduler=s)
    range_qry.create_dependent_modules(random, 'table',
                                       min_value=min_value,
                                       max_value=max_value)
    prt = Print(proc=self.terse, scheduler=s)
    prt.input.df = range_qry.output.table
    s.start()
    s.join()
    # Ground truth via direct evaluation on the source table.
    idx = range_qry.input_module.output['table'].data().eval(
        '(_1>0.3)&(_1<0.8)', result_object='index')
    self.assertEqual(range_qry.table().selection, bitmap(idx))
def test_read_multiple_csv(self):
    """Load the same small CSV twice via a filename table; the loader
    must accumulate all 60000 rows."""
    sched = self.scheduler()
    names = Table(name='file_names',
                  dshape='{filename: string}',
                  data={'filename': [get_dataset('smallfile'),
                                     get_dataset('smallfile')]})
    source = Constant(table=names, scheduler=sched)
    loader = CSVLoader(index_col=False, header=None, scheduler=sched)
    loader.input.filenames = source.output.table
    loader.start()
    sched.join()
    self.assertEqual(len(loader.table()), 60000)
def test_combine_first_dup(self) -> None:
    """With duplicate columns, CombineFirst must keep the values seen
    first (cst2's ymin/ymax win over cst3's)."""
    s = self.scheduler(True)
    cst1 = Constant(
        Table(
            name="tcf_xmin_xmax",
            data=pd.DataFrame({"xmin": [1], "xmax": [2]}),
            create=True,
        ),
        scheduler=s,
    )
    cst2 = Constant(
        Table(
            name="tcf_ymin_ymax",
            data=pd.DataFrame({"ymin": [5], "ymax": [6]}),
            create=True,
        ),
        scheduler=s,
    )
    # cst3 duplicates cst2's columns with different values; they must
    # NOT override cst2's.
    cst3 = Constant(
        Table(
            name="tcf_ymin_ymax2",
            data=pd.DataFrame({"ymin": [3], "ymax": [4]}),
            create=True,
        ),
        scheduler=s,
    )
    cf = CombineFirst(scheduler=s)
    cf.input[0] = cst1.output.result
    cf.input[0] = cst2.output.result
    cf.input[0] = cst3.output.result
    pr = Print(proc=self.terse, scheduler=s)
    pr.input[0] = cf.output.result
    aio.run(s.start())
    # res = cf.trace_stats(max_runs=1)
    row = cf.table.last()
    assert row is not None
    last = row.to_dict()
    self.assertEqual(last["xmin"], 1)
    self.assertEqual(last["xmax"], 2)
    self.assertEqual(last["ymin"], 5)
    self.assertEqual(last["ymax"], 6)
def test_cmp_query(self):
    """Smoke-test CmpQueryLast comparing a random table against a
    constant threshold row; the pipeline must run to completion."""
    sched = self.scheduler()
    source = RandomTable(10, rows=10000, scheduler=sched)
    cmp_ = CmpQueryLast(scheduler=sched)
    threshold = Table("cmp_table", data={'_1': [0.5]})
    value = Constant(threshold, scheduler=sched)
    cmp_.input.cmp = value.output.table
    cmp_.input.table = source.output.table
    sink = Print(proc=self.terse, scheduler=sched)
    sink.input.df = cmp_.output.select
    sched.start()
    sched.join()
def test_combine_first_nan(self) -> None:
    """CombineFirst must let non-NaN values from a later table override
    NaNs: cst2 supplies NaN ymin/ymax, so cst3's values must win."""
    s = self.scheduler(True)
    cst1 = Constant(
        Table(
            name="tcf_xmin_xmax_nan",
            data=pd.DataFrame({"xmin": [1], "xmax": [2]}),
            create=True,
        ),
        scheduler=s,
    )
    # NaN placeholders that must be filled by cst3.
    cst2 = Constant(
        Table(
            name="tcf_ymin_ymax_nan",
            data=pd.DataFrame({"ymin": [np.nan], "ymax": [np.nan]}),
            create=True,
        ),
        scheduler=s,
    )
    cst3 = Constant(
        Table(
            name="tcf_ymin_ymax2_nan",
            data=pd.DataFrame({"ymin": [3], "ymax": [4]}),
            create=True,
        ),
        scheduler=s,
    )
    cf = CombineFirst(scheduler=s)
    cf.input[0] = cst1.output.result
    cf.input[0] = cst2.output.result
    cf.input[0] = cst3.output.result
    pr = Print(proc=self.terse, scheduler=s)
    pr.input[0] = cf.output.result
    aio.run(s.start())
    last = notNone(cf.table.last()).to_dict()
    self.assertTrue(
        last["xmin"] == 1
        and last["xmax"] == 2
        and last["ymin"] == 3
        and last["ymax"] == 4
    )
def test_last_row_simple(self) -> None:
    """Join two single-row constant tables and inspect the last row."""
    sched = self.scheduler()
    left_tbl = Table(name=get_random_name("cst1"), data={"xmin": [1], "xmax": [2]})
    right_tbl = Table(name=get_random_name("cst2"), data={"ymin": [3], "ymax": [4]})
    left = Constant(left_tbl, scheduler=sched)
    right = Constant(right_tbl, scheduler=sched)
    join = Join(scheduler=sched)
    join.input[0] = left.output.result
    join.input[0] = right.output.result
    sink = Print(proc=self.terse, scheduler=sched)
    sink.input[0] = join.output.result
    aio.run(sched.start())
    # res = join.trace_stats(max_runs=1)
    # pd.set_option('display.expand_frame_repr', False)
    # print(res)
    last = notNone(join.table.last())
    self.assertTrue(
        last["xmin"] == 1 and last["xmax"] == 2
        and last["ymin"] == 3 and last["ymax"] == 4
    )
def test_read_multiple_fake_csv(self):
    """Load two synthetic in-memory CSV sources via buffer:// URLs; the
    loader must accumulate all 60000 rows."""
    sched = self.scheduler()
    names = Table(name='file_names',
                  dshape='{filename: string}',
                  data={'filename': [
                      'buffer://fake1?cols=10&rows=30000',
                      'buffer://fake2?cols=10&rows=30000']})
    source = Constant(table=names, scheduler=sched)
    loader = CSVLoader(index_col=False, header=None, scheduler=sched)
    loader.input.filenames = source.output.table
    loader.start()
    sched.join()
    self.assertEqual(len(loader.table()), 60000)
def _range_query_impl(self, lo, up) -> None:
    "Run tests of the RangeQuery module"
    # Query the open interval (lo, up) over column '_1'; the resulting
    # index must equal the one computed directly with Table.eval.
    s = self.scheduler()
    with s:
        random = RandomTable(2, rows=1000, scheduler=s)
        t_min = PsDict({"_1": lo})
        min_value = Constant(table=t_min, scheduler=s)
        t_max = PsDict({"_1": up})
        max_value = Constant(table=t_max, scheduler=s)
        range_qry = RangeQuery(column="_1", scheduler=s)
        range_qry.create_dependent_modules(
            random, "result", min_value=min_value, max_value=max_value
        )
        prt = Print(proc=self.terse, scheduler=s)
        prt.input[0] = range_qry.output.result
    aio.run(s.start())
    assert range_qry.input_module is not None
    # Ground truth via direct evaluation on the source table.
    idx = (
        range_qry.input_module.output["result"]
        .data()
        .eval(f"(_1>{lo})&(_1<{up})", result_object="index")
    )
    self.assertEqual(range_qry.table.index, bitmap(idx))
def te_st_join_simple(self) -> None:
    """Join two single-row constant tables with the Join module.

    NOTE: the mangled name ('te_st_' instead of 'test_') keeps this
    test from being collected by the runner, i.e. it is disabled.
    """
    s = self.scheduler()
    cst1 = Constant(
        Table(
            name="test_join_simple_cst1",
            data=pd.DataFrame({
                "xmin": [1],
                "xmax": [2]
            }),
            create=True,
        ),
        scheduler=s,
    )
    cst2 = Constant(
        Table(
            name="test_join_simple_cst2",
            data=pd.DataFrame({
                "ymin": [3],
                "ymax": [4]
            }),
            create=True,
        ),
        scheduler=s,
    )
    join = Join(scheduler=s)
    join.input[0] = cst1.output.result
    join.input[0] = cst2.output.result
    pr = Print(proc=self.terse, scheduler=s)
    pr.input[0] = join.output.result
    aio.run(s.start())
    res = join.trace_stats(max_runs=1)
    print(res)
    df = join.table
    last = df.loc[df.index[-1]]
    assert last is not None
    self.assertTrue(last["xmin"] == 1 and last["xmax"] == 2
                    and last["ymin"] == 3 and last["ymax"] == 4)
def _impl_stirred_tst_percentiles(self, accuracy: float, **kw: Any) -> None:
    """Check Percentiles (25/50/75) over a stirred (mutated) random table
    against numpy.percentile on the final data, within *accuracy*.

    Extra keyword arguments are forwarded to the Stirrer.
    """
    s = self.scheduler()
    with s:
        random = RandomTable(2, rows=10000, scheduler=s)
        # The stirrer perturbs the table during the run to exercise
        # incremental updates.
        stirrer = Stirrer(update_column="_2", fixed_step_size=1000,
                          scheduler=s, **kw)
        stirrer.input[0] = random.output.result
        hist_index = HistogramIndex(column="_1", scheduler=s)
        hist_index.input[0] = stirrer.output.result
        t_percentiles = PsDict({"_25": 25.0, "_50": 50.0, "_75": 75.0})
        which_percentiles = Constant(table=t_percentiles, scheduler=s)
        percentiles = Percentiles(accuracy=accuracy, scheduler=s)
        percentiles.input[0] = stirrer.output.result
        percentiles.input.percentiles = which_percentiles.output.result
        percentiles.input.hist = hist_index.output.result
        prt = Print(proc=self.terse, scheduler=s)
        prt.input[0] = percentiles.output.result
    aio.run(s.start())
    pdict = notNone(percentiles.table.last()).to_dict()
    # v = random.table()['_1'].values
    # from nose.tools import set_trace; set_trace()
    # Reference values computed directly with numpy on the final data.
    v = stirrer.table.to_array(columns=["_1"]).reshape(-1)
    p25 = np.percentile(v, 25.0)  # type: ignore
    p50 = np.percentile(v, 50.0)  # type: ignore
    p75 = np.percentile(v, 75.0)  # type: ignore
    print(
        "Table=> accuracy: ",
        accuracy,
        " 25:",
        p25,
        pdict["_25"],
        " 50:",
        p50,
        pdict["_50"],
        " 75:",
        p75,
        pdict["_75"],
    )
    # from nose.tools import set_trace; set_trace()
    self.assertAlmostEqual(p25, pdict["_25"], delta=0.01)
    self.assertAlmostEqual(p50, pdict["_50"], delta=0.01)
    self.assertAlmostEqual(p75, pdict["_75"], delta=0.01)
def test_03_read_multiple_csv_crash_recovery(self):
    """Read two CSVs from a throttled HTTP server with a tiny timeout,
    exercising crash recovery; the full 60000 rows must be loaded."""
    # if TRAVIS: return
    # The throttled server limits bandwidth so the loader hits its timeout.
    p = Process(target=run_throttled_server, args=(8000, 10**6))
    p.start()
    self._http_proc = p
    time.sleep(SLEEP)  # give the server time to come up
    s = self.scheduler()
    filenames = Table(name='file_names',
                      dshape='{filename: string}',
                      data={'filename': [make_url('smallfile'),
                                         make_url('smallfile')]})
    cst = Constant(table=filenames, scheduler=s)
    # timeout=0.01 is deliberately tiny to trigger the recovery path.
    csv = CSVLoader(index_col=False, header=None, scheduler=s, timeout=0.01)
    csv.input.filenames = cst.output.table
    csv.start()
    s.join()
    _close(csv)
    self.assertEqual(len(csv.table()), 60000)