def test_binary2(self) -> None:
    """Check that building ``Binary`` with a list of columns fails.

    Two random tables are created on the scheduler first; constructing
    ``Binary`` with ``columns`` given as a plain list is then expected to
    raise ``AssertionError``.
    """
    sched = self.scheduler()
    n_cols = 10
    _ = RandomTable(n_cols, rows=100_000, scheduler=sched)
    _ = RandomTable(n_cols, rows=100_000, scheduler=sched)
    with self.assertRaises(AssertionError):
        _ = Binary(np.add, columns=["_3", "_5", "_7"], scheduler=sched)
def t_num_expr_impl(self, cls: Type[NumExprABC]) -> Tuple[Any, ...]:
    """Run ``cls`` on two random tables and check both output columns.

    The module is fed columns ``_1``..``_3`` of each table.  Its first
    output column is compared with ``first_2+2*second_3`` and its second
    with ``first_3-5*second_2``.

    Returns:
        The tuple ``(first_2, first_3, second_2, second_3)`` of input
        column vectors used to build the reference values.
    """
    sched = self.scheduler()
    random1 = RandomTable(10, rows=100000, scheduler=sched)
    random2 = RandomTable(10, rows=100000, scheduler=sched)
    selected = {"first": ["_1", "_2", "_3"], "second": ["_1", "_2", "_3"]}
    module = cls(columns=selected, scheduler=sched)
    module.input.first = random1.output.result
    module.input.second = random2.output.result
    printer = Print(proc=self.terse, scheduler=sched)
    printer.input[0] = module.output.result
    aio.run(sched.start())

    # The expression strings below refer to these four names, so they must
    # stay local variables with exactly these spellings (assuming `ne` is
    # numexpr, which resolves operands from the calling frame).
    first = random1.table.to_array()
    first_2 = first[:, 1]
    first_3 = first[:, 2]
    second = random2.table.to_array()
    second_2 = second[:, 1]
    second_3 = second[:, 2]
    ne_1 = ne.evaluate("first_2+2*second_3")
    ne_2 = ne.evaluate("first_3-5*second_2")

    produced = module.table.to_array()
    self.assertTrue(np.allclose(produced[:, 0], ne_1, equal_nan=True))
    self.assertTrue(np.allclose(produced[:, 1], ne_2, equal_nan=True))
    return first_2, first_3, second_2, second_3
def _t_impl(self, cls: Type[TableModule], ufunc: np.ufunc, mod_name: str) -> None:
    """Check a two-input module class against the matching numpy ufunc.

    Two 3-column int64 random tables (values drawn with
    ``np.random.randint(10, ...)``) are wired to the ``first`` and
    ``second`` inputs of ``cls``.  After the run, the module output must
    match ``ufunc`` applied to the two input arrays and the module name
    must start with ``mod_name``.
    """
    print("Testing", mod_name)
    sched = self.scheduler()
    random1 = RandomTable(
        3,
        rows=100_000,
        scheduler=sched,
        random=lambda x: np.random.randint(10, size=x),  # type: ignore
        dtype="int64",
    )
    random2 = RandomTable(
        3,
        rows=100_000,
        scheduler=sched,
        random=lambda x: np.random.randint(10, size=x),  # type: ignore
        dtype="int64",
    )
    module = cls(scheduler=sched)
    module.input.first = random1.output.result
    module.input.second = random2.output.result
    printer = Print(proc=self.terse, scheduler=sched)
    printer.input[0] = module.output.result
    aio.run(sched.start())

    expected = ufunc(random1.table.to_array(), random2.table.to_array())
    actual = module.table.to_array()
    self.assertTrue(module.name.startswith(mod_name))
    self.assertTrue(np.allclose(expected, actual, equal_nan=True))
def test_hist_index_min_max(self):
    """Test min_out and max_out on HistogramIndex.

    A RangeQuery over column ``_1`` is built with bounds 0.3 and 0.8; Min
    and Max modules are attached to the ``min_out`` / ``max_out`` outputs
    of its histogram index and compared with the min and max of the
    random table itself.
    """
    sched = self.scheduler()
    random = RandomTable(2, rows=100000, scheduler=sched)
    lower_tbl = Table(name=None, dshape='{_1: float64}', data={'_1': [0.3]})
    min_value = Constant(table=lower_tbl, scheduler=sched)
    upper_tbl = Table(name=None, dshape='{_1: float64}', data={'_1': [0.8]})
    max_value = Constant(table=upper_tbl, scheduler=sched)
    range_qry = RangeQuery(column='_1', scheduler=sched)
    range_qry.create_dependent_modules(
        random, 'table', min_value=min_value, max_value=max_value
    )
    query_printer = Print(proc=self.terse, scheduler=sched)
    query_printer.input.df = range_qry.output.table

    hist_index = range_qry.hist_index
    index_tag = str(hash(hist_index))
    min_ = Min(name='min_' + index_tag, scheduler=sched)
    min_.input.table = hist_index.output.min_out
    min_printer = Print(proc=self.terse, scheduler=sched)
    min_printer.input.df = min_.output.table
    max_ = Max(name='max_' + index_tag, scheduler=sched)
    max_.input.table = hist_index.output.max_out
    max_printer = Print(proc=self.terse, scheduler=sched)
    max_printer.input.df = max_.output.table

    sched.start()
    sched.join()

    expected_min = random.table().min()['_1']
    actual_min = min_.table().last().to_dict()['_1']
    self.assertAlmostEqual(expected_min, actual_min)
    expected_max = random.table().max()['_1']
    actual_max = max_.table().last().to_dict()['_1']
    self.assertAlmostEqual(expected_max, actual_max)
def _impl_tst_percentiles(self, accuracy):
    """Compare Percentiles output with ``np.percentile`` on column ``_1``.

    ``accuracy`` is forwarded to the ``Percentiles`` module.  The 25th,
    50th and 75th percentiles it reports must each be within 0.01 of the
    value numpy computes from the random table.
    """
    sched = self.scheduler()
    random = RandomTable(2, rows=10000, scheduler=sched)
    hist_index = HistogramIndex(column='_1', scheduler=sched)
    hist_index.input.table = random.output.table
    requested = Table(
        name=None,
        dshape='{_25: float64, _50: float64, _75: float64}',
        data={'_25': [25.0], '_50': [50.0], '_75': [75.0]},
    )
    which_percentiles = Constant(table=requested, scheduler=sched)
    percentiles = Percentiles(hist_index, accuracy=accuracy, scheduler=sched)
    percentiles.input.table = random.output.table
    percentiles.input.percentiles = which_percentiles.output.table
    printer = Print(proc=self.terse, scheduler=sched)
    printer.input.df = percentiles.output.table
    sched.start()
    sched.join()

    pdict = percentiles.table().last().to_dict()
    values = random.table()['_1'].values
    p25 = np.percentile(values, 25.0)
    p50 = np.percentile(values, 50.0)
    p75 = np.percentile(values, 75.0)
    print("Table=> accuracy: ", accuracy,
          " 25:", p25, pdict['_25'],
          " 50:", p50, pdict['_50'],
          " 75:", p75, pdict['_75'])
    for expected, key in ((p25, '_25'), (p50, '_50'), (p75, '_75')):
        self.assertAlmostEqual(expected, pdict[key], delta=0.01)
def t_mix_ufunc_impl(
    self,
    cls: Type[MixUfuncABC],
    ufunc1: np.ufunc = np.log,
    ufunc2: np.ufunc = np.add,
) -> None:
    """Run ``cls`` on two random tables and check its two output columns.

    Output column 0 must match ``ufunc2(first_2, second_3)`` and output
    column 1 must match ``ufunc1(second_3)``, both cast to float64.
    """
    sched = self.scheduler()
    random1 = RandomTable(10, rows=100000, scheduler=sched)
    random2 = RandomTable(10, rows=100000, scheduler=sched)
    selected = {"first": ["_1", "_2", "_3"], "second": ["_1", "_2", "_3"]}
    module = cls(columns=selected, scheduler=sched)
    module.input.first = random1.output.result
    module.input.second = random2.output.result
    printer = Print(proc=self.terse, scheduler=sched)
    printer.input[0] = module.output.result
    aio.run(sched.start())

    first = random1.table.to_array()
    first_2 = first[:, 1]
    _ = first[:, 2]  # extracted but not used in the checks below
    second = random2.table.to_array()
    _ = second[:, 1]  # extracted but not used in the checks below
    second_3 = second[:, 2]
    expected_0 = ufunc2(first_2, second_3).astype("float64")
    expected_1 = ufunc1(second_3).astype("float64")

    produced = module.table.to_array()
    self.assertTrue(np.allclose(produced[:, 0], expected_0, equal_nan=True))
    self.assertTrue(np.allclose(produced[:, 1], expected_1, equal_nan=True))
def test_max(self):
    """Compare the Max module with the column-wise max of its input frame.

    The update column is excluded on both sides of the comparison.
    """
    sched = Scheduler()
    random = RandomTable(10, rows=10000, scheduler=sched)
    max_mod = Max(scheduler=sched)
    max_mod.input.df = random.output.df
    printer = Print(scheduler=sched)
    printer.input.df = max_mod.output.df
    sched.start()

    frame = random.df()
    data_cols = random.columns.difference([random.UPDATE_COLUMN])
    expected = frame[data_cols].max()
    actual = last_row(max_mod.df(), remove_update=True)
    self.assertTrue(np.allclose(expected, actual))
def test_random_table(self):
    """Check a RandomTable built from explicit column names 'a' and 'b'.

    Before the run the frame must expose 'a', 'b' and one extra column;
    after the run it must hold 10000 rows with no nulls in 'a' or 'b'.
    """
    sched = Scheduler()
    module = RandomTable(['a', 'b'], rows=10000, scheduler=sched)
    for position, label in enumerate(('a', 'b')):
        self.assertEqual(module.df().columns[position], label)
    # The third column is the UPDATE_COLUMN.
    self.assertEqual(len(module.df().columns), 3)

    length_printer = Every(proc=print_len, constant_time=True, scheduler=sched)
    length_printer.input.df = module.output.df
    sched.start()

    self.assertEqual(len(module.df()), 10000)
    for label in ('a', 'b'):
        self.assertFalse(module.df()[label].isnull().any())
def test_max(self):
    """Compare the Max module's cxx output table with the table max.

    The expected value is ``random.table().max()``; the actual value is
    the last row of ``max_.cxx_module.get_output_table()``.
    """
    sched = self.scheduler()
    random = RandomTable(10, rows=10000, scheduler=sched)
    module_name = 'max_' + str(hash(random))
    max_ = Max(name=module_name, scheduler=sched)
    max_.input.table = random.output.table
    printer = Print(proc=self.terse, scheduler=sched)
    printer.input.df = max_.output.table
    sched.start()
    sched.join()

    expected = random.table().max()
    actual = max_.cxx_module.get_output_table().last()
    self.compare(expected, actual)
def test_min(self):
    """Compare the Min module's last output row with the table min."""
    sched = self.scheduler()
    random = RandomTable(10, rows=10000, scheduler=sched)
    module_name = 'min_' + str(hash(random))
    min_ = Min(name=module_name, scheduler=sched)
    min_.input.table = random.output.table
    printer = Print(proc=self.terse, scheduler=sched)
    printer.input.df = min_.output.table
    sched.start()
    sched.join()

    expected = random.table().min()
    actual = min_.table().last()
    self.compare(expected, actual)
def test_var(self):
    """Compare the Var module with the variance of the input column.

    The expected value is ``.var()`` of frame column ``1``; the actual
    value is the last row of the module frame without the update column.
    """
    sched = Scheduler()
    random = RandomTable(1, rows=1000, scheduler=sched)
    var_mod = Var(scheduler=sched)
    var_mod.input.df = random.output.df
    printer = Print(scheduler=sched)
    printer.input.df = var_mod.output.df
    sched.start()

    expected = random.df()[1].var()
    actual = last_row(var_mod.df(), remove_update=True)
    self.assertTrue(np.allclose(expected, actual))
def test_hadamard(self) -> None:
    """Check Hadamard against ``np.multiply`` of the two input arrays."""
    sched = self.scheduler()
    random1 = RandomTable(3, rows=100000, scheduler=sched)
    random2 = RandomTable(3, rows=100000, scheduler=sched)
    module = Hadamard(scheduler=sched)
    module.input.x1 = random1.output.result
    module.input.x2 = random2.output.result
    printer = Print(proc=self.terse, scheduler=sched)
    printer.input[0] = module.output.result
    aio.run(sched.start())

    left = random1.table.to_array()
    right = random2.table.to_array()
    expected = np.multiply(left, right)
    actual = module.table.to_array()
    self.assertTrue(np.allclose(expected, actual, equal_nan=True))
def test_random_table2(self):
    """Check a 10-column, 10,000,000-row RandomTable with default names.

    Before the run the frame must expose 11 columns starting with '_1'
    and '_2'; after the run it must hold 10000000 rows with no nulls in
    those two columns.
    """
    sched = Scheduler()
    # produces more than 4M rows per second on my laptop
    module = RandomTable(10, rows=10000000, force_valid_ids=True, scheduler=sched)
    # Ten data columns plus the UPDATE_COLUMN.
    self.assertEqual(len(module.df().columns), 11)
    for position, label in enumerate(('_1', '_2')):
        self.assertEqual(module.df().columns[position], label)

    length_printer = Every(proc=print_len, constant_time=True, scheduler=sched)
    length_printer.input.df = module.output.df
    sched.start()

    self.assertEqual(len(module.df()), 10000000)
    for label in ('_1', '_2'):
        self.assertFalse(module.df()[label].isnull().any())
def test_binary(self) -> None:
    """Check ``Binary(np.add)`` against ``np.add`` of the two input arrays.

    Also checks that the module name starts with ``"binary_"``.
    """
    sched = self.scheduler()
    random1 = RandomTable(3, rows=100_000, scheduler=sched)
    random2 = RandomTable(3, rows=100_000, scheduler=sched)
    module = Binary(np.add, scheduler=sched)
    module.input.first = random1.output.result
    module.input.second = random2.output.result
    printer = Print(proc=self.terse, scheduler=sched)
    printer.input[0] = module.output.result
    aio.run(sched.start())

    left = random1.table.to_array()
    right = random2.table.to_array()
    expected = np.add(left, right)
    actual = module.table.to_array()
    self.assertTrue(module.name.startswith("binary_"))
    self.assertTrue(np.allclose(expected, actual, equal_nan=True))
def test_range_query_min_max3(self):
    """Test min and max on RangeQuery output.

    The query is built by ``_query_min_max_impl`` with a lower bound of
    0.3 and an upper bound of 15000.  The reported min must be 0.3 and
    the reported max must equal the max of column ``_1`` of the random
    table.
    """
    sched = self.scheduler()
    random = RandomTable(2, rows=100000, scheduler=sched)
    lower_tbl = Table(name=None, dshape='{_1: float64}', data={'_1': [0.3]})
    upper_tbl = Table(name=None, dshape='{_1: float64}', data={'_1': [15000.]})
    range_qry = self._query_min_max_impl(random, lower_tbl, upper_tbl, sched)
    sched.start()
    sched.join()

    min_data = range_qry.output.min.data()
    max_data = range_qry.output.max.data()
    table_max = random.table().max()['_1']
    self.assertAlmostEqual(min_data['_1'].loc[0], 0.3)
    self.assertAlmostEqual(max_data['_1'].loc[0], table_max)
def test_intersection(self):
    """Intersect two Bisect results and compare with a direct evaluation.

    One Bisect keeps ``_1 > 0.3`` and the other ``_1 < 0.8``; the
    Intersection's selection must equal the index obtained by evaluating
    ``(_1>0.3)&(_1<0.8)`` on the histogram index's input table.
    """
    sched = self.scheduler()
    random = RandomTable(2, rows=100000, scheduler=sched)
    lower_tbl = Table(name=None, dshape='{_1: float64}', data={'_1': [0.3]})
    min_value = Constant(table=lower_tbl, scheduler=sched)
    upper_tbl = Table(name=None, dshape='{_1: float64}', data={'_1': [0.8]})
    max_value = Constant(table=upper_tbl, scheduler=sched)
    hist_index = HistogramIndex(column='_1', scheduler=sched)
    hist_index.create_dependent_modules(random, 'table')

    bisect_min = Bisect(column='_1', op='>', hist_index=hist_index, scheduler=sched)
    bisect_min.input.table = hist_index.output.table
    bisect_min.input.limit = min_value.output.table
    bisect_max = Bisect(column='_1', op='<', hist_index=hist_index, scheduler=sched)
    bisect_max.input.table = hist_index.output.table
    bisect_max.input.limit = max_value.output.table

    inter = Intersection(scheduler=sched)
    # NOTE(review): the same slot is assigned twice on purpose, presumably
    # because it accepts several inputs -- confirm against Intersection.
    inter.input.table = bisect_min.output.table
    inter.input.table = bisect_max.output.table
    printer = Print(proc=self.terse, scheduler=sched)
    printer.input.df = inter.output.table
    sched.start()
    sched.join()

    source_data = hist_index.input_module.output['table'].data()
    idx = source_data.eval('(_1>0.3)&(_1<0.8)', result_object='index')
    self.assertEqual(inter.table().selection, bitmap(idx))
def test_dataflow_2_add_remove(self) -> None:
    """Add Max/Print and delete Min/Print while the scheduler is running.

    The initial dataflow is table -> min -> print_min.  A callback
    registered with ``on_loop(..., 5)`` adds ``max`` and ``print_max`` and
    deletes ``min`` and ``print_min``; ``self._stop`` is registered with
    ``on_loop(..., 10)``.  The test passes if the new Print's ``proc`` was
    called at least once.
    """
    scheduler = self.scheduler(clean=True)
    table = RandomTable(
        name="table", columns=["a"], throttle=1000, scheduler=scheduler
    )
    min_mod = Min(name="min", scheduler=scheduler)
    min_printer = Print(proc=self.terse, name="print_min", scheduler=scheduler)
    min_mod.input.table = table.output.result
    min_printer.input.df = min_mod.output.result
    started = False

    def proc(x: Any) -> None:
        # Flags that the dynamically added Print module received data.
        nonlocal started
        print("proc max called")
        started = True

    async def _add_max_remove_min(scheduler: Scheduler, run_number: int) -> None:
        # Modify the dataflow inside the scheduler's context manager.
        with scheduler as dataflow:
            print("adding new modules")
            max_mod = Max(name="max", scheduler=scheduler)
            max_printer = Print(name="print_max", proc=proc, scheduler=scheduler)
            max_mod.input.table = table.output.result
            max_printer.input.df = max_mod.output.result
            print("removing min module")
            dataflow.delete_modules("min", "print_min")

    scheduler.on_loop(_add_max_remove_min, 5)
    scheduler.on_loop(self._stop, 10)
    aio.run(scheduler.start())
    self.assertTrue(started)
def test_dataflow_1_dynamic(self) -> None:
    """Add Max/Print modules while the scheduler is running.

    The initial dataflow is table -> min -> print_min.  A callback run
    after 5 loops adds ``max`` and ``print_max``; ``self._stop`` is
    registered with ``on_loop(..., 10)``.  The test passes if the new
    Print's ``proc`` was called at least once.
    """
    scheduler = self.scheduler(clean=True)
    table = RandomTable(
        name="table", columns=["a"], throttle=1000, scheduler=scheduler
    )
    min_mod = Min(name="min", scheduler=scheduler)
    min_printer = Print(proc=self.terse, name="print_min", scheduler=scheduler)
    min_mod.input.table = table.output.result
    min_printer.input.df = min_mod.output.result
    started = False

    def proc(x: Any) -> None:
        # Flags that the dynamically added Print module received data.
        nonlocal started
        print("proc max called")
        started = True

    async def _add_max(scheduler: Scheduler, run_number: int) -> None:
        # Modify the dataflow inside the scheduler's context manager.
        with scheduler:
            print("adding new modules")
            max_mod = Max(name="max", scheduler=scheduler)
            max_printer = Print(name="print_max", proc=proc, scheduler=scheduler)
            max_mod.input.table = table.output.result
            max_printer.input.df = max_mod.output.result

    scheduler.on_loop(_add_max, 5)  # run the function after 5 loops
    scheduler.on_loop(self._stop, 10)
    aio.run(scheduler.start())
    self.assertTrue(started)
def test_idxmax2(self) -> None:
    """Compare IdxMax's max table with Max on a stirred random table.

    Both modules read the output of a Stirrer configured with
    ``delete_rows=5``; the last row of ``idxmax.max()`` must match
    ``max_.psdict``.
    """
    sched = self.scheduler()
    random = RandomTable(10, rows=10000, throttle=1000, scheduler=sched)
    stirrer = Stirrer(
        update_column="_1", delete_rows=5, fixed_step_size=100, scheduler=sched
    )
    stirrer.input[0] = random.output.result
    idxmax = IdxMax(scheduler=sched)
    idxmax.input[0] = stirrer.output.result
    max_ = Max(scheduler=sched)
    max_.input[0] = stirrer.output.result
    idxmax_printer = Print(proc=self.terse, scheduler=sched)
    idxmax_printer.input[0] = idxmax.output.result
    max_printer = Print(proc=self.terse, scheduler=sched)
    max_printer.input[0] = max_.output.result
    aio.run(sched.start())

    from_max = max_.psdict
    max_table = idxmax.max()
    assert max_table is not None
    from_idxmax = notNone(max_table.last()).to_dict()
    self.compare(from_max, from_idxmax)
def test_bisect2(self) -> None:
    """Check Bisect (``_1 > 0.5``) on a table whose rows get deleted.

    The random table goes through a Stirrer with ``delete_rows=100``; the
    Bisect index must equal the index obtained by evaluating ``_1>0.5``
    on the stirrer's table.
    """
    sched = self.scheduler()
    random = RandomTable(2, rows=100_000, scheduler=sched)
    stirrer = Stirrer(
        update_column="_1",
        delete_rows=100,
        scheduler=sched,
    )
    stirrer.input[0] = random.output.result
    # NOTE(review): the dshape declares `value` as string while the data is
    # the float 0.5 -- kept as written; confirm this is intended.
    limit_tbl = Table(name=None, dshape="{value: string}", data={"value": [0.5]})
    min_value = Constant(table=limit_tbl, scheduler=sched)
    hist_index = HistogramIndex(column="_1", scheduler=sched)
    hist_index.create_dependent_modules(stirrer, "result")
    bisect_ = Bisect(column="_1", op=">", hist_index=hist_index, scheduler=sched)
    bisect_.input[0] = hist_index.output.result
    bisect_.input.limit = min_value.output.result
    printer = Print(proc=self.terse, scheduler=sched)
    printer.input[0] = bisect_.output.result
    aio.run(sched.start())

    expected_idx = stirrer.table.eval("_1>0.5", result_object="index")
    self.assertEqual(bisect_.table.index, bitmap(expected_idx))
def test_scatterplot2(self):
    """Check that the scatter plot sample stays inside the expected bounds.

    An approximate MCScatterPlot without sampling is built on a 2-column
    random table and three patches are attached to the scheduler with
    ``decorate``.  After the run, the x and y extents of the JSON sample
    must lie within LOWER_X..UPPER_X and LOWER_Y..UPPER_Y.
    """
    sched = self.scheduler()
    random = RandomTable(2, rows=2000000, scheduler=sched)
    sp = MCScatterPlot(
        scheduler=sched,
        classes=[('Scatterplot', '_1', '_2')],
        approximate=True,
    )
    sp.create_dependent_modules(random, 'table', with_sampling=False)
    counter = Every(proc=self.terse, constant_time=True, scheduler=sched)
    counter.input.df = random.output.table
    printer = Print(proc=self.terse, scheduler=sched)
    printer.input.df = sp.output.table
    # Presumably the string arguments are the names of the modules being
    # patched -- verify against the patch classes.
    decorate(sched, VariablePatch1("variable_1"))
    decorate(sched, VariablePatch2("variable_2"))
    decorate(sched, ScatterPlotPatch("mc_scatter_plot_1"))
    sp.scheduler().start(idle_proc=idle_proc)
    sched.join()

    js = sp.to_json()
    xs, ys, _ = zip(*js['sample']['data'])
    min_x, max_x = min(xs), max(xs)
    min_y, max_y = min(ys), max(ys)
    self.assertGreaterEqual(min_x, LOWER_X)
    self.assertGreaterEqual(min_y, LOWER_Y)
    self.assertLessEqual(max_x, UPPER_X)
    self.assertLessEqual(max_y, UPPER_Y)
def t_mix_ufunc_table_dict_impl(self, cls: Type[MixUfuncABC]) -> None:
    """Run ``cls`` with a RandomDict as ``first`` and a table as ``second``.

    Output column 0 must match ``np.add(first_2, second_3)`` and output
    column 1 must match ``np.log(second_3)``.
    """
    sched = self.scheduler()
    random1 = RandomDict(10, scheduler=sched)
    random2 = RandomTable(10, rows=100000, scheduler=sched)
    selected = {"first": ["_1", "_2", "_3"], "second": ["_1", "_2", "_3"]}
    module = cls(columns=selected, scheduler=sched)
    module.input.first = random1.output.result
    module.input.second = random2.output.result
    printer = Print(proc=self.terse, scheduler=sched)
    printer.input[0] = module.output.result
    aio.run(sched.start())

    first = list(random1.psdict.values())
    first_2 = first[1]
    _ = first[2]  # extracted but not used in the checks below
    second = random2.table.to_array()
    _ = second[:, 1]  # extracted but not used in the checks below
    second_3 = second[:, 2]
    expected_0 = np.add(first_2, second_3)
    expected_1 = np.log(second_3)

    produced = module.table.to_array()
    self.assertTrue(np.allclose(produced[:, 0], expected_0, equal_nan=True))
    self.assertTrue(np.allclose(produced[:, 1], expected_1, equal_nan=True))
def test_paste(self) -> None:
    """Paste the minima of columns ``_1`` and ``_2`` into one table.

    Each Min output goes through a Dict2Table before entering Paste; the
    last row of the pasted table must match the table-wide minima of the
    two columns.
    """
    sched = self.scheduler()
    random = RandomTable(10, rows=10000, scheduler=sched)
    table_tag = str(hash(random))

    min_1 = Min(name="min_1" + table_tag, scheduler=sched, columns=["_1"])
    min_1.input[0] = random.output.result
    d2t_1 = Dict2Table(scheduler=sched)
    d2t_1.input.dict_ = min_1.output.result

    min_2 = Min(name="min_2" + table_tag, scheduler=sched, columns=["_2"])
    min_2.input[0] = random.output.result
    d2t_2 = Dict2Table(scheduler=sched)
    d2t_2.input.dict_ = min_2.output.result

    paste = Paste(scheduler=sched)
    paste.input.first = d2t_1.output.result
    paste.input.second = d2t_2.output.result
    printer = Print(proc=self.terse, scheduler=sched)
    printer.input[0] = paste.output.result
    aio.run(sched.start())

    expected = random.table.min()
    actual = notNone(paste.table.last()).to_dict()
    for column in ("_1", "_2"):
        self.assertAlmostEqual(expected[column], actual[column])
def test_binary3(self) -> None:
    """Check ``Binary(np.add)`` with a table as ``first`` and a dict as ``second``.

    Columns ``_3, _5, _7`` of the table are added to entries ``_4, _6, _8``
    of the dict; the reference is computed with numpy on array positions
    ``[2, 4, 6]`` and ``[3, 5, 7]``.
    """
    sched = self.scheduler()
    n_cols = 10
    random1 = RandomTable(n_cols, rows=100_000, scheduler=sched)
    random2 = RandomDict(n_cols, scheduler=sched)
    selected = {"first": ["_3", "_5", "_7"], "second": ["_4", "_6", "_8"]}
    module = Binary(np.add, columns=selected, scheduler=sched)
    module.input.first = random1.output.result
    module.input.second = random2.output.result
    printer = Print(proc=self.terse, scheduler=sched)
    printer.input[0] = module.output.result
    aio.run(sched.start())

    table_part = random1.table.to_array()[:, [2, 4, 6]]
    dict_part = np.array(list(random2.psdict.values()))[[3, 5, 7]]
    expected = np.add(table_part, dict_part)
    actual = module.table.to_array()
    self.assertTrue(module.name.startswith("binary_"))
    self.assertTrue(np.allclose(expected, actual, equal_nan=True))
def t_histogram2d_impl(self, **kw: Any) -> None:
    """Check Histogram2D on a stirred table against ``fh.histogram2d``.

    ``kw`` is forwarded to the Stirrer.  After the run, the last
    histogram array must have the same sum and the same flattened
    contents as the reference computed on columns ``_1`` and ``_2`` of
    the stirrer's table (flipped along axis 0).
    """
    sched = self.scheduler()
    random = RandomTable(3, rows=100000, scheduler=sched)
    stirrer = Stirrer(
        update_column="_2", fixed_step_size=1000, scheduler=sched, **kw
    )
    stirrer.input[0] = random.output.result
    min_ = Min(scheduler=sched)
    min_.input[0] = stirrer.output.result
    max_ = Max(scheduler=sched)
    max_.input[0] = stirrer.output.result
    # NOTE(review): the original comment here said "columns are called
    # 1..30", which does not match the 3-column table above -- confirm.
    histogram2d = Histogram2D(0, 1, xbins=100, ybins=100, scheduler=sched)
    histogram2d.input[0] = stirrer.output.result
    histogram2d.input.min = min_.output.result
    histogram2d.input.max = max_.output.result
    heatmap = Heatmap(filename="histo_%03d.png", scheduler=sched)
    heatmap.input.array = histogram2d.output.result
    watcher = Every(proc=self.terse, scheduler=sched)
    watcher.input[0] = heatmap.output.result
    aio.run(sched.start())

    last = notNone(histogram2d.table.last()).to_dict()
    h1 = last["array"]
    y_range = [last["ymin"], last["ymax"]]
    x_range = [last["xmin"], last["xmax"]]
    bounds = [y_range, x_range]
    selection = stirrer.table.loc[:, ["_1", "_2"]]
    assert selection is not None
    values = selection.to_array()
    bins = [histogram2d.params.ybins, histogram2d.params.xbins]
    h2 = fh.histogram2d(values[:, 1], values[:, 0], bins=bins, range=bounds)
    h2 = np.flip(h2, axis=0)  # type: ignore
    self.assertEqual(np.sum(h1), np.sum(h2))
    self.assertListEqual(h1.reshape(-1).tolist(), h2.reshape(-1).tolist())
def test_hist_index_min_max(self) -> None:
    """Test min_out and max_out on HistogramIndex.

    The dataflow is built inside the scheduler's context manager: a
    RangeQuery over column ``_1`` with bounds 0.3 and 0.8, plus Min and
    Max modules on the ``min_out`` / ``max_out`` outputs of its histogram
    index.  Their results must match the min and max of the random table.
    """
    sched = self.scheduler()
    with sched:
        random = RandomTable(2, rows=100000, scheduler=sched)
        lower = PsDict({"_1": 0.3})
        min_value = Constant(table=lower, scheduler=sched)
        upper = PsDict({"_1": 0.8})
        max_value = Constant(table=upper, scheduler=sched)
        range_qry = RangeQuery(column="_1", scheduler=sched)
        range_qry.create_dependent_modules(
            random, "result", min_value=min_value, max_value=max_value
        )
        query_printer = Print(proc=self.terse, scheduler=sched)
        query_printer.input[0] = range_qry.output.result

        hist_index = range_qry.hist_index
        assert hist_index is not None
        index_tag = str(hash(hist_index))
        min_ = Min(name="min_" + index_tag, scheduler=sched)
        min_.input[0] = hist_index.output.min_out
        min_printer = Print(proc=self.terse, scheduler=sched)
        min_printer.input[0] = min_.output.result
        max_ = Max(name="max_" + index_tag, scheduler=sched)
        max_.input[0] = hist_index.output.max_out
        max_printer = Print(proc=self.terse, scheduler=sched)
        max_printer.input[0] = max_.output.result
    aio.run(sched.start())

    expected_min = cast(float, random.table.min()["_1"])
    actual_min = cast(float, min_.psdict["_1"])
    self.assertAlmostEqual(expected_min, actual_min)
    expected_max = cast(float, random.table.max()["_1"])
    actual_max = cast(float, max_.psdict["_1"])
    self.assertAlmostEqual(expected_max, actual_max)
def test_ldexp(self) -> None:
    """Check ColsLdexp against ``np.ldexp`` on columns of one table.

    Columns ``_3, _5, _7`` are combined with ``_4, _6, _8`` and written
    to output columns ``x, y, z``; the reference uses array positions
    ``[2, 4, 6]`` and ``[3, 5, 7]`` of the int64 input table.
    """
    cls, ufunc, mod_name = ColsLdexp, np.ldexp, "cols_ldexp_"
    print("Testing", mod_name)
    sched = self.scheduler()
    n_cols = 10
    random = RandomTable(
        n_cols,
        rows=10_000,
        scheduler=sched,
        random=lambda x: np.random.randint(10, size=x),  # type: ignore
        dtype="int64",
    )
    module = cls(
        first=["_3", "_5", "_7"],
        second=["_4", "_6", "_8"],
        cols_out=["x", "y", "z"],
        scheduler=sched,
    )
    module.input[0] = random.output.result
    printer = Print(proc=self.terse, scheduler=sched)
    printer.input[0] = module.output.result
    aio.run(sched.start())

    self.assertListEqual(module.table.columns, ["x", "y", "z"])
    source = random.table.to_array()
    expected = ufunc(source[:, [2, 4, 6]], source[:, [3, 5, 7]])
    actual = module.table.to_array()
    self.assertTrue(module.name.startswith(mod_name))
    self.assertTrue(np.allclose(expected, actual, equal_nan=True))
def test_intersection(self) -> None:
    """Intersect two Bisect results and compare with a direct evaluation.

    One Bisect keeps ``_1 > 0.3`` and the other ``_1 < 0.8``; the
    Intersection's table index must equal the index obtained by
    evaluating ``(_1>0.3)&(_1<0.8)`` on the histogram index's input data.
    """
    sched = self.scheduler()
    random = RandomTable(2, rows=100000, scheduler=sched)
    lower_tbl = Table(name=None, dshape="{_1: float64}", data={"_1": [0.3]})
    min_value = Constant(table=lower_tbl, scheduler=sched)
    upper_tbl = Table(name=None, dshape="{_1: float64}", data={"_1": [0.8]})
    max_value = Constant(table=upper_tbl, scheduler=sched)
    hist_index = HistogramIndex(column="_1", scheduler=sched)
    hist_index.create_dependent_modules(random, "result")

    bisect_min = Bisect(column="_1", op=">", hist_index=hist_index, scheduler=sched)
    bisect_min.input[0] = hist_index.output.result
    bisect_min.input.limit = min_value.output.result
    bisect_max = Bisect(column="_1", op="<", hist_index=hist_index, scheduler=sched)
    bisect_max.input[0] = hist_index.output.result
    bisect_max.input.limit = max_value.output.result

    inter = Intersection(scheduler=sched)
    # NOTE(review): input[0] is assigned twice on purpose, presumably
    # because the slot accepts several inputs -- confirm against Intersection.
    inter.input[0] = bisect_min.output.result
    inter.input[0] = bisect_max.output.result
    printer = Print(proc=self.terse, scheduler=sched)
    printer.input[0] = inter.output.result
    aio.run(sched.start())

    assert hist_index.input_module is not None
    source_data = hist_index.input_module.output["result"].data()
    expected_idx = source_data.eval("(_1>0.3)&(_1<0.8)", result_object="index")
    self.assertEqual(inter.table.index, bitmap(expected_idx))
def test_hub_if_else(self):
    """Route data through the else branch of a Switch into a Hub.

    The Switch condition always returns False.  Max is wired to the
    ``result`` output and Min to ``result_else``; both feed a Hub, whose
    result is compared with the min of the stirrer's result.
    """
    sched = Scheduler()
    random = RandomTable(2, rows=100000, scheduler=sched)
    stirrer = Stirrer(
        update_column="_1",
        delete_rows=5,
        update_rows=5,
        fixed_step_size=100,
        scheduler=sched,
    )
    stirrer.input[0] = random.output.result
    switch = Switch(condition=lambda x: False, scheduler=sched)
    switch.input[0] = stirrer.output.result
    table_tag = str(hash(random))
    max_ = Max(name="max_" + table_tag, scheduler=sched)
    max_.input[0] = switch.output.result
    min_ = Min(name="min_" + table_tag, scheduler=sched)
    min_.input[0] = switch.output.result_else
    hub = Hub(scheduler=sched)
    # NOTE(review): the same slot is assigned twice on purpose, presumably
    # because it accepts several inputs -- confirm against Hub.
    hub.input.table = min_.output.result
    hub.input.table = max_.output.result
    printer = Print(proc=self.terse, scheduler=sched)
    printer.input[0] = hub.output.result
    aio.run(sched.start())

    expected = stirrer.result.min()
    actual = hub.result
    self.compare(expected, actual)
def setUpStep(self, step):
    """Prepare the data for one benchmark step.

    Generates a 10-column random table of ``step * L`` rows on a fresh
    scheduler and stores it on ``self.random_table`` as a pandas
    DataFrame.
    """
    n_rows = step * L
    self.set_step_info("{} rows".format(n_rows))
    sched = Scheduler()
    random = RandomTable(10, rows=n_rows, scheduler=sched)
    sched.start()
    generated = random.output.table.output_module.table()
    self.random_table = pd.DataFrame(generated.to_dict())
def test_bin_join(self):
    """Join the minima of columns ``_1`` and ``_2`` with BinJoin.

    The last row of the joined table must match the table-wide minima of
    the two columns.
    """
    sched = self.scheduler()
    random = RandomTable(10, rows=10000, scheduler=sched)
    table_tag = str(hash(random))
    min_1 = Min(name='min_1' + table_tag, scheduler=sched, columns=['_1'])
    min_1.input.table = random.output.table
    min_2 = Min(name='min_2' + table_tag, scheduler=sched, columns=['_2'])
    min_2.input.table = random.output.table
    joined = BinJoin(scheduler=sched)
    joined.input.first = min_1.output.table
    joined.input.second = min_2.output.table
    printer = Print(proc=self.terse, scheduler=sched)
    printer.input.df = joined.output.table
    sched.start()
    sched.join()

    expected = random.table().min()
    actual = joined.table().last().to_dict()
    for column in ('_1', '_2'):
        self.assertAlmostEqual(expected[column], actual[column])
def test_var(self):
    """Compare the Var module with the table variance (``ddof=1``).

    Both sides are converted to float arrays, printed, and compared with
    ``np.allclose``.
    """
    sched = self.scheduler()
    random = RandomTable(1, rows=1000, scheduler=sched)
    var_mod = Var(scheduler=sched)
    var_mod.input.table = random.output.table
    printer = Print(proc=self.terse, scheduler=sched)
    printer.input.df = var_mod.output.table
    sched.start()
    sched.join()

    table_var = random.table().var(ddof=1)
    res1 = np.array(list(map(float, table_var.values())))
    module_var = var_mod.table().last().to_dict(ordered=True)
    res2 = np.array(list(map(float, module_var.values())))
    print('res1:', res1)
    print('res2:', res2)
    self.assertTrue(np.allclose(res1, res2))
def p10s_random_min_max(n):
    """Run Min and Max over a 10-column random table of ``n * L`` rows.

    Sets the default storage engine to ``"hdf5"`` before building the
    dataflow, then starts the scheduler.
    """
    StorageEngine.default = "hdf5"
    sched = Scheduler()
    random = RandomTable(10, rows=n * L, scheduler=sched)
    table_tag = str(hash(random))
    min_ = Min(name='min_' + table_tag, scheduler=sched)
    min_.input.table = random.output.table
    max_ = Max(name='max_' + table_tag, scheduler=sched)
    max_.input.table = random.output.table
    sched.start()
def p10s_random_min_max(self):
    """Run Min and Max over a random table sized by the current step.

    The table has 10 columns and ``self.current_step * L`` rows.  The
    default storage engine is set to ``"hdf5"`` before the dataflow is
    built, then the scheduler is started.
    """
    n = self.current_step
    StorageEngine.default = "hdf5"
    sched = Scheduler()
    random = RandomTable(10, rows=n * L, scheduler=sched)
    table_tag = str(hash(random))
    # NOTE(review): Min is named through `mid=` while Max uses `id=`; one
    # of the two keywords looks stale -- kept as written, confirm against
    # the module constructor.
    min_ = Min(mid='min_' + table_tag, scheduler=sched)
    min_.input.table = random.output.table
    max_ = Max(id='max_' + table_tag, scheduler=sched)
    max_.input.table = random.output.table
    sched.start()
"""Demo dataflow: histograms of a random table bounded by its min and max.

A three-column random table feeds Min, Max and Histograms modules; the
histograms output is watched by an Every module.  Run as a script to
start the dataflow.
"""
from progressivis import Scheduler, Every
from progressivis.stats import RandomTable, Min, Max
from progressivis.vis import Histograms

# log_level()

# Reuse a `scheduler` already present in the namespace when there is one;
# otherwise fall back to a fresh Scheduler.  The message is printed only in
# the fallback case, and only NameError is caught since that is the only
# failure a bare name lookup can produce.
try:
    s = scheduler
except NameError:
    print('No scheduler defined, using the standard one')
    s = Scheduler()

# NOTE: `min` and `max` shadow the builtins; the module-level names are
# kept unchanged in case other code refers to them.
csv = RandomTable(columns=['a', 'b', 'c'], rows=1000000, throttle=1000, scheduler=s)
min = Min(scheduler=s)
min.input.df = csv.output.df
max = Max(scheduler=s)
max.input.df = csv.output.df
histograms = Histograms(scheduler=s)
histograms.input.df = csv.output.df
histograms.input.min = min.output.df
histograms.input.max = max.output.df
prlen = Every(scheduler=s)
prlen.input.df = histograms.output.df

if __name__ == '__main__':
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("Starting")
    csv.start()
"""Demo dataflow: mini-batch k-means over a two-column random table.

The random table feeds an MBKMeans module whose output is printed.  Run
as a script to start the dataflow.
"""
from progressivis import Scheduler, Print
from progressivis.cluster import MBKMeans
from progressivis.stats import RandomTable
from progressivis.vis import ScatterPlot

# Reuse a `scheduler` already present in the namespace when there is one;
# otherwise fall back to a fresh Scheduler.  Only NameError is caught: it
# is the only failure a bare name lookup can produce, and a bare `except:`
# would also swallow KeyboardInterrupt and SystemExit.
try:
    s = scheduler
except NameError:
    s = Scheduler()

table = RandomTable(columns=['a', 'b'], rows=50000, throttle=500, scheduler=s)
mbkmeans = MBKMeans(columns=['a', 'b'], n_clusters=8, batch_size=100,
                    is_input=False, scheduler=s)
mbkmeans.input.df = table.output.df
prn = Print(scheduler=s)
prn.input.df = mbkmeans.output.df
# Scatter plot wiring is currently disabled:
# sp = ScatterPlot('a', 'b')
# sp.create_dependent_modules(mbkmeans,'df')

if __name__ == '__main__':
    table.start()