def test_hist_index_min_max(self) -> None:
    """Check that HistogramIndex's min_out/max_out slots track the table extrema."""
    sched = self.scheduler()
    with sched:
        source = RandomTable(2, rows=100000, scheduler=sched)
        lo_const = Constant(table=PsDict({"_1": 0.3}), scheduler=sched)
        hi_const = Constant(table=PsDict({"_1": 0.8}), scheduler=sched)
        query = RangeQuery(column="_1", scheduler=sched)
        query.create_dependent_modules(
            source, "result", min_value=lo_const, max_value=hi_const
        )
        printer = Print(proc=self.terse, scheduler=sched)
        printer.input[0] = query.output.result
        hist_index = query.hist_index
        assert hist_index is not None
        min_ = Min(name="min_" + str(hash(hist_index)), scheduler=sched)
        min_.input[0] = hist_index.output.min_out
        printer2 = Print(proc=self.terse, scheduler=sched)
        printer2.input[0] = min_.output.result
        max_ = Max(name="max_" + str(hash(hist_index)), scheduler=sched)
        max_.input[0] = hist_index.output.max_out
        printer3 = Print(proc=self.terse, scheduler=sched)
        printer3.input[0] = max_.output.result
    aio.run(sched.start())
    # The streamed minima/maxima must agree with a direct scan of the table.
    self.assertAlmostEqual(
        cast(float, source.table.min()["_1"]), cast(float, min_.psdict["_1"])
    )
    self.assertAlmostEqual(
        cast(float, source.table.max()["_1"]), cast(float, max_.psdict["_1"])
    )
def test_init_dict(self) -> None:
    """PsDict mirrors the dict constructor: kwargs, a mapping, and mapping+kwargs."""
    from_kwargs = PsDict(a=1, b=2, c=3)
    plain = dict(a=1, b=2, c=3)
    from_mapping = PsDict(plain)
    self.assertEqual(from_kwargs, from_mapping)
    combined = PsDict(plain, x=8, y=5)
    self.assertEqual(len(combined), 5)
def _impl_stirred_tst_percentiles_rq(self, accuracy: float, **kw: Any) -> None:
    """Percentiles fed by a RangeQuery over a stirred table must match numpy
    within the given accuracy budget."""
    sched = self.scheduler()
    with sched:
        source = RandomTable(2, rows=10000, scheduler=sched)
        stirrer = Stirrer(
            update_column="_2", fixed_step_size=1000, scheduler=sched, **kw
        )
        stirrer.input[0] = source.output.result
        lo_const = Constant(table=PsDict({"_1": 0.3}), scheduler=sched)
        hi_const = Constant(table=PsDict({"_1": 0.8}), scheduler=sched)
        query = RangeQuery(column="_1", scheduler=sched)
        query.create_dependent_modules(
            stirrer, "result", min_value=lo_const, max_value=hi_const
        )
        hist_index = query.hist_index
        assert hist_index
        wanted = Constant(
            table=PsDict({"_25": 25.0, "_50": 50.0, "_75": 75.0}), scheduler=sched
        )
        percentiles = Percentiles(accuracy=accuracy, scheduler=sched)
        percentiles.input[0] = query.output.result
        percentiles.input.percentiles = wanted.output.result
        percentiles.input.hist = hist_index.output.result
        printer = Print(proc=self.terse, scheduler=sched)
        printer.input[0] = percentiles.output.result
    aio.run(sched.start())
    pdict = notNone(percentiles.table.last()).to_dict()
    values = query.table["_1"].values
    p25 = np.percentile(values, 25.0)  # type: ignore
    p50 = np.percentile(values, 50.0)  # type: ignore
    p75 = np.percentile(values, 75.0)  # type: ignore
    print(
        "TSV=> accuracy: ", accuracy, " 25:", p25, pdict["_25"],
        " 50:", p50, pdict["_50"], " 75:", p75, pdict["_75"],
    )
    self.assertAlmostEqual(p25, pdict["_25"], delta=0.01)
    self.assertAlmostEqual(p50, pdict["_50"], delta=0.01)
    self.assertAlmostEqual(p75, pdict["_75"], delta=0.01)
def test_range_query_min_max(self) -> None:
    """The min/max output slots of RangeQuery must reflect the constant bounds."""
    sched = self.scheduler()
    with sched:
        source = RandomTable(2, rows=100000, scheduler=sched)
        bounds_lo = PsDict({"_1": 0.3})
        bounds_hi = PsDict({"_1": 0.8})
        query = self._query_min_max_impl(source, bounds_lo, bounds_hi, sched)
    aio.run(sched.start())
    self.assertAlmostEqual(query.output.min.data()["_1"], 0.3)
    self.assertAlmostEqual(query.output.max.data()["_1"], 0.8)
def test_range_query_min_max2(self) -> None:
    """min/max output slots of RangeQuery with a NaN upper bound.

    The min slot should track the table's own minimum; the max should end up
    at the random table's upper limit (~1.0)."""
    sched = self.scheduler()
    with sched:
        source = RandomTable(2, rows=100000, scheduler=sched)
        query = self._query_min_max_impl(
            source, PsDict({"_1": 0.0}), PsDict({"_1": float("nan")}), sched
        )
    aio.run(sched.start())
    min_data = cast(PsDict, query.output.min.data())
    max_data = query.output.max.data()
    self.assertAlmostEqual(min_data["_1"], source.table.min()["_1"], delta=0.0001)
    self.assertAlmostEqual(max_data["_1"], 1.0, delta=0.0001)
def test_ps_dict_new_ids(self) -> None:
    """Keys added after a copy are reported by created_indices."""
    before = PsDict(a=1, b=2, c=3)
    after = copy.copy(before)
    after["x"] = 10
    after["y"] = 20
    fresh = after.created_indices(before)
    self.assertEqual(bitmap(fresh), bitmap([3, 4]))
def test_ps_dict_updated_ids(self) -> None:
    """updated_indices reports the positions of values changed since the copy."""
    before = PsDict(a=1, b=2, c=3, d=4, e=5)
    after = copy.copy(before)
    # Untouched copy: nothing updated yet.
    self.assertEqual(bitmap(after.updated_indices(before)), bitmap())
    after["b"] += 1
    after["d"] *= 2
    # Keys b (index 1) and d (index 3) changed.
    self.assertEqual(bitmap(after.updated_indices(before)), bitmap([1, 3]))
def test_ps_dict_deleted_ids(self) -> None:
    """deleted_indices reports removed keys; updates are tracked independently."""
    before = PsDict(a=1, b=2, c=3, d=4, e=5)
    after = copy.copy(before)
    # Untouched copy: nothing deleted yet.
    self.assertEqual(bitmap(after.deleted_indices(before)), bitmap())
    del after["b"]
    after["c"] *= 3
    # Key b (index 1) was removed, key c (index 2) was modified.
    self.assertEqual(bitmap(after.deleted_indices(before)), bitmap([1]))
    self.assertEqual(bitmap(after.updated_indices(before)), bitmap([2]))
def _range_query_impl(self, lo: float, up: float) -> None:
    """Run tests of the RangeQuery module.

    Builds a RangeQuery pipeline over a random table with constant bounds
    ``lo`` and ``up`` on column ``_1`` and checks that the resulting index
    matches a direct evaluation of ``lo < _1 < up`` on the input table.
    """
    # Fix: lo/up were unannotated while the rest of the file is fully typed.
    s = self.scheduler()
    with s:
        random = RandomTable(2, rows=1000, scheduler=s)
        t_min = PsDict({"_1": lo})
        min_value = Constant(table=t_min, scheduler=s)
        t_max = PsDict({"_1": up})
        max_value = Constant(table=t_max, scheduler=s)
        range_qry = RangeQuery(column="_1", scheduler=s)
        range_qry.create_dependent_modules(
            random, "result", min_value=min_value, max_value=max_value
        )
        prt = Print(proc=self.terse, scheduler=s)
        prt.input[0] = range_qry.output.result
    aio.run(s.start())
    assert range_qry.input_module is not None
    # Reference answer computed directly on the input table.
    idx = (
        range_qry.input_module.output["result"]
        .data()
        .eval(f"(_1>{lo})&(_1<{up})", result_object="index")
    )
    self.assertEqual(range_qry.table.index, bitmap(idx))
def run_step(self, run_number: int, step_size: int, howlong: float) -> ReturnRunStep:
    """Incrementally fold the column-wise max of newly created rows into ``self.result``."""
    assert self.context
    with self.context as ctx:
        indices = ctx.table.created.next(step_size)  # returns a slice
        steps = indices_len(indices)
        input_df = ctx.table.data()
        # Column-wise max over just the new rows; keepdims=False yields scalars.
        op = input_df.loc[fix_loc(indices)].max(keepdims=False)
        if self.result is None:
            # First batch: seed the running maxima.
            self.result = PsDict(op)
        else:
            # Merge the batch maxima into the running maxima, key by key.
            # NOTE(review): iterates self.psdict while writing self.result —
            # presumably psdict aliases result here; confirm.
            for k, v in self.psdict.items():
                self.result[k] = np.maximum(op[k], v)
        return self._return_run_step(self.next_state(ctx.table), steps_run=steps)
def _impl_stirred_tst_percentiles(self, accuracy: float, **kw: Any) -> None:
    """Percentiles over a stirred table must match numpy within the accuracy budget."""
    sched = self.scheduler()
    with sched:
        source = RandomTable(2, rows=10000, scheduler=sched)
        stirrer = Stirrer(
            update_column="_2", fixed_step_size=1000, scheduler=sched, **kw
        )
        stirrer.input[0] = source.output.result
        hist_index = HistogramIndex(column="_1", scheduler=sched)
        hist_index.input[0] = stirrer.output.result
        wanted = Constant(
            table=PsDict({"_25": 25.0, "_50": 50.0, "_75": 75.0}), scheduler=sched
        )
        percentiles = Percentiles(accuracy=accuracy, scheduler=sched)
        percentiles.input[0] = stirrer.output.result
        percentiles.input.percentiles = wanted.output.result
        percentiles.input.hist = hist_index.output.result
        printer = Print(proc=self.terse, scheduler=sched)
        printer.input[0] = percentiles.output.result
    aio.run(sched.start())
    pdict = notNone(percentiles.table.last()).to_dict()
    values = stirrer.table.to_array(columns=["_1"]).reshape(-1)
    p25 = np.percentile(values, 25.0)  # type: ignore
    p50 = np.percentile(values, 50.0)  # type: ignore
    p75 = np.percentile(values, 75.0)  # type: ignore
    print(
        "Table=> accuracy: ", accuracy, " 25:", p25, pdict["_25"],
        " 50:", p50, pdict["_50"], " 75:", p75, pdict["_75"],
    )
    self.assertAlmostEqual(p25, pdict["_25"], delta=0.01)
    self.assertAlmostEqual(p50, pdict["_50"], delta=0.01)
    self.assertAlmostEqual(p75, pdict["_75"], delta=0.01)
def run_step(self, run_number: int, step_size: int, howlong: float) -> ReturnRunStep:
    """Maintain a running per-column max of the input table in ``self.result``.

    Upstream updates or deletions invalidate the accumulated maxima, so the
    slot and the result are reset and the table is reprocessed from scratch.
    """
    slot = self.get_input_slot("table")
    if slot.updated.any() or slot.deleted.any():
        # A max cannot be maintained incrementally under updates/deletes:
        # restart from the beginning of the table.
        slot.reset()
        if self.result is not None:
            self.psdict.clear()  # resize(0)
        slot.update(run_number)
    indices = slot.created.next(step_size)
    steps = indices_len(indices)
    if steps == 0:
        return self._return_run_step(self.state_blocked, steps_run=0)
    data = slot.data()
    # Column-wise max over just the new rows; keepdims=False yields scalars.
    op = data.loc[fix_loc(indices)].max(keepdims=False)
    if self.result is None:
        # First batch: seed the running maxima.
        self.result = PsDict(op)
    else:
        # Merge the batch maxima into the running maxima, key by key.
        for k, v in self.psdict.items():
            self.result[k] = np.maximum(op[k], v)
    return self._return_run_step(self.next_state(slot), steps_run=steps)
def _impl_tst_percentiles(self, accuracy: float) -> None:
    """Percentiles over a random table must match numpy within the accuracy budget."""
    sched = self.scheduler()
    with sched:
        source = RandomTable(2, rows=10000, scheduler=sched)
        hist_index = HistogramIndex(column="_1", scheduler=sched)
        hist_index.input[0] = source.output.result
        wanted = Constant(
            table=PsDict({"_25": 25.0, "_50": 50.0, "_75": 75.0}), scheduler=sched
        )
        percentiles = Percentiles(accuracy=accuracy, scheduler=sched)
        percentiles.input[0] = source.output.result
        percentiles.input.percentiles = wanted.output.result
        percentiles.input.hist = hist_index.output.result
        printer = Print(proc=self.terse, scheduler=sched)
        printer.input[0] = percentiles.output.result
    aio.run(sched.start())
    last = percentiles.table.last()
    assert last is not None
    pdict = last.to_dict()
    values = source.table["_1"].values
    p25 = np.percentile(values, 25.0)  # type: ignore
    p50 = np.percentile(values, 50.0)  # type: ignore
    p75 = np.percentile(values, 75.0)  # type: ignore
    print(
        "Table=> accuracy: ", accuracy, " 25:", p25, pdict["_25"],
        " 50:", p50, pdict["_50"], " 75:", p75, pdict["_75"],
    )
    self.assertAlmostEqual(p25, pdict["_25"], delta=0.01)
    self.assertAlmostEqual(p50, pdict["_50"], delta=0.01)
    self.assertAlmostEqual(p75, pdict["_75"], delta=0.01)
def run_step(self, run_number: int, step_size: int, howlong: float) -> ReturnRunStep:
    """Join two input dictionaries into ``self.result`` (second wins on key clashes).

    Change tracking on both slots is drained wholesale; the join always reads
    the full current content of both inputs.
    """
    first_slot = self.get_input_slot("first")
    second_slot = self.get_input_slot("second")
    assert first_slot is not None and second_slot is not None
    first_dict = first_slot.data()
    second_dict = second_slot.data()
    if first_dict is None or second_dict is None:
        # Can't join until both inputs have produced data.
        return self._return_run_step(self.state_blocked, steps_run=0)
    # Drain change bookkeeping on both slots; the merge below reads full dicts.
    first_slot.created.next()
    second_slot.created.next()
    first_slot.updated.next()
    second_slot.updated.next()
    first_slot.deleted.next()
    second_slot.deleted.next()
    if self.result is None:
        # BUG FIX: the previous PsDict(**first_dict, **second_dict) raised
        # TypeError on overlapping keys (duplicate keyword argument) and
        # required str keys, whereas the update branch below silently lets
        # the second input win.  Merge with dict unpacking so both code
        # paths behave identically.
        self.result = PsDict({**first_dict, **second_dict})
    else:
        self.psdict.update(first_dict)
        self.psdict.update(second_dict)
    return self._return_run_step(self.next_state(first_slot), steps_run=1)
def test_min_max_scaler_tol(self):
    """MinMaxScaler driven by a delta/ignore_max control keeps output in [0, 1]
    and reports how many rows were ignored."""
    sched = self.scheduler()
    _, path = tf.mkstemp()
    print(path)
    df2.to_csv(path, index=False)
    cols = ["A", "B"]
    csv = SimpleCSVLoader(path, usecols=cols, throttle=100, scheduler=sched)
    control = Constant(
        table=PsDict({"delta": -5, "ignore_max": 10}), scheduler=sched
    )
    scaler = MinMaxScaler(reset_threshold=10_000, scheduler=sched)
    scaler.create_dependent_modules(csv)
    scaler.input.control = control.output.result
    printer = Print(proc=self.terse, scheduler=sched)
    printer2 = Print(proc=self.terse, scheduler=sched)
    printer.input[0] = scaler.output.result
    printer2.input[0] = scaler.output.info
    aio.run(sched.start())
    print(scaler._info)
    os.unlink(path)
    # Every non-ignored input row must appear in the scaled output.
    self.assertEqual(len(csv.result) - scaler._info["ignored"], len(scaler.result))
    for col in cols:
        self.assertGreaterEqual(min(scaler.result[col]), 0.0)
        self.assertLessEqual(max(scaler.result[col]), 1.0)
def __init__(self, threshold: int, **kwds: Any) -> None:
    """Store the threshold and start with a result dict flagging a reset."""
    super().__init__(**kwds)
    self._threshold = threshold
    self.result = PsDict(reset=True)
gen_csv(file_name, rows=999999, reset=True) #, header='_0,_1', reset=False) data = CSVLoader(file_name, skipinitialspace=True, header=None, index_col=False, scheduler=s) mbkmeans = MBKMeans(columns=['_0', '_1'], n_clusters=3, batch_size=100, tol=0.01, is_input=False, scheduler=s) sp = MCScatterPlot(scheduler=s, classes=[('Scatterplot', '_0', '_1', mbkmeans)]) sp.create_dependent_modules(data, 'table') sp['Scatterplot'].min_value._table = PsDict({'_0': -np.inf, '_1': -np.inf}) sp['Scatterplot'].max_value._table = PsDict({'_0': np.inf, '_1': np.inf}) mbkmeans.input.table = sp['Scatterplot'].range_query_2d.output.table #mbkmeans.input.table = data.output.table mbkmeans.create_dependent_modules() sp.move_point = mbkmeans.moved_center # for input management def myprint(d): if d['convergence'] != 'unknown': print(d) else: print('.', end='') prn = Every(scheduler=s, proc=print)
# NOTE(review): fragment of a demo script; the first statements read `i` and
# `cname`, so they appear to be the body of an enclosing
# `for i in range(n_clusters)` loop whose header precedes this chunk —
# confirm indentation against the full file.
filt = MBKMeansFilter(i)
filt.create_dependent_modules(mbkmeans, data, 'table')
classes.append({
    'name': cname,
    'x_column': '_0',
    'y_column': '_1',
    # Only the first cluster class carries the sample module.
    'sample': mbkmeans if i == 0 else None,
    'input_module': filt,
    'input_slot': 'table'
})
sp = MCScatterPlot(scheduler=s, classes=classes)
sp.create_dependent_modules()
# Open every per-cluster range query to the whole plane.
for i in range(n_clusters):
    cname = f"k{i}"
    sp[cname].min_value._table = PsDict({'_0': -np.inf, '_1': -np.inf})
    sp[cname].max_value._table = PsDict({'_0': np.inf, '_1': np.inf})
mbkmeans.input.table = data.output.table
mbkmeans.create_dependent_modules()
sp.move_point = mbkmeans.moved_center  # for input management


def myprint(d):
    # Print convergence info once known; otherwise emit a progress dot.
    if d['convergence'] != 'unknown':
        print(d)
    else:
        print('.', end='')


# NOTE(review): `myprint` is defined above but `prn` uses the builtin `print`
# — possibly intentional, but `proc=myprint` may have been meant; confirm.
prn = Every(scheduler=s, proc=print)
prn.input.df = mbkmeans.output.conv
def categ_as_vega_dataset(categs: PsDict):
    """Convert a category->count mapping into a Vega-style list of records."""
    records = []
    for category, count in categs.items():
        records.append({"category": category, "count": count})
    return records