def test_hist_index_min_max(self) -> None:
     "Test min_out and max_out on HistogramIndex"
     s = self.scheduler()
     with s:
         random = RandomTable(2, rows=100000, scheduler=s)
         t_min = PsDict({"_1": 0.3})
         min_value = Constant(table=t_min, scheduler=s)
         t_max = PsDict({"_1": 0.8})
         max_value = Constant(table=t_max, scheduler=s)
         range_qry = RangeQuery(column="_1", scheduler=s)
         range_qry.create_dependent_modules(
             random, "result", min_value=min_value, max_value=max_value
         )
         prt = Print(proc=self.terse, scheduler=s)
         prt.input[0] = range_qry.output.result
         hist_index = range_qry.hist_index
         assert hist_index is not None
         min_ = Min(name="min_" + str(hash(hist_index)), scheduler=s)
         min_.input[0] = hist_index.output.min_out
         prt2 = Print(proc=self.terse, scheduler=s)
         prt2.input[0] = min_.output.result
         max_ = Max(name="max_" + str(hash(hist_index)), scheduler=s)
         max_.input[0] = hist_index.output.max_out
         pr3 = Print(proc=self.terse, scheduler=s)
         pr3.input[0] = max_.output.result
     aio.run(s.start())
     res1 = cast(float, random.table.min()["_1"])
     res2 = cast(float, min_.psdict["_1"])
     self.assertAlmostEqual(res1, res2)
     res1 = cast(float, random.table.max()["_1"])
     res2 = cast(float, max_.psdict["_1"])
     self.assertAlmostEqual(res1, res2)
Example #2
 def test_init_dict(self) -> None:
     d1 = PsDict(a=1, b=2, c=3)
     other = dict(a=1, b=2, c=3)
     d2 = PsDict(other)
     self.assertEqual(d1, d2)
     d3 = PsDict(other, x=8, y=5)
     self.assertEqual(len(d3), 5)
    def _impl_stirred_tst_percentiles_rq(self, accuracy: float,
                                         **kw: Any) -> None:
        """ """
        s = self.scheduler()
        with s:
            random = RandomTable(2, rows=10000, scheduler=s)
            stirrer = Stirrer(update_column="_2",
                              fixed_step_size=1000,
                              scheduler=s,
                              **kw)
            stirrer.input[0] = random.output.result
            t_min = PsDict({"_1": 0.3})
            min_value = Constant(table=t_min, scheduler=s)
            t_max = PsDict({"_1": 0.8})
            max_value = Constant(table=t_max, scheduler=s)
            range_qry = RangeQuery(column="_1", scheduler=s)
            range_qry.create_dependent_modules(stirrer,
                                               "result",
                                               min_value=min_value,
                                               max_value=max_value)

            hist_index = range_qry.hist_index
            assert hist_index
            t_percentiles = PsDict({"_25": 25.0, "_50": 50.0, "_75": 75.0})
            which_percentiles = Constant(table=t_percentiles, scheduler=s)
            percentiles = Percentiles(accuracy=accuracy, scheduler=s)
            percentiles.input[0] = range_qry.output.result
            percentiles.input.percentiles = which_percentiles.output.result
            percentiles.input.hist = hist_index.output.result
            prt = Print(proc=self.terse, scheduler=s)
            prt.input[0] = percentiles.output.result
        aio.run(s.start())
        pdict = notNone(percentiles.table.last()).to_dict()
        v = range_qry.table["_1"].values
        p25 = np.percentile(v, 25.0)  # type: ignore
        p50 = np.percentile(v, 50.0)  # type: ignore
        p75 = np.percentile(v, 75.0)  # type: ignore
        print(
            "TSV=> accuracy: ",
            accuracy,
            " 25:",
            p25,
            pdict["_25"],
            " 50:",
            p50,
            pdict["_50"],
            " 75:",
            p75,
            pdict["_75"],
        )
        self.assertAlmostEqual(p25, pdict["_25"], delta=0.01)
        self.assertAlmostEqual(p50, pdict["_50"], delta=0.01)
        self.assertAlmostEqual(p75, pdict["_75"], delta=0.01)
 def test_range_query_min_max(self) -> None:
     "Test min and max on RangeQuery output"
     s = self.scheduler()
     with s:
         random = RandomTable(2, rows=100000, scheduler=s)
         t_min = PsDict({"_1": 0.3})
         t_max = PsDict({"_1": 0.8})
         range_qry = self._query_min_max_impl(random, t_min, t_max, s)
     aio.run(s.start())
     min_data = range_qry.output.min.data()
     max_data = range_qry.output.max.data()
     self.assertAlmostEqual(min_data["_1"], 0.3)
     self.assertAlmostEqual(max_data["_1"], 0.8)
 def test_range_query_min_max2(self) -> None:
     "Test min and max on RangeQuery output"
     s = self.scheduler()
     with s:
         random = RandomTable(2, rows=100000, scheduler=s)
         t_min = PsDict({"_1": 0.0})
         t_max = PsDict({"_1": float("nan")})
         range_qry = self._query_min_max_impl(random, t_min, t_max, s)
     aio.run(s.start())
     min_data = cast(PsDict, range_qry.output.min.data())
     max_data = range_qry.output.max.data()
     min_rand = random.table.min()["_1"]
     self.assertAlmostEqual(min_data["_1"], min_rand, delta=0.0001)
     self.assertAlmostEqual(max_data["_1"], 1.0, delta=0.0001)
Example #6
 def test_ps_dict_new_ids(self) -> None:
     prev = PsDict(a=1, b=2, c=3)
     now = copy.copy(prev)
     now["x"] = 10
     now["y"] = 20
     new_ids = now.created_indices(prev)
     self.assertEqual(bitmap(new_ids), bitmap([3, 4]))
Example #7
 def test_ps_dict_updated_ids(self) -> None:
     prev = PsDict(a=1, b=2, c=3, d=4, e=5)
     now = copy.copy(prev)
     updated_ids = now.updated_indices(prev)
     self.assertEqual(bitmap(updated_ids), bitmap())
     now["b"] += 1
     now["d"] *= 2
     updated_ids = now.updated_indices(prev)
     self.assertEqual(bitmap(updated_ids), bitmap([1, 3]))
Example #8
 def test_ps_dict_deleted_ids(self) -> None:
     prev = PsDict(a=1, b=2, c=3, d=4, e=5)
     now = copy.copy(prev)
     deleted_ids = now.deleted_indices(prev)
     self.assertEqual(bitmap(deleted_ids), bitmap())
     del now["b"]
     now["c"] *= 3
     deleted_ids = now.deleted_indices(prev)
     updated_ids = now.updated_indices(prev)
     self.assertEqual(bitmap(deleted_ids), bitmap([1]))
     self.assertEqual(bitmap(updated_ids), bitmap([2]))
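Taken together, the three tests above exercise PsDict's snapshot-diff protocol: copy the dict, mutate the copy, then ask it which key indices were created, updated, or deleted relative to the snapshot (keys are numbered in insertion order). A minimal recap sketch, assuming only the imports these tests already use (copy, PsDict, bitmap):

prev = PsDict(a=1, b=2, c=3)    # keys a, b, c -> indices 0, 1, 2
now = copy.copy(prev)
now["b"] += 1                   # in-place update of an existing key
assert bitmap(now.updated_indices(prev)) == bitmap([1])
now["d"] = 4                    # a new key takes the next free index
assert bitmap(now.created_indices(prev)) == bitmap([3])
assert bitmap(now.deleted_indices(prev)) == bitmap()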
 def _range_query_impl(self, lo: float, up: float) -> None:
     "Run tests of the RangeQuery module"
     s = self.scheduler()
     with s:
         random = RandomTable(2, rows=1000, scheduler=s)
         t_min = PsDict({"_1": lo})
         min_value = Constant(table=t_min, scheduler=s)
         t_max = PsDict({"_1": up})
         max_value = Constant(table=t_max, scheduler=s)
         range_qry = RangeQuery(column="_1", scheduler=s)
         range_qry.create_dependent_modules(
             random, "result", min_value=min_value, max_value=max_value
         )
         prt = Print(proc=self.terse, scheduler=s)
         prt.input[0] = range_qry.output.result
     aio.run(s.start())
     assert range_qry.input_module is not None
     idx = (
         range_qry.input_module.output["result"]
         .data()
         .eval(f"(_1>{lo})&(_1<{up})", result_object="index")
     )
     self.assertEqual(range_qry.table.index, bitmap(idx))
 def run_step(self, run_number: int, step_size: int,
              howlong: float) -> ReturnRunStep:
     assert self.context
     with self.context as ctx:
         indices = ctx.table.created.next(step_size)  # returns a slice
         steps = indices_len(indices)
         input_df = ctx.table.data()
         op = input_df.loc[fix_loc(indices)].max(keepdims=False)
         if self.result is None:
             self.result = PsDict(op)
         else:
             for k, v in self.psdict.items():
                 self.result[k] = np.maximum(op[k], v)
         return self._return_run_step(self.next_state(ctx.table),
                                      steps_run=steps)
 def _impl_stirred_tst_percentiles(self, accuracy: float,
                                   **kw: Any) -> None:
     """ """
     s = self.scheduler()
     with s:
         random = RandomTable(2, rows=10000, scheduler=s)
         stirrer = Stirrer(update_column="_2",
                           fixed_step_size=1000,
                           scheduler=s,
                           **kw)
         stirrer.input[0] = random.output.result
         hist_index = HistogramIndex(column="_1", scheduler=s)
         hist_index.input[0] = stirrer.output.result
         t_percentiles = PsDict({"_25": 25.0, "_50": 50.0, "_75": 75.0})
         which_percentiles = Constant(table=t_percentiles, scheduler=s)
         percentiles = Percentiles(accuracy=accuracy, scheduler=s)
         percentiles.input[0] = stirrer.output.result
         percentiles.input.percentiles = which_percentiles.output.result
         percentiles.input.hist = hist_index.output.result
         prt = Print(proc=self.terse, scheduler=s)
         prt.input[0] = percentiles.output.result
     aio.run(s.start())
     pdict = notNone(percentiles.table.last()).to_dict()
     # v = random.table()['_1'].values
     # from nose.tools import set_trace; set_trace()
     v = stirrer.table.to_array(columns=["_1"]).reshape(-1)
     p25 = np.percentile(v, 25.0)  # type: ignore
     p50 = np.percentile(v, 50.0)  # type: ignore
     p75 = np.percentile(v, 75.0)  # type: ignore
     print(
         "Table=> accuracy: ",
         accuracy,
         " 25:",
         p25,
         pdict["_25"],
         " 50:",
         p50,
         pdict["_50"],
         " 75:",
         p75,
         pdict["_75"],
     )
     # from nose.tools import set_trace; set_trace()
     self.assertAlmostEqual(p25, pdict["_25"], delta=0.01)
     self.assertAlmostEqual(p50, pdict["_50"], delta=0.01)
     self.assertAlmostEqual(p75, pdict["_75"], delta=0.01)
 def run_step(self, run_number: int, step_size: int,
              howlong: float) -> ReturnRunStep:
     slot = self.get_input_slot("table")
     if slot.updated.any() or slot.deleted.any():
         slot.reset()
         if self.result is not None:
             self.psdict.clear()  # resize(0)
         slot.update(run_number)
     indices = slot.created.next(step_size)
     steps = indices_len(indices)
     if steps == 0:
         return self._return_run_step(self.state_blocked, steps_run=0)
     data = slot.data()
     op = data.loc[fix_loc(indices)].max(keepdims=False)
     if self.result is None:
         self.result = PsDict(op)
     else:
         for k, v in self.psdict.items():
             self.result[k] = np.maximum(op[k], v)
     return self._return_run_step(self.next_state(slot), steps_run=steps)
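The fold at the end of run_step is what makes this Max incremental: the per-chunk maximum op is merged into the running result key by key with np.maximum. A standalone sketch of that merge on plain dicts (the variable names here are illustrative, not part of the module):

import numpy as np

running = {"_1": 0.42, "_2": 0.91}     # running max over all chunks seen so far
chunk_max = {"_1": 0.57, "_2": 0.88}   # max of the newly processed rows only
for k, v in running.items():
    running[k] = np.maximum(chunk_max[k], v)
# running is now {'_1': 0.57, '_2': 0.91}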
 def _impl_tst_percentiles(self, accuracy: float) -> None:
     """ """
     s = self.scheduler()
     with s:
         random = RandomTable(2, rows=10000, scheduler=s)
         hist_index = HistogramIndex(column="_1", scheduler=s)
         hist_index.input[0] = random.output.result
         t_percentiles = PsDict({"_25": 25.0, "_50": 50.0, "_75": 75.0})
         which_percentiles = Constant(table=t_percentiles, scheduler=s)
         percentiles = Percentiles(accuracy=accuracy, scheduler=s)
         percentiles.input[0] = random.output.result
         percentiles.input.percentiles = which_percentiles.output.result
         percentiles.input.hist = hist_index.output.result
         prt = Print(proc=self.terse, scheduler=s)
         prt.input[0] = percentiles.output.result
     aio.run(s.start())
     last = percentiles.table.last()
     assert last is not None
     pdict = last.to_dict()
     v = random.table["_1"].values
     p25 = np.percentile(v, 25.0)  # type: ignore
     p50 = np.percentile(v, 50.0)  # type: ignore
     p75 = np.percentile(v, 75.0)  # type: ignore
     print(
         "Table=> accuracy: ",
         accuracy,
         " 25:",
         p25,
         pdict["_25"],
         " 50:",
         p50,
         pdict["_50"],
         " 75:",
         p75,
         pdict["_75"],
     )
     # from nose.tools import set_trace; set_trace()
     self.assertAlmostEqual(p25, pdict["_25"], delta=0.01)
     self.assertAlmostEqual(p50, pdict["_50"], delta=0.01)
     self.assertAlmostEqual(p75, pdict["_75"], delta=0.01)
Example #14
 def run_step(self, run_number: int, step_size: int,
              howlong: float) -> ReturnRunStep:
     first_slot = self.get_input_slot("first")
     # first_slot.update(run_number)
     second_slot = self.get_input_slot("second")
     assert first_slot is not None and second_slot is not None
     first_dict = first_slot.data()
     second_dict = second_slot.data()
     if first_dict is None or second_dict is None:
         return self._return_run_step(self.state_blocked, steps_run=0)
     # second_slot.update(run_number)
     first_slot.created.next()
     second_slot.created.next()
     first_slot.updated.next()
     second_slot.updated.next()
     first_slot.deleted.next()
     second_slot.deleted.next()
     if self.result is None:
         self.result = PsDict(**first_dict, **second_dict)
     else:
         self.psdict.update(first_dict)
         self.psdict.update(second_dict)
     return self._return_run_step(self.next_state(first_slot), steps_run=1)
Example #15
 def test_min_max_scaler_tol(self):
     s = self.scheduler()
     _, f = tf.mkstemp()
     print(f)
     df2.to_csv(f, index=False)
     cols = ["A", "B"]
     csv = SimpleCSVLoader(f, usecols=cols, throttle=100, scheduler=s)
     cst = Constant(table=PsDict({"delta": -5, "ignore_max": 10}), scheduler=s)
     sc = MinMaxScaler(reset_threshold=10_000, scheduler=s)
     # sc.input[0] = random.output.result
     sc.create_dependent_modules(csv)
     sc.input.control = cst.output.result
     pr = Print(proc=self.terse, scheduler=s)
     pr2 = Print(proc=self.terse, scheduler=s)
     pr.input[0] = sc.output.result
     pr2.input[0] = sc.output.info
     aio.run(s.start())
     print(sc._info)
     os.unlink(f)
     self.assertEqual(len(csv.result) - sc._info["ignored"], len(sc.result))
     for c in cols:
         self.assertGreaterEqual(min(sc.result[c]), 0.0)
         self.assertLessEqual(max(sc.result[c]), 1.0)
Example #16
 def __init__(self, threshold: int, **kwds: Any) -> None:
     super().__init__(**kwds)
     self._threshold = threshold
     self.result = PsDict({"reset": True})
gen_csv(file_name, rows=999999, reset=True)  #, header='_0,_1', reset=False)
data = CSVLoader(file_name,
                 skipinitialspace=True,
                 header=None,
                 index_col=False,
                 scheduler=s)
mbkmeans = MBKMeans(columns=['_0', '_1'],
                    n_clusters=3,
                    batch_size=100,
                    tol=0.01,
                    is_input=False,
                    scheduler=s)
sp = MCScatterPlot(scheduler=s,
                   classes=[('Scatterplot', '_0', '_1', mbkmeans)])
sp.create_dependent_modules(data, 'table')
sp['Scatterplot'].min_value._table = PsDict({'_0': -np.inf, '_1': -np.inf})
sp['Scatterplot'].max_value._table = PsDict({'_0': np.inf, '_1': np.inf})
mbkmeans.input.table = sp['Scatterplot'].range_query_2d.output.table
#mbkmeans.input.table = data.output.table
mbkmeans.create_dependent_modules()
sp.move_point = mbkmeans.moved_center  # for input management


def myprint(d):
    if d['convergence'] != 'unknown':
        print(d)
    else:
        print('.', end='')


prn = Every(scheduler=s, proc=print)
Example #18
    filt = MBKMeansFilter(i)
    filt.create_dependent_modules(mbkmeans, data, 'table')
    classes.append({
        'name': cname,
        'x_column': '_0',
        'y_column': '_1',
        'sample': mbkmeans if i == 0 else None,
        'input_module': filt,
        'input_slot': 'table'
    })

sp = MCScatterPlot(scheduler=s, classes=classes)
sp.create_dependent_modules()
for i in range(n_clusters):
    cname = f"k{i}"
    sp[cname].min_value._table = PsDict({'_0': -np.inf, '_1': -np.inf})
    sp[cname].max_value._table = PsDict({'_0': np.inf, '_1': np.inf})
mbkmeans.input.table = data.output.table
mbkmeans.create_dependent_modules()
sp.move_point = mbkmeans.moved_center  # for input management


def myprint(d):
    if d['convergence'] != 'unknown':
        print(d)
    else:
        print('.', end='')


prn = Every(scheduler=s, proc=print)
prn.input.df = mbkmeans.output.conv
Example #19
def categ_as_vega_dataset(categs: PsDict):
    return [{"category": k, "count": v} for (k, v) in categs.items()]