def test_pattern(self):
        """Run a blobs -> StatsFactory -> Sink dataflow with a scripted input.

        After three scheduler loops ``fake_input_1`` pushes ``matrix_hist``
        into the ``my_dyn_var`` module; ``my_stop`` is registered for the
        fourth loop.  ``matrix_hist`` and ``my_stop`` are not defined in this
        method and are expected to come from the enclosing module.
        """
        sched = self.scheduler()
        n_rows = 1_000
        blob_centers = [(0.1, 0.3, 0.5), (0.7, 0.5, 3.3), (-0.4, -0.3, -11.1)]
        col_names = ["A", "B", "C"]
        with sched:
            blobs = BlobsTable(
                columns=col_names,
                centers=blob_centers,
                cluster_std=0.2,
                rows=n_rows,
                scheduler=sched,
            )
            stats_factory = StatsFactory(input_module=blobs, scheduler=sched)
            stats_factory.create_dependent_modules(var_name="my_dyn_var")
            stats_factory.input.table = blobs.output.result
            out_sink = Sink(scheduler=sched)
            out_sink.input.inp = stats_factory.output.result

        async def fake_input_1(scheduler: Scheduler, rn: int) -> None:
            # Look the dynamic variable up by the name given to the factory.
            dyn_var = scheduler["my_dyn_var"]
            print("from input my_dyn_var")
            await dyn_var.from_input({"matrix": matrix_hist})

        sched.on_loop(my_stop, 4)
        sched.on_loop(fake_input_1, 3)
        aio.run(sched.start())
예제 #2
0
 def test_hist_index_min_max(self) -> None:
     """Check the ``min_out``/``max_out`` outputs of a HistogramIndex.

     The index is the one created by ``RangeQuery.create_dependent_modules``;
     its two outputs feed a ``Min`` and a ``Max`` module whose final values
     are compared with the extrema of column ``_1`` of the random table.
     """
     sched = self.scheduler()
     with sched:
         rand_tbl = RandomTable(2, rows=100000, scheduler=sched)
         lower = Constant(table=PsDict({"_1": 0.3}), scheduler=sched)
         upper = Constant(table=PsDict({"_1": 0.8}), scheduler=sched)
         query = RangeQuery(column="_1", scheduler=sched)
         query.create_dependent_modules(
             rand_tbl, "result", min_value=lower, max_value=upper
         )
         query_printer = Print(proc=self.terse, scheduler=sched)
         query_printer.input[0] = query.output.result
         index = query.hist_index
         assert index is not None
         # The index hash is used to build the names of the Min/Max modules.
         index_tag = str(hash(index))
         min_mod = Min(name="min_" + index_tag, scheduler=sched)
         min_mod.input[0] = index.output.min_out
         min_printer = Print(proc=self.terse, scheduler=sched)
         min_printer.input[0] = min_mod.output.result
         max_mod = Max(name="max_" + index_tag, scheduler=sched)
         max_mod.input[0] = index.output.max_out
         max_printer = Print(proc=self.terse, scheduler=sched)
         max_printer.input[0] = max_mod.output.result
     aio.run(sched.start())
     expected_min = cast(float, rand_tbl.table.min()["_1"])
     observed_min = cast(float, min_mod.psdict["_1"])
     self.assertAlmostEqual(expected_min, observed_min)
     expected_max = cast(float, rand_tbl.table.max()["_1"])
     observed_max = cast(float, max_mod.psdict["_1"])
     self.assertAlmostEqual(expected_max, observed_max)
예제 #3
0
 def test_binary3(self) -> None:
     """Apply ``Binary(np.add)`` to a random table and a random dict.

     Three columns of the table are added to three entries of the dict and
     the module output is compared with ``np.add`` applied directly to the
     same slices.
     """
     sched = self.scheduler()
     n_cols = 10
     table_src = RandomTable(n_cols, rows=100_000, scheduler=sched)
     dict_src = RandomDict(n_cols, scheduler=sched)
     selected = {
         "first": ["_3", "_5", "_7"],
         "second": ["_4", "_6", "_8"],
     }
     binary = Binary(np.add, columns=selected, scheduler=sched)
     binary.input.first = table_src.output.result
     binary.input.second = dict_src.output.result
     printer = Print(proc=self.terse, scheduler=sched)
     printer.input[0] = binary.output.result
     aio.run(sched.start())
     # Positional picks matching the selected names -- presumably the
     # sources name their columns "_1".."_10" in order.
     left = table_src.table.to_array()[:, [2, 4, 6]]
     right = np.array(list(dict_src.psdict.values()))[[3, 5, 7]]
     expected = np.add(left, right)
     actual = binary.table.to_array()
     self.assertTrue(binary.name.startswith("binary_"))
     self.assertTrue(np.allclose(expected, actual, equal_nan=True))
예제 #4
0
 def _tst_10_read_multi_csv_file_compress_with_crash(
         self, file_list: List[str], tag: str) -> None:
     """Load ``file_list`` in two runs sharing the recovery tag ``tag``.

     The first run is stopped through ``sleep_then_stop(s, 4)``; the second
     one uses a clean scheduler and ``recovery=True`` and must end up with
     2000000 rows.

     Args:
         file_list: CSV files handed to both ``CSVLoader`` instances.
         tag: value passed as ``recovery_tag`` to both loaders.
     """
     shared_opts = {"index_col": False, "recovery_tag": tag, "header": None}

     # First run: interrupted.
     sched = self.scheduler()
     first_loader = CSVLoader(file_list, scheduler=sched, **shared_opts)
     self.assertTrue(first_loader.result is None)
     first_sink = Sink(name="sink", scheduler=sched)
     first_sink.input.inp = first_loader.output.result
     stopper = sleep_then_stop(sched, 4)
     aio.run_gather(sched.start(), stopper)
     _close(first_loader)

     # Second run: recovery on a clean scheduler.
     sched = self.scheduler(clean=True)
     second_loader = CSVLoader(
         file_list, recovery=True, scheduler=sched, **shared_opts
     )
     self.assertTrue(second_loader.result is None)
     second_sink = Sink(name="sink", scheduler=sched)
     second_sink.input.inp = second_loader.output.result
     aio.run(sched.start())
     self.assertEqual(len(second_loader.table), 2000000)
예제 #5
0
 def test_load_csv(self) -> None:
     """Wire a CSV loader to min/max modules through ``pv`` function calls.

     After the run, every column's minimum and maximum computed from the
     loaded table must equal the values held by the min and max modules.
     """
     with Scheduler.default:
         loader = pv.load_csv(get_dataset("bigfile"), index_col=False, header=None)
         min_mod = pv.min(loader)
         pv.echo(min_mod, proc=prtm)
         max_mod = pv.max(loader)
         pv.echo(max_mod, proc=prtM)
         max_trace = max_mod["_trace"]
         pv.echo(max_trace, proc=prtT)
         wrapped = loader.module
         assert wrapped is not None
         self.assertEqual(loader.scheduler(), wrapped.scheduler())
     aio.run(loader.scheduler().start())
     loaded = loader.table
     final_min = min_mod.table
     final_max = max_mod.table
     self.assertEqual(len(loaded), 1000000)
     for col_name in loaded.columns:
         column = loaded[col_name]
         self.assertEqual(column.min(), final_min[col_name])
         self.assertEqual(column.max(), final_max[col_name])
예제 #6
0
 def test_intersection(self) -> None:
     """Intersect two ``Bisect`` selections (``_1 > 0.3`` and ``_1 < 0.8``).

     The index of the intersection is compared with the index obtained by
     evaluating the same predicate on the histogram index's input data.
     """
     sched = self.scheduler()
     rand_tbl = RandomTable(2, rows=100000, scheduler=sched)
     low_tbl = Table(name=None, dshape="{_1: float64}", data={"_1": [0.3]})
     low_limit = Constant(table=low_tbl, scheduler=sched)
     high_tbl = Table(name=None, dshape="{_1: float64}", data={"_1": [0.8]})
     high_limit = Constant(table=high_tbl, scheduler=sched)
     index = HistogramIndex(column="_1", scheduler=sched)
     index.create_dependent_modules(rand_tbl, "result")

     above = Bisect(column="_1", op=">", hist_index=index, scheduler=sched)
     above.input[0] = index.output.result
     above.input.limit = low_limit.output.result

     below = Bisect(column="_1", op="<", hist_index=index, scheduler=sched)
     below.input[0] = index.output.result
     below.input.limit = high_limit.output.result

     both = Intersection(scheduler=sched)
     # Both selections are assigned through slot 0, one after the other.
     both.input[0] = above.output.result
     both.input[0] = below.output.result
     printer = Print(proc=self.terse, scheduler=sched)
     printer.input[0] = both.output.result
     aio.run(sched.start())

     assert index.input_module is not None
     source_data = index.input_module.output["result"].data()
     expected_idx = source_data.eval(
         "(_1>0.3)&(_1<0.8)", result_object="index"
     )
     self.assertEqual(both.table.index, bitmap(expected_idx))
예제 #7
0
 def test_06_read_http_multi_csv_bz2_with_crash(self) -> None:
     """Load the same bz2 URL twice over HTTP, with an interrupted first run.

     The first run is stopped through ``sleep_then_stop(s, 3)`` and the HTTP
     server is restarted; a second loader with ``recovery=True`` and the
     same recovery tag must then deliver 2000000 rows.
     """
     self._http_srv = _HttpSrv()
     tag = self.get_tag()
     sched = self.scheduler()
     urls = [make_url("bigfile", ext=BZ2)] * 2
     shared_opts = {"index_col": False, "recovery_tag": tag, "header": None}

     # First run: interrupted.
     first_loader = CSVLoader(urls, scheduler=sched, **shared_opts)
     self.assertTrue(first_loader.result is None)
     first_sink = Sink(name="sink", scheduler=sched)
     first_sink.input.inp = first_loader.output.result
     stopper = sleep_then_stop(sched, 3)
     aio.run_gather(sched.start(), stopper)
     self._http_srv.restart()

     # Second run: recovery on a clean scheduler.
     sched = self.scheduler(clean=True)
     second_loader = CSVLoader(
         urls, recovery=True, scheduler=sched, **shared_opts
     )
     self.assertTrue(second_loader.result is None)
     second_sink = Sink(name="sink", scheduler=sched)
     second_sink.input.inp = second_loader.output.result
     aio.run(sched.start())
     self.assertEqual(len(second_loader.table), 2000000)
예제 #8
0
    def t_num_expr_impl(self, cls: Type[NumExprABC]) -> Tuple[Any, ...]:
        """Run a ``NumExprABC`` subclass on two random tables and check it.

        Output columns 0 and 1 of the module are compared with
        ``first_2+2*second_3`` and ``first_3-5*second_2`` evaluated by
        numexpr on the source arrays.

        Args:
            cls: module class under test.

        Returns:
            The four source columns ``(first_2, first_3, second_2, second_3)``.
        """
        sched = self.scheduler()
        left_src = RandomTable(10, rows=100000, scheduler=sched)
        right_src = RandomTable(10, rows=100000, scheduler=sched)
        selected = {
            "first": ["_1", "_2", "_3"],
            "second": ["_1", "_2", "_3"],
        }
        expr_mod = cls(columns=selected, scheduler=sched)

        expr_mod.input.first = left_src.output.result
        expr_mod.input.second = right_src.output.result
        printer = Print(proc=self.terse, scheduler=sched)
        printer.input[0] = expr_mod.output.result
        aio.run(sched.start())

        left_arr = left_src.table.to_array()
        right_arr = right_src.table.to_array()
        # NOTE: the expression strings below refer to these four locals by
        # name, so they must keep exactly these names.
        first_2, first_3 = left_arr[:, 1], left_arr[:, 2]
        second_2, second_3 = right_arr[:, 1], right_arr[:, 2]
        expected_0 = ne.evaluate("first_2+2*second_3")
        expected_1 = ne.evaluate("first_3-5*second_2")
        actual = expr_mod.table.to_array()
        self.assertTrue(np.allclose(actual[:, 0], expected_0, equal_nan=True))
        self.assertTrue(np.allclose(actual[:, 1], expected_1, equal_nan=True))
        return first_2, first_3, second_2, second_3
예제 #9
0
 def test_bisect2(self) -> None:
     """Bisect ``_1 > 0.5`` downstream of a row-deleting ``Stirrer``.

     The resulting index must equal the one obtained by evaluating the same
     predicate on the stirrer's final table.
     """
     sched = self.scheduler()
     rand_tbl = RandomTable(2, rows=100_000, scheduler=sched)
     stirrer = Stirrer(
         update_column="_1",
         delete_rows=100,
         scheduler=sched,
     )
     stirrer.input[0] = rand_tbl.output.result
     # NOTE(review): the column is declared ``string`` although it holds the
     # float 0.5 -- kept as in the original; confirm this is intended.
     limit_tbl = Table(
         name=None, dshape="{value: string}", data={"value": [0.5]}
     )
     limit = Constant(table=limit_tbl, scheduler=sched)
     index = HistogramIndex(column="_1", scheduler=sched)
     index.create_dependent_modules(stirrer, "result")
     bisect_mod = Bisect(
         column="_1", op=">", hist_index=index, scheduler=sched
     )
     bisect_mod.input[0] = index.output.result
     bisect_mod.input.limit = limit.output.result
     printer = Print(proc=self.terse, scheduler=sched)
     printer.input[0] = bisect_mod.output.result
     aio.run(sched.start())
     expected_idx = stirrer.table.eval("_1>0.5", result_object="index")
     self.assertEqual(bisect_mod.table.index, bitmap(expected_idx))
예제 #10
0
    def t_mix_ufunc_impl(
        self,
        cls: Type[MixUfuncABC],
        ufunc1: np.ufunc = np.log,
        ufunc2: np.ufunc = np.add,
    ) -> None:
        """Run a ``MixUfuncABC`` subclass on two random tables and check it.

        Output column 0 must match ``ufunc2`` applied to column index 1 of
        the first table and column index 2 of the second; output column 1
        must match ``ufunc1`` applied to column index 2 of the second table.

        Args:
            cls: module class under test.
            ufunc1: unary ufunc expected for output column 1.
            ufunc2: binary ufunc expected for output column 0.
        """
        sched = self.scheduler()
        left_src = RandomTable(10, rows=100000, scheduler=sched)
        right_src = RandomTable(10, rows=100000, scheduler=sched)
        selected = {
            "first": ["_1", "_2", "_3"],
            "second": ["_1", "_2", "_3"],
        }
        mix_mod = cls(columns=selected, scheduler=sched)

        mix_mod.input.first = left_src.output.result
        mix_mod.input.second = right_src.output.result
        printer = Print(proc=self.terse, scheduler=sched)
        printer.input[0] = mix_mod.output.result
        aio.run(sched.start())

        left_col = left_src.table.to_array()[:, 1]
        right_col = right_src.table.to_array()[:, 2]
        expected_binary = ufunc2(left_col, right_col).astype("float64")
        expected_unary = ufunc1(right_col).astype("float64")
        actual = mix_mod.table.to_array()
        self.assertTrue(
            np.allclose(actual[:, 0], expected_binary, equal_nan=True)
        )
        self.assertTrue(
            np.allclose(actual[:, 1], expected_unary, equal_nan=True)
        )
예제 #11
0
 def test_merge_simple(self) -> None:
     """Merge two one-row constant tables on their index.

     The last row of the merged table must carry the four values supplied
     by the two inputs.
     """
     sched = self.scheduler()
     x_frame = pd.DataFrame({"xmin": [1], "xmax": [2]})
     x_const = Constant(Table(name=None, data=x_frame), scheduler=sched)
     y_frame = pd.DataFrame({"ymin": [3], "ymax": [4]})
     y_const = Constant(Table(name=None, data=y_frame), scheduler=sched)
     merger = Merge(left_index=True, right_index=True, scheduler=sched)
     # Both constants are assigned through slot 0, one after the other.
     merger.input[0] = x_const.output.result
     merger.input[0] = y_const.output.result
     printer = Print(proc=self.terse, scheduler=sched)
     printer.input[0] = merger.output.result
     aio.run(sched.start())
     _ = merger.trace_stats(max_runs=1)
     merged = merger.table
     last_row = merged.loc[merged.index[-1]]
     assert last_row is not None
     self.assertTrue(
         last_row["xmin"] == 1
         and last_row["xmax"] == 2
         and last_row["ymin"] == 3
         and last_row["ymax"] == 4
     )
예제 #12
0
    def t_mix_ufunc_table_dict_impl(self, cls: Type[MixUfuncABC]) -> None:
        """Run a ``MixUfuncABC`` subclass on a random dict and a random table.

        Output column 0 must match ``np.add`` of the dict's second value and
        column index 2 of the table; output column 1 must match ``np.log``
        of that same table column.

        Args:
            cls: module class under test.
        """
        sched = self.scheduler()
        dict_src = RandomDict(10, scheduler=sched)
        table_src = RandomTable(10, rows=100000, scheduler=sched)
        selected = {
            "first": ["_1", "_2", "_3"],
            "second": ["_1", "_2", "_3"],
        }
        mix_mod = cls(columns=selected, scheduler=sched)

        mix_mod.input.first = dict_src.output.result
        mix_mod.input.second = table_src.output.result
        printer = Print(proc=self.terse, scheduler=sched)
        printer.input[0] = mix_mod.output.result
        aio.run(sched.start())

        dict_value = list(dict_src.psdict.values())[1]
        table_col = table_src.table.to_array()[:, 2]
        expected_add = np.add(dict_value, table_col)
        expected_log = np.log(table_col)
        actual = mix_mod.table.to_array()
        self.assertTrue(np.allclose(actual[:, 0], expected_add, equal_nan=True))
        self.assertTrue(np.allclose(actual[:, 1], expected_log, equal_nan=True))
예제 #13
0
 def test_06_read_multiple_csv_bz2_crash_recovery(self) -> None:
     """Load two bz2 URLs served by a throttled HTTP server process.

     The file names reach the loader through its ``filenames`` input and
     the loader runs with ``timeout=0.01``; 60000 rows are expected.
     """
     server = Process(target=run_throttled_server, args=(8000, 10**6))
     server.start()
     self._http_proc = server
     # Presumably gives the server process time to come up.
     time.sleep(SLEEP)
     sched = self.scheduler()
     urls = [
         make_url("smallfile", ext=BZ2),
         make_url("smallfile", ext=BZ2),
     ]
     name_table = Table(
         name="file_names",
         dshape="{filename: string}",
         data={"filename": urls},
     )
     names = Constant(table=name_table, scheduler=sched)
     loader = CSVLoader(
         index_col=False, header=None, scheduler=sched, timeout=0.01
     )
     loader.input.filenames = names.output.result
     out_sink = Sink(name="sink", scheduler=sched)
     out_sink.input.inp = loader.output.result
     aio.run(loader.start())
     _close(loader)
     self.assertEqual(len(loader.table), 60000)
예제 #14
0
 def test_idxmax2(self) -> None:
     """Compare ``IdxMax`` with ``Max`` downstream of a ``Stirrer``.

     Both modules read the stirrer output; after the run, the last row of
     the table returned by ``idxmax.max()`` must match the ``Max`` module's
     dictionary according to ``self.compare``.
     """
     s = self.scheduler()
     random = RandomTable(10, rows=10000, throttle=1000, scheduler=s)
     stirrer = Stirrer(update_column="_1",
                       delete_rows=5,
                       fixed_step_size=100,
                       scheduler=s)
     stirrer.input[0] = random.output.result
     idxmax = IdxMax(scheduler=s)
     idxmax.input[0] = stirrer.output.result
     max_ = Max(scheduler=s)
     max_.input[0] = stirrer.output.result
     pr = Print(proc=self.terse, scheduler=s)
     pr.input[0] = idxmax.output.result
     pr2 = Print(proc=self.terse, scheduler=s)
     pr2.input[0] = max_.output.result
     aio.run(s.start())
     max1 = max_.psdict
     # Named ``idxmax_table`` rather than ``max`` so the builtin ``max`` is
     # not shadowed inside this method.
     idxmax_table = idxmax.max()
     assert idxmax_table is not None
     max2 = notNone(idxmax_table.last()).to_dict()
     self.compare(max1, max2)
예제 #15
0
 def test_mv_blobs_table2(self) -> None:
     """Two ``MVBlobsTable`` modules with equal parameters yield equal data.

     The modules only differ by ``default_step_size`` (1500 vs 200); both
     must reach ``sz`` rows and produce arrays that are close to each other.
     ``means`` and ``covs`` come from outside this method.
     """
     sched = self.scheduler()
     sz = 100000
     blob_opts = {"means": means, "covs": covs, "rows": sz, "scheduler": sched}
     fast_blob = MVBlobsTable(["a", "b"], **blob_opts)
     fast_blob.default_step_size = 1500
     slow_blob = MVBlobsTable(["a", "b"], **blob_opts)
     slow_blob.default_step_size = 200
     adder = Add(scheduler=sched)
     adder.input.first = fast_blob.output.result
     adder.input.second = slow_blob.output.result
     length_printer = Every(
         proc=self.terse, constant_time=True, scheduler=sched
     )
     length_printer.input[0] = adder.output.result
     aio.run(sched.start())
     self.assertEqual(len(fast_blob.table), sz)
     self.assertEqual(len(slow_blob.table), sz)
     fast_arr = fast_blob.table.to_array()
     slow_arr = slow_blob.table.to_array()
     self.assertTrue(np.allclose(fast_arr, slow_arr))
예제 #16
0
 def test_histogram1d1(self) -> None:
     """Check ``Histogram1D`` on column ``_2`` of the "bigfile" dataset.

     The last histogram produced by the module is compared bin by bin with
     ``np.histogram`` computed on the same column read through pandas,
     using the bounds reported by the module.
     """
     sched = self.scheduler()
     loader = CSVLoader(
         get_dataset("bigfile"), index_col=False, header=None, scheduler=sched
     )
     min_mod = Min(scheduler=sched)
     min_mod.input[0] = loader.output.result
     max_mod = Max(scheduler=sched)
     max_mod.input[0] = loader.output.result
     histo = Histogram1D("_2", scheduler=sched)
     histo.input[0] = loader.output.result
     histo.input.min = min_mod.output.result
     histo.input.max = max_mod.output.result
     printer = Every(proc=self.terse, scheduler=sched)
     printer.input[0] = histo.output.result
     aio.run(sched.start())
     _ = histo.trace_stats()
     final_row = notNone(histo.table.last()).to_dict()
     module_counts = final_row["array"]
     value_range = (final_row["min"], final_row["max"])
     frame = pd.read_csv(
         get_dataset("bigfile"), header=None, usecols=[2]  # type: ignore
     )
     values = frame.to_numpy().reshape(-1)
     numpy_counts, _ = np.histogram(  # type: ignore
         values, bins=histo.params.bins, density=False, range=value_range
     )
     self.assertListEqual(module_counts.tolist(), numpy_counts.tolist())
예제 #17
0
 def test_hub_if_else(self):
     """Route a stirred table through a ``Switch`` whose condition is False.

     ``Max`` reads the switch's ``result`` output and ``Min`` its
     ``result_else`` output; both feed a ``Hub``.  The hub result is
     compared with the minimum of the stirrer's result.
     """
     sched = Scheduler()
     rand_tbl = RandomTable(2, rows=100000, scheduler=sched)
     stirrer = Stirrer(
         update_column="_1",
         delete_rows=5,
         update_rows=5,
         fixed_step_size=100,
         scheduler=sched,
     )
     stirrer.input[0] = rand_tbl.output.result
     switch = Switch(condition=lambda x: False, scheduler=sched)
     switch.input[0] = stirrer.output.result
     name_tag = str(hash(rand_tbl))
     max_mod = Max(name="max_" + name_tag, scheduler=sched)
     max_mod.input[0] = switch.output.result
     min_mod = Min(name="min_" + name_tag, scheduler=sched)
     min_mod.input[0] = switch.output.result_else
     hub = Hub(scheduler=sched)
     # Both branches are assigned to the hub's ``table`` slot in turn.
     hub.input.table = min_mod.output.result
     hub.input.table = max_mod.output.result
     printer = Print(proc=self.terse, scheduler=sched)
     printer.input[0] = hub.output.result
     aio.run(sched.start())
     expected = stirrer.result.min()
     observed = hub.result
     self.compare(expected, observed)
예제 #18
0
    def t_histogram1d_impl(self, **kw: Any) -> None:
        """Check ``Histogram1D`` on column ``_2`` downstream of a ``Stirrer``.

        The module's last histogram is compared with ``np.histogram``
        computed on column ``_2`` of the stirrer's final table, using the
        bounds reported by the module.

        Args:
            **kw: extra keyword arguments forwarded to ``Stirrer``.
        """
        sched = self.scheduler()
        loader = CSVLoader(
            get_dataset("bigfile"), index_col=False, header=None, scheduler=sched
        )
        stirrer = Stirrer(
            update_column="_2", fixed_step_size=1000, scheduler=sched, **kw
        )
        stirrer.input[0] = loader.output.result
        min_mod = Min(scheduler=sched)
        min_mod.input[0] = stirrer.output.result
        max_mod = Max(scheduler=sched)
        max_mod.input[0] = stirrer.output.result
        histo = Histogram1D("_2", scheduler=sched)
        histo.input[0] = stirrer.output.result
        histo.input.min = min_mod.output.result
        histo.input.max = max_mod.output.result

        printer = Every(proc=self.terse, scheduler=sched)
        printer.input[0] = histo.output.result
        aio.run(sched.start())
        _ = histo.trace_stats()

        final_row = notNone(histo.table.last()).to_dict()
        module_counts = final_row["array"]
        value_range = (final_row["min"], final_row["max"])
        column_view = stirrer.table.loc[:, ["_2"]]
        assert column_view is not None
        values = column_view.to_array().reshape(-1)
        numpy_counts, _ = np.histogram(  # type: ignore
            values, bins=histo.params.bins, density=False, range=value_range
        )
        self.assertEqual(np.sum(module_counts), np.sum(numpy_counts))
        self.assertListEqual(module_counts.tolist(), numpy_counts.tolist())
예제 #19
0
 def test_01_read_http_csv_with_crash_and_counter(self) -> None:
     """Recover an interrupted HTTP CSV load and count the rows.

     The first run is stopped through ``sleep_then_stop(s, 2)`` and the
     HTTP server is restarted.  The second run uses ``recovery=True`` with
     the same tag and feeds a ``Counter``; both the loaded table and the
     counter must report 1000000 rows.
     """
     self._http_srv = _HttpSrv()
     tag = self.get_tag()
     sched = self.scheduler()
     url = make_url("bigfile")
     shared_opts = {"index_col": False, "recovery_tag": tag, "header": None}

     # First run: interrupted.
     first_loader = CSVLoader(url, scheduler=sched, **shared_opts)
     self.assertTrue(first_loader.result is None)
     first_sink = Sink(name="sink", scheduler=sched)
     first_sink.input.inp = first_loader.output.result
     stopper = sleep_then_stop(sched, 2)
     aio.run_gather(sched.start(), stopper)
     self._http_srv.restart()

     # Second run: recovery on a clean scheduler, with a row counter.
     sched = self.scheduler(clean=True)
     second_loader = CSVLoader(
         url, recovery=True, scheduler=sched, **shared_opts
     )
     row_counter = Counter(scheduler=sched)
     row_counter.input[0] = second_loader.output.result
     self.assertTrue(second_loader.result is None)
     second_sink = Sink(name="sink", scheduler=sched)
     second_sink.input.inp = row_counter.output.result
     aio.run(sched.start())
     self.assertEqual(len(second_loader.table), 1000000)
     self.assertEqual(row_counter.table["counter"].loc[0], 1000000)
예제 #20
0
    def test_mb_k_means(self) -> None:
        """Run ``MBKMeans`` with three clusters on the "cluster:s3" dataset.

        After the run, the number of labels returned by ``km.labels()`` must
        equal the number of rows loaded from the CSV file.

        The test is reported as skipped (instead of silently passing) when
        fetching the dataset raises ``TimeoutError``.
        """
        s = self.scheduler()
        n_clusters = 3
        try:
            dataset = (get_dataset("cluster:s3"), )
        except TimeoutError:
            # skipTest raises, so nothing below runs and the test runner
            # records a skip rather than a success.
            self.skipTest("Cannot download cluster:s3")

        with s:
            csv = CSVLoader(
                dataset,
                sep=" ",
                skipinitialspace=True,
                header=None,
                index_col=False,
                scheduler=s,
            )
            km = MBKMeans(
                n_clusters=n_clusters,
                random_state=42,
                is_input=False,
                is_greedy=False,
                scheduler=s,
            )
            # The CSV loader is attached through create_dependent_modules
            # rather than by assigning km.input.table directly.
            km.create_dependent_modules(csv)
            pr = Print(proc=self.terse, scheduler=s)
            pr.input[0] = km.output.result
            e = Every(proc=self.terse, scheduler=s)
            e.input[0] = km.output.labels
        aio.run(s.start())
        labels = km.labels()
        assert labels is not None
        self.assertEqual(len(csv.table), len(labels))
예제 #21
0
 def test_09_read_multi_csv_file_with_crash(self) -> None:
     """Load "bigfile" twice from disk, with an interrupted first run.

     The first run is stopped through ``sleep_then_stop(s, 3)``; a second
     loader with ``recovery=True`` and the same tag ("t9") must then
     deliver 2000000 rows.
     """
     sched = self.scheduler()
     tag = "t9"
     files = [get_dataset("bigfile"), get_dataset("bigfile")]
     shared_opts = {"index_col": False, "recovery_tag": tag, "header": None}

     # First run: interrupted.
     first_loader = CSVLoader(files, scheduler=sched, **shared_opts)
     self.assertTrue(first_loader.result is None)
     first_sink = Sink(name="sink", scheduler=sched)
     first_sink.input.inp = first_loader.output.result
     stopper = sleep_then_stop(sched, 3)
     aio.run_gather(sched.start(), stopper)
     _close(first_loader)

     # Second run: recovery on a clean scheduler.
     sched = self.scheduler(clean=True)
     second_loader = CSVLoader(
         files, recovery=True, scheduler=sched, **shared_opts
     )
     self.assertTrue(second_loader.result is None)
     second_sink = Sink(name="sink", scheduler=sched)
     second_sink.input.inp = second_loader.output.result
     aio.run(sched.start())
     self.assertEqual(len(second_loader.table), 2000000)
예제 #22
0
 def test_paste(self) -> None:
     """Paste the minima of columns ``_1`` and ``_2`` into one table.

     Each minimum goes through its own ``Min`` and ``Dict2Table`` modules;
     the last row of the pasted table must match the table's own minima.
     """
     sched = self.scheduler()
     rand_tbl = RandomTable(10, rows=10000, scheduler=sched)
     # The table hash is used to build the names of the two Min modules.
     name_tag = str(hash(rand_tbl))

     min_first = Min(name="min_1" + name_tag, scheduler=sched, columns=["_1"])
     min_first.input[0] = rand_tbl.output.result
     as_table_first = Dict2Table(scheduler=sched)
     as_table_first.input.dict_ = min_first.output.result

     min_second = Min(name="min_2" + name_tag, scheduler=sched, columns=["_2"])
     min_second.input[0] = rand_tbl.output.result
     as_table_second = Dict2Table(scheduler=sched)
     as_table_second.input.dict_ = min_second.output.result

     paste = Paste(scheduler=sched)
     paste.input.first = as_table_first.output.result
     paste.input.second = as_table_second.output.result
     printer = Print(proc=self.terse, scheduler=sched)
     printer.input[0] = paste.output.result
     aio.run(sched.start())

     expected = rand_tbl.table.min()
     observed = notNone(paste.table.last()).to_dict()
     self.assertAlmostEqual(expected["_1"], observed["_1"])
     self.assertAlmostEqual(expected["_2"], observed["_2"])
예제 #23
0
 def test_join(self) -> None:
     """Join the ``stats`` outputs of two ``Stats`` modules via ``Reduce``.

     ``Reduce.expand`` builds the join from ``BinJoin``.  The test makes no
     assertion; it runs the dataflow and prints the join's trace stats.
     """
     sched = self.scheduler()
     loader = CSVLoader(
         get_dataset("bigfile"), index_col=False, header=None, scheduler=sched
     )
     stats_a = Stats(1, reset_index=True, scheduler=sched)
     stats_a.input[0] = loader.output.result
     stats_b = Stats(2, reset_index=True, scheduler=sched)
     stats_b.input[0] = loader.output.result
     stat_outputs = [stats_a.output.stats, stats_b.output.stats]
     joined = Reduce.expand(
         BinJoin, "first", "second", "table", stat_outputs, scheduler=sched
     )
     printer = Print(proc=self.terse, scheduler=sched)
     printer.input[0] = joined.output.result
     length_printer = Every(
         proc=self.terse, constant_time=True, scheduler=sched
     )
     length_printer.input[0] = loader.output.result
     aio.run(sched.start())
     trace = joined.trace_stats(max_runs=1)
     print(trace)
예제 #24
0
    def test_dataflow_1_dynamic(self) -> None:
        """Add a Max/Print pair to a running dataflow.

        ``_add_max`` is registered with ``on_loop(..., 5)`` and ``self._stop``
        with ``on_loop(..., 10)``.  The test passes when the callback given
        to the dynamically added ``Print`` module has been called.
        """
        sched = self.scheduler(clean=True)

        table = RandomTable(
            name="table", columns=["a"], throttle=1000, scheduler=sched
        )
        min_mod = Min(name="min", scheduler=sched)
        min_printer = Print(proc=self.terse, name="print_min", scheduler=sched)
        min_mod.input.table = table.output.result
        min_printer.input.df = min_mod.output.result
        started = False

        def proc(x: Any) -> None:
            # Records that the dynamically added printer received data.
            nonlocal started
            print("proc max called")
            started = True

        async def _add_max(scheduler: Scheduler, run_number: int) -> None:
            with scheduler:
                print("adding new modules")
                max_mod = Max(name="max", scheduler=scheduler)
                max_printer = Print(
                    name="print_max", proc=proc, scheduler=scheduler
                )
                max_mod.input.table = table.output.result
                max_printer.input.df = max_mod.output.result

        sched.on_loop(_add_max, 5)
        sched.on_loop(self._stop, 10)

        aio.run(sched.start())
        self.assertTrue(started)
예제 #25
0
 def test_piped_load_csv2(self):
     """Wire a CSV loader to min/max modules with a single pipe expression.

     The modules are fetched back by name from the pipe result; after the
     run, every column's minimum and maximum computed from the loaded table
     must equal the values held by the min and max modules.
     """
     with Scheduler.default:
         pipeline = (
             PipedInput(get_dataset("bigfile"))
             | pv.load_csv(index_col=False, header=None)
             | pv.min()
             | pv.echo(proc=prtm).repipe("csv_loader_1")
             | pv.max()
             | pv.echo(proc=prtM).repipe("max_1", out="_trace")
             | pv.echo(proc=prtT)
         )
         min_mod = pipeline.fetch("min_1")
         max_mod = pipeline.fetch("max_1")
         loader = pipeline.fetch("csv_loader_1")
         self.assertEqual(loader.scheduler(), loader.module.scheduler())
     aio.run(loader.scheduler().start())
     loaded = loader.table
     final_min = min_mod.table
     final_max = max_mod.table
     self.assertEqual(len(loaded), 1000000)
     for col_name in loaded.columns:
         column = loaded[col_name]
         self.assertEqual(column.min(), final_min[col_name])
         self.assertEqual(column.max(), final_max[col_name])
예제 #26
0
    def test_dataflow_2_add_remove(self) -> None:
        """Swap a Min/Print pair for a Max/Print pair in a running dataflow.

        ``_add_max_remove_min`` is registered with ``on_loop(..., 5)``: it
        adds the "max" and "print_max" modules and deletes "min" and
        "print_min".  ``self._stop`` is registered with ``on_loop(..., 10)``.
        The test passes when the new printer's callback has been called.
        """
        sched = self.scheduler(clean=True)

        table = RandomTable(
            name="table", columns=["a"], throttle=1000, scheduler=sched
        )
        min_mod = Min(name="min", scheduler=sched)
        min_printer = Print(proc=self.terse, name="print_min", scheduler=sched)
        min_mod.input.table = table.output.result
        min_printer.input.df = min_mod.output.result
        started = False

        def proc(x: Any) -> None:
            # Records that the dynamically added printer received data.
            nonlocal started
            print("proc max called")
            started = True

        async def _add_max_remove_min(scheduler: Scheduler,
                                      run_number: int) -> None:
            with scheduler as dataflow:
                print("adding new modules")
                max_mod = Max(name="max", scheduler=scheduler)
                max_printer = Print(
                    name="print_max", proc=proc, scheduler=scheduler
                )
                max_mod.input.table = table.output.result
                max_printer.input.df = max_mod.output.result
                print("removing min module")
                dataflow.delete_modules("min", "print_min")

        sched.on_loop(_add_max_remove_min, 5)
        sched.on_loop(self._stop, 10)
        aio.run(sched.start())
        self.assertTrue(started)
예제 #27
0
 def test_ldexp(self) -> None:
     """Check ``ColsLdexp`` against ``np.ldexp`` on an integer random table.

     The module reads columns ``_3/_5/_7`` and ``_4/_6/_8`` of one table and
     writes columns ``x``, ``y``, ``z``; its output is compared with
     ``np.ldexp`` applied to the matching array slices.
     """
     cls, ufunc, mod_name = ColsLdexp, np.ldexp, "cols_ldexp_"
     print("Testing", mod_name)
     sched = self.scheduler()
     n_cols = 10
     int_table = RandomTable(
         n_cols,
         rows=10_000,
         scheduler=sched,
         random=lambda x: np.random.randint(10, size=x),  # type: ignore
         dtype="int64",
     )
     ldexp_mod = cls(
         first=["_3", "_5", "_7"],
         second=["_4", "_6", "_8"],
         cols_out=["x", "y", "z"],
         scheduler=sched,
     )
     ldexp_mod.input[0] = int_table.output.result
     printer = Print(proc=self.terse, scheduler=sched)
     printer.input[0] = ldexp_mod.output.result
     aio.run(sched.start())
     self.assertListEqual(ldexp_mod.table.columns, ["x", "y", "z"])
     source = int_table.table.to_array()
     mantissas = source[:, [2, 4, 6]]
     exponents = source[:, [3, 5, 7]]
     expected = ufunc(mantissas, exponents)
     actual = ldexp_mod.table.to_array()
     self.assertTrue(ldexp_mod.name.startswith(mod_name))
     self.assertTrue(np.allclose(expected, actual, equal_nan=True))
예제 #28
0
 def test_blobs_table2(self) -> None:
     """Two ``BlobsTable`` modules with equal parameters yield equal data.

     The modules only differ by ``default_step_size`` (1500 vs 200); both
     must reach ``sz`` rows and produce arrays that are close to each other.
     """
     sched = self.scheduler()
     sz = 100000
     centers = [(0.1, 0.3), (0.7, 0.5), (-0.4, -0.3)]
     blob_opts = {
         "centers": centers,
         "cluster_std": 0.2,
         "rows": sz,
         "scheduler": sched,
     }
     fast_blob = BlobsTable(["a", "b"], **blob_opts)
     fast_blob.default_step_size = 1500
     slow_blob = BlobsTable(["a", "b"], **blob_opts)
     slow_blob.default_step_size = 200
     adder = Add(scheduler=sched)
     adder.input.first = fast_blob.output.result
     adder.input.second = slow_blob.output.result
     length_printer = Every(
         proc=self.terse, constant_time=True, scheduler=sched
     )
     length_printer.input[0] = adder.output.result
     aio.run(sched.start())
     self.assertEqual(len(fast_blob.table), sz)
     self.assertEqual(len(slow_blob.table), sz)
     fast_arr = fast_blob.table.to_array()
     slow_arr = slow_blob.table.to_array()
     self.assertTrue(np.allclose(fast_arr, slow_arr))
예제 #29
0
 def _t_impl(self, cls: Type[TableModule], ufunc: np.ufunc,
             mod_name: str) -> None:
     """Check a two-input table module against a NumPy ufunc.

     Two three-column integer random tables feed the ``first`` and
     ``second`` inputs of ``cls``; the module output must be close to
     ``ufunc`` applied to the two source arrays.

     Args:
         cls: module class under test.
         ufunc: NumPy ufunc giving the expected result.
         mod_name: prefix the created module's name must start with.
     """
     print("Testing", mod_name)
     sched = self.scheduler()
     left_src = RandomTable(
         3,
         rows=100_000,
         scheduler=sched,
         random=lambda x: np.random.randint(10, size=x),  # type: ignore
         dtype="int64",
     )
     right_src = RandomTable(
         3,
         rows=100_000,
         scheduler=sched,
         random=lambda x: np.random.randint(10, size=x),  # type: ignore
         dtype="int64",
     )
     tested = cls(scheduler=sched)
     tested.input.first = left_src.output.result
     tested.input.second = right_src.output.result
     printer = Print(proc=self.terse, scheduler=sched)
     printer.input[0] = tested.output.result
     aio.run(sched.start())
     left_arr = left_src.table.to_array()
     right_arr = right_src.table.to_array()
     expected = ufunc(left_arr, right_arr)
     actual = tested.table.to_array()
     self.assertTrue(tested.name.startswith(mod_name))
     self.assertTrue(np.allclose(expected, actual, equal_nan=True))
예제 #30
0
 def t_histogram2d_impl(self, **kw: Any) -> None:
     """Check ``Histogram2D`` downstream of a ``Stirrer``.

     The module's last histogram is compared with ``fh.histogram2d``
     computed on columns ``_1``/``_2`` of the stirrer's final table (flipped
     along axis 0), using the bounds and bin counts reported by the module.

     Args:
         **kw: extra keyword arguments forwarded to ``Stirrer``.
     """
     sched = self.scheduler()
     rand_tbl = RandomTable(3, rows=100000, scheduler=sched)
     stirrer = Stirrer(
         update_column="_2", fixed_step_size=1000, scheduler=sched, **kw
     )
     stirrer.input[0] = rand_tbl.output.result
     min_mod = Min(scheduler=sched)
     min_mod.input[0] = stirrer.output.result
     max_mod = Max(scheduler=sched)
     max_mod.input[0] = stirrer.output.result
     histo = Histogram2D(0, 1, xbins=100, ybins=100, scheduler=sched)
     histo.input[0] = stirrer.output.result
     histo.input.min = min_mod.output.result
     histo.input.max = max_mod.output.result
     heatmap = Heatmap(filename="histo_%03d.png", scheduler=sched)
     heatmap.input.array = histo.output.result
     printer = Every(proc=self.terse, scheduler=sched)
     printer.input[0] = heatmap.output.result
     aio.run(sched.start())

     final_row = notNone(histo.table.last()).to_dict()
     module_counts = final_row["array"]
     # The reference histogram is computed with y first, then x.
     y_range = [final_row["ymin"], final_row["ymax"]]
     x_range = [final_row["xmin"], final_row["xmax"]]
     value_range = [y_range, x_range]
     xy_view = stirrer.table.loc[:, ["_1", "_2"]]
     assert xy_view is not None
     points = xy_view.to_array()
     bin_counts = [histo.params.ybins, histo.params.xbins]
     reference = fh.histogram2d(
         points[:, 1], points[:, 0], bins=bin_counts, range=value_range
     )
     reference = np.flip(reference, axis=0)  # type: ignore
     self.assertEqual(np.sum(module_counts), np.sum(reference))
     self.assertListEqual(
         module_counts.reshape(-1).tolist(), reference.reshape(-1).tolist()
     )