Example #1
 def test_histogram2d1(self) -> None:
     s = self.scheduler()
     csv = CSVLoader(
         get_dataset("bigfile"), index_col=False, header=None, scheduler=s
     )
     min_ = Min(scheduler=s)
     min_.input[0] = csv.output.result
     max_ = Max(scheduler=s)
     max_.input[0] = csv.output.result
     histogram2d = Histogram2D(
         1, 2, xbins=100, ybins=100, scheduler=s
     )  # columns are called 1..30
     histogram2d.input[0] = csv.output.result
     histogram2d.input.min = min_.output.result
     histogram2d.input.max = max_.output.result
     heatmap = Heatmap(filename="histo_%03d.png", scheduler=s)
     heatmap.input.array = histogram2d.output.result
     pr = Every(proc=self.terse, scheduler=s)
     pr.input[0] = heatmap.output.result
     aio.run(csv.scheduler().start())
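     # Check the progressive 2D histogram against a batch reference: the same
     # two columns are reloaded with pandas and binned with fh.histogram2d.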
     last = notNone(histogram2d.table.last()).to_dict()
     h1 = last["array"]
     bounds = [[last["ymin"], last["ymax"]], [last["xmin"], last["xmax"]]]
     df = pd.read_csv(
         get_dataset("bigfile"), header=None, usecols=[1, 2]  # type: ignore
     )
     v = df.to_numpy()  # .reshape(-1, 2)
     bins = [histogram2d.params.ybins, histogram2d.params.xbins]
     h2 = fh.histogram2d(v[:, 1], v[:, 0], bins=bins, range=bounds)
     h2 = np.flip(h2, axis=0)  # type: ignore
     self.assertTrue(np.allclose(h1, h2))
Example #2
    def t_histogram1d_impl(self, **kw: Any) -> None:
        s = self.scheduler()
        csv = CSVLoader(
            get_dataset("bigfile"), index_col=False, header=None, scheduler=s
        )
        stirrer = Stirrer(update_column="_2", fixed_step_size=1000, scheduler=s, **kw)
        stirrer.input[0] = csv.output.result
        min_ = Min(scheduler=s)
        min_.input[0] = stirrer.output.result
        max_ = Max(scheduler=s)
        max_.input[0] = stirrer.output.result
        histogram1d = Histogram1D("_2", scheduler=s)  # columns are called 1..30
        histogram1d.input[0] = stirrer.output.result
        histogram1d.input.min = min_.output.result
        histogram1d.input.max = max_.output.result

        # pr = Print(scheduler=s)
        pr = Every(proc=self.terse, scheduler=s)
        pr.input[0] = histogram1d.output.result
        aio.run(s.start())
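        # Compare the progressive histogram with np.histogram computed on the
        # "_2" column of the stirred table over the same bounds.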
        _ = histogram1d.trace_stats()
        last = notNone(histogram1d.table.last()).to_dict()
        h1 = last["array"]
        bounds = (last["min"], last["max"])
        tab = stirrer.table.loc[:, ["_2"]]
        assert tab is not None
        v = tab.to_array().reshape(-1)
        h2, _ = np.histogram(  # type: ignore
            v, bins=histogram1d.params.bins, density=False, range=bounds
        )
        self.assertEqual(np.sum(h1), np.sum(h2))
        self.assertListEqual(h1.tolist(), h2.tolist())
Example #3
 def test_hub_if_else(self):
     s = Scheduler()
     random = RandomTable(2, rows=100000, scheduler=s)
     stirrer = Stirrer(
         update_column="_1",
         delete_rows=5,
         update_rows=5,
         fixed_step_size=100,
         scheduler=s,
     )
     stirrer.input[0] = random.output.result
     switch = Switch(condition=lambda x: False, scheduler=s)
     switch.input[0] = stirrer.output.result
     max_ = Max(name="max_" + str(hash(random)), scheduler=s)
     max_.input[0] = switch.output.result
     min_ = Min(name="min_" + str(hash(random)), scheduler=s)
     min_.input[0] = switch.output.result_else
     hub = Hub(scheduler=s)
     hub.input.table = min_.output.result
     hub.input.table = max_.output.result
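     # The switch condition is always False, so data should flow only through
     # result_else (the Min branch); hub.result is then compared with the
     # minima of the stirrer's table.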
     pr = Print(proc=self.terse, scheduler=s)
     pr.input[0] = hub.output.result
     aio.run(s.start())
     res1 = stirrer.result.min()
     res2 = hub.result
     self.compare(res1, res2)
Example #4
 def test_histogram1d1(self) -> None:
     s = self.scheduler()
     csv = CSVLoader(
         get_dataset("bigfile"), index_col=False, header=None, scheduler=s
     )
     min_ = Min(scheduler=s)
     min_.input[0] = csv.output.result
     max_ = Max(scheduler=s)
     max_.input[0] = csv.output.result
     histogram1d = Histogram1D("_2", scheduler=s)  # columns are called 1..30
     histogram1d.input[0] = csv.output.result
     histogram1d.input.min = min_.output.result
     histogram1d.input.max = max_.output.result
     pr = Every(proc=self.terse, scheduler=s)
     pr.input[0] = histogram1d.output.result
     aio.run(s.start())
     _ = histogram1d.trace_stats()
     last = notNone(histogram1d.table.last()).to_dict()
     h1 = last["array"]
     bounds = (last["min"], last["max"])
     df = pd.read_csv(
         get_dataset("bigfile"), header=None, usecols=[2]  # type: ignore
     )
     v = df.to_numpy().reshape(-1)
     h2, _ = np.histogram(  # type: ignore
         v, bins=histogram1d.params.bins, density=False, range=bounds
     )
     self.assertListEqual(h1.tolist(), h2.tolist())
Example #5
 def test_hist_index_min_max(self):
     "Test min_out and max_out on HistogramIndex"
     s = self.scheduler()
     random = RandomTable(2, rows=100000, scheduler=s)
     t_min = Table(name=None, dshape='{_1: float64}', data={'_1': [0.3]})
     min_value = Constant(table=t_min, scheduler=s)
     t_max = Table(name=None, dshape='{_1: float64}', data={'_1': [0.8]})
     max_value = Constant(table=t_max, scheduler=s)
     range_qry = RangeQuery(column='_1', scheduler=s)
     range_qry.create_dependent_modules(random,
                                        'table',
                                        min_value=min_value,
                                        max_value=max_value)
     prt = Print(proc=self.terse, scheduler=s)
     prt.input.df = range_qry.output.table
     hist_index = range_qry.hist_index
     min_ = Min(name='min_' + str(hash(hist_index)), scheduler=s)
     min_.input.table = hist_index.output.min_out
     prt2 = Print(proc=self.terse, scheduler=s)
     prt2.input.df = min_.output.table
     max_ = Max(name='max_' + str(hash(hist_index)), scheduler=s)
     max_.input.table = hist_index.output.max_out
     pr3 = Print(proc=self.terse, scheduler=s)
     pr3.input.df = max_.output.table
     s.start()
     s.join()
     res1 = random.table().min()['_1']
     res2 = min_.table().last().to_dict()['_1']
     self.assertAlmostEqual(res1, res2)
     res1 = random.table().max()['_1']
     res2 = max_.table().last().to_dict()['_1']
     self.assertAlmostEqual(res1, res2)
Example #6
 def test_hist_index_min_max(self) -> None:
     "Test min_out and max_out on HistogramIndex"
     s = self.scheduler()
     with s:
         random = RandomTable(2, rows=100000, scheduler=s)
         t_min = PsDict({"_1": 0.3})
         min_value = Constant(table=t_min, scheduler=s)
         t_max = PsDict({"_1": 0.8})
         max_value = Constant(table=t_max, scheduler=s)
         range_qry = RangeQuery(column="_1", scheduler=s)
         range_qry.create_dependent_modules(
             random, "result", min_value=min_value, max_value=max_value
         )
         prt = Print(proc=self.terse, scheduler=s)
         prt.input[0] = range_qry.output.result
         hist_index = range_qry.hist_index
         assert hist_index is not None
         min_ = Min(name="min_" + str(hash(hist_index)), scheduler=s)
         min_.input[0] = hist_index.output.min_out
         prt2 = Print(proc=self.terse, scheduler=s)
         prt2.input[0] = min_.output.result
         max_ = Max(name="max_" + str(hash(hist_index)), scheduler=s)
         max_.input[0] = hist_index.output.max_out
         pr3 = Print(proc=self.terse, scheduler=s)
         pr3.input[0] = max_.output.result
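         # min_out/max_out are outputs of the HistogramIndex; the Min/Max
         # modules fed from them are compared below with the table's min/max.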
     aio.run(s.start())
     res1 = cast(float, random.table.min()["_1"])
     res2 = cast(float, min_.psdict["_1"])
     self.assertAlmostEqual(res1, res2)
     res1 = cast(float, random.table.max()["_1"])
     res2 = cast(float, max_.psdict["_1"])
     self.assertAlmostEqual(res1, res2)
Example #7
 def t_histogram2d_impl(self, **kw: Any) -> None:
     s = self.scheduler()
     random = RandomTable(3, rows=100000, scheduler=s)
     stirrer = Stirrer(update_column="_2", fixed_step_size=1000, scheduler=s, **kw)
     stirrer.input[0] = random.output.result
     min_ = Min(scheduler=s)
     min_.input[0] = stirrer.output.result
     max_ = Max(scheduler=s)
     max_.input[0] = stirrer.output.result
     histogram2d = Histogram2D(
         0, 1, xbins=100, ybins=100, scheduler=s
     )  # columns are called 1..30
     histogram2d.input[0] = stirrer.output.result
     histogram2d.input.min = min_.output.result
     histogram2d.input.max = max_.output.result
     heatmap = Heatmap(filename="histo_%03d.png", scheduler=s)
     heatmap.input.array = histogram2d.output.result
     pr = Every(proc=self.terse, scheduler=s)
     pr.input[0] = heatmap.output.result
     aio.run(s.start())
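     # Rebuild the same 2D histogram from the stirred table with fh.histogram2d
     # and check that the totals and the flattened bins match.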
     last = notNone(histogram2d.table.last()).to_dict()
     h1 = last["array"]
     bounds = [[last["ymin"], last["ymax"]], [last["xmin"], last["xmax"]]]
     t = stirrer.table.loc[:, ["_1", "_2"]]
     assert t is not None
     v = t.to_array()
     bins = [histogram2d.params.ybins, histogram2d.params.xbins]
     h2 = fh.histogram2d(v[:, 1], v[:, 0], bins=bins, range=bounds)
     h2 = np.flip(h2, axis=0)  # type: ignore
     self.assertEqual(np.sum(h1), np.sum(h2))
     self.assertListEqual(h1.reshape(-1).tolist(), h2.reshape(-1).tolist())
Example #8
 def test_min(self) -> None:
     s = self.scheduler()
     random = RandomTable(10, rows=10000, scheduler=s)
     min_ = Min(name="min_" + str(hash(random)), scheduler=s)
     min_.input[0] = random.output.result
     pr = Print(proc=self.terse, scheduler=s)
     pr.input[0] = min_.output.result
     aio.run(s.start())
     # s.join()
     res1 = random.table.min()
     res2 = min_.psdict
     self.compare(res1, res2)
Example #9
 def test_min(self):
     s = self.scheduler()
     random = RandomTable(10, rows=10000, scheduler=s)
     min_ = Min(name='min_' + str(hash(random)), scheduler=s)
     min_.input.table = random.output.table
     pr = Print(proc=self.terse, scheduler=s)
     pr.input.df = min_.output.table
     s.start()
     s.join()
     res1 = random.table().min()
     res2 = min_.table().last()
     self.compare(res1, res2)
Example #10
 def test_min(self):
     s = self.scheduler()
     random = SimpleCSVLoader(
         get_dataset("bigfile_multiscale"), nrows=10_000, scheduler=s
     )
     min_ = Min(name="min_" + str(hash(random)), scheduler=s)
     min_.input[0] = random.output.result
     pr = Print(proc=self.terse, scheduler=s)
     pr.input[0] = min_.output.result
     aio.run(s.start())
     # s.join()
     res1 = random.result.min()
     res2 = min_.result
     self.compare(res1, res2)
Example #11
 def test_idxmin(self):
     s = Scheduler()
     random = RandomTable(10, rows=10000, throttle=1000, scheduler=s)
     idxmin = IdxMin(scheduler=s)
     idxmin.input.df = random.output.df
     min = Min(scheduler=s)
     min.input.df = random.output.df
     pr = Print(scheduler=s)
     pr.input.df = idxmin.output.min
     s.start()
     min1 = last_row(min.df(), remove_update=True)
     # print(min1)
     min2 = last_row(idxmin.min(), remove_update=True)
     # print(min2)
     self.assertTrue((min1 == min2).all())
Example #12
 def test_histogram2d(self):
     s = self.scheduler()
     csv = CSVLoader(get_dataset('bigfile'),
                     index_col=False,
                     header=None,
                     scheduler=s)
     min_ = Min(scheduler=s)
     min_.input.table = csv.output.table
     max_ = Max(scheduler=s)
     max_.input.table = csv.output.table
     histogram2d = Histogram2D(1, 2, xbins=100, ybins=100,
                               scheduler=s)  # columns are called 1..30
     histogram2d.input.table = csv.output.table
     histogram2d.input.min = min_.output.table
     histogram2d.input.max = max_.output.table
     heatmap = Heatmap(filename='histo_%03d.png', scheduler=s)
     heatmap.input.array = histogram2d.output.table
     #pr = Print(scheduler=s)
     pr = Every(proc=self.terse, scheduler=s)
     #pr.input.df = heatmap.output.heatmap
     #pr.input.df = histogram2d.output.df
     pr.input.df = csv.output.table
     csv.scheduler().start()
     s.join()
     #self.scheduler.thread.join()
     s = histogram2d.trace_stats()
Example #13
    def test_dataflow_1_dynamic(self) -> None:
        scheduler = self.scheduler(clean=True)

        table = RandomTable(name="table",
                            columns=["a"],
                            throttle=1000,
                            scheduler=scheduler)
        m = Min(name="min", scheduler=scheduler)
        prt = Print(proc=self.terse, name="print_min", scheduler=scheduler)
        m.input.table = table.output.result
        prt.input.df = m.output.result
        started = False

        def proc(x: Any) -> None:
            nonlocal started
            print("proc max called")
            started = True

        async def _add_max(scheduler: Scheduler, run_number: int) -> None:
            with scheduler:
                print("adding new modules")
                m = Max(name="max", scheduler=scheduler)
                prt = Print(name="print_max", proc=proc, scheduler=scheduler)
                m.input.table = table.output.result
                prt.input.df = m.output.result

        scheduler.on_loop(_add_max, 5)  # run the function after 5 loops
        scheduler.on_loop(self._stop, 10)

        # from nose.tools import set_trace; set_trace()
        aio.run(scheduler.start())
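        # proc() sets 'started', which only happens if the Max/Print modules
        # added at loop 5 were integrated into the running dataflow.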
        self.assertTrue(started)
Example #14
    def test_dataflow_2_add_remove(self) -> None:
        scheduler = self.scheduler(clean=True)

        table = RandomTable(name="table",
                            columns=["a"],
                            throttle=1000,
                            scheduler=scheduler)
        m = Min(name="min", scheduler=scheduler)
        prt = Print(proc=self.terse, name="print_min", scheduler=scheduler)
        m.input.table = table.output.result
        prt.input.df = m.output.result
        started = False

        def proc(x: Any) -> None:
            nonlocal started
            print("proc max called")
            started = True

        async def _add_max_remove_min(scheduler: Scheduler,
                                      run_number: int) -> None:
            with scheduler as dataflow:
                print("adding new modules")
                m = Max(name="max", scheduler=scheduler)
                prt = Print(name="print_max", proc=proc, scheduler=scheduler)
                m.input.table = table.output.result
                prt.input.df = m.output.result
                print("removing min module")
                dataflow.delete_modules("min", "print_min")

        # t = _add_max_remove_min(csv, scheduler, proc=proc)
        scheduler.on_loop(_add_max_remove_min, 5)
        scheduler.on_loop(self._stop, 10)
        aio.run(scheduler.start())
        self.assertTrue(started)
Example #15
 def test_idxmin(self):
     s = self.scheduler()
     random = RandomTable(10, rows=10000, throttle=1000, scheduler=s)
     idxmin = IdxMin(scheduler=s)
     idxmin.input.table = random.output.table
     min_ = Min(scheduler=s)
     min_.input.table = random.output.table
     pr = Print(proc=self.terse, scheduler=s)
     pr.input.df = idxmin.output.min
     s.start()
     s.join()
     min1 = min_.table().last().to_dict()
     #print('min1', min1)
     min2 = idxmin.min().last().to_dict()
     #print('min2', min2)
     self.assertAlmostEqual(min1, min2)
Example #16
 def test_histogram1d(self) -> None:
     s = self.scheduler()
     csv = CSVLoader(
         get_dataset("bigfile"), index_col=False, header=None, scheduler=s
     )
     min_ = Min(scheduler=s)
     min_.input[0] = csv.output.result
     max_ = Max(scheduler=s)
     max_.input[0] = csv.output.result
     histogram1d = Histogram1D("_2", scheduler=s)  # columns are called 1..30
     histogram1d.input[0] = csv.output.result
     histogram1d.input.min = min_.output.result
     histogram1d.input.max = max_.output.result
     pr = Every(proc=self.terse, scheduler=s)
     pr.input[0] = histogram1d.output.result
     aio.run(s.start())
     _ = histogram1d.trace_stats()
Example #17
 def test_bin_join(self):
     s = self.scheduler()
     random = RandomTable(10, rows=10000, scheduler=s)
     min_1 = Min(name='min_1'+str(hash(random)), scheduler=s, columns=['_1'])
     min_1.input.table = random.output.table
     min_2 = Min(name='min_2'+str(hash(random)), scheduler=s, columns=['_2'])
     min_2.input.table = random.output.table
     bj = BinJoin(scheduler=s)
     bj.input.first = min_1.output.table
     bj.input.second = min_2.output.table
     pr = Print(proc=self.terse, scheduler=s)
     pr.input.df = bj.output.table
     s.start()
     s.join()
     res1 = random.table().min()
     res2 = bj.table().last().to_dict()
     self.assertAlmostEqual(res1['_1'], res2['_1'])
     self.assertAlmostEqual(res1['_2'], res2['_2'])
Example #18
 def p10s_random_min_max(n):
     StorageEngine.default = "hdf5"
     s = Scheduler()
     random = RandomTable(10, rows=n * L, scheduler=s)
     min_ = Min(name='min_' + str(hash(random)), scheduler=s)
     min_.input.table = random.output.table
     max_ = Max(name='max_' + str(hash(random)), scheduler=s)
     max_.input.table = random.output.table
     s.start()
Example #19
 def add_min():
     m = Min(scheduler=s)
     # Of course, sleeping here is a bad idea. This is to illustrate that
     # add_min will be executed atomically by the scheduler. Using a sleep
     # outside of add_oneshot_tick_proc would lead to an inconsistent state.
     # sleep(1)
     m.input.table = csv.output.table
     prt = Print(proc=self.terse, scheduler=s)
     prt.input.df = m.output.table
Example #20
 def p10s_random_min_max(self):
     n = self.current_step
     StorageEngine.default = "hdf5"
     s = Scheduler()
     random = RandomTable(10, rows=n * L, scheduler=s)
     min_ = Min(mid='min_' + str(hash(random)), scheduler=s)
     min_.input.table = random.output.table
     max_ = Max(id='max_' + str(hash(random)), scheduler=s)
     max_.input.table = random.output.table
     s.start()
Example #21
 def test_idxmin(self) -> None:
     s = self.scheduler()
     random = RandomTable(10, rows=10000, throttle=1000, scheduler=s)
     idxmin = IdxMin(scheduler=s)
     idxmin.input[0] = random.output.result
     min_ = Min(scheduler=s)
     min_.input[0] = random.output.result
     pr = Print(proc=self.terse, scheduler=s)
     pr.input[0] = idxmin.output.result
     pr2 = Print(proc=self.terse, scheduler=s)
     pr2.input[0] = min_.output.result
     aio.run(s.start())
     min1 = min_.psdict
     # print('min1', min1)
     min = idxmin.min()
     assert min is not None
     min2 = notNone(min.last()).to_dict()
     # print('min2', min2)
     self.compare(min1, min2)
Example #22
 def test_histogram2d(self) -> None:
     s = self.scheduler()
     csv = CSVLoader(
         get_dataset("bigfile"), index_col=False, header=None, scheduler=s
     )
     min_ = Min(scheduler=s)
     min_.input[0] = csv.output.result
     max_ = Max(scheduler=s)
     max_.input[0] = csv.output.result
     histogram2d = Histogram2D(
         1, 2, xbins=100, ybins=100, scheduler=s
     )  # columns are called 1..30
     histogram2d.input[0] = csv.output.result
     histogram2d.input.min = min_.output.result
     histogram2d.input.max = max_.output.result
     heatmap = Heatmap(filename="histo_%03d.png", scheduler=s)
     heatmap.input.array = histogram2d.output.result
     pr = Every(proc=self.terse, scheduler=s)
     pr.input[0] = heatmap.output.result
     aio.run(csv.scheduler().start())
     _ = histogram2d.trace_stats()
Example #23
 def test_paste(self) -> None:
     s = self.scheduler()
     random = RandomTable(10, rows=10000, scheduler=s)
     min_1 = Min(name="min_1" + str(hash(random)),
                 scheduler=s,
                 columns=["_1"])
     min_1.input[0] = random.output.result
     d2t_1 = Dict2Table(scheduler=s)
     d2t_1.input.dict_ = min_1.output.result
     min_2 = Min(name="min_2" + str(hash(random)),
                 scheduler=s,
                 columns=["_2"])
     min_2.input[0] = random.output.result
     d2t_2 = Dict2Table(scheduler=s)
     d2t_2.input.dict_ = min_2.output.result
     bj = Paste(scheduler=s)
     bj.input.first = d2t_1.output.result
     bj.input.second = d2t_2.output.result
     pr = Print(proc=self.terse, scheduler=s)
     pr.input[0] = bj.output.result
     aio.run(s.start())
     res1 = random.table.min()
     res2 = notNone(bj.table.last()).to_dict()
     self.assertAlmostEqual(res1["_1"], res2["_1"])
     self.assertAlmostEqual(res1["_2"], res2["_2"])
Example #24
    def test_dataflow_3_dels(self) -> None:
        s = self.scheduler()
        table = RandomTable(name="table",
                            columns=["a"],
                            throttle=1000,
                            scheduler=s)
        m = Min(name="min", scheduler=s)
        m.input.table = table.output.result
        prt = Print(name="prt", scheduler=s)
        prt.input.df = m.output.result

        aio.run(s.step())
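        # collateral_damage("table") returns the set of modules that would have
        # to be removed along with "table" (here: the module and its dependents),
        # as the assertion below shows.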
        with s as dataflow:
            self.assertTrue(isinstance(dataflow, Dataflow))
            deps = dataflow.collateral_damage("table")
            self.assertEqual(deps, set(["table", "min", "prt"]))
Example #25
def main():
    "Main function"
    csvmod = RandomTable(columns=['a', 'b', 'c'],
                         rows=1000000,
                         random=np.random.randn,
                         throttle=1000,
                         scheduler=s)
    minmod = Min(scheduler=s)
    minmod.input.table = csvmod.output.table
    maxmod = Max(scheduler=s)
    maxmod.input.table = csvmod.output.table
    histograms = Histograms(scheduler=s)
    histograms.input.table = csvmod.output.table
    histograms.input.min = minmod.output.table
    histograms.input.max = maxmod.output.table
    prlen = Every(scheduler=s)
    prlen.input.df = histograms.output.table
    return csvmod
Example #26
    def test_dataflow(self):
        s = Scheduler()
        with Dataflow(s):
            csv = CSVLoader(get_dataset('bigfile'),
                            name="csv",
                            index_col=False,
                            header=None)
            m = Min()
            m.input.table = csv.output.table
            prt = Print(proc=self.terse)
            prt.input.df = m.output.table

        self.assertIs(s["csv"], csv)
        csv.scheduler().start()

        sleep(1)
        self.assertTrue(csv.scheduler().is_running())

        s.stop()
        s.join()
Example #27
    def test_histogram1d(self):
        s = self.scheduler()
        csv = CSVLoader(get_dataset('bigfile'),
                        index_col=False,
                        header=None,
                        scheduler=s)
        min_ = Min(scheduler=s)
        min_.input.table = csv.output.table
        max_ = Max(scheduler=s)
        max_.input.table = csv.output.table
        histogram1d = Histogram1D('_2',
                                  scheduler=s)  # columns are called 1..30
        histogram1d.input.table = csv.output.table
        histogram1d.input.min = min_.output.table
        histogram1d.input.max = max_.output.table

        #pr = Print(scheduler=s)
        pr = Every(proc=self.terse, scheduler=s)
        pr.input.df = csv.output.table
        s.start(tick_proc=lambda s, r: csv.is_terminated() and s.stop())
        s.join()
        s = histogram1d.trace_stats()
Example #28
 def add_min(s: Scheduler, r: int) -> None:
     with s:
         m = Min(scheduler=s)
         m.input.table = csv.output.result
         prt = Print(proc=self.terse, scheduler=s)
         prt.input.df = m.output.result
Example #29
from progressivis import Scheduler, Every, Print
from progressivis.io import CSVLoader
from progressivis.stats import Histogram2D, Min, Max
from progressivis.datasets import get_dataset
from progressivis.vis import Heatmap

print("Loading test_histogram2d")
print("Type of default_scheduler is %s" % type(Scheduler.default))

csv = CSVLoader(get_dataset('bigfile'),
                index_col=False,
                header=None,
                engine='c')
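# Min/Max and a 2D histogram over columns _1/_2 feed the Heatmap below; the
# Every and Print modules simply consume the CSV output.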
pr = Every()
pr.input.df = csv.output.table
min_ = Min()
min_.input.table = csv.output.table
max_ = Max()
max_.input.table = csv.output.table
histogram2d = Histogram2D('_1', '_2', xbins=128, ybins=128)
histogram2d.input.table = csv.output.table
histogram2d.input.min = min_.output.table
histogram2d.input.max = max_.output.table
# heatmap
heatmap = Heatmap(filename='histo_%03d.png')
heatmap.input.array = histogram2d.output.table
pr = Print(name='print')
pr.input.df = csv.output.table

if __name__ == '__main__':
    csv.start()
Example #30
# SUFFIX = ''
PREFIX = '../nyc-taxi/'
SUFFIX = '.bz2'

URLS = [
    PREFIX+'yellow_tripdata_2015-01.csv'+SUFFIX,
    PREFIX+'yellow_tripdata_2015-02.csv'+SUFFIX,
    PREFIX+'yellow_tripdata_2015-03.csv'+SUFFIX,
    PREFIX+'yellow_tripdata_2015-04.csv'+SUFFIX,
    PREFIX+'yellow_tripdata_2015-05.csv'+SUFFIX,
    PREFIX+'yellow_tripdata_2015-06.csv'+SUFFIX,
]

filenames = pd.DataFrame({'filename': URLS})
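# A Constant table of file names feeds the CSVLoader, which streams the
# dropoff coordinates into the Min/Max -> Histogram2D -> Heatmap chain.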
cst = Constant(Table('filenames', data=filenames), scheduler=s)
csv = CSVLoader(index_col=False, skipinitialspace=True,
                usecols=['dropoff_longitude', 'dropoff_latitude'],
                filter_=filter_, scheduler=s)
csv.input.filenames = cst.output.table
min = Min(scheduler=s)
min.input.table = csv.output.table
max = Max(scheduler=s)
max.input.table = csv.output.table
histogram2d = Histogram2D('dropoff_longitude', 'dropoff_latitude',
                          xbins=RESOLUTION, ybins=RESOLUTION, scheduler=s)
histogram2d.input.table = csv.output.table
histogram2d.input.min = min.output.table
histogram2d.input.max = max.output.table
heatmap = Heatmap(filename='nyc_dropoff_yellow%d.png', history=5, scheduler=s)
heatmap.input.array = histogram2d.output.table

if __name__ == '__main__':
    s.start()
Example #31
    def test_dataflow_0(self) -> None:
        scheduler = self.scheduler()
        saved_inputs = None
        saved_outputs = None
        with scheduler as dataflow:
            csv = CSVLoader(
                get_dataset("smallfile"),
                name="csv",
                index_col=False,
                header=None,
                scheduler=scheduler,
            )
            self.assertIs(scheduler["csv"], csv)
            self.assertEqual(
                dataflow.validate_module(csv),
                ['Output slot "result" missing in module "csv"'],
            )

            m = Min(name="min", scheduler=scheduler)
            self.assertIs(dataflow[m.name], m)
            self.assertEqual(
                dataflow.validate_module(m),
                [
                    'Input slot "table" missing in module "min"',
                    'Output slot "result" missing in module "min"',
                ],
            )

            prt = Print(proc=self.terse, name="print", scheduler=scheduler)
            self.assertIs(dataflow[prt.name], prt)
            self.assertEqual(
                dataflow.validate_module(prt),
                ['Input slot "df" missing in module "print"'],
            )

            m.input.table = csv.output.result
            prt.input.df = m.output.result
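            # With all slots connected, validation should report no errors and
            # order_modules() should return csv -> min -> print (checked below).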

            self.assertEqual(len(dataflow), 3)
            self.assertEqual(dataflow.dir(), ["csv", "min", "print"])
            errors = dataflow.validate()
            self.assertEqual(errors, [])
            deps = dataflow.order_modules()
            self.assertEqual(deps, ["csv", m.name, prt.name])
            saved_inputs = dataflow.inputs
            saved_outputs = dataflow.outputs
            # dataflow.__exit__() is called here
        # print('Old modules:', end=' ')
        # pprint(scheduler._modules)
        # scheduler._update_modules()  # force modules in the main loop
        # print('New modules:', end=' ')
        # pprint(scheduler.modules())

        with scheduler as dataflow:
            # nothing should change when nothing is modified in dataflow
            self.assertEqual(len(dataflow), 3)
            deps = dataflow.order_modules()
            self.assertEqual(deps, ["csv", m.name, prt.name])
            self.assertEqual(dataflow.inputs, saved_inputs)
            self.assertEqual(dataflow.outputs, saved_outputs)
        # scheduler._update_modules()  # force modules in the main loop

        with scheduler as dataflow:
            sink = Sink(name="sink", scheduler=scheduler)
            sink.input.inp = m.output.result
            dataflow.delete_modules(prt)
            self.assertEqual(len(dataflow), 3)
            deps = dataflow.order_modules()
            self.assertEqual(deps, ["csv", m.name, "sink"])
            # pprint(dataflow.inputs)
            # pprint(dataflow.outputs)
        # print('Old modules:')
        # pprint(scheduler._new_modules)
        # scheduler._update_modules()  # force modules in the main loop
        # print('New modules:')
        # pprint(scheduler.modules())
        with scheduler as dataflow:
            self.assertEqual(len(dataflow), 3)
            deps = dataflow.order_modules()
            self.assertEqual(deps, ["csv", m.name, "sink"])
            prt = Print(proc=self.terse, name="print", scheduler=scheduler)
            self.assertIs(dataflow[prt.name], prt)
            self.assertEqual(
                dataflow.validate_module(prt),
                ['Input slot "df" missing in module "print"'],
            )

            prt.input.df = m.output.result