예제 #1
0
 def test_hub_if_else(self):
     # The Switch condition is always False, so all rows leave through the
     # "else" output into Min while Max (on the "then" branch) stays idle;
     # a Hub funnels both branches into a single Print sink.
     scheduler = Scheduler()
     source = RandomTable(2, rows=100000, scheduler=scheduler)
     stirrer = Stirrer(
         update_column="_1",
         delete_rows=5,
         update_rows=5,
         fixed_step_size=100,
         scheduler=scheduler,
     )
     stirrer.input[0] = source.output.result
     switch = Switch(condition=lambda x: False, scheduler=scheduler)
     switch.input[0] = stirrer.output.result
     max_ = Max(name="max_" + str(hash(source)), scheduler=scheduler)
     max_.input[0] = switch.output.result
     min_ = Min(name="min_" + str(hash(source)), scheduler=scheduler)
     min_.input[0] = switch.output.result_else
     hub = Hub(scheduler=scheduler)
     hub.input.table = min_.output.result
     hub.input.table = max_.output.result
     printer = Print(proc=self.terse, scheduler=scheduler)
     printer.input[0] = hub.output.result
     aio.run(scheduler.start())
     expected = stirrer.result.min()
     observed = hub.result
     self.compare(expected, observed)
예제 #2
0
 def p10s_read_csv(self):
     # Benchmark step: load a synthetic CSV stream whose size grows with
     # the current step (in gigabytes).
     scheduler = Scheduler()
     loader = CSVLoader(
         RandomBytesIO(cols=30, size=self.current_step * GIGA),
         index_col=False,
         header=None,
         scheduler=scheduler,
     )
     loader.start()
예제 #3
0
 def setUpStep(self, step):
     # Benchmark fixture: generate step*L random rows progressively, then
     # snapshot the produced table as a pandas DataFrame for later checks.
     self.set_step_info("{} rows".format(step * L))
     scheduler = Scheduler()
     source = RandomTable(10, rows=step * L, scheduler=scheduler)
     scheduler.start()
     produced = source.output.table.output_module.table()
     self.random_table = pd.DataFrame(produced.to_dict())
예제 #4
0
 def p10s_random_min_max(n):
     # Benchmark body: track the running min and max of n*L random rows
     # on the hdf5 storage backend.
     StorageEngine.default = "hdf5"
     scheduler = Scheduler()
     source = RandomTable(10, rows=n * L, scheduler=scheduler)
     min_ = Min(name='min_' + str(hash(source)), scheduler=scheduler)
     min_.input.table = source.output.table
     max_ = Max(name='max_' + str(hash(source)), scheduler=scheduler)
     max_.input.table = source.output.table
     scheduler.start()
예제 #5
0
 def p10s_random_min_max(self):
     """Benchmark body: running min/max over n*L random rows (hdf5).

     Mirrors the standalone ``p10s_random_min_max(n)`` helper. Module
     names are passed via the ``name`` keyword; the previous ``mid=`` /
     ``id=`` keywords were inconsistent with each other and with how
     Min/Max are named everywhere else in this suite.
     """
     n = self.current_step
     # Select the storage backend before any module is created.
     StorageEngine.default = "hdf5"
     s = Scheduler()
     random = RandomTable(10, rows=n * L, scheduler=s)
     min_ = Min(name='min_' + str(hash(random)), scheduler=s)
     min_.input.table = random.output.table
     max_ = Max(name='max_' + str(hash(random)), scheduler=s)
     max_.input.table = random.output.table
     s.start()
예제 #6
0
 def test_filter(self) -> None:
     # FilterMod must keep exactly the rows satisfying _1 > 0.5; verify
     # against an eval() over the filter's own input table.
     scheduler = Scheduler()
     source = RandomTable(2, rows=100000, scheduler=scheduler)
     filter_ = FilterMod(expr="_1 > 0.5", scheduler=scheduler)
     filter_.input[0] = source.output.result
     printer = Print(proc=self.terse, scheduler=scheduler)
     printer.input[0] = filter_.output.result
     aio.run(scheduler.start())
     input_table = filter_.get_input_slot("table").data()
     expected = input_table.eval("_1>0.5", result_object="index")
     self.assertEqual(filter_.table.index, bitmap(expected))
예제 #7
0
 def test_filter(self):
     # Legacy (threaded) variant: run to completion, then check that the
     # filter's selection matches the rows where _1 > 0.5.
     scheduler = Scheduler()
     source = RandomTable(2, rows=100000, scheduler=scheduler)
     filter_ = FilterMod(expr='_1 > 0.5', scheduler=scheduler)
     filter_.input.table = source.output.table
     printer = Print(proc=self.terse, scheduler=scheduler)
     printer.input.df = filter_.output.table
     scheduler.start()
     scheduler.join()
     expected = filter_.get_input_slot('table').data().eval(
         '_1>0.5', result_object='index')
     self.assertEqual(filter_._table.selection, bitmap(expected))
예제 #8
0
    def test_scheduler(self) -> None:
        """End-to-end scheduler test: invalid construction, lookup by
        name, running-state detection while started, live dataflow
        modification via on_loop, and JSON/HTML introspection."""
        # A positional argument to Scheduler() is rejected at construction.
        with self.assertRaises(ProgressiveError):
            s = Scheduler(0)
        s = Scheduler()
        csv = CSVLoader(
            get_dataset("bigfile"),
            name="csv",
            index_col=False,
            header=None,
            scheduler=s,
        )
        # Modules are retrievable from the scheduler by name.
        self.assertIs(s["csv"], csv)
        sink = Sink(name="sink", scheduler=s)
        sink.input.inp = csv.output.result  # allow csv to start
        check_running = False

        async def _is_running() -> None:
            # Runs concurrently with the scheduler and samples its state.
            nonlocal check_running
            check_running = csv.scheduler().is_running()

        aio.run_gather(s.start(), _is_running())

        self.assertTrue(check_running)

        def add_min(s: Scheduler, r: int) -> None:
            # Extend the dataflow from a loop callback; the `with s` block
            # applies the modification atomically.
            with s:
                m = Min(scheduler=s)
                m.input.table = csv.output.result
                prt = Print(proc=self.terse, scheduler=s)
                prt.input.df = m.output.result

        s.on_loop(add_min, 10)
        s.on_loop(self._stop, 20)

        self.assertIs(s["csv"], csv)
        # After the run the scheduler reports terminated / not running.
        json = s.to_json(short=False)
        self.assertFalse(json["is_running"])
        self.assertTrue(json["is_terminated"])
        html = s._repr_html_()
        self.assertTrue(len(html) != 0)
예제 #9
0
    def test_resetter(self) -> None:
        """
        test_resetter()
        """
        scheduler = Scheduler()
        resetter = MyResetter(threshold=30000, scheduler=scheduler)

        def _func(slot: Slot) -> bool:
            # The resetter fires when its data carries reset=True.
            return slot.data().get("reset") is True

        score = self._common(
            0.1, resetter=resetter, resetter_func=_func, scheduler=scheduler
        )
        print("resetter 30K=>score", score)
        self.assertGreater(score, 0.77)
예제 #10
0
 def scheduler(self):
     # Create a scheduler for a test: a non-threaded BaseScheduler when
     # the NOTHREAD environment variable is set, otherwise a regular
     # Scheduler. Every instance is recorded for later cleanup.
     if getenv("NOTHREAD"):
         if not self._output:
             # Announce the non-threaded mode only once.
             print('[Using non-threaded scheduler]',
                   end=' ',
                   file=sys.stderr)
             self._output = True
         sched = BaseScheduler()
     else:
         sched = Scheduler()
     self._schedulers.append(sched)
     return sched
예제 #11
0
 def test_repair_min(self) -> None:
     """
     test_repair_min()
     min without deletes/updates
     """
     scheduler = Scheduler()
     source = RandomTable(2, rows=100000, scheduler=scheduler)
     min_ = ScalarMin(name="min_" + str(hash(source)), scheduler=scheduler)
     min_.input[0] = source.output.result
     printer = Print(proc=self.terse, scheduler=scheduler)
     printer.input[0] = min_.output.result
     aio.run(scheduler.start())
     expected = source.table.min()
     observed = min_.psdict
     self.compare(expected, observed)
예제 #12
0
 def test_dummy(self):
     # Legacy (threaded) run: DummyMod perturbs the random table while a
     # Max module tracks the maximum downstream.
     scheduler = Scheduler()
     source = RandomTable(2, rows=100000, scheduler=scheduler)
     dummy_ = DummyMod(
         update_column='_1',
         delete_rows=5,
         update_rows=5,
         fixed_step_size=100,
         scheduler=scheduler,
     )
     dummy_.input.table = source.output.table
     max_ = Max(name='max_' + str(hash(source)), scheduler=scheduler)
     max_.input.table = dummy_.output.table
     printer = Print(proc=self.terse, scheduler=scheduler)
     printer.input.df = max_.output.table
     scheduler.start()
     scheduler.join()
예제 #13
0
 def test_repair_min2(self) -> None:
     """
     test_repair_min2()
     runs with sensitive ids deletion
     """
     scheduler = Scheduler()
     ScalarMin._reset_calls_counter = 0  # type: ignore
     source = RandomTable(2, rows=100000, scheduler=scheduler)
     min_ = ScalarMin(name="min_repair_test2", scheduler=scheduler)
     stirrer = MyStirrer(watched="min_repair_test2", scheduler=scheduler)
     stirrer.input[0] = source.output.result
     min_.input[0] = stirrer.output.result
     printer = Print(proc=self.terse, scheduler=scheduler)
     printer.input[0] = min_.output.result
     aio.run(scheduler.start())
     # Deleting a sensitive (currently-minimal) id must force one reset.
     self.assertEqual(ScalarMin._reset_calls_counter, 1)  # type: ignore
     expected = stirrer.table.min()
     observed = min_.psdict
     self.compare(expected, observed)
예제 #14
0
 def test_stirrer(self) -> None:
     # A Stirrer deletes/updates rows while Max consumes its output; the
     # final maximum must match a direct scan of the stirred table.
     scheduler = Scheduler()
     source = RandomTable(2, rows=100000, scheduler=scheduler)
     stirrer = Stirrer(update_column="_1",
                       delete_rows=5,
                       update_rows=5,
                       fixed_step_size=100,
                       scheduler=scheduler)
     stirrer.input[0] = source.output.result
     max_ = Max(name="max_" + str(hash(source)), scheduler=scheduler)
     max_.input[0] = stirrer.output.result
     printer = Print(proc=self.terse, scheduler=scheduler)
     printer.input[0] = max_.output.result
     aio.run(scheduler.start())
     expected = stirrer.table.max()
     observed = max_.result
     self.compare(expected, observed)
예제 #15
0
 def test_filter3(self) -> None:
     # Filter downstream of a Stirrer that only updates rows; the final
     # selection must agree with both a table eval() and a pandas eval().
     scheduler = Scheduler()
     source = RandomTable(2, rows=100000, scheduler=scheduler)
     stirrer = Stirrer(
         update_column="_1",
         update_rows=5,
         fixed_step_size=100,
         scheduler=scheduler,
     )
     stirrer.input[0] = source.output.result
     filter_ = FilterMod(expr="_1 > 0.5", scheduler=scheduler)
     filter_.input[0] = stirrer.output.result
     printer = Print(proc=self.terse, scheduler=scheduler)
     printer.input[0] = filter_.output.result
     aio.run(scheduler.start())
     tbl = filter_.get_input_slot("table").data()
     expected = tbl.eval("_1>0.5", result_object="index")
     self.assertEqual(filter_.table.index, bitmap(expected))
     frame = pd.DataFrame(tbl.to_dict(), index=tbl.index.to_array())
     mask = frame.eval("_1>0.5")
     self.assertEqual(filter_.table.index, bitmap(frame.index[mask]))
예제 #16
0
    def test_dataflow(self):
        """Build a pipeline inside a Dataflow context, start the scheduler
        in the background, and check that it is running."""
        s = Scheduler()
        # Modules created inside the Dataflow context attach to s without
        # an explicit scheduler= argument.
        with Dataflow(s):
            csv = CSVLoader(get_dataset('bigfile'),
                            name="csv",
                            index_col=False,
                            header=None)
            m = Min()
            m.input.table = csv.output.table
            prt = Print(proc=self.terse)
            prt.input.df = m.output.table

        self.assertIs(s["csv"], csv)
        csv.scheduler().start()

        # Give the background thread time to actually start running.
        sleep(1)
        self.assertTrue(csv.scheduler().is_running())

        s.stop()
        s.join()
예제 #17
0
 def test_repair_max3(self) -> None:
     """
     test_repair_max3()
     runs with NON-sensitive ids deletion
     """
     scheduler = Scheduler()
     ScalarMax._reset_calls_counter = 0  # type: ignore
     source = RandomTable(2, rows=100000, scheduler=scheduler)
     max_ = ScalarMax(name="max_repair_test3", scheduler=scheduler)
     stirrer = MyStirrer(
         watched="max_repair_test3",
         proc_sensitive=False,
         scheduler=scheduler,
     )
     stirrer.input[0] = source.output.result
     max_.input[0] = stirrer.output.result
     printer = Print(proc=self.terse, scheduler=scheduler)
     printer.input[0] = max_.output.result
     aio.run(scheduler.start())
     # Deleting only non-sensitive ids must not trigger any reset.
     self.assertEqual(ScalarMax._reset_calls_counter, 0)  # type: ignore
     expected = stirrer.table.max()
     observed = max_.psdict
     self.compare(expected, observed)
예제 #18
0
 def test_repair_max5(self) -> None:
     """
     test_repair_max5()
     runs with sensitive ids update (critical)
     """
     scheduler = Scheduler()
     ScalarMax._reset_calls_counter = 0  # type: ignore
     source = RandomTable(2, rows=100000, scheduler=scheduler)
     max_ = ScalarMax(name="max_repair_test4", scheduler=scheduler)
     stirrer = MyStirrer(
         watched="max_repair_test4",
         mode="update",
         value=-9999.0,
         scheduler=scheduler,
     )
     stirrer.input[0] = source.output.result
     max_.input[0] = stirrer.output.result
     printer = Print(proc=self.terse, scheduler=scheduler)
     printer.input[0] = max_.output.result
     aio.run(scheduler.start())
     # Overwriting the current maximum with a smaller value must force
     # exactly one reset.
     self.assertEqual(ScalarMax._reset_calls_counter, 1)  # type: ignore
     expected = stirrer.table.max()
     observed = max_.psdict
     self.compare(expected, observed)
예제 #19
0
    def test_scheduler(self):
        """Legacy (threaded) scheduler test: build a CSV pipeline, start
        the scheduler in the background, then add a Min/Print branch while
        it is running via on_tick_once."""
        s = Scheduler()
        csv = CSVLoader(get_dataset('bigfile'),
                        name="csv",
                        index_col=False,
                        header=None,
                        scheduler=s)

        #smp = Sample(n=10,scheduler=s)
        #smp.input.df = csv.output.table

        # Modules are retrievable from the scheduler by name.
        self.assertIs(s["csv"], csv)
        csv.scheduler().start()

        # Give the background thread time to actually start running.
        sleep(1)
        self.assertTrue(csv.scheduler().is_running())

        #smp2 = Sample(n=15, scheduler=s)
        #smp2.input.df = csv.output.df

        def add_min():
            # Executed by the scheduler on the next tick.
            m = Min(scheduler=s)
            # Of course, sleeping here is a bad idea. this is to illustrate
            # that add_min will be executed atomically by the scheduler.
            # using a sleep outside of add_oneshot_tick_proc would lead to an inconsistent
            # state.
            #sleep(1)
            m.input.table = csv.output.table
            prt = Print(proc=self.terse, scheduler=s)
            prt.input.df = m.output.table

        s.on_tick_once(add_min)

        sleep(1)
        #self.assertTrue(s._runorder.index(smp.id) > s._runorder.index(csv.id))
        #self.assertTrue(s._runorder.index(smp2.id) > s._runorder.index(csv.id))
        #self.assertTrue(s._runorder.index(m.id) > s._runorder.index(smp2.id))
        s.stop()
        s.join()
예제 #20
0
 def test_switch_if_then(self):
     # The Switch condition is always True, so rows flow through the
     # "then" output into Max while the "else" output stays empty (it is
     # only printed).
     scheduler = Scheduler()
     source = RandomTable(2, rows=100000, scheduler=scheduler)
     stirrer = Stirrer(update_column="_1",
                       delete_rows=5,
                       update_rows=5,
                       fixed_step_size=100,
                       scheduler=scheduler)
     stirrer.input[0] = source.output.result
     switch = Switch(condition=lambda x: True, scheduler=scheduler)
     switch.input[0] = stirrer.output.result
     max_ = Max(name="max_" + str(hash(source)), scheduler=scheduler)
     max_.input[0] = switch.output.result
     printer_else = Print(proc=self.terse, scheduler=scheduler)
     printer_else.input[0] = switch.output.result_else
     printer = Print(proc=self.terse, scheduler=scheduler)
     printer.input[0] = max_.output.result
     aio.run(scheduler.start())
     expected = stirrer.result.max()
     observed = max_.result
     self.compare(expected, observed)
예제 #21
0
import os
from progressivis import Scheduler, Print, log_level
from progressivis.io import SimpleCSVLoader, DynVar
from progressivis.stats import Histogram2D, Min, Max
from progressivis.datasets import get_dataset
from progressivis.vis import StatsExtender
from progressivis.table import Table
from progressivis.table.constant import Constant
from progressivis.utils.psdict import PsDict
from progressivis.stats.scaling import MinMaxScaler
from progressivis_nb_widgets.nbwidgets import DataViewer
from progressivis.datasets import get_dataset
from progressivis.core import aio

log_level(package='progressivis.table')
s = Scheduler.default = Scheduler()

PREFIX = '../nyc-taxi/'

SUFFIX = '.bz2'

URLS = [
    PREFIX + 'yellow_tripdata_2015-01.csv' + SUFFIX,
    PREFIX + 'yellow_tripdata_2015-02.csv' + SUFFIX,
    PREFIX + 'yellow_tripdata_2015-03.csv' + SUFFIX,
    PREFIX + 'yellow_tripdata_2015-04.csv' + SUFFIX,
    PREFIX + 'yellow_tripdata_2015-05.csv' + SUFFIX,
    PREFIX + 'yellow_tripdata_2015-06.csv' + SUFFIX,
]
num_cols = [
    'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
예제 #22
0
 def scheduler(self, clean: bool = False) -> Scheduler:
     # Lazily build (or, when clean is requested, rebuild) the cached
     # scheduler instance.
     if clean or self._scheduler is None:
         self._scheduler = Scheduler()
     return self._scheduler
예제 #23
0
from progressivis import Scheduler, Every  #, log_level
from progressivis.cluster import MBKMeans, MBKMeansFilter
from progressivis.io import CSVLoader
from progressivis.vis import MCScatterPlot
from progressivis.datasets import get_dataset
from progressivis.stats import RandomTable
from progressivis.utils.psdict import PsDict
import pandas as pd
import numpy as np
import os.path
import tempfile
from progressivis.datasets.random import generate_random_multivariate_normal_csv as gen_csv
try:
    s = scheduler
except NameError:
    s = Scheduler()
    #log_level(package="progressivis.cluster")

#dir_name = tempfile.mkdtemp(prefix='progressivis_tmp_')
dir_name = os.path.join(tempfile.gettempdir(), 'progressivis_tmp_')
os.makedirs(dir_name, exist_ok=True)
file_name = os.path.join(dir_name, "foobar.csv")
gen_csv(file_name, rows=99999, reset=True)  #, header='_0,_1', reset=False)
data = CSVLoader(file_name,
                 skipinitialspace=True,
                 header=None,
                 index_col=False,
                 scheduler=s)
n_clusters = 3
mbkmeans = MBKMeans(columns=['_0', '_1'],
                    n_clusters=n_clusters,
예제 #24
0
    def _common(
        self,
        rtol: float,
        threshold: Optional[int] = None,
        resetter: Optional[MyResetter] = None,
        resetter_func: Optional[Callable[[Slot], Any]] = None,
        scheduler: Optional[Scheduler] = None,
    ) -> float:
        """Run a progressive PPCA pipeline on mnist_784 and score the
        reconstruction with a KNN classifier.

        Args:
            rtol: relative tolerance forwarded to the PPCA dependents.
            threshold: optional threshold forwarded to the dependents.
            resetter: optional module wired to PPCA's output that can
                trigger a reset.
            resetter_func: predicate applied to the resetter's slot.
            scheduler: reuse an existing scheduler instead of creating one.

        Returns:
            KNN accuracy on a sample of the reconstructed data, or 0 when
            the dataset cannot be downloaded.
        """
        global KNN, LABELS, INDICES
        if scheduler is None:
            s = Scheduler()
        else:
            s = scheduler
        try:
            dataset = get_dataset("mnist_784")
        except TimeoutError:
            # Best-effort: skip the whole run when the download times out.
            print("Cannot download mnist")
            return 0
        data = CSVLoader(
            dataset,
            index_col=False,
            as_array="array",
            usecols=lambda x: x != "class",  # features only; labels are read below
            scheduler=s,
        )
        ppca = PPCA(scheduler=s)
        ppca.input[0] = data.output.result
        ppca.params.n_components = N_COMPONENTS
        if resetter:
            assert callable(resetter_func)
            resetter.input[0] = ppca.output.result
        ppca.create_dependent_modules(
            rtol=rtol,
            trace=TRACE,
            threshold=threshold,
            resetter=resetter,
            resetter_func=resetter_func,
        )

        prn = Every(scheduler=s, proc=_print)
        prn.input[0] = ppca.reduced.output.result
        aio.run(s.start())
        pca_ = ppca._transformer["inc_pca"]
        # Map the reduced representation back to the original feature space.
        recovered = pca_.inverse_transform(_array(ppca.reduced.table))
        if KNN is None:
            # Train the classifier once and reuse it across calls.
            print("Init KNN")
            KNN = KNeighborsClassifier(NNEIGHBOURS)
            arr = _array(data.table)
            df: pd.DataFrame = pd.read_csv(
                dataset, usecols=["class"]  # type: ignore
            )
            LABELS = df.values.reshape((-1,))
            indices_t = sample_without_replacement(
                n_population=len(data.table),
                n_samples=TRAIN_SAMPLE_SIZE,
                random_state=RANDOM_STATE,
            )
            KNN.fit(arr[indices_t], LABELS[indices_t])
        # Score on a sample disjoint in seed from the training sample.
        indices_p = sample_without_replacement(
            n_population=len(data.table),
            n_samples=PREDICT_SAMPLE_SIZE,
            random_state=RANDOM_STATE * 2 + 1,
        )
        return KNN.score(recovered[indices_p], LABELS[indices_p])  # type: ignore
예제 #25
0
 def p10s_read_csv(f):
     # Benchmark body: load the CSV file f to completion.
     scheduler = Scheduler()
     loader = CSVLoader(f, index_col=False, header=None, scheduler=scheduler)
     loader.start()
예제 #26
0
 def p10s_random(self):
     # Benchmark body: generate current_step * L random rows on hdf5.
     n = self.current_step
     StorageEngine.default = "hdf5"
     scheduler = Scheduler()
     source = RandomTable(10, rows=n * L, scheduler=scheduler)
     scheduler.start()
예제 #27
0
def make_df(n, L):
    """Progressively generate n*L random rows and return them as a
    pandas DataFrame."""
    scheduler = Scheduler()
    source = RandomTable(10, rows=n * L, scheduler=scheduler)
    scheduler.start()
    produced = source.output.table.output_module.table()
    return pd.DataFrame(produced.to_dict())
예제 #28
0
 def p10s_zarr_random(n):
     # Benchmark body: generate n*L random rows on the zarr backend.
     StorageEngine.default = "zarr"
     scheduler = Scheduler()
     source = RandomTable(10, rows=n * L, scheduler=scheduler)
     scheduler.start()