def test_hub_if_else(self):
    """Hub must forward the output of whichever Switch branch is active."""
    sched = Scheduler()
    rnd = RandomTable(2, rows=100000, scheduler=sched)
    stirrer = Stirrer(
        update_column="_1",
        delete_rows=5,
        update_rows=5,
        fixed_step_size=100,
        scheduler=sched,
    )
    stirrer.input[0] = rnd.output.result
    # Condition is always False, so data flows through the "else" branch.
    switch = Switch(condition=lambda x: False, scheduler=sched)
    switch.input[0] = stirrer.output.result
    max_ = Max(name="max_" + str(hash(rnd)), scheduler=sched)
    max_.input[0] = switch.output.result
    min_ = Min(name="min_" + str(hash(rnd)), scheduler=sched)
    min_.input[0] = switch.output.result_else
    # Both extrema are wired to the same hub input slot.
    hub = Hub(scheduler=sched)
    hub.input.table = min_.output.result
    hub.input.table = max_.output.result
    pr = Print(proc=self.terse, scheduler=sched)
    pr.input[0] = hub.output.result
    aio.run(sched.start())
    self.compare(stirrer.result.min(), hub.result)
def p10s_read_csv(self):
    """Benchmark CSV loading from a synthetic in-memory source.

    The source size grows with the current benchmark step (GIGA bytes per step).
    """
    sched = Scheduler()
    source = RandomBytesIO(cols=30, size=self.current_step * GIGA)
    loader = CSVLoader(source, index_col=False, header=None, scheduler=sched)
    loader.start()
def setUpStep(self, step):
    """Generate ``step * L`` random rows and keep a pandas snapshot of them."""
    self.set_step_info("{} rows".format(step * L))
    sched = Scheduler()
    rnd = RandomTable(10, rows=step * L, scheduler=sched)
    sched.start()
    # Snapshot the produced table as a plain DataFrame for later comparisons.
    content = rnd.output.table.output_module.table().to_dict()
    self.random_table = pd.DataFrame(content)
def p10s_random_min_max(n):
    """Benchmark Min/Max over ``n * L`` random rows on the hdf5 backend."""
    StorageEngine.default = "hdf5"
    sched = Scheduler()
    rnd = RandomTable(10, rows=n * L, scheduler=sched)
    min_ = Min(name='min_' + str(hash(rnd)), scheduler=sched)
    min_.input.table = rnd.output.table
    max_ = Max(name='max_' + str(hash(rnd)), scheduler=sched)
    max_.input.table = rnd.output.table
    sched.start()
def p10s_random_min_max(self):
    """Benchmark Min/Max over ``current_step * L`` random rows (hdf5 backend).

    Fix: the original passed ``mid=`` to Min and ``id=`` to Max, inconsistent
    with every sibling benchmark/test in this file, which names modules via
    the ``name=`` keyword; both are now ``name=``.
    """
    n = self.current_step
    StorageEngine.default = "hdf5"
    s = Scheduler()
    random = RandomTable(10, rows=n * L, scheduler=s)
    min_ = Min(name='min_' + str(hash(random)), scheduler=s)
    min_.input.table = random.output.table
    max_ = Max(name='max_' + str(hash(random)), scheduler=s)
    max_.input.table = random.output.table
    s.start()
def test_filter(self) -> None:
    """FilterMod must keep exactly the rows matching its expression."""
    sched = Scheduler()
    rnd = RandomTable(2, rows=100000, scheduler=sched)
    filter_ = FilterMod(expr="_1 > 0.5", scheduler=sched)
    filter_.input[0] = rnd.output.result
    pr = Print(proc=self.terse, scheduler=sched)
    pr.input[0] = filter_.output.result
    aio.run(sched.start())
    # Recompute the expected selection directly on the input table.
    expected = filter_.get_input_slot("table").data().eval(
        "_1>0.5", result_object="index"
    )
    self.assertEqual(filter_.table.index, bitmap(expected))
def test_filter(self):
    """FilterMod must keep exactly the matching rows (legacy threaded API)."""
    sched = Scheduler()
    rnd = RandomTable(2, rows=100000, scheduler=sched)
    filter_ = FilterMod(expr='_1 > 0.5', scheduler=sched)
    filter_.input.table = rnd.output.table
    pr = Print(proc=self.terse, scheduler=sched)
    pr.input.df = filter_.output.table
    sched.start()
    sched.join()
    # Recompute the expected selection directly on the input table.
    expected = filter_.get_input_slot('table').data().eval(
        '_1>0.5', result_object='index'
    )
    self.assertEqual(filter_._table.selection, bitmap(expected))
def test_scheduler(self) -> None:
    """Exercise scheduler lifecycle: bad construction, running state,
    atomic dataflow extension via on_loop, and JSON/HTML reporting."""
    # A zero argument is rejected at construction time.
    with self.assertRaises(ProgressiveError):
        s = Scheduler(0)
    s = Scheduler()
    csv = CSVLoader(
        get_dataset("bigfile"),
        name="csv",
        index_col=False,
        header=None,
        scheduler=s,
    )
    # Named modules are retrievable from the scheduler by name.
    self.assertIs(s["csv"], csv)
    sink = Sink(name="sink", scheduler=s)
    sink.input.inp = csv.output.result
    # allow csv to start
    check_running = False

    async def _is_running() -> None:
        # Runs concurrently with s.start() to observe the running state.
        nonlocal check_running
        check_running = csv.scheduler().is_running()

    aio.run_gather(s.start(), _is_running())
    self.assertTrue(check_running)

    def add_min(s: Scheduler, r: int) -> None:
        # Extends the dataflow from inside a scheduler loop callback;
        # the `with s:` block makes the modification atomic.
        with s:
            m = Min(scheduler=s)
            m.input.table = csv.output.result
            prt = Print(proc=self.terse, scheduler=s)
            prt.input.df = m.output.result

    s.on_loop(add_min, 10)
    s.on_loop(self._stop, 20)
    self.assertIs(s["csv"], csv)
    # After the run, the serialized state must report termination.
    json = s.to_json(short=False)
    self.assertFalse(json["is_running"])
    self.assertTrue(json["is_terminated"])
    html = s._repr_html_()
    self.assertTrue(len(html) != 0)
def test_resetter(self) -> None:
    """PPCA guarded by a 30K-threshold resetter must still score well."""
    sched = Scheduler()
    resetter = MyResetter(threshold=30000, scheduler=sched)

    def _func(slot: Slot) -> bool:
        # The resetter fires when its input carries a truthy "reset" flag.
        return slot.data().get("reset") is True

    score = self._common(
        0.1, resetter=resetter, resetter_func=_func, scheduler=sched
    )
    print("resetter 30K=>score", score)
    self.assertGreater(score, 0.77)
def scheduler(self):
    """Create, register and return a scheduler.

    Honors the NOTHREAD environment variable by falling back to the
    non-threaded BaseScheduler; the fallback is announced only once.
    """
    if getenv("NOTHREAD"):
        if not self._output:
            print('[Using non-threaded scheduler]', end=' ', file=sys.stderr)
            self._output = True
        sched = BaseScheduler()
    else:
        sched = Scheduler()
    # Keep track of every scheduler handed out so tests can clean up.
    self._schedulers.append(sched)
    return sched
def test_repair_min(self) -> None:
    """ScalarMin over an append-only table (no deletes, no updates)."""
    sched = Scheduler()
    rnd = RandomTable(2, rows=100000, scheduler=sched)
    min_ = ScalarMin(name="min_" + str(hash(rnd)), scheduler=sched)
    min_.input[0] = rnd.output.result
    pr = Print(proc=self.terse, scheduler=sched)
    pr.input[0] = min_.output.result
    aio.run(sched.start())
    self.compare(rnd.table.min(), min_.psdict)
def test_dummy(self):
    """Max downstream of a mutating DummyMod (legacy threaded API)."""
    sched = Scheduler()
    rnd = RandomTable(2, rows=100000, scheduler=sched)
    dummy_ = DummyMod(
        update_column='_1',
        delete_rows=5,
        update_rows=5,
        fixed_step_size=100,
        scheduler=sched,
    )
    dummy_.input.table = rnd.output.table
    max_ = Max(name='max_' + str(hash(rnd)), scheduler=sched)
    max_.input.table = dummy_.output.table
    pr = Print(proc=self.terse, scheduler=sched)
    pr.input.df = max_.output.table
    sched.start()
    sched.join()
def test_repair_min2(self) -> None:
    """ScalarMin must reset exactly once when sensitive ids are deleted."""
    sched = Scheduler()
    ScalarMin._reset_calls_counter = 0  # type: ignore
    rnd = RandomTable(2, rows=100000, scheduler=sched)
    min_ = ScalarMin(name="min_repair_test2", scheduler=sched)
    # MyStirrer deletes ids that the watched module considers sensitive.
    stirrer = MyStirrer(watched="min_repair_test2", scheduler=sched)
    stirrer.input[0] = rnd.output.result
    min_.input[0] = stirrer.output.result
    pr = Print(proc=self.terse, scheduler=sched)
    pr.input[0] = min_.output.result
    aio.run(sched.start())
    self.assertEqual(ScalarMin._reset_calls_counter, 1)  # type: ignore
    self.compare(stirrer.table.min(), min_.psdict)
def test_stirrer(self) -> None:
    """Max must stay correct while a Stirrer deletes and updates rows."""
    sched = Scheduler()
    rnd = RandomTable(2, rows=100000, scheduler=sched)
    stirrer = Stirrer(
        update_column="_1",
        delete_rows=5,
        update_rows=5,
        fixed_step_size=100,
        scheduler=sched,
    )
    stirrer.input[0] = rnd.output.result
    max_ = Max(name="max_" + str(hash(rnd)), scheduler=sched)
    max_.input[0] = stirrer.output.result
    pr = Print(proc=self.terse, scheduler=sched)
    pr.input[0] = max_.output.result
    aio.run(sched.start())
    self.compare(stirrer.table.max(), max_.result)
def test_filter3(self) -> None:
    """FilterMod must track updates applied by an upstream Stirrer."""
    sched = Scheduler()
    rnd = RandomTable(2, rows=100000, scheduler=sched)
    stirrer = Stirrer(
        update_column="_1",
        update_rows=5,
        fixed_step_size=100,
        scheduler=sched,
    )
    stirrer.input[0] = rnd.output.result
    filter_ = FilterMod(expr="_1 > 0.5", scheduler=sched)
    filter_.input[0] = stirrer.output.result
    pr = Print(proc=self.terse, scheduler=sched)
    pr.input[0] = filter_.output.result
    aio.run(sched.start())
    tbl = filter_.get_input_slot("table").data()
    # Check against the table's own evaluator...
    idx = tbl.eval("_1>0.5", result_object="index")
    self.assertEqual(filter_.table.index, bitmap(idx))
    # ...and against an independent pandas evaluation of the same data.
    df = pd.DataFrame(tbl.to_dict(), index=tbl.index.to_array())
    dfe = df.eval("_1>0.5")
    self.assertEqual(filter_.table.index, bitmap(df.index[dfe]))
def test_dataflow(self):
    """Modules declared inside a Dataflow context must register and run."""
    sched = Scheduler()
    with Dataflow(sched):
        csv = CSVLoader(
            get_dataset('bigfile'),
            name="csv",
            index_col=False,
            header=None,
        )
        m = Min()
        m.input.table = csv.output.table
        prt = Print(proc=self.terse)
        prt.input.df = m.output.table
    self.assertIs(sched["csv"], csv)
    csv.scheduler().start()
    # Give the scheduler a moment to spin up before probing its state.
    sleep(1)
    self.assertTrue(csv.scheduler().is_running())
    sched.stop()
    sched.join()
def test_repair_max3(self) -> None:
    """ScalarMax must NOT reset when only non-sensitive ids are deleted."""
    sched = Scheduler()
    ScalarMax._reset_calls_counter = 0  # type: ignore
    rnd = RandomTable(2, rows=100000, scheduler=sched)
    max_ = ScalarMax(name="max_repair_test3", scheduler=sched)
    # proc_sensitive=False: the stirrer only touches non-sensitive ids.
    stirrer = MyStirrer(
        watched="max_repair_test3", proc_sensitive=False, scheduler=sched
    )
    stirrer.input[0] = rnd.output.result
    max_.input[0] = stirrer.output.result
    pr = Print(proc=self.terse, scheduler=sched)
    pr.input[0] = max_.output.result
    aio.run(sched.start())
    self.assertEqual(ScalarMax._reset_calls_counter, 0)  # type: ignore
    self.compare(stirrer.table.max(), max_.psdict)
def test_repair_max5(self) -> None:
    """ScalarMax must reset once when a sensitive id gets a critical update."""
    sched = Scheduler()
    ScalarMax._reset_calls_counter = 0  # type: ignore
    rnd = RandomTable(2, rows=100000, scheduler=sched)
    max_ = ScalarMax(name="max_repair_test4", scheduler=sched)
    # Overwrite a sensitive id with a very small value, invalidating the max.
    stirrer = MyStirrer(
        watched="max_repair_test4", mode="update", value=-9999.0, scheduler=sched
    )
    stirrer.input[0] = rnd.output.result
    max_.input[0] = stirrer.output.result
    pr = Print(proc=self.terse, scheduler=sched)
    pr.input[0] = max_.output.result
    aio.run(sched.start())
    self.assertEqual(ScalarMax._reset_calls_counter, 1)  # type: ignore
    self.compare(stirrer.table.max(), max_.psdict)
def test_scheduler(self):
    """on_tick_once must run its callback atomically on a live scheduler."""
    sched = Scheduler()
    csv = CSVLoader(
        get_dataset('bigfile'),
        name="csv",
        index_col=False,
        header=None,
        scheduler=sched,
    )
    self.assertIs(sched["csv"], csv)
    csv.scheduler().start()
    # Give the scheduler a moment to spin up before probing its state.
    sleep(1)
    self.assertTrue(csv.scheduler().is_running())

    def add_min():
        # This callback is executed atomically by the scheduler, so it is
        # safe to extend the dataflow here; doing the same outside a tick
        # proc could leave the scheduler in an inconsistent state.
        m = Min(scheduler=sched)
        m.input.table = csv.output.table
        prt = Print(proc=self.terse, scheduler=sched)
        prt.input.df = m.output.table

    sched.on_tick_once(add_min)
    sleep(1)
    sched.stop()
    sched.join()
def test_switch_if_then(self):
    """Switch with a true condition must route data to the primary branch."""
    sched = Scheduler()
    rnd = RandomTable(2, rows=100000, scheduler=sched)
    stirrer = Stirrer(
        update_column="_1",
        delete_rows=5,
        update_rows=5,
        fixed_step_size=100,
        scheduler=sched,
    )
    stirrer.input[0] = rnd.output.result
    # Condition is always True: data flows through the primary output.
    switch = Switch(condition=lambda x: True, scheduler=sched)
    switch.input[0] = stirrer.output.result
    max_ = Max(name="max_" + str(hash(rnd)), scheduler=sched)
    max_.input[0] = switch.output.result
    # The "else" branch still needs a consumer, even if it stays empty.
    pr_else = Print(proc=self.terse, scheduler=sched)
    pr_else.input[0] = switch.output.result_else
    pr = Print(proc=self.terse, scheduler=sched)
    pr.input[0] = max_.output.result
    aio.run(sched.start())
    self.compare(stirrer.result.max(), max_.result)
import os from progressivis import Scheduler, Print, log_level from progressivis.io import SimpleCSVLoader, DynVar from progressivis.stats import Histogram2D, Min, Max from progressivis.datasets import get_dataset from progressivis.vis import StatsExtender from progressivis.table import Table from progressivis.table.constant import Constant from progressivis.utils.psdict import PsDict from progressivis.stats.scaling import MinMaxScaler from progressivis_nb_widgets.nbwidgets import DataViewer from progressivis.datasets import get_dataset from progressivis.core import aio log_level(package='progressivis.table') s = Scheduler.default = Scheduler() PREFIX = '../nyc-taxi/' SUFFIX = '.bz2' URLS = [ PREFIX + 'yellow_tripdata_2015-01.csv' + SUFFIX, PREFIX + 'yellow_tripdata_2015-02.csv' + SUFFIX, PREFIX + 'yellow_tripdata_2015-03.csv' + SUFFIX, PREFIX + 'yellow_tripdata_2015-04.csv' + SUFFIX, PREFIX + 'yellow_tripdata_2015-05.csv' + SUFFIX, PREFIX + 'yellow_tripdata_2015-06.csv' + SUFFIX, ] num_cols = [ 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
def scheduler(self, clean: bool = False) -> Scheduler:
    """Return the cached scheduler, rebuilding it when absent or ``clean``."""
    if clean or self._scheduler is None:
        self._scheduler = Scheduler()
    return self._scheduler
from progressivis import Scheduler, Every #, log_level from progressivis.cluster import MBKMeans, MBKMeansFilter from progressivis.io import CSVLoader from progressivis.vis import MCScatterPlot from progressivis.datasets import get_dataset from progressivis.stats import RandomTable from progressivis.utils.psdict import PsDict import pandas as pd import numpy as np import os.path import tempfile from progressivis.datasets.random import generate_random_multivariate_normal_csv as gen_csv try: s = scheduler except NameError: s = Scheduler() #log_level(package="progressivis.cluster") #dir_name = tempfile.mkdtemp(prefix='progressivis_tmp_') dir_name = os.path.join(tempfile.gettempdir(), 'progressivis_tmp_') os.makedirs(dir_name, exist_ok=True) file_name = os.path.join(dir_name, "foobar.csv") gen_csv(file_name, rows=99999, reset=True) #, header='_0,_1', reset=False) data = CSVLoader(file_name, skipinitialspace=True, header=None, index_col=False, scheduler=s) n_clusters = 3 mbkmeans = MBKMeans(columns=['_0', '_1'], n_clusters=n_clusters,
def _common(
    self,
    rtol: float,
    threshold: Optional[int] = None,
    resetter: Optional[MyResetter] = None,
    resetter_func: Optional[Callable[[Slot], Any]] = None,
    scheduler: Optional[Scheduler] = None,
) -> float:
    """Run a progressive PPCA pipeline on mnist_784 and score the
    reconstruction with a KNN classifier.

    Returns the KNN score on a sample of reconstructed digits, or 0 when
    the dataset cannot be downloaded.
    """
    # KNN/LABELS are module-level caches shared across calls.
    global KNN, LABELS, INDICES
    if scheduler is None:
        s = Scheduler()
    else:
        s = scheduler
    try:
        dataset = get_dataset("mnist_784")
    except TimeoutError:
        # Best-effort: skip scoring when the dataset is unreachable.
        print("Cannot download mnist")
        return 0
    # Load all pixel columns as a single array, excluding the label column.
    data = CSVLoader(
        dataset,
        index_col=False,
        as_array="array",
        usecols=lambda x: x != "class",
        scheduler=s,
    )
    ppca = PPCA(scheduler=s)
    ppca.input[0] = data.output.result
    ppca.params.n_components = N_COMPONENTS
    if resetter:
        assert callable(resetter_func)
        resetter.input[0] = ppca.output.result
    ppca.create_dependent_modules(
        rtol=rtol,
        trace=TRACE,
        threshold=threshold,
        resetter=resetter,
        resetter_func=resetter_func,
    )
    prn = Every(scheduler=s, proc=_print)
    prn.input[0] = ppca.reduced.output.result
    aio.run(s.start())
    # Reconstruct the original space from the reduced representation.
    pca_ = ppca._transformer["inc_pca"]
    recovered = pca_.inverse_transform(_array(ppca.reduced.table))
    if KNN is None:
        # First call only: fit the classifier on a sample of the raw data.
        # NOTE(review): training is cached globally; assumes the dataset is
        # identical across calls — confirm.
        print("Init KNN")
        KNN = KNeighborsClassifier(NNEIGHBOURS)
        arr = _array(data.table)
        df: pd.DataFrame = pd.read_csv(
            dataset, usecols=["class"]  # type: ignore
        )
        LABELS = df.values.reshape((-1,))
        indices_t = sample_without_replacement(
            n_population=len(data.table),
            n_samples=TRAIN_SAMPLE_SIZE,
            random_state=RANDOM_STATE,
        )
        KNN.fit(arr[indices_t], LABELS[indices_t])
    # Score on a disjointly-seeded sample of reconstructed rows.
    indices_p = sample_without_replacement(
        n_population=len(data.table),
        n_samples=PREDICT_SAMPLE_SIZE,
        random_state=RANDOM_STATE * 2 + 1,
    )
    return KNN.score(recovered[indices_p], LABELS[indices_p])  # type: ignore
def p10s_read_csv(f):
    """Benchmark loading the CSV source ``f`` through a CSVLoader."""
    sched = Scheduler()
    loader = CSVLoader(f, index_col=False, header=None, scheduler=sched)
    loader.start()
def p10s_random(self):
    """Benchmark generating ``current_step * L`` random rows (hdf5 backend)."""
    StorageEngine.default = "hdf5"
    sched = Scheduler()
    RandomTable(10, rows=self.current_step * L, scheduler=sched)
    sched.start()
def make_df(n, L):
    """Run a random-table pipeline of ``n * L`` rows; return it as a DataFrame."""
    sched = Scheduler()
    rnd = RandomTable(10, rows=n * L, scheduler=sched)
    sched.start()
    content = rnd.output.table.output_module.table().to_dict()
    return pd.DataFrame(content)
def p10s_zarr_random(n):
    """Benchmark generating ``n * L`` random rows on the zarr backend."""
    StorageEngine.default = "zarr"
    sched = Scheduler()
    RandomTable(10, rows=n * L, scheduler=sched)
    sched.start()