async def modify_3(scheduler: Scheduler, run_number: int) -> None:
    """Scripted step: add a second scatterplot to the already-running scheduler.

    Runs from the scheduler loop (registered via ``on_loop``); ``self``,
    ``table`` and ``modify_4`` are captured from the enclosing test method.
    """
    print("Adding scatterplot_2")
    # The previous scripted step is expected to have removed scatterplot_1.
    self.assertFalse("scatterplot_1" in scheduler)
    with scheduler:
        scatter = MCScatterPlot(
            name="scatterplot_2",
            classes=[("Scatterplot", "a", "c")],
            approximate=True,
            scheduler=scheduler,
        )
        scatter.create_dependent_modules(table, "result")
    # Chain the next scripted activity a few loops later.
    scheduler.on_loop(modify_4, 10)
async def modify_1(scheduler: Scheduler, run_number: int) -> None:
    """Scripted step: add the first scatterplot while the scheduler is running.

    ``table`` and ``modify_2`` are captured from the enclosing test method.
    """
    print("Adding scatterplot_1")
    with scheduler as dataflow:
        scatter = MCScatterPlot(
            name="scatterplot_1",
            classes=[("Scatterplot", "a", "b")],
            approximate=True,
            scheduler=scheduler,
        )
        scatter.create_dependent_modules(table, "result")
        print(f"Created scatterplot_1, groups: {dataflow.groups()}")
    # Chain the next scripted activity a few loops later.
    scheduler.on_loop(modify_2, 10)
def test_scatterplot2(self):
    """Drive an approximate multiclass scatterplot over a random table and
    check that the sampled points stay inside the patched query bounds."""
    s = self.scheduler()
    source = RandomTable(2, rows=2000000, scheduler=s)
    plot = MCScatterPlot(scheduler=s,
                         classes=[('Scatterplot', '_1', '_2')],
                         approximate=True)
    plot.create_dependent_modules(source, 'table', with_sampling=False)
    counter = Every(proc=self.terse, constant_time=True, scheduler=s)
    counter.input.df = source.output.table
    printer = Print(proc=self.terse, scheduler=s)
    printer.input.df = plot.output.table
    # Patch the variables / scatterplot so bounds are injected during the run.
    decorate(s, VariablePatch1("variable_1"))
    decorate(s, VariablePatch2("variable_2"))
    decorate(s, ScatterPlotPatch("mc_scatter_plot_1"))
    plot.scheduler().start(idle_proc=idle_proc)
    s.join()
    js = plot.to_json()
    xs, ys, _ = zip(*js['sample']['data'])
    # Every sampled point must fall inside the injected bounding box.
    self.assertGreaterEqual(min(xs), LOWER_X)
    self.assertGreaterEqual(min(ys), LOWER_Y)
    self.assertLessEqual(max(xs), UPPER_X)
    self.assertLessEqual(max(ys), UPPER_Y)
def test_scatterplot(self):
    """Load a small CSV dataset through an approximate multiclass scatterplot
    and verify every row was consumed."""
    s = self.scheduler()
    loader = CSVLoader(get_dataset('smallfile'), index_col=False,
                       header=None, force_valid_ids=True, scheduler=s)
    plot = MCScatterPlot(scheduler=s,
                         classes=[('Scatterplot', '_1', '_2')],
                         approximate=True)
    plot.create_dependent_modules(loader, 'table')
    counter = Every(proc=self.terse, constant_time=True, scheduler=s)
    counter.input.df = loader.output.table
    printer = Print(proc=self.terse, scheduler=s)
    printer.input.df = plot.output.table
    loader.scheduler().start(idle_proc=idle_proc)
    s.join()
    # 'smallfile' contains exactly 30000 rows.
    self.assertEqual(len(loader.table()), 30000)
def test_scatterplot2(self) -> None:
    """Drive an approximate multiclass scatterplot over a throttled random
    table, inject query bounds at runtime through the dynamic variables,
    and check the sampled points honor those bounds."""
    s = self.scheduler(clean=True)
    # Build the whole dataflow inside one scheduler transaction.
    with s:
        random = RandomTable(2, rows=2000000, throttle=1000, scheduler=s)
        sp = MCScatterPlot(scheduler=s,
                           classes=[("Scatterplot", "_1", "_2")],
                           approximate=True)
        sp.create_dependent_modules(random, "result", with_sampling=False)
        cnt = Every(proc=self.terse, constant_time=True, scheduler=s)
        cnt.input[0] = random.output.result
        prt = Print(proc=self.terse, scheduler=s)
        prt.input[0] = sp.output.result

    # Scheduled callbacks that simulate user input: push the lower and
    # upper corners of the query box into the two dynamic variables.
    # NOTE(review): "dyn_var_1"/"dyn_var_2" are module names presumably
    # created by create_dependent_modules — confirm against MCScatterPlot.
    async def fake_input_1(scheduler: Scheduler, rn: int) -> None:
        module = scheduler["dyn_var_1"]
        print("from input dyn_var_1")
        await module.from_input({"x": LOWER_X, "y": LOWER_Y})

    async def fake_input_2(scheduler: Scheduler, rn: int) -> None:
        module = scheduler["dyn_var_2"]
        print("from input dyn_var_2")
        await module.from_input({"x": UPPER_X, "y": UPPER_Y})

    # finp1 = fake_input(s, "dyn_var_1", 6, {"x": LOWER_X, "y": LOWER_Y})
    # finp2 = fake_input(s, "dyn_var_2", 6, {"x": UPPER_X, "y": UPPER_Y})
    # sts = sleep_then_stop(s, 10)
    s.on_loop(self._stop, 10)  # stop the scheduler after 10 loops
    # s.on_loop(prt)
    s.on_loop(fake_input_1, 3)
    s.on_loop(fake_input_2, 3)
    # aio.run_gather(sp.scheduler().start(), sts)
    aio.run(s.start())
    js = sp.to_json()
    x, y, _ = zip(*js["sample"]["data"])
    min_x = min(x)
    max_x = max(x)
    min_y = min(y)
    max_y = max(y)
    # All sampled points must lie inside the injected bounding box.
    self.assertGreaterEqual(min_x, LOWER_X)
    self.assertGreaterEqual(min_y, LOWER_Y)
    self.assertLessEqual(max_x, UPPER_X)
    self.assertLessEqual(max_y, UPPER_Y)
async def modify_1(scheduler: Scheduler, run_number: int) -> None:
    """Scripted step: add scatterplot_1, then verify that committing a
    dataflow containing a module with an unconnected input raises
    ProgressiveError.

    ``self``, ``table`` and ``modify_2`` are captured from the enclosing
    test method.
    """
    print("Adding scatterplot_1")
    with scheduler as dataflow:
        dataflow1 = dataflow
        sp = MCScatterPlot(
            name="scatterplot_1",
            classes=[("Scatterplot", "a", "b")],
            approximate=True,
            scheduler=scheduler,
        )
        sp.create_dependent_modules(table, "result")
        print(f"Created scatterplot_1, groups: {dataflow.groups()}")
    # Re-entering the scheduler must hand back the same dataflow object.
    # The Print module is deliberately left unconnected (see the
    # commented-out wiring line), so committing on __exit__ is expected
    # to raise ProgressiveError — TODO confirm this is the intended
    # failure mode rather than the re-entry itself.
    with self.assertRaises(ProgressiveError):
        with scheduler as dataflow:
            self.assertIs(dataflow, dataflow1)
            prt = Print(name="print", proc=self.terse, scheduler=scheduler)
            # prt.input.df = table.output.result
            _ = prt
    scheduler.on_loop(modify_2, 3)  # Schedule the next activity
def test_scatterplot(self) -> None:
    """Stream a small CSV file through an approximate multiclass
    scatterplot and verify every row was consumed."""
    s = self.scheduler(clean=True)
    with s:
        loader = CSVLoader(
            get_dataset("smallfile"),
            index_col=False,
            header=None,
            force_valid_ids=True,
            scheduler=s,
        )
        plot = MCScatterPlot(scheduler=s,
                             classes=[("Scatterplot", "_1", "_2")],
                             approximate=True)
        plot.create_dependent_modules(loader, "result")
        counter = Every(proc=self.terse, constant_time=True, scheduler=s)
        counter.input[0] = loader.output.result
        printer = Print(proc=self.terse, scheduler=s)
        printer.input[0] = plot.output.result
    # Stop after a few scheduler loops rather than sleeping a wall-clock delay.
    s.on_loop(self._stop, 5)
    aio.run(loader.scheduler().start())
    # 'smallfile' contains exactly 30000 rows.
    self.assertEqual(len(loader.table), 30000)
# NOTE(review): this chunk starts mid-expression — "scheduler=s)" closes a
# constructor call (presumably the mbkmeans = MBKMeans(...) used below) whose
# opening lies outside this view. Confirm against the full file.
                     scheduler=s)
# Build one scatterplot class per k-means cluster, each fed by a filter
# module that keeps only the points assigned to that cluster.
classes = []
for i in range(n_clusters):
    cname = f"k{i}"
    filt = MBKMeansFilter(i)
    filt.create_dependent_modules(mbkmeans, data, 'table')
    classes.append({
        'name': cname,
        'x_column': '_0',
        'y_column': '_1',
        # Only the first class carries the centroid sample overlay.
        'sample': mbkmeans if i == 0 else None,
        'input_module': filt,
        'input_slot': 'table'
    })
sp = MCScatterPlot(scheduler=s, classes=classes)
sp.create_dependent_modules()
for i in range(n_clusters):
    cname = f"k{i}"
    # Widen every class's 2-D range query to (-inf, +inf) so that no point
    # is filtered out before the user narrows the view.
    sp[cname].min_value._table = PsDict({'_0': -np.inf, '_1': -np.inf})
    sp[cname].max_value._table = PsDict({'_0': np.inf, '_1': np.inf})
mbkmeans.input.table = data.output.table
mbkmeans.create_dependent_modules()
sp.move_point = mbkmeans.moved_center  # for input management


def myprint(d):
    # Compact progress reporting: print full status once convergence is
    # known, otherwise a dot per call.
    if d['convergence'] != 'unknown':
        print(d)
    else:
        print('.', end='')
"""Standalone demo: mini-batch k-means over a random table, visualized
with an approximate multiclass scatterplot."""
from progressivis import Scheduler, Print
from progressivis.cluster import MBKMeans
from progressivis.stats import RandomTable
from progressivis.vis import MCScatterPlot
import asyncio as aio

# Reuse an ambient scheduler (e.g. injected by a notebook environment)
# when one exists, otherwise create a fresh one.  A bare "except:" here
# would also swallow KeyboardInterrupt and real errors, so catch only
# the NameError raised when `scheduler` is undefined (matching the
# sibling example scripts).
try:
    s = scheduler
except NameError:
    s = Scheduler()

table = RandomTable(columns=['a', 'b'], rows=50000, throttle=500, scheduler=s)
mbkmeans = MBKMeans(columns=['a', 'b'], n_clusters=8, batch_size=100,
                    is_input=False, scheduler=s)
mbkmeans.input.table = table.output.table
prn = Print(scheduler=s)
prn.input.df = mbkmeans.output.table
sp = MCScatterPlot(scheduler=s, classes=[('Scatterplot', 'a', 'b')],
                   approximate=True)
sp.create_dependent_modules(mbkmeans, 'table')
# Force both histogram indices to start refining immediately.
sp['Scatterplot'].range_query_2d.hist_index_x.params.init_threshold = 1
sp['Scatterplot'].range_query_2d.hist_index_y.params.init_threshold = 1

if __name__ == '__main__':
    # Run the scheduler for up to an hour alongside the demo dataflow.
    aio.run(s.start(coros=[aio.sleep(3600)]))
#log_level(package="progressivis.cluster") file_name = "/tmp/foobar.csv" gen_csv(file_name, rows=999999, reset=True) #, header='_0,_1', reset=False) data = CSVLoader(file_name, skipinitialspace=True, header=None, index_col=False, scheduler=s) mbkmeans = MBKMeans(columns=['_0', '_1'], n_clusters=3, batch_size=100, tol=0.01, is_input=False, scheduler=s) sp = MCScatterPlot(scheduler=s, classes=[('Scatterplot', '_0', '_1', mbkmeans)]) sp.create_dependent_modules(data, 'table') sp['Scatterplot'].min_value._table = PsDict({'_0': -np.inf, '_1': -np.inf}) sp['Scatterplot'].max_value._table = PsDict({'_0': np.inf, '_1': np.inf}) mbkmeans.input.table = sp['Scatterplot'].range_query_2d.output.table #mbkmeans.input.table = data.output.table mbkmeans.create_dependent_modules() sp.move_point = mbkmeans.moved_center # for input management def myprint(d): if d['convergence'] != 'unknown': print(d) else: print('.', end='')
# NOTE(review): this chunk starts mid-list — the URLS list (and PREFIX/SUFFIX)
# opens outside this view. Confirm against the full file.
    PREFIX + 'yellow_tripdata_2015-06.csv' + SUFFIX,
]
FILENAMES = pd.DataFrame({'filename': URLS})
# Constant module feeding the list of CSV files to load progressively.
CST = Constant(Table('filenames', data=FILENAMES), scheduler=s)
CSV = CSVLoader(index_col=False, skipinitialspace=True,
                usecols=['pickup_longitude', 'pickup_latitude'],
                filter_=_filter, scheduler=s)
CSV.input.filenames = CST.output.table
PR = Every(scheduler=s)
PR.input.df = CSV.output.table
SCATTERPLOT = MCScatterPlot(scheduler=s,
                            classes=[('Scatterplot',
                                      'pickup_longitude',
                                      'pickup_latitude')],
                            approximate=True)
SCATTERPLOT.create_dependent_modules(CSV, 'table')
# Tune interactive responsiveness: short iterations while these modules starve.
s.set_interaction_opts(starving_mods=SCATTERPLOT.get_starving_mods(),
                       max_iter=3, max_time=1.5)

if __name__ == '__main__':
    s.start()
    # Poll the running scheduler forever, pretending to be a web client.
    while True:
        time.sleep(2)
        s.to_json()
        SCATTERPLOT.to_json()  # simulate a web query
        # SCATTERPLOT.get_image()
    # NOTE(review): unreachable — the loop above never breaks.
    s.join()
    print(len(CSV.table()))
# Constant module feeding the list of CSV files to load progressively.
CST = Constant(Table('filenames', data=FILENAMES), scheduler=s)
CSV = CSVLoader(index_col=False, skipinitialspace=True,
                usecols=['pickup_longitude', 'pickup_latitude',
                         'dropoff_longitude', 'dropoff_latitude'],
                filter_=_filter, scheduler=s)
# TODO: reimplement filter in read_csv.py
CSV.input.filenames = CST.output.result
PR = Every(scheduler=s)
PR.input.df = CSV.output.result
# Two scatterplot classes: pickup and dropoff locations on the same view.
MULTICLASS = MCScatterPlot(scheduler=s,
                           classes=[('pickup', 'pickup_longitude',
                                     'pickup_latitude'),
                                    ('dropoff', 'dropoff_longitude',
                                     'dropoff_latitude')],
                           approximate=True)
MULTICLASS.create_dependent_modules(CSV, 'result')


async def coro(s):
    # Side task: simulate one delayed web query against the scheduler.
    await aio.sleep(2)
    print("awake after 2 sec.")
    s.to_json()


if __name__ == '__main__':
    # Run the scheduler for up to an hour alongside the probe coroutine.
    aio.run(s.start(coros=[coro(s), aio.sleep(3600)]))
    print(len(CSV.table()))
# Constant module feeding the list of CSV files to load progressively.
CST = Constant(Table('filenames', data=FILENAMES), scheduler=s)
CSV = CSVLoader(index_col=False, skipinitialspace=True,
                usecols=['pickup_longitude', 'pickup_latitude',
                         'dropoff_longitude', 'dropoff_latitude'],
                filter_=_filter, scheduler=s)
# TODO: reimplement filter in read_csv.py
CSV.input.filenames = CST.output.table
PR = Every(scheduler=s)
PR.input.df = CSV.output.table
# Two scatterplot classes: pickup and dropoff locations on the same view.
MULTICLASS = MCScatterPlot(scheduler=s,
                           classes=[('pickup', 'pickup_longitude',
                                     'pickup_latitude'),
                                    ('dropoff', 'dropoff_longitude',
                                     'dropoff_latitude')],
                           approximate=True)
MULTICLASS.create_dependent_modules(CSV, 'table')
# Tune interactive responsiveness: short iterations while these modules starve.
s.set_interaction_opts(starving_mods=MULTICLASS.get_starving_mods(),
                       max_iter=3, max_time=1.5)

if __name__ == '__main__':
    s.start()
    # Poll the running scheduler forever, pretending to be a web client.
    while True:
        time.sleep(2)
        s.to_json()
        MULTICLASS.to_json()  # simulate a web query
    # NOTE(review): unreachable — the loop above never breaks.
    s.join()
    print(len(CSV.table()))
Clustering datasets may be found at http://cs.joensuu.fi/sipu/datasets/ """ from progressivis import Scheduler, Every#, log_level from progressivis.cluster import MBKMeans from progressivis.io import CSVLoader from progressivis.vis import MCScatterPlot from progressivis.datasets import get_dataset from progressivis.stats import RandomTable try: s = scheduler except NameError: s = Scheduler() #log_level(package="progressivis.cluster") data = CSVLoader(get_dataset('cluster:s1'),sep='\\s+',skipinitialspace=True,header=None,index_col=False,scheduler=s) mbkmeans = MBKMeans(columns=['_0', '_1'], n_clusters=15, batch_size=100, is_input=False, scheduler=s) sp = MCScatterPlot(scheduler=s, classes=[('Scatterplot', '_0', '_1', mbkmeans)]) sp.create_dependent_modules(data,'table') mbkmeans.input.table = data.output.table prn = Every(scheduler=s) prn.input.df = mbkmeans.output.table if __name__ == '__main__': #data.start() #s.join() aio.run(s.start())