def test_filter(self):
    s = Scheduler()
    random = RandomTable(2, rows=100000, scheduler=s)
    filter_ = FilterMod(expr='_1 > 0.5', scheduler=s)
    filter_.input.table = random.output.table
    pr = Print(proc=self.terse, scheduler=s)
    pr.input.df = filter_.output.table
    s.start()
    s.join()
    # The filter's selection must match the rows where '_1 > 0.5',
    # evaluated directly on the input table.
    idx = filter_.get_input_slot('table').data().eval(
        '_1>0.5', result_object='index')
    self.assertEqual(filter_._table.selection, bitmap(idx))
def test_dummy(self):
    s = Scheduler()
    random = RandomTable(2, rows=100000, scheduler=s)
    # DummyMod updates column '_1' and deletes/updates a few rows at each step.
    dummy_ = DummyMod(update_column='_1',
                      delete_rows=5,
                      update_rows=5,
                      fixed_step_size=100,
                      scheduler=s)
    dummy_.input.table = random.output.table
    max_ = Max(name='max_' + str(hash(random)), scheduler=s)
    max_.input.table = dummy_.output.table
    pr = Print(proc=self.terse, scheduler=s)
    pr.input.df = max_.output.table
    s.start()
    s.join()
def test_dataflow(self):
    s = Scheduler()
    # Build the modules inside a Dataflow context, without passing
    # scheduler= explicitly.
    with Dataflow(s):
        csv = CSVLoader(get_dataset('bigfile'), name="csv",
                        index_col=False, header=None)
        m = Min()
        m.input.table = csv.output.table
        prt = Print(proc=self.terse)
        prt.input.df = m.output.table
    self.assertIs(s["csv"], csv)
    csv.scheduler().start()
    sleep(1)
    self.assertTrue(csv.scheduler().is_running())
    s.stop()
    s.join()
def test_scheduler(self):
    s = Scheduler()
    csv = CSVLoader(get_dataset('bigfile'), name="csv",
                    index_col=False, header=None, scheduler=s)
    #smp = Sample(n=10, scheduler=s)
    #smp.input.df = csv.output.table
    self.assertIs(s["csv"], csv)
    csv.scheduler().start()
    sleep(1)
    self.assertTrue(csv.scheduler().is_running())
    #smp2 = Sample(n=15, scheduler=s)
    #smp2.input.df = csv.output.df

    def add_min():
        m = Min(scheduler=s)
        # Of course, sleeping here is a bad idea. This is only to illustrate
        # that add_min is executed atomically by the scheduler. Sleeping
        # outside of on_tick_once would leave the scheduler in an
        # inconsistent state.
        #sleep(1)
        m.input.table = csv.output.table
        prt = Print(proc=self.terse, scheduler=s)
        prt.input.df = m.output.table

    s.on_tick_once(add_min)
    sleep(1)
    #self.assertTrue(s._runorder.index(smp.id) > s._runorder.index(csv.id))
    #self.assertTrue(s._runorder.index(smp2.id) > s._runorder.index(csv.id))
    #self.assertTrue(s._runorder.index(m.id) > s._runorder.index(smp2.id))
    s.stop()
    s.join()
""" Clustering datasets may be found at https://cs.joensuu.fi/sipu/datasets/ """ from progressivis import Scheduler, Every#, log_level from progressivis.cluster import MBKMeans from progressivis.io import CSVLoader from progressivis.vis import ScatterPlot from progressivis.datasets import get_dataset try: s = scheduler except NameError: s = Scheduler() #log_level(package="progressivis.cluster") data = CSVLoader(get_dataset('cluster:s1'),sep='\\s+',skipinitialspace=True,header=None,index_col=False,scheduler=s) mbkmeans = MBKMeans(columns=['_0', '_1'], n_clusters=15, batch_size=100, is_input=False, scheduler=s) mbkmeans.input.table = data.output.table prn = Every(scheduler=s) prn.input.df = mbkmeans.output.table sp = ScatterPlot('_0','_1', scheduler=s) sp.move_point = mbkmeans # for input management sp.create_dependent_modules(data,'table', sample=None, select=mbkmeans) if __name__ == '__main__': data.start() s.join()
# Variant of the example above wired through 'df' slots; it assumes the
# CSVLoader 'data' and the Scheduler 's' created earlier.
mbkmeans = MBKMeans(columns=[0, 1], n_clusters=15, batch_size=100, scheduler=s)
mbkmeans.input.df = data.output.df
prn = Every(scheduler=s)
prn.input.df = mbkmeans.output.df
sp = ScatterPlot(0, 1, scheduler=s)
sp.move_point = mbkmeans  # for input management
#sp.create_dependent_modules(mbkmeans, 'centroids')

# Create the modules by hand rather than with the utility:
# show the cluster centroids on the scatterplot and the data as a heatmap.

# histogram2d
histogram2d = Histogram2D(0, 1, scheduler=s)
histogram2d.input.df = data.output.df
min_mod = Min([0, 1], scheduler=s)
max_mod = Max([0, 1], scheduler=s)
min_mod.input.df = data.output.df
max_mod.input.df = data.output.df
histogram2d.input.min = min_mod.output.df
histogram2d.input.max = max_mod.output.df

# heatmap
heatmap = Heatmap(filename='heatmap%d.png', history=100, scheduler=s)
heatmap.input.array = histogram2d.output.df

# scatterplot
sp.input.heatmap = heatmap.output.heatmap
sp.input.df = mbkmeans.output.df

if __name__ == '__main__':
    data.start()
    s.join()