def test_scatterplot(self):
    """Build a CSV -> ScatterPlot pipeline on 'bigfile' and check the row count.

    A CSVLoader reads the 'bigfile' dataset, a ScatterPlot over columns
    '_1' and '_2' is attached to its 'df' output, an Every module calls
    print_len on the loader output, and a Print module is fed from the
    scatterplot's histogram2d output. After the scheduler is started, the
    loader is expected to hold exactly 1,000,000 rows.
    """
    s = Scheduler()
    csv = CSVLoader(get_dataset('bigfile'), index_col=False, header=None,
                    force_valid_ids=True, scheduler=s)
    sp = ScatterPlot(x_column='_1', y_column='_2', scheduler=s)
    sp.create_dependent_modules(csv, 'df')
    cnt = Every(proc=print_len, constant_time=True, scheduler=s)
    cnt.input.df = csv.output.df
    prt = Print(scheduler=s)
    prt.input.df = sp.histogram2d.output.df
    # NOTE(review): the assertion below presumes start() has finished loading
    # by the time it returns -- confirm against the Scheduler API.
    csv.scheduler().start(None, idle_proc)
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(len(csv.df()), 1000000)
"""
Clustering datasets may be found at
https://cs.joensuu.fi/sipu/datasets/
"""
# Disabled debug aid left by the author: also import log_level from
# progressivis and call log_level(package="progressivis.cluster").
from progressivis import Scheduler, Every
from progressivis.cluster import MBKMeans
from progressivis.io import CSVLoader
from progressivis.vis import ScatterPlot
from progressivis.datasets import get_dataset

# Reuse a `scheduler` already present in the namespace; otherwise make one.
try:
    s = scheduler
except NameError:
    s = Scheduler()

# Loader for the 'cluster:s1' dataset: regex separator, no header row.
data = CSVLoader(
    get_dataset('cluster:s1'),
    sep='\\s+',
    skipinitialspace=True,
    header=None,
    index_col=False,
    scheduler=s,
)

# Mini-batch k-means over the first two columns, fed from the loader.
mbkmeans = MBKMeans(
    columns=['_0', '_1'],
    n_clusters=15,
    batch_size=100,
    is_input=False,
    scheduler=s,
)
mbkmeans.input.table = data.output.table

# Every module fed from the clustering output.
prn = Every(scheduler=s)
prn.input.df = mbkmeans.output.table

# Scatterplot of the same two columns, with mbkmeans passed as `select`.
sp = ScatterPlot('_0', '_1', scheduler=s)
sp.move_point = mbkmeans  # for input management
sp.create_dependent_modules(
    data,
    'table',
    sample=None,
    select=mbkmeans,
)

if __name__ == '__main__':
    data.start()
    s.join()
l = df['pickup_longitude'] return df[(l < -70) & (l > -80)] def print_len(x): if x is not None: print(len(x)) #log_level() try: s = scheduler except: s = Scheduler() csv = CSVLoader(get_dataset('bigfile'), header=None, index_col=False, force_valid_ids=True, scheduler=s) pr = Every(scheduler=s) pr.input.df = csv.output.table scatterplot = ScatterPlot(x_column='_1', y_column='_2', scheduler=s) scatterplot.create_dependent_modules(csv, 'table') if __name__ == '__main__': csv.start() s.join() print(len(csv.df()))
# Scheduler and Every used below come from this star import.
from progressivis import *
from progressivis.vis import ScatterPlot
from progressivis.io import CSVLoader
from progressivis.datasets import get_dataset


def filter(df):
    """Select rows whose 'pickup_longitude' is strictly between -80 and -70.

    Assumes `df` supports pandas-style column access and boolean-mask
    indexing -- TODO confirm against callers.

    NOTE: this name shadows the builtin `filter`; it is kept because it is
    part of the module's visible interface.
    """
    lon = df['pickup_longitude']
    return df[(lon < -70) & (lon > -80)]


def print_len(x):
    """Print len(x); do nothing when x is None."""
    if x is not None:
        # Call form instead of the Python 2 print statement: it is valid on
        # Python 3 and prints the same text for a single argument on Python 2.
        print(len(x))


#log_level()

# Reuse a `scheduler` already present in the namespace; otherwise make one.
# Only NameError is caught: a bare `except:` would also swallow
# KeyboardInterrupt/SystemExit and hide unrelated failures.
try:
    s = scheduler
except NameError:
    s = Scheduler()

# Loader for the 'bigfile' dataset (no header row).
csv = CSVLoader(get_dataset('bigfile'), header=None, index_col=False,
                force_valid_ids=True, scheduler=s)

# Every module fed from the loader's df output.
pr = Every(scheduler=s)
pr.input.df = csv.output.df

# Scatterplot of columns '_1' and '_2', attached to the loader's 'df' output.
scatterplot = ScatterPlot('_1', '_2', scheduler=s)
scatterplot.create_dependent_modules(csv, 'df')

if __name__ == '__main__':
    csv.start()
    s.join()
    print(len(csv.df()))
from progressivis import Scheduler, Print
from progressivis.cluster import MBKMeans
from progressivis.stats import RandomTable
from progressivis.vis import ScatterPlot

# Reuse a `scheduler` already present in the namespace; otherwise make one.
# Only NameError is caught: a bare `except:` would also swallow
# KeyboardInterrupt/SystemExit and hide unrelated failures.
try:
    s = scheduler
except NameError:
    s = Scheduler()

# RandomTable source with columns 'a' and 'b'.
table = RandomTable(columns=['a', 'b'], rows=50000, throttle=500, scheduler=s)

# Mini-batch k-means with 8 clusters over both columns.
mbkmeans = MBKMeans(columns=['a', 'b'], n_clusters=8, batch_size=100,
                    is_input=False, scheduler=s)
mbkmeans.input.table = table.output.table

# Print module fed from the clustering output.
prn = Print(scheduler=s)
prn.input.df = mbkmeans.output.table

# Scatterplot of 'a' vs 'b', attached to the 'table' output of mbkmeans.
sp = ScatterPlot('a', 'b', scheduler=s)
sp.create_dependent_modules(mbkmeans, 'table')

if __name__ == '__main__':
    table.start()