def test_histogram1d1(self) -> None:
    """Compare Histogram1D on column "_2" of "bigfile" with ``np.histogram``.

    The last row of the progressive histogram supplies the bin counts
    ("array") and the bounds ("min", "max"); the reference histogram is
    computed with numpy over the same column read through pandas, using
    the module's own bin count and those bounds.
    """
    sched = self.scheduler()
    loader = CSVLoader(
        get_dataset("bigfile"), index_col=False, header=None, scheduler=sched
    )
    lower = Min(scheduler=sched)
    lower.input[0] = loader.output.result
    upper = Max(scheduler=sched)
    upper.input[0] = loader.output.result
    # columns are called 1..30
    histo = Histogram1D("_2", scheduler=sched)
    histo.input[0] = loader.output.result
    histo.input.min = lower.output.result
    histo.input.max = upper.output.result
    printer = Every(proc=self.terse, scheduler=sched)
    printer.input[0] = histo.output.result
    aio.run(sched.start())
    # The trace is requested but its content is not checked.
    histo.trace_stats()
    last_row = notNone(histo.table.last()).to_dict()
    progressive_counts = last_row["array"]
    value_range = (last_row["min"], last_row["max"])
    frame = pd.read_csv(
        get_dataset("bigfile"), header=None, usecols=[2]  # type: ignore
    )
    values = frame.to_numpy().reshape(-1)
    expected_counts, _edges = np.histogram(  # type: ignore
        values, bins=histo.params.bins, density=False, range=value_range
    )
    self.assertListEqual(progressive_counts.tolist(), expected_counts.tolist())
def test_histogram2d1(self) -> None:
    """Compare Histogram2D on columns 1 and 2 of "bigfile" with ``fh.histogram2d``.

    The reference histogram is built with the y column first and then
    flipped along axis 0 before being compared (``np.allclose``) with the
    "array" entry of the module's last row.
    """
    sched = self.scheduler()
    loader = CSVLoader(
        get_dataset("bigfile"), index_col=False, header=None, scheduler=sched
    )
    lower = Min(scheduler=sched)
    lower.input[0] = loader.output.result
    upper = Max(scheduler=sched)
    upper.input[0] = loader.output.result
    # columns are called 1..30
    histo = Histogram2D(1, 2, xbins=100, ybins=100, scheduler=sched)
    histo.input[0] = loader.output.result
    histo.input.min = lower.output.result
    histo.input.max = upper.output.result
    heatmap = Heatmap(filename="histo_%03d.png", scheduler=sched)
    heatmap.input.array = histo.output.result
    printer = Every(proc=self.terse, scheduler=sched)
    printer.input[0] = heatmap.output.result
    aio.run(loader.scheduler().start())
    last_row = notNone(histo.table.last()).to_dict()
    progressive = last_row["array"]
    y_range = [last_row["ymin"], last_row["ymax"]]
    x_range = [last_row["xmin"], last_row["xmax"]]
    bounds = [y_range, x_range]
    frame = pd.read_csv(
        get_dataset("bigfile"), header=None, usecols=[1, 2]  # type: ignore
    )
    values = frame.to_numpy()
    bins = [histo.params.ybins, histo.params.xbins]
    expected = fh.histogram2d(values[:, 1], values[:, 0], bins=bins, range=bounds)
    expected = np.flip(expected, axis=0)  # type: ignore
    self.assertTrue(np.allclose(progressive, expected))
def test_09_read_multi_csv_file_with_crash(self) -> None:
    """Interrupt a two-file CSV load, then resume it with ``recovery=True``.

    First run: the loader is started together with
    ``sleep_then_stop(s, 3)`` and then closed with ``_close``.
    Second run: a new loader on a clean scheduler reuses the same
    recovery tag; the final table must hold 2000000 rows.
    """
    recovery_tag = "t9"
    paths = [get_dataset("bigfile"), get_dataset("bigfile")]

    # First run, interrupted by sleep_then_stop.
    sched = self.scheduler()
    loader = CSVLoader(
        paths,
        index_col=False,
        recovery_tag=recovery_tag,
        header=None,
        scheduler=sched,
    )
    self.assertTrue(loader.result is None)
    sink = Sink(name="sink", scheduler=sched)
    sink.input.inp = loader.output.result
    stopper = sleep_then_stop(sched, 3)
    aio.run_gather(sched.start(), stopper)
    _close(loader)

    # Second run, resuming from the recovery information.
    sched = self.scheduler(clean=True)
    loader = CSVLoader(
        paths,
        recovery=True,
        recovery_tag=recovery_tag,
        index_col=False,
        header=None,
        scheduler=sched,
    )
    self.assertTrue(loader.result is None)
    sink = Sink(name="sink", scheduler=sched)
    sink.input.inp = loader.output.result
    aio.run(sched.start())
    self.assertEqual(len(loader.table), 2000000)
def run_throttled_server(port: int = 8000, threshold: int = 10**6) -> None:
    """Serve ``DATA_DIR`` with ``ThrottledReqHandler``.

    Args:
        port: forwarded to ``http_srv.test``.
        threshold: stored on ``ThrottledReqHandler.threshold`` before the
            server is launched.
    """
    # Request the plain and bz2 datasets before serving
    # (presumably so the files exist under DATA_DIR -- confirm).
    for fetch in (get_dataset, get_dataset_bz2):
        for dataset_name in ("smallfile", "bigfile"):
            fetch(dataset_name)
    os.chdir(DATA_DIR)
    ThrottledReqHandler.threshold = threshold
    http_srv.test(HandlerClass=ThrottledReqHandler, port=port)  # type: ignore
def test_read_multiple_csv(self):
    """Feed two "smallfile" paths to CSVLoader through a Constant module.

    The file names travel in a one-column DataFrame ('filename'); the
    loaded frame must contain 60000 rows.
    """
    sched = Scheduler()
    small = [get_dataset('smallfile'), get_dataset('smallfile')]
    names_df = pd.DataFrame({'filename': small})
    names_module = Constant(df=names_df, scheduler=sched)
    loader = CSVLoader(index_col=False, header=None, scheduler=sched)
    loader.input.filenames = names_module.output.df
    loader.start()
    self.assertEqual(len(loader.df()), 60000)
def run_simple_server() -> None:
    """Change to ``DATA_DIR`` and import ``RangeHTTPServer.__main__``.

    NOTE(review): importing the package's ``__main__`` presumably starts
    the range-capable HTTP server as an import side effect -- confirm.
    """
    # Request the plain and bz2 datasets before serving.
    for fetch in (get_dataset, get_dataset_bz2):
        for dataset_name in ("smallfile", "bigfile"):
            fetch(dataset_name)
    os.chdir(DATA_DIR)
    import RangeHTTPServer.__main__  # type: ignore

    # Bare reference to the imported module; it has no effect of its own.
    RangeHTTPServer.__main__
def test_read_multiple_csv(self):
    """Feed two "smallfile" paths to CSVLoader through a Constant table.

    The file names are held in a Table named 'file_names' with a single
    string column; after the scheduler joins, 60000 rows are expected.
    """
    sched = self.scheduler()
    small = [get_dataset('smallfile'), get_dataset('smallfile')]
    names_table = Table(name='file_names',
                        dshape='{filename: string}',
                        data={'filename': small})
    names_module = Constant(table=names_table, scheduler=sched)
    loader = CSVLoader(index_col=False, header=None, scheduler=sched)
    loader.input.filenames = names_module.output.table
    loader.start()
    sched.join()
    self.assertEqual(len(loader.table()), 60000)
def test_07_read_multi_csv_file_no_crash(self):
    """Load "smallfile" twice in one CSVLoader and expect 60000 rows.

    The loader's table must be None before the scheduler runs.
    """
    sched = self.scheduler()
    paths = [get_dataset('smallfile'), get_dataset('smallfile')]
    loader = CSVLoader(paths, index_col=False, header=None, scheduler=sched)
    self.assertTrue(loader.table() is None)
    #decorate(s, Patch1("csv_loader_1"))
    sched.start()
    sched.join()
    self.assertEqual(len(loader.table()), 60000)
def run_throttled_server(port=8000, threshold=10**6):
    """Serve ``DATA_DIR`` with ``ThrottledReqHandler`` on Python 2 or 3.

    Args:
        port: TCP port for the server. On Python 3 it is passed to
            ``http_srv.test``; on Python 2 ``test`` is called without a
            port, so the value is handed over through ``sys.argv[1]``.
        threshold: stored on ``ThrottledReqHandler.threshold`` before the
            server is launched.
    """
    _ = get_dataset('smallfile')
    _ = get_dataset('bigfile')
    _ = get_dataset_bz2('smallfile')
    _ = get_dataset_bz2('bigfile')
    os.chdir(DATA_DIR)
    ThrottledReqHandler.threshold = threshold
    if six.PY2:
        import sys
        # Slice assignment works whether or not argv already has a second
        # element (plain ``sys.argv[1] = ...`` raised IndexError when it did
        # not), and argv entries are kept as strings. The requested port is
        # honoured instead of a hard-coded 8000.
        sys.argv[1:2] = [str(port)]
        http_srv.test(HandlerClass=ThrottledReqHandler)
    else:
        http_srv.test(HandlerClass=ThrottledReqHandler, port=port)
def run_simple_server() -> None:
    """Change to ``DATA_DIR`` and import ``RangeHTTPServer.__main__``.

    The plain, bz2 and gz variants of "smallfile" and "bigfile" are
    requested first.

    NOTE(review): importing the package's ``__main__`` presumably starts
    the range-capable HTTP server as an import side effect -- confirm.
    """
    for fetch in (get_dataset, get_dataset_bz2, get_dataset_gz):
        for dataset_name in ("smallfile", "bigfile"):
            fetch(dataset_name)
    # if six.PY3:
    #    _ = get_dataset_lzma('smallfile')
    #    _ = get_dataset_lzma('bigfile')
    os.chdir(DATA_DIR)
    import RangeHTTPServer.__main__  # type: ignore

    assert RangeHTTPServer.__main__
def test_07_read_multi_csv_file_no_crash(self) -> None:
    """Load "smallfile" twice in one CSVLoader and expect 60000 rows.

    The loader's ``result`` must be None before the scheduler runs; a
    Sink consumes the loader output.
    """
    sched = self.scheduler()
    paths = [get_dataset("smallfile"), get_dataset("smallfile")]
    loader = CSVLoader(paths, index_col=False, header=None, scheduler=sched)
    self.assertTrue(loader.result is None)
    sink = Sink(name="sink", scheduler=sched)
    sink.input.inp = loader.output.result
    aio.run(sched.start())
    self.assertEqual(len(loader.table), 60000)
def run_simple_server():
    """Serve ``DATA_DIR`` with HTTP range support on Python 2 or 3.

    On Python 2, ``SimpleHTTPServer.test`` is run with
    ``RangeRequestHandler`` and port 8000 is handed over through
    ``sys.argv[1]``. On Python 3, ``RangeHTTPServer.__main__`` is imported
    (presumably this starts the server as an import side effect -- confirm).
    """
    _ = get_dataset('smallfile')
    _ = get_dataset('bigfile')
    _ = get_dataset_bz2('smallfile')
    _ = get_dataset_bz2('bigfile')
    os.chdir(DATA_DIR)
    if six.PY2:
        import SimpleHTTPServer
        import RangeHTTPServer
        from RangeHTTPServer import RangeRequestHandler
        import sys
        # Slice assignment works whether or not argv already has a second
        # element (plain ``sys.argv[1] = 8000`` raised IndexError when it
        # did not), and argv entries are kept as strings.
        sys.argv[1:2] = ['8000']
        SimpleHTTPServer.test(HandlerClass=RangeRequestHandler)
    else:
        import RangeHTTPServer.__main__
def test_histogram2d(self):
    """Run a CSV -> Min/Max -> Histogram2D -> Heatmap pipeline on "bigfile".

    Smoke test: it makes no assertions; it runs the scheduler to
    completion and requests the histogram's trace.
    """
    sched = self.scheduler()
    loader = CSVLoader(get_dataset('bigfile'), index_col=False, header=None,
                       scheduler=sched)
    lower = Min(scheduler=sched)
    lower.input.table = loader.output.table
    upper = Max(scheduler=sched)
    upper.input.table = loader.output.table
    # columns are called 1..30
    histo = Histogram2D(1, 2, xbins=100, ybins=100, scheduler=sched)
    histo.input.table = loader.output.table
    histo.input.min = lower.output.table
    histo.input.max = upper.output.table
    heatmap = Heatmap(filename='histo_%03d.png', scheduler=sched)
    heatmap.input.array = histo.output.table
    #pr = Print(scheduler=s)
    printer = Every(proc=self.terse, scheduler=sched)
    #pr.input.df = heatmap.output.heatmap
    #pr.input.df = histogram2d.output.df
    printer.input.df = loader.output.table
    loader.scheduler().start()
    sched.join()
    #self.scheduler.thread.join()
    # The trace is requested but its content is not checked.
    histo.trace_stats()
def test_join(self):
    """Join Stats outputs through ``Reduce(BinJoin, ...)`` and print the trace.

    Three Stats modules (columns 1, 2, 3) read the CSV loader.
    NOTE(review): only the first two are connected to the Reduce; the
    third is created but not joined -- confirm this is intended.
    """
    sched = self.scheduler()
    loader = CSVLoader(get_dataset('bigfile'), index_col=False, header=None,
                       scheduler=sched)
    stats_modules = []
    for column in (1, 2, 3):
        stat = Stats(column, reset_index=True, scheduler=sched)
        stat.input.table = loader.output.table
        stats_modules.append(stat)
    #join=Join(scheduler=s)
    #import pdb;pdb.set_trace()
    reducer = Reduce(BinJoin, "first", "second", "table", scheduler=sched)
    reducer.input.table = stats_modules[0].output.stats
    reducer.input.table = stats_modules[1].output.stats
    joined = reducer.expand()
    printer = Print(proc=self.terse, scheduler=sched)
    printer.input.df = joined.output.table
    length_printer = Every(proc=self.terse, constant_time=True, scheduler=sched)
    length_printer.input.df = loader.output.table
    sched.start()
    trace = joined.trace_stats(max_runs=1)
    print(trace)
def test_read_vec(self):
    """Step a VECLoader run by run and check the rows added by each run.

    After every ``module.run(n)`` the number of rows stamped with the
    latest update value must equal the growth of the frame. At the end,
    grouping by the update column must yield one group per run.
    """
    module = VECLoader(get_dataset('warlogs'), id='test_read_vec')
    self.assertTrue(module.df() is None)
    module.run(0)
    trace = module.trace_stats(max_runs=1)
    df = module.df()
    self.assertFalse(df is None)
    previous_len = len(df)
    self.assertEqual(
        previous_len,
        len(df[df[module.UPDATE_COLUMN] == module.last_update()]))
    cnt = 1
    while not module.is_zombie():
        module.run(cnt)
        cnt += 1
        trace = module.trace_stats(max_runs=1)
        df = module.df()
        current_len = len(df)
        # print() with one pre-formatted string is valid on Python 2 and 3;
        # Series.iloc[-1] replaces irow(-1), which pandas has removed.
        print("Run time: %gs, loaded %d rows"
              % (trace['duration'].iloc[-1], current_len))
        self.assertEqual(
            current_len - previous_len,
            len(df[df[module.UPDATE_COLUMN] == module.last_update()]))
        previous_len = current_len
    trace = module.trace_stats(max_runs=1)
    print("Done. Run time: %gs, loaded %d rows"
          % (trace['duration'].iloc[-1], len(module.df())))
    groups = module.df().groupby([Module.UPDATE_COLUMN])
    self.assertEqual(cnt, len(groups))
def test_join(self):
    """Join the last rows of two Stats modules and compare with pandas.

    The joined frame must expose '1.min', '1.max', '2.min' and '2.max'
    equal to the min/max of columns 1 and 2 of the loaded frame.
    """
    s = Scheduler()
    csv = CSVLoader(get_dataset('bigfile'), index_col=False, header=None,
                    scheduler=s)
    stat1 = Stats(1, scheduler=s)
    stat1.input.df = csv.output.df
    stat2 = Stats(2, scheduler=s)
    stat2.input.df = csv.output.df
    lr1 = LastRow(scheduler=s)
    lr1.input.df = stat1.output.stats
    lr2 = LastRow(scheduler=s)
    lr2.input.df = stat2.output.stats
    join = Join(scheduler=s)
    join.input.df = lr1.output.df
    join.input.df = lr2.output.df
    pr = Print(scheduler=s)
    pr.input.df = join.output.df
    prlen = Every(proc=print_len, constant_time=True, scheduler=s)
    prlen.input.df = csv.output.df
    s.start()
    res = join.trace_stats(max_runs=1)
    pd.set_option('display.expand_frame_repr', False)
    last = join.df()
    df = csv.df()
    # One assertion per statistic, so a failure names the value that differs
    # instead of reporting a bare "False is not true".
    self.assertEqual(last.at[0, '1.min'], df[1].min())
    self.assertEqual(last.at[0, '1.max'], df[1].max())
    self.assertEqual(last.at[0, '2.min'], df[2].min())
    self.assertEqual(last.at[0, '2.max'], df[2].max())
    # print() with a single argument is valid on Python 2 and 3.
    print(res)
def test_mb_k_means(self) -> None:
    """Cluster the "cluster:s3" dataset with MBKMeans (3 clusters).

    After the run, the number of labels must equal the number of rows
    loaded by the CSV loader. If fetching the dataset raises
    ``TimeoutError`` the test is reported as skipped.
    """
    s = self.scheduler()
    n_clusters = 3
    try:
        dataset = (get_dataset("cluster:s3"),)
    except TimeoutError:
        # A plain ``return`` here made the test count as passed although
        # nothing was checked; skipTest records it as skipped instead.
        self.skipTest("Cannot download cluster:s3")
    with s:
        csv = CSVLoader(
            dataset,
            sep=" ",
            skipinitialspace=True,
            header=None,
            index_col=False,
            scheduler=s,
        )
        km = MBKMeans(
            n_clusters=n_clusters,
            random_state=42,
            is_input=False,
            is_greedy=False,
            scheduler=s,
        )
        # km.input.table = csv.output.result
        km.create_dependent_modules(csv)
        pr = Print(proc=self.terse, scheduler=s)
        pr.input[0] = km.output.result
        e = Every(proc=self.terse, scheduler=s)
        e.input[0] = km.output.labels
    aio.run(s.start())
    labels = km.labels()
    assert labels is not None
    self.assertEqual(len(csv.table), len(labels))
def test_load_csv(self):
    """
    Connecting modules via function calls

    Builds load_csv -> min / max with ``pv`` helper functions, echoes the
    results and the max module's "_trace" output, then checks every column's
    min and max against the last row of the corresponding module table.
    """
    loader = pv.load_csv(get_dataset('bigfile'), index_col=False, header=None)
    min_mod = pv.min(loader)
    pv.echo(min_mod, proc=prtm)
    max_mod = pv.max(loader)
    pv.echo(max_mod, proc=prtM)
    max_trace = max_mod["_trace"]
    pv.echo(max_trace, proc=prtT)
    self.assertEqual(loader.scheduler(), loader.module.scheduler())
    loader.scheduler().start()
    loader.scheduler().join()
    table = loader.table
    last_min = min_mod.table.last()
    last_max = max_mod.table.last()
    self.assertEqual(len(table), 1000000)
    for name in table.columns:
        #print('testing column %s'%col)
        column = table[name]
        self.assertEqual(column.min(), last_min[name])
        self.assertEqual(column.max(), last_max[name])
def test_piped_load_csv2(self):
    """
    Connecting modules via the pipe operator (only one pipe)

    The modules are fetched back from the pipeline by name ('min_1',
    'max_1', 'csv_loader_1'); every column's min and max is then checked
    against the last row of the corresponding module table.
    """
    pipeline = (PipedInput(get_dataset('bigfile'))
                | pv.load_csv(index_col=False, header=None)
                | pv.min()
                | pv.echo(proc=prtm).repipe('csv_loader_1')
                | pv.max()
                | pv.echo(proc=prtM).repipe('max_1', out='_trace')
                | pv.echo(proc=prtT))
    min_mod = pipeline.fetch('min_1')
    max_mod = pipeline.fetch('max_1')
    loader = pipeline.fetch('csv_loader_1')
    self.assertEqual(loader.scheduler(), loader.module.scheduler())
    loader.scheduler().start()
    loader.scheduler().join()
    table = loader.table
    last_min = min_mod.table.last()
    last_max = max_mod.table.last()
    self.assertEqual(len(table), 1000000)
    for name in table.columns:
        #print('testing column %s'%col)
        column = table[name]
        self.assertEqual(column.min(), last_min[name])
        self.assertEqual(column.max(), last_max[name])
def test_read_csv(self):
    """Load "bigfile" with CSVLoader and expect 1000000 rows.

    The loader's table must be None before the scheduler runs.
    """
    sched = self.scheduler()
    loader = CSVLoader(get_dataset('bigfile'), index_col=False, header=None,
                       scheduler=sched)
    self.assertTrue(loader.table() is None)
    sched.start()
    sched.join()
    self.assertEqual(len(loader.table()), 1000000)
def test_scheduler(self):
    """Add modules to a running MTScheduler and check the run order.

    A second Sample is attached while the scheduler runs, and a Min/Print
    pair is added through ``add_oneshot_tick_proc``. Both Sample modules
    must come after the CSV loader in ``_runorder``.
    """
    sched = MTScheduler()
    loader = CSVLoader(get_dataset('bigfile'), index_col=False, header=None,
                       scheduler=sched)
    first_sample = Sample(n=10, scheduler=sched)
    first_sample.input.df = loader.output.df
    loader.scheduler().start()
    sleep(1)
    self.assertTrue(loader.scheduler().is_running())
    second_sample = Sample(n=15, scheduler=sched)
    second_sample.input.df = loader.output.df

    def add_min():
        minimum = Min(scheduler=sched)
        # Of course, sleeping here is a bad idea. this is to illustrate
        # that add_min will be executed atomically by the scheduler.
        # using a sleep outside of add_oneshot_tick_proc would lead to an inconsistent
        # state.
        #sleep(1)
        minimum.input.df = second_sample.output.df
        printer = Print(scheduler=sched)
        printer.input.df = minimum.output.df

    sched.add_oneshot_tick_proc(add_min)
    sleep(1)
    run_order = sched._runorder
    self.assertTrue(run_order.index(first_sample.id) > run_order.index(loader.id))
    self.assertTrue(run_order.index(second_sample.id) > run_order.index(loader.id))
    #self.assertTrue(s._runorder.index(m.id) > s._runorder.index(smp2.id))
    sched.stop()
    sched.join()
def t_histogram1d_impl(self, **kw: Any) -> None:
    """Check Histogram1D on column "_2" downstream of a Stirrer.

    Args:
        **kw: extra keyword arguments forwarded to ``Stirrer``.

    The reference histogram is computed with ``np.histogram`` over the
    "_2" column of the Stirrer's table, using the module's bin count and
    the min/max bounds of its last row.
    """
    sched = self.scheduler()
    loader = CSVLoader(
        get_dataset("bigfile"), index_col=False, header=None, scheduler=sched
    )
    stirrer = Stirrer(
        update_column="_2", fixed_step_size=1000, scheduler=sched, **kw
    )
    stirrer.input[0] = loader.output.result
    lower = Min(scheduler=sched)
    lower.input[0] = stirrer.output.result
    upper = Max(scheduler=sched)
    upper.input[0] = stirrer.output.result
    # columns are called 1..30
    histo = Histogram1D("_2", scheduler=sched)
    histo.input[0] = stirrer.output.result
    histo.input.min = lower.output.result
    histo.input.max = upper.output.result
    # pr = Print(scheduler=s)
    printer = Every(proc=self.terse, scheduler=sched)
    printer.input[0] = histo.output.result
    aio.run(sched.start())
    # The trace is requested but its content is not checked.
    histo.trace_stats()
    last_row = notNone(histo.table.last()).to_dict()
    progressive_counts = last_row["array"]
    value_range = (last_row["min"], last_row["max"])
    column_table = stirrer.table.loc[:, ["_2"]]
    assert column_table is not None
    values = column_table.to_array().reshape(-1)
    expected_counts, _edges = np.histogram(  # type: ignore
        values, bins=histo.params.bins, density=False, range=value_range
    )
    self.assertEqual(np.sum(progressive_counts), np.sum(expected_counts))
    self.assertListEqual(progressive_counts.tolist(), expected_counts.tolist())
def test_sample(self):
    """Run CSVLoader -> Sample(n=10) -> Print on "bigfile".

    Smoke test: it starts the scheduler and makes no assertions.
    """
    sched = Scheduler()
    loader = CSVLoader(get_dataset('bigfile'), index_col=False, header=None,
                       scheduler=sched)
    sampler = Sample(n=10, scheduler=sched)
    sampler.input.df = loader.output.df
    printer = Print(scheduler=sched)
    printer.input.df = sampler.output.df
    loader.scheduler().start()
def test_query_simple(self):
    """Pass "bigfile" through a Select with no query; expect 1000000 rows."""
    sched = Scheduler()
    loader = CSVLoader(get_dataset('bigfile'), index_col=False, header=None,
                       force_valid_ids=True, scheduler=sched)
    select = Select(scheduler=sched)
    select.input.df = loader.output.df
    length_printer = Every(constant_time=True, scheduler=sched)
    length_printer.input.df = select.output.df
    sched.start()
    self.assertEqual(len(select.df()), 1000000)
def load_csv(self):
    """Load "smallfile" with CSVLoader and compare it with ``pd.read_csv``.

    The table must hold 30000 rows and each of its columns must equal the
    same-numbered column of the pandas frame.
    """
    loader = CSVLoader(filepath_or_buffer=get_dataset('smallfile'),
                       force_valid_ids=True,
                       index_col=False,
                       header=None,
                       scheduler=self.scheduler)
    self.assertTrue(loader.table() is None)
    self.scheduler.start()
    self.scheduler.join()
    table = loader.table()
    self.assertFalse(table is None)
    self.assertEqual(len(table), 30000)
    frame = pd.read_csv(filepath_or_buffer=get_dataset('smallfile'),
                        index_col=False, header=None)
    for index in range(table.ncol):
        expected_column = frame[index]
        actual_column = table[index]
        self.assertTrue(np.all(expected_column == actual_column.values))
def test_MDS_csv(self):
    """Run CSVLoader -> PairwiseDistances (euclidean) on "smallfile".

    Smoke test: the module-level ``times`` counter is reset to 0 and the
    scheduler is started with ``ten_times``; no assertions are made.
    """
    global times
    sched = Scheduler()
    loader = CSVLoader(get_dataset('smallfile'), index_col=False, header=None,
                       scheduler=sched)
    distances = PairwiseDistances(metric='euclidean', scheduler=sched)
    distances.input.df = loader.output.df
    length_printer = Every(proc=print_len, constant_time=True, scheduler=sched)
    length_printer.input.df = distances.output.dist
    times = 0
    sched.start(ten_times)
def test_scatterplot(self):
    """Build a ScatterPlot on columns '_1'/'_2' of "bigfile".

    The scheduler is started with ``idle_proc`` as second argument; the
    loaded frame must contain 1000000 rows.
    """
    s = Scheduler()
    csv = CSVLoader(get_dataset('bigfile'), index_col=False, header=None,
                    force_valid_ids=True, scheduler=s)
    sp = ScatterPlot(x_column='_1', y_column='_2', scheduler=s)
    sp.create_dependent_modules(csv, 'df')
    cnt = Every(proc=print_len, constant_time=True, scheduler=s)
    cnt.input.df = csv.output.df
    prt = Print(scheduler=s)
    prt.input.df = sp.histogram2d.output.df
    csv.scheduler().start(None, idle_proc)
    # assertEqual replaces the deprecated alias assertEquals.
    self.assertEqual(len(csv.df()), 1000000)
def test_query(self):
    """Filter "bigfile" with the query '_1 < 0.5' through Select.

    The query is supplied by a Constant module; the selection must hold
    fewer than 1000000 rows.
    """
    sched = Scheduler()
    loader = CSVLoader(get_dataset('bigfile'), index_col=False, header=None,
                       force_valid_ids=True, scheduler=sched)
    query_frame = pd.DataFrame({'query': ['_1 < 0.5']})
    query_module = Constant(query_frame, scheduler=sched)
    select = Select(scheduler=sched)
    select.input.df = loader.output.df
    select.input.query = query_module.output.df
    length_printer = Every(constant_time=True, scheduler=sched)
    length_printer.input.df = select.output.df
    sched.start()
    self.assertTrue(len(select.df()) < 1000000)
def test_percentile(self):
    """Run Percentiles on column '_1' of "smallfile".

    Smoke test: the percentiles output is sent to an Every module and the
    scheduler is run to completion; no assertions are made.
    """
    sched = self.scheduler()
    loader = CSVLoader(get_dataset('smallfile'), index_col=False, header=None,
                       scheduler=sched)
    wanted = [0.1, 0.25, 0.5, 0.75, 0.9]
    percentiles = Percentiles('_1',
                              name='test_percentile',
                              percentiles=wanted,
                              scheduler=sched)
    percentiles.input.table = loader.output.table
    printer = Every(proc=self.terse, name='print', scheduler=sched)
    printer.input.df = percentiles.output.percentiles
    sched.start()
    sched.join()
def test_csv_distances(self):
    """Load "smallfile" and feed it to an Every module.

    The pairwise-distance part is commented out, so this is a smoke test:
    ``times`` is reset to 0, the scheduler is started with ``ten_times``
    and joined, and the loader's table is fetched without being checked.
    """
    global times
    sched = self.scheduler()
    loader = CSVLoader(get_dataset('smallfile'), index_col=False, header=None,
                       scheduler=sched)
    # dis=PairwiseDistances(metric='euclidean',scheduler=s)
    # dis.input.df = vec.output.df
    counter = Every(proc=self.terse, constant_time=True, scheduler=sched)
    # cnt.input.df = dis.output.dist
    counter.input.df = loader.output.table
    times = 0
    sched.start(ten_times)
    sched.join()
    # The result is fetched but not inspected.
    loader.table()
def test_mb_k_means(self):
    """Cluster the "cluster:s3" dataset with MBKMeans (3 clusters).

    After the run, the number of labels must equal the number of rows
    loaded by the CSV loader.
    """
    #log_level()
    s = Scheduler()
    n_clusters = 3
    csv = CSVLoader(get_dataset('cluster:s3'), sep=' ', skipinitialspace=True,
                    header=None, index_col=False, scheduler=s)
    km = MBKMeans(n_clusters=n_clusters, random_state=42, is_input=False,
                  scheduler=s)
    km.input.df = csv.output.df
    pr = Print(scheduler=s)
    pr.input.df = km.output.df
    e = Every(scheduler=s)
    e.input.df = km.output.labels
    s.start()
    # assertEqual replaces the deprecated alias assertEquals.
    self.assertEqual(len(csv.df()), len(km.labels()))
def NOtest_vec_distances(self):
    """Compare PairwiseDistances (cosine) on "warlogs" with ``pairwise_distances``.

    NOTE(review): the ``NOtest_`` prefix presumably keeps the test runner
    from collecting this method -- confirm.
    """
    s = Scheduler()
    vec = VECLoader(get_dataset('warlogs'), scheduler=s)
    dis = PairwiseDistances(metric='cosine', scheduler=s)
    dis.input.df = vec.output.df
    dis.input.array = vec.output.array
    cnt = Every(proc=print_len, constant_time=True, scheduler=s)
    cnt.input.df = dis.output.dist
    global times
    times = 0
    s.start()
    df = vec.df()
    computed = dis.dist()
    # assertEqual replaces the deprecated alias assertEquals.
    self.assertEqual(computed.shape[0], len(df))
    truth = pairwise_distances(vec.toarray(), metric=dis._metric)
    self.assertTrue(np.allclose(truth, computed))
def test_percentile(self):
    """Run Percentiles on column 1 of "smallfile" and print its trace.

    Modules are wired with ``connect``; no assertions are made.
    """
    s = Scheduler()
    csv_module = CSVLoader(get_dataset('smallfile'), index_col=False,
                           header=None, scheduler=s)
    module = Percentiles(1, id='test_percentile',
                         percentiles=[0.1, 0.25, 0.5, 0.75, 0.9],
                         scheduler=s)
    module.describe()
    csv_module.describe()
    connect(csv_module, 'df', module, 'df')
    connect(module, 'percentiles', Print(id='print', scheduler=s), 'df')
    s.start()
    ret = module.trace_stats(max_runs=1)
    pd.set_option('display.expand_frame_repr', False)
    # print() with a single argument is valid on Python 2 and 3.
    print(ret)
def test_stats(self):
    """Run Stats on column 1 of "smallfile" and print its trace.

    A Wait module (delay=3) is fed by the loader and connected to the
    Stats module's ``_params`` input; no assertions are made.
    """
    s = Scheduler()
    csv_module = CSVLoader(get_dataset('smallfile'), index_col=False,
                           header=None, scheduler=s)
    stats = Stats(1, id='test_stats', scheduler=s)
    wait = Wait(id='wait', delay=3, scheduler=s)
    wait.input.df = csv_module.output.df
    stats.input._params = wait.output.df
    stats.input.df = csv_module.output.df
    pr = Print(id='print', scheduler=s)
    pr.input.df = stats.output.stats
    s.start()
    # Separate name for the trace, so the scheduler variable is not rebound.
    trace = stats.trace_stats(max_runs=1)
    pd.set_option('display.expand_frame_repr', False)
    # print() with a single argument is valid on Python 2 and 3.
    print(trace)
def test_histogram1d(self):
    """Run CSV -> Min/Max -> Histogram1D on "bigfile" and print the trace.

    The tick procedure stops the scheduler once the CSV loader reports
    ``is_terminated()``; no assertions are made.
    """
    s = Scheduler()
    csv = CSVLoader(get_dataset("bigfile"), index_col=False, header=None,
                    scheduler=s)
    # Trailing underscores keep the builtins min/max usable in this scope.
    min_ = Min(scheduler=s)
    min_.input.df = csv.output.df
    max_ = Max(scheduler=s)
    max_.input.df = csv.output.df
    histogram1d = Histogram1D(2, scheduler=s)  # columns are called 1..30
    histogram1d.input.df = csv.output.df
    histogram1d.input.min = min_.output.df
    histogram1d.input.max = max_.output.df
    pr = Every(scheduler=s)
    pr.input.df = csv.output.df
    s.start(tick_proc=lambda s, r: csv.is_terminated() and s.stop())
    # Separate name for the trace, so the scheduler variable is not rebound.
    trace = histogram1d.trace_stats()
    pd.set_option("display.expand_frame_repr", False)
    # print() with a single argument is valid on Python 2 and 3.
    print(trace)
def test_csv_distances(self):
    """Compare PairwiseDistances (euclidean) on "smallfile" with ``pairwise_distances``.

    Only the block covering rows [0, 5000) is compared, after the update
    column has been removed from the loaded frame.
    """
    global times
    sched = Scheduler()
    loader = CSVLoader(get_dataset('smallfile'), index_col=False, header=None,
                       scheduler=sched)
    distances = PairwiseDistances(metric='euclidean', scheduler=sched)
    distances.input.df = loader.output.df
    length_printer = Every(proc=print_len, constant_time=True, scheduler=sched)
    length_printer.input.df = distances.output.dist
    times = 0
    sched.start(ten_times)
    frame = loader.df()
    computed = distances.dist()
    #self.assertEquals(computed.shape[0], len(df))
    del frame[CSVLoader.UPDATE_COLUMN]
    start = 0
    stop = start + 5000
    window = slice(start, stop)
    expected = pairwise_distances(frame.iloc[window], metric=distances._metric)
    actual = computed[window, window]
    # reduce tolerance
    self.assertTrue(np.allclose(expected, actual, atol=1e-7))
def test_histogram2d(self):
    """Run CSV -> Min/Max -> Histogram2D -> Heatmap on "bigfile" and print the trace.

    Smoke test: no assertions are made.
    """
    s = Scheduler()
    csv = CSVLoader(get_dataset('bigfile'), index_col=False, header=None,
                    scheduler=s)
    # Trailing underscores keep the builtins min/max usable in this scope.
    min_ = Min(scheduler=s)
    min_.input.df = csv.output.df
    max_ = Max(scheduler=s)
    max_.input.df = csv.output.df
    # columns are called 1..30
    histogram2d = Histogram2D(1, 2, xbins=100, ybins=100, scheduler=s)
    histogram2d.input.df = csv.output.df
    histogram2d.input.min = min_.output.df
    histogram2d.input.max = max_.output.df
    heatmap = Heatmap(filename='histo_%03d.png', scheduler=s)
    heatmap.input.array = histogram2d.output.df
    pr = Every(scheduler=s)
    pr.input.df = csv.output.df
    csv.scheduler().start()
    # Separate name for the trace, so the scheduler variable is not rebound.
    trace = histogram2d.trace_stats()
    pd.set_option('display.expand_frame_repr', False)
    # print() with a single argument is valid on Python 2 and 3.
    print(trace)
from progressivis import *
from progressivis.vis import ScatterPlot
from progressivis.io import CSVLoader
from progressivis.datasets import get_dataset


def filter(df):
    """Return the rows of ``df`` whose 'pickup_longitude' lies strictly between -80 and -70."""
    lon = df['pickup_longitude']
    return df[(lon < -70) & (lon > -80)]


def print_len(x):
    """Print ``len(x)`` unless ``x`` is None."""
    if x is not None:
        # print() with a single argument is valid on Python 2 and 3.
        print(len(x))


#log_level()

# Reuse a ``scheduler`` already present in the namespace, if any; otherwise
# create one. Only NameError is expected from the lookup, so nothing else is
# swallowed.
try:
    s = scheduler
except NameError:
    s = Scheduler()

csv = CSVLoader(get_dataset('bigfile'), header=None, index_col=False,
                force_valid_ids=True, scheduler=s)
pr = Every(scheduler=s)
pr.input.df = csv.output.df
scatterplot = ScatterPlot('_1', '_2', scheduler=s)
scatterplot.create_dependent_modules(csv, 'df')

if __name__ == '__main__':
    csv.start()
    s.join()
    print(len(csv.df()))
from progressivis import Scheduler, Every, Print
from progressivis.io import CSVLoader
from progressivis.stats import Histogram2D, Min, Max
from progressivis.datasets import get_dataset

# Builds a CSV -> Min/Max -> Histogram2D -> Print pipeline without passing a
# scheduler to any module; run as a script, it starts the CSV loader.

# print() with one pre-formatted string is valid on Python 2 and 3.
print("Loading test_histogram2d")
print("Type of default_scheduler is %s" % type(Scheduler.default))

csv = CSVLoader(get_dataset("bigfile"), index_col=False, header=None, engine="c")
pr = Every()
pr.input.df = csv.output.df
# NOTE(review): these module-level names shadow the builtins min/max; they
# are kept because other code may look the modules up under these names.
min = Min()
min.input.df = csv.output.df
max = Max()
max.input.df = csv.output.df
histogram2d = Histogram2D(1, 2, xbins=128, ybins=128)
histogram2d.input.df = csv.output.df
histogram2d.input.min = min.output.df
histogram2d.input.max = max.output.df
# ``pr`` is rebound here: the Every module above stays connected, and the
# name now refers to the Print module.
pr = Print(id="print")
pr.input.df = histogram2d.output.df

if __name__ == "__main__":
    csv.start()
https://cs.joensuu.fi/sipu/datasets/ """ from progressivis import Scheduler, Every#, log_level from progressivis.cluster import MBKMeans from progressivis.io import CSVLoader from progressivis.stats import Min, Max, Histogram2D from progressivis.vis import Heatmap, ScatterPlot from progressivis.datasets import get_dataset try: s = scheduler except NameError: s = Scheduler() #log_level(package="progressivis.cluster") data = CSVLoader(get_dataset('cluster:s3'),sep=' ',skipinitialspace=True,header=None,index_col=False,scheduler=s) mbkmeans = MBKMeans(columns=[0, 1], n_clusters=15, batch_size=100, scheduler=s) mbkmeans.input.df = data.output.df prn = Every(scheduler=s) prn.input.df = mbkmeans.output.df sp = ScatterPlot(0,1, scheduler=s) sp.move_point = mbkmeans # for input management #sp.create_dependent_modules(mbkmeans,'centroids') # Create modules by hand rather than with the utility. # We show the cluster centroids on the scatterplot and the # data as a heatmap # histogram2d histogram2d = Histogram2D(0, 1, scheduler=s) histogram2d.input.df = data.output.df min_mod = Min([0,1], scheduler=s)
def test_read_csv(self):
    """Load "bigfile" with CSVLoader and expect 1000000 rows.

    The loader's frame must be None before the scheduler runs.
    """
    sched = Scheduler()
    loader = CSVLoader(get_dataset('bigfile'), index_col=False, header=None,
                       scheduler=sched)
    self.assertTrue(loader.df() is None)
    sched.start()
    self.assertEqual(len(loader.df()), 1000000)