def test_read_multiple_csv(self):
    """Feed the same CSV file twice through a filename DataFrame; expect 60000 rows total."""
    scheduler = Scheduler()
    # Two copies of the dataset, so the loaded row count doubles.
    name_df = pd.DataFrame(
        {'filename': [get_dataset('smallfile'), get_dataset('smallfile')]})
    const = Constant(df=name_df, scheduler=scheduler)
    loader = CSVLoader(index_col=False, header=None, scheduler=scheduler)
    loader.input.filenames = const.output.df
    loader.start()
    self.assertEqual(len(loader.df()), 60000)
def p10s_read_csv(self):
    """Benchmark step: load a synthetic in-memory CSV whose size scales with the current step."""
    scheduler = Scheduler()
    source = RandomBytesIO(cols=30, size=self.current_step * GIGA)
    loader = CSVLoader(source, index_col=False, header=None, scheduler=scheduler)
    loader.start()
def test_read_multiple_csv(self):
    """Load the same small CSV twice via a filename Table; expect 60000 rows in total."""
    scheduler = self.scheduler()
    name_table = Table(name='file_names',
                       dshape='{filename: string}',
                       data={'filename': [get_dataset('smallfile'),
                                          get_dataset('smallfile')]})
    const = Constant(table=name_table, scheduler=scheduler)
    loader = CSVLoader(index_col=False, header=None, scheduler=scheduler)
    loader.input.filenames = const.output.table
    loader.start()
    scheduler.join()
    self.assertEqual(len(loader.table()), 60000)
def test_read_multiple_fake_csv(self):
    """Load two synthetic buffer:// CSV sources of 30000 rows each; expect 60000 rows."""
    scheduler = self.scheduler()
    name_table = Table(name='file_names',
                       dshape='{filename: string}',
                       data={'filename': [
                           'buffer://fake1?cols=10&rows=30000',
                           'buffer://fake2?cols=10&rows=30000']})
    const = Constant(table=name_table, scheduler=scheduler)
    loader = CSVLoader(index_col=False, header=None, scheduler=scheduler)
    loader.input.filenames = const.output.table
    loader.start()
    scheduler.join()
    self.assertEqual(len(loader.table()), 60000)
def test_06_read_multiple_csv_bz2_crash_recovery(self) -> None:
    """Load two bz2 CSVs from a throttled HTTP server; a tiny timeout exercises crash recovery."""
    # The throttled server deliberately slows delivery so the 0.01s loader
    # timeout fires and the recovery path is taken.
    server = Process(target=run_throttled_server, args=(8000, 10**6))
    server.start()
    self._http_proc = server
    time.sleep(SLEEP)  # give the server time to start listening
    scheduler = self.scheduler()
    name_table = Table(
        name="file_names",
        dshape="{filename: string}",
        data={
            "filename": [
                make_url("smallfile", ext=BZ2),
                make_url("smallfile", ext=BZ2),
            ]
        },
    )
    const = Constant(table=name_table, scheduler=scheduler)
    loader = CSVLoader(
        index_col=False, header=None, scheduler=scheduler, timeout=0.01
    )
    loader.input.filenames = const.output.result
    sink = Sink(name="sink", scheduler=scheduler)
    sink.input.inp = loader.output.result
    aio.run(loader.start())
    _close(loader)
    self.assertEqual(len(loader.table), 60000)
def test_03_read_multiple_csv_crash_recovery(self):
    """Read two CSVs from a throttled HTTP server; the short timeout forces recovery."""
    server = Process(target=run_throttled_server, args=(8000, 10**6))
    server.start()
    self._http_proc = server
    time.sleep(SLEEP)  # let the server come up before loading
    scheduler = self.scheduler()
    name_table = Table(name='file_names',
                       dshape='{filename: string}',
                       data={'filename': [make_url('smallfile'),
                                          make_url('smallfile')]})
    const = Constant(table=name_table, scheduler=scheduler)
    loader = CSVLoader(index_col=False, header=None,
                       scheduler=scheduler, timeout=0.01)
    loader.input.filenames = const.output.table
    loader.start()
    scheduler.join()
    _close(loader)
    self.assertEqual(len(loader.table()), 60000)
def test_read_multiple_csv(self) -> None:
    """Load two copies of the small dataset through one CSVLoader; expect 60000 rows."""
    scheduler = self.scheduler()
    name_table = Table(
        name="file_names",
        dshape="{filename: string}",
        data={"filename": [get_dataset("smallfile"), get_dataset("smallfile")]},
    )
    const = Constant(table=name_table, scheduler=scheduler)
    loader = CSVLoader(index_col=False, header=None, scheduler=scheduler)
    loader.input.filenames = const.output.result
    # A sink keeps the loader's output slot pulled so the pipeline runs.
    sink = Sink(name="sink", scheduler=scheduler)
    sink.input.inp = loader.output.result
    aio.run(loader.start())
    self.assertEqual(len(loader.table), 60000)
def test_read_multiple_fake_csv(self) -> None:
    """Load two synthetic buffer:// CSV sources (30000 rows each); expect 60000 rows."""
    scheduler = self.scheduler()
    name_table = Table(
        name="file_names2",
        dshape="{filename: string}",
        data={
            "filename": [
                "buffer://fake1?cols=10&rows=30000",
                "buffer://fake2?cols=10&rows=30000",
            ]
        },
    )
    const = Constant(table=name_table, scheduler=scheduler)
    loader = CSVLoader(index_col=False, header=None, scheduler=scheduler)
    loader.input.filenames = const.output.result
    # A sink keeps the loader's output slot pulled so the pipeline runs.
    sink = Sink(name="sink", scheduler=scheduler)
    sink.input.inp = loader.output.result
    aio.run(loader.start())
    self.assertEqual(len(loader.table), 60000)
""" Clustering datasets may be found at https://cs.joensuu.fi/sipu/datasets/ """ from progressivis import Scheduler, Every#, log_level from progressivis.cluster import MBKMeans from progressivis.io import CSVLoader from progressivis.vis import ScatterPlot from progressivis.datasets import get_dataset try: s = scheduler except NameError: s = Scheduler() #log_level(package="progressivis.cluster") data = CSVLoader(get_dataset('cluster:s1'),sep='\\s+',skipinitialspace=True,header=None,index_col=False,scheduler=s) mbkmeans = MBKMeans(columns=['_0', '_1'], n_clusters=15, batch_size=100, is_input=False, scheduler=s) mbkmeans.input.table = data.output.table prn = Every(scheduler=s) prn.input.df = mbkmeans.output.table sp = ScatterPlot('_0','_1', scheduler=s) sp.move_point = mbkmeans # for input management sp.create_dependent_modules(data,'table', sample=None, select=mbkmeans) if __name__ == '__main__': data.start() s.join()
    # NOTE(review): these two statements are the tail of a filter function
    # whose ``def`` line lies above this chunk — it keeps only rows whose
    # pickup longitude is strictly between -80 and -70.
    l = df['pickup_longitude']
    return df[(l < -70) & (l > -80)]


def print_len(x):
    """Print ``len(x)`` unless *x* is None."""
    if x is not None:
        print(len(x))


# log_level()

# Reuse an ambient `scheduler` when one exists; otherwise create one.
try:
    s = scheduler
except:  # NOTE(review): bare except — should be ``except NameError:``
    s = Scheduler()

csv = CSVLoader(get_dataset('bigfile'), header=None, index_col=False,
                force_valid_ids=True, scheduler=s)

# Sink module that keeps pulling so the pipeline keeps running.
pr = Every(scheduler=s)
pr.input.df = csv.output.table

scatterplot = ScatterPlot(x_column='_1', y_column='_2', scheduler=s)
scatterplot.create_dependent_modules(csv, 'table')

if __name__ == '__main__':
    csv.start()
    s.join()
    print(len(csv.df()))
"""Demo: progressively compute a 128x128 2D histogram over columns 1 and 2 of a big CSV."""
from progressivis import Scheduler, Every, Print
from progressivis.io import CSVLoader
from progressivis.stats import Histogram2D, Min, Max
from progressivis.datasets import get_dataset

# Fix: the original used Python 2 print statements, a syntax error on
# Python 3 (which the rest of this codebase targets).
print("Loading test_histogram2d")
print("Type of default_scheduler is %s" % type(Scheduler.default))

csv = CSVLoader(get_dataset("bigfile"), index_col=False, header=None, engine="c")

# Sink module that keeps pulling the loader output.
pr = Every()
pr.input.df = csv.output.df

# Renamed from ``min``/``max`` so the builtins are not shadowed.
min_mod = Min()
min_mod.input.df = csv.output.df
max_mod = Max()
max_mod.input.df = csv.output.df

# Histogram of columns 1 and 2, bounded by the progressive min/max.
histogram2d = Histogram2D(1, 2, xbins=128, ybins=128)
histogram2d.input.df = csv.output.df
histogram2d.input.min = min_mod.output.df
histogram2d.input.max = max_mod.output.df

pr = Print(id="print")
pr.input.df = histogram2d.output.df

if __name__ == "__main__":
    csv.start()
# Mini-batch K-Means over the first two columns. NOTE(review): ``s`` (the
# scheduler) and ``data`` (the CSV loader) are defined earlier in this
# script, above this chunk.
mbkmeans = MBKMeans(columns=[0, 1], n_clusters=15, batch_size=100, scheduler=s)
mbkmeans.input.df = data.output.df

# Sink module that keeps pulling results so the pipeline keeps running.
prn = Every(scheduler=s)
prn.input.df = mbkmeans.output.df

sp = ScatterPlot(0, 1, scheduler=s)
sp.move_point = mbkmeans  # for input management
# sp.create_dependent_modules(mbkmeans,'centroids')

# Create modules by hand rather than with the utility.
# We show the cluster centroids on the scatterplot and the
# data as a heatmap

# histogram2d: bounded by the progressive min/max of the same two columns.
histogram2d = Histogram2D(0, 1, scheduler=s)
histogram2d.input.df = data.output.df
min_mod = Min([0, 1], scheduler=s)
max_mod = Max([0, 1], scheduler=s)
min_mod.input.df = data.output.df
max_mod.input.df = data.output.df
histogram2d.input.min = min_mod.output.df
histogram2d.input.max = max_mod.output.df

# heatmap: renders the histogram, keeping the last 100 frames.
heatmap = Heatmap(filename='heatmap%d.png', history=100, scheduler=s)
heatmap.input.array = histogram2d.output.df

# scatterplot: heatmap as background, centroids as points.
sp.input.heatmap = heatmap.output.heatmap
sp.input.df = mbkmeans.output.df

if __name__ == '__main__':
    data.start()
    s.join()
def p10s_read_csv(f):
    """Benchmark helper: progressively load the CSV source *f* to completion."""
    sched = Scheduler()
    loader = CSVLoader(f, index_col=False, header=None, scheduler=sched)
    loader.start()