Example #1
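 # Load two copies of the 'smallfile' dataset through a Constant module feeding
 # CSVLoader's filenames slot; two 30,000-row files yield 60,000 rows in total.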
 def test_read_multiple_csv(self):
     s = Scheduler()
     filenames = pd.DataFrame({'filename': [get_dataset('smallfile'), get_dataset('smallfile')]})
     cst = Constant(df=filenames, scheduler=s)
     csv = CSVLoader(index_col=False, header=None, scheduler=s)
     csv.input.filenames = cst.output.df
     csv.start()
     self.assertEqual(len(csv.df()), 60000)
Example #2
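 # Scalability probe: stream a synthetic 30-column CSV of self.current_step * GIGA
 # bytes, generated in memory by RandomBytesIO.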
 def p10s_read_csv(self):
     s = Scheduler()
     module = CSVLoader(RandomBytesIO(cols=30,
                                      size=self.current_step * GIGA),
                        index_col=False,
                        header=None,
                        scheduler=s)
     module.start()
Example #3
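 # Same multi-file load as Example #1, ported to the Table-based API; the run is
 # awaited with s.join() before the row count is checked.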
 def test_read_multiple_csv(self):
     s = self.scheduler()
     filenames = Table(name='file_names',
                       dshape='{filename: string}',
                       data={'filename': [get_dataset('smallfile'), get_dataset('smallfile')]})
     cst = Constant(table=filenames, scheduler=s)
     csv = CSVLoader(index_col=False, header=None, scheduler=s)
     csv.input.filenames = cst.output.table
     csv.start()
     s.join()
     self.assertEqual(len(csv.table()), 60000)
Example #4
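 # Load two fake 'buffer://' sources, each generating a 10-column, 30,000-row CSV
 # in memory rather than reading from disk.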
 def test_read_multiple_fake_csv(self):
     s = self.scheduler()
     filenames = Table(name='file_names',
                       dshape='{filename: string}',
                       data={'filename': [
                           'buffer://fake1?cols=10&rows=30000',
                           'buffer://fake2?cols=10&rows=30000']})
     cst = Constant(table=filenames, scheduler=s)
     csv = CSVLoader(index_col=False, header=None, scheduler=s)
     csv.input.filenames = cst.output.table
     csv.start()
     s.join()
     self.assertEqual(len(csv.table()), 60000)
Example #5
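 # Crash-recovery test: two bz2-compressed copies of 'smallfile' are fetched from
 # a throttled local HTTP server; the very short timeout exercises the loader's
 # recovery path, and all 60,000 rows must still arrive.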
 def test_06_read_multiple_csv_bz2_crash_recovery(self) -> None:
     p = Process(target=run_throttled_server, args=(8000, 10**6))
     p.start()
     self._http_proc = p
     time.sleep(SLEEP)
     s = self.scheduler()
     filenames = Table(
         name="file_names",
         dshape="{filename: string}",
         data={
             "filename": [
                 make_url("smallfile", ext=BZ2),
                 make_url("smallfile", ext=BZ2),
             ]
         },
     )
     cst = Constant(table=filenames, scheduler=s)
     csv = CSVLoader(index_col=False,
                     header=None,
                     scheduler=s,
                     timeout=0.01)
     csv.input.filenames = cst.output.result
     sink = Sink(name="sink", scheduler=s)
     sink.input.inp = csv.output.result
     aio.run(csv.start())
     _close(csv)
     self.assertEqual(len(csv.table), 60000)
Example #6
 def test_03_read_multiple_csv_crash_recovery(self):
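     # Same crash-recovery scenario as above, written against the older
     # Table-based API and uncompressed files.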
     # if TRAVIS: return
     p = Process(target=run_throttled_server, args=(8000, 10**6))
     p.start()
     self._http_proc = p
     time.sleep(SLEEP)
     s = self.scheduler()
     filenames = Table(name='file_names',
                       dshape='{filename: string}',
                       data={'filename': [make_url('smallfile'), make_url('smallfile')]})
     cst = Constant(table=filenames, scheduler=s)
     csv = CSVLoader(index_col=False, header=None, scheduler=s, timeout=0.01)
     csv.input.filenames = cst.output.table
     csv.start()
     s.join()
     _close(csv)
     self.assertEqual(len(csv.table()), 60000)
Example #7
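 # Current-API version of the multi-file load: results flow into a Sink module
 # and the scheduler is driven with aio.run().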
 def test_read_multiple_csv(self) -> None:
     s = self.scheduler()
     filenames = Table(
         name="file_names",
         dshape="{filename: string}",
         data={
             "filename":
             [get_dataset("smallfile"),
              get_dataset("smallfile")]
         },
     )
     cst = Constant(table=filenames, scheduler=s)
     csv = CSVLoader(index_col=False, header=None, scheduler=s)
     csv.input.filenames = cst.output.result
     sink = Sink(name="sink", scheduler=s)
     sink.input.inp = csv.output.result
     aio.run(csv.start())
     self.assertEqual(len(csv.table), 60000)
Example #8
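 # Current-API version of the fake 'buffer://' load from Example #4.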
 def test_read_multiple_fake_csv(self) -> None:
     s = self.scheduler()
     filenames = Table(
         name="file_names2",
         dshape="{filename: string}",
         data={
             "filename": [
                 "buffer://fake1?cols=10&rows=30000",
                 "buffer://fake2?cols=10&rows=30000",
             ]
         },
     )
     cst = Constant(table=filenames, scheduler=s)
     csv = CSVLoader(index_col=False, header=None, scheduler=s)
     csv.input.filenames = cst.output.result
     sink = Sink(name="sink", scheduler=s)
     sink.input.inp = csv.output.result
     aio.run(csv.start())
     self.assertEqual(len(csv.table), 60000)
Example #9
"""
Clustering datasets may be found at
https://cs.joensuu.fi/sipu/datasets/
"""
from progressivis import Scheduler, Every  # , log_level
from progressivis.cluster import MBKMeans
from progressivis.io import CSVLoader
from progressivis.vis import ScatterPlot
from progressivis.datasets import get_dataset

try:
    s = scheduler
except NameError:
    s = Scheduler()
    #log_level(package="progressivis.cluster")

data = CSVLoader(get_dataset('cluster:s1'), sep='\\s+', skipinitialspace=True,
                 header=None, index_col=False, scheduler=s)
mbkmeans = MBKMeans(columns=['_0', '_1'], n_clusters=15, batch_size=100,
                    is_input=False, scheduler=s)
mbkmeans.input.table = data.output.table
prn = Every(scheduler=s)
prn.input.df = mbkmeans.output.table
sp = ScatterPlot('_0', '_1', scheduler=s)

sp.move_point = mbkmeans # for input management
sp.create_dependent_modules(data, 'table', sample=None, select=mbkmeans)

if __name__ == '__main__':
    data.start()
    s.join()
Example #10
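# Taxi-data example: a longitude filter for pickups in the (-80, -70) range,
# followed by a CSVLoader -> Every -> ScatterPlot pipeline over 'bigfile'.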
from progressivis import Scheduler, Every
from progressivis.io import CSVLoader
from progressivis.vis import ScatterPlot
from progressivis.datasets import get_dataset


def filter_(df):  # the snippet's original header is truncated; 'filter_' is a stand-in name
    lon = df['pickup_longitude']
    return df[(lon < -70) & (lon > -80)]


def print_len(x):
    if x is not None:
        print(len(x))


#log_level()

try:
    s = scheduler
except NameError:
    s = Scheduler()

csv = CSVLoader(get_dataset('bigfile'),
                header=None,
                index_col=False,
                force_valid_ids=True,
                scheduler=s)
pr = Every(scheduler=s)
pr.input.df = csv.output.table
scatterplot = ScatterPlot(x_column='_1', y_column='_2', scheduler=s)
scatterplot.create_dependent_modules(csv, 'table')

if __name__ == '__main__':
    csv.start()
    s.join()
    print(len(csv.table()))
Example #11
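# Older-API pipeline: CSVLoader feeds Min, Max and a 128x128 Histogram2D whose
# output is printed by a Print module.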
from progressivis import Scheduler, Every, Print
from progressivis.io import CSVLoader
from progressivis.stats import Histogram2D, Min, Max
from progressivis.datasets import get_dataset

print "Loading test_histogram2d"
print "Type of default_scheduler is %s" % type(Scheduler.default)

csv = CSVLoader(get_dataset("bigfile"), index_col=False, header=None, engine="c")
pr = Every()
pr.input.df = csv.output.df
min_mod = Min()
min_mod.input.df = csv.output.df
max_mod = Max()
max_mod.input.df = csv.output.df
histogram2d = Histogram2D(1, 2, xbins=128, ybins=128)
histogram2d.input.df = csv.output.df
histogram2d.input.min = min_mod.output.df
histogram2d.input.max = max_mod.output.df
pr = Print(id="print")
pr.input.df = histogram2d.output.df

if __name__ == "__main__":
    csv.start()
Example #12
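# Hand-built variant of the clustering pipeline of Example #9: Histogram2D,
# Min/Max and a Heatmap are wired manually instead of using the ScatterPlot
# utility; the 'data' loader and scheduler 's' are defined above this excerpt.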
mbkmeans = MBKMeans(columns=[0, 1], n_clusters=15, batch_size=100, scheduler=s)
mbkmeans.input.df = data.output.df
prn = Every(scheduler=s)
prn.input.df = mbkmeans.output.df
sp = ScatterPlot(0, 1, scheduler=s)
sp.move_point = mbkmeans # for input management
#sp.create_dependent_modules(mbkmeans,'centroids')
# Create modules by hand rather than with the utility.
# We show the cluster centroids on the scatterplot and the
# data as a heatmap

# histogram2d
histogram2d = Histogram2D(0, 1, scheduler=s)
histogram2d.input.df = data.output.df
min_mod = Min([0, 1], scheduler=s)
max_mod = Max([0, 1], scheduler=s)
min_mod.input.df = data.output.df
max_mod.input.df = data.output.df
histogram2d.input.min = min_mod.output.df
histogram2d.input.max = max_mod.output.df
# heatmap
heatmap = Heatmap(filename='heatmap%d.png', history=100, scheduler=s)
heatmap.input.array = histogram2d.output.df
# scatterplot
sp.input.heatmap = heatmap.output.heatmap
sp.input.df = mbkmeans.output.df

if __name__ == '__main__':
    data.start()
    s.join()
Example #13
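 # Minimal pattern: load a single CSV source f on a fresh Scheduler.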
 def p10s_read_csv(f):
     s = Scheduler()
     module = CSVLoader(f, index_col=False, header=None, scheduler=s)
     module.start()