def test_filter(self):
    s = Scheduler()
    # A 2-column random table feeds a row filter on column '_1'.
    random = RandomTable(2, rows=100000, scheduler=s)
    filter_ = FilterMod(expr='_1 > 0.5', scheduler=s)
    filter_.input.table = random.output.table
    pr = Print(proc=self.terse, scheduler=s)
    pr.input.df = filter_.output.table
    s.start()
    s.join()
    # The filter's selection must match the same predicate evaluated
    # directly on the input table.
    idx = filter_.get_input_slot('table').data().eval(
        '_1>0.5', result_object='index')
    self.assertEqual(filter_._table.selection, bitmap(idx))
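
For reference, the predicate '_1 > 0.5' selects exactly the rows a plain
NumPy cross-check would select; a minimal, progressivis-independent sketch
(the random column below is a stand-in for '_1'):

import numpy as np

rng = np.random.default_rng(42)
col_1 = rng.random(100_000)        # stand-in for column '_1' of RandomTable
idx = np.flatnonzero(col_1 > 0.5)  # positions where the predicate holds
assert np.all(col_1[idx] > 0.5)    # the same invariant the test asserts
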
Example #2
def test_dummy(self):
    s = Scheduler()
    random = RandomTable(2, rows=100000, scheduler=s)
    # DummyMod perturbs the stream: besides appends, it updates and
    # deletes rows, so downstream modules cannot rely on append-only input.
    dummy_ = DummyMod(update_column='_1',
                      delete_rows=5,
                      update_rows=5,
                      fixed_step_size=100,
                      scheduler=s)
    dummy_.input.table = random.output.table
    max_ = Max(name='max_' + str(hash(random)), scheduler=s)
    max_.input.table = dummy_.output.table
    pr = Print(proc=self.terse, scheduler=s)
    pr.input.df = max_.output.table
    s.start()
    s.join()
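
DummyMod makes this more than an append-only stream: for append-only data a
maximum can be maintained incrementally as an elementwise max over each new
chunk, while the deletions DummyMod injects force Max to do extra work, which
is what this test exercises. A sketch of the cheap append-only case,
independent of progressivis:

import numpy as np

running = np.full(2, -np.inf)        # one running max per column
for _ in range(10):                  # simulate arriving chunks
    chunk = np.random.rand(100, 2)
    running = np.maximum(running, chunk.max(axis=0))
print(running)                       # maxima over all rows seen so far
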
def test_dataflow(self):
    s = Scheduler()
    # Modules created inside the Dataflow block are registered with the
    # scheduler when the block exits.
    with Dataflow(s):
        csv = CSVLoader(get_dataset('bigfile'),
                        name="csv",
                        index_col=False,
                        header=None)
        m = Min()
        m.input.table = csv.output.table
        prt = Print(proc=self.terse)
        prt.input.df = m.output.table

    self.assertIs(s["csv"], csv)
    csv.scheduler().start()

    sleep(1)
    self.assertTrue(csv.scheduler().is_running())

    s.stop()
    s.join()
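
The value of the with-block is that the graph is built up and then installed
in one step when the block exits (which is why s["csv"] resolves afterwards),
so the scheduler never observes a half-connected dataflow. A minimal,
progressivis-independent sketch of that commit-on-exit pattern (all names
hypothetical):

from contextlib import contextmanager

@contextmanager
def build_graph(installed):
    staged = []                  # modules collected while the block is open
    yield staged.append
    installed.extend(staged)     # committed in one step on exit

graph = []
with build_graph(graph) as add:
    add('csv')
    add('min')
    add('print')
assert graph == ['csv', 'min', 'print']
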
def test_scheduler(self):
    s = Scheduler()
    csv = CSVLoader(get_dataset('bigfile'),
                    name="csv",
                    index_col=False,
                    header=None,
                    scheduler=s)

    #smp = Sample(n=10, scheduler=s)
    #smp.input.df = csv.output.table

    self.assertIs(s["csv"], csv)
    csv.scheduler().start()

    sleep(1)
    self.assertTrue(csv.scheduler().is_running())

    #smp2 = Sample(n=15, scheduler=s)
    #smp2.input.df = csv.output.df

    def add_min():
        m = Min(scheduler=s)
        # Of course, sleeping here is a bad idea; it only illustrates
        # that add_min is executed atomically by the scheduler. A sleep
        # outside of on_tick_once would leave the dataflow in an
        # inconsistent state.
        #sleep(1)
        m.input.table = csv.output.table
        prt = Print(proc=self.terse, scheduler=s)
        prt.input.df = m.output.table

    # Queue add_min to run once, between two scheduler runs.
    s.on_tick_once(add_min)

    sleep(1)
    #self.assertTrue(s._runorder.index(smp.id) > s._runorder.index(csv.id))
    #self.assertTrue(s._runorder.index(smp2.id) > s._runorder.index(csv.id))
    #self.assertTrue(s._runorder.index(m.id) > s._runorder.index(smp2.id))
    s.stop()
    s.join()
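
The reason add_min goes through on_tick_once is that the scheduler queues the
callback and runs it between module runs, so the dataflow edit is applied
atomically. A progressivis-independent sketch of that queue-and-drain pattern
(class and method names hypothetical):

import threading
from collections import deque

class MiniScheduler:
    def __init__(self):
        self._procs = deque()
        self._lock = threading.Lock()

    def on_tick_once(self, proc):
        with self._lock:
            self._procs.append(proc)    # queued, not run immediately

    def tick(self):
        with self._lock:                # drain queued callbacks atomically
            while self._procs:
                self._procs.popleft()()

sched = MiniScheduler()
sched.on_tick_once(lambda: print('graph edit applied between ticks'))
sched.tick()
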
"""
Clustering datasets may be found at
https://cs.joensuu.fi/sipu/datasets/
"""
from progressivis import Scheduler, Every  # , log_level
from progressivis.cluster import MBKMeans
from progressivis.io import CSVLoader
from progressivis.vis import ScatterPlot
from progressivis.datasets import get_dataset

try:
    s = scheduler
except NameError:
    s = Scheduler()
    #log_level(package="progressivis.cluster")

data = CSVLoader(get_dataset('cluster:s1'),
                 sep='\\s+',
                 skipinitialspace=True,
                 header=None,
                 index_col=False,
                 scheduler=s)
mbkmeans = MBKMeans(columns=['_0', '_1'], n_clusters=15, batch_size=100,
                    is_input=False, scheduler=s)
mbkmeans.input.table = data.output.table
prn = Every(scheduler=s)
prn.input.df = mbkmeans.output.table
sp = ScatterPlot('_0', '_1', scheduler=s)

sp.move_point = mbkmeans  # for input management
sp.create_dependent_modules(data, 'table', sample=None, select=mbkmeans)

if __name__ == '__main__':
    data.start()
    s.join()
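
MBKMeans is the progressive counterpart of mini-batch k-means; the batch
baseline with scikit-learn (an assumption, not part of this example, run on
synthetic stand-in data) looks like:

import numpy as np
from sklearn.cluster import MiniBatchKMeans

X = np.random.rand(10_000, 2)   # stand-in for columns '_0', '_1' of cluster:s1
mbk = MiniBatchKMeans(n_clusters=15, batch_size=100, n_init=3)
mbk.fit(X)
print(mbk.cluster_centers_)     # 15 centroids, analogous to MBKMeans' output
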
Example #6
# Assumes s (the Scheduler) and data (the CSVLoader) are set up as in the
# previous example, and that Every, MBKMeans, ScatterPlot, Min, Max,
# Histogram2D and Heatmap have been imported (likely from progressivis.stats
# and progressivis.vis).
mbkmeans = MBKMeans(columns=[0, 1], n_clusters=15, batch_size=100, scheduler=s)
mbkmeans.input.df = data.output.df
prn = Every(scheduler=s)
prn.input.df = mbkmeans.output.df
sp = ScatterPlot(0, 1, scheduler=s)
sp.move_point = mbkmeans  # for input management
#sp.create_dependent_modules(mbkmeans, 'centroids')
# Create the modules by hand rather than with the utility:
# the cluster centroids are shown on the scatterplot and the
# data as a heatmap.

# histogram2d
histogram2d = Histogram2D(0, 1, scheduler=s)
histogram2d.input.df = data.output.df
min_mod = Min([0, 1], scheduler=s)
max_mod = Max([0, 1], scheduler=s)
min_mod.input.df = data.output.df
max_mod.input.df = data.output.df
histogram2d.input.min = min_mod.output.df
histogram2d.input.max = max_mod.output.df
# heatmap
heatmap = Heatmap(filename='heatmap%d.png', history=100, scheduler=s)
heatmap.input.array = histogram2d.output.df
# scatterplot
sp.input.heatmap = heatmap.output.heatmap
sp.input.df = mbkmeans.output.df

if __name__ == '__main__':
    data.start()
    s.join()
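
The Histogram2D -> Heatmap stage above is the progressive analogue of binning
the points and colour-mapping the counts; a minimal sketch with
numpy.histogram2d (synthetic data, bin count assumed):

import numpy as np

x = np.random.rand(100_000)
y = np.random.rand(100_000)
counts, xedges, yedges = np.histogram2d(x, y, bins=256)
# counts is the 2-D array a Heatmap-style module would render as an image
print(counts.shape, int(counts.sum()))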