def test_mb_k_means(self) -> None:
    """Check that MBKMeans yields one label per row of the ``cluster:s3`` data.

    Wires a CSVLoader into an MBKMeans module (via
    ``create_dependent_modules``), attaches Print and Every sinks to the
    ``result`` and ``labels`` outputs, runs the scheduler with
    ``aio.run(s.start())`` and compares the number of labels with the
    number of rows in the loaded table.

    The test is reported as skipped when fetching the dataset raises
    ``TimeoutError``.
    """
    s = self.scheduler()
    n_clusters = 3
    try:
        dataset = (get_dataset("cluster:s3"),)
    except TimeoutError:
        # Surface the missing dataset as a skip: a plain ``return`` would be
        # counted as a pass although nothing was verified.
        self.skipTest("Cannot download cluster:s3")
    with s:
        csv = CSVLoader(
            dataset,
            sep=" ",
            skipinitialspace=True,
            header=None,
            index_col=False,
            scheduler=s,
        )
        km = MBKMeans(
            n_clusters=n_clusters,
            random_state=42,
            is_input=False,
            is_greedy=False,
            scheduler=s,
        )
        # km.input.table = csv.output.result
        km.create_dependent_modules(csv)
        pr = Print(proc=self.terse, scheduler=s)
        pr.input[0] = km.output.result
        e = Every(proc=self.terse, scheduler=s)
        e.input[0] = km.output.labels
    aio.run(s.start())
    labels = km.labels()
    # Plain assert (not assertIsNotNone) so the following len() call sees a
    # non-optional value.
    assert labels is not None
    self.assertEqual(len(csv.table), len(labels))
def test_mb_k_means(self):
    """Check that MBKMeans yields one label per row of the ``cluster:s3`` data.

    Connects CSVLoader -> MBKMeans through the ``df`` slots, attaches a
    Print sink to the ``df`` output and an Every sink to the ``labels``
    output, starts the scheduler and compares the number of labels with
    the number of loaded rows.
    """
    # log_level()
    s = Scheduler()
    n_clusters = 3
    csv = CSVLoader(
        get_dataset('cluster:s3'),
        sep=' ',
        skipinitialspace=True,
        header=None,
        index_col=False,
        scheduler=s,
    )
    km = MBKMeans(n_clusters=n_clusters, random_state=42, is_input=False, scheduler=s)
    km.input.df = csv.output.df
    pr = Print(scheduler=s)
    pr.input.df = km.output.df
    e = Every(scheduler=s)
    e.input.df = km.output.labels
    s.start()
    # assertEqual, not the deprecated assertEquals alias (removed in Python 3.12).
    self.assertEqual(len(csv.df()), len(km.labels()))
def test_mb_k_means(self):
    """MBKMeans on ``cluster:s3`` must emit exactly one label per input row.

    The pipeline is CSVLoader -> MBKMeans, with a Print sink on the
    ``table`` output and an Every sink on the ``labels`` output; the
    scheduler is started and joined before the counts are compared.
    """
    # log_level()
    sched = self.scheduler()
    loader = CSVLoader(
        get_dataset('cluster:s3'),
        sep=' ',
        skipinitialspace=True,
        header=None,
        index_col=False,
        scheduler=sched,
    )
    kmeans = MBKMeans(
        n_clusters=3,
        random_state=42,
        is_input=False,
        scheduler=sched,
    )
    kmeans.input.table = loader.output.table

    printer = Print(proc=self.terse, scheduler=sched)
    printer.input.df = kmeans.output.table

    every = Every(proc=self.terse, scheduler=sched)
    every.input.df = kmeans.output.labels

    sched.start()
    sched.join()

    row_count = len(loader.table())
    label_count = len(kmeans.labels())
    self.assertEqual(row_count, label_count)
# log_level(package="progressivis.cluster")
# A fixed directory under the system temp dir is used for the generated CSV
# (the mkdtemp variant is kept below for reference).
# dir_name = tempfile.mkdtemp(prefix='progressivis_tmp_')
dir_name = os.path.join(tempfile.gettempdir(), 'progressivis_tmp_')
os.makedirs(dir_name, exist_ok=True)
file_name = os.path.join(dir_name, "foobar.csv")
gen_csv(file_name, rows=99999, reset=True)  # , header='_0,_1', reset=False)

data = CSVLoader(
    file_name,
    skipinitialspace=True,
    header=None,
    index_col=False,
    scheduler=s,
)

n_clusters = 3
mbkmeans = MBKMeans(
    columns=['_0', '_1'],
    n_clusters=n_clusters,
    batch_size=100,
    tol=0.01,
    is_input=False,
    scheduler=s,
)

# One class description per cluster, each fed by its own MBKMeansFilter.
classes = []
for cluster_id in range(n_clusters):
    cluster_filter = MBKMeansFilter(cluster_id)
    cluster_filter.create_dependent_modules(mbkmeans, data, 'table')
    # Only the first class gets the MBKMeans module as its 'sample'.
    sample_module = None
    if cluster_id == 0:
        sample_module = mbkmeans
    class_spec = {
        'name': f"k{cluster_id}",
        'x_column': '_0',
        'y_column': '_1',
        'sample': sample_module,
        'input_module': cluster_filter,
        'input_slot': 'table',
    }
    classes.append(class_spec)
# Demo script: clusters a RandomTable with MBKMeans and shows it through an
# MCScatterPlot.
from progressivis import Scheduler, Print
from progressivis.cluster import MBKMeans
from progressivis.stats import RandomTable
from progressivis.vis import MCScatterPlot
import asyncio as aio

# Reuse a `scheduler` name if one is already defined (presumably injected by
# the hosting environment — confirm); otherwise create a fresh Scheduler.
# Only NameError is caught so that unrelated failures are not swallowed.
try:
    s = scheduler
except NameError:
    s = Scheduler()

table = RandomTable(columns=['a', 'b'], rows=50000, throttle=500, scheduler=s)
mbkmeans = MBKMeans(
    columns=['a', 'b'],
    n_clusters=8,
    batch_size=100,
    is_input=False,
    scheduler=s,
)
mbkmeans.input.table = table.output.table

prn = Print(scheduler=s)
prn.input.df = mbkmeans.output.table

sp = MCScatterPlot(scheduler=s, classes=[('Scatterplot', 'a', 'b')], approximate=True)
sp.create_dependent_modules(mbkmeans, 'table')
sp['Scatterplot'].range_query_2d.hist_index_x.params.init_threshold = 1
sp['Scatterplot'].range_query_2d.hist_index_y.params.init_threshold = 1

if __name__ == '__main__':
    # table.start()
    # The extra sleep coroutine is passed to start() alongside the scheduler run.
    aio.run(s.start(coros=[aio.sleep(3600)]))
""" Clustering datasets may be found at https://cs.joensuu.fi/sipu/datasets/ """ from progressivis import Scheduler, Every#, log_level from progressivis.cluster import MBKMeans from progressivis.io import CSVLoader from progressivis.vis import ScatterPlot from progressivis.datasets import get_dataset try: s = scheduler except NameError: s = Scheduler() #log_level(package="progressivis.cluster") data = CSVLoader(get_dataset('cluster:s1'),sep='\\s+',skipinitialspace=True,header=None,index_col=False,scheduler=s) mbkmeans = MBKMeans(columns=['_0', '_1'], n_clusters=15, batch_size=100, is_input=False, scheduler=s) mbkmeans.input.table = data.output.table prn = Every(scheduler=s) prn.input.df = mbkmeans.output.table sp = ScatterPlot('_0','_1', scheduler=s) sp.move_point = mbkmeans # for input management sp.create_dependent_modules(data,'table', sample=None, select=mbkmeans) if __name__ == '__main__': data.start() s.join()