def get_edges(self):
    """Return the graph's edge list as a DataFrame with columns [src, dest].

    Downloads the dataset if needed, converts the WebGraph binary format to
    an ASCII arc list via the webgraph jar, loads it, and removes the
    temporary CSV afterwards.
    """
    if not self.has_downloaded():
        self.download()
    graph_path = config.get_download_file_path(self.name, self.name + '.graph')
    self.logger.info("reading {}".format(graph_path))
    command = "{} -server -cp {} it.unimi.dsi.webgraph.ArcListASCIIGraph {} {}".format(
        utils.get_java_command(), config.WEBGRAPH_JAR_PATH,
        self.name + '-hc-t', self.name)
    self.logger.info("Running " + command)
    timecost, status = utils.timeit(
        lambda: utils.shell_run_and_wait(command, self.download_dir))
    if status != 0:
        raise Exception(
            "Run command with error status code {}".format(status))
    # The converter writes a tab-separated file named after the dataset.
    arc_csv = os.path.join(self.download_dir, self.name)
    edge_df = pd.read_csv(arc_csv, header=None, sep='\t')
    edge_df.columns = ['src', 'dest']
    utils.remove_if_file_exit(arc_csv)
    return edge_df
def run(self, data, seed=None):
    """Run Infomap (OSLOM bundle) on `data` and store/return clustering result.

    :param data: dataset object providing a pajek file and metadata.
    :param seed: RNG seed; drawn randomly when None.
    :return: self (fluent), with `self.result` populated.
    :raises Exception: when the external command exits non-zero.
    """
    if seed is None:
        seed = np.random.randint(999999)
    params = {'seed': seed}
    # NOTE(review): this guard is intentionally disabled by `and False` in the
    # original source; kept as-is to preserve behavior.
    if (data.is_directed() or data.is_weighted()) and False:
        raise Exception(
            "only undirected and unweighted graph is supported")
    if not utils.file_exists(data.file_pajek):
        data.to_pajek()
    with utils.TempDir() as tmp_dir:

        def link_file(path, destname=None):
            # Symlink `path` into the temp working dir (replacing any stale link).
            if destname is None:
                destname = path.split("/")[-1]
            destpath = os.path.join(tmp_dir, destname)
            utils.remove_if_file_exit(destpath)
            os.symlink(path, destpath)
            return destpath

        pajek = link_file(data.file_pajek, 'pajek.txt')
        cmd = "{} {} {} {}".format(
            config.get_OSLOM_prog('infomap', data.is_directed()), seed, pajek,
            1)
        self.logger.info("Running " + cmd)
        timecost, status = utils.timeit(
            lambda: utils.shell_run_and_wait(cmd, tmp_dir))
        if status != 0:
            raise Exception(
                "Run command with error status code {}".format(status))
        outputfile = glob.glob(tmp_dir + "/pajek.tree")[0]
        import pandas as pd
        output = pd.read_csv(outputfile, sep=" ", skiprows=1, header=None)
        # Tree lines look like "1:2:3 0.00123 2"; the cluster id is the part
        # before the first ':' of column 0 and the node id is column 2.
        output['cluster'] = output.loc[:, 0].map(
            lambda u: int(u.split(':')[0]))
        # BUG FIX: np.int was deprecated in NumPy 1.20 and later removed;
        # the builtin int behaves identically here.
        output['node'] = output.loc[:, 2].astype(int)
        clusters = output[['cluster', 'node']]
        clusters = clusters.groupby('cluster').apply(
            lambda u: list(u['node'])).to_dict()
        self.logger.info("Made %d clusters in %f seconds" %
                         (len(clusters), timecost))

        result = {}
        result['runname'] = self.name
        result['params'] = params
        result['dataname'] = data.name
        result['meta'] = self.get_meta()
        result['timecost'] = timecost
        result['clusters'] = clusters
        save_result(result)
        self.result = result
    return self
def run(self, data, f=False, m=None, seed=None):
    """Run the HiReCS hierarchical clustering binary on `data`.

    :param f: when truthy, pass the '-f' flag to hirecs.
    :param m: optional value passed as '-m<m>'.
    :param seed: ignored by hirecs; a note is logged if given.
    :return: self, with `self.result` populated (overlapping clusters).
    :raises Exception: when the external command exits non-zero.
    """
    if False and (data.is_directed()):
        raise Exception("only undirected is supported")
    if seed is not None:
        self.logger.info("seed ignored")
    params = {}
    params['f'] = f
    params['m'] = m
    if not utils.file_exists(data.file_hig):
        data.to_higformat()
    cmd = ["./hirecs"]
    cmd.append("-oje")
    if f:
        cmd.append('-f')
    if m is not None:
        cmd.append("-m{}".format(m))
    cmd.append(data.file_hig)
    cmd.append("> output")
    cmd = " ".join(cmd)
    with utils.TempDir() as tmp_dir:
        # Write the command to a wrapper script so the shell performs the
        # "> output" redirection.
        # BUG FIX: the file handles were named `f`, shadowing the boolean
        # parameter `f`; renamed to keep the parameter readable throughout.
        with open(os.path.join(tmp_dir, "tmpcmd"), 'wt') as script_file:
            script_file.write(cmd)
        self.logger.info("Running " + cmd)
        cmd = "bash tmpcmd"
        utils.link_file(os.path.join(config.HIRECS_PATH, 'hirecs'), tmp_dir)
        utils.link_file(os.path.join(config.HIRECS_PATH, 'libhirecs.so'), tmp_dir)
        utils.link_file(data.file_hig, tmp_dir)
        timecost, status = utils.timeit(
            lambda: utils.shell_run_and_wait(cmd, tmp_dir))
        if status != 0:
            raise Exception("Run command with error status code {}".format(status))
        with open(os.path.join(tmp_dir, "output"), "r") as out_file:
            output = json.load(out_file)
        mod = output['mod']
        communities = output['communities']
        clusters = {}
        # Each community maps node-id strings to membership info; we only
        # need the node ids.
        for c in communities:
            clusters[int(c)] = [int(u) for u in communities[c].keys()]
        self.logger.info("Made %d clusters in %f seconds with modularity %f" %
                         (len(clusters), timecost, mod))

        result = {}
        result['runname'] = self.name
        result['params'] = params
        result['overlap'] = True
        result['dataname'] = data.name
        result['meta'] = self.get_meta()
        result['timecost'] = timecost
        result['clusters'] = clusters
        save_result(result)
        self.result = result
    return self
def run(self, data, seed=None, lamb=1, trials=5, temp_step=0.999, initial_temp=1e-6):
    """Run the OSLOM `modopt` modularity-optimisation program on `data`.

    The simulated-annealing knobs (lamb, trials, temp_step, initial_temp)
    are forwarded verbatim to the binary; the result is stored on self.
    """
    params = locals()
    del params['data']
    del params['self']
    if seed is None:
        seed = np.random.randint(999999)
    params['seed'] = seed
    # NOTE(review): guard disabled by `and False` in the original; kept.
    if (data.is_directed() or data.is_weighted()) and False:
        raise Exception(
            "only undirected and unweighted graph is supported")
    if not utils.file_exists(data.file_edges):
        data.to_edgelist()
    with utils.TempDir() as tmp_dir:
        edge_link = os.path.join(tmp_dir, 'edges.txt')
        utils.remove_if_file_exit(edge_link)
        os.symlink(data.file_edges, edge_link)
        cmd = "{} {} {} {} {} {} {}".format(
            config.get_OSLOM_prog('modopt', data.is_directed()), edge_link,
            seed, lamb, trials, temp_step, initial_temp)
        self.logger.info("Running " + cmd)
        timecost, status = utils.timeit(
            lambda: utils.shell_run_and_wait(cmd, tmp_dir))
        if status != 0:
            raise Exception(
                "Run command with error status code {}".format(status))
        # modopt writes "<input>part": one tab-separated cluster per line.
        partition_file = edge_link + "part"
        with open(partition_file) as fh:
            rows = [line.strip() for line in fh]
        member_lists = [[int(tok) for tok in row.split("\t")] for row in rows]
        clusters = dict(enumerate(member_lists))
        self.logger.info("Made %d clusters in %f seconds" %
                         (len(clusters), timecost))

        result = {}
        result['runname'] = self.name
        result['params'] = params
        result['dataname'] = data.name
        result['meta'] = self.get_meta()
        result['timecost'] = timecost
        result['clusters'] = clusters
        save_result(result)
        self.result = result
    return self
def to_higformat(self, filepath=None):
    """Convert this dataset to HiReCS .hig format, returning the file path.

    Converts via the external HIG converter (which reads the pajek file),
    creating the pajek file first if needed. Returns immediately when the
    target already exists.

    NOTE(review): when a custom `filepath` is given, the converter is still
    pointed at `self.file_pajek` and presumably writes next to it — confirm
    the converter honours the custom path before relying on it.
    :raises Exception: when the converter exits non-zero.
    """
    # Fix: compare to None with `is`, per PEP 8.
    if filepath is None:
        filepath = self.file_hig
    if utils.file_exists(filepath):
        return filepath
    if not utils.file_exists(self.file_pajek):
        self.to_pajek()
    cmd = "python {} {}".format(config.HIG_CONVERT_PROG, self.file_pajek)
    self.logger.info("running " + cmd)
    status = utils.shell_run_and_wait(cmd)
    if status != 0:
        raise Exception("run command failed: " + str(status))
    return filepath
def run(self, data, seed=None):
    """Run the configured dct program (`self.progname`) on `data`.

    The seed is passed to the program through the SEED environment
    variable. Nodes are assumed to be numbered contiguously from zero.
    """
    self.logger.warning(
        "dct assumes node starts with zero and is continuous")
    if seed is None:
        seed = np.random.randint(999999)
    params = {'seed': seed, 'prog': self.progname}
    # NOTE(review): guard disabled by `and False` in the original; kept.
    if (data.is_directed() or data.is_weighted()) and False:
        raise Exception(
            "only undirected and unweighted graph is supported")
    if not utils.file_exists(data.file_edges):
        data.to_edgelist()
    with utils.TempDir() as tmp_dir:
        edge_file = utils.link_file(data.file_edges, dest_dir=tmp_dir,
                                    destname='edges.txt')
        cmd = "{} {}".format(
            config.get_dct_prog(self.progname, data.is_directed()), edge_file)
        self.logger.info("Running " + cmd)
        timecost, status = utils.timeit(lambda: utils.shell_run_and_wait(
            cmd, tmp_dir, env={'SEED': str(seed)}))
        if status != 0:
            raise Exception(
                "Run command with error status code {}".format(status))
        # The program may emit several per-level "output*.cluster" files;
        # concatenate them all.
        cluster_files = glob.glob(tmp_dir + "/output*.cluster")
        import pandas as pd
        frames = (pd.read_csv(path, sep=" ", header=None)
                  for path in cluster_files)
        table = pd.concat(frames, ignore_index=True)
        table.columns = ['node', 'cluster']
        clusters = table[['cluster', 'node']].groupby('cluster').apply(
            lambda grp: list(grp['node'])).to_dict()
        self.logger.info("Made %d clusters in %f seconds" %
                         (len(clusters), timecost))

        result = {}
        result['runname'] = self.name
        result['params'] = params
        result['dataname'] = data.name
        result['meta'] = self.get_meta()
        result['timecost'] = timecost
        result['clusters'] = clusters
        save_result(result)
        self.result = result
    return self
def run(self, data, seed=None):
    """Run dct's seq_louvain-backed infomap on `data`.

    Output file has one cluster id per line; the node id is the line index
    (nodes are assumed contiguous starting at zero).
    """
    self.logger.warning(
        "dct::seq_louvain assumes node starts with zero and is continuous")
    if seed is None:
        seed = np.random.randint(999999)
    params = {'seed': seed}
    # NOTE(review): guard disabled by `and False` in the original; kept.
    if (data.is_directed() or data.is_weighted()) and False:
        raise Exception(
            "only undirected and unweighted graph is supported")
    if not utils.file_exists(data.file_edges):
        data.to_edgelist()
    with utils.TempDir() as tmp_dir:
        edge_file = utils.link_file(data.file_edges, dest_dir=tmp_dir,
                                    destname='edges.txt')
        cmd = "{} -f -s {} -o output {}".format(
            config.get_dct_prog('infomap', data.is_directed()), seed,
            edge_file)
        self.logger.info("Running " + cmd)
        timecost, status = utils.timeit(
            lambda: utils.shell_run_and_wait(cmd, tmp_dir))
        if status != 0:
            raise Exception(
                "Run command with error status code {}".format(status))
        import pandas as pd
        table = pd.read_csv(tmp_dir + "/output", sep=" ", header=None)
        table.columns = ['cluster']
        table['node'] = range(len(table))
        clusters = table[['cluster', 'node']].groupby('cluster').apply(
            lambda grp: list(grp['node'])).to_dict()
        self.logger.info("Made %d clusters in %f seconds" %
                         (len(clusters), timecost))

        result = {}
        result['runname'] = self.name
        result['params'] = params
        result['dataname'] = data.name
        result['meta'] = self.get_meta()
        result['timecost'] = timecost
        result['clusters'] = clusters
        save_result(result)
        self.result = result
    return self
def run(self, data, algorithm=4, minpts=4, epsilon=0.5, alpha=32768, beta=32768, thread=-1, seed=None):
    """Run anyscan structural clustering on `data`.

    Parameters are forwarded to the anyscan binary; `thread` is resolved to
    an actual thread count. `seed` is ignored (anyscan is not seeded).
    """
    params = locals()
    del params['self']
    del params['data']
    if seed is not None:
        self.logger.info("seed ignored")
    thread = utils.get_num_thread(thread)
    params['thread'] = thread
    if False and (data.is_directed() or data.is_weighted()):
        raise Exception("only undirected and unweighted graph is supported")
    if not utils.file_exists(data.file_anyscan):
        data.to_anyscan()

    cmd = "{} -c {} -i {} -m {} -e {} -o {} -a {} -b {} -t {}".format(
        config.ANYSCAN_PROG, algorithm, data.file_anyscan, minpts, epsilon,
        'output', alpha, beta, thread)
    self.logger.info("Running " + cmd)

    with utils.TempDir() as tmp_dir:
        timecost, status = utils.timeit(
            lambda: utils.shell_run_and_wait(cmd, tmp_dir))
        if status != 1:
            # anyscan always return 1
            raise Exception("Run command with error status code {}".format(status))
        # Output format: first line = node count, second line = one cluster
        # id per node, space separated.
        with open(os.path.join(tmp_dir, "output"), "r") as out_file:
            raw_lines = [line.strip() for line in out_file.readlines()]
        n_nodes = int(raw_lines[0])
        assignment = [int(tok) for tok in raw_lines[1].split(" ")]
        if n_nodes != len(assignment):
            raise Exception("#node is not equals #cluster")
        from collections import defaultdict
        clusters = defaultdict(list)
        for node_id, cluster_id in enumerate(assignment):
            clusters[cluster_id].append(node_id)
        self.logger.info("Made %d clusters in %f seconds" %
                         (len(clusters), timecost))

        result = {}
        result['runname'] = self.name
        result['params'] = params
        result['dataname'] = data.name
        result['meta'] = self.get_meta()
        result['timecost'] = timecost
        result['clusters'] = clusters
        save_result(result)
        self.result = result
    return self
def run(self, data, mu=3, epsilon=0.5, prog='pScan', seed=None):
    """Run one of the pScan-family structural clustering binaries on `data`.

    `prog` selects pScan / ppScan / ppScanSSE. Nodes flagged 'c' in the
    result are clustered; the rest are recorded as noise-or-hub pairs.
    """
    assert prog in ['pScan', 'ppScan', 'ppScanSSE']
    if seed is not None:
        self.logger.info("seed ignored")
    params = {'mu': mu, 'epsilon': epsilon, 'prog': prog}
    if False and (data.is_directed() or data.is_weighted()):
        raise Exception("only undirected and unweighted graph is supported")
    if not utils.file_exists(data.file_scanbin):
        data.to_scanbin()
    with utils.TempDir() as tmp_dir:
        scanbin = data.file_scanbin
        for binfile in ('b_degree.bin', 'b_adj.bin'):
            os.symlink(os.path.join(scanbin, binfile),
                       os.path.join(tmp_dir, binfile))
        # Dispatch table replaces the original if/elif chain; the assert
        # above guarantees the key exists.
        EXE = {
            'pScan': config.PSCAN_PROG,
            'ppScan': config.PPSCAN_PROG,
            'ppScanSSE': config.PPSCANSSE_PROG,
        }[prog]
        cmd = "{} {} {} {} {}".format(EXE, tmp_dir, epsilon, mu, 'output')
        self.logger.info("Running " + cmd)
        timecost, status = utils.timeit(
            lambda: utils.shell_run_and_wait(cmd, tmp_dir))
        if status != 0:
            raise Exception("Run command with error status code {}".format(status))

        import pandas as pd
        result_path = glob.glob(tmp_dir + "/result-*-*.txt")[0]
        table = pd.read_csv(result_path, sep=" ")
        is_clustered = table['c/n'] == 'c'
        clustered = table[is_clustered][['vertex_id', 'cluster_id']]
        others = table[~is_clustered][['vertex_id', 'cluster_id']]
        clusters = clustered.groupby('cluster_id').apply(
            lambda grp: list(grp['vertex_id'])).to_dict()
        others = others.values.tolist()
        self.logger.info("Made %d clusters in %f seconds" %
                         (len(clusters), timecost))

        result = {}
        result['runname'] = self.name
        result['params'] = params
        result['dataname'] = data.name
        result['meta'] = self.get_meta()
        result['timecost'] = timecost
        result['clusters'] = clusters
        result['noise_or_hub'] = others
        save_result(result)
        self.result = result
    return self
def to_scanbin(self, filepath=None):
    """Convert this dataset to the scan binary format, returning the directory.

    Produces b_degree.bin and b_adj.bin inside `filepath` (defaults to
    self.file_scanbin); returns early when both already exist.
    :raises Exception: when the converter exits non-zero.
    """
    # Fix: compare to None with `is`, per PEP 8.
    if filepath is None:
        filepath = self.file_scanbin
    if utils.file_exists(os.path.join(
            filepath, "b_degree.bin")) and utils.file_exists(
                os.path.join(filepath, "b_adj.bin")):
        return filepath
    utils.create_dir_if_not_exists(filepath)
    if not utils.file_exists(self.file_edges):
        self.to_edgelist()
    cmd = "{} {} {} {}".format(config.SCAN_CONVERT_PROG, self.file_edges,
                               "b_degree.bin", "b_adj.bin")
    self.logger.info("running " + cmd)
    status = utils.shell_run_and_wait(cmd, filepath)
    if status != 0:
        raise Exception("run command failed: " + str(status))
    return filepath
def to_mcl_mci(self, filepath=None):
    """Convert this dataset to MCL's binary .mci matrix format.

    Undirected graphs get --stream-mirror so each edge is counted both ways.
    Returns the target path; returns early when it already exists.
    :raises Exception: when the converter exits non-zero.
    """
    # Fix: compare to None with `is`, per PEP 8.
    if filepath is None:
        filepath = self.file_mcl_mci
    if utils.file_exists(filepath):
        return filepath
    if not utils.file_exists(self.file_edges):
        self.to_edgelist()
    if not self.is_directed():
        cmd = "{} --stream-mirror -123 {} -o {} --write-binary".format(
            config.MCL_CONVERT_PROG, self.file_edges, filepath)
    else:
        cmd = "{} -123 {} -o {} --write-binary".format(
            config.MCL_CONVERT_PROG, self.file_edges, filepath)
    self.logger.info("running " + cmd)
    status = utils.shell_run_and_wait(cmd)
    if status != 0:
        raise Exception("run command failed: " + str(status))
    return filepath
def run(self, data, **kwargs):
    """Run the GANXiSw (SLPAw) overlapping community detector on `data`.

    Keyword arguments are forwarded as -<key> <value> options; Sym=1 is
    forced for undirected graphs and r defaults to 0.1.
    """
    if False and (data.is_directed()):
        raise Exception("only undirected is supported")
    params = {key: val for key, val in dict(kwargs).items() if val is not None}
    if not data.is_directed():
        params['Sym'] = 1
    params['d'] = "output"
    if "r" not in params:
        params['r'] = 0.1
    if not utils.file_exists(data.file_edges):
        data.to_edgelist()
    option_str = " ".join("-{} {}".format(key, val)
                          for key, val in params.items())
    cmd = "{} -jar {} -i {} {} ".format(utils.get_java_command(),
                                        config.GANXISW_PROG, "edges.txt",
                                        option_str)
    with utils.TempDir() as tmp_dir:
        # Start from an empty output directory.
        utils.remove_if_file_exit(os.path.join(tmp_dir, "output"), True)
        utils.create_dir_if_not_exists(os.path.join(tmp_dir, "output"))
        self.logger.info("Running " + cmd)
        utils.link_file(data.file_edges, tmp_dir, "edges.txt")
        timecost, status = utils.timeit(
            lambda: utils.shell_run_and_wait(cmd, tmp_dir))
        if status != 0:
            raise Exception("Run command with error status code {}".format(status))
        icpm_path = glob.glob(os.path.join(tmp_dir, "output/SLPAw*.icpm"))[0]
        # Each .icpm line is one community: space-separated node ids.
        member_lists = []
        with open(os.path.join(tmp_dir, icpm_path), "r") as icpm:
            for line in icpm:
                member_lists.append([int(tok) for tok in line.strip().split(" ")])
        clusters = dict(enumerate(member_lists))
        self.logger.info("Made %d clusters in %f seconds" %
                         (len(clusters), timecost))

        result = {}
        result['runname'] = self.name
        result['params'] = params
        result['dataname'] = data.name
        result['meta'] = self.get_meta()
        result['timecost'] = timecost
        result['clusters'] = clusters
        save_result(result)
        self.result = result
    return self
def to_topgc_fromat(self, filepath=None):
    """Convert this dataset to TopGC index format, returning the file path.

    Sorts the edge list numerically and pipes it through the 'mkidx' tool;
    on failure the partial output file is removed by the shell pipeline.
    NOTE: the "fromat" typo in the name is kept — callers use this name.
    :raises Exception: when the external command exits non-zero.
    """
    # Fix: compare to None with `is`, per PEP 8.
    if filepath is None:
        filepath = self.file_topgc
    if utils.file_exists(filepath):
        return filepath
    if not utils.file_exists(self.file_edges):
        self.to_edgelist()
    # Consistency fix: every other method in this module uses
    # utils.TempDir(); the bare TempDir here would raise NameError unless
    # separately imported.
    with utils.TempDir() as tmp_dir:
        cmd = "cat {}|sort -k1,1 -k2,2 -n | {} > {} || rm {}".format(
            self.file_edges, config.get_cdc_prog('mkidx'), filepath, filepath)
        self.logger.info("Running " + cmd)
        # Run through a script so the shell handles the pipes/redirection.
        with open(os.path.join(tmp_dir, "tmpcmd"), 'wt') as f:
            f.write(cmd)
        timecost, status = utils.timeit(
            lambda: utils.shell_run_and_wait("bash -x tmpcmd", tmp_dir))
        if status != 0:
            raise Exception(
                "Run command with error status code {}".format(status))
    assert utils.file_exists(filepath)
    return filepath
def run(self, data, cutoff_r=0.01, inflation_in=2, NBDisimilarity_q=0.3, seed=None):
    """Run the LabelRank overlapping community detector on `data`.

    cutoff_r / inflation_in / NBDisimilarity_q are LabelRank's algorithm
    parameters; `seed` is ignored.
    """
    if False and (data.is_directed()):
        raise Exception("only undirected is supported")
    if seed is not None:
        self.logger.info("seed ignored")
    params = locals()
    del params['self']
    del params['data']
    if not utils.file_exists(data.file_edges):
        data.to_edgelist()
    cmd = "{} {} {} {} {}".format(config.LABLE_RANK_PROG, "edges.txt",
                                  cutoff_r, inflation_in, NBDisimilarity_q)
    with utils.TempDir() as tmp_dir:
        # Start from an empty output directory.
        utils.remove_if_file_exit(os.path.join(tmp_dir, "output"), True)
        utils.create_dir_if_not_exists(os.path.join(tmp_dir, "output"))
        self.logger.info("Running " + cmd)
        utils.link_file(data.file_edges, tmp_dir, "edges.txt")
        timecost, status = utils.timeit(
            lambda: utils.shell_run_and_wait(cmd, tmp_dir))
        if status != 0:
            raise Exception("Run command with error status code {}".format(status))
        icpm_path = glob.glob(os.path.join(tmp_dir, "output/LabelRank*.icpm"))[0]
        # Each .icpm line is one community: space-separated node ids.
        member_lists = []
        with open(os.path.join(tmp_dir, icpm_path), "r") as icpm:
            for line in icpm:
                member_lists.append([int(tok) for tok in line.strip().split(" ")])
        clusters = dict(enumerate(member_lists))
        self.logger.info("Made %d clusters in %f seconds" %
                         (len(clusters), timecost))

        result = {}
        result['runname'] = self.name
        result['params'] = params
        result['dataname'] = data.name
        result['meta'] = self.get_meta()
        result['timecost'] = timecost
        result['clusters'] = clusters
        save_result(result)
        self.result = result
    return self
def md5check(self):
    """Return True when the downloaded files match the dataset's md5 manifest."""
    command = 'md5sum --ignore-missing -c {}.md5sums'.format(self.name)
    exit_code = utils.shell_run_and_wait(
        command, config.get_download_file_path(self.name))
    return exit_code == 0
def run(self, data, **kwargs):
    """Run lso-cluster on `data` and store/return the clustering result.

    kwargs are forwarded as --<key> <value> options, except 'loss' which
    gets special handling: for the parameterised losses 'pmod'/'mom' a
    matching kwarg (pmod=<val> / mom=<val>) must be supplied and is folded
    into a single "--loss {name,val}" argument.
    :return: self, with `self.result` populated.
    :raises Exception: on missing loss parameter or non-zero exit status.
    """
    # NOTE(review): this guard is intentionally disabled by `False and`.
    if False and (data.is_directed() or data.is_weighted()):
        raise Exception(
            "only undirected and unweighted graph is supported")
    params = dict(kwargs)
    params = {u: v for u, v in params.items() if v is not None}
    if "loss" not in params:
        params['loss'] = 'modularity'
    # argparams is a working copy: the loss-related keys are deleted from it
    # below, while `params` keeps the full set for the saved result record.
    argparams = dict(params)
    if argparams['loss'] in ['pmod', 'mom']:
        # These losses take a numeric parameter passed as a kwarg named
        # after the loss itself (e.g. pmod=0.5).
        if argparams['loss'] not in argparams:
            raise Exception(
                "You have to specify pmod=<val> or mom=<val> for the loss function"
            )
        loss_args = "--loss {{{},{}}}".format(argparams['loss'],
                                              argparams[argparams['loss']])
        del argparams[argparams['loss']]
        del argparams['loss']
    else:
        loss_args = "--loss {}".format(argparams["loss"])
        del argparams['loss']
    if not utils.file_exists(data.file_edges):
        data.to_edgelist()
    with utils.TempDir() as tmp_dir:
        pajek = utils.link_file(data.file_edges, dest_dir=tmp_dir, destname='edges.txt')
        cmdargs = ["--{} {}".format(u, v) for u, v in argparams.items()]
        cmdargs.append(loss_args)
        cmdargs.append("-o lsooutput")
        cmdargs = " ".join(cmdargs)
        cmd = "{} {} {}".format(config.LSO_CLUSTER_PROG, pajek, cmdargs)
        # Write the command to a script and run it via bash so brace
        # arguments like "--loss {pmod,0.5}" survive unexpanded quoting.
        with open(os.path.join(tmp_dir, "tmpcmd"), 'wt') as f:
            f.write(cmd)
        self.logger.info("Running " + cmd)
        timecost, status = utils.timeit(
            lambda: utils.shell_run_and_wait("bash -x ./tmpcmd", tmp_dir))
        if status != 0:
            raise Exception(
                "Run command with error status code {}".format(status))
        # Output file: one "<node>\t<cluster>" row per node.
        outputfiles = tmp_dir + "/lsooutput"
        import pandas as pd
        output = pd.read_csv(outputfiles, sep="\t", header=None)
        output.columns = ['node', 'cluster']
        clusters = output
        clusters = clusters.groupby('cluster').apply(
            lambda u: list(u['node'])).to_dict()
        self.logger.info("Made %d clusters in %f seconds" %
                         (len(clusters), timecost))

        result = {}
        result['runname'] = self.name
        result['params'] = params
        result['dataname'] = data.name
        result['meta'] = self.get_meta()
        result['timecost'] = timecost
        result['clusters'] = clusters
        save_result(result)
        self.result = result
    return self
def run(self, data, vmax_start=None, vmax_end=None, c=None, niter=None, seed=None):
    """Run the streamcom streaming community detector on `data`.

    Non-None keyword arguments become command-line options (underscores
    replaced by dashes; single-letter names get '-', longer names '--').
    NOTE(review): the edge file appears in the command twice — once via the
    'f' option and once as the trailing argument; confirm against the
    streamcom CLI whether both are required.
    :raises Exception: when the external command exits non-zero.
    """
    if False and (data.is_directed() or data.is_weighted()):
        raise Exception(
            "only undirected and unweighted graph is supported")
    params = locals()
    del params['self']
    del params['data']
    params['f'] = data.file_edges
    params['o'] = 'output'
    params = {
        u.replace("_", "-"): v
        for u, v in params.items() if v is not None
    }
    if not utils.file_exists(data.file_edges):
        data.to_edgelist()
    cmd = "{} {} {}".format(
        config.STREAMCOM_PROG, " ".join([
            '{}{} {}'.format('-' if len(u) == 1 else '--', u, v)
            for u, v in params.items()
        ]), data.file_edges)
    self.logger.info("Running " + cmd)
    with utils.TempDir() as tmp_dir:
        timecost, status = utils.timeit(
            lambda: utils.shell_run_and_wait(cmd, tmp_dir))
        if status != 0:
            raise Exception(
                "Run command with error status code {}".format(status))
        # Each non-comment output line is one community of space-separated
        # node ids, keyed by its line index.
        output_path = glob.glob(os.path.join(tmp_dir, "output*"))[0]
        with open(os.path.join(tmp_dir, output_path), "r") as out_file:
            lines = [u.strip() for u in out_file.readlines()]
        from collections import defaultdict
        clusters = defaultdict(list)
        # BUG FIX: the loop variable was named `c`, shadowing the parameter
        # `c` (already captured in params above, but fragile); renamed.
        for line_idx, line in enumerate(lines):
            if line.startswith('#'):
                continue
            for n in line.split(" "):
                clusters[line_idx].append(int(n))
        self.logger.info("Made %d clusters in %f seconds" %
                         (len(clusters), timecost))

        result = {}
        result['runname'] = self.name
        result['params'] = params
        result['dataname'] = data.name
        result['meta'] = self.get_meta()
        result['timecost'] = timecost
        result['clusters'] = clusters
        save_result(result)
        self.result = result
    return self
def run(self, data, **kwargs):
    """Run MCL on `data`'s binary .mci matrix, then mcxdump the clustering.

    kwargs become mcl options ('-x v' for single letters, '--x v' otherwise);
    any 'seed' kwarg is dropped since MCL does not take one.
    """
    if False and (data.is_directed()):
        raise Exception("only undirected is supported")
    params = dict(kwargs)
    if "seed" in params:
        # MCL is deterministic; discard the seed (log if one was supplied).
        if params.pop("seed") is not None:
            self.logger.info("seed ignored")
    params['o'] = 'output'
    params = {key: val for key, val in params.items() if val is not None}

    if not utils.file_exists(data.file_mcl_mci):
        data.to_mcl_mci()
    if not utils.file_exists(data.file_mcl_mci):
        raise Exception("failed to crate mcl mci format file")
    with utils.TempDir() as tmp_dir:
        option_str = " ".join(
            '{}{} {}'.format('-' if len(key) == 1 else '--', key, val).strip()
            for key, val in params.items())
        cmd1 = "{} {} {}".format(config.MCL_PROG, data.file_mcl_mci, option_str)
        cmd2 = "{} -imx {} -o cluster.output".format(config.MCLDUMP_PROG,
                                                     'output')
        # Chain the two commands in a single script: cluster, then dump.
        cmdfile = os.path.join(tmp_dir, "tmpcmd.sh")
        with open(cmdfile, 'wt') as script:
            script.write(cmd1 + "\n")
            script.write(cmd2 + "\n")
        self.logger.info("Running " + cmd1)
        self.logger.info("Running " + cmd2)
        timecost, status = utils.timeit(
            lambda: utils.shell_run_and_wait("bash " + cmdfile, tmp_dir))
        if status != 0:
            raise Exception(
                "Run command with error status code {}".format(status))
        # Dump format: "<cluster>\t<node>" (extra columns ignored).
        with open(os.path.join(tmp_dir, "cluster.output"), "r") as dump:
            rows = [line.strip() for line in dump.readlines()]
        from collections import defaultdict
        clusters = defaultdict(list)
        for row in rows:
            cluster_id, node_id = row.split("\t")[:2]
            clusters[int(cluster_id)].append(int(node_id))
        self.logger.info("Made %d clusters in %f seconds" %
                         (len(clusters), timecost))

        result = {}
        result['runname'] = self.name
        result['params'] = params
        result['dataname'] = data.name
        result['meta'] = self.get_meta()
        result['timecost'] = timecost
        result['clusters'] = clusters
        save_result(result)
        self.result = result
    return self
def run(self, data, execution='async', ncpus=None, scheduler=None, engine_opts=None, graph_opts=None, scheduler_opts=None, seed=None):
    """Run PowerGraph label propagation on `data`.

    Engine/scheduler options are forwarded as --<key> <value>; `seed` is
    ignored. Per-shard output files are concatenated into one clustering.
    """
    if seed is not None:
        self.logger.info("seed ignored")
    params = locals()
    del params['self']
    del params['data']
    del params['seed']
    params = {key: val for key, val in params.items() if val is not None}
    # NOTE(review): guard disabled by `and False` in the original; kept.
    if (data.is_directed() or data.is_weighted()) and False:
        raise Exception(
            "only undirected and unweighted graph is supported")
    ncpus = utils.get_num_thread(ncpus)
    params['ncpus'] = ncpus
    # Pass graph properties to the binary as 0/1 integers.
    params['weighted'] = data.is_weighted() * 1
    params['directed'] = data.is_directed() * 1
    if not utils.file_exists(data.file_edges):
        data.to_edgelist()
    with utils.TempDir() as tmp_dir:
        edge_link = os.path.join(tmp_dir, 'edges.txt')
        utils.remove_if_file_exit(edge_link)
        os.symlink(data.file_edges, edge_link)
        option_str = " ".join("--{} {}".format(key, val)
                              for key, val in params.items())
        cmd = "{} --graph {} --saveprefix=output.cluster {}".format(
            config.get_powergraph_prog('label_propagation',
                                       data.is_directed()), edge_link,
            option_str)
        self.logger.info("Running " + cmd)
        timecost, status = utils.timeit(
            lambda: utils.shell_run_and_wait(cmd, tmp_dir))
        if status != 0:
            raise Exception(
                "Run command with error status code {}".format(status))
        # One output shard per worker; merge them all.
        shard_paths = glob.glob(tmp_dir + "/output.cluster.*")
        import pandas as pd
        shards = (pd.read_csv(path, sep="\t", header=None)
                  for path in shard_paths)
        table = pd.concat(shards, ignore_index=True)
        table.columns = ['node', 'cluster']
        clusters = table[['cluster', 'node']].groupby('cluster').apply(
            lambda grp: list(grp['node'])).to_dict()
        self.logger.info("Made %d clusters in %f seconds" %
                         (len(clusters), timecost))

        result = {}
        result['runname'] = self.name
        result['params'] = params
        result['dataname'] = data.name
        result['meta'] = self.get_meta()
        result['timecost'] = timecost
        result['clusters'] = clusters
        save_result(result)
        self.result = result
    return self
def generate(self, seed=None):
    """Generate a synthetic graph according to `self.params['name']`.

    Supported generators:
      - 'Erdos-Renyi': via snap; returns (edges DataFrame, None).
      - 'ovp_LFR': overlapping LFR binary; returns make_offset_0(edges, gt).
      - 'LFR': classic LFR binary (optionally hierarchical); returns
        make_offset_0(edges, gt) where gt is a (level1, level2) tuple when
        params['hier'] is set.
    :param seed: RNG seed for the LFR binaries; drawn randomly when None.
    :raises Exception: on unknown generator name or non-zero exit status.
    """
    params = self.params
    if params['name'] == 'Erdos-Renyi':
        gtype = snap.PNGraph if params['directed'] else snap.PUNGraph
        g = snap.GenRndGnm(gtype, params['n_node'], params['n_edge'])
        lst = []
        for EI in g.Edges():
            lst.append([EI.GetSrcNId(), EI.GetDstNId()])
        return pd.DataFrame(lst, columns=['src', 'dest']), None
    elif params['name'] == 'ovp_LFR':
        if seed is None:
            seed = np.random.randint(999999)
        with utils.TempDir() as tmpdir:
            # The LFR binary reads its seed from seed.txt in the working dir.
            with open(os.path.join(tmpdir, "seed.txt"), 'wt') as seedf:
                seedf.write(str(seed))
            program = config.LFR_PROG
            newparams = dict(self.params)
            # directed/weighted select output files below but are not
            # command-line options of the binary.
            if 'directed' in newparams:
                del newparams['directed']
            if 'weighted' in newparams:
                del newparams['weighted']
            newparams['name'] = 'LFR'
            cmd = program + " " + " ".join([
                "-" + str(u[0]) + " " + str(u[1]) for u in newparams.items()
                if u[1] is not None
            ])
            self.logger.info("Runing '{}' with seed {}".format(cmd, seed))
            self.logger.info("working dir: " + tmpdir)
            status = utils.shell_run_and_wait(cmd, working_dir=tmpdir)
            if status != 0:
                raise Exception(
                    "run command failed. \nstatus={}".format(status))
            # .nsa = directed arcs, .nse = undirected edges.
            edgefile = "LFR.nsa" if self.params['directed'] else "LFR.nse"
            edgefile = os.path.join(tmpdir, edgefile)
            if self.params['weighted']:
                # BUG FIX: np.float was deprecated in NumPy 1.20 and later
                # removed; the builtin float is the documented replacement.
                edges = pd.read_csv(edgefile,
                                    sep='\t',
                                    header=None,
                                    skiprows=1,
                                    dtype={2: float})
                edges.columns = ['src', 'dest', 'weight']
            else:
                edges = pd.read_csv(edgefile,
                                    sep='\t',
                                    header=None,
                                    skiprows=1,
                                    usecols=[0, 1])
                edges.columns = ['src', 'dest']
            gtfile = "LFR.nmc"
            gtfile = os.path.join(tmpdir, gtfile)
            gt = pd.read_csv(gtfile, sep='\t', header=None)
            gt.columns = ['node', 'cluster']
            return self.make_offset_0(edges, gt)
    elif params['name'] == 'LFR':
        if seed is None:
            seed = np.random.randint(999999)
        with utils.TempDir() as tmpdir:
            # The classic LFR binary reads its seed from time_seed.dat.
            with open(os.path.join(tmpdir, "time_seed.dat"), 'wt') as seedf:
                seedf.write(str(seed))
            program = config.get_LFR_prog(weighted=params['weighted'],
                                          directed=params['directed'],
                                          hier='hier' in params
                                          and params['hier'])
            newparams = dict(self.params)
            # These keys select the binary / output files, not CLI options.
            if 'directed' in newparams:
                del newparams['directed']
            if 'weighted' in newparams:
                del newparams['weighted']
            if 'name' in newparams:
                del newparams['name']
            if 'hier' in newparams:
                del newparams['hier']
            cmd = program + " " + " ".join([
                "-" + str(u[0]) + " " + str(u[1]) for u in newparams.items()
                if u[1] is not None
            ])
            self.logger.info("Runing '{}' with seed {}".format(cmd, seed))
            self.logger.info("working dir: " + tmpdir)
            status = utils.shell_run_and_wait(cmd, working_dir=tmpdir)
            if status != 0:
                raise Exception(
                    "run command failed. \nstatus={} with seed {}".format(
                        status, seed))
            edgefile = "network.dat"
            edgefile = os.path.join(tmpdir, edgefile)
            if self.params['weighted']:
                # BUG FIX: np.float removed in modern NumPy; use float.
                edges = pd.read_csv(edgefile,
                                    sep='\t',
                                    header=None,
                                    skiprows=1,
                                    dtype={2: float})
                edges.columns = ['src', 'dest', 'weight']
            else:
                edges = pd.read_csv(edgefile,
                                    sep='\t',
                                    header=None,
                                    skiprows=1)
                edges.columns = ['src', 'dest']

            def read_comm(fname):
                # Load one ground-truth community file as [node, cluster].
                gtfile = fname
                gtfile = os.path.join(tmpdir, gtfile)
                gt = pd.read_csv(gtfile, sep='\t', header=None)
                gt.columns = ['node', 'cluster']
                return gt

            if 'hier' in params and params['hier']:
                gt = read_comm("community_first_level.dat"), read_comm(
                    "community_second_level.dat")
            else:
                gt = read_comm("community.dat")
            return self.make_offset_0(edges, gt)
    else:
        raise Exception("unknown " + params['name'])
def run(self, data, v=5, v1=None, v2=None, prop=None, repeat=None, mo=None, nosplit=False, extrasimplify=False, q=False, seed=None):
    """Run the COPRA overlapping label-propagation community detector.

    :param v: max communities per node; v1/v2 (given together) add a -vs range.
    :param prop, repeat: optional numeric COPRA options.
    :param mo, nosplit, extrasimplify, q: COPRA flags.
    :param seed: ignored by COPRA; a note is logged if given.
    :raises Exception: when the external command exits non-zero.
    """
    assert (v1 is None and v2 is None) or (v1 is not None and v2 is not None)
    params = locals()
    del params['self']
    del params['data']
    if seed is not None:
        self.logger.info("seed ignored")
    if (data.is_directed() or data.is_weighted()) and False:
        raise Exception(
            "only undirected and unweighted graph is supported")
    if not utils.file_exists(data.file_edges):
        data.to_edgelist()
    with utils.TempDir() as tmp_dir:
        pajek = os.path.join(tmp_dir, 'edges.txt')
        utils.remove_if_file_exit(pajek)
        os.symlink(data.file_edges, pajek)
        cmd = "{} -cp {} COPRA {} -w -v {}".format(
            utils.get_java_command(),
            config.get_OSLOM_prog('copra', data.is_directed()), pajek, v)
        if (v1 is not None and v2 is not None):
            # BUG FIX: the leading space was missing, so "-vs" was glued
            # onto the previous "-v <v>" argument, corrupting the command.
            cmd += " -vs {} {}".format(v1, v2)
        if prop is not None:
            cmd += ' -prop {}'.format(prop)
        if repeat is not None:
            cmd += ' -repeat {}'.format(repeat)
        if mo is not None:
            # NOTE(review): mo's value is ignored; it only toggles the flag.
            cmd += ' -mo'
        # BUG FIX: these three default to False (never None), so the
        # original `is not None` tests appended the flags unconditionally;
        # test truthiness instead so the defaults really mean "off".
        if nosplit:
            cmd += ' -nosplit'
        if extrasimplify:
            cmd += ' -extrasimplify'
        if q:
            cmd += ' -q'
        self.logger.info("Running " + cmd)
        timecost, status = utils.timeit(
            lambda: utils.shell_run_and_wait(cmd, tmp_dir))
        if status != 0:
            raise Exception(
                "Run command with error status code {}".format(status))
        # One community per line: space-separated node ids.
        outputfile = os.path.join(tmp_dir, 'clusters-edges.txt')
        with open(outputfile) as f:
            lines = [u.strip() for u in f]
        lines = [[int(v) for v in u.split(" ")] for u in lines]
        clusters = dict(enumerate(lines))
        self.logger.info("Made %d clusters in %f seconds" %
                         (len(clusters), timecost))

        result = {}
        result['runname'] = self.name
        result['params'] = params
        result['dataname'] = data.name
        result['meta'] = self.get_meta()
        result['timecost'] = timecost
        result['clusters'] = clusters
        save_result(result)
        self.result = result
    return self
def run(self, data, startk=None, finalk=None, runs=None, ensemblesize=None, algorithm=None, seed=None):
    """Run the CGGC ensemble clustering program on `data`.

    Non-None options are forwarded as --<key>=<value>; input format is the
    edge list ('e') and output format is one cluster per line ('l').
    """
    if False and (data.is_directed() or data.is_weighted()):
        raise Exception(
            "only undirected and unweighted graph is supported")
    params = locals()
    del params['self']
    del params['data']
    if seed is None:
        seed = np.random.randint(999999)
    params['seed'] = seed
    params['inpfmt'] = 'e'
    params['outfile'] = 'output'
    params['outfmt'] = 'l'
    params = {key: val for key, val in params.items() if val is not None}

    if not utils.file_exists(data.file_edges):
        data.to_edgelist()

    option_str = " ".join('--{}={}'.format(key, val)
                          for key, val in params.items())
    cmd = "{} {} {}".format(config.CGGC_PROG, option_str, data.file_edges)
    self.logger.info("Running " + cmd)
    with utils.TempDir() as tmp_dir:
        timecost, status = utils.timeit(
            lambda: utils.shell_run_and_wait(cmd, tmp_dir))
        if status != 0:
            raise Exception(
                "Run command with error status code {}".format(status))
        # Each non-comment line is one cluster of space-separated node ids,
        # keyed by its line index.
        with open(os.path.join(tmp_dir, "output"), "r") as out_file:
            raw_lines = [line.strip() for line in out_file.readlines()]
        from collections import defaultdict
        clusters = defaultdict(list)
        for line_idx, line in enumerate(raw_lines):
            if line.startswith('#'):
                continue
            for token in line.split(" "):
                clusters[line_idx].append(int(token))
        self.logger.info("Made %d clusters in %f seconds" %
                         (len(clusters), timecost))

        result = {}
        result['runname'] = self.name
        result['params'] = params
        result['dataname'] = data.name
        result['meta'] = self.get_meta()
        result['timecost'] = timecost
        result['clusters'] = clusters
        save_result(result)
        self.result = result
    return self
def run(self, data, seed=None, thread=None, threshold=1e-3, vThresh=0.0, maxIter=10, prior=True):
    """Run RelaxMap (parallel Infomap variant) on `data`.

    :param thread: worker count; resolved via utils.get_num_thread.
    :param threshold, vThresh, maxIter, prior: RelaxMap parameters,
        forwarded verbatim ('prior'/'normal' for the prior flag).
    :raises Exception: when the external command exits non-zero.
    """
    if seed is None:
        seed = np.random.randint(999999)
    params = locals()
    del params['self']
    del params['data']
    params = {u: v for u, v in params.items() if v is not None}
    thread = utils.get_num_thread(thread)
    params['thread'] = thread
    # NOTE(review): guard disabled by `and False` in the original; kept.
    if (data.is_directed() or data.is_weighted()) and False:
        raise Exception(
            "only undirected and unweighted graph is supported")
    if not utils.file_exists(data.file_edges):
        data.to_edgelist()
    with utils.TempDir() as tmp_dir:
        pajek = utils.link_file(data.file_edges, tmp_dir, 'edges.txt')
        cmd = "{prog} {seed} {graph} {thread} 1 {threshold} {vThresh} {maxIter} {output_dir} {prior}"\
            .format(prog=config.get_powergraph_prog('RelaxMap', data.is_directed()),
                    seed=seed, graph=pajek, thread=thread, threshold=threshold,
                    vThresh=vThresh, output_dir=tmp_dir, maxIter=maxIter,
                    prior='prior' if prior else 'normal').strip()
        self.logger.info("Running " + cmd)
        timecost, status = utils.timeit(
            lambda: utils.shell_run_and_wait(cmd, tmp_dir))
        if status != 0:
            raise Exception(
                "Run command with error status code {}".format(status))
        # Tree lines look like "1:2 0.0001 3": cluster path, flow, node id.
        outputfile = os.path.join(tmp_dir, "edges.tree")
        import pandas as pd
        output = pd.read_csv(outputfile, sep=" ", header=None, skiprows=1)
        # BUG FIX: np.int was deprecated in NumPy 1.20 and later removed;
        # the builtin int behaves identically here (both occurrences).
        output['node'] = output.loc[:, 2].astype(int)
        output['cluster'] = output.loc[:, 0].map(
            lambda u: u.split(':')[0]).astype(int)
        clusters = output[['cluster', 'node']]
        clusters = clusters.groupby('cluster').apply(
            lambda u: list(u['node'])).to_dict()
        self.logger.info("Made %d clusters in %f seconds" %
                         (len(clusters), timecost))

        result = {}
        result['runname'] = self.name
        result['params'] = params
        result['dataname'] = data.name
        result['meta'] = self.get_meta()
        result['timecost'] = timecost
        result['clusters'] = clusters
        save_result(result)
        self.result = result
    return self
def run(self, data, seed=None, r=10, hr=50, t=0.1, cp=0.5, fast=False, singlet=False, infomap=False, copra=False, louvain=False, runs=1):
    """Run OSLOM on `data`, optionally seeded by another algorithm.

    :param r/hr: number of runs at the first / higher hierarchical levels.
    :param t: p-value threshold; cp: coverage parameter.
    :param fast, singlet: OSLOM mode flags.
    :param infomap, copra, louvain: at most one may be True; the chosen
        helper program is symlinked into the working dir and OSLOM is told
        to use it for `runs` initial runs.
    :return: self, with a multi-level clustering in `self.result`.
    :raises Exception: on conflicting helper flags or non-zero exit status.
    """
    params = locals()
    del params['data']
    del params['self']
    if seed is None:
        seed = np.random.randint(999999)
    params['seed'] = seed
    # NOTE(review): this guard is intentionally disabled by `and False`.
    if (data.is_directed() or data.is_weighted()) and False:
        raise Exception(
            "only undirected and unweighted graph is supported")
    if int(infomap) + int(copra) + int(louvain) > 1:
        raise Exception("only of infomap, corpra, louvain can be true")
    if not utils.file_exists(data.file_edges):
        data.to_edgelist()
    with utils.TempDir() as tmp_dir:
        # tmp_dir = "/tmp/abc"
        def link_file(path, destname=None):
            # Symlink `path` into the temp working dir (replacing any stale
            # link); OSLOM expects its helper binaries beside the input.
            if destname is None:
                destname = path.split("/")[-1]
            destpath = os.path.join(tmp_dir, destname)
            utils.remove_if_file_exit(destpath)
            os.symlink(path, destpath)
            return destpath

        pajek = link_file(data.file_edges)
        if copra:
            _ = link_file(
                config.get_OSLOM_prog('copra', data.is_directed()))
        if infomap:
            # Both directed and undirected infomap helpers are linked.
            _ = link_file(config.get_OSLOM_prog('infomap_script', True))
            _ = link_file(config.get_OSLOM_prog('infomap', True))
            _ = link_file(config.get_OSLOM_prog('infomap_script', False))
            _ = link_file(config.get_OSLOM_prog('infomap', False))
        if louvain:
            _ = link_file(
                config.get_OSLOM_prog('louvain_script', data.is_directed()))
            _ = link_file(
                config.get_OSLOM_prog('convert', data.is_directed()))
            _ = link_file(
                config.get_OSLOM_prog('community', data.is_directed()))
            _ = link_file(
                config.get_OSLOM_prog('hierarchy', data.is_directed()))

        cmd = "{} -f {} -{} -r {} -hr {} -seed {} -t {} -cp {}".format(
            config.get_OSLOM_prog('oslom', data.is_directed()), pajek,
            'w' if data.is_weighted() else 'uw', r, hr, seed, t, cp)
        if fast:
            cmd += " -fast"
        if singlet:
            cmd += " -singlet"
        if infomap:
            cmd += " -infomap {}".format(runs)
        if copra:
            cmd += " -copra {}".format(runs)
        if louvain:
            cmd += " -louvain {}".format(runs)
        self.logger.info("Running " + cmd)

        timecost, status = utils.timeit(
            lambda: utils.shell_run_and_wait(cmd, tmp_dir))
        if status != 0:
            raise Exception(
                "Run command with error status code {}".format(status))
        # OSLOM writes one tp file per hierarchy level: "tp" is level 0,
        # "tp1", "tp2", ... are higher levels.
        outputfiles = glob.glob(
            os.path.join(tmp_dir, "edges.txt_oslo_files", "tp*"))
        clusters = {}
        for tp in outputfiles:
            fname = tp.split("/")[-1]
            if fname == 'tp':
                level = 0
            else:
                level = int(fname[2:])
            with open(tp) as f:
                # Skip '#' module-header lines; each remaining line is one
                # community of space-separated node ids.
                lines = [u.strip() for u in f if not u.startswith('#')]
            lines = [[int(v) for v in u.split(" ")] for u in lines]
            clusters[level] = dict(enumerate(lines))
        max_level = max(list(clusters.keys()))
        self.logger.info(
            "Made %d levels of clusters with #clusters %s in %f seconds" %
            (len(clusters), str([len(u) for u in clusters.values()]),
             timecost))

        result = {}
        result['multilevel'] = True
        result['num_level'] = len(clusters)
        result['max_level'] = max_level
        result['runname'] = self.name
        result['params'] = params
        result['dataname'] = data.name
        result['meta'] = self.get_meta()
        result['timecost'] = timecost
        result['clusters'] = clusters
        save_result(result)
        self.result = result
    return self