def make_cnl_file(self, filepath=None):
    """Materialize a .cnl cluster file for this result.

    If a persistent cnl file already exists, link it to *filepath* (or to its
    default location when *filepath* is None). Otherwise write a fresh cnl
    file, in which case *filepath* is mandatory.

    :param filepath: destination path for the cnl file; required when no
        persistent cnl file exists.
    :return: the path of the linked or newly written cnl file.
    :raises ValueError: if no persistent cnl exists and *filepath* is None.
    """
    local_cnl = self.persistent_cnl()
    if local_cnl:
        # Reuse the already-persisted file; just link it into place.
        return utils.link_file(local_cnl, destname=filepath)
    # No persistent copy exists, so we must write one. Raise instead of
    # `assert` so the check survives `python -O`.
    if filepath is None:
        raise ValueError(
            "filepath is required when no persistent cnl file exists")
    return self.save_to_cnl_file(filepath)
def run(self, data, **kwargs):
    """Run the GANXiSw (SLPAw) community detection java program on *data*.

    Keyword arguments are forwarded to the program as ``-<key> <value>``
    options; ``None`` values are dropped. Defaults applied here:
    ``Sym=1`` for undirected graphs, overlap threshold ``r=0.1``, and the
    output directory is forced to ``output``.

    :param data: dataset object providing ``file_edges``/``to_edgelist``.
    :return: self, with ``self.result`` populated.
    :raises Exception: if the external command exits with a nonzero status.
    """
    # NOTE(review): the original directedness guard was disabled
    # (`if False and data.is_directed()`), so directed graphs are accepted
    # as-is; the dead branch has been dropped.
    params = {k: v for k, v in kwargs.items() if v is not None}
    if not data.is_directed():
        params['Sym'] = 1  # tell GANXiSw the edge list is symmetric
    params['d'] = "output"
    if "r" not in params:
        params['r'] = 0.1
    if not utils.file_exists(data.file_edges):
        data.to_edgelist()
    txt_params = " ".join(["-{} {}".format(k, v) for k, v in params.items()])
    cmd = "{} -jar {} -i {} {} ".format(
        utils.get_java_command(), config.GANXISW_PROG, "edges.txt", txt_params)
    with utils.TempDir() as tmp_dir:
        utils.remove_if_file_exit(os.path.join(tmp_dir, "output"), True)
        utils.create_dir_if_not_exists(os.path.join(tmp_dir, "output"))
        self.logger.info("Running " + cmd)
        utils.link_file(data.file_edges, tmp_dir, "edges.txt")
        timecost, status = utils.timeit(
            lambda: utils.shell_run_and_wait(cmd, tmp_dir))
        if status != 0:
            raise Exception(
                "Run command with error status code {}".format(status))
        # glob already yields the full path under tmp_dir; the original
        # re-joined it with tmp_dir, which only worked because os.path.join
        # discards its first argument when the second is absolute.
        outputfile = glob.glob(
            os.path.join(tmp_dir, "output/SLPAw*.icpm"))[0]
        clusters = []
        with open(outputfile, "r") as f:
            for line in f:
                # one community per line: whitespace-separated node ids
                clusters.append([int(u) for u in line.strip().split(" ")])
    clusters = dict(enumerate(clusters))
    self.logger.info("Made %d clusters in %f seconds" %
                     (len(clusters), timecost))
    result = {}
    result['runname'] = self.name
    result['params'] = params
    result['dataname'] = data.name
    result['meta'] = self.get_meta()
    result['timecost'] = timecost
    result['clusters'] = clusters
    save_result(result)
    self.result = result
    return self
def run(self, data, cutoff_r=0.01, inflation_in=2, NBDisimilarity_q=0.3, seed=None):
    """Run the LabelRank community detection program on *data*.

    :param data: dataset object providing ``file_edges``/``to_edgelist``.
    :param cutoff_r: label cutoff threshold passed to LabelRank.
    :param inflation_in: inflation parameter passed to LabelRank.
    :param NBDisimilarity_q: neighborhood dissimilarity parameter.
    :param seed: ignored (LabelRank is deterministic / takes no seed).
    :return: self, with ``self.result`` populated.
    :raises Exception: if the external command exits with a nonzero status.
    """
    # NOTE(review): the original directedness guard was disabled
    # (`if False and data.is_directed()`); the dead branch has been dropped.
    if seed is not None:
        self.logger.info("seed ignored")
    # Snapshot the call parameters for the result record. locals() must be
    # called before any new local is bound, so it captures only the
    # arguments; then self/data are removed.
    params = locals()
    del params['self']
    del params['data']
    if not utils.file_exists(data.file_edges):
        data.to_edgelist()
    cmd = "{} {} {} {} {}".format(config.LABLE_RANK_PROG, "edges.txt",
                                  cutoff_r, inflation_in, NBDisimilarity_q)
    with utils.TempDir() as tmp_dir:
        utils.remove_if_file_exit(os.path.join(tmp_dir, "output"), True)
        utils.create_dir_if_not_exists(os.path.join(tmp_dir, "output"))
        self.logger.info("Running " + cmd)
        utils.link_file(data.file_edges, tmp_dir, "edges.txt")
        timecost, status = utils.timeit(
            lambda: utils.shell_run_and_wait(cmd, tmp_dir))
        if status != 0:
            raise Exception(
                "Run command with error status code {}".format(status))
        # glob already yields the full path; the original re-joined it with
        # tmp_dir, which only worked because the path is absolute.
        outputfile = glob.glob(
            os.path.join(tmp_dir, "output/LabelRank*.icpm"))[0]
        clusters = []
        with open(outputfile, "r") as f:
            for line in f:
                # one community per line: whitespace-separated node ids
                clusters.append([int(u) for u in line.strip().split(" ")])
    clusters = dict(enumerate(clusters))
    self.logger.info("Made %d clusters in %f seconds" %
                     (len(clusters), timecost))
    result = {}
    result['runname'] = self.name
    result['params'] = params
    result['dataname'] = data.name
    result['meta'] = self.get_meta()
    result['timecost'] = timecost
    result['clusters'] = clusters
    save_result(result)
    self.result = result
    return self
def run(self, data, seed=None):
    """Run a dct program (selected by ``self.progname``) on *data* and
    collect the resulting clustering.

    :param data: dataset object providing ``file_edges``/``to_edgelist``.
    :param seed: RNG seed passed via the ``SEED`` environment variable;
        a random one is drawn when None.
    :return: self, with ``self.result`` populated.
    :raises Exception: if the external command exits with a nonzero status.
    """
    self.logger.warning(
        "dct assumes node starts with zero and is continuous")
    if seed is None:
        seed = np.random.randint(999999)
    params = {'seed': seed, 'prog': self.progname}
    # NOTE(review): this guard is deliberately disabled by `and False`,
    # so directed/weighted graphs are accepted — confirm this is intended.
    if (data.is_directed() or data.is_weighted()) and False:
        raise Exception(
            "only undirected and unweighted graph is supported")
    if not utils.file_exists(data.file_edges):
        data.to_edgelist()
    with utils.TempDir() as tmp_dir:
        #tmp_dir = "/tmp/abc"
        # Expose the edge list under the fixed name the program expects.
        pajek = utils.link_file(data.file_edges, dest_dir=tmp_dir,
                                destname='edges.txt')
        cmd = "{} {}".format(
            config.get_dct_prog(self.progname, data.is_directed()), pajek)
        self.logger.info("Running " + cmd)
        # The seed is communicated through the environment, not the CLI.
        timecost, status = utils.timeit(lambda: utils.shell_run_and_wait(
            cmd, tmp_dir, env={'SEED': str(seed)}))
        if status != 0:
            raise Exception(
                "Run command with error status code {}".format(status))
        # The program may write several partial cluster files; concatenate
        # them all. Each row is "<node> <cluster>".
        outputfiles = glob.glob(tmp_dir + "/output*.cluster")
        import pandas as pd
        df_from_each_file = (pd.read_csv(f, sep=" ", header=None)
                             for f in outputfiles)
        output = pd.concat(df_from_each_file, ignore_index=True)
        output.columns = ['node', 'cluster']
        clusters = output[['cluster', 'node']]
        # Map each cluster id to the list of its member nodes.
        clusters = clusters.groupby('cluster').apply(
            lambda u: list(u['node'])).to_dict()
    self.logger.info("Made %d clusters in %f seconds" %
                     (len(clusters), timecost))
    result = {}
    result['runname'] = self.name
    result['params'] = params
    result['dataname'] = data.name
    result['meta'] = self.get_meta()
    result['timecost'] = timecost
    result['clusters'] = clusters
    save_result(result)
    self.result = result
    return self
def run(self, data, seed=None):
    """Run the dct infomap binary on *data* and collect the clustering.

    :param data: dataset object providing ``file_edges``/``to_edgelist``.
    :param seed: seed passed via ``-s``; a random one is drawn when None.
    :return: self, with ``self.result`` populated.
    :raises Exception: if the external command exits with a nonzero status.
    """
    # NOTE(review): the warning mentions seq_louvain but the program fetched
    # below is 'infomap' — confirm which one this runner is meant to wrap.
    self.logger.warning(
        "dct::seq_louvain assumes node starts with zero and is continuous")
    if seed is None:
        seed = np.random.randint(999999)
    params = {'seed': seed}
    # NOTE(review): this guard is deliberately disabled by `and False`,
    # so directed/weighted graphs are accepted — confirm this is intended.
    if (data.is_directed() or data.is_weighted()) and False:
        raise Exception(
            "only undirected and unweighted graph is supported")
    if not utils.file_exists(data.file_edges):
        data.to_edgelist()
    with utils.TempDir() as tmp_dir:
        # Expose the edge list under the fixed name the program expects.
        pajek = utils.link_file(data.file_edges, dest_dir=tmp_dir,
                                destname='edges.txt')
        cmd = "{} -f -s {} -o output {}".format(
            config.get_dct_prog('infomap', data.is_directed()), seed, pajek)
        self.logger.info("Running " + cmd)
        timecost, status = utils.timeit(
            lambda: utils.shell_run_and_wait(cmd, tmp_dir))
        if status != 0:
            raise Exception(
                "Run command with error status code {}".format(status))
        outputfile = tmp_dir + "/output"
        import pandas as pd
        # Output has one cluster id per line; line i is node i's cluster.
        output = pd.read_csv(outputfile, sep=" ", header=None)
        output.columns = ['cluster']
        output['node'] = range(len(output))
        clusters = output[['cluster', 'node']]
        # Map each cluster id to the list of its member nodes.
        clusters = clusters.groupby('cluster').apply(
            lambda u: list(u['node'])).to_dict()
    self.logger.info("Made %d clusters in %f seconds" %
                     (len(clusters), timecost))
    result = {}
    result['runname'] = self.name
    result['params'] = params
    result['dataname'] = data.name
    result['meta'] = self.get_meta()
    result['timecost'] = timecost
    result['clusters'] = clusters
    save_result(result)
    self.result = result
    return self
def run(self, data, f=False, m=None, seed=None):
    """Run the hirecs overlapping community detection binary on *data*.

    :param data: dataset object providing ``file_hig``/``to_higformat``.
    :param f: if True, pass the ``-f`` flag to hirecs.
    :param m: if not None, pass ``-m<m>`` to hirecs.
    :param seed: ignored (hirecs takes no seed).
    :return: self, with ``self.result`` populated (overlapping clusters).
    :raises Exception: if the external command exits with a nonzero status.
    """
    # NOTE(review): the original directedness guard was disabled
    # (`if False and data.is_directed()`); the dead branch has been dropped.
    if seed is not None:
        self.logger.info("seed ignored")
    params = {}
    params['f'] = f
    params['m'] = m
    if not utils.file_exists(data.file_hig):
        data.to_higformat()
    cmd = ["./hirecs"]
    cmd.append("-oje")
    if f:
        cmd.append('-f')
    if m is not None:
        cmd.append("-m{}".format(m))
    cmd.append(data.file_hig)
    cmd.append("> output")
    cmd = " ".join(cmd)
    with utils.TempDir() as tmp_dir:
        # Write the command to a script so the shell redirection works.
        # The handle is named `fh` — the original `as f` shadowed the
        # boolean parameter `f`.
        with open(os.path.join(tmp_dir, "tmpcmd"), 'wt') as fh:
            fh.write(cmd)
        self.logger.info("Running " + cmd)
        cmd = "bash tmpcmd"
        # hirecs is run from the temp dir, so link the binary, its shared
        # library and the input graph alongside the script.
        utils.link_file(os.path.join(config.HIRECS_PATH, 'hirecs'), tmp_dir)
        utils.link_file(
            os.path.join(config.HIRECS_PATH, 'libhirecs.so'), tmp_dir)
        utils.link_file(data.file_hig, tmp_dir)
        timecost, status = utils.timeit(
            lambda: utils.shell_run_and_wait(cmd, tmp_dir))
        if status != 0:
            raise Exception(
                "Run command with error status code {}".format(status))
        # -oje makes hirecs emit JSON: {'mod': ..., 'communities': {...}}.
        with open(os.path.join(tmp_dir, "output"), "r") as fh:
            output = json.load(fh)
    mod = output['mod']
    communities = output['communities']
    clusters = {}
    for c in communities:
        clusters[int(c)] = [int(u) for u in communities[c].keys()]
    self.logger.info("Made %d clusters in %f seconds with modularity %f" %
                     (len(clusters), timecost, mod))
    result = {}
    result['runname'] = self.name
    result['params'] = params
    result['overlap'] = True
    result['dataname'] = data.name
    result['meta'] = self.get_meta()
    result['timecost'] = timecost
    result['clusters'] = clusters
    save_result(result)
    self.result = result
    return self
def run(self, data, **kwargs):
    """Run the lso-cluster program on *data*.

    Keyword arguments become ``--<key> <value>`` options; ``None`` values
    are dropped. The ``loss`` parameter defaults to ``modularity``; the
    parameterized losses ``pmod`` and ``mom`` additionally require a value
    supplied under the loss's own name (e.g. ``pmod=0.5``).

    :param data: dataset object providing ``file_edges``/``to_edgelist``.
    :return: self, with ``self.result`` populated.
    :raises Exception: if the loss value is missing or the external command
        exits with a nonzero status.
    """
    # NOTE(review): this guard is deliberately disabled by `and False`,
    # so directed/weighted graphs are accepted — confirm this is intended.
    if False and (data.is_directed() or data.is_weighted()):
        raise Exception(
            "only undirected and unweighted graph is supported")
    params = dict(kwargs)
    params = {u: v for u, v in params.items() if v is not None}
    if "loss" not in params:
        params['loss'] = 'modularity'
    # Work on a copy: `params` is stored verbatim in the result record,
    # while `argparams` is consumed to build the command line.
    argparams = dict(params)
    if argparams['loss'] in ['pmod', 'mom']:
        # Parameterized losses need their value under the loss's own key,
        # e.g. loss='pmod' requires pmod=<val>.
        if argparams['loss'] not in argparams:
            raise Exception(
                "You have to specify pmod=<val> or mom=<val> for the loss function"
            )
        # Render as e.g. "--loss {pmod,0.5}" and remove both entries so
        # they are not emitted again as generic --key value options.
        loss_args = "--loss {{{},{}}}".format(argparams['loss'],
                                              argparams[argparams['loss']])
        del argparams[argparams['loss']]
        del argparams['loss']
    else:
        loss_args = "--loss {}".format(argparams["loss"])
        del argparams['loss']
    if not utils.file_exists(data.file_edges):
        data.to_edgelist()
    with utils.TempDir() as tmp_dir:
        # Expose the edge list under the fixed name the program expects.
        pajek = utils.link_file(data.file_edges, dest_dir=tmp_dir,
                                destname='edges.txt')
        cmdargs = ["--{} {}".format(u, v) for u, v in argparams.items()]
        cmdargs.append(loss_args)
        cmdargs.append("-o lsooutput")
        cmdargs = " ".join(cmdargs)
        cmd = "{} {} {}".format(config.LSO_CLUSTER_PROG, pajek, cmdargs)
        # Run via a script so the full command line survives the shell.
        with open(os.path.join(tmp_dir, "tmpcmd"), 'wt') as f:
            f.write(cmd)
        self.logger.info("Running " + cmd)
        timecost, status = utils.timeit(
            lambda: utils.shell_run_and_wait("bash -x ./tmpcmd", tmp_dir))
        if status != 0:
            raise Exception(
                "Run command with error status code {}".format(status))
        outputfiles = tmp_dir + "/lsooutput"
        import pandas as pd
        # Each row is "<node>\t<cluster>".
        output = pd.read_csv(outputfiles, sep="\t", header=None)
        output.columns = ['node', 'cluster']
        clusters = output
        # Map each cluster id to the list of its member nodes.
        clusters = clusters.groupby('cluster').apply(
            lambda u: list(u['node'])).to_dict()
    self.logger.info("Made %d clusters in %f seconds" %
                     (len(clusters), timecost))
    result = {}
    result['runname'] = self.name
    result['params'] = params
    result['dataname'] = data.name
    result['meta'] = self.get_meta()
    result['timecost'] = timecost
    result['clusters'] = clusters
    save_result(result)
    self.result = result
    return self
def run(self, data, seed=None, thread=None, threshold=1e-3, vThresh=0.0,
        maxIter=10, prior=True):
    """Run the RelaxMap program on *data* and collect the clustering.

    :param data: dataset object providing ``file_edges``/``to_edgelist``.
    :param seed: RNG seed; a random one is drawn when None.
    :param thread: worker thread count; resolved via ``utils.get_num_thread``.
    :param threshold: convergence threshold passed to RelaxMap.
    :param vThresh: vertex threshold passed to RelaxMap.
    :param maxIter: maximum number of iterations.
    :param prior: selects the 'prior' run mode when True, 'normal' otherwise.
    :return: self, with ``self.result`` populated.
    :raises Exception: if the external command exits with a nonzero status.
    """
    # NOTE(review): the original directed/weighted guard was disabled
    # (`... and False`); the dead branch has been dropped.
    if seed is None:
        seed = np.random.randint(999999)
    # Snapshot the call parameters for the result record. locals() must be
    # called before any new local is bound, so it captures only the
    # arguments; then self/data are removed.
    params = locals()
    del params['self']
    del params['data']
    params = {u: v for u, v in params.items() if v is not None}
    thread = utils.get_num_thread(thread)
    params['thread'] = thread
    if not utils.file_exists(data.file_edges):
        data.to_edgelist()
    with utils.TempDir() as tmp_dir:
        pajek = utils.link_file(data.file_edges, tmp_dir, 'edges.txt')
        cmd = "{prog} {seed} {graph} {thread} 1 {threshold} {vThresh} {maxIter} {output_dir} {prior}"\
            .format(prog=config.get_powergraph_prog('RelaxMap', data.is_directed()),
                    seed=seed, graph=pajek, thread=thread,
                    threshold=threshold, vThresh=vThresh,
                    output_dir=tmp_dir, maxIter=maxIter,
                    prior='prior' if prior else 'normal').strip()
        self.logger.info("Running " + cmd)
        timecost, status = utils.timeit(
            lambda: utils.shell_run_and_wait(cmd, tmp_dir))
        if status != 0:
            raise Exception(
                "Run command with error status code {}".format(status))
        outputfile = os.path.join(tmp_dir, "edges.tree")
        import pandas as pd
        # .tree format: first line is a header comment; column 0 is a
        # hierarchical path like "1:2:3", column 2 is the node id.
        output = pd.read_csv(outputfile, sep=" ", header=None, skiprows=1)
        # Use the builtin int: np.int was deprecated in NumPy 1.20 and
        # removed in 1.24 (it was only ever an alias of builtin int).
        output['node'] = output.loc[:, 2].astype(int)
        output['cluster'] = output.loc[:, 0].map(
            lambda u: u.split(':')[0]).astype(int)
        clusters = output[['cluster', 'node']]
        # Map each cluster id to the list of its member nodes.
        clusters = clusters.groupby('cluster').apply(
            lambda u: list(u['node'])).to_dict()
    self.logger.info("Made %d clusters in %f seconds" %
                     (len(clusters), timecost))
    result = {}
    result['runname'] = self.name
    result['params'] = params
    result['dataname'] = data.name
    result['meta'] = self.get_meta()
    result['timecost'] = timecost
    result['clusters'] = clusters
    save_result(result)
    self.result = result
    return self