예제 #1
0
def to_snap(data):
    """
    Build a SNAP graph from a dataset.

    When the file ``data.file_snap`` exists the graph is loaded from it.
    Otherwise the graph is read from the edge list file, which is written
    first if it is missing.

    :param data: :py:class:`gct.Dataset`
    :rtype: SNAP graph
    """
    import snap

    snap_path = data.file_snap
    if utils.file_exists(snap_path):
        stream = snap.TFIn(snap_path)
        graph_cls = snap.TNGraph if data.is_directed() else snap.TUNGraph
        return graph_cls.Load(stream)

    # Disabled guard: ``False and ...`` short-circuits, so this never raises.
    if False and data.is_weighted():
        raise Exception("weighted graph is not supported well on snap")

    edge_path = data.file_edges
    if not utils.file_exists(edge_path):
        data.to_edgelist()

    # Columns 0 and 1 of the edge list are used as source and destination.
    pointer_type = snap.PNGraph if data.is_directed() else snap.PUNGraph
    return snap.LoadEdgeList(pointer_type, edge_path, 0, 1)
 def has_downloaded(self, fname=None):
     """
     Tell whether downloaded files are present locally.

     :param fname: a single remote file name to check; when omitted, every
         name returned by ``self.get_remote_names()`` is checked.
     :return: True when the checked file(s) exist, False otherwise
     """
     if fname is not None:
         return utils.file_exists(
             config.get_download_file_path(self.name, fname))

     for remote_name in self.get_remote_names():
         local_path = config.get_download_file_path(self.name, remote_name)
         if not utils.file_exists(local_path):
             return False
     return True
 def to_pajek(self, filepath=None):
     """
     Write the graph in Pajek format and return the path of the file.

     :param filepath: destination file. When omitted, ``self.file_pajek`` is
         used and an already existing file is returned without being rebuilt.
     :return: path of the Pajek file
     """
     if filepath is None:
         filepath = self.file_pajek
         if utils.file_exists(filepath):
             return filepath
     if not utils.file_exists(self.file_edges):
         self.to_edgelist()
     from gct.dataset import edgelist2pajek
     # Write to the resolved destination so that an explicitly requested
     # ``filepath`` is actually produced (it equals self.file_pajek by default).
     # NOTE(review): assumes the second argument of edgelist_to_pajek is the
     # output path - confirm against gct.dataset.edgelist2pajek.
     edgelist2pajek.edgelist_to_pajek(self.file_edges, filepath,
                                      self.is_directed(),
                                      self.is_weighted())
     return filepath
    def to_higformat(self, filepath=None):
        """
        Convert the Pajek file with the external ``HIG_CONVERT_PROG`` script.

        :param filepath: path to return. When omitted, ``self.file_hig`` is used
            and returned immediately if that file already exists.
        :return: ``filepath``
        :raises Exception: when the converter exits with a non-zero status

        NOTE(review): the converter only receives ``self.file_pajek``; where it
        writes its result is not visible here, so an explicit ``filepath`` is
        returned but never handed to the converter - confirm.
        """
        if filepath is None:
            filepath = self.file_hig
            if utils.file_exists(filepath):
                return filepath

        if not utils.file_exists(self.file_pajek):
            self.to_pajek()

        command = "python {} {}".format(config.HIG_CONVERT_PROG,
                                        self.file_pajek)
        self.logger.info("running " + command)
        exit_code = utils.shell_run_and_wait(command)
        if exit_code == 0:
            return filepath
        raise Exception("run command failed: " + str(exit_code))
    def to_anyscan(self, filepath=None):
        """
        Write the graph as an adjacency-list text file for anyscan.

        The first line holds ``max_node + 1``. Each following line describes one
        source node: the number of distinct neighbours followed by the sorted
        neighbour ids. Every edge is mirrored and every id in ``0..max_node``
        gets a self loop, so ids absent from the edge list still get a line.

        :param filepath: destination file. When omitted, ``self.file_anyscan``
            is used and an already existing file is returned without rebuild.
        :return: path of the written file
        """
        # Disabled guard: ``False and ...`` short-circuits, so this never raises.
        if False and self.is_directed():
            raise Exception("directed graph not supported")
        if filepath is None:
            filepath = self.file_anyscan
            if utils.file_exists(filepath):
                return filepath

        forward = self.get_edges()[['src', 'dest']]
        min_node = min(forward['src'].min(), forward['dest'].min())
        max_node = max(forward['src'].max(), forward['dest'].max())
        self.logger.info("min node: {}, max node: {}".format(
            min_node, max_node))
        if min_node != 0:
            # ``warning`` replaces the deprecated ``warn`` alias.
            self.logger.warning(
                "node id is greater than 0, fake node will be added")

        node_ids = range(max_node + 1)
        loops = pd.DataFrame({'src': node_ids, 'dest': node_ids})
        # Swapping the column labels yields the reversed copy of every edge.
        backward = forward.rename(columns={'src': 'dest', 'dest': 'src'})
        edges = pd.concat([forward, backward, loops], axis=0)
        del forward, backward, loops

        def adjacency_line(dests):
            neighbours = sorted(set(dests))
            return str(len(neighbours)) + " " + " ".join(
                str(u) for u in neighbours)

        # Group only the ``dest`` column; groups come out ordered by ``src``.
        lines = edges.groupby('src')['dest'].apply(adjacency_line)
        with open(filepath, 'wt') as f:
            f.write(str(max_node + 1) + "\n")
            f.writelines(line + "\n" for line in lines)
        return filepath
def load_local(name):
    '''
    Load a dataset stored locally, based on its ``meta.info`` JSON file.

    :param name: name of a dataset
    :return: a ``Dataset`` built from the stored metadata
    :raises Exception: when the dataset directory does not exist
    '''
    data_dir = config.get_data_file_path(name, create=False)
    if not utils.file_exists(data_dir):
        raise Exception("path not exists: " + data_dir)

    meta_file = os.path.join(data_dir, 'meta.info')
    with open(meta_file) as handle:
        meta = json.load(handle)

    edges_obj = meta['parq_edges']
    if meta["has_ground_truth"]:
        ground_truth = {}
        for key, value in meta['parq_ground_truth'].items():
            ground_truth[key] = Clustering(value)
    else:
        ground_truth = None
    # "additional" is optional in the metadata; absent means None.
    extra_meta = meta.get('additional')
    mirrored = meta['is_edge_mirrored']

    return Dataset(name=meta['name'],
                   description=meta['description'],
                   groundtruthObj=ground_truth,
                   edgesObj=edges_obj,
                   directed=meta['directed'],
                   weighted=meta['weighted'],
                   overide=False,
                   additional_meta=extra_meta,
                   is_edge_mirrored=mirrored)
 def to_scanbin(self, filepath=None):
     """
     Produce the ``b_degree.bin`` / ``b_adj.bin`` pair with the external
     ``SCAN_CONVERT_PROG`` and return the directory given as ``filepath``.

     :param filepath: output directory. When omitted, ``self.file_scanbin`` is
         used: it is returned right away if both files already exist there,
         and created otherwise.
     :return: the output directory
     :raises Exception: when the converter exits with a non-zero status
     """
     if filepath is None:
         filepath = self.file_scanbin
         already_built = all(
             utils.file_exists(os.path.join(filepath, part))
             for part in ("b_degree.bin", "b_adj.bin"))
         if already_built:
             return filepath
         utils.create_dir_if_not_exists(filepath)

     if not utils.file_exists(self.file_edges):
         self.to_edgelist()

     command = "{} {} {} {}".format(config.SCAN_CONVERT_PROG, self.file_edges,
                                    "b_degree.bin", "b_adj.bin")
     self.logger.info("running " + command)
     # NOTE(review): ``filepath`` is presumably the working directory of the
     # command, so the relative output names land inside it - confirm.
     exit_code = utils.shell_run_and_wait(command, filepath)
     if exit_code == 0:
         return filepath
     raise Exception("run command failed: " + str(exit_code))
 def to_mcl_mci(self, filepath=None):
     """
     Convert the edge list with ``MCL_CONVERT_PROG`` into ``filepath``.

     Undirected graphs are converted with the extra ``--stream-mirror`` flag.

     :param filepath: destination file. When omitted, ``self.file_mcl_mci`` is
         used and an already existing file is returned without rebuild.
     :return: path of the converted file
     :raises Exception: when the converter exits with a non-zero status
     """
     if filepath is None:
         filepath = self.file_mcl_mci
         if utils.file_exists(filepath):
             return filepath

     if not utils.file_exists(self.file_edges):
         self.to_edgelist()

     mirror_flag = "" if self.is_directed() else "--stream-mirror "
     command = "{} {}-123 {} -o {} --write-binary".format(
         config.MCL_CONVERT_PROG, mirror_flag, self.file_edges, filepath)
     self.logger.info("running " + command)
     exit_code = utils.shell_run_and_wait(command)
     if exit_code == 0:
         return filepath
     raise Exception("run command failed: " + str(exit_code))
 def to_unweighted_fromat(self, filepath=None):
     """
     Write only the ``src`` and ``dest`` columns as a space separated file
     without header or index.

     :param filepath: destination file. When omitted,
         ``self.file_unweighted_edges`` is used and an already existing file
         is returned without being rewritten.
     :return: path of the written file
     """
     if filepath is None:
         filepath = self.file_unweighted_edges
         if utils.file_exists(filepath):
             return filepath

     endpoints = self.get_edges()
     endpoints = endpoints[['src', 'dest']]
     endpoints.to_csv(filepath, sep=" ", index=None, header=None)
     return filepath
 def persistent_cnl(self):
     """
     Make sure ``self.path + ".cnl"`` exists and return that path.

     :return: the ``.cnl`` path, or None when ``self.path`` is None
     """
     if self.path is None:
         return None

     cnl_path = self.path + ".cnl"
     if utils.file_exists(cnl_path):
         return cnl_path

     self.logger.info("persistent cluster to cnl file: " + cnl_path)
     self.save_to_cnl_file(cnl_path)
     return cnl_path
def local_exists(name):
    '''
    Tell whether the data path of dataset ``name`` exists locally.

    :param name: name of a dataset
    :rtype: bool
    '''
    return utils.file_exists(config.get_data_file_path(name))
예제 #12
0
    def run(self, data, seed=None):
        """
        Run the OSLOM ``infomap`` program on ``data`` and store the clustering.

        The Pajek file of the dataset is linked into a temporary directory as
        ``pajek.txt``, the program is executed there and the resulting
        ``pajek.tree`` is parsed into a ``{cluster id: [node, ...]}`` mapping.

        :param data: dataset to cluster; its Pajek file is created when missing
        :param seed: seed passed on the command line; drawn at random when None
        :return: ``self``; the saved result is also kept in ``self.result``
        :raises Exception: when the program exits with a non-zero status
        """
        if seed is None:
            seed = np.random.randint(999999)
        params = {'seed': seed}
        # The trailing ``and False`` keeps this guard from ever raising.
        if (data.is_directed() or data.is_weighted()) and False:
            raise Exception(
                "only undirected and unweighted graph is supported")
        if not utils.file_exists(data.file_pajek):
            data.to_pajek()

        with utils.TempDir() as tmp_dir:
            pajek = os.path.join(tmp_dir, 'pajek.txt')
            utils.remove_if_file_exit(pajek)
            os.symlink(data.file_pajek, pajek)

            cmd = "{} {} {} {}".format(
                config.get_OSLOM_prog('infomap', data.is_directed()), seed,
                pajek, 1)

            self.logger.info("Running " + cmd)

            timecost, status = utils.timeit(
                lambda: utils.shell_run_and_wait(cmd, tmp_dir))
            if status != 0:
                raise Exception(
                    "Run command with error status code {}".format(status))

            outputfile = glob.glob(tmp_dir + "/pajek.tree")[0]
            import pandas as pd
            output = pd.read_csv(outputfile, sep=" ", skiprows=1, header=None)
            # Column 0 is split on ':' and its first part is the cluster id.
            output['cluster'] = output.loc[:, 0].map(
                lambda u: int(u.split(':')[0]))
            # ``np.int`` was only an alias of the builtin and no longer exists
            # in current NumPy; the builtin ``int`` is the same dtype request.
            output['node'] = output.loc[:, 2].astype(int)

        clusters = output[['cluster', 'node']]
        clusters = clusters.groupby('cluster').apply(
            lambda u: list(u['node'])).to_dict()
        self.logger.info("Made %d clusters in %f seconds" %
                         (len(clusters), timecost))

        result = {}
        result['runname'] = self.name
        result['params'] = params
        result['dataname'] = data.name
        result['meta'] = self.get_meta()
        result['timecost'] = timecost
        result['clusters'] = clusters
        save_result(result)
        self.result = result
        return self
예제 #13
0
    def run(self, data, f=False, m=None, seed=None):
        """
        Run ``./hirecs -oje`` on the hig file of ``data`` and store the result.

        The command line is written to a ``tmpcmd`` script inside a temporary
        directory and executed with ``bash``; its standard output is redirected
        to ``output`` and parsed as JSON (keys ``mod`` and ``communities``).

        :param data: dataset to cluster; its hig file is created when missing
        :param f: when true, ``-f`` is added to the command line
        :param m: when not None, ``-m<value>`` is added to the command line
        :param seed: unused; only a log message is emitted when it is given
        :return: ``self``; the saved result is also kept in ``self.result``
        :raises Exception: when the script exits with a non-zero status
        """
        # Disabled guard: ``False and ...`` short-circuits, so this never raises.
        if False and (data.is_directed()):
            raise Exception("only undirected is supported")
        if seed is not None:
            self.logger.info("seed ignored")

        params = {'f': f, 'm': m}

        if not utils.file_exists(data.file_hig):
            data.to_higformat()

        argv = ["./hirecs", "-oje"]
        if f:
            argv.append('-f')
        if m is not None:
            argv.append("-m{}".format(m))
        argv.extend([data.file_hig, "> output"])
        shell_line = " ".join(argv)

        with utils.TempDir() as tmp_dir:
            with open(os.path.join(tmp_dir, "tmpcmd"), 'wt') as script:
                script.write(shell_line)
            self.logger.info("Running " + shell_line)

            for binary in ('hirecs', 'libhirecs.so'):
                utils.link_file(os.path.join(config.HIRECS_PATH, binary),
                                tmp_dir)
            utils.link_file(data.file_hig, tmp_dir)

            timecost, status = utils.timeit(
                lambda: utils.shell_run_and_wait("bash tmpcmd", tmp_dir))
            if status != 0:
                raise Exception(
                    "Run command with error status code {}".format(status))

            with open(os.path.join(tmp_dir, "output"), "r") as report:
                output = json.load(report)

            mod = output['mod']
            communities = output['communities']
            clusters = {
                int(c): [int(u) for u in communities[c].keys()]
                for c in communities
            }

        self.logger.info("Made %d clusters in %f seconds with modularity %f"
                         % (len(clusters), timecost, mod))

        result = {
            'runname': self.name,
            'params': params,
            'overlap': True,
            'dataname': data.name,
            'meta': self.get_meta(),
            'timecost': timecost,
            'clusters': clusters,
        }

        save_result(result)
        self.result = result
        return self
예제 #14
0
    def run(self,
            data,
            seed=None,
            lamb=1,
            trials=5,
            temp_step=0.999,
            initial_temp=1e-6):
        """
        Run the OSLOM ``modopt`` program on ``data`` and store the clustering.

        The edge list is linked into a temporary directory as ``edges.txt`` and
        the program output ``edges.txtpart`` is read back: each line is one
        cluster given as tab separated node ids.

        :param data: dataset to cluster; its edge list is created when missing
        :param seed: seed passed on the command line; drawn at random when None
        :param lamb: passed verbatim on the command line
        :param trials: passed verbatim on the command line
        :param temp_step: passed verbatim on the command line
        :param initial_temp: passed verbatim on the command line
        :return: ``self``; the saved result is also kept in ``self.result``
        :raises Exception: when the program exits with a non-zero status
        """
        if seed is None:
            seed = np.random.randint(999999)
        # Spell the recorded parameters out instead of aliasing ``locals()``:
        # that dict belongs to the frame and can regain ``self``/``data`` and
        # other locals whenever the interpreter re-syncs it (e.g. a tracer).
        params = {
            'seed': seed,
            'lamb': lamb,
            'trials': trials,
            'temp_step': temp_step,
            'initial_temp': initial_temp,
        }
        # The trailing ``and False`` keeps this guard from ever raising.
        if (data.is_directed() or data.is_weighted()) and False:
            raise Exception(
                "only undirected and unweighted graph is supported")
        if not utils.file_exists(data.file_edges):
            data.to_edgelist()

        with utils.TempDir() as tmp_dir:
            pajek = os.path.join(tmp_dir, 'edges.txt')
            utils.remove_if_file_exit(pajek)
            os.symlink(data.file_edges, pajek)
            cmd = "{} {} {} {} {} {} {}".format(
                config.get_OSLOM_prog('modopt', data.is_directed()), pajek,
                seed, lamb, trials, temp_step, initial_temp)

            self.logger.info("Running " + cmd)

            timecost, status = utils.timeit(
                lambda: utils.shell_run_and_wait(cmd, tmp_dir))
            if status != 0:
                raise Exception(
                    "Run command with error status code {}".format(status))

            # The result is read from the input path with "part" appended.
            outputfile = pajek + "part"
            with open(outputfile) as f:
                lines = [u.strip() for u in f]
            lines = [[int(v) for v in u.split("\t")] for u in lines]

        clusters = dict(enumerate(lines))
        self.logger.info("Made %d clusters in %f seconds" %
                         (len(clusters), timecost))

        result = {}
        result['runname'] = self.name
        result['params'] = params
        result['dataname'] = data.name
        result['meta'] = self.get_meta()
        result['timecost'] = timecost
        result['clusters'] = clusters
        save_result(result)
        self.result = result
        return self
    def to_topgc_fromat(self, filepath=None):
        """
        Pipe the sorted edge list through the ``mkidx`` program into
        ``filepath``.

        The shell pipeline is written to a ``tmpcmd`` script in a temporary
        directory and run with ``bash -x``; the pipeline removes ``filepath``
        again when it fails.

        :param filepath: destination file. When omitted, ``self.file_topgc`` is
            used and an already existing file is returned without rebuild.
        :return: path of the written file
        :raises Exception: when the script exits with a non-zero status
        """
        if filepath is None:
            filepath = self.file_topgc
            if utils.file_exists(filepath):
                return filepath

        if not utils.file_exists(self.file_edges):
            self.to_edgelist()

        with TempDir() as tmp_dir:
            pipeline = "cat {}|sort -k1,1 -k2,2 -n | {} >  {} || rm {}".format(
                self.file_edges, config.get_cdc_prog('mkidx'), filepath,
                filepath)
            self.logger.info("Running " + pipeline)
            script_path = os.path.join(tmp_dir, "tmpcmd")
            with open(script_path, 'wt') as script:
                script.write(pipeline)

            _, status = utils.timeit(
                lambda: utils.shell_run_and_wait("bash -x tmpcmd", tmp_dir))
            if status != 0:
                raise Exception(
                    "Run command with error status code {}".format(status))

        assert utils.file_exists(filepath)
        return filepath
    def run(self, data, seed=None):
        """
        Run the dct program named ``self.progname`` on ``data`` and store the
        clustering.

        The edge list is linked into a temporary directory as ``edges.txt``;
        every ``output*.cluster`` file produced there is read as space
        separated ``node cluster`` rows.

        :param data: dataset to cluster; its edge list is created when missing
        :param seed: exported to the program as the ``SEED`` environment
            variable; drawn at random when None
        :return: ``self``; the saved result is also kept in ``self.result``
        :raises Exception: when the program exits with a non-zero status
        """
        self.logger.warning(
            "dct assumes node starts with zero and is continuous")
        if seed is None:
            seed = np.random.randint(999999)
        params = {'seed': seed, 'prog': self.progname}
        # The trailing ``and False`` keeps this guard from ever raising.
        if (data.is_directed() or data.is_weighted()) and False:
            raise Exception(
                "only undirected and unweighted graph is supported")
        if not utils.file_exists(data.file_edges):
            data.to_edgelist()

        with utils.TempDir() as tmp_dir:
            edge_link = utils.link_file(data.file_edges,
                                        dest_dir=tmp_dir,
                                        destname='edges.txt')
            program = config.get_dct_prog(self.progname, data.is_directed())
            cmd = "{} {}".format(program, edge_link)

            self.logger.info("Running " + cmd)

            def execute():
                return utils.shell_run_and_wait(cmd,
                                                tmp_dir,
                                                env={'SEED': str(seed)})

            timecost, status = utils.timeit(execute)
            if status != 0:
                raise Exception(
                    "Run command with error status code {}".format(status))

            part_files = glob.glob(tmp_dir + "/output*.cluster")
            import pandas as pd
            frames = [
                pd.read_csv(part, sep=" ", header=None) for part in part_files
            ]
            output = pd.concat(frames, ignore_index=True)
            output.columns = ['node', 'cluster']

        by_cluster = output[['cluster', 'node']].groupby('cluster')
        clusters = by_cluster.apply(lambda u: list(u['node'])).to_dict()
        self.logger.info("Made %d clusters in %f seconds" %
                         (len(clusters), timecost))

        result = {
            'runname': self.name,
            'params': params,
            'dataname': data.name,
            'meta': self.get_meta(),
            'timecost': timecost,
            'clusters': clusters,
        }
        save_result(result)
        self.result = result
        return self
 def to_mirror_edges_format(self, filepath=None):
     """
     Write every edge together with its reversed copy, without duplicates,
     sorted by ``src`` then ``dest``, as a space separated file without header
     or index.

     :param filepath: destination file. When omitted,
         ``self.file_mirror_edges`` is used and an already existing file is
         returned without being rewritten.
     :return: path of the written file
     """
     if filepath is None:
         filepath = self.file_mirror_edges
         if utils.file_exists(filepath):
             return filepath

     original = self.get_edges()
     # Same rows with the two endpoint columns exchanged; other columns and
     # the column order stay as they are.
     reversed_edges = original.assign(src=original['dest'],
                                      dest=original['src'])
     combined = pd.concat([original, reversed_edges])
     combined = combined.drop_duplicates()
     combined = combined.sort_values(['src', 'dest'])
     combined.to_csv(filepath, sep=" ", index=None, header=None)
     return filepath
예제 #18
0
def to_networkit(data):
    """
    Read the edge list of the dataset into a
    `networkit <https://networkit.github.io/>`_ graph.

    The edge list file is written first when it does not exist yet.

    :param data: :py:class:`gct.Dataset`
    :rtype: networkit graph
    """
    import networkit

    edge_path = data.file_edges
    if not utils.file_exists(edge_path):
        data.to_edgelist()

    edge_format = networkit.Format.EdgeListSpaceZero
    return networkit.readGraph(edge_path,
                               fileformat=edge_format,
                               directed=data.is_directed())
    def run(self, data, seed=None):
        """
        Run the dct ``infomap`` program on ``data`` and store the clustering.

        The edge list is linked into a temporary directory as ``edges.txt``.
        The ``output`` file is read as one cluster id per row, and the row
        number is used as the node id.

        :param data: dataset to cluster; its edge list is created when missing
        :param seed: passed with ``-s``; drawn at random when None
        :return: ``self``; the saved result is also kept in ``self.result``
        :raises Exception: when the program exits with a non-zero status
        """
        self.logger.warning(
            "dct::seq_louvain assumes node starts with zero and is continuous")
        if seed is None:
            seed = np.random.randint(999999)
        params = {'seed': seed}
        # The trailing ``and False`` keeps this guard from ever raising.
        if (data.is_directed() or data.is_weighted()) and False:
            raise Exception(
                "only undirected and unweighted graph is supported")
        if not utils.file_exists(data.file_edges):
            data.to_edgelist()

        with utils.TempDir() as tmp_dir:
            edge_link = utils.link_file(data.file_edges,
                                        dest_dir=tmp_dir,
                                        destname='edges.txt')
            program = config.get_dct_prog('infomap', data.is_directed())
            cmd = "{} -f -s {} -o output {}".format(program, seed, edge_link)

            self.logger.info("Running " + cmd)

            def execute():
                return utils.shell_run_and_wait(cmd, tmp_dir)

            timecost, status = utils.timeit(execute)
            if status != 0:
                raise Exception(
                    "Run command with error status code {}".format(status))

            result_path = tmp_dir + "/output"
            import pandas as pd
            output = pd.read_csv(result_path, sep=" ", header=None)
            output.columns = ['cluster']
            output['node'] = range(len(output))

        by_cluster = output[['cluster', 'node']].groupby('cluster')
        clusters = by_cluster.apply(lambda u: list(u['node'])).to_dict()
        self.logger.info("Made %d clusters in %f seconds" %
                         (len(clusters), timecost))

        result = {
            'runname': self.name,
            'params': params,
            'dataname': data.name,
            'meta': self.get_meta(),
            'timecost': timecost,
            'clusters': clusters,
        }
        save_result(result)
        self.result = result
        return self
    def run(self, data, algorithm=4, minpts=4, epsilon=0.5, alpha=32768, beta=32768, thread=-1, seed=None):
        """
        Run ``ANYSCAN_PROG`` on the anyscan file of ``data`` and store the
        clustering.

        The program writes ``output`` in a temporary directory: the first line
        is the node count and the second line holds one cluster id per node,
        separated by spaces.

        :param data: dataset to cluster; its anyscan file is created if missing
        :param algorithm: passed with ``-c``
        :param minpts: passed with ``-m``
        :param epsilon: passed with ``-e``
        :param alpha: passed with ``-a``
        :param beta: passed with ``-b``
        :param thread: resolved through ``utils.get_num_thread`` and passed
            with ``-t``
        :param seed: unused; only a log message is emitted when it is given
        :return: ``self``; the saved result is also kept in ``self.result``
        :raises Exception: when the exit status is not 1, or when the node
            count does not match the number of cluster ids
        """
        if seed is not None:
            self.logger.info("seed ignored")
        thread = utils.get_num_thread(thread)
        # Spell the recorded parameters out instead of aliasing ``locals()``:
        # that dict belongs to the frame and can regain ``self``/``data`` and
        # other locals whenever the interpreter re-syncs it (e.g. a tracer).
        params = {
            'algorithm': algorithm,
            'minpts': minpts,
            'epsilon': epsilon,
            'alpha': alpha,
            'beta': beta,
            'thread': thread,
            'seed': seed,
        }

        # Disabled guard: ``False and ...`` short-circuits, so this never raises.
        if False and (data.is_directed() or data.is_weighted()):
            raise Exception("only undirected and unweighted graph is supported")
        if not utils.file_exists(data.file_anyscan):
            data.to_anyscan()

        cmd = "{} -c {} -i {} -m {} -e {} -o {} -a {} -b {} -t {}".format(
            config.ANYSCAN_PROG, algorithm, data.file_anyscan, minpts, epsilon, 'output', alpha, beta, thread)
        self.logger.info("Running " + cmd)

        with utils.TempDir() as tmp_dir:
            timecost, status = utils.timeit(lambda: utils.shell_run_and_wait(cmd, tmp_dir))
            if status != 1:  # anyscan always return 1
                raise Exception("Run command with error status code {}".format(status))

            with open(os.path.join(tmp_dir, "output"), "r") as output:
                lines = [u.strip() for u in output]

            n_nodes = int(lines[0])
            clusters_list = [int(u) for u in lines[1].split(" ")]

        if n_nodes != len(clusters_list):
            raise Exception("#node is not equals #cluster")

        from collections import defaultdict
        clusters = defaultdict(list)
        # The position in the list is the node id.
        for n, c in enumerate(clusters_list):
            clusters[c].append(n)

        self.logger.info("Made %d clusters in %f seconds" % (len(clusters), timecost))

        result = {}
        result['runname'] = self.name
        result['params'] = params
        result['dataname'] = data.name
        result['meta'] = self.get_meta()
        result['timecost'] = timecost
        result['clusters'] = clusters

        save_result(result)
        self.result = result
        return self
    def to_snapformat(self, filepath=None):
        """
        Save the dataset as a SNAP graph file.

        :param filepath: destination file. When omitted, ``self.file_snap`` is
            used and an already existing file is returned without rebuild.
        :return: path of the written file
        """
        if filepath is None:
            filepath = self.file_snap
            if utils.file_exists(filepath):
                return filepath

        import snap
        from gct.dataset import convert

        graph = convert.to_snap(self)
        self.logger.info("Writing {} to {}".format(type(graph), filepath))
        sink = snap.TFOut(filepath)
        graph.Save(sink)
        sink.Flush()
        return filepath
    def run(self, data, mu=3, epsilon=0.5, prog='pScan', seed=None):
        """
        Run one of the pScan programs on ``data`` and store the clustering.

        The two scanbin files are linked into a temporary directory, and the
        first ``result-*-*.txt`` file found there is read. Rows whose ``c/n``
        column is ``'c'`` form the clusters; the remaining rows are stored
        under ``noise_or_hub``.

        :param data: dataset to cluster; its scanbin files are created when
            missing
        :param mu: passed on the command line
        :param epsilon: passed on the command line
        :param prog: one of ``'pScan'``, ``'ppScan'``, ``'ppScanSSE'``
        :param seed: unused; only a log message is emitted when it is given
        :return: ``self``; the saved result is also kept in ``self.result``
        :raises Exception: when the program exits with a non-zero status
        """
        assert prog in ['pScan', 'ppScan', 'ppScanSSE']
        if seed is not None:
            self.logger.info("seed ignored")

        params = {'mu': mu, 'epsilon': epsilon, 'prog': prog}
        # Disabled guard: ``False and ...`` short-circuits, so this never raises.
        if False and (data.is_directed() or data.is_weighted()):
            raise Exception("only undirected and unweighted graph is supported")
        if not utils.file_exists(data.file_scanbin):
            data.to_scanbin()

        with utils.TempDir() as tmp_dir:
            scanbin = data.file_scanbin
            for part in ('b_degree.bin', 'b_adj.bin'):
                os.symlink(os.path.join(scanbin, part),
                           os.path.join(tmp_dir, part))

            if prog == 'pScan':
                EXE = config.PSCAN_PROG
            elif prog == 'ppScan':
                EXE = config.PPSCAN_PROG
            elif prog == 'ppScanSSE':
                EXE = config.PPSCANSSE_PROG

            cmd = "{} {} {} {} {}".format(EXE, tmp_dir, epsilon, mu, 'output')
            self.logger.info("Running " + cmd)

            def execute():
                return utils.shell_run_and_wait(cmd, tmp_dir)

            timecost, status = utils.timeit(execute)
            if status != 0:
                raise Exception("Run command with error status code {}".format(status))

            result_path = glob.glob(tmp_dir + "/result-*-*.txt")[0]
            import pandas as pd
            output = pd.read_csv(result_path, sep=" ")

        wanted = ['vertex_id', 'cluster_id']
        is_core = output['c/n'] == 'c'
        members = output[is_core][wanted]
        others = output[output['c/n'] != 'c'][wanted]
        clusters = members.groupby('cluster_id').apply(
            lambda u: list(u['vertex_id'])).to_dict()
        others = others.values.tolist()

        self.logger.info("Made %d clusters in %f seconds" % (len(clusters), timecost))

        result = {
            'runname': self.name,
            'params': params,
            'dataname': data.name,
            'meta': self.get_meta(),
            'timecost': timecost,
            'clusters': clusters,
            'noise_or_hub': others,
        }
        save_result(result)
        self.result = result
        return self
    def download(self):
        """
        Fetch every remote file that is not available locally yet, then run
        the md5 check.

        When the md5 check fails, the download path of the dataset is removed.

        :raises Exception: when the md5 check fails
        """
        base_url = 'http://data.law.di.unimi.it/webdata/'
        for fname in self.get_remote_names():
            if self.has_downloaded(fname):
                continue
            rfile = base_url + self.name + "/" + fname
            lfile = config.get_download_file_path(self.name,
                                                  fname,
                                                  create=True)
            if utils.file_exists(lfile):
                continue
            self.logger.info("Dowloading {} to {} ".format(rfile, lfile))
            utils.urlretrieve(rfile, lfile)

        assert self.has_downloaded()
        if self.md5check():
            return
        self.logger.error("md5 check failed")
        shutil.rmtree(config.get_download_file_path(self.name))
        raise Exception("md5 check failed")
예제 #24
0
    def run(self, data, **kwargs):
        """
        Run the ``GANXISW_PROG`` jar on ``data`` and store the clustering.

        The edge list is linked into a temporary directory as ``edges.txt``,
        and the first ``output/SLPAw*.icpm`` file found there is read: each
        line is one cluster given as space separated node ids.

        :param data: dataset to cluster; its edge list is created when missing
        :param kwargs: every entry whose value is not None becomes a
            ``-<key> <value>`` command line option. ``Sym`` is set to 1 for
            undirected data, ``d`` is always set to ``output`` and ``r``
            defaults to 0.1.
        :return: ``self``; the saved result is also kept in ``self.result``
        :raises Exception: when the program exits with a non-zero status
        """
        # Disabled guard: ``False and ...`` short-circuits, so this never raises.
        if False and (data.is_directed()):
            raise Exception("only undirected is supported")

        params = {key: value for key, value in kwargs.items()
                  if value is not None}
        if not data.is_directed():
            params['Sym'] = 1
        params['d'] = "output"
        params.setdefault('r', 0.1)

        if not utils.file_exists(data.file_edges):
            data.to_edgelist()

        options = ["-{} {}".format(key, value) for key, value in params.items()]
        txt_params = " ".join(options)
        cmd = "{} -jar {} -i {} {} ".format(utils.get_java_command(), config.GANXISW_PROG, "edges.txt", txt_params)

        with utils.TempDir() as tmp_dir:
            out_dir = os.path.join(tmp_dir, "output")
            utils.remove_if_file_exit(out_dir, True)
            utils.create_dir_if_not_exists(os.path.join(tmp_dir, "output"))
            self.logger.info("Running " + cmd)
            utils.link_file(data.file_edges, tmp_dir, "edges.txt")

            def execute():
                return utils.shell_run_and_wait(cmd, tmp_dir)

            timecost, status = utils.timeit(execute)
            if status != 0:
                raise Exception("Run command with error status code {}".format(status))

            outputfile = glob.glob(os.path.join(tmp_dir, "output/SLPAw*.icpm"))[0]
            with open(os.path.join(tmp_dir, outputfile), "r") as listing:
                members = [[int(u) for u in line.strip().split(" ")]
                           for line in listing]
            clusters = dict(enumerate(members))

        self.logger.info("Made %d clusters in %f seconds" % (len(clusters), timecost))

        result = {
            'runname': self.name,
            'params': params,
            'dataname': data.name,
            'meta': self.get_meta(),
            'timecost': timecost,
            'clusters': clusters,
        }

        save_result(result)
        self.result = result
        return self
예제 #25
0
def to_networkx(data):
    """
    Read the edge list of the dataset into a networkx graph.

    A ``DiGraph`` is used for directed data and a ``Graph`` otherwise; node
    labels are parsed as ``int``. The edge list file is written first when it
    does not exist yet.

    :param data: :py:class:`gct.Dataset`
    :rtype: networkx graph
    """
    import networkx as nx

    edge_path = data.file_edges
    if not utils.file_exists(edge_path):
        data.to_edgelist()

    if data.is_directed():
        container = nx.DiGraph()
    else:
        container = nx.Graph()

    if data.is_weighted():
        reader = nx.read_weighted_edgelist
    else:
        reader = nx.read_edgelist
    return reader(edge_path, create_using=container, nodetype=int)
 def to_edgelist(self, filepath=None, sep=" ", sort=False):
     """
     Write the edges as a text file, one ``src dest [weight]`` row per edge.

     The weight column is written only when ``self.is_weighted()`` is true.

     :param filepath: destination file. When omitted, ``self.file_edges`` is
         used and an already existing file is returned without being rewritten.
     :param sep: separator placed between the fields of a row
     :param sort: when true, rows are ordered by ``src`` then ``dest``
     :return: path of the written file
     """
     if filepath is None:
         filepath = self.file_edges
         if utils.file_exists(filepath):
             return filepath
     self.logger.info("writing edges to " + filepath)

     weighted = self.is_weighted()
     columns = ['src', 'dest', 'weight'] if weighted else ['src', 'dest']
     # Fetch the edges before opening the file, so a failure here does not
     # leave an empty file behind that later looks like a finished one.
     df = self.get_edges()[columns]
     if sort:
         df = df.sort_values(by=['src', 'dest'])

     with open(filepath, 'wt') as f:
         for row in df.itertuples(index=False, name=None):
             # Node ids are written as whole numbers: '%g' keeps only six
             # significant digits and turns 1234567 into '1.23457e+06'.
             # NOTE(review): assumes node ids are integral - confirm.
             fields = ['%d' % row[0], '%d' % row[1]]
             if weighted:
                 fields.append('%g' % row[2])
             f.write(sep.join(fields) + "\n")
     self.logger.info("finish writing " + filepath)
     return filepath
예제 #27
0
def to_graph_tool(data):
    """
    Read the edge list of the dataset into a graph-tool graph.

    The edge list file is written first when it does not exist yet. It is
    parsed as space delimited CSV without a header row.

    (TBD) graph_tool support weights?

    :param data: :py:class:`gct.Dataset`
    :rtype: graph-tool graph
    """
    import graph_tool

    edge_path = data.file_edges
    if not utils.file_exists(edge_path):
        data.to_edgelist()

    csv_options = {"delimiter": " "}
    return graph_tool.load_graph_from_csv(edge_path,
                                          directed=data.is_directed(),
                                          string_vals=False,
                                          skip_first=False,
                                          csv_options=csv_options)
예제 #28
0
    def run(self, data, cutoff_r=0.01, inflation_in=2, NBDisimilarity_q=0.3, seed=None):
        """
        Run ``LABLE_RANK_PROG`` on ``data`` and store the clustering.

        The edge list is linked into a temporary directory as ``edges.txt``,
        and the first ``output/LabelRank*.icpm`` file found there is read: each
        line is one cluster given as space separated node ids.

        :param data: dataset to cluster; its edge list is created when missing
        :param cutoff_r: passed verbatim on the command line
        :param inflation_in: passed verbatim on the command line
        :param NBDisimilarity_q: passed verbatim on the command line
        :param seed: unused; only a log message is emitted when it is given
        :return: ``self``; the saved result is also kept in ``self.result``
        :raises Exception: when the program exits with a non-zero status
        """
        # Disabled guard: ``False and ...`` short-circuits, so this never raises.
        if False and (data.is_directed()):
            raise Exception("only undirected is supported")
        if seed is not None:
            self.logger.info("seed ignored")
        # Spell the recorded parameters out instead of aliasing ``locals()``:
        # that dict belongs to the frame and can regain ``self``/``data`` and
        # other locals whenever the interpreter re-syncs it (e.g. a tracer).
        params = {
            'cutoff_r': cutoff_r,
            'inflation_in': inflation_in,
            'NBDisimilarity_q': NBDisimilarity_q,
            'seed': seed,
        }

        if not utils.file_exists(data.file_edges):
            data.to_edgelist()

        cmd = "{} {} {} {} {}".format(config.LABLE_RANK_PROG, "edges.txt", cutoff_r, inflation_in, NBDisimilarity_q)
        with utils.TempDir() as tmp_dir:
            utils.remove_if_file_exit(os.path.join(tmp_dir, "output"), True)
            utils.create_dir_if_not_exists(os.path.join(tmp_dir, "output"))
            self.logger.info("Running " + cmd)
            utils.link_file(data.file_edges, tmp_dir, "edges.txt")
            timecost, status = utils.timeit(lambda: utils.shell_run_and_wait(cmd, tmp_dir))
            if status != 0:
                raise Exception("Run command with error status code {}".format(status))
            outputfile = glob.glob(os.path.join(tmp_dir, "output/LabelRank*.icpm"))[0]
            clusters = []
            with open(os.path.join(tmp_dir, outputfile), "r") as f:
                for line in f:
                    clusters.append([int(u) for u in line.strip().split(" ")])
            clusters = dict(enumerate(clusters))

        self.logger.info("Made %d clusters in %f seconds" % (len(clusters), timecost))

        result = {}
        result['runname'] = self.name
        result['params'] = params
        result['dataname'] = data.name
        result['meta'] = self.get_meta()
        result['timecost'] = timecost
        result['clusters'] = clusters

        save_result(result)
        self.result = result
        return self
    def run(self, data, mu=1, epsilon=0, seed=None):
        """
        Run ``SCANPP_PROG`` on the edge list of ``data`` and store the
        clustering.

        The captured program output must start with ``node``. Its first line is
        skipped and every other non-empty line is read as a tab separated
        ``node cluster`` pair.

        :param data: dataset to cluster; its edge list is created when missing
        :param mu: passed with ``-m``
        :param epsilon: passed with ``-e``
        :param seed: unused; only a log message is emitted when it is given
        :return: ``self``; the saved result is also kept in ``self.result``
        :raises Exception: when the data is weighted, or when the program
            output does not start with ``node``
        """
        params = {'mu': mu, 'epsilon': epsilon}
        if seed is not None:
            self.logger.info("seed ignored")
        if data.is_weighted():
            raise Exception("only undirected and unweighted graph is supported")
        if not utils.file_exists(data.file_edges):
            data.to_edgelist()

        cmd = "{} -e {} -m {} -r {}".format(config.SCANPP_PROG, epsilon, mu, data.file_edges)
        self.logger.info("Running " + cmd)

        def execute():
            return utils.check_output(cmd.split(" "))

        timecost, output = utils.timeit(execute)
        if not output.startswith('node'):
            raise Exception("Something wrong with scapp. output:\n" + output)

        from collections import defaultdict
        clusters = defaultdict(list)
        for raw in output.split("\n")[1:]:
            row = raw.strip()
            if not row:
                continue
            node, cluster = [int(v) for v in row.split("\t")]
            clusters[cluster].append(node)

        self.logger.info("Made %d clusters in %f seconds" % (len(clusters), timecost))

        result = {
            'runname': self.name,
            'params': params,
            'dataname': data.name,
            'meta': self.get_meta(),
            'timecost': timecost,
            'clusters': clusters,
        }

        save_result(result)
        self.result = result
        return self
 def has_downloaded(self):
     """
     Tell whether every path in ``self.local_files`` exists.

     :return: True when all files exist (also when the list is empty)
     """
     return all(utils.file_exists(path) for path in self.local_files)