Пример #1
0
 def link_file(path, destname=None):
     """Symlink ``path`` into ``tmp_dir`` and return the link's location.

     ``tmp_dir`` is not a parameter; it is looked up in the surrounding
     scope.  NOTE(review): presumably this is a nested helper defined where
     ``tmp_dir`` exists -- confirm against the enclosing code.

     :param path: target the symlink points at.
     :param destname: file name of the link; when ``None`` the text after
         the last ``/`` of ``path`` is used.
     :return: path of the created symlink.
     """
     link_name = path.rsplit("/", 1)[-1] if destname is None else destname
     link_path = os.path.join(tmp_dir, link_name)
     # Presumably clears a stale entry so os.symlink does not collide with it.
     utils.remove_if_file_exit(link_path)
     os.symlink(path, link_path)
     return link_path
    def get_edges(self):
        """Return the edges of this dataset as a ``src``/``dest`` DataFrame.

        ``self.download()`` is triggered first when ``self.has_downloaded()``
        is false.  The WebGraph class ``ArcListASCIIGraph`` is then run
        through Java inside ``self.download_dir``; the tab separated file
        named ``self.name`` found there afterwards is loaded with pandas and
        removed again.

        :return: DataFrame with the columns ``src`` and ``dest``.
        :raises Exception: if the Java command exits with a non-zero status.
        """
        if not self.has_downloaded():
            self.download()

        # This path is only reported in the log; the command below is given
        # base names relative to the download directory instead.
        graph_path = config.get_download_file_path(self.name,
                                                   self.name + '.graph')
        self.logger.info("reading {}".format(graph_path))

        template = "{} -server -cp {} it.unimi.dsi.webgraph.ArcListASCIIGraph {} {}"
        command = template.format(utils.get_java_command(),
                                  config.WEBGRAPH_JAR_PATH,
                                  self.name + '-hc-t',
                                  self.name)
        self.logger.info("Running " + command)

        _, exit_status = utils.timeit(
            lambda: utils.shell_run_and_wait(command, self.download_dir))
        if exit_status != 0:
            raise Exception(
                "Run command with error status code {}".format(exit_status))

        arc_list_path = os.path.join(self.download_dir, self.name)
        frame = pd.read_csv(arc_list_path, header=None, sep='\t')
        frame.columns = ['src', 'dest']
        # The text dump is only an intermediate artefact.
        utils.remove_if_file_exit(arc_list_path)
        return frame
    def testRandomDataset8(self):
        """Regenerate the hig-format file of both undirected fixtures.

        For the unweighted and then the weighted undirected fixture, the
        existing ``file_hig`` is removed and the value returned by
        ``to_higformat()`` is printed.
        """
        for fixture_name in ("graph_unweighted_undirect",
                             "graph_weighted_undirect"):
            # Looked up lazily so the fixtures are touched in the same order.
            dataset = getattr(self, fixture_name)
            utils.remove_if_file_exit(dataset.file_hig)
            print(dataset.to_higformat())
Пример #4
0
    def run(self,
            data,
            seed=None,
            lamb=1,
            trials=5,
            temp_step=0.999,
            initial_temp=1e-6):
        """Cluster ``data`` with the OSLOM ``modopt`` program and store the result.

        The edge list of ``data`` is symlinked as ``edges.txt`` into a
        temporary directory, the external program is run there, and the
        ``edges.txtpart`` file found afterwards is parsed: each line is one
        cluster written as tab separated integer node ids.

        :param data: dataset object providing ``file_edges``, ``name``,
            ``is_directed()``, ``is_weighted()`` and ``to_edgelist()``.
        :param seed: seed passed to the program; drawn with
            ``np.random.randint(999999)`` when ``None``.
        :param lamb: passed positionally to the program after the seed.
        :param trials: passed positionally to the program after ``lamb``.
        :param temp_step: passed positionally after ``trials``.
        :param initial_temp: passed positionally after ``temp_step``.
        :return: ``self``; the result dict is kept in ``self.result`` and
            handed to ``save_result``.
        :raises Exception: if the program exits with a non-zero status.
        """
        if seed is None:
            seed = np.random.randint(999999)
        # Spelled out instead of ``params = locals()`` + ``del``: the mapping
        # returned by locals() must not be modified and, before Python 3.13,
        # is refreshed with every local (including itself) under a tracer.
        params = {
            'seed': seed,
            'lamb': lamb,
            'trials': trials,
            'temp_step': temp_step,
            'initial_temp': initial_temp,
        }
        # Guard is switched off on purpose by the trailing ``and False``.
        if (data.is_directed() or data.is_weighted()) and False:
            raise Exception(
                "only undirected and unweighted graph is supported")
        if not utils.file_exists(data.file_edges):
            data.to_edgelist()

        with utils.TempDir() as tmp_dir:
            pajek = os.path.join(tmp_dir, 'edges.txt')
            utils.remove_if_file_exit(pajek)
            os.symlink(data.file_edges, pajek)
            cmd = "{} {} {} {} {} {} {}".format(
                config.get_OSLOM_prog('modopt', data.is_directed()), pajek,
                seed, lamb, trials, temp_step, initial_temp)

            self.logger.info("Running " + cmd)

            timecost, status = utils.timeit(
                lambda: utils.shell_run_and_wait(cmd, tmp_dir))
            if status != 0:
                raise Exception(
                    "Run command with error status code {}".format(status))

            # The partition is read from "<input path>part".
            outputfile = pajek + "part"
            with open(outputfile) as f:
                members = [[int(node) for node in row.strip().split("\t")]
                           for row in f]

        clusters = dict(enumerate(members))
        self.logger.info("Made %d clusters in %f seconds" %
                         (len(clusters), timecost))

        result = {
            'runname': self.name,
            'params': params,
            'dataname': data.name,
            'meta': self.get_meta(),
            'timecost': timecost,
            'clusters': clusters,
        }
        save_result(result)
        self.result = result
        return self
Пример #5
0
    def run(self, data, seed=None):
        """Cluster ``data`` with the OSLOM ``Infohiermap`` program and store the result.

        The pajek file of ``data`` is symlinked as ``pajek.txt`` into a
        temporary directory, the external program is run there, and the
        ``pajek.tree`` file found afterwards is parsed.  After skipping the
        first line, column 0 is read as text whose part before the first
        ``:`` is the cluster id, and column 2 is the node id.

        :param data: dataset object providing ``file_pajek``, ``name``,
            ``is_directed()``, ``is_weighted()`` and ``to_pajek()``.
        :param seed: seed passed to the program; drawn with
            ``np.random.randint(999999)`` when ``None``.
        :return: ``self``; the result dict is kept in ``self.result`` and
            handed to ``save_result``.
        :raises Exception: if the program exits with a non-zero status.
        :raises IndexError: if no ``pajek.tree`` file was produced.
        """
        if seed is None:
            seed = np.random.randint(999999)
        params = {'seed': seed}
        # Guard is switched off on purpose by the trailing ``and False``.
        if (data.is_directed() or data.is_weighted()) and False:
            raise Exception(
                "only undirected and unweighted graph is supported")
        if not utils.file_exists(data.file_pajek):
            data.to_pajek()

        with utils.TempDir() as tmp_dir:
            pajek = os.path.join(tmp_dir, 'pajek.txt')
            utils.remove_if_file_exit(pajek)
            os.symlink(data.file_pajek, pajek)
            cmd = "{} {} {} {}".format(
                config.get_OSLOM_prog('Infohiermap', data.is_directed()), seed,
                pajek, 1)

            self.logger.info("Running " + cmd)

            timecost, status = utils.timeit(
                lambda: utils.shell_run_and_wait(cmd, tmp_dir))
            if status != 0:
                raise Exception(
                    "Run command with error status code {}".format(status))

            outputfile = glob.glob(tmp_dir + "/pajek.tree")[0]
            import pandas as pd
            # Builtin ``str`` / ``int`` replace the ``np.str`` / ``np.int``
            # aliases, which no longer exist in NumPy >= 1.24.
            output = pd.read_csv(outputfile,
                                 sep=" ",
                                 skiprows=1,
                                 header=None,
                                 dtype={0: str})
            output['cluster'] = output.loc[:, 0].map(
                lambda u: int(u.split(':')[0]))
            output['node'] = output.loc[:, 2].astype(int)
        clusters = output[['cluster', 'node']]
        clusters = clusters.groupby('cluster').apply(
            lambda u: list(u['node'])).to_dict()
        self.logger.info("Made %d clusters in %f seconds" %
                         (len(clusters), timecost))

        result = {}
        result['runname'] = self.name
        result['params'] = params
        result['dataname'] = data.name
        result['meta'] = self.get_meta()
        result['timecost'] = timecost
        result['clusters'] = clusters
        save_result(result)
        self.result = result
        return self
    def testFromSnap(self):
        """Convert random SNAP graphs of two graph types into datasets.

        For ``snap.PNGraph`` and then ``snap.PUNGraph``: the dataset
        directory is removed, a random graph is generated with
        ``snap.GenRndGnm(<type>, 100, 1000)``, converted through
        ``convert.from_snap(..., overide=True)`` and printed.
        """
        name = "testFromSnap"

        for graph_type_name in ("PNGraph", "PUNGraph"):
            utils.remove_if_file_exit(config.get_data_file_path(name),
                                      is_dir=True)
            random_graph = snap.GenRndGnm(getattr(snap, graph_type_name),
                                          100, 1000)
            dataset = convert.from_snap(name, random_graph, overide=True)
            print("testFromSnap", dataset)
    def testRandomDataset7(self):
        """Write fixtures in SNAP binary format and load them back.

        The undirected fixture is reloaded through ``snap.TUNGraph`` and the
        directed one, after asserting ``is_directed()``, through
        ``snap.TNGraph``.
        """

        def dump_and_reload(dataset, graph_class_name):
            # Recreate the snap file, then read it back with the given class.
            utils.remove_if_file_exit(dataset.file_snap)
            print(dataset.to_snapformat())
            stream = snap.TFIn(dataset.file_snap)
            return getattr(snap, graph_class_name).Load(stream)

        dump_and_reload(self.graph_unweighted_undirect, "TUNGraph")

        directed_dataset = self.graph_unweighted_direct
        self.assertTrue(directed_dataset.is_directed())
        dump_and_reload(directed_dataset, "TNGraph")
def remove_local(name, rm_graph_data=True, rm_clustering_result=True):
    '''
    Delete the locally stored directories of a dataset.

    :param name: name of a dataset
    :param rm_graph_data: remove the local graph data directory
        (``config.get_data_file_path(name)``)
    :param rm_clustering_result: remove the clustering results associated
        with the graph (``config.get_result_file_path(name)``)
    '''
    removal_plan = (
        (rm_graph_data, config.get_data_file_path),
        (rm_clustering_result, config.get_result_file_path),
    )
    for requested, locate in removal_plan:
        if requested:
            utils.remove_if_file_exit(locate(name), is_dir=True)
def save_result(result):
    """Persist a clustering result.

    ``Result`` instances are saved through their own ``save()`` method.  Any
    other value is treated as a mapping with at least the keys
    ``'dataname'`` and ``'runname'``; it is dumped as JSON to ``result.txt``
    inside the directory returned by
    ``config.get_result_file_path(dataname, runname, create=True)``.

    :param result: ``Result`` instance or JSON-serialisable mapping.
    :raises: whatever opening the file or ``json.dump`` raises; the target
        file is removed before the error is re-raised.
    """
    if isinstance(result, Result):
        result.save()
        return

    filepath = config.get_result_file_path(result['dataname'],
                                           result['runname'],
                                           create=True)
    # Bound before the ``try`` so the cleanup handler can always refer to it;
    # otherwise a failing join would surface as an UnboundLocalError.
    fpath = os.path.join(filepath, 'result.txt')
    try:
        with open(fpath, 'wt') as f:
            json.dump(result, f)
    except BaseException:
        # Same reach as the former bare ``except``: interrupted writes are
        # cleaned up as well, and the original error is re-raised.
        utils.remove_if_file_exit(fpath, is_dir=False)
        raise
Пример #10
0
    def run(self, data, **kwargs):
        """Cluster ``data`` with the jar at ``config.GANXISW_PROG`` and store the result.

        Keyword arguments whose value is not ``None`` are forwarded to the
        program as ``-<key> <value>`` options.  ``Sym=1`` is added for
        undirected data, ``d`` is always set to ``"output"`` and ``r``
        defaults to ``0.1``.  The program runs in a temporary directory that
        contains a link ``edges.txt`` to the edge list; the first
        ``output/SLPAw*.icpm`` file is parsed, one cluster of space
        separated integer node ids per line.

        :param data: dataset object providing ``file_edges``, ``name``,
            ``is_directed()`` and ``to_edgelist()``.
        :param kwargs: extra command-line options, see above.
        :return: ``self``; the result dict is kept in ``self.result`` and
            handed to ``save_result``.
        :raises Exception: if the program exits with a non-zero status.
        :raises IndexError: if no matching output file was produced.
        """
        # Check kept for reference; ``False and ...`` never evaluates it.
        if False and (data.is_directed()):
            raise Exception("only undirected is supported")
        params = {k: v for k, v in kwargs.items() if v is not None}
        if not data.is_directed():
            params['Sym'] = 1
        params['d'] = "output"
        params.setdefault('r', 0.1)

        if not utils.file_exists(data.file_edges):
            data.to_edgelist()

        txt_params = " ".join("-{} {}".format(k, v) for k, v in params.items())
        cmd = "{} -jar {} -i {} {} ".format(utils.get_java_command(), config.GANXISW_PROG, "edges.txt", txt_params)
        with utils.TempDir() as tmp_dir:
            output_dir = os.path.join(tmp_dir, "output")
            utils.remove_if_file_exit(output_dir, True)
            utils.create_dir_if_not_exists(output_dir)
            self.logger.info("Running " + cmd)
            utils.link_file(data.file_edges, tmp_dir, "edges.txt")
            timecost, status = utils.timeit(lambda: utils.shell_run_and_wait(cmd, tmp_dir))
            if status != 0:
                raise Exception("Run command with error status code {}".format(status))
            # glob already returns the path including ``tmp_dir``; joining it
            # with ``tmp_dir`` again would double a relative prefix.
            outputfile = glob.glob(os.path.join(tmp_dir, "output/SLPAw*.icpm"))[0]
            with open(outputfile, "r") as f:
                members = [[int(u) for u in line.strip().split(" ")] for line in f]
            clusters = dict(enumerate(members))

        self.logger.info("Made %d clusters in %f seconds" % (len(clusters), timecost))

        result = {}
        result['runname'] = self.name
        result['params'] = params
        result['dataname'] = data.name
        result['meta'] = self.get_meta()
        result['timecost'] = timecost
        result['clusters'] = clusters

        save_result(result)
        self.result = result
        return self
Пример #11
0
    def run(self, data, cutoff_r=0.01, inflation_in=2, NBDisimilarity_q=0.3, seed=None):
        """Cluster ``data`` with the program at ``config.LABLE_RANK_PROG`` and store the result.

        The program runs in a temporary directory that contains a link
        ``edges.txt`` to the edge list and receives ``cutoff_r``,
        ``inflation_in`` and ``NBDisimilarity_q`` as positional arguments.
        The first ``output/LabelRank*.icpm`` file is parsed, one cluster of
        space separated integer node ids per line.

        :param data: dataset object providing ``file_edges``, ``name``,
            ``is_directed()`` and ``to_edgelist()``.
        :param cutoff_r: second positional argument of the program.
        :param inflation_in: third positional argument of the program.
        :param NBDisimilarity_q: fourth positional argument of the program.
        :param seed: not used by the program; only recorded in the result
            parameters (a log line is emitted when it is given).
        :return: ``self``; the result dict is kept in ``self.result`` and
            handed to ``save_result``.
        :raises Exception: if the program exits with a non-zero status.
        :raises IndexError: if no matching output file was produced.
        """
        # Check kept for reference; ``False and ...`` never evaluates it.
        if False and (data.is_directed()):
            raise Exception("only undirected is supported")
        if seed is not None:
            self.logger.info("seed ignored")
        # Spelled out instead of ``params = locals()`` + ``del``: the mapping
        # returned by locals() must not be modified and, before Python 3.13,
        # is refreshed with every local (including itself) under a tracer.
        params = {
            'cutoff_r': cutoff_r,
            'inflation_in': inflation_in,
            'NBDisimilarity_q': NBDisimilarity_q,
            'seed': seed,
        }

        if not utils.file_exists(data.file_edges):
            data.to_edgelist()

        cmd = "{} {} {} {} {}".format(config.LABLE_RANK_PROG, "edges.txt", cutoff_r, inflation_in, NBDisimilarity_q)
        with utils.TempDir() as tmp_dir:
            output_dir = os.path.join(tmp_dir, "output")
            utils.remove_if_file_exit(output_dir, True)
            utils.create_dir_if_not_exists(output_dir)
            self.logger.info("Running " + cmd)
            utils.link_file(data.file_edges, tmp_dir, "edges.txt")
            timecost, status = utils.timeit(lambda: utils.shell_run_and_wait(cmd, tmp_dir))
            if status != 0:
                raise Exception("Run command with error status code {}".format(status))
            # glob already returns the path including ``tmp_dir``; joining it
            # with ``tmp_dir`` again would double a relative prefix.
            outputfile = glob.glob(os.path.join(tmp_dir, "output/LabelRank*.icpm"))[0]
            with open(outputfile, "r") as f:
                members = [[int(u) for u in line.strip().split(" ")] for line in f]
            clusters = dict(enumerate(members))

        self.logger.info("Made %d clusters in %f seconds" % (len(clusters), timecost))

        result = {}
        result['runname'] = self.name
        result['params'] = params
        result['dataname'] = data.name
        result['meta'] = self.get_meta()
        result['timecost'] = timecost
        result['clusters'] = clusters

        save_result(result)
        self.result = result
        return self
    def testFromNextworkx(self):
        """Convert networkx complete graphs of every flavour into datasets.

        Covers, in order: undirected/unweighted, directed/unweighted,
        undirected/weighted and directed/weighted.  Before each conversion
        the dataset directory is removed; each resulting dataset is printed.
        """
        name = "testFromNextworkx"

        for weighted in (False, True):
            for directed in (False, True):
                utils.remove_if_file_exit(config.get_data_file_path(name),
                                          is_dir=True)
                graph = nx.complete_graph(5)
                if directed:
                    graph = graph.to_directed()
                if weighted:
                    # Weight 1 on every edge of the undirected complete graph.
                    graph.add_weighted_edges_from(
                        (u, v, 1) for u, v in nx.complete_graph(5).edges())
                dataset = convert.from_networkx(name, graph,
                                                weighted=weighted,
                                                overide=True)
                print(dataset)
    def testFromIGraph(self):
        """Convert igraph full graphs of every flavour into datasets.

        Covers, in order: undirected and directed graphs without weights,
        then undirected and directed graphs whose edges all carry the
        ``weight`` attribute 1.  Before each conversion the dataset directory
        is removed; each resulting dataset is printed.
        """
        name = "testFromIGraph"

        for with_weights in (False, True):
            for is_directed in (False, True):
                utils.remove_if_file_exit(config.get_data_file_path(name),
                                          is_dir=True)
                graph = igraph.Graph.Full(5, directed=is_directed)
                if with_weights:
                    graph.es['weight'] = [1] * graph.ecount()
                dataset = convert.from_igraph(name, graph)
                print(dataset)
Пример #14
0
    def run(self,
            data,
            execution='async',
            ncpus=None,
            scheduler=None,
            engine_opts=None,
            graph_opts=None,
            scheduler_opts=None,
            seed=None):
        """Cluster ``data`` with the PowerGraph ``label_propagation`` program and store the result.

        Every option that is not ``None`` is forwarded as ``--<name> <value>``
        together with ``--ncpus``, ``--weighted`` and ``--directed``.  The
        program runs in a temporary directory on a link ``edges.txt`` to the
        edge list; all ``output.cluster.*`` files found afterwards are read
        as tab separated ``node``/``cluster`` pairs and grouped by cluster.

        :param data: dataset object providing ``file_edges``, ``name``,
            ``is_directed()``, ``is_weighted()`` and ``to_edgelist()``.
        :param execution: forwarded as ``--execution``.
        :param ncpus: resolved through ``utils.get_num_thread`` and
            forwarded as ``--ncpus``.
        :param scheduler: forwarded as ``--scheduler`` when given.
        :param engine_opts: forwarded as ``--engine_opts`` when given.
        :param graph_opts: forwarded as ``--graph_opts`` when given.
        :param scheduler_opts: forwarded as ``--scheduler_opts`` when given.
        :param seed: not used; a log line is emitted when it is given.
        :return: ``self``; the result dict is kept in ``self.result`` and
            handed to ``save_result``.
        :raises Exception: if the program exits with a non-zero status.
        """
        if seed is not None:
            self.logger.info("seed ignored")
        # Listed explicitly instead of harvesting ``locals()``: the mapping
        # returned by locals() must not be modified and, before Python 3.13,
        # is refreshed with every local (including itself) under a tracer.
        requested = (
            ('execution', execution),
            ('ncpus', ncpus),
            ('scheduler', scheduler),
            ('engine_opts', engine_opts),
            ('graph_opts', graph_opts),
            ('scheduler_opts', scheduler_opts),
        )
        params = {key: value for key, value in requested if value is not None}
        # Guard is switched off on purpose by the trailing ``and False``.
        if (data.is_directed() or data.is_weighted()) and False:
            raise Exception(
                "only undirected and unweighted graph is supported")
        ncpus = utils.get_num_thread(ncpus)
        params['ncpus'] = ncpus
        # ``* 1`` turns the boolean answers into 0/1 for the command line.
        params['weighted'] = data.is_weighted() * 1
        params['directed'] = data.is_directed() * 1
        if not utils.file_exists(data.file_edges):
            data.to_edgelist()

        with utils.TempDir() as tmp_dir:
            pajek = os.path.join(tmp_dir, 'edges.txt')
            utils.remove_if_file_exit(pajek)
            os.symlink(data.file_edges, pajek)
            args = " ".join(
                "--{} {}".format(u, v) for u, v in params.items())
            cmd = "{} --graph {} --saveprefix=output.cluster {}".format(
                config.get_powergraph_prog('label_propagation',
                                           data.is_directed()), pajek, args)

            self.logger.info("Running " + cmd)

            timecost, status = utils.timeit(
                lambda: utils.shell_run_and_wait(cmd, tmp_dir))
            if status != 0:
                raise Exception(
                    "Run command with error status code {}".format(status))

            outputfiles = glob.glob(tmp_dir + "/output.cluster.*")
            import pandas as pd
            df_from_each_file = (pd.read_csv(f, sep="\t", header=None)
                                 for f in outputfiles)
            output = pd.concat(df_from_each_file, ignore_index=True)
            output.columns = ['node', 'cluster']

        clusters = output[['cluster', 'node']]
        clusters = clusters.groupby('cluster').apply(
            lambda u: list(u['node'])).to_dict()
        self.logger.info("Made %d clusters in %f seconds" %
                         (len(clusters), timecost))

        result = {}
        result['runname'] = self.name
        result['params'] = params
        result['dataname'] = data.name
        result['meta'] = self.get_meta()
        result['timecost'] = timecost
        result['clusters'] = clusters
        save_result(result)
        self.result = result
        return self
Пример #15
0
    def run(self,
            data,
            v=5,
            v1=None,
            v2=None,
            prop=None,
            repeat=None,
            mo=None,
            nosplit=False,
            extrasimplify=False,
            q=False,
            seed=None):
        """Cluster ``data`` with the Java class ``COPRA`` and store the result.

        The edge list is symlinked as ``edges.txt`` into a temporary
        directory and COPRA is started there with ``-w -v <v>`` plus the
        optional switches below.  ``clusters-edges.txt`` is parsed
        afterwards: one cluster of space separated integer node ids per line.

        :param data: dataset object providing ``file_edges``, ``name``,
            ``is_directed()``, ``is_weighted()`` and ``to_edgelist()``.
        :param v: value of the ``-v`` option.
        :param v1: first value of ``-vs``; must be given together with ``v2``.
        :param v2: second value of ``-vs``; must be given together with ``v1``.
        :param prop: value of ``-prop`` when not ``None``.
        :param repeat: value of ``-repeat`` when not ``None``.
        :param mo: pass ``-mo`` when truthy.
        :param nosplit: pass ``-nosplit`` when truthy.
        :param extrasimplify: pass ``-extrasimplify`` when truthy.
        :param q: pass ``-q`` when truthy.
        :param seed: not used by the program; only recorded in the result
            parameters (a log line is emitted when it is given).
        :return: ``self``; the result dict is kept in ``self.result`` and
            handed to ``save_result``.
        :raises AssertionError: if only one of ``v1`` / ``v2`` is given.
        :raises Exception: if the program exits with a non-zero status.
        """
        assert (v1 is None) == (v2 is None), \
            "v1 and v2 must be given together"
        # Spelled out instead of ``params = locals()`` + ``del``: the mapping
        # returned by locals() must not be modified and, before Python 3.13,
        # is refreshed with every local (including itself) under a tracer.
        params = {
            'v': v,
            'v1': v1,
            'v2': v2,
            'prop': prop,
            'repeat': repeat,
            'mo': mo,
            'nosplit': nosplit,
            'extrasimplify': extrasimplify,
            'q': q,
            'seed': seed,
        }
        if seed is not None:
            self.logger.info("seed ignored")
        # Guard is switched off on purpose by the trailing ``and False``.
        if (data.is_directed() or data.is_weighted()) and False:
            raise Exception(
                "only undirected and unweighted graph is supported")
        if not utils.file_exists(data.file_edges):
            data.to_edgelist()

        with utils.TempDir() as tmp_dir:
            pajek = os.path.join(tmp_dir, 'edges.txt')
            utils.remove_if_file_exit(pajek)
            os.symlink(data.file_edges, pajek)
            cmd = "{} -cp {} COPRA {} -w -v {}".format(
                utils.get_java_command(),
                config.get_OSLOM_prog('copra', data.is_directed()), pajek, v)
            if v1 is not None and v2 is not None:
                # Leading space is required: without it "-vs" is glued onto
                # the value of "-v".
                cmd += " -vs {} {}".format(v1, v2)
            if prop is not None:
                cmd += ' -prop {}'.format(prop)
            if repeat is not None:
                cmd += ' -repeat {}'.format(repeat)
            # Boolean switches are tested for truthiness; ``is not None``
            # was always true for the ``False`` defaults and forced them on.
            for switch, enabled in (('-mo', mo),
                                    ('-nosplit', nosplit),
                                    ('-extrasimplify', extrasimplify),
                                    ('-q', q)):
                if enabled:
                    cmd += ' ' + switch

            self.logger.info("Running " + cmd)

            timecost, status = utils.timeit(
                lambda: utils.shell_run_and_wait(cmd, tmp_dir))
            if status != 0:
                raise Exception(
                    "Run command with error status code {}".format(status))

            outputfile = os.path.join(tmp_dir, 'clusters-edges.txt')
            with open(outputfile) as f:
                members = [[int(node) for node in row.strip().split(" ")]
                           for row in f]

        clusters = dict(enumerate(members))
        self.logger.info("Made %d clusters in %f seconds" %
                         (len(clusters), timecost))

        result = {
            'runname': self.name,
            'params': params,
            'dataname': data.name,
            'meta': self.get_meta(),
            'timecost': timecost,
            'clusters': clusters,
        }
        save_result(result)
        self.result = result
        return self
 def testRandomDataset6(self):
     """Regenerate the anyscan file of the unweighted undirected fixture.

     The existing ``file_anyscan`` is removed, then the value returned by
     ``to_anyscan()`` is printed.
     """
     dataset = self.graph_unweighted_undirect
     utils.remove_if_file_exit(dataset.file_anyscan)
     anyscan_result = dataset.to_anyscan()
     print(anyscan_result)
    def __init__(self,
                 name=None,
                 description="",
                 groundtruthObj=None,
                 edgesObj=None,
                 directed=False,
                 weighted=False,
                 overide=False,
                 additional_meta=None,
                 is_edge_mirrored=False):
        """Create a dataset and, when it is named, make sure it is stored locally.

        :param name: dataset name; when falsy no file paths are derived and
            nothing is written to disk.
        :param description: stored as ``self.description``.
        :param groundtruthObj: handed to ``self.set_ground_truth``.
        :param edgesObj: handed to ``self.set_edges``; must not be ``None``.
        :param directed: stored as ``self.directed``.
        :param weighted: stored as ``self.weighted``.
        :param overide: when true, an existing dataset directory is removed
            instead of raising.
        :param additional_meta: stored as ``self.additional_meta``.
        :param is_edge_mirrored: stored as ``self.is_edge_mirrored``.
        :raises AssertionError: if ``edgesObj`` is ``None``.
        :raises Exception: if the dataset directory already exists,
            ``overide`` is false and the supplied edges/ground truth are not
            both persistent.
        """
        assert edgesObj is not None
        self.name = name
        self.description = description
        self.additional_meta = additional_meta
        self.logger = utils.get_logger("{}:{}".format(
            type(self).__name__, self.name))
        self.directed = directed
        self.weighted = weighted
        self.is_edge_mirrored = is_edge_mirrored

        self.parq_edges = None

        if name:
            assert name
            # Locations of the per-format files inside the dataset directory.
            self.file_edges = config.get_data_file_path(self.name, 'edges.txt')
            self.file_pajek = config.get_data_file_path(self.name, 'pajek.txt')
            self.file_hig = config.get_data_file_path(self.name, 'pajek.hig')
            self.file_scanbin = config.get_data_file_path(self.name, 'scanbin')
            self.file_anyscan = config.get_data_file_path(
                self.name, 'anyscan.txt')
            self.file_snap = config.get_data_file_path(self.name, 'snap.bin')
            self.file_mcl_mci = config.get_data_file_path(self.name, 'mcl.mci')
            self.file_mcl_tab = config.get_data_file_path(self.name, 'mcl.tab')
            self.file_topgc = config.get_data_file_path(self.name, 'topgc.txt')
            self.file_mirror_edges = config.get_data_file_path(
                self.name, 'edges_mirror.txt')

            # NOTE(review): a weighted dataset reuses ``file_edges`` as its
            # unweighted edge file while an unweighted one gets a separate
            # file -- this looks inverted; confirm against the users of
            # ``file_unweighted_edges``.
            if self.is_weighted():
                self.file_unweighted_edges = self.file_edges
            else:
                self.file_unweighted_edges = config.get_data_file_path(
                    self.name, 'unweighted_edges.txt')

        # Both setters run before the persistence check below, which queries
        # the state they establish.
        self.set_ground_truth(groundtruthObj)
        self.set_edges(edgesObj)

        if name:
            is_persistent = self.is_edges_persistent(
            ) and self.is_ground_truth_persistent()
            self.home = config.get_data_file_path(name, create=False)
            if utils.file_exists(self.home):
                if overide:
                    utils.remove_if_file_exit(self.home, is_dir=True)
                elif is_persistent:
                    # Existing directory is accepted as is.
                    pass
                else:
                    raise Exception(
                        "Dataset {} exists at {}. Use overide=True or load it locally."
                        .format(name, self.home))

            if not is_persistent:
                # Results stored under this dataset name are removed before
                # the dataset is written.
                utils.remove_if_file_exit(config.get_result_file_path(
                    self.name),
                                          is_dir=True)
                utils.create_dir_if_not_exists(self.home)
                self.persistent()
                self.update_meta()