Example #1
def main():
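    """Sanity-check the pickled clustering trees: for every graph and
    clustering, load the stored tree list, rebuild the tree, and report how
    many tree nodes fail the consistency check."""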
    machine_name, outdir = get_machine_name_and_outdir()
    names = [
        'karate', 'football', 'polbooks', 'wisconsin', 'texas', 'film',
        'cornell', 'cora', 'citeseer', 'airports', 'polblogs', 'chameleon',
        'pubmed', 'squirrel'
    ]

    clusterings = [
        'cond', 'spectral', 'leiden', 'louvain', 'infomap', 'labelprop',
        'random', 'leadingeig', 'consensus'
    ][:-1]

    for name in names:
        g, _ = get_graph(name, basedir=outdir)
        make_dirs(outdir=outdir, name=name)
        for clustering in clusterings:
            tree = load_pickle(
                join(outdir, 'output', 'trees', name,
                     f'{clustering}_list.pkl'))
            if tree is None: continue
            root = create_tree(tree)
            faulty_tnodes = tree_okay(root=root, g=g)
            if faulty_tnodes > 0:
                print(f'{name}\t{clustering}\t{faulty_tnodes:,d} errors')

    return
Example #2
def autoencoders(outdir, name, model):
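    """Two-pass runner for the graph autoencoder baselines: the first call fits
    the model and pickles the thresholded probability matrix; a later call
    loads that matrix, samples 10 graphs from it (threshold 0.5), copies node
    attributes over from the input graph, and pickles the generated graphs."""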
    model_path = join(outdir, 'output', 'other_models', 'autoencoders')
    # if not Path(model_path).exists():
    #     os.makedirs(model_path)
    model_path = join(model_path, f'{name}_{model}_mat.pkl')
    graphs_path = join(outdir, 'output', 'graphs', name, f'{model}_10.pkl')

    # if Path(graphs_path).exists():
    #     return
    #
    input_g, _ = get_graph(name, basedir=outdir)
    if Path(model_path).exists():
        thresh_mat = load_pickle(model_path)
        graphs = []
        ns, ms = [], []
        for _ in range(10):
            g = get_graph_from_prob_matrix(thresh_mat, thresh=0.5)
            nx.set_node_attributes(g,
                                   name='value',
                                   values=nx.get_node_attributes(
                                       input_g, 'value'))
            ns.append(g.order())
            ms.append(g.size())
            graphs.append(g)
        print('Avg n, m', np.round(np.mean(ns), 3), np.round(np.mean(ms), 3))
        dump_pickle(graphs, graphs_path)
        return

    from other_models.autoencoders.fit import fit_model

    _, thresh_mat = fit_model(g=input_g, model_name=model)

    dump_pickle(thresh_mat, model_path)
    return
Example #3
def make_combined_graph_dfs(basedir, names, clusterings, bipartite=False):
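    """Build a combined stats DataFrame by scoring every generated-graph
    pickle (baseline models and AVRG variants) against the original graph via
    make_graph_df; per-name results are cached as CSVs under stats/temp."""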
    dfs = []
    mus = range(3, 11)
    # mus = range(5, 8)

    possible_models = ('SBM', 'DC-SBM', 'CL', 'AGM', 'NetGAN', 'CELL')
    for name in names:  # add tqdm
        temp_fname = f'{basedir}/stats/temp/_graph_df_{name}.csv'
        if Path(temp_fname).exists():
            print(f'Skipping {name!r}')
            continue

        sub_df = []
        orig_graph, attr_name = get_graph(name, basedir=basedir)

        for fname in tqdm(glob.glob(f'{basedir}/output/graphs/{name}/*'),
                          desc=f'{name}',
                          ncols=100):
            path = Path(fname)

            if path.stem.startswith(possible_models) or 'ae' in path.stem:
                pattern = r'(.*)_(\d+)'
                m = re.match(pattern, path.stem)
                if m is None: continue
                grammar_type, _ = m.groups()
                mu = np.nan
                clustering = np.nan

            elif path.stem.startswith('AVRG'):
                pattern = r'(.*)_(\w+)_(\d+)_(\d+)'
                m = re.match(pattern, path.stem)
                if m is None:
                    continue
                grammar_type, clustering, mu, _ = m.groups()
                mu = int(mu)
                if mu not in mus or clustering not in clusterings:
                    continue
                tqdm.write(path.stem)
            else:
                # unrecognized filename pattern; warn and skip it
                tqdm.write(f'Skipping invalid model: {path.stem!r}')
                continue

            try:
                df = make_graph_df(name, fname, orig_graph, mu, clustering,
                                   attr_name, grammar_type, bipartite, basedir)
            except Exception as e:
                print(f'ERROR in graph df! {e}')
                continue
            dfs.append(df)
            sub_df.append(df)
            # df.to_csv(temp_fname, index=False)
        if len(sub_df) > 0:
            temp_df = pd.concat(sub_df, ignore_index=True)
            temp_df.to_csv(temp_fname)
            print(f'Writing {name!r} to {temp_fname!r}')

    # os.remove(temp_df_filename)
    graph_df = pd.concat(dfs, ignore_index=True)
    return graph_df
Example #4
def old_main():
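    """Old driver: generate graphs for each dataset with the baseline models
    (autoencoders, NetGAN/CELL, SBM/DC-SBM/CL/AGM); which branches actually
    run is controlled by the `models` list."""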
    basedir = '/data/ssikdar/Attributed-VRG'
    names = [
        'polbooks', 'football', 'wisconsin', 'texas', 'cornell', 'cora',
        'citeseer', 'airports', 'polblogs', 'film', 'chameleon', 'squirrel'
    ][:-3]
    models = ['gcn_ae', 'gcn_vae']

    args = []

    for name in names:
        input_g, _ = get_graph(gname=name, basedir=basedir)
        for model in models:
            try:
                if model in ('netgan', 'cell'):
                    netgan_cell_runner(outdir=basedir,
                                       model=model,
                                       name=name,
                                       input_g=input_g)
                elif 'ae' in model:
                    autoencoders(outdir=basedir, name=name, model=model)
                elif model in ('SBM', 'DC-SBM', 'CL', 'AGM'):
                    graphs = get_graphs_from_models(input_graph=input_g,
                                                    num_graphs=10,
                                                    name=name,
                                                    model=model,
                                                    outdir=basedir)
                    print(graphs)
            except Exception as e:
                print(name, model, e)

    exit(0)
Example #5
def batch_grammar_runner(names,
                         clusterings,
                         outdir,
                         mus=None,
                         extract_types=None,
                         num_workers=8,
                         shuffle=False):
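    """Queue one grammar-extraction job per (name, clustering, grammar type,
    extract type, mu) combination and run get_grammars in parallel."""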
    # grammar_types_1 = ['VRG', 'AVRG']
    grammar_types = ['AVRG']
    if extract_types is None:
        extract_types = ['mu_random', 'mu_level', 'all_tnodes']
    if mus is None:
        mus = range(3, 11)
    # mus = [5, 6]
    use_cluster_pickle = True
    use_grammar_pickle = True
    count = 1
    args = []
    write_pickle = True

    for name in names:
        input_graph, attr_name = get_graph(name, basedir=outdir)

        for clustering in clusterings:
            for grammar_type in grammar_types:
                for extract_type in extract_types:
                    for mu in mus:
                        extract = extract_type.replace('_', '-')
                        if extract_type == 'all_tnodes':
                            mu = -1
                        grammar_filename = join(
                            outdir, 'output', 'grammars', name,
                            f'{grammar_type}_{extract}_{clustering}_{mu}.pkl')

                        arg = (name, grammar_type, extract_type, clustering,
                               mu, input_graph, use_grammar_pickle,
                               use_cluster_pickle, attr_name, outdir, count,
                               grammar_filename, write_pickle)
                        args.append(arg)
                        if extract_type == 'all_tnodes':  # here mu is not important for all_tnodes
                            break
    print(args[:3])
    if shuffle:
        random.shuffle(args)
    try:
        parallel_async(func=get_grammars, args=args, num_workers=num_workers)
    except Exception as e:
        print(e)

    ## get_grammars(name: str,  grammar_type: str, extract_type: str, clustering: str, mu: int, input_graph: nx.Graph,
    # use_grammar_pickle: bool, use_cluster_pickle: bool, attr_name: str, outdir: str, count: int = 1,
    # grammar_filename: str = '', write_pickle: bool = True, list_of_list_clusters=None)
    ##

    return
Example #6
def avrg_link_pred():
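    """For each dataset, extract (or reload) an NCE grammar with mu=5,
    mu-random extraction, and Leiden clustering, then generate 10 graphs from
    it and print their sizes."""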
    basedir = '/data/ssikdar/Attributed-VRG/'
    names = [
        'polbooks',
        'football',
        'wisconsin',
        'texas',
        'cornell',
        'cora',
        'citeseer',
    ]
    for name in names:
        input_graph, _ = get_graph(name, basedir=basedir)
        extract_type = 'mu_random'
        mu = 5
        clustering = 'leiden'
        grammar_filename = join(basedir, 'output/grammars', name,
                                f'NCE_mu-random_{clustering}_{mu}.pkl')
        nce = get_grammars(name=name,
                           grammar_type='NCE',
                           extract_type=extract_type,
                           clustering=clustering,
                           attr_name='value',
                           input_graph=input_graph,
                           mu=mu,
                           outdir=basedir,
                           use_grammar_pickle=True,
                           use_cluster_pickle=False,
                           grammar_filename=grammar_filename)[0]

        print(nce)
        # AVRG-regular_mu-random_louvain_8_10.pkl
        graphs_filename = join(basedir, 'output/graphs', name,
                               f'NCE_mu-random_{clustering}_{mu}_10.pkl')
        nce_graphs = generate_graphs(basedir=basedir,
                                     extract_type=extract_type,
                                     gen_type='NCE',
                                     grammar=nce,
                                     graphs_filename=graphs_filename,
                                     name=name,
                                     num_graphs=10,
                                     use_pickle=True)

        for out_g in nce_graphs:
            print(f'n={out_g.order():,d}, m={out_g.size():,d}, {type(out_g)}')
        print()
    return
Example #7
def batch_cluster_runner(names, outdir, clusterings=None):
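    """Run get_clustering in parallel for every (graph, clustering) pair."""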
    if clusterings is None:
        clusterings = [
            'cond', 'spectral', 'leiden', 'louvain', 'infomap', 'labelprop',
            'random', 'leading_eig', 'consensus'
        ][:-1]
    use_pickle = True
    args = []

    for name in names:
        g, _ = get_graph(name, basedir=outdir)
        g.name = name
        for clustering in clusterings:
            args.append((g, outdir, clustering, use_pickle, ''))
    random.shuffle(args)
    parallel_async(func=get_clustering, args=args)
    return
Example #8
if __name__ == '__main__':
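    # Link-prediction experiments: for each dataset and model, evaluate over
    # `trials` random splits (10% test, 5% validation) and cache per-model
    # results as CSVs under stats/link_pred.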
    basedir = '/data/ssikdar/Attributed-VRG'
    names = [
        'polbooks', 'football', 'wisconsin', 'texas', 'cornell', 'cora',
        'citeseer', 'airports', 'polblogs', 'film', 'chameleon', 'squirrel'
    ][:5]
    models = [
        'AVRG', 'gcn_ae', 'gcn_vae', 'linear_ae', 'linear_vae', 'jaccard',
        'adamic-adar'
    ]  # 'netgan', 'cell', ]
    # names = ['citeseer']
    models = ['cell']

    for name in names:
        name_fname = join(basedir, 'stats/link_pred', f'{name}.csv')
        orig_g, att_name = get_graph(name, basedir=basedir)
        model_dfs = []
        trials = 10
        test_frac, val_frac = 0.1, 0.05
        for model in models:
            model_rows = []
            model_fname = join(basedir, 'stats/link_pred',
                               f'{name}_{model}.csv')
            if Path(model_fname).exists():
                # per-model results are cached as CSVs; reload and reuse them
                model_df = pd.read_csv(model_fname)
                model_dfs.append(model_df)
                continue
            for trial in range(1, trials + 1):
                splits_filename = join(
                    basedir, 'output', 'splits',
                    f'{name}_{int(test_frac*100)}_{int(val_frac*100)}_{trial}.pkl'
                )
Example #9
def batch_generator_runner(names,
                           basedir,
                           clusterings,
                           mus=None,
                           extract_types=None,
                           save_snapshots=False,
                           num_workers=10,
                           shuffle=False):
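    """For every pickled AVRG grammar that matches the requested clusterings,
    mus, and extract types, queue generation jobs (regular, fancy, and greedy
    variants with alpha in {0, 0.5, 1}) and run generate_graphs in parallel."""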
    num_graphs = 10  # we need 1 graph to chart the progress  # TODO: change this in the future?
    use_pickle = True
    if mus is None:
        mus = list(range(3, 11)) + [-1]
    if extract_types is None:  # hyphenated to match grammar.extract_type.replace('_', '-') below
        extract_types = ['mu-random', 'mu-level', 'all-tnodes']

    args = []
    for name in names:
        input_graph, attr_name = get_graph(name, basedir=basedir)
        # skip snapshots for large graphs without turning them off for every later name
        snapshots = save_snapshots and input_graph.size() <= 3_000

        mix_dict = get_mixing_dict(input_graph, attr_name=attr_name)
        inp_deg_ast = nx.degree_assortativity_coefficient(input_graph)
        inp_attr_ast = nx.attribute_assortativity_coefficient(
            input_graph, attr_name)

        for grammar_filename in glob(f'{basedir}/output/grammars/{name}/*'):
            grammar = load_pickle(grammar_filename)
            extract_type = grammar.extract_type.replace('_', '-')
            if grammar.mu not in mus or grammar.clustering not in clusterings or extract_type not in extract_types:
                continue
            print(Path(grammar_filename).stem)

            if isinstance(grammar, AttributedVRG):
                alpha = None  # regular / fancy generation does not use alpha
                for gen_type, fancy in zip(('AVRG-regular', 'AVRG-fancy'),
                                           (False, True)):
                    graphs_filename = f'{basedir}/output/graphs/{name}/{gen_type}_{extract_type}_{grammar.clustering}_{grammar.mu}_{num_graphs}.pkl'
                    args.append((name, grammar, num_graphs, extract_type,
                                 gen_type, basedir, graphs_filename, mix_dict,
                                 attr_name, fancy, inp_deg_ast, inp_attr_ast,
                                 use_pickle, snapshots, alpha))

                for alpha, gen_type in zip(
                    (0, 0.5, 1),
                    ('AVRG-greedy-attr', 'AVRG-greedy-50', 'AVRG-greedy-deg')):
                    fancy = None
                    graphs_filename = f'{basedir}/output/graphs/{name}/{gen_type}_{extract_type}_{grammar.clustering}_{grammar.mu}_{num_graphs}.pkl'
                    args.append((name, grammar, num_graphs, extract_type,
                                 gen_type, basedir, graphs_filename, mix_dict,
                                 attr_name, fancy, inp_deg_ast, inp_attr_ast,
                                 use_pickle, snapshots, alpha))

            else:
                continue  # skip VRGs
                # assert isinstance(grammar, VRG)
                # grammar_type = 'VRG'
                # fancy = None
                # graphs_filename = f'{basedir}/output/graphs/{name}/{grammar_type}_{grammar.clustering}_{grammar.mu}_{num_graphs}.pkl'
                # args.append((name, grammar, num_graphs, grammar_type, outdir, mix_dict, attr_name, fancy,
                #              inp_deg_ast, inp_attr_ast, use_pickle, save_snapshots, alpha, graphs_filename))
    if shuffle:
        random.shuffle(args)
    try:
        parallel_async(func=generate_graphs,
                       args=args,
                       num_workers=num_workers)
    except Exception as e:
        print(e)
    return
Example #10
def make_gen_df(base_path, names=None, clusterings=None, num_samples=None):
    """
    num_samples is for number of samples of generated graph
    """
    rows = []
    cols = [
        'snap_id', 'name', 'model', 'clustering', 'attr_name', 'orig_n',
        'orig_m', 'orig_deg_ast', 'orig_attr_ast', 'mu', 'n', 'm', 't',
        'term_graph', 'term_n', 'term_m', 'term_degree_js', 'term_pagerank_js',
        'term_lambda_dist', 'term_deg_ast', 'term_attr_ast'
    ]

    if names is None:
        names = [
            'karate', 'football', 'polbooks', 'us-flights', 'cora', 'citeseer',
            'polblogs', 'pubmed'
        ][:-1]

    # mus = [5, 6]
    mus = range(3, 11)
    snap_id = 0  # snap id is the track of generated graphs - 10

    if clusterings is None:
        clusterings = ['cond', 'leiden', 'spectral', 'consensus']

    for name in names:
        orig_graph, attr_name = get_graph(name, basedir=base_path)
        orig_deg_ast = nx.degree_assortativity_coefficient(orig_graph)
        orig_att_ast = nx.attribute_assortativity_coefficient(
            orig_graph, attr_name)

        orig_gstats = GraphStats(orig_graph)

        for gen_filename in glob.glob(
                f'{base_path}/output/generators/{name}/*'):
            path = Path(gen_filename)
            gen: RandomGenerator = load_pickle(
                path
            )  # all gen snapshots has 10 different generations - we need maybe just 1
            if gen is None: continue

            print(path.stem, end='\t', flush=True)
            pattern = r'(.*)_(\w+)_(\d+)_(\d+)'
            m = re.match(pattern, path.stem)
            if m is None: continue
            grammar_type, clustering, mu, _ = m.groups()
            mu = int(mu)
            if mu not in mus or clustering not in clusterings: continue

            generated_graph_snapshots = gen.all_gen_snapshots[0]
            del gen  # delete the object to save memory

            if num_samples is None:
                num_samples = len(generated_graph_snapshots)

            indices = sorted(
                set(
                    np.linspace(0,
                                len(generated_graph_snapshots) - 1,
                                num_samples,
                                dtype=int,
                                endpoint=True)))

            for t in indices:
                graph = generated_graph_snapshots[t]
                terminal_graph = filter_terminal_graph(graph)
                terminal_graph = un_nest_attr_dict(terminal_graph)
                row = {col: np.nan for col in cols}

                row.update(
                    dict(snap_id=snap_id,
                         name=name,
                         model=grammar_type,
                         clustering=clustering,
                         attr_name=attr_name,
                         orig_n=orig_graph.order(),
                         orig_m=orig_graph.size(),
                         orig_deg_ast=orig_deg_ast,
                         orig_attr_ast=orig_att_ast,
                         mu=mu,
                         t=t,
                         n=graph.order(),
                         m=graph.size(),
                         term_graph=terminal_graph,
                         term_n=terminal_graph.order(),
                         term_m=terminal_graph.size()))

                if terminal_graph.size() > 0:
                    gen_gstats = GraphStats(terminal_graph)

                    gpc = GraphPairCompare(orig_gstats, gen_gstats)
                    row.update(
                        term_degree_js=gpc.degree_js(),
                        term_pagerank_js=gpc.pagerank_js(),
                        term_lambda_dist=gpc.lambda_dist(),
                        term_deg_ast=nx.degree_assortativity_coefficient(
                            terminal_graph),
                        term_attr_ast=nx.attribute_assortativity_coefficient(
                            terminal_graph, attr_name),
                        deg_mix_dist_dict=gpc.deg_mixing_dist_dict(),
                        attr_mix_dist_dict=gpc.attr_mixing_dist_dict())
                rows.append(row)
            temp_df = pd.DataFrame(rows)
            temp_df.to_csv(f'{base_path}/stats/temp_gen_df.csv', index=False)
    return pd.DataFrame(rows)
Example #11
def make_all_graph_dfs_new(basedir,
                           name,
                           clusterings,
                           models=None,
                           final=False,
                           slow_stats=False,
                           mus=None,
                           extract_types=None):
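    """Compute graph-statistics DataFrames for every generated-graph pickle of
    `name` that matches the requested models, clusterings, mus, and extract
    types; rows are appended to a per-name CSV (flushed every `write_every`
    files) so interrupted runs can resume."""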
    dfs = []

    if mus is None:
        mus = list(range(3, 11)) + [-1]

    if extract_types is None:
        extract_types = ['mu-random', 'mu-level', 'all-tnodes']

    if models is None:
        models = ['AVRG']

    write_every = 10

    for model in models:
        if final:
            temp_fname = f'{basedir}/stats/final/graphs/{name}_graph_df_{model}.csv'
        else:
            temp_fname = f'{basedir}/stats/temp/graphs/_graph_df_{name}.csv'

        existing_df = pd.read_csv(temp_fname) if Path(
            temp_fname).exists() else None

        orig_graph, attr_name = get_graph(name, basedir=basedir)
        orig_gstats = GraphStats(orig_graph)
        i = 0

        for fname in tqdm(
                glob.glob(f'{basedir}/output/graphs/{name}/{model}*'),
                desc=f'{name}',
                ncols=100):
            path = Path(fname)
            if not path.stem.startswith(tuple(models)):
                continue

            if model == 'AVRG':
                pattern = r'(.+)_(.+)_(.+)_(.+)_(\d+).*'
                m = re.match(pattern, path.stem)
                if m is None:
                    continue
                gen_type, extract_type, clustering, mu, _ = m.groups()
                mu = int(mu)

                if clustering not in clusterings or mu not in mus or extract_type not in extract_types:
                    continue

                ## check if the row exists already
                if existing_df is not None:
                    if not existing_df[(existing_df.name == name)
                                       & (existing_df.model == model) &
                                       (existing_df.gen_type == gen_type) &
                                       (existing_df.extract_type
                                        == extract_type) &
                                       (existing_df.clustering == clustering) &
                                       (existing_df.mu == mu)].empty:
                        continue

            tqdm.write(path.stem)

            df = make_graph_df_new(basedir=basedir,
                                   fname=fname,
                                   name=name,
                                   orig_gstats=orig_gstats,
                                   model=model,
                                   slow_stats=slow_stats)
            # append df to the running per-name DataFrame
            if existing_df is None:
                existing_df = df
            else:
                existing_df = pd.concat([existing_df, df], ignore_index=True)

            if (i > 0) and (i % write_every == 0):
                tqdm.write(f'Writing partial results {name!r} {model!r}!')
                existing_df.to_csv(temp_fname, index=False)
            i += 1

        if existing_df is not None:
            # write existing df again
            existing_df.to_csv(temp_fname, index=False)
            print(f'Writing {name!r} {model!r} to {temp_fname!r}')
    return
Example #12
def make_grammar_df(base_path, names, clusterings, overwrite):
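    """Build a DataFrame of grammar-level stats (tree height, branch factor,
    clustering cost, graph and grammar description lengths) for every pickled
    AVRG grammar; tree stats are cached in root_dict.pkl and per-name rows in
    stats/temp CSVs."""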
    cost_dict_pickle_fname = join(base_path, 'input', 'cost_dict.pkl')
    root_dict_pickle_fname = join(base_path, 'input', 'root_dict.pkl')
    cost_dict = load_pickle(cost_dict_pickle_fname)
    root_dict = load_pickle(root_dict_pickle_fname)
    if root_dict is None:
        root_dict = {}
        recompute = True
    else:
        recompute = False
        for name in names:
            if name not in root_dict:
                recompute = True
                continue
            for clustering in clusterings:
                if clustering not in root_dict[name]:
                    recompute = True

    if recompute:
        for name in tqdm(names, desc='Name'):
            orig_graph, attr_name = get_graph(name, basedir=base_path)
            if name not in root_dict:
                root_dict[name] = {}
            for clustering in tqdm(clusterings, desc='Clustering',
                                   leave=False):
                if clustering in root_dict[name]:
                    continue
                root = load_pickle(
                    f'{base_path}/output/trees/{name}/{clustering}_list.pkl')
                if root is None:
                    continue
                if isinstance(root, list):
                    root = create_tree(root)
                ht, avg_branch_factor, _ = get_tree_stats(g=orig_graph,
                                                          root=root,
                                                          cost=False)
                dc = cost_dict[name][clustering]
                root_dict[name][clustering] = ht, avg_branch_factor, dc
            dump_pickle(root_dict, root_dict_pickle_fname)

    print(root_dict)
    dl_dict = {}
    dfs = []

    for name in names:
        temp_fname = f'{base_path}/stats/temp/_grammar_df_{name}.csv'
        if Path(temp_fname).exists() and not overwrite:
            print(f'Skipping {name!r}')
            continue

        orig_graph, attr_name = get_graph(name, basedir=base_path)
        dl_dict[name] = graph_mdl(orig_graph, attributed=True)
        rows = []

        print('\n\n', name)
        files = glob.glob(f'{base_path}/output/grammars/{name}/*.pkl')
        for fname in tqdm(files, total=len(files), desc=f'{name}'):
            path = Path(fname)
            pattern = r'(\w+)_(.+)_(\w+)_(.+).*'
            m = re.match(pattern, path.stem)
            if m is None:
                continue
            grammar_type, extract_type, clustering, mu = m.groups()
            if clustering not in clusterings:  # skip over clusterings we dont care about
                continue

            if grammar_type.startswith('VRG'):  # skip over regular VRGs
                continue

            tqdm.write(f'{grammar_type}, {extract_type}, {clustering}, {mu}')
            ht, avg_branch_factor, dc = root_dict[name][clustering]

            vrg = load_pickle(fname)
            if vrg is None:
                continue
            graph_dl = dl_dict[name]

            row = dict(name=name,
                       orig_n=orig_graph.order(),
                       orig_m=orig_graph.size(),
                       grammar_type=grammar_type,
                       extract_type=vrg.extract_type,
                       mu=int(mu),
                       clustering=clustering,
                       cost=dc,
                       branch_factor=avg_branch_factor,
                       height=ht,
                       graph_dl=graph_dl,
                       num_rules=vrg.num_rules,
                       unique_rules=len(vrg.unique_rule_list),
                       grammar_dl=vrg.cost)
            rows.append(row)
        temp_df = pd.DataFrame(rows)
        temp_df.to_csv(temp_fname, index=False)
        dfs.append(temp_df)

    # for name in names:
    #     temp_fname = f'{base_path}/stats/_grammar_df_{name}.csv'
    #     if Path(temp_fname).exists():
    #         os.remove(temp_fname)
    return pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()