Exemplo n.º 1
0
def generate_karate_embedding():
    """
    Run HE.main on the karate walks file, plot the input and output embeddings and write both to csv
    :return: the path to the csv file holding the input embedding
    """
    import visualisation
    labels = np.array(utils.read_pickle('../../local_resources/karate/y.p')['cat'])
    dimension = 2  # dimensionality of the embedding
    params = Params('../../local_resources/karate/walks_n1_l10.csv',
                    batch_size=4,
                    embedding_size=dimension,
                    neg_samples=5,
                    skip_window=5,
                    num_pairs=1500,
                    statistics_interval=0.1,
                    initial_learning_rate=1.0,
                    save_path='../../local_resources/tf_logs/run4/',
                    epochs=10,
                    concurrent_steps=1)

    # the returned path is timestamped before HE.main is called
    path = '../../local_resources/karate/embeddings/tf_Win_polar_{}.csv'.format(utils.get_timestamp())

    embedding_in, embedding_out = HE.main(params)

    # one figure per embedding matrix, each with its own timestamp
    figure_template = '../../results/karate/figs/poincare_polar_{}_{}.pdf'
    for tag, embedding in (('Win', embedding_in), ('Wout', embedding_out)):
        visualisation.plot_poincare_embedding(embedding, labels,
                                              figure_template.format(tag, utils.get_timestamp()))

    frame_in = pd.DataFrame(data=embedding_in, index=range(embedding_in.shape[0]))
    frame_in.to_csv(path, sep=',')
    frame_out = pd.DataFrame(data=embedding_out, index=range(embedding_out.shape[0]))
    frame_out.to_csv(
        '../../local_resources/karate/embeddings/tf_Wout_polar_{}.csv'.format(utils.get_timestamp()),
        sep=',')
    return path
Exemplo n.º 2
0
def generate_political_blogs_embedding():
    """
    Run HCE.main on the political blogs walks file, plot and save the input and output embeddings,
    then call political_blogs_scenario on the saved input embedding
    :return: the path to the csv file holding the input embedding
    """
    import visualisation
    started = datetime.datetime.now()
    base = '../../local_resources/political_blogs'
    y = utils.read_pickle(base + '/y.p')
    dimension = 2  # dimensionality of the embedding
    params = Params(base + '/walks_n1_l10.csv',
                    batch_size=4,
                    embedding_size=dimension,
                    neg_samples=5,
                    skip_window=5,
                    num_pairs=1500,
                    statistics_interval=10.0,
                    initial_learning_rate=1.0,
                    save_path='../../local_resources/tf_logs/polblogs/',
                    epochs=5,
                    concurrent_steps=4)

    # the returned path is timestamped before HCE.main is called
    path = '{}/embeddings/Win_{}.csv'.format(base, utils.get_timestamp())

    embedding_in, embedding_out = HCE.main(params)

    # one figure per embedding matrix, each with its own timestamp
    figure_template = '../../results/political_blogs/figs/poincare_polar_{}_{}.pdf'
    for tag, embedding in (('Win', embedding_in), ('Wout', embedding_out)):
        visualisation.plot_poincare_embedding(embedding, y,
                                              figure_template.format(tag, utils.get_timestamp()))

    frame_in = pd.DataFrame(data=embedding_in, index=np.arange(embedding_in.shape[0]))
    frame_in.to_csv(path, sep=',')
    frame_out = pd.DataFrame(data=embedding_out, index=np.arange(embedding_out.shape[0]))
    frame_out.to_csv('{}/embeddings/Wout_{}.csv'.format(base, utils.get_timestamp()), sep=',')
    print('political blogs sample generated in: ', datetime.datetime.now() - started)

    political_blogs_scenario(path)
    return path
Exemplo n.º 3
0
def generate_blogcatalog_cartesian_embedding():
    """
    Run HCE.main on the blogcatalog walks file with a 128 dimensional embedding, plot the input and
    output embeddings and write both to csv
    :return: the path to the csv file holding the input embedding
    """
    import visualisation
    started = datetime.datetime.now()
    base = '../../local_resources/blogcatalog'
    y = utils.read_pickle(base + '/y.p')
    dimension = 128  # dimensionality of the embedding
    params = Params(base + '/p025_q025_d128_walks.csv',
                    batch_size=4,
                    embedding_size=dimension,
                    neg_samples=5,
                    skip_window=5,
                    num_pairs=1500,
                    statistics_interval=10,
                    initial_learning_rate=0.2,
                    save_path='../../local_resources/tf_logs/blogcatalog_cartesian/final_throw1',
                    epochs=5,
                    concurrent_steps=12)

    # the returned path is timestamped before HCE.main is called
    path = '{}/embeddings/Win_cartesian_{}.csv'.format(base, utils.get_timestamp())

    embedding_in, embedding_out = HCE.main(params)

    # one figure per embedding matrix, each with its own timestamp
    figure_template = '../../results/blogcatalog/figs/poincare_{}_cartesian_{}.pdf'
    for tag, embedding in (('Win', embedding_in), ('Wout', embedding_out)):
        visualisation.plot_poincare_embedding(embedding, y,
                                              figure_template.format(tag, utils.get_timestamp()))

    frame_in = pd.DataFrame(data=embedding_in, index=np.arange(embedding_in.shape[0]))
    frame_in.to_csv(path, sep=',')
    frame_out = pd.DataFrame(data=embedding_out, index=np.arange(embedding_out.shape[0]))
    frame_out.to_csv('{}/embeddings/Wout_cartesian_{}.csv'.format(base, utils.get_timestamp()),
                     sep=',')
    print('blogcatalog cartesian embedding generated in: ', datetime.datetime.now() - started)
    return path
Exemplo n.º 4
0
def batch_size_scenario():
    """
    Generate embeddings using different batch sizes for the ~1000 vertex polblogs network, then
    evaluate every input embedding with run_detectors.run_all_datasets and write the scores and
    significance tests to csv
    :return: the path to the input embedding csv written for the last batch size
    """
    import visualisation
    s = datetime.datetime.now()
    y_path = '../../local_resources/political_blogs/y.p'
    x_path = '../../local_resources/political_blogs/X.p'
    y = utils.read_pickle(y_path)
    log_path = '../../local_resources/tf_logs/polblogs/'
    walk_path = '../../local_resources/political_blogs/walks_n1_l10.csv'
    size = 2  # dimensionality of the embedding
    batch_sizes = [1, 2, 4, 8, 16, 32, 64, 128]
    embeddings = []
    for batch_size in batch_sizes:
        params = Params(walk_path, batch_size=batch_size, embedding_size=size, neg_samples=5, skip_window=5,
                        num_pairs=1500,
                        statistics_interval=10.0,
                        initial_learning_rate=0.1, save_path=log_path, epochs=5, concurrent_steps=4)

        path = '../../local_resources/political_blogs/embeddings/Win_batch_{}_{}.csv'.format(
            batch_size, utils.get_timestamp())

        embedding_in, embedding_out = HCE.main(params)

        visualisation.plot_poincare_embedding(embedding_in, y,
                                              '../../results/political_blogs/figs/poincare_polar_Win_batch_{}_{}.pdf'.format(
                                                  batch_size, utils.get_timestamp()))
        visualisation.plot_poincare_embedding(embedding_out, y,
                                              '../../results/political_blogs/figs/poincare_polar_Wout_batch_{}_{}.pdf'.format(
                                                  batch_size, utils.get_timestamp()))
        df_in = pd.DataFrame(data=embedding_in, index=np.arange(embedding_in.shape[0]))
        df_in.to_csv(path, sep=',')
        df_out = pd.DataFrame(data=embedding_out, index=np.arange(embedding_out.shape[0]))
        df_out.to_csv(
            '../../local_resources/political_blogs/embeddings/Wout_batch_{}_{}.csv'.format(
                batch_size, utils.get_timestamp()),
            sep=',')
        # the elapsed time is measured from the start of the whole scenario, not per batch size
        print('political blogs embedding generated in: ', datetime.datetime.now() - s)
        embeddings.append(embedding_in)

    # only the labels returned by read_data are used for evaluation; the features are discarded
    _, y = utils.read_data(x_path, y_path, threshold=0)

    names = [[str(batch_size)] for batch_size in batch_sizes]
    n_folds = 10
    results = run_detectors.run_all_datasets(embeddings, y, names, classifiers, n_folds)
    all_results = utils.merge_results(results, n_folds)
    results, tests = utils.stats_test(all_results)
    # Each test table gets its own file name: previously both were written to
    # 'batch_size_pvalues<timestamp>.csv', so the second overwrote the first whenever the two
    # timestamps coincided.
    # NOTE(review): assumes tests is ordered (macro, micro) like results - confirm against utils.stats_test
    tests[0].to_csv('../../results/political_blogs/batch_size_macro_pvalues' + utils.get_timestamp() + '.csv')
    tests[1].to_csv('../../results/political_blogs/batch_size_micro_pvalues' + utils.get_timestamp() + '.csv')
    print('macro', results[0])
    print('micro', results[1])
    macro_path = '../../results/political_blogs/batch_size_macro' + utils.get_timestamp() + '.csv'
    micro_path = '../../results/political_blogs/batch_size_micro' + utils.get_timestamp() + '.csv'
    results[0].to_csv(macro_path, index=True)
    results[1].to_csv(micro_path, index=True)

    return path
Exemplo n.º 5
0
def simulated_tree_scenario(branching_factor, levels):
    """
    Build the files for a simulated tree, run HE.main on its walks and plot the result next to the
    deepwalk embedding read from disk
    :param branching_factor: passed to create_adj_mat and generate_y and used in the file names
    :param levels: passed to create_adj_mat and generate_y and used in the file names
    :return: the path to the csv file holding the input embedding
    """
    import visualisation
    folder = '../../local_resources/simulated_trees'
    deepwalk_path = folder + '/deepwalk_z{}_l{}.emd'.format(branching_factor, levels)
    walk_path = folder + '/walks_long_z{}_l{}.emd'.format(branching_factor, levels)
    emb_path = create_adj_mat(folder, branching_factor, levels)
    generate_simulated_tree(emb_path, walk_path, deepwalk_path)

    # the first row of the deepwalk file is skipped and the first column is used as the index
    deepwalk_emd = pd.read_csv(deepwalk_path, header=None, index_col=0, skiprows=1, sep=" ")

    # start time is captured here but is not reported anywhere in this function
    started = datetime.datetime.now()
    labels = generate_y(branching_factor, levels)

    dimension = 2  # dimensionality of the embedding
    params = Params(walk_path, batch_size=4, embedding_size=dimension, neg_samples=5, skip_window=5,
                    num_pairs=1500, statistics_interval=0.1, initial_learning_rate=1.0,
                    save_path='../../local_resources/tf_logs/sim_tree/', epochs=20, concurrent_steps=4)

    # the returned path is timestamped before HE.main is called
    path = '{}/embeddings/Win_{}.csv'.format(folder, utils.get_timestamp())

    # only the input embedding is plotted and saved; the output embedding is not used
    embedding_in, embedding_out = HE.main(params)

    figure_dir = '../../results/simulated_trees/figs/'
    visualisation.plot_deepwalk_embedding(
        deepwalk_emd.values, labels,
        figure_dir + 'deepwalk_z{}_l{}_{}.pdf'.format(branching_factor, levels, utils.get_timestamp()))
    visualisation.plot_poincare_embedding(
        embedding_in, labels,
        figure_dir + 'hyp_z{}_l{}_{}.pdf'.format(branching_factor, levels, utils.get_timestamp()))

    frame_in = pd.DataFrame(data=embedding_in, index=np.arange(embedding_in.shape[0]))
    frame_in.to_csv(path, sep=',')

    return path
Exemplo n.º 6
0
def run_embedding(folder, learning_rate, run_scenario=True, module=HE):
    """
    Generate a 4 dimensional embedding for a given graph, plot it and write it to csv
    :param folder: the name of the folder and also the graph
    :param learning_rate: passed to Params as the initial learning rate
    :param run_scenario: True if cv results are required, in which case MLD.run_scenario is called
    :param module: An alias for the module containing the specific embedding; its main function is called
    :return: the path to the csv file holding the input embedding
    """
    import visualisation
    started = datetime.datetime.now()
    resource_dir = '../../local_resources/{}'.format(folder)
    labels = np.array(utils.read_pickle(resource_dir + '/y.p')['cat'])
    dimension = 4  # dimensionality of the embedding
    params = Params(resource_dir + '/walks_n1_l10.csv', batch_size=4, embedding_size=dimension,
                    neg_samples=5, skip_window=5, num_pairs=1500, statistics_interval=10.0,
                    initial_learning_rate=learning_rate, save_path='../../local_resources/tf_logs/run1/',
                    epochs=5, concurrent_steps=4)

    # the returned path is timestamped before the embedding module is run
    path = resource_dir + '/embeddings/Win_{}.csv'.format(utils.get_timestamp())

    embedding_in, embedding_out = module.main(params)

    # one figure per embedding matrix, each with its own timestamp
    figure_template = '../../results/all/embedding_figs/{}_{}_{}.pdf'
    for tag, embedding in (('Win', embedding_in), ('Wout', embedding_out)):
        visualisation.plot_poincare_embedding(
            embedding, labels, figure_template.format(folder, tag, utils.get_timestamp()))

    frame_in = pd.DataFrame(data=embedding_in, index=np.arange(embedding_in.shape[0]))
    frame_in.to_csv(path, sep=',')
    frame_out = pd.DataFrame(data=embedding_out, index=np.arange(embedding_out.shape[0]))
    frame_out.to_csv(resource_dir + '/embeddings/Wout_{}.csv'.format(utils.get_timestamp()), sep=',')
    print('{} embedding generated in: '.format(folder),
          datetime.datetime.now() - started)
    if run_scenario:
        MLD.run_scenario(folder, path)
    return path