Пример #1
0
def generate_blogcatalog_cartesian_embedding():
    import visualisation
    s = datetime.datetime.now()
    y_path = '../../local_resources/blogcatalog/y.p'
    y = utils.read_pickle(y_path)
    log_path = '../../local_resources/tf_logs/blogcatalog_cartesian/final_throw1'
    walk_path = '../../local_resources/blogcatalog/p025_q025_d128_walks.csv'
    size = 128  # dimensionality of the embedding
    params = Params(walk_path, batch_size=4, embedding_size=size, neg_samples=5, skip_window=5, num_pairs=1500,
                    statistics_interval=10,
                    initial_learning_rate=0.2, save_path=log_path, epochs=5, concurrent_steps=12)

    path = '../../local_resources/blogcatalog/embeddings/Win_cartesian' + '_' + utils.get_timestamp() + '.csv'

    embedding_in, embedding_out = HCE.main(params)

    visualisation.plot_poincare_embedding(embedding_in, y,
                                          '../../results/blogcatalog/figs/poincare_Win_cartesian' + '_' + utils.get_timestamp() + '.pdf')
    visualisation.plot_poincare_embedding(embedding_out, y,
                                          '../../results/blogcatalog/figs/poincare_Wout_cartesian' + '_' + utils.get_timestamp() + '.pdf')
    df_in = pd.DataFrame(data=embedding_in, index=np.arange(embedding_in.shape[0]))
    df_in.to_csv(path, sep=',')
    df_out = pd.DataFrame(data=embedding_out, index=np.arange(embedding_out.shape[0]))
    df_out.to_csv(
        '../../local_resources/blogcatalog/embeddings/Wout_cartesian' + '_' + utils.get_timestamp() + '.csv',
        sep=',')
    print('blogcatalog cartesian embedding generated in: ', datetime.datetime.now() - s)
    return path
Пример #2
0
def generate_political_blogs_embedding():
    import visualisation
    s = datetime.datetime.now()
    y_path = '../../local_resources/political_blogs/y.p'
    y = utils.read_pickle(y_path)
    log_path = '../../local_resources/tf_logs/polblogs/'
    walk_path = '../../local_resources/political_blogs/walks_n1_l10.csv'
    size = 2  # dimensionality of the embedding
    params = Params(walk_path, batch_size=4, embedding_size=size, neg_samples=5, skip_window=5, num_pairs=1500,
                    statistics_interval=10.0,
                    initial_learning_rate=1.0, save_path=log_path, epochs=5, concurrent_steps=4)

    path = '../../local_resources/political_blogs/embeddings/Win' + '_' + utils.get_timestamp() + '.csv'

    embedding_in, embedding_out = HCE.main(params)

    visualisation.plot_poincare_embedding(embedding_in, y,
                                          '../../results/political_blogs/figs/poincare_polar_Win' + '_' + utils.get_timestamp() + '.pdf')
    visualisation.plot_poincare_embedding(embedding_out, y,
                                          '../../results/political_blogs/figs/poincare_polar_Wout' + '_' + utils.get_timestamp() + '.pdf')
    df_in = pd.DataFrame(data=embedding_in, index=np.arange(embedding_in.shape[0]))
    df_in.to_csv(path, sep=',')
    df_out = pd.DataFrame(data=embedding_out, index=np.arange(embedding_out.shape[0]))
    df_out.to_csv(
        '../../local_resources/political_blogs/embeddings/Wout' + '_' + utils.get_timestamp() + '.csv',
        sep=',')
    print('political blogs sample generated in: ', datetime.datetime.now() - s)

    political_blogs_scenario(path)
    return path
def karate_speed_test():
    """
    compare the runtime of cartesian tf embeddings with gensim
    :return:
    """

    walk_path = '../../local_resources/karate/walks_n1_l10.csv'
    outpath = '../../local_results/speedtest/karate_gensim.emd'
    log_path = '../../local_resources/tf_logs/hyperbolic_cartesian/speedtest'
    walks = generate_gensim_sentences(walk_path)
    size = 4  # dimensionality of the embedding
    epochs = 5
    params = HCE.Params(walk_path,
                        batch_size=20,
                        embedding_size=size,
                        neg_samples=5,
                        skip_window=5,
                        num_pairs=1500,
                        statistics_interval=1,
                        initial_learning_rate=0.2,
                        save_path=log_path,
                        epochs=epochs,
                        concurrent_steps=4)

    s = datetime.datetime.now()
    embedding_in, embedding_out = HCE.main(params)
    print 'tf ran in {0} s'.format(datetime.datetime.now() - s)
    s = datetime.datetime.now()
    generate_gensim_embeddings(walks, outpath, params)
    print 'gensim ran in {0} s'.format(datetime.datetime.now() - s)
Пример #4
0
def batch_size_scenario():
    """
    Generate embeddings using different batch sizes for the ~1000 vertex polblogs network
    :return:
    """
    import visualisation
    s = datetime.datetime.now()
    y_path = '../../local_resources/political_blogs/y.p'
    x_path = '../../local_resources/political_blogs/X.p'
    y = utils.read_pickle(y_path)
    log_path = '../../local_resources/tf_logs/polblogs/'
    walk_path = '../../local_resources/political_blogs/walks_n1_l10.csv'
    size = 2  # dimensionality of the embedding
    batch_sizes = [1, 2, 4, 8, 16, 32, 64, 128]
    embeddings = []
    for batch_size in batch_sizes:
        params = Params(walk_path, batch_size=batch_size, embedding_size=size, neg_samples=5, skip_window=5,
                        num_pairs=1500,
                        statistics_interval=10.0,
                        initial_learning_rate=0.1, save_path=log_path, epochs=5, concurrent_steps=4)

        path = '../../local_resources/political_blogs/embeddings/Win_batch_{}_{}.csv'.format(
            batch_size, utils.get_timestamp())

        embedding_in, embedding_out = HCE.main(params)

        visualisation.plot_poincare_embedding(embedding_in, y,
                                              '../../results/political_blogs/figs/poincare_polar_Win_batch_{}_{}.pdf'.format(
                                                  batch_size, utils.get_timestamp()))
        visualisation.plot_poincare_embedding(embedding_out, y,
                                              '../../results/political_blogs/figs/poincare_polar_Wout_batch_{}_{}.pdf'.format(
                                                  batch_size, utils.get_timestamp()))
        df_in = pd.DataFrame(data=embedding_in, index=np.arange(embedding_in.shape[0]))
        df_in.to_csv(path, sep=',')
        df_out = pd.DataFrame(data=embedding_out, index=np.arange(embedding_out.shape[0]))
        df_out.to_csv(
            '../../local_resources/political_blogs/embeddings/Wout_batch_{}_{}.csv'.format(
                batch_size, utils.get_timestamp()),
            sep=',')
        print('political blogs embedding generated in: ', datetime.datetime.now() - s)
        embeddings.append(embedding_in)

    x, y = utils.read_data(x_path, y_path, threshold=0)

    names = [[str(batch_size)] for batch_size in batch_sizes]
    n_folds = 10
    results = run_detectors.run_all_datasets(embeddings, y, names, classifiers, n_folds)
    all_results = utils.merge_results(results, n_folds)
    results, tests = utils.stats_test(all_results)
    tests[0].to_csv('../../results/political_blogs/batch_size_pvalues' + utils.get_timestamp() + '.csv')
    tests[1].to_csv('../../results/political_blogs/batch_size_pvalues' + utils.get_timestamp() + '.csv')
    print('macro', results[0])
    print('micro', results[1])
    macro_path = '../../results/political_blogs/batch_size_macro' + utils.get_timestamp() + '.csv'
    micro_path = '../../results/political_blogs/batch_size_micro' + utils.get_timestamp() + '.csv'
    results[0].to_csv(macro_path, index=True)
    results[1].to_csv(micro_path, index=True)

    return path