def generate_blogcatalog_cartesian_embedding():
    """Train a 128-d cartesian embedding of the BlogCatalog graph and persist it.

    Reads the label pickle and pre-generated random walks, runs HCE training,
    saves Poincare-disc plots of both embedding matrices, writes Win/Wout to
    timestamped CSVs, and returns the path of the Win CSV.

    NOTE(review): utils.get_timestamp() is called once per artefact, so the
    plot/CSV filenames can carry slightly different timestamps — confirm this
    is intended before relying on the names matching up.

    :return: path of the Win embedding CSV
    """
    import visualisation
    start = datetime.datetime.now()
    labels = utils.read_pickle('../../local_resources/blogcatalog/y.p')
    params = Params(
        '../../local_resources/blogcatalog/p025_q025_d128_walks.csv',
        batch_size=4,
        embedding_size=128,  # dimensionality of the embedding
        neg_samples=5,
        skip_window=5,
        num_pairs=1500,
        statistics_interval=10,
        initial_learning_rate=0.2,
        save_path='../../local_resources/tf_logs/blogcatalog_cartesian/final_throw1',
        epochs=5,
        concurrent_steps=12)
    # Win CSV path is fixed before training so the returned path predates HCE.main.
    path = '../../local_resources/blogcatalog/embeddings/Win_cartesian' + '_' + utils.get_timestamp() + '.csv'
    embedding_in, embedding_out = HCE.main(params)
    visualisation.plot_poincare_embedding(
        embedding_in, labels,
        '../../results/blogcatalog/figs/poincare_Win_cartesian' + '_' + utils.get_timestamp() + '.pdf')
    visualisation.plot_poincare_embedding(
        embedding_out, labels,
        '../../results/blogcatalog/figs/poincare_Wout_cartesian' + '_' + utils.get_timestamp() + '.pdf')
    pd.DataFrame(data=embedding_in, index=np.arange(embedding_in.shape[0])).to_csv(path, sep=',')
    pd.DataFrame(data=embedding_out, index=np.arange(embedding_out.shape[0])).to_csv(
        '../../local_resources/blogcatalog/embeddings/Wout_cartesian' + '_' + utils.get_timestamp() + '.csv',
        sep=',')
    print('blogcatalog cartesian embedding generated in: ', datetime.datetime.now() - start)
    return path
def generate_political_blogs_embedding():
    """Train a 2-d embedding of the political-blogs graph, persist and plot it.

    Reads labels and walks, runs HCE training, plots both embedding matrices
    on the Poincare disc, writes Win/Wout to timestamped CSVs, then runs the
    downstream classification scenario on the Win file.

    :return: path of the Win embedding CSV
    """
    import visualisation
    start = datetime.datetime.now()
    labels = utils.read_pickle('../../local_resources/political_blogs/y.p')
    params = Params(
        '../../local_resources/political_blogs/walks_n1_l10.csv',
        batch_size=4,
        embedding_size=2,  # dimensionality of the embedding
        neg_samples=5,
        skip_window=5,
        num_pairs=1500,
        statistics_interval=10.0,
        initial_learning_rate=1.0,
        save_path='../../local_resources/tf_logs/polblogs/',
        epochs=5,
        concurrent_steps=4)
    # Win CSV path is fixed before training; it is both written below and
    # handed on to the classification scenario.
    path = '../../local_resources/political_blogs/embeddings/Win' + '_' + utils.get_timestamp() + '.csv'
    embedding_in, embedding_out = HCE.main(params)
    visualisation.plot_poincare_embedding(
        embedding_in, labels,
        '../../results/political_blogs/figs/poincare_polar_Win' + '_' + utils.get_timestamp() + '.pdf')
    visualisation.plot_poincare_embedding(
        embedding_out, labels,
        '../../results/political_blogs/figs/poincare_polar_Wout' + '_' + utils.get_timestamp() + '.pdf')
    pd.DataFrame(data=embedding_in, index=np.arange(embedding_in.shape[0])).to_csv(path, sep=',')
    pd.DataFrame(data=embedding_out, index=np.arange(embedding_out.shape[0])).to_csv(
        '../../local_resources/political_blogs/embeddings/Wout' + '_' + utils.get_timestamp() + '.csv',
        sep=',')
    print('political blogs sample generated in: ', datetime.datetime.now() - start)
    political_blogs_scenario(path)
    return path
def karate_speed_test():
    """
    compare the runtime of cartesian tf embeddings with gensim

    Trains the karate-club embedding with both back-ends on the same walk
    file and prints each wall-clock duration.
    :return: None
    """
    walk_path = '../../local_resources/karate/walks_n1_l10.csv'
    outpath = '../../local_results/speedtest/karate_gensim.emd'
    log_path = '../../local_resources/tf_logs/hyperbolic_cartesian/speedtest'
    walks = generate_gensim_sentences(walk_path)
    size = 4  # dimensionality of the embedding
    epochs = 5
    params = HCE.Params(walk_path, batch_size=20, embedding_size=size, neg_samples=5,
                        skip_window=5, num_pairs=1500, statistics_interval=1,
                        initial_learning_rate=0.2, save_path=log_path, epochs=epochs,
                        concurrent_steps=4)
    s = datetime.datetime.now()
    embedding_in, embedding_out = HCE.main(params)
    # FIX: was a Python 2 `print` statement (SyntaxError under Python 3);
    # converted to the function-call form used by every other function in
    # this file. Output is identical under both interpreters for a single
    # argument.
    print('tf ran in {0} s'.format(datetime.datetime.now() - s))
    s = datetime.datetime.now()
    generate_gensim_embeddings(walks, outpath, params)
    print('gensim ran in {0} s'.format(datetime.datetime.now() - s))
def batch_size_scenario():
    """
    Generate embeddings using different batch sizes for the ~1000 vertex polblogs network

    For each batch size: train an embedding, plot it, and persist Win/Wout to
    CSV. Then run the classifier suite over all embeddings, perform the
    significance tests, and write macro/micro results and p-values to CSV.
    :return: path of the last Win embedding CSV written
    """
    import visualisation
    s = datetime.datetime.now()
    y_path = '../../local_resources/political_blogs/y.p'
    x_path = '../../local_resources/political_blogs/X.p'
    y = utils.read_pickle(y_path)
    log_path = '../../local_resources/tf_logs/polblogs/'
    walk_path = '../../local_resources/political_blogs/walks_n1_l10.csv'
    size = 2  # dimensionality of the embedding
    batch_sizes = [1, 2, 4, 8, 16, 32, 64, 128]
    embeddings = []
    for batch_size in batch_sizes:
        params = Params(walk_path, batch_size=batch_size, embedding_size=size, neg_samples=5,
                        skip_window=5, num_pairs=1500, statistics_interval=10.0,
                        initial_learning_rate=0.1, save_path=log_path, epochs=5,
                        concurrent_steps=4)
        path = '../../local_resources/political_blogs/embeddings/Win_batch_{}_{}.csv'.format(
            batch_size, utils.get_timestamp())
        embedding_in, embedding_out = HCE.main(params)
        visualisation.plot_poincare_embedding(
            embedding_in, y,
            '../../results/political_blogs/figs/poincare_polar_Win_batch_{}_{}.pdf'.format(
                batch_size, utils.get_timestamp()))
        visualisation.plot_poincare_embedding(
            embedding_out, y,
            '../../results/political_blogs/figs/poincare_polar_Wout_batch_{}_{}.pdf'.format(
                batch_size, utils.get_timestamp()))
        df_in = pd.DataFrame(data=embedding_in, index=np.arange(embedding_in.shape[0]))
        df_in.to_csv(path, sep=',')
        df_out = pd.DataFrame(data=embedding_out, index=np.arange(embedding_out.shape[0]))
        df_out.to_csv(
            '../../local_resources/political_blogs/embeddings/Wout_batch_{}_{}.csv'.format(
                batch_size, utils.get_timestamp()), sep=',')
        print('political blogs embedding generated in: ', datetime.datetime.now() - s)
        embeddings.append(embedding_in)
    # re-read labels in the format expected by the detectors (threshold=0)
    x, y = utils.read_data(x_path, y_path, threshold=0)
    names = [[str(batch_size)] for batch_size in batch_sizes]
    n_folds = 10
    results = run_detectors.run_all_datasets(embeddings, y, names, classifiers, n_folds)
    all_results = utils.merge_results(results, n_folds)
    results, tests = utils.stats_test(all_results)
    # FIX: both p-value tables were previously written to the same
    # 'batch_size_pvalues...' file, so the second to_csv clobbered the first.
    # Write them to distinct macro/micro files instead.
    tests[0].to_csv('../../results/political_blogs/batch_size_macro_pvalues' + utils.get_timestamp() + '.csv')
    tests[1].to_csv('../../results/political_blogs/batch_size_micro_pvalues' + utils.get_timestamp() + '.csv')
    print('macro', results[0])
    print('micro', results[1])
    macro_path = '../../results/political_blogs/batch_size_macro' + utils.get_timestamp() + '.csv'
    micro_path = '../../results/political_blogs/batch_size_micro' + utils.get_timestamp() + '.csv'
    results[0].to_csv(macro_path, index=True)
    results[1].to_csv(micro_path, index=True)
    return path