def karate_test_scenario(deepwalk_path):
    """Evaluate a DeepWalk embedding against raw logistic features on the Zachary karate network.

    Runs all configured classifiers over both feature sets with 10-fold CV,
    performs the macro/micro significance tests, and writes p-values and
    result tables to ``../../results/karate/``.

    :param deepwalk_path: path to a CSV file holding the DeepWalk embedding
        (first column is the vertex index).
    :return: None; results are written to disk and printed.
    """
    y_path = '../../local_resources/zachary_karate/y.p'
    x_path = '../../local_resources/zachary_karate/X.p'
    target = utils.read_target(y_path)
    x, y = utils.read_data(x_path, y_path, threshold=0)
    names = [['deepwalk'], ['logistic']]
    x_deepwalk = pd.read_csv(deepwalk_path, index_col=0)
    # normalize(x, axis=0) scales each feature column; presumably sklearn's
    # normalize — verify against the file's imports.
    X = [x_deepwalk.values, normalize(x, axis=0)]
    n_folds = 10
    results = run_detectors.run_all_datasets(X, y, names, classifiers, n_folds)
    all_results = utils.merge_results(results, n_folds)
    results, tests = utils.stats_test(all_results)
    # Capture one timestamp so every output file of this run shares the same
    # suffix (repeated get_timestamp() calls can straddle a clock tick).
    timestamp = utils.get_timestamp()
    tests[0].to_csv('../../results/karate/deepwalk_macro_pvalues' + timestamp + '.csv')
    tests[1].to_csv('../../results/karate/deepwalk_micro_pvalues' + timestamp + '.csv')
    print('macro', results[0])
    print('micro', results[1])
    macro_path = '../../results/karate/deepwalk_macro' + timestamp + '.csv'
    micro_path = '../../results/karate/deepwalk_micro' + timestamp + '.csv'
    results[0].to_csv(macro_path, index=True)
    results[1].to_csv(micro_path, index=True)
def run_scenario(folder, embedding_path):
    """Compare DeepWalk embeddings of several sizes, a hyperbolic embedding and
    raw logistic features on the network stored under ``folder``.

    Expects DeepWalk files named ``<folder><size>.emd`` inside
    ``../../local_resources/<folder>/``. Writes p-values and macro/micro
    result tables to ``../../results/<folder>/``.

    :param folder: dataset directory name under ``../../local_resources/``.
    :param embedding_path: CSV file holding the hyperbolic embedding.
    :return: None; results are written to disk and printed.
    """
    y_path = '../../local_resources/{}/y.p'.format(folder)
    x_path = '../../local_resources/{}/X.p'.format(folder)
    sizes = [2, 4, 8, 16, 32, 64, 128]
    deepwalk_embeddings = []
    deepwalk_names = []
    dwpath = '../../local_resources/{0}/{1}'.format(folder, folder)
    for size in sizes:
        path = dwpath + str(size) + '.emd'
        de = pd.read_csv(path, header=None, index_col=0, skiprows=1, sep=" ")
        # .emd rows are unordered; sort so rows align with the label vector.
        de.sort_index(inplace=True)
        deepwalk_embeddings.append(de.values)
        deepwalk_names.append(['deepwalk' + str(size)])
    x, y = utils.read_data(x_path, y_path, threshold=0)
    names = deepwalk_names + [['hyperbolic'], ['logistic']]
    embedding = pd.read_csv(embedding_path, index_col=0)
    X = deepwalk_embeddings + [embedding.values, normalize(x, axis=0)]
    n_folds = 10
    results = run_detectors.run_all_datasets(X, y, names, classifiers, n_folds)
    all_results = utils.merge_results(results, n_folds)
    results, tests = utils.stats_test(all_results)
    # One timestamp for the whole run, and distinct macro/micro p-value file
    # names — previously both tests were written to the same 'pvalues' path,
    # so the micro table silently overwrote the macro table.
    timestamp = utils.get_timestamp()
    tests[0].to_csv('../../results/{0}/macro_pvalues{1}.csv'.format(folder, timestamp))
    tests[1].to_csv('../../results/{0}/micro_pvalues{1}.csv'.format(folder, timestamp))
    print('macro', results[0])
    print('micro', results[1])
    macro_path = '../../results/{0}/macro{1}.csv'.format(folder, timestamp)
    micro_path = '../../results/{0}/micro{1}.csv'.format(folder, timestamp)
    results[0].to_csv(macro_path, index=True)
    results[1].to_csv(micro_path, index=True)
def test_embeddings():
    """Sanity-check a hyperbolic embedding with a random forest on the 1-in-10000 sample.

    Aligns the hand-crafted features and labels with the embedding's index,
    runs 10-fold CV with a single RandomForest configuration, writes the
    macro/micro tables to the working directory, and asserts the mean macro
    score exceeds 0.6.

    :return: None; raises AssertionError if the score check fails.
    """
    feature_path = '../local_resources/features_1in10000.tsv'
    rf_features = pd.read_csv(feature_path, sep='\t', index_col=0)
    emd = pd.read_csv('../local_resources/hyperbolic_embeddings/tf_test1.csv',
                      header=None, index_col=0, skiprows=1, sep=" ")
    features, y = utils.get_classification_xy(rf_features)
    # Restrict features/labels to the vertices present in the embedding.
    features = features.loc[emd.index, :]
    y = y.loc[emd.index].values
    names = np.array([['RF just emd']])
    n_folds = 10
    classifiers = [
        RandomForestClassifier(max_depth=2, n_estimators=50, bootstrap=True,
                               criterion='entropy', max_features=0.1, n_jobs=1)
    ]
    results = run_all_datasets([emd.values], y, names, classifiers, n_folds)
    all_results = utils.merge_results(results, n_folds)
    results, tests = utils.stats_test(all_results)
    # Fixed: these were Python 2 print statements ("print 'macro', ..."),
    # inconsistent with the print() calls used everywhere else in this file.
    print('macro', results[0])
    print('micro', results[1])
    macro_path = 'tf_testing_1in10000' + utils.get_timestamp() + '.csv'
    micro_path = 'tf_micro_1in10000' + utils.get_timestamp() + '.csv'
    results[0].to_csv(macro_path, index=True)
    results[1].to_csv(micro_path, index=True)
    assert results[0]['mean'].values > 0.6
def batch_size_scenario():
    """
    Generate embeddings using different batch sizes for the ~1000 vertex polblogs network,
    then evaluate each embedding with the configured classifiers.

    For every batch size a hyperbolic embedding is trained (HCE.main), plotted,
    saved to CSV, and collected; afterwards all embeddings are compared with
    10-fold CV and the macro/micro tables and p-values are written to
    ``../../results/political_blogs/``.

    :return: path of the last Win embedding CSV written (largest batch size).
    """
    import visualisation
    s = datetime.datetime.now()
    y_path = '../../local_resources/political_blogs/y.p'
    x_path = '../../local_resources/political_blogs/X.p'
    y = utils.read_pickle(y_path)
    log_path = '../../local_resources/tf_logs/polblogs/'
    walk_path = '../../local_resources/political_blogs/walks_n1_l10.csv'
    size = 2  # dimensionality of the embedding
    batch_sizes = [1, 2, 4, 8, 16, 32, 64, 128]
    embeddings = []
    for batch_size in batch_sizes:
        params = Params(walk_path, batch_size=batch_size, embedding_size=size,
                        neg_samples=5, skip_window=5, num_pairs=1500,
                        statistics_interval=10.0, initial_learning_rate=0.1,
                        save_path=log_path, epochs=5, concurrent_steps=4)
        path = '../../local_resources/political_blogs/embeddings/Win_batch_{}_{}.csv'.format(
            batch_size, utils.get_timestamp())
        embedding_in, embedding_out = HCE.main(params)
        visualisation.plot_poincare_embedding(
            embedding_in, y,
            '../../results/political_blogs/figs/poincare_polar_Win_batch_{}_{}.pdf'.format(
                batch_size, utils.get_timestamp()))
        visualisation.plot_poincare_embedding(
            embedding_out, y,
            '../../results/political_blogs/figs/poincare_polar_Wout_batch_{}_{}.pdf'.format(
                batch_size, utils.get_timestamp()))
        df_in = pd.DataFrame(data=embedding_in, index=np.arange(embedding_in.shape[0]))
        df_in.to_csv(path, sep=',')
        df_out = pd.DataFrame(data=embedding_out, index=np.arange(embedding_out.shape[0]))
        df_out.to_csv(
            '../../local_resources/political_blogs/embeddings/Wout_batch_{}_{}.csv'.format(
                batch_size, utils.get_timestamp()), sep=',')
        print('political blogs embedding generated in: ', datetime.datetime.now() - s)
        embeddings.append(embedding_in)
    x, y = utils.read_data(x_path, y_path, threshold=0)
    names = [[str(batch_size)] for batch_size in batch_sizes]
    n_folds = 10
    results = run_detectors.run_all_datasets(embeddings, y, names, classifiers, n_folds)
    all_results = utils.merge_results(results, n_folds)
    results, tests = utils.stats_test(all_results)
    # One timestamp and distinct macro/micro names — previously both p-value
    # tables went to the same 'batch_size_pvalues' path, so the micro write
    # clobbered the macro write.
    timestamp = utils.get_timestamp()
    tests[0].to_csv('../../results/political_blogs/batch_size_macro_pvalues' + timestamp + '.csv')
    tests[1].to_csv('../../results/political_blogs/batch_size_micro_pvalues' + timestamp + '.csv')
    print('macro', results[0])
    print('micro', results[1])
    macro_path = '../../results/political_blogs/batch_size_macro' + timestamp + '.csv'
    micro_path = '../../results/political_blogs/batch_size_micro' + timestamp + '.csv'
    results[0].to_csv(macro_path, index=True)
    results[1].to_csv(micro_path, index=True)
    # NOTE(review): 'path' is whatever the loop last assigned — the Win CSV of
    # the final batch size. Returning all paths may be the real intent; confirm.
    return path
def karate_deepwalk_grid_scenario():
    """
    evaluates a grid of embeddings at different sizes, walk lengths and walks per vertex for the karate network.
    Trying to understand why the DeepWalk performance was so poor.

    Every file in ``../../local_resources/karate/gridsearch/`` is treated as
    one embedding; a fixed hyperbolic embedding is appended for comparison.
    Writes p-values and macro/micro result tables to ``../../results/karate/``.

    :return: None; results are written to disk and printed.
    """
    import os
    y_path = '../../local_resources/karate/y.p'
    x_path = '../../local_resources/karate/X.p'
    target = utils.read_target(y_path)
    x, y = utils.read_data(x_path, y_path, threshold=0)
    folder = '../../local_resources/karate/gridsearch/'
    names = [[elem] for elem in os.listdir(folder)]
    embeddings = []
    for name in names:
        emb = pd.read_csv(folder + name[0], header=None, index_col=0, skiprows=1, sep=" ")
        # Rows in the .emd files are unordered; sort to align with labels.
        emb.sort_index(inplace=True)
        embeddings.append(emb.values)
    names.append(['hyperbolic'])
    hyp_path = '../../local_resources/karate/embeddings/Win_20170808-185202.csv'
    hyp_emb = pd.read_csv(hyp_path, index_col=0)
    embeddings.append(hyp_emb.values)
    n_folds = 10
    results = run_detectors.run_all_datasets(embeddings, y, names, classifiers, n_folds)
    all_results = utils.merge_results(results, n_folds)
    results, tests = utils.stats_test(all_results)
    # One timestamp and distinct macro/micro names — previously both p-value
    # tables were written to the same 'pvalues' path, so the second write
    # overwrote the first.
    timestamp = utils.get_timestamp()
    tests[0].to_csv('../../results/karate/macro_pvalues' + timestamp + '.csv')
    tests[1].to_csv('../../results/karate/micro_pvalues' + timestamp + '.csv')
    print('macro', results[0])
    print('micro', results[1])
    macro_path = '../../results/karate/macro' + timestamp + '.csv'
    micro_path = '../../results/karate/micro' + timestamp + '.csv'
    results[0].to_csv(macro_path, index=True)
    results[1].to_csv(micro_path, index=True)
def political_blogs_scenario(embedding_path):
    """Compare DeepWalk embeddings of several sizes, a hyperbolic embedding and
    raw logistic features on the political blogs network.

    Expects DeepWalk files named ``political_blogs<size>.emd`` under
    ``../../local_resources/political_blogs/``. Writes p-values and
    macro/micro result tables to ``../../results/political_blogs/``.

    :param embedding_path: CSV file holding the hyperbolic embedding.
    :return: None; results are written to disk and printed.
    """
    y_path = '../../local_resources/political_blogs/y.p'
    x_path = '../../local_resources/political_blogs/X.p'
    sizes = [2, 4, 8, 16, 32, 64, 128]
    deepwalk_embeddings = []
    deepwalk_names = []
    dwpath = '../../local_resources/political_blogs/political_blogs'
    for size in sizes:
        path = dwpath + str(size) + '.emd'
        de = pd.read_csv(path, header=None, index_col=0, skiprows=1, sep=" ")
        # Rows in the .emd files are unordered; sort to align with labels.
        de.sort_index(inplace=True)
        deepwalk_embeddings.append(de.values)
        deepwalk_names.append(['deepwalk' + str(size)])
    x, y = utils.read_data(x_path, y_path, threshold=0)
    names = deepwalk_names + [['hyperbolic'], ['logistic']]
    embedding = pd.read_csv(embedding_path, index_col=0)
    X = deepwalk_embeddings + [embedding.values, normalize(x, axis=0)]
    n_folds = 10
    results = run_detectors.run_all_datasets(X, y, names, classifiers, n_folds)
    all_results = utils.merge_results(results, n_folds)
    results, tests = utils.stats_test(all_results)
    # One timestamp and distinct macro/micro names — previously both p-value
    # tables were written to the same 'pvalues' path, so the micro write
    # clobbered the macro write.
    timestamp = utils.get_timestamp()
    tests[0].to_csv('../../results/political_blogs/macro_pvalues' + timestamp + '.csv')
    tests[1].to_csv('../../results/political_blogs/micro_pvalues' + timestamp + '.csv')
    print('macro', results[0])
    print('micro', results[1])
    macro_path = '../../results/political_blogs/macro' + timestamp + '.csv'
    micro_path = '../../results/political_blogs/micro' + timestamp + '.csv'
    results[0].to_csv(macro_path, index=True)
    results[1].to_csv(micro_path, index=True)