Example #1
def exp11():
    dataset = 'linux'
    info = load_as_dict(
        '/home/<>/Documents/GraphEmbedding/model/Siamese/logs/siamese_regression_linux_2018-11-04T22:07:15.428277(sepa, fix=10; check multi-scale)/test_info.klepto')
    weight = info['atts']  # attention weights; loaded but not used below
    node_embs_dict = info['node_embs_dict']
    draw_emb_hist_heat(dataset, node_embs_dict, True)  # TODO: fix
Example #2
def exp12():
    dataset = 'ptc'
    ds_algo = 'astar'
    ds_metric = 'ged'
    sim_or_dist = 'dist'
    log_dir = '/media/...'
    row_graphs = load_data(dataset, False).graphs
    col_graphs = load_data(dataset, True).graphs
    tr_l = load_as_dict(log_dir + '/train_val_info.klepto')
    print(tr_l.keys())
    te_l = load_as_dict(log_dir + '/test_info.klepto')
    print(te_l.keys())
    true_r = load_result(dataset, ds_algo, row_graphs, col_graphs, None,
                         None, False, sim_or_dist, ds_metric, None)
    pred_r = load_result(dataset, 'siamese', row_graphs, col_graphs, None,
                         te_l['sim_mat'], True, sim_or_dist, ds_metric, None)
    draw_ranking(dataset, ds_metric, true_r, pred_r, 'Our Model',
                 tr_l['flags']['node_feat_name'],
                 plot_node_ids=False, plot_gids=False, ds_norm=True,
                 existing_mappings=None)
Example #3
def compute_quality_for_corpus(corpus_dir):
    '''Return the quality score for the tested corpus (which contains truth and prediction files).'''
    import os
    from utils import read_classification_from_file as load_as_dict
    truth_file = '!truth.txt'
    pred_file = '!prediction.txt'
    truth_dict = load_as_dict(os.path.join(corpus_dir, truth_file))
    pred_dict = load_as_dict(os.path.join(corpus_dir, pred_file))
    
    from confmat import BinaryConfusionMatrix
    pos_tag = 'SPAM'
    neg_tag = 'OK'
    cm = BinaryConfusionMatrix(pos_tag, neg_tag)
    
    cm.compute_from_dicts(truth_dict, pred_dict)
    
    confusion_dict = cm.as_dict()
    tp = confusion_dict['tp']
    tn = confusion_dict['tn']
    fp = confusion_dict['fp']
    fn = confusion_dict['fn']
    
    return quality_score(tp, tn, fp, fn)
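The snippet relies on a quality_score(tp, tn, fp, fn) helper that is not shown. A minimal sketch of one plausible definition, assuming a plain accuracy metric (the real project may weight false positives more heavily):

def quality_score(tp, tn, fp, fn):
    # Assumption: plain accuracy over everything classified; the actual
    # project helper may penalize false positives differently.
    return (tp + tn) / (tp + tn + fp + fn)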
Example #4
def main():
    sfn = cur_folder + '/temp'
    loaded = load_as_dict(sfn)
    if not loaded:
        movies, movies_dict, people_dict = read_data()
        print("finish reading data!")
        movies.sort(key=voteGetter, reverse=True)
        print('sorted')
        for idx, movie in enumerate(movies):
            movie.set_rank(idx)
        print('ranked')
        # save_as_dict stores each object under its variable name, so the
        # else-branch below can look them up as loaded['movies'] etc.
        save_as_dict(sfn, movies, movies_dict, people_dict)
    else:
        movies = loaded['movies']
        movies_dict = loaded['movies_dict']
        people_dict = loaded['people_dict']
        print('loaded movies, movies_dict, people_dict')

    create_dataset(movies, movies_dict, people_dict, 'Coarse')
    create_dataset(movies, movies_dict, people_dict, 'Fine')
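Several examples on this page round-trip data through save_as_dict / load_as_dict. A minimal sketch of the contract they appear to follow, assuming a pickle-backed store keyed by name (the project's real helper recovers the caller's variable names itself and may use klepto instead; this sketch takes keyword arguments explicitly):

import os
import pickle

def save_as_dict(filepath, **named_objects):
    # Sketch only: persist the given objects under their names.
    with open(filepath + '.pickle', 'wb') as f:
        pickle.dump(named_objects, f)

def load_as_dict(filepath):
    # Return the saved dict, or None on a cache miss (the callers
    # above treat any falsy result as 'not cached yet').
    if not os.path.exists(filepath + '.pickle'):
        return None
    with open(filepath + '.pickle', 'rb') as f:
        return pickle.load(f)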
Example #5
def gen_aids_small(name, additional=False):
    datadir = get_root_path() + '/data'
    dirin = datadir + '/AIDS40k_orig'
    sfn = get_save_path() + '/aids40k_orig'
    loaded = load_as_dict(sfn)
    if not loaded:
        graphs = {}
        nodes_graphs = defaultdict(list)
        lesseq30 = set()
        lesseq10 = set()
        disconnects = set()
        # Iterate through all 40k graphs.
        for file in glob(dirin + '/*.gexf'):
            gid = int(file.split('/')[-1].split('.')[0])
            g = nx.read_gexf(file)
            if not nx.is_connected(g):
                print('{} not connected'.format(gid))
                disconnects.add(gid)
                continue
            graphs[gid] = g
            nodes_graphs[g.number_of_nodes()].append(gid)
            if g.number_of_nodes() <= 30:
                lesseq30.add(gid)
            if g.number_of_nodes() <= 10:
                lesseq10.add(gid)
        save_as_dict(sfn, graphs, nodes_graphs, lesseq30, lesseq10,
                     disconnects)
    else:
        graphs = loaded['graphs']
        nodes_graphs = loaded['nodes_graphs']
        lesseq30 = loaded['lesseq30']
        lesseq10 = loaded['lesseq10']
        disconnects = loaded['disconnects']
    print(len(disconnects), 'disconnected graphs skipped;', len(graphs), 'kept')
    print(len(lesseq30), 'with <= 30 nodes')
    print(len(lesseq10), 'with <= 10 nodes')
    # exit(1)
    train_dir = '{}/{}/train'.format(datadir, name)
    if additional:
        train_data = load_data(name.lower(), train=True)
        test_dir_str = 'test2'
    else:
        exec_cmd('mkdir -p {}'.format(train_dir))
        test_dir_str = 'test'
    test_dir = '{}/{}/{}'.format(datadir, name, test_dir_str)
    exec_cmd('mkdir -p {}'.format(test_dir))
    if not additional:
        if name == 'AIDS10k':
            for num_node in range(5, 23):
                choose = random.Random(123).sample(nodes_graphs[num_node],
                                                   1)[0]
                print('choose {} with {} nodes'.format(choose, num_node))
                nx.write_gexf(graphs[choose],
                              test_dir + '/{}.gexf'.format(choose))
                lesseq30.remove(choose)
            # sample from a sorted list: deterministic under the fixed seed,
            # and random.sample no longer accepts sets in Python 3.11+
            for tid in random.Random(123).sample(sorted(lesseq30), 10000):
                nx.write_gexf(graphs[tid], train_dir + '/{}.gexf'.format(tid))
        elif name == 'AIDS700nef':
            lesseq10 = sample_from_lessthan10eq(train_dir, lesseq10, 560,
                                                graphs, 'train')
            sample_from_lessthan10eq(test_dir, lesseq10, 140, graphs, 'test')
    else:
        assert name == 'AIDS10k'
        for num_node in range(5, 30):
            k = 4
            from_li = nodes_graphs[num_node]
            print('sampling {} from {} (size={})'.format(
                k, len(from_li), num_node))
            # sample_exclude is not part of random.Random; the project
            # presumably supplies a helper that skips existing train gids
            # (see the sketch after this example)
            choose = random.Random(123).sample_exclude(from_li, k,
                                                       train_data.get_gids())
            print('choose {} with {} nodes'.format(choose, num_node))
            for c in choose:
                nx.write_gexf(graphs[c], test_dir + '/{}.gexf'.format(c))
    print('Done')
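sample_exclude is not a method of random.Random; the project presumably supplies a helper with this behavior. A hedged sketch of what it would have to do, given how it is called above (written here as a free function taking the Random instance):

def sample_exclude(rng, population, k, exclude):
    # Sketch only: draw k items from population, skipping graph ids
    # that already appear in the training set.
    excluded = set(exclude)
    candidates = [x for x in population if x not in excluded]
    return rng.sample(candidates, k)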
Example #6
    'draw_edge_label_font_size': 6,
    # graph text info config
    'each_graph_text_list': [],
    'each_graph_text_font_size': 8,
    'each_graph_text_pos': [0.5, 1.05],
    # graph padding: value range: [0, 1]
    'top_space': 0.20 if concise else 0.26,  # out of whole graph
    'bottom_space': 0.05,
    'hbetween_space': 0.6 if concise else 1,  # out of the subgraph
    'wbetween_space': 0,
    # plot config
    'plot_dpi': 200,
    'plot_save_path': ''
}
emb_data = load_as_dict("/home/songbian/Documents/fork/"
                        "GraphEmbedding/data/"
                        "regression_linux_test_info.pickle")
weight_data = load_as_dict("/home/songbian/Documents/"
                           "fork/GraphEmbedding/data/"
                           "classification_linux_test_info.pickle")
# print(weight_data)
weight = weight_data['atts']
weight_max = max(max(row) for row in weight)
weight_min = min(min(row) for row in weight)
print("max:", weight_max)
print("min:", weight_min)
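The computed range is presumably used to pin a common color scale across the attention heatmaps; with seaborn (an assumption, the plotting call is not part of this snippet) that would look like:

import seaborn as sns
# Pin each heatmap to the same color range so plots stay comparable.
sns.heatmap(weight[0], vmin=weight_min, vmax=weight_max)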
Example #7
    'draw_edge_label_font_size': 6,
    # graph text info config
    'each_graph_text_list': [],
    'each_graph_text_font_size': 8,
    'each_graph_text_pos': [0.5, 1.05],
    # graph padding: value range: [0, 1]
    'top_space': 0.20 if concise else 0.26,  # out of whole graph
    'bottom_space': 0.05,
    'hbetween_space': 0.6 if concise else 1,  # out of the subgraph
    'wbetween_space': 0,
    # plot config
    'plot_dpi': 200,
    'plot_save_path': ''
}
weight_data = load_as_dict(
    "/home/songbian/Documents/fork/GraphEmbedding/model/Siamese/logs"
    "/siamese_classification_aids700nef_2018-07-28T10:09:33"
    "/test_info.pickle")
weight = weight_data['atts']
weight_max = max(max(row) for row in weight)
weight_min = min(min(row) for row in weight)
print("max:", weight_max)
print("min:", weight_min)
# override the computed extremes with a fixed range
weight_max = 0.85
weight_min = 0.7
train_data = load_data(dataset, train=True)
test_data = load_data(dataset, train=False)
Example #8
def create_siamese_result_from_test_info_pickle(fp, dataset, row_gs, col_gs):
    name = 'siamese_test'
    d = load_as_dict(fp)
    return name, load_result(dataset, name, sim_mat=d['sim_mat'],
                             row_graphs=row_gs, col_graphs=col_gs,
                             time_mat=[])
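A hypothetical call, following the row/column convention used elsewhere on this page (test graphs as rows, train graphs as columns); the path and dataset name are placeholders:

row_gs = load_data('aids700nef', train=False).graphs
col_gs = load_data('aids700nef', train=True).graphs
name, result = create_siamese_result_from_test_info_pickle(
    '/path/to/test_info.pickle', 'aids700nef', row_gs, col_gs)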
Example #9
import math

import numpy as np
import seaborn as sns

from utils import load_as_dict, load_data
from results import load_result


def sigmoid(x):
    return 1 / (1 + math.exp(-x))
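
# Note: math.exp(-x) overflows for large negative x. A numerically
# stable variant (an illustrative addition, not in the original file)
# branches on the sign of x:
def sigmoid_stable(x):
    if x >= 0:
        return 1 / (1 + math.exp(-x))
    z = math.exp(x)
    return z / (1 + z)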


if __name__ == '__main__':
    dataset = 'aids80nef'
    train_data = load_data(dataset, train=True)
    test_data = load_data(dataset, train=False)
    row_graphs = test_data.graphs
    col_graphs = train_data.graphs
    load_res = load_result(dataset, 'astar', row_graphs=row_graphs, col_graphs=col_graphs)
    data_origin = load_as_dict("/home/songbian/Documents/fork/"
                        "GraphEmbedding/data/"
                        "regression_aids80nef_test_info.pickle")
    data = data_origin['node_embs_list']
    # Pad every node-embedding matrix to at least 10 rows so the dot
    # products below have a fixed shape.
    for i in range(len(data)):
        if len(data[i]) < 10:
            data[i] = np.pad(data[i], ((0, 10 - len(data[i])), (0, 0)),
                             'constant', constant_values=(0, 0))

    ids = load_res.sort_id_mat_
    for i in range(len(row_graphs)):
        q = test_data.graphs[i]
        gids = np.concatenate([ids[i][:10], ids[i][-10:]])
        for j in gids:
            result = np.dot(data[i], data[j].T)
            # Each call draws onto the current figure; save or clear it
            # between iterations to keep the heatmaps separate.
            sns_plot = sns.heatmap(result)
Example #10
				ha='right',
				va='bottom',
				fontsize=35)

		# axes = plt.gca()
		# axes.set_xlim([-1,1])
		# axes.set_ylim([-1,1])

	plt.savefig(filename)
	plt.close()

try:
	set_plot_defaults()

	print('Reading the tokenized corpus...')
	read_obj = load_as_dict(corpus)
	dictionary = read_obj['dictionary']
	reversed_dictionary = read_obj['reversed_dictionary']

	pca_lcl_tgt = PCA(n_components=2)
	pca_lcl_nce = PCA(n_components=2)
	pca_glb_tgt = PCA(n_components=2)
	pca_glb_nce = PCA(n_components=2)
	plot_only = create_plotID_list(words, dictionary)
	
	print('Plotting...')
	for bf in listdir(folder_name):
		# Check for existing plot files
		if bf.startswith(prefix):
			continue
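The snippet is cut off before the four PCA objects are used; with scikit-learn, the eventual projection step would look like this (illustrative only; embedding_matrix is a placeholder name):

# Illustrative sketch, not from the original file: project the
# high-dimensional embeddings down to 2-D for plotting.
low_dim = pca_glb_tgt.fit_transform(embedding_matrix)  # shape (n_points, 2)
xs, ys = low_dim[:, 0], low_dim[:, 1]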
Example #11
                    pad_inches=0)
        if eps_dir:
            plt.savefig(eps_dir + '/' + str(i) + '.png',
                        bbox_inches='tight',
                        pad_inches=0)
            plt.savefig(eps_dir + '/' + str(i) + '.eps',
                        bbox_inches='tight',
                        pad_inches=0)
        plt_cnt += 1
        plt.close()
    print('Saved {} embedding visualization plots'.format(plt_cnt))


if __name__ == '__main__':
    data = load_as_dict("/home/songbian/Documents/fork/"
                        "GraphEmbedding/data/"
                        "regression_linux_test_info.pickle")
    embs = data['embs']
    dataset = 'linux'
    thresh_pos = 0.58
    thresh_neg = 0.58
    thresh_pos_sim = 0.5
    thresh_neg_sim = 0.5
    norm = True
    row_graphs = load_data(dataset, train=False).graphs
    col_graphs = load_data(dataset, train=True).graphs
    true_result = load_result(dataset,
                              TRUE_MODEL,
                              row_graphs=row_graphs,
                              col_graphs=col_graphs)
    pred_r = load_result(dataset,
Example #12
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats, integrate
import sys
sys.path.append('../')
from dist_calculator import get_gs_dist_mat, DistCalculator
from utils import load_as_dict, load_data
from results import load_result

if __name__ == '__main__':
    dataset = 'aids700nef'
    dist_metric = 'ged'
    dist_algo = 'astar'
    emb_data = load_as_dict(
        "/home/songbian/Documents/fork/GraphEmbedding/model/Siamese/logs/"
        "siamese_regression_aids700nef_2018-08-01T11:52:11(cur_best)/test_info.pickle")
    train_data = load_data(dataset, train=True)
    test_data = load_data(dataset, train=False)
    row_graphs = test_data.graphs
    col_graphs = train_data.graphs
    matrix = load_result(dataset,
                         'astar',
                         row_graphs=row_graphs,
                         col_graphs=col_graphs)
    pred_r = load_result(dataset,
                         'siamese',
                         sim_mat=emb_data['sim_mat'],
                         time_mat=emb_data['time_li'])
    ids = matrix.sort_id_mat_
    print(len(matrix.dist_norm_mat_))
    print(len(matrix.dist_norm_mat_[0]))
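The result object exposes a per-row ranking (sort_id_mat_) and a normalized distance matrix (dist_norm_mat_), as used above and in Example #9. Continuing inside the __main__ block above, a small illustrative loop built only on those two attributes:

    # Illustrative only: show the 5 closest training graphs (by true
    # normalized distance) for the first few test graphs.
    for i in range(3):
        top5 = ids[i][:5]
        dists = [matrix.dist_norm_mat_[i][j] for j in top5]
        print('query {}: nearest col indices {} dists {}'.format(
            i, list(top5), dists))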