def plot_ged_time_helper(dataset, models, metric, rs):
    """Plot the per-query average of `metric` against query graph size.

    One line plus one scatter series is drawn per model, and the figure is
    saved as a PNG under the result path.

    Args:
        dataset: dataset name, used to look up test graph sizes and to build
            the output path.
        models: iterable of model names; each must be a key of `rs`.
        metric: object exposing `.name` (passed to the result's `mat`) and
            `.ylabel` (used for the y-axis label).
        rs: mapping from model name to a result object exposing
            `mat(name, norm=True)`.
    """
    font = {'family': 'serif', 'size': 22}
    matplotlib.rc('font', **font)
    # Only one figure is needed. The former extra `plt.figure(0)` call merely
    # left an empty, never-closed figure behind.
    plt.figure(figsize=(16, 10))
    xs = get_test_graph_sizes(dataset)
    # Record the sort permutation before sorting in place so that every
    # model's values can be re-ordered consistently with the x axis.
    so = np.argsort(xs)
    xs.sort()
    for model in models:
        mat = rs[model].mat(metric.name, norm=True)
        print('plotting for {}'.format(model))
        # Mean along axis 1, then re-ordered by graph size.
        ys = np.mean(mat, 1)[so]
        plt.plot(xs, ys, **get_plotting_arg(args1, model))
        plt.scatter(xs, ys, s=200, label=model,
                    **get_plotting_arg(args2, model))
    plt.xlabel('query graph size')
    ax = plt.gca()
    ax.set_xticks(xs)
    plt.ylabel('average {}'.format(metric.ylabel))
    plt.legend(loc='best', ncol=2)
    plt.grid(linestyle='dashed')
    plt.tight_layout()
    sp = get_result_path() + '/{}/{}/ged_{}_mat_{}_{}.png'.format(
        dataset, metric, metric, dataset, '_'.join(models))
    plt.savefig(sp)
    print('Saved to {}'.format(sp))
def post_real_dataset_run_convert_csv_to_np():
    """
    Use in case only csv is generated, and numpy matrices need to be saved.

    Reads the hard-coded result csv, rebuilds the score and time matrices
    from its rows and saves them through `save_as_np`.
    """
    dataset = 'imdbmulti'
    model = 'CDKMCS'
    ds_metric = 'mcs'
    row_graphs = load_data(dataset, False).graphs
    col_graphs = load_data(dataset, True).graphs
    num_cpu = 40
    computer_name = 'scai1_all'
    ts = '2018-10-09T13:41:13.942414'
    outdir = '{}/{}'.format(get_result_path(), dataset)
    csv_fn = '{}/csv/{}_{}_{}_{}_{}_{}cpus.csv'.format(
        outdir, ds_metric, dataset, model, ts, computer_name, num_cpu)
    data = load_from_exsiting_csv(csv_fn, ds_metric)
    m = len(row_graphs)
    n = len(col_graphs)
    # -3 is identifier that the csv the data came from didn't include the
    # data point. The matrices are created as float: an integer fill value
    # would yield integer arrays and silently truncate fractional
    # scores/times on assignment.
    ds_mat = np.full((m, n), -3, dtype=float)
    time_mat = np.full((m, n), -3, dtype=float)
    # The time column sits one position later for non-ged rows.
    time_idx = 6 if ds_metric == 'ged' else 7
    cnt = 0
    print('m: {}, n: {}, m*n: {}'.format(m, n, m * n))
    # NOTE(review): the keys of `data` are used directly as matrix indices;
    # confirm that load_from_exsiting_csv keys rows by position, not by gid.
    for (i, j), row_data in data.items():
        if cnt % 1000 == 0:
            print(cnt)
        ds_mat[i][j] = row_data[4]
        time_mat[i][j] = row_data[time_idx]
        cnt += 1
    print(cnt)
    assert (cnt == m * n)
    save_as_np(outdir, ds_metric, ds_mat, time_mat, ts, dataset,
               row_graphs, col_graphs, model, computer_name, num_cpu)
def run():
    """Assemble the crawler's collaborators from the settings file and run it."""
    events_tracker = TerminalEventsTracker(
        log_pth="../logs.txt", report_every_responses_nb=1000)

    # Every path below is resolved from the same settings file.
    settings_path = "./settings.json"
    storage_paths = utils.get_proxy_and_creds_paths(settings_path)
    proxies_save_pth, creds_save_pth = storage_paths
    checkpoint_paths = utils.get_data_requester_checkpoint_paths(settings_path)
    checkp_data, checkp_requester = checkpoint_paths
    result_file = utils.get_result_path(settings_path)
    backups_path = utils.get_backups_path(settings_path)

    proxy_storage = ProxyStorage(proxies_save_pth)
    creds_storage = CredsStorage(creds_save_pth)

    runner_options = dict(
        start_user_id=142478661,
        data_resume_checkpoint_save_pth=checkp_data,
        tracker=events_tracker,
        proxy_storage=proxy_storage,
        creds_storage=creds_storage,
        requester_checkpoints_path=checkp_requester,
        requester_max_requests_per_loop=4000,
        long_term_save_pth=result_file,
        data_backup_path=str(backups_path / "parsed_backup.jsonl"),
        loops_per_checkpoint=3,
        use_async=True,
        nb_sessions=8,
        dmp_long_term_steps=2000,
    )
    VkCrawlRunnerWithCheckpoints(**runner_options).run()
def _load_result_mat(self, dataset, metric):
    """Load the saved result matrix for `metric` on `dataset`.

    Globs the result directory for files matching this object's distance
    metric and model, lets `_choose_result_file` pick one and loads it.

    Raises:
        RuntimeError: if no file matches the pattern.
    """
    root = get_result_path()
    pattern_tail = '/{}/{}/{}_{}_mat_{}_{}_*.npy'.format(
        dataset, metric, self.dist_metric(), metric, dataset, self.model_)
    file_p = root + pattern_tail
    candidates = glob(file_p)
    if candidates:
        chosen = self._choose_result_file(candidates)
        return np.load(chosen)
    raise RuntimeError('No results found {}'.format(file_p))
def exp5():
    """ Query visualization. """
    dataset = 'imdbmulti'
    model = 'astar'
    concise = True
    norms = [True, False]
    out_dir = get_result_path() + '/{}/query_vis/{}'.format(dataset, model)
    create_dir_if_not_exists(out_dir)
    info_dict = {
        # draw node config
        'draw_node_size': 150 if dataset != 'linux' else 10,
        'draw_node_label_enable': True,
        'node_label_name': None if dataset == 'linux' else 'type',
        'draw_node_label_font_size': 6,
        'draw_node_color_map': TYPE_COLOR_MAP,
        # draw edge config
        'draw_edge_label_enable': False,
        'edge_label_name': 'valence',
        'draw_edge_label_font_size': 6,
        # graph text info config
        'each_graph_text_list': [],
        'each_graph_text_font_size': 8,
        'each_graph_text_pos': [0.5, 1.05],
        # graph padding: value range: [0, 1]
        'top_space': 0.20 if concise else 0.26,  # out of whole graph
        'bottom_space': 0.05,
        'hbetween_space': 0.6 if concise else 1,  # out of the subgraph
        'wbetween_space': 0,
        # plot config
        'plot_dpi': 200,
        'plot_save_path_eps': '',
        'plot_save_path_png': ''
    }
    train_data = load_data(dataset, train=True)
    test_data = load_data(dataset, train=False)
    row_graphs = test_data.graphs
    col_graphs = train_data.graphs
    r = load_result(dataset, model,
                    row_graphs=row_graphs, col_graphs=col_graphs)
    tr = load_result(dataset, TRUE_MODEL,
                     row_graphs=row_graphs, col_graphs=col_graphs)
    for norm in norms:
        ids = r.get_sort_id_mat(norm)
        m, n = r.m_n()
        num_vis = 10
        for i in range(num_vis):
            q = test_data.graphs[i]
            ranked = ids[i]
            # First three, the middle one and the last three of the ranking.
            gids = np.concatenate(
                [ranked[:3], [ranked[int(n / 2)]], ranked[-3:]])
            gs = [train_data.graphs[j] for j in gids]

            # The first label describes the query, the rest the shown graphs.
            labels = [get_text_label(dataset, r, tr, i, i, q, model, norm,
                                     True, concise)]
            for j in gids:
                labels.append(get_text_label(
                    dataset, r, tr, i, j, train_data.graphs[j], model, norm,
                    False, concise))
            info_dict['each_graph_text_list'] = labels

            for ext in ('png', 'eps'):
                info_dict['plot_save_path_' + ext] = \
                    '{}/query_vis_{}_{}_{}{}.{}'.format(
                        out_dir, dataset, model, i, get_norm_str(norm), ext)
            vis(q, gs, info_dict)
def draw_emb_hist_heat_helper(gcn_id, nel, cmap_color, dataset, row_graphs,
                              col_graphs, ids, true_r, ds_norm, plot_max_num,
                              extra_dir):
    """Save a heatmap and a histogram of node-embedding dot products.

    For every row graph, the first and last column ids of its `ids` row are
    visualised. Stops early once more than `plot_max_num` plots were saved.
    """

    def _store(fig, kind, fn):
        # Save into the result tree and, when requested, into extra_dir too;
        # returns the accumulated save_fig results.
        saved = save_fig(
            fig, '{}/{}/{}'.format(get_result_path(), dataset, kind), fn,
            print_path=False)
        if extra_dir:
            saved += save_fig(fig, extra_dir + '/' + kind, fn,
                              print_path=False)
        return saved

    plt_cnt = 0
    num_cols = len(col_graphs)
    for i in range(len(row_graphs)):
        # gids = column ids of [worst match, best match]
        row_ids = ids[i]
        gids = np.concatenate([row_ids[:1], row_ids[-1:]])
        for j in gids:
            _, d = true_r.dist_sim(i, j, ds_norm)
            # nel is [train + val ... test]
            query_emb = nel[num_cols + i]
            match_emb = nel[j]
            # result is dot product between the query (test) and match
            # (train/val)
            result = np.dot(query_emb, match_emb.T)
            fn = '{}_{}_{}_gcn{}'.format(i, j, d, gcn_id)

            plt.figure()
            heat_fig = sns.heatmap(
                result, fmt='d', cmap=cmap_color).get_figure()
            plt_cnt += _store(heat_fig, 'heatmap', fn)
            plt.close()

            # Flatten the matrix row by row for the histogram.
            result_array = [value for row in result for value in row]
            plt.figure()
            plt.xlim(-1, 1)
            plt.ylim(0, 100)
            hist_fig = sns.distplot(
                result_array, bins=16, color='r', kde=False, rug=False,
                hist=True).get_figure()
            plt_cnt += _store(hist_fig, 'histogram', fn)
            plt.close()
        if plt_cnt > plot_max_num:
            break
    print('Saved {} node embeddings mne plots for gcn{}'.format(
        plt_cnt, gcn_id))
def clean_up():
    """Interactively delete result entries matching the module-level `f`."""
    pattern = '{}/{}'.format(get_result_path(), f)
    for path in sorted_nicely(glob(pattern)):
        print_info(path, basename(path))
        answer = prompt('Delete? [y/n]', ['y', 'n'])
        if answer == 'n':
            print('Skip')
            continue
        # prompt is given only 'y'/'n' as options; anything else is a bug.
        assert answer == 'y'
        # NOTE(review): `exec` is presumably the project's shell-command
        # helper rather than the builtin -- confirm.
        exec('rm -rf {}'.format(path))
    print('Done')
def plot_preck(dataset, dsmetric, models, rs, true_result, metric, norms,
               plot_results=True, extra_dir=None):
    """
    Plot prec@k.

    Runs `plot_preck_helper` once per entry of `norms` with k ranging over
    1..n-1 (n from `true_result.m_n()`) and merges the returned dicts.
    """
    target_dir = '{}/{}/{}'.format(get_result_path(), dataset, metric)
    create_dir_if_not_exists(target_dir)
    merged = {}
    for norm in norms:
        _num_rows, num_cols = true_result.m_n()
        merged.update(plot_preck_helper(
            dataset, dsmetric, models, rs, true_result, metric, norm,
            range(1, num_cols), False, plot_results, extra_dir))
    return merged
def _load_result_mat(self, metric, model, m, n):
    """Load the result matrix of `model` for `metric`.

    When no file matches and `model` contains 'astar', falls back to
    `_load_merged_astar_from_other_three` (imdbmulti only).

    Raises:
        RuntimeError: if nothing matches and no fallback applies.
    """
    file_p = get_result_path() + '/{}/{}/{}_{}_mat_{}_{}_*.npy'.format(
        self.dataset, metric, self.dist_metric(), metric, self.dataset,
        model)
    matches = glob(file_p)
    if matches:
        return np.load(self._choose_result_file(matches, m, n))
    if 'astar' not in model:
        raise RuntimeError('No results found {}'.format(file_p))
    if self.dataset != 'imdbmulti':
        raise RuntimeError('Not imdbmulti and no astar results!')
    return self._load_merged_astar_from_other_three(metric, m, n)
def rename():
    """Interactively rename files, replacing `source` with `target`.

    Walks the tree under the result path joined with the module-level `f`;
    files whose name already contains `target` are skipped silently.
    """
    root = '{}/{}'.format(get_result_path(), f)
    for dirpath, _dirs, files in walk(root):
        for bfn in files:
            if target in bfn:
                continue
            dest_bfn = bfn.replace(source, target)
            answer = prompt(
                'Rename {} to {}? [y/n]'.format(bfn, dest_bfn), ['y', 'n'])
            if answer == 'n':
                print('Skip')
                continue
            # prompt is given only 'y'/'n' as options; anything else is a bug.
            assert answer == 'y'
            # NOTE(review): `exec` is presumably the project's shell-command
            # helper rather than the builtin -- confirm.
            exec('mv {} {}'.format(
                join(dirpath, bfn), join(dirpath, dest_bfn)))
    print('Done')
def _load_sim_mat(self):
    """Return the test-vs-train similarity matrix, using a cache file.

    The matrix is read with `load_pkl` when the cache file exists; otherwise
    it is computed from the train/test embeddings, written with `save_pkl`
    and returned.

    Raises:
        RuntimeError: if `self.sim` is not 'dot'.
    """
    cache_fn = get_result_path() + \
        '/{}/sim/{}_graph2vec_dim_{}_sim_{}.npy'.format(
            self.dataset, self.dataset, self.dim, self.sim)

    if isfile(cache_fn):
        with open(cache_fn, 'rb') as handle:
            cached = load_pkl(handle)
        print('Loaded sim mat from {}'.format(cache_fn))
        return cached

    train_emb = self._load_emb(True)
    test_emb = self._load_emb(False)
    if self.sim != 'dot':
        raise RuntimeError('Unknown sim {}'.format(self.sim))
    sim_mat = test_emb.dot(train_emb.T)

    with open(cache_fn, 'wb') as handle:
        save_pkl(sim_mat, handle)
    print('Saved sim mat {} to {}'.format(sim_mat.shape, cache_fn))
    return sim_mat
def _load_result_mat(self, metric, model, m, n):
    """Load the result matrix of `model` for `metric`.

    When no file matches and `model` contains 'astar', falls back to
    `_load_merged_astar_from_other_three` for the datasets that support it.

    Raises:
        RuntimeError: if nothing matches and no fallback applies.
    """
    file_p = get_result_path() + '/{}/{}/{}_{}_mat_{}_{}_*.npy'.format(
        self.dataset, metric, self.ds_metric, metric, self.dataset, model)
    matches = glob(file_p)
    if matches:
        return np.load(self._choose_result_file(matches, m, n))
    if 'astar' not in model:
        raise RuntimeError('No results found {}'.format(file_p))
    # Datasets for which a merged astar result can be assembled.
    mergeable = ('imdbmulti', 'webeasy', 'linux_imdb', 'nci109', 'ptc',
                 'mutag')
    if self.dataset not in mergeable:
        raise RuntimeError(
            'Not imdbmulti/webeasy/linux_imdb/... and no astar results in {}!'
            .format(file_p))
    return self._load_merged_astar_from_other_three(metric, m, n)
def plot_preck_helper(dataset, dsmetric, models, rs, true_result, metric,
                      norm, ks, logscale, plot_results, extra_dir):
    """Compute prec@k for each model and optionally plot the curves.

    `metric` must start with 'prec@k'; an optional '_<number>' suffix is
    parsed as a float and forwarded to `prec_at_ks`.

    Returns:
        A single-entry dict mapping 'preck<norm>_<rm>' to
        {model: {'ks': ks, 'precs': precs}}.
    """
    assert (metric[0:6] == 'prec@k')
    rm = float(metric.split('_')[1]) if len(metric) > 6 else 0

    print_ids = []
    numbers = {}
    for model in models:
        numbers[model] = {
            'ks': ks,
            'precs': prec_at_ks(
                true_result, rs[model], norm, ks, rm, print_ids),
        }
    rtn = {'preck{}_{}'.format(get_norm_str(norm), rm): numbers}
    if not plot_results:
        return rtn

    plt.figure(figsize=(16, 10))
    draw_line = plt.semilogx if logscale else plt.plot
    for model in models:
        entry = numbers[model]
        draw_line(entry['ks'], entry['precs'],
                  **get_plotting_arg(args1, model))
        plt.scatter(entry['ks'], entry['precs'], s=200,
                    label=shorten_name(model),
                    **get_plotting_arg(args2, model))
    plt.xlabel('k')
    plt.ylabel(metric)
    plt.ylim([-0.06, 1.06])
    plt.legend(loc='best', ncol=2)
    plt.grid(linestyle='dashed')
    plt.tight_layout()

    kss = 'k_{}_{}'.format(min(ks), max(ks))
    bfn = '{}_{}_{}_{}_{}{}_{}'.format(
        dsmetric, metric, dataset, '_'.join(models), kss,
        get_norm_str(norm), rm)
    save_fig(plt, '{}/{}/{}'.format(get_result_path(), dataset, metric), bfn)
    if extra_dir:
        save_fig(plt, extra_dir, bfn)
    print(metric, 'plotted')
    return rtn
def _load_emb(self, train):
    """Return the graph2vec embedding matrix for the train or test split.

    Loads the cached `.npy` file when it exists. Otherwise the matrix is
    assembled from the json embeddings (one row per graph of the split),
    saved to the cache path and returned.

    Args:
        train: truthy for the train split, falsy for the test split.

    Raises:
        RuntimeError: if the number of json embeddings matched to graphs of
            the split differs from the number of graph ids in the split.
    """
    fn = get_result_path(
    ) + '/{}/emb/{}_graph2vec_{}_emb_dim_{}.npy'.format(
        self.dataset, self.dataset, 'train' if train else 'test', self.dim)
    if isfile(fn):
        emb = np.load(fn)
        print('Loaded emb {} from {}'.format(emb.shape, fn))
        return emb
    data = load_data(self.dataset, train=train)
    id_map = self._gid_to_matrixid(data)
    emb = np.zeros((len(data.graphs), self.dim))
    cnt = 0
    d = self._load_json_emb()
    for f in d:
        gid = get_file_base_id(f)
        # Entries whose id is not in id_map are ignored.
        if gid in id_map:
            emb[id_map[gid]] = d[f]
            cnt += 1
    if cnt != len(id_map):
        # The message must be formatted before the exception is built;
        # calling .format on the exception object raised AttributeError.
        raise RuntimeError(
            'Mismatch: {} != {}'.format(cnt, len(id_map)))
    np.save(fn, emb)
    print('Saved emb {} to {}'.format(emb.shape, fn))
    return emb
def plot_single_number_metric(dataset, dsmetric, models, rs, true_result,
                              metric, norms, ds_kernel=None,
                              thresh_poss=None, thresh_negs=None,
                              thresh_poss_sim=None, thresh_negs_sim=None,
                              plot_results=True, extra_dir=None):
    """
    Plot mrr or mse.

    Calls `plot_single_number_metric_helper` once per entry of `norms`,
    passing the i-th threshold of every threshold list that was supplied
    (None otherwise), and merges the returned dicts.
    """

    def _ith(values, idx):
        # Missing/empty threshold lists translate to None for the helper.
        return values[idx] if values else None

    create_dir_if_not_exists(
        '{}/{}/{}'.format(get_result_path(), dataset, metric))
    if norms and thresh_poss and thresh_negs:
        assert (len(norms) == len(thresh_poss) == len(thresh_negs))

    merged = {}
    for i, norm in enumerate(norms):
        merged.update(plot_single_number_metric_helper(
            dataset, dsmetric, models, rs, true_result, metric, norm,
            ds_kernel,
            _ith(thresh_poss, i), _ith(thresh_negs, i),
            _ith(thresh_poss_sim, i), _ith(thresh_negs_sim, i),
            plot_results, extra_dir))
    return merged
def test_model(args):
    """Evaluate every model file found in `args.save_path` on the test set.

    Reads `save_path`, `mode`, `encoder`, `candidate_num` and `gpus` from
    `args`, and sets `args.batch_size` to 1.
    """
    model_names = os.listdir(args.save_path)

    # load dataset
    data_paths = get_data_path(args.mode, args.encoder)
    pipe = MatchSumPipe(args.candidate_num, args.encoder)
    datasets = pipe.process_from_file(data_paths)
    print('Information of dataset is:')
    print(datasets)
    test_set = datasets.datasets['test']

    # need 1 gpu for testing
    device = int(args.gpus)
    args.batch_size = 1

    for cur_model in model_names:
        print('Current model is {}'.format(cur_model))

        # load model
        model = torch.load(join(args.save_path, cur_model))

        # configure testing
        dec_path, ref_path = get_result_path(args.save_path, cur_model)
        rouge_metric = MatchRougeMetric(
            data=read_jsonl(data_paths['test']),
            dec_path=dec_path,
            ref_path=ref_path,
            n_total=len(test_set))
        Tester(data=test_set,
               model=model,
               metrics=[rouge_metric],
               batch_size=args.batch_size,
               device=device,
               use_tqdm=False).test()
def plot_heatmap(gs1_str, gs2_str, dist_mat, thresh_pos, thresh_neg,
                 dataset, dist_metric, norm):
    """Save two heatmaps for `dist_mat`: classification labels and distances.

    Each row is sorted independently (labels descending, distances
    ascending) before plotting. Both plots go to the dataset's
    'classif_labels' result directory.
    """
    m, n = dist_mat.shape
    total = m * n
    label_mat, num_poses, num_negs, _, _ = \
        get_classification_labels_from_dist_mat(
            dist_mat, thresh_pos, thresh_neg)
    title = '{} pos pairs ({:.2%})\n{} neg pairs ({:.2%})'.format(
        num_poses, num_poses / total, num_negs, num_negs / total)

    # Label heatmap.
    labels_desc = np.sort(label_mat, axis=1)[:, ::-1]
    label_tag = '{}({})_{}({})_{}_{}'.format(
        gs1_str, m, gs2_str, n, thresh_pos, thresh_neg)
    label_fn = '{}_acc_{}_labels_heatmap_{}{}'.format(
        dist_metric, label_tag, dataset, get_norm_str(norm))
    out_dir = '{}/{}/classif_labels'.format(get_result_path(), dataset)
    create_dir_if_not_exists(out_dir)
    plot_heatmap_helper(labels_desc, title, out_dir, label_fn, cmap='bwr')

    # Distance heatmap.
    dists_asc = np.sort(dist_mat, axis=1)
    dist_tag = '{}({})_{}({})'.format(gs1_str, m, gs2_str, n)
    dist_fn = '{}_acc_{}_dist_heatmap_{}{}'.format(
        dist_metric, dist_tag, dataset, get_norm_str(norm))
    plot_heatmap_helper(dists_asc, '', out_dir, dist_fn, cmap='tab20')
from utils import get_result_path import pandas as pd import os import copy from ast import literal_eval from os.path import join name = 'aids700nef' dataset = join( get_result_path(), name, 'mcs', 'mcs_aids700nef_mccreesh2017_2018-11-27T02:36:27.553945_redacted-desktop_all_4cpus' ) df = pd.read_csv('{}.csv'.format(dataset), sep=',') # for index, chunk in enumerate(pd.read_csv('{}.csv'.format(dataset), sep=',', chunksize=1)): print('read csv') hits = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] cur_hit = 0 for index, row in df.iterrows(): perc = index / len(df) if cur_hit < len(hits) and abs(perc - hits[cur_hit]) <= 0.05: print('{}/{}={:.1%}'.format(index, len(df), perc)) cur_hit += 1 node_mapping = {} edge_mapping = row['node_mapping'] edge_mapping = literal_eval(edge_mapping)[0] row['edge_mapping'] = [copy.deepcopy(edge_mapping)] # one node with one node mapping if edge_mapping == {}:
'movie': '#ff6666', 'tvSeries': '#ff6666', 'actor': 'lightskyblue', 'actress': '#ffb3e6', 'director': 'yellowgreen', 'composer': '#c2c2f0', 'producer': '#ffcc99', 'cinematographer': 'gold'} if __name__ == '__main__': plot_what = 'att_vis' concise = True dataset = 'aids700nef' model = 'astar' dir = get_result_path() + '/{}/{}/{}'.format(dataset, plot_what, model) weight_data = load_as_dict("/home/songbian/Documents/fork/GraphEmbedding/" "model/Siamese/logs/" "siamese_classification_aids700nef_2018-07-28T10:09:33/" "test_info.pickle") weight = weight_data['atts'] info_dict = { # draw node config 'draw_node_size': 800 if dataset != 'linux' else 20, 'draw_node_label_enable': True, 'node_label_name': None if dataset == 'linux' else 'type', 'draw_node_label_font_size': 16, 'draw_node_color_map': TYPE_COLOR_MAP, # draw edge config 'draw_edge_label_enable': False, 'edge_label_name': 'valence',
save_path = info_dict['plot_save_path'] if save_path is None or save_path == "": plt.show() else: sp = info_dict['plot_save_path'] print('Saving query vis plot to {}'.format(sp)) plt.savefig(sp, dpi=info_dict['plot_dpi']) if __name__ == '__main__': dataset = 'linux' model = 'astar' concise = True ext = 'png' norms = [True, False] dir = get_result_path() + '/{}/att_vis_ourrank/{}'.format(dataset, model) create_dir_if_not_exists(dir) info_dict = { # draw node config 'draw_node_size': 10 if dataset != 'linux' else 10, 'draw_node_label_enable': True, 'node_label_name': None if dataset == 'linux' else 'type', 'draw_node_label_font_size': 6, 'draw_node_color_map': { 'C': '#ff6666', 'O': 'lightskyblue', 'N': 'yellowgreen', 'movie': '#ff6666', 'tvSeries': '#ff6666', 'actor': 'lightskyblue', 'actress': '#ffb3e6',
def draw_ranking(dataset, ds_metric, true_r, pred_r, model_name,
                 node_feat_name, plot_node_ids=False, plot_gids=False,
                 ds_norm=True, existing_mappings=None, extra_dir=None,
                 plot_max_num=np.inf):
    """Draw, per query graph, the groundtruth ranking above the model ranking.

    For each row graph the top 6, the middle and the last entries of both
    rankings are shown. Stops early once more than `plot_max_num` plots were
    saved.
    """
    plot_what = 'query_demo'
    concise = True
    out_dir = get_result_path() + '/{}/{}/{}'.format(
        dataset, plot_what, true_r.get_model())
    info_dict = {
        # draw node config
        'draw_node_size': 20,
        'draw_node_label_enable': True,
        'show_labels': plot_node_ids,
        'node_label_type': 'label' if plot_node_ids else 'type',
        'node_label_name': 'type',
        'draw_node_label_font_size': 6,
        'draw_node_color_map': get_color_map(true_r.get_all_gs()),
        # draw edge config
        'draw_edge_label_enable': False,
        'draw_edge_label_font_size': 6,
        # graph text info config
        'each_graph_text_list': [],
        'each_graph_text_font_size': 10,
        'each_graph_text_pos': [0.5, 1.05],
        # graph padding: value range: [0, 1]
        'top_space': 0.20 if concise else 0.26,  # out of whole graph
        'bottom_space': 0.05,
        'hbetween_space': 0.6 if concise else 1,  # out of the subgraph
        'wbetween_space': 0,
        # plot config
        'plot_dpi': 200,
        'plot_save_path_eps': '',
        'plot_save_path_png': ''
    }

    test_gs = true_r.get_row_gs()
    train_gs = true_r.get_single_col_gs() \
        if true_r.has_single_col_gs() else None
    if plot_node_ids and existing_mappings:
        # existing_orderings: [train + val ... test]
        num_train = len(train_gs)
        test_gs = reorder_gs_based_on_exsiting_mappings(
            test_gs, existing_mappings[num_train:], node_feat_name)
        train_gs = reorder_gs_based_on_exsiting_mappings(
            train_gs, existing_mappings[0:num_train], node_feat_name)

    plt_cnt = 0
    ids_groundtruth = true_r.get_sort_id_mat(ds_norm)
    ids_rank = pred_r.get_sort_id_mat(ds_norm)
    for i in range(len(test_gs)):
        q = test_gs[i]
        if not true_r.has_single_col_gs():
            train_gs = true_r.get_col_gs(i)
        num_cols = len(train_gs)

        # Choose the top 6 matches, the overall middle match, and the worst
        # match.
        selected_ids = list(range(6)) + [num_cols // 2, -1]

        # Get the selected graphs from the groundtruth and the model.
        gids_groundtruth = np.array(ids_groundtruth[i][selected_ids])
        gids_rank = np.array(ids_rank[i][selected_ids])

        # Top row: groundtruth outputs. Bottom row: query + model ranking.
        gs = [train_gs[j] for j in gids_groundtruth]
        gs.append(q)
        gs.extend(train_gs[j] for j in gids_rank)

        # First label is the name of the groundtruth algorithm, rest are
        # scores for the graphs.
        text = [get_text_label_for_ranking(
            ds_metric, true_r, i, i, ds_norm, True, dataset,
            gids_groundtruth, plot_gids)]
        for j in gids_groundtruth:
            text.append(get_text_label_for_ranking(
                ds_metric, true_r, i, j, ds_norm, False, dataset,
                gids_groundtruth, plot_gids))

        # Bottom row labels: rank position followed by the predicted score.
        text.append("Rank by\n{}".format(model_name))
        num_shown = len(gids_rank)
        for pos, gid in enumerate(gids_rank):
            ds = format_ds(pred_r.pred_ds(i, gid, ds_norm))
            if pos == num_shown - 2:
                label = '\n ... {} ...\n{}'.format(int(num_cols / 2), ds)
            elif pos == num_shown - 1:
                label = '\n {}\n{}'.format(int(num_cols), ds)
            else:
                label = '\n {}\n{}'.format(str(pos + 1), ds)
            text.append(label)

        # Perform the visualization.
        info_dict['each_graph_text_list'] = text
        fn = '{}_{}_{}_{}{}'.format(
            plot_what, dataset, true_r.get_model(), i,
            get_norm_str(ds_norm))
        info_dict, plt_cnt = set_save_paths_for_vis(
            info_dict, out_dir, extra_dir, fn, plt_cnt)
        vis_small(q, gs, info_dict)
        if plt_cnt > plot_max_num:
            break
    print('Saved {} query demo plots'.format(plt_cnt))
paths['test'] = 'data/test_CNNDM_' + encoder + '.jsonl' return paths path = get_data_path("test","bert") print(path) # # for name in path: # # assert exists(path[name]) # # print(path[name]) datasets = MatchSumPipe(20, "bert").process_from_file(path) print('Information of dataset is:') print(datasets) test_set = datasets.datasets['test'] device = int(0) batch_size = 1 for cur_model in models: print('Current model is {}'.format(cur_model)) # load model model = torch.load(join(save_path, cur_model)) # configure testing dec_path, ref_path = get_result_path(save_path, cur_model) test_metric = MatchRougeMetric(data=read_jsonl(path['test']), dec_path=dec_path, ref_path=ref_path, n_total = len(test_set)) tester = Tester(data=test_set, model=model, metrics=[test_metric], batch_size=batch_size, device=device, use_tqdm=False) tester.test()
def plot_true_pairs(dataset_name, num_pairs, fix_match_pos, want_gid_tuples,
                    need_eps):
    """Plot graph pairs for the 'true' result only.

    Delegates to `_plot_pairs` with no log folder (None) and the dataset's
    'matching_vis' directory under the result path as output.
    """
    _plot_pairs(
        None,
        dataset_name,
        num_pairs,
        fix_match_pos,
        join(get_result_path(), dataset_name, 'matching_vis'),
        ['true'],
        want_gid_tuples,
        need_eps)
def plot_single_number_metric_helper(dataset, dsmetric, models, rs,
                                     true_result, metric, norm, ds_kernel,
                                     thresh_pos, thresh_neg, thresh_pos_sim,
                                     thresh_neg_sim, plot_results,
                                     extra_dir):
    """Evaluate one scalar `metric` per model and optionally bar-plot it.

    dsmetric: distance/similarity metric, e.g. ged, mcs, etc.
    metric: eval metric.

    Returns:
        A single-entry dict mapping '<metric><norm str>' to {model: value}.

    Raises:
        RuntimeError: if `metric` is not recognised.
    """
    print_ids = []

    def _evaluate(model):
        # Compute the value of `metric` for a single model.
        if metric == 'mrr':
            return mean_reciprocal_rank(
                true_result, rs[model], norm, print_ids)
        if metric == 'mse':
            return mean_squared_error(
                true_result, rs[model], ds_kernel, norm)
        if metric == 'dev':
            return mean_deviation(true_result, rs[model], ds_kernel, norm)
        if metric == 'time':
            return average_time(rs[model])
        if 'acc' in metric:
            triple = accuracy(
                true_result, rs[model], thresh_pos, thresh_neg,
                thresh_pos_sim, thresh_neg_sim, norm)
            pos_acc, neg_acc, acc = triple
            if metric == 'pos_acc':
                return pos_acc
            if metric == 'neg_acc':
                return neg_acc
            if metric == 'acc':
                return acc  # only the overall acc
            assert (metric == 'accall')
            return triple
        if metric == 'kendalls_tau':
            return kendalls_tau(true_result, rs[model], norm)
        if metric == 'spearmans_rho':
            return spearmans_rho(true_result, rs[model], norm)
        raise RuntimeError('Unknown {}'.format(metric))

    val_list = [_evaluate(model) for model in models]
    rtn = {'{}{}'.format(metric, get_norm_str(norm)):
           dict(zip(models, val_list))}
    if not plot_results:
        return rtn

    bar_plt = plot_multiple_bars(val_list, models, metric)
    ylabels = {
        'time': 'time (msec)',
        'pos_acc': 'pos_recall',
        'neg_acc': 'neg_recall',
        'kendalls_tau': 'Kendall\'s $\\tau$',
        'spearmans_rho': 'Spearman\'s $\\rho$',
    }
    bar_plt.ylabel(ylabels.get(metric, metric))
    if metric == 'time':
        bar_plt.yscale('log')

    # The file name of a time plot is built with norm=None.
    name_norm = None if metric == 'time' else norm
    metric_addi_info = ''
    bfn = '{}_{}{}_{}_{}{}'.format(
        dsmetric, metric, metric_addi_info, dataset, '_'.join(models),
        get_norm_str(name_norm))
    sp = get_result_path() + '/{}/{}/'.format(dataset, metric)
    save_fig(bar_plt, sp, bfn)
    if extra_dir:
        save_fig(bar_plt, extra_dir, bfn)
    print(metric, 'plotted')
    return rtn
dist_mat[i][j] = d save(sfn, dist_mat) print('Saved to {}'.format(sfn)) return dist_mat if __name__ == '__main__': dataset = 'imdbmulti' dist_metric = 'ged' dist_algo = 'astar' dist_calculator = DistCalculator(dataset, dist_metric, dist_algo) # The server qilin calculated all the pairwise distances between # the training graphs. # Thus, enrich the distance map (i.e. calculator) using the qilin results. mat1 = np.load('{}/{}/{}/{}.npy'.format( get_result_path(), dataset, dist_metric, 'ged_ged_mat_imdbmulti_beam80_2018-08-02T22:38:34_qilin_all_20cpus')) mat2 = np.load('{}/{}/{}/{}.npy'.format( get_result_path(), dataset, dist_metric, 'ged_ged_mat_imdbmulti_hungarian_2018-08-03T13:40:54_qilin_all_20cpus') ) mat3 = np.load('{}/{}/{}/{}.npy'.format( get_result_path(), dataset, dist_metric, 'ged_ged_mat_imdbmulti_vj_2018-08-04T10:21:15_qilin_all_20cpus')) row_gs = load_data(dataset, train=True).graphs col_gs = load_data(dataset, train=True).graphs dist_calculator.load_from_dist_mat([mat1, mat2, mat3], row_gs, col_gs, check_symmetry=False)
#--- Get params parser = argparse.ArgumentParser() parser.add_argument('--experiment', type=str, default=None, help='name of experiment', required=True) args = parser.parse_args() #-- Load params experiment = args.experiment results = {} n_splits = 10 for i_split in range(1,n_splits+1): # path path = utils.get_result_path(experiment, str(i_split)) with open(path,'r') as f: lines = [line.strip() for line in f.readlines()] for line in lines: # parse line parts = line.split(': ') if len(parts)==3: settings = parts[0] element = parts[1] value = float(parts[2]) # if key does not exist add array key = settings+':'+element if key not in results: results[key] = []
def visualize_embeddings(dataset, orig_embs, true_result, thresh_pos,
                         thresh_neg, thresh_pos_sim, thresh_neg_sim, norm,
                         pred_r, eps_dir=None):
    """Save one 2-D t-SNE scatter plot per row of `pred_r.sort_id_mat_`.

    In each plot the first 10 ids of the row, the remaining ids and the
    point at index `i + n` are drawn as three separate series. Plots are
    written as PNG to the dataset's 'emb_vis' directory and, when `eps_dir`
    is given, as PNG and EPS there too.
    """
    embs = TSNE(n_components=2).fit_transform(orig_embs)
    out_dir = '{}/{}/emb_vis'.format(get_result_path(), dataset)
    create_dir_if_not_exists(out_dir)
    if eps_dir:
        create_dir_if_not_exists(eps_dir)
    m = np.shape(pred_r.sort_id_mat_)[0]
    n = np.shape(pred_r.sort_id_mat_)[1]
    plt_cnt = 0
    print('TSNE embeddings: {} --> {} to plot'.format(
        orig_embs.shape, embs.shape))
    save_opts = {'bbox_inches': 'tight', 'pad_inches': 0}
    for i in range(m):
        # First 10 entries of the row.
        axis_x_blue = [embs[pred_r.sort_id_mat_[i][j], 0] for j in range(10)]
        axis_y_blue = [embs[pred_r.sort_id_mat_[i][j], 1] for j in range(10)]
        # Everything after the first 10 entries.
        axis_x_red = [embs[pred_r.sort_id_mat_[i][j + 10], 0]
                      for j in range(n - 10)]
        axis_y_red = [embs[pred_r.sort_id_mat_[i][j + 10], 1]
                      for j in range(n - 10)]
        # The row's own point is read from index i + n.
        axis_x_query = [embs[i + n, 0]]
        axis_y_query = [embs[i + n, 1]]

        cm = plt.cm.get_cmap("Reds")
        plt.figure()
        plt.scatter(axis_x_red, axis_y_red, s=30,
                    c=list(range(n - 10)), marker='o', alpha=0.6,
                    cmap=plt.cm.get_cmap("Blues"))
        plt.scatter(axis_x_blue, axis_y_blue, s=15,
                    c=list(range(9, -1, -1)), marker='s', alpha=0.6,
                    cmap=plt.cm.get_cmap("Reds"))
        plt.scatter(axis_x_query, axis_y_query, s=400, c='limegreen',
                    marker='P', alpha=0.6)
        plt.axis('off')
        cur_axes = plt.gca()
        cur_axes.axes.get_xaxis().set_visible(False)
        cur_axes.axes.get_yaxis().set_visible(False)
        plt.tight_layout()

        plt.savefig(out_dir + '/' + str(i) + '.png', **save_opts)
        if eps_dir:
            plt.savefig(eps_dir + '/' + str(i) + '.png', **save_opts)
            plt.savefig(eps_dir + '/' + str(i) + '.eps', **save_opts)
        plt_cnt += 1
        plt.close()
    print('Saved {} embedding visualization plots'.format(plt_cnt))
for g in gs: if g.graph['gid'] == gid: return g return None if __name__ == '__main__': dataset = 'mutag' dist_metric = 'ged' dist_algo = 'astar' dist_sim_calculator = DistSimCalculator(dataset, dist_metric, dist_algo) # The server qilin calculated all the pairwise distances between # the training graphs. # Thus, enrich the distance map (i.e. calculator) using the qilin results. csv3 = ('{}/{}/csv/{}.csv'.format( get_result_path(), dataset, 'ged_mutag_beam80_2019-01-22T13:55:19.744928_qilin_all_20cpus')) csv1 = ('{}/{}/csv/{}.csv'.format( get_result_path(), dataset, 'ged_mutag_hungarian_2019-01-22T14:09:43.111557_feilong_all_15cpus')) csv2 = ('{}/{}/csv/{}.csv'.format( get_result_path(), dataset, 'ged_mutag_vj_2019-01-22T16:34:47.260820_qilin_all_20cpus')) row_gs = load_data(dataset, train=True).graphs col_gs = load_data(dataset, train=True).graphs dist_sim_calculator.load(row_gs, col_gs, csv_filenames=[csv1, csv2, csv3], ds_metric='ged', check_symmetry=False) # dataset = 'webeasy'
parser.add_argument('--experiment', type=str, default=None, help='name of experiment', required=True) args = parser.parse_args() #-- Load params experiment = args.experiment results = {} n_splits = 10 for i_split in range(1, n_splits + 1): # path path = utils.get_result_path(experiment, str(i_split)) with open(path, 'r') as f: lines = [line.strip() for line in f.readlines()] for line in lines: # parse line parts = line.split(': ') if len(parts) == 3: settings = parts[0] element = parts[1] value = float(parts[2]) # if key does not exist add array key = settings + ':' + element if key not in results: results[key] = []
def real_dataset_run_helper(computer_name, dataset, ds_metric, algo,
                            row_graphs, col_graphs, num_cpu, timeout):
    """Compute (or reuse) all pairwise scores and save them as csv and npy.

    Pairs already present in a user-supplied csv are reused; the remaining
    pairs are submitted to a process pool. Every pair is logged to a new csv
    file and the score/time matrices are finally stored via `save_as_np`.

    Args:
        computer_name: label embedded in output file names and progress logs.
        dataset: dataset name (output sub-directory and file names).
        ds_metric: 'ged' or 'mcs'.
        algo: algorithm name forwarded to the solver.
        row_graphs: graphs indexing the matrix rows.
        col_graphs: graphs indexing the matrix columns.
        num_cpu: number of worker processes.
        timeout: forwarded to the solver.

    Raises:
        RuntimeError: if `ds_metric` is neither 'ged' nor 'mcs'.
    """
    if ds_metric == 'ged':
        func = ged
    elif ds_metric == 'mcs':
        func = mcs
        # For MCS, since the solver can handle labeled and unlabeled graphs,
        # but the compressed encoding must be labeled (need to tell it to
        # ignore labels or not).
        # TODO: this should go in some kind of config file specific for mcs
        if node_has_type_attrib(row_graphs[0]):
            labeled = True
            label_key = 'type'
            print('Has node type')
        else:
            labeled = False
            label_key = ''
            print('Does not have node type')
    else:
        raise RuntimeError(
            'Unknown distance similarity metric {}'.format(ds_metric))
    m = len(row_graphs)
    n = len(col_graphs)
    ds_mat = np.zeros((m, n))
    time_mat = np.zeros((m, n))
    outdir = '{}/{}'.format(get_result_path(), dataset)
    create_dir_if_not_exists(outdir + '/csv')
    create_dir_if_not_exists(outdir + '/{}'.format(ds_metric))
    create_dir_if_not_exists(outdir + '/time')
    exsiting_csv = prompt('File path to exsiting csv files?')
    exsiting_entries = load_from_exsiting_csv(
        exsiting_csv, ds_metric, skip_eval=False)
    is_symmetric = prompt('Is the ds matrix symmetric? (1/0)',
                          options=['0', '1']) == '1'
    if is_symmetric:
        assert (m == n)
    smart_needed = prompt('Is smart pair sorting needed? (1/0)',
                          options=['0', '1']) == '1'
    csv_fn = '{}/csv/{}_{}_{}_{}_{}_{}cpus.csv'.format(
        outdir, ds_metric, dataset, algo, get_ts(), computer_name, num_cpu)
    # The csv handle and the worker pool are both released even when a
    # worker fails or an assertion trips midway.
    with open(csv_fn, 'w') as csv_file:
        print('Saving to {}'.format(csv_fn))
        if ds_metric == 'ged':
            print_and_log(
                'i,j,i_gid,j_gid,i_node,j_node,i_edge,j_edge,ged,lcnt,time(msec)',
                csv_file)
        else:
            print_and_log(
                'i,j,i_gid,j_gid,i_node,j_node,i_edge,j_edge,mcs,node_mapping,edge_mapping,time(msec)',
                csv_file)
        # Multiprocessing.
        pool = mp.Pool(processes=num_cpu)
        try:
            # Submit to pool workers.
            results = {}
            pairs_to_run = get_all_pairs_to_run(
                row_graphs, col_graphs, smart_needed)
            for k, (i, j) in enumerate(pairs_to_run):
                g1, g2 = row_graphs[i], col_graphs[j]
                i_gid, j_gid = g1.graph['gid'], g2.graph['gid']
                if (i_gid, j_gid) in exsiting_entries:
                    continue
                if is_symmetric and (j_gid, i_gid) in exsiting_entries:
                    continue
                if ds_metric == 'mcs':
                    results[(i, j)] = pool.apply_async(
                        func, args=(g1, g2, algo, labeled, label_key, True,
                                    True, timeout,))
                else:
                    results[(i, j)] = pool.apply_async(
                        func, args=(g1, g2, algo, True, True, timeout,))
                print_progress(k, m, n, 'submit: {} {} {} {} cpus;'.
                               format(algo, dataset, computer_name, num_cpu))
            # Retrieve results from pool workers or a loaded csv file
            # (previous run).
            for k, (i, j) in enumerate(pairs_to_run):
                print_progress(k, m, n, 'work: {} {} {} {} {} cpus;'.
                               format(ds_metric, algo, dataset,
                                      computer_name, num_cpu))
                g1, g2 = row_graphs[i], col_graphs[j]
                i_gid, j_gid = g1.graph['gid'], g2.graph['gid']
                if (i, j) not in results:
                    # The pair was skipped at submission time, so it must be
                    # available from the loaded csv, directly or mirrored.
                    lcnt, mcs_node_mapping, mcs_edge_mapping = \
                        None, None, None
                    tmp = exsiting_entries.get((i_gid, j_gid))
                    if tmp:
                        if ds_metric == 'ged':
                            i_gid, j_gid, i_node, j_node, ds, lcnt, t = tmp
                        else:
                            i_gid, j_gid, i_node, j_node, ds, \
                                mcs_node_mapping, mcs_edge_mapping, t = tmp
                    else:
                        assert (is_symmetric)
                        get_from = exsiting_entries[(j_gid, i_gid)]
                        if ds_metric == 'ged':
                            j_gid, i_gid, j_node, i_node, ds, lcnt, t = \
                                get_from
                        else:
                            j_gid, i_gid, j_node, i_node, ds, \
                                mcs_node_mapping, mcs_edge_mapping, t = \
                                get_from
                    if ds_metric == 'ged':
                        assert (lcnt is not None)
                        assert (g1.graph['gid'] == i_gid)
                        assert (g2.graph['gid'] == j_gid)
                        assert (g1.number_of_nodes() == i_node)
                        assert (g2.number_of_nodes() == j_node)
                        s = form_ged_print_string(i, j, g1, g2, ds, lcnt, t)
                    else:
                        assert (mcs_node_mapping is not None and
                                mcs_edge_mapping is not None)
                        s = form_mcs_print_string(
                            i, j, g1, g2, ds, mcs_node_mapping,
                            mcs_edge_mapping, t)
                else:
                    if ds_metric == 'ged':
                        ds, lcnt, g1_a, g2_a, t = results[(i, j)].get()
                        i_gid, j_gid, i_node, j_node = \
                            g1.graph['gid'], g2.graph['gid'], \
                            g1.number_of_nodes(), g2.number_of_nodes()
                        assert (g1.number_of_nodes() ==
                                g1_a.number_of_nodes())
                        assert (g2.number_of_nodes() ==
                                g2_a.number_of_nodes())
                        exsiting_entries[(i_gid, j_gid)] = \
                            (i_gid, j_gid, i_node, j_node, ds, lcnt, t)
                        s = form_ged_print_string(i, j, g1, g2, ds, lcnt, t)
                    else:  # MCS
                        ds, mcs_node_mapping, mcs_edge_mapping, t = \
                            results[(i, j)].get()
                        # Cache the fresh result in the same 8-field layout
                        # as rows loaded from csv. The mappings are stored
                        # empty to save memory: they are written to the csv
                        # just below. Loaded rows are left untouched;
                        # blanking fields 1 and 2 of a loaded row used to
                        # overwrite its j_gid/i_node and broke later
                        # symmetric lookups.
                        exsiting_entries[(i_gid, j_gid)] = (
                            i_gid, j_gid, g1.number_of_nodes(),
                            g2.number_of_nodes(), ds, {}, {}, t)
                        s = form_mcs_print_string(
                            i, j, g1, g2, ds, mcs_node_mapping,
                            mcs_edge_mapping, t)
                print_and_log(s, csv_file)
                ds_mat[i][j] = ds
                time_mat[i][j] = t
        finally:
            # On success every result has been fetched already; on failure
            # outstanding work is abandoned rather than waited for.
            pool.terminate()
            pool.join()
    save_as_np(outdir, ds_metric, ds_mat, time_mat, get_ts(),
               dataset, row_graphs, col_graphs, algo, computer_name, num_cpu)
test_model = args.test_model snapshot = args.snapshot experiment = args.experiment gauss_var = args.gauss_var #--- GPU caffe.set_mode_gpu() #--- LOAD SMOTHED POSITION MAPS position_maps = utils.load_position_maps(split_name, gauss_var) #--- LOAD TEST DATA test_data = utils.get_test_data_path(split_name) #--- GET TEST RESULTS PATH test_res_path = utils.get_result_path(experiment, split_name) ###--- TEST print 'Testing' sys.stdout.flush() net_results, position_results = test_net(test_model, snapshot, test_data, test_iters, position_maps) im_acc, price_acc, name_acc = net_results print 'NET: image accuracy:', im_acc print 'NET: price accuracy:', price_acc print 'NET: name accuracy:', name_acc p_im_acc, p_price_acc, p_name_acc = position_results print 'NET+POSITION: image accuracy:', p_im_acc print 'NET+POSITION: price accuracy:', p_price_acc