def load_bert(args):
    s_train, p_train = load_data('penn_treebank_dataset', 'train')
    doc_id, sen_id_train, global_graph = construct_graph(p_train)
    bert_train_paths = bert_embeddings(args, s_train, '_train')
    bert_train = np.load(bert_train_paths[-1])

    s_dev, p_dev = load_data('penn_treebank_dataset', 'dev')
    doc_id, sen_id_dev, global_graph = construct_graph(p_dev)
    bert_dev_paths = bert_embeddings(args, s_dev, '_dev')
    bert_dev = np.load(bert_dev_paths[-1])

    s_test, p_test = load_data('penn_treebank_dataset', 'test')
    doc_id, sen_id_test, global_graph = construct_graph(p_test)
    bert_test_paths = bert_embeddings(args, s_test, '_test')
    bert_test = np.load(bert_test_paths[-1])

    return bert_train, bert_dev, bert_test, sen_id_train, sen_id_dev, sen_id_test
def load_graph(args, data_split=True):
    if not data_split:
        # data_split=False: return one set of graph embeddings over all splits.
        _, p_train = load_data('penn_treebank_dataset', 'train')
        doc_id, sen_id_train, global_graph = construct_graph(p_train)
        _, p_dev = load_data('penn_treebank_dataset', 'dev')
        doc_id, sen_id_dev, global_graph = construct_graph(p_dev)
        _, p_test = load_data('penn_treebank_dataset', 'test')
        doc_id, sen_id_test, global_graph = construct_graph(p_test)
        parsed = p_train + p_dev + p_test
        sen_id = sen_id_train + sen_id_dev + sen_id_test
        graph_emb = graph_embeddings(args, global_graph, doc_id, sen_id)
        return graph_emb, sen_id
    else:
        # data_split=True: return per-split graph embeddings.
        _, p_train = load_data('penn_treebank_dataset', 'train')
        doc_id, sen_id_train, global_graph = construct_graph(p_train)
        ge_train = graph_embeddings(args, global_graph, doc_id, sen_id_train, '_train')

        _, p_dev = load_data('penn_treebank_dataset', 'dev')
        doc_id, sen_id_dev, global_graph = construct_graph(p_dev)
        ge_dev = graph_embeddings(args, global_graph, doc_id, sen_id_dev, '_dev')

        _, p_test = load_data('penn_treebank_dataset', 'test')
        doc_id, sen_id_test, global_graph = construct_graph(p_test)
        ge_test = graph_embeddings(args, global_graph, doc_id, sen_id_test, '_test')

        return ge_train, ge_dev, ge_test, sen_id_train, sen_id_dev, sen_id_test
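# Illustrative only: a minimal sketch of how the two load_graph modes above
# might be called, assuming `args` comes from this project's own argument
# parser; the variable names on the left are placeholders.
#
#     graph_emb, sen_id = load_graph(args, data_split=False)              # one joint embedding
#     ge_tr, ge_dev, ge_te, sid_tr, sid_dev, sid_te = load_graph(args)    # per-split embeddings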
def mi_mlps_ptb(args):
    # load data
    s_train, p_train = load_data('penn_treebank_dataset', 'train')
    s_dev, p_dev = load_data('penn_treebank_dataset', 'dev')
    s_test, p_test = load_data('penn_treebank_dataset', 'test')
    sentences = s_train + s_dev + s_test
    parsed = p_train + p_dev + p_test
    doc_id, sen_id, global_graph = construct_graph(parsed)
    s_train, p_train, s_dev, p_dev, s_test, p_test = [], [], [], [], [], []

    # load embeddings
    graph_emb = graph_embeddings(args, global_graph, doc_id, sen_id)
    bert_emb = load_glove(args, sentences)
    # bert_emb = load_elmo(args, sentences)
    # bert_emb_paths = bert_embeddings(args, sentences)
    # bert_emb = np.load(bert_emb_paths[0], allow_pickle=True)

    # initialize mi
    mir, mig, mib = [], [], []
    for l in range(args.bert_layers_num):
        mib.append([])
    for s in range(len(sentences)):
        mir.append(0.)
        mig.append(0.)
        for l in range(args.bert_layers_num):
            mib[l].append(0.)

    if args.baselines:
        print('3.1 start to calculate baselines of MI...')
        # calculate MI baselines
        for r in range(args.repeat):
            tmp_mir = mine_probe(args, graph_emb, bert_emb, len(sentences), 'lower')
            tmp_mig = mine_probe(args, graph_emb, bert_emb, len(sentences), 'upper')
            # get sum value
            mir = [mir[s] + tmp_mir[s] for s in range(len(tmp_mir))]
            mig = [mig[s] + tmp_mig[s] for s in range(len(tmp_mig))]

    print('3.2 start to calculate BERT hidden states of MI...')
    for r in range(args.repeat):
        tmp_mib = mine_probe(args, graph_emb, bert_emb, len(sentences), args.bert_layers_num - 1)
        mib[-1] = [mib[-1][s] + tmp_mib[s] for s in range(len(tmp_mib))]
    mib_layers = sum(mib[-1]) / (len(mib[-1]) * args.repeat)
    print('MI(G, Glove): {} |'.format(mib_layers))
def load_embeddings(args):
    # load data
    s_train, p_train = load_data('penn_treebank_dataset', 'train')
    s_dev, p_dev = load_data('penn_treebank_dataset', 'dev')
    s_test, p_test = load_data('penn_treebank_dataset', 'test')
    sentences = s_train + s_dev + s_test
    parsed = p_train + p_dev + p_test
    # sentences = s_test
    # parsed = p_test
    doc_id, sen_id, global_graph = construct_graph(parsed)

    # load embeddings
    graph_emb = graph_embeddings(args, global_graph, doc_id, sen_id)
    bert_emb_paths = bert_embeddings(args, sentences)
    # graph_emb = graph_embeddings(args, global_graph, doc_id, sen_id, '_test')
    # bert_emb_paths = bert_embeddings(args, sentences, '_test')
    bert_emb = np.load(bert_emb_paths[-1])
    return graph_emb, bert_emb
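# Illustrative only: a sketch of how the loaders above might be combined in a
# driver script; it assumes the same `args` object used throughout this module
# and introduces no new behaviour.
#
#     graph_emb, bert_emb = load_embeddings(args)        # joint graph + last BERT layer
#     bert_tr, bert_dev, bert_te, *_ = load_bert(args)   # per-split BERT embeddings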
        # (1, dim) -> (1, dim) -> (1, )
        logit = th.sigmoid(th.sum(src * dst))
        preds.append(logit.detach().numpy().tolist())
        labels.append(edge.label)
    fpr, tpr, thresholds = metrics.roc_curve(labels, preds, pos_label=1)
    print("Evaluate link prediction AUC: {:.4f}".format(metrics.auc(fpr, tpr)))


if __name__ == "__main__":
    args = utils.init_args()
    valid_sku_raw_ids = utils.get_valid_sku_set(args.item_info_data)
    g, sku_encoder, sku_decoder = utils.construct_graph(
        args.action_data, args.session_interval_sec, valid_sku_raw_ids)
    train_g, test_g = utils.split_train_test_graph(g)
    sku_info_encoder, sku_info_decoder, sku_info = \
        utils.encode_sku_fields(args.item_info_data, sku_encoder, sku_decoder)
    num_skus = len(sku_encoder)
    num_brands = len(sku_info_encoder["brand"])
    num_shops = len(sku_info_encoder["shop"])
    num_cates = len(sku_info_encoder["cate"])
    print(
        "Num skus: {}, num brands: {}, num shops: {}, num cates: {}".format(
            num_skus, num_brands, num_shops, num_cates
        )
    )
            l = list(nx.all_simple_paths(G, source=idx_s, target=idx_d))
            paths_len.append(len(l))
        else:
            continue
    return np.array(paths_len)


if __name__ == '__main__':
    """
    Load rules from ClassBench filter file, build a graph, and print graph statistics
    """
    args = parse_args()
    ruleset = load_ruleset(args.ruleset, except_zero=False, random_priority=0)

    # build graph
    G = construct_graph(ruleset, True)

    # every node's (in degree, out degree)
    node_degree = []
    node_list = list(G.nodes())
    for idx, i in enumerate(node_list):
        node_degree.append((G.in_degree(i), G.out_degree(i)))

    """Calculation of nodes and edges"""
    # number of edges of each component
    edge_num_by_component = []
    weak_list = list(nx.weakly_connected_components(G))
    for idx, k in enumerate(weak_list):
        # if len(k) != 1:
        #     edge_num_by_component.append(len(list(G.edges(weak_list[idx]))))
        edge_num_by_component.append(len(list(G.edges(weak_list[idx]))))
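    # Illustrative only: the statistics collected above could be summarised with
    # plain numpy along these lines (not part of the original script):
    #
    #     in_deg = np.array([d for d, _ in node_degree])
    #     out_deg = np.array([d for _, d in node_degree])
    #     print("max in/out degree:", in_deg.max(), out_deg.max())
    #     print("largest components by edge count:",
    #           sorted(edge_num_by_component, reverse=True)[:10])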
def mi_bert_ptb(args, npeet=False, uncontext=False):
    # load data
    s_train, p_train = load_data('penn_treebank_dataset', 'train')
    s_dev, p_dev = load_data('penn_treebank_dataset', 'dev')
    s_test, p_test = load_data('penn_treebank_dataset', 'test')
    sentences = s_train + s_dev + s_test
    parsed = p_train + p_dev + p_test
    doc_id, sen_id, global_graph = construct_graph(parsed)
    s_train, p_train, s_dev, p_dev, s_test, p_test = [], [], [], [], [], []

    # load embeddings
    graph_emb = graph_embeddings(args, global_graph, doc_id, sen_id)
    if uncontext:
        bert_emb = load_glove(args, sentences)
        # bert_emb = load_elmo(args, sentences)
    else:
        bert_emb_paths = bert_embeddings(args, sentences)
        # bert_emb_paths = load_elmos(args, sentences)
        bert_emb = np.load(bert_emb_paths[0], allow_pickle=True)

    # initialize mi
    mir, mig, mib = [], [], []
    for l in range(args.bert_layers_num):
        mib.append([])
    for s in range(len(sentences)):
        mir.append(0.)
        mig.append(0.)
        for l in range(args.bert_layers_num):
            mib[l].append(0.)

    if args.baselines:
        print('3.1 start to calculate baselines of MI...')
        # calculate MI baselines
        for r in range(args.repeat):
            tmp_mir = mine_probe(args, graph_emb, bert_emb, len(sentences), 'lower')
            tmp_mig = mine_probe(args, graph_emb, bert_emb, len(sentences), 'upper')
            # get sum value
            mir = [mir[s] + tmp_mir[s] for s in range(len(tmp_mir))]
            mig = [mig[s] + tmp_mig[s] for s in range(len(tmp_mig))]

    print('3.2 start to calculate BERT hidden states of MI...')
    if uncontext:
        for r in range(args.repeat):
            tmp_mib = mine_probe(args, graph_emb, bert_emb, len(sentences), args.bert_layers_num - 1)
            mib[-1] = [mib[-1][s] + tmp_mib[s] for s in range(len(tmp_mib))]
        mib_layers = sum(mib[-1]) / (len(mib[-1]) * args.repeat)
        print('MI(G, Glove): {} |'.format(mib_layers))
    else:
        # calculate MI of BERT
        for l in range(args.bert_layers_num):
            bert_emb = np.load(bert_emb_paths[l], allow_pickle=True)
            for r in range(args.repeat):
                tmp_mib = mine_probe(args, graph_emb, bert_emb, len(sentences), l)
                mib[l] = [mib[l][s] + tmp_mib[s] for s in range(len(tmp_mib))]

        # compute average values for all results
        mir = [mi / args.repeat for mi in mir]
        mig = [mi / args.repeat for mi in mig]
        for l in range(args.bert_layers_num):
            mib[l] = [mi / args.repeat for mi in mib[l]]
        mib_layers = [sum(mib[l]) / len(mib[l]) for l in range(len(mib))]

        # print general results
        results = {'lower': mir, 'upper': mig, 'bert': mib}
        # print('\n', results, '\n')
        print('MI(G, R): {} | MI(G, G): {} | MI(G, BERT): {} |'.format(
            sum(mir) / len(mir), sum(mig) / len(mig), mib_layers))
    return
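# Illustrative only: mi_bert_ptb could be invoked roughly as below, assuming an
# args namespace exposing the attributes referenced above (repeat, baselines,
# bert_layers_num); the values are placeholders, not the project's defaults.
#
#     from argparse import Namespace
#     args = Namespace(repeat=3, baselines=True, bert_layers_num=13)
#     mi_bert_ptb(args, uncontext=False)  # contextual BERT hidden states
#     mi_bert_ptb(args, uncontext=True)   # static GloVe baseline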
torch.set_default_tensor_type(torch.DoubleTensor)
torch.set_default_dtype(torch.float64)
np.random.seed(config['seed'])
torch.manual_seed(config['seed'])

# -------------------------------------------------------------------------
# Setup logger
# -------------------------------------------------------------------------
logger.info("Add file handle to logger...")
logzero.logfile(os.path.join(result_dir, 'logs.log'))

# -------------------------------------------------------------------------
# Construct graph from config file
# -------------------------------------------------------------------------
logger.info("Construct graph...")
g, g_noy = utils.construct_graph(config)

# -------------------------------------------------------------------------
# Load data according to config
# -------------------------------------------------------------------------
data_type = config['data']['type']
logger.info(f"Load {data_type} data; A: {config['data']['protected']} ...")
data = data_loader.get_data(data_type, config['data'], graph=g)
a, y = data['A'], data['Y']
config['data']['samples'] = len(y)
config['max_cfu']['n_original'] = len(y)

if debug > 0:
    logger.info("Create and save scatter plot of features...")
    plotters.plot_scatter_matrix(data, g, fig_dir, save=True)
    logger.info("Create conditional histograms...")
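# Illustrative only: the config keys read above imply a structure roughly like
# the sketch below; only keys referenced in this snippet are shown, the values
# are placeholders, and any graph-specification keys consumed by
# utils.construct_graph are omitted.
#
#     config = {
#         'seed': 0,
#         'data': {'type': '<dataset name>', 'protected': '<protected attribute>'},
#         'max_cfu': {},
#     }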