def __init__(self): print("reading graphs...") self.n_node, self.graph = utils.read_edges(config.train_filename, config.test_filename) self.root_nodes = [i for i in range(self.n_node)] print("reading initial embeddings...") self.node_embed_init_d = utils.read_embeddings( filename=config.pretrain_emb_filename_d, n_node=self.n_node, n_embed=config.n_emb) self.node_embed_init_g = utils.read_embeddings( filename=config.pretrain_emb_filename_g, n_node=self.n_node, n_embed=config.n_emb) # construct or read BFS-trees self.trees = None if os.path.isfile(config.cache_filename): print("reading BFS-trees from cache...") pickle_file = open(config.cache_filename, 'rb') self.trees = pickle.load(pickle_file) pickle_file.close() else: print("constructing BFS-trees...") pickle_file = open(config.cache_filename, 'wb') if config.multi_processing: self.construct_trees_with_mp(self.root_nodes) else: self.trees = self.construct_trees(self.root_nodes) pickle.dump(self.trees, pickle_file) pickle_file.close() print("building GAN model...") self.discriminator = None self.generator2 = None self.build_generator() self.build_discriminator() ################################### self.latest_checkpoint = tf.train.latest_checkpoint(config.model_log) self.saver = tf.compat.v1.train.Saver() self.config = tf.compat.v1.ConfigProto() ########### #self.config.gpu_options.allow_growth = True ########### self.init_op = tf.group(tf.compat.v1.global_variables_initializer(), tf.compat.v1.local_variables_initializer()) self.sess = tf.compat.v1.Session(config=self.config) self.sess.run(self.init_op)
def __init__(self):
    t = time.time()
    print("reading graph...")
    self.n_node, self.n_relation, self.graph = utils.read_graph(config.graph_filename)
    self.node_list = list(self.graph.keys())  # range(0, self.n_node)
    print("[%.2f] reading graph finished. #node = %d #relation = %d" %
          (time.time() - t, self.n_node, self.n_relation))

    t = time.time()
    print("read initial embeddings...")
    self.node_embed_init_d = utils.read_embeddings(filename=config.pretrain_node_emb_filename_d,
                                                   n_node=self.n_node,
                                                   n_embed=config.n_emb)
    self.node_embed_init_g = utils.read_embeddings(filename=config.pretrain_node_emb_filename_g,
                                                   n_node=self.n_node,
                                                   n_embed=config.n_emb)
    # relation embeddings could be pre-loaded the same way (kept disabled here):
    # self.rel_embed_init_d = utils.read_embeddings(filename=config.pretrain_rel_emb_filename_d,
    #                                               n_node=self.n_node,
    #                                               n_embed=config.n_emb)
    # self.rel_embed_init_g = utils.read_embeddings(filename=config.pretrain_rel_emb_filename_g,
    #                                               n_node=self.n_node,
    #                                               n_embed=config.n_emb)
    print("[%.2f] read initial embeddings finished." % (time.time() - t))

    print("build GAN model...")
    self.discriminator = None
    self.generator = None
    self.build_generator()
    self.build_discriminator()

    self.latest_checkpoint = tf.train.latest_checkpoint(config.model_log)
    self.saver = tf.train.Saver()

    self.dblp_evaluation = DBLP_evaluation()
    self.yelp_evaluation = Yelp_evaluation()
    self.aminer_evaluation = Aminer_evaluation()

    self.config = tf.ConfigProto()
    self.config.gpu_options.allow_growth = True
    self.init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
    self.sess = tf.Session(config=self.config)
    self.sess.run(self.init_op)

    self.show_config()
def build_classifier(data_path):
    print('Constructing classifier...')
    classes = sorted(get_classes_from_data(data_path))
    if len(classes) <= 1:
        print("Not building classifier since we don't have enough faces")
        return
    class_to_num = {x: i for i, x in enumerate(classes)}

    # stack the per-class embeddings into one training matrix X with integer labels Y
    X = None
    Y = None
    for aclass in classes:
        embeddings = read_embeddings(os.path.join(data_path, aclass, 'embedding.h5'))
        X = embeddings if X is None else np.concatenate((X, embeddings), axis=0)
        labels = np.repeat(class_to_num[aclass], len(embeddings))
        Y = labels if Y is None else np.concatenate((Y, labels), axis=0)

    model = svm.SVC(kernel='linear', probability=True)
    model.fit(X, Y)
    write_classifier(os.path.join(data_path, 'classifier.pickle'), model, classes)
    return (model, classes, X, Y)
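
# Hedged usage sketch for the classifier built above (variable names here are illustrative;
# only the returned (model, classes, X, Y) tuple and sklearn's predict_proba are assumed):
# model, classes, X, Y = build_classifier('/path/to/faces')
# probabilities = model.predict_proba(new_embedding.reshape(1, -1))[0]
# predicted_class = classes[int(np.argmax(probabilities))]
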
def __init__(self, embed_filename, test_filename, test_neg_filename, n_node, n_embed):
    self.embed_filename = embed_filename        # each line: node_id, embeddings (dim: n_embed)
    self.test_filename = test_filename          # each line: node_id1, node_id2
    self.test_neg_filename = test_neg_filename  # each line: node_id1, node_id2
    self.n_node = n_node
    self.n_embed = n_embed
    self.emd = utils.read_embeddings(embed_filename, n_node=n_node, n_embed=n_embed)
def load_embed(self):
    self.emb = utils.read_embeddings(self.embed_filename,
                                     n_node=self.n_node,
                                     n_embed=self.n_embed)
    epsilon = 1e-8  # ref to BIGCLAM
    threshold = math.sqrt(-math.log(1 - epsilon))  # ref to BIGCLAM
    self.emb = self.emb > threshold
    self.embed_m = sp.csr_matrix(self.emb.T, dtype=np.uint32)
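
# Why sqrt(-log(1 - epsilon)): in the BIGCLAM model, two nodes that each carry membership
# weight delta in a community connect with probability 1 - exp(-delta**2); solving
# 1 - exp(-delta**2) = epsilon gives delta = sqrt(-log(1 - epsilon)). A quick check of the
# value used above (this relies only on the formula, nothing else from the project):
import math
epsilon = 1e-8
print(math.sqrt(-math.log(1 - epsilon)))  # ~1.0e-4, so only near-zero membership weights are pruned
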
def build_graph(graph, embedding_size=100, embedding_path=None, token2idx=None,
                input_dropout_rate=0.25, dropout_rate=0.5, l1=None, l2=None,
                convolutional_kernels=16, filter_extensions=[3, 4, 5],
                fix_embeddings=False, max_features=100000, max_len=100, output_dim=80):
    '''
    Builds a Keras Graph model that, given a query (a list of token indices), returns a
    vector of output_dim non-negative weights that sum to 1.
    The convolutional architecture is inspired by:
    Yoon Kim - Convolutional Neural Networks for Sentence Classification - arXiv:1408.5882v2
    '''
    regularizer = utils.get_regularizer(l1, l2)
    graph.add_input(name='input_query', input_shape=(None,), dtype='int32')

    E = None
    if embedding_path is not None:
        E = utils.read_embeddings(embedding_path, token2idx=token2idx, max_features=max_features)

    embedding_layer = Embedding(input_dim=max_features, output_dim=embedding_size,
                                input_length=max_len, weights=E)
    if fix_embeddings is True:
        embedding_layer.params = []
        embedding_layer.updates = []

    graph.add_node(embedding_layer, name='embedding', input='input_query')
    graph.add_node(Dropout(input_dropout_rate), name='embedding_dropout', input='embedding')

    # one convolution + max-pooling branch per filter width
    flatten_layer_names = []
    for w_size in filter_extensions:
        convolutional_layer = Convolution1D(input_dim=embedding_size,
                                            nb_filter=convolutional_kernels,
                                            filter_length=w_size,
                                            border_mode='valid',
                                            activation='relu',
                                            W_regularizer=regularizer,
                                            subsample_length=1)
        convolutional_layer_name = 'convolutional' + str(w_size)
        graph.add_node(convolutional_layer, name=convolutional_layer_name, input='embedding_dropout')

        pool_length = convolutional_layer.output_shape[1]
        pooling_layer = MaxPooling1D(pool_length=pool_length)
        pooling_layer_name = 'pooling' + str(w_size)
        graph.add_node(pooling_layer, name=pooling_layer_name, input=convolutional_layer_name)

        flatten_layer_name = 'flatten' + str(w_size)
        flatten_layer = Flatten()
        graph.add_node(flatten_layer, name=flatten_layer_name, input=pooling_layer_name)
        flatten_layer_names += [flatten_layer_name]

    graph.add_node(Dropout(dropout_rate), name='dropout',
                   inputs=flatten_layer_names, merge_mode='concat')

    dense_layer = Dense(output_dim=output_dim, W_regularizer=regularizer)
    graph.add_node(dense_layer, name='dense', input='dropout')

    softmax_layer = Activation('softmax')
    graph.add_node(softmax_layer, name='softmax', input='dense')
    return graph
def main(args):
    G_train = nx.read_weighted_edgelist(args.train, nodetype=int, create_using=nx.Graph())
    G_test = nx.read_weighted_edgelist(args.test, nodetype=int, create_using=nx.Graph())
    vector = read_node_vectors(args.embedding, G_test)

    print("=====Compute AUC====")
    auc = []
    for node in tqdm(list(G_test.nodes())):
        try:
            auc.append(AUC_MR.compute(G_test, node, vector))
        except ValueError:
            continue
    auc_mean = float(sum(auc) / len(auc))

    print("=====Compute MR====")
    sequence_order = AUC_MR.result_rank(G_test, vector)
    mr = []
    for node in tqdm(G_test.nodes()):
        try:
            mr.append(AUC_MR.mean_rank(G_test, node, sequence_order))
        except ValueError:
            continue
    Mean_Rank = sum(mr) / len(mr)

    print("=====Compute ACC====")
    n_node = len(G_train.nodes())
    neg_sample_link = []
    for node in tqdm(G_test.nodes()):
        neg_sample_link.append([node, ACC.generate_neg_link(G_test, args.negative_num, node)])
    np.savetxt("temp/negtive_link.txt", np.asarray(neg_sample_link),
               fmt="%s", newline="\n", delimiter="\t")

    test_edge = utils.read_edges_from_file(args.test)
    test_edge_neg = utils.read_edges_from_file("temp/negtive_link.txt")
    test_edge.extend(test_edge_neg)
    EMB, EMBMAP = utils.read_embeddings(args.embedding, n_node, args.dimensions)
    acc = ACC.eval_link_prediction(test_edge, EMB, EMBMAP)

    print("=====Show Results====")
    dataset_name = args.train.split("/")[-1].split(".")[0]
    tb = pt.PrettyTable()
    tb.field_names = ["dataset", "AUC", "MR", "ACC"]
    tb.add_row([dataset_name, auc_mean, Mean_Rank, acc])
    print(tb)
def __init__(self):
    t = time.time()
    print('reading graph...')
    self.graph, self.n_node, self.node_list, self.node_list_s, self.egs = utils.read_graph(config.train_file)
    self.node_emd_shape = [2, self.n_node, config.n_emb]
    print('[%.2f] reading graph finished. #node = %d' % (time.time() - t, self.n_node))

    self.dis_node_embed_init = None
    self.gen_node_embed_init = None
    if config.pretrain_dis_node_emb:
        t = time.time()
        print('reading initial embeddings...')
        self.dis_node_embed_init = np.array([utils.read_embeddings(filename=x,
                                                                   n_node=self.n_node,
                                                                   n_embed=config.n_emb)
                                             for x in [config.pretrain_dis_node_emb]])
        self.gen_node_embed_init = np.array([utils.read_embeddings(filename=x,
                                                                   n_node=self.n_node,
                                                                   n_embed=config.n_emb)
                                             for x in [config.pretrain_gen_node_emb]])
        print('[%.2f] read initial embeddings finished.' % (time.time() - t))

    print('building DGGAN model...')
    self.discriminator = None
    self.generator = None
    self.build_generator()
    self.build_discriminator()

    if config.experiment == 'link_prediction':
        self.link_prediction = evaluation.LinkPrediction(config)

    self.config = tf.ConfigProto()
    self.config.gpu_options.allow_growth = True
    self.sess = tf.Session(config=self.config)
    self.saver = tf.train.Saver(max_to_keep=0)

    if config.pretrain_ckpt:
        print('restore...')
        pretrain_ckpt = tf.train.latest_checkpoint(config.pretrain_ckpt)
        self.saver.restore(self.sess, pretrain_ckpt)
    else:
        print('initial...')
        self.init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
        self.sess.run(self.init_op)
def main():
    if len(sys.argv) > 1:
        emb_path = sys.argv[1]
        if not os.path.exists(emb_path):
            print('Error. Embeddings file is not found')
            return
    else:
        print('Error. Specify path to embeddings file')
        return

    embeddings, words2ids = read_embeddings(emb_path)
    embeddings = normalize_embeddings(embeddings)

    print('SIMILARITY test:')
    human_vs_cos_sim_correlation('datasets/tt_similarity.csv', embeddings, words2ids)

    print('RELATEDNESS test:')
    human_vs_cos_sim_correlation('datasets/tt_relatedness.csv', embeddings, words2ids)

    print('ANALOGIES test:')
    top_k = 10
    answer_analogy_questions('datasets/tt_analogies.txt', embeddings, words2ids, top_k)
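
# A minimal sketch of the row normalization assumed above (hypothetical helper, not the
# project's actual code): L2-normalizing each embedding lets the similarity/relatedness/
# analogy tests reduce cosine similarity to a plain dot product.
import numpy as np

def normalize_embeddings_sketch(embeddings):
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # guard against all-zero rows
    return embeddings / norms
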
#graph_filename="graphs/ca-netscience.txt" #graph_filename="graphs/test1.txt" #embedding_filename="output_ca_netscience.txt" graph_filename = "graphs/web-google.txt" #graph_filename="graphs/CA-GrQc.txt" #graph_filename="graphs/test1.txt" #embedding_filename="output_ca_netscience.txt" embedding_filename = "output.txt" adj_matrix, vertex_map, edge_count = utils.generate_adj_matrix(graph_filename) test_edges, test_edges_neg = utils.generate_edges(adj_matrix, len(vertex_map), edge_count) test_edges.extend(test_edges_neg) embeddings = utils.read_embeddings(embedding_filename) score_res = [] for i in range(len(test_edges)): score_res.append( np.dot(embeddings[test_edges[i][0]], embeddings[test_edges[i][1]])) test_label = np.array(score_res) bar = np.median(test_label) ind_pos = test_label >= bar ind_neg = test_label < bar test_label[ind_pos] = 1 test_label[ind_neg] = 0 true_label = np.zeros(test_label.shape) true_label[0:len(true_label) // 2] = 1 accuracy = accuracy_score(true_label, test_label)
    return accuracies


# --- --- ---

if __name__ == '__main__':
    """
    Trains the baseline and the 2-layer Bi-LSTM models separately for each dataset.
    """
    resources_path = parse_args().resources_path

    emb_path_1grams = resources_path + "/train/embeddings_1grams.utf8"
    emb_path_2grams = resources_path + "/train/embeddings_2grams.utf8"
    emb_path_3grams = resources_path + "/train/embeddings_3grams.utf8"

    word_to_idx_1grams, idx_to_word_1grams, emb_matrix_1grams = u.read_embeddings(emb_path_1grams)
    word_to_idx_2grams, idx_to_word_2grams, emb_matrix_2grams = u.read_embeddings(emb_path_2grams)
    word_to_idx_3grams, idx_to_word_3grams, emb_matrix_3grams = u.read_embeddings(emb_path_3grams)
    labels_to_idx, idx_to_labels = u.get_label_dictionaries()

    # grid_search(resources_path)

    # Train on the AS dataset
    tf.reset_default_graph()
    train_baseline_model(train_datasets=[resources_path + "/train/as_training_simpl_input.utf8",
                                         resources_path + "/train/as_training_simpl_label.utf8"],
                         dev_datasets=[resources_path + "/dev/as_dev_inputs.utf8",
                                       resources_path + "/dev/as_dev_labels.utf8"],
                         test_datasets=[resources_path + "/dev/as_test_inputs.utf8",
                                        resources_path + "/dev/as_test_labels.utf8"],
                         model_path=resources_path + "/base_model_as/base_model.ckpt",
                         model_ID=0)
def predict(input_path, output_path, resources_path):
    """
    This is the skeleton of the prediction function.
    The predict function will build your model, load the weights from the checkpoint and
    write a new file (output_path) with your predictions in the BIES format.

    The resources folder should contain everything you need to make the predictions. It is
    the "resources" folder in your submission.

    N.B. DO NOT HARD CODE PATHS IN HERE. Use resources_path instead, otherwise we will not
    be able to run the code.

    :param input_path: the path of the input file to predict.
    :param output_path: the path of the output file (where you save your predictions)
    :param resources_path: the path of the resources folder containing your model and stuff you might need.
    :return: None
    """
    print("Loading embeddings...")
    emb_path_1grams = resources_path + "/train/embeddings_1grams.utf8"
    emb_path_2grams = resources_path + "/train/embeddings_2grams.utf8"
    word_to_idx_1grams, idx_to_word_1grams, emb_matrix_1grams = u.read_embeddings(emb_path_1grams)
    word_to_idx_2grams, idx_to_word_2grams, emb_matrix_2grams = u.read_embeddings(emb_path_2grams)
    labels_to_idx, idx_to_labels = u.get_label_dictionaries()
    print("Done.")

    tf.reset_default_graph()
    x_1grams, x_2grams, y, \
        keep_pr, recurrent_keep_pr, \
        lengths, train, \
        loss, preds = m.get_layered_model(pretrained_emb_1grams=emb_matrix_1grams,
                                          pretrained_emb_2grams=emb_matrix_2grams,
                                          hidden_size=96,
                                          layers=1,
                                          y_size=len(labels_to_idx),
                                          learning_rate=0.005)

    model_path = resources_path + "/2layers_model_cityu/base_model.ckpt"
    print("Loading model saved in path: %s" % model_path)
    saver = tf.train.Saver()

    with tf.Session() as sess:
        saver.restore(sess, model_path)

        # truncate the output file before appending predictions batch by batch
        with open(output_path, mode='w', encoding='utf-8') as preds_file:
            pass

        print("\nGenerating predictions...")
        predictions = []
        with open(output_path, mode='a', encoding='utf-8') as preds_file:
            for batch_inputs, batch_labels, batch_lengths in u.generate_batches(
                    dataset_input=input_path,
                    dataset_label="",
                    batch_size=32,
                    label_to_idx=labels_to_idx,
                    ngram_features=[1, 2],
                    word_to_idx=[word_to_idx_1grams, word_to_idx_2grams],
                    to_shuffle=False,
                    testing=True):
                preds_val = sess.run([preds],
                                     feed_dict={
                                         x_1grams: batch_inputs[0],
                                         x_2grams: batch_inputs[1],
                                         lengths: batch_lengths,
                                         keep_pr: 1.0,
                                         recurrent_keep_pr: 1.0
                                     })
                for p in preds_val[0]:
                    p = p[1:np.count_nonzero(p) - 1]
                    p = p.tolist()
                    # default to "S" if some special tag (either '-' or '<PAD>') is predicted
                    p = [idx_to_labels[c] if c > 1 else idx_to_labels[5] for c in p]
                    predictions.append(p)
                    # flush to disk every 128 sentences
                    if len(predictions) == 128:
                        preds_file.writelines("%s\n" % ''.join(p) for p in predictions)
                        predictions = []

            # write any remaining sentences
            if len(predictions) > 0:
                preds_file.writelines("%s\n" % ''.join(p) for p in predictions)

    print("Done.\nYour predictions have been stored in path: %s" % output_path)
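
# Illustration of the BIES tags written above (a generic example, not project output): for a
# 7-character sentence segmented into words of 2, 1, and 4 characters, the corresponding
# output line is "BESBIIE", where B = begin of a multi-character word, I = inside,
# E = end, and S = single-character word.
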