def train(self):
    train_losses = []
    val_losses = []
    model_path = os.path.join(self.model_dir, self.model_file)

    print("Training model...\n")
    timer = Timer()
    timer.tic()

    x = self.data.x.to(self.device)
    train_pos_edge_index = self.data.train_pos_edge_index.to(self.device)

    for epoch in range(self.epochs):
        print("Epoch: {}".format(epoch + 1))

        # Training step
        self.model.train()
        self.optimizer.zero_grad()
        z = self.model.encode(x, train_pos_edge_index)
        loss = self.model.recon_loss(z, train_pos_edge_index)
        if self.model_name == "ARGVA":
            loss = loss + (1 / self.data.num_nodes) * self.model.kl_loss()
            loss += self.dis_loss_para * self.model.discriminator_loss(z) + \
                self.reg_loss_para * self.model.reg_loss(z)
        loss.backward()
        self.optimizer.step()

        # Evaluate on validation data
        self.model.eval()
        with torch.no_grad():
            train_losses.append(loss.cpu().detach().numpy())

            # Compute validation statistics on the held-out validation edges
            val_pos_edge_index = self.data.val_pos_edge_index.to(self.device)
            val_neg_edge_index = self.data.val_neg_edge_index.to(self.device)
            z = self.model.encode(x, train_pos_edge_index)
            val_loss = self.model.recon_loss(z, val_pos_edge_index,
                                             val_neg_edge_index)
            if self.model_name == "ARGVA":
                val_loss += (1 / self.data.num_nodes) * self.model.kl_loss()
                val_loss += self.dis_loss_para * \
                    self.model.discriminator_loss(z) + \
                    self.reg_loss_para * self.model.reg_loss(z)
            val_losses.append(val_loss.cpu().detach().numpy())

        # Keep the checkpoint with the lowest validation loss so far
        if val_losses[-1] == min(val_losses):
            print("\tSaving model...")
            torch.save(self.model.state_dict(), model_path)
            print("\tSaved.")

        print("\ttrain_loss=", "{:.5f}".format(loss),
              "val_loss=", "{:.5f}".format(val_loss))

    print("Finished training.\n")
    training_time = timer.toc()
    self._plot_losses(train_losses, val_losses)
    self._print_stats(train_losses, val_losses, training_time)
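A minimal sketch of the setup this loop appears to assume: a PyTorch Geometric auto-encoder (`GAE` here, with a small GCN encoder standing in for the project's actual encoder) trained on a `Data` object whose edges were split with `train_test_split_edges`, which is where attributes such as `train_pos_edge_index` and `val_pos_edge_index` come from. The toy graph, encoder sizes, and optimizer settings below are illustrative assumptions, not the repository's configuration.

```python
# Sketch only: GAE + GCNConv stand in for the project's model; a random ring
# graph replaces the real dataset.
import torch
from torch_geometric.data import Data
from torch_geometric.nn import GAE, GCNConv
from torch_geometric.utils import train_test_split_edges


class Encoder(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, 2 * out_channels)
        self.conv2 = GCNConv(2 * out_channels, out_channels)

    def forward(self, x, edge_index):
        return self.conv2(self.conv1(x, edge_index).relu(), edge_index)


# Toy graph: 100 nodes with random 16-dimensional features, connected in a ring.
edge_index = torch.tensor([[i for i in range(100)],
                           [(i + 1) % 100 for i in range(100)]])
data = Data(x=torch.randn(100, 16), edge_index=edge_index)
# Adds train_pos_edge_index, val_pos_edge_index, val_neg_edge_index, ...
data = train_test_split_edges(data)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GAE(Encoder(16, 32)).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
```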
def __init__(self, embedding_type, gpu=0):
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)

    self.embedding_type = embedding_type
    self.embeddings_parser = EmbeddingsParser(gpu)
    self.timer = Timer()
    self.path_persistent = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), "..", "..", "..",
        "data", "interim", "han", self.embedding_type)
    if not os.path.exists(self.path_persistent):
        os.makedirs(self.path_persistent)
def __init__(self, embedding_type, dataset, graph_type="directed",
             threshold=2, gpu=0):
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)

    self.embedding_type = embedding_type
    self.dataset = dataset
    self.graph_type = graph_type
    self.threshold = threshold
    self.embeddings_parser = EmbeddingsParser(gpu)
    self.timer = Timer()
    self.path_persistent = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), "..", "..", "..",
        "data", "interim", "gat", self.embedding_type, self.dataset)
    if not os.path.exists(self.path_persistent):
        os.makedirs(self.path_persistent)
def train(self, data):
    if not self._load_model_classifier():
        print("Classifier not trained yet. Training now...")
        timer = Timer()
        timer.tic()

        print("Loading the training embeddings...")
        if not self._load_train_embeddings():
            print("The pretrained embeddings are missing.")
        else:
            print("Loaded.")
            training_ids = list(data.chapter)
            training_embeddings = self.pretrained_embeddings[[
                self.pretrained_embeddings_id_map[id] for id in training_ids
            ]]
            self.label_encoder = LabelEncoder()
            self.labels = self.label_encoder.fit_transform(
                data.conferenceseries)
            self.classifier.fit(training_embeddings, self.labels)
            self._save_model_classifier()
            print("Training finished.")
            timer.toc()
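As a follow-up, a hedged sketch of how such a fitted classifier might be used at recommendation time. The helper below is hypothetical (it is not part of this file); it assumes a scikit-learn classifier that exposes `predict_proba` (true for the KNN, MLP, and multinomial logistic regression options) together with the `LabelEncoder` fitted above, and returns the `n` most probable conference series per row of test embeddings.

```python
# Hypothetical helper, not taken from the repository.
import numpy as np


def recommend_top_n(classifier, label_encoder, test_embeddings, n=10):
    proba = classifier.predict_proba(test_embeddings)   # (num_samples, num_classes)
    top_cols = np.argsort(-proba, axis=1)[:, :n]        # columns of the n largest scores
    top_labels = classifier.classes_[top_cols]          # map columns back to encoded labels
    return [list(label_encoder.inverse_transform(row)) for row in top_labels]
```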
class DatasetsParser:
    path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                        "..", "..", "data", "interim", "parsed_data")

    def __init__(self):
        self.parser = FileParser()
        self.persistent = {}
        self.timer = Timer()
        self.processes = {
            "chapters_books": {
                "process_data": "_process_data_chapters_books",
                "persistent_file": os.path.join(self.path,
                                                "chapters_books.pkl")},
            "chapters_all_scigraph_citations": {
                "process_data": "_process_data_chapters_all_scigraph_citations",
                "persistent_file": os.path.join(
                    self.path, "chapters_all_scigraph_citations.pkl")},
            "chapters_confproc_scigraph_citations": {
                "process_data": "_process_data_chapters_confproc_scigraph_citations",
                "persistent_file": os.path.join(
                    self.path, "chapters_confproc_scigraph_citations.pkl")},
            "books_conferences": {
                "process_data": "_process_data_books_conferences",
                "persistent_file": os.path.join(self.path,
                                                "books_conferences.pkl")},
            "author_id_chapters": {
                "process_data": "_process_data_author_id_chapters",
                "persistent_file": os.path.join(self.path,
                                                "author_id_chapters.pkl")},
            "author_name_chapters": {
                "process_data": "_process_data_author_name_chapters",
                "persistent_file": os.path.join(self.path,
                                                "author_name_chapters.pkl")},
            "confproc_scigraph_citations_chapters": {
                "process_data": "_process_data_confproc_scigraph_citations_chapters",
                "persistent_file": os.path.join(
                    self.path, "confproc_scigraph_citations_chapters.pkl")}
        }

    def get_data(self, process):
        # Check if the data is already present in memory
        if process in self.persistent:
            return self.persistent[process]
        print("Process '{}' not in memory yet.".format(process))

        # Load from persistent file if the data was already processed
        if os.path.isfile(self.processes[process]["persistent_file"]):
            with open(self.processes[process]["persistent_file"], "rb") as f:
                self.persistent[process] = pickle.load(f)
            return self.persistent[process]
        print("Process '{}' not persistent yet. Processing.".format(process))

        # Process the data
        self.persistent[process] = self._parse_file(
            self.processes[process]["process_data"])
        with open(self.processes[process]["persistent_file"], "wb") as f:
            pickle.dump(self.persistent[process], f)
        return self.persistent[process]

    def _parse_file(self, process_data):
        print("Start processing file.\n")
        self.timer.tic()
        process_data_function = self.__getattribute__(process_data)
        results = process_data_function()
        self.timer.toc()
        return results

    # Processes implementation

    def _process_data_chapters_books(self):
        # Load datasets
        df_chapters_books_isbns = pd.DataFrame(
            list(self.parser.get_data("chapters_books_isbns").items()),
            columns=["chapter", "books_isbns"])
        df_isbn_book_ids = pd.DataFrame(
            list(self.parser.get_data("isbn_books").items()),
            columns=["isbn", "book"])

        # Process datasets: resolve each of the two ISBNs to a book id
        df_chapters_books_isbns[["isbn1", "isbn2"]] = pd.DataFrame(
            df_chapters_books_isbns["books_isbns"].tolist(),
            index=df_chapters_books_isbns.index)
        df_chapters_books_isbns.drop(columns=["books_isbns"], inplace=True)
        df_chapters_isbn1 = pd.merge(
            df_chapters_books_isbns[["chapter", "isbn1"]], df_isbn_book_ids,
            how="inner", left_on=["isbn1"], right_on=["isbn"])
        df_chapters_isbn1.drop(columns=["isbn1", "isbn"], inplace=True)
        df_chapters_isbn2 = pd.merge(
            df_chapters_books_isbns[["chapter", "isbn2"]], df_isbn_book_ids,
            how="inner", left_on=["isbn2"], right_on=["isbn"])
        df_chapters_isbn2.drop(columns=["isbn2", "isbn"], inplace=True)
        df_chapters_books = df_chapters_isbn1.append(df_chapters_isbn2,
                                                     ignore_index=True)
        df_chapters_books.drop_duplicates(inplace=True)
        return df_chapters_books

    def _process_data_chapters_all_scigraph_citations(self):
        df_chapters_citations = pd.DataFrame(
            list(self.parser.get_data("chapters_all_citations").items()),
            columns=["chapter", "chapter_citations"])
        chapters_count = len(df_chapters_citations)
        with tqdm(desc="Processing citations", total=chapters_count,
                  unit="chapter") as pbar:
            for idx in range(chapters_count):
                citations = df_chapters_citations.iloc[idx][
                    "chapter_citations"]
                # Keep only citations that resolve to SciGraph identifiers
                citations = [c for c in citations
                             if c is not None and c.startswith("sg")]
                df_chapters_citations.iloc[idx][
                    "chapter_citations"] = citations if citations else np.nan
                pbar.update(1)
        return df_chapters_citations[
            df_chapters_citations["chapter_citations"].notnull()]

    def _process_data_chapters_confproc_scigraph_citations(self):
        df_scigraph_citations = self.get_data(
            "chapters_all_scigraph_citations")
        df_chapters = pd.DataFrame(self.parser.get_data("chapters"),
                                   columns=["chapter"])
        chapters = set(list(df_chapters["chapter"]))
        chapters_count = len(df_scigraph_citations)
        with tqdm(desc="Processing citations", total=chapters_count,
                  unit="chapter") as pbar:
            for idx in range(chapters_count):
                scigraph_citations = df_scigraph_citations.iloc[idx][
                    "chapter_citations"]
                # Keep only citations that point to conference proceedings chapters
                citations = [c for c in scigraph_citations if c in chapters]
                df_scigraph_citations.iloc[idx][
                    "chapter_citations"] = citations if citations else np.nan
                pbar.update(1)
        return df_scigraph_citations[
            df_scigraph_citations["chapter_citations"].notnull()]

    def _process_data_books_conferences(self):
        df_old_books_new_books = pd.DataFrame(
            list(self.parser.get_data("old_books_new_books").items()),
            columns=["old_book", "new_book"])
        df_old_books_conferences = pd.DataFrame(
            list(self.parser.get_data("old_books_conferences").items()),
            columns=["old_book", "conference"])
        df = pd.merge(df_old_books_new_books, df_old_books_conferences,
                      how="left", on="old_book")
        df.drop(columns=["old_book"], inplace=True)
        df.rename(columns={"new_book": "book"}, inplace=True)
        return df[df["conference"].notnull()]

    def _process_data_author_id_chapters(self):
        df_chapters_authors = pd.DataFrame(
            list(self.parser.get_data("chapters_authors").items()),
            columns=["chapter", "authors"])
        contributions = []
        for idx in range(len(df_chapters_authors)):
            authors = list(df_chapters_authors.iloc[idx]["authors"])
            chapter = df_chapters_authors.iloc[idx]["chapter"]
            contributions.extend([(author, chapter) for author in authors])
        author_id_chapters = pd.DataFrame.from_records(
            contributions, columns=["author", "chapter"])
        return author_id_chapters

    def _process_data_author_name_chapters(self):
        df_chapters_authors_name = pd.DataFrame(
            list(self.parser.get_data("chapters_authors_name").items()),
            columns=["chapter", "authors_name"])
        contributions = []
        for idx in range(len(df_chapters_authors_name)):
            authors_name = list(
                df_chapters_authors_name.iloc[idx]["authors_name"])
            chapter = df_chapters_authors_name.iloc[idx]["chapter"]
            contributions.extend([(author_name, chapter)
                                  for author_name in authors_name])
        author_name_chapters = pd.DataFrame.from_records(
            contributions, columns=["author_name", "chapter"])
        return author_name_chapters

    def _process_data_confproc_scigraph_citations_chapters(self):
        df_chapters_confproc_scigraph_citations = self.get_data(
            "chapters_confproc_scigraph_citations")
        citations = []
        for idx in range(len(df_chapters_confproc_scigraph_citations)):
            citation_list = list(
                df_chapters_confproc_scigraph_citations.iloc[idx][
                    "chapter_citations"])
            chapter = df_chapters_confproc_scigraph_citations.iloc[idx][
                "chapter"]
            citations.extend([(citation, chapter)
                              for citation in citation_list])
        confproc_scigraph_citations_chapter = pd.DataFrame.from_records(
            citations, columns=["citation", "chapter"])
        return confproc_scigraph_citations_chapter
def inference(self, test_data, gpu_mem_fraction=None):
    print("Inference.")
    timer = Timer()
    timer.tic()

    G = test_data[0]
    features = test_data[1]
    id_map = test_data[2]
    class_map = test_data[4]

    if isinstance(list(class_map.values())[0], list):
        num_classes = len(list(class_map.values())[0])
    else:
        num_classes = len(set(class_map.values()))

    if features is not None:
        # pad with dummy zero vector
        features = np.vstack([features, np.zeros((features.shape[1],))])

    placeholders = self._construct_placeholders(num_classes)
    minibatch = NodeMinibatchIterator(G, id_map, placeholders, class_map,
                                      num_classes,
                                      batch_size=self.batch_size,
                                      max_degree=self.max_degree)
    adj_info_ph = tf.compat.v1.placeholder(tf.int32,
                                           shape=minibatch.adj.shape)
    adj_info = tf.Variable(adj_info_ph, trainable=False, name="adj_info")

    model = self._create_model(num_classes, placeholders, features, adj_info,
                               minibatch)
    config = tf.compat.v1.ConfigProto(
        log_device_placement=self.log_device_placement)
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    # Initialize session
    sess = tf.compat.v1.Session(config=config)
    merged = tf.compat.v1.summary.merge_all()
    # summary_writer = tf.summary.FileWriter(self._log_dir(), sess.graph)

    # Initialize model saver
    saver = tf.compat.v1.train.Saver(max_to_keep=self.epochs)

    # Init variables
    sess.run(tf.compat.v1.global_variables_initializer(),
             feed_dict={adj_info_ph: minibatch.adj})

    # Restore model
    print("Restoring trained model.")
    checkpoint_file = os.path.join(self._log_dir(), "model.ckpt")
    ckpt = tf.compat.v1.train.get_checkpoint_state(checkpoint_file)
    if checkpoint_file:
        saver.restore(sess, checkpoint_file)
        print("Model restored.")
    else:
        print("This model checkpoint does not exist. The model might " +
              "not be trained yet or the checkpoint is invalid.")

    val_adj_info = tf.compat.v1.assign(adj_info, minibatch.test_adj)
    sess.run(val_adj_info.op)

    print("Computing predictions...")
    t_test = time.time()
    finished = False
    val_losses = []
    val_preds = []
    nodes = []
    iter_num = 0
    while not finished:
        feed_dict_val, _, finished, nodes_subset = \
            minibatch.incremental_node_val_feed_dict(self.batch_size,
                                                     iter_num, test=True)
        node_outs_val = sess.run([model.preds, model.loss],
                                 feed_dict=feed_dict_val)
        val_preds.append(node_outs_val[0])
        val_losses.append(node_outs_val[1])
        nodes.extend(nodes_subset)
        iter_num += 1
    val_preds = np.vstack(val_preds)
    print("Computed.")

    # Return only the embeddings of the test nodes
    test_preds_ids = {}
    for i, node in enumerate(nodes):
        test_preds_ids[node] = i
    test_nodes = [n for n in G.nodes() if G.node[n]['test']]
    test_preds = val_preds[[test_preds_ids[id] for id in test_nodes]]

    timer.toc()
    sess.close()

    return test_nodes, test_preds
def train(self, train_data, test_data=None):
    print("Training model...")
    timer = Timer()
    timer.tic()

    G = train_data[0]
    features = train_data[1]
    id_map = train_data[2]
    class_map = train_data[4]

    if isinstance(list(class_map.values())[0], list):
        num_classes = len(list(class_map.values())[0])
    else:
        num_classes = len(set(class_map.values()))

    if features is not None:
        # pad with dummy zero vector
        features = np.vstack([features, np.zeros((features.shape[1],))])

    placeholders = self._construct_placeholders(num_classes)
    minibatch = NodeMinibatchIterator(G, id_map, placeholders, class_map,
                                      num_classes,
                                      batch_size=self.batch_size,
                                      max_degree=self.max_degree)
    adj_info_ph = tf.compat.v1.placeholder(tf.int32,
                                           shape=minibatch.adj.shape)
    adj_info = tf.Variable(adj_info_ph, trainable=False, name="adj_info")

    model = self._create_model(num_classes, placeholders, features, adj_info,
                               minibatch)
    config = tf.compat.v1.ConfigProto(
        log_device_placement=self.log_device_placement)
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    # Initialize session
    sess = tf.compat.v1.Session(config=config)
    merged = tf.compat.v1.summary.merge_all()
    # summary_writer = tf.summary.FileWriter(self._log_dir(), sess.graph)

    # Initialize model saver
    saver = tf.compat.v1.train.Saver(max_to_keep=self.epochs)

    # Init variables
    sess.run(tf.compat.v1.global_variables_initializer(),
             feed_dict={adj_info_ph: minibatch.adj})

    # Train model
    total_steps = 0
    avg_time = 0.0
    epoch_val_costs = []
    train_losses = []
    validation_losses = []

    train_adj_info = tf.compat.v1.assign(adj_info, minibatch.adj)
    val_adj_info = tf.compat.v1.assign(adj_info, minibatch.test_adj)

    for epoch in range(self.epochs):
        minibatch.shuffle()
        iter = 0
        print('Epoch: %04d' % (epoch))
        epoch_val_costs.append(0)
        train_loss_epoch = []
        validation_loss_epoch = []

        while not minibatch.end():
            # Construct feed dictionary
            feed_dict, labels = minibatch.next_minibatch_feed_dict()
            feed_dict.update({placeholders['dropout']: self.dropout})

            t = time.time()
            # Training step
            outs = sess.run([merged, model.opt_op, model.loss, model.preds],
                            feed_dict=feed_dict)
            train_cost = outs[2]
            train_loss_epoch.append(train_cost)

            if iter % self.validate_iter == 0:
                # Validation
                sess.run(val_adj_info.op)
                if self.validate_batch_size == -1:
                    val_cost, val_f1_mic, val_f1_mac, duration = \
                        self._incremental_evaluate(sess, model, minibatch,
                                                   self.batch_size)
                else:
                    val_cost, val_f1_mic, val_f1_mac, duration = \
                        self._evaluate(sess, model, minibatch,
                                       self.validate_batch_size)
                sess.run(train_adj_info.op)
                epoch_val_costs[-1] += val_cost
                validation_loss_epoch.append(val_cost)

            # if total_steps % self.print_every == 0:
            #     summary_writer.add_summary(outs[0], total_steps)

            # Print results
            avg_time = (avg_time * total_steps + time.time() - t) / \
                (total_steps + 1)
            if total_steps % self.print_every == 0:
                train_f1_mic, train_f1_mac = self._calc_f1(labels, outs[-1])
                print("Iter:", '%04d' % iter,
                      "train_loss=", "{:.5f}".format(train_cost),
                      "train_f1_mic=", "{:.5f}".format(train_f1_mic),
                      "train_f1_mac=", "{:.5f}".format(train_f1_mac),
                      "val_loss=", "{:.5f}".format(val_cost),
                      "val_f1_mic=", "{:.5f}".format(val_f1_mic),
                      "val_f1_mac=", "{:.5f}".format(val_f1_mac),
                      "time=", "{:.5f}".format(avg_time))

            iter += 1
            total_steps += 1
            if total_steps > self.max_total_steps:
                break

        # Keep track of train and validation losses per epoch
        train_losses.append(sum(train_loss_epoch) / len(train_loss_epoch))
        validation_losses.append(
            sum(validation_loss_epoch) / len(validation_loss_epoch))

        # If the epoch has the lowest validation loss so far
        if validation_losses[-1] == min(validation_losses):
            print("Minimum validation loss so far ({}) at epoch {}.".format(
                validation_losses[-1], epoch))

        # Save model at each epoch
        print("Saving model at epoch {}.".format(epoch))
        saver.save(sess, os.path.join(self._log_dir(), "model.ckpt"))

        if total_steps > self.max_total_steps:
            break

    print("Optimization Finished!")
    training_time = timer.toc()
    self._plot_losses(train_losses, validation_losses)
    self._print_stats(train_losses, validation_losses, training_time)

    sess.run(val_adj_info.op)
    val_cost, val_f1_mic, val_f1_mac, duration = self._incremental_evaluate(
        sess, model, minibatch, self.batch_size)
    print("Full validation stats:",
          "loss=", "{:.5f}".format(val_cost),
          "f1_micro=", "{:.5f}".format(val_f1_mic),
          "f1_macro=", "{:.5f}".format(val_f1_mac),
          "time=", "{:.5f}".format(duration))
    with open(self._log_dir() + "val_stats.txt", "w") as fp:
        fp.write("loss={:.5f} f1_micro={:.5f} f1_macro={:.5f} time={:.5f}".
                 format(val_cost, val_f1_mic, val_f1_mac, duration))
def __init__(self):
    self.timer = Timer()
    self.persistent = {}
    self.processes = {
        # Old datasets
        "old_books": {"filename": os.path.join(self.path_raw, old_books_file), "process_line": "_process_line_old_books", "persistent_file": os.path.join(self.path_persistent, "old_books.pkl"), "persistent_variable": [], "dataset_format": "ntriples"},
        "old_books_new_books": {"filename": os.path.join(self.path_raw, old_books_file), "process_line": "_process_line_old_books_new_books", "persistent_file": os.path.join(self.path_persistent, "old_books_new_books.pkl"), "persistent_variable": {}, "dataset_format": "ntriples"},
        "old_books_conferences": {"filename": os.path.join(self.path_raw, old_books_file), "process_line": "_process_line_old_books_conferences", "persistent_file": os.path.join(self.path_persistent, "old_books_conferences.pkl"), "persistent_variable": {}, "dataset_format": "ntriples"},
        "conferences": {"filename": os.path.join(self.path_raw, old_conferences_file), "process_line": "_process_line_conferences", "persistent_file": os.path.join(self.path_persistent, "conferences.pkl"), "persistent_variable": [], "dataset_format": "ntriples"},
        "conferences_name": {"filename": os.path.join(self.path_raw, old_conferences_file), "process_line": "_process_line_conferences_name", "persistent_file": os.path.join(self.path_persistent, "conferences_name.pkl"), "persistent_variable": {}, "dataset_format": "ntriples"},
        "conferences_acronym": {"filename": os.path.join(self.path_raw, old_conferences_file), "process_line": "_process_line_conferences_acronym", "persistent_file": os.path.join(self.path_persistent, "conferences_acronym.pkl"), "persistent_variable": {}, "dataset_format": "ntriples"},
        "conferences_city": {"filename": os.path.join(self.path_raw, old_conferences_file), "process_line": "_process_line_conferences_city", "persistent_file": os.path.join(self.path_persistent, "conferences_city.pkl"), "persistent_variable": {}, "dataset_format": "ntriples"},
        "conferences_country": {"filename": os.path.join(self.path_raw, old_conferences_file), "process_line": "_process_line_conferences_country", "persistent_file": os.path.join(self.path_persistent, "conferences_country.pkl"), "persistent_variable": {}, "dataset_format": "ntriples"},
        "conferences_year": {"filename": os.path.join(self.path_raw, old_conferences_file), "process_line": "_process_line_conferences_year", "persistent_file": os.path.join(self.path_persistent, "conferences_year.pkl"), "persistent_variable": {}, "dataset_format": "ntriples"},
        "conferences_datestart": {"filename": os.path.join(self.path_raw, old_conferences_file), "process_line": "_process_line_conferences_datestart", "persistent_file": os.path.join(self.path_persistent, "conferences_datestart.pkl"), "persistent_variable": {}, "dataset_format": "ntriples"},
        "conferences_dateend": {"filename": os.path.join(self.path_raw, old_conferences_file), "process_line": "_process_line_conferences_dateend", "persistent_file": os.path.join(self.path_persistent, "conferences_dateend.pkl"), "persistent_variable": {}, "dataset_format": "ntriples"},
        "conferences_conferenceseries": {"filename": os.path.join(self.path_raw, old_conferences_file), "process_line": "_process_line_conferences_conferenceseries", "persistent_file": os.path.join(self.path_persistent, "conferences_conferenceseries.pkl"), "persistent_variable": {}, "dataset_format": "ntriples"},
        "conferenceseries": {"filename": os.path.join(self.path_raw, old_conferences_file), "process_line": "_process_line_conferenceseries", "persistent_file": os.path.join(self.path_persistent, "conferenceseries.pkl"), "persistent_variable": [], "dataset_format": "ntriples"},
        "conferenceseries_name": {"filename": os.path.join(self.path_raw, old_conferences_file), "process_line": "_process_line_conferenceseries_name", "persistent_file": os.path.join(self.path_persistent, "conferenceseries_name.pkl"), "persistent_variable": {}, "dataset_format": "ntriples"},
        # New datasets
        "books": {"filename": os.path.join(self.path_raw, books_file), "process_line": "_process_line_books", "persistent_file": os.path.join(self.path_persistent, "books.pkl"), "persistent_variable": [], "dataset_format": "json"},
        "isbn_books": {"filename": os.path.join(self.path_raw, books_file), "process_line": "_process_line_isbn_books", "persistent_file": os.path.join(self.path_persistent, "isbn_books.pkl"), "persistent_variable": {}, "dataset_format": "json"},
        "authors_name": {"filename": os.path.join(self.path_raw, authors_file), "process_line": "_process_line_authors_name", "persistent_file": os.path.join(self.path_persistent, "authors_name.pkl"), "persistent_variable": {}, "dataset_format": "json"},
        "chapters": {"filename": os.path.join(self.path_raw, chapters_file), "process_line": "_process_line_chapters", "persistent_file": os.path.join(self.path_persistent, "chapters.pkl"), "persistent_variable": [], "dataset_format": "json"},
        "chapters_title": {"filename": os.path.join(self.path_raw, chapters_file), "process_line": "_process_line_chapters_title", "persistent_file": os.path.join(self.path_persistent, "chapters_title.pkl"), "persistent_variable": {}, "dataset_format": "json"},
        "chapters_year": {"filename": os.path.join(self.path_raw, chapters_file), "process_line": "_process_line_chapters_year", "persistent_file": os.path.join(self.path_persistent, "chapters_year.pkl"), "persistent_variable": {}, "dataset_format": "json"},
        "chapters_language": {"filename": os.path.join(self.path_raw, chapters_file), "process_line": "_process_line_chapters_language", "persistent_file": os.path.join(self.path_persistent, "chapters_language.pkl"), "persistent_variable": {}, "dataset_format": "json"},
        "chapters_abstract": {"filename": os.path.join(self.path_raw, chapters_file), "process_line": "_process_line_chapters_abstract", "persistent_file": os.path.join(self.path_persistent, "chapters_abstract.pkl"), "persistent_variable": {}, "dataset_format": "json"},
        "chapters_authors": {"filename": os.path.join(self.path_raw, chapters_file), "process_line": "_process_line_chapters_authors", "persistent_file": os.path.join(self.path_persistent, "chapters_authors.pkl"), "persistent_variable": {}, "dataset_format": "json"},
        "chapters_authors_name": {"filename": os.path.join(self.path_raw, chapters_file), "process_line": "_process_line_chapters_authors_name", "persistent_file": os.path.join(self.path_persistent, "chapters_authors_name.pkl"), "persistent_variable": {}, "dataset_format": "json"},
        "chapters_all_citations": {"filename": os.path.join(self.path_raw, chapters_file), "process_line": "_process_line_chapters_all_citations", "persistent_file": os.path.join(self.path_persistent, "chapters_all_citations.pkl"), "persistent_variable": {}, "dataset_format": "json"},
        "chapters_keywords": {"filename": os.path.join(self.path_raw, chapters_file), "process_line": "_process_line_chapters_keywords", "persistent_file": os.path.join(self.path_persistent, "chapters_keywords.pkl"), "persistent_variable": {}, "dataset_format": "json"},
        "chapters_books_isbns": {"filename": os.path.join(self.path_raw, chapters_file), "process_line": "_process_line_chapters_books_isbns", "persistent_file": os.path.join(self.path_persistent, "chapters_books_isbns.pkl"), "persistent_variable": {}, "dataset_format": "json"},
    }
def train(self, train_data): print("Training model...") timer = Timer() timer.tic() G = train_data[0] features = train_data[1] id_map = train_data[2] if features is not None: # pad with dummy zero vector features = np.vstack([features, np.zeros((features.shape[1], ))]) context_pairs = train_data[3] if self.random_context else None placeholders = self._construct_placeholders() minibatch = EdgeMinibatchIterator(G, id_map, placeholders, batch_size=self.batch_size, max_degree=self.max_degree, num_neg_samples=self.neg_sample_size, context_pairs=context_pairs) adj_info_ph = tf.compat.v1.placeholder(tf.int32, shape=minibatch.adj.shape) adj_info = tf.Variable(adj_info_ph, trainable=False, name="adj_info") model = self._create_model(placeholders, features, adj_info, minibatch) config = tf.compat.v1.ConfigProto( log_device_placement=self.log_device_placement) config.gpu_options.allow_growth = True config.allow_soft_placement = True # Initialize session sess = tf.compat.v1.Session(config=config) merged = tf.compat.v1.summary.merge_all() # summary_writer = tf.compat.v1.summary.FileWriter(self._log_dir(), # sess.graph) # Initialize model saver saver = tf.compat.v1.train.Saver(max_to_keep=self.epochs) # Init variables sess.run(tf.compat.v1.global_variables_initializer(), feed_dict={adj_info_ph: minibatch.adj}) # Train model train_shadow_mrr = None shadow_mrr = None total_steps = 0 avg_time = 0.0 epoch_val_costs = [] train_losses = [] validation_losses = [] train_adj_info = tf.compat.v1.assign(adj_info, minibatch.adj) val_adj_info = tf.compat.v1.assign(adj_info, minibatch.test_adj) for epoch in range(self.epochs): minibatch.shuffle() iter = 0 print('Epoch: %04d' % (epoch)) epoch_val_costs.append(0) train_loss_epoch = [] validation_loss_epoch = [] while not minibatch.end(): # Construct feed dictionary feed_dict = minibatch.next_minibatch_feed_dict() feed_dict.update({placeholders['dropout']: self.dropout}) t = time.time() # Training step outs = sess.run([ merged, model.opt_op, model.loss, model.ranks, model.aff_all, model.mrr, model.outputs1 ], feed_dict=feed_dict) train_cost = outs[2] train_mrr = outs[5] train_loss_epoch.append(train_cost) if train_shadow_mrr is None: train_shadow_mrr = train_mrr else: train_shadow_mrr -= (1 - 0.99) * (train_shadow_mrr - train_mrr) if iter % self.validate_iter == 0: # Validation sess.run(val_adj_info.op) val_cost, ranks, val_mrr, duration = self._evaluate( sess, model, minibatch, size=self.validate_batch_size) sess.run(train_adj_info.op) epoch_val_costs[-1] += val_cost validation_loss_epoch.append(val_cost) if shadow_mrr is None: shadow_mrr = val_mrr else: shadow_mrr -= (1 - 0.99) * (shadow_mrr - val_mrr) # if total_steps % self.print_every == 0: # summary_writer.add_summary(outs[0], total_steps) # Print results avg_time = (avg_time * total_steps + time.time() - t) / (total_steps + 1) if total_steps % self.print_every == 0: print( "Iter: %04d" % iter, "train_loss={:.5f}".format(train_cost), "train_mrr={:.5f}".format(train_mrr), # exponential moving average "train_mrr_ema={:.5f}".format(train_shadow_mrr), "val_loss={:.5f}".format(val_cost), "val_mrr={:.5f}".format(val_mrr), # exponential moving average "val_mrr_ema={:.5f}".format(shadow_mrr), "time={:.5f}".format(avg_time)) iter += 1 total_steps += 1 if total_steps > self.max_total_steps: break # Keep track of train and validation losses per epoch train_losses.append(sum(train_loss_epoch) / len(train_loss_epoch)) validation_losses.append( sum(validation_loss_epoch) / len(validation_loss_epoch)) # Save embeddings if the 
epoch has the lowest validation loss # so far if self.save_embeddings and validation_losses[-1] == min( validation_losses): print( "Minimum validation loss so far ({}) at epoch {}.".format( validation_losses[-1], epoch)) sess.run(val_adj_info.op) self._save_embeddings(sess, model, minibatch, self.validate_batch_size, self._log_dir()) # Save model at each epoch print("Saving model at epoch {}.".format(epoch)) saver.save(sess, os.path.join(self._log_dir(), "model_epoch_" + str(epoch) + ".ckpt"), global_step=total_steps) if total_steps > self.max_total_steps: break print("Optimization finished!\n") training_time = timer.toc() self._plot_losses(train_losses, validation_losses) self._print_stats(train_losses, validation_losses, training_time)
def train(self):
    # Make the datasets iterable
    batch_size = 10000
    train_data_loader = torch.utils.data.DataLoader(
        dataset=self.training_data, batch_size=batch_size)
    validation_data_loader = torch.utils.data.DataLoader(
        dataset=self.validation_data, batch_size=batch_size)
    train_labels_loader = torch.utils.data.DataLoader(
        dataset=self.training_labels, batch_size=batch_size)
    validation_labels_loader = torch.utils.data.DataLoader(
        dataset=self.validation_labels, batch_size=batch_size)

    # Train the model
    timer = Timer()
    timer.tic()
    mean_train_losses = []
    mean_validation_losses = []

    for epoch in range(self.epochs):
        print("Epoch: {}".format(epoch + 1))
        train_losses = []
        validation_losses = []

        self.model.train()
        for i, (train_data, train_labels) in enumerate(
                zip(train_data_loader, train_labels_loader)):
            self.optimizer.zero_grad()
            outputs = self.model(train_data)
            loss = self.cross_entropy_loss(outputs.squeeze(), train_labels)
            loss.backward()
            self.optimizer.step()
            train_losses.append(loss.item())

        # Compute validation loss
        self.model.eval()
        with torch.no_grad():
            for _, (val_data, val_labels) in enumerate(
                    zip(validation_data_loader, validation_labels_loader)):
                val_pred = self.model(val_data)
                val_loss = self.cross_entropy_loss(val_pred.squeeze(),
                                                   val_labels)
                validation_losses.append(val_loss.item())

        print("\tTrain loss: {}, validation loss: {}".format(
            np.mean(train_losses), np.mean(validation_losses)))
        mean_train_losses.append(np.mean(train_losses))
        mean_validation_losses.append(np.mean(validation_losses))

        # Keep the checkpoint with the lowest validation loss so far
        if mean_validation_losses[-1] == min(mean_validation_losses):
            print("\tSaving model...")
            torch.save(self.model.state_dict(), self.model_path)
            print("\tSaved.")

    print("Finished training.")
    training_time = timer.toc()
    self._plot_losses(mean_train_losses, mean_validation_losses)
    self._print_stats(mean_train_losses, mean_validation_losses,
                      training_time)
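A side note on the data pipeline: the loop above zips two parallel `DataLoader`s for features and labels. An equivalent, arguably more idiomatic setup wraps both in a single `TensorDataset`, so each batch yields a (features, labels) pair from one loader. A minimal sketch with toy tensors standing in for the real training tensors (assumed to have matching first dimensions):

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

# Toy stand-ins for the real feature/label tensors.
features = torch.randn(100, 8)
labels = torch.randint(0, 3, (100,))

loader = DataLoader(TensorDataset(features, labels), batch_size=32)
for batch_features, batch_labels in loader:
    pass  # same forward/backward step as in the training loop above
```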
def main(): parser = argparse.ArgumentParser( description='Arguments for GraphSAGE concatenated ' + 'classifier model evaluation.') parser.add_argument( "classifier_name", choices=["KNN", "MLP", "MultinomialLogisticRegression"], help="The name of the classifier.") parser.add_argument('embedding_type', choices=[ "AVG_L", "AVG_2L", "AVG_SUM_L4", "AVG_SUM_ALL", "MAX_2L", "CONC_AVG_MAX_2L", "CONC_AVG_MAX_SUM_L4", "SUM_L", "SUM_2L" ], help="Type of embedding.") parser.add_argument('model_checkpoint_citations', help='Name of the GraphSAGE model checkpoint ' + 'for the citations graph.') parser.add_argument('model_checkpoint_authors', help='Name of the GraphSAGE model checkpoint ' + 'for the authors graph.') parser.add_argument('train_prefix_citations', help='Name of the object file that stores the ' + 'citations training data.') parser.add_argument('train_prefix_authors', help='Name of the object file that stores the ' + 'authors training data.') parser.add_argument('model_name', choices=[ "graphsage_mean", "gcn", "graphsage_seq", "graphsage_maxpool", "graphsage_meanpool" ], help="Model names.") parser.add_argument('--model_size', choices=["small", "big"], default="small", help="Can be big or small; model specific def'ns") parser.add_argument('--learning_rate', type=float, default=0.00001, help='Initial learning rate.') parser.add_argument('--epochs', type=int, default=10, help='Number of epochs to train.') parser.add_argument('--dropout', type=float, default=0.0, help='Dropout rate (1 - keep probability).') parser.add_argument('--weight_decay', type=float, default=0.0, help='Weight for l2 loss on embedding matrix.') parser.add_argument('--max_degree', type=int, default=100, help='Maximum node degree.') parser.add_argument('--samples_1', type=int, default=25, help='Number of samples in layer 1.') parser.add_argument('--samples_2', type=int, default=10, help='Number of users samples in layer 2.') parser.add_argument('--dim_1', type=int, default=128, help='Size of output dim ' + '(final is 2x this, if using concat)') parser.add_argument('--dim_2', type=int, default=128, help='Size of output dim ' + '(final is 2x this, if using concat)') parser.add_argument('--random_context', action="store_false", default=True, help='Whether to use random context or direct ' + 'edges.') parser.add_argument('--neg_sample_size', type=int, default=20, help='Number of negative samples.') parser.add_argument('--batch_size', type=int, default=512, help='Minibatch size.') parser.add_argument('--identity_dim', type=int, default=0, help='Set to positive value to use identity ' + 'embedding features of that dimension.') parser.add_argument('--save_embeddings', action="store_true", default=False, help='Whether to save embeddings for all nodes ' + 'after training') parser.add_argument('--base_log_dir', default='../../../data/processed/graphsage/', help='Base directory for logging and saving ' + 'embeddings') parser.add_argument('--validate_iter', type=int, default=5000, help='How often to run a validation minibatch.') parser.add_argument('--validate_batch_size', type=int, default=256, help='How many nodes per validation sample.') parser.add_argument('--gpu', type=int, default=0, help='Which gpu to use.') parser.add_argument('--print_every', type=int, default=50, help='How often to print training info.') parser.add_argument('--max_total_steps', type=int, default=10**10, help='Maximum total number of iterations.') parser.add_argument('--log_device_placement', action="store_true", default=False, help='Whether to log device 
placement.') parser.add_argument('--recs', type=int, default=10, help='Number of recommendations.') args = parser.parse_args() print("Starting evaluation...") os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu) print("Using GPU {}.".format(str(args.gpu))) from GraphSAGEClassifierConcatEvaluation import GraphSAGEClassifierConcatEvaluation evaluation_model = GraphSAGEClassifierConcatEvaluation( args.classifier_name, args.embedding_type, args.model_name, args.model_size, args.learning_rate, args.gpu, args.recs) # Initialize GraphSAGE models graphsage_model_citations = UnsupervisedModel( args.train_prefix_citations, args.model_name, args.model_size, args.learning_rate, args.epochs, args.dropout, args.weight_decay, args.max_degree, args.samples_1, args.samples_2, args.dim_1, args.dim_2, args.random_context, args.neg_sample_size, args.batch_size, args.identity_dim, args.save_embeddings, args.base_log_dir, args.validate_iter, args.validate_batch_size, args.gpu, args.print_every, args.max_total_steps, args.log_device_placement) graphsage_model_authors = UnsupervisedModel( args.train_prefix_authors, args.model_name, args.model_size, args.learning_rate, args.epochs, args.dropout, args.weight_decay, args.max_degree, args.samples_1, args.samples_2, args.dim_1, args.dim_2, args.random_context, args.neg_sample_size, args.batch_size, args.identity_dim, args.save_embeddings, args.base_log_dir, args.validate_iter, args.validate_batch_size, args.gpu, args.print_every, args.max_total_steps, args.log_device_placement) # Train model if needed: if not evaluation_model._has_persistent_model(): print("Classifier not trained yet. Training now...") timer = Timer() timer.tic() evaluation_model.train(graphsage_model_citations, graphsage_model_authors) print("Training finished.") timer.toc() else: evaluation_model._load_model_classifier() # Load test data print("Loading test data...") query_test, query_test_authors, truth = evaluation_model.load_data() print("Loaded.") # Infer embeddings print("Inferring embeddings for citations graph.") queue_citations = mp.Queue() process_citations = mp.Process( target=evaluation_model.infer_embeddings, args=(query_test, None, "citations", graphsage_model_citations, args.model_checkpoint_citations, queue_citations)) process_citations.start() embeddings_citations = queue_citations.get() process_citations.join() process_citations.terminate() print("Inferring embeddings for authors graphs.") queue_authors = mp.Queue() process_authors = mp.Process(target=evaluation_model.infer_embeddings, args=(query_test, query_test_authors, "authors", graphsage_model_authors, args.model_checkpoint_authors, queue_authors)) process_authors.start() embeddings_authors = queue_authors.get() process_authors.join() process_authors.terminate() # Concatenate embeddings test_embeddings = np.concatenate( (embeddings_citations, embeddings_authors), axis=1) print("Computing predictions...") recommendation = evaluation_model.compute_predictions(test_embeddings) print("Predictions computed.") # Evaluate print("Evaluating...") evaluation = EvaluationContainer() evaluation.evaluate(recommendation, truth) print("Finished.")
class Processor(): def __init__(self, embedding_type, graph_type, threshold=2, gpu=0): os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu) self.embedding_type = embedding_type self.graph_type = graph_type self.threshold = threshold self.embeddings_parser = EmbeddingsParser(gpu) self.timer = Timer() self.path_persistent = os.path.join( os.path.dirname(os.path.realpath(__file__)), "..", "..", "..", "data", "interim", "graphsage", self.embedding_type, self.graph_type) if not os.path.isdir(self.path_persistent): os.mkdir(self.path_persistent) def training_data(self, num_walks=50): self.prefix = "train_val" self.timer.tic() print("Creating training files.") # Load training and validation data d_train = DataLoader() df_train = d_train.training_data_with_abstracts_citations().data d_val = DataLoader() df_validation = d_val.validation_data_with_abstracts_citations().data # Create and save graph self.G = nx.Graph() # Add nodes and edges print("Adding training nodes.") self._add_nodes(df_train, test=False, val=False) print("Adding training edges.") if self.graph_type == "citations" or self.graph_type == "authors": if self.graph_type == "authors": df_train = d_train.author_names().data self._add_edges(df_train) elif self.graph_type == "citations_authors_het_edges": # Adding heterogeneous edges # Add citation edges self._add_weighted_edges_citations(df_train) # Add author edges df_train = d_train.author_names().data self._add_weighted_edges_authors(df_train) else: raise KeyError("Graph type unknown.") print("Adding validation nodes.") self._add_nodes(df_validation, test=False, val=True) print("Adding validation edges.") if self.graph_type == "citations" or self.graph_type == "authors": if self.graph_type == "authors": df_validation = d_val.author_names().data self._add_edges(df_validation) elif self.graph_type == "citations_authors_het_edges": # Add citation edges self._add_weighted_edges_citations(df_validation) # Add author edges df_validation = d_val.author_names().data self._add_weighted_edges_authors(df_validation) else: raise KeyError("Graph type unknown.") if self.graph_type == "citations_authors_het_edges": # Remove edges with weight lower than threshold remove_edges = [(u, v) for u, v, e in self.G.edges(data=True) if e["weight"] < self.threshold] self.G.remove_edges_from(remove_edges) # Clear edge attributes for n1, n2, d in self.G.edges(data=True): d.clear() print("Edges in graph: {}.\n".format(self.G.number_of_edges())) print("Removing nodes without features.") for node in list(self.G.nodes()): if "feature" not in self.G.nodes[node].keys(): self.G.remove_node(node) print("Nodes in graph: {}, edges in graph: {}.\n".format( self.G.number_of_nodes(), self.G.number_of_edges())) print("Saving graph to disk.") G_data = json_graph.node_link_data(self.G) with open(os.path.join(self.path_persistent, self.prefix + "-G.json"), "w") as f: f.write(json.dumps(G_data)) # Create and save class map self.label_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False, dtype=np.int) data = df_train.append(df_validation, ignore_index=True) labels = data.conferenceseries.unique() labels = labels.reshape(-1, 1) self.label_encoder.fit(labels) self._create_class_map(data) # Create and save id map self._create_id_map() # Create and save features self._create_features() # Perform and save random walks nodes = [ n for n in list(self.G.nodes()) if not self.G.node[n]["val"] and not self.G.node[n]["test"] ] subgraph = self.G.subgraph(nodes) self._run_random_walks(subgraph, 
nodes, num_walks) print("Finished creating training files.") self.timer.toc() # print some statistics self._get_stats() # Plot degree histogram self._degree_histogram() def test_data(self, df_test, G_train, authors_df=None, class_map=None, normalize=True): # TO DO: Add case for authors self.prefix = "test" print("Preprocessing data...") self.G = G_train print("Training graph has {} nodes and {} edges.\n".format( self.G.number_of_nodes(), self.G.number_of_edges())) # Add nodes and edges print("Adding test nodes.") self._add_nodes(df_test, test=True, val=False) print("Adding test edges.") if self.graph_type == "citations" or self.graph_type == "authors": if self.graph_type == "authors": if authors_df is not None: df_test = pd.merge(df_test, authors_df, how="left", on=["chapter", "chapter"]) else: raise ValueError("Chapter authors are missing.") self._add_edges(df_test) elif self.graph_type == "citations_authors_het_edges": # Adding heterogeneous edges # Add citation edges self._add_weighted_edges_citations(df_test) # Add author edges if authors_df is not None: df_test = pd.merge(df_test, authors_df, how="left", on=["chapter", "chapter"]) else: raise ValueError("Chapter authors are missing.") self._add_weighted_edges_authors(df_test) # Remove edges with weight lower than threshold remove_edges = [ (u, v) for u, v, e in self.G.edges(data=True) if "weight" in e.keys() and e["weight"] < self.threshold ] self.G.remove_edges_from(remove_edges) # Clear edge attributes for n1, n2, d in self.G.edges(data=True): d.clear() print("Edges in graph: {}.\n".format(self.G.number_of_edges())) else: raise KeyError("Graph type unknown.") print("Removing nodes without features.") for node in list(self.G.nodes()): if "feature" not in self.G.nodes[node].keys(): self.G.remove_node(node) print("Nodes in graph: {}, edges in graph: {}.\n".format( self.G.number_of_nodes(), self.G.number_of_edges())) # Remove all nodes that do not have val/test annotations broken_count = 0 for node in self.G.nodes(): if 'val' not in self.G.node[node] or 'test' not in self.G.node[ node]: self.G.remove_node(node) broken_count += 1 print( "Removed {} nodes that lacked proper annotations due to networkx versioning issues." 
.format(broken_count)) # Make sure the graph has edge train_removed annotations for edge in self.G.edges(): if (self.G.node[edge[0]]['val'] or self.G.node[edge[1]]['val'] or self.G.node[edge[0]]['test'] or self.G.node[edge[1]]['test']): self.G[edge[0]][edge[1]]['train_removed'] = True else: self.G[edge[0]][edge[1]]['train_removed'] = False # Create and process id map id_map = self._create_id_map() if isinstance(list(self.G.nodes)[0], int): conversion = lambda n: int(n) else: conversion = lambda n: n id_map = {conversion(k): int(v) for k, v in id_map.items()} # Create and process features features = self._create_features() if normalize: train_ids = np.array([ id_map[n] for n in self.G.nodes() if not self.G.node[n]['val'] and not self.G.node[n]['test'] ]) train_feats = features[train_ids] scaler = StandardScaler() scaler.fit(train_feats) features = scaler.transform(features) print("Finished preprocessing data.") # print some statistics self._get_stats() # Plot degree histogram self._degree_histogram() # Add "fake" temporary classes for test nodes in class map if class_map is not None: test_nodes = [n for n in self.G.nodes() if self.G.node[n]['test']] for test_node in test_nodes: class_map[test_node] = np.zeros( (len(class_map[list(class_map.keys())[0]]), ), dtype=int) return self.G, features, id_map, class_map return self.G, features, id_map def _add_nodes(self, data, test=False, val=False): with tqdm(desc="Adding nodes: ", total=len(data), unit="node") as pbar: for idx in range(len(data)): self.G.add_node( data.chapter.iloc[idx], test=test, feature=np.concatenate( (self.embeddings_parser.embed_sequence( data.chapter_title.iloc[idx], self.embedding_type), self.embeddings_parser.embed_sequence( data.chapter_abstract.iloc[idx], self.embedding_type)), axis=0).tolist(), val=val) pbar.update(1) print("Nodes in graph: {}.\n".format(self.G.number_of_nodes())) def _add_edges(self, data): if self.graph_type == "citations": self._add_edges_citations(data) elif self.graph_type == "authors": self._add_edges_authors(data) else: raise KeyError("Graph type unknown.") def _add_edges_citations(self, data): """Adds edges between papers that share a citation. """ with tqdm(desc="Adding edges: ", total=len(data)) as pbar: for idx in range(len(data)): self.G.add_edges_from([ (data.chapter.iloc[idx], data.chapter_citations.iloc[idx][i]) for i in range(len(data.chapter_citations.iloc[idx])) ]) pbar.update(1) print("Edges in graph: {}.\n".format(self.G.number_of_edges())) def _add_weighted_edges_citations(self, data): """Adds edges between papers that share a citation. """ with tqdm(desc="Adding edges: ", total=len(data)) as pbar: for idx in range(len(data)): self.G.add_edges_from( [(data.chapter.iloc[idx], data.chapter_citations.iloc[idx][i]) for i in range(len(data.chapter_citations.iloc[idx]))], weight=100) pbar.update(1) print("Edges in graph: {}.\n".format(self.G.number_of_edges())) def _add_edges_authors(self, data): """Adds edges between papers sharing an author. """ data_grouped = data.groupby("author_name")["chapter"].agg( list).reset_index() with tqdm(desc="Adding edges: ", total=len(data_grouped)) as pbar: for idx in range(len(data_grouped)): self.G.add_edges_from( combinations(data_grouped.iloc[idx].chapter, 2)) pbar.update(1) print("Edges in graph: {}.\n".format(self.G.number_of_edges())) def _add_weighted_edges_authors(self, data): """Adds edges between papers sharing an author. 
""" data_grouped = data.groupby("author_name")["chapter"].agg( list).reset_index() with tqdm(desc="Adding edges: ", total=len(data_grouped)) as pbar: for idx in range(len(data_grouped)): edges = combinations(data_grouped.iloc[idx].chapter, 2) for edge in edges: if self.G.has_edge(edge[0], edge[1]): self.G[edge[0]][edge[1]]["weight"] += 1 else: self.G.add_edge(edge[0], edge[1], weight=1) pbar.update(1) print("Edges in graph: {}.\n".format(self.G.number_of_edges())) def _create_class_map(self, data): print("Creating class map.") nodes = list(self.G.nodes) class_map = { nodes[i]: [ int(j) for j in list( self.label_encoder.transform( np.array(data[data.chapter == nodes[i]]. conferenceseries).reshape(-1, 1))[0]) ] for i in range(len(nodes)) } print("Saving class map to disk.") with open( os.path.join(self.path_persistent, self.prefix + "-class_map.json"), "w") as f: f.write(json.dumps(class_map)) with open(os.path.join(self.path_persistent, "label_encoder.pkl"), "wb") as f: pickle.dump(self.label_encoder, f) def _create_id_map(self): if self.prefix == "train_val": print("Creating id map.") nodes = list(self.G.nodes) id_map = {nodes[i]: i for i in range(len(nodes))} if self.prefix == "test": return id_map else: print("Saving id map to disk.") with open( os.path.join(self.path_persistent, self.prefix + "-id_map.json"), "w") as f: f.write(json.dumps(id_map)) def _create_features(self): if self.prefix == "train_val": print("Creating features.") features = np.array( [self.G.nodes[node]["feature"] for node in list(self.G.nodes)]) if self.prefix == "test": return features else: print("Saving features to disk.") np.save( os.path.join(self.path_persistent, self.prefix + "-feats.npy"), features) def _run_random_walks(self, graph, nodes, num_walks): print("Running random walks.") walks = run_random_walks(graph, nodes, num_walks=num_walks) print("Saving random walks to disk.") with open( os.path.join(self.path_persistent, self.prefix + "-walks.txt"), "w") as fp: fp.write("\n".join([str(w[0]) + "\t" + str(w[1]) for w in walks])) def _get_stats(self): degree_sequence = sorted([d for n, d in self.G.degree()], reverse=True) degree_count = Counter(degree_sequence) with open( os.path.join(self.path_persistent, self.prefix + "-stats.txt"), "w") as fp: self._print( "Number of nodes in the graph: {}\n".format( self.G.number_of_nodes()), fp) self._print( "Number of edges in the graph: {}\n".format( self.G.number_of_edges()), fp) self._print( "The graph is connected: {}\n".format(nx.is_connected(self.G)), fp) self._print( "Number of connected components: {}\n".format( nx.number_connected_components(self.G)), fp) self._print( "Number of self-loops: {}\n".format( nx.number_of_selfloops(self.G)), fp) self._print("Maximum degree: {}\n".format(max(degree_count)), fp) self._print("Minimum degree: {}\n".format(min(degree_count)), fp) self._print( "Average degree: {}\n".format( sum(degree_sequence) / len(self.G)), fp) def _degree_histogram(self): # Plot degree histogram degree_sequence = sorted([d for n, d in self.G.degree()], reverse=True) degree_count = Counter(degree_sequence) deg, cnt = zip(*degree_count.items()) fig, ax = plt.subplots() plt.bar(deg, cnt, width=0.80, color='b') plt.title("Degree Histogram") plt.ylabel("Count") plt.xlabel("Degree") ax.set_xticks([d + 0.4 for d in deg]) ax.set_xticklabels(deg) plt.savefig(os.path.join(self.path_persistent, self.prefix + "-degree_histogram.png"), bbox_inches="tight") def _print(self, text, f): print(text) f.write(text) def main(): parser = argparse.ArgumentParser( 
description='Arguments for data preprocessing.') parser.add_argument('embedding_type', choices=[ "AVG_L", "AVG_2L", "AVG_SUM_L4", "AVG_SUM_ALL", "MAX_2L", "CONC_AVG_MAX_2L", "CONC_AVG_MAX_SUM_L4", "SUM_L", "SUM_2L" ], help="Type of embedding.") parser.add_argument('dataset', help='Name of the object file that stores the ' + 'training data.') parser.add_argument('--threshold', type=int, default=2, help='Threshold for edge weights in ' + 'heterogeneous graph.') parser.add_argument('--gpu', type=int, default=0, help='Which gpu to use.') args = parser.parse_args() print("Starting...") from preprocess_data import Processor processor = Processor(args.embedding_type, args.dataset, args.threshold, args.gpu) processor.training_data() print("Finished.") if __name__ == "__main__": main()
class Processor: def __init__(self, embedding_type, dataset, gpu=0): os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu) self.embedding_type = embedding_type self.dataset = dataset self.embeddings_parser = EmbeddingsParser(gpu) self.timer = Timer() self.path_persistent = os.path.join( os.path.dirname(os.path.realpath(__file__)), "..", "..", "..", "data", "interim", "gat", self.embedding_type, self.dataset) if not os.path.exists(self.path_persistent): os.makedirs(self.path_persistent) def training_data(self): self.timer.tic() print("Creating training files.\n") # Load training and validation data d_train = DataLoader() df_train = d_train.training_data_with_abstracts_citations().data d_val = DataLoader() df_validation = d_val.validation_data_with_abstracts_citations().data train_val_data = pd.concat((df_train, df_validation), axis=0).reset_index(drop=True) # Create file with feature vectors for both training and validation # data (as a scipy.sparse.csr.csr_matrix object) print("Creating feature vectors for training and validation data.") train_val_features = self._create_features(train_val_data) print("Created.") print("Saving to disk...") allx_file = os.path.join(self.path_persistent, "ind." + self.dataset + ".allx") with open(allx_file, "wb") as f: pickle.dump(train_val_features, f) print("Saved.\n") # Create file with feature vectors only for training data # (as a scipy.sparse.csr.csr_matrix object) print("Creating feature vectors for training data.") train_features = train_val_features[:len(df_train)] print("Created.") print("Saving to disk...") x_file = os.path.join(self.path_persistent, "ind." + self.dataset + ".x") with open(x_file, "wb") as f: pickle.dump(train_features, f) print("Saved.\n") # Create file with the labels for the training and validation data # (as a numpy.ndarray object) print("Creating labels for training and validation data.") self._train_label_encoder(train_val_data) train_val_labels = self.label_encoder.transform( np.array(train_val_data.conferenceseries).reshape(-1, 1)) print("Created") print("Saving to disk...") ally_file = os.path.join(self.path_persistent, "ind." + self.dataset + ".ally") with open(ally_file, "wb") as f: pickle.dump(train_val_labels, f) print("Saved.\n") # Create file with the labels for the training data # (as a numpy.ndarray object) print("Creating labels for training data.") train_labels = train_val_labels[:len(df_train)] print("Created.") print("Saving to disk...") y_file = os.path.join(self.path_persistent, "ind." + self.dataset + ".y") with open(y_file, "wb") as f: pickle.dump(train_labels, f) print("Saved.\n") # Create a dict in the format {index: [index_of_neighbor_nodes]} # (as a collections.defaultdict object) print("Creating dictionary of neighbours.") graph = defaultdict(list) with tqdm(desc="Adding neighbours: ", total=len(train_val_data)) as pbar: for idx in range(len(train_val_data)): citations_indices = [ train_val_data[train_val_data.chapter == citation].index.tolist() for citation in train_val_data.chapter_citations.iloc[idx] ] neighbours = [c[0] for c in citations_indices if c] graph[idx].extend(neighbours) for node in neighbours: graph[node].append(idx) pbar.update(1) with tqdm(desc="Removing duplicates: ", total=len(graph.keys())) as pbar: for idx in range(len(graph.keys())): graph[idx] = list(set(graph[idx])) pbar.update(1) print("Saving to disk...") graph_file = os.path.join(self.path_persistent, "ind." 
+ self.dataset + ".graph") with open(graph_file, "wb") as f: pickle.dump(graph, f) print("Saved.\n") print("Statistics") print("\tTraining data features: {}.".format(train_features.shape)) print("\tTraining data labels: {}.".format(len(train_labels))) print("\tTraining and validation data features: {}.".format( train_val_features.shape)) print("\tTraining and validation data labels: {}.".format( len(train_val_labels))) print("\tGraph size: {}.".format(len(graph))) def _create_features(self, data): features = [] with tqdm(desc="Creating features: ", total=len(data)) as pbar: for idx in range(len(data)): features.append( np.concatenate((self.embeddings_parser.embed_sequence( data.chapter_title.iloc[idx], self.embedding_type), self.embeddings_parser.embed_sequence( data.chapter_abstract.iloc[idx], self.embedding_type)), axis=0).tolist()) pbar.update(1) return sp.csr.csr_matrix(np.array(features)) def _train_label_encoder(self, data): self.label_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False, dtype=np.int) labels = data.conferenceseries.unique() labels = labels.reshape(-1, 1) self.label_encoder.fit(labels) with open(os.path.join(self.path_persistent, "label_encoder.pkl"), "wb") as f: pickle.dump(self.label_encoder, f) def test_data(self, df_test, train_features, train_labels, train_val_features, train_val_labels, graph): print("Preprocessing data...") # Load training and validation data d_train = DataLoader() df_train = d_train.training_data_with_abstracts_citations().data d_val = DataLoader() df_validation = d_val.validation_data_with_abstracts_citations().data train_val_data = pd.concat((df_train, df_validation), axis=0).reset_index(drop=True) # Create the indices of test instances in graph (as a list object) test_indices = list(df_test.index) # Create "fake" temporary labels for test data test_labels = np.zeros((len(df_test), len(train_val_labels[0])), dtype=int) # Update graph with test data print("Updating graph information...") with tqdm(desc="Adding neighbours: ", total=len(df_test)) as pbar: for idx in list(df_test.index): citations_indices = [ train_val_data[train_val_data.chapter == citation].index.tolist() for citation in df_test.chapter_citations.loc[idx] ] neighbours = [c[0] for c in citations_indices if c] graph[idx].extend(neighbours) for node in neighbours: graph[node].append(idx) pbar.update(1) with tqdm(desc="Removing duplicates: ", total=len(graph.keys())) as pbar: for idx in range(len(graph.keys())): graph[idx] = list(set(graph[idx])) pbar.update(1) print("Updated.") # Create feature vectors of test instances print("Creating features for test data...") test_features = self._create_features(df_test) print("Created.") max_degree = len(max(graph.values(), key=len)) test_idx_range = np.sort(test_indices) features = sp.vstack((train_val_features, test_features)).tolil() features[test_indices, :] = features[test_idx_range, :] adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph)) labels = np.vstack((train_val_labels, test_labels)) labels[test_indices, :] = labels[test_idx_range, :] idx_test = test_idx_range.tolist() idx_train = range(len(train_labels)) idx_val = range(len(train_labels), len(train_val_labels)) train_mask = sample_mask(idx_train, labels.shape[0]) val_mask = sample_mask(idx_val, labels.shape[0]) test_mask = sample_mask(idx_test, labels.shape[0]) y_train = np.zeros(labels.shape) y_val = np.zeros(labels.shape) y_test = np.zeros(labels.shape) y_train[train_mask, :] = labels[train_mask, :] y_val[val_mask, :] = labels[val_mask, :] y_test[test_mask, :] = 
labels[test_mask, :] print("Finished preprocessing data.\n") print("Adjacency matrix shape: {}.".format(adj.shape)) print("Features matrix shape: {}.".format(features.shape)) print("Graph size: {}.".format(len(graph))) print("Max degree: {}.\n".format(max_degree)) dataset = [adj, features, y_train, y_test, train_mask, test_mask] prepared_test_data = self._prepare_test_data(dataset, max_degree) return prepared_test_data, max_degree def _prepare_test_data(self, dataset, max_degree): print("Preparing test data...") adj, features, y_train, y_test, train_mask, test_mask = dataset train_index = np.where(train_mask)[0] adj_train = adj[train_index, :][:, train_index] y_train = y_train[train_index] test_index = np.where(test_mask)[0] y_test = y_test[test_index] num_train = adj_train.shape[0] input_dim = features.shape[1] features = nontuple_preprocess_features(features).todense() train_features = features[train_index] norm_adj_train = nontuple_preprocess_adj(adj_train) norm_adj = nontuple_preprocess_adj(adj) adj_train, adj_val_train = compute_adjlist(norm_adj_train, max_degree) train_features = np.concatenate( (train_features, np.zeros((1, input_dim)))) print("Prepared.\n") return norm_adj, adj_train, adj_val_train, features, train_features, y_train, y_test, test_index def main(): parser = argparse.ArgumentParser( description='Arguments for data preprocessing.') parser.add_argument('embedding_type', choices=[ "AVG_L", "AVG_2L", "AVG_SUM_L4", "AVG_SUM_ALL", "MAX_2L", "CONC_AVG_MAX_2L", "CONC_AVG_MAX_SUM_L4", "SUM_L", "SUM_2L" ], help="Type of embedding.") parser.add_argument('dataset', help='Name of the object file that stores the ' + 'training data.') parser.add_argument('--gpu', type=int, default=0, help='Which gpu to use.') args = parser.parse_args() print("Starting...") from preprocess_data import Processor processor = Processor(args.embedding_type, args.dataset, args.gpu) processor.training_data() print("Finished.") if __name__ == "__main__": main()
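# A minimal sketch (not part of the original code) of how the pickled
# "ind.<dataset>.x / .y / .allx / .ally / .graph" files written by
# training_data() above could be read back. The directory layout and file
# names follow the Processor above; the path and dataset name used in the
# usage comment are illustrative placeholders only.
import os
import pickle

def load_training_files(path_persistent, dataset):
    loaded = {}
    for suffix in ("x", "y", "allx", "ally", "graph"):
        file_name = os.path.join(path_persistent, "ind." + dataset + "." + suffix)
        with open(file_name, "rb") as f:
            loaded[suffix] = pickle.load(f)
    return loaded

# e.g. data = load_training_files("data/interim/gat/AVG_L/citations", "citations")
#      data["allx"].shape, data["ally"].shape, len(data["graph"])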
class Processor: def __init__(self, embedding_type, dataset, graph_type="directed", threshold=2, gpu=0): os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu) self.embedding_type = embedding_type self.dataset = dataset self.graph_type = graph_type self.threshold = threshold self.embeddings_parser = EmbeddingsParser(gpu) self.timer = Timer() self.path_persistent = os.path.join( os.path.dirname(os.path.realpath(__file__)), "..", "..", "..", "data", "interim", "gat", self.embedding_type, self.dataset) if not os.path.exists(self.path_persistent): os.makedirs(self.path_persistent) def training_data(self): self.timer.tic() print("Creating training files.\n") # Load training and validation data d_train = DataLoader() df_train = d_train.training_data_with_abstracts_citations().data d_val = DataLoader() df_validation = d_val.validation_data_with_abstracts_citations().data train_val_data = pd.concat((df_train, df_validation), axis=0).reset_index(drop=True) # Create file with feature vectors for both training and validation # data (as a scipy.sparse.csr.csr_matrix object) print("Creating feature vectors for training and validation data.") train_val_features = self._create_features(train_val_data) print("Created.") print("Saving to disk...") allx_file = os.path.join(self.path_persistent, "ind." + self.dataset + ".allx") with open(allx_file, "wb") as f: pickle.dump(train_val_features, f) print("Saved.\n") # Create file with feature vectors only for training data # (as a scipy.sparse.csr.csr_matrix object) print("Creating feature vectors for training data.") train_features = train_val_features[:len(df_train)] print("Created.") print("Saving to disk...") x_file = os.path.join(self.path_persistent, "ind." + self.dataset + ".x") with open(x_file, "wb") as f: pickle.dump(train_features, f) print("Saved.\n") # Create file with the labels for the training and validation data # (as a numpy.ndarray object) print("Creating labels for training and validation data.") self._train_label_encoder(train_val_data) train_val_labels = self.label_encoder.transform( np.array(train_val_data.conferenceseries).reshape(-1, 1)) print("Created") print("Saving to disk...") ally_file = os.path.join(self.path_persistent, "ind." + self.dataset + ".ally") with open(ally_file, "wb") as f: pickle.dump(train_val_labels, f) print("Saved.\n") # Create file with the labels for the training data # (as a numpy.ndarray object) print("Creating labels for training data.") train_labels = train_val_labels[:len(df_train)] print("Created.") print("Saving to disk...") y_file = os.path.join(self.path_persistent, "ind." + self.dataset + ".y") with open(y_file, "wb") as f: pickle.dump(train_labels, f) print("Saved.\n") # Create a dict in the format {index: [index_of_neighbor_nodes]} # (as a collections.defaultdict object) if self.dataset == "citations": if self.graph_type == "directed": graph = self._create_directed_graph(train_val_data) else: graph = self._create_undirected_graph(train_val_data) if self.dataset == "citations_authors_het_edges": df_train_authors = d_train.author_names().data df_val_authors = d_val.author_names().data train_val_authors_data = pd.concat( (df_train_authors, df_val_authors), axis=0).reset_index(drop=True) data_authors = train_val_authors_data.groupby( "author_name")["chapter"].agg(list).reset_index() if self.graph_type == "directed": graph = self._create_heterogeneous_directed_graph( train_val_data, data_authors) else: raise ValueError("Graph type incompatible. 
Only directed " + "graph is supported.") print("Finished creating training files.\n") print("Statistics") print("\tTraining data features: {}.".format(train_features.shape)) print("\tTraining data labels: {}.".format(len(train_labels))) print("\tTraining and validation data features: {}.".format( train_val_features.shape)) print("\tTraining and validation data labels: {}.".format( len(train_val_labels))) print("\tGraph size: {}.".format(len(graph))) print("\tMax node degree: {}.".format(len(max(graph.values(), key=len)))) def _create_directed_graph(self, train_val_data): print("Creating dictionary of neighbours.") graph = defaultdict(list) with tqdm(desc="Adding neighbours: ", total=len(train_val_data)) as pbar: for idx in range(len(train_val_data)): citations_indices = [ train_val_data[train_val_data.chapter == citation].index.tolist() for citation in train_val_data.chapter_citations.iloc[idx] ] graph[idx] = list(set([i[0] for i in citations_indices if i])) pbar.update(1) print("Created.") print("Saving to disk...") graph_file = os.path.join(self.path_persistent, "ind." + self.dataset + ".graph_directed") with open(graph_file, "wb") as f: pickle.dump(graph, f) print("Saved.\n") return graph def _create_heterogeneous_directed_graph(self, train_val_data, data_authors): print("Creating dictionary of neighbours.") graph = defaultdict(list) # Add citation edges between papers with tqdm(desc="Adding citation neighbours: ", total=len(train_val_data)) as pbar: for idx in range(len(train_val_data)): citations_indices = [ train_val_data[train_val_data.chapter == citation].index.tolist() for citation in train_val_data.chapter_citations.iloc[idx] ] graph[idx] = [(i[0], 100) for i in citations_indices if i] pbar.update(1) # Add edges between papers if they share an author with tqdm(desc="Adding author neighbours: ", total=len(data_authors)) as pbar: for idx in range(len(data_authors)): authors_indices = [ train_val_data[train_val_data.chapter == paper].index.tolist() for paper in data_authors.chapter.iloc[idx] ] authors_indices = [i[0] for i in authors_indices if i] edges = [i for i in combinations(authors_indices, 2)] for edge in edges: graph[edge[0]].append((edge[1], 1)) pbar.update(1) # Remove edges with weights below the threshold for key in graph.keys(): d = defaultdict(int) for x, y in graph[key]: d[x] += y graph[key] = [k for k, v in d.items() if v >= self.threshold] print("Created.") print("Saving to disk...") graph_file = os.path.join(self.path_persistent, "ind." + self.dataset + ".graph_directed") with open(graph_file, "wb") as f: pickle.dump(graph, f) print("Saved.\n") return graph def _create_undirected_graph(self, train_val_data): print("Creating dictionary of neighbours.") graph = defaultdict(list) with tqdm(desc="Adding neighbours: ", total=len(train_val_data)) as pbar: for idx in range(len(train_val_data)): citations_indices = [ train_val_data[train_val_data.chapter == citation].index.tolist() for citation in train_val_data.chapter_citations.iloc[idx] ] neighbours = [c[0] for c in citations_indices if c] graph[idx].extend(neighbours) for node in neighbours: graph[node].append(idx) pbar.update(1) with tqdm(desc="Removing duplicates: ", total=len(graph.keys())) as pbar: for idx in range(len(graph.keys())): graph[idx] = list(set(graph[idx])) pbar.update(1) print("Created.") print("Saving to disk...") graph_file = os.path.join(self.path_persistent, "ind."
+ self.dataset + ".graph") with open(graph_file, "wb") as f: pickle.dump(graph, f) print("Saved.\n") return graph def _create_features(self, data): features = [] with tqdm(desc="Creating features: ", total=len(data)) as pbar: for idx in range(len(data)): features.append( np.concatenate((self.embeddings_parser.embed_sequence( data.chapter_title.iloc[idx], self.embedding_type), self.embeddings_parser.embed_sequence( data.chapter_abstract.iloc[idx], self.embedding_type)), axis=0).tolist()) pbar.update(1) return sp.csr.csr_matrix(np.array(features)) def _train_label_encoder(self, data): self.label_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False, dtype=np.int) labels = data.conferenceseries.unique() labels = labels.reshape(-1, 1) self.label_encoder.fit(labels) with open(os.path.join(self.path_persistent, "label_encoder.pkl"), "wb") as f: pickle.dump(self.label_encoder, f) def _update_directed_graph(self, graph, train_val_data, df_test): with tqdm(desc="Adding neighbours: ", total=len(df_test)) as pbar: for idx in list(df_test.index): citations_indices = [ train_val_data[train_val_data.chapter == citation].index.tolist() for citation in df_test.chapter_citations.loc[idx] ] graph[idx] = list(set([i[0] for i in citations_indices if i])) pbar.update(1) return graph def _update_heterogeneous_directed_graph(self, graph, train_val_data, df_test, data_authors): with tqdm(desc="Adding citation neighbours: ", total=len(df_test)) as pbar: for idx in list(df_test.index): citations_indices = [ train_val_data[train_val_data.chapter == citation].index.tolist() for citation in df_test.chapter_citations.loc[idx] ] graph[idx] = [(i[0], 100) for i in citations_indices if i] pbar.update(1) with tqdm(desc="Adding author neighbours: ", total=len(data_authors)) as pbar: for idx in range(len(data_authors)): authors_indices = [ train_val_data[train_val_data.chapter == paper].index.tolist() for paper in data_authors.chapter.iloc[idx] ] authors_indices = [i[0] for i in authors_indices if i] edges = [i for i in combinations(authors_indices, 2)] for edge in edges: graph[edge[0]].append((edge[1], 1)) pbar.update(1) for key in graph.keys(): d = defaultdict(int) for e in reversed(graph[key]): if type(e) is tuple: if e[0] in d.keys(): d[e[0]] += e[1] else: d[e[0]] = e[1] graph[key].remove(e) graph[key].extend([k for k, v in d.items() if v >= self.threshold]) return graph def _update_undirected_graph(self, graph, train_val_data, df_test): with tqdm(desc="Adding neighbours: ", total=len(df_test)) as pbar: for idx in list(df_test.index): citations_indices = [ train_val_data[train_val_data.chapter == citation].index.tolist() for citation in df_test.chapter_citations.loc[idx] ] neighbours = [c[0] for c in citations_indices if c] graph[idx].extend(neighbours) for node in neighbours: graph[node].append(idx) pbar.update(1) with tqdm(desc="Removing duplicates: ", total=len(graph.keys())) as pbar: for idx in range(len(graph.keys())): graph[idx] = list(set(graph[idx])) pbar.update(1) return graph def test_data(self, df_test, train_features, train_labels, train_val_features, train_val_labels, graph, authors_df=None): print("Preprocessing data...") # Load training and validation data d_train = DataLoader() df_train = d_train.training_data_with_abstracts_citations().data d_val = DataLoader() df_validation = d_val.validation_data_with_abstracts_citations().data train_val_data = pd.concat((df_train, df_validation), axis=0).reset_index(drop=True) # Create the indices of test instances in graph (as a list object) test_indices = 
list(df_test.index) # Create "fake" temporary labels for test data test_labels = np.zeros((len(df_test), len(train_val_labels[0])), dtype=int) # Update graph with test data print("Updating graph information...") if self.dataset == "citations": if self.graph_type == "directed": graph = self._update_directed_graph(graph, train_val_data, df_test) else: graph = self._update_undirected_graph(graph, train_val_data, df_test) if self.dataset == "citations_authors_het_edges": data_authors = authors_df.groupby("author_name")["chapter"].agg( list).reset_index() if self.graph_type == "directed": graph = self._update_heterogeneous_directed_graph( graph, train_val_data, df_test, data_authors) else: raise ValueError("Graph type incompatible. Only directed " + "graph is supported.") print("Updated.") # Create feature vectors of test instances print("Creating features for test data...") test_features = self._create_features(df_test) print("Created.") test_idx_range = np.sort(test_indices) features = sp.vstack((train_val_features, test_features)).tolil() features[test_indices, :] = features[test_idx_range, :] adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph)) labels = np.vstack((train_val_labels, test_labels)) labels[test_indices, :] = labels[test_idx_range, :] idx_test = test_idx_range.tolist() idx_train = range(len(train_labels)) idx_val = range(len(train_labels), len(train_val_labels)) train_mask = sample_mask(idx_train, labels.shape[0]) val_mask = sample_mask(idx_val, labels.shape[0]) test_mask = sample_mask(idx_test, labels.shape[0]) y_train = np.zeros(labels.shape) y_val = np.zeros(labels.shape) y_test = np.zeros(labels.shape) y_train[train_mask, :] = labels[train_mask, :] y_val[val_mask, :] = labels[val_mask, :] y_test[test_mask, :] = labels[test_mask, :] print("Finished preprocessing data.") print("Adjacency matrix shape: {}.".format(adj.shape)) print("Features matrix shape: {}.".format(features.shape)) print("Graph size: {}.".format(len(graph))) return adj, features, y_train, y_test, train_mask, test_mask def main(): parser = argparse.ArgumentParser( description='Arguments for data preprocessing.') parser.add_argument('embedding_type', choices=[ "AVG_L", "AVG_2L", "AVG_SUM_L4", "AVG_SUM_ALL", "MAX_2L", "CONC_AVG_MAX_2L", "CONC_AVG_MAX_SUM_L4", "SUM_L", "SUM_2L" ], help="Type of embedding.") parser.add_argument('dataset', help='Name of the object file that stores the ' + 'training data.') parser.add_argument('--graph_type', choices=["directed", "undirected"], default="directed", help='The type of graph used ' + '(directed vs. undirected).') parser.add_argument('--threshold', type=int, default=2, help='Threshold for edge weights in ' + 'heterogeneous graph.') parser.add_argument('--gpu', type=int, default=0, help='Which gpu to use.') args = parser.parse_args() print("Starting...") from preprocess_data import Processor processor = Processor(args.embedding_type, args.dataset, args.graph_type, args.threshold, args.gpu) processor.training_data() print("Finished.") if __name__ == "__main__": main()
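# A self-contained sketch (toy data, not from the project) of the edge-weight
# accumulation and thresholding used by _create_heterogeneous_directed_graph()
# and _update_heterogeneous_directed_graph() above: citation edges carry weight
# 100, shared-author edges weight 1, and neighbours whose summed weight stays
# below the threshold are dropped.
from collections import defaultdict

graph = defaultdict(list)
graph[0] = [(1, 100), (2, 1), (2, 1), (3, 1)]  # one citation edge, repeated and single author edges
threshold = 2

for key in graph.keys():
    summed = defaultdict(int)
    for neighbour, weight in graph[key]:
        summed[neighbour] += weight
    graph[key] = [n for n, w in summed.items() if w >= threshold]

print(dict(graph))  # {0: [1, 2]} -- the single author edge to node 3 is pruned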
class Processor: def __init__(self, embedding_type, gpu=0): os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu) self.embedding_type = embedding_type self.embeddings_parser = EmbeddingsParser(gpu) self.timer = Timer() self.path_persistent = os.path.join( os.path.dirname(os.path.realpath(__file__)), "..", "..", "..", "data", "interim", "han", self.embedding_type) if not os.path.exists(self.path_persistent): os.makedirs(self.path_persistent) def training_data(self): self.timer.tic() print("Creating training files.\n") # Load training and validation data d_train = DataLoader() df_train = d_train.training_data_with_abstracts_citations().data d_val = DataLoader() df_validation = d_val.validation_data_with_abstracts_citations().data train_val_data = pd.concat((df_train, df_validation), axis=0).reset_index(drop=True) print("Creating index files for training and validation data.") train_idx = np.asarray(list(train_val_data.index))[:len(df_train)] train_idx = np.asarray([train_idx]) val_idx = np.asarray(list(train_val_data.index))[len(df_train):] val_idx = np.asarray([val_idx]) print("Created.") print("Saving to disk...") train_idx_file = os.path.join(self.path_persistent, "train_idx.pkl") val_idx_file = os.path.join(self.path_persistent, "val_idx.pkl") with open(train_idx_file, "wb") as f: pickle.dump(train_idx, f) with open(val_idx_file, "wb") as f: pickle.dump(val_idx, f) print("Saved.") print("Creating labels for training and validation data.") self._train_label_encoder(train_val_data) train_val_labels = self.label_encoder.transform( np.array(train_val_data.conferenceseries).reshape(-1, 1)) print("Created") print("Saving to disk...") labels_file = os.path.join(self.path_persistent, "labels.pkl") with open(labels_file, "wb") as f: pickle.dump(train_val_labels, f) print("Saved.\n") print("Creating feature vectors for training and validation data.") train_val_features = self._create_features(train_val_data) print("Created.") print("Saving to disk...") features_file = os.path.join(self.path_persistent, "features.pkl") with open(features_file, "wb") as f: pickle.dump(train_val_features, f) print("Saved.\n") df_train_authors = d_train.author_names().data df_val_authors = d_val.author_names().data train_val_authors_data = pd.concat((df_train_authors, df_val_authors), axis=0).reset_index(drop=True) data_authors = train_val_authors_data.groupby( "author_name")["chapter"].agg(list).reset_index() print("Creating adjacency matrices...") PCP = self._create_PCP_adjacency(train_val_data) PAP = self._create_PAP_adjacency(train_val_data, data_authors) print("Created.") print("Finished creating training files.\n") print("Statistics") print("\tTraining and validation data features: {}.".format( train_val_features.shape)) print("\tTraining and validation data labels: {}.".format( train_val_labels.shape)) print("\tPCP graph size: {}.".format(len(PCP))) print("\tMax node degree: {}.".format(len(max(PCP.values(), key=len)))) print("\tPAP graph size: {}.".format(len(PAP))) print("\tMax node degree: {}.".format(len(max(PAP.values(), key=len)))) def test_data(self, df_test, authors_df, train_idx, features, labels, PCP, PAP): print("Preprocessing data...") # Load training and validation data d_train = DataLoader() df_train = d_train.training_data_with_abstracts_citations().data d_val = DataLoader() df_validation = d_val.validation_data_with_abstracts_citations().data train_val_data = pd.concat((df_train, df_validation), axis=0).reset_index(drop=True) data_authors = 
authors_df.groupby("author_name")["chapter"].agg( list).reset_index() # Create the indices of test instances in graph (as a list object) test_idx = np.asarray(list(df_test.index)) test_idx = np.asarray([test_idx]) # Create "fake" temporary labels for test data test_labels = np.zeros((len(df_test), len(labels[0])), dtype=int) labels = np.vstack((labels, test_labels)) train_mask = sample_mask(train_idx, labels.shape[0]) test_mask = sample_mask(test_idx, labels.shape[0]) y_train = np.zeros(labels.shape) y_test = np.zeros(labels.shape) y_train[train_mask, :] = labels[train_mask, :] y_test[test_mask, :] = labels[test_mask, :] # Update graph with test data print("Updating graph information...") PCP_graph = self._update_PCP_adjacency(PCP, train_val_data, df_test) PAP_graph = self._update_PAP_adjacency(PAP, train_val_data, df_test, data_authors) print("Updated.") PAP = nx.adjacency_matrix(nx.from_dict_of_lists(PAP_graph)) PCP = nx.adjacency_matrix(nx.from_dict_of_lists(PCP_graph)) row_networks = [PCP, PAP] print("PCP: {}; PAP: {}".format(PCP.shape, PAP.shape)) # Create feature vectors of test instances print("Creating features for test data...") test_features = self._create_features(df_test) features = np.vstack((features, test_features)) print("Features: {}".format(features.shape)) print("Created.") print("Finished preprocessing data.") print("y_train: {}, y_test: {}, train_idx: {}, test_idx: {}".format( y_train.shape, y_test.shape, train_idx.shape, test_idx.shape)) features_list = [features, features, features] return row_networks, features_list, y_train, y_test, train_mask, test_mask def _create_PCP_adjacency(self, data): print("Creating paper-citation-paper adjacency lists.") graph = defaultdict(list) with tqdm(desc="Adding neighbours: ", total=len(data)) as pbar: for idx in range(len(data)): citations_indices = [ data[data.chapter == citation].index.tolist() for citation in data.chapter_citations.iloc[idx] ] graph[idx] = list(set([i[0] for i in citations_indices if i])) pbar.update(1) print("Created.") print("Saving to disk...") graph_file = os.path.join(self.path_persistent, "PCP.pkl") with open(graph_file, "wb") as f: pickle.dump(graph, f) print("Saved.\n") return graph def _update_PCP_adjacency(self, graph, data, df_test): print("Updating paper-citation-paper adjacency lists.") with tqdm(desc="Adding neighbours: ", total=len(df_test)) as pbar: for idx in list(df_test.index): citations_indices = [ data[data.chapter == citation].index.tolist() for citation in df_test.chapter_citations.loc[idx] ] graph[idx] = list(set([i[0] for i in citations_indices if i])) pbar.update(1) print("Updated.") return graph def _create_PAP_adjacency(self, data, data_authors): print("Creating paper-author-paper adjacency lists.") graph = defaultdict() for idx in data.index: graph[idx] = [] # Add edges between papers if they share an author with tqdm(desc="Adding neighbours: ", total=len(data_authors)) as pbar: for idx in range(len(data_authors)): authors_indices = [ data[data.chapter == paper].index.tolist() for paper in data_authors.chapter.iloc[idx] ] authors_indices = [i[0] for i in authors_indices if i] edges = [i for i in combinations(authors_indices, 2)] for edge in edges: graph[edge[0]].append(edge[1]) pbar.update(1) print("Created.") print("Saving to disk...") graph_file = os.path.join(self.path_persistent, "PAP.pkl") with open(graph_file, "wb") as f: pickle.dump(graph, f) print("Saved.\n") return graph def _update_PAP_adjacency(self, graph, data, df_test, data_authors): print("Updating paper-author-paper 
adjacency lists.") for idx in df_test.index: graph[idx] = [] with tqdm(desc="Adding neighbours: ", total=len(data_authors)) as pbar: for idx in range(len(data_authors)): authors_indices = [ data[data.chapter == paper].index.tolist() for paper in data_authors.chapter.iloc[idx] ] authors_indices = [i[0] for i in authors_indices if i] edges = [i for i in combinations(authors_indices, 2)] for edge in edges: graph[edge[0]].append(edge[1]) pbar.update(1) print("Updated.") return graph def _create_features(self, data): features = [] with tqdm(desc="Creating features: ", total=len(data)) as pbar: for idx in range(len(data)): features.append( np.concatenate((self.embeddings_parser.embed_sequence( data.chapter_title.iloc[idx], self.embedding_type), self.embeddings_parser.embed_sequence( data.chapter_abstract.iloc[idx], self.embedding_type)), axis=0).tolist()) pbar.update(1) return np.asarray(features) def _train_label_encoder(self, data): self.label_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False, dtype=np.int) labels = data.conferenceseries.unique() labels = labels.reshape(-1, 1) self.label_encoder.fit(labels) with open(os.path.join(self.path_persistent, "label_encoder.pkl"), "wb") as f: pickle.dump(self.label_encoder, f) def main(): parser = argparse.ArgumentParser( description='Arguments for data preprocessing.') parser.add_argument('embedding_type', choices=[ "AVG_L", "AVG_2L", "AVG_SUM_L4", "AVG_SUM_ALL", "MAX_2L", "CONC_AVG_MAX_2L", "CONC_AVG_MAX_SUM_L4", "SUM_L", "SUM_2L" ], help="Type of embedding.") parser.add_argument('--gpu', type=int, default=0, help='Which gpu to use.') args = parser.parse_args() print("Starting...") from preprocess_data import Processor processor = Processor(args.embedding_type, args.gpu) processor.training_data() print("Finished.") if __name__ == "__main__": main()
def predict(self, test_data, model_checkpoint, gpu_mem_fraction=None): timer = Timer() timer.tic() G = test_data[0] features = test_data[1] id_map = test_data[2] if features is not None: # pad with dummy zero vector features = np.vstack([features, np.zeros((features.shape[1], ))]) context_pairs = test_data[3] if self.random_context else None placeholders = self._construct_placeholders() minibatch = EdgeMinibatchIterator(G, id_map, placeholders, batch_size=self.batch_size, max_degree=self.max_degree, num_neg_samples=self.neg_sample_size, context_pairs=context_pairs) adj_info_ph = tf.compat.v1.placeholder(tf.int32, shape=minibatch.adj.shape) adj_info = tf.Variable(adj_info_ph, trainable=False, name="adj_info") model = self._create_model(placeholders, features, adj_info, minibatch) config = tf.compat.v1.ConfigProto( log_device_placement=self.log_device_placement) if gpu_mem_fraction is not None: config.gpu_options.per_process_gpu_memory_fraction = gpu_mem_fraction config.gpu_options.allow_growth = True config.allow_soft_placement = True # Initialize session sess = tf.compat.v1.Session(config=config) merged = tf.compat.v1.summary.merge_all() # summary_writer = tf.compat.v1.summary.FileWriter(self._log_dir(), # sess.graph) # Initialize model saver saver = tf.compat.v1.train.Saver() # Init variables sess.run(tf.compat.v1.global_variables_initializer(), feed_dict={adj_info_ph: minibatch.adj}) val_adj_info = tf.compat.v1.assign(adj_info, minibatch.test_adj) # Restore model print("Restoring trained model.") checkpoint_file = os.path.join(self._log_dir(), model_checkpoint) ckpt = tf.compat.v1.train.get_checkpoint_state(checkpoint_file) if checkpoint_file: saver.restore(sess, checkpoint_file) print("Model restored.") else: print("This model checkpoint does not exist. The model might " + "not be trained yet or the checkpoint is invalid.") # Infer embeddings sess.run(val_adj_info.op) print("Computing embeddings...") val_embeddings = [] finished = False seen = set([]) nodes = [] iter_num = 0 while not finished: feed_dict_val, finished, edges = minibatch.incremental_embed_feed_dict( self.validate_batch_size, iter_num) iter_num += 1 outs_val = sess.run([model.loss, model.mrr, model.outputs1], feed_dict=feed_dict_val) for i, edge in enumerate(edges): if not edge[0] in seen: val_embeddings.append(outs_val[-1][i, :]) nodes.append(edge[0]) seen.add(edge[0]) val_embeddings = np.vstack(val_embeddings) if self.save_embeddings: print("Saving embeddings...") if not os.path.exists(self._log_dir()): os.makedirs(self._log_dir()) np.save(self._log_dir() + "inferred_embeddings.npy", val_embeddings) with open(self._log_dir() + "inferred_embeddings_ids.txt", "w") as fp: fp.write("\n".join(map(str, nodes))) print("Embeddings saved.\n") # Return only the embeddings of the test nodes test_embeddings_ids = {} for i, node in enumerate(nodes): test_embeddings_ids[node] = i test_nodes = [n for n in G.nodes() if G.node[n]['test']] test_embeddings = val_embeddings[[ test_embeddings_ids[id] for id in test_nodes ]] sess.close() tf.compat.v1.reset_default_graph() timer.toc() return test_nodes, test_embeddings
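# Small sketch (toy values, not the original minibatch output) of the
# de-duplication pattern used in predict() above when collecting embeddings
# edge by edge: only the first occurrence of each source node is kept, in the
# order it is first seen.
import numpy as np

edges = [(0, 1), (2, 3), (0, 4), (2, 5), (6, 0)]
outputs = np.arange(len(edges) * 2).reshape(len(edges), 2)  # fake embeddings, one row per edge

seen, nodes, embeddings = set(), [], []
for i, edge in enumerate(edges):
    if edge[0] not in seen:
        embeddings.append(outputs[i, :])
        nodes.append(edge[0])
        seen.add(edge[0])

print(nodes)                  # [0, 2, 6]
print(np.vstack(embeddings))  # rows 0, 1 and 4 of `outputs`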
def train(self): print("Loading data...") adj_list, features_list, y_train, y_val, train_mask, val_mask = load_data( self.embedding_type) print("Loaded.") nb_nodes = features_list[0].shape[0] ft_size = features_list[0].shape[1] nb_classes = y_train.shape[1] features_list = [features[np.newaxis] for features in features_list] y_train = y_train[np.newaxis] y_val = y_val[np.newaxis] train_mask = train_mask[np.newaxis] val_mask = val_mask[np.newaxis] biases_list = [preprocess_adj_bias(adj) for adj in adj_list] print("Training model...") timer = Timer() timer.tic() print( "Parameters: batch size={}, nb_nodes={}, ft_size={}, nb_classes={}\n" .format(self.batch_size, nb_nodes, ft_size, nb_classes)) model = HAN(self.model, self.hid_units, self.n_heads, nb_classes, nb_nodes, l2_coef=self.weight_decay, ffd_drop=self.ffd_drop, attn_drop=self.attn_drop, activation=self.nonlinearity, residual=self.residual) vlss_mn = np.inf vacc_mx = 0.0 curr_step = 0 train_loss_avg = 0 train_acc_avg = 0 val_loss_avg = 0 val_acc_avg = 0 train_losses = [] val_losses = [] train_accuracies = [] val_accuracies = [] for epoch in range(self.epochs): print("\nEpoch {}".format(epoch)) # Training tr_step = 0 tr_size = features_list[0].shape[0] while tr_step * self.batch_size < tr_size: feats_list = [ features[tr_step * self.batch_size:(tr_step + 1) * self.batch_size] for features in features_list ] _, train_embed, att_val, acc_tr, loss_value_tr = self._train( model=model, inputs_list=feats_list, bias_mat_list=biases_list, lbl_in=y_train[tr_step * self.batch_size:(tr_step + 1) * self.batch_size], msk_in=train_mask[tr_step * self.batch_size:(tr_step + 1) * self.batch_size]) train_loss_avg += loss_value_tr train_acc_avg += acc_tr tr_step += 1 # Validation vl_step = 0 vl_size = features_list[0].shape[0] while vl_step * self.batch_size < vl_size: feats_list = [ features[vl_step * self.batch_size:(vl_step + 1) * self.batch_size] for features in features_list ] _, val_embed, att_val, acc_vl, loss_value_vl = self.evaluate( model=model, inputs_list=feats_list, bias_mat_list=biases_list, lbl_in=y_val[vl_step * self.batch_size:(vl_step + 1) * self.batch_size], msk_in=val_mask[vl_step * self.batch_size:(vl_step + 1) * self.batch_size]) val_loss_avg += loss_value_vl val_acc_avg += acc_vl vl_step += 1 print( 'Training: loss = %.5f, acc = %.5f | Val: loss = %.5f, acc = %.5f' % (train_loss_avg / tr_step, train_acc_avg / tr_step, val_loss_avg / vl_step, val_acc_avg / vl_step)) train_losses.append(train_loss_avg / tr_step) val_losses.append(val_loss_avg / vl_step) train_accuracies.append(train_acc_avg / tr_step) val_accuracies.append(val_acc_avg / vl_step) # Early Stopping if val_acc_avg / vl_step >= vacc_mx or val_loss_avg / vl_step <= vlss_mn: if val_acc_avg / vl_step >= vacc_mx and val_loss_avg / vl_step <= vlss_mn: vacc_early_model = val_acc_avg / vl_step vlss_early_model = val_loss_avg / vl_step working_weights = model.get_weights() print( "Minimum validation loss ({}), maximum accuracy ({}) so far at epoch {}." .format(val_loss_avg / vl_step, val_acc_avg / vl_step, epoch)) self._save_model(model) vacc_mx = np.max((val_acc_avg / vl_step, vacc_mx)) vlss_mn = np.min((val_loss_avg / vl_step, vlss_mn)) curr_step = 0 else: curr_step += 1 if curr_step == self.patience: print("Early stop! Min loss: {}, Max accuracy: {}".format( vlss_mn, vacc_mx)) print("Early stop model validation loss: {}, accuracy: {}". 
format(vlss_early_model, vacc_early_model)) model.set_weights(working_weights) break train_loss_avg = 0 train_acc_avg = 0 val_loss_avg = 0 val_acc_avg = 0 print("Training finished.") training_time = timer.toc() train_losses = [x.numpy() for x in train_losses] val_losses = [x.numpy() for x in val_losses] train_accuracies = [x.numpy() for x in train_accuracies] val_accuracies = [x.numpy() for x in val_accuracies] self._plot_losses(train_losses, val_losses) self._plot_accuracies(train_accuracies, val_accuracies) self._print_stats(train_losses, val_losses, train_accuracies, val_accuracies, training_time)
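# Sketch (toy values) of the early-stopping rule applied in the training loop
# above, with the model-snapshot bookkeeping omitted: training stops once
# `patience` consecutive epochs improve neither the best validation accuracy
# nor the best validation loss.
patience, curr_step = 3, 0
vlss_mn, vacc_mx = float("inf"), 0.0
history = [(0.9, 0.5), (0.8, 0.6), (0.85, 0.55), (0.86, 0.54), (0.87, 0.53)]  # (val_loss, val_acc)
for epoch, (val_loss, val_acc) in enumerate(history):
    if val_acc >= vacc_mx or val_loss <= vlss_mn:
        vacc_mx, vlss_mn = max(val_acc, vacc_mx), min(val_loss, vlss_mn)
        curr_step = 0
    else:
        curr_step += 1
        if curr_step == patience:
            print("Early stop at epoch", epoch)  # epoch 4 with these toy values
            break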
def train(self, train_data, sampler_name='Uniform'): print("Training model...") timer = Timer() timer.tic() G = train_data[0] features = train_data[1] id_map = train_data[2] if features is not None: # pad with dummy zero vector features = np.vstack([features, np.zeros((features.shape[1], ))]) context_pairs = train_data[3] if self.random_context else None placeholders = self._construct_placeholders() minibatch = EdgeMinibatchIterator(G, id_map, placeholders, batch_size=self.batch_size, max_degree=self.max_degree, num_neg_samples=self.neg_sample_size, context_pairs=context_pairs) adj_info_ph = tf.compat.v1.placeholder(tf.int32, shape=minibatch.adj.shape) adj_info = tf.Variable(adj_info_ph, trainable=False, name="adj_info") adj_shape = adj_info.get_shape().as_list() model = self._create_model(sampler_name, placeholders, features, adj_info, minibatch) config = tf.compat.v1.ConfigProto( log_device_placement=self.log_device_placement) config.gpu_options.allow_growth = True config.allow_soft_placement = True # Initialize session sess = tf.compat.v1.Session(config=config) merged = tf.compat.v1.summary.merge_all() # summary_writer = tf.compat.v1.summary.FileWriter( # self._log_dir(sampler_name), sess.graph) # Initialize model saver saver = tf.compat.v1.train.Saver(max_to_keep=self.epochs) # Init variables sess.run(tf.compat.v1.global_variables_initializer(), feed_dict={adj_info_ph: minibatch.adj}) # Restore params of ML sampler model if sampler_name == 'ML' or sampler_name == 'FastML': sampler_vars = tf.compat.v1.get_collection( tf.compat.v1.GraphKeys.GLOBAL_VARIABLES, scope="MLsampler") saver_sampler = tf.compat.v1.train.Saver(var_list=sampler_vars) sampler_model_path = self._sampler_model_path() saver_sampler.restore(sess, sampler_model_path + 'model.ckpt') # Loss node path loss_node_path = self._loss_node_path(sampler_name) if not os.path.exists(loss_node_path): os.makedirs(loss_node_path) # Train model train_shadow_mrr = None shadow_mrr = None total_steps = 0 avg_time = 0.0 epoch_val_costs = [] train_adj_info = tf.compat.v1.assign(adj_info, minibatch.adj) val_adj_info = tf.compat.v1.assign(adj_info, minibatch.test_adj) train_losses = [] validation_losses = [] val_cost_ = [] val_mrr_ = [] shadow_mrr_ = [] duration_ = [] ln_acc = sparse.csr_matrix((adj_shape[0], adj_shape[0]), dtype=np.float32) lnc_acc = sparse.csr_matrix((adj_shape[0], adj_shape[0]), dtype=np.int32) ln_acc = ln_acc.tolil() lnc_acc = lnc_acc.tolil() for epoch in range(self.epochs): minibatch.shuffle() iter = 0 print('Epoch: %04d' % (epoch)) epoch_val_costs.append(0) train_loss_epoch = [] validation_loss_epoch = [] while not minibatch.end(): # Construct feed dictionary feed_dict = minibatch.next_minibatch_feed_dict() feed_dict.update({placeholders['dropout']: self.dropout}) t = time.time() # Training step outs = sess.run([ merged, model.opt_op, model.loss, model.ranks, model.aff_all, model.mrr, model.outputs1, model.loss_node, model.loss_node_count ], feed_dict=feed_dict) train_cost = outs[2] train_mrr = outs[5] train_loss_epoch.append(train_cost) if train_shadow_mrr is None: train_shadow_mrr = train_mrr else: train_shadow_mrr -= (1 - 0.99) * (train_shadow_mrr - train_mrr) if iter % self.validate_iter == 0: # Validation sess.run(val_adj_info.op) val_cost, ranks, val_mrr, duration = self._evaluate( sess, model, minibatch, size=self.validate_batch_size) sess.run(train_adj_info.op) epoch_val_costs[-1] += val_cost validation_loss_epoch.append(val_cost) if shadow_mrr is None: shadow_mrr = val_mrr else: shadow_mrr -= (1 - 0.99) * 
(shadow_mrr - val_mrr) val_cost_.append(val_cost) val_mrr_.append(val_mrr) shadow_mrr_.append(shadow_mrr) duration_.append(duration) # if total_steps % self.print_every == 0: # summary_writer.add_summary(outs[0], total_steps) # Print results avg_time = (avg_time * total_steps + time.time() - t) / (total_steps + 1) if total_steps % self.print_every == 0: print( "Iter: %04d" % iter, "train_loss={:.5f}".format(train_cost), "train_mrr={:.5f}".format(train_mrr), # exponential moving average "train_mrr_ema={:.5f}".format(train_shadow_mrr), "val_loss={:.5f}".format(val_cost), "val_mrr={:.5f}".format(val_mrr), # exponential moving average "val_mrr_ema={:.5f}".format(shadow_mrr), "time={:.5f}".format(avg_time)) ln = outs[7].values ln_idx = outs[7].indices ln_acc[ln_idx[:, 0], ln_idx[:, 1]] += ln lnc = outs[8].values lnc_idx = outs[8].indices lnc_acc[lnc_idx[:, 0], lnc_idx[:, 1]] += lnc iter += 1 total_steps += 1 if total_steps > self.max_total_steps: break # Keep track of train and validation losses per epoch train_losses.append(sum(train_loss_epoch) / len(train_loss_epoch)) validation_losses.append( sum(validation_loss_epoch) / len(validation_loss_epoch)) # If the epoch has the lowest validation loss so far if validation_losses[-1] == min(validation_losses): print( "Minimum validation loss so far ({}) at epoch {}.".format( validation_losses[-1], epoch)) # Save loss node and count loss_node = sparse.save_npz(loss_node_path + 'loss_node.npz', sparse.csr_matrix(ln_acc)) loss_node_count = sparse.save_npz( loss_node_path + 'loss_node_count.npz', sparse.csr_matrix(lnc_acc)) # Save embeddings if self.save_embeddings and sampler_name != "Uniform": sess.run(val_adj_info.op) self._save_embeddings(sess, model, minibatch, self.validate_batch_size, self._log_dir(sampler_name)) # Save model at each epoch print("Saving model at epoch {}.".format(epoch)) saver.save( sess, os.path.join(self._log_dir(sampler_name), "model_epoch_" + str(epoch) + ".ckpt")) if total_steps > self.max_total_steps: break print("Optimization Finished!") training_time = timer.toc() self._plot_losses(train_losses, validation_losses, sampler_name) self._print_stats(train_losses, validation_losses, training_time, sampler_name)
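# The "shadow" MRR values tracked in the training loop above are exponential
# moving averages with decay 0.99; the in-place update
#   ema -= (1 - 0.99) * (ema - x)
# is equivalent to ema = 0.99 * ema + 0.01 * x. Toy illustration with made-up
# MRR values:
ema = None
for x in (0.50, 0.60, 0.70):
    ema = x if ema is None else ema - (1 - 0.99) * (ema - x)
print(round(ema, 5))  # 0.50299 -- the average moves only slowly toward new values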
class FileParser: path_raw = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "..", "data", "raw") path_persistent = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "..", "data", "interim", "parsed_data" ) def __init__(self): self.timer = Timer() self.persistent = {} self.processes = { # Old datasets "old_books": { "filename": os.path.join(self.path_raw, old_books_file), "process_line": "_process_line_old_books", "persistent_file": os.path.join(self.path_persistent, "old_books.pkl"), "persistent_variable": [], "dataset_format": "ntriples" }, "old_books_new_books": { "filename": os.path.join(self.path_raw, old_books_file), "process_line": "_process_line_old_books_new_books", "persistent_file": os.path.join( self.path_persistent, "old_books_new_books.pkl"), "persistent_variable": {}, "dataset_format": "ntriples" }, "old_books_conferences": { "filename": os.path.join(self.path_raw, old_books_file), "process_line": "_process_line_old_books_conferences", "persistent_file": os.path.join( self.path_persistent, "old_books_conferences.pkl"), "persistent_variable": {}, "dataset_format": "ntriples" }, "conferences": { "filename": os.path.join(self.path_raw, old_conferences_file), "process_line": "_process_line_conferences", "persistent_file": os.path.join(self.path_persistent, "conferences.pkl"), "persistent_variable": [], "dataset_format": "ntriples" }, "conferences_name": { "filename": os.path.join(self.path_raw, old_conferences_file), "process_line": "_process_line_conferences_name", "persistent_file": os.path.join( self.path_persistent, "conferences_name.pkl"), "persistent_variable": {}, "dataset_format": "ntriples" }, "conferences_acronym": { "filename": os.path.join(self.path_raw, old_conferences_file), "process_line": "_process_line_conferences_acronym", "persistent_file": os.path.join( self.path_persistent, "conferences_acronym.pkl"), "persistent_variable": {}, "dataset_format": "ntriples" }, "conferences_city": { "filename": os.path.join(self.path_raw, old_conferences_file), "process_line": "_process_line_conferences_city", "persistent_file": os.path.join( self.path_persistent, "conferences_city.pkl"), "persistent_variable": {}, "dataset_format": "ntriples" }, "conferences_country": { "filename": os.path.join(self.path_raw, old_conferences_file), "process_line": "_process_line_conferences_country", "persistent_file": os.path.join( self.path_persistent, "conferences_country.pkl"), "persistent_variable": {}, "dataset_format": "ntriples" }, "conferences_year": { "filename": os.path.join(self.path_raw, old_conferences_file), "process_line": "_process_line_conferences_year", "persistent_file": os.path.join( self.path_persistent, "conferences_year.pkl"), "persistent_variable": {}, "dataset_format": "ntriples" }, "conferences_datestart": { "filename": os.path.join(self.path_raw, old_conferences_file), "process_line": "_process_line_conferences_datestart", "persistent_file": os.path.join( self.path_persistent, "conferences_datestart.pkl"), "persistent_variable": {}, "dataset_format": "ntriples" }, "conferences_dateend": { "filename": os.path.join(self.path_raw, old_conferences_file), "process_line": "_process_line_conferences_dateend", "persistent_file": os.path.join( self.path_persistent, "conferences_dateend.pkl"), "persistent_variable": {}, "dataset_format": "ntriples" }, "conferences_conferenceseries": { "filename": os.path.join(self.path_raw, old_conferences_file), "process_line": "_process_line_conferences_conferenceseries", "persistent_file": os.path.join( 
self.path_persistent, "conferences_conferenceseries.pkl"), "persistent_variable": {}, "dataset_format": "ntriples" }, "conferenceseries": { "filename": os.path.join(self.path_raw, old_conferences_file), "process_line": "_process_line_conferenceseries", "persistent_file": os.path.join( self.path_persistent, "conferenceseries.pkl"), "persistent_variable": [], "dataset_format": "ntriples" }, "conferenceseries_name": { "filename": os.path.join(self.path_raw, old_conferences_file), "process_line": "_process_line_conferenceseries_name", "persistent_file": os.path.join( self.path_persistent, "conferenceseries_name.pkl"), "persistent_variable": {}, "dataset_format": "ntriples" }, # New datasets "books": { "filename": os.path.join(self.path_raw, books_file), "process_line": "_process_line_books", "persistent_file": os.path.join(self.path_persistent, "books.pkl"), "persistent_variable": [], "dataset_format": "json" }, "isbn_books": { "filename": os.path.join(self.path_raw, books_file), "process_line": "_process_line_isbn_books", "persistent_file": os.path.join(self.path_persistent, "isbn_books.pkl"), "persistent_variable": {}, "dataset_format": "json" }, "authors_name": { "filename": os.path.join(self.path_raw, authors_file), "process_line": "_process_line_authors_name", "persistent_file": os.path.join(self.path_persistent, "authors_name.pkl"), "persistent_variable": {}, "dataset_format": "json" }, "chapters": { "filename": os.path.join(self.path_raw, chapters_file), "process_line": "_process_line_chapters", "persistent_file": os.path.join(self.path_persistent, "chapters.pkl"), "persistent_variable": [], "dataset_format": "json" }, "chapters_title": { "filename": os.path.join(self.path_raw, chapters_file), "process_line": "_process_line_chapters_title", "persistent_file": os.path.join(self.path_persistent, "chapters_title.pkl"), "persistent_variable": {}, "dataset_format": "json" }, "chapters_year": { "filename": os.path.join(self.path_raw, chapters_file), "process_line": "_process_line_chapters_year", "persistent_file": os.path.join(self.path_persistent, "chapters_year.pkl"), "persistent_variable": {}, "dataset_format": "json" }, "chapters_language": { "filename": os.path.join(self.path_raw, chapters_file), "process_line": "_process_line_chapters_language", "persistent_file": os.path.join( self.path_persistent, "chapters_language.pkl"), "persistent_variable": {}, "dataset_format": "json" }, "chapters_abstract": { "filename": os.path.join(self.path_raw, chapters_file), "process_line": "_process_line_chapters_abstract", "persistent_file": os.path.join( self.path_persistent, "chapters_abstract.pkl"), "persistent_variable": {}, "dataset_format": "json" }, "chapters_authors": { "filename": os.path.join(self.path_raw, chapters_file), "process_line": "_process_line_chapters_authors", "persistent_file": os.path.join( self.path_persistent, "chapters_authors.pkl"), "persistent_variable": {}, "dataset_format": "json" }, "chapters_authors_name": { "filename": os.path.join(self.path_raw, chapters_file), "process_line": "_process_line_chapters_authors_name", "persistent_file": os.path.join( self.path_persistent, "chapters_authors_name.pkl"), "persistent_variable": {}, "dataset_format": "json" }, "chapters_all_citations": { "filename": os.path.join(self.path_raw, chapters_file), "process_line": "_process_line_chapters_all_citations", "persistent_file": os.path.join( self.path_persistent, "chapters_all_citations.pkl"), "persistent_variable": {}, "dataset_format": "json" }, "chapters_keywords": { "filename": 
os.path.join(self.path_raw, chapters_file), "process_line": "_process_line_chapters_keywords", "persistent_file": os.path.join( self.path_persistent, "chapters_keywords.pkl"), "persistent_variable": {}, "dataset_format": "json" }, "chapters_books_isbns": { "filename": os.path.join(self.path_raw, chapters_file), "process_line": "_process_line_chapters_books_isbns", "persistent_file": os.path.join( self.path_persistent, "chapters_books_isbns.pkl"), "persistent_variable": {}, "dataset_format": "json" }, } def get_data(self, process): # Check if the data is already present if (process in self.persistent): return self.persistent[process] print("Process '{}' not in memory yet.".format(process)) # Load from persistent file if data already processed if os.path.isfile(self.processes[process]["persistent_file"]): with open(self.processes[process]["persistent_file"], "rb") as f: self.persistent[process] = pickle.load(f) return self.persistent[process] print("Process '{}' not persistent yet. Processing.".format( process)) # Process the raw data self.persistent[process] = self.processes[process][ "persistent_variable"] self._parse_file( self.processes[process]["filename"], self.processes[process]["process_line"], self.persistent[process], self.processes[process]["dataset_format"] ) with open(self.processes[process]["persistent_file"], "wb") as f: pickle.dump(self.persistent[process], f) return self.persistent[process] def _parse_file(self, filename, process_line, results, dataset_format): if dataset_format == "json": self._process_json_file(filename, process_line, results) else: self._process_ntriples_file(filename, process_line, results) def _process_json_file(self, filename, process_line, results): print("Computing number of json files.") with tarfile.open(filename, "r:gz", encoding="utf-8") as tar: count_files = len(tar.getnames()) print("Finished computing number of files: {}.\n".format( count_files)) print("Start processing file.\n") self.timer.tic() process_line_function = self.__getattribute__(process_line) with tqdm(desc="Processing files: ", total=count_files, unit="file") as pbar: with tarfile.open(filename, "r:gz", encoding="utf-8") as tar: for member in tar.getmembers(): if "jsonl" in member.name: file = tar.extractfile(member) content = [json.loads(line) for line in file.readlines()] for line in content: process_line_function(line, results) pbar.update(1) self.timer.toc() print("Finished processing file.\n\n") def _process_ntriples_file(self, filename, process_line, results): print("Computing file size.") with gzip.open(filename, mode="rt", encoding="utf-8") as f: file_size = f.seek(0, io.SEEK_END) print("Finished computing file size: {} bytes.\n".format( file_size)) print("Start processing file.\n") self.timer.tic() process_line_function = self.__getattribute__(process_line) with tqdm(desc="Processing file: ", total=file_size, unit="bytes") as pbar: with gzip.open(filename, mode="rt", encoding="utf-8") as f: for line in f: process_line_function(line, results) pbar.update(len(line)) self.timer.toc() print("Finished processing file.\n\n") # Processes implementations def _process_line_old_books(self, line, results): line = line.rstrip(" .\n").split(maxsplit=2) if line[1] == nt_has_conference: if line[0].startswith(nt_book): if line[0] not in results: results.append(line[0]) def _process_line_old_books_new_books(self, line, results): line = line.rstrip(" .\n").split(maxsplit=2) if line[1] == nt_webpage: if line[0].startswith(nt_book): if line[0] in self.get_data("old_books"): new_book_id = 
"sg:pub." + line[2].split( ".com/")[-1].rsplit(">")[0] results[line[0]] = new_book_id def _process_line_old_books_conferences(self, line, results): line = line.rstrip(" .\n").split(maxsplit=2) if line[1] == nt_has_conference: if line[0].startswith(nt_book): results[line[0]] = line[2] def _process_line_conferences(self, line, results): line = line.rstrip(" .\n").split(maxsplit=2) if line[0].startswith(nt_conferences): if line[0] not in results: results.append(line[0]) def _process_line_conferences_name(self, line, results): line = line.rstrip(" .\n").split(maxsplit=2) if line[1] == nt_name: if line[0].startswith(nt_conferences): results[line[0]] = line[2] def _process_line_conferences_acronym(self, line, results): line = line.rstrip(" .\n").split(maxsplit=2) if line[1] == nt_acronym: if line[0].startswith(nt_conferences): results[line[0]] = line[2] def _process_line_conferences_city(self, line, results): line = line.rstrip(" .\n").split(maxsplit=2) if line[1] == nt_city: if line[0].startswith(nt_conferences): results[line[0]] = line[2] def _process_line_conferences_country(self, line, results): line = line.rstrip(" .\n").split(maxsplit=2) if line[1] == nt_country: if line[0].startswith(nt_conferences): results[line[0]] = line[2] def _process_line_conferences_year(self, line, results): line = line.rstrip(" .\n").split(maxsplit=2) if line[1] == nt_year: if line[0].startswith(nt_conferences): results[line[0]] = line[2] def _process_line_conferences_datestart(self, line, results): line = line.rstrip(" .\n").split(maxsplit=2) if line[1] == nt_datestart: if line[0].startswith(nt_conferences): results[line[0]] = line[2] def _process_line_conferences_dateend(self, line, results): line = line.rstrip(" .\n").split(maxsplit=2) if line[1] == nt_dateend: if line[0].startswith(nt_conferences): results[line[0]] = line[2] def _process_line_conferences_conferenceseries(self, line, results): line = line.rstrip(" .\n").split(maxsplit=2) if line[1] == nt_has_conference_series: results[line[0]] = line[2] def _process_line_conferenceseries(self, line, results): line = line.rstrip(" .\n").split(maxsplit=2) if line[0].startswith(nt_conference_series): if line[0] not in results: results.append(line[0]) def _process_line_conferenceseries_name(self, line, results): line = line.rstrip(" .\n").split(maxsplit=2) if line[1] == nt_name: if line[0].startswith(nt_conference_series): results[line[0]] = line[2] def _process_line_books(self, line, results): new_books = list(self.get_data("old_books_new_books").values()) if line["id"] not in results: if line["id"] in new_books: results.append(line["id"]) def _process_line_isbn_books(self, line, results): if "isbn" in line.keys(): if line["id"] in self.get_data("books"): isbn_list = line["isbn"] for isbn in isbn_list: results[isbn] = line["id"] def _process_line_authors_name(self, line, results): family_name = line["familyName"] if "familyName" in line.keys() else "" given_name = line["givenName"] if "givenName" in line.keys() else "" if not family_name == "Not available": author_name = family_name + " " + given_name else: author_name = "" results[line["id"]] = author_name def _process_line_chapters(self, line, results): if "isPartOf" in line.keys(): if line["id"] not in results: book = line["isPartOf"] if "isbn" in book.keys(): isbn_list = book["isbn"] for isbn in isbn_list: if isbn in self.get_data("isbn_books"): results.append(line["id"]) def _process_line_chapters_title(self, line, results): if "name" in line.keys(): if line["id"] in self.get_data("chapters"): 
results[line["id"]] = line["name"] def _process_line_chapters_year(self, line, results): if "datePublished" in line.keys(): if line["id"] in self.get_data("chapters"): year = line["datePublished"].split("-")[0] results[line["id"]] = year def _process_line_chapters_language(self, line, results): if "inLanguage" in line.keys(): if line["id"] in self.get_data("chapters"): results[line["id"]] = line["inLanguage"][0] def _process_line_chapters_abstract(self, line, results): if "description" in line.keys(): if line["id"] in self.get_data("chapters"): results[line["id"]] = line["description"] def _process_line_chapters_authors(self, line, results): if "author" in line.keys(): if line["id"] in self.get_data("chapters"): authors = line["author"] authors_id = [authors[i]["id"] for i in range(len(authors)) if "id" in authors[i].keys()] results[line["id"]] = authors_id def _process_line_chapters_authors_name(self, line, results): if "author" in line.keys(): if line["id"] in self.get_data("chapters"): authors = line["author"] author_names = list() for i in range(len(authors)): family_name = authors[i]["familyName"] if \ "familyName" in authors[i].keys() else "" given_name = authors[i]["givenName"] if "givenName" \ in authors[i].keys() else "" author_names.append(family_name + " " + given_name) results[line["id"]] = author_names def _process_line_chapters_all_citations(self, line, results): if "citation" in line.keys(): if line["id"] in self.get_data("chapters"): citations = line["citation"] citations_id = [citations[i]["id"] for i in range( len(citations))] results[line["id"]] = citations_id def _process_line_chapters_keywords(self, line, results): if "keywords" in line.keys(): if line["id"] in self.get_data("chapters"): results[line["id"]] = line["keywords"] def _process_line_chapters_books_isbns(self, line, results): if "isPartOf" in line.keys(): if line["id"] in self.get_data("chapters"): book = line["isPartOf"] if "isbn" in book.keys(): isbn_list = book["isbn"] results[line["id"]] = isbn_list