def n2v(graph: str, output_dir: str, directed: bool, tag: str, params: dict) -> None:
    """Runs the SNAP implementation of Node2Vec on a NetworkX graph

    Args:
        graph (str): Path to a pickled NetworkX graph
        output_dir (str): The directory where the Node2Vec model will be saved
        directed (bool): If True, process as a directed graph
        tag (str): The tag appended to output files, useful for identification
        params (dict): Dictionary of Node2Vec/Word2Vec parameters
    """
    # Ensure directories exist
    directory_check(output_dir)
    directory_check(output_dir + "/models")
    directory_check(output_dir + "/embeddings")
    temp_dir = output_dir + "/temp"
    directory_check(temp_dir)

    node2vec_init = n2v_init(temp_folder=temp_dir, **params)
    node2vec_fit = n2v_fit(**params)

    print("Beginning node2vec script")
    print("Graph: %s" % graph)
    for key, value in node2vec_init.items():
        print("%s: %s" % (key, value))
    for key, value in node2vec_fit.items():
        print("%s: %s" % (key, value))

    G = nx.read_gpickle(graph)
    if not directed:
        G = G.to_undirected()

    try:
        node2vec = Node2Vec(G, **node2vec_init)
        model = node2vec.fit(**node2vec_fit)
    except Exception as e:
        logging.error("Failed to run Node2Vec on Graph")
        logging.error(e.__doc__)
        return  # model is undefined past this point, so bail out

    embedding_file = generate_out_file("embeddings.pkl", output_dir + "/embeddings", tag)
    model_file = generate_out_file("model.pkl", output_dir + "/models", tag)

    # Save embeddings
    model.wv.save_word2vec_format(embedding_file)
    print("Embeddings saved to %s" % embedding_file)

    # Save model
    model.save(model_file)
    print("Model saved to %s" % model_file)

    print("Completed n2v.py")
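# Example usage (a minimal sketch). The keys accepted by n2v_init/n2v_fit are not
# shown in this excerpt, so the parameter names and paths below are illustrative
# Node2Vec/Word2Vec settings, not a confirmed interface.
example_params = {
    "dimensions": 128,   # embedding size (assumed key)
    "walk_length": 30,   # nodes per random walk (assumed key)
    "num_walks": 200,    # walks per node (assumed key)
    "window": 10,        # Word2Vec context window (assumed key)
}
n2v(
    graph="data/corpus_graph.pkl",   # hypothetical path
    output_dir="output/n2v",         # hypothetical path
    directed=False,
    tag="demo",
    params=example_params,
)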
def fit(self, save: bool = True):
    # Initiate early stopping
    callbacks = []
    if self.early_stopping:
        early_stopping_monitor = EarlyStopping(patience=5)
        callbacks.append(early_stopping_monitor)

    # TODO: Fix TensorBoard
    # Add TensorBoard
    # log_dir = config.TRAIN_LOGS + "/" + self.tag
    # tensorboard_callback = tf.keras.callbacks.TensorBoard(
    #     log_dir=log_dir, histogram_freq=1
    # )
    # callbacks.append(tensorboard_callback)

    self.model.fit(
        self.X_train,
        self.y_train,
        callbacks=callbacks,
        validation_data=(self.X_valid, self.y_valid),
        **self.training_params,
    )

    if save:
        outfile = generate_out_file("sp_model.h5", self.save_dir, self.tag)
        self.model.save(outfile)
        print(f"Model saved to {outfile}")
def build(fred: bool, append_ids: bool, rdf_dir: str, out_dir: str, tag: str) -> None:
    """
    rdf_dir: Directory where RDFs are located
    out_dir: Directory to output graph and log
    fred: If True, leave FRED nodes intact
    append_ids: If True, append file ids to FRED nodes
    tag: a unique tag for the output files. Defaults to current time
    """
    tag_addend = "ontol" if append_ids else "full"
    tag = tag_addend + "-" + tag

    now = datetime.datetime.now()
    print("build_corpus_graph.py")
    print("----------------------")
    print(f"Now: {now}")
    print(f"RDF Dir: {rdf_dir}")
    print(f"Output Dir: {out_dir}")
    print(f"Keep FRED nodes?: {fred}")
    print(f"Stitch only on non-FRED nodes?: {append_ids}")
    print(f"Experiment tag: {tag}")

    rdf_sub_dirs = [str(rdf_dir + "/" + x + "/") for x in os.listdir(rdf_dir)]

    # Output file written by create_graph; later sub-directories are appended to it
    existing_graph = generate_out_file("corpus_graph.pkl", out_dir, tag)
    for i, entry in enumerate(rdf_sub_dirs):
        print(f"Appending subgraphs from {entry}")
        if i == 0:
            create_graph(entry, out_dir, fred, append_ids, tag)
        else:
            create_graph(entry, out_dir, fred, append_ids, tag, existing=existing_graph)

    now = datetime.datetime.now()
    print(f"Finished creating corpus graph {existing_graph}")
    print(now)
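# Example invocation (a sketch; the directories below are placeholders, not paths
# from this repository):
build(
    fred=False,               # collapse FRED nodes
    append_ids=True,          # stitch only on non-FRED nodes
    rdf_dir="data/rdfs",      # hypothetical directory of per-relation RDF sub-dirs
    out_dir="output/graphs",  # hypothetical output directory
    tag="2021-01-01",
)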
def report(self):
    print(self.classification_report)
    print(f"AUC: {self.auc}")

    heatmap_file = generate_out_file("confusion.png", self.save_dir, self.tag)
    self.heatmap.figure.savefig(heatmap_file)
    print(f"Confusion matrix saved to {heatmap_file}")
def fit(self, save: bool = True):
    self.model.fit(self.X_train, self.y_train)

    if save:
        # Note: despite the .h5 extension, this writes a joblib pickle of the model
        outfile = generate_out_file("sp_model.h5", self.save_dir, self.tag)
        joblib.dump(self.model, outfile)
        print(f"Model saved to {outfile}")
def nodevec(graph: str, output_dir: str, directed: bool, tag: str, params: dict) -> None:
    # Ensure directories exist
    directory_check(output_dir)
    directory_check(output_dir + "/models")
    directory_check(output_dir + "/embeddings")
    temp_dir = output_dir + "/temp"
    directory_check(temp_dir)

    w2vparams = get_w2vparams(**params)
    node2vec_init = get_n2vparams(w2vparams=w2vparams, **params)

    print("Beginning node2vec script")
    print("File: %s" % graph)
    for key, value in node2vec_init.items():
        print("%s: %s" % (key, value))
    for key, value in w2vparams.items():
        print("%s: %s" % (key, value))

    G = nx.read_gpickle(graph)
    G = uri_to_str(G)
    if not directed:
        G = G.to_undirected()

    n2v_model = Node2Vec(**node2vec_init)
    n2v_model.fit(G)

    embedding_file = generate_out_file("embeddings.pkl", output_dir + "/embeddings", tag)
    model_file = generate_out_file("model.pkl", output_dir + "/models", tag)

    # Save embeddings
    n2v_model.model.wv.save_word2vec_format(embedding_file)
    print("Embeddings saved to %s" % embedding_file)

    # Save model
    n2v_model.model.save(model_file)
    print("Model saved to %s" % model_file)

    print("Completed nodevectors.py")
    df, y, test_size=args.test_size, stratify=y, random_state=config.RANDOM_SEED
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train,
    y_train,
    test_size=args.test_size,
    stratify=y_train,
    random_state=config.RANDOM_SEED,
)

out_dir = config.SP_SPLITS_DIR + '/train'
out_file = "X_train.pkl"
out_file = generate_out_file(out_file, out_dir, tag)
X_train.to_pickle(out_file)

out_file = "y_train.npy"
out_file = generate_out_file(out_file, out_dir, tag)
np.save(out_file, y_train)

out_dir = config.SP_SPLITS_DIR + '/valid'
out_file = "X_valid.pkl"
out_file = generate_out_file(out_file, out_dir, tag)
X_valid.to_pickle(out_file)

out_file = "y_valid.npy"
out_file = generate_out_file(out_file, out_dir, tag)
np.save(out_file, y_valid)

out_dir = config.SP_SPLITS_DIR + '/test'
out_file = "X_test.pkl"
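# Note on the two-stage split above (a sketch, assuming args.test_size = 0.2 purely
# for illustration): the first train_test_split holds out 20% of the data for test,
# and the second holds out 20% of the remaining 80% for validation, giving roughly
# 64% train / 16% valid / 20% test.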
def create_graph(
    rdf_dir: str,
    out_dir: str,
    fred: bool,
    append: bool,
    tag: str = NOW,
    existing: str = None,
) -> None:
    """
    rdf_dir: Directory where RDFs are located
    out_dir: Directory to output graph and log
    fred: If True, leave FRED nodes intact
    append: If True, append file ids to FRED nodes
    tag (optional): a unique tag for the output files. Defaults to current time
    existing (optional): path to an existing NetworkX graph to append to.
        Should be in pickled format. Defaults to None
    """
    # Initialize structure that will become the final output graph
    full_graph = None
    if existing:
        full_graph = nx.read_gpickle(existing)
    else:
        full_graph = nx.MultiGraph()

    rdf_files = [x for x in os.listdir(rdf_dir) if ".rdf" in x]

    # For every .rdf file in the directory
    for rdf_file in tqdm(rdf_files):
        print(f"\nParsing {rdf_file}")
        rdf_path = rdf_dir + rdf_file
        graph = None

        ### Parse RDF graph
        try:
            graph = get_rdfGraph(rdf_path)
        except Exception as e:
            logging.error("Failed to parse: %s" % rdf_file)
            logging.error(e.__doc__)
            continue

        # Append unique RDF ids to FRED nodes (limits how much of the graph is combined)
        if append:
            uid = get_filename(rdf_path)
            graph = append_rdf_ids(graph, uid)

        # Make NetworkX graph
        try:
            nx_graph = rdflib_to_networkx_multidigraph(graph)  # rdf -> networkx
        except Exception as e:
            logging.error("Failed to parse RDF to NetworkX: %s" % rdf_file)
            logging.error(e.__doc__)
            continue

        # Collapse out FRED nodes
        if not fred:
            try:
                nx_graph = collapse_fred_nodes(nx_graph)
            except Exception as e:
                logging.error("Failed to collapse FRED Nodes: %s" % rdf_file)
                logging.error(e.__doc__)
                continue

        # Add new graph to corpus graph
        try:
            full_graph = nx.compose(full_graph, nx_graph)
        except Exception as e:
            logging.error("Failed to append %s to corpus graph" % rdf_file)
            logging.error(e.__doc__)
            continue

    out_graph = generate_out_file("corpus_graph.pkl", out_dir, tag)
    nx.write_gpickle(full_graph, out_graph)

    print(f"Completed appending {rdf_dir}")
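# Sketch of why appending file ids limits stitching: nx.compose merges nodes that
# share the same identifier, so per-document graphs only stitch together where node
# labels coincide. The node labels below are illustrative, not real FRED output.
import networkx as nx

g1 = nx.MultiGraph()
g1.add_edge("dbpedia:Paris", "fred:event_1#doc1")
g2 = nx.MultiGraph()
g2.add_edge("dbpedia:Paris", "fred:event_1#doc2")

merged = nx.compose(g1, g2)
# "dbpedia:Paris" is shared and merges; the id-suffixed FRED nodes stay separate.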
# Output file written by generate_sp_df; later relation types are appended to it
existing_df = generate_out_file("sp_df.pkl", args.out_dir, tag)
for i, pair in enumerate(pairs):
    json, rdf_dir = pair
    if i == 0:
        generate_sp_df(
            n2v_model_file=n2v_model_file,
            snippets=json,
            rdf_dir=rdf_dir,
            out_dir=args.out_dir,
            node_file=args.node_file,
            tag=tag,
            weighted=args.weighted,
            directed=args.directed,
        )
    else:
        generate_sp_df(
            n2v_model_file=n2v_model_file,
            snippets=json,
            rdf_dir=rdf_dir,
            out_dir=args.out_dir,
            node_file=args.node_file,
            tag=tag,
            weighted=args.weighted,
            directed=args.directed,
            existing=existing_df,
        )

now = datetime.datetime.now()
print(f"Finished creating shortest path dataframe {existing_df}")
print(now)
def generate_sp_df(
    n2v_model_file: str,
    snippets: str,
    rdf_dir: str,
    out_dir: str,
    node_file: str,
    tag: str,
    weighted: bool = False,
    directed: bool = False,
    existing: str = None,
) -> pd.DataFrame:
    """Generates a dataframe of shortest-path vectors between two nodes.

    Args:
        n2v_model_file (str): Path to Node2Vec model
        snippets (str): Path to a .json of snippets containing relations
        rdf_dir (str): Path to the directory of RDFs corresponding to the snippets
        out_dir (str): The directory the dataframe will be written to
        node_file (str): Path to a pickled dataframe containing the terminal nodes for all relations
        tag (str): The experimental tag, appended to the output file name
        weighted (bool, optional): Process as a weighted graph. Defaults to False.
        directed (bool, optional): Process as a directed graph. Defaults to False.
        existing (str, optional): Path to an existing dataframe to append to. Defaults to None.
    """
    now = datetime.datetime.now()
    print("-" * 30)
    print("Beginning shortest_path.generate_sp_df()")
    print("-" * 30)
    print(f"N2V Model: {n2v_model_file}")
    print(f"Snippet File: {snippets}")
    print(f"RDF Dir: {rdf_dir}")

    n2v_model = None
    nv = False
    if "nv" in n2v_model_file:
        n2v_model = load_nodevectors_model(n2v_model_file)
        nv = True
    else:
        n2v_model = load_n2v_model(n2v_model_file)

    data = list()

    # Get list of .rdf files in directory
    rdfs = os.listdir(rdf_dir)

    relations = None
    relation_type = snippets.split("/")[-1].split("_")[0].split(".")[0]  # very GREC specific

    # Load terminal nodes into the <nodes_df> dataframe
    nodes_df = pd.read_pickle(node_file)

    # Load snippets into the <relations> variable
    with open(snippets, "r") as f_grec:
        relations = json.loads(f_grec.read())

    # For every .rdf in the directory
    for rdf in rdfs:
        # Generate path
        rdf_path = rdf_dir + "/" + rdf

        # Variables to retrieve from the GREC .json
        rating = None
        subj = None
        obj = None
        db_subj = None
        db_obj = None
        uid = rdf.split(".")[0]

        # Get variables from the GREC .json
        for relation in relations:
            if relation["UID"] == uid:
                rating = relation["maj_vote"]
                subj = relation["sub"]
                obj = relation["obj"]
                db_subj = relation["dbpedia_sub"]
                db_obj = relation["dbpedia_obj"]
                break

        print(f"Processing {uid}: rating: {rating}, subject: {subj}, object: {obj}")

        sub_node = nodes_df.loc[uid]["sub"]
        obj_node = nodes_df.loc[uid]["obj"]

        # If bad subject/object, skip to next rdf
        if "Not Found" == sub_node or "Not Found" == obj_node:
            print(f"ERROR: Bad subject or object, skipping {uid}")
            continue

        # Parse graphs, remove VN tags, collapse nodes, and undirect graph
        try:
            graph = get_rdfGraph(rdf_path)
            # graph = remove_vn_tags(graph)
            # graph = append_rdf_ids(graph, uid)
            nx_graph = rdflib_to_networkx_multidigraph(graph)
            nx_graph = collapse_fred_nodes(nx_graph)
            nx_graph = nx_graph.to_undirected()  # returns MultiGraph object
            if directed:
                nx_graph = nx.DiGraph(nx_graph)
            else:
                nx_graph = nx.Graph(nx_graph)
        except Exception as e:
            print(f"ERROR: Could not generate graphs for {uid}.")
            print(e.__doc__)
            continue

        if weighted:
            # Calculate weight for all edges
            try:
                nx_graph = to_weighted_graph(nx_graph, n2v_model, nv)
            except Exception as e:
                print(f"ERROR: Could not weight graph {uid}")
                print(e.__doc__)
                continue

        # Shortest path between subject and object (as a list of nodes)
        try:
            if weighted:
                shortest_path = nx.dijkstra_path(nx_graph, obj_node, sub_node)
            else:
                shortest_path = nx.shortest_path(nx_graph, obj_node, sub_node)
        except Exception as e:
            print(f"ERROR: There is no path found between {obj_node} and {sub_node}. Relation: {uid}")
            continue

        # Calculate normalized vector for the path
        ## vector_final holds the sum of all vectors in the path
        vector_final = None

        ## Get the vector for every node and add them
        for node in shortest_path:
            vector = (
                get_nodevectors_vector(n2v_model, node)
                if nv
                else get_n2v_vector(n2v_model, node)
            )
            if vector_final is None:  # for first vector
                vector_final = vector
            else:
                vector_final = vector_final + vector

        # If this is still None, there was an error; skip
        if vector_final is None:
            print("ERROR: Issue with producing embeddings...")
            continue

        # Normalize vector
        n2v_norm = np.linalg.norm(vector_final)
        vector_final = vector_final / n2v_norm

        # Append new entry to list
        new_entry = [uid, subj, obj, relation_type, rating, vector_final]
        data.append(new_entry)
        print(f"Finished processing {uid}")

    df = pd.DataFrame(
        data,
        columns=["UID", "Subject", "Object", "Relation", "Maj_Vote", "Short_Path"],
    )

    out_file = generate_out_file("sp_df.pkl", out_dir, tag)

    if existing:
        df_existing = pd.read_pickle(existing)
        df = pd.concat([df_existing, df], ignore_index=True)

    df.to_pickle(out_file)
    print(f"Shortest paths written to {out_file}")

    print("Completed shortest_path.py execution")
    print("-" * 30)

    return df
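# Example call (a sketch; every path below is a placeholder, and "institution" is
# just an illustrative GREC relation type):
sp_df = generate_sp_df(
    n2v_model_file="output/n2v/models/model-demo.pkl",  # hypothetical model path
    snippets="data/grec/institution.json",              # hypothetical snippet file
    rdf_dir="data/rdfs/institution",                    # hypothetical RDF directory
    out_dir="output/sp",                                # hypothetical output directory
    node_file="output/sp/terminal_nodes-demo.pkl",      # hypothetical node file
    tag="demo",
    weighted=False,
    directed=False,
)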
    str(args.rdf_dir + "/" + x) for x in os.listdir(args.rdf_dir)
]
jsons = [str(args.grec_dir + "/" + x) for x in os.listdir(args.grec_dir)]

# Pair each GREC .json with the RDF sub-directory for the same relation type
pairs = []
for json in jsons:
    for rdf_sub_dir in rdf_sub_dirs:
        if json_relation_tag(json) in rdf_sub_dir:
            pairs.append([json, rdf_sub_dir])
            break  # stop at the first matching sub-directory

# Output file written by generate_terminal_node_df; later relation types are appended to it
existing_df = generate_out_file("terminal_nodes.pkl", args.out_dir, tag)
for i, pair in enumerate(pairs):
    json, rdf_dir = pair
    if i == 0:
        generate_terminal_node_df(
            snippets=json,
            rdf_dir=rdf_dir,
            out_dir=args.out_dir,
            tag=tag,
        )
    else:
        generate_terminal_node_df(
            snippets=json,
            rdf_dir=rdf_dir,
            out_dir=args.out_dir,
            tag=tag,
            existing=existing_df,
        )

now = datetime.datetime.now()
print(f"Finished creating terminal node dataframe {existing_df}")
print(now)