def compute_rballs_tw(in_files, output_dir): nx_graph, uri_node_map = rdf.convert_rdf_to_nx_graph(in_files, discard_classes=True) node_uri_map = {node: uri.replace(u",", u"[comma]").replace(u"\n", u"[new_line]") for uri, node in uri_node_map.items()} nodes_in_graph = nx_graph.number_of_nodes() print "Nodes in graph:", nodes_in_graph for d in ["in", "out", "all"]: rballs_with_big_tw = set() for r in [2, 3, 4, 5]: out_file = codecs.open(output_dir + "tw_r{0}_{1}".format(r, d), "w", "utf8") i = 0 for node in nx_graph.nodes_iter(): print "-------------------------------------" print u"Node {0}/{1} ({2})".format(i, nodes_in_graph, node_uri_map[node]) print "r = {0}, d = {1}".format(r, d) if node in rballs_with_big_tw: # don't compute treewidth for r-balls which are known to be big tw = -1 else: rball = algorithms.r_ball(nx_graph, node, r, -1 if d == "in" else 1 if d == "out" else 0) print "r-ball nodes:", rball.number_of_nodes() tw = arnborg_proskurowski.get_treewidth(rball) if tw == -1: rballs_with_big_tw.add(node) print "Treewidth: ", tw line = u"{0},{1}\n".format(node_uri_map[node], tw) out_file.write(line) # nxext.visualize_graph(rball, node_labels=True, edge_labels=False) i += 1 out_file.close()
def testRDFToNxGraphConvertionWithColoring(self):
    """Converting test_files/dummy.rdf in test mode must produce a colored
    graph isomorphic to the expected reference graph."""
    colored_graph, _ = rdf.convert_rdf_to_nx_graph(["test_files/dummy.rdf"],
                                                   test_mode=True)
    graphs_match = algorithms.isomorphic(
        example_graphs.gt_dummy_colored_expected, colored_graph)
    self.assertTrue(
        graphs_match,
        "Problem converting RDF graph to Networkx graph with colors.")
def calculate_ch_matrix(): in_files = helpers.datasets[dataset]["files"] print "Converting RDF to NetworkX graph started at", time.strftime( time_format) start = time.time() graph, node_id_map = rdf.convert_rdf_to_nx_graph(in_files, discard_classes=False) print "Converting RDF to NetworkX graph took", time.time() - start, "s" print "-----------------------------------------" print "Saving NodeID map started at", time.strftime(time_format) start = time.time() inout.save_to_file(node_id_map, path + "{0}_node_id_map".format(dataset)) print "Saving NodeID map took", time.time() - start, "s" print "-----------------------------------------" print "Building hypergraph started at", time.strftime(time_format) start = time.time() hypergraph = Hypergraph(graph) print "Building hypergraph took", time.time() - start, "s" print "-----------------------------------------" print "Saving hypergraph started at", time.strftime(time_format) start = time.time() hypergraph.save_to_file(path + "{0}_hgraph".format(dataset)) print "Saving hypergraph took", time.time() - start, "s" print "-----------------------------------------" print "Building characteristic matrix started at", time.strftime( time_format) start = time.time() rballs_database, index_node_map = similar_nodes_mining.extract_rballs_database( hypergraph, r_in=r_in, r_out=r_out, r_all=r_all) ch_matrix = CharacteristicMatrix(rballs_database, hypergraph.number_of_nodes(), wl_iterations=wl_iterations, print_progress=True) print "Building characteristic matrix took", time.time() - start, "s" print "-----------------------------------------" print "Saving Column index to Node map started at", time.strftime( time_format) start = time.time() inout.save_to_file(index_node_map, path + "{0}_index_node_map".format(dataset)) print "Saving Column index to Node map took", time.time() - start, "s" print "-----------------------------------------" print "Saving characteristic matrix started at", time.strftime(time_format) start = time.time() 
ch_matrix.save_to_file(path + "{0}_ch_matrix".format(dataset)) print "Saving characteristic matrix took", time.time() - start, "s" print "-----------------------------------------" return ch_matrix, hypergraph, index_node_map, node_id_map
def calculate_ch_matrix():
    """Build and persist the characteristic matrix for the configured dataset.

    Reads module-level configuration (``dataset``, ``path``, ``r_in``,
    ``r_out``, ``r_all``, ``wl_iterations``, ``time_format`` — assumed to be
    defined at module scope; TODO confirm). Each pipeline step is wrapped in
    wall-clock timing prints.

    :return: tuple (ch_matrix, hypergraph, index_node_map, node_id_map).
    """
    in_files = helpers.datasets[dataset]["files"]
    # Step 1: RDF files -> NetworkX graph (classes kept: discard_classes=False).
    print "Converting RDF to NetworkX graph started at", time.strftime(time_format)
    start = time.time()
    graph, node_id_map = rdf.convert_rdf_to_nx_graph(in_files, discard_classes=False)
    print "Converting RDF to NetworkX graph took", time.time() - start, "s"
    print "-----------------------------------------"
    # Step 2: persist the node-id map for later lookups.
    print "Saving NodeID map started at", time.strftime(time_format)
    start = time.time()
    inout.save_to_file(node_id_map, path + "{0}_node_id_map".format(dataset))
    print "Saving NodeID map took", time.time() - start, "s"
    print "-----------------------------------------"
    # Step 3: wrap the plain graph in the project's Hypergraph representation.
    print "Building hypergraph started at", time.strftime(time_format)
    start = time.time()
    hypergraph = Hypergraph(graph)
    print "Building hypergraph took", time.time() - start, "s"
    print "-----------------------------------------"
    # Step 4: persist the hypergraph.
    print "Saving hypergraph started at", time.strftime(time_format)
    start = time.time()
    hypergraph.save_to_file(path + "{0}_hgraph".format(dataset))
    print "Saving hypergraph took", time.time() - start, "s"
    print "-----------------------------------------"
    # Step 5: extract the r-ball database and build the characteristic matrix
    # using Weisfeiler-Lehman iterations.
    print "Building characteristic matrix started at", time.strftime(time_format)
    start = time.time()
    rballs_database, index_node_map = similar_nodes_mining.extract_rballs_database(hypergraph, r_in=r_in, r_out=r_out, r_all=r_all)
    ch_matrix = CharacteristicMatrix(rballs_database, hypergraph.number_of_nodes(), wl_iterations=wl_iterations, print_progress=True)
    print "Building characteristic matrix took", time.time() - start, "s"
    print "-----------------------------------------"
    # Step 6: persist the matrix-column-index -> node map.
    print "Saving Column index to Node map started at", time.strftime(time_format)
    start = time.time()
    inout.save_to_file(index_node_map, path + "{0}_index_node_map".format(dataset))
    print "Saving Column index to Node map took", time.time() - start, "s"
    print "-----------------------------------------"
    # Step 7: persist the characteristic matrix itself.
    print "Saving characteristic matrix started at", time.strftime(time_format)
    start = time.time()
    ch_matrix.save_to_file(path + "{0}_ch_matrix".format(dataset))
    print "Saving characteristic matrix took", time.time() - start, "s"
    print "-----------------------------------------"
    return ch_matrix, hypergraph, index_node_map, node_id_map
def compute_rballs_tw(in_files, output_dir):
    """Compute treewidths of r-balls around every node of an RDF graph.

    For each direction d in {"in", "out", "all"} and radius r in {2, 3, 4, 5}
    writes a CSV file ``tw_r{r}_{d}`` into *output_dir* mapping each node's
    URI to the treewidth of its r-ball (-1 when the computation gives up on a
    ball that is too big — presumably treewidth > 3; TODO confirm against
    arnborg_proskurowski.get_treewidth).

    :param in_files: RDF input files passed to rdf.convert_rdf_to_nx_graph.
    :param output_dir: directory prefix for the output files (expected to end
        with a path separator — the file name is concatenated directly).
    """
    nx_graph, uri_node_map = rdf.convert_rdf_to_nx_graph(in_files, discard_classes=True)
    # Invert the URI->node map and escape characters that would break the
    # one-record-per-line CSV output format.
    node_uri_map = {
        node: uri.replace(u",", u"[comma]").replace(u"\n", u"[new_line]")
        for uri, node in uri_node_map.items()
    }
    nodes_in_graph = nx_graph.number_of_nodes()
    print "Nodes in graph:", nodes_in_graph
    for d in ["in", "out", "all"]:
        # Nodes whose ball was already too big at a smaller radius: a ball
        # can only grow with r, so their treewidth stays -1 and is skipped.
        # Reset per direction since the balls differ per direction.
        rballs_with_big_tw = set()
        for r in [2, 3, 4, 5]:
            out_file = codecs.open(output_dir + "tw_r{0}_{1}".format(r, d), "w", "utf8")
            i = 0
            for node in nx_graph.nodes_iter():
                print "-------------------------------------"
                print u"Node {0}/{1} ({2})".format(i, nodes_in_graph, node_uri_map[node])
                print "r = {0}, d = {1}".format(r, d)
                if node in rballs_with_big_tw:
                    # don't compute treewidth for r-balls which are known to be big
                    tw = -1
                else:
                    # Direction encoding for algorithms.r_ball:
                    # -1 = incoming edges, 1 = outgoing, 0 = both.
                    rball = algorithms.r_ball(
                        nx_graph, node, r,
                        -1 if d == "in" else 1 if d == "out" else 0)
                    print "r-ball nodes:", rball.number_of_nodes()
                    tw = arnborg_proskurowski.get_treewidth(rball)
                    if tw == -1:
                        rballs_with_big_tw.add(node)
                print "Treewidth: ", tw
                line = u"{0},{1}\n".format(node_uri_map[node], tw)
                out_file.write(line)
                # nxext.visualize_graph(rball, node_labels=True, edge_labels=False)
                i += 1
            out_file.close()
def testRDFToNxGraphConvertionWithColoring(self):
    """Converting test_files/dummy.rdf in test mode must produce a colored
    graph isomorphic to the expected reference graph."""
    dummy_colored, _ = rdf.convert_rdf_to_nx_graph(["test_files/dummy.rdf"], test_mode=True)
    isomorphic = algorithms.isomorphic(example_graphs.gt_dummy_colored_expected, dummy_colored)
    self.assertTrue(isomorphic, "Problem converting RDF graph to Networkx graph with colors.")
# Cross-validation configuration for the "drugadmin" dataset.
dataset = "drugadmin"
# Weisfeiler-Lehman iteration counts to try.
wl_iter_range = [3]  # range(0, 10)
# (k, L) sketch-parameter pairs; each pair is annotated with the inflection
# point of the corresponding similarity S-curve (see crossval.loo_crossval).
k_L_range = [
    (20, 1),  # inflection point ~0.
    (15, 5),  # inflection point 0.1
    (10, 9),  # inflection point 0.2
    (7, 12),  # inflection point 0.3
    (5, 13),  # inflection point 0.4
    (4, 16),  # inflection point 0.5
    (3, 16),  # inflection point 0.6
    (2, 11),  # inflection point 0.7
    (2, 25),  # inflection point 0.8
    (1, 10),  # inflection point 0.9
    (1, 20),  # inflection point ~1.
]
# Inflection points to scan directly (alternative to k_L_range, see below).
infl_point_range = [0., 0.0000001, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.]
p_range = [1]
# r-ball radii per direction (in / out / all) to try.
r_in_range = [3]  # range(0, 4)
r_out_range = [2]  # range(0, 4)
r_all_range = [0]
output_dir = "../output_rdf/crossval_test/"

if __name__ == '__main__':
    # Build the hypergraph for the configured dataset and run leave-one-out
    # cross-validation over the parameter ranges above.
    in_files = helpers.datasets[dataset]["files"]
    graph, node_id_map = rdf.convert_rdf_to_nx_graph(in_files, discard_classes=False)
    hypergraph = Hypergraph(graph)
    best_model = crossval.loo_crossval(hypergraph, wl_iter_range, r_in_range, r_out_range, r_all_range, output_dir, infl_point_range=infl_point_range)
    # best_model = crossval.loo_crossval(hypergraph, wl_iter_range, r_in_range, r_out_range, r_all_range, output_dir, k_L_range=k_L_range)
    print "Best model:", best_model
def prepare_rdf_chemical_data(rdf_files, compounds_targets_file, uri_prefix, process_compound_function=None, compounds_and_targets=None, sort_rdf_nodes_before_processing=False, rdf_colors_state=None):
    """Prepare a chemical-compound hypergraph database from RDF data.

    Converts *rdf_files* to a colored graph, strips literal nodes, and lazily
    yields one record ``(comp_id, [neighborhood_hypergraph], target_label)``
    per compound listed in *compounds_targets_file* (or in the explicit
    *compounds_and_targets* iterable of ``(comp_id, target_label)`` pairs).

    :param rdf_files: RDF input files for rdf.convert_rdf_to_nx_graph.
    :param compounds_targets_file: text file with "<comp_id> <target>" lines;
        '#' lines are skipped, a '$' line terminates reading.
    :param uri_prefix: prefix prepended to comp_id to obtain the compound URI.
    :param process_compound_function: optional callback invoked with each
        record before it is yielded.
    :param compounds_and_targets: optional pre-built iterable overriding the
        targets file.
    :param sort_rdf_nodes_before_processing: forwarded as ``test_mode`` to the
        RDF conversion (presumably makes node ordering deterministic — TODO
        confirm).
    :param rdf_colors_state: optional dict with keys 'colors' and
        'next_color_id' to continue a previous color assignment.
    :return: tuple (chem_database generator, new_rdf_colors_state dict).
    """
    def read_compounds_and_targets():
        # Parse "<comp_id> <target_label>" lines; '#' = comment, '$' = end marker.
        with open(compounds_targets_file, "r") as ct_file:
            for line in ct_file.readlines():
                if line.startswith("#"):
                    continue
                elif line.startswith("$"):
                    break
                else:
                    comp_id, target_label = tuple(line[:-1].split(" "))
                    yield unicode(comp_id), int(target_label)

    def chem_database_generator(full_graph, uri_node_map, type_color_map, compounds_and_targets):
        # Collect the colors of literal-typed nodes so they can be removed.
        literal_colors = set()
        for rdf_type in type_color_map:
            # TODO: this condition is unsafe because it may remove not only literal colors
            if rdf_type.startswith(u"http://www.w3.org/2001/XMLSchema#"):
                literal_colors.add(type_color_map[rdf_type])
        # Boolean literals are kept (their colors are taken back out of the
        # removable set) — they are handled specially per compound below.
        bool_colors = filter(lambda x: x.startswith(u"http://www.w3.org/2001/XMLSchema#boolean"), type_color_map)
        bool_colors = set(map(lambda x: type_color_map[x], bool_colors))
        literal_colors -= bool_colors
        # nodes() returns a list copy, so removing nodes while looping is safe.
        for node in full_graph.nodes():
            node_labels_set = set(full_graph.node[node]["labels"])
            # remove all literals (except booleans)
            if literal_colors & node_labels_set:
                full_graph.remove_node(node)
        # remove the color of named individual type from all nodes where it occurs
        named_indiv_uri = u"http://www.w3.org/2002/07/owl#NamedIndividual"
        if named_indiv_uri in type_color_map:
            named_indiv_color = type_color_map[named_indiv_uri]
            for node in full_graph.nodes_iter():
                if named_indiv_color in full_graph.node[node]["labels"]:
                    full_graph.node[node]["labels"].remove(named_indiv_color)
        full_hypergraph = Hypergraph(full_graph)
        # ################
        # # INFO: use this to remove the isMutagenic property when predicting mutagenicity
        # is_mutag_color = type_color_map[u"http://dl-learner.org/carcinogenesis#isMutagenic"]
        # edges_to_remove = []
        # for edge in full_hypergraph.edges_iter():
        #     if is_mutag_color in full_hypergraph.edge(edge)['labels']:
        #         edges_to_remove.append(edge)
        # for edge in edges_to_remove:
        #     full_hypergraph.safe_remove_edge(edge)
        # ################
        if not compounds_and_targets:
            compounds_and_targets = read_compounds_and_targets()

        def remove_other_neighbors_of_bool_literals(hypergraph, center_node):
            # Boolean literal nodes may be shared; keep only the edge to the
            # current compound by deleting the literal's other neighbors.
            center_neighbors = hypergraph.neighbors(center_node)
            bool_literals = filter(lambda n: set(hypergraph.node[n]['labels']) & bool_colors, center_neighbors)
            for bool_literal in bool_literals:
                bool_literal_neigbors = set(hypergraph.neighbors(bool_literal))
                # exclude the center node from the removable nodes
                bool_literal_neigbors.remove(center_node)
                for neigh in bool_literal_neigbors:
                    hypergraph.safe_remove_node(neigh)

        for comp_id, target_label in compounds_and_targets:
            # Hypergraph node ids are the RDF node ids prefixed with "n_".
            node_id = u"n_{0}".format(uri_node_map[uri_prefix + comp_id])
            # 2-ball in both directions around the compound node.
            comp_neighborhood_hypergraph = algorithms.r_ball_hyper(full_hypergraph, node_id, 2, 0)
            remove_other_neighbors_of_bool_literals(comp_neighborhood_hypergraph, node_id)
            ch_db_record = (comp_id, [comp_neighborhood_hypergraph], target_label)
            if process_compound_function:
                process_compound_function(ch_db_record)
            # ############
            # def get_key(value, dictionary):
            #     for key in dictionary:
            #         if dictionary[key] == value:
            #             return key
            #     return None
            # g = ch_db_record[1][0].copy()
            # for n in g.node:
            #     n_new_labels = []
            #     for n_color in g.node[n]['labels']:
            #         n_rdf_type = get_key(n_color, type_color_map)
            #         n_rdf_type = n_rdf_type[n_rdf_type.find(u"#") + 1:]
            #         n_new_labels.append(n_rdf_type)
            #     g.node[n]['labels'] = n_new_labels
            # g.visualize()
            # ############
            yield ch_db_record

    # Resume a previous color assignment if one was passed in.
    if rdf_colors_state:
        rdf_base_colors = rdf_colors_state['colors']
        rdf_next_color_id = rdf_colors_state['next_color_id']
    else:
        rdf_base_colors = None
        rdf_next_color_id = None

    full_graph, uri_node_map, type_color_map, next_color_id = rdf.convert_rdf_to_nx_graph(rdf_files, return_colors=True, test_mode=sort_rdf_nodes_before_processing, base_colors=rdf_base_colors, next_color_id=rdf_next_color_id, encode_boolean_value_in_color=True)
    chem_database = chem_database_generator(full_graph, uri_node_map, type_color_map, compounds_and_targets)
    new_rdf_colors_state = {'colors': type_color_map, 'next_color_id': next_color_id}
    return chem_database, new_rdf_colors_state