def test_neo_to_graph_transform():
    """Load a graph from a local Neo4j instance and export it as a CSV archive."""
    neo_loader = NeoTransformer(host='localhost', port='7474', username='******', password='******')
    neo_loader.load()
    neo_loader.report()
    # hand the loaded networkx graph off to the CSV exporter
    csv_exporter = PandasTransformer(neo_loader.graph)
    csv_exporter.save(os.path.join(target_dir, "neo_graph.csv"))
def test_neo_to_graph_transform():
    """Pull the graph out of Neo4j (default connection) and write it out as CSV."""
    loader = NeoTransformer()
    loader.load()
    loader.report()
    # the PandasTransformer is constructed from the loader itself here
    writer = PandasTransformer(loader)
    writer.save("target/neo_graph.csv")
def test_csv_to_neo_load():
    """Parse node/edge CSVs and push the resulting graph into Neo4j via UNWIND."""
    csv_reader = PandasTransformer()
    csv_reader.parse(os.path.join(resource_dir, "cm_nodes.csv"))
    csv_reader.parse(os.path.join(resource_dir, "cm_edges.csv"))
    # write the parsed graph into a local Neo4j instance
    neo_writer = NeoTransformer(csv_reader.graph, host='localhost', port='7474', username='******', password='******')
    neo_writer.save_with_unwind()
    neo_writer.neo4j_report()
def test_neo_to_graph_transform():
    """Load a graph from Neo4j (default connection settings) and export it as CSV."""
    neo_loader = NeoTransformer(uri=DEFAULT_NEO4J_URL, username=DEFAULT_NEO4J_USERNAME, password=DEFAULT_NEO4J_PASSWORD)
    neo_loader.load()
    neo_loader.report()
    # export the loaded graph through the CSV transformer
    csv_writer = PandasTransformer(neo_loader.graph)
    csv_writer.save(os.path.join(target_dir, "neo_graph.csv"))
def test_load_tsv():
    """Parse TSV node/edge files and verify counts and node descriptions."""
    transformer = PandasTransformer()
    transformer.parse(os.path.join(resource_dir, 'test_nodes.tsv'), input_format='tsv')
    transformer.parse(os.path.join(resource_dir, 'test_edges.tsv'), input_format='tsv')
    graph = transformer.graph
    assert graph.number_of_nodes() == 3
    assert graph.number_of_edges() == 1
    # descriptions are kept verbatim, including the surrounding double quotes
    assert graph.nodes['CURIE:123']['description'] == '"Node of type Gene, CURIE:123"'
    assert graph.nodes['CURIE:456']['description'] == '"Node of type Disease, CURIE:456"'
def test_csv_to_neo_load():
    """Read node/edge CSVs and save the resulting graph into Neo4j."""
    reader = PandasTransformer()
    for filename in ("cm_nodes.csv", "cm_edges.csv"):
        reader.parse(os.path.join(resource_dir, filename))
    writer = NeoTransformer(reader.graph, uri=DEFAULT_NEO4J_URL, username=DEFAULT_NEO4J_USERNAME, password=DEFAULT_NEO4J_PASSWORD)
    writer.save()
    writer.neo4j_report()
def parse(self, name: str, data_file: str, source: str) -> None:
    """Parse an Obograph JSON file and re-emit the graph as TSV.

    Args:
        name: Name of the ontology (used as the output filename stem)
        data_file: data file to parse
        source: Source name

    Returns:
        None.
    """
    print(f"Parsing {data_file}")
    reader = ObographJsonTransformer()
    reader.parse(data_file, provided_by=source)
    # write the parsed graph under the configured output directory
    writer = PandasTransformer(reader.graph)
    writer.save(filename=os.path.join(self.output_dir, f'{name}'), output_format='tsv', mode=None)
def test_load():
    """Parse an OBAN ttl file and export it in CSV, GraphML and JSON flavours."""
    rdf_reader = ObanRdfTransformer()
    rdf_reader.parse("tests/resources/monarch/biogrid_test.ttl")
    rdf_reader.report()
    # edge and node CSVs
    csv_writer = PandasTransformer(rdf_reader)
    csv_writer.save('target/biogrid-e.csv', type='e')
    csv_writer.save('target/biogrid-n.csv', type='n')
    # GraphML
    graphml_writer = GraphMLTransformer(rdf_reader)
    graphml_writer.save("target/x1n.graphml")
    # JSON
    json_writer = JsonTransformer(rdf_reader)
    json_writer.save("target/x1n.json")
def test_neo_to_graph_transform():
    """ load from neo4j and transform to nx graph """
    # NOTE(review): the bare `return` below disables this test; everything
    # after it is dead code kept for reference. Remove the return (and supply
    # real credentials) to re-enable.
    return
    nt = NeoTransformer(host='localhost', port='7474', username='', password='')
    nt.load()
    nt.report()
    t = PandasTransformer(nt.graph)
    t.save("target/neo_graph.csv")
def test_csv_to_neo_load():
    """ load csv to neo4j test """
    pt = PandasTransformer()
    pt.parse("resources/x1n.csv")
    pt.parse("resources/x1e.csv")
    # host must be a bare hostname: the 'http://' scheme is not part of it,
    # and every other NeoTransformer call site in this file passes 'localhost'
    nt = NeoTransformer(pt.graph, host='localhost', port='7474', username='******', password='******')
    nt.save_with_unwind()
    nt.neo4j_report()
def load_node(self, node: dict) -> None:
    """
    Load a node into networkx.MultiDiGraph

    .. Note::
        This method transformers Reasoner Std API format fields to Biolink Model fields.

    Parameters
    ----------
    node : dict
        A node

    """
    # Reasoner Std API uses 'type' where the Biolink Model uses 'category'
    if 'type' in node and 'category' not in node:
        node['category'] = node['type']
        del node['type']
    node = self.validate_node(node)
    kwargs = PandasTransformer._build_kwargs(node.copy())
    if 'id' not in kwargs:
        logging.info("Ignoring node with no 'id': {}".format(node))
        return
    self.graph.add_node(kwargs['id'], **kwargs)
def load_edge(self, edge: Dict) -> None:
    """
    Load an edge into a networkx.MultiDiGraph

    .. Note::
        This methods transformers Reasoner Std API format fields to Biolink Model fields.

    Parameters
    ----------
    edge : dict
        An edge

    """
    # map Reasoner Std API field names onto their Biolink Model equivalents
    if 'source_id' in edge:
        edge['subject'] = edge['source_id']
    if 'target_id' in edge:
        edge['object'] = edge['target_id']
    if 'relation_label' in edge:
        # assumes relation_label is a sequence whose first element is the
        # label -- TODO(review): confirm against Reasoner Std API payloads
        edge['edge_label'] = edge['relation_label'][0]
    edge = self.validate_edge(edge)
    kwargs = PandasTransformer._build_kwargs(edge.copy())
    if 'subject' not in kwargs or 'object' not in kwargs:
        logging.info("Ignoring edge with either a missing 'subject' or 'object': {}".format(kwargs))
        return
    subject_id = kwargs['subject']
    object_id = kwargs['object']
    edge_key = generate_edge_key(subject_id, kwargs['edge_label'], object_id)
    self.graph.add_edge(subject_id, object_id, edge_key, **kwargs)
def test_csv_to_neo_load():
    """ load csv to neo4j test """
    # NOTE(review): the bare `return` below disables this test; the remaining
    # statements are dead code. Drop the return (and fill in credentials) to
    # re-enable it.
    return
    pt = PandasTransformer()
    pt.parse("resources/nodes.csv")
    pt.parse("resources/edges.csv")
    nt = NeoTransformer(pt.graph, host='localhost', port='7474', username='', password='')
    nt.save_with_unwind()
    nt.neo4j_report()
def main(path, output, model):
    """Convert a JSON graph at `path` to output at `output`, optionally loading a Biolink model first."""
    if model is not None:
        bmt.load(model)
    json_reader = JsonTransformer()
    json_reader.parse(path)
    # re-wrap the parsed graph for CSV export
    csv_writer = PandasTransformer(json_reader.graph)
    csv_writer.save(output)
def test_owl_load():
    """ Load a test OWL and export as JSON """
    input_file = os.path.join(resource_dir, 'mody.ttl')
    owl_reader = RdfOwlTransformer()
    owl_reader.parse(input_file, input_format='ttl')
    owl_reader.report()
    # export once as a CSV archive ...
    archive_writer = PandasTransformer(owl_reader.graph)
    archive_writer.save(os.path.join(target_dir, 'mondo_test'))
    # ... and once as JSON
    json_writer = JsonTransformer(owl_reader.graph)
    json_writer.save(os.path.join(target_dir, 'mondo_test.json'))
def test_load():
    """ Test for loading data into PandasTransformer """
    os.makedirs(target_dir, exist_ok=True)
    transformer = PandasTransformer()
    for filename in ("x1n.csv", "x1e.csv"):
        transformer.parse(os.path.join(resource_dir, filename))
    transformer.report()
    transformer.save(os.path.join(target_dir, 'x1copy.csv'))
def test_load():
    """ load and save tests """
    cwd = os.path.abspath(os.path.dirname(__file__))
    src_path = os.path.join(cwd, 'resources', 'monarch', 'biogrid_test.ttl')
    tpath = os.path.join(cwd, 'target')
    os.makedirs(tpath, exist_ok=True)
    tg_path = os.path.join(tpath, "test_output.ttl")
    # execute ObanRdfTransformer's parse and save function
    t = ObanRdfTransformer()
    t.parse(src_path, input_format="turtle")
    t.save(tg_path, output_format="turtle")
    t.report()
    # export the parsed graph as separate edge/node CSV files
    w1 = PandasTransformer(t.graph)
    w1.save(os.path.join(tpath, 'biogrid-e.csv'), type='e')
    w1.save(os.path.join(tpath, 'biogrid-n.csv'), type='n')
    # read again the source, test graph
    src_graph = rdflib.Graph()
    src_graph.parse(src_path, format="turtle")
    # read again the dumped target graph
    tg_graph = rdflib.Graph()
    tg_graph.parse(tg_path, format="turtle")
    # compare subgraphs from the source and the target graph.
    OBAN = Namespace('http://purl.org/oban/')
    for a in src_graph.subjects(RDF.type, OBAN.association):
        # gather all triples about this association subject from each graph
        oban_src_graph = rdflib.Graph()
        oban_src_graph += src_graph.triples((a, None, None))
        oban_tg_graph = rdflib.Graph()
        oban_tg_graph += tg_graph.triples((a, None, None))
        # see they are indeed identical (isomorphic); any mismatch fails hard
        if not oban_src_graph.isomorphic(oban_tg_graph):
            raise RuntimeError('The subgraphs whose subject is ' + str(a) + ' are not isomorphic ones.')
    # additional exports: GraphML and JSON
    w2 = GraphMLTransformer(t.graph)
    w2.save(os.path.join(tpath, "x1n.graphml"))
    w3 = JsonTransformer(t.graph)
    w3.save(os.path.join(tpath, "x1n.json"))
def test_mapping():
    """ create a random graph, map its identifiers, and save it in different formats """
    G = nx.MultiDiGraph()
    N = 100
    E = N * 3
    mapping = {}
    # build N+1 nodes and remember the old->new identifier mapping
    for i in range(0, N + 1):
        nid = curie(i)
        mapping[nid] = mapped_curie(i)
        G.add_node(nid, label="node {}".format(i))
    # wire up E-1 edges between randomly chosen nodes
    for i in range(1, E):
        s = random_curie(N)
        o = random_curie(N)
        G.add_edge(o, s, predicate='related_to')
    print('Nodes={}'.format(len(G.nodes())))
    mapper.map_graph(G, mapping)
    print("Mapped..")
    # spot-check a handful of nodes: ids should now be in the Y: namespace,
    # with the original X: id preserved under 'source_curie'
    count = 0
    for nid in G.nodes():
        # G.node was removed in networkx 2.4; use G.nodes (as the other tests do)
        src = G.nodes[nid]['source_curie']
        assert nid.startswith("Y:")
        assert src.startswith("X:")
        count += 1
        if count > 5:
            break
    print("Saving tsv")
    w = PandasTransformer(G)
    w.save("target/maptest.tar")
    w = ObanRdfTransformer(G)
    w.save("target/maptest.ttl")
def test_load():
    """ load tests """
    csv_io = PandasTransformer()
    csv_io.parse("tests/resources/x1n.csv")
    csv_io.parse("tests/resources/x1e.csv")
    csv_io.report()
    csv_io.save('target/x1copy.csv')
    # also export the loaded graph as GraphML
    graphml_writer = GraphMLTransformer(csv_io)
    graphml_writer.save("target/x1n.graphml")
def test_load():
    """ load tests """
    cwd = os.path.abspath(os.path.dirname(__file__))
    resdir = os.path.join(cwd, 'resources')
    tdir = os.path.join(cwd, 'target')
    os.makedirs(tdir, exist_ok=True)
    t = RdfOwlTransformer()
    fn = os.path.join(resdir, "mody.ttl.gz")
    # use a context manager so the gzip handle is closed even if parse() raises
    # (the original leaked the open file object)
    with gzip.open(fn, 'rb') as f:
        t.parse(f, input_format='ttl')
    t.report()
    w1 = PandasTransformer(t.graph)
    w1.save(os.path.join(tdir, 'mondo-e.csv'), type='e')
    w1.save(os.path.join(tdir, 'mondo-n.csv'), type='n')
    w3 = JsonTransformer(t.graph)
    w3.save(os.path.join(tdir, "mondo.json"))
def parse(self, name: str, data_file: str) -> None:
    """Parse an Obograph JSON file and re-emit the graph as TSV.

    Args:
        name: Name of the ontology (used as the output filename stem)
        data_file: data file to parse

    Returns:
        None.
    """
    logging.info(f"Parsing {data_file}")
    reader = ObographJsonTransformer()
    reader.parse(data_file)
    writer = PandasTransformer(reader.graph)
    writer.save(filename=os.path.join(self.output_dir, f'{name}'), extension='tsv', mode=None)
def parse(self, data_file: str, input_format: str) -> None:
    """Parse an RDF data file and re-emit the graph as TSV.

    Args:
        data_file: data file to parse
        input_format: format of input file

    Returns:
        None
    """
    # define prefix to IRI mappings
    prefix_map = {
        'REACT': 'http://purl.obolibrary.org/obo/go/extensions/reacto.owl#REACTO_',
        'WB': 'http://identifiers.org/wormbase/',
        'FB': 'http://identifiers.org/flybase/',
        'LEGO': 'http://geneontology.org/lego/',
        'GOCAM': 'http://model.geneontology.org/',
        'TAIR.LOCUS': 'http://identifiers.org/tair.locus/',
        'POMBASE': 'http://identifiers.org/PomBase',
        'DICTYBASE.GENE': 'http://identifiers.org/dictybase.gene/',
        'XENBASE': 'http://identifiers.org/xenbase/',
    }
    # define predicates that are to be treated as node properties
    node_props = {
        'http://geneontology.org/lego/evidence',
        'https://w3id.org/biolink/vocab/subjectActivity',
        'https://w3id.org/biolink/vocab/objectActivity',
    }
    print(f"Parsing {data_file}")
    reader = RdfTransformer(curie_map=prefix_map)
    reader.parse(data_file, node_property_predicates=node_props, input_format=input_format)
    writer = PandasTransformer(reader.graph)
    writer.save(os.path.join(self.output_dir, self.source_name), output_format='tsv', mode=None)
def test_sanitize_export(query):
    """Check PandasTransformer._sanitize_export against (input, expected) pairs."""
    key, raw_value = query[0]
    expected = query[1]
    sanitized = PandasTransformer._sanitize_export(key, raw_value)
    if isinstance(expected, str):
        assert sanitized == expected
    elif isinstance(expected, (list, set, tuple)):
        # every expected element must survive sanitization
        for element in expected:
            assert element in sanitized
    elif isinstance(expected, bool):
        assert expected == sanitized
    else:
        assert expected in sanitized
def test_semmeddb_csv():
    """
    Read nodes and edges from CSV and export the resulting graph as an archive
    in every supported compression mode.
    """
    transformer = PandasTransformer()
    transformer.parse(os.path.join(resource_dir, "semmed/semmeddb_test_nodes.csv"))
    transformer.parse(os.path.join(resource_dir, "semmed/semmeddb_test_edges.csv"))
    output = os.path.join(target_dir, "semmeddb_test_export")
    # save output as *.tar, then *.tar.gz, then *.tar.bz2
    transformer.save(output)
    transformer.save(output, mode='w:gz')
    transformer.save(output, mode='w:bz2')
def test_load():
    """ load TTL and save as CSV """
    input_file = os.path.join(resource_dir, 'monarch/biogrid_test.ttl')
    output_file = os.path.join(target_dir, 'test_output.ttl')
    t = ObanRdfTransformer()
    t.parse(input_file, input_format="turtle")
    t.report()
    t.save(output_file, output_format="turtle")
    # also export the parsed graph as a CSV archive
    output_archive_file = os.path.join(target_dir, 'biogrid_test')
    pt = PandasTransformer(t.graph)
    pt.save(output_archive_file)
    # read again the source, test graph
    src_graph = rdflib.Graph()
    src_graph.parse(input_file, format="turtle")
    # read again the dumped target graph
    target_graph = rdflib.Graph()
    target_graph.parse(output_file, format="turtle")
    # compare subgraphs from the source and the target graph.
    OBAN = Namespace('http://purl.org/oban/')
    for a in src_graph.subjects(RDF.type, OBAN.association):
        oban_src_graph = rdflib.Graph()
        oban_src_graph += src_graph.triples((a, None, None))
        oban_tg_graph = rdflib.Graph()
        oban_tg_graph += target_graph.triples((a, None, None))
        # see they are indeed identical (isomorphic); note mismatches are only
        # printed here, not raised, so this check cannot fail the test
        if not oban_src_graph.isomorphic(oban_tg_graph):
            print('The subgraphs whose subject is {} are not isomorphic'.format(a))
    # w2 = GraphMLTransformer(t.graph)
    # w2.save(os.path.join(tpath, "x1n.graphml"))
    w3 = JsonTransformer(t.graph)
    w3.save(os.path.join(target_dir, "biogrid_test.json"))
def test_clique_merge():
    """ Test for clique merge (lenient) """
    os.makedirs(target_dir, exist_ok=True)
    transformer = PandasTransformer()
    transformer.parse(os.path.join(resource_dir, 'cm_nodes.csv'))
    transformer.parse(os.path.join(resource_dir, 'cm_edges.csv'))
    transformer.report()
    # build cliques of equivalent nodes and elect a leader for each
    cm = CliqueMerge(prefix_prioritization_map)
    cm.build_cliques(transformer.graph)
    cm.elect_leader()
    updated_graph = cm.consolidate_edges()
    leader_list = sorted(nx.get_node_attributes(updated_graph, 'clique_leader').keys())
    assert len(leader_list) == 2
    # first clique: leader picked via prefix prioritization, aliases preserved
    n1 = updated_graph.nodes[leader_list[0]]
    assert n1['election_strategy'] == 'PREFIX_PRIORITIZATION'
    assert 'NCBIGene:100302240' in n1['aliases']
    assert 'ENSEMBL:ENSG00000284458' in n1['aliases']
    # second clique
    n2 = updated_graph.nodes[leader_list[1]]
    assert n2['election_strategy'] == 'PREFIX_PRIORITIZATION'
    assert 'NCBIGene:8202' in n2['aliases']
    assert 'OMIM:601937' in n2['aliases']
    assert 'ENSEMBL:ENSG00000124151' not in n2['aliases']
def test_filters(query):
    """Apply node and edge filters while parsing and check the surviving counts."""
    node_filters, edge_filters = query[0], query[1]
    transformer = PandasTransformer()
    for filter_name, filter_value in node_filters.items():
        transformer.set_node_filter(filter_name, filter_value)
    for filter_name, filter_value in edge_filters.items():
        transformer.set_edge_filter(filter_name, filter_value)
    transformer.parse(os.path.join(resource_dir, 'test_nodes2.tsv'), input_format='tsv')
    transformer.parse(os.path.join(resource_dir, 'test_edges2.tsv'), input_format='tsv')
    assert transformer.graph.number_of_nodes() == query[2]
    assert transformer.graph.number_of_edges() == query[3]
def test_load():
    """ load tests """
    cwd = os.path.abspath(os.path.dirname(__file__))
    resdir = os.path.join(cwd, 'resources')
    targetdir = os.path.join(cwd, 'target')
    os.makedirs(targetdir, exist_ok=True)
    transformer = PandasTransformer()
    for filename in ("x1n.csv", "x1e.csv"):
        transformer.parse(os.path.join(resdir, filename))
    transformer.report()
    transformer.save(os.path.join(targetdir, 'x1copy.csv'))
    # also export the loaded graph as GraphML
    graphml_writer = GraphMLTransformer(transformer.graph)
    graphml_writer.save(os.path.join(targetdir, "x1n.graphml"))
def parse(self, name: str, data_file: str, source: str) -> None:
    """Parse an (optionally gzipped) Obograph JSON file and re-emit it as TSV.

    :param name: Name of the ontology
    :param data_file: data file to parse
    :param source: Source name
    :return: None.
    """
    print(f"Parsing {data_file}")
    reader = ObographJsonTransformer()
    # infer compression from the filename suffix
    compression: Optional[str] = 'gz' if data_file.endswith('.gz') else None
    reader.parse(data_file, compression=compression, provided_by=source)
    writer = PandasTransformer(reader.graph)
    writer.save(filename=os.path.join(self.output_dir, f'{name}'), output_format='tsv', mode=None)
def test_clique_generation():
    """ Test for generation of cliques """
    transformer = PandasTransformer()
    for filename in ('cm_nodes.csv', 'cm_edges.csv'):
        transformer.parse(os.path.join(resource_dir, filename))
    transformer.report()
    cm = CliqueMerge(prefix_prioritization_map)
    cm.build_cliques(transformer.graph)
    # each connected component of the clique graph is one clique
    assert len(list(nx.connected_components(cm.clique_graph))) == 2