def upsert_edge(self, edge: Edge, node_type: str = None,
                edge_predicate: str = None):
    """
    Insert ``edge`` into the store unless ``exists_edge`` already reports it.

    When the edge is new, both endpoint nodes are upserted first and the
    edge is written as an intermediate blank node ``_:e`` that sits between
    the source and the target and carries the edge type and value.

    :param edge: the edge to store; ``src``, ``tgt``, ``edge_type`` and
        ``edge_value`` are read from it
    :param node_type: predicate used for the endpoint nodes, falls back to
        ``DEFAULT_NODE_TYPE`` when ``None``
    :param edge_predicate: predicate used to link nodes and the edge node,
        falls back to ``DEFAULT_EDGE_PREDICATE`` when ``None``
    :return: None
    """
    log.debug("Upserting edge...")
    node_type = DEFAULT_NODE_TYPE if node_type is None else node_type
    edge_predicate = (DEFAULT_EDGE_PREDICATE if edge_predicate is None
                      else edge_predicate)

    already_there, _ = self.exists_edge(
        edge, node_type=node_type, edge_predicate=edge_predicate)
    if already_there:
        # Nothing is written for a known edge, we only leave a trace
        log.warning("upsert_edge() was called with existing edge:")
        log.warning(str(edge))
        return

    # Upserting with echo=True hands back the nodes with their db_id set
    src_node = self.upsert_node(
        Node(node_type=node_type, value=edge.src), echo=True)
    uid_a = src_node.db_id
    tgt_node = self.upsert_node(
        Node(node_type=node_type, value=edge.tgt), echo=True)
    uid_b = tgt_node.db_id

    nquads = """
            <{a}> <{ep}> _:e .
            _:e <{ep}> <{b}> .
            _:e <type> "{edge_type}" .
            _:e <value> "{edge_value}" .
            """.format(a=uid_a, b=uid_b, ep=edge_predicate,
                       edge_type=edge.edge_type,
                       edge_value=edge.edge_value)
    log.debug("Edge not found, adding nquad \n{}".format(nquads))
    self.mutate(nquads)
def _setup_connected(g: Graph):
    """
    Fill ``g`` with the nodes ABC and BCD joined by a single edge
    (ABC -> BCD, edge_value 0), then save the graph.

    :param g: the graph under test
    """
    for kmer in ("ABC", "BCD"):
        g.upsert_node(Node(value=kmer))
    g.add_edge(Edge(src="ABC", tgt="BCD", edge_value=0))
    g.save()
def _parse_node(d: dict) -> Node:
    """
    Build a Node from a result dictionary.

    The entry stored under ``DEFAULT_NODE_TYPE`` becomes the node value and
    the entry stored under ``"uid"`` becomes its ``db_id``; every other key
    is ignored. A missing value leaves the node value as an empty string.

    :param d: a single node dictionary from a query result
    :return: the parsed Node
    """
    parsed = Node(value="")
    for key, content in d.items():
        if key == DEFAULT_NODE_TYPE:
            parsed.value = content
            continue
        if key == "uid":
            parsed.db_id = content
    return parsed
def _parse_path(d: dict, node_type: str, edge_predicate: str) -> tuple:
    """
    Flatten a nested path result into a tuple of Nodes.

    Each hop in the result is nested two levels deep under
    ``edge_predicate`` (first the edge entry, then the next node); only the
    first entry of each list is followed. Walking stops at the first node
    that has no ``edge_predicate`` key.

    :param d: the dictionary of the first node on the path
    :param node_type: key holding a node's value
    :param edge_predicate: key under which the next hop is nested
    :return: the Nodes along the path, in order
    """
    hops = [Node(value=d[node_type])]
    cursor = d
    while edge_predicate in cursor:
        # Step over the edge entry straight to the node behind it
        cursor = cursor[edge_predicate][0][edge_predicate][0]
        hops.append(Node(value=cursor[node_type]))
    return tuple(hops)
def test_dgraph_find_value_reverse(dg: Dgraph):
    """
    The value of the only "genome_a" edge must be retrievable when looking
    it up from the uid of the edge's target node.
    """
    source, target = Node(value="ATCG"), Node(value="ATCC")
    for node in (source, target):
        dg.upsert_node(node)
    dg.upsert_edge(Edge(
        src="ATCG", tgt="ATCC", edge_type="genome_a", edge_value=0))

    _, target_uid = dg.exists_node(target)
    found_value = dg.find_value_reverse(target_uid, "genome_a")
    assert found_value == 0
def _setup_connected_distant(g: Graph):
    """
    Fill ``g`` with the chain ABC -> BCD -> CDE (edge values 0 and 1), so
    that ABC and CDE are only connected through BCD, then save the graph.

    :param g: the graph under test
    """
    for kmer in ("ABC", "BCD", "CDE"):
        g.upsert_node(Node(value=kmer))
    chain = (("ABC", "BCD"), ("BCD", "CDE"))
    for position, (src, tgt) in enumerate(chain):
        g.add_edge(Edge(src=src, tgt=tgt, edge_value=position))
    g.save()
def test_dgraph_find_value(dg: Dgraph):
    """
    The value of the only "genome_a" edge must be retrievable when looking
    it up from the uid of the edge's source node.
    """
    source, target = Node(value="ATCG"), Node(value="ATCC")
    for node in (source, target):
        dg.upsert_node(node)
    dg.upsert_edge(Edge(
        src="ATCG", tgt="ATCC", edge_type="genome_a", edge_value=0))

    _, source_uid = dg.exists_node(source)
    found_value = dg.find_value(source_uid, "genome_a")
    log.info("Found value for {} as {}".format(source_uid, found_value))
    assert found_value == 0
def path(self, node_a: str, node_b: str) -> typing.Tuple[tuple, tuple]:
    """
    Find every path that leads from ``node_a`` to ``node_b``.

    Each edge leaving ``node_a`` (as reported by ``connected``) is paired
    with each edge arriving at ``node_b``. A pair is only queried when both
    edges share the same edge type, the target edge value is not smaller
    than the source edge value, and the distance between the two values
    does not exceed the interpreter's recursion limit.

    :param node_a: value of the node the paths start from
    :param node_b: value of the node the paths end at
    :return: a tuple of paths (each a tuple of Nodes) and a tuple of
        dictionaries holding the ``edge_type`` of the path at the same
        index; both are empty when a node is missing or the nodes are not
        connected
    """
    log.info("Checking all paths between {} and {}".format(node_a, node_b))
    for node_value in (node_a, node_b):
        exists, _ = self.exists_node(Node(value=node_value))
        if not exists:
            return tuple(), tuple()
    log.debug("Both nodes exist, checking connectivity...")
    connected, src_edges = self.connected(node_a, node_b)
    if not connected:
        return tuple(), tuple()
    log.debug("Nodes are connected")

    # The edges arriving at node_b and the recursion limit do not depend on
    # the source edge, so look them up once instead of once per source edge
    tgt_edges = self.find_edges_reverse(node_b) if src_edges else tuple()
    recursion_limit = sys.getrecursionlimit()

    paths = []
    paths_meta = []
    for src_edge in src_edges:
        log.info("Finding path between {} and {} with source edge {}"
                 "".format(node_a, node_b, src_edge))
        for tgt_edge in tgt_edges:
            not_matching_edge = tgt_edge.edge_type != src_edge.edge_type
            not_directional = tgt_edge.edge_value < src_edge.edge_value
            ln = tgt_edge.edge_value - src_edge.edge_value
            pass_recursion = ln > recursion_limit
            if not_matching_edge or not_directional or pass_recursion:
                log.info("Skipping tgt edge {} for src edge {}".format(
                    tgt_edge, src_edge
                ))
                continue
            log.debug("Checking path for type: {}".format(
                tgt_edge.edge_type))
            log.debug("Found start value of {} and end value of {}".format(
                src_edge.edge_value, tgt_edge.edge_value))
            query = self._path_query(
                node_type=DEFAULT_NODE_TYPE,
                node_value=node_a,
                edge_predicate=DEFAULT_EDGE_PREDICATE,
                edge_type=src_edge.edge_type,
                start_int=src_edge.edge_value,
                end_int=tgt_edge.edge_value)
            r = self.query(query)
            log.info(r)
            # Anything but exactly one root result is treated as "no path"
            if len(r['q']) != 1:
                log.warning("Path not found for type: {}".format(
                    src_edge.edge_type
                ))
                continue
            p = self._parse_path(
                r['q'][0], DEFAULT_NODE_TYPE, DEFAULT_EDGE_PREDICATE)
            paths.append(p)
            paths_meta.append({'edge_type': tgt_edge.edge_type})
    return tuple(paths), tuple(paths_meta)
def _setup_connected_shortcut(g: Graph):
    """
    Fill ``g`` with two routes from ABC to CDE: the two-hop route
    ABC -> BCD -> CDE of type "path1" and the direct edge ABC -> CDE of
    type "path2", then save the graph.

    :param g: the graph under test
    """
    for kmer in ("ABC", "BCD", "CDE"):
        g.upsert_node(Node(value=kmer))
    edge_specs = (
        ("ABC", "BCD", "path1", 0),
        ("BCD", "CDE", "path1", 1),
        ("ABC", "CDE", "path2", 0),
    )
    for src, tgt, kind, position in edge_specs:
        g.add_edge(Edge(
            src=src, tgt=tgt, edge_type=kind, edge_value=position))
    g.save()
def test_dgraph_find_edges(dg: Dgraph):
    """
    Looking up the edges of "ATCG" must return exactly the one edge that
    was stored, with its source, target, value and type intact.
    """
    for kmer in ("ATCG", "ATCC"):
        dg.upsert_node(Node(value=kmer))
    dg.upsert_edge(Edge(
        src="ATCG", tgt="ATCC", edge_type="genome_a", edge_value=0))

    found = dg.find_edges("ATCG")
    assert len(found) == 1
    only_edge = found[0]
    actual = (only_edge.src, only_edge.tgt,
              only_edge.edge_value, only_edge.edge_type)
    assert actual == ("ATCG", "ATCC", 0, "genome_a")
def test_dgraph_exists_node(dg: Dgraph):
    """
    A node must be reported as missing before it is upserted and as
    existing afterwards.
    """
    probe = Node(value='a')
    found_before, _ = dg.exists_node(probe)
    assert not found_before

    dg.upsert_node(probe, echo=False)
    found_after, uid = dg.exists_node(probe)
    assert found_after
    log.info("uid: {}".format(uid))
def test_graph_basics_nodes(g: Graph):
    """
    Every upserted node must show up in ``g.nodes``.
    """
    values = ["ABC", "BCE", "CEF"]
    expected = {Node(value=v) for v in values}
    for node in expected:
        g.upsert_node(node)
    # We assume the backing store might change ordering and only care for the
    # node values
    stored_values = {n.value for n in g.nodes}
    expected_values = {ne.value for ne in expected}
    assert stored_values == expected_values
def test_graph_connected_repeats_one_middle_path(g: Graph):
    """
    A self-loop on the middle node (BCD -> BCD) of the chain
    ABC -> BCD -> CDE must still yield a single path from ABC to CDE.
    """
    for kmer in ("ABC", "BCD", "CDE"):
        g.upsert_node(Node(value=kmer))
    hops = (("ABC", "BCD"), ("BCD", "BCD"), ("BCD", "CDE"))
    for position, (src, tgt) in enumerate(hops):
        g.add_edge(Edge(src=src, tgt=tgt, edge_value=position))
    g.save()

    paths, _ = g.path('ABC', 'CDE')
    assert len(paths) == 1
def _setup_connected_multiple(g: Graph):
    """
    Fill ``g`` with two independent two-hop routes from ABC to CDE:
    "path1" through BCD and "path2" through XYZ, then save the graph.

    :param g: the graph under test
    """
    # XYZ is deliberately upserted last, after the three main nodes
    for kmer in ("ABC", "BCD", "CDE", "XYZ"):
        g.upsert_node(Node(value=kmer))
    edge_specs = (
        ("ABC", "BCD", "path1", 0),
        ("BCD", "CDE", "path1", 1),
        ("ABC", "XYZ", "path2", 0),
        ("XYZ", "CDE", "path2", 1),
    )
    for src, tgt, kind, position in edge_specs:
        g.add_edge(Edge(
            src=src, tgt=tgt, edge_type=kind, edge_value=position))
    g.save()
def test_graph_connected_repeats_full_path(g: Graph):
    """
    The cycle ABC -> BCD -> CDE -> ABC -> BCD -> CDE must yield three paths
    from ABC to CDE: the short three-node path twice and the full six-node
    path once.

    Any failure while searching for the paths is re-raised as a
    GraphException carrying the graph, with the original error chained.
    """
    n1 = Node(value="ABC")
    n2 = Node(value="BCD")
    n3 = Node(value="CDE")
    g.upsert_node(n1)
    g.upsert_node(n2)
    g.upsert_node(n3)
    e1 = Edge(src="ABC", tgt="BCD", edge_value=0)
    g.add_edge(e1)
    e2 = Edge(src="BCD", tgt="CDE", edge_value=1)
    g.add_edge(e2)
    e3 = Edge(src="CDE", tgt="ABC", edge_value=2)
    g.add_edge(e3)
    e4 = Edge(src="ABC", tgt="BCD", edge_value=3)
    g.add_edge(e4)
    e5 = Edge(src="BCD", tgt="CDE", edge_value=4)
    g.add_edge(e5)
    g.save()
    try:
        paths, _ = g.path('ABC', 'CDE')
    except Exception as err:
        # A bare except would also trap KeyboardInterrupt and SystemExit
        raise GraphException(g) from err
    assert len(paths) == 3
    c = 0
    for path in paths:
        assert path[0].value == "ABC"
        assert path[-1].value == "CDE"
        if len(path) == 3:
            assert path[1].value == "BCD"
            # There are 2 copies of this
            c += 1
        elif len(path) == 6:
            assert path[1].value == "BCD"
            assert path[2].value == "CDE"
            assert path[3].value == "ABC"
            assert path[4].value == "BCD"
            assert path[5].value == "CDE"
    assert c == 2
def test_dgraph_bulk_basics(dgraph_bundled_helper: DgraphBundledHelper):
    """
    Two nodes written to an RDF file must exist in the graph after the
    directory holding that file has been loaded.

    On failure the content of the graph's ``out_dir`` is logged before the
    error is re-raised.
    """
    helper = dgraph_bundled_helper
    rdf_dir = tempfile.mkdtemp()
    rdf_file = pathlib.Path(rdf_dir, 'r1.rdf')
    rdf_content = '_:a <{n}> "ATCG" . \n_:b <{n}> "ATGC" . \n'.format(
        n=DEFAULT_NODE_TYPE)
    with open(rdf_file, 'a') as handle:
        handle.write(rdf_content)

    try:
        helper.load(rdf_dir)
        for kmer in ("ATCG", "ATGC"):
            found, _ = helper.g.exists_node(Node(value=kmer))
            assert found
            log.info("{} exists".format(kmer))
    except Exception as failure:
        log.critical("Couldn't find one of the expected nodes")
        leftovers = glob.glob(
            '{}/**'.format(helper.g.out_dir), recursive=True)
        log.critical("Files is out_dir are:\n{}".format(leftovers))
        raise failure
def test_dgraph_find_depth(dg: Dgraph):
    """
    The depth between two nodes must match the number of "genome_a" edges
    separating them: 1 for direct neighbours, 2 once a third node is
    chained behind the second one.
    """
    na = Node(value="ATCG")
    nb = Node(value="ATCC")
    dg.upsert_node(na)
    dg.upsert_node(nb)
    e = Edge(src="ATCG", tgt="ATCC", edge_type="genome_a", edge_value=0)
    dg.upsert_edge(e)
    _, uid_a = dg.exists_node(na)
    _, uid_b = dg.exists_node(nb)
    d = dg.find_depth(uid_a, uid_b, "genome_a")
    log.info("Found depth for {} to {} as {}".format(uid_a, uid_b, d))
    assert d == 1

    # Extend the chain by one node and measure from the start again
    nc = Node(value="ATGG")
    dg.upsert_node(nc)
    e2 = Edge(src="ATCC", tgt="ATGG", edge_type="genome_a", edge_value=1)
    dg.upsert_edge(e2)
    _, uid_c = dg.exists_node(nc)
    d2 = dg.find_depth(uid_a, uid_c, "genome_a")
    # Report the depth that was just measured, not the first one
    log.info("Found depth for {} to {} as {}".format(uid_a, uid_c, d2))
    assert d2 == 2
def test_graph_node_labels(g: Graph):
    """
    Labels attached to a node on upsert must come back unchanged when the
    nodes are read from the graph.
    """
    # TODO: we dont support node labels yet for our Dgraph backend
    if isinstance(g, Dgraph):
        return

    expected = [
        Node(value="ABC", labels={"species": "dog"}),
        Node(value="BCE", labels={"species": "cat"}),
    ]
    for node in expected:
        g.upsert_node(node)
    assert len(g.nodes) == 2

    labels_by_value = {"ABC": expected[0].labels, "BCE": expected[1].labels}
    for stored in g.nodes:
        # A value we never stored means something went wrong
        assert stored.value in labels_by_value
        assert stored.labels == labels_by_value[stored.value]
def find_edges_reverse(self, node_value: str) -> tuple:
    """
    Find the edges that point at the node holding ``node_value``.

    Three queries are issued: the first finds the uid of an edge node
    linked to the target node, the second finds the value of the node
    linked to that edge node (the source), and the third fetches the edge
    nodes linked to the target, which are handed to
    ``_parse_edges_reverse`` together with the source value.

    :param node_value: value of the node the edges point at
    :return: whatever ``_parse_edges_reverse`` builds from the result, or
        an empty tuple when the node does not exist, no edge points at it,
        or no source node can be found for the edge
    """
    exists, uid = self.exists_node(Node(value=node_value))
    if not exists:
        log.warning("node_value {} doesn't exist".format(node_value))
        return tuple()

    # Find the uid of the edge
    query = """
    {{
        q(func: has({et})) @filter(uid_in({et}, {tgt})) {{
            uid
        }}
    }}
    """.format(et=DEFAULT_EDGE_PREDICATE, tgt=uid)
    r = self.query(query)
    if not r['q']:
        return tuple()
    edge_uid = r['q'][0]['uid']

    # Find the source edge
    query_2 = """
    {{
        q(func: has({nt})) @filter(uid_in({et}, {uid})) {{
            {nt}
        }}
    }}
    """.format(nt=DEFAULT_NODE_TYPE, et=DEFAULT_EDGE_PREDICATE,
               uid=edge_uid)
    r_2 = self.query(query_2)
    if not r_2['q']:
        # Same guard as the other two queries, instead of an IndexError
        log.warning("No source node found for edge {}".format(edge_uid))
        return tuple()
    # NOTE(review): only the first matching edge is used to determine src,
    # yet that src is passed on for every edge parsed below - confirm this
    # is intended when several edges point at the node
    src = r_2['q'][0][DEFAULT_NODE_TYPE]

    # Finally, query the edge
    query_3 = """
    {{
        q(func: has({et})) @filter(uid_in({et}, {tgt})) {{
            expand(_all_) {{
                uid
                expand(_all_) {{
                    {nt}
                }}
            }}
        }}
    }}
    """.format(nt=DEFAULT_NODE_TYPE, et=DEFAULT_EDGE_PREDICATE, tgt=uid)
    r_3 = self.query(query_3)
    if not r_3['q']:
        return tuple()
    return self._parse_edges_reverse(
        list_edges=r_3['q'],
        node_type=DEFAULT_NODE_TYPE,
        edge_predicate=DEFAULT_EDGE_PREDICATE,
        src=src)
def test_graph_basics_edges_dgraph(dg: Dgraph):
    """
    Edges added to a Dgraph-backed graph must be readable back as
    (source value, target value) pairs.

    A failed comparison is re-raised as a GraphException carrying the
    graph, with the original error chained.
    """
    g = dg
    expected = [Node(value=v) for v in ("ABC", "BCE", "CEF")]
    g.add_edge(Edge(src=expected[0].value, tgt=expected[1].value))
    g.add_edge(Edge(src=expected[1].value, tgt=expected[2].value))
    try:
        # In Dgraph we can retrieve the actual kmer value
        assert {(e.src, e.tgt) for e in g.edges} == {
            (expected[0].value, expected[1].value),
            (expected[1].value, expected[2].value)}
    except Exception as err:
        # A bare except would also trap KeyboardInterrupt and SystemExit
        raise GraphException(g) from err
def _parse_node(node: dict) -> Node:
    """
    Converts a node dictionary returned by LemonGraph into our Node class.

    The keys 'value', 'type' and 'ID' map onto the Node's value, node_type
    and db_id; every other key is collected as a label.

    :param node: the node dictionary, must hold 'value', 'type' and 'ID'
    :return: the Node, with labels set to None when there are no extra keys
    """
    reserved = ('value', 'type', 'ID')
    extras = {key: content for key, content in node.items()
              if key not in reserved}
    return Node(value=node['value'],
                node_type=node['type'],
                db_id=node['ID'],
                labels=extras or None)
def upsert_node(self, node: Node, echo: bool = True) -> typing.Optional[
        Node]:
    """
    Store ``node`` unless ``exists_node`` already reports it.

    :param node: the node to store; its ``value`` and ``node_type`` are
        written
    :param echo: when True, return the node with its ``db_id`` set
    :return: the stored Node when ``echo`` is True, otherwise None
    """
    found, uid = self.exists_node(node)
    if not found:
        nquads = '_:{value} <{type}> "{value}" .'.format(
            value=node.value, type=node.node_type)
        log.debug("Node not found, adding nquads \n{}".format(nquads))
        self.mutate(nquads)
        # Going through upsert_node again looks up the uid that was just
        # assigned to the node
        return self.upsert_node(node) if echo else None

    if echo:
        return Node(node_type=node.node_type, value=node.value, db_id=uid)
    return None
def test_graph_basics_edges_lemongraph(lgr: LGGraph):
    """
    Edges added to a LemonGraph-backed graph must be readable back as
    (source db_id, target db_id) pairs.

    A failed comparison is re-raised as a GraphException carrying the
    graph, with the original error chained.
    """
    g = lgr
    expected = [Node(value=v) for v in ("ABC", "BCE", "CEF")]
    # Returned the node with db_id set, these are required to check edges
    nodes_with_ids = [g.upsert_node(node) for node in expected]
    g.add_edge(Edge(src=expected[0].value, tgt=expected[1].value))
    g.add_edge(Edge(src=expected[1].value, tgt=expected[2].value))
    try:
        # In lemongraph these are stored as numerical IDs
        assert {(e.src, e.tgt) for e in g.edges} == {
            (nodes_with_ids[0].db_id, nodes_with_ids[1].db_id),
            (nodes_with_ids[1].db_id, nodes_with_ids[2].db_id)}
    except Exception as err:
        # A bare except would also trap KeyboardInterrupt and SystemExit
        raise GraphException(g) from err
def _setup_connected_no_node(g: Graph):
    """
    Fill ``g`` with the single node ABC and no edges, then save the graph.

    :param g: the graph under test
    """
    g.upsert_node(Node(value="ABC"))
    g.save()
def update_graph(self, km: Kmers, gr: GraphRef, encode: bool = False,
                 buffer: int = 333) -> int:
    """
    Walk through all kmers of ``km`` and add them to ``self.graph``.

    Consecutive kmers of the same contig are joined by an edge whose type
    combines the contig header with ``str(km)`` and whose value counts up
    from 0 within the contig. The graph is saved to ``self.out_file``
    whenever the running kmer count is a multiple of ``buffer``.

    :param km: the kmers to graph; consumed through ``has_next``,
        ``contig_has_next`` and ``next()``
    :param gr: used to turn a kmer into a node label when ``encode`` is set
    :param encode: when True, nodes are labelled with ``gr.node_label(kmer)``
        instead of the kmer itself
    :param buffer: number of kmers between two saves of the graph
    :return: the number of kmers covered
    """
    # NOTE(review): no save is issued after the loop, so kmers past the last
    # multiple of buffer are presumably flushed by the caller - confirm
    self.out_file = os.path.join('outputs/', km.filepath + '.rdf')
    log.debug("Will write to {}".format(self.out_file))
    log.debug("Starting to graph {} in pid {}".format(km, os.getpid()))
    st = time.time()
    # Running count of kmers handled so far, across all contigs
    c = 0
    while km.has_next:
        header1, kmer1 = km.next()
        # Create the first node
        node1_label = gr.node_label(kmer1) if encode else kmer1
        # Nodes are only upserted explicitly for backends other than LGGraph
        if not isinstance(self.graph, LGGraph):
            self.graph.upsert_node(Node(value=node1_label))
        c += 1
        # Used to incrementally encode the edges
        edge_c = 0
        # The same contig still has a kmer
        while km.contig_has_next:
            header2, kmer2 = km.next()
            # Create the second node
            node2_label = gr.node_label(kmer2) if encode else kmer2
            if not isinstance(self.graph, LGGraph):
                # NOTE(review): positional here but value= for the first
                # node above - confirm value is Node's first parameter
                self.graph.upsert_node(Node(node2_label))
            # Create an edge
            try:
                self.graph.add_edge(Edge(
                    src=node1_label,
                    tgt=node2_label,
                    edge_type='{}{}{}'.format(header2, ET_DELIMITER,
                                              str(km)),
                    edge_value=edge_c,
                ), echo=False)
                # self.graph.add_edge(
                #     Edge(
                #         src=node1_label,
                #         tgt=node2_label,
                #         edge_type='{} in {}'.format(header2, str(km)),
                #         edge_value=edge_c,
                #         labels={
                #             'f': str(km),
                #             'hd': header2
                #         }
                #     ),
                #     echo=False
                # )
            except Exception as e:
                # Log which pair of nodes failed before passing the error on
                log.fatal("Failed to add edge between {} and {}".format(
                    node1_label, node2_label))
                raise e
            # Set node1_id to node2_id
            node1_label = node2_label
            c += 1
            edge_c += 1
            if c % buffer == 0:
                # log.debug("Committing txn...")
                self.graph.save(self.out_file)
            # Progress report every 100000 kmers
            if c % 100000 == 0:
                log.debug("{}/{}, {}%".format(c, len(km),
                                              int(c / len(km) * 100)))
        # At this point, we're out of kmers on that contig
        # The loop will check if there's still kmers, and reset kmer1
    en = time.time()
    log.debug("Done graphing {}, covering {} kmers in {} s".format(
        km, c, en - st))
    return c
def _setup_not_connected(g: Graph):
    """
    Fill ``g`` with the nodes ABC and BCD without any edge between them,
    then save the graph.

    :param g: the graph under test
    """
    for kmer in ("ABC", "BCD"):
        g.upsert_node(Node(value=kmer))
    g.save()