import sys

from bmt import Toolkit


class BiolinkModel:
    """ Programmatic model of Biolink. """

    root_type = 'biolink:NamedThing'

    def __init__(self, bl_version='1.5.0'):
        self.bl_url = f'https://raw.githubusercontent.com/biolink/biolink-model/{bl_version}/biolink-model.yaml'
        self.toolkit = Toolkit(self.bl_url)

    def to_camel_case(self, snake_str):
        """ Convert a snake case string to camel case. """
        components = snake_str.split('_')
        return ''.join(x.title() for x in components)

    def get_class(self, name):
        """ Get a Python class from a string name. """
        return getattr(sys.modules["biolink.model"], name)

    def is_derived(self, a_class_name, classes):
        """ Return true if the class derives from any of the provided classes. """
        for c in classes:
            # issubclass, not isinstance: get_class returns a class object.
            if issubclass(self.get_class(self.to_camel_case(a_class_name)), c):
                return True
        return False

    def find_biolink_leaves(self, biolink_concepts):
        """
        Given a list of biolink concepts, returns the leaves removing any parent concepts.
        :param biolink_concepts: list of biolink concepts
        :return: leaf concepts.
        """
        ancestry_set = set()
        all_mixins_in_tree = set()
        all_concepts = set(biolink_concepts)
        # Keep track of things like "MacromolecularMachine" in current datasets.
        # @TODO remove this and mark such nodes as errors.
        unknown_elements = set()
        for x in all_concepts:
            current_element = self.toolkit.get_element(x)
            mixins = set()
            if current_element:
                if 'mixins' in current_element and len(current_element['mixins']):
                    for m in current_element['mixins']:
                        mixins.add(self.toolkit.get_element(m).class_uri)
            else:
                unknown_elements.add(x)
            ancestors = set(self.toolkit.get_ancestors(x, reflexive=False, formatted=True))
            ancestry_set = ancestry_set.union(ancestors)
            all_mixins_in_tree = all_mixins_in_tree.union(mixins)
        leaf_set = all_concepts - ancestry_set - all_mixins_in_tree - unknown_elements
        return leaf_set

    def get_leaf_class(self, names):
        """ Return the first leaf class found in the provided list of names. """
        leaves = list(self.find_biolink_leaves(names))
        return leaves[0]
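# A minimal usage sketch (not part of the original module) for
# BiolinkModel.find_biolink_leaves: with both a parent ('biolink:NamedThing')
# and a child ('biolink:Gene') in the input, only the child should survive.
# The concept list is illustrative; results depend on the model version.
if __name__ == '__main__':
    bl = BiolinkModel(bl_version='1.5.0')
    concepts = ['biolink:NamedThing', 'biolink:Gene']
    print(bl.find_biolink_leaves(concepts))  # expected: {'biolink:Gene'}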
from bmt import Toolkit


def test_get_element():
    toolkit = Toolkit()
    gene = toolkit.get_element('gene')
    locus = toolkit.get_element('locus')
    assert gene == locus
    o = toolkit.get_element('drug intake')
    assert o and o.name == 'drug exposure'
    o = toolkit.get_element('molecular function')
    assert o and o.name == 'molecular activity'
    o = toolkit.get_element('RNA Product')
    assert o and o.name == 'RNA product'
    o = toolkit.get_element('rna product')
    assert o and o.name == 'RNA product'
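# A companion sketch (not from the original test suite): the same Toolkit can
# also walk the hierarchy via get_ancestors/get_descendants, which the other
# snippets in this collection rely on. Assertions here assume the default
# (reflexive) behavior of those calls.
def test_hierarchy_lookups_sketch():
    toolkit = Toolkit()
    ancestors = toolkit.get_ancestors('gene', formatted=True)
    assert 'biolink:NamedThing' in ancestors
    descendants = toolkit.get_descendants('molecular activity')
    assert 'molecular activity' in descendants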
import os
from collections import defaultdict

import requests
from bmt import Toolkit

# LabeledID and Text are project-local helpers; the import path below is assumed.
from .util import LabeledID, Text


class NodeFactory:

    def __init__(self, label_dir):
        #self.url_base = 'http://arrival.edc.renci.org:32511/bl'
        self.url_base = 'https://bl-lookup-sri.renci.org/bl'
        self.toolkit = Toolkit(
            'https://raw.githubusercontent.com/biolink/biolink-model/1.6.1/biolink-model.yaml'
        )
        self.ancestor_map = {}
        self.prefix_map = {}
        self.ignored_prefixes = set()
        self.extra_labels = {}
        self.label_dir = label_dir

    def get_ancestors(self, input_type):
        if input_type in self.ancestor_map:
            return self.ancestor_map[input_type]
        a = self.toolkit.get_ancestors(input_type)
        ancs = [self.toolkit.get_element(ai)['class_uri'] for ai in a]
        if input_type not in ancs:
            ancs = [input_type] + ancs
        self.ancestor_map[input_type] = ancs
        return ancs

    def get_prefixes(self, input_type):
        if input_type in self.prefix_map:
            return self.prefix_map[input_type]
        url = f'{self.url_base}/{input_type}'
        response = requests.get(url)
        try:
            j = response.json()
            prefs = j['id_prefixes']
        except Exception:
            # This is a mega hack to deal with the taxon change.
            prefs = ['NCBITaxon', 'MESH']
        # The prefixes are in a particular order, but apparently the list can
        # contain duplicates (ugh), so collapse consecutive repeats.
        newprefs = ['']
        for pref in prefs:
            if not pref == newprefs[-1]:
                newprefs.append(pref)
        prefs = newprefs[1:]
        self.prefix_map[input_type] = prefs
        return prefs

    def make_json_id(self, input):
        if isinstance(input, LabeledID):
            if input.label is not None and input.label != '':
                return {'identifier': input.identifier, 'label': input.label}
            return {'identifier': input.identifier}
        return {'identifier': input}

    def clean_list(self, input_identifiers):
        # Sometimes we end up with something like [(HP:123,'name'),HP:123,UMLS:3445]. Clean up.
        cleanup = defaultdict(list)
        for x in list(input_identifiers):
            if isinstance(x, LabeledID):
                cleanup[x.identifier].append(x)
            else:
                cleanup[x].append(x)
        cleaned = []
        for v in cleanup.values():
            if len(v) == 1:
                cleaned.append(v[0])
            else:
                # Originally, we were just trying to get the LabeledID. But sometimes
                # we get more than one, so len(v) can be more than two.
                wrote = False
                for vi in v:
                    if isinstance(vi, LabeledID):
                        cleaned.append(vi)
                        wrote = True
                        break
                if not wrote:
                    print(input_identifiers)
                    exit()
        return cleaned

    def load_extra_labels(self, prefix):
        labelfname = os.path.join(self.label_dir, prefix, 'labels')
        lbs = {}
        if os.path.exists(labelfname):
            with open(labelfname, 'r') as inf:
                for line in inf:
                    x = line.strip().split('\t')
                    lbs[x[0]] = x[1]
        self.extra_labels[prefix] = lbs

    def apply_labels(self, input_identifiers, labels):
        # Originally we needed to clean up the identifier lists, because there would
        # be both LabeledIDs and string ids and we had to reconcile them. But now we
        # only allow regular ids in the list, and we need to turn some of them into
        # LabeledIDs for output.
        labeled_list = []
        for iid in input_identifiers:
            if isinstance(iid, LabeledID):
                print("LabeledIDs don't belong here; pass in labels separately", iid)
                exit()
            if iid in labels:
                labeled_list.append(LabeledID(identifier=iid, label=labels[iid]))
            else:
                prefix = Text.get_prefix(iid)
                if prefix not in self.extra_labels:
                    self.load_extra_labels(prefix)
                if iid in self.extra_labels[prefix]:
                    labeled_list.append(
                        LabeledID(identifier=iid, label=self.extra_labels[prefix][iid]))
                else:
                    labeled_list.append(iid)
        return labeled_list

    def create_node(self, input_identifiers, node_type, labels=None):
        # This is where we normalize, i.e. choose the best id, and add types in accord with BL.
        # We should also include provenance and version information for the node set build.
        if labels is None:  # avoid a mutable default argument
            labels = {}
        ancestors = self.get_ancestors(node_type)
        #ancestors.reverse()
        prefixes = self.get_prefixes(node_type)
        if len(input_identifiers) == 0:
            return None
        if len(input_identifiers) > 1000:
            print('this seems like a lot')
            print(len(input_identifiers))
        cleaned = self.apply_labels(input_identifiers, labels)
        try:
            idmap = defaultdict(list)
            for i in list(cleaned):
                idmap[Text.get_curie(i).upper()].append(i)
        except AttributeError:
            print('something very bad')
            print(input_identifiers)
            print(len(input_identifiers))
            for i in list(input_identifiers):
                print(i)
                print(type(i))
                print(Text.get_curie(i))
                print(Text.get_curie(i).upper())
            exit()
        identifiers = []
        accepted_ids = set()
        # Converting identifiers from LabeledID to dicts.
        # In order to be consistent from run to run, we need to worry about the
        # case where e.g. there are 2 UMLS ids and UMLS is the preferred prefix.
        # We choose the canonical ID here just by sorting them.
        for p in prefixes:
            pupper = p.upper()
            if pupper in idmap:
                newids = []
                for v in idmap[pupper]:
                    newid = Text.recurie(v, p)
                    jid = self.make_json_id(newid)
                    newids.append((jid['identifier'], jid))
                    accepted_ids.add(v)
                newids.sort()
                identifiers += [nid[1] for nid in newids]
        # Warn if we have prefixes that we're ignoring.
        for k, vals in idmap.items():
            for v in vals:
                if v not in accepted_ids and (k, node_type) not in self.ignored_prefixes:
                    print(f'Ignoring prefix {k} for type {node_type}, identifier {v}')
                    self.ignored_prefixes.add((k, node_type))
        if len(identifiers) == 0:
            return None
        best_id = identifiers[0]['identifier']
        # identifiers is in preferred order, so choose the first non-empty label
        # to be the node label.
        labels = list(filter(lambda x: len(x) > 0,
                             [l['label'] for l in identifiers if 'label' in l]))
        label = None
        if len(labels) > 0:
            label = labels[0]
        node = {
            'id': {'identifier': best_id},
            'equivalent_identifiers': identifiers,
            'type': ancestors
        }
        if label is not None:
            node['id']['label'] = label
        return node
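# A hedged usage sketch for NodeFactory.create_node (not part of the original
# module). It requires network access to the bl-lookup service and the
# project-local helpers above; the label directory, identifiers, and labels
# are illustrative.
if __name__ == '__main__':
    factory = NodeFactory(label_dir='/path/to/labels')
    node = factory.create_node(
        input_identifiers=['MONDO:0005148', 'UMLS:C0011860'],
        node_type='biolink:Disease',
        labels={'MONDO:0005148': 'type 2 diabetes mellitus'})
    if node:
        print(node['id'], node['type'][:3])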
import gzip
from itertools import chain
from typing import Any, Dict, Generator, Optional

import ijson
import stringcase
from bmt import Toolkit

# The base class and helpers come from kgx; these import paths are assumptions
# for the kgx version this source was written against.
from kgx.config import get_logger
from kgx.prefix_manager import PrefixManager
from kgx.source.json_source import JsonSource
from kgx.utils.kgx_utils import format_biolink_slots, get_biolink_element

log = get_logger()


class ObographSource(JsonSource):
    """
    ObographSource is responsible for reading data as records
    from an OBO Graph JSON.
    """

    HAS_OBO_NAMESPACE = 'http://www.geneontology.org/formats/oboInOwl#hasOBONamespace'
    SKOS_EXACT_MATCH = 'http://www.w3.org/2004/02/skos/core#exactMatch'

    def __init__(self):
        super().__init__()
        self.toolkit = Toolkit()
        self.ecache: Dict = {}

    def parse(
        self,
        filename: str,
        format: str = 'json',
        compression: Optional[str] = None,
        provided_by: Optional[str] = None,
        **kwargs: Any,
    ) -> Generator:
        """
        This method reads from JSON and yields records.

        Parameters
        ----------
        filename: str
            The filename to parse
        format: str
            The format (``json``)
        compression: Optional[str]
            The compression type (``gz``)
        provided_by: Optional[str]
            The name of the source providing the input file
        kwargs: Any
            Any additional arguments

        Returns
        -------
        Generator
            A generator for records
        """
        if provided_by:
            self.graph_metadata['provided_by'] = [provided_by]
        n = self.read_nodes(filename, compression)
        e = self.read_edges(filename, compression)
        yield from chain(n, e)

    def read_nodes(self, filename: str, compression: Optional[str] = None) -> Generator:
        """
        Read node records from a JSON.

        Parameters
        ----------
        filename: str
            The filename to read from
        compression: Optional[str]
            The compression type

        Returns
        -------
        Generator
            A generator for node records
        """
        if compression and compression == 'gz':
            FH = gzip.open(filename, 'rb')
        else:
            FH = open(filename, 'rb')
        for n in ijson.items(FH, 'graphs.item.nodes.item'):
            yield self.read_node(n)

    def read_node(self, node: Dict) -> Dict:
        """
        Read and parse a node record.

        Parameters
        ----------
        node: Dict
            The node record

        Returns
        -------
        Dict
            The processed node
        """
        curie = self.prefix_manager.contract(node['id'])
        node_properties = {}
        if 'meta' in node:
            node_properties = self.parse_meta(node['id'], node['meta'])
        fixed_node = dict()
        fixed_node['id'] = curie
        if 'lbl' in node:
            fixed_node['name'] = node['lbl']
        fixed_node['iri'] = node['id']
        if 'description' in node_properties:
            fixed_node['description'] = node_properties['description']
        if 'synonym' in node_properties:
            fixed_node['synonym'] = node_properties['synonym']
        if 'xrefs' in node_properties:
            fixed_node['xref'] = node_properties['xrefs']
        if 'subsets' in node_properties:
            fixed_node['subsets'] = node_properties['subsets']
        if 'category' not in node:
            category = self.get_category(curie, node)
            if category:
                fixed_node['category'] = [category]
            else:
                fixed_node['category'] = ['biolink:OntologyClass']
        if 'equivalent_nodes' in node_properties:
            equivalent_nodes = node_properties['equivalent_nodes']
            fixed_node['same_as'] = equivalent_nodes
            # for n in node_properties['equivalent_nodes']:
            #     data = {'subject': fixed_node['id'], 'predicate': 'biolink:same_as',
            #             'object': n, 'relation': 'owl:sameAs'}
            #     super().load_node({'id': n, 'category': ['biolink:OntologyClass']})
            #     self.graph.add_edge(fixed_node['id'], n, **data)
        return super().read_node(fixed_node)

    def read_edges(self, filename: str, compression: Optional[str] = None) -> Generator:
        """
        Read edge records from a JSON.

        Parameters
        ----------
        filename: str
            The filename to read from
        compression: Optional[str]
            The compression type

        Returns
        -------
        Generator
            A generator for edge records
        """
        if compression == 'gz':
            FH = gzip.open(filename, 'rb')
        else:
            FH = open(filename, 'rb')
        for e in ijson.items(FH, 'graphs.item.edges.item'):
            yield self.read_edge(e)

    def read_edge(self, edge: Dict) -> Dict:
        """
        Read and parse an edge record.

        Parameters
        ----------
        edge: Dict
            The edge record

        Returns
        -------
        Dict
            The processed edge
        """
        fixed_edge = dict()
        fixed_edge['subject'] = self.prefix_manager.contract(edge['sub'])
        if PrefixManager.is_iri(edge['pred']):
            curie = self.prefix_manager.contract(edge['pred'])
            if curie in self.ecache:
                edge_predicate = self.ecache[curie]
            else:
                element = get_biolink_element(curie)
                if not element:
                    try:
                        mapping = self.toolkit.get_element_by_mapping(edge['pred'])
                        if mapping:
                            element = self.toolkit.get_element(mapping)
                    except ValueError as e:
                        log.error(e)
                if element:
                    edge_predicate = format_biolink_slots(element.name.replace(',', ''))
                    fixed_edge['predicate'] = edge_predicate
                else:
                    edge_predicate = 'biolink:related_to'
                self.ecache[curie] = edge_predicate
            fixed_edge['predicate'] = edge_predicate
            fixed_edge['relation'] = curie
        else:
            if edge['pred'] == 'is_a':
                fixed_edge['predicate'] = 'biolink:subclass_of'
                fixed_edge['relation'] = 'rdfs:subClassOf'
            elif edge['pred'] == 'has_part':
                fixed_edge['predicate'] = 'biolink:has_part'
                fixed_edge['relation'] = "BFO:0000051"
            elif edge['pred'] == 'part_of':
                fixed_edge['predicate'] = 'biolink:part_of'
                fixed_edge['relation'] = "BFO:0000050"
            else:
                fixed_edge['predicate'] = f"biolink:{edge['pred'].replace(' ', '_')}"
                fixed_edge['relation'] = edge['pred']
        fixed_edge['object'] = self.prefix_manager.contract(edge['obj'])
        for x in edge.keys():
            if x not in {'sub', 'pred', 'obj'}:
                fixed_edge[x] = edge[x]
        return super().read_edge(fixed_edge)

    def get_category(self, curie: str, node: dict) -> Optional[str]:
        """
        Get category for a given CURIE.

        Parameters
        ----------
        curie: str
            Curie for node
        node: dict
            Node data

        Returns
        -------
        Optional[str]
            Category for the given node CURIE.
        """
        category = None
        # use meta.basicPropertyValues
        if 'meta' in node and 'basicPropertyValues' in node['meta']:
            for p in node['meta']['basicPropertyValues']:
                if p['pred'] == self.HAS_OBO_NAMESPACE:
                    category = p['val']
                    element = self.toolkit.get_element(category)
                    if element:
                        category = f"biolink:{stringcase.pascalcase(stringcase.snakecase(element.name))}"
                    else:
                        element = self.toolkit.get_element_by_mapping(category)
                        if element:
                            category = f"biolink:{stringcase.pascalcase(stringcase.snakecase(element.name))}"
                        else:
                            category = 'biolink:OntologyClass'
        if not category or category == 'biolink:OntologyClass':
            prefix = PrefixManager.get_prefix(curie)
            # TODO: the mapping should be via biolink-model lookups
            if prefix == 'HP':
                category = "biolink:PhenotypicFeature"
            elif prefix == 'CHEBI':
                category = "biolink:ChemicalSubstance"
            elif prefix == 'MONDO':
                category = "biolink:Disease"
            elif prefix == 'UBERON':
                category = "biolink:AnatomicalEntity"
            elif prefix == 'SO':
                category = "biolink:SequenceFeature"
            elif prefix == 'CL':
                category = "biolink:Cell"
            elif prefix == 'PR':
                category = "biolink:Protein"
            elif prefix == 'NCBITaxon':
                category = "biolink:OrganismalEntity"
            else:
                log.debug(
                    f"{curie} Could not find a category mapping for '{category}'; "
                    f"Defaulting to 'biolink:OntologyClass'"
                )
        return category

    def parse_meta(self, node: str, meta: Dict) -> Dict:
        """
        Parse 'meta' field of a node.

        Parameters
        ----------
        node: str
            Node identifier
        meta: Dict
            meta dictionary for the node

        Returns
        -------
        Dict
            A dictionary that contains 'description', 'synonyms',
            'xrefs', and 'equivalent_nodes'.
        """
        # Cross-species links are in meta; these need to be parsed properly too.
        # Do not put assumptions in code; import as much as possible.
        properties = {}
        if 'definition' in meta:
            # parse 'definition' as 'description'
            description = meta['definition']['val']
            properties['description'] = description
        if 'subsets' in meta:
            # parse 'subsets'
            subsets = meta['subsets']
            properties['subsets'] = [x.split('#')[1] if '#' in x else x for x in subsets]
        if 'synonyms' in meta:
            # parse 'synonyms' as 'synonym'
            synonyms = [s['val'] for s in meta['synonyms']]
            properties['synonym'] = synonyms
        if 'xrefs' in meta:
            # parse 'xrefs' as 'xrefs'
            xrefs = [x['val'] for x in meta['xrefs']]
            properties['xrefs'] = xrefs
        if 'deprecated' in meta:
            # parse 'deprecated' flag
            properties['deprecated'] = meta['deprecated']
        equivalent_nodes = []
        if 'basicPropertyValues' in meta:
            # parse SKOS_EXACT_MATCH entries as 'equivalent_nodes'
            for p in meta['basicPropertyValues']:
                if p['pred'] in {self.SKOS_EXACT_MATCH}:
                    n = self.prefix_manager.contract(p['val'])
                    if not n:
                        n = p['val']
                    equivalent_nodes.append(n)
        properties['equivalent_nodes'] = equivalent_nodes
        return properties
import gzip
from itertools import chain
from typing import Any, Dict, Generator, Optional, Tuple

import ijson
import stringcase
from bmt import Toolkit

# The base class and helpers come from kgx; these import paths are assumptions
# for the kgx version this source was written against.
from kgx.error_detection import ErrorType, MessageLevel
from kgx.prefix_manager import PrefixManager
from kgx.source.json_source import JsonSource
from kgx.utils.kgx_utils import format_biolink_slots, get_biolink_element


class ObographSource(JsonSource):
    """
    ObographSource is responsible for reading data as records
    from an OBO Graph JSON.
    """

    HAS_OBO_NAMESPACE = "http://www.geneontology.org/formats/oboInOwl#hasOBONamespace"
    SKOS_EXACT_MATCH = "http://www.w3.org/2004/02/skos/core#exactMatch"

    def __init__(self, owner):
        super().__init__(owner)
        self.toolkit = Toolkit()
        self.ecache: Dict = {}

    def parse(
        self,
        filename: str,
        format: str = "json",
        compression: Optional[str] = None,
        **kwargs: Any,
    ) -> Generator:
        """
        This method reads from JSON and yields records.

        Parameters
        ----------
        filename: str
            The filename to parse
        format: str
            The format (``json``)
        compression: Optional[str]
            The compression type (``gz``)
        kwargs: Any
            Any additional arguments

        Returns
        -------
        Generator
            A generator for records
        """
        self.set_provenance_map(kwargs)
        n = self.read_nodes(filename, compression)
        e = self.read_edges(filename, compression)
        yield from chain(n, e)

    def read_nodes(self, filename: str, compression: Optional[str] = None) -> Generator:
        """
        Read node records from a JSON.

        Parameters
        ----------
        filename: str
            The filename to read from
        compression: Optional[str]
            The compression type

        Returns
        -------
        Generator
            A generator for node records
        """
        if compression and compression == "gz":
            FH = gzip.open(filename, "rb")
        else:
            FH = open(filename, "rb")
        for n in ijson.items(FH, "graphs.item.nodes.item"):
            yield self.read_node(n)

    def read_node(self, node: Dict) -> Optional[Tuple[str, Dict]]:
        """
        Read and parse a node record.

        Parameters
        ----------
        node: Dict
            The node record

        Returns
        -------
        Optional[Tuple[str, Dict]]
            The processed node
        """
        curie = self.prefix_manager.contract(node["id"])
        node_properties = {}
        if "meta" in node:
            node_properties = self.parse_meta(node["id"], node["meta"])
        fixed_node = dict()
        fixed_node["id"] = curie
        if "lbl" in node:
            fixed_node["name"] = node["lbl"]
        fixed_node["iri"] = node["id"]
        if "description" in node_properties:
            fixed_node["description"] = node_properties["description"]
        if "synonym" in node_properties:
            fixed_node["synonym"] = node_properties["synonym"]
        if "xrefs" in node_properties:
            fixed_node["xref"] = node_properties["xrefs"]
        if "subsets" in node_properties:
            fixed_node["subsets"] = node_properties["subsets"]
        if "category" not in node:
            category = self.get_category(curie, node)
            if category:
                fixed_node["category"] = [category]
            else:
                fixed_node["category"] = ["biolink:OntologyClass"]
        if "equivalent_nodes" in node_properties:
            equivalent_nodes = node_properties["equivalent_nodes"]
            fixed_node["same_as"] = equivalent_nodes
            # for n in node_properties['equivalent_nodes']:
            #     data = {'subject': fixed_node['id'], 'predicate': 'biolink:same_as',
            #             'object': n, 'relation': 'owl:sameAs'}
            #     super().load_node({'id': n, 'category': ['biolink:OntologyClass']})
            #     self.graph.add_edge(fixed_node['id'], n, **data)
        return super().read_node(fixed_node)

    def read_edges(self, filename: str, compression: Optional[str] = None) -> Generator:
        """
        Read edge records from a JSON.

        Parameters
        ----------
        filename: str
            The filename to read from
        compression: Optional[str]
            The compression type

        Returns
        -------
        Generator
            A generator for edge records
        """
        if compression == "gz":
            FH = gzip.open(filename, "rb")
        else:
            FH = open(filename, "rb")
        for e in ijson.items(FH, "graphs.item.edges.item"):
            yield self.read_edge(e)

    def read_edge(self, edge: Dict) -> Optional[Tuple]:
        """
        Read and parse an edge record.

        Parameters
        ----------
        edge: Dict
            The edge record

        Returns
        -------
        Optional[Tuple]
            The processed edge
        """
        fixed_edge = dict()
        fixed_edge["subject"] = self.prefix_manager.contract(edge["sub"])
        if PrefixManager.is_iri(edge["pred"]):
            curie = self.prefix_manager.contract(edge["pred"])
            if curie in self.ecache:
                edge_predicate = self.ecache[curie]
            else:
                element = get_biolink_element(curie)
                if not element:
                    try:
                        mapping = self.toolkit.get_element_by_mapping(edge["pred"])
                        if mapping:
                            element = self.toolkit.get_element(mapping)
                    # TODO: not sure how this exception would be thrown here.. under what conditions?
                    except ValueError as e:
                        self.owner.log_error(
                            entity=str(edge["pred"]),
                            error_type=ErrorType.INVALID_EDGE_PREDICATE,
                            message=str(e),
                        )
                        element = None
                if element:
                    edge_predicate = format_biolink_slots(element.name.replace(",", ""))
                    fixed_edge["predicate"] = edge_predicate
                else:
                    edge_predicate = "biolink:related_to"
                self.ecache[curie] = edge_predicate
            fixed_edge["predicate"] = edge_predicate
            fixed_edge["relation"] = curie
        else:
            if edge["pred"] == "is_a":
                fixed_edge["predicate"] = "biolink:subclass_of"
                fixed_edge["relation"] = "rdfs:subClassOf"
            elif edge["pred"] == "has_part":
                fixed_edge["predicate"] = "biolink:has_part"
                fixed_edge["relation"] = "BFO:0000051"
            elif edge["pred"] == "part_of":
                fixed_edge["predicate"] = "biolink:part_of"
                fixed_edge["relation"] = "BFO:0000050"
            else:
                fixed_edge["predicate"] = f"biolink:{edge['pred'].replace(' ', '_')}"
                fixed_edge["relation"] = edge["pred"]
        fixed_edge["object"] = self.prefix_manager.contract(edge["obj"])
        for x in edge.keys():
            if x not in {"sub", "pred", "obj"}:
                fixed_edge[x] = edge[x]
        return super().read_edge(fixed_edge)

    def get_category(self, curie: str, node: dict) -> Optional[str]:
        """
        Get category for a given CURIE.

        Parameters
        ----------
        curie: str
            Curie for node
        node: dict
            Node data

        Returns
        -------
        Optional[str]
            Category for the given node CURIE.
        """
        category = None
        # use meta.basicPropertyValues
        if "meta" in node and "basicPropertyValues" in node["meta"]:
            for p in node["meta"]["basicPropertyValues"]:
                if p["pred"] == self.HAS_OBO_NAMESPACE:
                    category = p["val"]
                    element = self.toolkit.get_element(category)
                    if element:
                        category = f"biolink:{stringcase.pascalcase(stringcase.snakecase(element.name))}"
                    else:
                        # get_element_by_mapping returns the element name
                        # (a string) in this bmt version
                        element = self.toolkit.get_element_by_mapping(category)
                        if element:
                            category = f"biolink:{stringcase.pascalcase(stringcase.snakecase(element))}"
                        else:
                            category = "biolink:OntologyClass"
        if not category or category == "biolink:OntologyClass":
            prefix = PrefixManager.get_prefix(curie)
            # TODO: the mapping should be via biolink-model lookups
            if prefix == "HP":
                category = "biolink:PhenotypicFeature"
            elif prefix == "CHEBI":
                category = "biolink:ChemicalSubstance"
            elif prefix == "MONDO":
                category = "biolink:Disease"
            elif prefix == "UBERON":
                category = "biolink:AnatomicalEntity"
            elif prefix == "SO":
                category = "biolink:SequenceFeature"
            elif prefix == "CL":
                category = "biolink:Cell"
            elif prefix == "PR":
                category = "biolink:Protein"
            elif prefix == "NCBITaxon":
                category = "biolink:OrganismalEntity"
            else:
                self.owner.log_error(
                    entity=f"{str(category)} for node {curie}",
                    error_type=ErrorType.MISSING_CATEGORY,
                    message="Missing category; Defaulting to 'biolink:OntologyClass'",
                    message_level=MessageLevel.WARNING,
                )
        return category

    def parse_meta(self, node: str, meta: Dict) -> Dict:
        """
        Parse 'meta' field of a node.

        Parameters
        ----------
        node: str
            Node identifier
        meta: Dict
            meta dictionary for the node

        Returns
        -------
        Dict
            A dictionary that contains 'description', 'synonyms',
            'xrefs', and 'equivalent_nodes'.
        """
        # Cross-species links are in meta; these need to be parsed properly too.
        # Do not put assumptions in code; import as much as possible.
        properties = {}
        if "definition" in meta:
            # parse 'definition' as 'description'
            description = meta["definition"]["val"]
            properties["description"] = description
        if "subsets" in meta:
            # parse 'subsets'
            subsets = meta["subsets"]
            properties["subsets"] = [x.split("#")[1] if "#" in x else x for x in subsets]
        if "synonyms" in meta:
            # parse 'synonyms' as 'synonym'
            synonyms = [s["val"] for s in meta["synonyms"]]
            properties["synonym"] = synonyms
        if "xrefs" in meta:
            # parse 'xrefs' as 'xrefs'
            xrefs = [x["val"] for x in meta["xrefs"]]
            properties["xrefs"] = xrefs
        if "deprecated" in meta:
            # parse 'deprecated' flag
            properties["deprecated"] = meta["deprecated"]
        equivalent_nodes = []
        if "basicPropertyValues" in meta:
            # parse SKOS_EXACT_MATCH entries as 'equivalent_nodes'
            for p in meta["basicPropertyValues"]:
                if p["pred"] in {self.SKOS_EXACT_MATCH}:
                    n = self.prefix_manager.contract(p["val"])
                    if not n:
                        n = p["val"]
                    equivalent_nodes.append(n)
        properties["equivalent_nodes"] = equivalent_nodes
        return properties
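# A hedged usage sketch for the owner-aware variant: in kgx this source is
# normally constructed by a Transformer, which supplies the `owner` used for
# error reporting (owner.log_error). The Transformer import path and default
# construction are assumptions; the filename is illustrative.
if __name__ == '__main__':
    from kgx.transformer import Transformer

    transformer = Transformer()
    source = ObographSource(owner=transformer)
    for record in source.parse('go.json'):
        print(record)
        break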
import logging
from functools import reduce

from bmt import Toolkit

# Neo4jHTTPDriver and config are project-local pieces of the hosting
# application; the import paths below are assumptions.
from .neo4j_driver import Neo4jHTTPDriver
from .config import config

logger = logging.getLogger(__name__)


class _GraphInterface:

    def __init__(self, host, port, auth):
        self.driver = Neo4jHTTPDriver(host=host, port=port, auth=auth)
        self.schema = None
        self.summary = None
        self.meta_kg = None
        self.bl_version = config.get('BL_VERSION', '1.5.0')
        self.bl_url = f'https://raw.githubusercontent.com/biolink/biolink-model/{self.bl_version}/biolink-model.yaml'
        self.toolkit = Toolkit(self.bl_url)

    def find_biolink_leaves(self, biolink_concepts: list):
        """
        Given a list of biolink concepts, returns the leaves removing any parent concepts.
        :param biolink_concepts: list of biolink concepts
        :return: leaf concepts.
        """
        ancestry_set = set()
        all_mixins_in_tree = set()
        all_concepts = set(biolink_concepts)
        # Keep track of things like "MacromolecularMachine" in current datasets.
        unknown_elements = set()
        for x in all_concepts:
            current_element = self.toolkit.get_element(x)
            mixins = set()
            if current_element:
                if 'mixins' in current_element and len(current_element['mixins']):
                    for m in current_element['mixins']:
                        mixins.add(self.toolkit.get_element(m).class_uri)
            else:
                unknown_elements.add(x)
            ancestors = set(self.toolkit.get_ancestors(x, reflexive=False, formatted=True))
            ancestry_set = ancestry_set.union(ancestors)
            all_mixins_in_tree = all_mixins_in_tree.union(mixins)
        leaf_set = all_concepts - ancestry_set - all_mixins_in_tree - unknown_elements
        return leaf_set

    def invert_predicate(self, biolink_predicate):
        """ Given a biolink predicate, find its inverse. """
        element = self.toolkit.get_element(biolink_predicate)
        if element is None:
            return None
        # If it's symmetric, the predicate is its own inverse.
        if 'symmetric' in element and element.symmetric:
            return biolink_predicate
        # If it is neither symmetric nor has an inverse, there is nothing to return.
        if 'inverse' not in element or not element['inverse']:
            return None
        # An inverse was found.
        return self.toolkit.get_element(element['inverse']).slot_uri

    def get_schema(self):
        """
        Gets the schema of the graph; also generates the graph summary.
        :return: Dict with source labels as outermost keys, target labels as
        inner keys and lists of predicates as values.
        :rtype: dict
        """
        self.schema_raw_result = {}
        if self.schema is None:
            query = """
            MATCH (a)-[x]->(b)
            WHERE not a:Concept and not b:Concept
            RETURN DISTINCT labels(a) as source_labels, type(x) as predicate, labels(b) as target_labels
            """
            logger.info(f"starting query {query} on graph... this might take a few")
            result = self.driver.run_sync(query)
            logger.info("completed query, preparing initial schema")
            structured = self.convert_to_dict(result)
            self.schema_raw_result = structured
            schema_bag = {}
            # Permute source labels and target labels arrays
            # (a replacement for UNWIND in the previous cypher).
            structured_expanded = []
            for triplet in structured:
                # Some nodes in current data carry just the single label
                # ['biolink:NamedThing']; this filter avoids that scenario.
                # @TODO remove this filter once the data build avoids adding
                # nodes with a lone ['biolink:NamedThing'] label.
                filter_named_thing = lambda x: list(
                    filter(lambda y: y != 'biolink:NamedThing', x))
                source_labels, predicate, target_labels = \
                    self.find_biolink_leaves(filter_named_thing(triplet['source_labels'])), \
                    triplet['predicate'], \
                    self.find_biolink_leaves(filter_named_thing(triplet['target_labels']))
                for source_label in source_labels:
                    for target_label in target_labels:
                        structured_expanded.append({
                            'source_label': source_label,
                            'target_label': target_label,
                            'predicate': predicate
                        })
            structured = structured_expanded
            for triplet in structured:
                subject = triplet['source_label']
                predicate = triplet['predicate']
                objct = triplet['target_label']
                if subject not in schema_bag:
                    schema_bag[subject] = {}
                if objct not in schema_bag[subject]:
                    schema_bag[subject][objct] = []
                if predicate not in schema_bag[subject][objct]:
                    schema_bag[subject][objct].append(predicate)
                # If we invert the order of the nodes we also have to invert the predicate.
                inverse_predicate = self.invert_predicate(predicate)
                if inverse_predicate is not None and \
                        inverse_predicate not in schema_bag.get(objct, {}).get(subject, []):
                    # create the list if empty
                    if objct not in schema_bag:
                        schema_bag[objct] = {}
                    if subject not in schema_bag[objct]:
                        schema_bag[objct][subject] = []
                    schema_bag[objct][subject].append(inverse_predicate)
            self.schema = schema_bag
            logger.info("schema done.")
            if not self.summary:
                query = """
                MATCH (c) RETURN DISTINCT labels(c) as types, count(c) as count
                """
                logger.info(f'generating graph summary: {query}')
                raw = self.convert_to_dict(self.driver.run_sync(query))
                summary = {}
                for node in raw:
                    labels = node['types']
                    count = node['count']
                    query = f"""
                    MATCH (:{':'.join(labels)})-[e]->(b) WITH DISTINCT e, b
                    RETURN type(e) as edge_types, count(e) as edge_counts, labels(b) as target_labels
                    """
                    raw = self.convert_to_dict(self.driver.run_sync(query))
                    summary_key = ':'.join(labels)
                    summary[summary_key] = {'nodes_count': count}
                    for row in raw:
                        target_key = ':'.join(row['target_labels'])
                        edge_name = row['edge_types']
                        edge_count = row['edge_counts']
                        summary[summary_key][target_key] = summary[summary_key].get(target_key, {})
                        summary[summary_key][target_key][edge_name] = edge_count
                self.summary = summary
                logger.info(f'generated summary for {len(summary)} node types.')
        return self.schema

    async def get_mini_schema(self, source_id, target_id):
        """
        Given the id of a source and/or target, returns the predicates that
        relate them, along with their possible labels.
        :param source_id:
        :param target_id:
        :return:
        """
        source_id_syntaxed = f"{{id: \"{source_id}\"}}" if source_id else ''
        target_id_syntaxed = f"{{id: \"{target_id}\"}}" if target_id else ''
        query = f"""
        MATCH (a{source_id_syntaxed})-[x]->(b{target_id_syntaxed})
        WITH
            [la in labels(a) where la <> 'Concept'] as source_label,
            [lb in labels(b) where lb <> 'Concept'] as target_label,
            type(x) as predicate
        RETURN DISTINCT source_label, predicate, target_label
        """
        response = await self.driver.run(query)
        response = self.convert_to_dict(response)
        return response

    async def get_node(self, node_type: str, curie: str) -> list:
        """
        Returns a node that matches curie as its ID.
        :param node_type: Type of the node.
        :type node_type: str
        :param curie: Curie.
        :type curie: str
        :return: value of the node in neo4j.
        :rtype: list
        """
        query = f"MATCH (c:`{node_type}`{{id: '{curie}'}}) return c"
        response = await self.driver.run(query)
        data = response.get('results', [{}])[0].get('data', [])
        '''
        data looks like
        [
            {'row': [{...node data..}], 'meta': [{...}]},
            {'row': [{...node data..}], 'meta': [{...}]},
            {'row': [{...node data..}], 'meta': [{...}]}
        ]
        '''
        rows = []
        if len(data):
            rows = reduce(lambda x, y: x + y.get('row', []), data, [])
        return rows

    async def get_single_hops(self, source_type: str, target_type: str, curie: str) -> list:
        """
        Returns triplets of source to target where the source id is curie.
        :param source_type: Type of the source node.
        :type source_type: str
        :param target_type: Type of target node.
        :type target_type: str
        :param curie: Curie of source node.
        :type curie: str
        :return: list of triplets where each item contains source node, edge, target.
        :rtype: list
        """
        query = f'MATCH (c:`{source_type}`{{id: \'{curie}\'}})-[e]->(b:`{target_type}`) return distinct c, e, b'
        response = await self.driver.run(query)
        rows = list(map(lambda data: data['row'], response['results'][0]['data']))
        query = f'MATCH (c:`{source_type}`{{id: \'{curie}\'}})<-[e]-(b:`{target_type}`) return distinct b, e, c'
        response = await self.driver.run(query)
        rows += list(map(lambda data: data['row'], response['results'][0]['data']))
        return rows

    async def run_cypher(self, cypher: str, **kwargs) -> list:
        """
        Runs cypher directly.
        :param cypher: cypher query.
        :type cypher: str
        :return: unprocessed neo4j response.
        :rtype: list
        """
        return await self.driver.run(cypher, **kwargs)

    async def get_sample(self, node_type):
        """
        Returns a few nodes.
        :param node_type: Type of nodes.
        :type node_type: str
        :return: Node dict values.
        :rtype: dict
        """
        query = f"MATCH (c:{node_type}) return c limit 5"
        response = await self.driver.run(query)
        rows = response['results'][0]['data'][0]['row']
        return rows

    async def get_examples(self, source, target=None):
        """
        Returns an example source node if a target is not specified;
        if a target is specified, a sample one-hop is returned.
        :param source: Node type of the source node.
        :type source: str
        :param target: Node type of the target node.
        :type target: str
        :return: A single source node value if target is not provided;
        a triplet if it is.
        :rtype:
        """
        if target:
            query = f"MATCH (source:{source})-[edge]->(target:{target}) return source, edge, target limit 1"
            response = await self.run_cypher(query)
            final = list(map(lambda data: data['row'], response['results'][0]['data']))
            return final
        else:
            query = f"MATCH ({source}:{source}) return {source} limit 1"
            response = await self.run_cypher(query)
            final = list(map(lambda node: node[source], self.driver.convert_to_dict(response)))
            return final

    def get_curie_prefix_by_node_type(self, node_type):
        query = f"""
        MATCH (n:`{node_type}`) return collect(n.id) as ids
        """
        logger.info(f"starting query {query} on graph... this might take a few")
        result = self.driver.run_sync(query)
        logger.info("completed query, collecting node curie prefixes")
        result = self.convert_to_dict(result)
        curie_prefixes = set()
        for i in result[0]['ids']:
            curie_prefixes.add(i.split(':')[0])
        # sort according to the biolink model
        node_bl_def = self.toolkit.get_element(node_type)
        id_prefixes = node_bl_def.id_prefixes
        # gives precedence to what's in the biolink model
        sorted_curie_prefixes = [i for i in id_prefixes if i in curie_prefixes]
        # then add the other prefixes, even if they are not in the model
        sorted_curie_prefixes += [i for i in curie_prefixes if i not in sorted_curie_prefixes]
        return sorted_curie_prefixes

    async def get_meta_kg(self):
        if self.meta_kg:
            return self.meta_kg
        schema = self.get_schema()
        nodes = {}
        predicates = []
        for subject in schema:
            for object in schema[subject]:
                for edge_type in schema[subject][object]:
                    predicates.append({
                        'subject': subject,
                        'object': object,
                        'predicate': edge_type
                    })
                    if object not in nodes:
                        nodes[object] = {
                            'id_prefixes': list(self.get_curie_prefix_by_node_type(object))
                        }
                    if subject not in nodes:
                        nodes[subject] = {
                            'id_prefixes': list(self.get_curie_prefix_by_node_type(subject))
                        }
        self.meta_kg = {'nodes': nodes, 'edges': predicates}
        return self.meta_kg

    def supports_apoc(self):
        """
        Returns true if apoc is supported by the backend database.
        :return: bool, true if neo4j supports apoc.
        """
        return self.driver.check_apoc_support()

    async def run_apoc_cover(self, ids: list):
        """
        Runs apoc.algo.cover on a list of ids.
        :param ids:
        :return: dictionary of edges and source and target node ids
        """
        query = f"""
        MATCH (node:`biolink:NamedThing`)
        USING INDEX node:`biolink:NamedThing`(id)
        WHERE node.id in {ids}
        WITH collect(node) as nodes
        CALL apoc.algo.cover(nodes) yield rel
        WITH {{subject: startNode(rel).id,
               object: endNode(rel).id,
               predicate: type(rel),
               edge: rel}} as row
        return collect(row) as result
        """
        result = self.convert_to_dict(self.driver.run_sync(query))
        return result

    def convert_to_dict(self, result):
        return self.driver.convert_to_dict(result)
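# A hedged usage sketch (not part of the original module): build the schema
# and the TRAPI-style meta knowledge graph against a live neo4j HTTP endpoint.
# Host, port and credentials are illustrative.
if __name__ == '__main__':
    import asyncio

    graph = _GraphInterface(host='localhost', port='7474', auth=('neo4j', 'password'))
    schema = graph.get_schema()
    meta_kg = asyncio.run(graph.get_meta_kg())
    print(len(schema), len(meta_kg['edges']))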
import bmt  # the bmt version this targets exposes bmt.util.format
from bmt import Toolkit as BMToolkit

# camel_to_snake and snake_to_camel are project-local case helpers;
# the import path below is an assumption.
from .util import camel_to_snake, snake_to_camel


class WrappedBMT:
    """
    Wrapping around some of the BMT Toolkit functions
    to provide case conversions to the new format
    """

    def __init__(self):
        self.bmt = BMToolkit()
        self.all_slots = self.bmt.get_all_slots()
        self.all_slots_formatted = [
            "biolink:" + s.replace(" ", "_") for s in self.all_slots
        ]
        self.prefix = "biolink:"
        self.entity_prefix_mapping = {
            bmt.util.format(el_name, case="pascal"): id_prefixes
            for el_name in self.bmt.get_all_classes()
            if (el := self.bmt.get_element(el_name)) is not None
            if (id_prefixes := getattr(el, "id_prefixes", []))
        }

    def new_case_to_old_case(self, s):
        """
        Convert the new biolink case format (biolink:GeneOrGeneProduct)
        to the old case format (gene or gene product).
        Also works with slots (biolink:related_to -> related to).
        """
        s = s.replace(self.prefix, "")
        # all_slots_formatted entries keep the 'biolink:' prefix,
        # so restore it for the membership test.
        if self.prefix + s in self.all_slots_formatted:
            return s.replace("_", " ")
        else:
            return camel_to_snake(s)

    def old_case_to_new_case(self, s):
        """
        Convert the old case format (gene or gene product)
        to the new biolink case format (biolink:GeneOrGeneProduct).
        Also works with slots (related to -> biolink:related_to).
        """
        if s in self.all_slots:
            return self.prefix + s.replace(" ", "_")
        else:
            return self.prefix + snake_to_camel(s)

    def get_descendants(self, concept):
        """ Wrapped BMT descendants function that does case conversions. """
        descendants = self.bmt.get_descendants(concept, formatted=True)
        if len(descendants) == 0:
            descendants.append(concept)
        return descendants

    def get_ancestors(self, concept, reflexive=True):
        """ Wrapped BMT ancestors function that does case conversions. """
        concept_old_format = self.new_case_to_old_case(concept)
        ancestors_old_format = self.bmt.get_ancestors(concept_old_format, reflexive=reflexive)
        ancestors = [self.old_case_to_new_case(a) for a in ancestors_old_format]
        return ancestors

    def predicate_is_symmetric(self, predicate):
        """ Get whether a given predicate is symmetric. """
        predicate_old_format = self.new_case_to_old_case(predicate)
        predicate_element = self.bmt.get_element(predicate_old_format)
        if not predicate_element:
            # Not in the biolink model
            return False
        return predicate_element.symmetric

    def predicate_inverse(self, predicate):
        """ Get the inverse of a predicate if it has one. """
        predicate_old_format = self.new_case_to_old_case(predicate)
        predicate_element = self.bmt.get_element(predicate_old_format)
        if not predicate_element:
            # Not in the biolink model
            return None
        if predicate_element.symmetric:
            return predicate
        predicate_inverse_old_format = predicate_element.inverse
        if not predicate_inverse_old_format:
            # No inverse
            return None
        return self.old_case_to_new_case(predicate_inverse_old_format)
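# A hedged usage sketch of the case-conversion helpers (not part of the
# original module). Expected outputs depend on the biolink model version
# loaded by the default Toolkit.
if __name__ == '__main__':
    wbmt = WrappedBMT()
    print(wbmt.new_case_to_old_case('biolink:GeneOrGeneProduct'))  # 'gene or gene product'
    print(wbmt.old_case_to_new_case('related to'))                 # 'biolink:related_to'
    print(wbmt.predicate_is_symmetric('biolink:correlated_with'))  # True in recent models
    print(wbmt.predicate_inverse('biolink:treats'))                # e.g. 'biolink:treated_by'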