def test_get_biolink_element():
    """
    Check that a Biolink element can be fetched by name and by CURIE.

    'gene' and 'biolink:Gene' are both looked up; each lookup must return
    something, and the two results must compare equal.
    """
    # TODO: Parameterize
    by_name = get_biolink_element('gene')
    assert by_name is not None
    assert by_name.name == 'gene'

    by_curie = get_biolink_element('biolink:Gene')
    assert by_curie is not None
    assert by_curie == by_name
def get_the_most_specific_category(self, categories: list) -> Tuple[str, list]:
    """
    Pick the most specific category out of ``categories``.

    Every category that resolves to a Biolink element has its ancestors
    fetched; the category with the longest ancestor list wins. The
    comparison is strict, so on a tie the earlier category is kept.

    Parameters
    ----------
    categories: list
        A list of categories

    Returns
    -------
    tuple[str, list]
        The most specific category together with its list of ancestors.
        When no category resolves, this is ``(None, [])``.

    """
    # TODO: could be integrated into update_categories method
    winner = None
    winner_ancestors = []
    for candidate in categories:
        logging.debug("category: {}".format(candidate))
        element = get_biolink_element(candidate)
        if not element:
            # not a Biolink class (or alias of one): nothing to compare
            continue
        lineage = get_biolink_ancestors(element['name'])
        logging.debug("ancestors: {}".format(lineage))
        # a longer ancestor list is taken to mean a more specific category
        if len(lineage) > len(winner_ancestors):
            winner, winner_ancestors = candidate, lineage
    return winner, winner_ancestors
def test_get_biolink_element(query):
    """
    Test to get biolink element.

    ``query[0]`` is the string passed to the lookup and ``query[1]`` is the
    element name the result is expected to carry.
    """
    lookup = query[0]
    resolved = get_biolink_element(lookup)
    assert resolved is not None
    assert resolved.name == query[1]
def check_categories(
    categories: List,
    closure: List,
    category_mapping: Optional[Dict[str, str]] = None,
) -> Tuple[List, List, List]:
    """
    Check categories to ensure whether values in ``categories`` are valid biolink categories.

    Valid biolink categories are classes that descend from 'NamedThing'.
    Mixins, while valid ancestors, are not valid categories.

    Each category is sorted as follows:

    - reported as a mixin by the toolkit, or no Biolink element found:
      *invalid categories*
    - formatted Biolink name is in ``closure``: *valid biolink categories*
    - otherwise, with no ``category_mapping``: *invalid biolink categories*
    - otherwise, with a ``category_mapping``: *invalid biolink categories*
      only if the mapped value (or the category itself when it has no
      mapping entry) is not in ``closure``

    Note that a category rescued by ``category_mapping`` (its mapped value
    is in ``closure``) is not added to any of the three lists.

    Parameters
    ----------
    categories: List
        A list of categories to check
    closure: List
        A list of nodes in a clique
    category_mapping: Optional[Dict[str, str]]
        A map that provides mapping from a non-biolink category to a biolink category

    Returns
    -------
    Tuple[List, List, List]
        A tuple consisting of valid biolink categories, invalid biolink categories, and invalid categories

    """
    valid_biolink_categories = []
    invalid_biolink_categories = []
    invalid_categories = []
    tk = get_toolkit()
    for x in categories:
        # use the toolkit to check if the declared category is actually a mixin.
        if tk.is_mixin(x):
            invalid_categories.append(x)
            continue

        # get biolink element corresponding to category
        element = get_biolink_element(x)
        if not element:
            log.warning(f"category '{x}' is not in Biolink Model")
            invalid_categories.append(x)
            continue

        mapped_category = format_biolink_category(element["name"])
        if mapped_category in closure:
            valid_biolink_categories.append(x)
            continue

        log.warning(f"category '{mapped_category}' not in closure: {closure}")
        if not category_mapping:
            invalid_biolink_categories.append(x)
            continue

        # fall back to the category itself when it has no mapping entry
        mapped = category_mapping.get(x, x)
        if mapped not in closure:
            log.warning(f"category '{mapped_category}' is not in category_mapping.")
            invalid_biolink_categories.append(x)
    return valid_biolink_categories, invalid_biolink_categories, invalid_categories
def check_categories(
    categories: List,
    closure: List,
    category_mapping: Optional[Dict[str, str]] = None,
) -> Tuple[List, List, List]:
    """
    Check categories to ensure whether values in ``categories`` are valid biolink categories.

    Each category is sorted as follows:

    - no Biolink element found: *invalid categories*
    - formatted Biolink name is in ``closure``: *valid biolink categories*
    - otherwise, with no ``category_mapping``: *invalid biolink categories*
    - otherwise, with a ``category_mapping``: *invalid biolink categories*
      only if the mapped value (or the category itself when it has no
      mapping entry) is not in ``closure``

    Note that a category rescued by ``category_mapping`` (its mapped value
    is in ``closure``) is not added to any of the three lists.

    Parameters
    ----------
    categories: List
        A list of categories to check
    closure: List
        A list of nodes in a clique
    category_mapping: Optional[Dict[str, str]]
        A map that provides mapping from a non-biolink category to a biolink category

    Returns
    -------
    Tuple[List, List, List]
        A tuple consisting of valid biolink categories, invalid biolink categories, and invalid categories

    """
    valid_biolink_categories = []
    invalid_biolink_categories = []
    invalid_categories = []
    for x in categories:
        # get biolink element corresponding to category
        element = get_biolink_element(x)
        if not element:
            log.warning(f"category '{x}' is not in Biolink Model")
            invalid_categories.append(x)
            continue

        mapped_category = format_biolink_category(element['name'])
        if mapped_category in closure:
            valid_biolink_categories.append(x)
            continue

        log.warning(f"category '{mapped_category}' not in closure: {closure}")
        if not category_mapping:
            invalid_biolink_categories.append(x)
            continue

        # fall back to the category itself when it has no mapping entry
        mapped = category_mapping.get(x, x)
        if mapped not in closure:
            log.warning(f"category '{mapped_category}' is not in category_mapping.")
            invalid_biolink_categories.append(x)
    return valid_biolink_categories, invalid_biolink_categories, invalid_categories
def update_categories(self, clique: list):
    """
    For a given clique, get category for each node in clique and validate against
    BioLink Model, mapping to BioLink Model category where needed.

    Ex.: If a node has `gene` as its category, then this method adds all of its ancestors.

    For every node the ``category`` attribute is replaced by the longest
    ancestor list found among its categories, and any category that could
    not be validated is recorded under ``_invalid_category``. The updates
    are written to both ``self.clique_graph`` and ``self.target_graph``.

    Parameters
    ----------
    clique: list
        A list of nodes from a clique

    """
    updated_node_categories = {}
    for node in clique:
        data = self.clique_graph.nodes[node]
        # previously a bare print(); keep the diagnostic but route it through logging
        logging.debug("Node data: {}".format(data))
        if 'category' in data:
            categories = data['category']
        else:
            # get category from equivalence
            categories = self.get_category_from_equivalence(node, data)

        extended_categories = set()
        invalid_categories = []
        # first pass: find the longest ancestor closure among the node's categories
        for category in categories:
            logging.debug("Looking at category: {}".format(category))
            element = get_biolink_element(category)
            if element:
                # category exists in BioLink Model as a class or as an alias to a class
                mapped_category = element['name']
                ancestors = get_biolink_ancestors(mapped_category)
                if len(ancestors) > len(extended_categories):
                    # the category with the longest list of ancestors will be the most specific category
                    logging.debug("Ancestors for {} is larger than previous one".format(mapped_category))
                    extended_categories = ancestors
            else:
                logging.warning("[1] category '{}' not in BioLink Model".format(category))
                invalid_categories.append(category)
        logging.debug("Invalid categories: {}".format(invalid_categories))

        # second pass: flag categories that fall outside the chosen closure
        for x in categories:
            element = get_biolink_element(x)
            if element is None:
                # already recorded as invalid by the first pass
                logging.warning("[2] category '{}' is not in BioLink Model".format(x))
                continue
            mapped_category = element['name']
            if mapped_category not in extended_categories:
                logging.warning("category '{}' not in ancestor closure: {}".format(mapped_category, extended_categories))
                # fall back to the category itself when MAPPING has no entry for it
                mapped = MAPPING.get(x, x)
                if mapped not in extended_categories:
                    logging.warning("category '{}' is not even in any custom defined mapping. ".format(mapped_category))
                    invalid_categories.append(x)

        update_dict = {'category': extended_categories}
        if invalid_categories:
            update_dict['_invalid_category'] = invalid_categories
        updated_node_categories[node] = update_dict

    logging.debug("Updating nodes in clique with: {}".format(updated_node_categories))
    nx.set_node_attributes(self.clique_graph, updated_node_categories)
    nx.set_node_attributes(self.target_graph, updated_node_categories)
def read_edge(self, edge: Dict) -> Dict:
    """
    Read and parse an edge record.

    The ``sub``/``pred``/``obj`` keys of the record are translated into
    ``subject``/``predicate``/``relation``/``object``; every other key is
    copied over unchanged. The result is handed to the parent class's
    ``read_edge``.

    Parameters
    ----------
    edge: Dict
        The edge record

    Returns
    -------
    Dict
        The processed edge

    """
    normalized = {'subject': self.prefix_manager.contract(edge['sub'])}
    raw_predicate = edge['pred']

    if PrefixManager.is_iri(raw_predicate):
        curie = self.prefix_manager.contract(raw_predicate)
        if curie in self.ecache:
            predicate = self.ecache[curie]
        else:
            element = get_biolink_element(curie)
            if not element:
                # no direct hit: try to reach an element through a mapping of the IRI
                try:
                    mapping = self.toolkit.get_element_by_mapping(raw_predicate)
                    if mapping:
                        element = self.toolkit.get_element(mapping)
                except ValueError as e:
                    log.error(e)
            predicate = (
                format_biolink_slots(element.name.replace(',', ''))
                if element
                else 'biolink:related_to'
            )
            # remember the outcome so the same CURIE is resolved only once
            self.ecache[curie] = predicate
        relation = curie
    else:
        # non-IRI predicates: a few fixed translations, else a derived biolink slot
        fixed_predicates = {
            'is_a': ('biolink:subclass_of', 'rdfs:subClassOf'),
            'has_part': ('biolink:has_part', "BFO:0000051"),
            'part_of': ('biolink:part_of', "BFO:0000050"),
        }
        if raw_predicate in fixed_predicates:
            predicate, relation = fixed_predicates[raw_predicate]
        else:
            predicate = f"biolink:{raw_predicate.replace(' ', '_')}"
            relation = raw_predicate

    normalized['predicate'] = predicate
    normalized['relation'] = relation
    normalized['object'] = self.prefix_manager.contract(edge['obj'])

    consumed = {'sub', 'pred', 'obj'}
    normalized.update(
        (key, value) for key, value in edge.items() if key not in consumed
    )
    return super().read_edge(normalized)
def read_edge(self, edge: Dict) -> Optional[Tuple]:
    """
    Read and parse an edge record.

    The ``sub``/``pred``/``obj`` keys of the record are translated into
    ``subject``/``predicate``/``relation``/``object``; every other key is
    copied over unchanged. IRI predicates are resolved to a Biolink slot
    (cached per CURIE in ``self.ecache``), falling back to
    ``biolink:related_to`` when nothing matches.

    Parameters
    ----------
    edge: Dict
        The edge record

    Returns
    -------
    Optional[Tuple]
        Whatever the parent class's ``read_edge`` returns for the
        normalized edge record

    """
    fixed_edge = dict()
    fixed_edge["subject"] = self.prefix_manager.contract(edge["sub"])
    if PrefixManager.is_iri(edge["pred"]):
        curie = self.prefix_manager.contract(edge["pred"])
        if curie in self.ecache:
            edge_predicate = self.ecache[curie]
        else:
            element = get_biolink_element(curie)
            if not element:
                # no direct hit: try to reach an element through a mapping of the IRI
                try:
                    mapping = self.toolkit.get_element_by_mapping(edge["pred"])
                    if mapping:
                        element = self.toolkit.get_element(mapping)
                # TODO: not sure how this exception would be thrown here.. under what conditions?
                except ValueError as e:
                    self.owner.log_error(
                        entity=str(edge["pred"]),
                        error_type=ErrorType.INVALID_EDGE_PREDICATE,
                        message=str(e),
                    )
                    element = None

            if element:
                edge_predicate = format_biolink_slots(element.name.replace(",", ""))
            else:
                edge_predicate = "biolink:related_to"
            # remember the outcome so the same CURIE is resolved only once
            self.ecache[curie] = edge_predicate
        fixed_edge["predicate"] = edge_predicate
        fixed_edge["relation"] = curie
    else:
        # non-IRI predicates: a few fixed translations, else a derived biolink slot
        if edge["pred"] == "is_a":
            fixed_edge["predicate"] = "biolink:subclass_of"
            fixed_edge["relation"] = "rdfs:subClassOf"
        elif edge["pred"] == "has_part":
            fixed_edge["predicate"] = "biolink:has_part"
            fixed_edge["relation"] = "BFO:0000051"
        elif edge["pred"] == "part_of":
            fixed_edge["predicate"] = "biolink:part_of"
            fixed_edge["relation"] = "BFO:0000050"
        else:
            fixed_edge["predicate"] = f"biolink:{edge['pred'].replace(' ', '_')}"
            fixed_edge["relation"] = edge["pred"]

    fixed_edge["object"] = self.prefix_manager.contract(edge["obj"])
    # carry over every remaining property of the record untouched
    for key, value in edge.items():
        if key not in {"sub", "pred", "obj"}:
            fixed_edge[key] = value
    return super().read_edge(fixed_edge)