def test_prefixes():
    """Check default-map contraction/expansion of the ``bp_id``/``bp_iri`` pair.

    Also verifies that a string with no known prefix contracts to an empty
    list in non-strict mode and raises ``NoPrefix`` in strict mode.
    """
    assert contract_uri(bp_iri) == [bp_id]
    assert expand_uri(bp_id) == bp_iri
    assert contract_uri("FAKE", strict=False) == []

    # Strict mode has to signal the unknown prefix with NoPrefix.
    no_prefix_raised = False
    try:
        contract_uri("FAKE", strict=True)
    except NoPrefix:
        no_prefix_raised = True
    assert no_prefix_raised
def contract(uri) -> str:
    """
    Contract a URI to a CURIE using the ``cmaps`` prefix maps in scope.

    When several CURIEs match, the lexicographically smallest one is
    returned so that the same item is chosen every time. Returns ``None``
    when nothing matches.
    """
    candidates = contract_uri(str(uri), cmaps=cmaps)
    if not candidates:
        return None
    # The smallest element is exactly what sorting and taking index 0 yields.
    return min(candidates)
def test_prefixes_cmaps():
    """Check contraction/expansion when explicit prefix maps are supplied.

    Two overlapping maps (GO and OBO) are given, so the shortest-only
    contraction yields one CURIE while ``shortest=False`` yields both.
    """
    prefix_maps = [
        {'GO': 'http://purl.obolibrary.org/obo/GO_'},
        {'OBO': 'http://purl.obolibrary.org/obo/'},
    ]

    assert contract_uri(bp_iri, prefix_maps) == [bp_id]

    # With shortest=False both the GO-based and the OBO-based CURIE come back.
    every_curie = contract_uri(bp_iri, prefix_maps, shortest=False)
    assert len(every_curie) == 2
    assert obo_bp_id in every_curie
    assert bp_id in every_curie

    assert expand_uri(bp_id, prefix_maps) == bp_iri
    assert expand_uri(obo_bp_id, prefix_maps) == bp_iri
    assert contract_uri("FAKE", prefix_maps, strict=False) == []

    # Strict mode has to signal the unknown prefix with NoPrefix.
    no_prefix_raised = False
    try:
        contract_uri("FAKE", prefix_maps, strict=True)
    except NoPrefix:
        no_prefix_raised = True
    assert no_prefix_raised
def get_descendants(
        graph: Graph,
        node: str,
        edge: Optional[URIRef] = RDFS['subClassOf'],
        reflexive: Optional[bool] = True
) -> Set[str]:
    """Collect the CURIEs of all subjects that reach ``node`` via ``edge``.

    :param graph: graph to traverse
    :param node: CURIE of the starting node; expanded strictly to an IRI
    :param edge: predicate followed transitively (``rdfs:subClassOf`` by
        default)
    :param reflexive: when false, the starting node itself is left out
    :return: set of CURIEs (first strict contraction of each IRI);
        ``Literal`` subjects are ignored
    """
    start = URIRef(expand_uri(node, strict=True))
    descendants = set()
    for subject in graph.transitive_subjects(edge, start):
        is_excluded_self = not reflexive and start == subject
        if is_excluded_self or isinstance(subject, Literal):
            continue
        descendants.add(contract_uri(str(subject), strict=True)[0])
    return descendants
def contract_uri(self, iri) -> str:
    """Contract a given IRI.

    Contract a given IRI, with special parsing and transformations
    depending on the nature of the IRI. The bare ``contract_uri`` calls in
    the body resolve to the module-level helper, not to this method.

    Args:
        iri: IRI as string

    Returns:
        str: the CURIE; the input unchanged when it is an IRI that cannot
        be contracted or is already a CURIE; otherwise the input prefixed
        with ``:``.

    """
    if 'http://www.genenames.org/cgi-bin/gene_symbol_report?match=' in iri:
        # Gene symbol report links: the symbol is the text after the last '='.
        symbol = iri.split('=')[-1]
        if symbol in self.gene_info_map:
            return f"NCBIGene:{self.gene_info_map[symbol]['NCBI']}"
        # Single-element unpacking: raises ValueError unless exactly one
        # CURIE comes back from the custom map.
        [only_match] = contract_uri(iri, cmaps=[CUSTOM_CMAP])
        return only_match

    if self.is_iri(iri):
        # Default prefix maps first, then the custom map, then give up.
        matches = contract_uri(iri)
        if matches:
            return matches[0]
        matches = contract_uri(iri, cmaps=[CUSTOM_CMAP])
        if matches:
            return matches[0]
        return iri

    if self.is_curie(iri):
        return iri

    return f":{iri}"
def get_ancestors(
        graph: Graph,
        node: str,
        edge: Optional[URIRef] = RDFS['subClassOf'],
        root: Optional[str] = None,
        reflexive: Optional[bool] = True
) -> Set[str]:
    """Collect the CURIEs of all objects reachable from ``node`` via ``edge``.

    :param graph: graph to traverse
    :param node: CURIE of the starting node; expanded strictly to an IRI
    :param edge: predicate followed transitively (``rdfs:subClassOf`` by
        default)
    :param root: optional CURIE; its IRI is pre-seeded into the mapping
        handed to ``transitive_objects`` (presumably so the traversal does
        not continue past it -- confirm against rdflib) and its CURIE is
        always part of the result
    :param reflexive: when false, the starting node itself is left out
    :return: set of CURIEs (first strict contraction of each IRI);
        ``Literal`` and ``BNode`` objects are ignored
    """
    start = URIRef(expand_uri(node, strict=True))

    if root is None:
        already_seen = {}
    else:
        root = URIRef(expand_uri(root, strict=True))
        already_seen = {root: 1}

    ancestors = set()
    for candidate in graph.transitive_objects(start, edge, already_seen):
        if isinstance(candidate, (Literal, BNode)):
            continue
        if not reflexive and start == candidate:
            continue
        ancestors.add(contract_uri(str(candidate), strict=True)[0])

    # The root is always part of the result when one was supplied.
    if root is not None:
        ancestors.add(contract_uri(str(root), strict=True)[0])

    return ancestors
def get_leaf_nodes(
        graph: Graph,
        node: str,
        edge: Optional[URIRef] = RDFS['subClassOf']
) -> Set[str]:
    """Yield the CURIEs of the leaves found below ``node`` along ``edge``.

    Note that this is a generator (it yields CURIEs one at a time) despite
    the ``Set[str]`` annotation. A node counts as a leaf when the graph has
    no subject pointing at it through ``edge``.

    :param graph: graph to traverse
    :param node: CURIE string, or an already-built ``URIRef`` (as passed by
        the recursive calls)
    :param edge: predicate to follow (``rdfs:subClassOf`` by default)
    """
    target = node if isinstance(node, URIRef) else URIRef(expand_uri(node, strict=True))
    children = list(graph.subjects(edge, target))

    if children:
        # Not a leaf: recurse into every direct child.
        for child in children:
            yield from get_leaf_nodes(graph, child, edge)
    else:
        yield contract_uri(str(target), strict=True)[0]
def shorten_iri_to_curie(iri: str, curie_to_iri_map: list = None):
    """Shorten an IRI to a CURIE using the supplied prefix maps.

    :param iri: the IRI to shorten; strings that already start with
        ``owl:`` or ``OIO:`` are returned unchanged
    :param curie_to_iri_map: list of prefix maps forwarded to
        ``prefixcommons.contract_uri``; when omitted an empty list is used
    :return: the CURIE, or ``None`` when no prefix map matches
    :raises AssertionError: if contraction yields more than one CURIE
    """
    if iri.startswith(('owl:', 'OIO:')):
        return iri

    # Use a fresh list per call instead of a shared mutable default. An
    # empty list (not None) is what gets forwarded, exactly as before.
    if curie_to_iri_map is None:
        curie_to_iri_map = []

    # hack for fixing issue #410
    iri = iri.replace("/GO/GO%3A", "/GO/")
    # hack for fixing issue #665
    iri = iri.replace("/HPO/HP%3A", "/HP/")

    curie_list = prefixcommons.contract_uri(iri, curie_to_iri_map)
    assert len(curie_list) in [0, 1]
    if len(curie_list) != 1:
        return None

    curie_id = curie_list[0]
    # deal with IRIs like 'https://identifiers.org/umls/ATC/L01AX02' which
    # get converted to CURIE 'UMLS:ATC/L01AX02'
    umls_match = REGEX_UMLS_CURIE.match(curie_id)
    if umls_match is not None:
        curie_id = umls_match[1] + ':' + umls_match[2]
    return curie_id
def contract(uri) -> str:
    """
    Contract a URI to a CURIE.

    Candidate CURIEs are ordered lexicographically and the first one is
    taken, so the same item is chosen every time.

    Parameters
    ----------
    uri: Union[rdflib.term.URIRef, str]
        A URI

    Returns
    -------
    str
        The CURIE, or ``None`` when the URI cannot be contracted

    """
    ordered = sorted(contract_uri(str(uri), cmaps=cmaps))
    return ordered[0] if ordered else None
def shorten_iri_to_curie(iri: str, curie_to_iri_map: list) -> str:
    """Shorten an IRI to a CURIE using the supplied prefix maps.

    :param iri: the IRI to shorten; must not be ``None``
    :param curie_to_iri_map: list of prefix maps forwarded to
        ``prefixcommons.contract_uri``
    :return: the single matching CURIE, or ``None`` when nothing matches
    :raises ValueError: if ``iri`` is ``None``
    :raises AssertionError: if contraction yields more than one CURIE

    The UMLS CURIE rewrite based on ``REGEX_UMLS_CURIE`` is not applied in
    this version, so CURIEs such as 'UMLS:ATC/L01AX02' are returned as-is.
    """
    if iri is None:
        raise ValueError('cannot shorten an IRI with value None')

    matches = prefixcommons.contract_uri(iri, curie_to_iri_map)
    match_count = len(matches)

    if match_count == 0:
        return None
    if match_count == 1:
        return matches[0]

    # More than one CURIE is unexpected. With assertions disabled the
    # function falls through and returns None.
    assert False, "somehow got a list after calling prefixcommons.contract on URI: " + iri + "; list is: " + str(
        matches)
    return None
def _process_hpo_data(file_path: str) -> Dict[str, List[str]]:
    """Map disease-to-phenotype rows onto MONDO ids and merge their metadata.

    The Mondo ontology is loaded from the relative path
    ``../data/mondo.owl.gz``. ``file_path`` is then read as tab-separated
    rows whose first nine columns are unpacked as
    ``db, num, name, severity, pheno_id, publist, eco, onset, freq``.

    A row is skipped when:

    * it has fewer than nine columns,
    * no MONDO class is ``owl:equivalentClass`` to the row's disease,
    * the MONDO CURIE contains ``hgnc``,
    * the disease is an Orphanet id with no OMIM equivalent whose MONDO
      class has at least one ``rdfs:subClassOf`` object (treated as a
      disease group).

    :param file_path: local path, or a URL starting with ``http`` which is
        fetched with ``requests``
    :return: dict keyed by ``"<mondo_curie>-<pheno_id>"`` with values
        ``[onset, freq, severity]``. When the same key occurs with
        different values, empty fields are filled from later rows and
        real conflicts keep the first-inserted value.
    """
    logger.info("loading mondo into memory")
    mondo = Graph()
    # Close the gzip handle once parsing is done; it used to be left open.
    with gzip.open("../data/mondo.owl.gz", 'rb') as mondo_file:
        mondo.parse(mondo_file, format='xml')
    logger.info("finished loading mondo")

    # Tuples of (mondo_curie, pheno_id, onset, freq, severity)
    mondo_merged_lines: List[tuple] = []
    disease_info: Dict[str, List[str]] = {}

    if file_path.startswith("http"):
        context_manager = closing(requests.get(file_path))
    else:
        context_manager = open(file_path, "r")

    # https://stackoverflow.com/a/35371451
    with context_manager as file:
        if file_path.startswith("http"):
            # For the HTTP case, feed the decoded body to csv line by line
            file = file.content.decode('utf-8').splitlines()
        reader = csv.reader(file, delimiter='\t', quotechar='\"')
        counter = 0
        for row in reader:
            try:
                (db, num, name, severity, pheno_id, publist, eco, onset, freq) = row[0:9]
            except ValueError:
                logger.warning("Too few values in row {}".format(row))
                continue

            # Align Id prefixes
            if db == 'MIM':
                db = 'OMIM'
            if db == 'ORPHA':
                db = 'Orphanet'
            if db == 'ORPHANET':
                db = 'Orphanet'

            disease_id = "{}:{}".format(db, num)
            disease_iri = URIRef(expand_uri(disease_id, strict=True))

            # Take the first MONDO class declared equivalent to this disease
            mondo_curie = None
            mondo_iri = None
            for subj in mondo.subjects(OWL['equivalentClass'], disease_iri):
                curie = contract_uri(str(subj), strict=True)[0]
                if curie.startswith('MONDO'):
                    mondo_curie = curie
                    mondo_iri = subj
                    break
            if mondo_curie is None:
                # Logger.warn is deprecated; warning() emits the same record
                logger.warning("No mondo id for {}".format(disease_id))
                continue

            # Does the MONDO class have any OMIM equivalent?
            has_omim = False
            for obj in mondo.objects(mondo_iri, OWL['equivalentClass']):
                try:
                    curie = contract_uri(str(obj), strict=True)[0]
                except NoPrefix:
                    continue
                if curie.startswith('OMIM'):
                    has_omim = True

            # use scigraph instead of the above
            # mondo_node = monarch.get_clique_leader(disease_id)
            # mondo_curie = mondo_node['id']

            if mondo_curie is not None and 'hgnc' in mondo_curie:
                # to keep these, likely decipher IDs
                # mondo_curie = disease_id
                continue

            if disease_id.startswith('Orphanet') \
                    and has_omim is False \
                    and len(list(mondo.objects(mondo_iri, RDFS['subClassOf']))) > 0:
                # disease is a disease group, skip
                logger.info(
                    "{} is a disease group, skipping".format(disease_id))
                continue

            mondo_merged_lines.append(
                (mondo_curie, pheno_id, onset, freq, severity))

            counter += 1
            if counter % 10000 == 0:
                logger.info("processed {} rows".format(counter))

    logger.info("processed {} rows".format(counter))

    for line in mondo_merged_lines:
        key = "{}-{}".format(line[0], line[1])
        values = [line[2], line[3], line[4]]
        if key in disease_info and disease_info[key] != values:
            logger.warning("Metadata for {} and {} mismatch: {} vs {}".format(
                line[0], line[1], values, disease_info[key]))
            # attempt to merge by collapsing freq, onset, severity
            # that is empty in one disease but not another
            # conflicts will defer to the disease first inserted
            # (merged_disease_info aliases the stored list, so updates
            # below are applied in place)
            merged_disease_info = disease_info[key]
            for index, val in enumerate(values):
                if val == disease_info[key][index] \
                        or val == '' and disease_info[key][index] != '':
                    continue
                elif val != '' and disease_info[key][index] == '':
                    merged_disease_info[index] = val
                else:
                    logger.warning("Cannot merge {} and {} for {}".format(
                        values, disease_info[key], line[0]))
        else:
            disease_info[key] = values

    return disease_info