Exemplo n.º 1
0
def expand_curie_to_iri(curie_id: str, curie_to_iri_map: list) -> Optional[str]:
    """
    Expand a CURIE to an IRI using the supplied prefix maps.

    CURIEs starting with ``UMLS:CN`` are rewritten to ``medgen:CN`` before
    expansion (see GitHub issue 810).

    :param curie_id: the CURIE to expand
    :param curie_to_iri_map: prefix maps handed to ``prefixcommons.expand_uri``
    :return: the expanded IRI, or ``None`` when ``expand_uri`` hands back the
             (possibly rewritten) CURIE unchanged
    """
    if curie_id.startswith('UMLS:CN'):
        lookup_id = curie_id.replace('UMLS:CN', 'medgen:CN')
    else:
        lookup_id = curie_id
    expanded = prefixcommons.expand_uri(lookup_id, curie_to_iri_map)
    # An unchanged result means no prefix in the map matched.
    return None if expanded == lookup_id else expanded
Exemplo n.º 2
0
def label(curie: str, graph: Graph) -> str:
    """
    Look up the label of a term in ``graph``.

    The CURIE is expanded to an IRI in strict mode, wrapped in a ``URIRef``
    and passed to ``graph.label``; whatever that call returns is returned.
    """
    term_iri = URIRef(expand_uri(curie, strict=True))
    return graph.label(term_iri)
Exemplo n.º 3
0
def get_node_curie_id_from_ontology_node_id(ontology_node_id: str,
                                            ontology: ontobio.ontol.Ontology,
                                            uri_to_curie_shortener: callable):
    """
    Derive the CURIE ID to use for a node from its ID in the ontology.

    Handling depends on the shape of ``ontology_node_id``:

    * ``http:`` / ``https:`` IRI: shortened with ``uri_to_curie_shortener``;
      if that yields ``None`` a message is logged to stderr and the IRI
      itself is used as the ID.
    * ``OBO:`` CURIE: expanded with ``prefixcommons.expand_uri`` and the
      result is shortened again (which may yield ``None``).
    * ``UMLS:C...`` CURIE: re-prefixed with ``CUI_PREFIX``.
    * anything else: used as-is.

    :param ontology_node_id: node identifier as found in the ontology
    :param ontology: the source ontology; only its ``id`` is read, for logging
    :param uri_to_curie_shortener: callable mapping an IRI to a CURIE or None
    :return: the node's CURIE ID
    """
    is_iri = ontology_node_id.startswith(('http:', 'https:'))

    if is_iri:
        node_curie_id = uri_to_curie_shortener(ontology_node_id)
        if node_curie_id is None:
            kg2_util.log_message(
                message="could not shorten this IRI to a CURIE",
                ontology_name=ontology.id,
                node_curie_id=ontology_node_id,
                output_stream=sys.stderr)
            # Fall back to the raw IRI so the node still has an identifier.
            node_curie_id = ontology_node_id
    elif ontology_node_id.startswith('OBO:'):
        expanded_iri = prefixcommons.expand_uri(ontology_node_id)
        node_curie_id = uri_to_curie_shortener(expanded_iri)
    elif ontology_node_id.startswith('UMLS:C'):
        local_part = ontology_node_id.split('UMLS:')[1]
        node_curie_id = CUI_PREFIX + ':' + local_part
    else:
        node_curie_id = ontology_node_id

    # Ensure that all CUI CURIE IDs use the "CUI:" prefix (part of fix for issue #565)
    if is_cui_id(node_curie_id):
        if get_prefix_from_curie_id(node_curie_id) != CUI_PREFIX:
            node_curie_id = CUI_PREFIX + ":" + get_local_id_from_curie_id(
                node_curie_id)

    return node_curie_id
def test_prefixes_cmaps():
    """
    Check contract_uri / expand_uri when explicit prefix maps are supplied.

    Two overlapping maps (GO and OBO) are used, so the same IRI contracts to
    two CURIEs when ``shortest=False``.
    """
    go_map = {'GO': 'http://purl.obolibrary.org/obo/GO_'}
    obo_map = {'OBO': 'http://purl.obolibrary.org/obo/'}
    cmaps = [go_map, obo_map]

    # Default contraction returns a single CURIE.
    assert contract_uri(bp_iri, cmaps) == [bp_id]

    every_curie = contract_uri(bp_iri, cmaps, shortest=False)
    assert len(every_curie) == 2
    for expected_curie in (obo_bp_id, bp_id):
        assert expected_curie in every_curie

    # Both CURIE forms expand back to the same IRI.
    for curie in (bp_id, obo_bp_id):
        assert expand_uri(curie, cmaps) == bp_iri

    # Non-strict contraction of an unknown string yields an empty list.
    assert contract_uri("FAKE", cmaps, strict=False) == []

    # Strict contraction of an unknown string must raise NoPrefix.
    raised_no_prefix = False
    try:
        contract_uri("FAKE", cmaps, strict=True)
    except NoPrefix:
        raised_no_prefix = True
    assert raised_no_prefix
def test_prefixes():
    """Check contract_uri / expand_uri using the default prefix maps."""
    assert contract_uri(bp_iri) == [bp_id]
    assert expand_uri(bp_id) == bp_iri

    # Non-strict contraction of an unknown string yields an empty list.
    assert contract_uri("FAKE", strict=False) == []

    # Strict contraction of an unknown string must raise NoPrefix.
    raised_no_prefix = False
    try:
        contract_uri("FAKE", strict=True)
    except NoPrefix:
        raised_no_prefix = True
    assert raised_no_prefix
Exemplo n.º 6
0
def get_descendants(graph: Graph,
                    node: str,
                    edge: Optional[URIRef] = RDFS['subClassOf'],
                    reflexive: Optional[bool] = True) -> Set[str]:
    """
    Collect the CURIEs of everything ``graph.transitive_subjects`` yields
    for ``node`` over ``edge``.

    :param graph: graph to traverse
    :param node: CURIE of the starting term (expanded in strict mode)
    :param edge: predicate to follow, rdfs:subClassOf by default
    :param reflexive: when False, results equal to the starting node are
                      dropped
    :return: set of CURIEs; Literal results are skipped
    """
    start = URIRef(expand_uri(node, strict=True))
    return {
        contract_uri(str(subject), strict=True)[0]
        for subject in graph.transitive_subjects(edge, start)
        if (reflexive or not (start == subject))
        and not isinstance(subject, Literal)
    }
Exemplo n.º 7
0
def get_ancestors(graph: Graph,
                  node: str,
                  edge: Optional[URIRef] = RDFS['subClassOf'],
                  root: Optional[str] = None,
                  reflexive: Optional[bool] = True) -> Set[str]:
    """
    Collect the CURIEs of everything ``graph.transitive_objects`` yields
    for ``node`` over ``edge``.

    :param graph: graph to traverse
    :param node: CURIE of the starting term (expanded in strict mode)
    :param edge: predicate to follow, rdfs:subClassOf by default
    :param root: optional CURIE; when given, its IRI is pre-seeded into the
                 mapping passed as the third argument of
                 ``transitive_objects`` (presumably so traversal does not
                 continue past it - confirm against rdflib) and the root
                 itself is always added to the result
    :param reflexive: when False, results equal to the starting node are
                      dropped
    :return: set of CURIEs; Literal and BNode results are skipped
    """
    start = URIRef(expand_uri(node, strict=True))

    if root is None:
        root_iri = None
        already_seen = {}
    else:
        root_iri = URIRef(expand_uri(root, strict=True))
        already_seen = {root_iri: 1}

    ancestors = {
        contract_uri(str(obj), strict=True)[0]
        for obj in graph.transitive_objects(start, edge, already_seen)
        if not isinstance(obj, (Literal, BNode))
        and (reflexive or not (start == obj))
    }

    # The root was pre-seeded as seen, so add it to the result explicitly.
    if root_iri is not None:
        ancestors.add(contract_uri(str(root_iri), strict=True)[0])

    return ancestors
Exemplo n.º 8
0
def get_leaf_nodes(graph: Graph,
                   node: str,
                   edge: Optional[URIRef] = RDFS['subClassOf']) -> Set[str]:
    """
    Yield the CURIEs of the leaf terms found below ``node``.

    A term is a leaf when ``graph.subjects(edge, term)`` is empty.  Note that
    despite the ``Set[str]`` annotation this is a generator, and no
    de-duplication is done, so a leaf reachable along several paths may be
    yielded more than once.

    :param graph: graph to traverse
    :param node: CURIE of the starting term, or a ``URIRef`` (used on the
                 recursive calls)
    :param edge: predicate to follow, rdfs:subClassOf by default
    """
    current = node if isinstance(node, URIRef) else URIRef(
        expand_uri(node, strict=True))

    children = list(graph.subjects(edge, current))
    if not children:
        # Nothing points at this term via `edge`: it is a leaf.
        yield contract_uri(str(current), strict=True)[0]
        return

    for child in children:
        yield from get_leaf_nodes(graph, child, edge)
0
def make_rel(preds_dict: dict, subject_curie: str, object_curie: str,
             predicate: str, pmid: str, pub_date: str, sentence: str,
             subject_score: str, object_score: str, negated: bool):
    """
    Record one publication-backed assertion in ``preds_dict``.

    Edges are keyed by "<subject>-<predicate>-<object>".  The first time a
    key is seen a new edge is built with ``kg2_util.make_edge`` and stored;
    on later calls the publication is added to the stored edge's
    'publications' list and 'publications info' dict.

    :param preds_dict: accumulator of edges, modified in place
    :param subject_curie: CURIE of the edge subject
    :param object_curie: CURIE of the edge object
    :param predicate: predicate name; lower-cased to form the relation type
    :param pmid: PubMed identifier, stored as "PMID:<pmid>"
    :param pub_date: stored under 'publication date'
    :param sentence: stored under 'sentence'
    :param subject_score: stored under 'subject score'
    :param object_score: stored under 'object score'
    :param negated: stored on newly created edges only
    :return: None
    """
    key = '-'.join((subject_curie, predicate, object_curie))
    existing_edge = preds_dict.get(key, None)
    publication_curie = 'PMID:' + pmid
    publication_info_dict = {
        'publication date': pub_date,
        'sentence': sentence,
        'subject score': subject_score,
        'object score': object_score
    }

    if existing_edge is not None:
        # Known triple: just attach this publication to it.
        existing_edge['publications info'][
            publication_curie] = publication_info_dict
        existing_edge['publications'] = existing_edge['publications'] + [
            publication_curie
        ]
        return

    relation_type = predicate.lower()
    if relation_type == 'xref':
        relation_curie = 'OBO:xref'
        relation_iri = prefixcommons.expand_uri(relation_curie)
    else:
        # Build a lowerCamelCase fragment under the SemMedDB IRI.
        camel_cased = kg2_util.convert_snake_case_to_camel_case(
            relation_type.replace(' ', '_'))
        fragment = camel_cased[0].lower() + camel_cased[1:]
        relation_iri = SEMMEDDB_IRI + '#' + fragment
        relation_curie = 'SEMMEDDB:' + relation_type

    new_edge = kg2_util.make_edge(subject_curie, object_curie, relation_iri,
                                  relation_curie, relation_type, SEMMEDDB_IRI,
                                  curr_timestamp)
    new_edge['publications'] = [publication_curie]
    new_edge['publications info'] = {publication_curie: publication_info_dict}
    new_edge['negated'] = negated
    preds_dict[key] = new_edge
def _process_hpo_data(file_path: str) -> Dict[str, List[str]]:
    """
    Map disease-phenotype annotation rows onto MONDO disease ids.

    The MONDO ontology is loaded from ``../data/mondo.owl.gz``.  The
    tab-separated annotation file at ``file_path`` is then read row by row
    (fetched with ``requests`` when the path starts with "http", opened as a
    local file otherwise).  Each row's disease id is mapped to a MONDO CURIE
    through owl:equivalentClass.  A row is skipped when:

    * it has fewer than nine columns,
    * no MONDO equivalent is found,
    * the MONDO CURIE contains 'hgnc', or
    * the disease is an Orphanet id with no OMIM equivalent whose MONDO
      class has at least one rdfs:subClassOf object (treated as a disease
      group).

    Rows that map to the same MONDO id and phenotype are merged: an empty
    onset/frequency/severity is filled from the other row, and on a real
    conflict the first value inserted wins (a warning is logged).

    :param file_path: URL or local path of the annotation file
    :return: dict keyed by "<mondo curie>-<phenotype id>" whose values are
             ``[onset, freq, severity]`` lists
    """
    logger.info("loading mondo into memory")
    mondo = Graph()
    # Close the gzip handle once parsing is done instead of leaking it.
    with gzip.open("../data/mondo.owl.gz", 'rb') as mondo_file:
        mondo.parse(mondo_file, format='xml')
    logger.info("finished loading mondo")

    # Each entry is a (mondo_curie, pheno_id, onset, freq, severity) tuple.
    mondo_merged_lines: List[tuple] = []
    disease_info: Dict[str, List[str]] = {}

    if file_path.startswith("http"):
        context_manager = closing(requests.get(file_path))
    else:
        context_manager = open(file_path, "r")

    # https://stackoverflow.com/a/35371451
    with context_manager as file:
        if file_path.startswith("http"):
            # For a URL, `file` is the response object: decode its body
            # into a list of lines that csv.reader can iterate.
            file = file.content.decode('utf-8').splitlines()
        reader = csv.reader(file, delimiter='\t', quotechar='\"')
        counter = 0
        for row in reader:
            try:
                (db, num, name, severity, pheno_id, publist, eco, onset,
                 freq) = row[0:9]
            except ValueError:
                logger.warning("Too few values in row {}".format(row))
                continue

            # Align Id prefixes
            if db == 'MIM': db = 'OMIM'
            if db == 'ORPHA': db = 'Orphanet'
            if db == 'ORPHANET': db = 'Orphanet'

            disease_id = "{}:{}".format(db, num)
            disease_iri = URIRef(expand_uri(disease_id, strict=True))
            mondo_curie = None
            mondo_iri = None
            # Take the first MONDO class declared equivalent to the disease.
            for subj in mondo.subjects(OWL['equivalentClass'], disease_iri):
                curie = contract_uri(str(subj), strict=True)[0]
                if curie.startswith('MONDO'):
                    mondo_curie = curie
                    mondo_iri = subj
                    break
            if mondo_curie is None:
                # Logger.warn is a deprecated alias of Logger.warning.
                logger.warning("No mondo id for {}".format(disease_id))
                continue

            has_omim = False
            for obj in mondo.objects(mondo_iri, OWL['equivalentClass']):
                try:
                    curie = contract_uri(str(obj), strict=True)[0]
                except NoPrefix:
                    # Equivalent class with no known prefix: ignore it.
                    continue
                if curie.startswith('OMIM'):
                    has_omim = True

            # use scigraph instead of the above
            # mondo_node = monarch.get_clique_leader(disease_id)
            # mondo_curie = mondo_node['id']
            if mondo_curie is not None and 'hgnc' in mondo_curie:
                # to keep these, likely decipher IDs
                # mondo_curie = disease_id
                continue

            if disease_id.startswith('Orphanet') \
                    and has_omim is False \
                    and len(list(mondo.objects(mondo_iri, RDFS['subClassOf']))) > 0:
                # disease is a disease group, skip
                logger.info(
                    "{} is a disease group, skipping".format(disease_id))
                continue

            mondo_merged_lines.append(
                (mondo_curie, pheno_id, onset, freq, severity))

            counter += 1
            if counter % 10000 == 0:
                logger.info("processed {} rows".format(counter))

    logger.info("processed {} rows".format(counter))

    for line in mondo_merged_lines:
        key = "{}-{}".format(line[0], line[1])
        values = [line[2], line[3], line[4]]
        if key in disease_info and disease_info[key] != values:
            logger.warning("Metadata for {} and {} mismatch: {} vs {}".format(
                line[0], line[1], values, disease_info[key]))
            # attempt to merge by collapsing freq, onset, severity
            # that is empty in one disease but not another
            # conflicts will defer to the disease first inserted
            # (merged_disease_info aliases the stored list, so assignments
            # below update disease_info[key] in place)
            merged_disease_info = disease_info[key]
            for index, val in enumerate(values):
                if val == disease_info[key][index] \
                        or val == '' and disease_info[key][index] != '':
                    continue
                elif val != '' and disease_info[key][index] == '':
                    merged_disease_info[index] = val
                else:
                    logger.warning("Cannot merge {} and {} for {}".format(
                        values, disease_info[key], line[0]))
        else:
            disease_info[key] = values

    return disease_info
     edge_dict['edge label'] = 'INVERTED:' + edge_label
     new_object = edge_dict['subject']
     edge_dict['subject'] = edge_dict['object']
     edge_dict['object'] = new_object
 edge_dict['simplified edge label'] = simplified_edge_label
 if drop_self_edges_except is not None and \
    edge_dict['subject'] == edge_dict['object'] and \
    simplified_edge_label not in drop_self_edges_except:
     continue  # see issue 743
 edge_dict['simplified relation curie'] = simplified_relation_curie
 if simplified_relation_curie in nodes_dict:
     simplified_relation = nodes_dict[simplified_relation_curie]['iri']
 else:
     simplified_relation_curie_prefix = simplified_relation_curie.split(
         ':')[0]
     simplified_relation_uri_prefix = prefixcommons.expand_uri(
         simplified_relation_curie_prefix + ':', curies_to_uri_map)
     if simplified_relation_uri_prefix != simplified_relation_curie_prefix:
         simplified_relation = kg2_util.predicate_label_to_iri_and_curie(
             simplified_edge_label, simplified_relation_curie_prefix,
             simplified_relation_uri_prefix)[0]
     else:
         simplified_relation = relation
         relation_curies_not_in_nodes.add(simplified_relation_curie)
 edge_dict['simplified relation'] = simplified_relation
 edge_dict['provided by'] = [edge_dict['provided by']]
 edge_key = edge_dict[
     'subject'] + ' /// ' + simplified_edge_label + ' /// ' + edge_dict[
         'object']
 existing_edge = new_edges.get(edge_key, None)
 if existing_edge is not None:
     existing_edge['provided by'] = list(
Exemplo n.º 12
0
def get_rels_dict(nodes: dict, owl_file_information_dict_list: list,
                  uri_to_curie_shortener: callable,
                  map_of_node_ontology_ids_to_curie_ids: dict):
    """
    Build the dictionary of edges (relationships) for a list of ontologies.

    For every ontology, each edge of its graph is turned into an edge made
    with ``kg2_util.make_edge`` and stored under a key from ``make_rel_key``;
    the first edge seen for a key wins.  After the graph edges, an 'xref'
    edge is added for every node in ``nodes`` whose 'xrefs' entry names
    another node present in ``nodes``.

    Side effect: for edges whose predicate CURIE is 'UMLS:hasSTY' no edge is
    created; instead the subject node's 'description' in ``nodes`` is
    extended with the semantic type.

    :param nodes: node dicts keyed by CURIE ID; read, and updated in place as
                  described above
    :param owl_file_information_dict_list: dicts with at least 'ontology'
                  and 'id' entries, one per ontology
    :param uri_to_curie_shortener: callable mapping an IRI to a CURIE or None
    :param map_of_node_ontology_ids_to_curie_ids: maps ontology-native node
                  IDs (and ontology IDs) to node CURIE IDs
    :return: dict mapping relationship keys to edge dicts
    """
    rels_dict = dict()

    for owl_file_information_dict in owl_file_information_dict_list:
        ontology = owl_file_information_dict['ontology']
        ontology_id = owl_file_information_dict['id']
        ont_graph = ontology.get_graph()
        ontology_curie_id = map_of_node_ontology_ids_to_curie_ids[ontology_id]

        # Resolve the ontology's update date once per ontology, with an
        # explicit default.  Previously this was only assigned inside the
        # edge loop when the ontology node existed, so it could be unbound
        # (empty graph / missing node) or silently carried over from the
        # previous ontology.
        ontology_update_date = None
        ontology_node = nodes.get(ontology_curie_id, None)
        if ontology_node is not None:
            ontology_update_date = ontology_node['update date']

        # The first endpoint of each graph edge is treated as the object and
        # the second as the subject.
        for (object_id, subject_id,
             predicate_dict) in ont_graph.edges(data=True):
            assert isinstance(predicate_dict, dict)

            if subject_id == OWL_BASE_CLASS or object_id == OWL_BASE_CLASS:
                continue

            if subject_id.startswith(MYSTERIOUS_BASE_NODE_ID_TO_FILTER) or \
               object_id.startswith(MYSTERIOUS_BASE_NODE_ID_TO_FILTER):
                continue

            # subject_id and object_id are IDs from the original ontology objects; these may not
            # always be the node curie IDs (e.g., for SNOMED terms). Need to map them
            subject_curie_id = map_of_node_ontology_ids_to_curie_ids.get(
                subject_id, None)
            if subject_curie_id is None:
                kg2_util.log_message(
                    message="ontology node ID has no curie ID in the map",
                    ontology_name=ontology.id,
                    node_curie_id=subject_id,
                    output_stream=sys.stderr)
                continue
            object_curie_id = map_of_node_ontology_ids_to_curie_ids.get(
                object_id, None)
            if object_curie_id is None:
                kg2_util.log_message(
                    message="ontology node ID has no curie ID in the map",
                    ontology_name=ontology.id,
                    node_curie_id=object_id,
                    output_stream=sys.stderr)
                continue

            predicate_label = None
            edge_pred_string = predicate_dict['pred']

            # subClassOf edges between two TUI nodes are not kept.
            if subject_curie_id.startswith(
                    'TUI:') and object_curie_id.startswith(
                        'TUI:') and edge_pred_string == 'subClassOf':
                continue

            if not edge_pred_string.startswith(
                    'http:') and not edge_pred_string.startswith('https'):
                # edge_pred_string is not a URI; this is the most common case
                if ':' not in edge_pred_string:
                    # edge_pred_string is not a CURIE; this is the most common subcase
                    if edge_pred_string != 'subClassOf':
                        predicate_curie = 'owl:' + edge_pred_string
                    else:
                        predicate_curie = 'rdfs:subClassOf'
                    predicate_label = kg2_util.convert_camel_case_to_snake_case(
                        edge_pred_string)
                else:
                    # edge_pred_string is a CURIE
                    predicate_curie = edge_pred_string
                    predicate_node = nodes.get(predicate_curie, None)
                    if predicate_node is not None:
                        predicate_label = predicate_node['name']
                    else:
                        # predicate has no node object defined; just pull the label out of the CURIE
                        if edge_pred_string.startswith('OBO:'):
                            # Try the "OBO:PREFIX_local" -> "PREFIX:local" form.
                            test_curie = edge_pred_string.replace('OBO:',
                                                                  '').replace(
                                                                      '_', ':')
                            predicate_node = nodes.get(test_curie, None)
                            if predicate_node is None:
                                predicate_label = edge_pred_string.split(
                                    ':')[1].split('#')[-1]
                            else:
                                # Label is left as None here and resolved
                                # from the node further down.
                                predicate_curie = test_curie
                        else:
                            predicate_label = edge_pred_string
                predicate_iri = prefixcommons.expand_uri(predicate_curie)
                predicate_curie_new = uri_to_curie_shortener(predicate_iri)
                if predicate_curie_new is not None:
                    predicate_curie = predicate_curie_new
            else:
                predicate_iri = edge_pred_string
                predicate_curie = uri_to_curie_shortener(predicate_iri)

            if predicate_curie is None:
                kg2_util.log_message(message="predicate IRI has no CURIE: " +
                                     predicate_iri,
                                     ontology_name=ontology.id,
                                     output_stream=sys.stderr)
                continue

            # Self-referencing xrefs carry no information.
            if subject_curie_id == object_curie_id and predicate_label == 'xref':
                continue

            if predicate_curie == 'UMLS:hasSTY':
                # Fold the semantic type into the subject's description
                # instead of emitting an edge.
                subject_node = nodes[subject_curie_id]
                object_node = nodes[object_curie_id]
                subject_description = subject_node['description']
                if subject_description is None:
                    subject_description = ''
                subject_node['description'] = '; '.join(
                    list(
                        filter(None, [
                            subject_description,
                            'UMLS Semantic Type: ' + object_node['id']
                        ])))
                continue

            rel_key = make_rel_key(subject_curie_id, predicate_curie,
                                   object_curie_id, ontology_curie_id)

            if predicate_label is None and ':' in predicate_curie:
                pred_node = nodes.get(predicate_curie, None)
                if pred_node is not None:
                    predicate_label = pred_node['name']
                    if predicate_label[0].isupper():
                        predicate_label = predicate_label[0].lower(
                        ) + predicate_label[1:]

            assert predicate_label is not None
            predicate_label = predicate_label.replace(' ', '_')
            # Only tested on Food and Efo ontologies
            predicate_label = kg2_util.convert_camel_case_to_snake_case(
                predicate_label)
            if rels_dict.get(rel_key, None) is None:
                edge = kg2_util.make_edge(subject_curie_id, object_curie_id,
                                          predicate_iri, predicate_curie,
                                          predicate_label, ontology_id,
                                          ontology_update_date)
                rels_dict[rel_key] = edge

        # Add xref edges between nodes that reference each other.  This pass
        # runs once per ontology; keys already present are left untouched.
        for node_id, node_dict in nodes.items():
            xrefs = node_dict['xrefs']
            if xrefs is not None:
                for xref_node_id in xrefs:
                    if xref_node_id in nodes and node_id != xref_node_id:
                        provided_by = nodes[node_id]['provided by']
                        key = make_rel_key(node_id, CURIE_OBO_XREF,
                                           xref_node_id, provided_by)
                        if rels_dict.get(key, None) is None:
                            edge = kg2_util.make_edge(node_id, xref_node_id,
                                                      IRI_OBO_XREF,
                                                      CURIE_OBO_XREF, 'xref',
                                                      provided_by,
                                                      ontology_update_date)
                            rels_dict[key] = edge

    return rels_dict
Exemplo n.º 13
0
def make_nodes_dict_from_ontologies_list(
        ontology_info_list: list, curies_to_categories: dict,
        uri_to_curie_shortener: callable,
        category_label_to_iri_mapper: callable):
    ret_dict = dict()
    ontologies_iris_to_curies = dict()

    for ontology_info_dict in ontology_info_list:
        ontology = ontology_info_dict['ontology']
        iri_of_ontology = ontology_info_dict['id']
        assert iri_of_ontology is not None

        ontology_curie_id = uri_to_curie_shortener(iri_of_ontology)
        if ontology_curie_id is None or len(ontology_curie_id) == 0:
            ontology_curie_id = iri_of_ontology
        umls_sver = ontology_info_dict.get('umls-sver', None)
        updated_date = None
        if umls_sver is not None:
            # if you can, parse sver string into a date string
            updated_date = parse_umls_sver_date(umls_sver)
        if updated_date is None:
            updated_date = ontology_info_dict['file last modified timestamp']

        ontology_node = kg2_util.make_node(ontology_curie_id, iri_of_ontology,
                                           ontology_info_dict['title'],
                                           'data source', updated_date,
                                           iri_of_ontology)
        ontology_node['description'] = ontology_info_dict['description']
        ontology_node['ontology node ids'] = [iri_of_ontology]
        ontology_node['xrefs'] = []
        ret_dict[ontology_curie_id] = ontology_node

        ontologies_iris_to_curies[iri_of_ontology] = ontology_curie_id

        for ontology_node_id in ontology.nodes():
            onto_node_dict = ontology.node(ontology_node_id)
            assert onto_node_dict is not None

            if ontology_node_id.startswith(MYSTERIOUS_BASE_NODE_ID_TO_FILTER):
                continue

            node_curie_id = get_node_curie_id_from_ontology_node_id(
                ontology_node_id, ontology, uri_to_curie_shortener)
            assert not node_curie_id.startswith('UMLS:C')  # :DEBUG:

            iri = onto_node_dict.get('id', None)
            if iri is None:
                iri = ontology_node_id

            # Ensure all CUI nodes use a 'umls/cui' IRI (part of fix for #565)
            if is_cui_id(node_curie_id):
                iri = CUI_BASE_IRI + '/' + get_local_id_from_curie_id(
                    node_curie_id)

            if not iri.startswith('http:') and not iri.startswith('https:'):
                iri = prefixcommons.expand_uri(iri)

            if node_curie_id.startswith(
                    'NCBIGene:') or node_curie_id.startswith('HGNC:'):
                iri = prefixcommons.expand_uri(node_curie_id)

            generated_iri = prefixcommons.expand_uri(node_curie_id)
            if generated_iri != node_curie_id:
                if (generated_iri.startswith('http:') or generated_iri.startswith('https:')) and \
                   generated_iri != iri:
                    iri = generated_iri

            node_name = onto_node_dict.get('label', None)
            node_full_name = None

            [node_category_label, ontology_id_of_node_with_category
             ] = get_biolink_category_for_node(ontology_node_id, node_curie_id,
                                               ontology, curies_to_categories,
                                               uri_to_curie_shortener, set(),
                                               False)

            node_deprecated = False
            node_description = None
            node_creation_date = None
            node_update_date = None
            node_replaced_by_curie = None
            node_full_name = None
            node_publications = set()
            node_synonyms = set()
            node_xrefs = set()
            node_tui = None
            node_has_cui = False
            node_tui_category_label = None

            node_meta = onto_node_dict.get('meta', None)
            if node_meta is not None:
                node_deprecated = node_meta.get('deprecated', False)
                node_definition = node_meta.get('definition', None)
                if node_definition is not None:
                    node_description = node_definition['val']
                    if node_description.startswith(
                            'OBSOLETE:') or node_description.startswith(
                                'Obsolete.'):
                        continue

                    node_definition_xrefs = node_definition.get('xrefs', None)
                    if node_definition_xrefs is not None:
                        assert type(node_definition_xrefs) == list
                        for xref in node_definition_xrefs:
                            xref_pub = xref_as_a_publication(xref)
                            if xref_pub is not None:
                                node_publications.add(xref_pub)

                node_synonyms_list = node_meta.get('synonyms', None)
                if node_synonyms_list is not None:
                    for syn_dict in node_synonyms_list:
                        syn_pred = syn_dict['pred']
                        if syn_pred == 'hasExactSynonym':
                            node_synonyms.add(syn_dict['val'])
                            syn_xrefs = syn_dict['xrefs']
                            if len(syn_xrefs) > 0:
                                for syn_xref in syn_xrefs:
                                    syn_xref_pub = xref_as_a_publication(
                                        syn_xref)
                                    if syn_xref_pub is not None:
                                        node_publications.add(syn_xref_pub)

                node_xrefs_list = node_meta.get('xrefs', None)
                if node_xrefs_list is not None:
                    for xref_dict in node_xrefs_list:
                        xref_curie = xref_dict['val']
                        if xref_curie.startswith('MESH:'):
                            xref_curie = xref_curie.replace('MESH:', 'MSH:')
                        elif xref_curie.startswith('UMLS:C'):
                            xref_curie = CUI_PREFIX + ':' + xref_curie.split(
                                'UMLS:')[1]
                        node_xrefs.add(xref_curie)
                basic_property_values = node_meta.get('basicPropertyValues',
                                                      None)
                if basic_property_values is not None:
                    node_tui_list = []
                    for basic_property_value_dict in basic_property_values:
                        bpv_pred = basic_property_value_dict['pred']
                        bpv_pred_curie = uri_to_curie_shortener(bpv_pred)
                        if bpv_pred_curie is None:
                            bpv_pred_curie = bpv_pred
                        bpv_val = basic_property_value_dict['val']
                        if bpv_pred_curie in [
                                'OIO:creation_date', 'dcterms:issued',
                                'HGNC:DATE_CREATED'
                        ]:
                            node_creation_date = bpv_val
                        elif bpv_pred_curie == 'HGNC:DATE_LAST_MODIFIED':
                            node_update_date = bpv_val
                        elif bpv_pred_curie == 'IAL:0100001':
                            assert node_deprecated
                            node_replaced_by_uri = bpv_val
                            node_replaced_by_curie = uri_to_curie_shortener(
                                node_replaced_by_uri)
                        elif bpv_pred_curie == 'UMLS:STY':  # STY_BASE_IRI:
                            node_tui_list.append(bpv_val)
                        elif bpv_pred_curie == 'skos:prefLabel':
                            if not node_curie_id.startswith('HGNC:'):
                                node_name = bpv_val
                            else:
                                node_full_name = bpv_val
                                if node_name is None:
                                    node_name = node_full_name
                        elif bpv_pred_curie == 'skos:altLabel':
                            node_synonyms.add(bpv_val)
                        elif bpv_pred_curie == 'skos:definition':
                            node_description = kg2_util.strip_html(bpv_val)
                        elif bpv_pred_curie == 'HGNC:GENESYMBOL':
                            node_name = bpv_val
                            node_synonyms.add(bpv_val)
                        elif bpv_pred_curie == 'UMLS:cui':
                            node_has_cui = True
                    if len(node_tui_list) == 1:
                        node_tui = node_tui_list[0]
                        node_tui_uri = posixpath.join(
                            'https://identifiers.org/umls/STY', node_tui)
                        node_tui_curie = uri_to_curie_shortener(node_tui_uri)
                        assert node_tui_curie is not None
                        [node_tui_category_label,
                         _] = get_biolink_category_for_node(
                             node_tui_uri, node_tui_curie, ontology,
                             curies_to_categories, uri_to_curie_shortener,
                             set(), True)

                node_comments = node_meta.get('comments', None)
                if node_comments is not None:
                    comments_str = 'COMMENTS: ' + (' // '.join(node_comments))
                    if node_description is not None:
                        node_description += ' // ' + comments_str
                    else:
                        node_description = comments_str

            if node_category_label is None:
                node_type = onto_node_dict.get('type', None)
                if node_type is not None and node_type == 'PROPERTY':
                    node_category_label = 'property'

            if node_category_label is None:
                if not node_deprecated:
                    kg2_util.log_message("Node does not have a category",
                                         ontology.id,
                                         node_curie_id,
                                         output_stream=sys.stderr)
                    node_category_label = 'unknown category'
                else:
                    node_category_label = 'deprecated node'

            if node_has_cui:
                assert node_tui is not None or len(node_tui_list) > 0

                if node_tui_category_label is None:
                    node_tui_category_label = 'unknown category'
                    if node_tui is not None:
                        kg2_util.log_message(
                            message='Node ' + ontology_node_id +
                            ' has CUI whose TUI cannot be mapped to category: '
                            + node_tui)
                    else:
                        kg2_util.log_message(
                            message='Node ' + ontology_node_id +
                            ' has CUI with multiple associated TUIs: ' +
                            ', '.join(node_tui_list))
                else:
                    if node_category_label is None:
                        node_category_label = node_tui_category_label  # override the node category label if we have a TUI
                node_tui_category_iri = category_label_to_iri_mapper(
                    node_tui_category_label)
            ontology_curie_id = ontologies_iris_to_curies[iri_of_ontology]
            source_ontology_information = ret_dict.get(ontology_curie_id, None)
            if source_ontology_information is None:
                kg2_util.log_message(
                    message=
                    "ontology IRI has no information dictionary available",
                    ontology_name=iri_of_ontology,
                    output_stream=sys.stderr)
                assert False
            source_ontology_update_date = source_ontology_information[
                'update date']
            if node_update_date is None:
                node_update_date = source_ontology_update_date

            if node_description is not None:
                node_description_xrefs_match = REGEX_XREF_END_DESCRIP.match(
                    node_description)
                if node_description_xrefs_match is not None:
                    node_description_xrefs_str = node_description_xrefs_match[
                        1]
                    node_description_xrefs_list = node_description_xrefs_str.split(
                        ',')
                    for node_description_xref_str in node_description_xrefs_list:
                        node_description_xref_str = node_description_xref_str.strip(
                        )
                        if ':' in node_description_xref_str:
                            node_xrefs.add(node_description_xref_str)
                node_description_pubs = REGEX_PUBLICATIONS.findall(
                    node_description)
                for pub_curie in node_description_pubs:
                    node_publications.add(pub_curie)

            # deal with node names that are ALLCAPS
            if node_name is not None and node_name.isupper():
                node_name = kg2_util.allcaps_to_only_first_letter_capitalized(
                    node_name)

            node_dict = kg2_util.make_node(node_curie_id, iri, node_name,
                                           node_category_label,
                                           node_update_date, iri_of_ontology)
            node_dict['full name'] = node_full_name
            node_dict['description'] = node_description
            node_dict[
                'creation date'] = node_creation_date  # slot name is not biolink standard
            node_dict[
                'deprecated'] = node_deprecated  # slot name is not biolink standard
            node_dict[
                'replaced by'] = node_replaced_by_curie  # slot name is not biolink standard
            node_dict['ontology node ids'] = [
                ontology_node_id
            ]  # slot name is not biolink standard
            node_dict['xrefs'] = list(
                node_xrefs)  # slot name is not biolink standard
            node_dict['synonym'] = list(
                node_synonyms)  # slot name is not biolink standard
            node_dict['publications'] = list(node_publications)

            # check if we need to make a CUI node
            if node_meta is not None and basic_property_values is not None:
                for basic_property_value_dict in basic_property_values:
                    bpv_pred = basic_property_value_dict['pred']
                    bpv_pred_curie = uri_to_curie_shortener(bpv_pred)
                    bpv_val = basic_property_value_dict['val']
                    if bpv_pred_curie == 'UMLS:cui':  # CUI_BASE_IRI:
                        cui_node_dict = dict(node_dict)
                        cui_uri = bpv_pred + '/' + bpv_val
                        cui_curie = uri_to_curie_shortener(cui_uri)
                        assert cui_curie is not None
                        assert not cui_curie.startswith('UMLS:C')  # :DEBUG:
                        # Skip this CUI if it's identical to the ontology node itself (happens with files created
                        # using 'load_on_cuis' - part of fix for issue #565)
                        if get_local_id_from_curie_id(
                                cui_curie) == get_local_id_from_curie_id(
                                    node_curie_id):
                            continue
                        cui_node_dict['id'] = cui_curie
                        cui_node_dict['iri'] = cui_uri
                        cui_node_dict['synonym'] = []
                        cui_node_dict['category'] = node_tui_category_iri
                        cui_node_dict[
                            'category label'] = node_tui_category_label.replace(
                                ' ', '_')
                        cui_node_dict['ontology node ids'] = []
                        cui_node_dict['provided by'] = CUI_BASE_IRI
                        cui_node_dict['xrefs'] = [
                        ]  # blanking the "xrefs" here is *vital* in order to avoid issue #395
                        cui_node_dict_existing = ret_dict.get(cui_curie, None)
                        if cui_node_dict_existing is not None:
                            cui_node_dict = kg2_util.merge_two_dicts(
                                cui_node_dict, cui_node_dict_existing)
                        ret_dict[cui_curie] = cui_node_dict
                        node_dict_xrefs = node_dict['xrefs']
                        node_dict_xrefs.append(cui_curie)
                        node_dict['xrefs'] = list(set(node_dict_xrefs))
                    elif bpv_pred_curie == 'HGNC:ENTREZGENE_ID':
                        entrez_gene_id = bpv_val
                        entrez_node_dict = dict(node_dict)
                        entrez_curie = 'NCBIGene:' + entrez_gene_id
                        entrez_node_dict['id'] = entrez_curie
                        entrez_node_dict[
                            'iri'] = 'https://identifiers.org/NCBIGene/' + entrez_gene_id
                        ret_dict[entrez_curie] = entrez_node_dict
                        node_dict_xrefs = node_dict['xrefs']
                        node_dict_xrefs.append(entrez_curie)
                        node_dict['xrefs'] = list(set(node_dict_xrefs))
            if node_curie_id in ret_dict:
                node_dict = kg2_util.merge_two_dicts(ret_dict[node_curie_id],
                                                     node_dict)
            ret_dict[node_curie_id] = node_dict
    return ret_dict
Exemplo n.º 14
0
     for result_item_list in edges_result
 ]
 for edge_dict in edges_list:
     del edge_dict['is_defined_by']
     del edge_dict['seed_node_uuid']
     del edge_dict['source_node_uuid']
     del edge_dict['target_node_uuid']
     predicate_label = edge_dict['relation']
     edge_dict['edge label'] = predicate_label
     del edge_dict['relation']
     [relation, relation_curie] = kg2_util.predicate_label_to_iri_and_curie(
         predicate_label, KG1_RELATION_CURIE_PREFIX,
         KG1_RELATION_IRI_PREFIX)
     if relation_curie == 'BioLink:subclass_of':
         relation_curie = 'rdfs:subClassOf'
         relation = prefixcommons.expand_uri(relation_curie)
     edge_dict['relation'] = relation
     edge_dict['relation curie'] = relation_curie
     edge_dict['negated'] = False
     publications = edge_dict.get('publications', None)
     if publications is not None and publications != '':
         publications = publications.split(',')
     else:
         publications = []
     edge_dict['publications'] = publications
     edge_dict['update date'] = None
     provided_by = edge_dict['provided_by']
     if provided_by.startswith('DGIdb;'):
         provided_by = 'DGIdb'
     provided_by_kg2 = KG1_PROVIDED_BY_TO_KG2_IRIS.get(provided_by, None)
     edge_dict['provided by'] = provided_by_kg2