示例#1
0
def make_edges(records: list, nodes_dict: dict):
    """Build the edge list for a batch of UniProtKB records.

    Every record yields one ``gene_product_has_organism_source`` edge from
    the protein CURIE (``UniProtKB:<first accession>``) to its
    ``NCBITaxon:<organism>`` CURIE. Each string in the record's optional
    ``'DR'`` list is then tested against ``REGEX_HGNC`` and
    ``REGEX_NCBIGeneID``; every match adds an ``encodes`` edge whose subject
    is the gene CURIE and whose object is the protein CURIE.

    Args:
        records: record dicts; each needs ``'AC'`` (a sequence whose first
            element is the accession) and ``'organism'``.
        nodes_dict: protein CURIE -> node dict; the node's ``'update date'``
            is stamped on every edge made for that record.

    Returns:
        A list of edge objects as returned by ``kg2_util.make_edge``.
    """
    edges = []
    for record in records:
        protein_curie = 'UniProtKB:' + record['AC'][0]
        taxon_curie = 'NCBITaxon:' + str(record['organism'])
        stamp = nodes_dict[protein_curie]['update date']
        organism_edge = kg2_util.make_edge(protein_curie, taxon_curie,
                                           'gene_product_has_organism_source',
                                           UNIPROTKB_BASE_IRI, stamp)
        edges.append(organism_edge)

        xrefs = record.get('DR', None)
        if xrefs is None:
            # Record carries no cross-references at all.
            continue

        for xref in xrefs:
            # A single xref string is checked against both patterns.
            hgnc_hit = REGEX_HGNC.match(xref)
            if hgnc_hit is not None:
                # Group 1 is used as-is as the HGNC CURIE.
                edges.append(
                    kg2_util.make_edge(hgnc_hit[1], protein_curie, 'encodes',
                                       UNIPROTKB_BASE_IRI, stamp))
            ncbi_hit = REGEX_NCBIGeneID.match(xref)
            if ncbi_hit is not None:
                edges.append(
                    kg2_util.make_edge('NCBIGene:' + ncbi_hit[1],
                                       protein_curie, 'encodes',
                                       UNIPROTKB_BASE_IRI, stamp))
    return edges
def make_kg2_graph(input_file_name: str, test_mode: bool = False):
    """Convert a repoDB CSV file into a KG2 graph dict of drug->indication edges.

    The CSV is read with pandas and must provide the columns ``status``,
    ``phase``, ``drug_id``, ``ind_id`` and ``NCT``. Each row becomes one edge
    whose predicate is ``clinically_tested_<status>_<phase>``; a missing
    status/phase is replaced by ``unknown_status`` / ``unknown_phase``. When
    the row has an NCT identifier, it is appended to the edge's
    ``'publications'`` and mapped to its clinicaltrials URL in
    ``'publications info'``.

    Args:
        input_file_name: path of the repoDB CSV file.
        test_mode: accepted for interface compatibility; not used here.

    Returns:
        ``{'nodes': [], 'edges': [...]}`` (this function creates no nodes).
    """
    nodes = []
    edges = []
    df = pd.read_csv(input_file_name)
    # Walk the rows once. The previous implementation called
    # ``df[col].isna()`` (a full-column scan) for every row, which made the
    # conversion quadratic in the number of rows.
    for row in df.itertuples(index=False):
        if pd.isna(row.status):
            status = "unknown_status"
        else:
            status = row.status.lower()
        if pd.isna(row.phase):
            phase = "unknown_phase"
        else:
            # e.g. "Phase 2/Phase 3" -> "phase_2_or_phase_3"
            phase = row.phase.lower().replace(" ", "_").replace("/", "_or_")
        relation = "clinically_tested_" + status + "_" + phase
        edge_dict = kg2_util.make_edge(
            subject_id=DRUGBANK_CURIE + row.drug_id,
            object_id=UMLS_CURIE + row.ind_id,
            relation=REPODB_IRI + '#' +
            kg2_util.convert_snake_case_to_camel_case(relation),
            relation_curie=REPODB_CURIE + relation,
            predicate_label=relation,
            provided_by=REPODB_IRI,
            update_date=None)
        nct_id = row.NCT
        if not pd.isna(nct_id):
            trial_curie = NCT_CUTRIE + nct_id
            edge_dict['publications'].append(trial_curie)
            edge_dict['publications info'][trial_curie] = \
                CLINICALTRIALS_IRI + nct_id
        edges.append(edge_dict)
    return {'nodes': nodes, 'edges': edges}
def make_rel(preds_dict: dict, subject_curie: str, object_curie: str,
             predicate: str, pmid: str, pub_date: str, sentence: str,
             subject_score: str, object_score: str, negated: bool):
    """Record one predication in ``preds_dict``, merging repeated triples.

    Edges are keyed by ``"<subject>-<predicate>-<object>"``. The first time a
    key is seen, a new edge is built with ``kg2_util.make_edge`` (stamped with
    the module-level ``curr_timestamp``) and stored under that key. Later
    calls with the same key only add the publication to the stored edge.

    Args:
        preds_dict: accumulator, mutated in place.
        subject_curie, object_curie, predicate: the triple.
        pmid: publication identifier, appended to the PMID CURIE prefix.
        pub_date, sentence, subject_score, object_score: stored verbatim in
            the per-publication info dict.
        negated: stored on the edge when it is first created.

    Returns:
        None.
    """
    triple_key = '-'.join((subject_curie, predicate, object_curie))
    existing_edge = preds_dict.get(triple_key, None)
    pub_curie = kg2_util.CURIE_PREFIX_PMID + ':' + pmid
    pub_info = {
        'publication date': pub_date,
        'sentence': sentence,
        'subject score': subject_score,
        'object score': object_score
    }

    if existing_edge is not None:
        # Known triple: attach the additional supporting publication.
        existing_edge['publications_info'][pub_curie] = pub_info
        existing_edge['publications'] = \
            existing_edge['publications'] + [pub_curie]
        return

    label = predicate.lower()
    if label == 'xref':
        curie_for_relation = 'OBO:xref'
    else:
        curie_for_relation = SEMMEDDB_CURIE_PREFIX + ':' + label
    new_edge = kg2_util.make_edge(subject_curie, object_curie,
                                  curie_for_relation, label,
                                  SEMMEDDB_CURIE_PREFIX + ':',
                                  curr_timestamp)
    new_edge['publications'] = [pub_curie]
    new_edge['publications_info'] = {pub_curie: pub_info}
    new_edge['negated'] = negated
    preds_dict[triple_key] = new_edge
示例#4
0
def make_edge(subject_curie_id: str, object_curie_id: str,
              predicate_label: str, update_date: str):
    """Create an Ensembl-provided edge for ``predicate_label``.

    The relation IRI and CURIE are derived from the label by
    ``kg2_util.predicate_label_to_iri_and_curie`` using the Ensembl relation
    prefix and knowledge-base IRI; the edge itself comes from
    ``kg2_util.make_edge`` and is returned unchanged.
    """
    iri_and_curie = kg2_util.predicate_label_to_iri_and_curie(
        predicate_label, ENSEMBL_RELATION_CURIE_PREFIX, ENSEMBL_KB_IRI)
    relation_iri, curie_for_relation = iri_and_curie
    return kg2_util.make_edge(subject_curie_id, object_curie_id,
                              relation_iri, curie_for_relation,
                              predicate_label, ENSEMBL_KB_IRI, update_date)
def format_edge(subject_id, object_id, predicate, update_date):
    """Create a DrugCentral-sourced edge between two CURIEs.

    A predicate equal to ``kg2_util.EDGE_LABEL_BIOLINK_SAME_AS`` is built
    with ``kg2_util.make_edge_biolink``; every other predicate is built with
    ``kg2_util.make_edge`` using a relation CURIE derived from the
    DrugCentral relation prefix.
    """
    source_relation_curie = kg2_util.predicate_label_to_curie(
        predicate, DRUGCENTRAL_RELATION_CURIE_PREFIX)
    if predicate != kg2_util.EDGE_LABEL_BIOLINK_SAME_AS:
        return kg2_util.make_edge(subject_id, object_id,
                                  source_relation_curie, predicate,
                                  DRUGCENTRAL_SOURCE, update_date)
    # same_as edges do not use the source-specific relation CURIE.
    return kg2_util.make_edge_biolink(subject_id, object_id, predicate,
                                      DRUGCENTRAL_SOURCE, update_date)
def format_edge(subject_id: str, object_id: str, predicate_label: str):
    """Create a Reactome-sourced edge between two CURIEs.

    A label equal to ``kg2_util.EDGE_LABEL_BIOLINK_SAME_AS`` is built with
    ``kg2_util.make_edge_biolink`` (update date ``None``); any other label is
    built with ``kg2_util.make_edge`` using a relation CURIE derived from the
    Reactome relation prefix. No update date is passed in the latter call.
    """
    reactome_relation_curie = kg2_util.predicate_label_to_curie(
        predicate_label, REACTOME_RELATION_CURIE_PREFIX)
    if predicate_label == kg2_util.EDGE_LABEL_BIOLINK_SAME_AS:
        edge = kg2_util.make_edge_biolink(subject_id, object_id,
                                          predicate_label,
                                          REACTOME_KB_CURIE_ID, None)
    else:
        edge = kg2_util.make_edge(subject_id, object_id,
                                  reactome_relation_curie, predicate_label,
                                  REACTOME_KB_CURIE_ID)
    return edge
def make_edge(subject_curie_id: str, object_curie_id: str,
              predicate_label: str, update_date: str):
    """Create a UniProtKB-provided edge with a Biolink relation.

    The relation IRI is the Biolink category base IRI followed by the
    camel-cased label; the relation CURIE is the Biolink prefix, a colon, and
    the label with spaces replaced by underscores.
    """
    camel_label = kg2_util.convert_snake_case_to_camel_case(predicate_label)
    relation_iri = kg2_util.BIOLINK_CATEGORY_BASE_IRI + camel_label
    underscored_label = predicate_label.replace(' ', '_')
    curie_for_relation = \
        kg2_util.BIOLINK_CURIE_PREFIX + ':' + underscored_label
    return kg2_util.make_edge(subject_curie_id, object_curie_id,
                              relation_iri, curie_for_relation,
                              predicate_label, UNIPROTKB_BASE_IRI,
                              update_date)
示例#8
0
def make_edges(input_tsv: str, gene_id_dict: Dict[str, list],
               pmids_dict: Dict[str, Dict[str, set]], test_mode: bool) -> list:
    """Build gene->disease ``associated_with`` edges from a JensenLab TSV.

    Each row must have exactly seven tab-separated columns: gene id, gene
    name, disease id, disease name, z-score, an ignored column, source URL.
    A row produces edges only when

    * its gene id has KG2 equivalents in ``gene_id_dict``,
    * its z-score is at least 3.0, and
    * its disease id is present in ``pmids_dict['disease']``.

    One edge is emitted per KG2 gene id. The edge's ``publications`` are the
    PMIDs shared by the gene and the disease, capped at 30.

    Args:
        input_tsv: path of the tab-separated input file.
        gene_id_dict: source gene id -> list of KG2 gene CURIEs.
        pmids_dict: ``{'gene': {id: set_of_pmids}, 'disease': {id: set_of_pmids}}``.
        test_mode: when true, stop once more than 1000 distinct gene ids
            have been seen (checked after rows that pass the gene and
            z-score filters).

    Returns:
        The list of edge dicts. Two summary lines are printed to stdout.

    Raises:
        ValueError: if a row does not have seven columns or the z-score is
            not numeric.
    """
    gene_ids_actually_used = set()
    update_date = datetime.datetime.now().replace(microsecond=0).isoformat()
    edges = []
    with open(input_tsv) as inp:
        for row in csv.reader(inp, delimiter="\t"):
            (gene_id, _gene_name, disease_id, _disease_name, z_score, _,
             _source_url) = row
            gene_ids_actually_used.add(gene_id)
            kg2_gene_id_list = gene_id_dict.get(gene_id, None)
            if kg2_gene_id_list is None:
                # No KG2 equivalent gene ids for this gene; skip the row.
                continue
            if float(z_score) < 3.0:
                continue
            # The disease lookup and the PMID intersection depend only on
            # the row, so compute them once instead of once per KG2 gene id.
            disease_pmids = pmids_dict['disease'].get(disease_id, None)
            if disease_pmids is not None:
                # A gene with no recorded PMIDs simply shares none.
                gene_pmids = pmids_dict['gene'].get(gene_id, set())
                # Sort before truncating so the 30 retained publications are
                # reproducible between runs (set order is arbitrary).
                shared_pmids = sorted(
                    gene_pmids.intersection(disease_pmids))[:30]
                for kg2_gene_id in kg2_gene_id_list:
                    edge = kg2_util.make_edge(kg2_gene_id, disease_id,
                                              "JensenLab:associated_with",
                                              "associated_with",
                                              kg2_util.CURIE_ID_JENSENLAB,
                                              update_date)
                    # seems hacky, but following example in rtx_kg1_neo4j_to_kg_json.py
                    publication_info_dict = {
                        'publication date': None,
                        'sentence': None,
                        'subject score': None,
                        'object score': str(z_score)
                    }
                    # Each edge gets its own list so later mutation of one
                    # edge cannot leak into its siblings.
                    edge["publications"] = list(shared_pmids)
                    edge["publications_info"] = {
                        edge['object']: publication_info_dict
                    }
                    edges.append(edge)
            if test_mode and len(gene_ids_actually_used) > 1000:
                break

    used_genes_missing_ids = gene_ids_actually_used - set(gene_id_dict.keys())
    print(
        f"Skipped {len(used_genes_missing_ids)} rows for lack of kg2 gene ids."
    )
    print(
        f"Found {len(gene_ids_actually_used - used_genes_missing_ids)} used kg2 gene ids."
    )
    return edges
示例#9
0
def make_node_and_edges(article: dict, mesh_predicate_label: str,
                        mesh_relation_curie: str):
    """Build the publication node and its MeSH edges for one parsed article.

    Nothing is emitted unless the article's PMID CURIE is in the module-level
    ``pmids`` collection. For a wanted article, one publication node is
    created, plus one edge per MeSH heading (PMID -> MeSH descriptor).

    Args:
        article: parsed article record; must contain
            ``["MedlineCitation"]`` with ``PMID``, ``DateRevised`` and
            ``Article`` entries.
        mesh_predicate_label: predicate label for the PMID->MeSH edges.
        mesh_relation_curie: relation CURIE for the PMID->MeSH edges.

    Returns:
        ``[{"nodes": [...], "edges": [...]}, update_date]`` where
        ``update_date`` is extracted from ``DateRevised``.
    """
    nodes = []
    edges = []

    article_citation = article["MedlineCitation"]

    pmid = kg2_util.CURIE_PREFIX_PMID + ":" + article_citation["PMID"]["#text"]

    update_date = extract_date(article_citation["DateRevised"])

    if pmid in pmids:
        name = article_citation["Article"]["ArticleTitle"]
        if isinstance(name, dict):
            try:
                name = name["#text"]
            except KeyError:
                # No direct text: take the text of the child elements
                # instead (the last child wins).
                temp_name = name
                for key in temp_name:
                    name = temp_name[key]["#text"]

        try:
            created_date = extract_date(
                article_citation["Article"]["ArticleDate"])
        except Exception:
            # Best effort: articles without a usable ArticleDate get no
            # creation date. (Was a bare ``except:``, which also swallowed
            # KeyboardInterrupt/SystemExit.)
            created_date = None

        iri = PMID_BASE_IRI + article_citation["PMID"]["#text"]

        node = kg2_util.make_node(pmid, iri, name,
                                  BIOLINK_CATEGORY_PUBLICATION, update_date,
                                  PMID_PROVIDED_BY_CURIE_ID)
        node["creation_date"] = created_date
        nodes.append(node)
        try:
            mesh_headings = article_citation["MeshHeadingList"]["MeshHeading"]
            if isinstance(mesh_headings, dict):
                # NOTE(review): the "#text"/"@UI" keys suggest xmltodict-style
                # input, where a lone child element is a dict rather than a
                # one-element list. Iterating that dict yielded its keys,
                # raised TypeError, and silently dropped the heading.
                mesh_headings = [mesh_headings]
            for mesh_topic in mesh_headings:
                mesh_id = kg2_util.CURIE_PREFIX_MESH + ":" + \
                          mesh_topic["DescriptorName"]["@UI"]
                edge = kg2_util.make_edge(pmid, mesh_id, mesh_relation_curie,
                                          mesh_predicate_label,
                                          PMID_PROVIDED_BY_CURIE_ID,
                                          update_date)
                edges.append(edge)
        except (KeyError, TypeError):
            # Missing or malformed MeSH heading list: keep whatever edges
            # were built before the problem and carry on.
            pass

    return [{"nodes": nodes, "edges": edges}, update_date]
示例#10
0
def make_kg2_graph(input_file_name: str, test_mode: bool = False):
    """Convert an Ensembl gene JSON dump into a KG2 graph dict.

    For each gene, one node is created (via the module's ``make_node``), one
    ``gene_found_in_organism`` edge to its NCBI taxon, and one ``xref`` edge
    per entry of the gene's optional ``'HGNC'`` list. The update date for all
    nodes and edges is the part after the first ``/`` of the file's
    ``'genebuild'`` string.

    Args:
        input_file_name: path passed to ``kg2_util.load_json``.
        test_mode: when true, stop after 10000 genes.

    Returns:
        ``{'nodes': [...], 'edges': [...]}``.

    Raises:
        AssertionError: if a gene's ``taxon_id`` is not 9606.
    """
    ensembl_data = kg2_util.load_json(input_file_name)
    nodes = []
    edges = []
    update_date = ensembl_data['genebuild'].split('/')[1]
    for gene_number, gene in enumerate(ensembl_data['genes'], start=1):
        if test_mode and gene_number > 10000:
            break
        gene_id = gene['id']
        description = gene.get('description', None)
        symbol = gene.get('name', None)

        # Synonyms are the distinct xref primary ids other than the gene's
        # own Ensembl id.
        xrefs = gene.get('xrefs', None)
        if xrefs is None:
            synonyms = []
        else:
            synonyms = list({
                xref['primary_id']
                for xref in xrefs
                if xref['primary_id'] != gene_id
            })

        gene_node = make_node(gene_id, description, symbol, update_date,
                              synonyms)
        nodes.append(gene_node)
        gene_curie = gene_node['id']

        taxon_id_int = gene.get('taxon_id', None)
        assert taxon_id_int == 9606, "unexpected taxon ID"
        taxon_edge = kg2_util.make_edge(gene_curie,
                                        'NCBITaxon:' + str(taxon_id_int),
                                        'gene_found_in_organism',
                                        ENSEMBL_KB_IRI, update_date)
        edges.append(taxon_edge)

        hgnc_curies = gene.get('HGNC', None)
        if hgnc_curies is None:
            continue
        for hgnc_curie in hgnc_curies:
            edges.append(
                kg2_util.make_edge(gene_curie, hgnc_curie, 'xref',
                                   ENSEMBL_KB_IRI, update_date))
    return {'nodes': nodes, 'edges': edges}
示例#11
0
def make_edge(subject_id: str,
              object_id: str,
              predicate_label: str,
              update_date: str = None,
              publications: list = None):
    """Create a ChEMBL-provided edge with publication fields initialised.

    The relation CURIE is the ChEMBL mechanism prefix, a colon, and the
    predicate label. The returned edge always carries a ``'publications'``
    list (the caller's list object when one is given, otherwise a fresh
    empty list) and an empty ``'publications_info'`` dict.
    """
    mechanism_curie = \
        kg2_util.CURIE_PREFIX_CHEMBL_MECHANISM + ':' + predicate_label
    new_edge = kg2_util.make_edge(subject_id, object_id, mechanism_curie,
                                  predicate_label, CHEMBL_KB_CURIE_ID,
                                  update_date)
    if publications is None:
        publications = []
    new_edge['publications'] = publications
    new_edge['publications_info'] = {}
    return new_edge
def make_kg2_graph(input_file_name: str, test_mode: bool = False):
    """Convert an Ensembl gene JSON dump into a KG2 graph dict.

    The node list starts with a node describing the Ensembl knowledge source
    itself. Each gene then contributes one node (via the module's
    ``make_node``), one Biolink ``in_taxon`` edge to its NCBI taxon, and one
    ``owl:sameAs``-style edge per entry of its optional ``'HGNC'`` list. The
    update date is the part after the first ``/`` of ``'genebuild'``.

    Args:
        input_file_name: path passed to ``kg2_util.load_json``.
        test_mode: when true, stop after 10000 genes.

    Returns:
        ``{'nodes': [...], 'edges': [...]}``.

    Raises:
        AssertionError: if a gene's ``taxon_id`` is not 9606.
    """
    ensembl_data = kg2_util.load_json(input_file_name)
    nodes = []
    edges = []
    update_date = ensembl_data['genebuild'].split('/')[1]

    # Node representing the knowledge source; it is its own provider.
    source_curie = ENSEMBL_KB_CURIE_ID
    nodes.append(
        kg2_util.make_node(source_curie, ENSEMBL_KB_URI, 'Ensembl Genes',
                           kg2_util.BIOLINK_CATEGORY_DATA_FILE, update_date,
                           source_curie))

    for gene_number, gene in enumerate(ensembl_data['genes'], start=1):
        if test_mode and gene_number > 10000:
            break
        gene_id = gene['id']
        description = gene.get('description', None)
        symbol = gene.get('name', None)

        # Synonyms are the distinct xref primary ids other than the gene's
        # own Ensembl id.
        xrefs = gene.get('xrefs', None)
        if xrefs is None:
            synonyms = []
        else:
            synonyms = list({
                xref['primary_id']
                for xref in xrefs
                if xref['primary_id'] != gene_id
            })

        gene_node = make_node(gene_id, description, symbol, update_date,
                              synonyms)
        nodes.append(gene_node)
        gene_curie = gene_node['id']

        taxon_id_int = gene.get('taxon_id', None)
        assert taxon_id_int == 9606, "unexpected taxon ID"
        taxon_curie = \
            kg2_util.CURIE_PREFIX_NCBI_TAXON + ':' + str(taxon_id_int)
        edges.append(
            kg2_util.make_edge_biolink(gene_curie, taxon_curie,
                                       kg2_util.EDGE_LABEL_BIOLINK_IN_TAXON,
                                       ENSEMBL_KB_CURIE_ID, update_date))

        hgnc_curies = gene.get('HGNC', None)
        if hgnc_curies is None:
            continue
        for hgnc_curie in hgnc_curies:
            edges.append(
                kg2_util.make_edge(gene_curie, hgnc_curie,
                                   kg2_util.CURIE_ID_OWL_SAME_AS,
                                   kg2_util.EDGE_LABEL_OWL_SAME_AS,
                                   ENSEMBL_KB_CURIE_ID, update_date))
    return {'nodes': nodes, 'edges': edges}
示例#13
0
def format_edge(subject_id: str,
                object_id: str,
                predicate_label: str,
                description: str,
                publications: list = None):
    """Create a DrugBank-provided edge with optional provenance fields.

    The edge is built by ``kg2_util.make_edge`` with a relation CURIE derived
    from the DrugBank relation prefix and an update date of ``None``.

    Args:
        subject_id, object_id: CURIEs of the two endpoints.
        predicate_label: label used for both the relation CURIE and the edge.
        description: when not ``None``, stored as
            ``edge["publications_info"] = {"sentence": description}``.
        publications: when not ``None``, stored as ``edge["publications"]``.

    Returns:
        The edge dict.
    """
    drugbank_relation_curie = kg2_util.predicate_label_to_curie(
        predicate_label, DRUGBANK_RELATION_CURIE_PREFIX)
    new_edge = kg2_util.make_edge(subject_id, object_id,
                                  drugbank_relation_curie, predicate_label,
                                  DRUGBANK_KB_CURIE_ID, None)

    # Only overwrite the provenance fields the caller actually supplied.
    overrides = {}
    if description is not None:
        overrides["publications_info"] = {"sentence": description}
    if publications is not None:
        overrides["publications"] = publications
    new_edge.update(overrides)

    return new_edge
def make_hmdb_edge(subject_id: str, object_id: str, subject_prefix: str,
                   object_prefix: str, predicate_label: str, update_date: str,
                   publications_info: dict):
    """Create an HMDB-provided edge from prefix/id pairs.

    Args:
        subject_id: local id; always combined as ``subject_prefix:subject_id``.
        object_id: local id, or an already complete CURIE when
            ``object_prefix`` is ``None``.
        subject_prefix: CURIE prefix for the subject.
        object_prefix: CURIE prefix for the object, or ``None`` to use
            ``object_id`` unchanged.
        predicate_label: a label equal to
            ``kg2_util.EDGE_LABEL_BIOLINK_SAME_AS`` is built with
            ``make_edge_biolink``; any other label uses ``make_edge`` with an
            HMDB-prefixed relation CURIE.
        update_date: passed through to the edge constructor.
        publications_info: stored as ``edge["publications_info"]``.

    Returns:
        The edge dict.
    """
    hmdb_relation_curie = kg2_util.predicate_label_to_curie(
        predicate_label, CURIE_PREFIX_HMDB)
    subject_curie = subject_prefix + ":" + subject_id
    if object_prefix is None:
        object_curie = object_id
    else:
        object_curie = object_prefix + ":" + object_id

    if predicate_label != kg2_util.EDGE_LABEL_BIOLINK_SAME_AS:
        new_edge = kg2_util.make_edge(subject_curie, object_curie,
                                      hmdb_relation_curie, predicate_label,
                                      HMDB_PROVIDED_BY_CURIE_ID, update_date)
    else:
        new_edge = kg2_util.make_edge_biolink(subject_curie, object_curie,
                                              predicate_label,
                                              HMDB_PROVIDED_BY_CURIE_ID,
                                              update_date)
    new_edge["publications_info"] = publications_info
    return new_edge
示例#15
0
def make_kg2_graph(input_file_name: str, test_mode: bool = False):
    """Convert a repoDB CSV file into a KG2 graph dict.

    The graph contains a single node describing the repoDB source, plus one
    drug->indication edge per CSV row. The CSV must provide the columns
    ``status``, ``phase``, ``drug_id``, ``ind_id`` and ``NCT``. The edge
    predicate is ``clinically_tested_<status>_<phase>``; a missing
    status/phase is replaced by ``unknown_status`` / ``unknown_phase``. When
    the row has an NCT identifier, it is appended to the edge's
    ``'publications'`` and mapped to its clinicaltrials URL in
    ``'publications_info'``.

    Args:
        input_file_name: path of the repoDB CSV file; its modification time
            (``os.path.getmtime``, a numeric timestamp) is used as the source
            node's update date.
        test_mode: accepted for interface compatibility; not used here.

    Returns:
        ``{'nodes': [source_node], 'edges': [...]}``.
    """
    update_date = os.path.getmtime(input_file_name)
    nodes = [
        kg2_util.make_node(id=REPODB_CURIE + ':',
                           iri=REPODB_IRI,
                           name='repoDB drug repositioning database',
                           category_label=kg2_util.BIOLINK_CATEGORY_DATA_FILE,
                           update_date=update_date,
                           provided_by=REPODB_CURIE + ':')
    ]
    edges = []
    df = pd.read_csv(input_file_name)
    # Walk the rows once. The previous implementation called
    # ``df[col].isna()`` (a full-column scan) for every row, which made the
    # conversion quadratic in the number of rows.
    for row in df.itertuples(index=False):
        if pd.isna(row.status):
            status = "unknown_status"
        else:
            status = row.status.lower()
        if pd.isna(row.phase):
            phase = "unknown_phase"
        else:
            # e.g. "Phase 2/Phase 3" -> "phase_2_or_phase_3"
            phase = row.phase.lower().replace(" ", "_").replace("/", "_or_")
        relation = "clinically_tested_" + status + "_" + phase
        edge_dict = kg2_util.make_edge(
            subject_id=DRUGBANK_CURIE + ':' + row.drug_id,
            object_id=UMLS_CURIE + ':' + row.ind_id,
            relation_curie=REPODB_CURIE + ':' + relation,
            relation_label=relation,
            provided_by=REPODB_CURIE + ':',
            update_date=None)
        nct_id = row.NCT
        if not pd.isna(nct_id):
            trial_curie = NCT_CURIE + nct_id
            edge_dict['publications'].append(trial_curie)
            edge_dict['publications_info'][trial_curie] = \
                CLINICALTRIALS_IRI + nct_id
        edges.append(edge_dict)
    return {'nodes': nodes, 'edges': edges}
示例#16
0
def make_rel(preds_dict: dict, subject_curie: str, object_curie: str,
             predicate: str, pmid: str, pub_date: str, sentence: str,
             subject_score: str, object_score: str, negated: bool):
    """Record one predication in ``preds_dict``, merging repeated triples.

    Edges are keyed by ``"<subject>-<predicate>-<object>"``. The first time a
    key is seen, a new edge is built with ``kg2_util.make_edge`` (stamped with
    the module-level ``curr_timestamp``) and stored under that key. Later
    calls with the same key only add the publication to the stored edge.

    For a new edge, the ``xref`` predicate maps to ``OBO:xref`` (IRI expanded
    by ``prefixcommons``). Any other predicate gets a ``SEMMEDDB:`` CURIE and
    an IRI made of ``SEMMEDDB_IRI``, ``#``, and the camel-cased label with
    its first character lower-cased.

    Args:
        preds_dict: accumulator, mutated in place.
        subject_curie, object_curie, predicate: the triple.
        pmid: publication identifier, appended to ``'PMID:'``.
        pub_date, sentence, subject_score, object_score: stored verbatim in
            the per-publication info dict.
        negated: stored on the edge when it is first created.

    Returns:
        None.
    """
    triple_key = '-'.join((subject_curie, predicate, object_curie))
    existing_edge = preds_dict.get(triple_key, None)
    pub_curie = 'PMID:' + pmid
    pub_info = {
        'publication date': pub_date,
        'sentence': sentence,
        'subject score': subject_score,
        'object score': object_score
    }

    if existing_edge is not None:
        # Known triple: attach the additional supporting publication.
        existing_edge['publications info'][pub_curie] = pub_info
        existing_edge['publications'] = \
            existing_edge['publications'] + [pub_curie]
        return

    label = predicate.lower()
    if label == 'xref':
        curie_for_relation = 'OBO:xref'
        iri_for_relation = prefixcommons.expand_uri(curie_for_relation)
    else:
        camel = kg2_util.convert_snake_case_to_camel_case(
            label.replace(' ', '_'))
        lower_camel = camel[0].lower() + camel[1:]
        iri_for_relation = SEMMEDDB_IRI + '#' + lower_camel
        curie_for_relation = 'SEMMEDDB:' + label

    new_edge = kg2_util.make_edge(subject_curie, object_curie,
                                  iri_for_relation, curie_for_relation,
                                  label, SEMMEDDB_IRI, curr_timestamp)
    new_edge['publications'] = [pub_curie]
    new_edge['publications info'] = {pub_curie: pub_info}
    new_edge['negated'] = negated
    preds_dict[triple_key] = new_edge
示例#17
0
def make_xref(subject: str, object: str, update_date: str):
    """Return a UniChem-provided ``equivalent_to`` edge between two CURIEs.

    The edge uses the ``owl:sameAs`` IRI/CURIE constants from ``kg2_util`` as
    its relation and ``UNICHEM_KB_IRI`` as its provider.
    """
    return kg2_util.make_edge(subject,
                              object,
                              kg2_util.IRI_OWL_SAME_AS,
                              kg2_util.CURIE_OWL_SAME_AS,
                              'equivalent_to',
                              UNICHEM_KB_IRI,
                              update_date)
示例#18
0
def make_edge(intact_row):
    """Convert one line of an IntAct tab-separated export into a KG2 edge.

    Args:
        intact_row: one raw line of the file. Lines starting with ``#`` are
            treated as comments. Any other line must split on tabs into
            exactly 42 columns (see the inline column comments).

    Returns:
        The edge dict when both interactors have taxon ``HUMAN_TAXON``;
        ``None`` for comment lines and for rows involving another taxon.

    Raises:
        ValueError: if a non-comment line does not have 42 columns.
    """
    # Use the parameter. The previous code read an undefined name ``row``,
    # which only worked if the caller happened to define a global ``row``.
    if intact_row.startswith('#'):
        return None
    data = intact_row.split('\t')
    # last data element is 'Identification method participant B'
    [subject_id,  # ID(s) interactor A
     object_id,  # ID(s) interactor B,
     _,  # Alt. ID(s) interactor A,
     _,  # Alt. ID(s) interactor B,
     subject_name,  # Alias(es) interactor A,
     object_name,  # Alias(es) interactor B,
     _,  # Interaction detection method(s),
     _,  # Publication 1st author(s),
     publications,  # Publication Identifier(s),
     subject_taxon,  # Taxid interactor A,
     object_taxon,  # Taxid interactor B,
     predicate,  # Interaction type(s),
     _,  # Source database(s),
     _,  # Interaction identifier(s),
     confidence,  # Confidence value(s),
     _,  # Expansion method(s),
     _,  # Biological role(s) interactor A,
     _,  # Biological role(s) interactor B,
     _,  # Experimental role(s) interactor A,
     _,  # Experimental role(s) interactor B,
     _,  # Type(s) interactor A,
     _,  # Type(s) interactor B,
     _,  # Xref(s) interactor A,
     _,  # Xref(s) interactor B,
     _,  # Interaction Xref(s),
     _,  # Annotation(s) interactor A,
     _,  # Annotation(s) interactor B,
     _,  # Interaction annotation(s),
     taxon,  # Host organism(s),
     _,  # Interaction parameter(s),
     created_date,  # Creation date,
     update_date,  # Update date,
     _,  # Checksum(s) interactor A,
     _,  # Checksum(s) interactor B,
     _,  # Interaction Checksum(s),
     _,  # Negative,
     _,  # Feature(s) interactor A,
     _,  # Feature(s) interactor B,
     _,  # Stoichiometry(s) interactor A,
     _,  # Stoichiometry(s) interactor B,
     _,  # Identification method participant A
     _] = data
    if subject_taxon == HUMAN_TAXON and object_taxon == HUMAN_TAXON:
        # Keep only the PubMed identifiers from the '|'-separated list.
        publications = [format_pmid(publication)
                        for publication in publications.split('|')
                        if publication.startswith('pubmed')]
        # Filter on each individual score. The previous code tested the
        # whole '|'-joined string, so a miscore that was not the first entry
        # was lost, and non-miscore entries leaked through when it was first.
        confidence = [score.replace('intact-miscore:', '')
                      for score in confidence.split('|')
                      if score.startswith('intact-miscore:')]
        if len(confidence) < 1:
            confidence = None
        else:
            confidence = confidence[0]
        relation_label = format_rel_label(predicate)
        # The relation identifier is the text inside the first pair of
        # double quotes of the interaction-type column.
        relation = predicate.split('"')[1]
        subject_id = subject_id.replace('uniprotkb',
                                        kg2_util.CURIE_PREFIX_UNIPROT)
        object_id = object_id.replace('uniprotkb',
                                      kg2_util.CURIE_PREFIX_UNIPROT)
        # NOTE(review): ``confidence`` and ``created_date`` are parsed but
        # not attached to the edge.
        created_date = format_date(created_date)
        update_date = format_date(update_date)
        edge = kg2_util.make_edge(subject_id,
                                  object_id,
                                  relation,
                                  relation_label,
                                  INTACT_KB_CURIE_ID,
                                  update_date)
        edge['publications'] = publications
        return edge
    return None
示例#19
0
    for line in file_arr:
        if line[0].startswith("!") is False:
            predicate_label = line[2]
            subject_curie = kg2_util.CURIE_PREFIX_UNIPROT + ":" + line[1]
            object_curie = line[3]
            publications = [line[4]]
            eco_code = line[5]
            source = line[6].split("|")
            update_date = line[8]
            evidence = line[10]
            negated = False
            if "NOT|" in predicate_label:
                negated = True
                predicate_label = predicate_label.replace("NOT|", "")

            relation_curie = CURIE_PREFIX_GO + ":" + predicate_label

            edge = kg2_util.make_edge(subject_curie, object_curie,
                                      relation_curie, predicate_label,
                                      GO_PROVIDED_BY_CURIE_ID, update_date)
            edge["negated"] = negated
            edge["publications"] = publications
            edges.append(edge)

    kg2_util.save_json({
        "nodes": [],
        "edges": edges
    }, args.outputFile, args.test)

    print("Ending at", date())
示例#20
0
def make_kg2_graph(input_file_name: str, test_mode: bool = False):
    """Convert an NCBI gene-info style TSV file into a KG2 graph dict.

    The node list starts with a node describing the NCBI Genes source. Every
    non-comment line with the human taxon id then yields one gene node, one
    ``in_taxon`` edge to its taxon, and one ``owl:sameAs``-style edge per
    ``|``-separated cross-reference in the ``db_xrefs`` column.

    Args:
        input_file_name: path of the tab-separated input; each data line
            must have exactly 16 columns.
        test_mode: when true, stop after 10000 non-comment lines.

    Returns:
        ``{'nodes': [...], 'edges': [...]}``.
    """
    nodes = []
    edges = []
    gene_ctr = 0

    # The file's modification time (a numeric timestamp from os.path) is the
    # update date of the source node.
    update_date = os.path.getmtime(input_file_name)
    ontology_curie_id = NCBI_KB_CURIE_ID
    # Node for the knowledge source itself; it is its own provider.
    ens_kp_node = kg2_util.make_node(ontology_curie_id, NCBI_KB_URL,
                                     'NCBI Genes',
                                     kg2_util.BIOLINK_CATEGORY_DATA_FILE,
                                     update_date, ontology_curie_id)
    nodes.append(ens_kp_node)

    with open(input_file_name, 'r') as input_file:
        for line in input_file:
            # Lines starting with '#' are comments and are not counted.
            if line.startswith('#'):
                continue
            gene_ctr += 1
            if test_mode and gene_ctr > 10000:
                break
            fields = line.rstrip("\n").split("\t")
            # A field that is just '-' (ignoring surrounding whitespace)
            # means "no value" and is represented as None from here on.
            fields = [(field if field.strip() != '-' else None)
                      for field in fields]
            [
                taxon_id_str, ncbi_gene_id, gene_symbol, locus_tag,
                synonyms_str, db_xrefs, chromosome, map_location, description,
                type_of_gene, symbol_auth, full_name_auth, nomenc_status,
                other_desig, modify_date, feature_type
            ] = fields
            taxon_id_int = int(taxon_id_str)
            if taxon_id_int != kg2_util.NCBI_TAXON_ID_HUMAN:
                # skip neanderthal- and denisovan-specific genes
                continue
            # Synonyms: the synonyms column, the other-designations column,
            # and the authority symbol when it differs from the gene symbol;
            # de-duplicated through a set (so order is not preserved).
            node_synonyms = list()
            if synonyms_str is not None:
                node_synonyms += synonyms_str.split('|')
            if other_desig is not None:
                node_synonyms += other_desig.split('|')
            if symbol_auth is not None and symbol_auth != gene_symbol:
                node_synonyms = [symbol_auth] + node_synonyms
            node_synonyms = list(set(node_synonyms))
            # Prefer the authority full name; fall back to the description.
            full_name = full_name_auth
            if full_name is None:
                full_name = description
            # An entry is a "genomic entity" only when ALL of these hold:
            # its type is "unknown", its xrefs start with "MIM:", and it has
            # no nomenclature status. Everything else is a gene.
            if type_of_gene != "unknown" or (db_xrefs is None) or (not db_xrefs.startswith("MIM:")) or \
               nomenc_status is not None:
                category_label = kg2_util.BIOLINK_CATEGORY_GENE
            else:
                full_name = 'Genetic locus associated with ' + full_name
                category_label = kg2_util.BIOLINK_CATEGORY_GENOMIC_ENTITY
            # NOTE(review): full_name is None here when both the authority
            # name and the description were '-'; .startswith (and the
            # concatenation above) would then raise. Confirm the input never
            # contains such rows.
            # The microRNA check runs last, so it overrides either category.
            if full_name.startswith('microRNA'):
                category_label = kg2_util.BIOLINK_CATEGORY_MICRORNA
            node_dict = make_node(ncbi_gene_id, full_name, gene_symbol,
                                  modify_date, category_label, node_synonyms)
            node_curie_id = node_dict['id']
            # Description format:
            #   [<description>; ]Type:<type>[; Locus:<map_location>]; NameStatus:<official|unofficial>
            # The free-text description is included only when it differs
            # from the authority full name.
            type_str = 'Type:' + type_of_gene
            node_description = ''
            if description is not None and description != full_name_auth:
                node_description = description + '; '
            node_description += type_str
            # Any nomenclature status value counts as "official".
            if nomenc_status is not None:
                nomenc_tag = 'official'
            else:
                nomenc_tag = 'unofficial'
            if map_location is not None:
                node_description += '; Locus:' + map_location
            node_description += '; NameStatus:' + nomenc_tag
            node_dict['description'] = node_description
            nodes.append(node_dict)
            org_curie = kg2_util.CURIE_PREFIX_NCBI_TAXON + ':' + taxon_id_str
            predicate_label = 'in_taxon'

            edge_dict = kg2_util.make_edge_biolink(node_curie_id, org_curie,
                                                   predicate_label,
                                                   NCBI_KB_CURIE_ID,
                                                   modify_date)
            edges.append(edge_dict)
            if db_xrefs is not None:
                xrefs_list = db_xrefs.split('|')
                for xref_curie in xrefs_list:
                    # Rewrite the source's xref prefixes to KG2 CURIE
                    # prefixes; xrefs with any other prefix pass through
                    # unchanged.
                    if xref_curie.startswith('HGNC:HGNC:'):
                        # All 'HGNC:' occurrences are stripped, then a single
                        # KG2 HGNC prefix is put back.
                        xref_curie = kg2_util.CURIE_PREFIX_HGNC + ':' + xref_curie.replace(
                            'HGNC:', '')
                    elif xref_curie.startswith('Ensembl:'):
                        # The whole xref, prefix included, is upper-cased.
                        xref_curie = xref_curie.upper()
                    elif xref_curie.startswith('MIM:'):
                        xref_curie = kg2_util.CURIE_PREFIX_OMIM + ':' + xref_curie.replace(
                            'MIM:', '')
                    elif xref_curie.startswith('miRBase:'):
                        xref_curie = kg2_util.CURIE_PREFIX_MIRBASE + ':' + xref_curie.replace(
                            'miRBase:', '')
                    edges.append(
                        kg2_util.make_edge(node_curie_id, xref_curie,
                                           kg2_util.CURIE_ID_OWL_SAME_AS,
                                           kg2_util.EDGE_LABEL_OWL_SAME_AS,
                                           NCBI_KB_CURIE_ID, modify_date))
    return {'nodes': nodes, 'edges': edges}
示例#21
0
def get_rels_dict(nodes: dict, owl_file_information_dict_list: list,
                  uri_to_curie_shortener: callable,
                  map_of_node_ontology_ids_to_curie_ids: dict):
    """Build the relationship (edge) dictionary for a list of ontologies.

    For every ontology in `owl_file_information_dict_list`, each edge of the
    ontology's graph is translated into an edge dict (via
    `kg2_util.make_edge`) and stored under a key produced by `make_rel_key`.
    After the graph edges, an `xref` edge is added for every node whose
    `xrefs` entry names another node present in `nodes`.

    Args:
        nodes: maps node CURIE IDs to node dicts; the code reads the keys
            'update date', 'name', 'description', 'id', 'xrefs' and
            'provided by'. The 'description' of a subject node is modified
            in place for `UMLS:hasSTY` edges.
        owl_file_information_dict_list: list of dicts, each with an
            'ontology' object (providing `get_graph()` and `id`) and an 'id'.
        uri_to_curie_shortener: callable taking an IRI and returning a CURIE,
            or None when the IRI cannot be shortened.
        map_of_node_ontology_ids_to_curie_ids: maps IDs as they appear in the
            ontology objects to node CURIE IDs; must contain every
            ontology's 'id'.

    Returns:
        dict mapping relationship keys to edge dicts. Only the first edge
        seen for a given key is kept.
    """
    rels_dict = dict()

    for owl_file_information_dict in owl_file_information_dict_list:
        ontology = owl_file_information_dict['ontology']
        ontology_id = owl_file_information_dict['id']
        ont_graph = ontology.get_graph()
        ontology_curie_id = map_of_node_ontology_ids_to_curie_ids[ontology_id]

        # Resolve the update date once per ontology. Defaulting to None keeps
        # the name bound when the ontology has no node in `nodes` (or no
        # edges), and prevents a previous ontology's date from leaking into
        # this ontology's edges.
        ontology_node = nodes.get(ontology_curie_id, None)
        if ontology_node is not None:
            ontology_update_date = ontology_node['update date']
        else:
            ontology_update_date = None

        # Each graph edge is unpacked as (object, subject, data), in that order.
        for (object_id, subject_id,
             predicate_dict) in ont_graph.edges(data=True):
            assert isinstance(predicate_dict, dict)

            if subject_id == OWL_BASE_CLASS or object_id == OWL_BASE_CLASS:
                continue

            if subject_id.startswith(MYSTERIOUS_BASE_NODE_ID_TO_FILTER) or \
               object_id.startswith(MYSTERIOUS_BASE_NODE_ID_TO_FILTER):
                continue

            # subject_id and object_id are IDs from the original ontology objects; these may not
            # always be the node curie IDs (e.g., for SNOMED terms). Need to map them
            subject_curie_id = map_of_node_ontology_ids_to_curie_ids.get(
                subject_id, None)
            if subject_curie_id is None:
                kg2_util.log_message(
                    message="ontology node ID has no curie ID in the map",
                    ontology_name=ontology.id,
                    node_curie_id=subject_id,
                    output_stream=sys.stderr)
                continue
            object_curie_id = map_of_node_ontology_ids_to_curie_ids.get(
                object_id, None)
            if object_curie_id is None:
                kg2_util.log_message(
                    message="ontology node ID has no curie ID in the map",
                    ontology_name=ontology.id,
                    node_curie_id=object_id,
                    output_stream=sys.stderr)
                continue

            predicate_label = None
            edge_pred_string = predicate_dict['pred']

            # subClassOf edges between two TUI nodes are dropped
            if subject_curie_id.startswith('TUI:') and \
               object_curie_id.startswith('TUI:') and \
               edge_pred_string == 'subClassOf':
                continue

            if not edge_pred_string.startswith(
                    'http:') and not edge_pred_string.startswith('https'):
                # edge_pred_string is not a URI; this is the most common case
                if ':' not in edge_pred_string:
                    # edge_pred_string is not a CURIE; this is the most common subcase
                    if edge_pred_string != 'subClassOf':
                        predicate_curie = 'owl:' + edge_pred_string
                    else:
                        predicate_curie = 'rdfs:subClassOf'
                    predicate_label = kg2_util.convert_camel_case_to_snake_case(
                        edge_pred_string)
                else:
                    # edge_pred_string is a CURIE
                    predicate_curie = edge_pred_string
                    predicate_node = nodes.get(predicate_curie, None)
                    if predicate_node is not None:
                        predicate_label = predicate_node['name']
                    else:
                        # predicate has no node object defined; just pull the label out of the CURIE
                        if edge_pred_string.startswith('OBO:'):
                            test_curie = edge_pred_string.replace(
                                'OBO:', '').replace('_', ':')
                            predicate_node = nodes.get(test_curie, None)
                            if predicate_node is None:
                                predicate_label = edge_pred_string.split(
                                    ':')[1].split('#')[-1]
                            else:
                                # label stays None here; it is filled in
                                # from the predicate node further below
                                predicate_curie = test_curie
                        else:
                            predicate_label = edge_pred_string
                predicate_iri = prefixcommons.expand_uri(predicate_curie)
                predicate_curie_new = uri_to_curie_shortener(predicate_iri)
                if predicate_curie_new is not None:
                    predicate_curie = predicate_curie_new
            else:
                # edge_pred_string is already an IRI
                predicate_iri = edge_pred_string
                predicate_curie = uri_to_curie_shortener(predicate_iri)

            if predicate_curie is None:
                kg2_util.log_message(message="predicate IRI has no CURIE: " +
                                     predicate_iri,
                                     ontology_name=ontology.id,
                                     output_stream=sys.stderr)
                continue

            # self-referential xrefs carry no information
            if subject_curie_id == object_curie_id and predicate_label == 'xref':
                continue

            if predicate_curie == 'UMLS:hasSTY':
                # Semantic-type assertions are folded into the subject node's
                # description instead of being emitted as edges.
                subject_node = nodes[subject_curie_id]
                object_node = nodes[object_curie_id]
                subject_description = subject_node['description']
                if subject_description is None:
                    subject_description = ''
                subject_node['description'] = '; '.join(
                    list(
                        filter(None, [
                            subject_description,
                            'UMLS Semantic Type: ' + object_node['id']
                        ])))
                continue

            rel_key = make_rel_key(subject_curie_id, predicate_curie,
                                   object_curie_id, ontology_curie_id)

            if predicate_label is None and ':' in predicate_curie:
                pred_node = nodes.get(predicate_curie, None)
                if pred_node is not None:
                    predicate_label = pred_node['name']
                    if predicate_label[0].isupper():
                        predicate_label = predicate_label[0].lower(
                        ) + predicate_label[1:]

            assert predicate_label is not None
            predicate_label = predicate_label.replace(' ', '_')
            # Only tested on Food and Efo ontologies
            predicate_label = kg2_util.convert_camel_case_to_snake_case(
                predicate_label)
            if rels_dict.get(rel_key, None) is None:
                edge = kg2_util.make_edge(subject_curie_id, object_curie_id,
                                          predicate_iri, predicate_curie,
                                          predicate_label, ontology_id,
                                          ontology_update_date)
                rels_dict[rel_key] = edge

        # Add an xref edge for every node xref that points at another known
        # node; the edge is attributed to the source node's 'provided by'.
        for node_id, node_dict in nodes.items():
            xrefs = node_dict['xrefs']
            if xrefs is not None:
                for xref_node_id in xrefs:
                    if xref_node_id in nodes and node_id != xref_node_id:
                        provided_by = nodes[node_id]['provided by']
                        key = make_rel_key(node_id, CURIE_OBO_XREF,
                                           xref_node_id, provided_by)
                        if rels_dict.get(key, None) is None:
                            edge = kg2_util.make_edge(node_id, xref_node_id,
                                                      IRI_OBO_XREF,
                                                      CURIE_OBO_XREF, 'xref',
                                                      provided_by,
                                                      ontology_update_date)
                            rels_dict[key] = edge

    return rels_dict
示例#22
0
def make_xref(subject: str, object: str, update_date: str):
    """Return a same-as edge linking `subject` to `object`.

    The edge is built by `kg2_util.make_edge` using the OWL same-as CURIE and
    label constants from `kg2_util`, with `UNICHEM_KB_CURIE` and
    `update_date` passed as the trailing arguments.
    """
    return kg2_util.make_edge(
        subject,
        object,
        kg2_util.CURIE_ID_OWL_SAME_AS,
        kg2_util.EDGE_LABEL_OWL_SAME_AS,
        UNICHEM_KB_CURIE,
        update_date,
    )
def make_kg2_graph(input_file_name: str, test_mode: bool = False):
    """Convert a tab-separated drug-gene interactions file into nodes and edges.

    Lines starting with '#' set the update date (the line with every '#'
    removed); the column-header line (starting with 'gene_name<TAB>') is
    skipped. Every remaining line must have exactly 11 tab-separated fields.

    A row yields edges only when it has an Entrez gene ID and a usable drug
    identifier: either a drug concept ID containing "chembl", or a drug claim
    name from one of the two recognized interaction claim sources (in which
    case a node is also created for the drug). Other rows are skipped, with a
    message on stderr where the drug could not be identified.

    Args:
        input_file_name: path of the TSV file to read.
        test_mode: when True, stop after 10000 data rows.

    Returns:
        dict with keys 'nodes' and 'edges', each a list of dicts.

    Raises:
        ValueError: if a data row does not have exactly 11 fields.
    """
    nodes = []
    edges = []
    line_ctr = 0
    update_date = None
    with open(input_file_name, 'r') as input_file:
        for line in input_file:
            line = line.rstrip("\n")
            if line.startswith('#'):
                update_date = line.replace('#', '')
                continue
            if line.startswith('gene_name\t'):
                continue
            line_ctr += 1
            if test_mode and line_ctr > 10000:
                break
            fields = line.split("\t")
            [gene_name,
             gene_claim_name,
             entrez_id,
             interaction_claim_source,
             interaction_types,
             drug_claim_name,
             drug_claim_primary_name,
             drug_name,
             drug_concept_id,
             _,  # 12.5.2020 new field in tsv: interaction group score
             PMIDs] = fields
            if entrez_id == "":
                # no gene to attach the interaction to
                continue

            # Reset for every row: otherwise a row with neither a drug
            # concept ID nor a drug claim name would reuse the previous row's
            # drug (or raise UnboundLocalError on the first row).
            subject_curie_id = None
            if drug_concept_id != "":
                if "chembl" in drug_concept_id:
                    _, chembl_id = drug_concept_id.split(":")
                    subject_curie_id = kg2_util.CURIE_PREFIX_CHEMBL_COMPOUND + ':' + chembl_id
                else:
                    print(f"DGIDB: Skipping row with drug concept id {drug_concept_id}", file=sys.stderr)
                    continue  # skipping over wikidata nodes, see #1185
            elif drug_claim_name != "":
                node_pubs_list = []
                if interaction_claim_source == INTERACTION_CLAIM_SOURCE_GTPI:
                    subject_curie_id = GTPI_CURIE_PREFIX + ':' + drug_claim_name
                    # the primary name may embed a publication reference
                    pmid_match = RE_PMID.match(drug_claim_primary_name)
                    if pmid_match is not None:
                        node_pubs_list = [pmid_match[2].replace(' ', '').strip()]
                        node_name = pmid_match[1].strip()
                    else:
                        node_name = drug_claim_primary_name
                    node_iri = GTPI_BASE_URL + drug_claim_name
                    provided_by = GTPI_KB_CURIE
                elif interaction_claim_source == INTERACTION_CLAIM_SOURCE_TTD:
                    subject_curie_id = TTD_CURIE_PREFIX + ':' + drug_claim_name
                    node_name = drug_claim_primary_name
                    node_iri = TTD_IRI_BASE + drug_claim_name
                    provided_by = TTD_KB_CURIE
                if subject_curie_id is not None:
                    node_dict = kg2_util.make_node(subject_curie_id,
                                                   node_iri,
                                                   node_name,
                                                   kg2_util.BIOLINK_CATEGORY_CHEMICAL_SUBSTANCE,
                                                   update_date,
                                                   provided_by)
                    node_dict['publications'] = node_pubs_list
                    nodes.append(node_dict)

            if subject_curie_id is None:
                print("DGIDB: no controlled ID was provided for this drug: " + drug_claim_primary_name +
                      "; source DB: " + interaction_claim_source, file=sys.stderr)
                continue

            object_curie_id = kg2_util.CURIE_PREFIX_NCBI_GENE + ':' + entrez_id
            if interaction_types == "":
                print("DGIDB: interaction type was empty. Setting to 'affects'.", file=sys.stderr)
                interaction_types = "affects"
            pmids_list = []
            if PMIDs.strip() != "":
                pmids_list = [(kg2_util.CURIE_PREFIX_PMID + ':' + pmid.strip()) for pmid in PMIDs.split(',')]
            # one edge per comma-separated interaction type
            for interaction in interaction_types.split(','):
                interaction = interaction.replace(' ', '_')
                edge_dict = kg2_util.make_edge(subject_curie_id,
                                               object_curie_id,
                                               DGIDB_CURIE_PREFIX + ':' + interaction,
                                               interaction,
                                               DGIDB_KB_CURIE,
                                               update_date)
                edge_dict['publications'] = pmids_list
                edges.append(edge_dict)
    return {'nodes': nodes,
            'edges': edges}
示例#24
0
def make_kg2_graph(input_file_name: str, test_mode: bool = False):
    """Convert a tab-separated gene information file into nodes and edges.

    Lines starting with '#' are skipped. Every other line must have exactly
    16 tab-separated fields; a field consisting of '-' is treated as missing
    (None). Rows whose taxon ID differs from `kg2_util.NCBI_TAXON_ID_HUMAN`
    are skipped.

    For each remaining row one gene node is created (via `make_node`), plus a
    'gene_found_in_organism' edge to the taxon and one 'equivalent_to' edge
    per entry of the '|'-separated database cross-reference field.

    Args:
        input_file_name: path of the TSV file to read.
        test_mode: when True, stop after 10000 non-comment rows.

    Returns:
        dict with keys 'nodes' and 'edges', each a list of dicts.

    Raises:
        ValueError: if a row does not have exactly 16 fields.
    """
    nodes = []
    edges = []
    gene_ctr = 0
    with open(input_file_name, 'r') as input_file:
        for line in input_file:
            if line.startswith('#'):
                continue
            gene_ctr += 1
            if test_mode and gene_ctr > 10000:
                break
            fields = line.rstrip("\n").split("\t")
            # a lone '-' marks a missing value
            fields = [(field if field.strip() != '-' else None) for field in fields]
            [taxon_id_str,
             ncbi_gene_id,
             gene_symbol,
             locus_tag,
             synonyms_str,
             db_xrefs,
             chromosome,
             map_location,
             description,
             type_of_gene,
             symbol_auth,
             full_name_auth,
             nomenc_status,
             other_desig,
             modify_date,
             feature_type] = fields
            taxon_id_int = int(taxon_id_str)
            if taxon_id_int != kg2_util.NCBI_TAXON_ID_HUMAN:
                # skip neanderthal- and denisovan-specific genes
                continue
            node_synonyms = list()
            if synonyms_str is not None:
                node_synonyms += synonyms_str.split('|')
            if other_desig is not None:
                node_synonyms += other_desig.split('|')
            if symbol_auth is not None and symbol_auth != gene_symbol:
                node_synonyms = [symbol_auth] + node_synonyms
            # De-duplicate while keeping first-seen order, so the
            # authoritative symbol stays first and the output is the same on
            # every run (a set would scramble the order).
            node_synonyms = list(dict.fromkeys(node_synonyms))
            full_name = full_name_auth
            if full_name is None:
                full_name = description
            node_dict = make_node(ncbi_gene_id,
                                  full_name,
                                  gene_symbol,
                                  modify_date,
                                  node_synonyms)
            node_curie_id = node_dict['id']
            type_str = 'Type:'+type_of_gene
            node_description = ''
            # the description is only repeated when it adds to the full name
            if description is not None and description != full_name_auth:
                node_description = description + '; '
            node_description += type_str
            if map_location is not None:
                node_description += '; Locus:' + map_location
            if nomenc_status is not None:
                nomenc_tag = 'official'
            else:
                nomenc_tag = 'unofficial'
            node_description += '; NameStatus:' + nomenc_tag
            node_dict['description'] = node_description
            nodes.append(node_dict)
            org_curie = kg2_util.CURIE_PREFIX_NCBI_TAXON + ':' + taxon_id_str
            predicate_label = 'gene_found_in_organism'
            [relation, relation_curie] = kg2_util.predicate_label_to_iri_and_curie(predicate_label,
                                                                                   NCBI_RELATION_CURIE_PREFIX,
                                                                                   NCBI_BASE_IRI)
            edge_dict = kg2_util.make_edge(node_curie_id,
                                           org_curie,
                                           relation,
                                           relation_curie,
                                           predicate_label,
                                           NCBI_BASE_IRI,
                                           modify_date)
            edges.append(edge_dict)
            if db_xrefs is not None:
                xrefs_list = db_xrefs.split('|')
                for xref_curie in xrefs_list:
                    # normalize the cross-reference prefixes
                    if xref_curie.startswith('HGNC:HGNC:'):
                        xref_curie = 'HGNC:' + xref_curie.replace('HGNC:', '')
                    elif xref_curie.startswith('Ensembl:'):
                        xref_curie = xref_curie.upper()
                    elif xref_curie.startswith('MIM:'):
                        xref_curie = 'OMIM:' + xref_curie.replace('MIM:', '')
                    edges.append(kg2_util.make_edge(node_curie_id,
                                                    xref_curie,
                                                    kg2_util.IRI_OWL_SAME_AS,
                                                    kg2_util.CURIE_OWL_SAME_AS,
                                                    'equivalent_to',
                                                    NCBI_BASE_IRI,
                                                    modify_date))
    return {'nodes': nodes,
            'edges': edges}
示例#25
0
def make_kg2_graph(input_file_name: str, test_mode: bool = False):
    """Convert a tab-separated drug-gene interactions file into nodes and edges.

    Lines starting with '#' set the update date (the line with every '#'
    removed); the column-header line (starting with 'gene_name<TAB>') is
    skipped. Every remaining line must have exactly 10 tab-separated fields.

    A row yields edges only when it has an Entrez gene ID and a usable drug
    identifier: either a ChEMBL ID, or a drug claim name from the
    "GuideToPharmacologyInteractions" or "TTD" claim sources (in which case a
    node is also created for the drug). Rows whose drug cannot be identified
    are skipped with a message on stderr.

    Args:
        input_file_name: path of the TSV file to read.
        test_mode: when True, stop after 10000 data rows.

    Returns:
        dict with keys 'nodes' and 'edges', each a list of dicts.

    Raises:
        ValueError: if a data row does not have exactly 10 fields.
    """
    nodes = []
    edges = []
    line_ctr = 0
    update_date = None
    with open(input_file_name, 'r') as input_file:
        for line in input_file:
            line = line.rstrip("\n")
            if line.startswith('#'):
                update_date = line.replace('#', '')
                continue
            if line.startswith('gene_name\t'):
                continue
            line_ctr += 1
            if test_mode and line_ctr > 10000:
                break
            fields = line.split("\t")
            [gene_name,
             gene_claim_name,
             entrez_id,
             interaction_claim_source,
             interaction_types,
             drug_claim_name,
             drug_claim_primary_name,
             drug_name,
             drug_chembl_id,
             PMIDs] = fields
            if entrez_id == "":
                # no gene to attach the interaction to
                continue
            object_curie_id = 'NCBIGene:' + entrez_id

            # Reset for every row: otherwise a row with neither a ChEMBL ID
            # nor a drug claim name would reuse the previous row's drug (or
            # raise UnboundLocalError on the first row).
            subject_curie_id = None
            if drug_chembl_id != "":
                subject_curie_id = 'CHEMBL.COMPOUND:' + drug_chembl_id
            elif drug_claim_name != "":
                node_pubs_list = []
                if interaction_claim_source == "GuideToPharmacologyInteractions":
                    subject_curie_id = GTPI_CURIE_PREFIX + ':' + drug_claim_name
                    # the primary name may embed a publication reference
                    pmid_match = RE_PMID.match(drug_claim_primary_name)
                    if pmid_match is not None:
                        node_pubs_list = [pmid_match[2].replace(' ', '').strip()]
                        node_name = pmid_match[1].strip()
                    else:
                        node_name = drug_claim_primary_name
                    node_iri = GTPI_IRI_BASE + GTPI_LIGAND_SUFFIX + drug_claim_name
                    provided_by = GTPI_IRI_BASE
                elif interaction_claim_source == "TTD":
                    subject_curie_id = TTD_CURIE_PREFIX + ':' + drug_claim_name
                    node_name = drug_claim_primary_name
                    node_iri = TTD_IRI_BASE + drug_claim_name
                    provided_by = TTD_IRI_BASE
                if subject_curie_id is not None:
                    node_dict = kg2_util.make_node(subject_curie_id,
                                                   node_iri,
                                                   node_name,
                                                   'chemical_substance',
                                                   update_date,
                                                   provided_by)
                    node_dict['publications'] = node_pubs_list
                    nodes.append(node_dict)

            if subject_curie_id is None:
                print("DGIDB: no controlled ID was provided for this drug: " + drug_claim_primary_name + "; source DB: " + interaction_claim_source, file=sys.stderr)
                continue

            if interaction_types == "":
                # default interaction type when none is given
                interaction_types = "affects"
            pmids_list = []
            if PMIDs.strip() != "":
                pmids_list = [('PMID:' + pmid.strip()) for pmid in PMIDs.split(',')]
            # one edge per comma-separated interaction type
            for interaction in interaction_types.split(','):
                interaction = interaction.replace(' ', '_')
                edge_dict = kg2_util.make_edge(subject_curie_id,
                                               object_curie_id,
                                               DGIDB_BASE_IRI + '/' +
                                               kg2_util.convert_snake_case_to_camel_case(interaction),
                                               DGIDB_CURIE_PREFIX + ':' + interaction,
                                               interaction,
                                               DGIDB_BASE_IRI,
                                               update_date)
                edge_dict['publications'] = pmids_list
                edges.append(edge_dict)
    return {'nodes': nodes,
            'edges': edges}