Пример #1
0
def get_connection_normalizing_count(gene_list, node_type_list):
    """Count the direct connections from each gene to a set of node types.

    For every symbol in ``gene_list`` the module-level ``ht`` client is
    queried and its ``'Gene'`` hits are scanned for entries whose
    ``'SYMBOL'`` equals the requested symbol, ignoring case.  When several
    hits match, the last one is used.  For a resolved gene, one
    ``FindConnection`` query without intermediate nodes is run per entry of
    ``node_type_list`` and the row counts of the table views are summed.

    Args:
        gene_list: Iterable of gene symbols (strings).
        node_type_list: Iterable of node types passed to ``FindConnection``
            as ``output_obj``.

    Returns:
        dict mapping each gene symbol to its summed row count, or to the
        string ``'Unknown'`` when no hit matched the symbol.  A node type
        whose query raises is reported on stdout and adds nothing to the
        total.
    """
    connection_dict = {}
    for gene_symbol in gene_list:
        wanted = gene_symbol.lower()
        matches = [
            hit for hit in ht.query(gene_symbol)['Gene']
            if hit['SYMBOL'].lower() == wanted
        ]
        if not matches:
            print(gene_symbol + ' could not be found')
            connection_dict[gene_symbol] = 'Unknown'
            continue

        # Keep the last matching hit, as the original scan did.
        gene = matches[-1]
        count = 0
        for node_type in node_type_list:
            try:
                # Only look at direct connections (no intermediate nodes).
                fc = FindConnection(input_obj=gene,
                                    output_obj=node_type,
                                    intermediate_nodes=None)
                fc.connect(verbose=False)
                count += fc.display_table_view().shape[0]
            except Exception:
                # Best effort: a failing node type must not abort the whole
                # run, but KeyboardInterrupt/SystemExit still propagate.
                print("gene " + str(gene_symbol) +
                      " for node intermediate " + str(node_type) + " failed")
        connection_dict[gene_symbol] = count
    return connection_dict
 def setUp(self):
     """Run a direct CXCR4 -> ChemicalSubstance query for each test.

     The first ``'Gene'`` hit for ``'CXCR4'`` is used as the query input and
     the result of ``to_reasoner_std()`` is stored on ``self.response``.
     """
     hint = Hint()
     gene_hits = hint.query('CXCR4')['Gene']
     finder = FindConnection(
         input_obj=gene_hits[0],
         output_obj='ChemicalSubstance',
         intermediate_nodes=None,
     )
     finder.connect(verbose=True)
     self.response = finder.to_reasoner_std()
 def setUpClass(cls):
     """Run a direct CXCR4 -> ChemicalSubstance query once for the class.

     The first ``"Gene"`` hit for ``"CXCR4"`` is used as the query input and
     the result of ``to_reasoner_std()`` is stored on ``cls.response``.
     """
     hint = Hint()
     gene_hits = hint.query("CXCR4")["Gene"]
     finder = FindConnection(
         input_obj=gene_hits[0],
         output_obj="ChemicalSubstance",
         intermediate_nodes=None,
     )
     finder.connect(verbose=True)
     cls.response = finder.to_reasoner_std()
Пример #4
0
 def test_one_filt(self):
     """Query with an empty filter entry followed by a NodeDegree filter.

     Expects the resulting graph to contain exactly 27 nodes.
     """
     disease = Hint().query('hyperphenylalaninemia')['Disease'][0]
     node_filters = [{}, {'name': 'NodeDegree', 'count': 25}]
     finder = FindConnection(
         input_obj=disease,
         output_obj='ChemicalSubstance',
         intermediate_nodes=['Gene'],
         filters=node_filters,
     )
     finder.connect()
     graph = finder.fc.G
     self.assertEqual(27, len(graph.nodes))
Пример #5
0
 def test_no_filter(self):
     """With an empty filter list, no graph node carries a 'filteredBy' attribute."""
     disease = Hint().query('hyperphenylalaninemia')['Disease'][0]
     finder = FindConnection(
         input_obj=disease,
         output_obj='ChemicalSubstance',
         intermediate_nodes=['Gene'],
         filters=[],
     )
     finder.connect()
     # Walk every node together with its attribute mapping.
     for _node, attrs in finder.fc.G.nodes(data=True):
         self.assertTrue('filteredBy' not in attrs.keys())
Пример #6
0
 def test_result_section(self):
     """Connect TMPRSS2 to pentamidine through BiologicalEntity intermediates.

     Expects "PLXNA2" to be present in the resulting graph.
     """
     source_gene = ht.query("TMPRSS2")["Gene"][0]
     target_chemical = ht.query("pentamidine")["ChemicalSubstance"][0]
     finder = FindConnection(input_obj=source_gene,
                             output_obj=target_chemical,
                             intermediate_nodes=["BiologicalEntity"],
                             registry=None)
     finder.connect(verbose=True)
     graph = finder.fc.G
     self.assertIn("PLXNA2", graph)
Пример #7
0
 def test_query_with_broken_intermediate_nodes(self):
     """Query through a long chain of intermediate node types.

     One of the intermediate queries may return 0 hits.  The test checks
     that the graph built for "Multiple Organ Failure" -> Gene still holds
     more than 2 entries.
     """
     intermediate_types = [
         "BiologicalProcess",
         "Cell",
         "AnatomicalEntity",
         "CellularComponent",
     ]
     disease = ht.query("Multiple Organ Failure")["Disease"][0]
     finder = FindConnection(disease,
                             output_obj="Gene",
                             intermediate_nodes=intermediate_types)
     finder.connect()
     graph = finder.fc.G
     self.assertGreater(len(graph), 2)
Пример #8
0
def get_disease_to_gene_results(disease_input):
    """Summarise the genes directly connected to a disease.

    Runs a ``FindConnection`` query from ``disease_input`` to ``'Gene'``
    with no intermediate nodes, drops rows whose ``output_id`` contains
    ``'UMLS'`` and aggregates the remaining rows per ``output_name``.

    Args:
        disease_input: Query input object handed to ``FindConnection`` as
            ``input_obj``.

    Returns:
        dict with three entries:

        * ``"sorted_disease_to_genes"``: gene name -> number of rows,
          ordered by ascending row count.
        * ``"one_step_genes_pub_counts"``: gene name -> summed publication
          count, where a row contributes ``commas + 1`` for its
          ``pred1_pubmed`` value, or 0 when that value is ``None``.
        * ``"disease_to_genes_list"``: gene names in the reverse order of
          ``"sorted_disease_to_genes"``.
    """
    # Local import keeps the block self-contained without touching the
    # file's import section.
    from collections import Counter

    # Directly related genes only.
    fc = FindConnection(input_obj=disease_input,
                        output_obj='Gene',
                        intermediate_nodes=None)
    fc.connect(verbose=False)
    disease_to_genes = fc.display_table_view()

    is_umls = disease_to_genes['output_id'].str.contains('UMLS')
    disease_to_genes = disease_to_genes[~is_umls]

    # Count occurrences per gene in one pass; Counter keeps first-seen
    # order, so the stable sort below yields the same ordering as counting
    # each name separately.
    print("running disease -> gene")
    occurrences = Counter(list(disease_to_genes["output_name"]))
    sorted_disease_to_genes = dict(
        sorted(occurrences.items(), key=lambda item: item[1]))

    one_step_genes_pub_counts = {}
    for _, row in disease_to_genes.iterrows():
        pubmed = row["pred1_pubmed"]
        # NOTE(review): assumes pred1_pubmed is a comma-separated string or
        # None - confirm against display_table_view output.
        current_pubcount = pubmed.count(",") + 1 if pubmed is not None else 0
        gene_name = row["output_name"]
        one_step_genes_pub_counts[gene_name] = (
            one_step_genes_pub_counts.get(gene_name, 0) + current_pubcount)

    return {
        "sorted_disease_to_genes": sorted_disease_to_genes,
        "one_step_genes_pub_counts": one_step_genes_pub_counts,
        "disease_to_genes_list": list(sorted_disease_to_genes)[::-1],
    }
Пример #9
0
def predict_many(input_object, intermediate_node_list, output_type):
    """Run one single-intermediate query per node type and stack the results.

    For each entry of ``intermediate_node_list`` a ``FindConnection`` query
    from ``input_object`` to ``output_type`` is run with that entry as the
    only intermediate node type.  Non-empty table views are collected.

    Args:
        input_object: Query input handed to ``FindConnection`` as
            ``input_obj``.
        intermediate_node_list: Iterable of intermediate node types.
        output_type: Value handed to ``FindConnection`` as ``output_obj``.

    Returns:
        The concatenation (``pd.concat``) of all non-empty table views, or
        ``None`` when no query produced any rows.
    """
    df_list = []
    for inter in intermediate_node_list:
        try:
            print("Intermediate Node type running:")
            print(inter)
            fc = FindConnection(input_obj=input_object,
                                output_obj=output_type,
                                intermediate_nodes=[inter])
            fc.connect(verbose=False)
            df = fc.display_table_view()
            if df.shape[0] > 0:
                df_list.append(df)
        except Exception:
            # Best effort per node type; KeyboardInterrupt/SystemExit are
            # no longer swallowed.
            print("FAILED")
    if df_list:
        return pd.concat(df_list)
    return None
Пример #10
0
 def get(self):
     """Handle a GET request describing a FindConnection query.

     Reads the ``input_obj``, ``output_obj`` and ``intermediate_nodes``
     query arguments.  The first two are JSON-decoded and the third is
     parsed with ``ast.literal_eval``.  The query is run against the
     module-level registry ``reg``.

     Responses:
         200: JSON ``{'data': [...], 'log': ...}`` with the de-duplicated
             rows of the table view, restricted to a fixed set of columns.
         400: JSON ``{'error': ...}`` when a query argument cannot be parsed.
         404: JSON ``{'error': ...}`` when the query produced no rows.
     """
     input_obj = self.get_query_argument('input_obj')
     output_obj = self.get_query_argument('output_obj')
     print("executing connect query: ", self.request.uri)
     intermediate_nodes = self.get_query_argument('intermediate_nodes')
     try:
         if isinstance(input_obj, str):
             input_obj = tornado.escape.json_decode(input_obj)
         if isinstance(output_obj, str):
             output_obj = tornado.escape.json_decode(output_obj)
         if isinstance(intermediate_nodes, str):
             # literal_eval only accepts Python literals, so it does not
             # execute request-supplied code.
             intermediate_nodes = ast.literal_eval(intermediate_nodes)
     except (ValueError, TypeError, SyntaxError) as err:
         # Malformed client input is a client error, not a server failure.
         self.set_status(400)
         self.write(
             json.dumps(
                 {'error': "Unable to parse query arguments: " + str(err)}))
         self.finish()
         return

     fc = FindConnection(input_obj=input_obj,
                         output_obj=output_obj,
                         intermediate_nodes=intermediate_nodes,
                         registry=reg)
     fc.connect()
     df = fc.display_table_view()
     if df.empty:
         res = []
     else:
         columns = [
             'input', 'pred1', 'pred1_api', 'node1_name', 'node1_type',
             'pred2', 'pred2_api', 'output_name'
         ]
         # Non-mutating drop_duplicates avoids modifying a derived frame
         # in place.
         res = df[columns].drop_duplicates().to_dict('records')

     if not res:
         self.set_status(404)
         self.write(json.dumps({'error': "Unable to find any connection"}))
         self.finish()
         return

     self.set_status(200)
     self.write(
         tornado.escape.json_encode({
             'data': res,
             'log': fc.fc.log
         }))
     self.finish()
     return
Пример #11
0
 def test_two_filt(self):
     """Query with a NodeDegree filter followed by an EdgeLabel filter.

     Expects 52 nodes in the graph.  Apart from the node named
     'mild hyperphenylalaninemia', every Gene node must be marked as
     filtered by 'NodeDegree' and every ChemicalSubstance node as filtered
     by 'EdgeLabel'.
     """
     disease = Hint().query('hyperphenylalaninemia')['Disease'][0]
     node_filters = [
         {'name': 'NodeDegree', 'count': 20},
         {'name': 'EdgeLabel', 'label': 'related_to'},
     ]
     finder = FindConnection(
         input_obj=disease,
         output_obj='ChemicalSubstance',
         intermediate_nodes=['Gene'],
         filters=node_filters,
     )
     finder.connect()
     self.assertEqual(52, len(finder.fc.G.nodes))
     for node, attrs in finder.fc.G.nodes(data=True):
         if node == 'mild hyperphenylalaninemia':
             continue
         node_type = attrs['type']
         if node_type == 'Gene':
             expected = 'NodeDegree'
         elif node_type == 'ChemicalSubstance':
             expected = 'EdgeLabel'
         else:
             continue
         self.assertEqual(expected, attrs['filteredBy'])
Пример #12
0
from biothings_explorer.hint import Hint
from biothings_explorer.user_query_dispatcher import FindConnection

# Resolve the ACE2 gene: take the first 'Gene' hit returned by the hint query.
ht = Hint()
ace2 = ht.query("ACE2")["Gene"][0]

# Connect ACE2 to DiseaseOrPhenotypicFeature with ChemicalSubstance given as
# the intermediate node type.
fc = FindConnection(
    input_obj=ace2,
    output_obj="DiseaseOrPhenotypicFeature",
    intermediate_nodes="ChemicalSubstance",
)
fc.connect(verbose=True)

# Export the result as GraphML to the file "test.graphml".
output = fc.to_graphml("test.graphml")
Пример #13
0
# The angiotensin-converting enzyme 2, or ACE2 “receptor,” is the protein
# that provides the entry point for the coronavirus to hook into and
# infect a wide range of human cells.

from biothings_explorer.user_query_dispatcher import FindConnection
from biothings_explorer.hint import Hint
# Resolve the ACE2 gene: take the first 'Gene' hit returned by the hint query.
ht = Hint()
ace2 = ht.query("ACE2")["Gene"][0]

# Connect ACE2 to ChemicalSubstance with BiologicalEntity as the intermediate
# node type.
fc = FindConnection(
    input_obj=ace2,
    output_obj="ChemicalSubstance",
    intermediate_nodes=["BiologicalEntity"],
)

# NOTE: the author reported a KeyError: "equivalent_ids" raised by this call
# and did not take the example any further.
fc.connect(verbose=True)
Пример #14
0
def determined_genes_to_symptoms(gene_list, symptom_list):
    """Collect gene connections whose output resembles one of the symptoms.

    Three kinds of direct connection are queried for every gene symbol:
    PhenotypicFeature, BiologicalProcess and Disease.  Each result table is
    narrowed to the row positions returned by ``get_similar_phen_indices``,
    ``get_similar_bp_indices`` and ``get_similar_disease_indices``
    (defined elsewhere), which are called with ``symptom_list`` and the
    value ``0.95``.  HP and GO identifiers found in ``output_name`` are
    resolved to names through the biothings.ncats.io endpoints first.

    A category for which no gene produced any rows is skipped instead of
    raising ``UnboundLocalError``.

    Args:
        gene_list: Iterable of gene symbols looked up through ``ht.query``.
        symptom_list: Symptoms passed through to the similarity helpers.

    Returns:
        pandas.DataFrame of the bioprocess, phenotype and disease rows (in
        that order) with ``output_name`` lower-cased.  When no category
        produced rows, an empty frame with an ``output_name`` column.
    """
    # gene -> phenotypic feature nodes
    print("Genes -> PhenotypicFeatures")
    phen_top = None
    top_gene_to_phenotypicFeature = _direct_gene_connections(
        gene_list, 'PhenotypicFeature')
    if top_gene_to_phenotypicFeature is not None:
        HP_dict = _lookup_ontology_names(
            top_gene_to_phenotypicFeature["output_name"], "HP:",
            'https://biothings.ncats.io/hpo/phenotype/HP%3A')
        phen_indices = get_similar_phen_indices(
            list(top_gene_to_phenotypicFeature["output_name"]), symptom_list,
            0.95, HP_dict)
        # Work on an explicit copy so the rename below reliably lands in
        # phen_top; assigning through ``phen_top.iloc[i]["output_name"]``
        # writes to a temporary row object.
        phen_top = top_gene_to_phenotypicFeature.iloc[phen_indices, :].copy()
        if not phen_top.empty:
            phen_top["output_name"] = phen_top["output_name"].map(
                lambda name: HP_dict.get(name, name))

    # gene -> bioprocess
    print("Genes -> Bioprocesses")
    bioprocess_top = None
    top_gene_to_bioprocesses = _direct_gene_connections(
        gene_list, 'BiologicalProcess')
    if top_gene_to_bioprocesses is not None:
        go_dict = _lookup_ontology_names(
            top_gene_to_bioprocesses["output_name"], "go:",
            'https://biothings.ncats.io/go_bp/geneset/GO%3A')
        bp_indices = get_similar_bp_indices(
            list(top_gene_to_bioprocesses["output_name"]), symptom_list, 0.95,
            go_dict)
        bioprocess_top = top_gene_to_bioprocesses.iloc[bp_indices, :]

    # Genes -> disease type "symptoms"
    print("Genes -> Diseases")
    relevant_top_gene_to_diseases = None
    top_gene_to_diseases = _direct_gene_connections(gene_list, 'Disease')
    if top_gene_to_diseases is not None:
        disease_indices = get_similar_disease_indices(
            list(top_gene_to_diseases["output_name"]), symptom_list, 0.95)
        relevant_top_gene_to_diseases = top_gene_to_diseases.iloc[
            disease_indices, :]

    # Make one dataframe with all genes -> symptoms.
    frames = [
        frame for frame in (bioprocess_top, phen_top,
                            relevant_top_gene_to_diseases)
        if frame is not None
    ]
    if not frames:
        return pd.DataFrame(columns=["output_name"])
    all_gene_connections = pd.concat(frames)
    all_gene_connections["output_name"] = all_gene_connections[
        "output_name"].str.lower()
    return all_gene_connections


def _direct_gene_connections(gene_list, output_type):
    """Return the stacked direct gene -> output_type tables, or None if empty."""
    frames = []
    for symbol in gene_list:
        try:
            gene = ht.query(symbol)["Gene"][0]
            fc = FindConnection(input_obj=gene,
                                output_obj=output_type,
                                intermediate_nodes=None)
            fc.connect(verbose=False)
            df = fc.display_table_view()
            if df.shape[0] > 0:
                frames.append(df)
        except Exception:
            # Best effort per gene; KeyboardInterrupt/SystemExit propagate.
            print(str(symbol) + " FAILED")
    return pd.concat(frames) if frames else None


def _lookup_ontology_names(output_names, id_marker, url_prefix):
    """Map ontology ids found in output_names to lower-cased names via HTTP."""
    marked = output_names[output_names.str.contains(id_marker, regex=False)]
    names_by_id = {}
    # dict.fromkeys de-duplicates while keeping first-seen order.
    for curie in dict.fromkeys(marked):
        local_id = curie.split(':')[1]
        res = requests.get(url_prefix + local_id).json()
        # Skip responses lacking either field rather than raising KeyError.
        if '_id' in res and 'name' in res:
            names_by_id[res['_id']] = res['name'].lower()
    return names_by_id
Пример #15
0
def get_symtpom_prevalence(hp_symptom_dict, disease_name):
    """Count outgoing Gene/Disease edges for every symptom entry.

    For each key of ``hp_symptom_dict`` the function sums the rows of
    direct connections to ``'Gene'`` and ``'Disease'``, excluding rows
    whose ``output_name`` equals ``disease_name``.  Connections are
    searched from:

    * the first ``'PhenotypicFeature'`` hit for the key itself, and
    * every ``'Disease'`` and ``'BiologicalProcess'`` hit whose name
      equals, ignoring case, one of the entry's ``"names"``.  Disease hits
      are looked up by the UMLS id of the phenotype hit when it has one,
      otherwise by the name.

    Failing queries are reported with ``"Nope"`` on stdout and skipped.

    Args:
        hp_symptom_dict: dict whose values are dicts containing a
            ``"names"`` iterable; modified in place.
        disease_name: Output name to exclude from the counts.

    Returns:
        The same ``hp_symptom_dict`` object, with ``"edges_out_count"`` set
        on every entry.
    """
    # (output type, label printed before the table shape or None, message
    # printed when the table is non-empty) for the phenotype step.
    phenotype_steps = (
        ('Gene', 'gene', "OKKKk"),
        ('Disease', None, "ok edge phen to dis"),
    )
    for key in hp_symptom_dict:
        print(key)
        edges_out_count = 0
        UMLS = ''

        # Step 1: edges from the phenotypic feature matching the key itself.
        a = ht.query(key)['PhenotypicFeature']
        if len(a) > 0:
            b = a[0]
            if 'UMLS' in b:
                UMLS = b['UMLS']
            for output_type, label, hit_message in phenotype_steps:
                try:
                    df = _direct_connection_table(b, output_type)
                    if label is not None:
                        print(label)
                    print(df.shape)
                    if df.shape[0] > 0:
                        print(hit_message)
                        df = df[df["output_name"] != disease_name]
                        edges_out_count = edges_out_count + df.shape[0]
                        print(edges_out_count)
                except Exception:
                    print("Nope")

        # Step 2: edges from disease / bioprocess nodes named like the symptom.
        for y in ('Disease', 'BiologicalProcess'):
            for z in hp_symptom_dict[key]["names"]:
                # Diseases are resolved through the UMLS id when available.
                term = UMLS if (y == 'Disease' and len(UMLS) > 0) else z
                try:
                    a = ht.query(term)[y]
                except Exception:
                    a = []
                for b in a:
                    if b['name'].lower() != z.lower():
                        continue
                    for output_type in ('Gene', 'Disease'):
                        try:
                            df = _direct_connection_table(b, output_type)
                            if df.shape[0] > 0:
                                df = df[df["output_name"] != disease_name]
                                edges_out_count = (edges_out_count +
                                                   df.shape[0])
                        except Exception:
                            print("Nope")

        print("edges out")
        print(edges_out_count)
        hp_symptom_dict[key]["edges_out_count"] = edges_out_count
    return hp_symptom_dict


def _direct_connection_table(node, output_type):
    """Return the table view of the direct node -> output_type query."""
    fc = FindConnection(input_obj=node,
                        output_obj=output_type,
                        intermediate_nodes=None)
    fc.connect(verbose=False)
    return fc.display_table_view()