Пример #1
0
def get_connection_normalizing_count(gene_list, node_type_list):
    """Count direct connections from each gene to a set of node types.

    Each symbol in ``gene_list`` is resolved through ``ht.query`` by a
    case-insensitive comparison against the ``SYMBOL`` field of the returned
    ``'Gene'`` hits.  For a resolved gene, one direct ``FindConnection`` query
    (``intermediate_nodes=None``) is run per entry of ``node_type_list`` and
    the row counts of the resulting table views are summed.

    Args:
        gene_list: iterable of gene symbol strings.
        node_type_list: iterable of output node types passed to
            ``FindConnection`` as ``output_obj``.

    Returns:
        dict mapping every gene symbol to its summed row count, or to the
        string ``'Unknown'`` when no hit matched the symbol.  A query that
        fails for one node type is reported on stdout and contributes 0.
    """
    # Tracks all connections from a gene to any of the node types.
    connection_dict = {}
    for gene_symbol in gene_list:
        wanted = gene_symbol.lower()
        gene = None
        for candidate in ht.query(gene_symbol)['Gene']:
            if candidate['SYMBOL'].lower() == wanted:
                # No early exit: when several hits match, the last one wins.
                gene = candidate

        if gene is None:
            print(gene_symbol + ' could not be found')
            connection_dict[gene_symbol] = 'Unknown'
            continue

        count = 0
        for node_type in node_type_list:
            try:
                # Only look at direct connections.
                fc = FindConnection(input_obj=gene,
                                    output_obj=node_type,
                                    intermediate_nodes=None)
                fc.connect(verbose=False)
                count += fc.display_table_view().shape[0]
            except Exception:
                # Best effort: one failing node type must not abort the
                # tally, but Ctrl-C / SystemExit are no longer swallowed.
                print("gene " + str(gene_symbol) +
                      " for node intermediate " + str(node_type) + " failed")
        connection_dict[gene_symbol] = count
    return connection_dict
Пример #2
0
def get_disease_to_gene_results(disease_input):
    """Summarise the genes directly connected to a disease.

    Runs a direct ``FindConnection`` query (no intermediate nodes) from
    ``disease_input`` to ``'Gene'``, drops rows whose ``output_id`` contains
    ``'UMLS'``, and derives three summaries from the remaining rows.

    Args:
        disease_input: object passed to ``FindConnection`` as ``input_obj``.

    Returns:
        dict with the keys:

        * ``"sorted_disease_to_genes"`` - gene name -> number of rows it
          appears in, ordered by ascending count.
        * ``"one_step_genes_pub_counts"`` - gene name -> summed publication
          count, where each non-None ``pred1_pubmed`` value contributes
          ``value.count(",") + 1``.
        * ``"disease_to_genes_list"`` - gene names ordered by descending
          count (the reverse of the first mapping's key order).
    """
    from collections import Counter

    # Directly related genes only.
    fc = FindConnection(input_obj=disease_input,
                        output_obj='Gene',
                        intermediate_nodes=None)
    fc.connect(verbose=False)
    disease_to_genes = fc.display_table_view()

    is_umls = disease_to_genes['output_id'].str.contains('UMLS')
    disease_to_genes = disease_to_genes[~is_umls]

    # Keep track of number of occurrences from direct disease -> gene
    # connection.  Counter keeps first-seen order, so the stable sort below
    # breaks ties in order of first appearance.
    print("running disease -> gene")
    occurrences = Counter(list(disease_to_genes["output_name"]))
    sorted_disease_to_genes = dict(
        sorted(occurrences.items(), key=lambda item: item[1]))

    one_step_genes_pub_counts = {}
    for _, row in disease_to_genes.iterrows():
        pubmed = row["pred1_pubmed"]
        # presumably a comma-separated string of PubMed ids - confirm
        pubcount = pubmed.count(",") + 1 if pubmed is not None else 0
        name = row["output_name"]
        one_step_genes_pub_counts[name] = (
            one_step_genes_pub_counts.get(name, 0) + pubcount)

    return {
        "sorted_disease_to_genes": sorted_disease_to_genes,
        "one_step_genes_pub_counts": one_step_genes_pub_counts,
        "disease_to_genes_list": list(reversed(sorted_disease_to_genes)),
    }
Пример #3
0
def predict_many(input_object, intermediate_node_list, output_type):
    """Run one single-intermediate FindConnection query per node type.

    For every entry of ``intermediate_node_list`` a query from
    ``input_object`` to ``output_type`` is run with that entry as the only
    intermediate node.  Non-empty table views are collected.

    Args:
        input_object: object passed to ``FindConnection`` as ``input_obj``.
        intermediate_node_list: iterable of intermediate node types.
        output_type: value passed to ``FindConnection`` as ``output_obj``.

    Returns:
        The collected tables concatenated with ``pd.concat``, or ``None``
        when no query produced any rows.
    """
    df_list = []
    for inter in intermediate_node_list:
        try:
            print("Intermediate Node type running:")
            print(inter)
            fc = FindConnection(input_obj=input_object,
                                output_obj=output_type,
                                intermediate_nodes=[inter])
            fc.connect(verbose=False)
            df = fc.display_table_view()
            if df.shape[0] > 0:
                df_list.append(df)
        except Exception:
            # Best effort per intermediate type; unlike a bare ``except`` this
            # lets KeyboardInterrupt / SystemExit propagate.
            print("FAILED")
    if df_list:
        return pd.concat(df_list)
    return None
Пример #4
0
 def get(self):
     """Handle a GET "connect" request and reply with JSON.

     Reads the query arguments ``input_obj`` and ``output_obj`` (decoded as
     JSON) and ``intermediate_nodes`` (parsed as a Python literal), runs a
     ``FindConnection`` query and writes the de-duplicated result rows.

     Responses:
         200 - ``{'data': [...], 'log': ...}`` when at least one row exists.
         400 - ``{'error': ...}`` when a query argument cannot be parsed.
         404 - ``{'error': ...}`` when no connection was found.
     """
     input_obj = self.get_query_argument('input_obj')
     output_obj = self.get_query_argument('output_obj')
     print("executing connect query: ", self.request.uri)
     intermediate_nodes = self.get_query_argument('intermediate_nodes')
     try:
         if isinstance(input_obj, str):
             input_obj = tornado.escape.json_decode(input_obj)
         if isinstance(output_obj, str):
             output_obj = tornado.escape.json_decode(output_obj)
         if isinstance(intermediate_nodes, str):
             # NOTE(review): literal_eval only accepts Python literals, but
             # this is still client-controlled input - very large or deeply
             # nested values can be expensive to parse.
             intermediate_nodes = ast.literal_eval(intermediate_nodes)
     except (ValueError, SyntaxError):
         # Malformed client input is a client error, not a server fault.
         self.set_status(400)
         self.write(json.dumps({'error': "Unable to parse query arguments"}))
         self.finish()
         return

     fc = FindConnection(input_obj=input_obj,
                         output_obj=output_obj,
                         intermediate_nodes=intermediate_nodes,
                         registry=reg)
     fc.connect()
     df = fc.display_table_view()
     if df.empty:
         res = []
     else:
         # Non-inplace drop_duplicates: the column selection is a derived
         # frame, and mutating it in place triggers SettingWithCopy issues.
         df = df[[
             'input', 'pred1', 'pred1_api', 'node1_name', 'node1_type',
             'pred2', 'pred2_api', 'output_name'
         ]].drop_duplicates()
         res = df.to_dict('records')

     if res:
         self.set_status(200)
         self.write(
             tornado.escape.json_encode({
                 'data': res,
                 'log': fc.fc.log
             }))
     else:
         self.set_status(404)
         self.write(json.dumps({'error': "Unable to find any connection"}))
     self.finish()
     return
Пример #5
0
# Select the correct representation of depression: the first 'Disease' hit.
depression = depression_hint["Disease"][0]
print(depression)
print()

# Signature reminder: help(FindConnection.__init__)
fc = FindConnection(input_obj=depression,
                    output_obj=tbi,
                    intermediate_nodes="BiologicalEntity")
# BTE finding connection
fc.connect(verbose=True)

print()
print("Displaying and filter results")
# Displaying and filter results
df = fc.display_table_view()
# Because UMLS is not currently well-integrated in our ID-to-object
# translation system, remove UMLS-only entries here.
# Raw string: "\d" in a normal string literal is an invalid escape sequence.
patternDel = r"^UMLS:C\d+"
# na=False keeps the mask strictly boolean when node1_id has missing values,
# so the negation below cannot fail; the name avoids shadowing builtin filter.
umls_only_mask = df.node1_id.str.contains(patternDel, na=False)
df = df[~umls_only_mask]

fc.to_graphml("TBI.graphml")
fc.to_reasoner_std()

print(df.shape)
# Cap the sample size: sample(10) raises when fewer than 10 rows remain.
df.sample(min(10, len(df)))

# Which intermediate nodes (node1_name) are mentioned the most
mentioned = df.node1_name.value_counts().head(10)
print(mentioned)
Пример #6
0
def _collect_direct_connections(gene_list, output_type):
    """Stack the non-empty direct gene -> ``output_type`` tables, or return None."""
    tables = []
    for symbol in gene_list:
        try:
            gene = ht.query(symbol)["Gene"][0]
            fc = FindConnection(input_obj=gene,
                                output_obj=output_type,
                                intermediate_nodes=None)
            fc.connect(verbose=False)
            df = fc.display_table_view()
            if df.shape[0] > 0:
                tables.append(df)
        except Exception:
            # Best effort per gene; Ctrl-C / SystemExit still propagate.
            print(str(symbol) + " FAILED")
    return pd.concat(tables) if tables else None


def _lookup_names(curies, url_prefix):
    """Map each record's ``_id`` to its lower-cased ``name`` via HTTP lookups."""
    names = {}
    # dict.fromkeys de-duplicates while keeping first-seen order.
    for curie in dict.fromkeys(curies):
        local_id = curie.split(':')[1]
        # A timeout keeps one stalled request from hanging the whole run.
        res = requests.get(url_prefix + local_id, timeout=30).json()
        # Both keys are required; records lacking either one are skipped.
        if '_id' in res and 'name' in res:
            names[res['_id']] = res['name'].lower()
    return names


def determined_genes_to_symptoms(gene_list, symptom_list):
    """Collect gene connections whose outputs resemble the given symptoms.

    For each of the output types PhenotypicFeature, BiologicalProcess and
    Disease, the direct connections of every gene in ``gene_list`` are
    gathered, then reduced to the rows selected by the corresponding
    ``get_similar_*_indices`` helper (called with threshold 0.95).  HP ids
    in the phenotype rows are replaced by their lower-cased names when the
    lookup service returned one.

    Args:
        gene_list: iterable of gene identifiers accepted by ``ht.query``.
        symptom_list: symptoms forwarded to the ``get_similar_*_indices``
            helpers.

    Returns:
        DataFrame concatenating the selected bioprocess, phenotype and
        disease rows (in that order) with ``output_name`` lower-cased.  A
        category that produced no rows is skipped; if none produced rows an
        empty DataFrame with an ``output_name`` column is returned.
    """
    # gene -> phenotypic feature nodes
    print("Genes -> PhenotypicFeatures")
    phen_top = None
    phen_all = _collect_direct_connections(gene_list, 'PhenotypicFeature')
    if phen_all is not None:
        # Get names for HP ids.
        has_hp = phen_all["output_name"].str.contains(
            "HP:", regex=False, na=False)
        HP_dict = _lookup_names(
            list(phen_all[has_hp]["output_name"]),
            'https://biothings.ncats.io/hpo/phenotype/HP%3A')

        phen_indices = get_similar_phen_indices(
            list(phen_all["output_name"]), symptom_list, 0.95, HP_dict)
        # Copy before editing: writing through ``iloc[i]["output_name"]`` is
        # chained assignment and does not reliably update the frame.
        phen_top = phen_all.iloc[phen_indices, :].copy()
        phen_top["output_name"] = phen_top["output_name"].map(
            lambda name: HP_dict.get(name, name))

    # gene -> bioprocess
    print("Genes -> Bioprocesses")
    bioprocess_top = None
    bp_all = _collect_direct_connections(gene_list, 'BiologicalProcess')
    if bp_all is not None:
        has_go = bp_all["output_name"].str.contains(
            "go:", regex=False, na=False)
        go_dict = _lookup_names(
            list(bp_all[has_go]["output_name"]),
            'https://biothings.ncats.io/go_bp/geneset/GO%3A')

        bp_indices = get_similar_bp_indices(
            list(bp_all["output_name"]), symptom_list, 0.95, go_dict)
        bioprocess_top = bp_all.iloc[bp_indices, :]

    # Genes -> disease type "symptoms"
    print("Genes -> Diseases")
    relevant_top_gene_to_diseases = None
    disease_all = _collect_direct_connections(gene_list, 'Disease')
    if disease_all is not None:
        disease_indices = get_similar_disease_indices(
            list(disease_all["output_name"]), symptom_list, 0.95)
        relevant_top_gene_to_diseases = disease_all.iloc[disease_indices, :]

    # Make one dataframe with all genes -> symptoms.
    frames = [
        frame
        for frame in (bioprocess_top, phen_top, relevant_top_gene_to_diseases)
        if frame is not None
    ]
    if not frames:
        return pd.DataFrame(columns=["output_name"])

    all_gene_connections = pd.concat(frames)
    all_gene_connections["output_name"] = all_gene_connections[
        "output_name"].str.lower()
    return all_gene_connections
Пример #7
0
def _direct_connections(node, output_type):
    """Run a direct (no intermediate nodes) FindConnection query; return its table view."""
    fc = FindConnection(input_obj=node,
                        output_obj=output_type,
                        intermediate_nodes=None)
    fc.connect(verbose=False)
    return fc.display_table_view()


def get_symtpom_prevalence(hp_symptom_dict, disease_name):
    """Annotate each symptom with the number of its outgoing edges.

    For every key of ``hp_symptom_dict`` the function counts rows of direct
    connections to ``'Gene'`` and ``'Disease'`` outputs, excluding rows whose
    ``output_name`` equals ``disease_name``.  Rows are counted from:

    * the first ``'PhenotypicFeature'`` hit of ``ht.query(key)``, and
    * every ``'Disease'`` / ``'BiologicalProcess'`` hit whose ``name``
      matches (case-insensitively) one of the entry's ``"names"``.  For
      ``'Disease'`` the lookup uses the UMLS id of the phenotype hit when
      one is available, otherwise the name itself.

    Args:
        hp_symptom_dict: mapping of symptom key -> dict containing at least
            a ``"names"`` iterable of strings.  It is modified in place.
        disease_name: output name to exclude from the counts.

    Returns:
        The same ``hp_symptom_dict`` object, with ``"edges_out_count"`` set
        on every entry.  Individual query failures print "Nope" and count
        as zero.
    """
    for key in hp_symptom_dict:
        print(key)
        edges_out_count = 0
        UMLS = ''

        # --- edges from the phenotypic-feature representation of the key ---
        phen_hits = ht.query(key)['PhenotypicFeature']
        if len(phen_hits) > 0:
            node = phen_hits[0]
            if 'UMLS' in node:
                UMLS = node['UMLS']
            # (output type, optional banner, message printed on a hit)
            for output_type, banner, hit_message in (
                    ('Gene', 'gene', "OKKKk"),
                    ('Disease', None, "ok edge phen to dis")):
                try:
                    df = _direct_connections(node, output_type)
                    if banner is not None:
                        print(banner)
                    print(df.shape)
                    if df.shape[0] > 0:
                        print(hit_message)
                        df = df[df["output_name"] != disease_name]
                        edges_out_count = edges_out_count + df.shape[0]
                        print(edges_out_count)
                except Exception:
                    # Best effort; Ctrl-C / SystemExit still propagate.
                    print("Nope")

        # --- edges from name-matched disease / bioprocess representations ---
        for y in ('Disease', 'BiologicalProcess'):
            for z in hp_symptom_dict[key]["names"]:
                # Prefer the UMLS id for disease lookups when we have one.
                if y == 'Disease' and len(UMLS) > 0:
                    query_term = UMLS
                else:
                    query_term = z
                try:
                    candidates = ht.query(query_term)[y]
                except Exception:
                    candidates = []

                for b in candidates:
                    if b['name'].lower() != z.lower():
                        continue
                    for output_type in ('Gene', 'Disease'):
                        try:
                            df = _direct_connections(b, output_type)
                            if df.shape[0] > 0:
                                df = df[df["output_name"] != disease_name]
                                edges_out_count = (edges_out_count +
                                                   df.shape[0])
                        except Exception:
                            print("Nope")

        print("edges out")
        print(edges_out_count)
        hp_symptom_dict[key]["edges_out_count"] = edges_out_count
    return hp_symptom_dict