def get_connection_normalizing_count(gene_list, node_type_list):
    """Count the direct connections from each gene to the given node types.

    For every symbol in ``gene_list`` the module-level ``ht`` hint service is
    queried and its ``'Gene'`` hits are scanned for one whose ``SYMBOL``
    equals the symbol, ignoring case.  When several hits match, the last one
    is used.  For the matched gene, one direct ``FindConnection`` query
    (``intermediate_nodes=None``) is run per entry of ``node_type_list`` and
    the row counts of the resulting tables are summed.

    Args:
        gene_list: iterable of gene symbol strings.
        node_type_list: iterable of values passed as ``output_obj``.

    Returns:
        dict mapping each gene symbol to its summed row count, or to the
        string ``'Unknown'`` when no hit matched the symbol.
    """
    # Keeps track of all connections from a gene to any node type.
    connection_dict = {}
    for gene_symbol in gene_list:
        wanted = gene_symbol.lower()
        gene = None
        for hit in ht.query(gene_symbol)['Gene']:
            # Hits without a SYMBOL field cannot match; skip them instead of
            # letting a KeyError abort the whole run.
            if 'SYMBOL' in hit and hit['SYMBOL'].lower() == wanted:
                gene = hit

        if gene is None:
            print(gene_symbol + ' could not be found')
            connection_dict[gene_symbol] = 'Unknown'
            continue

        count = 0
        for node_type in node_type_list:
            try:
                # Only look at direct connections.
                fc = FindConnection(input_obj=gene,
                                    output_obj=node_type,
                                    intermediate_nodes=None)
                fc.connect(verbose=False)
                count += fc.display_table_view().shape[0]
            except Exception:
                # Best effort: a failing node type contributes nothing, but
                # KeyboardInterrupt/SystemExit are no longer swallowed.
                print("gene " + str(gene_symbol) + " for node intermediate " +
                      str(node_type) + " failed")
        connection_dict[gene_symbol] = count
    return connection_dict
def setUp(self):
    """Run a direct CXCR4 -> ChemicalSubstance query and keep its export.

    The value returned by ``to_reasoner_std()`` is stored on
    ``self.response`` for the tests to inspect.
    """
    gene_hits = Hint().query('CXCR4')['Gene']
    finder = FindConnection(
        input_obj=gene_hits[0],
        output_obj='ChemicalSubstance',
        intermediate_nodes=None,
    )
    finder.connect(verbose=True)
    self.response = finder.to_reasoner_std()
def setUpClass(cls):
    """Run a direct CXCR4 -> ChemicalSubstance query once for the class.

    The value returned by ``to_reasoner_std()`` is stored on
    ``cls.response``.
    """
    gene_hits = Hint().query("CXCR4")["Gene"]
    finder = FindConnection(
        input_obj=gene_hits[0],
        output_obj="ChemicalSubstance",
        intermediate_nodes=None,
    )
    finder.connect(verbose=True)
    cls.response = finder.to_reasoner_std()
def test_one_filt(self):
    """Query with the filter list ``[{}, NodeDegree(count=25)]``.

    Expects the resulting graph to hold exactly 27 nodes.
    """
    hint = Hint()
    disease = hint.query('hyperphenylalaninemia')['Disease'][0]
    filters = [{}, {'name': 'NodeDegree', 'count': 25}]
    finder = FindConnection(
        input_obj=disease,
        output_obj='ChemicalSubstance',
        intermediate_nodes=['Gene'],
        filters=filters,
    )
    finder.connect()
    node_total = len(finder.fc.G.nodes)
    self.assertEqual(27, node_total)
def test_no_filter(self):
    """With an empty filter list no node may carry a ``filteredBy`` key."""
    hint = Hint()
    disease = hint.query('hyperphenylalaninemia')['Disease'][0]
    finder = FindConnection(
        input_obj=disease,
        output_obj='ChemicalSubstance',
        intermediate_nodes=['Gene'],
        filters=[],
    )
    finder.connect()
    for _node, attrs in finder.fc.G.nodes(data=True):
        self.assertTrue('filteredBy' not in attrs)
def test_result_section(self):
    """Connect TMPRSS2 to pentamidine via ``BiologicalEntity`` nodes.

    Expects ``PLXNA2`` to be present in the resulting graph.
    """
    gene = ht.query("TMPRSS2")["Gene"][0]
    drug = ht.query("pentamidine")["ChemicalSubstance"][0]
    finder = FindConnection(
        input_obj=gene,
        output_obj=drug,
        intermediate_nodes=["BiologicalEntity"],
        registry=None,
    )
    finder.connect(verbose=True)
    graph = finder.fc.G
    self.assertIn("PLXNA2", graph)
def test_query_with_broken_intermediate_nodes(self):
    """Run a query with a long chain of intermediate node types.

    In such a chain one intermediate query may return 0 hits, in which
    case execution should stop.  The test expects the resulting graph to
    hold more than two nodes.
    """
    intermediates = [
        "BiologicalProcess",
        "Cell",
        "AnatomicalEntity",
        "CellularComponent",
    ]
    disease = ht.query("Multiple Organ Failure")["Disease"][0]
    finder = FindConnection(
        disease,
        output_obj="Gene",
        intermediate_nodes=intermediates,
    )
    finder.connect()
    graph_size = len(finder.fc.G)
    self.assertGreater(graph_size, 2)
def get_disease_to_gene_results(disease_input):
    """Summarise the genes directly connected to a disease.

    Runs a direct ``FindConnection`` query (``intermediate_nodes=None``)
    from ``disease_input`` to ``'Gene'``, drops rows whose ``output_id``
    contains ``'UMLS'`` and aggregates the remaining rows per
    ``output_name``.

    Args:
        disease_input: value passed to ``FindConnection`` as ``input_obj``.

    Returns:
        dict with three entries:
          * ``"sorted_disease_to_genes"``: gene name -> number of rows,
            ordered by ascending count;
          * ``"one_step_genes_pub_counts"``: gene name -> number of
            comma-separated entries summed over ``pred1_pubmed``;
          * ``"disease_to_genes_list"``: gene names in the reverse of the
            sorted order (highest count first).
    """
    disease_to_gene_results = {}

    # Directly related genes only.
    fc = FindConnection(input_obj=disease_input,
                        output_obj='Gene',
                        intermediate_nodes=None)
    fc.connect(verbose=False)
    disease_to_genes = fc.display_table_view()
    disease_to_genes = disease_to_genes[
        ~disease_to_genes['output_id'].str.contains('UMLS')]

    # Keep track of the number of occurrences of each gene in the direct
    # disease -> gene connections.
    print("running disease -> gene")
    occurrences = {}
    for name in disease_to_genes["output_name"]:
        # Single pass instead of calling list.count() once per row.
        occurrences[name] = occurrences.get(name, 0) + 1
    sorted_disease_to_genes = dict(
        sorted(occurrences.items(), key=lambda item: item[1]))
    disease_to_gene_results[
        "sorted_disease_to_genes"] = sorted_disease_to_genes

    one_step_genes_pub_counts = {}
    for name, pubmed in zip(disease_to_genes["output_name"],
                            disease_to_genes["pred1_pubmed"]):
        # The cell is read as a comma-separated string; anything else
        # (None, or a NaN float) is counted as zero publications rather
        # than raising AttributeError on ``.count``.
        pubcount = pubmed.count(",") + 1 if isinstance(pubmed, str) else 0
        one_step_genes_pub_counts[name] = (
            one_step_genes_pub_counts.get(name, 0) + pubcount)
    disease_to_gene_results[
        "one_step_genes_pub_counts"] = one_step_genes_pub_counts

    disease_to_gene_results["disease_to_genes_list"] = list(
        sorted_disease_to_genes)[::-1]
    return disease_to_gene_results
def predict_many(input_object, intermediate_node_list, output_type):
    """Run one single-intermediate query per node type and stack the tables.

    For each entry of ``intermediate_node_list`` a ``FindConnection`` query
    from ``input_object`` to ``output_type`` is run with that entry as the
    only intermediate node.  Non-empty result tables are collected.

    Args:
        input_object: value passed as ``input_obj``.
        intermediate_node_list: iterable of intermediate node types.
        output_type: value passed as ``output_obj``.

    Returns:
        The collected tables concatenated with ``pd.concat``, or ``None``
        when no query produced any rows.
    """
    df_list = []
    for inter in intermediate_node_list:
        try:
            print("Intermediate Node type running:")
            print(inter)
            fc = FindConnection(input_obj=input_object,
                                output_obj=output_type,
                                intermediate_nodes=[inter])
            fc.connect(verbose=False)
            df = fc.display_table_view()
            if df.shape[0] > 0:
                df_list.append(df)
        except Exception:
            # Best effort: skip this intermediate type, but let
            # KeyboardInterrupt/SystemExit propagate.
            print("FAILED")
    if not df_list:
        return None
    return pd.concat(df_list)
def get(self):
    """Handle a connect query and respond with the matching table rows.

    Reads the ``input_obj``, ``output_obj`` and ``intermediate_nodes``
    query arguments.  The first two are JSON-decoded and the third is
    parsed with ``ast.literal_eval``.  A ``FindConnection`` query is then
    run against the module-level registry ``reg``.

    Responses:
        200: ``{"data": [...], "log": ...}`` with de-duplicated rows.
        400: ``{"error": ...}`` when a query argument cannot be parsed.
        404: ``{"error": ...}`` when no connection was found.
    """
    input_obj = self.get_query_argument('input_obj')
    output_obj = self.get_query_argument('output_obj')
    print("executing connect query: ", self.request.uri)
    intermediate_nodes = self.get_query_argument('intermediate_nodes')

    try:
        if isinstance(input_obj, str):
            input_obj = tornado.escape.json_decode(input_obj)
        if isinstance(output_obj, str):
            output_obj = tornado.escape.json_decode(output_obj)
        if isinstance(intermediate_nodes, str):
            # literal_eval only accepts Python literals, so it does not
            # execute arbitrary code from the query string.
            intermediate_nodes = ast.literal_eval(intermediate_nodes)
    except (ValueError, SyntaxError, TypeError) as err:
        # Malformed client input is a client error, not a server crash.
        self.set_status(400)
        self.write(
            json.dumps(
                {'error': "Unable to parse query arguments: " + str(err)}))
        self.finish()
        return

    fc = FindConnection(input_obj=input_obj,
                        output_obj=output_obj,
                        intermediate_nodes=intermediate_nodes,
                        registry=reg)
    fc.connect()
    df = fc.display_table_view()
    if df.empty:
        res = []
    else:
        columns = [
            'input', 'pred1', 'pred1_api', 'node1_name', 'node1_type',
            'pred2', 'pred2_api', 'output_name'
        ]
        # De-duplicate into a new frame; an in-place drop on a column
        # slice triggers pandas' SettingWithCopy handling.
        res = df[columns].drop_duplicates().to_dict('records')

    if res:
        self.set_status(200)
        self.write(
            tornado.escape.json_encode({
                'data': res,
                'log': fc.fc.log
            }))
    else:
        self.set_status(404)
        self.write(json.dumps({'error': "Unable to find any connection"}))
    self.finish()
def test_two_filt(self):
    """Query with a NodeDegree filter followed by an EdgeLabel filter.

    Expects 52 nodes in the graph.  Apart from the node named
    'mild hyperphenylalaninemia', every Gene node must be marked as
    filtered by NodeDegree and every ChemicalSubstance node by EdgeLabel.
    """
    hint = Hint()
    disease = hint.query('hyperphenylalaninemia')['Disease'][0]
    filters = [
        {'name': 'NodeDegree', 'count': 20},
        {'name': 'EdgeLabel', 'label': 'related_to'},
    ]
    finder = FindConnection(
        input_obj=disease,
        output_obj='ChemicalSubstance',
        intermediate_nodes=['Gene'],
        filters=filters,
    )
    finder.connect()
    self.assertEqual(52, len(finder.fc.G.nodes))

    # Node type -> name of the filter expected to have selected it.
    expected_filter = {'Gene': 'NodeDegree', 'ChemicalSubstance': 'EdgeLabel'}
    for node, attrs in finder.fc.G.nodes(data=True):
        if node == 'mild hyperphenylalaninemia':
            continue
        node_type = attrs['type']
        if node_type in expected_filter:
            self.assertEqual(expected_filter[node_type], attrs['filteredBy'])
# Example: connect the ACE2 gene to DiseaseOrPhenotypicFeature nodes through
# ChemicalSubstance and export the result as GraphML.
from biothings_explorer.user_query_dispatcher import FindConnection
from biothings_explorer.hint import Hint

ht = Hint()

# Use the first 'Gene' hit returned for "ACE2" as the query input.
ace2 = ht.query("ACE2")['Gene'][0]

fc = FindConnection(
    input_obj=ace2,
    output_obj='DiseaseOrPhenotypicFeature',
    intermediate_nodes='ChemicalSubstance',
)
fc.connect(verbose=True)

# Save the graph to test.graphml.
output = fc.to_graphml("test.graphml")
# The angiotensin-converting enzyme 2, or ACE2 "receptor", is the protein
# that provides the entry point for the coronavirus to hook into and infect
# a wide range of human cells.
from biothings_explorer.hint import Hint
from biothings_explorer.user_query_dispatcher import FindConnection

ht = Hint()

# Use the first 'Gene' hit returned for "ACE2" as the query input.
ace2 = ht.query("ACE2")['Gene'][0]

fc = FindConnection(
    input_obj=ace2,
    output_obj='ChemicalSubstance',
    intermediate_nodes=['BiologicalEntity'],
)

# NOTE: this call raised KeyError: "equivalent_ids", so the example was not
# taken any further.
fc.connect(verbose=True)
def _direct_gene_connections(gene_list, output_type):
    """Stack the direct gene -> ``output_type`` tables for ``gene_list``.

    Each symbol is resolved to the first ``'Gene'`` hit of the module-level
    ``ht`` service.  Returns ``None`` when no gene produced any rows.
    """
    frames = []
    for symbol in gene_list:
        try:
            gene = ht.query(symbol)["Gene"][0]
            fc = FindConnection(input_obj=gene,
                                output_obj=output_type,
                                intermediate_nodes=None)
            fc.connect(verbose=False)
            table = fc.display_table_view()
            if table.shape[0] > 0:
                frames.append(table)
        except Exception:
            # Best effort: one failing gene must not abort the others, but
            # KeyboardInterrupt/SystemExit are allowed to propagate.
            print(str(symbol) + " FAILED")
    return pd.concat(frames) if frames else None


def _fetch_term_names(term_ids, url_prefix):
    """Map ontology ids to lower-cased names, one GET per distinct id.

    The part of each id after the first ``':'`` is appended to
    ``url_prefix``.  Responses lacking ``_id`` or ``name`` are skipped.
    """
    names = {}
    for term_id in dict.fromkeys(term_ids):
        res = requests.get(url_prefix + term_id.split(':')[1]).json()
        if '_id' in res and 'name' in res:
            names[res['_id']] = res['name'].lower()
    return names


def determined_genes_to_symptoms(gene_list, symptom_list):
    """Collect gene connections whose output resembles a known symptom.

    Three direct queries are run for every gene: to ``PhenotypicFeature``,
    ``BiologicalProcess`` and ``Disease``.  Each combined table is reduced
    to the rows selected by the matching ``get_similar_*_indices`` helper
    (threshold 0.95) and the three reductions are concatenated.

    Args:
        gene_list: iterable of gene symbols.
        symptom_list: symptoms forwarded to the ``get_similar_*_indices``
            helpers.

    Returns:
        DataFrame of the selected rows with ``output_name`` lower-cased.
        HP ids in the phenotype rows are replaced by the names fetched for
        them.  When no category yields any rows, an empty DataFrame with an
        ``output_name`` column is returned.
    """
    # gene -> phenotypic feature nodes
    print("Genes -> PhenotypicFeatures")
    phen_top = None
    gene_to_phen = _direct_gene_connections(gene_list, 'PhenotypicFeature')
    if gene_to_phen is not None:
        phen_names = gene_to_phen["output_name"]
        # Get names for HP ids.
        hp_ids = phen_names[phen_names.str.contains("HP:", regex=False)]
        hp_dict = _fetch_term_names(
            hp_ids, 'https://biothings.ncats.io/hpo/phenotype/HP%3A')
        phen_indices = get_similar_phen_indices(list(phen_names),
                                                symptom_list, 0.95, hp_dict)
        # Work on a copy and assign the whole column: chained
        # ``.iloc[i]["output_name"] = ...`` is not guaranteed to write
        # back into the frame.
        phen_top = gene_to_phen.iloc[phen_indices, :].copy()
        phen_top["output_name"] = phen_top["output_name"].map(
            lambda name: hp_dict.get(name, name))

    # gene -> bioprocess
    print("Genes -> Bioprocesses")
    bioprocess_top = None
    gene_to_bp = _direct_gene_connections(gene_list, 'BiologicalProcess')
    if gene_to_bp is not None:
        bp_names = gene_to_bp["output_name"]
        go_ids = bp_names[bp_names.str.contains("go:", regex=False)]
        go_dict = _fetch_term_names(
            go_ids, 'https://biothings.ncats.io/go_bp/geneset/GO%3A')
        bp_indices = get_similar_bp_indices(list(bp_names), symptom_list,
                                            0.95, go_dict)
        bioprocess_top = gene_to_bp.iloc[bp_indices, :]

    # genes -> disease type "symptoms"
    print("Genes -> Diseases")
    relevant_top_gene_to_diseases = None
    gene_to_disease = _direct_gene_connections(gene_list, 'Disease')
    if gene_to_disease is not None:
        disease_indices = get_similar_disease_indices(
            list(gene_to_disease["output_name"]), symptom_list, 0.95)
        relevant_top_gene_to_diseases = gene_to_disease.iloc[
            disease_indices, :]

    # Make one dataframe with all genes -> symptoms.  Categories that
    # produced nothing are skipped rather than referenced while unbound.
    frames = [
        frame for frame in (bioprocess_top, phen_top,
                            relevant_top_gene_to_diseases)
        if frame is not None
    ]
    if not frames:
        return pd.DataFrame(columns=["output_name"])
    all_gene_connections = pd.concat(frames)
    all_gene_connections["output_name"] = all_gene_connections[
        "output_name"].str.lower()
    return all_gene_connections
def _direct_connection_table(input_obj, output_type):
    """Run one direct query and return its table view."""
    fc = FindConnection(input_obj=input_obj,
                        output_obj=output_type,
                        intermediate_nodes=None)
    fc.connect(verbose=False)
    return fc.display_table_view()


def get_symtpom_prevalence(hp_symptom_dict, disease_name):
    """Count the outgoing edges of every symptom, ignoring ``disease_name``.

    For each key of ``hp_symptom_dict``:

    1. The key is looked up as a ``PhenotypicFeature`` through the
       module-level ``ht`` service.  The first hit, if any, is queried for
       direct connections to ``Gene`` and to ``Disease``, and its ``UMLS``
       value is remembered when present.
    2. For ``Disease`` and then ``BiologicalProcess``, every entry of
       ``hp_symptom_dict[key]["names"]`` is looked up (by the remembered
       UMLS value for ``Disease`` when there is one, otherwise by the name
       itself).  Each hit whose ``name`` equals the entry, ignoring case,
       is queried for direct connections to ``Gene`` and to ``Disease``.

    Rows whose ``output_name`` equals ``disease_name`` are not counted.

    Args:
        hp_symptom_dict: dict whose values are dicts holding a ``"names"``
            iterable of strings.  It is modified in place.
        disease_name: output name excluded from the counts.

    Returns:
        The same ``hp_symptom_dict``, with ``"edges_out_count"`` set on
        every value.
    """
    for key in hp_symptom_dict:
        print(key)
        edges_out_count = 0
        UMLS = ''

        # --- the symptom itself, as a phenotypic feature -----------------
        phenotype_hits = ht.query(key)['PhenotypicFeature']
        if len(phenotype_hits) > 0:
            phenotype = phenotype_hits[0]
            if 'UMLS' in phenotype:
                UMLS = phenotype['UMLS']
            # (output type, banner printed before the shape, hit message)
            phenotype_queries = (
                ('Gene', 'gene', "OKKKk"),
                ('Disease', None, "ok edge phen to dis"),
            )
            for output_type, banner, hit_message in phenotype_queries:
                try:
                    df = _direct_connection_table(phenotype, output_type)
                    if banner is not None:
                        print(banner)
                    print(df.shape)
                    if df.shape[0] > 0:
                        print(hit_message)
                        df = df[df["output_name"] != disease_name]
                        edges_out_count = edges_out_count + df.shape[0]
                        print(edges_out_count)
                except Exception:
                    # Best effort: a failed query adds no edges.
                    print("Nope")

        # --- the symptom's names, as diseases and biological processes ---
        for node_type in ('Disease', 'BiologicalProcess'):
            for name in hp_symptom_dict[key]["names"]:
                if node_type == 'Disease' and len(UMLS) > 0:
                    query_term = UMLS
                else:
                    query_term = name
                try:
                    candidates = ht.query(query_term)[node_type]
                except Exception:
                    candidates = []

                for candidate in candidates:
                    if candidate['name'].lower() != name.lower():
                        continue
                    for output_type in ('Gene', 'Disease'):
                        try:
                            df = _direct_connection_table(
                                candidate, output_type)
                            if df.shape[0] > 0:
                                df = df[df["output_name"] != disease_name]
                                edges_out_count = (edges_out_count +
                                                   df.shape[0])
                        except Exception:
                            print("Nope")

        print("edges out")
        print(edges_out_count)
        hp_symptom_dict[key]["edges_out_count"] = edges_out_count
    return hp_symptom_dict