def get_summary(symbol):
    """Fetch the NCBI Gene "Summary" text for a human gene symbol.

    The symbol is resolved to an Entrez gene id through MyGeneInfo (first
    hit of a ``symbol:<symbol>`` query restricted to human). The NCBI gene
    page for that id is then downloaded and the text found between the
    ``<dt>Summary</dt>`` and ``<dt>Orthologs</dt>`` markers is returned with
    its HTML tags removed.

    Args:
        symbol: gene symbol to look up.

    Returns:
        A ``(entrez_id, text)`` tuple. When the Entrez id cannot be
        resolved the tuple is ``("Not found", <message>)``; when either
        marker is missing from the page, ``text`` is a "No entries found"
        message naming the missing marker.
    """
    version = ('Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) '
               'Gecko/20071127 Firefox/2.0.0.11')
    mg = MyGeneInfo()
    try:
        hits = mg.query('symbol:%s' % symbol, species='human')['hits']
        entrez_id = hits[0]['entrezgene']
    except Exception as e:
        logging.info("Error with query: " + str(e))
        return "Not found", "No entries found. (Entrez ID not found)"

    # NOTE(review): the user-agent string is handed to get() as its second
    # positional argument; if get is requests.get that slot is `params`,
    # not `headers` -- confirm this is intended.
    page = get('http://www.ncbi.nlm.nih.gov/gene/' + str(entrez_id), version).text

    start_marker = '<dt>Summary</dt>'
    start = page.find(start_marker)
    if start == -1:
        return entrez_id, "No entries found. (match_start = -1)"

    # Everything after the "Summary" heading, up to the "Orthologs" heading.
    tail = page[start + len(start_marker):]
    end = tail.find('<dt>Orthologs</dt>')
    if end == -1:
        return entrez_id, "No entries found. (match_end = -1)"

    # takes out the HTML tags
    return entrez_id, re.sub('<[^<]+?>', '', tail[:end])
def __init__(self, taxon):
    """Initialise the similarity module for `taxon` and load its associations.

    Sets up the MyGeneInfo client, records the taxon, fixes the ontology to
    'go', fills in the module metadata and finally calls
    ``self.load_associations(taxon)``.

    Args:
        taxon: taxon identifier forwarded to ``load_associations``.
    """
    GenericSimilarity.__init__(self)
    self.mg = MyGeneInfo()
    self.input_object = ''
    self.taxon = taxon
    self.ont = 'go'

    # Input and output descriptors have the same content but are kept as
    # two separate dict objects.
    self.meta = dict(
        input_type=dict(complexity='set', id_type='HGNC', data_type='gene'),
        output_type=dict(complexity='set', id_type='HGNC', data_type='gene'),
        source='Monarch Biolink',
        predicate=[
            'blm:macromolecular machine to biological process association',
            'macromolecular machine to molecular activity association',
        ],
    )

    # Load the functional catalog of
    # GO ontology and annotation associations
    self.load_associations(taxon)
def load_gene_set(self, input_gene_set):
    """Annotate every input gene with the CURIE used for similarity search.

    MGI identifiers are rewritten locally (``MGI`` -> ``MGI:MGI``). HGNC
    identifiers are resolved to a ``UniProtKB:<Swiss-Prot>`` CURIE through
    a MyGeneInfo query restricted to ``self.taxon``. Genes whose id
    contains neither prefix are kept with empty ``input_id`` and
    ``sim_input_curie``.

    Args:
        input_gene_set: object whose ``get_input_curie_set()`` yields dicts
            with at least the keys ``'hit_id'`` and ``'hit_symbol'``.

    Returns:
        A list of dicts with keys ``'input_id'``, ``'sim_input_curie'`` and
        ``'input_symbol'``, one per input gene, in input order.
    """
    annotated_gene_set = []
    # One client for the whole call, created only when an HGNC id actually
    # has to be resolved (MGI-only inputs never need it).
    mg = None
    for gene in input_gene_set.get_input_curie_set():
        hit_id = gene['hit_id']
        gene_curie = ''
        sim_input_curie = ''
        if 'MGI' in hit_id:
            gene_curie = hit_id
            sim_input_curie = hit_id.replace('MGI', 'MGI:MGI')
        if 'HGNC' in hit_id:
            if mg is None:
                mg = MyGeneInfo()
            mg_hit = mg.query(hit_id.replace('HGNC', 'hgnc'),
                              scopes='HGNC',
                              species=self.taxon,
                              fields='uniprot, symbol, HGNC',
                              entrezonly=True)
            gene_curie = hit_id
            try:
                sim_input_curie = 'UniProtKB:{}'.format(
                    mg_hit['hits'][0]['uniprot']['Swiss-Prot'])
            except (LookupError, TypeError) as e:
                # No hit, or the hit carries no Swiss-Prot accession: keep
                # the gene but leave sim_input_curie as it was.
                print(__name__ + ".load_gene_set() Exception: ", gene, e)
        annotated_gene_set.append({
            'input_id': gene_curie,
            'sim_input_curie': sim_input_curie,
            'input_symbol': gene['hit_symbol']
        })
    return annotated_gene_set
def load_gene_set(self):
    """Append one annotated entry per input gene to ``self.gene_set``.

    Genes are read from ``self.input_object['input']``. MGI identifiers are
    rewritten locally (``MGI`` -> ``MGI:MGI``). HGNC identifiers are
    resolved to a ``UniProtKB:<Swiss-Prot>`` CURIE through a MyGeneInfo
    query restricted to ``self.input_object['parameters']['taxon']``.
    Each appended dict has the keys ``'input_id'``, ``'sim_input_curie'``
    and ``'input_symbol'``.
    """
    # One client for the whole call, created only when an HGNC id actually
    # has to be resolved (MGI-only inputs never need it).
    mg = None
    for gene in self.input_object['input']:
        hit_id = gene['hit_id']
        gene_curie = ''
        sim_input_curie = ''
        if 'MGI' in hit_id:
            gene_curie = hit_id
            sim_input_curie = hit_id.replace('MGI', 'MGI:MGI')
        if 'HGNC' in hit_id:
            if mg is None:
                mg = MyGeneInfo()
            mg_hit = mg.query(
                hit_id.replace('HGNC', 'hgnc'),
                scopes='HGNC',
                species=self.input_object['parameters']['taxon'],
                fields='uniprot, symbol, HGNC',
                entrezonly=True)
            gene_curie = hit_id
            try:
                sim_input_curie = 'UniProtKB:{}'.format(
                    mg_hit['hits'][0]['uniprot']['Swiss-Prot'])
            except (LookupError, TypeError) as e:
                # No hit, or the hit carries no Swiss-Prot accession: keep
                # the gene but leave sim_input_curie as it was.
                print(gene, e)
        self.gene_set.append({
            'input_id': gene_curie,
            'sim_input_curie': sim_input_curie,
            'input_symbol': gene['hit_symbol']
        })
def get_pert_agent(noble_coder, pert_text, title):
    """
    Extract perturbation agent

    The perturbation text is tried first; the title is only used when the
    text yields nothing. A found agent is cleaned (special characters
    replaced by spaces, "Superfamily" removed) and, when MyGeneInfo returns
    at least one hit for it, replaced by the first hit's symbol.

    Args:
        noble_coder: the execution path of Noble Coder
        pert_text: the perturbation text
        title: the title of the GSE

    Returns:
        the perturbation agent, or None when neither text yields one
    """
    agent = run_noble_coder(pert_text, noble_coder)
    if agent is None:
        agent = run_noble_coder(title, noble_coder)
    if agent is None:
        return None

    # Normalise the raw agent string before using it as a gene query.
    for char in SPECIAL_CHARS:
        agent = agent.replace(char, " ")
    agent = agent.replace("Superfamily", "")

    hits = MyGeneInfo().query(agent)["hits"]
    return hits[0]["symbol"] if hits else agent
def __init__(self, associations: AssociationSet = None):
    """Initialise the client, an empty gene set and the module metadata.

    Args:
        associations: accepted for interface compatibility; this
            constructor does not read it.
    """
    GenericSimilarity.__init__(self)
    self.mg = MyGeneInfo()
    self.gene_set = []
    self.input_object = ''
    self.ont = 'go'
    self.group = ''

    # Input and output descriptors have the same content but are kept as
    # two separate dict objects.
    self.meta = dict(
        input_type=dict(complexity='set', id_type='HGNC', data_type='gene'),
        output_type=dict(complexity='set', id_type='HGNC', data_type='gene'),
        source='Monarch Biolink',
        predicate=[
            'blm:macromolecular machine to biological process association',
            'macromolecular machine to molecular activity association',
        ],
    )
def ensemble_to_symbol(ens):
    """Map gene ids to symbols and to ``symbol|query`` labels.

    The ids are looked up with ``MyGeneInfo.getgenes`` (symbol field only,
    returned as a DataFrame); rows with a duplicated ``query`` value are
    dropped, keeping the first.

    Args:
        ens: gene ids passed to ``getgenes`` as ``geneids``.

    Returns:
        A ``(gene_symbol, gene_id)`` pair of arrays: the symbols, and the
        symbols joined with the query id by ``'|'`` (missing values are
        rendered as ``'?'``).
    """
    frame = MyGeneInfo().getgenes(geneids=ens,
                                  fields='symbol',
                                  as_dataframe=True,
                                  df_index=False)
    frame = frame.drop_duplicates('query').reset_index()

    symbols = frame['symbol'].values
    labels = frame['symbol'].str.cat([frame['query']], sep='|', na_rep='?').values
    return symbols, labels
def query(self):
    """Run ``self.q`` against the service at ``self.base_url``.

    Two requests are made with the same fields and ``entrezonly`` setting:
    a plain query to read the ``'total'`` count, and a ``fetch_all=True``
    query whose result is returned as the cursor.

    Returns:
        A ``(cursor, total)`` tuple.
    """
    client = MyGeneInfo(url=self.base_url)
    shared = dict(fields=self.fields, entrezonly=self.entrezonly)

    # get the total
    total = client.query(self.q, **shared)['total']
    # get the cursor
    cursor = client.query(self.q, fetch_all=True, **shared)
    return cursor, total
def _batch_query(self, ids):
    """
    Query mygene.info for many Entrez gene IDs, one batch at a time.

    The ids are split into groups of ``self.BATCH_SIZE``. For every group
    a dict ``{queried-id: hit, ...}`` is yielded that contains only the
    hits that were found and whose ``taxid`` equals ``self.TAXID``.
    The client is created on first use and stored as ``self.mg``.
    """
    if not hasattr(self, 'mg'):
        self.mg = MyGeneInfo()

    for chunk in grouped(ids, self.BATCH_SIZE):
        hits = self.mg.querymany(chunk,
                                 scopes='entrezgene',
                                 fields='all',
                                 verbose=self.VERBOSE)
        yield {
            hit['query']: hit
            for hit in hits
            if 'notfound' not in hit and hit['taxid'] == self.TAXID
        }
def get_uid(name):
    """Look up the Swiss-Prot accession for a human gene symbol or alias.

    Every hit returned by MyGeneInfo is recorded as ``symbol -> accession``.
    A hit without a Swiss-Prot accession is recorded with the placeholder
    ``'unable to retrieve'``; a hit without a symbol is skipped. Symbol and
    accession are always taken from the same hit, so one incomplete hit
    cannot shift the accessions of the hits that follow it.

    Args:
        name: gene symbol or alias to query.

    Returns:
        The accession recorded for ``name`` when one of the hits has
        exactly that symbol; otherwise the whole ``{symbol: accession}``
        dict (possibly empty).
    """
    mg = MyGeneInfo()
    res = mg.query(name, scopes='symbol, alias',
                   fields='uniprot, symbol', species='human')

    uid_by_symbol = {}
    for hit in res['hits']:
        symbol = hit.get('symbol')
        if symbol is None:
            continue
        try:
            uid_by_symbol[symbol] = hit['uniprot']['Swiss-Prot']
        except (KeyError, TypeError):
            uid_by_symbol[symbol] = 'unable to retrieve'

    return uid_by_symbol.get(name, uid_by_symbol)
def get_mg_cursor(self, taxid, filter_f=None):
    """Get a cursor over the mygene docs matching ``self.q`` for one taxid.

    Args:
        taxid: taxonomy id; converted to ``str`` and sent as ``species``.
        filter_f: optional predicate (returns True or False for each doc)
            used to filter the gene cursor.

    Returns:
        A ``(cursor, total)`` tuple. ``total`` is the count reported by the
        service for the unfiltered query; when ``filter_f`` is given the
        cursor is wrapped in ``filter(filter_f, ...)``.
    """
    client = MyGeneInfo(url=self.base_url)
    species = str(taxid)

    # get the total
    total = client.query(self.q, fields=self.fields, species=species,
                         entrezonly=self.entrezonly)['total']
    # get the cursor
    cursor = client.query(self.q, fields=self.fields, species=species,
                          fetch_all=True, entrezonly=self.entrezonly)
    if filter_f:
        cursor = filter(filter_f, cursor)
    return cursor, total
def convert_gene_api(query):
    """Resolve a gene query to a human Entrez gene id through MyGeneInfo.

    The hits are scanned in the order the service returns them and the
    first one that is human (``taxid == 9606``) and carries an
    ``entrezgene`` field wins. Later hits can no longer overwrite an
    earlier match.

    Args:
        query: the term sent to ``MyGeneInfo.query``.

    Returns:
        A one-entry dict ``{query: entrez_id}``; the value is
        ``float('nan')`` when the query fails or no human hit has an
        Entrez id.
    """
    mg = MyGeneInfo()
    out_format = 'entrezgene'
    try:
        res = mg.query(query)
    except Exception:
        # Best effort: a failed lookup is reported as "not found" (NaN).
        # Unlike a bare `except:`, this lets KeyboardInterrupt/SystemExit
        # propagate.
        res = {'hits': []}

    out = float('nan')
    for hit in res['hits']:
        if hit.get('taxid') == 9606 and out_format in hit:
            out = hit[out_format]
            break
    return {query: out}
def pathway_enrichment(gene_names, pipe_section=1, dbs=None, total_genes=20531,
                       p_cutoff=0.05, cache_path='../data/cache/'):
    """Test which pathways are over-represented among the given genes.

    Pathway annotations are fetched from MyGeneInfo for every gene, grouped
    per pathway, and each pathway is scored with the hypergeometric
    survival function ``hypergeom.sf(sup - 1, total_genes, size, n_genes)``.

    Args:
        gene_names: iterable of ``'|'``-separated gene labels.
        pipe_section: index of the ``'|'``-separated part of each label
            that is used as the gene id for the query.
        dbs: optional collection of pathway database keys to keep; all
            databases are kept when None.
        total_genes: population size used in the hypergeometric test.
        p_cutoff: pathways with a p-value above this are dropped.
        cache_path: directory holding the MyGeneInfo cache file; created
            when missing.

    Returns:
        A DataFrame sorted by ``p_value`` with columns ``id``, ``name``,
        ``db``, ``genes`` (list of query ids), ``sup`` (number of input
        genes in the pathway), ``size`` (total reported by the service for
        the pathway), ``p_value`` and ``ratio`` (``sup / size``); or None
        when the query result has no ``pathway`` column.
    """
    # The cache directory has to exist before the client is pointed at a
    # cache file inside it.
    if not os.path.exists(cache_path):
        os.makedirs(cache_path)
    mg = MyGeneInfo()
    mg.set_caching(cache_db=os.path.join(cache_path, 'mygene_cache'), verbose=False)

    gene_ids = [g.split('|')[pipe_section] for g in gene_names]
    gene_info = mg.getgenes(geneids=gene_ids, fields='pathway',
                            as_dataframe=True, df_index=False)
    try:
        pathways = gene_info['pathway']
    except Exception as e:
        print(e)
        print('No pathways found with the selected genes:')
        print(gene_names)
        return None

    # One row per (pathway, gene) pair.
    rows = []
    for idx, p in pathways.items():
        if p is np.nan or p != p:
            # NaN marks a gene without pathway annotation.
            continue
        gene = str(gene_info['query'][idx])
        for key, entries in dict(p).items():
            if dbs is not None and key not in dbs:
                continue
            # A database entry is either one pathway dict or a list of them.
            if not isinstance(entries, list):
                entries = [entries]
            for entry in entries:
                rows.append([entry['id'], entry['name'], key, gene])

    p_df = pd.DataFrame(rows, columns=['id', 'name', 'db', 'genes'])
    # Group on the index and reset it afterwards, so the result has exactly
    # these four columns.
    p_df = p_df.groupby(['id', 'name', 'db'])['genes'].apply(list).reset_index()
    p_df.columns = ['id', 'name', 'db', 'genes']

    pathway_size = []
    for idx, p_row in p_df.iterrows():
        if idx % 50 == 0:
            print('querying {}/{}'.format(idx, p_df.shape[0]))
        p_size = mg.query('pathway.{}.id:{}'.format(p_row['db'], p_row['id']),
                          size=0, verbose=False)['total']
        pathway_size.append(p_size)

    p_df['sup'] = [len(genes) for genes in p_df['genes']]
    p_df['size'] = pathway_size

    nb_selected_genes = len(gene_names)
    p_df['p_value'] = [
        hypergeom.sf(p_row['sup'] - 1, total_genes, p_row['size'], nb_selected_genes)
        for _, p_row in p_df.iterrows()
    ]

    # Copy the filtered rows so the new column is written to an independent
    # frame rather than to a slice of the unfiltered one.
    p_df = p_df[p_df['p_value'] <= p_cutoff].copy()
    p_df['ratio'] = p_df['sup'] / p_df['size']
    p_df = p_df.sort_values(by=['p_value']).reset_index(drop=True)
    return p_df
def download_targets_for_diseases(data_dir: str):
    """Download the target list of every configured disease into `data_dir`.

    For each ``(EFO id, abbreviation)`` pair taken from ``disease_efo_ids``
    and ``DISEASE_ABBREVIATIONS`` the targets are written to
    ``<data_dir>/<abbreviation>/<opentargets_file_name>``. Diseases whose
    file already exists are skipped. One MyGeneInfo client and one
    OpenTargets client are shared by all downloads.

    Args:
        data_dir: root directory under which the per-disease files live.

    Raises:
        Whatever ``download_targets_for_disease`` or the file operations
        raise; the partially written file is removed first, so a later run
        retries that disease instead of skipping it.
    """
    my_gene_info = MyGeneInfo()
    open_targets_client = OpenTargetsClient()

    for disease_efo_id, disease_abbreviation in zip(disease_efo_ids, DISEASE_ABBREVIATIONS):
        path = os.path.join(data_dir, disease_abbreviation, opentargets_file_name)
        if os.path.exists(path):
            continue

        # The per-disease directory may not exist yet.
        os.makedirs(os.path.dirname(path), exist_ok=True)
        try:
            with open(path, 'w+') as file:
                download_targets_for_disease(
                    disease_efo_id=disease_efo_id,
                    my_gene_info=my_gene_info,
                    open_targets_client=open_targets_client,
                    file=file,
                )
        except BaseException:
            # An existing file means "already downloaded" on the next run,
            # so an incomplete one must not be left behind.
            if os.path.exists(path):
                os.remove(path)
            raise
def __init__(self):
    """Create the BioLink and MyGeneInfo clients and the module metadata."""
    self.blw = BioLinkWrapper()
    self.mg = MyGeneInfo()
    self.input_object = ''

    # Describes what the module takes (a single disease id) and what it
    # produces (a set of HGNC gene ids).
    self.meta = dict(
        data_type='disease',
        input_type=dict(
            complexity='single',
            id_type=['MONDO', 'DO', 'OMIM'],
        ),
        output_type=dict(
            complexity='set',
            id_type='HGNC',
        ),
        taxon='human',
        limit=None,
        source='Monarch Biolink',
        predicate='blm:gene associated with condition',
    )
class MygeneAnnotator(WebAnnotatorWithCache):
    """
    Provides gene annotations given one or more gene entrez IDs.
    """
    SOURCE_NAME = 'mygene'
    ANNOTATIONS_ARE_JSON = True
    VERBOSE = False
    # Human ID, used to avoid annotating with genes from another species.
    # It can be replaced at Runtime to annotate with other species.
    TAXID = 9606
    BATCH_SIZE = 1000

    def _batch_query(self, ids):
        """
        Query mygene.info for many Entrez gene IDs, one batch at a time.

        The ids are split into groups of ``BATCH_SIZE``. For every group a
        dict ``{queried-id: hit, ...}`` is yielded that contains only the
        hits that were found and whose ``taxid`` equals ``TAXID``.
        The client is created on first use and stored as ``self.mg``.
        """
        if not hasattr(self, 'mg'):
            self.mg = MyGeneInfo()

        for chunk in grouped(ids, self.BATCH_SIZE):
            hits = self.mg.querymany(chunk,
                                     scopes='entrezgene',
                                     fields='all',
                                     verbose=self.VERBOSE)
            yield {
                hit['query']: hit
                for hit in hits
                if 'notfound' not in hit and hit['taxid'] == self.TAXID
            }

    @staticmethod
    def _parse_annotation(raw_annotation):
        """
        Return a shallow copy of the annotation with the ``'uniprot'``
        entry removed; its ``'Swiss-Prot'`` value, when truthy, is kept
        under the key ``'swissprot'``.
        """
        annotation = dict(raw_annotation)
        if 'uniprot' in annotation:
            swissprot_id = annotation.pop('uniprot').get('Swiss-Prot')
            if swissprot_id:
                annotation['swissprot'] = swissprot_id
        return annotation
def download_targets_for_disease(
    disease_efo_id: str,
    open_targets_client: Optional[OpenTargetsClient] = None,
    my_gene_info: Optional[MyGeneInfo] = None,
    file: Optional[TextIO] = None,
) -> None:
    """Write the Entrez gene ids of a disease's known-drug targets.

    The disease's associations are fetched from Open Targets and filtered
    to the ``known_drug`` datatype; the target ids are mapped to Entrez
    gene ids with MyGeneInfo. A header line ``efo<TAB>ncbigene`` is printed
    first, then one ``<disease_efo_id><TAB><entrez id>`` line per mapping
    that has an ``entrezgene`` value.

    :param disease_efo_id: A disease's EFO identifier
    :param open_targets_client: An OpenTargetsClient (created when None)
    :param my_gene_info: A MyGeneInfo client (created when None)
    :param file: Place to output targets for disease; ``print``'s default
        output is used when None
    """
    client = open_targets_client if open_targets_client is not None else OpenTargetsClient()
    known_drug_associations = client.get_associations_for_disease(
        disease_efo_id,
        fields=['associationscore.datatypes', 'target.id'],
    ).filter(datatype='known_drug')
    ensembl_list = [item['target']['id'] for item in known_drug_associations]

    gene_client = my_gene_info if my_gene_info is not None else MyGeneInfo()
    id_mappings = gene_client.getgenes(ensembl_list, fields="entrezgene")

    print('efo', 'ncbigene', file=file, sep='\t')
    for mapping in id_mappings:
        entrez_gene_id = mapping.get('entrezgene')
        if entrez_gene_id is None:
            # No Entrez id known for this target: nothing to report.
            continue
        print(disease_efo_id, entrez_gene_id, file=file, sep='\t')
group_by, desc, arrange, slice_head, tibble, left_join, mutate, is_na, across, if_else, filter, pull, select, ) mygene = MyGeneInfo() class QueryGenesNotFound(Exception): """When genes cannot be found""" def gene_name_conversion( genes, species, infmt, outfmt, notfound, ): """Convert gene names using MyGeneInfo
class FunctionalSimilarity(GenericSimilarity):
    """Gene similarity based on shared GO annotations.

    The input object (genes plus ``parameters`` such as ``taxon`` and
    ``threshold``) is loaded first; genes are then annotated with the
    CURIEs used for the similarity search and compared with
    ``compute_jaccard``.
    """

    def __init__(self, associations: AssociationSet = None):
        """Initialise the client, an empty gene set and the module metadata.

        `associations` is accepted but not read by this constructor.
        """
        GenericSimilarity.__init__(self)
        self.mg = MyGeneInfo()
        self.gene_set = []
        self.input_object = ''
        self.ont = 'go'
        self.group = ''
        self.meta = {
            'input_type': {
                'complexity': 'set',
                'id_type': 'HGNC',
                'data_type': 'gene',
            },
            'output_type': {
                'complexity': 'set',
                'id_type': 'HGNC',
                'data_type': 'gene',
            },
            'source': 'Monarch Biolink',
            'predicate': [
                'blm:macromolecular machine to biological process association',
                'macromolecular machine to molecular activity association'
            ]
        }

    def metadata(self):
        """Print the module metadata."""
        print("""Mod1A Functional Similarity metadata:""")
        pprint(self.meta)

    def load_input_object(self, input_object):
        """Store the input object; set ``self.group`` for mouse or human."""
        self.input_object = input_object
        taxon = self.input_object['parameters']['taxon']
        if taxon in ('mouse', 'human'):
            self.group = taxon

    def load_associations(self):
        """Retrieve the associations for the configured ontology and group."""
        self.retrieve_associations(ont=self.ont, group=self.group)

    def load_gene_set(self):
        """Append one annotated entry per input gene to ``self.gene_set``.

        MGI identifiers are rewritten locally (``MGI`` -> ``MGI:MGI``).
        HGNC identifiers are resolved to a ``UniProtKB:<Swiss-Prot>`` CURIE
        with the client created in ``__init__``. Each appended dict has the
        keys ``'input_id'``, ``'sim_input_curie'`` and ``'input_symbol'``.
        """
        for gene in self.input_object['input']:
            hit_id = gene['hit_id']
            gene_curie = ''
            sim_input_curie = ''
            if 'MGI' in hit_id:
                gene_curie = hit_id
                sim_input_curie = hit_id.replace('MGI', 'MGI:MGI')
            if 'HGNC' in hit_id:
                # Reuse the instance-level client instead of building a new
                # one for every gene.
                mg_hit = self.mg.query(
                    hit_id.replace('HGNC', 'hgnc'),
                    scopes='HGNC',
                    species=self.input_object['parameters']['taxon'],
                    fields='uniprot, symbol, HGNC',
                    entrezonly=True)
                gene_curie = hit_id
                try:
                    sim_input_curie = 'UniProtKB:{}'.format(
                        mg_hit['hits'][0]['uniprot']['Swiss-Prot'])
                except (LookupError, TypeError) as e:
                    # No hit, or no Swiss-Prot accession on the hit.
                    print(gene, e)
            self.gene_set.append({
                'input_id': gene_curie,
                'sim_input_curie': sim_input_curie,
                'input_symbol': gene['hit_symbol']
            })

    def compute_similarity(self):
        """Run the Jaccard comparison and translate result ids to HGNC.

        Returns:
            The results of ``compute_jaccard`` on ``self.gene_set`` with
            the threshold from the input parameters. For human, each
            ``hit_id`` is replaced by the HGNC id of its symbol. An
            ``input_id`` is replaced by the HGNC id of the input symbol
            whenever it differs from the ``sim_input_curie`` of at least
            one loaded gene.
        """
        group = self.input_object['parameters']['taxon']
        lower_bound = float(self.input_object['parameters']['threshold'])
        results = self.compute_jaccard(self.gene_set, lower_bound)
        for result in results:
            if group == 'human':
                result['hit_id'] = self.symbol2hgnc(result['hit_symbol'])
            # NOTE(review): the input-id rewrite runs for every taxon, not
            # only human -- confirm this matches the intended nesting.
            # One lookup per result is enough: repeating it for every
            # mismatching gene would store the same value again.
            if any(gene['sim_input_curie'] != result['input_id']
                   for gene in self.gene_set):
                result['input_id'] = self.symbol2hgnc(result['input_symbol'])
        return results

    def symbol2hgnc(self, symbol):
        """Return ``'HGNC:<id>'`` for a human gene symbol.

        Returns None unless the query yields exactly one hit and that hit
        carries an ``HGNC`` field.
        """
        mg_hit = self.mg.query('symbol:{}'.format(symbol),
                               fields='HGNC,symbol,taxon',
                               species='human',
                               entrezonly=True)
        if mg_hit['total'] != 1:
            return None
        hgnc_id = mg_hit['hits'][0].get('HGNC')
        if hgnc_id is None:
            return None
        return 'HGNC:{}'.format(hgnc_id)
def get_mg_gene(self, entrezgene):
    """Fetch a single gene document from the service at ``self.base_url``.

    Args:
        entrezgene: gene id passed to ``MyGeneInfo.getgene``.

    Returns:
        A ``(doc, 1)`` tuple: the ``getgene`` result limited to
        ``self.fields``, and the constant count 1.
    """
    client = MyGeneInfo(url=self.base_url)
    return client.getgene(entrezgene, fields=self.fields), 1
class FunctionalSimilarity(GenericSimilarity):
    """Gene similarity based on shared GO annotations for one taxon.

    The associations for the taxon are loaded at construction time; gene
    sets are annotated with ``load_gene_set`` and compared with
    ``compute_similarity``.
    """

    def __init__(self, taxon):
        """Record the taxon, build the metadata and load the associations."""
        GenericSimilarity.__init__(self)
        self.mg = MyGeneInfo()
        self.input_object = ''
        self.taxon = taxon
        self.ont = 'go'
        self.meta = {
            'input_type': {
                'complexity': 'set',
                'id_type': 'HGNC',
                'data_type': 'gene',
            },
            'output_type': {
                'complexity': 'set',
                'id_type': 'HGNC',
                'data_type': 'gene',
            },
            'source': 'Monarch Biolink',
            'predicate': [
                'blm:macromolecular machine to biological process association',
                'macromolecular machine to molecular activity association'
            ]
        }

        # Load the functional catalog of
        # GO ontology and annotation associations
        self.load_associations(taxon)

    def metadata(self):
        """Print the module metadata."""
        print("""Mod1A Functional Similarity metadata:""")
        pprint(self.meta)

    def load_gene_set(self, input_gene_set):
        """Annotate every input gene with the CURIE used for the search.

        MGI identifiers are rewritten locally (``MGI`` -> ``MGI:MGI``).
        HGNC identifiers are resolved to a ``UniProtKB:<Swiss-Prot>`` CURIE
        with the client created in ``__init__``.

        Args:
            input_gene_set: object whose ``get_input_curie_set()`` yields
                dicts with at least ``'hit_id'`` and ``'hit_symbol'``.

        Returns:
            A list of dicts with keys ``'input_id'``, ``'sim_input_curie'``
            and ``'input_symbol'``, one per input gene, in input order.
        """
        annotated_gene_set = []
        for gene in input_gene_set.get_input_curie_set():
            hit_id = gene['hit_id']
            gene_curie = ''
            sim_input_curie = ''
            if 'MGI' in hit_id:
                gene_curie = hit_id
                sim_input_curie = hit_id.replace('MGI', 'MGI:MGI')
            if 'HGNC' in hit_id:
                # Reuse the instance-level client instead of building a new
                # one for every gene.
                mg_hit = self.mg.query(hit_id.replace('HGNC', 'hgnc'),
                                       scopes='HGNC',
                                       species=self.taxon,
                                       fields='uniprot, symbol, HGNC',
                                       entrezonly=True)
                gene_curie = hit_id
                try:
                    sim_input_curie = 'UniProtKB:{}'.format(
                        mg_hit['hits'][0]['uniprot']['Swiss-Prot'])
                except (LookupError, TypeError) as e:
                    # No hit, or no Swiss-Prot accession on the hit.
                    print(__name__ + ".load_gene_set() Exception: ", gene, e)
            annotated_gene_set.append({
                'input_id': gene_curie,
                'sim_input_curie': sim_input_curie,
                'input_symbol': gene['hit_symbol']
            })
        return annotated_gene_set

    def compute_similarity(self, annotated_gene_set, threshold):
        """Run the Jaccard comparison and translate result ids to HGNC.

        Args:
            annotated_gene_set: output of ``load_gene_set``.
            threshold: lower bound for the similarity, converted to float.

        Returns:
            The results of ``compute_jaccard``. For human, each ``hit_id``
            is replaced by the HGNC id of its symbol. An ``input_id`` is
            replaced by the HGNC id of the input symbol whenever it differs
            from the ``sim_input_curie`` of at least one annotated gene.
        """
        lower_bound = float(threshold)
        results = self.compute_jaccard(annotated_gene_set, lower_bound)
        for result in results:
            if self.taxon == 'human':
                result['hit_id'] = self.symbol2hgnc(result['hit_symbol'])
            # NOTE(review): the input-id rewrite runs for every taxon, not
            # only human -- confirm this matches the intended nesting.
            # One lookup per result is enough: repeating it for every
            # mismatching gene would store the same value again.
            if any(gene['sim_input_curie'] != result['input_id']
                   for gene in annotated_gene_set):
                result['input_id'] = self.symbol2hgnc(result['input_symbol'])
        return results

    def symbol2hgnc(self, symbol):
        """Return ``'HGNC:<id>'`` for a human gene symbol.

        Returns None unless the query yields exactly one hit and that hit
        carries an ``HGNC`` field.
        """
        mg_hit = self.mg.query('symbol:{}'.format(symbol),
                               fields='HGNC,symbol,taxon',
                               species='human',
                               entrezonly=True)
        if mg_hit['total'] != 1:
            return None
        hgnc_id = mg_hit['hits'][0].get('HGNC')
        if hgnc_id is None:
            return None
        return 'HGNC:{}'.format(hgnc_id)
from mygene import MyGeneInfo
from pprint import pprint
import csv
import sys

# Module-level MyGeneInfo client used by gene_name().
mg = MyGeneInfo()
dict_symbol = {}


def gene_name(ensg_id):
    """Return the gene symbol for `ensg_id`, or `ensg_id` itself as fallback.

    When the lookup returns a dict its 'symbol' entry is returned. When it
    returns a list, the id and the list are printed and the id is returned
    unchanged. Any other result (e.g. None) also returns the id unchanged.
    """
    gene = mg.getgene(ensg_id,fields='symbol')
    if gene != None and type(gene) is dict:
        return gene['symbol']
    elif type(gene) is list:
        # Unexpected multi-record answer: show it and keep the original id.
        print(ensg_id)
        pprint(gene)
        return ensg_id
    else:
        return ensg_id

''' print(gene_name('ENSG00000273842')) '''

# Read the tab-separated sample file and look up a name for every row.
with open('sample.txt','r') as f:
    rows = csv.reader(f, delimiter='\t')
    with open('gene_table.csv','w') as f_write:
        f_csv = csv.writer(f_write)
        i = 0
        for row in rows:
            i += 1
            # Only the first 15 characters of the first column are used as
            # the id (presumably to drop a version suffix -- TODO confirm).
            name = gene_name(row[0][:15])