def dicty_mutant_gene_sets(tax_id: str):
    """ Write Dictybase mutant phenotype gene sets to GMT files.

    Only taxonomy id ``'44689'`` is handled; for any other value the
    function does nothing. One gene set is built per phenotype and the
    sets are then written out grouped by hierarchy.

    :param tax_id: taxonomy id of the organism, as a string
    :return: None
    """
    if tax_id != '44689':
        return

    matcher = GeneMatcher('44689')
    collected = []

    for phenotype, mutants in phenotypes.phenotype_mutants().items():
        # commas in the phenotype label are replaced with spaces
        label = phenotype.replace(",", " ")

        # first entry of each mutant's gene list is used as the symbol
        matcher.genes = [phenotypes.mutant_genes(entry)[0] for entry in mutants]

        # keep only the genes the matcher resolved to an id
        matched_ids = {
            str(hit.gene_id) for hit in matcher.genes
            if hit.gene_id is not None
        }

        collected.append(
            GeneSet(gs_id=label,
                    name=label,
                    genes=matched_ids,
                    hierarchy=('Dictybase', 'Phenotypes'),
                    organism=tax_id,
                    link=''))

    for gs_group in GeneSets(collected).split_by_hierarchy():
        hierarchy = gs_group.common_hierarchy()
        gs_group.to_gmt_file_format(
            f'{data_path}/gene_sets/{filename(hierarchy, tax_id)}')
def dicty_mutant_gene_sets(org):
    """ Build dicty mutant phenotype gene sets from Dictybase.

    :param org: taxonomy id of the organism, as a string
    :return: ``GeneSets`` holding one ``GeneSet`` per phenotype when
        ``org`` is ``'352472'``; ``None`` for any other organism
    """
    if org != '352472':
        return None

    matcher = GeneMatcher('352472')
    collected = []

    for phenotype, mutants in dicty.phenotypes.phenotype_mutants().items():
        symbols = [dicty.phenotypes.mutant_genes(entry)[0] for entry in mutants]
        matcher.genes = symbols
        matcher.run_matcher()

        # keep only genes that were resolved to an NCBI id
        ncbi_ids = [
            int(hit.ncbi_id) for hit in matcher.genes
            if hit.ncbi_id is not None
        ]

        # print both counts whenever some symbols were not resolved
        if len(symbols) != len(ncbi_ids):
            print(len(symbols), len(ncbi_ids))

        collected.append(
            GeneSet(gs_id=phenotype,
                    name=phenotype,
                    genes=ncbi_ids,
                    hierarchy=('Dictybase', 'Phenotypes'),
                    organism='352472',
                    link=''))

    return GeneSets(collected)
def cytoband_gene_sets(tax_id: str) -> None:
    """ Create cytoband gene sets from Stanford Microarray Database.

    Only taxonomy id ``'9606'`` is handled; for any other value the
    function does nothing. The GMT file is downloaded, every line is
    turned into a gene set and the sets are written out grouped by
    hierarchy.

    :param tax_id: taxonomy id of the organism, as a string
    """
    if tax_id != '9606':
        return

    download_link = 'http://statweb.stanford.edu/~tibs/GSA/cytobands-stanford.gmt'
    matcher = GeneMatcher('9606')

    with urlopen(download_link) as stream:
        raw_lines = stream.read().splitlines()

    collected = []
    for raw in raw_lines:
        # tab-separated: id, name, then the gene symbols
        fields = raw.decode().split('\t')
        symbols = fields[2:]
        matcher.genes = symbols

        matched_ids = {
            hit.gene_id for hit in matcher.genes
            if hit.gene_id is not None
        }

        collected.append(
            GeneSet(gs_id=fields[0],
                    name=fields[1],
                    genes=matched_ids if symbols else set(),
                    hierarchy=('Cytobands', ),
                    organism='9606',
                    link=''))

    for gs_group in GeneSets(collected).split_by_hierarchy():
        hierarchy = gs_group.common_hierarchy()
        gs_group.to_gmt_file_format(
            f'{data_path}/gene_sets/{filename(hierarchy, tax_id)}')
def cytoband_gene_sets(org):
    """ Create cytoband gene sets from Stanford Microarray Database.

    :param org: taxonomy id of the organism, as a string
    :return: ``GeneSets`` with one ``GeneSet`` per downloaded line when
        ``org`` is ``'9606'``; ``None`` for any other organism
    """
    if org != '9606':
        return None

    matcher = GeneMatcher('9606')

    with urlopen(CYTOBAND_DOWNLOAD_LINK) as stream:
        raw_lines = stream.read().splitlines()

    collected = []
    for raw in raw_lines:
        # tab-separated: id, name, then the gene symbols
        fields = raw.decode().split('\t')
        symbols = fields[2:]
        matcher.genes = symbols
        matcher.run_matcher()

        # keep only genes that were resolved to an NCBI id
        ncbi_ids = [
            int(hit.ncbi_id) for hit in matcher.genes
            if hit.ncbi_id is not None
        ]

        collected.append(
            GeneSet(gs_id=fields[0],
                    name=fields[1],
                    genes=ncbi_ids if symbols else [],
                    hierarchy=('Cytobands', ),
                    organism='9606',
                    link=''))

    return GeneSets(collected)
def reactome_gene_sets(org):
    """ Prepare human pathways gene sets from reactome pathways.

    The Reactome GMT archive is downloaded into memory, the GMT member
    is read, and every line becomes one gene set whose id and name are
    both taken from the first tab-separated field.

    :param org: taxonomy id of the organism, as a string
    :return: ``GeneSets`` with one ``GeneSet`` per line of the GMT file
        when ``org`` is ``'9606'``; ``None`` for any other organism
    """
    if org != '9606':
        return None

    gene_matcher = GeneMatcher('9606')

    with urlopen(REACTOME_DOWNLOAD_LINK) as url:
        memfile = io.BytesIO(url.read())

    # Open the archive member in a ``with`` as well, so its handle is
    # closed explicitly instead of being left to the garbage collector.
    with ZipFile(memfile, 'r') as myzip:
        with myzip.open(REACTOME_FILE_NAME) as f:
            content = f.read().decode().splitlines()

    genesets = []
    for path in content:
        # split once per line; fields from index 2 on are gene symbols
        fields = path.split('\t')
        gene_matcher.genes = fields[2:]
        gene_matcher.run_matcher()

        # keep only genes that were resolved to an NCBI id
        genes = [
            int(gene.ncbi_id) for gene in gene_matcher.genes
            if gene.ncbi_id is not None
        ]

        genesets.append(
            GeneSet(gs_id=fields[0],
                    name=fields[0],
                    genes=genes,
                    hierarchy=('Reactome', 'Pathways'),
                    organism='9606',
                    link=''))

    return GeneSets(genesets)
def omim_gene_sets(org):
    """ Build gene sets from OMIM (Online Mendelian Inheritance in Man)
    diseases.

    :param org: taxonomy id of the organism, as a string
    :return: ``GeneSets`` with one ``GeneSet`` per OMIM disease when
        ``org`` is ``'9606'``; ``None`` for any other organism
    """
    if org != '9606':
        return None

    matcher = GeneMatcher('9606')
    collected = []

    for disease in omim.diseases():
        matcher.genes = omim.disease_genes(disease)
        matcher.run_matcher()

        # keep only genes that were resolved to an NCBI id
        ncbi_ids = [
            int(hit.ncbi_id) for hit in matcher.genes
            if hit.ncbi_id is not None
        ]

        # a link is produced only for diseases that carry an id
        if disease.id:
            disease_link = OMIM_LINK.format(disease.id)
        else:
            disease_link = None

        collected.append(
            GeneSet(gs_id=disease.id,
                    name=disease.name,
                    genes=ncbi_ids,
                    hierarchy=('OMIM', ),
                    organism='9606',
                    link=disease_link))

    return GeneSets(collected)
def test_synonym_multiple_matches(self):
    # 'HB1' is looked up for organism 9606
    matcher = GeneMatcher('9606')
    matcher.genes = ['HB1']
    matched = matcher.genes[0]

    # the input identifier is kept as given
    self.assertEqual(matched.input_identifier, 'HB1')

    # Gene matcher should not find any unique match
    self.assertEqual(matched.gene_id, None)
def test_symbol_match_scenario(self):
    # 'SCN5A' is expected to resolve to gene id '6331' with an unchanged symbol
    matcher = GeneMatcher('9606')
    matcher.genes = ['SCN5A']
    matched = matcher.genes[0]

    self.assertEqual(matched.input_identifier, 'SCN5A')
    self.assertEqual(matched.symbol, 'SCN5A')
    self.assertEqual(matched.gene_id, '6331')
def test_different_input_identifier_types(self):
    # four differently formatted identifiers are given for organism 9606
    matcher = GeneMatcher('9606')
    matcher.genes = ['CD4', '614535', 'HB-1Y', 'ENSG00000205426']

    # every matched gene must have all of these fields filled in
    required_fields = ('description', 'tax_id', 'species', 'gene_id')
    for matched in matcher.genes:
        for field in required_fields:
            self.assertIsNotNone(getattr(matched, field))
def find_homologs(self, genes: List[Union[str, Gene]]) -> List[Optional[Gene]]:
    """ Look up homologs of ``genes`` in the target taxonomy.

    The input is matched against ``self.source_tax``; for every matched
    gene ``homolog_gene`` is queried for ``self.target_tax`` and the
    collected results are passed through ``load_gene_summary``.

    :param genes: genes of the source organism
    :return: whatever ``load_gene_summary`` returns for the collected
        homologs, in input order
    """
    matcher = GeneMatcher(self.source_tax)
    matcher.genes = genes

    candidates = []
    for matched in matcher.genes:
        candidates.append(matched.homolog_gene(taxonomy_id=self.target_tax))

    return load_gene_summary(self.target_tax, candidates)
def test_homologs(self):
    # gene '920' of organism 9606 is used as the fixture
    matcher = GeneMatcher('9606')
    matcher.genes = ['920']
    matched = matcher.genes[0]

    # homolog information is present and covers taxonomy 10090
    self.assertIsNotNone(matched.homologs)
    self.assertTrue(len(matched.homologs))
    self.assertIn('10090', matched.homologs)

    # expected homology group and the homolog for taxonomy 10090
    self.assertEqual(matched.homology_group_id, '513')
    self.assertEqual(matched.homolog_gene('10090'), '12504')

    # an unknown taxonomy yields no homolog
    self.assertIsNone(matched.homolog_gene('Unknown_taxonomy'))
def matchDDBids(genesDDB):
    """ Match gene identifiers against organism 44689.

    :param genesDDB: gene identifiers handed to the gene matcher
    :return: dict keyed by the input identifier; each value is a
        ``(symbol, gene_id, description)`` tuple with every element
        passed through ``parseNoneStr``
    """
    matcher = GeneMatcher(44689)
    matcher.genes = genesDDB

    return {
        matched.input_identifier: (
            parseNoneStr(matched.symbol),
            parseNoneStr(matched.gene_id),
            parseNoneStr(matched.description),
        )
        for matched in matcher.genes
    }
def reactome_gene_sets(tax_id: str) -> None:
    """ Prepare human pathways gene sets from reactome pathways.

    Only taxonomy id ``'9606'`` is handled; for any other value the
    function does nothing. The Reactome GMT archive is downloaded into
    memory, every line of the GMT member becomes one gene set, and the
    sets are written out grouped by hierarchy.

    :param tax_id: taxonomy id of the organism, as a string
    """
    if tax_id != '9606':
        return

    download_link = 'http://www.reactome.org/download/current/ReactomePathways.gmt.zip'
    file_name = 'ReactomePathways.gmt'
    detail_link = 'https://reactome.org/content/detail/{}'

    gene_matcher = GeneMatcher('9606')

    with urlopen(download_link) as url:
        memfile = io.BytesIO(url.read())

    # Open the archive member in a ``with`` as well, so its handle is
    # closed explicitly instead of being left to the garbage collector.
    with ZipFile(memfile, 'r') as myzip:
        with myzip.open(file_name) as f:
            content = f.read().decode().splitlines()

    genesets = []
    for path in content:
        # split once per line: name, id, then the gene symbols
        fields = path.split('\t')
        gene_matcher.genes = fields[2:]

        # keep only the genes the matcher resolved to an id
        genes = {
            str(gene.gene_id) for gene in gene_matcher.genes
            if gene.gene_id is not None
        }

        # commas in name and id are replaced with spaces
        pathway = fields[0].replace(',', ' ')
        pathway_id = fields[1].replace(',', ' ')

        genesets.append(
            GeneSet(gs_id=pathway_id,
                    name=pathway,
                    genes=genes,
                    hierarchy=('Reactome', 'pathways'),
                    organism='9606',
                    link=detail_link.format(pathway_id)))

    for gs_group in GeneSets(genesets).split_by_hierarchy():
        hierarchy = gs_group.common_hierarchy()
        gs_group.to_gmt_file_format(
            f'{data_path}/gene_sets/{filename(hierarchy, tax_id)}')
def name_genes_entrez(gene_names: list, key_entrez: bool, organism: int = ORGANISM) -> dict:
    """
    Pair each gene name with its Entrez ID.

    :param gene_names: Gene names (eg. from dictyBase)
    :param key_entrez: True: Entrez IDs as keys and names as values, False: vice versa
    :param organism: organism ID
    :return: Dict of gene names and matching Entrez IDs; genes without an Entrez ID are left out
    """
    matcher = GeneMatcher(organism)
    matcher.genes = gene_names

    mapping = {}
    for matched in matcher.genes:
        entrez_id = matched.gene_id
        if entrez_id is None:
            # no Entrez ID for this input name
            continue

        input_name = matched.input_identifier
        if key_entrez:
            mapping[entrez_id] = input_name
        else:
            mapping[input_name] = entrez_id

    return mapping
def send_to_output(self, result):
    """ Convert a finished download into a table and send it to the output.

    :param result: pair ``(etc_json, table_name)``
    """
    self.progress_bar.finish()
    self.setStatusMessage('')

    etc_json, table_name = result
    genes_as_attributes = bool(self.gene_as_attr_name)
    tax_id = str(self.orgnism)

    # convert to a named table
    data = etc_to_table(etc_json, genes_as_attributes)
    data.name = table_name

    # match genes
    gene_matcher = GeneMatcher(tax_id)

    if genes_as_attributes:
        gene_matcher.match_table_attributes(data)
        data.attributes[GENE_ID_ATTRIBUTE] = NCBI_ID
    else:
        if 'Gene' in data.domain:
            gene_column = data.domain['Gene']
            gene_matcher.genes = data.get_column_view(gene_column)[0]
            gene_matcher.run_matcher()

            # one-column meta table with the NCBI id ('?' when missing)
            id_rows = []
            for matched in gene_matcher.genes:
                id_rows.append([str(matched.ncbi_id) if matched.ncbi_id else '?'])

            id_table = Table(Domain([], metas=[StringVariable(NCBI_ID)]), id_rows)
            data = Table.concatenate([data, id_table])

        data.attributes[GENE_ID_COLUMN] = NCBI_ID

    # add table attributes
    data.attributes[TAX_ID] = tax_id
    data.attributes[GENE_AS_ATTRIBUTE_NAME] = genes_as_attributes

    # reset cache indicators
    self.set_cached_indicator()

    # send data to the output signal
    self.Outputs.etc_data.send(data)
from orangecontrib.bioinformatics.ncbi.gene import GeneMatcher

# Count of dictyBase genes and genes with EID (involved in Orange gene sets)
dicty_annotations = 0
dicty_genes = set()
orange_annotations = 0
orange_genes = set()
empty_sets = 0

gene_matcher = GeneMatcher('44689')

for phenotype, mutants in phenotypes.phenotype_mutants().items():
    # unique gene symbols annotated with this phenotype
    gene_symbols = {phenotypes.mutant_genes(mutant)[0] for mutant in mutants}
    N_genes_set_dicty = len(gene_symbols)
    dicty_annotations += N_genes_set_dicty
    dicty_genes |= gene_symbols

    # count how many of them the matcher resolves to a gene id
    gene_matcher.genes = gene_symbols
    N_genes_set_Orange = 0
    for gene in gene_matcher.genes:
        if gene.gene_id is None:
            continue
        orange_genes.add(gene.gene_id)
        N_genes_set_Orange += 1
    orange_annotations += N_genes_set_Orange

    # a non-empty dictyBase set with no resolved gene counts as empty
    if N_genes_set_dicty > 0 and N_genes_set_Orange < 1:
        empty_sets += 1

print('N genes with phenotype annotations in dictyBase:', len(dicty_genes),
      'and in Orange Dictybase Phenotypes:', len(orange_genes))
print(
    'N of genes across gene sets (with genes being involved in multiple gene sets): dictyBase',
    dicty_annotations, ', Orange', orange_annotations)
Gene()).homology_group_id homologs = [ gene.gene_id for gene in self._homologs_by_group.get(homology_group, []) if gene.tax_id == organism ] if len(homologs) == 1: return homologs[0] else: # Is possible that find more then one gene? return None if __name__ == "__main__": from orangecontrib.bioinformatics.ncbi.gene import GeneMatcher, load_gene_summary import Orange homology = HomoloGene() gm = GeneMatcher('4932') genes = Orange.data.Table("brown-selected") gm.genes = genes _homologs = [ homology.find_homolog(str(gene.gene_id), '9606') for gene in gm.genes ] _homologs = load_gene_summary('9606', _homologs) for gene, homolog in zip(gm.genes, _homologs): print(f'{gene} ----> {homolog}')
from orangecontrib.bioinformatics.ncbi.gene import GeneMatcher, GENE_INFO_TAGS

# input: organism id and the gene symbols to look up
organism = 9606
genes_symbols_to_match = ['HB1', 'BCKDHB', 'TWIST1']

# create the gene matcher, hand it the symbols and run the matching process
gene_matcher = GeneMatcher(organism)
gene_matcher.genes = genes_symbols_to_match
gene_matcher.run_matcher()

# print the result for every input symbol
for gene in gene_matcher.genes:
    print("\ninput name: " + gene.input_name,
          "\nid from ncbi: ", gene.ncbi_id,
          "\nmatch type: ", gene.type_of_match)

    # candidates are listed only for genes without an id that have possible hits
    if gene.ncbi_id is not None or not gene.possible_hits:
        continue
    print('possible_hits: ', [hit.ncbi_id for hit in gene.possible_hits])
def Update(self):
    """
    Update (recompute enriched pathways) the widget state.

    Gene names are extracted from ``self.data`` (and from
    ``self.refData`` when a reference is in use), mapped through
    ``GeneMatcher`` for the selected organism, and the enrichment is
    then submitted to ``self._executor`` as a background task.
    ``self._onEnrichTaskFinished`` is connected to the task's
    ``finished`` signal. Nothing is done when ``self.data`` is empty.
    """
    if not self.data:
        return

    # clear messages left over from a previous run
    self.error(0)
    self.information(0)

    # XXX: Check data in setData, do not even allow this to be executed if
    # data has no genes
    try:
        genes = self.GeneNamesFromData(self.data)
    except ValueError:
        # the error is reported, but the update continues with no genes
        self.error(0, "Cannot extract gene names from input.")
        genes = []

    # when names come from a column, a comma inside a value is taken to
    # separate several gene names
    if not self.useAttrNames and any("," in gene for gene in genes):
        genes = reduce(add, (split_and_strip(gene, ",")
                             for gene in genes),
                       [])
        self.information(0,
                         "Separators detected in input gene names. "
                         "Assuming multiple genes per instance.")

    self.queryGenes = genes

    self.information(1)
    reference = None
    if self.useReference and self.refData:
        # same extraction and comma splitting for the reference data
        reference = self.GeneNamesFromData(self.refData)
        if not self.useAttrNames \
                and any("," in gene for gene in reference):
            reference = reduce(add, (split_and_strip(gene, ",")
                                     for gene in reference),
                               [])
            self.information(1,
                             "Separators detected in reference gene "
                             "names. Assuming multiple genes per "
                             "instance.")

    org_code = self.SelectedOrganismCode()

    # map every input gene name to its NCBI id (stored as a string)
    from orangecontrib.bioinformatics.ncbi.gene import GeneMatcher
    gm = GeneMatcher(kegg.to_taxid(org_code))
    gm.genes = genes
    gm.run_matcher()
    mapped_genes = {gene: str(ncbi_id) for gene, ncbi_id in gm.map_input_to_ncbi().items()}

    def run_enrichment(org_code, genes, reference=None, progress=None):
        # Executed through the task created below. ``genes`` is the
        # mapping built above (its ``values()`` are used); ``reference``
        # defaults to all NCBI ids of the organism.
        org = kegg.KEGGOrganism(org_code)
        if reference is None:
            reference = org.get_ncbi_ids()

        # This is here just to keep widget working without any major changes.
        # map not needed, geneMatcher will not work on widget level.
        unique_genes = genes
        unique_ref_genes = dict([(gene, gene) for gene in set(reference)])

        taxid = kegg.to_taxid(org.org_code)
        # Map the taxid back to standard 'common' taxids
        # (as used by 'geneset') if applicable
        # NOTE(review): ``taxid`` is not read again in this function.
        r_tax_map = dict((v, k) for k, v in kegg.KEGGGenome.TAXID_MAP.items())
        if taxid in r_tax_map:
            taxid = r_tax_map[taxid]

        # We use the kegg pathway gene sets provided by 'geneset' for
        # the enrichment calculation.
        kegg_api = kegg.api.CachedKeggApi()
        linkmap = kegg_api.link(org.org_code, "pathway")
        converted_ids = kegg_api.conv(org.org_code, 'ncbi-geneid')
        # dict of upper-cased gene -> part of the ncbi entry after the last ':'
        kegg_sets = relation_list_to_multimap(linkmap, dict((gene.upper(), ncbi.split(':')[-1]) for ncbi, gene in converted_ids))

        kegg_sets = geneset.GeneSets(input=kegg_sets)

        pathways = pathway_enrichment(
            kegg_sets, unique_genes.values(),
            unique_ref_genes.keys(),
            callback=progress
        )
        # Ensure that pathway entries are pre-cached for later use in the
        # list/tree view
        kegg_pathways = kegg.KEGGPathways()
        kegg_pathways.pre_cache(
            pathways.keys(), progress_callback=progress
        )

        return pathways, org, unique_genes, unique_ref_genes

    # disable the widget while the task is running
    self.progressBarInit()
    self.setEnabled(False)
    self.infoLabel.setText("Retrieving...\n")

    # progress values are forwarded to the widget's "setProgress"
    progress = concurrent.methodinvoke(self, "setProgress", (float,))

    self._enrichTask = concurrent.Task(
        function=lambda:
            run_enrichment(org_code, mapped_genes, reference, progress)
    )
    self._enrichTask.finished.connect(self._onEnrichTaskFinished)
    self._executor.submit(self._enrichTask)
def _on_dataready(self):
    """
    Handle completion of the data task.

    Re-enables the widget, takes the table from ``self._datatask``,
    restricts it to the selected sample annotations, attaches gene
    identifier information and sends it to the "Expression Data"
    output. On ``URLError`` an error is shown and nothing is sent.
    """
    self.setEnabled(True)
    self.setBlocking(False)
    self.progressBarFinished(processEvents=False)

    try:
        data = self._datatask.result()
    except urlrequest.URLError as error:
        self.error(0, ("Error while connecting to the NCBI ftp server! "
                       "'%s'" % error))
        sys.excepthook(type(error), error, getattr(error, "__traceback__"))
        return
    finally:
        # the task reference is dropped on success and on failure
        self._datatask = None

    # remember the name; ``data`` is rebound to new tables below
    data_name = data.name
    samples, _ = self.selectedSamples()

    self.warning(0)
    message = None
    from orangecontrib.bioinformatics.ncbi.gene import GeneMatcher
    gene_matcher = GeneMatcher(self.currentGds.get('taxid', ''))

    if self.outputRows:
        def samplesinst(ex):
            # (name, value) pairs of all metas of a row, plus the class
            # value unless the class variable is named 'class'
            out = []
            for meta in data.domain.metas:
                out.append((meta.name, ex[meta].value))

            if data.domain.class_var.name != 'class':
                out.append((data.domain.class_var.name,
                            ex[data.domain.class_var].value))

            return out
        samples = set(samples)
        # keep rows whose annotations are all among the selected samples
        mask = [samples.issuperset(samplesinst(ex)) for ex in data]
        data = data[numpy.array(mask, dtype=bool)]
        gene_matcher.match_table_attributes(data)

        if len(data) == 0:
            message = "No samples with selected sample annotations."
    else:
        samples = set(samples)
        # keep attributes whose annotations are all among the selected samples
        domain = Domain(
            [attr for attr in data.domain.attributes
             if samples.issuperset(attr.attributes.items())],
            data.domain.class_var,
            data.domain.metas
        )
        # domain.addmetas(data.domain.getmetas())

        if len(domain.attributes) == 0:
            message = "No samples with selected sample annotations."
        # drop attribute annotations whose key is not a selected sample type
        stypes = set(s[0] for s in samples)
        for attr in domain.attributes:
            attr.attributes = dict(
                (key, value) for key, value in attr.attributes.items()
                if key in stypes
            )
        data = Table(domain, data)

        if 'gene' in data.domain:
            # match the 'gene' column and append a meta column with the
            # NCBI id ('?' when missing)
            gene_column = data.domain['gene']
            gene_names = data.get_column_view(gene_column)[0]
            gene_matcher.genes = gene_names
            gene_matcher.run_matcher()
            domain_ids = Domain([], metas=[StringVariable(NCBI_ID)])
            data_ids = [[str(gene.ncbi_id) if gene.ncbi_id else '?'] for gene in gene_matcher.genes]
            table_ids = Table(domain_ids, data_ids)
            data = Table.concatenate([data, table_ids])

    if message is not None:
        self.warning(0, message)

    # table attributes describing organism and gene id location
    data.attributes[TAX_ID] = self.currentGds.get('taxid', '')
    data.attributes[GENE_AS_ATTRIBUTE_NAME] = bool(self.outputRows)

    if not bool(self.outputRows):
        data.attributes[GENE_ID_COLUMN] = NCBI_ID
    else:
        data.attributes[GENE_ID_ATTRIBUTE] = NCBI_ID

    data.name = data_name
    self.send("Expression Data", data)

    # set the first-column display text of the current dataset's row
    model = self.treeWidget.model().sourceModel()
    row = self.gds.index(self.currentGds)

    model.setData(model.index(row, 0), " ", Qt.DisplayRole)

    self.updateInfo()
    self.selectionChanged = False