def dicty_mutant_gene_sets(tax_id: str):
    """
    Build Dictybase mutant-phenotype gene sets and write them as GMT files.

    Nothing happens (and ``None`` is returned) unless ``tax_id`` is ``'44689'``.
    One gene set is created per phenotype; the sets are then grouped by
    hierarchy and each group is written under ``{data_path}/gene_sets/``.

    :param tax_id: taxonomy identifier of the requested organism
    """
    if tax_id != '44689':
        return

    matcher = GeneMatcher('44689')
    phenotype_sets = []

    for phenotype, mutants in phenotypes.phenotype_mutants().items():
        # commas in the phenotype label are replaced by spaces before the
        # label is used as both the gene set id and its name
        phenotype = phenotype.replace(",", " ")

        # only the first gene reported for each mutant is matched
        matcher.genes = [phenotypes.mutant_genes(mutant)[0] for mutant in mutants]

        # keep the ids of genes that were matched, as strings
        matched_ids = {str(gene.gene_id) for gene in matcher.genes if gene.gene_id is not None}

        phenotype_sets.append(
            GeneSet(
                gs_id=phenotype,
                name=phenotype,
                genes=matched_ids,
                hierarchy=('Dictybase', 'Phenotypes'),
                organism=tax_id,
                link='',
            )
        )

    for gs_group in GeneSets(phenotype_sets).split_by_hierarchy():
        hierarchy = gs_group.common_hierarchy()
        gs_group.to_gmt_file_format(f'{data_path}/gene_sets/{filename(hierarchy, tax_id)}')
def test_synonym_multiple_matches(self):
    """A synonym with multiple matches ('HB1', human) must not yield a gene id."""
    gm = GeneMatcher('9606')
    gm.genes = ['HB1']
    gene = gm.genes[0]

    # the original input string is kept on the gene object
    self.assertEqual(gene.input_identifier, 'HB1')
    # Gene matcher should not find any unique match
    self.assertIsNone(gene.gene_id)
def cytoband_gene_sets(tax_id: str) -> None:
    """
    Download the Stanford cytoband GMT file and write cytoband gene sets.

    Nothing happens unless ``tax_id`` is ``'9606'``. Every line of the
    downloaded file becomes one gene set: the first tab-separated field is
    the set id, the second its name and the remaining fields are the gene
    symbols handed to the gene matcher.

    :param tax_id: taxonomy identifier of the requested organism
    """
    if tax_id != '9606':
        return

    download_link = 'http://statweb.stanford.edu/~tibs/GSA/cytobands-stanford.gmt'
    matcher = GeneMatcher('9606')

    with urlopen(download_link) as stream:
        raw_lines = stream.read().splitlines()

    band_sets = []
    for raw_line in raw_lines:
        fields = raw_line.decode().split('\t')
        symbols = fields[2:]

        matcher.genes = symbols
        # only genes with a matched id are kept
        matched_ids = {gene.gene_id for gene in matcher.genes if gene.gene_id is not None}

        band_sets.append(
            GeneSet(
                gs_id=fields[0],
                name=fields[1],
                genes=matched_ids if symbols else set(),
                hierarchy=('Cytobands', ),
                organism='9606',
                link='',
            )
        )

    for gs_group in GeneSets(band_sets).split_by_hierarchy():
        hierarchy = gs_group.common_hierarchy()
        gs_group.to_gmt_file_format(f'{data_path}/gene_sets/{filename(hierarchy, tax_id)}')
def cytoband_gene_sets(org):
    """
    Build cytoband gene sets from the file at ``CYTOBAND_DOWNLOAD_LINK``.

    :param org: taxonomy identifier; only ``'9606'`` is handled
    :return: a ``GeneSets`` collection, or ``None`` for any other organism
    """
    if org != '9606':
        return None

    matcher = GeneMatcher('9606')

    with urlopen(CYTOBAND_DOWNLOAD_LINK) as stream:
        raw_lines = stream.read().splitlines()

    band_sets = []
    for raw_line in raw_lines:
        # tab-separated fields: id, name, then gene symbols
        fields = raw_line.decode().split('\t')
        symbols = fields[2:]

        matcher.genes = symbols
        matcher.run_matcher()

        # matched genes are stored by their integer NCBI id
        ncbi_ids = [int(gene.ncbi_id) for gene in matcher.genes if gene.ncbi_id is not None]

        band_sets.append(
            GeneSet(
                gs_id=fields[0],
                name=fields[1],
                genes=ncbi_ids if symbols else [],
                hierarchy=('Cytobands', ),
                organism='9606',
                link='',
            )
        )

    return GeneSets(band_sets)
def reactome_gene_sets(org):
    """
    Prepare human pathway gene sets from the Reactome pathways archive.

    The zip archive at ``REACTOME_DOWNLOAD_LINK`` is downloaded into memory
    and ``REACTOME_FILE_NAME`` is read from it. Every line becomes one gene
    set: the first tab-separated field is used as both id and name, fields
    from the third onward are the gene symbols to match.

    :param org: taxonomy identifier; only ``'9606'`` is handled
    :return: a ``GeneSets`` collection, or ``None`` for any other organism
    """
    if org != '9606':
        return None

    gene_matcher = GeneMatcher('9606')

    with urlopen(REACTOME_DOWNLOAD_LINK) as url:
        memfile = io.BytesIO(url.read())

    # both the archive and the member file are closed deterministically
    with ZipFile(memfile, 'r') as myzip, myzip.open(REACTOME_FILE_NAME) as f:
        content = f.read().decode().splitlines()

    genesets = []
    for path in content:
        # split once per line instead of once per field access
        fields = path.split('\t')

        # an empty slice is already an empty list, no fallback needed
        gene_matcher.genes = fields[2:]
        gene_matcher.run_matcher()

        # matched genes are stored by their integer NCBI id
        genes = [int(gene.ncbi_id) for gene in gene_matcher.genes if gene.ncbi_id is not None]

        genesets.append(
            GeneSet(
                gs_id=fields[0],
                name=fields[0],
                genes=genes,
                hierarchy=('Reactome', 'Pathways'),
                organism='9606',
                link='',
            )
        )

    return GeneSets(genesets)
def send_to_output(self, result):
    """
    Turn a finished download into an annotated table and send it to the output.

    :param result: pair ``(etc_json, table_name)`` produced by the download task
    """
    self.progress_bar.finish()
    self.setStatusMessage('')

    etc_json, table_name = result
    genes_in_columns = bool(self.gene_as_attr_name)

    # convert to table and name it
    data = etc_to_table(etc_json, genes_in_columns)
    data.name = table_name

    # match genes
    matcher = GeneMatcher(str(self.organism))

    if genes_in_columns:
        # genes are feature names: annotate the attributes themselves
        matcher.match_table_attributes(data)
        data.attributes[GENE_ID_ATTRIBUTE] = ENTREZ_ID
    else:
        # genes are stored in the 'Gene' column, when that column exists
        if 'Gene' in data.domain:
            data = matcher.match_table_column(data, 'Gene', StringVariable(ENTREZ_ID))
        data.attributes[GENE_ID_COLUMN] = ENTREZ_ID

    # add table attributes
    data.attributes[TAX_ID] = str(self.organism)
    data.attributes[GENE_AS_ATTRIBUTE_NAME] = genes_in_columns

    # reset cache indicators
    self.set_cached_indicator()

    # send data to the output signal
    self.Outputs.etc_data.send(data)
def __init__(self, organism, ontology=None, progress_callback=None): self.ontology = ontology #: A dictionary mapping a gene (gene_id) to a set of all annotations of that gene. self.gene_annotations = defaultdict(list) #: A dictionary mapping a GO term id to a set of annotations that are directly annotated to that term self.term_anotations = defaultdict(list) self.all_annotations = defaultdict(list) self._gene_names = None self._gene_names_dict = None self.gene_matcher = GeneMatcher(organism) #: A list of all :class:`AnnotationRecords` instances. self.annotations = [] self.header = '' self.taxid = organism self._ontology = None try: path = serverfiles.localpath_download( DOMAIN, FILENAME_ANNOTATION.format(organism), progress_callback=progress_callback) except FileNotFoundError: raise taxonomy.UnknownSpeciesIdentifier(organism) self._parse_file(path)
def omim_gene_sets(org):
    """
    Build gene sets from OMIM (Online Mendelian Inheritance in Man) diseases.

    :param org: taxonomy identifier; only ``'9606'`` is handled
    :return: a ``GeneSets`` collection with one set per disease, or ``None``
        for any other organism
    """
    if org != '9606':
        return None

    matcher = GeneMatcher('9606')
    disease_sets = []

    for disease in omim.diseases():
        matcher.genes = omim.disease_genes(disease)
        matcher.run_matcher()

        # matched genes are stored by their integer NCBI id
        ncbi_ids = [int(gene.ncbi_id) for gene in matcher.genes if gene.ncbi_id is not None]

        # a link is only produced for diseases that carry an id
        if disease.id:
            disease_link = OMIM_LINK.format(disease.id)
        else:
            disease_link = None

        disease_sets.append(
            GeneSet(
                gs_id=disease.id,
                name=disease.name,
                genes=ncbi_ids,
                hierarchy=('OMIM', ),
                organism='9606',
                link=disease_link,
            )
        )

    return GeneSets(disease_sets)
def dicty_mutant_gene_sets(org):
    """
    Build Dictybase mutant-phenotype gene sets.

    :param org: taxonomy identifier; only ``'352472'`` is handled
    :return: a ``GeneSets`` collection with one set per phenotype, or
        ``None`` for any other organism
    """
    if org != '352472':
        return None

    matcher = GeneMatcher('352472')
    phenotype_sets = []

    for phenotype, mutants in dicty.phenotypes.phenotype_mutants().items():
        # only the first gene reported for each mutant is matched
        gene_symbols = [dicty.phenotypes.mutant_genes(mutant)[0] for mutant in mutants]

        matcher.genes = gene_symbols
        matcher.run_matcher()

        # matched genes are stored by their integer NCBI id
        ncbi_ids = [int(gene.ncbi_id) for gene in matcher.genes if gene.ncbi_id is not None]

        # report the input/matched counts whenever some symbols were not matched
        if len(gene_symbols) != len(ncbi_ids):
            print(len(gene_symbols), len(ncbi_ids))

        phenotype_sets.append(
            GeneSet(
                gs_id=phenotype,
                name=phenotype,
                genes=ncbi_ids,
                hierarchy=('Dictybase', 'Phenotypes'),
                organism='352472',
                link='',
            )
        )

    return GeneSets(phenotype_sets)
def test_taxonomy_change(self):
    """Changing ``tax_id`` on a matcher also changes the database file it points to."""
    matcher = GeneMatcher('4932')

    db_file = basename(normpath(matcher.gene_db_path))
    self.assertEqual(matcher.tax_id, '4932')
    self.assertEqual(db_file, '4932.sqlite')

    # switch organism on the existing instance
    matcher.tax_id = '9606'

    db_file = basename(normpath(matcher.gene_db_path))
    self.assertEqual(matcher.tax_id, '9606')
    self.assertEqual(db_file, '9606.sqlite')
def test_symbol_match_scenario(self):
    """An exact human gene symbol resolves to its symbol and gene id."""
    matcher = GeneMatcher('9606')
    matcher.genes = ['SCN5A']
    matched = matcher.genes[0]

    # expected value for each attribute of the matched gene, checked in order
    self.assertEqual(matched.input_identifier, 'SCN5A')
    self.assertEqual(matched.symbol, 'SCN5A')
    self.assertEqual(matched.gene_id, '6331')
def test_match_table_attributes(self):
    """Every attribute of the transposed table is annotated with ``ENTREZ_ID``."""
    gm = GeneMatcher('4932')

    data = Table('brown-selected.tab')
    # transpose so that the values of the 'gene' column become feature names
    data = Table.transpose(data, feature_names_column='gene')
    gm.match_table_attributes(data)

    for column in data.domain.attributes:
        # assertIn reports the missing key and the container on failure
        self.assertIn(ENTREZ_ID, column.attributes)
def test_different_input_identifier_types(self):
    """Each of the four differently formatted human identifiers is fully resolved."""
    matcher = GeneMatcher('9606')
    matcher.genes = ['CD4', '614535', 'HB-1Y', 'ENSG00000205426']

    # attributes that must be populated on every matched gene, in check order
    required = ('description', 'tax_id', 'species', 'gene_id')

    for matched in matcher.genes:
        for attribute in required:
            self.assertIsNotNone(getattr(matched, attribute))
def find_homologs(self, genes: List[Union[str, Gene]]) -> List[Optional[Gene]]:
    """
    Look up homologs of ``genes`` in the target organism.

    The genes are first matched in ``self.source_tax``; each matched gene is
    asked for its homolog in ``self.target_tax`` and the resulting list is
    passed through ``load_gene_summary`` for the target organism.

    :param genes: gene identifiers (or gene objects) of the source organism
    :return: the value returned by ``load_gene_summary``
    """
    source_matcher = GeneMatcher(self.source_tax)
    source_matcher.genes = genes

    homologs = []
    for source_gene in source_matcher.genes:
        homologs.append(source_gene.homolog_gene(taxonomy_id=self.target_tax))

    return load_gene_summary(self.target_tax, homologs)
def _update_gene_matcher(self):
    """
    Refresh the input genes and push them, with the selected organism,
    into the widget's gene matcher (creating the matcher on first use).
    """
    self.gene_names_from_table()

    # with no genes on input the info box is refreshed right away
    if not self.input_genes:
        self._update_info_box()

    matcher = self.gene_matcher
    if not matcher:
        # lazily create a case-insensitive matcher for the selected organism
        matcher = GeneMatcher(self.get_selected_organism(), case_insensitive=True)
        self.gene_matcher = matcher

    matcher.genes = self.input_genes
    matcher.organism = self.get_selected_organism()
def test_homologs(self):
    """Human gene '920' exposes homology data, including a mouse homolog."""
    matcher = GeneMatcher('9606')
    matcher.genes = ['920']
    human_gene = matcher.genes[0]

    # homolog mapping exists, is non-empty and contains mouse
    self.assertIsNotNone(human_gene.homologs)
    self.assertTrue(len(human_gene.homologs))
    self.assertIn('10090', human_gene.homologs)

    # homology group and the concrete mouse homolog
    self.assertEqual(human_gene.homology_group_id, '513')
    self.assertEqual(human_gene.homolog_gene('10090'), '12504')

    # an unknown taxonomy gives no homolog
    self.assertIsNone(human_gene.homolog_gene('Unknown_taxonomy'))
def matchDDBids(genesDDB):
    """
    Match the given identifiers with a gene matcher for organism 44689.

    :param genesDDB: gene identifiers to match
    :return: dict mapping each input identifier to a tuple
        ``(symbol, entrez id, description)``, every element passed through
        ``parseNoneStr``
    """
    matcher = GeneMatcher(44689)
    matcher.genes = genesDDB

    return {
        gene.input_identifier: (
            parseNoneStr(gene.symbol),
            parseNoneStr(gene.gene_id),
            parseNoneStr(gene.description),
        )
        for gene in matcher.genes
    }
def run_gene_matcher(gene_matcher: GeneMatcher, state: TaskState):
    """
    Run ``gene_matcher.match_genes()`` while reporting progress to ``state``.

    A counting callback is installed as the matcher's ``_progress_callback``;
    every call advances the reported progress by one gene out of the number
    of genes the matcher held when this function was entered.

    :param gene_matcher: matcher whose genes are to be matched
    :param state: task state receiving status and progress updates
    """
    total = len(gene_matcher.genes)
    done = 0

    def report_progress():
        nonlocal done
        done += 1
        state.set_progress_value(100 * (done / total))

    state.set_status("Working ...")
    gene_matcher._progress_callback = report_progress
    gene_matcher.match_genes()
def reactome_gene_sets(tax_id: str) -> None:
    """
    Prepare human pathway gene sets from Reactome and write them as GMT files.

    Nothing happens unless ``tax_id`` is ``'9606'``. The pathways archive is
    downloaded into memory; each line of the contained GMT file becomes one
    gene set whose name is the first tab-separated field, whose id is the
    second field, and whose genes are matched from the remaining fields.
    Commas in name and id are replaced by spaces.

    :param tax_id: taxonomy identifier of the requested organism
    """
    if tax_id != '9606':
        return

    download_link = 'http://www.reactome.org/download/current/ReactomePathways.gmt.zip'
    file_name = 'ReactomePathways.gmt'
    detail_link = 'https://reactome.org/content/detail/{}'

    gene_matcher = GeneMatcher('9606')

    with urlopen(download_link) as url:
        memfile = io.BytesIO(url.read())

    # both the archive and the member file are closed deterministically
    with ZipFile(memfile, 'r') as myzip, myzip.open(file_name) as f:
        content = f.read().decode().splitlines()

    genesets = []
    for path in content:
        # split once per line instead of once per field access
        fields = path.split('\t')

        # an empty slice is already an empty list, no fallback needed
        gene_matcher.genes = fields[2:]

        # keep the ids of genes that were matched, as strings
        genes = {str(gene.gene_id) for gene in gene_matcher.genes if gene.gene_id is not None}

        pathway = fields[0].replace(',', ' ')
        pathway_id = fields[1].replace(',', ' ')

        genesets.append(
            GeneSet(
                gs_id=pathway_id,
                name=pathway,
                genes=genes,
                hierarchy=('Reactome', 'pathways'),
                organism='9606',
                link=detail_link.format(pathway_id),
            )
        )

    for gs_group in GeneSets(genesets).split_by_hierarchy():
        hierarchy = gs_group.common_hierarchy()
        gs_group.to_gmt_file_format(f'{data_path}/gene_sets/{filename(hierarchy, tax_id)}')
def name_genes_entrez(gene_names: list, key_entrez: bool, organism: int = ORGANISM) -> dict:
    """
    Pair each gene name with the Entrez ID found by the gene matcher.

    Names for which the matcher reports no Entrez ID are left out.

    :param gene_names: Gene names (eg. from dictyBase)
    :param key_entrez: True: Entrez IDs are the keys and names the values,
        False: names are the keys and Entrez IDs the values
    :param organism: organism ID handed to the gene matcher
    :return: Dict pairing gene names and their matching Entrez IDs
    """
    matcher = GeneMatcher(organism)
    matcher.genes = gene_names

    mapping = {}
    for gene in matcher.genes:
        name, entrez = gene.input_identifier, gene.gene_id
        if entrez is None:
            # unmatched gene, nothing to record
            continue

        key, value = (entrez, name) if key_entrez else (name, entrez)
        mapping[key] = value

    return mapping
def send_to_output(self, result):
    """
    Turn a finished download into an annotated table and send it to the output.

    When genes are stored in rows and a 'Gene' column exists, the matched
    Entrez ids are appended to the table as an extra string meta column.

    :param result: pair ``(etc_json, table_name)`` produced by the download task
    """
    self.progress_bar.finish()
    self.setStatusMessage('')

    etc_json, table_name = result
    genes_in_columns = bool(self.gene_as_attr_name)

    # convert to table and name it
    data = etc_to_table(etc_json, genes_in_columns)
    data.name = table_name

    # match genes
    matcher = GeneMatcher(str(self.organism))

    if genes_in_columns:
        # genes are feature names: annotate the attributes themselves
        matcher.match_table_attributes(data)
        data.attributes[GENE_ID_ATTRIBUTE] = ENTREZ_ID
    else:
        if 'Gene' in data.domain:
            # match the values stored in the 'Gene' column
            matcher.genes = data.get_column_view(data.domain['Gene'])[0]

            # one-column table of ids; '?' stands for an unmatched gene
            id_domain = Domain([], metas=[StringVariable(ENTREZ_ID)])
            id_rows = [[str(gene.gene_id) if gene.gene_id else '?'] for gene in matcher.genes]
            id_table = Table(id_domain, id_rows)

            data = Table.concatenate([data, id_table])

        data.attributes[GENE_ID_COLUMN] = ENTREZ_ID

    # add table attributes
    data.attributes[TAX_ID] = str(self.organism)
    data.attributes[GENE_AS_ATTRIBUTE_NAME] = genes_in_columns

    # reset cache indicators
    self.set_cached_indicator()

    # send data to the output signal
    self.Outputs.etc_data.send(data)
def Update(self):
    """
    Update (recompute enriched pathways) the widget state.

    Gene names are extracted from the input data (and, optionally, from the
    reference data), mapped to NCBI ids with a gene matcher, and a
    background task computing the KEGG pathway enrichment is submitted to
    the widget's executor. Returns early when there is no input data.
    """
    if not self.data:
        return

    # clear messages left over from a previous run
    self.error(0)
    self.information(0)

    # XXX: Check data in setData, do not even allow this to be executed if
    # data has no genes
    try:
        genes = self.GeneNamesFromData(self.data)
    except ValueError:
        # the error is shown but processing continues with no genes
        self.error(0, "Cannot extract gene names from input.")
        genes = []

    if not self.useAttrNames and any("," in gene for gene in genes):
        # flatten comma separated entries into a single list of names
        genes = reduce(add, (split_and_strip(gene, ",")
                             for gene in genes),
                       [])
        self.information(0,
                         "Separators detected in input gene names. "
                         "Assuming multiple genes per instance.")

    self.queryGenes = genes

    self.information(1)
    reference = None
    if self.useReference and self.refData:
        # NOTE(review): unlike the query genes above, this call is not
        # guarded against ValueError - confirm that is intended.
        reference = self.GeneNamesFromData(self.refData)
        if not self.useAttrNames \
                and any("," in gene for gene in reference):
            reference = reduce(add, (split_and_strip(gene, ",")
                                     for gene in reference),
                               [])
            self.information(1,
                             "Separators detected in reference gene "
                             "names. Assuming multiple genes per "
                             "instance.")

    org_code = self.SelectedOrganismCode()

    # function-scope import of the gene matcher
    from orangecontrib.bioinformatics.ncbi.gene import GeneMatcher
    gm = GeneMatcher(kegg.to_taxid(org_code))
    gm.genes = genes
    gm.run_matcher()
    # input gene name -> NCBI id (as a string)
    mapped_genes = {gene: str(ncbi_id) for gene, ncbi_id in gm.map_input_to_ncbi().items()}

    def run_enrichment(org_code, genes, reference=None, progress=None):
        # Executed through the task created below. `genes` is the
        # name -> NCBI id mapping built above.
        org = kegg.KEGGOrganism(org_code)
        if reference is None:
            # no reference supplied: use all NCBI ids of the organism
            reference = org.get_ncbi_ids()

        # This is here just to keep widget working without any major changes.
        # map not needed, geneMatcher will not work on widget level.
        unique_genes = genes
        unique_ref_genes = dict([(gene, gene) for gene in set(reference)])

        taxid = kegg.to_taxid(org.org_code)
        # Map the taxid back to standard 'common' taxids
        # (as used by 'geneset') if applicable
        # NOTE(review): `taxid` is not read again within this function.
        r_tax_map = dict((v, k) for k, v in
                         kegg.KEGGGenome.TAXID_MAP.items())
        if taxid in r_tax_map:
            taxid = r_tax_map[taxid]

        # We use the kegg pathway gene sets provided by 'geneset' for
        # the enrichment calculation.

        kegg_api = kegg.api.CachedKeggApi()
        linkmap = kegg_api.link(org.org_code, "pathway")
        converted_ids = kegg_api.conv(org.org_code, 'ncbi-geneid')
        # upper-cased gene entry -> text after the last ':' of the ncbi entry
        kegg_sets = relation_list_to_multimap(linkmap, dict((gene.upper(), ncbi.split(':')[-1])
                                                            for ncbi, gene in converted_ids))

        kegg_sets = geneset.GeneSets(input=kegg_sets)

        pathways = pathway_enrichment(
            kegg_sets, unique_genes.values(),
            unique_ref_genes.keys(),
            callback=progress
        )
        # Ensure that pathway entries are pre-cached for later use in the
        # list/tree view
        kegg_pathways = kegg.KEGGPathways()
        kegg_pathways.pre_cache(
            pathways.keys(), progress_callback=progress
        )

        return pathways, org, unique_genes, unique_ref_genes

    # the widget is disabled while the task is set up and submitted
    self.progressBarInit()
    self.setEnabled(False)
    self.infoLabel.setText("Retrieving...\n")

    # progress callback bound to this widget's "setProgress" method
    progress = concurrent.methodinvoke(self, "setProgress", (float,))

    self._enrichTask = concurrent.Task(
        function=lambda:
            run_enrichment(org_code, mapped_genes, reference, progress)
    )
    self._enrichTask.finished.connect(self._onEnrichTaskFinished)
    self._executor.submit(self._enrichTask)
def _update_gene_matcher(self):
    """
    Re-read the input genes and install a fresh gene matcher for the
    currently selected organism (created with ``auto_start=False``).
    """
    self.gene_names_from_table()

    organism = self.get_selected_organism()
    matcher = GeneMatcher(organism, auto_start=False)

    self.gene_matcher = matcher
    matcher.genes = self.input_genes
class OWGenes(OWWidget, ConcurrentWidgetMixin):
    """
    Widget that matches gene identifiers from an input table and outputs the
    annotated (and optionally filtered) table together with a table of the
    gene matcher results.
    """

    name = "Genes"
    description = "Tool for working with genes"
    icon = "../widgets/icons/OWGeneInfo.svg"
    priority = 40
    want_main_area = True

    # index into `self.organisms` (see `get_selected_organism`)
    selected_organism: int = Setting(11)
    search_pattern: str = Setting('')
    exclude_unmatched = Setting(True)
    replace_id_with_symbol = Setting(True)
    auto_commit = Setting(True)

    settingsHandler = DomainContextHandler()
    selected_gene_col = ContextSetting(None)
    use_attr_names = ContextSetting(True)

    replaces = [
        'orangecontrib.bioinformatics.widgets.OWGeneNameMatcher.OWGeneNameMatcher'
    ]

    class Inputs:
        data_table = Input("Data", Table)

    class Outputs:
        data_table = Output("Data", Table)
        gene_matcher_results = Output("Genes", Table)

    class Information(OWWidget.Information):
        pass

    def sizeHint(self):
        """Return the preferred widget size."""
        return QSize(1280, 960)

    def __init__(self):
        """Initialise widget state and build the control and main areas."""
        OWWidget.__init__(self)
        ConcurrentWidgetMixin.__init__(self)

        # ATTRIBUTES #
        self.target_database = ENTREZ_ID

        # input data
        self.input_data = None
        self.input_genes = None
        self.tax_id = None
        self.column_candidates = []

        # input options
        self.organisms = []

        # gene matcher
        self.gene_matcher = None

        # progress bar
        self.progress_bar = None

        # single-shot timer that triggers `_apply_filter`
        # (started by `handle_filter_callback`)
        self._timer = QTimer()
        self._timer.timeout.connect(self._apply_filter)
        self._timer.setSingleShot(True)

        # GUI SECTION #

        # Control area
        self.info_box = widgetLabel(
            widgetBox(self.controlArea, "Info", addSpace=True),
            'No data on input.\n')

        organism_box = vBox(self.controlArea, 'Organism')
        self.organism_select_combobox = comboBox(
            organism_box, self, 'selected_organism',
            callback=self.on_input_option_change)

        # fills `self.organisms` and the combo box items
        self.get_available_organisms()
        self.organism_select_combobox.setCurrentIndex(self.selected_organism)

        box = widgetBox(self.controlArea, 'Gene IDs in the input data')
        self.gene_columns_model = itemmodels.DomainModel(
            valid_types=(StringVariable, DiscreteVariable))
        self.gene_column_combobox = comboBox(
            box,
            self,
            'selected_gene_col',
            label='Stored in data column',
            model=self.gene_columns_model,
            sendSelectedValue=True,
            callback=self.on_input_option_change,
        )

        self.attr_names_checkbox = checkBox(
            box,
            self,
            'use_attr_names',
            'Stored as feature (column) names',
            disables=[(-1, self.gene_column_combobox)],
            callback=self.on_input_option_change,
        )

        self.gene_column_combobox.setDisabled(bool(self.use_attr_names))

        output_box = vBox(self.controlArea, 'Output')

        # separator(output_box)
        # output_box.layout().addWidget(horizontal_line())
        # separator(output_box)
        self.exclude_radio = checkBox(output_box, self, 'exclude_unmatched',
                                      'Exclude unmatched genes',
                                      callback=self.commit)

        self.replace_radio = checkBox(output_box, self,
                                      'replace_id_with_symbol',
                                      'Replace feature IDs with gene names',
                                      callback=self.commit)

        auto_commit(self.controlArea, self, "auto_commit", "&Commit", box=False)

        rubber(self.controlArea)

        # Main area
        self.filter = lineEdit(self.mainArea, self, 'search_pattern',
                               'Filter:', callbackOnType=True,
                               callback=self.handle_filter_callback)
        # rubber(self.radio_group)
        self.mainArea.layout().addWidget(self.filter)

        # set splitter
        self.splitter = QSplitter()
        self.splitter.setOrientation(Qt.Vertical)

        self.table_model = GeneInfoModel()
        self.table_view = QTableView()
        self.table_view.setAlternatingRowColors(True)
        self.table_view.viewport().setMouseTracking(True)
        self.table_view.setSortingEnabled(True)
        self.table_view.setShowGrid(False)
        self.table_view.verticalHeader().hide()
        # self.table_view.horizontalHeader().setSectionResizeMode(QHeaderView.Stretch)

        self.unknown_model = UnknownGeneInfoModel()

        self.unknown_view = QTableView()
        self.unknown_view.setModel(self.unknown_model)
        self.unknown_view.verticalHeader().hide()
        self.unknown_view.setShowGrid(False)
        self.unknown_view.setSelectionMode(QAbstractItemView.NoSelection)
        self.unknown_view.horizontalHeader().setSectionResizeMode(
            QHeaderView.Stretch)

        self.splitter.addWidget(self.table_view)
        self.splitter.addWidget(self.unknown_view)

        self.splitter.setStretchFactor(0, 90)
        self.splitter.setStretchFactor(1, 10)

        self.mainArea.layout().addWidget(self.splitter)

    def handle_filter_callback(self):
        """Restart the filter timer so the filter is applied after 500 ms."""
        self._timer.stop()
        self._timer.start(500)

    def _apply_filter(self):
        """Apply the current search pattern to the gene table and commit."""
        # filter only if input data is present and model is populated
        if self.table_model.table is not None:
            self.table_model.update_model(
                filter_pattern=str(self.search_pattern))
            self.commit()

    def __reset_widget_state(self):
        """Clear both table models and refresh the info box."""
        self.table_view.clearSpans()
        self.table_view.setModel(None)
        self.table_model.clear()
        self.unknown_model.clear()
        self._update_info_box()

    def _update_info_box(self):
        """Show gene counts in the info box, or a 'no data' message."""
        if self.input_genes and self.gene_matcher:
            num_genes = len(self.gene_matcher.genes)
            known_genes = len(self.gene_matcher.get_known_genes())

            info_text = ('{} genes in input data\n'
                         '{} genes match Entrez database\n'
                         '{} genes with match conflicts\n'.format(
                             num_genes, known_genes, num_genes - known_genes))

        else:
            info_text = 'No data on input.'

        self.info_box.setText(info_text)

    def on_done(self, _):
        """Populate the views once the gene matcher task has finished."""
        # update info box
        self._update_info_box()

        # set output options
        self.toggle_radio_options()

        # set known genes
        self.table_model.initialize(self.gene_matcher.genes)
        self.table_view.setModel(self.table_model)
        self.table_view.selectionModel().selectionChanged.connect(self.commit)
        self.table_view.setSelectionBehavior(QAbstractItemView.SelectRows)

        self.table_view.setItemDelegateForColumn(
            self.table_model.entrez_column_index,
            LinkStyledItemDelegate(self.table_view))
        v_header = self.table_view.verticalHeader()
        option = self.table_view.viewOptions()
        size = self.table_view.style().sizeFromContents(
            QStyle.CT_ItemViewItem, option, QSize(20, 20), self.table_view)

        v_header.setDefaultSectionSize(size.height() + 2)
        v_header.setMinimumSectionSize(5)
        self.table_view.horizontalHeader().setStretchLastSection(True)

        # set unknown genes
        self.unknown_model.initialize(self.gene_matcher.genes)
        self.unknown_view.verticalHeader().setStretchLastSection(True)

        self._apply_filter()

    def get_available_organisms(self):
        """Fill `self.organisms` and the organism combo box, sorted by name."""
        available_organism = sorted(((tax_id, taxonomy.name(tax_id)) for
                                     tax_id in taxonomy.common_taxids()),
                                    key=lambda x: x[1])

        self.organisms = [tax_id[0] for tax_id in available_organism]
        self.organism_select_combobox.addItems(
            [tax_id[1] for tax_id in available_organism])

    def gene_names_from_table(self):
        """
        Extract gene names from the input `Orange.data.Table` and store them
        in `self.input_genes` (nothing is returned).
        """
        self.input_genes = []
        if self.input_data:
            if self.use_attr_names:
                # genes are the (stripped) names of the table attributes
                self.input_genes = [
                    str(attr.name).strip()
                    for attr in self.input_data.domain.attributes
                ]
            else:
                if self.selected_gene_col is None:
                    self.selected_gene_col = self.gene_column_identifier()

                # genes are the values of the selected column; values for
                # which np.isnan is true are skipped
                self.input_genes = [
                    str(e[self.selected_gene_col]) for e in self.input_data
                    if not np.isnan(e[self.selected_gene_col])
                ]

    def _update_gene_matcher(self):
        """Re-read the input genes and create a new gene matcher for them."""
        self.gene_names_from_table()
        self.gene_matcher = GeneMatcher(self.get_selected_organism(),
                                        auto_start=False)
        self.gene_matcher.genes = self.input_genes
        # self.gene_matcher.organism = self.get_selected_organism()

    def get_selected_organism(self):
        """Return the entry of `self.organisms` at the selected index."""
        return self.organisms[self.selected_organism]

    def _run(self):
        """Start the gene matcher task when a matcher is available."""
        if self.gene_matcher is not None:
            self.start(run_gene_matcher, self.gene_matcher)

    def on_input_option_change(self):
        """Reset the views, rebuild the gene matcher and run it."""
        self.__reset_widget_state()
        self._update_gene_matcher()
        self._run()

    def gene_column_identifier(self):
        """
        Get most suitable column that stores genes. If there are
        several suitable columns, select the one with most unique
        values. Take the best one.
        """

        # candidates -> (variable, num of unique values)
        candidates = ((col,
                       np.unique(self.input_data.get_column_view(col)[0]).size)
                      for col in self.gene_columns_model
                      if isinstance(col, DiscreteVariable)
                      or isinstance(col, StringVariable))

        # the last element after an ascending sort has the most unique values
        best_candidate, _ = sorted(candidates, key=lambda x: x[1])[-1]
        return best_candidate

    def find_genes_location(self):
        """
        Try locate the genes in the input data when we first load the data.

        Proposed rules:
            - when no suitable feature names are present, check the columns.
            - find the most suitable column, that is, the one with most
              unique values.
        """
        domain = self.input_data.domain
        if not domain.attributes:
            if self.selected_gene_col is None:
                self.selected_gene_col = self.gene_column_identifier()
                self.use_attr_names = False

    @Inputs.data_table
    def handle_input(self, data):
        """Reset the widget for new input data and start gene matching."""
        self.closeContext()
        self.input_data = None
        self.input_genes = None
        self.__reset_widget_state()
        self.gene_columns_model.set_domain(None)
        self.selected_gene_col = None

        if data:
            self.input_data = data
            self.gene_columns_model.set_domain(self.input_data.domain)

            # check if input table has tax_id, human is used if tax_id is not found
            self.tax_id = str(self.input_data.attributes.get(TAX_ID, '9606'))
            # check for gene location. Default is that genes are attributes in the input table.
            self.use_attr_names = self.input_data.attributes.get(
                GENE_AS_ATTRIBUTE_NAME, self.use_attr_names)

            # the table's organism is only applied while the selected index
            # is falsy (i.e. 0)
            if self.tax_id in self.organisms and not self.selected_organism:
                self.selected_organism = self.organisms.index(self.tax_id)

            self.openContext(self.input_data.domain)
            self.find_genes_location()
            self.on_input_option_change()

    def commit(self):
        """
        Build the output tables from the current matching results, selection
        and output options, and send them to both outputs.
        """
        selection = self.table_view.selectionModel().selectedRows(
            self.table_model.entrez_column_index)

        selected_genes = [row.data() for row in selection]
        if not len(selected_genes):
            # nothing selected: fall back to all genes that pass the filter
            selected_genes = self.table_model.get_filtered_genes()

        gene_ids = self.get_target_ids()
        known_genes = [gid for gid in gene_ids if gid != '?']

        table = None
        gm_table = None
        if known_genes:
            # Genes are in rows (we have a column with genes).
            if not self.use_attr_names:

                if self.target_database in self.input_data.domain:
                    gene_var = self.input_data.domain[self.target_database]
                    metas = self.input_data.domain.metas
                else:
                    gene_var = StringVariable(self.target_database)
                    metas = self.input_data.domain.metas + (gene_var, )

                domain = Domain(self.input_data.domain.attributes,
                                self.input_data.domain.class_vars, metas)

                table = self.input_data.transform(domain)
                col, _ = table.get_column_view(gene_var)
                col[:] = gene_ids

                # filter selected rows
                selected_genes_set = set(selected_genes)
                selected_rows = [
                    row_index for row_index, row in enumerate(table)
                    if str(row[gene_var]) in selected_genes_set
                ]

                # handle table attributes
                table.attributes[TAX_ID] = self.get_selected_organism()
                table.attributes[GENE_AS_ATTRIBUTE_NAME] = False
                table.attributes[GENE_ID_COLUMN] = self.target_database
                table = table[selected_rows] if selected_rows else table

                if self.exclude_unmatched:
                    # create filter from selected column for genes
                    only_known = table_filter.FilterStringList(
                        gene_var, known_genes)
                    # apply filter to the data
                    table = table_filter.Values([only_known])(table)

                # NOTE(review): the table is sent again after the outer
                # `if` block below - confirm the double send is intended.
                self.Outputs.data_table.send(table)

            # genes are in columns (genes are features).
            else:
                domain = self.input_data.domain.copy()
                table = self.input_data.transform(domain)

                for gene in self.gene_matcher.genes:
                    if gene.input_identifier in table.domain:

                        table.domain[gene.input_identifier].attributes[
                            self.target_database] = (str(gene.gene_id)
                                                     if gene.gene_id else '?')

                        if self.replace_id_with_symbol:
                            try:
                                table.domain[gene.input_identifier].name = str(
                                    gene.symbol)
                            except AttributeError:
                                # TODO: missing gene symbol, need to handle this?
                                pass

                # filter selected columns
                selected_genes_set = set(selected_genes)
                selected = [
                    column for column in table.domain.attributes
                    if self.target_database in column.attributes
                    and str(column.attributes[
                        self.target_database]) in selected_genes_set
                ]

                output_attrs = table.domain.attributes

                if selected:
                    output_attrs = selected

                if self.exclude_unmatched:
                    known_genes_set = set(known_genes)
                    output_attrs = [
                        col for col in output_attrs if col.attributes[
                            self.target_database] in known_genes_set
                    ]

                domain = Domain(output_attrs, table.domain.class_vars,
                                table.domain.metas)

                table = table.from_table(domain, table)

                # handle table attributes
                table.attributes[TAX_ID] = self.get_selected_organism()
                table.attributes[GENE_AS_ATTRIBUTE_NAME] = True
                table.attributes[GENE_ID_ATTRIBUTE] = self.target_database

            gm_table = self.gene_matcher.to_data_table(
                selected_genes=selected_genes if selected_genes else None)

        # both outputs receive None when no gene was matched
        self.Outputs.data_table.send(table)
        self.Outputs.gene_matcher_results.send(gm_table)

    def toggle_radio_options(self):
        """Enable or disable the output checkboxes based on the match results."""
        self.replace_radio.setEnabled(bool(self.use_attr_names))

        if self.gene_matcher.genes:
            # enable checkbox if unknown genes are detected
            self.exclude_radio.setEnabled(
                len(self.gene_matcher.genes) != len(
                    self.gene_matcher.get_known_genes()))
            self.exclude_unmatched = len(self.gene_matcher.genes) != len(
                self.gene_matcher.get_known_genes())

    def get_target_ids(self):
        """Return one id string per matcher gene, '?' where there is no gene id."""
        return [
            str(gene.gene_id) if gene.gene_id else '?'
            for gene in self.gene_matcher.genes
        ]
def runner(self, state: TaskState) -> Table:
    """
    Produce the normalized expression table for the selected collection.

    When no table is cached in ``self.data_table``, expressions and sample
    metadata are fetched through ``resdk.tables.RNATables``, combined into a
    table, gene-matched, annotated and cached. The cached or freshly built
    table is then passed through ``self.normalize``.

    :param state: task state used for progress, status and interruption checks
    :return: the table returned by ``self.normalize``
    """
    exp_type = self.data_output_options.expression_type[self.exp_type].type
    exp_source = self.data_output_options.expression_sources[
        self.exp_source]
    proc_slug = self.data_output_options.process[self.proc_slug].slug
    collection_id = self.selected_collection_id

    table = self.data_table
    # NOTE(review): np.linspace(0, 50, 2) yields 0 then 50, and `next` is
    # called once below, so the value reported there is 0 - confirm.
    progress_steps_download = iter(np.linspace(0, 50, 2))

    def callback(i: float, status=""):
        # `i` is multiplied by 100 before being reported as progress
        state.set_progress_value(i * 100)
        if status:
            state.set_status(status)

        # a bare Exception is raised when interruption was requested
        if state.is_interruption_requested():
            raise Exception

    if not table:
        collection = self.res.get_collection_by_id(collection_id)
        coll_table = resdk.tables.RNATables(
            collection,
            expression_source=exp_source,
            expression_process_slug=proc_slug,
            progress_callable=wrap_callback(callback, end=0.5),
        )
        species = coll_table._data[0].output['species']
        sample = coll_table._samples[0]

        state.set_status('Downloading ...')
        # a new event loop is installed before the table properties are read
        # NOTE(review): presumably resdk fetches data asynchronously - confirm.
        # The loop is not closed if one of the reads below raises.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        df_exp = coll_table.exp if exp_type != 'rc' else coll_table.rc
        df_exp = df_exp.rename(index=coll_table.readable_index)
        df_metas = coll_table.meta
        df_metas = df_metas.rename(index=coll_table.readable_index)
        df_qc = None
        if self.append_qc_data:
            # TODO: check if there is a way to detect if collection
            #       table contains QC data
            try:
                df_qc = coll_table.qc
                df_qc = df_qc.rename(index=coll_table.readable_index)
            except ValueError:
                # on ValueError the QC columns are simply left out
                pass
        loop.close()

        state.set_status('To data table ...')

        # field names (second component of dotted column labels) that occur
        # more than once
        duplicates = {
            item
            for item, count in Counter([
                label.split('.')[1]
                for label in df_metas.columns.to_list() if '.' in label
            ]).items() if count > 1
        }

        # what happens if there is more nested sections?
        section_name_to_label = {
            section['name']: section['label']
            for section in sample.descriptor_schema.schema
        }

        column_labels = {}
        for field_schema, fields, path in iterate_schema(
                sample.descriptor, sample.descriptor_schema.schema,
                path=''):
            path = path[1:]  # this is ugly, but cant go around it
            if path not in df_metas.columns:
                continue
            label = field_schema['label']
            section_name, field_name = path.split('.')
            # duplicated field names are prefixed with their section label
            column_labels[path] = (
                label if field_name not in duplicates else
                f'{section_name_to_label[section_name]} - {label}')

        df_exp = df_exp.reset_index(drop=True)
        df_metas = df_metas.astype('object')
        df_metas = df_metas.fillna(np.nan)
        df_metas = df_metas.replace('nan', np.nan)
        df_metas = df_metas.rename(columns=column_labels)
        if df_qc is not None:
            df_metas = pd.merge(df_metas,
                                df_qc,
                                left_index=True,
                                right_index=True)

        xym, domain_metas = vars_from_df(df_metas)
        x, _, m = xym
        # first and third arrays from vars_from_df become the meta columns
        x_metas = np.hstack((x, m))
        attrs = [ContinuousVariable(col) for col in df_exp.columns]
        metas = domain_metas.attributes + domain_metas.metas
        domain = Domain(attrs, metas=metas)
        table = Table(domain, df_exp.to_numpy(), metas=x_metas)
        state.set_progress_value(next(progress_steps_download))

        state.set_status('Matching genes ...')
        # one progress step per gene id, spread over 50..99
        progress_steps_gm = iter(
            np.linspace(50, 99, len(coll_table.gene_ids)))

        def gm_callback():
            state.set_progress_value(next(progress_steps_gm))

        tax_id = species_name_to_taxid(species)
        gm = GeneMatcher(tax_id, progress_callback=gm_callback)
        table = gm.match_table_attributes(table, rename=True)
        table.attributes[TableAnnotation.tax_id] = tax_id
        table.attributes[TableAnnotation.gene_as_attr_name] = True
        table.attributes[TableAnnotation.gene_id_attribute] = 'Entrez ID'
        # cache the matched, not yet normalized table
        self.data_table = table

    state.set_status('Normalizing ...')
    table = self.normalize(table)
    state.set_progress_value(100)

    return table
def runner(
    res: ResolweAPI,
    data_objects: List[Data],
    options: DataOutputOptions,
    exp_type: int,
    proc_type: int,
    input_annotation: int,
    state: TaskState,
) -> Table:
    """
    Download expressions of the matching data objects and return them as a
    single table with sample metadata and matched gene ids.

    ``exp_type``, ``proc_type`` and ``input_annotation`` are indices into
    the corresponding ``options`` collections. Data objects are kept only
    if their process type, source, species and build (and, unless raw
    counts are requested, their expression type) agree with the selection.

    :raises ResolweDataObjectsNotFound: when no data object passes the
        filters.
    """
    expression_kind = options.expression[exp_type].type
    process_kind = options.process[proc_type].type
    annotation = options.input_annotation[input_annotation]
    source = annotation.source
    species = annotation.species
    build = annotation.build

    # Raw counts live in the output field named after the type itself,
    # every other expression type is stored under 'exp'.
    output_field = expression_kind if expression_kind == 'rc' else 'exp'

    def has_selected_annotation(obj) -> bool:
        output = obj.output
        return (output['source'] == source
                and output['species'] == species
                and output['build'] == build)

    # apply filters
    selected = [
        obj for obj in data_objects if obj.process.type == process_kind
    ]
    selected = [obj for obj in selected if has_selected_annotation(obj)]
    if expression_kind != 'rc':
        selected = [
            obj for obj in selected
            if obj.output['exp_type'] == expression_kind
        ]

    if not selected:
        raise ResolweDataObjectsNotFound

    # One step per download plus three post-processing steps.
    total_steps = len(selected) + 3
    finished_steps = 0

    def advance():
        nonlocal finished_steps
        finished_steps += 1
        state.set_progress_value(100 * (finished_steps / total_steps))

    sample_frames = []
    metadata = defaultdict(list)

    state.set_status('Downloading ...')
    for data_object in selected:
        advance()

        # Collect one single-item row per sample for every meta column.
        sample = data_object.sample
        general = sample.descriptor.get('general', {})
        for label in SAMPLE_DESCRIPTOR_LABELS:
            metadata[label].append([general.get(label, '')])
        metadata['sample_name'].append([sample.name])
        metadata['expression_type'].append([expression_kind.upper()])

        response = res.get_expressions(
            data_object.id, data_object.output[output_field]['file'])
        with io.BytesIO(response.content) as buffer:
            # expressions to data frame
            frame = pd.read_csv(buffer, sep='\t', compression='gzip')
        # Genes become columns; the sample is the single remaining row.
        sample_frames.append(
            frame.set_index('Gene').T.reset_index(drop=True))

    state.set_status('Concatenating samples ...')
    combined = pd.concat(sample_frames, axis=0)

    state.set_status('To data table ...')
    table = table_from_frame(combined)
    advance()

    state.set_status('Adding metadata ...')
    meta_vars = [StringVariable(label) for label in metadata]
    table = table.transform(
        Domain(table.domain.attributes, table.domain.class_vars, meta_vars))
    for key, value in metadata.items():
        table[:, key] = value
    advance()

    state.set_status('Matching genes ...')
    tax_id = species_name_to_taxid(species)
    matcher = GeneMatcher(tax_id)
    table = matcher.match_table_attributes(table, rename=True)
    table.attributes[TableAnnotation.tax_id] = tax_id
    table.attributes[TableAnnotation.gene_as_attr_name] = True
    table.attributes[TableAnnotation.gene_id_attribute] = 'Entrez ID'
    advance()

    return table
def panglao_db(file_path: str):
    """
    Convert a gzipped, tab-separated PanglaoDB marker file into a table of
    mouse and human marker genes and save it under ``data/marker_genes/``.

    Every input row contributes one entry per recognised organism code
    ('Mm' or 'Hs') found in its first column. Gene names are matched with
    ``GeneMatcher`` and each entry gets an 'Entrez ID' and a 'Function'
    (gene description) column.
    """
    file_name = 'panglao_gene_markers.tab'
    reference, reference_url = 'PanglaoDB', 'https://panglaodb.se/'

    # Positions of the columns used from every input row.
    species_idx, symbol_idx, cell_type_idx = 0, 1, 2
    organism_mapper = {'Mm': 'Mouse', 'Hs': 'Human'}

    with gzip.open(file_path, 'rb') as handle:
        content = handle.read().decode('utf-8').strip()

    genes_by_organism = defaultdict(list)
    for row in content.split('\n'):
        columns = row.split('\t')
        # The species column may list several space-separated codes.
        for code in columns[species_idx].split(' '):
            if code not in organism_mapper:
                continue
            organism = organism_mapper[code]
            genes_by_organism[organism].append([
                organism,
                columns[symbol_idx],
                columns[cell_type_idx],
                reference,
                reference_url,
            ])

    domain = Domain(
        [],
        metas=[
            StringVariable('Organism'),
            StringVariable('Name'),
            StringVariable('Cell Type'),
            StringVariable('Reference'),
            StringVariable('URL'),
        ],
    )

    entrez_id_column = StringVariable('Entrez ID')
    description_column = StringVariable('Function')

    def _organism_table(tax_id: str, organism: str):
        # Match the 'Name' column, then append the gene descriptions.
        matcher = GeneMatcher(tax_id)
        matched = Table(domain, genes_by_organism[organism])
        matched = matcher.match_table_column(matched, 'Name',
                                             entrez_id_column)
        description_domain = Domain([], metas=[description_column])
        descriptions = [
            [str(gene.description) if gene.description else '']
            for gene in matcher.genes
        ]
        return Table.concatenate(
            [matched, Table(description_domain, descriptions)])

    # construct data table for mouse, then for human
    mouse_table = _organism_table('10090', 'Mouse')
    human_table = _organism_table('9606', 'Human')

    # stack both organisms and write the result to disk
    combined = Table.concatenate([mouse_table, human_table], axis=0)
    combined.save(f'data/marker_genes/{file_name}')
def test_match_table_column(self):
    """Matching the 'gene' column must put ENTREZ_ID into the domain."""
    gm = GeneMatcher('4932')
    data = gm.match_table_column(Table('brown-selected.tab'), 'gene')
    # assertIn reports both the member and the container on failure,
    # whereas assertTrue(x in y) only says "False is not true".
    self.assertIn(ENTREZ_ID, data.domain)
from orangecontrib.bioinformatics.ncbi.gene import GeneMatcher, GENE_INFO_TAGS

# Input: organism identifier handed to GeneMatcher and the gene symbols
# that should be looked up.
organism = 9606
genes_symbols_to_match = ['HB1', 'BCKDHB', 'TWIST1']

# Create the matcher for the organism and give it the symbols.
gene_matcher = GeneMatcher(organism)
gene_matcher.genes = genes_symbols_to_match

# Run the matching process.
gene_matcher.run_matcher()

# Report what the matcher found for every input symbol.
for matched_gene in gene_matcher.genes:
    report_fields = (
        "\ninput name: " + matched_gene.input_name,
        "\nid from ncbi: ", matched_gene.ncbi_id,
        "\nmatch type: ", matched_gene.type_of_match,
    )
    print(*report_fields)

    # No id was assigned but candidates exist: list the candidate ids.
    if matched_gene.ncbi_id is None and matched_gene.possible_hits:
        candidate_ids = [hit.ncbi_id for hit in matched_gene.possible_hits]
        print('possible_hits: ', candidate_ids)
Gene()).homology_group_id homologs = [ gene.gene_id for gene in self._homologs_by_group.get(homology_group, []) if gene.tax_id == organism ] if len(homologs) == 1: return homologs[0] else: # Is possible that find more then one gene? return None if __name__ == "__main__": from orangecontrib.bioinformatics.ncbi.gene import GeneMatcher, load_gene_summary import Orange homology = HomoloGene() gm = GeneMatcher('4932') genes = Orange.data.Table("brown-selected") gm.genes = genes _homologs = [ homology.find_homolog(str(gene.gene_id), '9606') for gene in gm.genes ] _homologs = load_gene_summary('9606', _homologs) for gene, homolog in zip(gm.genes, _homologs): print(f'{gene} ----> {homolog}')