class OWGenes(OWWidget, ConcurrentWidgetMixin):
    """Widget that matches gene identifiers of the input table with a gene matcher.

    Gene identifiers are read either from the feature (column) names or from
    one string/discrete column of the input table.  The matcher is started
    through ``ConcurrentWidgetMixin.start`` and its results are shown in two
    table views (matched / unknown genes).  The widget outputs the annotated
    input table ("Data") and a table built by the gene matcher ("Genes").
    """

    name = "Genes"
    description = "Tool for working with genes"
    icon = "../widgets/icons/OWGeneInfo.svg"
    priority = 40
    want_main_area = True

    # index into ``self.organisms`` (not a taxonomy id)
    selected_organism: int = Setting(11)
    search_pattern: str = Setting('')
    exclude_unmatched = Setting(True)
    replace_id_with_symbol = Setting(True)
    auto_commit = Setting(True)

    settingsHandler = DomainContextHandler()
    selected_gene_col = ContextSetting(None)
    use_attr_names = ContextSetting(True)

    replaces = [
        'orangecontrib.bioinformatics.widgets.OWGeneNameMatcher.OWGeneNameMatcher'
    ]

    class Inputs:
        data_table = Input("Data", Table)

    class Outputs:
        data_table = Output("Data", Table)
        gene_matcher_results = Output("Genes", Table)

    class Information(OWWidget.Information):
        pass

    def sizeHint(self):
        """Return the preferred initial size of the widget window."""
        return QSize(1280, 960)

    def __init__(self):
        """Initialise widget state and build the control and main areas."""
        OWWidget.__init__(self)
        ConcurrentWidgetMixin.__init__(self)

        # ATTRIBUTES #
        self.target_database = ENTREZ_ID

        # input data
        self.input_data = None
        self.input_genes = None
        self.tax_id = None
        self.column_candidates = []

        # input options
        self.organisms = []

        # gene matcher
        self.gene_matcher = None

        # progress bar
        self.progress_bar = None

        # single-shot timer used to debounce typing in the filter line edit
        self._timer = QTimer()
        self._timer.timeout.connect(self._apply_filter)
        self._timer.setSingleShot(True)

        # GUI SECTION #

        # Control area
        self.info_box = widgetLabel(
            widgetBox(self.controlArea, "Info", addSpace=True),
            'No data on input.\n')

        organism_box = vBox(self.controlArea, 'Organism')
        self.organism_select_combobox = comboBox(
            organism_box,
            self,
            'selected_organism',
            callback=self.on_input_option_change)

        self.get_available_organisms()
        self.organism_select_combobox.setCurrentIndex(self.selected_organism)

        box = widgetBox(self.controlArea, 'Gene IDs in the input data')
        self.gene_columns_model = itemmodels.DomainModel(
            valid_types=(StringVariable, DiscreteVariable))
        self.gene_column_combobox = comboBox(
            box,
            self,
            'selected_gene_col',
            label='Stored in data column',
            model=self.gene_columns_model,
            sendSelectedValue=True,
            callback=self.on_input_option_change,
        )

        self.attr_names_checkbox = checkBox(
            box,
            self,
            'use_attr_names',
            'Stored as feature (column) names',
            disables=[(-1, self.gene_column_combobox)],
            callback=self.on_input_option_change,
        )

        self.gene_column_combobox.setDisabled(bool(self.use_attr_names))

        output_box = vBox(self.controlArea, 'Output')

        # separator(output_box)
        # output_box.layout().addWidget(horizontal_line())
        # separator(output_box)
        self.exclude_radio = checkBox(output_box,
                                      self,
                                      'exclude_unmatched',
                                      'Exclude unmatched genes',
                                      callback=self.commit)

        self.replace_radio = checkBox(output_box,
                                      self,
                                      'replace_id_with_symbol',
                                      'Replace feature IDs with gene names',
                                      callback=self.commit)

        auto_commit(self.controlArea,
                    self,
                    "auto_commit",
                    "&Commit",
                    box=False)

        rubber(self.controlArea)

        # Main area
        self.filter = lineEdit(self.mainArea,
                               self,
                               'search_pattern',
                               'Filter:',
                               callbackOnType=True,
                               callback=self.handle_filter_callback)
        # rubber(self.radio_group)
        self.mainArea.layout().addWidget(self.filter)

        # set splitter
        self.splitter = QSplitter()
        self.splitter.setOrientation(Qt.Vertical)

        # NOTE: the model is attached to ``table_view`` only in ``on_done``;
        # until then the view has no model (and no selection model).
        self.table_model = GeneInfoModel()
        self.table_view = QTableView()
        self.table_view.setAlternatingRowColors(True)
        self.table_view.viewport().setMouseTracking(True)
        self.table_view.setSortingEnabled(True)
        self.table_view.setShowGrid(False)
        self.table_view.verticalHeader().hide()
        # self.table_view.horizontalHeader().setSectionResizeMode(QHeaderView.Stretch)

        self.unknown_model = UnknownGeneInfoModel()

        self.unknown_view = QTableView()
        self.unknown_view.setModel(self.unknown_model)
        self.unknown_view.verticalHeader().hide()
        self.unknown_view.setShowGrid(False)
        self.unknown_view.setSelectionMode(QAbstractItemView.NoSelection)
        self.unknown_view.horizontalHeader().setSectionResizeMode(
            QHeaderView.Stretch)

        self.splitter.addWidget(self.table_view)
        self.splitter.addWidget(self.unknown_view)

        self.splitter.setStretchFactor(0, 90)
        self.splitter.setStretchFactor(1, 10)

        self.mainArea.layout().addWidget(self.splitter)

    def handle_filter_callback(self):
        """Restart the debounce timer; the filter is applied 500 ms after the last keystroke."""
        self._timer.stop()
        self._timer.start(500)

    def _apply_filter(self):
        """Apply the current search pattern to the gene table and commit."""
        # filter only if input data is present and model is populated
        if self.table_model.table is not None:
            self.table_model.update_model(
                filter_pattern=str(self.search_pattern))
            self.commit()

    def __reset_widget_state(self):
        """Detach and clear both gene models and refresh the info box."""
        self.table_view.clearSpans()
        self.table_view.setModel(None)
        self.table_model.clear()
        self.unknown_model.clear()
        self._update_info_box()

    def _update_info_box(self):
        """Show gene counts from the matcher, or a placeholder when there is no input."""
        if self.input_genes and self.gene_matcher:
            num_genes = len(self.gene_matcher.genes)
            known_genes = len(self.gene_matcher.get_known_genes())

            template = ('{} genes in input data\n'
                        '{} genes match Entrez database\n'
                        '{} genes with match conflicts\n')
            info_text = template.format(num_genes, known_genes,
                                        num_genes - known_genes)
        else:
            info_text = 'No data on input.'

        self.info_box.setText(info_text)

    def on_done(self, _):
        """Populate the views once the gene matcher task has finished.

        The task result is ignored; the matched genes are read from
        ``self.gene_matcher``.
        """
        # update info box
        self._update_info_box()

        # set output options
        self.toggle_radio_options()

        # set known genes
        self.table_model.initialize(self.gene_matcher.genes)
        self.table_view.setModel(self.table_model)
        self.table_view.selectionModel().selectionChanged.connect(self.commit)
        self.table_view.setSelectionBehavior(QAbstractItemView.SelectRows)

        self.table_view.setItemDelegateForColumn(
            self.table_model.entrez_column_index,
            LinkStyledItemDelegate(self.table_view))
        v_header = self.table_view.verticalHeader()
        option = self.table_view.viewOptions()
        size = self.table_view.style().sizeFromContents(
            QStyle.CT_ItemViewItem, option, QSize(20, 20), self.table_view)

        v_header.setDefaultSectionSize(size.height() + 2)
        v_header.setMinimumSectionSize(5)
        self.table_view.horizontalHeader().setStretchLastSection(True)

        # set unknown genes
        self.unknown_model.initialize(self.gene_matcher.genes)
        self.unknown_view.verticalHeader().setStretchLastSection(True)

        self._apply_filter()

    def get_available_organisms(self):
        """Fill ``self.organisms`` and the organism combo box, sorted by organism name."""
        available_organism = sorted(
            ((tax_id, taxonomy.name(tax_id))
             for tax_id in taxonomy.common_taxids()),
            key=lambda x: x[1])

        self.organisms = [tax_id for tax_id, _ in available_organism]
        self.organism_select_combobox.addItems(
            [organism_name for _, organism_name in available_organism])

    def gene_names_from_table(self):
        """
        Extract gene names from `Orange.data.Table` into ``self.input_genes``.

        Names are taken from the attribute names when ``use_attr_names`` is
        set, otherwise from the non-missing values of the selected column.
        ``self.input_genes`` is left empty when there is no input data or no
        usable gene column.
        """
        self.input_genes = []
        if not self.input_data:
            return

        if self.use_attr_names:
            self.input_genes = [
                str(attr.name).strip()
                for attr in self.input_data.domain.attributes
            ]
            return

        if self.selected_gene_col is None:
            self.selected_gene_col = self.gene_column_identifier()

        # No string/discrete column is available: nothing to extract.
        if self.selected_gene_col is None:
            return

        self.input_genes = [
            str(e[self.selected_gene_col]) for e in self.input_data
            if not np.isnan(e[self.selected_gene_col])
        ]

    def _update_gene_matcher(self):
        """Create a fresh gene matcher for the selected organism and the current input genes."""
        self.gene_names_from_table()
        self.gene_matcher = GeneMatcher(self.get_selected_organism(),
                                        auto_start=False)
        self.gene_matcher.genes = self.input_genes
        # self.gene_matcher.organism = self.get_selected_organism()

    def get_selected_organism(self):
        """Return the entry of ``self.organisms`` selected in the combo box."""
        return self.organisms[self.selected_organism]

    def _run(self):
        """Start the gene matcher task if a matcher is available."""
        if self.gene_matcher is not None:
            self.start(run_gene_matcher, self.gene_matcher)

    def on_input_option_change(self):
        """Reset the views and re-run gene matching after an input option changed."""
        self.__reset_widget_state()
        self._update_gene_matcher()
        self._run()

    def gene_column_identifier(self):
        """
        Get most suitable column that stores genes. If there are
        several suitable columns, select the one with most unique
        values. Take the best one.

        Returns ``None`` when the input has no string or discrete column.
        """
        # candidates -> (variable, num of unique values)
        candidates = sorted(
            ((col, np.unique(self.input_data.get_column_view(col)[0]).size)
             for col in self.gene_columns_model
             if isinstance(col, (DiscreteVariable, StringVariable))),
            key=lambda x: x[1])

        if not candidates:
            return None

        # last element of the stable ascending sort == most unique values
        best_candidate, _ = candidates[-1]
        return best_candidate

    def find_genes_location(self):
        """ Try locate the genes in the input data when we first load the data.

        Proposed rules:
        - when no suitable feature names are present, check the columns.
        - find the most suitable column, that is, the one with most unique values.

        """
        domain = self.input_data.domain
        if not domain.attributes:
            if self.selected_gene_col is None:
                self.selected_gene_col = self.gene_column_identifier()
                self.use_attr_names = False

    @Inputs.data_table
    def handle_input(self, data):
        """Handle a new (or removed) input table.

        Resets the widget, restores context settings for the new domain and
        starts gene matching.  When the input is removed or empty, both
        outputs are cleared.
        """
        self.closeContext()
        self.input_data = None
        self.input_genes = None
        self.__reset_widget_state()
        self.gene_columns_model.set_domain(None)
        self.selected_gene_col = None

        if data:
            self.input_data = data
            self.gene_columns_model.set_domain(self.input_data.domain)

            # check if input table has tax_id, human is used if tax_id is not found
            self.tax_id = str(self.input_data.attributes.get(TAX_ID, '9606'))
            # check for gene location. Default is that genes are attributes in the input table.
            self.use_attr_names = self.input_data.attributes.get(
                GENE_AS_ATTRIBUTE_NAME, self.use_attr_names)

            # NOTE(review): `not self.selected_organism` is only true for
            # index 0, so a stored non-zero selection is never overridden by
            # the table's tax id -- confirm this is intended.
            if self.tax_id in self.organisms and not self.selected_organism:
                self.selected_organism = self.organisms.index(self.tax_id)

            self.openContext(self.input_data.domain)
            self.find_genes_location()
            self.on_input_option_change()
        else:
            # Nothing to match: do not leave stale tables on the outputs.
            self.Outputs.data_table.send(None)
            self.Outputs.gene_matcher_results.send(None)

    def commit(self):
        """Build and send the "Data" and "Genes" outputs.

        Only the genes selected in the table view are used; when nothing is
        selected, the genes that pass the current filter are used instead.
        Both outputs are ``None`` when there is no input, no matcher, no
        model attached to the view, or no gene was matched.
        """
        selection_model = self.table_view.selectionModel()
        # The view has no selection model while no model is attached (before
        # the first match and after a reset), and this method is also wired
        # to the output check boxes, which can fire without any input.
        if (self.input_data is None or self.gene_matcher is None
                or selection_model is None):
            self.Outputs.data_table.send(None)
            self.Outputs.gene_matcher_results.send(None)
            return

        selection = selection_model.selectedRows(
            self.table_model.entrez_column_index)
        selected_genes = [row.data() for row in selection]
        if not selected_genes:
            selected_genes = self.table_model.get_filtered_genes()

        gene_ids = self.get_target_ids()
        known_genes = [gid for gid in gene_ids if gid != '?']

        table = None
        gm_table = None
        if known_genes:
            if not self.use_attr_names:
                table = self._commit_genes_in_rows(gene_ids, known_genes,
                                                   selected_genes)
            else:
                table = self._commit_genes_in_columns(known_genes,
                                                      selected_genes)

            gm_table = self.gene_matcher.to_data_table(
                selected_genes=selected_genes if selected_genes else None)

        self.Outputs.data_table.send(table)
        self.Outputs.gene_matcher_results.send(gm_table)

    def _commit_genes_in_rows(self, gene_ids, known_genes, selected_genes):
        """Return the output table when genes are stored in a data column (one gene per row)."""
        input_domain = self.input_data.domain
        if self.target_database in input_domain:
            gene_var = input_domain[self.target_database]
            metas = input_domain.metas
        else:
            gene_var = StringVariable(self.target_database)
            metas = input_domain.metas + (gene_var, )

        domain = Domain(input_domain.attributes, input_domain.class_vars,
                        metas)

        table = self.input_data.transform(domain)
        # NOTE(review): assumes one matcher gene per table row; rows with a
        # missing gene value are skipped in gene_names_from_table -- verify.
        col, _ = table.get_column_view(gene_var)
        col[:] = gene_ids

        # filter selected rows
        selected_genes_set = set(selected_genes)
        selected_rows = [
            row_index for row_index, row in enumerate(table)
            if str(row[gene_var]) in selected_genes_set
        ]

        # handle table attributes
        table.attributes[TAX_ID] = self.get_selected_organism()
        table.attributes[GENE_AS_ATTRIBUTE_NAME] = False
        table.attributes[GENE_ID_COLUMN] = self.target_database
        table = table[selected_rows] if selected_rows else table

        if self.exclude_unmatched:
            # create filter from selected column for genes
            only_known = table_filter.FilterStringList(gene_var, known_genes)
            # apply filter to the data
            table = table_filter.Values([only_known])(table)

        return table

    def _commit_genes_in_columns(self, known_genes, selected_genes):
        """Return the output table when genes are features (one gene per column)."""
        domain = self.input_data.domain.copy()
        table = self.input_data.transform(domain)

        for gene in self.gene_matcher.genes:
            if gene.input_identifier in table.domain:
                variable = table.domain[gene.input_identifier]
                variable.attributes[self.target_database] = (
                    str(gene.gene_id) if gene.gene_id else '?')

                if self.replace_id_with_symbol:
                    try:
                        variable.name = str(gene.symbol)
                    except AttributeError:
                        # TODO: missing gene symbol, need to handle this?
                        pass

        # filter selected columns
        selected_genes_set = set(selected_genes)
        selected = [
            column for column in table.domain.attributes
            if self.target_database in column.attributes
            and str(column.attributes[self.target_database]) in
            selected_genes_set
        ]

        output_attrs = table.domain.attributes

        if selected:
            output_attrs = selected

        if self.exclude_unmatched:
            known_genes_set = set(known_genes)
            # `.get`: a column that was never annotated above has no id
            # attribute and is treated as unmatched instead of raising.
            output_attrs = [
                col for col in output_attrs
                if col.attributes.get(self.target_database) in known_genes_set
            ]

        domain = Domain(output_attrs, table.domain.class_vars,
                        table.domain.metas)

        table = table.from_table(domain, table)

        # handle table attributes
        table.attributes[TAX_ID] = self.get_selected_organism()
        table.attributes[GENE_AS_ATTRIBUTE_NAME] = True
        table.attributes[GENE_ID_ATTRIBUTE] = self.target_database

        return table

    def toggle_radio_options(self):
        """Enable/disable the output check boxes according to the matcher results."""
        self.replace_radio.setEnabled(bool(self.use_attr_names))

        if self.gene_matcher.genes:
            # enable checkbox if unknown genes are detected
            has_unknown = len(self.gene_matcher.genes) != len(
                self.gene_matcher.get_known_genes())
            self.exclude_radio.setEnabled(has_unknown)
            self.exclude_unmatched = has_unknown

    def get_target_ids(self):
        """Return one id string per matcher gene; ``'?'`` marks a gene without an id."""
        return [
            str(gene.gene_id) if gene.gene_id else '?'
            for gene in self.gene_matcher.genes
        ]
class OWGeneNameMatcher(OWWidget):
    """Widget that matches gene names of the input table and annotates it with gene ids.

    Gene names are read from the attribute names or from a string column.
    Matching runs in a worker on a ``QThreadPool``; when it finishes the
    results are shown in the main area and the annotated (and optionally
    filtered) table is sent to the "Data" output.
    """

    name = "Gene Name Matcher"
    description = "Tool for working with genes"
    icon = "../widgets/icons/OWGeneInfo.svg"
    priority = 5
    want_main_area = True

    use_attr_names = Setting(True)
    # index into ``self.organisms`` (not a taxonomy id)
    selected_organism = Setting(11)

    selected_filter = Setting(0)
    gene_as_attr_name = Setting(0)
    filter_unknown = Setting(True)
    include_entrez_id = Setting(True)
    # include_ensembl_id = Setting(True)
    auto_commit = Setting(True)

    class Inputs:
        data_table = Input("Data", Table)

    class Outputs:
        custom_data_table = Output("Data", Table)

    class Information(OWWidget.Information):
        pass

    def sizeHint(self):
        """Return the preferred initial size of the widget window."""
        return QSize(1280, 960)

    def __init__(self):
        """Initialise widget state and build the control and main areas."""
        super().__init__()

        # ATTRIBUTES #

        # input data
        self.input_data = None
        self.input_genes = None
        self.tax_id = None
        self.column_candidates = []
        self.selected_gene_col = None

        # input options
        self.organisms = []

        # gene matcher
        self.gene_matcher = None

        # threads
        self.threadpool = QThreadPool(self)
        self.workers = None

        # progress bar
        self.progress_bar = None

        # filter
        self.filter_labels = ['Unique', 'Partial', 'Unknown']

        # GUI SECTION #

        # Control area
        self.info_box = widgetLabel(
            widgetBox(self.controlArea, "Info", addSpace=True),
            "Initializing\n")

        organism_box = vBox(self.controlArea, 'Organism')
        self.organism_select_combobox = comboBox(
            organism_box,
            self,
            'selected_organism',
            callback=self.on_input_option_change)

        self.get_available_organisms()
        self.organism_select_combobox.setCurrentIndex(self.selected_organism)

        box = widgetBox(self.controlArea, 'Gene names')
        self.gene_columns_model = itemmodels.DomainModel(
            valid_types=(StringVariable, ))
        self.gene_column_combobox = comboBox(
            box,
            self,
            'selected_gene_col',
            model=self.gene_columns_model,
            sendSelectedValue=True,
            callback=self.on_input_option_change)

        self.attr_names_checkbox = checkBox(
            box,
            self,
            'use_attr_names',
            'Use attribute names',
            disables=[(-1, self.gene_column_combobox)],
            callback=self.on_input_option_change)

        self.gene_column_combobox.setDisabled(bool(self.use_attr_names))

        output_box = vBox(self.controlArea, 'Output settings')

        # TODO: will widget support transposing tables?
        # radioButtonsInBox(output_box, self, "gene_as_attr_name", ["Genes in rows", "Genes in columns"],
        #                   callback=self.on_output_option_change)
        # separator(output_box)

        checkBox(output_box,
                 self,
                 'filter_unknown',
                 'Filter unknown genes',
                 callback=self.on_output_option_change)
        separator(output_box)
        output_box.layout().addWidget(horizontal_line())
        checkBox(output_box,
                 self,
                 'include_entrez_id',
                 'Include Entrez ID',
                 callback=self.on_output_option_change)
        # TODO: provide support for ensembl ids as output option
        # checkBox(output_box, self, 'include_ensembl_id', 'Include Ensembl ID', callback=self.on_output_option_change)

        auto_commit(self.controlArea, self, "auto_commit", label="Commit")

        rubber(self.controlArea)

        # Main area
        filter_box = hBox(self.mainArea, 'Filter results')
        self.radio_group = radioButtons(filter_box,
                                        self,
                                        value='selected_filter',
                                        btnLabels=self.filter_labels,
                                        orientation=Qt.Horizontal,
                                        callback=self.on_filter_changed)
        rubber(self.radio_group)
        self.mainArea.layout().addWidget(filter_box)

        self.proxy_model = FilterProxyModel(self)
        self.extended_view = ExtendedTableView(parent=self)
        self.extended_view.genes_view.setModel(self.proxy_model)
        self.extended_view.genes_selection_model().selectionChanged.connect(
            self.__selection_changed)

        self.mainArea.layout().addWidget(self.extended_view, 1)

    def __reset_widget_state(self):
        """Clear the output and detach/reset the models shown in the main area."""
        self.Outputs.custom_data_table.send(None)
        self.proxy_model.setSourceModel(None)
        self.extended_view.reset_genes_model()
        self.extended_view.reset_info_model()

    def __selection_changed(self):
        """Show details for the genes currently selected in the genes view."""
        genes = [
            model_index.data()
            for model_index in self.extended_view.get_selected_gens()
        ]
        self.extended_view.set_info_model(genes)

    def _update_info_box(self):
        """Show gene counts from the matcher, or a placeholder when there are no input genes."""
        if self.input_genes and self.gene_matcher:
            num_genes = len(self.gene_matcher.genes)
            known_genes = len(self.gene_matcher.get_known_genes())

            info_text = 'Genes on input: {}\n' \
                        'Known genes : {} ({:.2f} %)\n'.format(num_genes, known_genes, known_genes * 100 / num_genes)

        else:
            info_text = 'No genes on input'

        self.info_box.setText(info_text)

    def _progress_advance(self):
        """Advance the progress bar (slot for the worker's progress signal)."""
        # GUI should be updated in main thread. That's why we are calling advance method here
        if self.progress_bar:
            self.progress_bar.advance()

    def _handle_matcher_results(self):
        """Display the matcher results and commit (slot for the worker's finished signal)."""
        assert threading.current_thread() == threading.main_thread()

        if self.progress_bar:
            self.progress_bar.finish()
            self.setStatusMessage('')

        # if no known genes, clean up and return
        if not len(self.gene_matcher.get_known_genes()):
            self._update_info_box()
            self.__reset_widget_state()
            return

        self._update_info_box()
        self.extended_view.set_genes_model(self.gene_matcher.genes)
        self.proxy_model.setSourceModel(self.extended_view.genes_model)
        self.extended_view.genes_view.resizeRowsToContents()
        self.commit()

    def get_available_organisms(self):
        """Fill ``self.organisms`` and the organism combo box, sorted by organism name."""
        available_organism = sorted(
            [(tax_id, taxonomy.name(tax_id))
             for tax_id in taxonomy.common_taxids()],
            key=lambda x: x[1])

        self.organisms = [tax_id for tax_id, _ in available_organism]
        self.organism_select_combobox.addItems(
            [organism_name for _, organism_name in available_organism])

    def gene_names_from_table(self):
        """
        Extract gene names from `Orange.data.Table` into ``self.input_genes``.

        Names are taken from the attribute names when ``use_attr_names`` is
        set, otherwise from the non-missing values of the selected column
        (if it is part of the input domain).
        """
        self.input_genes = []
        if not self.input_data:
            return

        if self.use_attr_names:
            self.input_genes = [
                str(attr.name).strip()
                for attr in self.input_data.domain.attributes
            ]
        elif self.selected_gene_col:
            if self.selected_gene_col in self.input_data.domain:
                self.input_genes = [
                    str(e[self.selected_gene_col]) for e in self.input_data
                    if not np.isnan(e[self.selected_gene_col])
                ]

    def _update_gene_matcher(self):
        """Refresh the input genes and push them (and the organism) into the gene matcher."""
        self.gene_names_from_table()

        if not self.input_genes:
            self._update_info_box()

        # the matcher is created once and reused for later inputs
        if not self.gene_matcher:
            self.gene_matcher = GeneMatcher(self.get_selected_organism(),
                                            case_insensitive=True)

        self.gene_matcher.genes = self.input_genes
        self.gene_matcher.organism = self.get_selected_organism()

    def get_selected_organism(self):
        """Return the entry of ``self.organisms`` selected in the combo box."""
        return self.organisms[self.selected_organism]

    def match_genes(self):
        """Run the gene matcher in a worker on the widget's thread pool."""
        if self.gene_matcher:
            # init progress bar
            self.progress_bar = ProgressBar(
                self, iterations=len(self.gene_matcher.genes))
            # status message
            self.setStatusMessage('Gene matcher running')

            worker = Worker(self.gene_matcher.run_matcher,
                            progress_callback=True)
            worker.signals.progress.connect(self._progress_advance)
            worker.signals.finished.connect(self._handle_matcher_results)

            # move download process to worker thread
            self.threadpool.start(worker)

    def on_input_option_change(self):
        """Reset the views and re-run gene matching after an input option changed."""
        self.__reset_widget_state()
        self._update_gene_matcher()
        self.match_genes()

    @Inputs.data_table
    def handle_input(self, data):
        """Handle a new (or removed) input table and start gene matching."""
        self.__reset_widget_state()
        self.gene_columns_model.set_domain(None)

        # Forget the previous input first; otherwise a removed input would
        # leave the old table behind and a later commit would re-emit it.
        self.input_data = None
        self.input_genes = None

        if data:
            self.input_data = data
            self.gene_columns_model.set_domain(self.input_data.domain)

            if self.gene_columns_model:
                self.selected_gene_col = self.gene_columns_model[0]

            self.tax_id = str(self.input_data.attributes.get(TAX_ID, ''))
            self.use_attr_names = self.input_data.attributes.get(
                GENE_AS_ATTRIBUTE_NAME, self.use_attr_names)

            if self.tax_id in self.organisms:
                self.selected_organism = self.organisms.index(self.tax_id)

            self.on_input_option_change()
        else:
            self._update_info_box()

    @staticmethod
    def get_gene_id_identifier(gene_id_strings):
        # type: (Set[str]) -> str
        """Return a column name for gene ids that is numbered after the existing id columns.

        ``NCBI_ID`` is returned when ``gene_id_strings`` is empty, otherwise
        ``NCBI_ID`` followed by `` (k + 1)`` where ``k`` is the number of
        distinct names matching ``Entrez ID (...)``.
        """
        if not len(gene_id_strings):
            return NCBI_ID

        regex = re.compile(r'Entrez ID \(.*?\)')
        filtered = filter(regex.search, gene_id_strings)
        return NCBI_ID + ' ({})'.format(len(set(filtered)) + 1)

    def __handle_ids(self, data_table):
        """
        Annotate ``data_table`` with gene ids and return ``(data_table, gene_id)``.

        If 'use_attr_names' is True, genes from the input data are in columns
        and the id is stored in the attributes of each matched variable.
        Otherwise the ids are appended as a new string meta column, unless an
        identical ``NCBI_ID`` column is already present.
        """
        if self.use_attr_names:
            # set_of_attributes = set([key for attr in data_table.domain[:] for key in attr.attributes.keys()
            #                          if key.startswith(NCBI_ID)])
            # gene_id = self.get_gene_id_identifier(set_of_attributes)
            gene_id = NCBI_ID

            for gene in self.gene_matcher.genes:
                if gene.ncbi_id:
                    data_table.domain[
                        gene.input_name].attributes[gene_id] = str(
                            gene.ncbi_id)
        else:
            set_of_variables = {
                var.name
                for var in data_table.domain.variables +
                data_table.domain.metas if var.name.startswith(NCBI_ID)
            }

            gene_id = self.get_gene_id_identifier(set_of_variables)

            temp_domain = Domain([], metas=[StringVariable(gene_id)])
            temp_data = [[str(gene.ncbi_id) if gene.ncbi_id else '?']
                         for gene in self.gene_matcher.genes]
            temp_table = Table(temp_domain, temp_data)

            # if columns differ, then concatenate.
            if NCBI_ID in data_table.domain:
                if gene_id != NCBI_ID and not np.array_equal(
                        np.array(temp_data).ravel(),
                        data_table.get_column_view(NCBI_ID)[0]):
                    data_table = Table.concatenate([data_table, temp_table])
                else:
                    gene_id = NCBI_ID
            else:
                data_table = Table.concatenate([data_table, temp_table])

        return data_table, gene_id

    def __apply_filters(self, data_table):
        """Apply the output options to ``data_table`` and return ``(data_table, gene_id)``.

        ``gene_id`` is the name under which gene ids are stored, or ``None``
        when the table carries no ids and ids are not being added.
        """
        has_id_attribute = any(NCBI_ID in attr.attributes
                               for attr in data_table.domain[:])
        gene_id = NCBI_ID if NCBI_ID in data_table.domain or has_id_attribute else None

        if self.include_entrez_id:
            data_table, gene_id = self.__handle_ids(data_table)

        if self.filter_unknown:
            known_input_genes = [
                gene.input_name
                for gene in self.gene_matcher.get_known_genes()
            ]

            if self.use_attr_names:
                # set lookup: avoids a linear scan of the list per attribute
                known_names = set(known_input_genes)
                temp_domain = Domain(
                    [
                        attr for attr in data_table.domain.attributes
                        if attr.name in known_names
                    ],
                    metas=data_table.domain.metas,
                    class_vars=data_table.domain.class_vars)
                data_table = data_table.transform(temp_domain)
            else:
                # create filter from selected column for genes
                only_known = table_filter.FilterStringList(
                    self.selected_gene_col, known_input_genes)
                # apply filter to the data
                data_table = table_filter.Values([only_known])(data_table)

        return data_table, gene_id

    def commit(self):
        """Send the annotated (and optionally filtered) copy of the input table.

        The output is cleared first; nothing else is sent when there is no
        input data, or when genes are read from a column and the input has no
        candidate column.
        """
        self.Outputs.custom_data_table.send(None)

        if not self.input_data:
            return

        if not self.use_attr_names and not self.gene_columns_model:
            return

        output_data_table = self.input_data.transform(
            self.input_data.domain.copy())
        output_data_table, gene_id = self.__apply_filters(
            output_data_table.copy())

        # handle table attributes
        output_data_table.attributes[TAX_ID] = self.get_selected_organism()
        output_data_table.attributes[GENE_AS_ATTRIBUTE_NAME] = bool(
            self.use_attr_names)

        if not bool(self.use_attr_names):
            output_data_table.attributes[GENE_ID_COLUMN] = gene_id
        else:
            output_data_table.attributes[GENE_ID_ATTRIBUTE] = gene_id

        self.Outputs.custom_data_table.send(output_data_table)

    def on_output_option_change(self):
        """Re-send the output after an output option changed."""
        self.commit()

    def on_filter_changed(self):
        """Re-apply the result filter selected with the radio buttons."""
        self.proxy_model.invalidateFilter()
        self.extended_view.genes_view.resizeRowsToContents()