def dataset_download(gds_id, samples=None, transpose=False, callback=None):
    """Load the ``{gds_id}.tab`` file as a table and attach annotations.

    ``local_files.update`` is called for the file, which is then read from the
    path returned by ``local_files.localpath_download``.

    :param gds_id: identifier used to build the file name ``'{gds_id}.tab'``.
    :param samples: optional mapping ``{column: accepted string values}``.
        The table is filtered with one ``FilterStringList`` per entry and a
        discrete ``class`` variable is appended whose value for each row is
        the ``'|'``-joined content of those columns. ``None`` or an empty
        mapping skips this step.
    :param transpose: when true the table is transposed (``sample_id`` is used
        as the feature-names column) and the ``gene_as_attr_name`` annotation
        is inverted.
    :param callback: optional callable; forwarded to ``local_files.update``
        and also invoked without arguments after loading and before
        returning.
    :return: the table, with ``table.attributes`` replaced by the annotations.
    """
    file_name = '{}.tab'.format(gds_id)
    local_files.update(file_name, extract=True, callback=callback)
    table = Table(local_files.localpath_download(file_name))
    title = table.name

    gds_info = local_files.info(file_name)
    table_annotations = {TableAnnotation.tax_id: gds_info['taxid']}

    if callback:
        callback()

    # An empty mapping is treated like None: there are no columns to build a
    # class from, and the class column could not be filled from an empty list.
    if samples:
        filters = [
            table_filter.FilterStringList(sample, sample_types)
            for sample, sample_types in samples.items()
        ]
        table = table_filter.Values(filters)(table)

        column_values = [
            table.get_column_view(table.domain[meta_var])[0]
            for meta_var in samples
        ]
        class_values = ['|'.join(parts) for parts in zip(*column_values)]

        # Sorted so that the order of the class values (and therefore the
        # numeric codes written below) is identical on every run; plain set
        # iteration order of strings varies with hash randomisation.
        unique_values = sorted(set(class_values))
        value_index = {
            value: index for index, value in enumerate(unique_values)
        }

        class_var = DiscreteVariable(name='class', values=unique_values)
        _domain = Domain(
            table.domain.attributes,
            table.domain.class_vars + (class_var,),
            table.domain.metas,
        )
        table = table.transform(_domain)

        col, _ = table.get_column_view(class_var)
        col[:] = [value_index[value] for value in class_values]

    gene_as_attr_name = gds_info[TableAnnotation.gene_as_attr_name]
    gene_id_source = gds_info[TableAnnotation.gene_id_attribute]

    if transpose:
        table = Table.transpose(
            table, feature_names_column='sample_id', meta_attr_name='genes'
        )
        table.name = title  # table name is lost after transpose
        table_annotations[TableAnnotation.gene_as_attr_name] = (
            not gene_as_attr_name
        )
        table_annotations[TableAnnotation.gene_id_column] = gene_id_source
    else:
        table_annotations[TableAnnotation.gene_as_attr_name] = gene_as_attr_name
        table_annotations[TableAnnotation.gene_id_attribute] = gene_id_source

    if callback:
        callback()

    table.attributes = table_annotations
    return table
def dataset_download(gds_id, samples=None, transpose=False, callback=None):
    """Load the ``{gds_id}.tab`` file as a table and attach annotations.

    The file path is obtained from ``local_files.localpath_download`` (called
    with ``extract=True``) and read into a ``Table``.

    :param gds_id: identifier used to build the file name ``'{gds_id}.tab'``.
    :param samples: optional mapping ``{column: accepted string values}``.
        The table is filtered with one ``FilterStringList`` per entry and a
        discrete ``class`` variable is appended whose value for each row is
        the ``'|'``-joined content of those columns. ``None`` or an empty
        mapping skips this step.
    :param transpose: when true the table is transposed (``sample_id`` is used
        as the feature-names column), the ``'Entrez ID'`` column is rebuilt as
        a string meta and the ``gene_as_attr_name`` annotation is inverted.
    :param callback: optional callable; forwarded to
        ``local_files.localpath_download`` and also invoked without arguments
        after loading and before returning.
    :return: the table, with ``table.attributes`` replaced by the annotations.
    """
    file_name = '{}.tab'.format(gds_id)
    file_path = local_files.localpath_download(
        file_name, extract=True, callback=callback
    )
    table = Table(file_path)
    title = table.name

    gds_info = local_files.info(file_name)
    table_annotations = {TableAnnotation.tax_id: gds_info['taxid']}

    if callback:
        callback()

    # An empty mapping is treated like None: there are no columns to build a
    # class from, and the class column could not be filled from an empty list.
    if samples:
        filters = [
            table_filter.FilterStringList(sample, sample_types)
            for sample, sample_types in samples.items()
        ]
        table = table_filter.Values(filters)(table)

        column_values = [
            table.get_column_view(table.domain[meta_var])[0]
            for meta_var in samples
        ]
        class_values = ['|'.join(parts) for parts in zip(*column_values)]

        # Sorted so that the order of the class values (and therefore the
        # numeric codes written below) is identical on every run; plain set
        # iteration order of strings varies with hash randomisation.
        unique_values = sorted(set(class_values))
        value_index = {
            value: index for index, value in enumerate(unique_values)
        }

        class_var = DiscreteVariable(name='class', values=unique_values)
        _domain = Domain(
            table.domain.attributes,
            table.domain.class_vars + (class_var,),
            table.domain.metas,
        )
        table = table.transform(_domain)

        col, _ = table.get_column_view(class_var)
        col[:] = [value_index[value] for value in class_values]

    gene_as_attr_name = gds_info[TableAnnotation.gene_as_attr_name]
    gene_id_source = gds_info[TableAnnotation.gene_id_attribute]

    if transpose:
        table = Table.transpose(
            table, feature_names_column='sample_id', meta_attr_name='genes'
        )

        # When transposing a table, variable.attributes get picked up as
        # numerical values instead of strings, so the 'Entrez ID' column is
        # converted from a continuous variable to a StringVariable. Missing
        # values (NaN) become '?'.
        entrez_column = table.get_column_view('Entrez ID')[0].astype(np.float64)
        _genes = [
            [str(int(gene)) if not np.isnan(gene) else '?']
            for gene in entrez_column
        ]

        new_var = StringVariable('Entrez ID')
        metas = [
            var for var in table.domain.metas if var.name != 'Entrez ID'
        ] + [new_var]
        new_domain = Domain(
            table.domain.attributes, table.domain.class_vars, metas
        )
        table = table.transform(new_domain)
        table[:, new_var] = _genes

        # table name is lost after transpose
        table.name = title

        table_annotations[TableAnnotation.gene_as_attr_name] = (
            not gene_as_attr_name
        )
        table_annotations[TableAnnotation.gene_id_column] = gene_id_source
    else:
        table_annotations[TableAnnotation.gene_as_attr_name] = gene_as_attr_name
        table_annotations[TableAnnotation.gene_id_attribute] = gene_id_source

    if callback:
        callback()

    table.attributes = table_annotations
    return table
def commit(self):
    """Send the input data restricted to the genes of the selected rows.

    The selected rows of ``self.data_view`` are stored (as source-model row
    numbers) in ``self.selected_rows``. When there is a selection and input
    genes are present, the gene sets stored under ``Qt.UserRole`` in the
    selected indexes are merged, the info box is refreshed and the matching
    part of ``self.input_data`` is sent on ``Outputs.matched_genes``:

    * ``use_attr_names`` set: only the attribute columns whose
      ``gene_id_attribute`` annotation is among the merged genes are kept;
    * otherwise: rows are filtered by ``gene_id_column``.

    Nothing is sent when there is no selection model, no selection or no
    input genes.
    """
    selection_model = self.data_view.selectionModel()
    if not selection_model:
        return

    selection = selection_model.selectedRows(self.COUNT)
    self.selected_rows = [
        self.filter_proxy_model.mapToSource(sel).row() for sel in selection
    ]

    if not (selection and self.input_genes):
        return

    genes = [model_index.data(Qt.UserRole) for model_index in selection]
    # Keep the set for the membership tests below (constant time per column)
    # and a list for the row filter, which is handed a sequence of values.
    gene_set = set.union(*genes)
    output_genes = list(gene_set)

    self.num_of_sel_genes = len(output_genes)
    self.update_info_box()

    if self.use_attr_names:
        selected = [
            column
            for column in self.input_data.domain.attributes
            if self.gene_id_attribute in column.attributes
            and str(column.attributes[self.gene_id_attribute]) in gene_set
        ]
        domain = Domain(
            selected,
            self.input_data.domain.class_vars,
            self.input_data.domain.metas,
        )
        new_data = self.input_data.from_table(domain, self.input_data)
        self.Outputs.matched_genes.send(new_data)
    else:
        # create filter from selected column for genes
        only_known = table_filter.FilterStringList(
            self.gene_id_column, output_genes
        )
        # apply filter to the data
        data_table = table_filter.Values([only_known])(self.input_data)
        self.Outputs.matched_genes.send(data_table)
def __apply_filters(self, data_table):
    """Apply the configured output options to ``data_table``.

    :param data_table: table to process.
    :return: ``(data_table, gene_id)``. ``gene_id`` is ``NCBI_ID`` when that
        name is in the table's domain or is an attribute key of any variable
        in ``data_table.domain[:]``, otherwise ``None``; when
        ``include_entrez_id`` is set both values come from
        ``self.__handle_ids`` instead. With ``filter_unknown`` set, the table
        is reduced to the genes reported by
        ``self.gene_matcher.get_known_genes()``: by attribute name when
        ``use_attr_names`` is set, by the ``selected_gene_col`` column
        otherwise.
    """
    has_id_attribute = any(
        NCBI_ID in attr.attributes for attr in data_table.domain[:]
    )
    gene_id = (
        NCBI_ID if NCBI_ID in data_table.domain or has_id_attribute else None
    )

    if self.include_entrez_id:
        data_table, gene_id = self.__handle_ids(data_table)

    if self.filter_unknown:
        known_input_genes = [
            gene.input_name for gene in self.gene_matcher.get_known_genes()
        ]

        if self.use_attr_names:
            # Set lookup keeps the per-attribute test constant time.
            known_names = set(known_input_genes)
            temp_domain = Domain(
                [
                    attr
                    for attr in data_table.domain.attributes
                    if attr.name in known_names
                ],
                metas=data_table.domain.metas,
                class_vars=data_table.domain.class_vars,
            )
            data_table = data_table.transform(temp_domain)
        else:
            # create filter from selected column for genes
            only_known = table_filter.FilterStringList(
                self.selected_gene_col, known_input_genes
            )
            # apply filter to the data
            data_table = table_filter.Values([only_known])(data_table)

    return data_table, gene_id
def test_filter_string_list_case_insensitive_data(self):
    """Filter on ``'donec'`` with ``case_sensitive=False``.

    The result must equal the rows of ``self.data`` whose first field is
    ``'Donec'``, both in length and in content.
    """
    condition = filter.FilterStringList(-1, ['donec'], case_sensitive=False)
    filtered_data = filter.Values(conditions=[condition])(self.table)

    correct_data = []
    for row in self.data:
        if row[0] in ['Donec']:
            correct_data.append(SqlRowInstance(filtered_data.domain, row))

    self.assertEqual(len(filtered_data), len(correct_data))
    self.assertSequenceEqual(filtered_data, correct_data)
def test_filter_string_list(self):
    """Filter on the values ``'et'`` and ``'in'``.

    The result must equal the rows of ``self.data`` whose first field is one
    of those two strings, both in length and in content.
    """
    condition = filter.FilterStringList(-1, ['et', 'in'])
    filtered_data = filter.Values(conditions=[condition])(self.table)

    correct_data = []
    for row in self.data:
        if row[0] in ['et', 'in']:
            correct_data.append(SqlRowInstance(filtered_data.domain, row))

    self.assertEqual(len(filtered_data), len(correct_data))
    self.assertSequenceEqual(filtered_data, correct_data)
def commit(self):
    """Build the output table and the gene matcher table and send both.

    The genes to output are those of the selected rows in ``self.table_view``
    (column ``table_model.entrez_column_index``); with nothing selected,
    ``table_model.get_filtered_genes()`` is used instead.

    When ``get_target_ids()`` yields at least one id other than ``'?'``:

    * genes in rows (``use_attr_names`` false): the ids are written to the
      ``target_database`` meta column (created if missing), rows are narrowed
      to the selected genes when any match, and, with ``exclude_unmatched``,
      filtered to the known ids;
    * genes in columns: each matched attribute gets a ``target_database``
      annotation (``'?'`` when the gene has no id) and is optionally renamed
      to the gene symbol; columns are narrowed to the selected genes when any
      match, and, with ``exclude_unmatched``, to those annotated with a known
      id.

    Otherwise ``None`` is sent on both outputs.
    """
    selection = self.table_view.selectionModel().selectedRows(
        self.table_model.entrez_column_index
    )
    selected_genes = [row.data() for row in selection]
    if not selected_genes:
        selected_genes = self.table_model.get_filtered_genes()

    gene_ids = self.get_target_ids()
    known_genes = [gid for gid in gene_ids if gid != '?']

    table = None
    gm_table = None

    if known_genes:
        # Genes are in rows (we have a column with genes).
        if not self.use_attr_names:
            input_domain = self.input_data.domain
            if self.target_database in input_domain:
                gene_var = input_domain[self.target_database]
                metas = input_domain.metas
            else:
                gene_var = StringVariable(self.target_database)
                metas = input_domain.metas + (gene_var,)

            domain = Domain(
                input_domain.attributes, input_domain.class_vars, metas
            )
            table = self.input_data.transform(domain)
            col, _ = table.get_column_view(gene_var)
            col[:] = gene_ids

            # filter selected rows
            selected_genes_set = set(selected_genes)
            selected_rows = [
                row_index
                for row_index, row in enumerate(table)
                if str(row[gene_var]) in selected_genes_set
            ]

            # handle table attributes
            table.attributes[TAX_ID] = self.get_selected_organism()
            table.attributes[GENE_AS_ATTRIBUTE_NAME] = False
            table.attributes[GENE_ID_COLUMN] = self.target_database

            # An empty match keeps the whole table rather than emptying it.
            if selected_rows:
                table = table[selected_rows]

            if self.exclude_unmatched:
                # create filter from selected column for genes
                only_known = table_filter.FilterStringList(
                    gene_var, known_genes
                )
                # apply filter to the data
                table = table_filter.Values([only_known])(table)
            # The table is sent once, together with the matcher results, at
            # the end of the method; no separate send is needed here.

        # Genes are in columns (genes are features).
        else:
            domain = self.input_data.domain.copy()
            table = self.input_data.transform(domain)

            for gene in self.gene_matcher.genes:
                if gene.input_identifier not in table.domain:
                    continue

                variable = table.domain[gene.input_identifier]
                variable.attributes[self.target_database] = (
                    str(gene.gene_id) if gene.gene_id else '?'
                )

                if self.replace_id_with_symbol:
                    try:
                        variable.name = str(gene.symbol)
                    except AttributeError:
                        # TODO: missing gene symbol, need to handle this?
                        pass

            # filter selected columns
            selected_genes_set = set(selected_genes)
            selected = [
                column
                for column in table.domain.attributes
                if self.target_database in column.attributes
                and str(column.attributes[self.target_database])
                in selected_genes_set
            ]

            output_attrs = selected if selected else table.domain.attributes

            if self.exclude_unmatched:
                known_genes_set = set(known_genes)
                # .get(): a column that was never annotated has no id at all,
                # so it counts as unmatched instead of raising KeyError.
                output_attrs = [
                    col
                    for col in output_attrs
                    if col.attributes.get(self.target_database)
                    in known_genes_set
                ]

            domain = Domain(
                output_attrs, table.domain.class_vars, table.domain.metas
            )
            table = table.from_table(domain, table)

            # handle table attributes
            table.attributes[TAX_ID] = self.get_selected_organism()
            table.attributes[GENE_AS_ATTRIBUTE_NAME] = True
            table.attributes[GENE_ID_ATTRIBUTE] = self.target_database

        gm_table = self.gene_matcher.to_data_table(
            selected_genes=selected_genes if selected_genes else None
        )

    self.Outputs.data_table.send(table)
    self.Outputs.gene_matcher_results.send(gm_table)