def test_common_taxonomy(self):
    """Check taxonomy lookups for organisms from the common-taxid list.

    For ``self.human`` and ``self.dicty`` the test verifies that the taxid
    resolves to the expected species name, that the species name resolves
    back to the same taxid, and that a non-empty short name is available.
    """
    # NOTE(review): the 'H**o sapiens' literal looks masked/garbled in this
    # file (presumably the binomial name for human) - kept verbatim here,
    # confirm against the taxonomy data before relying on it.
    expected = (
        (self.human, 'H**o sapiens'),
        (self.dicty, 'Dictyostelium discoideum'),
    )

    # The list of common organisms must not be empty.
    self.assertGreater(len(taxonomy.common_taxids()), 0)

    # taxid -> species name
    for tax_id, species in expected:
        self.assertEqual(taxonomy.name(tax_id), species)

    # species name -> taxid
    for tax_id, species in expected:
        self.assertEqual(taxonomy.species_name_to_taxid(species), tax_id)

    # every common organism has a non-empty short name
    for tax_id, _species in expected:
        self.assertGreater(len(taxonomy.shortname(tax_id)), 0)
def test_uncommon_taxonomy(self):
    """Check taxonomy lookups for an organism outside the common-taxid list.

    ``self.dog`` must not be among the common taxids; its name must still
    resolve, while the reverse (name -> taxid) lookup and the short name
    are expected to be unavailable.
    """
    dog_name = 'Canis lupus familiaris'

    self.assertNotIn(self.dog, taxonomy.common_taxids())
    self.assertEqual(taxonomy.name(self.dog), dog_name)

    # not supported yet: reverse lookup returns None for this species.
    self.assertIsNone(taxonomy.species_name_to_taxid(dog_name))

    # no short name is defined for this organism (empty result).
    self.assertFalse(len(taxonomy.shortname(self.dog)))
def runner(self, state: TaskState) -> Table:
    """Build (or reuse) the expression table of the selected collection.

    When ``self.data_table`` is falsy, the expression, sample-metadata and
    (optionally) QC data frames are read from a ``resdk.tables.RNATables``
    object, converted to a ``Table``, passed through ``GeneMatcher`` and
    stored in ``self.data_table``. The table is then normalized with
    ``self.normalize`` on every call.

    Parameters
    ----------
    state: TaskState
        Used to report progress and status and to check whether an
        interruption was requested.

    Returns
    -------
    Table
        The result of ``self.normalize`` applied to the expression table.

    Raises
    ------
    Exception
        Raised from the progress callback when
        ``state.is_interruption_requested()`` is true.
    """
    exp_type = self.data_output_options.expression_type[self.exp_type].type
    exp_source = self.data_output_options.expression_sources[self.exp_source]
    proc_slug = self.data_output_options.process[self.proc_slug].slug
    collection_id = self.selected_collection_id

    table = self.data_table
    progress_steps_download = iter(np.linspace(0, 50, 2))

    def callback(i: float, status=""):
        # ``i`` is scaled by 100 before being reported as progress.
        state.set_progress_value(i * 100)
        if status:
            state.set_status(status)
        if state.is_interruption_requested():
            raise Exception

    if not table:
        collection = self.res.get_collection_by_id(collection_id)
        coll_table = resdk.tables.RNATables(
            collection,
            expression_source=exp_source,
            expression_process_slug=proc_slug,
            progress_callable=wrap_callback(callback, end=0.5),
        )
        species = coll_table._data[0].output['species']
        sample = coll_table._samples[0]

        state.set_status('Downloading ...')
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            df_exp = coll_table.exp if exp_type != 'rc' else coll_table.rc
            df_exp = df_exp.rename(index=coll_table.readable_index)
            df_metas = coll_table.meta
            df_metas = df_metas.rename(index=coll_table.readable_index)
            df_qc = None
            if self.append_qc_data:
                # TODO: check if there is a way to detect if collection
                #       table contains QC data
                try:
                    df_qc = coll_table.qc
                    df_qc = df_qc.rename(index=coll_table.readable_index)
                except ValueError:
                    # Best effort: QC data is optional, carry on without it.
                    pass
        finally:
            # Close the loop even when the download fails or is interrupted
            # (the progress callback raises on interruption); previously it
            # was closed only on the success path and leaked otherwise.
            loop.close()

        state.set_status('To data table ...')

        # Field names (second component of a dotted column label) that
        # occur in more than one column.
        duplicates = {
            item
            for item, count in Counter(
                [label.split('.')[1] for label in df_metas.columns.to_list() if '.' in label]
            ).items()
            if count > 1
        }

        # what happens if there is more nested sections?
        section_name_to_label = {
            section['name']: section['label'] for section in sample.descriptor_schema.schema
        }

        column_labels = {}
        for field_schema, _fields, path in iterate_schema(
            sample.descriptor, sample.descriptor_schema.schema, path=''
        ):
            path = path[1:]  # this is ugly, but cant go around it
            if path not in df_metas.columns:
                continue
            label = field_schema['label']
            # NOTE(review): unpacking assumes exactly one '.' in the path;
            # deeper nesting would raise ValueError here - confirm.
            section_name, field_name = path.split('.')
            # Prefix the label with its section label only when the field
            # name alone would be ambiguous.
            column_labels[path] = (
                label
                if field_name not in duplicates
                else f'{section_name_to_label[section_name]} - {label}'
            )

        df_exp = df_exp.reset_index(drop=True)
        df_metas = df_metas.astype('object')
        df_metas = df_metas.fillna(np.nan)
        df_metas = df_metas.replace('nan', np.nan)
        df_metas = df_metas.rename(columns=column_labels)
        if df_qc is not None:
            df_metas = pd.merge(df_metas, df_qc, left_index=True, right_index=True)

        xym, domain_metas = vars_from_df(df_metas)
        x, _, m = xym
        x_metas = np.hstack((x, m))
        attrs = [ContinuousVariable(col) for col in df_exp.columns]
        metas = domain_metas.attributes + domain_metas.metas
        domain = Domain(attrs, metas=metas)
        table = Table(domain, df_exp.to_numpy(), metas=x_metas)
        # NOTE(review): the first value of linspace(0, 50, 2) is 0.0, so
        # this reports 0 rather than 50 - confirm that is intended.
        state.set_progress_value(next(progress_steps_download))

        state.set_status('Matching genes ...')
        progress_steps_gm = iter(np.linspace(50, 99, len(coll_table.gene_ids)))

        def gm_callback():
            state.set_progress_value(next(progress_steps_gm))

        tax_id = species_name_to_taxid(species)
        gm = GeneMatcher(tax_id, progress_callback=gm_callback)
        table = gm.match_table_attributes(table, rename=True)
        table.attributes[TableAnnotation.tax_id] = tax_id
        table.attributes[TableAnnotation.gene_as_attr_name] = True
        table.attributes[TableAnnotation.gene_id_attribute] = 'Entrez ID'
        # Keep the un-normalized table so later runs can skip the download.
        self.data_table = table

    state.set_status('Normalizing ...')
    table = self.normalize(table)
    state.set_progress_value(100)
    return table
def runner(
    res: ResolweAPI,
    data_objects: List[Data],
    options: DataOutputOptions,
    exp_type: int,
    proc_type: int,
    input_annotation: int,
    state: TaskState,
) -> Table:
    """Download expressions of matching data objects into one table.

    Parameters
    ----------
    res: ResolweAPI
        Used to fetch the expression file of each data object.
    data_objects: list
        Candidate data objects; filtered by process type, input annotation
        (source, species, build) and, unless the expression type is 'rc',
        by expression type.
    options: DataOutputOptions
        Lookup tables indexed by ``exp_type``, ``proc_type`` and
        ``input_annotation``.
    exp_type, proc_type, input_annotation: int
        Indices into the corresponding ``options`` sequences.
    state: TaskState
        Receives progress and status updates.

    Returns
    -------
    Table
        One row per data object, sample descriptor values as string metas,
        attributes matched by ``GeneMatcher``.

    Raises
    ------
    ResolweDataObjectsNotFound
        When no data object passes the filters.
    """
    expression_kind = options.expression[exp_type].type
    process_kind = options.process[proc_type].type
    annotation = options.input_annotation[input_annotation]
    source = annotation.source
    species = annotation.species
    build = annotation.build

    # Read counts live in the output field named after the expression type;
    # every other expression type is stored under 'exp'.
    file_output_field = expression_kind if expression_kind == 'rc' else 'exp'

    def same_annotation(obj) -> bool:
        output = obj.output
        return (
            output['source'] == source
            and output['species'] == species
            and output['build'] == build
        )

    # apply filters
    selected = [obj for obj in data_objects if obj.process.type == process_kind]
    selected = [obj for obj in selected if same_annotation(obj)]
    if expression_kind != 'rc':
        selected = [obj for obj in selected if obj.output['exp_type'] == expression_kind]

    if not selected:
        raise ResolweDataObjectsNotFound

    # One step per download plus three post-processing steps.
    total_steps = len(selected) + 3
    done_steps = 0

    def advance():
        nonlocal done_steps
        done_steps += 1
        state.set_progress_value(100 * (done_steps / total_steps))

    frames = []
    metadata = defaultdict(list)

    state.set_status('Downloading ...')
    for data_object in selected:
        advance()

        # Collect sample descriptor values; each value is wrapped in a
        # one-element list so it can be assigned as a table column later.
        sample = data_object.sample
        general = sample.descriptor.get('general', {})
        for label in SAMPLE_DESCRIPTOR_LABELS:
            metadata[label].append([general.get(label, '')])
        metadata['sample_name'].append([sample.name])
        metadata['expression_type'].append([expression_kind.upper()])

        response = res.get_expressions(
            data_object.id, data_object.output[file_output_field]['file']
        )
        with io.BytesIO(response.content) as buffer:
            # expressions to data frame
            frame = pd.read_csv(buffer, sep='\t', compression='gzip')
            frame = frame.set_index('Gene').T.reset_index(drop=True)
            frames.append(frame)

    state.set_status('Concatenating samples ...')
    df = pd.concat(frames, axis=0)

    state.set_status('To data table ...')
    table = table_from_frame(df)
    advance()

    state.set_status('Adding metadata ...')
    meta_vars = [StringVariable(label) for label in metadata]
    table = table.transform(
        Domain(table.domain.attributes, table.domain.class_vars, meta_vars)
    )
    for column, values in metadata.items():
        table[:, column] = values
    advance()

    state.set_status('Matching genes ...')
    tax_id = species_name_to_taxid(species)
    matcher = GeneMatcher(tax_id)
    table = matcher.match_table_attributes(table, rename=True)
    table.attributes[TableAnnotation.tax_id] = tax_id
    table.attributes[TableAnnotation.gene_as_attr_name] = True
    table.attributes[TableAnnotation.gene_id_attribute] = 'Entrez ID'
    advance()

    return table
def to_data_table(self, selected_genes: Optional[List[str]] = None) -> Table:
    """Transform GeneMatcher results to Orange data table.

    Optionally a list of genes (Entrez Ids) can be provided; the output
    table is then populated only with genes whose ``gene_id`` (compared as
    a string) is in that list.

    Parameters
    ----------
    selected_genes: list
        List of Entrez Ids

    Returns
    -------
    Orange.data.Table
        Summary of Gene info in tabular format (all columns are string
        metas; the domain has no attributes).
    """
    column_names = (
        'Input gene ID',
        ENTREZ_ID,
        'Symbol',
        'Synonyms',
        'Description',
        'Other IDs',
        'Type of gene',
        'Chromosome',
        'Map location',
        'Locus tag',
        'Symbol from nomenclature authority',
        'Full name from nomenclature authority',
        'Nomenclature status',
        'Other designations',
        'Species',
        'Taxonomy ID',
    )
    domain = Domain([], metas=[StringVariable(name) for name in column_names])

    if selected_genes is None:
        genes: List[Gene] = self.genes
    else:
        wanted = set(selected_genes)
        genes = [gene for gene in self.genes if str(gene.gene_id) in wanted]

    def as_row(gene) -> list:
        # Values are listed in the same order as ``column_names``.
        other_ids = (
            ', '.join(f'{key}: {val}' for key, val in gene.db_refs.items())
            if gene.db_refs
            else ''
        )
        synonyms = ', '.join(gene.synonyms) if gene.synonyms else ''
        return [
            gene.input_identifier,
            gene.gene_id,
            gene.symbol,
            synonyms,
            gene.description,
            other_ids,
            gene.type_of_gene,
            gene.chromosome,
            gene.map_location,
            gene.locus_tag,
            gene.symbol_from_nomenclature_authority,
            gene.full_name_from_nomenclature_authority,
            gene.nomenclature_status,
            gene.other_designations,
            # NOTE(review): the 'Species' column receives the result of
            # species_name_to_taxid(gene.species) - confirm this is intended.
            species_name_to_taxid(gene.species),
            gene.tax_id,
        ]

    rows = [as_row(gene) for gene in genes]

    table = Table(domain, rows)
    table.name = 'Gene Matcher Results'
    table.attributes[TableAnnotation.tax_id] = self.tax_id
    table.attributes[TableAnnotation.gene_as_attr_name] = False
    table.attributes[TableAnnotation.gene_id_column] = ENTREZ_ID
    return table
def set_data(self, data: Table) -> None:
    """Accept a new input table, validate its annotations and commit.

    With no (or falsy) data the info labels are reset and ``None`` is sent
    to the ``genes`` output. Otherwise the table annotations are checked;
    on the first missing annotation the matching warning is shown,
    ``self.data`` is reset to ``None`` and the method returns. For valid
    data the source/target organisms and the info labels are updated and
    ``self.commit()`` is called.

    Parameters
    ----------
    data: Table
        Input table; expected to carry ``TableAnnotation`` entries in
        ``data.attributes``.
    """
    self.Warning.clear()
    self.data = data

    if not self.data:
        self.info.set_input_summary("0")
        self.info.set_output_summary("0")
        self.info_gene.clear()
        self.info_gene_type.setText("No data on input.")
        self.Outputs.genes.send(None)
        return

    attributes = self.data.attributes

    if TableAnnotation.gene_as_attr_name not in attributes:
        self.Warning.mising_gene_as_attribute_name()
        self.data = None
        return

    if attributes[TableAnnotation.gene_as_attr_name]:
        # Genes are attribute (column) names.
        if TableAnnotation.gene_id_attribute not in attributes:
            self.Warning.mising_gene_id_attribute()
            self.data = None
            return
        if TableAnnotation.tax_id not in attributes:
            # tax_id is read unconditionally below; without this check a
            # table lacking it raised KeyError instead of showing a warning.
            self.Warning.missing_tax_id()
            self.data = None
            return
    else:
        # Genes are listed in rows, identified by a gene-id column.
        if TableAnnotation.tax_id not in attributes:
            self.Warning.missing_tax_id()
            self.data = None
            return
        if TableAnnotation.gene_id_column not in attributes:
            # NOTE(review): reuses the "gene as attribute name" warning for
            # a missing gene_id_column annotation - confirm intended.
            self.Warning.mising_gene_as_attribute_name()
            self.data = None
            return
        if attributes[TableAnnotation.gene_id_column] not in self.data.domain:
            self.Warning.missing_gene_id()
            self.data = None
            return

    self.source_tax = data.attributes[TableAnnotation.tax_id]
    taxonomy = common_taxid_to_name(self.source_tax)

    # Offer every known organism except the source organism as a target.
    self.target_organism.clear()
    self.target_organism.addItems(
        [tax_name for tax_name in self.taxonomy_names if tax_name != taxonomy]
    )

    if taxonomy == self.selected_organism:
        # The previously selected target equals the source organism:
        # fall back to the first known organism.
        self.combo_box_id = -1
        self.selected_organism = self.taxonomy_names[0]
        self.target_tax = species_name_to_taxid(self.selected_organism)
    else:
        try:
            self.combo_box_id = self.taxonomy_names.index(self.selected_organism)
        except ValueError:
            self.combo_box_id = -1

        # NOTE(review): the index comes from the full taxonomy_names list,
        # while the combo box omits the source organism - confirm the two
        # indices are meant to be interchangeable.
        if self.combo_box_id != -1:
            self.target_organism.setCurrentIndex(self.combo_box_id)
            self.selected_organism = self.taxonomy_names[self.combo_box_id]
            self.target_tax = species_name_to_taxid(self.selected_organism)
        else:
            self.target_organism.setCurrentIndex(0)
            self.selected_organism = self.taxonomy_names[0]
            self.target_tax = species_name_to_taxid(self.selected_organism)

    self.info_gene_type.setText(f"Organism: {taxonomy}")
    # Genes are columns when used as attribute names, rows otherwise.
    data_len = (
        len(data.domain.attributes)
        if self.data.attributes[TableAnnotation.gene_as_attr_name]
        else len(data)
    )
    self.info_gene.setText(f"Number of genes: {data_len}")
    self.info.set_input_summary(f"{data_len}")

    self.commit()
def target_organism_change(self, combo_box_id: int) -> None:
    """React to a new selection in the target-organism combo box.

    Stores the selected index and organism name, resolves the organism
    name to a taxonomy id and commits.

    Parameters
    ----------
    combo_box_id: int
        Index of the selected item in ``self.target_organism``.
    """
    self.combo_box_id = combo_box_id

    organism = self.target_organism.itemText(combo_box_id)
    self.selected_organism = organism
    self.target_tax = species_name_to_taxid(organism)

    self.commit()