def test_common_taxonomy(self):
        """Check name, taxid and short-name lookups for the common organisms.

        Uses the ``self.human`` and ``self.dicty`` taxonomy ids provided by the
        test fixture.
        """
        self.assertGreater(len(taxonomy.common_taxids()), 0)

        # The species name had been mangled to 'H**o sapiens' (censoring
        # artifact); the scientific name is 'Homo sapiens'.
        self.assertEqual(taxonomy.name(self.human), 'Homo sapiens')
        self.assertEqual(taxonomy.name(self.dicty), 'Dictyostelium discoideum')

        # Reverse lookup: species name -> taxonomy id.
        self.assertEqual(taxonomy.species_name_to_taxid('Homo sapiens'),
                         self.human)
        self.assertEqual(
            taxonomy.species_name_to_taxid('Dictyostelium discoideum'),
            self.dicty)

        # Both organisms are expected to have a non-empty short name.
        self.assertGreater(len(taxonomy.shortname(self.human)), 0)
        self.assertGreater(len(taxonomy.shortname(self.dicty)), 0)
    def test_uncommon_taxonomy(self):
        """Check lookups for an organism outside the common taxonomy ids.

        The full name is still resolvable for ``self.dog``, while the reverse
        lookup and the short name are expected to come back empty.
        """
        common_ids = taxonomy.common_taxids()
        self.assertNotIn(self.dog, common_ids)

        dog_name = taxonomy.name(self.dog)
        self.assertEqual(dog_name, 'Canis lupus familiaris')

        # Reverse lookup and short names are not supported yet for
        # organisms outside the common set.
        dog_taxid = taxonomy.species_name_to_taxid('Canis lupus familiaris')
        self.assertIsNone(dog_taxid)

        short_names = taxonomy.shortname(self.dog)
        self.assertFalse(len(short_names))
    def runner(self, state: TaskState) -> Table:
        """Build the expression table for the selected collection and normalize it.

        When ``self.data_table`` is falsy the table is assembled from a
        ``resdk.tables.RNATables`` view of the collection (expressions, sample
        metadata and, optionally, QC data), its attributes are matched with
        ``GeneMatcher`` and the result is stored in ``self.data_table``.
        Otherwise the stored table is reused. In both cases the table is
        passed through ``self.normalize`` before it is returned.

        Parameters
        ----------
        state: TaskState
            Used to report progress and status and to poll for interruption.

        Returns
        -------
        Table
            The value returned by ``self.normalize``.

        Raises
        ------
        Exception
            Raised by the progress callback when interruption was requested.
        """
        exp_type = self.data_output_options.expression_type[self.exp_type].type
        exp_source = self.data_output_options.expression_sources[
            self.exp_source]
        proc_slug = self.data_output_options.process[self.proc_slug].slug
        collection_id = self.selected_collection_id

        table = self.data_table
        # NOTE(review): the first value yielded here is 0.0, so the single
        # `next()` call further down sets the progress to 0, not 50 -- confirm
        # that this is intended.
        progress_steps_download = iter(np.linspace(0, 50, 2))

        def callback(i: float, status=""):
            # `i` is scaled by 100 before it is reported as progress.
            state.set_progress_value(i * 100)
            if status:
                state.set_status(status)
            if state.is_interruption_requested():
                raise Exception

        if not table:
            collection = self.res.get_collection_by_id(collection_id)
            coll_table = resdk.tables.RNATables(
                collection,
                expression_source=exp_source,
                expression_process_slug=proc_slug,
                progress_callable=wrap_callback(callback, end=0.5),
            )
            species = coll_table._data[0].output['species']
            sample = coll_table._samples[0]

            state.set_status('Downloading ...')
            # presumably the RNATables accessors below need an event loop in
            # this thread -- TODO confirm against resdk.
            loop = asyncio.new_event_loop()
            try:
                asyncio.set_event_loop(loop)
                df_exp = coll_table.exp if exp_type != 'rc' else coll_table.rc
                df_exp = df_exp.rename(index=coll_table.readable_index)
                df_metas = coll_table.meta
                df_metas = df_metas.rename(index=coll_table.readable_index)
                df_qc = None
                if self.append_qc_data:
                    # TODO: check if there is a way to detect if collection
                    #       table contains QC data
                    try:
                        df_qc = coll_table.qc
                        df_qc = df_qc.rename(index=coll_table.readable_index)
                    except ValueError:
                        # Best effort: continue without QC columns.
                        pass
            finally:
                # Close the loop even when the download fails or is
                # interrupted, so that it is not leaked.
                loop.close()

            state.set_status('To data table ...')

            # Field names (the part after the first '.') that occur in more
            # than one metadata column.
            duplicates = {
                item
                for item, count in Counter([
                    label.split('.')[1]
                    for label in df_metas.columns.to_list() if '.' in label
                ]).items() if count > 1
            }

            # what happens if there is more nested sections?
            section_name_to_label = {
                section['name']: section['label']
                for section in sample.descriptor_schema.schema
            }

            column_labels = {}
            for field_schema, fields, path in iterate_schema(
                    sample.descriptor, sample.descriptor_schema.schema,
                    path=''):
                path = path[1:]  # this is ugly, but cant go around it
                if path not in df_metas.columns:
                    continue
                label = field_schema['label']
                # NOTE(review): raises ValueError when `path` does not consist
                # of exactly two dot-separated parts.
                section_name, field_name = path.split('.')
                # Prefix the label with its section label only when the bare
                # field name is ambiguous.
                column_labels[path] = (
                    label if field_name not in duplicates else
                    f'{section_name_to_label[section_name]} - {label}')

            df_exp = df_exp.reset_index(drop=True)
            df_metas = df_metas.astype('object')
            df_metas = df_metas.fillna(np.nan)
            df_metas = df_metas.replace('nan', np.nan)
            df_metas = df_metas.rename(columns=column_labels)
            if df_qc is not None:
                df_metas = pd.merge(df_metas,
                                    df_qc,
                                    left_index=True,
                                    right_index=True)

            xym, domain_metas = vars_from_df(df_metas)
            x, _, m = xym
            x_metas = np.hstack((x, m))
            attrs = [ContinuousVariable(col) for col in df_exp.columns]
            metas = domain_metas.attributes + domain_metas.metas
            domain = Domain(attrs, metas=metas)
            table = Table(domain, df_exp.to_numpy(), metas=x_metas)
            state.set_progress_value(next(progress_steps_download))

            state.set_status('Matching genes ...')
            progress_steps_gm = iter(
                np.linspace(50, 99, len(coll_table.gene_ids)))

            def gm_callback():
                state.set_progress_value(next(progress_steps_gm))

            tax_id = species_name_to_taxid(species)
            gm = GeneMatcher(tax_id, progress_callback=gm_callback)
            table = gm.match_table_attributes(table, rename=True)
            table.attributes[TableAnnotation.tax_id] = tax_id
            table.attributes[TableAnnotation.gene_as_attr_name] = True
            table.attributes[TableAnnotation.gene_id_attribute] = 'Entrez ID'
            # Keep the un-normalized table so later runs can reuse it.
            self.data_table = table

        state.set_status('Normalizing ...')
        table = self.normalize(table)
        state.set_progress_value(100)

        return table
示例#4
0
def runner(
    res: ResolweAPI,
    data_objects: List[Data],
    options: DataOutputOptions,
    exp_type: int,
    proc_type: int,
    input_annotation: int,
    state: TaskState,
) -> Table:
    """Download expressions of matching data objects into one annotated table.

    ``exp_type``, ``proc_type`` and ``input_annotation`` are indices into
    ``options.expression``, ``options.process`` and
    ``options.input_annotation``. Data objects are kept only when their
    process type, source, species and build match the selection (and, unless
    the expression type is ``'rc'``, their ``exp_type`` output as well).

    Each remaining object's expression file is read as a gzipped,
    tab-separated file, the frames are concatenated row-wise, sample metadata
    is added as string metas and the attributes are matched with
    ``GeneMatcher``.

    Raises ``ResolweDataObjectsNotFound`` when no data object passes the
    filters.
    """
    exp_type = options.expression[exp_type].type
    proc_type = options.process[proc_type].type
    annotation = options.input_annotation[input_annotation]
    source = annotation.source
    species = annotation.species
    build = annotation.build

    # Read counts are read from the 'rc' output field, everything else
    # from the 'exp' field.
    is_read_count = exp_type == 'rc'
    file_output_field = exp_type if is_read_count else 'exp'

    def is_selected(obj) -> bool:
        """Tell whether a data object matches the selected options."""
        if not obj.process.type == proc_type:
            return False
        output = obj.output
        return (
            output['source'] == source
            and output['species'] == species
            and output['build'] == build
            and (is_read_count or output['exp_type'] == exp_type)
        )

    data_objects = [obj for obj in data_objects if is_selected(obj)]
    if not data_objects:
        raise ResolweDataObjectsNotFound

    # One step per download plus three post-processing steps.
    total_steps = len(data_objects) + 3
    finished_steps = 0

    def advance_progress():
        nonlocal finished_steps
        finished_steps += 1
        state.set_progress_value(100 * (finished_steps / total_steps))

    data_frames = []
    metadata = defaultdict(list)

    state.set_status('Downloading ...')
    for data_object in data_objects:
        advance_progress()

        # Collect sample-level metadata, one single-item row per sample.
        sample = data_object.sample
        general = sample.descriptor.get('general', {})
        for label in SAMPLE_DESCRIPTOR_LABELS:
            metadata[label].append([general.get(label, '')])
        metadata['sample_name'].append([sample.name])
        metadata['expression_type'].append([exp_type.upper()])

        file_name = data_object.output[file_output_field]['file']
        response = res.get_expressions(data_object.id, file_name)
        with io.BytesIO(response.content) as buffer:
            # expressions to data frame
            frame = pd.read_csv(buffer, sep='\t', compression='gzip')
            frame = frame.set_index('Gene').T.reset_index(drop=True)
            data_frames.append(frame)

    state.set_status('Concatenating samples ...')
    df = pd.concat(data_frames, axis=0)

    state.set_status('To data table ...')
    table = table_from_frame(df)
    advance_progress()

    state.set_status('Adding metadata ...')
    meta_variables = [StringVariable(label) for label in metadata]
    domain = Domain(table.domain.attributes, table.domain.class_vars, meta_variables)
    table = table.transform(domain)
    for label, column in metadata.items():
        table[:, label] = column
    advance_progress()

    state.set_status('Matching genes ...')
    tax_id = species_name_to_taxid(species)
    matcher = GeneMatcher(tax_id)
    table = matcher.match_table_attributes(table, rename=True)
    table.attributes[TableAnnotation.tax_id] = tax_id
    table.attributes[TableAnnotation.gene_as_attr_name] = True
    table.attributes[TableAnnotation.gene_id_attribute] = 'Entrez ID'
    advance_progress()

    return table
示例#5
0
    def to_data_table(self,
                      selected_genes: Optional[List[str]] = None) -> Table:
        """Export the matched genes as an Orange data table of string metas.

        Every gene becomes one row; the table has no regular attributes.

        Parameters
        ----------
        selected_genes: list
            Optional list of Entrez Ids. When given, only genes whose
            ``gene_id`` (compared as a string) is in the list are exported.

        Returns
        -------
        Orange.data.Table
            Summary of gene info in tabular format, annotated with the
            taxonomy id and the name of the gene id column.
        """
        column_names = (
            'Input gene ID',
            ENTREZ_ID,
            'Symbol',
            'Synonyms',
            'Description',
            'Other IDs',
            'Type of gene',
            'Chromosome',
            'Map location',
            'Locus tag',
            'Symbol from nomenclature authority',
            'Full name from nomenclature authority',
            'Nomenclature status',
            'Other designations',
            'Species',
            'Taxonomy ID',
        )
        domain = Domain([], metas=[StringVariable(name) for name in column_names])

        if selected_genes is None:
            genes: List[Gene] = self.genes
        else:
            wanted_ids = set(selected_genes)
            genes = [
                gene for gene in self.genes if str(gene.gene_id) in wanted_ids
            ]

        rows = []
        for gene in genes:
            # Flatten the cross-references into "key: value" pairs.
            if gene.db_refs:
                other_ids = ', '.join(
                    f'{key}: {val}' for key, val in gene.db_refs.items())
            else:
                other_ids = ''

            if gene.synonyms:
                synonyms = ', '.join(gene.synonyms)
            else:
                synonyms = ''

            rows.append([
                gene.input_identifier,
                gene.gene_id,
                gene.symbol,
                synonyms,
                gene.description,
                other_ids,
                gene.type_of_gene,
                gene.chromosome,
                gene.map_location,
                gene.locus_tag,
                gene.symbol_from_nomenclature_authority,
                gene.full_name_from_nomenclature_authority,
                gene.nomenclature_status,
                gene.other_designations,
                # NOTE(review): the 'Species' column is filled with the result
                # of a name -> taxid lookup -- confirm this is intended.
                species_name_to_taxid(gene.species),
                gene.tax_id,
            ])

        table = Table(domain, rows)
        table.name = 'Gene Matcher Results'

        annotations = table.attributes
        annotations[TableAnnotation.tax_id] = self.tax_id
        annotations[TableAnnotation.gene_as_attr_name] = False
        annotations[TableAnnotation.gene_id_column] = ENTREZ_ID
        return table
示例#6
0
    def set_data(self, data: Table) -> None:
        """Accept a new input table, validate its annotations and commit.

        The table must carry ``TableAnnotation.gene_as_attr_name`` and
        ``TableAnnotation.tax_id``. Depending on the former it must also carry
        either ``gene_id_attribute`` (genes as attributes) or ``gene_id_column``
        naming a variable in the domain (genes in rows). When a check fails,
        the matching warning is shown, ``self.data`` is reset to ``None`` and
        the method returns without committing.

        With a falsy ``data`` the summaries and info labels are reset and
        ``None`` is sent on the ``genes`` output.

        Parameters
        ----------
        data: Table
            The input table, or a falsy value when the input was removed.
        """
        self.Warning.clear()
        self.data = data

        if self.data:
            if TableAnnotation.gene_as_attr_name not in self.data.attributes:
                self.Warning.mising_gene_as_attribute_name()
                self.data = None
                return
            if self.data.attributes[TableAnnotation.gene_as_attr_name]:
                if TableAnnotation.gene_id_attribute not in self.data.attributes:
                    self.Warning.mising_gene_id_attribute()
                    self.data = None
                    return
                # The tax id is read unconditionally below, so it has to be
                # validated here as well; otherwise a table without it fails
                # with a KeyError instead of showing the warning.
                if TableAnnotation.tax_id not in self.data.attributes:
                    self.Warning.missing_tax_id()
                    self.data = None
                    return

            else:
                if TableAnnotation.tax_id not in self.data.attributes:
                    self.Warning.missing_tax_id()
                    self.data = None
                    return
                if TableAnnotation.gene_id_column not in self.data.attributes:
                    # NOTE(review): this reports the gene_as_attr_name warning
                    # although gene_id_column is what is missing -- confirm
                    # which warning should be shown here.
                    self.Warning.mising_gene_as_attribute_name()
                    self.data = None
                    return
                if self.data.attributes[
                        TableAnnotation.
                        gene_id_column] not in self.data.domain:
                    self.Warning.missing_gene_id()
                    self.data = None
                    return
        else:
            self.info.set_input_summary("0")
            self.info.set_output_summary("0")
            self.info_gene.clear()
            self.info_gene_type.setText("No data on input.")
            self.Outputs.genes.send(None)

            return

        self.source_tax = data.attributes[TableAnnotation.tax_id]
        taxonomy = common_taxid_to_name(self.source_tax)
        # Offer every known organism except the source organism as a target.
        self.target_organism.clear()
        self.target_organism.addItems([
            tax_name for tax_name in self.taxonomy_names
            if tax_name != taxonomy
        ])

        if taxonomy == self.selected_organism:
            # The previously selected target is now the source organism;
            # fall back to the first known organism.
            self.combo_box_id = -1
            self.selected_organism = self.taxonomy_names[0]
            self.target_tax = species_name_to_taxid(self.selected_organism)
        else:
            try:
                self.combo_box_id = self.taxonomy_names.index(
                    self.selected_organism)
            except ValueError:
                self.combo_box_id = -1

            if self.combo_box_id != -1:
                # NOTE(review): the index comes from the full taxonomy_names
                # list while the combo box omits the source organism, so the
                # two may be off by one -- verify.
                self.target_organism.setCurrentIndex(self.combo_box_id)
                self.selected_organism = self.taxonomy_names[self.combo_box_id]
                self.target_tax = species_name_to_taxid(self.selected_organism)
            else:
                self.target_organism.setCurrentIndex(0)
                self.selected_organism = self.taxonomy_names[0]
                self.target_tax = species_name_to_taxid(self.selected_organism)

        self.info_gene_type.setText(f"Organism: {taxonomy}")
        # Genes are counted as columns or as rows, depending on the layout.
        data_len = (len(data.domain.attributes)
                    if self.data.attributes[TableAnnotation.gene_as_attr_name]
                    else len(data))
        self.info_gene.setText(f"Number of genes: {data_len}")
        self.info.set_input_summary(f"{data_len}")

        self.commit()
示例#7
0
    def target_organism_change(self, combo_box_id: int) -> None:
        """Store the newly chosen target organism and commit.

        Records the combo box index, the item text at that index as the
        selected organism and the taxonomy id looked up from that name.
        """
        self.combo_box_id = combo_box_id

        organism_name = self.target_organism.itemText(combo_box_id)
        self.selected_organism = organism_name
        self.target_tax = species_name_to_taxid(self.selected_organism)

        self.commit()