예제 #1
0
    def identify_taxonomy(self):
        """Identify taxonomy of genomes.

        Filters the genome table ``self.df`` in successive steps and prints
        how many rows each step removed:

        1. Optionally (``self.capital``) keep only rows whose
           ``organism_name`` passes ``is_capital``.
        2. Optionally (``self.block``) drop rows whose ``organism_name``
           matches ``contain_words`` for the blocked word list.
        3. Drop the ``species_taxid`` column.
        4. Drop rows with a missing ``taxid`` or a ``taxid`` that is not
           found in ``self.taxdump``.
        5. Add a ``species`` column computed by ``taxid_at_rank`` and drop
           rows for which it is missing.
        6. Optionally (``self.latin``) keep only rows whose species name in
           ``self.taxdump`` passes ``is_latin``.
        7. Optionally (``self.taxids``) include or exclude (``self.exclude``)
           rows according to ``is_ancestral``.

        Side effects: ``self.df`` is rebound to the filtered table;
        ``self.block`` is replaced by the list returned from
        ``list_from_param`` and ``self.taxids`` by a set of such values.
        """
        print('Identifying taxonomy of genomes...')
        n = self.df.shape[0]

        def report_diff(msg):
            """Print ``msg`` with the number of rows dropped since last call.

            Nothing is printed when the row count did not decrease.
            """
            nonlocal n
            n_ = self.df.shape[0]
            if n_ < n:
                print('  ' + msg.format(n - n_))
            n = n_

        # remove non-capitalized organism names
        if self.capital:
            self.df = self.df[self.df['organism_name'].apply(is_capital)]
            report_diff('Dropped {} genomes without capitalized organism '
                        'name.')

        # block certain words in organism names
        if self.block:
            self.block = list_from_param(self.block)
            self.df = self.df[~self.df['organism_name'].
                              apply(contain_words, args=(self.block, ))]
            report_diff('Dropped {} genomes with one or more blocked words in '
                        'organism name.')

        # NOTE: from here on the table is rebound rather than modified in
        # place; at this point self.df may be a boolean-mask slice of the
        # original frame, and in-place edits on such a slice trigger pandas'
        # SettingWithCopyWarning.

        # remove original species information
        self.df = self.df.drop(columns=['species_taxid'])

        # drop genomes whose taxIds are not in taxdump
        self.df = self.df.dropna(subset=['taxid'])
        self.df = self.df.assign(taxid=self.df['taxid'].astype(str))
        self.df = self.df[self.df['taxid'].isin(self.taxdump)]
        report_diff('Dropped {} genomes without valid taxId.')

        # assign genomes to species (represented by taxId not name)
        self.df = self.df.assign(species=self.df['taxid'].apply(
            taxid_at_rank, rank='species', taxdump=self.taxdump))

        # drop genomes without species taxId
        self.df = self.df.dropna(subset=['species'])
        report_diff('Dropped {} genomes without valid species taxId.')

        # drop genomes without Latinate species name
        if self.latin:
            self.df = self.df[self.df['species'].apply(
                lambda x: is_latin(self.taxdump[x]['name']))]
            report_diff('Dropped {} genomes without Latinate species name.')
        print('Done.')

        # include/exclude taxIds
        if self.taxids:
            self.taxids = set(list_from_param(self.taxids))
            print(f'{"Ex" if self.exclude else "In"}cluding '
                  f'{len(self.taxids)} custom TaxIDs...')

            # Compare strict booleans so that a falsy non-bool ``exclude``
            # (e.g. None) means "include", consistent with the message above;
            # a raw ``!=`` against None would keep every row.
            exclude = bool(self.exclude)
            self.df = self.df[self.df['taxid'].apply(
                lambda x: bool(is_ancestral(
                    x, self.taxids, self.taxdump)) != exclude)]
            report_diff('Dropped {} genomes.')
예제 #2
0
 def test_is_ancestral(self):
     """Exercise ``is_ancestral`` on the taxdump built from ``taxdump_archaea``.

     TaxId '1538547' must be reported as ancestral with respect to the
     set {'2157'} and not with respect to the set {'2'}.
     """
     tree = taxdump_from_text(taxdump_archaea)
     query = '1538547'
     # positive case
     self.assertTrue(is_ancestral(query, {'2157'}, tree))
     # negative case
     self.assertFalse(is_ancestral(query, {'2'}, tree))