Exemplo n.º 1
0
    def identify_taxonomy(self):
        """Identify taxonomy of genomes.

        Notes
        -----
        Filters `self.df` in successive steps and prints how many genomes
        each step dropped:

        1. If `self.capital` is set, keep only rows whose `organism_name`
           passes `is_capital`.
        2. If `self.block` is set, normalize it via `list_from_param` and
           drop rows whose `organism_name` passes `contain_words`.
        3. Drop the original `species_taxid` column.
        4. Drop rows whose `taxid` is missing or is not a key of
           `self.taxdump`; taxIds are converted to strings first.
        5. Add a `species` column computed by `taxid_at_rank` and drop rows
           where it is missing.
        6. If `self.latin` is set, keep only rows whose species name (looked
           up in `self.taxdump`) passes `is_latin`.
        7. If `self.taxids` is set, normalize it to a set and keep rows for
           which `is_ancestral(taxid, self.taxids, self.taxdump)` differs
           from the truthiness of `self.exclude`.
        """
        print('Identifying taxonomy of genomes...')
        n = self.df.shape[0]

        def report_diff(msg):
            # print how many rows were lost since the previous call (if any),
            # then remember the current row count for the next call
            nonlocal n
            n_ = self.df.shape[0]
            if n_ < n:
                print('  ' + msg.format(n - n_))
            n = n_

        # remove non-capitalized organism names
        if self.capital:
            self.df = self.df[self.df['organism_name'].apply(is_capital)]
            report_diff('Dropped {} genomes without capitalized organism '
                        'name.')

        # block certain words in organism names
        if self.block:
            self.block = list_from_param(self.block)
            self.df = self.df[~self.df['organism_name'].
                              apply(contain_words, args=(self.block, ))]
            report_diff('Dropped {} genomes with one or more blocked words in '
                        'organism name.')

        # remove original species information
        # (rebind instead of mutating in place: at this point the frame may be
        # a filtered slice, on which in-place edits can trigger pandas'
        # chained-assignment warning)
        self.df = self.df.drop(columns=['species_taxid'])

        # drop genomes whose taxIds are not in taxdump
        self.df = self.df.dropna(subset=['taxid'])
        self.df = self.df.assign(taxid=self.df['taxid'].astype(str))
        self.df = self.df[self.df['taxid'].isin(self.taxdump)]
        report_diff('Dropped {} genomes without valid taxId.')

        # assign genomes to species (represented by taxId not name)
        self.df = self.df.assign(species=self.df['taxid'].apply(
            taxid_at_rank, rank='species', taxdump=self.taxdump))

        # drop genomes without species taxId
        self.df = self.df.dropna(subset=['species'])
        report_diff('Dropped {} genomes without valid species taxId.')

        # drop genomes without Latinate species name
        if self.latin:
            self.df = self.df[self.df['species'].apply(
                lambda x: is_latin(self.taxdump[x]['name']))]
            report_diff('Dropped {} genomes without Latinate species name.')

        # include/exclude taxIds
        if self.taxids:
            self.taxids = set(list_from_param(self.taxids))
            print(f'{"Ex" if self.exclude else "In"}cluding '
                  f'{len(self.taxids)} custom TaxIDs...')

            # compare against a real boolean so that a falsy non-bool flag
            # (e.g. None) means "include", matching the message above
            exclude = bool(self.exclude)
            self.df = self.df[self.df['taxid'].apply(lambda x: is_ancestral(
                x, self.taxids, self.taxdump) != exclude)]
            report_diff('Dropped {} genomes.')

        # announce completion only after the last filtering step
        print('Done.')
Exemplo n.º 2
0
    def define_groups(self):
        """Populate the "self" and "close" groups of taxIds.

        Notes
        -----
        For each of the two groups, the top-level taxIds are read from the
        `self_tax` / `close_tax` attribute. A non-empty value is normalized
        via `list_from_param`; otherwise `infer_self_group` /
        `infer_close_group` is called. Unless the group is already present in
        `self.groups`, it is filled with the top-level taxIds plus their
        descendants, and members of the self group are removed from the close
        group. The content of each group is printed.
        """
        self.groups = {}
        for key in ('self', 'close'):
            attr = '{}_tax'.format(key)
            given = getattr(self, attr)

            if not given:
                # nothing supplied: let the inference routine decide
                getattr(self, 'infer_{}_group'.format(key))()
                print('Auto-inferred {} group:'.format(key))
            else:
                # taxIds supplied by the user
                setattr(self, attr, list_from_param(given))
                print('User-defined {} group:'.format(key))

            # gather every taxId under the group's top-level taxIds
            roots = getattr(self, attr)
            if key not in self.groups:
                members = set()
                for root in roots:
                    members.update(
                        [root] + get_descendants(root, self.taxdump))

                # close group must not overlap with self group
                if key == 'close':
                    members = members.difference(self.groups['self'])
                self.groups[key] = members

            # print a summary of the group
            for root in roots:
                label = describe_taxon(root, self.taxdump)
                print('  {} ({})'.format(root, label))
            size = len(self.groups[key])
            print('{} group has {} taxa.'.format(key.capitalize(), size))
Exemplo n.º 3
0
    def define_groups(self):
        """Populate the "self" and "close" groups of taxIds.

        Notes
        -----
        Attributes assigned here:
        1. `self_tax`: top-level taxId(s) of the self group (normalized via
           `list_from_param` when given; otherwise left to
           `infer_self_group`).
        2. `close_tax`: top-level taxId(s) of the close group (same handling,
           using `infer_close_group`).
        3. `groups`: reset to an empty dict, then its `self` and `close` keys
           are filled (unless already present) with the top-level taxIds
           plus their descendants; self-group members are removed from the
           close group.
        """
        self.groups = {}
        for key in ('self', 'close'):
            attr = f'{key}_tax'
            given = getattr(self, attr)

            if not given:
                # nothing supplied: let the inference routine decide
                getattr(self, f'infer_{key}_group')()
                print(f'Auto-inferred {key} group:')
            else:
                # taxIds supplied by the user
                setattr(self, attr, list_from_param(given))
                print(f'User-defined {key} group:')

            # gather every taxId under the group's top-level taxIds
            tids = getattr(self, attr)
            if key not in self.groups:
                members = set()
                for top in tids:
                    members.update([top] + get_descendants(top, self.taxdump))

                # close group must not overlap with self group
                if key == 'close':
                    members = members.difference(self.groups['self'])
                self.groups[key] = members

            # print a summary of the group
            for tid in tids:
                print(f'  {tid} ({describe_taxon(tid, self.taxdump)})')
            print(f'{key.capitalize()} group has {len(self.groups[key])} '
                  'taxa.')
Exemplo n.º 4
0
    def filter_genomes(self):
        """Filter genomes based on genome metadata.

        Notes
        -----
        Operates on `self.df` and prints how many genomes were dropped:

        1. If `self.complete` is set, keep only rows whose `assembly_level`
           is "Complete Genome" or "Chromosome".
        2. Rename column `# assembly_accession` to `accession`, derive
           `accnov` (accession up to the first ".") and `genome` ("G" plus
           the part of `accnov` after the first "_", or all of it if there is
           no "_"), then drop rows with a duplicated `genome` value.
        3. If `self.genoids` is set, normalize it to a set via
           `list_from_param` and keep (or, if `self.exclude` is truthy, drop)
           rows whose `accession`, `accnov` or `genome` is among those IDs.
        """
        print('Filtering genomes...')
        n = self.df.shape[0]

        def report_diff(msg):
            # print how many rows were lost since the previous call (if any),
            # then remember the current row count for the next call
            nonlocal n
            n_ = self.df.shape[0]
            if n_ < n:
                print('  ' + msg.format(n - n_))
            n = n_

        # complete genomes only
        if self.complete:
            self.df = self.df[self.df['assembly_level'].isin(
                {'Complete Genome', 'Chromosome'})]
            report_diff('Dropped {} non-complete genomes.')

        # non-redundant genome IDs
        # typically not necessary, just in case
        # (rebind instead of mutating in place: the frame may be a filtered
        # slice, on which in-place edits can trigger pandas'
        # chained-assignment warning)
        self.df = self.df.rename(
            columns={'# assembly_accession': 'accession'})

        # `n` must be passed by keyword: it is keyword-only in pandas >= 2.0
        accnov = self.df['accession'].str.split('.', n=1).str[0]
        genome = 'G' + accnov.str.split('_', n=1).str[-1]
        self.df = self.df.assign(accnov=accnov, genome=genome)
        self.df = self.df.drop_duplicates(subset=['genome'])

        # include/exclude genome Ids
        if self.genoids:
            self.genoids = set(list_from_param(self.genoids))
            print(f'{"Ex" if self.exclude else "In"}cluding '
                  f'{len(self.genoids)} custom genome IDs...')

            # a genome matches if any of its three ID forms was listed
            listed = (self.df['accession'].isin(self.genoids)
                      | self.df['accnov'].isin(self.genoids)
                      | self.df['genome'].isin(self.genoids))

            # compare against a real boolean so that a falsy non-bool flag
            # (e.g. None) means "include", matching the message above
            self.df = self.df[listed != bool(self.exclude)]
            report_diff('Dropped {} genomes.')
        print('Done.')
Exemplo n.º 5
0
    def test_list_from_param(self):
        """Test `list_from_param` with empty, list, string and file inputs."""
        # nothing
        self.assertEqual(list_from_param(None), [])
        self.assertEqual(list_from_param(''), [])

        # already list
        self.assertListEqual(list_from_param([1, 2, 3]), [1, 2, 3])

        # list string
        self.assertListEqual(list_from_param('test'), ['test'])
        self.assertListEqual(list_from_param('a,b,c'), ['a', 'b', 'c'])

        # list file (one entry per line)
        exp = ['this', 'is', 'a', 'list']
        fp = join(self.tmpdir, 'test.txt')
        with open(fp, 'w') as f:
            for e in exp:
                print(e, file=f)

        # remove the temporary file even if the assertion fails, so that a
        # failing test does not leave it behind in the temporary directory
        try:
            obs = list_from_param(fp)
            self.assertListEqual(obs, exp)
        finally:
            remove(fp)