示例#1
0
    def read_input(self):
        """Load the taxonomy database and the homology search results.

        Raises
        ------
        ValueError
            If `taxdump` is not set and neither the input location nor its
            `dirname` contains both `names.dmp` and `nodes.dmp`.

        Notes
        -----
        The taxonomy database is taken from the first location that applies:

        1. The location already stored in `taxdump`.
        2. The input location itself.
        3. The `dirname` of the input location.

        Afterwards `taxdump` holds the loaded database, and `data` maps each
        sample ID of `input_map` to its parsed search results.
        """
        def holds_taxdump(folder):
            # both dump files must be present, checked in this order
            return all(isfile(join(folder, dmp))
                       for dmp in ('names.dmp', 'nodes.dmp'))

        # decide where the taxonomy database lives
        source = self.taxdump
        if source is not None:
            banner = 'Reading local taxonomy database...'
        else:
            banner = 'Reading custom taxonomy database...'
            source = self.input
            if not holds_taxdump(source):
                # fall back to the directory containing the input
                source = dirname(source)
                if not holds_taxdump(source):
                    raise ValueError('Missing taxonomy database.')
        print(banner)
        self.taxdump = read_taxdump(source)
        print(f'Done. Read {len(self.taxdump)} taxa.')

        # parse search results, one sample at a time
        print('Reading homology search results...')
        self.data = {}
        for sid, path in self.input_map.items():
            hits = self.read_search_results(
                path, self.maxhits, self.evalue, self.identity, self.coverage)
            self.data[sid] = hits
            print(f'  {sid}: {len(hits)} proteins.')
        print(f'Done. Read search results of {len(self.data)} samples.')
示例#2
0
    def read_input(self):
        """Read the taxonomy database, then the homology search results.

        The database location is `taxdump` when that is set; otherwise the
        input location, or failing that its `dirname`, is used if it contains
        both `names.dmp` and `nodes.dmp`. On return, `taxdump` holds the
        loaded database and `data` maps each sample ID in `input_map` to its
        parsed search results.

        Raises
        ------
        ValueError
            If no taxonomy database can be located.
        """
        def contains_dump(folder):
            # names.dmp is probed first, then nodes.dmp
            for dmp in ('names.dmp', 'nodes.dmp'):
                if not isfile(join(folder, dmp)):
                    return False
            return True

        # resolve and load the taxonomy database
        if self.taxdump is None:
            location = self.input
            if not contains_dump(location):
                # try the directory that contains the input
                location = dirname(location)
                if not contains_dump(location):
                    raise ValueError('Missing taxonomy database.')
            print('Reading custom taxonomy database...')
        else:
            location = self.taxdump
            print('Reading local taxonomy database...')
        self.taxdump = read_taxdump(location)
        print('Done. Read {} taxa.'.format(len(self.taxdump)))

        # load search results of every sample
        print('Reading homology search results...')
        self.data = {}
        for sid, path in self.input_map.items():
            hits = self.read_search_results(path)
            self.data[sid] = hits
            print('  {}: {} proteins.'.format(sid, len(hits)))
        n_samples = len(self.data)
        print('Done. Read search results of {} samples.'.format(n_samples))
示例#3
0
    def test_read_taxdump(self):
        """Check that `read_taxdump` parses both supported dump layouts.

        Two pairs of `nodes.dmp` / `names.dmp` files are written to a
        temporary directory: a custom short format and the original NCBI
        format. Both must yield the same dictionary. The fixtures also
        include a name without a matching node (2157) and non-scientific
        names (synonym, in-part), none of which appear in the expected
        result.
        """
        tmpdir = mkdtemp()
        # register cleanup right away so the directory is removed even when
        # an assertion fails or `read_taxdump` raises
        self.addCleanup(rmtree, tmpdir)

        exp = {
            '1': {
                'name': 'root',
                'parent': '1',
                'rank': 'no rank'
            },
            '2': {
                'name': 'Bacteria',
                'parent': '131567',
                'rank': 'superkingdom'
            }
        }

        # custom short format
        with open(join(tmpdir, 'nodes.dmp'), 'w') as f:
            f.write('1\t|\t1\t|\tno rank\t|\n'
                    '2\t|\t131567\t|\tsuperkingdom\t|\n')
        with open(join(tmpdir, 'names.dmp'), 'w') as f:
            f.write('1\t|\troot\t|\n'
                    '2\t|\tBacteria\t|\n'
                    '2157\t|\tArchaea\t|\n')
        obs = read_taxdump(tmpdir)
        self.assertDictEqual(obs, exp)

        # original NCBI format (overwrites the files written above)
        with open(join(tmpdir, 'nodes.dmp'), 'w') as f:
            f.write(
                '1\t|\t1\t|\tno rank\t|\t\t|\t8\t|\t0\t|\t1\t|\t0\t|\t0\t|\t0'
                '\t|\t0\t|\t0\t|\t\t|\n'
                '2\t|\t131567\t|\tsuperkingdom\t|\t\t|\t0\t|\t0\t|\t11\t|\t0'
                '\t|\t0\t|\t0\t|\t0\t|\t0\t|\t\t|\n')
        with open(join(tmpdir, 'names.dmp'), 'w') as f:
            f.write(
                '1\t|\tall\t|\t\t|\tsynonym\t|\n'
                '1\t|\troot\t|\t\t|\tscientific name\t|\n'
                '2\t|\tBacteria\t|\tBacteria <prokaryotes>\t|\tscientific name'
                '\t|\n'
                '2\t|\tMonera\t|\tMonera <Bacteria>\t|\tin-part\t|\n')
        obs = read_taxdump(tmpdir)
        self.assertDictEqual(obs, exp)
示例#4
0
    def retrieve_taxdump(self):
        """Download the NCBI taxdump archive if needed, then load it.

        The archive is kept as `download/taxdump.tar.gz` under the output
        directory. It is fetched over the FTP connection unless
        `check_local_file` returns a true value for it. `names.dmp` and
        `nodes.dmp` are then extracted into the temporary directory and the
        parsed database is stored in `taxdump`.
        """
        fname = 'taxdump.tar.gz'
        remote_path = f'/pub/taxonomy/{fname}'
        archive = join(self.output, 'download', fname)

        # fetch the archive when the local copy cannot be used
        must_download = not self.check_local_file(archive, self.overwrite)
        if must_download:
            print('Downloading NCBI taxonomy database...', end='', flush=True)
            with open(archive, 'wb') as fh:
                self.ftp.retrbinary('RETR ' + remote_path, fh.write)
            print(' done.')

        # unpack the two dump files and parse them
        print('Reading NCBI taxonomy database...', end='', flush=True)
        with tarfile.open(archive, 'r:gz') as tar:
            for member in ('names.dmp', 'nodes.dmp'):
                tar.extract(member, self.tmpdir)
        self.taxdump = read_taxdump(self.tmpdir)
        print(' done.')
        print(f'Total number of TaxIDs: {len(self.taxdump)}.')