def read_input(self):
    """Load the taxonomy database, then the homology search results.

    Notes
    -----
    The taxonomy database location is chosen in this order:

    1. The value already stored in `taxdump`, when it is not None.
    2. The input path, if it contains both `names.dmp` and `nodes.dmp`.
    3. The directory name of the input path, under the same condition.

    The loaded database replaces `taxdump`. Search results are then read
    for every (sample ID, file name) pair of `input_map` and stored in
    `data` under the sample ID.

    Raises
    ------
    ValueError
        If none of the three locations provides a taxonomy database.
    """
    def has_taxdump(folder):
        # both dump files must be present for a folder to qualify
        return all(isfile(join(folder, name))
                   for name in ('names.dmp', 'nodes.dmp'))

    # decide where the taxonomy database comes from
    if self.taxdump is not None:
        message = 'Reading local taxonomy database...'
        location = self.taxdump
    elif has_taxdump(self.input):
        message = 'Reading custom taxonomy database...'
        location = self.input
    else:
        parent = dirname(self.input)
        if not has_taxdump(parent):
            raise ValueError('Missing taxonomy database.')
        message = 'Reading custom taxonomy database...'
        location = parent

    # load it
    print(message)
    self.taxdump = read_taxdump(location)
    print(f'Done. Read {len(self.taxdump)} taxa.')

    # load search results, one sample at a time
    print('Reading homology search results...')
    self.data = {}
    for sid, fname in self.input_map.items():
        hits = self.read_search_results(
            fname, self.maxhits, self.evalue, self.identity, self.coverage)
        self.data[sid] = hits
        print(f' {sid}: {len(hits)} proteins.')
    print(f'Done. Read search results of {len(self.data)} samples.')
def read_input(self):
    """Workflow for reading input data.

    Notes
    -----
    1. Read taxonomy database into `taxdump`. The location already held by
       `taxdump` is used when it is not None; otherwise the input path, and
       then the directory name of the input path, is used if it contains
       both `names.dmp` and `nodes.dmp`.
    2. Read homology search results into `data`, one entry per item of
       `input_map`, keyed by sample ID.

    Raises
    ------
    ValueError
        If no taxonomy database is found in any of those locations.
    """
    # read taxonomy database
    if self.taxdump is not None:
        print('Reading local taxonomy database...')
        self.taxdump = read_taxdump(self.taxdump)
    elif (isfile(join(self.input, 'names.dmp')) and
          isfile(join(self.input, 'nodes.dmp'))):
        print('Reading custom taxonomy database...')
        self.taxdump = read_taxdump(self.input)
    elif (isfile(join(dirname(self.input), 'names.dmp')) and
          isfile(join(dirname(self.input), 'nodes.dmp'))):
        print('Reading custom taxonomy database...')
        self.taxdump = read_taxdump(dirname(self.input))
    else:
        raise ValueError('Missing taxonomy database.')
    print(f'Done. Read {len(self.taxdump)} taxa.')

    # read search results
    print('Reading homology search results...')
    self.data = {}
    for sid, fname in self.input_map.items():
        self.data[sid] = self.read_search_results(fname)
        print(f' {sid}: {len(self.data[sid])} proteins.')
    print(f'Done. Read search results of {len(self.data)} samples.')
def test_read_taxdump(self):
    """Check `read_taxdump` against two on-disk taxdump layouts.

    The same expected dictionary is used for both layouts: taxon '2157',
    which appears only in the short-format `names.dmp` and has no node
    record, must be absent, and for the NCBI-format `names.dmp` only the
    'scientific name' rows ('root', 'Bacteria') end up as names.

    The scratch directory is removed even when a step fails, so a failing
    run does not leave files behind.
    """
    exp = {
        '1': {
            'name': 'root', 'parent': '1', 'rank': 'no rank'
        },
        '2': {
            'name': 'Bacteria', 'parent': '131567', 'rank': 'superkingdom'
        }
    }
    tmpdir = mkdtemp()
    try:
        # custom short format
        with open(join(tmpdir, 'nodes.dmp'), 'w') as f:
            f.write('1 | 1 | no rank |\n'
                    '2 | 131567 | superkingdom |\n')
        with open(join(tmpdir, 'names.dmp'), 'w') as f:
            f.write('1 | root |\n'
                    '2 | Bacteria |\n'
                    '2157 | Archaea |\n')
        obs = read_taxdump(tmpdir)
        self.assertDictEqual(obs, exp)

        # original NCBI format
        with open(join(tmpdir, 'nodes.dmp'), 'w') as f:
            f.write(
                '1\t|\t1\t|\tno rank\t|\t\t|\t8\t|\t0\t|\t1\t|\t0\t|\t0\t|\t0'
                '\t|\t0\t|\t0\t|\t\t|\n'
                '2\t|\t131567\t|\tsuperkingdom\t|\t\t|\t0\t|\t0\t|\t11\t|\t0'
                '\t|\t0\t|\t0\t|\t0\t|\t0\t|\t\t|\n')
        with open(join(tmpdir, 'names.dmp'), 'w') as f:
            f.write(
                '1\t|\tall\t|\t\t|\tsynonym\t|\n'
                '1\t|\troot\t|\t\t|\tscientific name\t|\n'
                '2\t|\tBacteria\t|\tBacteria <prokaryotes>\t|\tscientific name'
                '\t|\n'
                '2\t|\tMonera\t|\tMonera <Bacteria>\t|\tin-part\t|\n')
        obs = read_taxdump(tmpdir)
        self.assertDictEqual(obs, exp)
    finally:
        # clean up regardless of assertion outcome
        rmtree(tmpdir)
def retrieve_taxdump(self):
    """Retrieve NCBI taxdump.

    Notes
    -----
    1. Unless `check_local_file` returns a true value for the local copy
       (`<output>/download/taxdump.tar.gz`), download the archive from
       `/pub/taxonomy/` through `self.ftp`.
    2. Extract `names.dmp` and `nodes.dmp` from the archive into `tmpdir`
       and read them into `taxdump`.

    If the download fails or is interrupted, the incomplete local file is
    deleted before the error is re-raised, so that a later run does not
    pick up a truncated archive as if it were a finished download.
    """
    # imported locally so this method does not depend on the module's
    # top-level imports providing it
    from os import remove

    fname = 'taxdump.tar.gz'
    remote_file = f'/pub/taxonomy/{fname}'
    local_file = join(self.output, 'download', fname)

    # download taxdump
    if not self.check_local_file(local_file, self.overwrite):
        print('Downloading NCBI taxonomy database...', end='', flush=True)
        try:
            with open(local_file, 'wb') as f:
                self.ftp.retrbinary('RETR ' + remote_file, f.write)
        except BaseException:
            # BaseException so that Ctrl-C during the transfer is covered
            # too; the original error is always re-raised below
            try:
                remove(local_file)
            except OSError:
                # nothing was created, or it cannot be removed; either way
                # the original error is the one worth reporting
                pass
            raise
        print(' done.')

    # read taxdump
    print('Reading NCBI taxonomy database...', end='', flush=True)
    with tarfile.open(local_file, 'r:gz') as f:
        f.extract('names.dmp', self.tmpdir)
        f.extract('nodes.dmp', self.tmpdir)
    self.taxdump = read_taxdump(self.tmpdir)
    print(' done.')
    print(f'Total number of TaxIDs: {len(self.taxdump)}.')