def __call__(self, args):
    """Run the whole database-building workflow in order.

    Parameters
    ----------
    args : object
        Parsed arguments, forwarded unchanged to `set_parameters`.

    Notes
    -----
    Progress is reported through two printed lines (start and finish),
    each carrying the value returned by `timestamp()`.
    """
    print(f'Database building started at {timestamp()}.')

    # configurations must be in place before arguments are validated
    self.cfg = load_configs()
    self.set_parameters(args)

    # the NCBI FTP connection is opened only when downloading is enabled
    # NOTE(review): the extent of this condition was reconstructed from
    # whitespace-collapsed source; it is assumed to cover only the
    # connection step -- confirm
    if not self.skipdownload:
        self.connect_server()

    # make sure the target directory exists (no error if already there)
    makedirs(self.output, exist_ok=True)

    # remaining stages; each is looked up right before it runs, so the
    # order of this tuple is the order of execution
    stages = (
        'retrieve_taxdump',     # retrieve taxonomy database
        'retrieve_summary',     # retrieve genome assembly summary
        'retrieve_categories',  # retrieve genome categories
        'filter_genomes',       # filter genomes
        'identify_taxonomy',    # identify taxonomy of genomes
        'sample_by_taxonomy',   # sample genomes by taxonomy
        'download_genomes',     # download genomes
        'extract_genomes',      # extract genomes
        'genome_lineages',      # identify genome lineages
        'genome_metadata',      # write genome metadata
        'build_taxdump',        # build taxonomy database
        'build_taxonmap',       # build protein-to-taxonomy map
        'compile_database',     # build protein sequence database
    )
    for stage in stages:
        getattr(self, stage)()

    # clean up the temporary directory
    # NOTE(review): 'mkdtemp' is presumably a marker attribute set
    # elsewhere when `tmpdir` was created automatically -- confirm
    if hasattr(self, 'mkdtemp'):
        rmtree(self.tmpdir)

    print(f'Database building finished at {timestamp()}.')
def test_set_parameters(self):
    """Check `Analyze.set_parameters` against valid and invalid inputs.

    Covers, in order: a single input file, a too-small coverage
    threshold, an input directory, a non-existent path, a directory
    without search results, and no input at all.
    """
    me = Analyze()
    me.cfg = load_configs()

    # a bare function object serves as a mutable attribute container
    def args():
        return None

    def expect_value_error(message):
        # set_parameters must raise ValueError with exactly this text
        with self.assertRaises(ValueError) as ctx:
            me.set_parameters(args)
        self.assertEqual(str(ctx.exception), message)

    search_dir = join(self.datadir, 'DnaK', 'search')
    sample_file = join(search_dir, 'sample.tsv')
    out_dir = join(self.tmpdir, 'output')

    # input is file
    args.input = sample_file
    args.output = out_dir
    args.noise = 0.75
    me.set_parameters(args)
    self.assertEqual(me.input, sample_file)
    self.assertEqual(me.output, out_dir)
    self.assertTrue(isdir(out_dir))
    self.assertDictEqual(me.input_map, {'sample': sample_file})
    self.assertEqual(me.noise, 75)

    # coverage threshold too small
    args.input_cov = 25
    expect_value_error(
        'Taxonomy coverage for auto-interence must be at least 50%.')
    args.input_cov = 75

    # input is directory
    args.input = search_dir
    me.set_parameters(args)
    self.assertEqual(me.input, search_dir)
    self.assertDictEqual(me.input_map, {'sample': sample_file})
    rmtree(out_dir)

    # input is invalid
    not_path = 'I am not a path'
    args.input = not_path
    expect_value_error(
        f'Invalid input data file or directory: {not_path}.')

    # input has no search result
    args.input = self.tmpdir
    expect_value_error(f'No input data are found under: {self.tmpdir}.')

    # no input (which is okay)
    del me.input_map
    args.input = None
    me.set_parameters(args)
    self.assertFalse(hasattr(me, 'input_map'))
def __call__(self, args):
    """Run the analysis workflow and predict HGTs.

    Parameters
    ----------
    args : object
        Parsed arguments, forwarded unchanged to `set_parameters`.

    Notes
    -----
    Returns early, after printing a warning and without the "finished"
    message, when no row of the score table has a truthy 'distal' value.
    """
    print(f'Analysis started at {timestamp()}.')

    # configurations first, then arguments
    self.cfg = load_configs()
    self.set_parameters(args)

    # a previously written score table may stand in for recomputation
    score_file = join(self.output, 'scores.tsv')
    reuse_scores = self.from_scores and isfile(score_file)

    if not reuse_scores:
        # read input taxonomy and search results
        self.read_input()
        # assign taxonomy to input genomes
        self.assign_taxonomy()
        # define the three groups for search hits
        self.define_groups()
        # calculate scores for the three groups per protein
        self.calc_scores()
        # generate a table of calculated scores
        self.make_score_table()
    else:
        self.df = pd.read_csv(score_file, sep='\t')

    # without any distal hit there is nothing to predict from
    has_distal = self.df['distal'].any()
    if not has_distal:
        print('WARNING: No hit is assigned to distal group. Cannot predict '
              'HGTs.')
        return

    # optional filtering steps
    if not self.orphans:
        self.remove_orphans()
    if self.outliers != 'none':
        self.remove_outliers()

    # predict HGTs
    self.predict_hgt()

    print(f'Analysis finished at {timestamp()}.')
def test_load_configs(self):
    """The loaded configuration must contain a 'database' entry."""
    self.assertIn('database', load_configs())