Exemplo n.º 1
0
    def __call__(self, args):
        """Run the database-building workflow from start to finish.

        Parameters
        ----------
        args : object
            Arguments object, passed unchanged to ``set_parameters``.

        Notes
        -----
        Start and finish timestamps are printed. Every stage is a method of
        this object and the stages run strictly in the order listed below.
        """
        print(f'Database building started at {timestamp()}.')

        # configurations are loaded before arguments are read and validated
        self.cfg = load_configs()
        self.set_parameters(args)

        # the NCBI FTP server is contacted only when downloading is enabled
        if not self.skipdownload:
            self.connect_server()

        # create target directory (no error if it already exists)
        makedirs(self.output, exist_ok=True)

        # remaining stages, resolved by name right before each call
        stages = (
            'retrieve_taxdump',     # retrieve taxonomy database
            'retrieve_summary',     # retrieve genome assembly summary
            'retrieve_categories',  # retrieve genome categories
            'filter_genomes',       # filter genomes
            'identify_taxonomy',    # identify taxonomy of genomes
            'sample_by_taxonomy',   # sample genomes by taxonomy
            'download_genomes',     # download genomes
            'extract_genomes',      # extract genomes
            'genome_lineages',      # identify genome lineages
            'genome_metadata',      # write genome metadata
            'build_taxdump',        # build taxonomy database
            'build_taxonmap',       # build protein-to-taxonomy map
            'compile_database',     # build protein sequence database
        )
        for stage in stages:
            getattr(self, stage)()

        # clean up: the temporary directory is deleted only when the instance
        # carries a 'mkdtemp' attribute
        # NOTE(review): presumably a flag set where the directory is
        # auto-created -- confirm in set_parameters
        if hasattr(self, 'mkdtemp'):
            rmtree(self.tmpdir)

        print(f'Database building finished at {timestamp()}.')
Exemplo n.º 2
0
    def test_set_parameters(self):
        """Exercise ``Analyze.set_parameters`` with valid and invalid input."""
        me = Analyze()
        me.cfg = load_configs()

        # a bare function object doubles as a mutable attribute holder that
        # is handed to set_parameters as the arguments object
        def args():
            return None

        def assert_rejected(expected):
            # set_parameters must raise ValueError carrying exactly this text
            with self.assertRaises(ValueError) as caught:
                me.set_parameters(args)
            self.assertEqual(str(caught.exception), expected)

        # input is file
        sample_file = join(self.datadir, 'DnaK', 'search', 'sample.tsv')
        out_path = join(self.tmpdir, 'output')
        args.input = sample_file
        args.output = out_path
        args.noise = 0.75
        me.set_parameters(args)
        self.assertEqual(me.input, sample_file)
        self.assertEqual(me.output, out_path)
        self.assertTrue(isdir(out_path))
        self.assertDictEqual(me.input_map, {'sample': sample_file})
        self.assertEqual(me.noise, 75)

        # coverage threshold too small
        args.input_cov = 25
        assert_rejected(
            'Taxonomy coverage for auto-interence must be at least 50%.')
        # restore an accepted threshold before the remaining scenarios
        args.input_cov = 75

        # input is directory
        search_dir = join(self.datadir, 'DnaK', 'search')
        args.input = search_dir
        me.set_parameters(args)
        self.assertEqual(me.input, search_dir)
        self.assertDictEqual(me.input_map, {'sample': sample_file})
        rmtree(out_path)

        # input is invalid
        bogus = 'I am not a path'
        args.input = bogus
        assert_rejected(f'Invalid input data file or directory: {bogus}.')

        # input has no search result
        args.input = self.tmpdir
        assert_rejected(f'No input data are found under: {self.tmpdir}.')

        # no input (which is okay): drop the stale map first so the final
        # check shows that set_parameters did not create one
        delattr(me, 'input_map')
        args.input = None
        me.set_parameters(args)
        self.assertFalse(hasattr(me, 'input_map'))
Exemplo n.º 3
0
    def __call__(self, args):
        """Run the analysis workflow from start to finish.

        Parameters
        ----------
        args : object
            Arguments object, passed unchanged to ``set_parameters``.

        Notes
        -----
        When no hit falls in the distal group, a warning is printed and the
        method returns early, without printing the finish message.
        """
        print(f'Analysis started at {timestamp()}.')

        # configurations are loaded before arguments are read and validated
        self.cfg = load_configs()
        self.set_parameters(args)

        score_file = join(self.output, 'scores.tsv')
        if not (self.from_scores and isfile(score_file)):
            # compute everything from scratch:
            # read input taxonomy and search results
            self.read_input()
            # assign taxonomy to input genomes
            self.assign_taxonomy()
            # define the three groups for search hits
            self.define_groups()
            # calculate scores for the three groups per protein
            self.calc_scores()
            # generate a table of calculated scores
            self.make_score_table()
        else:
            # use existing score table (requested and present on disk)
            self.df = pd.read_csv(score_file, sep='\t')

        # no distal hits: prediction is impossible, stop here
        has_distal = self.df['distal'].any()
        if not has_distal:
            print('WARNING: No hit is assigned to distal group. Cannot '
                  'predict HGTs.')
            return

        # remove orphans unless they are to be kept
        if not self.orphans:
            self.remove_orphans()

        # remove outliers unless the setting is 'none'
        if self.outliers != 'none':
            self.remove_outliers()

        # predict HGTs
        self.predict_hgt()

        print(f'Analysis finished at {timestamp()}.')
Exemplo n.º 4
0
 def test_load_configs(self):
     """Check that the object returned by ``load_configs()`` contains
     ``'database'``."""
     obs = load_configs()
     self.assertIn('database', obs)