示例#1
0
    def test_sum_taxids(self):
        """Check that `sum_taxids` unions input TaxIDs with hit TaxIDs."""

        def _hits_df(d):
            # one-column ('taxid') hit table indexed by hit name
            return pd.Series(d, name='taxid').to_frame()

        analyzer = Analyze()
        analyzer.input_tax = {'S1': '1', 'S2': '3'}

        # hit name -> TaxID, one mapping per protein
        s1_hits = [{'a': '4', 'b': '6'}, {'a': '4', 'c': '8'}]
        s2_hits = [{'b': '6', 'd': '1'}]
        analyzer.data = {
            'S1': [{'hits': _hits_df(hits)} for hits in s1_hits],
            'S2': [{'hits': _hits_df(hits)} for hits in s2_hits],
        }

        # duplicates across samples and proteins collapse into one set
        expected = {'1', '3', '4', '6', '8'}
        self.assertSetEqual(analyzer.sum_taxids(), expected)
示例#2
0
    def test_find_match(self):
        """Test `find_match` under several `match_th` settings."""
        analyzer = Analyze()
        analyzer.taxdump = taxdump_from_text(taxdump_proteo)
        add_children(analyzer.taxdump)
        hits = pd.DataFrame(
            [
                [100, '585056'],  # E. coli UMN026
                [99, '1038927'],  # E. coli O104:H4
                [97, '562'],  # Escherichia coli
                [95, '622'],  # Shigella dysenteriae
                [92, '543'],  # Enterobacteriaceae
                [88, '548'],  # Klebsiella aerogenes
                [80, '766'],  # Rickettsiales
            ],
            columns=['score', 'taxid'])

        # (match threshold, expected TaxID)
        cases = (
            (0.99, '562'),  # keep top 1% hits
            (0.9, '543'),  # keep top 10% hits
            (0.8, '1224'),  # keep top 20% hits
        )
        for threshold, expected in cases:
            analyzer.match_th = threshold
            self.assertEqual(analyzer.find_match(hits), expected)

        # input DataFrame is empty
        self.assertEqual(analyzer.find_match(pd.DataFrame()), '0')
示例#3
0
    def test_infer_genome_tax(self):
        """Test `Analyze.infer_genome_tax` at different coverage thresholds."""
        taxdump = taxdump_from_text(taxdump_proteo)

        # hit TaxIDs of five proteins; the last protein has no hit
        hit_taxids = [
            ['562', '620', '570'],  # E. coli
            ['562', '585056', '1038927', '2'],  # E. coli
            ['561', '543', '776'],  # Escherichia
            ['548', '570', '1236'],  # K. aerogenes
            [],
        ]
        proteins = [{'hits': pd.DataFrame(ids, columns=['taxid'])}
                    for ids in hit_taxids]

        # 3 / 4 best hits assigned to Escherichia
        result = Analyze.infer_genome_tax(proteins, taxdump, 75)
        self.assertTupleEqual(result, ('561', 75.0))

        # reduce coverage threshold: 2 / 4 best hits assigned to E. coli
        result = Analyze.infer_genome_tax(proteins, taxdump, 50)
        self.assertTupleEqual(result, ('562', 50.0))

        # remove one protein that best matches E. coli:
        # 3 / 3 best hits assigned to Enterobacteriaceae
        del proteins[0]
        result = Analyze.infer_genome_tax(proteins, taxdump, 75)
        self.assertTupleEqual(result, ('543', 100.0))

        # no input protein
        with self.assertRaises(ValueError) as ctx:
            Analyze.infer_genome_tax({}, taxdump, 75)
        self.assertEqual(str(ctx.exception), 'Cannot auto-infer taxonomy.')
示例#4
0
 def test_remove_orphans(self):
     """Check which rows remain after `remove_orphans`."""
     rows = [[1.0, 0.2], [0.5, 0.4], [0.0, 0.0], [0.8, 0.0], [0.0, 0.7]]
     analyzer = Analyze()
     analyzer.df = pd.DataFrame(rows, columns=['close', 'distal'])
     analyzer.remove_orphans()
     # the expected table lacks only the all-zero row
     kept = [[1.0, 0.2], [0.5, 0.4], [0.8, 0.0], [0.0, 0.7]]
     self.assertListEqual(analyzer.df.values.tolist(), kept)
示例#5
0
 def test_plot_density(self):
     """Check that `Analyze.plot_density` produces the requested file."""
     sample = np.concatenate([self.dist_norm1, self.dist_norm2])
     sample = sample[:, np.newaxis]
     kde = KernelDensity(kernel='gaussian', bandwidth=0.5).fit(sample)
     x, y = Analyze.density_func(sample, kde, 100)
     peak, valley = Analyze.first_hill(x, y)
     # threshold placed between valley and peak, close to the valley
     th = valley - (valley - peak) * 0.5 / 100
     out_fp = join(self.tmpdir, 'tmp.png')
     Analyze.plot_density(x, y, peak, valley, th, out_fp)
     self.assertTrue(isfile(out_fp))
     remove(out_fp)
示例#6
0
    def test_first_hill(self):
        """Test `Analyze.first_hill` on three kinds of distributions."""
        estimator = KernelDensity(kernel='gaussian', bandwidth=0.5)

        def _density(data):
            # refit the shared estimator, then compute the density curve
            return Analyze.density_func(data, estimator.fit(data), 100)

        # typical bimodal distribution
        bimodal = np.concatenate([self.dist_norm1, self.dist_norm2])
        bimodal = bimodal[:, np.newaxis]
        x, y = _density(bimodal)
        obs_x, obs_y = Analyze.first_hill(x, y)
        self.assertAlmostEqual(obs_x, 1.0971012583068704)
        self.assertAlmostEqual(obs_y, 2.5302323352207674)

        # peak larger than valley (same data, negated)
        x, y = _density(np.negative(bimodal))
        with self.assertRaises(ValueError) as ctx:
            Analyze.first_hill(x, y)
        self.assertEqual(str(ctx.exception), 'Peak is larger than valley.')

        # unimodal distribution
        x, y = _density(self.dist_norm1[:, np.newaxis])
        with self.assertRaises(ValueError) as ctx:
            Analyze.first_hill(x, y)
        self.assertEqual(str(ctx.exception),
                         'Cannot identify at least two peaks.')
示例#7
0
    def test_read_search_results(self):
        """Test `Analyze.read_search_results` on the DnaK sample file."""
        fp = join(self.datadir, 'DnaK', 'search', 'sample.tsv')
        proteins = Analyze.read_search_results(fp)

        # the file holds a single protein record
        self.assertEqual(len(proteins), 1)
        protein = proteins[0]
        self.assertEqual(protein['id'], 'WP_000516135.1')
        self.assertAlmostEqual(protein['score'], 1092.8)

        # its hit table is indexed by hit identifier
        hits = protein['hits']
        self.assertTupleEqual(hits.shape, (12, 5))
        self.assertEqual(hits.iloc[2].name, 'NP_454622.1')
        self.assertAlmostEqual(hits['evalue']['NP_230502.1'], 5.9e-282)
        self.assertEqual(hits['taxid']['NP_384288.1'], '266834')

        # maximum number of hits
        proteins = Analyze.read_search_results(fp, 5)
        self.assertEqual(len(proteins[0]['hits']), 5)
示例#8
0
    def test___call__(self):
        """Call an `Analyze` instance on the Ecoli sample and count rows
        flagged in the `hgt` column, for two bandwidth settings."""
        analyzer = Analyze()

        # a bare function object serves as a mutable argument namespace
        def args():
            return None

        # run Ecoli sample using the Silverman method
        settings = (
            ('input', join(self.datadir, 'Ecoli', 'search')),
            ('output', join(self.tmpdir, 'output')),
            ('taxdump', join(self.datadir, 'Ecoli', 'taxdump')),
            ('input_tax', None),
            ('self_tax', None),
            ('close_tax', None),
            ('self_rank', None),
            ('close_size', None),
            ('distal_top', None),
            ('bandwidth', 'silverman'),
            ('from_scores', False),
        )
        for key, value in settings:
            setattr(args, key, value)
        analyzer(args)
        n_flagged = analyzer.df[analyzer.df['hgt']].shape[0]
        self.assertEqual(n_flagged, 16)

        # use existing score table, run grid search
        args.input = None
        args.from_scores = True
        args.bandwidth = 'grid'
        analyzer(args)
        n_flagged = analyzer.df[analyzer.df['hgt']].shape[0]
        self.assertEqual(n_flagged, 18)
        rmtree(args.output)
示例#9
0
    def test_calc_scores(self):
        """Test group assignment, group scores and match from `calc_scores`."""
        columns = ('id', 'taxid', 'score')

        # helper for making hit table from (id, taxid, score) rows
        def _hits_df(*rows):
            return pd.DataFrame(rows, columns=columns).set_index('id')

        analyzer = Analyze()
        analyzer.taxdump = taxdump_from_text(taxdump_proteo)
        add_children(analyzer.taxdump)
        analyzer.groups = {
            'self': {'561', '562', '585056'},
            'close': {'543', '91347', '1236'},
        }
        analyzer.data = {
            'S1': [
                {'score': 100,
                 'hits': _hits_df(('P1', '561', 100), ('P2', '562', 95))},
                {'score': 90,
                 'hits': _hits_df(('P3', '561', 81), ('P4', '543', 72))},
            ],
            'S2': [
                {'score': 96,
                 'hits': _hits_df(('P5', '561', 90), ('P6', '543', 84),
                                  ('P7', '620', 66))},
            ],
        }
        analyzer.weighted = True
        analyzer.match_th = 0.9
        analyzer.calc_scores()

        # (sample, protein index, hit groups, group scores, match)
        score_keys = ('self', 'close', 'distal')
        expected = (
            ('S1', 0, ['self', 'self'], [1.95, 0.0, 0.0], '0'),
            ('S1', 1, ['self', 'close'], [0.9, 0.8, 0.0], '0'),
            ('S2', 0, ['self', 'close', 'distal'],
             [0.9375, 0.875, 0.6875], '620'),
        )
        for sample, idx, hit_groups, scores, match in expected:
            # fetch the record only after scoring, as results live in it
            prot = analyzer.data[sample][idx]
            self.assertListEqual(prot['hits']['group'].tolist(), hit_groups)
            self.assertListEqual([prot[key] for key in score_keys], scores)
            self.assertEqual(prot['match'], match)
示例#10
0
    def test_grid_kde(self):
        """Test the bandwidth chosen by `Analyze.grid_kde`.

        Covers unimodal and bimodal samples, a sample with very few points,
        a constant sample, and a sample too small for the search, which must
        raise `ValueError`.
        """
        estimator = KernelDensity(kernel='gaussian')

        # unimodal
        data = self.dist_gamma[:, np.newaxis]
        obs = Analyze.grid_kde(data, estimator, 10).bandwidth
        self.assertAlmostEqual(obs, 0.774263682681127)

        # bimodal
        data = np.concatenate([self.dist_norm1, self.dist_norm2])
        data = data[:, np.newaxis]
        obs = Analyze.grid_kde(data, estimator, 10).bandwidth
        self.assertAlmostEqual(obs, 0.46415888336127786)

        # very few data points (bw = high end)
        # (this case used to be run twice in a row; once is enough)
        data = np.array([1, 2, 3, 4, 5])[:, np.newaxis]
        obs = Analyze.grid_kde(data, estimator, 5).bandwidth
        self.assertAlmostEqual(obs, 1.0)

        # constant values (bw = low end)
        data = np.array([1, 1, 1, 1, 1])[:, np.newaxis]
        obs = Analyze.grid_kde(data, estimator, 5).bandwidth
        self.assertAlmostEqual(obs, 0.1)

        # too few data points (less than splits)
        data = np.array([1, 2, 3])[:, np.newaxis]
        with self.assertRaises(ValueError) as ctx:
            Analyze.grid_kde(data, estimator, 5)
        msg = 'Cannot perform grid search on 3 data point(s).'
        self.assertEqual(str(ctx.exception), msg)
示例#11
0
    def test_silverman_bw(self):
        """Test `Analyze.silverman_bw` against reference bandwidths."""
        bimodal = np.concatenate([self.dist_norm1, self.dist_norm2])

        # (data, expected bandwidth)
        cases = (
            (self.dist_gamma, 0.6148288686346546),  # unimodal
            (self.dist_lognorm, 0.2384666552244172),  # unimodal
            (bimodal, 0.48713295460585126),  # bimodal
            ([1, 1, 1, 1, 1], 0.652301697309926),  # constant values
            ([1, 3, 3, 3, 5], 0.9224939070946869),  # IQR = 0
        )
        for data, expected in cases:
            self.assertAlmostEqual(Analyze.silverman_bw(data), expected)

        # one element
        with self.assertRaises(ValueError) as ctx:
            Analyze.silverman_bw([5])
        self.assertEqual(str(ctx.exception),
                         'Cannot calculate bandwidth on 1 data point.')
示例#12
0
 def test_plot_hgts(self):
     """Check that `plot_hgts` leaves scatter.png in the output directory."""
     scores = np.array([self.dist_gamma, self.dist_lognorm[:800]]).T
     analyzer = Analyze()
     analyzer.output = self.tmpdir
     analyzer.df = pd.DataFrame(scores, columns=['close', 'distal'])
     # flag rows with low close score and high distal score
     low_close = analyzer.df['close'] < 2
     high_distal = analyzer.df['distal'] > 2
     analyzer.df['hgt'] = low_close & high_distal
     analyzer.plot_hgts()
     out_fp = join(self.tmpdir, 'scatter.png')
     self.assertTrue(isfile(out_fp))
     remove(out_fp)
示例#13
0
 def test_density_func(self):
     """Compare `Analyze.density_func` output with reference x and y arrays."""
     sample = self.dist_norm1[:, np.newaxis]
     kde = KernelDensity(kernel='gaussian', bandwidth=0.5).fit(sample)
     exp_x = np.array([
         -1.48253468, 0.0939095, 1.67035369, 3.24679787, 4.82324206,
         6.39968624, 7.97613043, 9.55257461, 11.1290188, 12.70546298
     ])
     exp_y = np.array([
         0.00104342, 0.00788705, 0.0496806, 0.13173376, 0.19176352,
         0.15754466, 0.06992292, 0.02140856, 0.00150463, 0.00053637
     ])
     obs = Analyze.density_func(sample, kde, 10)
     np.testing.assert_array_almost_equal(obs, (exp_x, exp_y))
示例#14
0
 def test_calc_cluster_props(self):
     """Test values returned by `calc_cluster_props` and the `silh` column."""
     scores = np.array([self.dist_gamma, self.dist_lognorm[:800]]).T
     analyzer = Analyze()
     analyzer.self_low = False
     analyzer.df = pd.DataFrame(scores, columns=['close', 'distal'])
     low_close = analyzer.df['close'] < 2
     high_distal = analyzer.df['distal'] > 2
     analyzer.df['hgt'] = low_close & high_distal

     # first two elements of the returned value
     props = analyzer.calc_cluster_props()
     self.assertAlmostEqual(props[0], 1.094658052928843)
     self.assertAlmostEqual(props[1], 4.30076698399293)

     # the call is expected to add a 'silh' column to the table
     stats = analyzer.df['silh'].describe()
     self.assertAlmostEqual(stats['mean'], 0.312495082044277)
     self.assertAlmostEqual(stats['std'], 0.21945541659155993)
     n_rows = analyzer.df.query('hgt & silh < 0.5').shape[0]
     self.assertEqual(n_rows, 35)
示例#15
0
 def test_make_score_table(self):
     """Check the table built and the file written by `make_score_table`."""

     def _protein(pid, length, match, scores, n_hits):
         # one protein record; `scores` is (self, close, distal)
         self_, close, distal = scores
         return {
             'id': pid,
             'length': length,
             'match': match,
             'self': self_,
             'close': close,
             'distal': distal,
             'hits': pd.DataFrame([0] * n_hits),
         }

     analyzer = Analyze()
     analyzer.output = self.tmpdir
     analyzer.data = {
         'S1': [
             _protein('P1', 100, '0', (1.5, 0.75, 0.0), 3),
             _protein('P2', 120, '1224', (1.625, 0.225, 0.375), 5),
         ],
         'S2': [
             _protein('P1', 225, '620', (2.35, 1.05, 0.75), 6),
         ],
     }
     analyzer.make_score_table()

     # in-memory table
     rows = [['S1', 'P1', 100, 3, 1.5, 0.75, 0, '0'],
             ['S1', 'P2', 120, 5, 1.625, 0.225, 0.375, '1224'],
             ['S2', 'P1', 225, 6, 2.35, 1.05, 0.75, '620']]
     self.assertListEqual(analyzer.df.values.tolist(), rows)

     # file on disk, skipping its first line
     fp = join(self.tmpdir, 'scores.tsv')
     with open(fp, 'r') as f:
         lines = f.read().splitlines()[1:]
     written = [line.split('\t') for line in lines]
     as_text = [[str(value) for value in row] for row in rows]
     self.assertListEqual(written, as_text)
     remove(fp)
示例#16
0
 def test_relevant_groups(self):
     """Check the groups `relevant_groups` returns for each `self_low`."""
     analyzer = Analyze()
     # (self_low setting, expected groups)
     cases = (
         (False, ['close', 'distal']),
         (True, ['self', 'close', 'distal']),
     )
     for self_low, groups in cases:
         analyzer.self_low = self_low
         self.assertListEqual(analyzer.relevant_groups(), groups)
示例#17
0
    def test_smart_kde(self):
        """Test the threshold returned by `smart_kde` and the plot it saves."""
        analyzer = Analyze()

        # typical case (bimodal distribution)
        bimodal = np.concatenate([self.dist_norm1, self.dist_norm2])
        analyzer.df = pd.Series(bimodal, name='group').to_frame()
        analyzer.bw_steps = 10
        analyzer.noise = 50
        analyzer.low_part = 75
        analyzer.output = self.tmpdir
        threshold = analyzer.smart_kde('group')
        self.assertAlmostEqual(threshold, 2.1903958075763343)
        plot_fp = join(self.tmpdir, 'group.kde.png')
        self.assertTrue(isfile(plot_fp))
        remove(plot_fp)

        # unable to determine threshold
        analyzer.low_part = 0.001
        analyzer.df = pd.Series(self.dist_norm1, name='group').to_frame()
        self.assertEqual(analyzer.smart_kde('group'), 0)
示例#18
0
    def test_assign_taxonomy(self):
        """Test `assign_taxonomy` with user-given and auto-inferred taxonomy.

        The scenarios run in sequence and partly reuse the same `Analyze`
        instance, so their order matters. The taxonomy database is reloaded
        from `taxdump_proteo` before each call.
        """
        # input are two genomes with defined taxonomy
        me = Analyze()
        me.input_tax = 'S1:561,S2:620'  # Escherichia and Shigella
        me.data = {}
        me.taxdump = taxdump_from_text(taxdump_proteo)
        me.assign_taxonomy()
        # test input taxonomy extraction
        # (the 'sample:taxid' string is parsed into a dict)
        self.assertDictEqual(me.input_tax, {'S1': '561', 'S2': '620'})
        # test taxonomy refinement
        # (only these TaxIDs are expected to remain in the database)
        exp = {
            '1', '131567', '2', '1224', '1236', '91347', '543', '561', '620'
        }
        self.assertSetEqual(set(me.taxdump.keys()), exp)
        # test LCA discovery
        self.assertEqual(me.lca, '543')

        # helper for making hit table
        # (hit name -> TaxID mapping becomes a one-column 'taxid' table)
        def _hits_df(d):
            return pd.Series(d, name='taxid', dtype=object).to_frame()

        # input is one genome with defined taxonomy
        me = Analyze()
        me.data = {'S1': [{'hits': pd.DataFrame(columns=['taxid'])}]}
        me.input_tax = '561'  # Escherichia
        me.taxdump = taxdump_from_text(taxdump_proteo)
        me.assign_taxonomy()
        # the bare TaxID is assigned to the only sample
        self.assertDictEqual(me.input_tax, {'S1': '561'})

        # input taxonomy not found in database
        me.input_tax = '1234'
        me.taxdump = taxdump_from_text(taxdump_proteo)
        with self.assertRaises(ValueError) as ctx:
            me.assign_taxonomy()
        msg = 'TaxID 1234 is not present in taxonomy database.'
        self.assertEqual(str(ctx.exception), msg)

        # input are two genome whose taxonomies are to be inferred based on
        # search results
        me = Analyze()
        me.input_tax = None
        me.data = {
            'S1': [{
                'hits': _hits_df({
                    'P1': '561',
                    'P2': '562'
                })
            }, {
                'hits': _hits_df({
                    'P3': '543',
                    'P4': '561'
                })
            }],
            'S2': [{
                'hits': _hits_df({
                    'P5': '562',
                    'P6': '585056'
                })
            }, {
                'hits': _hits_df({
                    'P7': '561',
                    'P8': '1038927'
                })
            }, {
                'hits': _hits_df({'P9': '2580236'})
            }]
        }
        me.input_cov = 75
        me.taxdump = taxdump_from_text(taxdump_proteo)
        me.assign_taxonomy()
        self.assertDictEqual(me.input_tax, {'S1': '543', 'S2': '561'})
        self.assertEqual(me.lca, '543')

        # cannot auto-infer taxonomy
        # (a third sample is added whose only protein has an empty hit table)
        me.data['S3'] = [{'hits': _hits_df({})}]
        me.taxdump = taxdump_from_text(taxdump_proteo)
        with self.assertRaises(ValueError) as ctx:
            me.assign_taxonomy()
        msg = 'Cannot auto-infer taxonomy for S3. Please specify manually.'
        self.assertEqual(str(ctx.exception), msg)

        # invalid input taxonomy string
        # NOTE(review): a bare TaxID was accepted above with one sample; here
        # three samples are loaded, which presumably makes it ambiguous --
        # confirm against assign_taxonomy
        me.input_tax = '561'
        with self.assertRaises(ValueError) as ctx:
            me.assign_taxonomy()
        msg = 'Invalid input taxonomy format.'
        self.assertEqual(str(ctx.exception), msg)
示例#19
0
 def test_plot_hist(self):
     """Check that `Analyze.plot_hist` writes the requested image file."""
     out_fp = join(self.tmpdir, 'tmp.png')
     Analyze.plot_hist(self.dist_gamma, out_fp)
     created = isfile(out_fp)
     self.assertTrue(created)
     remove(out_fp)
示例#20
0
    def test_define_groups(self):
        """Test `define_groups` with user-defined and auto-inferred groups.

        In both scenarios the test checks the resulting `self_tax` and
        `close_tax` lists and the TaxID sets stored under `groups['self']`
        and `groups['close']`.
        """
        me = Analyze()
        me.taxdump = taxdump_from_text(taxdump_proteo)
        add_children(me.taxdump)
        # start from empty groups (this reset used to be done twice in a row)
        me.groups = {}

        # user defined groups:
        # self: genera Escherichia and Shigella
        # close: family Enterobacteriaceae
        me.self_tax = '561,620'
        me.close_tax = '543'
        me.define_groups()
        # comma-separated strings are split into lists of TaxIDs
        self.assertListEqual(me.self_tax, ['561', '620'])
        exp = {'561', '562', '585056', '1038927', '2580236', '620', '622'}
        self.assertSetEqual(me.groups['self'], exp)
        self.assertListEqual(me.close_tax, ['543'])
        # the expected close group excludes every member of the self group
        exp = {'543', '548', '570'}
        self.assertSetEqual(me.groups['close'], exp)

        # auto-infer groups
        me.self_tax = {}
        me.close_tax = {}
        me.lca = '562'  # all inputs are E. coli
        me.self_rank = 'genus'  # but we want to raise self to genus
        me.close_size = 2  # close group must be this big or bigger
        me.define_groups()
        self.assertListEqual(me.self_tax, ['561'])
        exp = {'561', '562', '585056', '1038927', '2580236'}
        self.assertSetEqual(me.groups['self'], exp)
        self.assertListEqual(me.close_tax, ['543'])
        exp = {'543', '548', '570', '620', '622'}
        self.assertSetEqual(me.groups['close'], exp)
示例#21
0
    def test_infer_self_group(self):
        """Test `infer_self_group` for several LCA / rank combinations."""
        analyzer = Analyze()
        analyzer.taxdump = taxdump_from_text(taxdump_proteo)
        add_children(analyzer.taxdump)

        # (LCA, self rank, expected self group)
        cases = (
            # assign to LCA of all genomes (E. coli)
            ('562', None, ['562']),
            # raise LCA to genus level (Escherichia)
            ('562', 'genus', ['561']),
            # LCA (Enterobacteriaceae) is already above designated rank
            # (genus)
            ('543', 'genus', ['543']),
        )
        for lca, rank, expected in cases:
            analyzer.self_tax = None
            analyzer.lca = lca
            analyzer.self_rank = rank
            analyzer.infer_self_group()
            self.assertListEqual(analyzer.self_tax, expected)
示例#22
0
    def test_infer_close_group(self):
        """Test `infer_close_group` with and without a minimum group size."""
        analyzer = Analyze()
        analyzer.taxdump = taxdump_from_text(taxdump_proteo)
        add_children(analyzer.taxdump)
        analyzer.groups = {}

        def _with_descendants(taxids):
            # pool each TaxID together with its descendants into one set
            pooled = set()
            for taxid in taxids:
                pooled.update(
                    [taxid] + get_descendants(taxid, analyzer.taxdump))
            return pooled

        # close group is parent of LCA of self group
        analyzer.self_tax = ['562']  # E. coli
        analyzer.groups['self'] = _with_descendants(analyzer.self_tax)
        analyzer.close_tax = None
        analyzer.close_size = None
        analyzer.infer_close_group()
        self.assertListEqual(analyzer.close_tax, ['561'])  # Escherichia
        self.assertSetEqual(analyzer.groups['close'], {'561', '2580236'})

        # close group must have at least 5 taxa
        analyzer.close_tax = None
        analyzer.groups['close'] = None
        analyzer.close_size = 5
        analyzer.infer_close_group()
        # Enterobacteriaceae
        self.assertListEqual(analyzer.close_tax, ['543'])
        expected = {'543', '620', '622', '570', '548', '561', '2580236'}
        self.assertSetEqual(analyzer.groups['close'], expected)

        # close group is LCA of multiple self groups
        analyzer.self_tax = ['561', '620']  # Escherichia and Shigella
        analyzer.groups['self'] = _with_descendants(analyzer.self_tax)
        analyzer.close_tax = None
        analyzer.groups['close'] = None
        analyzer.close_size = None
        analyzer.infer_close_group()
        # Enterobacteriaceae
        self.assertListEqual(analyzer.close_tax, ['543'])
        self.assertSetEqual(analyzer.groups['close'], {'543', '570', '548'})
示例#23
0
    def test_perform_kde(self):
        """Test the bandwidth reported by `perform_kde` for each setting."""
        analyzer = Analyze()
        analyzer.bw_steps = 10
        sample = np.concatenate([self.dist_norm1, self.dist_norm2])

        # (bandwidth setting, expected third element of the result)
        cases = (
            ('grid', 0.21544346900318834),  # grid search
            ('silverman', 0.48713295460585126),  # Silverman's rule-of-thumb
            (0.5, 0.5),  # fixed value
        )
        for setting, expected in cases:
            analyzer.bandwidth = setting
            self.assertAlmostEqual(analyzer.perform_kde(sample)[2], expected)

        # invalid bandwidth
        analyzer.bandwidth = 100
        with self.assertRaises(ValueError) as ctx:
            analyzer.perform_kde(sample)
        self.assertEqual(str(ctx.exception), 'Invalid bandwidth: 100.')
示例#24
0
    def test_set_parameters(self):
        """Test how `set_parameters` handles input paths and thresholds.

        The scenarios share one `args` object and one `Analyze` instance and
        create / remove the output directory on disk, so their order matters.
        """
        me = Analyze()
        me.cfg = load_configs()

        # a bare function object serves as a mutable argument namespace
        def args():
            return None

        # input is file
        infile = join(self.datadir, 'DnaK', 'search', 'sample.tsv')
        outdir = join(self.tmpdir, 'output')
        args.input = infile
        args.output = outdir
        args.noise = 0.75
        me.set_parameters(args)
        self.assertEqual(me.input, infile)
        self.assertEqual(me.output, outdir)
        # the output directory is expected to exist after the call
        self.assertTrue(isdir(outdir))
        # the file is mapped under its stem, 'sample'
        self.assertDictEqual(me.input_map, {'sample': infile})
        # the fraction 0.75 is expected to be stored as 75
        self.assertEqual(me.noise, 75)

        # coverage threshold too small
        args.input_cov = 25
        with self.assertRaises(ValueError) as ctx:
            me.set_parameters(args)
        # the message must match the program's text verbatim
        msg = 'Taxonomy coverage for auto-interence must be at least 50%.'
        self.assertEqual(str(ctx.exception), msg)
        # restore a valid coverage for the remaining scenarios
        args.input_cov = 75

        # input is directory
        indir = join(self.datadir, 'DnaK', 'search')
        args.input = indir
        me.set_parameters(args)
        self.assertEqual(me.input, indir)
        self.assertDictEqual(me.input_map, {'sample': infile})
        rmtree(outdir)

        # input is invalid
        not_path = 'I am not a path'
        args.input = not_path
        with self.assertRaises(ValueError) as ctx:
            me.set_parameters(args)
        msg = f'Invalid input data file or directory: {not_path}.'
        self.assertEqual(str(ctx.exception), msg)

        # input has no search result
        args.input = self.tmpdir
        with self.assertRaises(ValueError) as ctx:
            me.set_parameters(args)
        msg = f'No input data are found under: {self.tmpdir}.'
        self.assertEqual(str(ctx.exception), msg)

        # no input (which is okay)
        # drop the map left by earlier calls to check it is not recreated
        delattr(me, 'input_map')
        args.input = None
        me.set_parameters(args)
        self.assertFalse(hasattr(me, 'input_map'))
示例#25
0
    def test_cluster_kde(self):
        """Test the threshold returned by `cluster_kde` for each bandwidth."""
        analyzer = Analyze()
        bimodal = np.concatenate([self.dist_norm1, self.dist_norm2])
        analyzer.df = pd.Series(bimodal, name='group').to_frame()
        analyzer.bw_steps = 10
        analyzer.noise = 50
        analyzer.low_part = 75
        analyzer.output = self.tmpdir

        # (bandwidth setting, expected threshold)
        cases = (
            ('grid', 1.855525575742988),  # grid search
            ('silverman', 2.2279977615745703),  # Silverman's rule-of-thumb
            (0.5, 2.2507008281395433),  # fixed value
            ('auto', 2.1903958075763343),  # smart KDE
        )
        for setting, expected in cases:
            analyzer.bandwidth = setting
            self.assertAlmostEqual(analyzer.cluster_kde('group'), expected)

        # clean up
        remove(join(self.tmpdir, 'group.kde.png'))

        # cannot find threshold (unimodal distribution)
        analyzer.df = pd.Series(self.dist_norm1, name='group').to_frame()
        analyzer.bandwidth = 'silverman'
        self.assertEqual(analyzer.cluster_kde('group'), 0)
示例#26
0
    def test_predict_hgt(self):
        """Test `predict_hgt` on a randomly resampled score table.

        Checks the returned count and the files left in the output
        directory, then repeats the prediction with a constant 'close'
        column.
        """
        me = Analyze()

        # populate score table
        # NOTE(review): values are drawn with np.random.choice and no seed is
        # set here, yet an exact count is asserted below -- presumably the
        # seed is fixed elsewhere (e.g. in setUp); confirm. The order of the
        # draws must not change, as it determines the sampled values.
        n = 1000
        data = {
            'sample': ['S1'] * n,
            'protein': [f'P{x}' for x in range(n)],
            'self':
            np.random.choice(self.dist_gamma, n),
            # first half: norm1 values divided by 3; second half: norm2
            'close':
            np.concatenate((np.random.choice(self.dist_norm1, int(n / 2)) / 3,
                            np.random.choice(self.dist_norm2, int(n / 2)))),
            # first three quarters: lognorm; last quarter: halved gamma
            'distal':
            np.concatenate((np.random.choice(self.dist_lognorm,
                                             int(n * 3 / 4)),
                            np.random.choice(self.dist_gamma, int(n / 4)) / 2))
        }
        me.df = pd.DataFrame(data)

        # default setting
        me.output = self.tmpdir
        me.self_low = False
        me.bandwidth = 'auto'
        me.bw_steps = 20
        me.low_part = 75
        me.fixed = 25
        me.noise = 50
        me.silhouette = 0.5

        # run prediction
        self.assertEqual(me.predict_hgt(), 96)
        groups = ['self', 'close', 'distal']
        # histograms are checked for 'close' and 'distal' only
        for group in groups[1:]:
            fp = join(self.tmpdir, f'{group}.hist.png')
            self.assertTrue(isfile(fp))
            remove(fp)
        fp = join(self.tmpdir, 'scatter.png')
        self.assertTrue(isfile(fp))
        remove(fp)
        # one list file per sample is expected under 'hgts'
        fp = join(self.tmpdir, 'hgts')
        self.assertTrue(isfile(join(fp, 'S1.txt')))
        rmtree(fp)

        # constant values
        me.df['close'] = 1
        # discard the column added by the first run
        me.df.drop('hgt', axis=1, inplace=True)
        self.assertEqual(me.predict_hgt(), 0)
        # no 'hgt' column is expected when nothing is predicted
        self.assertNotIn('hgt', me.df.columns)
        remove(join(self.tmpdir, 'close.hist.png'))
示例#27
0
    def test_remove_outliers(self):
        """Test the number of rows `remove_outliers` keeps per method."""
        analyzer = Analyze()
        analyzer.self_low = False
        scores = pd.DataFrame(
            np.array([self.dist_gamma, self.dist_lognorm[:800]]).T,
            columns=['close', 'distal'])

        # (outlier method, expected number of remaining rows)
        cases = (
            ('zscore', 781),  # Z-score
            ('boxplot', 710),  # boxplot
        )
        for method, n_kept in cases:
            # each method starts from a fresh copy of the table
            analyzer.df = scores.copy()
            analyzer.outliers = method
            analyzer.remove_outliers()
            self.assertEqual(analyzer.df.shape[0], n_kept)
示例#28
0
    def test_refine_cluster(self):
        """Test how many rows stay flagged in `hgt` after `refine_cluster`."""
        analyzer = Analyze()

        def _flagged_after_refinement(columns, names):
            # build the score table, flag rows, refine, count flagged rows
            analyzer.df = pd.DataFrame(np.array(columns).T, columns=names)
            low_close = analyzer.df['close'] < 2
            high_distal = analyzer.df['distal'] > 2
            analyzer.df['hgt'] = low_close & high_distal
            analyzer.refine_cluster(analyzer.calc_cluster_props())
            return analyzer.df[analyzer.df['hgt']].shape[0]

        # only close and distal
        analyzer.self_low = False
        analyzer.silhouette = 0.5
        n_flagged = _flagged_after_refinement(
            [self.dist_gamma, self.dist_lognorm[:800]],
            ['close', 'distal'])
        self.assertEqual(n_flagged, 11)

        # all three groups
        analyzer.self_low = True
        n_flagged = _flagged_after_refinement(
            [self.dist_norm1[:800], self.dist_gamma,
             self.dist_lognorm[:800]],
            ['self', 'close', 'distal'])
        self.assertEqual(n_flagged, 4)
示例#29
0
 def test_outliers_boxplot(self):
     """Test the row count of the table `outliers_boxplot` returns."""
     columns = ['close', 'distal']
     scores = np.array([self.dist_gamma, self.dist_lognorm[:800]]).T
     table = pd.DataFrame(scores, columns=columns)
     result = Analyze.outliers_boxplot(table, ['close', 'distal'])
     self.assertEqual(result.shape[0], 710)
示例#30
0
    def test_read_input(self):
        """Test `read_input` with the taxonomy database in various places.

        The scenarios copy and move files inside the temporary directory and
        build on each other, so their order matters.
        """
        me = Analyze()

        # shared checks on the loaded taxonomy and search result
        def batch_assert():
            self.assertEqual(len(me.taxdump), 76)
            self.assertEqual(me.data['sample'][0]['id'], 'WP_000516135.1')
            self.assertEqual(me.data['sample'][0]['hits'].shape, (12, 5))

        # DnaK - default mode
        # (taxonomy directory is given explicitly)
        me.taxdump = join(self.datadir, 'DnaK', 'taxdump')
        me.input_map = {
            'sample': join(self.datadir, 'DnaK', 'search', 'sample.tsv')
        }
        me.read_input()
        batch_assert()

        # missing taxonomy
        # (the search result is copied to a directory without taxonomy files)
        copy(join(self.datadir, 'DnaK', 'search', 'sample.tsv'),
             join(self.tmpdir, 'sample.tsv'))
        me.input = self.tmpdir
        me.taxdump = None
        with self.assertRaises(ValueError) as ctx:
            me.read_input()
        msg = 'Missing taxonomy database.'
        self.assertEqual(str(ctx.exception), msg)

        # taxonomy in same directory as search result
        copy(join(self.datadir, 'DnaK', 'taxdump', 'nodes.dmp'),
             join(self.tmpdir, 'nodes.dmp'))
        copy(join(self.datadir, 'DnaK', 'taxdump', 'names.dmp'),
             join(self.tmpdir, 'names.dmp'))
        me.input_map = {'sample': join(self.tmpdir, 'sample.tsv')}
        me.read_input()
        batch_assert()

        # taxonomy in parent directory as search result
        # (the search result moves into a subdirectory; the .dmp files stay)
        indir = join(self.tmpdir, 'search')
        makedirs(indir)
        move(join(self.tmpdir, 'sample.tsv'), join(indir, 'sample.tsv'))
        me.input = indir
        me.input_map = {'sample': join(indir, 'sample.tsv')}
        me.taxdump = None
        me.read_input()
        batch_assert()
        # clean up everything this test placed in the temporary directory
        rmtree(indir)
        remove(join(self.tmpdir, 'nodes.dmp'))
        remove(join(self.tmpdir, 'names.dmp'))