예제 #1
0
 def test_plot_hgts(self):
     """Check that plot_hgts() writes scatter.png to the output directory."""
     analyzer = Analyze()
     analyzer.output = self.tmpdir

     # two-column score table built from the dist_gamma fixture and the
     # first 800 values of the dist_lognorm fixture
     scores = np.array([self.dist_gamma, self.dist_lognorm[:800]]).T
     analyzer.df = pd.DataFrame(scores, columns=['close', 'distal'])

     # flag rows with a low "close" score and a high "distal" score
     table = analyzer.df
     low_close = table['close'] < 2
     high_distal = table['distal'] > 2
     table['hgt'] = low_close & high_distal

     analyzer.plot_hgts()

     # the plot must exist; delete it so the directory stays clean
     scatter = join(self.tmpdir, 'scatter.png')
     self.assertTrue(isfile(scatter))
     remove(scatter)
예제 #2
0
    def test_predict_hgt(self):
        """Run predict_hgt() on a synthetic score table and check outputs.

        Covers a default-settings run (expected return value 96 plus the
        histogram, scatter plot and per-sample list files) and a run where
        the "close" column is constant (expected return value 0 and no
        "hgt" column).
        """
        analyzer = Analyze()

        # populate score table; the random draws are made in column order
        # (self, close, distal), the same order as the table is laid out
        size = 1000
        half = size // 2
        quarter = size // 4
        self_scores = np.random.choice(self.dist_gamma, size)
        close_scores = np.concatenate((
            np.random.choice(self.dist_norm1, half) / 3,
            np.random.choice(self.dist_norm2, half)))
        distal_scores = np.concatenate((
            np.random.choice(self.dist_lognorm, size * 3 // 4),
            np.random.choice(self.dist_gamma, quarter) / 2))
        analyzer.df = pd.DataFrame({
            'sample': ['S1'] * size,
            'protein': [f'P{idx}' for idx in range(size)],
            'self': self_scores,
            'close': close_scores,
            'distal': distal_scores,
            'match': ['0'] * size,
        })

        # default setting
        settings = {
            'output': self.tmpdir,
            'self_low': False,
            'bandwidth': 'auto',
            'bw_steps': 20,
            'low_part': 75,
            'fixed': 25,
            'noise': 50,
            'silhouette': 0.5,
            'taxdump': {},
            'donor_name': False,
            'donor_rank': None,
        }
        for attr, value in settings.items():
            setattr(analyzer, attr, value)

        # run prediction
        self.assertEqual(analyzer.predict_hgt(), 96)

        # histograms are expected for the close and distal groups only
        for group in ('close', 'distal'):
            hist = join(self.tmpdir, f'{group}.hist.png')
            self.assertTrue(isfile(hist))
            remove(hist)

        scatter = join(self.tmpdir, 'scatter.png')
        self.assertTrue(isfile(scatter))
        remove(scatter)

        hgt_dir = join(self.tmpdir, 'hgts')
        self.assertTrue(isfile(join(hgt_dir, 'S1.txt')))
        rmtree(hgt_dir)

        # constant values
        analyzer.df['close'] = 1
        analyzer.df.drop('hgt', axis=1, inplace=True)
        self.assertEqual(analyzer.predict_hgt(), 0)
        self.assertNotIn('hgt', analyzer.df.columns)
        remove(join(self.tmpdir, 'close.hist.png'))
예제 #3
0
    def test_write_hgt_list(self):
        """Check the per-sample list files written by write_hgt_list().

        Covers the default output, number formatting with a non-HGT row
        left out, raising the donor to family rank, and reporting the
        donor by taxon name.
        """
        analyzer = Analyze()
        analyzer.output = self.tmpdir
        hgt_dir = join(analyzer.output, 'hgts')
        makedirs(hgt_dir, exist_ok=True)
        analyzer.donor_name = False
        analyzer.donor_rank = None
        analyzer.taxdump = taxdump_from_text(taxdump_proteo)
        add_children(analyzer.taxdump)

        rows = [
            ['S1', 'P1', 0.85, '562', True],
            ['S1', 'P2', 0.95, '622', True],
            ['S1', 'P3', 1.05, '0', True],
            ['S2', 'P4', 0.80, '766', True],
            ['S2', 'P5', 0.20, '0', False],
        ]
        analyzer.df = pd.DataFrame(
            rows, columns=['sample', 'protein', 'silh', 'match', 'hgt'])

        def written(sample):
            """Write the list for one sample and return the file content."""
            analyzer.write_hgt_list(sample)
            with open(join(hgt_dir, f'{sample}.txt'), 'r') as handle:
                return handle.read()

        # default
        self.assertEqual(
            written('S1'),
            'P1\t0.85\t562\n'
            'P2\t0.95\t622\n'
            'P3\t1.05\t0\n')

        # number format and negative result
        self.assertEqual(written('S2'), 'P4\t0.8\t766\n')

        # raise to family
        analyzer.donor_rank = 'family'
        self.assertEqual(
            written('S1'),
            'P1\t0.85\t543\n'
            'P2\t0.95\t543\n'
            'P3\t1.05\t0\n')

        # report taxon name
        analyzer.donor_rank = None
        analyzer.donor_name = True
        self.assertEqual(
            written('S1'),
            'P1\t0.85\tEscherichia coli\n'
            'P2\t0.95\tShigella dysenteriae\n'
            'P3\t1.05\tN/A\n')

        rmtree(hgt_dir)
예제 #4
0
 def test_make_score_table(self):
     """Check the table built by make_score_table() and its scores.tsv."""
     analyzer = Analyze()
     analyzer.output = self.tmpdir

     # per-sample protein records; "hits" only matters through its length
     s1_p1 = {
         'id': 'P1',
         'length': 100,
         'match': '0',
         'self': 1.5,
         'close': 0.75,
         'distal': 0.0,
         'hits': pd.DataFrame([0] * 3),
     }
     s1_p2 = {
         'id': 'P2',
         'length': 120,
         'match': '1224',
         'self': 1.625,
         'close': 0.225,
         'distal': 0.375,
         'hits': pd.DataFrame([0] * 5),
     }
     s2_p1 = {
         'id': 'P1',
         'length': 225,
         'match': '620',
         'self': 2.35,
         'close': 1.05,
         'distal': 0.75,
         'hits': pd.DataFrame([0] * 6),
     }
     analyzer.data = {'S1': [s1_p1, s1_p2], 'S2': [s2_p1]}

     analyzer.make_score_table()

     # in-memory table
     expected = [
         ['S1', 'P1', 100, 3, 1.5, 0.75, 0, '0'],
         ['S1', 'P2', 120, 5, 1.625, 0.225, 0.375, '1224'],
         ['S2', 'P1', 225, 6, 2.35, 1.05, 0.75, '620'],
     ]
     self.assertListEqual(analyzer.df.values.tolist(), expected)

     # table written to disk: skip the header line, compare cells as text
     table_path = join(self.tmpdir, 'scores.tsv')
     with open(table_path, 'r') as handle:
         body = handle.read().splitlines()[1:]
     observed = [line.split('\t') for line in body]
     expected_text = [list(map(str, row)) for row in expected]
     self.assertListEqual(observed, expected_text)
     remove(table_path)
예제 #5
0
    def test_smart_kde(self):
        """Check smart_kde() on a bimodal sample and on an unsolvable case."""
        analyzer = Analyze()

        # typical case (bimodal distribution)
        bimodal = np.concatenate([self.dist_norm1, self.dist_norm2])
        analyzer.df = pd.Series(bimodal, name='group').to_frame()
        analyzer.bw_steps = 10
        analyzer.noise = 50
        analyzer.low_part = 75
        analyzer.output = self.tmpdir
        threshold = analyzer.smart_kde('group')
        self.assertAlmostEqual(threshold, 2.1903958075763343)

        # a KDE plot must have been written; remove it afterwards
        plot = join(self.tmpdir, 'group.kde.png')
        self.assertTrue(isfile(plot))
        remove(plot)

        # unable to determine threshold
        analyzer.low_part = 0.001
        unimodal = pd.Series(self.dist_norm1, name='group')
        analyzer.df = unimodal.to_frame()
        self.assertEqual(analyzer.smart_kde('group'), 0)
예제 #6
0
    def test_cluster_kde(self):
        """Check cluster_kde() thresholds for each bandwidth setting."""
        analyzer = Analyze()
        bimodal = np.concatenate([self.dist_norm1, self.dist_norm2])
        analyzer.df = pd.Series(bimodal, name='group').to_frame()
        analyzer.bw_steps = 10
        analyzer.noise = 50
        analyzer.low_part = 75
        analyzer.output = self.tmpdir

        # bandwidth setting -> expected threshold, evaluated in this order
        cases = (
            ('grid', 1.855525575742988),         # grid search
            ('silverman', 2.2279977615745703),   # Silverman's rule-of-thumb
            (0.5, 2.2507008281395433),           # fixed value
            ('auto', 2.1903958075763343),        # smart KDE
        )
        for bandwidth, expected in cases:
            analyzer.bandwidth = bandwidth
            self.assertAlmostEqual(analyzer.cluster_kde('group'), expected)

        # clean up
        remove(join(self.tmpdir, 'group.kde.png'))

        # cannot find threshold (unimodal distribution)
        unimodal = pd.Series(self.dist_norm1, name='group')
        analyzer.df = unimodal.to_frame()
        analyzer.bandwidth = 'silverman'
        self.assertEqual(analyzer.cluster_kde('group'), 0)