예제 #1
0
    def test_per_sample_sequences_complex(self):
        """Subsample with a maximum of two per bin and a one-item buffer.

        Records are binned by the part of their ``SequenceID`` that
        precedes the final underscore; the observed pairs are compared to
        the expected ones after ordering both by bin label.
        """

        def sample_of(record):
            # Bin label: the SequenceID with its trailing "_<n>" removed.
            sequence_id = record['SequenceID']
            return sequence_id.rsplit('_', 1)[0]

        # (bin label, SequenceID, Sequence) for every expected record.
        rows = [
            ('a', 'a_2', 'AATTGGCC-a2'),
            ('a', 'a_3', 'AATTGGCC-a3'),
            ('b', 'b_2', 'AATTGGCC-b2'),
            ('b', 'b_1', 'AATTGGCC-b1'),
            ('c', 'c_1', 'AATTGGCC-c1'),
            ('c', 'c_2', 'AATTGGCC-c2'),
        ]
        expected = sorted(
            [(label, {'SequenceID': seq_id, 'Sequence': seq})
             for label, seq_id, seq in rows],
            key=lambda pair: pair[0])

        observed = isubsample(self.mock_sequence_iter(self.sequences),
                              2,
                              bin_f=sample_of,
                              buf_size=1)
        # Only the bin label is used as the sort key, so the order inside
        # each bin is exactly the order in which isubsample yielded it.
        self.assertEqual(sorted(observed, key=lambda pair: pair[0]),
                         expected)
예제 #2
0
def subsample_dm(distmat, mapping_file, max, category, output):
    """Subsample the distmat to max samples per category value

    Parameters
    ----------
    distmat
        Source handed to ``read(..., into=DistanceMatrix)``.
    mapping_file
        Tab-separated mapping file readable by ``pandas.read_csv``; it must
        contain a ``#SampleID`` column (used as the index) and a column
        named by ``category``.
    max
        Passed to ``isubsample`` as its maximum.
    category
        Mapping-file column whose value is used as the bin of each ID.
    output
        Destination handed to ``DistanceMatrix.to_file``.

    Raises
    ------
    KeyError
        If an ID of the distance matrix is absent from the mapping file.
    """
    # ``sep`` has to be given by keyword: pandas 2.0 made every read_csv
    # argument after the path keyword-only, so a positional '\t' raises
    # TypeError there (and a FutureWarning on 1.3+).
    mf = pd.read_csv(mapping_file, sep='\t', converters=defaultdict(str),
                     index_col='#SampleID')
    id_to_cat = dict(mf[category])

    def bin_f(x):
        return id_to_cat[x]

    dm = read(distmat, into=DistanceMatrix)
    # isubsample yields (bin, item) pairs; only the retained IDs are needed.
    kept_ids = [sample_id
                for _, sample_id in isubsample(dm.ids, max, bin_f=bin_f)]
    dm = dm.filter(kept_ids)
    dm.to_file(output)
예제 #3
0
 def test_per_sample_sequences_complex(self):
     """Check isubsample with maximum=2, a custom bin_f and buf_size=1."""

     def by_sample(entry):
         # Everything before the last underscore of the SequenceID.
         return entry['SequenceID'].rsplit('_', 1)[0]

     def bin_label(pair):
         return pair[0]

     wanted = [
         ('a', {'SequenceID': 'a_2', 'Sequence': 'AATTGGCC-a2'}),
         ('a', {'SequenceID': 'a_3', 'Sequence': 'AATTGGCC-a3'}),
         ('b', {'SequenceID': 'b_2', 'Sequence': 'AATTGGCC-b2'}),
         ('b', {'SequenceID': 'b_1', 'Sequence': 'AATTGGCC-b1'}),
         ('c', {'SequenceID': 'c_1', 'Sequence': 'AATTGGCC-c1'}),
         ('c', {'SequenceID': 'c_2', 'Sequence': 'AATTGGCC-c2'}),
     ]
     # Stable sort on the bin label only; order within a bin is preserved.
     wanted.sort(key=bin_label)

     sampled = isubsample(self.mock_sequence_iter(self.sequences), 2,
                          bin_f=by_sample, buf_size=1)
     self.assertEqual(sorted(sampled, key=bin_label), wanted)
예제 #4
0
def subsample_dm(distmat, mapping_file, max, category, output):
    """Subsample the distmat to max samples per category value

    Parameters
    ----------
    distmat
        Source handed to ``read(..., into=DistanceMatrix)``.
    mapping_file
        Tab-separated mapping file readable by ``pandas.read_csv``; it must
        contain a ``#SampleID`` column and a column named by ``category``.
        All columns are read as strings.
    max
        Passed to ``isubsample`` as its maximum.
    category
        Mapping-file column whose value is used as the bin of each ID.
    output
        Destination handed to ``DistanceMatrix.to_file``.
    """
    # ``sep`` has to be given by keyword: pandas 2.0 made every read_csv
    # argument after the path keyword-only, so a positional '\t' raises
    # TypeError there (and a FutureWarning on 1.3+).
    mf = pd.read_csv(mapping_file,
                     sep='\t',
                     converters=defaultdict(str),
                     dtype=str)
    mf = mf.set_index('#SampleID')

    id_to_cat = dict(mf[category])

    def bin_f(x):
        # IDs missing from the mapping file all fall into the ``None`` bin
        # instead of raising KeyError.
        return id_to_cat.get(x)

    dm = read(distmat, into=DistanceMatrix)
    # isubsample yields (bin, item) pairs; only the retained IDs are needed.
    kept_ids = [sample_id
                for _, sample_id in isubsample(dm.ids, max, bin_f=bin_f)]
    dm = dm.filter(kept_ids)
    dm.to_file(output)
예제 #5
0
    def test_isubsample_simple(self):
        """Subsample with a maximum of ten and compare against ten records.

        Records are binned by the part of their ``SequenceID`` that
        precedes the final underscore.
        """

        def sample_of(record):
            sequence_id = record['SequenceID']
            return sequence_id.rsplit('_', 1)[0]

        # (bin label, SequenceID, Sequence) for every expected record.
        # Both sides are sorted by bin label only, so within a bin the rows
        # below must already be in the order isubsample yields them
        # (per the original note: heap order of the random values
        # associated with each sequence).
        rows = [
            ('a', 'a_5', 'AATTGGCC-a5'),
            ('a', 'a_1', 'AATTGGCC-a1'),
            ('a', 'a_4', 'AATTGGCC-a4'),
            ('a', 'a_3', 'AATTGGCC-a3'),
            ('a', 'a_2', 'AATTGGCC-a2'),
            ('b', 'b_2', 'AATTGGCC-b2'),
            ('b', 'b_1', 'AATTGGCC-b1'),
            ('c', 'c_3', 'AATTGGCC-c3'),
            ('c', 'c_2', 'AATTGGCC-c2'),
            ('c', 'c_1', 'AATTGGCC-c1'),
        ]
        expected = sorted(
            [(label, {'SequenceID': seq_id, 'Sequence': seq})
             for label, seq_id, seq in rows],
            key=lambda pair: pair[0])

        observed = isubsample(self.mock_sequence_iter(self.sequences),
                              10,
                              bin_f=sample_of)
        self.assertEqual(sorted(observed, key=lambda pair: pair[0]),
                         expected)
예제 #6
0
    def test_per_sample_sequences_min_seqs(self):
        """Subsample with maximum=10 and minimum=3; no 'b' records expected.

        Records are binned by the part of their ``SequenceID`` that
        precedes the final underscore.
        """
        upper, lower = 10, 3

        def sample_of(record):
            return record['SequenceID'].rsplit('_', 1)[0]

        def bin_label(pair):
            return pair[0]

        # (bin label, SequenceID, Sequence) for every expected record.
        # Sorting uses the bin label only, so rows of one bin must already
        # be listed in the order isubsample yields them (per the original
        # note: heap order of the random values tied to each sequence).
        rows = [
            ('a', 'a_5', 'AATTGGCC-a5'),
            ('a', 'a_1', 'AATTGGCC-a1'),
            ('a', 'a_4', 'AATTGGCC-a4'),
            ('a', 'a_3', 'AATTGGCC-a3'),
            ('a', 'a_2', 'AATTGGCC-a2'),
            ('c', 'c_3', 'AATTGGCC-c3'),
            ('c', 'c_2', 'AATTGGCC-c2'),
            ('c', 'c_1', 'AATTGGCC-c1'),
        ]
        expected = [(label, {'SequenceID': seq_id, 'Sequence': seq})
                    for label, seq_id, seq in rows]
        expected.sort(key=bin_label)

        observed = isubsample(self.mock_sequence_iter(self.sequences),
                              upper, lower, bin_f=sample_of)
        self.assertEqual(sorted(observed, key=bin_label), expected)
예제 #7
0
 def test_binf_is_none(self):
     """Without a bin_f, each item is expected back paired with ``True``."""
     values = [1, 2]
     expected = [(True, 1), (True, 2)]
     observed = list(isubsample(values, 2))
     self.assertEqual(observed, expected)
예제 #8
0
 def test_max_lt_zero(self):
     """A negative maximum must raise ValueError on the first ``next``."""
     # Building the iterator is done outside the assertion on purpose:
     # only advancing it is expected to raise.
     sampler = isubsample([1, 2, 3], maximum=-10)
     self.assertRaises(ValueError, next, sampler)
예제 #9
0
 def test_min_gt_max(self):
     """minimum > maximum must raise ValueError on the first ``next``."""
     # Building the iterator is done outside the assertion on purpose:
     # only advancing it is expected to raise.
     sampler = isubsample([1, 2, 3], maximum=2, minimum=10)
     self.assertRaises(ValueError, next, sampler)
예제 #10
0
 def test_binf_is_none(self):
     """Call isubsample with no bin_f and expect ``(True, item)`` pairs."""
     limit = 2
     source = [1, 2]
     pairs = list(isubsample(source, limit))
     self.assertEqual(pairs, [(True, 1), (True, 2)])
예제 #11
0
 def test_max_lt_zero(self):
     """Expect ValueError when a sampler built with maximum=-10 is advanced."""
     source = [1, 2, 3]
     lazy_sample = isubsample(source, maximum=-10)
     # Creation happens above, outside the context manager; the error is
     # expected only when the first item is requested.
     with self.assertRaises(ValueError):
         next(lazy_sample)
예제 #12
0
 def test_min_gt_max(self):
     """Expect ValueError when minimum=10 exceeds maximum=2 and we advance."""
     source = [1, 2, 3]
     lazy_sample = isubsample(source, maximum=2, minimum=10)
     # Creation happens above, outside the context manager; the error is
     # expected only when the first item is requested.
     with self.assertRaises(ValueError):
         next(lazy_sample)