示例#1
0
    def test_ne_true(self):
        """``!=`` holds against another sequence, an int and a str."""
        other_sequence = "---aggatgcgagatgcgtgtt-----"
        reference = Sequence(self.label, self.sequence)
        different = Sequence(self.label, other_sequence)

        # Compared in turn with: a Sequence sharing the label but built from
        # another sequence string, a plain int, and that bare string itself.
        for candidate in (different, 2, other_sequence):
            self.assertTrue(reference != candidate)
示例#2
0
    def test_eq_true(self):
        """``==`` holds for the same sequence string, whatever the label."""
        reference = Sequence(self.label, self.sequence)

        # First an identical twin, then a hard-coded label: the test expects
        # equality in both cases.
        for label in (self.label, "152_4447;size=1812;"):
            candidate = Sequence(label, self.sequence)
            self.assertTrue(reference == candidate)
示例#3
0
    def test_eq_false(self):
        """``==`` is False against another sequence, an int and a str."""
        other_sequence = "---aggatgcgagatgcgtgtt-----"
        reference = Sequence(self.label, self.sequence)
        different = Sequence(self.label, other_sequence)

        # Compared in turn with: a Sequence sharing the label but built from
        # another sequence string, a plain int, and that bare string itself.
        for candidate in (different, 2, other_sequence):
            self.assertFalse(reference == candidate)
示例#4
0
    def test_ne_false(self):
        """``!=`` is False for the same sequence string, whatever the label."""
        reference = Sequence(self.label, self.sequence)

        # First an identical twin, then a hard-coded label: the test expects
        # neither of them to compare as different.
        for label in (self.label, "152_4447;size=1812;"):
            candidate = Sequence(label, self.sequence)
            self.assertFalse(reference != candidate)
示例#5
0
    def test_deblur_with_non_default_error_profile(self):
        """deblur accepts ``error_dist`` given as a list or a numpy array."""
        # Both runs are expected to return this single sequence.
        expected = [
            Sequence(
                "E.Coli-999;size=720;",
                "tacggagggtgcaagcgttaatcggaattactgggcgtaaagcgcacgcaggcggt"
                "ttgttaagtcagatgtgaaatccccgggctcaacctgggaactgcatctgatactg"
                "gcaagcttgagtctcgtagaggggggcagaattccag")
        ]

        list_profile = [
            1, 0.05, 0.000005, 0.000005, 0.000005, 0.000005, 0.0000025,
            0.0000025, 0.0000025, 0.0000025, 0.0000025, 0.0000005, 0.0000005,
            0.0000005, 0.0000005
        ]
        array_profile = np.array([
            1, 0.06, 0.02, 0.02, 0.01, 0.005, 0.005, 0.005, 0.001, 0.001,
            0.001, 0.0005
        ])

        # Every profile gets its own fresh stream over the same test data.
        for profile in (list_profile, array_profile):
            fasta = StringIO(TEST_SEQS_2)
            observed = deblur(sequence_generator(fasta), error_dist=profile)
            self.assertEqual(observed, expected)
示例#6
0
 def test_get_sequences(self):
     """get_sequences(self.seqs) returns the expected Sequence list."""
     # (label, aligned sequence) pairs, in the order the result is expected.
     expected_records = (
         ("151_4447;size=1812;", "---aggatgcgagatgcgtggt-----"),
         ("151_3288;size=1337;", "---ggatgcgagatgcgtggtg-----"),
         ("151_6640;size=1068;", "---cggaggcgagatgcgtggt-----"),
         ("151_5155;size=998;", "---gaggatgcgagatgcgtgg-----"),
         ("151_527;size=964;", "---acggaggatgatgcgcggt-----"),
         ("151_14716;size=390;", "---gagtgcgagatgcgtggtg-----"),
         ("151_5777;size=305;", "---ggagtgcaagattccaggt-----"),
         ("151_64278;size=200;", "---tactagcaagattcctggt-----"),
         ("151_9240;size=170;", "---tagggcaagactccatggt-----"),
         ("151_41690;size=157;", "---agg-gcgagattcctagtgg----"),
     )
     expected = [Sequence(label, seq) for label, seq in expected_records]

     self.assertEqual(get_sequences(self.seqs), expected)
示例#7
0
    def test_deblur_indel(self):
        """Check that deblur also removes a read carrying an insertion."""
        fasta = StringIO(TEST_SEQS_2)

        # Build the MSA for the indel: every read gets a gap at position 10.
        aligned = []
        for label, read in sequence_generator(fasta):
            aligned.append((label, read[:10] + '-' + read[10:]))

        # Reuse the last parsed record (label and read are still bound after
        # the loop) to add a read with an 'A' inserted at position 10; its
        # last character is dropped and a trailing gap is appended instead.
        inserted = read[:10] + 'A' + read[10:-1] + '-'
        aligned.append((label, inserted))

        observed = deblur(aligned)

        # Strip the gaps (same as in launch_workflow).
        for seq in observed:
            seq.sequence = seq.sequence.replace('-', '')

        expected = [
            Sequence(
                "E.Coli-999;size=720;",
                "tacggagggtgcaagcgttaatcggaattactgggcgtaaagcgcacgcaggcggt"
                "ttgttaagtcagatgtgaaatccccgggctcaacctgggaactgcatctgatactg"
                "gcaagcttgagtctcgtagaggggggcagaattccag")
        ]

        # Exactly one sequence must be left, and it must be the expected one.
        self.assertEqual(len(observed), 1)
        self.assertEqual(observed[0].sequence, expected[0].sequence)
示例#8
0
    def test_init(self):
        """A freshly built Sequence exposes the expected attribute values."""
        seq = Sequence(self.label, self.sequence)

        # Attribute name -> value the test expects, checked in this order.
        expected_attrs = (
            ("label", self.label),
            ("sequence", self.exp_seq),
            ("length", 27),
            ("unaligned_length", 19),
            ("frequency", 1812),
        )
        for name, expected in expected_attrs:
            self.assertEqual(getattr(seq, name), expected)

        # Array comparison goes through numpy's testing helper.
        npt.assert_equal(seq.np_sequence, self.exp_np_seq)
示例#9
0
    def test_init_mixed_case(self):
        """A mixed-case input must yield the same expected attributes."""
        built = Sequence(self.label, "---AggATgcGAgatGCgtgGT-----")

        # Label and sequence text
        self.assertEqual(built.label, self.label)
        self.assertEqual(built.sequence, self.exp_seq)

        # The input has 27 characters, 19 of which are not '-'
        self.assertEqual(built.length, 27)
        self.assertEqual(built.unaligned_length, 19)

        # Frequency and numeric encoding
        self.assertEqual(built.frequency, 1812)
        npt.assert_equal(built.np_sequence, self.exp_np_seq)
示例#10
0
    def test_init_uppercase(self):
        """An all-uppercase input must yield the same expected attributes."""
        built = Sequence(self.label, "---AGGATGCGAGATGCGTGGT-----")

        # Label and sequence text
        self.assertEqual(built.label, self.label)
        self.assertEqual(built.sequence, self.exp_seq)

        # The input has 27 characters, 19 of which are not '-'
        self.assertEqual(built.length, 27)
        self.assertEqual(built.unaligned_length, 19)

        # Frequency and numeric encoding
        self.assertEqual(built.frequency, 1812)
        npt.assert_equal(built.np_sequence, self.exp_np_seq)
示例#11
0
    def test_deblur(self):
        """deblur on the parsed TEST_SEQS_2 records returns one sequence."""
        expected = [
            Sequence(
                "E.Coli-999;size=720;",
                "tacggagggtgcaagcgttaatcggaattactgggcgtaaagcgcacgcaggcggt"
                "ttgttaagtcagatgtgaaatccccgggctcaacctgggaactgcatctgatactg"
                "gcaagcttgagtctcgtagaggggggcagaattccag")
        ]

        # This test feeds deblur through parse_fasta.
        records = parse_fasta(StringIO(TEST_SEQS_2))
        observed = deblur(records)

        self.assertEqual(observed, expected)
示例#12
0
    def test_deblur_toy_example(self):
        """deblur on the TEST_SEQS_1 records returns one sequence."""
        expected = [
            Sequence(
                "E.Coli;size=1000;",
                "tacggagggtgcaagcgttaatcggaattactgggcgtaaagcgcacgcaggcggt"
                "ttgttaagtcagatgtgaaatccccgggctcaacctgggaactgcatctgatactg"
                "gcaagcttgagtctcgtagaggggggcagaattccag")
        ]

        records = sequence_generator(StringIO(TEST_SEQS_1))
        observed = deblur(records)

        self.assertEqual(observed, expected)
示例#13
0
    def test_to_fasta(self):
        """to_fasta() output, before and after setting float frequencies."""
        seq = Sequence(self.label, self.sequence)

        # Frequency left untouched after construction.
        self.assertEqual(
            seq.to_fasta(),
            ">151_4447;size=1812;\n---AGGATGCGAGATGCGTGGT-----\n")

        # Non-integer frequencies: 1811.1 is expected to be written as 1811
        # and 1811.5 as 1812.
        float_cases = (
            (1811.1, ">151_4447;size=1811;\n---AGGATGCGAGATGCGTGGT-----\n"),
            (1811.5, ">151_4447;size=1812;\n---AGGATGCGAGATGCGTGGT-----\n"),
        )
        for frequency, expected in float_cases:
            seq.frequency = frequency
            self.assertEqual(seq.to_fasta(), expected)
示例#14
0
    def test_init_non_actg_chars(self):
        """Characters other than a/c/g/t and '-' are kept and uppercased."""
        raw = "---FoOatgcgagatgcgtfOo-----"
        built = Sequence(self.label, raw)

        # In the expected encoding below a/c/g/t map to 0/1/2/3, '-' to 4 and
        # the 'F'/'O' characters to 5.
        expected_codes = np.array([
            4, 4, 4, 5, 5, 5, 0, 3, 2, 1, 2, 0, 2, 0, 3, 2, 1, 2, 3, 5, 5, 5,
            4, 4, 4, 4, 4
        ])

        self.assertEqual(built.label, self.label)
        self.assertEqual(built.sequence, "---FOOATGCGAGATGCGTFOO-----")

        # The input has 27 characters, 19 of which are not '-'
        self.assertEqual(built.length, 27)
        self.assertEqual(built.unaligned_length, 19)

        self.assertEqual(built.frequency, 1812)
        npt.assert_equal(built.np_sequence, expected_codes)
示例#15
0
    def test_deblur_indel(self):
        """Check that indel reads are removed only up to a frequency bound."""
        fasta = StringIO(TEST_SEQS_2)

        # Build the MSA for the indel: every read gets a gap at position 10.
        newseqs = [(label, read[:10] + '-' + read[10:])
                   for label, read in sequence_generator(fasta)]

        # Indel read derived from the first aligned read: the gap column is
        # replaced by an 'A', the last character is dropped and a trailing
        # gap is appended. The same read is added twice below, with two
        # different sizes.
        template = newseqs[0][1]
        indel_read = template[:10] + 'A' + template[11:-1] + '-'

        # At the expected frequency (30 < 0.02 * (720 / 0.47), where 0.47 is
        # the mod_factor), so this one should be removed.
        newseqs.append(('>indel1-read;size=30;', indel_read))

        # At a higher frequency than the indel upper bound allows
        # (31 > 0.02 * (720 / 0.47)), so this one should not be removed.
        newseqs.append(('>indel2-read;size=31;', indel_read))

        observed = deblur(newseqs)

        # Strip the gaps (same as in launch_workflow).
        for seq in observed:
            seq.sequence = seq.sequence.replace('-', '')

        expected = [
            Sequence(
                "E.Coli-999;size=720;",
                "tacggagggtgcaagcgttaatcggaattactgggcgtaaagcgcacgcaggcggt"
                "ttgttaagtcagatgtgaaatccccgggctcaacctgggaactgcatctgatactg"
                "gcaagcttgagtctcgtagaggggggcagaattccag")
        ]

        # Two sequences must survive: the original one and indel2 (too many
        # reads for the expected indel probability).
        self.assertEqual(len(observed), 2)
        self.assertEqual(observed[0].sequence, expected[0].sequence)
        self.assertEqual(observed[1].label, '>indel2-read;size=31;')
示例#16
0
def get_sequences(input_seqs):
    """Returns a list of Sequences sorted by decreasing frequency

    Parameters
    ----------
    input_seqs : iterable of (str, str)
        The list of input sequences in (label, sequence) format

    Returns
    -------
    list of Sequence or None
        The sequences ordered by their `frequency` attribute, highest first.
        None if no sequence could be built from `input_seqs` (a warning is
        logged in that case).

    Raises
    ------
    ValueError
        If all the sequences do not have the same length either aligned or
        unaligned.
    """
    logger = logging.getLogger(__name__)

    try:
        seqs = [Sequence(label, seq) for label, seq in input_seqs]
    except Exception:
        # Deliberate best effort: any failure while building the sequences is
        # handled like an empty input. The cause is kept in the debug log so
        # it is not lost entirely.
        logger.debug('Could not build sequences from the input',
                     exc_info=True)
        seqs = []

    if not seqs:
        # Logger.warn is a deprecated alias of Logger.warning
        logger.warning('No sequences found in fasta file!')
        return None

    # Check that all the sequence lengths (aligned and unaligned are the same)
    aligned_lengths = set(s.length for s in seqs)
    unaligned_lengths = set(s.unaligned_length for s in seqs)

    if len(aligned_lengths) != 1 or len(unaligned_lengths) != 1:
        raise ValueError(
            "Not all sequence have the same length. Aligned lengths: %s, "
            "sequence lengths: %s"
            % (", ".join(map(str, aligned_lengths)),
               ", ".join(map(str, unaligned_lengths))))

    return sorted(seqs, key=attrgetter('frequency'), reverse=True)