Пример #1
0
def test_poly_a_clipper():
    """Exercise ``PolyAClipper`` clipping, substring search and 3' A removal.

    Covers ``clip_poly_a_stretch`` (no stretch, empty input, a 10 fold
    stretch, a 9 fold stretch, an all-A sequence),
    ``_aaaa_starting_substrings`` and ``remove_3_prime_a``.
    """
    # Define Object
    poly_a_clipper = PolyAClipper()

    # Test: If there is no poly a stretch the sequence is not changed.
    # The two literals must be wrapped in parentheses: on separate lines
    # without them the second literal is a no-op expression statement and
    # only the first half of the sequence would be tested.
    seq_no_stretch = result_seq = (
        "ATAGTAGGAGATTTAGACCAGATGACGATGACACAC"
        "AATGGTTAGGTACAGATAG")
    assert poly_a_clipper.clip_poly_a_stretch(seq_no_stretch) == result_seq

    # Test: If the sequence is empty the sequence is not changed
    empty_seq = result_seq = ""
    assert poly_a_clipper.clip_poly_a_stretch(empty_seq) == result_seq

    # Test: Clip before a 10 fold A stretch
    test_seq = "ATAGTAGGAGATTTAGACCAGATGACGATGACACAAAAAAAAAATTTAGACGACG"
    result_seq = "ATAGTAGGAGATTTAGACCAGATGACGATGACAC"
    assert poly_a_clipper.clip_poly_a_stretch(test_seq) == result_seq

    # Test: If there is less than a 10 fold stretch don't clip
    # (parenthesised for the same reason as above, so that the bases
    # following the A stretch are really part of the tested sequence).
    test_seq = result_seq = (
        "ATAGTAGGAGATTTAGACCAGATGACGATGACACAAAAAAAAA"
        "TTTAGACGACG")
    assert poly_a_clipper.clip_poly_a_stretch(test_seq) == result_seq

    # Test: All A sequence
    test_seq = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
    result_seq = ""
    assert poly_a_clipper.clip_poly_a_stretch(test_seq) == result_seq

    # Test: Sequence with a starting 'AAAA' substring
    test_seq = "TTTAAAATTTTTTTTAAAACCCCCCCCCCAAAAC"
    assert list(poly_a_clipper._aaaa_starting_substrings(
        test_seq, 11)) == [['AAAATTTTTTT', 3], ['AAAACCCCCCC', 15]]

    # Test: If there is no terminal A stretch, there is no clipping
    test_seq = result_seq = "AAAAATTTTCCGCCCGGGAAATTTT"
    assert poly_a_clipper.remove_3_prime_a(test_seq) == result_seq

    # Test: Removal of one terminal A
    test_seq = "AAAAATTTTCCGCCCGGGAAATTTTA"
    result_seq = "AAAAATTTTCCGCCCGGGAAATTTT"
    assert poly_a_clipper.remove_3_prime_a(test_seq) == result_seq

    # Test: Removal of multiple terminal As
    test_seq = "AAAAATTTTCCGCCCGGGAAATTTTAAAAAA"
    result_seq = "AAAAATTTTCCGCCCGGGAAATTTT"
    assert poly_a_clipper.remove_3_prime_a(test_seq) == result_seq
Пример #2
0
class TestPolyAClipper(unittest.TestCase):
    """Unit tests for the clipping methods of ``PolyAClipper``."""

    def setUp(self):
        self.poly_a_clipper = PolyAClipper()

    def _check_poly_a_clip(self, seq, expected):
        """Assert that ``clip_poly_a_strech(seq)`` returns ``expected``."""
        self.assertEqual(self.poly_a_clipper.clip_poly_a_strech(seq),
                         expected)

    def _check_3_prime_removal(self, seq, expected):
        """Assert that ``remove_3_prime_a(seq)`` returns ``expected``."""
        self.assertEqual(self.poly_a_clipper.remove_3_prime_a(seq),
                         expected)

    def test_clip_poly_a_strech_no_change(self):
        """A sequence without a poly-A stretch is returned unchanged."""
        seq = "ATAGTAGGAGATTTAGACCAGATGACGATGACACACAATGGTTAGGTACAGATAG"
        self._check_poly_a_clip(seq, seq)

    def test_clip_poly_a_strech_empt(self):
        """An empty sequence is returned unchanged."""
        self._check_poly_a_clip("", "")

    def test_clip_poly_a_strech_terminal_10_a(self):
        """A long enough terminal A stretch is clipped off."""
        self._check_poly_a_clip(
            "ATAGTAGGAGATTTAGACCAGATGACGATGACACAAAAAAAAAAA",
            "ATAGTAGGAGATTTAGACCAGATGACGATGACAC")

    def test_clip_poly_a_strech_internal_10_a(self):
        """The sequence is cut in front of a long internal A stretch."""
        self._check_poly_a_clip(
            "ATAGTAGGAGATTTAGACCAGATGACGATGACACAAAAAAAAAATTTAGACGACG",
            "ATAGTAGGAGATTTAGACCAGATGACGATGACAC")

    def test_clip_poly_a_strech_terminal_09_a(self):
        """A too short terminal A stretch is not clipped."""
        seq = "ATAGTAGGAGATTTAGACCAGATGACGATGACACAAAAAAAAAA"
        self._check_poly_a_clip(seq, seq)

    def test_clip_poly_a_strecht_internal_(self):
        """A too short internal A stretch is not clipped."""
        seq = "ATAGTAGGAGATTTAGACCAGATGACGATGACACAAAAAAAAATTTAGACGACG"
        self._check_poly_a_clip(seq, seq)

    def test_clip_poly_a_strecht_internal_09_a(self):
        """A sequence consisting only of As is clipped to the empty string."""
        self._check_poly_a_clip("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", "")

    def test_aaaa_starting_substrings(self):
        """Substrings starting with 'AAAA' are reported with their offset."""
        seq = "TTTAAAATTTTTTTTAAAACCCCCCCCCCAAAAC"
        found = list(self.poly_a_clipper._aaaa_starting_substrings(seq, 11))
        self.assertEqual(
            found, [["AAAATTTTTTT", 3], ["AAAACCCCCCC", 15]])

    def test_remove_3_prime_a_no_change(self):
        """Without terminal As nothing is removed."""
        seq = "AAAAATTTTCCGCCCGGGAAATTTT"
        self._check_3_prime_removal(seq, seq)

    def test_remove_3_prime_a_one_a(self):
        """A single terminal A is removed."""
        self._check_3_prime_removal(
            "AAAAATTTTCCGCCCGGGAAATTTTA", "AAAAATTTTCCGCCCGGGAAATTTT")

    def test_remove_3_prime_a_multiple_as(self):
        """A terminal run of several As is removed completely."""
        self._check_3_prime_removal(
            "AAAAATTTTCCGCCCGGGAAATTTTAAAAAA", "AAAAATTTTCCGCCCGGGAAATTTT")
Пример #3
0
class ReadProcessor(object):
    """Trim and filter sequencing reads and write them as gzipped FASTA.

    Depending on the constructor flags each read is quality-trimmed
    (FASTQ input only), reverse-complemented and adapter-clipped; in
    single-end mode it can additionally be poly-A clipped. Reads passing
    the minimum length filter are written as FASTA records, and counters
    describing the run are collected in a stats dictionary that the
    ``process_*`` methods return.
    """

    def __init__(self, poly_a_clipping=False,  min_read_length=20,
                 paired_end=False, min_phred_score=None,
                 adapter=None, reverse_complement=False):
        """Store the processing options.

        poly_a_clipping -- clip poly-A stretches and 3' As (single-end).
        min_read_length -- reads shorter than this are counted as
                           ``too_short`` and not written.
        paired_end -- stored only; not read by any method of this class.
        min_phred_score -- if not None, FASTQ reads are cut at the first
                           base whose quality is below this value.
        adapter -- if not None, reads are cut at its first occurrence.
        reverse_complement -- reverse-complement each read.
        """
        self._poly_a_clipping = poly_a_clipping
        self._min_read_length = min_read_length
        self._paired_end = paired_end
        self._min_phred_score = min_phred_score
        self._adapter = adapter
        self._poly_a_clipper = PolyAClipper()
        self._reverse_complement = reverse_complement

    def process_single_end(self, input_path, output_path):
        """Process one read file and return the collected stats.

        The output is written gzip-compressed to ``output_path``. The
        input handle is closed when processing has finished.
        """
        self._init_stat_dict()
        with gzip.open(output_path, "wb") as output_fh, \
                self._input_fh(input_path) as input_fh:
            self._process_single_end(input_fh, output_fh)
        return self._stats

    def process_paired_end(self, input_path_pair, output_path_pair):
        """Process a pair of read files and return the collected stats.

        ``input_path_pair`` and ``output_path_pair`` are indexable with
        the first mate at index 0 and the second mate at index 1. Both
        outputs are written gzip-compressed; the input handles are closed
        when processing has finished.
        """
        self._init_stat_dict()
        with gzip.open(output_path_pair[0], "wb") as output_p1_fh, \
                gzip.open(output_path_pair[1], "wb") as output_p2_fh, \
                self._input_fh(input_path_pair[0]) as input_p1_fh, \
                self._input_fh(input_path_pair[1]) as input_p2_fh:
            self._process_paired_end(
                input_p1_fh, input_p2_fh, output_p1_fh, output_p2_fh)
        return self._stats

    def _init_stat_dict(self):
        """Reset the stats: six zeroed counters and two length histograms."""
        self._stats = defaultdict(int)
        # Register every counter explicitly so it is present in the
        # returned stats even if it is never incremented.
        for counter in ("total_no_of_reads", "polya_removed",
                        "single_a_removed", "unmodified", "too_short",
                        "long_enough"):
            self._stats[counter] = 0
        self._stats[
            "read_length_before_processing_and_freq"] = defaultdict(int)
        self._stats["read_length_after_processing_and_freq"] = defaultdict(int)

    def _open_text(self, input_path):
        """Open ``input_path`` in text mode, decompressing by file suffix."""
        if input_path.endswith(".gz"):
            return gzip.open(input_path, "rt")
        elif input_path.endswith(".bz2"):
            return bz2.open(input_path, "rt")
        elif input_path.endswith(".xz"):
            return lzma.open(input_path, "rt")
        return open(input_path)

    def _input_fh(self, input_path):
        """Return a text file handle and detect the input format.

        Can deal with plain, gzipped, bzipped2 and xz compressed files.
        The format is taken from the first line: if it starts with '@'
        the file is treated as FASTQ, otherwise as FASTA. The result is
        stored in ``self._fastq``. Detection is done for compressed files
        as well, since ``self._fastq`` is not set anywhere else and the
        parsing methods read it. An empty file is treated as FASTA.
        """
        with self._open_text(input_path) as check_fh:
            first_line = check_fh.readline()
        self._fastq = first_line.startswith("@")
        # Hand out a fresh handle positioned at the start of the file.
        return self._open_text(input_path)

    def _trim_by_quality(self, seq, qualities):
        """Return the prefix of ``seq`` before the first low quality base.

        A base is low quality if its score is below
        ``self._min_phred_score``.
        """
        good_nucl = []
        for nucl, qual in zip(seq, qualities):
            if qual < self._min_phred_score:
                break
            good_nucl.append(nucl)
        return "".join(good_nucl)

    def _clip_adapter(self, seq):
        """Return ``seq`` cut at the first occurrence of the adapter.

        The sequence is returned unchanged if the adapter is not found.
        """
        adapter_start_pos = seq.find(self._adapter)
        if adapter_start_pos == -1:
            return seq
        return seq[:adapter_start_pos]

    def _process_single_end(self, input_fh, output_fh):
        """Process all reads of ``input_fh`` and write the kept ones.

        ``output_fh`` must accept bytes. The stats are updated in place.
        """
        for header, seq, qualities in self._parse_sequences(input_fh):
            raw_seq_len = len(seq)
            self._stats["total_no_of_reads"] += 1
            if self._fastq and self._min_phred_score is not None:
                seq = self._trim_by_quality(seq, qualities)
            if self._reverse_complement:
                seq = str(Seq(seq).reverse_complement())
            if self._adapter is not None:
                seq = self._clip_adapter(seq)
            if self._poly_a_clipping:
                seq = self._poly_a_clipper.clip_poly_a_stretch(seq)
                seq = self._poly_a_clipper.remove_3_prime_a(seq)
            clipped_seq_len = len(seq)
            # The classification is based purely on the length difference,
            # so shortening by any of the steps above is counted here.
            if clipped_seq_len == raw_seq_len - 1:
                self._stats["single_a_removed"] += 1
            elif clipped_seq_len < raw_seq_len - 1:
                self._stats["polya_removed"] += 1
            else:
                self._stats["unmodified"] += 1
            if clipped_seq_len < self._min_read_length:
                self._stats["too_short"] += 1
                continue
            self._stats["long_enough"] += 1
            self._stats["read_length_before_processing_and_freq"][
                raw_seq_len] += 1
            self._stats["read_length_after_processing_and_freq"][
                clipped_seq_len] += 1
            # Encoding to bytes is necessary due to saving via gzip
            output_fh.write(str.encode(">%s\n%s\n" % (header, seq)))

    def _parse_sequences(self, input_fh):
        """Yield ``(description, sequence, qualities)`` for each record.

        ``qualities`` is the record's "phred_quality" annotation for
        FASTQ input and None for FASTA input.
        """
        if self._fastq:
            for seq_record in SeqIO.parse(input_fh, "fastq"):
                yield (seq_record.description, str(seq_record.seq),
                       seq_record.letter_annotations["phred_quality"])
        else:
            for seq_record in SeqIO.parse(input_fh, "fasta"):
                yield (seq_record.description, str(seq_record.seq), None)

    def _process_paired_end(
            self, input_p1_fh, input_p2_fh, output_p1_fh, output_p2_fh):
        """Process read pairs in lockstep and write the kept pairs.

        Iteration stops as soon as either input is exhausted. Each mate
        is quality-trimmed with its own quality values. The output
        handles must accept bytes. The stats are updated in place.
        """
        for ((header_p1, seq_p1, qualities_p1),
             (header_p2, seq_p2, qualities_p2)) in zip(
                self._parse_sequences(input_p1_fh),
                self._parse_sequences(input_p2_fh)):
            raw_seq_p1_len = len(seq_p1)
            raw_seq_p2_len = len(seq_p2)
            self._stats["total_no_of_reads"] += 1
            self._stats["unmodified"] += 1
            if self._fastq and self._min_phred_score is not None:
                seq_p1 = self._trim_by_quality(seq_p1, qualities_p1)
                seq_p2 = self._trim_by_quality(seq_p2, qualities_p2)
            if self._reverse_complement:
                seq_p1 = str(Seq(seq_p1).reverse_complement())
                seq_p2 = str(Seq(seq_p2).reverse_complement())
            if self._adapter is not None:
                seq_p1 = self._clip_adapter(seq_p1)
                seq_p2 = self._clip_adapter(seq_p2)
            # NOTE(review): the length filter and the "after processing"
            # histogram below use the raw lengths, not the lengths after
            # trimming/clipping - confirm whether this is intended.
            if (raw_seq_p1_len < self._min_read_length or
                    raw_seq_p2_len < self._min_read_length):
                self._stats["too_short"] += 1
                continue
            self._stats["long_enough"] += 1
            for raw_len in (raw_seq_p1_len, raw_seq_p2_len):
                self._stats["read_length_before_processing_and_freq"][
                    raw_len] += 1
                self._stats["read_length_after_processing_and_freq"][
                    raw_len] += 1
            # Encoding to bytes is necessary due to saving via gzip
            output_p1_fh.write(str.encode(">%s\n%s\n" % (header_p1, seq_p1)))
            output_p2_fh.write(str.encode(">%s\n%s\n" % (header_p2, seq_p2)))
Пример #4
0
class ReadProcessor(object):
    """Trim and filter sequencing reads and write them as gzipped FASTA.

    Depending on the constructor flags each read is quality-trimmed
    (FASTQ input only), reverse-complemented and adapter-clipped; in
    single-end mode it can additionally be poly-A clipped. Reads passing
    the minimum length filter are written as FASTA records, and counters
    describing the run are collected in a stats dictionary that the
    ``process_*`` methods return.
    """

    def __init__(self, poly_a_clipping=False,  min_read_length=12,
                 paired_end=False, fastq=False, min_phred_score=None,
                 adapter=None, reverse_complement=False):
        """Store the processing options.

        poly_a_clipping -- clip poly-A stretches and 3' As (single-end).
        min_read_length -- reads shorter than this are counted as
                           ``too_short`` and not written.
        paired_end -- stored only; not read by any method of this class.
        fastq -- parse the input as FASTQ instead of FASTA.
        min_phred_score -- if not None, FASTQ reads are cut at the first
                           base whose quality is below this value.
        adapter -- if not None, reads are cut at its first occurrence.
        reverse_complement -- reverse-complement each read.
        """
        self._poly_a_clipping = poly_a_clipping
        self._min_read_length = min_read_length
        self._paired_end = paired_end
        self._fastq = fastq
        self._min_phred_score = min_phred_score
        self._adapter = adapter
        self._poly_a_clipper = PolyAClipper()
        self._reverse_complement = reverse_complement

    def process_single_end(self, input_path, output_path):
        """Process one read file and return the collected stats.

        The output is written gzip-compressed to ``output_path``. The
        input handle is closed when processing has finished.
        """
        self._init_stat_dict()
        with gzip.open(output_path, "wb") as output_fh, \
                self._input_fh(input_path) as input_fh:
            self._process_single_end(input_fh, output_fh)
        return self._stats

    def process_paired_end(self, input_path_pair, output_path_pair):
        """Process a pair of read files and return the collected stats.

        ``input_path_pair`` and ``output_path_pair`` are indexable with
        the first mate at index 0 and the second mate at index 1. Both
        outputs are written gzip-compressed; the input handles are closed
        when processing has finished.
        """
        self._init_stat_dict()
        with gzip.open(output_path_pair[0], "wb") as output_p1_fh, \
                gzip.open(output_path_pair[1], "wb") as output_p2_fh, \
                self._input_fh(input_path_pair[0]) as input_p1_fh, \
                self._input_fh(input_path_pair[1]) as input_p2_fh:
            self._process_paired_end(
                input_p1_fh, input_p2_fh, output_p1_fh, output_p2_fh)
        return self._stats

    def _init_stat_dict(self):
        """Reset the stats: six zeroed counters and two length histograms."""
        self._stats = defaultdict(int)
        # Register every counter explicitly so it is present in the
        # returned stats even if it is never incremented.
        for counter in ("total_no_of_reads", "polya_removed",
                        "single_a_removed", "unmodified", "too_short",
                        "long_enough"):
            self._stats[counter] = 0
        self._stats[
            "read_length_before_processing_and_freq"] = defaultdict(int)
        self._stats["read_length_after_processing_and_freq"] = defaultdict(int)

    def _input_fh(self, input_path):
        """Return a text file handle for ``input_path``.

        Can deal with plain files as well as gzipped (".gz") or bzipped2
        (".bz2") files; the compression is chosen by file suffix.
        """
        if input_path.endswith(".gz"):
            return gzip.open(input_path, "rt")
        elif input_path.endswith(".bz2"):
            return bz2.open(input_path, "rt")
        return open(input_path)

    def _trim_by_quality(self, seq, qualities):
        """Return the prefix of ``seq`` before the first low quality base.

        A base is low quality if its score is below
        ``self._min_phred_score``.
        """
        good_nucl = []
        for nucl, qual in zip(seq, qualities):
            if qual < self._min_phred_score:
                break
            good_nucl.append(nucl)
        return "".join(good_nucl)

    def _clip_adapter(self, seq):
        """Return ``seq`` cut at the first occurrence of the adapter.

        The sequence is returned unchanged if the adapter is not found.
        """
        adapter_start_pos = seq.find(self._adapter)
        if adapter_start_pos == -1:
            return seq
        return seq[:adapter_start_pos]

    def _process_single_end(self, input_fh, output_fh):
        """Process all reads of ``input_fh`` and write the kept ones.

        ``output_fh`` must accept bytes. The stats are updated in place.
        """
        for header, seq, qualities in self._parse_sequences(input_fh):
            raw_seq_len = len(seq)
            self._stats["total_no_of_reads"] += 1
            if self._fastq and self._min_phred_score is not None:
                seq = self._trim_by_quality(seq, qualities)
            if self._reverse_complement:
                seq = str(Seq(seq).reverse_complement())
            if self._adapter is not None:
                seq = self._clip_adapter(seq)
            if self._poly_a_clipping:
                seq = self._poly_a_clipper.clip_poly_a_strech(seq)
                seq = self._poly_a_clipper.remove_3_prime_a(seq)
            clipped_seq_len = len(seq)
            # The classification is based purely on the length difference,
            # so shortening by any of the steps above is counted here.
            if clipped_seq_len == raw_seq_len - 1:
                self._stats["single_a_removed"] += 1
            elif clipped_seq_len < raw_seq_len - 1:
                self._stats["polya_removed"] += 1
            else:
                self._stats["unmodified"] += 1
            if clipped_seq_len < self._min_read_length:
                self._stats["too_short"] += 1
                continue
            self._stats["long_enough"] += 1
            self._stats["read_length_before_processing_and_freq"][
                raw_seq_len] += 1
            self._stats["read_length_after_processing_and_freq"][
                clipped_seq_len] += 1
            # Encoding to bytes is necessary due to saving via gzip
            output_fh.write(str.encode(">%s\n%s\n" % (header, seq)))

    def _parse_sequences(self, input_fh):
        """Yield ``(description, sequence, qualities)`` for each record.

        ``qualities`` is the record's "phred_quality" annotation for
        FASTQ input and None for FASTA input.
        """
        if self._fastq:
            for seq_record in SeqIO.parse(input_fh, "fastq"):
                yield (seq_record.description, str(seq_record.seq),
                       seq_record.letter_annotations["phred_quality"])
        else:
            for seq_record in SeqIO.parse(input_fh, "fasta"):
                yield (seq_record.description, str(seq_record.seq), None)

    def _process_paired_end(
            self, input_p1_fh, input_p2_fh, output_p1_fh, output_p2_fh):
        """Process read pairs in lockstep and write the kept pairs.

        Iteration stops as soon as either input is exhausted. Each mate
        is quality-trimmed with its own quality values. The output
        handles must accept bytes. The stats are updated in place.
        """
        for ((header_p1, seq_p1, qualities_p1),
             (header_p2, seq_p2, qualities_p2)) in zip(
                self._parse_sequences(input_p1_fh),
                self._parse_sequences(input_p2_fh)):
            raw_seq_p1_len = len(seq_p1)
            raw_seq_p2_len = len(seq_p2)
            self._stats["total_no_of_reads"] += 1
            self._stats["unmodified"] += 1
            if self._fastq and self._min_phred_score is not None:
                seq_p1 = self._trim_by_quality(seq_p1, qualities_p1)
                seq_p2 = self._trim_by_quality(seq_p2, qualities_p2)
            if self._reverse_complement:
                seq_p1 = str(Seq(seq_p1).reverse_complement())
                seq_p2 = str(Seq(seq_p2).reverse_complement())
            if self._adapter is not None:
                seq_p1 = self._clip_adapter(seq_p1)
                seq_p2 = self._clip_adapter(seq_p2)
            # NOTE(review): the length filter and the "after processing"
            # histogram below use the raw lengths, not the lengths after
            # trimming/clipping - confirm whether this is intended.
            if (raw_seq_p1_len < self._min_read_length or
                    raw_seq_p2_len < self._min_read_length):
                self._stats["too_short"] += 1
                continue
            self._stats["long_enough"] += 1
            for raw_len in (raw_seq_p1_len, raw_seq_p2_len):
                self._stats["read_length_before_processing_and_freq"][
                    raw_len] += 1
                self._stats["read_length_after_processing_and_freq"][
                    raw_len] += 1
            # Encoding to bytes is necessary due to saving via gzip
            output_p1_fh.write(str.encode(">%s\n%s\n" % (header_p1, seq_p1)))
            output_p2_fh.write(str.encode(">%s\n%s\n" % (header_p2, seq_p2)))
Пример #5
0
class TestPolyAClipper(unittest.TestCase):
    """Unit tests for the clipping methods of ``PolyAClipper``."""

    def setUp(self):
        self.poly_a_clipper = PolyAClipper()

    def test_clip_poly_a_strech_no_change(self):
        """A sequence without a poly-A stretch is returned unchanged."""
        seq = "ATAGTAGGAGATTTAGACCAGATGACGATGACACACAATGGTTAGGTACAGATAG"
        clipped = self.poly_a_clipper.clip_poly_a_strech(seq)
        self.assertEqual(clipped, seq)

    def test_clip_poly_a_strech_empt(self):
        """An empty sequence is returned unchanged."""
        seq = ""
        clipped = self.poly_a_clipper.clip_poly_a_strech(seq)
        self.assertEqual(clipped, seq)

    def test_clip_poly_a_strech_terminal_10_a(self):
        """A long enough terminal A stretch is clipped off."""
        seq = "ATAGTAGGAGATTTAGACCAGATGACGATGACACAAAAAAAAAAA"
        expected = "ATAGTAGGAGATTTAGACCAGATGACGATGACAC"
        clipped = self.poly_a_clipper.clip_poly_a_strech(seq)
        self.assertEqual(clipped, expected)

    def test_clip_poly_a_strech_internal_10_a(self):
        """The sequence is cut in front of a long internal A stretch."""
        seq = "ATAGTAGGAGATTTAGACCAGATGACGATGACACAAAAAAAAAATTTAGACGACG"
        expected = "ATAGTAGGAGATTTAGACCAGATGACGATGACAC"
        clipped = self.poly_a_clipper.clip_poly_a_strech(seq)
        self.assertEqual(clipped, expected)

    def test_clip_poly_a_strech_terminal_09_a(self):
        """A too short terminal A stretch is not clipped."""
        seq = "ATAGTAGGAGATTTAGACCAGATGACGATGACACAAAAAAAAAA"
        clipped = self.poly_a_clipper.clip_poly_a_strech(seq)
        self.assertEqual(clipped, seq)

    def test_clip_poly_a_strecht_internal_(self):
        """A too short internal A stretch is not clipped."""
        seq = "ATAGTAGGAGATTTAGACCAGATGACGATGACACAAAAAAAAATTTAGACGACG"
        clipped = self.poly_a_clipper.clip_poly_a_strech(seq)
        self.assertEqual(clipped, seq)

    def test_clip_poly_a_strecht_internal_09_a(self):
        """A sequence consisting only of As is clipped to the empty string."""
        seq = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
        clipped = self.poly_a_clipper.clip_poly_a_strech(seq)
        self.assertEqual(clipped, "")

    def test_aaaa_starting_substrings(self):
        """Substrings starting with 'AAAA' are reported with their offset."""
        seq = "TTTAAAATTTTTTTTAAAACCCCCCCCCCAAAAC"
        expected = [['AAAATTTTTTT', 3], ['AAAACCCCCCC', 15]]
        found = list(self.poly_a_clipper._aaaa_starting_substrings(seq, 11))
        self.assertEqual(found, expected)

    def test_remove_3_prime_a_no_change(self):
        """Without terminal As nothing is removed."""
        seq = "AAAAATTTTCCGCCCGGGAAATTTT"
        trimmed = self.poly_a_clipper.remove_3_prime_a(seq)
        self.assertEqual(trimmed, seq)

    def test_remove_3_prime_a_one_a(self):
        """A single terminal A is removed."""
        seq = "AAAAATTTTCCGCCCGGGAAATTTTA"
        expected = "AAAAATTTTCCGCCCGGGAAATTTT"
        trimmed = self.poly_a_clipper.remove_3_prime_a(seq)
        self.assertEqual(trimmed, expected)

    def test_remove_3_prime_a_multiple_as(self):
        """A terminal run of several As is removed completely."""
        seq = "AAAAATTTTCCGCCCGGGAAATTTTAAAAAA"
        expected = "AAAAATTTTCCGCCCGGGAAATTTT"
        trimmed = self.poly_a_clipper.remove_3_prime_a(seq)
        self.assertEqual(trimmed, expected)