def test_sequence_stripper(): 'It can cut using trimming recommendations' seq1 = 'gggtctcatcatcaggg'.upper() seq = SeqWithQuality(Seq(seq1), qual=[30] * len(seq1), annotations={TRIMMING_RECOMMENDATIONS:{}}) trim_rec = seq.annotations[TRIMMING_RECOMMENDATIONS] seq_trimmer = create_seq_trim_and_masker() trim_rec['vector'] = [(0,3), (8, 12)] seq2 = seq_trimmer(seq) assert str(seq2.seq) == 'CTCA' trim_rec['vector'] = [(0, 0), (8, 12)] seq2 = seq_trimmer(seq) assert str(seq2.seq) == 'GGTCTCA' trim_rec['vector'] = [(0, 0), (8, 12)] trim_rec['quality'] = [] trim_rec['mask'] = [(0, 3), (5, 6)] seq2 = seq_trimmer(seq) assert seq2.seq == 'ggtCtcA' assert 'vector' not in trim_rec assert 'quality' not in trim_rec assert trim_rec['mask'] == [(0, 2), (4, 5)] trim_rec['vector'] = [(0, 1), (8, 12)] trim_rec['quality'] = [(1, 8), (13, 17)] seq2 = seq_trimmer(seq) assert seq2 is None seq_trimmer = create_seq_trim_and_masker(mask=False) trim_rec['vector'] = [(0, 0), (8, 12)] trim_rec['quality'] = [] trim_rec['mask'] = [(0, 3), (5, 6)] seq2 = seq_trimmer(seq) assert seq2.seq == 'GGTCTCA' assert seq2.annotations == {'trimming_recommendations': {'mask': [(0, 2), (4, 5)]}} seq_trimmer = create_seq_trim_and_masker() seq3 = seq_trimmer(seq2) assert seq3.seq == 'ggtCtcA' #masking the regions to trim seq_trimmer = create_seq_trim_and_masker(mask=True, trim_as_mask=True) trim_rec['vector'] = [(0, 2), (8, 12)] trim_rec['quality'] = [] trim_rec['mask'] = [(0, 1), (5, 6)] seq2 = seq_trimmer(seq) assert seq2.seq == 'gggTCtcAtcatcaggg' assert seq2.annotations == {'trimming_recommendations': {'mask': [(0, 1), (5, 6)]}}
def test_mask_low_complexity(): "It test mask_low_complexity function" seq = "TCGCATCGATCATCGCAGATCGACTGATCGATCGATCGGGGGGGGGGGGGGGGGGGGGGGG" seq = Seq(seq) qual = [30] * 61 desc = "hola" seq1 = SeqWithQuality(seq=seq, qual=qual, description=desc) mask_low_complexity = create_masker_for_low_complexity() masked_seq = mask_low_complexity(seq1) sequence_trimmer = create_seq_trim_and_masker() masked_seq = sequence_trimmer(masked_seq) assert masked_seq.seq[-10:] == "gggggggggg" assert len(masked_seq.qual) == 61 assert masked_seq.description == desc seq = "GGGGGTTTCTTAAATTCGCCTGGAGATTTCATTCGGGGGGGGGGTTCTCCCCAGGGGGGGGTG" seq += "GGGAAACCCCCCGTTTCCCCCCCCGCGCGCCTTTTCGGGGAAAATTTTTTTTTGTTCCCCCCG" seq += "GAAAAAAAAATATTTCTCCTGCGGGGCCCCCGCGAAGAAAAAAGAAAAAAAAAAAGAGGAGGA" seq += "GGGGGGGGGGGGCGAAAATATAGTTTGG" seq1 = SeqWithQuality(seq=Seq(seq)) masked_seq = mask_low_complexity(seq1) masked_seq = sequence_trimmer(masked_seq) expected = "GGGGGTTTCTTAAATTCGCCTGGAGATTTCATtcggggggggggttctccccaggggg" expected += "gggtggggAAaccccccgtttccccccccgcgcgccttttcggggaaaattttttttt" expected += "gttccccccGGAAAAAAAAATATTTCTCCTGCGGGGCCCCCGCGaagaaaaaagaaaa" expected += "aaaaaaaGAGGAGGAGGGgggggggggCGAAAATATAGTTTGG" assert str(masked_seq.seq) == expected
def test_word_masker(): 'It test if we remove words from the beginning of the seq' word = 'ATAT' seq1 = word + 'tctcatcatca'.upper() seq1 = Seq(seq1) seq = SeqWithQuality(seq1, qual=[30] * len(seq1)) remover = create_word_masker([word]) seq = remover(seq) sequence_trimmer = create_seq_trim_and_masker() seq = sequence_trimmer(seq) assert seq.seq[0] == 'a' seq1 = 'atactctcatcatca'.upper() seq = SeqWithQuality(Seq(seq1), qual=[30] * len(seq1)) seq = remover(seq) seq = sequence_trimmer(seq) assert seq.seq == seq1 word = 'CA' seq1 = 'ATCATCATCATCA' seq = SeqWithQuality(Seq(seq1), qual=[30] * len(seq1)) remover = create_word_masker([word], False) seq = remover(seq) seq = sequence_trimmer(seq) assert seq.seq == 'ATcaTcaTcaTca'
def test_word_masker(): "It test if we remove words from the beginning of the seq" word = "ATAT" seq1 = word + "tctcatcatca".upper() seq1 = Seq(seq1) seq = SeqWithQuality(seq1, qual=[30] * len(seq1)) remover = create_word_masker([word]) seq = remover(seq) sequence_trimmer = create_seq_trim_and_masker() seq = sequence_trimmer(seq) assert seq.seq[0] == "a" seq1 = "atactctcatcatca".upper() seq = SeqWithQuality(Seq(seq1), qual=[30] * len(seq1)) seq = remover(seq) seq = sequence_trimmer(seq) assert seq.seq == seq1 word = "CA" seq1 = "ATCATCATCATCA" seq = SeqWithQuality(Seq(seq1), qual=[30] * len(seq1)) remover = create_word_masker([word], False) seq = remover(seq) seq = sequence_trimmer(seq) assert seq.seq == "ATcaTcaTcaTca"
def test_mask_low_complexity(): 'It test mask_low_complexity function' seq = 'TCGCATCGATCATCGCAGATCGACTGATCGATCGATCGGGGGGGGGGGGGGGGGGGGGGGG' seq = Seq(seq) qual = [30] * 61 desc = 'hola' seq1 = SeqWithQuality(seq=seq, qual=qual, description=desc) mask_low_complexity = create_masker_for_low_complexity() masked_seq = mask_low_complexity(seq1) sequence_trimmer = create_seq_trim_and_masker() masked_seq = sequence_trimmer(masked_seq) assert masked_seq.seq[-10:] == 'gggggggggg' assert len(masked_seq.qual) == 61 assert masked_seq.description == desc seq = 'GGGGGTTTCTTAAATTCGCCTGGAGATTTCATTCGGGGGGGGGGTTCTCCCCAGGGGGGGGTG' seq += 'GGGAAACCCCCCGTTTCCCCCCCCGCGCGCCTTTTCGGGGAAAATTTTTTTTTGTTCCCCCCG' seq += 'GAAAAAAAAATATTTCTCCTGCGGGGCCCCCGCGAAGAAAAAAGAAAAAAAAAAAGAGGAGGA' seq += 'GGGGGGGGGGGGCGAAAATATAGTTTGG' seq1 = SeqWithQuality(seq=Seq(seq)) masked_seq = mask_low_complexity(seq1) masked_seq = sequence_trimmer(masked_seq) expected = 'GGGGGTTTCTTAAATTCGCCTGGAGATTTCATtcggggggggggttctccccaggggg' expected += 'gggtggggAAaccccccgtttccccccccgcgcgccttttcggggaaaattttttttt' expected += 'gttccccccGGAAAAAAAAATATTTCTCCTGCGGGGCCCCCGCGaagaaaaaagaaaa' expected += 'aaaaaaaGAGGAGGAGGGgggggggggCGAAAATATAGTTTGG' assert str(masked_seq.seq) == expected
def test_strip_seq_by_quality_lucy(): "It tests strip_seq_by_quality_lucy2" seq = "ATCGATCAGTCAGACTGACAGACTCAGATCAGATCAGCATCAGCATACGATACGCATCAGACT" seq += "ACGATCGATCGATCGACAGATCATCGATCATCGACGACTAGACGATCATCGATACGCAGACTC" seq += "AGCAGACTACGAGATCAGCAGCATCAGCAGCA" qual = "00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 " qual += "00 00 00 00 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 " qual += "60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 " qual += "60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 " qual += "60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 " qual += "60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 " qual += "60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 " qual += "60 60 60 60 60 60 60 60 60 60 60 60 60 60 00 00 00 00" qual = qual.split() seq = Seq(seq) seqrec1 = SeqWithQuality(name="seq1", seq=seq, qual=qual, description="desc1") qual = "40 40 40 37 40 40 37 37 37 37 37 37 37 37 40 42 42 42 44 44 " qual += "56 56 42 40 40 40 40 36 36 28 35 32 35 35 40 42 37 37 35 37 " qual += "32 35 35 35 35 35 35 38 33 33 24 33 33 42 33 35 35 35 35 33 " qual += "36 30 30 24 29 29 35 35 35 35 29 29 29 35 38 38 38 37 35 33 " qual += "29 35 35 34 30 30 30 33 29 31 31 29 29 29 28 28 24 21 16 16 " qual += "21 24 29 29 32 40 27 27 25 25 21 30 27 28 28 32 23 23 21 24 " qual += "24 17 18 19 21 15 19 11 9 9 11 23 17 15 10 10 10 20 27 25 23 " qual += "18 22 23 24 18 10 10 13 13 18 19 10 12 12 18 16 14 10 10 11 " qual += "16 13 21 19 31 19 27 27 28 26 29 25 25 20 19 23 28 28 19 20 " qual += "13 9 9 9 9 9 17 15 21 17 14 12 21 17 19 24 28 24 23 " quality = qual.split() seq = "ATCGATCAGTCAGACTGACAGACTCAGATCAGATCAGCATCAGCATACGATACGCATCAGACT" seq += "ACGATCGATCGATCGACAGATCATCGATCATCGACGACTAGACGATCATCGATACGCAGACTC" seq += "AGCAGACTACGAGATCAGCAGCATCAGCAGCAAGCAGACTACGAGATCAGCAGCATCAGCAGC" seq += "ATTACGATGAT" seq = Seq(seq) seqrec2 = SeqWithQuality(seq=seq, qual=quality, name="seq2", description="desc2") seq_iter = iter([seqrec1, seqrec2]) seq_trimmer = create_seq_trim_and_masker() lucy_striper = create_striper_by_quality_lucy() # pylint:disable-msg=W0612 seq_iter = lucy_striper(seq_iter) new_seqs = [] for seq in seq_iter: new_seqs.append(seq_trimmer(seq)) seq = new_seqs[0].seq assert seqrec1.description == new_seqs[0].description assert seq.startswith("CAGATCAGATCAGCATCAGCAT") assert seq.endswith("CGAGATCAGCAGCATCAGC") assert len(new_seqs) == 2 assert new_seqs[1].description == "desc2" # now we test the sequence with adaptors vector_fpath = os.path.join(TEST_DATA_DIR, "lucy", "icugi_vector.fasta") splice_fpath = os.path.join(TEST_DATA_DIR, "lucy", "icugi_splice.fasta") parameters = {"vector": [vector_fpath, splice_fpath], "bracket": [10, 0.02]} lucy_striper = create_striper_by_quality_lucy(parameters) seq_fhand = open(os.path.join(TEST_DATA_DIR, "lucy", "seq_with_adaptor1.fastq")) seq_iter = lucy_striper(seqs_in_file(seq_fhand, format="fastq")) new_seqs = [] for seq in seq_iter: new_seqs.append(seq_trimmer(seq))
def test_trim_seq_by_qual_trimpoly(self): "It test trimpoly but with trim low quality parameters" seq = "ATCGATCTGATCTAGTCGATGTCTAGCTGAGCTACATAGCTAACGATCTAGTCTAGTCTATG" seq += "TCATGTCATGTCGATGTCTAGTCTAGTCTAGTGAGTCACTGACTAGATCATGACATCGANNN" seq += "NNNNNNNNNNNNNNNNNNTACTAGTC" seq = Seq(seq) qual = [10] * 150 desc = "hola" seq1 = SeqWithQuality(seq=seq, qual=qual, description=desc) strip_seq_by_quality_trimpoly = create_striper_by_quality_trimpoly() trimmed_seq = strip_seq_by_quality_trimpoly(seq1) sequence_trimmer = create_seq_trim_and_masker() trimmed_seq = sequence_trimmer(trimmed_seq) # It does not mask anything with this example, but it check that # if works assert trimmed_seq.seq.endswith("ATGACATCGA") # another seq with the problem at the begining seq = "TNNNNNNAGGGCTTTCCTGACAGCTANNNNNTTTGCGGGCAACATCCAGAACAAGCACCG" seq += "GCAGATTGGCAATGCCGTGCCCCCGCCTCTTGCCTATGCACTTGGGAGGAAGCTGAAGGA" seq += "AGCCGTTGACAAGCGTCAGGAAGCCAGCGCAGGCGTGCCTGCACCATGAGAAGTTTTCCT" seq = Seq(seq) seq1 = SeqWithQuality(seq=seq) strip_seq_by_quality_trimpoly = create_striper_by_quality_trimpoly() trimmed_seq = strip_seq_by_quality_trimpoly(seq1) trimmed_seq = sequence_trimmer(trimmed_seq) str_seq = str(trimmed_seq.seq) assert str_seq.startswith("TTTGCGGGCAACA") assert str_seq.endswith("GAGAAGTTTTCCT") seq = "TGACATCGAACCTCGGCGCCGAGCACCTCCTCGCTGGGATGGTGGGCAAGAACTCCATGA" seq += "AGGTCGCTCGCGATCTGGTCATGCAGGAGGTGAGGAGGCACTTCCGCCCTGAGCTGCTGA" seq += "ACCGTCTCGACGAGATCGTGATCTTCGATCCTCTGTCCCACGAGCAGCTGAGGAAGGTCG" seq += "CTCGCCTTCAGATGAAGGATGTGGCCGTCCGTCTTGCCGAANNNNNCATCGCTCTGGCTG" seq += "TGACCGANNNNNCATTGGACATCATCTTGTCTCTCTCTNNNNNNTCNNNNT" seq1 = SeqWithQuality(seq=Seq(seq)) strip_seq_by_quality_trimpoly = create_striper_by_quality_trimpoly() trimmed_seq = strip_seq_by_quality_trimpoly(seq1) trimmed_seq = sequence_trimmer(trimmed_seq) str_seq = str(trimmed_seq.seq) assert str_seq.startswith("TGACATCGAA") assert str_seq.endswith("TCTTGCCGAA") seq = "TACGGCCGGGGTNNCNNANNNNGCATTCTCGCAGGGTCTTTCTACACTATTAGATAAGAT" seq += "GGATCCTTCTCAGAGAGTGAAGTTTGTTCAGGAAGTCAAGAAGGTTCTTGGATGATGATA" seq += "TGATACCAACACATCCAACACAATATGCGCATGCTACATGTTATTTTTCAAGTACATACA" seq += "TAGAAGGATATTGCTTGGCCTTGATTGATCATGTCTGATCTAAGTCGATCATTATTTTCT" seq += "TGAAACTTCCTTTCGGACGTGGTGCTATGGTTGATGAATTTGGATGTGTGCGTTCTGCCA" seq += "GGTGTAAGCCCAAAGGTTTATACAGACCGAGTTAAGGTTAGGAAGAGCACGAGTGAACTT" seq1 = SeqWithQuality(seq=Seq(seq)) strip_seq_by_quality_trimpoly = create_striper_by_quality_trimpoly() trimmed_seq = strip_seq_by_quality_trimpoly(seq1) trimmed_seq = sequence_trimmer(trimmed_seq) str_seq = str(trimmed_seq.seq) assert str_seq.startswith("GCATTCTCGCAG") assert str_seq.endswith("GTGAACTT")
def test_trim_seq_by_qual_trimpoly(self): 'It test trimpoly but with trim low quality parameters' seq = 'ATCGATCTGATCTAGTCGATGTCTAGCTGAGCTACATAGCTAACGATCTAGTCTAGTCTATG' seq += 'TCATGTCATGTCGATGTCTAGTCTAGTCTAGTGAGTCACTGACTAGATCATGACATCGANNN' seq += 'NNNNNNNNNNNNNNNNNNTACTAGTC' seq = Seq(seq) qual = [10] * 150 desc = 'hola' seq1 = SeqWithQuality(seq=seq, qual=qual, description=desc) strip_seq_by_quality_trimpoly = create_striper_by_quality_trimpoly() trimmed_seq = strip_seq_by_quality_trimpoly(seq1) sequence_trimmer = create_seq_trim_and_masker() trimmed_seq = sequence_trimmer(trimmed_seq) # It does not mask anything with this example, but it check that # if works assert trimmed_seq.seq.endswith('ATGACATCGA') #another seq with the problem at the begining seq = 'TNNNNNNAGGGCTTTCCTGACAGCTANNNNNTTTGCGGGCAACATCCAGAACAAGCACCG' seq += 'GCAGATTGGCAATGCCGTGCCCCCGCCTCTTGCCTATGCACTTGGGAGGAAGCTGAAGGA' seq += 'AGCCGTTGACAAGCGTCAGGAAGCCAGCGCAGGCGTGCCTGCACCATGAGAAGTTTTCCT' seq = Seq(seq) seq1 = SeqWithQuality(seq=seq) strip_seq_by_quality_trimpoly = create_striper_by_quality_trimpoly() trimmed_seq = strip_seq_by_quality_trimpoly(seq1) trimmed_seq = sequence_trimmer(trimmed_seq) str_seq = str(trimmed_seq.seq) assert str_seq.startswith('TTTGCGGGCAACA') assert str_seq.endswith('GAGAAGTTTTCCT') seq = 'TGACATCGAACCTCGGCGCCGAGCACCTCCTCGCTGGGATGGTGGGCAAGAACTCCATGA' seq += 'AGGTCGCTCGCGATCTGGTCATGCAGGAGGTGAGGAGGCACTTCCGCCCTGAGCTGCTGA' seq += 'ACCGTCTCGACGAGATCGTGATCTTCGATCCTCTGTCCCACGAGCAGCTGAGGAAGGTCG' seq += 'CTCGCCTTCAGATGAAGGATGTGGCCGTCCGTCTTGCCGAANNNNNCATCGCTCTGGCTG' seq += 'TGACCGANNNNNCATTGGACATCATCTTGTCTCTCTCTNNNNNNTCNNNNT' seq1 = SeqWithQuality(seq=Seq(seq)) strip_seq_by_quality_trimpoly = create_striper_by_quality_trimpoly() trimmed_seq = strip_seq_by_quality_trimpoly(seq1) trimmed_seq = sequence_trimmer(trimmed_seq) str_seq = str(trimmed_seq.seq) assert str_seq.startswith('TGACATCGAA') assert str_seq.endswith('TCTTGCCGAA') seq = 'TACGGCCGGGGTNNCNNANNNNGCATTCTCGCAGGGTCTTTCTACACTATTAGATAAGAT' seq += 'GGATCCTTCTCAGAGAGTGAAGTTTGTTCAGGAAGTCAAGAAGGTTCTTGGATGATGATA' seq += 'TGATACCAACACATCCAACACAATATGCGCATGCTACATGTTATTTTTCAAGTACATACA' seq += 'TAGAAGGATATTGCTTGGCCTTGATTGATCATGTCTGATCTAAGTCGATCATTATTTTCT' seq += 'TGAAACTTCCTTTCGGACGTGGTGCTATGGTTGATGAATTTGGATGTGTGCGTTCTGCCA' seq += 'GGTGTAAGCCCAAAGGTTTATACAGACCGAGTTAAGGTTAGGAAGAGCACGAGTGAACTT' seq1 = SeqWithQuality(seq=Seq(seq)) strip_seq_by_quality_trimpoly = create_striper_by_quality_trimpoly() trimmed_seq = strip_seq_by_quality_trimpoly(seq1) trimmed_seq = sequence_trimmer(trimmed_seq) str_seq = str(trimmed_seq.seq) assert str_seq.startswith('GCATTCTCGCAG') assert str_seq.endswith('GTGAACTT')
def test_edge_stripper(): 'It test if we remove edges of the seq' seq1 = 'gggtctcatcatcaggg' seq = SeqWithQuality(Seq(seq1), qual=[30] * len(seq1)) edge_stripperr = create_edge_stripper(left_length=3, right_length=3) sequence_trimmer = create_seq_trim_and_masker() seq = edge_stripperr(seq) seq = sequence_trimmer(seq) assert seq.seq == 'tctcatcatca'
def test_mask_polya(): "It test mask_polyA function" seq = "TCGCATCGATCATCGCAGATCGACTGATCGATCGATCAAAAAAAAAAAAAAAAAAAAAAA" seq = Seq(seq) seq1 = SeqWithQuality(seq=seq, description="hola") mask_polya = create_masker_for_polia() masked_seq = mask_polya(seq1) sequence_trimmer = create_seq_trim_and_masker() masked_seq = sequence_trimmer(masked_seq) exp_seq = "TCGCATCGATCATCGCAGATCGACTGATCGATCGATCaaaaaaaaaaaaaaaaaaaaaaa" assert masked_seq.seq == exp_seq assert masked_seq.description == "hola"
def test_strip_short_adaptors(): "It tests the short adaptor removal with J. Forment sequences" seq = "CgCGTGTCTCTAgATAGGGACAGTAGGAATCTCGTTAATCCATTCATGCGCGTCACTAATTAG" seq += "ATGACGAGGCATTTGGCTACCTTAAGAGAGTCATAGTTACTCCCGCCGTTTA" seq = Seq(seq) seq = SeqWithQuality(name="seq", seq=seq) seq_trimmer = create_seq_trim_and_masker() strip_adap = create_re_word_striper(words=["CGTGTCTCTA", "TATATATA"]) clean_seq = strip_adap(seq) clean_seq = seq_trimmer(clean_seq) assert str(clean_seq.seq).startswith("gATAGGGACAGTAGGAATCTCGTTAATC") # two words # 000000000011111111112222222222333333333344444444 # 012345678901234567890123456789012345678901234567 seq = "CgCGTGTCTCTAgATAGGGACAGTAGGAATTTTTTTcCGTGTCTCTAc" seq = SeqWithQuality(name="seq", seq=Seq(seq)) clean_seq = strip_adap(seq) clean_seq = seq_trimmer(clean_seq) assert str(clean_seq.seq) == "gATAGGGACAGTAGGAATTTTTTTc" # We don't want the first region even if it's the longest one seq = "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCgCGTGTCTCTAgC" seq = SeqWithQuality(name="seq", seq=Seq(seq)) clean_seq = strip_adap(seq) clean_seq = seq_trimmer(clean_seq) assert str(clean_seq.seq) == "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCg" # Everything works if the adaptors are at the beginning and ends seq = "CGTGTCTCTAcccccccccccccccTATATATAggggggggggCGTGTCTCTA" seq = SeqWithQuality(name="seq", seq=Seq(seq)) clean_seq = strip_adap(seq) clean_seq = seq_trimmer(clean_seq) assert str(clean_seq.seq) == "ccccccccccccccc" # It test if we remove words from the beginning of the seq word = "ATAT" seq2 = "tctcatcatca" seq1 = word + seq2 + word word = "^" + word seq1 = Seq(seq1) seq = SeqWithQuality(seq1, qual=[30] * len(seq1)) remover = create_re_word_striper([word]) seq = remover(seq) seq = seq_trimmer(seq) assert seq.seq == seq2
def test_strip_short_adaptors(): 'It tests the short adaptor removal with J. Forment sequences' seq = 'CgCGTGTCTCTAgATAGGGACAGTAGGAATCTCGTTAATCCATTCATGCGCGTCACTAATTAG' seq += 'ATGACGAGGCATTTGGCTACCTTAAGAGAGTCATAGTTACTCCCGCCGTTTA' seq = Seq(seq) seq = SeqWithQuality(name='seq', seq=seq) seq_trimmer = create_seq_trim_and_masker() strip_adap = create_re_word_striper(words=['CGTGTCTCTA', 'TATATATA']) clean_seq = strip_adap(seq) clean_seq = seq_trimmer(clean_seq) assert str(clean_seq.seq).startswith('gATAGGGACAGTAGGAATCTCGTTAATC') #two words # 000000000011111111112222222222333333333344444444 # 012345678901234567890123456789012345678901234567 seq = 'CgCGTGTCTCTAgATAGGGACAGTAGGAATTTTTTTcCGTGTCTCTAc' seq = SeqWithQuality(name='seq', seq=Seq(seq)) clean_seq = strip_adap(seq) clean_seq = seq_trimmer(clean_seq) assert str(clean_seq.seq) == 'gATAGGGACAGTAGGAATTTTTTTc' #We don't want the first region even if it's the longest one seq = 'CCCCCCCCCCCCCCCCCCCCCCCCCCCCCgCGTGTCTCTAgC' seq = SeqWithQuality(name='seq', seq=Seq(seq)) clean_seq = strip_adap(seq) clean_seq = seq_trimmer(clean_seq) assert str(clean_seq.seq) == 'CCCCCCCCCCCCCCCCCCCCCCCCCCCCCg' #Everything works if the adaptors are at the beginning and ends seq = 'CGTGTCTCTAcccccccccccccccTATATATAggggggggggCGTGTCTCTA' seq = SeqWithQuality(name='seq', seq=Seq(seq)) clean_seq = strip_adap(seq) clean_seq = seq_trimmer(clean_seq) assert str(clean_seq.seq) == 'ccccccccccccccc' #It test if we remove words from the beginning of the seq word = 'ATAT' seq2 = 'tctcatcatca' seq1 = word + seq2 + word word = '^' + word seq1 = Seq(seq1) seq = SeqWithQuality(seq1, qual=[30] * len(seq1)) remover = create_re_word_striper([word]) seq = remover(seq) seq = seq_trimmer(seq) assert seq.seq == seq2
def test_strip_vector_align_blast(): "It tests strip_vector_by_alignment using blast and UniVec" vector = os.path.join(TEST_DATA_DIR, "blast", "univec+") vec1 = "CTCGGGCCGTCTCTTGGGCTTGATCGGCCTTCTTGCGCATCTCACGCGCTCCTGCGGCGGCC" vec1 += "TGTAGGGCAGGCTCATACCCCTGCCGAACCGCTTTTGTCAGCCGGTCGGCCACGGCTTCCGG" vec1 += "CGTCTCAACGCGCTTT" seq1 = "ATGCATCAGATGCATGCATGACTACGACTACGATCAGCATCAGCGATCAGCATCGATACGATC" seq = SeqWithQuality(name="seq", seq=Seq(seq1 + vec1)) seq_trimmer = create_seq_trim_and_masker() strip_vector_by_alignment = create_vector_striper(vector, "blastn") striped_seq = strip_vector_by_alignment(seq) striped_seq = seq_trimmer(striped_seq) striped_seq = str(striped_seq.seq) assert seq1[4:14] in striped_seq assert seq1[-14:-4] in striped_seq assert vec1[4:14] not in striped_seq assert vec1[-14:-4] not in striped_seq
def test_strip_vector_align_blast(): 'It tests strip_vector_by_alignment using blast and UniVec' vector = os.path.join(TEST_DATA_DIR, 'blast', 'univec+') vec1 = 'CTCGGGCCGTCTCTTGGGCTTGATCGGCCTTCTTGCGCATCTCACGCGCTCCTGCGGCGGCC' vec1 += 'TGTAGGGCAGGCTCATACCCCTGCCGAACCGCTTTTGTCAGCCGGTCGGCCACGGCTTCCGG' vec1 += 'CGTCTCAACGCGCTTT' seq1 = 'ATGCATCAGATGCATGCATGACTACGACTACGATCAGCATCAGCGATCAGCATCGATACGATC' seq = SeqWithQuality(name='seq', seq=Seq(seq1+vec1)) seq_trimmer = create_seq_trim_and_masker() strip_vector_by_alignment = create_vector_striper(vector, 'blastn') striped_seq = strip_vector_by_alignment(seq) striped_seq = seq_trimmer(striped_seq) striped_seq = str(striped_seq.seq) assert seq1[4:14] in striped_seq assert seq1[-14:-4] in striped_seq assert vec1[4:14] not in striped_seq assert vec1[-14:-4] not in striped_seq
def test_edge_stripper(): "It test if we remove edges of the seq" seq1 = "gggtctcatcatcaggg" seq = SeqWithQuality(Seq(seq1), qual=[30] * len(seq1)) edge_stripperr = create_edge_stripper(left_length=3, right_length=3) sequence_trimmer = create_seq_trim_and_masker() seq = edge_stripperr(seq) seq = sequence_trimmer(seq) assert seq.seq == "tctcatcatca" seq1 = "gggtctcatcatcaggg" seq = SeqWithQuality(Seq(seq1), qual=[30] * len(seq1)) edge_stripper = create_edge_stripper(left_length=1, right_length=2) seq = edge_stripper(seq) seq = sequence_trimmer(seq) assert seq.seq == "ggtctcatcatcag" seq1 = "gggtctcatcatcaggg" seq = SeqWithQuality(Seq(seq1), qual=[30] * len(seq1)) edge_stripper = create_edge_stripper(left_length=10, right_length=0) seq = edge_stripper(seq) seq = sequence_trimmer(seq) assert seq.seq == "atcaggg"
def test_strip_seq_by_quality(): "test trim_seq_by_quality " seq = "ataataataata" qual = [0] * len(seq) desc = "hola" seq1 = SeqWithQuality(qual=qual, seq=Seq(seq), description=desc) strip_seq_by_quality = create_striper_by_quality(quality_treshold=40, min_seq_length=2, min_quality_bases=3) seq_trimmer = create_seq_trim_and_masker() new_seq = strip_seq_by_quality(seq1) new_seq = seq_trimmer(new_seq) assert new_seq is None qual = [20, 20, 20, 60, 60, 60, 60, 60, 20, 20, 20, 20] seq = "ataataataata" desc = "hola" seq1 = SeqWithQuality(qual=qual, seq=Seq(seq), description=desc) strip_seq_by_quality = create_striper_by_quality(quality_treshold=40, min_seq_length=2, min_quality_bases=3) seq_trimmer = create_seq_trim_and_masker() new_seq = strip_seq_by_quality(seq1) new_seq = seq_trimmer(new_seq) assert new_seq.seq == "ataat" assert new_seq.description == desc qual = [60, 60, 60, 60, 60, 60, 60] seq = "ataataa" new_seq = strip_seq_by_quality(SeqWithQuality(qual=qual, seq=Seq(seq))) new_seq = seq_trimmer(new_seq) assert new_seq.seq == "ataataa" qual = [60, 60, 60, 60, 60, 60, 0] seq = "ataataa" new_seq = strip_seq_by_quality(SeqWithQuality(qual=qual, seq=Seq(seq))) new_seq = seq_trimmer(new_seq) assert new_seq.seq == "ataata" assert new_seq.qual == [60, 60, 60, 60, 60, 60] qual = [40, 18, 10, 40, 40, 5, 8, 30, 14, 3, 40, 40, 40, 11, 6, 5, 3, 20, 10, 12, 8, 5, 4, 7, 1] seq = "atatatatagatagatagatagatg" strip_seq_by_quality = create_striper_by_quality( quality_treshold=20, min_seq_length=2, min_quality_bases=3, quality_window_width=2 ) new_seq = strip_seq_by_quality(SeqWithQuality(qual=qual, seq=Seq(seq))) new_seq = seq_trimmer(new_seq) assert new_seq.qual == [40, 18, 10, 40, 40, 5, 8, 30, 14, 3, 40, 40, 40, 11] qual = [40, 40, 13, 11, 40, 9, 40, 4, 27, 38, 40, 4, 11, 40, 40, 10, 10, 21, 3, 40, 9, 9, 12, 10, 9] seq = "atatatatatatatatatatatata" strip_seq_by_quality = create_striper_by_quality( quality_treshold=20, min_seq_length=2, min_quality_bases=3, quality_window_width=1 ) new_seq = strip_seq_by_quality(SeqWithQuality(qual=qual, seq=Seq(seq))) new_seq = seq_trimmer(new_seq) assert new_seq.qual == [40, 40, 13, 11, 40, 9, 40, 4, 27, 38, 40] # remove only from 3' qual = [ 1, 1, 1, 1, 1, 1, 1, 4, 27, 38, 40, 4, 11, 40, 40, 40, 40, 40, 40, 40, 10, 10, 21, 3, 40, 9, 9, 12, 10, 9, ] seq = "atatatatatatatataaaaaatatatata" strip_seq_by_quality = create_striper_by_quality( quality_treshold=20, min_seq_length=2, min_quality_bases=3, quality_window_width=1, only_3_end=True ) new_seq = strip_seq_by_quality(SeqWithQuality(qual=qual, seq=Seq(seq))) new_seq = seq_trimmer(new_seq) qual = new_seq.qual assert qual[0] == 1 assert len(qual) == 20 # some solid reads reads = """@8_19_812_F3 NACGATACGCTATGGGGAATGGCGAAAAAAGGGAAGGGAACTCACAGGA + !>=:BB<:>:A<A+5@?=6BA%<>AB9?<@B=1@BB7='A5@A@;98-2 """ reads_fhand = tempfile.NamedTemporaryFile() reads_fhand.write(reads) seq = list(seqs_in_file(reads_fhand, format="fastq"))[0] new_seq = strip_seq_by_quality(seq) new_seq = seq_trimmer(new_seq) assert len(new_seq) == 46
def test_sequence_stripper(): "It can cut using trimming recommendations" seq1 = "gggtctcatcatcaggg".upper() seq = SeqWithQuality(Seq(seq1), qual=[30] * len(seq1), annotations={TRIMMING_RECOMMENDATIONS: {}}) trim_rec = seq.annotations[TRIMMING_RECOMMENDATIONS] seq_trimmer = create_seq_trim_and_masker() trim_rec["vector"] = [(0, 3), (8, 12)] seq2 = seq_trimmer(seq) assert str(seq2.seq) == "CTCA" trim_rec["vector"] = [(0, 0), (8, 12)] seq2 = seq_trimmer(seq) assert str(seq2.seq) == "GGTCTCA" trim_rec["vector"] = [(0, 0), (8, 12)] trim_rec["quality"] = [] trim_rec["mask"] = [(0, 3), (5, 6)] seq2 = seq_trimmer(seq) assert seq2.seq == "ggtCtcA" assert "vector" not in trim_rec assert "quality" not in trim_rec assert trim_rec["mask"] == [(0, 2), (4, 5)] trim_rec["vector"] = [(0, 1), (8, 12)] trim_rec["quality"] = [(1, 8), (13, 17)] seq2 = seq_trimmer(seq) assert seq2 is None seq_trimmer = create_seq_trim_and_masker(mask=False) trim_rec["vector"] = [(0, 0), (8, 12)] trim_rec["quality"] = [] trim_rec["mask"] = [(0, 3), (5, 6)] seq2 = seq_trimmer(seq) assert seq2.seq == "GGTCTCA" assert seq2.annotations == {"trimming_recommendations": {"mask": [(0, 2), (4, 5)]}} seq_trimmer = create_seq_trim_and_masker() seq3 = seq_trimmer(seq2) assert seq3.seq == "ggtCtcA" # masking the regions to trim seq_trimmer = create_seq_trim_and_masker(mask=True, trim_as_mask=True) trim_rec["vector"] = [(0, 2), (8, 12)] trim_rec["quality"] = [] trim_rec["mask"] = [(0, 1), (5, 6)] seq2 = seq_trimmer(seq) assert seq2.seq == "gggTCtcAtcatcaggg" assert seq2.annotations == {"trimming_recommendations": {"mask": [(0, 1), (5, 6)]}} # keep 5segment = False seq_trimmer = create_seq_trim_and_masker() trim_rec["vector"] = [(3, 5)] trim_rec["quality"] = [] trim_rec["mask"] = [] seq2 = seq_trimmer(seq) assert seq2.seq == "CATCATCAGGG" # keep 5segment = True seq_trimmer = create_seq_trim_and_masker(keep_5segment=True) trim_rec["vector"] = [(3, 5)] trim_rec["quality"] = [] trim_rec["mask"] = [] seq2 = seq_trimmer(seq) assert seq2.seq == "GGG" # masking the regions to trim seq_trimmer = create_seq_trim_and_masker(mask=True, trim_as_mask=True, keep_5segment=True) trim_rec["vector"] = [(0, 2), (8, 12)] trim_rec["quality"] = [] trim_rec["mask"] = [(0, 1), (5, 6)] seq2 = seq_trimmer(seq) assert seq2.seq == seq.seq.lower() assert seq2.annotations == {"trimming_recommendations": {"mask": [(0, 1), (5, 6)]}} seq_trimmer = create_seq_trim_and_masker(keep_5segment=True) trim_rec["vector"] = [(0, 2), (8, 12)] trim_rec["quality"] = [] trim_rec["mask"] = [(0, 1), (5, 6)] assert None == seq_trimmer(seq)
def test_strip_seq_by_quality_lucy(): 'It tests strip_seq_by_quality_lucy2' seq = 'ATCGATCAGTCAGACTGACAGACTCAGATCAGATCAGCATCAGCATACGATACGCATCAGACT' seq += 'ACGATCGATCGATCGACAGATCATCGATCATCGACGACTAGACGATCATCGATACGCAGACTC' seq += 'AGCAGACTACGAGATCAGCAGCATCAGCAGCA' qual = '00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ' qual += '00 00 00 00 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 ' qual += '60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 ' qual += '60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 ' qual += '60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 ' qual += '60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 ' qual += '60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 ' qual += '60 60 60 60 60 60 60 60 60 60 60 60 60 60 00 00 00 00' qual = qual.split() seq = Seq(seq) seqrec1 = SeqWithQuality(name='seq1', seq=seq, qual=qual, description ='desc1') qual = '40 40 40 37 40 40 37 37 37 37 37 37 37 37 40 42 42 42 44 44 ' qual += '56 56 42 40 40 40 40 36 36 28 35 32 35 35 40 42 37 37 35 37 ' qual += '32 35 35 35 35 35 35 38 33 33 24 33 33 42 33 35 35 35 35 33 ' qual += '36 30 30 24 29 29 35 35 35 35 29 29 29 35 38 38 38 37 35 33 ' qual += '29 35 35 34 30 30 30 33 29 31 31 29 29 29 28 28 24 21 16 16 ' qual += '21 24 29 29 32 40 27 27 25 25 21 30 27 28 28 32 23 23 21 24 ' qual += '24 17 18 19 21 15 19 11 9 9 11 23 17 15 10 10 10 20 27 25 23 ' qual += '18 22 23 24 18 10 10 13 13 18 19 10 12 12 18 16 14 10 10 11 ' qual += '16 13 21 19 31 19 27 27 28 26 29 25 25 20 19 23 28 28 19 20 ' qual += '13 9 9 9 9 9 17 15 21 17 14 12 21 17 19 24 28 24 23 ' quality = qual.split() seq = 'ATCGATCAGTCAGACTGACAGACTCAGATCAGATCAGCATCAGCATACGATACGCATCAGACT' seq += 'ACGATCGATCGATCGACAGATCATCGATCATCGACGACTAGACGATCATCGATACGCAGACTC' seq += 'AGCAGACTACGAGATCAGCAGCATCAGCAGCAAGCAGACTACGAGATCAGCAGCATCAGCAGC' seq += 'ATTACGATGAT' seq = Seq(seq) seqrec2 = SeqWithQuality(seq=seq, qual=quality, name='seq2', description ='desc2') seq_iter = iter([seqrec1, seqrec2]) seq_trimmer = create_seq_trim_and_masker() lucy_striper = create_striper_by_quality_lucy() #pylint:disable-msg=W0612 seq_iter = lucy_striper(seq_iter) new_seqs = [] for seq in seq_iter: new_seqs.append(seq_trimmer(seq)) seq = new_seqs[0].seq assert seqrec1.description == new_seqs[0].description assert seq.startswith('CAGATCAGATCAGCATCAGCAT') assert seq.endswith('CGAGATCAGCAGCATCAGC') assert len(new_seqs) == 2 assert new_seqs[1].description == 'desc2' # now we test the sequence with adaptors vector_fpath = os.path.join(TEST_DATA_DIR, 'lucy', 'icugi_vector.fasta') splice_fpath = os.path.join(TEST_DATA_DIR, 'lucy', 'icugi_splice.fasta') parameters = {'vector':[vector_fpath, splice_fpath], 'bracket':[10, 0.02]} lucy_striper = create_striper_by_quality_lucy(parameters) seq_fhand = open(os.path.join(TEST_DATA_DIR, 'lucy', 'seq_with_adaptor1.fastq')) seq_iter = lucy_striper(seqs_in_file(seq_fhand, format='fastq')) new_seqs = [] for seq in seq_iter: new_seqs.append(seq_trimmer(seq))
def test_strip_adaptor_blast(self): "It tests strip_vector_by_alignment with blastn-short" vec1 = SeqWithQuality(name="vec1", seq=Seq("atcgatcgatagcatacgat")) vec2 = SeqWithQuality(name="vec2", seq=Seq("atgcatcagatcgataaaga")) fhand_vectors = temp_fasta_file([vec1, vec2]) seq_trimmer = create_seq_trim_and_masker() strip_vector_by_alignment = create_adaptor_striper(fhand_vectors) seq = "ATGCATCAGATGCATGCATGACTACGACTACGATCAGCATCAGCGATCAGCATCGATACGATC" seq = Seq(seq) seq2 = SeqWithQuality(name="seq1", seq=seq) seq1 = SeqWithQuality(name=seq2.name, seq=vec1.seq + seq2.seq + vec2.seq, description="hola") seq3 = strip_vector_by_alignment(seq1) seq3 = seq_trimmer(seq3) assert str(seq2.seq) == str(seq3.seq) assert seq3.description == "hola" fhand_vectors.seek(0) seq1 = SeqWithQuality(name=seq2.name, seq=vec1.seq + vec2.seq + seq2.seq) seq3 = strip_vector_by_alignment(seq1) seq3 = seq_trimmer(seq3) assert str(seq2.seq) == str(seq3.seq) # overlaping vectors fhand_vectors.seek(0) new_seq = vec1.seq[:-2] + vec2.seq + seq2.seq + vec2.seq seq1 = SeqWithQuality(name=seq2.name, seq=new_seq) seq3 = strip_vector_by_alignment(seq1) seq3 = seq_trimmer(seq3) assert str(seq2.seq) == str(seq3.seq) # Now only vectors fhand_vectors.seek(0) new_seq = vec1.seq + vec2.seq + vec2.seq seq1 = SeqWithQuality(name=seq2.name, seq=new_seq) seq3 = strip_vector_by_alignment(seq1) seq3 = seq_trimmer(seq3) assert seq3 is None # Now without vectors fhand_vectors.seek(0) seq1 = seq2 seq3 = strip_vector_by_alignment(seq1) seq3 = seq_trimmer(seq3) assert str(seq2.seq) == str(seq3.seq) fhand_vectors.seek(0) seq1 = SeqWithQuality(name=seq2.name, seq=vec1.seq[::-1] + vec2.seq + seq2.seq) seq3 = strip_vector_by_alignment(seq1) seq3 = seq_trimmer(seq3) assert str(seq2.seq) == str(seq3.seq) seq = "ATGCATCAGATGCATGCATGACTACGACTACGATCAGCATCAGCGATCAGCATCGATACGATC" seq = Seq(seq) seq2 = SeqWithQuality(name="seq1", seq=seq) # 'atcgatcgatagcatacgat atgcatcagatcgataaaga seq = "atcgatcgatagcataGgat" + seq2.seq + "atgGatcagatcgataaaga" seq1 = SeqWithQuality(name=seq2.name, seq=seq, description="hola") seq3 = strip_vector_by_alignment(seq1) seq3 = seq_trimmer(seq3) assert str(seq2.seq) == str(seq3.seq) long_adap = "atcgatcgatagcatacgatatcgatcgatagcatacgatatcgatcgatagcatacc" vec1 = SeqWithQuality(name="vec1", seq=Seq(long_adap)) fhand_vectors = temp_fasta_file([vec1]) try: create_adaptor_striper(fhand_vectors) self.fail("ValueError expected") except ValueError: pass