def record5():
    """Build the 'read5' test record.

    The read carries two overlapping interesting 7-mers, annotated at
    offsets 15 and 16 of the sequence.
    """
    kmer_specs = (
        ('CTGTCAA', 15, [12, 0, 0]),
        ('TGTCAAG', 16, [13, 0, 0]),
    )
    return screed.Record(
        name='read5',
        sequence='CTCTTCCGGCAGTCACTGTCAAGAGAGGGTGAACT',
        ikmers=[KmerOfInterest(*spec) for spec in kmer_specs],
    )
def record6():
    """Build the 'read6' test record.

    The read is annotated with the 7-mers 'CTGTCAA' and 'TGTCAAG' at
    offsets 3 and 4 of the sequence.
    """
    kmer_specs = (
        ('CTGTCAA', 3, [12, 0, 0]),
        ('TGTCAAG', 4, [13, 0, 0]),
    )
    return screed.Record(
        name='read6',
        sequence='TCACTGTCAAGAGAGGCCTACGGATTCGGTTACTG',
        ikmers=[KmerOfInterest(*spec) for spec in kmer_specs],
    )
def record2a():
    """Build a 'read2' test record in which one 5-mer occurs twice.

    The interesting k-mer 'CGCAA' is annotated at offsets 1 and 15 of the
    sequence, with the same abundances at each location.
    """
    seq = 'ACGCAAAGCTATTTACGCAA'
    # One KmerOfInterest per occurrence of the repeated k-mer.
    occurrences = [KmerOfInterest('CGCAA', pos, [15, 0, 0]) for pos in (1, 15)]
    return screed.Record(name='read2', sequence=seq, ikmers=occurrences)
def record4():
    """Build the 'read4' test record.

    Similar to record2 but with a single nucleotide mismatch.
    """
    kmer_specs = (
        ('CGCAA', 1, [15, 0, 0]),
        ('AAAAC', 14, [19, 1, 0]),
    )
    return screed.Record(
        name='read4',
        sequence='ACGCAATGCTATTTAAAACC',
        ikmers=[KmerOfInterest(*spec) for spec in kmer_specs],
    )
def record3():
    """Build the 'read3' test record.

    Reverse complement of record2.
    """
    kmer_specs = (
        ('GTTTT', 1, [19, 1, 0]),
        ('TTGCG', 14, [15, 0, 0]),
    )
    return screed.Record(
        name='read3',
        sequence='GGTTTTAAATAGCTTTGCGT',
        ikmers=[KmerOfInterest(*spec) for spec in kmer_specs],
    )
def record10():
    """Build the 'read10' test record.

    The 35 bp read is annotated with three interesting 17-mers at offsets
    4, 5 and 10.
    """
    kmer_specs = (
        ('TCCCCACCCGGATACTT', 4, [28, 0, 0]),
        ('CCCCACCCGGATACTTG', 5, [26, 0, 0]),
        ('CCCGGATACTTGAAGCA', 10, [21, 0, 0]),
    )
    return screed.Record(
        name='read10',
        sequence='CAGGTCCCCACCCGGATACTTGAAGCAGGCAGCCT',
        ikmers=[KmerOfInterest(*spec) for spec in kmer_specs],
    )
def record9():
    """Build the 'read9' test record.

    The sequence is the reverse complement of the record8 sequence, and the
    interesting k-mers are the reverse complements of record8's k-mers with
    matching abundances.

    Each KmerOfInterest pairs a 17-mer with the offset at which it actually
    occurs in the read: 'AGTTATCGCCTCACATA' begins at offset 42 and
    'GTTATCGCCTCACATAC' (the final 17 bases of the 60 bp read) at offset 43.
    """
    return screed.Record(
        name='read9',
        sequence=('AGCAAGGCGCTCGCGTCAACGAAGTGAGCTCCCGTGGTCTTGAGTTATCG'
                  'CCTCACATAC'),
        ikmers=[
            KmerOfInterest('AGCAAGGCGCTCGCGTC', 0, [25, 0, 0]),
            KmerOfInterest('GCAAGGCGCTCGCGTCA', 1, [39, 0, 0]),
            # The k-mer strings must agree with sequence[offset:offset + 17].
            KmerOfInterest('AGTTATCGCCTCACATA', 42, [15, 1, 1]),
            KmerOfInterest('GTTATCGCCTCACATAC', 43, [15, 1, 0]),
        ],
    )
def record8():
    """Build the 'read8' test record.

    The 60 bp read is annotated with four interesting 17-mers: two at the
    start (offsets 0 and 1) and two at the end (offsets 42 and 43).
    """
    seq = (
        'GTATGTGAGGCGATAACTCAAGACCACGGGAGCTCACTTCGTTGACGCGA'
        'GCGCCTTGCT'
    )
    kmer_specs = (
        ('GTATGTGAGGCGATAAC', 0, [15, 1, 0]),
        ('TATGTGAGGCGATAACT', 1, [15, 1, 1]),
        ('TGACGCGAGCGCCTTGC', 42, [39, 0, 0]),
        ('GACGCGAGCGCCTTGCT', 43, [25, 0, 0]),
    )
    return screed.Record(
        name='read8',
        sequence=seq,
        ikmers=[KmerOfInterest(*spec) for spec in kmer_specs],
    )
def picorecord3():
    """Build a 100 bp test record annotated with five interesting 25-mers.

    The k-mers are listed in descending offset order (11 down to 7).
    """
    seq = (
        'TATTGTTCACTCAGCCTTACTTTGGGAAACAAAAAAAAAACTAAGCTTTTGGATTACAG'
        'TTGGAAGTGAGGTCTCAGCCTGCACAAACGAATAAATGTAA'
    )
    kmer_specs = (
        ('CAGCCTTACTTTGGGAAACAAAAAA', 11, [17, 0, 0]),
        ('TCAGCCTTACTTTGGGAAACAAAAA', 10, [18, 0, 0]),
        ('CTCAGCCTTACTTTGGGAAACAAAA', 9, [18, 1, 0]),
        ('ACTCAGCCTTACTTTGGGAAACAAA', 8, [18, 1, 0]),
        ('CACTCAGCCTTACTTTGGGAAACAA', 7, [19, 0, 0]),
    )
    return screed.Record(
        name='seq1_901428_901847_3:0:0_0:0:0_87d/1',
        sequence=seq,
        ikmers=[KmerOfInterest(*spec) for spec in kmer_specs],
    )
def picorecord2():
    """Build a 100 bp test record annotated with five interesting 25-mers.

    The k-mers occupy consecutive offsets 64 through 68 of the sequence.
    """
    seq = (
        'TTACATTTATTCGTTTGTGCAGGCTGAGACCTCACTTCCAACTGTAATCCAAAAGCTTA'
        'GTTTTTTTTTTGTTTCCCAAAGTAAGGCTGAGTGAACAATA'
    )
    kmer_specs = (
        ('TTTTTTGTTTCCCAAAGTAAGGCTG', 64, [19, 0, 0]),
        ('TTTTTGTTTCCCAAAGTAAGGCTGA', 65, [18, 1, 0]),
        ('TTTTGTTTCCCAAAGTAAGGCTGAG', 66, [18, 1, 0]),
        ('TTTGTTTCCCAAAGTAAGGCTGAGT', 67, [18, 0, 0]),
        ('TTGTTTCCCAAAGTAAGGCTGAGTG', 68, [17, 0, 0]),
    )
    return screed.Record(
        name='seq1_901428_901847_3:0:0_0:0:0_87d/1',
        sequence=seq,
        ikmers=[KmerOfInterest(*spec) for spec in kmer_specs],
    )
def picorecord1():
    """Build a 100 bp test record annotated with five interesting 25-mers.

    The k-mers occupy consecutive offsets 5 through 9 of the sequence.
    """
    seq = (
        'GTTTTTTTTTTGTTTCCCAAAGTAAGGCTGAGTGAACAATATTTTCTCATAGTTTTGAC'
        'AAAAACAAAGGAATCCTTAGTTATTAAACTCGGGAGTTTGA'
    )
    kmer_specs = (
        ('TTTTTTGTTTCCCAAAGTAAGGCTG', 5, [19, 0, 0]),
        ('TTTTTGTTTCCCAAAGTAAGGCTGA', 6, [18, 1, 0]),
        ('TTTTGTTTCCCAAAGTAAGGCTGAG', 7, [18, 1, 0]),
        ('TTTGTTTCCCAAAGTAAGGCTGAGT', 8, [18, 0, 0]),
        ('TTGTTTCCCAAAGTAAGGCTGAGTG', 9, [17, 0, 0]),
    )
    return screed.Record(
        name='seq1_901350_901788_1:0:0_0:0:0_21ca1/2',
        sequence=seq,
        ikmers=[KmerOfInterest(*spec) for spec in kmer_specs],
    )
def test_ikmer_abund_after_recalc():
    """
    Check that validation recomputes interesting k-mer abundances.

    The read advertises an abundance of 28 for its interesting k-mer, while
    the count table (`counts`) has only seen the enclosing sequence 10 times.
    After the readset's "validate" step the stored abundance should reflect
    the count table rather than the advertised value.
    """
    # Populate the count table: 10 passes over a sequence containing the read.
    counts = khmer.Counttable(17, 1e5, 4)
    template = (
        'TTCGTTCCCGAAGCAGGGGTCTACATTGTCCTCGGGACTCGAGATTTCTTCGCTGTTCCGTCCTTCA'
    )
    for _ in range(10):
        counts.consume(template)

    ikmer = KmerOfInterest('CATTGTCCTCGGGACTC', 13, [28, 0, 0])
    read = screed.Record(
        name='read1',
        sequence='AAGCAGGGGTCTACATTGTCCTCGGGACTCGAGATTTCTTCGCTGT',
        ikmers=[ikmer],
    )

    readset = ReadSet()
    readset.add(read)
    # Before validation the advertised abundance is still in place.
    assert read.ikmers[0].abund[0] == 28

    readset.validate(counts, minabund=8)
    assert readset.valid == (1, 1)
    assert read.ikmers[0].abund[0] == 10
def record7():
    """Build the 'read7' test record.

    The 60 bp read is annotated with six interesting 17-mers: three near
    the start (offsets 4, 5, 10) and three near the end (offsets 38-40).
    """
    seq = (
        'CAGGTCCCCACCCGGATACTTGAAGCAGGCAGCCTCAAGGTATGTGAGGC'
        'GATAACTCAA'
    )
    kmer_specs = (
        ('TCCCCACCCGGATACTT', 4, [28, 0, 0]),
        ('CCCCACCCGGATACTTG', 5, [26, 0, 0]),
        ('CCCGGATACTTGAAGCA', 10, [21, 0, 0]),
        ('GGTATGTGAGGCGATAA', 38, [14, 0, 0]),
        ('GTATGTGAGGCGATAAC', 39, [15, 1, 0]),
        ('TATGTGAGGCGATAACT', 40, [15, 1, 1]),
    )
    return screed.Record(
        name='read7',
        sequence=seq,
        ikmers=[KmerOfInterest(*spec) for spec in kmer_specs],
    )
def record1():
    """Build the 'read1' test record with a single interesting 5-mer.

    The k-mer 'CGCAA' is annotated at offset 14 of the sequence.
    """
    only_kmer = KmerOfInterest('CGCAA', 14, [15, 0, 0])
    return screed.Record(
        name='read1',
        sequence='GCTGCACCGATGTACGCAAA',
        ikmers=[only_kmer],
    )
def record12():
    """Build the 'read12' test record with a single interesting 17-mer.

    The k-mer is annotated at offset 0, i.e. it is a prefix of the read.
    """
    # NOTE(review): the sequence contains one lowercase 'c' near its end;
    # presumably intentional -- confirm before normalizing case.
    seq = 'CCCGGATACTTGAAGCAGGCAcC'
    only_kmer = KmerOfInterest('CCCGGATACTTGAAGCA', 0, [21, 0, 0])
    return screed.Record(name='read12', sequence=seq, ikmers=[only_kmer])