def test_augfastx_writer(): output = StringIO() record = Record( name='BasiliscusVulgarisRead84467/1', sequence='TTAACTCTAGATTAGGGGCGTGACTTAATAAGGTGTGGGCCTAAGCGTCT', quality='BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB', annotations=[ KmerOfInterest(ksize=19, offset=13, abund=(12, 1, 1)), KmerOfInterest(ksize=19, offset=15, abund=(20, 0, 1)), ], ) kevlar.print_augmented_fastx(record, output) record = Record( name='BasiliscusVulgarisRead90577/2', sequence='CTGTAATCCCAGCACTTTGGGAGGCCGAGGCAAGCAGATGATGCGGTCAG', quality='BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB', annotations=[ KmerOfInterest(ksize=19, offset=1, abund=(5, 7, 9)), KmerOfInterest(ksize=19, offset=2, abund=(7, 10, 9)), ], mates=['CAGATGTGTCTTGTGGGCAGTGCAGCGGAGAGGTGCAAATATGGGTTTGG'] ) kevlar.print_augmented_fastx(record, output) record = Record( name='BasiliscusVulgarisRead99037/1', sequence='AGCACTTTGGGAGGCCGAGGCAAGCAGATGATGCGGTCAGGATTACAGAT', quality='BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB' ) kevlar.print_augmented_fastx(record, output) assert output.getvalue() == """@BasiliscusVulgarisRead84467/1
def dump(bamstream, refrseqs=None, pairmode='split', upint=50000,
         logstream=sys.stderr):
    """
    Stream reads from BAM/SAM alignments as `Record` objects (generator).

    The input is opened with pysam and walked with `bam_paired_reader`, which
    supplies two records per iteration. Each pair is handed to `keepers`, and
    every alignment that `keepers` yields is emitted as a `Record` carrying
    the read name, sequence, and quality string.

    - bamstream: BAM/SAM input, passed to `pysam.AlignmentFile` in 'rb' mode
    - refrseqs: dictionary of reference sequences, indexed by sequence ID; if
                provided, perfect matches to the reference sequence will be
                discarded (the filtering itself is delegated to `keepers`)
    - pairmode: forwarded unchanged to `keepers`
                NOTE(review): accepted values and their meaning are defined by
                `keepers`, which is not shown here -- confirm before relying
                on anything other than the default
    - upint: update interval (in pairs) for the progress indicator
    - logstream: file handle to which the progress indicator will write
    """
    bam = pysam.AlignmentFile(bamstream, 'rb')
    pairs = bam_paired_reader(bam)
    for count, (first, second) in enumerate(pairs, start=1):
        if not count % upint:  # pragma: no cover
            print('...processed', count, 'pairs of records', file=logstream)
        for kept in keepers(first, second, bam, refrseqs, pairmode):
            yield Record(
                name=readname(kept),
                sequence=kept.seq,
                quality=kept.qual,
            )
def revcom(self):
    """
    Build the reverse-complement counterpart of this read/k-mer pairing.

    The read sequence is reverse complemented and the k-mer annotation is
    re-anchored at `len(sequence) - offset - ksize`, keeping the same k-mer
    size and abundances. The new read's `ikmers` mapping points both the
    original k-mer sequence and its reverse complement at that single
    annotation. The returned `ReadWithKmer` is still keyed on the original
    (not reverse-complemented) k-mer sequence.
    """
    rcseq = kevlar.revcom(self.read.sequence)
    rckmerseq = kevlar.revcom(self.kmerseq)

    # Offset of the same k-mer when counted from the start of the opposite
    # strand.
    flipped = len(rcseq) - self.kmer.offset - self.kmer.ksize
    annot = KmerOfInterest(self.kmer.ksize, flipped, self.kmer.abund)

    # Both orientations of the k-mer resolve to the one annotation object.
    lookup = dict.fromkeys((self.kmerseq, rckmerseq), annot)
    rcread = Record(
        self.read.name, rcseq, annotations=[annot], ikmers=lookup
    )
    return ReadWithKmer(rcread, self.kmerseq)
def record5():
    """Return the 'read5' record, annotated with 7-mers at offsets 15 and 16.

    Each annotation is built as KmerOfInterest(ksize, offset, abundances).
    """
    kmers = [
        KmerOfInterest(7, offset, abund)
        for offset, abund in ((15, [12, 0, 0]), (16, [13, 0, 0]))
    ]
    return Record(
        name='read5',
        sequence='CTCTTCCGGCAGTCACTGTCAAGAGAGGGTGAACT',
        annotations=kmers,
    )
def record2a():
    """Return the 'read2' record, annotated with 5-mers at offsets 1 and 15.

    Both annotations carry the same abundances, [15, 0, 0]; a fresh list is
    created for each one.
    """
    kmers = [KmerOfInterest(5, offset, [15, 0, 0]) for offset in (1, 15)]
    return Record(
        name='read2',
        sequence='ACGCAAAGCTATTTACGCAA',
        annotations=kmers,
    )
def record6():
    """Return the 'read6' record, annotated with 7-mers at offsets 3 and 4.

    Each annotation is built as KmerOfInterest(ksize, offset, abundances).
    """
    kmers = [
        KmerOfInterest(7, offset, abund)
        for offset, abund in ((3, [12, 0, 0]), (4, [13, 0, 0]))
    ]
    return Record(
        name='read6',
        sequence='TCACTGTCAAGAGAGGCCTACGGATTCGGTTACTG',
        annotations=kmers,
    )
def record4():
    """Return the 'read4' record.

    Similar to record2 but with a single nucleotide mismatch. Annotated with
    5-mers at offsets 1 and 14.
    """
    kmers = [
        KmerOfInterest(5, offset, abund)
        for offset, abund in ((1, [15, 0, 0]), (14, [19, 1, 0]))
    ]
    return Record(
        name='read4',
        sequence='ACGCAATGCTATTTAAAACC',
        annotations=kmers,
    )
def record3fix():
    """Return the 'read3' record.

    Reverse complement of record2. Annotated with 5-mers at offsets 1 and 14.
    """
    kmers = [
        KmerOfInterest(5, offset, abund)
        for offset, abund in ((1, [19, 1, 0]), (14, [15, 0, 0]))
    ]
    return Record(
        name='read3',
        sequence='GGTTTTAAATAGCTTTGCGT',
        annotations=kmers,
    )
def mutate_genome(infile, mutations):
    """
    Yield every sequence in `infile`, applying any mutations registered for it.

    - infile: path handed to `kevlar.open` and parsed with
              `parse_augmented_fastx`
    - mutations: mapping from record name to a collection of mutation objects,
                 each exposing a `pos` attribute

    Records whose name is absent from `mutations` are passed through with
    their sequence untouched. Otherwise the record's mutations are ordered by
    descending `pos` before being given to `mutate_sequence`.
    NOTE(review): presumably the descending order keeps earlier coordinates
    valid while later ones are edited -- confirm against `mutate_sequence`.

    Only the name and (possibly mutated) sequence are carried into the
    yielded `Record`.
    """
    instream = kevlar.open(infile, 'r')
    for entry in parse_augmented_fastx(instream):
        newseq = entry.sequence
        if entry.name in mutations:
            ordered = sorted(
                mutations[entry.name],
                key=lambda m: m.pos,
                reverse=True,
            )
            newseq = mutate_sequence(newseq, ordered)
        yield Record(name=entry.name, sequence=newseq)
def main(args):
    """
    Entry point: localize contigs and write each resulting cutout as a record.

    Reads these attributes from `args`: `contigs` (input path), `out` (output
    path), `refr`, `seed_size`, `delta`, `max_diff`, and `logfile`. Contigs
    are parsed from `args.contigs` and passed to `localize`; every cutout it
    produces is written to `args.out` with the cutout's `defline` as the
    record name and its `sequence` as the record sequence.
    """
    instream = kevlar.open(args.contigs, 'r')
    contigstream = kevlar.parse_augmented_fastx(instream)
    outstream = kevlar.open(args.out, 'w')

    cutouts = localize(
        contigstream,
        args.refr,
        seedsize=args.seed_size,
        delta=args.delta,
        maxdiff=args.max_diff,
        logstream=args.logfile,
    )
    for cutout in cutouts:
        kevlar.sequence.write_record(
            Record(name=cutout.defline, sequence=cutout.sequence),
            outstream,
        )
def picorecord4():
    """Return the 'seqname' record, annotated with five 25-mers.

    The annotations are listed in descending offset order (8 down to 4), each
    paired with its own abundance list.
    """
    offsets = (8, 7, 6, 5, 4)
    abunds = ([17, 0, 0], [18, 0, 0], [18, 1, 0], [18, 1, 0], [19, 0, 0])
    kmers = [
        KmerOfInterest(25, offset, abund)
        for offset, abund in zip(offsets, abunds)
    ]
    seq = (
        'TGTTCACTCAGCCTTACTTTGGGAAACAAAAAAAAAACTAAGCTTTTGGATTACAGTTG'
        'GAAGTGAGGTCTCAGCCTGCACAAACGAATAAATG'
    )
    return Record(name='seqname', sequence=seq, annotations=kmers)
def picorecord3():
    """Return a record named 'seq1_901428_901847_3:0:0_0:0:0_87d/1'.

    It is annotated with five 25-mers, listed in descending offset order
    (11 down to 7), each paired with its own abundance list.
    """
    offsets = (11, 10, 9, 8, 7)
    abunds = ([17, 0, 0], [18, 0, 0], [18, 1, 0], [18, 1, 0], [19, 0, 0])
    kmers = [
        KmerOfInterest(25, offset, abund)
        for offset, abund in zip(offsets, abunds)
    ]
    seq = (
        'TATTGTTCACTCAGCCTTACTTTGGGAAACAAAAAAAAAACTAAGCTTTTGGATTACAG'
        'TTGGAAGTGAGGTCTCAGCCTGCACAAACGAATAAATGTAA'
    )
    return Record(
        name='seq1_901428_901847_3:0:0_0:0:0_87d/1',
        sequence=seq,
        annotations=kmers,
    )
def picorecord2():
    """Return a record named 'seq1_901428_901847_3:0:0_0:0:0_87d/1'.

    It is annotated with five 25-mers, listed in ascending offset order
    (64 up to 68), each paired with its own abundance list.
    """
    offsets = (64, 65, 66, 67, 68)
    abunds = ([19, 0, 0], [18, 1, 0], [18, 1, 0], [18, 0, 0], [17, 0, 0])
    kmers = [
        KmerOfInterest(25, offset, abund)
        for offset, abund in zip(offsets, abunds)
    ]
    seq = (
        'TTACATTTATTCGTTTGTGCAGGCTGAGACCTCACTTCCAACTGTAATCCAAAAGCTTA'
        'GTTTTTTTTTTGTTTCCCAAAGTAAGGCTGAGTGAACAATA'
    )
    return Record(
        name='seq1_901428_901847_3:0:0_0:0:0_87d/1',
        sequence=seq,
        annotations=kmers,
    )
def picorecord1():
    """Return a record named 'seq1_901350_901788_1:0:0_0:0:0_21ca1/2'.

    It is annotated with five 25-mers, listed in ascending offset order
    (5 up to 9), each paired with its own abundance list.
    """
    offsets = (5, 6, 7, 8, 9)
    abunds = ([19, 0, 0], [18, 1, 0], [18, 1, 0], [18, 0, 0], [17, 0, 0])
    kmers = [
        KmerOfInterest(25, offset, abund)
        for offset, abund in zip(offsets, abunds)
    ]
    seq = (
        'GTTTTTTTTTTGTTTCCCAAAGTAAGGCTGAGTGAACAATATTTTCTCATAGTTTTGAC'
        'AAAAACAAAGGAATCCTTAGTTATTAAACTCGGGAGTTTGA'
    )
    return Record(
        name='seq1_901350_901788_1:0:0_0:0:0_21ca1/2',
        sequence=seq,
        annotations=kmers,
    )
def test_align_mates():
    """Check the intervals reported by `kevlar.call.align_mates`.

    A placeholder read ('bogusread', sequence 'NNNNN') is given the sequences
    from minitrio/novel-mates.fastq.gz as its mates. After indexing
    minitrio/refr.fa, the mates are aligned and the reported
    (seqid, start, end) triples are expected to all fall on 'seq1' at the 15
    listed coordinate pairs. The sorted coordinates are also echoed to stderr
    as a debugging aid.
    """
    mate_seqs = kevlar.open(data_file('minitrio/novel-mates.fastq.gz'), 'r')
    mates = [mate.sequence for mate in kevlar.parse_augmented_fastx(mate_seqs)]
    record = Record(name='bogusread', sequence='NNNNN', mates=mates)

    refrfile = data_file('minitrio/refr.fa')
    kevlar.reference.autoindex(refrfile)
    positions = list(kevlar.call.align_mates(record, refrfile))

    seqids = {seqid for seqid, _, _ in positions}
    coords = sorted((start, end) for _, start, end in positions)
    print('DEBUG', coords, file=sys.stderr)

    expected = [
        (45332, 45432),
        (45377, 45477),
        (45393, 45493),
        (45428, 45528),
        (45440, 45540),
        (45447, 45547),
        (46092, 46192),
        (46093, 46193),
        (46099, 46199),
        (46127, 46227),
        (46131, 46231),
        (46146, 46246),
        (46148, 46248),
        (48025, 48125),
        (48035, 48135),
    ]
    assert seqids == {'seq1'}
    assert coords == expected
def record1fix():
    """Return the 'read1' record, annotated with a single 5-mer at offset 14."""
    kmer = KmerOfInterest(5, 14, [15, 0, 0])
    return Record(
        name='read1',
        sequence='GCTGCACCGATGTACGCAAA',
        annotations=[kmer],
    )