def mutate_genome(infile, mutations):
    """Yield each record of `infile` as a `khmer.Read`, mutated if requested.

    Records are read with `khmer.ReadParser`. `mutations` is looked up by
    record name; when a record has an entry, its mutations are ordered by
    descending `pos` and passed to `mutate_sequence`, whose return value
    becomes the sequence of the yielded read. Records with no entry are
    yielded with their sequence untouched.
    """
    for entry in khmer.ReadParser(infile):
        newseq = entry.sequence
        if entry.name in mutations:
            # Highest position first -- presumably so that applying one
            # mutation cannot shift the coordinates of those still pending;
            # confirm against `mutate_sequence`.
            ordered = list(mutations[entry.name])
            ordered.sort(key=lambda mut: mut.pos, reverse=True)
            newseq = mutate_sequence(newseq, ordered)
        yield khmer.Read(name=entry.name, sequence=newseq)
def test_BrokenPairedReader_lowercase_khmer_Read():
    """Check `sequence` and `cleaned_seq` of mixed-case reads after pairing.

    Three reads are streamed through `broken_paired_reader`; the first two
    are expected back together and the third on its own (partner `None`).
    """
    # khmer.Read objects should carry a `cleaned_seq` attribute
    # automatically, so nothing cleans the reads explicitly here.
    reads = [
        khmer.Read(name='seq1/1', sequence='acgtn'),
        khmer.Read(name='seq1/2', sequence='AcGtN'),
        khmer.Read(name='seq1/2', sequence='aCgTn'),
    ]
    pairs = [
        (left, right)
        for _num, _is_pair, left, right in broken_paired_reader(reads)
    ]

    first, second = pairs[0]
    assert first.sequence == 'acgtn'
    assert first.cleaned_seq == 'ACGTA'
    assert second.sequence == 'AcGtN'
    assert second.cleaned_seq == 'ACGTA'

    third, partner = pairs[1]
    assert third.sequence == 'aCgTn'
    assert third.cleaned_seq == 'ACGTA'
    assert partner is None
def localize(contigstream, refrfile, ksize=31, delta=25):
    """
    Wrap the `kevlar localize` task as a generator.

    Input is an iterable containing contigs (assembled by `kevlar assemble`)
    stored as khmer or screed sequence records, the filename of the reference
    genome sequence, and the desired k-size. `delta` is forwarded unchanged
    to `extract_regions`.

    Every (seqid, pos) pair produced by `get_exact_matches` is collected in a
    `KmerMatchSet`; each region then returned by `extract_regions` is yielded
    as a `khmer.Read`.

    Raises `KevlarNoReferenceMatchesError` if no matches were collected.
    Because this is a generator, the error surfaces when iteration starts,
    not when `localize` is called.
    """
    seeds = KmerMatchSet(ksize)
    for match in get_exact_matches(contigstream, refrfile, ksize):
        seqid, pos = match
        seeds.add(seqid, pos)

    if len(seeds) == 0:
        raise KevlarNoReferenceMatchesError()

    # NOTE(review): the handle returned by `kevlar.open` is never explicitly
    # closed in this function -- confirm whether that is intentional.
    refrhandle = kevlar.open(refrfile, 'r')
    regions = extract_regions(refrhandle, seeds, delta=delta)
    for regionid, regionseq in regions:
        yield khmer.Read(name=regionid, sequence=regionseq)
def test_clean_input_reads():
    """`clean_input_reads` must raise AttributeError on a khmer.Read."""
    # Every attribute of a Read is read-only.
    reads = [khmer.Read(name='seq1/1', sequence='ACGT')]
    with pytest.raises(AttributeError):
        cleaned = clean_input_reads(reads)
        next(cleaned)