def check_kmer_overlap(f):
    """Check the offsets and shared ends of overlapping k-mers from ``f['chr2']``.

    Two properties of ``Fasta.as_kmers`` are asserted:

    * with k=10 and overlap=2, every window except the last holds 10
      characters and begins at ``index * (10 - 2)``;
    * with k=10 and overlap=4, the last 4 characters of each window equal
      the first 4 characters of the window that follows it.
    """
    chr2 = f['chr2']
    size = 10

    shared = 2
    windows = list(Fasta.as_kmers(chr2, size, overlap=shared))
    # The final window is left out of the length/offset checks.
    for index, window in enumerate(windows[:-1]):
        assert len(window[1]) == size
        assert window[0] == index * (size - shared)

    shared = 4
    seqs = [window[1] for window in Fasta.as_kmers(chr2, size, overlap=shared)]
    for left, right in zip(seqs[:-1], seqs[1:]):
        # Windows shorter than the overlap cannot be compared; skip them.
        if len(left) >= shared and len(right) >= shared:
            assert left[-shared:] == right[:shared]
def segments(self):
    ''' Generator for Segments

    Walks the sequences of ``self.fasta`` in the order given by the first
    element of each ``self.fasta.index`` value, cuts each one into pieces
    with ``Fasta.as_kmers(..., self.segment_size)`` and yields one
    ``Segment(start, end, kmer[1], name)`` per piece.

    If ``self.start_chromosome`` is set, sequences are skipped until that
    name is reached; every sequence from there on is processed.  Progress
    messages are printed when ``self.verbose`` is true.
    '''
    pending_chr = self.start_chromosome
    position = self.start_location
    ordered = sorted(self.fasta.index.items(), key=lambda entry: entry[1][0])

    for entry in ordered:
        name = entry[0]
        produced = 0
        if self.verbose:
            print("Reading chr %s" % name)

        # Skip forward if a starting chr was defined
        if pending_chr is not None and pending_chr != name:
            continue
        # Start chromosome reached (or none requested): stop skipping.
        pending_chr = None

        # NOTE(review): `position` keeps running across chromosomes and the
        # end coordinate is always position + segment_size, regardless of the
        # length of kmer[1] -- confirm this is the intended coordinate scheme.
        for kmer in Fasta.as_kmers(self.fasta[name], self.segment_size):
            stop = position + self.segment_size
            segment = Segment(position, stop, kmer[1], name)
            produced += 1
            if self.verbose and produced % 1000 == 0:
                print("Read %d segments" % produced)
            yield segment
            position = stop
def segments(self):
    ''' Generator for Segments

    Yields ``Segment(start, end, sequence, name)`` objects, one for every
    piece that ``Fasta.as_kmers`` produces with ``self.segment_size`` for
    each sequence in ``self.fasta``.  Sequences are visited in ascending
    order of the first element of their ``self.fasta.index`` value.

    When ``self.start_chromosome`` is not None, all sequences before the
    one with that name are skipped.  The first segment starts at
    ``self.start_location`` and each later segment starts where the
    previous one ended.  With ``self.verbose`` set, a line is printed per
    sequence and after every 1000 segments of a sequence.
    '''
    wanted = self.start_chromosome
    cursor = self.start_location
    index_items = sorted(self.fasta.index.items(),
                         key=lambda item: item[1][0])
    names = [item[0] for item in index_items]

    for name in names:
        count = 0
        if self.verbose:
            print("Reading chr %s" % name)

        # Skip forward if a starting chr was defined
        still_skipping = wanted is not None and wanted != name
        if still_skipping:
            continue
        wanted = None

        # NOTE(review): `cursor` is never reset between sequences and each
        # segment is assumed to span exactly segment_size positions, even if
        # the piece returned by as_kmers is shorter -- verify with callers.
        for kmer in Fasta.as_kmers(self.fasta[name], self.segment_size):
            upper = cursor + self.segment_size
            seg = Segment(cursor, upper, kmer[1], name)
            count += 1
            if self.verbose and count % 1000 == 0:
                print("Read %d segments" % count)
            yield seg
            cursor = upper
def check_kmers(f):
    """Check non-overlapping k-mers produced by ``Fasta.as_kmers``.

    For ``f['chr2']`` with k=10: the number of pieces equals
    ``len(seq) / 10``, the first piece is ``(0, seq[:10])``, the pieces
    concatenate back to the full sequence, and the last piece ends in 'T'.

    For ``f['chr3']`` with k=1: the third piece reports offset 2 and the
    pieces again concatenate back to the full sequence.
    """
    expected = str(f['chr2'])
    pairs = list(Fasta.as_kmers(f['chr2'], 10))
    assert len(pairs) == len(expected) / 10
    assert pairs[0] == (0, expected[:10])

    chunks = [pair[1] for pair in pairs]
    assert "".join(chunks) == expected
    assert chunks[-1][-1] == 'T'

    expected = str(f['chr3'])
    pairs = list(Fasta.as_kmers(f['chr3'], 1))
    assert pairs[2][0] == 2

    chunks = [pair[1] for pair in pairs]
    assert "".join(chunks) == expected
def with_kmers(f, names, k, overlap):
    """
    split the sequences in Fasta object `f` into pieces of length `k`
    with the given `overlap`. the results are written, in FASTA-like
    form, to the files whose paths are listed in `names`.

    Each piece is written as a ">header" line (the header text comes from
    `format_kmer(seqid, start0)`) followed by a line with the piece itself.
    Pieces are distributed round-robin over the output files: piece number
    `i` (counted across all sequences) goes to file `i % len(names)`.

    Every output file is closed before returning, including when an error
    interrupts the run, so buffered data is flushed and no handles leak.
    """
    fhs = []
    try:
        # Open one at a time so that a failure part-way through still lets
        # the `finally` clause close the files that were already opened.
        for name in names:
            fhs.append(open(name, 'wb'))

        i = 0
        for seqid in f.keys():
            seq = f[seqid]
            for (start0, subseq) in Fasta.as_kmers(seq, k, overlap=overlap):
                fh = fhs[i % len(fhs)]
                print >>fh, ">%s" % format_kmer(seqid, start0)
                print >>fh, subseq
                i += 1
    finally:
        for fh in fhs:
            fh.close()
def with_kmers(f, names, k, overlap):
    """
    split the sequences in Fasta object `f` into pieces of length `k`
    with the given `overlap`. the results are written to the files
    named in `names` (one output file is opened per name).

    For every piece, two lines are emitted: ">" followed by
    `format_kmer(seqid, start0)`, then the piece itself. Output files are
    used in rotation, so the i-th piece overall lands in file
    `i % len(names)`.

    All opened files are closed when the function exits, whether it
    finishes normally or raises, so their contents are flushed to disk.
    """
    fhs = []
    try:
        # Append as we go: if a later open() raises, the handles opened so
        # far are still reachable and get closed below.
        for name in names:
            fhs.append(open(name, 'wb'))

        i = 0
        for seqid in f.keys():
            seq = f[seqid]
            for (start0, subseq) in Fasta.as_kmers(seq, k, overlap=overlap):
                fh = fhs[i % len(fhs)]
                print >> fh, ">%s" % format_kmer(seqid, start0)
                print >> fh, subseq
                i += 1
    finally:
        for fh in fhs:
            fh.close()