def action(args):
    """Assign FASTQ reads to barcodes and write encoded reads, a map and stats.

    Attributes read from ``args``:

    * ``barcodes`` -- passed to ``get_barcodes``; the result is treated as
      ``(barcode_sequence, label)`` pairs.
    * ``fastq`` -- input path; opened with ``bz2.BZ2File`` when the name ends
      in ``.bz2``, otherwise with ``open``.
    * ``mapfile`` -- path of a bz2-compressed CSV receiving one row per
      reported read.
    * ``min_length`` / ``max_length`` -- inclusive read-length bounds; reads
      outside them are counted as ``fail_len`` and skipped.
    * ``search_window`` -- forwarded to ``find_barcode``.
    * ``limit`` -- processing stops when a further barcoded read is seen
      after this many matched reads have been written.
    * ``outfile`` -- open file receiving matched reads as FASTA records.
    * ``unmatched`` -- optional open file receiving reads with no barcode.
    * ``stats`` -- open file receiving per-label counts and totals as CSV.
    """
    barcodes = get_barcodes(args.barcodes)
    bc_seqs, _bc_labels = zip(*barcodes)
    bc_dict = dict(barcodes)
    # A single alternation pattern over all barcode sequences.
    bc_rexp = re.compile('|'.join(bc_seqs))

    opener = bz2.BZ2File if args.fastq.endswith('.bz2') else open
    count = Counter()

    with opener(args.fastq) as f, bz2.BZ2File(args.mapfile, 'w') as mapout:
        writer = csv.writer(mapout)
        writer.writerow([
            'name', 'label', 'barcode', 'ratio', 'bc_start', 'bc_stop', 'rle'
        ])

        for seq in SeqIO.parse(f, 'fastq'):
            if not args.min_length <= len(seq) <= args.max_length:
                count['fail_len'] += 1
                continue

            barcode, ratio, bc_start, bc_stop = find_barcode(
                seq, bc_rexp, bc_seqs,
                exact=False, bc_len=10, search_window=args.search_window)
            sample = bc_dict.get(barcode)
            name = seq.id.replace(':', '_')

            if barcode:
                if count['matched'] >= args.limit:
                    break
                count['matched'] += 1
                count[barcode] += 1
                # Use a separate name so the loop variable is not rebound.
                encoded, counts = homoencode(seq.seq)
                args.outfile.write('>%s %s\n%s\n' % (name, sample, encoded))
                writer.writerow([
                    name, sample, barcode, ratio, bc_start, bc_stop,
                    to_ascii(counts)
                ])
            elif args.unmatched:
                # NOTE(review): 'no_match' is only counted when an unmatched
                # output file was supplied, so the stats row reports 0
                # otherwise -- confirm this is intended.
                count['no_match'] += 1
                args.unmatched.write('>%s\n%s\n' % (name, seq.seq))
                # Keep values aligned with the header columns; there is no
                # encoding for an unmatched read, so 'rle' is left empty.
                writer.writerow(
                    [name, sample, barcode, ratio, bc_start, bc_stop, None])

    stats = csv.writer(args.stats)
    # Per-barcode counts, reported by label; the bookkeeping keys
    # ('matched', 'no_match', 'fail_len') are not barcodes and are
    # written explicitly below.
    rows = sorted((bc_dict[k], v) for k, v in count.items() if k in bc_dict)
    stats.writerows(rows)
    stats.writerow(['total_matched', count['matched']])
    stats.writerow(['no_match', count['no_match']])
    stats.writerow(['fail_len', count['fail_len']])
def action(args):
    """Split FASTQ reads by barcode, writing encoded reads, a map and stats.

    Reads come from ``args.fastq`` (opened via ``bz2.BZ2File`` if the name
    ends in ``.bz2``, else ``open``).  Reads whose length falls outside
    ``args.min_length``..``args.max_length`` (inclusive) are tallied as
    ``fail_len`` and skipped.  For the rest, ``find_barcode`` is called with
    ``args.search_window``:

    * barcoded reads are passed through ``homoencode``, written to
      ``args.outfile`` as FASTA and recorded in the bz2-compressed CSV
      ``args.mapfile``; the loop ends when another barcoded read arrives
      after ``args.limit`` matches;
    * reads without a barcode are written to ``args.unmatched`` and recorded
      in the map file, but only when ``args.unmatched`` is truthy.

    Per-label counts and the totals are written to ``args.stats`` as CSV.
    Barcode/label pairs come from ``get_barcodes(args.barcodes)``.
    """
    barcodes = get_barcodes(args.barcodes)
    bc_seqs, _bc_labels = zip(*barcodes)
    bc_dict = dict(barcodes)
    # One regular expression matching any of the barcode sequences.
    bc_rexp = re.compile('|'.join(bc_seqs))

    opener = bz2.BZ2File if args.fastq.endswith('.bz2') else open
    count = Counter()
    header = ['name', 'label', 'barcode', 'ratio', 'bc_start', 'bc_stop', 'rle']

    with opener(args.fastq) as f, bz2.BZ2File(args.mapfile, 'w') as mapout:
        writer = csv.writer(mapout)
        writer.writerow(header)

        for seq in SeqIO.parse(f, 'fastq'):
            if not args.min_length <= len(seq) <= args.max_length:
                count['fail_len'] += 1
                continue

            barcode, ratio, bc_start, bc_stop = find_barcode(
                seq, bc_rexp, bc_seqs,
                exact=False, bc_len=10, search_window=args.search_window)
            sample = bc_dict.get(barcode)
            name = seq.id.replace(':', '_')

            if barcode:
                if count['matched'] >= args.limit:
                    break
                count['matched'] += 1
                count[barcode] += 1
                # Bind the encoded string to its own name rather than
                # overwriting the record held by the loop variable.
                encoded, counts = homoencode(seq.seq)
                args.outfile.write('>%s %s\n%s\n' % (name, sample, encoded))
                writer.writerow([name, sample, barcode, ratio,
                                 bc_start, bc_stop, to_ascii(counts)])
            elif args.unmatched:
                # NOTE(review): unmatched reads are only counted when an
                # unmatched output file is given -- confirm this is intended.
                count['no_match'] += 1
                args.unmatched.write('>%s\n%s\n' % (name, seq.seq))
                # Row order follows ``header``; the trailing None leaves the
                # 'rle' column empty because nothing was encoded.
                writer.writerow([name, sample, barcode, ratio,
                                 bc_start, bc_stop, None])

    stats = csv.writer(args.stats)
    # Only keys that are barcodes are reported per label here; the summary
    # counters are emitted as separate rows afterwards.
    rows = sorted((bc_dict[k], v) for k, v in count.items() if k in bc_dict)
    stats.writerows(rows)
    stats.writerow(['total_matched', count['matched']])
    stats.writerow(['no_match', count['no_match']])
    stats.writerow(['fail_len', count['fail_len']])
def seq_and_homoencode(seq):
    """Pair a sequence record with the ``homoencode`` result for its sequence.

    ``seq`` must expose a ``.seq`` attribute, which is handed to
    ``homoencode``.  The input object itself is returned unchanged as the
    first element of the 2-tuple, followed by whatever ``homoencode``
    returned.
    """
    encoded = homoencode(seq.seq)
    return (seq, encoded)
def test02(self):
    # Mixed input: runs of length 1, 2 ('TT') and 3 ('AAA') must each collapse
    # to a single character, with the run lengths reported in order.
    rle, counts = sequtils.homoencode('GCTTCAAACATA')
    # assertEqual is used because the assertEquals alias is deprecated and
    # was removed in Python 3.12.
    self.assertEqual(rle, 'GCTCACATA')
    self.assertEqual(counts, [1, 1, 2, 1, 3, 1, 1, 1, 1])
def test01(self):
    # A single homopolymer run collapses to one character with one count.
    rle, counts = sequtils.homoencode('AAA')
    # assertEqual is used because the assertEquals alias is deprecated and
    # was removed in Python 3.12.
    self.assertEqual(rle, 'A')
    self.assertEqual(counts, [3])
def test01(self):
    # Round trip: decoding the encoded string with its counts must
    # reproduce the original sequence exactly.
    seq = 'TCTGGACCGTGTCTTTCAGTTCCAAAGTGTGACTGATCCATCCTCTCAGACC'
    e, c = sequtils.homoencode(seq)
    # assertEqual is used because the assertEquals alias is deprecated and
    # was removed in Python 3.12.
    self.assertEqual(seq, sequtils.homodecode(e, c))