def extract_unaligned_lomem(b6_fname, in_fname, out_fname):
    """Write the reads of ``in_fname`` that never appear in ``b6_fname``.

    Extracts unaligned reads using a small memory footprint -- probably the
    smallest possible that doesn't require reads to be sorted.  The hit
    table and the read file are walked in lockstep, and an ID is only kept
    in memory while it has been seen in one input but not yet in the other.

    b6_fname  -- tab-separated hit table; the text before the first tab of
                 each line is compared against read IDs
    in_fname  -- reads file, parsed with fast_fastq()
    out_fname -- receives the raw records of every read whose ID was not
                 found in the hit table
    """
    pending_hits = set()    # hit IDs whose read has not been seen yet
    pending_reads = {}      # read ID -> raw record, no hit seen so far

    with open(b6_fname, "r") as b6_fp, \
            open(in_fname, "r") as in_fp, \
            open(out_fname, "w") as out_fp:
        # The two inputs rarely have the same length; None marks the side
        # that is already exhausted.  An explicit check is used instead of
        # catching AttributeError so real attribute errors are not hidden.
        for b6_l, in_r in izip_longest(b6_fp, fast_fastq(in_fp),
                                       fillvalue=None):
            if b6_l is not None:
                hit_id = b6_l.split("\t", 1)[0]
                if hit_id in pending_reads:
                    # the read was seen earlier: it is aligned, forget it
                    del pending_reads[hit_id]
                else:
                    pending_hits.add(hit_id)

            if in_r is not None:
                read_id = in_r.id
                if read_id in pending_hits:
                    # a hit was seen earlier: the read is aligned
                    pending_hits.remove(read_id)
                else:
                    pending_reads[read_id] = in_r.raw()

        # dump all the reads that remain; writelines avoids building one
        # huge intermediate string
        out_fp.writelines(pending_reads.values())
def split_file(input_fp, fps, dests, formats, read_index, stats):
    """Distribute the reads of ``input_fp`` over per-destination files.

    The sample of a read is the part of its ID before the first "_".

    input_fp   -- reads source, parsed with fast_fastq()
    fps        -- destination -> sequence of writable files; the file at
                  ``read_index`` is used
    dests      -- sample -> destination; reads of unknown samples are
                  written to the "notfound" destination
    formats    -- destination -> (fastq_or_fasta, min_read_length,
                  max_read_length, regexp); a length of -1 disables that
                  limit, a false regexp disables the pattern check
    read_index -- index into the per-destination file sequence
    stats      -- destination -> counters ("matched", "short", "long",
                  "regexp"), updated in place
    """
    for read in fast_fastq(input_fp):
        sample = read.id.split("_")[0]
        try:
            read_dest = dests[sample]
        except KeyError:
            # unknown sample: keep the read, but in the catch-all file
            stats["notfound"]["matched"] += 1
            fps["notfound"][read_index].write(read.raw())
            continue

        # fastq_or_fasta min_read_length max_read_length regexp
        read_format = formats[read_dest]

        # work out why (if at all) the read has to be dropped
        if read_format[1] != -1 and len(read.sequence) < read_format[1]:
            reject = "short"
        elif read_format[2] != -1 and len(read.sequence) > read_format[2]:
            reject = "long"
        elif read_format[3] and not read_format[3].match(read.sequence):
            reject = "regexp"
        else:
            reject = None

        if reject is not None:
            stats[read_dest][reject] += 1
            continue

        # convert to fasta or keep the record as it is
        raw_read = (">%s\n%s\n" % (read.id, read.sequence)
                    if read_format[0] == "fasta" else read.raw())

        # write it out
        stats[read_dest]["matched"] += 1
        fps[read_dest][read_index].write(raw_read)
def pool_unzip(fname_list, sample_list, status_msg):
    """Pool several gzip'd FASTQ files into one uncompressed temp file.

    Accepts a list of gzip'd filenames and combines them all into a single
    unzipped FASTQ, renaming reads to "<sample>_<n>" where n counts the
    reads of that input file starting at 1.

    fname_list  -- gzip'd FASTQ filenames
    sample_list -- sample names, paired with fname_list by position
    status_msg  -- label used in the progress line written to stderr

    Returns the still-open NamedTemporaryFile (created with delete=False, so
    the caller is responsible for removing ``.name``).  The file is flushed
    before returning so it can safely be re-opened by name.
    """
    pooled_file = tempfile.NamedTemporaryFile(delete=False)

    for fname, sample in zip(fname_list, sample_list):
        sys.stderr.write("\rPooling %s -> %s: Sample %s" %
                         (status_msg, pooled_file.name, sample))
        # stream the archive line by line instead of loading it whole, and
        # make sure the handle is released even if parsing fails
        gz_fp = gzip.GzipFile(fname)
        try:
            for read_num, read in enumerate(fast_fastq(gz_fp), 1):
                read.id = "%s_%s" % (sample, read_num)
                pooled_file.write(read.raw())
        finally:
            gz_fp.close()

    sys.stderr.write("\n")
    # callers use pooled_file.name to re-open the data; without a flush the
    # tail of the pooled reads may still sit in the write buffer
    pooled_file.flush()
    return pooled_file
def main():
    """Quality-filter and demultiplex the reads of one sample.

    Uses the module-level ``options`` and ``args`` (presumably populated by
    parse_options()).  Input reads come from a zip archive
    (``options.zip_fname`` / ``options.sample_name``) or from an explicit
    forward/reverse FASTQ pair.  Two modes exist:

      * merged (``options.merge`` or ``options.merged_fname``): reads are
        merged with pear, or taken from an existing merged FASTQ, then
        filtered, demultiplexed and written to
        merged_reads.assigned.fastq / merged_reads.unassigned.fastq;
        read_lengths.log is written as well.
      * paired: forward and reverse reads are filtered and demultiplexed in
        lockstep into fwd_reads.* / rev_reads.* files.

    Both modes write barcode_to_count.log and a summary on stderr.

    Raises ValueError if ``options.sample_name`` is not in the zip archive.
    """
    parse_options(sys.argv[1:])

    # load barcodes
    fwd_bcs, rev_bcs = load_barcodes(args[0])
    if options.use_plate:
        plate = load_plate(args[1])
        barcode_to_sample = map_bc_to_sample(plate, fwd_bcs, rev_bcs)

    if options.zip_fname:
        # open zipfile
        miseq_zip = zipfile.ZipFile(options.zip_fname)
        # find pairs; the key is the file's base name without its last two
        # "_"-separated fields
        pairs = dict([("_".join(x.split("/")[-1].split(".")[0].split("_")[:-2]), (x, y))
                      for x, y in find_pairs(miseq_zip)])
        if options.sample_name not in pairs:
            raise ValueError("Could not find %s in %s!" %
                             (options.sample_name, options.zip_fname))
        # extract read files
        sys.stderr.write("Extracting reads from zipfile...\n")
        fwd = fetch_unzip(miseq_zip, pairs[options.sample_name][0])
        rev = fetch_unzip(miseq_zip, pairs[options.sample_name][1])
    elif options.fwd_fname and options.rev_fname:
        fwd = open(options.fwd_fname, "r")
        rev = open(options.rev_fname, "r")
        # only the .name of these handles is used below, so do not keep
        # them open for the whole run
        fwd.close()
        rev.close()

    barcode_to_count = {}

    if options.merge or options.merged_fname:
        with open(os.path.join(options.output_dir, "merged_reads.assigned.fastq"), "w") as assigned, \
                open(os.path.join(options.output_dir, "merged_reads.unassigned.fastq"), "w") as unassigned:
            read_length_bins = {}
            total_reads = 0.0
            quality_reads = 0.0

            if options.merge:
                # run pear to merge reads
                sys.stderr.write("Merging reads...\n")
                merged, stats, raw_pear_log = pear(fwd.name, rev.name,
                                                   options.mem_size,
                                                   options.num_threads)
                with open(os.path.join(options.output_dir, "pear.log"), "w") as pear_log:
                    pear_log.write(raw_pear_log)
                if stats["assembled_reads"] < options.min_merged_perc:
                    sys.stderr.write(" Warning: only %.02f%% of reads assembled\n" %
                                     stats["assembled_reads"])
                merged_fp = open("%s.assembled.fastq" % merged.name, "r")
            elif options.merged_fname:
                merged_fp = open(options.merged_fname, "r")

            # read through fastq
            sys.stderr.write("Filtering and demultiplexing reads...\n")
            try:
                for merged_read in fast_fastq(merged_fp):
                    total_reads += 1

                    # check that read passes quality filter
                    if not qual_filter(merged_read, options.min_qual,
                                       options.phred_offset, options.max_errors):
                        continue

                    # keep track of stats
                    quality_reads += 1
                    read_length = len(merged_read.sequence)
                    read_length_bins[read_length] = read_length_bins.get(read_length, 0) + 1

                    # demultiplex
                    dm_out = demultiplex(merged_read, fwd_bcs.values(), None,
                                         rev_bcs.values(), options.max_mismatch)
                    if dm_out == False:
                        # strip pair info from read and write to unassigned file
                        merged_read.id = merged_read.id.split(" ")[0]
                        unassigned.write(merged_read.raw())
                    else:
                        # rename read to barcodes used and write to assigned file
                        trimmed_read, _, f_bc, r_bc = dm_out
                        bc_name = "%s_%s" % (f_bc, r_bc)
                        # NOTE(review): counted before the plate-layout check,
                        # so reads rejected below still count as assigned in
                        # the summary and in barcode_to_count.log
                        barcode_to_count[bc_name] = barcode_to_count.get(bc_name, 0) + 1

                        # we want to rename to sample names, and a barcode was
                        # found to match something in barcodes.txt, but that
                        # particular barcode is not in use in our plate layout.
                        # in this case, ignore this read
                        if options.use_plate and bc_name not in barcode_to_sample:
                            # strip pair info from read and write to unassigned file
                            merged_read.id = merged_read.id.split(" ")[0]
                            unassigned.write(merged_read.raw())
                            continue

                        if options.use_plate:
                            trimmed_read.id = "%s_%s" % (barcode_to_sample[bc_name],
                                                         barcode_to_count[bc_name])
                        else:
                            trimmed_read.id = "%s_%s" % (bc_name,
                                                         barcode_to_count[bc_name])
                        assigned.write(trimmed_read.raw())
            finally:
                # release the input before its file is unlinked further down
                merged_fp.close()

        if total_reads > 0:
            if quality_reads / total_reads < options.min_qual_perc:
                sys.stderr.write(" Warning: only %.02f%% of reads passed quality filter\n" %
                                 (quality_reads * 100 / total_reads))

            assigned_reads = sum(barcode_to_count.values())
            if read_length_bins:
                min_length = min(read_length_bins)
                max_length = max(read_length_bins)
                # weight every length by the number of reads that have it;
                # quality_reads equals the sum of all bin counts
                mean_length = sum(length * count for length, count
                                  in read_length_bins.items()) / quality_reads
            else:
                # no read passed the quality filter
                min_length = mean_length = max_length = 0
            if barcode_to_count:
                reads_per_barcode = mean(barcode_to_count.values())
            else:
                reads_per_barcode = 0

            sys.stderr.write("\nSummary")
            sys.stderr.write("\n Total reads: %d" % total_reads)
            sys.stderr.write("\n Quality reads: %d" % quality_reads)
            sys.stderr.write("\n Min read length: %d" % min_length)
            sys.stderr.write("\n Mean read length: %d" % mean_length)
            sys.stderr.write("\n Max read length: %d\n" % max_length)
            sys.stderr.write("\n Assigned reads: %d" % assigned_reads)
            sys.stderr.write("\n Unassigned reads: %d" % (quality_reads - assigned_reads))
            sys.stderr.write("\n Avg reads/barcode: %d\n" % reads_per_barcode)

        with open(os.path.join(options.output_dir, "read_lengths.log"), "w") as fp:
            for read_length, count in sorted(read_length_bins.items(),
                                             key=lambda x: x[0], reverse=True):
                fp.write("%s\t%s\n" % (read_length, count))

        if options.merge:
            # remove temporary files
            os.unlink(merged.name)
            os.unlink("%s.assembled.fastq" % merged.name)
            os.unlink("%s.discarded.fastq" % merged.name)
            os.unlink("%s.unassembled.forward.fastq" % merged.name)
            os.unlink("%s.unassembled.reverse.fastq" % merged.name)
    else:
        with open(os.path.join(options.output_dir, "fwd_reads.assigned.fastq"), "w") as f_assigned, \
                open(os.path.join(options.output_dir, "rev_reads.assigned.fastq"), "w") as r_assigned, \
                open(os.path.join(options.output_dir, "fwd_reads.unassigned.fastq"), "w") as f_unassigned, \
                open(os.path.join(options.output_dir, "rev_reads.unassigned.fastq"), "w") as r_unassigned, \
                open(fwd.name, "r") as fwd_fp, \
                open(rev.name, "r") as rev_fp:
            total_reads = 0.0
            quality_reads = 0.0

            # read through forward and reverse fastq simultaneously
            sys.stderr.write("Filtering and demultiplexing reads...\n")
            for f_read, r_read in izip(fast_fastq(fwd_fp), fast_fastq(rev_fp)):
                total_reads += 1

                # check that both reads pass the quality filter
                if qual_filter(f_read, options.min_qual, options.phred_offset, options.max_errors) == False or \
                        qual_filter(r_read, options.min_qual, options.phred_offset, options.max_errors) == False:
                    continue
                quality_reads += 1

                # demultiplex
                dm_out = demultiplex(f_read, fwd_bcs.values(), r_read,
                                     rev_bcs.values(), options.max_mismatch)
                if dm_out == False:
                    # strip pair info from read and write to unassigned file
                    f_read.id = f_read.id.split(" ")[0]
                    f_unassigned.write(f_read.raw())
                    r_read.id = r_read.id.split(" ")[0]
                    r_unassigned.write(r_read.raw())
                else:
                    # rename reads to barcodes used and write to assigned file
                    trimmed_f_read, trimmed_r_read, f_bc, r_bc = dm_out
                    bc_name = "%s_%s" % (f_bc, r_bc)
                    barcode_to_count[bc_name] = barcode_to_count.get(bc_name, 0) + 1

                    # we want to rename to sample names, and a barcode was
                    # found to match something in barcodes.txt, but that
                    # particular barcode is not in use in our plate layout.
                    # in this case, ignore this read
                    if options.use_plate and bc_name not in barcode_to_sample:
                        # strip pair info from read and write to unassigned file
                        f_read.id = f_read.id.split(" ")[0]
                        f_unassigned.write(f_read.raw())
                        r_read.id = r_read.id.split(" ")[0]
                        r_unassigned.write(r_read.raw())
                        continue

                    if options.use_plate:
                        trimmed_f_read.id = "%s_%s" % (barcode_to_sample[bc_name],
                                                       barcode_to_count[bc_name])
                    else:
                        trimmed_f_read.id = "%s_%s" % (bc_name,
                                                       barcode_to_count[bc_name])
                    f_assigned.write(trimmed_f_read.raw())

                    if options.use_plate:
                        trimmed_r_read.id = "%s_%s" % (barcode_to_sample[bc_name],
                                                       barcode_to_count[bc_name])
                    else:
                        trimmed_r_read.id = "%s_%s" % (bc_name,
                                                       barcode_to_count[bc_name])
                    r_assigned.write(trimmed_r_read.raw())

        if total_reads > 0:
            if quality_reads / total_reads < options.min_qual_perc:
                sys.stderr.write(" Warning: only %.02f%% of reads passed quality filter\n" %
                                 (quality_reads * 100 / total_reads))

            assigned_pairs = sum(barcode_to_count.values())
            if barcode_to_count:
                pairs_per_barcode = int(mean(barcode_to_count.values()))
            else:
                pairs_per_barcode = 0

            sys.stderr.write("\nSummary")
            sys.stderr.write("\n Total pairs: %d" % total_reads)
            sys.stderr.write("\n Quality pairs: %d" % quality_reads)
            sys.stderr.write("\n Assigned pairs: %d" % assigned_pairs)
            sys.stderr.write("\n Unassigned pairs: %d" % (quality_reads - assigned_pairs))
            sys.stderr.write("\n Avg pairs/barcode: %d\n" % pairs_per_barcode)

    # most frequent barcode first
    sorted_barcode_to_count = sorted(barcode_to_count.items(),
                                     key=lambda x: x[1], reverse=True)
    with open(os.path.join(options.output_dir, "barcode_to_count.log"), "w") as fp:
        if options.use_plate:
            for barcode, count in sorted_barcode_to_count:
                try:
                    sample_name = barcode_to_sample[barcode]
                except KeyError:
                    sample_name = "BARCODE_NOT_IN_PLATE_LAYOUT"
                fp.write("%s\t%s\t%s\n" % (barcode, sample_name, count))
        else:
            for barcode, count in sorted_barcode_to_count:
                fp.write("%s\t%s\n" % (barcode, count))

    # remove temporary files
    if options.zip_fname:
        os.unlink(fwd.name)
        os.unlink(rev.name)
def main():
    """Quality-filter and demultiplex (or pool) the reads of one run.

    Uses the module-level ``options`` and ``args`` (presumably populated by
    parse_options()).  Input reads come from one of:

      * a zip archive (``options.zip_fname`` and ``options.sample_name``),
      * an explicit forward/reverse FASTQ pair, or
      * a directory of per-sample gzip'd files (``options.dir_name`` and
        ``options.multi_plates``), pooled with pool_unzip().  In this mode
        reads are already named "<sample>_<n>", no barcodes are loaded and
        no demultiplexing takes place.

    Two processing modes exist:

      * merged (``options.merge`` or ``options.merged_fname``): reads are
        merged with pear, or taken from an existing merged FASTQ, filtered,
        optionally primer-trimmed (``options.trim_nbj``) and written to
        merged_reads.assigned.fastq / merged_reads.unassigned.fastq;
        read_lengths.log is written as well.
      * paired: forward and reverse reads are filtered in lockstep into
        fwd_reads.* / rev_reads.* files.

    Both modes write barcode_to_count.log and a summary on stderr.

    Raises ValueError if ``options.sample_name`` is not in the zip archive.
    """
    parse_options(sys.argv[1:])

    # load barcodes
    if options.dir_name == False:
        fwd_bcs, rev_bcs = load_barcodes(args[0])
    if options.dir_name == False and options.use_plate:
        plate = load_plate(args[1])
        barcode_to_sample = map_bc_to_sample(plate, fwd_bcs, rev_bcs)

    # True only when fwd/rev are temporary files created here; user-supplied
    # inputs must never be unlinked during cleanup
    made_temp_inputs = False

    if options.zip_fname and options.sample_name:
        # open zipfile
        miseq_zip = zipfile.ZipFile(options.zip_fname)
        # find pairs; the key is the file's base name without its last two
        # "_"-separated fields
        pairs = dict([
            ("_".join(x.split("/")[-1].split(".")[0].split("_")[:-2]), (x, y))
            for x, y in find_pairs(miseq_zip)
        ])
        if options.sample_name not in pairs:
            raise ValueError("Could not find %s in %s!" %
                             (options.sample_name, options.zip_fname))
        # extract read files
        sys.stderr.write("Extracting reads from zipfile...\n")
        fwd = fetch_unzip(miseq_zip, pairs[options.sample_name][0])
        rev = fetch_unzip(miseq_zip, pairs[options.sample_name][1])
        made_temp_inputs = True
    elif options.fwd_fname and options.rev_fname:
        fwd = open(options.fwd_fname, "r")
        rev = open(options.rev_fname, "r")
        # only the .name of these handles is used below, so do not keep
        # them open for the whole run
        fwd.close()
        rev.close()
    elif options.dir_name and options.multi_plates:
        # find pairs in directory
        fwd_files, rev_files, sample_list = find_subdirs(
            options.dir_name, options.multi_plates)
        # combine all desired files into new fwd and rev files
        fwd = pool_unzip(fwd_files, sample_list, "Read 1")
        rev = pool_unzip(rev_files, sample_list, "Read 2")
        made_temp_inputs = True

    barcode_to_count = {}

    if options.merge or options.merged_fname:
        with open(os.path.join(options.output_dir, "merged_reads.assigned.fastq"), "w") as assigned, \
                open(os.path.join(options.output_dir, "merged_reads.unassigned.fastq"), "w") as unassigned:
            read_length_bins = {}
            total_reads = 0.0
            quality_reads = 0.0
            nbj_trim_fail = 0.0

            if options.merge:
                # run pear to merge reads
                sys.stderr.write("Merging reads...\n")
                merged, stats, raw_pear_log = pear(fwd.name, rev.name,
                                                   options.mem_size,
                                                   options.num_threads)
                with open(os.path.join(options.output_dir, "pear.log"), "w") as pear_log:
                    pear_log.write(raw_pear_log)
                if stats["assembled_reads"] < options.min_merged_perc:
                    sys.stderr.write(
                        " Warning: only %.02f%% of reads assembled\n" %
                        stats["assembled_reads"])
                merged_fp = open("%s.assembled.fastq" % merged.name, "r")
            elif options.merged_fname:
                merged_fp = open(options.merged_fname, "r")

            # read through fastq
            sys.stderr.write("Filtering and demultiplexing reads...\n")
            try:
                for merged_read in fast_fastq(merged_fp):
                    total_reads += 1

                    # check that read passes quality filter
                    if not qual_filter(merged_read, options.min_qual,
                                       options.phred_offset, options.max_errors):
                        continue

                    # keep track of stats
                    quality_reads += 1
                    read_length = len(merged_read.sequence)
                    read_length_bins[read_length] = read_length_bins.get(read_length, 0) + 1

                    if options.dir_name:
                        sample = merged_read.id.split("_")[0]
                        # renumber reads so that they refer to new merged reads
                        barcode_to_count[sample] = barcode_to_count.get(sample, 0) + 1
                        merged_read.id = "%s_%s" % (sample, barcode_to_count[sample])

                        if options.trim_nbj:
                            nbj_read = trim_primers(merged_read,
                                                    NBJ_V3V4_FWD_PRIMER,
                                                    NBJ_V3V4_REV_PRIMER,
                                                    options.max_mismatch)
                            if nbj_read:
                                assigned.write(nbj_read.raw())
                            else:
                                # keep the untrimmed read, tagged as a failure
                                nbj_trim_fail += 1
                                merged_read.id = "%s-NBJTRIMFAIL" % merged_read.id
                                unassigned.write(merged_read.raw())
                        else:
                            assigned.write(merged_read.raw())
                    else:
                        # demultiplex
                        dm_out = demultiplex(merged_read, fwd_bcs.values(), None,
                                             rev_bcs.values(), options.max_mismatch)
                        if dm_out == False:
                            # strip pair info from read and write to unassigned file
                            merged_read.id = merged_read.id.split(" ")[0]
                            unassigned.write(merged_read.raw())
                        else:
                            # rename read to barcodes used and write to assigned file
                            trimmed_read, _, f_bc, r_bc = dm_out
                            bc_name = "%s_%s" % (f_bc, r_bc)
                            # NOTE(review): counted before the plate-layout and
                            # primer-trim checks, so reads rejected below still
                            # count as assigned in the summary
                            barcode_to_count[bc_name] = barcode_to_count.get(bc_name, 0) + 1

                            # we want to rename to sample names, and a barcode
                            # was found to match something in barcodes.txt, but
                            # that particular barcode is not in use in our
                            # plate layout. in this case, ignore this read
                            if options.use_plate and bc_name not in barcode_to_sample:
                                # strip pair info from read and write to unassigned file
                                merged_read.id = merged_read.id.split(" ")[0]
                                unassigned.write(merged_read.raw())
                                continue

                            if options.use_plate:
                                trimmed_read.id = "%s_%s" % (
                                    barcode_to_sample[bc_name],
                                    barcode_to_count[bc_name])
                            else:
                                trimmed_read.id = "%s_%s" % (
                                    bc_name, barcode_to_count[bc_name])

                            if options.trim_nbj:
                                nbj_read = trim_primers(trimmed_read,
                                                        NBJ_V3V4_FWD_PRIMER,
                                                        NBJ_V3V4_REV_PRIMER,
                                                        options.max_mismatch)
                                if nbj_read:
                                    assigned.write(nbj_read.raw())
                                else:
                                    # keep the untrimmed read, tagged as a failure
                                    nbj_trim_fail += 1
                                    trimmed_read.id = "%s-NBJTRIMFAIL" % trimmed_read.id
                                    unassigned.write(trimmed_read.raw())
                            else:
                                assigned.write(trimmed_read.raw())
            finally:
                # release the input before its file is unlinked further down
                merged_fp.close()

        if total_reads > 0:
            if quality_reads / total_reads < options.min_qual_perc:
                sys.stderr.write(
                    " Warning: only %.02f%% of reads passed quality filter\n" %
                    (quality_reads * 100 / total_reads))

            assigned_reads = sum(barcode_to_count.values())
            if read_length_bins:
                min_length = min(read_length_bins)
                max_length = max(read_length_bins)
                # weight every length by the number of reads that have it;
                # quality_reads equals the sum of all bin counts
                mean_length = sum(length * count for length, count
                                  in read_length_bins.items()) / quality_reads
            else:
                # no read passed the quality filter
                min_length = mean_length = max_length = 0
            if barcode_to_count:
                reads_per_barcode = mean(barcode_to_count.values())
            else:
                reads_per_barcode = 0

            sys.stderr.write("\nSummary")
            sys.stderr.write("\n Total reads: %d" % total_reads)
            sys.stderr.write("\n Quality reads: %d" % quality_reads)
            if options.trim_nbj:
                sys.stderr.write("\n Failed NBJ Trim: %d" % nbj_trim_fail)
            sys.stderr.write("\n Min read length: %d" % min_length)
            sys.stderr.write("\n Mean read length: %d" % mean_length)
            sys.stderr.write("\n Max read length: %d\n" % max_length)
            sys.stderr.write("\n Assigned reads: %d" % assigned_reads)
            sys.stderr.write("\n Unassigned reads: %d" % (quality_reads - assigned_reads))
            sys.stderr.write("\n Avg reads/barcode: %d\n" % reads_per_barcode)

        with open(os.path.join(options.output_dir, "read_lengths.log"), "w") as fp:
            for read_length, count in sorted(read_length_bins.items(),
                                             key=lambda x: x[0], reverse=True):
                fp.write("%s\t%s\n" % (read_length, count))

        if options.merge:
            # remove temporary files
            os.unlink(merged.name)
            os.unlink("%s.assembled.fastq" % merged.name)
            os.unlink("%s.discarded.fastq" % merged.name)
            os.unlink("%s.unassembled.forward.fastq" % merged.name)
            os.unlink("%s.unassembled.reverse.fastq" % merged.name)
    else:
        with open(os.path.join(options.output_dir, "fwd_reads.assigned.fastq"), "w") as f_assigned, \
                open(os.path.join(options.output_dir, "rev_reads.assigned.fastq"), "w") as r_assigned, \
                open(os.path.join(options.output_dir, "fwd_reads.unassigned.fastq"), "w") as f_unassigned, \
                open(os.path.join(options.output_dir, "rev_reads.unassigned.fastq"), "w") as r_unassigned, \
                open(fwd.name, "r") as fwd_fp, \
                open(rev.name, "r") as rev_fp:
            total_reads = 0.0
            quality_reads = 0.0

            # read through forward and reverse fastq simultaneously
            sys.stderr.write("Filtering and demultiplexing reads...\n")
            for f_read, r_read in izip(fast_fastq(fwd_fp), fast_fastq(rev_fp)):
                total_reads += 1

                # check that both reads pass the quality filter
                if qual_filter(f_read, options.min_qual, options.phred_offset, options.max_errors) == False or \
                        qual_filter(r_read, options.min_qual, options.phred_offset, options.max_errors) == False:
                    continue
                quality_reads += 1

                if options.dir_name:
                    # count reads
                    sample1 = f_read.id.split("_")[0]
                    sample2 = r_read.id.split("_")[0]
                    # both mates must come from the same pooled sample
                    assert sample1 == sample2
                    barcode_to_count[sample1] = barcode_to_count.get(sample1, 0) + 1

                    # just write the filtered reads to their final destination
                    f_assigned.write(f_read.raw())
                    r_assigned.write(r_read.raw())
                else:
                    # demultiplex
                    dm_out = demultiplex(f_read, fwd_bcs.values(), r_read,
                                         rev_bcs.values(), options.max_mismatch)
                    if dm_out == False:
                        # strip pair info from read and write to unassigned file
                        f_read.id = f_read.id.split(" ")[0]
                        f_unassigned.write(f_read.raw())
                        r_read.id = r_read.id.split(" ")[0]
                        r_unassigned.write(r_read.raw())
                    else:
                        # rename reads to barcodes used and write to assigned file
                        trimmed_f_read, trimmed_r_read, f_bc, r_bc = dm_out
                        bc_name = "%s_%s" % (f_bc, r_bc)
                        barcode_to_count[bc_name] = barcode_to_count.get(bc_name, 0) + 1

                        # we want to rename to sample names, and a barcode was
                        # found to match something in barcodes.txt, but that
                        # particular barcode is not in use in our plate
                        # layout. in this case, ignore this read
                        if options.use_plate and bc_name not in barcode_to_sample:
                            # strip pair info from read and write to unassigned file
                            f_read.id = f_read.id.split(" ")[0]
                            f_unassigned.write(f_read.raw())
                            r_read.id = r_read.id.split(" ")[0]
                            r_unassigned.write(r_read.raw())
                            continue

                        if options.use_plate:
                            trimmed_f_read.id = "%s_%s" % (
                                barcode_to_sample[bc_name],
                                barcode_to_count[bc_name])
                        else:
                            trimmed_f_read.id = "%s_%s" % (
                                bc_name, barcode_to_count[bc_name])
                        f_assigned.write(trimmed_f_read.raw())

                        if options.use_plate:
                            trimmed_r_read.id = "%s_%s" % (
                                barcode_to_sample[bc_name],
                                barcode_to_count[bc_name])
                        else:
                            trimmed_r_read.id = "%s_%s" % (
                                bc_name, barcode_to_count[bc_name])
                        r_assigned.write(trimmed_r_read.raw())

        if total_reads > 0:
            if quality_reads / total_reads < options.min_qual_perc:
                sys.stderr.write(
                    " Warning: only %.02f%% of reads passed quality filter\n" %
                    (quality_reads * 100 / total_reads))

            assigned_pairs = sum(barcode_to_count.values())
            if barcode_to_count:
                pairs_per_barcode = int(mean(barcode_to_count.values()))
            else:
                pairs_per_barcode = 0

            sys.stderr.write("\nSummary")
            sys.stderr.write("\n Total pairs: %d" % total_reads)
            sys.stderr.write("\n Quality pairs: %d" % quality_reads)
            sys.stderr.write("\n Assigned pairs: %d" % assigned_pairs)
            sys.stderr.write("\n Unassigned pairs: %d" % (quality_reads - assigned_pairs))
            sys.stderr.write("\n Avg pairs/barcode: %d\n" % pairs_per_barcode)

    # most frequent barcode (or sample) first
    sorted_barcode_to_count = sorted(barcode_to_count.items(),
                                     key=lambda x: x[1], reverse=True)
    with open(os.path.join(options.output_dir, "barcode_to_count.log"), "w") as fp:
        if options.dir_name == False and options.use_plate:
            for barcode, count in sorted_barcode_to_count:
                try:
                    sample_name = barcode_to_sample[barcode]
                except KeyError:
                    sample_name = "BARCODE_NOT_IN_PLATE_LAYOUT"
                fp.write("%s\t%s\t%s\n" % (barcode, sample_name, count))
        else:
            for barcode, count in sorted_barcode_to_count:
                fp.write("%s\t%s\n" % (barcode, count))

    # remove temporary files, but only the ones created above
    if made_temp_inputs:
        os.unlink(fwd.name)
        os.unlink(rev.name)