def trim_reads(data_folder, adaID, VERBOSE=0, summary=True, quality=25,
               blocksize=10, minlen_read1=100, minlen_read2=50):
    '''Trim low quality at the end of reads.

    Reads the two gzipped FASTQ files of a paired-end sample in lockstep,
    trims every read with trim_read() and writes the pair to the trimmed
    output files only if both trimmed reads are long enough.

    Parameters:
       data_folder: folder of the sequencing run, forwarded to the filename
                    helpers
       adaID: adapter ID of the sample; must be a string because it is
              concatenated into the summary text
       VERBOSE: if nonzero print the final counts, if >= 2 also print a
                progress counter every 10000 read pairs
       summary: if true, append the counts to the trim summary file
       quality, blocksize: forwarded unchanged to trim_read()
                           (presumably the quality threshold and the size of
                           the window it is checked on -- confirm in
                           trim_read)
       minlen_read1, minlen_read2: a pair is kept only if the trimmed reads
                                   are strictly longer than these values

    Returns:
       None. Results are written to the trimmed read files and, optionally,
       to the summary file.
    '''
    fn_in = get_read_filenames(data_folder, adaID, gzip=True)
    fn_out = get_read_filenames(data_folder, adaID, gzip=True, trimmed=True)

    n_good = 0
    n_discarded = 0
    # Both output files must be opened for writing ('wb'), otherwise
    # SeqIO.write fails on the first kept pair.
    with gzip.open(fn_in[0], 'rb') as fin1, \
         gzip.open(fn_in[1], 'rb') as fin2, \
         gzip.open(fn_out[0], 'wb') as fout1, \
         gzip.open(fn_out[1], 'wb') as fout2:

        # SeqIO.parse yields one record at a time; SeqIO.read would return a
        # single record and raise on files with more than one read.
        it1 = SeqIO.parse(fin1, 'fastq')
        it2 = SeqIO.parse(fin2, 'fastq')
        for irp, reads in enumerate(izip(it1, it2)):

            if VERBOSE >= 2 and not ((irp + 1) % 10000):
                print(irp + 1)

            # Trim both reads
            trims = [trim_read(read, quality=quality, blocksize=blocksize)
                     for read in reads]

            # A list (not map) so that indexing works on any Python version
            lrs = [len(trimmed) for trimmed in trims]
            if (lrs[0] > minlen_read1) and (lrs[1] > minlen_read2):
                SeqIO.write(trims[0], fout1, 'fastq')
                SeqIO.write(trims[1], fout2, 'fastq')
                n_good += 1
            else:
                n_discarded += 1

    # Every pair is either kept or discarded, so the sum is the exact total;
    # unlike the loop index it is also defined when the input has no reads.
    n_total = n_good + n_discarded

    if VERBOSE:
        # Single-argument print(...) gives the same output whether print is
        # a statement or a function.
        print('Trim lowq ends of reads:')
        print('Good: ' + str(n_good))
        print('Discarded: ' + str(n_discarded))

    # Write summary to file
    if summary:
        with open(get_trim_summary_filename(data_folder, adaID), 'a') as f:
            f.write('\n')
            f.write('Trim low quality ends results: adaID ' + adaID + '\n')
            f.write('Total:\t\t' + str(n_total) + '\n')
            f.write('Good:\t\t' + str(n_good) + '\n')
            f.write('Discarded:\t' + str(n_discarded) + '\n')
def trim_reads(data_folder, adaID, VERBOSE=0, summary=True, quality=25,
               blocksize=10, minlen_read1=100, minlen_read2=50):
    '''Trim low quality at the end of reads.

    Iterates over the two gzipped FASTQ files of a paired-end sample in
    parallel, trims each read with trim_read() and keeps the pair only if
    both trimmed reads exceed their minimal length. Kept pairs are written
    to the trimmed output files.

    Parameters:
       data_folder: folder of the sequencing run, forwarded to the filename
                    helpers
       adaID: adapter ID of the sample; must be a string because it is
              concatenated into the summary text
       VERBOSE: if nonzero print the final counts, if >= 2 also print a
                progress counter every 10000 read pairs
       summary: if true, append the counts to the trim summary file
       quality, blocksize: forwarded unchanged to trim_read()
                           (presumably the quality threshold and the size of
                           the window it is checked on -- confirm in
                           trim_read)
       minlen_read1, minlen_read2: a pair is kept only if the trimmed reads
                                   are strictly longer than these values

    Returns:
       None. Results are written to the trimmed read files and, optionally,
       to the summary file.
    '''
    fn_in = get_read_filenames(data_folder, adaID, gzip=True)
    fn_out = get_read_filenames(data_folder, adaID, gzip=True, trimmed=True)

    n_good = 0
    n_discarded = 0
    # Outputs are opened with 'wb': a file opened for reading cannot be the
    # target of SeqIO.write.
    with gzip.open(fn_in[0], 'rb') as fin1, \
         gzip.open(fn_in[1], 'rb') as fin2, \
         gzip.open(fn_out[0], 'wb') as fout1, \
         gzip.open(fn_out[1], 'wb') as fout2:

        # Use the record iterator (parse); SeqIO.read only accepts files
        # that contain exactly one record.
        it1 = SeqIO.parse(fin1, 'fastq')
        it2 = SeqIO.parse(fin2, 'fastq')
        for irp, reads in enumerate(izip(it1, it2)):

            if VERBOSE >= 2 and not ((irp + 1) % 10000):
                print(irp + 1)

            # Trim both reads
            trims = [trim_read(read, quality=quality, blocksize=blocksize)
                     for read in reads]

            # A list (not map) so that indexing works on any Python version
            lrs = [len(trimmed) for trimmed in trims]
            if (lrs[0] > minlen_read1) and (lrs[1] > minlen_read2):
                SeqIO.write(trims[0], fout1, 'fastq')
                SeqIO.write(trims[1], fout2, 'fastq')
                n_good += 1
            else:
                n_discarded += 1

    # The loop index is one less than the number of pairs and is undefined
    # for empty input, so the total is derived from the two counters.
    n_total = n_good + n_discarded

    if VERBOSE:
        # Single-argument print(...) gives the same output whether print is
        # a statement or a function.
        print('Trim lowq ends of reads:')
        print('Good: ' + str(n_good))
        print('Discarded: ' + str(n_discarded))

    # Write summary to file
    if summary:
        with open(get_trim_summary_filename(data_folder, adaID), 'a') as f:
            f.write('\n')
            f.write('Trim low quality ends results: adaID ' + adaID + '\n')
            f.write('Total:\t\t' + str(n_total) + '\n')
            f.write('Good:\t\t' + str(n_good) + '\n')
            f.write('Discarded:\t' + str(n_discarded) + '\n')
# Load the sequencing run and remember where its data is stored
dataset = load_sequencing_run(seq_run)
data_folder = dataset.folder

# With no adaIDs given all samples of the run are processed, otherwise only
# the samples whose adapter is in the requested list
samples = dataset.samples
if adaIDs is not None:
    samples = samples.loc[samples.adapter.isin(adaIDs)]
if VERBOSE >= 2:
    print(samples.index.tolist())

# Process the selected samples one adapter at a time
for samplename, sample in samples.iterrows():
    adaID = str(sample.adapter)

    # If requested, submit this sample to the cluster instead of trimming
    # it in this process
    if submit:
        fork_self(seq_run, adaID,
                  VERBOSE=VERBOSE,
                  threads=threads,
                  reference=refname,
                  summary=summary)
        continue

    # Start a fresh summary file ('w') whose first line records the call
    if summary:
        with open(get_trim_summary_filename(data_folder, adaID), 'w') as f_summary:
            f_summary.write(
                'Call: python trim_reads_lowq.py --run ' + seq_run +
                ' --adaIDs ' + adaID +
                ' --threads ' + str(threads) +
                ' --reference ' + refname +
                ' --verbose ' + str(VERBOSE) + '\n'
            )

    trim_reads(data_folder, adaID, VERBOSE=VERBOSE, summary=summary)
# With no adaIDs given all samples of the run are processed, otherwise only
# the samples whose adapter is in the requested list
samples = dataset.samples
if adaIDs is not None:
    samples = samples.loc[samples.adapter.isin(adaIDs)]
if VERBOSE >= 2:
    print(samples.index.tolist())

# Process the selected samples one adapter at a time
for samplename, sample in samples.iterrows():
    adaID = str(sample.adapter)

    # If requested, submit this sample to the cluster instead of trimming
    # it in this process
    if submit:
        fork_self(seq_run, adaID,
                  VERBOSE=VERBOSE,
                  threads=threads,
                  reference=refname,
                  summary=summary)
        continue

    # Start a fresh summary file ('w') whose first line records the call
    if summary:
        with open(get_trim_summary_filename(data_folder, adaID), 'w') as f_summary:
            f_summary.write(
                'Call: python trim_reads_lowq.py --run ' + seq_run +
                ' --adaIDs ' + adaID +
                ' --threads ' + str(threads) +
                ' --reference ' + refname +
                ' --verbose ' + str(VERBOSE) + '\n'
            )

    trim_reads(data_folder, adaID, VERBOSE=VERBOSE, summary=summary)