def stats(fastq_file):
    """Generate basic stats from FASTQ file

    Loops over every read in the FASTQ file and prints a report to
    stdout consisting of:

    - the total number of reads
    - the number of reads for each sequence length
    - the number of reads for each index sequence

    Both distributions are reported in sorted key order so that the
    output is reproducible from run to run.

    Arguments:
      fastq_file: FASTQ file to report on (handed to
        FASTQFile.FastqIterator)

    Returns:
      No return value
    """
    n_reads = 0
    read_lengths = {}
    index_sequences = {}
    for read in FASTQFile.FastqIterator(fastq_file):
        # Count of reads
        n_reads += 1
        # Read length distribution
        read_len = len(read.sequence)
        read_lengths[read_len] = read_lengths.get(read_len, 0) + 1
        # Index sequence distribution; reads whose identifier has no
        # index sequence (None) are left out of this tally
        index_seq = read.seqid.index_sequence
        if index_seq is not None:
            index_sequences[index_seq] = \
                index_sequences.get(index_seq, 0) + 1
    # Finished: report the results
    # NB print is always given one pre-formatted string in parentheses,
    # which behaves the same as a statement or as a function call
    print("Total reads: %d" % n_reads)
    print("Read lengths")
    for len_ in sorted(read_lengths):
        print("\t%d: %d" % (len_, read_lengths[len_]))
    print("Index sequences")
    for seq in sorted(index_sequences):
        print("\t%s: %d" % (seq, index_sequences[seq]))
def edit_instrument_name(fastq_file, new_instrument_name):
    """Rewrite the instrument name of every record in a FASTQ file

    Every record read from the supplied FASTQ file is written to
    stdout. When a non-empty new instrument name is supplied, the
    instrument name held in the record's sequence identifier (i.e.
    the first line of the record) is replaced with it before the
    record is echoed; otherwise the records are echoed without that
    change.

    """
    # The decision to rename is the same for every record, so make
    # it once up front
    rename = bool(new_instrument_name)
    for record in FASTQFile.FastqIterator(fastq_file):
        if rename:
            # Swap in the new instrument name
            record.seqid.instrument_name = new_instrument_name
        # Write the (possibly updated) record to stdout
        print(record)
"N_SUBSET reads. (Quicker than using all reads but may not be accurate " "if subset is not representative of the file as a whole.)") # Process the command line options, arguments = p.parse_args() if len(arguments) != 1: p.error("input FASTQ file required") else: fastq_file = arguments[0] if not os.path.exists(fastq_file): p.error("Input file '%s' not found" % fastq_file) # Get broad format type print "Sniffing %s" % fastq_file print "\nData from first read:" for read in FASTQFile.FastqIterator(fastq_file): fastq_format = read.seqid.format if fastq_format is None and read.is_colorspace: fastq_format = 'colorspace' print "\tHeader format:\t%s" % str(fastq_format) print "\tSeq length:\t%d" % read.seqlen break # Determine the quality score range (and count reads) try: n_subset = int(options.n_subset) except TypeError: n_subset = None n_reads = 0 min_max_qual = (None, None) for read in FASTQFile.FastqIterator(fastq_file):
def demultiplex_fastq(fastq_file, barcodes, nmismatches):
    """Perform demultiplexing of a FASTQ file

    Demultiplex reads in a FASTQ file given information about a set of
    barcode/index sequences.

    Produces a file for each barcode associated with the lane of the
    input file, plus another for 'unbinned' reads (i.e. reads which
    didn't match any of the barcodes). The files are created in the
    current working directory. Each read is written to the file for
    the first barcode that it matches, in the order the barcodes were
    supplied.

    If none of the barcodes are associated with the lane of the input
    file then nothing is written and the function returns immediately.

    Arguments:
      fastq_file: FASTQ file to be demultiplexed (can be gzipped)
      barcodes: list of dictionaries, one per barcode, each with the
        keys 'lane' (lane number the barcode belongs to), 'name' and
        'index' (both used to build the output file name; 'index' is
        also reported in the progress output) and 'matcher' (object
        with a 'match(sequence, nmismatches)' method used to test
        whether a read's index sequence matches the barcode)
      nmismatches: maximum number of mismatched bases allowed when
        testing whether barcode sequences match

    Returns:
      No return value

    Raises:
      SystemExit: (via sys.exit(1)) if any of the output files
        already exists.
    """
    # Start
    # NB print is always given one pre-formatted string in parentheses,
    # which behaves the same as a statement or as a function call
    print("Processing %s" % fastq_file)
    info = IlluminaData.IlluminaFastq(fastq_file)
    # Open output files, keyed by barcode index (plus 'unbinned')
    output_files = {}
    nmatched = 0
    try:
        # Weed out barcodes that aren't associated with this lane
        local_barcodes = []
        for barcode in barcodes:
            if barcode['lane'] != info.lane_number:
                continue
            local_barcodes.append(barcode)
            output_file_name = "%s_%s_L%03d_R%d_%03d.fastq" % (
                barcode['name'],
                barcode['index'],
                info.lane_number,
                info.read_number,
                info.set_number)
            print("\t%s\t%s" % (barcode['index'], output_file_name))
            # Never overwrite the results of an earlier run
            if os.path.exists(output_file_name):
                print("\t%s: already exists,exiting" % output_file_name)
                sys.exit(1)
            output_files[barcode['index']] = open(output_file_name, 'w')
        # Check if there's anything to do
        if not local_barcodes:
            return
        # Also make a file for unbinned reads
        unbinned_file_name = "unbinned_L%03d_R%d_%03d.fastq" % (
            info.lane_number,
            info.read_number,
            info.set_number)
        if os.path.exists(unbinned_file_name):
            print("\t%s: already exists,exiting" % unbinned_file_name)
            sys.exit(1)
        output_files['unbinned'] = open(unbinned_file_name, 'w')
        # Process reads
        for read in FASTQFile.FastqIterator(fastq_file):
            this_barcode = read.seqid.index_sequence
            for barcode in local_barcodes:
                if barcode['matcher'].match(this_barcode, nmismatches):
                    # First matching barcode wins
                    destination = output_files[barcode['index']]
                    nmatched += 1
                    break
            else:
                # Put in unbinned if no match
                destination = output_files['unbinned']
            destination.write(str(read) + '\n')
    finally:
        # Close every file that was opened (including 'unbinned'),
        # whether processing completed, failed or exited early
        for fp in output_files.values():
            fp.close()
    # Report the number of reads that were assigned to a barcode
    # (rather than the total number of reads in the file)
    print("\tMatched %d reads for %s" % (nmatched,
                                         os.path.basename(fastq_file)))