def simple(fastq=None, fp=None):
    """
    Count the reads in a FASTQ file

    Thin wrapper which delegates the counting to the
    FASTQFile.nreads function; both arguments are passed
    through to it unchanged.

    Arguments:
      fastq: fastq(.gz) file
      fp: open file descriptor for fastq file

    Returns:
      Number of reads

    """
    read_count = FASTQFile.nreads(fastq=fastq, fp=fp)
    return read_count
def batch_fastqs(fastqs, nbatches, basename="batched", out_dir=None):
    """
    Splits reads from one or more Fastqs into batches

    Concatenates input Fastq files and then splits reads into
    smaller Fastqs using the external 'split' utility.

    The batch size is the total number of reads divided by the
    number of batches, rounded up. Because of the rounding,
    'split' can produce fewer than 'nbatches' files (for example
    21 reads in 8 batches gives 7 files of up to 3 reads each);
    only the files that are expected to have been written are
    returned.

    Arguments:
      fastqs (list): list of paths to one or more Fastq files
        to take reads from
      nbatches (int): number of batches to output reads into
      basename (str): optional basename to use for the output
        Fastq files (default: 'batched')
      out_dir (str): optional path to a directory where the
        batched Fastqs will be written (default: current
        working directory)

    Returns:
      List: paths of the batched Fastq files, in batch order.

    Raises:
      ValueError: if no Fastqs are supplied, if 'nbatches' is
        less than 1, or if the input Fastqs contain no reads.
      Exception: if the batching script exits with a non-zero
        status.

    """
    # Validate the inputs before doing any work
    if not fastqs:
        raise ValueError("No Fastq files supplied for batching")
    if nbatches < 1:
        raise ValueError("Number of batches must be at least 1 (got %s)"
                         % nbatches)
    if out_dir is None:
        # os.path.join cannot handle None, so fall back to the
        # current directory
        out_dir = os.getcwd()
    # Count the total number of reads
    print("Fetching read counts:")
    nreads = 0
    for fq in fastqs:
        n = FASTQFile.nreads(fq)
        print("%s:\t%d" % (os.path.basename(fq), n))
        nreads += n
    print("Total reads: %d" % nreads)
    if nreads == 0:
        raise ValueError("No reads found in input Fastqs: nothing to batch")
    # Determine batch size: integer division rounded up, so that
    # batch_size * nbatches >= nreads always holds and the batch
    # size is never zero
    batch_size = (nreads + nbatches - 1) // nbatches
    print("Creating batches of %d reads" % batch_size)
    # Check if fastqs are compressed
    # NB only the first Fastq is examined; the inputs are assumed
    # to be either all gzipped or all uncompressed
    gzipped = fastqs[0].endswith('.gz')
    if gzipped:
        batch_cmd = Command('zcat')
    else:
        batch_cmd = Command('cat')
    # Get the read number
    read_number = get_read_number(fastqs[0])
    suffix = ".r%s.fastq" % read_number
    # Build and run the batching command
    # The line count passed to 'split' is batch_size * 4 (four
    # lines per read)
    batch_cmd.add_args(*fastqs)
    batch_cmd.add_args('|',
                       'split',
                       '-l', batch_size * 4,
                       '-d',
                       '-a', 3,
                       '--additional-suffix=%s' % suffix,
                       '-',
                       os.path.join(out_dir, "%s.B" % basename))
    batch_script = os.path.join(out_dir, "batch.sh")
    batch_cmd.make_wrapper_script("/bin/bash", batch_script)
    # Check for successful exit code
    retcode = Command("/bin/bash",
                      batch_script).run_subprocess(working_dir=out_dir)
    if retcode != 0:
        raise Exception("Batching failed: exit code %s" % retcode)
    # Collect and return the batched Fastq names
    # 'split' only writes as many files as are needed to hold all
    # the reads, which can be fewer than the requested number
    nfiles = (nreads + batch_size - 1) // batch_size
    batched_fastqs = [os.path.join(out_dir,
                                   "%s.B%03d%s" % (basename, i, suffix))
                      for i in range(nfiles)]
    return batched_fastqs
fastqs = sample.fastq_subset(read_number=1) + \ sample.fastq_subset(read_number=2) for fastq in fastqs: print "\t\t%s" % fastq # Report the names of the samples in each project if options.report: for project in illumina_data.projects: print "%s" % IlluminaData.describe_project(project) # Report statistics for fastq files if options.stats: # Print number of reads for each file, and file size for sample in project.samples: for fastq in sample.fastq: fq = os.path.join(sample.dirn, fastq) nreads = FASTQFile.nreads(fq) fsize = os.path.getsize(fq) print "%s\t%s\t%d" % ( fastq, bcf_utils.format_file_size(fsize), nreads) print "" # Summary: short report suitable for logging file if options.summary: print "%s" % IlluminaData.summarise_projects(illumina_data) # Print number of undetermined reads if options.stats and illumina_data.undetermined is not None: print "Undetermined indices" for lane in illumina_data.undetermined.samples: for fastq in lane.fastq: fq = os.path.join(lane.dirn, fastq)
fastqs = sample.fastq_subset(read_number=1) + \ sample.fastq_subset(read_number=2) for fastq in fastqs: print "\t\t%s" % fastq # Report the names of the samples in each project if options.report: for project in illumina_data.projects: print "%s" % IlluminaData.describe_project(project) # Report statistics for fastq files if options.stats: # Print number of reads for each file, and file size for sample in project.samples: for fastq in sample.fastq: fq = os.path.join(sample.dirn,fastq) nreads = FASTQFile.nreads(fq) fsize = os.path.getsize(fq) print "%s\t%s\t%d" % (fastq, bcf_utils.format_file_size(fsize), nreads) print "" # Summary: short report suitable for logging file if options.summary: print "%s" % IlluminaData.summarise_projects(illumina_data) # Print number of undetermined reads if options.stats and illumina_data.undetermined is not None: print "Undetermined indices" for lane in illumina_data.undetermined.samples: for fastq in lane.fastq: