def start(self, fastq_file1, fastq_file2, fastq_fileU, output_prefix, rdpPath,
          gene='16srrna', train=None, batchsize=10000, minQ=None, minL=0,
          procs=1, test=False, verbose=True, debug=False):
    """
    Start classifying double barcoded Illumina sequencing run

    Reads are pulled from the single-end input and then the paired-end input
    in batches of ``batchsize``. Every batch is written through an
    ``IlluminaFastaOutput`` named ``<output_prefix>.<reads consumed so far>``
    and submitted to ``rdpCall`` on a worker pool. Once ``check_status``
    reports nothing pending, the per-batch ``.fixrank`` files are concatenated
    into ``<output_prefix>.fixrank`` and deleted.

    Parameters:
        fastq_file1, fastq_file2: paired-end inputs; only used when BOTH are
            not None.
        fastq_fileU: single-end input, or None.
        output_prefix: prefix for the temporary batch files and final result.
        rdpPath: forwarded unchanged to ``rdpCall``.
        gene: one of 16srrna, fungallsu, fungalits_warcup, fungalits_unite.
            Only validated when ``train`` is None.
        train: forwarded unchanged to ``rdpCall``.
        batchsize: number of reads requested from the run per batch.
        minQ, minL: forwarded to ``read.trimRead``. Trimming is skipped only
            when both compare equal to 0, so the default ``minQ=None`` still
            trims.
        procs: number of worker processes in the pool.
        test: when true, stop after the first batch of each input.
        verbose: stored on ``self.verbose``; enables progress messages.
        debug: when true, write the traceback of a fatal error to stderr.

    Returns:
        0 on success, 1 on any error or interruption (errors are reported on
        stderr, never raised to the caller).
    """
    import shutil  # function-scope import; only the merge step needs it

    results = {}
    self.verbose = verbose
    pool = None

    def release_pool(abort):
        # Shut the worker pool down so no child process outlives this call.
        # NOTE(review): assumes Pool is multiprocessing.Pool - confirm.
        if pool is None:
            return
        if abort:
            pool.terminate()
        else:
            pool.close()
        pool.join()

    try:
        valid_genes = ('16srrna', 'fungallsu', 'fungalits_warcup', 'fungalits_unite')
        if train is None and gene not in valid_genes:
            sys.stderr.write("ERROR:[classify] parameter -g (--gene) must be one of 16srrna or fungallsu or fungalits_warcup or fungalits_unite \n")
            raise Exception("unsupported gene: %r" % (gene,))

        # establish and open the Illumina run
        if fastq_file1 is not None and fastq_file2 is not None:
            self.runPairs = TwoReadIlluminaRun(fastq_file1, fastq_file2)
            self.runPairs.open()
        else:
            self.runPairs = None
        if fastq_fileU is not None:
            self.runSingle = OneReadIlluminaRun(fastq_fileU)
            self.runSingle.open()
        else:
            self.runSingle = None
        if self.runPairs is None and self.runSingle is None:
            sys.stderr.write("ERROR:[classify] input reads not specified, or incorrect pairs\n")
            raise Exception("no input reads")

        lasttime = time.time()
        batch = 0  # running total of reads consumed; also makes batch file names unique
        pool = Pool(procs, maxtasksperchild=1)

        # Single-end reads first, then pairs; they differ only in how a read
        # is rendered as FASTA.
        sources = []
        if self.runSingle is not None:
            sources.append((self.runSingle, lambda rd: rd.getFasta()))
        if self.runPairs is not None:
            sources.append((self.runPairs, lambda rd: rd.getJoinedFasta()))

        trim = minQ != 0 or minL != 0
        for run, to_fasta in sources:
            while True:
                # get next batch of reads
                reads = run.next(batchsize)
                if len(reads) == 0:
                    break
                batch += len(reads)
                run_out = IlluminaFastaOutput(output_prefix + "." + str(batch))
                # process individual reads
                for read in reads:
                    if trim:
                        read.trimRead(minQ, minL)
                        if not read.goodRead:
                            continue  # read was rejected by trimming
                    run_out.addRead(to_fasta(read))
                # Write out reads
                if run_out.count() > batchsize:
                    sys.stderr.write("WARNING:[classify] output count exceeds batch count\n")
                run_out.writeReads()
                rdp_out = output_prefix + "." + str(batch) + ".fixrank"
                results[rdp_out] = pool.apply_async(
                    rdpCall,
                    (run_out.output_prefix, rdp_out, gene, train, rdpPath, self.verbose))
                if test:
                    break

        # Poll once a second until check_status reports nothing pending.
        while True:
            time.sleep(1)
            if check_status(results) == 0:
                break
        release_pool(abort=False)

        if self.verbose:
            sys.stderr.write("Combining temporary files\n")
        with open(output_prefix + ".fixrank", "wb") as outfile:
            for part in results:
                with open(part, "rb") as infile:
                    # stream the copy instead of loading the whole file
                    shutil.copyfileobj(infile, outfile)
                os.remove(part)

        if self.verbose:
            sys.stdout.write("%s reads processed in %s minutes\n" % (batch, round((time.time() - lasttime) / (60), 2)))
        self.clean(results)
        return 0
    except (KeyboardInterrupt, SystemExit):
        # stop the workers before cleaning up
        release_pool(abort=True)
        self.clean(results)
        sys.stderr.write("%s unexpectedly terminated\n" % (__name__))
        return 1
    except Exception:
        release_pool(abort=True)
        self.clean(results)
        sys.stderr.write("A fatal error was encountered.\n")
        if debug:
            sys.stderr.write("".join(traceback.format_exception(*sys.exc_info())))
        return 1
def start(self, fastq_file1, fastq_file2, fastq_fileU, output_prefix, rdpPath,
          gene='16srrna', train=None, batchsize=10000, minQ=None, minL=0,
          procs=1, test=False, verbose=True, debug=False):
    """
    Start classifying double barcoded Illumina sequencing run

    Reads are pulled from the single-end input and then the paired-end input
    in batches of ``batchsize``. Every batch is written through an
    ``IlluminaFastaOutput`` named ``<output_prefix>.<reads consumed so far>``
    and submitted to ``rdpCall`` on a worker pool. Once ``check_status``
    reports nothing pending, the per-batch ``.fixrank`` files are concatenated
    into ``<output_prefix>.fixrank`` and deleted.

    Parameters:
        fastq_file1, fastq_file2: paired-end inputs; only used when BOTH are
            not None.
        fastq_fileU: single-end input, or None.
        output_prefix: prefix for the temporary batch files and final result.
        rdpPath: forwarded unchanged to ``rdpCall``.
        gene: one of 16srrna, fungallsu, fungalits_warcup, fungalits_unite.
            Only validated when ``train`` is None.
        train: forwarded unchanged to ``rdpCall``.
        batchsize: number of reads requested from the run per batch.
        minQ, minL: forwarded to ``read.trimRead``. Trimming is skipped only
            when both compare equal to 0, so the default ``minQ=None`` still
            trims.
        procs: number of worker processes in the pool.
        test: when true, stop after the first batch of each input.
        verbose: stored on ``self.verbose``; enables progress messages.
        debug: when true, write the traceback of a fatal error to stderr.

    Returns:
        0 on success, 1 on any error or interruption (errors are reported on
        stderr, never raised to the caller).
    """
    import shutil  # function-scope import; only the merge step needs it

    results = {}
    self.verbose = verbose
    pool = None

    def release_pool(abort):
        # Shut the worker pool down so no child process outlives this call.
        # NOTE(review): assumes Pool is multiprocessing.Pool - confirm.
        if pool is None:
            return
        if abort:
            pool.terminate()
        else:
            pool.close()
        pool.join()

    try:
        valid_genes = ('16srrna', 'fungallsu', 'fungalits_warcup', 'fungalits_unite')
        if train is None and gene not in valid_genes:
            sys.stderr.write("ERROR:[classify] parameter -g (--gene) must be one of 16srrna or fungallsu or fungalits_warcup or fungalits_unite \n")
            raise Exception("unsupported gene: %r" % (gene,))

        # establish and open the Illumina run
        if fastq_file1 is not None and fastq_file2 is not None:
            self.runPairs = TwoReadIlluminaRun(fastq_file1, fastq_file2)
            self.runPairs.open()
        else:
            self.runPairs = None
        if fastq_fileU is not None:
            self.runSingle = OneReadIlluminaRun(fastq_fileU)
            self.runSingle.open()
        else:
            self.runSingle = None
        if self.runPairs is None and self.runSingle is None:
            sys.stderr.write("ERROR:[classify] input reads not specified, or incorrect pairs\n")
            raise Exception("no input reads")

        lasttime = time.time()
        batch = 0  # running total of reads consumed; also makes batch file names unique
        pool = Pool(procs, maxtasksperchild=1)

        # Single-end reads first, then pairs; they differ only in how a read
        # is rendered as FASTA.
        sources = []
        if self.runSingle is not None:
            sources.append((self.runSingle, lambda rd: rd.getFasta()))
        if self.runPairs is not None:
            sources.append((self.runPairs, lambda rd: rd.getJoinedFasta()))

        trim = minQ != 0 or minL != 0
        for run, to_fasta in sources:
            while True:
                # get next batch of reads
                reads = run.next(batchsize)
                if len(reads) == 0:
                    break
                batch += len(reads)
                run_out = IlluminaFastaOutput(output_prefix + "." + str(batch))
                # process individual reads
                for read in reads:
                    if trim:
                        read.trimRead(minQ, minL)
                        if not read.goodRead:
                            continue  # read was rejected by trimming
                    run_out.addRead(to_fasta(read))
                # Write out reads
                if run_out.count() > batchsize:
                    sys.stderr.write("WARNING:[classify] output count exceeds batch count\n")
                run_out.writeReads()
                rdp_out = output_prefix + "." + str(batch) + ".fixrank"
                results[rdp_out] = pool.apply_async(
                    rdpCall,
                    (run_out.output_prefix, rdp_out, gene, train, rdpPath, self.verbose))
                if test:
                    break

        # Poll once a second until check_status reports nothing pending.
        while True:
            time.sleep(1)
            if check_status(results) == 0:
                break
        release_pool(abort=False)

        if self.verbose:
            sys.stderr.write("Combining temporary files\n")
        with open(output_prefix + ".fixrank", "wb") as outfile:
            for part in results:
                with open(part, "rb") as infile:
                    # stream the copy instead of loading the whole file
                    shutil.copyfileobj(infile, outfile)
                os.remove(part)

        if self.verbose:
            sys.stdout.write("%s reads processed in %s minutes\n" % (batch, round((time.time() - lasttime) / (60), 2)))
        self.clean(results)
        return 0
    except (KeyboardInterrupt, SystemExit):
        # stop the workers before cleaning up
        release_pool(abort=True)
        self.clean(results)
        sys.stderr.write("%s unexpectedly terminated\n" % (__name__))
        return 1
    except Exception:
        release_pool(abort=True)
        self.clean(results)
        sys.stderr.write("A fatal error was encountered.\n")
        if debug:
            sys.stderr.write("".join(traceback.format_exception(*sys.exc_info())))
        return 1
def start(self, fastq_file1, fastq_file2, fastq_fileU, output_prefix,
          batchsize=100000, uncompressed=False, verbose=True, debug=False):
    """
    Split an Illumina sequencing run into per-sample outputs

    Takes either a read pair (fastq_file1 and fastq_file2) or a single-read
    file (fastq_fileU); supplying both is rejected. Reads are fetched in
    batches of ``batchsize`` and each one is appended to the output keyed by
    ``read.sample`` in ``self.run_out``. An output is created at
    ``os.path.join(output_prefix, sample)`` the first time its sample is
    seen, and every known output has ``writeReads()`` called after each batch.

    Returns 0 on success and 1 on invalid arguments, interruption or any
    other error (errors are reported on stderr; the traceback is included
    only when ``debug`` is true).
    """
    self.verbose = verbose
    have_pair = fastq_file1 is not None and fastq_file2 is not None
    if fastq_fileU is not None and have_pair:
        sys.stderr.write("ERROR:[SplitBySample] cannot have both paired and single reads\n")
        return 1
    try:
        if have_pair:
            self.runPairs = TwoReadIlluminaRun(fastq_file1, fastq_file2)
            self.runPairs.open()
        else:
            self.runPairs = None
        if fastq_fileU is not None:
            self.runSingle = OneReadIlluminaRun(fastq_fileU)
            self.runSingle.open()
        else:
            self.runSingle = None
        if self.runPairs is None and self.runSingle is None:
            sys.stderr.write("ERROR:[SplitBySample] input reads not specified, or incorrect pairs\n")
            raise Exception
        self.run_out = {}

        def split_run(run, new_output):
            # Drain ``run`` batch by batch, routing each read to its sample's output.
            while True:
                if self.verbose:
                    sys.stderr.write("Processing sequence files.\n")
                # get next batch of reads
                reads = run.next(batchsize)
                if len(reads) == 0:
                    break
                for read in reads:
                    sample = read.sample
                    # first read of a sample creates that sample's output
                    if sample not in self.run_out:
                        self.run_out[sample] = new_output(sample)
                    self.run_out[sample].addRead(read.getFastqSRA())
                # Write out reads for every sample seen so far
                for sample_out in self.run_out.values():
                    sample_out.writeReads()
            if self.verbose:
                sys.stderr.write("\nSplit out %s total samples in %s.\n" % (len(self.run_out), output_prefix))
            return 0

        # Output classes are wrapped in factories so they are only looked up
        # when a new sample actually needs an output.
        if self.runPairs is not None:
            return split_run(
                self.runPairs,
                lambda sample: IlluminaTwoReadOutput(os.path.join(output_prefix, sample), uncompressed))
        if self.runSingle is not None:
            return split_run(
                self.runSingle,
                lambda sample: IlluminaOneReadOutput(os.path.join(output_prefix, sample), uncompressed))
    except (KeyboardInterrupt, SystemExit):
        self.clean()
        sys.stderr.write("%s unexpectedly terminated.\n" % (__name__))
        return 1
    except:
        self.clean()
        sys.stderr.write("A fatal error was encountered.\n")
        if debug:
            sys.stderr.write("".join(traceback.format_exception(*sys.exc_info())))
        return 1