def build(self, infiles, outfiles, output_prefix):
    """Assemble a ``pandaseq`` command line merging overlapping read pairs.

    Parameters
    ----------
    infiles : sequence
        Exactly two fastq filenames (first and second of pair).
    outfiles : sequence
        ``outfiles[0]`` receives the gzip-compressed assembled reads;
        unassembled reads go to ``<outfile>.unpaired.gz``.
    output_prefix : str
        Prefix for the log file and the ``<prefix>.dir`` working directory.

    Returns
    -------
    str
        Shell command string (``checkpoint`` separators are a pipeline
        convention handled by the caller).

    Bug fix: the original template read ``....log.bgz; >& ...log;`` — the
    stray ``;`` turned the redirection into a separate empty command, so
    pandaseq's terminal output was never captured and an empty log file was
    created instead. The redirection now applies to the pandaseq command.
    """
    processing_options = self.processing_options
    threads = self.threads
    # per-track working directory that is gzipped after the run
    outdir = os.path.join(output_prefix + ".dir")
    infile1, infile2 = infiles
    outfile = outfiles[0]
    # -U: unpaired reads, -w: assembled reads (both gzipped via process
    # substitution); -G: pandaseq's own bzip2 log
    cmd = '''pandaseq -f %(infile1)s -r %(infile2)s %(processing_options)s -T %(threads)i -U >(gzip > %(outfile)s.unpaired.gz) -w >(gzip > %(outfile)s) -F -G %(output_prefix)s-pandaseq.log.bgz >& %(output_prefix)s-pandaseq.log; checkpoint; gzip %(outdir)s/*; checkpoint; ''' % locals()
    return cmd
def build(self, infiles, outfiles, output_prefix):
    """Assemble a ``sickle`` quality-trimming command.

    Accepts either one fastq file (``sickle se``) or a pair
    (``sickle pe``); each input maps to the output file at the same
    position in ``outfiles``. Errors are appended to
    ``<output_prefix>.log``.
    """
    assert len(infiles) == len(outfiles)
    assert len(infiles) in (1, 2)
    prefix = self.prefix
    processing_options = self.processing_options
    offset = Fastq.getOffset("sanger", raises=False)
    # sickle takes the quality encoding by name, not by numeric offset
    quality = {33: 'sanger', 64: 'illumina', 59: 'solexa'}[offset]
    if len(infiles) == 2:
        infile1, infile2 = infiles
        outfile1, outfile2 = outfiles
        template = '''sickle pe -g -s %(processing_options)s --qual-type %(quality)s -f %(infile1)s -r %(infile2)s -o %(outfile1)s -p %(outfile2)s 2>>%(output_prefix)s.log ;'''
    else:
        infile = infiles[0]
        outfile = outfiles[0]
        template = '''sickle se -g %(processing_options)s --qual-type %(quality)s --output-file %(outfile)s --fastq-file %(infile)s 2>>%(output_prefix)s.log ;'''
    return template % locals()
def build(self, infiles, outfiles, output_prefix):
    """Assemble a ``flash`` command merging overlapping read pairs.

    flash writes its ``<track>.*`` files into ``<output_prefix>.dir``;
    after the run the directory is gzipped and the extended fragments are
    moved to ``outfiles[0]``.
    """
    prefix = self.prefix
    processing_options = self.processing_options
    offset = Fastq.getOffset("sanger", raises=False)
    infile1, infile2 = infiles
    outfile = outfiles[0]
    # flash output basename and per-track working directory
    track = os.path.basename(output_prefix)
    outdir = os.path.join(output_prefix + ".dir")
    template = '''flash %(infile1)s %(infile2)s -p %(offset)s %(processing_options)s -o %(track)s -d %(outdir)s >& %(output_prefix)s-flash.log; checkpoint; gzip %(outdir)s/*; checkpoint; mv %(outdir)s/%(track)s.extendedFrags.fastq.gz %(outfile)s; '''
    return template % locals()
def build(self, infiles, outfiles, output_prefix):
    """Assemble ``fastx_trimmer`` commands, one per input fastq file.

    Each input is decompressed, trimmed and recompressed into the output
    file at the same position in ``outfiles``; the individual commands are
    joined with ``checkpoint`` separators (pipeline convention).

    Fixes: removed a duplicated ``assert len(infiles) == len(outfiles)``
    and the unused local ``prefix``.
    """
    assert len(infiles) == len(outfiles)
    assert len(infiles) in (1, 2)
    offset = Fastq.getOffset("sanger", raises=False)
    processing_options = self.processing_options
    cmds = []
    for infile, outfile in zip(infiles, outfiles):
        cmds.append('''zcat %(infile)s | fastx_trimmer -Q%(offset)s %(processing_options)s 2>> %(output_prefix)s.log | gzip > %(outfile)s ;''' % locals())
    return " checkpoint; ".join(cmds)
def build(self, infiles, outfiles, output_prefix):
    """Assemble a ``trimmomatic`` command line.

    One input yields ``trimmomatic SE``; a pair yields ``trimmomatic PE``
    followed by gzipping of the unpaired-read side files. Errors are
    appended to ``<output_prefix>.log``.
    """
    assert len(infiles) == len(outfiles)
    assert len(infiles) in (1, 2)
    offset = Fastq.getOffset("sanger", raises=False)
    threads = self.threads
    processing_options = self.processing_options
    if len(infiles) == 2:
        infile1, infile2 = infiles
        outfile1, outfile2 = outfiles
        template = '''trimmomatic PE -threads %(threads)i -phred%(offset)s %(infile1)s %(infile2)s %(outfile1)s %(output_prefix)s.1.unpaired %(outfile2)s %(output_prefix)s.2.unpaired %(processing_options)s 2>> %(output_prefix)s.log; checkpoint; gzip %(output_prefix)s.*.unpaired; '''
    else:
        infile = infiles[0]
        outfile = outfiles[0]
        template = '''trimmomatic SE -threads %(threads)i -phred%(offset)s %(infile)s %(outfile)s %(processing_options)s 2>> %(output_prefix)s.log ;'''
    return template % locals()
def build(self, infiles, outfiles, output_prefix):
    """Assemble a ``trim_galore`` command for single or paired fastq files.

    Outputs are written by trim_galore into ``--output_dir`` (the directory
    of the first output file) and then renamed to the requested
    ``outfiles``.

    Bug fix: trim_galore derives its output filenames from the *basename*
    of each input. The original code pasted the full input path after the
    output directory (``outdir/dir/of/input/...``), so the ``mv`` sources
    did not exist whenever the inputs lived outside the output directory.
    """
    assert len(infiles) == len(outfiles)
    assert len(infiles) in (1, 2)
    offset = Fastq.getOffset("sanger", raises=False)
    processing_options = self.processing_options
    if len(infiles) == 1:
        infile = infiles[0]
        outfile = outfiles[0]
        outdir = os.path.dirname(outfile)
        # single-end output is named <basename minus .fastq.gz>_trimmed.fq.gz
        trim_out = os.path.join(
            outdir,
            os.path.basename(infile).replace(".fastq.gz", "") + "_trimmed.fq.gz")
        cmd = '''trim_galore %(processing_options)s --phred%(offset)s --output_dir %(outdir)s %(infile)s 2>>%(output_prefix)s.log; mv %(trim_out)s %(outfile)s; ''' % locals()
    elif len(infiles) == 2:
        infile1, infile2 = infiles
        outfile1, outfile2 = outfiles
        outdir = os.path.dirname(outfile1)
        # NOTE(review): the _val_* names keep the full input filename
        # (including .fastq.gz), matching older trim_galore releases —
        # confirm against the installed version.
        base1 = os.path.basename(infile1)
        base2 = os.path.basename(infile2)
        cmd = '''trim_galore %(processing_options)s --paired --phred%(offset)s --output_dir %(outdir)s %(infile1)s %(infile2)s 2>>%(output_prefix)s.log; mv %(outdir)s/%(base1)s_val_1.fq.gz %(outfile1)s; mv %(outdir)s/%(base2)s_val_2.fq.gz %(outfile2)s; ''' % locals()
    return cmd
def build(self, infiles, outfiles, output_prefix):
    """Assemble a ``trim_galore`` command for single or paired fastq files.

    Bug fixes:
    - Both command templates interpolate ``%(outdir)s`` but ``outdir`` was
      never assigned, so ``% locals()`` raised ``KeyError`` on every call.
      It is now derived from the first output file, as the sibling
      implementation does.
    - The paired branch tested ``self.num_files`` (not set anywhere
      visible) instead of ``len(infiles)``; with the asserts below,
      ``len(infiles)`` is the correct and consistent condition, and it
      guarantees ``cmd`` is always bound before ``return``.
    """
    assert len(infiles) == len(outfiles)
    assert len(infiles) in (1, 2)
    offset = Fastq.getOffset("sanger", raises=False)
    processing_options = self.processing_options
    # directory trim_galore writes into; referenced by both templates
    outdir = os.path.dirname(outfiles[0])
    if len(infiles) == 1:
        infile = infiles[0]
        outfile = outfiles[0]
        # NOTE(review): assumes trim_galore's output is named from
        # output_prefix — confirm; trim_galore usually names outputs from
        # the input basename.
        trim_out = "%s_trimmed.fq.gz" % (output_prefix)
        cmd = '''trim_galore %(processing_options)s --phred%(offset)s --output_dir %(outdir)s %(infile)s 2>>%(output_prefix)s.log; mv %(trim_out)s %(outfile)s; ''' % locals()
    elif len(infiles) == 2:
        infile1, infile2 = infiles
        outfile1, outfile2 = outfiles
        # NOTE(review): the mv sources carry no directory prefix although
        # trim_galore writes into outdir — verify where the _val_* files
        # actually land.
        cmd = '''trim_galore %(processing_options)s --paired --phred%(offset)s --output_dir %(outdir)s %(infile1)s %(infile2)s 2>>%(output_prefix)s.log; mv %(infile1)s_val_1.fq.gz %(outfile1)s; mv %(infile2)s_val_2.fq.gz %(outfile2)s; ''' % locals()
    return cmd
def processReads(infiles, outfile):
    '''Process fastq reads.

    Depending on PARAMS, either combines read pairs with flash, samples
    reads, or builds a shell pipeline of contaminant removal, artifact
    filtering, trimming and quality filtering.

    infiles  -- tuple of (fastq file, contaminant file)
    outfile  -- output fastq name; ``.fastq.1.gz`` for paired data,
                otherwise ``.fastq.gz``

    NOTE(review): ``to_cluster`` and each ``statement`` are never read in
    this function; they are presumably picked up from this frame by
    ``P.run()`` (pipeline convention) — confirm before refactoring.
    '''
    infile, contaminant_file = infiles
    # set to True once at least one processing step is configured
    do_sth = False
    to_cluster = True
    # second-of-pair filename, or a false value for single-ended data
    # (presumably — confirm against checkPairs)
    infile2 = checkPairs(infile)
    if infile2:
        track = P.snip(outfile, ".fastq.1.gz")
        outfile2 = P.snip(outfile, ".fastq.1.gz") + ".fastq.2.gz"
    else:
        track = P.snip(outfile, ".fastq.gz")

    if PARAMS["process_combine_reads"]:
        # NOTE(review): message text is garbled ("cannot be can not be");
        # it is a runtime string, so it is left unchanged here.
        E.warn(
            "combining reads cannot be can not be combined with other processing for paired ended reads"
        )
        if not infile2:
            raise IOError("must have paired data to combine reads")

        # read/fragment length hints passed to flash
        read_len, frag_len, frag_stdev = PARAMS["combine_reads_read_length"], \
            PARAMS["combine_reads_fragment_length"], \
            PARAMS["combine_reads_fragment_length_stdev"]

        fragment_options = " ".join(map(str, [read_len, frag_len, frag_stdev]))

        # --max-overlap takes precedence over the fragment-length options
        if PARAMS["combine_reads_max_overlap"]:
            E.warn(
                "if specifying --max-overlap read and fragment length options will be ignored"
            )
            max_overlap = "--max-overlap=%i" % PARAMS[
                "combine_reads_max_overlap"]
            fragment_options = ""
        elif not PARAMS["combine_reads_max_overlap"] and len(
                fragment_options.strip().split(" ")) < 3:
            E.warn(
                "have not specified --read-len, --frag-len, --frag-len-stddev: default --max-overlap used"
            )
            max_overlap = ""
            fragment_options = ""
        elif PARAMS["combine_reads_read_length"] and PARAMS[
                "combine_reads_fragment_length"] and PARAMS[
                    "combine_reads_fragment_length_stdev"]:
            # NOTE(review): this inner warning is unreachable — a true
            # combine_reads_max_overlap is consumed by the first branch.
            if PARAMS["combine_reads_max_overlap"]:
                E.warn(
                    "--max-overlap will override the specified read and fragment length options"
                )
            max_overlap = ""
            fragment_options = """--read-len=%(read_len)i --fragment-len=%(frag_len)i --fragment-len-stddev=%(frag_stdev)i""" % locals()
        else:
            max_overlap = ""
            fragment_options = ""

        # optional flash arguments; empty string when the PARAM is unset
        if not PARAMS["combine_reads_min_overlap"]:
            min_overlap = ""
        else:
            min_overlap = "--min-overlap=%i" % PARAMS[
                "combine_reads_min_overlap"]
        if not PARAMS["combine_reads_threads"]:
            threads = ""
        else:
            threads = "--threads=%i" % PARAMS["combine_reads_threads"]
        if not PARAMS["combine_reads_phred_offset"]:
            phred_offset = ""
        else:
            phred_offset = "--phred-offset=%i" % PARAMS[
                "combine_reads_phred_offset"]
        if not PARAMS["combine_reads_max_mismatch_density"]:
            max_mismatch_density = ""
        else:
            max_mismatch_density = "--max-mismatch-density=%f" % PARAMS[
                "combine_reads_max_mismatch_density"]

        statement = '''flash %(min_overlap)s %(max_overlap)s %(max_mismatch_density)s %(phred_offset)s %(fragment_options)s --output-prefix=%(track)s %(threads)s --compress %(infile)s %(infile2)s >> %(outfile)s.log '''
        P.run()

        if PARAMS["combine_reads_concatenate"]:
            # keep everything: concatenate combined and uncombined reads
            infiles = " ".join([
                track + x for x in [
                    ".notCombined_1.fastq.gz", ".notCombined_2.fastq.gz",
                    ".extendedFrags.fastq.gz"
                ]
            ])
            statement = '''zcat %(infiles)s | gzip > %(outfile)s; rm -rf %(infiles)s'''
        else:
            # keep only the successfully combined fragments
            statement = '''mv %(track)s.extendedFrags.fastq.gz %(outfile)s'''
        P.run()
        return

    if PARAMS["process_sample"] and infile2:
        # paired sampling must process both mates in one pass to keep them
        # in sync, so it excludes all other processing
        E.warn(
            "sampling can not be combined with other processing for paired ended reads"
        )
        statement = '''zcat %(infile)s | python %(scriptsdir)s/fastq2fastq.py --sample=%(sample_proportion)f --pair=%(infile2)s --outfile-pair=%(outfile2)s --log=%(outfile)s_sample.log | gzip > %(outfile)s '''
        P.run()
        return

    # fastx does not like quality scores below 64 (Illumina 1.3 format)
    # need to detect the scores and convert
    format = Fastq.guessFormat(IOTools.openFile(infile), raises=False)
    E.info("%s: format guess: %s" % (infile, format))
    offset = Fastq.getOffset(format, raises=False)

    # build a shell pipeline in `s`, starting from the decompressed input
    if PARAMS["process_remove_contaminants"]:
        adaptors = listAdaptors(contaminant_file)
        # %(contamination_trim_type)s
        s = [
            ''' cutadapt %(adaptors)s --overlap=%(contamination_min_overlap_length)i --format=fastq %(contamination_options)s <( zcat < %(infile)s ) 2>> %(outfile)s_contaminants.log '''
        ]
        do_sth = True
    else:
        s = ['zcat %(infile)s']

    if PARAMS["process_artifacts"]:
        s.append(
            'fastx_artifacts_filter -Q %(offset)i -v %(artifacts_options)s 2>> %(outfile)s_artifacts.log'
        )
        do_sth = True
    if PARAMS["process_trim"]:
        s.append(
            'fastx_trimmer -Q %(offset)i -v %(trim_options)s 2>> %(outfile)s_trim.log'
        )
        do_sth = True
    # NICK - may replace fastx trimmer
    if PARAMS["process_trim_quality"]:
        s.append(
            'fastq_quality_trimmer -Q %(offset)i -v %(trim_quality_options)s 2>> %(outfile)s_trim.log'
        )
        do_sth = True
    if PARAMS["process_filter"]:
        s.append(
            'fastq_quality_filter -Q %(offset)i -v %(filter_options)s 2>> %(outfile)s_filter.log'
        )
        do_sth = True
    if PARAMS["process_sample"]:
        # NOTE(review): sampling alone does not set do_sth, so sampling
        # without any other step is silently skipped below — confirm this
        # is intended.
        s.append(
            'python %(scriptsdir)s/fastq2fastq.py --sample=%(sample_proportion)f --log=%(outfile)s_sample.log'
        )

    if not do_sth:
        E.warn("no filtering specified for %s - nothing done" % infile)
        return

    s.append("gzip")

    if not infile2:
        statement = " | ".join(s) + " > %(outfile)s"
        P.run()
    else:
        # paired data: run the same pipeline on each mate separately, then
        # reconcile so mates stay matched
        tmpfile = P.getTempFilename(".")
        tmpfile1 = tmpfile + ".fastq.1.gz"
        tmpfile2 = tmpfile + ".fastq.2.gz"

        E.warn("processing first of pair")
        # first read pair
        statement = " | ".join(s) + " > %(tmpfile1)s"
        P.run()

        # second read pair
        E.warn("processing second of pair")
        infile = infile2
        statement = " | ".join(s) + " > %(tmpfile2)s"
        P.run()

        # reconcile
        E.info("starting reconciliation")
        statement = """python %(scriptsdir)s/fastqs2fastqs.py --method=reconcile --output-pattern=%(track)s.fastq.%%s.gz %(tmpfile1)s %(tmpfile2)s > %(outfile)s_reconcile.log"""
        P.run()

        os.unlink(tmpfile1)
        os.unlink(tmpfile2)
        os.unlink(tmpfile)
def setfastqAttr(self, infiles):
    """Cache the quality-score offset for this object's fastq format.

    ``infiles`` is accepted for interface compatibility but not used.
    """
    fastq_format = self.f_format
    self.offset = Fastq.getOffset(fastq_format, raises=False)
def processReads(infiles, outfile):
    '''Process fastq reads.

    Dispatches on PARAMS: combine read pairs with flash, sample reads, or
    assemble a shell pipeline of contaminant removal, artifact filtering,
    trimming and quality filtering.

    infiles  -- tuple of (fastq file, contaminant file)
    outfile  -- output fastq name; ``.fastq.1.gz`` for paired data,
                otherwise ``.fastq.gz``

    NOTE(review): ``statement`` and ``to_cluster`` are assigned but never
    read here; ``P.run()`` presumably harvests them from this frame —
    confirm against the Pipeline module before refactoring.
    '''
    infile, contaminant_file = infiles
    # flipped to True once any processing step is configured
    do_sth = False
    to_cluster = True
    # second-of-pair filename, or a false value for single-ended data
    # (presumably — confirm against checkPairs)
    infile2 = checkPairs(infile)
    if infile2:
        track = P.snip(outfile, ".fastq.1.gz")
        outfile2 = P.snip(outfile, ".fastq.1.gz") + ".fastq.2.gz"
    else:
        track = P.snip(outfile, ".fastq.gz")

    if PARAMS["process_combine_reads"]:
        # NOTE(review): garbled wording ("cannot be can not be") kept as-is
        # because it is a runtime string.
        E.warn(
            "combining reads cannot be can not be combined with other processing for paired ended reads")
        if not infile2:
            raise IOError("must have paired data to combine reads")

        # read/fragment length hints handed to flash
        read_len, frag_len, frag_stdev = PARAMS["combine_reads_read_length"], \
            PARAMS["combine_reads_fragment_length"], \
            PARAMS["combine_reads_fragment_length_stdev"]

        fragment_options = " ".join(map(str, [read_len, frag_len, frag_stdev]))

        # --max-overlap wins over explicit read/fragment-length options
        if PARAMS["combine_reads_max_overlap"]:
            E.warn(
                "if specifying --max-overlap read and fragment length options will be ignored")
            max_overlap = "--max-overlap=%i" % PARAMS["combine_reads_max_overlap"]
            fragment_options = ""
        elif not PARAMS["combine_reads_max_overlap"] and len(
                fragment_options.strip().split(" ")) < 3:
            E.warn(
                "have not specified --read-len, --frag-len, --frag-len-stddev: default --max-overlap used")
            max_overlap = ""
            fragment_options = ""
        elif PARAMS["combine_reads_read_length"] and PARAMS[
                "combine_reads_fragment_length"] and PARAMS[
                    "combine_reads_fragment_length_stdev"]:
            # NOTE(review): unreachable warning — a true
            # combine_reads_max_overlap is handled by the first branch.
            if PARAMS["combine_reads_max_overlap"]:
                E.warn(
                    "--max-overlap will override the specified read and fragment length options")
            max_overlap = ""
            fragment_options = """--read-len=%(read_len)i --fragment-len=%(frag_len)i --fragment-len-stddev=%(frag_stdev)i""" % locals()
        else:
            max_overlap = ""
            fragment_options = ""

        # optional flash arguments; empty string when the PARAM is unset
        if not PARAMS["combine_reads_min_overlap"]:
            min_overlap = ""
        else:
            min_overlap = "--min-overlap=%i" % PARAMS["combine_reads_min_overlap"]
        if not PARAMS["combine_reads_threads"]:
            threads = ""
        else:
            threads = "--threads=%i" % PARAMS["combine_reads_threads"]
        if not PARAMS["combine_reads_phred_offset"]:
            phred_offset = ""
        else:
            phred_offset = "--phred-offset=%i" % PARAMS["combine_reads_phred_offset"]
        if not PARAMS["combine_reads_max_mismatch_density"]:
            max_mismatch_density = ""
        else:
            max_mismatch_density = "--max-mismatch-density=%f" % PARAMS["combine_reads_max_mismatch_density"]

        statement = '''flash %(min_overlap)s %(max_overlap)s %(max_mismatch_density)s %(phred_offset)s %(fragment_options)s --output-prefix=%(track)s %(threads)s --compress %(infile)s %(infile2)s >> %(outfile)s.log '''
        P.run()

        if PARAMS["combine_reads_concatenate"]:
            # keep everything: combined plus uncombined reads in one file
            infiles = " ".join([track + x for x in [".notCombined_1.fastq.gz", ".notCombined_2.fastq.gz", ".extendedFrags.fastq.gz"]])
            statement = '''zcat %(infiles)s | gzip > %(outfile)s; rm -rf %(infiles)s'''
        else:
            # keep only the successfully combined fragments
            statement = '''mv %(track)s.extendedFrags.fastq.gz %(outfile)s'''
        P.run()
        return

    if PARAMS["process_sample"] and infile2:
        # paired sampling handles both mates in one pass and therefore
        # excludes any other processing
        E.warn(
            "sampling can not be combined with other processing for paired ended reads")
        statement = '''zcat %(infile)s | python %(scriptsdir)s/fastq2fastq.py --sample=%(sample_proportion)f --pair=%(infile2)s --outfile-pair=%(outfile2)s --log=%(outfile)s_sample.log | gzip > %(outfile)s '''
        P.run()
        return

    # fastx does not like quality scores below 64 (Illumina 1.3 format)
    # need to detect the scores and convert
    format = Fastq.guessFormat(IOTools.openFile(infile), raises=False)
    E.info("%s: format guess: %s" % (infile, format))
    offset = Fastq.getOffset(format, raises=False)

    # accumulate the shell pipeline in `s`
    if PARAMS["process_remove_contaminants"]:
        adaptors = listAdaptors(contaminant_file)
        # %(contamination_trim_type)s
        s = [''' cutadapt %(adaptors)s --overlap=%(contamination_min_overlap_length)i --format=fastq %(contamination_options)s <( zcat < %(infile)s ) 2>> %(outfile)s_contaminants.log ''']
        do_sth = True
    else:
        s = ['zcat %(infile)s']

    if PARAMS["process_artifacts"]:
        s.append('fastx_artifacts_filter -Q %(offset)i -v %(artifacts_options)s 2>> %(outfile)s_artifacts.log')
        do_sth = True
    if PARAMS["process_trim"]:
        s.append('fastx_trimmer -Q %(offset)i -v %(trim_options)s 2>> %(outfile)s_trim.log')
        do_sth = True
    # NICK - may replace fastx trimmer
    if PARAMS["process_trim_quality"]:
        s.append('fastq_quality_trimmer -Q %(offset)i -v %(trim_quality_options)s 2>> %(outfile)s_trim.log')
        do_sth = True
    if PARAMS["process_filter"]:
        s.append('fastq_quality_filter -Q %(offset)i -v %(filter_options)s 2>> %(outfile)s_filter.log')
        do_sth = True
    if PARAMS["process_sample"]:
        # NOTE(review): sampling alone leaves do_sth False, so sampling
        # without another step is skipped below — confirm intended.
        s.append('python %(scriptsdir)s/fastq2fastq.py --sample=%(sample_proportion)f --log=%(outfile)s_sample.log')

    if not do_sth:
        E.warn("no filtering specified for %s - nothing done" % infile)
        return

    s.append("gzip")

    if not infile2:
        statement = " | ".join(s) + " > %(outfile)s"
        P.run()
    else:
        # paired data: run the pipeline on each mate, then reconcile so
        # mates stay matched
        tmpfile = P.getTempFilename(".")
        tmpfile1 = tmpfile + ".fastq.1.gz"
        tmpfile2 = tmpfile + ".fastq.2.gz"

        E.warn("processing first of pair")
        # first read pair
        statement = " | ".join(s) + " > %(tmpfile1)s"
        P.run()

        # second read pair
        E.warn("processing second of pair")
        infile = infile2
        statement = " | ".join(s) + " > %(tmpfile2)s"
        P.run()

        # reconcile
        E.info("starting reconciliation")
        statement = """python %(scriptsdir)s/fastqs2fastqs.py --method=reconcile --output-pattern=%(track)s.fastq.%%s.gz %(tmpfile1)s %(tmpfile2)s > %(outfile)s_reconcile.log"""
        P.run()

        os.unlink(tmpfile1)
        os.unlink(tmpfile2)
        os.unlink(tmpfile)
def processReads(infiles, outfile):
    '''Process fastq reads.

    Either samples paired reads, or assembles a shell pipeline of
    contaminant removal, artifact filtering, trimming and quality
    filtering, applied per mate and reconciled for paired data.

    infiles  -- tuple of (fastq file, contaminant file)
    outfile  -- output fastq name; ``.fastq.1.gz`` for paired data,
                otherwise ``.fastq.gz``

    Bug fix: the ``process_trim_quality`` step interpolated
    ``%(trim_options)s`` into ``fastq_quality_trimmer``; it now uses the
    dedicated ``%(trim_quality_options)s`` setting, consistent with the
    sibling copies of this function.

    NOTE(review): ``statement`` and ``to_cluster`` are never read locally;
    ``P.run()`` presumably harvests them from this frame (pipeline
    convention) — confirm before refactoring.
    '''
    infile, contaminant_file = infiles
    # flipped to True once any processing step is configured
    do_sth = False
    to_cluster = True
    # second-of-pair filename, or a false value for single-ended data
    # (presumably — confirm against checkPairs)
    infile2 = checkPairs(infile)
    if infile2:
        track = P.snip(outfile, ".fastq.1.gz")
        outfile2 = P.snip(outfile, ".fastq.1.gz") + ".fastq.2.gz"
    else:
        track = P.snip(outfile, ".fastq.gz")

    if PARAMS["process_sample"] and infile2:
        # paired sampling handles both mates in one pass and therefore
        # excludes any other processing
        E.warn(
            "sampling can not be combined with other processing for paired ended reads")
        statement = '''zcat %(infile)s | python %(scriptsdir)s/fastq2fastq.py --sample=%(sample_proportion)f --pair=%(infile2)s --outfile-pair=%(outfile2)s --log=%(outfile)s_sample.log | gzip > %(outfile)s '''
        P.run()
        return

    # fastx does not like quality scores below 64 (Illumina 1.3 format)
    # need to detect the scores and convert
    # (renamed from `format` to avoid shadowing the builtin)
    fastq_format = Fastq.guessFormat(IOTools.openFile(infile), raises=False)
    E.info("%s: format guess: %s" % (infile, fastq_format))
    offset = Fastq.getOffset(fastq_format, raises=False)

    # accumulate the shell pipeline in `s`
    if PARAMS["process_remove_contaminants"]:
        adaptors = listAdaptors(contaminant_file)
        # %(contamination_trim_type)s
        s = [''' cutadapt %(adaptors)s --overlap=%(contamination_min_overlap_length)i --format=fastq %(contamination_options)s <( zcat < %(infile)s ) 2>> %(outfile)s_contaminants.log ''']
        do_sth = True
    else:
        s = ['zcat %(infile)s']

    if PARAMS["process_artifacts"]:
        s.append('fastx_artifacts_filter -Q %(offset)i -v %(artifacts_options)s 2>> %(outfile)s_artifacts.log')
        do_sth = True
    if PARAMS["process_trim"]:
        s.append('fastx_trimmer -Q %(offset)i -v %(trim_options)s 2>> %(outfile)s_trim.log')
        do_sth = True
    # NICK - may replace fastx trimmer
    if PARAMS["process_trim_quality"]:
        # bug fix: was %(trim_options)s
        s.append('fastq_quality_trimmer -Q %(offset)i -v %(trim_quality_options)s 2>> %(outfile)s_trim.log')
        do_sth = True
    if PARAMS["process_filter"]:
        s.append('fastq_quality_filter -Q %(offset)i -v %(filter_options)s 2>> %(outfile)s_filter.log')
        do_sth = True
    if PARAMS["process_sample"]:
        # NOTE(review): sampling alone leaves do_sth False, so sampling
        # without another step is skipped below — confirm intended.
        s.append('python %(scriptsdir)s/fastq2fastq.py --sample=%(sample_proportion)f --log=%(outfile)s_sample.log')

    if not do_sth:
        E.warn("no filtering specified for %s - nothing done" % infile)
        return

    s.append("gzip")

    if not infile2:
        statement = " | ".join(s) + " > %(outfile)s"
        P.run()
    else:
        # paired data: run the pipeline on each mate, then reconcile so
        # mates stay matched
        tmpfile = P.getTempFilename(".")
        tmpfile1 = tmpfile + ".fastq.1.gz"
        tmpfile2 = tmpfile + ".fastq.2.gz"

        E.warn("processing first of pair")
        # first read pair
        statement = " | ".join(s) + " > %(tmpfile1)s"
        P.run()

        # second read pair
        E.warn("processing second of pair")
        infile = infile2
        statement = " | ".join(s) + " > %(tmpfile2)s"
        P.run()

        # reconcile
        E.info("starting reconciliation")
        # NOTE(review): sibling copies use %%s (not %%i) in the output
        # pattern passed to fastqs2fastqs.py — confirm which the script
        # expects.
        statement = """python %(scriptsdir)s/fastqs2fastqs.py --method=reconcile --output-pattern=%(track)s.fastq.%%i.gz %(tmpfile1)s %(tmpfile2)s > %(outfile)s_reconcile.log"""
        P.run()

        os.unlink(tmpfile1)
        os.unlink(tmpfile2)
        os.unlink(tmpfile)