def test_task3(infile, outfile): print("%s start to run " % infile) # subprocess.check_call("./five_second.py") run_job("./five_second.py", run_locally=True) print("%s wake up " % infile) with open(outfile, "w") as p: pass
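# Every snippet in this collection calls run_job() from ruffus.drmaa_wrapper and
# relies on module-level state (an initialised DRMAA session, tool paths, config,
# logger) defined elsewhere in the original pipelines. A minimal sketch of that
# shared setup, assuming the ruffus.drmaa_wrapper API and the drmaa-python
# bindings; the session variable name is illustrative, not taken from any one
# pipeline:
import os
import re

import drmaa
from ruffus.drmaa_wrapper import run_job, error_drmaa_job

my_drmaa_session = drmaa.Session()   # one session is shared by every run_job() call
my_drmaa_session.initialize()
# ... pipeline task functions such as the ones below go here ...
# run_job(..., run_locally=True) bypasses the cluster entirely, as in test_task3 above
# my_drmaa_session.exit() should be called once the pipeline has finished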
def runFastqc(FqFileName, fastqcLog, config): """ To run FastQC Arguments: - `FqFileName`: fastq file - `config`: config """ cmds = ['runFastQC.sh'] #cmds.append("-o") cmds.append(fastqc_path) cores = int(config['cores']) if cores == 0: cores = 1 #cmds.append("-t") cmds.append(str(cores)) cmds.append(FqFileName) cmds.append(config["pair_end"]) logfile = fastqcLog run_job(" ".join(cmds), job_name=os.path.basename(FqFileName) + "_fastqc", job_other_options=cluster_options(config, "runFastqc", cores, logfile), job_script_directory=os.path.dirname(os.path.realpath(__file__)), job_environment={'BASH_ENV': '~/.bash_profile'}, retain_job_scripts=True, drmaa_session=my_drmaa_session) return 0
def runDiffrepeat(BamFileNames, ResultFile, config): """ To run diffrepeats Arguments: - `BamFileNames`: bam files - `config`: config """ cmds = ['runDiffrepeat.sh'] cmds.append(diffrepeat_path) cmds.append(alignment_path) cmds.append(config["repbase_db"]) cmds.append(ResultFile) cmds.append(config["diffrepeat_editdist"]) cmds.append(config["diffrepeat_mapq"]) logfile = expandOsPath( os.path.join(log_path, config["project_name"] + ".diffrepeat.log")) cores = int(config['cores']) if cores == 0: cores = 1 run_job(" ".join(cmds), job_name="runDiffRepeat", job_other_options=cluster_options(config, "runDiffrepeat", cores, logfile), job_script_directory=os.path.dirname(os.path.realpath(__file__)), job_environment={'BASH_ENV': '~/.bash_profile'}, retain_job_scripts=True, drmaa_session=my_drmaa_session) return 0
def runFastqc(BamFileName, fastqcLog, config): """ To run FastQC Arguments: - `BamFileName`: bam file - `config`: config """ cmds = ['fastqc'] cmds.append("-o") cmds.append( expandOsPath( os.path.join(config["project_dir"], config["data_dir"], "FastQC"))) cores = int(config['cores']) if cores == 0: cores = 1 cmds.append("-t") cmds.append(str(cores)) cmds.append(BamFileName) logfile = BamFileName + ".fastqc.log" run_job(" ".join(cmds), job_name="fastqc_" + os.path.basename(BamFileName), job_other_options=cluster_options(config, "runFastqc", cores, logfile), job_script_directory=os.path.dirname(os.path.realpath(__file__)), job_environment={'BASH_ENV': '~/.bash_profile'}, retain_job_scripts=True, drmaa_session=my_drmaa_session) return 0
def genTDF(BamFileName, tdfLog, config): """ To generate TDF files for IGV Arguments: - `BamFileName`: bam file - `config`: config """ cmds = ['igvtools'] cmds.append("count") cmds.append(BamFileName) TDFPath = expandOsPath(os.path.join(rmdup_path, "tdf")) baseName = os.path.basename(BamFileName) cmds.append(os.path.join(TDFPath, baseName.replace(".bam", ".tdf"))) cmds.append(config["IGV_genome"]) logfile = BamFileName + ".tdf.log" cores = 1 run_job(" ".join(cmds), job_name="genTDF_" + os.path.basename(BamFileName), job_other_options=cluster_options(config, "genTDF", cores, logfile), job_script_directory=os.path.dirname(os.path.realpath(__file__)), job_environment={'BASH_ENV': '~/.bash_profile'}, retain_job_scripts=True, drmaa_session=my_drmaa_session) return 0
def rmdupBam(BamFileName, rmdupFile, config): """ To remove duplicates Arguments: - `BamFileName`: bam file - `config`: config """ if config["pair_end"] == "no": cmds = ['rmdup.bam.sh'] else: cmds = ['rmdup_PE.bam.sh'] cmds.append(BamFileName) cmds.append(rmdup_path) #if "bam_sort_buff" in config: # cmds.append(config["bam_sort_buff"]) logfile = BamFileName + ".rmdup.log" cores = 1 run_job(" ".join(cmds), job_name="rmdup_" + os.path.basename(BamFileName), job_other_options=cluster_options(config, "rmdupBam", cores, logfile), job_script_directory=os.path.dirname(os.path.realpath(__file__)), job_environment={'BASH_ENV': '~/.bash_profile'}, retain_job_scripts=True, drmaa_session=my_drmaa_session) return 0
def runPhantomPeak(BamFileName, Log, config): """ To check data with phantomPeak Arguments: - `BamFileName`: bam file - `config`: config """ cmds = ['runPhantomPeak.sh'] cmds.append(BamFileName) cmds.append(str(config["cores"])) logfile = BamFileName + ".phantomPeak.log" cores = int(config['cores']) if cores == 0: cores = 1 run_job(" ".join(cmds), job_name="runPhantomPeak_" + os.path.basename(BamFileName), job_other_options=cluster_options(config, "runPhantomPeak", cores, logfile), job_script_directory=os.path.dirname(os.path.realpath(__file__)), job_environment={'BASH_ENV': '~/.bash_profile'}, retain_job_scripts=True, drmaa_session=my_drmaa_session) return 0
def runPhantomPeak(BamFileName, Log, config): """ To check data with phantomPeak Arguments: - `BamFileName`: bam file - `config`: config """ cmds = ['runPhantomPeak.sh'] cmds.append(BamFileName) cmds.append(str(config["cores"])) logfile = BamFileName + ".phantomPeak.log" cores = int(config['cores']) if cores == 0: cores = 1 stdout_res, stderr_res = run_job(" ".join(cmds), job_name = "runPhantomPeak_" + os.path.basename(BamFileName), job_other_options = cluster_options(config, "runPhantomPeak", cores, logfile), job_script_directory = os.path.dirname(os.path.realpath(__file__)), job_environment={ 'BASH_ENV' : '~/.bashrc' }, retain_job_scripts = True, drmaa_session=my_drmaa_session) writeLog(logfile, stdout_res, stderr_res) return 0
def BS_flagstat(INfile, OUTfile, sampath, outdir, my_session, logobject): read_root = re.sub('.bam', '', os.path.basename(INfile)) cmd = os.path.join(sampath, 'samtools') + ' flagstat ' + INfile + ' > ' + OUTfile logobject.info(cmd) with open(os.path.join(outdir, "logs", "%s.flagstat.out" % read_root), 'w') as stdoutF, open( os.path.join(outdir, "logs", "%s.flagstat.err" % read_root), 'w') as stderrF: try: stdout_res, stderr_res = run_job(cmd_str=cmd, job_name='fstat', logger=logobject, drmaa_session=my_session, run_locally=False, working_directory=os.getcwd(), job_other_options='-p bioinfo ') stdoutF.write("".join(stdout_res)) stderrF.write("".join(stderr_res)) # relay all the stdout, stderr, drmaa output to diagnose failures except Exception as err: logobject.error("Flagstat error: %s" % err) raise else: logobject.info('Flagstat calculation complete') return
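# BS_flagstat above and the bisulfite-pipeline tasks below (CpG_filt, map_reads,
# index_bam, ...) all repeat the same scaffold: build a shell command, submit it
# with run_job(), write the captured stdout/stderr to per-sample log files, and
# log success or failure. A hedged refactoring sketch of that shared scaffold;
# run_logged_job is a hypothetical helper name, not part of the original pipelines:
import os
from ruffus.drmaa_wrapper import run_job

def run_logged_job(cmd, job_name, log_prefix, logger, session,
                   job_other_options='-p bioinfo'):
    """Submit cmd through DRMAA and tee captured output to <log_prefix>.out/.err."""
    with open(log_prefix + ".out", 'w') as stdout_f, \
         open(log_prefix + ".err", 'w') as stderr_f:
        try:
            stdout_res, stderr_res = run_job(cmd_str=cmd,
                                             job_name=job_name,
                                             logger=logger,
                                             drmaa_session=session,
                                             run_locally=False,
                                             working_directory=os.getcwd(),
                                             job_other_options=job_other_options)
            # relay all the stdout, stderr, drmaa output to diagnose failures
            stdout_f.write("".join(stdout_res))
            stderr_f.write("".join(stderr_res))
        except Exception as err:
            logger.error("%s error: %s" % (job_name, err))
            raise
        else:
            logger.info('%s complete' % job_name)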
def CpG_filt(input_file, output_file): ii = input_file oo = output_file read_root = re.sub('.CG.call.gz', '', os.path.basename(ii)) gz_cmd = 'gzip -dc ' + ii + ' > ' + re.sub('.gz', '', ii) filt_cmd = os.path.join( Rpath, 'Rscript' ) + ' --no-save --no-restore /data/boehm/group/pipelines/BS_amplicon_seq/v0.1.0/BSampli.mCT.filt.R ' + mextout + ' ' + re.sub( '.gz', '', ii) + ' ' + pozFsub clean_cmd = 'rm -v ' + re.sub('.gz', '', ii) cmd_all = ';'.join([gz_cmd, filt_cmd, clean_cmd]) logger.info(cmd_all) with open(os.path.join(mextout, "logs", "%s.CpG_filt.out" % read_root), 'w') as stdoutF, open( os.path.join(mextout, "logs", "%s.CpG_filt.err" % read_root), 'w') as stderrF: try: stdout_res, stderr_res = run_job(cmd_str=cmd_all, job_name='CpG_filt', logger=logger, drmaa_session=mySession, run_locally=False, working_directory=os.getcwd(), job_other_options='-p bioinfo') stdoutF.write("".join(stdout_res)) stderrF.write("".join(stderr_res)) # relay all the stdout, stderr, drmaa output to diagnose failures except Exception as err: logger.error("CpG filtering error: %s" % err) raise else: logger.info('CpG filtering complete')
def tn5_shift(input_file, output_file, out_dir, logger, logger_mutex): cmd = ("#==================================\n" "# TN5 shift for atac seq \n" "#==================================\n" "source ~/.bashrc \n" "cd $TMPDIR \n" "cp {out_dir}/*tagAlign.gz . \n" "for tag in *tagAlign.gz \n" "do zcat ""$tag"" | awk -F $'\t' 'BEGIN {{OFS = FS}}{{ if ($6 == \"+\") {{$2 = $2 + 4}} else if ($6 == \"-\") {{$3 = $3 - 5}} print $0}}' | \\\n" "gzip -nc > ""${{tag:0:${{#tag}}-12}}.tn5.tagAlign.gz"" \n" "done \n" "mv *tn5* {out_dir} \n") cmd = cmd.format(**locals()) try: stdout_res, stderr_res = "","" stdout_res, stderr_res = run_job(cmd, job_name = "tn5_shift", job_script_directory = "/home/sejjctj/Scratch/test_dir", job_other_options = "-S /bin/bash -V -l h_rt=04:00:00 -w n -l mem=4G -l tmpfs=10G -wd /home/sejjctj/Scratch -j yes ", #job_environment = { 'BASH_ENV' : '/home/sejjctj/.bashrc' } , retain_job_scripts = True, # retain job scripts for debuging, they go in Scratch/test_dir working_directory = "/home/sejjctj/Scratch/test_dir", drmaa_session = drmaa_session, logger = logger ) except error_drmaa_job as err: raise Exception("\n".join(map(str, ["Failed to run:", cmd, err, stdout_res, stderr_res]))) with logger_mutex: logger.debug("tn5_shift")
def trim_BQ(infiles,outfiles): ii1=infiles[0] ii2=infiles[1] oo1=outfiles[0] oo2=outfiles[1] read_root=re.sub('_R1.fastq.gz','',os.path.basename(ii1)) uzcmd1='zcat -v '+ ii1 + ' > ' + os.path.join(cutout,re.sub('.gz','',os.path.basename(ii1))) uzcmd2='zcat -v '+ ii2 + ' > ' + os.path.join(cutout,re.sub('.gz','',os.path.basename(ii2))) bshcmd='perl '+ os.path.join(prinpath,'prinseq-lite.pl') + ' -fastq ' + os.path.join(cutout,re.sub('.gz','',os.path.basename(ii1))) + ' -fastq2 ' + os.path.join(cutout,re.sub('.gz','',os.path.basename(ii2))) + ' -out_good ' + os.path.join(cutout,re.sub('_1.fastq.gz','',os.path.basename(oo1))) +' -trim_qual_right 20 -trim_qual_type min -trim_qual_window 6 -trim_qual_step 3 -min_len 50 -ns_max_p 10 -min_qual_mean 26 -out_bad null' zcmd1='gzip -c '+ os.path.join(cutout,re.sub('.gz','',os.path.basename(oo1))) + ' > ' + oo1 zcmd2='gzip -c '+ os.path.join(cutout,re.sub('.gz','',os.path.basename(oo2))) + ' > ' + oo2 clcmd='rm -v '+ os.path.join(cutout,re.sub('.gz','',os.path.basename(ii1))) + ' ' + os.path.join(cutout,re.sub('.gz','',os.path.basename(ii2))) + ' ' + os.path.join(cutout,re.sub('.gz','',os.path.basename(oo1))) + ' ' + os.path.join(cutout,re.sub('.gz','',os.path.basename(oo2))) cmd_all=';'.join([uzcmd1,uzcmd2,bshcmd,zcmd1,zcmd2,clcmd]) logger.info(cmd_all) with open(os.path.join(cutout,"logs","%s.BQtrim_reads.out" % read_root),'w+') as stdoutF, open(os.path.join(cutout,"logs","%s.BQtrim_reads.err" % read_root),'w+') as stderrF: try: stdout_res, stderr_res = run_job(cmd_str = cmd_all, job_name = 'BQtrim_reads', logger = logger, drmaa_session = mySession, run_locally = False, working_directory = os.getcwd(), job_other_options = '-p bioinfo') stdoutF.write("".join(stdout_res)) stderrF.write("".join(stderr_res)) except error_drmaa_job as err: logger.error("BQtrim_reads error: %s" % err) raise else: logger.info('Base quality trimming complete')
def map_reads(input_files, output_file): ii1 = input_files[0] ii2 = input_files[1] oo = output_file read_root = re.sub('_1.fastq.gz', '', os.path.basename(ii1)) mapcmd = os.path.join(bismpath, 'bismark') + ' -p ' + str( args.nthreads ) + ' --non_directional --dovetail --temp_dir /data/extended --path_to_bowtie /package/bowtie2-2.2.8 --output_dir ' + bamoutO + ' --basename ' + read_root + ' --genome_folder ' + crefGpath + ' -1 ' + ii1 + ' -2 ' + ii2 logger.info(mapcmd) with open(os.path.join(bamoutO, "logs", "%s.readmap.out.log" % read_root), 'w') as stdoutF, open( os.path.join(bamoutO, "logs", "%s.readmap.err.log" % read_root), 'w') as stderrF: try: stdout_res, stderr_res = run_job( cmd_str=mapcmd, job_name='BSmap', logger=logger, drmaa_session=mySession, run_locally=False, working_directory=os.getcwd(), job_other_options='-p bioinfo --mincpus=' + str(args.nthreads)) stdoutF.write("".join(stdout_res)) stderrF.write("".join(stderr_res)) # relay all the stdout, stderr, drmaa output to diagnose failures except Exception as err: logger.error("Map_reads error: %s" % err) raise else: logger.info('Mapping complete') return
def index_bam(input_file, output_file): ii = input_file oo = output_file read_root = re.sub('.sorted.bam', '', os.path.basename(ii)) cmd = os.path.join(sampath, 'samtools') + ' index ' + ii logger.info(cmd) with open(os.path.join(bamoutO, "logs", "%s.bam_index.out" % read_root), 'w+') as stdoutF, open( os.path.join(bamoutO, "logs", "%s.bam_index.err" % read_root), 'w+') as stderrF: try: stdout_res, stderr_res = run_job(cmd_str=cmd, job_name='bam_index', logger=logger, drmaa_session=mySession, run_locally=False, working_directory=os.getcwd(), job_other_options='-p bioinfo ') stdoutF.write("".join(stdout_res)) stderrF.write("".join(stderr_res)) # relay all the stdout, stderr, drmaa output to diagnose failures except error_drmaa_job as err: logger.error("Bam indexing error: %s" % err) raise else: logger.info('Bam indexing complete')
def cut_reads_auto(INfile1, INfile2, OUTfile1, OUTfile2, cutThdR1, cutThdR2, cutpath, my_session, cutout, logobject, args): read_root = re.sub('_R1.fastq.gz', '', os.path.basename(INfile1)) bshcmd = cutpath + ' cutadapt -a AGATCGGAAGAGC -A AGATCGGAAGAGC --minimum-length 30 -n 5 -j' + str( args.nthreads ) + ' -u ' + cutThdR1 + ' -U ' + cutThdR2 + ' -o ' + OUTfile1 + ' -p ' + OUTfile2 + ' ' + INfile1 + ' ' + INfile2 + ';sleep 300' with open(os.path.join(cutout, "logs", "%s.trim_reads.out" % read_root), 'w+') as stdoutF, open( os.path.join(cutout, "logs", "%s.trim_reads.err" % read_root), 'w+') as stderrF: try: stdout_res, stderr_res = run_job( cmd_str=bshcmd, job_name='cut_reads', logger=logobject, drmaa_session=my_session, run_locally=False, working_directory=os.getcwd(), job_other_options='-p bioinfo --nodes=1=1 --mincpus={}'.format( args.nthreads)) stdoutF.write("".join(stdout_res)) stderrF.write("".join(stderr_res)) except error_drmaa_job as err: logobject.error("Cut_reads error: %s" % err) raise return
def BS_conv_rate(ii1sub, oo, metDir, my_session, logobject): read_root = os.path.basename(ii1sub) CR_cmd = '/data/manke/repository/scripts/DNA_methylation/DEEP_scripts/conversionRate_KS.sh ' + ii1sub + ' ' + oo logobject.info(CR_cmd) with open(os.path.join(metDir, "logs", "%s.conv_rate.out.log" % read_root), 'w') as stdoutF, open( os.path.join(metDir, "logs", "%s.conv_rate.err.log" % read_root), 'w') as stderrF: try: stdout_res, stderr_res = run_job(cmd_str=CR_cmd, job_name='conv_rate', logger=logobject, drmaa_session=my_session, run_locally=False, working_directory=os.getcwd(), job_other_options='-p bioinfo') stdoutF.write("".join(stdout_res)) stderrF.write("".join(stderr_res)) # relay all the stdout, stderr, drmaa output to diagnose failures except Exception as err: logobject.error("Conversion rate error: %s" % err) raise else: logobject.info('Conversion rate calculation complete') return
def get_flagstat(input_file, output_file): ii = input_file oo = output_file read_root = re.sub('.RGi.bam', '', os.path.basename(ii)) cmd = os.path.join(sampath, 'samtools') + ' flagstat ' + ii + ' > ' + oo logger.info(cmd) with open(os.path.join(metout, "logs", "%s.flagstat.out" % read_root), 'w') as stdoutF, open( os.path.join(metout, "logs", "%s.flagstat.err" % read_root), 'w') as stderrF: try: stdout_res, stderr_res = run_job(cmd_str=cmd, job_name='fstat', logger=logger, drmaa_session=mySession, run_locally=False, working_directory=os.getcwd(), job_other_options='-p bioinfo ') stdoutF.write("".join(stdout_res)) stderrF.write("".join(stderr_res)) # relay all the stdout, stderr, drmaa output to diagnose failures except Exception as err: logger.error("Flagstat error: %s" % err) raise else: logger.info('Flagstat calculation complete')
def cut_reads(infiles, outfiles): ii1=infiles[0] ii2=infiles[1] oo1=outfiles[0] oo2=outfiles[1] read_root=re.sub('_R1.fastq.gz','',os.path.basename(ii1)) bshcmd=os.path.join(cutpath,'cutadapt') + ' -a AGATCGGAAGAGC -A AGATCGGAAGAGC --minimum-length 30 -n 5 -o ' + oo1 + ' -p ' + oo2 + ' ' + ii1 + ' ' + ii2 logger.info(bshcmd) with open(os.path.join(cutout,"logs","%s.cut_reads.out" % read_root),'w+') as stdoutF, open(os.path.join(cutout,"logs","%s.cut_reads.err" % read_root),'w+') as stderrF: try: stdout_res, stderr_res = run_job(cmd_str = bshcmd, job_name = 'cut_reads', logger = logger, drmaa_session = mySession, run_locally = False, working_directory = os.getcwd(), job_other_options = '-p bioinfo') stdoutF.write("".join(stdout_res)) stderrF.write("".join(stderr_res)) except error_drmaa_job as err: logger.error("Cut_reads error: %s" % err) raise else: logger.info('Adapter trimming complete')
def BS_Mbias(INfile, OUTfile, POMpath, refG, metDir, nthreads, my_session, logobject): read_root = re.sub('.bam', '', os.path.basename(INfile)) Mb_cmd = os.path.join( POMpath, 'MethylDackel' ) + ' mbias --txt ' + refG + ' ' + INfile + ' ' + OUTfile + ' -@ ' + str( nthreads) + ' > ' + OUTfile + '.txt' logobject.info(Mb_cmd) with open(os.path.join(metDir, "logs", "%s.mbias.out" % read_root), 'w') as stdoutF, open( os.path.join(metDir, "logs", "%s.mbias.err" % read_root), 'w') as stderrF: try: stdout_res, stderr_res = run_job( cmd_str=Mb_cmd, job_name='mbias', logger=logobject, drmaa_session=my_session, run_locally=False, working_directory=os.getcwd(), job_other_options='-p bioinfo --mem-per-cpu=10000 --mincpus=' + str(nthreads)) stdoutF.write("".join(stdout_res)) stderrF.write("".join(stderr_res)) # relay all the stdout, stderr, drmaa output to diagnose failures except Exception as err: logobject.error("Methylation bias error: %s" % err) raise else: logobject.info('Methylation bias calculation complete') return
def makeDB(input_files,output_file): ii1 = input_files[0] ii2 = input_files[1] ii3 = input_files[2] oo = output_file read_root=re.sub('_prin_flash.extendedFrags.sed.fastq.gz','',os.path.basename(ii1)) oox=os.path.join(os.path.dirname(oo),(read_root + '.readDB'),read_root + '.flash.db') bshcmd='zcat -v ' + ii1 + ' ' + ii2 + ' ' + ii3 + ' | awk \'BEGIN{P=1}{if(P==1||P==2){gsub(/^[@]/,">");print}; if(P==4)P=0; P++}\' - | ' + os.path.join(blastpath,'makeblastdb ') + ' -in - -parse_seqids -dbtype nucl -out ' + oox + ' -title ' + read_root + '; ln -fs ' + oox + '.nal ' + oo logger.info(bshcmd) with open(os.path.join(DBout,"logs","%s.makeDB.out" % read_root),'w+') as stdoutF, open(os.path.join(DBout,"logs","%s.makeDB.err" % read_root),'w+') as stderrF: try: stdout_res, stderr_res = run_job(cmd_str = bshcmd, job_name = 'makeDB', logger = logger, drmaa_session = mySession, run_locally = False, working_directory = os.getcwd(), job_other_options = '-p bioinfo ') stdoutF.write("".join(stdout_res)) stderrF.write("".join(stderr_res)) # relay all the stdout, stderr, drmaa output to diagnose failures except error_drmaa_job as err: logger.error("MakeDB error: %s" % err) raise else: logger.info('Making database complete')
def mod_Rnames(input_files,output_files): ii1 = input_files[0] ii2 = input_files[1] ii3 = input_files[2] oo1 = output_files[0] oo2 = output_files[1] oo3 = output_files[2] read_root=re.sub('_prin_flash.extendedFrags.fastq.gz','',os.path.basename(ii1)) cmd1='zcat ' + ii1 + ' | sed \'s/\ /_/g\' - | gzip -c > ' + oo1 cmd2='zcat ' + ii2 + ' | sed \'s/\ /_/g\' - | gzip -c > ' + oo2 cmd3='zcat ' + ii3 + ' | sed \'s/\ /_/g\' - | gzip -c > ' + oo3 cmd_all=[cmd1,cmd2,cmd3] bshcmd=' ; '.join(cmd_all) logger.info(bshcmd) with open(os.path.join(cutout,"logs","%s.sed.out" % read_root),'w+') as stdoutF, open(os.path.join(cutout,"logs","%s.sed.err" % read_root),'w+') as stderrF: try: stdout_res, stderr_res = run_job(cmd_str = bshcmd, job_name = 'sed', logger = logger, drmaa_session = mySession, run_locally = False, working_directory = os.getcwd(), job_other_options = '-p bioinfo ') stdoutF.write("".join(stdout_res)) stderrF.write("".join(stderr_res)) # relay all the stdout, stderr, drmaa output to diagnose failures except error_drmaa_job as err: logger.error("Sed error: %s" % err) raise else: logger.info('Renaming reads complete')
def merge_mates(input_files,output_file): ii1 = input_files[0] ii2 = input_files[1] oo = re.sub('.extendedFrags.fastq.gz','',os.path.basename(output_file[0])) read_root=re.sub('_prin_1.fastq.gz','',os.path.basename(ii1)) bshcmd=os.path.join(flashpath,'flash')+ ' -z -M 300 -t 8 -o '+ oo + ' -d ' + cutout + ' ' + ii1 + ' ' + ii2 logger.info(bshcmd) with open(os.path.join(cutout,"logs","%s.flash.out" % read_root),'w+') as stdoutF, open(os.path.join(cutout,"logs","%s.flash.err" % read_root),'w+') as stderrF: try: stdout_res, stderr_res = run_job(cmd_str = bshcmd, job_name = 'flash', logger = logger, drmaa_session = mySession, run_locally = False, working_directory = os.getcwd(), job_other_options = '-p bioinfo --mincpus=8') stdoutF.write("".join(stdout_res)) stderrF.write("".join(stderr_res)) # relay all the stdout, stderr, drmaa output to diagnose failures except error_drmaa_job as err: logger.error("Flash error: %s" % err) raise else: logger.info('Merging mates complete')
def conv_rate(input_files, output_file): ii1 = input_files[0] ii1sub = re.sub('_1.fastq.gz', '', ii1) oo = output_file read_root = os.path.basename(ii1sub) CR_cmd = '/data/boehm/group/pipelines/BS_amplicon_seq/v0.1.0/conversionRate_prin_KS.sh ' + ii1sub + ' ' + oo logger.info(CR_cmd) with open(os.path.join(metout, "logs", "%s.conv_rate.out.log" % read_root), 'w') as stdoutF, open( os.path.join(metout, "logs", "%s.conv_rate.err.log" % read_root), 'w') as stderrF: try: stdout_res, stderr_res = run_job(cmd_str=CR_cmd, job_name='conv_rate', logger=logger, drmaa_session=mySession, run_locally=False, working_directory=os.getcwd(), job_other_options='-p bioinfo') stdoutF.write("".join(stdout_res)) stderrF.write("".join(stderr_res)) # relay all the stdout, stderr, drmaa output to diagnose failures except Exception as err: logger.error("Conversion rate error: %s" % err) raise else: logger.info('Conversion rate calculation complete')
def BS_index_bam(INfile, sampath, bamoutDir, my_session, logobject): cmd_bamInd = os.path.join(sampath, 'samtools') + ' index ' + INfile + ';sleep 300' read_root = re.sub('.bam', '', os.path.basename(INfile)) logobject.info(cmd_bamInd) with open( os.path.join(bamoutDir, "logs", "%s.bamIndex.out.log" % read_root), 'w') as stdoutF, open( os.path.join(bamoutDir, "logs", "%s.bamIndex.err.log" % read_root), 'w') as stderrF: try: stdout_res, stderr_res = run_job(cmd_str=cmd_bamInd, job_name='bamIndex', logger=logobject, drmaa_session=my_session, run_locally=False, working_directory=os.getcwd(), job_other_options='-p bioinfo') stdoutF.write("".join(stdout_res)) stderrF.write("".join(stderr_res)) # relay all the stdout, stderr, drmaa output to diagnose failures except Exception as err: logobject.error("Bam_index error: %s" % err) raise else: logobject.info('Bam indexing complete') return
def depth_of_cov(input_file, output_file): ii = input_file oos = output_file oos2 = oos.replace('.sample_summary', '') read_root = re.sub('.RGi.bam', '', os.path.basename(ii)) #OUTlist2=oos2[2:] cmd_all = 'java -Xmx50g -Djava.io.tmpdir=/data/extended -jar ' + os.path.join( GATKpath, 'GenomeAnalysisTK.jar' ) + ' -R ' + refG + ' -T DepthOfCoverage -o ' + oos2 + ' -I ' + ii + ' -ct 0 -ct 1 -ct 2 -ct 5 -ct 10 -ct 15 -ct 20 -ct 30 -ct 50 -omitBaseOutput -mmq 10 --partitionType sample -L ' + args.intList logger.info(cmd_all) with open( os.path.join(metout, "logs", "%s.depth_cov.out.log" % read_root), 'w') as stdoutF, open( os.path.join(metout, "logs", "%s.depth_cov.err.log" % read_root), 'w') as stderrF: try: stdout_res, stderr_res = run_job( cmd_str=cmd_all, job_name='depth_cov', logger=logger, drmaa_session=mySession, run_locally=False, working_directory=os.getcwd(), job_other_options='-p bioinfo --mem=50000') stdoutF.write("".join(stdout_res)) stderrF.write("".join(stderr_res)) # relay all the stdout, stderr, drmaa output to diagnose failures except Exception as err: logger.error("Depth of coverage error: %s" % err) raise else: logger.info('Depth of coverage calculation complete')
def run_piped_command(cfg, *args): run_locally = True retain_job_scripts = True job_script_dir = os.path.join(cfg.runs_scratch_dir, "drmaa") cpus = 1 mem_per_cpu = 1024 walltime = "24:00:00" stdout, stderr = "", "" job_options = "--ntasks=1 \ --cpus-per-task={cpus} \ --mem-per-cpu={mem} \ --time={time} \ ".format(cpus=cpus, mem=int(1.2 * mem_per_cpu), time=walltime) full_cmd = expand_piped_command(*args) print full_cmd try: stdout, stderr = run_job(full_cmd.strip(), job_other_options=job_options, run_locally=run_locally, retain_job_scripts=retain_job_scripts, job_script_directory=job_script_dir, logger=cfg.logger, working_directory=os.getcwd(), drmaa_session=cfg.drmaa_session) except error_drmaa_job as err: raise Exception("\n".join( map(str, ["Failed to run:", full_cmd, err, stdout, stderr])))
def calc_Mbias(input_file, output_file): ii = input_file oo = output_file oos = re.sub('.txt', '', oo) read_root = re.sub('.bam', '', os.path.basename(ii)) Mb_cmd = os.path.join( POMpath, 'MethylDackel') + ' mbias --txt --keepDupes -@ ' + str( args.nthreads ) + ' ' + refG + ' ' + ii + ' ' + oos + ' > ' + oo #+ '.txt' logger.info(Mb_cmd) with open(os.path.join(metout, "logs", "%s.mbias.out" % read_root), 'w') as stdoutF, open( os.path.join(metout, "logs", "%s.mbias.err" % read_root), 'w') as stderrF: try: stdout_res, stderr_res = run_job( cmd_str=Mb_cmd, job_name='mbias', logger=logger, drmaa_session=mySession, run_locally=False, working_directory=os.getcwd(), job_other_options='-p bioinfo --mem-per-cpu=10000 --mincpus=' + str(args.nthreads)) stdoutF.write("".join(stdout_res)) stderrF.write("".join(stderr_res)) # relay all the stdout, stderr, drmaa output to diagnose failures except Exception as err: logger.error("Methylation bias error: %s" % err) raise else: logger.info('Methylation bias calculation complete')
def stringtie(input_file, output_file,abundance_file,qc_path,gtf,logger, logger_mutex): bam=os.path.basename(input_file) cmd = ( "source ~/.bashrc \n" "cd $TMPDIR \n" "mkdir reference \n" "cp {input_file} . \n" "cp {gtf} ./reference/gencode.gtf \n" "stringtie -p 8 -G ./reference/gencode.gtf -A {abundance_file} -o {output_file} -B -e -v {bam} \\\n" "2>{qc_path}/stringtie.log \n" ) cmd = cmd.format(**locals()) #print cmd try: stdout_res, stderr_res = "","" stdout_res, stderr_res = run_job(cmd, job_name = "stringtie", job_script_directory = "/home/sejjctj/Scratch/test_dir", job_other_options = "-w n -S /bin/bash -l h_rt=04:00:00 -w n -l mem=4G -l tmpfs=60G -pe smp 8 -wd /home/sejjctj/Scratch -j yes ", #job_environment = { 'BASH_ENV' : '/home/sejjctj/.bashrc' } , retain_job_scripts = True, working_directory = "/home/sejjctj/Scratch", drmaa_session = drmaa_session, logger = logger ) except error_drmaa_job as err: raise Exception("\n".join(map(str, ["Failed to run:", cmd, err, stdout_res, stderr_res]))) with logger_mutex: logger.debug("stringtie worked")
def sort_bam(input_file, output_file): ii = input_file oo = output_file read_root = re.sub('_pe.bam', '', os.path.basename(ii)) cmd = os.path.join(sampath, 'samtools') + ' sort -T ' + os.path.join( '/data/extended', read_root) + ' -m 6G -@ ' + str( args.nthreads) + ' -o ' + oo + ' ' + ii logger.info(cmd) with open(os.path.join(bamoutO, "logs", "%s.bamsort.out" % read_root), 'w+') as stdoutF, open( os.path.join(bamoutO, "logs", "%s.bamsort.err" % read_root), 'w+') as stderrF: try: stdout_res, stderr_res = run_job( cmd_str=cmd, job_name='bamsort', logger=logger, drmaa_session=mySession, run_locally=False, working_directory=os.getcwd(), job_other_options='-p bioinfo --mincpus=' + str(args.nthreads)) stdoutF.write("".join(stdout_res)) stderrF.write("".join(stderr_res)) # relay all the stdout, stderr, drmaa output to diagnose failures except error_drmaa_job as err: logger.error("Bam sorting error: %s" % err) raise else: logger.info('Bam sorting complete')
def bam_to_tagAlign(input_file, output_file, out_dir, logger, logger_mutex): print "\n\ninput_file: " + str(input_file) + "\n\n" print "\n\ninput_file: " + str(output_file) + "\n\n" FINAL_BAM_FILE=os.path.basename(input_file) FINAL_BAM_PREFIX=FINAL_BAM_FILE[:-4] BAM_LOC=os.path.dirname(input_file) OFPREFIX=FINAL_BAM_FILE[:-4] FINAL_NMSRT_BAM=OFPREFIX + ".final_filt_nmsrt.bam" FINAL_NMSRT_BAM_PREFIX = FINAL_NMSRT_BAM[:-4] FINAL_BEDPE_FILE=FINAL_NMSRT_BAM_PREFIX + ".bedpe.gz" FINAL_TA_FILE=FINAL_BAM_PREFIX +".PE2SE.tagAlign.gz" NREADS=25000000 SUBSAMPLED_TA_FILE=OFPREFIX + ".filt.nodup.sample" + str(25) + ".MATE1.tagAlign.gz" cmd = ("# =================== \n" "# Create tagAlign file \n" "# =================== \n" "source ~/.bashrc \n" "cd $TMPDIR \n" "cp {input_file} . \n" "cp {BAM_LOC}/{FINAL_NMSRT_BAM} . \n" "# Create virtual SE file containing both read pairs \n" "bedtools bamtobed -i {FINAL_BAM_FILE} \\\n" " | awk 'BEGIN{{OFS=\"\\t\"}}{{$4=\"N\";$5=\"1000\";print $0}}' | gzip -nc > {FINAL_TA_FILE} \n" "# ================ \n" "# Create BEDPE file \n" "# ================ \n" "bedtools bamtobed -bedpe -mate1 -i {FINAL_NMSRT_BAM} | gzip -nc > {FINAL_BEDPE_FILE} \n" "# ================================= \n" "# Subsample tagAlign file \n" "# Restrict to one read end per pair for CC analysis \n" "# ================================ \n" "zcat {FINAL_BEDPE_FILE} | grep -v \"chrM\" | shuf -n {NREADS} --random-source={FINAL_BEDPE_FILE} \\\n" " | awk 'BEGIN{{OFS=\"\\t\"}}{{print $1,$2,$3,\"N\",\"1000\",$9}}' | gzip -nc > {SUBSAMPLED_TA_FILE} \n" "mv {FINAL_TA_FILE} {out_dir} \n" "mv {SUBSAMPLED_TA_FILE} {out_dir} \n" "mv {FINAL_BEDPE_FILE} {out_dir} ") cmd = cmd.format(**locals()) try: stdout_res, stderr_res = "","" stdout_res, stderr_res = run_job(cmd, job_name = "bam2tag", job_script_directory = "/home/sejjctj/Scratch/test_dir", job_other_options = "-S /bin/bash -V -l h_rt=10:00:00 -w n -l mem=24G -l tmpfs=30G -wd /home/sejjctj/Scratch -j yes ", #job_environment = { 'BASH_ENV' : '/home/sejjctj/.bashrc' } , retain_job_scripts = True, # retain job scripts for debuging, they go in Scratch/test_dir working_directory = "/home/sejjctj/Scratch/test_dir", drmaa_session = drmaa_session, logger = logger ) except error_drmaa_job as err: raise Exception("\n".join(map(str, ["Failed to run:", cmd, err, stdout_res, stderr_res]))) with logger_mutex: logger.debug("bam_to_tagAlign worked")
def cut_reads_user(INfile1, INfile2, OUTfile1, OUTfile2, cutpath, my_session, cutout, logobject, args): read_root = os.path.basename(INfile1)[:-12] adapterSeq = "AGATCGGAAGAGC" if args.nextera: adapterSeq = "CTGTCTCTTATA" bshcmd = "{} cutadapt -a {} -A {} -q {} -m 30 -j {} {} -o {} -p {} {} {} ; sleep 300".format( cutpath, adapterSeq, adapterSeq, args.trimThreshold, args.nthreads, args.trimOtherArgs, OUTfile1, OUTfile2, INfile1, INfile2) with open(os.path.join(cutout, "logs", "%s.trim_reads.out" % read_root), 'w+') as stdoutF, open( os.path.join(cutout, "logs", "%s.trim_reads.err" % read_root), 'w+') as stderrF: try: stdout_res, stderr_res = run_job( cmd_str=bshcmd, job_name='cut_reads', logger=logobject, drmaa_session=my_session, run_locally=False, working_directory=os.getcwd(), job_other_options='-p bioinfo --nodes=1=1 --mincpus={}'.format( args.nthreads)) stdoutF.write("".join(stdout_res)) stderrF.write("".join(stderr_res)) except error_drmaa_job as err: logobject.error("Cut_reads error: %s" % err) raise return
def intAgg_stats(input_files, output_files): ii = os.path.join(CpGstat_out, input_files[1]) oo = output_files Rcmd = os.path.join( Rpath, 'Rscript' ) + ' --no-save --no-restore /data/boehm/group/pipelines/BS_amplicon_seq/v0.1.0/BSampli.interval_stats.limma.R ' + intStat_out + ' ' + args.intList + ' ' + ii + ' ' + args.sampleInfo logger.info(Rcmd) with open(os.path.join(intStat_out, "logs", "interval_stats.out"), 'w') as stdoutF, open( os.path.join(intStat_out, "logs", "interval_stats.err"), 'w') as stderrF: try: stdout_res, stderr_res = run_job( cmd_str=Rcmd, job_name='agg_stats', logger=logger, drmaa_session=mySession, run_locally=False, working_directory=os.getcwd(), job_other_options='-p bioinfo') stdoutF.write("".join(stdout_res)) stderrF.write("".join(stderr_res)) # relay all the stdout, stderr, drmaa output to diagnose failures except Exception as err: logger.error("Interval stats error: %s" % err) raise else: logger.info('Interval stats calculation complete')
def postTrim_fqc(input_files, output_files): ii1 = input_files[0] ii2 = input_files[1] read_root = re.sub('_prin_1.fastq.gz', '', os.path.basename(ii1)) bshcmd = os.path.join( FQCpath, 'fastqc ') + ' --outdir ' + fqcout + ' -t 8 ' + ii1 + ' ' + ii2 logger.info(bshcmd) with open(os.path.join(fqcout, "logs", "%s.post_fqc.out" % read_root), 'w+') as stdoutF, open( os.path.join(fqcout, "logs", "%s.post_fqc.err" % read_root), 'w+') as stderrF: try: stdout_res, stderr_res = run_job( cmd_str=bshcmd, job_name='post_fqc', logger=logger, drmaa_session=mySession, run_locally=False, working_directory=os.getcwd(), job_other_options='-p bioinfo --mincpus=8') stdoutF.write("".join(stdout_res)) stderrF.write("".join(stderr_res)) # relay all the stdout, stderr, drmaa output to diagnose failures except error_drmaa_job as err: logger.error("Post_trim_fastqc error: %s" % err) raise else: logger.info('Post trim fastqc complete')
def post_trim_fqc(INfile1, INfile2, fqcout, FQCpath, my_session, logobject): read_root = re.sub('_R1.fastq.gz', '', os.path.basename(INfile1)) bshcmd = os.path.join( FQCpath, 'fastqc ') + ' --outdir ' + fqcout + ' -t 8 ' + INfile1 + ' ' + INfile2 with open(os.path.join(fqcout, "logs", "%s.post_fqc.out" % read_root), 'w+') as stdoutF, open( os.path.join(fqcout, "logs", "%s.post_fqc.err" % read_root), 'w+') as stderrF: try: stdout_res, stderr_res = run_job( cmd_str=bshcmd, job_name='post_fqc', logger=logobject, drmaa_session=my_session, run_locally=False, working_directory=os.getcwd(), job_other_options='-p bioinfo --nodes=1=1 --mincpus=8') stdoutF.write("".join(stdout_res)) stderrF.write("".join(stderr_res)) # relay all the stdout, stderr, drmaa output to diagnose failures except error_drmaa_job as err: logobject.error("Post_trim_fastqc error: %s" % err) raise return
def run_exonerate(input_file, output_file, genome_filename, query_filename): twobit_filename = FASTA_RE_COMPILED.sub('.2bit', genome_filename) job_name = input_file.replace('.genblastA.gff3', '.sge') job = 'run_est_mapping.py --query_type {}'.format(args.query_type) job += ' --upstream {} --downstream {} --mapper exonerate --save_mapper_output --augustus_hints'.format( args.exonerate_upstream, args.exonerate_downstream) if args.extra_exonerate_args: job += ' --extra_mapper_args "{}"'.format(args.extra_exonerate_args) job += ' {} {} {} {}'.format(query_filename, input_file, twobit_filename, output_file) job_queue = 'all.q' job_env = dict(PATH=PATH_val, PYTHONPATH=PYTHONPATH_val) if not args.run_local: job_env['MODULESHOME'] = args.modules_home run_job(job, job_name=job_name, job_other_options='-q {}'.format(job_queue), job_environment=job_env, drmaa_session=drmaa_session, working_directory=args.working_directory, run_locally=args.run_local, logger=logger)
def bowtie2(input_files, out_file, path, outpath,qc_folder,logger, logger_mutex): flat_list = [item for sublist in input_files for item in sublist] first_reads = [] second_reads =[] for i in flat_list: if re.search('val_1', os.path.basename(i)): first_reads.append(os.path.basename(i)) elif re.search('val_2', os.path.basename(i)): second_reads.append(os.path.basename(i)) first_reads = ','.join(first_reads) second_reads = ','.join(second_reads) bowtie2_output = out_file.split('/') bowtie2_output = bowtie2_output[-1] cmd = ( " cd $TMPDIR \n" " mkdir reference \n" " mkdir temporary \n" " cp {path}" + "/*fq.gz" + " . \n " " ls -l > {qc_folder}/log \n" " date \n" " cp $HOME/Scratch/reference/grch38/bowtie2/*bt2 ./reference \n" " bowtie2 -k 4 -X2000 --mm --local --threads 8 \\\n" " -x ./reference/GCA_000001405.15_GRCh38_no_alt_plus_hs38d1_analysis_set.fna.bowtie_index \\\n" " -1 {first_reads} \\\n" " -2 {second_reads} \\\n" " 2> {qc_folder}/bowtie2.log \\\n" " | samtools view -bS - -o temp.bam 2>{qc_folder}/samtools.log \n" " ls -lh >> {qc_folder}/list.log \n" " ~/applications/sambamba/sambamba_v0.6.6 sort -p -m 4G -t 8 --tmpdir=./temporary temp.bam -o " + bowtie2_output + " \n" " ls -lh >> {qc_folder}/list.log \n" " cp " + bowtie2_output + " {outpath} \n" " rm -r * ") cmd = cmd.format(**locals()) #print cmd try: stdout_res, stderr_res = "","" stdout_res, stderr_res = run_job(cmd, job_name = "bowtie2", job_script_directory = "/home/sejjctj/Scratch/test_dir", job_other_options = "-w n -S /bin/bash -V -l h_rt=12:00:00 -w n -l mem=4G -l tmpfs=80G -pe smp 8 -wd /home/sejjctj/Scratch -j yes ", #job_environment = { 'BASH_ENV' : '/home/sejjctj/.bashrc' } , retain_job_scripts = True, # retain job scripts for debuging, they go in Scratch/test_dir working_directory = "/home/sejjctj/Scratch", drmaa_session = drmaa_session, logger = logger ) except error_drmaa_job as err: raise Exception("\n".join(map(str, ["Failed to run:", cmd, err, stdout_res, stderr_res]))) with logger_mutex: logger.debug("bowtie2 worked")
def run_stage(state, stage, command): '''Run a pipeline stage, either locally or on the cluster''' # Grab the configuration options for this stage config = state.config modules = config.get_stage_option(stage, 'modules') mem = config.get_stage_option(stage, 'mem') * MEGABYTES_IN_GIGABYTE account = config.get_stage_option(stage, 'account') queue = config.get_stage_option(stage, 'queue') walltime = config.get_stage_option(stage, 'walltime') run_local = config.get_stage_option(stage, 'local') cores = config.get_stage_option(stage, 'cores') pipeline_id = config.get_option('pipeline_id') job_name = pipeline_id + '_' + stage # Generate a "module load" command for each required module if modules is not None: module_loads = '\n'.join(['module load ' + module for module in modules]) else: module_loads = '\n' cluster_command = '\n'.join([module_loads, command]) # Specify job-specific options for SLURM job_options = '--nodes=1 --ntasks-per-node={cores} --ntasks={cores} --time={time} --mem={mem} --partition={queue} --account={account}' \ .format(cores=cores, time=walltime, mem=mem, queue=queue, account=account) # Log a message about the job we are about to run log_messages = ['Running stage: {}'.format(stage), 'Command: {}'.format(command)] if not run_local: log_messages.append('Job options: {}'.format(job_options)) state.logger.info('\n'.join(log_messages)) # Run the job, capturing stdout and stderr stdout_res, stderr_res = None, None try: stdout_res, stderr_res = \ run_job(cmd_str=cluster_command, job_name = job_name, logger = state.logger.proxy, drmaa_session = state.drmaa_session, # Determines whether to run the command on the local # machine or run it on the cluster run_locally = run_local, # Keep a copy of the job script for diagnostic purposes retain_job_scripts = True, retain_stdout = True, retain_stderr = True, job_script_directory = state.options.jobscripts, job_other_options = job_options) except error_drmaa_job as err: raise Exception("\n".join(map(str, ["Failed to run:", command, err, stdout_res, stderr_res])))
def alignFastqByBowtie(FqFileName, OutputBamFileName, config): """ To align '.fastq' to genome. Arguments: - `FqFileName`: file to be processed """ if "aligner" in config: if config["aligner"] == "bowtie": cmds = ['fastq2bam_by_bowtie.sh'] cmds.append(FqFileName) cmds.append(expandOsPath(config['bowtie_index'])) elif config["aligner"] == "bowtie2": cmds = ['fastq2bam_by_bowtie2.sh'] cmds.append(FqFileName) cmds.append(config['bowtie_index']) else: raise KeyError else: cmds = ['fastq2bam_by_bowtie.sh'] cmds.append(FqFileName) cmds.append(expandOsPath(config['bowtie_index'])) target = expandOsPath(os.path.join(config["project_dir"], config["data_dir"])) cmds.append(target) cmds.append(config["pair_end"]) cores = int(config['cores']) if cores == 0: cores = 1 cmds.append(str(cores)) logfile = FqFileName + ".alignment.log" run_job(" ".join(cmds), job_name = "alignFastqByBowtie_" + os.path.basename(FqFileName), job_other_options = cluster_options(config, "alignFastqByBowtie", cores, logfile), job_script_directory = os.path.dirname(os.path.realpath(__file__)), job_environment={ 'BASH_ENV' : '~/.bash_profile' }, retain_job_scripts = True, drmaa_session=my_drmaa_session) return 0
def hisat2(input_files, out_file, path, outpath,qc_folder,hisat_genome_index,logger, logger_mutex): flat_list = [item for sublist in input_files for item in sublist] first_reads = [] second_reads =[] for i in flat_list: if re.search('val_1', os.path.basename(i)): first_reads.append(os.path.basename(i)) elif re.search('val_2', os.path.basename(i)): second_reads.append(os.path.basename(i)) first_reads = ','.join(first_reads) second_reads = ','.join(second_reads) hisat_output = out_file.split('/') hisat_output = hisat_output[-1] cmd = ( "source ~/.bashrc \n" "cd $TMPDIR \n" "mkdir reference \n" "cp {path}/*fq.gz . \n" "cp {hisat_genome_index}/genome* ./reference \n" "hisat2 -p 8 -x ./reference/genome_snp_tran --dta-cufflinks \\\n" "--novel-splicesite-outfile ./novel_splice.txt \\\n" "--novel-splicesite-infile ./novel_splice.txt \\\n" "-1 {first_reads} \\\n" "-2 {second_reads} \\\n" "2> {qc_folder}/hisat.log | samtools view -bS - -o temp.bam \n" "samtools sort -@ 8 temp.bam -m 4G " + hisat_output[:-4] + " 2>{qc_folder}/samtools.log \n" "mv {hisat_output} {outpath} \n" "mv novel_splice.txt {outpath} \n") cmd = cmd.format(**locals()) try: stdout_res, stderr_res = "","" stdout_res, stderr_res = run_job(cmd, job_name = "hisat", job_script_directory = "/home/sejjctj/Scratch/test_dir", job_other_options = "-w n -S /bin/bash -l h_rt=08:00:00 -w n -l mem=4G -l tmpfs=60G -pe smp 8 -wd /home/sejjctj/Scratch -j yes ", #job_environment = { 'BASH_ENV' : '/home/sejjctj/.bashrc' } , retain_job_scripts = True, # retain job scripts for debuging, they go in Scratch/test_dir working_directory = "/home/sejjctj/Scratch", drmaa_session = drmaa_session, logger = logger ) except error_drmaa_job as err: raise Exception("\n".join(map(str, ["Failed to run:", cmd, err, stdout_res, stderr_res]))) with logger_mutex: logger.debug("hisat worked")
def trim_fastq(input_files, output_files, qc_folder, output_folder ,logger, logger_mutex): print "OUTPUT FILES! " + str(output_files) if len(input_files) !=2: raise Exception("One of the reads pairs %s missing" % (input_files,)) cmd = ( " source ~/.bashrc \n" " date \n" " echo $HOSTNAME \n" " cd $TMPDIR \n" " cp {input_files[0]} . \n" " cp {input_files[1]} . \n" " basename1=$(basename {input_files[0]}) \n" " basename2=$(basename {input_files[1]}) \n" " date \n" " ls -l \n" #" trim_galore --fastqc --paired {basenames[0]} {basenames[1]} &> {qc_folder}/trim_galore.log \n" " trim_galore --fastqc --paired $basename1 $basename2 &> {qc_folder}/trim_galore.log \n" " mv *.fq.gz {output_folder} \n" " mv *fastqc* {qc_folder} \n" " mv *report* {qc_folder}; rm * \n" ) job_name = "trim_fastqc" ## formats the cmd input to get the variables in the {} cmd = cmd.format(**locals()) #print(cmd) try: stdout_res, stderr_res = "","" stdout_res, stderr_res = run_job(cmd, job_name, job_script_directory = "/home/sejjctj/Scratch/test_dir", job_other_options = "-w n -S /bin/bash -l h_rt=05:00:00 -l mem=4G -l tmpfs=60G -wd /home/sejjctj/Scratch -j yes", #job_environment = { 'BASH_ENV' : '/home/sejjctj/.bashrc' } , retain_job_scripts = True, working_directory = "/home/sejjctj/Scratch", drmaa_session = drmaa_session, logger = logger ) # except error_drmaa_job as err: raise Exception("\n".join(map(str, ["Failed to run:", cmd, err, stdout_res, stderr_res]))) with logger_mutex: logger.debug("trim_fastq worked")
def create_pseudoreplicates(input_file, output_file, out_dir, logger, logger_mutex): FINAL_BEDPE_FILE=os.path.basename(input_file) PR_PREFIX=FINAL_BEDPE_FILE[:-26] PR1_TA_FILE=PR_PREFIX + ".PE2SE.pr1.tagAlign.gz" PR2_TA_FILE=PR_PREFIX + ".PE2SE.pr2.tagAlign.gz" cmd = ("# ========================\n" "# Create pseudoReplicates\n" "# =======================\n" "source ~/.bashrc \n" "cd $TMPDIR \n" "cp {input_file} . \n" "# Get total number of read pairs \n" "nlines=$( zcat {FINAL_BEDPE_FILE} | wc -l ) \n" "nlines=$(( (nlines + 1) / 2 )) \n" "# Shuffle and split BEDPE file into 2 equal parts \n" "zcat {FINAL_BEDPE_FILE} | shuf --random-source={FINAL_BEDPE_FILE} | split -d -l $nlines - {PR_PREFIX} \n" "# Will produce {PR_PREFIX}00 and {PR_PREFIX}01 \n" "# Convert read pairs to reads into standard tagAlign file \n" "awk 'BEGIN{{OFS=\"\\t\"}}{{printf \"%s\\t%s\\t%s\\tN\\t1000\\t%s\\n%s\\t%s\\t%s\\tN\\t1000\\t%s\\n\",$1,$2,$3,$9,$4,$5,$6,$10}}' {PR_PREFIX}00 | gzip -nc > {PR1_TA_FILE} \n" "rm {PR_PREFIX}00 \n" "awk 'BEGIN{{OFS=\"\\t\"}}{{printf \"%s\\t%s\\t%s\\tN\\t1000\\t%s\\n%s\\t%s\\t%s\\tN\\t1000\\t%s\\n\",$1,$2,$3,$9,$4,$5,$6,$10}}' {PR_PREFIX}01 | gzip -nc > {PR2_TA_FILE} \n" "rm {PR_PREFIX}01 \n" "mv {PR1_TA_FILE} {out_dir} \n" "mv {PR2_TA_FILE} {out_dir} " ) cmd = cmd.format(**locals()) try: stdout_res, stderr_res = "","" stdout_res, stderr_res = run_job(cmd, job_name = "create_pseudo", job_script_directory = "/home/sejjctj/Scratch/test_dir", job_other_options = "-S /bin/bash -V -l h_rt=01:00:00 -w n -l mem=8G -l tmpfs=30G -wd /home/sejjctj/Scratch -j yes ", #job_environment = { 'BASH_ENV' : '/home/sejjctj/.bashrc' } , retain_job_scripts = True, # retain job scripts for debuging, they go in Scratch/test_dir working_directory = "/home/sejjctj/Scratch/test_dir", drmaa_session = drmaa_session, logger = logger ) except error_drmaa_job as err: raise Exception("\n".join(map(str, ["Failed to run:", cmd, err, stdout_res, stderr_res]))) with logger_mutex: logger.debug("create_pseudoreplicates")
def phantom_peak_quals(input_file, output_file, out_dir, outfile1,outfile2,logger, logger_mutex): SUBSAMPLED_TA_FILE=os.path.basename(input_file) SUBSAMPLED_TA_FILE=SUBSAMPLED_TA_FILE[:-25] + "filt.nodup.sample25.MATE1.tagAlign.gz" cmd = ("#########################\n" "# run phantompeakquals #\n" "#########################\n" "source ~/.bashrc \n" "cd $TMPDIR \n" "mkdir job_temp \n" "mv {out_dir}/{SUBSAMPLED_TA_FILE} . \n" "Rscript ~/applications/phantompeakqualtools/run_spp.R " " -c={SUBSAMPLED_TA_FILE} -filtchr=chrM " " -savp={outfile1} -out={outfile2} " " -tmpdir=./job_temp \n" "echo -e \"Filename\\tnumReads\\testFragLen\\tcorr_estFragLen\\tPhantomPeak\\tcorr_phantomPeak\\targmin_corr\\tmin_corr\\tphantomPeakCoef\\trelPhantomPeakCoef\\tQualityTag\" > header \n" "sed -r 's/,[^\\t]+//g' {outfile2} > temp \n" "cat header temp > temporary && mv temporary temp \n" "mv temp {outfile2} \n" "mv {outfile2} {out_dir}\n" "mv {outfile1} {out_dir}") cmd = cmd.format(**locals()) try: stdout_res, stderr_res = "","" stdout_res, stderr_res = run_job(cmd, job_name = "phantom", job_script_directory = "/home/sejjctj/Scratch/test_dir", job_other_options = "-S /bin/bash -V -l h_rt=10:00:00 -w n -l mem=24G -l tmpfs=30G -wd /home/sejjctj/Scratch -j yes ", #job_environment = { 'BASH_ENV' : '/home/sejjctj/.bashrc' } , retain_job_scripts = True, # retain job scripts for debuging, they go in Scratch/test_dir working_directory = "/home/sejjctj/Scratch/test_dir", drmaa_session = drmaa_session, logger = logger ) except error_drmaa_job as err: raise Exception("\n".join(map(str, ["Failed to run:", cmd, err, stdout_res, stderr_res]))) with logger_mutex: logger.debug("create_pseudoreplicates")
def bowtie2(input_files, out_file, path, outpath,qc_folder,logger, logger_mutex): print out_file reads = [] for i in input_files: reads.append(os.path.basename(i)) reads = ','.join(reads) print reads bowtie2_output = out_file.split('/') bowtie2_output = bowtie2_output[-1] cmd = ( "cd $TMPDIR \n" "mkdir reference \n" "cp {path}" + "/*fq.gz" + " . \n " "ls -l \n" "date \n" "cp $HOME/Scratch/reference/grch38/bowtie2/*bt2 ./reference \n " " bowtie2 -k 4 -X2000 --mm --local --threads 8 " " -x ./reference/GCA_000001405.15_GRCh38_no_alt_plus_hs38d1_analysis_set.fna.bowtie_index " " -U {reads} 2> {qc_folder}/bowtie2.log | samtools view -bS - -o temp.bam \n" " samtools sort -@ 8 temp.bam -m 2G " + bowtie2_output[:-4] + " 2>{qc_folder}/samtools.log \n" " samtools flagstat {bowtie2_output} > {qc_folder}/{bowtie2_output}.mapstats \n" " cp {bowtie2_output} {outpath} \n" " rm -r * \n") cmd = cmd.format(**locals()) #print cmd try: stdout_res, stderr_res = "","" stdout_res, stderr_res = run_job(cmd, job_name = "bowtie2", job_script_directory = "/home/sejjctj/Scratch/test_dir", job_other_options = "-w n -S /bin/bash -V -l h_rt=08:00:00 -w n -l mem=2G -l tmpfs=60G -pe smp 8 -wd /home/sejjctj/Scratch -j yes ", job_environment = { 'BASH_ENV' : '/home/sejjctj/.bashrc' } , retain_job_scripts = True, # retain job scripts for debuging, they go in Scratch/test_dir working_directory = "/home/sejjctj/Scratch", drmaa_session = drmaa_session, logger = logger ) except error_drmaa_job as err: raise Exception("\n".join(map(str, ["Failed to run:", cmd, err, stdout_res, stderr_res]))) with logger_mutex: logger.debug("bowtie2 worked")
def bam_to_tagAlign(input_file, output_file, out_dir,prefix, logger, logger_mutex): FINAL_BAM_FILE=os.path.basename(input_file) FINAL_BAM_PREFIX=prefix cmd = ("# =================== \n" "# Create tagAlign file \n" "# =================== \n" "cd $TMPDIR \n" "cp {input_file} . \n" "FINAL_TA_FILE={FINAL_BAM_PREFIX}.tagAlign.gz \n" "bedtools bamtobed -i {FINAL_BAM_FILE} | awk 'BEGIN{{OFS=\"\\t\"}}{{$4=\"N\";$5=\"1000\";print $0}}' | gzip -nc > \"$FINAL_TA_FILE\" \n" "# ================================= \n" "# Subsample tagAlign file \n" "# ================================ \n" "NREADS=15000000 \n" "SUBSAMPLED_TA_FILE={FINAL_BAM_PREFIX}.sample.tagAlign.gz\n" "zcat \"$FINAL_TA_FILE\" | grep -v chrM | shuf -n \"$NREADS\" --random-source=\"$FINAL_TA_FILE\" | gzip -nc > \"$SUBSAMPLED_TA_FILE\" \n" "mv \"$SUBSAMPLED_TA_FILE\" {out_dir} \n" "mv \"$FINAL_TA_FILE\" {out_dir}") cmd = cmd.format(**locals()) try: stdout_res, stderr_res = "","" stdout_res, stderr_res = run_job(cmd, job_name = "bam2tag", job_script_directory = "/home/sejjctj/Scratch/test_dir", job_other_options = "-S /bin/bash -V -l h_rt=10:00:00 -w n -l mem=24G -l tmpfs=30G -wd /home/sejjctj/Scratch -j yes ", job_environment = { 'BASH_ENV' : '/home/sejjctj/.bashrc' } , retain_job_scripts = True, # retain job scripts for debuging, they go in Scratch/test_dir working_directory = "/home/sejjctj/Scratch/test_dir", drmaa_session = drmaa_session, logger = logger ) except error_drmaa_job as err: raise Exception("\n".join(map(str, ["Failed to run:", cmd, err, stdout_res, stderr_res]))) with logger_mutex: logger.debug("bam_to_tagAlign worked")
def blacklist(input_file, output_file,out_dir, logger, logger_mutex): cmd = ("#===================================\n" "# filter macs2 narrowPeak files against the blacklist \n" "#===================================\n" "source ~/.bashrc \n" "cd $TMPDIR \n" "cp {out_dir}/*narrowPeak.gz . \n" "blacklist=\"/home/sejjctj/Scratch/reference/grch38/chipseq_blacklist/hg38.blacklist.bed.gz\" \n" "for peak in *narrowPeak.gz \n" "do \n" "prefix=\"${{peak:0:${{#peak}}-14}}\" #remove .narrowPeak.gz \n" "filtered_peak=\"${{prefix}}\".narrowPeak.filt.gz \n" "bedtools intersect -v -a ${{peak}} -b ${{blacklist}} \\\n" "| awk 'BEGIN{{OFS=\"\\t\"}}{{if($5>1000) $5=1000; print $0}}' \\\n" "| grep -P 'chr[\dXY]+[\\t]' | gzip -nc > ${{filtered_peak}} \n" "mv ${{filtered_peak}} {out_dir} \n" "done \n") cmd = cmd.format(**locals()) try: stdout_res, stderr_res = "","" stdout_res, stderr_res = run_job(cmd, job_name = "blacklist", job_script_directory = "/home/sejjctj/Scratch/test_dir", job_other_options = "-S /bin/bash -V -l h_rt=08:00:00 -w n -l mem=16G -l tmpfs=60G -wd /home/sejjctj/Scratch -j yes ", #job_environment = { 'BASH_ENV' : '/home/sejjctj/.bashrc' } , retain_job_scripts = True, # retain job scripts for debugging, they go in Scratch/test_dir working_directory = "/home/sejjctj/Scratch/test_dir", drmaa_session = drmaa_session, logger = logger ) except error_drmaa_job as err: raise Exception("\n".join(map(str, ["Failed to run:", cmd, err, stdout_res, stderr_res]))) with logger_mutex: logger.debug("blacklist")
def qorts(input_file, output_file, log_file, gtf, logger, logger_mutex): bam=os.path.basename(input_file[0]) cmd = (" source ~/.bashrc \n" " cd $TMPDIR; mkdir tmp \n" " cp {input_file[0]} ./ \n" " samtools sort -n -m 12G -T prefix -O bam {bam} > namesort.bam \n" " java -Xmx48G -Djava.io.tmpdir=./tmp \\\n" " -jar ~/applications/QoRTs/QoRTs.jar QC \\\n" " --nameSorted \\\n" " --minMAPQ 60 \\\n" " --maxReadLength 100 \\\n" " namesort.bam \\\n" " {gtf} \\\n" " {output_file} \\\n" " 2>{log_file} " ) cmd = cmd.format(**locals()) #print cmd try: stdout_res, stderr_res = "","" stdout_res, stderr_res = run_job(cmd, job_name = "qorts", job_script_directory = "/home/sejjctj/Scratch/test_dir", job_other_options = "-w n -S /bin/bash -l h_rt=08:00:00 -w n -l mem=48G -l tmpfs=30G -wd /home/sejjctj/Scratch -j yes ", #job_environment = { 'BASH_ENV' : '/home/sejjctj/.bashrc' } , retain_job_scripts = True, # retain job scripts for debuging, they go in Scratch/test_dir working_directory = "/home/sejjctj/Scratch", drmaa_session = drmaa_session, logger = logger ) except error_drmaa_job as err: raise Exception("\n".join(map(str, ["Failed to run:", cmd, err, stdout_res, stderr_res]))) with logger_mutex: logger.debug("qorts worked")
def cufflinks(input_file, output_file, path,qc_path,gtf,genome,mask,genome_name,logger, logger_mutex): bam=os.path.basename(input_file) my_mask=os.path.basename(mask) cmd = ( "source ~/.bashrc \n" "cd $TMPDIR \n" "mkdir reference \n" "cp {input_file} . \n" "cp {genome}*fa* ./reference \n" "cp {gtf} ./reference/gencode.gtf \n" "cp {mask} ./reference \n" "cufflinks -q -u --no-update-check -p 8 -G ./reference/gencode.gtf \\\n" "-b ./reference/{genome_name} \\\n" "--mask-file ./reference/{my_mask} {bam} \\\n" "-o {path} \\\n" "2>{qc_path}/cufflinks.log \n" ) cmd = cmd.format(**locals()) #print cmd try: stdout_res, stderr_res = "","" stdout_res, stderr_res = run_job(cmd, job_name = "cufflinks", job_script_directory = "/home/sejjctj/Scratch/test_dir", job_other_options = "-w n -S /bin/bash -l h_rt=04:00:00 -w n -l mem=4G -l tmpfs=60G -pe smp 8 -wd /home/sejjctj/Scratch -j yes ", #job_environment = { 'BASH_ENV' : '/home/sejjctj/.bashrc' } , retain_job_scripts = True, working_directory = "/home/sejjctj/Scratch", drmaa_session = drmaa_session, logger = logger ) except error_drmaa_job as err: raise Exception("\n".join(map(str, ["Failed to run:", cmd, err, stdout_res, stderr_res]))) with logger_mutex: logger.debug("cufflinks worked")
def star_fusion(input_files, out_file,sample, outpath,qc_folder,logger, logger_mutex): fusion_input = input_files[1] fusion_name = os.path.basename(fusion_input) cmd = ( "source ~/.bashrc \n" "module unload perl \n" "module load perl/5.16.0 \n" "export PERL5LIB=$PERL5LIB:/home/sejjctj/perl5/lib/perl5 \n" "cd $TMPDIR \n " "cp {fusion_input} . \n " "awk 'BEGIN{{OFS=\"\\t\"}}{{$1=\"chr\"$1;$4=\"chr\"$4;print $0}}' {fusion_name} > temp && mv temp {fusion_name} \n" "STAR-Fusion \\\n" "--genome_lib_dir /home/sejjctj/Scratch/reference/star_single_cell/fusion/GRCh38_v27_CTAT_lib_Feb092018/ctat_genome_lib_build_dir \\\n" "-J {fusion_name} \\\n" "--output_dir {outpath} \n" ) cmd = cmd.format(**locals()) #print cmd try: stdout_res, stderr_res = "","" stdout_res, stderr_res = run_job(cmd, job_name = "star_fusion", job_script_directory = "/home/sejjctj/Scratch/test_dir", job_other_options = "-w n -S /bin/bash -l h_rt=02:00:00 -w n -l mem=24G -l tmpfs=60G -wd /home/sejjctj/Scratch -j yes ", #job_environment = { 'BASH_ENV' : '/home/sejjctj/.bashrc' } , retain_job_scripts = True, # retain job scripts for debuging, they go in Scratch/test_dir working_directory = "/home/sejjctj/Scratch", drmaa_session = drmaa_session, logger = logger ) except error_drmaa_job as err: raise Exception("\n".join(map(str, ["Failed to run:", cmd, err, stdout_res, stderr_res]))) with logger_mutex: logger.debug("star_fusion worked")
def kallisto(input_files, output_file, path, kallisto_folder, qc_folder, logger, logger_mutex):
    # logger and logger_mutex added to the signature: both are used below
    input_files = [item for sublist in input_files for item in sublist]
    list_of_reads = []
    for filename in input_files:
        list_of_reads.append(os.path.basename(filename))
    list_of_reads = ' '.join(list_of_reads)
    cmd = ("source ~/.bashrc \n"
           "cd $TMPDIR \n"
           "mkdir reference \n"
           "cp {path}/*fq.gz . \n"
           "cp $HOME/Scratch/reference/hg38_ver84_transcripts.idx ./reference \n"
           "kallisto quant -b 100 -t 4 -i \\\n"
           "./reference/hg38_ver84_transcripts.idx {list_of_reads} \\\n"
           "-o {kallisto_folder}")
    cmd = cmd.format(**locals())
    try:
        stdout_res, stderr_res = "", ""
        stdout_res, stderr_res = run_job(cmd,
                                         job_name="kallisto",
                                         job_script_directory="/home/sejjctj/Scratch/test_dir",
                                         job_other_options="-w n -S /bin/bash -l h_rt=04:00:00 -l mem=8G -w n -pe smp 4 -l tmpfs=60G -wd /home/sejjctj/Scratch -j yes ",
                                         # job_environment={'BASH_ENV': '/home/sejjctj/.bashrc'},
                                         retain_job_scripts=True,
                                         working_directory="/home/sejjctj/Scratch",
                                         drmaa_session=drmaa_session,
                                         logger=logger)
    except error_drmaa_job as err:
        raise Exception("\n".join(map(str,
                                      ["Failed to run:", cmd, err, stdout_res, stderr_res])))
    with logger_mutex:
        logger.debug("kallisto worked")
def trim_fastq(input_file, output_files, qc_folder, output_folder, logger, logger_mutex):
    raw_fastq = os.path.basename(input_file)
    cmd = (" cd $TMPDIR ; "
           " cp {input_file} . ;"
           " trim_galore --fastqc {raw_fastq} 2> {qc_folder}/trim_galore.log ; "
           " mv *.fq.gz {output_folder} ; "
           " mv *fastqc* {qc_folder} ; "
           " mv *report* {qc_folder}; rm * ; ")
    job_name = "trim_fastqc"
    # format() fills the {} placeholders from the local variables
    cmd = cmd.format(**locals())
    # print(cmd)
    try:
        stdout_res, stderr_res = "", ""
        stdout_res, stderr_res = run_job(cmd,
                                         job_name,
                                         job_script_directory="/home/sejjctj/Scratch/test_dir",
                                         job_other_options="-w n -S /bin/bash -V -l h_rt=05:00:00 -l mem=4G -l tmpfs=60G -wd /home/sejjctj/Scratch -j yes",
                                         job_environment={'BASH_ENV': '/home/sejjctj/.bashrc'},
                                         retain_job_scripts=True,
                                         working_directory="/home/sejjctj/Scratch",
                                         drmaa_session=drmaa_session,
                                         logger=logger)
    except error_drmaa_job as err:
        raise Exception("\n".join(map(str,
                                      ["Failed to run:", cmd, err, stdout_res, stderr_res])))
    with logger_mutex:
        logger.debug("trim_fastq worked")
def star(input_files, out_file, path, outpath, sample, qc_folder, logger, logger_mutex):
    flat_list = [item for sublist in input_files for item in sublist]
    print(flat_list)
    first_reads = []
    second_reads = []
    for i in flat_list:
        if re.search('val_1', os.path.basename(i)):
            first_reads.append(os.path.basename(i))
        elif re.search('val_2', os.path.basename(i)):
            second_reads.append(os.path.basename(i))
    first_reads = ','.join(first_reads)
    second_reads = ','.join(second_reads)
    star_output = out_file.split('/')
    star_output = star_output[-1]
    # print(star_output)
    cmd = ("source ~/.bashrc \n"
           "cd $TMPDIR \n"
           "cp {path}/*fq.gz . \n"
           "STAR --runThreadN 4 \\\n"
           "--genomeDir ~/Scratch/reference/star_single_cell/index/ \\\n"
           "--readFilesIn " + first_reads + " " + second_reads + " \\\n"
           "--readFilesCommand zcat \\\n"
           "--twopassMode Basic \\\n"
           "--outReadsUnmapped None \\\n"
           "--chimSegmentMin 12 \\\n"
           "--chimJunctionOverhangMin 12 \\\n"
           "--alignSJDBoverhangMin 10 \\\n"
           "--alignMatesGapMax 100000 \\\n"
           "--alignIntronMax 100000 \\\n"
           "--chimSegmentReadGapMax 3 \\\n"
           "--alignSJstitchMismatchNmax 5 -1 5 5 \\\n"
           "--outSAMstrandField intronMotif \\\n"
           "--outFilterIntronMotifs RemoveNoncanonical \\\n"  ## added for compatibility with
           "--outFileNamePrefix {sample} \\\n"                ## cufflinks
           "--outSAMtype BAM SortedByCoordinate\n"
           "cp *junction {outpath} \n"
           "cp *bam {outpath} \n"
           "cp *Log.* {qc_folder} ")
    cmd = cmd.format(**locals())
    print(cmd)
    try:
        stdout_res, stderr_res = "", ""
        stdout_res, stderr_res = run_job(cmd,
                                         job_name="star",
                                         job_script_directory="/home/sejjctj/Scratch/test_dir",
                                         job_other_options="-w n -S /bin/bash -l h_rt=02:00:00 -w n -l mem=24G -l tmpfs=60G -pe smp 4 -wd /home/sejjctj/Scratch -j yes ",
                                         # job_environment={'BASH_ENV': '/home/sejjctj/.bashrc'},
                                         retain_job_scripts=True,  # keep job scripts in Scratch/test_dir for debugging
                                         working_directory="/home/sejjctj/Scratch",
                                         drmaa_session=drmaa_session,
                                         logger=logger)
    except error_drmaa_job as err:
        raise Exception("\n".join(map(str,
                                      ["Failed to run:", cmd, err, stdout_res, stderr_res])))
    with logger_mutex:
        logger.debug("star worked")
def test_task2(infile, outfile):
    print("%s start to run " % infile)
    run_job("./five_second.py", run_locally=True)
    print("%s wake up " % infile)
    with open(outfile, "w") as p:
        pass
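# For context, ruffus task functions like test_task2 above are normally registered with
# decorators and executed via pipeline_run. A minimal, assumed wiring (the .start/.done
# suffixes and the originate task are illustrative, not from the original pipeline):
from ruffus import originate, transform, suffix, pipeline_run

@originate(["sample1.start", "sample2.start"])
def make_inputs(outfile):
    # create empty placeholder inputs for the test task
    open(outfile, "w").close()

@transform(make_inputs, suffix(".start"), ".done")
def run_test(infile, outfile):
    # delegate to the locally-run helper defined above
    test_task2(infile, outfile)

pipeline_run([run_test], multiprocess=2)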
def post_alignment_filter(input_file, output_file, out_dir, log_file, logger, logger_mutex):
    print(input_file)
    raw_bam = os.path.basename(input_file)
    prefix = raw_bam[:-4]
    FILT_BAM_PREFIX = prefix + ".filt.srt"
    FILT_BAM_FILE = FILT_BAM_PREFIX + ".bam"
    MAPQ_THRESH = 30
    TMP_FILT_BAM_FILE = FILT_BAM_PREFIX + "dupmark.bam"
    DUP_FILE_QC = FILT_BAM_PREFIX + ".dup.qc"
    FINAL_BAM_PREFIX = prefix + ".filt.nodup.srt"
    FINAL_BAM_FILE = FINAL_BAM_PREFIX + ".bam"
    FINAL_BAM_INDEX_FILE = FINAL_BAM_PREFIX + ".bai"
    FINAL_BAM_FILE_MAPSTATS = FINAL_BAM_PREFIX + ".flagstat.qc"
    PBC_FILE_QC = FINAL_BAM_PREFIX + ".pbc.qc"
    picard_loc = "/shared/ucl/apps/picard-tools/1.136/picard-tools-1.136/"
    cmd = ("cd $TMPDIR \n"
           "cp {input_file} . \n"
           "date \n"
           "ls -l \n"
           "\n"
           "samtools sort -@ 4 -m 8G {raw_bam} temporary \n"
           "mv temporary.bam {raw_bam} \n"
           "samtools view -@ 4 -F 1804 -q {MAPQ_THRESH} -b {raw_bam} > {FILT_BAM_FILE} \n"
           "echo \"first filter done\" \n"
           "ls -lh \n"
           "#=========================\n"
           "# Mark Duplicates \n"
           "#==========================\n"
           "\n"
           "java -Xmx8G -jar {picard_loc}picard.jar MarkDuplicates INPUT={FILT_BAM_FILE} \\\n"
           "OUTPUT={TMP_FILT_BAM_FILE} METRICS_FILE={DUP_FILE_QC} VALIDATION_STRINGENCY=LENIENT \\\n"
           "ASSUME_SORTED=true REMOVE_DUPLICATES=false \n"
           "mv {TMP_FILT_BAM_FILE} {FILT_BAM_FILE} \n"
           "echo \"mark duplicates done\" \n"
           "ls -lh \n"
           "date \n"
           "\n"
           "# ============================ \n"
           "# Remove duplicates \n"
           "# Index final position sorted BAM \n"
           "# ============================ \n"
           "\n"
           "samtools view -@ 4 -F 1804 -b {FILT_BAM_FILE} > {FINAL_BAM_FILE} \n"
           "\n"
           "# Index Final BAM file \n"
           "samtools index {FINAL_BAM_FILE} {FINAL_BAM_INDEX_FILE} \n"
           "samtools flagstat {FINAL_BAM_FILE} > {FINAL_BAM_FILE_MAPSTATS} \n"
           "# ============================= \n"
           "# Compute library complexity \n"
           "# sort by position and strand \n"
           "# Obtain unique count statistics \n"
           "# ============================= \n"
           "\n"
           "# PBC File output \n"
           "echo -e \"TotalReadPairs\\tDistinctReadPairs\\tOneReadPair\\tTwoReadPairs\\tNRF=Distinct/Total\\tPBC1=OnePair/Distinct\\tPBC2=OnePair/TwoPair\" > header \n"
           "bedtools bamtobed -i {FILT_BAM_FILE} | awk 'BEGIN{{OFS=\"\\t\"}}{{print $1,$2,$3,$6}}' | \\\n"
           "grep -v chrM | sort | uniq -c | awk 'BEGIN{{mt=0;m0=0;m1=0;m2=0}} ($1==1){{m1=m1+1}} ($1==2){{m2=m2+1}} \\\n"
           "{{m0=m0+1}} {{mt=mt+$1}} END{{printf \"%d\\t%d\\t%d\\t%d\\t%f\\t%f\\t%f\\n\",mt,m0,m1,m2,m0/mt,m1/m0,m1/m2}}' \\\n"
           "> {PBC_FILE_QC} \n"
           "cat header {PBC_FILE_QC} > temp_file && mv temp_file {PBC_FILE_QC} \n"
           "mv {PBC_FILE_QC} {out_dir} \n"
           "mv {FINAL_BAM_FILE} {out_dir} \n")
    cmd = cmd.format(**locals())
    try:
        stdout_res, stderr_res = "", ""
        stdout_res, stderr_res = run_job(cmd,
                                         job_name="filter_bam",
                                         job_script_directory="/home/sejjctj/Scratch/test_dir",
                                         job_other_options="-S /bin/bash -V -l h_rt=10:00:00 -w n -l mem=8G -l tmpfs=60G -pe smp 4 -wd /home/sejjctj/Scratch -j yes ",
                                         job_environment={'BASH_ENV': '/home/sejjctj/.bashrc'},
                                         retain_job_scripts=True,  # keep job scripts in Scratch/test_dir for debugging
                                         working_directory="/home/sejjctj/Scratch/test_dir",
                                         drmaa_session=drmaa_session,
                                         logger=logger)
    except error_drmaa_job as err:
        raise Exception("\n".join(map(str,
                                      ["Failed to run:", cmd, err, stdout_res, stderr_res])))
    with logger_mutex:
        logger.debug("post_alignment_filter worked")
def post_alignment_filter(input_file, output_file, out_dir, log_file, logger, logger_mutex):
    input_file = input_file[0]
    print("input:")
    print(input_file)
    print("output:")
    print(output_file)
    output_bam = os.path.basename(output_file)
    print("output_bam:")
    print(output_bam)
    RAW_BAM_FILE = os.path.basename(input_file)
    OFPREFIX = output_bam[:-4]
    print(OFPREFIX)
    FILT_BAM_PREFIX = OFPREFIX + ".filt"
    FILT_BAM_FILE = FILT_BAM_PREFIX + ".bam"
    TMP_FILT_BAM_PREFIX = "tmp." + FILT_BAM_PREFIX + ".nmsrt"
    TMP_FILT_BAM_FILE = TMP_FILT_BAM_PREFIX + ".bam"
    TMP_FILT_FIXMATE_BAM_FILE = TMP_FILT_BAM_PREFIX + ".fixmate.bam"
    TMP_DUP_BAM_FILE = FILT_BAM_PREFIX + ".dupmark.bam"
    DUP_FILE_QC = FILT_BAM_PREFIX + ".dup.qc"
    FINAL_BAM_PREFIX = OFPREFIX + ".nodup"
    FINAL_BAM_FILE = FINAL_BAM_PREFIX + ".bam"
    FINAL_BAM_INDEX_FILE = FINAL_BAM_FILE + ".bai"
    FINAL_BAM_FILE_MAPSTATS = FINAL_BAM_PREFIX + ".flagstat.qc"
    PBC_FILE_QC = OFPREFIX + ".pbc.qc"
    picard_loc = "/shared/ucl/apps/picard-tools/1.136/picard-tools-1.136/"
    cmd = (" # ============================= \n"
           " # Remove unmapped, mate unmapped \n"
           " # not primary alignment, reads failing platform \n"
           " # Only keep properly paired reads \n"
           " # Obtain name sorted BAM file \n"
           " # ================== \n"
           " source ~/.bashrc \n"
           " cd $TMPDIR \n"
           " cp {input_file} ./ \n"
           " ls -lh \n"
           " date \n"
           " samtools view -F 524 -f 2 -u {RAW_BAM_FILE} \\\n"
           " | sambamba sort -n -m 16G -t 4 /dev/stdin -o {TMP_FILT_BAM_FILE} \n"
           " samtools view -h {TMP_FILT_BAM_FILE} | assign_multimappers.py -k 4 --paired-end \\\n"
           " | samtools fixmate -r /dev/stdin {TMP_FILT_FIXMATE_BAM_FILE} \n"
           " ls -lh \n"
           " date \n"
           " # Remove orphan reads (pair was removed) \n"
           " # and read pairs mapping to different chromosomes \n"
           " # obtain position sorted BAM \n"
           " samtools view -F 1804 -f 2 -u {TMP_FILT_FIXMATE_BAM_FILE} \\\n"
           " | sambamba sort -m 16G -t 4 /dev/stdin -o {FILT_BAM_FILE} \n"
           " rm {TMP_FILT_FIXMATE_BAM_FILE} \n"
           " rm {TMP_FILT_BAM_FILE} \n"
           " ls -lh \n"
           " date \n"
           " # ============= \n"
           " # Mark duplicates \n"
           " # ============= \n"
           " java -Xmx16G -jar {picard_loc}picard.jar MarkDuplicates INPUT={FILT_BAM_FILE} "
           " OUTPUT={TMP_DUP_BAM_FILE} METRICS_FILE={DUP_FILE_QC} "
           " VALIDATION_STRINGENCY=LENIENT ASSUME_SORTED=true REMOVE_DUPLICATES=false \n"
           " mv {TMP_DUP_BAM_FILE} {FILT_BAM_FILE} \n"
           " # ============================ \n"
           " # Remove duplicates \n"
           " # Index final position sorted BAM \n"
           " # Create final name sorted BAM \n"
           " # ============================ \n"
           " samtools view -F 1804 -f 2 -b {FILT_BAM_FILE} > {output_bam} \n"
           " samtools sort -n -m 16G -@ 4 {output_bam} {OFPREFIX}.final_filt_nmsrt \n"
           " # used later on \n"
           " samtools index {output_bam} \n"
           " samtools flagstat {output_bam} > {output_bam}.mapstats \n"
           " mv {output_bam}.mapstats {out_dir}\n"
           " # ============================= \n"
           " # Compute library complexity \n"
           " # ============================= \n"
           " # Sort by name \n"
           " # convert to bedPE and obtain fragment coordinates \n"
           " # sort by position and strand \n"
           " # Obtain unique count statistics \n"
           " sambamba sort -n -m 16G -t 4 {FILT_BAM_FILE} -o {OFPREFIX}.srt.tmp.bam \n"
           " echo -e '# PBC File output\\n# TotalReadPairs\\tDistinctReadPairs\\tOneReadPair\\tTwoReadPairs\\tNRF=Distinct/Total\\tPBC1=OnePair/Distinct\\tPBC2=OnePair/TwoPair' > header \n"
           " bedtools bamtobed -bedpe -i {OFPREFIX}.srt.tmp.bam \\\n"
           " | awk 'BEGIN{{OFS=\"\\t\"}}{{print $1,$2,$4,$6,$9,$10}}' | grep -v 'chrM' | sort \\\n"
           " | uniq -c | \\\n"
           " awk 'BEGIN{{mt=0;m0=0;m1=0;m2=0}}($1==1){{m1=m1+1}} ($1==2){{m2=m2+1}} \\\n"
           " {{m0=m0+1}}{{mt=mt+$1}}END{{printf \"%d\\t%d\\t%d\\t%d\\t%f\\t%f\\t%f\\n\",mt,m0,m1,m2,m0/mt,m1/m0,m1/m2}}' \\\n"
           " > {PBC_FILE_QC} \n"
           " rm {FILT_BAM_FILE} \n"
           " mv {output_bam} {out_dir} \n"
           " mv {output_bam}.bai {out_dir} \n"
           " cat header {PBC_FILE_QC} > temporary && mv temporary {PBC_FILE_QC} \n"
           " mv {PBC_FILE_QC} {out_dir} \n"
           " mv {OFPREFIX}.final_filt_nmsrt.bam {out_dir} ")
    cmd = cmd.format(**locals())
    try:
        stdout_res, stderr_res = "", ""
        stdout_res, stderr_res = run_job(cmd,
                                         job_name="filter_bam",
                                         job_script_directory="/home/sejjctj/Scratch/test_dir",
                                         job_other_options="-S /bin/bash -V -l h_rt=10:00:00 -w n -l mem=16G -l tmpfs=60G -pe smp 4 -wd /home/sejjctj/Scratch -j yes ",
                                         # job_environment={'BASH_ENV': '/home/sejjctj/.bashrc'},
                                         retain_job_scripts=True,  # keep job scripts in Scratch/test_dir for debugging
                                         working_directory="/home/sejjctj/Scratch/test_dir",
                                         drmaa_session=drmaa_session,
                                         logger=logger)
    except error_drmaa_job as err:
        raise Exception("\n".join(map(str,
                                      ["Failed to run:", cmd, err, stdout_res, stderr_res])))
    with logger_mutex:
        logger.debug("post_alignment_filter worked")
def macs2(input_file, output_file, out_dir, logger, logger_mutex):
    cmd = ("#===================================\n"
           "# run macs2 on tn5 shifted files \n"
           "#===================================\n"
           "source ~/.bashrc \n"
           "cd $TMPDIR \n"
           "cp {out_dir}/*tn5.tagAlign.gz . \n"
           "for tag in *tagAlign.gz \n"
           "do \n"
           "prefix=${{tag:0:${{#tag}}-12}} # remove .tagAlign.gz \n"
           "peakfile=${{prefix}}.narrowPeak.gz \n"
           "pval_thresh=0.01 \n"
           "fc_bedgraph=${{prefix}}.fc.signal.bedgraph \n"
           "fc_bedgraph_srt=${{prefix}}.fc.signal.srt.bedgraph \n"
           "fc_bigwig=${{prefix}}_sig.fc.signal.bigwig \n"
           "pval_bedgraph=${{prefix}}.pval.signal.bedgraph \n"
           "pval_bedgraph_srt=${{prefix}}.pval.signal.srt.bedgraph \n"
           "pval_bigwig=${{prefix}}_sig.pval.signal.bigwig \n"
           "chrsz=\"/home/sejjctj/Scratch/reference/grch38/hg38.chrom.sizes\" \n"
           "## see https://github.com/taoliu/MACS/issues/145 for choice of --shift and --extsize \n"
           "macs2 callpeak \\\n"
           "-t $tag -f BED -n $prefix -g 2700000000 -p $pval_thresh \\\n"
           "--nomodel --shift -100 --extsize 200 -B --SPMR --keep-dup all --call-summits \n"
           "# Sort by Col8 in descending order and replace long peak names in Column 4 with Peak_<peakRank> \n"
           "sort -k 8gr,8gr \"$prefix\"_peaks.narrowPeak | awk 'BEGIN{{OFS=\"\\t\"}}{{$4=\"Peak_\"NR ; print $0}}' \\\n"
           " | gzip -nc > $peakfile \n"
           "rm -f \"$prefix\"_peaks.narrowPeak \n"
           "rm -f \"$prefix\"_peaks.xls \n"
           "rm -f \"$prefix\"_summits.bed \n"
           # fold-enrichment / p-value signal-track generation, currently disabled:
           #   macs2 bdgcmp -t "$prefix"_treat_pileup.bdg -c "$prefix"_control_lambda.bdg --o-prefix $prefix -m FE
           #   slopBed -i "$prefix"_FE.bdg -g $chrsz -b 0 | bedClip stdin $chrsz $fc_bedgraph
           #   rm -f "$prefix"_FE.bdg
           #   sort -k1,1 -k2,2n $fc_bedgraph > $fc_bedgraph_srt
           #   bedGraphToBigWig $fc_bedgraph_srt $chrsz $fc_bigwig
           #   rm -f $fc_bedgraph $fc_bedgraph_srt
           #   # sval counts the number of tags per million in the compressed BED file
           #   sval=$(zcat "$tag" | wc -l | awk '{print $1/1000000}')
           #   macs2 bdgcmp -t "$prefix"_treat_pileup.bdg -c "$prefix"_control_lambda.bdg --o-prefix $prefix -m ppois -S ${sval}
           #   slopBed -i "$prefix"_ppois.bdg -g $chrsz -b 0 | bedClip stdin $chrsz $pval_bedgraph
           #   rm -f "$prefix"_ppois.bdg
           #   sort -k1,1 -k2,2n $pval_bedgraph > $pval_bedgraph_srt
           #   bedGraphToBigWig $pval_bedgraph_srt $chrsz $pval_bigwig
           #   rm -f $pval_bedgraph $pval_bedgraph_srt
           #   rm -f "$prefix"_treat_pileup.bdg "$prefix"_control_lambda.bdg
           "mv ./\"$prefix\"* {out_dir} \n"
           "done \n")
    cmd = cmd.format(**locals())
    try:
        stdout_res, stderr_res = "", ""
        stdout_res, stderr_res = run_job(cmd,
                                         job_name="macs2",
                                         job_script_directory="/home/sejjctj/Scratch/test_dir",
                                         job_other_options="-S /bin/bash -V -l h_rt=08:00:00 -w n -l mem=16G -l tmpfs=60G -wd /home/sejjctj/Scratch -j yes ",
                                         # job_environment={'BASH_ENV': '/home/sejjctj/.bashrc'},
                                         retain_job_scripts=True,  # keep job scripts in Scratch/test_dir for debugging
                                         working_directory="/home/sejjctj/Scratch/test_dir",
                                         drmaa_session=drmaa_session,
                                         logger=logger)
    except error_drmaa_job as err:
        raise Exception("\n".join(map(str,
                                      ["Failed to run:", cmd, err, stdout_res, stderr_res])))
    with logger_mutex:
        logger.debug("macs2_callpeaks worked")