Example #1
def test_task3(infile, outfile):
    print("%s start to run " % infile)
    # subprocess.check_call("./five_second.py")
    run_job("./five_second.py", run_locally=True)
    print("%s wake up " % infile)
    with open(outfile, "w") as p:
        pass
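Every example on this page assumes the same ruffus/DRMAA boilerplate, which the snippets themselves omit: run_job and error_drmaa_job come from ruffus.drmaa_wrapper, and one DRMAA session is shared by all tasks. A minimal sketch of that shared setup (an assumption inferred from the examples, not code from any one of them):

import os
import re

import drmaa
from ruffus.drmaa_wrapper import run_job, error_drmaa_job

# One DRMAA session is opened per pipeline run and shared by every task.
my_drmaa_session = drmaa.Session()
my_drmaa_session.initialize()

# ... ruffus tasks such as the examples below are defined here ...

# The session should be closed once the pipeline completes:
# my_drmaa_session.exit()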
Example #2
def runFastqc(FqFileName, fastqcLog, config):
    """
    To run FastQC
    Arguments:
    - `FqFileName`: fastq file
    - `fastqcLog`: log file
    - `config`: config
    """
    cmds = ['runFastQC.sh']
    #cmds.append("-o")
    cmds.append(fastqc_path)
    cores = int(config['cores'])
    if cores == 0:
        cores = 1
    #cmds.append("-t")
    cmds.append(str(cores))
    cmds.append(FqFileName)

    cmds.append(config["pair_end"])

    logfile = fastqcLog

    run_job(" ".join(cmds),
            job_name=os.path.basename(FqFileName) + "_fastqc",
            job_other_options=cluster_options(config, "runFastqc", cores,
                                              logfile),
            job_script_directory=os.path.dirname(os.path.realpath(__file__)),
            job_environment={'BASH_ENV': '~/.bash_profile'},
            retain_job_scripts=True,
            drmaa_session=my_drmaa_session)

    return 0
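cluster_options, fastqc_path, and my_drmaa_session are module-level names the snippet above takes for granted. The cluster_options helper is not shown anywhere on this page; a hypothetical stand-in that turns the config into scheduler flags might look like this (SGE-style flags assumed purely for illustration; the real helper may differ):

def cluster_options(config, task_name, cores, logfile):
    # Hypothetical helper: build the job_other_options string for run_job.
    # The real implementation is not shown on this page.
    return "-pe smp {cores} -N {name} -o {log} -e {log}".format(
        cores=cores, name=task_name, log=logfile)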
Example #3
def runDiffrepeat(BamFileNames, ResultFile, config):
    """
    To run diffrepeats
    Arguments:
    - `BamFileNames`: bam files
    - `ResultFile`: result file
    - `config`: config
    """
    cmds = ['runDiffrepeat.sh']
    cmds.append(diffrepeat_path)
    cmds.append(alignment_path)
    cmds.append(config["repbase_db"])
    cmds.append(ResultFile)
    cmds.append(config["diffrepeat_editdist"])
    cmds.append(config["diffrepeat_mapq"])
    logfile = expandOsPath(
        os.path.join(log_path, config["project_name"] + ".diffrepeat.log"))

    cores = int(config['cores'])
    if cores == 0:
        cores = 1

    run_job(" ".join(cmds),
            job_name="runDiffRepeat",
            job_other_options=cluster_options(config, "runDiffrepeat", cores,
                                              logfile),
            job_script_directory=os.path.dirname(os.path.realpath(__file__)),
            job_environment={'BASH_ENV': '~/.bash_profile'},
            retain_job_scripts=True,
            drmaa_session=my_drmaa_session)

    return 0
Example #4
def runFastqc(BamFileName, fastqcLog, config):
    """
    To run FastQC
    Arguments:
    - `BamFileName`: bam file
    - `fastqcLog`: log file
    - `config`: config
    """
    cmds = ['fastqc']
    cmds.append("-o")
    cmds.append(
        expandOsPath(
            os.path.join(config["project_dir"], config["data_dir"], "FastQC")))
    cores = int(config['cores'])
    if cores == 0:
        cores = 1
    cmds.append("-t")
    cmds.append(str(cores))
    cmds.append(BamFileName)
    logfile = BamFileName + ".fastqc.log"

    run_job(" ".join(cmds),
            job_name="fastqc_" + os.path.basename(BamFileName),
            job_other_options=cluster_options(config, "runFastqc", cores,
                                              logfile),
            job_script_directory=os.path.dirname(os.path.realpath(__file__)),
            job_environment={'BASH_ENV': '~/.bash_profile'},
            retain_job_scripts=True,
            drmaa_session=my_drmaa_session)

    return 0
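expandOsPath is another helper these examples assume; a plausible definition (an assumption, not shown on this page) simply expands "~" and environment variables in config paths:

def expandOsPath(path):
    # Plausible definition: resolve "~" and $VARS so that paths taken
    # from the config file work as absolute OS paths.
    return os.path.expanduser(os.path.expandvars(path))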
Example #5
def rmdupBam(BamFileName, rmdupFile, config):
    """
    To remove duplicates
    Arguments:
    - `BamFileName`: bam file
    - `rmdupFile`: output bam file
    - `config`: config
    """
    if config["pair_end"]=="no":
        cmds = ['rmdup.bam.sh']
    else:
        cmds = ['rmdup_PE.bam.sh']
    cmds.append(BamFileName)
    cmds.append(rmdup_path)
    #if "bam_sort_buff" in config:
    #    cmds.append(config["bam_sort_buff"])
    logfile = BamFileName + ".rmdup.log"

    cores = 1

    run_job(" ".join(cmds),
        job_name = "rmdup_" + os.path.basename(BamFileName),
        job_other_options = cluster_options(config, "rmdupBam", cores, logfile),
        job_script_directory = os.path.dirname(os.path.realpath(__file__)),
        job_environment={ 'BASH_ENV' : '~/.bash_profile' },
        retain_job_scripts = True, drmaa_session=my_drmaa_session)

    return 0
Example #6
def genTDF(BamFileName, tdfLog, config):
    """
    To generate TDF files for IGV
    Arguments:
    - `BamFileName`: bam file
    - `tdfLog`: log file
    - `config`: config
    """
    cmds = ['igvtools']
    cmds.append("count")
    cmds.append(BamFileName)
    TDFPath = expandOsPath(os.path.join(rmdup_path, "tdf"))
    baseName = os.path.basename(BamFileName)
    cmds.append(os.path.join(TDFPath, baseName.replace(".bam", ".tdf")))
    cmds.append(config["IGV_genome"])
    logfile = BamFileName + ".tdf.log"

    cores = 1

    run_job(" ".join(cmds),
            job_name="genTDF_" + os.path.basename(BamFileName),
            job_other_options=cluster_options(config, "genTDF", cores,
                                              logfile),
            job_script_directory=os.path.dirname(os.path.realpath(__file__)),
            job_environment={'BASH_ENV': '~/.bash_profile'},
            retain_job_scripts=True,
            drmaa_session=my_drmaa_session)

    return 0
Example #7
def rmdupBam(BamFileName, rmdupFile, config):
    """
    To remove duplicates
    Arguments:
    - `BamFileName`: bam file
    - `rmdupFile`: output bam file
    - `config`: config
    """
    if config["pair_end"] == "no":
        cmds = ['rmdup.bam.sh']
    else:
        cmds = ['rmdup_PE.bam.sh']
    cmds.append(BamFileName)
    cmds.append(rmdup_path)
    #if "bam_sort_buff" in config:
    #    cmds.append(config["bam_sort_buff"])
    logfile = BamFileName + ".rmdup.log"

    cores = 1

    run_job(" ".join(cmds),
            job_name="rmdup_" + os.path.basename(BamFileName),
            job_other_options=cluster_options(config, "rmdupBam", cores,
                                              logfile),
            job_script_directory=os.path.dirname(os.path.realpath(__file__)),
            job_environment={'BASH_ENV': '~/.bash_profile'},
            retain_job_scripts=True,
            drmaa_session=my_drmaa_session)

    return 0
Example #8
def test_task3(infile, outfile):
    print("%s start to run " % infile)
    # subprocess.check_call("./five_second.py")
    run_job("./five_second.py", run_locally=True)
    print("%s wake up " % infile)
    with open(outfile, "w") as p:
        pass
Example #9
def runPhantomPeak(BamFileName, Log, config):
    """
    To check data with phantomPeak
    Arguments:
    - `BamFileName`: bam file
    - `Log`: log file
    - `config`: config
    """
    cmds = ['runPhantomPeak.sh']
    cmds.append(BamFileName)
    cmds.append(str(config["cores"]))
    logfile = BamFileName + ".phantomPeak.log"

    cores = int(config['cores'])
    if cores == 0:
        cores = 1

    run_job(" ".join(cmds),
            job_name="runPhantomPeak_" + os.path.basename(BamFileName),
            job_other_options=cluster_options(config, "runPhantomPeak", cores,
                                              logfile),
            job_script_directory=os.path.dirname(os.path.realpath(__file__)),
            job_environment={'BASH_ENV': '~/.bash_profile'},
            retain_job_scripts=True,
            drmaa_session=my_drmaa_session)

    return 0
Example #10
def genTDF(BamFileName, tdfLog, config):
    """
    To generate TDF files for IGV
    Arguments:
    - `BamFileName`: bam file
    - `tdfLog`: log file
    - `config`: config
    """
    cmds = ['igvtools']
    cmds.append("count")
    cmds.append(BamFileName)
    TDFPath = expandOsPath(os.path.join(rmdup_path, "tdf"))
    baseName = os.path.basename(BamFileName)
    cmds.append(os.path.join(TDFPath, baseName.replace(".bam", ".tdf")))
    cmds.append(config["IGV_genome"])
    logfile = BamFileName + ".tdf.log"

    cores = 1

    run_job(" ".join(cmds),
        job_name = "genTDF_" + os.path.basename(BamFileName),
        job_other_options = cluster_options(config, "genTDF", cores, logfile),
        job_script_directory = os.path.dirname(os.path.realpath(__file__)),
        job_environment={ 'BASH_ENV' : '~/.bash_profile' },
        retain_job_scripts = True, drmaa_session=my_drmaa_session)

    return 0
Example #11
def runFastqc(BamFileName, fastqcLog, config):
    """
    To run FastQC
    Arguments:
    - `BamFileName`: bam file
    - `fastqcLog`: log file
    - `config`: config
    """
    cmds = ['fastqc']
    cmds.append("-o")
    cmds.append(expandOsPath(os.path.join(config["project_dir"], config["data_dir"], "FastQC")))
    cores = int(config['cores'])
    if cores == 0:
        cores = 1
    cmds.append("-t")
    cmds.append(str(cores))
    cmds.append(BamFileName)
    logfile = BamFileName + ".fastqc.log"

    run_job(" ".join(cmds),
        job_name = "fastqc_" + os.path.basename(BamFileName),
        job_other_options = cluster_options(config, "runFastqc", cores, logfile),
        job_script_directory = os.path.dirname(os.path.realpath(__file__)),
        job_environment={ 'BASH_ENV' : '~/.bash_profile' },
        retain_job_scripts = True, drmaa_session=my_drmaa_session)

    return 0
Example #12
def runPhantomPeak(BamFileName, Log, config):
    """
    To check data with phantomPeak
    Arguments:
    - `BamFileName`: bam file
    - `Log`: log file
    - `config`: config
    """
    cmds = ['runPhantomPeak.sh']
    cmds.append(BamFileName)
    cmds.append(str(config["cores"]))
    logfile = BamFileName + ".phantomPeak.log"

    cores = int(config['cores'])
    if cores == 0:
        cores = 1

    stdout_res, stderr_res = run_job(" ".join(cmds),
        job_name = "runPhantomPeak_" + os.path.basename(BamFileName),
        job_other_options = cluster_options(config, "runPhantomPeak", cores, logfile),
        job_script_directory = os.path.dirname(os.path.realpath(__file__)),
        job_environment={ 'BASH_ENV' : '~/.bashrc' },
        retain_job_scripts = True, drmaa_session=my_drmaa_session)

    writeLog(logfile, stdout_res, stderr_res)

    return 0
Example #13
def BS_flagstat(INfile, OUTfile, sampath, outdir, my_session, logobject):
    read_root = re.sub('.bam', '', os.path.basename(INfile))
    cmd = os.path.join(sampath,
                       'samtools') + ' flagstat ' + INfile + ' > ' + OUTfile
    logobject.info(cmd)
    with open(os.path.join(outdir, "logs", "%s.flagstat.out" % read_root),
              'w') as stdoutF, open(
                  os.path.join(outdir, "logs", "%s.flagstat.err" % read_root),
                  'w') as stderrF:
        try:
            stdout_res, stderr_res = run_job(cmd_str=cmd,
                                             job_name='fstat',
                                             logger=logobject,
                                             drmaa_session=my_session,
                                             run_locally=False,
                                             working_directory=os.getcwd(),
                                             job_other_options='-p bioinfo ')

            stdoutF.write("".join(stdout_res))
            stderrF.write("".join(stderr_res))

        # relay all the stdout, stderr, drmaa output to diagnose failures
        except Exception as err:
            logobject.error("Flagstat error: %s" % err)
            raise
        else:
            logobject.info('Flagstat calculation complete')
    return
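Most of the remaining examples repeat the pattern shown above: build a shell command string, submit it through run_job, tee the captured stdout/stderr into per-sample log files, and re-raise on failure. Factored into a helper, the pattern looks roughly like this (a sketch with illustrative names, not code from any one pipeline):

def submit_and_log(cmd, name, log_prefix, session, logobject, options):
    # Sketch of the recurring submit-and-log pattern; all names are
    # illustrative. Captured stdout/stderr go to <log_prefix>.out/.err.
    with open(log_prefix + ".out", 'w') as outF, \
         open(log_prefix + ".err", 'w') as errF:
        try:
            stdout_res, stderr_res = run_job(cmd_str=cmd,
                                             job_name=name,
                                             logger=logobject,
                                             drmaa_session=session,
                                             run_locally=False,
                                             working_directory=os.getcwd(),
                                             job_other_options=options)
            outF.write("".join(stdout_res))
            errF.write("".join(stderr_res))
        except Exception as err:
            # relay stdout, stderr, drmaa output to diagnose failures
            logobject.error("%s error: %s" % (name, err))
            raise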
Example #14
def CpG_filt(input_file, output_file):
    ii = input_file
    oo = output_file
    read_root = re.sub('.CG.call.gz', '', os.path.basename(ii))
    gz_cmd = 'gzip -dc ' + ii + ' > ' + re.sub('.gz', '', ii)
    filt_cmd = os.path.join(
        Rpath, 'Rscript'
    ) + ' --no-save --no-restore /data/boehm/group/pipelines/BS_amplicon_seq/v0.1.0/BSampli.mCT.filt.R ' + mextout + ' ' + re.sub(
        '.gz', '', ii) + ' ' + pozFsub
    clean_cmd = 'rm -v ' + re.sub('.gz', '', ii)
    cmd_all = ';'.join([gz_cmd, filt_cmd, clean_cmd])
    logger.info(cmd_all)
    with open(os.path.join(mextout, "logs", "%s.CpG_filt.out" % read_root),
              'w') as stdoutF, open(
                  os.path.join(mextout, "logs", "%s.CpG_filt.err" % read_root),
                  'w') as stderrF:
        try:
            stdout_res, stderr_res = run_job(cmd_str=cmd_all,
                                             job_name='CpG_filt',
                                             logger=logger,
                                             drmaa_session=mySession,
                                             run_locally=False,
                                             working_directory=os.getcwd(),
                                             job_other_options='-p bioinfo')
            stdoutF.write("".join(stdout_res))
            stderrF.write("".join(stderr_res))

        # relay all the stdout, stderr, drmaa output to diagnose failures
        except Exception as err:
            logger.error("CpG filtering error: %s" % err)
            raise
        else:
            logger.info('CpG filtering complete')
Example #15
def tn5_shift(input_file, output_file, out_dir, logger, logger_mutex):
  cmd = ("#==================================\n"
         "# TN5 shift for atac seq           \n"
         "#==================================\n"
         "source ~/.bashrc \n"
         "cd $TMPDIR \n"
         "cp {out_dir}/*tagAlign.gz . \n"
         "for tag in *tagAlign.gz \n"
         "do zcat ""$tag"" | awk -F $'\t' 'BEGIN {{OFS = FS}}{{ if ($6 == \"+\") {{$2 = $2 + 4}} else if ($6 == \"-\") {{$3 = $3 - 5}} print $0}}' | \\\n"
         "gzip -nc > ""${{tag:0:${{#tag}}-12}}.tn5.tagAlign.gz"" \n"
         "done \n"
         "mv *tn5* {out_dir} \n")
  cmd = cmd.format(**locals())
  try:
    stdout_res, stderr_res = "",""
    stdout_res, stderr_res = run_job(cmd,
                                     job_name = "tn5_shift",
                                     job_script_directory = "/home/sejjctj/Scratch/test_dir",
                                     job_other_options    = "-S /bin/bash -V -l h_rt=04:00:00 -w n -l mem=4G -l tmpfs=10G -wd /home/sejjctj/Scratch -j yes ",
                                     #job_environment      = { 'BASH_ENV' : '/home/sejjctj/.bashrc' } ,
                                     retain_job_scripts   = True,  # retain job scripts for debuging, they go in Scratch/test_dir
                                     working_directory    = "/home/sejjctj/Scratch/test_dir",
                                     drmaa_session        = drmaa_session,
                                     logger = logger )
  except error_drmaa_job as err:
    raise Exception("\n".join(map(str,
                    ["Failed to run:",
                      cmd,
                      err,
                      stdout_res,
                      stderr_res])))

  with logger_mutex:
    logger.debug("tn5_shift")
Example #16
def trim_BQ(infiles,outfiles):
    ii1=infiles[0]
    ii2=infiles[1]
    oo1=outfiles[0]
    oo2=outfiles[1]
    read_root=re.sub('_R1.fastq.gz','',os.path.basename(ii1))
    uzcmd1='zcat -v '+ ii1 + ' > ' + os.path.join(cutout,re.sub('.gz','',os.path.basename(ii1)))
    uzcmd2='zcat -v '+ ii2 + ' > ' + os.path.join(cutout,re.sub('.gz','',os.path.basename(ii2)))
    bshcmd='perl '+ os.path.join(prinpath,'prinseq-lite.pl') + ' -fastq ' + os.path.join(cutout,re.sub('.gz','',os.path.basename(ii1))) + ' -fastq2 ' + os.path.join(cutout,re.sub('.gz','',os.path.basename(ii2))) + ' -out_good ' + os.path.join(cutout,re.sub('_1.fastq.gz','',os.path.basename(oo1))) +' -trim_qual_right 20 -trim_qual_type min -trim_qual_window 6 -trim_qual_step 3 -min_len 50 -ns_max_p 10 -min_qual_mean 26 -out_bad null'
    zcmd1='gzip -c '+ os.path.join(cutout,re.sub('.gz','',os.path.basename(oo1))) + ' > ' + oo1
    zcmd2='gzip -c '+ os.path.join(cutout,re.sub('.gz','',os.path.basename(oo2))) + ' > ' + oo2
    clcmd='rm -v '+ os.path.join(cutout,re.sub('.gz','',os.path.basename(ii1))) + ' ' + os.path.join(cutout,re.sub('.gz','',os.path.basename(ii2))) + ' ' + os.path.join(cutout,re.sub('.gz','',os.path.basename(oo1))) + ' ' + os.path.join(cutout,re.sub('.gz','',os.path.basename(oo2)))
    cmd_all=';'.join([uzcmd1,uzcmd2,bshcmd,zcmd1,zcmd2,clcmd])
    logger.info(cmd_all)           
    with open(os.path.join(cutout,"logs","%s.BQtrim_reads.out" % read_root),'w+') as stdoutF, open(os.path.join(cutout,"logs","%s.BQtrim_reads.err" % read_root),'w+') as stderrF:
        try:
            stdout_res, stderr_res  = run_job(cmd_str   = cmd_all,
                                      job_name          = 'BQtrim_reads',
                                      logger            = logger,
                                      drmaa_session     = mySession,
                                      run_locally       = False,
                                      working_directory = os.getcwd(),
                                      job_other_options = '-p bioinfo')
            stdoutF.write("".join(stdout_res))
            stderrF.write("".join(stderr_res))

        except error_drmaa_job as err:
            logger.error("BQtrim_reads error: %s" % err)
            raise
        else:
            logger.info('Base quality trimming complete')
Example #17
def map_reads(input_files, output_file):
    ii1 = input_files[0]
    ii2 = input_files[1]
    oo = output_file
    read_root = re.sub('_1.fastq.gz', '', os.path.basename(ii1))
    mapcmd = os.path.join(bismpath, 'bismark') + ' -p ' + str(
        args.nthreads
    ) + ' --non_directional --dovetail --temp_dir /data/extended --path_to_bowtie /package/bowtie2-2.2.8 --output_dir ' + bamoutO + ' --basename ' + read_root + ' --genome_folder ' + crefGpath + ' -1 ' + ii1 + ' -2 ' + ii2
    logger.info(mapcmd)
    with open(os.path.join(bamoutO, "logs", "%s.readmap.out.log" % read_root),
              'w') as stdoutF, open(
                  os.path.join(bamoutO, "logs",
                               "%s.readmap.err.log" % read_root),
                  'w') as stderrF:
        try:
            stdout_res, stderr_res = run_job(
                cmd_str=mapcmd,
                job_name='BSmap',
                logger=logger,
                drmaa_session=mySession,
                run_locally=False,
                working_directory=os.getcwd(),
                job_other_options='-p bioinfo --mincpus=' + str(args.nthreads))
            stdoutF.write("".join(stdout_res))
            stderrF.write("".join(stderr_res))

        # relay all the stdout, stderr, drmaa output to diagnose failures
        except Exception as err:
            logger.error("Map_reads error: %s" % err)
            raise
        else:
            logger.info('Mapping complete')
    return
Example #18
def index_bam(input_file, output_file):
    ii = input_file
    oo = output_file
    read_root = re.sub('.sorted.bam', '', os.path.basename(ii))
    cmd = os.path.join(sampath, 'samtools') + ' index ' + ii
    logger.info(cmd)
    with open(os.path.join(bamoutO, "logs", "%s.bam_index.out" % read_root),
              'w+') as stdoutF, open(
                  os.path.join(bamoutO, "logs",
                               "%s.bam_index.err" % read_root),
                  'w+') as stderrF:
        try:
            stdout_res, stderr_res = run_job(cmd_str=cmd,
                                             job_name='bam_index',
                                             logger=logger,
                                             drmaa_session=mySession,
                                             run_locally=False,
                                             working_directory=os.getcwd(),
                                             job_other_options='-p bioinfo ')
            stdoutF.write("".join(stdout_res))
            stderrF.write("".join(stderr_res))

        # relay all the stdout, stderr, drmaa output to diagnose failures
        except error_drmaa_job as err:
            logger.error("Bam indexing error: %s" % err)
            raise
        else:
            logger.info('Bam indexing complete')
Example #19
def cut_reads_auto(INfile1, INfile2, OUTfile1, OUTfile2, cutThdR1, cutThdR2,
                   cutpath, my_session, cutout, logobject, args):
    read_root = re.sub('_R1.fastq.gz', '', os.path.basename(INfile1))
    bshcmd = cutpath + ' cutadapt -a AGATCGGAAGAGC -A AGATCGGAAGAGC --minimum-length 30  -n 5 -j' + str(
        args.nthreads
    ) + ' -u ' + cutThdR1 + ' -U ' + cutThdR2 + ' -o ' + OUTfile1 + ' -p ' + OUTfile2 + ' ' + INfile1 + ' ' + INfile2 + ';sleep 300'
    with open(os.path.join(cutout, "logs", "%s.trim_reads.out" % read_root),
              'w+') as stdoutF, open(
                  os.path.join(cutout, "logs",
                               "%s.trim_reads.err" % read_root),
                  'w+') as stderrF:
        try:

            stdout_res, stderr_res = run_job(
                cmd_str=bshcmd,
                job_name='cut_reads',
                logger=logobject,
                drmaa_session=my_session,
                run_locally=False,
                working_directory=os.getcwd(),
                job_other_options='-p bioinfo --nodes=1=1 --mincpus={}'.format(
                    args.nthreads))
            stdoutF.write("".join(stdout_res))
            stderrF.write("".join(stderr_res))

        except error_drmaa_job as err:
            logobject.error("Cut_reads error: %s" % err)
            raise
    return
Example #20
def BS_conv_rate(ii1sub, oo, metDir, my_session, logobject):
    read_root = os.path.basename(ii1sub)
    CR_cmd = '/data/manke/repository/scripts/DNA_methylation/DEEP_scripts/conversionRate_KS.sh ' + ii1sub + ' ' + oo
    logobject.info(CR_cmd)
    with open(os.path.join(metDir, "logs", "%s.conv_rate.out.log" % read_root),
              'w') as stdoutF, open(
                  os.path.join(metDir, "logs",
                               "%s.conv_rate.err.log" % read_root),
                  'w') as stderrF:
        try:
            stdout_res, stderr_res = run_job(cmd_str=CR_cmd,
                                             job_name='conv_rate',
                                             logger=logobject,
                                             drmaa_session=my_session,
                                             run_locally=False,
                                             working_directory=os.getcwd(),
                                             job_other_options='-p bioinfo')
            stdoutF.write("".join(stdout_res))
            stderrF.write("".join(stderr_res))

        # relay all the stdout, stderr, drmaa output to diagnose failures
        except Exception as err:
            logobject.error("Conversion rate error: %s" % err)
            raise
        else:
            logobject.info('Conversion rate calculation complete')
    return
Example #21
def get_flagstat(input_file, output_file):
    ii = input_file
    oo = output_file
    read_root = re.sub('.RGi.bam', '', os.path.basename(ii))
    cmd = os.path.join(sampath, 'samtools') + ' flagstat ' + ii + ' > ' + oo
    logger.info(cmd)
    with open(os.path.join(metout, "logs", "%s.flagstat.out" % read_root),
              'w') as stdoutF, open(
                  os.path.join(metout, "logs", "%s.flagstat.err" % read_root),
                  'w') as stderrF:
        try:
            stdout_res, stderr_res = run_job(cmd_str=cmd,
                                             job_name='fstat',
                                             logger=logger,
                                             drmaa_session=mySession,
                                             run_locally=False,
                                             working_directory=os.getcwd(),
                                             job_other_options='-p bioinfo ')

            stdoutF.write("".join(stdout_res))
            stderrF.write("".join(stderr_res))

        # relay all the stdout, stderr, drmaa output to diagnose failures
        except Exception as err:
            logger.error("Flagstat error: %s" % err)
            raise
        else:
            logger.info('Flagstat calculation complete')
Example #22
def cut_reads(infiles, outfiles):
    ii1=infiles[0]
    ii2=infiles[1]
    oo1=outfiles[0]
    oo2=outfiles[1]
    read_root=re.sub('_R1.fastq.gz','',os.path.basename(ii1))
    bshcmd=os.path.join(cutpath,'cutadapt') + ' -a AGATCGGAAGAGC -A AGATCGGAAGAGC --minimum-length 30  -n 5  -o ' + oo1 + ' -p ' + oo2 + ' ' + ii1 + ' ' + ii2
    logger.info(bshcmd)       
    with open(os.path.join(cutout,"logs","%s.cut_reads.out" % read_root),'w+') as stdoutF, open(os.path.join(cutout,"logs","%s.cut_reads.err" % read_root),'w+') as stderrF:
        try:
            stdout_res, stderr_res  = run_job(cmd_str   = bshcmd,
                                      job_name          = 'cut_reads',
                                      logger            = logger,
                                      drmaa_session     = mySession,
                                      run_locally       = False,
                                      working_directory = os.getcwd(),
                                      job_other_options = '-p bioinfo')
            stdoutF.write("".join(stdout_res))
            stderrF.write("".join(stderr_res))

        except error_drmaa_job as err:
            logger.error("Cut_reads error: %s" % err)
            raise
        else:
            logger.info('Adapter trimming complete')
Example #23
def BS_Mbias(INfile, OUTfile, POMpath, refG, metDir, nthreads, my_session,
             logobject):
    read_root = re.sub('.bam', '', os.path.basename(INfile))
    Mb_cmd = os.path.join(
        POMpath, 'MethylDackel'
    ) + ' mbias --txt ' + refG + ' ' + INfile + ' ' + OUTfile + ' -@ ' + str(
        nthreads) + ' > ' + OUTfile + '.txt'
    logobject.info(Mb_cmd)
    with open(os.path.join(metDir, "logs", "%s.mbias.out" % read_root),
              'w') as stdoutF, open(
                  os.path.join(metDir, "logs", "%s.mbias.err" % read_root),
                  'w') as stderrF:
        try:
            stdout_res, stderr_res = run_job(
                cmd_str=Mb_cmd,
                job_name='mbias',
                logger=logobject,
                drmaa_session=my_session,
                run_locally=False,
                working_directory=os.getcwd(),
                job_other_options='-p bioinfo --mem-per-cpu=10000 --mincpus=' +
                str(nthreads))

            stdoutF.write("".join(stdout_res))
            stderrF.write("".join(stderr_res))

        # relay all the stdout, stderr, drmaa output to diagnose failures
        except Exception as err:
            logobject.error("Methylation bias error: %s" % err)
            raise
        else:
            logobject.info('Methylation bias calculation complete')
    return
Example #24
def makeDB(input_files,output_file):
    ii1 = input_files[0]
    ii2 = input_files[1]
    ii3 = input_files[2]
    oo = output_file
    read_root=re.sub('_prin_flash.extendedFrags.sed.fastq.gz','',os.path.basename(ii1))
    oox=os.path.join(os.path.dirname(oo),(read_root + '.readDB'),read_root + '.flash.db')
    bshcmd='zcat -v ' + ii1 + ' ' + ii2 + ' ' + ii3 + ' | awk \'BEGIN{P=1}{if(P==1||P==2){gsub(/^[@]/,">");print}; if(P==4)P=0; P++}\' - | ' + os.path.join(blastpath,'makeblastdb ') + ' -in - -parse_seqids -dbtype nucl -out ' + oox + ' -title ' + read_root + '; ln -fs ' + oox + '.nal ' + oo
    logger.info(bshcmd)
    with open(os.path.join(DBout,"logs","%s.makeDB.out" % read_root),'w+') as stdoutF, open(os.path.join(DBout,"logs","%s.makeDB.err" % read_root),'w+') as stderrF:
        try:
            stdout_res, stderr_res  = run_job(cmd_str           = bshcmd,
                                          job_name          = 'makeDB',
                                          logger            = logger,
                                          drmaa_session     = mySession,
                                          run_locally       = False,
                                          working_directory = os.getcwd(),
                                          job_other_options = '-p bioinfo ')
            stdoutF.write("".join(stdout_res))
            stderrF.write("".join(stderr_res))

        # relay all the stdout, stderr, drmaa output to diagnose failures
        except error_drmaa_job as err:
            logger.error("MakeDB error: %s" % err)
            raise
        else:
            logger.info('Making database complete')
Example #25
def mod_Rnames(input_files,output_files):
    ii1 = input_files[0]
    ii2 = input_files[1]
    ii3 = input_files[2]
    oo1 = output_files[0]
    oo2 = output_files[1]
    oo3 = output_files[2]
    read_root=re.sub('_prin_flash.extendedFrags.fastq.gz','',os.path.basename(ii1))
    cmd1='zcat ' + ii1 + ' | sed \'s/\ /_/g\' - | gzip -c  > ' + oo1
    cmd2='zcat ' + ii2 + ' | sed \'s/\ /_/g\' - | gzip -c  > ' + oo2
    cmd3='zcat ' + ii3 + ' | sed \'s/\ /_/g\' - | gzip -c  > ' + oo3
    cmd_all=[cmd1,cmd2,cmd3]
    bshcmd=' ; '.join(cmd_all)
    logger.info(bshcmd)
    with open(os.path.join(cutout,"logs","%s.sed.out" % read_root),'w+') as stdoutF, open(os.path.join(cutout,"logs","%s.sed.err" % read_root),'w+') as stderrF:
        try:
            stdout_res, stderr_res  = run_job(cmd_str           = bshcmd,
                                          job_name          = 'sed',
                                          logger            = logger,
                                          drmaa_session     = mySession,
                                          run_locally       = False,
                                          working_directory = os.getcwd(),
                                          job_other_options = '-p bioinfo ')
            stdoutF.write("".join(stdout_res))
            stderrF.write("".join(stderr_res))

        # relay all the stdout, stderr, drmaa output to diagnose failures
        except error_drmaa_job as err:
            logger.error("Sed error: %s" % err)
            raise
        else:
            logger.info('Renaming reads complete')
Example #26
def merge_mates(input_files,output_file):
    ii1 = input_files[0]
    ii2 = input_files[1]
    oo = re.sub('.extendedFrags.fastq.gz','',os.path.basename(output_file[0]))
    read_root=re.sub('_prin_1.fastq.gz','',os.path.basename(ii1))
    bshcmd=os.path.join(flashpath,'flash')+ ' -z -M 300 -t 8 -o '+ oo + ' -d ' + cutout + ' ' + ii1 + ' ' + ii2 
    logger.info(bshcmd)
    with open(os.path.join(cutout,"logs","%s.flash.out" % read_root),'w+') as stdoutF, open(os.path.join(cutout,"logs","%s.flash.err" % read_root),'w+') as stderrF:
        try:
            stdout_res, stderr_res  = run_job(cmd_str           = bshcmd,
                                          job_name          = 'flash',
                                          logger            = logger,
                                          drmaa_session     = mySession,
                                          run_locally       = False,
                                          working_directory = os.getcwd(),
                                          job_other_options = '-p bioinfo --mincpus=8')
            stdoutF.write("".join(stdout_res))
            stderrF.write("".join(stderr_res))

        # relay all the stdout, stderr, drmaa output to diagnose failures
        except error_drmaa_job as err:
            logger.error("Flash error: %s" % err)
            raise
        else:
            logger.info('Merging mates complete')
Example #27
def conv_rate(input_files, output_file):
    ii1 = input_files[0]
    ii1sub = re.sub('_1.fastq.gz', '', ii1)
    oo = output_file
    read_root = os.path.basename(ii1sub)
    CR_cmd = '/data/boehm/group/pipelines/BS_amplicon_seq/v0.1.0/conversionRate_prin_KS.sh ' + ii1sub + ' ' + oo
    logger.info(CR_cmd)
    with open(os.path.join(metout, "logs", "%s.conv_rate.out.log" % read_root),
              'w') as stdoutF, open(
                  os.path.join(metout, "logs",
                               "%s.conv_rate.err.log" % read_root),
                  'w') as stderrF:
        try:
            stdout_res, stderr_res = run_job(cmd_str=CR_cmd,
                                             job_name='conv_rate',
                                             logger=logger,
                                             drmaa_session=mySession,
                                             run_locally=False,
                                             working_directory=os.getcwd(),
                                             job_other_options='-p bioinfo')
            stdoutF.write("".join(stdout_res))
            stderrF.write("".join(stderr_res))

        # relay all the stdout, stderr, drmaa output to diagnose failures
        except Exception as err:
            logger.error("Conversion rate error: %s" % err)
            raise
        else:
            logger.info('Conversion rate calculation complete')
Example #28
def BS_index_bam(INfile, sampath, bamoutDir, my_session, logobject):
    cmd_bamInd = os.path.join(sampath,
                              'samtools') + ' index ' + INfile + ';sleep 300'
    read_root = re.sub('.bam', '', os.path.basename(INfile))
    logobject.info(cmd_bamInd)
    with open(
            os.path.join(bamoutDir, "logs", "%s.bamIndex.out.log" % read_root),
            'w') as stdoutF, open(
                os.path.join(bamoutDir, "logs",
                             "%s.bamIndex.err.log" % read_root),
                'w') as stderrF:
        try:
            stdout_res, stderr_res = run_job(cmd_str=cmd_bamInd,
                                             job_name='bamIndex',
                                             logger=logobject,
                                             drmaa_session=my_session,
                                             run_locally=False,
                                             working_directory=os.getcwd(),
                                             job_other_options='-p bioinfo')
            stdoutF.write("".join(stdout_res))
            stderrF.write("".join(stderr_res))

        # relay all the stdout, stderr, drmaa output to diagnose failures
        except Exception as err:
            logobject.error("Bam_index error: %s" % err)
            raise
        else:
            logobject.info('Bam indexing complete')
    return
Example #29
def depth_of_cov(input_file, output_file):
    ii = input_file
    oos = output_file
    oos2 = oos.replace('.sample_summary', '')
    read_root = re.sub('.RGi.bam', '', os.path.basename(ii))
    #OUTlist2=oos2[2:]
    cmd_all = 'java -Xmx50g -Djava.io.tmpdir=/data/extended -jar ' + os.path.join(
        GATKpath, 'GenomeAnalysisTK.jar'
    ) + ' -R ' + refG + ' -T DepthOfCoverage -o ' + oos2 + ' -I ' + ii + ' -ct 0 -ct 1 -ct 2 -ct 5 -ct 10 -ct 15 -ct 20 -ct 30 -ct 50  -omitBaseOutput -mmq 10 --partitionType sample -L ' + args.intList
    logger.info(cmd_all)
    with open(os.path.join(metout, "logs",
                           "%s.depth_cov.out.log" % read_root),
              'w') as stdoutF, open(
                  os.path.join(metout, "logs",
                               "%s.depth_cov.err.log" % read_root),
                  'w') as stderrF:
        try:
            stdout_res, stderr_res = run_job(
                cmd_str=cmd_all,
                job_name='depth_cov',
                logger=logger,
                drmaa_session=mySession,
                run_locally=False,
                working_directory=os.getcwd(),
                job_other_options='-p bioinfo --mem=50000')
            stdoutF.write("".join(stdout_res))
            stderrF.write("".join(stderr_res))

        # relay all the stdout, stderr, drmaa output to diagnose failures
        except Exception as err:
            logger.error("Depth of coverage error: %s" % err)
            raise
        else:
            logger.info('Depth of coverage calculation complete')
Example #30
def run_piped_command(cfg, *args):
    run_locally = True
    retain_job_scripts = True
    job_script_dir = os.path.join(cfg.runs_scratch_dir, "drmaa")
    cpus = 1
    mem_per_cpu = 1024
    walltime = "24:00:00"

    stdout, stderr = "", ""
    job_options = "--ntasks=1 \
                   --cpus-per-task={cpus} \
                   --mem-per-cpu={mem} \
                   --time={time} \
                  ".format(cpus=cpus,
                           mem=int(1.2 * mem_per_cpu),
                           time=walltime)

    full_cmd = expand_piped_command(*args)
    print(full_cmd)
    try:
        stdout, stderr = run_job(full_cmd.strip(),
                                 job_other_options=job_options,
                                 run_locally=run_locally,
                                 retain_job_scripts=retain_job_scripts,
                                 job_script_directory=job_script_dir,
                                 logger=cfg.logger,
                                 working_directory=os.getcwd(),
                                 drmaa_session=cfg.drmaa_session)
    except error_drmaa_job as err:
        raise Exception("\n".join(
            map(str, ["Failed to run:", full_cmd, err, stdout, stderr])))
Example #31
def calc_Mbias(input_file, output_file):
    ii = input_file
    oo = output_file
    oos = re.sub('.txt', '', oo)
    read_root = re.sub('.bam', '', os.path.basename(ii))
    Mb_cmd = os.path.join(
        POMpath, 'MethylDackel') + ' mbias --txt --keepDupes -@ ' + str(
            args.nthreads
        ) + ' ' + refG + ' ' + ii + ' ' + oos + ' > ' + oo  #+ '.txt'
    logger.info(Mb_cmd)
    with open(os.path.join(metout, "logs", "%s.mbias.out" % read_root),
              'w') as stdoutF, open(
                  os.path.join(metout, "logs", "%s.mbias.err" % read_root),
                  'w') as stderrF:
        try:
            stdout_res, stderr_res = run_job(
                cmd_str=Mb_cmd,
                job_name='mbias',
                logger=logger,
                drmaa_session=mySession,
                run_locally=False,
                working_directory=os.getcwd(),
                job_other_options='-p bioinfo --mem-per-cpu=10000 --mincpus=' +
                str(args.nthreads))

            stdoutF.write("".join(stdout_res))
            stderrF.write("".join(stderr_res))

        # relay all the stdout, stderr, drmaa output to diagnose failures
        except Exception as err:
            logger.error("Methylation bias error: %s" % err)
            raise
        else:
            logger.info('Methylation bias calculation complete')
Example #32
def stringtie(input_file, output_file,abundance_file,qc_path,gtf,logger, logger_mutex):
  bam=os.path.basename(input_file)
  cmd = ( "source ~/.bashrc \n"
          "cd $TMPDIR \n"
          "mkdir reference \n"
          "cp {input_file} . \n"
          "cp {gtf} ./reference/gencode.gtf \n"
          "stringtie -p 8 -G ./reference/gencode.gtf -A {abundance_file} -o {output_file} -B -e -v {bam} \\\n"
          "2>{qc_path}/stringtie.log \n" )
  cmd = cmd.format(**locals())
  #print cmd
  try:
    stdout_res, stderr_res = "",""
    stdout_res, stderr_res = run_job(cmd,
                                     job_name = "stringtie",
                                     job_script_directory = "/home/sejjctj/Scratch/test_dir",
                                     job_other_options    = "-w n -S /bin/bash -l h_rt=04:00:00 -w n -l mem=4G -l tmpfs=60G -pe smp 8 -wd /home/sejjctj/Scratch -j yes ",
                                     #job_environment      = { 'BASH_ENV' : '/home/sejjctj/.bashrc' } ,
                                     retain_job_scripts   = True,
                                     working_directory    = "/home/sejjctj/Scratch",
                                     drmaa_session        = drmaa_session,
                                     logger = logger )

  except error_drmaa_job as err:
    raise Exception("\n".join(map(str,
                      ["Failed to run:",
                        cmd,
                        err,
                        stdout_res,
                        stderr_res])))

  with logger_mutex:
    logger.debug("stringtie worked")
Example #33
def sort_bam(input_file, output_file):
    ii = input_file
    oo = output_file
    read_root = re.sub('_pe.bam', '', os.path.basename(ii))
    cmd = os.path.join(sampath, 'samtools') + ' sort -T ' + os.path.join(
        '/data/extended', read_root) + ' -m 6G -@ ' + str(
            args.nthreads) + ' -o ' + oo + ' ' + ii
    logger.info(cmd)
    with open(os.path.join(bamoutO, "logs", "%s.bamsort.out" % read_root),
              'w+') as stdoutF, open(
                  os.path.join(bamoutO, "logs", "%s.bamsort.err" % read_root),
                  'w+') as stderrF:
        try:
            stdout_res, stderr_res = run_job(
                cmd_str=cmd,
                job_name='bamsort',
                logger=logger,
                drmaa_session=mySession,
                run_locally=False,
                working_directory=os.getcwd(),
                job_other_options='-p bioinfo --mincpus=' + str(args.nthreads))
            stdoutF.write("".join(stdout_res))
            stderrF.write("".join(stderr_res))

        # relay all the stdout, stderr, drmaa output to diagnose failures
        except error_drmaa_job as err:
            logger.error("Bam sorting error: %s" % err)
            raise
        else:
            logger.info('Bam sorting complete')
Example #34
def bam_to_tagAlign(input_file, output_file, out_dir, logger, logger_mutex):
  print "\n\ninput_file: " + str(input_file) + "\n\n"
  print "\n\ninput_file: " + str(output_file) + "\n\n"
  FINAL_BAM_FILE=os.path.basename(input_file)
  FINAL_BAM_PREFIX=FINAL_BAM_FILE[:-4]
  BAM_LOC=os.path.dirname(input_file)
  OFPREFIX=FINAL_BAM_FILE[:-4]
  FINAL_NMSRT_BAM=OFPREFIX + ".final_filt_nmsrt.bam"
  FINAL_NMSRT_BAM_PREFIX = FINAL_NMSRT_BAM[:-4]
  FINAL_BEDPE_FILE=FINAL_NMSRT_BAM_PREFIX + ".bedpe.gz"
  FINAL_TA_FILE=FINAL_BAM_PREFIX +".PE2SE.tagAlign.gz"
  NREADS=25000000
  SUBSAMPLED_TA_FILE=OFPREFIX + ".filt.nodup.sample" + str(25) + ".MATE1.tagAlign.gz"
  cmd = ("# =================== \n"
        "# Create tagAlign file \n"
        "# =================== \n"
        "source ~/.bashrc \n"
        "cd $TMPDIR \n"
        "cp {input_file} . \n"
        "cp {BAM_LOC}/{FINAL_NMSRT_BAM} . \n"
        "# Create virtual SE file containing both read pairs \n"
        "bedtools bamtobed -i {FINAL_BAM_FILE} \\\n"
        " | awk 'BEGIN{{OFS=\"\\t\"}}{{$4=\"N\";$5=\"1000\";print $0}}' | gzip -nc > {FINAL_TA_FILE} \n"
        "# ================ \n"
        "# Create BEDPE file \n"
        "# ================ \n"
        "bedtools bamtobed -bedpe -mate1 -i {FINAL_NMSRT_BAM} | gzip -nc > {FINAL_BEDPE_FILE} \n"
        "# ================================= \n"
        "# Subsample tagAlign file \n"
        "# Restrict to one read end per pair for CC analysis \n"
        "# ================================ \n"
        "zcat {FINAL_BEDPE_FILE} | grep -v \"chrM\" | shuf -n {NREADS} --random-source={FINAL_BEDPE_FILE}  \\\n"
        " | awk 'BEGIN{{OFS=\"\\t\"}}{{print $1,$2,$3,\"N\",\"1000\",$9}}' | gzip -nc > {SUBSAMPLED_TA_FILE} \n"
        "mv {FINAL_TA_FILE} {out_dir} \n"
        "mv {SUBSAMPLED_TA_FILE} {out_dir} \n"
        "mv {FINAL_BEDPE_FILE} {out_dir} ")
  cmd = cmd.format(**locals())

  try:
    stdout_res, stderr_res = "",""
    stdout_res, stderr_res = run_job(cmd,
                                     job_name = "bam2tag",
                                     job_script_directory = "/home/sejjctj/Scratch/test_dir",
                                     job_other_options    = "-S /bin/bash -V -l h_rt=10:00:00 -w n -l mem=24G -l tmpfs=30G -wd /home/sejjctj/Scratch -j yes ",
                                     #job_environment      = { 'BASH_ENV' : '/home/sejjctj/.bashrc' } ,
                                     retain_job_scripts   = True,  # retain job scripts for debuging, they go in Scratch/test_dir
                                     working_directory    = "/home/sejjctj/Scratch/test_dir",
                                     drmaa_session        = drmaa_session,
                                     logger = logger )

  except error_drmaa_job as err:
    raise Exception("\n".join(map(str,
                    ["Failed to run:",
                      cmd,
                      err,
                      stdout_res,
                      stderr_res])))

  with logger_mutex:
    logger.debug("bam_to_tagAlign worked")
Example #35
def cut_reads_user(INfile1, INfile2, OUTfile1, OUTfile2, cutpath, my_session,
                   cutout, logobject, args):
    read_root = os.path.basename(INfile1)[:-12]
    adapterSeq = "AGATCGGAAGAGC"
    if args.nextera:
        adapterSeq = "CTGTCTCTTATA"
    bshcmd = "{} cutadapt -a {} -A {} -q {} -m 30 -j {} {} -o {} -p {} {} {} ; sleep 300".format(
        cutpath, adapterSeq, adapterSeq, args.trimThreshold, args.nthreads,
        args.trimOtherArgs, OUTfile1, OUTfile2, INfile1, INfile2)
    with open(os.path.join(cutout, "logs", "%s.trim_reads.out" % read_root),
              'w+') as stdoutF, open(
                  os.path.join(cutout, "logs",
                               "%s.trim_reads.err" % read_root),
                  'w+') as stderrF:
        try:
            stdout_res, stderr_res = run_job(
                cmd_str=bshcmd,
                job_name='cut_reads',
                logger=logobject,
                drmaa_session=my_session,
                run_locally=False,
                working_directory=os.getcwd(),
                job_other_options='-p bioinfo --nodes=1=1 --mincpus={}'.format(
                    args.nthreads))
            stdoutF.write("".join(stdout_res))
            stderrF.write("".join(stderr_res))

        except error_drmaa_job as err:
            logobject.error("Cut_reads error: %s" % err)
            raise
    return
Example #36
def intAgg_stats(input_files, output_files):
    ii = os.path.join(CpGstat_out, input_files[1])
    oo = output_files
    Rcmd = os.path.join(
        Rpath, 'Rscript'
    ) + ' --no-save --no-restore /data/boehm/group/pipelines/BS_amplicon_seq/v0.1.0/BSampli.interval_stats.limma.R ' + intStat_out + ' ' + args.intList + ' ' + ii + ' ' + args.sampleInfo
    logger.info(Rcmd)
    with open(os.path.join(intStat_out, "logs", "interval_stats.out"),
              'w') as stdoutF, open(
                  os.path.join(intStat_out, "logs", "interval_stats.err"),
                  'w') as stderrF:
        try:
            stdout_res, stderr_res = run_job(
                cmd_str=Rcmd,
                job_name='agg_stats',
                logger=logger,
                drmaa_session=mySession,
                run_locally=False,
                working_directory=os.getcwd(),
                job_other_options='-p bioinfo')
            stdoutF.write("".join(stdout_res))
            stderrF.write("".join(stderr_res))

        # relay all the stdout, stderr, drmaa output to diagnose failures
        except Exception as err:
            logger.error("Interval stats error: %s" % err)
            raise
        else:
            logger.info('Interval stats calculation complete')
Example #37
def postTrim_fqc(input_files, output_files):
    ii1 = input_files[0]
    ii2 = input_files[1]
    read_root = re.sub('_prin_1.fastq.gz', '', os.path.basename(ii1))
    bshcmd = os.path.join(
        FQCpath,
        'fastqc ') + ' --outdir ' + fqcout + ' -t 8 ' + ii1 + ' ' + ii2
    logger.info(bshcmd)
    with open(os.path.join(fqcout, "logs", "%s.post_fqc.out" % read_root),
              'w+') as stdoutF, open(
                  os.path.join(fqcout, "logs", "%s.post_fqc.err" % read_root),
                  'w+') as stderrF:
        try:
            stdout_res, stderr_res = run_job(
                cmd_str=bshcmd,
                job_name='post_fqc',
                logger=logger,
                drmaa_session=mySession,
                run_locally=False,
                working_directory=os.getcwd(),
                job_other_options='-p bioinfo --mincpus=8')
            stdoutF.write("".join(stdout_res))
            stderrF.write("".join(stderr_res))

        # relay all the stdout, stderr, drmaa output to diagnose failures
        except error_drmaa_job as err:
            logger.error("Post_trim_fastqc error: %s" % err)
            raise
        else:
            logger.info('Post trim fastqc complete')
Example #38
def post_trim_fqc(INfile1, INfile2, fqcout, FQCpath, my_session, logobject):
    read_root = re.sub('_R1.fastq.gz', '', os.path.basename(INfile1))
    bshcmd = os.path.join(
        FQCpath,
        'fastqc ') + ' --outdir ' + fqcout + ' -t 8 ' + INfile1 + ' ' + INfile2
    with open(os.path.join(fqcout, "logs", "%s.post_fqc.out" % read_root),
              'w+') as stdoutF, open(
                  os.path.join(fqcout, "logs", "%s.post_fqc.err" % read_root),
                  'w+') as stderrF:
        try:
            stdout_res, stderr_res = run_job(
                cmd_str=bshcmd,
                job_name='post_fqc',
                logger=logobject,
                drmaa_session=my_session,
                run_locally=False,
                working_directory=os.getcwd(),
                job_other_options='-p bioinfo --nodes=1=1 --mincpus=8')
            stdoutF.write("".join(stdout_res))
            stderrF.write("".join(stderr_res))

        # relay all the stdout, stderr, drmaa output to diagnose failures
        except error_drmaa_job as err:
            logobject.error("Post_trim_fastqc error: %s" % err)
            raise
    return
Example #39
def run_exonerate(input_file, output_file, genome_filename, query_filename):
	twobit_filename = FASTA_RE_COMPILED.sub('.2bit', genome_filename)
	job_name = input_file.replace('.genblastA.gff3', '.sge')
	job = 'run_est_mapping.py --query_type {}'.format(args.query_type)
	job += ' --upstream {} --downstream {} --mapper exonerate --save_mapper_output --augustus_hints'.format(
		   args.exonerate_upstream, args.exonerate_downstream)
	if args.extra_exonerate_args:
		job += ' --extra_mapper_args "{}"'.format(args.extra_exonerate_args)
	job += ' {} {} {} {}'.format(query_filename, input_file, twobit_filename, output_file)
	job_queue = 'all.q'
	job_env = dict(PATH=PATH_val, PYTHONPATH=PYTHONPATH_val)
	if not args.run_local:
		job_env['MODULESHOME'] = args.modules_home
	run_job(job, job_name=job_name, job_other_options='-q {}'.format(job_queue),
		    job_environment=job_env, drmaa_session=drmaa_session, 
		    working_directory=args.working_directory,
		    run_locally=args.run_local, logger=logger)
Example #40
def bowtie2(input_files, out_file, path, outpath,qc_folder,logger, logger_mutex):
    flat_list = [item for sublist in input_files for item in sublist]
    first_reads = []
    second_reads =[]
    for i in flat_list:
        if re.search('val_1', os.path.basename(i)):
            first_reads.append(os.path.basename(i))
        elif re.search('val_2', os.path.basename(i)):
            second_reads.append(os.path.basename(i))
    first_reads = ','.join(first_reads)
    second_reads = ','.join(second_reads)
    bowtie2_output = out_file.split('/')
    bowtie2_output = bowtie2_output[-1]

    cmd = ( " cd $TMPDIR \n"
            " mkdir reference \n"
            " mkdir temporary \n"
            " cp  {path}"  + "/*fq.gz" + " . \n "
            " ls -l > {qc_folder}/log \n"
            " date \n"
            " cp $HOME/Scratch/reference/grch38/bowtie2/*bt2 ./reference \n"
            " bowtie2 -k 4 -X2000 --mm --local --threads 8 \\\n"
            " -x  ./reference/GCA_000001405.15_GRCh38_no_alt_plus_hs38d1_analysis_set.fna.bowtie_index \\\n"
            " -1 {first_reads} \\\n"
            " -2 {second_reads} \\\n"
            " 2> {qc_folder}/bowtie2.log \\\n"
            " | samtools view -bS - -o temp.bam 2>{qc_folder}/samtools.log \n"
            " ls -lh >> {qc_folder}/list.log \n"
            " ~/applications/sambamba/sambamba_v0.6.6 sort -p -m 4G -t 8 --tmpdir=./temporary temp.bam -o " + bowtie2_output + " \n" 
            " ls -lh >> {qc_folder}/list.log \n"
            " cp " + bowtie2_output + " {outpath} \n"
            " rm -r * ")
    cmd = cmd.format(**locals())
    #print cmd
    try:
        stdout_res, stderr_res = "",""
        stdout_res, stderr_res = run_job(cmd,
                                        job_name = "bowtie2",
                                        job_script_directory = "/home/sejjctj/Scratch/test_dir",
                                        job_other_options    = "-w n -S /bin/bash -V -l h_rt=12:00:00 -w n -l mem=4G -l tmpfs=80G -pe smp 8 -wd /home/sejjctj/Scratch -j yes ",
                                        #job_environment      = { 'BASH_ENV' : '/home/sejjctj/.bashrc' } ,
                                        retain_job_scripts   = True,  # retain job scripts for debuging, they go in Scratch/test_dir
                                        working_directory    = "/home/sejjctj/Scratch",
                                        drmaa_session        = drmaa_session,
                                        logger = logger )

    except error_drmaa_job as err:
        raise Exception("\n".join(map(str,
                        ["Failed to run:",
                         cmd,
                         err,
                         stdout_res,
                         stderr_res])))

    with logger_mutex:
        logger.debug("bowtie2 worked")
Example #41
def run_stage(state, stage, command):
    '''Run a pipeline stage, either locally or on the cluster'''

    # Grab the configuration options for this stage
    config = state.config
    modules = config.get_stage_option(stage, 'modules')
    mem = config.get_stage_option(stage, 'mem') * MEGABYTES_IN_GIGABYTE 
    account = config.get_stage_option(stage, 'account')
    queue = config.get_stage_option(stage, 'queue')
    walltime = config.get_stage_option(stage, 'walltime')
    run_local = config.get_stage_option(stage, 'local')
    cores = config.get_stage_option(stage, 'cores')
    pipeline_id = config.get_option('pipeline_id')
    job_name = pipeline_id + '_' + stage

    # Generate a "module load" command for each required module
    if modules is not None:
        module_loads = '\n'.join(['module load ' + module for module in modules])
    else:
        module_loads = '\n'
    cluster_command = '\n'.join([module_loads, command])

    # Specify job-specific options for SLURM
    job_options = '--nodes=1 --ntasks-per-node={cores} --ntasks={cores} --time={time} --mem={mem} --partition={queue} --account={account}' \
                      .format(cores=cores, time=walltime, mem=mem, queue=queue, account=account)

    # Log a message about the job we are about to run
    log_messages = ['Running stage: {}'.format(stage),
                    'Command: {}'.format(command)]
    if not run_local:
        log_messages.append('Job options: {}'.format(job_options))
    state.logger.info('\n'.join(log_messages))

    # Run the job, capturing stdout and stderr
    stdout_res, stderr_res = None, None
    try:
        stdout_res, stderr_res = \
            run_job(cmd_str=cluster_command,
                job_name = job_name,
                logger = state.logger.proxy,
                drmaa_session = state.drmaa_session,
                # Determines whether to run the command on the local
                # machine or run it on the cluster
                run_locally = run_local,
                # Keep a copy of the job script for diagnostic purposes
                retain_job_scripts = True,
                retain_stdout = True,
                retain_stderr = True,
                job_script_directory = state.options.jobscripts, 
                job_other_options = job_options)
    except error_drmaa_job as err:
        raise Exception("\n".join(map(str, ["Failed to run:", command, err, stdout_res, stderr_res])))
Example #42
def alignFastqByBowtie(FqFileName, OutputBamFileName, config):
    """
    To align '.fastq' to genome.
    Arguments:
    - `FqFileName`: file to be processed
    - `OutputBamFileName`: output bam file
    - `config`: config
    """
    if "aligner" in config:
        if config["aligner"] == "bowtie":
            cmds = ['fastq2bam_by_bowtie.sh']
            cmds.append(FqFileName)
            cmds.append(expandOsPath(config['bowtie_index']))
        elif config["aligner"] == "bowtie2":
            cmds = ['fastq2bam_by_bowtie2.sh']
            cmds.append(FqFileName)
            cmds.append(config['bowtie_index'])
        else:
            raise KeyError
    else:
        cmds = ['fastq2bam_by_bowtie.sh']
        cmds.append(FqFileName)
        cmds.append(expandOsPath(config['bowtie_index']))

    target = expandOsPath(os.path.join(config["project_dir"], config["data_dir"]))
    cmds.append(target)
    cmds.append(config["pair_end"])
    cores = int(config['cores'])
    if cores == 0:
        cores = 1
    cmds.append(str(cores))
    logfile = FqFileName + ".alignment.log"

    run_job(" ".join(cmds),
        job_name = "alignFastqByBowtie_" + os.path.basename(FqFileName),
        job_other_options = cluster_options(config, "alignFastqByBowtie", cores, logfile),
        job_script_directory = os.path.dirname(os.path.realpath(__file__)),
        job_environment={ 'BASH_ENV' : '~/.bash_profile' },
        retain_job_scripts = True, drmaa_session=my_drmaa_session)

    return 0
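# cluster_options is called throughout these examples but never shown. Below
# is a plausible sketch -- an assumption, not the original helper: it builds
# an SGE-style option string from per-task config values, the core count and
# a log file. The config keys used here are hypothetical.
def cluster_options(config, task_name, cores, logfile):
    """Assemble qsub-style options for one task (illustrative sketch)."""
    mem = config.get(task_name + "_mem", "4G")            # assumed key
    runtime = config.get(task_name + "_rt", "08:00:00")   # assumed key
    return ("-S /bin/bash -l h_rt={rt} -l mem={mem} "
            "-pe smp {cores} -j yes -o {log}").format(
                rt=runtime, mem=mem, cores=cores, log=logfile)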
Exemplo n.º 43
0
def hisat2(input_files, out_file, path, outpath,qc_folder,hisat_genome_index,logger, logger_mutex):
    flat_list = [item for sublist in input_files for item in sublist]
    first_reads = []
    second_reads =[]
    for i in flat_list:
        if re.search('val_1', os.path.basename(i)):
            first_reads.append(os.path.basename(i))
        elif re.search('val_2', os.path.basename(i)):
            second_reads.append(os.path.basename(i))
    first_reads = ','.join(first_reads)
    second_reads = ','.join(second_reads)
    hisat_output = out_file.split('/')
    hisat_output = hisat_output[-1]
    
    cmd = ( "source ~/.bashrc \n"
            "cd $TMPDIR \n"
            "mkdir reference \n"
            "cp  {path}/*fq.gz  . \n"
            "cp {hisat_genome_index}/genome* ./reference \n"
            "hisat2 -p 8 -x ./reference/genome_snp_tran  --dta-cufflinks \\\n"
            "--novel-splicesite-outfile ./novel_splice.txt \\\n"
            "--novel-splicesite-infile ./novel_splice.txt \\\n"
            "-1 {first_reads} \\\n"
            "-2 {second_reads} \\\n"
            "2> {qc_folder}/hisat.log | samtools view -bS - -o temp.bam \n"
            "samtools sort -@ 8 temp.bam -m 4G " + hisat_output[:-4] + " 2>{qc_folder}/samtools.log \n"
            "mv {hisat_output} {outpath} \n"
            "mv novel_splice.txt {outpath} \n")
    cmd = cmd.format(**locals())
    try:
        stdout_res, stderr_res = "",""
        stdout_res, stderr_res = run_job(cmd,
                                        job_name = "hisat",
                                        job_script_directory = "/home/sejjctj/Scratch/test_dir",
                                        job_other_options    = "-w n -S /bin/bash -l h_rt=08:00:00 -w n -l mem=4G -l tmpfs=60G -pe smp 8 -wd /home/sejjctj/Scratch -j yes ",
                                        #job_environment      = { 'BASH_ENV' : '/home/sejjctj/.bashrc' } ,
                                        retain_job_scripts   = True,  # retain job scripts for debuging, they go in Scratch/test_dir
                                        working_directory    = "/home/sejjctj/Scratch",
                                        drmaa_session        = drmaa_session,
                                        logger = logger )

    except error_drmaa_job as err:
        raise Exception("\n".join(map(str,
                        ["Failed to run:",
                         cmd,
                         err,
                         stdout_res,
                         stderr_res])))

    with logger_mutex:
        logger.debug("hisat worked")
Exemplo n.º 44
0
def trim_fastq(input_files, output_files, qc_folder, output_folder ,logger, logger_mutex):
    print "OUTPUT FILES!   " + str(output_files)    
    if len(input_files) !=2:
        raise Exception("One of the reads pairs %s missing" % (input_files,))
    cmd = ( " source ~/.bashrc \n"
            " date \n"
            " echo $HOSTNAME \n"
            " cd $TMPDIR \n"
            " cp {input_files[0]} . \n"
            " cp {input_files[1]} . \n"
            " basename1=$(basename {input_files[0]}) \n"
            " basename2=$(basename {input_files[1]}) \n"
            " date \n"
            " ls -l \n"
            #" trim_galore --fastqc --paired {basenames[0]} {basenames[1]} &> {qc_folder}/trim_galore.log \n"
            " trim_galore --fastqc --paired $basename1 $basename2 &> {qc_folder}/trim_galore.log \n"
            " mv *.fq.gz  {output_folder} \n"
            " mv *fastqc*  {qc_folder} \n"
            " mv *report* {qc_folder}; rm * \n" )
  
    job_name = "trim_fastqc"
    # fill in the {} placeholders in cmd from the local variables
    cmd = cmd.format(**locals())
    #print(cmd)  
    try:
    
      stdout_res, stderr_res = "",""
      stdout_res, stderr_res = run_job(cmd,
                                      job_name,
                                      job_script_directory = "/home/sejjctj/Scratch/test_dir",
                                      job_other_options    = "-w n -S /bin/bash -l h_rt=05:00:00 -l mem=4G -l tmpfs=60G -wd /home/sejjctj/Scratch -j yes",
                                      #job_environment      = { 'BASH_ENV' : '/home/sejjctj/.bashrc' } ,
                                      retain_job_scripts   = True,
                                      working_directory    = "/home/sejjctj/Scratch",
                                      drmaa_session        = drmaa_session,
                                      logger = logger )                                      
   #                                   
    except error_drmaa_job as err:
        raise Exception("\n".join(map(str,
                      ["Failed to run:",
                        cmd,
                        err,
                        stdout_res,
                        stderr_res])))
                                  
   
    with logger_mutex:
        logger.debug("trim_fastq worked")
Exemplo n.º 45
0
def create_pseudoreplicates(input_file, output_file, out_dir, logger, logger_mutex):
  FINAL_BEDPE_FILE=os.path.basename(input_file)
  PR_PREFIX=FINAL_BEDPE_FILE[:-26]
  PR1_TA_FILE=PR_PREFIX + ".PE2SE.pr1.tagAlign.gz"
  PR2_TA_FILE=PR_PREFIX + ".PE2SE.pr2.tagAlign.gz"
  cmd = ("# ========================\n"
       "# Create pseudoReplicates\n"
       "# =======================\n"
       "source ~/.bashrc \n"
       "cd $TMPDIR \n"
       "cp {input_file} . \n"
       "# Get total number of read pairs \n"
       "nlines=$( zcat {FINAL_BEDPE_FILE} | wc -l ) \n"
       "nlines=$(( (nlines + 1) / 2 )) \n"
        "# Shuffle and split BEDPE file into 2 equal parts \n"
       "zcat {FINAL_BEDPE_FILE} | shuf --random-source={FINAL_BEDPE_FILE} | split -d -l $nlines - {PR_PREFIX} \n"
       "# Will produce {PR_PREFIX}00 and {PR_PREFIX}01 \n"
       "# Convert read pairs to reads into standard tagAlign file \n"
       "awk 'BEGIN{{OFS=\"\\t\"}}{{printf \"%s\\t%s\\t%s\\tN\\t1000\\t%s\\n%s\\t%s\\t%s\\tN\\t1000\\t%s\\n\",$1,$2,$3,$9,$4,$5,$6,$10}}' {PR_PREFIX}00 | gzip -nc > {PR1_TA_FILE} \n"        
       "rm {PR_PREFIX}00 \n"
       "awk 'BEGIN{{OFS=\"\\t\"}}{{printf \"%s\\t%s\\t%s\\tN\\t1000\\t%s\\n%s\\t%s\\t%s\\tN\\t1000\\t%s\\n\",$1,$2,$3,$9,$4,$5,$6,$10}}' {PR_PREFIX}01 | gzip -nc > {PR2_TA_FILE} \n"
       "rm {PR_PREFIX}01 \n"
       "mv {PR1_TA_FILE} {out_dir} \n"
       "mv {PR2_TA_FILE} {out_dir} "
    )
  cmd = cmd.format(**locals())
  try:
    stdout_res, stderr_res = "",""
    stdout_res, stderr_res = run_job(cmd,
                                     job_name = "create_pseudo",
                                     job_script_directory = "/home/sejjctj/Scratch/test_dir",
                                     job_other_options    = "-S /bin/bash -V -l h_rt=01:00:00 -w n -l mem=8G -l tmpfs=30G -wd /home/sejjctj/Scratch -j yes ",
                                     #job_environment      = { 'BASH_ENV' : '/home/sejjctj/.bashrc' } ,
                                     retain_job_scripts   = True,  # retain job scripts for debuging, they go in Scratch/test_dir
                                     working_directory    = "/home/sejjctj/Scratch/test_dir",
                                     drmaa_session        = drmaa_session,
                                     logger = logger )

  except error_drmaa_job as err:
    raise Exception("\n".join(map(str,
                    ["Failed to run:",
                      cmd,
                      err,
                      stdout_res,
                      stderr_res])))

  with logger_mutex:
    logger.debug("create_pseudoreplicates")
Exemplo n.º 46
0
def phantom_peak_quals(input_file, output_file, out_dir, outfile1,outfile2,logger, logger_mutex):
  
  SUBSAMPLED_TA_FILE=os.path.basename(input_file)
  SUBSAMPLED_TA_FILE=SUBSAMPLED_TA_FILE[:-25] + "filt.nodup.sample25.MATE1.tagAlign.gz"
 
  cmd = ("#########################\n"
         "# run  phantompeakquals #\n"
         "#########################\n"
         "source ~/.bashrc \n"
         "cd $TMPDIR \n"
         "mkdir job_temp \n"
         "mv {out_dir}/{SUBSAMPLED_TA_FILE} . \n"
         "Rscript ~/applications/phantompeakqualtools/run_spp.R "
         " -c={SUBSAMPLED_TA_FILE} -filtchr=chrM "
         " -savp={outfile1} -out={outfile2} "
         " -tmpdir=./job_temp \n" 
         "echo -e \"Filename\\tnumReads\\testFragLen\\tcorr_estFragLen\\tPhantomPeak\\tcorr_phantomPeak\\targmin_corr\\tmin_corr\\tphantomPeakCoef\\trelPhantomPeakCoef\\tQualityTag\" > header \n"
         "sed -r 's/,[^\\t]+//g' {outfile2} > temp \n"
         "cat header temp > temporary && mv temporary temp \n"
         "mv temp {outfile2} \n"
         "mv {outfile2} {out_dir}\n"
         "mv {outfile1} {out_dir}")

  cmd = cmd.format(**locals())
  try:
    stdout_res, stderr_res = "",""
    stdout_res, stderr_res = run_job(cmd,
                                     job_name = "phantom",
                                     job_script_directory = "/home/sejjctj/Scratch/test_dir",
                                     job_other_options    = "-S /bin/bash -V -l h_rt=10:00:00 -w n -l mem=24G -l tmpfs=30G -wd /home/sejjctj/Scratch -j yes ",
                                     #job_environment      = { 'BASH_ENV' : '/home/sejjctj/.bashrc' } ,
                                     retain_job_scripts   = True,  # retain job scripts for debuging, they go in Scratch/test_dir
                                     working_directory    = "/home/sejjctj/Scratch/test_dir",
                                     drmaa_session        = drmaa_session,
                                     logger = logger )

  except error_drmaa_job as err:
    raise Exception("\n".join(map(str,
                    ["Failed to run:",
                      cmd,
                      err,
                      stdout_res,
                      stderr_res])))

  with logger_mutex:
    logger.debug("create_pseudoreplicates")
Exemplo n.º 47
0
def bowtie2(input_files, out_file, path, outpath,qc_folder,logger, logger_mutex):
    print(out_file)
    reads = []
    for i in input_files:
      reads.append(os.path.basename(i))
    reads = ','.join(reads)
    print(reads)
    bowtie2_output = out_file.split('/')
    bowtie2_output = bowtie2_output[-1]
    cmd = ( "cd $TMPDIR \n"
            "mkdir reference \n"
            "cp  {path}"  + "/*fq.gz" + " . \n "
            "ls -l \n"
            "date \n"
            "cp $HOME/Scratch/reference/grch38/bowtie2/*bt2 ./reference \n "
            " bowtie2 -k 4 -X2000 --mm --local --threads 8 "
            " -x  ./reference/GCA_000001405.15_GRCh38_no_alt_plus_hs38d1_analysis_set.fna.bowtie_index "
            " -U {reads} 2> {qc_folder}/bowtie2.log | samtools view -bS - -o temp.bam \n"
            " samtools sort -@ 8 temp.bam -m 2G " + bowtie2_output[:-4] + " 2>{qc_folder}/samtools.log \n"
            " samtools flagstat {bowtie2_output} > {qc_folder}/{bowtie2_output}.mapstats \n"
            " cp {bowtie2_output} {outpath} \n"
            " rm -r * \n")
    cmd = cmd.format(**locals())
    #print cmd
    try:
        stdout_res, stderr_res = "",""
        stdout_res, stderr_res = run_job(cmd,
                                        job_name = "bowtie2",
                                        job_script_directory = "/home/sejjctj/Scratch/test_dir",
                                        job_other_options    = "-w n -S /bin/bash -V -l h_rt=08:00:00 -w n -l mem=2G -l tmpfs=60G -pe smp 8 -wd /home/sejjctj/Scratch -j yes ",
                                        job_environment      = { 'BASH_ENV' : '/home/sejjctj/.bashrc' } ,
                                        retain_job_scripts   = True,  # retain job scripts for debuging, they go in Scratch/test_dir
                                        working_directory    = "/home/sejjctj/Scratch",
                                        drmaa_session        = drmaa_session,
                                        logger = logger )

    except error_drmaa_job as err:
        raise Exception("\n".join(map(str,
                        ["Failed to run:",
                         cmd,
                         err,
                         stdout_res,
                         stderr_res])))

    with logger_mutex:
        logger.debug("bowtie2 worked")
Exemplo n.º 48
0
def bam_to_tagAlign(input_file, output_file, out_dir,prefix, logger, logger_mutex):
  FINAL_BAM_FILE=os.path.basename(input_file)
  FINAL_BAM_PREFIX=prefix
  cmd = ("# =================== \n"
         "# Create tagAlign file \n"
         "# =================== \n"
         "cd $TMPDIR \n"
         "cp {input_file} . \n"
         "FINAL_TA_FILE={FINAL_BAM_PREFIX}.tagAlign.gz \n"
         "bedtools bamtobed -i {FINAL_BAM_FILE} | awk 'BEGIN{{OFS=\"\\t\"}}{{$4=\"N\";$5=\"1000\";print $0}}' | gzip -nc > \"$FINAL_TA_FILE\" \n"
         "# ================================= \n"
         "# Subsample tagAlign file \n"
         "# ================================ \n"
         "NREADS=15000000 \n"
         "SUBSAMPLED_TA_FILE={FINAL_BAM_PREFIX}.sample.tagAlign.gz\n"
         "zcat \"$FINAL_TA_FILE\" | grep -v chrM | shuf -n \"$NREADS\" --random-source=\"$FINAL_TA_FILE\" | gzip -nc > \"$SUBSAMPLED_TA_FILE\" \n"
         "mv \"$SUBSAMPLED_TA_FILE\" {out_dir} \n"
         "mv \"$FINAL_TA_FILE\" {out_dir}")
  cmd = cmd.format(**locals())

  try:
    stdout_res, stderr_res = "",""
    stdout_res, stderr_res = run_job(cmd,
                                     job_name = "bam2tag",
                                     job_script_directory = "/home/sejjctj/Scratch/test_dir",
                                     job_other_options    = "-S /bin/bash -V -l h_rt=10:00:00 -w n -l mem=24G -l tmpfs=30G -wd /home/sejjctj/Scratch -j yes ",
                                     job_environment      = { 'BASH_ENV' : '/home/sejjctj/.bashrc' } ,
                                     retain_job_scripts   = True,  # retain job scripts for debuging, they go in Scratch/test_dir
                                     working_directory    = "/home/sejjctj/Scratch/test_dir",
                                     drmaa_session        = drmaa_session,
                                     logger = logger )

  except error_drmaa_job as err:
    raise Exception("\n".join(map(str,
                    ["Failed to run:",
                      cmd,
                      err,
                      stdout_res,
                      stderr_res])))

  with logger_mutex:
    logger.debug("bam_to_tagAlign worked")
Exemplo n.º 49
0
def blacklist(input_file, output_file,out_dir, logger, logger_mutex): 
  cmd = ("#===================================\n"
         "#  run mac2 2 on tn5 shifted files  \n"
         "#===================================\n"
         "source ~/.bashrc \n"
         "cd $TMPDIR \n"
         "cp {out_dir}/*narrowPeak.gz . \n"
         "blacklist=\"/home/sejjctj/Scratch/reference/grch38/chipseq_blacklist/hg38.blacklist.bed.gz\" \n"
         "for peak in *narrowPeak.gz \n"
         "do \n"
         "prefix=\"${{tag:0:${{#tag}}-14}}\"   #remove .narrowPeak.gz \n" 
         "filtered_peak=\"${{prefix}}\".narrowPeak.filt.gz \n"
         "bedtools intersect -v a ${{peak}} -b ${{blacklist}} \\\n"
         "| awk 'BEGIN{{OFS=\"\\t\"}}{{if($5>1000) $5=1000; print $0}}' \\\n"
         "| grep -P 'chr[\dXY]+[\\t]' | gzip -nc > ${{filtered_peak}} \n"
         "mv ${{filtered_peak}} {out_dir} \n"
         "done \n")

  cmd = cmd.format(**locals())
  try:
    stdout_res, stderr_res = "",""
    stdout_res, stderr_res = run_job(cmd,
                                     job_name = "blacklist",
                                     job_script_directory = "/home/sejjctj/Scratch/test_dir",
                                     job_other_options    = "-S /bin/bash -V -l h_rt=08:00:00 -w n -l mem=16G -l tmpfs=60G -wd /home/sejjctj/Scratch -j yes ",
                                     #job_environment      = { 'BASH_ENV' : '/home/sejjctj/.bashrc' } ,
                                     retain_job_scripts   = True,  # retain job scripts for debuging, they go in Scratch/test_dir
                                     working_directory    = "/home/sejjctj/Scratch/test_dir",
                                     drmaa_session        = drmaa_session,
                                     logger = logger )
  except error_drmaa_job as err:
    raise Exception("\n".join(map(str,
                    ["Failed to run:",
                      cmd,
                      err,
                      stdout_res,
                      stderr_res])))

  with logger_mutex:
    logger.debug("blacklist")
Exemplo n.º 50
0
def qorts(input_file, output_file, log_file, gtf, logger, logger_mutex):
    bam=os.path.basename(input_file[0])
    cmd = (" source ~/.bashrc \n"
           " cd $TMPDIR; mkdir tmp \n"
           " cp {input_file[0]} ./ \n"
           " samtools sort -n -m 12G -T prefix -O bam {bam} > namesort.bam \n"
           " java -Xmx48G -Djava.io.tmpdir=./tmp \\\n"
           " -jar ~/applications/QoRTs/QoRTs.jar QC \\\n" 
           " --nameSorted \\\n"
           " --minMAPQ 60 \\\n"
           " --maxReadLength 100 \\\n"
           " namesort.bam \\\n"
           " {gtf} \\\n"
           " {output_file} \\\n"
           " 2>{log_file} " )
    cmd = cmd.format(**locals())
    #print cmd
    try:
      stdout_res, stderr_res = "",""
      stdout_res, stderr_res = run_job(cmd,
                                     job_name = "qorts",
                                     job_script_directory = "/home/sejjctj/Scratch/test_dir",
                                     job_other_options    = "-w n -S /bin/bash  -l h_rt=08:00:00 -w n -l mem=48G -l tmpfs=30G -wd /home/sejjctj/Scratch -j yes ",
                                     #job_environment      = { 'BASH_ENV' : '/home/sejjctj/.bashrc' } ,
                                     retain_job_scripts   = True,  # retain job scripts for debuging, they go in Scratch/test_dir
                                     working_directory    = "/home/sejjctj/Scratch",
                                     drmaa_session        = drmaa_session,
                                     logger = logger )

    except error_drmaa_job as err:
      raise Exception("\n".join(map(str,
                      ["Failed to run:",
                        cmd,
                        err,
                        stdout_res,
                        stderr_res])))

    with logger_mutex:
      logger.debug("qorts worked")
Exemplo n.º 51
0
def cufflinks(input_file, output_file, path,qc_path,gtf,genome,mask,genome_name,logger, logger_mutex):
  bam=os.path.basename(input_file)
  my_mask=os.path.basename(mask)
  cmd = ( "source ~/.bashrc \n"
          "cd $TMPDIR \n"
          "mkdir reference \n"
          "cp {input_file} . \n"
          "cp {genome}*fa* ./reference  \n"
          "cp {gtf} ./reference/gencode.gtf \n"
          "cp {mask} ./reference \n"
          "cufflinks -q -u --no-update-check -p 8 -G ./reference/gencode.gtf \\\n"
          "-b ./reference/{genome_name} \\\n"
          "--mask-file ./reference/{my_mask} {bam} \\\n"
          "-o  {path}  \\\n"
          "2>{qc_path}/cufflinks.log \n" )
  cmd = cmd.format(**locals())
  #print cmd
  try:
    stdout_res, stderr_res = "",""
    stdout_res, stderr_res = run_job(cmd,
                                     job_name = "cufflinks",
                                     job_script_directory = "/home/sejjctj/Scratch/test_dir",
                                     job_other_options    = "-w n -S /bin/bash -l h_rt=04:00:00 -w n -l mem=4G -l tmpfs=60G -pe smp 8 -wd /home/sejjctj/Scratch -j yes ",
                                     #job_environment      = { 'BASH_ENV' : '/home/sejjctj/.bashrc' } ,
                                     retain_job_scripts   = True,
                                     working_directory    = "/home/sejjctj/Scratch",
                                     drmaa_session        = drmaa_session,
                                     logger = logger )

  except error_drmaa_job as err:
    raise Exception("\n".join(map(str,
                      ["Failed to run:",
                        cmd,
                        err,
                        stdout_res,
                        stderr_res])))

  with logger_mutex:
    logger.debug("cufflinks worked")
Exemplo n.º 52
0
def star_fusion(input_files, out_file,sample, outpath,qc_folder,logger, logger_mutex):
  fusion_input = input_files[1]
  fusion_name = os.path.basename(fusion_input)
  cmd = ( "source ~/.bashrc \n"
          "module unload perl \n"
          "module load perl/5.16.0 \n"
          "export PERL5LIB=$PERL5LIB:/home/sejjctj/perl5/lib/perl5 \n"
          "cd $TMPDIR \n "
          "cp {fusion_input}  . \n "
          "awk 'BEGIN{{OFS=\"\\t\"}}{{$1=\"chr\"$1;$4=\"chr\"$4;print $0}}' {fusion_name} > temp && mv temp {fusion_name} \n"
          "STAR-Fusion \\\n"
          "--genome_lib_dir /home/sejjctj/Scratch/reference/star_single_cell/fusion/GRCh38_v27_CTAT_lib_Feb092018/ctat_genome_lib_build_dir \\\n"
          "-J {fusion_name} \\\n"
          "--output_dir {outpath} \n" )

  cmd = cmd.format(**locals())
  #print cmd
  try:
    stdout_res, stderr_res = "",""
    stdout_res, stderr_res = run_job(cmd,
                                     job_name = "star_fusion",
                                     job_script_directory = "/home/sejjctj/Scratch/test_dir",
                                     job_other_options    = "-w n -S /bin/bash -l h_rt=02:00:00 -w n -l mem=24G -l tmpfs=60G  -wd /home/sejjctj/Scratch -j yes ",
                                     #job_environment      = { 'BASH_ENV' : '/home/sejjctj/.bashrc' } ,
                                     retain_job_scripts   = True,  # retain job scripts for debuging, they go in Scratch/test_dir
                                     working_directory    = "/home/sejjctj/Scratch",
                                     drmaa_session        = drmaa_session,
                                     logger = logger )

  except error_drmaa_job as err:
    raise Exception("\n".join(map(str,
                    ["Failed to run:",
                     cmd,
                     err,
                     stdout_res,
                     stderr_res])))
  
  with logger_mutex:
    logger.debug("star_fusion worked")
Exemplo n.º 53
0
def kallisto(input_files, output_file, path, kallisto_folder, qc_folder, logger, logger_mutex):
    input_files = [item for sublist in input_files for item in sublist]
    list_of_reads = []
    for filename in input_files:
        list_of_reads.append(os.path.basename(filename))
    list_of_reads = ' '.join(list_of_reads)

    cmd = ("source ~/.bashrc \n"
           "cd $TMPDIR \n"
           "mkdir reference \n"
           "cp {path}/*fq.gz   . \n"
           "cp $HOME/Scratch/reference/hg38_ver84_transcripts.idx ./reference \n"
           "kallisto quant -b 100 -t 4 -i \\\n"
           "./reference/hg38_ver84_transcripts.idx {list_of_reads} \\\n"
           "-o {kallisto_folder}")
    cmd = cmd.format(**locals())
    try:
      stdout_res, stderr_res = "",""
      stdout_res, stderr_res = run_job(cmd,
                                     job_name             = "kallisto",
                                     job_script_directory = "/home/sejjctj/Scratch/test_dir",
                                     job_other_options    = "-w n -S /bin/bash -l h_rt=04:00:00 -l mem=8G -w n -pe smp 4 -l tmpfs=60G -wd /home/sejjctj/Scratch -j yes ",
                                     #job_environment      = { 'BASH_ENV' : '/home/sejjctj/.bashrc' } ,
                                     retain_job_scripts   = True,
                                     working_directory    = "/home/sejjctj/Scratch",
                                     drmaa_session        = drmaa_session,
                                     logger = logger )

    except error_drmaa_job as err:
      raise Exception("\n".join(map(str,
                      ["Failed to run:",
                        cmd,
                        err,
                        stdout_res,
                        stderr_res])))

    with logger_mutex:
      logger.debug("kallisto worked")
Exemplo n.º 54
0
def trim_fastq(input_file, output_files, qc_folder, output_folder ,logger, logger_mutex):
    raw_fastq=os.path.basename(input_file)
    cmd = (" cd $TMPDIR ; "
         " cp {input_file} . ;"
         " trim_galore --fastqc  {raw_fastq} 2> {qc_folder}/trim_galore.log ; "
         " mv *.fq.gz  {output_folder} ; "
         " mv *fastqc*  {qc_folder} ; "
         " mv *report* {qc_folder}; rm * ; " )
  
    job_name = "trim_fastqc"
    # fill in the {} placeholders in cmd from the local variables
    cmd = cmd.format(**locals())
    #print(cmd)  
    try:
    
      stdout_res, stderr_res = "",""
      stdout_res, stderr_res = run_job(cmd,
                                      job_name,
                                      job_script_directory = "/home/sejjctj/Scratch/test_dir",
                                      job_other_options    = "-w n -S /bin/bash -V -l h_rt=05:00:00 -l mem=4G -l tmpfs=60G -wd /home/sejjctj/Scratch -j yes",
                                      job_environment      = { 'BASH_ENV' : '/home/sejjctj/.bashrc' } ,
                                      retain_job_scripts   = True,
                                      working_directory    = "/home/sejjctj/Scratch",
                                      drmaa_session        = drmaa_session,
                                      logger = logger )                                      
   #                                   
    except error_drmaa_job as err:
        raise Exception("\n".join(map(str,
                      ["Failed to run:",
                        cmd,
                        err,
                        stdout_res,
                        stderr_res])))
                                  
   
    with logger_mutex:
        logger.debug("trim_fastq worked")
Exemplo n.º 55
0
def star(input_files, out_file, path,outpath,sample,qc_folder,logger, logger_mutex):
  flat_list = [item for sublist in input_files for item in sublist]
  print(flat_list)
  first_reads = []
  second_reads =[]
  for i in flat_list:
    if re.search('val_1', os.path.basename(i)):
      first_reads.append(os.path.basename(i))
    elif re.search('val_2', os.path.basename(i)):
       second_reads.append(os.path.basename(i))
  first_reads = ','.join(first_reads)
  second_reads = ','.join(second_reads)
  star_output = out_file.split('/')
  star_output = star_output[-1]
  #print star_output
  cmd = ( "source ~/.bashrc \n"
          "cd $TMPDIR \n "
          "cp {path}/*fq.gz  . \n "
          "STAR --runThreadN 4 \\\n"
          "--genomeDir ~/Scratch/reference/star_single_cell/index/ \\\n"
          "--readFilesIn " + first_reads + " " + second_reads + " \\\n"
          "--readFilesCommand zcat \\\n" 
          "--twopassMode Basic \\\n" 
          "--outReadsUnmapped None \\\n" 
          "--chimSegmentMin 12 \\\n" 
          "--chimJunctionOverhangMin 12 \\\n"  
          "--alignSJDBoverhangMin 10 \\\n" 
          "--alignMatesGapMax 100000 \\\n" 
          "--alignIntronMax 100000 \\\n" 
          "--chimSegmentReadGapMax 3 \\\n"                                                                                     
          "--alignSJstitchMismatchNmax 5 -1 5 5 \\\n" 
          "--outSAMstrandField intronMotif \\\n"
          "--outFilterIntronMotifs RemoveNoncanonical \\\n" ## added for compatibility with
          "--outFileNamePrefix {sample} \\\n"               ## cufflinks
          "--outSAMtype BAM SortedByCoordinate\n"
          "cp *junction {outpath} \n"
          "cp *bam {outpath} \n"
          "cp *Log.* {qc_folder} ")
  cmd = cmd.format(**locals())
  print(cmd)
  try:
    stdout_res, stderr_res = "",""
    stdout_res, stderr_res = run_job(cmd,
                                     job_name = "star",
                                     job_script_directory = "/home/sejjctj/Scratch/test_dir",
                                     job_other_options    = "-w n -S /bin/bash -l h_rt=02:00:00 -w n -l mem=24G -l tmpfs=60G -pe smp 4 -wd /home/sejjctj/Scratch -j yes ",
                                     #job_environment      = { 'BASH_ENV' : '/home/sejjctj/.bashrc' } ,
                                     retain_job_scripts   = True,  # retain job scripts for debuging, they go in Scratch/test_dir
                                     working_directory    = "/home/sejjctj/Scratch",
                                     drmaa_session        = drmaa_session,
                                     logger = logger )

  except error_drmaa_job as err:
    raise Exception("\n".join(map(str,
                    ["Failed to run:",
                     cmd,
                     err,
                     stdout_res,
                     stderr_res])))
  
  with logger_mutex:
    logger.debug("star worked")
Exemplo n.º 56
0
def test_task2(infile, outfile):
    print("%s start to run " % infile)
    run_job("./five_second.py", run_locally=True)
    print("%s wake up " % infile)
    with open(outfile, "w") as p:
        pass
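# A sketch of how a test task like this is usually wired into a Ruffus
# pipeline (the input names and suffixes below are ours, not the original's):
from ruffus import transform, suffix

@transform(["a.start", "b.start"], suffix(".start"), ".finished")
def test_task2_wired(infile, outfile):
    # run_job with run_locally=True executes on this machine, no DRMAA needed
    run_job("./five_second.py", run_locally=True)
    open(outfile, "w").close()   # touch the output so Ruffus sees it as done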
Exemplo n.º 57
0
def post_alignment_filter(input_file, output_file, out_dir,log_file, logger, logger_mutex):
  print(input_file)
  raw_bam=os.path.basename(input_file)
  prefix=raw_bam[:-4]
  FILT_BAM_PREFIX=prefix + ".filt.srt"
  FILT_BAM_FILE=FILT_BAM_PREFIX +".bam"
  MAPQ_THRESH=30
  TMP_FILT_BAM_FILE=FILT_BAM_PREFIX + ".dupmark.bam"
  DUP_FILE_QC=FILT_BAM_PREFIX + ".dup.qc"
  FINAL_BAM_PREFIX=prefix + ".filt.nodup.srt"
  FINAL_BAM_FILE=FINAL_BAM_PREFIX + ".bam"
  FINAL_BAM_INDEX_FILE=FINAL_BAM_PREFIX + ".bai"
  FINAL_BAM_FILE_MAPSTATS=FINAL_BAM_PREFIX + ".flagstat.qc"
  PBC_FILE_QC=FINAL_BAM_PREFIX + ".pbc.qc"
  picard_loc="/shared/ucl/apps/picard-tools/1.136/picard-tools-1.136/"
  cmd=("cd $TMPDIR \n"
       "cp {input_file} . \n"
       "date \n"
       "ls -l \n"
       "\n"
       "samtools sort -@ 4 -m 8G {raw_bam} temporary \n"
       "mv temporary.bam {raw_bam} \n"
       "samtools view -@ 4 -F 1804 -q {MAPQ_THRESH} -b {raw_bam} > {FILT_BAM_FILE} \n"
       "mv temporary_bam.bam {FILT_BAM_FILE} \n"
       "echo \"first filter done\" \n"
       "ls -lh \n"
       "#=========================\n"
       "# Mark Duplicates \n"
       "#==========================\n"
       "\n"
       "java -Xmx8G -jar {picard_loc}picard.jar MarkDuplicates INPUT={FILT_BAM_FILE} \\\n"
       "OUTPUT={TMP_FILT_BAM_FILE} METRICS_FILE={DUP_FILE_QC} VALIDATION_STRINGENCY=LENIENT \\\n"
       "ASSUME_SORTED=true REMOVE_DUPLICATES=false \n"
       "mv {TMP_FILT_BAM_FILE} {FILT_BAM_FILE} \n"
       "echo \"mark duplicates done\" \n"
       "ls -lh \n"
       "date \n"
       "\n"
       "# ============================ \n"
       "# Remove duplicates\n"
       "# Index final position sorted BAM \n"
       "# ============================ \n"
       "\n"
       "samtools view -@ 4 -F 1804 -b {FILT_BAM_FILE} > {FINAL_BAM_FILE} \n"
       "\n"
       "# Index Final BAM file \n"
       "samtools index {FINAL_BAM_FILE} {FINAL_BAM_INDEX_FILE} \n"
       "samtools flagstat {FINAL_BAM_FILE} > {FINAL_BAM_FILE_MAPSTATS} \n"
       "# Compute library complexity \n"
       "# ============================= \n"
       "# sort by position and strand \n"
       "# Obtain unique count statistics \n"
       "\n"
       "PBC_FILE_QC={FINAL_BAM_PREFIX}.pbc.qc \n"
       "# PBC File output \n"
       "echo -e \"TotalReadPairs\\tDistinctReadPairs\\tOneReadPair\\tTwoReadPairs\\tNRF=Distinct/Total\\tPBC1=OnePair/Distinct\\tPBC2=OnePair/TwoPair\" > header \n"
       "bedtools bamtobed -i {FILT_BAM_FILE} | awk 'BEGIN{{OFS=\"\\t\"}}{{print $1,$2,$3,$6}}' | \\\n"
       "grep -v chrM | sort | uniq -c | awk 'BEGIN{{mt=0;m0=0;m1=0;m2=0}} ($1==1){{m1=m1+1}} ($1==2){{m2=m2+1}} \\\n"
       "{{m0=m0+1}} {{mt=mt+$1}} END{{printf \"%d\\t%d\\t%d\\t%d\\t%f\\t%f\\t%f\\n\",mt,m0,m1,m2,m0/mt,m1/m0,m1/m2}}' \\\n"
       "> {PBC_FILE_QC} \n"
       "mv {FINAL_BAM_FILE} {out_dir} \n"
       "cat header {PBC_FILE_QC} > temp_file && mv temp_file {PBC_FILE_QC} \n"
       "mv {PBC_FILE_QC} {out_dir} \n"
       "mv {FINAL_BAM_FILE} {out_dir} \n")
  cmd = cmd.format(**locals())
  try:
    stdout_res, stderr_res = "",""
    stdout_res, stderr_res = run_job(cmd,
                                     job_name = "filter_bam",
                                     job_script_directory = "/home/sejjctj/Scratch/test_dir",
                                     job_other_options    = "-S /bin/bash -V -l h_rt=10:00:00 -w n -l mem=8G -l tmpfs=60G -pe smp 4 -wd /home/sejjctj/Scratch -j yes ",
                                     job_environment      = { 'BASH_ENV' : '/home/sejjctj/.bashrc' } ,
                                     retain_job_scripts   = True,  # retain job scripts for debuging, they go in Scratch/test_dir
                                     working_directory    = "/home/sejjctj/Scratch/test_dir",
                                     drmaa_session        = drmaa_session,
                                     logger = logger )

  except error_drmaa_job as err:
    raise Exception("\n".join(map(str,
                      ["Failed to run:",
                        cmd,
                        err,
                        stdout_res,
                        stderr_res])))

  with logger_mutex:
      logger.debug("post_alignment_filter worked")
Exemplo n.º 58
0
def post_alignment_filter(input_file, output_file, out_dir,log_file, logger, logger_mutex):
    input_file = input_file[0]
    print "input:"
    print input_file
    print "output:"
    print output_file
    output_bam = os.path.basename(output_file)
    print "output_bam:"
    print output_bam
    RAW_BAM_FILE=os.path.basename(input_file)
    OFPREFIX = output_bam[:-4]
    print OFPREFIX
    FILT_BAM_PREFIX=OFPREFIX + ".filt"
    FILT_BAM_FILE=FILT_BAM_PREFIX + ".bam"
    TMP_FILT_BAM_PREFIX="tmp." + FILT_BAM_PREFIX + ".nmsrt"
    TMP_FILT_BAM_FILE=TMP_FILT_BAM_PREFIX + ".bam"
    TMP_FILT_FIXMATE_BAM_FILE=TMP_FILT_BAM_PREFIX + ".fixmate.bam"
    TMP_DUP_BAM_FILE=FILT_BAM_PREFIX + ".dupmark.bam"
    DUP_FILE_QC=FILT_BAM_PREFIX + ".dup.qc"
    FINAL_BAM_PREFIX=OFPREFIX + ".nodup"
    FINAL_BAM_FILE=FINAL_BAM_PREFIX + ".bam"
    FINAL_BAM_INDEX_FILE=FINAL_BAM_FILE + ".bai"
    FINAL_BAM_FILE_MAPSTATS=FINAL_BAM_PREFIX + ".flagstat.qc"
    PBC_FILE_QC=OFPREFIX + ".pbc.qc"
    picard_loc="/shared/ucl/apps/picard-tools/1.136/picard-tools-1.136/"
    cmd = ( " # =============================  \n"
            " # Remove unmapped, mate unmapped \n"
            " # not primary alignment, reads failing platform \n"
            " # Only keep properly paired reads \n"
            " # Obtain name sorted BAM file \n"
            " # ================== \n"
            " source ~/.bashrc \n"
            " cd $TMPDIR \n"
            " cp {input_file} ./ \n"
            " ls -lh \n"
            " date \n"
            " samtools view -F 524 -f 2 -u {RAW_BAM_FILE} \\\n"
            " | sambamba sort -n -m 16G -t 4 /dev/stdin -o {TMP_FILT_BAM_FILE} \n"
            " samtools view -h {TMP_FILT_BAM_FILE} | assign_multimappers.py -k 4 --paired-end \\\n"
            " | samtools fixmate -r /dev/stdin {TMP_FILT_FIXMATE_BAM_FILE} \n"
            " ls -lh \n"
            " date \n"
            " # Remove orphan reads (pair was removed) \n"
            " # and read pairs mapping to different chromosomes \n"
            " # obtain position sorted BAM \n"
            " samtools view -F 1804 -f 2 -u {TMP_FILT_FIXMATE_BAM_FILE} \\\n"
            " | sambamba sort -m 16G -t 4 /dev/stdin -o {FILT_BAM_FILE} \n"
            " rm {TMP_FILT_FIXMATE_BAM_FILE} \n"
            " rm {TMP_FILT_BAM_FILE} \n"
            " ls -lh \n"
            " date \n"
            " # ============= \n"
            " # Mark duplicates \n"
            " # ============= \n"
            " java -Xmx16G -jar {picard_loc}picard.jar MarkDuplicates INPUT={FILT_BAM_FILE} "
            " OUTPUT={TMP_DUP_BAM_FILE} METRICS_FILE={DUP_FILE_QC} "
            " VALIDATION_STRINGENCY=LENIENT ASSUME_SORTED=true  REMOVE_DUPLICATES=false \n"
            " mv {TMP_DUP_BAM_FILE}  {FILT_BAM_FILE} \n"
            " # ============================ \n"
            " # Remove duplicates \n"
            " # Index final position sorted BAM \n"
            " # Create final name sorted BAM \n"
            " # ============================ \n"
            " samtools view -F 1804 -f 2 -b {FILT_BAM_FILE} > {output_bam} \n"
            " samtools sort -n -m 16G -@ 4 {output_bam} {OFPREFIX}.final_filt_nmsrt \n"
            " # used later on \n"
            " samtools index {output_bam} \n"
            " samtools flagstat {output_bam} > {output_bam}.mapstats \n"
            " mv {output_bam}.mapstats {out_dir}\n"
            " # ============================= \n"
            " # Compute library complexity    \n"
            " # ============================= \n"
            " # Sort by name \n"
            " # convert to bedPE and obtain fragment coordinates \n"
            " # sort by position and strand \n"
            " # Obtain unique count statistics \n"
            " sambamba sort -n -m 16G -t 4 {FILT_BAM_FILE} -o {OFPREFIX}.srt.tmp.bam  \n"
            " echo -e '# PBC File output\n# TotalReadPairs\tDistinctReadPairs\tOneReadPair\tTwoReadPairs\tNRF=Distinct/Total\tPBC1=OnePair/Distinct\tPBC2=OnePair/TwoPair' > header \n"
            " bedtools bamtobed -bedpe -i {OFPREFIX}.srt.tmp.bam \\\n"
            " | awk 'BEGIN{{OFS=\"\\t\"}}{{print $1,$2,$4,$6,$9,$10}}' | grep -v 'chrM' | sort \\\n"
            " | uniq -c | \\\n"
            " awk 'BEGIN{{mt=0;m0=0;m1=0;m2=0}}($1==1){{m1=m1+1}} ($1==2){{m2=m2+1}} {{m0=m0+1}}{{t=mt+$1}}END{{printf\"%d\\t%d\\t%d\\t%d\\t%f\\t%f\\t%f\\n\",mt,m0,m1,m2,m0/mt,m1/m0,m1/m2}}' " 
            "  > {PBC_FILE_QC} \n"
            " rm {FILT_BAM_FILE} \n"
            " mv {output_bam} {out_dir} \n"
            " mv {output_bam}.bai {out_dir} \n"
            " cat header header {PBC_FILE_QC} > temporary && mv temporary > {PBC_FILE_QC} \n"
            " mv {PBC_FILE_QC} {out_dir} \n"
            " mv {OFPREFIX}.final_filt_nmsrt.bam {out_dir} ")

    cmd = cmd.format(**locals())
    try:
      stdout_res, stderr_res = "",""
      stdout_res, stderr_res = run_job(cmd,
                                     job_name = "filter_bam",
                                     job_script_directory = "/home/sejjctj/Scratch/test_dir",
                                     job_other_options    = "-S /bin/bash -V -l h_rt=10:00:00 -w n -l mem=16G -l tmpfs=60G -pe smp 4 -wd /home/sejjctj/Scratch -j yes ",
                                     #job_environment      = { 'BASH_ENV' : '/home/sejjctj/.bashrc' } ,
                                     retain_job_scripts   = True,  # retain job scripts for debuging, they go in Scratch/test_dir
                                     working_directory    = "/home/sejjctj/Scratch/test_dir",
                                     drmaa_session        = drmaa_session,
                                     logger = logger )

    except error_drmaa_job as err:
      raise Exception("\n".join(map(str,
                      ["Failed to run:",
                        cmd,
                        err,
                        stdout_res,
                        stderr_res])))

    with logger_mutex:
      logger.debug("post_alignment_filter worked")
Exemplo n.º 59
0
def macs2(input_file, output_file,out_dir, logger, logger_mutex):
  cmd = ("#===================================\n"
         "#  run mac2 2 on tn5 shifted files  \n"
         "#===================================\n"
         "source ~/.bashrc \n"
         "cd $TMPDIR \n"
         "cp {out_dir}/*tn5.tagAlign.gz . \n"
         "for tag in *tagAlign.gz \n"
         "do \n"
         "prefix=""${{tag:0:${{#tag}}-12}}""   #remove.tagAlign.gz \n" 
         "peakfile=""${{prefix}}"".narrowPeak.gz \n"
         "pval_thresh=0.01 \n"
         "fc_bedgraph=""${{prefix}}"".fc.signal.bedgraph \n"
         "fc_bedgraph_srt=""${{prefix}}"".fc.signal.srt.bedgraph \n"
         "fc_bigwig=""${{prefix}}""_sig.fc.signal.bigwig \n"
         "pval_bedgraph=""${{prefix}}"".pval.signal.bedgraph \n"
         "pval_bedgraph_srt=""${{prefix}}"".pval.signal.srt.bedgraph \n"
         "pval_bigwig=""${{prefix}}_sig.pval.signal.bigwig \n"
         "chrsz=\"/home/sejjctj/Scratch/reference/grch38/hg38.chrom.sizes\" \n"
         "## see https://github.com/taoliu/MACS/issues/145 for choice of --shift and --extsize \n"
         "macs2 callpeak \\\n"
         "-t ""$tag"" -f BED -n ""$prefix"" -g 2700000000 -p $pval_thresh \\\n"
         "--nomodel --shift -100 --extsize 200 -B --SPMR --keep-dup all --call-summits \n"
         "# Sort by Col8 in descending order and replace long peak names in Column 4 with Peak_<peakRank> \n"
         "sort -k 8gr,8gr \"$prefix\"_peaks.narrowPeak | awk 'BEGIN{{OFS=\"\\t\"}}{{$4=""Peak_""NR ; print $0}}' \\\n"
         " | gzip -nc > ""$peakfile"" \n"
         "rm -f \"$prefix\"_peaks.narrowPeak \n"
         "rm -f \"$prefix\"_peaks.xls \n"
         "rm -f \"$prefix\"_summits.bed \n"
         "# --- signal-track generation disabled below; kept as shell comments for reference --- \n"
         "# macs2 bdgcmp -t \"$prefix\"_treat_pileup.bdg -c \"$prefix\"_control_lambda.bdg --o-prefix \"$prefix\" -m FE \n"
         "# slopBed -i \"$prefix\"_FE.bdg -g \"$chrsz\" -b 0 | bedClip stdin \"$chrsz\" \"$fc_bedgraph\" \n"
         "# rm -f \"$prefix\"_FE.bdg \n"
         "# sort -k1,1 -k2,2n \"$fc_bedgraph\" > \"$fc_bedgraph_srt\" \n"
         "# bedGraphToBigWig \"$fc_bedgraph_srt\" \"$chrsz\" \"$fc_bigwig\" \n"
         "# rm -f \"$fc_bedgraph\" \"$fc_bedgraph_srt\" \n"
         "# sval counts the number of tags per million in the compressed BED file \n"
         "# sval=$(zcat \"$tag\" | wc -l | awk '{{print $1/1000000}}') \n"
         "# macs2 bdgcmp -t \"$prefix\"_treat_pileup.bdg -c \"$prefix\"_control_lambda.bdg --o-prefix \"$prefix\" -m ppois -S \"$sval\" \n"
         "# slopBed -i \"$prefix\"_ppois.bdg -g \"$chrsz\" -b 0 | bedClip stdin \"$chrsz\" \"$pval_bedgraph\" \n"
         "# rm -f \"$prefix\"_ppois.bdg \n"
         "# sort -k1,1 -k2,2n \"$pval_bedgraph\" > \"$pval_bedgraph_srt\" \n"
         "# bedGraphToBigWig \"$pval_bedgraph_srt\" \"$chrsz\" \"$pval_bigwig\" \n"
         "# rm -f \"$pval_bedgraph\" \"$pval_bedgraph_srt\" \n"
         "# rm -f \"$prefix\"_treat_pileup.bdg \"$prefix\"_control_lambda.bdg \n"
         "mv ./\"$prefix\"* {out_dir} \n"
         "done \n")
  cmd = cmd.format(**locals())
  try:
    stdout_res, stderr_res = "",""
    stdout_res, stderr_res = run_job(cmd,
                                     job_name = "macs2",
                                     job_script_directory = "/home/sejjctj/Scratch/test_dir",
                                     job_other_options    = "-S /bin/bash -V -l h_rt=08:00:00 -w n -l mem=16G -l tmpfs=60G -wd /home/sejjctj/Scratch -j yes ",
                                     #job_environment      = { 'BASH_ENV' : '/home/sejjctj/.bashrc' } ,
                                     retain_job_scripts   = True,  # retain job scripts for debuging, they go in Scratch/test_dir
                                     working_directory    = "/home/sejjctj/Scratch/test_dir",
                                     drmaa_session        = drmaa_session,
                                     logger = logger )
  except error_drmaa_job as err:
    raise Exception("\n".join(map(str,
                    ["Failed to run:",
                      cmd,
                      err,
                      stdout_res,
                      stderr_res])))

  with logger_mutex:
    logger.debug("mac2_callpeaks")