def create_combine(pids, parentdir, pool, program, shdir):
    """Create command file to combine crisp or varscan jobs once they're finished.

    Positional arguments:
    pids = list of slurm job id dependencies (the jobs that need to finish first)
    ...
    """
    pooldir = op.join(parentdir, pool)
    email_text = get_email_info(parentdir, 'final')
    dependencies = '#SBATCH --dependency=afterok:' + ','.join(pids)
    text = f'''#!/bin/bash
#SBATCH --job-name={pool}-combine-{program}
#SBATCH --time=12:00:00
#SBATCH --mem=20000M
#SBATCH --cpus-per-task=1
#SBATCH --output={pool}-combine-{program}_%j.out
{dependencies}
{email_text}

source $HOME/.bashrc
export PYTHONPATH="${{PYTHONPATH}}:$HOME/pipeline"
export SQUEUE_FORMAT="%.8i %.8u %.12a %.68j %.3t %16S %.10L %.5D %.4C %.6b %.7m %N (%r)"

python $HOME/pipeline/combine_crispORvarscan.py {pooldir} {program} {pool}
'''
    combfile = op.join(shdir, f'{pool}-combine-{program}.sh')
    with open(combfile, 'w') as o:
        o.write(text)
    sbatch(combfile)
    print(f'sbatched {program} combinefile with dependencies: ' + ','.join(pids))
def create_combine(pids, parentdir, pool, program, shdir):
    """Create command file to combine varscan jobs once they're finished.

    Positional arguments:
    pids = list of slurm job id dependencies (the jobs that need to finish first)
    ...
    """
    pooldir = op.join(parentdir, pool)
    email_text = get_email_info(parentdir, 'final')
    dependencies = '#SBATCH --dependency=afterok:' + ','.join(pids)
    bash_variables = op.join(parentdir, 'bash_variables')
    text = f'''#!/bin/bash
#SBATCH --job-name={pool}-combine-{program}
#SBATCH --time=12:00:00
#SBATCH --mem=20000M
#SBATCH --cpus-per-task=1
#SBATCH --output={pool}-combine-{program}_%j.out
{dependencies}
{email_text}

source {bash_variables}

python $HOME/pipeline/combine_varscan.py {pooldir} {program} {pool}
'''
    combfile = op.join(shdir, f'{pool}-combine-{program}.sh')
    with open(combfile, 'w') as o:
        o.write(text)
    sbatch(combfile)
    print(f'sbatched {program} combinefile with dependencies: ' + ','.join(pids))
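# Both create_combine variants above call an sbatch() helper that is defined
# elsewhere in the pipeline and not shown in these snippets. A minimal sketch
# of what such a helper could look like, assuming it submits the .sh file and
# parses the job id out of slurm's "Submitted batch job <id>" reply (the name
# and return value here are assumptions, not the pipeline's actual helper):
import subprocess

def sbatch(shfile):
    """Submit a slurm job file and return the new job id as a string."""
    out = subprocess.check_output(['sbatch', shfile]).decode('utf-8')
    # slurm replies with e.g. "Submitted batch job 12345678"
    pid = out.split()[-1]
    return pid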
import sys, os, balance_queue, subprocess, shutil
from os import path as op
from coadaptree import makedir, get_email_info, pklload

thisfile, pooldir, samp = sys.argv

sortfiles = pklload(op.join(pooldir, '%s_sortfiles.pkl' % samp))

# MarkDuplicates
dupdir = op.join(pooldir, '03_dedup_rg_filtered_indexed_sorted_bamfiles')
pool = op.basename(pooldir)
dupfile = op.join(dupdir, "%s_rd.bam" % samp)
dupflag = dupfile.replace(".bam", ".bam.flagstats")
dupstat = op.join(dupdir, "%s_rd_dupstat.txt" % samp)

# create sh file
email_text = get_email_info(op.dirname(pooldir), '03')
joined = ' I='.join(sortfiles)
text = f"""#!/bin/bash
#SBATCH --time=11:59:00
#SBATCH --mem=30000M
#SBATCH --ntasks=1
#SBATCH --job-name={pool}-{samp}-mark
#SBATCH --output={pool}-{samp}-mark_%j.out
{email_text}

# remove dups
module load picard/2.18.9
java -Djava.io.tmpdir=$SLURM_TMPDIR -jar $EBROOTPICARD/picard.jar MarkDuplicates \
I={joined} O={dupfile} MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=1000 M={dupstat} REMOVE_DUPLICATES=true
java -jar $EBROOTPICARD/picard.jar BuildBamIndex I={dupfile}
module unload picard
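# Why ' I='.join(sortfiles) works: the first I= is written literally into the
# MarkDuplicates command above (I={joined}), so joining the remaining paths
# with ' I=' yields one I=<bam> argument per sorted bam. A self-contained
# illustration with made-up paths:
example_sortfiles = ['/tmp/s1_sorted.bam', '/tmp/s2_sorted.bam']  # hypothetical
example_joined = ' I='.join(example_sortfiles)
print(f"I={example_joined}")  # -> I=/tmp/s1_sorted.bam I=/tmp/s2_sorted.bam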
import os, sys, balance_queue, subprocess, shutil
from os import path as op
from coadaptree import makedir, pklload, get_email_info

thisfile, pooldir, samp, dupfile = sys.argv

# RealignerTargetCreator
aligndir = op.join(pooldir, '04_realign')
listfile = op.join(aligndir, '%s_realignment_targets.list' % samp)

# get ref
parentdir = op.dirname(pooldir)
pool = op.basename(pooldir)
ref = pklload(op.join(parentdir, 'poolref.pkl'))[pool]
email_text = get_email_info(parentdir, '04')

text = '''#!/bin/bash
#SBATCH --time=7-00:00:00
#SBATCH --mem=30000M
#SBATCH --nodes=1
#SBATCH --ntasks=32
#SBATCH --cpus-per-task=1
#SBATCH --job-name=%(pool)s-%(samp)s-realign
#SBATCH --output=%(pool)s-%(samp)s-realign_%%j.out
%(email_text)s

# realign using the GATK
module load gatk/3.8
module load java
export _JAVA_OPTIONS="-Xms256m -Xmx28g"
java -Djava.io.tmpdir=$SLURM_TMPDIR -jar $EBROOTGATK/GenomeAnalysisTK.jar \
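# Unlike the other scripts, this one builds its sbatch text with old-style
# %(name)s placeholders instead of an f-string, so slurm's own % tokens only
# need doubling (%%j -> %j). A self-contained illustration of the
# substitution this template relies on (values are hypothetical):
template = '#SBATCH --output=%(pool)s-%(samp)s-realign_%%j.out'
print(template % {'pool': 'p1', 'samp': 's1'})
# -> #SBATCH --output=p1-s1-realign_%j.out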
### imports
import sys
import os
from os import path as op
from os import listdir
import pickle
import numpy as np
from coadaptree import fs, createdirs, pklload, get_email_info
###

### args
thisfile, parentdir = sys.argv
if parentdir.endswith("/"):
    parentdir = parentdir[:-1]
poolref = pklload(op.join(parentdir, 'poolref.pkl'))
email_info = get_email_info(parentdir, 'concat')
###

### dirs
shdir = op.join(parentdir, 'shfiles/concat')
catdir = op.join(parentdir, 'concatenated_vcfs')
filtdir = op.join(parentdir, 'filtered_snps')
createdirs([shdir, catdir, filtdir])
###

# get the snpfiles
snpdir = op.join(parentdir, 'snps')
snpfiles = [f.replace('.tbi', '') for f in fs(snpdir)
            if 'snp' in op.basename(f) and f.endswith('.tbi')]
os.system('echo "len(snpfiles) = %s"' % str(len(snpfiles)))

# sort snpfiles by pool
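# The grouping step is cut off above. A minimal sketch of how snpfiles might
# be bucketed by pool, assuming each filename begins with the pool name
# followed by an underscore (that naming scheme is an assumption here):
from collections import defaultdict

poolfiles = defaultdict(list)
for snpfile in snpfiles:
    poolname = op.basename(snpfile).split('_')[0]  # assumed <pool>_... naming
    poolfiles[poolname].append(snpfile)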
""" ### imports import sys, os, pickle, subprocess from os import path as op import numpy as np from coadaptree import fs, createdirs, pklload, get_email_info from genotyping_scheduler import startscheduler, bigbrother, delsched ### ### args thisfile, parentdir = sys.argv if parentdir.endswith("/"): parentdir = parentdir[:-1] poolref = pklload(op.join(parentdir, 'poolref.pkl')) email_info = get_email_info(parentdir, 'final') bash_variables = op.join(parentdir, 'bash_variables') maf = pklload(op.join(parentdir, 'maf.pkl')) ### # make a reservation file so other jobs don't call 05.py resfile = op.join(parentdir, 'shfiles/06_reservation.txt') if not op.exists(resfile): startscheduler(resfile) else: print('06.py was running') bigbrother(resfile, DIR=None) ### dirs shdir = op.join(parentdir, 'shfiles/concat') catdir = op.join(parentdir, 'concatenated_vcfs')