def repair_task_list():
    """main gain and loss algorithm"""
    # Resolve the project's working directories once for this generator.
    (sentinel_path, results_path, haplotype_path, cancer_dir_path,
     tmpbams_path, finalbams_path) = taskHelpers.GetProjectNamePathRunID()

    # Repair runs after the find-ROI stage has written its sentinel.
    prev_sentinels = [taskHelpers.CreateFileList(
        '{0}_findroi.sentinel', 1, sentinel_path)]
    sentinels = taskHelpers.CreateFileList(
        '{0}_repair.sentinel', 1, sentinel_path)

    # Sorted ROI BAMs in, repaired BAMs out (same per-event naming scheme).
    inputs = [taskHelpers.CreateFileList(
        '{0}.{1}.roi.sorted.bam', 88, tmpbams_path, "gain")]
    outputs = [taskHelpers.CreateFileList(
        '{0}.{1}.repaired.bam', 88, tmpbams_path, "gain")]

    sample_ids = taskHelpers.CreateFileList('{0}', 1, '')
    for job in taskHelpers.CreateTaskList(inputs, sentinels, outputs,
                                          sample_ids, prev_sentinels):
        yield job
def find_roi_bam_task_list():
    """populates task inputs and outputs"""
    (sentinel_path, results_path, haplotype_path, cancer_dir_path,
     tmpbams_path, finalbams_path) = taskHelpers.GetProjectNamePathRunID()
    split_path = "/".join([results_path, "splitbams"])

    # Depends on the sort-by-name stage having completed.
    prev_sentinels = [taskHelpers.CreateFileList(
        '{0}_sortn.sentinel', 1, sentinel_path)]
    sentinels = taskHelpers.CreateFileList(
        '{0}_findroi.sentinel', 1, sentinel_path)

    inputs = [taskHelpers.CreateFileList(
        '{0}.byname.bam', 22, split_path, "extractROI")]
    # max number of outputs chr*events*haplotypes (8 for 2 chromosomes)
    outputs = [taskHelpers.CreateFileList(
        '{0}.{1}.roi.bam', 88, tmpbams_path, "extractROI")]

    sample_ids = taskHelpers.CreateFileList('{0}', 1, '')
    for job in taskHelpers.CreateTaskList(inputs, sentinels, outputs,
                                          sample_ids, prev_sentinels):
        yield job
def complete_pipeline_gain_task_list():
    """Builds the final sort/merge jobs that assemble the pipeline output."""
    (sentinel_path, results_path, haplotype_path, cancer_dir_path,
     tmpbams_path, finalbams_path) = taskHelpers.GetProjectNamePathRunID()

    # Runs once the gain-subsampling stage has finished.
    prev_sentinels = [taskHelpers.CreateFileList(
        '{0}_subsample_gain.sentinel', 1, sentinel_path)]
    sentinels = taskHelpers.CreateFileList(
        '{0}_sortmerge.sentinel', 1, sentinel_path)

    # Many per-event FINAL BAMs are merged into one configured output file.
    inputs = [taskHelpers.CreateFileList(
        '{0}_{1}_{2}.bam', 88, finalbams_path, "FINAL")]
    outputs = [taskHelpers.CreateFileList(
        params.GetOutputFileName(), 1, finalbams_path)]

    sample_ids = taskHelpers.CreateFileList('{0}', 1, '')
    for job in taskHelpers.CreateTaskList(inputs, sentinels, outputs,
                                          sample_ids, prev_sentinels):
        yield job
def sort_by_name_task_list():
    """populates task inputs and outputs"""
    (sentinel_path, results_path, haplotype_path, cancer_dir_path,
     tmpbams_path, finalbams_path) = taskHelpers.GetProjectNamePathRunID()
    split_path = "/".join([results_path, "splitbams"])

    # Sorting by read name follows the per-chromosome split stage.
    prev_sentinels = [taskHelpers.CreateFileList(
        '{0}_split.sentinel', 1, sentinel_path)]
    sentinels = taskHelpers.CreateFileList(
        '{0}_sortn.sentinel', 1, sentinel_path)

    # One job per autosome: chrN.bam -> chrN.byname.bam.
    inputs = [taskHelpers.CreateFileList('chr{1}.bam', 22, split_path + "/")]
    outputs = [taskHelpers.CreateFileList('chr{1}.byname.bam', 22,
                                          split_path + "/")]

    sample_ids = taskHelpers.CreateFileList('{0}', 1, '')
    for job in taskHelpers.CreateTaskList(inputs, sentinels, outputs,
                                          sample_ids, prev_sentinels):
        yield job
def subsample_loss_task_list():
    """Builds the subsampling jobs for loss events."""
    (sentinel_path, results_path, haplotype_path, cancer_dir_path,
     tmpbams_path, finalbams_path) = taskHelpers.GetProjectNamePathRunID()

    # Loss subsampling follows the loss-mutation stage.
    prev_sentinels = [taskHelpers.CreateFileList(
        '{0}_mutate_loss.sentinel', 1, sentinel_path)]
    sentinels = taskHelpers.CreateFileList(
        '{0}_subsample_loss.sentinel', 1, sentinel_path)

    inputs = [taskHelpers.CreateFileList(
        '{0}.{1}.mutated.merged.sorted.bam', 12, tmpbams_path, "loss")]
    # NOTE(review): output pattern says "_GAIN.bam" inside a loss task —
    # looks suspicious; confirm against the helper that consumes these names.
    outputs = [taskHelpers.CreateFileList(
        '{0}{1}_GAIN.bam', 88, tmpbams_path, "loss")]

    sample_ids = taskHelpers.CreateFileList('{0}', 1, '')
    for job in taskHelpers.CreateTaskList(inputs, sentinels, outputs,
                                          sample_ids, prev_sentinels):
        yield job
def subsample_loss(inputs, output_sentinel, outputs, sample_id, prev_sentinel):
    """adjusting sample rate for Bam files

    Writes one qsub shell script per input BAM that runs subsample_loss.py,
    submits each via pipelineHelpers.RunTask, then waits on the whole batch
    before touching the output sentinel.

    Args:
        inputs: nested list; inputs[0] holds the merged/sorted loss BAM paths.
        output_sentinel: sentinel file written when all tasks complete.
        outputs: task output file list (unused here; kept for task signature).
        sample_id: sample identifier, used in log messages and script paths.
        prev_sentinel: sentinel of the prerequisite stage, checked before work.
    """
    task_list = []
    log_msg = ' [subsample loss events] ' + '[' + sample_id + '] '
    # Only run when the previous pipeline stage has signalled completion.
    if pipelineHelpers.CheckSentinel(prev_sentinel, log, log_msg):
        pipelineHelpers.Logging('INFO', log, log_msg + 'Starting')
        current_path = params.GetProgramPath()
        script_path = pipelineHelpers.GetScriptPath(sample_id, bamhelp.name)
        bamgineer_mem = bamhelp.GetBamgineerMem('med')
        for inp in inputs[0]:
            # Input basenames look like "<chrN><event>_...": split off the
            # leading chromosome token, e.g. "chr1gain" -> "1".
            chrevent = os.path.basename(inp).strip().split("_")[0]
            chrom = re.split('(\d+)', chrevent)[1]  # renamed: avoid shadowing builtin chr()
            (sentinel_path, results_path, haplotype_path, cancer_dir_path,
             tmpbams_path, finalbams_path) = taskHelpers.GetProjectNamePathRunID()
            loss_final = "/".join([finalbams_path,
                                   'CHR' + str(chrom).upper() + '_LOSS.bam'])
            script_name = '{0}sample_{1}_{2}.sh'.format(
                script_path, 'chr' + str(chrom), "loss")
            # "with" guarantees the script is flushed/closed before submission
            # (original opened/closed by hand and leaked on error).
            with open(script_name, 'w') as script:
                script.write('#!/bin/bash\n')
                script.write('#\n')
                script.write('#$ -cwd \n')
                script.write('module load samtools/1.2 \n')
                script.write('python {path}/subsample_loss.py {inbam} {fl} \n'.format(
                    path=current_path, inbam=inp, fl=loss_final))
            process = pipelineHelpers.RunTask(
                os.path.abspath(script_name), 4, bamgineer_mem,
                sample_id, bamhelp.name)
            task_list.append(process)
        # Block until every submitted task finishes, then write the sentinel.
        pipelineHelpers.CheckTaskStatus(task_list, output_sentinel, log, log_msg)
        pipelineHelpers.Logging('INFO', log,
                                log_msg + 'Finished Sampling Loss Event')
def mutate_loss(inputs, output_sentinel, outputs, sample_id, prev_sentinel):
    """mutating reads according to haplotype at germline SNP locations

    For each input ROI BAM, writes a qsub shell script that mutates het-SNP
    reads, sorts them, diffs against the original, merges het and non-het
    reads, and cleans up intermediates; then submits the scripts and waits.

    Args:
        inputs: nested list; inputs[0] holds the sorted ROI BAM paths.
        output_sentinel: sentinel file written when all tasks complete.
        outputs: task output file list (unused here; kept for task signature).
        sample_id: sample identifier, used in log messages and script paths.
        prev_sentinel: sentinel of the prerequisite stage, checked before work.
    """
    task_list = []
    log_msg = ' [implement_cnv] ' + '[' + sample_id + '] '
    if pipelineHelpers.CheckSentinel(prev_sentinel, log, log_msg):
        pipelineHelpers.Logging('INFO', log, log_msg + 'Starting')
        current_path = params.GetProgramPath()
        script_path = pipelineHelpers.GetScriptPath(sample_id, bamhelp.name)
        bamgineer_mem = bamhelp.GetBamgineerMem('med')
        (sentinel_path, results_path, haplotype_path, cancer_dir_path,
         tmpbams_path, finalbams_path) = taskHelpers.GetProjectNamePathRunID()
        for inp in inputs[0]:
            # Basename leads with the chromosome token, e.g. "chr4.loss...".
            chrom = os.path.basename(inp).strip().split(".")[0]  # renamed: avoid shadowing builtin chr()
            bedfn = "/".join([haplotype_path,
                              'loss_het_snp_' + chrom + '.bed'])
            diffn = "/".join([tmpbams_path, "diff.bam"])
            nonhet = "/".join([tmpbams_path,
                               'diff_only1_' + os.path.basename(inp)])
            hetfn = sub('.roi.sorted.bam$', ".mutated.het.bam", inp)
            hetfnsorted = sub('.roi.sorted.bam$', ".mutated.het.sorted.bam", inp)
            mergedsortfn = sub('.roi.sorted.bam$', ".mutated.merged.sorted.bam", inp)
            script_name = '{0}mutate_{1}_{2}.sh'.format(script_path, chrom, "loss")
            with open(script_name, 'w') as script:
                script.write('#!/bin/bash\n')
                # BUG FIX: was script.write('#') with no newline, which fused
                # this line with the next one into "##$ -cwd", commenting out
                # the SGE -cwd directive (siblings all write '#\n').
                script.write('#\n')
                script.write('#$ -cwd \n')
                script.write('module load samtools/1.2 \n')
                script.write('module load sambamba \n')
                script.write('module load bamUtil \n')
                # Deduplicate the SNP BED in place before mutating.
                script.write('sort -u {bf} -o {bf}\n\n'.format(bf=bedfn))
                script.write('python {path}/mutate.py {inp1} {bf} {happath}\n\n'.format(
                    inp1=inp, bf=bedfn, path=current_path, happath=haplotype_path))
                script.write('sambamba sort {het} -o {hetsort}\n\n'.format(
                    het=hetfn, hetsort=hetfnsorted))
                # Reads present only in the original (non-het) go to {dif}.
                script.write('bam diff --in1 {repairedbam} --in2 {hetsort} '
                             '--out {dif}\n\n'.format(repairedbam=inp,
                                                      hetsort=hetfnsorted,
                                                      dif=diffn))
                script.write('sambamba merge {merged} {hetonly} {nonhetonly}\n\n'.format(
                    merged=mergedsortfn, hetonly=hetfnsorted, nonhetonly=nonhet))
                script.write('rm {het} {nonhetonly} \n\n'.format(
                    het=hetfn, nonhetonly=nonhet))
            process = pipelineHelpers.RunTask(
                os.path.abspath(script_name), 4, bamgineer_mem,
                sample_id, bamhelp.name)
            task_list.append(process)
        pipelineHelpers.CheckTaskStatus(task_list, output_sentinel, log, log_msg)
        pipelineHelpers.Logging('INFO', log, log_msg + 'Finished Mutating')
def find_roi_bam(inputs, output_sentinel, outputs, sample_id, prev_sentinel):
    """finding ROI bam for each haplotype/event/chr

    For each (input, output) BAM pair, writes a qsub shell script that
    extracts read pairs overlapping the event's exon ROI BED with bedtools,
    sorts the result with sambamba, and removes the unsorted intermediate;
    then submits all scripts and waits on the batch.

    Args:
        inputs: nested list; inputs[0] holds name-sorted input BAM paths.
        output_sentinel: sentinel file written when all tasks complete.
        outputs: nested list; outputs[0] holds the per-event ROI BAM paths.
        sample_id: sample identifier, used in log messages and script paths.
        prev_sentinel: sentinel of the prerequisite stage, checked before work.
    """
    task_list = []
    log_msg = ' [FindRoiBam] ' + '[' + sample_id + '] '
    pipelineHelpers.Logging('INFO', log, log_msg + 'Starting')
    if pipelineHelpers.CheckSentinel(prev_sentinel, log, log_msg):
        script_path = pipelineHelpers.GetScriptPath(sample_id, bamhelp.name)
        bamgineer_mem = bamhelp.GetBamgineerMem('med')
        (sentinel_path, results_path, haplotype_path, cancer_dir_path,
         tmpbams_path, finalbams_path) = taskHelpers.GetProjectNamePathRunID()
        for inp, op in izip(inputs[0], outputs[0]):
            opsorted = sub('.bam$', ".sorted.bam", op)
            # Output basenames look like "<chr>.<event>....": split once and
            # reuse (original re-split for each field).
            name_parts = os.path.basename(op).strip().split(".")
            chrom = name_parts[0]  # renamed: avoid shadowing builtin chr()
            event = name_parts[1]
            exonsinroibed = "/".join([haplotype_path,
                                      event + "_exons_in_roi_" + str(chrom) + '.bed'])
            script_name = '{0}find_roi_{1}_{2}.sh'.format(script_path, chrom, event)
            # "with" guarantees the script is flushed/closed before submission
            # (original opened/closed by hand and leaked on error).
            with open(script_name, 'w') as script:
                script.write('#!/bin/bash\n\n')
                script.write('#\n')
                script.write('#$ -cwd \n')
                script.write('module load bedtools \n')
                script.write('module load sambamba \n')
                # Deduplicate the ROI BED in place before intersecting.
                script.write('sort -u {exonbed} -o {exonbed} \n'.format(
                    exonbed=exonsinroibed))
                script.write('bedtools pairtobed -abam {inp} '
                             '-b {bf} -type either > {outp} \n'.format(
                                 inp=inp, bf=exonsinroibed, outp=op))
                script.write('sambamba sort {outp} -o '
                             '{outpsorted} \n'.format(outp=op,
                                                      outpsorted=opsorted))
                script.write('rm {outp} \n'.format(outp=op))
            process = pipelineHelpers.RunTask(
                os.path.abspath(script_name), 1, bamgineer_mem,
                sample_id, bamhelp.name)
            task_list.append(process)
        pipelineHelpers.CheckTaskStatus(task_list, output_sentinel, log, log_msg)
        pipelineHelpers.Logging('INFO', log, log_msg + 'Finished FindROI')
import itertools import re import subprocess global bases bases = ('A','T','C','G') log = pipelineHelpers.GetLogFile('Bamgineer') import utils import vcf import gzip import shutil chr_list = range(1,23) event_list=['gain','loss'] sentinel_path, results_path,haplotype_path,cancer_dir_path,tmpbams_path,finalbams_path = taskHelpers.GetProjectNamePathRunID() def initPool(queue, level, terminating_): """ This causes the logging module to be initialized with the necessary info in pool threads to work correctly. """ logging.getLogger('').setLevel(level) global terminating terminating = terminating_ def initialize(): try: utils.createDirectory(results_path)