def init_file_names(chr, event, tmpbams_path, haplotypedir):
    """
    Build the four per-chromosome file paths used for one CNV event.

    Returns a list ``[roibam, sortbyname, sortbyCoord, hetsnp]`` where:
      * roibam      -- ``<tmpbams_path>/<chr><event>_roi.bam``
      * sortbyname  -- ``<splitbams>/<chr>.byname.bam``
      * sortbyCoord -- ``<splitbams>/<chr>.bam``
      * hetsnp      -- ``<haplotypedir>/<event>_het_snp_<chr>.bed``

    ``<splitbams>`` is whatever ``params.GetSplitBamsPath()`` returns.

    NOTE(review): this file defines ``init_file_names`` more than once with
    different parameter orders; the last definition executed is the one
    callers actually get.
    """
    split_dir = params.GetSplitBamsPath()

    roi_bam = "/".join((tmpbams_path, chr + event + "_roi.bam"))
    name_sorted_bam = "/".join((split_dir, chr + '.byname.bam'))
    coord_sorted_bam = "/".join((split_dir, chr + '.bam'))
    het_snp_bed = "/".join((haplotypedir, event + '_het_snp_' + chr + '.bed'))

    return [roi_bam, name_sorted_bam, coord_sorted_bam, het_snp_bed]
def find_non_roi_bam(chr_list):
    """
    Extract paired reads from original bam using generated non-ROI bed.

    Despite its name, ``chr_list`` is used as a single chromosome name
    string (it is concatenated directly into file names).

    Steps, when the shared ``terminating`` event is not set:
      1. If ``<finalbams_path>/<chr>_non_roi.sorted.bam`` already exists,
         nothing is recomputed.
      2. Otherwise, if ``<haplotype_path>/<chr>_non_roi.bed`` exists, it is
         de-duplicated in place with ``sort -u``, reads are extracted from
         the coordinate-sorted split bam, and the result is sorted and
         indexed; the unsorted intermediate is then removed.
      3. If the bed file is missing, a debug message is logged.

    Any exception (including Ctrl+C) is logged and sets ``terminating`` so
    sibling workers stop picking up work. Always returns ``None``.

    Relies on module-level ``params``, ``finalbams_path``,
    ``haplotype_path``, ``terminating`` and ``logger``.
    """
    chrom = chr_list
    splitbams = params.GetSplitBamsPath()
    sortbyCoord = "/".join([splitbams, chrom + '.bam'])
    nonroi = "/".join([finalbams_path, chrom + "_non_roi.bam"])
    exonsnonroibed = "/".join([haplotype_path, chrom + "_non_roi.bed"])
    success = False
    try:
        if not terminating.is_set():
            # Output prefix handed to pysam.sort; the file actually written
            # (and indexed below) is nonroisort + '.bam'.
            nonroisort = sub(r'\.bam$', '.sorted', nonroi)
            sorted_bam = nonroisort + '.bam'

            # The previous check tested the bare prefix, which this function
            # never creates, so finished work was always redone.
            if os.path.isfile(sorted_bam):
                success = True
            elif os.path.isfile(exonsnonroibed):
                # De-duplicate the bed file in place before extraction.
                cmd = " ".join(
                    ["sort -u", exonsnonroibed, "-o", exonsnonroibed])
                runCommand(cmd)
                print(" ___ extracting non-roi bams ___")
                extractAllReadsfromROI(sortbyCoord, exonsnonroibed, nonroi)
                removeIfEmpty(finalbams_path, ntpath.basename(nonroi))
                pysam.sort(nonroi, nonroisort)
                pysam.index(sorted_bam)
                # Only the sorted bam is kept.
                os.remove(nonroi)
                success = True
            else:
                logger.debug(exonsnonroibed + ' does not exist!')
                return
    except KeyboardInterrupt:
        logger.error(
            'Exception Crtl+C pressed in the child process in '
            'find_non_roi_bam for chr ' + chrom)
        terminating.set()
        return
    except Exception as e:
        logger.exception("Exception in find_non_roi_bam %s", e)
        terminating.set()
        return
    if success:
        logger.debug("find_non_roi_bam complete successfully for " + chrom)
    return
def init_file_names(chr, tmpbams_path, haplotypedir, event=''):
    """
    Assemble the per-chromosome file paths for an (optional) event suffix.

    Returns ``[roibam, sortbyname, sortbyCoord, hetsnp]``:
      * roibam      -- ``<tmpbams_path>/<chr>_roi<event>.bam``
      * sortbyname  -- ``<splitbams>/<chr>.byname.bam``
      * sortbyCoord -- ``<splitbams>/<chr>.bam``
      * hetsnp      -- ``<haplotypedir>/<chr>_het_snp<event>.bed``

    ``<splitbams>`` comes from ``params.GetSplitBamsPath()``; when that is
    falsy it falls back to ``<res_path>/splitbams`` (``res_path`` is a
    module-level global).
    """
    roi_bam = "/".join((tmpbams_path, chr + "_roi" + event + ".bam"))
    split_dir = params.GetSplitBamsPath()
    het_snp_bed = "/".join((haplotypedir, chr + '_het_snp' + event + '.bed'))

    # No configured split-bam directory: use the default under the results dir.
    if not split_dir:
        split_dir = "/".join((res_path, 'splitbams'))

    name_sorted_bam = "/".join((split_dir, chr + '.byname.bam'))
    coord_sorted_bam = "/".join((split_dir, chr + '.bam'))

    return [roi_bam, name_sorted_bam, coord_sorted_bam, het_snp_bed]
def mergeSortBamFiles(mergedBamfn, finalbamdir):
    """
    Merge (or sort) the ``*.bam`` files found under ``finalbamdir``.

    The working directory is changed to ``finalbamdir`` and the tree is
    walked for ``*.bam`` files; symlinks are resolved to their real path and
    duplicates are dropped.

      * More than one file: all are merged into ``mergedBamfn`` with
        ``sambamba merge`` (4 threads).
      * Exactly one file ending in ``GAIN.bam``: it is merged with the
        matching split bam, whose name is the lower-cased file name with
        ``_gain`` removed, taken from ``params.GetSplitBamsPath()``.
      * Exactly one file ending in ``LOSS.bam``: it is sorted to
        ``*.sort.bam`` via ``sortBam`` and the unsorted file is deleted.
      * Anything else: nothing is done.
    """
    # Only the last entry of the software-path tuple is needed here.
    _, _, _, _, _, sambamba_path = params.GetSoftwarePath()
    os.chdir(finalbamdir)

    bam_paths = []
    for root, _subdirs, filenames in os.walk(finalbamdir):
        for bam_name in fnmatch.filter(filenames, '*.bam'):
            candidate = os.path.join(root, bam_name)
            if os.path.islink(candidate):
                candidate = os.path.realpath(candidate)
            if candidate not in bam_paths:
                bam_paths.append(candidate)

    # Input list for the command line: most recently found file first, each
    # path followed by a single space (so a non-empty list ends in a space).
    command = "".join(bam + " " for bam in reversed(bam_paths))
    num_files = len(bam_paths)
    thread_count = str(4)

    if num_files > 1:
        runCommand(" ".join([
            sambamba_path, "merge", mergedBamfn, command, "--nthreads",
            thread_count
        ]))
        return

    if num_files != 1:
        return

    only_bam = str(command.strip())
    if only_bam.endswith("GAIN.bam"):
        fname = os.path.split(only_bam)[1]
        inbam_original = '/'.join(
            [params.GetSplitBamsPath(), sub('_gain', '', fname.lower())])
        runCommand(" ".join([
            sambamba_path, "merge", mergedBamfn, command, inbam_original,
            "--nthreads", thread_count
        ]))
    elif only_bam.endswith("LOSS.bam"):
        outbam = sub('.bam$', '.sort.bam', only_bam)
        sortBam(command, outbam, finalbamdir)
        os.remove(only_bam)
def init_file_names(chr, tmpbams_path, haplotypedir, event=''):
    """
    Initialize file names for: ROI bam, chromosome sorted by name,
    chromosome sorted by coordinate, and heterozygous SNPs bed.

    Parameters:
      chr           -- chromosome name string, used as the file-name stem.
      tmpbams_path  -- directory holding the ROI bam.
      haplotypedir  -- directory holding the het-SNP bed file.
      event         -- suffix inserted into the ROI bam and het-SNP bed
                       names. Defaults to the empty string so that
                       three-argument calls keep working.

    Returns ``[roibam, sortbyname, sortbyCoord, hetsnp]``:
      * roibam      -- ``<tmpbams_path>/<chr>_roi<event>.bam``
      * sortbyname  -- ``<splitbams>/<chr>.byname.bam``
      * sortbyCoord -- ``<splitbams>/<chr>.bam``
      * hetsnp      -- ``<haplotypedir>/<chr>_het_snp<event>.bed``

    ``<splitbams>`` is ``params.GetSplitBamsPath()``, or
    ``<res_path>/splitbams`` when that value is falsy (``res_path`` is a
    module-level global).
    """
    roibam = "/".join([tmpbams_path, chr + "_roi" + event + ".bam"])
    hetsnp = "/".join([haplotypedir, chr + '_het_snp' + event + '.bed'])

    splitbams = params.GetSplitBamsPath()
    if not splitbams:
        # No split-bam directory configured: use the default location.
        splitbams = "/".join([res_path, 'splitbams'])

    sortbyname = "/".join([splitbams, chr + '.byname.bam'])
    sortbyCoord = "/".join([splitbams, chr + '.bam'])
    return [roibam, sortbyname, sortbyCoord, hetsnp]
def CreateFileList(file_type, num_files, path, flag=None):
    """
    Build a list of file-name lists, driven by the rows of the input file.

    One ``file_list`` is created per row returned by
    ``ParseInfile(params.GetInfile())`` and filled according to ``flag``:

      * ``None``         -- ``file_type == 'bam'`` takes ``line[1]``;
                            otherwise ``file_type`` is used as a format
                            template: once with the sample id when
                            ``num_files == 1``, once per chromosome when
                            ``num_files >= 2``, else a single ``''``.
      * ``"extractROI"`` -- one entry per (chr, event) whose
                            ``<event>_exons_in_roi_chr<chr>.bed`` exists
                            under ``haplotype_path``.
      * ``"gain"`` / ``"loss"`` -- one entry per chromosome whose
                            ``chr<chr>.<flag>.roi.sorted.bam`` exists
                            under ``path``.
      * ``"FINAL"``      -- existing ``CHR<chr>_<EVENT>.bam`` files under
                            ``finalbams_path``; for a missing 'loss' bam a
                            symlink to the split bam is created instead.

    Any other ``flag`` value adds nothing. ``chr_list`` and ``event_list``
    are not defined here and must exist at module level.

    Returns ``job_list``, a list of ``file_list`` objects.

    NOTE(review): in the extractROI/gain/loss branches the nesting of
    ``job_list.append(file_list)`` was inferred by analogy with the FINAL
    branch (where the following ``elif`` fixes it); confirm against the
    original layout.
    """
    # Only haplotype_path and finalbams_path are used below.
    sentinel_path, results_path, haplotype_path, cancer_dir_path, tmpbams_path, finalbams_path = GetProjectNamePathRunID()
    job_list = []
    infile_list = ParseInfile(params.GetInfile())
    for line in infile_list:
        file_list = []
        job_info = line  # not read again in this function
        # Find the tumour_id for the sample
        sample_id = GetSampleIDs(line)
        if (flag is None):
            if file_type == 'bam':
                file_list.append(line[1])
            elif num_files == 1:
                file_list.append(path + file_type.format(sample_id))
            elif num_files >= 2:
                for chr in chr_list:
                    file_list.append(path + file_type.format('chr', str(chr)))
            else:
                file_list.append('')
            job_list.append(file_list)
        elif (flag == "extractROI"):
            for chr, event in itertools.product(chr_list, event_list):
                exonsinroibed = "/".join([
                    haplotype_path,
                    event + "_exons_in_roi_" + 'chr' + str(chr) + '.bed'
                ])
                if (os.path.isfile(exonsinroibed)):
                    # Joining a single-element list just yields `path`.
                    splittmpbams = "/".join([path])
                    file_list.append(splittmpbams + '/' + 'chr' +
                                     file_type.format(chr, event))
                    # The same file_list object is appended on every hit.
                    job_list.append(file_list)
        elif (flag == "gain"):
            for chr in chr_list:
                splittmpbams = "/".join([path])
                if (os.path.isfile(splittmpbams + '/' + 'chr' + str(chr) +
                                   '.gain.roi.sorted.bam')):
                    file_list.append(splittmpbams + '/' + 'chr' +
                                     file_type.format(chr, "gain"))
                    job_list.append(file_list)
        elif (flag == "loss"):
            for chr in chr_list:
                splittmpbams = "/".join([path])
                if (os.path.isfile(splittmpbams + '/' + 'chr' + str(chr) +
                                   '.loss.roi.sorted.bam')):
                    file_list.append(splittmpbams + '/' + 'chr' +
                                     file_type.format(chr, "loss"))
                    job_list.append(file_list)
        elif (flag == "FINAL"):
            for chr, event in itertools.product(chr_list, event_list):
                chrbam = "/".join([
                    finalbams_path,
                    'CHR' + str(chr) + '_' + event.upper() + '.bam'
                ])
                sortbyCoord = "/".join(
                    [params.GetSplitBamsPath(), 'chr' + str(chr) + '.bam'])
                if (os.path.isfile(chrbam)):
                    file_list.append(chrbam)
                    # Appends the same file_list object once per existing bam.
                    job_list.append(file_list)
                # sortbyCoord is a joined, non-empty string here, so only the
                # event test decides; the symlinked bam is not added to
                # file_list.
                elif (event == 'loss' and sortbyCoord):
                    os.symlink(sortbyCoord, chrbam)
    return job_list
def run_pipeline(results_path):
    """
    Run the whole pipeline for one results directory.

    Sequence, as visible in this function:
      1. Publish project paths and logging objects as module globals
         (worker functions read them).
      2. Build the chromosome list (chr1..chr22, chrX, chrY) and the
         chromosome/event work list from the files in the CNV directory.
      3. Create ``<results_path>/phasedvcfdir`` if missing and run the
         initialisation helpers.
      4. Using a 12-process pool: split the bam per chromosome (only when
         no split-bam path is configured), then run ``find_roi_bam`` and
         ``implement_cnv`` over the work list.
      5. Merge the final bams, delete the temporary bam directory, log the
         elapsed time and shut logging down.

    Exceptions raised inside the pool section are logged and the pool is
    terminated; they are not re-raised.

    NOTE(review): the nesting of ``initialize0``, ``params.SetSplitBamsPath``,
    ``result0`` and the statements after ``pool1.join()`` was inferred;
    confirm against the original layout.
    """
    print(results_path)
    global haplotype_path, cancer_dir_path, tmpbams_path, finalbams_path, log_path, logfile, terminating, logger, logQueue, res_path
    res_path = results_path
    haplotype_path, cancer_dir_path, tmpbams_path, finalbams_path, log_path, logfile = handle.GetProjectPaths(
        results_path)
    terminating, logger, logQueue = handle.GetLoggings(logfile)
    # chr_list is local to this function; it is not in the global statement.
    chr_list = ['chr' + str(x) for x in range(1, 23)]
    chr_list.extend(['chrX', 'chrY'])
    t0 = time.time()
    outbamfn = params.GetOutputFileName()
    # Every file with a dot in its name inside the CNV directory.
    cnv_list = glob.glob("/".join([params.GetCNVDir(), '*.*']))
    chromosome_event = create_chr_event_list(cnv_list, chr_list)
    logger.debug('pipeline started!')
    phase_path = '/'.join([results_path, 'phasedvcfdir'])
    if not os.path.exists('/'.join([results_path, 'phasedvcfdir'])):
        os.makedirs(phase_path)
    initialize0(phase_path, cancer_dir_path)
    for cnv_path in cnv_list:
        initialize_pipeline(phase_path, haplotype_path, cnv_path)
    # Each worker is initialised with the log queue, the current log level
    # and the shared terminating object.
    pool1 = multiprocessing.Pool(
        processes=12,
        initializer=initPool,
        initargs=[logQueue, logger.getEffectiveLevel(), terminating])
    try:
        if not params.GetSplitBamsPath():
            # No split bams configured: create them under the results dir.
            if not os.path.exists("/".join([res_path, 'splitbams'])):
                os.makedirs("/".join([res_path, 'splitbams']))
            params.SetSplitBamsPath("/".join([res_path, 'splitbams']))
            # map_async(...).get(timeout) with a very large timeout;
            # presumably used instead of map() so Ctrl+C can interrupt the
            # wait -- TODO confirm.
            result0 = pool1.map_async(split_bam_by_chr,
                                      chromosome_event).get(9999999)
        result1 = pool1.map_async(find_roi_bam, chromosome_event).get(9999999)
        result2 = pool1.map_async(implement_cnv,
                                  chromosome_event).get(9999999)
        pool1.close()
    except KeyboardInterrupt:
        logger.debug('You cancelled the program!')
        pool1.terminate()
    except Exception as e:
        logger.exception("Exception in main %s", e)
        pool1.terminate()
    finally:
        pool1.join()
    # The steps below also run after a handled exception, because the
    # handlers above do not re-raise.
    time.sleep(.1)
    mergeSortBamFiles(outbamfn, finalbams_path)
    t1 = time.time()
    shutil.rmtree(tmpbams_path)
    logger.debug(' ***** pipeline finished in ' +
                 str(round((t1 - t0) / 60.0, 1)) + ' minutes ***** ')
    logging.shutdown()