def merge_bam(self, outbam, inbam1, inbam2):
    """Merge two BAM files into ``outbam`` through ``pysam.merge``.

    The three paths are forwarded positionally, output first.  The original
    author annotated this helper as "does not work"; the call itself is
    unchanged.

    Returns None.
    """
    bam_paths = (outbam, inbam1, inbam2)
    pysam.merge(*bam_paths)
    return None
def merge_bam(self, outbam, inbam1, inbam2):
    """Combine ``inbam1`` and ``inbam2`` into ``outbam`` with ``pysam.merge``.

    Flagged by the original author as "does not work"; behaviour is kept
    exactly as written (output path first, then the two inputs).
    """
    merge_argv = [outbam, inbam1, inbam2]
    pysam.merge(*merge_argv)
def haplotype_assignment_nmask(self):
    """Assign reads per chromosome, then merge, sort and clean up the BAMs.

    For every chromosome key returned by ``self.read_in_vcf_nmasked()`` this
    calls ``self.haplotype_assignment_nmask_bychrom`` and records the expected
    per-chromosome output ``<outDir>/nmask.<chrom>.bam``.  The per-chromosome
    files are merged into ``<outDir>/nmask.bam`` (or simply renamed when there
    is only one), ``bam2sort`` is run on the ``<outDir>/nmask`` prefix, and
    the intermediate files are deleted.

    Fixes relative to the previous version:
    * the merge used an undefined name (``merge_parameters``) and raised
      ``NameError`` whenever more than one chromosome was present;
    * in the single-chromosome path the input had already been renamed, so
      the cleanup loop failed trying to remove it.
    """
    sys.stdout.write("Reading in VCF and editing files\n")
    VCF, VCFids = self.read_in_vcf_nmasked()
    RNAedit = self.read_in_rna_editing()

    input_files = []
    for c in VCF:
        sys.stdout.write("Assigning haplotype reads in chromosome " + str(c) + "\n")
        self.haplotype_assignment_nmask_bychrom(c, VCF[c], VCFids[c], RNAedit[c])
        input_files.append(os.path.join(self.outDir, "nmask." + str(c) + ".bam"))
    sys.stdout.write("Done assigning haplotype reads\nPrinting report file\n")

    sys.stdout.write("Merging chromosome bam files\n")
    merged_bam = os.path.join(self.outDir, "nmask.bam")
    if len(input_files) > 1:
        pysam.merge("-f", merged_bam, *input_files)
    else:
        # A single chromosome needs no merge; just take over its file.
        os.rename(input_files[0], merged_bam)

    sys.stdout.write("Sorting and indexing haplotype specific bam files\n")
    bam2sort(os.path.join(self.outDir, "nmask"))

    sys.stdout.write("Cleaning up files\n")
    os.remove(merged_bam)
    for fn in input_files:
        # After the rename above the single input no longer exists.
        if os.path.exists(fn):
            os.remove(fn)
    os.remove(os.path.join(self.outDir, "nmask.fa"))
    sys.stdout.write("Done!\n")
    return
def bam_merge(*args):
    """
    Wrapper for the pysam SAMtools merge function

    The files are merged into a temporary ``<first>_merge.bam`` and the
    result is then copied over the first file.

    Parameters
    ----------
    *args : str or list of str
        Locations of the BAM files to merge, given either as separate
        positional arguments or as a single list.  The first file is both an
        input and the destination that receives the merged result.

    Returns
    -------
    bool
        True on success, False if copying the merged file over the first
        file raised an IOError (the temporary file is left in place then).
    """
    import shutil  # local import: only needed for the streaming copy below

    logger.info("Merging:")

    bam_files = args[0] if isinstance(args[0], list) else args
    final_bam = bam_files[0]
    tmp_bam = final_bam + "_merge.bam"

    pysam.merge("-f", tmp_bam, *bam_files)  # pylint: disable=no-member

    try:
        with open(tmp_bam, "rb") as f_in, open(final_bam, "wb") as f_out:
            # Stream in chunks; BAM files are routinely larger than RAM.
            shutil.copyfileobj(f_in, f_out)
    except IOError:
        return False

    os.remove(tmp_bam)

    return True
def haplotype_assignment_nmask(self):
    """Assign reads per chromosome and merge the per-chromosome BAM files.

    For every chromosome key returned by ``self.read_in_vcf_nmasked()`` this
    calls ``self.haplotype_assignment_nmask_bychrom`` and records the expected
    output ``<outDir>/nmask.<chrom>.bam``.  The files are merged into
    ``<outDir>/nmask.bam`` (or renamed when there is only one).

    Sorting/indexing and cleanup are intentionally disabled in this variant;
    only the corresponding status messages are printed.

    Fix relative to the previous version: the merge used an undefined name
    (``merge_parameters``) and raised ``NameError`` whenever more than one
    chromosome was present.
    """
    sys.stdout.write("Reading in VCF and editing files\n")
    VCF, VCFids = self.read_in_vcf_nmasked()
    RNAedit = self.read_in_rna_editing()

    input_files = []
    for c in VCF:
        sys.stdout.write("Assigning haplotype reads in chromosome " + str(c) + '\n')
        self.haplotype_assignment_nmask_bychrom(c, VCF[c], VCFids[c], RNAedit[c])
        input_files.append(os.path.join(self.outDir, "nmask." + str(c) + ".bam"))
    sys.stdout.write("Done assigning haplotype reads\nPrinting report file\n")

    sys.stdout.write("Merging chromosome bam files\n")
    merged_bam = os.path.join(self.outDir, "nmask.bam")
    if len(input_files) > 1:
        pysam.merge('-f', merged_bam, *input_files)
    else:
        # A single chromosome needs no merge; just take over its file.
        os.rename(input_files[0], merged_bam)

    # Sorting, indexing and removal of intermediates are switched off here.
    sys.stdout.write("Sorting and indexing haplotype specific bam files - cancelled 2\n")
    sys.stdout.write("Cleaning up files - cancelled 2\n")
    sys.stdout.write("Done!\n")
    return
def __merge_bams(self, bams=None):
    """Validate that every path in ``bams`` exists and run ``pysam.merge``.

    Args:
        bams: iterable of BAM paths.  They are forwarded, in order, as the
            positional arguments of ``pysam.merge``.
            NOTE(review): samtools merge treats its first positional
            argument as the output file -- confirm that callers put the
            destination first.

    Returns:
        True when the merge call completes.

    Raises:
        RuntimeError: if a path is not an existing file or the merge fails.

    Fixes relative to the previous version: the paths were passed to
    ``pysam.merge`` as one list object instead of separate arguments; the
    mutable default argument and the bare ``except`` are gone; the errors now
    say what went wrong.
    """
    bams = [] if bams is None else list(bams)

    for bam_path in bams:
        if not os.path.isfile(bam_path):
            raise RuntimeError("BAM file not found: %s" % bam_path)

    try:
        pysam.merge(*bams)
    except Exception as err:
        raise RuntimeError("pysam merge failed: %s" % err)
    return True
def merge_bam(input_bams, output_bam, tag=None, threads=1):
    """
    Merge all input files into a single output file, keeping sorted order
    on a given tag.

    ``tag`` (when not None) is passed as ``-t <tag>``.  With more than one
    thread the options ``-c -p -s 0 -@ <threads>`` are added.  Every argument
    is converted to ``str`` before being handed to ``pysam.merge``.
    """
    tag_opts = ["-t", tag] if tag is not None else []
    thread_opts = ["-c", "-p", "-s", "0", "-@", threads] if threads > 1 else []

    argv = tag_opts + thread_opts + [output_bam] + list(input_bams)
    pysam.merge(*[str(item) for item in argv])
def merge_bams(bams: List[str]) -> str:
    """
    Merge input bams using samtools.

    Kept at module level (not nested inside `split`) because a local function
    cannot be pickled for multiprocessing.

    :param bams: Name of the final bam followed by the bams to merge. Because
        of how it is called using multiprocessing, the bam basename is the
        first element of the list.
    :return: The output bam name (real path of ``bams[0] + ".bam"``).
    """
    merged_path = os.path.realpath(bams[0] + ".bam")
    inputs = bams[1:]
    merge_argv = ["-c", "-p", merged_path]
    merge_argv.extend(inputs)
    pysam.merge(*merge_argv)
    return merged_path
def haplotype_assignment(self):
    """Assign reads to haplotypes per chromosome, then report/merge/clean up.

    For every chromosome key returned by ``self.read_in_vcf()`` this calls
    ``self.haplotype_assignment_bychrom`` and stores its three return values
    as the ``h1``/``h2``/``c`` counts.  The expected per-chromosome outputs
    are ``<outDir>/hap1.<chrom>.bam`` and ``<outDir>/hap2.<chrom>.bam``.

    Unless ``self.nomerge`` is set, a tab-separated report is written to
    ``<outDir>/report.assignment.txt``, the per-chromosome files are merged
    into ``hap1.bam``/``hap2.bam`` (or renamed when there is only one
    chromosome), ``bam2sort`` is run on both prefixes and the intermediate
    files are removed.

    NOTE(review): the source this was recovered from had lost its
    indentation; the extent of the ``if not self.nomerge`` block (report
    through cleanup) is reconstructed from the flag's meaning -- confirm.

    Fixes relative to the previous version: the report file is closed even
    when writing fails, and cleanup no longer fails on per-chromosome files
    that were renamed in the single-chromosome path.
    """
    sys.stdout.write("Reading in VCF and editing files\n")
    VCF, VCFids = self.read_in_vcf()
    RNAedit = self.read_in_rna_editing()

    counts = defaultdict(lambda: defaultdict(int))
    input_files1, input_files2 = [], []
    for c in VCF:
        sys.stdout.write("Assigning haplotype reads in chromosome " + str(c) + "\n")
        counts[c]["h1"], counts[c]["h2"], counts[c]["c"] = self.haplotype_assignment_bychrom(
            c, VCF[c], VCFids[c], RNAedit[c]
        )
        input_files1.append(os.path.join(self.outDir, "hap1." + str(c) + ".bam"))
        input_files2.append(os.path.join(self.outDir, "hap2." + str(c) + ".bam"))
    sys.stdout.write("Done assigning haplotype reads\nPrinting report file\n")

    if not self.nomerge:
        with open(os.path.join(self.outDir, "report.assignment.txt"), "w") as report_out:
            for c in counts:
                fields = (c, counts[c]["h1"], counts[c]["h2"], counts[c]["c"])
                report_out.write("\t".join(str(field) for field in fields) + "\n")

        sys.stdout.write("Merging chromosome bam files\n")
        hap1_bam = os.path.join(self.outDir, "hap1.bam")
        hap2_bam = os.path.join(self.outDir, "hap2.bam")
        if len(input_files1) > 1:
            pysam.merge("-f", hap1_bam, *input_files1)
            pysam.merge("-f", hap2_bam, *input_files2)
        else:
            # A single chromosome needs no merge; just take over its files.
            os.rename(input_files1[0], hap1_bam)
            os.rename(input_files2[0], hap2_bam)

        sys.stdout.write("Sorting and indexing haplotype specific bam files\n")
        bam2sort(os.path.join(self.outDir, "hap1"))
        bam2sort(os.path.join(self.outDir, "hap2"))

        sys.stdout.write("Cleaning up files\n")
        os.remove(hap1_bam)
        os.remove(hap2_bam)
        for fn in input_files1 + input_files2:
            # After the rename above the single inputs no longer exist.
            if os.path.exists(fn):
                os.remove(fn)
        os.remove(os.path.join(self.outDir, "hap1.fa"))
        os.remove(os.path.join(self.outDir, "hap2.fa"))

    sys.stdout.write("Done!\n")
    return
def bam_merge(list_of_bams, merged_bam, ncore=1):
    """
    Merge any number of (sorted) bam files into ``merged_bam``.

    Every input is passed to ``bam_index(..., force=False)`` first.  With
    more than one input the files are merged with ``pysam.merge`` and the
    result is indexed; otherwise the only input and its ``.bai`` are copied
    to the destination.
    """
    for bam in list_of_bams:
        bam_index(bam, force=False, ncore=ncore)

    if not len(list_of_bams) > 1:
        # os.symlink() doesn't work with multi_bam_coverage()
        only_bam = list_of_bams[0]
        shutil.copy2(only_bam, merged_bam)
        shutil.copy2(f"{only_bam}.bai", f"{merged_bam}.bai")
        return

    merge_argv = ["-f", f"-@ {samc(ncore)}", merged_bam] + list_of_bams
    pysam.merge(*merge_argv)  # noqa: pysam bug
    bam_index(merged_bam)
def combine_samfiles(multi=False, clipped=False):
    """Combine the unclipped (and optionally clipped) SAM files into one SAM.

    File names are fixed and relative to the current directory.  With
    ``multi`` the ``*_multimap`` inputs are used and the result is written to
    ``multi_mapped.sam``; otherwise the ``*_unique`` inputs are used and the
    result goes to ``unique_mapped.sam``.

    Steps visible in the code: convert the unclipped SAM to BAM, optionally
    convert the clipped SAM and merge both, name-sort into ``tmp2.bam``, dump
    that back to SAM (with header) and remove the intermediates with ``rm``.

    NOTE(review): this is Python 2 code (``print`` statement) and relies on
    ``pysam.view`` returning an iterable of output chunks.
    NOTE(review): the source this was recovered from had lost its
    indentation.  The nesting below (size check guards only the conversion;
    the ``else`` belongs to ``if clipped``) is a reconstruction -- confirm
    against the original file.
    NOTE(review): ``out`` is never closed explicitly.
    """
    #Separate out clipped and unclipped!
    #Look at naming!
    if multi:
        sam1 = "unclipped_multimap.sam"
        sam2 = "clipped_multimap.sam"
        bam1 = "unclipped_multimap.bam"
        bam2 = "clipped_multimap.bam"
        out = open("multi_mapped.sam", "w")
    else:
        sam1 = "unclipped_unique.sam"
        sam2 = "clipped_unique.sam"
        bam1 = "unclipped_unique.bam"
        bam2 = "clipped_unique.bam"
        out = open("unique_mapped.sam", "w")
    #Convert unclipped sam to bam
    #Converts sam to bam
    bam1_o = open(bam1, "w")
    a = pysam.view("-bS", sam1)
    for r in a:
        bam1_o.write(r)
    bam1_o.close()
    #Converts clipped sam to bam
    if clipped == True:
        if os.stat(sam2).st_size > 0: #Checking file is not empty
            try:
                bam2_o = open(bam2, "w")
                b = pysam.view("-bS", sam2)
                for r in b:
                    bam2_o.write(r)
                bam2_o.close()
            except:
                # Any failure here is treated as "the clipped SAM is empty".
                print "Samtools raised error, will assume Sam file is empty!"
        #Merge clipped and unclipped
        # "-f" is placed after the output name; the resulting argument order
        # is: tmp1.bam -f <bam1> <bam2>
        input_filenames = ["-f", bam1, bam2]
        output_filename = "tmp1.bam"
        merge_parameters = [output_filename] + input_filenames
        pysam.merge(*merge_parameters)
        # Name-sort the merged file using the legacy "<in> <out prefix>" form.
        pysam.sort("-n", "tmp1.bam", "tmp2")
        subprocess.call(["rm", sam2, bam2])
    else: #If no clipped bam, just sort
        pysam.sort("-n", bam1, "tmp2")
    #Converts file to sam
    d = pysam.view("-h", "tmp2.bam")
    for r in d:
        out.write(r)
    # tmp1.bam is listed unconditionally, also when no merge was performed.
    subprocess.call(["rm", "tmp2.bam", "tmp1.bam", sam1, bam1])
def combine_samfiles(multi=False, clipped=False):
    """Combine the unclipped (and optionally clipped) SAM files into one SAM.

    File names are fixed and relative to the current directory.  With
    ``multi`` the ``*_multimap`` inputs are used and the result is written to
    ``multi_mapped.sam``; otherwise the ``*_unique`` inputs are used and the
    result goes to ``unique_mapped.sam``.

    Steps visible in the code: convert the unclipped SAM to BAM, optionally
    convert the clipped SAM and merge both, name-sort into ``tmp2.bam``, dump
    that back to SAM (with header) and remove the intermediates with ``rm``.

    NOTE(review): this is Python 2 code (``print`` statement) and relies on
    ``pysam.view`` returning an iterable of output chunks.
    NOTE(review): the source this was recovered from had lost its
    indentation.  The nesting below (size check guards only the conversion;
    the ``else`` belongs to ``if clipped``) is a reconstruction -- confirm
    against the original file.
    NOTE(review): ``out`` is never closed explicitly.
    """
    #Separate out clipped and unclipped!
    #Look at naming!
    if multi:
        sam1 = "unclipped_multimap.sam"
        sam2 = "clipped_multimap.sam"
        bam1 = "unclipped_multimap.bam"
        bam2 = "clipped_multimap.bam"
        out = open("multi_mapped.sam", "w")
    else:
        sam1 = "unclipped_unique.sam"
        sam2 = "clipped_unique.sam"
        bam1 = "unclipped_unique.bam"
        bam2 = "clipped_unique.bam"
        out = open("unique_mapped.sam", "w")
    #Convert unclipped sam to bam
    #Converts sam to bam
    bam1_o = open(bam1, "w")
    a = pysam.view("-bS", sam1)
    for r in a:
        bam1_o.write(r)
    bam1_o.close()
    #Converts clipped sam to bam
    if clipped == True:
        if os.stat(sam2).st_size > 0: #Checking file is not empty
            try:
                bam2_o = open(bam2, "w")
                b = pysam.view("-bS", sam2)
                for r in b:
                    bam2_o.write(r)
                bam2_o.close()
            except:
                # Any failure here is treated as "the clipped SAM is empty".
                print "Samtools raised error, will assume Sam file is empty!"
        #Merge clipped and unclipped
        # "-f" is placed after the output name; the resulting argument order
        # is: tmp1.bam -f <bam1> <bam2>
        input_filenames = ["-f", bam1, bam2]
        output_filename = "tmp1.bam"
        merge_parameters = [output_filename] + input_filenames
        pysam.merge(*merge_parameters)
        # Name-sort the merged file using the legacy "<in> <out prefix>" form.
        pysam.sort("-n", "tmp1.bam", "tmp2" )
        subprocess.call(["rm", sam2, bam2])
    else: #If no clipped bam, just sort
        pysam.sort("-n", bam1, "tmp2" )
    #Converts file to sam
    d = pysam.view("-h", "tmp2.bam")
    for r in d:
        out.write(r)
    # tmp1.bam is listed unconditionally, also when no merge was performed.
    subprocess.call(["rm", "tmp2.bam", "tmp1.bam", sam1, bam1])
def merge(out_file_name, input_file_names, threads=1):
    """Merge ``input_file_names`` into ``out_file_name`` with ``pysam.merge``.

    With ``threads > 1`` the options ``-c -p -s 0 -@ <threads>`` are placed
    in front of the file names.  All names are converted to ``str``.
    """
    # The samtools command line version can fail when the total length of
    # the command line gets too long, so the pysam API is used instead of
    # spawning a subprocess.
    thread_opts = ["-c", "-p", "-s", "0", "-@", str(threads)] if threads > 1 else []
    file_args = [str(out_file_name)] + [str(name) for name in input_file_names]
    pysam.merge(*(thread_opts + file_args))
def main():
    """Sort and merge the successfully aligned BAM files of one folder.

    The folder is taken from ``-i`` (default: current directory).  A ``.bam``
    file is included only if its sibling ``<name>.bs_seeker2_log`` exists and
    its last line contains "END".  Every included file is sorted with
    ``pysam.sort`` (legacy "<in> <out prefix>" form), the sorted files are
    merged, and the sorted intermediates are removed.

    The merged file is named after the first included file, with the last
    "_"-separated token of its base name replaced by "merged".

    Exits with status 1 when no BAM file qualifies.

    Fixes relative to the previous version:
    * the last log line is tracked per file, so an empty log can no longer
      reuse the last line of the previously inspected log;
    * the log file is closed after reading;
    * only I/O errors are treated as "no alignment log file";
    * the merged name is derived from the file's base name, so directory
      parts containing "." or "_" (including the default "./") no longer
      corrupt the output path.
    """
    parser = OptionParser()
    parser.add_option("-i", dest="inbam", type='string', help="the folder of input bam files")

    (options, args) = parser.parse_args()

    if not options.inbam:
        options.inbam = "."

    open_log(os.path.join(options.inbam, 'MyBamPostProcess.py_log'))
    logm('Program starts!')

    candidates = [os.path.join(options.inbam, x) for x in os.listdir(options.inbam) if x.endswith(".bam")]
    file_list = []

    # Check log file to see if alignment is successful.
    for f in candidates:
        log_path = os.path.splitext(f)[0] + ".bs_seeker2_log"
        last_line = ""
        try:
            with open(log_path) as log_file:
                for last_line in log_file:
                    pass  # Go to the last line
        except (IOError, OSError):
            logm("File %s has no alignment log file."%f)
            continue
        if "END" in last_line:
            file_list.append(f)
            logm("File %s is included."%f)
        else:
            logm("File %s is excluded."%f)

    if len(file_list) == 0:
        sys.stderr.write('ERROR: no bam files available for post process.\n')
        sys.exit(1)

    # Sort
    sorted_list = []
    for inputsam in file_list:
        sortedsam = inputsam + "_sorted"
        pysam.sort(inputsam, sortedsam)
        sorted_list.append(sortedsam+".bam")
    logm('Individual bam file sorting finished.')

    # Merge: build "<dir>/<stem with last token -> merged>.<ext...>"
    first_dir, first_name = os.path.split(file_list[0])
    name_parts = first_name.split(".")
    stem_tokens = name_parts[0].split("_")
    stem_tokens[-1] = "merged"
    name_parts[0] = "_".join(stem_tokens)
    mergedsam = os.path.join(first_dir, ".".join(name_parts))

    merge_params = [mergedsam] + sorted_list
    pysam.merge(*merge_params)
    logm('Merging finished.')

    # Remove sortedsams
    for f in sorted_list:
        os.remove(f)

    close_log()
def merge_sorted_fragments(bam_fname, file_fragments, do_not_index=False):
    """Merge ``<fragment>.sorted`` files into ``bam_fname`` and remove them.

    The merge runs ``pysam.merge`` with the combined flags ``-rpcf``.  Unless
    ``do_not_index`` is true, ``bam_fname`` is indexed afterwards into
    ``bam_fname + '.bai'``.  Elapsed times are logged at debug level.
    """
    logger.debug('Merging sorted BAM fragments ...')
    merge_started = time.time()
    pysam.merge('-rpcf', bam_fname, *[frag + '.sorted' for frag in file_fragments])
    merge_finished = time.time()
    logger.debug('... {:0.2f}s'.format(merge_finished - merge_started))

    logger.debug('Removing fragments')
    for frag in file_fragments:
        os.remove(frag + '.sorted')

    if do_not_index:
        return

    logger.debug('BAM index ...')
    index_started = time.time()
    pysam.index(bam_fname, bam_fname + '.bai')
    index_finished = time.time()
    logger.debug('... {:0.2f}s'.format(index_finished - index_started))
def parallel_computation(source_file: Path, destination: Path, cores=4, index_source=False):
    """Run ``size_select_bam`` per chromosome in parallel and merge the results.

    The source BAM is indexed into ``<cwd>/<source name>.bai``; one
    ``size_select_bam`` job per entry of ``CHRS`` is run with ``cores``
    workers inside a temporary directory; the returned names are
    natural-sorted on the part before the first "." and merged into
    ``destination`` (header taken from the source via ``-h``), which is then
    indexed.

    ``index_source`` is accepted but not used by this function.
    """
    with tempfile.TemporaryDirectory() as scratch:
        scratch_dir = Path(scratch)
        worker = delayed(size_select_bam)
        runner = Parallel(n_jobs=cores)

        bai_path = Path.cwd() / (source_file.name + ".bai")
        pysam.index("-@", str(cores), str(source_file), str(bai_path))
        assert bai_path.exists()

        per_chrom = runner(
            worker(source_file, chrom, scratch_dir, cores, bai_path) for chrom in CHRS
        )
        ordered = natsorted(per_chrom, key=lambda name: name.split(".")[0])

        merge_argv = ["-h", str(source_file), "-@", str(cores), str(destination)]
        merge_argv.extend(ordered)
        pysam.merge(*merge_argv)
        pysam.index("-@", str(cores), str(destination))
def merger(TDIR, batch):
    """Merge a batch of BAM files into a new temporary file.

    Args:
        TDIR: path to the temp dir for temporary merge files,
            e.g. "/var/tmp/adfas7d".
        batch: list of files to merge.

    Returns:
        Path of the merged BAM file (created inside ``TDIR``).

    The temporary name is now reserved with ``tempfile.mkstemp`` instead of
    the race-prone ``tempfile.mktemp``; the merge overwrites the reserved
    (empty) file because ``-f`` is passed.
    """
    import os  # local import: only needed to close the reserved descriptor

    # reserve the tempfile name
    fd, bam = tempfile.mkstemp(dir=TDIR)
    os.close(fd)

    # merge bams
    arguments = ["-f", bam]
    arguments.extend(batch)
    pysam.merge(*arguments)

    # return merged bam
    return bam
def merge(self, output_bam, input_bam_1, input_bam_2):
    """Merge two BAM files into ``output_bam`` and index the result.

    Nothing is merged (a message is written to stderr and None is returned)
    when either input is missing or when ``output_bam`` already exists.

    Fix relative to the previous version: the message for a missing input
    said "does exist" instead of "does not exist".
    """
    # Check input files
    for input_bam in (input_bam_1, input_bam_2):
        if not os.path.exists(input_bam):
            sys.stderr.write(
                "Input file %s does not exist. Merging not possible.\n" % input_bam)
            return

    # Check output file: never overwrite an existing result
    if os.path.exists(output_bam):
        sys.stderr.write(
            "Output file %s already exists. Merging not possible.\n" % output_bam)
        return

    # Merge and generate index
    pysam.merge(output_bam, input_bam_1, input_bam_2)
    pysam.index(output_bam)
def merge_bam_files(output_bam, input_bam):
    """
    merge the bam files into a single result file

    @args output_bam: merged result file
    @type output_bam: str
    @args input_bam: list of input bam files
    @type input_bam: list

    Exits (SystemExit with an error message) when an input file is missing
    or when the merge fails.

    Fixes relative to the previous version: the failure handler referenced
    an undefined name ``e`` and therefore raised NameError instead of
    reporting the error; all files in ``input_bam`` are merged, not only the
    first two.
    """
    for bam_file in input_bam:
        if not os.path.isfile(bam_file):
            raise SystemExit("error: failed to fetch alignment file %s\n" % bam_file)

    try:
        pysam.merge(output_bam, *input_bam)
    except Exception as e:
        raise SystemExit("error: running pysam merge\n%s" % str(e))
def preprocess_sam(sam_files, datasets, tmp_dir="/dev/shm/talon/", n_threads=0):
    """ Copy and rename the provided SAM/BAM file(s), merge them, and index.
        This is necessary in order to use Pybedtools commands on the reads.
        The renaming is necessary in order to label the reads according to
        their dataset.

        Args:
            sam_files: SAM/BAM paths; a ".sam" input is first converted with
                convert_to_bam.
            datasets: dataset names, paired with sam_files by position; each
                sorted copy is written to <tmp_dir>/<dataset>.bam.
            tmp_dir: working directory, created if missing.
            n_threads: value passed to samtools via "-@".

        Returns:
            Path of the merged BAM (<tmp_dir>/merged.bam).

        Raises:
            RuntimeError: if merging or indexing fails.
            OSError: if tmp_dir cannot be created.

        Changes relative to the previous version: the directory is created
        with os.makedirs instead of a shell command, paths are built with
        os.path.join (tmp_dir no longer needs a trailing slash), and the
        merge options precede the file names as in the samtools usage.
    """
    # Create the tmp dir (equivalent of "mkdir -p" without going through a shell)
    try:
        os.makedirs(tmp_dir)
    except OSError:
        if not os.path.isdir(tmp_dir):
            raise

    # Copy and rename SAM files with dataset names to ensure correct RG tags
    renamed_sams = []
    for sam, dataset in zip(sam_files, datasets):
        suffix = "." + sam.split(".")[-1]
        if suffix == ".sam":
            bam_copy = os.path.join(tmp_dir, dataset + "_unsorted.bam")
            convert_to_bam(sam, bam_copy)
            sam = bam_copy
        sorted_bam = os.path.join(tmp_dir, dataset + ".bam")
        pysam.sort("-@", str(n_threads), "-o", sorted_bam, sam)
        renamed_sams.append(sorted_bam)

    merged_bam = os.path.join(tmp_dir, "merged.bam")
    merge_args = ["-f", "-r", "-@", str(n_threads), merged_bam] + renamed_sams

    # Merge datasets and use -r option to include a read group tag
    try:
        pysam.merge(*merge_args)
        pysam.index(merged_bam)
        ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())
        print("[ %s ] Merged input SAM/BAM files" % (ts))
    except Exception:
        raise RuntimeError(("Problem merging and indexing SAM/BAM files. "
                            "Check your file paths and make sure that all "
                            "files have headers."))
    return merged_bam
def haplotype_assignment(self):
    """Assign reads to haplotypes per chromosome, then report and merge.

    For every chromosome key returned by ``self.read_in_vcf()`` this calls
    ``self.haplotype_assignment_bychrom`` and stores its three return values
    as the ``h1``/``h2``/``c`` counts.  The expected per-chromosome outputs
    are ``<outDir>/hap1.<chrom>.bam`` and ``<outDir>/hap2.<chrom>.bam``.

    Unless ``self.nomerge`` is set, a tab-separated report is written to
    ``<outDir>/report.assignment.txt`` and the per-chromosome files are
    merged into ``hap1.bam``/``hap2.bam`` (or renamed when there is only one
    chromosome).  Sorting/indexing and cleanup are intentionally disabled in
    this variant; only the corresponding status messages are printed.

    NOTE(review): the source this was recovered from had lost its
    indentation; the extent of the ``if not self.nomerge`` block is
    reconstructed from the flag's meaning -- confirm.

    Fix relative to the previous version: the report file is closed even
    when writing fails.
    """
    sys.stdout.write("Reading in VCF and editing files\n")
    VCF, VCFids = self.read_in_vcf()
    RNAedit = self.read_in_rna_editing()

    counts = defaultdict(lambda: defaultdict(int))
    input_files1, input_files2 = [], []
    for c in VCF:
        sys.stdout.write("Assigning haplotype reads in chromosome " + str(c) + '\n')
        counts[c]['h1'], counts[c]['h2'], counts[c]['c'] = self.haplotype_assignment_bychrom(
            c, VCF[c], VCFids[c], RNAedit[c])
        input_files1.append(os.path.join(self.outDir, "hap1." + str(c) + ".bam"))
        input_files2.append(os.path.join(self.outDir, "hap2." + str(c) + ".bam"))
    sys.stdout.write("Done assigning haplotype reads\nPrinting report file\n")

    if not self.nomerge:
        with open(os.path.join(self.outDir, 'report.assignment.txt'), 'w') as report_out:
            for c in counts:
                fields = (c, counts[c]['h1'], counts[c]['h2'], counts[c]['c'])
                report_out.write('\t'.join(str(field) for field in fields) + '\n')

        sys.stdout.write("Merging chromosome bam files\n")
        hap1_bam = os.path.join(self.outDir, "hap1.bam")
        hap2_bam = os.path.join(self.outDir, "hap2.bam")
        if len(input_files1) > 1:
            pysam.merge('-f', hap1_bam, *input_files1)
            pysam.merge('-f', hap2_bam, *input_files2)
        else:
            # A single chromosome needs no merge; just take over its files.
            os.rename(input_files1[0], hap1_bam)
            os.rename(input_files2[0], hap2_bam)

        # Sorting, indexing and removal of intermediates are switched off.
        sys.stdout.write("Sorting and indexing haplotype specific bam files - cancelled\n")
        sys.stdout.write("Cleaning up files - cancelled\n")

    sys.stdout.write("Done!\n")
    return
def merge_bamFiles(basenameNoExtension):
    """Merge the per-flag BAM files into Forward/Reverse strand files.

    The module-level ``StrandnessType`` selects the pairing:

    * "fr-firstrand":    99 + 147 -> Reverse, 163 + 83 -> Forward
      (existing outputs are overwritten, ``-f``)
    * "fr-secondstrand": 99 + 147 -> Forward, 163 + 83 -> Reverse
      (no ``-f`` is passed in this branch)

    Any other value does nothing.
    """
    def with_suffix(tag):
        # "<basename>_<tag>.bam"
        return basenameNoExtension + "_" + tag + ".bam"

    if StrandnessType == "fr-firstrand":
        pysam.merge('-f', with_suffix("Reverse"), with_suffix("99"), with_suffix("147"))
        pysam.merge('-f', with_suffix("Forward"), with_suffix("163"), with_suffix("83"))
        return

    if StrandnessType == "fr-secondstrand":
        pysam.merge(with_suffix("Forward"), with_suffix("99"), with_suffix("147"))
        pysam.merge(with_suffix("Reverse"), with_suffix("163"), with_suffix("83"))
def merge_bams(bams: list, output_path: str, threads: int = 4):
    """Merge bamfiles to output_path

    When a single bam file is supplied, the bam file (and its index) is
    moved to output_path. Otherwise the files are merged, the result is
    indexed and all input bam files and their indices are removed.

    Args:
        bams : list or tuple containing paths to indexed bam files to merge
        output_path (str): target path
        threads (int): number of threads handed to samtools (``-@``)

    Returns:
        output_path (str)

    Raises:
        AssertionError: if ``threads < 1`` or an input lacks a ``.bai`` index.

    Fixes relative to the previous version:
    * the index check tested ``bams[0]`` for every file instead of each file;
    * the samtools options were passed to pysam as one string, so only the
      thread count could be parsed; they are now separate arguments placed
      before the file names;
    * the external samtools call no longer goes through a shell string, so
      paths with spaces or shell metacharacters are handled safely.
    """
    assert threads >= 1

    if len(bams) == 1:
        assert os.path.exists(bams[0] + '.bai'), 'Only indexed files can be merged'
        move(bams[0], output_path)
        move(bams[0] + '.bai', output_path + '.bai')
        return output_path

    assert all(os.path.exists(bam + '.bai') for bam in bams), 'Only indexed files can be merged'

    # -f overwrite output, -p/-c: combine colliding @PG/@RG header IDs
    merge_options = ['-@', str(threads), '-f', '-p', '-c']
    if which('samtools') is None:
        pysam.merge(*merge_options, output_path, *bams)
    else:
        import subprocess  # local import: only needed on this path

        # Prefer the samtools binary when it is installed; the exit status
        # is not checked, as before.
        subprocess.call(['samtools', 'merge'] + merge_options + [output_path] + list(bams))

    pysam.index('-@', str(threads), output_path)

    for o in bams:
        os.remove(o)
        os.remove(o + '.bai')
    return output_path
def bam_merge(bam_ins, bam_out):
    """
    merge multiple bam files
    input: list of bam files
    input: out.bam

    Exits via sys.exit when any input is missing. When bam_out already
    exists nothing is done; otherwise the inputs are merged into
    "<bam_out>.unsorted.bam", sorted into bam_out, indexed, and the unsorted
    intermediate is removed.
    """
    # check input files
    missing = [b for b in bam_ins if not os.path.exists(b)]
    if missing:
        sys.exit('BAM files not exists:' + '\n'.join(missing))

    # check output file: an existing result is left untouched
    if os.path.exists(bam_out):
        return

    # merge
    unsorted_bam = bam_out + '.unsorted.bam'
    pysam.merge('-f', unsorted_bam, *bam_ins)  # overwrite output BAM
    pysam.sort('-o', bam_out, unsorted_bam)
    pysam.index(bam_out)
    os.remove(unsorted_bam)
def merge_bams(out_file, out_dir, bams):
    """Sort every input BAM, merge the sorted files and sort the result.

    Each file in ``bams`` is passed to ``sort_bam(out_dir, ...)``; the
    originals are then handed to ``rm_bams``.  The sorted files are merged
    into ``out_file`` (overwriting it), the merged file is sorted again via
    ``sort_bam``, ``out_file`` and the sorted inputs are removed.

    Returns whatever ``sort_bam`` returned for the merged file.

    Fix relative to the previous version: the progress message applied ``%``
    to the return value of ``print(...)``, which raises TypeError on
    Python 3; the formatting now happens inside the call.
    """
    s_bams = [sort_bam(out_dir, bam_file) for bam_file in bams]
    rm_bams(bams)

    in_files = ', '.join(s_bams)
    print("Merging bam files %s into '%s'" % (in_files, out_file))

    merge_parameters = ['-f', out_file] + s_bams
    pysam.merge(*merge_parameters)

    sorted_bam = sort_bam(out_dir, out_file)

    try:
        os.remove(out_file)
    except OSError:
        # Best effort: the unsorted merge result is only an intermediate.
        print("2 Couldn't remove %s" % out_file)

    rm_bams(s_bams)

    return sorted_bam
def merge_bam(self, data_dir, project_id, final_id, run_ids=[]):
    """
    Merge together all the bams in a directory and sort to create the final
    bam ready to be filtered

    If run_ids is blank then the function looks for all bam files in the
    data_dir

    The output is written to ``data_dir + project_id + '/' + final_id +
    '.bam'`` and indexed.  Paths are built by plain string concatenation, so
    ``data_dir`` is expected to end with a path separator.

    NOTE(review): this is Python 2 code (``print`` statement).
    NOTE(review): when ``run_ids`` is empty the directory listing is filtered
    on the suffix "sai", not "bam" -- confirm this is intended.
    NOTE(review): the source this was recovered from had lost its
    indentation; placing the final ``pysam.index`` after the if/else (so it
    runs in both cases) is a reconstruction -- confirm.
    """
    out_bam_file = data_dir + project_id + '/' + final_id + '.bam'

    if len(run_ids) == 0:
        bam_files = [ f for f in listdir(data_dir + project_id) if f.endswith(("sai")) ]
    else:
        bam_files = [f + ".bam" for f in run_ids]

    # Both lists receive the same full paths.
    bam_sort_files = []
    bam_merge_files = []
    for bam in bam_files:
        bam_loc = data_dir + project_id + '/' + bam
        bam_sort_files.append(bam_loc)
        bam_merge_files.append(bam_loc)

    # Sort every input; input and output path are the same file.
    for bam_sort_file in bam_sort_files:
        print bam_sort_file
        pysam.sort("-o", str(bam_sort_file), str(bam_sort_file))

    if len(bam_sort_files) == 1:
        # Single input: sorting it straight into the output replaces the merge.
        pysam.sort("-o", str(out_bam_file), str(bam_sort_files[0]))
    else:
        pysam.merge(out_bam_file, *bam_merge_files)
        # Re-sort the merged file onto itself, with an explicit temp prefix.
        pysam.sort("-o", str(out_bam_file), "-T", str(out_bam_file) + ".bam_sort", str(out_bam_file))

    pysam.index(str(out_bam_file))
def merge_bams(bams, output_path):
    """Merge bamfiles to output_path

    When a single bam file is supplied, the bam file (and its index) is
    moved to output_path. Otherwise the files are merged with 4 threads and
    compression level 1, the result is indexed and all input bam files and
    their indices are removed.

    Args:
        bams : list or tuple containing paths to bam files to merge
        output_path (str): target path

    Returns:
        output_path (str)

    Fix relative to the previous version: the samtools options were passed
    as one string ('-@ 4 -f -l 1 -c'), so only the thread count could be
    parsed and -f/-l/-c were never applied; they are now separate arguments
    placed before the file names.
    """
    if len(bams) == 1:
        move(bams[0], output_path)
        move(bams[0] + '.bai', output_path + '.bai')
        return output_path

    pysam.merge('-@', '4', '-f', '-l', '1', '-c', output_path, *bams)
    pysam.index('-@', '4', output_path)

    for o in bams:
        os.remove(o)
        os.remove(o + '.bai')
    return output_path
def main(args):
    """
    Main entry point for this script.

    :param args: parsed arguments object. ``args.command`` is a list whose
                 first element is skipped and whose second element selects
                 the mode ("all" or "auto"); ``args.o`` is the output
                 directory holding the per-chromosome ``hap1.<chrom>.bam``
                 and ``hap2.<chrom>.bam`` files.

    "auto" merges chromosomes 1-22, "all" additionally merges X and Y, each
    into ``<outDir>/hap1.bam`` and ``<outDir>/hap2.bam``.  With no mode the
    help text is printed and the program exits; an unknown mode prints an
    error followed by the help text; a missing ``-o`` exits with a message.

    Changes relative to the previous version: the file lists are generated
    instead of being spelled out four times, and the unknown-command message
    now names the right subcommand ("merge") without the typo.
    """
    helpStr = "------------------------------------------------------------\n" +\
              " MERGING BAM FILES \n" +\
              "------------------------------------------------------------\n" +\
              "After generating allele specific bam files for each \n" +\
              "chromosome, rPGA can merge them. \n" +\
              " \n" +\
              "To merge bam files for all 22 autosomal chromosomes: \n" +\
              " \n" +\
              "$ rPGA merge auto \n" +\
              " \n" +\
              "To merge bam files for all 22 autosomes and X and Y: \n" +\
              " \n" +\
              "$ rPGA merge all \n"

    command = (args.command)[1:]
    if len(command) < 1:
        sys.stderr.write(helpStr + "\n\n")
        sys.exit()
    command = command[0].strip().lower()

    autosomes = [str(number) for number in range(1, 23)]
    chromosomes_by_command = {
        "all": autosomes + ["X", "Y"],
        "auto": autosomes,
    }

    if command not in chromosomes_by_command:
        sys.stderr.write("rPGA merge -- unknown command: " + command + "\n")
        sys.stderr.write(helpStr + "\n\n")
        return

    if not args.o:
        sys.stderr.write("Must provide output directory -o\n\n")
        sys.exit()
    outDir = args.o

    for hap in ("hap1", "hap2"):
        inputs = [outDir + '/' + hap + '.' + chrom + '.bam'
                  for chrom in chromosomes_by_command[command]]
        pysam.merge('-f', outDir + '/' + hap + '.bam', *inputs)
if bypass: pysam.view("-@", str(thread), "--no-PG", "-L", "bypass.bed", "-b", "-o" "bypassed.bam", inName, catch_stdout=False) ##pysam bug # while bypassing.poll() == None: # continue #merging = Popen([merge_cmd], shell = True) pysam.merge("--no-PG", "-c", "-p", "-b", toMergeF, "-O", "BAM", "-@", str(thread), ouName) # while merging.poll() == None: # continue # rehead_cmd = "samtools view -H " + inName + " | samtools reheader -P - " + ouName # rehead_cmd += " > rehead.bam" + " ; mv rehead.bam " + ouName ## print(rehead_cmd) ##reheading = Popen([rehead_cmd], shell = True) # pysam.view("-H", "-o", "header.sam", inName, catch_stdout=False) # pysam.reheader("-P", "-i", "header.sam", ouName) # while reheading.poll() == None: # continue
# Fragment of a larger routine; mapped5, mapped3, mappedUs, tmpdir, outbam,
# args, allnames and dumpf are defined outside this excerpt.
# NOTE(review): the original indentation was lost; the statements are shown
# flat here -- confirm their nesting against the original file.

# Remove the three intermediate mapping files.
os.remove(mapped5)
os.remove(mapped3)
os.remove(mappedUs)

# Sorting the resulting file. We do it this way because the original
# file wasn't guaranteed to be sorted. To try to merge the three files
# (split, 5' and 3'), we'd need to assume some sorting order and I'm
# not willing to do that. Sorting afterward enforces the 'samtools'
# name ordering over anything that might have been there originally.

# Legacy sort call form: input file followed by an output prefix; the
# result "<prefix>.bam" then replaces outbam.
bsorted=os.path.join(tmpdir, "sorted")
pysam.sort("-n", outbam, bsorted)
os.rename(bsorted+".bam", outbam)

####################################################################################################
# Stitching two files together to reform a single BAM file.

# Name-ordered merge ("-n") into args.output, overwriting it ("-f").
cmd=["-n", "-f", args.output]+allnames
pysam.merge(*cmd)
for x in allnames:
    os.remove(x)

# Mopping up.

import shutil
shutil.rmtree(tmpdir)
dumpf.close()

####################################################################################################
# Fragment of a larger routine; okread, is_first, sout, mapped5, mapped3,
# mappedUs, tmpdir, outbam, args, allnames and dumpf are defined outside
# this excerpt.
# NOTE(review): the original indentation was lost; the statements are shown
# flat here, but the first two presumably belong to an enclosing loop or
# branch -- confirm against the original file.

# Flag the read as second-in-pair when it is not the first, then write it.
okread.is_read2 = not is_first
sout.write(okread)

# Remove the three intermediate mapping files.
os.remove(mapped5)
os.remove(mapped3)
os.remove(mappedUs)

# Sorting the resulting file. We do it this way because the original
# file wasn't guaranteed to be sorted. To try to merge the three files
# (split, 5' and 3'), we'd need to assume some sorting order and I'm
# not willing to do that. Sorting afterward enforces the 'samtools'
# name ordering over anything that might have been there originally.

# Name-sort outbam into a temporary file, which then replaces outbam.
bsorted = os.path.join(tmpdir, "sorted.bam")
pysam.sort("-o", bsorted, "-n", outbam)
os.rename(bsorted, outbam)

####################################################################################################
# Stitching two files together to reform a single BAM file.

# Name-ordered merge ("-n") into args.output, overwriting it ("-f").
cmd = ["-n", "-f", args.output] + allnames
pysam.merge(*cmd)
for x in allnames:
    os.remove(x)

# Mopping up.

import shutil
shutil.rmtree(tmpdir)
dumpf.close()

####################################################################################################
def main():
    """Phase reads aligned against two genomes and write per-genome BAM files.

    Steps, as implemented below:

    1. Read settings through ``getfile()`` and SNP tables through
       ``readsnpfile()``.
    2. Copy both input alignments to temporary BAMs, suffixing every read
       name with ``_g1`` or ``_g2``, then merge and name-sort them so that
       records sharing a base read name are adjacent.
    3. Walk the name-sorted file, group records by base read name, classify
       each group with the phasing helper selected by ``pt`` and write it
       out through ``readswrite`` / ``pairsreadswrite``.
    4. Prepend the original headers (plus a ``@PG`` line) to the header-less
       SAM outputs and convert them to BAM with external ``samtools``.
    5. Remove temporary files and write/print a summary report.

    Exits with status 2 when temporary-file clean-up raises.
    """
    # ``sth`` is used both as a string (command-line arguments, shell command
    # text) and via ``int(sth)``, so it is expected to be a numeric string.
    g1_al, g2_al, g1_s, gop1, gop2, pt, sth = getfile()
    makefile(gop1 + "_genome", gop2 + "_genome")
    g1snp, g2snp = readsnpfile(g1_s)
    g1samfile = pysam.AlignmentFile(g1_al, "r")
    g2samfile = pysam.AlignmentFile(g2_al, "r")
    # NOTE(review): exitfile() is defined elsewhere; presumably it clears a
    # stale file of that name before it is rewritten - confirm.
    exitfile("tmp1.bam")
    exitfile("tmp2.bam")
    exitfile("tmp3.bam")
    exitfile("tmp_sortname.bam")
    # Tag every record with the genome it was aligned to so the origin
    # survives the merge below.
    tmp1 = pysam.AlignmentFile("tmp1.bam",
                               "wb",
                               template=g1samfile,
                               threads=int(sth))
    for r in g1samfile:
        r.query_name = r.query_name + "_g1"
        tmp1.write(r)
    tmp1.close()
    tmp2 = pysam.AlignmentFile("tmp2.bam",
                               "wb",
                               template=g2samfile,
                               threads=int(sth))
    for r in g2samfile:
        r.query_name = r.query_name + "_g2"
        tmp2.write(r)
    tmp2.close()
    # Merge both tagged files, then sort by read name ("-n") so the grouping
    # loops below can detect a new read by a change of base name.
    pysam.merge("-@", sth, "tmp3.bam", "tmp1.bam", "tmp2.bam")
    pysam.sort("-@", sth, "-n", "-o", "tmp_sortname.bam", "tmp3.bam")
    tmp_all = pysam.AlignmentFile("tmp_sortname.bam", "rb")
    # NOTE(review): tmp_bamheader is not read again in this function.
    tmp_bamheader = tmp_all.text
    g1_header = g1samfile.text
    g2_header = g2samfile.text
    snpsupport = open(gop1 + "_" + gop2 + "_" + "support.txt", "w")
    # Eight header-less SAM outputs (add_sam_header=False): four per genome
    # directory. The proper header is prepended later from tmp4/tmp5.header.
    g1_g1readsfile = pysam.AlignmentFile(gop1 + "_genome/" + gop1 +
                                         "reads.sam",
                                         "w",
                                         template=tmp_all,
                                         threads=int(sth),
                                         add_sam_header=False)
    g1_g2readsfile = pysam.AlignmentFile(gop1 + "_genome/" + gop2 +
                                         "reads.sam",
                                         "w",
                                         template=tmp_all,
                                         threads=int(sth),
                                         add_sam_header=False)
    g1_unknowreadsfile = pysam.AlignmentFile(gop1 + "_genome/" +
                                             "unknownreads.sam",
                                             "w",
                                             template=tmp_all,
                                             threads=int(sth),
                                             add_sam_header=False)
    g1_g1onlyreadsfile = pysam.AlignmentFile(gop1 + "_genome/" + gop1 +
                                             "onlyreads.sam",
                                             "w",
                                             template=tmp_all,
                                             threads=int(sth),
                                             add_sam_header=False)
    g2_g1readsfile = pysam.AlignmentFile(gop2 + "_genome/" + gop1 +
                                         "reads.sam",
                                         "w",
                                         template=tmp_all,
                                         threads=int(sth),
                                         add_sam_header=False)
    g2_g2readsfile = pysam.AlignmentFile(gop2 + "_genome/" + gop2 +
                                         "reads.sam",
                                         "w",
                                         template=tmp_all,
                                         threads=int(sth),
                                         add_sam_header=False)
    g2_unknowreadsfile = pysam.AlignmentFile(gop2 + "_genome/" +
                                             "unknownreads.sam",
                                             "w",
                                             template=tmp_all,
                                             threads=int(sth),
                                             add_sam_header=False)
    g2_g2onlyreadsfile = pysam.AlignmentFile(gop2 + "_genome/" + gop2 +
                                             "onlyreads.sam",
                                             "w",
                                             template=tmp_all,
                                             threads=int(sth),
                                             add_sam_header=False)
    # Counters keyed by the category string returned by the phasing helpers.
    stat_number = {
        "g1": 0,
        "g2": 0,
        "g1only": 0,
        "g2only": 0,
        "unk": 0,
        "single": 0,
        "discarded": 0
    }
    # All four branches share one pattern: accumulate the records of one read
    # in ``samlist``; when the base name changes, classify and write the
    # finished group, then start a new one. The final group is flushed after
    # the loop.
    # NOTE(review): str.replace strips every "_g1"/"_g2" occurrence, and the
    # "_g1" test runs first, so a read whose original name already contains
    # one of these substrings may be grouped or assigned wrongly - confirm
    # that input read names cannot contain them.
    # NOTE(review): if tmp_all holds no records, the post-loop flush still
    # calls the phasing helper with empty lists.
    if pt == "Isoseq":
        r0_name = ""
        samlist = {"g1": [], "g2": []}
        for r in tmp_all:
            if r0_name != r.query_name.replace("_g1", "").replace(
                    "_g2", "") and r0_name != "":
                readt, support = isoreadsphase(samlist, g1snp, g2snp, gop1,
                                               gop2)
                readswrite(samlist, readt, support, g1_g1readsfile,
                           g1_g2readsfile, g1_unknowreadsfile,
                           g1_g1onlyreadsfile, g2_g1readsfile, g2_g2readsfile,
                           g2_unknowreadsfile, g2_g2onlyreadsfile, snpsupport)
                stat_number[readt] += 1
                r0_name = r.query_name.replace("_g1", "").replace("_g2", "")
                samlist = {"g1": [], "g2": []}
            elif r0_name == "":
                # Very first record: just remember its base name.
                r0_name = r.query_name.replace("_g1", "").replace("_g2", "")
            if r.query_name.find("_g1") != -1:
                samlist["g1"].append(r)
            elif r.query_name.find("_g2") != -1:
                samlist["g2"].append(r)
        # Flush the last group.
        readt, support = isoreadsphase(samlist, g1snp, g2snp, gop1, gop2)
        stat_number[readt] += 1
        readswrite(samlist, readt, support, g1_g1readsfile, g1_g2readsfile,
                   g1_unknowreadsfile, g1_g1onlyreadsfile, g2_g1readsfile,
                   g2_g2readsfile, g2_unknowreadsfile, g2_g2onlyreadsfile,
                   snpsupport)
    elif pt == "RNAseq":
        r0_name = ""
        samlist = {
            "g1_mate1": [],
            "g1_mate2": [],
            "g2_mate1": [],
            "g2_mate2": []
        }
        for r in tmp_all:
            if r0_name != r.query_name.replace("_g1", "").replace(
                    "_g2", "") and r0_name != "":
                readt, support = rnapairreadsphase(samlist, g1snp, g2snp, gop1,
                                                   gop2)
                # A "single" result counts once; every other category counts
                # both mates.
                if readt == "single":
                    stat_number[readt] += 1
                else:
                    stat_number[readt] += 2
                pairsreadswrite(samlist, readt, support, g1_g1readsfile,
                                g1_g2readsfile, g1_unknowreadsfile,
                                g1_g1onlyreadsfile, g2_g1readsfile,
                                g2_g2readsfile, g2_unknowreadsfile,
                                g2_g2onlyreadsfile, snpsupport)
                r0_name = r.query_name.replace("_g1", "").replace("_g2", "")
                samlist = {
                    "g1_mate1": [],
                    "g1_mate2": [],
                    "g2_mate1": [],
                    "g2_mate2": []
                }
            elif r0_name == "":
                r0_name = r.query_name.replace("_g1", "").replace("_g2", "")
            # Records flagged neither read1 nor read2 are not collected.
            if r.query_name.find("_g1") != -1:
                if r.is_read1:
                    samlist["g1_mate1"].append(r)
                elif r.is_read2:
                    samlist["g1_mate2"].append(r)
            elif r.query_name.find("_g2") != -1:
                if r.is_read1:
                    samlist["g2_mate1"].append(r)
                elif r.is_read2:
                    samlist["g2_mate2"].append(r)
        # Flush the last group.
        readt, support = rnapairreadsphase(samlist, g1snp, g2snp, gop1, gop2)
        if readt == "single":
            stat_number[readt] += 1
        else:
            stat_number[readt] += 2
        pairsreadswrite(samlist, readt, support, g1_g1readsfile,
                        g1_g2readsfile, g1_unknowreadsfile, g1_g1onlyreadsfile,
                        g2_g1readsfile, g2_g2readsfile, g2_unknowreadsfile,
                        g2_g2onlyreadsfile, snpsupport)
    elif pt == "BSseq":
        # Same flow as the RNAseq branch, classified by wgbsreadsphase().
        r0_name = ""
        samlist = {
            "g1_mate1": [],
            "g1_mate2": [],
            "g2_mate1": [],
            "g2_mate2": []
        }
        for r in tmp_all:
            if r0_name != r.query_name.replace("_g1", "").replace(
                    "_g2", "") and r0_name != "":
                readt, support = wgbsreadsphase(samlist, g1snp, g2snp, gop1,
                                                gop2)
                if readt == "single":
                    stat_number[readt] += 1
                else:
                    stat_number[readt] += 2
                pairsreadswrite(samlist, readt, support, g1_g1readsfile,
                                g1_g2readsfile, g1_unknowreadsfile,
                                g1_g1onlyreadsfile, g2_g1readsfile,
                                g2_g2readsfile, g2_unknowreadsfile,
                                g2_g2onlyreadsfile, snpsupport)
                r0_name = r.query_name.replace("_g1", "").replace("_g2", "")
                samlist = {
                    "g1_mate1": [],
                    "g1_mate2": [],
                    "g2_mate1": [],
                    "g2_mate2": []
                }
            elif r0_name == "":
                r0_name = r.query_name.replace("_g1", "").replace("_g2", "")
            if r.query_name.find("_g1") != -1:
                if r.is_read1:
                    samlist["g1_mate1"].append(r)
                elif r.is_read2:
                    samlist["g1_mate2"].append(r)
            elif r.query_name.find("_g2") != -1:
                if r.is_read1:
                    samlist["g2_mate1"].append(r)
                elif r.is_read2:
                    samlist["g2_mate2"].append(r)
        # Flush the last group.
        readt, support = wgbsreadsphase(samlist, g1snp, g2snp, gop1, gop2)
        if readt == "single":
            stat_number[readt] += 1
        else:
            stat_number[readt] += 2
        pairsreadswrite(samlist, readt, support, g1_g1readsfile,
                        g1_g2readsfile, g1_unknowreadsfile, g1_g1onlyreadsfile,
                        g2_g1readsfile, g2_g2readsfile, g2_unknowreadsfile,
                        g2_g2onlyreadsfile, snpsupport)
    elif pt == "Riboseq":
        # Same flow as the Isoseq branch, classified by riboreadsphase().
        r0_name = ""
        samlist = {"g1": [], "g2": []}
        for r in tmp_all:
            if r0_name != r.query_name.replace("_g1", "").replace(
                    "_g2", "") and r0_name != "":
                readt, support = riboreadsphase(samlist, g1snp, g2snp, gop1,
                                                gop2)
                readswrite(samlist, readt, support, g1_g1readsfile,
                           g1_g2readsfile, g1_unknowreadsfile,
                           g1_g1onlyreadsfile, g2_g1readsfile, g2_g2readsfile,
                           g2_unknowreadsfile, g2_g2onlyreadsfile, snpsupport)
                stat_number[readt] += 1
                r0_name = r.query_name.replace("_g1", "").replace("_g2", "")
                samlist = {"g1": [], "g2": []}
            elif r0_name == "":
                r0_name = r.query_name.replace("_g1", "").replace("_g2", "")
            if r.query_name.find("_g1") != -1:
                samlist["g1"].append(r)
            elif r.query_name.find("_g2") != -1:
                samlist["g2"].append(r)
        # Flush the last group.
        readt, support = riboreadsphase(samlist, g1snp, g2snp, gop1, gop2)
        stat_number[readt] += 1
        readswrite(samlist, readt, support, g1_g1readsfile, g1_g2readsfile,
                   g1_unknowreadsfile, g1_g1onlyreadsfile, g2_g1readsfile,
                   g2_g2readsfile, g2_unknowreadsfile, g2_g2onlyreadsfile,
                   snpsupport)
    g1_g1readsfile.close()
    g1_g2readsfile.close()
    g1_unknowreadsfile.close()
    g1_g1onlyreadsfile.close()
    g2_g1readsfile.close()
    g2_g2readsfile.close()
    g2_unknowreadsfile.close()
    g2_g2onlyreadsfile.close()
    snpsupport.close()
    # Path stems (without extension) of the SAM files written above.
    g1file = [
        gop1 + "_genome/" + gop1 + "reads", gop1 + "_genome/" + gop2 + "reads",
        gop1 + "_genome/unknownreads", gop1 + "_genome/" + gop1 + "onlyreads"
    ]
    g2file = [
        gop2 + "_genome/" + gop1 + "reads", gop2 + "_genome/" + gop2 + "reads",
        gop2 + "_genome/unknownreads", gop2 + "_genome/" + gop2 + "onlyreads"
    ]
    # Each output gets the header of the matching input alignment plus a @PG
    # line recording this invocation.
    PG = '@PG\tID:phasing.py\tPN:PP2PG\tVN:1.0.0\tCL:' + ' '.join(
        sys.argv) + '\n'
    tmp4 = open("tmp4.header", "w")
    tmp4.write(g1_header + PG)
    tmp4.close()
    tmp5 = open("tmp5.header", "w")
    tmp5.write(g2_header + PG)
    tmp5.close()
    # NOTE(review): os.system() returns the exit status instead of raising,
    # so a failing cat/samtools/rm does not reach the except branch and the
    # success message is printed regardless. The commands are also built by
    # string concatenation, so paths containing shell metacharacters or
    # spaces would break them.
    for f in g1file:
        exitfile(f + ".bam")
        exitfile(f + ".tmp.sam")
        try:
            os.system("cat tmp4.header " + f + ".sam >" + f + ".tmp.sam")
            print("Excuting: samtools view -@ " + sth + " -b -S " + f +
                  ".tmp.sam -o " + f + ".bam\n")
            os.system("samtools view -@ " + sth + " -b -S " + f +
                      ".tmp.sam -o " + f + ".bam")
            os.system("rm " + f + ".sam " + f + ".tmp.sam")
        except Exception as e:
            print(e)
            sys.exit(2)
        else:
            print("*** " + f + ".sam converted bam file successfully. ***\n")
    for f in g2file:
        exitfile(f + ".bam")
        exitfile(f + ".tmp.sam")
        try:
            os.system("cat tmp5.header " + f + ".sam >" + f + ".tmp.sam")
            print("Excuting: samtools view -@ " + sth + " -b -S " + f +
                  ".tmp.sam -o " + f + ".bam\n")
            os.system("samtools view -@ " + sth + " -b -S " + f +
                      ".tmp.sam -o " + f + ".bam")
            os.system("rm " + f + ".sam " + f + ".tmp.sam")
        except Exception as e:
            print(e)
            sys.exit(2)
        else:
            print("*** " + f + ".sam converted bam file successfully. ***\n")
    # NOTE(review): the SNP support file written during phasing is deleted
    # here together with the temporary files - confirm that is intended.
    try:
        os.remove("tmp1.bam")
        os.remove("tmp2.bam")
        os.remove("tmp3.bam")
        os.remove("tmp_sortname.bam")
        os.remove("tmp4.header")
        os.remove("tmp5.header")
        os.remove(gop1 + "_" + gop2 + "_support.txt")
    except Exception as e:
        print(e)
        sys.exit(2)
    else:
        print("*** Deleted tmp file successfully. ***\n")
    g1samfile.close()
    g2samfile.close()
    tmp_all.close()
    # "single" results are reported together with "unk" as unknown reads;
    # the "discarded" counter is not part of the report.
    g1reads_stats = stat_number["g1"]
    g2reads_stats = stat_number["g2"]
    unknowreads_stats = stat_number["unk"] + stat_number["single"]
    g1onlyreads_stats = stat_number["g1only"]
    g2onlyreads_stats = stat_number["g2only"]
    # NOTE(review): a ``pt`` outside these four keys skips every phasing
    # branch above and then raises KeyError at ``datatype[pt]``.
    datatype={"Isoseq":"Iso-Seq (PacBio Isoform Sequence).",\
    "RNAseq":"RNA-Seq for paired-end reads.",\
    "BSseq":"BS-Seq (Bisulfite Sequencing) for paired-end reads." ,\
    "Riboseq":"Ribo-seq (Ribosome Profiling)."}
    head = """ ========================== Final Phasing Reads Report ========================== """ + "Phasing Data Type:\t" + datatype[pt] + """ Note: When calculating the Separation Rate, the reads which are unmapped in two parental genomes are discarded. 
Type of reads:\tRead Counts """
    # Separation rate = (g1 + g2) / (g1 + g2 + g1only + g2only + unknown).
    # NOTE(review): raises ZeroDivisionError when all five counts are zero.
    report=gop1+"-synteny reads:\t"+str(g1reads_stats)+"\n"+\
    gop2+"-synteny reads:\t "+str(g2reads_stats)+"\n"+\
    "Unknown reads:\t"+str(unknowreads_stats)+"\n"+\
    gop1+"-only reads:\t"+str(g1onlyreads_stats)+"\n"+\
    gop2+"-only reads:\t"+str(g2onlyreads_stats)+"\n\n"+\
    "Separation Rate:\t"+str(round((float(g1reads_stats)+float(g2reads_stats))/(float(g1reads_stats)+float(g2reads_stats)+float(g1onlyreads_stats)+float(g2onlyreads_stats)+float(unknowreads_stats))*100,2))+"%\n"
    file_report = open(gop1 + "_" + gop2 + "_Phasing_Report.txt", "w")
    file_report.write(head + report)
    file_report.close()
    print(head + report)
    print(""" ================= Phasing Finished! ================= """)
sort_and_index(out_base + '_WO.bam') if barcode_type == 2: out_fr.close() out_rr.close() print('I found', tot, fcount, rcount, frcount, rrcount, count_wo) print('sorting - indexing') sort_and_index(out_base + '_F.bam') sort_and_index(out_base + '_R.bam') sort_and_index(out_base + '_FR.bam') sort_and_index(out_base + '_RR.bam') elif barcode_type == 1: if sys.argv[5] == 'True': print('merging') pysam.merge("-f", out_base + '_F_plus_R.bam', out_base + '_F.bam', out_base + '_R.bam') print('sorting - indexing') sort_and_index(out_base + '_F_plus_R.bam') print('delete startig files') os.remove(out_base + '_F.bam') os.remove(out_base + '_R.bam') elif sys.argv[5] == 'False': sort_and_index(out_base + '_F.bam') sort_and_index(out_base + '_R.bam') else: print('Merge is either True or False') sys.exit(1) print('I found', tot, fcount, rcount, count_wo) else:
def merge_sam_files(sam_dir_path, out_sam_path):
    """Merge every ``*.sam`` file in a directory into a single output file.

    Parameters
    ----------
    sam_dir_path : str
        Directory searched (non-recursively) for ``*.sam`` inputs.
    out_sam_path : str
        Destination passed to ``samtools merge -f``; an existing file is
        overwritten. Its parent directory is created when missing.

    Raises
    ------
    pysam.SamtoolsError
        Propagated from ``pysam.merge`` when samtools reports a failure.
    """
    out_abs = os.path.abspath(out_sam_path)
    # glob order is arbitrary, so sort for a reproducible merge order. The
    # output itself is excluded: with "-f", a stale result left in
    # ``sam_dir_path`` by an earlier run would otherwise be merged into itself.
    sam_files = sorted(
        path for path in glob.glob(os.path.join(sam_dir_path, '*.sam'))
        if os.path.abspath(path) != out_abs
    )
    # dirname() is '' for a bare file name, and os.makedirs('') raises.
    out_dir = os.path.dirname(out_sam_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    pysam.merge('-f', out_sam_path, *sam_files, catch_stdout=False)
bam_merge_files.append(bam_root + ".sorted.bam") # Run the bs_seeker2-align.py steps on the split up fastq files for ffa in fastq_for_alignment: pwgbs.Aligner(ffa[0], ffa[1], ffa[2], ffa[3][ffa[2]], ffa[4], ffa[5]) # Sort and merge the aligned bam files # Pre-sort the original input bam files for bfs in bam_sort_files: pysam.sort("-o", bfs[1], bfs[0]) f_bam = in_file1.split("/") f_bam[-1] = f_bam[-1].replace(".fastq", ".sorted.bam") out_bam_file = "/".join(f_bam) pysam.merge(out_bam_file, *bam_merge_files) pysam.sort("-o", out_bam_file + '.sorted.bam', "-T", out_bam_file + ".bam_sort", out_bam_file) pysam.index(out_bam_file) # Run the bs_seeker2-call_methylation.py steps pwgbs.MethylationCaller( aligner_dir, out_bam_file, data_dir + project_id + '/' + srr_id + '/' + srr_id, genome_fa["unzipped"] + "_bowtie2") # Tidy up pwgbs.clean_up(ata_dir + project_id)
def sam_parser(bwastdout, out_dir):
    """Consume aligner SAM output in batches, gather error stats, merge BAMs.

    Lines are read from ``bwastdout.stdout``, header lines (starting with
    ``@``) are collected separately, and the remaining lines are spread over
    ``th`` per-worker lists that are handed to ``error_wrap``. The per-batch
    results are accumulated, passed to ``plot_stats``, and the BAM files named
    by the workers are merged into ``<out_dir>/<prefix>.bam``.

    Relies on module-level names not defined here: ``th``, ``prefix``,
    ``work_dir``, ``log``, ``arange``, ``diff``, ``error_wrap``,
    ``plot_stats``.

    Parameters
    ----------
    bwastdout :
        Object exposing a readable ``stdout`` attribute; presumably a
        ``subprocess.Popen`` handle of the aligner - confirm with the caller.
    out_dir : str
        Directory receiving ``to_merge.txt`` (temporary) and the final BAM.

    NOTE(review): this code depends on Python 2 semantics - ``dict.iteritems``,
    ``map()`` returning a list (``.count``, ``==`` comparison) and ``zip()``
    returning an indexable list.
    NOTE(review): the nesting of the read loop below was reconstructed from a
    source without reliable indentation; verify it against the original file.
    """
    log.info('Error estimation...')
    # Size bins used as keys of the [count, count] accumulator below.
    bins = [
        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100,
        200, 300, 400, 500, 600, 700, 800, 900, 1000, 2000, 3000, 4000, 5000,
        6000, 7000, 8000, 9000, 10000, 15000, 20000, 25000, 30000, 35000,
        40000, 45000, 50000, 100000, 200000, 300000, 400000, 500000, 750000,
        1000000000
    ]
    mapped_frac_size = {k: [0, 0] for k in bins}
    # One bucket per 1,000,000-wide position window up to 250,000,000.
    pos_dict = {k: {} for k in arange(0, 250000000, 1000000)}
    OutDict = {k: {} for k in ['M', 'I', 'D']}
    header_list = []
    proc_lists = {k: [] for k in range(th)}
    chr_cov_dict = {}
    file_list = []
    fill_val = 0
    Flag = True
    while Flag:
        line = bwastdout.stdout.readline()
        # Batch size: keep filling until the last worker list exceeds this.
        Threshold = 50000
        while len(proc_lists[th - 1]) <= Threshold:
            proc_check_first = map(lambda x: len(proc_lists[x]), range(th - 1))
            line_counter = 0
            # An empty/blank line is treated as end of input.
            if line.strip() == '':
                Flag = False
                break
            if line[0] == '@':
                header_list.append(line)
                line = bwastdout.stdout.readline()
            else:
                # Deal consecutive lines out, one per worker list.
                while line_counter < len(proc_lists):
                    proc_lists[line_counter].append(line)
                    line_counter += 1
                    line = bwastdout.stdout.readline()
                line_counter = 0
                line = bwastdout.stdout.readline()
            proc_check_second = map(lambda x: len(proc_lists[x]),
                                    range(th - 1))
            # Stall detection: if the list sizes did not change, wait and
            # re-check once before giving up on this batch.
            if all(v == 0 for v in proc_check_second) == False:
                if proc_check_second == proc_check_first:
                    time.sleep(5)
                    proc_check_second = map(lambda x: len(proc_lists[x]),
                                            range(th - 1))
                    if proc_check_second == proc_check_first:
                        break
        # (start, end) counter range handed to error_wrap; advances by the
        # number of non-empty lists among the first th - 1.
        fill_list = (fill_val, fill_val + (th - 1) -
                     map(lambda x: len(proc_lists[x]), range(th - 1)).count(0))
        fill_val = fill_val + (th - 1) - map(lambda x: len(proc_lists[x]),
                                             range(th - 1)).count(0)
        res_obj = error_wrap(proc_lists, header_list, fill_list)
        for ro in res_obj:
            p_d, OD, m_f_s, fname = ro
            file_list.append(fname)
            # NOTE(review): ``x is not {}`` is always True (identity against a
            # fresh dict); harmless here because empty dicts iterate zero
            # times.
            for k, v in p_d.iteritems():
                if v is not {}:
                    for ch, l in v.iteritems():
                        pos_dict[k][ch] = pos_dict[k].get(ch, []) + l
            for var_k, cnt in OD.iteritems():
                if cnt is not {}:
                    for k, v in cnt.iteritems():
                        outdictget = OutDict.get(var_k)
                        if outdictget.get(int(k)) == None:
                            outdictget[int(k)] = v
                        else:
                            # Keep element 0, sum elements 1 and 2.
                            outdictget[int(k)] = [
                                outdictget.get(int(k))[0],
                                int(outdictget.get(int(k))[1]) + int(v[1]),
                                int(outdictget.get(int(k))[2]) + int(v[2])
                            ]
            for mfs in m_f_s:
                # Element-wise add of the two counters for this bin.
                mapped_frac_size[mfs[0]] = map(
                    sum, zip(mapped_frac_size[mfs[0]], mfs[1:]))
        # Start the next batch with empty worker lists.
        proc_lists = {k: [] for k in range(th)}
    # Rebuild each event table keyed by sorted key, with a 3-tuple value:
    #   [0] first key followed by the differences between consecutive keys,
    #   [1] column 1, where entries before the position of the column maximum
    #       are replaced by that maximum,
    #   [2] column 2 unchanged.
    for ev in ['I', 'M', 'D']:
        OutDict[ev] = dict(
            zip(
                sorted(OutDict[ev].keys()),
                zip([sorted(OutDict[ev].keys())[0]] +
                    list(diff(sorted(OutDict[ev].keys()))), [
                        max(
                            zip(*map(lambda x: OutDict[ev].get(x),
                                     sorted(OutDict[ev].keys())))[1])
                        if x[0] < zip(*map(lambda x: OutDict[ev].get(
                            x), sorted(OutDict[ev].keys())))[1].index(
                                max(
                                    zip(*map(lambda x: OutDict[ev].get(x),
                                             sorted(OutDict[ev].keys())))[1]))
                        else x[1] for x in enumerate(
                            zip(*map(lambda x: OutDict[ev].get(x),
                                     sorted(OutDict[ev].keys())))[1])
                    ],
                    zip(*map(lambda x: OutDict[ev].get(x),
                             sorted(OutDict[ev].keys())))[2])))
    # Per inner key: list of (window start, summed values / 1e6).
    for k, v in pos_dict.iteritems():
        for k1, v1 in v.iteritems():
            chr_cov_dict[str(k1)] = chr_cov_dict.get(
                str(k1), []) + [(k, sum(v1) / 1000000.)]
    # Rows [bin, a, b, 1 - b / (a + b)] for bins with a non-zero counter.
    mapped_frac_size = [[
        k[0], k[1][0], k[1][1],
        round(1 - float(k[1][1]) / (float(k[1][0]) + float(k[1][1])), 3)
    ] for k in sorted(mapped_frac_size.iteritems())
                        if k[1][0] != 0 or k[1][1] != 0]
    sorted_unmapfraqseq = zip(*mapped_frac_size)[3]
    # Collapse each 3-tuple to v[2] / (v[0] * v[1]).
    for ev in OutDict.keys():
        for k, v in sorted(OutDict[ev].iteritems()):
            OutDict[ev][k] = round(float(v[2]) / (int(v[0]) * float(v[1])), 4)
    plot_stats(OutDict, sorted_unmapfraqseq, mapped_frac_size, chr_cov_dict,
               out_dir)
    finalfile = os.path.join(out_dir, (prefix + '.bam'))
    bamsfile = os.path.join(out_dir, 'to_merge.txt')
    # samtools merge reads its input list from this file ("-b").
    file = open(bamsfile, 'w')
    for line in file_list:
        file.write(os.path.join(work_dir, line) + '\n')
    file.close()
    pysam.merge("-cp",
                "-@%s" % str(th),
                "-b%s" % bamsfile,
                finalfile,
                catch_stdout=False)
    # NOTE(review): the list file uses work_dir-joined paths but removal uses
    # the bare names - this only matches if the names are absolute or the
    # current directory is work_dir; confirm.
    for b in file_list:
        os.remove(b)
    os.remove(bamsfile)
def mergeBamFiles(outputFile, inputList, cpus):
    """Merge BAM files into ``outputFile`` with ``samtools merge``.

    Parameters
    ----------
    outputFile : str
        Destination BAM; overwritten if it exists (``-f``).
    inputList : iterable of str
        Paths of the BAM files to merge.
    cpus : int or str
        Value for the ``-@`` thread option.

    Raises
    ------
    pysam.SamtoolsError
        Propagated from ``pysam.merge`` when samtools reports a failure.
    """
    # pysam forwards arguments as command-line strings, so convert the thread
    # count explicitly. Options are placed before the positional arguments to
    # follow the documented ``merge [options] out.bam in.bam ...`` form
    # instead of relying on getopt reordering a trailing "-@".
    pysam.merge("-f", "-@", str(cpus), outputFile, *inputList)
def retrieve_unmapped_reads(args, params, filenames):
    """Extract unmapped/discordant read pairs to paired FASTQ files.

    Depending on ``args.use_mate_mapped`` and ``args.all_discordant`` the
    reads are selected from ``args.b`` (BAM) or ``args.c`` (with
    ``--reference args.fa``) by SAM flag, converted to FASTQ, optionally
    concatenated, and finally filtered so that only pairs whose two mates
    are both at least ``params.min_seq_len`` long are written to
    ``filenames.unmapped_merged_1`` / ``filenames.unmapped_merged_2``.

    Any exception is logged with its traceback and the process exits with
    status 1.

    NOTE(review): the block nesting was reconstructed from a source without
    reliable indentation; verify it against the original file.
    """
    log.logger.debug('started.')
    try:
        # With three or more threads requested, one is held back.
        if args.p <= 2:
            thread_n = args.p
        elif args.p >= 3:
            thread_n = args.p - 1
        # SAM flag masks used below:
        #   -f 12   read unmapped (4) and mate unmapped (8)
        #   -f 1    read is paired
        #   -F 3842 not proper pair (2), secondary (256), QC fail (512),
        #           duplicate (1024) or supplementary (2048)
        #   -F 3328 not secondary, duplicate or supplementary
        # samtools fastq: -N appends /1 and /2 to read names; -0 and -s send
        # "other" and singleton reads to /dev/null.
        # retrieve discordant reads, default
        if args.use_mate_mapped is False and args.all_discordant is False:
            # NOTE(review): if both args.b and args.c are None, no
            # discordant BAM is produced and the following call fails.
            if not args.b is None:
                pysam.view('-@',
                           '%d' % thread_n,
                           '-f',
                           '12',
                           '-F',
                           '3842',
                           '-b',
                           '-o',
                           filenames.discordant_bam,
                           args.b,
                           catch_stdout=False)
            elif not args.c is None:
                pysam.view('-@',
                           '%d' % thread_n,
                           '-f',
                           '12',
                           '-F',
                           '3842',
                           '-b',
                           '-o',
                           filenames.discordant_bam,
                           '--reference',
                           args.fa,
                           args.c,
                           catch_stdout=False)
            pysam.fastq('-@', '%d' % thread_n, '-N', '-0', '/dev/null', '-1',
                        filenames.unmapped_merged_pre1, '-2',
                        filenames.unmapped_merged_pre2, '-s', '/dev/null',
                        filenames.discordant_bam)
            if args.keep is False:
                os.remove(filenames.discordant_bam)
        # retrieve discordant reads, non-default
        else:
            if not args.b is None:
                pysam.view('-@',
                           '%d' % thread_n,
                           '-f',
                           '1',
                           '-F',
                           '3842',
                           '-b',
                           '-o',
                           filenames.discordant_bam,
                           args.b,
                           catch_stdout=False)
            elif not args.c is None:
                pysam.view('-@',
                           '%d' % thread_n,
                           '-f',
                           '1',
                           '-F',
                           '3842',
                           '-b',
                           '-o',
                           filenames.discordant_bam,
                           '--reference',
                           args.fa,
                           args.c,
                           catch_stdout=False)
            # Name-sort so mates are adjacent for samtools fastq.
            pysam.sort('-@', '%d' % thread_n, '-n', '-O', 'BAM', '-o',
                       filenames.discordant_sort_bam, filenames.discordant_bam)
            if args.keep is False:
                os.remove(filenames.discordant_bam)
            if args.all_discordant is True:
                pysam.fastq('-@', '%d' % thread_n, '-N', '-0', '/dev/null',
                            '-1', filenames.unmapped_merged_pre1, '-2',
                            filenames.unmapped_merged_pre2, '-s', '/dev/null',
                            filenames.discordant_sort_bam)
            else:
                # Pairs where both mates are unmapped.
                pysam.fastq('-@', '%d' % thread_n, '-f', '12', '-F', '3328',
                            '-N', '-0', '/dev/null', '-1',
                            filenames.unmapped_1, '-2', filenames.unmapped_2,
                            '-s', '/dev/null', filenames.discordant_sort_bam)
                if args.use_mate_mapped is True:
                    # -f 8 -F 3332: mate unmapped, read itself mapped.
                    pysam.view('-@',
                               '%d' % thread_n,
                               '-f',
                               '8',
                               '-F',
                               '3332',
                               '-b',
                               '-o',
                               filenames.unmapped_bam_3,
                               filenames.discordant_sort_bam,
                               catch_stdout=False)
                    # -f 4 -F 3336: read unmapped, mate mapped.
                    pysam.view('-@',
                               '%d' % thread_n,
                               '-f',
                               '4',
                               '-F',
                               '3336',
                               '-b',
                               '-o',
                               filenames.unmapped_bam_4,
                               filenames.discordant_sort_bam,
                               catch_stdout=False)
                    pysam.merge('-@', '%d' % thread_n, '-f',
                                filenames.unmapped_bam_34,
                                filenames.unmapped_bam_3,
                                filenames.unmapped_bam_4)
                    pysam.sort('-@', '%d' % thread_n, '-n', '-O', 'BAM', '-o',
                               filenames.unmapped_sorted_34,
                               filenames.unmapped_bam_34)
                    pysam.fastq('-@', '%d' % thread_n, '-N', '-0',
                                '/dev/null', '-1', filenames.unmapped_3, '-2',
                                filenames.unmapped_4, '-s', '/dev/null',
                                filenames.unmapped_sorted_34)
                # concatenate fastq
                # Missing parts are skipped; each consumed part is then handed
                # to utils.gzip_or_del().
                with open(filenames.unmapped_merged_pre1, 'w') as outfile:
                    for f in [filenames.unmapped_1, filenames.unmapped_3]:
                        if os.path.exists(f) is True:
                            with open(f) as infile:
                                for line in infile:
                                    outfile.write(line)
                            utils.gzip_or_del(args, params, f)
                with open(filenames.unmapped_merged_pre2, 'w') as outfile:
                    for f in [filenames.unmapped_2, filenames.unmapped_4]:
                        if os.path.exists(f) is True:
                            with open(f) as infile:
                                for line in infile:
                                    outfile.write(line)
                            utils.gzip_or_del(args, params, f)
        # remove short reads
        infile1 = open(filenames.unmapped_merged_pre1)
        infile2 = open(filenames.unmapped_merged_pre2)
        outfile1 = open(filenames.unmapped_merged_1, 'w')
        outfile2 = open(filenames.unmapped_merged_2, 'w')
        min_seq_len = params.min_seq_len
        tmp1, tmp2 = [], []
        # Both files are read in lockstep, four lines (one FASTQ record) at a
        # time; zip() stops at the end of the shorter file.
        for line1, line2 in zip(infile1, infile2):
            tmp1.append(line1)
            tmp2.append(line2)
            if len(tmp1) == 4:
                # Line index 1 of a record is the sequence.
                seqlen1 = len(tmp1[1].strip())
                seqlen2 = len(tmp2[1].strip())
                if seqlen1 >= min_seq_len and seqlen2 >= min_seq_len:
                    outfile1.write(''.join(tmp1))
                    outfile2.write(''.join(tmp2))
                tmp1, tmp2 = [], []
        infile1.close()
        infile2.close()
        outfile1.close()
        outfile2.close()
        utils.gzip_or_del(args, params, filenames.unmapped_merged_pre1)
        utils.gzip_or_del(args, params, filenames.unmapped_merged_pre2)
        # Drop intermediate BAM files unless the user asked to keep them.
        if args.keep is False:
            if os.path.exists(filenames.discordant_sort_bam) is True:
                os.remove(filenames.discordant_sort_bam)
            if args.use_mate_mapped is True:
                os.remove(filenames.unmapped_bam_3)
                os.remove(filenames.unmapped_bam_4)
                os.remove(filenames.unmapped_bam_34)
                os.remove(filenames.unmapped_sorted_34)
    # NOTE(review): the bare except also intercepts KeyboardInterrupt and
    # SystemExit, and the four file handles above are not closed on this path.
    except:
        log.logger.error('\n' + traceback.format_exc())
        exit(1)