Exemplo n.º 1
0
	def merge_bam(self,outbam,inbam1,inbam2):
		'''Merge two BAM files into ``outbam`` through ``pysam.merge``.

		NOTE(review): the original author flagged this method as
		"does not work" -- confirm against the installed pysam version
		before relying on it.
		'''
		inputs = (inbam1, inbam2)
		pysam.merge(outbam, *inputs)
Exemplo n.º 2
0
    def merge_bam(self, outbam, inbam1, inbam2):
        '''Merge two BAM files into ``outbam`` through ``pysam.merge``.

        NOTE(review): the original author flagged this method as
        "does not work" -- confirm against the installed pysam version
        before relying on it.
        '''
        inputs = (inbam1, inbam2)
        pysam.merge(outbam, *inputs)
Exemplo n.º 3
0
 def haplotype_assignment_nmask(self):
     """Assign reads per chromosome, then merge, sort and clean up the N-masked BAMs.

     For every chromosome key returned by ``read_in_vcf_nmasked`` the
     per-chromosome assignment is run and ``nmask.<chrom>.bam`` (inside
     ``self.outDir``) is recorded as an input.  The inputs are merged into
     ``nmask.bam`` (or simply renamed when there is only one), handed to
     ``bam2sort`` and the intermediate files are deleted afterwards.
     """
     sys.stdout.write("Reading in VCF and editing files\n")
     VCF, VCFids = self.read_in_vcf_nmasked()
     RNAedit = self.read_in_rna_editing()
     merged_bam = os.path.join(self.outDir, "nmask.bam")
     input_files = list()
     for c in VCF:
         sys.stdout.write("Assigning haplotype reads in chromosome " + str(c) + "\n")
         self.haplotype_assignment_nmask_bychrom(c, VCF[c], VCFids[c], RNAedit[c])
         input_files.append(os.path.join(self.outDir, "nmask." + str(c) + ".bam"))
     sys.stdout.write("Done assigning haplotype reads\nPrinting report file\n")
     sys.stdout.write("Merging chromosome bam files\n")
     if len(input_files) > 1:
         # The previous code unpacked an undefined name here and raised
         # NameError as soon as more than one chromosome was present.
         pysam.merge("-f", merged_bam, *input_files)
     else:
         os.rename(input_files[0], merged_bam)
     sys.stdout.write("Sorting and indexing haplotype specific bam files\n")
     bam2sort(os.path.join(self.outDir, "nmask"))
     sys.stdout.write("Cleaning up files\n")
     os.remove(merged_bam)
     for fn in input_files:
         # In the single-chromosome case the input was renamed above and
         # no longer exists under its original name.
         if os.path.exists(fn):
             os.remove(fn)
     os.remove(os.path.join(self.outDir, "nmask.fa"))
     sys.stdout.write("Done!\n")
     return
Exemplo n.º 4
0
    def bam_merge(*args):
        """
        Wrapper for the pysam SAMtools merge function

        The first BAM path is both an input and the destination: all files
        are merged into a temporary ``<first>_merge.bam`` which is then
        copied over the first file.

        Parameters
        ----------
        *args : str or list of str
            Either the BAM paths as separate arguments, or a single list
            of BAM paths as the first argument.

        Returns
        -------
        bool
            True on success, False when copying the merged file back over
            the first BAM raised an IOError (the temporary file is left in
            place in that case).
        """
        import shutil

        logger.info("Merging:")

        bam_files = args[0] if isinstance(args[0], list) else args
        final_bam = bam_files[0]
        tmp_bam = final_bam + "_merge.bam"
        pysam.merge("-f", tmp_bam, *bam_files)  # pylint: disable=no-member

        try:
            with open(tmp_bam, "rb") as f_in:
                with open(final_bam, "wb") as f_out:
                    # Stream in chunks; BAM files are routinely too large
                    # to be held in memory in one piece.
                    shutil.copyfileobj(f_in, f_out)
        except IOError:
            return False

        os.remove(tmp_bam)

        return True
Exemplo n.º 5
0
  def haplotype_assignment_nmask(self):
    """Assign reads per chromosome and merge the N-masked BAM files.

    For every chromosome key returned by ``read_in_vcf_nmasked`` the
    per-chromosome assignment is run and ``nmask.<chrom>.bam`` (inside
    ``self.outDir``) is recorded as an input.  The inputs are merged into
    ``nmask.bam`` (or simply renamed when there is only one).

    NOTE: sorting/indexing and cleanup of intermediate files are
    deliberately disabled in this variant; only the progress messages
    remain.
    """
    sys.stdout.write("Reading in VCF and editing files\n")
    VCF,VCFids = self.read_in_vcf_nmasked()
    RNAedit = self.read_in_rna_editing()
    merged_bam = os.path.join(self.outDir,"nmask.bam")
    input_files = list()
    for c in VCF:
      sys.stdout.write("Assigning haplotype reads in chromosome "+str(c) + '\n')
      self.haplotype_assignment_nmask_bychrom(c,VCF[c],VCFids[c],RNAedit[c])
      input_files.append(os.path.join(self.outDir,"nmask."+str(c)+".bam"))
    sys.stdout.write("Done assigning haplotype reads\nPrinting report file\n")
    sys.stdout.write("Merging chromosome bam files\n")
    if len(input_files)>1:
      # The previous code unpacked an undefined name here and raised
      # NameError as soon as more than one chromosome was present.
      pysam.merge('-f', merged_bam, *input_files)
    else:
      os.rename(input_files[0], merged_bam)
    sys.stdout.write("Sorting and indexing haplotype specific bam files - cancelled 2\n")
    sys.stdout.write("Cleaning up files - cancelled 2\n")
    sys.stdout.write("Done!\n")
    return
Exemplo n.º 6
0
Arquivo: prepare.py Projeto: soh-i/Ivy
 def __merge_bams(self, bams=None):
     """Merge BAM files with ``pysam.merge`` after checking that each one exists.

     Args:
         bams: sequence of paths handed to ``pysam.merge`` as individual
             arguments.  NOTE(review): samtools treats the first
             positional argument as the output file, yet every entry is
             required to exist here -- confirm the intended ordering with
             the caller.

     Returns:
         True when the merge completed.

     Raises:
         RuntimeError: if any path is not an existing file, or if the
             merge itself fails.
     """
     bams = [] if bams is None else list(bams)
     for bam in bams:
         if not os.path.isfile(bam):
             raise RuntimeError("BAM file not found: %s" % bam)
     try:
         # pysam.merge takes the command-line arguments one by one, not a
         # single list object.
         pysam.merge(*bams)
     except Exception as err:
         raise RuntimeError("pysam.merge failed: %s" % err)
     return True
Exemplo n.º 7
0
def merge_bam(input_bams, output_bam, tag=None, threads=1):
    """Merge every input BAM into ``output_bam`` via ``pysam.merge``.

    When ``tag`` is given it is forwarded as ``-t <tag>``; when more than
    one thread is requested the ``-c -p -s 0 -@ <threads>`` options are
    added.  All arguments are converted to ``str`` before the call.
    """
    options = ["-t", tag] if tag is not None else []
    if threads > 1:
        options += ["-c", "-p", "-s", "0", "-@", threads]
    merge_args = options + [output_bam] + list(input_bams)
    pysam.merge(*[str(item) for item in merge_args])
Exemplo n.º 8
0
def merge_bams(bams: List[str]) -> str:
    """Merge BAM files with samtools (through pysam) and return the result path.

    Kept at module level: as a local function inside `split` it could not
    be pickled for multiprocessing.

    :param bams: The output basename (without ".bam") followed by the
        BAM files to merge; packed into one list because of how the
        function is invoked through multiprocessing.
    :return: The real path of the merged BAM file.
    """
    output_stem = bams[0]
    merged_path = os.path.realpath(output_stem + ".bam")
    pysam.merge("-c", "-p", merged_path, *bams[1:])
    return merged_path
Exemplo n.º 9
0
 def haplotype_assignment(self):
     """Assign reads to haplotypes per chromosome, then report, merge and clean up.

     For every chromosome key returned by ``read_in_vcf`` the
     per-chromosome assignment is run and its three counts are stored.
     Unless ``self.nomerge`` is set, the counts are written to
     ``report.assignment.txt``, the per-chromosome ``hap1``/``hap2`` BAM
     files are merged (or renamed when there is only one chromosome),
     passed to ``bam2sort`` and the intermediate files are deleted.
     """
     sys.stdout.write("Reading in VCF and editing files\n")
     VCF, VCFids = self.read_in_vcf()
     RNAedit = self.read_in_rna_editing()
     counts = defaultdict(lambda: defaultdict(int))
     input_files1, input_files2 = list(), list()
     for c in VCF:
         sys.stdout.write("Assigning haplotype reads in chromosome " + str(c) + "\n")
         counts[c]["h1"], counts[c]["h2"], counts[c]["c"] = self.haplotype_assignment_bychrom(
             c, VCF[c], VCFids[c], RNAedit[c]
         )
         input_files1.append(os.path.join(self.outDir, "hap1." + str(c) + ".bam"))
         input_files2.append(os.path.join(self.outDir, "hap2." + str(c) + ".bam"))
     sys.stdout.write("Done assigning haplotype reads\nPrinting report file\n")
     if not self.nomerge:
         # The context manager closes the report even if a write fails.
         with open(os.path.join(self.outDir, "report.assignment.txt"), "w") as report_out:
             for c in counts:
                 fields = (c, counts[c]["h1"], counts[c]["h2"], counts[c]["c"])
                 report_out.write("\t".join(str(field) for field in fields) + "\n")
         sys.stdout.write("Merging chromosome bam files\n")
         hap1_bam = os.path.join(self.outDir, "hap1.bam")
         hap2_bam = os.path.join(self.outDir, "hap2.bam")
         if len(input_files1) > 1:
             pysam.merge("-f", hap1_bam, *input_files1)
             pysam.merge("-f", hap2_bam, *input_files2)
         else:
             os.rename(input_files1[0], hap1_bam)
             os.rename(input_files2[0], hap2_bam)
         sys.stdout.write("Sorting and indexing haplotype specific bam files\n")
         bam2sort(os.path.join(self.outDir, "hap1"))
         bam2sort(os.path.join(self.outDir, "hap2"))
         sys.stdout.write("Cleaning up files\n")
         os.remove(hap1_bam)
         os.remove(hap2_bam)
         for fn in input_files1 + input_files2:
             # In the single-chromosome case the inputs were renamed above
             # and no longer exist under their original names.
             if os.path.exists(fn):
                 os.remove(fn)
         os.remove(os.path.join(self.outDir, "hap1.fa"))
         os.remove(os.path.join(self.outDir, "hap2.fa"))
     sys.stdout.write("Done!\n")
     return
Exemplo n.º 10
0
def bam_merge(list_of_bams, merged_bam, ncore=1):
    """
    Merge any number of (sorted) bam files into ``merged_bam``.

    Every input is indexed first (without forcing).  With a single input
    the file and its ``.bai`` are copied instead of merged.
    """
    for bam_path in list_of_bams:
        bam_index(bam_path, force=False, ncore=ncore)

    if not len(list_of_bams) > 1:
        # os.symlink() doesn't work with multi_bam_coverage()
        single = list_of_bams[0]
        shutil.copy2(single, merged_bam)
        shutil.copy2(f"{single}.bai", f"{merged_bam}.bai")
        return

    threads_flag = f"-@ {samc(ncore)}"
    merge_args = ["-f", threads_flag, merged_bam] + list_of_bams
    pysam.merge(*merge_args)  # noqa: pysam bug
    bam_index(merged_bam)
Exemplo n.º 11
0
def combine_samfiles(multi=False, clipped=False):
    """Combine the unclipped (and optionally clipped) SAM files into one name-sorted SAM.

    File names are fixed and relative to the working directory:
    ``unclipped_<kind>.sam`` / ``clipped_<kind>.sam`` are read and
    ``multi_mapped.sam`` or ``unique_mapped.sam`` is written, where
    ``<kind>`` is ``multimap`` when ``multi`` is true and ``unique``
    otherwise.  Intermediate BAM files and the input SAM files are
    removed with ``rm`` at the end.

    When ``clipped`` is True but the clipped SAM file is empty, only the
    unclipped reads are sorted and written (previously no ``tmp2.bam``
    was produced in that case and the final conversion failed).
    """
    kind = "multimap" if multi else "unique"
    sam1 = "unclipped_%s.sam" % kind
    sam2 = "clipped_%s.sam" % kind
    bam1 = "unclipped_%s.bam" % kind
    bam2 = "clipped_%s.bam" % kind
    out_name = "multi_mapped.sam" if multi else "unique_mapped.sam"

    # Convert the unclipped SAM to BAM
    with open(bam1, "w") as bam1_o:
        for r in pysam.view("-bS", sam1):
            bam1_o.write(r)

    if clipped == True and os.stat(sam2).st_size > 0:  # clipped file is not empty
        # Convert the clipped SAM to BAM
        try:
            with open(bam2, "w") as bam2_o:
                for r in pysam.view("-bS", sam2):
                    bam2_o.write(r)
        except Exception:
            print("Samtools raised error, will assume Sam file is empty!")
        # Merge clipped and unclipped, then sort by read name
        pysam.merge("tmp1.bam", "-f", bam1, bam2)
        pysam.sort("-n", "tmp1.bam", "tmp2")
        subprocess.call(["rm", sam2, bam2])
    else:
        # Nothing to merge: just sort the unclipped reads by name
        pysam.sort("-n", bam1, "tmp2")

    # Convert the sorted BAM back to SAM; the context manager guarantees
    # the output is flushed and closed.
    with open(out_name, "w") as out:
        for r in pysam.view("-h", "tmp2.bam"):
            out.write(r)
    subprocess.call(["rm", "tmp2.bam", "tmp1.bam", sam1, bam1])
Exemplo n.º 12
0
def combine_samfiles(multi=False, clipped=False):
	"""Combine the unclipped (and optionally clipped) SAM files into one name-sorted SAM.

	File names are fixed and relative to the working directory:
	``unclipped_<kind>.sam`` / ``clipped_<kind>.sam`` are read and
	``multi_mapped.sam`` or ``unique_mapped.sam`` is written, where
	``<kind>`` is ``multimap`` when ``multi`` is true and ``unique``
	otherwise.  Intermediate BAM files and the input SAM files are
	removed with ``rm`` at the end.

	When ``clipped`` is True but the clipped SAM file is empty, only the
	unclipped reads are sorted and written (previously no ``tmp2.bam``
	was produced in that case and the final conversion failed).
	"""
	kind = "multimap" if multi else "unique"
	sam1 = "unclipped_%s.sam" % kind
	sam2 = "clipped_%s.sam" % kind
	bam1 = "unclipped_%s.bam" % kind
	bam2 = "clipped_%s.bam" % kind
	out_name = "multi_mapped.sam" if multi else "unique_mapped.sam"

	# Convert the unclipped SAM to BAM
	with open(bam1, "w") as bam1_o:
		for r in pysam.view("-bS", sam1):
			bam1_o.write(r)

	if clipped == True and os.stat(sam2).st_size > 0: # clipped file is not empty
		# Convert the clipped SAM to BAM
		try:
			with open(bam2, "w") as bam2_o:
				for r in pysam.view("-bS", sam2):
					bam2_o.write(r)
		except Exception:
			print("Samtools raised error, will assume Sam file is empty!")
		# Merge clipped and unclipped, then sort by read name
		pysam.merge("tmp1.bam", "-f", bam1, bam2)
		pysam.sort("-n", "tmp1.bam", "tmp2")
		subprocess.call(["rm", sam2, bam2])
	else:
		# Nothing to merge: just sort the unclipped reads by name
		pysam.sort("-n", bam1, "tmp2")

	# Convert the sorted BAM back to SAM; the context manager guarantees
	# the output is flushed and closed.
	with open(out_name, "w") as out:
		for r in pysam.view("-h", "tmp2.bam"):
			out.write(r)
	subprocess.call(["rm", "tmp2.bam", "tmp1.bam", sam1, bam1])
Exemplo n.º 13
0
def merge(out_file_name, input_file_names, threads=1):
    """Merge ``input_file_names`` into ``out_file_name`` through ``pysam.merge``.

    The in-process API is used on purpose: a ``samtools merge`` command
    line can fail once its total length grows too long.  With more than
    one thread the options ``-c -p -s 0 -@ <threads>`` are prepended.
    """
    merge_args = ["-c", "-p", "-s", "0", "-@", str(threads)] if threads > 1 else []
    merge_args += [str(out_file_name)]
    for name in input_file_names:
        merge_args.append(str(name))
    pysam.merge(*merge_args)
Exemplo n.º 14
0
def main():
    """Sort and merge the successfully aligned BAM files of one folder.

    The folder is taken from ``-i`` (default: current directory).  A BAM
    file is included only when its companion ``<name>.bs_seeker2_log``
    exists and its last line contains "END".  Each included file is
    sorted, the sorted files are merged into ``<prefix>_merged.<ext>``
    next to the inputs (the last "_"-separated token of the first file's
    name stem is replaced by "merged"), and the sorted intermediates are
    removed.  Exits with status 1 when no file qualifies.
    """
    parser = OptionParser()
    parser.add_option("-i", dest="inbam", type='string', help="the folder of input bam files")
    (options, args) = parser.parse_args()
    if not options.inbam:
        options.inbam = "."
    open_log(os.path.join(options.inbam, 'MyBamPostProcess.py_log'))
    logm('Program starts!')
    file_list_t = [os.path.join(options.inbam, x) for x in os.listdir(options.inbam) if x.endswith(".bam")]
    file_list = []
    # Check log file to see if alignment is successful.
    for f in file_list_t:
        log_name = os.path.splitext(f)[0] + ".bs_seeker2_log"
        # Reset per file: otherwise an empty log would be judged by the
        # last line of the previously inspected log.
        last_line = ""
        try:
            with open(log_name) as log_file:
                for last_line in log_file:
                    pass
        except IOError:
            logm("File %s has no alignment log file."%f)
            continue
        if "END" in last_line:
            file_list.append(f)
            logm("File %s is included."%f)
        else:
            logm("File %s is excluded."%f)
    if len(file_list) == 0:
        sys.stderr.write('ERROR: no bam files available for post process.\n')
        sys.exit(1)
    sorted_list = []
    # Sort
    for inputsam in file_list:
        sortedsam = inputsam + "_sorted"
        pysam.sort(inputsam, sortedsam)
        sorted_list.append(sortedsam+".bam")
    logm('Individual bam file sorting finished.')
    # Merge.  Derive the output name from the file name only, so that dots
    # or underscores in the folder path (including the default ".") cannot
    # corrupt it.
    in_dir, first_name = os.path.split(file_list[0])
    name_parts = first_name.split(".")
    stem_tokens = name_parts[0].split("_")
    stem_tokens[-1] = "merged"
    name_parts[0] = "_".join(stem_tokens)
    mergedsam = os.path.join(in_dir, ".".join(name_parts))
    merge_params = [mergedsam] + sorted_list
    pysam.merge(*merge_params)
    logm('Merging finished.')
    # Remove sortedsams
    for f in sorted_list:
        os.remove(f)
    close_log()
Exemplo n.º 15
0
def merge_sorted_fragments(bam_fname, file_fragments, do_not_index=False):
  """Merge the ``<fragment>.sorted`` files into ``bam_fname`` and delete them.

  Unless ``do_not_index`` is set, a ``.bai`` index is written next to the
  merged file.  Timings of both steps are logged at debug level.
  """
  logger.debug('Merging sorted BAM fragments ...')
  started = time.time()
  merge_inputs = [frag + '.sorted' for frag in file_fragments]
  pysam.merge('-rpcf', bam_fname, *merge_inputs)
  finished = time.time()
  logger.debug('... {:0.2f}s'.format(finished - started))

  logger.debug('Removing fragments')
  for frag in file_fragments:
    os.remove(frag + '.sorted')

  if do_not_index:
    return

  logger.debug('BAM index ...')
  started = time.time()
  pysam.index(bam_fname, bam_fname + '.bai')
  finished = time.time()
  logger.debug('... {:0.2f}s'.format(finished - started))
Exemplo n.º 16
0
def parallel_computation(source_file: Path, destination: Path, cores=4,
                         index_source=False):
    """Run ``size_select_bam`` per chromosome in parallel and merge the results.

    The source BAM is indexed into the current working directory, every
    entry of ``CHRS`` is processed into a temporary directory, the
    results are natural-sorted on the text before their first "." and
    merged into ``destination`` (re-using the source header), which is
    then indexed.

    NOTE(review): ``index_source`` is accepted but not used here.
    """
    with tempfile.TemporaryDirectory() as temp:
        work_dir = Path(temp)
        run_chrom = delayed(size_select_bam)
        workers = Parallel(n_jobs=cores)

        index_path = Path.cwd() / (source_file.name + ".bai")
        pysam.index("-@", str(cores), str(source_file), str(index_path))
        assert index_path.exists()

        def leading_token(item):
            return item.split(".")[0]

        jobs = (run_chrom(source_file, chrom, work_dir, cores, index_path)
                for chrom in CHRS)
        result = natsorted(workers(jobs), key=leading_token)
        merge_args = ["-h", str(source_file), "-@", str(cores),
                      str(destination)]
        pysam.merge(*merge_args, *result)
        pysam.index("-@", str(cores), str(destination))
Exemplo n.º 17
0
def merger(TDIR,batch):
	"""Merge a batch of BAM files into a new temporary file and return its path.

	TDIR  -- path to the temp dir for temporary merge files,
	         e.g. "/var/tmp/adfas7d"
	batch -- list of files to merge

	The caller is responsible for deleting the returned file.
	"""
	import os

	# mkstemp creates the file atomically; the deprecated mktemp only
	# returned a name that another process could claim before the merge.
	fd, bam = tempfile.mkstemp(dir=TDIR)
	os.close(fd)

	# merge bams ("-f" lets samtools overwrite the empty placeholder)
	arguments = ["-f", bam] + list(batch)
	pysam.merge(*arguments)

	# return merged bam
	return bam
Exemplo n.º 18
0
def merge_sorted_fragments(bam_fname, file_fragments, do_not_index=False):
    """Merge the ``<fragment>.sorted`` files into ``bam_fname`` and delete them.

    Unless ``do_not_index`` is set, a ``.bai`` index is written next to
    the merged file.  Timings of both steps are logged at debug level.
    """
    logger.debug('Merging sorted BAM fragments ...')
    started = time.time()
    merge_inputs = [frag + '.sorted' for frag in file_fragments]
    pysam.merge('-rpcf', bam_fname, *merge_inputs)
    finished = time.time()
    logger.debug('... {:0.2f}s'.format(finished - started))

    logger.debug('Removing fragments')
    for frag in file_fragments:
        os.remove(frag + '.sorted')

    if do_not_index:
        return

    logger.debug('BAM index ...')
    started = time.time()
    pysam.index(bam_fname, bam_fname + '.bai')
    finished = time.time()
    logger.debug('... {:0.2f}s'.format(finished - started))
Exemplo n.º 19
0
 def merge(self, output_bam, input_bam_1, input_bam_2):
     """Merge two BAM files into ``output_bam`` and index the result.

     Nothing is merged (a message is written to stderr and ``None`` is
     returned) when one of the inputs is missing or when the output file
     already exists.
     """
     # Check input files
     for input_bam in (input_bam_1, input_bam_2):
         if not os.path.exists(input_bam):
             sys.stderr.write(
                 "Input file %s does not exist. Merging not possible.\n" %
                 input_bam)
             return
     # Check output file; an existing file is never overwritten
     if os.path.exists(output_bam):
         sys.stderr.write(
             "Output file %s already exists. Merging not possible.\n" %
             output_bam)
         return
     # Merge and generate index
     pysam.merge(output_bam, input_bam_1, input_bam_2)
     pysam.index(output_bam)
Exemplo n.º 20
0
 def merge(self, output_bam, input_bam_1, input_bam_2):
     """Merge two BAM files into ``output_bam`` and index the result.

     Nothing is merged (a message is written to stderr and ``None`` is
     returned) when one of the inputs is missing or when the output file
     already exists.
     """
     # Check input files
     for input_bam in (input_bam_1, input_bam_2):
         if not os.path.exists(input_bam):
             sys.stderr.write(
                 "Input file %s does not exist. Merging not possible.\n" %
                 input_bam)
             return
     # Check output file; an existing file is never overwritten
     if os.path.exists(output_bam):
         sys.stderr.write(
             "Output file %s already exists. Merging not possible.\n" %
             output_bam)
         return
     # Merge and generate index
     pysam.merge(output_bam, input_bam_1, input_bam_2)
     pysam.index(output_bam)
Exemplo n.º 21
0
def merge_bam_files(output_bam, input_bam):
    """
    merge the bam files into a single result file

    Every listed input is merged (previously only the first two entries
    were used).  The process exits with an error message when an input
    file is missing or when the merge fails.  No index is created here.

    @args output_bam: merged result file 
    @type output_bam: str 
    @args input_bam: list of input bam files 
    @type input_bam: list 
    """

    for bam_file in input_bam:
        if not os.path.isfile(bam_file):
            exit("error: failed to fetch alignment file %s\n" % bam_file) 

    try:
        pysam.merge(output_bam, *input_bam)
    except Exception as e:
        # bind the exception: the message below reports its text
        exit("error: running pysam merge\n%s" % str(e))
Exemplo n.º 22
0
def merge_bam_files(output_bam, input_bam):
    """
    merge the bam files into a single result file

    Every listed input is merged (previously only the first two entries
    were used).  The process exits with an error message when an input
    file is missing or when the merge fails.  No index is created here.

    @args output_bam: merged result file 
    @type output_bam: str 
    @args input_bam: list of input bam files 
    @type input_bam: list 
    """

    for bam_file in input_bam:
        if not os.path.isfile(bam_file):
            exit("error: failed to fetch alignment file %s\n" % bam_file) 

    try:
        pysam.merge(output_bam, *input_bam)
    except Exception as e:
        # bind the exception: the message below reports its text
        exit("error: running pysam merge\n%s" % str(e))
Exemplo n.º 23
0
def preprocess_sam(sam_files,
                   datasets,
                   tmp_dir="/dev/shm/talon/",
                   n_threads=0):
    """ Copy and rename the provided SAM/BAM file(s), merge them, and index.
        This is necessary in order to use Pybedtools commands on the reads.
        The renaming is necessary in order to label the reads according to
        their dataset.

        Each input is sorted into <tmp_dir>/<dataset>.bam (SAM inputs are
        converted to BAM first) and all of them are merged into
        <tmp_dir>/merged.bam, whose path is returned.

        Raises RuntimeError when merging or indexing fails."""

    # Create the tmp dir without going through a shell, so unusual
    # characters in the path cannot be misinterpreted.
    try:
        os.makedirs(tmp_dir)
    except OSError:
        if not os.path.isdir(tmp_dir):
            raise

    # Copy and rename SAM files with dataset names to ensure correct RG tags
    renamed_sams = []
    for sam, dataset in zip(sam_files, datasets):
        suffix = "." + sam.split(".")[-1]
        if suffix == ".sam":
            bam_copy = os.path.join(tmp_dir, dataset + "_unsorted.bam")
            convert_to_bam(sam, bam_copy)
            sam = bam_copy
        # os.path.join keeps the files inside tmp_dir even when the
        # directory was given without a trailing slash.
        sorted_bam = os.path.join(tmp_dir, dataset + ".bam")
        pysam.sort("-@", str(n_threads), "-o", sorted_bam, sam)
        renamed_sams.append(sorted_bam)

    merged_bam = os.path.join(tmp_dir, "merged.bam")
    merge_args = [merged_bam] + renamed_sams + [
        "-f", "-r", "-@", str(n_threads)
    ]

    # Merge datasets and use -r option to include a read group tag
    try:
        pysam.merge(*merge_args)
        pysam.index(merged_bam)
        ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())
        print("[ %s ] Merged input SAM/BAM files" % (ts))
    except Exception:
        raise RuntimeError(("Problem merging and indexing SAM/BAM files. "
                            "Check your file paths and make sure that all "
                            "files have headers."))
    return merged_bam
Exemplo n.º 24
0
  def haplotype_assignment(self):
    """Assign reads to haplotypes per chromosome, then report and merge.

    For every chromosome key returned by ``read_in_vcf`` the
    per-chromosome assignment is run and its three counts are stored.
    Unless ``self.nomerge`` is set, the counts are written to
    ``report.assignment.txt`` and the per-chromosome ``hap1``/``hap2``
    BAM files are merged (or renamed when there is only one chromosome).

    NOTE: sorting/indexing and cleanup of intermediate files are
    deliberately disabled in this variant; only the progress messages
    remain.
    """
    sys.stdout.write("Reading in VCF and editing files\n")
    VCF,VCFids = self.read_in_vcf()
    RNAedit = self.read_in_rna_editing()
    counts = defaultdict(lambda:defaultdict(int))
    input_files1,input_files2 = list(), list()
    for c in VCF:
      sys.stdout.write("Assigning haplotype reads in chromosome "+str(c) + '\n')
      counts[c]['h1'], counts[c]['h2'],counts[c]['c'] = self.haplotype_assignment_bychrom(c,VCF[c],VCFids[c],RNAedit[c])
      input_files1.append(os.path.join(self.outDir,"hap1."+str(c)+".bam"))
      input_files2.append(os.path.join(self.outDir,"hap2."+str(c)+".bam"))
    sys.stdout.write("Done assigning haplotype reads\nPrinting report file\n")
    if not self.nomerge:
      # The context manager closes the report even if a write fails.
      with open(os.path.join(self.outDir,'report.assignment.txt'),'w') as report_out:
        for c in counts:
          report_out.write(str(c) + '\t' + str(counts[c]['h1']) + '\t' + str(counts[c]['h2']) + '\t' + str(counts[c]['c']) + '\n')
      sys.stdout.write("Merging chromosome bam files\n")
      hap1_bam = os.path.join(self.outDir,"hap1.bam")
      hap2_bam = os.path.join(self.outDir,"hap2.bam")
      if len(input_files1)>1:
        pysam.merge('-f', hap1_bam, *input_files1)
        pysam.merge('-f', hap2_bam, *input_files2)
      else:
        os.rename(input_files1[0], hap1_bam)
        os.rename(input_files2[0], hap2_bam)
      sys.stdout.write("Sorting and indexing haplotype specific bam files - cancelled\n")
      sys.stdout.write("Cleaning up files - cancelled\n")
    sys.stdout.write("Done!\n")
    return 
Exemplo n.º 25
0
def merge_bamFiles(basenameNoExtension):
    """Merge the four per-flag BAM files into strand-specific BAM files.

    ``<base>_99.bam`` + ``<base>_147.bam`` form one output and
    ``<base>_163.bam`` + ``<base>_83.bam`` the other; which of them is
    written as ``<base>_Forward.bam`` and which as ``<base>_Reverse.bam``
    depends on the module-level ``StrandnessType``.  Any other
    ``StrandnessType`` value results in no merge at all.

    Existing outputs are overwritten ("-f") for both library types;
    previously only one of the two types passed "-f".
    """
    if StrandnessType == "fr-firstrand":
        pair_99_147, pair_163_83 = "_Reverse.bam", "_Forward.bam"
    elif StrandnessType == "fr-secondstrand":
        pair_99_147, pair_163_83 = "_Forward.bam", "_Reverse.bam"
    else:
        return

    pysam.merge('-f', basenameNoExtension + pair_99_147,
                basenameNoExtension + "_99.bam",
                basenameNoExtension + "_147.bam")
    pysam.merge('-f', basenameNoExtension + pair_163_83,
                basenameNoExtension + "_163.bam",
                basenameNoExtension + "_83.bam")
def merge_bams(bams: list, output_path: str, threads: int = 4):
    """Merge bamfiles to output_path

    When a single bam file is supplied, the bam file is moved to  output_path
    All input bam files are removed

    Args:
        bams : list or tuple containing paths to bam files to merge
        output_path (str): target path
        threads (int): number of threads passed to samtools ("-@"), >= 1

    Returns:
        output_path (str)

    Raises:
        AssertionError: when ``threads`` < 1 or an input lacks its
            ``.bai`` index file.
        subprocess.CalledProcessError: when the ``samtools`` executable
            is used and exits with a non-zero status.

    """
    import subprocess

    assert threads >= 1
    if len(bams) == 1:
        assert os.path.exists(bams[0] +
                              '.bai'), 'Only indexed files can be merged'
        move(bams[0], output_path)
        move(bams[0] + '.bai', output_path + '.bai')
    else:
        # Check every input, not just the first one.
        assert all(os.path.exists(bam + '.bai')
                   for bam in bams), 'Only indexed files can be merged'
        # Each option is its own argument; packed into one string, only
        # the thread count was parsed and -f/-p/-c were swallowed.
        # -c keeps the same read group id only once.
        merge_args = ['-@', str(threads), '-f', '-p', '-c', output_path]
        merge_args.extend(bams)
        if which('samtools') is None:
            pysam.merge(*merge_args)
        else:
            # Prefer the samtools executable when available.  An argument
            # list (no shell) keeps paths with spaces intact, and a failed
            # merge stops here instead of going on to delete the inputs.
            subprocess.check_call(['samtools', 'merge'] + merge_args)

        pysam.index('-@', str(threads), output_path)
        for o in bams:
            os.remove(o)
            os.remove(o + '.bai')
    return output_path
Exemplo n.º 27
0
def bam_merge(bam_ins, bam_out):
    """
    Merge multiple BAM files into one sorted, indexed BAM file.

    bam_ins: list of BAM files to merge
    bam_out: path of the resulting BAM file

    Exits the process when any input is missing.  An already existing
    ``bam_out`` is left untouched and nothing is merged.
    """
    # check input files
    missing = [path for path in bam_ins if not os.path.exists(path)]
    if missing:
        sys.exit('BAM files not exists:' + '\n'.join(missing))

    # check output file: keep whatever is already there
    if os.path.exists(bam_out):
        return

    unsorted_bam = bam_out + '.unsorted.bam'
    pysam.merge('-f', unsorted_bam, *bam_ins) # overwrite output BAM
    pysam.sort('-o', bam_out, unsorted_bam)
    pysam.index(bam_out)
    os.remove(unsorted_bam)
Exemplo n.º 28
0
def merge_bams(out_file, out_dir, bams):
    """Sort each input BAM, merge the sorted files and return the sorted merge.

    The inputs are sorted with ``sort_bam`` and then removed with
    ``rm_bams``; the sorted files are merged into ``out_file``, which is
    sorted again.  The unsorted ``out_file`` and the sorted intermediates
    are deleted afterwards.

    Returns whatever ``sort_bam`` returns for the merged file.
    """
    s_bams = [sort_bam(out_dir, bam_file) for bam_file in bams]

    rm_bams(bams)

    in_files = ', '.join(s_bams)
    # Format inside the call: applying "%" to print()'s return value
    # raises TypeError on Python 3.
    print("Merging bam files %s into '%s'" % (in_files, out_file))
    pysam.merge('-f', out_file, *s_bams)

    sorted_bam = sort_bam(out_dir, out_file)
    try:
        os.remove(out_file)
    except OSError:
        print("2 Couldn't remove %s" % out_file)

    rm_bams(s_bams)

    return sorted_bam
Exemplo n.º 29
0
    def merge_bam(self, data_dir, project_id, final_id, run_ids=None):
        """
        Merge together all the bams in a directory and sort to create the final
        bam ready to be filtered

        If run_ids is blank then the function looks for all bam files in the
        project directory (``data_dir + project_id``), skipping the output
        file itself.  Every input is sorted in place first; the result
        ``<final_id>.bam`` is sorted and indexed.

        NOTE(review): the directory scan used to select names ending in
        "sai", contradicting this description; it now selects ".bam" --
        confirm against the aligner's actual outputs.
        """
        project_dir = data_dir + project_id
        out_name = final_id + '.bam'
        out_bam_file = project_dir + '/' + out_name

        if not run_ids:
            bam_files = [
                f for f in listdir(project_dir)
                if f.endswith(".bam") and f != out_name
            ]
        else:
            bam_files = [f + ".bam" for f in run_ids]

        bam_locs = [project_dir + '/' + bam for bam in bam_files]

        for bam_loc in bam_locs:
            print(bam_loc)
            pysam.sort("-o", str(bam_loc), str(bam_loc))

        if len(bam_locs) == 1:
            pysam.sort("-o", str(out_bam_file), str(bam_locs[0]))
        else:
            pysam.merge(out_bam_file, *bam_locs)
            pysam.sort("-o", str(out_bam_file), "-T",
                       str(out_bam_file) + ".bam_sort", str(out_bam_file))

        pysam.index(str(out_bam_file))
Exemplo n.º 30
0
def merge_bams(bams, output_path):
    """Merge bamfiles to output_path

    When a single bam file is supplied, the bam file is moved to  output_path
    All input bam files are removed

    Args:
        bams : list or tuple containing paths to bam files to merge
        output_path (str): target path

    Returns:
        output_path (str)

    """
    if len(bams) == 1:
        move(bams[0], output_path)
        move(bams[0] + '.bai', output_path + '.bai')
    else:
        # Each option is its own argument; packed into one string, only
        # the thread count was parsed and -f / -l 1 / -c were swallowed.
        pysam.merge('-@', '4', '-f', '-l', '1', '-c', output_path, *bams)
        pysam.index('-@', '4', output_path)
        for o in bams:
            os.remove(o)
            os.remove(o + '.bai')
    return output_path
Exemplo n.º 31
0
def main(args) :
  """
    Main entry point for this script.

    Merges the per-chromosome ``hap1.<chrom>.bam`` / ``hap2.<chrom>.bam``
    files found in the output directory ``args.o`` into ``hap1.bam`` and
    ``hap2.bam``.  The sub-command (second element of ``args.command``)
    selects the chromosomes: "auto" merges 1-22, "all" merges 1-22 plus
    X and Y.

    :param args: parsed arguments; ``args.command`` is the list of command
                 words (the first one is skipped) and ``args.o`` is the
                 output directory.

    Exits after printing the help text when no sub-command is given, and
    after printing an error when ``-o`` is missing.  An unknown
    sub-command prints an error plus the help text and returns.
  """

  helpStr = "------------------------------------------------------------\n" +\
            "                      MERGING BAM FILES                     \n" +\
            "------------------------------------------------------------\n" +\
            "After generating allele specific bam files for each         \n" +\
            "chromosome, rPGA can merge them.                            \n" +\
            "                                                            \n" +\
            "To merge bam files for all 22 autosomal chromosomes:        \n" +\
            "                                                            \n" +\
            "$ rPGA merge auto                                           \n" +\
            "                                                            \n" +\
            "To merge bam files for all 22 autosomes and X and Y:        \n" +\
            "                                                            \n" +\
            "$ rPGA merge all                                            \n"

  command = (args.command)[1:]
  if len(command) < 1:
    sys.stderr.write(helpStr + "\n\n")
    sys.exit()

  command = command[0].strip().lower()
  autosomes = [str(n) for n in range(1, 23)]
  if command == "all" :
    chroms = autosomes + ["X", "Y"]
  elif command == "auto":
    chroms = autosomes
  else :
    sys.stderr.write("rPGA merge -- unknown command: " + command + "\n")
    sys.stderr.write(helpStr + "\n\n")
    return

  if not args.o:
    sys.stderr.write("Must provide output directory -o\n\n")
    sys.exit()

  outDir = args.o
  # Same argument order as the former hand-written calls:
  # '-f', <merged output>, then the per-chromosome inputs.
  for hap in ("hap1", "hap2"):
    inputs = [outDir + '/' + hap + '.' + chrom + '.bam' for chrom in chroms]
    pysam.merge('-f', outDir + '/' + hap + '.bam', *inputs)
Exemplo n.º 32
0
# Optionally extract the reads overlapping bypass.bed into bypassed.bam.
if bypass:
    pysam.view("-@",
               str(thread),
               "--no-PG",
               "-L",
               "bypass.bed",
               "-b",
               # "-o" and its value are separate arguments; a missing comma
               # used to fuse them into the single string "-obypassed.bam".
               "-o",
               "bypassed.bam",
               inName,
               catch_stdout=False)  ##pysam bug

# while bypassing.poll() == None:
#     continue

#merging = Popen([merge_cmd], shell = True)
# Merge the BAM files listed in the file toMergeF ("-b") into ouName.
pysam.merge("--no-PG", "-c", "-p", "-b", toMergeF, "-O", "BAM", "-@",
            str(thread), ouName)

# while merging.poll() == None:
#     continue

# rehead_cmd = "samtools view -H " + inName + " | samtools reheader -P  - " + ouName
# rehead_cmd += " > rehead.bam" + " ; mv rehead.bam " + ouName

## print(rehead_cmd)

##reheading = Popen([rehead_cmd], shell = True)
# pysam.view("-H", "-o", "header.sam", inName, catch_stdout=False)
# pysam.reheader("-P",  "-i", "header.sam", ouName)

# while reheading.poll() == None:
#     continue
Exemplo n.º 33
0
	os.remove(mapped5)
	os.remove(mapped3)
	os.remove(mappedUs)

	# Sorting the resulting file. We do it this way because the original
	# file wasn't guaranteed to be sorted. To try to merge the three files
	# (split, 5' and 3'), we'd need to assume some sorting order and I'm
	# not willing to do that. Sorting afterward enforces the 'samtools'
	# name ordering over anything that might have been there originally.
	bsorted=os.path.join(tmpdir, "sorted")
	pysam.sort("-n", outbam, bsorted)
	os.rename(bsorted+".bam", outbam)
	
####################################################################################################
# Stitching two files together to reform a single BAM file.

# samtools merge options: "-n" treats the inputs as name-sorted, "-f"
# overwrites the output file if it already exists.
cmd=["-n", "-f"]
cmd.append(args.output)
cmd=cmd+allnames
pysam.merge(*cmd)

# The inputs have been merged into args.output, so delete them.
for x in allnames:
	os.remove(x)

# Mopping up: drop the temporary directory and close the dump file.
import shutil
shutil.rmtree(tmpdir)
dumpf.close()

####################################################################################################


Exemplo n.º 34
0
            okread.is_read2 = not is_first
            sout.write(okread)

    os.remove(mapped5)
    os.remove(mapped3)
    os.remove(mappedUs)

    # Sorting the resulting file. We do it this way because the original
    # file wasn't guaranteed to be sorted. To try to merge the three files
    # (split, 5' and 3'), we'd need to assume some sorting order and I'm
    # not willing to do that. Sorting afterward enforces the 'samtools'
    # name ordering over anything that might have been there originally.
    bsorted = os.path.join(tmpdir, "sorted.bam")
    pysam.sort("-o", bsorted, "-n", outbam)
    os.rename(bsorted, outbam)

####################################################################################################
# Stitching two files together to reform a single BAM file.

# Build the samtools merge argument list: "-n" treats the inputs as
# name-sorted, "-f" overwrites an existing output, then the output path
# followed by every input file.
cmd = ["-n", "-f"]
cmd.append(args.output)
cmd = cmd + allnames
pysam.merge(*cmd)

# The inputs are now part of args.output; remove them one by one.
for x in allnames:
    os.remove(x)

# Mopping up: remove the temporary directory, then close the dump file.
import shutil
shutil.rmtree(tmpdir)
dumpf.close()

####################################################################################################
Exemplo n.º 35
0
def _split_genome_tag(query_name):
    """Split a tagged read name into ``(original_name, tag)``.

    ``main`` appends ``_g1`` / ``_g2`` to every read name before merging, so
    the tag is looked for as a suffix only; a read whose own name happens to
    contain ``_g1`` or ``_g2`` is therefore not misclassified.  ``tag`` is
    ``"g1"``, ``"g2"`` or ``None`` when the name carries neither suffix.
    """
    for tag in ("g1", "g2"):
        suffix = "_" + tag
        if query_name.endswith(suffix):
            return query_name[:-len(suffix)], tag
    return query_name, None


def _new_read_group(paired):
    """Return an empty container for the alignments of one read name."""
    if paired:
        keys = ("g1_mate1", "g1_mate2", "g2_mate1", "g2_mate2")
    else:
        keys = ("g1", "g2")
    return {key: [] for key in keys}


def _phase_sorted_reads(reads, paired, phase, emit, stat_number):
    """Phase name-sorted alignments one original read name at a time.

    Parameters
    ----------
    reads : iterable of alignments sorted by (tagged) read name.
    paired : bool; when true alignments are also split by mate and every
        result other than ``"single"`` counts as two reads.
    phase : callable ``phase(group) -> (readt, support)``.
    emit : callable ``emit(group, readt, support)`` writing the group out.
    stat_number : dict of counters keyed by ``readt``; updated in place.

    The last group is always flushed, even when ``reads`` is empty.
    """

    def flush(group):
        readt, support = phase(group)
        stat_number[readt] += 2 if (paired and readt != "single") else 1
        emit(group, readt, support)

    current_name = None
    group = _new_read_group(paired)
    for r in reads:
        base_name, tag = _split_genome_tag(r.query_name)
        if current_name is not None and base_name != current_name:
            # A new read name starts: the previous group is complete.
            flush(group)
            group = _new_read_group(paired)
        current_name = base_name
        if tag is None:
            continue
        if not paired:
            group[tag].append(r)
        elif r.is_read1:
            group[tag + "_mate1"].append(r)
        elif r.is_read2:
            group[tag + "_mate2"].append(r)
    flush(group)


def _headerless_sam_to_bam(stem, header_path, threads):
    """Convert the header-less ``<stem>.sam`` into ``<stem>.bam``.

    The header stored in ``header_path`` is prepended to the SAM body in
    ``<stem>.tmp.sam``, which is converted with ``samtools view``; both SAM
    files are removed afterwards.

    Raises
    ------
    OSError
        If a file cannot be read, written or removed, or samtools cannot be
        started.
    subprocess.CalledProcessError
        If samtools exits with a non-zero status.
    """
    import shutil
    import subprocess

    body_sam = stem + ".sam"
    full_sam = stem + ".tmp.sam"
    with open(full_sam, "wb") as merged:
        for part in (header_path, body_sam):
            with open(part, "rb") as src:
                shutil.copyfileobj(src, merged)
    cmd = ["samtools", "view", "-@", str(threads), "-b", "-S", full_sam,
           "-o", stem + ".bam"]
    print("Excuting: " + " ".join(cmd) + "\n")
    # check=True turns a failed conversion into an exception instead of
    # letting it pass silently.
    subprocess.run(cmd, check=True)
    os.remove(body_sam)
    os.remove(full_sam)


def main():
    """Phase reads aligned to two parental genomes and write a report.

    Steps, all driven by the values returned by ``getfile()``:

    1. Copy both input alignments to temporary BAM files, tagging every read
       name with ``_g1`` / ``_g2``, then merge and name-sort them.
    2. Walk the name-sorted reads, phase each read (pair) with the function
       matching the data type (``Isoseq``, ``RNAseq``, ``BSseq``,
       ``Riboseq``) and write it to one of eight header-less SAM files.
    3. Prepend the original headers plus a ``@PG`` line, convert the SAM
       files to BAM with samtools and delete the temporary files.
    4. Write ``<gop1>_<gop2>_Phasing_Report.txt`` and print it.

    Exits with status 2 when a conversion or a clean-up step fails.
    """
    import subprocess

    g1_al, g2_al, g1_s, gop1, gop2, pt, sth = getfile()
    makefile(gop1 + "_genome", gop2 + "_genome")
    g1snp, g2snp = readsnpfile(g1_s)

    g1samfile = pysam.AlignmentFile(g1_al, "r")
    g2samfile = pysam.AlignmentFile(g2_al, "r")

    for tmp_name in ("tmp1.bam", "tmp2.bam", "tmp3.bam", "tmp_sortname.bam"):
        exitfile(tmp_name)

    # Tag every read name with its genome of origin so the two alignments of
    # the same read can be told apart after merging.
    for tagged_path, source, suffix in (("tmp1.bam", g1samfile, "_g1"),
                                        ("tmp2.bam", g2samfile, "_g2")):
        tagged = pysam.AlignmentFile(tagged_path,
                                     "wb",
                                     template=source,
                                     threads=int(sth))
        for r in source:
            r.query_name = r.query_name + suffix
            tagged.write(r)
        tagged.close()

    pysam.merge("-@", sth, "tmp3.bam", "tmp1.bam", "tmp2.bam")
    pysam.sort("-@", sth, "-n", "-o", "tmp_sortname.bam", "tmp3.bam")

    tmp_all = pysam.AlignmentFile("tmp_sortname.bam", "rb")
    g1_header = g1samfile.text
    g2_header = g2samfile.text

    snpsupport = open(gop1 + "_" + gop2 + "_" + "support.txt", "w")

    # Output stems (without extension); the order matches the positional
    # file arguments expected by readswrite / pairsreadswrite.
    g1file = [
        gop1 + "_genome/" + gop1 + "reads", gop1 + "_genome/" + gop2 + "reads",
        gop1 + "_genome/unknownreads", gop1 + "_genome/" + gop1 + "onlyreads"
    ]
    g2file = [
        gop2 + "_genome/" + gop1 + "reads", gop2 + "_genome/" + gop2 + "reads",
        gop2 + "_genome/unknownreads", gop2 + "_genome/" + gop2 + "onlyreads"
    ]
    # Headers are omitted here and prepended later, once per genome.
    sam_outputs = [
        pysam.AlignmentFile(stem + ".sam",
                            "w",
                            template=tmp_all,
                            threads=int(sth),
                            add_sam_header=False)
        for stem in g1file + g2file
    ]

    stat_number = {
        "g1": 0,
        "g2": 0,
        "g1only": 0,
        "g2only": 0,
        "unk": 0,
        "single": 0,
        "discarded": 0
    }

    # Pick the phasing function for the data type; only the selected name is
    # looked up.  An unknown type phases nothing.
    if pt == "Isoseq":
        phase_func, paired = isoreadsphase, False
    elif pt == "RNAseq":
        phase_func, paired = rnapairreadsphase, True
    elif pt == "BSseq":
        phase_func, paired = wgbsreadsphase, True
    elif pt == "Riboseq":
        phase_func, paired = riboreadsphase, False
    else:
        phase_func, paired = None, False

    if phase_func is not None:
        write_func = pairsreadswrite if paired else readswrite
        sinks = sam_outputs + [snpsupport]
        _phase_sorted_reads(
            tmp_all, paired,
            lambda group: phase_func(group, g1snp, g2snp, gop1, gop2),
            lambda group, readt, support: write_func(group, readt, support,
                                                     *sinks),
            stat_number)

    for sam_output in sam_outputs:
        sam_output.close()
    snpsupport.close()

    # Everything needed from the inputs has been read; close them before
    # their temporary files are deleted below.
    g1samfile.close()
    g2samfile.close()
    tmp_all.close()

    PG = '@PG\tID:phasing.py\tPN:PP2PG\tVN:1.0.0\tCL:' + ' '.join(
        sys.argv) + '\n'
    with open("tmp4.header", "w") as tmp4:
        tmp4.write(g1_header + PG)
    with open("tmp5.header", "w") as tmp5:
        tmp5.write(g2_header + PG)

    for header_path, stems in (("tmp4.header", g1file),
                               ("tmp5.header", g2file)):
        for f in stems:
            exitfile(f + ".bam")
            exitfile(f + ".tmp.sam")
            try:
                _headerless_sam_to_bam(f, header_path, sth)
            except (OSError, subprocess.CalledProcessError) as e:
                print(e)
                sys.exit(2)
            else:
                print("*** " + f +
                      ".sam converted bam file successfully. ***\n")

    try:
        for tmp_name in ("tmp1.bam", "tmp2.bam", "tmp3.bam",
                         "tmp_sortname.bam", "tmp4.header", "tmp5.header"):
            os.remove(tmp_name)
        # The SNP-support table is treated as a temporary file as well.
        os.remove(gop1 + "_" + gop2 + "_support.txt")
    except OSError as e:
        print(e)
        sys.exit(2)
    else:
        print("*** Deleted tmp file successfully. ***\n")

    g1reads_stats = stat_number["g1"]
    g2reads_stats = stat_number["g2"]
    unknowreads_stats = stat_number["unk"] + stat_number["single"]
    g1onlyreads_stats = stat_number["g1only"]
    g2onlyreads_stats = stat_number["g2only"]

    # "discarded" reads are left out of the rate on purpose (see the note in
    # the report).  Guard against an empty denominator, which happens when
    # no read was counted in any reported category.
    phased = float(g1reads_stats) + float(g2reads_stats)
    counted = (phased + float(g1onlyreads_stats) + float(g2onlyreads_stats) +
               float(unknowreads_stats))
    separation_rate = round(phased / counted * 100, 2) if counted else 0.0

    datatype = {
        "Isoseq": "Iso-Seq (PacBio Isoform Sequence).",
        "RNAseq": "RNA-Seq for paired-end reads.",
        "BSseq": "BS-Seq (Bisulfite Sequencing) for paired-end reads.",
        "Riboseq": "Ribo-seq (Ribosome Profiling)."
    }
    head = """
==========================
Final Phasing Reads Report
==========================

""" + "Phasing Data Type:\t" + datatype[pt] + """

Note: When calculating the Separation Rate, the reads which are unmapped in two parental genomes are discarded.

Type of reads:\tRead Counts
"""
    report = "".join([
        gop1 + "-synteny reads:\t" + str(g1reads_stats) + "\n",
        gop2 + "-synteny reads:\t " + str(g2reads_stats) + "\n",
        "Unknown reads:\t" + str(unknowreads_stats) + "\n",
        gop1 + "-only reads:\t" + str(g1onlyreads_stats) + "\n",
        gop2 + "-only reads:\t" + str(g2onlyreads_stats) + "\n\n",
        "Separation Rate:\t" + str(separation_rate) + "%\n",
    ])
    with open(gop1 + "_" + gop2 + "_Phasing_Report.txt", "w") as file_report:
        file_report.write(head + report)
    print(head + report)
    print("""
=================
Phasing Finished!
=================
""")
Exemplo n.º 36
0
sort_and_index(out_base + '_WO.bam')

if barcode_type == 2:
    out_fr.close()
    out_rr.close()
    print('I found', tot, fcount, rcount, frcount, rrcount, count_wo)
    print('sorting - indexing')
    sort_and_index(out_base + '_F.bam')
    sort_and_index(out_base + '_R.bam')
    sort_and_index(out_base + '_FR.bam')
    sort_and_index(out_base + '_RR.bam')

elif barcode_type == 1:
    if sys.argv[5] == 'True':
        print('merging')
        pysam.merge("-f", out_base + '_F_plus_R.bam', out_base + '_F.bam',
                    out_base + '_R.bam')
        print('sorting - indexing')
        sort_and_index(out_base + '_F_plus_R.bam')
        print('delete startig files')
        os.remove(out_base + '_F.bam')
        os.remove(out_base + '_R.bam')

    elif sys.argv[5] == 'False':
        sort_and_index(out_base + '_F.bam')
        sort_and_index(out_base + '_R.bam')

    else:
        print('Merge is either True or False')
        sys.exit(1)
    print('I found', tot, fcount, rcount, count_wo)
else:
Exemplo n.º 37
0
def merge_sam_files(sam_dir_path, out_sam_path):
    """Merge every ``*.sam`` file found in ``sam_dir_path`` into one file.

    Parameters
    ----------
    sam_dir_path : str
        Directory searched (non-recursively) for ``*.sam`` inputs.
    out_sam_path : str
        Path of the merged output; an existing file is overwritten (``-f``)
        and missing parent directories are created.

    The inputs are passed to ``pysam.merge`` in sorted order so the result
    does not depend on directory listing order.  The output file itself is
    never used as an input, which matters when it lives in ``sam_dir_path``
    and is left over from an earlier run.
    """
    out_abs = os.path.abspath(out_sam_path)
    sam_files = sorted(
        path for path in glob.glob(os.path.join(sam_dir_path, '*.sam'))
        if os.path.abspath(path) != out_abs)
    # dirname is '' for a bare file name, and os.makedirs('') raises.
    out_dir = os.path.dirname(out_sam_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    pysam.merge('-f', out_sam_path, *sam_files, catch_stdout=False)
Exemplo n.º 38
0
        bam_merge_files.append(bam_root + ".sorted.bam")

    # Run the bs_seeker2-align.py steps on the split up fastq files
    for ffa in fastq_for_alignment:
        pwgbs.Aligner(ffa[0], ffa[1], ffa[2], ffa[3][ffa[2]], ffa[4], ffa[5])

    # Sort and merge the aligned bam files
    # Pre-sort the original input bam files
    for bfs in bam_sort_files:
        pysam.sort("-o", bfs[1], bfs[0])

    f_bam = in_file1.split("/")
    f_bam[-1] = f_bam[-1].replace(".fastq", ".sorted.bam")
    out_bam_file = "/".join(f_bam)

    pysam.merge(out_bam_file, *bam_merge_files)

    pysam.sort("-o", out_bam_file + '.sorted.bam', "-T",
               out_bam_file + ".bam_sort", out_bam_file)

    pysam.index(out_bam_file)

    # Run the bs_seeker2-call_methylation.py steps
    pwgbs.MethylationCaller(
        aligner_dir, out_bam_file,
        data_dir + project_id + '/' + srr_id + '/' + srr_id,
        genome_fa["unzipped"] + "_bowtie2")

    # Tidy up
    pwgbs.clean_up(ata_dir + project_id)
Exemplo n.º 39
0
def sam_parser(bwastdout, out_dir):
    """Collect alignment statistics from a SAM stream and merge the BAM parts.

    SAM text is read line by line from ``bwastdout.stdout``; header lines are
    collected and the remaining lines are dealt out over ``th`` in-memory
    lists.  Each batch is handed to ``error_wrap`` and its results are
    accumulated.  The totals are passed to ``plot_stats``, and the files
    named by ``error_wrap`` are merged with ``pysam.merge`` into
    ``<out_dir>/<prefix>.bam`` and then deleted.

    Relies on module-level names not defined in this function: ``th``,
    ``prefix``, ``work_dir``, ``log``, ``arange``, ``diff``, ``error_wrap``
    and ``plot_stats``.  The code uses Python 2 idioms (``dict.iteritems``,
    ``map``/``zip`` returning lists that are indexed, counted and compared).

    Parameters
    ----------
    bwastdout : object whose ``stdout`` attribute supports ``readline()``
        (presumably a ``subprocess.Popen`` running the aligner -- confirm).
    out_dir : str
        Directory receiving the merged BAM and the temporary
        ``to_merge.txt``; also forwarded to ``plot_stats``.

    Returns
    -------
    None
    """
    log.info('Error estimation...')
    # Keys of ``mapped_frac_size``; presumably upper edges of size bins --
    # TODO confirm against error_wrap.
    bins = [
        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100,
        200, 300, 400, 500, 600, 700, 800, 900, 1000, 2000, 3000, 4000, 5000,
        6000, 7000, 8000, 9000, 10000, 15000, 20000, 25000, 30000, 35000,
        40000, 45000, 50000, 100000, 200000, 300000, 400000, 500000, 750000,
        1000000000
    ]
    # Two running counters per bin, summed element-wise from every batch.
    mapped_frac_size = {k: [0, 0] for k in bins}
    # One entry per 1,000,000-wide window start in [0, 250,000,000); each
    # maps a chromosome key to a list of values.
    pos_dict = {k: {} for k in arange(0, 250000000, 1000000)}
    # Per-event tables; 'M', 'I', 'D' presumably stand for match, insertion
    # and deletion -- confirm.
    OutDict = {k: {} for k in ['M', 'I', 'D']}
    header_list = []
    # One line buffer per index in range(th).
    proc_lists = {k: [] for k in range(th)}
    chr_cov_dict = {}
    file_list = []
    fill_val = 0
    Flag = True
    # Each pass of this loop builds one batch of lines and processes it.
    while Flag:
        line = bwastdout.stdout.readline()
        Threshold = 50000
        # Keep reading until the last buffer holds more than Threshold lines.
        while len(proc_lists[th - 1]) <= Threshold:
            # Buffer lengths before this round (first th-1 buffers only).
            proc_check_first = map(lambda x: len(proc_lists[x]), range(th - 1))
            line_counter = 0
            # A blank line (or '' at end of stream) ends all reading.
            if line.strip() == '':
                Flag = False
                break
            if line[0] == '@':
                header_list.append(line)
                line = bwastdout.stdout.readline()
            else:
                # Deal one line to each buffer in turn, reading the next
                # line after every append.
                while line_counter < len(proc_lists):
                    proc_lists[line_counter].append(line)
                    line_counter += 1
                    line = bwastdout.stdout.readline()
                line_counter = 0
                # NOTE(review): the line read by the last inner iteration is
                # overwritten here without being stored, so one line per
                # round appears to be dropped -- confirm whether intended.
                line = bwastdout.stdout.readline()
            # Buffer lengths after this round.
            proc_check_second = map(lambda x: len(proc_lists[x]),
                                    range(th - 1))
            # If the buffers are non-empty but did not grow, wait 5 s,
            # re-check and stop filling when still unchanged.
            # NOTE(review): nothing visible here changes proc_lists during
            # the sleep, so the second comparison looks redundant -- confirm.
            if all(v == 0 for v in proc_check_second) == False:
                if proc_check_second == proc_check_first:
                    time.sleep(5)
                    proc_check_second = map(lambda x: len(proc_lists[x]),
                                            range(th - 1))
                    if proc_check_second == proc_check_first:
                        break
        # (start, end) pair passed to error_wrap; it advances by the number
        # of non-empty buffers among the first th-1.
        fill_list = (fill_val, fill_val + (th - 1) -
                     map(lambda x: len(proc_lists[x]), range(th - 1)).count(0))
        fill_val = fill_val + (th - 1) - map(lambda x: len(proc_lists[x]),
                                             range(th - 1)).count(0)
        res_obj = error_wrap(proc_lists, header_list, fill_list)
        for ro in res_obj:
            # Each result: position dict, event dict, per-bin counts and a
            # file name that is merged at the end.
            p_d, OD, m_f_s, fname = ro
            file_list.append(fname)
            for k, v in p_d.iteritems():
                # NOTE(review): `is not {}` compares identity with a fresh
                # dict and is always true; an empty dict simply yields no
                # iterations below.
                if v is not {}:
                    for ch, l in v.iteritems():
                        pos_dict[k][ch] = pos_dict[k].get(ch, []) + l
            for var_k, cnt in OD.iteritems():
                # NOTE(review): same always-true identity test as above.
                if cnt is not {}:
                    for k, v in cnt.iteritems():
                        outdictget = OutDict.get(var_k)
                        # First sighting of a key stores the value as is;
                        # afterwards elements 1 and 2 are summed as ints and
                        # element 0 is kept.
                        if outdictget.get(int(k)) == None:
                            outdictget[int(k)] = v
                        else:
                            outdictget[int(k)] = [
                                outdictget.get(int(k))[0],
                                int(outdictget.get(int(k))[1]) + int(v[1]),
                                int(outdictget.get(int(k))[2]) + int(v[2])
                            ]
            # Add this batch's per-bin counts to the running totals.
            for mfs in m_f_s:
                mapped_frac_size[mfs[0]] = map(
                    sum, zip(mapped_frac_size[mfs[0]], mfs[1:]))
            # Buffers are reset here, inside the loop over results, so the
            # reset only happens when error_wrap returned at least one item.
            proc_lists = {k: [] for k in range(th)}
    # Rebuild every event table as key -> (width, count1, count2) where
    # width is the first key followed by the differences between successive
    # sorted keys, count1 is element 1 with every entry before the position
    # of the column maximum replaced by that maximum, and count2 is
    # element 2 unchanged.
    # NOTE(review): `sorted(...)[0]` raises IndexError if a table is empty.
    for ev in ['I', 'M', 'D']:
        OutDict[ev] = dict(
            zip(
                sorted(OutDict[ev].keys()),
                zip([sorted(OutDict[ev].keys())[0]] +
                    list(diff(sorted(OutDict[ev].keys()))), [
                        max(
                            zip(*map(lambda x: OutDict[ev].get(x),
                                     sorted(OutDict[ev].keys())))[1])
                        if x[0] < zip(*map(lambda x: OutDict[ev].get(
                            x), sorted(OutDict[ev].keys())))[1].index(
                                max(
                                    zip(*map(lambda x: OutDict[ev].get(x),
                                             sorted(OutDict[ev].keys())))[1]))
                        else x[1] for x in enumerate(
                            zip(*map(lambda x: OutDict[ev].get(x),
                                     sorted(OutDict[ev].keys())))[1])
                    ],
                    zip(*map(lambda x: OutDict[ev].get(x),
                             sorted(OutDict[ev].keys())))[2])))
    # Per chromosome: list of (window start, sum of window values / 1e6).
    for k, v in pos_dict.iteritems():
        for k1, v1 in v.iteritems():
            chr_cov_dict[str(k1)] = chr_cov_dict.get(
                str(k1), []) + [(k, sum(v1) / 1000000.)]
    # Rows [bin, count0, count1, 1 - count1 / (count0 + count1)] for every
    # bin with at least one non-zero count, ordered by bin.
    mapped_frac_size = [[
        k[0], k[1][0], k[1][1],
        round(1 - float(k[1][1]) / (float(k[1][0]) + float(k[1][1])), 3)
    ] for k in sorted(mapped_frac_size.iteritems())
                        if k[1][0] != 0 or k[1][1] != 0]
    # Fourth column of the rows above.
    sorted_unmapfraqseq = zip(*mapped_frac_size)[3]
    # Collapse each (width, count1, count2) tuple to count2 / (width * count1).
    for ev in OutDict.keys():
        for k, v in sorted(OutDict[ev].iteritems()):
            OutDict[ev][k] = round(float(v[2]) / (int(v[0]) * float(v[1])), 4)
    plot_stats(OutDict, sorted_unmapfraqseq, mapped_frac_size, chr_cov_dict,
               out_dir)
    finalfile = os.path.join(out_dir, (prefix + '.bam'))
    # List file handed to samtools merge through "-b".
    bamsfile = os.path.join(out_dir, 'to_merge.txt')
    file = open(bamsfile, 'w')
    for line in file_list:
        file.write(os.path.join(work_dir, line) + '\n')
    file.close()
    # "-cp" combines colliding @RG and @PG header IDs; "-@" sets the thread
    # count and "-b" names the list of inputs.
    pysam.merge("-cp",
                "-@%s" % str(th),
                "-b%s" % bamsfile,
                finalfile,
                catch_stdout=False)
    # NOTE(review): the merge list uses work_dir-joined paths while removal
    # uses the bare names; they match only if the current directory is
    # work_dir or the names are absolute -- confirm.
    for b in file_list:
        os.remove(b)
    os.remove(bamsfile)
Exemplo n.º 40
0
def mergeBamFiles(outputFile, inputList, cpus):
    """Merge the BAM files in ``inputList`` into ``outputFile``.

    Parameters
    ----------
    outputFile : str
        Path of the merged BAM; overwritten if it exists (``-f``).
    inputList : sequence of str
        Paths of the BAM files to merge.
    cpus : int or str
        Number of additional threads passed to samtools via ``-@``.
    """
    # pysam only accepts string arguments, so coerce the thread count; all
    # options are placed before the output file, as in the samtools usage
    # line "samtools merge [options] out.bam in1.bam ...".
    pysam.merge("-f", "-@", str(cpus), outputFile, *inputList)
Exemplo n.º 41
0
def _view_discordant(args, filenames, threads, required_flag):
    """Filter the input alignment into ``filenames.discordant_bam``.

    Runs ``samtools view -f <required_flag> -F 3842 -b`` on ``args.b`` (BAM)
    or, if that is None, on ``args.c`` (CRAM, with ``--reference args.fa``).
    Does nothing when both are None.
    """
    if args.b is not None:
        source = [args.b]
    elif args.c is not None:
        source = ['--reference', args.fa, args.c]
    else:
        return
    pysam.view('-@', threads, '-f', required_flag, '-F', '3842', '-b', '-o',
               filenames.discordant_bam, *source, catch_stdout=False)


def _concat_fastq(args, params, sources, destination):
    """Concatenate the existing ``sources`` into ``destination``.

    Each source that exists is appended in order and then handed to
    ``utils.gzip_or_del``; missing sources are silently skipped.
    """
    with open(destination, 'w') as outfile:
        for path in sources:
            if os.path.exists(path):
                with open(path) as infile:
                    outfile.writelines(infile)
                utils.gzip_or_del(args, params, path)


def _write_long_pairs(in_path1, in_path2, out_path1, out_path2, min_seq_len):
    """Copy paired FASTQ records whose two sequences are both long enough.

    The two inputs are read in lockstep, four lines per record. A pair is
    written only if both sequence lines (second line of each record, stripped)
    have at least ``min_seq_len`` characters. A trailing partial record is
    dropped.
    """
    with open(in_path1) as infile1, open(in_path2) as infile2, \
            open(out_path1, 'w') as outfile1, \
            open(out_path2, 'w') as outfile2:
        record1, record2 = [], []
        for line1, line2 in zip(infile1, infile2):
            record1.append(line1)
            record2.append(line2)
            if len(record1) == 4:
                if (len(record1[1].strip()) >= min_seq_len
                        and len(record2[1].strip()) >= min_seq_len):
                    outfile1.write(''.join(record1))
                    outfile2.write(''.join(record2))
                record1, record2 = [], []


def retrieve_unmapped_reads(args, params, filenames):
    """Extract read pairs from the input BAM/CRAM into a pair of FASTQ files.

    Three modes are selected by ``args.use_mate_mapped`` and
    ``args.all_discordant``:

    * both False (default): reads with SAM flag bits 12 set (read and mate
      unmapped) are extracted and converted to FASTQ directly.
    * ``all_discordant`` True: all paired reads passing the ``-F 3842`` filter
      are name-sorted and converted to FASTQ.
    * otherwise (``use_mate_mapped`` True): pairs with both ends unmapped are
      written, plus pairs in which exactly one end is unmapped; the resulting
      FASTQ files are concatenated.

    Afterwards, pairs in which either sequence is shorter than
    ``params.min_seq_len`` are removed, producing
    ``filenames.unmapped_merged_1`` / ``filenames.unmapped_merged_2``.
    Intermediate files are deleted unless ``args.keep`` is set.

    Parameters
    ----------
    args
        Parsed options; reads ``p`` (thread count), ``b`` (BAM path), ``c``
        (CRAM path), ``fa`` (CRAM reference), ``keep``, ``use_mate_mapped``
        and ``all_discordant``.
    params
        Run parameters; reads ``min_seq_len`` and is forwarded to
        ``utils.gzip_or_del``.
    filenames
        Object whose attributes hold the intermediate and output file paths.

    Any exception is logged with its traceback and the process exits with
    status 1.
    """
    log.logger.debug('started.')
    try:
        # With 3 or more threads requested, one fewer is given to samtools.
        thread_n = args.p if args.p <= 2 else args.p - 1
        threads = '%d' % thread_n

        if args.use_mate_mapped is False and args.all_discordant is False:
            # default: both ends unmapped (-f 12)
            _view_discordant(args, filenames, threads, '12')
            pysam.fastq('-@', threads, '-N', '-0', '/dev/null', '-1',
                        filenames.unmapped_merged_pre1, '-2',
                        filenames.unmapped_merged_pre2, '-s', '/dev/null',
                        filenames.discordant_bam)
            if args.keep is False:
                os.remove(filenames.discordant_bam)
        else:
            # non-default: start from every paired read (-f 1), name-sorted
            _view_discordant(args, filenames, threads, '1')
            pysam.sort('-@', threads, '-n', '-O', 'BAM', '-o',
                       filenames.discordant_sort_bam, filenames.discordant_bam)
            if args.keep is False:
                os.remove(filenames.discordant_bam)
            if args.all_discordant is True:
                pysam.fastq('-@', threads, '-N', '-0', '/dev/null',
                            '-1', filenames.unmapped_merged_pre1, '-2',
                            filenames.unmapped_merged_pre2, '-s', '/dev/null',
                            filenames.discordant_sort_bam)
            else:
                # pairs with both ends unmapped
                pysam.fastq('-@', threads, '-f', '12', '-F', '3328',
                            '-N', '-0', '/dev/null', '-1',
                            filenames.unmapped_1, '-2', filenames.unmapped_2,
                            '-s', '/dev/null', filenames.discordant_sort_bam)
                if args.use_mate_mapped is True:
                    # mapped reads whose mate is unmapped (-f 8, -F includes 4)
                    pysam.view('-@', threads, '-f', '8', '-F', '3332', '-b',
                               '-o', filenames.unmapped_bam_3,
                               filenames.discordant_sort_bam,
                               catch_stdout=False)
                    # unmapped reads whose mate is mapped (-f 4, -F includes 8)
                    pysam.view('-@', threads, '-f', '4', '-F', '3336', '-b',
                               '-o', filenames.unmapped_bam_4,
                               filenames.discordant_sort_bam,
                               catch_stdout=False)
                    pysam.merge('-@', threads, '-f',
                                filenames.unmapped_bam_34,
                                filenames.unmapped_bam_3,
                                filenames.unmapped_bam_4)
                    pysam.sort('-@', threads, '-n', '-O', 'BAM', '-o',
                               filenames.unmapped_sorted_34,
                               filenames.unmapped_bam_34)
                    pysam.fastq('-@', threads, '-N', '-0', '/dev/null',
                                '-1', filenames.unmapped_3, '-2',
                                filenames.unmapped_4, '-s', '/dev/null',
                                filenames.unmapped_sorted_34)
                # concatenate fastq
                _concat_fastq(args, params,
                              [filenames.unmapped_1, filenames.unmapped_3],
                              filenames.unmapped_merged_pre1)
                _concat_fastq(args, params,
                              [filenames.unmapped_2, filenames.unmapped_4],
                              filenames.unmapped_merged_pre2)

        # remove short reads
        _write_long_pairs(filenames.unmapped_merged_pre1,
                          filenames.unmapped_merged_pre2,
                          filenames.unmapped_merged_1,
                          filenames.unmapped_merged_2,
                          params.min_seq_len)
        utils.gzip_or_del(args, params, filenames.unmapped_merged_pre1)
        utils.gzip_or_del(args, params, filenames.unmapped_merged_pre2)

        if args.keep is False:
            leftovers = [filenames.discordant_sort_bam]
            if args.use_mate_mapped is True:
                leftovers += [
                    filenames.unmapped_bam_3,
                    filenames.unmapped_bam_4,
                    filenames.unmapped_bam_34,
                    filenames.unmapped_sorted_34,
                ]
            # The mate-mapped intermediates are not produced when
            # all_discordant is also set, so only remove what exists instead
            # of failing a run that has already completed its work.
            for path in leftovers:
                if os.path.exists(path):
                    os.remove(path)

    except:  # noqa: E722 - deliberately broad: log any failure, then exit 1
        log.logger.error('\n' + traceback.format_exc())
        exit(1)