def align_dataset_parallel(
    dataset,
    blocks,
    reference,
    alignmentArgs,
    ionstatsArgs,
    BASECALLER_RESULTS,
    basecaller_meta_information,
    library_key,
    graph_max_x,
    ALIGNMENT_RESULTS,
    do_realign,
    do_ionstats,
    do_mark_duplicates,
    do_indexing,
    align_threads,
    barcodeInfo,
):
    """Align one basecaller dataset (one barcode), block-wise or in one pass.

    Two strategies:
      * "traditional block processing" — when a reference is set, there is
        more than one block, and the dataset's read count exceeds a
        memory-dependent threshold, each block is aligned on its own via
        ``align()`` and the per-block BAMs are then merged with
        ``samtools merge`` (optionally piped through ionstats / BamDuplicates).
      * "merged block processing" — otherwise all blocks are handed to a
        single ``align()`` call.

    Any exception is caught and its traceback printed; nothing is re-raised,
    so callers only see failures in the log.
    """
    do_sorting = True
    try:
        # process block by block
        # Threshold scales with host RAM: large-memory machines can afford the
        # single-pass merged path for bigger datasets.
        memTotalGb = _get_total_memory_gb()
        maxReads = 20000000
        if memTotalGb > 140:
            maxReads = 60000000
        if reference and len(blocks) > 1 and int(dataset["read_count"]) > maxReads:
            printtime(
                "DEBUG: TRADITIONAL BLOCK PROCESSING ------ prefix: %20s ----------- reference: %20s ---------- reads: %10s ----------"
                % (dataset["file_prefix"], reference, dataset["read_count"])
            )

            # start alignment for each block and current barcode with reads
            # TODO: in how many blocks are reads with this barcode
            # Per-block passes defer ionstats/duplicate-marking to the merge
            # stage below (both disabled here).
            for block in blocks:
                printtime("DEBUG: ALIGN ONLY ONE BLOCK: %s" % block)
                align(
                    [block],
                    os.path.join(BASECALLER_RESULTS, dataset["basecaller_bam"]),
                    alignmentArgs,
                    ionstatsArgs,
                    reference,
                    basecaller_meta_information,
                    library_key,
                    graph_max_x,
                    do_realign,
                    do_ionstats=False,
                    do_sorting=do_sorting,
                    do_mark_duplicates=False,
                    do_indexing=False,
                    output_dir=os.path.join(block, ALIGNMENT_RESULTS),
                    output_basename=dataset["file_prefix"],
                    threads=align_threads,
                    barcode_info=barcodeInfo,
                )

            bamdir = "."  # TODO , do we need this ?
            bamBase = dataset["file_prefix"]
            bamfile = dataset["file_prefix"] + ".bam"
            # Collect the per-block BAMs that actually exist (blocks without
            # reads for this barcode produce no file).
            block_bam_list = [
                os.path.join(blockdir, bamdir, bamfile) for blockdir in blocks
            ]
            block_bam_list = [
                block_bam_filename
                for block_bam_filename in block_bam_list
                if os.path.exists(block_bam_filename)
            ]
            printtime("blocks with reads: %s" % len(block_bam_list))

            bamFile = dataset["file_prefix"] + ".bam"
            composite_bam_filepath = dataset["file_prefix"] + ".bam"
            # Writes <composite>.header.sam used by the merge below.
            blockprocessing.extract_and_merge_bam_header(
                block_bam_list, composite_bam_filepath
            )
            # Usage: samtools merge [-nr] [-h inh.sam] <out.bam> <in1.bam> <in2.bam> [...]
            # NOTE(review): "[email protected]" is preserved verbatim from the original;
            # it presumably encodes a samtools thread flag — confirm before use.
            cmd = "samtools merge -l1 [email protected]"
            if do_ionstats:
                # Merge to stdout so ionstats (and optionally BamDuplicates)
                # can consume the stream.
                cmd += " - "
            else:
                cmd += " %s" % (composite_bam_filepath)
            for bamfile in block_bam_list:
                cmd += " %s" % bamfile
            cmd += " -h %s.header.sam" % composite_bam_filepath

            if do_ionstats:
                bam_filenames = ["/dev/stdin"]
                ionstats_alignment_filename = (
                    "%s.ionstats_alignment.json" % bamBase
                )  # os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix']+'.ionstats_alignment.json')
                ionstats_alignment_h5_filename = (
                    "%s.ionstats_error_summary.h5" % bamBase
                )  # os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix']+'.ionstats_error_summary.h5')
                ionstats_cmd = ionstats.generate_ionstats_alignment_cmd(
                    ionstatsArgs,
                    bam_filenames,
                    ionstats_alignment_filename,
                    ionstats_alignment_h5_filename,
                    basecaller_meta_information,
                    library_key,
                    graph_max_x,
                )
                # Duplicate the stream: one copy to ionstats, one onward.
                cmd += " | tee >(%s)" % ionstats_cmd
                if do_mark_duplicates:
                    json_name = (
                        "BamDuplicates.%s.json" % bamBase
                        if bamBase != "rawlib"
                        else "BamDuplicates.json"
                    )
                    cmd = "BamDuplicates -i <(%s) -o %s -j %s" % (cmd, bamFile, json_name)
                else:
                    cmd += " > %s.bam" % bamBase

            printtime("DEBUG: Calling '%s':" % cmd)
            # bash is required for the >(...) process-substitution syntax.
            ret = subprocess.Popen(["/bin/bash", "-c", cmd]).wait()
            if ret != 0:
                printtime("ERROR: merging failed, return code: %d" % ret)
                raise RuntimeError("exit code: %d" % ret)

            # TODO: piping into samtools index or create index in sort process ?
            if do_indexing and do_sorting:
                cmd = "samtools index " + bamFile
                printtime("DEBUG: Calling '%s':" % cmd)
                subprocess.call(cmd, shell=True)
        else:
            printtime(
                "DEBUG: MERGED BLOCK PROCESSING ----------- prefix: %20s ----------- reference: %20s ---------- reads: %10s ----------"
                % (dataset["file_prefix"], reference, dataset["read_count"])
            )
            # TODO: try a python multiprocessing pool
            align(
                blocks,
                os.path.join(BASECALLER_RESULTS, dataset["basecaller_bam"]),
                alignmentArgs,
                ionstatsArgs,
                reference,
                basecaller_meta_information,
                library_key,
                graph_max_x,
                do_realign,
                do_ionstats,
                do_sorting,
                do_mark_duplicates,
                do_indexing,
                output_dir=ALIGNMENT_RESULTS if reference else BASECALLER_RESULTS,
                output_basename=dataset["file_prefix"],
                threads=align_threads,
                barcode_info=barcodeInfo,
            )
    except Exception:
        traceback.print_exc()
def align(
        blocks,
        basecaller_bam_filename,  # e.g. 'basecaller_results/IonXpress001.basecaller.bam'
        alignerArgs,
        ionstatsArgs,
        referenceName,
        basecaller_meta_information,
        library_key,
        graph_max_x,
        do_realign,
        do_ionstats,
        do_sorting,
        do_mark_duplicates,
        do_indexing,
        output_dir,
        output_basename,
        threads=0):
    """Build and run one shell pipeline that aligns (or just merges) reads.

    With a reference: merge per-block unmapped BAMs (if any), align with tmap
    or bowtie2, optionally tee into ionstats, then sort / mark duplicates /
    index, producing ``output_dir/output_basename.bam``.
    Without a reference: only merge the unmapped BAMs and run ionstats
    basecaller statistics.

    Raises RuntimeError when the spawned pipeline exits non-zero.
    threads=0 means "use all CPUs".
    """
    try:
        threads = threads or multiprocessing.cpu_count()
        bamBase = os.path.normpath(output_dir + "/" + output_basename)
        bamFile = bamBase + ".bam"

        printtime("reference:        %s" % referenceName)
        printtime("input blocks:     %s" % blocks)
        printtime("input reads:      %s" % basecaller_bam_filename)
        printtime("output dir:       %s" % output_dir)
        printtime("output basename:  %s" % output_basename)
        printtime("full output base: %s" % bamBase)
        printtime("full output file: %s" % bamFile)  # TODO: not always used

        # Select the aligner and its base command from the free-form args.
        # '...' splits user args into "pre-stage options ... stage options".
        if 'tmap' in alignerArgs:
            aligner = 'tmap'
            if '...' in alignerArgs:
                alist = alignerArgs.split('...')
                cmd = alist[0]
                tmap_stage_options = alist[1]
            else:
                cmd = 'tmap mapall'
                tmap_stage_options = 'stage1 map4'
        elif 'bowtie2' in alignerArgs:
            aligner = 'bowtie2'
            cmd = alignerArgs
        else:
            printtime("ERROR: Aligner command not specified")
            # NOTE(review): bare `raise` with no active exception — will itself
            # raise "No active exception to re-raise".
            raise

        if not referenceName:
            # 1. create merged unmapped bam, 2. call ionstats
            # TODO: long term: move ionstats basecaller into basecaller binary
            cmd = ""
            composite_bam_filepath = bamBase + '.basecaller.bam'
            if blocks:
                bamdir = '.'  # TODO , do we need this ?
                bamfile = basecaller_bam_filename
                # Keep only the per-block BAMs that actually exist.
                block_bam_list = [os.path.join(blockdir, bamdir, bamfile) for blockdir in blocks]
                block_bam_list = [block_bam_filename for block_bam_filename in block_bam_list if os.path.exists(block_bam_filename)]
                if len(block_bam_list) >= 2:
                    blockprocessing.extract_and_merge_bam_header(block_bam_list, composite_bam_filepath)
                    # samtools cat: concatenate BAMs without re-sorting.
                    cmd = 'samtools cat -h %s.header.sam -o /dev/stdout' % (composite_bam_filepath)
                    for blockbamfile in block_bam_list:
                        cmd = cmd + ' %s' % blockbamfile
                elif len(block_bam_list) == 1:
                    # cmd = "samtools reheader %s.header.sam %s -" % (composite_bam_filepath,block_bam_list[0])
                    cmd = "cat %s" % (block_bam_list[0])
                else:
                    # No block produced reads for this barcode: nothing to do.
                    return
                '''
                if block_bam_list:
                    composite_bai_filepath=""
                    mark_duplicates=False
                    method='samtools'
                    blockprocessing.merge_bam_files(block_bam_list, composite_bam_filepath, composite_bai_filepath, mark_duplicates, method)
                '''
                bam_filenames = ["/dev/stdin"]
            else:
                bam_filenames = [basecaller_bam_filename]

            if do_ionstats:
                ionstats_cmd = ionstats.generate_ionstats_basecaller_cmd(
                    bam_filenames,
                    bamBase + '.ionstats_basecaller.json',
                    library_key,
                    graph_max_x)
                if blocks:
                    # tee: write the merged BAM and feed ionstats in one pass.
                    cmd += " | tee >(%s)" % ionstats_cmd
                    cmd += " > %s" % composite_bam_filepath
                else:
                    cmd = ionstats_cmd

            printtime("DEBUG: Calling '%s':" % cmd)
            # bash required for >(...) process substitution.
            ret = subprocess.Popen(['/bin/bash', '-c', cmd]).wait()
            if ret != 0:
                printtime("ERROR: unmapped bam merging failed, return code: %d" % ret)
                raise RuntimeError('exit code: %d' % ret)
            return

        if aligner == 'tmap':
            referenceFastaFile = '/results/referenceLibrary/tmap-f3/' + referenceName + '/' + referenceName + '.fasta'
            if blocks:
                bamdir = '.'  # TODO , do we need this ?
                bamfile = basecaller_bam_filename
                # printtime("DEBUG: BLOCKS for BAMFILE %s: %s" % (bamfile, blocks))
                block_bam_list = [os.path.join(blockdir, bamdir, bamfile) for blockdir in blocks]
                # printtime("DEBUG: block_bam_list: %s" % block_bam_list)
                block_bam_list = [block_bam_filename for block_bam_filename in block_bam_list if os.path.exists(block_bam_filename)]
                # printtime("DEBUG: block_bam_list: %s" % block_bam_list)
                printtime("blocks with reads: %s" % len(block_bam_list))
                if len(block_bam_list) >= 2:
                    # Stream the concatenated blocks straight into tmap's stdin.
                    blockprocessing.extract_and_merge_bam_header(block_bam_list, basecaller_bam_filename)
                    mergecmd = 'samtools cat -h %s.header.sam -o /dev/stdout' % basecaller_bam_filename
                    for blockbamfile in block_bam_list:
                        mergecmd = mergecmd + ' %s' % blockbamfile
                    '''
                    mergecmd = 'java -Xmx8g -jar /opt/picard/picard-tools-current/picard.jar MergeSamFiles'
                    for blockbamfile in block_bam_list:
                        mergecmd = mergecmd + ' I=%s' % blockbamfile
                    mergecmd = mergecmd + ' O=/dev/stdout'
                    mergecmd = mergecmd + ' VERBOSITY=WARNING'  # suppress INFO on stderr
                    mergecmd = mergecmd + ' QUIET=true'  # suppress job-summary on stderr
                    mergecmd = mergecmd + ' VALIDATION_STRINGENCY=SILENT'
                    '''
                    cmd = mergecmd + " | " + cmd
                    cmd += " -n %d" % threads
                    cmd += " -f %s" % referenceFastaFile
                    cmd += " -i bam"
                elif len(block_bam_list) == 1:
                    # Single block: tmap can read the file directly.
                    cmd += " -n %d" % threads
                    cmd += " -f %s" % referenceFastaFile
                    cmd += " -r %s" % block_bam_list[0]
                else:
                    printtime("ERROR: all blocks filtered")
                    return
            else:
                cmd += " -n %d" % threads
                cmd += " -f %s" % referenceFastaFile
                cmd += " -r %s" % basecaller_bam_filename
            cmd += " -v"
            cmd += " -Y"
            cmd += " -u --prefix-exclude 5"  # random seed based on read name after ignoring first 5 characters
            if do_realign:
                cmd += " --do-realign"
            cmd += " -o 2"  # -o 0: SAM, -o 2: uncompressed BAM
            cmd += " %s" % tmap_stage_options
            cmd += " 2>> " + bamBase + '.alignmentQC_out.txt'  # logfile
        elif aligner == 'bowtie2':
            referenceFastaDir = '/results/referenceLibrary/bowtie2/' + referenceName + '/' + referenceName
            cmd = "java -Xmx8g -jar /opt/picard/picard-tools-current/picard.jar SamToFastq I=%s F=/dev/stdout" % basecaller_bam_filename
            cmd += " | /results/plugins/bowtielauncher/bowtie2 -p%d -x %s -U /dev/stdin" % (threads, referenceFastaDir)
            cmd += " | samtools view -ubS -"

        if do_ionstats:
            bam_filenames = ["/dev/stdin"]
            ionstats_alignment_filename = "%s.ionstats_alignment.json" % bamBase
            ionstats_alignment_h5_filename = "%s.ionstats_error_summary.h5" % bamBase
            ionstats_cmd = ionstats.generate_ionstats_alignment_cmd(
                ionstatsArgs,
                bam_filenames,
                ionstats_alignment_filename,
                ionstats_alignment_h5_filename,
                basecaller_meta_information,
                library_key,
                graph_max_x)
            cmd += " | tee >(%s)" % ionstats_cmd

        if do_sorting:
            if do_mark_duplicates:
                # TODO: implement alternative, maybe with named pipes
                # NOTE(review): "[email protected]" preserved verbatim — presumably an
                # obfuscated samtools thread flag; confirm against deployment.
                cmd += " | samtools sort -m 1000M -l1 [email protected] -o - -"
                json_name = 'BamDuplicates.%s.json' % bamBase if bamBase != 'rawlib' else 'BamDuplicates.json'
                cmd = "BamDuplicates -i <(%s) -o %s -j %s" % (cmd, bamFile, json_name)
            else:
                # cmd += " | ( samtools sort -m 1000M -l1 [email protected] - %s <&0 & )" % bamBase
                cmd += " | samtools sort -m 1000M -l1 [email protected] - %s" % bamBase
        else:
            cmd += " > %s.bam" % bamBase

        printtime("DEBUG: Calling '%s':" % cmd)
        ret = subprocess.Popen(['/bin/bash', '-c', cmd]).wait()
        if ret != 0:
            printtime("ERROR: alignment failed, return code: %d" % ret)
            raise RuntimeError('exit code: %d' % ret)

        # TODO: piping into samtools index or create index in sort process ?
        if do_indexing and do_sorting:
            cmd = "samtools index " + bamFile
            printtime("DEBUG: Calling '%s':" % cmd)
            subprocess.call(cmd, shell=True)

        '''
        if do_indexing:
            try:
                composite_bam_filepath = os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix']+'.bam')
                composite_bai_filepath = os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix']+'.bam.bai')
                blockprocessing.create_index_file(composite_bam_filepath, composite_bai_filepath)
            except:
                traceback.print_exc()
        '''
    except:
        raise
def align(
    blocks,
    basecaller_bam_filename,  # e.g. 'basecaller_results/IonXpress001.basecaller.bam'
    alignerArgs,
    ionstatsArgs,
    referenceName,
    basecaller_meta_information,
    library_key,
    graph_max_x,
    do_realign,
    do_ionstats,
    do_sorting,
    do_mark_duplicates,
    do_indexing,
    output_dir,
    output_basename,
    threads=0,
    barcode_info=None,
):
    """Build and run one shell pipeline that aligns (or just merges) reads.

    With a reference: merge per-block unmapped BAMs (if any), align with tmap
    or bowtie2, optionally tee into ionstats, then sort / mark duplicates /
    index, producing ``output_dir/output_basename.bam``.
    Without a reference: only merge the unmapped BAMs and run ionstats
    basecaller statistics.

    threads=0 means "use all CPUs".  barcode_info defaults to an empty dict
    (fixed from a mutable default argument).

    Raises RuntimeError when aligner selection fails or the spawned pipeline
    exits non-zero.
    """
    # Fix: avoid the shared mutable default argument ({}) — bind per call.
    if barcode_info is None:
        barcode_info = {}
    try:
        threads = threads or multiprocessing.cpu_count()
        memGb = _get_total_memory_gb()
        bamBase = os.path.normpath(output_dir + "/" + output_basename)
        bamFile = bamBase + ".bam"

        printtime("reference:        %s" % referenceName)
        printtime("input blocks:     %s" % blocks)
        printtime("input reads:      %s" % basecaller_bam_filename)
        printtime("output dir:       %s" % output_dir)
        printtime("output basename:  %s" % output_basename)
        printtime("full output base: %s" % bamBase)
        printtime("full output file: %s" % bamFile)  # TODO: not always used

        # process args before the splits
        alignerArgs = process_tmap_bed_file_args(
            input_args=alignerArgs,
            bam_filename=output_basename,
            barcode_info=barcode_info,
        )

        # Select the aligner and its base command from the free-form args.
        # '...' splits user args into "pre-stage options ... stage options".
        # On low-memory hosts (<= 40 GB) a smaller tmap read queue is forced.
        if "tmap" in alignerArgs:
            aligner = "tmap"
            if "..." in alignerArgs:
                alist = alignerArgs.split("...")
                cmd = alist[0]
                if memGb and memGb <= 40:
                    cmd = _add_read_queue_size(cmd, 50000)
                tmap_stage_options = alist[1]
            else:
                cmd = "tmap mapall"
                if memGb and memGb <= 40:
                    cmd = _add_read_queue_size(cmd, 50000)
                tmap_stage_options = "stage1 map4"
        elif "bowtie2" in alignerArgs:
            aligner = "bowtie2"
            cmd = alignerArgs
        else:
            printtime("ERROR: Aligner command not specified")
            # Fix: bare `raise` here had no active exception and produced a
            # confusing "No active exception to re-raise"; raise explicitly.
            raise RuntimeError("Aligner command not specified")

        if not referenceName:
            # 1. create merged unmapped bam, 2. call ionstats
            # TODO: long term: move ionstats basecaller into basecaller binary
            cmd = ""
            composite_bam_filepath = bamBase + ".basecaller.bam"
            if blocks:
                bamdir = "."  # TODO , do we need this ?
                bamfile = basecaller_bam_filename
                # Keep only the per-block BAMs that actually exist.
                block_bam_list = [
                    os.path.join(blockdir, bamdir, bamfile) for blockdir in blocks
                ]
                block_bam_list = [
                    block_bam_filename
                    for block_bam_filename in block_bam_list
                    if os.path.exists(block_bam_filename)
                ]
                if len(block_bam_list) >= 2:
                    blockprocessing.extract_and_merge_bam_header(
                        block_bam_list, composite_bam_filepath
                    )
                    # samtools cat: concatenate BAMs without re-sorting.
                    cmd = "samtools cat -h %s.header.sam -o /dev/stdout" % (
                        composite_bam_filepath
                    )
                    for blockbamfile in block_bam_list:
                        cmd = cmd + " %s" % blockbamfile
                elif len(block_bam_list) == 1:
                    # cmd = "samtools reheader %s.header.sam %s -" % (composite_bam_filepath,block_bam_list[0])
                    cmd = "cat %s" % (block_bam_list[0])
                else:
                    # No block produced reads for this barcode: nothing to do.
                    return
                """
                if block_bam_list:
                    composite_bai_filepath=""
                    mark_duplicates=False
                    method='samtools'
                    blockprocessing.merge_bam_files(block_bam_list, composite_bam_filepath, composite_bai_filepath, mark_duplicates, method)
                """
                bam_filenames = ["/dev/stdin"]
            else:
                bam_filenames = [basecaller_bam_filename]

            if do_ionstats:
                ionstats_cmd = ionstats.generate_ionstats_basecaller_cmd(
                    bam_filenames,
                    bamBase + ".ionstats_basecaller.json",
                    library_key,
                    graph_max_x,
                )
                if blocks:
                    # tee: write the merged BAM and feed ionstats in one pass.
                    cmd += " | tee >(%s)" % ionstats_cmd
                    cmd += " > %s" % composite_bam_filepath
                else:
                    cmd = ionstats_cmd

            printtime("DEBUG: Calling '%s':" % cmd)
            # bash required for >(...) process substitution.
            ret = subprocess.Popen(["/bin/bash", "-c", cmd]).wait()
            if ret != 0:
                printtime("ERROR: unmapped bam merging failed, return code: %d" % ret)
                raise RuntimeError("exit code: %d" % ret)
            return

        if aligner == "tmap":
            referenceFastaFile = (
                ion.referenceBasePath + referenceName + "/" + referenceName + ".fasta"
            )
            if blocks:
                bamdir = "."  # TODO , do we need this ?
                bamfile = basecaller_bam_filename
                # printtime("DEBUG: BLOCKS for BAMFILE %s: %s" % (bamfile, blocks))
                block_bam_list = [
                    os.path.join(blockdir, bamdir, bamfile) for blockdir in blocks
                ]
                # printtime("DEBUG: block_bam_list: %s" % block_bam_list)
                block_bam_list = [
                    block_bam_filename
                    for block_bam_filename in block_bam_list
                    if os.path.exists(block_bam_filename)
                ]
                # printtime("DEBUG: block_bam_list: %s" % block_bam_list)
                printtime("blocks with reads: %s" % len(block_bam_list))
                if len(block_bam_list) >= 2:
                    # Stream the concatenated blocks straight into tmap's stdin.
                    blockprocessing.extract_and_merge_bam_header(
                        block_bam_list, basecaller_bam_filename
                    )
                    mergecmd = (
                        "samtools cat -h %s.header.sam -o /dev/stdout"
                        % basecaller_bam_filename
                    )
                    for blockbamfile in block_bam_list:
                        mergecmd = mergecmd + " %s" % blockbamfile
                    """
                    mergecmd = 'java -Xmx8g -jar ' + ion.picardPath + ' MergeSamFiles'
                    for blockbamfile in block_bam_list:
                        mergecmd = mergecmd + ' I=%s' % blockbamfile
                    mergecmd = mergecmd + ' O=/dev/stdout'
                    mergecmd = mergecmd + ' VERBOSITY=WARNING'  # suppress INFO on stderr
                    mergecmd = mergecmd + ' QUIET=true'  # suppress job-summary on stderr
                    mergecmd = mergecmd + ' VALIDATION_STRINGENCY=SILENT'
                    """
                    cmd = mergecmd + " | " + cmd
                    cmd += " -n %d" % threads
                    cmd += " -f %s" % referenceFastaFile
                    cmd += " -i bam"
                elif len(block_bam_list) == 1:
                    # Single block: tmap can read the file directly.
                    cmd += " -n %d" % threads
                    cmd += " -f %s" % referenceFastaFile
                    cmd += " -r %s" % block_bam_list[0]
                else:
                    printtime("ERROR: all blocks filtered")
                    return
            else:
                cmd += " -n %d" % threads
                cmd += " -f %s" % referenceFastaFile
                cmd += " -r %s" % basecaller_bam_filename
            cmd += " -v"
            cmd += " -Y"
            cmd += (
                " -u --prefix-exclude 5"
            )  # random seed based on read name after ignoring first 5 characters
            if do_realign:
                cmd += " --do-realign"
            cmd += " -o 2"  # -o 0: SAM, -o 2: uncompressed BAM
            cmd += " %s" % tmap_stage_options
            cmd += " 2>> " + bamBase + ".alignmentQC_out.txt"  # logfile
        elif aligner == "bowtie2":
            referenceFastaDir = (
                "/results/referenceLibrary/bowtie2/" + referenceName + "/" + referenceName
            )
            cmd = "java -Xmx8g -jar %s SamToFastq I=%s F=/dev/stdout" % (
                ion.picardPath,
                basecaller_bam_filename,
            )
            cmd += (
                " | /results/plugins/bowtielauncher/bowtie2 -p%d -x %s -U /dev/stdin"
                % (threads, referenceFastaDir)
            )
            cmd += " | samtools view -ubS -"

        if do_ionstats:
            bam_filenames = ["/dev/stdin"]
            ionstats_alignment_filename = "%s.ionstats_alignment.json" % bamBase
            ionstats_alignment_h5_filename = "%s.ionstats_error_summary.h5" % bamBase
            ionstats_cmd = ionstats.generate_ionstats_alignment_cmd(
                ionstatsArgs,
                bam_filenames,
                ionstats_alignment_filename,
                ionstats_alignment_h5_filename,
                basecaller_meta_information,
                library_key,
                graph_max_x,
            )
            cmd += " | tee >(%s)" % ionstats_cmd

        # use number align_threads if smaller than 12
        samtool_threads = min(threads, 12)

        if do_sorting:
            if do_mark_duplicates:
                # use '-T' option to avoid temp file name collision among barcodes
                cmd += " | samtools sort -m 1000M -l1 [email protected]%d -T tmp_%s -O bam -o - -" % (
                    samtool_threads,
                    bamBase,
                )
                json_name = (
                    "BamDuplicates.%s.json" % bamBase
                    if bamBase != "rawlib"
                    else "BamDuplicates.json"
                )
                cmd = "BamDuplicates -i <(%s) -o %s -j %s" % (cmd, bamFile, json_name)
            else:
                # use '-T' option to avoid temp file name collision among barcodes
                cmd += " | samtools sort -m 1000M -l1 [email protected]{thread_num} -T tmp_{output_prefix} -O bam -o - - > {output_prefix}.bam".format(
                    thread_num=samtool_threads, output_prefix=bamBase
                )
        else:
            cmd += " > %s.bam" % bamBase

        printtime("DEBUG: Calling '%s':" % cmd)
        ret = subprocess.Popen(["/bin/bash", "-c", cmd]).wait()
        if ret != 0:
            printtime("ERROR: alignment failed, return code: %d" % ret)
            raise RuntimeError("exit code: %d" % ret)

        # TODO: piping into samtools index or create index in sort process ?
        if do_indexing and do_sorting:
            cmd = "samtools index " + bamFile
            printtime("DEBUG: Calling '%s':" % cmd)
            subprocess.call(cmd, shell=True)

        """
        if do_indexing:
            try:
                composite_bam_filepath = os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix']+'.bam')
                composite_bai_filepath = os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix']+'.bam.bai')
                blockprocessing.create_index_file(composite_bam_filepath, composite_bai_filepath)
            except:
                traceback.print_exc()
        """
    except Exception:
        raise
def align_dataset_parallel(
        dataset,
        blocks,
        reference,
        alignmentArgs,
        ionstatsArgs,
        BASECALLER_RESULTS,
        basecaller_meta_information,
        library_key,
        graph_max_x,
        ALIGNMENT_RESULTS,
        do_realign,
        do_ionstats,
        do_mark_duplicates,
        do_indexing,
        align_threads):
    """Align one basecaller dataset (one barcode), block-wise or in one pass.

    Large datasets (> 20M reads, multi-block, with a reference) are aligned
    block by block via ``align()`` and the per-block BAMs are merged with
    ``samtools merge`` (optionally piped through ionstats / BamDuplicates).
    Smaller datasets are handed to a single ``align()`` call.

    Any exception is caught and its traceback printed; nothing is re-raised.
    """
    do_sorting = True
    try:
        # process block by block
        if reference and len(blocks) > 1 and int(dataset["read_count"]) > 20000000:
            printtime("DEBUG: TRADITIONAL BLOCK PROCESSING ------ prefix: %20s ----------- reference: %20s ---------- reads: %10s ----------"
                      % (dataset['file_prefix'], reference, dataset["read_count"]))

            # start alignment for each block and current barcode with reads
            # TODO: in how many blocks are reads with this barcode
            # Per-block passes defer ionstats/duplicate-marking to the merge
            # stage below (both disabled here).
            for block in blocks:
                printtime("DEBUG: ALIGN ONLY ONE BLOCK: %s" % block)
                align(
                    [block],
                    os.path.join(BASECALLER_RESULTS, dataset['basecaller_bam']),
                    alignmentArgs,
                    ionstatsArgs,
                    reference,
                    basecaller_meta_information,
                    library_key,
                    graph_max_x,
                    do_realign,
                    do_ionstats=False,
                    do_sorting=do_sorting,
                    do_mark_duplicates=False,
                    do_indexing=False,
                    output_dir=os.path.join(block, ALIGNMENT_RESULTS),
                    output_basename=dataset['file_prefix'],
                    threads=align_threads)

            bamdir = '.'  # TODO , do we need this ?
            bamBase = dataset['file_prefix']
            bamfile = dataset['file_prefix'] + ".bam"
            # Collect the per-block BAMs that actually exist (blocks without
            # reads for this barcode produce no file).
            block_bam_list = [os.path.join(blockdir, bamdir, bamfile) for blockdir in blocks]
            block_bam_list = [block_bam_filename for block_bam_filename in block_bam_list if os.path.exists(block_bam_filename)]
            printtime("blocks with reads: %s" % len(block_bam_list))

            bamFile = dataset['file_prefix'] + ".bam"
            composite_bam_filepath = dataset['file_prefix'] + ".bam"
            # Writes <composite>.header.sam used by the merge below.
            blockprocessing.extract_and_merge_bam_header(block_bam_list, composite_bam_filepath)
            # Usage: samtools merge [-nr] [-h inh.sam] <out.bam> <in1.bam> <in2.bam> [...]
            cmd = 'samtools merge -l1 [email protected]'
            if do_ionstats:
                # Merge to stdout so ionstats (and optionally BamDuplicates)
                # can consume the stream.
                cmd += ' - '
            else:
                cmd += ' %s' % (composite_bam_filepath)
            for bamfile in block_bam_list:
                cmd += ' %s' % bamfile
            cmd += ' -h %s.header.sam' % composite_bam_filepath

            if do_ionstats:
                bam_filenames = ["/dev/stdin"]
                ionstats_alignment_filename = "%s.ionstats_alignment.json" % bamBase  # os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix']+'.ionstats_alignment.json')
                ionstats_alignment_h5_filename = "%s.ionstats_error_summary.h5" % bamBase  # os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix']+'.ionstats_error_summary.h5')
                ionstats_cmd = ionstats.generate_ionstats_alignment_cmd(
                    ionstatsArgs,
                    bam_filenames,
                    ionstats_alignment_filename,
                    ionstats_alignment_h5_filename,
                    basecaller_meta_information,
                    library_key,
                    graph_max_x)
                # Duplicate the stream: one copy to ionstats, one onward.
                cmd += " | tee >(%s)" % ionstats_cmd
                if do_mark_duplicates:
                    json_name = 'BamDuplicates.%s.json' % bamBase if bamBase != 'rawlib' else 'BamDuplicates.json'
                    cmd = "BamDuplicates -i <(%s) -o %s -j %s" % (cmd, bamFile, json_name)
                else:
                    cmd += " > %s.bam" % bamBase

            printtime("DEBUG: Calling '%s':" % cmd)
            # bash is required for the >(...) process-substitution syntax.
            ret = subprocess.Popen(['/bin/bash', '-c', cmd]).wait()
            if ret != 0:
                printtime("ERROR: merging failed, return code: %d" % ret)
                raise RuntimeError('exit code: %d' % ret)

            # TODO: piping into samtools index or create index in sort process ?
            if do_indexing and do_sorting:
                cmd = "samtools index " + bamFile
                printtime("DEBUG: Calling '%s':" % cmd)
                subprocess.call(cmd, shell=True)
        else:
            printtime("DEBUG: MERGED BLOCK PROCESSING ----------- prefix: %20s ----------- reference: %20s ---------- reads: %10s ----------"
                      % (dataset['file_prefix'], reference, dataset["read_count"]))
            # TODO: try a python multiprocessing pool
            align(
                blocks,
                os.path.join(BASECALLER_RESULTS, dataset['basecaller_bam']),
                alignmentArgs,
                ionstatsArgs,
                reference,
                basecaller_meta_information,
                library_key,
                graph_max_x,
                do_realign,
                do_ionstats,
                do_sorting,
                do_mark_duplicates,
                do_indexing,
                output_dir=ALIGNMENT_RESULTS if reference else BASECALLER_RESULTS,
                output_basename=dataset['file_prefix'],
                threads=align_threads)
    # Fix: narrow the bare `except:` — it also swallowed SystemExit and
    # KeyboardInterrupt; errors are still logged, not re-raised.
    except Exception:
        traceback.print_exc()
def align(
        blocks,
        basecaller_bam_filename,  # e.g. 'basecaller_results/IonXpress001.basecaller.bam'
        alignerArgs,
        ionstatsArgs,
        referenceName,
        basecaller_meta_information,
        library_key,
        graph_max_x,
        do_realign,
        do_ionstats,
        do_sorting,
        do_mark_duplicates,
        do_indexing,
        output_dir,
        output_basename,
        threads=0):
    """Build and run one shell pipeline that aligns (or just merges) reads.

    With a reference: merge per-block unmapped BAMs (if any), align with tmap
    or bowtie2, optionally tee into ionstats, then sort / mark duplicates /
    index, producing ``output_dir/output_basename.bam``.
    Without a reference: only merge the unmapped BAMs and run ionstats
    basecaller statistics.

    Raises RuntimeError when the spawned pipeline exits non-zero.
    threads=0 means "use all CPUs".
    """
    try:
        threads = threads or multiprocessing.cpu_count()
        bamBase = os.path.normpath(output_dir + "/" + output_basename)
        bamFile = bamBase + ".bam"

        printtime("reference:        %s" % referenceName)
        printtime("input blocks:     %s" % blocks)
        printtime("input reads:      %s" % basecaller_bam_filename)
        printtime("output dir:       %s" % output_dir)
        printtime("output basename:  %s" % output_basename)
        printtime("full output base: %s" % bamBase)
        printtime("full output file: %s" % bamFile)  # TODO: not always used

        # Select the aligner and its base command from the free-form args.
        # '...' splits user args into "pre-stage options ... stage options".
        if 'tmap' in alignerArgs:
            aligner = 'tmap'
            if '...' in alignerArgs:
                alist = alignerArgs.split('...')
                cmd = alist[0]
                tmap_stage_options = alist[1]
            else:
                cmd = 'tmap mapall'
                tmap_stage_options = 'stage1 map4'
        elif 'bowtie2' in alignerArgs:
            aligner = 'bowtie2'
            cmd = alignerArgs
        else:
            printtime("ERROR: Aligner command not specified")
            # NOTE(review): bare `raise` with no active exception — will itself
            # raise "No active exception to re-raise".
            raise

        if not referenceName:
            # 1. create merged unmapped bam, 2. call ionstats
            # TODO: long term: move ionstats basecaller into basecaller binary
            cmd = ""
            composite_bam_filepath = bamBase + '.basecaller.bam'
            if blocks:
                bamdir = '.'  # TODO , do we need this ?
                bamfile = basecaller_bam_filename
                # Keep only the per-block BAMs that actually exist.
                block_bam_list = [
                    os.path.join(blockdir, bamdir, bamfile)
                    for blockdir in blocks
                ]
                block_bam_list = [
                    block_bam_filename
                    for block_bam_filename in block_bam_list
                    if os.path.exists(block_bam_filename)
                ]
                if len(block_bam_list) >= 2:
                    blockprocessing.extract_and_merge_bam_header(
                        block_bam_list, composite_bam_filepath)
                    # samtools cat: concatenate BAMs without re-sorting.
                    cmd = 'samtools cat -h %s.header.sam -o /dev/stdout' % (
                        composite_bam_filepath)
                    for blockbamfile in block_bam_list:
                        cmd = cmd + ' %s' % blockbamfile
                elif len(block_bam_list) == 1:
                    # cmd = "samtools reheader %s.header.sam %s -" % (composite_bam_filepath,block_bam_list[0])
                    cmd = "cat %s" % (block_bam_list[0])
                else:
                    # No block produced reads for this barcode: nothing to do.
                    return
                '''
                if block_bam_list:
                    composite_bai_filepath=""
                    mark_duplicates=False
                    method='samtools'
                    blockprocessing.merge_bam_files(block_bam_list, composite_bam_filepath, composite_bai_filepath, mark_duplicates, method)
                '''
                bam_filenames = ["/dev/stdin"]
            else:
                bam_filenames = [basecaller_bam_filename]

            if do_ionstats:
                ionstats_cmd = ionstats.generate_ionstats_basecaller_cmd(
                    bam_filenames,
                    bamBase + '.ionstats_basecaller.json',
                    library_key,
                    graph_max_x)
                if blocks:
                    # tee: write the merged BAM and feed ionstats in one pass.
                    cmd += " | tee >(%s)" % ionstats_cmd
                    cmd += " > %s" % composite_bam_filepath
                else:
                    cmd = ionstats_cmd

            printtime("DEBUG: Calling '%s':" % cmd)
            # bash required for >(...) process substitution.
            ret = subprocess.Popen(['/bin/bash', '-c', cmd]).wait()
            if ret != 0:
                printtime(
                    "ERROR: unmapped bam merging failed, return code: %d" % ret)
                raise RuntimeError('exit code: %d' % ret)
            return

        if aligner == 'tmap':
            referenceFastaFile = '/results/referenceLibrary/tmap-f3/' + referenceName + '/' + referenceName + '.fasta'
            if blocks:
                bamdir = '.'  # TODO , do we need this ?
                bamfile = basecaller_bam_filename
                # printtime("DEBUG: BLOCKS for BAMFILE %s: %s" % (bamfile, blocks))
                block_bam_list = [
                    os.path.join(blockdir, bamdir, bamfile)
                    for blockdir in blocks
                ]
                # printtime("DEBUG: block_bam_list: %s" % block_bam_list)
                block_bam_list = [
                    block_bam_filename
                    for block_bam_filename in block_bam_list
                    if os.path.exists(block_bam_filename)
                ]
                # printtime("DEBUG: block_bam_list: %s" % block_bam_list)
                printtime("blocks with reads: %s" % len(block_bam_list))
                if len(block_bam_list) >= 2:
                    # Stream the concatenated blocks straight into tmap's stdin.
                    blockprocessing.extract_and_merge_bam_header(
                        block_bam_list, basecaller_bam_filename)
                    mergecmd = 'samtools cat -h %s.header.sam -o /dev/stdout' % basecaller_bam_filename
                    for blockbamfile in block_bam_list:
                        mergecmd = mergecmd + ' %s' % blockbamfile
                    '''
                    mergecmd = 'java -Xmx8g -jar /opt/picard/picard-tools-current/picard.jar MergeSamFiles'
                    for blockbamfile in block_bam_list:
                        mergecmd = mergecmd + ' I=%s' % blockbamfile
                    mergecmd = mergecmd + ' O=/dev/stdout'
                    mergecmd = mergecmd + ' VERBOSITY=WARNING'  # suppress INFO on stderr
                    mergecmd = mergecmd + ' QUIET=true'  # suppress job-summary on stderr
                    mergecmd = mergecmd + ' VALIDATION_STRINGENCY=SILENT'
                    '''
                    cmd = mergecmd + " | " + cmd
                    cmd += " -n %d" % threads
                    cmd += " -f %s" % referenceFastaFile
                    cmd += " -i bam"
                elif len(block_bam_list) == 1:
                    # Single block: tmap can read the file directly.
                    cmd += " -n %d" % threads
                    cmd += " -f %s" % referenceFastaFile
                    cmd += " -r %s" % block_bam_list[0]
                else:
                    printtime("ERROR: all blocks filtered")
                    return
            else:
                cmd += " -n %d" % threads
                cmd += " -f %s" % referenceFastaFile
                cmd += " -r %s" % basecaller_bam_filename
            cmd += " -v"
            cmd += " -Y"
            cmd += " -u --prefix-exclude 5"  # random seed based on read name after ignoring first 5 characters
            if do_realign:
                cmd += " --do-realign"
            cmd += " -o 2"  # -o 0: SAM, -o 2: uncompressed BAM
            cmd += " %s" % tmap_stage_options
            cmd += " 2>> " + bamBase + '.alignmentQC_out.txt'  # logfile
        elif aligner == 'bowtie2':
            referenceFastaDir = '/results/referenceLibrary/bowtie2/' + referenceName + '/' + referenceName
            cmd = "java -Xmx8g -jar /opt/picard/picard-tools-current/picard.jar SamToFastq I=%s F=/dev/stdout" % basecaller_bam_filename
            cmd += " | /results/plugins/bowtielauncher/bowtie2 -p%d -x %s -U /dev/stdin" % (
                threads, referenceFastaDir)
            cmd += " | samtools view -ubS -"

        if do_ionstats:
            bam_filenames = ["/dev/stdin"]
            ionstats_alignment_filename = "%s.ionstats_alignment.json" % bamBase
            ionstats_alignment_h5_filename = "%s.ionstats_error_summary.h5" % bamBase
            ionstats_cmd = ionstats.generate_ionstats_alignment_cmd(
                ionstatsArgs,
                bam_filenames,
                ionstats_alignment_filename,
                ionstats_alignment_h5_filename,
                basecaller_meta_information,
                library_key,
                graph_max_x)
            cmd += " | tee >(%s)" % ionstats_cmd

        if do_sorting:
            if do_mark_duplicates:
                # TODO: implement alternative, maybe with named pipes
                # NOTE(review): "[email protected]" preserved verbatim — presumably an
                # obfuscated samtools thread flag; confirm against deployment.
                cmd += " | samtools sort -m 1000M -l1 [email protected] -o - -"
                json_name = 'BamDuplicates.%s.json' % bamBase if bamBase != 'rawlib' else 'BamDuplicates.json'
                cmd = "BamDuplicates -i <(%s) -o %s -j %s" % (cmd, bamFile, json_name)
            else:
                # cmd += " | ( samtools sort -m 1000M -l1 [email protected] - %s <&0 & )" % bamBase
                cmd += " | samtools sort -m 1000M -l1 [email protected] - %s" % bamBase
        else:
            cmd += " > %s.bam" % bamBase

        printtime("DEBUG: Calling '%s':" % cmd)
        ret = subprocess.Popen(['/bin/bash', '-c', cmd]).wait()
        if ret != 0:
            printtime("ERROR: alignment failed, return code: %d" % ret)
            raise RuntimeError('exit code: %d' % ret)

        # TODO: piping into samtools index or create index in sort process ?
        if do_indexing and do_sorting:
            cmd = "samtools index " + bamFile
            printtime("DEBUG: Calling '%s':" % cmd)
            subprocess.call(cmd, shell=True)

        '''
        if do_indexing:
            try:
                composite_bam_filepath = os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix']+'.bam')
                composite_bai_filepath = os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix']+'.bam.bai')
                blockprocessing.create_index_file(composite_bam_filepath, composite_bai_filepath)
            except:
                traceback.print_exc()
        '''
    except:
        raise
def align(
        blocks,
        alignerArgs,
        ionstatsArgs,
        referenceName,
        basecaller_meta_information,
        library_key,
        graph_max_x,
        readFile,
        do_realign,
        do_ionstats,
        do_sorting,
        do_mark_duplicates,
        do_indexing,
        logfile,
        output_dir,
        output_basename):
    """Align a single read file with tmap or bowtie2 via one shell pipeline.

    Input : readFile
    Output : output_dir/output_basename.bam

    Optionally tees the aligned stream into ionstats, then sorts, marks
    duplicates, and indexes.  Raises RuntimeError when aligner selection
    fails or the spawned pipeline exits non-zero.
    """
    try:
        # Select the aligner and its base command from the free-form args.
        # '...' splits user args into "pre-stage options ... stage options".
        if 'tmap' in alignerArgs:
            aligner = 'tmap'
            if '...' in alignerArgs:
                alist = alignerArgs.split('...')
                cmd = alist[0]
                tmap_stage_options = alist[1]
            else:
                cmd = 'tmap mapall'
                tmap_stage_options = 'stage1 map4'
        elif 'bowtie2' in alignerArgs:
            aligner = 'bowtie2'
            cmd = alignerArgs
        else:
            printtime("ERROR: Aligner command not specified")
            # Fix: bare `raise` here had no active exception and produced a
            # confusing "No active exception to re-raise"; raise explicitly.
            raise RuntimeError("Aligner command not specified")

        threads = multiprocessing.cpu_count()
        bamBase = os.path.normpath(output_dir + "/" + output_basename)
        bamFile = bamBase + ".bam"

        blocks = []  # TODO
        if aligner == 'tmap':
            referenceFastaFile = '/results/referenceLibrary/tmap-f3/' + referenceName + '/' + referenceName + '.fasta'
            if blocks:
                # Merge per-block BAMs with Picard and stream into tmap's stdin.
                mergecmd = 'java -Xmx8g -jar /opt/picard/picard-tools-current/picard.jar MergeSamFiles'
                bamdir = '.'
                bamfile = readFile
                block_bam_list = [os.path.join(blockdir, bamdir, bamfile) for blockdir in blocks]
                block_bam_list = [block_bam_filename for block_bam_filename in block_bam_list if os.path.exists(block_bam_filename)]
                for bamfile in block_bam_list:
                    mergecmd = mergecmd + ' I=%s' % bamfile
                mergecmd = mergecmd + ' O=/dev/stdout'
                mergecmd = mergecmd + ' VERBOSITY=WARNING'  # suppress INFO on stderr
                mergecmd = mergecmd + ' QUIET=true'  # suppress job-summary on stderr
                mergecmd = mergecmd + ' VALIDATION_STRINGENCY=SILENT'
                cmd = mergecmd + " | " + cmd
            cmd += " -n %d" % threads
            cmd += " -f %s" % referenceFastaFile
            if blocks:
                cmd += " -i bam"
            else:
                cmd += " -r %s" % readFile
            cmd += " -v"
            cmd += " -Y"
            cmd += " -u --prefix-exclude 5"  # random seed based on read name after ignoring first 5 characters
            if do_realign:
                cmd += " --do-realign"
            cmd += " -o 2"  # -o 0: SAM, -o 2: uncompressed BAM
            cmd += " %s" % tmap_stage_options
            cmd += " 2>> " + logfile
        elif aligner == 'bowtie2':
            referenceFastaDir = '/results/referenceLibrary/bowtie2/' + referenceName + '/' + referenceName
            cmd = "java -Xmx8g -jar /opt/picard/picard-tools-current/picard.jar SamToFastq I=%s F=/dev/stdout" % readFile
            cmd += " | /results/plugins/bowtielauncher/bowtie2 -p%d -x %s -U /dev/stdin" % (threads, referenceFastaDir)
            cmd += " | samtools view -ubS -"

        if do_ionstats:
            bam_filenames = ["/dev/stdin"]
            ionstats_alignment_filename = "%s.ionstats_alignment.json" % bamBase
            ionstats_alignment_h5_filename = "%s.ionstats_error_summary.h5" % bamBase
            ionstats_cmd = ionstats.generate_ionstats_alignment_cmd(
                ionstatsArgs,
                bam_filenames,
                ionstats_alignment_filename,
                ionstats_alignment_h5_filename,
                basecaller_meta_information,
                library_key,
                graph_max_x)
            # Duplicate the stream: one copy to ionstats, one onward.
            cmd += " | tee >(%s)" % ionstats_cmd

        if do_sorting:
            if do_mark_duplicates:
                # TODO: implement alternative, maybe with named pipes
                cmd += " | samtools sort -m 1000M -l1 [email protected] -o - -"
                json_name = 'BamDuplicates.%s.json' % bamBase if bamBase != 'rawlib' else 'BamDuplicates.json'
                cmd = "BamDuplicates -i <(%s) -o %s -j %s" % (cmd, bamFile, json_name)
            else:
                # cmd += " | ( samtools sort -m 1000M -l1 [email protected] - %s <&0 & )" % bamBase
                cmd += " | samtools sort -m 1000M -l1 [email protected] - %s" % bamBase
        else:
            cmd += " > %s.bam" % bamBase

        printtime("DEBUG: Calling '%s':" % cmd)
        # bash required for >(...) / <(...) process substitution.
        ret = subprocess.Popen(['/bin/bash', '-c', cmd]).wait()
        if ret != 0:
            printtime("ERROR: alignment failed, return code: %d" % ret)
            raise RuntimeError('exit code: %d' % ret)

        # TODO: piping into samtools index or create index in sort process ?
        if do_indexing and do_sorting:
            cmd = "samtools index " + bamFile
            # Fix: replaced Python-2-only `print cmd` (syntax error on Python 3)
            # with the printtime logging used by every other variant.
            printtime("DEBUG: Calling '%s':" % cmd)
            subprocess.call(cmd, shell=True)
    # Fix: narrowed the bare `except:` so SystemExit/KeyboardInterrupt are
    # not intercepted; behavior for ordinary errors is unchanged (re-raise).
    except Exception:
        raise
def process_datasets(blocks, alignmentArgs, ionstatsArgs, BASECALLER_RESULTS,
                     basecaller_meta_information, library_key, graph_max_x,
                     basecaller_datasets, ALIGNMENT_RESULTS, do_realign,
                     do_ionstats, do_mark_duplicates, do_indexing, barcodeInfo):
    """Align every basecaller dataset, then merge per-dataset ionstats files.

    For each dataset in basecaller_datasets["datasets"]:
      * datasets with a reference, more than one block and more than 20M reads
        are aligned block by block and the per-block BAMs are merged with
        ``samtools merge`` (optionally teeing into ionstats and/or piping
        through BamDuplicates);
      * all other datasets are aligned in one ``align()`` call over all blocks.

    Afterwards, when ``do_ionstats`` is set, the per-dataset ionstats JSON
    files are reduced into composite ``ionstats_alignment.json`` and
    ``ionstats_basecaller.json`` outputs; when no dataset produced stats, an
    empty dummy BAM is generated so the reducers still have input.

    NOTE(review): this module contains a second, later definition of
    ``process_datasets`` that shadows this one at import time - confirm which
    definition is meant to be live.
    NOTE(review): ``barcodeInfo`` is accepted but never forwarded to
    ``align()`` in this version - confirm whether that is intentional.
    NOTE(review): several command strings contain ``[email protected]``
    residue (likely mangled ``-@<threads>`` flags / ``@SQ`` header tags from an
    email-obfuscation pass) - confirm against the upstream source before use.
    """
    printtime("Attempt to align")
    printtime("DEBUG: PROCESS DATASETS blocks: '%s'" % blocks)
    do_sorting = True

    # TODO: compare with pipeline/python/ion/utils/ionstats.py
    # Collected per-dataset ionstats JSON paths, merged at the end.
    ionstats_basecaller_file_list = []
    ionstats_alignment_file_list = []
    ionstats_basecaller_filtered_file_list = []
    ionstats_alignment_filtered_file_list = []

    for dataset in basecaller_datasets["datasets"]:

        # The reference genome is taken from the dataset's first read group.
        read_group = dataset['read_groups'][0]
        reference = basecaller_datasets['read_groups'][read_group]['reference']
        #print "DEBUG: reference: %s' % reference

        # A dataset counts as filtered only if ALL of its read groups are filtered.
        filtered = True
        for rg_name in dataset["read_groups"]:
            if not basecaller_datasets["read_groups"][rg_name].get(
                    'filtered', False):
                filtered = False

        # skip non-existing bam file
        if int(dataset["read_count"]) == 0:
            continue

        try:
            # process block by block
            # Large multi-block datasets (>20M reads) are aligned one block at
            # a time, then merged; smaller ones are aligned in a single call.
            if reference and len(blocks) > 1 and int(
                    dataset["read_count"]) > 20000000:
                printtime(
                    "DEBUG: TRADITIONAL BLOCK PROCESSING ------ prefix: %20s ----------- reference: %20s ---------- reads: %10s ----------"
                    % (dataset['file_prefix'], reference, dataset["read_count"]))

                # start alignment for each block and current barcode with reads
                # TODO: in how many blocks are reads with this barcode
                for block in blocks:
                    printtime("DEBUG: ALIGN ONLY ONE BLOCK: %s" % block)
                    # Per-block alignment: ionstats/duplicates/indexing are
                    # deferred until after the merge below.
                    align([block],
                          os.path.join(BASECALLER_RESULTS,
                                       dataset['basecaller_bam']),
                          alignmentArgs,
                          ionstatsArgs,
                          reference,
                          basecaller_meta_information,
                          library_key,
                          graph_max_x,
                          do_realign,
                          do_ionstats=False,
                          do_sorting=do_sorting,
                          do_mark_duplicates=False,
                          do_indexing=False,
                          output_dir=os.path.join(block, ALIGNMENT_RESULTS),
                          output_basename=dataset['file_prefix'])

                bamdir = '.'  # TODO , do we need this ?
                bamBase = dataset['file_prefix']
                bamfile = dataset['file_prefix'] + ".bam"
                # printtime("DEBUG: BLOCKS for BAMFILE %s: %s" % (bamfile, blocks))

                # Candidate per-block BAM paths; keep only those that exist
                # (not every block necessarily has reads for this barcode).
                block_bam_list = [
                    os.path.join(blockdir, bamdir, bamfile)
                    for blockdir in blocks
                ]
                # printtime("DEBUG: block_bam_list: %s" % block_bam_list)
                block_bam_list = [
                    block_bam_filename for block_bam_filename in block_bam_list
                    if os.path.exists(block_bam_filename)
                ]
                # printtime("DEBUG: block_bam_list: %s" % block_bam_list)
                printtime("blocks with reads: %s" % len(block_bam_list))

                bamFile = dataset['file_prefix'] + ".bam"
                composite_bam_filepath = dataset['file_prefix'] + ".bam"

                # Produce the merged header (<prefix>.bam.header.sam) used by
                # samtools merge -h below.
                blockprocessing.extract_and_merge_bam_header(
                    block_bam_list, composite_bam_filepath)

                # Usage: samtools merge [-nr] [-h inh.sam] <out.bam> <in1.bam> <in2.bam> [...]
                cmd = 'samtools merge -l1 [email protected]'
                if do_ionstats:
                    # Write to stdout so the stream can be teed into ionstats.
                    cmd += ' - '
                else:
                    cmd += ' %s' % (composite_bam_filepath)
                for bamfile in block_bam_list:
                    cmd += ' %s' % bamfile
                cmd += ' -h %s.header.sam' % composite_bam_filepath

                if do_ionstats:
                    # ionstats reads the merged BAM from the pipe via tee.
                    bam_filenames = ["/dev/stdin"]
                    ionstats_alignment_filename = "%s.ionstats_alignment.json" % bamBase  # os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix']+'.ionstats_alignment.json')
                    ionstats_alignment_h5_filename = "%s.ionstats_error_summary.h5" % bamBase  # os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix']+'.ionstats_error_summary.h5')
                    ionstats_cmd = ionstats.generate_ionstats_alignment_cmd(
                        ionstatsArgs, bam_filenames,
                        ionstats_alignment_filename,
                        ionstats_alignment_h5_filename,
                        basecaller_meta_information, library_key, graph_max_x)
                    # Requires bash process substitution, hence /bin/bash below.
                    cmd += " | tee >(%s)" % ionstats_cmd

                if do_mark_duplicates:
                    # Wrap the whole merge pipeline as BamDuplicates' input.
                    json_name = 'BamDuplicates.%s.json' % bamBase if bamBase != 'rawlib' else 'BamDuplicates.json'
                    cmd = "BamDuplicates -i <(%s) -o %s -j %s" % (cmd, bamFile,
                                                                  json_name)
                else:
                    cmd += " > %s.bam" % bamBase

                printtime("DEBUG: Calling '%s':" % cmd)
                # bash (not sh) is required for the >(...) / <(...) syntax.
                ret = subprocess.Popen(['/bin/bash', '-c', cmd]).wait()
                if ret != 0:
                    printtime("ERROR: merging failed, return code: %d" % ret)
                    raise RuntimeError('exit code: %d' % ret)

                # TODO: piping into samtools index or create index in sort process ?
                if do_indexing and do_sorting:
                    cmd = "samtools index " + bamFile
                    printtime("DEBUG: Calling '%s':" % cmd)
                    subprocess.call(cmd, shell=True)

            else:
                printtime(
                    "DEBUG: MERGED BLOCK PROCESSING ----------- prefix: %20s ----------- reference: %20s ---------- reads: %10s ----------"
                    % (dataset['file_prefix'], reference, dataset["read_count"]))

                # TODO: try a python multiprocessing pool
                # Unaligned datasets (no reference) land in BASECALLER_RESULTS.
                align(blocks,
                      os.path.join(BASECALLER_RESULTS,
                                   dataset['basecaller_bam']),
                      alignmentArgs,
                      ionstatsArgs,
                      reference,
                      basecaller_meta_information,
                      library_key,
                      graph_max_x,
                      do_realign,
                      do_ionstats,
                      do_sorting,
                      do_mark_duplicates,
                      do_indexing,
                      output_dir=ALIGNMENT_RESULTS if reference else BASECALLER_RESULTS,
                      output_basename=dataset['file_prefix'])
        except:
            # Best-effort: an alignment failure for one dataset must not stop
            # processing of the remaining datasets.
            traceback.print_exc()

        # Record where this dataset's ionstats JSON will be, bucketed by
        # aligned/unaligned and filtered/unfiltered.
        if reference:
            if filtered:
                ionstats_alignment_filtered_file_list.append(
                    os.path.join(
                        ALIGNMENT_RESULTS,
                        dataset['file_prefix'] + '.ionstats_alignment.json'))
            else:
                ionstats_alignment_file_list.append(
                    os.path.join(
                        ALIGNMENT_RESULTS,
                        dataset['file_prefix'] + '.ionstats_alignment.json'))
        else:
            if filtered:
                ionstats_basecaller_filtered_file_list.append(
                    os.path.join(
                        BASECALLER_RESULTS,
                        dataset['file_prefix'] + '.ionstats_basecaller.json'))
            else:
                ionstats_basecaller_file_list.append(
                    os.path.join(
                        BASECALLER_RESULTS,
                        dataset['file_prefix'] + '.ionstats_basecaller.json'))

    if do_ionstats:
        # Merge ionstats files from individual (barcoded) datasets
        if len(ionstats_alignment_file_list) > 0:
            ionstats.reduce_stats(
                ionstats_alignment_file_list,
                os.path.join(ALIGNMENT_RESULTS, 'ionstats_alignment.json'))
        else:
            # barcode classification filtered all barcodes or no reads available
            # TODO: ionstats needs to produce initial json file
            try:
                # Build a minimal empty BAM so ionstats has something to read.
                #cmd = "echo $'@HD\tVN:1.5\tSO:coordinate\[email protected]\tSN:ref\tLN:4\[email protected]\tID:filename\tSM:filename' | samtools view -F4 -S -b - > empty_dummy.bam"
                cmd = "echo '@HD\tVN:1.5\tSO:coordinate\[email protected]\tSN:ref\tLN:4\[email protected]\tID:filename\tSM:filename' | samtools view -F4 -S -b - > empty_dummy.bam"
                printtime("DEBUG: Calling '%s':" % cmd)
                ret = subprocess.call(cmd, shell=True)
                if ret != 0:
                    printtime(
                        "ERROR: empty bam file generation failed, return code: %d"
                        % ret)
                    raise RuntimeError('exit code: %d' % ret)
                ionstats.generate_ionstats_alignment(
                    ionstatsArgs, ['empty_dummy.bam'],
                    os.path.join(ALIGNMENT_RESULTS, 'ionstats_alignment.json'),
                    os.path.join(ALIGNMENT_RESULTS,
                                 'ionstats_error_summary.h5'),
                    basecaller_meta_information, library_key, graph_max_x)
            except:
                raise

        if len(ionstats_basecaller_file_list) > 0:
            ionstats.reduce_stats(
                ionstats_basecaller_file_list,
                os.path.join(BASECALLER_RESULTS,
                             'ionstats_tmp_basecaller.json'))
        else:
            # barcode classification filtered all barcodes or no reads available
            # TODO: ionstats needs to produce initial json file
            try:
                # Same empty-BAM fallback as above, for basecaller stats.
                #cmd = "echo $'@HD\tVN:1.5\tSO:coordinate\[email protected]\tSN:ref\tLN:4\[email protected]\tID:filename\tSM:filename' | samtools view -F4 -S -b - > empty_dummy.bam"
                cmd = "echo '@HD\tVN:1.5\tSO:coordinate\[email protected]\tSN:ref\tLN:4\[email protected]\tID:filename\tSM:filename' | samtools view -F4 -S -b - > empty_dummy.bam"
                printtime("DEBUG: Calling '%s':" % cmd)
                ret = subprocess.call(cmd, shell=True)
                if ret != 0:
                    printtime(
                        "ERROR: empty bam file generation failed, return code: %d"
                        % ret)
                    raise RuntimeError('exit code: %d' % ret)
                ionstats.generate_ionstats_basecaller(
                    ['empty_dummy.bam'],
                    os.path.join(BASECALLER_RESULTS,
                                 'ionstats_tmp_basecaller.json'),
                    library_key, graph_max_x)
            except:
                raise

        # Combine alignment + basecaller stats in both orders; order matters
        # to reduce_stats, producing two differently-weighted outputs.
        ionstatslist = []
        a = os.path.join(ALIGNMENT_RESULTS, 'ionstats_alignment.json')
        b = os.path.join(BASECALLER_RESULTS, 'ionstats_tmp_basecaller.json')
        if os.path.exists(a):
            ionstatslist.append(a)
        if os.path.exists(b):
            ionstatslist.append(b)
        if len(ionstatslist) > 0:
            ionstats.reduce_stats(
                ionstatslist,
                os.path.join(BASECALLER_RESULTS,
                             'ionstats_basecaller_with_aligninfos.json'))
            ionstats.reduce_stats(
                reversed(ionstatslist),
                os.path.join(BASECALLER_RESULTS, 'ionstats_basecaller.json'))
#        if len(ionstats_alignment_h5_file_list) > 0:
#            ionstats.reduce_stats_h5(ionstats_alignment_h5_file_list,os.path.join(ALIGNMENT_RESULTS,'ionstats_error_summary.h5'))

    printtime("**** Alignment completed ****")
def process_datasets(blocks, alignmentArgs, ionstatsArgs, BASECALLER_RESULTS,
                     basecaller_meta_information, library_key, graph_max_x,
                     basecaller_datasets, ALIGNMENT_RESULTS, do_realign,
                     do_ionstats, do_mark_duplicates, do_indexing, barcodeInfo):
    """Align every basecaller dataset, then merge per-dataset ionstats files.

    NOTE(review): this is a DUPLICATE definition of ``process_datasets`` - it
    shadows the earlier definition in this module at import time.  The two
    bodies appear token-identical apart from formatting; confirm which copy is
    intended and remove the other.

    For each dataset in basecaller_datasets["datasets"]:
      * datasets with a reference, more than one block and more than 20M reads
        are aligned block by block and the per-block BAMs are merged with
        ``samtools merge`` (optionally teeing into ionstats and/or piping
        through BamDuplicates);
      * all other datasets are aligned in one ``align()`` call over all blocks.

    Afterwards, when ``do_ionstats`` is set, per-dataset ionstats JSON files
    are reduced into composite ``ionstats_alignment.json`` and
    ``ionstats_basecaller.json`` outputs; an empty dummy BAM is generated when
    no dataset produced stats.

    NOTE(review): ``barcodeInfo`` is accepted but never forwarded to
    ``align()`` here - confirm whether that is intentional.
    NOTE(review): several command strings contain ``[email protected]``
    residue (likely mangled ``-@<threads>`` flags / ``@SQ`` header tags from an
    email-obfuscation pass) - confirm against the upstream source before use.
    """
    printtime("Attempt to align")
    printtime("DEBUG: PROCESS DATASETS blocks: '%s'" % blocks)
    do_sorting = True

    # TODO: compare with pipeline/python/ion/utils/ionstats.py
    # Collected per-dataset ionstats JSON paths, merged at the end.
    ionstats_basecaller_file_list = []
    ionstats_alignment_file_list = []
    ionstats_basecaller_filtered_file_list = []
    ionstats_alignment_filtered_file_list = []

    for dataset in basecaller_datasets["datasets"]:

        # The reference genome is taken from the dataset's first read group.
        read_group = dataset['read_groups'][0]
        reference = basecaller_datasets['read_groups'][read_group]['reference']
        #print "DEBUG: reference: %s' % reference

        # A dataset counts as filtered only if ALL of its read groups are filtered.
        filtered = True
        for rg_name in dataset["read_groups"]:
            if not basecaller_datasets["read_groups"][rg_name].get('filtered', False):
                filtered = False

        # skip non-existing bam file
        if int(dataset["read_count"]) == 0:
            continue

        try:
            # process block by block
            # Large multi-block datasets (>20M reads) are aligned one block at
            # a time, then merged; smaller ones are aligned in a single call.
            if reference and len(blocks) > 1 and int(dataset["read_count"]) > 20000000:
                printtime(
                    "DEBUG: TRADITIONAL BLOCK PROCESSING ------ prefix: %20s ----------- reference: %20s ---------- reads: %10s ----------"
                    % (dataset['file_prefix'], reference, dataset["read_count"]))

                # start alignment for each block and current barcode with reads
                # TODO: in how many blocks are reads with this barcode
                for block in blocks:
                    printtime("DEBUG: ALIGN ONLY ONE BLOCK: %s" % block)
                    # Per-block alignment: ionstats/duplicates/indexing are
                    # deferred until after the merge below.
                    align(
                        [block],
                        os.path.join(BASECALLER_RESULTS,
                                     dataset['basecaller_bam']),
                        alignmentArgs,
                        ionstatsArgs,
                        reference,
                        basecaller_meta_information,
                        library_key,
                        graph_max_x,
                        do_realign,
                        do_ionstats=False,
                        do_sorting=do_sorting,
                        do_mark_duplicates=False,
                        do_indexing=False,
                        output_dir=os.path.join(block, ALIGNMENT_RESULTS),
                        output_basename=dataset['file_prefix'])

                bamdir = '.'  # TODO , do we need this ?
                bamBase = dataset['file_prefix']
                bamfile = dataset['file_prefix'] + ".bam"
                # printtime("DEBUG: BLOCKS for BAMFILE %s: %s" % (bamfile, blocks))

                # Candidate per-block BAM paths; keep only those that exist
                # (not every block necessarily has reads for this barcode).
                block_bam_list = [os.path.join(blockdir, bamdir, bamfile) for blockdir in blocks]
                # printtime("DEBUG: block_bam_list: %s" % block_bam_list)
                block_bam_list = [block_bam_filename for block_bam_filename in block_bam_list if os.path.exists(block_bam_filename)]
                # printtime("DEBUG: block_bam_list: %s" % block_bam_list)
                printtime("blocks with reads: %s" % len(block_bam_list))

                bamFile = dataset['file_prefix'] + ".bam"
                composite_bam_filepath = dataset['file_prefix'] + ".bam"

                # Produce the merged header (<prefix>.bam.header.sam) used by
                # samtools merge -h below.
                blockprocessing.extract_and_merge_bam_header(block_bam_list, composite_bam_filepath)

                # Usage: samtools merge [-nr] [-h inh.sam] <out.bam> <in1.bam> <in2.bam> [...]
                cmd = 'samtools merge -l1 [email protected]'
                if do_ionstats:
                    # Write to stdout so the stream can be teed into ionstats.
                    cmd += ' - '
                else:
                    cmd += ' %s' % (composite_bam_filepath)
                for bamfile in block_bam_list:
                    cmd += ' %s' % bamfile
                cmd += ' -h %s.header.sam' % composite_bam_filepath

                if do_ionstats:
                    # ionstats reads the merged BAM from the pipe via tee.
                    bam_filenames = ["/dev/stdin"]
                    ionstats_alignment_filename = "%s.ionstats_alignment.json" % bamBase  # os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix']+'.ionstats_alignment.json')
                    ionstats_alignment_h5_filename = "%s.ionstats_error_summary.h5" % bamBase  # os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix']+'.ionstats_error_summary.h5')
                    ionstats_cmd = ionstats.generate_ionstats_alignment_cmd(
                        ionstatsArgs,
                        bam_filenames,
                        ionstats_alignment_filename,
                        ionstats_alignment_h5_filename,
                        basecaller_meta_information,
                        library_key,
                        graph_max_x)
                    # Requires bash process substitution, hence /bin/bash below.
                    cmd += " | tee >(%s)" % ionstats_cmd

                if do_mark_duplicates:
                    # Wrap the whole merge pipeline as BamDuplicates' input.
                    json_name = 'BamDuplicates.%s.json' % bamBase if bamBase != 'rawlib' else 'BamDuplicates.json'
                    cmd = "BamDuplicates -i <(%s) -o %s -j %s" % (cmd, bamFile, json_name)
                else:
                    cmd += " > %s.bam" % bamBase

                printtime("DEBUG: Calling '%s':" % cmd)
                # bash (not sh) is required for the >(...) / <(...) syntax.
                ret = subprocess.Popen(['/bin/bash', '-c', cmd]).wait()
                if ret != 0:
                    printtime("ERROR: merging failed, return code: %d" % ret)
                    raise RuntimeError('exit code: %d' % ret)

                # TODO: piping into samtools index or create index in sort process ?
                if do_indexing and do_sorting:
                    cmd = "samtools index " + bamFile
                    printtime("DEBUG: Calling '%s':" % cmd)
                    subprocess.call(cmd, shell=True)

            else:
                printtime(
                    "DEBUG: MERGED BLOCK PROCESSING ----------- prefix: %20s ----------- reference: %20s ---------- reads: %10s ----------"
                    % (dataset['file_prefix'], reference, dataset["read_count"]))

                # TODO: try a python multiprocessing pool
                # Unaligned datasets (no reference) land in BASECALLER_RESULTS.
                align(
                    blocks,
                    os.path.join(BASECALLER_RESULTS,
                                 dataset['basecaller_bam']),
                    alignmentArgs,
                    ionstatsArgs,
                    reference,
                    basecaller_meta_information,
                    library_key,
                    graph_max_x,
                    do_realign,
                    do_ionstats,
                    do_sorting,
                    do_mark_duplicates,
                    do_indexing,
                    output_dir=ALIGNMENT_RESULTS if reference else BASECALLER_RESULTS,
                    output_basename=dataset['file_prefix'])
        except:
            # Best-effort: an alignment failure for one dataset must not stop
            # processing of the remaining datasets.
            traceback.print_exc()

        # Record where this dataset's ionstats JSON will be, bucketed by
        # aligned/unaligned and filtered/unfiltered.
        if reference:
            if filtered:
                ionstats_alignment_filtered_file_list.append(os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.ionstats_alignment.json'))
            else:
                ionstats_alignment_file_list.append(os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.ionstats_alignment.json'))
        else:
            if filtered:
                ionstats_basecaller_filtered_file_list.append(os.path.join(BASECALLER_RESULTS, dataset['file_prefix'] + '.ionstats_basecaller.json'))
            else:
                ionstats_basecaller_file_list.append(os.path.join(BASECALLER_RESULTS, dataset['file_prefix'] + '.ionstats_basecaller.json'))

    if do_ionstats:
        # Merge ionstats files from individual (barcoded) datasets
        if len(ionstats_alignment_file_list) > 0:
            ionstats.reduce_stats(ionstats_alignment_file_list, os.path.join(ALIGNMENT_RESULTS, 'ionstats_alignment.json'))
        else:
            # barcode classification filtered all barcodes or no reads available
            # TODO: ionstats needs to produce initial json file
            try:
                # Build a minimal empty BAM so ionstats has something to read.
                #cmd = "echo $'@HD\tVN:1.5\tSO:coordinate\[email protected]\tSN:ref\tLN:4\[email protected]\tID:filename\tSM:filename' | samtools view -F4 -S -b - > empty_dummy.bam"
                cmd = "echo '@HD\tVN:1.5\tSO:coordinate\[email protected]\tSN:ref\tLN:4\[email protected]\tID:filename\tSM:filename' | samtools view -F4 -S -b - > empty_dummy.bam"
                printtime("DEBUG: Calling '%s':" % cmd)
                ret = subprocess.call(cmd, shell=True)
                if ret != 0:
                    printtime("ERROR: empty bam file generation failed, return code: %d" % ret)
                    raise RuntimeError('exit code: %d' % ret)
                ionstats.generate_ionstats_alignment(
                    ionstatsArgs,
                    ['empty_dummy.bam'],
                    os.path.join(ALIGNMENT_RESULTS, 'ionstats_alignment.json'),
                    os.path.join(ALIGNMENT_RESULTS, 'ionstats_error_summary.h5'),
                    basecaller_meta_information,
                    library_key,
                    graph_max_x)
            except:
                raise

        if len(ionstats_basecaller_file_list) > 0:
            ionstats.reduce_stats(ionstats_basecaller_file_list, os.path.join(BASECALLER_RESULTS, 'ionstats_tmp_basecaller.json'))
        else:
            # barcode classification filtered all barcodes or no reads available
            # TODO: ionstats needs to produce initial json file
            try:
                # Same empty-BAM fallback as above, for basecaller stats.
                #cmd = "echo $'@HD\tVN:1.5\tSO:coordinate\[email protected]\tSN:ref\tLN:4\[email protected]\tID:filename\tSM:filename' | samtools view -F4 -S -b - > empty_dummy.bam"
                cmd = "echo '@HD\tVN:1.5\tSO:coordinate\[email protected]\tSN:ref\tLN:4\[email protected]\tID:filename\tSM:filename' | samtools view -F4 -S -b - > empty_dummy.bam"
                printtime("DEBUG: Calling '%s':" % cmd)
                ret = subprocess.call(cmd, shell=True)
                if ret != 0:
                    printtime("ERROR: empty bam file generation failed, return code: %d" % ret)
                    raise RuntimeError('exit code: %d' % ret)
                ionstats.generate_ionstats_basecaller(
                    ['empty_dummy.bam'],
                    os.path.join(BASECALLER_RESULTS, 'ionstats_tmp_basecaller.json'),
                    library_key,
                    graph_max_x)
            except:
                raise

        # Combine alignment + basecaller stats in both orders; order matters
        # to reduce_stats, producing two differently-weighted outputs.
        ionstatslist = []
        a = os.path.join(ALIGNMENT_RESULTS, 'ionstats_alignment.json')
        b = os.path.join(BASECALLER_RESULTS, 'ionstats_tmp_basecaller.json')
        if os.path.exists(a):
            ionstatslist.append(a)
        if os.path.exists(b):
            ionstatslist.append(b)
        if len(ionstatslist) > 0:
            ionstats.reduce_stats(
                ionstatslist,
                os.path.join(BASECALLER_RESULTS, 'ionstats_basecaller_with_aligninfos.json'))
            ionstats.reduce_stats(
                reversed(ionstatslist),
                os.path.join(BASECALLER_RESULTS, 'ionstats_basecaller.json'))
#        if len(ionstats_alignment_h5_file_list) > 0:
#            ionstats.reduce_stats_h5(ionstats_alignment_h5_file_list,os.path.join(ALIGNMENT_RESULTS,'ionstats_error_summary.h5'))

    printtime("**** Alignment completed ****")