def merge_barcoded_alignment_bams(ALIGNMENT_RESULTS, basecaller_datasets, method):

    try:
        composite_bam_filename = os.path.join(ALIGNMENT_RESULTS, 'rawlib.bam')

        bam_file_list = []
        for dataset in basecaller_datasets["datasets"]:
            bam_name = os.path.join(ALIGNMENT_RESULTS, os.path.basename(dataset['file_prefix']) + '.bam')
            if os.path.exists(bam_name):
                bam_file_list.append(bam_name)
            else:
                printtime("WARNING: exclude %s from merging into %s" % (bam_name, composite_bam_filename))

        composite_bai_filename = composite_bam_filename + '.bai'
        mark_duplicates = False
        blockprocessing.merge_bam_files(bam_file_list, composite_bam_filename, composite_bai_filename, mark_duplicates, method)
    except:
        traceback.print_exc()
        printtime("ERROR: Generate merged %s on barcoded run failed" % composite_bam_filename)

    printtime("Finished barcode merging of %s" % ALIGNMENT_RESULTS)
def merge_basecaller_bam(dirs, BASECALLER_RESULTS):

    datasets_basecaller = {}
    try:
        f = open(os.path.join(BASECALLER_RESULTS, "datasets_basecaller.json"), 'r')
        datasets_basecaller = json.load(f)
        f.close()
    except:
        printtime("ERROR: problem parsing %s" % os.path.join(BASECALLER_RESULTS, "datasets_basecaller.json"))
        traceback.print_exc()
        return

    # Iterate over datasets. Could be one for non-barcoded runs or multiple for barcoded runs
    for dataset in datasets_basecaller['datasets']:
        if 'basecaller_bam' not in dataset:
            continue

        ###############################################
        # Merge Per-barcode Unmapped BAMs             #
        ###############################################
        try:
            block_bam_list = [os.path.join(dir, BASECALLER_RESULTS, dataset['basecaller_bam']) for dir in dirs]
            block_bam_list = [block_bam_filename for block_bam_filename in block_bam_list if os.path.exists(block_bam_filename)]
            composite_bam_filename = os.path.join(BASECALLER_RESULTS, dataset['basecaller_bam'])
            if block_bam_list:
                blockprocessing.merge_bam_files(block_bam_list, composite_bam_filename, composite_bam_filename + '.bai', False)
        except:
            printtime("ERROR: merging %s unsuccessful" % dataset['basecaller_bam'])

    ## Note! on barcoded runs, barcode files are NOT subsequently merged into one multi-barcode BAM.

    printtime("Finished merging basecaller BAM files")
def merge_barcoded_basecaller_bams(BASECALLER_RESULTS):

    datasets_basecaller_path = os.path.join(BASECALLER_RESULTS, "datasets_basecaller.json")

    if not os.path.exists(datasets_basecaller_path):
        printtime("ERROR: %s does not exist" % datasets_basecaller_path)
        raise Exception("ERROR: %s does not exist" % datasets_basecaller_path)

    datasets_basecaller = {}
    try:
        f = open(datasets_basecaller_path, 'r')
        datasets_basecaller = json.load(f)
        f.close()
    except:
        printtime("ERROR: problem parsing %s" % datasets_basecaller_path)
        raise Exception("ERROR: problem parsing %s" % datasets_basecaller_path)

    try:
        composite_bam_filename = os.path.join(BASECALLER_RESULTS, 'rawlib.basecaller.bam')
        if not os.path.exists(composite_bam_filename):

            bam_file_list = []
            for dataset in datasets_basecaller["datasets"]:
                print os.path.join(BASECALLER_RESULTS, dataset['basecaller_bam'])
                if os.path.exists(os.path.join(BASECALLER_RESULTS, dataset['basecaller_bam'])):
                    bam_file_list.append(os.path.join(BASECALLER_RESULTS, dataset['basecaller_bam']))

            blockprocessing.merge_bam_files(bam_file_list, composite_bam_filename, composite_bam_filename + '.bai', False)
    except:
        traceback.print_exc()
        printtime("ERROR: Generate merged rawlib.basecaller.bam on barcoded runs failed")

    printtime("Finished basecaller barcode merging")
def merge_alignment_bigdata(dirs, BASECALLER_RESULTS, ALIGNMENT_RESULTS, mark_duplicates):

    datasets_json = {}
    try:
        f = open(os.path.join(BASECALLER_RESULTS, "datasets_basecaller.json"), 'r')
        datasets_json = json.load(f)
        f.close()
    except:
        printtime("ERROR: problem parsing %s" % os.path.join(BASECALLER_RESULTS, "datasets_basecaller.json"))
        traceback.print_exc()
        return

    for dataset in datasets_json['datasets']:

        # Merge BAMs
        try:
            block_bam_list = [os.path.join(dir, ALIGNMENT_RESULTS, dataset['file_prefix'] + '.bam') for dir in dirs]
            block_bam_list = [block_bam_filename for block_bam_filename in block_bam_list if os.path.exists(block_bam_filename)]
            composite_bam_filename = os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.bam')
            if block_bam_list:
                blockprocessing.merge_bam_files(block_bam_list, composite_bam_filename, composite_bam_filename + '.bai', mark_duplicates)
        except:
            printtime("ERROR: merging %s unsuccessful" % (dataset['file_prefix'] + '.bam'))
def merge_bams(dirs, BASECALLER_RESULTS, ALIGNMENT_RESULTS, basecaller_datasets, mark_duplicates):

    for dataset in basecaller_datasets['datasets']:

        try:
            read_group = dataset['read_groups'][0]
            reference = basecaller_datasets['read_groups'][read_group]['reference']

            filtered = True
            for rg_name in dataset["read_groups"]:
                if not basecaller_datasets["read_groups"][rg_name].get('filtered', False):
                    filtered = False

            if reference and not filtered:
                bamdir = ALIGNMENT_RESULTS
                bamfile = dataset['file_prefix'] + '.bam'
            else:
                bamdir = BASECALLER_RESULTS
                bamfile = dataset['basecaller_bam']

            block_bam_list = [os.path.join(blockdir, bamdir, bamfile) for blockdir in dirs]
            block_bam_list = [block_bam_filename for block_bam_filename in block_bam_list if os.path.exists(block_bam_filename)]
            composite_bam_filepath = os.path.join(bamdir, bamfile)

            if block_bam_list:
                if reference and not filtered:
                    composite_bai_filepath = composite_bam_filepath + '.bai'
                    blockprocessing.merge_bam_files(block_bam_list, composite_bam_filepath, composite_bai_filepath, mark_duplicates)
                else:
                    composite_bai_filepath = ""
                    mark_duplicates = False
                    blockprocessing.merge_bam_files(block_bam_list, composite_bam_filepath, composite_bai_filepath, mark_duplicates, method='samtools')
        except:
            print traceback.format_exc()
            printtime("ERROR: merging %s unsuccessful" % bamfile)
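# Decision sketch for merge_bams above, summarizing the branch logic as written
# (no new behavior, just the two outcomes side by side):
#
#   read groups have a reference and are not all filtered
#       -> merge per-block mapped BAMs into ALIGNMENT_RESULTS/<file_prefix>.bam,
#          write a .bai index, honor the caller's mark_duplicates flag
#   otherwise
#       -> merge per-block unmapped BAMs into BASECALLER_RESULTS/<basecaller_bam>,
#          no index, duplicate marking forced off, 'samtools' merge method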
def merge_barcoded_basecaller_bams(BASECALLER_RESULTS, basecaller_datasets, method):

    try:
        composite_bam_filename = os.path.join(BASECALLER_RESULTS, 'rawlib.basecaller.bam')
        if not os.path.exists(composite_bam_filename):  # TODO

            bam_file_list = []
            for dataset in basecaller_datasets["datasets"]:
                print os.path.join(BASECALLER_RESULTS, dataset['basecaller_bam'])
                if os.path.exists(os.path.join(BASECALLER_RESULTS, dataset['basecaller_bam'])):
                    bam_file_list.append(os.path.join(BASECALLER_RESULTS, dataset['basecaller_bam']))

            composite_bai_filepath = ""
            mark_duplicates = False
            blockprocessing.merge_bam_files(bam_file_list, composite_bam_filename, composite_bai_filepath, mark_duplicates, method)
    except:
        traceback.print_exc()
        printtime("ERROR: Generate merged %s on barcoded run failed" % composite_bam_filename)

    printtime("Finished basecaller barcode merging")
def merge_bams(dirs, BASECALLER_RESULTS, basecaller_datasets, method):

    for dataset in basecaller_datasets['datasets']:

        try:
            bamdir = BASECALLER_RESULTS
            bamfile = dataset['basecaller_bam']

            block_bam_list = [os.path.join(blockdir, bamdir, bamfile) for blockdir in dirs]
            block_bam_list = [block_bam_filename for block_bam_filename in block_bam_list if os.path.exists(block_bam_filename)]
            composite_bam_filepath = os.path.join(bamdir, bamfile)

            if block_bam_list:
                composite_bai_filepath = ""
                mark_duplicates = False
                blockprocessing.merge_bam_files(block_bam_list, composite_bam_filepath, composite_bai_filepath, mark_duplicates, method)
        except:
            traceback.print_exc()
            printtime("ERROR: merging %s unsuccessful" % bamfile)

    printtime("Finished merging basecaller BAM files")
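# Minimal usage sketch for the unmapped-BAM merge helpers above. The block
# directory names are hypothetical; 'samtools' is one of the merge methods used
# elsewhere in this module.
#
#   basecaller_datasets = blockprocessing.get_datasets_basecaller(BASECALLER_RESULTS)
#   block_dirs = ['block_X0_Y0', 'block_X14168_Y0']   # hypothetical per-block result folders
#   merge_bams(block_dirs, BASECALLER_RESULTS, basecaller_datasets, method='samtools')
#   merge_barcoded_basecaller_bams(BASECALLER_RESULTS, basecaller_datasets, method='samtools')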
def merge_alignment_bigdata(dirs, BASECALLER_RESULTS, ALIGNMENT_RESULTS, mark_duplicates):

    datasets_json = {}
    try:
        f = open(os.path.join(BASECALLER_RESULTS, "datasets_basecaller.json"), 'r')
        datasets_json = json.load(f)
        f.close()
    except:
        printtime("ERROR: problem parsing %s" % os.path.join(BASECALLER_RESULTS, "datasets_basecaller.json"))
        traceback.print_exc()
        return

    for dataset in datasets_json['datasets']:

        # Merge BAMs
        try:
            block_bam_list = [os.path.join(dir, ALIGNMENT_RESULTS, dataset['file_prefix'] + '.bam') for dir in dirs]
            block_bam_list = [block_bam_filename for block_bam_filename in block_bam_list if os.path.exists(block_bam_filename)]
            composite_bam_filename = os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.bam')
            if block_bam_list:
                blockprocessing.merge_bam_files(block_bam_list, composite_bam_filename, composite_bam_filename + '.bai', mark_duplicates)
        except:
            printtime("ERROR: merging %s unsuccessful" % (dataset['file_prefix'] + '.bam'))

        printtime("Creating legacy name links")
        if dataset.has_key('legacy_prefix'):
            link_src = [
                os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.bam'),
                os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.bam.bai')]
            link_dst = [
                os.path.join(ALIGNMENT_RESULTS, os.path.basename(dataset['legacy_prefix']) + '.bam'),
                os.path.join(ALIGNMENT_RESULTS, os.path.basename(dataset['legacy_prefix']) + '.bam.bai')]
            for (src, dst) in zip(link_src, link_dst):
                try:
                    os.symlink(os.path.relpath(src, os.path.dirname(dst)), dst)
                except:
                    printtime("ERROR: Unable to symlink '%s' to '%s'" % (src, dst))
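# Relative-symlink sketch (hypothetical file names) for the legacy-link pattern
# above: the link target is expressed relative to the link's own directory, so
# the links keep working if the whole results tree is moved.
#
#   src = os.path.join(ALIGNMENT_RESULTS, 'IonXpress_001_rawlib.bam')   # hypothetical file_prefix
#   dst = os.path.join(ALIGNMENT_RESULTS, 'bc1_legacy_rawlib.bam')      # hypothetical legacy_prefix
#   os.symlink(os.path.relpath(src, os.path.dirname(dst)), dst)
#   # creates bc1_legacy_rawlib.bam -> IonXpress_001_rawlib.bam (a relative link)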
def alignment_post_processing(BASECALLER_RESULTS, ALIGNMENT_RESULTS, flows, mark_duplicates, force_alignstats):

    datasets_basecaller = {}
    try:
        f = open(os.path.join(BASECALLER_RESULTS, "datasets_basecaller.json"), 'r')
        datasets_basecaller = json.load(f)
        f.close()
    except:
        printtime("ERROR: problem parsing %s" % os.path.join(BASECALLER_RESULTS, "datasets_basecaller.json"))
        traceback.print_exc()
        return

    try:
        graph_max_x = int(50 * math.ceil(0.014 * int(flows)))
    except:
        graph_max_x = 400

    input_prefix_list = []
    for dataset in datasets_basecaller["datasets"]:
        if not os.path.exists(os.path.join(BASECALLER_RESULTS, dataset['basecaller_bam'])):
            continue

        printtime("Barcode processing, rename")
        src = os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.alignment.summary')
        if os.path.exists(src):
            input_prefix_list.append(os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.'))

            # terrible hack to make aggregate_alignment happy
            X_name = 'nomatch'
            read_group = dataset['read_groups'][0]
            if 'barcode_name' in datasets_basecaller['read_groups'][read_group]:
                X_name = datasets_basecaller['read_groups'][read_group]['barcode_name']
            dst = os.path.join(ALIGNMENT_RESULTS, 'alignment_%s.summary' % X_name)
            try:
                os.symlink(os.path.relpath(src, os.path.dirname(dst)), dst)
            except:
                printtime("ERROR: Unable to symlink '%s' to '%s'" % (src, dst))

    # Special legacy post-processing.
    # Generate merged rawlib.bam on barcoded runs

    composite_bam_filename = os.path.join(ALIGNMENT_RESULTS, 'rawlib.bam')
    if not os.path.exists(composite_bam_filename):

        bam_file_list = []
        for dataset in datasets_basecaller["datasets"]:
            bam_name = os.path.join(ALIGNMENT_RESULTS, os.path.basename(dataset['file_prefix']) + '.bam')
            if os.path.exists(bam_name):
                bam_file_list.append(bam_name)

        blockprocessing.merge_bam_files(bam_file_list, composite_bam_filename, composite_bam_filename + '.bai', mark_duplicates)
        force_alignstats = True

    if force_alignstats:
        ## Generate data for error plot for barcoded run from composite bam
        printtime("Call alignStats to generate raw accuracy")
        try:
            cmd = "alignStats"
            cmd += " -n 12"
            cmd += " --alignSummaryFile alignStats_err.txt"
            cmd += " --alignSummaryJsonFile alignStats_err.json"
            cmd += " --alignSummaryMinLen 1"
            #cmd += " --alignSummaryMaxLen %s" % str(int(graph_max_x))
            cmd += " --alignSummaryMaxLen %s" % str(int(400))
            cmd += " --alignSummaryLenStep 1"
            cmd += " --alignSummaryMaxErr 10"
            cmd += " --infile %s" % composite_bam_filename
            cmd = cmd + " --outputDir %s" % ALIGNMENT_RESULTS
            printtime("DEBUG: Calling '%s'" % cmd)
            os.system(cmd)
        except:
            printtime("alignStats failed")

    mergeAlignStatsResults(input_prefix_list, ALIGNMENT_RESULTS + "/")

    try:
        base_error_plot.generate_base_error_plot(
            os.path.join(ALIGNMENT_RESULTS, 'alignStats_err.json'),
            os.path.join(ALIGNMENT_RESULTS, 'base_error_plot.png'),
            int(graph_max_x))
        ionstats_plots.alignment_rate_plot(
            os.path.join(ALIGNMENT_RESULTS, 'alignStats_err.json'),
            os.path.join(BASECALLER_RESULTS, 'ionstats_basecaller.json'),
            os.path.join(ALIGNMENT_RESULTS, 'alignment_rate_plot.png'),
            int(graph_max_x))

        # Create aligned histogram plot
        # Create AQ20 plot
        printtime("Base error plot has been created successfully")
    except:
        printtime("ERROR: Failed to generate base error plot")
        traceback.print_exc()

    # Generate alignment_barcode_summary.csv
    barcodelist_path = 'barcodeList.txt'
    if not os.path.exists(barcodelist_path):
        barcodelist_path = '../barcodeList.txt'
    if not os.path.exists(barcodelist_path):
        barcodelist_path = '../../barcodeList.txt'
    if not os.path.exists(barcodelist_path):
        barcodelist_path = '../../../barcodeList.txt'
    if not os.path.exists(barcodelist_path):
        barcodelist_path = '../../../../barcodeList.txt'
    if os.path.exists(barcodelist_path):
        printtime("Barcode processing, aggregate")
        aggregate_alignment("./", barcodelist_path)

    # These graphs are likely obsolete
    makeAlignGraphs()
parser.add_argument('-i', '--add-file', dest='files', action='append', default=[], help="list of files to process")
parser.add_argument('-m', '--merge-bams', dest='merge_out', action='store', default="", help='merge bam files')
parser.add_argument('-d', '--mark-duplicates', dest='duplicates', action='store_true', default=False, help='mark duplicates')
parser.add_argument('-a', '--align-stats', dest='align_stats', action='store', default="", help='generate alignment stats')
parser.add_argument('-g', '--genomeinfo', dest='genomeinfo', action='store', default="", help='genome info file for alignment stats')
parser.add_argument('-p', '--merge-plots', dest='merge_plots', action='store_true', default="", help='generate report plots')
parser.add_argument('-z', '--zip', dest='zip', action='store', default="", help='zip input files')

args = parser.parse_args()

if args.merge_out and len(args.files) > 1:
    # Merge BAM files
    outputBAM = args.merge_out
    print "Merging bam files to %s, mark duplicates is %s" % (outputBAM, args.duplicates)
    merge_bam_files(args.files, outputBAM, outputBAM.replace('.bam', '.bam.bai'), args.duplicates)

if args.align_stats:
    # Call alignStats on merged bam file
    inputBAM = args.align_stats
    print "Running alignStats on %s" % inputBAM
    cmd = "alignStats"
    if '_rawlib.bam' in inputBAM:
        bcid = inputBAM.split('_rawlib.bam')[0]
        cmd += " -o %s" % bcid
        # make alignment_BC.summary links to BC.alignment.summary output of alignStats
        os.symlink('%s.alignment.summary' % bcid, 'alignment_%s.summary' % bcid)
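# Example invocation of this helper (the script name and BAM paths are
# hypothetical; the options follow the argparse definitions above):
#
#   python merge_tool.py -i block1/rawlib.bam -i block2/rawlib.bam -m rawlib.bam -d
#
# This merges the two block BAMs into rawlib.bam (index rawlib.bam.bai) with
# duplicate marking enabled; adding -a rawlib.bam would then run alignStats on
# the merged file.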
cmd = "calibrate --hpmodelMerge" printtime("DEBUG: Calling '%s':" % cmd) ret = subprocess.call(cmd,shell=True) except: traceback.print_exc() printtime("ERROR: Merge Basecaller Recalibration Results failed") try: printtime("INFO: merging rawtf.basecaller.bam") block_bam_list = [os.path.join(adir, env['BASECALLER_RESULTS'], 'rawtf.basecaller.bam') for adir in dirs] block_bam_list = [block_bam_filename for block_bam_filename in block_bam_list if os.path.exists(block_bam_filename)] composite_bam_filename = os.path.join(env['BASECALLER_RESULTS'], 'rawtf.basecaller.bam') if block_bam_list: blockprocessing.merge_bam_files(block_bam_list,composite_bam_filename,composite_bai_filepath="",mark_duplicates=False,method='picard') except: print traceback.format_exc() printtime("ERROR: merging rawtf.basecaller.bam unsuccessful") if do_unfiltered_processing: basecaller_datasets = blockprocessing.get_datasets_basecaller(env['BASECALLER_RESULTS']) try: os.mkdir(os.path.join(env['BASECALLER_RESULTS'],'unfiltered.untrimmed')) basecaller.merge_datasets_basecaller_json( dirs, os.path.join(env['BASECALLER_RESULTS'],"unfiltered.untrimmed"))
printtime("INFO: merging rawtf.basecaller.bam") block_bam_list = [ os.path.join(adir, env["BASECALLER_RESULTS"], "rawtf.basecaller.bam") for adir in dirs ] block_bam_list = [ block_bam_filename for block_bam_filename in block_bam_list if os.path.exists(block_bam_filename) ] composite_bam_filename = os.path.join(env["BASECALLER_RESULTS"], "rawtf.basecaller.bam") if block_bam_list: blockprocessing.merge_bam_files( block_bam_list, composite_bam_filename, composite_bai_filepath="", mark_duplicates=False, method="picard", ) except Exception: print(traceback.format_exc()) printtime("ERROR: merging rawtf.basecaller.bam unsuccessful") if do_unfiltered_processing: basecaller_datasets = blockprocessing.get_datasets_basecaller( env["BASECALLER_RESULTS"]) try: os.mkdir( os.path.join(env["BASECALLER_RESULTS"],
def process_datasets(
        blocks,
        alignmentArgs,
        ionstatsArgs,
        BASECALLER_RESULTS,
        basecaller_meta_information,
        library_key,
        graph_max_x,
        basecaller_datasets,
        ALIGNMENT_RESULTS,
        do_realign,
        do_ionstats,
        do_mark_duplicates,
        do_indexing,
        barcodeInfo):

    printtime("Attempt to align")

    do_sorting = True

    # compare with pipeline/python/ion/utils/ionstats.py
    ionstats_basecaller_file_list = []
    ionstats_alignment_file_list = []
    ionstats_basecaller_filtered_file_list = []
    ionstats_alignment_filtered_file_list = []

    for dataset in basecaller_datasets["datasets"]:

        read_group = dataset['read_groups'][0]
        reference = basecaller_datasets['read_groups'][read_group]['reference']
        #print "DEBUG: reference: %s" % reference

        filtered = True
        for rg_name in dataset["read_groups"]:
            if not basecaller_datasets["read_groups"][rg_name].get('filtered', False):
                filtered = False

        # skip non-existing bam file
        if int(dataset["read_count"]) == 0:
            continue

        if reference:

            # merge unmapped bam files TODO move into align
            try:
                bamdir = BASECALLER_RESULTS
                bamfile = dataset['basecaller_bam']
                block_bam_list = [os.path.join(blockdir, bamdir, bamfile) for blockdir in blocks]
                block_bam_list = [block_bam_filename for block_bam_filename in block_bam_list if os.path.exists(block_bam_filename)]
                composite_bam_filepath = os.path.join(bamdir, bamfile)
                if block_bam_list:
                    composite_bai_filepath = ""
                    mark_duplicates = False
                    method = 'samtools'
                    blockprocessing.merge_bam_files(block_bam_list, composite_bam_filepath, composite_bai_filepath, mark_duplicates, method)
            except:
                traceback.print_exc()
                printtime("ERROR: merging %s unsuccessful" % bamfile)

            try:
                align(
                    blocks,
                    alignmentArgs,
                    ionstatsArgs,
                    reference,
                    basecaller_meta_information,
                    library_key,
                    graph_max_x,
                    os.path.join(BASECALLER_RESULTS, dataset['basecaller_bam']),
                    do_realign,
                    do_ionstats,
                    do_sorting,
                    do_mark_duplicates,
                    do_indexing,
                    logfile=os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.alignmentQC_out.txt'),
                    output_dir=ALIGNMENT_RESULTS,
                    output_basename=dataset['file_prefix'])
            except:
                traceback.print_exc()

            if filtered:
                ionstats_alignment_filtered_file_list.append(os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.ionstats_alignment.json'))
            else:
                ionstats_alignment_file_list.append(os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.ionstats_alignment.json'))

            '''
            if do_indexing:
                try:
                    composite_bam_filepath = os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix']+'.bam')
                    composite_bai_filepath = os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix']+'.bam.bai')
                    blockprocessing.create_index_file(composite_bam_filepath, composite_bai_filepath)
                except:
                    traceback.print_exc()
            '''

        else:

            # merge unmapped bam file without reference
            try:
                bamdir = BASECALLER_RESULTS
                bamfile = dataset['basecaller_bam']
                block_bam_list = [os.path.join(blockdir, bamdir, bamfile) for blockdir in blocks]
                block_bam_list = [block_bam_filename for block_bam_filename in block_bam_list if os.path.exists(block_bam_filename)]
                composite_bam_filepath = os.path.join(bamdir, bamfile)
                if block_bam_list:
                    composite_bai_filepath = ""
                    mark_duplicates = False
                    method = 'samtools'
                    blockprocessing.merge_bam_files(block_bam_list, composite_bam_filepath, composite_bai_filepath, mark_duplicates, method)
            except:
                traceback.print_exc()
                printtime("ERROR: merging %s unsuccessful" % bamfile)

            if do_ionstats:
                # TODO: move ionstats basecaller into basecaller
                ionstats.generate_ionstats_basecaller(
                    [os.path.join(BASECALLER_RESULTS, dataset['basecaller_bam'])],
                    os.path.join(BASECALLER_RESULTS, dataset['file_prefix'] + '.ionstats_basecaller.json'),
                    library_key,
                    graph_max_x)

                if filtered:
                    ionstats_basecaller_filtered_file_list.append(os.path.join(BASECALLER_RESULTS, dataset['file_prefix'] + '.ionstats_basecaller.json'))
                else:
                    ionstats_basecaller_file_list.append(os.path.join(BASECALLER_RESULTS, dataset['file_prefix'] + '.ionstats_basecaller.json'))

    if do_ionstats:

        # Merge ionstats files from individual (barcoded) datasets
        if len(ionstats_alignment_file_list) > 0:
            ionstats.reduce_stats(ionstats_alignment_file_list, os.path.join(ALIGNMENT_RESULTS, 'ionstats_alignment.json'))
        else:
            # barcode classification filtered all barcodes or no reads available
            # TODO: ionstats needs to produce initial json file
            try:
                #cmd = "echo $'@HD\tVN:1.5\tSO:coordinate\n@SQ\tSN:ref\tLN:4\n@RG\tID:filename\tSM:filename' | samtools view -F4 -S -b - > empty_dummy.bam"
                cmd = "echo '@HD\tVN:1.5\tSO:coordinate\n@SQ\tSN:ref\tLN:4\n@RG\tID:filename\tSM:filename' | samtools view -F4 -S -b - > empty_dummy.bam"
                printtime("DEBUG: Calling '%s':" % cmd)
                ret = subprocess.call(cmd, shell=True)
                if ret != 0:
                    printtime("ERROR: empty bam file generation failed, return code: %d" % ret)
                    raise RuntimeError('exit code: %d' % ret)

                ionstats.generate_ionstats_alignment(
                    ionstatsArgs,
                    ['empty_dummy.bam'],
                    os.path.join(ALIGNMENT_RESULTS, 'ionstats_alignment.json'),
                    os.path.join(ALIGNMENT_RESULTS, 'ionstats_error_summary.h5'),
                    basecaller_meta_information,
                    library_key,
                    graph_max_x)
            except:
                raise

        if len(ionstats_basecaller_file_list) > 0:
            ionstats.reduce_stats(ionstats_basecaller_file_list, os.path.join(BASECALLER_RESULTS, 'ionstats_tmp_basecaller.json'))
        else:
            # barcode classification filtered all barcodes or no reads available
            # TODO: ionstats needs to produce initial json file
            try:
                #cmd = "echo $'@HD\tVN:1.5\tSO:coordinate\n@SQ\tSN:ref\tLN:4\n@RG\tID:filename\tSM:filename' | samtools view -F4 -S -b - > empty_dummy.bam"
                cmd = "echo '@HD\tVN:1.5\tSO:coordinate\n@SQ\tSN:ref\tLN:4\n@RG\tID:filename\tSM:filename' | samtools view -F4 -S -b - > empty_dummy.bam"
                printtime("DEBUG: Calling '%s':" % cmd)
                ret = subprocess.call(cmd, shell=True)
                if ret != 0:
                    printtime("ERROR: empty bam file generation failed, return code: %d" % ret)
                    raise RuntimeError('exit code: %d' % ret)

                ionstats.generate_ionstats_basecaller(
                    ['empty_dummy.bam'],
                    os.path.join(BASECALLER_RESULTS, 'ionstats_tmp_basecaller.json'),
                    library_key,
                    graph_max_x)
            except:
                raise

        ionstatslist = []
        a = os.path.join(ALIGNMENT_RESULTS, 'ionstats_alignment.json')
        b = os.path.join(BASECALLER_RESULTS, 'ionstats_tmp_basecaller.json')
        if os.path.exists(a):
            ionstatslist.append(a)
        if os.path.exists(b):
            ionstatslist.append(b)
        if len(ionstatslist) > 0:
            ionstats.reduce_stats(ionstatslist, os.path.join(BASECALLER_RESULTS, 'ionstats_basecaller_with_aligninfos.json'))
            ionstats.reduce_stats(reversed(ionstatslist), os.path.join(BASECALLER_RESULTS, 'ionstats_basecaller.json'))

#        if len(ionstats_alignment_h5_file_list) > 0:
#            ionstats.reduce_stats_h5(ionstats_alignment_h5_file_list, os.path.join(ALIGNMENT_RESULTS, 'ionstats_error_summary.h5'))

    printtime("**** Alignment completed ****")
def post_basecalling(BASECALLER_RESULTS, expName, resultsName, flows):

    datasets_basecaller_path = os.path.join(BASECALLER_RESULTS, "datasets_basecaller.json")

    if not os.path.exists(datasets_basecaller_path):
        printtime("ERROR: %s does not exist" % datasets_basecaller_path)
        open('badblock.txt', 'w').close()
        return

    datasets_basecaller = {}
    try:
        f = open(datasets_basecaller_path, 'r')
        datasets_basecaller = json.load(f)
        f.close()
    except:
        printtime("ERROR: problem parsing %s" % datasets_basecaller_path)
        traceback.print_exc()
        open('badblock.txt', 'w').close()
        return

    try:
        graph_max_x = int(50 * math.ceil(0.014 * int(flows)))
    except:
        graph_max_x = 400

    quality_file_list = []
    for dataset in datasets_basecaller["datasets"]:
        if not os.path.exists(os.path.join(BASECALLER_RESULTS, dataset['basecaller_bam'])):
            continue

        # Call ionstats utility to generate alignment-independent metrics for current unmapped BAM
        ionstats.generate_ionstats_basecaller(
            os.path.join(BASECALLER_RESULTS, dataset['basecaller_bam']),
            os.path.join(BASECALLER_RESULTS, dataset['file_prefix'] + '.ionstats_basecaller.json'),
            graph_max_x)

        # Plot read length sparkline
        ionstats_plots.read_length_sparkline(
            os.path.join(BASECALLER_RESULTS, dataset['file_prefix'] + '.ionstats_basecaller.json'),
            os.path.join(BASECALLER_RESULTS, dataset['file_prefix'] + '.sparkline.png'),
            graph_max_x)

        quality_file_list.append(os.path.join(BASECALLER_RESULTS, dataset['file_prefix'] + '.ionstats_basecaller.json'))

    # Merge ionstats_basecaller files from individual barcodes/dataset
    ionstats.reduce_stats(quality_file_list, os.path.join(BASECALLER_RESULTS, 'ionstats_basecaller.json'))

    # Generate legacy stats file: quality.summary
    ionstats.generate_legacy_basecaller_files(
        os.path.join(BASECALLER_RESULTS, 'ionstats_basecaller.json'),
        os.path.join(BASECALLER_RESULTS, ''))

    # Plot classic read length histogram
    ionstats_plots.old_read_length_histogram(
        os.path.join(BASECALLER_RESULTS, 'ionstats_basecaller.json'),
        os.path.join(BASECALLER_RESULTS, 'readLenHisto.png'),
        graph_max_x)

    # Plot new read length histogram
    ionstats_plots.read_length_histogram(
        os.path.join(BASECALLER_RESULTS, 'ionstats_basecaller.json'),
        os.path.join(BASECALLER_RESULTS, 'readLenHisto2.png'),
        graph_max_x)

    # Plot quality value histogram
    ionstats_plots.quality_histogram(
        os.path.join(BASECALLER_RESULTS, 'ionstats_basecaller.json'),
        os.path.join(BASECALLER_RESULTS, 'quality_histogram.png'))

    # Generate merged rawlib.basecaller.bam on barcoded runs, TODO, can this be removed?
    composite_bam_filename = os.path.join(BASECALLER_RESULTS, 'rawlib.basecaller.bam')
    if not os.path.exists(composite_bam_filename):

        bam_file_list = []
        for dataset in datasets_basecaller["datasets"]:
            if os.path.exists(os.path.join(BASECALLER_RESULTS, dataset['basecaller_bam'])):
                bam_file_list.append(os.path.join(BASECALLER_RESULTS, dataset['basecaller_bam']))

        blockprocessing.merge_bam_files(bam_file_list, composite_bam_filename, composite_bam_filename + '.bai', False)

    printtime("Finished basecaller post processing")
def alignment_post_processing(
        BASECALLER_RESULTS,
        ALIGNMENT_RESULTS,
        flows,
        mark_duplicates,
        force_alignstats):

    datasets_basecaller = {}
    try:
        f = open(os.path.join(BASECALLER_RESULTS, "datasets_basecaller.json"), 'r')
        datasets_basecaller = json.load(f)
        f.close()
    except:
        printtime("ERROR: problem parsing %s" % os.path.join(BASECALLER_RESULTS, "datasets_basecaller.json"))
        traceback.print_exc()
        return

    try:
        graph_max_x = int(50 * math.ceil(0.014 * int(flows)))
    except:
        graph_max_x = 400

    input_prefix_list = []
    for dataset in datasets_basecaller["datasets"]:
        if not os.path.exists(os.path.join(BASECALLER_RESULTS, dataset['basecaller_bam'])):
            continue

        printtime("Barcode processing, rename")
        src = os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.alignment.summary')
        if os.path.exists(src):
            input_prefix_list.append(os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.'))

            # terrible hack to make aggregate_alignment happy
            X_name = 'nomatch'
            read_group = dataset['read_groups'][0]
            if 'barcode_name' in datasets_basecaller['read_groups'][read_group]:
                X_name = datasets_basecaller['read_groups'][read_group]['barcode_name']
            dst = os.path.join(ALIGNMENT_RESULTS, 'alignment_%s.summary' % X_name)
            try:
                os.symlink(os.path.relpath(src, os.path.dirname(dst)), dst)
            except:
                printtime("ERROR: Unable to symlink '%s' to '%s'" % (src, dst))

        printtime("Creating legacy name links")
        if dataset.has_key('legacy_prefix'):
            link_src = [
                os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.bam'),
                os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.bam.bai')]
            link_dst = [
                os.path.join(ALIGNMENT_RESULTS, os.path.basename(dataset['legacy_prefix']) + '.bam'),
                os.path.join(ALIGNMENT_RESULTS, os.path.basename(dataset['legacy_prefix']) + '.bam.bai')]
            for (src, dst) in zip(link_src, link_dst):
                try:
                    os.symlink(os.path.relpath(src, os.path.dirname(dst)), dst)
                except:
                    printtime("ERROR: Unable to symlink '%s' to '%s'" % (src, dst))

    # Special legacy post-processing.
    # Generate merged rawlib.basecaller.bam and rawlib.sff on barcoded runs

    composite_bam_filename = os.path.join(ALIGNMENT_RESULTS, 'rawlib.bam')
    if not os.path.exists(composite_bam_filename):

        bam_file_list = []
        for dataset in datasets_basecaller["datasets"]:
            bam_name = os.path.join(ALIGNMENT_RESULTS, os.path.basename(dataset['file_prefix']) + '.bam')
            if os.path.exists(bam_name):
                bam_file_list.append(bam_name)

        blockprocessing.merge_bam_files(bam_file_list, composite_bam_filename, composite_bam_filename + '.bai', mark_duplicates)
        force_alignstats = True

    if force_alignstats:
        ## Generate data for error plot for barcoded run from composite bam
        printtime("Call alignStats to generate raw accuracy")
        try:
            cmd = "alignStats"
            cmd += " -n 12"
            cmd += " --alignSummaryFile alignStats_err.txt"
            cmd += " --alignSummaryJsonFile alignStats_err.json"
            cmd += " --alignSummaryMinLen 1"
            #cmd += " --alignSummaryMaxLen %s" % str(int(graph_max_x))
            cmd += " --alignSummaryMaxLen %s" % str(int(400))
            cmd += " --alignSummaryLenStep 1"
            cmd += " --alignSummaryMaxErr 10"
            cmd += " --infile %s" % composite_bam_filename
            cmd = cmd + " --outputDir %s" % ALIGNMENT_RESULTS
            printtime("DEBUG: Calling '%s'" % cmd)
            os.system(cmd)
        except:
            printtime("alignStats failed")

    mergeAlignStatsResults(input_prefix_list, ALIGNMENT_RESULTS + "/")

    try:
        base_error_plot.generate_base_error_plot(
            os.path.join(ALIGNMENT_RESULTS, 'alignStats_err.json'),
            os.path.join(ALIGNMENT_RESULTS, 'base_error_plot.png'),
            int(graph_max_x))
        base_error_plot.generate_alignment_rate_plot(
            os.path.join(ALIGNMENT_RESULTS, 'alignStats_err.json'),
            os.path.join(BASECALLER_RESULTS, 'readLen.txt'),
            os.path.join(ALIGNMENT_RESULTS, 'alignment_rate_plot.png'),
            int(graph_max_x))

        # Create aligned histogram plot
        # Create AQ20 plot
        printtime("Base error plot has been created successfully")
    except:
        printtime("ERROR: Failed to generate base error plot")
        traceback.print_exc()

    # Generate alignment_barcode_summary.csv
    barcodelist_path = 'barcodeList.txt'
    if not os.path.exists(barcodelist_path):
        barcodelist_path = '../barcodeList.txt'
    if not os.path.exists(barcodelist_path):
        barcodelist_path = '../../barcodeList.txt'
    if not os.path.exists(barcodelist_path):
        barcodelist_path = '../../../barcodeList.txt'
    if not os.path.exists(barcodelist_path):
        barcodelist_path = '../../../../barcodeList.txt'
    if os.path.exists(barcodelist_path):
        printtime("Barcode processing, aggregate")
        aggregate_alignment("./", barcodelist_path)

    # These graphs are likely obsolete
    makeAlignGraphs()
def alignment_post_processing(
        libraryName,
        BASECALLER_RESULTS,
        ALIGNMENT_RESULTS,
        flows,
        mark_duplicates):

    datasets_basecaller = {}
    try:
        f = open(os.path.join(BASECALLER_RESULTS, "datasets_basecaller.json"), 'r')
        datasets_basecaller = json.load(f)
        f.close()
    except:
        printtime("ERROR: problem parsing %s" % os.path.join(BASECALLER_RESULTS, "datasets_basecaller.json"))
        traceback.print_exc()
        return

    try:
        graph_max_x = int(50 * math.ceil(0.014 * int(flows)))
    except:
        graph_max_x = 800

    alignment_file_list = []

    for dataset in datasets_basecaller["datasets"]:
        if not os.path.exists(os.path.join(BASECALLER_RESULTS, dataset['basecaller_bam'])):
            continue

        ionstats.generate_ionstats_alignment(
            os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.bam'),
            os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.ionstats_alignment.json'),
            graph_max_x)
        ionstats2alignstats(
            libraryName,
            os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.ionstats_alignment.json'),
            os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.alignment.summary'))

        alignment_file_list.append(os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.ionstats_alignment.json'))

    # In Progress: merge ionstats alignment results
    ionstats.reduce_stats(alignment_file_list, os.path.join(ALIGNMENT_RESULTS, 'ionstats_alignment.json'))

    ionstats2alignstats(
        libraryName,
        os.path.join(ALIGNMENT_RESULTS, 'ionstats_alignment.json'),
        os.path.join(ALIGNMENT_RESULTS, 'alignment.summary'))

    # Special legacy post-processing.
    # Generate merged rawlib.bam on barcoded runs

    composite_bam_filename = os.path.join(ALIGNMENT_RESULTS, 'rawlib.bam')
    if not os.path.exists(composite_bam_filename):

        bam_file_list = []
        for dataset in datasets_basecaller["datasets"]:
            bam_name = os.path.join(ALIGNMENT_RESULTS, os.path.basename(dataset['file_prefix']) + '.bam')
            if os.path.exists(bam_name):
                bam_file_list.append(bam_name)

        blockprocessing.merge_bam_files(bam_file_list, composite_bam_filename, composite_bam_filename + '.bai', mark_duplicates)

    # Generate alignment_barcode_summary.csv
    # TODO: use datasets_basecaller.json + *.ionstats_alignment.json instead of barcodeList.txt and alignment_*.summary
    barcodelist_path = 'barcodeList.txt'
    if not os.path.exists(barcodelist_path):
        barcodelist_path = '../barcodeList.txt'
    if not os.path.exists(barcodelist_path):
        barcodelist_path = '../../barcodeList.txt'
    if not os.path.exists(barcodelist_path):
        barcodelist_path = '../../../barcodeList.txt'
    if not os.path.exists(barcodelist_path):
        barcodelist_path = '../../../../barcodeList.txt'
    if os.path.exists(barcodelist_path):
        printtime("Barcode processing, aggregate")
        aggregate_alignment("./", barcodelist_path)

    # These graphs are likely obsolete
    #makeAlignGraphs()

    # In Progress: Use ionstats alignment results to generate plots
    ionstats_plots.alignment_rate_plot2(
        os.path.join(ALIGNMENT_RESULTS, 'ionstats_alignment.json'),
        'alignment_rate_plot.png',
        graph_max_x)

    ionstats_plots.base_error_plot(
        os.path.join(ALIGNMENT_RESULTS, 'ionstats_alignment.json'),
        'base_error_plot.png',
        graph_max_x)

    ionstats_plots.old_aq_length_histogram(
        os.path.join(ALIGNMENT_RESULTS, 'ionstats_alignment.json'),
        'Filtered_Alignments_Q10.png', 'AQ10', 'red')
    ionstats_plots.old_aq_length_histogram(
        os.path.join(ALIGNMENT_RESULTS, 'ionstats_alignment.json'),
        'Filtered_Alignments_Q17.png', 'AQ17', 'yellow')
    ionstats_plots.old_aq_length_histogram(
        os.path.join(ALIGNMENT_RESULTS, 'ionstats_alignment.json'),
        'Filtered_Alignments_Q20.png', 'AQ20', 'green')
    ionstats_plots.old_aq_length_histogram(
        os.path.join(ALIGNMENT_RESULTS, 'ionstats_alignment.json'),
        'Filtered_Alignments_Q47.png', 'AQ47', 'purple')