def sigproc(analysisArgs, libKey, tfKey, pathtorawblock, SIGPROC_RESULTS):
    """Run the signal-processing executable on one raw block.

    The command line is ``analysisArgs`` (or the bare ``Analysis`` default,
    which is reported through ``printtime``) followed by the library/TF keys,
    ``--no-subdir``, the output directory and the raw block path. The
    captured stdout/stderr are echoed to this process's own streams and
    appended to ``sigproc.log`` inside ``SIGPROC_RESULTS``.

    Returns the exit status of the child process.
    """
    if not analysisArgs:
        cmd = "Analysis"
        printtime("ERROR: Analysis command not specified, using default: 'Analysis'")
    else:
        cmd = analysisArgs  # e.g /home/user/Analysis --flowlimit 80

    option_fragments = [
        " --librarykey=%s" % (libKey),
        " --tfkey=%s" % (tfKey),
        " --no-subdir",
        " --output-dir=%s" % (SIGPROC_RESULTS),
        " %s" % pathtorawblock,
    ]
    cmd += "".join(option_fragments)
    printtime("Analysis command: " + cmd)

    argv = shlex.split(cmd.encode('utf8'))
    proc = subprocess.Popen(argv, shell=False,
                            stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    stdout_value, stderr_value = proc.communicate()
    status = proc.returncode

    sys.stdout.write("%s" % stdout_value)
    sys.stderr.write("%s" % stderr_value)

    # Ion Reporter
    try:
        sigproc_log_path = os.path.join(SIGPROC_RESULTS, 'sigproc.log')
        with open(sigproc_log_path, 'a') as log_file:
            for captured in (stdout_value, stderr_value):
                if captured:
                    log_file.write(captured)
    except IOError:
        traceback.print_exc()

    return status
def processBlock(tf_basecaller_bam_filename, BASECALLER_RESULTS, tfkey, floworder, analysis_dir):
    """Run the test-fragment (TF) analysis for one block.

    Builds the TF reference, aligns the TF reads, generates the ionstats TF
    summary, plots the TF length histograms into the current directory and
    writes the legacy ``TFStats.json``. All outputs other than the plots go
    into ``BASECALLER_RESULTS``.

    If a step raises ``NoTFDataException`` an empty ``TFStats.json`` is
    written instead; any other error is printed and swallowed.
    """
    try:
        def result_path(name):
            return os.path.join(BASECALLER_RESULTS, name)

        # These files will be created
        tfstatsjson_path = result_path("TFStats.json")
        tfbam_filename = result_path("rawtf.bam")
        tfref_filename = result_path("DefaultTFs.fasta")
        ionstats_tf_filename = result_path("ionstats_tf.json")

        # TF analysis in 5 simple steps
        buildTFReference(tfref_filename, analysis_dir, tfkey)
        alignTFs(tf_basecaller_bam_filename, tfbam_filename, tfref_filename)
        ionstats.generate_ionstats_tf(tfbam_filename, tfref_filename, ionstats_tf_filename)
        ionstats_plots.tf_length_histograms(ionstats_tf_filename, ".")
        ionstats.generate_legacy_tf_files(ionstats_tf_filename, tfstatsjson_path)

    except NoTFDataException as e:
        printtime("No data to analyze Test Fragments (%s)" % e.msg)
        # Leave an empty (but valid) stats file behind
        empty_stats = open(os.path.join(BASECALLER_RESULTS, "TFStats.json"), "w")
        empty_stats.write(json.dumps({}))
        empty_stats.close()

    except:
        traceback.print_exc()
def reduce_stats (input_filename_list, output_filename):
    """Merge many ionstats files into ``output_filename`` with ``ionstats reduce``.

    Inputs are reduced in batches of 100. Every batch except the last writes
    an intermediate file named ``<output_filename>.<end index>``, and each
    batch after the first also folds in the intermediate file produced by the
    previous one. Failures are printed and swallowed; the exit status of
    ``ionstats reduce`` itself is not inspected.
    """
    # wait for asynchronous process substitution processes
    # TODO
    import time
    time.sleep(10)

    try:
        # Materialise the input first: an iterator cannot be sliced
        all_inputs = list(input_filename_list)
        total = len(all_inputs)

        # process file list in smaller intervals
        batch_size = 100
        for start in range(0, total, batch_size):
            stop = start + batch_size
            if stop < total:
                batch = all_inputs[start:stop]
                target = output_filename + "." + str(stop)
            else:
                batch = all_inputs[start:total]
                target = output_filename

            # add results from earlier iterations
            if start > 0:
                batch = batch + [output_filename + "." + str(start)]

            com = "ionstats reduce"
            com += " -o %s" % (target)
            com += " " + " ".join(batch)
            printtime("DEBUG: Calling '%s'" % com)
            subprocess.call(com, shell=True)
    except:
        printtime('ERROR: Failed ionstats reduce')
        traceback.print_exc()
def merge_bams(dirs, BASECALLER_RESULTS, ALIGNMENT_RESULTS, basecaller_datasets, mark_duplicates):
    """Merge the per-block BAM files of every dataset into one composite BAM.

    For each entry of ``basecaller_datasets['datasets']``:

    * if the first read group has a reference and not all read groups are
      flagged ``filtered``, the aligned BAMs under ``ALIGNMENT_RESULTS`` are
      merged, a ``.bai`` path is passed along and ``mark_duplicates`` is
      honoured;
    * otherwise the unaligned basecaller BAMs under ``BASECALLER_RESULTS``
      are merged with ``method='samtools'``, without index and without
      duplicate marking.

    Only block BAMs that exist on disk are merged; a dataset with none is
    skipped. A failure in one dataset is logged and does not stop the others.

    ``mark_duplicates`` is never modified, so an unaligned dataset no longer
    switches duplicate marking off for the aligned datasets that follow it.
    """
    for dataset in basecaller_datasets['datasets']:
        # Bound before the try so the error message below can always be built.
        bamfile = '<unknown dataset>'
        try:
            read_groups = basecaller_datasets['read_groups']
            first_read_group = dataset['read_groups'][0]
            reference = read_groups[first_read_group]['reference']

            # The dataset counts as filtered only if every read group is.
            filtered = True
            for rg_name in dataset["read_groups"]:
                if not read_groups[rg_name].get('filtered', False):
                    filtered = False

            aligned = bool(reference) and not filtered
            if aligned:
                bamdir = ALIGNMENT_RESULTS
                bamfile = dataset['file_prefix'] + '.bam'
            else:
                bamdir = BASECALLER_RESULTS
                bamfile = dataset['basecaller_bam']

            candidates = [os.path.join(blockdir, bamdir, bamfile) for blockdir in dirs]
            block_bam_list = [path for path in candidates if os.path.exists(path)]
            if not block_bam_list:
                continue

            composite_bam_filepath = os.path.join(bamdir, bamfile)
            if aligned:
                composite_bai_filepath = composite_bam_filepath + '.bai'
                blockprocessing.merge_bam_files(block_bam_list, composite_bam_filepath,
                                                composite_bai_filepath, mark_duplicates)
            else:
                # Unaligned reads: no index, and duplicates are never marked.
                # Pass the constants directly instead of overwriting the parameter.
                blockprocessing.merge_bam_files(block_bam_list, composite_bam_filepath,
                                                "", False, method='samtools')
        except Exception:
            print(traceback.format_exc())
            printtime("ERROR: merging %s unsuccessful" % bamfile)
def old_read_length_histogram(ionstats_basecaller_filename, output_png_filename, max_length):
    """Plot the full read-length histogram from an ionstats basecaller JSON file.

    Reads ``['full']['read_length_histogram']`` from the JSON file and draws
    one bar per read length in ``[0, max_length)`` into an 8x4 inch figure
    saved as ``output_png_filename``. Reads of length ``max_length`` or more
    are not plotted.

    Errors are logged and swallowed. The figure is always closed, so repeated
    calls do not accumulate open matplotlib figures.
    """
    try:
        printtime("DEBUG: Generating plot %s" % output_png_filename)
        with open(ionstats_basecaller_filename, 'r') as f:
            ionstats_basecaller = json.load(f)

        histogram_x = list(range(max_length))
        num_bins = len(histogram_x)
        histogram_y = [0] * num_bins

        # Lengths beyond the last bin are dropped, not folded into it.
        observed = ionstats_basecaller['full']['read_length_histogram']
        for read_length, frequency in enumerate(observed[:num_bins]):
            histogram_y[read_length] += frequency

        fig = plt.figure(figsize=(8,4),dpi=100)
        try:
            ax = fig.add_subplot(111)
            ax.bar(histogram_x, histogram_y, width=2, color="#2D4782",linewidth=0)
            ax.set_title('Read Length Histogram')
            ax.set_xlabel('Read Length')
            ax.set_ylabel('Count')
            fig.savefig(output_png_filename)
        finally:
            # pyplot keeps a reference to every figure until it is closed.
            plt.close(fig)

    except Exception:
        printtime('Unable to generate plot %s' % output_png_filename)
        traceback.print_exc()
def generate_ionstats_alignment_cmd(ionstatsArgs, bam_filenames, ionstats_alignment_filename, ionstats_alignment_h5_filename, basecaller_json, library_key, histogram_length):
    """Build the ``ionstats alignment`` command line and return it as a string.

    ``ionstatsArgs`` supplies the executable and leading options; when it is
    empty the default ``ionstats alignment`` is used and reported through
    ``printtime``. The BAM files are passed as one comma-separated ``-i``
    value. When ``basecaller_json`` is given, its ``BaseCaller`` block
    offsets and sizes add the ``--chip-origin``, ``--chip-dim`` and
    ``--subregion-dim`` options.

    Any exception is printed and re-raised.
    """
    try:
        if not ionstatsArgs:
            com = "ionstats alignment"
            printtime("ERROR: ionstats alignment command not specified, using default: 'ionstats alignment'")
        else:
            com = ionstatsArgs

        fragments = [" -i %s" % (bam_filenames[0])]
        fragments.extend(",%s" % (extra_bam) for extra_bam in bam_filenames[1:])
        fragments.append(" -o %s" % (ionstats_alignment_filename))
        fragments.append(" -k %s" % (library_key))
        fragments.append(" -h %d" % (int(histogram_length)))
        fragments.append(" --evaluate-hp true")
        fragments.append(" --output-h5 %s" % ionstats_alignment_h5_filename)

        if basecaller_json:
            block_info = basecaller_json["BaseCaller"]
            block_col_offset = block_info['block_col_offset']
            block_row_offset = block_info['block_row_offset']
            block_col_size = block_info['block_col_size']
            block_row_size = block_info['block_row_size']
            subregion_col_size, subregion_row_size = generate_ionstats_subregion_dims(block_col_size, block_row_size)

            fragments.append(" --chip-origin %s,%s" % (block_col_offset, block_row_offset))
            fragments.append(" --chip-dim %s,%s" % (block_col_size, block_row_size))
            fragments.append(" --subregion-dim %s,%s" % (subregion_col_size, subregion_row_size))

        com += "".join(fragments)
    except:
        traceback.print_exc()
        raise
    return com
def basecaller_cmd(basecallerArgs, SIGPROC_RESULTS, libKey, tfKey, runID, BASECALLER_RESULTS, block_col_offset, block_row_offset, datasets_pipeline_path, adapter):
    """Assemble the BaseCaller command line and return it as a string.

    ``basecallerArgs`` provides the executable and leading options; when it
    is empty the default ``BaseCaller`` is used and reported through
    ``printtime``. The block offsets are formatted with ``%d`` and passed as
    the separate ``--block-col-offset`` / ``--block-row-offset`` options.
    """
    if not basecallerArgs:
        cmd = "BaseCaller"
        printtime("ERROR: BaseCaller command not specified, using default: 'BaseCaller'")
    else:
        cmd = basecallerArgs

    option_fragments = [
        " --input-dir=%s" % (SIGPROC_RESULTS),
        " --librarykey=%s" % (libKey),
        " --tfkey=%s" % (tfKey),
        " --run-id=%s" % (runID),
        " --output-dir=%s" % (BASECALLER_RESULTS),
        " --block-col-offset %d" % (block_col_offset),
        " --block-row-offset %d" % (block_row_offset),
        " --datasets=%s" % (datasets_pipeline_path),
        " --trim-adapter %s" % (adapter),
    ]
    cmd += "".join(option_fragments)
    return cmd
def generate_ionstats_alignment(
    bam_filenames, ionstats_alignment_filename, ionstats_alignment_h5_filename, basecaller_json, histogram_length
):
    """Build and run ``ionstats alignment`` for the given BAM files.

    The BAM files are passed as one comma-separated ``-i`` value. Only when
    ``basecaller_json`` is given are the homopolymer evaluation, the HDF5
    output and the chip geometry options added; the sub-region size is
    capped at 92x74 and at one less than the block size.

    The command runs through the shell. Errors are logged and swallowed and
    the command's exit status is not inspected.
    """
    try:
        fragments = ["ionstats alignment", " -i %s" % (bam_filenames[0])]
        for extra_bam in bam_filenames[1:]:
            fragments.append(",%s" % (extra_bam))
        fragments.append(" -o %s" % (ionstats_alignment_filename))
        fragments.append(" -h %d" % (int(histogram_length)))

        if basecaller_json:
            block_info = basecaller_json["BaseCaller"]
            block_col_offset = block_info["block_col_offset"]
            block_row_offset = block_info["block_row_offset"]
            block_col_size = block_info["block_col_size"]
            block_row_size = block_info["block_row_size"]

            fragments.append(" --evaluate-hp true")
            fragments.append(" --output-h5 %s" % ionstats_alignment_h5_filename)
            fragments.append(" --chip-origin %s,%s" % (block_col_offset, block_row_offset))
            fragments.append(" --chip-dim %s,%s" % (block_col_size, block_row_size))
            fragments.append(
                " --subregion-dim %s,%s"
                % (min(92, block_col_size - 1), min(74, block_row_size - 1))
            )

        com = "".join(fragments)
        printtime("DEBUG: Calling '%s'" % com)
        subprocess.call(com, shell=True)
    except:
        printtime("Failed ionstats alignment")
        traceback.print_exc()
def generate_legacy_tf_files(ionstats_tf_filename, tfstats_json_filename):
    """Convert an ionstats TF JSON file into the legacy ``TFStats.json`` layout.

    For every entry of ``results_by_tf`` the legacy record carries the TF
    name and sequence, the read count, the system SNR, the homopolymer
    accuracy numerator/denominator, the AQ10/AQ17 read-length histograms and
    means, and the number of AQ10/AQ17 reads of length 50 or more.

    Errors are logged and swallowed.
    """
    try:
        with open(ionstats_tf_filename, "r") as source:
            ionstats_tf = json.load(source)

        tfstats_json = {}
        for tf_name, tf_data in ionstats_tf["results_by_tf"].items():
            record = {}
            record["TF Name"] = tf_name
            record["TF Seq"] = tf_data["sequence"]
            record["Num"] = tf_data["full"]["num_reads"]
            record["System SNR"] = tf_data["system_snr"]
            record["Per HP accuracy NUM"] = tf_data["hp_accuracy_numerator"]
            record["Per HP accuracy DEN"] = tf_data["hp_accuracy_denominator"]

            q10_histogram = tf_data["AQ10"]["read_length_histogram"]
            q17_histogram = tf_data["AQ17"]["read_length_histogram"]
            record["Q10"] = q10_histogram
            record["Q17"] = q17_histogram
            record["Q10 Mean"] = tf_data["AQ10"]["mean_read_length"]
            record["Q17 Mean"] = tf_data["AQ17"]["mean_read_length"]
            # Reads whose AQ length is at least 50 bases
            record["50Q10"] = sum(q10_histogram[50:])
            record["50Q17"] = sum(q17_histogram[50:])
            tfstats_json[tf_name] = record

        with open(tfstats_json_filename, "w") as target:
            target.write(json.dumps(tfstats_json, indent=4))

    except:
        printtime("Failed to generate %s" % (tfstats_json_filename))
        traceback.print_exc()
def barcode_report_stats(barcode_names):
    """Write ``CA_barcode_summary.json`` and the merged ``ionstats_alignment.json``.

    For every barcode (in sorted order) the file
    ``<barcode>_rawlib.ionstats_alignment.json`` in the current directory is
    read and, for each tracked quality level present in it, the max/mean read
    length and the base/read counts are copied into the barcode's summary
    entry. Barcodes whose file cannot be read keep the zeroed default entry.
    The ``nomatch`` entry is placed first in the summary.

    The merged ``ionstats_alignment.json`` is only generated when it does not
    exist yet, from the per-barcode files that were read successfully.
    """
    tracked_levels = ['AQ7', 'AQ10', 'AQ17', 'AQ20', 'AQ30', 'AQ47', 'full', 'aligned']
    copied_metrics = ("max_read_length", "mean_read_length", "num_bases", "num_reads")

    CA_barcodes_json = []
    ionstats_file_list = []
    printtime("DEBUG: creating CA_barcode_summary.json")

    for bcname in sorted(barcode_names):
        ionstats_file = bcname + '_rawlib.ionstats_alignment.json'
        barcode_json = {"barcode_name": bcname, "AQ7_num_bases":0, "full_num_reads":0, "AQ7_mean_read_length":0}
        try:
            stats = json.load(open(ionstats_file))
            for level in stats.keys():
                if level not in tracked_levels:
                    continue
                level_stats = dict(
                    (level + "_" + metric, stats[level].get(metric))
                    for metric in copied_metrics
                )
                barcode_json.update(level_stats)
            ionstats_file_list.append(ionstats_file)
        except:
            printtime("DEBUG: error reading ionstats from %s" % ionstats_file)
            traceback.print_exc()

        # 'nomatch' always leads the summary
        insert_at = 0 if bcname == 'nomatch' else len(CA_barcodes_json)
        CA_barcodes_json.insert(insert_at, barcode_json)

    with open('CA_barcode_summary.json','w') as f:
        f.write(json.dumps(CA_barcodes_json, indent=2))

    # generate merged ionstats_alignment.json
    if not os.path.exists('ionstats_alignment.json'):
        ionstats.reduce_stats(ionstats_file_list,'ionstats_alignment.json')
def spawn_cluster_job(rpath, scriptname, args, holds=None):
    """Submit ``/usr/bin/<scriptname>`` as a cluster job running inside ``rpath``.

    Starts a fresh ``drmaa_stdout_block.html`` in ``rpath``, picks the queue
    from the module-level ``is_thumbnail`` / ``is_blockprocessing`` / ``env``
    settings, optionally adds a ``-hold_jid`` list built from ``holds`` and
    hands the job to ``jobserver.submitjob``.

    Returns the job id, or -1 if the submission raised.
    """
    out_path = "%s/drmaa_stdout_block.html" % rpath
    err_path = "%s/drmaa_stderr_block.txt" % rpath
    logout = open(os.path.join(out_path), "w")
    logout.write("<html><pre> \n")
    logout.close()

    cwd = os.getcwd()

    # SGE
    if is_thumbnail:
        sge_queue = "thumbnail.q"
    else:
        sge_queue = "all.q"
    jt_nativeSpecification = "-pe ion_pe 1 -q " + sge_queue

    # TODO experiment
    if is_blockprocessing and ("X1" in rpath):
        # process some blocks on instrument
        if env["pgmName"] == "Mustang":  # != ""
            sge_queue = "proton_" + env["pgmName"].lower() + ".q"
            jt_nativeSpecification = "-q " + sge_queue

    printtime("Use " + sge_queue)

    # TORQUE
    # jt_nativeSpecification = ""

    jt_remoteCommand = "python"
    jt_workingDirectory = os.path.join(cwd, rpath)
    jt_outputPath = ":" + os.path.join(cwd, out_path)
    jt_errorPath = ":" + os.path.join(cwd, err_path)
    jt_args = [os.path.join("/usr/bin", scriptname), args]
    jt_joinFiles = False

    if holds != None and len(holds) > 0:
        # Every job id is followed by a comma, including the last one
        hold_list = "".join("%s," % holdjobid for holdjobid in holds)
        jt_nativeSpecification += " -hold_jid " + hold_list

    # TODO remove debug output
    for debug_value in (jt_nativeSpecification, jt_remoteCommand, jt_workingDirectory,
                        jt_outputPath, jt_errorPath, jt_args):
        print(debug_value)

    try:
        jobid = jobserver.submitjob(
            jt_nativeSpecification,
            jt_remoteCommand,
            jt_workingDirectory,
            jt_outputPath,
            jt_errorPath,
            jt_args,
            jt_joinFiles,
        )
    except:
        traceback.print_exc()
        jobid = -1

    return jobid
def get_parent_barcode_files(parent_folder, datasets_path, barcodeSet):
    """Return the barcode BAM files of a parent report.

    The names come from the datasets JSON at ``parent_folder/datasets_path``:
    for each dataset the ``<file_prefix>.bam`` file is used if it exists,
    otherwise ``<legacy_prefix>.bam`` if that key is present and the file
    exists. If nothing is found that way (including when the JSON cannot be
    read), the function falls back to globbing
    ``<barcodeSet>*_rawlib.bam`` plus ``nomatch_rawlib.bam`` and sorts the
    result.
    """
    # try to get barcode names from datasets json, fallback on globbing for older reports
    datasetsFile = os.path.join(parent_folder, datasets_path)
    barcode_bams = []

    try:
        with open(datasetsFile, 'r') as f:
            datasets_json = json.load(f)

        for dataset in datasets_json.get("datasets", []):
            current_bam = os.path.join(parent_folder, dataset["file_prefix"] + '.bam')
            if os.path.exists(current_bam):
                barcode_bams.append(current_bam)
                continue
            if 'legacy_prefix' not in dataset:
                continue
            legacy_bam = os.path.join(parent_folder, dataset["legacy_prefix"] + '.bam')
            if os.path.exists(legacy_bam):
                barcode_bams.append(legacy_bam)
    except:
        # Deliberately silent: the glob fallback below covers this case
        pass

    if not len(barcode_bams):
        printtime("DEBUG: no barcoded files found from %s" % datasetsFile)
        glob_pattern = os.path.join(parent_folder, barcodeSet + '*_rawlib.bam')
        barcode_bams = glob(glob_pattern)
        barcode_bams.append(os.path.join(parent_folder, 'nomatch_rawlib.bam'))
        barcode_bams.sort()

    printtime("DEBUG: found %i barcodes in %s" % (len(barcode_bams), parent_folder))
    return barcode_bams
def read_length_histogram(ionstats_basecaller_filename, output_png_filename, max_length):
    """Plot the compact read-length histogram from an ionstats basecaller JSON file.

    Reads ``['full']['read_length_histogram']`` from the JSON file and draws
    one bar per read length in ``[0, max_length)`` into a frameless 4x3.5
    inch figure with a transparent background, saved as
    ``output_png_filename``. Reads of length ``max_length`` or more are not
    plotted.

    Errors are logged and swallowed. The figure is always closed, so repeated
    calls do not accumulate open matplotlib figures.
    """
    try:
        printtime("DEBUG: Generating plot %s" % output_png_filename)
        with open(ionstats_basecaller_filename, 'r') as f:
            ionstats_basecaller = json.load(f)

        histogram_x = list(range(max_length))
        num_bins = len(histogram_x)
        histogram_y = [0] * num_bins

        # Lengths beyond the last bin are dropped, not folded into it.
        observed = ionstats_basecaller['full']['read_length_histogram']
        for read_length, frequency in enumerate(observed[:num_bins]):
            histogram_y[read_length] += frequency

        # Keep a non-zero y range even when the histogram is all zeros.
        max_y = max(max(histogram_y), 1)

        fig = plt.figure(figsize=(4,3.5),dpi=100)
        try:
            ax = fig.add_subplot(111,frame_on=False,yticks=[],position=[0,0.15,1,0.88])
            ax.bar(histogram_x,histogram_y,width=2.5, color="#2D4782",linewidth=0, zorder=2)
            ax.set_ylim(0,1.2*max_y)
            ax.set_xlim(-5,max_length+15)
            ax.set_xlabel("Read Length")
            fig.patch.set_alpha(0.0)
            fig.savefig(output_png_filename)
        finally:
            # pyplot keeps a reference to every figure until it is closed.
            plt.close(fig)

    except Exception:
        printtime('Unable to generate plot %s' % output_png_filename)
        traceback.print_exc()
def getExpLogMsgs(env):
    """
    Parses explog_final.txt for warning messages and dumps them to
    ReportLog.html. This only works if the raw data files have not
    been deleted. For a from-wells analysis, you may not have raw data.

    Returns True when explog_final.txt could not be opened, False otherwise.
    """
    inputFile = os.path.join(env["pathToRaw"], "explog_final.txt")
    outputFile = os.path.join("./", "ReportLog.html")

    try:
        explog = open(inputFile, "r")
    except:
        printtime("Cannot open file %s" % inputFile)
        return True

    marker = "WARNINGS: "
    line = explog.readline()
    while line:
        # Only lines that carry text after the marker are reported
        has_warning_text = "WARNINGS:" in line and len(marker) < len(line)
        if has_warning_text:
            # print to output file
            try:
                report = open(outputFile, "a")
                report.write("From PGM explog_final.txt:\n")
                report.write(line)
                report.close()
            except:
                printtime("Cannot open file %s" % outputFile)
        line = explog.readline()

    explog.close()
    return False
def generate_raw_data_traces(libKey, tfKey, floworder, SIGPROC_RESULTS):
    """Plot the average key traces for the test-fragment and library keys.

    For each key the file ``avgNukeTrace_<key>.txt`` in ``SIGPROC_RESULTS``
    is parsed with ``plotKey.KeyPlot``, its maximum is dumped to
    ``./raw_peak_signal`` and the trace is plotted. The test fragment is
    handled first, then the library. A missing trace file or a plotting
    failure is logged and does not affect the other key.
    """
    ########################################################
    # Generate Raw Data Traces for lib and TF keys         #
    ########################################################
    printtime("Generate Raw Data Traces for lib and TF keys(iontrace_Test_Fragment.png, iontrace_Library.png) and raw_peak_signal file")

    tfRawPath = os.path.join(SIGPROC_RESULTS, 'avgNukeTrace_%s.txt' % tfKey)
    libRawPath = os.path.join(SIGPROC_RESULTS, 'avgNukeTrace_%s.txt' % libKey)
    peakOut = 'raw_peak_signal'

    trace_jobs = (
        (tfKey, tfRawPath, 'Test Fragment', "TF key graph didn't render"),
        (libKey, libRawPath, 'Library', "Lib key graph didn't render"),
    )
    for key, raw_path, title, failure_message in trace_jobs:
        if not os.path.exists(raw_path):
            printtime("ERROR: %s is missing" % raw_path)
            continue
        try:
            kp = plotKey.KeyPlot(key, floworder, title)
            kp.parse(raw_path)
            kp.dump_max(os.path.join('.', peakOut))
            kp.plot()
        except:
            printtime(failure_message)
            traceback.print_exc()
def PE_set_value(key):
    """Return the value of ``key`` shared by the forward and reverse runs.

    Looks ``key`` up in the module-level ``forward_env`` and ``reverse_env``.
    If the two values differ the mismatch is logged and ``'unknown'`` is
    returned.
    """
    forward_value = forward_env[key]
    reverse_value = reverse_env[key]
    if forward_value == reverse_value:
        return forward_value

    printtime("ERROR forward run %s (%s) doesn't match reverse run %s (%s)" %
              (key, forward_value, key, reverse_value))
    return 'unknown'
def generate_legacy_tf_files (ionstats_tf_filename, tfstats_json_filename):
    """Write the legacy TFStats JSON derived from an ionstats TF JSON file.

    Each test fragment listed under ``results_by_tf`` becomes one record with
    its name, sequence, read count, system SNR, homopolymer accuracy
    numerator/denominator, the AQ10/AQ17 read-length histograms and means,
    and the count of AQ10/AQ17 reads that are 50 bases or longer.

    Errors are logged and swallowed.
    """
    def reads_50_or_longer(histogram):
        # Histogram index is the read length, so sum everything from 50 up
        return sum(histogram[50:])

    try:
        with open(ionstats_tf_filename,'r') as infile:
            ionstats_tf = json.load(infile)

        tfstats_json = {}
        per_tf_results = ionstats_tf['results_by_tf']
        for tf_name in per_tf_results:
            tf_data = per_tf_results[tf_name]
            tfstats_json[tf_name] = {
                'TF Name' : tf_name,
                'TF Seq' : tf_data['sequence'],
                'Num' : tf_data['full']['num_reads'],
                'System SNR' : tf_data['system_snr'],
                'Per HP accuracy NUM' : tf_data['hp_accuracy_numerator'],
                'Per HP accuracy DEN' : tf_data['hp_accuracy_denominator'],
                'Q10' : tf_data['AQ10']['read_length_histogram'],
                'Q17' : tf_data['AQ17']['read_length_histogram'],
                'Q10 Mean' : tf_data['AQ10']['mean_read_length'],
                'Q17 Mean' : tf_data['AQ17']['mean_read_length'],
                '50Q10' : reads_50_or_longer(tf_data['AQ10']['read_length_histogram']),
                '50Q17' : reads_50_or_longer(tf_data['AQ17']['read_length_histogram']),
            }

        with open(tfstats_json_filename,'w') as outfile:
            outfile.write(json.dumps(tfstats_json, indent=4))

    except:
        printtime('Failed to generate %s' % (tfstats_json_filename))
        traceback.print_exc()
def basecaller_cmd(basecallerArgs, SIGPROC_RESULTS, libKey, tfKey, runID, BASECALLER_RESULTS, block_col_offset, block_row_offset, datasets_pipeline_path, adapter):
    """Assemble the BaseCaller command line and return it as a string.

    ``basecallerArgs`` provides the executable and leading options; when it
    is empty the default ``BaseCaller`` is used and reported through
    ``printtime``. The block offsets are passed as one ``--block-offset
    col,row`` option. If ``PhaseEstimates.json`` exists in
    ``SIGPROC_RESULTS`` it is added via ``--phase-estimation-file``.
    """
    if not basecallerArgs:
        cmd = "BaseCaller"
        printtime("ERROR: BaseCaller command not specified, using default: 'BaseCaller'")
    else:
        cmd = basecallerArgs

    option_fragments = [
        " --input-dir=%s" % (SIGPROC_RESULTS),
        " --librarykey=%s" % (libKey),
        " --tfkey=%s" % (tfKey),
        " --run-id=%s" % (runID),
        " --output-dir=%s" % (BASECALLER_RESULTS),
        " --block-offset %d,%d" % (block_col_offset, block_row_offset),
        " --datasets=%s" % (datasets_pipeline_path),
        " --trim-adapter %s" % (adapter),
    ]
    cmd += "".join(option_fragments)

    # Only hand over phase estimates that are actually there
    phase_estimates_json = os.path.join(SIGPROC_RESULTS, "PhaseEstimates.json")
    if os.path.exists(phase_estimates_json):
        cmd += " --phase-estimation-file %s" % phase_estimates_json

    return cmd
def reduce_stats_h5 (input_filename_list, output_filename):
    """Merge many ionstats HDF5 files into ``output_filename`` with ``ionstats reduce-h5``.

    Inputs are reduced in batches of 100. Every batch except the last writes
    an intermediate file named ``<output_filename>.<end index>``, and each
    batch after the first also folds in the intermediate file produced by the
    previous one.

    A non-zero exit status of ``ionstats reduce-h5`` raises an exception;
    every failure is logged and re-raised to the caller.
    """
    try:
        # Materialise the input first: an iterator cannot be sliced
        pending = list(input_filename_list)
        total = len(pending)

        # process file list in smaller intervals
        batch_size = 100
        start = 0
        while start < total:
            stop = start + batch_size
            last_batch = stop >= total
            batch = pending[start:(total if last_batch else stop)]
            target = output_filename if last_batch else output_filename + "." + str(stop)

            # add results from earlier iterations
            if start > 0:
                batch = batch + [output_filename + "." + str(start)]

            start = stop

            com = "ionstats reduce-h5"
            com += " -o %s" % (target)
            com += " " + " ".join(batch)
            printtime("DEBUG: Calling '%s'" % com)

            returncode = subprocess.call(com, shell=True)
            if returncode != 0:
                raise Exception('ERROR: ionstats reduce-h5 return code: %s' % returncode)
    except:
        printtime('ERROR: Failed ionstats reduce-h5')
        traceback.print_exc()
        raise
def old_aq_length_histogram(ionstats_alignment_filename, output_png_filename, aq_string, color):
    """Plot the read-length histogram of one alignment-quality level.

    Reads ``[aq_string]['read_length_histogram']`` from the ionstats
    alignment JSON file and draws it as a bar chart in ``color`` into an 8x4
    inch figure saved as ``output_png_filename``. The x axis spans at least
    0..400 and the y axis ends 10 above the tallest bar.

    Errors are logged and swallowed. The figure is always closed, so repeated
    calls do not accumulate open matplotlib figures.
    """
    try:
        printtime("DEBUG: Generating plot %s" % output_png_filename)
        with open(ionstats_alignment_filename,'r') as f:
            ionstats_alignment = json.load(f)

        data = ionstats_alignment[aq_string]['read_length_histogram']
        xaxis = list(range(len(data)))
        ymax = max(data) + 10
        # Never show less than the 0..400 range
        xmax = max(len(data) - 1, 400)

        fig = plt.figure(figsize=(8,4),dpi=100)
        try:
            ax = fig.add_subplot(111)
            ax.bar(xaxis, data, facecolor = color, align = 'center', linewidth=0, alpha=1.0, width = 1.0)
            ax.set_xlabel('Filtered %s Read Length' % aq_string)
            ax.set_ylabel('Count')
            ax.set_title('Filtered %s Read Length' % aq_string)
            ax.set_xlim(0,xmax)
            ax.set_ylim(0,ymax)
            fig.savefig(output_png_filename)
        finally:
            # pyplot keeps a reference to every figure until it is closed.
            plt.close(fig)

    except Exception:
        printtime('Unable to generate plot %s' % output_png_filename)
        traceback.print_exc()
def generate_ionstats_subregion_dims(block_col_size, block_row_size):
    """Return the ``(columns, rows)`` sub-region size for a block of the given size.

    Known block sizes map to fixed sub-region sizes; any other size gets the
    default 92x74. If the lookup itself fails the error is logged and the
    function falls through, returning ``None``.
    """
    try:
        # (block columns, block rows, sub-region columns, sub-region rows)
        known_layouts = (
            (1200, 800, 50, 50),        # Thumbnail
            (30912, 21296, 368, 296),   # P2
            (2576, 2662, 368, 296),     # P2
            (15456, 10656, 184, 148),   # P1
            (1288, 1332, 184, 148),     # P1
            (7680, 5312, 80, 83),       # P0
            (640, 664, 80, 83),         # P0
            (3392, 3792, 53, 48),       # 318
            (3392, 2120, 53, 53),       # 316v2
            (2736, 2640, 48, 48),       # 316
            (1280, 1152, 40, 48),       # 314
        )

        subregion_col_size = 92
        subregion_row_size = 74
        for cols, rows, sub_cols, sub_rows in known_layouts:
            if block_col_size == cols and block_row_size == rows:
                subregion_col_size = sub_cols
                subregion_row_size = sub_rows
                break

        return (subregion_col_size, subregion_row_size)

    except:
        printtime('ERROR: Failed to generate subregion dims from input %s,%s' % (block_col_size, block_row_size))
        traceback.print_exc()
def submit_job(script, args, sge_queue = 'all.q', hold_jid = None):
    """Submit ``python <script> <args...>`` to the cluster and return the job id.

    The job runs in the current working directory on ``sge_queue``; its
    stdout/stderr go to ``drmaa_stdout_block.txt`` / ``drmaa_stderr_block.txt``
    there. The ids in ``hold_jid``, if any, are passed as a ``-hold_jid`` list.

    If the submission raises, the error is logged and the process exits via
    ``sys.exit()``.
    """
    cwd = os.getcwd()

    # SGE
    jt_nativeSpecification = "-pe ion_pe 1 -q " + sge_queue
    printtime("Use "+ sge_queue)

    jt_remoteCommand = "python"
    jt_workingDirectory = cwd
    jt_outputPath = ":" + "%s/drmaa_stdout_block.txt" % cwd
    jt_errorPath = ":" + "%s/drmaa_stderr_block.txt" % cwd
    jt_args = [script] + args
    jt_joinFiles = False

    if hold_jid != None and len(hold_jid) > 0:
        # Every job id is followed by a comma, including the last one
        jt_nativeSpecification += " -hold_jid " + "".join(
            "%s," % holdjobid for holdjobid in hold_jid)

    try:
        return jobserver.submitjob(
            jt_nativeSpecification,
            jt_remoteCommand,
            jt_workingDirectory,
            jt_outputPath,
            jt_errorPath,
            jt_args,
            jt_joinFiles)
    except:
        traceback.print_exc()
        printtime("FAILED submitting %s job" % script)
        sys.exit()
def processBlock(tf_basecaller_bam_filename, BASECALLER_RESULTS, tfkey, floworder, analysis_dir):
    """Run the test-fragment (TF) analysis for one block.

    Builds the TF reference, aligns the TF reads, runs alignStats and the TF
    mapper and finally generates the plots. The reference, the aligned BAM
    and ``TFStats.json`` are placed in ``BASECALLER_RESULTS``.

    If a step raises ``NoTFDataException`` an empty ``TFStats.json`` is
    written instead; any other error is printed and swallowed.
    """
    try:
        def result_path(name):
            return os.path.join(BASECALLER_RESULTS, name)

        # These files will be created
        tfstatsjson_path = result_path("TFStats.json")
        tfbam_filename = result_path("rawtf.bam")
        tfref_filename = result_path("DefaultTFs.fasta")

        # TF analysis in 5 simple steps
        buildTFReference(tfref_filename, analysis_dir, tfkey)
        alignTFs(tf_basecaller_bam_filename, tfbam_filename, tfref_filename)
        # Note: alignStats dumps its results to files in current directory
        doAlignStats(tfbam_filename)
        doTFMapper(tfbam_filename, tfref_filename, tfstatsjson_path)
        generatePlots(floworder, tfstatsjson_path)

    except NoTFDataException as e:
        printtime("No data to analyze Test Fragments (%s)" % e.msg)
        # Leave an empty (but valid) stats file behind
        empty_stats = open(os.path.join(BASECALLER_RESULTS,'TFStats.json'),'w')
        empty_stats.write(json.dumps({}))
        empty_stats.close()

    except:
        traceback.print_exc()
def find_barcodes_to_process(parentBAMs, barcodeSet):
    """Group the barcode BAM files found next to each parent BAM by barcode name.

    Returns the tuple ``(barcodeSet, barcode_files, barcodeSet_Info)``.
    ``barcode_files`` maps a barcode name to a dict with the keys ``count``,
    ``bcfiles_to_merge`` (the matching BAM paths) and ``filename``
    (``<name>_rawlib.bam``). ``barcodeSet_Info`` maps barcode names to the
    index/sequence/adapter fields parsed from the first ``barcodeList.txt``
    that is found, plus a ``nomatch`` entry; it stays ``None`` when no parent
    folder has such a file.

    When ``barcodeSet`` is falsy the function returns immediately with empty
    results. Otherwise the last ``barcodeList.txt`` seen is copied into the
    current directory before returning.
    """
    # get barcode files to process
    barcode_files = {}
    barcodeSet_Info = None
    datasets_path = 'basecaller_results/datasets_basecaller.json'
    barcodelist_path = 'barcodeList.txt'

    if not barcodeSet:
        return barcodeSet, barcode_files, barcodeSet_Info

    for bamfile in parentBAMs:
        parent_folder = os.path.dirname(bamfile)
        if os.path.exists(os.path.join(parent_folder, barcodelist_path)):
            bcList_file = os.path.join(parent_folder, barcodelist_path)
            # The barcode set name is whatever follows 'file_id' on the first line
            bcSetName_new = open(bcList_file, 'r').readline().split('file_id')[1].strip()
            if barcodeSet != bcSetName_new:
                printtime("Warning: different barcode sets: %s and %s" % (barcodeSet, bcSetName_new))

            # Barcode details are parsed once, from the first list encountered
            if not barcodeSet_Info:
                barcodeSet_Info = {'nomatch': {'index': 0}}
                try:
                    with open(bcList_file, 'r') as f:
                        for line in f.readlines():
                            # Comma-separated 'barcode <index>,<name>,<sequence>,<adapter>,...' lines
                            if line.startswith('barcode'):
                                splitline = line.split(',')
                                name = splitline[1]
                                barcodeSet_Info[name] = {
                                    'index': splitline[0].split()[1],
                                    'sequence': splitline[2],
                                    'adapter': splitline[3]
                                }
                except:
                    traceback.print_exc()

            # get barcode BAM files
            barcode_bams = get_parent_barcode_files(parent_folder, datasets_path, barcodeSet)

            for bc_path in barcode_bams:
                # A file belongs to the first known barcode whose name prefixes its
                # basename; files matching none are collected under 'unknown'
                try:
                    bcname = [name for name in barcodeSet_Info.keys() if os.path.basename(bc_path).startswith(name)][0]
                except:
                    bcname = 'unknown'

                if bcname not in barcode_files:
                    barcode_files[bcname] = {
                        'count': 0,
                        'bcfiles_to_merge': []
                    }
                    barcode_files[bcname]['filename'] = bcname + '_rawlib.bam'

                barcode_files[bcname]['count'] += 1
                barcode_files[bcname]['bcfiles_to_merge'].append(bc_path)

    if barcodeSet:
        # bcList_file is unbound here when no parent folder had a barcodeList.txt;
        # the resulting error is caught and printed like any copy failure
        try:
            shutil.copy(bcList_file, barcodelist_path)
        except:
            traceback.print_exc()

    return barcodeSet, barcode_files, barcodeSet_Info
def set_result_status(status):
    """Report ``status`` for this result to the job server.

    Does nothing when the module-level ``primary_key_file`` does not exist.
    Errors are printed and swallowed.
    """
    try:
        if not os.path.exists(primary_key_file):
            return
        jobserver.updatestatus(primary_key_file, status, True)
        log_fields = (status, os.getpid(), primary_key_file)
        printtime("MergeTLStatus %s\tpid %d\tpk file %s started" % log_fields)
    except:
        traceback.print_exc()
def doAlignStats(bam_filename):
    """Run ``alignStats`` on ``bam_filename`` through ``os.system``.

    The command writes its ``TF`` outputs relative to the current directory.
    Its exit status is not inspected; only an exception while launching it
    is logged.
    """
    try:
        command_template = 'alignStats -i %s -p 1 -o TF -a TF.alignTable.txt -n 12'
        com = command_template % bam_filename
        printtime("DEBUG: Calling '%s'" % com)
        os.system(com)
    except:
        printtime("ERROR: alignStats failed")
def set_result_status(status):
    """Report ``status`` for this result to the job server.

    The primary key file is ``primary.key`` in the current working
    directory. Errors are printed and swallowed.
    """
    try:
        primary_key_file = os.path.join(os.getcwd(),'primary.key')
        jobserver.updatestatus(primary_key_file, status, True)
        log_fields = (status, os.getpid(), primary_key_file, debugging_cwd)
        printtime("TLStatus %s\tpid %d\tpk file %s started in %s" % log_fields)
    except:
        traceback.print_exc()
def generate_ionstats_basecaller(unmapped_bam_filenames, ionstats_basecaller_filename, library_key, histogram_length):
    """Build the ``ionstats basecaller`` command and run it through the shell.

    The command line comes from ``generate_ionstats_basecaller_cmd``; errors
    raised while building it propagate to the caller. Errors while running
    it are logged and swallowed, and its exit status is not inspected.
    """
    com = generate_ionstats_basecaller_cmd(
        unmapped_bam_filenames,
        ionstats_basecaller_filename,
        library_key,
        histogram_length,
    )

    try:
        printtime("DEBUG: Calling '%s'" % com)
        subprocess.call(com, shell=True)
    except:
        printtime('Failed ionstats basecaller')
        traceback.print_exc()
def generate_ionstats_alignment(ionstatsArgs, bam_filenames, ionstats_alignment_filename, ionstats_alignment_h5_filename, basecaller_json, library_key, histogram_length):
    """Build the ``ionstats alignment`` command and run it through the shell.

    The command line comes from ``generate_ionstats_alignment_cmd``; errors
    raised while building it propagate to the caller. Errors while running
    it are logged and swallowed, and its exit status is not inspected.
    """
    com = generate_ionstats_alignment_cmd(
        ionstatsArgs,
        bam_filenames,
        ionstats_alignment_filename,
        ionstats_alignment_h5_filename,
        basecaller_json,
        library_key,
        histogram_length,
    )

    try:
        printtime("DEBUG: Calling '%s'" % com)
        subprocess.call(com,shell=True)
    except:
        printtime('Failed ionstats alignment')
        traceback.print_exc()
def quality_histogram(ionstats_basecaller_filename,output_png_filename):
    """Plot the base-quality distribution from an ionstats basecaller JSON file.

    The ``qv_histogram`` is collapsed into five bins (0-4, 5-9, 10-14, 15-19
    and 20+), each expressed as a percentage of all bases (all zeros when the
    histogram is empty), and drawn as a labelled bar chart with a transparent
    background into ``output_png_filename``.

    Errors are logged and swallowed.
    """
    try:
        printtime("DEBUG: Generating plot %s" % output_png_filename)
        with open(ionstats_basecaller_filename,'r') as f:
            ionstats_basecaller = json.load(f)

        qv_histogram = ionstats_basecaller["qv_histogram"]
        sum_total = float(sum(qv_histogram))

        # Half-open quality ranges; the last one is open-ended
        bin_ranges = [(0, 5), (5, 10), (10, 15), (15, 20), (20, None)]
        if sum_total > 0:
            graph_y = [100.0 * sum(qv_histogram[low:high]) / sum_total
                       for low, high in bin_ranges]
        else:
            graph_y = [0.0 for _ in bin_ranges]

        graph_x = [0,5,10,15,20]
        max_y = max(graph_y)
        ticklabels = ['0-4','5-9','10-14','15-19','20+']

        fig = plt.figure(figsize=(4,4),dpi=100)
        ax = fig.add_subplot(111,frame_on=False,xticks=[],yticks=[],position=[.1,0.1,1,0.9])
        ax.bar(graph_x,graph_y,width=4.8, color="#2D4782",linewidth=0)

        for idx, (label_bottom, percent) in enumerate(zip(ticklabels, graph_y)):
            label_top = '%1.0f%%' % percent
            bar_centre = idx*5 + 2.5
            ax.text(bar_centre,-max_y*0.04,label_bottom,horizontalalignment='center',verticalalignment='top',
                    fontsize=12)
            ax.text(bar_centre,max_y*0.06+percent,label_top,horizontalalignment='center',verticalalignment='bottom',
                    fontsize=12)

        ax.set_xlabel("Base Quality")
        ax.set_xlim(0,34.8)
        ax.set_ylim(-0.1*max_y,1.2*max_y)
        fig.patch.set_alpha(0.0)
        fig.savefig(output_png_filename)
        plt.close()

    except:
        printtime('Unable to generate plot %s' % output_png_filename)
        traceback.print_exc()
def runplugins(plugins, env, level=RunLevel.DEFAULT, params=None):
    """Ask the plugin server to launch ``plugins`` at the given run level.

    ``params`` is an optional dict of launch parameters. ``run_mode`` is set
    to ``'pipeline'`` in it unless the caller already provided a value; a
    dict passed in by the caller is updated in place. When ``params`` is
    omitted a fresh dict is created for each call, so nothing leaks from one
    call into the next.

    Returns the plugins dict reported back by the launch call (dependency
    plugins may have been added to it), or the unchanged ``plugins`` argument
    if contacting the plugin server failed. Failures are printed and
    swallowed.
    """
    if params is None:
        params = {}

    printtime("Starting plugins runlevel=%s" % level)
    params.setdefault('run_mode', 'pipeline')  # Plugins launched here come from pipeline
    try:
        pluginserver = xmlrpclib.ServerProxy(
            "http://%s:%d" % (PLUGINSERVER_HOST, PLUGINSERVER_PORT), allow_none=True)
        # call ionPlugin xmlrpc function to launch selected plugins
        # note that dependency plugins may be added to the plugins dict
        plugins, msg = call_launchPluginsXMLRPC(env['primary_key'], plugins, env['net_location'],
                                                env['username'], level, params, pluginserver)
        print(msg)
    except Exception:
        traceback.print_exc()

    return plugins
def generate_ionstats_alignment_cmd(ionstatsArgs, bam_filenames, ionstats_alignment_filename, ionstats_alignment_h5_filename, basecaller_json, library_key, histogram_length):
    """Return the ``ionstats alignment`` command line as a string.

    The command starts with ``ionstatsArgs``, or with the default
    ``ionstats alignment`` (reported through ``printtime``) when that is
    empty. All BAM files form one comma-separated ``-i`` value. A non-empty
    ``basecaller_json`` contributes the chip origin, chip size and
    sub-region size taken from its ``BaseCaller`` section.

    Any exception is printed and re-raised.
    """
    try:
        com = ionstatsArgs if ionstatsArgs else "ionstats alignment"
        if not ionstatsArgs:
            printtime(
                "ERROR: ionstats alignment command not specified, using default: 'ionstats alignment'"
            )

        input_option = " -i %s" % (bam_filenames[0])
        for additional_bam in bam_filenames[1:]:
            input_option += ",%s" % (additional_bam)

        com = com + input_option
        com = com + " -o %s" % (ionstats_alignment_filename)
        com = com + " -k %s" % (library_key)
        com = com + " -h %d" % (int(histogram_length))
        com = com + " --evaluate-hp true"
        com = com + " --output-h5 %s" % ionstats_alignment_h5_filename

        if basecaller_json:
            geometry = basecaller_json["BaseCaller"]
            origin = (geometry['block_col_offset'], geometry['block_row_offset'])
            dimensions = (geometry['block_col_size'], geometry['block_row_size'])
            subregion = generate_ionstats_subregion_dims(dimensions[0], dimensions[1])
            subregion_col_size, subregion_row_size = subregion

            com = com + " --chip-origin %s,%s" % origin
            com = com + " --chip-dim %s,%s" % dimensions
            com = com + " --subregion-dim %s,%s" % (subregion_col_size, subregion_row_size)
    except:
        traceback.print_exc()
        raise
    return com
def alignTFs(basecaller_bam_filename, bam_filename, fasta_path):
    """Align the test-fragment reads with tmap and store them as a BAM file.

    Runs ``tmap mapall`` against ``fasta_path`` and pipes its output into
    ``samtools view``, which writes ``bam_filename``.

    Raises ``subprocess.CalledProcessError`` if tmap exits non-zero, and a
    plain ``Exception('No TF reads found')`` if samtools exits non-zero.
    """
    mapper_cmd = "tmap mapall -n 12 -f %s -r %s -Y -v stage1 map4" % (
        fasta_path, basecaller_bam_filename)
    converter_cmd = "samtools view -Sb -o %s - 2>> /dev/null" % bam_filename
    printtime("DEBUG: Calling '%s | %s':" % (mapper_cmd, converter_cmd))

    mapper = subprocess.Popen(mapper_cmd, stdout=subprocess.PIPE, shell=True)
    converter = subprocess.Popen(converter_cmd, stdin=mapper.stdout, shell=True)
    # Wait for the consumer first, then for the producer
    converter.communicate()
    mapper.communicate()

    if mapper.returncode != 0:
        raise subprocess.CalledProcessError(mapper.returncode, mapper_cmd)

    if converter.returncode == 0:
        return

    # Assumption: samtools view only fails when there are zero reads.
    printtime(
        "Command '%s | %s' failed, presumably because there are no TF reads"
        % (mapper_cmd, converter_cmd))
    raise Exception('No TF reads found')
def beadfind_cmd(beadfindArgs, libKey, tfKey, pathtorawblock, SIGPROC_RESULTS, block_offset_xy):
    """Assemble the bead-find command line and return it as a string.

    ``beadfindArgs`` provides the executable and leading options; when it is
    empty the default ``justBeadFind`` is used and reported through
    ``printtime``. ``block_offset_xy`` is accepted but not used.
    """
    if not beadfindArgs:
        cmd = "justBeadFind"
        printtime(
            "ERROR: Beadfind command not specified, using default: 'justBeadFind'"
        )
    else:
        cmd = beadfindArgs  # e.g /home/user/Beadfind -xyz

    option_fragments = [
        " --librarykey=%s" % (libKey),
        " --tfkey=%s" % (tfKey),
        " --no-subdir",
        " --output-dir=%s" % (SIGPROC_RESULTS),
        # justBeadFind is currently internally deriving the block offset
        # " --block-offset %d,%d" % block_offset_xy
        " %s" % pathtorawblock,
    ]
    cmd += "".join(option_fragments)
    return cmd
def old_aq_length_histogram(ionstats_alignment_filename, output_png_filename, aq_string, color):
    """Plot the read-length histogram of one alignment-quality level.

    Reads ``[aq_string]["read_length_histogram"]`` from the ionstats
    alignment JSON file and draws it as a bar chart in ``color`` into an 8x4
    inch figure saved as ``output_png_filename``. The x axis spans at least
    0..400 and the y axis ends 10 above the tallest bar.

    Errors are logged and swallowed.
    """
    try:
        printtime("DEBUG: Generating plot %s" % output_png_filename)
        with open(ionstats_alignment_filename, "r") as source:
            ionstats_alignment = json.load(source)

        counts = ionstats_alignment[aq_string]["read_length_histogram"]
        lengths = range(len(counts))
        y_limit = max(counts) + 10
        # Never show less than the 0..400 range
        x_limit = len(counts) - 1
        if x_limit < 400:
            x_limit = 400

        axis_caption = "Filtered %s Read Length" % aq_string

        fig = plt.figure(figsize=(8, 4), dpi=100)
        ax = fig.add_subplot(111)
        ax.bar(lengths, counts, facecolor=color, align="center",
               linewidth=0, alpha=1.0, width=1.0)
        ax.set_xlabel(axis_caption)
        ax.set_ylabel("Count")
        ax.set_title(axis_caption)
        ax.set_xlim(0, x_limit)
        ax.set_ylim(0, y_limit)
        fig.savefig(output_png_filename)
        plt.close()

    except Exception:
        printtime("Unable to generate plot %s" % output_png_filename)
        traceback.print_exc()
def alignment_rate_plot(alignStats, ionstats_basecaller_filename, output_png_filename, graph_max_x, y_ticks=None):
    """Return early, after logging an error, when ``alignStats`` does not exist.

    NOTE(review): in the code visible here only ``alignStats`` is read; the
    remaining parameters and the nested helper are never used, so the body
    looks incomplete -- confirm against the full source.
    """
    if not os.path.exists(alignStats):
        printtime("ERROR: %s does not exist" % alignStats)
        return

    # Format an integer with comma thousands separators, e.g. 1234567 -> "1,234,567".
    def intWithCommas(x):
        # Only exact int/long instances are accepted (Python 2 literal 0L).
        if type(x) not in [type(0), type(0L)]:
            raise TypeError("Parameter must be an integer.")
        if x < 0:
            return '-' + intWithCommas(-x)
        result = ''
        # Peel off three digits at a time from the right.
        while x >= 1000:
            x, r = divmod(x, 1000)
            result = ",%03d%s" % (r, result)
        return "%d%s" % (x, result)
def merge_barcoded_basecaller_bams(BASECALLER_RESULTS, basecaller_datasets, method):
    """Merge the per-dataset basecaller BAMs into ``rawlib.basecaller.bam``.

    Nothing is merged when the composite file already exists. Merge failures
    are logged with a traceback and not propagated.

    Args:
        BASECALLER_RESULTS: Directory holding the per-dataset BAM files and
            receiving the composite file.
        basecaller_datasets: Mapping with a ``"datasets"`` list whose items
            carry a ``'basecaller_bam'`` file name.
        method: Passed through to ``blockprocessing.merge_bam_files``.
    """
    # Bound before the try block so the error handler can always name the target.
    composite_bam_filename = os.path.join(BASECALLER_RESULTS, 'rawlib.basecaller.bam')
    try:
        if not os.path.exists(composite_bam_filename):  # TODO
            bam_file_list = []
            for dataset in basecaller_datasets["datasets"]:
                dataset_bam = os.path.join(BASECALLER_RESULTS, dataset['basecaller_bam'])
                # Single-argument form prints the same text on Python 2 and 3.
                print(dataset_bam)
                if os.path.exists(dataset_bam):
                    bam_file_list.append(dataset_bam)

            composite_bai_filepath = ""
            mark_duplicates = False
            blockprocessing.merge_bam_files(bam_file_list, composite_bam_filename,
                                            composite_bai_filepath, mark_duplicates, method)
    except Exception:
        traceback.print_exc()
        printtime("ERROR: Generate merged %s on barcoded run failed" % composite_bam_filename)

    printtime("Finished basecaller barcode merging")
def basecaller_cmd(basecallerArgs, SIGPROC_RESULTS, libKey, tfKey, runID, BASECALLER_RESULTS, block_col_offset, block_row_offset, datasets_pipeline_path, adapter):
    """Assemble the BaseCaller command line and return it as a string.

    Args:
        basecallerArgs: Base command including any user options; when empty,
            ``BaseCaller`` is used and an error is logged.
        SIGPROC_RESULTS: Value for ``--input-dir``.
        libKey: Value for ``--librarykey``.
        tfKey: Value for ``--tfkey``.
        runID: Value for ``--run-id``.
        BASECALLER_RESULTS: Value for ``--output-dir``.
        block_col_offset: First component of ``--block-offset`` (formatted with %d).
        block_row_offset: Second component of ``--block-offset`` (formatted with %d).
        datasets_pipeline_path: Value for ``--datasets``.
        adapter: Value for ``--trim-adapter``.

    Returns:
        The complete command string.
    """
    if not basecallerArgs:
        printtime(
            "ERROR: BaseCaller command not specified, using default: 'BaseCaller'"
        )
        base_command = "BaseCaller"
    else:
        base_command = basecallerArgs

    suffix = "".join([
        " --input-dir=%s" % SIGPROC_RESULTS,
        " --librarykey=%s" % libKey,
        " --tfkey=%s" % tfKey,
        " --run-id=%s" % runID,
        " --output-dir=%s" % BASECALLER_RESULTS,
        " --block-offset %d,%d" % (block_col_offset, block_row_offset),
        " --datasets=%s" % datasets_pipeline_path,
        " --trim-adapter %s" % adapter,
    ])
    return base_command + suffix
def align(libraryName, lib_path, output_dir, output_basename):
    """Run ``alignmentQC.pl`` through the shell.

    Input  -> output_basename.bam
    Output -> output_dir/output_basename.bam

    Args:
        libraryName: Value for ``--genome``.
        lib_path: Value for ``--input``.
        output_dir: Value for ``--output-dir``; the log file
            ``alignmentQC_out.txt`` is placed here as well.
        output_basename: Value for ``--out-base-name``.

    Raises:
        RuntimeError: The command returned a non-zero exit code.
    """
    max_plot_read_len = 400
    command = " ".join([
        "alignmentQC.pl",
        "--logfile %s" % os.path.join(output_dir, "alignmentQC_out.txt"),
        "--output-dir %s" % output_dir,
        "--input %s" % lib_path,
        "--genome %s" % libraryName,
        "--max-plot-read-len %s" % str(int(max_plot_read_len)),
        "--out-base-name %s" % output_basename,
        "--skip-alignStats",
    ])

    printtime("DEBUG: Calling '%s':" % command)
    exit_code = subprocess.call(command, shell=True)
    if exit_code != 0:
        raise RuntimeError('exit code: %d' % exit_code)
def get_barcode_files(parent_folder, datasets_path, bcSetName):
    """Return the barcode BAM files located in ``parent_folder``.

    File names are first taken from the ``legacy_prefix`` entries of the
    datasets json, keeping only files that exist. When that yields nothing
    (for example for older reports), the folder is globbed for
    ``<bcSetName>*_rawlib.bam``, ``nomatch_rawlib.bam`` is added, and the
    list is sorted.

    Args:
        parent_folder: Directory containing the BAM files.
        datasets_path: Path of the datasets json, relative to ``parent_folder``.
        bcSetName: Barcode set name used as the glob prefix in the fallback.

    Returns:
        List of BAM file paths.
    """
    datasetsFile = os.path.join(parent_folder, datasets_path)
    barcode_bams = []
    try:
        with open(datasetsFile, 'r') as f:
            datasets_json = json.load(f)
        for dataset in datasets_json.get("datasets", []):
            bamfile = os.path.join(parent_folder, dataset["legacy_prefix"] + '.bam')
            if os.path.exists(bamfile):
                barcode_bams.append(bamfile)
    except Exception as err:
        # Still best effort (the globbing fallback below takes over), but
        # record why the datasets json was not usable instead of hiding it.
        printtime("DEBUG: unable to get barcode files from %s: %s" % (datasetsFile, err))

    if not barcode_bams:
        barcode_bams = glob(os.path.join(parent_folder, bcSetName + '*_rawlib.bam'))
        barcode_bams.append(os.path.join(parent_folder, 'nomatch_rawlib.bam'))
        barcode_bams.sort()

    printtime("DEBUG: found %i barcodes in %s" % (len(barcode_bams), parent_folder))
    return barcode_bams
def wait_on_jobs(jobIds, jobName, status = "Processing"):
    """Poll the job server until every job in ``jobIds`` has ended.

    A job counts as ended when its status is ``'done'``, ``'failed'`` or
    ``"DRMAA BUG"``. Ended jobs are removed from ``jobIds`` in place, so the
    caller's list is empty when this function returns.

    Args:
        jobIds: Mutable list of job ids to wait for.
        jobName: Name used in the progress message.
        status: Status string reported to the job server before waiting.
    """
    try:
        jobserver.updatestatus(primary_key_file, status, True)
    except Exception:
        traceback.print_exc()

    # wait for job to finish
    while len(jobIds) > 0:
        # Iterate over a snapshot because finished ids are removed from
        # jobIds inside the loop.
        for jid in list(jobIds):
            try:
                jobstatus = jobserver.jobstatus(jid)
            except Exception:
                # Query failed: report it and retry this job on the next pass.
                traceback.print_exc()
                continue

            if jobstatus in ('done', 'failed', "DRMAA BUG"):
                printtime("DEBUG: Job %s has ended with status %s" % (str(jid), jobstatus))
                jobIds.remove(jid)

        printtime("waiting for %s job(s) to finish ..." % jobName)
        time.sleep(10)
def merge_basecaller_bam(dirs, BASECALLER_RESULTS):
    """Merge per-block unmapped BAM files, one composite file per dataset.

    Datasets are read from ``datasets_basecaller.json`` in
    ``BASECALLER_RESULTS``; entries without ``'basecaller_bam'`` are skipped.
    A failure for one dataset is logged and does not stop the others.

    Args:
        dirs: Block directories; each block's BAM is looked up at
            ``<dir>/<BASECALLER_RESULTS>/<basecaller_bam>``.
        BASECALLER_RESULTS: Basecaller results directory (relative to each
            block directory and to the composite location).
    """
    datasets_json_path = os.path.join(BASECALLER_RESULTS, "datasets_basecaller.json")
    try:
        # The context manager closes the file even when parsing fails.
        with open(datasets_json_path, 'r') as f:
            datasets_basecaller = json.load(f)
    except Exception:
        printtime("ERROR: problem parsing %s" % datasets_json_path)
        traceback.print_exc()
        return

    # Iterate over datasets. Could be one for non-barcoded runs or multiple for barcoded runs
    for dataset in datasets_basecaller['datasets']:
        if 'basecaller_bam' not in dataset:
            continue

        ###############################################
        # Merge Per-barcode Unmapped BAMs             #
        ###############################################
        try:
            candidate_bams = [
                os.path.join(block_dir, BASECALLER_RESULTS, dataset['basecaller_bam'])
                for block_dir in dirs
            ]
            block_bam_list = [bam for bam in candidate_bams if os.path.exists(bam)]
            composite_bam_filename = os.path.join(BASECALLER_RESULTS, dataset['basecaller_bam'])
            if block_bam_list:
                blockprocessing.merge_bam_files(block_bam_list, composite_bam_filename,
                                                composite_bam_filename + '.bai', False)
        except Exception:
            printtime("ERROR: merging %s unsuccessful" % dataset['basecaller_bam'])
            # Keep the cause visible; the message alone does not say what went wrong.
            traceback.print_exc()

    ## Note! on barcoded runs, barcode files are NOT subsequently merged into one multi-barcode BAM.
    printtime("Finished merging basecaller BAM files")
def get_plugins_to_run(plugins, report_type):
    """Select the plugins that apply to ``report_type``.

    A plugin without ``'runtypes'`` is selected for FULLCHIP and THUMB
    reports; otherwise it is selected when ``report_type`` is listed in its
    ``'runtypes'``. Selected plugins get ``'runlevels'`` filled in with
    ``[RunLevel.DEFAULT]`` when the entry is missing or empty (the plugin
    dict is modified in place).

    Args:
        plugins: Mapping of plugin name to plugin dict.
        report_type: Run type of the current report.

    Returns:
        Tuple ``(plugins_to_run, blocklevel)``: the selected plugins keyed by
        name, and whether a COMPOSITE report has a plugin with a BLOCK runlevel.
    """
    printtime("Get plugins to run, report type = %s" % report_type)

    chosen = {}
    needs_blocklevel = False
    for name, plugin in plugins.items():
        # default is run on wholechip and thumbnail, but not composite
        is_selected = report_type in [RunType.FULLCHIP, RunType.THUMB]
        if plugin.get('runtypes'):
            is_selected = report_type in plugin['runtypes']

        if not is_selected:
            printtime("Plugin %s (runtypes=%s) is not enabled for %s report" %
                      (plugin['name'], ','.join(plugin.get('runtypes', '')), report_type))
            continue

        plugin['runlevels'] = plugin.get('runlevels') or [RunLevel.DEFAULT]
        printtime("Plugin %s is enabled, runlevels=%s" %
                  (plugin['name'], ','.join(plugin['runlevels'])))
        chosen[name] = plugin

        # check if have any blocklevel plugins
        if report_type == RunType.COMPOSITE and RunLevel.BLOCK in plugin['runlevels']:
            needs_blocklevel = True

    return chosen, needs_blocklevel
def calibrate(dir_recalibration, sampleBAMFile, recalibArgs, chipflow):
    """Build and run the Calibration command through the shell.

    ``--block-offset`` and ``--block-size`` are appended from
    ``chipflow["BaseCaller"]`` only when the base command does not already
    contain them.

    Args:
        dir_recalibration: Output location passed with ``-o``.
        sampleBAMFile: Input BAM passed with ``-i``.
        recalibArgs: Base command; ``"Calibration"`` is used when empty.
        chipflow: Mapping whose ``"BaseCaller"`` entry provides
            ``block_col_offset``, ``block_row_offset``, ``block_col_size``
            and ``block_row_size``.

    Raises:
        RuntimeError: The command returned a non-zero exit code. Any other
            failure is logged and re-raised as well.
    """
    try:
        command = recalibArgs if recalibArgs else "Calibration"

        # default parameters
        basecaller_settings = chipflow["BaseCaller"]
        offset_x = basecaller_settings['block_col_offset']
        offset_y = basecaller_settings['block_row_offset']
        size_x = basecaller_settings['block_col_size']
        size_y = basecaller_settings['block_row_size']

        if "--block-offset" not in command:
            command += " --block-offset %d,%d" % (offset_x, offset_y)
        if "--block-size" not in command:
            command += " --block-size %d,%d" % (size_x, size_y)

        command += " -i %s" % sampleBAMFile
        command += " -o %s" % dir_recalibration

        printtime("DEBUG: Calling '%s':" % command)
        exit_code = subprocess.call(command, shell=True)
        if exit_code != 0:
            raise RuntimeError('Calibration exit code: %d' % exit_code)
        printtime("Calibration generated: %s" % (os.path.join(dir_recalibration, "Calibration.json")))
    except:
        printtime('ERROR: HP training failed')
        traceback.print_exc()
        raise
def QVtable(dir_recalibration, genome_path, sampleBAMFile, xMin, xMax, xCuts, yMin, yMax, yCuts, flowSpan):
    '''Run FlowspaceCalibration.jar to produce a QV table from the mapped sample reads.

    The tool writes ``sample.csv`` and ``sample.flow.csv`` into
    ``dir_recalibration``; its stdout and stderr are redirected to
    ``flowQVtable.log`` in the same directory.

    Raises:
        RuntimeError: The command returned a non-zero exit code. Any failure
            is logged and re-raised.
    '''
    try:
        # Example values: X_MAX=3391 =0 Y_MAX=3791 Y_MIN=0 X_CUTS=1 Y_CUTS=1 FLOW_SPAN=520
        log_path = os.path.join
        command = " ".join([
            "java -jar /usr/local/share/java/FlowspaceCalibration.jar",
            "I=%s" % sampleBAMFile,
            "R=%s" % genome_path,
            "O=%s" % os.path.join(dir_recalibration, 'sample.csv'),
            "F=%s" % os.path.join(dir_recalibration, 'sample.flow.csv'),
            "X_MIN=%d" % xMin,
            "X_MAX=%d" % xMax,
            "X_CUTS=%d" % xCuts,
            "Y_MIN=%d" % yMin,
            "Y_MAX=%d" % yMax,
            "Y_CUTS=%d" % yCuts,
            "FLOW_SPAN=%d" % flowSpan,
            "VALIDATION_STRINGENCY=SILENT NUM_THREADS=16 MAX_QUEUE_SIZE=8192 > %s 2>&1"
            % log_path(dir_recalibration, 'flowQVtable.log'),
        ])

        printtime("DEBUG: Calling '%s':" % command)
        exit_code = subprocess.call(command, shell=True)
        if exit_code != 0:
            raise RuntimeError('Flow QV table exit code: %d' % exit_code)
        printtime("Finished flow QV table")
    except:
        printtime('ERROR: flow QV table failed')
        raise
def merge_barcoded_alignment_bams(ALIGNMENT_RESULTS, basecaller_datasets, method):
    """Merge the per-dataset aligned BAMs into ``rawlib.bam``.

    A dataset whose BAM file does not exist is left out with a warning.
    Merge failures are logged with a traceback and not propagated.

    Args:
        ALIGNMENT_RESULTS: Directory holding the per-dataset BAM files and
            receiving the composite file and its ``.bai`` index.
        basecaller_datasets: Mapping with a ``"datasets"`` list whose items
            carry a ``'file_prefix'``; only its base name is used.
        method: Passed through to ``blockprocessing.merge_bam_files``.
    """
    # Bound before the try block so the error handler can always name the target.
    composite_bam_filename = os.path.join(ALIGNMENT_RESULTS, 'rawlib.bam')
    try:
        bam_file_list = []
        for dataset in basecaller_datasets["datasets"]:
            bam_name = os.path.join(
                ALIGNMENT_RESULTS,
                os.path.basename(dataset['file_prefix']) + '.bam')
            if os.path.exists(bam_name):
                bam_file_list.append(bam_name)
            else:
                printtime("WARNING: exclude %s from merging into %s" %
                          (bam_name, composite_bam_filename))

        composite_bai_filename = composite_bam_filename + '.bai'
        mark_duplicates = False
        blockprocessing.merge_bam_files(bam_file_list, composite_bam_filename,
                                        composite_bai_filename, mark_duplicates, method)
    except Exception:
        traceback.print_exc()
        printtime("ERROR: Generate merged %s on barcoded run failed" % composite_bam_filename)

    printtime("Finished barcode merging of %s" % ALIGNMENT_RESULTS)
def read_length_sparkline(ionstats_basecaller_filename, output_png_filename, max_length):
    """Render a small read-length bar chart (5-wide bins) and save it as a PNG.

    Read lengths beyond the last bin are accumulated in the last bin.
    Failures are logged with a traceback and never propagated.

    Args:
        ionstats_basecaller_filename: JSON file holding
            ``['full']['read_length_histogram']``.
        output_png_filename: Destination of the rendered figure.
        max_length: Upper end of the x axis; bins start at 0, 5, 10, ...
    """
    fig = None
    try:
        printtime("DEBUG: Generating plot %s" % output_png_filename)

        # The context manager closes the file even when json.load() fails.
        with open(ionstats_basecaller_filename, 'r') as f:
            ionstats_basecaller = json.load(f)

        histogram_x = range(0, max_length, 5)
        num_bins = len(histogram_x)
        histogram_y = [0] * num_bins

        for read_length, frequency in enumerate(ionstats_basecaller['full']['read_length_histogram']):
            # Floor division keeps the bin index an integer on Python 2 and 3.
            current_bin = min(read_length // 5, num_bins - 1)
            histogram_y[current_bin] += frequency

        # Never let the y range collapse to zero height.
        max_y = max(max(histogram_y), 1)

        fig = plt.figure(figsize=(3, 0.3), dpi=100)
        ax = fig.add_subplot(111, frame_on=False, xticks=[], yticks=[], position=[0, 0, 1, 1])

        ax.bar(histogram_x, histogram_y, width=6.5, color="#2D4782", linewidth=0, zorder=2)

        # A label every 50 units, with a grid line split above and below it.
        for idx in range(0, max_length, 50):
            label_bottom = str(idx)
            ax.text(idx, max_y * 0.70, label_bottom, horizontalalignment='center',
                    verticalalignment='center', fontsize=8, zorder=1)
            ax.axvline(x=idx, color='#D0D0D0', ymax=0.5, zorder=0)
            ax.axvline(x=idx, color='#D0D0D0', ymin=0.9, zorder=0)

        ax.set_ylim(0, max_y)
        ax.set_xlim(-10, max_length)
        fig.patch.set_alpha(0.0)
        fig.savefig(output_png_filename)
    except Exception:
        printtime('Unable to generate plot %s' % output_png_filename)
        traceback.print_exc()
    finally:
        # Figures created through the pyplot interface are retained until
        # explicitly closed and may consume too much memory; close on failure too.
        if fig is not None:
            plt.close(fig)
def generate_raw_data_traces(libKey, tfKey, floworder, SIGPROC_RESULTS, plot_output_dir=os.getcwd()):
    """Plot the raw key traces for the test-fragment key and the library key.

    For each key, ``avgNukeTrace_<key>.txt`` is read from ``SIGPROC_RESULTS``;
    the peak values are dumped to ``raw_peak_signal`` and the plot is written
    into ``plot_output_dir``. A missing trace file or a plotting failure is
    logged and does not affect the other key.

    NOTE: the ``plot_output_dir`` default is evaluated once, when the function
    is defined, not on each call.

    Args:
        libKey: Library key sequence.
        tfKey: Test-fragment key sequence.
        floworder: Passed to ``plotKey.KeyPlot``.
        SIGPROC_RESULTS: Directory containing the avgNukeTrace files.
        plot_output_dir: Directory receiving the plots and ``raw_peak_signal``.
    """
    printtime("Generate Raw Data Traces for lib and TF keys " +
              "(iontrace_Test_Fragment.png, iontrace_Library.png) " +
              "and raw_peak_signal file")

    tfRawPath = os.path.join(SIGPROC_RESULTS, "avgNukeTrace_%s.txt" % tfKey)
    libRawPath = os.path.join(SIGPROC_RESULTS, "avgNukeTrace_%s.txt" % libKey)
    peakOut = "raw_peak_signal"

    # (key, plot title, trace file, message logged when rendering fails)
    jobs = (
        (tfKey, "Test Fragment", tfRawPath, "TF key graph didn't render"),
        (libKey, "Library", libRawPath, "Lib key graph didn't render"),
    )
    for key, title, trace_path, failure_message in jobs:
        if not os.path.exists(trace_path):
            printtime("ERROR: %s is missing" % trace_path)
            continue
        try:
            key_plot = plotKey.KeyPlot(key, floworder, title)
            key_plot.parse(trace_path)
            key_plot.dump_max(os.path.join(plot_output_dir, peakOut))
            key_plot.plot(outdir=plot_output_dir)
        except Exception:
            printtime(failure_message)
            traceback.print_exc()
def merge_alignment_bigdata(dirs, BASECALLER_RESULTS, ALIGNMENT_RESULTS, mark_duplicates):
    """Merge per-block aligned BAM files, one composite file per dataset.

    Datasets are read from ``datasets_basecaller.json`` in
    ``BASECALLER_RESULTS``. A failure for one dataset is logged and does not
    stop the others.

    Args:
        dirs: Block directories; each block's BAM is looked up at
            ``<dir>/<ALIGNMENT_RESULTS>/<file_prefix>.bam``.
        BASECALLER_RESULTS: Directory containing ``datasets_basecaller.json``.
        ALIGNMENT_RESULTS: Alignment results directory (relative to each
            block directory and to the composite location).
        mark_duplicates: Passed through to ``blockprocessing.merge_bam_files``.
    """
    datasets_json_path = os.path.join(BASECALLER_RESULTS, "datasets_basecaller.json")
    try:
        # The context manager closes the file even when parsing fails.
        with open(datasets_json_path, 'r') as f:
            datasets_json = json.load(f)
    except Exception:
        printtime("ERROR: problem parsing %s" % datasets_json_path)
        traceback.print_exc()
        return

    for dataset in datasets_json['datasets']:
        # Merge BAMs
        try:
            bam_name = dataset['file_prefix'] + '.bam'
            candidate_bams = [
                os.path.join(block_dir, ALIGNMENT_RESULTS, bam_name)
                for block_dir in dirs
            ]
            block_bam_list = [bam for bam in candidate_bams if os.path.exists(bam)]
            composite_bam_filename = os.path.join(ALIGNMENT_RESULTS, bam_name)
            if block_bam_list:
                blockprocessing.merge_bam_files(block_bam_list, composite_bam_filename,
                                                composite_bam_filename + '.bai', mark_duplicates)
        except Exception:
            printtime("ERROR: merging %s unsuccessful" % (dataset['file_prefix'] + '.bam'))
            # Keep the cause visible; the message alone does not say what went wrong.
            traceback.print_exc()
def wait_on_jobs(jobIds, jobName, status="Processing", max_running_jobs=0):
    """Poll the job server until at most ``max_running_jobs`` jobs remain.

    A job counts as ended when its status is ``"done"``, ``"failed"`` or
    ``"DRMAA BUG"``. Ended jobs are removed from ``jobIds`` in place.

    Args:
        jobIds: Mutable list of job ids to watch.
        jobName: Name used in the progress message.
        status: Status string reported to the job server before waiting.
        max_running_jobs: Return as soon as no more than this many jobs are
            still listed in ``jobIds``.
    """
    try:
        jobserver.updatestatus(primary_key_file, status, True)
    except Exception:
        traceback.print_exc()

    # wait for job to finish
    while len(jobIds) > max_running_jobs:
        printtime("waiting for %s job(s) to finish ..." % jobName)
        # Iterate over a snapshot: removing from the list being iterated
        # would skip the entry that follows each removed job.
        for jobid in list(jobIds):
            try:
                jobstatus = jobserver.jobstatus(jobid)
            except Exception:
                # Query failed: report it and retry this job on the next pass.
                traceback.print_exc()
                continue

            if jobstatus in ("done", "failed", "DRMAA BUG"):
                printtime("DEBUG: Job %s has ended with status %s" %
                          (str(jobid), jobstatus))
                jobIds.remove(jobid)

        time.sleep(20)
def generate_ionstats_subregion_dims(block_col_size, block_row_size):
    """Return the ``(subregion_col_size, subregion_row_size)`` for a block size.

    Known block dimensions map to fixed subregion sizes; any other input
    yields ``(92, 74)``. If the comparison itself fails, the error is logged
    and ``None`` is returned.
    """
    # (block_col_size, block_row_size, (subregion_col_size, subregion_row_size))
    known_layouts = (
        (1200, 800, (50, 50)),         # Thumbnail
        (30912, 21296, (368, 296)),    # P2
        (2576, 2662, (368, 296)),      # P2
        (15456, 10656, (184, 148)),    # P1
        (1288, 1332, (184, 148)),      # P1
        (7680, 5312, (80, 83)),        # P0
        (640, 664, (80, 83)),          # P0
        (3392, 3792, (53, 48)),        # 318
        (3392, 2120, (53, 53)),        # 316v2
        (2736, 2640, (48, 48)),        # 316
        (1280, 1152, (40, 48)),        # 314
    )
    try:
        for cols, rows, subregion_dims in known_layouts:
            if block_col_size == cols and block_row_size == rows:
                return subregion_dims
        # Unrecognised block size: fall back to the default subregion size.
        return (92, 74)
    except Exception:
        printtime("ERROR: Failed to generate subregion dims from input %s,%s" %
                  (block_col_size, block_row_size))
        traceback.print_exc()
def merge_inlinecontrol_json(dirs, BASECALLER_RESULTS):
    """Merge the per-block ``inline_control_stats.json`` files.

    Every block directory under ``BASECALLER_RESULTS`` that contains an
    ``inline_control_stats.json`` is handed to ``merge``; the others are
    skipped with a warning. Failures are logged and not propagated.

    Args:
        dirs: Block sub-directory names, relative to ``BASECALLER_RESULTS``.
        BASECALLER_RESULTS: Base directory; also passed to ``merge`` as its
            second argument.
    """
    printtime("Merging inline_control_stats.json files")
    print(dirs)

    try:
        block_dirs_with_stats = []
        for block in dirs:
            block_dir = os.path.join(BASECALLER_RESULTS, block)
            printtime("DEBUG: %s:" % block_dir)

            stats_path = os.path.join(block_dir, "inline_control_stats.json")
            if not os.path.exists(stats_path):
                printtime(
                    "Warning: Merging inline_control_stats.json files: skipped %s"
                    % stats_path)
                continue
            block_dirs_with_stats.append(block_dir)

        merge(block_dirs_with_stats, BASECALLER_RESULTS)
    except Exception:
        traceback.print_exc()
        printtime("Merging inline_control_stats.json files failed")

    printtime("Finished merging inline control stats")
def merge_basecaller_json(dirs, BASECALLER_RESULTS):
    """Merge the per-block ``BaseCaller.json`` files.

    Blocks flagged by ``isbadblock`` are ignored; blocks without a
    ``BaseCaller.json`` are skipped with an error message. The remaining
    block directories are handed to ``mergeBaseCallerJson.merge``. Failures
    are logged and not propagated.

    Args:
        dirs: Block sub-directory names, relative to ``BASECALLER_RESULTS``.
        BASECALLER_RESULTS: Base directory; also passed to the merge call as
            its second argument.
    """
    printtime("Merging BaseCaller.json files")

    try:
        mergeable_block_dirs = []
        for block in dirs:
            block_dir = os.path.join(BASECALLER_RESULTS, block)
            printtime("DEBUG: %s:" % block_dir)

            if isbadblock(block_dir, "Merging BaseCaller.json files"):
                continue

            json_path = os.path.join(block_dir, 'BaseCaller.json')
            if not os.path.exists(json_path):
                printtime("ERROR: Merging BaseCaller.json files: skipped %s" % json_path)
                continue
            mergeable_block_dirs.append(block_dir)

        mergeBaseCallerJson.merge(mergeable_block_dirs, BASECALLER_RESULTS)
    except:
        traceback.print_exc()
        printtime("Merging BaseCaller.json files failed")

    printtime("Finished merging basecaller stats")
def mergeAvgNukeTraces(dirs, SIGPROC_RESULTS, key, beads, from_rawdata=False):
    """Merge per-block ``avgNukeTrace_<key>.txt`` files into one weighted average.

    Each block's trace is weighted by the integer read from option ``beads``
    in the ``[global]`` section of that block's ``bfmask.stats``. Blocks
    without a trace file, or whose inputs cannot be read, are skipped.
    Failures are logged and not propagated.

    Args:
        dirs: Block directory names.
        SIGPROC_RESULTS: Signal-processing results directory; the merged file
            is written here.
        key: Key sequence selecting the trace file name.
        beads: Option name in ``bfmask.stats`` that holds the block's weight.
        from_rawdata: When True, block inputs are read from
            ``<SIGPROC_RESULTS>/<block>``; otherwise from
            ``<block>/<SIGPROC_RESULTS>``.
    """
    #
    # Merging avgNukeTrace_*.txt files
    #
    printtime("Merging avgNukeTrace_*.txt files")

    trace_filename = "avgNukeTrace_%s.txt" % key
    # Bound before the try block so the error handler can always name the target.
    output_trace_file = os.path.join(SIGPROC_RESULTS, trace_filename)

    try:
        sumAvgNukeTraceData = None
        sumWells = 0
        labels = None

        for subdir in dirs:
            if from_rawdata:
                # from rawdata: onboard_results/sigproc_results/block_X0_Y0/ in rawdata
                input_basedir = os.path.join(SIGPROC_RESULTS, subdir)
            else:
                # original format: block_X0_Y0/sigproc_results/
                # from results from RUO pipeline
                input_basedir = os.path.join(subdir, SIGPROC_RESULTS)

            try:
                input_trace_file = os.path.join(input_basedir, trace_filename)
                if not os.path.exists(input_trace_file):
                    continue

                # Use a fresh parser for every block: read() silently ignores
                # a missing file, so a shared parser would hand this block
                # the well count of a previously read block.
                config = ConfigParser.RawConfigParser()
                config.read(os.path.join(input_basedir, "bfmask.stats"))
                wells = config.getint("global", beads)

                block_labels = numpy.genfromtxt(
                    input_trace_file, delimiter=" ", usecols=[0], dtype=str)
                currentAvgNukeTraceData = numpy.genfromtxt(
                    input_trace_file, delimiter=" ")[:, 1:]
            except Exception:
                traceback.print_exc()
                continue

            labels = block_labels
            if sumAvgNukeTraceData is None:
                sumAvgNukeTraceData = currentAvgNukeTraceData * wells
            else:
                sumAvgNukeTraceData += currentAvgNukeTraceData * wells
            sumWells += wells

        if sumAvgNukeTraceData is None:
            # Nothing could be read from any block; there is nothing to average.
            printtime("ERROR: Merging %s failed: no usable input traces found"
                      % output_trace_file)
        else:
            AvgNukeTraceData = sumAvgNukeTraceData / sumWells
            AvgNukeTraceTable = numpy.column_stack(
                (labels, AvgNukeTraceData.astype("|S10")))
            numpy.savetxt(output_trace_file, AvgNukeTraceTable, fmt="%s")
    except Exception:
        traceback.print_exc()
        printtime("ERROR: Merging %s failed" % output_trace_file)

    printtime("Finished mergeAvgNukeTraces")
def buildTFReference(tfreffasta_filename, analysis_dir, tfkey):
    '''Build the DefaultTFs.fasta from DefaultTFs.conf.

    ``DefaultTFs.conf`` is taken from ``analysis_dir`` when present, otherwise
    from ``/opt/ion/config``. Lines starting with ``#`` and lines that do not
    have exactly three comma-separated fields are ignored; of the rest, only
    lines whose second field equals ``tfkey`` are written, using the first
    field as the FASTA header and the third as the sequence.

    Args:
        tfreffasta_filename: FASTA file to create.
        analysis_dir: Directory searched first for ``DefaultTFs.conf``.
        tfkey: Key that a line's second field must equal.

    Raises:
        IOError: ``DefaultTFs.conf`` was not found in either location.
        NoTFDataException: No line matched ``tfkey``.
    '''
    system_conf_path = '/opt/ion/config/DefaultTFs.conf'
    DefaultTFconfPath = os.path.join(analysis_dir, 'DefaultTFs.conf')
    if not os.path.exists(DefaultTFconfPath):
        if not os.path.exists(system_conf_path):
            printtime(
                'ERROR: could not locate DefaultTFs.conf (tried %s and /opt/ion/config/DefaultTFs.conf)'
                % DefaultTFconfPath)
            raise IOError('could not locate DefaultTFs.conf')
        DefaultTFconfPath = system_conf_path

    printtime('TFPipeline: Using TF sequences from %s' % DefaultTFconfPath)

    num_tfs = 0
    try:
        # Context managers close both files even when a read or write fails.
        with open(DefaultTFconfPath, 'r') as confFile:
            with open(tfreffasta_filename, 'w') as fastaFile:
                for confLine in confFile:
                    if confLine.startswith('#'):
                        continue
                    confEntries = confLine.split(',')
                    if len(confEntries) != 3:
                        continue
                    if confEntries[1] != tfkey:
                        continue
                    fastaFile.write('>%s\n' % confEntries[0])
                    fastaFile.write('%s\n' % confEntries[2].strip())
                    num_tfs += 1
    except Exception:
        printtime("ERROR: failed convert %s into %s" %
                  (DefaultTFconfPath, tfreffasta_filename))
        # Bare raise keeps the original traceback.
        raise

    if num_tfs == 0:
        printtime("No suitable TFs with key %s found in %s" %
                  (tfkey, DefaultTFconfPath))
        raise NoTFDataException('No TF reference sequences')
def beadfind(beadfindArgs, libKey, tfKey, pathtorawblock, SIGPROC_RESULTS):
    """Run the bead-finding executable and return its exit status.

    The child's stdout and stderr are captured, echoed to this process's
    stdout and stderr, and appended to ``sigproc.log`` in ``SIGPROC_RESULTS``.
    A failure to write the log is reported but does not change the result.

    Args:
        beadfindArgs: Base command including any user options; when empty,
            ``justBeadFind`` is used and an error is logged.
        libKey: Value for ``--librarykey``.
        tfKey: Value for ``--tfkey``.
        pathtorawblock: Appended as the final positional argument.
        SIGPROC_RESULTS: Value for ``--output-dir``; also the log location.

    Returns:
        The return code of the child process.
    """
    if not beadfindArgs:
        printtime(
            "ERROR: Beadfind command not specified, using default: 'justBeadFind'"
        )
        command = "justBeadFind"
    else:
        command = beadfindArgs  # e.g /home/user/Beadfind -xyz

    command = command + "".join([
        " --librarykey=%s" % libKey,
        " --tfkey=%s" % tfKey,
        " --no-subdir",
        " --output-dir=%s" % SIGPROC_RESULTS,
        " %s" % pathtorawblock,
    ])
    printtime("Beadfind command: " + command)

    child = subprocess.Popen(shlex.split(command.encode('utf8')),
                             shell=False,
                             stderr=subprocess.PIPE,
                             stdout=subprocess.PIPE)
    captured_out, captured_err = child.communicate()
    exit_status = child.returncode

    sys.stdout.write("%s" % captured_out)
    sys.stderr.write("%s" % captured_err)

    # Ion Reporter
    try:
        with open(os.path.join(SIGPROC_RESULTS, 'sigproc.log'), 'a') as log_file:
            for captured in (captured_out, captured_err):
                if captured:
                    log_file.write(captured)
    except IOError:
        traceback.print_exc()

    return exit_status
def mergeRawPeakSignals(dirs):
    """Merge the per-block ``raw_peak_signal`` files into ``raw_peak_signal``.

    Blocks flagged by ``isbadblock`` are ignored; blocks without a
    ``raw_peak_signal`` file are skipped with an error message. Any failure
    is reported with a single log line and not propagated.

    Args:
        dirs: Block directories to collect ``raw_peak_signal`` files from.
    """
    ###############################################
    # Merge raw_peak_signal files                 #
    ###############################################
    printtime("Merging raw_peak_signal files")

    try:
        collected = []
        for block_dir in dirs:
            printtime("DEBUG: %s:" % block_dir)
            if isbadblock(block_dir, "Merging raw_peak_signal files"):
                continue

            candidate = os.path.join(block_dir, 'raw_peak_signal')
            if not os.path.exists(candidate):
                printtime("ERROR: Merging raw_peak_signal files: skipped %s" % candidate)
                continue
            collected.append(candidate)

        blockprocessing.merge_raw_key_signals(collected, "raw_peak_signal")
    except:
        printtime("Merging raw_peak_signal files failed")

    printtime("Finished mergeRawPeakSignals")
def align(referenceName, lib_path, bidirectional, mark_duplicates, realign, skip_sorting, aligner_opts_extra, logfile, output_dir, output_basename):
    """Run ``alignmentQC.py`` through the shell.

    Input  -> output_basename.bam
    Output -> output_dir/output_basename.bam

    Args:
        referenceName: Value for ``--genome``.
        lib_path: Value for ``--input``.
        bidirectional: Adds ``--bidirectional`` when true.
        mark_duplicates: Adds ``--mark-duplicates`` when true.
        realign: Adds ``--realign`` when true.
        skip_sorting: Adds ``--skip-sorting`` when true.
        aligner_opts_extra: When non-empty, passed double-quoted with
            ``--aligner-opts-extra``.
        logfile: Value for ``--logfile``.
        output_dir: Value for ``--output-dir``.
        output_basename: Value for ``--out-base-name``.

    Raises:
        RuntimeError: The command returned a non-zero exit code.
    """
    # Options deliberately not passed at the moment: --max-plot-read-len,
    # --skip-alignStats, --threads, --server-key.
    pieces = [
        "alignmentQC.py",
        "--logfile %s" % logfile,
        "--output-dir %s" % output_dir,
        "--input %s" % lib_path,
        "--genome %s" % referenceName,
        "--out-base-name %s" % output_basename,
    ]
    if realign:
        pieces.append("--realign")
    if skip_sorting:
        pieces.append("--skip-sorting")
    if bidirectional:
        pieces.append('--bidirectional')
    if aligner_opts_extra:
        pieces.append('--aligner-opts-extra "%s"' % aligner_opts_extra)
    if mark_duplicates:
        pieces.append('--mark-duplicates')
    command = " ".join(pieces)

    printtime("DEBUG: Calling '%s':" % command)
    exit_code = subprocess.call(command, shell=True)
    if exit_code != 0:
        printtime("ERROR: alignmentQC.py failed, return code: %d" % exit_code)
        raise RuntimeError('exit code: %d' % exit_code)
def read_length_histogram(ionstats_basecaller_filename, output_png_filename, max_length):
    """Render the read-length histogram (one bin per length) and save it as a PNG.

    Read lengths of ``max_length`` or more are not plotted. Failures are
    logged with a traceback and never propagated.

    Args:
        ionstats_basecaller_filename: JSON file holding
            ``['full']['read_length_histogram']``.
        output_png_filename: Destination of the rendered figure.
        max_length: Number of bins; bins cover lengths 0 .. max_length-1.
    """
    fig = None
    try:
        printtime("DEBUG: Generating plot %s" % output_png_filename)

        # The context manager closes the file even when json.load() fails.
        with open(ionstats_basecaller_filename, 'r') as f:
            ionstats_basecaller = json.load(f)

        histogram_x = range(0, max_length, 1)
        num_bins = len(histogram_x)
        histogram_y = [0] * num_bins

        for read_length, frequency in enumerate(ionstats_basecaller['full']['read_length_histogram']):
            # Lengths past the last bin are dropped rather than accumulated.
            if read_length < num_bins:
                histogram_y[read_length] += frequency

        # Never let the y range collapse to zero height.
        max_y = max(max(histogram_y), 1)

        fig = plt.figure(figsize=(4, 3.5), dpi=100)
        ax = fig.add_subplot(111, frame_on=False, yticks=[], position=[0, 0.15, 1, 0.88])
        ax.bar(histogram_x, histogram_y, width=2.5, color="#2D4782", linewidth=0, zorder=2)

        ax.set_ylim(0, 1.2 * max_y)
        ax.set_xlim(-5, max_length + 15)
        ax.set_xlabel("Read Length")
        fig.patch.set_alpha(0.0)
        fig.savefig(output_png_filename)
    except Exception:
        printtime('Unable to generate plot %s' % output_png_filename)
        traceback.print_exc()
    finally:
        # pyplot keeps figures alive until closed; release it on failure too.
        if fig is not None:
            plt.close(fig)
def update_bfmask_artifacts(bfmaskPath, bfmaskstatspath, outputdir, plot_title):
    """Generate the bead density heat map via ``beadDensityPlot.genHeatmap``.

    Errors are logged and never propagated: an ``IOError`` produces a single
    log line, any other exception additionally prints a traceback.

    Args:
        bfmaskPath: Passed to ``genHeatmap`` as its first argument.
        bfmaskstatspath: Passed to ``genHeatmap`` as its second argument.
        outputdir: Passed to ``genHeatmap`` as its third argument.
        plot_title: Passed to ``genHeatmap`` as its fourth argument.
    """
    printtime("Make Bead Density Plots")
    try:
        beadDensityPlot.genHeatmap(bfmaskPath, bfmaskstatspath, outputdir, plot_title)
    except Exception as failure:
        if isinstance(failure, IOError):
            # File problems are expected often enough to skip the traceback.
            printtime("Bead Density Plot file error: %s" % failure)
        else:
            printtime("Bead Density Plot generation failure: %s" % failure)
            traceback.print_exc()