def mergeBasecallerResults(dirs, QualityPath, merged_bead_mask_path, floworder, libsff, tfsff, BASECALLER_RESULTS):
    """Merge per-block basecaller outputs into run-level results.

    Merges each block's quality.summary into a single file at QualityPath,
    merges TF metrics and BaseCaller.json, and concatenates the per-block
    library and test-fragment SFF files with SFFProtonMerge.

    Args:
        dirs: list of block subdirectory names (relative to BASECALLER_RESULTS).
        QualityPath: output path for the merged quality.summary.
        merged_bead_mask_path: unused here; kept for interface compatibility.
        floworder: flow order string, forwarded to TFPipeline.mergeBlocks.
        libsff: output path for the merged library SFF.
        tfsff: output path for the merged test-fragment SFF.
        BASECALLER_RESULTS: root directory holding the block subdirectories.

    Each merge step is best-effort: failures are logged via printtime and
    do not abort the remaining steps.
    """
    ############################################
    # Merge individual quality.summary files   #
    ############################################
    printtime("Merging individual quality.summary files")

    config_out = ConfigParser.RawConfigParser()
    config_out.optionxform = str  # don't convert to lowercase
    config_out.add_section('global')

    # Counters: summed across blocks.
    numberkeys = ['Number of 50BP Reads',
                  'Number of 100BP Reads',
                  'Number of 150BP Reads',
                  'Number of Reads at Q0',
                  'Number of Bases at Q0',
                  'Number of 50BP Reads at Q0',
                  'Number of 100BP Reads at Q0',
                  'Number of 150BP Reads at Q0',
                  'Number of Reads at Q17',
                  'Number of Bases at Q17',
                  'Number of 50BP Reads at Q17',
                  'Number of 150BP Reads at Q17',
                  'Number of 100BP Reads at Q17',
                  'Number of Reads at Q20',
                  'Number of Bases at Q20',
                  'Number of 50BP Reads at Q20',
                  'Number of 100BP Reads at Q20',
                  'Number of 150BP Reads at Q20']

    # Maxima: the largest value seen across blocks wins.
    maxkeys = ['Max Read Length at Q0',
               'Max Read Length at Q17',
               'Max Read Length at Q20']

    # Means: accumulated as a running sum of value/len(dirs).
    # NOTE(review): divides by len(dirs) even when some blocks are bad or
    # missing, which biases the mean low in that case — confirm intended.
    meankeys = ['System SNR',
                'Mean Read Length at Q0',
                'Mean Read Length at Q17',
                'Mean Read Length at Q20']

    config_in = MyConfigParser()
    config_in.optionxform = str  # don't convert to lowercase
    doinit = True
    for i, subdir in enumerate(dirs):
        if isbadblock(subdir, "Merging quality.summary"):
            continue
        summaryfile = os.path.join(BASECALLER_RESULTS, subdir, 'quality.summary')
        if os.path.exists(summaryfile):
            printtime("INFO: process %s" % summaryfile)
            config_in.read(summaryfile)

            for key in numberkeys:
                value_in = config_in.get('global', key)
                if doinit:
                    value_out = 0
                else:
                    value_out = config_out.get('global', key)
                config_out.set('global', key, int(value_in) + int(value_out))

            for key in maxkeys:
                value_in = config_in.get('global', key)
                if doinit:
                    value_out = 0
                else:
                    value_out = config_out.get('global', key)
                config_out.set('global', key, max(int(value_in), int(value_out)))

            for key in meankeys:
                value_in = config_in.get('global', key)
                if doinit:
                    value_out = 0
                else:
                    value_out = config_out.get('global', key)
                config_out.set('global', key, float(value_out) + float(value_in) / len(dirs))

            doinit = False
        else:
            printtime("ERROR: skipped %s" % summaryfile)

    with open(QualityPath, 'wb') as configfile:
        config_out.write(configfile)

    ##################################################
    #generate TF Metrics                             #
    #look for both keys and append same file         #
    ##################################################
    printtime("Merging TFMapper metrics and generating TF plots")
    try:
        TFPipeline.mergeBlocks(BASECALLER_RESULTS, dirs, floworder)
    except Exception:
        # Best-effort: TF plots are non-critical, continue with other merges.
        printtime("ERROR: Merging TFMapper metrics failed")

    ###############################################
    # Merge BaseCaller.json files                 #
    ###############################################
    printtime("Merging BaseCaller.json files")
    try:
        basecallerfiles = []
        for subdir in dirs:
            subdir = os.path.join(BASECALLER_RESULTS, subdir)
            printtime("DEBUG: %s:" % subdir)
            if isbadblock(subdir, "Merging BaseCaller.json files"):
                continue
            basecallerjson = os.path.join(subdir, 'BaseCaller.json')
            if os.path.exists(basecallerjson):
                # The merge helper takes the block directory, not the json path.
                basecallerfiles.append(subdir)
            else:
                printtime("ERROR: Merging BaseCaller.json files: skipped %s" % basecallerjson)
        mergeBaseCallerJson.merge(basecallerfiles, BASECALLER_RESULTS)
    except Exception:
        printtime("Merging BaseCaller.json files failed")

    ########################################
    # Merge individual block SFF files     #
    ########################################
    printtime("Merging Library SFF files")
    try:
        cmd = 'SFFProtonMerge'
        cmd = cmd + ' -i rawlib.sff'
        cmd = cmd + ' -o %s ' % libsff
        for subdir in dirs:
            subdir = os.path.join(BASECALLER_RESULTS, subdir)
            if isbadblock(subdir, "Merging Library SFF files"):
                continue
            rawlibsff = os.path.join(subdir, 'rawlib.sff')
            if os.path.exists(rawlibsff):
                # SFFProtonMerge expects block directories; -i gives the filename.
                cmd = cmd + ' %s' % subdir
            else:
                printtime("ERROR: skipped %s" % rawlibsff)
        printtime("DEBUG: Calling '%s'" % cmd)
        subprocess.call(cmd, shell=True)
    except Exception:
        printtime("SFFProtonMerge failed (library)")

    printtime("Merging Test Fragment SFF files")
    try:
        cmd = 'SFFProtonMerge'
        cmd = cmd + ' -i rawtf.sff'
        cmd = cmd + ' -o %s ' % tfsff
        for subdir in dirs:
            subdir = os.path.join(BASECALLER_RESULTS, subdir)
            if isbadblock(subdir, "Merging Test Fragment SFF files"):
                continue
            rawtfsff = os.path.join(subdir, 'rawtf.sff')
            if os.path.exists(rawtfsff):
                cmd = cmd + ' %s' % subdir
            else:
                printtime("ERROR: skipped %s" % rawtfsff)
        printtime("DEBUG: Calling '%s'" % cmd)
        subprocess.call(cmd, shell=True)
    except Exception:
        printtime("SFFProtonMerge failed (test fragments)")
def mergeAlignStatsResults(input_prefix_list, output_prefix):
    """Merge per-prefix alignment.summary and alignTable.txt files.

    NOTE(review): this definition is dead code — it is shadowed by a later
    redefinition of mergeAlignStatsResults in this same file (which also
    merges 'Total Mapped Reads' / 'Total Mapped Target Bases'). Kept as-is;
    consider deleting.

    Args:
        input_prefix_list: path prefixes; '<prefix>alignment.summary' and
            '<prefix>alignTable.txt' are read for each.
        output_prefix: prefix for the merged output files.
    """
    ############################################
    # Merge individual alignment.summary files #
    ############################################
    printtime("Merging individual alignment.summary files")
    config_out = ConfigParser.RawConfigParser()
    config_out.optionxform = str  # don't convert to lowercase
    config_out.add_section('global')
    quallist = ['Q7', 'Q10', 'Q17', 'Q20', 'Q30', 'Q47']
    bplist = [50, 100, 150, 200, 250, 300, 350, 400]
    # Keys copied verbatim from the last input read (not merged).
    fixedkeys = ['Genome', 'Genome Version', 'Index Version', 'Genomesize']
    # Keys summed across inputs.
    numberkeys = ['Total number of Reads',
                  'Filtered Mapped Bases in Q7 Alignments',
                  'Filtered Mapped Bases in Q10 Alignments',
                  'Filtered Mapped Bases in Q17 Alignments',
                  'Filtered Mapped Bases in Q20 Alignments',
                  'Filtered Mapped Bases in Q30 Alignments',
                  'Filtered Mapped Bases in Q47 Alignments',
                  'Filtered Q7 Alignments',
                  'Filtered Q10 Alignments',
                  'Filtered Q17 Alignments',
                  'Filtered Q20 Alignments',
                  'Filtered Q30 Alignments',
                  'Filtered Q47 Alignments']
    for q in quallist:
        for bp in bplist:
            numberkeys.append('Filtered %s%s Reads' % (bp, q))
    # Keys merged by taking the maximum across inputs.
    maxkeys = ['Filtered Q7 Longest Alignment',
               'Filtered Q10 Longest Alignment',
               'Filtered Q17 Longest Alignment',
               'Filtered Q20 Longest Alignment',
               'Filtered Q30 Longest Alignment',
               'Filtered Q47 Longest Alignment']
    # init
    for key in fixedkeys:
        value_out = 'unknown'
        config_out.set('global', key, value_out)
    for key in numberkeys:
        value_out = 0
        config_out.set('global', key, int(value_out))
    for key in maxkeys:
        value_out = 0
        config_out.set('global', key, int(value_out))
    config_in = MyConfigParser()
    config_in.optionxform = str  # don't convert to lowercase
    for input_prefix in input_prefix_list:
        alignmentfile = input_prefix + 'alignment.summary'
        if os.path.exists(alignmentfile):
            config_in.read(os.path.join(alignmentfile))
            for key in numberkeys:
                value_in = config_in.get('global', key)
                value_out = config_out.get('global', key)
                config_out.set('global', key, int(value_in) + int(value_out))
            for key in maxkeys:
                value_in = config_in.get('global', key)
                value_out = config_out.get('global', key)
                config_out.set('global', key, max(int(value_in), int(value_out)))
            for key in fixedkeys:
                value_in = config_in.get('global', key)
                value_out = config_out.get('global', key)
                #todo: last input wins; assumes all inputs share the same genome
                config_out.set('global', key, value_in)
        else:
            printtime("ERROR: skipped %s" % alignmentfile)
    # Regenerate trickier alignment.summary metrics
    for qual in quallist:
        try:
            q_bases = config_out.get('global', 'Filtered Mapped Bases in %s Alignments' % qual)
            q_reads = config_out.get('global', 'Filtered %s Alignments' % qual)
            q_readlen = 0
            if q_reads > 0:
                q_readlen = q_bases / q_reads
            config_out.set('global', 'Filtered %s Mean Alignment Length' % qual, q_readlen)
            genomesize = float(config_out.get('global', 'Genomesize'))
            q_coverage = 0.0
            if genomesize > 0:
                q_coverage = q_bases / genomesize
            config_out.set('global', 'Filtered %s Mean Coverage Depth' % qual, '%1.1f' % q_coverage)
            # Not mergeable at this point
            config_out.set('global', 'Filtered %s Coverage Percentage' % qual, 'N/A')
        except:
            # best-effort: derived metrics are skipped if any key is absent
            pass
    with open(output_prefix + 'alignment.summary', 'wb') as configfile:
        config_out.write(configfile)
    #########################################
    # Merge individual alignTable.txt files #
    #########################################
    printtime("Merging individual alignTable.txt files")
    table = 0  # becomes an ndarray after the first += with loadtxt output
    header = None
    for input_prefix in input_prefix_list:
        alignTableFile = input_prefix + 'alignTable.txt'
        if os.path.exists(alignTableFile):
            if header is None:
                # First file provides the header row and the label column.
                header = numpy.loadtxt(alignTableFile, dtype='string', comments='#')
            table += numpy.loadtxt(alignTableFile, dtype='int', comments='#', skiprows=1)
        else:
            printtime("ERROR: skipped %s" % alignTableFile)
    #fix first column (summing destroyed the read-length labels; restore them)
    if header is not None:
        table[:, 0] = (header[1:, 0])
        f_handle = open(output_prefix + 'alignTable.txt', 'w')
        numpy.savetxt(f_handle, header[0][None], fmt='%s', delimiter='\t')
        numpy.savetxt(f_handle, table, fmt='%i', delimiter='\t')
        f_handle.close()
def mergeAlignStatsResults(input_prefix_list, output_prefix):
    """Merge per-prefix alignment.summary and alignTable.txt files.

    For each prefix in input_prefix_list, reads '<prefix>alignment.summary'
    and '<prefix>alignTable.txt' (missing files are logged and skipped),
    merges them (sums for counters, max for longest-alignment keys, verbatim
    copy for genome identity keys), recomputes derived mean/coverage metrics,
    and writes '<output_prefix>alignment.summary' and
    '<output_prefix>alignTable.txt'.

    Args:
        input_prefix_list: path prefixes of the per-block result files.
        output_prefix: prefix for the merged output files.
    """
    ############################################
    # Merge individual alignment.summary files #
    ############################################
    printtime("Merging individual alignment.summary files")
    config_out = ConfigParser.RawConfigParser()
    config_out.optionxform = str  # don't convert to lowercase
    config_out.add_section('global')
    quallist = ['Q7', 'Q10', 'Q17', 'Q20', 'Q30', 'Q47']
    bplist = [50, 100, 150, 200, 250, 300, 350, 400]
    # Keys copied verbatim from the last input read (assumed identical across inputs).
    fixedkeys = ['Genome', 'Genome Version', 'Index Version', 'Genomesize']
    # Keys summed across inputs.
    numberkeys = [
        'Total number of Reads',
        'Total Mapped Reads',
        'Total Mapped Target Bases',
        'Filtered Mapped Bases in Q7 Alignments',
        'Filtered Mapped Bases in Q10 Alignments',
        'Filtered Mapped Bases in Q17 Alignments',
        'Filtered Mapped Bases in Q20 Alignments',
        'Filtered Mapped Bases in Q30 Alignments',
        'Filtered Mapped Bases in Q47 Alignments',
        'Filtered Q7 Alignments',
        'Filtered Q10 Alignments',
        'Filtered Q17 Alignments',
        'Filtered Q20 Alignments',
        'Filtered Q30 Alignments',
        'Filtered Q47 Alignments'
    ]
    for q in quallist:
        for bp in bplist:
            numberkeys.append('Filtered %s%s Reads' % (bp, q))
    # Keys merged by taking the maximum across inputs.
    maxkeys = [
        'Filtered Q7 Longest Alignment',
        'Filtered Q10 Longest Alignment',
        'Filtered Q17 Longest Alignment',
        'Filtered Q20 Longest Alignment',
        'Filtered Q30 Longest Alignment',
        'Filtered Q47 Longest Alignment'
    ]
    # init
    for key in fixedkeys:
        value_out = 'unknown'
        config_out.set('global', key, value_out)
    for key in numberkeys:
        value_out = 0
        config_out.set('global', key, int(value_out))
    for key in maxkeys:
        value_out = 0
        config_out.set('global', key, int(value_out))
    config_in = MyConfigParser()
    config_in.optionxform = str  # don't convert to lowercase
    for input_prefix in input_prefix_list:
        alignmentfile = input_prefix + 'alignment.summary'
        if os.path.exists(alignmentfile):
            config_in.read(os.path.join(alignmentfile))
            for key in numberkeys:
                value_in = config_in.get('global', key)
                value_out = config_out.get('global', key)
                config_out.set('global', key, int(value_in) + int(value_out))
            for key in maxkeys:
                value_in = config_in.get('global', key)
                value_out = config_out.get('global', key)
                config_out.set('global', key, max(int(value_in), int(value_out)))
            for key in fixedkeys:
                value_in = config_in.get('global', key)
                value_out = config_out.get('global', key)
                #todo: last input wins; assumes all inputs share the same genome
                config_out.set('global', key, value_in)
        else:
            printtime("ERROR: skipped %s" % alignmentfile)
    # Regenerate trickier alignment.summary metrics
    for qual in quallist:
        try:
            q_bases = config_out.get(
                'global', 'Filtered Mapped Bases in %s Alignments' % qual)
            q_reads = config_out.get('global', 'Filtered %s Alignments' % qual)
            q_readlen = 0
            if q_reads > 0:
                q_readlen = q_bases / q_reads
            config_out.set('global', 'Filtered %s Mean Alignment Length' % qual, q_readlen)
            genomesize = float(config_out.get('global', 'Genomesize'))
            q_coverage = 0.0
            if genomesize > 0:
                q_coverage = q_bases / genomesize
            config_out.set('global', 'Filtered %s Mean Coverage Depth' % qual,
                           '%1.1f' % q_coverage)
            # Not mergeable at this point
            config_out.set('global', 'Filtered %s Coverage Percentage' % qual, 'N/A')
        except Exception:
            # Deliberately best-effort: derived metrics are skipped when a
            # source key is missing or non-numeric.
            pass
    with open(output_prefix + 'alignment.summary', 'wb') as configfile:
        config_out.write(configfile)
    #########################################
    # Merge individual alignTable.txt files #
    #########################################
    printtime("Merging individual alignTable.txt files")
    table = 0  # becomes an ndarray after the first += with loadtxt output
    header = None
    for input_prefix in input_prefix_list:
        alignTableFile = input_prefix + 'alignTable.txt'
        if os.path.exists(alignTableFile):
            if header is None:
                # First file provides the header row and the label column.
                header = numpy.loadtxt(alignTableFile, dtype='string', comments='#')
            table += numpy.loadtxt(alignTableFile, dtype='int', comments='#', skiprows=1)
        else:
            printtime("ERROR: skipped %s" % alignTableFile)
    #fix first column (summing destroyed the read-length labels; restore them)
    if header is not None:
        table[:, 0] = (header[1:, 0])
        # 'with' ensures the handle is closed even if savetxt raises.
        with open(output_prefix + 'alignTable.txt', 'w') as f_handle:
            numpy.savetxt(f_handle, header[0][None], fmt='%s', delimiter='\t')
            numpy.savetxt(f_handle, table, fmt='%i', delimiter='\t')
def mergeAlignmentResults(dirs, env, ALIGNMENT_RESULTS):
    """Merge per-block alignment results into run-level results.

    Merges each block's alignment.summary (sums/max/mean per key class) and
    alignTable.txt, symlinks the merged files into ALIGNMENT_RESULTS, merges
    the json variants via helper scripts, merges the per-block BAM files with
    Picard MergeSamFiles, and finally reruns alignStats on the merged BAM.

    Args:
        dirs: list of block subdirectories (also used as paths relative to cwd).
        env: dict with at least 'expName', 'resultsName', 'tmap_version',
            'libraryName'.
        ALIGNMENT_RESULTS: output directory for merged alignment results.

    Side effects: writes/links files in the current directory and in
    ALIGNMENT_RESULTS; removes the 'alignment.summary' and 'alignTable.txt'
    symlinks at the end. Each step is best-effort and logs failures.
    """
    ############################################
    # Merge individual alignment.summary files #
    ############################################
    printtime("Merging individual alignment.summary files")
    config_out = ConfigParser.RawConfigParser()
    config_out.optionxform = str  # don't convert to lowercase
    config_out.add_section('global')
    quallist = ['Q7', 'Q10', 'Q17', 'Q20', 'Q47']
    bplist = [50, 100, 150, 200, 250, 300, 350, 400]
    # Keys copied verbatim from the last block read (assumed identical).
    fixedkeys = ['Genome', 'Genome Version', 'Index Version', 'Genomesize']
    # Keys summed across blocks.
    numberkeys = ['Total number of Reads',
                  'Filtered Mapped Bases in Q7 Alignments',
                  'Filtered Mapped Bases in Q10 Alignments',
                  'Filtered Mapped Bases in Q17 Alignments',
                  'Filtered Mapped Bases in Q20 Alignments',
                  'Filtered Mapped Bases in Q47 Alignments',
                  'Filtered Q7 Alignments',
                  'Filtered Q10 Alignments',
                  'Filtered Q17 Alignments',
                  'Filtered Q20 Alignments',
                  'Filtered Q47 Alignments']
    for q in quallist:
        for bp in bplist:
            numberkeys.append('Filtered %s%s Reads' % (bp, q))
    # Keys merged by taking the maximum across blocks.
    maxkeys = ['Filtered Q7 Longest Alignment',
               'Filtered Q10 Longest Alignment',
               'Filtered Q17 Longest Alignment',
               'Filtered Q20 Longest Alignment',
               'Filtered Q47 Longest Alignment']
    # Keys averaged across blocks (running sum of value/len(dirs)).
    # NOTE(review): skipped/bad blocks still count in the divisor, which
    # biases these means low when blocks are missing — confirm intended.
    meankeys = ['Filtered Q7 Mean Alignment Length',
                'Filtered Q10 Mean Alignment Length',
                'Filtered Q17 Mean Alignment Length',
                'Filtered Q20 Mean Alignment Length',
                'Filtered Q47 Mean Alignment Length',
                'Filtered Q7 Coverage Percentage',
                'Filtered Q10 Coverage Percentage',
                'Filtered Q17 Coverage Percentage',
                'Filtered Q20 Coverage Percentage',
                'Filtered Q47 Coverage Percentage',
                'Filtered Q7 Mean Coverage Depth',
                'Filtered Q10 Mean Coverage Depth',
                'Filtered Q17 Mean Coverage Depth',
                'Filtered Q20 Mean Coverage Depth',
                'Filtered Q47 Mean Coverage Depth']
    # init
    for key in fixedkeys:
        value_out = 'unknown'
        config_out.set('global', key, value_out)
    for key in numberkeys:
        value_out = 0
        config_out.set('global', key, int(value_out))
    for key in maxkeys:
        value_out = 0
        config_out.set('global', key, int(value_out))
    for key in meankeys:
        value_out = 0
        config_out.set('global', key, float(value_out))
    config_in = MyConfigParser()
    config_in.optionxform = str  # don't convert to lowercase
    for i, subdir in enumerate(dirs):
        if isbadblock(subdir, "Merging alignment.summary"):
            continue
        alignmentfile = os.path.join(subdir, 'alignment.summary')
        if os.path.exists(alignmentfile):
            config_in.read(os.path.join(alignmentfile))
            for key in numberkeys:
                value_in = config_in.get('global', key)
                value_out = config_out.get('global', key)
                config_out.set('global', key, int(value_in) + int(value_out))
            for key in maxkeys:
                value_in = config_in.get('global', key)
                value_out = config_out.get('global', key)
                config_out.set('global', key, max(int(value_in), int(value_out)))
            for key in fixedkeys:
                value_in = config_in.get('global', key)
                value_out = config_out.get('global', key)
                #todo: last block wins; assumes all blocks share the same genome
                config_out.set('global', key, value_in)
            for key in meankeys:
                value_in = config_in.get('global', key)
                value_out = config_out.get('global', key)
                config_out.set('global', key, float(value_out) + float(value_in) / len(dirs))
            # 'Filtered Q17 Mean Coverage Depth' =
            # 'Filtered Mapped Bases in Q17 Alignments' / 'Genomesize';
        else:
            printtime("ERROR: skipped %s" % alignmentfile)
    with open('alignment.summary.merged', 'wb') as configfile:
        config_out.write(configfile)
    r = subprocess.call(["ln", "-s",
                         os.path.join(ALIGNMENT_RESULTS, "alignment.summary.merged"),
                         os.path.join(ALIGNMENT_RESULTS, "alignment.summary")])
    #########################################
    # Merge individual alignTable.txt files #
    #########################################
    printtime("Merging individual alignTable.txt files")
    table = 0  # becomes an ndarray after the first += with loadtxt output
    header = None
    for subdir in dirs:
        if isbadblock(subdir, "Merging alignTable.txt"):
            continue
        alignTableFile = os.path.join(subdir, 'alignTable.txt')
        if os.path.exists(alignTableFile):
            if header is None:
                # First file provides the header row and the label column.
                header = numpy.loadtxt(alignTableFile, dtype='string', comments='#')
            table += numpy.loadtxt(alignTableFile, dtype='int', comments='#', skiprows=1)
        else:
            printtime("ERROR: skipped %s" % alignTableFile)
    #fix first column (summing destroyed the read-length labels; restore them)
    # Bugfix: guard on header, consistent with mergeAlignStatsResults —
    # without it, zero readable alignTable.txt files made 'table[:,0]'
    # raise TypeError on the int initializer.
    if header is not None:
        table[:, 0] = (header[1:, 0])
        with open('alignTable.txt.merged', 'w') as f_handle:
            numpy.savetxt(f_handle, header[0][None], fmt='%s', delimiter='\t')
            numpy.savetxt(f_handle, table, fmt='%i', delimiter='\t')
        r = subprocess.call(["ln", "-s",
                             os.path.join(ALIGNMENT_RESULTS, "alignTable.txt.merged"),
                             os.path.join(ALIGNMENT_RESULTS, "alignTable.txt")])
    #############################################
    # Merge alignment.summary (json)            #
    #############################################
    printtime("Merging alignment.summary (json)")
    try:
        cmd = 'merge_alignment.summary.py'
        for subdir in dirs:
            if isbadblock(subdir, "Merging alignment.summary (json)"):
                continue
            alignmentfile = os.path.join(subdir, 'alignment.summary')
            if os.path.exists(alignmentfile):
                cmd = cmd + ' %s' % alignmentfile
            else:
                printtime("ERROR: skipped %s" % alignmentfile)
        cmd = cmd + ' > alignment.summary.json'
        printtime("DEBUG: Calling '%s'" % cmd)
        subprocess.call(cmd, shell=True)
    except Exception:
        printtime("Merging alignment.summary (json) failed")
    #############################################
    # Merge alignTable.txt (json)               #
    #############################################
    printtime("Merging alignTable.txt (json)")
    try:
        cmd = 'merge_alignTable.py'
        for subdir in dirs:
            if isbadblock(subdir, "Merging alignTable.txt (json)"):
                continue
            alignstatsfile = os.path.join(subdir, 'alignTable.txt')
            if os.path.exists(alignstatsfile):
                cmd = cmd + ' %s' % alignstatsfile
            else:
                printtime("ERROR: skipped %s" % alignstatsfile)
        cmd = cmd + ' > alignTable.txt.json'
        printtime("DEBUG: Calling '%s'" % cmd)
        subprocess.call(cmd, shell=True)
    except Exception:
        printtime("Merging alignTable.txt (json) failed")
    #############################################
    # Merge individual block bam files          #
    #############################################
    printtime("Merging bam files")
    try:
        # cmd = 'picard-tools MergeSamFiles'
        cmd = 'java -Xmx8g -jar /opt/picard/picard-tools-current/MergeSamFiles.jar'
        for subdir in dirs:
            if isbadblock(subdir, "Merging bam files"):
                continue
            bamfile = os.path.join(ALIGNMENT_RESULTS, subdir, "rawlib.bam")
            if os.path.exists(bamfile):
                cmd = cmd + ' I=%s' % bamfile
            else:
                printtime("ERROR: skipped %s" % bamfile)
        cmd = cmd + ' O=%s/%s_%s.bam' % (ALIGNMENT_RESULTS, env['expName'], env['resultsName'])
        cmd = cmd + ' ASSUME_SORTED=true'
        cmd = cmd + ' CREATE_INDEX=true'
        cmd = cmd + ' USE_THREADING=true'
        cmd = cmd + ' VALIDATION_STRINGENCY=LENIENT'
        printtime("DEBUG: Calling '%s'" % cmd)
        subprocess.call(cmd, shell=True)
    except Exception:
        printtime("bam file merge failed")
    try:
        # Picard writes '<name>.bai'; downstream tools expect '<name>.bam.bai'.
        srcbaifilepath = '%s/%s_%s.bai' % (ALIGNMENT_RESULTS, env['expName'], env['resultsName'])
        dstbaifilepath = '%s/%s_%s.bam.bai' % (ALIGNMENT_RESULTS, env['expName'], env['resultsName'])
        if os.path.exists(srcbaifilepath):
            os.rename(srcbaifilepath, dstbaifilepath)
        else:
            printtime("ERROR: %s doesn't exists" % srcbaifilepath)
    except Exception:
        traceback.print_exc()
    #remove symbolic links
    os.remove("alignment.summary")
    os.remove("alignTable.txt")
    ##################################################
    #Call alignStats on merged bam file              #
    ##################################################
    printtime("Call alignStats on merged bam file")
    try:
        cmd = "alignStats -i %s/%s_%s.bam" % (ALIGNMENT_RESULTS, env['expName'], env['resultsName'])
        cmd = cmd + " -g /results/referenceLibrary/%s/%s/%s.info.txt" % (env["tmap_version"], env["libraryName"], env["libraryName"])
        cmd = cmd + " -n 12 -l 20 -m 400 -q 7,10,17,20,47 -s 0 -a alignTable.txt"
        cmd = cmd + " --outputDir %s" % ALIGNMENT_RESULTS
        cmd = cmd + " 2>> " + os.path.join(ALIGNMENT_RESULTS, "alignStats_out.txt")
        printtime("DEBUG: Calling '%s'" % cmd)
        os.system(cmd)
    except Exception:
        printtime("alignStats failed")