def align_upper_pm(peaks, ladder, anchor_pairs, anchor_z):
    """Extend the anchor alignment towards larger ladder sizes, one at a time.

    Every size in ``ladder['sizes']`` larger than the largest anchored size
    is appended in turn, and the fit is re-optimised with ``minimize_score``
    using polynomial order ``ladder['order']``.  ``anchor_z`` is accepted but
    not used by this function.

    Returns a ``(pairs, z, rss, f)`` tuple, where ``f`` is the ``ZFunc``
    instance used for scoring.
    """
    # this is another attempt to perform ladder - size standard alignment
    # one peak by one
    sorted_anchors = sorted(anchor_pairs)
    rtime_col, bpsize_col = zip(*sorted_anchors)
    anchor_rtimes = list(rtime_col)
    current_sizes = list(bpsize_col)

    largest_anchor = current_sizes[-1]
    pending = [size for size in ladder['sizes'] if size > largest_anchor]
    order = ladder['order']

    z = estimate_z(anchor_rtimes, current_sizes, order).z
    f = ZFunc(peaks, current_sizes, sorted_anchors, estimate=True)
    pairs, rss = f.get_pairs(z)

    while pending:
        current_sizes.append(pending.pop(0))
        f.set_sizes(current_sizes)
        _, candidate_z = minimize_score(f, z, order)
        # pairs/rss are evaluated with the z from *before* this step; the
        # freshly optimised z is adopted only when that rss is small enough
        pairs, rss = f.get_pairs(z)
        if rss < 100:
            z = candidate_z
        if is_verbosity(5):
            plot(f.rtimes, f.sizes, z, pairs)

    return pairs, z, rss, f
def align_lower_pm(peaks, ladder, anchor_pairs, anchor_z):
    """Extend the anchor alignment towards smaller ladder sizes, one at a time.

    Every size in ``ladder['sizes']`` smaller than the smallest anchored size
    is prepended in turn (largest of the remaining sizes first), and the fit
    is re-optimised with a third-order ``minimize_score``.  ``anchor_z`` is
    accepted but not used by this function.

    Returns a ``(pairs, z, rss, f)`` tuple, where ``f`` is the ``ZFunc``
    instance used for scoring.
    """
    # this is another attempt to perform ladder - size standard alignment
    # one peak by one
    sorted_anchors = sorted(anchor_pairs)
    rtime_col, bpsize_col = zip(*sorted_anchors)
    anchor_rtimes = list(rtime_col)
    current_sizes = list(bpsize_col)

    smallest_anchor = current_sizes[0]
    pending = [size for size in ladder['sizes'] if size < smallest_anchor]

    z = estimate_z(anchor_rtimes, current_sizes, 3).z
    f = ZFunc(peaks, current_sizes, sorted_anchors, estimate=True)
    pairs, rss = f.get_pairs(z)

    while pending:
        # take the pending size closest to the already aligned ones
        current_sizes.insert(0, pending.pop(-1))
        f.set_sizes(current_sizes)
        _, z = minimize_score(f, z, 3)
        pairs, rss = f.get_pairs(z)
        if is_verbosity(5):
            plot(f.rtimes, f.sizes, z, pairs)

    return pairs, z, rss, f
def align(self, parameters, ladder=None, anchor_pairs=None):
    """Align this ladder channel against the panel's size standard.

    The alignment result (z, rss, number of sized ladder peaks, score and
    CPU duration) is stored on ``self.fsa``, the channel's alleles get their
    sizes from the ladder steps, and ``fsa.allele_fit_func`` is set according
    to ``parameters.allelemethod``.

    Raises:
        RuntimeError: if this channel is not the ladder channel, or if
            ``parameters.allelemethod`` is not a known method.
    """
    # sanity checks
    if self.marker.code != 'ladder':
        raise RuntimeError(
            'E: align() must be performed on ladder channel!')

    # NOTE(review): the ``ladder`` argument is overwritten unconditionally
    # here, so a value passed by the caller is ignored - confirm intent.
    ladder = self.fsa.panel.get_ladder()

    # prepare ladder qcfunc
    if 'qcfunc' not in ladder:
        ladder['qcfunc'] = algo.generate_scoring_function(
            ladder['strict'], ladder['relax'])

    start_time = time.process_time()
    result = algo.align_peaks(self, parameters, ladder, anchor_pairs)
    dpresult = result.dpresult

    fsa = self.fsa
    fsa.z = dpresult.z
    fsa.rss = dpresult.rss
    fsa.nladder = len(dpresult.sized_peaks)
    fsa.score = result.score
    fsa.duration = time.process_time() - start_time

    # set allele sizes from ladder steps; both sequences are ordered
    # ascending and paired positionally (ladder['sizes'] is sorted in place)
    alleles = self.get_alleles()
    alleles.sort(key=lambda x: x.rtime)
    ladder_sizes = ladder['sizes']
    ladder_sizes.sort()
    for allele, ladder_size in zip(alleles, ladder_sizes):
        allele.size = ladder_size

    # check the allele method
    method = parameters.allelemethod
    if method == const.allelemethod.leastsquare:
        fsa.allele_fit_func = algo.least_square(alleles, self.fsa.z)
    elif method == const.allelemethod.cubicspline:
        fsa.allele_fit_func = algo.cubic_spline(alleles)
    elif method == const.allelemethod.localsouthern:
        fsa.allele_fit_func = algo.local_southern(alleles)
    else:
        # same exception type as before, now with a diagnosable message
        raise RuntimeError('E: unknown allele method: %r' % (method,))

    fsa.min_rtime = parameters.ladder.min_rtime
    fsa.max_rtime = parameters.ladder.max_rtime

    if is_verbosity(4):
        cout('O: Score %3.2f | %5.2f | %d/%d | %s | %5.1f | %s' %
             (fsa.score, fsa.rss, fsa.nladder, len(ladder['sizes']),
              result.method, fsa.duration, fsa.filename))
def align_upper_pm(peaks, ladder, anchor_pairs, anchor_z):
    """Extend the anchor alignment towards larger ladder sizes, then refine.

    Ladder sizes above the largest anchored size are appended one step at a
    time (two at once when the next size is close, see below); each step is
    accepted only when the rss does not grow by 70 or more.  The result is
    finally refined with ``align_dp``; the refined alignment is returned
    unless its rss exceeds the stepwise rss by more than 50.

    ``anchor_z`` is accepted but not used by this function.

    Returns a ``(pairs, z, rss, f)`` tuple, where ``f`` is the ``ZFunc``
    instance used for scoring.
    """
    # this is another attempt to perform ladder - size standard alignment
    # one peak by one
    anchor_pairs = sorted(anchor_pairs)
    anchor_rtimes, anchor_bpsizes = zip(*anchor_pairs)
    anchor_rtimes = list(anchor_rtimes)
    anchor_bpsizes = list(anchor_bpsizes)

    remaining_sizes = [x for x in ladder['sizes'] if x > anchor_bpsizes[-1]]
    current_sizes = anchor_bpsizes
    order = ladder['order']
    zres = estimate_z(anchor_rtimes, anchor_bpsizes, order)
    z, rss = zres.z, zres.rss

    f = ZFunc(peaks, current_sizes, anchor_pairs)

    # Start from the anchors so that ``pairs`` is defined even when there is
    # nothing to add or when no extension step is accepted; previously this
    # raised UnboundLocalError at the plot/return below.
    pairs = anchor_pairs

    while remaining_sizes:
        current_sizes.append(remaining_sizes.pop(0))
        # add a second size in the same step when the next one is less than
        # 11 away and the last remaining one is less than 100 away
        if (remaining_sizes
                and (remaining_sizes[-1] - current_sizes[-1]) < 100
                and (remaining_sizes[0] - current_sizes[-1]) < 11):
            current_sizes.append(remaining_sizes.pop(0))

        f.set_sizes(current_sizes)
        _, next_z = minimize_score(f, z, order)
        # pairs/rss are evaluated with the z from before this step
        next_pairs, next_rss = f.get_pairs(z)
        if (next_rss - rss) < 70:
            z = next_z
            rss = next_rss
            pairs = next_pairs
        if is_verbosity(5):
            plot(f.rtimes, f.sizes, z, pairs)

    # finalize the alignment with stringent criteria
    dp_result = align_dp(f.rtimes, f.sizes, f.similarity, z, rss)
    if dp_result.rss - rss > 50:
        return pairs, z, rss, f

    # sized_peaks entries are swapped into the same order as ``pairs``
    dp_pairs = [(x[1], x[0]) for x in dp_result.sized_peaks]
    if is_verbosity(5):
        plot(f.rtimes, f.sizes, dp_result.z, dp_pairs)
    return dp_pairs, dp_result.z, dp_result.rss, f
def estimate_pm(peaks, bpsizes):
    """Estimate an initial alignment from candidate retention-time pairs.

    Each candidate pair from ``prepare_rtimes`` is matched to the second and
    the next-to-last entries of ``bpsizes`` to obtain a first-order fit; the
    fit with the lowest ``ZFunc`` score is then refined with ``align_dp``.

    Returns ``(pairs, z)``, where ``pairs`` are the entries of the refined
    ``sized_peaks`` with their two fields swapped.
    """
    candidate_pairs = prepare_rtimes([peak.rtime for peak in peaks])
    size_pair = [bpsizes[1], bpsizes[-2]]

    f = ZFunc(peaks, bpsizes, [], estimate=True)

    scored = []
    for rtime_pair in candidate_pairs:
        # only pairs in strictly increasing order are usable
        if rtime_pair[0] < rtime_pair[1]:
            # two points define the line: a = (y1 - y2) / (x1 - x2),
            # b = y1 - a * x1; estimate_z performs this first-order fit
            zres = estimate_z(rtime_pair, size_pair, 1)
            scored.append((f(zres.z), zres))
            if is_verbosity(5):
                plot(f.rtimes, f.sizes, zres.z, [])

    scored.sort(key=lambda entry: entry[0])
    best = scored[0][1]

    dp_result = align_dp(f.rtimes, f.sizes, f.similarity, best.z, best.rss)
    sized_pairs = [(item[1], item[0]) for item in dp_result.sized_peaks]

    if is_verbosity(5):
        plot(f.rtimes, f.sizes, dp_result.z, sized_pairs)

    return (sized_pairs, dp_result.z)
def estimate_pm(peaks, bpsizes):
    """Find the best linear seed fit and refine it by dynamic programming.

    Candidate retention-time pairs come from ``prepare_rtimes``; each one is
    fitted (first order) against ``bpsizes[1]`` and ``bpsizes[-2]`` and
    scored with a ``ZFunc``.  The lowest-scoring fit seeds ``align_dp``.

    Returns a tuple ``(pairs, z)`` built from the ``align_dp`` result, with
    the two fields of every ``sized_peaks`` entry swapped.
    """
    peak_rtimes = [p.rtime for p in peaks]
    rtime_points = prepare_rtimes(peak_rtimes)
    lower_size, upper_size = bpsizes[1], bpsizes[-2]

    f = ZFunc(peaks, bpsizes, [], estimate=True)

    candidates = []
    for first_rtime_pair in rtime_points:
        if first_rtime_pair[0] >= first_rtime_pair[1]:
            # skip pairs that are not in strictly increasing order
            continue

        # linear model through two points: y = ax + b with
        # a = (y1 - y2) / (x1 - x2) and b = y1 - a * x1
        fit = estimate_z(first_rtime_pair, [lower_size, upper_size], 1)
        fit_score = f(fit.z)
        candidates.append((fit_score, fit))
        if is_verbosity(5):
            plot(f.rtimes, f.sizes, fit.z, [])

    candidates.sort(key=lambda candidate: candidate[0])
    zresult = candidates[0][1]

    dp_result = align_dp(f.rtimes, f.sizes, f.similarity,
                         zresult.z, zresult.rss)

    if is_verbosity(5):
        plot(f.rtimes, f.sizes, dp_result.z,
             [(sp[1], sp[0]) for sp in dp_result.sized_peaks])

    swapped = [(sp[1], sp[0]) for sp in dp_result.sized_peaks]
    return (swapped, dp_result.z)
def create_channels(self, params):
    """Separate the trace into per-dye channels and register each of them.

    The trace from ``self.get_trace()`` is split by
    ``algo.separate_channels``; one ``self.Channel`` is built per resulting
    trace channel and passed to ``self.add_channel``.
    """
    if is_verbosity(4):
        cerr('I: Generating channels for %s' % self.filename)

    separated = algo.separate_channels(self.get_trace(), params)
    for trace_channel in separated:
        self.add_channel(
            self.Channel(
                data=trace_channel.smooth_channel,
                dye=trace_channel.dye_name,
                wavelen=trace_channel.dye_wavelength,
                status=const.channelstatus.reseted,
                fsa=self,
            )
        )
def generate_similarity(peaks):
    """Return one similarity weight per peak, derived from the peak RFUs.

    The reference RFU is the third-highest RFU among ``peaks``.  Peaks at or
    above the reference get ``1.0``; weaker peaks get
    ``(log10(rfu / reference) + N) / N`` with ``N = len(peaks)``, which is
    below ``1.0``.

    An empty ``peaks`` yields an empty list, and with fewer than three peaks
    the lowest available RFU is used as the reference (previously both cases
    raised IndexError).
    """
    rfus = [p.rfu for p in peaks]
    if not rfus:
        return []

    N = len(rfus)

    # use the third-highest peak as reference since the strongest ones may
    # be noise; fall back to the weakest peak when fewer than three exist
    ranked = sorted(rfus, reverse=True)
    highest_rfu = ranked[min(2, N - 1)]

    similarity = [
        (np.log10(rfu / highest_rfu) + N) / N if rfu < highest_rfu else 1.0
        for rfu in rfus
    ]

    if is_verbosity(4):
        print(N, ' => ')
        print(rfus)
        print(highest_rfu)
        print(similarity)

    return similarity
def align_dp(rtimes, sizes, similarity, z, rss, order=3):
    """Align ladder sizes with peaks using dynamic programming.

    Performs a global alignment repeatedly: the score matrix is generated
    from the current polynomial ``z``, aligned with ``dp``, and ``z`` is
    re-estimated (polynomial of the given ``order``) from the aligned pairs.
    Iteration stops when the DP score no longer improves.

    Returns ``DPResult(dpscore, rss, z, sized_peaks)``, where ``sized_peaks``
    is a list of ``(size, rtime)`` tuples from the last accepted iteration.
    It is empty when no iteration was accepted, i.e. when the very first
    score is not above the initial ``-1`` (previously this case raised
    UnboundLocalError).
    """
    sizes = sorted(sizes, reverse=True)
    rtimes = sorted(rtimes, reverse=True)

    dpscore = -1
    sized_peaks = []

    while True:
        S = generate_scores(sizes, rtimes, similarity, np.poly1d(z))
        result = dp(S, -5e-3)

        cur_dpscore = result['D'][-1][-1]
        matches = result['matches']
        aligned_peaks = [(sizes[i], rtimes[j]) for i, j in matches]

        # realign
        std_size, peak_sizes = zip(*aligned_peaks)
        cur_zres = estimate_z(peak_sizes, std_size, order)

        if cur_dpscore < dpscore:
            if is_verbosity(4):
                cerr('W: dynamic programming did not converge!!')
            break

        if cur_dpscore == dpscore:
            break

        # score improved: accept this iteration and try again
        z = cur_zres.z
        rss = cur_zres.rss
        dpscore = cur_dpscore
        sized_peaks = aligned_peaks

    return DPResult(dpscore, rss, z, sized_peaks)
def align_lower_pm(peaks, ladder, anchor_pairs, anchor_z):
    """Extend the anchor alignment towards smaller ladder sizes, one at a time.

    Every size in ``ladder['sizes']`` smaller than the smallest anchored size
    is prepended in turn (largest of the remaining sizes first) and the fit
    is re-optimised with a third-order ``minimize_score``.  A step is
    rejected, and the size dropped again, when it raises the rss by more
    than 20.

    ``anchor_z`` is accepted but not used by this function.

    Returns a ``(pairs, z, rss, f)`` tuple, where ``f`` is the ``ZFunc``
    instance used for scoring.
    """
    # this is another attempt to perform ladder - size standard alignment
    # one peak by one
    anchor_pairs = sorted(anchor_pairs)
    anchor_rtimes, anchor_bpsizes = zip(*anchor_pairs)
    anchor_rtimes = list(anchor_rtimes)
    anchor_bpsizes = list(anchor_bpsizes)

    remaining_sizes = [x for x in ladder['sizes'] if x < anchor_bpsizes[0]]
    current_sizes = anchor_bpsizes
    zscore = estimate_z(anchor_rtimes, anchor_bpsizes, 3)
    z = zscore.z
    rss = zscore.rss

    f = ZFunc(peaks, current_sizes, anchor_pairs)

    # Start from the anchors so that ``pairs`` is defined even when no ladder
    # size lies below the anchors or every step is rejected; previously this
    # raised UnboundLocalError.
    pairs = anchor_pairs

    while remaining_sizes:
        current_sizes.insert(0, remaining_sizes.pop(-1))
        f.set_sizes(current_sizes)
        _, next_z = minimize_score(f, z, 3)
        next_pairs, next_rss = f.get_pairs(next_z)

        # if delta rss (new rss - previous rss) is above the threshold,
        # assume the latest ladder size is not appropriate and keep the
        # previous z, rss and pairs
        if (next_rss - rss) > 20:
            current_sizes.pop(0)
        else:
            z = next_z
            rss = next_rss
            pairs = next_pairs

        if is_verbosity(5):
            plot(f.rtimes, f.sizes, z, pairs)

    return pairs, z, rss, f
def align_lower_pm(peaks, ladder, anchor_pairs, anchor_z):
    """Extend the anchor alignment towards smaller ladder sizes, one at a time.

    Every size in ``ladder['sizes']`` smaller than the smallest anchored size
    is prepended in turn (largest of the remaining sizes first) and the fit
    is re-optimised with a third-order ``minimize_score``.  A step is
    rejected, and the size dropped again, when it raises the rss by more
    than 20.

    ``anchor_z`` is accepted but not used by this function.

    Returns a ``(pairs, z, rss, f)`` tuple, where ``f`` is the ``ZFunc``
    instance used for scoring.
    """
    # this is another attempt to perform ladder - size standard alignment
    # one peak by one
    anchor_pairs = sorted(anchor_pairs)
    anchor_rtimes, anchor_bpsizes = zip(*anchor_pairs)
    anchor_rtimes = list(anchor_rtimes)
    anchor_bpsizes = list(anchor_bpsizes)

    remaining_sizes = [x for x in ladder['sizes'] if x < anchor_bpsizes[0]]
    current_sizes = anchor_bpsizes
    zscore = estimate_z(anchor_rtimes, anchor_bpsizes, 3)
    z = zscore.z
    rss = zscore.rss

    f = ZFunc(peaks, current_sizes, anchor_pairs)

    # Start from the anchors so that ``pairs`` is defined even when no ladder
    # size lies below the anchors or every step is rejected; previously this
    # raised UnboundLocalError.
    pairs = anchor_pairs

    while remaining_sizes:
        current_sizes.insert(0, remaining_sizes.pop(-1))
        f.set_sizes(current_sizes)
        _, next_z = minimize_score(f, z, 3)
        next_pairs, next_rss = f.get_pairs(next_z)

        # if delta rss (new rss - previous rss) is above the threshold,
        # assume the latest ladder size is not appropriate and keep the
        # previous z, rss and pairs
        if (next_rss - rss) > 20:
            current_sizes.pop(0)
        else:
            z = next_z
            rss = next_rss
            pairs = next_pairs

        if is_verbosity(5):
            plot(f.rtimes, f.sizes, z, pairs)

    return pairs, z, rss, f
def do_listpeaks(args, fsa_list, dbh):
    """Write a peak listing for every FSA in ``fsa_list``.

    Output goes to ``args.outfile``, or to standard output when it is
    ``'-'``.  ``args.peaks_format`` selects the ``'standard'`` (tab
    separated) or ``'peakscanner'`` (comma separated) layout; with
    ``args.merge`` the smeared alleles are listed for non-ladder channels.
    ``dbh`` is accepted but not used by this function.

    The output file is opened once and always closed, also on error;
    standard output is never closed (previously it was closed right after
    the header, which broke every later write to it).

    Raises:
        RuntimeError: if ``args.peaks_format`` is not a known format.
    """
    to_stdout = args.outfile == '-'
    out_stream = sys.stdout if to_stdout else open(args.outfile, 'w')

    try:
        if args.peaks_format == 'standard':
            out_stream.write(
                'SAMPLE\tFILENAME \tDYE\tRTIME\tSIZE\tHEIGHT\tAREA\tSCORE\n')
        elif args.peaks_format == 'peakscanner':
            out_stream.write(
                "Dye/Sample Peak,Sample File Name,Type,Size,Height,Area in Point,Area in BP,Corrected Area in BP,Data Point,Begin Point,"
            )
            if args.merge:
                out_stream.write(
                    "Begin BP,End Point,End BP,Width in Point,Width in BP,Score,Peak Group,User Comments,User Edit\n"
                )
            else:
                out_stream.write(
                    "Begin BP,End Point,End BP,Width in Point,Width in BP,Score,User Comments,User Edit\n"
                )
        else:
            raise RuntimeError("Unknown value for args.peaks_format")

        for (fsa, fsa_index) in fsa_list:
            cverr(3, 'D: calling FSA %s' % fsa.filename)

            markers = fsa.panel.data['markers']

            for channel in fsa.channels:
                if channel.is_ladder():
                    color = markers['x/ladder']['filter']
                else:
                    color = markers['x/' + channel.dye]['filter']

                alleles = channel.get_alleles(broad_peaks_only=False)

                if is_verbosity(4):
                    cout('Marker => %s | %s [%d]' %
                         (channel.marker.code, channel.dye, len(alleles)))
                    cout("channel has alleles :", len(alleles))

                smeared_alleles = channel.smeared_alleles

                if (not args.merge) or channel.is_ladder():
                    # i is the 1-based peak number within the channel
                    for i, p in enumerate(alleles, 1):
                        if args.peaks_format == 'standard':
                            out_stream.write(
                                '%6s\t%10s\t%3s\t%d\t%d\t%5i\t%3.2f\t%3.2f\n' %
                                (fsa_index, fsa.filename[:-4], color, p.rtime,
                                 p.size, p.height, p.area, p.qscore))
                        else:
                            out_stream.write(
                                '"%s, %i",%s, %s, %f, %i, %i, %i, %i, %i, %i, %f, %i, %f, %i, %f, %f,,\n' %
                                (color, i, fsa.filename, p.type, p.size,
                                 p.height, p.area, p.area_bp, p.area_bp_corr,
                                 p.rtime, p.brtime, p.begin_bp, p.ertime,
                                 p.end_bp, p.wrtime, p.width_bp, p.qscore))
                else:
                    if is_verbosity(4):
                        cout('Marker => %s | %s [%d]' %
                             (channel.marker.code, channel.dye,
                              len(smeared_alleles)))
                        cout("channel has smeared alleles :",
                             len(smeared_alleles))

                    for i, p in enumerate(smeared_alleles, 1):
                        out_stream.write(
                            '"%s, %i", %s, %s, %f, %i, %i, %i, %i, %i, %i, %f, %i, %f, %i, %f, %f, %i,,\n' %
                            (color, i, fsa.filename, p.type, p.size,
                             p.height, p.area, p.area_bp, p.area_bp_corr,
                             p.rtime, p.brtime, p.begin_bp, p.ertime,
                             p.end_bp, p.wrtime, p.width_bp, p.qscore,
                             p.group))
    finally:
        # only close what this function opened
        if not to_stdout:
            out_stream.close()
def align_pm(peaks, ladder, anchor_pairs=None):
    """Align peaks to the ladder by stepwise pair matching.

    When no ``anchor_pairs`` are given, anchors are estimated with
    ``estimate_pm`` from the peaks whose rtime lies strictly between 1500 and
    5000.  The anchors are then extended upward and downward, refined with
    ``align_dp`` and scored with ``ladder['qcfunc']``.

    Returns an ``AlignResult`` tagged ``pm_strict`` when the strict score is
    above 0.9, otherwise one tagged ``pm_relax`` with the relaxed score.

    The unreachable statements that used to follow the final return were
    removed.
    """
    if not anchor_pairs:
        anchor_peaks = [p for p in peaks if 1500 < p.rtime < 5000]
        anchor_pairs, initial_z = estimate_pm(anchor_peaks,
                                              ladder['signature'])
    else:
        rtimes, bpsizes = zip(*anchor_pairs)
        # NOTE(review): this is the estimate_z result object, while the
        # branch above yields the z value from estimate_pm - confirm which
        # one the anchor_z parameter of the helpers is meant to receive.
        initial_z = estimate_z(rtimes, bpsizes, 1)

    # sort a copy so the caller's list is not reordered
    anchor_pairs = sorted(anchor_pairs)

    pairs, z, rss, f = align_upper_pm(peaks, ladder, anchor_pairs, initial_z)
    pairs, z, rss, f = align_lower_pm(peaks, ladder, pairs, initial_z)

    # last dp
    dp_result = align_dp(f.rtimes, f.sizes, f.similarity, z, rss)

    # debugging dump, now only emitted when verbosity is enabled
    if is_verbosity(1):
        import pprint
        pprint.pprint(dp_result.sized_peaks)
    if is_verbosity(4):
        plot(f.rtimes, f.sizes, dp_result.z,
             [(x[1], x[0]) for x in dp_result.sized_peaks])

    dp_result.sized_peaks = f.get_sized_peaks(dp_result.sized_peaks)

    score, msg = ladder['qcfunc'](dp_result, method='strict')
    if score > 0.9:
        return AlignResult(score, msg, dp_result,
                           const.alignmethod.pm_strict)

    score, msg = ladder['qcfunc'](dp_result, method='relax')
    return AlignResult(score, msg, dp_result, const.alignmethod.pm_relax)
def align_pm(peaks, ladder, anchor_pairs=None):
    """Align peaks to the ladder by stepwise pair matching.

    When no ``anchor_pairs`` are given, anchors are estimated with
    ``estimate_pm`` from the peaks inside the anchor rtime window.  The
    window is ``ANCHOR_RTIME_LOWER_BOUND``..``ANCHOR_RTIME_UPPER_BOUND``,
    scaled up proportionally when the longest peak rtime exceeds
    ``PEAK_RTIME_UPPER_BOUND``.  The anchors are then extended upward and
    downward, refined with ``align_dp`` and scored with ``ladder['qcfunc']``.

    Returns an ``AlignResult`` tagged ``pm_strict`` when the strict score is
    above 0.9, otherwise one tagged ``pm_relax`` with the relaxed score.

    The unreachable statements that used to follow the final return were
    removed.
    """
    if not anchor_pairs:
        longest_rtime_peak = max(p.rtime for p in peaks)
        if longest_rtime_peak > PEAK_RTIME_UPPER_BOUND:
            # long runs: stretch the anchor window by the same ratio
            bound_adjust_ratio = longest_rtime_peak / PEAK_RTIME_UPPER_BOUND
            anchor_start = ANCHOR_RTIME_LOWER_BOUND * bound_adjust_ratio
            anchor_end = ANCHOR_RTIME_UPPER_BOUND * bound_adjust_ratio
        else:
            anchor_start = ANCHOR_RTIME_LOWER_BOUND
            anchor_end = ANCHOR_RTIME_UPPER_BOUND
        anchor_peaks = [
            p for p in peaks if anchor_start < p.rtime < anchor_end
        ]
        anchor_pairs, initial_z = estimate_pm(anchor_peaks,
                                              ladder['signature'])
    else:
        rtimes, bpsizes = zip(*anchor_pairs)
        # NOTE(review): this is the estimate_z result object, while the
        # branch above yields the z value from estimate_pm - confirm which
        # one the anchor_z parameter of the helpers is meant to receive.
        initial_z = estimate_z(rtimes, bpsizes, 1)

    # sort a copy so the caller's list is not reordered
    anchor_pairs = sorted(anchor_pairs)

    pairs, z, rss, f = align_upper_pm(peaks, ladder, anchor_pairs, initial_z)
    pairs, z, rss, f = align_lower_pm(peaks, ladder, pairs, initial_z)

    # last dp
    dp_result = align_dp(f.rtimes, f.sizes, f.similarity, z, rss)
    if is_verbosity(1):
        import pprint
        pprint.pprint(dp_result.sized_peaks)
    if is_verbosity(4):
        plot(f.rtimes, f.sizes, dp_result.z,
             [(x[1], x[0]) for x in dp_result.sized_peaks])

    dp_result.sized_peaks = f.get_sized_peaks(dp_result.sized_peaks)

    score, msg = ladder['qcfunc'](dp_result, method='strict')
    if score > 0.9:
        return AlignResult(score, msg, dp_result,
                           const.alignmethod.pm_strict)

    score, msg = ladder['qcfunc'](dp_result, method='relax')
    return AlignResult(score, msg, dp_result, const.alignmethod.pm_relax)
def filter_for_artifact(peaks, params, expected_peak_number=0):
    """Select the peaks most likely to be true signals.

    Peaks are first filtered on shape (omega, theta, beta, area, width and
    rfu thresholds) and then on proximity: a peak that lies within
    ``params.artifact_dist`` (~ 5) of a neighbour and is weaker than
    ``params.artifact_ratio`` times that neighbour is dropped as an artifact.
    This is especially necessary for ladder - size assignment.

    When ``expected_peak_number`` is positive, the omega and rfu thresholds
    are adapted to the noise level of the channel.  When it is 0, no omega
    threshold is applied (previously this case raised UnboundLocalError
    because the omega test was never defined).

    Returns ``peaks`` unchanged when its length already equals
    ``expected_peak_number``; otherwise a new list sorted by rtime.
    """
    if len(peaks) == expected_peak_number:
        return peaks

    # we need to adapt to the noise level of current channel
    if expected_peak_number > 0:
        epn = expected_peak_number
        half = round(epn / 2)
        theta_peaks = sorted(peaks, key=lambda x: x.theta,
                             reverse=True)[half + 3:epn - 1]
        by_omega = sorted(peaks, key=lambda x: x.omega, reverse=True)
        omega_peaks = by_omega[2:4] + by_omega[half:epn - 1]
        rfu_peaks = sorted(peaks, key=lambda x: x.rfu,
                           reverse=True)[:epn - 1]

        # NOTE: q_theta is prepared but not applied in the filter loop
        # below.  Its fit parameters get their own name so that the omega
        # fit cannot silently replace them inside the closure.
        if theta_peaks[-1].theta < 8:
            theta_peaks.sort()
            thetas = np.array([p.theta for p in theta_peaks])
            rtimes = [p.rtime for p in theta_peaks]
            theta_popt, _ = curve_fit(math_func, rtimes, 0.5 * thetas,
                                      p0=[-1, 1])

            if is_verbosity(4):
                xx = np.linspace(rtimes[0], rtimes[-1] + 2000, 100)
                yy = math_func(xx, *theta_popt)
                plt.plot(xx, yy)
                plt.scatter([p.rtime for p in peaks],
                            [p.theta for p in peaks])
                plt.show()

            q_theta = lambda x: (x.theta >= math_func(x.rtime, *theta_popt)
                                 or x.theta > 100)
        else:
            q_theta = lambda x: x.theta >= min(theta_peaks[-1].theta,
                                               params.min_theta)

        if omega_peaks[-1].omega < 200:
            omega_peaks.sort()
            omegas = np.array([p.omega for p in omega_peaks])
            rtimes = np.array([p.rtime for p in omega_peaks])

            # generate a quadratic threshold for omega
            # generate a quadratic ratio series first
            omega_popt, _ = curve_fit(
                quadratic_math_func,
                [rtimes[0], (rtimes[0] + rtimes[-1]) / 2, rtimes[-1]],
                [0.05, 0.25, 0.05])
            ratios = quadratic_math_func(rtimes, *omega_popt)
            if is_verbosity(4):
                plt.plot(rtimes, ratios)
                plt.show()

            # use the ratios to enforce quadratic threshold
            omega_popt, _ = curve_fit(quadratic_math_func, rtimes,
                                      ratios * omegas, p0=[-1, 1, 0])
            if omega_popt[0] > 0:
                # enforce small flat ratio
                omega_popt, _ = curve_fit(math_func, rtimes, 0.25 * omegas,
                                          p0=[1, 0])
                # convert to 3 params
                omega_popt = np.insert(omega_popt, 0, 0.0)

            if is_verbosity(4):
                plt.scatter(rtimes, omegas)
                xx = np.linspace(rtimes[0], rtimes[-1] + 2000, 100)
                yy = quadratic_math_func(xx, *omega_popt)
                plt.plot(xx, yy)
                plt.scatter([p.rtime for p in peaks],
                            [p.omega for p in peaks])
                plt.show()

            q_omega = lambda x: (
                x.omega >= 100
                or x.omega >= quadratic_math_func(x.rtime, *omega_popt))
        else:
            q_omega = lambda x: x.omega >= min(omega_peaks[-1].omega, 50)

        min_rfu = rfu_peaks[-1].rfu * 0.125

    else:
        # no expectation about the peak count: no omega threshold
        q_omega = lambda x: True
        min_rfu = 2

    # filter for too sharp/thin peaks
    filtered_peaks = []
    for p in peaks:
        cverr(5, str(p))

        if len(filtered_peaks) < 2 and p.area > 50:
            # first two real peaks might be a bit lower
            filtered_peaks.append(p)
            continue

        if not q_omega(p):
            cverr(5, '! q_omega')
            continue

        if p.theta < 1.0 and p.area < 25 and p.omega < 5:
            cverr(5, '! extreme theta & area & omega')
            continue
        if p.rfu < min_rfu:
            cverr(5, '! extreme min_rfu')
            continue
        if p.beta > 25 and p.theta < 0.5:
            cverr(5, '! extreme beta')
            continue
        if p.wrtime < 3:
            continue
        if p.rfu >= 25 and p.beta * p.theta < 6:
            continue
        if p.rfu < 25 and p.beta * p.theta < 3:
            continue

        filtered_peaks.append(p)

    # filter for distance between peaks and their rfu ratio
    peaks = sorted(filtered_peaks, key=lambda x: x.rtime)
    last_idx = len(peaks) - 1
    non_artifact_peaks = []
    for idx, p in enumerate(peaks):
        if idx > 0:
            prev_p = peaks[idx - 1]
            if (p.brtime - prev_p.ertime < params.artifact_dist
                    and p.rfu < params.artifact_ratio * prev_p.rfu):
                # we are artifact, just skip
                print('artifact1:', p)
                continue

        if idx < last_idx:
            next_p = peaks[idx + 1]
            if (next_p.brtime - p.ertime < params.artifact_dist
                    and p.rfu < params.artifact_ratio * next_p.rfu):
                # we are artifact, just skip
                print('artefact2:', p)
                continue

        non_artifact_peaks.append(p)

    peaks = non_artifact_peaks

    cverr(3, '## non artifact peaks: %d' % len(peaks))

    return peaks
def filter_for_artifact(peaks, params, expected_peak_number=0):
    """Select the peaks most likely to be true signals.

    Peaks are first filtered on shape (omega, theta, beta, area, width and
    rfu thresholds) and then on proximity: a peak that lies within
    ``params.artifact_dist`` (~ 5) of a neighbour and is weaker than
    ``params.artifact_ratio`` times that neighbour is dropped as an artifact.
    This is especially necessary for ladder - size assignment.

    When ``expected_peak_number`` is positive, the omega and rfu thresholds
    are adapted to the noise level of the channel.  When it is 0, no omega
    threshold is applied (previously this case raised UnboundLocalError
    because the omega test was never defined).

    Returns ``peaks`` unchanged when its length already equals
    ``expected_peak_number``; otherwise a new list sorted by rtime.
    """
    if len(peaks) == expected_peak_number:
        return peaks

    # we need to adapt to the noise level of current channel
    if expected_peak_number > 0:
        epn = expected_peak_number
        half = round(epn / 2)
        theta_peaks = sorted(peaks, key=lambda x: x.theta,
                             reverse=True)[half + 3:epn - 1]
        by_omega = sorted(peaks, key=lambda x: x.omega, reverse=True)
        omega_peaks = by_omega[2:4] + by_omega[half:epn - 1]
        rfu_peaks = sorted(peaks, key=lambda x: x.rfu,
                           reverse=True)[:epn - 1]

        # NOTE: q_theta is prepared but not applied in the filter loop
        # below.  Its fit parameters get their own name so that the omega
        # fit cannot silently replace them inside the closure.
        if theta_peaks[-1].theta < 8:
            theta_peaks.sort()
            thetas = np.array([p.theta for p in theta_peaks])
            rtimes = [p.rtime for p in theta_peaks]
            theta_popt, _ = curve_fit(math_func, rtimes, 0.5 * thetas,
                                      p0=[-1, 1])

            if is_verbosity(4):
                xx = np.linspace(rtimes[0], rtimes[-1] + 2000, 100)
                yy = math_func(xx, *theta_popt)
                plt.plot(xx, yy)
                plt.scatter([p.rtime for p in peaks],
                            [p.theta for p in peaks])
                plt.show()

            q_theta = lambda x: (x.theta >= math_func(x.rtime, *theta_popt)
                                 or x.theta > 100)
        else:
            q_theta = lambda x: x.theta >= min(theta_peaks[-1].theta,
                                               params.min_theta)

        if omega_peaks[-1].omega < 200:
            omega_peaks.sort()
            omegas = np.array([p.omega for p in omega_peaks])
            rtimes = np.array([p.rtime for p in omega_peaks])

            # generate a quadratic threshold for omega
            # generate a quadratic ratio series first
            omega_popt, _ = curve_fit(
                quadratic_math_func,
                [rtimes[0], (rtimes[0] + rtimes[-1]) / 2, rtimes[-1]],
                [0.05, 0.25, 0.05])
            ratios = quadratic_math_func(rtimes, *omega_popt)
            if is_verbosity(4):
                plt.plot(rtimes, ratios)
                plt.show()

            # use the ratios to enforce quadratic threshold
            omega_popt, _ = curve_fit(quadratic_math_func, rtimes,
                                      ratios * omegas, p0=[-1, 1, 0])
            if omega_popt[0] > 0:
                # enforce small flat ratio
                omega_popt, _ = curve_fit(math_func, rtimes, 0.25 * omegas,
                                          p0=[1, 0])
                # convert to 3 params
                omega_popt = np.insert(omega_popt, 0, 0.0)

            if is_verbosity(4):
                plt.scatter(rtimes, omegas)
                xx = np.linspace(rtimes[0], rtimes[-1] + 2000, 100)
                yy = quadratic_math_func(xx, *omega_popt)
                plt.plot(xx, yy)
                plt.scatter([p.rtime for p in peaks],
                            [p.omega for p in peaks])
                plt.show()

            q_omega = lambda x: (
                x.omega >= 100
                or x.omega >= quadratic_math_func(x.rtime, *omega_popt))
        else:
            q_omega = lambda x: x.omega >= min(omega_peaks[-1].omega, 50)

        min_rfu = rfu_peaks[-1].rfu * 0.125

    else:
        # no expectation about the peak count: no omega threshold
        q_omega = lambda x: True
        min_rfu = 2

    # filter for too sharp/thin peaks
    filtered_peaks = []
    for p in peaks:
        cverr(5, str(p))

        if len(filtered_peaks) < 2 and p.area > 50:
            # first two real peaks might be a bit lower
            filtered_peaks.append(p)
            continue

        if not q_omega(p):
            cverr(5, '! q_omega')
            continue

        if p.theta < 1.0 and p.area < 25 and p.omega < 5:
            cverr(5, '! extreme theta & area & omega')
            continue
        if p.rfu < min_rfu:
            cverr(5, '! extreme min_rfu')
            continue
        if p.beta > 25 and p.theta < 0.5:
            cverr(5, '! extreme beta')
            continue
        if p.wrtime < 3:
            continue
        if p.rfu >= 25 and p.beta * p.theta < 6:
            continue
        if p.rfu < 25 and p.beta * p.theta < 3:
            continue

        filtered_peaks.append(p)

    # filter for distance between peaks and their rfu ratio
    peaks = sorted(filtered_peaks, key=lambda x: x.rtime)
    last_idx = len(peaks) - 1
    non_artifact_peaks = []
    for idx, p in enumerate(peaks):
        if idx > 0:
            prev_p = peaks[idx - 1]
            if (p.brtime - prev_p.ertime < params.artifact_dist
                    and p.rfu < params.artifact_ratio * prev_p.rfu):
                # we are artifact, just skip
                print('artifact1:', p)
                continue

        if idx < last_idx:
            next_p = peaks[idx + 1]
            if (next_p.brtime - p.ertime < params.artifact_dist
                    and p.rfu < params.artifact_ratio * next_p.rfu):
                # we are artifact, just skip
                print('artefact2:', p)
                continue

        non_artifact_peaks.append(p)

    peaks = non_artifact_peaks

    cverr(3, '## non artifact peaks: %d' % len(peaks))

    return peaks
def align_pm(peaks, ladder, anchor_pairs=None):
    """Align peaks to the ladder by stepwise pair matching.

    When no ``anchor_pairs`` are given, anchors are estimated with
    ``estimate_pm`` from the peaks inside the anchor rtime window.  The
    window is ``ANCHOR_RTIME_LOWER_BOUND``..``ANCHOR_RTIME_UPPER_BOUND``,
    scaled up proportionally when the longest peak rtime exceeds
    ``PEAK_RTIME_UPPER_BOUND``.  The anchors are then extended upward and
    downward, refined with ``align_dp`` and scored with ``ladder['qcfunc']``.

    Returns an ``AlignResult`` tagged ``pm_strict`` when the strict score is
    above 0.9, otherwise one tagged ``pm_relax`` with the relaxed score.

    The unreachable statements that used to follow the final return were
    removed.
    """
    if not anchor_pairs:
        longest_rtime_peak = max(p.rtime for p in peaks)
        if longest_rtime_peak > PEAK_RTIME_UPPER_BOUND:
            # long runs: stretch the anchor window by the same ratio
            bound_adjust_ratio = longest_rtime_peak / PEAK_RTIME_UPPER_BOUND
            anchor_start = ANCHOR_RTIME_LOWER_BOUND * bound_adjust_ratio
            anchor_end = ANCHOR_RTIME_UPPER_BOUND * bound_adjust_ratio
        else:
            anchor_start = ANCHOR_RTIME_LOWER_BOUND
            anchor_end = ANCHOR_RTIME_UPPER_BOUND
        anchor_peaks = [
            p for p in peaks if anchor_start < p.rtime < anchor_end
        ]
        anchor_pairs, initial_z = estimate_pm(anchor_peaks,
                                              ladder['signature'])
    else:
        rtimes, bpsizes = zip(*anchor_pairs)
        # NOTE(review): this is the estimate_z result object, while the
        # branch above yields the z value from estimate_pm - confirm which
        # one the anchor_z parameter of the helpers is meant to receive.
        initial_z = estimate_z(rtimes, bpsizes, 1)

    # sort a copy so the caller's list is not reordered
    anchor_pairs = sorted(anchor_pairs)

    pairs, z, rss, f = align_upper_pm(peaks, ladder, anchor_pairs, initial_z)
    pairs, z, rss, f = align_lower_pm(peaks, ladder, pairs, initial_z)

    # last dp
    dp_result = align_dp(f.rtimes, f.sizes, f.similarity, z, rss)
    if is_verbosity(1):
        import pprint
        pprint.pprint(dp_result.sized_peaks)
    if is_verbosity(4):
        plot(f.rtimes, f.sizes, dp_result.z,
             [(x[1], x[0]) for x in dp_result.sized_peaks])

    dp_result.sized_peaks = f.get_sized_peaks(dp_result.sized_peaks)

    score, msg = ladder['qcfunc'](dp_result, method='strict')
    if score > 0.9:
        return AlignResult(score, msg, dp_result,
                           const.alignmethod.pm_strict)

    score, msg = ladder['qcfunc'](dp_result, method='relax')
    return AlignResult(score, msg, dp_result, const.alignmethod.pm_relax)
def align_pm(peaks, ladder, anchor_pairs=None):
    """Align peaks to the ladder by stepwise pair matching.

    When no ``anchor_pairs`` are given, anchors are estimated with
    ``estimate_pm`` from the peaks whose rtime lies strictly between 1500 and
    5000.  If the anchors already cover every ladder step, a second-order
    fit is optimised directly; otherwise the anchors are extended upward and
    downward.  The result is refined with ``align_dp`` and scored with
    ``ladder['qcfunc']``.

    Returns an ``AlignResult`` tagged ``pm_strict`` when the strict score is
    above 0.9, otherwise one tagged ``pm_relax`` with the relaxed score.

    The unreachable statements that used to follow the final return were
    removed.
    """
    if not anchor_pairs:
        anchor_peaks = [p for p in peaks if 1500 < p.rtime < 5000]
        # this finds the pair of peaks that best match to the 2nd and
        # next-to-last ladder steps, and does a linear fit to the rest of
        # peaks to find the peaks matched to ladder steps
        anchor_pairs, initial_z = estimate_pm(anchor_peaks,
                                              ladder['signature'])
    else:
        rtimes, bpsizes = zip(*anchor_pairs)
        # NOTE(review): this is the estimate_z result object, while the
        # branch above yields the z value from estimate_pm - confirm which
        # one the anchor_z parameter of align_upper_pm is meant to receive.
        initial_z = estimate_z(rtimes, bpsizes, 1)

    # sort a copy so the caller's list is not reordered
    anchor_pairs = sorted(anchor_pairs)

    # if the number of anchor pairs equals the number of ladder steps, no
    # need to do pair matching
    if len(anchor_pairs) == len(ladder['sizes']):
        f = ZFunc(peaks, ladder['sizes'], anchor_pairs, estimate=True)
        anchor_rtimes, anchor_bpsizes = zip(*anchor_pairs)
        zres = estimate_z(anchor_rtimes, anchor_bpsizes, 2)
        _, z = minimize_score(f, zres.z, 2)
        pairs, rss = f.get_pairs(z)
    else:
        pairs, z, rss, f = align_upper_pm(peaks, ladder, anchor_pairs,
                                          initial_z)
        if is_verbosity(4):
            print(pairs)
        pairs, z, rss, f = align_lower_pm(peaks, ladder, pairs, z)

    # last dp
    dp_result = align_dp(f.rtimes, f.sizes, f.similarity, z, rss)
    if is_verbosity(4):
        import pprint
        pprint.pprint(dp_result.sized_peaks)
        plot(f.rtimes, f.sizes, dp_result.z,
             [(x[1], x[0]) for x in dp_result.sized_peaks])

    dp_result.sized_peaks = f.get_sized_peaks(dp_result.sized_peaks)

    score, msg = ladder['qcfunc'](dp_result, method='strict')
    if score > 0.9:
        return AlignResult(score, msg, dp_result,
                           const.alignmethod.pm_strict)

    score, msg = ladder['qcfunc'](dp_result, method='relax')
    return AlignResult(score, msg, dp_result, const.alignmethod.pm_relax)