def compute_pval_thresh(real_pvals, null_pvals, verbose):
    ''' map every distinct observed p-value to its empirical false
    discovery rate (FDR).

    For each value in real_pvals the FDR estimate is the count reported by
    num_pvals_better for the null (random) p-values divided by the count
    reported for the real p-values. When the real count is 0 the estimate
    is set to 0.0 to avoid dividing by zero.

    real_pvals: collection of observed p-values (must support len())
    null_pvals: collection of p-values from the null / shuffled data
    verbose: if true, show a progress bar; also forwarded to
             num_pvals_better

    returns: dictionary {observed p-value: FDR estimate as float}'''

    pval_thresh = {}

    if verbose:
        label = ">> computing thresholds: "
        progress = ProgressBar(len(real_pvals), label=label)

    for obs_pval in real_pvals:

        if verbose:
            progress.next()

        # identical p-values share one estimate; skip the recount
        if obs_pval in pval_thresh:
            continue

        real_pvals_better = num_pvals_better(obs_pval, real_pvals, verbose)
        null_pvals_better = num_pvals_better(obs_pval, null_pvals, verbose)

        if real_pvals_better == 0:
            qval = 0.0
        else:
            # force true division: with integer counts a bare "/" truncates
            # under Python 2 unless the module enables __future__ division
            qval = float(null_pvals_better) / real_pvals_better

        pval_thresh[obs_pval] = qval

    if verbose:
        progress.end()

    return pval_thresh
def find_trimmed_peaks(peak_data, genome, chrom, trackname, max_width,
                       verbose):
    ''' trim peaks to min width with max signal.

    Each peak no wider than max_width is replaced by the sub-interval that
    max_contiguous_signal reports for the track data under the peak. Peaks
    wider than max_width are passed through unchanged. The p-value of a
    peak is carried over as-is in both cases.

    peak_data: iterable of (start, end, pvalue) tuples (must support len()
               when verbose is true)
    genome: object indexed as genome[chrom.name][start:end, trackname]
    chrom: object whose .name selects the chromosome in genome
    trackname: track selector used in the data slice
    max_width: peaks with end - start greater than this are not trimmed
    verbose: if true, show a progress bar

    returns: data with same form as peak_data, trimmed'''

    # TODO: recalculate pvalue for the trimmed and return that

    # XXX: there is likely a faster way to do this with native numpy
    # functions. trimming cost grows quickly with window size, which is
    # why peaks wider than max_width are skipped rather than trimmed

    trimmed_peaks = []

    if verbose:
        progress = ProgressBar(len(peak_data), label=">> trimming peaks: ")

    for peak_start, peak_end, pval in peak_data:

        # advance once per input peak, including the ones skipped below,
        # so the bar matches the len(peak_data) it was created with
        if verbose:
            progress.next()

        # too wide to trim: keep the original coordinates
        if peak_end - peak_start > max_width:
            trimmed_peaks.append((peak_start, peak_end, pval))
            continue

        continuous = genome[chrom.name][peak_start:peak_end, trackname]

        trim_peak_start, trim_peak_end = max_contiguous_signal(continuous)

        # the trimmed bounds are offsets from the original peak start
        trimmed_start = peak_start + trim_peak_start
        trimmed_end = peak_start + trim_peak_end

        trimmed_peaks.append((trimmed_start, trimmed_end, pval))

    # clean up progress bar
    if verbose:
        progress.end()

    return trimmed_peaks
def identify_local_peaks(chromosome, track_index, peak_width, lambda_base,
                         log_pval_thresh, local_lambda, shuffle_data,
                         verbose):
    ''' identify peaks within a chromosome.

    Every supercontig yielded by chromosome.itercontinuous() is tiled with
    consecutive windows of int(peak_width) positions. A window is reported
    as a peak when it contains at least one non-NaN value and the value
    returned by calc_log_pvalue is truthy and not below log_pval_thresh.

    chromosome: object providing itercontinuous(), which yields
                (supercontig, continuous) pairs
    track_index: column of continuous holding the track to scan
    peak_width: window width; truncated to int for stepping
    lambda_base: lambda used for every window, and the lower bound when
                 local lambdas are enabled
    log_pval_thresh: windows scoring below this are discarded
    local_lambda: if true, use the max of calc_local_lambdas(...) and
                  lambda_base for each window
    shuffle_data: if true, shuffle each supercontig's track data in place
                  before scanning
    verbose: if true, show a progress bar; also forwarded to
             calc_log_pvalue

    returns: list of (start, end, pvalue) tuples'''

    # imported here so the empty-window test below does not depend on how
    # the module-level isnan name was imported
    import numpy

    local_peaks = []

    # loop-invariant: the window step / width in whole positions
    window_width = int(peak_width)

    if verbose:
        progress = ProgressBar(num_supercontigs(chromosome),
                               label=">> calling peaks: ")

    for supercontig, continuous in chromosome.itercontinuous():

        if verbose:
            progress.next()

        # load data into memory
        track_continuous = continuous[:, track_index]

        # shuffle the data if requested
        if shuffle_data:
            shuffle(track_continuous)

        # generate window ranges i.e. peak widths
        for peak_start in xrange(0, len(track_continuous), window_width):

            # NOTE(review): for the last window peak_end can exceed
            # len(track_continuous); the slice is shortened but the
            # reported end coordinate is not - confirm this is intended
            peak_end = peak_start + window_width
            peak_data = track_continuous[peak_start:peak_end]

            # skip the window if it is empty. nansum() cannot be used for
            # this: on numpy >= 1.9 it returns 0, not NaN, for all-NaN
            # input, so test the values directly
            if numpy.isnan(peak_data).all():
                continue

            # calculate local lambdas and choose max
            if local_lambda:
                lambda_values = calc_local_lambdas(peak_width,
                                                   track_continuous,
                                                   peak_start, peak_end)
                lambda_values.append(lambda_base)
                lambda_value = max(lambda_values)
            else:
                lambda_value = lambda_base

            # initial pvalue for the region
            log_pvalue = calc_log_pvalue(peak_data, lambda_value, verbose)

            # check that the pvalue is defined & that it meets the
            # threshold. note the truthiness test also rejects a value of
            # exactly 0
            if not log_pvalue or log_pvalue < log_pval_thresh:
                continue

            # calculate chromosomal coords relative to current
            # contig coords
            chrom_start = supercontig.start + peak_start
            chrom_end = supercontig.start + peak_end

            # save current peak
            local_peaks.append((chrom_start, chrom_end, log_pvalue))

    if verbose:
        progress.end()

    return local_peaks