def rank_chimeras(input_file, output_file, empirical_prob):
    '''
    rank the chimeras according to the empirical distribution
    of encompassing read coverage, spanning read coverage,
    and junction promiscuity

    Every chimera parsed from `input_file` is profiled with
    `get_ranking_props`.  A multi-dimensional histogram of the profiles is
    built using the bin edges chosen by `get_quantiles`, and each chimera
    is scored as ``1 - p`` where ``p`` is the value returned by
    `hist_interp_prob`.  Chimeras are written to `output_file` in ascending
    score order; output stops at the first score greater than the cutoff
    ``scoreatpercentile(scores, empirical_prob)``.

    input_file: path of a file readable by `SpanningChimera.parse`
    output_file: path of the tab-delimited file to write (header line
        followed by one line per retained chimera)
    empirical_prob: forwarded to `scoreatpercentile` to select the cutoff

    When `input_file` contains no chimeras only the header line is written.
    '''
    # NOTE(review): rank_chimeras is defined a second time later in this
    # file; that later definition replaces this one at import time.
    header_fields = ['#gene5p', 'start5p', 'end5p',
                     'gene3p', 'start3p', 'end3p',
                     'name', 'weighted_cov',
                     'strand5p', 'strand3p',
                     'type', 'distance',
                     'encompassing_reads',
                     'encompassing_reads_plus',
                     'encompassing_reads_minus',
                     'multimap_hist',
                     'isize5p', 'isize3p',
                     'exons5p', 'exons3p',
                     'junction_permiscuity5p',
                     'junction_permiscuity3p',
                     'encompassing_ids',
                     'encompassing_read1',
                     'encompassing_read2',
                     'junction_id', 'junction_pos',
                     'homology5p', 'homology3p',
                     'spanning_reads',
                     'encomp_and_spanning',
                     'total_reads',
                     'spanning_info',
                     'breakpoint_hist',
                     'empirical_prob']
    maxbins = 500

    # parse the input once; every chimera is needed again for scoring and
    # output, so keeping them avoids reading and profiling the file twice
    with open(input_file) as infh:
        chimeras = list(SpanningChimera.parse(infh))
    profiles = [get_ranking_props(c) for c in chimeras]

    sorted_chimera_scores = []
    prob_cutoff = None
    if chimeras:
        arr = np.array(profiles)
        # choose bin edges independently for each profiled dimension
        probs = np.linspace(0, 1, maxbins)
        bins = [get_quantiles(arr[:, d], probs)
                for d in range(arr.shape[1])]
        H, edges = np.histogramdd(arr, bins=bins)
        # now rank each chimera using the empirical distribution
        chimera_scores = [(1 - hist_interp_prob(H, edges, props), c)
                          for props, c in zip(profiles, chimeras)]
        # sort on the score only so chimera objects are never compared
        sorted_chimera_scores = sorted(chimera_scores,
                                       key=operator.itemgetter(0))
        empirical_probs = np.array([x[0] for x in sorted_chimera_scores])
        prob_cutoff = scoreatpercentile(empirical_probs, empirical_prob)

    # the output file is only created once all scores are available
    with open(output_file, "w") as outfh:
        outfh.write('\t'.join(header_fields) + '\n')
        for p, c in sorted_chimera_scores:
            # scores are ascending, so everything after this is also cut
            if p > prob_cutoff:
                break
            anchor_hist = get_anchor_hist(c)
            arrstring = ','.join([str(round(x, 1)) for x in anchor_hist])
            fields = c.to_list() + [arrstring, p]
            outfh.write('\t'.join(map(str, fields)) + '\n')
def get_quantiles(a, probs):
    '''
    choose histogram bin edges for the values in `a`

    When `a` has no more distinct values than `probs` has entries, the
    distinct values themselves are used as edges.  Otherwise one edge is
    computed per entry of `probs` by calling `scoreatpercentile` on the
    sorted values, and an edge equal to the previously kept edge is
    skipped.

    Returns the list of edges, or the integer 1 when exactly one edge
    was produced.
    '''
    ordered = np.sort(a)
    distinct = np.unique(a)
    limit = probs.shape[0]
    if distinct.shape[0] > limit:
        edges = []
        for prob in probs:
            edge = scoreatpercentile(ordered, prob)
            # keep an edge only if it differs from the last one kept
            if not edges or edge != edges[-1]:
                edges.append(edge)
    else:
        edges = list(distinct)
    return 1 if len(edges) == 1 else edges
def rank_chimeras(input_file, output_file, empirical_prob):
    '''
    rank the chimeras according to the empirical distribution
    of encompassing read coverage, spanning read coverage,
    and junction promiscuity

    Every chimera parsed from `input_file` is profiled with
    `get_ranking_props`.  A multi-dimensional histogram of the profiles is
    built using the bin edges chosen by `get_quantiles`, and each chimera
    is scored as ``1 - p`` where ``p`` is the value returned by
    `hist_interp_prob`.  Chimeras are written to `output_file` in ascending
    score order; output stops at the first score greater than the cutoff
    ``scoreatpercentile(scores, empirical_prob)``.

    input_file: path of a file readable by `SpanningChimera.parse`
    output_file: path of the tab-delimited file to write (header line
        followed by one line per retained chimera)
    empirical_prob: forwarded to `scoreatpercentile` to select the cutoff

    When `input_file` contains no chimeras only the header line is written.
    '''
    # NOTE(review): an earlier definition of rank_chimeras exists in this
    # file; this later definition is the one in effect after import.
    header_fields = [
        '#gene5p', 'start5p', 'end5p', 'gene3p', 'start3p', 'end3p',
        'name', 'weighted_cov', 'strand5p', 'strand3p', 'type', 'distance',
        'encompassing_reads', 'encompassing_reads_plus',
        'encompassing_reads_minus', 'multimap_hist', 'isize5p', 'isize3p',
        'exons5p', 'exons3p', 'junction_permiscuity5p',
        'junction_permiscuity3p', 'encompassing_ids', 'encompassing_read1',
        'encompassing_read2', 'junction_id', 'junction_pos', 'homology5p',
        'homology3p', 'spanning_reads', 'encomp_and_spanning',
        'total_reads', 'spanning_info', 'breakpoint_hist', 'empirical_prob'
    ]
    maxbins = 500

    # parse the input once; every chimera is needed again for scoring and
    # output, so keeping them avoids reading and profiling the file twice
    with open(input_file) as infh:
        chimeras = list(SpanningChimera.parse(infh))
    profiles = [get_ranking_props(c) for c in chimeras]

    sorted_chimera_scores = []
    prob_cutoff = None
    if chimeras:
        arr = np.array(profiles)
        # choose bin edges independently for each profiled dimension
        probs = np.linspace(0, 1, maxbins)
        bins = [get_quantiles(arr[:, d], probs)
                for d in range(arr.shape[1])]
        H, edges = np.histogramdd(arr, bins=bins)
        # now rank each chimera using the empirical distribution
        chimera_scores = [(1 - hist_interp_prob(H, edges, props), c)
                          for props, c in zip(profiles, chimeras)]
        # sort on the score only so chimera objects are never compared
        sorted_chimera_scores = sorted(chimera_scores,
                                       key=operator.itemgetter(0))
        empirical_probs = np.array([x[0] for x in sorted_chimera_scores])
        prob_cutoff = scoreatpercentile(empirical_probs, empirical_prob)

    # the output file is only created once all scores are available
    with open(output_file, "w") as outfh:
        outfh.write('\t'.join(header_fields) + '\n')
        for p, c in sorted_chimera_scores:
            # scores are ascending, so everything after this is also cut
            if p > prob_cutoff:
                break
            anchor_hist = get_anchor_hist(c)
            arrstring = ','.join([str(round(x, 1)) for x in anchor_hist])
            fields = c.to_list() + [arrstring, p]
            outfh.write('\t'.join(map(str, fields)) + '\n')