示例#1
0
def rank_chimeras(input_file, output_file, empirical_prob):
    '''
    Rank chimeras by the empirical distribution of their ranking
    properties and write the best-ranked ones to a tab-delimited file.

    Each chimera parsed from `input_file` is profiled with
    `get_ranking_props` (per the original author: encompassing read
    coverage, spanning read coverage and junction promiscuity).  A
    multi-dimensional histogram is built over those profiles, each
    chimera is scored as `1 - hist_interp_prob(...)`, and chimeras are
    written in ascending score order until a score exceeds the cutoff
    `scoreatpercentile(scores, empirical_prob)`.

    input_file: path of a file readable by `SpanningChimera.parse`.
    output_file: path of the tab-delimited file to (over)write.
    empirical_prob: passed straight to `scoreatpercentile` to pick the
        score cutoff.  NOTE(review): whether this is a fraction or a
        percentage depends on which `scoreatpercentile` is in scope --
        confirm against the import.

    An input containing no chimeras yields a header-only output file.
    '''
    header = ['#gene5p', 'start5p', 'end5p', 'gene3p',
              'start3p', 'end3p', 'name', 'weighted_cov',
              'strand5p', 'strand3p', 'type', 'distance',
              'encompassing_reads', 'encompassing_reads_plus',
              'encompassing_reads_minus', 'multimap_hist',
              'isize5p', 'isize3p', 'exons5p', 'exons3p',
              'junction_permiscuity5p',
              'junction_permiscuity3p',
              'encompassing_ids', 'encompassing_read1',
              'encompassing_read2', 'junction_id',
              'junction_pos', 'homology5p', 'homology3p',
              'spanning_reads', 'encomp_and_spanning',
              'total_reads', 'spanning_info',
              'breakpoint_hist', 'empirical_prob']
    # Read the input once.  Every chimera is kept in memory for sorting
    # anyway, so a second parse/profile pass buys nothing.
    with open(input_file) as infh:
        chimeras = list(SpanningChimera.parse(infh))
    all_props = [get_ranking_props(c) for c in chimeras]

    sorted_chimera_scores = []
    prob_cutoff = None
    if chimeras:
        prop_matrix = np.array(all_props)
        # choose bin edges per property dimension (at most `maxbins`
        # quantile levels each)
        maxbins = 500
        probs = np.linspace(0, 1, maxbins)
        bins = [get_quantiles(prop_matrix[:, d], probs)
                for d in range(prop_matrix.shape[1])]
        H, edges = np.histogramdd(prop_matrix, bins=bins)
        # score each chimera against the empirical distribution
        chimera_scores = [(1 - hist_interp_prob(H, edges, props), c)
                          for props, c in zip(all_props, chimeras)]
        # sort on the score only, so chimera objects are never compared
        sorted_chimera_scores = sorted(chimera_scores,
                                       key=operator.itemgetter(0))
        empirical_probs = np.array([x[0] for x in sorted_chimera_scores])
        prob_cutoff = scoreatpercentile(empirical_probs, empirical_prob)

    # Open the output only after all computation succeeded; the context
    # manager closes it even if writing fails part-way.
    with open(output_file, "w") as outfh:
        outfh.write('\t'.join(header) + '\n')
        for p, c in sorted_chimera_scores:
            # scores are ascending, so everything after this is worse
            if p > prob_cutoff:
                break
            anchor_hist = get_anchor_hist(c)
            arrstring = ','.join([str(round(x, 1)) for x in anchor_hist])
            fields = c.to_list() + [arrstring, p]
            outfh.write('\t'.join(map(str, fields)) + '\n')
示例#2
0
def get_quantiles(a, probs):
    '''
    Derive histogram bin edges for the sample `a`.

    When `a` has no more distinct values than `probs` has entries, the
    sorted distinct values themselves are the edges.  Otherwise one edge
    is taken per entry of `probs` via `scoreatpercentile` on the sorted
    sample, dropping an edge that equals the one just before it.

    Returns the list of edges, or the integer 1 when only a single edge
    was found (rank_chimeras in this file hands the result to
    np.histogramdd as a per-dimension `bins` entry).
    '''
    ordered = np.sort(a)
    distinct = np.unique(a)
    if distinct.shape[0] <= probs.shape[0]:
        edges = list(distinct)
    else:
        edges = []
        for q in probs:
            cut = scoreatpercentile(ordered, q)
            # keep the edge list free of consecutive repeats
            if not edges or cut != edges[-1]:
                edges.append(cut)
    return 1 if len(edges) == 1 else edges
示例#3
0
def get_quantiles(a, probs):
    '''
    Compute bin edges for the values in `a`.

    If the number of distinct values does not exceed len(probs), those
    distinct values (ascending) are used directly.  Otherwise
    `scoreatpercentile` is evaluated on the sorted values for every entry
    of `probs`, and an edge equal to its predecessor is skipped.

    A result with exactly one edge is reported as the integer 1 rather
    than a one-element list; any other result is returned as a list.
    '''
    values_sorted = np.sort(a)
    levels = np.unique(a)
    few_levels = levels.shape[0] <= probs.shape[0]
    edges = list(levels) if few_levels else []
    if not few_levels:
        for prob in probs:
            candidate = scoreatpercentile(values_sorted, prob)
            if edges and candidate == edges[-1]:
                # same as the previous edge: would give a zero-width bin
                continue
            edges.append(candidate)
    if len(edges) != 1:
        return edges
    return 1
示例#4
0
def rank_chimeras(input_file, output_file, empirical_prob):
    '''
    Rank chimeras by the empirical distribution of their ranking
    properties and write the best-ranked ones to a tab-delimited file.

    Each chimera parsed from `input_file` is profiled with
    `get_ranking_props` (per the original author: encompassing read
    coverage, spanning read coverage and junction promiscuity).  A
    multi-dimensional histogram is built over those profiles, each
    chimera is scored as `1 - hist_interp_prob(...)`, and chimeras are
    written in ascending score order until a score exceeds the cutoff
    `scoreatpercentile(scores, empirical_prob)`.

    input_file: path of a file readable by `SpanningChimera.parse`.
    output_file: path of the tab-delimited file to (over)write.
    empirical_prob: passed straight to `scoreatpercentile` to pick the
        score cutoff.  NOTE(review): whether this is a fraction or a
        percentage depends on which `scoreatpercentile` is in scope --
        confirm against the import.

    An input containing no chimeras yields a header-only output file.
    '''
    header = [
        '#gene5p', 'start5p', 'end5p', 'gene3p', 'start3p', 'end3p', 'name',
        'weighted_cov', 'strand5p', 'strand3p', 'type', 'distance',
        'encompassing_reads', 'encompassing_reads_plus',
        'encompassing_reads_minus', 'multimap_hist', 'isize5p', 'isize3p',
        'exons5p', 'exons3p', 'junction_permiscuity5p',
        'junction_permiscuity3p', 'encompassing_ids', 'encompassing_read1',
        'encompassing_read2', 'junction_id', 'junction_pos', 'homology5p',
        'homology3p', 'spanning_reads', 'encomp_and_spanning', 'total_reads',
        'spanning_info', 'breakpoint_hist', 'empirical_prob'
    ]
    # Read the input once.  Every chimera is kept in memory for sorting
    # anyway, so a second parse/profile pass buys nothing.
    with open(input_file) as infh:
        chimeras = list(SpanningChimera.parse(infh))
    all_props = [get_ranking_props(c) for c in chimeras]

    sorted_chimera_scores = []
    prob_cutoff = None
    if chimeras:
        prop_matrix = np.array(all_props)
        # choose bin edges per property dimension (at most `maxbins`
        # quantile levels each)
        maxbins = 500
        probs = np.linspace(0, 1, maxbins)
        bins = [get_quantiles(prop_matrix[:, d], probs)
                for d in range(prop_matrix.shape[1])]
        H, edges = np.histogramdd(prop_matrix, bins=bins)
        # score each chimera against the empirical distribution
        chimera_scores = [(1 - hist_interp_prob(H, edges, props), c)
                          for props, c in zip(all_props, chimeras)]
        # sort on the score only, so chimera objects are never compared
        sorted_chimera_scores = sorted(chimera_scores,
                                       key=operator.itemgetter(0))
        empirical_probs = np.array([x[0] for x in sorted_chimera_scores])
        prob_cutoff = scoreatpercentile(empirical_probs, empirical_prob)

    # Open the output only after all computation succeeded; the context
    # manager closes it even if writing fails part-way.
    with open(output_file, "w") as outfh:
        outfh.write('\t'.join(header) + '\n')
        for p, c in sorted_chimera_scores:
            # scores are ascending, so everything after this is worse
            if p > prob_cutoff:
                break
            anchor_hist = get_anchor_hist(c)
            arrstring = ','.join([str(round(x, 1)) for x in anchor_hist])
            fields = c.to_list() + [arrstring, p]
            outfh.write('\t'.join(map(str, fields)) + '\n')