Пример #1
0
def rank_chimeras(input_file, output_file, empirical_prob):
    '''
    rank the chimeras according to the empirical distribution
    of encompassing read coverage, spanning read coverage,
    and junction promiscuity

    input_file: path of the chimera file read through
        SpanningChimera.parse
    output_file: path of the tab-delimited file to write; it gets a
        header line followed by one line per kept chimera, made of the
        chimera's own fields plus its anchor histogram and its score
    empirical_prob: handed to scoreatpercentile as the percentile of
        the score distribution used as cutoff (presumably on scipy's
        0-100 scale -- confirm against the caller)

    each chimera is scored as 1 - p, where p comes from
    hist_interp_prob over a histogram of all chimeras' ranking
    properties; chimeras are written in ascending score order and
    those scoring above the cutoff are left out

    NOTE(review): an input without any chimera yields an array with no
    second dimension, so arr.shape[1] raises IndexError before the
    output file is created
    '''
    # profile the chimeras; each record is kept next to its ranking
    # properties so the input only has to be parsed once
    profiled = []
    with open(input_file) as infh:
        for c in SpanningChimera.parse(infh):
            profiled.append((c, get_ranking_props(c)))
    arr = np.array([props for _, props in profiled])
    # choose bin sizes: one set of quantile-derived edges per property
    maxbins = 500
    bins = []
    for d in range(arr.shape[1]):
        bins.append(get_quantiles(arr[:, d], np.linspace(0, 1, maxbins)))
    H, edges = np.histogramdd(arr, bins=bins)
    # now rank each chimera using the empirical distribution
    chimera_scores = []
    for c, props in profiled:
        p = hist_interp_prob(H, edges, props)
        chimera_scores.append((1 - p, c))
    # sorted() is stable, so equal scores keep their input-file order
    sorted_chimera_scores = sorted(chimera_scores, key=operator.itemgetter(0))
    empirical_probs = np.array([x[0] for x in sorted_chimera_scores])
    prob_cutoff = scoreatpercentile(empirical_probs, empirical_prob)

    header = ['#gene5p', 'start5p', 'end5p', 'gene3p',
              'start3p', 'end3p', 'name', 'weighted_cov',
              'strand5p', 'strand3p', 'type', 'distance',
              'encompassing_reads', 'encompassing_reads_plus',
              'encompassing_reads_minus', 'multimap_hist',
              'isize5p', 'isize3p', 'exons5p', 'exons3p',
              'junction_permiscuity5p',
              'junction_permiscuity3p',
              'encompassing_ids', 'encompassing_read1',
              'encompassing_read2', 'junction_id',
              'junction_pos', 'homology5p', 'homology3p',
              'spanning_reads', 'encomp_and_spanning',
              'total_reads', 'spanning_info',
              'breakpoint_hist', 'empirical_prob']
    # the output is only opened once everything that can fail has been
    # computed, and the handle is closed even if writing raises
    with open(output_file, "w") as outfh:
        outfh.write('\t'.join(header) + '\n')
        for p, c in sorted_chimera_scores:
            # scores are ascending, so nothing past the cutoff is kept
            if p > prob_cutoff:
                break
            anchor_hist = get_anchor_hist(c)
            arrstring = ','.join([str(round(x, 1)) for x in anchor_hist])
            fields = c.to_list() + [arrstring, p]
            outfh.write('\t'.join(map(str, fields)) + '\n')
def choose_highest_coverage_chimeras(input_file, ggmap):
    '''
    choose the highest coverage isoform pair using spanning reads,
    encompassing reads, and total reads as a measure.  ties will be
    broken by choosing a single gene pair arbitrarily

    input_file: path of the chimera file read through
        SpanningChimera.parse; it is read twice
    ggmap: passed unchanged to build_junc_coverage_map

    this is a generator: it yields, in file order, every chimera whose
    (5' transcript name, 3' transcript name) pair is contained in the
    result of build_junc_coverage_map
    '''
    # break name into 5'/3' genes linked in a dictionary
    logging.debug("Building junction isoform coverage map")
    # NOTE(review): the handle is closed right after the call, which
    # assumes build_junc_coverage_map consumes the parser before
    # returning
    with open(input_file) as infh:
        kept_isoforms_set = build_junc_coverage_map(
            SpanningChimera.parse(infh), ggmap)
    # write results
    logging.debug("Returning highest coverage chimeras")
    # the handle is released once the generator is exhausted or closed,
    # so callers may delete input_file afterwards
    with open(input_file) as infh:
        for c in SpanningChimera.parse(infh):
            pairkey = (c.mate5p.tx_name, c.mate3p.tx_name)
            if pairkey in kept_isoforms_set:
                yield c
Пример #3
0
def rank_chimeras(input_file, output_file, empirical_prob):
    '''
    rank the chimeras according to the empirical distribution
    of encompassing read coverage, spanning read coverage,
    and junction promiscuity

    input_file: path of the chimera file read through
        SpanningChimera.parse
    output_file: path of the tab-delimited file to write; it gets a
        header line followed by one line per kept chimera, made of the
        chimera's own fields plus its anchor histogram and its score
    empirical_prob: handed to scoreatpercentile as the percentile of
        the score distribution used as cutoff (presumably on scipy's
        0-100 scale -- confirm against the caller)

    each chimera is scored as 1 - p, where p comes from
    hist_interp_prob over a histogram of all chimeras' ranking
    properties; chimeras are written in ascending score order and
    those scoring above the cutoff are left out

    NOTE(review): an input without any chimera yields an array with no
    second dimension, so arr.shape[1] raises IndexError before the
    output file is created
    '''
    # profile the chimeras; each record is kept next to its ranking
    # properties so the input only has to be parsed once
    profiled = []
    with open(input_file) as infh:
        for c in SpanningChimera.parse(infh):
            profiled.append((c, get_ranking_props(c)))
    arr = np.array([props for _, props in profiled])
    # choose bin sizes: one set of quantile-derived edges per property
    maxbins = 500
    bins = []
    for d in range(arr.shape[1]):
        bins.append(get_quantiles(arr[:, d], np.linspace(0, 1, maxbins)))
    H, edges = np.histogramdd(arr, bins=bins)
    # now rank each chimera using the empirical distribution
    chimera_scores = []
    for c, props in profiled:
        p = hist_interp_prob(H, edges, props)
        chimera_scores.append((1 - p, c))
    # sorted() is stable, so equal scores keep their input-file order
    sorted_chimera_scores = sorted(chimera_scores, key=operator.itemgetter(0))
    empirical_probs = np.array([x[0] for x in sorted_chimera_scores])
    prob_cutoff = scoreatpercentile(empirical_probs, empirical_prob)

    header = [
        '#gene5p', 'start5p', 'end5p', 'gene3p', 'start3p', 'end3p', 'name',
        'weighted_cov', 'strand5p', 'strand3p', 'type', 'distance',
        'encompassing_reads', 'encompassing_reads_plus',
        'encompassing_reads_minus', 'multimap_hist', 'isize5p', 'isize3p',
        'exons5p', 'exons3p', 'junction_permiscuity5p',
        'junction_permiscuity3p', 'encompassing_ids', 'encompassing_read1',
        'encompassing_read2', 'junction_id', 'junction_pos', 'homology5p',
        'homology3p', 'spanning_reads', 'encomp_and_spanning', 'total_reads',
        'spanning_info', 'breakpoint_hist', 'empirical_prob'
    ]
    # the output is only opened once everything that can fail has been
    # computed, and the handle is closed even if writing raises
    with open(output_file, "w") as outfh:
        outfh.write('\t'.join(header) + '\n')
        for p, c in sorted_chimera_scores:
            # scores are ascending, so nothing past the cutoff is kept
            if p > prob_cutoff:
                break
            anchor_hist = get_anchor_hist(c)
            arrstring = ','.join([str(round(x, 1)) for x in anchor_hist])
            fields = c.to_list() + [arrstring, p]
            outfh.write('\t'.join(map(str, fields)) + '\n')
def choose_highest_coverage_chimeras(input_file, ggmap):
    '''
    choose the highest coverage isoform pair using spanning reads,
    encompassing reads, and total reads as a measure.  ties will be
    broken by choosing a single gene pair arbitrarily

    input_file: path of the chimera file read through
        SpanningChimera.parse; it is read twice
    ggmap: passed unchanged to build_junc_coverage_map

    this is a generator: it yields, in file order, every chimera whose
    (5' transcript name, 3' transcript name) pair is contained in the
    result of build_junc_coverage_map
    '''
    # break name into 5'/3' genes linked in a dictionary
    logging.debug("Building junction isoform coverage map")
    # NOTE(review): the handle is closed right after the call, which
    # assumes build_junc_coverage_map consumes the parser before
    # returning
    with open(input_file) as infh:
        kept_isoforms_set = build_junc_coverage_map(
            SpanningChimera.parse(infh), ggmap)
    # write results
    logging.debug("Returning highest coverage chimeras")
    # the handle is released once the generator is exhausted or closed,
    # so callers may delete input_file afterwards
    with open(input_file) as infh:
        for c in SpanningChimera.parse(infh):
            pairkey = (c.mate5p.tx_name, c.mate3p.tx_name)
            if pairkey in kept_isoforms_set:
                yield c
def filter_spanning_chimeras(input_file, output_file, gene_file,
                             mate_pval, max_isize):
    '''
    processes chimera isoforms and chooses the one with the
    highest coverage and omits the rest

    input_file: path of the chimera file read through
        SpanningChimera.parse
    output_file: path of the tab-delimited file to write, one line per
        chimera returned by choose_highest_coverage_chimeras
    gene_file: path of the file handed to build_gene_to_genome_map
    mate_pval: accepted for interface compatibility; not used here
    max_isize: passed to filter_insert_size for every chimera

    chimeras passing filter_insert_size are first written to a
    temporary file created by make_temp in the directory of
    output_file; that file is removed again before returning, also
    when a step raises
    '''
    # apply more filters
    tmpfile = make_temp(os.path.dirname(output_file), suffix='.bedpe')
    try:
        with open(input_file) as infh:
            with open(tmpfile, "w") as fh:
                for c in SpanningChimera.parse(infh):
                    if filter_insert_size(c, max_isize):
                        fh.write('\t'.join(map(str, c.to_list())) + '\n')
        # choose best isoform from remaining isoforms
        logging.debug("Building gene/genome index")
        # the gene file stays open while ggmap is in use, since it is
        # not visible here whether the map still reads from it
        with open(gene_file) as genefh:
            ggmap = build_gene_to_genome_map(genefh)
            logging.debug("Choosing highest coverage chimeras")
            with open(output_file, "w") as fh:
                for c in choose_highest_coverage_chimeras(tmpfile, ggmap):
                    fh.write('\t'.join(map(str, c.to_list())) + '\n')
    finally:
        # remove temporary file; the existence check keeps a failure
        # that happened before the file was created from being masked
        if os.path.exists(tmpfile):
            os.remove(tmpfile)
def filter_spanning_chimeras(input_file, output_file, gene_file, mate_pval,
                             max_isize):
    '''
    processes chimera isoforms and chooses the one with the
    highest coverage and omits the rest

    input_file: path of the chimera file read through
        SpanningChimera.parse
    output_file: path of the tab-delimited file to write, one line per
        chimera returned by choose_highest_coverage_chimeras
    gene_file: path of the file handed to build_gene_to_genome_map
    mate_pval: accepted for interface compatibility; not used here
    max_isize: passed to filter_insert_size for every chimera

    chimeras passing filter_insert_size are first written to a
    temporary file created by make_temp in the directory of
    output_file; that file is removed again before returning, also
    when a step raises
    '''
    # apply more filters
    tmpfile = make_temp(os.path.dirname(output_file), suffix='.bedpe')
    try:
        with open(input_file) as infh:
            with open(tmpfile, "w") as fh:
                for c in SpanningChimera.parse(infh):
                    if filter_insert_size(c, max_isize):
                        fh.write('\t'.join(map(str, c.to_list())) + '\n')
        # choose best isoform from remaining isoforms
        logging.debug("Building gene/genome index")
        # the gene file stays open while ggmap is in use, since it is
        # not visible here whether the map still reads from it
        with open(gene_file) as genefh:
            ggmap = build_gene_to_genome_map(genefh)
            logging.debug("Choosing highest coverage chimeras")
            with open(output_file, "w") as fh:
                for c in choose_highest_coverage_chimeras(tmpfile, ggmap):
                    fh.write('\t'.join(map(str, c.to_list())) + '\n')
    finally:
        # remove temporary file; the existence check keeps a failure
        # that happened before the file was created from being masked
        if os.path.exists(tmpfile):
            os.remove(tmpfile)