Exemplo n.º 1
0
def main():
    """Plot fragment coverage around GFF peak midpoints from a BAM file.

    Pipeline:
      1. Filter the BAM for mapping quality > 0 into a temp file.
      2. Count fragments (down-weighting multi-mappers by their NH tag)
         and hash multi-mapping read names.
      3. Extend each GFF peak to options.range bp around its midpoint.
      4. Intersect the filtered reads with the extended peaks via
         intersectBed and accumulate per-peak coverage vectors.
      5. Plot mean (or geometric mean) coverage via make_output, and
         optionally a plot per well-covered peak.

    NOTE: Python 2 code ('print >>'); '/' on ints below is floor division,
    which the windowing arithmetic relies on.
    """
    usage = 'usage: %prog [options] <gff> <bam>'
    parser = OptionParser(usage)
    parser.add_option('-g', dest='geo_mean', default=False, action='store_true', help='Compute geometric mean of individual peak coverages [Default: %default]')
    parser.add_option('-i', dest='individual_plots', default=False, action='store_true', help='Print a coverage plot for every individual peak [Default: %default]')
    parser.add_option('-o', dest='out_prefix', default='peak_cov', help='Output prefix [Default: %default]')
    parser.add_option('-p', dest='properly_paired', default=False, action='store_true', help='Count entire fragments for only properly paired reads [Default: %default]')
    parser.add_option('-u', dest='range', default=500, type='int', help='Range around peak middle [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide gtf file and BAM file')
    else:
        peaks_gff = args[0]
        bam_file = args[1]

    # filter BAM for mapping quality
    bam_mapq_fd, bam_mapq_file = tempfile.mkstemp(dir='%s/research/scratch' % os.environ['HOME'])
    bam_in = pysam.Samfile(bam_file, 'rb')
    bam_mapq_out = pysam.Samfile(bam_mapq_file, 'wb', template=bam_in)
    for aligned_read in bam_in:
        if aligned_read.mapq > 0:
            bam_mapq_out.write(aligned_read)
    bam_mapq_out.close()
    # close the input BAM too (was leaked in the original)
    bam_in.close()

    # count fragments and hash multi-mappers
    # NOTE(review): num_fragments is computed but never used below; kept
    # for parity with related scripts that normalize by it.
    num_fragments = 0
    multi_maps = {}
    bam_mapq_in = pysam.Samfile(bam_mapq_file, 'rb')
    for aligned_read in bam_mapq_in:
        # NH = number of reported alignments; weight multi-mappers down
        nh = aligned_read.opt('NH')
        if options.properly_paired:
            # BUGFIX: pysam's attribute is 'is_proper_pair', not
            # 'is_properly_paired' (the latter raises AttributeError)
            if aligned_read.is_proper_pair:
                num_fragments += 0.5/nh
        else:
            if aligned_read.is_paired:
                # each mate contributes half a fragment
                num_fragments += 0.5/nh
            else:
                num_fragments += 1.0/nh

        if nh > 1:
            multi_maps[aligned_read.qname] = nh
    bam_mapq_in.close()

    # extend GFF entries to range around the peak midpoint
    peaks_gff_range_fd, peaks_gff_range_file = tempfile.mkstemp()
    peaks_gff_range_out = open(peaks_gff_range_file, 'w')
    for line in open(peaks_gff):
        a = line.split('\t')

        pstart = int(a[3])
        pend = int(a[4])
        peak_mid = pstart + (pend-pstart)/2

        a[3] = str(peak_mid - options.range/2 - 1)
        a[4] = str(peak_mid + options.range/2 + 1)

        # trailing comma: the last GFF field retains its newline
        print >> peaks_gff_range_out, '\t'.join(a),
    peaks_gff_range_out.close()

    # initialize coverage counters
    peak_cov_individual = {}
    peak_reads = {}

    # count reads overlapping each extended peak
    p = subprocess.Popen('intersectBed -split -wo -bed -abam %s -b %s' % (bam_mapq_file,peaks_gff_range_file), shell=True, stdout=subprocess.PIPE)
    for line in p.stdout:
        a = line.split('\t')

        rstart = int(a[1])
        rend = int(a[2])
        rheader = a[3]

        # because intersectBed screws up indels near endpoints
        if rstart < rend:
            pstart = int(a[15])
            pend = int(a[16])
            peak_id = gff.gtf_kv(a[20])['id']
            peak_reads[peak_id] = peak_reads.get(peak_id,0) + 1

            peak_mid = pstart + (pend-pstart)/2
            peak_range_start = peak_mid - options.range/2
            peak_range_end = peak_mid + options.range/2

            # clip the read interval to the peak window
            range_start = max(rstart, peak_range_start)
            range_end = min(rend, peak_range_end)

            if not peak_id in peak_cov_individual:
                peak_cov_individual[peak_id] = [0.0]*(1+options.range)
            for i in range(range_start - peak_range_start, range_end - peak_range_start + 1):
                # split multi-mapped reads across their NH alignments
                peak_cov_individual[peak_id][i] += 1.0/multi_maps.get(rheader,1)

    p.communicate()

    # combine individual peak coverages into a single profile
    peak_cov = [0.0]*(1+options.range)
    for i in range(len(peak_cov)):
        if options.geo_mean:
            # pseudocount of 1 avoids zeros in the geometric mean
            peak_cov[i] = stats.geo_mean([1+peak_cov_individual[peak_id][i] for peak_id in peak_cov_individual])
        else:
            peak_cov[i] = stats.mean([peak_cov_individual[peak_id][i] for peak_id in peak_cov_individual])

    # output the combined profile
    make_output(peak_cov, options.out_prefix, options.range)

    if options.individual_plots:
        # recreate the per-peak plot directory from scratch
        individual_dir = '%s_individuals' % options.out_prefix
        if os.path.isdir(individual_dir):
            shutil.rmtree(individual_dir)
        os.mkdir(individual_dir)

        # only plot peaks with substantial read support
        for peak_id in peak_cov_individual:
            if peak_reads[peak_id] > 150:
                make_output(peak_cov_individual[peak_id], '%s/%s' % (individual_dir,peak_id), options.range)

    # clean up temp files
    os.close(bam_mapq_fd)
    os.remove(bam_mapq_file)
    os.close(peaks_gff_range_fd)
    os.remove(peaks_gff_range_file)
Exemplo n.º 2
0
def main():
    """Print promoter GTF entries for each gene in a reference GTF.

    For every gene, a representative transcript is chosen: the most
    expressed isoform when a cufflinks fpkm_tracking file is given (-f),
    otherwise the isoform with the most upstream TSS. A 'promoter' GTF
    line spanning options.upstream bp before to options.downstream bp
    after that TSS (strand-aware) is printed to stdout; warnings go to
    stderr for genes lacking strand or too close to the chromosome start.
    """
    usage = 'usage: %prog [options] <ref_gtf>'
    parser = OptionParser(usage)
    parser.add_option('-d', dest='downstream', type='int', default=1000, help='Downstream bp for promoters [Default: %default]')
    parser.add_option('-f', dest='fpkm_tracking', help='Use cufflinks FPKM estimates to choose the most expressed isoform')
    parser.add_option('-u', dest='upstream', type='int', default=1000, help='Upstream bp for promoters [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide reference GTF')
    else:
        ref_gtf = args[0]

    g2t = gff.g2t(ref_gtf)
    transcripts = gff.read_genes(ref_gtf)

    # annotation source is column 2 of the first GTF line;
    # use a context manager so the handle is not leaked (was open().readline())
    with open(ref_gtf) as ref_gtf_in:
        source = ref_gtf_in.readline().split()[1]

    if options.fpkm_tracking:
        iso_fpkm_tracking = cufflinks.fpkm_tracking(options.fpkm_tracking)

    for gene_id in g2t:
        gene_transcripts = list(g2t[gene_id])
        gene_strand = transcripts[gene_transcripts[0]].strand
        if gene_strand not in ['+','-']:
            print('WARNING: %s discluded for lack of strand' % gene_id, file=sys.stderr)
            continue

        # choose TSS
        if options.fpkm_tracking:
            # find most expressed isoform (geometric mean of 1+FPKM across samples)
            promoter_tid = gene_transcripts[0]
            max_fpkm = stats.geo_mean([1+fpkm for fpkm in iso_fpkm_tracking.gene_expr(promoter_tid)])
            for transcript_id in gene_transcripts[1:]:
                transcript_fpkm = stats.geo_mean([1+fpkm for fpkm in iso_fpkm_tracking.gene_expr(transcript_id)])
                # a NaN running max is always replaced by a real value
                if math.isnan(max_fpkm) or transcript_fpkm > max_fpkm:
                    promoter_tid = transcript_id
                    max_fpkm = transcript_fpkm

            # get isoform tss: first exon start on '+', last exon end on '-'
            if gene_strand == '+':
                tss = transcripts[promoter_tid].exons[0].start
            else:
                tss = transcripts[promoter_tid].exons[-1].end

        else:
            # find most upstream tss across the gene's isoforms
            promoter_tid = gene_transcripts[0]
            if gene_strand == '+':
                upstream_tss = transcripts[promoter_tid].exons[0].start
            else:
                upstream_tss = transcripts[promoter_tid].exons[-1].end

            for transcript_id in gene_transcripts[1:]:
                if gene_strand == '+':
                    # on '+', more upstream means a smaller coordinate
                    transcript_pos = transcripts[transcript_id].exons[0].start
                    if transcript_pos < upstream_tss:
                        promoter_tid = transcript_id
                        upstream_tss = transcript_pos
                else:
                    # on '-', more upstream means a larger coordinate
                    transcript_pos = transcripts[transcript_id].exons[-1].end
                    if transcript_pos > upstream_tss:
                        promoter_tid = transcript_id
                        upstream_tss = transcript_pos

            tss = upstream_tss

        # print a promoter entry around the tss, skipping genes whose
        # window would run past the chromosome start (coordinate < 1)
        if gene_strand == '+':
            if tss - options.upstream < 1:
                print('WARNING: %s discluded for nearness to chromosome end' % gene_id, file=sys.stderr)
            else:
                tx = transcripts[promoter_tid]
                cols = [tx.chrom, source, 'promoter', str(tss-options.upstream), str(tss+options.downstream), '.', tx.strand, '.', gff.kv_gtf(tx.kv)]
                print('\t'.join(cols))

        else:
            if tss - options.downstream < 1:
                print('WARNING: %s discluded for nearness to chromosome end' % gene_id, file=sys.stderr)
            else:
                tx = transcripts[promoter_tid]
                cols = [tx.chrom, source, 'promoter', str(tss-options.downstream), str(tss+options.upstream), '.', tx.strand, '.', gff.kv_gtf(tx.kv)]
                print('\t'.join(cols))
Exemplo n.º 3
0
def main():
    """Plot coverage heatmaps and a meta-profile of BAM reads around GFF features.

    Samples up to -m features from the GFF, extends each to -u bp around its
    midpoint, computes per-feature coverage for the BAM files (and optional
    control BAMs) via compute_coverage, normalizes with a pseudocount, and
    renders heatmap(s) plus a meta-coverage plot using R scripts in $RDIR.

    NOTE: Python 2 code ('print >>'); '/' on ints below is floor division,
    which the windowing arithmetic relies on.
    """
    usage = 'usage: %prog [options] <gff> <bam1,bam2,...>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='control_bam_files', default=None, help='Control BAM files (comma separated)')
    parser.add_option('-l', dest='log', default=False, action='store_true', help='log2 coverage [Default: %default]')
    parser.add_option('-k', dest='gtf_key', default=None, help='GTF key to hash gff entries by')
    parser.add_option('-m', dest='max_features', default=2000, type='int', help='Maximum number of features to plot [Default: %default]')
    parser.add_option('-o', dest='output_pre', default='bam', help='Output prefix [Default: %default]')
    parser.add_option('-s', dest='sorted_gene_files', help='Files of sorted gene lists. Plot heatmaps in their order')
    parser.add_option('-u', dest='range', default=2000, type='int', help='Range around peak middle [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide gtf file and BAM file')
    else:
        gff_file = args[0]
        bam_files = args[1].split(',')

    if options.control_bam_files:
        control_bam_files = options.control_bam_files.split(',')

    ############################################
    # extend GFF entries to range (and sample)
    ############################################
    # count the features to set the sampling probability
    feature_count = 0
    for line in open(gff_file):
        feature_count += 1

    # keep each feature with probability so that ~max_features survive
    sample_prob = min(1.0, options.max_features / float(feature_count))

    gff_range_fd, gff_range_file = tempfile.mkstemp()
    gff_range_out = open(gff_range_file, 'w')

    for line in open(gff_file):
        a = line.split('\t')
        
        # recenter the entry to range bp around its midpoint
        start = int(a[3])
        end = int(a[4])
        mid = start + (end-start)/2
        a[3] = str(mid - options.range/2)
        a[4] = str(mid + options.range/2)
        a[-1] = a[-1].rstrip()

        if random.random() < sample_prob:
            print >> gff_range_out, '\t'.join(a)

    gff_range_out.close()

    ############################################
    # compute coverage
    ############################################
    coverage, fragments = compute_coverage(gff_range_file, bam_files, options.gtf_key)
    if options.control_bam_files:
        coverage_control, fragments_control = compute_coverage(gff_range_file, control_bam_files, options.gtf_key)

    # clean up the temporary extended GFF
    os.close(gff_range_fd)
    os.remove(gff_range_file)

    ############################################
    # normalize
    ############################################
    # normalize coverages (and add pseudocounts)
    for feature_id in coverage:
        for i in range(len(coverage[feature_id])):
            coverage[feature_id][i] = (1+coverage[feature_id][i])/fragments
            if options.control_bam_files:
                coverage_control[feature_id][i] = (1+coverage_control[feature_id][i])/fragments_control

    ############################################
    # sorted genes
    ############################################
    features_sorted = []
    if options.sorted_gene_files:
        # for each sorted list
        for sorted_gene_file in options.sorted_gene_files.split(','):
            # collect feature_id's
            features_sorted.append([])
            for line in open(sorted_gene_file):
                feature_id = line.split()[0]
                # verify randomly selected
                if feature_id in coverage:
                    features_sorted[-1].append(feature_id)

    else:
        # tuple feature_id's with mean coverage
        feature_id_stat = []
        for feature_id in coverage:
            if options.control_bam_files:
                # mean log2 ratio of primary over control coverage
                feature_stat = stats.mean([math.log(coverage[feature_id][i],2) - math.log(coverage_control[feature_id][i],2) for i in range(len(coverage[feature_id]))])
            else:
                feature_stat = stats.geo_mean([coverage[feature_id][i] for i in range(len(coverage[feature_id]))])

            feature_id_stat.append((feature_stat,feature_id))

        # sort descending by the summary statistic
        feature_id_stat.sort(reverse=True)

        # store as the only sorted list
        features_sorted.append([feature_id for (feature_stat, feature_id) in feature_id_stat])

    ############################################
    # plot heatmap(s)
    ############################################
    # if multiple sorts, create a dir for the plots
    if len(features_sorted) > 1:
        if not os.path.isdir('%s_heat' % options.output_pre):
            os.mkdir('%s_heat' % options.output_pre)

    for s in range(len(features_sorted)):
        df = {'Index':[], 'Feature':[], 'Coverage':[]}
        for f in range(len(features_sorted[s])):
            feature_id = features_sorted[s][f]
            # Index runs from -range/2 to +range/2 relative to the midpoint
            for i in range(-options.range/2,options.range/2+1):
                df['Index'].append(i)
                df['Feature'].append(f)

                if options.log:
                    cov = math.log(coverage[feature_id][i+options.range/2],2)
                else:
                    cov = coverage[feature_id][i+options.range/2]

                # control normalization: subtract in log space, divide otherwise
                if options.control_bam_files:
                    if options.log:
                        cov -= math.log(coverage_control[feature_id][i+options.range/2],2)
                    else:
                        cov = cov / coverage_control[feature_id][i+options.range/2]

                df['Coverage'].append('%.4e' % cov)

        r_script = '%s/bam_heat_heat.r' % os.environ['RDIR']
        if len(features_sorted) == 1:
            out_pdf = '%s_heat.pdf' % options.output_pre
        else:
            # name each heatmap after its sorted gene list file
            sorted_gene_file = options.sorted_gene_files.split(',')[s]
            sorted_gene_pre = os.path.splitext(os.path.split(sorted_gene_file)[-1])[0]
            out_pdf = '%s_heat/%s.pdf' % (options.output_pre,sorted_gene_pre)

        ggplot.plot(r_script, df, [out_pdf, options.control_bam_files!=None], df_file='df_heat.txt')

    ############################################
    # plot meta-coverage
    ############################################
    df = {'Index':[], 'Coverage':[]}
    if options.control_bam_files:
        df['Type'] = []

    for i in range(-options.range/2,options.range/2+1):
        df['Index'].append(i)

        # geometric mean approximates averaging in log space
        if options.log:
            df['Coverage'].append(stats.geo_mean([coverage[feature_id][i+options.range/2] for feature_id in coverage]))
        else:
            df['Coverage'].append(stats.mean([coverage[feature_id][i+options.range/2] for feature_id in coverage]))

        if options.control_bam_files:
            df['Type'].append('Primary')

            # interleave a matching control row at the same index
            df['Index'].append(i)
            df['Type'].append('Control')
            if options.log:
                df['Coverage'].append(stats.geo_mean([coverage_control[feature_id][i+options.range/2] for feature_id in coverage_control]))
            else:
                df['Coverage'].append(stats.mean([coverage_control[feature_id][i+options.range/2] for feature_id in coverage_control]))

    r_script = '%s/bam_heat_meta.r' % os.environ['RDIR']
    out_pdf = '%s_meta.pdf' % options.output_pre

    ggplot.plot(r_script, df, [out_pdf], df_file='df_meta.txt')
Exemplo n.º 4
0
def main():
    """Plot event coverage around (mode 'mid') or across (mode 'span') GFF anchors.

    Coverage of the event BAM/GFF files (and optional control files) is
    computed over preprocessed anchors, normalized by the total event count
    with a pseudocount, optionally drawn as per-anchor heatmaps (-e), and
    always drawn as a meta-coverage profile using R scripts in $RDIR.

    NOTE: Python 2 code ('print >>'); '/' on ints below is floor division,
    which the windowing arithmetic relies on.
    """
    usage = 'usage: %prog [options] <mode=mid/span> <anchor_gff> <event_bam1,event_bam2,...|event_gff1,event_gff2,...>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='max_anchors', default=1000, type='int', help='Maximum number of anchors to consider [Default: %default]')
    parser.add_option('-c', dest='control_files', default=None, help='Control BAM or GFF files (comma separated)')
    # BUGFIX: '-e' is a boolean flag; without action='store_true', optparse's
    # default 'store' action would require and consume an argument for it
    parser.add_option('-e', dest='plot_heat', default=False, action='store_true', help='Plot as a heatmap [Default: %default]')
    parser.add_option('-l', dest='log', default=False, action='store_true', help='log2 coverage [Default: %default]')
    parser.add_option('-o', dest='output_pre', default='gff_cov', help='Output prefix [Default: %default]')
    parser.add_option('-s', dest='sorted_gene_files', help='Files of sorted gene lists. Plot heatmaps in their order')

    # span-mode options
    parser.add_option('-b', dest='bins', default=100, type='int', help='Number of bins across the gene span [Default: %default]')
    parser.add_option('-m', dest='min_length', default=None, type='int', help='Minimum anchor length [Default: %default]')

    # mid-mode option
    parser.add_option('-w', dest='window', default=2000, type='int', help='Window around peak middle [Default: %default]')

    (options,args) = parser.parse_args()

    if len(args) != 3:
        parser.error('Must provide mode, anchor GFF, and BAM/GFF file(s)')
    else:
        mode = args[0]
        anchor_gff = args[1]
        event_files = args[2].split(',')

    if options.control_files:
        control_files = options.control_files.split(',')

    anchor_is_gtf = (anchor_gff[-4:] == '.gtf')

    # preprocess anchor GFF (sample, length-filter, extend) into a temp file
    prep_anchor_fd, prep_anchor_gff = preprocess_anchors(anchor_gff, mode, options.max_anchors, anchor_is_gtf, options.min_length, options.window)

    ############################################
    # compute coverage
    ############################################
    coverage, events = compute_coverage(prep_anchor_gff, event_files, mode, anchor_is_gtf, options.bins)
    if options.control_files:
        coverage_control, events_control = compute_coverage(prep_anchor_gff, control_files, mode, anchor_is_gtf, options.bins)

    # clean up the temporary anchor GFF
    os.close(prep_anchor_fd)
    os.remove(prep_anchor_gff)

    ############################################
    # normalize
    ############################################
    # normalize coverages by total events; pseudocount of 1 avoids log(0)
    for anchor_id in coverage:
        for i in range(len(coverage[anchor_id])):
            coverage[anchor_id][i] = (1+coverage[anchor_id][i])/float(events)
            if options.control_files:
                coverage_control[anchor_id][i] = (1+coverage_control[anchor_id][i])/float(events_control)

    ############################################
    # sort anchors
    ############################################
    anchors_sorted = []
    if options.sorted_gene_files:
        # one sorted anchor list per provided gene-list file
        for sorted_gene_file in options.sorted_gene_files.split(','):
            anchors_sorted.append([])
            for line in open(sorted_gene_file):
                anchor_id = line.split()[0]
                # keep only anchors that survived preprocessing/sampling
                if anchor_id in coverage:
                    anchors_sorted[-1].append(anchor_id)

    else:
        # pair each anchor_id with a summary coverage statistic
        stat_aid = []
        for anchor_id in coverage:
            if options.control_files:
                # mean log2 ratio of primary over control coverage
                astat = stats.mean([math.log(coverage[anchor_id][i],2) - math.log(coverage_control[anchor_id][i],2) for i in range(len(coverage[anchor_id]))])
            else:
                astat = stats.geo_mean([coverage[anchor_id][i] for i in range(len(coverage[anchor_id]))])

            stat_aid.append((astat, anchor_id))

        # sort descending by the statistic
        stat_aid.sort(reverse=True)

        # store as the only sorted list
        anchors_sorted.append([anchor_id for (astat, anchor_id) in stat_aid])

    ############################################
    # plot heatmap(s)
    ############################################
    if options.plot_heat:
        # if multiple sorts, create a dir for the plots
        if len(anchors_sorted) > 1:
            if not os.path.isdir('%s_heat' % options.output_pre):
                os.mkdir('%s_heat' % options.output_pre)

        for s in range(len(anchors_sorted)):
            df = {'Index':[], 'Anchor':[], 'Coverage':[]}
            for si in range(len(anchors_sorted[s])):
                anchor_id = anchors_sorted[s][si]

                for i in range(len(coverage[anchor_id])):
                    # 'mid' indexes positions relative to the anchor midpoint
                    if mode == 'mid':
                        df['Index'].append(i - options.window/2)
                    else:
                        df['Index'].append(i)
                    df['Anchor'].append(anchor_id)

                    if options.log:
                        cov = math.log(coverage[anchor_id][i], 2)
                    else:
                        cov = coverage[anchor_id][i]

                    # control normalization: subtract in log space, divide otherwise
                    if options.control_files:
                        if options.log:
                            cov -= math.log(coverage_control[anchor_id][i], 2)
                        else:
                            cov = cov / coverage_control[anchor_id][i]

                    df['Coverage'].append('%.4e' % cov)

            r_script = '%s/plot_gff_cov_heat.r' % os.environ['RDIR']
            if len(anchors_sorted) == 1:
                out_pdf = '%s_heat.pdf' % options.output_pre
            else:
                # name each heatmap after its sorted gene list file
                sorted_gene_file = options.sorted_gene_files.split(',')[s]
                sorted_gene_pre = os.path.splitext(os.path.split(sorted_gene_file)[-1])[0]
                out_pdf = '%s_heat/%s.pdf' % (options.output_pre,sorted_gene_pre)

            ggplot.plot(r_script, df, [out_pdf, options.control_files!=None])

    ############################################
    # plot meta-coverage
    ############################################
    df = {'Index':[], 'Coverage':[]}
    if options.control_files:
        df['Type'] = []

    # profile length: window positions for 'mid', bin count for 'span'
    if mode == 'mid':
        index_length = 2*(options.window/2) + 1
    elif mode == 'span':
        index_length = options.bins
    else:
        print >> sys.stderr, 'Unknown mode %s' % mode
        exit(1)

    for i in range(index_length):
        if mode == 'mid':
            df['Index'].append(i - options.window/2)
        else:
            df['Index'].append(i)

        # geometric mean approximates averaging in log space
        if options.log:
            df['Coverage'].append(stats.geo_mean([coverage[anchor_id][i] for anchor_id in coverage]))
        else:
            df['Coverage'].append(stats.mean([coverage[anchor_id][i] for anchor_id in coverage]))

        if options.control_files:
            df['Type'].append('Primary')

            # interleave a matching control row at the same index
            if mode == 'mid':
                df['Index'].append(i - options.window/2)
            else:
                df['Index'].append(i)

            df['Type'].append('Control')
            if options.log:
                df['Coverage'].append(stats.geo_mean([coverage_control[anchor_id][i] for anchor_id in coverage_control]))
            else:
                df['Coverage'].append(stats.mean([coverage_control[anchor_id][i] for anchor_id in coverage_control]))

    r_script = '%s/plot_gff_cov_meta.r' % os.environ['RDIR']
    ggplot.plot(r_script, df, [options.output_pre])
Exemplo n.º 5
0
def main():
    """Plot coverage heatmaps and a meta-profile of BAM reads around GFF features.

    A reformatted variant of the earlier bam_heat main() with one extra
    guard: features whose extended window would start at or before position
    0 are dropped. Samples up to -m features, extends each to -u bp around
    its midpoint, computes (control-normalized) coverage via
    compute_coverage, and renders heatmap(s) plus a meta-coverage plot
    using R scripts in $RDIR.

    NOTE(review): despite the modern formatting, this uses Python 2
    'print >>' syntax and relies on integer '/' division — confirm the
    intended interpreter before porting.
    """
    usage = 'usage: %prog [options] <gff> <bam1,bam2,...>'
    parser = OptionParser(usage)
    parser.add_option('-c',
                      dest='control_bam_files',
                      default=None,
                      help='Control BAM files (comma separated)')
    parser.add_option('-l',
                      dest='log',
                      default=False,
                      action='store_true',
                      help='log2 coverage [Default: %default]')
    parser.add_option('-k',
                      dest='gtf_key',
                      default=None,
                      help='GTF key to hash gff entries by')
    parser.add_option(
        '-m',
        dest='max_features',
        default=2000,
        type='int',
        help='Maximum number of features to plot [Default: %default]')
    parser.add_option('-o',
                      dest='output_pre',
                      default='bam',
                      help='Output prefix [Default: %default]')
    parser.add_option(
        '-s',
        dest='sorted_gene_files',
        help='Files of sorted gene lists. Plot heatmaps in their order')
    parser.add_option('-u',
                      dest='range',
                      default=2000,
                      type='int',
                      help='Range around peak middle [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide gtf file and BAM file')
    else:
        gff_file = args[0]
        bam_files = args[1].split(',')

    if options.control_bam_files:
        control_bam_files = options.control_bam_files.split(',')

    ############################################
    # extend GFF entries to range (and sample)
    ############################################
    # count the features to set the sampling probability
    feature_count = 0
    for line in open(gff_file):
        feature_count += 1

    # keep each feature with probability so that ~max_features survive
    sample_prob = min(1.0, options.max_features / float(feature_count))

    gff_range_fd, gff_range_file = tempfile.mkstemp()
    gff_range_out = open(gff_range_file, 'w')

    for line in open(gff_file):
        a = line.split('\t')

        # recenter the entry to range bp around its midpoint
        start = int(a[3])
        end = int(a[4])
        mid = start + (end - start) / 2

        range_start = mid - options.range / 2
        range_end = mid + options.range / 2

        # drop features whose window would run off the chromosome start
        if range_start > 0:
            a[3] = str(mid - options.range / 2)
            a[4] = str(mid + options.range / 2)
            a[-1] = a[-1].rstrip()

            if random.random() < sample_prob:
                print >> gff_range_out, '\t'.join(a)

    gff_range_out.close()

    ############################################
    # compute coverage
    ############################################
    coverage, fragments = compute_coverage(gff_range_file, bam_files,
                                           options.gtf_key)
    if options.control_bam_files:
        coverage_control, fragments_control = compute_coverage(
            gff_range_file, control_bam_files, options.gtf_key)

    # clean up the temporary extended GFF
    os.close(gff_range_fd)
    os.remove(gff_range_file)

    ############################################
    # normalize
    ############################################
    # normalize coverages (and add pseudocounts)
    for feature_id in coverage:
        for i in range(len(coverage[feature_id])):
            coverage[feature_id][i] = (1 + coverage[feature_id][i]) / fragments
            if options.control_bam_files:
                coverage_control[feature_id][i] = (
                    1 + coverage_control[feature_id][i]) / fragments_control

    ############################################
    # sorted genes
    ############################################
    features_sorted = []
    if options.sorted_gene_files:
        # for each sorted list
        for sorted_gene_file in options.sorted_gene_files.split(','):
            # collect feature_id's
            features_sorted.append([])
            for line in open(sorted_gene_file):
                feature_id = line.split()[0]
                # verify randomly selected
                if feature_id in coverage:
                    features_sorted[-1].append(feature_id)

    else:
        # tuple feature_id's with mean coverage
        feature_id_stat = []
        for feature_id in coverage:
            if options.control_bam_files:
                # mean log2 ratio of primary over control coverage
                feature_stat = stats.mean([
                    math.log(coverage[feature_id][i], 2) -
                    math.log(coverage_control[feature_id][i], 2)
                    for i in range(len(coverage[feature_id]))
                ])
            else:
                feature_stat = stats.geo_mean([
                    coverage[feature_id][i]
                    for i in range(len(coverage[feature_id]))
                ])

            feature_id_stat.append((feature_stat, feature_id))

        # sort descending by the summary statistic
        feature_id_stat.sort(reverse=True)

        # store as the only sorted list
        features_sorted.append(
            [feature_id for (feature_stat, feature_id) in feature_id_stat])

    ############################################
    # plot heatmap(s)
    ############################################
    # if multiple sorts, create a dir for the plots
    if len(features_sorted) > 1:
        if not os.path.isdir('%s_heat' % options.output_pre):
            os.mkdir('%s_heat' % options.output_pre)

    for s in range(len(features_sorted)):
        df = {'Index': [], 'Feature': [], 'Coverage': []}
        for f in range(len(features_sorted[s])):
            feature_id = features_sorted[s][f]
            # Index runs from -range/2 to +range/2 relative to the midpoint
            for i in range(-options.range / 2, options.range / 2 + 1):
                df['Index'].append(i)
                df['Feature'].append(f)

                if options.log:
                    cov = math.log(coverage[feature_id][i + options.range / 2],
                                   2)
                else:
                    cov = coverage[feature_id][i + options.range / 2]

                # control normalization: subtract in log space, divide otherwise
                if options.control_bam_files:
                    if options.log:
                        cov -= math.log(
                            coverage_control[feature_id][i +
                                                         options.range / 2], 2)
                    else:
                        cov = cov / coverage_control[feature_id][
                            i + options.range / 2]

                df['Coverage'].append('%.4e' % cov)

        r_script = '%s/bam_heat_heat.r' % os.environ['RDIR']
        if len(features_sorted) == 1:
            out_pdf = '%s_heat.pdf' % options.output_pre
        else:
            # name each heatmap after its sorted gene list file
            sorted_gene_file = options.sorted_gene_files.split(',')[s]
            sorted_gene_pre = os.path.splitext(
                os.path.split(sorted_gene_file)[-1])[0]
            out_pdf = '%s_heat/%s.pdf' % (options.output_pre, sorted_gene_pre)

        ggplot.plot(r_script, df, [out_pdf, options.control_bam_files != None])

    ############################################
    # plot meta-coverage
    ############################################
    df = {'Index': [], 'Coverage': []}
    if options.control_bam_files:
        df['Type'] = []

    for i in range(-options.range / 2, options.range / 2 + 1):
        df['Index'].append(i)

        # geometric mean approximates averaging in log space
        if options.log:
            df['Coverage'].append(
                stats.geo_mean([
                    coverage[feature_id][i + options.range / 2]
                    for feature_id in coverage
                ]))
        else:
            df['Coverage'].append(
                stats.mean([
                    coverage[feature_id][i + options.range / 2]
                    for feature_id in coverage
                ]))

        if options.control_bam_files:
            df['Type'].append('Primary')

            # interleave a matching control row at the same index
            df['Index'].append(i)
            df['Type'].append('Control')
            if options.log:
                df['Coverage'].append(
                    stats.geo_mean([
                        coverage_control[feature_id][i + options.range / 2]
                        for feature_id in coverage_control
                    ]))
            else:
                df['Coverage'].append(
                    stats.mean([
                        coverage_control[feature_id][i + options.range / 2]
                        for feature_id in coverage_control
                    ]))

    r_script = '%s/bam_heat_meta.r' % os.environ['RDIR']
    out_pdf = '%s_meta.pdf' % options.output_pre

    ggplot.plot(r_script, df, [out_pdf])
Exemplo n.º 6
0
def main():
    """Plot event coverage around anchor GFF features.

    Computes coverage of BAM/GFF event files over anchors from an anchor
    GFF, either centered on each anchor's midpoint (mode 'mid') or binned
    across the anchor span (mode 'span').  Coverage is pseudocounted and
    normalized by the total event count, optionally compared against
    control files, rendered as per-anchor heatmaps (-e) and always as a
    meta-coverage profile, via R scripts located in $RDIR.

    NOTE: Python 2 code — relies on print statements and on '/' performing
    integer floor division for ints (e.g. options.window / 2).
    """
    usage = 'usage: %prog [options] <mode=mid/span> <anchor_gff> <event_bam1,event_bam2,...|event_gff1,event_gff2,...>'
    parser = OptionParser(usage)
    parser.add_option(
        '-a',
        dest='max_anchors',
        default=1000,
        type='int',
        help='Maximum number of anchors to consider [Default: %default]')
    parser.add_option('-c',
                      dest='control_files',
                      default=None,
                      help='Control BAM or GFF files (comma separated)')
    parser.add_option('-e',
                      dest='plot_heat',
                      default=False,
                      action='store_true',
                      help='Plot as a heatmap [Default: %default]')
    parser.add_option('-l',
                      dest='log',
                      default=False,
                      action='store_true',
                      help='log2 coverage [Default: %default]')
    parser.add_option('--labels',
                      dest='labels',
                      default='Primary,Control',
                      help='Plot labels [Default:%default]')
    parser.add_option('-o',
                      dest='output_pre',
                      default='gff_cov',
                      help='Output prefix [Default: %default]')
    parser.add_option(
        '-s',
        dest='sorted_gene_files',
        help='Files of sorted gene lists. Plot heatmaps in their order')

    parser.add_option('-p',
                      dest='smooth_span',
                      default=0.2,
                      type='float',
                      help='Smoothing span parameter [Default: %default]')

    parser.add_option(
        '-b',
        dest='bins',
        default=100,
        type='int',
        help='Number of bins across the gene span [Default: %default]')
    parser.add_option('-m',
                      dest='min_length',
                      default=None,
                      type='int',
                      help='Minimum anchor length [Default: %default]')

    parser.add_option('-w',
                      dest='window',
                      default=2000,
                      type='int',
                      help='Window around peak middle [Default: %default]')

    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error('Must provide mode, anchor GFF, and BAM/GFF file(s)')
    else:
        mode = args[0]
        anchor_gff = args[1]
        event_files = args[2].split(',')

    if options.control_files:
        control_files = options.control_files.split(',')

    # NOTE(review): plot_labels[1] is indexed unconditionally at the bottom
    # of this function, so a single-label --labels value would raise
    # IndexError there even without control files.
    plot_labels = options.labels.split(',')

    # flag GTF input by extension; passed through to the preprocessing and
    # coverage helpers, which presumably treat GTF gene models specially
    # (confirm in preprocess_anchors/compute_coverage).
    anchor_is_gtf = (anchor_gff[-4:] == '.gtf')

    # preprocess anchor GFF
    prep_anchor_fd, prep_anchor_gff = preprocess_anchors(
        anchor_gff, mode, options.max_anchors, anchor_is_gtf,
        options.min_length, options.window)

    ############################################
    # compute coverage
    ############################################
    # 'events' appears to be the total event count used below for
    # normalization — inferred from the division; confirm in
    # compute_coverage.
    coverage, events = compute_coverage(prep_anchor_gff, event_files, mode,
                                        anchor_is_gtf, options.bins)
    if options.control_files:
        coverage_control, events_control = compute_coverage(
            prep_anchor_gff, control_files, mode, anchor_is_gtf, options.bins)

    # clean
    os.close(prep_anchor_fd)
    os.remove(prep_anchor_gff)

    ############################################
    # normalize
    ############################################
    # normalize coverages (and add pseudocounts)
    # add a pseudocount of 1 to every position, then scale by the total
    # number of events so primary and control tracks are comparable.
    for anchor_id in coverage:
        for i in range(len(coverage[anchor_id])):
            coverage[anchor_id][i] = (1 +
                                      coverage[anchor_id][i]) / float(events)
            if options.control_files:
                coverage_control[anchor_id][i] = (
                    1 + coverage_control[anchor_id][i]) / float(events_control)

    ############################################
    # sort anchors
    ############################################
    # anchors_sorted is a list of orderings; one per -s file, or a single
    # ordering by descending coverage statistic when -s is absent.
    anchors_sorted = []
    if options.sorted_gene_files:
        # for each sorted list
        for sorted_gene_file in options.sorted_gene_files.split(','):
            # collect anchor_id's
            anchors_sorted.append([])
            for line in open(sorted_gene_file):
                anchor_id = line.split()[0]
                # verify randomly selected
                if anchor_id in coverage:
                    anchors_sorted[-1].append(anchor_id)

    else:
        # tuple anchor_id's with mean coverage
        stat_aid = []
        for anchor_id in coverage:
            if options.control_files:
                # mean per-position log2 ratio of primary over control
                astat = stats.mean([
                    math.log(coverage[anchor_id][i], 2) -
                    math.log(coverage_control[anchor_id][i], 2)
                    for i in range(len(coverage[anchor_id]))
                ])
            else:
                # geometric mean of normalized coverage across positions
                astat = stats.geo_mean([
                    coverage[anchor_id][i]
                    for i in range(len(coverage[anchor_id]))
                ])

            stat_aid.append((astat, anchor_id))

        # sort
        stat_aid.sort(reverse=True)

        # store as the only sorted list
        anchors_sorted.append([anchor_id for (astat, anchor_id) in stat_aid])

    ############################################
    # plot heatmap(s)
    ############################################
    if options.plot_heat:
        # if multiple sorts, create a dir for the plots
        if len(anchors_sorted) > 1:
            if not os.path.isdir('%s_heat' % options.output_pre):
                os.mkdir('%s_heat' % options.output_pre)

        for s in range(len(anchors_sorted)):
            df = {'Index': [], 'Anchor': [], 'Coverage': []}
            for si in range(len(anchors_sorted[s])):
                anchor_id = anchors_sorted[s][si]

                for i in range(len(coverage[anchor_id])):
                    # mid mode centers indexes on the anchor midpoint;
                    # span mode uses raw bin indexes.
                    if mode == 'mid':
                        df['Index'].append(i - options.window / 2)
                    else:
                        df['Index'].append(i)
                    df['Anchor'].append(anchor_id)

                    if options.log:
                        cov = math.log(coverage[anchor_id][i], 2)
                    else:
                        cov = coverage[anchor_id][i]

                    # with a control: subtract in log space, divide in
                    # linear space (both express primary relative to control)
                    if options.control_files:
                        if options.log:
                            cov -= math.log(coverage_control[anchor_id][i], 2)
                        else:
                            cov = cov / coverage_control[anchor_id][i]

                    df['Coverage'].append('%.4e' % cov)

            if len(anchors_sorted) == 1:
                out_pdf = '%s_heat.pdf' % options.output_pre
            else:
                # name each heatmap after its sorted-gene-list file stem
                sorted_gene_file = options.sorted_gene_files.split(',')[s]
                sorted_gene_pre = os.path.splitext(
                    os.path.split(sorted_gene_file)[-1])[0]
                out_pdf = '%s_heat/%s.pdf' % (options.output_pre,
                                              sorted_gene_pre)

            r_script = '%s/plot_gff_cov_heat.r' % os.environ['RDIR']
            # second list element tells the R script whether a control exists
            ggplot.plot(r_script, df, [out_pdf, options.control_files != None])

    ############################################
    # plot meta-coverage
    ############################################
    df = {'Index': [], 'Coverage': []}
    if options.control_files:
        df['Type'] = []

    # NOTE(review): mode validation only happens here, after the expensive
    # coverage computation above; an invalid mode errors late.
    if mode == 'mid':
        # 2*(window/2)+1 positions, centered on the midpoint (py2 int div)
        index_length = 2 * (options.window / 2) + 1
    elif mode == 'span':
        index_length = options.bins
    else:
        print >> sys.stderr, 'Unknown mode %s' % mode
        exit(1)

    for i in range(index_length):
        if mode == 'mid':
            df['Index'].append(i - options.window / 2)
        else:
            df['Index'].append(i)

        # meta-coverage: average across all anchors at each position;
        # geometric mean in log mode, arithmetic mean otherwise
        if options.log:
            df['Coverage'].append(
                stats.geo_mean(
                    [coverage[anchor_id][i] for anchor_id in coverage]))
        else:
            df['Coverage'].append(
                stats.mean([coverage[anchor_id][i] for anchor_id in coverage]))

        if options.control_files:
            df['Type'].append('Primary')

            # append a parallel row for the control track at this index
            if mode == 'mid':
                df['Index'].append(i - options.window / 2)
            else:
                df['Index'].append(i)

            df['Type'].append('Control')
            if options.log:
                df['Coverage'].append(
                    stats.geo_mean([
                        coverage_control[anchor_id][i]
                        for anchor_id in coverage_control
                    ]))
            else:
                df['Coverage'].append(
                    stats.mean([
                        coverage_control[anchor_id][i]
                        for anchor_id in coverage_control
                    ]))

    r_script = '%s/plot_gff_cov_meta.r' % os.environ['RDIR']
    out_df = '%s_meta.df' % options.output_pre
    ggplot.plot(r_script,
                df, [
                    options.output_pre, options.smooth_span, plot_labels[0],
                    plot_labels[1]
                ],
                df_file=out_df)
Exemplo n.º 7
0
def main():
    """Print a promoter GFF derived from a reference GTF.

    For each gene, picks the transcript that defines the TSS: the most
    expressed isoform when a cufflinks FPKM tracking file is given (-f),
    otherwise the transcript with the most upstream TSS.  Prints one
    'promoter' GFF line per gene spanning -u bp upstream to -d bp
    downstream of that TSS (strand-aware) to stdout.  Genes lacking a
    '+'/'-' strand or whose promoter would start before position 1 are
    skipped with a warning on stderr.

    NOTE: Python 2 code (print statements).
    """
    usage = 'usage: %prog [options] <ref_gtf>'
    parser = OptionParser(usage)
    #parser.add_option()
    parser.add_option('-d',
                      dest='downstream',
                      type='int',
                      default=1000,
                      help='Downstream bp for promoters [Default: %default]')
    parser.add_option(
        '-f',
        dest='fpkm_tracking',
        help='Use cufflinks FPKM estimates to choose the most expressed isoform'
    )
    parser.add_option('-u',
                      dest='upstream',
                      type='int',
                      default=1000,
                      help='Upstream bp for promoters [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide reference GTF')
    else:
        ref_gtf = args[0]

    # gene_id -> transcript_ids mapping and transcript models from the GTF
    g2t = gff.g2t(ref_gtf)
    transcripts = gff.read_genes(ref_gtf)
    # GFF column 2 (source) taken from the first line of the file.
    # NOTE(review): assumes a non-empty, whitespace-splittable first line;
    # the file handle is never closed.
    source = open(ref_gtf).readline().split()[1]

    if options.fpkm_tracking:
        iso_fpkm_tracking = cufflinks.fpkm_tracking(options.fpkm_tracking)

    for gene_id in g2t:
        gene_transcripts = list(g2t[gene_id])
        # strand is read from the first transcript and assumed to hold for
        # the whole gene
        gene_strand = transcripts[gene_transcripts[0]].strand
        if gene_strand not in ['+', '-']:
            print >> sys.stderr, 'WARNING: %s discluded for lack of strand' % gene_id
            continue

        # choose TSS
        if options.fpkm_tracking:
            # find most expressed isoform
            # expression score: geometric mean of 1+FPKM (pseudocounted)
            # across whatever conditions gene_expr returns — presumably
            # per-condition FPKMs; confirm in cufflinks.fpkm_tracking.
            promoter_tid = gene_transcripts[0]
            max_fpkm = stats.geo_mean([
                1 + fpkm for fpkm in iso_fpkm_tracking.gene_expr(promoter_tid)
            ])
            for transcript_id in gene_transcripts[1:]:
                transcript_fpkm = stats.geo_mean([
                    1 + fpkm
                    for fpkm in iso_fpkm_tracking.gene_expr(transcript_id)
                ])
                # a NaN current max is always replaced by a real value
                if math.isnan(max_fpkm) or transcript_fpkm > max_fpkm:
                    promoter_tid = transcript_id
                    max_fpkm = transcript_fpkm

            # get isoform tss
            # '+' strand: TSS is the start of the first exon;
            # '-' strand: TSS is the end of the last exon
            if gene_strand == '+':
                tss = transcripts[promoter_tid].exons[0].start
            else:
                tss = transcripts[promoter_tid].exons[-1].end

        else:
            # find most upstream tss
            promoter_tid = gene_transcripts[0]
            if gene_strand == '+':
                upstream_tss = transcripts[promoter_tid].exons[0].start
            else:
                upstream_tss = transcripts[promoter_tid].exons[-1].end

            for transcript_id in gene_transcripts[1:]:
                if gene_strand == '+':
                    # upstream on '+' means the smallest coordinate
                    transcript_pos = transcripts[transcript_id].exons[0].start
                    if transcript_pos < upstream_tss:
                        promoter_tid = transcript_id
                        upstream_tss = transcript_pos
                else:
                    # upstream on '-' means the largest coordinate
                    transcript_pos = transcripts[transcript_id].exons[-1].end
                    if transcript_pos > upstream_tss:
                        promoter_tid = transcript_id
                        upstream_tss = transcript_pos

            tss = upstream_tss

        # print promoter from the tss
        if gene_strand == '+':
            if tss - options.upstream < 1:
                print >> sys.stderr, 'WARNING: %s discluded for nearness to chromosome end' % gene_id
            else:
                tx = transcripts[promoter_tid]
                cols = [
                    tx.chrom, source, 'promoter',
                    str(tss - options.upstream),
                    str(tss + options.downstream), '.', tx.strand, '.',
                    gff.kv_gtf(tx.kv)
                ]
                print '\t'.join(cols)

        else:
            # on '-' strand upstream is to the right, so the interval is
            # [tss - downstream, tss + upstream]
            if tss - options.downstream < 1:
                print >> sys.stderr, 'WARNING: %s discluded for nearness to chromosome end' % gene_id
            else:
                tx = transcripts[promoter_tid]
                cols = [
                    tx.chrom, source, 'promoter',
                    str(tss - options.downstream),
                    str(tss + options.upstream), '.', tx.strand, '.',
                    gff.kv_gtf(tx.kv)
                ]
                print '\t'.join(cols)
Exemplo n.º 8
0
def main():
    """Test transposable-element (TE) families for fragment enrichment.

    Counts BAM fragments overlapping repeats from a RepeatMasker-style GFF
    and compares each (repeat, family) fragment rate to a null rate — from
    control BAMs when given (-c), otherwise uniform coverage of the
    mappable search space — printing a table of TE length, observed and
    null counts, fold change, and a binomial p-value.

    NOTE: Python 2 code (print statements).
    NOTE(review): options.mapq is parsed but never referenced in this
    function; verify whether a helper was meant to receive it.
    """
    usage = 'usage: %prog [options] <bam_file,bam_file2,...>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='control_bam_files', help='Control BAM file to paramterize null distribution [Default: %default]')
    parser.add_option('-f', dest='filter_gff', help='Filter the TEs by overlap with genes in the given gff file [Default: %default]')
    parser.add_option('-g', dest='genome', default='HG19', help='Genome directory to obtain lengths from [Default: %default]')
    parser.add_option('-m', dest='mapq', default=False, action='store_true', help='Consider only reads with mapq>0 [Default: %default]')
    parser.add_option('-r', dest='repeats_gff', default='%s/hg19.fa.out.tp.gff' % os.environ['MASK'])
    parser.add_option('-s', dest='strand_split', default=False, action='store_true', help='Split statistics by strand [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide BAM files.')
    else:
        bam_files = args[0].split(',')

    control_bam_files = []
    if options.control_bam_files:
        control_bam_files = options.control_bam_files.split(',')

    ############################################
    # GFF filter
    ############################################
    # filter TEs and read alignments by gff file
    if options.filter_gff:
        # merge the filter GFF into a sorted, non-overlapping BED
        filter_merged_bed_fd, filter_merged_bed_file = tempfile.mkstemp()
        subprocess.call('sortBed -i %s | mergeBed -i - > %s' % (options.filter_gff, filter_merged_bed_file), shell=True)

        # filter TE GFF
        # temp files land in a site-specific scratch directory under $HOME
        te_gff_fd, te_gff_file = tempfile.mkstemp(dir='%s/research/scratch/temp' % os.environ['HOME'])
        subprocess.call('intersectBed -a %s -b %s > %s' % (options.repeats_gff, filter_merged_bed_file, te_gff_file), shell=True)
        options.repeats_gff = te_gff_file

        # filter BAM
        bam_gff_fds = [None]*len(bam_files)
        bam_gff_files = [None]*len(bam_files)
        for i in range(len(bam_files)):
            bam_gff_fds[i], bam_gff_files[i] = tempfile.mkstemp(dir='%s/research/scratch/temp' % os.environ['HOME'])
            bedtools.abam_f1(bam_files[i], filter_merged_bed_file, bam_gff_files[i])
            bam_files[i] = bam_gff_files[i]

        # filter control BAM
        if control_bam_files:
            cbam_gff_fds = [None]*len(control_bam_files)
            cbam_gff_files = [None]*len(control_bam_files)
            for i in range(len(control_bam_files)):
                cbam_gff_fds[i], cbam_gff_files[i] = tempfile.mkstemp(dir='%s/research/scratch/temp' % os.environ['HOME'])
                bedtools.abam_f1(control_bam_files[i], filter_merged_bed_file, cbam_gff_files[i])
                control_bam_files[i] = cbam_gff_files[i]

    ############################################
    # lengths
    ############################################
    # estimate read length (just averaging across replicates for now)
    read_lens = []
    for bam_file in bam_files:
        read_lens.append(estimate_read_length(bam_file))
    read_len = stats.mean(read_lens)

    # compute size of search space
    if options.filter_gff:
        genome_length = count_bed(filter_merged_bed_file, read_len)
    else:
        genome_length = count_genome(options.genome)

    # hash counted repeat genomic bp
    # maps (repeat, family) -> target size in bp
    if options.filter_gff:
        te_lengths = te_target_size_bed(options.repeats_gff, filter_merged_bed_file, read_len)
    else:
        te_lengths = te_target_size(options.repeats_gff, read_len)

    ############################################
    # count TE fragments
    ############################################
    # per replicate: total fragment count and per-TE fragment counts
    fragments = []
    te_fragments = []
    for bam_file in bam_files:
        rep_fragments, rep_te_fragments = count_te_fragments(bam_file, options.repeats_gff, options.strand_split)
        fragments.append(rep_fragments)
        te_fragments.append(rep_te_fragments)

    if control_bam_files:
        control_fragments = []
        control_te_fragments = []
        for control_bam_file in control_bam_files:
            rep_fragments, rep_te_fragments = count_te_fragments(control_bam_file, options.repeats_gff, options.strand_split)
            control_fragments.append(rep_fragments)
            control_te_fragments.append(rep_te_fragments)

    ############################################
    # combine replicates into fragment rates
    ############################################
    # rate = fragments in TE / total fragments, geometric-mean'd across
    # replicates; .get(..., 1) pseudocounts unseen TEs with one fragment
    te_fragment_rates = {}
    for (rep,fam) in te_lengths:
        if options.strand_split:
            # positive
            rate_list = [te_fragments[i].get((rep+'+',fam),1)/float(fragments[i]) for i in range(len(bam_files))]
            te_fragment_rates[(rep+'+',fam)] = stats.geo_mean(rate_list)
            # negative
            rate_list = [te_fragments[i].get((rep+'-',fam),1)/float(fragments[i]) for i in range(len(bam_files))]
            te_fragment_rates[(rep+'-',fam)] = stats.geo_mean(rate_list)
        else:
            rate_list = [te_fragments[i].get((rep,fam),1)/float(fragments[i]) for i in range(len(bam_files))]
            te_fragment_rates[(rep,fam)] = stats.geo_mean(rate_list)

    if control_bam_files:
        control_te_fragment_rates = {}
        for te in te_fragment_rates:
            rate_list = [control_te_fragments[i].get(te,1)/float(control_fragments[i]) for i in range(len(control_bam_files))]
            control_te_fragment_rates[te] = stats.geo_mean(rate_list)

    ############################################
    # compute stats, print table
    ############################################
    for (rep,fam) in te_fragment_rates:
        # compute TE length
        # when strand-split, keys here carry a '+'/'-' suffix on rep,
        # while te_lengths keys do not — hence rep[:-1]
        if options.strand_split:
            te_len = te_lengths[(rep[:-1],fam)]
        else:
            te_len = te_lengths[(rep,fam)]

        # parameterize null model
        if options.control_bam_files:
            null_rate = control_te_fragment_rates[(rep,fam)]
        else:
            # uniform null: TE bp / search space (halved per strand)
            if options.strand_split:
                null_rate = float(te_lengths[(rep[:-1],fam)]) / (2*genome_length)
            else:
                null_rate = float(te_lengths[(rep,fam)]) / genome_length

        # compute fragment counts
        count = te_fragment_rates[(rep,fam)]*sum(fragments)
        null_count = null_rate*sum(fragments)

        # compute fold change
        if null_rate > 0:
            fold = te_fragment_rates[(rep,fam)]/null_rate
        else:
            fold = 0

        # compute p-value of enrichment/depletion
        # sf(k-1) = P[X >= k] for enrichment; cdf(k) = P[X <= k] for
        # depletion. NOTE(review): per-replicate p-values are multiplied
        # together, which is not itself a calibrated p-value.
        p_val = 1.0
        for i in range(len(bam_files)):
            if te_fragment_rates[(rep,fam)] > null_rate:
                p_val *= binom.sf(int(te_fragments[i].get((rep,fam),1))-1, int(fragments[i]), null_rate)
            else:
                p_val *= binom.cdf(int(te_fragments[i].get((rep,fam),1)), int(fragments[i]), null_rate)

        cols = (rep, fam, te_len, count, null_count, fold, p_val)
        print '%-18s %-18s %10d %10.1f %10.1f %10.3f %10.2e' % cols

    ############################################
    # clean
    ############################################
    # close and remove every temp file created by the GFF-filter step
    if options.filter_gff:
        os.close(filter_merged_bed_fd)
        os.remove(filter_merged_bed_file)
        os.close(te_gff_fd)
        os.remove(te_gff_file)

        for i in range(len(bam_files)):
            os.close(bam_gff_fds[i])
            os.remove(bam_gff_files[i])

        if options.control_bam_files:
            for i in range(len(control_bam_files)):
                os.close(cbam_gff_fds[i])
                os.remove(cbam_gff_files[i])
Exemplo n.º 9
0
def main():
    """Test transposable-element (TE) families for fragment enrichment.

    Near-duplicate of the sibling TE-enrichment main above, differing in
    option letters (-g is the filter GFF here) and in using the hg19-only
    count_hg19() for the unfiltered search space instead of a configurable
    genome directory.  Counts BAM fragments overlapping repeats from a
    RepeatMasker-style GFF, compares each (repeat, family) fragment rate
    to a null rate (control BAMs via -c, else uniform coverage), and
    prints TE length, counts, fold change, and a binomial p-value.

    NOTE: Python 2 code (print statements).
    NOTE(review): options.mapq is parsed but never referenced in this
    function; verify whether a helper was meant to receive it.
    """
    usage = 'usage: %prog [options] <bam_file,bam_file2,...>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='control_bam_files', help='Control BAM file to paramterize null distribution [Default: %default]')
    parser.add_option('-g', dest='filter_gff', help='Filter the TEs by overlap with genes in the given gff file [Default: %default]')
    parser.add_option('-m', dest='mapq', default=False, action='store_true', help='Consider only reads with mapq>0 [Default: %default]')
    parser.add_option('-r', dest='repeats_gff', default='%s/hg19.fa.out.tp.gff' % os.environ['MASK'])
    parser.add_option('-s', dest='strand_split', default=False, action='store_true', help='Split statistics by strand [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide a BAM file.')
    else:
        bam_files = args[0].split(',')

    control_bam_files = []
    if options.control_bam_files:
        control_bam_files = options.control_bam_files.split(',')

    ############################################
    # GFF filter
    ############################################
    # filter TEs and read alignments by gff file
    if options.filter_gff:
        # merge the filter GFF into a sorted, non-overlapping BED
        filter_merged_bed_fd, filter_merged_bed_file = tempfile.mkstemp()
        subprocess.call('sortBed -i %s | mergeBed -i - > %s' % (options.filter_gff, filter_merged_bed_file), shell=True)

        # filter TE GFF
        # temp files land in a site-specific scratch directory under $HOME
        te_gff_fd, te_gff_file = tempfile.mkstemp(dir='%s/research/scratch/temp' % os.environ['HOME'])
        subprocess.call('intersectBed -a %s -b %s > %s' % (options.repeats_gff, filter_merged_bed_file, te_gff_file), shell=True)
        options.repeats_gff = te_gff_file

        # filter BAM
        bam_gff_fds = [None]*len(bam_files)
        bam_gff_files = [None]*len(bam_files)
        for i in range(len(bam_files)):
            bam_gff_fds[i], bam_gff_files[i] = tempfile.mkstemp(dir='%s/research/scratch/temp' % os.environ['HOME'])
            bedtools.abam_f1(bam_files[i], filter_merged_bed_file, bam_gff_files[i])
            bam_files[i] = bam_gff_files[i]

        # filter control BAM
        if control_bam_files:
            cbam_gff_fds = [None]*len(control_bam_files)
            cbam_gff_files = [None]*len(control_bam_files)
            for i in range(len(control_bam_files)):
                cbam_gff_fds[i], cbam_gff_files[i] = tempfile.mkstemp(dir='%s/research/scratch/temp' % os.environ['HOME'])
                bedtools.abam_f1(control_bam_files[i], filter_merged_bed_file, cbam_gff_files[i])
                control_bam_files[i] = cbam_gff_files[i]

    ############################################
    # lengths
    ############################################
    # estimate read length (just averaging across replicates for now)
    read_lens = []
    for bam_file in bam_files:
        read_lens.append(estimate_read_length(bam_file))
    read_len = stats.mean(read_lens)

    # compute size of search space
    if options.filter_gff:
        genome_length = count_bed(filter_merged_bed_file, read_len)
    else:
        # hg19-specific; the sibling version uses count_genome(options.genome)
        genome_length = count_hg19()

    # hash counted repeat genomic bp
    # maps (repeat, family) -> target size in bp
    if options.filter_gff:
        te_lengths = te_target_size_bed(options.repeats_gff, filter_merged_bed_file, read_len)
    else:
        te_lengths = te_target_size(options.repeats_gff, read_len)

    ############################################
    # count TE fragments
    ############################################
    # per replicate: total fragment count and per-TE fragment counts
    fragments = []
    te_fragments = []
    for bam_file in bam_files:
        rep_fragments, rep_te_fragments = count_te_fragments(bam_file, options.repeats_gff, options.strand_split)
        fragments.append(rep_fragments)
        te_fragments.append(rep_te_fragments)

    if control_bam_files:        
        control_fragments = []
        control_te_fragments = []
        for control_bam_file in control_bam_files:
            rep_fragments, rep_te_fragments = count_te_fragments(control_bam_file, options.repeats_gff, options.strand_split)
            control_fragments.append(rep_fragments)
            control_te_fragments.append(rep_te_fragments)

    ############################################
    # combine replicates into fragment rates
    ############################################
    # rate = fragments in TE / total fragments, geometric-mean'd across
    # replicates; .get(..., 1) pseudocounts unseen TEs with one fragment
    te_fragment_rates = {}
    for (rep,fam) in te_lengths:
        if options.strand_split:
            # positive
            rate_list = [te_fragments[i].get((rep+'+',fam),1)/float(fragments[i]) for i in range(len(bam_files))]
            te_fragment_rates[(rep+'+',fam)] = stats.geo_mean(rate_list)
            # negative
            rate_list = [te_fragments[i].get((rep+'-',fam),1)/float(fragments[i]) for i in range(len(bam_files))]
            te_fragment_rates[(rep+'-',fam)] = stats.geo_mean(rate_list)
        else:
            rate_list = [te_fragments[i].get((rep,fam),1)/float(fragments[i]) for i in range(len(bam_files))]
            te_fragment_rates[(rep,fam)] = stats.geo_mean(rate_list)

    if control_bam_files:
        control_te_fragment_rates = {}
        for te in te_fragment_rates:
            rate_list = [control_te_fragments[i].get(te,1)/float(control_fragments[i]) for i in range(len(control_bam_files))]
            control_te_fragment_rates[te] = stats.geo_mean(rate_list)

    ############################################
    # compute stats, print table
    ############################################
    for (rep,fam) in te_fragment_rates:
        # compute TE length
        # when strand-split, keys here carry a '+'/'-' suffix on rep,
        # while te_lengths keys do not — hence rep[:-1]
        if options.strand_split:
            te_len = te_lengths[(rep[:-1],fam)]
        else:
            te_len = te_lengths[(rep,fam)]

        # parameterize null model
        if options.control_bam_files:
            null_rate = control_te_fragment_rates[(rep,fam)]
        else:
            # uniform null: TE bp / search space (halved per strand)
            if options.strand_split:
                null_rate = float(te_lengths[(rep[:-1],fam)]) / (2*genome_length)
            else:
                null_rate = float(te_lengths[(rep,fam)]) / genome_length

        # compute fragment counts
        count = te_fragment_rates[(rep,fam)]*sum(fragments)
        null_count = null_rate*sum(fragments)

        # compute fold change
        if null_rate > 0:
            fold = te_fragment_rates[(rep,fam)]/null_rate
        else:
            fold = 0

        # compute p-value of enrichment/depletion
        # sf(k-1) = P[X >= k] for enrichment; cdf(k) = P[X <= k] for
        # depletion. NOTE(review): per-replicate p-values are multiplied
        # together, which is not itself a calibrated p-value.
        p_val = 1.0
        for i in range(len(bam_files)):
            if te_fragment_rates[(rep,fam)] > null_rate:            
                p_val *= binom.sf(int(te_fragments[i].get((rep,fam),1))-1, int(fragments[i]), null_rate)
            else:
                p_val *= binom.cdf(int(te_fragments[i].get((rep,fam),1)), int(fragments[i]), null_rate)

        cols = (rep, fam, te_len, count, null_count, fold, p_val)
        print '%-18s %-18s %10d %10.1f %10.1f %10.3f %10.2e' % cols

    ############################################
    # clean
    ############################################
    # close and remove every temp file created by the GFF-filter step
    if options.filter_gff:
        os.close(filter_merged_bed_fd)
        os.remove(filter_merged_bed_file)
        os.close(te_gff_fd)
        os.remove(te_gff_file)

        for i in range(len(bam_files)):
            os.close(bam_gff_fds[i])
            os.remove(bam_gff_files[i])

        if options.control_bam_files:
            for i in range(len(control_bam_files)):
                os.close(cbam_gff_fds[i])
                os.remove(cbam_gff_files[i])