def main():
    """Plot meta-coverage of a BAM file around the midpoints of GFF peaks.

    usage: %prog [options] <gff> <bam>

    Pipeline: filter the BAM to mapq > 0, count fragments (down-weighting
    multi-mappers by their NH tag), extend every GFF peak to a fixed window
    around its midpoint, intersect reads with those windows via bedtools'
    intersectBed, average the per-peak coverage profiles, and hand the
    result to make_output() for plotting.
    """
    usage = 'usage: %prog [options] <gff> <bam>'
    parser = OptionParser(usage)
    parser.add_option('-g', dest='geo_mean', default=False, action='store_true', help='Compute geometric mean of individual peak coverages [Default: %default]')
    parser.add_option('-i', dest='individual_plots', default=False, action='store_true', help='Print a coverage plot for every individual peak [Default: %default]')
    parser.add_option('-o', dest='out_prefix', default='peak_cov', help='Output prefix [Default: %default]')
    parser.add_option('-p', dest='properly_paired', default=False, action='store_true', help='Count entire fragments for only properly paired reads [Default: %default]')
    parser.add_option('-u', dest='range', default=500, type='int', help='Range around peak middle [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide gtf file and BAM file')
    else:
        peaks_gff = args[0]
        bam_file = args[1]

    # filter BAM for mapping quality
    # NOTE(review): scratch dir is hard-coded under $HOME/research/scratch -
    # confirm it exists on this host before running
    bam_mapq_fd, bam_mapq_file = tempfile.mkstemp(dir='%s/research/scratch' % os.environ['HOME'])
    bam_in = pysam.Samfile(bam_file, 'rb')
    bam_mapq_out = pysam.Samfile(bam_mapq_file, 'wb', template=bam_in)
    for aligned_read in bam_in:
        if aligned_read.mapq > 0:
            bam_mapq_out.write(aligned_read)
    bam_mapq_out.close()
    # fix: the input BAM handle was never closed
    bam_in.close()

    # count fragments and hash multi-mappers
    num_fragments = 0
    multi_maps = {}
    for aligned_read in pysam.Samfile(bam_mapq_file, 'rb'):
        if options.properly_paired:
            # fix: pysam's attribute is is_proper_pair; the original
            # is_properly_paired raised AttributeError whenever -p was used
            if aligned_read.is_proper_pair:
                # each mate contributes half a fragment
                num_fragments += 0.5/aligned_read.opt('NH')
        else:
            if aligned_read.is_paired:
                num_fragments += 0.5/aligned_read.opt('NH')
            else:
                num_fragments += 1.0/aligned_read.opt('NH')

        # remember multi-mapped read names so coverage can be down-weighted
        if aligned_read.opt('NH') > 1:
            multi_maps[aligned_read.qname] = aligned_read.opt('NH')

    # extend GFF entries to range
    # (// keeps integer division semantics under both Python 2 and 3;
    # this file mixes both print styles, so stay version-agnostic)
    peaks_gff_range_fd, peaks_gff_range_file = tempfile.mkstemp()
    peaks_gff_range_out = open(peaks_gff_range_file, 'w')
    for line in open(peaks_gff):
        a = line.split('\t')
        pstart = int(a[3])
        pend = int(a[4])
        peak_mid = pstart + (pend-pstart)//2
        a[3] = str(peak_mid - options.range//2 - 1)
        a[4] = str(peak_mid + options.range//2 + 1)
        # the last field keeps its newline, so write without appending one
        # (replaces Py2-only "print >> f, s," which breaks under Python 3)
        peaks_gff_range_out.write('\t'.join(a))
    peaks_gff_range_out.close()

    # initialize coverage counters
    peak_cov_individual = {}
    peak_reads = {}

    # count reads
    # NOTE(review): bedtools' intersectBed must be on PATH; inputs are
    # locally-created temp files, so shell=True is not an injection risk here
    p = subprocess.Popen('intersectBed -split -wo -bed -abam %s -b %s' % (bam_mapq_file,peaks_gff_range_file), shell=True, stdout=subprocess.PIPE)
    for line in p.stdout:
        a = line.split('\t')
        rstart = int(a[1])
        rend = int(a[2])
        rheader = a[3]

        # because intersectBed screws up indels near endpoints
        if rstart < rend:
            pstart = int(a[15])
            pend = int(a[16])
            peak_id = gff.gtf_kv(a[20])['id']

            peak_reads[peak_id] = peak_reads.get(peak_id,0) + 1

            peak_mid = pstart + (pend-pstart)//2
            peak_range_start = peak_mid - options.range//2
            peak_range_end = peak_mid + options.range//2

            # clip the read interval to the peak window
            range_start = max(rstart, peak_range_start)
            range_end = min(rend, peak_range_end)

            if not peak_id in peak_cov_individual:
                peak_cov_individual[peak_id] = [0.0]*(1+options.range)
            for i in range(range_start - peak_range_start, range_end - peak_range_start + 1):
                # down-weight multi-mapped reads by their NH count
                peak_cov_individual[peak_id][i] += 1.0/multi_maps.get(rheader,1)
    p.communicate()

    # combine individual peak profiles into one meta-profile
    peak_cov = [0.0]*(1+options.range)
    for i in range(len(peak_cov)):
        if options.geo_mean:
            # +1 pseudocount keeps the geometric mean defined at zero coverage
            peak_cov[i] = stats.geo_mean([1+peak_cov_individual[peak_id][i] for peak_id in peak_cov_individual])
        else:
            peak_cov[i] = stats.mean([peak_cov_individual[peak_id][i] for peak_id in peak_cov_individual])

    # output
    make_output(peak_cov, options.out_prefix, options.range)

    if options.individual_plots:
        # recreate the per-peak plot directory from scratch
        individual_dir = '%s_individuals' % options.out_prefix
        if os.path.isdir(individual_dir):
            shutil.rmtree(individual_dir)
        os.mkdir(individual_dir)

        for peak_id in peak_cov_individual:
            # only plot peaks with substantial read support
            if peak_reads[peak_id] > 150:
                make_output(peak_cov_individual[peak_id], '%s/%s' % (individual_dir,peak_id), options.range)

    # clean
    os.close(bam_mapq_fd)
    os.remove(bam_mapq_file)
    os.close(peaks_gff_range_fd)
    os.remove(peaks_gff_range_file)
def main():
    """Emit a promoter GTF derived from a reference GTF.

    usage: %prog [options] <ref_gtf>

    For each gene, picks one representative transcript - the most expressed
    isoform when a cufflinks FPKM tracking file is given (-f), otherwise the
    isoform with the most upstream TSS - and prints a 'promoter' GTF line
    spanning [tss - upstream, tss + downstream] (strand-aware) to stdout.
    Genes without strand or too near a chromosome start are skipped with a
    warning on stderr.
    """
    usage = 'usage: %prog [options] <ref_gtf>'
    parser = OptionParser(usage)
    #parser.add_option()
    parser.add_option('-d', dest='downstream', type='int', default=1000, help='Downstream bp for promoters [Default: %default]')
    parser.add_option('-f', dest='fpkm_tracking', help='Use cufflinks FPKM estimates to choose the most expressed isoform')
    parser.add_option('-u', dest='upstream', type='int', default=1000, help='Upstream bp for promoters [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide reference GTF')
    else:
        ref_gtf = args[0]

    # gene_id -> transcript ids, and transcript id -> gene objects
    g2t = gff.g2t(ref_gtf)
    transcripts = gff.read_genes(ref_gtf)
    # NOTE(review): takes column 2 of the first GTF line as the source tag;
    # assumes a non-empty, whitespace-clean GTF - file handle is left to GC
    source = open(ref_gtf).readline().split()[1]

    if options.fpkm_tracking:
        iso_fpkm_tracking = cufflinks.fpkm_tracking(options.fpkm_tracking)

    for gene_id in g2t:
        gene_transcripts = list(g2t[gene_id])
        # strand is taken from the first isoform; presumably uniform per gene
        gene_strand = transcripts[gene_transcripts[0]].strand
        if gene_strand not in ['+','-']:
            print('WARNING: %s discluded for lack of strand' % gene_id, file=sys.stderr)
            continue

        # choose TSS
        if options.fpkm_tracking:
            # find most expressed isoform
            promoter_tid = gene_transcripts[0]
            # geometric mean over replicate FPKMs; +1 pseudocount
            max_fpkm = stats.geo_mean([1+fpkm for fpkm in iso_fpkm_tracking.gene_expr(promoter_tid)])
            for transcript_id in gene_transcripts[1:]:
                transcript_fpkm = stats.geo_mean([1+fpkm for fpkm in iso_fpkm_tracking.gene_expr(transcript_id)])
                # NaN on the incumbent lets any valid challenger replace it
                if math.isnan(max_fpkm) or transcript_fpkm > max_fpkm:
                    promoter_tid = transcript_id
                    max_fpkm = transcript_fpkm

            # get isoform tss (5' end w.r.t. strand)
            if gene_strand == '+':
                tss = transcripts[promoter_tid].exons[0].start
            else:
                tss = transcripts[promoter_tid].exons[-1].end
        else:
            # find most upstream tss across all isoforms
            promoter_tid = gene_transcripts[0]
            if gene_strand == '+':
                upstream_tss = transcripts[promoter_tid].exons[0].start
            else:
                upstream_tss = transcripts[promoter_tid].exons[-1].end
            for transcript_id in gene_transcripts[1:]:
                if gene_strand == '+':
                    # upstream on '+' means smaller coordinate
                    transcript_pos = transcripts[transcript_id].exons[0].start
                    if transcript_pos < upstream_tss:
                        promoter_tid = transcript_id
                        upstream_tss = transcript_pos
                else:
                    # upstream on '-' means larger coordinate
                    transcript_pos = transcripts[transcript_id].exons[-1].end
                    if transcript_pos > upstream_tss:
                        promoter_tid = transcript_id
                        upstream_tss = transcript_pos
            tss = upstream_tss

        # print promoter from the tss
        if gene_strand == '+':
            # guard against running off the chromosome start
            if tss - options.upstream < 1:
                print('WARNING: %s discluded for nearness to chromosome end' % gene_id, file=sys.stderr)
            else:
                tx = transcripts[promoter_tid]
                cols = [tx.chrom, source, 'promoter', str(tss-options.upstream), str(tss+options.downstream), '.', tx.strand, '.', gff.kv_gtf(tx.kv)]
                print('\t'.join(cols))
        else:
            # on '-' strand, upstream/downstream swap coordinate directions
            if tss - options.downstream < 1:
                print('WARNING: %s discluded for nearness to chromosome end' % gene_id, file=sys.stderr)
            else:
                tx = transcripts[promoter_tid]
                cols = [tx.chrom, source, 'promoter', str(tss-options.downstream), str(tss+options.upstream), '.', tx.strand, '.', gff.kv_gtf(tx.kv)]
                print('\t'.join(cols))
def main():
    """Plot BAM coverage heatmaps and meta-coverage around GFF feature midpoints.

    usage: %prog [options] <gff> <bam1,bam2,...>

    Extends each GFF feature to a fixed window around its midpoint (sampling
    down to at most -m features), computes read coverage per feature via
    compute_coverage(), normalizes (with pseudocounts and optional control
    BAMs), then renders per-feature heatmaps and a meta-coverage profile
    through R scripts under $RDIR.
    """
    usage = 'usage: %prog [options] <gff> <bam1,bam2,...>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='control_bam_files', default=None, help='Control BAM files (comma separated)')
    parser.add_option('-l', dest='log', default=False, action='store_true', help='log2 coverage [Default: %default]')
    parser.add_option('-k', dest='gtf_key', default=None, help='GTF key to hash gff entries by')
    parser.add_option('-m', dest='max_features', default=2000, type='int', help='Maximum number of features to plot [Default: %default]')
    parser.add_option('-o', dest='output_pre', default='bam', help='Output prefix [Default: %default]')
    parser.add_option('-s', dest='sorted_gene_files', help='Files of sorted gene lists. Plot heatmaps in their order')
    parser.add_option('-u', dest='range', default=2000, type='int', help='Range around peak middle [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide gtf file and BAM file')
    else:
        gff_file = args[0]
        bam_files = args[1].split(',')

    if options.control_bam_files:
        control_bam_files = options.control_bam_files.split(',')

    ############################################
    # extend GFF entries to range (and sample)
    ############################################
    # first pass: count features to set the sampling probability
    feature_count = 0
    for line in open(gff_file):
        feature_count += 1

    sample_prob = min(1.0, options.max_features / float(feature_count))

    gff_range_fd, gff_range_file = tempfile.mkstemp()
    gff_range_out = open(gff_range_file, 'w')

    for line in open(gff_file):
        a = line.split('\t')
        start = int(a[3])
        end = int(a[4])
        # // keeps integer division under both Python 2 and 3
        mid = start + (end-start)//2
        range_start = mid - options.range//2
        range_end = mid + options.range//2
        # fix: skip windows that would start at a non-positive coordinate
        # (matches the guard in the formatted version of this script; the
        # original emitted invalid negative GFF starts near chromosome ends)
        if range_start > 0:
            a[3] = str(range_start)
            a[4] = str(range_end)
            a[-1] = a[-1].rstrip()
            if random.random() < sample_prob:
                # replaces Py2-only "print >>" so the block is version-agnostic
                gff_range_out.write('\t'.join(a) + '\n')
    gff_range_out.close()

    ############################################
    # compute coverage
    ############################################
    coverage, fragments = compute_coverage(gff_range_file, bam_files, options.gtf_key)
    if options.control_bam_files:
        coverage_control, fragments_control = compute_coverage(gff_range_file, control_bam_files, options.gtf_key)

    # clean
    os.close(gff_range_fd)
    os.remove(gff_range_file)

    ############################################
    # normalize
    ############################################
    # normalize coverages (and add pseudocounts)
    for feature_id in coverage:
        for i in range(len(coverage[feature_id])):
            coverage[feature_id][i] = (1+coverage[feature_id][i])/fragments
            if options.control_bam_files:
                coverage_control[feature_id][i] = (1+coverage_control[feature_id][i])/fragments_control

    ############################################
    # sorted genes
    ############################################
    features_sorted = []
    if options.sorted_gene_files:
        # for each sorted list
        for sorted_gene_file in options.sorted_gene_files.split(','):
            # collect feature_id's
            features_sorted.append([])
            for line in open(sorted_gene_file):
                feature_id = line.split()[0]
                # keep only features that survived random sampling
                if feature_id in coverage:
                    features_sorted[-1].append(feature_id)
    else:
        # tuple feature_id's with mean coverage
        feature_id_stat = []
        for feature_id in coverage:
            if options.control_bam_files:
                # mean log2 ratio vs control
                feature_stat = stats.mean([math.log(coverage[feature_id][i],2) - math.log(coverage_control[feature_id][i],2) for i in range(len(coverage[feature_id]))])
            else:
                feature_stat = stats.geo_mean([coverage[feature_id][i] for i in range(len(coverage[feature_id]))])
            feature_id_stat.append((feature_stat,feature_id))

        # sort descending by coverage statistic
        feature_id_stat.sort(reverse=True)

        # store as the only sorted list
        features_sorted.append([feature_id for (feature_stat, feature_id) in feature_id_stat])

    ############################################
    # plot heatmap(s)
    ############################################
    # if multiple sorts, create a dir for the plots
    if len(features_sorted) > 1:
        if not os.path.isdir('%s_heat' % options.output_pre):
            os.mkdir('%s_heat' % options.output_pre)

    for s in range(len(features_sorted)):
        df = {'Index':[], 'Feature':[], 'Coverage':[]}
        for f in range(len(features_sorted[s])):
            feature_id = features_sorted[s][f]
            for i in range(-options.range//2, options.range//2+1):
                df['Index'].append(i)
                df['Feature'].append(f)
                if options.log:
                    cov = math.log(coverage[feature_id][i+options.range//2],2)
                else:
                    cov = coverage[feature_id][i+options.range//2]
                if options.control_bam_files:
                    # express primary coverage relative to control
                    if options.log:
                        cov -= math.log(coverage_control[feature_id][i+options.range//2],2)
                    else:
                        cov = cov / coverage_control[feature_id][i+options.range//2]
                df['Coverage'].append('%.4e' % cov)

        r_script = '%s/bam_heat_heat.r' % os.environ['RDIR']
        if len(features_sorted) == 1:
            out_pdf = '%s_heat.pdf' % options.output_pre
        else:
            # name each heatmap after its sorted gene list
            sorted_gene_file = options.sorted_gene_files.split(',')[s]
            sorted_gene_pre = os.path.splitext(os.path.split(sorted_gene_file)[-1])[0]
            out_pdf = '%s_heat/%s.pdf' % (options.output_pre,sorted_gene_pre)
        ggplot.plot(r_script, df, [out_pdf, options.control_bam_files!=None], df_file='df_heat.txt')

    ############################################
    # plot meta-coverage
    ############################################
    df = {'Index':[], 'Coverage':[]}
    if options.control_bam_files:
        df['Type'] = []

    for i in range(-options.range//2, options.range//2+1):
        df['Index'].append(i)
        if options.log:
            df['Coverage'].append(stats.geo_mean([coverage[feature_id][i+options.range//2] for feature_id in coverage]))
        else:
            df['Coverage'].append(stats.mean([coverage[feature_id][i+options.range//2] for feature_id in coverage]))
        if options.control_bam_files:
            # interleave a matching Control row for each Primary row
            df['Type'].append('Primary')
            df['Index'].append(i)
            df['Type'].append('Control')
            if options.log:
                df['Coverage'].append(stats.geo_mean([coverage_control[feature_id][i+options.range//2] for feature_id in coverage_control]))
            else:
                df['Coverage'].append(stats.mean([coverage_control[feature_id][i+options.range//2] for feature_id in coverage_control]))

    r_script = '%s/bam_heat_meta.r' % os.environ['RDIR']
    out_pdf = '%s_meta.pdf' % options.output_pre
    ggplot.plot(r_script, df, [out_pdf], df_file='df_meta.txt')
def main():
    """Plot event coverage over anchor GFF features, at midpoints or across spans.

    usage: %prog [options] <mode=mid/span> <anchor_gff> <event_bam1,...|event_gff1,...>

    Preprocesses the anchor GFF (sampling to -a anchors, windowing in 'mid'
    mode), computes event coverage per anchor via compute_coverage(),
    normalizes (with pseudocounts and optional controls), then optionally
    renders per-anchor heatmaps (-e) and always a meta-coverage profile
    through R scripts under $RDIR.
    """
    usage = 'usage: %prog [options] <mode=mid/span> <anchor_gff> <event_bam1,event_bam2,...|event_gff1,event_gff2,...>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='max_anchors', default=1000, type='int', help='Maximum number of anchors to consider [Default: %default]')
    parser.add_option('-c', dest='control_files', default=None, help='Control BAM or GFF files (comma separated)')
    # fix: without action='store_true', optparse demanded a value for -e and
    # stored a string; the formatted version of this script has the action
    parser.add_option('-e', dest='plot_heat', default=False, action='store_true', help='Plot as a heatmap [Default: %default]')
    parser.add_option('-l', dest='log', default=False, action='store_true', help='log2 coverage [Default: %default]')
    parser.add_option('-o', dest='output_pre', default='gff_cov', help='Output prefix [Default: %default]')
    parser.add_option('-s', dest='sorted_gene_files', help='Files of sorted gene lists. Plot heatmaps in their order')
    parser.add_option('-b', dest='bins', default=100, type='int', help='Number of bins across the gene span [Default: %default]')
    parser.add_option('-m', dest='min_length', default=None, type='int', help='Minimum anchor length [Default: %default]')
    parser.add_option('-w', dest='window', default=2000, type='int', help='Window around peak middle [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 3:
        parser.error('Must provide mode, anchor GFF, and BAM/GFF file(s)')
    else:
        mode = args[0]
        anchor_gff = args[1]
        event_files = args[2].split(',')

    if options.control_files:
        control_files = options.control_files.split(',')

    # anchors in GTF format are hashed per-gene by preprocess_anchors
    anchor_is_gtf = (anchor_gff[-4:] == '.gtf')

    # preprocess anchor GFF
    prep_anchor_fd, prep_anchor_gff = preprocess_anchors(anchor_gff, mode, options.max_anchors, anchor_is_gtf, options.min_length, options.window)

    ############################################
    # compute coverage
    ############################################
    coverage, events = compute_coverage(prep_anchor_gff, event_files, mode, anchor_is_gtf, options.bins)
    if options.control_files:
        coverage_control, events_control = compute_coverage(prep_anchor_gff, control_files, mode, anchor_is_gtf, options.bins)

    # clean
    os.close(prep_anchor_fd)
    os.remove(prep_anchor_gff)

    ############################################
    # normalize
    ############################################
    # normalize coverages (and add pseudocounts)
    for anchor_id in coverage:
        for i in range(len(coverage[anchor_id])):
            coverage[anchor_id][i] = (1+coverage[anchor_id][i])/float(events)
            if options.control_files:
                coverage_control[anchor_id][i] = (1+coverage_control[anchor_id][i])/float(events_control)

    ############################################
    # sort anchors
    ############################################
    anchors_sorted = []
    if options.sorted_gene_files:
        # for each sorted list
        for sorted_gene_file in options.sorted_gene_files.split(','):
            # collect anchor_id's
            anchors_sorted.append([])
            for line in open(sorted_gene_file):
                anchor_id = line.split()[0]
                # keep only anchors that survived preprocessing/sampling
                if anchor_id in coverage:
                    anchors_sorted[-1].append(anchor_id)
    else:
        # tuple anchor_id's with mean coverage
        stat_aid = []
        for anchor_id in coverage:
            if options.control_files:
                # mean log2 ratio vs control
                astat = stats.mean([math.log(coverage[anchor_id][i],2) - math.log(coverage_control[anchor_id][i],2) for i in range(len(coverage[anchor_id]))])
            else:
                astat = stats.geo_mean([coverage[anchor_id][i] for i in range(len(coverage[anchor_id]))])
            stat_aid.append((astat, anchor_id))

        # sort descending by coverage statistic
        stat_aid.sort(reverse=True)

        # store as the only sorted list
        anchors_sorted.append([anchor_id for (astat, anchor_id) in stat_aid])

    ############################################
    # plot heatmap(s)
    ############################################
    if options.plot_heat:
        # if multiple sorts, create a dir for the plots
        if len(anchors_sorted) > 1:
            if not os.path.isdir('%s_heat' % options.output_pre):
                os.mkdir('%s_heat' % options.output_pre)

        for s in range(len(anchors_sorted)):
            df = {'Index':[], 'Anchor':[], 'Coverage':[]}
            for si in range(len(anchors_sorted[s])):
                anchor_id = anchors_sorted[s][si]
                for i in range(len(coverage[anchor_id])):
                    if mode == 'mid':
                        # center the index on the anchor midpoint
                        # (// keeps integer division under Python 2 and 3)
                        df['Index'].append(i - options.window//2)
                    else:
                        df['Index'].append(i)
                    df['Anchor'].append(anchor_id)
                    if options.log:
                        cov = math.log(coverage[anchor_id][i], 2)
                    else:
                        cov = coverage[anchor_id][i]
                    if options.control_files:
                        # express primary coverage relative to control
                        if options.log:
                            cov -= math.log(coverage_control[anchor_id][i], 2)
                        else:
                            cov = cov / coverage_control[anchor_id][i]
                    df['Coverage'].append('%.4e' % cov)

            r_script = '%s/plot_gff_cov_heat.r' % os.environ['RDIR']
            if len(anchors_sorted) == 1:
                out_pdf = '%s_heat.pdf' % options.output_pre
            else:
                # name each heatmap after its sorted gene list
                sorted_gene_file = options.sorted_gene_files.split(',')[s]
                sorted_gene_pre = os.path.splitext(os.path.split(sorted_gene_file)[-1])[0]
                out_pdf = '%s_heat/%s.pdf' % (options.output_pre,sorted_gene_pre)
            ggplot.plot(r_script, df, [out_pdf, options.control_files!=None])

    ############################################
    # plot meta-coverage
    ############################################
    df = {'Index':[], 'Coverage':[]}
    if options.control_files:
        df['Type'] = []

    if mode == 'mid':
        index_length = 2*(options.window//2) + 1
    elif mode == 'span':
        index_length = options.bins
    else:
        # replaces Py2-only "print >> sys.stderr" so the block is version-agnostic
        sys.stderr.write('Unknown mode %s\n' % mode)
        exit(1)

    for i in range(index_length):
        if mode == 'mid':
            df['Index'].append(i - options.window//2)
        else:
            df['Index'].append(i)
        if options.log:
            df['Coverage'].append(stats.geo_mean([coverage[anchor_id][i] for anchor_id in coverage]))
        else:
            df['Coverage'].append(stats.mean([coverage[anchor_id][i] for anchor_id in coverage]))
        if options.control_files:
            # interleave a matching Control row for each Primary row
            df['Type'].append('Primary')
            if mode == 'mid':
                df['Index'].append(i - options.window//2)
            else:
                df['Index'].append(i)
            df['Type'].append('Control')
            if options.log:
                df['Coverage'].append(stats.geo_mean([coverage_control[anchor_id][i] for anchor_id in coverage_control]))
            else:
                df['Coverage'].append(stats.mean([coverage_control[anchor_id][i] for anchor_id in coverage_control]))

    r_script = '%s/plot_gff_cov_meta.r' % os.environ['RDIR']
    ggplot.plot(r_script, df, [options.output_pre])
def main():
    """Plot BAM coverage heatmaps and meta-coverage around GFF feature midpoints.

    usage: %prog [options] <gff> <bam1,bam2,...>

    Formatted variant of the bam_heat script: extends each GFF feature to a
    fixed window around its midpoint (sampling down to at most -m features),
    computes read coverage via compute_coverage(), normalizes with
    pseudocounts (and optional control BAMs), then renders per-feature
    heatmaps and a meta-coverage profile through R scripts under $RDIR.

    NOTE(review): this file defines several functions named main(); later
    definitions shadow earlier ones at import time - presumably a
    concatenation of separate scripts.
    """
    usage = 'usage: %prog [options] <gff> <bam1,bam2,...>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='control_bam_files', default=None, help='Control BAM files (comma separated)')
    parser.add_option('-l', dest='log', default=False, action='store_true', help='log2 coverage [Default: %default]')
    parser.add_option('-k', dest='gtf_key', default=None, help='GTF key to hash gff entries by')
    parser.add_option( '-m', dest='max_features', default=2000, type='int', help='Maximum number of features to plot [Default: %default]')
    parser.add_option('-o', dest='output_pre', default='bam', help='Output prefix [Default: %default]')
    parser.add_option( '-s', dest='sorted_gene_files', help='Files of sorted gene lists. Plot heatmaps in their order')
    parser.add_option('-u', dest='range', default=2000, type='int', help='Range around peak middle [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide gtf file and BAM file')
    else:
        gff_file = args[0]
        bam_files = args[1].split(',')

    if options.control_bam_files:
        control_bam_files = options.control_bam_files.split(',')

    ############################################
    # extend GFF entries to range (and sample)
    ############################################
    # first pass: count features to set the sampling probability
    feature_count = 0
    for line in open(gff_file):
        feature_count += 1

    sample_prob = min(1.0, options.max_features / float(feature_count))

    gff_range_fd, gff_range_file = tempfile.mkstemp()
    gff_range_out = open(gff_range_file, 'w')

    for line in open(gff_file):
        a = line.split('\t')
        start = int(a[3])
        end = int(a[4])
        mid = start + (end - start) / 2
        range_start = mid - options.range / 2
        range_end = mid + options.range / 2  # NOTE(review): computed but unused
        # only rewrite coordinates when the window stays positive
        if range_start > 0:
            a[3] = str(mid - options.range / 2)
            a[4] = str(mid + options.range / 2)
            a[-1] = a[-1].rstrip()
            # down-sample to at most max_features entries
            if random.random() < sample_prob:
                # NOTE(review): Py2-only print statement; fails under Python 3
                print >> gff_range_out, '\t'.join(a)
    gff_range_out.close()

    ############################################
    # compute coverage
    ############################################
    coverage, fragments = compute_coverage(gff_range_file, bam_files, options.gtf_key)
    if options.control_bam_files:
        coverage_control, fragments_control = compute_coverage( gff_range_file, control_bam_files, options.gtf_key)

    # clean
    os.close(gff_range_fd)
    os.remove(gff_range_file)

    ############################################
    # normalize
    ############################################
    # normalize coverages (and add pseudocounts)
    for feature_id in coverage:
        for i in range(len(coverage[feature_id])):
            coverage[feature_id][i] = (1 + coverage[feature_id][i]) / fragments
            if options.control_bam_files:
                coverage_control[feature_id][i] = ( 1 + coverage_control[feature_id][i]) / fragments_control

    ############################################
    # sorted genes
    ############################################
    features_sorted = []
    if options.sorted_gene_files:
        # for each sorted list
        for sorted_gene_file in options.sorted_gene_files.split(','):
            # collect feature_id's
            features_sorted.append([])
            for line in open(sorted_gene_file):
                feature_id = line.split()[0]
                # keep only features that survived random sampling
                if feature_id in coverage:
                    features_sorted[-1].append(feature_id)
    else:
        # tuple feature_id's with mean coverage
        feature_id_stat = []
        for feature_id in coverage:
            if options.control_bam_files:
                # mean log2 ratio vs control
                feature_stat = stats.mean([ math.log(coverage[feature_id][i], 2) - math.log(coverage_control[feature_id][i], 2) for i in range(len(coverage[feature_id])) ])
            else:
                feature_stat = stats.geo_mean([ coverage[feature_id][i] for i in range(len(coverage[feature_id])) ])
            feature_id_stat.append((feature_stat, feature_id))

        # sort descending by coverage statistic
        feature_id_stat.sort(reverse=True)

        # store as the only sorted list
        features_sorted.append( [feature_id for (feature_stat, feature_id) in feature_id_stat])

    ############################################
    # plot heatmap(s)
    ############################################
    # if multiple sorts, create a dir for the plots
    if len(features_sorted) > 1:
        if not os.path.isdir('%s_heat' % options.output_pre):
            os.mkdir('%s_heat' % options.output_pre)

    for s in range(len(features_sorted)):
        df = {'Index': [], 'Feature': [], 'Coverage': []}
        for f in range(len(features_sorted[s])):
            feature_id = features_sorted[s][f]
            for i in range(-options.range / 2, options.range / 2 + 1):
                df['Index'].append(i)
                df['Feature'].append(f)
                if options.log:
                    cov = math.log(coverage[feature_id][i + options.range / 2], 2)
                else:
                    cov = coverage[feature_id][i + options.range / 2]
                if options.control_bam_files:
                    # express primary coverage relative to control
                    if options.log:
                        cov -= math.log( coverage_control[feature_id][i + options.range / 2], 2)
                    else:
                        cov = cov / coverage_control[feature_id][ i + options.range / 2]
                df['Coverage'].append('%.4e' % cov)

        r_script = '%s/bam_heat_heat.r' % os.environ['RDIR']
        if len(features_sorted) == 1:
            out_pdf = '%s_heat.pdf' % options.output_pre
        else:
            # name each heatmap after its sorted gene list
            sorted_gene_file = options.sorted_gene_files.split(',')[s]
            sorted_gene_pre = os.path.splitext( os.path.split(sorted_gene_file)[-1])[0]
            out_pdf = '%s_heat/%s.pdf' % (options.output_pre, sorted_gene_pre)
        ggplot.plot(r_script, df, [out_pdf, options.control_bam_files != None])

    ############################################
    # plot meta-coverage
    ############################################
    df = {'Index': [], 'Coverage': []}
    if options.control_bam_files:
        df['Type'] = []

    for i in range(-options.range / 2, options.range / 2 + 1):
        df['Index'].append(i)
        if options.log:
            df['Coverage'].append( stats.geo_mean([ coverage[feature_id][i + options.range / 2] for feature_id in coverage ]))
        else:
            df['Coverage'].append( stats.mean([ coverage[feature_id][i + options.range / 2] for feature_id in coverage ]))
        if options.control_bam_files:
            # interleave a matching Control row for each Primary row
            df['Type'].append('Primary')
            df['Index'].append(i)
            df['Type'].append('Control')
            if options.log:
                df['Coverage'].append( stats.geo_mean([ coverage_control[feature_id][i + options.range / 2] for feature_id in coverage_control ]))
            else:
                df['Coverage'].append( stats.mean([ coverage_control[feature_id][i + options.range / 2] for feature_id in coverage_control ]))

    r_script = '%s/bam_heat_meta.r' % os.environ['RDIR']
    out_pdf = '%s_meta.pdf' % options.output_pre
    ggplot.plot(r_script, df, [out_pdf])
def main():
    """Plot event coverage over anchor GFF features, at midpoints or across spans.

    usage: %prog [options] <mode=mid/span> <anchor_gff> <event_bam1,...|event_gff1,...>

    Formatted variant of the gff_cov script with smoothing (-p) and plot
    labels (--labels): preprocesses the anchor GFF, computes event coverage
    per anchor via compute_coverage(), normalizes with pseudocounts (and
    optional controls), optionally renders per-anchor heatmaps (-e), and
    always a meta-coverage profile through R scripts under $RDIR.
    """
    usage = 'usage: %prog [options] <mode=mid/span> <anchor_gff> <event_bam1,event_bam2,...|event_gff1,event_gff2,...>'
    parser = OptionParser(usage)
    parser.add_option( '-a', dest='max_anchors', default=1000, type='int', help='Maximum number of anchors to consider [Default: %default]')
    parser.add_option('-c', dest='control_files', default=None, help='Control BAM or GFF files (comma separated)')
    parser.add_option('-e', dest='plot_heat', default=False, action='store_true', help='Plot as a heatmap [Default: %default]')
    parser.add_option('-l', dest='log', default=False, action='store_true', help='log2 coverage [Default: %default]')
    parser.add_option('--labels', dest='labels', default='Primary,Control', help='Plot labels [Default:%default]')
    parser.add_option('-o', dest='output_pre', default='gff_cov', help='Output prefix [Default: %default]')
    parser.add_option( '-s', dest='sorted_gene_files', help='Files of sorted gene lists. Plot heatmaps in their order')
    parser.add_option('-p', dest='smooth_span', default=0.2, type='float', help='Smoothing span parameter [Default: %default]')
    parser.add_option( '-b', dest='bins', default=100, type='int', help='Number of bins across the gene span [Default: %default]')
    parser.add_option('-m', dest='min_length', default=None, type='int', help='Minimum anchor length [Default: %default]')
    parser.add_option('-w', dest='window', default=2000, type='int', help='Window around peak middle [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error('Must provide mode, anchor GFF, and BAM/GFF file(s)')
    else:
        mode = args[0]
        anchor_gff = args[1]
        event_files = args[2].split(',')

    if options.control_files:
        control_files = options.control_files.split(',')

    # NOTE(review): plot_labels is used unconditionally in the final plot
    # call, so it is placed at function scope here - confirm against the
    # original layout, which was ambiguous
    plot_labels = options.labels.split(',')

    # anchors in GTF format are hashed per-gene by preprocess_anchors
    anchor_is_gtf = (anchor_gff[-4:] == '.gtf')

    # preprocess anchor GFF
    prep_anchor_fd, prep_anchor_gff = preprocess_anchors( anchor_gff, mode, options.max_anchors, anchor_is_gtf, options.min_length, options.window)

    ############################################
    # compute coverage
    ############################################
    coverage, events = compute_coverage(prep_anchor_gff, event_files, mode, anchor_is_gtf, options.bins)
    if options.control_files:
        coverage_control, events_control = compute_coverage( prep_anchor_gff, control_files, mode, anchor_is_gtf, options.bins)

    # clean
    os.close(prep_anchor_fd)
    os.remove(prep_anchor_gff)

    ############################################
    # normalize
    ############################################
    # normalize coverages (and add pseudocounts)
    for anchor_id in coverage:
        for i in range(len(coverage[anchor_id])):
            coverage[anchor_id][i] = (1 + coverage[anchor_id][i]) / float(events)
            if options.control_files:
                coverage_control[anchor_id][i] = ( 1 + coverage_control[anchor_id][i]) / float(events_control)

    ############################################
    # sort anchors
    ############################################
    anchors_sorted = []
    if options.sorted_gene_files:
        # for each sorted list
        for sorted_gene_file in options.sorted_gene_files.split(','):
            # collect anchor_id's
            anchors_sorted.append([])
            for line in open(sorted_gene_file):
                anchor_id = line.split()[0]
                # keep only anchors that survived preprocessing/sampling
                if anchor_id in coverage:
                    anchors_sorted[-1].append(anchor_id)
    else:
        # tuple anchor_id's with mean coverage
        stat_aid = []
        for anchor_id in coverage:
            if options.control_files:
                # mean log2 ratio vs control
                astat = stats.mean([ math.log(coverage[anchor_id][i], 2) - math.log(coverage_control[anchor_id][i], 2) for i in range(len(coverage[anchor_id])) ])
            else:
                astat = stats.geo_mean([ coverage[anchor_id][i] for i in range(len(coverage[anchor_id])) ])
            stat_aid.append((astat, anchor_id))

        # sort descending by coverage statistic
        stat_aid.sort(reverse=True)

        # store as the only sorted list
        anchors_sorted.append([anchor_id for (astat, anchor_id) in stat_aid])

    ############################################
    # plot heatmap(s)
    ############################################
    if options.plot_heat:
        # if multiple sorts, create a dir for the plots
        if len(anchors_sorted) > 1:
            if not os.path.isdir('%s_heat' % options.output_pre):
                os.mkdir('%s_heat' % options.output_pre)

        for s in range(len(anchors_sorted)):
            df = {'Index': [], 'Anchor': [], 'Coverage': []}
            for si in range(len(anchors_sorted[s])):
                anchor_id = anchors_sorted[s][si]
                for i in range(len(coverage[anchor_id])):
                    if mode == 'mid':
                        # center the index on the anchor midpoint
                        df['Index'].append(i - options.window / 2)
                    else:
                        df['Index'].append(i)
                    df['Anchor'].append(anchor_id)
                    if options.log:
                        cov = math.log(coverage[anchor_id][i], 2)
                    else:
                        cov = coverage[anchor_id][i]
                    if options.control_files:
                        # express primary coverage relative to control
                        if options.log:
                            cov -= math.log(coverage_control[anchor_id][i], 2)
                        else:
                            cov = cov / coverage_control[anchor_id][i]
                    df['Coverage'].append('%.4e' % cov)

            if len(anchors_sorted) == 1:
                out_pdf = '%s_heat.pdf' % options.output_pre
            else:
                # name each heatmap after its sorted gene list
                sorted_gene_file = options.sorted_gene_files.split(',')[s]
                sorted_gene_pre = os.path.splitext( os.path.split(sorted_gene_file)[-1])[0]
                out_pdf = '%s_heat/%s.pdf' % (options.output_pre, sorted_gene_pre)

            r_script = '%s/plot_gff_cov_heat.r' % os.environ['RDIR']
            ggplot.plot(r_script, df, [out_pdf, options.control_files != None])

    ############################################
    # plot meta-coverage
    ############################################
    df = {'Index': [], 'Coverage': []}
    if options.control_files:
        df['Type'] = []

    if mode == 'mid':
        index_length = 2 * (options.window / 2) + 1
    elif mode == 'span':
        index_length = options.bins
    else:
        # NOTE(review): Py2-only print statement; fails under Python 3
        print >> sys.stderr, 'Unknown mode %s' % mode
        exit(1)

    for i in range(index_length):
        if mode == 'mid':
            df['Index'].append(i - options.window / 2)
        else:
            df['Index'].append(i)
        if options.log:
            df['Coverage'].append( stats.geo_mean( [coverage[anchor_id][i] for anchor_id in coverage]))
        else:
            df['Coverage'].append( stats.mean([coverage[anchor_id][i] for anchor_id in coverage]))
        if options.control_files:
            # interleave a matching Control row for each Primary row
            df['Type'].append('Primary')
            if mode == 'mid':
                df['Index'].append(i - options.window / 2)
            else:
                df['Index'].append(i)
            df['Type'].append('Control')
            if options.log:
                df['Coverage'].append( stats.geo_mean([ coverage_control[anchor_id][i] for anchor_id in coverage_control ]))
            else:
                df['Coverage'].append( stats.mean([ coverage_control[anchor_id][i] for anchor_id in coverage_control ]))

    r_script = '%s/plot_gff_cov_meta.r' % os.environ['RDIR']
    out_df = '%s_meta.df' % options.output_pre
    ggplot.plot(r_script, df, [ options.output_pre, options.smooth_span, plot_labels[0], plot_labels[1] ], df_file=out_df)
def main(): usage = 'usage: %prog [options] <ref_gtf>' parser = OptionParser(usage) #parser.add_option() parser.add_option('-d', dest='downstream', type='int', default=1000, help='Downstream bp for promoters [Default: %default]') parser.add_option( '-f', dest='fpkm_tracking', help='Use cufflinks FPKM estimates to choose the most expressed isoform' ) parser.add_option('-u', dest='upstream', type='int', default=1000, help='Upstream bp for promoters [Default: %default]') (options, args) = parser.parse_args() if len(args) != 1: parser.error('Must provide reference GTF') else: ref_gtf = args[0] g2t = gff.g2t(ref_gtf) transcripts = gff.read_genes(ref_gtf) source = open(ref_gtf).readline().split()[1] if options.fpkm_tracking: iso_fpkm_tracking = cufflinks.fpkm_tracking(options.fpkm_tracking) for gene_id in g2t: gene_transcripts = list(g2t[gene_id]) gene_strand = transcripts[gene_transcripts[0]].strand if gene_strand not in ['+', '-']: print >> sys.stderr, 'WARNING: %s discluded for lack of strand' % gene_id continue # choose TSS if options.fpkm_tracking: # find most expressed isoform promoter_tid = gene_transcripts[0] max_fpkm = stats.geo_mean([ 1 + fpkm for fpkm in iso_fpkm_tracking.gene_expr(promoter_tid) ]) for transcript_id in gene_transcripts[1:]: transcript_fpkm = stats.geo_mean([ 1 + fpkm for fpkm in iso_fpkm_tracking.gene_expr(transcript_id) ]) if math.isnan(max_fpkm) or transcript_fpkm > max_fpkm: promoter_tid = transcript_id max_fpkm = transcript_fpkm # get isoform tss if gene_strand == '+': tss = transcripts[promoter_tid].exons[0].start else: tss = transcripts[promoter_tid].exons[-1].end else: # find most upstream tss promoter_tid = gene_transcripts[0] if gene_strand == '+': upstream_tss = transcripts[promoter_tid].exons[0].start else: upstream_tss = transcripts[promoter_tid].exons[-1].end for transcript_id in gene_transcripts[1:]: if gene_strand == '+': transcript_pos = transcripts[transcript_id].exons[0].start if transcript_pos < upstream_tss: promoter_tid = 
transcript_id upstream_tss = transcript_pos else: transcript_pos = transcripts[transcript_id].exons[-1].end if transcript_pos > upstream_tss: promoter_tid = transcript_id upstream_tss = transcript_pos tss = upstream_tss # print promoter from the tss if gene_strand == '+': if tss - options.upstream < 1: print >> sys.stderr, 'WARNING: %s discluded for nearness to chromosome end' % gene_id else: tx = transcripts[promoter_tid] cols = [ tx.chrom, source, 'promoter', str(tss - options.upstream), str(tss + options.downstream), '.', tx.strand, '.', gff.kv_gtf(tx.kv) ] print '\t'.join(cols) else: if tss - options.downstream < 1: print >> sys.stderr, 'WARNING: %s discluded for nearness to chromosome end' % gene_id else: tx = transcripts[promoter_tid] cols = [ tx.chrom, source, 'promoter', str(tss - options.downstream), str(tss + options.upstream), '.', tx.strand, '.', gff.kv_gtf(tx.kv) ] print '\t'.join(cols)
def main():
    """Measure transposable element (TE) enrichment in one or more BAM files.

    Counts fragments overlapping each (repeat, family) in a RepeatMasker GFF,
    combines replicate rates by geometric mean, parameterizes a null model
    (control BAMs if given, otherwise the TE's share of the searchable
    genome), and prints a table: repeat, family, TE length, observed count,
    null count, fold change, binomial p-value.
    """
    usage = 'usage: %prog [options] <bam_file,bam_file2,...>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='control_bam_files', help='Control BAM file to paramterize null distribution [Default: %default]')
    parser.add_option('-f', dest='filter_gff', help='Filter the TEs by overlap with genes in the given gff file [Default: %default]')
    parser.add_option('-g', dest='genome', default='HG19', help='Genome directory to obtain lengths from [Default: %default]')
    # NOTE(review): options.mapq is parsed but never referenced below —
    # confirm whether mapq filtering was meant to happen in this script.
    parser.add_option('-m', dest='mapq', default=False, action='store_true', help='Consider only reads with mapq>0 [Default: %default]')
    parser.add_option('-r', dest='repeats_gff', default='%s/hg19.fa.out.tp.gff' % os.environ['MASK'])
    parser.add_option('-s', dest='strand_split', default=False, action='store_true', help='Split statistics by strand [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide BAM files.')
    else:
        # comma-separated replicates in a single positional argument
        bam_files = args[0].split(',')

    control_bam_files = []
    if options.control_bam_files:
        control_bam_files = options.control_bam_files.split(',')

    ############################################
    # GFF filter
    ############################################
    # filter TEs and read alignments by gff file
    if options.filter_gff:
        # merge the filter regions into a sorted, non-overlapping BED
        filter_merged_bed_fd, filter_merged_bed_file = tempfile.mkstemp()
        subprocess.call('sortBed -i %s | mergeBed -i - > %s' % (options.filter_gff, filter_merged_bed_file), shell=True)

        # filter TE GFF
        te_gff_fd, te_gff_file = tempfile.mkstemp(dir='%s/research/scratch/temp' % os.environ['HOME'])
        subprocess.call('intersectBed -a %s -b %s > %s' % (options.repeats_gff, filter_merged_bed_file, te_gff_file), shell=True)
        # all downstream counting uses the filtered TE annotation
        options.repeats_gff = te_gff_file

        # filter BAM
        bam_gff_fds = [None]*len(bam_files)
        bam_gff_files = [None]*len(bam_files)
        for i in range(len(bam_files)):
            bam_gff_fds[i], bam_gff_files[i] = tempfile.mkstemp(dir='%s/research/scratch/temp' % os.environ['HOME'])
            bedtools.abam_f1(bam_files[i], filter_merged_bed_file, bam_gff_files[i])
            bam_files[i] = bam_gff_files[i]

        # filter control BAM
        if control_bam_files:
            cbam_gff_fds = [None]*len(control_bam_files)
            cbam_gff_files = [None]*len(control_bam_files)
            for i in range(len(control_bam_files)):
                cbam_gff_fds[i], cbam_gff_files[i] = tempfile.mkstemp(dir='%s/research/scratch/temp' % os.environ['HOME'])
                bedtools.abam_f1(control_bam_files[i], filter_merged_bed_file, cbam_gff_files[i])
                control_bam_files[i] = cbam_gff_files[i]

    ############################################
    # lengths
    ############################################
    # estimate read length (just averaging across replicates for now)
    read_lens = []
    for bam_file in bam_files:
        read_lens.append(estimate_read_length(bam_file))
    read_len = stats.mean(read_lens)

    # compute size of search space
    if options.filter_gff:
        genome_length = count_bed(filter_merged_bed_file, read_len)
    else:
        genome_length = count_genome(options.genome)

    # hash counted repeat genomic bp
    if options.filter_gff:
        te_lengths = te_target_size_bed(options.repeats_gff, filter_merged_bed_file, read_len)
    else:
        te_lengths = te_target_size(options.repeats_gff, read_len)

    ############################################
    # count TE fragments
    ############################################
    # per-replicate totals and per-TE counts; keys are (repeat, family),
    # with a '+'/'-' suffix on the repeat when strand_split is set
    fragments = []
    te_fragments = []
    for bam_file in bam_files:
        rep_fragments, rep_te_fragments = count_te_fragments(bam_file, options.repeats_gff, options.strand_split)
        fragments.append(rep_fragments)
        te_fragments.append(rep_te_fragments)

    if control_bam_files:
        control_fragments = []
        control_te_fragments = []
        for control_bam_file in control_bam_files:
            rep_fragments, rep_te_fragments = count_te_fragments(control_bam_file, options.repeats_gff, options.strand_split)
            control_fragments.append(rep_fragments)
            control_te_fragments.append(rep_te_fragments)

    ############################################
    # combine replicates into fragment rates
    ############################################
    # geometric mean of per-replicate rates; a missing TE defaults to a
    # pseudocount of 1 fragment so the geo mean never sees a zero
    te_fragment_rates = {}
    for (rep,fam) in te_lengths:
        if options.strand_split:
            # positive
            rate_list = [te_fragments[i].get((rep+'+',fam),1)/float(fragments[i]) for i in range(len(bam_files))]
            te_fragment_rates[(rep+'+',fam)] = stats.geo_mean(rate_list)
            # negative
            rate_list = [te_fragments[i].get((rep+'-',fam),1)/float(fragments[i]) for i in range(len(bam_files))]
            te_fragment_rates[(rep+'-',fam)] = stats.geo_mean(rate_list)
        else:
            rate_list = [te_fragments[i].get((rep,fam),1)/float(fragments[i]) for i in range(len(bam_files))]
            te_fragment_rates[(rep,fam)] = stats.geo_mean(rate_list)

    if control_bam_files:
        control_te_fragment_rates = {}
        for te in te_fragment_rates:
            rate_list = [control_te_fragments[i].get(te,1)/float(control_fragments[i]) for i in range(len(control_bam_files))]
            control_te_fragment_rates[te] = stats.geo_mean(rate_list)

    ############################################
    # compute stats, print table
    ############################################
    for (rep,fam) in te_fragment_rates:
        # compute TE length
        if options.strand_split:
            # rep carries a trailing '+'/'-' here; strip it to index lengths
            te_len = te_lengths[(rep[:-1],fam)]
        else:
            te_len = te_lengths[(rep,fam)]

        # parameterize null model
        if options.control_bam_files:
            null_rate = control_te_fragment_rates[(rep,fam)]
        else:
            if options.strand_split:
                # halve the genomic rate since each strand gets its own row
                null_rate = float(te_lengths[(rep[:-1],fam)]) / (2*genome_length)
            else:
                null_rate = float(te_lengths[(rep,fam)]) / genome_length

        # compute fragment counts
        count = te_fragment_rates[(rep,fam)]*sum(fragments)
        null_count = null_rate*sum(fragments)

        # compute fold change
        if null_rate > 0:
            fold = te_fragment_rates[(rep,fam)]/null_rate
        else:
            fold = 0

        # compute p-value of enrichment/depletion
        # binom.sf(k-1, n, p) = P(X >= k) for enrichment; binom.cdf(k, n, p)
        # = P(X <= k) for depletion; replicate p-values are multiplied
        p_val = 1.0
        for i in range(len(bam_files)):
            if te_fragment_rates[(rep,fam)] > null_rate:
                p_val *= binom.sf(int(te_fragments[i].get((rep,fam),1))-1, int(fragments[i]), null_rate)
            else:
                p_val *= binom.cdf(int(te_fragments[i].get((rep,fam),1)), int(fragments[i]), null_rate)

        cols = (rep, fam, te_len, count, null_count, fold, p_val)
        print '%-18s %-18s %10d %10.1f %10.1f %10.3f %10.2e' % cols

    ############################################
    # clean
    ############################################
    # remove every temp file created by the GFF-filter stage
    if options.filter_gff:
        os.close(filter_merged_bed_fd)
        os.remove(filter_merged_bed_file)
        os.close(te_gff_fd)
        os.remove(te_gff_file)
        for i in range(len(bam_files)):
            os.close(bam_gff_fds[i])
            os.remove(bam_gff_files[i])
        if options.control_bam_files:
            for i in range(len(control_bam_files)):
                os.close(cbam_gff_fds[i])
                os.remove(cbam_gff_files[i])
def main():
    """Measure transposable element (TE) enrichment in one or more BAM files
    against the hg19 genome.

    Counts fragments overlapping each (repeat, family) in a RepeatMasker GFF,
    combines replicate rates by geometric mean, parameterizes a null model
    (control BAMs if given, otherwise the TE's share of hg19), and prints a
    table: repeat, family, TE length, observed count, null count, fold
    change, binomial p-value.
    """
    usage = 'usage: %prog [options] <bam_file,bam_file2,...>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='control_bam_files', help='Control BAM file to paramterize null distribution [Default: %default]')
    parser.add_option('-g', dest='filter_gff', help='Filter the TEs by overlap with genes in the given gff file [Default: %default]')
    # NOTE(review): options.mapq is parsed but never referenced below —
    # confirm whether mapq filtering was meant to happen in this script.
    parser.add_option('-m', dest='mapq', default=False, action='store_true', help='Consider only reads with mapq>0 [Default: %default]')
    parser.add_option('-r', dest='repeats_gff', default='%s/hg19.fa.out.tp.gff' % os.environ['MASK'])
    parser.add_option('-s', dest='strand_split', default=False, action='store_true', help='Split statistics by strand [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide a BAM file.')
    else:
        # comma-separated replicates in a single positional argument
        bam_files = args[0].split(',')

    control_bam_files = []
    if options.control_bam_files:
        control_bam_files = options.control_bam_files.split(',')

    ############################################
    # GFF filter
    ############################################
    # filter TEs and read alignments by gff file
    if options.filter_gff:
        # merge the filter regions into a sorted, non-overlapping BED
        filter_merged_bed_fd, filter_merged_bed_file = tempfile.mkstemp()
        subprocess.call('sortBed -i %s | mergeBed -i - > %s' % (options.filter_gff, filter_merged_bed_file), shell=True)

        # filter TE GFF
        te_gff_fd, te_gff_file = tempfile.mkstemp(dir='%s/research/scratch/temp' % os.environ['HOME'])
        subprocess.call('intersectBed -a %s -b %s > %s' % (options.repeats_gff, filter_merged_bed_file, te_gff_file), shell=True)
        # all downstream counting uses the filtered TE annotation
        options.repeats_gff = te_gff_file

        # filter BAM
        bam_gff_fds = [None]*len(bam_files)
        bam_gff_files = [None]*len(bam_files)
        for i in range(len(bam_files)):
            bam_gff_fds[i], bam_gff_files[i] = tempfile.mkstemp(dir='%s/research/scratch/temp' % os.environ['HOME'])
            bedtools.abam_f1(bam_files[i], filter_merged_bed_file, bam_gff_files[i])
            bam_files[i] = bam_gff_files[i]

        # filter control BAM
        if control_bam_files:
            cbam_gff_fds = [None]*len(control_bam_files)
            cbam_gff_files = [None]*len(control_bam_files)
            for i in range(len(control_bam_files)):
                cbam_gff_fds[i], cbam_gff_files[i] = tempfile.mkstemp(dir='%s/research/scratch/temp' % os.environ['HOME'])
                bedtools.abam_f1(control_bam_files[i], filter_merged_bed_file, cbam_gff_files[i])
                control_bam_files[i] = cbam_gff_files[i]

    ############################################
    # lengths
    ############################################
    # estimate read length (just averaging across replicates for now)
    read_lens = []
    for bam_file in bam_files:
        read_lens.append(estimate_read_length(bam_file))
    read_len = stats.mean(read_lens)

    # compute size of search space
    if options.filter_gff:
        genome_length = count_bed(filter_merged_bed_file, read_len)
    else:
        # hg19 is hard-coded here (cf. the genome-directory variant)
        genome_length = count_hg19()

    # hash counted repeat genomic bp
    if options.filter_gff:
        te_lengths = te_target_size_bed(options.repeats_gff, filter_merged_bed_file, read_len)
    else:
        te_lengths = te_target_size(options.repeats_gff, read_len)

    ############################################
    # count TE fragments
    ############################################
    # per-replicate totals and per-TE counts; keys are (repeat, family),
    # with a '+'/'-' suffix on the repeat when strand_split is set
    fragments = []
    te_fragments = []
    for bam_file in bam_files:
        rep_fragments, rep_te_fragments = count_te_fragments(bam_file, options.repeats_gff, options.strand_split)
        fragments.append(rep_fragments)
        te_fragments.append(rep_te_fragments)

    if control_bam_files:
        control_fragments = []
        control_te_fragments = []
        for control_bam_file in control_bam_files:
            rep_fragments, rep_te_fragments = count_te_fragments(control_bam_file, options.repeats_gff, options.strand_split)
            control_fragments.append(rep_fragments)
            control_te_fragments.append(rep_te_fragments)

    ############################################
    # combine replicates into fragment rates
    ############################################
    # geometric mean of per-replicate rates; a missing TE defaults to a
    # pseudocount of 1 fragment so the geo mean never sees a zero
    te_fragment_rates = {}
    for (rep,fam) in te_lengths:
        if options.strand_split:
            # positive
            rate_list = [te_fragments[i].get((rep+'+',fam),1)/float(fragments[i]) for i in range(len(bam_files))]
            te_fragment_rates[(rep+'+',fam)] = stats.geo_mean(rate_list)
            # negative
            rate_list = [te_fragments[i].get((rep+'-',fam),1)/float(fragments[i]) for i in range(len(bam_files))]
            te_fragment_rates[(rep+'-',fam)] = stats.geo_mean(rate_list)
        else:
            rate_list = [te_fragments[i].get((rep,fam),1)/float(fragments[i]) for i in range(len(bam_files))]
            te_fragment_rates[(rep,fam)] = stats.geo_mean(rate_list)

    if control_bam_files:
        control_te_fragment_rates = {}
        for te in te_fragment_rates:
            rate_list = [control_te_fragments[i].get(te,1)/float(control_fragments[i]) for i in range(len(control_bam_files))]
            control_te_fragment_rates[te] = stats.geo_mean(rate_list)

    ############################################
    # compute stats, print table
    ############################################
    for (rep,fam) in te_fragment_rates:
        # compute TE length
        if options.strand_split:
            # rep carries a trailing '+'/'-' here; strip it to index lengths
            te_len = te_lengths[(rep[:-1],fam)]
        else:
            te_len = te_lengths[(rep,fam)]

        # parameterize null model
        if options.control_bam_files:
            null_rate = control_te_fragment_rates[(rep,fam)]
        else:
            if options.strand_split:
                # halve the genomic rate since each strand gets its own row
                null_rate = float(te_lengths[(rep[:-1],fam)]) / (2*genome_length)
            else:
                null_rate = float(te_lengths[(rep,fam)]) / genome_length

        # compute fragment counts
        count = te_fragment_rates[(rep,fam)]*sum(fragments)
        null_count = null_rate*sum(fragments)

        # compute fold change
        if null_rate > 0:
            fold = te_fragment_rates[(rep,fam)]/null_rate
        else:
            fold = 0

        # compute p-value of enrichment/depletion
        # binom.sf(k-1, n, p) = P(X >= k) for enrichment; binom.cdf(k, n, p)
        # = P(X <= k) for depletion; replicate p-values are multiplied
        p_val = 1.0
        for i in range(len(bam_files)):
            if te_fragment_rates[(rep,fam)] > null_rate:
                p_val *= binom.sf(int(te_fragments[i].get((rep,fam),1))-1, int(fragments[i]), null_rate)
            else:
                p_val *= binom.cdf(int(te_fragments[i].get((rep,fam),1)), int(fragments[i]), null_rate)

        cols = (rep, fam, te_len, count, null_count, fold, p_val)
        print '%-18s %-18s %10d %10.1f %10.1f %10.3f %10.2e' % cols

    ############################################
    # clean
    ############################################
    # remove every temp file created by the GFF-filter stage
    if options.filter_gff:
        os.close(filter_merged_bed_fd)
        os.remove(filter_merged_bed_file)
        os.close(te_gff_fd)
        os.remove(te_gff_file)
        for i in range(len(bam_files)):
            os.close(bam_gff_fds[i])
            os.remove(bam_gff_files[i])
        if options.control_bam_files:
            for i in range(len(control_bam_files)):
                os.close(cbam_gff_fds[i])
                os.remove(cbam_gff_files[i])