def main():
    """Test which transposable elements (TEs) are enriched in a feature set.

    Command line: ``[options] <feature gff/bed>``.  For every TE key returned
    by ``hash_te`` the script prints the overlapping bp, the feature and
    genome frequencies, their fold change, a p-value taken from a normal
    distribution fitted to ``-n`` shuffles of the features (``shuffleBed``),
    and the value returned by ``fdr.ben_hoch`` for that p-value
    (presumably a Benjamini-Hochberg q-value -- confirm against ``fdr``).

    With ``-g`` both the TE annotation and the features are first restricted
    to the given GFF using ``intersectBed``.

    Temporary files are always removed, including on early exits.
    """
    usage = 'usage: %prog [options] <feature gff/bed>'
    parser = OptionParser(usage)
    parser.add_option(
        '-g', dest='gff_file',
        help='Filter the TEs by overlap with genes in the given gff file [Default: %default]')
    parser.add_option(
        '-r', dest='repeats_gff',
        default='%s/research/common/data/genomes/hg19/annotation/repeatmasker/hg19.fa.out.tp.gff' % home_dir)
    parser.add_option(
        '-n', dest='null_iterations', type=int, default=50,
        help='Number of shuffles to perform to estimate null distribution [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        # parser.error() exits, so nothing below runs without a feature file
        parser.error('Must provide a gff file for the feature of interest.')
    feature_gff = args[0]

    # The format is read from the user-supplied name *before* the path can be
    # swapped for a temporary file; a bare mkstemp() name has no usable
    # suffix, which previously made every -g run fail format detection.
    feature_format = feature_gff[-3:]

    # every (fd, path) pair from mkstemp is recorded so that the finally
    # clause can release it on any exit path
    temp_files = []
    try:
        ############################################
        # GFF filter
        ############################################
        # filter TEs and features by gff file
        if options.gff_file:
            # filter TE GFF
            te_gff_fd, te_gff_file = tempfile.mkstemp()
            temp_files.append((te_gff_fd, te_gff_file))
            subprocess.call(
                'intersectBed -a %s -b %s > %s'
                % (options.repeats_gff, options.gff_file, te_gff_file),
                shell=True)
            options.repeats_gff = te_gff_file

            # filter feature GFF; keep the original suffix on the temp file
            # in case downstream helpers also look at the file name
            if feature_format in ('gtf', 'gff', 'bed'):
                filtered_suffix = '.' + feature_format
            else:
                filtered_suffix = ''
            filtered_fd, filtered_file = tempfile.mkstemp(suffix=filtered_suffix)
            temp_files.append((filtered_fd, filtered_file))
            subprocess.call(
                'intersectBed -s -u -f 0.5 -a %s -b %s > %s'
                % (feature_gff, options.gff_file, filtered_file),
                shell=True)
            feature_gff = filtered_file

        ############################################
        # lengths
        ############################################
        # compute size of search space
        if options.gff_file:
            genome_length = count_gff(options.gff_file)
        else:
            genome_length = count_hg19()

        # compute feature length
        feature_len, feature_num = feature_stats(feature_gff)

        if feature_num == 0:
            sys.stderr.write('Zero features\n')
            sys.exit()

        # hash counted repeat genomic bp
        te_in = open(options.repeats_gff)
        try:
            genome_te_bp = hash_te(te_in)
        finally:
            te_in.close()

        ############################################
        # convert feature gff to bed
        ############################################
        if feature_format == 'gtf':
            feature_bed_fd, feature_bed_file = tempfile.mkstemp()
            temp_files.append((feature_bed_fd, feature_bed_file))
            subprocess.call(
                'gtf2bed.py %s > %s' % (feature_gff, feature_bed_file),
                shell=True)
        elif feature_format == 'gff':
            feature_bed_fd, feature_bed_file = tempfile.mkstemp()
            temp_files.append((feature_bed_fd, feature_bed_file))
            subprocess.call(
                'gff2bed.py %s > %s' % (feature_gff, feature_bed_file),
                shell=True)
        elif feature_format == 'bed':
            feature_bed_file = feature_gff
        else:
            parser.error('Cannot recognize gff format suffix')

        ############################################
        # null distribution
        ############################################
        shuffle_bed_fd, shuffle_bed_file = tempfile.mkstemp()
        temp_files.append((shuffle_bed_fd, shuffle_bed_file))

        # te -> list of overlap bp, one entry per shuffle
        te_null_bp = {}
        for ni in range(options.null_iterations):
            # progress indicator
            sys.stderr.write('%d\n' % ni)

            # shuffle feature bed
            subprocess.call(
                'shuffleBed -i %s -g %s/research/common/data/genomes/hg19/assembly/human.hg19.genome -excl %s/research/common/data/genomes/hg19/assembly/hg19_gaps.bed > %s'
                % (feature_bed_file, home_dir, home_dir, shuffle_bed_file),
                shell=True)

            # intersect w/ TEs and hash overlaps
            te_tmp_bp = intersect_hash(options.repeats_gff, shuffle_bed_file)
            for te in genome_te_bp:
                te_null_bp.setdefault(te, []).append(te_tmp_bp.get(te, 0))

        ############################################
        # actual
        ############################################
        te_bp = intersect_hash(options.repeats_gff, feature_gff)

        ############################################
        # compute stats and print
        ############################################
        lines = []
        p_vals = []
        for te in genome_te_bp:
            observed_bp = te_bp.get(te, 0)
            feature_freq = float(observed_bp) / feature_len
            genome_freq = float(genome_te_bp[te]) / genome_length
            fold_change = feature_freq / genome_freq

            null_u, null_sd = stats.mean_sd(te_null_bp[te])
            if null_sd == 0:
                # avoid a degenerate (zero-width) normal
                null_sd = 1.0

            if fold_change > 1:
                # enriched: upper tail, P(X >= observed); fold_change > 1
                # implies observed_bp > 0, so te is present in te_bp
                p = norm.sf(te_bp[te] - 1, loc=null_u, scale=null_sd)
            else:
                # depleted: lower tail, P(X <= observed)
                p = norm.cdf(observed_bp, loc=null_u, scale=null_sd)

            p_vals.append(p)

            cols = (te[0], te[1], observed_bp, feature_freq, genome_freq,
                    fold_change, p)
            lines.append('%-18s %-18s %8d %11.2e %11.2e %9.2f %10.2e' % cols)

        # correct for multiple hypothesis testing
        q_vals = fdr.ben_hoch(p_vals)
        for line, q_val in zip(lines, q_vals):
            print(line + ' %10.2e' % q_val)

    finally:
        ############################################
        # clean
        ############################################
        for temp_fd, temp_file in temp_files:
            os.close(temp_fd)
            os.remove(temp_file)
def main():
    """Measure how uniformly reads cover each expressed transcript.

    Command line: ``[options] <bam> <ref_gtf>``.  Unless ``--cuff_done`` is
    given, Cufflinks is run on a GTF produced by ``clip_peaks.prerna_gtf``.
    Reads are then assigned to fixed-size windows (``-w``) along each isoform,
    weighted by the isoform's share of the FPKM at that position.  For every
    transcript with FPKM > 1 and more than 5 windows, the variance/mean ratio
    of the window counts (last window excluded) is written to
    ``<out_dir>/uniformity_table.txt``; summary statistics of those ratios are
    printed to stdout.

    Exits with status 1 if ``-g`` names a gene that is not found.
    """
    usage = 'usage: %prog [options] <bam> <ref_gtf>'
    parser = OptionParser(usage)

    # IO options
    parser.add_option(
        '-o', dest='out_dir', default='uniform',
        help='Output directory [Default: %default]')

    # window options
    parser.add_option(
        '-w', dest='window_size', type='int', default=25,
        help='Window size for counting [Default: %default]')
    parser.add_option(
        '-i', '--ignore', dest='ignore_gff',
        help='Ignore reads overlapping overlapping troublesome regions in the given GFF file')
    parser.add_option(
        '-u', '--unstranded', dest='unstranded', action='store_true', default=False,
        help='Sequencing is unstranded [Default: %default]')

    # cufflinks options
    parser.add_option(
        '--cuff_done', dest='cuff_done', action='store_true', default=False,
        help='The Cufflinks run to estimate the model parameters is already done [Default: %default]')
    parser.add_option(
        '-t', dest='threads', type='int', default=2,
        help='Number of threads to use [Default: %default]')

    # debug options
    parser.add_option(
        '-v', '--verbose', dest='verbose', action='store_true', default=False,
        help='Verbose output [Default: %default]')
    parser.add_option(
        '-g', '--gene', dest='gene_only',
        help='Call peaks on the specified gene only')

    (options, args) = parser.parse_args()

    if len(args) != 2:
        # parser.error() exits
        parser.error(usage)
    bam = args[0]
    ref_gtf = args[1]

    clip_peaks.out_dir = options.out_dir
    if not os.path.isdir(clip_peaks.out_dir):
        os.mkdir(clip_peaks.out_dir)

    ############################################
    # parameterize
    ############################################
    if not options.cuff_done:
        # make a new gtf w/ unspliced RNAs
        update_ref_gtf = clip_peaks.prerna_gtf(ref_gtf)

        subprocess.call(
            'cufflinks -o %s -p %d -G %s %s'
            % (clip_peaks.out_dir, options.threads, update_ref_gtf, bam),
            shell=True)

    transcripts_gtf = '%s/transcripts.gtf' % clip_peaks.out_dir

    # store transcripts
    transcripts = clip_peaks.read_genes(transcripts_gtf, key_id='transcript_id')

    # merge overlapping genes
    g2t_merge, antisense_clusters = clip_peaks.merged_g2t(
        transcripts_gtf, options.unstranded)

    if options.unstranded:
        # alter strands
        clip_peaks.ambiguate_strands(transcripts, g2t_merge, antisense_clusters)

    # set transcript FPKMs
    clip_peaks.set_transcript_fpkms(transcripts, clip_peaks.out_dir, missing_fpkm=0)

    # possibly limit genes to examine; merged keys are comma-separated gene ids
    if options.gene_only:
        gene_ids = [gids for gids in g2t_merge.keys()
                    if options.gene_only in gids.split(',')]
        if not gene_ids:
            sys.stderr.write('gene_id %s not found\n' % options.gene_only)
            sys.exit(1)
    else:
        gene_ids = g2t_merge.keys()

    ############################################
    # filter BAM
    ############################################
    if options.ignore_gff:
        bam_ignore_fd, bam_ignore_file = tempfile.mkstemp(
            dir='%s/research/scratch/temp' % os.environ['HOME'])
        subprocess.call(
            'intersectBed -v -abam %s -b %s > %s'
            % (bam, options.ignore_gff, bam_ignore_file),
            shell=True)
        bam = bam_ignore_file

    ############################################
    # process genes
    ############################################
    # index
    subprocess.call('samtools index %s' % bam, shell=True)

    # initialize stats
    table_out = open('%s/uniformity_table.txt' % clip_peaks.out_dir, 'w')
    id_list = []     # variance/mean ratio per reported transcript
    fpkm_list = []   # matching FPKM, same order as id_list

    # open bam
    bam_in = pysam.Samfile(bam, 'rb')

    try:
        # for each gene
        for gene_id in gene_ids:
            # make a more focused transcript hash for this gene
            gene_transcripts = dict(
                (tid, transcripts[tid]) for tid in g2t_merge[gene_id])

            # obtain basic gene attributes
            (gchrom, gstrand, gstart, gend) = clip_peaks.gene_attrs(gene_transcripts)

            # initialize window counts
            transcript_isoform_counts = dict(
                (tid, []) for tid in gene_transcripts)

            # choose a single event position and weight the reads
            read_pos_weights = clip_peaks.position_reads(
                bam_in, gchrom, gstart, gend, gstrand, mapq_zero=True)

            # process read alignments (third tuple field is unused here)
            for (pos, weight, _mm) in read_pos_weights:
                # map pos to isoforms
                iso_pos = {}
                for tid in gene_transcripts:
                    iso_pos[tid] = isoform_position(gene_transcripts[tid], pos)

                # sum fpkms for hit isoforms
                fpkm_sum = sum(gene_transcripts[tid].fpkm
                               for tid in gene_transcripts
                               if iso_pos[tid] is not None)

                if fpkm_sum <= 0:
                    # no expressed isoform covers this position; drop the read
                    continue

                # distribute read to isoform counts
                for tid in gene_transcripts:
                    if iso_pos[tid] is None:
                        continue
                    win_i = int(iso_pos[tid] // options.window_size)
                    counts = transcript_isoform_counts[tid]
                    # grow the window list on demand
                    while win_i >= len(counts):
                        counts.append(0)
                    counts[win_i] += weight * gene_transcripts[tid].fpkm / fpkm_sum

            # compute window stats
            for tid in gene_transcripts:
                counts = transcript_isoform_counts[tid]
                if gene_transcripts[tid].fpkm > 1 and len(counts) > 5:
                    # the last window is dropped (presumably because it is
                    # only partially covered by the transcript)
                    u, sd = stats.mean_sd(counts[:-1])
                    if u > 0:
                        id_list.append(sd * sd / u)
                        fpkm_list.append(gene_transcripts[tid].fpkm)

                        cols = (tid, gene_transcripts[tid].fpkm,
                                len(counts) - 1, u, sd, id_list[-1])
                        table_out.write(
                            '%-20s %8.2f %6d %7.2f %7.2f %5.3f\n' % cols)
    finally:
        bam_in.close()
        table_out.close()

    ############################################
    # summary stats
    ############################################
    if id_list:
        median = stats.median(id_list)
        mean = stats.mean(id_list)

        # every recorded FPKM is > 1, so both weight sums are positive here
        fpkm_cv_sum = sum(d * f for d, f in zip(id_list, fpkm_list))
        fpkm_sum = sum(fpkm_list)
        fpkm_mean = fpkm_cv_sum / fpkm_sum

        logfpkm_cv_sum = sum(d * math.log(f + 1, 2)
                             for d, f in zip(id_list, fpkm_list))
        logfpkm_sum = sum(math.log(f + 1, 2) for f in fpkm_list)
        logfpkm_mean = logfpkm_cv_sum / logfpkm_sum

        # print
        print('Median:                %7.4f' % median)
        print('Mean:                  %7.4f' % mean)
        print('FPKM-weighted mean:    %7.4f' % fpkm_mean)
        print('logFPKM-weighted mean: %7.4f' % logfpkm_mean)
    else:
        # Previously this case died with ZeroDivisionError (and skipped the
        # cleanup below) because the weight sums are zero.
        sys.stderr.write(
            'No transcripts passed the FPKM and window filters; '
            'skipping summary statistics\n')

    # clean cufflinks output
    if not options.cuff_done:
        os.remove(update_ref_gtf)
        os.remove('%s/skipped.gtf' % clip_peaks.out_dir)
        os.remove('%s/genes.fpkm_tracking' % clip_peaks.out_dir)

    if options.ignore_gff:
        os.close(bam_ignore_fd)
        os.remove(bam_ignore_file)
        # samtools index writes "<bam>.bai" next to the temporary BAM
        bam_index_file = bam_ignore_file + '.bai'
        if os.path.exists(bam_index_file):
            os.remove(bam_index_file)
def main():
    """Cross-validate pegasos over a geometric sweep of lambda values.

    Command line: ``[options] <input_file>``.  For each replicate (``-r``) the
    data is split into ``-k`` folds by ``divide_data`` inside a fresh
    ``<input_base>_rep<r>`` directory, and one pegasos model is trained per
    fold and per lambda in ``[lambda_min, lambda_max]`` (multiplying by
    ``lambda_mult`` each step).  For each lambda the script then prints mean
    recall and precision across replicates, each followed by its standard
    error, the F1 score and F1 minus the positive-class fraction; a row of
    ``NA`` is printed when ``compute_accuracy`` reports a failure.

    With ``-w`` only ``summarize_weights`` is run.
    """
    usage = "usage: %prog [options] <input_file>"
    parser = OptionParser(usage)
    parser.add_option(
        "-k", dest="k_fold", type="int", default=10,
        help="Number of folds to use for cross-validation [Default: %default]",
    )
    parser.add_option(
        "--lambda_min", dest="lambda_min", type="float", default=0.01,
        help="Minimum -lambda value to attempt [Default: %default]",
    )
    parser.add_option(
        "--lambda_max", dest="lambda_max", type="float", default=10.0,
        help="Maximum -lambda value to attempt [Default: %default]",
    )
    parser.add_option(
        "--lambda_mult", dest="lambda_mult", type="float", default=2.0,
        help="Multiplier for next -lambda value to attempt [Default: %default]",
    )
    parser.add_option(
        "-l", dest="lesser_kmers", action="store_true", default=False,
        help="Use all kmers of length less than and equal to that given by -k [Default: %default]",
    )
    parser.add_option(
        "-p", dest="parallel", type="int", default=4,
        help="Number of parallel threads to run [Default: %default]",
    )
    parser.add_option(
        "-r", dest="replicates", type="int", default=1,
        help="Number of times to repeat the optimization for each fold [Default: %default]",
    )
    parser.add_option(
        "-w", dest="weights", action="store_true", default=False,
        help="Print a summary of the weight vectors",
    )
    (options, args) = parser.parse_args()

    if len(args) != 1:
        # parser.error() exits
        parser.error("Must provide input file")
    input_file = args[0]
    input_base = os.path.splitext(input_file)[0]

    # The sweep below multiplies lambda until it exceeds lambda_max; that only
    # terminates for a positive start and a multiplier above one.
    if options.lambda_min <= 0 or options.lambda_mult <= 1:
        parser.error("--lambda_min must be > 0 and --lambda_mult must be > 1")

    if options.weights:
        summarize_weights(input_base, options)
        exit()

    # determine % of positive examples
    input_pos, input_total = positive_percent(input_file)
    # NOTE(review): used as the F1 baseline; presumably the F1 expected from
    # guessing positives at the base rate -- confirm.
    f1_base = input_pos / float(input_total)

    # Resolve paths once: prefixing "../" and returning with chdir("..") only
    # worked when the input was a bare file name in the current directory.
    input_path = os.path.abspath(input_file)
    start_dir = os.getcwd()

    for r in range(options.replicates):
        rep_dir = "%s_rep%d" % (input_base, r)
        if os.path.isdir(rep_dir):
            shutil.rmtree(rep_dir)
        os.mkdir(rep_dir)
        os.chdir(rep_dir)
        try:
            # divide data into folds
            divide_data(input_path, options.k_fold)

            # collect pegasos commands
            cmds = []
            peg_lambda = options.lambda_min
            while peg_lambda <= options.lambda_max:
                # run on each fold; "> /dev/null 2>&1" is the portable form
                # of bash's "&>", which a POSIX sh parses as "run in
                # background"
                for f in range(options.k_fold):
                    cmds.append(
                        "pegasos -lambda %f -modelFile fold%d/train_%.1e.mod fold%d/train.dat > /dev/null 2>&1"
                        % (peg_lambda, f, peg_lambda, f)
                    )

                # increase lambda
                peg_lambda *= options.lambda_mult

            # execute pegasos commands
            util.exec_par(cmds, options.parallel)

            # start to clean up space
            for f in range(options.k_fold):
                os.remove("fold%d/train.dat" % f)
        finally:
            # always return to where we started, even if a step failed
            os.chdir(start_dir)

    # collect results
    peg_lambda = options.lambda_min
    while peg_lambda <= options.lambda_max:
        recalls = []
        precisions = []
        failed = False

        for r in range(options.replicates):
            if failed:
                break

            outcomes = {"tp": 0, "fp": 0, "fn": 0}

            # collect each fold
            for f in range(options.k_fold):
                fold_dir = "%s_rep%d/fold%d" % (input_base, r, f)
                if not compute_accuracy(outcomes, fold_dir, peg_lambda):
                    failed = True
                    break

            # save
            if not failed:
                true_pos = float(outcomes["tp"])
                actual_pos = outcomes["tp"] + outcomes["fn"]
                predicted_pos = outcomes["tp"] + outcomes["fp"]
                # a model that predicts no positives (or data without
                # positives) scores 0 instead of raising ZeroDivisionError
                recalls.append(true_pos / actual_pos if actual_pos else 0.0)
                precisions.append(true_pos / predicted_pos if predicted_pos else 0.0)

        # summarize and print
        if failed:
            print("%.1e %8s %7s %8s %7s %8s %8s" % (peg_lambda, "NA", "NA", "NA", "NA", "NA", "NA"))
        else:
            # standard error of the mean across replicates
            recall, rsd = stats.mean_sd(recalls)
            rsd /= math.sqrt(len(recalls))
            precision, psd = stats.mean_sd(precisions)
            psd /= math.sqrt(len(precisions))

            if recall + precision > 0:
                f1 = 2 * recall * precision / (recall + precision)
            else:
                f1 = 0.0

            print(
                "%.1e %8.4f %7.4f %8.4f %7.4f %8.4f %8.4f"
                % (peg_lambda, recall, rsd, precision, psd, f1, (f1 - f1_base))
            )

        peg_lambda *= options.lambda_mult
def main():
    """Measure how uniformly reads cover each expressed transcript.

    Command line: ``[options] <bam> <ref_gtf>``.  Unless ``--cuff_done`` is
    given, Cufflinks is run on a GTF produced by ``clip_peaks.prerna_gtf``.
    Reads are then assigned to fixed-size windows (``-w``) along each isoform,
    weighted by the isoform's share of the FPKM at that position.  For every
    transcript with FPKM > 1 and more than 5 windows, the variance/mean ratio
    of the window counts (last window excluded) is written to
    ``<out_dir>/uniformity_table.txt``; summary statistics of those ratios are
    printed to stdout.

    Exits with status 1 if ``-g`` names a gene that is not found.
    """
    usage = 'usage: %prog [options] <bam> <ref_gtf>'
    parser = OptionParser(usage)

    # IO options
    parser.add_option(
        '-o', dest='out_dir', default='uniform',
        help='Output directory [Default: %default]')

    # window options
    parser.add_option(
        '-w', dest='window_size', type='int', default=25,
        help='Window size for counting [Default: %default]')
    parser.add_option(
        '-i', '--ignore', dest='ignore_gff',
        help='Ignore reads overlapping overlapping troublesome regions in the given GFF file')
    parser.add_option(
        '-u', '--unstranded', dest='unstranded', action='store_true', default=False,
        help='Sequencing is unstranded [Default: %default]')

    # cufflinks options
    parser.add_option(
        '--cuff_done', dest='cuff_done', action='store_true', default=False,
        help='The Cufflinks run to estimate the model parameters is already done [Default: %default]')
    parser.add_option(
        '-t', dest='threads', type='int', default=2,
        help='Number of threads to use [Default: %default]')

    # debug options
    parser.add_option(
        '-v', '--verbose', dest='verbose', action='store_true', default=False,
        help='Verbose output [Default: %default]')
    parser.add_option(
        '-g', '--gene', dest='gene_only',
        help='Call peaks on the specified gene only')

    (options, args) = parser.parse_args()

    if len(args) != 2:
        # parser.error() exits
        parser.error(usage)
    bam = args[0]
    ref_gtf = args[1]

    clip_peaks.out_dir = options.out_dir
    if not os.path.isdir(clip_peaks.out_dir):
        os.mkdir(clip_peaks.out_dir)

    ############################################
    # parameterize
    ############################################
    if not options.cuff_done:
        # make a new gtf w/ unspliced RNAs
        update_ref_gtf = clip_peaks.prerna_gtf(ref_gtf)

        subprocess.call(
            'cufflinks -o %s -p %d -G %s %s'
            % (clip_peaks.out_dir, options.threads, update_ref_gtf, bam),
            shell=True)

    transcripts_gtf = '%s/transcripts.gtf' % clip_peaks.out_dir

    # store transcripts
    transcripts = clip_peaks.read_genes(transcripts_gtf, key_id='transcript_id')

    # merge overlapping genes
    g2t_merge, antisense_clusters = clip_peaks.merged_g2t(
        transcripts_gtf, options.unstranded)

    if options.unstranded:
        # alter strands
        clip_peaks.ambiguate_strands(transcripts, g2t_merge, antisense_clusters)

    # set transcript FPKMs
    clip_peaks.set_transcript_fpkms(transcripts, clip_peaks.out_dir, missing_fpkm=0)

    # possibly limit genes to examine; merged keys are comma-separated gene ids
    if options.gene_only:
        gene_ids = [gids for gids in g2t_merge.keys()
                    if options.gene_only in gids.split(',')]
        if not gene_ids:
            sys.stderr.write('gene_id %s not found\n' % options.gene_only)
            sys.exit(1)
    else:
        gene_ids = g2t_merge.keys()

    ############################################
    # filter BAM
    ############################################
    if options.ignore_gff:
        bam_ignore_fd, bam_ignore_file = tempfile.mkstemp(
            dir='%s/research/scratch/temp' % os.environ['HOME'])
        subprocess.call(
            'intersectBed -v -abam %s -b %s > %s'
            % (bam, options.ignore_gff, bam_ignore_file),
            shell=True)
        bam = bam_ignore_file

    ############################################
    # process genes
    ############################################
    # index
    subprocess.call('samtools index %s' % bam, shell=True)

    # initialize stats
    table_out = open('%s/uniformity_table.txt' % clip_peaks.out_dir, 'w')
    id_list = []     # variance/mean ratio per reported transcript
    fpkm_list = []   # matching FPKM, same order as id_list

    # open bam
    bam_in = pysam.Samfile(bam, 'rb')

    try:
        # for each gene
        for gene_id in gene_ids:
            # make a more focused transcript hash for this gene
            gene_transcripts = dict(
                (tid, transcripts[tid]) for tid in g2t_merge[gene_id])

            # obtain basic gene attributes
            (gchrom, gstrand, gstart, gend) = clip_peaks.gene_attrs(gene_transcripts)

            # initialize window counts
            transcript_isoform_counts = dict(
                (tid, []) for tid in gene_transcripts)

            # choose a single event position and weight the reads
            read_pos_weights = clip_peaks.position_reads(
                bam_in, gchrom, gstart, gend, gstrand, mapq_zero=True)

            # process read alignments (third tuple field is unused here)
            for (pos, weight, _mm) in read_pos_weights:
                # map pos to isoforms
                iso_pos = {}
                for tid in gene_transcripts:
                    iso_pos[tid] = isoform_position(gene_transcripts[tid], pos)

                # sum fpkms for hit isoforms
                fpkm_sum = sum(gene_transcripts[tid].fpkm
                               for tid in gene_transcripts
                               if iso_pos[tid] is not None)

                if fpkm_sum <= 0:
                    # no expressed isoform covers this position; drop the read
                    continue

                # distribute read to isoform counts
                for tid in gene_transcripts:
                    if iso_pos[tid] is None:
                        continue
                    win_i = int(iso_pos[tid] // options.window_size)
                    counts = transcript_isoform_counts[tid]
                    # grow the window list on demand
                    while win_i >= len(counts):
                        counts.append(0)
                    counts[win_i] += weight * gene_transcripts[tid].fpkm / fpkm_sum

            # compute window stats
            for tid in gene_transcripts:
                counts = transcript_isoform_counts[tid]
                if gene_transcripts[tid].fpkm > 1 and len(counts) > 5:
                    # the last window is dropped (presumably because it is
                    # only partially covered by the transcript)
                    u, sd = stats.mean_sd(counts[:-1])
                    if u > 0:
                        id_list.append(sd * sd / u)
                        fpkm_list.append(gene_transcripts[tid].fpkm)

                        cols = (tid, gene_transcripts[tid].fpkm,
                                len(counts) - 1, u, sd, id_list[-1])
                        table_out.write(
                            '%-20s %8.2f %6d %7.2f %7.2f %5.3f\n' % cols)
    finally:
        bam_in.close()
        table_out.close()

    ############################################
    # summary stats
    ############################################
    if id_list:
        median = stats.median(id_list)
        mean = stats.mean(id_list)

        # every recorded FPKM is > 1, so both weight sums are positive here
        fpkm_cv_sum = sum(d * f for d, f in zip(id_list, fpkm_list))
        fpkm_sum = sum(fpkm_list)
        fpkm_mean = fpkm_cv_sum / fpkm_sum

        logfpkm_cv_sum = sum(d * math.log(f + 1, 2)
                             for d, f in zip(id_list, fpkm_list))
        logfpkm_sum = sum(math.log(f + 1, 2) for f in fpkm_list)
        logfpkm_mean = logfpkm_cv_sum / logfpkm_sum

        # print
        print('Median:                %7.4f' % median)
        print('Mean:                  %7.4f' % mean)
        print('FPKM-weighted mean:    %7.4f' % fpkm_mean)
        print('logFPKM-weighted mean: %7.4f' % logfpkm_mean)
    else:
        # Previously this case died with ZeroDivisionError (and skipped the
        # cleanup below) because the weight sums are zero.
        sys.stderr.write(
            'No transcripts passed the FPKM and window filters; '
            'skipping summary statistics\n')

    # clean cufflinks output
    if not options.cuff_done:
        os.remove(update_ref_gtf)
        os.remove('%s/skipped.gtf' % clip_peaks.out_dir)
        os.remove('%s/genes.fpkm_tracking' % clip_peaks.out_dir)

    if options.ignore_gff:
        os.close(bam_ignore_fd)
        os.remove(bam_ignore_file)
        # samtools index writes "<bam>.bai" next to the temporary BAM
        bam_index_file = bam_ignore_file + '.bai'
        if os.path.exists(bam_index_file):
            os.remove(bam_index_file)