def utest(self, score):
    """
    Estimate, with a Mann-Whitney U test, the probability that the scores
    separate positive from negative items purely by chance.

    See: Mason & Graham (2002) Areas beneath the relative operating
    characteristics (ROC) and relative operating levels (ROL) curves:
    Statistical significance and interpretation

    @param score: the score predicted for each item
    @type  score: [ float ]

    @return: 1-tailed P-value
    @rtype: float
    """
    is_negative = N.logical_not(self.positives)

    # split the scores by class; each group is reversed and converted to a
    # plain list before it is handed to the test
    pos_scores = N.compress(self.positives, score)[::-1].tolist()
    neg_scores = N.compress(is_negative, score)[::-1].tolist()

    # the second element of the test result is returned as the P-value
    return stats.mannwhitneyu(pos_scores, neg_scores)[1]
def compute_stats(te_diffs, gene_diffs, plot_dir):
    """
    Test, per TE key and sample pair, values with the TE against values
    without it.

    te_diffs maps 3-tuples (unpacked as rep, fam, orient) to dicts that map
    2-tuple sample keys to a multiset of values; gene_diffs maps the same
    sample keys to the multiset for all genes. The multisets must support
    subtraction and .elements() (as collections.Counter does).

    Only combinations whose multiset has at least 10 distinct entries are
    tested. Returns (table_lines, pvals): one formatted row and one p-value
    per tested combination, in matching order. For a fixed set of families
    under the '*' repeat, a CDF plot is also written into plot_dir.
    """
    plot_families = ['*', 'LINE/L1', 'SINE/Alu', 'LTR/ERV1', 'LTR/ERVL-MaLR', 'LINE/L2', 'LTR/ERVL', 'SINE/MIR', 'DNA/hAT-Charlie', 'LTR/ERVK', 'DNA/TcMar-Tigger']
    row_format = '%-17s %-17s %1s %-10s %-10s %6d %9.2f %9.2f %8.2f %10.2e'

    table_lines = []
    pvals = []

    for te_or, by_sample in te_diffs.items():
        rep, fam, orient = te_or

        for sample_key, te_counts in by_sample.items():
            sample1, sample2 = sample_key

            # not enough data for this combination
            if len(te_counts) < 10:
                continue

            # values without the TE are what remains after removing the
            # TE multiset from the all-gene multiset
            wo_te = list((gene_diffs[sample_key] - te_counts).elements())
            w_te = list(te_counts.elements())

            wo_mean = stats.mean(wo_te)
            w_mean = stats.mean(w_te)

            z, p = stats.mannwhitneyu(w_te, wo_te)

            table_lines.append(row_format % (rep, fam, orient, sample1, sample2, len(w_te), w_mean, wo_mean, z, p))
            pvals.append(p)

            # plot only the wildcard repeat of the selected families
            if rep == '*' and fam in plot_families:
                out_pdf = '%s/%s_%s_%s_%s-%s.pdf' % (plot_dir, rep.replace('/', '-'), fam.replace('/', '-'), orient, sample1, sample2)
                cdf_plot(te_or, w_te, wo_te, out_pdf)

    return table_lines, pvals
def utest( self, score ):
    """
    Estimate, with a Mann-Whitney U test, the probability that the scores
    separate positive from negative items purely by chance.

    See: Mason & Graham (2002) Areas beneath the relative operating
    characteristics (ROC) and relative operating levels (ROL) curves:
    Statistical significance and interpretation

    @param score: the score predicted for each item
    @type  score: [ float ]

    @return: 1-tailed P-value
    @rtype: float
    """
    is_negative = N.logical_not( self.positives )

    ## split the scores by class; each group is reversed and converted to a
    ## plain list before it is handed to the test
    pos_scores = N.compress( self.positives, score )[::-1].tolist()
    neg_scores = N.compress( is_negative, score )[::-1].tolist()

    ## the second element of the test result is returned as the P-value
    return stats.mannwhitneyu( pos_scores, neg_scores )[1]
def compute_stats(te_genes, gene_diff, ref_gtf, plot_dir, scale):
    """
    Compare, per sample pair and TE key, the values of transcripts that
    carry the TE against those that do not.

    te_genes maps 3-tuples (unpacked as repeat, family, orient) to a
    container of transcript ids; gene_diff maps 2-tuple sample keys to a
    dict of transcript id -> value. Only transcripts whose id appears as
    'transcript_id' in the ref_gtf file are considered.

    A Mann-Whitney U test is run when more than 5 TE transcripts are
    present; otherwise the row is reported with z = 0 and p = 1.

    Returns (table_lines, pvals): one formatted row and one p-value per
    (sample pair, TE key) that has at least one TE transcript. For a fixed
    set of families under the '*' repeat, a CDF plot is also written into
    plot_dir (scale is forwarded to cdf_plot).
    """
    # focus on GTF genes
    gtf_genes = set()
    # the context manager closes the GTF even when parsing fails
    with open(ref_gtf) as gtf_in:
        for line in gtf_in:
            # header/comment lines and blank lines have no attribute column
            if line.startswith('#') or not line.strip():
                continue
            a = line.split('\t')
            tid = gff.gtf_kv(a[8])['transcript_id']
            gtf_genes.add(tid)

    plot_families = ['*', 'LINE/L1', 'SINE/Alu', 'LTR/ERV1', 'LTR/ERVL-MaLR', 'LINE/L2', 'LTR/ERVL', 'SINE/MIR', 'DNA/hAT-Charlie', 'LTR/ERVK', 'DNA/TcMar-Tigger']

    pvals = []
    table_lines = []

    for sample_key in gene_diff:
        sample1, sample2 = sample_key
        sample_diffs = gene_diff[sample_key]

        # transcripts that are both annotated and measured for this pair
        stat_genes = list(gtf_genes & set(sample_diffs))

        for te_key in te_genes:
            repeat, family, orient = te_key
            te_members = te_genes[te_key]

            te_diffs = [sample_diffs[tid] for tid in stat_genes if tid in te_members]
            if not te_diffs:
                continue

            note_diffs = [sample_diffs[tid] for tid in stat_genes if tid not in te_members]

            te_mean = mean(te_diffs)
            note_mean = mean(note_diffs)

            if len(te_diffs) > 5:
                z, p = stats.mannwhitneyu(te_diffs, note_diffs)
            else:
                # too few values to test; report a neutral result
                z = 0
                p = 1

            pvals.append(p)

            cols = (repeat, family, orient, sample1, sample2, len(te_diffs), te_mean, len(note_diffs), note_mean, z, p)
            table_lines.append('%-17s %-17s %1s %-10s %-10s %6d %9.2f %6d %9.2f %8.2f %10.2e' % cols)

            # plot ...
            if repeat in ['*'] and family in plot_families:
                # '/' and '*' are replaced to keep the file name path-safe
                repeat_plot = repeat.replace('/', '-').replace('*', 'X')
                family_plot = family.replace('/', '-').replace('*', 'X')
                out_pdf = '%s/%s_%s_%s_%s-%s.pdf' % (plot_dir, repeat_plot, family_plot, orient, sample1, sample2)
                cdf_plot(te_key, te_diffs, note_diffs, out_pdf, scale)

    return table_lines, pvals
# Continuation of a printout of inferential tests (the section header for
# this first call lies above the visible fragment). Each section prints the
# test's name, then its result for the l/m inputs and for the a/b inputs.
# l, m, a and b are defined earlier in the script -- presumably list and
# array forms of the same samples; TODO confirm.
print stats.ttest_1samp(a,12)
print 'ttest_ind:'
print stats.ttest_ind(l,m)
print stats.ttest_ind(a,b)
print 'ttest_rel:'
print stats.ttest_rel(l,m)
print stats.ttest_rel(a,b)
print 'chisquare:'
# chisquare is called with a single sample, not a pair
print stats.chisquare(l)
print stats.chisquare(a)
print 'ks_2samp:'
print stats.ks_2samp(l,m)
print stats.ks_2samp(a,b)
print 'mannwhitneyu:'
print stats.mannwhitneyu(l,m)
print stats.mannwhitneyu(a,b)
print 'ranksums:'
print stats.ranksums(l,m)
print stats.ranksums(a,b)
print 'wilcoxont:'
print stats.wilcoxont(l,m)
print stats.wilcoxont(a,b)
print 'kruskalwallish:'
# three-sample calls: the first sample is passed again as the third one
print stats.kruskalwallish(l,m,l)
# sample sizes are printed between the two kruskalwallish results
print len(l), len(m)
print stats.kruskalwallish(a,b,a)
print 'friedmanchisquare:'
print stats.friedmanchisquare(l,m,l)
print stats.friedmanchisquare(a,b,a)
# Continuation of a printout of inferential tests (the section header for
# this first call lies above the visible fragment). Each section prints the
# test's name, then its result for the l/m inputs and for the a/b inputs.
# l, m, a and b are defined earlier in the script -- presumably list and
# array forms of the same samples; TODO confirm.
print stats.ttest_1samp(a, 12)
print 'ttest_ind:'
print stats.ttest_ind(l, m)
print stats.ttest_ind(a, b)
print 'ttest_rel:'
print stats.ttest_rel(l, m)
print stats.ttest_rel(a, b)
print 'chisquare:'
# chisquare is called with a single sample, not a pair
print stats.chisquare(l)
print stats.chisquare(a)
print 'ks_2samp:'
print stats.ks_2samp(l, m)
print stats.ks_2samp(a, b)
print 'mannwhitneyu:'
print stats.mannwhitneyu(l, m)
print stats.mannwhitneyu(a, b)
print 'ranksums:'
print stats.ranksums(l, m)
print stats.ranksums(a, b)
print 'wilcoxont:'
print stats.wilcoxont(l, m)
print stats.wilcoxont(a, b)
print 'kruskalwallish:'
# three-sample calls: the first sample is passed again as the third one
print stats.kruskalwallish(l, m, l)
# sample sizes are printed between the two kruskalwallish results
print len(l), len(m)
print stats.kruskalwallish(a, b, a)
print 'friedmanchisquare:'
print stats.friedmanchisquare(l, m, l)
print stats.friedmanchisquare(a, b, a)
# Printout of inferential tests on the samples l and m, which are defined
# earlier in the script. Each section prints the test's name followed by two
# results; the second call either repeats the first or compares l to itself.
print('\nINFERENTIAL')
print('ttest_1samp:')
print(stats.ttest_1samp(l,12))
print(stats.ttest_1samp(l,12))
print('ttest_ind:')
print(stats.ttest_ind(l,m))
print(stats.ttest_ind(l,l))
print('chisquare:')
# chisquare is called with a single sample, not a pair
print(stats.chisquare(l))
print(stats.chisquare(l))
print('ks_2samp:')
print(stats.ks_2samp(l,m))
print(stats.ks_2samp(l,l))
print('mannwhitneyu:')
print(stats.mannwhitneyu(l,m))
print(stats.mannwhitneyu(l,l))
print('ranksums:')
print(stats.ranksums(l,m))
print(stats.ranksums(l,l))
print('wilcoxont:')
# only the l/m comparison is run for this test
print(stats.wilcoxont(l,m))
print('kruskalwallish:')
# three-sample calls: l is passed again as the third sample
print(stats.kruskalwallish(l,m,l))
# sample sizes are printed between the two kruskalwallish results
print(len(l), len(m))
print(stats.kruskalwallish(l,l,l))
print('friedmanchisquare:')
print(stats.friedmanchisquare(l,m,l))
print(stats.friedmanchisquare(l,l,l))
# rebind l to the floats 1.0 .. 20.0 for the code that follows
l = [float(x) for x in range(1,21)]
# ---- Chi-square: results are printed next to the textbook values ----
print('\n\nChi-Square')
fo = [10.0, 40.0]
print('\nSHOULD BE 18.0, <<<0.01 (df=1) ... Basic Stats 1st ed. p.457')
print(stats.chisquare(fo))
print('\nSHOULD BE 5.556, 0.01<p<0.05 (df=1) ... Basic Stats 1st ed. p.460')
print(stats.chisquare(fo, [5, 45]))

# ---- Mann-Whitney U on two samples of five values each ----
print('\n\nMann Whitney U')
red = [540.0, 480.0, 600.0, 590.0, 605.0]
black = [760.0, 890.0, 1105.0, 595.0, 940.0]
print('\nSHOULD BE 2.0, 0.01<p<0.05 (N=5,5) ... Basic Stats 1st ed, p.473-4')
print(stats.mannwhitneyu(red, black))

# ---- Rank sums: reuses the red/black samples defined just above ----
print('\n\nRank Sums')
print('\nSHOULD BE -2.19, p<0.0286 (slightly) ... Basic Stats 1st ed, p.474-5')
print(stats.ranksums(red, black))

# ---- Wilcoxon T: red/black are rebound to ten paired values ----
print('\n\nWilcoxon T')
red = [540.0, 580.0, 600.0, 680.0, 430.0, 740.0, 600.0, 690.0, 605.0, 520.0]
black = [760.0, 710.0, 1105.0, 880.0, 500.0, 990.0, 1050.0, 640.0, 595.0, 520.0]
print('\nSHOULD BE +3.0, 0.01<p<0.05 (N=9) ... Basic Stats 1st ed, p.477-8')
print(stats.wilcoxont(red, black))
def main():
    """
    Compare RIP values between genes with and without CLIP peaks.

    Command line: [options] <peaks gff> <diff>. The steps are:
      1. optionally restrict the reference GTF via filter_single,
      2. collect the gene ids of GTF entries that intersectBed reports as
         overlapping the peaks,
      3. load per-gene RIP values and bound flags from the .diff file,
      4. plot the bound/unbound values through the ggplot script,
      5. write a Mann-Whitney U result to <output_pre>_stats.txt,
      6. write hypergeometric overlap p-values to <output_pre>_hyper.txt and,
         when both exclusive sets are non-empty, a Venn diagram PDF.

    Python 2 syntax (print >> file).
    """
    usage = 'usage: %prog [options] <peaks gff> <diff>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='clip_fpkm_file', help='Control FPKM tracking file')
    # the GENCODE and RDIR environment variables must be set: the defaults
    # below are evaluated even when the options are given explicitly
    parser.add_option('-g', dest='ref_gtf', default='%s/gencode.v18.annotation.gtf' % os.environ['GENCODE'])
    parser.add_option('--ggplot', dest='ggplot_script', default='%s/peaks_diff_compare.r' % os.environ['RDIR'], help='Script to make plots with [Default: %default]')
    parser.add_option('-m', dest='max_stat', default=10, type='float', help='Max cuffdiff stat [Default: %default]')
    parser.add_option('-o', dest='output_pre', default='', help='Output prefix [Default: %default]')
    parser.add_option('-r', dest='rbp', default='RBP', help='RBP name [Default: %default]')
    parser.add_option('-s', dest='single_gene_loci', default=False, action='store_true', help='Only use single gene loci [Default: %default]')
    parser.add_option(
        '-t',
        dest='test_stat',
        default=False,
        action='store_true',
        help='Use test statistic rather than fold change [Default: %default]')
    # NOTE(review): sample1/sample2 are parsed but not read anywhere in this function
    parser.add_option('--sample1', dest='sample1', help='Sample_1 name in cuffdiff')
    parser.add_option('--sample2', dest='sample2', help='Sample_2 name in cuffdiff')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide peaks GFF and .diff file')
    else:
        peaks_gff = args[0]
        diff_file = args[1]

    ##################################################
    # process GTF
    ##################################################
    if options.single_gene_loci:
        # filter_single returns an open descriptor and a path; both are
        # released in the "clean" section at the end
        single_gtf_fd, single_gtf_file = filter_single(options.ref_gtf)
        options.ref_gtf = single_gtf_file

    gtf_genes = gff.gtf_gene_set(options.ref_gtf)

    ##################################################
    # collect CLIP peak bound genes
    ##################################################
    peak_genes = set()
    # NOTE(review): the command is a shell string built from option/argument
    # values; paths containing shell metacharacters would break it
    p = subprocess.Popen('intersectBed -s -u -a %s -b %s' % (options.ref_gtf, peaks_gff), shell=True, stdout=subprocess.PIPE)
    for line in p.stdout:
        # column 9 of each reported GTF line holds the key/value attributes
        peak_genes.add(gff.gtf_kv(line.split('\t')[8])['gene_id'])
    p.communicate()

    # find expressed genes in peak calls
    silent_genes = set()
    if options.clip_fpkm_file:
        silent_genes = find_silent(options.clip_fpkm_file)

    ##################################################
    # collect RIP stats
    ##################################################
    if options.test_stat:
        rip_fold, rip_bound = ripseq.hash_rip(diff_file, just_ok=True, use_fold=False, max_stat=options.max_stat, one_rbp=True)
    else:
        rip_fold, rip_bound = ripseq.hash_rip(diff_file, use_fold=True, max_stat=options.max_stat, one_rbp=True)
        # the fold values from hash_rip are replaced here; only rip_bound
        # from the call above is kept
        rip_fold = ripseq.hash_rip_fold(diff_file, min_fpkm=0.125, max_fold=10, one_rbp=True)

    # TEMP: print bound genes
    # genes_out = open('%s_genes.txt' % options.output_pre, 'w')
    # for gene_id in rip_bound:
    #     if rip_bound[gene_id]:
    #         print >> genes_out, gene_id, rip_fold[gene_id]
    # genes_out.close()

    ##################################################
    # plot bound and unbound distributions
    ##################################################
    # construct data frame
    # three parallel lists: entry i of each list describes the same gene
    df_dict = {'Gene': [], 'CLIP': [], 'RIP': []}
    for gene_id in rip_fold:
        # keep genes in the GTF gene set that are not flagged silent
        if gene_id in gtf_genes and (len(silent_genes) == 0 or gene_id not in silent_genes):
            df_dict['Gene'].append(gene_id)
            df_dict['RIP'].append(rip_fold[gene_id])
            if gene_id in peak_genes:
                df_dict['CLIP'].append('Bound')
            else:
                df_dict['CLIP'].append('Unbound')

    ggplot.plot(options.ggplot_script, df_dict, [options.output_pre, options.rbp, options.test_stat])

    ##################################################
    # compute stats on bound and unbound distributions
    ##################################################
    bound_fold = [
        df_dict['RIP'][i] for i in range(len(df_dict['RIP']))
        if df_dict['CLIP'][i] == 'Bound'
    ]
    unbound_fold = [
        df_dict['RIP'][i] for i in range(len(df_dict['RIP']))
        if df_dict['CLIP'][i] == 'Unbound'
    ]

    # perform statistical test
    # note: this rebinds p, which held the intersectBed process handle above
    z, p = stats.mannwhitneyu(bound_fold, unbound_fold)

    stats_out = open('%s_stats.txt' % options.output_pre, 'w')
    cols = (options.rbp, len(bound_fold), stats.mean(bound_fold), len(unbound_fold), stats.mean(unbound_fold), z, p)
    print >> stats_out, '%-10s %5d %6.2f %5d %6.2f %6.2f %9.2e' % cols
    stats_out.close()

    ##################################################
    # plot venn diagram
    ##################################################
    # RIP-bound genes, restricted to the genes kept in the data frame
    rip_genes = set([
        df_dict['Gene'][i] for i in range(len(df_dict['Gene']))
        if rip_bound.get(df_dict['Gene'][i], False)
    ])

    clip_only = len(peak_genes - rip_genes)
    rip_only = len(rip_genes - peak_genes)
    both = len(peak_genes & rip_genes)

    if options.clip_fpkm_file:
        print >> sys.stderr, 'Ignoring silent genes for hypergeometric test'

    # k is x
    # K is n
    # N is M
    # n is N
    # hypergeom.sf(x, M, n, N, loc=0)
    # sf(both - 1, ...) is the probability of an overlap of at least `both`;
    # p1 and p2 swap which gene set plays which role
    p1 = hypergeom.sf(both - 1, len(gtf_genes), len(peak_genes), len(rip_genes))
    p2 = hypergeom.sf(both - 1, len(gtf_genes), len(rip_genes), len(peak_genes))

    hyper_out = open('%s_hyper.txt' % options.output_pre, 'w')
    cols = (p1, p2, both, clip_only, rip_only, len(peak_genes), len(rip_genes), len(gtf_genes))
    print >> hyper_out, '%7.2e %7.2e %5d %5d %5d %5d %5d %5d' % cols
    hyper_out.close()

    # the diagram is drawn only when both exclusive regions are non-empty
    if clip_only > 0 and rip_only > 0:
        plt.figure()
        # venn_diag = venn2(subsets=(clip_only, rip_only, both), set_labels=['CLIP', 'fRIP'], set_colors=['#e41a1c', '#377eb8'])
        # venn_diag = venn2(subsets=(clip_only, rip_only, both), set_labels=['CLIP', 'fRIP'], set_colors=['#e41a1c', '#1ae47d'])
        venn_diag = venn2(subsets=(clip_only, rip_only, both), set_labels=['CLIP', 'fRIP'], set_colors=['#e41a1c', '#A1A838'])
        plt.savefig('%s_venn.pdf' % options.output_pre)

    ##################################################
    # clean
    ##################################################
    if options.single_gene_loci:
        os.close(single_gtf_fd)
        os.remove(single_gtf_file)
def main():
    """
    Compare each sample's SNP SAD scores against shuffled SNPs.

    Command line: [options] <vcf_file> <sample_beds_file> <model_file>.
    Every line of the sample BEDs file holds a sample name and a BED file.
    For each sample the VCF is filtered to that BED, SAD scores are computed
    (or read from a pre-computed table), the SNPs are shuffled
    `num_shuffles` times and re-scored, and the following are written to the
    output directory: a per-SNP table, a CDF plot, a Q-Q plot, and one row
    each in binom.txt (matched binomial test) and mannwhitney.txt.

    The 0-based line index of a sample is passed to the scoring helpers as
    its sample index.
    """
    usage = 'usage: %prog [options] <vcf_file> <sample_beds_file> <model_file>'
    parser = OptionParser(usage)
    # the HG19 environment variable must be set: the default is evaluated
    # even when -f is given
    parser.add_option('-f', dest='genome_fasta', default='%s/assembly/hg19.fa' % os.environ['HG19'], help='Genome FASTA [Default: %default]')
    parser.add_option('-g', dest='gpu', default=False, action='store_true', help='Run on GPU [Default: %default]')
    parser.add_option('-l', dest='seq_len', type='int', default=600, help='Sequence length provided to the model [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='sad_shuffle', help='Output directory')
    parser.add_option('-r', dest='replot', default=False, action='store_true', help='Re-plot only, without re-computing [Default: %default]')
    parser.add_option('-s', dest='num_shuffles', default=1, type='int', help='Number of SNP shuffles [Default: %default]')
    parser.add_option('-t', dest='sad_table_file', help='Pre-computed SAD scores for the SNPs')
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error('Must provide VCF file, sample BEDs file, and model file')
    else:
        vcf_file = args[0]
        sample_beds_file = args[1]
        model_file = args[2]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    # open reference genome
    genome = pysam.Fastafile(options.genome_fasta)

    # plot defaults
    sns.set(font_scale=1.5, style='ticks')

    binom_file = '%s/binom.txt' % options.out_dir
    mw_file = '%s/mannwhitney.txt' % options.out_dir

    # one `with` closes both stats files and the sample table, including on
    # errors (the sample table handle used to be left open)
    with open(binom_file, 'w') as binom_out, \
            open(mw_file, 'w') as mw_out, \
            open(sample_beds_file) as sample_beds_in:
        # si is the sample index handed to the SAD helpers
        for si, line in enumerate(sample_beds_in):
            sample, bed_file = line.split()
            print(sample)

            #########################################
            # compute SAD
            #########################################
            # filter VCF to overlapping SNPs
            print(" intersecting SNPs")
            sample_vcf_file = '%s/%s.vcf' % (options.out_dir, sample)
            if not options.replot:
                filter_vcf(vcf_file, bed_file, sample_vcf_file)

            # compute SAD scores for this sample's SNPs
            print(" computing SAD")
            if options.sad_table_file:
                true_sad = retrieve_sad(sample_vcf_file, options.sad_table_file, si)
            else:
                true_sad = compute_sad(sample_vcf_file, model_file, si, '%s/%s_sad' % (options.out_dir, sample), options.seq_len, options.gpu, options.replot)

            #########################################
            # compute shuffled SAD
            #########################################
            # one column of shuffled scores per shuffle round
            shuffle_sad = np.zeros((true_sad.shape[0], options.num_shuffles))
            for ni in range(options.num_shuffles):
                # shuffle the SNPs within their overlapping DHS
                print(" shuffle %d" % ni)
                sample_vcf_shuf_file = '%s/%s_shuf%d.vcf' % (options.out_dir, sample, ni)
                shuffle_snps(sample_vcf_file, sample_vcf_shuf_file, genome)

                # compute SAD scores for shuffled SNPs
                print(" computing shuffle SAD")
                shuffle_sad[:, ni] = compute_sad(sample_vcf_shuf_file, model_file, si, '%s/%s_shuf%d_sad' % (options.out_dir, sample, ni), options.seq_len, options.gpu, options.replot)

            #########################################
            # simple stats
            #########################################
            # compute shuffle means
            shuffle_sad_mean = shuffle_sad.mean(axis=1)

            # print sample table
            with open('%s/%s_table.txt' % (options.out_dir, sample), 'w') as sample_sad_out:
                for vi in range(len(true_sad)):
                    print('%f\t%f' % (true_sad[vi], shuffle_sad_mean[vi]), file=sample_sad_out)

            # plot CDFs
            # NOTE(review): `normed` was replaced by `density` in newer
            # matplotlib releases; confirm against the pinned version
            sns_colors = sns.color_palette('deep')
            plt.figure()
            plt.hist(true_sad, 1000, normed=1, histtype='step', cumulative=True, color=sns_colors[0], linewidth=1, label='SNPs')
            plt.hist(shuffle_sad.flatten(), 1000, normed=1, histtype='step', cumulative=True, color=sns_colors[2], linewidth=1, label='Shuffle')
            ax = plt.gca()
            ax.grid(True, linestyle=':')
            ax.set_xlim(-.2, .2)
            plt.legend()
            plt.savefig('%s/%s_cdf.pdf' % (options.out_dir, sample))
            plt.close()

            # plot Q-Q
            # both quantile sets share the same probabilities (at most 10000)
            quantile_probs = np.linspace(0, 1, min(10000, true_sad.shape[0]))
            true_q = mquantiles(true_sad, quantile_probs)
            shuf_q = mquantiles(shuffle_sad_mean, quantile_probs)
            plt.figure()
            plt.scatter(true_q, shuf_q, color=sns_colors[0])
            pmin = 1.05 * min(true_q[0], shuf_q[0])
            pmax = 1.05 * max(true_q[-1], shuf_q[-1])
            # identity line for reference
            plt.plot([pmin, pmax], [pmin, pmax], color='black', linewidth=1)
            ax = plt.gca()
            ax.set_xlim(pmin, pmax)
            ax.set_ylim(pmin, pmax)
            ax.set_xlabel('True SAD')
            ax.set_ylabel('Shuffled SAD')
            ax.grid(True, linestyle=':')
            plt.savefig('%s/%s_qq.pdf' % (options.out_dir, sample))
            plt.close()

            #########################################
            # statistical tests
            #########################################
            # compute matched binomial test
            # count SNPs whose true score exceeds their mean shuffled score
            true_great = sum((true_sad - shuffle_sad_mean) > 0)
            true_lo = np.log2(true_great) - np.log2(len(true_sad) - true_great)
            # test the tail on the side the counts lean towards
            if true_lo > 0:
                binom_p = 1.0 - binom.cdf(true_great - 1, n=len(true_sad), p=0.5)
            else:
                binom_p = binom.cdf(true_great, n=len(true_sad), p=0.5)

            # print significance stats
            cols = (sample, len(true_sad), true_great, true_lo, binom_p)
            print('%-20s %5d %5d %6.2f %6.1e' % cols, file=binom_out)

            # compute Mann-Whitney
            mw_z, mw_p = stats.mannwhitneyu(true_sad, shuffle_sad.flatten())
            cols = (sample, len(true_sad), true_sad.mean(), shuffle_sad.mean(), mw_z, mw_p)
            print('%-20s %5d %6.3f %6.3f %6.2f %6.1e' % cols, file=mw_out)

    genome.close()
def main(): usage = 'usage: %prog [options] <peaks gff> <diff>' parser = OptionParser(usage) parser.add_option('-c', dest='control_fpkm_file', help='Control FPKM tracking file') parser.add_option('-g', dest='ref_gtf', default='%s/gencode.v18.annotation.gtf'%os.environ['GENCODE']) parser.add_option('-o', dest='output_pre', default='', help='Output prefix [Default: %default]') parser.add_option('--sample1', dest='sample1', help='Sample_1 name in cuffdiff') parser.add_option('--sample2', dest='sample2', help='Sample_2 name in cuffdiff') (options,args) = parser.parse_args() if len(args) != 2: parser.error('Must provide peaks GFF and .diff file') else: peaks_gff = args[0] diff_file = args[1] # find expressed genes in peak calls silent_genes = set() if options.control_fpkm_file: silent_genes = find_silent(options.control_fpkm_file) # find peak bound genes peak_genes = set() p = subprocess.Popen('intersectBed -s -u -a %s -b %s' % (options.ref_gtf,peaks_gff), shell=True, stdout=subprocess.PIPE) for line in p.stdout: peak_genes.add(gff.gtf_kv(line.split('\t')[8])['gene_id']) p.communicate() # process RIP bound_tstats = [] unbound_tstats = [] rip_genes = set() diff_in = open(diff_file) line = diff_in.readline() for line in diff_in: a = line.split('\t') gene_id = a[0] sample1 = a[4] sample2 = a[5] status = a[6] fpkm1 = float(a[7]) fpkm2 = float(a[8]) tstat = float(a[10]) sig = a[13].rstrip() if sample2 == 'input': tstat *= -1 if status == 'OK' and not math.isnan(tstat): if options.sample1 in [None,sample1] and options.sample2 in [None,sample2]: # save RIP bound if sig == 'yes': rip_genes.add(gene_id) # save test_stat if gene_id in peak_genes: bound_tstats.append(tstat) else: if not gene_id in silent_genes: unbound_tstats.append(tstat) print '%d silent genes' % len(silent_genes) print '%d bound genes' % len(bound_tstats) print '%d unbound genes' % len(unbound_tstats) # perform statistical test z, p = stats.mannwhitneyu(bound_tstats, unbound_tstats) print z, p 
################################################## # plot bound and unbound distributions ################################################## # construct data frame df_dict = {'Peak':(['Yes']*len(bound_tstats) + ['No']*len(unbound_tstats)), 'Test_stat':bound_tstats+unbound_tstats} r_script = '%s/peaks_diff_compare.r' % os.environ['RDIR'] ggplot.plot(r_script, df_dict, [options.output_pre]) ################################################## # plot venn diagram ################################################## clip_only = len(peak_genes - rip_genes) rip_only = len(rip_genes - peak_genes) both = len(peak_genes & rip_genes) plt.figure() venn_diag = venn2(subsets=(clip_only, rip_only, both), set_labels=['CLIP', 'RIP']) plt.savefig('%s_venn.pdf' % options.output_pre)
def compute_stats(te_genes, gene_diff, ref_gtf, plot_dir, scale):
    """
    Compare, per sample pair and TE key, the values of transcripts that
    carry the TE against those that do not.

    te_genes maps 3-tuples (unpacked as repeat, family, orient) to a
    container of transcript ids; gene_diff maps 2-tuple sample keys to a
    dict of transcript id -> value. Only transcripts whose id appears as
    'transcript_id' in the ref_gtf file are considered.

    A Mann-Whitney U test is run when more than 5 TE transcripts are
    present; otherwise the row is reported with z = 0 and p = 1.

    Returns (table_lines, pvals): one formatted row and one p-value per
    (sample pair, TE key) that has at least one TE transcript. For a fixed
    set of families under the '*' repeat, a CDF plot is also written into
    plot_dir (scale is forwarded to cdf_plot).
    """
    # focus on GTF genes
    gtf_genes = set()
    # the context manager closes the GTF even when parsing fails
    with open(ref_gtf) as gtf_in:
        for line in gtf_in:
            # header/comment lines and blank lines have no attribute column
            if line.startswith('#') or not line.strip():
                continue
            a = line.split('\t')
            tid = gff.gtf_kv(a[8])['transcript_id']
            gtf_genes.add(tid)

    plot_families = [
        '*', 'LINE/L1', 'SINE/Alu', 'LTR/ERV1', 'LTR/ERVL-MaLR', 'LINE/L2',
        'LTR/ERVL', 'SINE/MIR', 'DNA/hAT-Charlie', 'LTR/ERVK',
        'DNA/TcMar-Tigger'
    ]

    pvals = []
    table_lines = []

    for sample_key in gene_diff:
        sample1, sample2 = sample_key
        sample_diffs = gene_diff[sample_key]

        # transcripts that are both annotated and measured for this pair
        stat_genes = list(gtf_genes & set(sample_diffs))

        for te_key in te_genes:
            repeat, family, orient = te_key
            te_members = te_genes[te_key]

            te_diffs = [
                sample_diffs[tid] for tid in stat_genes if tid in te_members
            ]
            if not te_diffs:
                continue

            note_diffs = [
                sample_diffs[tid] for tid in stat_genes
                if tid not in te_members
            ]

            te_mean = mean(te_diffs)
            note_mean = mean(note_diffs)

            if len(te_diffs) > 5:
                z, p = stats.mannwhitneyu(te_diffs, note_diffs)
            else:
                # too few values to test; report a neutral result
                z = 0
                p = 1

            pvals.append(p)

            cols = (repeat, family, orient, sample1, sample2, len(te_diffs),
                    te_mean, len(note_diffs), note_mean, z, p)
            table_lines.append(
                '%-17s %-17s %1s %-10s %-10s %6d %9.2f %6d %9.2f %8.2f %10.2e'
                % cols)

            # plot ...
            if repeat in ['*'] and family in plot_families:
                # '/' and '*' are replaced to keep the file name path-safe
                repeat_plot = repeat.replace('/', '-').replace('*', 'X')
                family_plot = family.replace('/', '-').replace('*', 'X')
                out_pdf = '%s/%s_%s_%s_%s-%s.pdf' % (plot_dir, repeat_plot,
                                                     family_plot, orient,
                                                     sample1, sample2)
                cdf_plot(te_key, te_diffs, note_diffs, out_pdf, scale)

    return table_lines, pvals
def main():
    """
    Compare each sample's SNP SAD scores against shuffled SNPs.

    Command line: [options] <vcf_file> <sample_beds_file> <model_file>.
    Every line of the sample BEDs file holds a sample name and a BED file.
    For each sample the VCF is filtered to that BED, SAD scores are computed
    (or read from a pre-computed table), the SNPs are shuffled
    `num_shuffles` times and re-scored, and the following are written to the
    output directory: a per-SNP table, a CDF plot, a Q-Q plot, and one row
    each in binom.txt (matched binomial test) and mannwhitney.txt.

    The 0-based line index of a sample is passed to the scoring helpers as
    its sample index.
    """
    usage = 'usage: %prog [options] <vcf_file> <sample_beds_file> <model_file>'
    parser = OptionParser(usage)
    # the HG19 environment variable must be set: the default is evaluated
    # even when -f is given
    parser.add_option('-f', dest='genome_fasta', default='%s/assembly/hg19.fa'%os.environ['HG19'], help='Genome FASTA [Default: %default]')
    parser.add_option('-g', dest='gpu', default=False, action='store_true', help='Run on GPU [Default: %default]')
    parser.add_option('-l', dest='seq_len', type='int', default=600, help='Sequence length provided to the model [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='sad_shuffle', help='Output directory')
    parser.add_option('-r', dest='replot', default=False, action='store_true', help='Re-plot only, without re-computing [Default: %default]')
    parser.add_option('-s', dest='num_shuffles', default=1, type='int', help='Number of SNP shuffles [Default: %default]')
    parser.add_option('-t', dest='sad_table_file', help='Pre-computed SAD scores for the SNPs')
    (options,args) = parser.parse_args()

    if len(args) != 3:
        parser.error('Must provide VCF file, sample BEDs file, and model file')
    else:
        vcf_file = args[0]
        sample_beds_file = args[1]
        model_file = args[2]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    # open reference genome
    genome = pysam.Fastafile(options.genome_fasta)

    # plot defaults
    sns.set(font_scale=1.5, style='ticks')

    binom_file = '%s/binom.txt' % options.out_dir
    mw_file = '%s/mannwhitney.txt' % options.out_dir

    # one `with` closes both stats files and the sample table, including on
    # errors (the sample table handle used to be left open)
    with open(binom_file, 'w') as binom_out, \
            open(mw_file, 'w') as mw_out, \
            open(sample_beds_file) as sample_beds_in:
        # si is the sample index handed to the SAD helpers
        for si, line in enumerate(sample_beds_in):
            sample, bed_file = line.split()
            print(sample)

            #########################################
            # compute SAD
            #########################################
            # filter VCF to overlapping SNPs
            print(" intersecting SNPs")
            sample_vcf_file = '%s/%s.vcf' % (options.out_dir,sample)
            if not options.replot:
                filter_vcf(vcf_file, bed_file, sample_vcf_file)

            # compute SAD scores for this sample's SNPs
            print(" computing SAD")
            if options.sad_table_file:
                true_sad = retrieve_sad(sample_vcf_file, options.sad_table_file, si)
            else:
                true_sad = compute_sad(sample_vcf_file, model_file, si, '%s/%s_sad'%(options.out_dir,sample), options.seq_len, options.gpu, options.replot)

            #########################################
            # compute shuffled SAD
            #########################################
            # one column of shuffled scores per shuffle round
            shuffle_sad = np.zeros((true_sad.shape[0],options.num_shuffles))
            for ni in range(options.num_shuffles):
                # shuffle the SNPs within their overlapping DHS
                print(" shuffle %d" % ni)
                sample_vcf_shuf_file = '%s/%s_shuf%d.vcf' % (options.out_dir, sample, ni)
                shuffle_snps(sample_vcf_file, sample_vcf_shuf_file, genome)

                # compute SAD scores for shuffled SNPs
                print(" computing shuffle SAD")
                shuffle_sad[:,ni] = compute_sad(sample_vcf_shuf_file, model_file, si, '%s/%s_shuf%d_sad'%(options.out_dir,sample,ni), options.seq_len, options.gpu, options.replot)

            #########################################
            # simple stats
            #########################################
            # compute shuffle means
            shuffle_sad_mean = shuffle_sad.mean(axis=1)

            # print sample table
            with open('%s/%s_table.txt' % (options.out_dir,sample), 'w') as sample_sad_out:
                for vi in range(len(true_sad)):
                    print('%f\t%f' % (true_sad[vi], shuffle_sad_mean[vi]), file=sample_sad_out)

            # plot CDFs
            # NOTE(review): `normed` was replaced by `density` in newer
            # matplotlib releases; confirm against the pinned version
            sns_colors = sns.color_palette('deep')
            plt.figure()
            plt.hist(true_sad, 1000, normed=1, histtype='step', cumulative=True, color=sns_colors[0], linewidth=1, label='SNPs')
            plt.hist(shuffle_sad.flatten(), 1000, normed=1, histtype='step', cumulative=True, color=sns_colors[2], linewidth=1, label='Shuffle')
            ax = plt.gca()
            ax.grid(True, linestyle=':')
            ax.set_xlim(-.2, .2)
            plt.legend()
            plt.savefig('%s/%s_cdf.pdf' % (options.out_dir,sample))
            plt.close()

            # plot Q-Q
            # both quantile sets share the same probabilities (at most 10000)
            quantile_probs = np.linspace(0,1,min(10000,true_sad.shape[0]))
            true_q = mquantiles(true_sad, quantile_probs)
            shuf_q = mquantiles(shuffle_sad_mean, quantile_probs)
            plt.figure()
            plt.scatter(true_q, shuf_q, color=sns_colors[0])
            pmin = 1.05*min(true_q[0], shuf_q[0])
            pmax = 1.05*max(true_q[-1], shuf_q[-1])
            # identity line for reference
            plt.plot([pmin,pmax], [pmin,pmax], color='black', linewidth=1)
            ax = plt.gca()
            ax.set_xlim(pmin,pmax)
            ax.set_ylim(pmin,pmax)
            ax.set_xlabel('True SAD')
            ax.set_ylabel('Shuffled SAD')
            ax.grid(True, linestyle=':')
            plt.savefig('%s/%s_qq.pdf' % (options.out_dir,sample))
            plt.close()

            #########################################
            # statistical tests
            #########################################
            # compute matched binomial test
            # count SNPs whose true score exceeds their mean shuffled score
            true_great = sum((true_sad-shuffle_sad_mean) > 0)
            true_lo = np.log2(true_great) - np.log2(len(true_sad)-true_great)
            # test the tail on the side the counts lean towards
            if true_lo > 0:
                binom_p = 1.0 - binom.cdf(true_great-1, n=len(true_sad), p=0.5)
            else:
                binom_p = binom.cdf(true_great, n=len(true_sad), p=0.5)

            # print significance stats
            cols = (sample, len(true_sad), true_great, true_lo, binom_p)
            print('%-20s %5d %5d %6.2f %6.1e' % cols, file=binom_out)

            # compute Mann-Whitney
            mw_z, mw_p = stats.mannwhitneyu(true_sad, shuffle_sad.flatten())
            cols = (sample, len(true_sad), true_sad.mean(), shuffle_sad.mean(), mw_z, mw_p)
            print('%-20s %5d %6.3f %6.3f %6.2f %6.1e' % cols, file=mw_out)

    genome.close()
def main():
    """
    Compare RIP values between genes with and without CLIP peaks.

    Command line: [options] <peaks gff> <diff>. The steps are:
      1. optionally restrict the reference GTF via filter_single,
      2. collect the gene ids of GTF entries that intersectBed reports as
         overlapping the peaks,
      3. load per-gene RIP values and bound flags from the .diff file,
      4. plot the bound/unbound values through the ggplot script,
      5. write a Mann-Whitney U result to <output_pre>_stats.txt,
      6. write hypergeometric overlap p-values to <output_pre>_hyper.txt and,
         when both exclusive sets are non-empty, a Venn diagram PDF.

    Python 2 syntax (print >> file).
    """
    usage = 'usage: %prog [options] <peaks gff> <diff>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='clip_fpkm_file', help='Control FPKM tracking file')
    # the GENCODE and RDIR environment variables must be set: the defaults
    # below are evaluated even when the options are given explicitly
    parser.add_option('-g', dest='ref_gtf', default='%s/gencode.v18.annotation.gtf'%os.environ['GENCODE'])
    parser.add_option('--ggplot', dest='ggplot_script', default='%s/peaks_diff_compare.r'%os.environ['RDIR'], help='Script to make plots with [Default: %default]')
    parser.add_option('-m', dest='max_stat', default=10, type='float', help='Max cuffdiff stat [Default: %default]')
    parser.add_option('-o', dest='output_pre', default='', help='Output prefix [Default: %default]')
    parser.add_option('-r', dest='rbp', default='RBP', help='RBP name [Default: %default]')
    parser.add_option('-s', dest='single_gene_loci', default=False, action='store_true', help='Only use single gene loci [Default: %default]')
    parser.add_option('-t', dest='test_stat', default=False, action='store_true', help='Use test statistic rather than fold change [Default: %default]')
    # NOTE(review): sample1/sample2 are parsed but not read anywhere in this function
    parser.add_option('--sample1', dest='sample1', help='Sample_1 name in cuffdiff')
    parser.add_option('--sample2', dest='sample2', help='Sample_2 name in cuffdiff')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide peaks GFF and .diff file')
    else:
        peaks_gff = args[0]
        diff_file = args[1]

    ##################################################
    # process GTF
    ##################################################
    if options.single_gene_loci:
        # filter_single returns an open descriptor and a path; both are
        # released in the "clean" section at the end
        single_gtf_fd, single_gtf_file = filter_single(options.ref_gtf)
        options.ref_gtf = single_gtf_file

    gtf_genes = gff.gtf_gene_set(options.ref_gtf)

    ##################################################
    # collect CLIP peak bound genes
    ##################################################
    peak_genes = set()
    # NOTE(review): the command is a shell string built from option/argument
    # values; paths containing shell metacharacters would break it
    p = subprocess.Popen('intersectBed -s -u -a %s -b %s' % (options.ref_gtf, peaks_gff), shell=True, stdout=subprocess.PIPE)
    for line in p.stdout:
        # column 9 of each reported GTF line holds the key/value attributes
        peak_genes.add(gff.gtf_kv(line.split('\t')[8])['gene_id'])
    p.communicate()

    # find expressed genes in peak calls
    silent_genes = set()
    if options.clip_fpkm_file:
        silent_genes = find_silent(options.clip_fpkm_file)

    ##################################################
    # collect RIP stats
    ##################################################
    if options.test_stat:
        rip_fold, rip_bound = ripseq.hash_rip(diff_file, just_ok = True, use_fold=False, max_stat=options.max_stat, one_rbp=True)
    else:
        rip_fold, rip_bound = ripseq.hash_rip(diff_file, use_fold=True, max_stat=options.max_stat, one_rbp=True)
        # the fold values from hash_rip are replaced here; only rip_bound
        # from the call above is kept
        rip_fold = ripseq.hash_rip_fold(diff_file, min_fpkm=0.125, max_fold=10, one_rbp=True)

    ##################################################
    # plot bound and unbound distributions
    ##################################################
    # construct data frame
    # three parallel lists: entry i of each list describes the same gene
    df_dict = {'Gene':[], 'CLIP':[], 'RIP':[]}
    for gene_id in rip_fold:
        # keep genes in the GTF gene set that are not flagged silent
        if gene_id in gtf_genes and (len(silent_genes) == 0 or gene_id not in silent_genes):
            df_dict['Gene'].append(gene_id)
            df_dict['RIP'].append(rip_fold[gene_id])
            if gene_id in peak_genes:
                df_dict['CLIP'].append('Bound')
            else:
                df_dict['CLIP'].append('Unbound')

    ggplot.plot(options.ggplot_script, df_dict, [options.output_pre, options.rbp, options.test_stat])

    ##################################################
    # compute stats on bound and unbound distributions
    ##################################################
    bound_fold = [df_dict['RIP'][i] for i in range(len(df_dict['RIP'])) if df_dict['CLIP'][i] == 'Bound']
    unbound_fold = [df_dict['RIP'][i] for i in range(len(df_dict['RIP'])) if df_dict['CLIP'][i] == 'Unbound']

    # perform statistical test
    # note: this rebinds p, which held the intersectBed process handle above
    z, p = stats.mannwhitneyu(bound_fold, unbound_fold)

    stats_out = open('%s_stats.txt' % options.output_pre, 'w')
    cols = (options.rbp, len(bound_fold), stats.mean(bound_fold), len(unbound_fold), stats.mean(unbound_fold), z, p)
    print >> stats_out, '%-10s %5d %6.2f %5d %6.2f %6.2f %9.2e' % cols
    stats_out.close()

    ##################################################
    # plot venn diagram
    ##################################################
    # RIP-bound genes, restricted to the genes kept in the data frame
    rip_genes = set([df_dict['Gene'][i] for i in range(len(df_dict['Gene'])) if rip_bound.get(df_dict['Gene'][i],False)])

    clip_only = len(peak_genes - rip_genes)
    rip_only = len(rip_genes - peak_genes)
    both = len(peak_genes & rip_genes)

    if options.clip_fpkm_file:
        print >> sys.stderr, 'Ignoring silent genes for hypergeometric test'

    # k is x
    # K is n
    # N is M
    # n is N
    # hypergeom.sf(x, M, n, N, loc=0)
    # sf(both-1, ...) is the probability of an overlap of at least `both`;
    # p1 and p2 swap which gene set plays which role
    p1 = hypergeom.sf(both-1, len(gtf_genes), len(peak_genes), len(rip_genes))
    p2 = hypergeom.sf(both-1, len(gtf_genes), len(rip_genes), len(peak_genes))

    hyper_out = open('%s_hyper.txt' % options.output_pre, 'w')
    cols = (p1, p2, both, clip_only, rip_only, len(peak_genes), len(rip_genes), len(gtf_genes))
    print >> hyper_out, '%7.2e %7.2e %5d %5d %5d %5d %5d %5d' % cols
    hyper_out.close()

    # the diagram is drawn only when both exclusive regions are non-empty
    if clip_only > 0 and rip_only > 0:
        plt.figure()
        # venn_diag = venn2(subsets=(clip_only, rip_only, both), set_labels=['CLIP', 'fRIP'], set_colors=['#e41a1c', '#377eb8'])
        # venn_diag = venn2(subsets=(clip_only, rip_only, both), set_labels=['CLIP', 'fRIP'], set_colors=['#e41a1c', '#1ae47d'])
        venn_diag = venn2(subsets=(clip_only, rip_only, both), set_labels=['CLIP', 'fRIP'], set_colors=['#e41a1c', '#A1A838'])
        plt.savefig('%s_venn.pdf' % options.output_pre)

    ##################################################
    # clean
    ##################################################
    if options.single_gene_loci:
        os.close(single_gtf_fd)
        os.remove(single_gtf_file)
def main():
    """Compare SAD scores of a SNP set against position-shuffled SNPs.

    Command line: ``[options] <vcf_file> <excl_bed_file> <model_file>``.

    Steps:
      1. Merge the first three columns of the excluded BED file and of the
         ``-e`` BED file into ``<out_dir>/excl.bed``.
      2. Filter the VCF with ``exclude_vcf`` and score it with
         ``compute_sad``.
      3. Shuffle the filtered SNPs ``-s`` times with ``shuffle_snps`` and
         score every shuffle with ``compute_sad``.
      4. For every target, save ``<name>_cdf.pdf`` and ``<name>_qq.pdf`` and
         append one line of Mann-Whitney statistics to
         ``<out_dir>/mannwhitney.txt``.

    With ``-r`` (re-plot) neither the VCF filtering nor the SNP shuffling is
    re-run; ``compute_sad`` receives the flag and decides what to reuse.

    The ``HG19`` environment variable must be set: it is read while the
    option defaults are built, so a missing variable raises ``KeyError``.
    """
    usage = 'usage: %prog [options] <vcf_file> <excl_bed_file> <model_file>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='cuda', default=False, action='store_true', help='Run on GPU [Default: %default]')
    parser.add_option('-e', dest='add_excl_bed', default='%s/assembly/hg19_gaps.bed'%os.environ['HG19'], help='Additional genomic regions to exclude from the shuffle [Default: %default]')
    parser.add_option('-f', dest='genome_fasta', default='%s/assembly/hg19.fa'%os.environ['HG19'], help='Genome FASTA [Default: %default]')
    parser.add_option('-g', dest='genome_file', default='%s/assembly/human.hg19.core.genome'%os.environ['HG19'], help='Genome file for shuffling [Default: %default]')
    parser.add_option('-l', dest='seq_len', type='int', default=600, help='Sequence length provided to the model [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='sad_shuffle', help='Output directory')
    parser.add_option('-r', dest='replot', default=False, action='store_true', help='Re-plot only, without re-computing [Default: %default]')
    parser.add_option('-s', dest='num_shuffles', default=1, type='int', help='Number of SNP shuffles [Default: %default]')
    parser.add_option('-t', dest='targets_file', default=None, help='Target index, sample name table for targets to plot [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 3:
        # parser.error() prints the usage and exits
        parser.error('Must provide VCF file, excluded BED file, and model file')
    vcf_file, excl_bed_file, model_file = args

    # makedirs (not mkdir) so a nested output path also works
    if not os.path.isdir(options.out_dir):
        os.makedirs(options.out_dir)

    #########################################
    # supplement the excluded sites
    #########################################
    if options.add_excl_bed is not None:
        supp_excl_bed_file = '%s/excl.bed' % options.out_dir
        with open(supp_excl_bed_file, 'w') as supp_excl_bed_out:
            # original exclusion BED first, then the additional sites
            for bed_file in (excl_bed_file, options.add_excl_bed):
                with open(bed_file) as bed_in:
                    for line in bed_in:
                        a = line.split()
                        if a:
                            # keep only the first three columns
                            print('\t'.join(a[:3]), file=supp_excl_bed_out)

        excl_bed_file = supp_excl_bed_file

    #########################################
    # compute SAD
    #########################################
    # filter VCF to excluded SNPs
    excl_vcf_file = '%s/excl.vcf' % options.out_dir
    if not options.replot:
        exclude_vcf(vcf_file, excl_bed_file, excl_vcf_file)

    # compute SADs; indexed below as [snp, target]
    true_sad = compute_sad(excl_vcf_file, model_file, '%s/excl_sad'%options.out_dir, options.seq_len, options.cuda, options.replot)

    #########################################
    # compute shuffled SAD
    #########################################
    shuffle_sad = np.zeros((true_sad.shape[0],true_sad.shape[1],options.num_shuffles))

    # open reference genome; closed again even if a shuffle fails
    genome_open = pysam.Fastafile(options.genome_fasta)
    try:
        for ni in range(options.num_shuffles):
            shuf_vcf_file = '%s/shuf%d.vcf' % (options.out_dir, ni)

            # shuffle the SNPs; skipped on re-plot (like exclude_vcf above)
            # so existing shuffled VCFs are not overwritten with new
            # random positions
            if not options.replot:
                shuffle_snps(excl_vcf_file, shuf_vcf_file, excl_bed_file, options.genome_file, genome_open)

            # compute SAD scores for shuffled SNPs
            shuffle_sad[:,:,ni] = compute_sad(shuf_vcf_file, model_file, '%s/shuf%d_sad'%(options.out_dir,ni), options.seq_len, options.cuda, options.replot)
    finally:
        genome_open.close()

    # compute shuffle means
    shuffle_sad_mean = shuffle_sad.mean(axis=2)

    #########################################
    # stats and plots
    #########################################
    targets = {}
    if options.targets_file:
        with open(options.targets_file) as targets_in:
            for line in targets_in:
                a = line.split()
                if a:
                    # column 1: target index, column 2: sample name
                    targets[int(a[0])] = a[1]
    else:
        for ti in range(true_sad.shape[1]):
            targets[ti] = 't%d' % ti

    # plot defaults
    sns.set(font_scale=1.5, style='ticks')
    sns_colors = sns.color_palette('deep')

    # quantile probabilities shared by every Q-Q plot
    q_probs = np.linspace(0, 1, min(10000, true_sad.shape[0]))

    with open('%s/mannwhitney.txt' % options.out_dir, 'w') as mw_out:
        for ti in targets:
            true_ti = true_sad[:,ti]
            shuf_ti = shuffle_sad[:,ti,:].flatten()

            # plot CDFs ('density' replaces the 'normed' keyword that
            # Matplotlib 3.1 removed)
            plt.figure()
            plt.hist(true_ti, 1000, density=True, histtype='step', cumulative=True, color=sns_colors[0], linewidth=1, label='SNPs')
            plt.hist(shuf_ti, 1000, density=True, histtype='step', cumulative=True, color=sns_colors[2], linewidth=1, label='Shuffle')
            ax = plt.gca()
            ax.grid(True, linestyle=':')
            ax.set_xlim(-.15, .15)
            plt.legend()
            plt.savefig('%s/%s_cdf.pdf' % (options.out_dir,targets[ti]))
            plt.close()

            # plot Q-Q
            true_q = mquantiles(true_ti, q_probs)
            shuf_q = mquantiles(shuffle_sad_mean[:,ti], q_probs)
            plt.figure()
            plt.scatter(true_q, shuf_q, color=sns_colors[0])

            # pad both ends by 5% away from the data; scaling by 1.05
            # alone would clip points when the minimum is positive or the
            # maximum is negative
            q_lo = min(true_q[0], shuf_q[0])
            q_hi = max(true_q[-1], shuf_q[-1])
            pmin = q_lo - 0.05*abs(q_lo)
            pmax = q_hi + 0.05*abs(q_hi)

            plt.plot([pmin,pmax], [pmin,pmax], color='black', linewidth=1)
            ax = plt.gca()
            ax.set_xlim(pmin,pmax)
            ax.set_ylim(pmin,pmax)
            ax.set_xlabel('True SAD')
            ax.set_ylabel('Shuffled SAD')
            ax.grid(True, linestyle=':')
            plt.savefig('%s/%s_qq.pdf' % (options.out_dir,targets[ti]))
            plt.close()

            # compute Mann-Whitney
            mw_z, mw_p = stats.mannwhitneyu(true_ti, shuf_ti)
            cols = (ti, targets[ti], true_sad.shape[0], true_ti.mean(), shuf_ti.mean(), mw_z, mw_p)
            print('%3d  %20s  %5d  %7.4f  %7.4f  %6.2f  %6.1e' % cols, file=mw_out)
# Continuation of a script that prints the result of each `stats` test.
# Every test is called once with l/m and once with a/b; these names are
# defined above this excerpt -- presumably list and array versions of the
# same data (TODO confirm).
print(stats.ttest_1samp(l,12))
print(stats.ttest_1samp(a,12))
print('ttest_ind:')
print(stats.ttest_ind(l,m))
print(stats.ttest_ind(a,b))
print('ttest_rel:')
print(stats.ttest_rel(l,m))
print(stats.ttest_rel(a,b))
print('chisquare:')
print(stats.chisquare(l))
print(stats.chisquare(a))
print('ks_2samp:')
print(stats.ks_2samp(l,m))
print(stats.ks_2samp(a,b))
print('mannwhitneyu:')
print(stats.mannwhitneyu(l,m))
print(stats.mannwhitneyu(a,b))
print('ranksums:')
print(stats.ranksums(l,m))
print(stats.ranksums(a,b))
print('wilcoxont:')
print(stats.wilcoxont(l,m))
print(stats.wilcoxont(a,b))
print('kruskalwallish:')
print(stats.kruskalwallish(l,m,l))
# sample sizes of the two inputs, printed between the two calls
print(len(l), len(m))
print(stats.kruskalwallish(a,b,a))
print('friedmanchisquare:')
print(stats.friedmanchisquare(l,m,l))
print(stats.friedmanchisquare(a,b,a))
# rebind l to the integers 1..20; presumably input for the tests that
# follow this excerpt
l = range(1,21)
def main():
    """Compare SAD scores of a SNP set against position-shuffled SNPs.

    Command line: ``[options] <vcf_file> <excl_bed_file> <model_file>``.

    Steps:
      1. Merge the first three columns of the excluded BED file and of the
         ``-e`` BED file into ``<out_dir>/excl.bed``.
      2. Filter the VCF with ``exclude_vcf`` and score it with
         ``compute_sad``.
      3. Shuffle the filtered SNPs ``-s`` times with ``shuffle_snps`` and
         score every shuffle with ``compute_sad``.
      4. For every target, save ``<name>_cdf.pdf`` and ``<name>_qq.pdf`` and
         append one line of Mann-Whitney statistics to
         ``<out_dir>/mannwhitney.txt``.

    With ``-r`` (re-plot) neither the VCF filtering nor the SNP shuffling is
    re-run; ``compute_sad`` receives the flag and decides what to reuse.

    The ``HG19`` environment variable must be set: it is read while the
    option defaults are built, so a missing variable raises ``KeyError``.
    """
    usage = 'usage: %prog [options] <vcf_file> <excl_bed_file> <model_file>'
    parser = OptionParser(usage)
    parser.add_option('-c',
                      dest='cuda',
                      default=False,
                      action='store_true',
                      help='Run on GPU [Default: %default]')
    parser.add_option(
        '-e',
        dest='add_excl_bed',
        default='%s/assembly/hg19_gaps.bed' % os.environ['HG19'],
        help=
        'Additional genomic regions to exclude from the shuffle [Default: %default]'
    )
    parser.add_option('-f',
                      dest='genome_fasta',
                      default='%s/assembly/hg19.fa' % os.environ['HG19'],
                      help='Genome FASTA [Default: %default]')
    parser.add_option('-g',
                      dest='genome_file',
                      default='%s/assembly/human.hg19.core.genome' %
                      os.environ['HG19'],
                      help='Genome file for shuffling [Default: %default]')
    parser.add_option(
        '-l',
        dest='seq_len',
        type='int',
        default=600,
        help='Sequence length provided to the model [Default: %default]')
    parser.add_option('-o',
                      dest='out_dir',
                      default='sad_shuffle',
                      help='Output directory')
    parser.add_option(
        '-r',
        dest='replot',
        default=False,
        action='store_true',
        help='Re-plot only, without re-computing [Default: %default]')
    parser.add_option('-s',
                      dest='num_shuffles',
                      default=1,
                      type='int',
                      help='Number of SNP shuffles [Default: %default]')
    parser.add_option(
        '-t',
        dest='targets_file',
        default=None,
        help=
        'Target index, sample name table for targets to plot [Default: %default]'
    )
    (options, args) = parser.parse_args()

    if len(args) != 3:
        # parser.error() prints the usage and exits
        parser.error(
            'Must provide VCF file, excluded BED file, and model file')
    vcf_file, excl_bed_file, model_file = args

    # makedirs (not mkdir) so a nested output path also works
    if not os.path.isdir(options.out_dir):
        os.makedirs(options.out_dir)

    #########################################
    # supplement the excluded sites
    #########################################
    if options.add_excl_bed is not None:
        supp_excl_bed_file = '%s/excl.bed' % options.out_dir
        with open(supp_excl_bed_file, 'w') as supp_excl_bed_out:
            # original exclusion BED first, then the additional sites
            for bed_file in (excl_bed_file, options.add_excl_bed):
                with open(bed_file) as bed_in:
                    for line in bed_in:
                        a = line.split()
                        if a:
                            # keep only the first three columns
                            print('\t'.join(a[:3]), file=supp_excl_bed_out)

        excl_bed_file = supp_excl_bed_file

    #########################################
    # compute SAD
    #########################################
    # filter VCF to excluded SNPs
    excl_vcf_file = '%s/excl.vcf' % options.out_dir
    if not options.replot:
        exclude_vcf(vcf_file, excl_bed_file, excl_vcf_file)

    # compute SADs; indexed below as [snp, target]
    true_sad = compute_sad(excl_vcf_file, model_file,
                           '%s/excl_sad' % options.out_dir, options.seq_len,
                           options.cuda, options.replot)

    #########################################
    # compute shuffled SAD
    #########################################
    shuffle_sad = np.zeros(
        (true_sad.shape[0], true_sad.shape[1], options.num_shuffles))

    # open reference genome; closed again even if a shuffle fails
    genome_open = pysam.Fastafile(options.genome_fasta)
    try:
        for ni in range(options.num_shuffles):
            shuf_vcf_file = '%s/shuf%d.vcf' % (options.out_dir, ni)

            # shuffle the SNPs; skipped on re-plot (like exclude_vcf above)
            # so existing shuffled VCFs are not overwritten with new
            # random positions
            if not options.replot:
                shuffle_snps(excl_vcf_file, shuf_vcf_file, excl_bed_file,
                             options.genome_file, genome_open)

            # compute SAD scores for shuffled SNPs
            shuffle_sad[:, :, ni] = compute_sad(
                shuf_vcf_file, model_file,
                '%s/shuf%d_sad' % (options.out_dir, ni), options.seq_len,
                options.cuda, options.replot)
    finally:
        genome_open.close()

    # compute shuffle means
    shuffle_sad_mean = shuffle_sad.mean(axis=2)

    #########################################
    # stats and plots
    #########################################
    targets = {}
    if options.targets_file:
        with open(options.targets_file) as targets_in:
            for line in targets_in:
                a = line.split()
                if a:
                    # column 1: target index, column 2: sample name
                    targets[int(a[0])] = a[1]
    else:
        for ti in range(true_sad.shape[1]):
            targets[ti] = 't%d' % ti

    # plot defaults
    sns.set(font_scale=1.5, style='ticks')
    sns_colors = sns.color_palette('deep')

    # quantile probabilities shared by every Q-Q plot
    q_probs = np.linspace(0, 1, min(10000, true_sad.shape[0]))

    with open('%s/mannwhitney.txt' % options.out_dir, 'w') as mw_out:
        for ti in targets:
            true_ti = true_sad[:, ti]
            shuf_ti = shuffle_sad[:, ti, :].flatten()

            # plot CDFs ('density' replaces the 'normed' keyword that
            # Matplotlib 3.1 removed)
            plt.figure()
            plt.hist(true_ti,
                     1000,
                     density=True,
                     histtype='step',
                     cumulative=True,
                     color=sns_colors[0],
                     linewidth=1,
                     label='SNPs')
            plt.hist(shuf_ti,
                     1000,
                     density=True,
                     histtype='step',
                     cumulative=True,
                     color=sns_colors[2],
                     linewidth=1,
                     label='Shuffle')
            ax = plt.gca()
            ax.grid(True, linestyle=':')
            ax.set_xlim(-.15, .15)
            plt.legend()
            plt.savefig('%s/%s_cdf.pdf' % (options.out_dir, targets[ti]))
            plt.close()

            # plot Q-Q
            true_q = mquantiles(true_ti, q_probs)
            shuf_q = mquantiles(shuffle_sad_mean[:, ti], q_probs)
            plt.figure()
            plt.scatter(true_q, shuf_q, color=sns_colors[0])

            # pad both ends by 5% away from the data; scaling by 1.05
            # alone would clip points when the minimum is positive or the
            # maximum is negative
            q_lo = min(true_q[0], shuf_q[0])
            q_hi = max(true_q[-1], shuf_q[-1])
            pmin = q_lo - 0.05 * abs(q_lo)
            pmax = q_hi + 0.05 * abs(q_hi)

            plt.plot([pmin, pmax], [pmin, pmax], color='black', linewidth=1)
            ax = plt.gca()
            ax.set_xlim(pmin, pmax)
            ax.set_ylim(pmin, pmax)
            ax.set_xlabel('True SAD')
            ax.set_ylabel('Shuffled SAD')
            ax.grid(True, linestyle=':')
            plt.savefig('%s/%s_qq.pdf' % (options.out_dir, targets[ti]))
            plt.close()

            # compute Mann-Whitney
            mw_z, mw_p = stats.mannwhitneyu(true_ti, shuf_ti)
            cols = (ti, targets[ti], true_sad.shape[0], true_ti.mean(),
                    shuf_ti.mean(), mw_z, mw_p)
            print('%3d  %20s  %5d  %7.4f  %7.4f  %6.2f  %6.1e' % cols,
                  file=mw_out)