Exemplo n.º 1
0
    def utest(self, score):
        """
        Mann-Whitney U test comparing the scores of the positive items
        with the scores of all other items, i.e. an estimate of how likely
        the observed separation is to arise by chance.  Reference:

        Mason & Graham (2002) Areas beneath the relative operating
        characteristics (ROC) and relative operating levels (ROL)
        curves: Statistical significance and interpretation

        @param score: the score predicted for each item
        @type  score: [ float ]

        @return: P-value, i.e. the second element of the result of
                 stats.mannwhitneyu (described as 1-tailed by the
                 original author)
        @rtype: float
        """
        # scores of the items flagged in self.positives, order reversed
        pos_scores = N.compress(self.positives, score)[::-1]

        # scores of the remaining items, order reversed
        neg_mask = N.logical_not(self.positives)
        neg_scores = N.compress(neg_mask, score)[::-1]

        # the test is fed plain python lists
        result = stats.mannwhitneyu(pos_scores.tolist(), neg_scores.tolist())
        return result[1]
Exemplo n.º 2
0
def compute_stats(te_diffs, gene_diffs, plot_dir):
    """
    Test, per TE class and sample pair, whether the differences observed
    with the TE differ from the differences observed without it.

    te_diffs maps (rep, fam, orient) to a dict that maps
    (sample1, sample2) to a multiset of differences; gene_diffs maps
    (sample1, sample2) to a multiset of all differences.  The multisets
    must support subtraction and .elements() (presumably
    collections.Counter -- confirm with the caller).

    Comparisons with fewer than 10 entries in the TE multiset are skipped.
    For every remaining comparison a formatted table row and the
    Mann-Whitney p-value are collected, and for rep '*' combined with one
    of a fixed set of families a CDF plot is written below plot_dir.

    Returns the tuple (table_lines, pvals).
    """
    min_entries = 10
    row_format = '%-17s %-17s  %1s  %-10s %-10s %6d %9.2f %9.2f %8.2f %10.2e'
    plot_reps = ['*']
    plot_fams = ['*','LINE/L1','SINE/Alu','LTR/ERV1','LTR/ERVL-MaLR','LINE/L2','LTR/ERVL','SINE/MIR','DNA/hAT-Charlie','LTR/ERVK','DNA/TcMar-Tigger']

    table_lines = []
    pvals = []

    for te_or in te_diffs:
        rep, fam, orient = te_or
        per_sample = te_diffs[te_or]

        for sample_key in per_sample:
            sample1, sample2 = sample_key
            te_counts = per_sample[sample_key]

            # not enough data for a meaningful test
            if len(te_counts) < min_entries:
                continue

            # differences without the TE = all differences minus TE ones
            wo_te = list((gene_diffs[sample_key] - te_counts).elements())
            w_te = list(te_counts.elements())

            wo_mean = stats.mean(wo_te)
            w_mean = stats.mean(w_te)

            z, p = stats.mannwhitneyu(w_te, wo_te)

            row = (rep, fam, orient, sample1, sample2, len(w_te), w_mean, wo_mean, z, p)
            table_lines.append(row_format % row)
            pvals.append(p)

            # plot only the selected repeat/family combinations
            if rep in plot_reps and fam in plot_fams:
                rep_tag = rep.replace('/','-')
                fam_tag = fam.replace('/','-')
                out_pdf = '%s/%s_%s_%s_%s-%s.pdf' % (plot_dir, rep_tag, fam_tag, orient, sample1, sample2)
                cdf_plot(te_or, w_te, wo_te, out_pdf)

    return table_lines, pvals
Exemplo n.º 3
0
    def utest( self, score ):
        """
        Apply the Mann-Whitney U test to the scores of positive versus
        non-positive items to judge whether the score could be random.
        Reference:

        Mason & Graham (2002) Areas beneath the relative operating
        characteristics (ROC) and relative operating levels (ROL)
        curves: Statistical significance and interpretation

        @param score: the score predicted for each item
        @type  score: [ float ]

        @return: P-value taken from position 1 of the stats.mannwhitneyu
                 result (described as 1-tailed by the original author)
        @rtype: float
        """
        # positives first, then everything else; both in reversed order
        hits = N.compress( self.positives, score )[::-1]
        misses = N.compress( N.logical_not( self.positives ), score )[::-1]

        hits, misses = hits.tolist(), misses.tolist()

        return stats.mannwhitneyu( hits, misses )[1]
Exemplo n.º 4
0
def compute_stats(te_genes, gene_diff, ref_gtf, plot_dir, scale):
    """
    Compare, for every sample pair and every TE class, the differences of
    transcripts that carry the TE with those of all other transcripts.

    Only transcripts whose transcript_id occurs in ref_gtf are used.  For
    each (sample pair, TE key) with at least one TE transcript, one
    formatted table row and one p-value are produced.  The Mann-Whitney
    test is only run when more than 5 TE transcripts are available;
    otherwise the row carries z=0 and p=1.  For repeat '*' combined with
    one of a fixed set of families a CDF plot is written below plot_dir.

    Args:
      te_genes: maps (repeat, family, orient) to a container of
        transcript ids (only membership tests are performed on it).
      gene_diff: maps (sample1, sample2) to a dict of
        transcript id -> difference value.
      ref_gtf: path of the GTF file defining the transcripts of interest.
      plot_dir: directory receiving the CDF plots.
      scale: passed through unchanged to cdf_plot.

    Returns:
      (table_lines, pvals): the formatted rows and the matching p-values.
    """
    # focus on GTF genes; the context manager closes the file again
    gtf_genes = set()
    with open(ref_gtf) as gtf_in:
        for line in gtf_in:
            a = line.split('\t')
            tid = gff.gtf_kv(a[8])['transcript_id']
            gtf_genes.add(tid)

    pvals = []
    table_lines = []

    for sample_key in gene_diff:
        sample1, sample2 = sample_key
        sample_diff = gene_diff[sample_key]

        # transcripts that are both annotated and measured for this pair
        stat_genes = list(gtf_genes & set(sample_diff))

        for te_key in te_genes:
            repeat, family, orient = te_key
            te_tids = te_genes[te_key]

            te_diffs = [sample_diff[tid] for tid in stat_genes if tid in te_tids]
            if not te_diffs:
                continue

            note_diffs = [sample_diff[tid] for tid in stat_genes if tid not in te_tids]

            te_mean = mean(te_diffs)
            note_mean = mean(note_diffs)

            if len(te_diffs) > 5:
                z, p = stats.mannwhitneyu(te_diffs, note_diffs)
            else:
                # too few TE transcripts to test: report a neutral result
                z = 0
                p = 1

            pvals.append(p)

            cols = (repeat, family, orient, sample1, sample2, len(te_diffs), te_mean, len(note_diffs), note_mean, z, p)
            table_lines.append('%-17s  %-17s  %1s  %-10s  %-10s  %6d  %9.2f  %6d  %9.2f  %8.2f  %10.2e' % cols)

            # plot ...
            if repeat in ['*'] and family in ['*','LINE/L1','SINE/Alu','LTR/ERV1','LTR/ERVL-MaLR','LINE/L2','LTR/ERVL','SINE/MIR','DNA/hAT-Charlie','LTR/ERVK','DNA/TcMar-Tigger']:
                # '/' and '*' are replaced to keep the file name simple
                repeat_plot = repeat.replace('/','-').replace('*','X')
                family_plot = family.replace('/','-').replace('*','X')
                out_pdf = '%s/%s_%s_%s_%s-%s.pdf' % (plot_dir, repeat_plot, family_plot, orient, sample1, sample2)
                cdf_plot(te_key, te_diffs, note_diffs, out_pdf, scale)

    return table_lines, pvals
Exemplo n.º 5
0
# Inferential tests of the `stats` module.  Each test is run on the pair
# (l, m) and again on the pair (a, b); the results are only printed, so
# they have to be compared by eye.
# NOTE(review): a and b are presumably array counterparts of the lists
# l and m -- confirm against the part of the script that defines them.

# one-sample t-test of a against the value 12
print stats.ttest_1samp(a,12)
# t-test for independent samples
print 'ttest_ind:'
print stats.ttest_ind(l,m)
print stats.ttest_ind(a,b)
# t-test for related samples
print 'ttest_rel:'
print stats.ttest_rel(l,m)
print stats.ttest_rel(a,b)
# chi-square test, called with observed values only
print 'chisquare:'
print stats.chisquare(l)
print stats.chisquare(a)
# two-sample Kolmogorov-Smirnov test
print 'ks_2samp:'
print stats.ks_2samp(l,m)
print stats.ks_2samp(a,b)

# Mann-Whitney U test
print 'mannwhitneyu:'
print stats.mannwhitneyu(l,m)
print stats.mannwhitneyu(a,b)
# rank sums test
print 'ranksums:'
print stats.ranksums(l,m)
print stats.ranksums(a,b)
# Wilcoxon T test
print 'wilcoxont:'
print stats.wilcoxont(l,m)
print stats.wilcoxont(a,b)
# Kruskal-Wallis H test on three groups (the first group is reused)
print 'kruskalwallish:'
print stats.kruskalwallish(l,m,l)
# sample sizes, printed between the two Kruskal-Wallis calls
print len(l), len(m)
print stats.kruskalwallish(a,b,a)
# Friedman chi-square test on three groups (the first group is reused)
print 'friedmanchisquare:'
print stats.friedmanchisquare(l,m,l)
print stats.friedmanchisquare(a,b,a)
Exemplo n.º 6
0
# Inferential tests of the `stats` module.  Each test is run on the pair
# (l, m) and again on the pair (a, b); the results are only printed, so
# they have to be compared by eye.
# NOTE(review): a and b are presumably array counterparts of the lists
# l and m -- confirm against the part of the script that defines them.

# one-sample t-test of a against the value 12
print stats.ttest_1samp(a, 12)
# t-test for independent samples
print 'ttest_ind:'
print stats.ttest_ind(l, m)
print stats.ttest_ind(a, b)
# t-test for related samples
print 'ttest_rel:'
print stats.ttest_rel(l, m)
print stats.ttest_rel(a, b)
# chi-square test, called with observed values only
print 'chisquare:'
print stats.chisquare(l)
print stats.chisquare(a)
# two-sample Kolmogorov-Smirnov test
print 'ks_2samp:'
print stats.ks_2samp(l, m)
print stats.ks_2samp(a, b)

# Mann-Whitney U test
print 'mannwhitneyu:'
print stats.mannwhitneyu(l, m)
print stats.mannwhitneyu(a, b)
# rank sums test
print 'ranksums:'
print stats.ranksums(l, m)
print stats.ranksums(a, b)
# Wilcoxon T test
print 'wilcoxont:'
print stats.wilcoxont(l, m)
print stats.wilcoxont(a, b)
# Kruskal-Wallis H test on three groups (the first group is reused)
print 'kruskalwallish:'
print stats.kruskalwallish(l, m, l)
# sample sizes, printed between the two Kruskal-Wallis calls
print len(l), len(m)
print stats.kruskalwallish(a, b, a)
# Friedman chi-square test on three groups (the first group is reused)
print 'friedmanchisquare:'
print stats.friedmanchisquare(l, m, l)
print stats.friedmanchisquare(a, b, a)
Exemplo n.º 7
0
# Inferential tests of the `stats` module, run on the lists l and m.
# Most tests are called twice: once on (l, m) and once on l alone or on
# l paired with itself.  The results are only printed, so they have to
# be compared by eye.
print('\nINFERENTIAL')
# one-sample t-test of l against the value 12 (same call issued twice)
print('ttest_1samp:')
print(stats.ttest_1samp(l,12))
print(stats.ttest_1samp(l,12))
# t-test for independent samples
print('ttest_ind:')
print(stats.ttest_ind(l,m))
print(stats.ttest_ind(l,l))
# chi-square test, called with observed values only (same call twice)
print('chisquare:')
print(stats.chisquare(l))
print(stats.chisquare(l))
# two-sample Kolmogorov-Smirnov test
print('ks_2samp:')
print(stats.ks_2samp(l,m))
print(stats.ks_2samp(l,l))

# Mann-Whitney U test
print('mannwhitneyu:')
print(stats.mannwhitneyu(l,m))
print(stats.mannwhitneyu(l,l))
# rank sums test
print('ranksums:')
print(stats.ranksums(l,m))
print(stats.ranksums(l,l))
# Wilcoxon T test (only run on the pair l, m)
print('wilcoxont:')
print(stats.wilcoxont(l,m))
# Kruskal-Wallis H test on three groups
print('kruskalwallish:')
print(stats.kruskalwallish(l,m,l))
# sample sizes, printed between the two Kruskal-Wallis calls
print(len(l), len(m))
print(stats.kruskalwallish(l,l,l))
# Friedman chi-square test on three groups
print('friedmanchisquare:')
print(stats.friedmanchisquare(l,m,l))
print(stats.friedmanchisquare(l,l,l))

# rebind l to the floats 1.0 .. 20.0 for whatever follows this excerpt
l = [float(x) for x in range(1,21)]
Exemplo n.º 8
0
# Checks against published textbook values: every section prints the
# expected result first and the value computed by `stats` right after,
# so the two can be compared by eye.

print('\n\nChi-Square')

# observed frequencies
fo = [10.0, 40.0]
print('\nSHOULD BE 18.0, <<<0.01 (df=1) ... Basic Stats 1st ed. p.457')
print(stats.chisquare(fo))
# same observations against explicit expected frequencies
print('\nSHOULD BE 5.556, 0.01<p<0.05 (df=1) ... Basic Stats 1st ed. p.460')
print(stats.chisquare(fo,[5,45]))


print('\n\nMann Whitney U')

# two independent groups of five observations each
red = [540.0, 480.0, 600.0, 590.0, 605.0]
black = [760.0, 890.0, 1105.0, 595.0, 940.0]
print('\nSHOULD BE 2.0, 0.01<p<0.05 (N=5,5) ... Basic Stats 1st ed, p.473-4')
print(stats.mannwhitneyu(red,black))

print('\n\nRank Sums')

# reuses the red and black groups of the Mann Whitney U section
print('\nSHOULD BE -2.19, p<0.0286 (slightly) ... Basic Stats 1st ed, p.474-5')
print(stats.ranksums(red,black))


print('\n\nWilcoxon T')

# ten paired observations; red and black are rebound here
red = [540.0, 580.0, 600.0, 680.0, 430.0, 740.0, 600.0, 690.0, 605.0, 520.0]
black = [760.0, 710.0, 1105.0, 880.0, 500.0, 990.0, 1050.0, 640.0, 595.0, 520.0]
print('\nSHOULD BE +3.0, 0.01<p<0.05 (N=9) ... Basic Stats 1st ed, p.477-8')
print(stats.wilcoxont(red,black))
Exemplo n.º 9
0
def main():
    """
    Compare CLIP peak-bound genes with RIP enrichment from a cuffdiff
    .diff file.

    Steps, in order: parse the command line, optionally restrict the
    reference GTF to single-gene loci, collect the genes overlapped by
    peaks, load the RIP statistics, plot the bound and unbound
    distributions, write a Mann-Whitney comparison to
    <output_pre>_stats.txt, write two hypergeometric overlap tests to
    <output_pre>_hyper.txt and, when both exclusive sets are non-empty,
    save a Venn diagram to <output_pre>_venn.pdf.

    The option defaults read the GENCODE and RDIR environment variables
    while the parser is built, so both must be set.
    """
    usage = 'usage: %prog [options] <peaks gff> <diff>'
    parser = OptionParser(usage)
    parser.add_option('-c',
                      dest='clip_fpkm_file',
                      help='Control FPKM tracking file')
    parser.add_option('-g',
                      dest='ref_gtf',
                      default='%s/gencode.v18.annotation.gtf' %
                      os.environ['GENCODE'])
    parser.add_option('--ggplot',
                      dest='ggplot_script',
                      default='%s/peaks_diff_compare.r' % os.environ['RDIR'],
                      help='Script to make plots with [Default: %default]')
    parser.add_option('-m',
                      dest='max_stat',
                      default=10,
                      type='float',
                      help='Max cuffdiff stat [Default: %default]')
    parser.add_option('-o',
                      dest='output_pre',
                      default='',
                      help='Output prefix [Default: %default]')
    parser.add_option('-r',
                      dest='rbp',
                      default='RBP',
                      help='RBP name [Default: %default]')
    parser.add_option('-s',
                      dest='single_gene_loci',
                      default=False,
                      action='store_true',
                      help='Only use single gene loci [Default: %default]')
    parser.add_option(
        '-t',
        dest='test_stat',
        default=False,
        action='store_true',
        help='Use test statistic rather than fold change [Default: %default]')
    # NOTE(review): sample1/sample2 are parsed but not read anywhere in
    # this function -- confirm whether they are still needed.
    parser.add_option('--sample1',
                      dest='sample1',
                      help='Sample_1 name in cuffdiff')
    parser.add_option('--sample2',
                      dest='sample2',
                      help='Sample_2 name in cuffdiff')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide peaks GFF and .diff file')
    else:
        peaks_gff = args[0]
        diff_file = args[1]

    ##################################################
    # process GTF
    ##################################################
    if options.single_gene_loci:
        # swap the reference GTF for a filtered temporary file; the file
        # descriptor and path are kept for the clean-up at the very end
        single_gtf_fd, single_gtf_file = filter_single(options.ref_gtf)
        options.ref_gtf = single_gtf_file

    gtf_genes = gff.gtf_gene_set(options.ref_gtf)

    ##################################################
    # collect CLIP peak bound genes
    ##################################################
    # intersectBed reports the GTF entries overlapping a peak
    # (-s: same strand only, -u: each entry at most once); the gene_id is
    # read from the attribute column (index 8) of every reported line
    peak_genes = set()
    p = subprocess.Popen('intersectBed -s -u -a %s -b %s' %
                         (options.ref_gtf, peaks_gff),
                         shell=True,
                         stdout=subprocess.PIPE)
    for line in p.stdout:
        peak_genes.add(gff.gtf_kv(line.split('\t')[8])['gene_id'])
    # wait for the child process to finish
    p.communicate()

    # find expressed genes in peak calls
    silent_genes = set()
    if options.clip_fpkm_file:
        silent_genes = find_silent(options.clip_fpkm_file)

    ##################################################
    # collect RIP stats
    ##################################################
    if options.test_stat:
        rip_fold, rip_bound = ripseq.hash_rip(diff_file,
                                              just_ok=True,
                                              use_fold=False,
                                              max_stat=options.max_stat,
                                              one_rbp=True)
    else:
        rip_fold, rip_bound = ripseq.hash_rip(diff_file,
                                              use_fold=True,
                                              max_stat=options.max_stat,
                                              one_rbp=True)
        # only rip_bound of the call above is kept; rip_fold is replaced
        # right away by the result of hash_rip_fold
        rip_fold = ripseq.hash_rip_fold(diff_file,
                                        min_fpkm=0.125,
                                        max_fold=10,
                                        one_rbp=True)

    # TEMP: print bound genes
    # genes_out = open('%s_genes.txt' % options.output_pre, 'w')
    # for gene_id in rip_bound:
    #     if rip_bound[gene_id]:
    #         print >> genes_out, gene_id, rip_fold[gene_id]
    # genes_out.close()

    ##################################################
    # plot bound and unbound distributions
    ##################################################
    # construct data frame: three parallel columns, one entry per gene
    # that is in the GTF gene set and not flagged as silent
    df_dict = {'Gene': [], 'CLIP': [], 'RIP': []}
    for gene_id in rip_fold:
        if gene_id in gtf_genes and (len(silent_genes) == 0
                                     or gene_id not in silent_genes):
            df_dict['Gene'].append(gene_id)
            df_dict['RIP'].append(rip_fold[gene_id])
            if gene_id in peak_genes:
                df_dict['CLIP'].append('Bound')
            else:
                df_dict['CLIP'].append('Unbound')

    ggplot.plot(options.ggplot_script, df_dict,
                [options.output_pre, options.rbp, options.test_stat])

    ##################################################
    # compute stats on bound and unbound distributions
    ##################################################
    # split the RIP column by the CLIP label stored at the same index
    bound_fold = [
        df_dict['RIP'][i] for i in range(len(df_dict['RIP']))
        if df_dict['CLIP'][i] == 'Bound'
    ]
    unbound_fold = [
        df_dict['RIP'][i] for i in range(len(df_dict['RIP']))
        if df_dict['CLIP'][i] == 'Unbound'
    ]

    # perform statistical test
    z, p = stats.mannwhitneyu(bound_fold, unbound_fold)

    # one line: name, size and mean of both groups, statistic, p-value
    stats_out = open('%s_stats.txt' % options.output_pre, 'w')
    cols = (options.rbp, len(bound_fold), stats.mean(bound_fold),
            len(unbound_fold), stats.mean(unbound_fold), z, p)
    print >> stats_out, '%-10s  %5d  %6.2f  %5d  %6.2f  %6.2f  %9.2e' % cols
    stats_out.close()

    ##################################################
    # plot venn diagram
    ##################################################
    # genes of the data frame that rip_bound marks as bound
    rip_genes = set([
        df_dict['Gene'][i] for i in range(len(df_dict['Gene']))
        if rip_bound.get(df_dict['Gene'][i], False)
    ])

    clip_only = len(peak_genes - rip_genes)
    rip_only = len(rip_genes - peak_genes)
    both = len(peak_genes & rip_genes)

    if options.clip_fpkm_file:
        print >> sys.stderr, 'Ignoring silent genes for hypergeometric test'

    # mapping of the usual hypergeometric symbols onto scipy's arguments:
    # k is x
    # K is n
    # N is M
    # n is N
    # hypergeom.sf(x, M, n, N, loc=0)

    # sf(both - 1, ...) is the probability of an overlap of at least
    # `both`; p1 and p2 swap the roles of the two gene sets
    p1 = hypergeom.sf(both - 1, len(gtf_genes), len(peak_genes),
                      len(rip_genes))
    p2 = hypergeom.sf(both - 1, len(gtf_genes), len(rip_genes),
                      len(peak_genes))

    hyper_out = open('%s_hyper.txt' % options.output_pre, 'w')
    cols = (p1, p2, both, clip_only, rip_only, len(peak_genes), len(rip_genes),
            len(gtf_genes))
    print >> hyper_out, '%7.2e  %7.2e  %5d  %5d  %5d  %5d  %5d %5d' % cols
    hyper_out.close()

    # the diagram is only drawn when both exclusive sets are non-empty
    if clip_only > 0 and rip_only > 0:
        plt.figure()
        # venn_diag = venn2(subsets=(clip_only, rip_only, both), set_labels=['CLIP', 'fRIP'], set_colors=['#e41a1c', '#377eb8'])
        # venn_diag = venn2(subsets=(clip_only, rip_only, both), set_labels=['CLIP', 'fRIP'], set_colors=['#e41a1c', '#1ae47d'])
        venn_diag = venn2(subsets=(clip_only, rip_only, both),
                          set_labels=['CLIP', 'fRIP'],
                          set_colors=['#e41a1c', '#A1A838'])
        plt.savefig('%s_venn.pdf' % options.output_pre)

    ##################################################
    # clean
    ##################################################
    # remove the temporary single-gene-loci GTF created above
    if options.single_gene_loci:
        os.close(single_gtf_fd)
        os.remove(single_gtf_file)
Exemplo n.º 10
0
def main():
    """
    Compare the SAD scores of observed SNPs with those of shuffled SNPs,
    sample by sample.

    For every line "<sample> <bed_file>" of the sample BEDs file: filter
    the VCF to the sample's BED regions, obtain the SAD scores of those
    SNPs (computed with the model or read from a pre-computed table),
    score num_shuffles shuffled SNP sets, write a per-SNP table, a CDF
    plot and a Q-Q plot, and append one line each to binom.txt and
    mannwhitney.txt in the output directory.

    The default of -f reads the HG19 environment variable while the
    parser is built, so it must be set.
    """
    usage = 'usage: %prog [options] <vcf_file> <sample_beds_file> <model_file>'
    parser = OptionParser(usage)
    parser.add_option('-f',
                      dest='genome_fasta',
                      default='%s/assembly/hg19.fa' % os.environ['HG19'],
                      help='Genome FASTA [Default: %default]')
    parser.add_option('-g',
                      dest='gpu',
                      default=False,
                      action='store_true',
                      help='Run on GPU [Default: %default]')
    parser.add_option(
        '-l',
        dest='seq_len',
        type='int',
        default=600,
        help='Sequence length provided to the model [Default: %default]')
    parser.add_option('-o',
                      dest='out_dir',
                      default='sad_shuffle',
                      help='Output directory')
    parser.add_option(
        '-r',
        dest='replot',
        default=False,
        action='store_true',
        help='Re-plot only, without re-computing [Default: %default]')
    parser.add_option('-s',
                      dest='num_shuffles',
                      default=1,
                      type='int',
                      help='Number of SNP shuffles [Default: %default]')
    parser.add_option('-t',
                      dest='sad_table_file',
                      help='Pre-computed SAD scores for the SNPs')
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error('Must provide VCF file, sample BEDs file, and model file')
    else:
        vcf_file = args[0]
        sample_beds_file = args[1]
        model_file = args[2]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    # open reference genome
    genome = pysam.Fastafile(options.genome_fasta)

    # open binomial stats file
    binom_out = open('%s/binom.txt' % options.out_dir, 'w')

    # open mann-whitney stats file
    mw_out = open('%s/mannwhitney.txt' % options.out_dir, 'w')

    # plot defaults
    sns.set(font_scale=1.5, style='ticks')

    # si is the running sample index; it is handed to retrieve_sad and
    # compute_sad and incremented at the end of every iteration
    si = 0
    for line in open(sample_beds_file):
        sample, bed_file = line.split()
        print(sample)

        #########################################
        # compute SAD
        #########################################
        # filter VCF to overlapping SNPs
        print("  intersecting SNPs")
        sample_vcf_file = '%s/%s.vcf' % (options.out_dir, sample)
        if not options.replot:
            filter_vcf(vcf_file, bed_file, sample_vcf_file)

        # compute SAD scores for this sample's SNPs
        print("  computing SAD")
        if options.sad_table_file:
            true_sad = retrieve_sad(sample_vcf_file, options.sad_table_file,
                                    si)
        else:
            true_sad = compute_sad(sample_vcf_file, model_file, si,
                                   '%s/%s_sad' % (options.out_dir, sample),
                                   options.seq_len, options.gpu,
                                   options.replot)

        #########################################
        # compute shuffled SAD
        #########################################
        # one row per SNP, one column per shuffle
        shuffle_sad = np.zeros((true_sad.shape[0], options.num_shuffles))
        for ni in range(options.num_shuffles):
            # shuffle the SNPs within their overlapping DHS
            # NOTE(review): unlike filter_vcf above, the shuffle is also
            # executed when --replot is given -- confirm this is intended.
            print("  shuffle %d" % ni)
            sample_vcf_shuf_file = '%s/%s_shuf%d.vcf' % (options.out_dir,
                                                         sample, ni)
            shuffle_snps(sample_vcf_file, sample_vcf_shuf_file, genome)

            # compute SAD scores for shuffled SNPs
            print("  computing shuffle SAD")
            shuffle_sad[:, ni] = compute_sad(
                sample_vcf_shuf_file, model_file, si,
                '%s/%s_shuf%d_sad' % (options.out_dir, sample, ni),
                options.seq_len, options.gpu, options.replot)

        #########################################
        # simple stats
        #########################################
        # compute shuffle means (per SNP, across the shuffles)
        shuffle_sad_mean = shuffle_sad.mean(axis=1)

        # print sample table: true SAD and mean shuffled SAD per SNP
        sample_sad_out = open('%s/%s_table.txt' % (options.out_dir, sample),
                              'w')
        for vi in range(len(true_sad)):
            print('%f\t%f' % (true_sad[vi], shuffle_sad_mean[vi]),
                  file=sample_sad_out)
        sample_sad_out.close()

        # scatter plot
        # plt.figure()
        # plt.scatter(true_sad, shuffle_sad_mean, color='black', alpha=0.7)
        # plt.gca().grid(True, linestyle=':')
        # plt.savefig('%s/%s_scatter.pdf' % (options.out_dir,sample))
        # plt.close()

        # plot CDFs as cumulative step histograms with 1000 bins
        # NOTE(review): the `normed` argument was deprecated in
        # Matplotlib 2.1 and removed in 3.1 (replaced by `density`);
        # verify against the Matplotlib version in use.
        sns_colors = sns.color_palette('deep')
        plt.figure()
        plt.hist(true_sad,
                 1000,
                 normed=1,
                 histtype='step',
                 cumulative=True,
                 color=sns_colors[0],
                 linewidth=1,
                 label='SNPs')
        plt.hist(shuffle_sad.flatten(),
                 1000,
                 normed=1,
                 histtype='step',
                 cumulative=True,
                 color=sns_colors[2],
                 linewidth=1,
                 label='Shuffle')
        ax = plt.gca()
        ax.grid(True, linestyle=':')
        ax.set_xlim(-.2, .2)
        plt.legend()
        plt.savefig('%s/%s_cdf.pdf' % (options.out_dir, sample))
        plt.close()

        # plot Q-Q: matching quantiles of both distributions, using at
        # most 10000 evenly spaced probabilities
        true_q = mquantiles(true_sad,
                            np.linspace(0, 1, min(10000, true_sad.shape[0])))
        shuf_q = mquantiles(shuffle_sad_mean,
                            np.linspace(0, 1, min(10000, true_sad.shape[0])))
        plt.figure()
        plt.scatter(true_q, shuf_q, color=sns_colors[0])
        # axis limits derived from the extreme quantiles, scaled by 1.05
        pmin = 1.05 * min(true_q[0], shuf_q[0])
        pmax = 1.05 * max(true_q[-1], shuf_q[-1])
        # diagonal reference line
        plt.plot([pmin, pmax], [pmin, pmax], color='black', linewidth=1)
        ax = plt.gca()
        ax.set_xlim(pmin, pmax)
        ax.set_ylim(pmin, pmax)
        ax.set_xlabel('True SAD')
        ax.set_ylabel('Shuffled SAD')
        ax.grid(True, linestyle=':')
        plt.savefig('%s/%s_qq.pdf' % (options.out_dir, sample))
        plt.close()

        #########################################
        # statistical tests
        #########################################
        # compute matched binomial test
        # true_great: number of SNPs whose true SAD exceeds the mean of
        # their shuffles; true_lo: log2 ratio of that count to the rest
        true_great = sum((true_sad - shuffle_sad_mean) > 0)
        true_lo = np.log2(true_great) - np.log2(len(true_sad) - true_great)
        if true_lo > 0:
            # upper tail: P(X >= true_great) under p=0.5
            binom_p = 1.0 - binom.cdf(true_great - 1, n=len(true_sad), p=0.5)
        else:
            # lower tail: P(X <= true_great) under p=0.5
            binom_p = binom.cdf(true_great, n=len(true_sad), p=0.5)

        # print significance stats
        cols = (sample, len(true_sad), true_great, true_lo, binom_p)
        print('%-20s  %5d  %5d  %6.2f  %6.1e' % cols, file=binom_out)

        # compute Mann-Whitney between true scores and all shuffled scores
        mw_z, mw_p = stats.mannwhitneyu(true_sad, shuffle_sad.flatten())
        cols = (sample, len(true_sad), true_sad.mean(), shuffle_sad.mean(),
                mw_z, mw_p)
        print('%-20s  %5d  %6.3f  %6.3f  %6.2f  %6.1e' % cols, file=mw_out)

        # update sample index
        si += 1

    binom_out.close()
    mw_out.close()
    genome.close()
Exemplo n.º 11
0
def main():
    usage = 'usage: %prog [options] <peaks gff> <diff>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='control_fpkm_file', help='Control FPKM tracking file')
    parser.add_option('-g', dest='ref_gtf', default='%s/gencode.v18.annotation.gtf'%os.environ['GENCODE'])
    parser.add_option('-o', dest='output_pre', default='', help='Output prefix [Default: %default]')
    parser.add_option('--sample1', dest='sample1', help='Sample_1 name in cuffdiff')
    parser.add_option('--sample2', dest='sample2', help='Sample_2 name in cuffdiff')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide peaks GFF and .diff file')
    else:
        peaks_gff = args[0]
        diff_file = args[1]

    # find expressed genes in peak calls
    silent_genes = set()
    if options.control_fpkm_file:
        silent_genes = find_silent(options.control_fpkm_file)

    # find peak bound genes
    peak_genes = set()
    p = subprocess.Popen('intersectBed -s -u -a %s -b %s' % (options.ref_gtf,peaks_gff), shell=True, stdout=subprocess.PIPE)
    for line in p.stdout:
        peak_genes.add(gff.gtf_kv(line.split('\t')[8])['gene_id'])
    p.communicate()

    # process RIP
    bound_tstats = []
    unbound_tstats = []
    rip_genes = set()

    diff_in = open(diff_file)
    line = diff_in.readline()
    for line in diff_in:
        a = line.split('\t')

        gene_id = a[0]
        sample1 = a[4]
        sample2 = a[5]
        status = a[6]
        fpkm1 = float(a[7])
        fpkm2 = float(a[8])
        tstat = float(a[10])
        sig = a[13].rstrip()

        if sample2 == 'input':
            tstat *= -1

        if status == 'OK' and not math.isnan(tstat):
            if options.sample1 in [None,sample1] and options.sample2 in [None,sample2]:
                # save RIP bound
                if sig == 'yes':
                    rip_genes.add(gene_id)

                # save test_stat
                if gene_id in peak_genes:
                    bound_tstats.append(tstat)
                else:
                    if not gene_id in silent_genes:
                        unbound_tstats.append(tstat)

    print '%d silent genes' % len(silent_genes)
    print '%d bound genes' % len(bound_tstats)
    print '%d unbound genes' % len(unbound_tstats)

    # perform statistical test
    z, p = stats.mannwhitneyu(bound_tstats, unbound_tstats)
    print z, p

    ##################################################
    # plot bound and unbound distributions
    ##################################################
    # construct data frame
    df_dict = {'Peak':(['Yes']*len(bound_tstats) + ['No']*len(unbound_tstats)),
               'Test_stat':bound_tstats+unbound_tstats}

    r_script = '%s/peaks_diff_compare.r' % os.environ['RDIR']

    ggplot.plot(r_script, df_dict, [options.output_pre])

    ##################################################
    # plot venn diagram
    ##################################################
    clip_only = len(peak_genes - rip_genes)
    rip_only = len(rip_genes - peak_genes)
    both = len(peak_genes & rip_genes)

    plt.figure()
    venn_diag = venn2(subsets=(clip_only, rip_only, both), set_labels=['CLIP', 'RIP'])
    plt.savefig('%s_venn.pdf' % options.output_pre)
Exemplo n.º 12
0
def compute_stats(te_genes, gene_diff, ref_gtf, plot_dir, scale):
    """
    Compare, for every sample pair and every TE class, the differences of
    transcripts that carry the TE with those of all other transcripts.

    Only transcripts whose transcript_id occurs in ref_gtf are used.  For
    each (sample pair, TE key) with at least one TE transcript, one
    formatted table row and one p-value are produced.  The Mann-Whitney
    test is only run when more than 5 TE transcripts are available;
    otherwise the row carries z=0 and p=1.  For repeat '*' combined with
    one of a fixed set of families a CDF plot is written below plot_dir.

    Args:
      te_genes: maps (repeat, family, orient) to a container of
        transcript ids (only membership tests are performed on it).
      gene_diff: maps (sample1, sample2) to a dict of
        transcript id -> difference value.
      ref_gtf: path of the GTF file defining the transcripts of interest.
      plot_dir: directory receiving the CDF plots.
      scale: passed through unchanged to cdf_plot.

    Returns:
      (table_lines, pvals): the formatted rows and the matching p-values.
    """
    # focus on GTF genes; the context manager closes the file again
    gtf_genes = set()
    with open(ref_gtf) as gtf_in:
        for line in gtf_in:
            a = line.split('\t')
            tid = gff.gtf_kv(a[8])['transcript_id']
            gtf_genes.add(tid)

    pvals = []
    table_lines = []

    for sample_key in gene_diff:
        sample1, sample2 = sample_key
        sample_diff = gene_diff[sample_key]

        # transcripts that are both annotated and measured for this pair
        stat_genes = list(gtf_genes & set(sample_diff))

        for te_key in te_genes:
            repeat, family, orient = te_key
            te_tids = te_genes[te_key]

            te_diffs = [
                sample_diff[tid] for tid in stat_genes if tid in te_tids
            ]
            if not te_diffs:
                continue

            note_diffs = [
                sample_diff[tid] for tid in stat_genes if tid not in te_tids
            ]

            te_mean = mean(te_diffs)
            note_mean = mean(note_diffs)

            if len(te_diffs) > 5:
                z, p = stats.mannwhitneyu(te_diffs, note_diffs)
            else:
                # too few TE transcripts to test: report a neutral result
                z = 0
                p = 1

            pvals.append(p)

            cols = (repeat, family, orient,
                    sample1, sample2, len(te_diffs), te_mean,
                    len(note_diffs), note_mean, z, p)
            table_lines.append(
                '%-17s  %-17s  %1s  %-10s  %-10s  %6d  %9.2f  %6d  %9.2f  %8.2f  %10.2e'
                % cols)

            # plot ...
            if repeat in ['*'] and family in [
                    '*', 'LINE/L1', 'SINE/Alu', 'LTR/ERV1',
                    'LTR/ERVL-MaLR', 'LINE/L2', 'LTR/ERVL', 'SINE/MIR',
                    'DNA/hAT-Charlie', 'LTR/ERVK', 'DNA/TcMar-Tigger'
            ]:
                # '/' and '*' are replaced to keep the file name simple
                repeat_plot = repeat.replace('/', '-').replace('*', 'X')
                family_plot = family.replace('/', '-').replace('*', 'X')
                out_pdf = '%s/%s_%s_%s_%s-%s.pdf' % (plot_dir, repeat_plot,
                                                     family_plot, orient,
                                                     sample1, sample2)
                cdf_plot(te_key, te_diffs, note_diffs, out_pdf, scale)

    return table_lines, pvals
Exemplo n.º 13
0
def main():
    """Compare each sample's SNP SAD scores against shuffled-SNP SAD scores.

    Usage: %prog [options] <vcf_file> <sample_beds_file> <model_file>

    Every non-blank line of <sample_beds_file> holds a sample name and a BED
    file.  For each sample the VCF is filtered with that BED file, SAD scores
    are computed (or retrieved from the -t table), the SNPs are shuffled -s
    times and re-scored, and the true and shuffled scores are compared.

    Files written by this function into the -o directory:
      <sample>_table.txt   true SAD and mean shuffled SAD, one SNP per line
      <sample>_cdf.pdf     cumulative histograms of true vs. shuffled SAD
      <sample>_qq.pdf      quantile-quantile plot of true vs. shuffled SAD
      binom.txt            one binomial-test summary line per sample
      mannwhitney.txt      one Mann-Whitney summary line per sample

    The helper functions additionally receive VCF/SAD paths inside the same
    directory (<sample>.vcf, <sample>_shuf<i>.vcf, ..._sad).
    """
    usage = 'usage: %prog [options] <vcf_file> <sample_beds_file> <model_file>'
    parser = OptionParser(usage)
    parser.add_option('-f', dest='genome_fasta', default='%s/assembly/hg19.fa'%os.environ['HG19'], help='Genome FASTA [Default: %default]')
    parser.add_option('-g', dest='gpu', default=False, action='store_true', help='Run on GPU [Default: %default]')
    parser.add_option('-l', dest='seq_len', type='int', default=600, help='Sequence length provided to the model [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='sad_shuffle', help='Output directory')
    parser.add_option('-r', dest='replot', default=False, action='store_true', help='Re-plot only, without re-computing [Default: %default]')
    parser.add_option('-s', dest='num_shuffles', default=1, type='int', help='Number of SNP shuffles [Default: %default]')
    parser.add_option('-t', dest='sad_table_file', help='Pre-computed SAD scores for the SNPs')
    (options,args) = parser.parse_args()

    # parser.error() prints the message and exits
    if len(args) != 3:
        parser.error('Must provide VCF file, sample BEDs file, and model file')
    vcf_file, sample_beds_file, model_file = args

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    # plot defaults
    sns.set(font_scale=1.5, style='ticks')
    sns_colors = sns.color_palette('deep')

    # open reference genome; the finally clause closes it even if a sample fails
    genome = pysam.Fastafile(options.genome_fasta)
    try:
        # binomial stats, mann-whitney stats and the sample table are all
        # closed by the with statement, whether or not the loop completes
        with open('%s/binom.txt' % options.out_dir, 'w') as binom_out, \
                open('%s/mannwhitney.txt' % options.out_dir, 'w') as mw_out, \
                open(sample_beds_file) as sample_beds_in:
            # sample index handed to the SAD helpers; counts data lines only
            si = 0
            for line in sample_beds_in:
                # tolerate blank lines (e.g. a trailing newline) instead of
                # failing on the tuple unpacking below
                if not line.strip():
                    continue

                sample, bed_file = line.split()
                print(sample)

                #########################################
                # compute SAD
                #########################################
                # filter VCF to overlapping SNPs
                print("  intersecting SNPs")
                sample_vcf_file = '%s/%s.vcf' % (options.out_dir,sample)
                if not options.replot:
                    filter_vcf(vcf_file, bed_file, sample_vcf_file)

                # compute SAD scores for this sample's SNPs
                print("  computing SAD")
                if options.sad_table_file:
                    true_sad = retrieve_sad(sample_vcf_file, options.sad_table_file, si)
                else:
                    true_sad = compute_sad(sample_vcf_file, model_file, si, '%s/%s_sad'%(options.out_dir,sample), options.seq_len, options.gpu, options.replot)

                #########################################
                # compute shuffled SAD
                #########################################
                # one column of shuffled scores per shuffle
                shuffle_sad = np.zeros((true_sad.shape[0],options.num_shuffles))
                for ni in range(options.num_shuffles):
                    # shuffle the SNPs within their overlapping DHS
                    print("  shuffle %d" % ni)
                    sample_vcf_shuf_file = '%s/%s_shuf%d.vcf' % (options.out_dir, sample, ni)
                    shuffle_snps(sample_vcf_file, sample_vcf_shuf_file, genome)

                    # compute SAD scores for shuffled SNPs
                    print("  computing shuffle SAD")
                    shuffle_sad[:,ni] = compute_sad(sample_vcf_shuf_file, model_file, si, '%s/%s_shuf%d_sad'%(options.out_dir,sample,ni), options.seq_len, options.gpu, options.replot)

                #########################################
                # simple stats
                #########################################
                # average the shuffles so each SNP has one matched background value
                shuffle_sad_mean = shuffle_sad.mean(axis=1)

                # print sample table
                with open('%s/%s_table.txt' % (options.out_dir,sample), 'w') as sample_sad_out:
                    for true_val, shuffle_val in zip(true_sad, shuffle_sad_mean):
                        print('%f\t%f' % (true_val, shuffle_val), file=sample_sad_out)

                # plot CDFs
                # NOTE(review): `normed` was removed from hist() in matplotlib 3.1;
                # switch to density=True once the minimum supported matplotlib allows it.
                plt.figure()
                plt.hist(true_sad, 1000, normed=1, histtype='step', cumulative=True, color=sns_colors[0], linewidth=1, label='SNPs')
                plt.hist(shuffle_sad.flatten(), 1000, normed=1, histtype='step', cumulative=True, color=sns_colors[2], linewidth=1, label='Shuffle')
                ax = plt.gca()
                ax.grid(True, linestyle=':')
                ax.set_xlim(-.2, .2)
                plt.legend()
                plt.savefig('%s/%s_cdf.pdf' % (options.out_dir,sample))
                plt.close()

                # plot Q-Q on a shared probability grid of at most 10000 points
                quantile_probs = np.linspace(0,1,min(10000,true_sad.shape[0]))
                true_q = mquantiles(true_sad, quantile_probs)
                shuf_q = mquantiles(shuffle_sad_mean, quantile_probs)
                plt.figure()
                plt.scatter(true_q, shuf_q, color=sns_colors[0])
                pmin = 1.05*min(true_q[0], shuf_q[0])
                pmax = 1.05*max(true_q[-1], shuf_q[-1])
                # y = x reference line
                plt.plot([pmin,pmax], [pmin,pmax], color='black', linewidth=1)
                ax = plt.gca()
                ax.set_xlim(pmin,pmax)
                ax.set_ylim(pmin,pmax)
                ax.set_xlabel('True SAD')
                ax.set_ylabel('Shuffled SAD')
                ax.grid(True, linestyle=':')
                plt.savefig('%s/%s_qq.pdf' % (options.out_dir,sample))
                plt.close()

                #########################################
                # statistical tests
                #########################################
                # compute matched binomial test: count SNPs whose true SAD
                # exceeds their mean shuffled SAD
                true_great = int(np.sum((true_sad-shuffle_sad_mean) > 0))
                # log2 odds of "greater"; +/-inf when all or none are greater
                true_lo = np.log2(true_great) - np.log2(len(true_sad)-true_great)
                if true_lo > 0:
                    # upper tail: P(X >= true_great)
                    binom_p = 1.0 - binom.cdf(true_great-1, n=len(true_sad), p=0.5)
                else:
                    # lower tail: P(X <= true_great)
                    binom_p = binom.cdf(true_great, n=len(true_sad), p=0.5)

                # print significance stats
                cols = (sample, len(true_sad), true_great, true_lo, binom_p)
                print('%-20s  %5d  %5d  %6.2f  %6.1e' % cols, file=binom_out)

                # compute Mann-Whitney
                # NOTE(review): the first return value is named z here, but
                # mannwhitneyu implementations typically return the U statistic
                # -- confirm against the imported `stats` module.
                mw_z, mw_p = stats.mannwhitneyu(true_sad, shuffle_sad.flatten())
                cols = (sample, len(true_sad), true_sad.mean(), shuffle_sad.mean(), mw_z, mw_p)
                print('%-20s  %5d  %6.3f  %6.3f  %6.2f  %6.1e' % cols, file=mw_out)

                # update sample index
                si += 1
    finally:
        genome.close()
Exemplo n.º 14
0
def main():
    """Compare CLIP peak-bound genes with RIP enrichment from a cuffdiff file.

    Usage: %prog [options] <peaks gff> <diff>

    Genes from the reference GTF that intersectBed reports as overlapping a
    peak are labelled 'Bound', all other genes with a RIP value 'Unbound'.
    The RIP values of the two groups are handed to the ggplot R script and
    compared with a Mann-Whitney test; the overlap between peak genes and
    RIP-bound genes is scored with a hypergeometric test.

    Files written, all prefixed with the -o value:
      <prefix>_stats.txt   group sizes, group means and Mann-Whitney result
      <prefix>_hyper.txt   hypergeometric p-values and set sizes
      <prefix>_venn.pdf    only when both exclusive sets are non-empty

    With -s, a temporary single-gene-loci GTF is created by filter_single()
    and is always removed before returning, even when an error occurs.
    """
    usage = 'usage: %prog [options] <peaks gff> <diff>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='clip_fpkm_file', help='Control FPKM tracking file')
    parser.add_option('-g', dest='ref_gtf', default='%s/gencode.v18.annotation.gtf'%os.environ['GENCODE'])
    parser.add_option('--ggplot', dest='ggplot_script', default='%s/peaks_diff_compare.r'%os.environ['RDIR'], help='Script to make plots with [Default: %default]')
    parser.add_option('-m', dest='max_stat', default=10, type='float', help='Max cuffdiff stat [Default: %default]')
    parser.add_option('-o', dest='output_pre', default='', help='Output prefix [Default: %default]')
    parser.add_option('-r', dest='rbp', default='RBP', help='RBP name [Default: %default]')
    parser.add_option('-s', dest='single_gene_loci', default=False, action='store_true', help='Only use single gene loci [Default: %default]')
    parser.add_option('-t', dest='test_stat', default=False, action='store_true', help='Use test statistic rather than fold change [Default: %default]')
    parser.add_option('--sample1', dest='sample1', help='Sample_1 name in cuffdiff')
    parser.add_option('--sample2', dest='sample2', help='Sample_2 name in cuffdiff')
    (options,args) = parser.parse_args()

    # parser.error() prints the message and exits
    if len(args) != 2:
        parser.error('Must provide peaks GFF and .diff file')
    peaks_gff, diff_file = args

    ##################################################
    # process GTF
    ##################################################
    single_gtf_fd = None
    single_gtf_file = None
    if options.single_gene_loci:
        single_gtf_fd, single_gtf_file = filter_single(options.ref_gtf)
        options.ref_gtf = single_gtf_file

    try:
        gtf_genes = gff.gtf_gene_set(options.ref_gtf)

        ##################################################
        # collect CLIP peak bound genes
        ##################################################
        peak_genes = set()
        # argument list instead of a shell string, so paths containing spaces
        # or shell metacharacters are passed through untouched;
        # universal_newlines makes stdout yield text on Python 2 and 3 alike
        intersect_cmd = ['intersectBed', '-s', '-u', '-a', options.ref_gtf, '-b', peaks_gff]
        p = subprocess.Popen(intersect_cmd, stdout=subprocess.PIPE, universal_newlines=True)
        for line in p.stdout:
            # column 9 holds the key/value attributes
            peak_genes.add(gff.gtf_kv(line.split('\t')[8])['gene_id'])
        p.communicate()

        # find expressed genes in peak calls
        silent_genes = set()
        if options.clip_fpkm_file:
            silent_genes = find_silent(options.clip_fpkm_file)

        ##################################################
        # collect RIP stats
        ##################################################
        if options.test_stat:
            rip_fold, rip_bound = ripseq.hash_rip(diff_file, just_ok = True, use_fold=False, max_stat=options.max_stat, one_rbp=True)
        else:
            # hash_rip is only needed for rip_bound here; the fold values
            # come from hash_rip_fold
            _, rip_bound = ripseq.hash_rip(diff_file, use_fold=True, max_stat=options.max_stat, one_rbp=True)
            rip_fold = ripseq.hash_rip_fold(diff_file, min_fpkm=0.125, max_fold=10, one_rbp=True)

        ##################################################
        # plot bound and unbound distributions
        ##################################################
        # construct data frame (parallel lists, one entry per kept gene)
        df_dict = {'Gene':[], 'CLIP':[], 'RIP':[]}
        for gene_id in rip_fold:
            # an empty silent set excludes nothing
            if gene_id in gtf_genes and gene_id not in silent_genes:
                df_dict['Gene'].append(gene_id)
                df_dict['RIP'].append(rip_fold[gene_id])
                if gene_id in peak_genes:
                    df_dict['CLIP'].append('Bound')
                else:
                    df_dict['CLIP'].append('Unbound')

        ggplot.plot(options.ggplot_script, df_dict, [options.output_pre, options.rbp, options.test_stat])

        ##################################################
        # compute stats on bound and unbound distributions
        ##################################################
        bound_fold = [fold for fold, clip in zip(df_dict['RIP'], df_dict['CLIP']) if clip == 'Bound']
        unbound_fold = [fold for fold, clip in zip(df_dict['RIP'], df_dict['CLIP']) if clip == 'Unbound']

        # perform statistical test
        z, p = stats.mannwhitneyu(bound_fold, unbound_fold)

        cols = (options.rbp, len(bound_fold), stats.mean(bound_fold), len(unbound_fold), stats.mean(unbound_fold), z, p)
        # file.write() instead of the Python-2-only `print >>` statement
        with open('%s_stats.txt' % options.output_pre, 'w') as stats_out:
            stats_out.write('%-10s  %5d  %6.2f  %5d  %6.2f  %6.2f  %9.2e\n' % cols)

        ##################################################
        # plot venn diagram
        ##################################################
        rip_genes = set(gene_id for gene_id in df_dict['Gene'] if rip_bound.get(gene_id, False))

        clip_only = len(peak_genes - rip_genes)
        rip_only = len(rip_genes - peak_genes)
        both = len(peak_genes & rip_genes)

        if options.clip_fpkm_file:
            sys.stderr.write('Ignoring silent genes for hypergeometric test\n')

        # hypergeom.sf(k, M, n, N): M = population size, n = number of success
        # states, N = number of draws.  sf(both-1, ...) is P(overlap >= both).
        # The two calls swap the roles of the peak and RIP gene sets.
        p1 = hypergeom.sf(both-1, len(gtf_genes), len(peak_genes), len(rip_genes))
        p2 = hypergeom.sf(both-1, len(gtf_genes), len(rip_genes), len(peak_genes))

        cols = (p1, p2, both, clip_only, rip_only, len(peak_genes), len(rip_genes), len(gtf_genes))
        with open('%s_hyper.txt' % options.output_pre, 'w') as hyper_out:
            hyper_out.write('%7.2e  %7.2e  %5d  %5d  %5d  %5d  %5d %5d\n' % cols)

        if clip_only > 0 and rip_only > 0:
            plt.figure()
            venn2(subsets=(clip_only, rip_only, both), set_labels=['CLIP', 'fRIP'], set_colors=['#e41a1c', '#A1A838'])
            plt.savefig('%s_venn.pdf' % options.output_pre)
            # release the figure once it is on disk
            plt.close()

    finally:
        ##################################################
        # clean
        ##################################################
        # remove the temporary single-gene GTF even if the analysis failed
        if single_gtf_file is not None:
            os.close(single_gtf_fd)
            os.remove(single_gtf_file)
Exemplo n.º 15
0
def main():
    """Compare SAD scores of the excluded-region SNPs against shuffled SNPs.

    Usage: %prog [options] <vcf_file> <excl_bed_file> <model_file>

    The exclusion BED file is optionally merged with the -e regions, the VCF
    is passed through exclude_vcf() with it, and SAD scores are computed for
    the resulting SNPs.  The SNPs are then shuffled -s times and re-scored.
    For every target (all model targets, or those listed in the -t table)
    the true and shuffled score distributions are plotted and compared.

    Files written by this function into the -o directory:
      excl.bed               merged exclusion regions (first three columns)
      <target>_cdf.pdf       cumulative histograms of true vs. shuffled SAD
      <target>_qq.pdf        quantile-quantile plot of true vs. shuffled SAD
      mannwhitney.txt        one Mann-Whitney summary line per target

    The helper functions additionally receive VCF/SAD paths inside the same
    directory (excl.vcf, shuf<i>.vcf, ..._sad).
    """
    usage = 'usage: %prog [options] <vcf_file> <excl_bed_file> <model_file>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='cuda', default=False, action='store_true', help='Run on GPU [Default: %default]')
    parser.add_option('-e', dest='add_excl_bed', default='%s/assembly/hg19_gaps.bed'%os.environ['HG19'], help='Additional genomic regions to exclude from the shuffle [Default: %default]')
    parser.add_option('-f', dest='genome_fasta', default='%s/assembly/hg19.fa'%os.environ['HG19'], help='Genome FASTA [Default: %default]')
    parser.add_option('-g', dest='genome_file', default='%s/assembly/human.hg19.core.genome'%os.environ['HG19'], help='Genome file for shuffling [Default: %default]')
    parser.add_option('-l', dest='seq_len', type='int', default=600, help='Sequence length provided to the model [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='sad_shuffle', help='Output directory')
    parser.add_option('-r', dest='replot', default=False, action='store_true', help='Re-plot only, without re-computing [Default: %default]')
    parser.add_option('-s', dest='num_shuffles', default=1, type='int', help='Number of SNP shuffles [Default: %default]')
    parser.add_option('-t', dest='targets_file', default=None, help='Target index, sample name table for targets to plot [Default: %default]')
    (options,args) = parser.parse_args()

    # parser.error() prints the message and exits
    if len(args) != 3:
        parser.error('Must provide VCF file, excluded BED file, and model file')
    vcf_file, excl_bed_file, model_file = args

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    #########################################
    # supplement the excluded sites
    #########################################
    if options.add_excl_bed is not None:
        supp_excl_bed_file = '%s/excl.bed' % options.out_dir
        with open(supp_excl_bed_file, 'w') as supp_excl_bed_out:
            # copy the exclusion BED file, then add on the additional sites;
            # only the first three columns of each line are kept
            for bed_file in (excl_bed_file, options.add_excl_bed):
                with open(bed_file) as bed_in:
                    for line in bed_in:
                        a = line.split()
                        print('\t'.join(a[:3]), file=supp_excl_bed_out)

        excl_bed_file = supp_excl_bed_file

    #########################################
    # compute SAD
    #########################################
    # filter VCF to excluded SNPs
    excl_vcf_file = '%s/excl.vcf' % options.out_dir
    if not options.replot:
        exclude_vcf(vcf_file, excl_bed_file, excl_vcf_file)

    # compute SADs; indexed below as [SNP, target]
    true_sad = compute_sad(excl_vcf_file, model_file, '%s/excl_sad'%options.out_dir, options.seq_len, options.cuda, options.replot)

    #########################################
    # compute shuffled SAD
    #########################################
    # one [SNP, target] slab per shuffle
    shuffle_sad = np.zeros((true_sad.shape[0],true_sad.shape[1],options.num_shuffles))

    # open reference genome; closed again as soon as the shuffles are done
    genome_open = pysam.Fastafile(options.genome_fasta)
    try:
        for ni in range(options.num_shuffles):
            # shuffle the SNPs
            shuf_vcf_file = '%s/shuf%d.vcf' % (options.out_dir, ni)
            shuffle_snps(excl_vcf_file, shuf_vcf_file, excl_bed_file, options.genome_file, genome_open)

            # compute SAD scores for shuffled SNPs
            shuffle_sad[:,:,ni] = compute_sad(shuf_vcf_file, model_file, '%s/shuf%d_sad'%(options.out_dir,ni), options.seq_len, options.cuda, options.replot)
    finally:
        genome_open.close()

    # compute shuffle means
    shuffle_sad_mean = shuffle_sad.mean(axis=2)

    #########################################
    # stats and plots
    #########################################
    # target index -> label used in file names and the stats table
    targets = {}
    if options.targets_file:
        with open(options.targets_file) as targets_in:
            for line in targets_in:
                a = line.split()
                targets[int(a[0])] = a[1]
    else:
        for ti in range(true_sad.shape[1]):
            targets[ti] = 't%d' % ti

    # plot defaults
    sns.set(font_scale=1.5, style='ticks')
    sns_colors = sns.color_palette('deep')

    # shared probability grid (at most 10000 points) for the Q-Q plots
    quantile_probs = np.linspace(0,1,min(10000,true_sad.shape[0]))

    with open('%s/mannwhitney.txt' % options.out_dir, 'w') as mw_out:
        for ti in targets:
            # plot CDFs
            # NOTE(review): `normed` was removed from hist() in matplotlib 3.1;
            # switch to density=True once the minimum supported matplotlib allows it.
            plt.figure()
            plt.hist(true_sad[:,ti], 1000, normed=1, histtype='step', cumulative=True, color=sns_colors[0], linewidth=1, label='SNPs')
            plt.hist(shuffle_sad[:,ti,:].flatten(), 1000, normed=1, histtype='step', cumulative=True, color=sns_colors[2], linewidth=1, label='Shuffle')
            ax = plt.gca()
            ax.grid(True, linestyle=':')
            ax.set_xlim(-.15, .15)
            plt.legend()
            plt.savefig('%s/%s_cdf.pdf' % (options.out_dir,targets[ti]))
            plt.close()

            # plot Q-Q
            true_q = mquantiles(true_sad[:,ti], quantile_probs)
            shuf_q = mquantiles(shuffle_sad_mean[:,ti], quantile_probs)
            plt.figure()
            plt.scatter(true_q, shuf_q, color=sns_colors[0])
            pmin = 1.05*min(true_q[0], shuf_q[0])
            pmax = 1.05*max(true_q[-1], shuf_q[-1])
            # y = x reference line
            plt.plot([pmin,pmax], [pmin,pmax], color='black', linewidth=1)
            ax = plt.gca()
            ax.set_xlim(pmin,pmax)
            ax.set_ylim(pmin,pmax)
            ax.set_xlabel('True SAD')
            ax.set_ylabel('Shuffled SAD')
            ax.grid(True, linestyle=':')
            plt.savefig('%s/%s_qq.pdf' % (options.out_dir,targets[ti]))
            plt.close()

            # compute Mann-Whitney
            # NOTE(review): the first return value is named z here, but
            # mannwhitneyu implementations typically return the U statistic
            # -- confirm against the imported `stats` module.
            mw_z, mw_p = stats.mannwhitneyu(true_sad[:,ti], shuffle_sad[:,ti,:].flatten())
            cols = (ti, targets[ti], true_sad.shape[0], true_sad[:,ti].mean(), shuffle_sad[:,ti,:].mean(), mw_z, mw_p)
            print('%3d  %20s  %5d  %7.4f  %7.4f  %6.2f  %6.1e' % cols, file=mw_out)
Exemplo n.º 16
0
# Smoke-test fragment: print the result of each `stats` test function twice,
# first for the inputs l / m and then for the inputs a / b.
# NOTE(review): l, m, a and b are defined before this excerpt -- presumably
# list and array versions of the same data; confirm against the full script.
# NOTE(review): wilcoxont and kruskalwallish are not scipy.stats names, so
# `stats` is presumably a standalone stats module -- confirm the import.

# one-sample t-test; the second argument (12) is presumably the reference mean
print(stats.ttest_1samp(l,12))
print(stats.ttest_1samp(a,12))
# two-sample t-test, independent samples
print('ttest_ind:')
print(stats.ttest_ind(l,m))
print(stats.ttest_ind(a,b))
# two-sample t-test, related (paired) samples
print('ttest_rel:')
print(stats.ttest_rel(l,m))
print(stats.ttest_rel(a,b))
# chi-square test on a single sample
print('chisquare:')
print(stats.chisquare(l))
print(stats.chisquare(a))
# two-sample Kolmogorov-Smirnov test
print('ks_2samp:')
print(stats.ks_2samp(l,m))
print(stats.ks_2samp(a,b))
# Mann-Whitney U test
print('mannwhitneyu:')
print(stats.mannwhitneyu(l,m))
print(stats.mannwhitneyu(a,b))
# rank-sums test
print('ranksums:')
print(stats.ranksums(l,m))
print(stats.ranksums(a,b))
# Wilcoxon test on the two samples
print('wilcoxont:')
print(stats.wilcoxont(l,m))
print(stats.wilcoxont(a,b))
# Kruskal-Wallis test on three groups (the first sample is reused as the third)
print('kruskalwallish:')
print(stats.kruskalwallish(l,m,l))
# sample sizes, printed between the two Kruskal-Wallis calls
print(len(l), len(m))
print(stats.kruskalwallish(a,b,a))
# Friedman chi-square test on three groups (first sample reused as the third)
print('friedmanchisquare:')
print(stats.friedmanchisquare(l,m,l))
print(stats.friedmanchisquare(a,b,a))
# rebind l to the integers 1..20 for whatever follows this excerpt
l = range(1,21)
Exemplo n.º 17
0
def main():
    """Compare SAD scores of the excluded-region SNPs against shuffled SNPs.

    Usage: %prog [options] <vcf_file> <excl_bed_file> <model_file>

    The exclusion BED file is optionally merged with the -e regions, the VCF
    is passed through exclude_vcf() with it, and SAD scores are computed for
    the resulting SNPs.  The SNPs are then shuffled -s times and re-scored.
    For every target (all model targets, or those listed in the -t table)
    the true and shuffled score distributions are plotted and compared.

    Files written by this function into the -o directory:
      excl.bed               merged exclusion regions (first three columns)
      <target>_cdf.pdf       cumulative histograms of true vs. shuffled SAD
      <target>_qq.pdf        quantile-quantile plot of true vs. shuffled SAD
      mannwhitney.txt        one Mann-Whitney summary line per target

    The helper functions additionally receive VCF/SAD paths inside the same
    directory (excl.vcf, shuf<i>.vcf, ..._sad).
    """
    usage = 'usage: %prog [options] <vcf_file> <excl_bed_file> <model_file>'
    parser = OptionParser(usage)
    parser.add_option('-c',
                      dest='cuda',
                      default=False,
                      action='store_true',
                      help='Run on GPU [Default: %default]')
    parser.add_option(
        '-e',
        dest='add_excl_bed',
        default='%s/assembly/hg19_gaps.bed' % os.environ['HG19'],
        help=
        'Additional genomic regions to exclude from the shuffle [Default: %default]'
    )
    parser.add_option('-f',
                      dest='genome_fasta',
                      default='%s/assembly/hg19.fa' % os.environ['HG19'],
                      help='Genome FASTA [Default: %default]')
    parser.add_option('-g',
                      dest='genome_file',
                      default='%s/assembly/human.hg19.core.genome' %
                      os.environ['HG19'],
                      help='Genome file for shuffling [Default: %default]')
    parser.add_option(
        '-l',
        dest='seq_len',
        type='int',
        default=600,
        help='Sequence length provided to the model [Default: %default]')
    parser.add_option('-o',
                      dest='out_dir',
                      default='sad_shuffle',
                      help='Output directory')
    parser.add_option(
        '-r',
        dest='replot',
        default=False,
        action='store_true',
        help='Re-plot only, without re-computing [Default: %default]')
    parser.add_option('-s',
                      dest='num_shuffles',
                      default=1,
                      type='int',
                      help='Number of SNP shuffles [Default: %default]')
    parser.add_option(
        '-t',
        dest='targets_file',
        default=None,
        help=
        'Target index, sample name table for targets to plot [Default: %default]'
    )
    (options, args) = parser.parse_args()

    # parser.error() prints the message and exits
    if len(args) != 3:
        parser.error(
            'Must provide VCF file, excluded BED file, and model file')
    vcf_file, excl_bed_file, model_file = args

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    #########################################
    # supplement the excluded sites
    #########################################
    if options.add_excl_bed is not None:
        supp_excl_bed_file = '%s/excl.bed' % options.out_dir
        with open(supp_excl_bed_file, 'w') as supp_excl_bed_out:
            # copy the exclusion BED file, then add on the additional sites;
            # only the first three columns of each line are kept
            for bed_file in (excl_bed_file, options.add_excl_bed):
                with open(bed_file) as bed_in:
                    for line in bed_in:
                        a = line.split()
                        print('\t'.join(a[:3]), file=supp_excl_bed_out)

        excl_bed_file = supp_excl_bed_file

    #########################################
    # compute SAD
    #########################################
    # filter VCF to excluded SNPs
    excl_vcf_file = '%s/excl.vcf' % options.out_dir
    if not options.replot:
        exclude_vcf(vcf_file, excl_bed_file, excl_vcf_file)

    # compute SADs; indexed below as [SNP, target]
    true_sad = compute_sad(excl_vcf_file, model_file,
                           '%s/excl_sad' % options.out_dir, options.seq_len,
                           options.cuda, options.replot)

    #########################################
    # compute shuffled SAD
    #########################################
    # one [SNP, target] slab per shuffle
    shuffle_sad = np.zeros(
        (true_sad.shape[0], true_sad.shape[1], options.num_shuffles))

    # open reference genome; closed again as soon as the shuffles are done
    genome_open = pysam.Fastafile(options.genome_fasta)
    try:
        for ni in range(options.num_shuffles):
            # shuffle the SNPs
            shuf_vcf_file = '%s/shuf%d.vcf' % (options.out_dir, ni)
            shuffle_snps(excl_vcf_file, shuf_vcf_file, excl_bed_file,
                         options.genome_file, genome_open)

            # compute SAD scores for shuffled SNPs
            shuffle_sad[:, :, ni] = compute_sad(
                shuf_vcf_file, model_file,
                '%s/shuf%d_sad' % (options.out_dir, ni), options.seq_len,
                options.cuda, options.replot)
    finally:
        genome_open.close()

    # compute shuffle means
    shuffle_sad_mean = shuffle_sad.mean(axis=2)

    #########################################
    # stats and plots
    #########################################
    # target index -> label used in file names and the stats table
    targets = {}
    if options.targets_file:
        with open(options.targets_file) as targets_in:
            for line in targets_in:
                a = line.split()
                targets[int(a[0])] = a[1]
    else:
        for ti in range(true_sad.shape[1]):
            targets[ti] = 't%d' % ti

    # plot defaults
    sns.set(font_scale=1.5, style='ticks')
    sns_colors = sns.color_palette('deep')

    # shared probability grid (at most 10000 points) for the Q-Q plots
    quantile_probs = np.linspace(0, 1, min(10000, true_sad.shape[0]))

    with open('%s/mannwhitney.txt' % options.out_dir, 'w') as mw_out:
        for ti in targets:
            # plot CDFs
            # NOTE(review): `normed` was removed from hist() in matplotlib
            # 3.1; switch to density=True once the minimum supported
            # matplotlib allows it.
            plt.figure()
            plt.hist(true_sad[:, ti],
                     1000,
                     normed=1,
                     histtype='step',
                     cumulative=True,
                     color=sns_colors[0],
                     linewidth=1,
                     label='SNPs')
            plt.hist(shuffle_sad[:, ti, :].flatten(),
                     1000,
                     normed=1,
                     histtype='step',
                     cumulative=True,
                     color=sns_colors[2],
                     linewidth=1,
                     label='Shuffle')
            ax = plt.gca()
            ax.grid(True, linestyle=':')
            ax.set_xlim(-.15, .15)
            plt.legend()
            plt.savefig('%s/%s_cdf.pdf' % (options.out_dir, targets[ti]))
            plt.close()

            # plot Q-Q
            true_q = mquantiles(true_sad[:, ti], quantile_probs)
            shuf_q = mquantiles(shuffle_sad_mean[:, ti], quantile_probs)
            plt.figure()
            plt.scatter(true_q, shuf_q, color=sns_colors[0])
            pmin = 1.05 * min(true_q[0], shuf_q[0])
            pmax = 1.05 * max(true_q[-1], shuf_q[-1])
            # y = x reference line
            plt.plot([pmin, pmax], [pmin, pmax], color='black', linewidth=1)
            ax = plt.gca()
            ax.set_xlim(pmin, pmax)
            ax.set_ylim(pmin, pmax)
            ax.set_xlabel('True SAD')
            ax.set_ylabel('Shuffled SAD')
            ax.grid(True, linestyle=':')
            plt.savefig('%s/%s_qq.pdf' % (options.out_dir, targets[ti]))
            plt.close()

            # compute Mann-Whitney
            # NOTE(review): the first return value is named z here, but
            # mannwhitneyu implementations typically return the U statistic
            # -- confirm against the imported `stats` module.
            mw_z, mw_p = stats.mannwhitneyu(true_sad[:, ti],
                                            shuffle_sad[:, ti, :].flatten())
            cols = (ti, targets[ti], true_sad.shape[0],
                    true_sad[:, ti].mean(), shuffle_sad[:, ti, :].mean(),
                    mw_z, mw_p)
            print('%3d  %20s  %5d  %7.4f  %7.4f  %6.2f  %6.1e' % cols,
                  file=mw_out)