Пример #1
0
# Script entry point: compare models on the cached bound-site calls of one
# motif.  Usage (from the argv reads below): <script> <model> <pwmid>.
# NOTE(review): this fragment ends inside the `for width` loop; the rest of
# the loop body is not visible here.
if __name__=="__main__":

    pwmid = sys.argv[2]
    sample = 'NA18505'
    model = sys.argv[1]
    location_file = "/mnt/lustre/home/anilraj/pbm_dnase_profile/cache/%s_0_short_bound.bed.gz"%(pwmid)
    handle = loadutils.ZipFile(location_file)
    locations = handle.read(threshold=11)
    print pwmid, sample, model
    print "read in locations ..."

    # The first character of the motif id picks the PWM database.  Any other
    # prefix leaves `pwms` unbound and the next statement raises NameError.
    if pwmid[0]=='M':
        pwms = loadutils.transfac_pwms()
    elif pwmid[0]=='S':
        pwms = loadutils.selex_pwms()
    # First PWM whose accession matches; raises IndexError when none does.
    motif = [val['motif'] for val in pwms.itervalues() if val['AC']==pwmid][0]
    print "selected motif model ..."

    # Split sites on the integer in their last column (presumably a ChIP-seq
    # read count -- TODO confirm against the file's header).
    bound = [loc for loc in locations if int(loc[-1])>50]
    undecided = [loc for loc in locations if int(loc[-1])>0]
    dnaseobj = loadutils.Dnase(sample=sample)
    # getreads also returns a location list, which replaces `undecided` here.
    reads, undecided, ig = dnaseobj.getreads(undecided, remove_outliers=False, width=200)
    totalreads = reads.sum(1)
    print "extracted total reads ..."

    # NOTE(review): `handle` is rebound to the PDF writer here without the
    # ZipFile opened above having been closed.
    handle = PdfPages('/mnt/lustre/home/anilraj/pbm_dnase_profile/fig/compare_models_%s_%s.pdf'%(model,pwmid))
    for width in [64,128,256]:
        boundreads, ig, ig = dnaseobj.getreads(bound, remove_outliers=True, width=width)
        undecidedreads, locs_tolearn, ig = dnaseobj.getreads(undecided, remove_outliers=True, width=width)
        # Position of every retained site within `undecided`.
        indices = np.array([undecided.index(loc) for loc in locs_tolearn])
Пример #2
0
def infer(pwmid, sample, pwm_thresh=8, pwmbase='transfac', chipseq=False):

    import centipede_pbm as centipede

    model = 'modelC'
    if pwmbase == 'transfac':
        pwms = loadutils.transfac_pwms()
    elif pwmbase == 'selex':
        pwms = loadutils.selex_pwms()
    motif = [val['motif'] for val in pwms.itervalues()
             if val['AC'] == pwmid][0]

    if sample in [None, 'Gm12878', 'Gm12878All']:
        sequence = loadutils.Sequence(sample)
    else:
        indiv_idx = loadutils.read_individuals()
        sequence = loadutils.Sequence(sample, sample_idx=indiv_idx[sample])

    if sample in ['Gm12878', 'Gm12878All']:
        location_file = "/mnt/lustre/home/anilraj/histmod/cache/%s_locationsGm12878_Q%.1f.txt.gz" % (
            pwmid, dhs)
    else:
        location_file = "/mnt/lustre/home/anilraj/histmod/cache/%s_locations_Q%.1f.txt.gz" % (
            pwmid, dhs)

    # check file size
    pipe = subprocess.Popen("zcat %s | wc -l" % location_file,
                            stdout=subprocess.PIPE,
                            shell=True)
    Ns = int(pipe.communicate()[0].strip())

    # load scores
    alllocations = []
    pwm_cutoff = pwm_thresh + 1
    while len(alllocations) < 100:
        pwm_cutoff = pwm_cutoff - 1
        handle = loadutils.ZipFile(location_file)
        alllocations = handle.read(threshold=pwm_cutoff)
        handle.close()
    print "PWM Cutoff = %d" % pwm_cutoff

    # subsample locations, if too many
    if len(alllocations) > 100000:
        scores = np.array([loc[-1] for loc in alllocations]).astype(float)
        indices = np.argsort(scores)[-100000:]
        alllocations = [alllocations[index] for index in indices]
    print "Num of sites for learning, with pwm threshold of %d for %s = %d" % (
        pwm_thresh, pwmid, len(alllocations))

    if sample in [None, 'Gm12878', 'Gm12878All']:
        locs_tolearn = alllocations
    else:
        # compute scores for specific sample at these locations
        starttime = time.time()
        locs_tolearn = sequence.get_scores(alllocations, motif)
        print len(locs_tolearn), time.time() - starttime

    # filter mappability
    print "filtering out unmappable sites ..."
    locs_tolearn = sequence.filter_mappability(locs_tolearn,
                                               width=max([200, L / 2]))

    # load reads and locations
    print "loading dnase reads ..."
    readobj = loadutils.Dnase(sample=sample)
    dnasereads, locs_tolearn, subscores = readobj.getreads(
        locs_tolearn, remove_outliers=True, width=max([200, L / 2]))
    subscores = np.array(subscores)
    subscores = subscores.reshape(subscores.size, 1)
    dnasetotal = dnasereads.sum(1)
    print "Num of mappable sites for learning for %s = %d" % (
        pwmid, len(locs_tolearn))

    if chipseq:
        chipobj = loadutils.ChipSeq('Gm12878', loadutils.factormap[pwmid])
        chipreads = chipobj.get_total_reads(locs_tolearn, width=200)
        chipobj.close()
    else:
        chipreads = None

    if L < 400:
        dnasereads = np.hstack((dnasereads[:, 100 - L / 4:100 + L / 4],
                                dnasereads[:, 300 - L / 4:300 + L / 4]))

    locs_tolearn = [list(loc) for loc in locs_tolearn]
    footprints = []
    priors = []
    negbins = []
    posteriors = []

    null = np.ones((1, L), dtype=float) * 1. / L
    posterior, footprint, negbinparams, prior = centipede.EM(dnasereads,
                                                             dnasetotal,
                                                             subscores,
                                                             null,
                                                             model=model,
                                                             restarts=2)

    posteriors.append(posterior)
    footprints.append(footprint)
    negbins.append(negbinparams)
    priors.append(prior)

    chipobj = loadutils.ChipSeq('Gm12878', loadutils.factormap[pwmid])
    controlobj = loadutils.ChipSeq('Gm12878', loadutils.controlmap[pwmid])
    chipreads = chipobj.get_total_reads(locs_tolearn, width=400)
    controlreads = controlobj.get_total_reads(locs_tolearn, width=200)
    chipobj.close()
    controlobj.close()
    for posterior in posteriors:
        logodds = np.log(posterior[:, 1] / posterior[:, 0])
        logodds[logodds == np.inf] = logodds[logodds != np.inf].max()
        logodds[logodds == -np.inf] = logodds[logodds != -np.inf].min()
        R = stats.pearsonr(logodds, np.sqrt(chipreads))
        R2 = stats.pearsonr(np.sqrt(dnasetotal), np.sqrt(chipreads))

        handle = open(
            '/mnt/lustre/home/anilraj/histmod/cache/chipseq_peaks/%s_peaks.bed'
            % loadutils.factormap[pwmid], 'r')
        calls = [line.strip().split()[:3] for line in handle]
        handle.close()
        macs = dict([(chrom, []) for chrom in utils.chromosomes[:22]])
        [
            macs[call[0]].append([int(call[1]), int(call[2])])
            for call in calls if call[0] in utils.chromosomes[:22]
        ]

        bsites = [
            locs_tolearn[i] for i, p in enumerate(posterior[:, 1]) if p > 0.99
        ]
        F, precision, sensitivity, ig = Fscore.Fscore(bsites, macs)
        chipauc, tpr, positive, negative = compute_chip_auc(
            chipreads, controlreads, logodds, macs, locs_tolearn)
        print pwmid, model, sample, R, R2, chipauc, tpr, F, precision, sensitivity

    output = {'footprint': footprints, \
            'negbin': negbins, \
            'prior': priors, \
            'posterior': posteriors, \
            'locations': locs_tolearn}

    if sample is None:
        handle = open(
            "%s/cache/combined/pbmcentipede_%s_%s.pkl" %
            (projpath, model, pwmid), 'w')
    else:
        handle = open(
            "%s/cache/separate/pbmcentipede_%s_%s_%s.pkl" %
            (projpath, model, pwmid, sample), 'w')
    cPickle.Pickler(handle, protocol=2).dump(output)
    handle.close()

    readobj.close()
    sequence.close()
Пример #3
0
def decode(pwmid,
           sample,
           cutk=0,
           pwmbase='transfac',
           pos_threshold=np.log10(99),
           chipseq=False):

    import centipede
    import millipede
    import centipede_pbm as pbmcentipede

    if sample in [None, 'Gm12878', 'Gm12878All']:
        sequence = loadutils.Sequence(sample)
    else:
        indiv_idx = loadutils.read_individuals()
        if pwmbase == 'transfac':
            pwms = loadutils.transfac_pwms()
        elif pwmbase == 'selex':
            pwms = loadutils.selex_pwms()
        motif = [
            val['motif'] for val in pwms.itervalues() if val['AC'] == pwmid
        ][0]
        sequence = loadutils.Sequence(sample, sample_idx=indiv_idx[sample])

    if cutk != 0:
        sequence.set_cutrate(sample=sample, k=cutk)

    # use output from Centipede run
    # 0 = Py code, 1 = R code
    if sample is None:
        handle = open(
            "%s/cache/combined/pbmcentipede_short_%s.pkl" % (projpath, pwmid),
            'r')
    else:
        handle = open(
            "%s/cache/separate/pbmcentipede_short_%s_%s.pkl" %
            (projpath, pwmid, sample), 'r')
    output = cPickle.load(handle)
    handle.close()
    if cutk == 0:
        idx = 0
    elif cutk == 2:
        idx = 1
    elif cutk == 4:
        idx = 2
    footprint = output['footprint'][idx]
    negbinparams = output['negbin'][idx]
    prior = output['prior'][idx][0]
    dhsprior = output['prior'][idx][1]

    if sample in ['Gm12878', 'Gm12878All']:
        location_file = "%s/cache/%s_locationsGm12878_Q%.1f.txt.gz" % (
            projpath, pwmid, dhs)
    else:
        location_file = "%s/cache/%s_locations_Q%.1f.txt.gz" % (projpath,
                                                                pwmid, dhs)

    # check file size
    pipe = subprocess.Popen("zcat %s | wc -l" % location_file,
                            stdout=subprocess.PIPE,
                            shell=True)
    Ns = int(pipe.communicate()[0].strip())

    try:
        chipobj = loadutils.ChipSeq('Gm12878', loadutils.factormap[pwmid])
    except:
        pass

    readobj = loadutils.Dnase(sample=sample)
    readhandle = loadutils.ZipFile(location_file)
    loops = Ns / batch

    if sample is None:
        handle = gzip.open(
            "%s/cache/combined/%s_short_bound.bed.gz" % (projpath, pwmid),
            'wb')
    else:
        handle = gzip.open(
            "%s/cache/separate/%s_%d_%s_short_bound.bed.gz" %
            (projpath, pwmid, cutk, sample), 'wb')
    towrite = [
        'Chr', 'Start', 'Stop', 'Strand', 'PwmScore', 'LogPosOdds',
        'LogPriorOdds', 'MultLikeRatio', 'NegBinLikeRatio', 'ChipseqReads'
    ]
    handle.write('\t'.join(towrite) + '\n')

    totalreads = []
    for n in xrange(loops):
        starttime = time.time()
        # read locations from file
        locations = readhandle.read(chunk=batch)
        if sample not in [None, 'Gm12878', 'Gm12878All']:
            # compute scores at locations for specific sample
            locations = sequence.get_scores(locations, motif)
        locations = sequence.filter_mappability(locations,
                                                width=max([200, L / 2]))

        # read in Dnase read data for locations
        dnasereads, locations, subscores = readobj.getreads(locations,
                                                            width=max(
                                                                [200, L / 2]))
        subscores = np.array(subscores).astype('float')
        subscores = subscores.reshape(subscores.size, 1)
        dnasetotal = dnasereads.sum(1)
        print len(locations)

        if chipseq:
            chipreads = chipobj.getreads(locations, width=max([200, L / 2]))
        else:
            chipreads = None

        # set null footprint distribution
        if cutk == 0:
            null = np.ones((1, L), dtype=float) / L
        else:
            null = sequence.getnull(locations, width=L / 2)

        if L < 400:
            dnasereads = np.hstack((dnasereads[:, 100 - L / 4:100 + L / 4],
                                    dnasereads[:, 300 - L / 4:300 + L / 4]))


#        if cutk==0:
        logodds = centipede.decode(dnasereads,
                                   dnasetotal,
                                   null,
                                   subscores,
                                   footprint,
                                   negbinparams[0],
                                   negbinparams[1],
                                   prior,
                                   dhsprior,
                                   chipreads=chipreads,
                                   damp=damp)
        #        elif cutk==2:
        #            posterior = pbmcentipede.decode(reads, chipreads, subscores, footprint[1:], negbinparams[0], negbinparams[1], prior)

        if not chipseq:
            try:
                chipreads = chipobj.get_total_reads(locations, width=400)
                ignore = [loc.extend(['%.3f'%pos[0],'%.3f'%pos[1],'%.3f'%pos[2],'%.3f'%pos[3],'%d'%c]) \
                    for loc,pos,c in zip(locations,logodds,chipreads)]
            except NameError:
                ignore = [loc.extend(['%.3f'%pos[0],'%.3f'%pos[1],'%.3f'%pos[2],'%.3f'%pos[3]]) \
                    for loc,pos in zip(locations,logodds)]

        locations = [loc for loc in locations if len(loc) > 5]
        ignore = [handle.write('\t'.join(elem) + '\n') for elem in locations]

        print time.time() - starttime

    remain = Ns - loops * batch
    locations = readhandle.read(chunk=remain)
    if sample not in [None, 'Gm12878', 'Gm12878All']:
        # compute scores at locations for specific sample
        locations = sequence.get_scores(locations, motif)
    locations = sequence.filter_mappability(locations, width=max([200, L / 2]))
    dnasereads, locations, subscores = readobj.getreads(locations,
                                                        width=max([200,
                                                                   L / 2]))
    subscores = np.array(subscores)
    subscores = subscores.reshape(subscores.size, 1)
    dnasetotal = dnasereads.sum(1)

    if chipseq:
        chipreads = chipobj.get_total_reads(locations, width=200)
    else:
        chipreads = None

    # set null footprint distribution
    if cutk == 0:
        null = np.ones((1, L), dtype=float) / L
    else:
        null = sequence.getnull(locations, width=L / 2)

    if L < 400:
        dnasereads = np.hstack((dnasereads[:, 100 - L / 4:100 + L / 4],
                                dnasereads[:, 300 - L / 4:300 + L / 4]))

    logodds = centipede.decode(dnasereads,
                               dnasetotal,
                               null,
                               subscores,
                               footprint,
                               negbinparams[0],
                               negbinparams[1],
                               prior,
                               dhsprior,
                               chipreads=chipreads,
                               damp=damp)

    if not chipseq:
        try:
            chipreads = chipobj.get_total_reads(locations, width=400)
            ignore = [loc.extend(['%.3f'%pos[0],'%.3f'%pos[1],'%.3f'%pos[2],'%.3f'%pos[3],'%d'%c]) \
                for loc,pos,c in zip(locations,logodds,chipreads)]
        except NameError:
            ignore = [loc.extend(['%.3f'%pos[0],'%.3f'%pos[1],'%.3f'%pos[2],'%.3f'%pos[3]]) \
                for loc,pos in zip(locations,logodds)]
    locations = [loc for loc in locations if len(loc) > 5]
    ignore = [handle.write('\t'.join(elem) + '\n') for elem in locations]

    readobj.close()
    chipobj.close()
    readhandle.close()
    handle.close()

    sequence.close()
Пример #4
0
def _finite_logodds(posterior):
    """Return log(posterior[:, 1] / posterior[:, 0]) with infinities clipped.

    +inf entries are replaced by the largest remaining value and -inf entries
    by the smallest remaining value.
    """
    logodds = np.log(posterior[:, 1] / posterior[:, 0])
    logodds[logodds == np.inf] = logodds[logodds != np.inf].max()
    logodds[logodds == -np.inf] = logodds[logodds != -np.inf].min()
    return logodds


def _load_pickle(path):
    """Load one pickled result file, closing it even if loading fails."""
    with open(path, 'rb') as handle:
        return cPickle.load(handle)


def plotmodel(pwmid, sample=None, pwmbase='transfac'):
    """Plot and tabulate a comparison of fitted models for one motif.

    Loads the pickled 'modelA' and 'modelB' PBM-Centipede fits and three
    Centipede result files (plain, damped, no-footprint), then writes

    * a footprint PDF (footprints, AUC curves, and a scatter of the pi and
      gamma parameters of the two PBM models),
    * a PDF of log-odds versus sqrt(ChIP-seq reads) per model, and
    * a text file of per-model statistics.

    The function relies on module-level names defined outside this block:
    ``L``, ``projpath``, ``loadutils``, ``utils``, ``stats``, ``viz`` and
    ``compute_chip_auc``.

    Parameters:
        pwmid: accession ('AC' field) of the motif.
        sample: sample name, or ``None`` for the combined results.
        pwmbase: 'transfac' or 'selex', the PWM collection to search.

    Raises:
        ValueError: if ``pwmbase`` is not one of the supported collections.
    """
    import centipede_pbm as centipede
    from matplotlib.backends.backend_pdf import PdfPages

    if pwmbase == 'transfac':
        pwms = loadutils.transfac_pwms()
    elif pwmbase == 'selex':
        pwms = loadutils.selex_pwms()
    else:
        raise ValueError("unknown pwmbase %r (expected 'transfac' or 'selex')"
                         % (pwmbase,))

    models = ['modelA', 'modelB']
    meanfootprints = []
    stdfootprints = []
    Logodds = []

    # Peak intervals per chromosome, limited to the first 22 entries of
    # utils.chromosomes.
    chromosomes = utils.chromosomes[:22]
    macs = dict((chrom, []) for chrom in chromosomes)
    with open(
            '/mnt/lustre/home/anilraj/histmod/cache/chipseq_peaks/%s_peaks.bed'
            % loadutils.factormap[pwmid], 'r') as peakhandle:
        for line in peakhandle:
            call = line.strip().split()[:3]
            # skip blank lines and chromosomes outside the selected set
            if call and call[0] in chromosomes:
                macs[call[0]].append([int(call[1]), int(call[2])])

    if sample is None:
        statsfile = "%s/fig/stats_short_%s.txt" % (projpath, pwmid)
    else:
        statsfile = "%s/fig/stats_short_%s_%s.txt" % (projpath, pwmid, sample)

    pis = []
    gammas = []
    outhandle = open(statsfile, 'w')

    # The read cascade is built once, from the locations of the first model.
    cascade = None
    for model in models:
        if sample is None:
            picklefile = "%s/cache/combined/pbmcentipede_%s_short_%s.pkl" % (
                projpath, model, pwmid)
        else:
            picklefile = "%s/cache/separate/pbmcentipede_%s_short_%s_%s.pkl" % (
                projpath, model, pwmid, sample)
        output = _load_pickle(picklefile)
        footparams = output['footprint'][0]
        alpha, tau = output['negbin'][0]
        posterior = output['posterior'][0]
        Logodds.append(_finite_logodds(posterior))
        means = alpha * (1 - tau) / tau
        outhandle.write('%.2f %.2f\n' % (means[0], means[1]))

        if cascade is None:
            locs_tolearn = output['locations']
            dnaseobj = loadutils.Dnase(sample=sample)
            dnasereads, ig, ig = dnaseobj.getreads(locs_tolearn,
                                                   width=max([200, L // 2]))
            if L < 400:
                # keep L/2 columns centred on columns 100 and 300
                quarter = L // 4
                reads = np.hstack((dnasereads[:, 100 - quarter:100 + quarter],
                                   dnasereads[:, 300 - quarter:300 + quarter]))
            else:
                reads = dnasereads
            dnasereads = dnasereads.sum(1)
            dnaseobj.close()

            cascade = centipede.Cascade(L)
            cascade.setreads(reads)
            del reads

        # The two models store gamma / pi at different positions of the
        # footprint parameters and take a different extra argument.
        if model == 'modelA':
            gamma, pi = footparams[0], footparams[1]
            extra = {'B': footparams[2]}
        elif model == 'modelB':
            gamma, pi = footparams[1], footparams[2]
            extra = {'mu': footparams[3]}
        gammas.append(gamma)
        if isinstance(pi, centipede.Pi):
            pi = pi.estim
        pis.append(pi)
        M1, M2 = centipede.bayes_optimal_estimator(cascade,
                                                   posterior,
                                                   pi,
                                                   model=model,
                                                   **extra)
        meanfootprints.append(M1.inverse_transform())
        # standard deviations are not plotted
        stdfootprints.append(None)

    chipobj = loadutils.ChipSeq('Gm12878', loadutils.factormap[pwmid])
    controlobj = loadutils.ChipSeq('Gm12878', loadutils.controlmap[pwmid])
    chipreads = chipobj.get_total_reads(locs_tolearn, width=200)
    controlreads = controlobj.get_total_reads(locs_tolearn, width=200)
    chipobj.close()
    controlobj.close()

    sqrt_chip = np.sqrt(chipreads)
    sqrt_control = np.sqrt(controlreads)
    sqrt_dnase = np.sqrt(dnasereads)

    corrC = stats.pearsonr(sqrt_dnase, sqrt_chip)
    corrD = stats.pearsonr(sqrt_dnase, sqrt_control)

    # Centipede baselines; the no-footprint variant contributes log-odds only.
    # NOTE(review): these paths embed `sample` even when it is None --
    # confirm the combined case is supported here.
    baselines = [('centipede_short', True),
                 ('centipede_damped_short', True),
                 ('centipede_nofoot_short', False)]
    for prefix, has_footprint in baselines:
        output = _load_pickle(
            "/mnt/lustre/home/anilraj/histmod/cache/separate/%s_%s_%s.pkl"
            % (prefix, pwmid, sample))
        Logodds.append(_finite_logodds(output['posterior'][0]))
        if has_footprint:
            meanfootprints.append(output['footprint'][0])
            stdfootprints.append(None)

    key = [k for k, pwm in pwms.iteritems() if pwm['AC'] == pwmid][0]
    if sample is None:
        title = pwms[key]['NA']
        footprintfile = "%s/fig/footprint_short_%s.pdf" % (projpath, pwmid)
        corrfile = "%s/fig/logoddsCorr_short_%s.pdf" % (projpath, pwmid)
    else:
        title = "%s / %s" % (pwms[key]['NA'], sample)
        footprintfile = "%s/fig/footprint_short_%s_%s.pdf" % (projpath, pwmid,
                                                              sample)
        corrfile = "%s/fig/logoddsCorr_short_%s_%s.pdf" % (projpath, pwmid,
                                                           sample)

    models = [
        'CentipedePBM_M1', 'CentipedePBM_M2', 'Centipede', 'CentipedeDamped'
    ]
    # plot footprints
    pdfhandle = PdfPages(footprintfile)
    figure = viz.plot_footprint(meanfootprints,
                                labels=models,
                                stderr=stdfootprints,
                                motif=pwms[key]['motif'],
                                title=title)
    pdfhandle.savefig(figure)

    # the AUC plot additionally covers the no-footprint log-odds
    models.append('CentipedeNoFoot')
    auc, tpr, positive, negative = compute_chip_auc(chipreads, controlreads,
                                                    Logodds[0], macs,
                                                    locs_tolearn)
    figure = viz.plot_auc(Logodds,
                          positive,
                          negative,
                          labels=models,
                          title=title)
    pdfhandle.savefig(figure)

    # Scatter of modelA vs modelB parameters.  The marker area halves with
    # each index and only index 0 carries a legend label.
    T = pis[0].size
    figure = viz.plot.figure()
    subplot = figure.add_subplot(111)
    for i in xrange(T):
        subplot.scatter(gammas[0].value[i],
                        gammas[1].value[i],
                        s=2**(T - i),
                        marker='o',
                        color=viz.colors[1],
                        label='gamma' if i == 0 else '_nolabel_',
                        alpha=0.5)
        subplot.scatter(pis[0][i],
                        pis[1][i],
                        s=2**(T - i),
                        marker='o',
                        color=viz.colors[0],
                        label='pi' if i == 0 else '_nolabel_',
                        alpha=0.5)
    xmin = min([pis[0].min(), pis[1].min()]) - 0.05
    xmax = max([pis[0].max(), pis[1].max()]) + 0.05
    subplot.axis([xmin, xmax, xmin, xmax])
    subplot.set_xlabel('PBM_M1')
    subplot.set_ylabel('PBM_M2')
    legend = subplot.legend(loc=1)
    for text in legend.texts:
        text.set_fontsize('8')
    legend.set_frame_on(False)
    pdfhandle.savefig(figure)
    pdfhandle.close()

    pdfhandle = PdfPages(corrfile)
    lo = 0
    for logodds, model in zip(Logodds, models):
        auc, tpr, positive, negative = compute_chip_auc(
            chipreads, controlreads, logodds, macs, locs_tolearn)
        above = logodds > lo
        corrA = stats.pearsonr(logodds, sqrt_chip)
        corrB = stats.pearsonr(logodds, sqrt_control)
        # same correlations restricted to sites with log-odds above `lo`
        corra = stats.pearsonr(logodds[above], sqrt_chip[above])
        corrb = stats.pearsonr(logodds[above], sqrt_control[above])
        corrc = stats.pearsonr(sqrt_dnase[above], sqrt_chip[above])
        corrd = stats.pearsonr(sqrt_dnase[above], sqrt_control[above])
        towrite = [
            pwmid, model, corrA, corrB, corrC, corrD, corra, corrb, corrc,
            corrd, auc, tpr, logodds.size, (logodds > np.log(99)).sum()
        ]
        outhandle.write(' '.join(map(str, towrite)) + '\n')
        figure = viz.plot_correlation(sqrt_chip, logodds, title=model)
        pdfhandle.savefig(figure)

    figure = viz.plot_correlation(sqrt_chip,
                                  sqrt_dnase,
                                  xlabel='sqrt(dnase reads)',
                                  title='Total Dnase reads')
    pdfhandle.savefig(figure)
    pdfhandle.close()
    outhandle.close()
Пример #5
0
def plotbound(pwmid, sample=None, cutk=0, pwmbase='transfac'):

    import random
    from matplotlib.backends.backend_pdf import PdfPages

    bounds = [(1, 5), (5, 9), (9, 13), (13, np.inf)]
    labels = ['1 - 5', '5 - 9', '9 - 13', '>13']

    if pwmbase == 'transfac':
        pwms = loadutils.transfac_pwms()
    elif pwmbase == 'selex':
        pwms = loadutils.selex_pwms()

    dnaseobj = loadutils.Dnase(sample=sample)
    chipseqobj = loadutils.ChipSeq('Gm12878', loadutils.factormap[pwmid])
    mnaseobj = loadutils.Mnase(sample=sample)
    indiv_idx = loadutils.read_individuals()
    if sample in [None, 'Gm12878']:
        sequence = loadutils.Sequence(sample, sample_idx=indiv_idx['NA18516'])
    else:
        sequence = loadutils.Sequence(sample, sample_idx=indiv_idx[sample])

    key = [k for k, pwm in pwms.iteritems() if pwm['AC'] == pwmid][0]
    bound_scores = []
    bound_chipreads = []
    unbound_chipreads = []
    dnasemean_bound = []
    mnasemean_bound = []
    chiptotalreads = []
    logodds = []
    score = []
    for bound in bounds:
        # plot mean profile of all bound sites, stratified by PWM score
        all_handle = loadutils.ZipFile("%s/cache/%s_locations_Q%.1f.txt.gz" %
                                       (projpath, pwmid, 95.0))
        if sample is None:
            bound_handle = loadutils.ZipFile(
                "%s/cache/combined/%s_%d_bound_Q%.1f.bed.gz" %
                (projpath, pwmid, cutk, dhs))
        else:
            bound_handle = loadutils.ZipFile(
                "%s/cache/separate/%s_%d_%s_bound_Q%.1f.bed.gz" %
                (projpath, pwmid, cutk, sample, dhs))
        all_locations = all_handle.read(threshold=bound)
        blocs = bound_handle.read(threshold=bound)
        bound_locations = [
            loc[:5] for loc in blocs if float(loc[5]) >= np.log10(99)
        ]
        if len(all_locations) > 2 * len(bound_locations):
            all_locations = random.sample(all_locations,
                                          2 * len(bound_locations))
        unbound_locations = list(
            set(all_locations).difference(set(bound_locations)))

        chiptotalreads.extend([int(loc[-1]) for loc in blocs])
        logodds.extend([float(loc[-2]) for loc in blocs])
        score.extend([float(loc[-3]) for loc in blocs])

        # load DNase and MNase reads
        print bound, len(bound_locations), len(unbound_locations)
        x, y = aggregate(bound_locations, dnaseobj)
        dnasemean_bound.append(x)
        mnasemean_bound.append(aggregate(bound_locations, mnaseobj))

        # Total ChipSeq read counts
        chipreads = chipseqobj.getreads(bound_locations)
        bound_chipreads.extend(chipreads)
        chipreads = chipseqobj.getreads(unbound_locations)
        unbound_chipreads.extend(chipreads)

    chiptotalreads = np.array(chiptotalreads)
    logodds = np.array(logodds)
    score = np.array(score)

    if sample is None:
        title = pwms[key]['NA']
        tag = "_%s_%d_Q%.1f.pdf" % (pwmid, cutk, dhs)
        dnaseprofilefile = "%s/fig/dnaseprofile%s" % (projpath, tag)
        mnaseprofilefile = "%s/fig/mnaseprofile%s" % (projpath, tag)
        chipdistfile = "%s/fig/chipdist%s" % (projpath, tag)
        scatterfile = "%s/fig/scatter%s" % (projpath, tag)
        scoreposfile = "%s/fig/scoreposition%s" % (projpath, tag)
        posagreefile = "%s/fig/posagreement%s.pdf" % (projpath, tag)
    else:
        title = "%s / %s" % (pwms[key]['NA'], sample)
        tag = "_short_%s_%d_%s_Q%.1f" % (pwmid, cutk, sample, dhs)
        dnaseprofilefile = "%s/fig/dnaseprofile%s.pdf" % (projpath, tag)
        mnaseprofilefile = "%s/fig/mnaseprofile%s.pdf" % (projpath, tag)
        chipdistfile = "%s/fig/chipdist%s.pdf" % (projpath, tag)
        scatterfile = "%s/fig/scatter%s.pdf" % (projpath, tag)
        scoreposfile = "%s/fig/scoreposition%s.pdf" % (projpath, tag)
        posagreefile = "%s/fig/posagreement%s.pdf" % (projpath, tag)

    figure = viz.plot_dnaseprofile(dnasemean_bound,
                                   labels,
                                   motiflen=len(pwms[key]['motif']),
                                   title=title)
    figure.savefig(dnaseprofilefile, dpi=300, format='pdf')

    figure = viz.plot_mnaseprofile(mnasemean_bound,
                                   labels,
                                   motiflen=len(pwms[key]['motif']),
                                   title=title)
    figure.savefig(mnaseprofilefile, dpi=300, format='pdf')

    figure = viz.plot_chipseq_distribution(bound_chipreads,
                                           unbound_chipreads,
                                           title=title)
    figure.savefig(chipdistfile, bbox_inches=0, dpi=300, format='pdf')

    figure = viz.plot_chipseq_posterior_correlation(chiptotalreads,
                                                    logodds,
                                                    score,
                                                    title=title)
    figure.savefig(scatterfile, bbox_inches=0, dpi=300, format='pdf')

    dnaseobj.close()
    mnaseobj.close()
    chipseqobj.close()
    sequence.close()
Пример #6
0
# Script entry point: compare models on the cached bound-site calls of one
# motif.  Usage (from the argv reads below): <script> <model> <pwmid>.
# NOTE(review): this fragment ends inside the `for width` loop; the rest of
# the loop body is not visible here.
if __name__ == "__main__":

    pwmid = sys.argv[2]
    sample = "NA18505"
    model = sys.argv[1]
    location_file = "/mnt/lustre/home/anilraj/pbm_dnase_profile/cache/%s_0_short_bound.bed.gz" % (pwmid)
    handle = loadutils.ZipFile(location_file)
    locations = handle.read(threshold=11)
    print pwmid, sample, model
    print "read in locations ..."

    # The first character of the motif id picks the PWM database.  Any other
    # prefix leaves `pwms` unbound and the next statement raises NameError.
    if pwmid[0] == "M":
        pwms = loadutils.transfac_pwms()
    elif pwmid[0] == "S":
        pwms = loadutils.selex_pwms()
    # First PWM whose accession matches; raises IndexError when none does.
    motif = [val["motif"] for val in pwms.itervalues() if val["AC"] == pwmid][0]
    print "selected motif model ..."

    # Split sites on the integer in their last column (presumably a ChIP-seq
    # read count -- TODO confirm against the file's header).
    bound = [loc for loc in locations if int(loc[-1]) > 50]
    undecided = [loc for loc in locations if int(loc[-1]) > 0]
    dnaseobj = loadutils.Dnase(sample=sample)
    # getreads also returns a location list, which replaces `undecided` here.
    reads, undecided, ig = dnaseobj.getreads(undecided, remove_outliers=False, width=200)
    totalreads = reads.sum(1)
    print "extracted total reads ..."

    # NOTE(review): `handle` is rebound to the PDF writer here without the
    # ZipFile opened above having been closed.
    handle = PdfPages("/mnt/lustre/home/anilraj/pbm_dnase_profile/fig/compare_models_%s_%s.pdf" % (model, pwmid))
    for width in [64, 128, 256]:
        boundreads, ig, ig = dnaseobj.getreads(bound, remove_outliers=True, width=width)
        undecidedreads, locs_tolearn, ig = dnaseobj.getreads(undecided, remove_outliers=True, width=width)
        # Position of every retained site within `undecided`.
        indices = np.array([undecided.index(loc) for loc in locs_tolearn])
Пример #7
0
def decode(pwmid, sample, cutk=0, pwmbase='transfac', pos_threshold=np.log10(99), chipseq=False):

    import centipede
    import millipede
    import centipede_pbm as pbmcentipede

    if sample in [None,'Gm12878','Gm12878All']:
        sequence = loadutils.Sequence(sample)
    else:
        indiv_idx = loadutils.read_individuals()
        if pwmbase=='transfac':
            pwms = loadutils.transfac_pwms()
        elif pwmbase=='selex':
            pwms = loadutils.selex_pwms()
        motif = [val['motif'] for val in pwms.itervalues() if val['AC']==pwmid][0]
        sequence = loadutils.Sequence(sample, sample_idx=indiv_idx[sample])

    if cutk!=0:
        sequence.set_cutrate(sample=sample, k=cutk)

    # use output from Centipede run
    # 0 = Py code, 1 = R code
    if sample is None:
        handle = open("%s/cache/combined/pbmcentipede_short_%s.pkl"%(projpath,pwmid),'r')
    else:
        handle = open("%s/cache/separate/pbmcentipede_short_%s_%s.pkl"%(projpath,pwmid,sample),'r')
    output = cPickle.load(handle)
    handle.close()
    if cutk==0:
        idx = 0
    elif cutk==2:
        idx = 1
    elif cutk==4:
        idx = 2
    footprint = output['footprint'][idx]
    negbinparams = output['negbin'][idx]
    prior = output['prior'][idx][0]
    dhsprior = output['prior'][idx][1]

    if sample in ['Gm12878','Gm12878All']:
        location_file = "%s/cache/%s_locationsGm12878_Q%.1f.txt.gz"%(projpath,pwmid,dhs)
    else:
        location_file = "%s/cache/%s_locations_Q%.1f.txt.gz"%(projpath,pwmid,dhs)

    # check file size
    pipe = subprocess.Popen("zcat %s | wc -l"%location_file, stdout=subprocess.PIPE, shell=True)
    Ns = int(pipe.communicate()[0].strip())

    try:
        chipobj = loadutils.ChipSeq('Gm12878',loadutils.factormap[pwmid])
    except:
        pass

    readobj = loadutils.Dnase(sample=sample)
    readhandle = loadutils.ZipFile(location_file)
    loops = Ns/batch

    if sample is None:
        handle = gzip.open("%s/cache/combined/%s_short_bound.bed.gz"%(projpath,pwmid),'wb')
    else:
        handle = gzip.open("%s/cache/separate/%s_%d_%s_short_bound.bed.gz"%(projpath,pwmid,cutk,sample),'wb')
    towrite = ['Chr','Start','Stop','Strand','PwmScore','LogPosOdds','LogPriorOdds','MultLikeRatio','NegBinLikeRatio','ChipseqReads']
    handle.write('\t'.join(towrite)+'\n')

    totalreads = []
    for n in xrange(loops):
        starttime = time.time()
        # read locations from file
        locations = readhandle.read(chunk=batch)
        if sample not in [None,'Gm12878','Gm12878All']:
            # compute scores at locations for specific sample
            locations = sequence.get_scores(locations, motif)
        locations = sequence.filter_mappability(locations, width=max([200,L/2]))

        # read in Dnase read data for locations
        dnasereads, locations, subscores = readobj.getreads(locations, width=max([200,L/2]))
        subscores = np.array(subscores).astype('float')
        subscores = subscores.reshape(subscores.size,1)
        dnasetotal = dnasereads.sum(1)
        print len(locations)

        if chipseq:
            chipreads = chipobj.getreads(locations, width=max([200,L/2]))
        else:
            chipreads = None

        # set null footprint distribution
        if cutk==0:
            null = np.ones((1,L),dtype=float)/L
        else:
            null = sequence.getnull(locations, width=L/2)

        if L<400:
            dnasereads = np.hstack((dnasereads[:,100-L/4:100+L/4],dnasereads[:,300-L/4:300+L/4]))

#        if cutk==0:
        logodds = centipede.decode(dnasereads, dnasetotal, null, subscores, footprint, negbinparams[0], negbinparams[1], prior, dhsprior, chipreads=chipreads, damp=damp)
#        elif cutk==2:
#            posterior = pbmcentipede.decode(reads, chipreads, subscores, footprint[1:], negbinparams[0], negbinparams[1], prior)

        if not chipseq:
            try:
                chipreads = chipobj.get_total_reads(locations, width=400)
                ignore = [loc.extend(['%.3f'%pos[0],'%.3f'%pos[1],'%.3f'%pos[2],'%.3f'%pos[3],'%d'%c]) \
                    for loc,pos,c in zip(locations,logodds,chipreads)]
            except NameError:
                ignore = [loc.extend(['%.3f'%pos[0],'%.3f'%pos[1],'%.3f'%pos[2],'%.3f'%pos[3]]) \
                    for loc,pos in zip(locations,logodds)]

        locations = [loc for loc in locations if len(loc)>5]
        ignore = [handle.write('\t'.join(elem)+'\n') for elem in locations]

        print time.time()-starttime

    remain = Ns-loops*batch
    locations = readhandle.read(chunk=remain)
    if sample not in [None,'Gm12878','Gm12878All']:
        # compute scores at locations for specific sample
        locations = sequence.get_scores(locations, motif)
    locations = sequence.filter_mappability(locations, width=max([200,L/2]))
    dnasereads, locations, subscores = readobj.getreads(locations, width=max([200,L/2]))
    subscores = np.array(subscores)
    subscores = subscores.reshape(subscores.size,1)
    dnasetotal = dnasereads.sum(1)

    if chipseq:
        chipreads = chipobj.get_total_reads(locations, width=200)
    else:
        chipreads = None

    # set null footprint distribution
    if cutk==0:
        null = np.ones((1,L),dtype=float)/L
    else:
        null = sequence.getnull(locations, width=L/2)

    if L<400:
        dnasereads = np.hstack((dnasereads[:,100-L/4:100+L/4],dnasereads[:,300-L/4:300+L/4]))

    logodds = centipede.decode(dnasereads, dnasetotal, null, subscores, footprint, negbinparams[0], negbinparams[1], prior, dhsprior, chipreads=chipreads, damp=damp)

    if not chipseq:
        try:
            chipreads = chipobj.get_total_reads(locations, width=400)
            ignore = [loc.extend(['%.3f'%pos[0],'%.3f'%pos[1],'%.3f'%pos[2],'%.3f'%pos[3],'%d'%c]) \
                for loc,pos,c in zip(locations,logodds,chipreads)]
        except NameError:
            ignore = [loc.extend(['%.3f'%pos[0],'%.3f'%pos[1],'%.3f'%pos[2],'%.3f'%pos[3]]) \
                for loc,pos in zip(locations,logodds)]
    locations = [loc for loc in locations if len(loc)>5]
    ignore = [handle.write('\t'.join(elem)+'\n') for elem in locations]

    readobj.close()
    chipobj.close()
    readhandle.close()
    handle.close()

    sequence.close()
Пример #8
0
def infer(pwmid, sample, pwm_thresh=8, pwmbase='transfac', chipseq=False):

    import centipede_pbm as centipede

    model = 'modelC'
    if pwmbase=='transfac':
        pwms = loadutils.transfac_pwms()
    elif pwmbase=='selex':
        pwms = loadutils.selex_pwms()
    motif = [val['motif'] for val in pwms.itervalues() if val['AC']==pwmid][0]

    if sample in [None,'Gm12878','Gm12878All']:
        sequence = loadutils.Sequence(sample)
    else:
        indiv_idx = loadutils.read_individuals()
        sequence = loadutils.Sequence(sample, sample_idx=indiv_idx[sample])

    if sample in ['Gm12878','Gm12878All']:
        location_file = "/mnt/lustre/home/anilraj/histmod/cache/%s_locationsGm12878_Q%.1f.txt.gz"%(pwmid,dhs)
    else:
        location_file = "/mnt/lustre/home/anilraj/histmod/cache/%s_locations_Q%.1f.txt.gz"%(pwmid,dhs)

    # check file size
    pipe = subprocess.Popen("zcat %s | wc -l"%location_file, stdout=subprocess.PIPE, shell=True)
    Ns = int(pipe.communicate()[0].strip())

    # load scores
    alllocations = []
    pwm_cutoff = pwm_thresh+1
    while len(alllocations)<100:
        pwm_cutoff = pwm_cutoff - 1
        handle = loadutils.ZipFile(location_file)
        alllocations = handle.read(threshold=pwm_cutoff)
        handle.close()
    print "PWM Cutoff = %d"%pwm_cutoff

    # subsample locations, if too many
    if len(alllocations)>100000:
        scores = np.array([loc[-1] for loc in alllocations]).astype(float)
        indices = np.argsort(scores)[-100000:]
        alllocations = [alllocations[index] for index in indices]
    print "Num of sites for learning, with pwm threshold of %d for %s = %d"%(pwm_thresh, pwmid, len(alllocations))

    if sample in [None,'Gm12878','Gm12878All']:
        locs_tolearn = alllocations
    else:
        # compute scores for specific sample at these locations
        starttime = time.time()
        locs_tolearn = sequence.get_scores(alllocations, motif)
        print len(locs_tolearn), time.time()-starttime

    # filter mappability
    print "filtering out unmappable sites ..."
    locs_tolearn = sequence.filter_mappability(locs_tolearn, width=max([200,L/2]))

    # load reads and locations
    print "loading dnase reads ..."
    readobj = loadutils.Dnase(sample=sample)
    dnasereads, locs_tolearn, subscores = readobj.getreads(locs_tolearn, remove_outliers=True, width=max([200,L/2]))
    subscores = np.array(subscores)
    subscores = subscores.reshape(subscores.size,1)
    dnasetotal = dnasereads.sum(1)
    print "Num of mappable sites for learning for %s = %d"%(pwmid,len(locs_tolearn))

    if chipseq:
        chipobj = loadutils.ChipSeq('Gm12878',loadutils.factormap[pwmid])
        chipreads = chipobj.get_total_reads(locs_tolearn, width=200)
        chipobj.close()
    else:
        chipreads = None

    if L<400:
        dnasereads = np.hstack((dnasereads[:,100-L/4:100+L/4],dnasereads[:,300-L/4:300+L/4]))
    
    locs_tolearn = [list(loc) for loc in locs_tolearn]
    footprints = []
    priors = []
    negbins = []
    posteriors = []
    
    null = np.ones((1,L),dtype=float)*1./L
    posterior, footprint, negbinparams, prior = centipede.EM(dnasereads, dnasetotal, subscores, null, model=model, restarts=2)

    posteriors.append(posterior)
    footprints.append(footprint)
    negbins.append(negbinparams)
    priors.append(prior)

    chipobj = loadutils.ChipSeq('Gm12878',loadutils.factormap[pwmid])
    controlobj = loadutils.ChipSeq('Gm12878',loadutils.controlmap[pwmid])
    chipreads = chipobj.get_total_reads(locs_tolearn, width=400)
    controlreads = controlobj.get_total_reads(locs_tolearn, width=200)
    chipobj.close()
    controlobj.close()    
    for posterior in posteriors:
        logodds = np.log(posterior[:,1]/posterior[:,0])
        logodds[logodds==np.inf] = logodds[logodds!=np.inf].max()
        logodds[logodds==-np.inf] = logodds[logodds!=-np.inf].min()
        R = stats.pearsonr(logodds, np.sqrt(chipreads))
        R2 = stats.pearsonr(np.sqrt(dnasetotal), np.sqrt(chipreads))

        handle = open('/mnt/lustre/home/anilraj/histmod/cache/chipseq_peaks/%s_peaks.bed'%loadutils.factormap[pwmid],'r')
        calls = [line.strip().split()[:3] for line in handle]
        handle.close()
        macs = dict([(chrom,[]) for chrom in utils.chromosomes[:22]])
        [macs[call[0]].append([int(call[1]),int(call[2])]) for call in calls if call[0] in utils.chromosomes[:22]]

        bsites = [locs_tolearn[i] for i,p in enumerate(posterior[:,1]) if p>0.99]
        F, precision, sensitivity, ig = Fscore.Fscore(bsites, macs)
        chipauc, tpr, positive, negative = compute_chip_auc(chipreads, controlreads, logodds, macs, locs_tolearn)
        print pwmid, model, sample, R, R2, chipauc, tpr, F, precision, sensitivity

    output = {'footprint': footprints, \
            'negbin': negbins, \
            'prior': priors, \
            'posterior': posteriors, \
            'locations': locs_tolearn}

    if sample is None:
        handle = open("%s/cache/combined/pbmcentipede_%s_%s.pkl"%(projpath,model,pwmid),'w')
    else:
        handle = open("%s/cache/separate/pbmcentipede_%s_%s_%s.pkl"%(projpath,model,pwmid,sample),'w')
    cPickle.Pickler(handle, protocol=2).dump(output)
    handle.close()

    readobj.close()
    sequence.close()
Пример #9
0
def plotbound(pwmid, sample=None, cutk=0, pwmbase='transfac'):

    import random
    from matplotlib.backends.backend_pdf import PdfPages

    bounds = [(1,5),(5,9),(9,13),(13,np.inf)]
    labels = ['1 - 5', '5 - 9', '9 - 13', '>13']
    
    if pwmbase=='transfac':
        pwms = loadutils.transfac_pwms()
    elif pwmbase=='selex':
        pwms = loadutils.selex_pwms()

    dnaseobj = loadutils.Dnase(sample=sample)
    chipseqobj = loadutils.ChipSeq('Gm12878',loadutils.factormap[pwmid])
    mnaseobj = loadutils.Mnase(sample=sample)
    indiv_idx = loadutils.read_individuals()
    if sample in [None,'Gm12878']:
        sequence = loadutils.Sequence(sample, sample_idx=indiv_idx['NA18516'])
    else:
        sequence = loadutils.Sequence(sample, sample_idx=indiv_idx[sample])

    key = [k for k,pwm in pwms.iteritems() if pwm['AC']==pwmid][0]
    bound_scores = []
    bound_chipreads = []
    unbound_chipreads = []
    dnasemean_bound = []
    mnasemean_bound = []
    chiptotalreads = []
    logodds = []
    score = []
    for bound in bounds:
        # plot mean profile of all bound sites, stratified by PWM score
        all_handle = loadutils.ZipFile("%s/cache/%s_locations_Q%.1f.txt.gz"%(projpath,pwmid,95.0))
        if sample is None:
            bound_handle = loadutils.ZipFile("%s/cache/combined/%s_%d_bound_Q%.1f.bed.gz"%(projpath,pwmid,cutk,dhs))
        else:
            bound_handle = loadutils.ZipFile("%s/cache/separate/%s_%d_%s_bound_Q%.1f.bed.gz"%(projpath,pwmid,cutk,sample,dhs))
        all_locations = all_handle.read(threshold=bound)
        blocs = bound_handle.read(threshold=bound)
        bound_locations = [loc[:5] for loc in blocs if float(loc[5])>=np.log10(99)]
        if len(all_locations)>2*len(bound_locations):
            all_locations = random.sample(all_locations, 2*len(bound_locations))
        unbound_locations = list(set(all_locations).difference(set(bound_locations)))

        chiptotalreads.extend([int(loc[-1]) for loc in blocs])
        logodds.extend([float(loc[-2]) for loc in blocs])
        score.extend([float(loc[-3]) for loc in blocs])

        # load DNase and MNase reads
        print bound, len(bound_locations), len(unbound_locations)
        x,y = aggregate(bound_locations, dnaseobj)
        dnasemean_bound.append(x)
        mnasemean_bound.append(aggregate(bound_locations, mnaseobj))

        # Total ChipSeq read counts
        chipreads = chipseqobj.getreads(bound_locations)
        bound_chipreads.extend(chipreads)
        chipreads = chipseqobj.getreads(unbound_locations)
        unbound_chipreads.extend(chipreads)

    chiptotalreads = np.array(chiptotalreads)
    logodds = np.array(logodds)
    score = np.array(score)

    if sample is None:
        title = pwms[key]['NA']
        tag = "_%s_%d_Q%.1f.pdf"%(pwmid,cutk,dhs)
        dnaseprofilefile = "%s/fig/dnaseprofile%s"%(projpath,tag)
        mnaseprofilefile = "%s/fig/mnaseprofile%s"%(projpath,tag)
        chipdistfile = "%s/fig/chipdist%s"%(projpath,tag)
        scatterfile = "%s/fig/scatter%s"%(projpath,tag)
        scoreposfile = "%s/fig/scoreposition%s"%(projpath,tag)
        posagreefile = "%s/fig/posagreement%s.pdf"%(projpath,tag)
    else:
        title = "%s / %s"%(pwms[key]['NA'], sample)
        tag = "_short_%s_%d_%s_Q%.1f"%(pwmid,cutk,sample,dhs)
        dnaseprofilefile = "%s/fig/dnaseprofile%s.pdf"%(projpath,tag)
        mnaseprofilefile = "%s/fig/mnaseprofile%s.pdf"%(projpath,tag)
        chipdistfile = "%s/fig/chipdist%s.pdf"%(projpath,tag)
        scatterfile = "%s/fig/scatter%s.pdf"%(projpath,tag)
        scoreposfile = "%s/fig/scoreposition%s.pdf"%(projpath,tag)
        posagreefile = "%s/fig/posagreement%s.pdf"%(projpath,tag)

    figure = viz.plot_dnaseprofile(dnasemean_bound, labels, motiflen=len(pwms[key]['motif']), title=title)
    figure.savefig(dnaseprofilefile, dpi=300, format='pdf')

    figure = viz.plot_mnaseprofile(mnasemean_bound, labels, motiflen=len(pwms[key]['motif']), title=title)
    figure.savefig(mnaseprofilefile, dpi=300, format='pdf')

    figure = viz.plot_chipseq_distribution(bound_chipreads, unbound_chipreads, title=title)
    figure.savefig(chipdistfile, bbox_inches=0, dpi=300, format='pdf')

    figure = viz.plot_chipseq_posterior_correlation(chiptotalreads, logodds, score, title=title)
    figure.savefig(scatterfile, bbox_inches=0, dpi=300, format='pdf')

    dnaseobj.close()
    mnaseobj.close()
    chipseqobj.close()
    sequence.close()
Пример #10
0
def _finite_logodds(posterior):
    """Return log(posterior[:,1]/posterior[:,0]) with infinities clipped to the finite extremes."""
    logodds = np.log(posterior[:,1]/posterior[:,0])
    logodds[logodds==np.inf] = logodds[logodds!=np.inf].max()
    logodds[logodds==-np.inf] = logodds[logodds!=-np.inf].min()
    return logodds


def _load_pickle(path):
    """Load one pickled object from `path`, closing the file even on error."""
    # pickles are binary data, so open in binary mode
    with open(path,'rb') as handle:
        return cPickle.load(handle)


def plotmodel(pwmid, sample=None, pwmbase='transfac'):
    """Compare learned footprint models for one PWM and write figures and statistics.

    Loads the pickled 'modelA' and 'modelB' results plus three stored
    Centipede runs (plain, damped, no-footprint), then writes
      * a footprint PDF (footprints, AUC plot, gamma/pi scatter),
      * a correlation PDF (log odds against sqrt ChIP-seq reads per model),
      * a text file with per-model statistics.

    Relies on the module-level names ``projpath`` and ``L``.

    Args:
        pwmid: accession of the PWM (matched against the ``'AC'`` field).
        sample: sample name, or ``None`` for the combined cache.
            NOTE(review): the three Centipede pickles are always read from the
            "separate" cache with `sample` formatted into the file name, so
            ``None`` yields a "..._None.pkl" path there.
        pwmbase: ``'transfac'`` or ``'selex'``.

    Raises:
        ValueError: if ``pwmbase`` is neither ``'transfac'`` nor ``'selex'``.
    """

    import centipede_pbm as centipede
    from matplotlib.backends.backend_pdf import PdfPages

    if pwmbase=='transfac':
        pwms = loadutils.transfac_pwms()
    elif pwmbase=='selex':
        pwms = loadutils.selex_pwms()
    else:
        raise ValueError("unknown pwmbase %r (expected 'transfac' or 'selex')"%(pwmbase,))

    models = ['modelA','modelB']
    meanfootprints = []
    stdfootprints = []
    Logodds = []

    # peak calls grouped by chromosome, restricted to the first 22 chromosomes
    macs = dict((chrom,[]) for chrom in utils.chromosomes[:22])
    with open('/mnt/lustre/home/anilraj/histmod/cache/chipseq_peaks/%s_peaks.bed'%loadutils.factormap[pwmid],'r') as peakhandle:
        for line in peakhandle:
            call = line.strip().split()[:3]
            # skip blank/short lines and chromosomes that are not tracked
            if len(call)<3 or call[0] not in macs:
                continue
            macs[call[0]].append([int(call[1]),int(call[2])])

    if sample is None:
        statsfile = "%s/fig/stats_short_%s.txt"%(projpath,pwmid)
    else:
        statsfile = "%s/fig/stats_short_%s_%s.txt"%(projpath,pwmid,sample)

    pis = []
    gammas = []
    outhandle = open(statsfile,'w')

    # built once, from the locations stored with the first model
    cascade = None
    for model in models:
        if sample is None:
            modelfile = "%s/cache/combined/pbmcentipede_%s_short_%s.pkl"%(projpath,model,pwmid)
        else:
            modelfile = "%s/cache/separate/pbmcentipede_%s_short_%s_%s.pkl"%(projpath,model,pwmid,sample)
        output = _load_pickle(modelfile)
        footparams = output['footprint'][0]
        alpha, tau = output['negbin'][0]
        posterior = output['posterior'][0]
        Logodds.append(_finite_logodds(posterior))
        means = alpha*(1-tau)/tau
        outhandle.write('%.2f %.2f\n'%(means[0],means[1]))

        if cascade is None:
            locs_tolearn = output['locations']
            dnaseobj = loadutils.Dnase(sample=sample)
            dnasereads, ig, ig = dnaseobj.getreads(locs_tolearn, width=max([200,L/2]))
            if L<400:
                # keep L/4 columns on either side of columns 100 and 300
                reads = np.hstack((dnasereads[:,100-L/4:100+L/4],dnasereads[:,300-L/4:300+L/4]))
            else:
                reads = dnasereads
            # from here on only the per-site totals are needed
            dnasereads = dnasereads.sum(1)
            dnaseobj.close()

            cascade = centipede.Cascade(L)
            cascade.setreads(reads)
            del reads

        # the two models store gamma and pi at different positions of
        # footparams and pass a different extra parameter to the estimator
        if model=='modelA':
            gamma, piparam = footparams[0], footparams[1]
            estimator_args = {'B': footparams[2]}
        elif model=='modelB':
            gamma, piparam = footparams[1], footparams[2]
            estimator_args = {'mu': footparams[3]}
        else:
            continue
        gammas.append(gamma)
        pi = piparam.estim if isinstance(piparam,centipede.Pi) else piparam
        pis.append(pi)
        M1, M2 = centipede.bayes_optimal_estimator(cascade, posterior, pi, model=model, **estimator_args)
        meanfootprints.append(M1.inverse_transform())
        # standard-error footprints are not plotted; None keeps the list
        # aligned with meanfootprints
        stdfootprints.append(None)

    chipobj = loadutils.ChipSeq('Gm12878',loadutils.factormap[pwmid])
    controlobj = loadutils.ChipSeq('Gm12878',loadutils.controlmap[pwmid])
    chipreads = chipobj.get_total_reads(locs_tolearn, width=200)
    controlreads = controlobj.get_total_reads(locs_tolearn, width=200)
    chipobj.close()
    controlobj.close()

    corrC = stats.pearsonr(np.sqrt(dnasereads), np.sqrt(chipreads))
    corrD = stats.pearsonr(np.sqrt(dnasereads), np.sqrt(controlreads))

    # stored Centipede runs that contribute both a footprint and log odds
    for template in ("/mnt/lustre/home/anilraj/histmod/cache/separate/centipede_short_%s_%s.pkl",
                     "/mnt/lustre/home/anilraj/histmod/cache/separate/centipede_damped_short_%s_%s.pkl"):
        output = _load_pickle(template%(pwmid,sample))
        Logodds.append(_finite_logodds(output['posterior'][0]))
        meanfootprints.append(output['footprint'][0])
        stdfootprints.append(None)

    # the no-footprint run contributes log odds only
    output = _load_pickle("/mnt/lustre/home/anilraj/histmod/cache/separate/centipede_nofoot_short_%s_%s.pkl"%(pwmid,sample))
    Logodds.append(_finite_logodds(output['posterior'][0]))

    key = [k for k,pwm in pwms.iteritems() if pwm['AC']==pwmid][0]
    if sample is None:
        title = pwms[key]['NA']
        footprintfile = "%s/fig/footprint_short_%s.pdf"%(projpath,pwmid)
        corrfile = "%s/fig/logoddsCorr_short_%s.pdf"%(projpath,pwmid)
    else:
        title = "%s / %s"%(pwms[key]['NA'], sample)
        footprintfile = "%s/fig/footprint_short_%s_%s.pdf"%(projpath,pwmid,sample)
        corrfile = "%s/fig/logoddsCorr_short_%s_%s.pdf"%(projpath,pwmid,sample)

    # labels in the order the footprints were appended above
    models = ['CentipedePBM_M1','CentipedePBM_M2','Centipede','CentipedeDamped']
    # plot footprints
    pdfhandle = PdfPages(footprintfile)
    figure = viz.plot_footprint(meanfootprints, labels=models, stderr=stdfootprints, motif=pwms[key]['motif'], title=title)
    pdfhandle.savefig(figure)
    # Logodds holds one more entry than meanfootprints: the no-footprint run
    models.append('CentipedeNoFoot')
    auc, tpr, positive, negative = compute_chip_auc(chipreads, controlreads, Logodds[0], macs, locs_tolearn)
    figure = viz.plot_auc(Logodds, positive, negative, labels=models, title=title)
    pdfhandle.savefig(figure)

    # scatter of gamma and pi, first model against second; marker area
    # halves with every step of the index
    T = pis[0].size
    figure = viz.plot.figure()
    subplot = figure.add_subplot(111)
    subplot.scatter(gammas[0].value[0], gammas[1].value[0], s=2**T, marker='o', color=viz.colors[1], label='gamma', alpha=0.5)
    subplot.scatter(pis[0][0], pis[1][0], s=2**T, marker='o', color=viz.colors[0], label='pi', alpha=0.5)
    for i in xrange(1,T):
        subplot.scatter(gammas[0].value[i], gammas[1].value[i], s=2**(T-i), marker='o', color=viz.colors[1], label='_nolabel_', alpha=0.5)
        subplot.scatter(pis[0][i], pis[1][i], s=2**(T-i), marker='o', color=viz.colors[0], label='_nolabel_', alpha=0.5)
    xmin = min([pis[0].min(), pis[1].min()])-0.05
    xmax = max([pis[0].max(), pis[1].max()])+0.05
    subplot.axis([xmin, xmax, xmin, xmax])
    subplot.set_xlabel('PBM_M1')
    subplot.set_ylabel('PBM_M2')
    legend = subplot.legend(loc=1)
    for text in legend.texts:
        text.set_fontsize('8')
    legend.set_frame_on(False)
    pdfhandle.savefig(figure)
    pdfhandle.close()

    pdfhandle = PdfPages(corrfile)
    # lower-case correlations are restricted to sites with log odds above `lo`
    lo = 0
    for logodds,model in zip(Logodds,models):
        auc, tpr, positive, negative = compute_chip_auc(chipreads, controlreads, logodds, macs, locs_tolearn)
        corrA = stats.pearsonr(logodds, np.sqrt(chipreads))
        corrB = stats.pearsonr(logodds, np.sqrt(controlreads))
        corra = stats.pearsonr(logodds[logodds>lo], np.sqrt(chipreads)[logodds>lo])
        corrb = stats.pearsonr(logodds[logodds>lo], np.sqrt(controlreads)[logodds>lo])
        corrc = stats.pearsonr(np.sqrt(dnasereads)[logodds>lo], np.sqrt(chipreads)[logodds>lo])
        corrd = stats.pearsonr(np.sqrt(dnasereads)[logodds>lo], np.sqrt(controlreads)[logodds>lo])
        towrite = [pwmid, model, corrA, corrB, corrC, corrD, corra, corrb, corrc, corrd, auc, tpr, logodds.size, (logodds>np.log(99)).sum()]
        outhandle.write(' '.join(map(str,towrite))+'\n')
        figure = viz.plot_correlation(np.sqrt(chipreads), logodds, title=model)
        pdfhandle.savefig(figure)

    figure = viz.plot_correlation(np.sqrt(chipreads), np.sqrt(dnasereads), xlabel='sqrt(dnase reads)', title='Total Dnase reads')
    pdfhandle.savefig(figure)
    pdfhandle.close()
    outhandle.close()