Exemplo n.º 1
0
    location_file = "/mnt/lustre/home/anilraj/pbm_dnase_profile/cache/%s_0_short_bound.bed.gz"%(pwmid)
    handle = loadutils.ZipFile(location_file)
    locations = handle.read(threshold=11)
    print pwmid, sample, model
    print "read in locations ..."

    if pwmid[0]=='M':
        pwms = loadutils.transfac_pwms()
    elif pwmid[0]=='S':
        pwms = loadutils.selex_pwms()
    motif = [val['motif'] for val in pwms.itervalues() if val['AC']==pwmid][0]
    print "selected motif model ..."

    bound = [loc for loc in locations if int(loc[-1])>50]
    undecided = [loc for loc in locations if int(loc[-1])>0]
    dnaseobj = loadutils.Dnase(sample=sample)
    reads, undecided, ig = dnaseobj.getreads(undecided, remove_outliers=False, width=200)
    totalreads = reads.sum(1)
    print "extracted total reads ..."

    handle = PdfPages('/mnt/lustre/home/anilraj/pbm_dnase_profile/fig/compare_models_%s_%s.pdf'%(model,pwmid))
    for width in [64,128,256]:
        boundreads, ig, ig = dnaseobj.getreads(bound, remove_outliers=True, width=width)
        undecidedreads, locs_tolearn, ig = dnaseobj.getreads(undecided, remove_outliers=True, width=width)
        indices = np.array([undecided.index(loc) for loc in locs_tolearn])
        chipreads = np.array([int(loc[-1]) for loc in locs_tolearn if int(loc[-1])>0])
        undecidedreads = undecidedreads[totalreads[indices]>0,:]
        chipreads = chipreads[totalreads[indices]>0]
        print "extracted specific reads ..."

        """
Exemplo n.º 2
0
def decode(pwmid,
           sample,
           cutk=0,
           pwmbase='transfac',
           pos_threshold=np.log10(99),
           chipseq=False):

    import centipede
    import millipede
    import centipede_pbm as pbmcentipede

    if sample in [None, 'Gm12878', 'Gm12878All']:
        sequence = loadutils.Sequence(sample)
    else:
        indiv_idx = loadutils.read_individuals()
        if pwmbase == 'transfac':
            pwms = loadutils.transfac_pwms()
        elif pwmbase == 'selex':
            pwms = loadutils.selex_pwms()
        motif = [
            val['motif'] for val in pwms.itervalues() if val['AC'] == pwmid
        ][0]
        sequence = loadutils.Sequence(sample, sample_idx=indiv_idx[sample])

    if cutk != 0:
        sequence.set_cutrate(sample=sample, k=cutk)

    # use output from Centipede run
    # 0 = Py code, 1 = R code
    if sample is None:
        handle = open(
            "%s/cache/combined/pbmcentipede_short_%s.pkl" % (projpath, pwmid),
            'r')
    else:
        handle = open(
            "%s/cache/separate/pbmcentipede_short_%s_%s.pkl" %
            (projpath, pwmid, sample), 'r')
    output = cPickle.load(handle)
    handle.close()
    if cutk == 0:
        idx = 0
    elif cutk == 2:
        idx = 1
    elif cutk == 4:
        idx = 2
    footprint = output['footprint'][idx]
    negbinparams = output['negbin'][idx]
    prior = output['prior'][idx][0]
    dhsprior = output['prior'][idx][1]

    if sample in ['Gm12878', 'Gm12878All']:
        location_file = "%s/cache/%s_locationsGm12878_Q%.1f.txt.gz" % (
            projpath, pwmid, dhs)
    else:
        location_file = "%s/cache/%s_locations_Q%.1f.txt.gz" % (projpath,
                                                                pwmid, dhs)

    # check file size
    pipe = subprocess.Popen("zcat %s | wc -l" % location_file,
                            stdout=subprocess.PIPE,
                            shell=True)
    Ns = int(pipe.communicate()[0].strip())

    try:
        chipobj = loadutils.ChipSeq('Gm12878', loadutils.factormap[pwmid])
    except:
        pass

    readobj = loadutils.Dnase(sample=sample)
    readhandle = loadutils.ZipFile(location_file)
    loops = Ns / batch

    if sample is None:
        handle = gzip.open(
            "%s/cache/combined/%s_short_bound.bed.gz" % (projpath, pwmid),
            'wb')
    else:
        handle = gzip.open(
            "%s/cache/separate/%s_%d_%s_short_bound.bed.gz" %
            (projpath, pwmid, cutk, sample), 'wb')
    towrite = [
        'Chr', 'Start', 'Stop', 'Strand', 'PwmScore', 'LogPosOdds',
        'LogPriorOdds', 'MultLikeRatio', 'NegBinLikeRatio', 'ChipseqReads'
    ]
    handle.write('\t'.join(towrite) + '\n')

    totalreads = []
    for n in xrange(loops):
        starttime = time.time()
        # read locations from file
        locations = readhandle.read(chunk=batch)
        if sample not in [None, 'Gm12878', 'Gm12878All']:
            # compute scores at locations for specific sample
            locations = sequence.get_scores(locations, motif)
        locations = sequence.filter_mappability(locations,
                                                width=max([200, L / 2]))

        # read in Dnase read data for locations
        dnasereads, locations, subscores = readobj.getreads(locations,
                                                            width=max(
                                                                [200, L / 2]))
        subscores = np.array(subscores).astype('float')
        subscores = subscores.reshape(subscores.size, 1)
        dnasetotal = dnasereads.sum(1)
        print len(locations)

        if chipseq:
            chipreads = chipobj.getreads(locations, width=max([200, L / 2]))
        else:
            chipreads = None

        # set null footprint distribution
        if cutk == 0:
            null = np.ones((1, L), dtype=float) / L
        else:
            null = sequence.getnull(locations, width=L / 2)

        if L < 400:
            dnasereads = np.hstack((dnasereads[:, 100 - L / 4:100 + L / 4],
                                    dnasereads[:, 300 - L / 4:300 + L / 4]))


#        if cutk==0:
        logodds = centipede.decode(dnasereads,
                                   dnasetotal,
                                   null,
                                   subscores,
                                   footprint,
                                   negbinparams[0],
                                   negbinparams[1],
                                   prior,
                                   dhsprior,
                                   chipreads=chipreads,
                                   damp=damp)
        #        elif cutk==2:
        #            posterior = pbmcentipede.decode(reads, chipreads, subscores, footprint[1:], negbinparams[0], negbinparams[1], prior)

        if not chipseq:
            try:
                chipreads = chipobj.get_total_reads(locations, width=400)
                ignore = [loc.extend(['%.3f'%pos[0],'%.3f'%pos[1],'%.3f'%pos[2],'%.3f'%pos[3],'%d'%c]) \
                    for loc,pos,c in zip(locations,logodds,chipreads)]
            except NameError:
                ignore = [loc.extend(['%.3f'%pos[0],'%.3f'%pos[1],'%.3f'%pos[2],'%.3f'%pos[3]]) \
                    for loc,pos in zip(locations,logodds)]

        locations = [loc for loc in locations if len(loc) > 5]
        ignore = [handle.write('\t'.join(elem) + '\n') for elem in locations]

        print time.time() - starttime

    remain = Ns - loops * batch
    locations = readhandle.read(chunk=remain)
    if sample not in [None, 'Gm12878', 'Gm12878All']:
        # compute scores at locations for specific sample
        locations = sequence.get_scores(locations, motif)
    locations = sequence.filter_mappability(locations, width=max([200, L / 2]))
    dnasereads, locations, subscores = readobj.getreads(locations,
                                                        width=max([200,
                                                                   L / 2]))
    subscores = np.array(subscores)
    subscores = subscores.reshape(subscores.size, 1)
    dnasetotal = dnasereads.sum(1)

    if chipseq:
        chipreads = chipobj.get_total_reads(locations, width=200)
    else:
        chipreads = None

    # set null footprint distribution
    if cutk == 0:
        null = np.ones((1, L), dtype=float) / L
    else:
        null = sequence.getnull(locations, width=L / 2)

    if L < 400:
        dnasereads = np.hstack((dnasereads[:, 100 - L / 4:100 + L / 4],
                                dnasereads[:, 300 - L / 4:300 + L / 4]))

    logodds = centipede.decode(dnasereads,
                               dnasetotal,
                               null,
                               subscores,
                               footprint,
                               negbinparams[0],
                               negbinparams[1],
                               prior,
                               dhsprior,
                               chipreads=chipreads,
                               damp=damp)

    if not chipseq:
        try:
            chipreads = chipobj.get_total_reads(locations, width=400)
            ignore = [loc.extend(['%.3f'%pos[0],'%.3f'%pos[1],'%.3f'%pos[2],'%.3f'%pos[3],'%d'%c]) \
                for loc,pos,c in zip(locations,logodds,chipreads)]
        except NameError:
            ignore = [loc.extend(['%.3f'%pos[0],'%.3f'%pos[1],'%.3f'%pos[2],'%.3f'%pos[3]]) \
                for loc,pos in zip(locations,logodds)]
    locations = [loc for loc in locations if len(loc) > 5]
    ignore = [handle.write('\t'.join(elem) + '\n') for elem in locations]

    readobj.close()
    chipobj.close()
    readhandle.close()
    handle.close()

    sequence.close()
Exemplo n.º 3
0
def infer(pwmid, sample, pwm_thresh=8, pwmbase='transfac', chipseq=False):

    import centipede_pbm as centipede

    model = 'modelC'
    if pwmbase == 'transfac':
        pwms = loadutils.transfac_pwms()
    elif pwmbase == 'selex':
        pwms = loadutils.selex_pwms()
    motif = [val['motif'] for val in pwms.itervalues()
             if val['AC'] == pwmid][0]

    if sample in [None, 'Gm12878', 'Gm12878All']:
        sequence = loadutils.Sequence(sample)
    else:
        indiv_idx = loadutils.read_individuals()
        sequence = loadutils.Sequence(sample, sample_idx=indiv_idx[sample])

    if sample in ['Gm12878', 'Gm12878All']:
        location_file = "/mnt/lustre/home/anilraj/histmod/cache/%s_locationsGm12878_Q%.1f.txt.gz" % (
            pwmid, dhs)
    else:
        location_file = "/mnt/lustre/home/anilraj/histmod/cache/%s_locations_Q%.1f.txt.gz" % (
            pwmid, dhs)

    # check file size
    pipe = subprocess.Popen("zcat %s | wc -l" % location_file,
                            stdout=subprocess.PIPE,
                            shell=True)
    Ns = int(pipe.communicate()[0].strip())

    # load scores
    alllocations = []
    pwm_cutoff = pwm_thresh + 1
    while len(alllocations) < 100:
        pwm_cutoff = pwm_cutoff - 1
        handle = loadutils.ZipFile(location_file)
        alllocations = handle.read(threshold=pwm_cutoff)
        handle.close()
    print "PWM Cutoff = %d" % pwm_cutoff

    # subsample locations, if too many
    if len(alllocations) > 100000:
        scores = np.array([loc[-1] for loc in alllocations]).astype(float)
        indices = np.argsort(scores)[-100000:]
        alllocations = [alllocations[index] for index in indices]
    print "Num of sites for learning, with pwm threshold of %d for %s = %d" % (
        pwm_thresh, pwmid, len(alllocations))

    if sample in [None, 'Gm12878', 'Gm12878All']:
        locs_tolearn = alllocations
    else:
        # compute scores for specific sample at these locations
        starttime = time.time()
        locs_tolearn = sequence.get_scores(alllocations, motif)
        print len(locs_tolearn), time.time() - starttime

    # filter mappability
    print "filtering out unmappable sites ..."
    locs_tolearn = sequence.filter_mappability(locs_tolearn,
                                               width=max([200, L / 2]))

    # load reads and locations
    print "loading dnase reads ..."
    readobj = loadutils.Dnase(sample=sample)
    dnasereads, locs_tolearn, subscores = readobj.getreads(
        locs_tolearn, remove_outliers=True, width=max([200, L / 2]))
    subscores = np.array(subscores)
    subscores = subscores.reshape(subscores.size, 1)
    dnasetotal = dnasereads.sum(1)
    print "Num of mappable sites for learning for %s = %d" % (
        pwmid, len(locs_tolearn))

    if chipseq:
        chipobj = loadutils.ChipSeq('Gm12878', loadutils.factormap[pwmid])
        chipreads = chipobj.get_total_reads(locs_tolearn, width=200)
        chipobj.close()
    else:
        chipreads = None

    if L < 400:
        dnasereads = np.hstack((dnasereads[:, 100 - L / 4:100 + L / 4],
                                dnasereads[:, 300 - L / 4:300 + L / 4]))

    locs_tolearn = [list(loc) for loc in locs_tolearn]
    footprints = []
    priors = []
    negbins = []
    posteriors = []

    null = np.ones((1, L), dtype=float) * 1. / L
    posterior, footprint, negbinparams, prior = centipede.EM(dnasereads,
                                                             dnasetotal,
                                                             subscores,
                                                             null,
                                                             model=model,
                                                             restarts=2)

    posteriors.append(posterior)
    footprints.append(footprint)
    negbins.append(negbinparams)
    priors.append(prior)

    chipobj = loadutils.ChipSeq('Gm12878', loadutils.factormap[pwmid])
    controlobj = loadutils.ChipSeq('Gm12878', loadutils.controlmap[pwmid])
    chipreads = chipobj.get_total_reads(locs_tolearn, width=400)
    controlreads = controlobj.get_total_reads(locs_tolearn, width=200)
    chipobj.close()
    controlobj.close()
    for posterior in posteriors:
        logodds = np.log(posterior[:, 1] / posterior[:, 0])
        logodds[logodds == np.inf] = logodds[logodds != np.inf].max()
        logodds[logodds == -np.inf] = logodds[logodds != -np.inf].min()
        R = stats.pearsonr(logodds, np.sqrt(chipreads))
        R2 = stats.pearsonr(np.sqrt(dnasetotal), np.sqrt(chipreads))

        handle = open(
            '/mnt/lustre/home/anilraj/histmod/cache/chipseq_peaks/%s_peaks.bed'
            % loadutils.factormap[pwmid], 'r')
        calls = [line.strip().split()[:3] for line in handle]
        handle.close()
        macs = dict([(chrom, []) for chrom in utils.chromosomes[:22]])
        [
            macs[call[0]].append([int(call[1]), int(call[2])])
            for call in calls if call[0] in utils.chromosomes[:22]
        ]

        bsites = [
            locs_tolearn[i] for i, p in enumerate(posterior[:, 1]) if p > 0.99
        ]
        F, precision, sensitivity, ig = Fscore.Fscore(bsites, macs)
        chipauc, tpr, positive, negative = compute_chip_auc(
            chipreads, controlreads, logodds, macs, locs_tolearn)
        print pwmid, model, sample, R, R2, chipauc, tpr, F, precision, sensitivity

    output = {'footprint': footprints, \
            'negbin': negbins, \
            'prior': priors, \
            'posterior': posteriors, \
            'locations': locs_tolearn}

    if sample is None:
        handle = open(
            "%s/cache/combined/pbmcentipede_%s_%s.pkl" %
            (projpath, model, pwmid), 'w')
    else:
        handle = open(
            "%s/cache/separate/pbmcentipede_%s_%s_%s.pkl" %
            (projpath, model, pwmid, sample), 'w')
    cPickle.Pickler(handle, protocol=2).dump(output)
    handle.close()

    readobj.close()
    sequence.close()
Exemplo n.º 4
0
def compute_correlation(file, pwmid):

    condition = 0

    # check file size
    pipe = subprocess.Popen("zcat %s | wc -l" % file,
                            stdout=subprocess.PIPE,
                            shell=True)
    Ns = int(pipe.communicate()[0].strip())

    handle = loadutils.ZipFile(file)
    if Ns < batch:
        blocs = handle.read()
        chipreads = np.sqrt(
            [int(loc[-1]) for loc in blocs if float(loc[-5]) > condition])
        logodds = np.array(
            [float(loc[-5]) for loc in blocs if float(loc[-5]) > condition])
        scores = np.array(
            [float(loc[-6]) for loc in blocs if float(loc[-5]) > condition])
        locs = [loc for loc in blocs if float(loc[-5]) > condition]
    else:
        loops = Ns / batch
        chipreads = []
        logodds = []
        scores = []
        locs = []
        for num in xrange(loops):
            blocs = handle.read(chunk=batch)
            chipreads.extend(
                np.sqrt([
                    int(loc[-1]) for loc in blocs if float(loc[-5]) > condition
                ]))
            logodds.extend([
                float(loc[-5]) for loc in blocs if float(loc[-5]) > condition
            ])
            scores.extend([
                float(loc[-6]) for loc in blocs if float(loc[-5]) > condition
            ])
            locs.extend([loc for loc in blocs if float(loc[-5]) > condition])
        remain = Ns - loops * batch
        blocs = handle.read(chunk=remain)
        chipreads.extend(
            np.sqrt(
                [int(loc[-1]) for loc in blocs if float(loc[-5]) > condition]))
        logodds.extend(
            [float(loc[-5]) for loc in blocs if float(loc[-5]) > condition])
        scores.extend(
            [float(loc[-6]) for loc in blocs if float(loc[-5]) > condition])
        locs.extend([loc for loc in blocs if float(loc[-5]) > condition])

        chipreads = np.array(chipreads)
        logodds = np.array(logodds)
        scores = np.array(scores)

    bounds = [(1, 5), (5, 9), (9, 13), (13, np.inf)]
    t1 = np.log10(99)
    handle = open(
        '/mnt/lustre/home/anilraj/histmod/cache/chipseq_peaks/%s_peaks.bed' %
        loadutils.factormap[pwmid], 'r')
    calls = [line.strip().split()[:3] for line in handle]
    handle.close()

    macs = dict([(chrom, []) for chrom in utils.chromosomes[:22]])
    [
        macs[call[0]].append([int(call[1]), int(call[2])]) for call in calls
        if call[0] in utils.chromosomes[:22]
    ]

    outhandle = open('%s/cache/combined/%s_pstats.txt' % (projpath, pwmid),
                     'w')
    totaldnase = []
    totalchip = []
    outhandle.write('PWM id = %s\n' % pwmid)
    for bound in bounds:
        mask = np.logical_and(scores >= bound[0], scores < bound[1])
        if mask.sum() < 20:
            continue
        outhandle.write('%d < PwmScore < %d\n' %
                        (bound[0], min([50, bound[1]])))

        sublocs = [
            loc for loc, m, l in zip(locs, mask, logodds) if m and l > t1
        ]
        toextract = [loc for loc, m in zip(locs, mask) if m]

        if 'Gm12878' in file:
            if 'All' in file:
                dnaseobj = loadutils.Dnase(sample='Gm12878All')
            else:
                dnaseobj = loadutils.Dnase(sample='Gm12878')
        else:
            dnaseobj = loadutils.Dnase()
        ig, dnaseread = aggregate(toextract, dnaseobj, width=200)
        dnaseobj.close()
        totaldnase.extend(list(dnaseread))
        totalchip.extend(list(chipreads[mask]))

        corr = stats.pearsonr(np.sqrt(dnaseread), chipreads[mask])
        outhandle.write(
            "Pearson R of sqrt(Chipseq reads) (400 bp) vs sqrt(DNase reads) (200 bp) = %.3f (p-val: %.2e)\n"
            % corr)

        measures = [
            'Log Posterior Odds', 'Log Prior Odds',
            'Multinomial LogLikelihood Ratio',
            'NegBinomial LogLikelihood Ratio'
        ]
        for i, j in enumerate(xrange(-5, -1)):
            proxy = np.array([float(loc[j]) for loc in locs])
            corr = stats.pearsonr(proxy[mask], chipreads[mask])
            outhandle.write(
                "Pearson R of sqrt(Chipseq reads) (400 bp) with %s = %.3f (p-val: %.2e)\n"
                % (measures[i], corr[0], corr[1]))

        U = stats.mannwhitneyu(chipreads[mask][logodds[mask] > t1],
                               chipreads[mask][logodds[mask] <= t1])
        auc = (1. - U[0] / ((logodds[mask] > t1).sum() *
                            (logodds[mask] <= t1).sum()), U[1])
        F = Fscore.Fscore(sublocs, macs)

        outhandle.write("Number of sites with Log Posterior Odds > 2 = %d\n" %
                        np.logical_and(mask, logodds > 2).sum())
        outhandle.write("Precision = %.4f\n" % F[1])
        outhandle.write("Recall = %.4f\n" % F[2])
        outhandle.write("F-score = %.4f\n" % F[0])
        outhandle.write(
            "Distance of true positives to nearest MACS peak: Min = %.0f bp, Median = %.0f bp, Max = %.0f bp\n\n"
            % F[3])

    outhandle.write("%d < PwmScore < %d\n" % (1, 50))
    corr = stats.pearsonr(np.sqrt(totaldnase), totalchip)
    outhandle.write(
        "Pearson R of sqrt(Chipseq reads) (400 bp) vs sqrt(DNase reads) (200 bp) = %.3f (p-val: %.8f)\n"
        % corr)

    for i, j in enumerate(xrange(-5, -1)):
        proxy = np.array([float(loc[j]) for loc in locs])
        corr = stats.pearsonr(proxy, chipreads)
        outhandle.write(
            "Pearson R of sqrt(Chipseq reads) (400 bp) with %s = %.3f (p-val: %.8f)\n"
            % (measures[i], corr[0], corr[1]))

    sublocs = [loc for loc, l in zip(locs, logodds) if l > t1]
    U = stats.mannwhitneyu(chipreads[logodds > t1], chipreads[logodds <= t1])
    auc = (1. - U[0] / ((logodds > t1).sum() * (logodds <= t1).sum()), U[1])
    F = Fscore.Fscore(sublocs, macs)

    outhandle.write("Number of sites with Log Posterior Odds > 2 = %d\n" %
                    (logodds > 2).sum())
    outhandle.write("Precision = %.4f\n" % F[1])
    outhandle.write("Recall = %.4f\n" % F[2])
    outhandle.write("F-score = %.4f\n" % F[0])
    outhandle.write(
        "Distance to true positives nearest MACS peak: Min = %.0f bp, Median = %.0f bp, Max = %.0f bp\n"
        % F[3])

    outhandle.close()
Exemplo n.º 5
0
def plotbound(pwmid, sample=None, cutk=0, pwmbase='transfac'):

    import random
    from matplotlib.backends.backend_pdf import PdfPages

    bounds = [(1, 5), (5, 9), (9, 13), (13, np.inf)]
    labels = ['1 - 5', '5 - 9', '9 - 13', '>13']

    if pwmbase == 'transfac':
        pwms = loadutils.transfac_pwms()
    elif pwmbase == 'selex':
        pwms = loadutils.selex_pwms()

    dnaseobj = loadutils.Dnase(sample=sample)
    chipseqobj = loadutils.ChipSeq('Gm12878', loadutils.factormap[pwmid])
    mnaseobj = loadutils.Mnase(sample=sample)
    indiv_idx = loadutils.read_individuals()
    if sample in [None, 'Gm12878']:
        sequence = loadutils.Sequence(sample, sample_idx=indiv_idx['NA18516'])
    else:
        sequence = loadutils.Sequence(sample, sample_idx=indiv_idx[sample])

    key = [k for k, pwm in pwms.iteritems() if pwm['AC'] == pwmid][0]
    bound_scores = []
    bound_chipreads = []
    unbound_chipreads = []
    dnasemean_bound = []
    mnasemean_bound = []
    chiptotalreads = []
    logodds = []
    score = []
    for bound in bounds:
        # plot mean profile of all bound sites, stratified by PWM score
        all_handle = loadutils.ZipFile("%s/cache/%s_locations_Q%.1f.txt.gz" %
                                       (projpath, pwmid, 95.0))
        if sample is None:
            bound_handle = loadutils.ZipFile(
                "%s/cache/combined/%s_%d_bound_Q%.1f.bed.gz" %
                (projpath, pwmid, cutk, dhs))
        else:
            bound_handle = loadutils.ZipFile(
                "%s/cache/separate/%s_%d_%s_bound_Q%.1f.bed.gz" %
                (projpath, pwmid, cutk, sample, dhs))
        all_locations = all_handle.read(threshold=bound)
        blocs = bound_handle.read(threshold=bound)
        bound_locations = [
            loc[:5] for loc in blocs if float(loc[5]) >= np.log10(99)
        ]
        if len(all_locations) > 2 * len(bound_locations):
            all_locations = random.sample(all_locations,
                                          2 * len(bound_locations))
        unbound_locations = list(
            set(all_locations).difference(set(bound_locations)))

        chiptotalreads.extend([int(loc[-1]) for loc in blocs])
        logodds.extend([float(loc[-2]) for loc in blocs])
        score.extend([float(loc[-3]) for loc in blocs])

        # load DNase and MNase reads
        print bound, len(bound_locations), len(unbound_locations)
        x, y = aggregate(bound_locations, dnaseobj)
        dnasemean_bound.append(x)
        mnasemean_bound.append(aggregate(bound_locations, mnaseobj))

        # Total ChipSeq read counts
        chipreads = chipseqobj.getreads(bound_locations)
        bound_chipreads.extend(chipreads)
        chipreads = chipseqobj.getreads(unbound_locations)
        unbound_chipreads.extend(chipreads)

    chiptotalreads = np.array(chiptotalreads)
    logodds = np.array(logodds)
    score = np.array(score)

    if sample is None:
        title = pwms[key]['NA']
        tag = "_%s_%d_Q%.1f.pdf" % (pwmid, cutk, dhs)
        dnaseprofilefile = "%s/fig/dnaseprofile%s" % (projpath, tag)
        mnaseprofilefile = "%s/fig/mnaseprofile%s" % (projpath, tag)
        chipdistfile = "%s/fig/chipdist%s" % (projpath, tag)
        scatterfile = "%s/fig/scatter%s" % (projpath, tag)
        scoreposfile = "%s/fig/scoreposition%s" % (projpath, tag)
        posagreefile = "%s/fig/posagreement%s.pdf" % (projpath, tag)
    else:
        title = "%s / %s" % (pwms[key]['NA'], sample)
        tag = "_short_%s_%d_%s_Q%.1f" % (pwmid, cutk, sample, dhs)
        dnaseprofilefile = "%s/fig/dnaseprofile%s.pdf" % (projpath, tag)
        mnaseprofilefile = "%s/fig/mnaseprofile%s.pdf" % (projpath, tag)
        chipdistfile = "%s/fig/chipdist%s.pdf" % (projpath, tag)
        scatterfile = "%s/fig/scatter%s.pdf" % (projpath, tag)
        scoreposfile = "%s/fig/scoreposition%s.pdf" % (projpath, tag)
        posagreefile = "%s/fig/posagreement%s.pdf" % (projpath, tag)

    figure = viz.plot_dnaseprofile(dnasemean_bound,
                                   labels,
                                   motiflen=len(pwms[key]['motif']),
                                   title=title)
    figure.savefig(dnaseprofilefile, dpi=300, format='pdf')

    figure = viz.plot_mnaseprofile(mnasemean_bound,
                                   labels,
                                   motiflen=len(pwms[key]['motif']),
                                   title=title)
    figure.savefig(mnaseprofilefile, dpi=300, format='pdf')

    figure = viz.plot_chipseq_distribution(bound_chipreads,
                                           unbound_chipreads,
                                           title=title)
    figure.savefig(chipdistfile, bbox_inches=0, dpi=300, format='pdf')

    figure = viz.plot_chipseq_posterior_correlation(chiptotalreads,
                                                    logodds,
                                                    score,
                                                    title=title)
    figure.savefig(scatterfile, bbox_inches=0, dpi=300, format='pdf')

    dnaseobj.close()
    mnaseobj.close()
    chipseqobj.close()
    sequence.close()
Exemplo n.º 6
0
def plotmodel(pwmid, sample=None, pwmbase='transfac'):

    import centipede_pbm as centipede
    from matplotlib.backends.backend_pdf import PdfPages

    if pwmbase == 'transfac':
        pwms = loadutils.transfac_pwms()
    elif pwmbase == 'selex':
        pwms = loadutils.selex_pwms()

    models = ['modelA', 'modelB']
    meanfootprints = []
    stdfootprints = []
    Logodds = []

    handle = open(
        '/mnt/lustre/home/anilraj/histmod/cache/chipseq_peaks/%s_peaks.bed' %
        loadutils.factormap[pwmid], 'r')
    calls = [line.strip().split()[:3] for line in handle]
    handle.close()
    macs = dict([(chrom, []) for chrom in utils.chromosomes[:22]])
    [
        macs[call[0]].append([int(call[1]), int(call[2])]) for call in calls
        if call[0] in utils.chromosomes[:22]
    ]

    if sample is None:
        statsfile = "%s/fig/stats_short_%s.txt" % (projpath, pwmid)
    else:
        statsfile = "%s/fig/stats_short_%s_%s.txt" % (projpath, pwmid, sample)

    pis = []
    gammas = []
    outhandle = open(statsfile, 'w')

    for model in models:
        if sample is None:
            handle = open(
                "%s/cache/combined/pbmcentipede_%s_short_%s.pkl" %
                (projpath, model, pwmid), 'r')
        else:
            handle = open(
                "%s/cache/separate/pbmcentipede_%s_short_%s_%s.pkl" %
                (projpath, model, pwmid, sample), 'r')
        output = cPickle.load(handle)
        handle.close()
        footparams = output['footprint'][0]
        alpha, tau = output['negbin'][0]
        posterior = output['posterior'][0]
        logodds = np.log(posterior[:, 1] / posterior[:, 0])
        logodds[logodds == np.inf] = logodds[logodds != np.inf].max()
        logodds[logodds == -np.inf] = logodds[logodds != -np.inf].min()
        Logodds.append(logodds)
        means = alpha * (1 - tau) / tau
        outhandle.write('%.2f %.2f\n' % (means[0], means[1]))

        if not 'cascade' in locals():
            locs_tolearn = output['locations']
            dnaseobj = loadutils.Dnase(sample=sample)
            dnasereads, ig, ig = dnaseobj.getreads(locs_tolearn,
                                                   width=max([200, L / 2]))
            if L < 400:
                reads = np.hstack((dnasereads[:, 100 - L / 4:100 + L / 4],
                                   dnasereads[:, 300 - L / 4:300 + L / 4]))
            else:
                reads = dnasereads
            dnasereads = dnasereads.sum(1)
            dnaseobj.close()

            cascade = centipede.Cascade(L)
            cascade.setreads(reads)
            del reads

        if model == 'modelA':
            gammas.append(footparams[0])
            if isinstance(footparams[1], centipede.Pi):
                pi = footparams[1].estim
            else:
                pi = footparams[1]
            pis.append(pi)
            B = footparams[2]
            M1, M2 = centipede.bayes_optimal_estimator(cascade,
                                                       posterior,
                                                       pi,
                                                       B=B,
                                                       model=model)
            meanfoot = M1.inverse_transform()
            stdfoot = (M2.inverse_transform() - meanfoot**2)**0.5
            meanfootprints.append(meanfoot)
            #            stdfootprints.append(stdfoot)
            stdfootprints.append(None)
        elif model == 'modelB':
            gammas.append(footparams[1])
            if isinstance(footparams[2], centipede.Pi):
                pi = footparams[2].estim
            else:
                pi = footparams[2]
            pis.append(pi)
            mu = footparams[3]
            M1, M2 = centipede.bayes_optimal_estimator(cascade,
                                                       posterior,
                                                       pi,
                                                       mu=mu,
                                                       model=model)
            meanfoot = M1.inverse_transform()
            stdfoot = (M2.inverse_transform() - meanfoot**2)**0.5
            meanfootprints.append(meanfoot)
            #            stdfootprints.append(stdfoot)
            stdfootprints.append(None)

    chipobj = loadutils.ChipSeq('Gm12878', loadutils.factormap[pwmid])
    controlobj = loadutils.ChipSeq('Gm12878', loadutils.controlmap[pwmid])
    chipreads = chipobj.get_total_reads(locs_tolearn, width=200)
    controlreads = controlobj.get_total_reads(locs_tolearn, width=200)
    chipobj.close()
    controlobj.close()
    pdb.set_trace()

    #    sequence = loadutils.Sequence(sample)
    #    seqs = sequence.get_sequences(locs_tolearn, width=200)
    #    sequence.close()
    #    pdb.set_trace()
    #    np.savez('tostudy.npz', seq=np.array(seqs), dnase=dnasereads, chip=chipreads)
    #    pdb.set_trace()

    corrC = stats.pearsonr(np.sqrt(dnasereads), np.sqrt(chipreads))
    corrD = stats.pearsonr(np.sqrt(dnasereads), np.sqrt(controlreads))

    handle = open(
        "/mnt/lustre/home/anilraj/histmod/cache/separate/centipede_short_%s_%s.pkl"
        % (pwmid, sample), 'r')
    output = cPickle.load(handle)
    handle.close()
    footprint = output['footprint'][0]
    posterior = output['posterior'][0]
    logodds = np.log(posterior[:, 1] / posterior[:, 0])
    logodds[logodds == np.inf] = logodds[logodds != np.inf].max()
    logodds[logodds == -np.inf] = logodds[logodds != -np.inf].min()
    Logodds.append(logodds)
    meanfootprints.append(footprint)
    stdfootprints.append(None)

    handle = open(
        "/mnt/lustre/home/anilraj/histmod/cache/separate/centipede_damped_short_%s_%s.pkl"
        % (pwmid, sample), 'r')
    output = cPickle.load(handle)
    handle.close()
    footprint = output['footprint'][0]
    posterior = output['posterior'][0]
    logodds = np.log(posterior[:, 1] / posterior[:, 0])
    logodds[logodds == np.inf] = logodds[logodds != np.inf].max()
    logodds[logodds == -np.inf] = logodds[logodds != -np.inf].min()
    Logodds.append(logodds)
    meanfootprints.append(footprint)
    stdfootprints.append(None)

    handle = open(
        "/mnt/lustre/home/anilraj/histmod/cache/separate/centipede_nofoot_short_%s_%s.pkl"
        % (pwmid, sample), 'r')
    output = cPickle.load(handle)
    handle.close()
    posterior = output['posterior'][0]
    logodds = np.log(posterior[:, 1] / posterior[:, 0])
    logodds[logodds == np.inf] = logodds[logodds != np.inf].max()
    logodds[logodds == -np.inf] = logodds[logodds != -np.inf].min()
    Logodds.append(logodds)

    key = [k for k, pwm in pwms.iteritems() if pwm['AC'] == pwmid][0]
    if sample is None:
        title = pwms[key]['NA']
        footprintfile = "%s/fig/footprint_short_%s.pdf" % (projpath, pwmid)
        corrfile = "%s/fig/logoddsCorr_short_%s.pdf" % (projpath, pwmid)
    else:
        title = "%s / %s" % (pwms[key]['NA'], sample)
        footprintfile = "%s/fig/footprint_short_%s_%s.pdf" % (projpath, pwmid,
                                                              sample)
        corrfile = "%s/fig/logoddsCorr_short_%s_%s.pdf" % (projpath, pwmid,
                                                           sample)

    models = [
        'CentipedePBM_M1', 'CentipedePBM_M2', 'Centipede', 'CentipedeDamped'
    ]
    # plot footprints
    pdfhandle = PdfPages(footprintfile)
    figure = viz.plot_footprint(meanfootprints,
                                labels=models,
                                stderr=stdfootprints,
                                motif=pwms[key]['motif'],
                                title=title)
    pdfhandle.savefig(figure)
    models.append('CentipedeNoFoot')
    auc, tpr, positive, negative = compute_chip_auc(chipreads, controlreads,
                                                    Logodds[0], macs,
                                                    locs_tolearn)
    figure = viz.plot_auc(Logodds,
                          positive,
                          negative,
                          labels=models,
                          title=title)
    pdfhandle.savefig(figure)
    T = pis[0].size
    figure = viz.plot.figure()
    subplot = figure.add_subplot(111)
    subplot.scatter(gammas[0].value[0],
                    gammas[1].value[0],
                    s=2**T,
                    marker='o',
                    color=viz.colors[1],
                    label='gamma',
                    alpha=0.5)
    subplot.scatter(pis[0][0],
                    pis[1][0],
                    s=2**T,
                    marker='o',
                    color=viz.colors[0],
                    label='pi',
                    alpha=0.5)
    for i in xrange(1, T):
        subplot.scatter(gammas[0].value[i],
                        gammas[1].value[i],
                        s=2**(T - i),
                        marker='o',
                        color=viz.colors[1],
                        label='_nolabel_',
                        alpha=0.5)
        subplot.scatter(pis[0][i],
                        pis[1][i],
                        s=2**(T - i),
                        marker='o',
                        color=viz.colors[0],
                        label='_nolabel_',
                        alpha=0.5)
    xmin = min([pis[0].min(), pis[1].min()]) - 0.05
    xmax = max([pis[0].max(), pis[1].max()]) + 0.05
    subplot.axis([xmin, xmax, xmin, xmax])
    subplot.set_xlabel('PBM_M1')
    subplot.set_ylabel('PBM_M2')
    legend = subplot.legend(loc=1)
    for text in legend.texts:
        text.set_fontsize('8')
    legend.set_frame_on(False)
    pdfhandle.savefig(figure)
    pdfhandle.close()
    pdb.set_trace()

    pdfhandle = PdfPages(corrfile)
    lo = 0
    for logodds, model in zip(Logodds, models):
        auc, tpr, positive, negative = compute_chip_auc(
            chipreads, controlreads, logodds, macs, locs_tolearn)
        corrA = stats.pearsonr(logodds, np.sqrt(chipreads))
        corrB = stats.pearsonr(logodds, np.sqrt(controlreads))
        corra = stats.pearsonr(logodds[logodds > lo],
                               np.sqrt(chipreads)[logodds > lo])
        corrb = stats.pearsonr(logodds[logodds > lo],
                               np.sqrt(controlreads)[logodds > lo])
        corrc = stats.pearsonr(
            np.sqrt(dnasereads)[logodds > lo],
            np.sqrt(chipreads)[logodds > lo])
        corrd = stats.pearsonr(
            np.sqrt(dnasereads)[logodds > lo],
            np.sqrt(controlreads)[logodds > lo])
        towrite = [
            pwmid, model, corrA, corrB, corrC, corrD, corra, corrb, corrc,
            corrd, auc, tpr, logodds.size, (logodds > np.log(99)).sum()
        ]
        outhandle.write(' '.join(map(str, towrite)) + '\n')
        figure = viz.plot_correlation(np.sqrt(chipreads), logodds, title=model)
        pdfhandle.savefig(figure)

    figure = viz.plot_correlation(np.sqrt(chipreads),
                                  np.sqrt(dnasereads),
                                  xlabel='sqrt(dnase reads)',
                                  title='Total Dnase reads')
    pdfhandle.savefig(figure)
    pdfhandle.close()
    outhandle.close()