Exemplo n.º 1
0
def decode(pwmid,
           sample,
           cutk=0,
           pwmbase='transfac',
           pos_threshold=np.log10(99),
           chipseq=False):

    import centipede
    import millipede
    import centipede_pbm as pbmcentipede

    if sample in [None, 'Gm12878', 'Gm12878All']:
        sequence = loadutils.Sequence(sample)
    else:
        indiv_idx = loadutils.read_individuals()
        if pwmbase == 'transfac':
            pwms = loadutils.transfac_pwms()
        elif pwmbase == 'selex':
            pwms = loadutils.selex_pwms()
        motif = [
            val['motif'] for val in pwms.itervalues() if val['AC'] == pwmid
        ][0]
        sequence = loadutils.Sequence(sample, sample_idx=indiv_idx[sample])

    if cutk != 0:
        sequence.set_cutrate(sample=sample, k=cutk)

    # use output from Centipede run
    # 0 = Py code, 1 = R code
    if sample is None:
        handle = open(
            "%s/cache/combined/pbmcentipede_short_%s.pkl" % (projpath, pwmid),
            'r')
    else:
        handle = open(
            "%s/cache/separate/pbmcentipede_short_%s_%s.pkl" %
            (projpath, pwmid, sample), 'r')
    output = cPickle.load(handle)
    handle.close()
    if cutk == 0:
        idx = 0
    elif cutk == 2:
        idx = 1
    elif cutk == 4:
        idx = 2
    footprint = output['footprint'][idx]
    negbinparams = output['negbin'][idx]
    prior = output['prior'][idx][0]
    dhsprior = output['prior'][idx][1]

    if sample in ['Gm12878', 'Gm12878All']:
        location_file = "%s/cache/%s_locationsGm12878_Q%.1f.txt.gz" % (
            projpath, pwmid, dhs)
    else:
        location_file = "%s/cache/%s_locations_Q%.1f.txt.gz" % (projpath,
                                                                pwmid, dhs)

    # check file size
    pipe = subprocess.Popen("zcat %s | wc -l" % location_file,
                            stdout=subprocess.PIPE,
                            shell=True)
    Ns = int(pipe.communicate()[0].strip())

    try:
        chipobj = loadutils.ChipSeq('Gm12878', loadutils.factormap[pwmid])
    except:
        pass

    readobj = loadutils.Dnase(sample=sample)
    readhandle = loadutils.ZipFile(location_file)
    loops = Ns / batch

    if sample is None:
        handle = gzip.open(
            "%s/cache/combined/%s_short_bound.bed.gz" % (projpath, pwmid),
            'wb')
    else:
        handle = gzip.open(
            "%s/cache/separate/%s_%d_%s_short_bound.bed.gz" %
            (projpath, pwmid, cutk, sample), 'wb')
    towrite = [
        'Chr', 'Start', 'Stop', 'Strand', 'PwmScore', 'LogPosOdds',
        'LogPriorOdds', 'MultLikeRatio', 'NegBinLikeRatio', 'ChipseqReads'
    ]
    handle.write('\t'.join(towrite) + '\n')

    totalreads = []
    for n in xrange(loops):
        starttime = time.time()
        # read locations from file
        locations = readhandle.read(chunk=batch)
        if sample not in [None, 'Gm12878', 'Gm12878All']:
            # compute scores at locations for specific sample
            locations = sequence.get_scores(locations, motif)
        locations = sequence.filter_mappability(locations,
                                                width=max([200, L / 2]))

        # read in Dnase read data for locations
        dnasereads, locations, subscores = readobj.getreads(locations,
                                                            width=max(
                                                                [200, L / 2]))
        subscores = np.array(subscores).astype('float')
        subscores = subscores.reshape(subscores.size, 1)
        dnasetotal = dnasereads.sum(1)
        print len(locations)

        if chipseq:
            chipreads = chipobj.getreads(locations, width=max([200, L / 2]))
        else:
            chipreads = None

        # set null footprint distribution
        if cutk == 0:
            null = np.ones((1, L), dtype=float) / L
        else:
            null = sequence.getnull(locations, width=L / 2)

        if L < 400:
            dnasereads = np.hstack((dnasereads[:, 100 - L / 4:100 + L / 4],
                                    dnasereads[:, 300 - L / 4:300 + L / 4]))


#        if cutk==0:
        logodds = centipede.decode(dnasereads,
                                   dnasetotal,
                                   null,
                                   subscores,
                                   footprint,
                                   negbinparams[0],
                                   negbinparams[1],
                                   prior,
                                   dhsprior,
                                   chipreads=chipreads,
                                   damp=damp)
        #        elif cutk==2:
        #            posterior = pbmcentipede.decode(reads, chipreads, subscores, footprint[1:], negbinparams[0], negbinparams[1], prior)

        if not chipseq:
            try:
                chipreads = chipobj.get_total_reads(locations, width=400)
                ignore = [loc.extend(['%.3f'%pos[0],'%.3f'%pos[1],'%.3f'%pos[2],'%.3f'%pos[3],'%d'%c]) \
                    for loc,pos,c in zip(locations,logodds,chipreads)]
            except NameError:
                ignore = [loc.extend(['%.3f'%pos[0],'%.3f'%pos[1],'%.3f'%pos[2],'%.3f'%pos[3]]) \
                    for loc,pos in zip(locations,logodds)]

        locations = [loc for loc in locations if len(loc) > 5]
        ignore = [handle.write('\t'.join(elem) + '\n') for elem in locations]

        print time.time() - starttime

    remain = Ns - loops * batch
    locations = readhandle.read(chunk=remain)
    if sample not in [None, 'Gm12878', 'Gm12878All']:
        # compute scores at locations for specific sample
        locations = sequence.get_scores(locations, motif)
    locations = sequence.filter_mappability(locations, width=max([200, L / 2]))
    dnasereads, locations, subscores = readobj.getreads(locations,
                                                        width=max([200,
                                                                   L / 2]))
    subscores = np.array(subscores)
    subscores = subscores.reshape(subscores.size, 1)
    dnasetotal = dnasereads.sum(1)

    if chipseq:
        chipreads = chipobj.get_total_reads(locations, width=200)
    else:
        chipreads = None

    # set null footprint distribution
    if cutk == 0:
        null = np.ones((1, L), dtype=float) / L
    else:
        null = sequence.getnull(locations, width=L / 2)

    if L < 400:
        dnasereads = np.hstack((dnasereads[:, 100 - L / 4:100 + L / 4],
                                dnasereads[:, 300 - L / 4:300 + L / 4]))

    logodds = centipede.decode(dnasereads,
                               dnasetotal,
                               null,
                               subscores,
                               footprint,
                               negbinparams[0],
                               negbinparams[1],
                               prior,
                               dhsprior,
                               chipreads=chipreads,
                               damp=damp)

    if not chipseq:
        try:
            chipreads = chipobj.get_total_reads(locations, width=400)
            ignore = [loc.extend(['%.3f'%pos[0],'%.3f'%pos[1],'%.3f'%pos[2],'%.3f'%pos[3],'%d'%c]) \
                for loc,pos,c in zip(locations,logodds,chipreads)]
        except NameError:
            ignore = [loc.extend(['%.3f'%pos[0],'%.3f'%pos[1],'%.3f'%pos[2],'%.3f'%pos[3]]) \
                for loc,pos in zip(locations,logodds)]
    locations = [loc for loc in locations if len(loc) > 5]
    ignore = [handle.write('\t'.join(elem) + '\n') for elem in locations]

    readobj.close()
    chipobj.close()
    readhandle.close()
    handle.close()

    sequence.close()
Exemplo n.º 2
0
def infer(pwmid, sample, pwm_thresh=8, pwmbase='transfac', chipseq=False):

    import centipede_pbm as centipede

    model = 'modelC'
    if pwmbase == 'transfac':
        pwms = loadutils.transfac_pwms()
    elif pwmbase == 'selex':
        pwms = loadutils.selex_pwms()
    motif = [val['motif'] for val in pwms.itervalues()
             if val['AC'] == pwmid][0]

    if sample in [None, 'Gm12878', 'Gm12878All']:
        sequence = loadutils.Sequence(sample)
    else:
        indiv_idx = loadutils.read_individuals()
        sequence = loadutils.Sequence(sample, sample_idx=indiv_idx[sample])

    if sample in ['Gm12878', 'Gm12878All']:
        location_file = "/mnt/lustre/home/anilraj/histmod/cache/%s_locationsGm12878_Q%.1f.txt.gz" % (
            pwmid, dhs)
    else:
        location_file = "/mnt/lustre/home/anilraj/histmod/cache/%s_locations_Q%.1f.txt.gz" % (
            pwmid, dhs)

    # check file size
    pipe = subprocess.Popen("zcat %s | wc -l" % location_file,
                            stdout=subprocess.PIPE,
                            shell=True)
    Ns = int(pipe.communicate()[0].strip())

    # load scores
    alllocations = []
    pwm_cutoff = pwm_thresh + 1
    while len(alllocations) < 100:
        pwm_cutoff = pwm_cutoff - 1
        handle = loadutils.ZipFile(location_file)
        alllocations = handle.read(threshold=pwm_cutoff)
        handle.close()
    print "PWM Cutoff = %d" % pwm_cutoff

    # subsample locations, if too many
    if len(alllocations) > 100000:
        scores = np.array([loc[-1] for loc in alllocations]).astype(float)
        indices = np.argsort(scores)[-100000:]
        alllocations = [alllocations[index] for index in indices]
    print "Num of sites for learning, with pwm threshold of %d for %s = %d" % (
        pwm_thresh, pwmid, len(alllocations))

    if sample in [None, 'Gm12878', 'Gm12878All']:
        locs_tolearn = alllocations
    else:
        # compute scores for specific sample at these locations
        starttime = time.time()
        locs_tolearn = sequence.get_scores(alllocations, motif)
        print len(locs_tolearn), time.time() - starttime

    # filter mappability
    print "filtering out unmappable sites ..."
    locs_tolearn = sequence.filter_mappability(locs_tolearn,
                                               width=max([200, L / 2]))

    # load reads and locations
    print "loading dnase reads ..."
    readobj = loadutils.Dnase(sample=sample)
    dnasereads, locs_tolearn, subscores = readobj.getreads(
        locs_tolearn, remove_outliers=True, width=max([200, L / 2]))
    subscores = np.array(subscores)
    subscores = subscores.reshape(subscores.size, 1)
    dnasetotal = dnasereads.sum(1)
    print "Num of mappable sites for learning for %s = %d" % (
        pwmid, len(locs_tolearn))

    if chipseq:
        chipobj = loadutils.ChipSeq('Gm12878', loadutils.factormap[pwmid])
        chipreads = chipobj.get_total_reads(locs_tolearn, width=200)
        chipobj.close()
    else:
        chipreads = None

    if L < 400:
        dnasereads = np.hstack((dnasereads[:, 100 - L / 4:100 + L / 4],
                                dnasereads[:, 300 - L / 4:300 + L / 4]))

    locs_tolearn = [list(loc) for loc in locs_tolearn]
    footprints = []
    priors = []
    negbins = []
    posteriors = []

    null = np.ones((1, L), dtype=float) * 1. / L
    posterior, footprint, negbinparams, prior = centipede.EM(dnasereads,
                                                             dnasetotal,
                                                             subscores,
                                                             null,
                                                             model=model,
                                                             restarts=2)

    posteriors.append(posterior)
    footprints.append(footprint)
    negbins.append(negbinparams)
    priors.append(prior)

    chipobj = loadutils.ChipSeq('Gm12878', loadutils.factormap[pwmid])
    controlobj = loadutils.ChipSeq('Gm12878', loadutils.controlmap[pwmid])
    chipreads = chipobj.get_total_reads(locs_tolearn, width=400)
    controlreads = controlobj.get_total_reads(locs_tolearn, width=200)
    chipobj.close()
    controlobj.close()
    for posterior in posteriors:
        logodds = np.log(posterior[:, 1] / posterior[:, 0])
        logodds[logodds == np.inf] = logodds[logodds != np.inf].max()
        logodds[logodds == -np.inf] = logodds[logodds != -np.inf].min()
        R = stats.pearsonr(logodds, np.sqrt(chipreads))
        R2 = stats.pearsonr(np.sqrt(dnasetotal), np.sqrt(chipreads))

        handle = open(
            '/mnt/lustre/home/anilraj/histmod/cache/chipseq_peaks/%s_peaks.bed'
            % loadutils.factormap[pwmid], 'r')
        calls = [line.strip().split()[:3] for line in handle]
        handle.close()
        macs = dict([(chrom, []) for chrom in utils.chromosomes[:22]])
        [
            macs[call[0]].append([int(call[1]), int(call[2])])
            for call in calls if call[0] in utils.chromosomes[:22]
        ]

        bsites = [
            locs_tolearn[i] for i, p in enumerate(posterior[:, 1]) if p > 0.99
        ]
        F, precision, sensitivity, ig = Fscore.Fscore(bsites, macs)
        chipauc, tpr, positive, negative = compute_chip_auc(
            chipreads, controlreads, logodds, macs, locs_tolearn)
        print pwmid, model, sample, R, R2, chipauc, tpr, F, precision, sensitivity

    output = {'footprint': footprints, \
            'negbin': negbins, \
            'prior': priors, \
            'posterior': posteriors, \
            'locations': locs_tolearn}

    if sample is None:
        handle = open(
            "%s/cache/combined/pbmcentipede_%s_%s.pkl" %
            (projpath, model, pwmid), 'w')
    else:
        handle = open(
            "%s/cache/separate/pbmcentipede_%s_%s_%s.pkl" %
            (projpath, model, pwmid, sample), 'w')
    cPickle.Pickler(handle, protocol=2).dump(output)
    handle.close()

    readobj.close()
    sequence.close()
Exemplo n.º 3
0
def plotbound(pwmid, sample=None, cutk=0, pwmbase='transfac'):

    import random
    from matplotlib.backends.backend_pdf import PdfPages

    bounds = [(1, 5), (5, 9), (9, 13), (13, np.inf)]
    labels = ['1 - 5', '5 - 9', '9 - 13', '>13']

    if pwmbase == 'transfac':
        pwms = loadutils.transfac_pwms()
    elif pwmbase == 'selex':
        pwms = loadutils.selex_pwms()

    dnaseobj = loadutils.Dnase(sample=sample)
    chipseqobj = loadutils.ChipSeq('Gm12878', loadutils.factormap[pwmid])
    mnaseobj = loadutils.Mnase(sample=sample)
    indiv_idx = loadutils.read_individuals()
    if sample in [None, 'Gm12878']:
        sequence = loadutils.Sequence(sample, sample_idx=indiv_idx['NA18516'])
    else:
        sequence = loadutils.Sequence(sample, sample_idx=indiv_idx[sample])

    key = [k for k, pwm in pwms.iteritems() if pwm['AC'] == pwmid][0]
    bound_scores = []
    bound_chipreads = []
    unbound_chipreads = []
    dnasemean_bound = []
    mnasemean_bound = []
    chiptotalreads = []
    logodds = []
    score = []
    for bound in bounds:
        # plot mean profile of all bound sites, stratified by PWM score
        all_handle = loadutils.ZipFile("%s/cache/%s_locations_Q%.1f.txt.gz" %
                                       (projpath, pwmid, 95.0))
        if sample is None:
            bound_handle = loadutils.ZipFile(
                "%s/cache/combined/%s_%d_bound_Q%.1f.bed.gz" %
                (projpath, pwmid, cutk, dhs))
        else:
            bound_handle = loadutils.ZipFile(
                "%s/cache/separate/%s_%d_%s_bound_Q%.1f.bed.gz" %
                (projpath, pwmid, cutk, sample, dhs))
        all_locations = all_handle.read(threshold=bound)
        blocs = bound_handle.read(threshold=bound)
        bound_locations = [
            loc[:5] for loc in blocs if float(loc[5]) >= np.log10(99)
        ]
        if len(all_locations) > 2 * len(bound_locations):
            all_locations = random.sample(all_locations,
                                          2 * len(bound_locations))
        unbound_locations = list(
            set(all_locations).difference(set(bound_locations)))

        chiptotalreads.extend([int(loc[-1]) for loc in blocs])
        logodds.extend([float(loc[-2]) for loc in blocs])
        score.extend([float(loc[-3]) for loc in blocs])

        # load DNase and MNase reads
        print bound, len(bound_locations), len(unbound_locations)
        x, y = aggregate(bound_locations, dnaseobj)
        dnasemean_bound.append(x)
        mnasemean_bound.append(aggregate(bound_locations, mnaseobj))

        # Total ChipSeq read counts
        chipreads = chipseqobj.getreads(bound_locations)
        bound_chipreads.extend(chipreads)
        chipreads = chipseqobj.getreads(unbound_locations)
        unbound_chipreads.extend(chipreads)

    chiptotalreads = np.array(chiptotalreads)
    logodds = np.array(logodds)
    score = np.array(score)

    if sample is None:
        title = pwms[key]['NA']
        tag = "_%s_%d_Q%.1f.pdf" % (pwmid, cutk, dhs)
        dnaseprofilefile = "%s/fig/dnaseprofile%s" % (projpath, tag)
        mnaseprofilefile = "%s/fig/mnaseprofile%s" % (projpath, tag)
        chipdistfile = "%s/fig/chipdist%s" % (projpath, tag)
        scatterfile = "%s/fig/scatter%s" % (projpath, tag)
        scoreposfile = "%s/fig/scoreposition%s" % (projpath, tag)
        posagreefile = "%s/fig/posagreement%s.pdf" % (projpath, tag)
    else:
        title = "%s / %s" % (pwms[key]['NA'], sample)
        tag = "_short_%s_%d_%s_Q%.1f" % (pwmid, cutk, sample, dhs)
        dnaseprofilefile = "%s/fig/dnaseprofile%s.pdf" % (projpath, tag)
        mnaseprofilefile = "%s/fig/mnaseprofile%s.pdf" % (projpath, tag)
        chipdistfile = "%s/fig/chipdist%s.pdf" % (projpath, tag)
        scatterfile = "%s/fig/scatter%s.pdf" % (projpath, tag)
        scoreposfile = "%s/fig/scoreposition%s.pdf" % (projpath, tag)
        posagreefile = "%s/fig/posagreement%s.pdf" % (projpath, tag)

    figure = viz.plot_dnaseprofile(dnasemean_bound,
                                   labels,
                                   motiflen=len(pwms[key]['motif']),
                                   title=title)
    figure.savefig(dnaseprofilefile, dpi=300, format='pdf')

    figure = viz.plot_mnaseprofile(mnasemean_bound,
                                   labels,
                                   motiflen=len(pwms[key]['motif']),
                                   title=title)
    figure.savefig(mnaseprofilefile, dpi=300, format='pdf')

    figure = viz.plot_chipseq_distribution(bound_chipreads,
                                           unbound_chipreads,
                                           title=title)
    figure.savefig(chipdistfile, bbox_inches=0, dpi=300, format='pdf')

    figure = viz.plot_chipseq_posterior_correlation(chiptotalreads,
                                                    logodds,
                                                    score,
                                                    title=title)
    figure.savefig(scatterfile, bbox_inches=0, dpi=300, format='pdf')

    dnaseobj.close()
    mnaseobj.close()
    chipseqobj.close()
    sequence.close()
Exemplo n.º 4
0
def decode(pwmid, sample, cutk=0, pwmbase='transfac', pos_threshold=np.log10(99), chipseq=False):

    import centipede
    import millipede
    import centipede_pbm as pbmcentipede

    if sample in [None,'Gm12878','Gm12878All']:
        sequence = loadutils.Sequence(sample)
    else:
        indiv_idx = loadutils.read_individuals()
        if pwmbase=='transfac':
            pwms = loadutils.transfac_pwms()
        elif pwmbase=='selex':
            pwms = loadutils.selex_pwms()
        motif = [val['motif'] for val in pwms.itervalues() if val['AC']==pwmid][0]
        sequence = loadutils.Sequence(sample, sample_idx=indiv_idx[sample])

    if cutk!=0:
        sequence.set_cutrate(sample=sample, k=cutk)

    # use output from Centipede run
    # 0 = Py code, 1 = R code
    if sample is None:
        handle = open("%s/cache/combined/pbmcentipede_short_%s.pkl"%(projpath,pwmid),'r')
    else:
        handle = open("%s/cache/separate/pbmcentipede_short_%s_%s.pkl"%(projpath,pwmid,sample),'r')
    output = cPickle.load(handle)
    handle.close()
    if cutk==0:
        idx = 0
    elif cutk==2:
        idx = 1
    elif cutk==4:
        idx = 2
    footprint = output['footprint'][idx]
    negbinparams = output['negbin'][idx]
    prior = output['prior'][idx][0]
    dhsprior = output['prior'][idx][1]

    if sample in ['Gm12878','Gm12878All']:
        location_file = "%s/cache/%s_locationsGm12878_Q%.1f.txt.gz"%(projpath,pwmid,dhs)
    else:
        location_file = "%s/cache/%s_locations_Q%.1f.txt.gz"%(projpath,pwmid,dhs)

    # check file size
    pipe = subprocess.Popen("zcat %s | wc -l"%location_file, stdout=subprocess.PIPE, shell=True)
    Ns = int(pipe.communicate()[0].strip())

    try:
        chipobj = loadutils.ChipSeq('Gm12878',loadutils.factormap[pwmid])
    except:
        pass

    readobj = loadutils.Dnase(sample=sample)
    readhandle = loadutils.ZipFile(location_file)
    loops = Ns/batch

    if sample is None:
        handle = gzip.open("%s/cache/combined/%s_short_bound.bed.gz"%(projpath,pwmid),'wb')
    else:
        handle = gzip.open("%s/cache/separate/%s_%d_%s_short_bound.bed.gz"%(projpath,pwmid,cutk,sample),'wb')
    towrite = ['Chr','Start','Stop','Strand','PwmScore','LogPosOdds','LogPriorOdds','MultLikeRatio','NegBinLikeRatio','ChipseqReads']
    handle.write('\t'.join(towrite)+'\n')

    totalreads = []
    for n in xrange(loops):
        starttime = time.time()
        # read locations from file
        locations = readhandle.read(chunk=batch)
        if sample not in [None,'Gm12878','Gm12878All']:
            # compute scores at locations for specific sample
            locations = sequence.get_scores(locations, motif)
        locations = sequence.filter_mappability(locations, width=max([200,L/2]))

        # read in Dnase read data for locations
        dnasereads, locations, subscores = readobj.getreads(locations, width=max([200,L/2]))
        subscores = np.array(subscores).astype('float')
        subscores = subscores.reshape(subscores.size,1)
        dnasetotal = dnasereads.sum(1)
        print len(locations)

        if chipseq:
            chipreads = chipobj.getreads(locations, width=max([200,L/2]))
        else:
            chipreads = None

        # set null footprint distribution
        if cutk==0:
            null = np.ones((1,L),dtype=float)/L
        else:
            null = sequence.getnull(locations, width=L/2)

        if L<400:
            dnasereads = np.hstack((dnasereads[:,100-L/4:100+L/4],dnasereads[:,300-L/4:300+L/4]))

#        if cutk==0:
        logodds = centipede.decode(dnasereads, dnasetotal, null, subscores, footprint, negbinparams[0], negbinparams[1], prior, dhsprior, chipreads=chipreads, damp=damp)
#        elif cutk==2:
#            posterior = pbmcentipede.decode(reads, chipreads, subscores, footprint[1:], negbinparams[0], negbinparams[1], prior)

        if not chipseq:
            try:
                chipreads = chipobj.get_total_reads(locations, width=400)
                ignore = [loc.extend(['%.3f'%pos[0],'%.3f'%pos[1],'%.3f'%pos[2],'%.3f'%pos[3],'%d'%c]) \
                    for loc,pos,c in zip(locations,logodds,chipreads)]
            except NameError:
                ignore = [loc.extend(['%.3f'%pos[0],'%.3f'%pos[1],'%.3f'%pos[2],'%.3f'%pos[3]]) \
                    for loc,pos in zip(locations,logodds)]

        locations = [loc for loc in locations if len(loc)>5]
        ignore = [handle.write('\t'.join(elem)+'\n') for elem in locations]

        print time.time()-starttime

    remain = Ns-loops*batch
    locations = readhandle.read(chunk=remain)
    if sample not in [None,'Gm12878','Gm12878All']:
        # compute scores at locations for specific sample
        locations = sequence.get_scores(locations, motif)
    locations = sequence.filter_mappability(locations, width=max([200,L/2]))
    dnasereads, locations, subscores = readobj.getreads(locations, width=max([200,L/2]))
    subscores = np.array(subscores)
    subscores = subscores.reshape(subscores.size,1)
    dnasetotal = dnasereads.sum(1)

    if chipseq:
        chipreads = chipobj.get_total_reads(locations, width=200)
    else:
        chipreads = None

    # set null footprint distribution
    if cutk==0:
        null = np.ones((1,L),dtype=float)/L
    else:
        null = sequence.getnull(locations, width=L/2)

    if L<400:
        dnasereads = np.hstack((dnasereads[:,100-L/4:100+L/4],dnasereads[:,300-L/4:300+L/4]))

    logodds = centipede.decode(dnasereads, dnasetotal, null, subscores, footprint, negbinparams[0], negbinparams[1], prior, dhsprior, chipreads=chipreads, damp=damp)

    if not chipseq:
        try:
            chipreads = chipobj.get_total_reads(locations, width=400)
            ignore = [loc.extend(['%.3f'%pos[0],'%.3f'%pos[1],'%.3f'%pos[2],'%.3f'%pos[3],'%d'%c]) \
                for loc,pos,c in zip(locations,logodds,chipreads)]
        except NameError:
            ignore = [loc.extend(['%.3f'%pos[0],'%.3f'%pos[1],'%.3f'%pos[2],'%.3f'%pos[3]]) \
                for loc,pos in zip(locations,logodds)]
    locations = [loc for loc in locations if len(loc)>5]
    ignore = [handle.write('\t'.join(elem)+'\n') for elem in locations]

    readobj.close()
    chipobj.close()
    readhandle.close()
    handle.close()

    sequence.close()
Exemplo n.º 5
0
def infer(pwmid, sample, pwm_thresh=8, pwmbase='transfac', chipseq=False):

    import centipede_pbm as centipede

    model = 'modelC'
    if pwmbase=='transfac':
        pwms = loadutils.transfac_pwms()
    elif pwmbase=='selex':
        pwms = loadutils.selex_pwms()
    motif = [val['motif'] for val in pwms.itervalues() if val['AC']==pwmid][0]

    if sample in [None,'Gm12878','Gm12878All']:
        sequence = loadutils.Sequence(sample)
    else:
        indiv_idx = loadutils.read_individuals()
        sequence = loadutils.Sequence(sample, sample_idx=indiv_idx[sample])

    if sample in ['Gm12878','Gm12878All']:
        location_file = "/mnt/lustre/home/anilraj/histmod/cache/%s_locationsGm12878_Q%.1f.txt.gz"%(pwmid,dhs)
    else:
        location_file = "/mnt/lustre/home/anilraj/histmod/cache/%s_locations_Q%.1f.txt.gz"%(pwmid,dhs)

    # check file size
    pipe = subprocess.Popen("zcat %s | wc -l"%location_file, stdout=subprocess.PIPE, shell=True)
    Ns = int(pipe.communicate()[0].strip())

    # load scores
    alllocations = []
    pwm_cutoff = pwm_thresh+1
    while len(alllocations)<100:
        pwm_cutoff = pwm_cutoff - 1
        handle = loadutils.ZipFile(location_file)
        alllocations = handle.read(threshold=pwm_cutoff)
        handle.close()
    print "PWM Cutoff = %d"%pwm_cutoff

    # subsample locations, if too many
    if len(alllocations)>100000:
        scores = np.array([loc[-1] for loc in alllocations]).astype(float)
        indices = np.argsort(scores)[-100000:]
        alllocations = [alllocations[index] for index in indices]
    print "Num of sites for learning, with pwm threshold of %d for %s = %d"%(pwm_thresh, pwmid, len(alllocations))

    if sample in [None,'Gm12878','Gm12878All']:
        locs_tolearn = alllocations
    else:
        # compute scores for specific sample at these locations
        starttime = time.time()
        locs_tolearn = sequence.get_scores(alllocations, motif)
        print len(locs_tolearn), time.time()-starttime

    # filter mappability
    print "filtering out unmappable sites ..."
    locs_tolearn = sequence.filter_mappability(locs_tolearn, width=max([200,L/2]))

    # load reads and locations
    print "loading dnase reads ..."
    readobj = loadutils.Dnase(sample=sample)
    dnasereads, locs_tolearn, subscores = readobj.getreads(locs_tolearn, remove_outliers=True, width=max([200,L/2]))
    subscores = np.array(subscores)
    subscores = subscores.reshape(subscores.size,1)
    dnasetotal = dnasereads.sum(1)
    print "Num of mappable sites for learning for %s = %d"%(pwmid,len(locs_tolearn))

    if chipseq:
        chipobj = loadutils.ChipSeq('Gm12878',loadutils.factormap[pwmid])
        chipreads = chipobj.get_total_reads(locs_tolearn, width=200)
        chipobj.close()
    else:
        chipreads = None

    if L<400:
        dnasereads = np.hstack((dnasereads[:,100-L/4:100+L/4],dnasereads[:,300-L/4:300+L/4]))
    
    locs_tolearn = [list(loc) for loc in locs_tolearn]
    footprints = []
    priors = []
    negbins = []
    posteriors = []
    
    null = np.ones((1,L),dtype=float)*1./L
    posterior, footprint, negbinparams, prior = centipede.EM(dnasereads, dnasetotal, subscores, null, model=model, restarts=2)

    posteriors.append(posterior)
    footprints.append(footprint)
    negbins.append(negbinparams)
    priors.append(prior)

    chipobj = loadutils.ChipSeq('Gm12878',loadutils.factormap[pwmid])
    controlobj = loadutils.ChipSeq('Gm12878',loadutils.controlmap[pwmid])
    chipreads = chipobj.get_total_reads(locs_tolearn, width=400)
    controlreads = controlobj.get_total_reads(locs_tolearn, width=200)
    chipobj.close()
    controlobj.close()    
    for posterior in posteriors:
        logodds = np.log(posterior[:,1]/posterior[:,0])
        logodds[logodds==np.inf] = logodds[logodds!=np.inf].max()
        logodds[logodds==-np.inf] = logodds[logodds!=-np.inf].min()
        R = stats.pearsonr(logodds, np.sqrt(chipreads))
        R2 = stats.pearsonr(np.sqrt(dnasetotal), np.sqrt(chipreads))

        handle = open('/mnt/lustre/home/anilraj/histmod/cache/chipseq_peaks/%s_peaks.bed'%loadutils.factormap[pwmid],'r')
        calls = [line.strip().split()[:3] for line in handle]
        handle.close()
        macs = dict([(chrom,[]) for chrom in utils.chromosomes[:22]])
        [macs[call[0]].append([int(call[1]),int(call[2])]) for call in calls if call[0] in utils.chromosomes[:22]]

        bsites = [locs_tolearn[i] for i,p in enumerate(posterior[:,1]) if p>0.99]
        F, precision, sensitivity, ig = Fscore.Fscore(bsites, macs)
        chipauc, tpr, positive, negative = compute_chip_auc(chipreads, controlreads, logodds, macs, locs_tolearn)
        print pwmid, model, sample, R, R2, chipauc, tpr, F, precision, sensitivity

    output = {'footprint': footprints, \
            'negbin': negbins, \
            'prior': priors, \
            'posterior': posteriors, \
            'locations': locs_tolearn}

    if sample is None:
        handle = open("%s/cache/combined/pbmcentipede_%s_%s.pkl"%(projpath,model,pwmid),'w')
    else:
        handle = open("%s/cache/separate/pbmcentipede_%s_%s_%s.pkl"%(projpath,model,pwmid,sample),'w')
    cPickle.Pickler(handle, protocol=2).dump(output)
    handle.close()

    readobj.close()
    sequence.close()
Exemplo n.º 6
0
def plotbound(pwmid, sample=None, cutk=0, pwmbase='transfac'):

    import random
    from matplotlib.backends.backend_pdf import PdfPages

    bounds = [(1,5),(5,9),(9,13),(13,np.inf)]
    labels = ['1 - 5', '5 - 9', '9 - 13', '>13']
    
    if pwmbase=='transfac':
        pwms = loadutils.transfac_pwms()
    elif pwmbase=='selex':
        pwms = loadutils.selex_pwms()

    dnaseobj = loadutils.Dnase(sample=sample)
    chipseqobj = loadutils.ChipSeq('Gm12878',loadutils.factormap[pwmid])
    mnaseobj = loadutils.Mnase(sample=sample)
    indiv_idx = loadutils.read_individuals()
    if sample in [None,'Gm12878']:
        sequence = loadutils.Sequence(sample, sample_idx=indiv_idx['NA18516'])
    else:
        sequence = loadutils.Sequence(sample, sample_idx=indiv_idx[sample])

    key = [k for k,pwm in pwms.iteritems() if pwm['AC']==pwmid][0]
    bound_scores = []
    bound_chipreads = []
    unbound_chipreads = []
    dnasemean_bound = []
    mnasemean_bound = []
    chiptotalreads = []
    logodds = []
    score = []
    for bound in bounds:
        # plot mean profile of all bound sites, stratified by PWM score
        all_handle = loadutils.ZipFile("%s/cache/%s_locations_Q%.1f.txt.gz"%(projpath,pwmid,95.0))
        if sample is None:
            bound_handle = loadutils.ZipFile("%s/cache/combined/%s_%d_bound_Q%.1f.bed.gz"%(projpath,pwmid,cutk,dhs))
        else:
            bound_handle = loadutils.ZipFile("%s/cache/separate/%s_%d_%s_bound_Q%.1f.bed.gz"%(projpath,pwmid,cutk,sample,dhs))
        all_locations = all_handle.read(threshold=bound)
        blocs = bound_handle.read(threshold=bound)
        bound_locations = [loc[:5] for loc in blocs if float(loc[5])>=np.log10(99)]
        if len(all_locations)>2*len(bound_locations):
            all_locations = random.sample(all_locations, 2*len(bound_locations))
        unbound_locations = list(set(all_locations).difference(set(bound_locations)))

        chiptotalreads.extend([int(loc[-1]) for loc in blocs])
        logodds.extend([float(loc[-2]) for loc in blocs])
        score.extend([float(loc[-3]) for loc in blocs])

        # load DNase and MNase reads
        print bound, len(bound_locations), len(unbound_locations)
        x,y = aggregate(bound_locations, dnaseobj)
        dnasemean_bound.append(x)
        mnasemean_bound.append(aggregate(bound_locations, mnaseobj))

        # Total ChipSeq read counts
        chipreads = chipseqobj.getreads(bound_locations)
        bound_chipreads.extend(chipreads)
        chipreads = chipseqobj.getreads(unbound_locations)
        unbound_chipreads.extend(chipreads)

    chiptotalreads = np.array(chiptotalreads)
    logodds = np.array(logodds)
    score = np.array(score)

    if sample is None:
        title = pwms[key]['NA']
        tag = "_%s_%d_Q%.1f.pdf"%(pwmid,cutk,dhs)
        dnaseprofilefile = "%s/fig/dnaseprofile%s"%(projpath,tag)
        mnaseprofilefile = "%s/fig/mnaseprofile%s"%(projpath,tag)
        chipdistfile = "%s/fig/chipdist%s"%(projpath,tag)
        scatterfile = "%s/fig/scatter%s"%(projpath,tag)
        scoreposfile = "%s/fig/scoreposition%s"%(projpath,tag)
        posagreefile = "%s/fig/posagreement%s.pdf"%(projpath,tag)
    else:
        title = "%s / %s"%(pwms[key]['NA'], sample)
        tag = "_short_%s_%d_%s_Q%.1f"%(pwmid,cutk,sample,dhs)
        dnaseprofilefile = "%s/fig/dnaseprofile%s.pdf"%(projpath,tag)
        mnaseprofilefile = "%s/fig/mnaseprofile%s.pdf"%(projpath,tag)
        chipdistfile = "%s/fig/chipdist%s.pdf"%(projpath,tag)
        scatterfile = "%s/fig/scatter%s.pdf"%(projpath,tag)
        scoreposfile = "%s/fig/scoreposition%s.pdf"%(projpath,tag)
        posagreefile = "%s/fig/posagreement%s.pdf"%(projpath,tag)

    figure = viz.plot_dnaseprofile(dnasemean_bound, labels, motiflen=len(pwms[key]['motif']), title=title)
    figure.savefig(dnaseprofilefile, dpi=300, format='pdf')

    figure = viz.plot_mnaseprofile(mnasemean_bound, labels, motiflen=len(pwms[key]['motif']), title=title)
    figure.savefig(mnaseprofilefile, dpi=300, format='pdf')

    figure = viz.plot_chipseq_distribution(bound_chipreads, unbound_chipreads, title=title)
    figure.savefig(chipdistfile, bbox_inches=0, dpi=300, format='pdf')

    figure = viz.plot_chipseq_posterior_correlation(chiptotalreads, logodds, score, title=title)
    figure.savefig(scatterfile, bbox_inches=0, dpi=300, format='pdf')

    dnaseobj.close()
    mnaseobj.close()
    chipseqobj.close()
    sequence.close()