예제 #1
0
    def calc_occupancy_scores(self, factor, motif, tf_concs=(1e-6,)):
        """Return ATAC-seq weighted occupancy summaries for one motif.

        For every concentration in `tf_concs` the per-position occupancy is
        computed as
            logistic(log(tf_conc) - score/(R*T)) * atacseq_weights
        and its mean and max over the region are recorded.

        Arguments:
          factor   -- not used by this method; kept so existing callers
                      continue to work.
          motif    -- object supporting len() and exposing `.name`, which is
                      the key into `self.score_cov`.
          tf_concs -- iterable of concentrations; each is passed through
                      numpy.log, so values are expected to be positive.

        Returns:
          (mean_rvs, max_rvs) -- two lists with one entry per concentration,
          in the same order as `tf_concs`.

        `logistic`, `R` and `T` are module-level names defined outside this
        method (presumably the gas constant and temperature -- confirm).
        """
        # Drop the first len(motif)+1 coverage positions and rescale the rest
        # to a maximum of 1. Presumably the trim lines the coverage up with
        # the per-position motif scores -- confirm against score_cov.
        trimmed_atacseq_cov = self.atacseq_cov[len(motif)+1:]
        atacseq_weights = trimmed_atacseq_cov/trimmed_atacseq_cov.max()

        # Independent of the concentration, so computed once.
        scaled_scores = self.score_cov[motif.name]/(R*T)

        mean_rvs = []
        max_rvs = []
        for tf_conc in tf_concs:
            raw_occ = logistic(numpy.log(tf_conc) - scaled_scores)
            occ = raw_occ*atacseq_weights
            mean_rvs.append(occ.mean())
            max_rvs.append(occ.max())
        return mean_rvs, max_rvs
예제 #2
0
def find_optimal_GFE(motif, pks, chipseq_scores, atacseq_signal):
    """Grid-search the GFE value that best explains the ChIP-seq scores.

    For each GFE in numpy.arange(-20, 10, 1.0) the motif's occupancy weights
    are rebuilt with motif.build_occupancy_weights(4, GFE), every peak is
    rescored, and the Spearman correlation between the per-peak mean of
    logistic(-scores/(R*T))*atacseq_signal and `chipseq_scores` is recorded.

    Arguments:
      motif          -- object providing `.name`, len(),
                        build_occupancy_weights() and iter_seq_score().
                        NOTE: it is mutated; on return its weights are those
                        of the last GFE tried, not of the best one.
      pks            -- sequence of peaks supporting len() and exposing `.seq`.
      chipseq_scores -- one value per peak, correlated against the prediction.
      atacseq_signal -- multiplied element-wise with the occupancy matrix.

    Returns:
      (motif.name, best_GFE, max_cor), where best_GFE is the first GFE on the
      grid that attains the maximum correlation.

    Progress is written to stdout for every GFE; the final result is written
    to both stdout and stderr.

    `logistic`, `R`, `T` and `spearmanr` are module-level names defined
    outside this function.
    """
    max_len = max(len(pk) for pk in pks)
    scores = numpy.zeros((len(pks), max_len+1))

    res = []
    for GFE in numpy.arange(-20, 10, 1.0):
        motif.build_occupancy_weights(4, GFE)
        for pk_i, pk in enumerate(pks):
            score_cov = numpy.array(
                [score for pos, score in motif.iter_seq_score(pk.seq)])
            # NOTE(review): the assignment requires score_cov to fit the
            # slice, so all peaks appear to be assumed to have the same
            # length -- confirm against the callers.
            scores[pk_i, len(motif)+1:] = score_cov
        predicted = (logistic(-scores/(R*T))*atacseq_signal).mean(1)
        res.append((spearmanr(predicted, chipseq_scores)[0], GFE))
        sys.stdout.write("%s %s %s\n" % (motif.name, GFE, res[-1]))

    # Locate the best grid point once and reuse it for both the report and
    # the return value.
    max_cor = max(x[0] for x in res)
    best_GFE = [x[1] for x in res if x[0] == max_cor][0]

    summary = "%s %s %s\n" % (motif.name, best_GFE, max_cor)
    sys.stdout.write(summary)
    sys.stderr.write(summary)

    return (motif.name, best_GFE, max_cor)
예제 #3
0
    def calc_summary_stats(self):
        """Build a flat table row of summary statistics for this region.

        Returns:
          (header, rv) -- two parallel lists: `header` holds the column names
          and `rv` the corresponding values, in this order:

            pk_length, ATAC_mean, ATAC_max
            then, for every factor (sorted) that has both a motif in
            self.motifs and an entry in self.chipseq_cov:
              <factor>_<BSID>_mean_ChIPseq_cov   for each ChIP-seq sample
              and, for each of the factor's motifs (sorted by name):
                <motif>_mean_score
                <motif>_max_score
                <motif>_mean_w_pwm_score
                <motif>_mean_occ
                <motif>_max_occ

        `logistic`, `R` and `T` are module-level names defined outside this
        method.
        """
        header = []
        rv = []

        # region size and ATAC-seq coverage
        header.append('pk_length')
        rv.append(self.stop - self.start)

        header.append('ATAC_mean')
        rv.append(self.atacseq_cov.mean())

        header.append('ATAC_max')
        rv.append(self.atacseq_cov.max())

        # factors that have both a motif and ChIP-seq data, already sorted
        factors = sorted(set(motif.factor for name, motif
                            in self.motifs.iteritems()
                        ).intersection(self.chipseq_cov.iterkeys()))

        # Neither of these depends on the factor, so compute them once.
        sorted_motifs = sorted(self.motifs.iteritems())
        log_tf_conc = numpy.log(1e5)

        for factor in factors:
            for BSID, cov in self.chipseq_cov[factor].iteritems():
                header.append('%s_%s_mean_ChIPseq_cov' % (factor, BSID))
                rv.append(cov.mean())

            for motif_name, motif in sorted_motifs:
                # skip motifs that aren't the correct factor
                if factor != motif.factor: continue

                scores = self.score_cov[motif_name]

                header.append('%s_mean_score' % motif_name)
                rv.append(scores.mean())

                # NOTE(review): the column is named "max" but is filled with
                # the minimum. Occupancy below grows as the score shrinks, so
                # this may intentionally be the strongest site -- confirm
                # before renaming the column or changing the value.
                header.append('%s_max_score' % motif_name)
                rv.append(scores.min())

                # ATAC-seq coverage with the first len(motif)+1 positions
                # dropped, rescaled to a maximum of 1
                trimmed_atacseq_cov = self.atacseq_cov[len(motif)+1:]
                atacseq_weights = trimmed_atacseq_cov/trimmed_atacseq_cov.max()

                w_pwm_scores = self.pwm_cov[motif_name]*atacseq_weights
                header.append('%s_mean_w_pwm_score' % motif_name)
                rv.append(w_pwm_scores.mean())

                # accessibility-weighted occupancy at a fixed concentration
                # of 1e5
                raw_occ = logistic(log_tf_conc - scores/(R*T))
                occ = raw_occ*atacseq_weights
                header.append('%s_mean_occ' % motif_name)
                rv.append(occ.mean())

                header.append('%s_max_occ' % motif_name)
                rv.append(occ.max())

        return header, rv