Example #1
import math
import logging

import numpy as np
from numpy import inf

# StdFunc, AvgFunc, PartitionStats, and the parent scorer QuadScoreSample4
# are assumed to come from the host project.
_logger = logging.getLogger(__name__)


class QuadScoreSample5(QuadScoreSample4):
    """
    uses global std and mean
    """
    def __init__(self, *args, **kwargs):
        QuadScoreSample4.__init__(self, *args, **kwargs)
        self.global_std = StdFunc()
        self.global_mean = AvgFunc()
        self.global_bounds = [float('inf'), float('-inf')]  # running [min, max] of all scores seen
        self.epsilon = kwargs.get('epsilon', 0.005)
    
    def evaluate_split(self, stats_list):
        probs = []
        for stat in stats_list:
            if not stat:
                continue
            std = stat.std
            if not stat.vals:
                prob = 0.
            elif std == 0:
                prob = 1.
            else:
                weight = self.weight(max(stat.vals))
                if weight == 0:
                    prob = 1.
                else:
                    # std * (2.58 + 2.58) is the width of a ~99% normal
                    # confidence interval; normalize it by the weight and the
                    # global value range so tight partitions score near 1
                    prob = (std * (2.58 + 2.58)) / weight
                    prob = 1 - prob / (self.global_bounds[1] - self.global_bounds[0])
            # (an earlier erf-based variant of this score survives in
            # should_stop below)
            probs.append(prob)
        return np.mean(probs) if probs else 0.
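    # Worked example (hypothetical numbers): with std = 2, weight = 0.5, and
    # a global range of 100, the interval term is 2 * 5.16 / 0.5 = 20.64 and
    # the split scores 1 - 20.64 / 100 ~= 0.79; tighter or higher-weighted
    # partitions push the score toward 1.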
            

    def evaluate(self, table):
        if not len(table):
            return PartitionStats(self.global_mean.value(),
                                  std=self.global_std.value(),
                                  vals=[])
        
        vals = []
        newvals = []
        for row in table:
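            # rows that have not been scored yet carry a -inf sentinel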
            if row[self.SCORE_ID].value == -inf:
                est = self.err_func([row])
                row[self.SCORE_ID] = est
                newvals.append(est)
            vals.append(row[self.SCORE_ID].value)
        samp_size = len(vals)

        newvals = np.array(newvals)
        self.global_std.delta(add=[newvals], update=True)
        self.global_mean.delta(add=[newvals], update=True)
        self.global_bounds[0] = min(self.global_bounds[0], min(vals))
        self.global_bounds[1] = max(self.global_bounds[1], max(vals))

        if samp_size == 1:
            est, std = vals[0], 0.
        else:
            # slightly biased std estimator
            est = np.mean(vals)
            S2 = 1. / (samp_size - 1) * sum([(v-est)**2 for v in vals])
            S = math.sqrt(S2)
            std = self.kn(samp_size) * S

        if samp_size > 2:
            _logger.debug('\tsampsize(%d)\t%.4f+-%.4f\t%.4f - %.4f',
                          samp_size,
                          est,
                          std,
                          self.global_bounds[0],
                          self.global_bounds[1]
                           )
        return PartitionStats(est, std=std, vals=vals)

    def weight(self, val):
        u = self.global_mean.value()
        std = self.global_std.value()
        if std == 0:
            return 1.

        max_std = 2.58
        #max_std = 1.6

        # weight increases quadratically.
        nstds = (val - u) / std
        nstds = min(max(0, nstds), max_std)
        y = (nstds / max_std) ** 2
        
        return y

        # Unreachable alternatives kept for reference:
        #
        # linear scale:
        # r = 2.58 + 2.58 + 0.5  # (original note: "why is a 0.5 here?")
        # v = min(r, max(0., (val - u) / std - 0.5))
        # return 0.0001 + (v / r) * (1 - 0.0001)
        #
        # erf-based CDF weight, rescaled to avoid exact zeros:
        # w = .5 * (1 + math.erf((val - u) / math.sqrt(2 * std ** 2)))
        # return 0.001 + w * (1 - 0.001)
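    # Worked weight values (hypothetical, with global mean 0 and global std 1):
    #   weight(0)    = 0.0   (at or below the mean)
    #   weight(1.29) = 0.25  ((1.29 / 2.58) ** 2)
    #   weight(2.58) = 1.0   (clipped at max_std)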

    def should_stop(self, table, stats):
        if len(table) <= self.min_points:
            return True

        std, est = stats.std, stats.est

        # stop if every row agrees on all aggregate-column values
        val, allsame = None, True
        for i, row in enumerate(table):
            cur = tuple(row[aggcol].value for aggcol in self.aggcols)
            if i == 0:
                val = cur
            elif val != cur:
                allsame = False
                break
        if allsame:
            return True

        # Prob( (X-mean)^2 < epsilon ) >= 0.95
        w = self.weight(est + 2.58 * std)
        if w == 0 or std == 0:
            prob = 1.
        else:
            alpha = math.sqrt( self.epsilon * abs(est) / w )
            #alpha = self.epsilon * 2.58 * self.global_std.value() / w
            #alpha = math.sqrt( self.epsilon * 2 * 2.58 * self.global_std.value() / w )
            prob = math.erf(alpha / (std * math.sqrt(2.)))

        return prob >= 0.95
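
A quick standalone check of the stopping rule above. All numbers are
hypothetical, and stop_prob is a helper written for this note, not part of
the class:

import math

def stop_prob(est, std, w, epsilon=0.005):
    # Prob( |X - est| < alpha ) for X ~ N(est, std^2):
    # P = erf(alpha / (std * sqrt(2))), with alpha from the epsilon budget
    alpha = math.sqrt(epsilon * abs(est) / w)
    return math.erf(alpha / (std * math.sqrt(2.)))

# the search stops refining a partition once this reaches 0.95
print(stop_prob(est=10.0, std=0.1, w=0.5))  # ~0.998 -> stop
print(stop_prob(est=10.0, std=2.0, w=0.5))  # ~0.126 -> keep splitting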
Example #2
import math
import logging

import numpy as np
from numpy import inf

# StdFunc, AvgFunc, PartitionStats, and Sampler are assumed to come from
# the host project.
_logger = logging.getLogger(__name__)


class Evaluator(object):

    def __init__(self, SCORE_ID, errprob, err_funcs, aggcols, epsilon, **kwargs):
        self.global_std = StdFunc()
        self.global_mean = AvgFunc()
        self.global_bounds = [float('inf'), float('-inf')]  # running [min, max] of all scores seen

        self.SCORE_ID = SCORE_ID
        self.err_funcs = err_funcs
        self.aggcols = aggcols
        self.epsilon = epsilon
        self.errprob = errprob
        self.min_points = kwargs.get('min_points', 2)

        self.sampler = Sampler(self.errprob, self.SCORE_ID)        

        
    def kn(self, n):
        """
        Return the correction factor kn with E[S] = kn * sigma for the
        sample std S of n normal draws; kn * S is the slightly biased
        (shrunken) std estimate used in evaluate_table.
        """
        try:
            return math.sqrt(2. / (n - 1)) * (math.gamma(n / 2.) / math.gamma((n - 1.) / 2.))
        except (ValueError, ZeroDivisionError, OverflowError):
            return 1.
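    # Sanity check (values computed from the formula above):
    #   kn(2) ~= 0.7979, kn(10) ~= 0.9727, kn(100) ~= 0.9975,
    # so the correction only matters for small samples and kn -> 1 as n grows.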


    def evaluate(self, tables, sample=True):
        if isinstance(tables, list):
            self.samples = self.sampler(tables) if sample else tables

            if not self.samples:
                return None
            
            ests, stds, vals = [], [], []
            for table, err_func in zip(self.samples, self.err_funcs):
                est, std, vs = self.evaluate_table(table, err_func)
                ests.append(est)
                stds.append(std)
                vals.extend(vs)

            est = np.mean(ests)
            std = np.mean(stds)
            
            if len(self.err_funcs) and 'Sum' in str(self.err_funcs[0].klass):
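                # rescale a SUM estimated on the sample up to the full tables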
                est = est / sum(map(len, self.samples)) * sum(map(len, tables))


            if len(vals) != sum(map(len, self.samples)):
                raise RuntimeError("# vals != # samples")

            # disabled consistency check (kept unreachable in the original
            # by indenting it under the raise above):
            # if sample and len(vals) != sum(map(len, tables)):
            #     raise RuntimeError("# vals != # pts")
            return PartitionStats(est, std=std, vals=vals)
        else:
            return self.evaluate([tables], sample=sample)
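
    # Hypothetical usage sketch (argument values are made up; err_funcs,
    # aggcols, and tables come from the host project):
    #   ev = Evaluator(SCORE_ID, 0.95, err_funcs, aggcols, epsilon=0.005)
    #   stats = ev.evaluate(tables)  # a single table is wrapped into a list
    #   done = ev.should_stop(tables, stats, None)  # good_stats is unused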


    def evaluate_table(self, table, err_func):
        if not len(table):
            return err_func([]), 0., []

        
        vals = []
        newvals = []
        for row in table:
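            # rows that have not been scored yet carry a -inf sentinel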
            if row[self.SCORE_ID].value == -inf:
                est = err_func([row])
                row[self.SCORE_ID] = est
                newvals.append(est)
            vals.append(row[self.SCORE_ID].value)
        samp_size = len(vals)


        newvals = np.array(newvals)
        self.global_std.delta(add=[newvals], update=True)
        self.global_mean.delta(add=[newvals], update=True)
        self.global_bounds[0] = min(self.global_bounds[0], min(vals))
        self.global_bounds[1] = max(self.global_bounds[1], max(vals))

        if samp_size == 1:
            est, std = vals[0], 0.
        else:
            # slightly biased std estimator
            est = np.mean(vals)
            S2 = 1. / (samp_size - 1) * sum([(v - est) ** 2 for v in vals])
            S = math.sqrt(S2)
            std = self.kn(samp_size) * S

        if samp_size > 2:
            _logger.debug('\tsampsize(%d)\t%.4f+-%.4f\t%.4f - %.4f',
                          samp_size,
                          est,
                          std,
                          self.global_bounds[0],
                          self.global_bounds[1]
                           )

        return est, std, vals


    def weight(self, val):
        u = self.global_mean.value()
        std = self.global_std.value()
        if std == 0:
            return 1.

        max_std = 2.58
        #max_std = 1.6

        # weight increases quadratically.
        nstds = (val - u) / std
        nstds = min(max(0, nstds + 2), max_std)
        y = (nstds / max_std) ** 2

        return y

        # Unreachable alternatives kept for reference:
        #
        # linear scale:
        # r = 2.58 + 2.58 + 0.5  # (original note: "why is a 0.5 here?")
        # v = min(r, max(0., (val - u) / std - 0.5))
        # return 0.0001 + (v / r) * (1 - 0.0001)
        #
        # erf-based CDF weight, rescaled to avoid exact zeros:
        # w = .5 * (1 + math.erf((val - u) / math.sqrt(2 * std ** 2)))
        # return 0.001 + w * (1 - 0.001)
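    # Note the "+ 2" shift above versus QuadScoreSample5.weight: here even a
    # value at the global mean gets weight (2 / 2.58) ** 2 ~= 0.60, and the
    # weight saturates at 1.0 just 0.58 stds above the mean.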


    def should_stop(self, tables, bad_stats, good_stats):
        if max(map(len,tables)) <= self.min_points:
            return True


        # (an earlier variant also stopped when all rows shared identical
        # aggregate values, as in QuadScoreSample5.should_stop above)
        if bad_stats.std == 0:
            return True


        weight = self.weight(max(bad_stats.vals))
        if weight == 0:
            return True
        # stop once the partition's score spread (or a ~99% confidence
        # interval width, whichever is larger) falls below an epsilon
        # fraction of the weight-scaled global range
        threshold = (self.global_bounds[1] - self.global_bounds[0]) * self.epsilon / weight
        bounds = max(bad_stats.vals) - min(bad_stats.vals)
        bounds = max(bounds, bad_stats.std * 2.58 * 2)
        return bounds < threshold

        # Unreachable alternative: a weighted mean-squared-error test.
        # wmse = np.mean([self.weight(v) * abs(v - bad_stats.est) ** 2
        #                 for v in bad_stats.vals])
        # return wmse < self.epsilon * (self.global_bounds[1] * 0.8)