class QuadScoreSample5(QuadScoreSample4):
    """Like QuadScoreSample4, but tracks a global std and mean across partitions."""

    def __init__(self, *args, **kwargs):
        QuadScoreSample4.__init__(self, *args, **kwargs)
        self.global_std = StdFunc()
        self.global_mean = AvgFunc()
        self.global_bounds = [inf, -inf]  # running [min, max] of all scores seen
        self.epsilon = kwargs.get('epsilon', 0.005)

    def evaluate_split(self, stats_list):
        probs = []
        for stat in stats_list:
            if not stat:
                continue
            est, std = stat.est, stat.std
            if not len(stat.vals):
                prob = 0.
            elif std == 0:
                prob = 1.
            else:
                weight = self.weight(max(stat.vals))
                if weight == 0:
                    prob = 1.
                else:
                    bound = max(stat.vals) - min(stat.vals)  # value range (currently unused)
                    prob = (std * (2.58 + 2.58)) / weight
                    prob = 1 - prob / (self.global_bounds[1] - self.global_bounds[0])
                    # Alternative erf-based score, kept for reference:
                    # prob = est + 2.58 * std
                    # if std == 0:
                    #     prob = 1.
                    # else:
                    #     # Prob( (X-mean)^2 < epsilon ) >= 0.95
                    #     w = self.weight(est + 2.58 * std)
                    #     alpha = self.epsilon * abs(est) / w
                    #     prob = math.erf(alpha / (std * math.sqrt(2.)))
            probs.append(prob)
        return np.mean(probs) if probs else 0.

    def evaluate(self, table):
        if not len(table):
            return PartitionStats(self.global_mean.value(),
                                  std=self.global_std.value(),
                                  vals=[])

        # score any rows that have not been evaluated yet
        vals = []
        newvals = []
        for row in table:
            if row[self.SCORE_ID].value == -inf:
                est = self.err_func([row])
                row[self.SCORE_ID] = est
                newvals.append(est)
            vals.append(row[self.SCORE_ID].value)

        samp_size = len(vals)
        newvals = np.array(newvals)

        # fold the new scores into the global statistics
        self.global_std.delta(add=[newvals], update=True)
        self.global_mean.delta(add=[newvals], update=True)
        self.global_bounds[0] = min(self.global_bounds[0], min(vals))
        self.global_bounds[1] = max(self.global_bounds[1], max(vals))

        if samp_size == 1:
            est, std = vals[0], 0.
        else:
            # slightly biased std estimator
            est = np.mean(vals)
            S2 = 1. / (samp_size - 1) * sum([(v - est) ** 2 for v in vals])
            S = math.sqrt(S2)
            std = self.kn(samp_size) * S

        if samp_size > 2:
            _logger.debug('\tsampsize(%d)\t%.4f+-%.4f\t%.4f - %.4f',
                          samp_size, est, std,
                          self.global_bounds[0], self.global_bounds[1])

        return PartitionStats(est, std=std, vals=vals)

    def weight(self, val):
        u = self.global_mean.value()
        std = self.global_std.value()
        if std == 0:
            return 1.

        max_std = 2.58
        #max_std = 1.6

        # weight grows quadratically with the number of stds above the mean,
        # clamped to [0, max_std]
        nstds = (val - u) / std
        nstds = min(max(0, nstds), max_std)
        y = (nstds / max_std) ** 2
        return y

        # Alternative weighting schemes, kept for reference (unreachable):
        #
        # linear scale; hits its maximum around 2.58 - 0.5
        # r = 2.58 + 2.58 + 0.5  # why is a 0.5 here?
        # v = min(r, max(0., (val - u) / std - 0.5))
        # return 0.0001 + (v / r) * (1 - 0.0001)
        #
        # erf-based, rescaled to (0.001, 1)
        # w = .5 * (1 + math.erf((val - u) / math.sqrt(2 * std ** 2)))
        # return 0.001 + w * (1 - 0.001)

    def should_stop(self, table, stats):
        if len(table) <= self.min_points:
            return True

        std, est = stats.std, stats.est

        # stop if every row has identical aggregate-column values
        val, allsame = None, True
        for i, row in enumerate(table):
            if i == 0:
                val = tuple([row[aggcol].value for aggcol in self.aggcols])
            elif val != tuple([row[aggcol].value for aggcol in self.aggcols]):
                allsame = False
                break
        if allsame:
            return True

        # stop once Prob( (X-mean)^2 < epsilon ) >= 0.95
        w = self.weight(est + 2.58 * std)
        if w == 0 or std == 0:
            prob = 1.
        else:
            alpha = math.sqrt(self.epsilon * abs(est) / w)
            #alpha = self.epsilon * 2.58 * self.global_std.value() / w
            #alpha = math.sqrt( self.epsilon * 2 * 2.58 * self.global_std.value() / w )
            prob = math.erf(alpha / (std * math.sqrt(2.)))
        return prob >= 0.95
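# A minimal sketch (not part of the original module) of the statistics behind
# should_stop above: for X ~ N(mu, sigma^2), P(|X - mu| < alpha) =
# erf(alpha / (sigma * sqrt(2))), which is the closed form the stopping rule
# evaluates with alpha = sqrt(epsilon * |est| / w). The helper below checks
# that identity against a Monte Carlo estimate; its name, default arguments,
# and sample count are assumptions made for this illustration only.
def _demo_stop_probability(est=10.0, std=0.5, epsilon=0.005, w=1.0, n=200000):
    """Compare the erf-based stopping probability with a sampled estimate."""
    alpha = math.sqrt(epsilon * abs(est) / w)       # same alpha as should_stop
    prob = math.erf(alpha / (std * math.sqrt(2.)))  # P(|X - est| < alpha)
    draws = np.random.normal(est, std, n)           # Monte Carlo check
    sampled = float(np.mean(np.abs(draws - est) < alpha))
    return prob, sampled  # the two values should agree to ~2 decimal places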
class Evaluator(object):
    def __init__(self, SCORE_ID, errprob, err_funcs, aggcols, epsilon, **kwargs):
        self.global_std = StdFunc()
        self.global_mean = AvgFunc()
        self.global_bounds = [inf, -inf]  # running [min, max] of all scores seen
        self.SCORE_ID = SCORE_ID
        self.err_funcs = err_funcs
        self.aggcols = aggcols
        self.epsilon = epsilon
        self.errprob = errprob
        self.min_points = kwargs.get('min_points', 2)
        self.sampler = Sampler(self.errprob, self.SCORE_ID)

    def kn(self, n):
        """Return kn = sqrt(2/(n-1)) * gamma(n/2) / gamma((n-1)/2), the
        normal-sample correction factor for the sample std S."""
        try:
            return math.sqrt(2. / (n - 1)) * (math.gamma(n / 2.) / math.gamma((n - 1.) / 2.))
        except (OverflowError, ZeroDivisionError, ValueError):
            # gamma overflows for large n (and n=1 divides by zero);
            # kn tends to 1 in that regime anyway
            return 1.

    def evaluate(self, tables, sample=True):
        if not isinstance(tables, list):
            # normalize a single table into a list of tables
            return self.evaluate([tables], sample=sample)

        self.samples = self.sampler(tables) if sample else tables
        if not self.samples:
            return None

        ests, stds, vals = [], [], []
        for table, err_func in zip(self.samples, self.err_funcs):
            est, std, vs = self.evaluate_table(table, err_func)
            ests.append(est)
            stds.append(std)
            vals.extend(vs)
        est = np.mean(ests)
        std = np.mean(stds)

        # SUM aggregates: scale the per-sample estimate up to the full tables
        if len(self.err_funcs) and 'Sum' in str(self.err_funcs[0].klass):
            est = est / sum(map(len, self.samples)) * sum(map(len, tables))

        if len(vals) != sum(map(len, self.samples)):
            raise RuntimeError("# vals != # samples")
        if sample and len(vals) != sum(map(len, tables)):
            raise RuntimeError("# vals != # pts")

        return PartitionStats(est, std=std, vals=vals)

    def evaluate_table(self, table, err_func):
        if not len(table):
            return (err_func([]), 0., [])

        # score any rows that have not been evaluated yet
        vals = []
        newvals = []
        for row in table:
            if row[self.SCORE_ID].value == -inf:
                est = err_func([row])
                row[self.SCORE_ID] = est
                newvals.append(est)
            vals.append(row[self.SCORE_ID].value)

        samp_size = len(vals)
        newvals = np.array(newvals)

        # fold the new scores into the global statistics
        self.global_std.delta(add=[newvals], update=True)
        self.global_mean.delta(add=[newvals], update=True)
        self.global_bounds[0] = min(self.global_bounds[0], min(vals))
        self.global_bounds[1] = max(self.global_bounds[1], max(vals))

        if samp_size == 1:
            est, std = vals[0], 0.
        else:
            # slightly biased std estimator
            est = np.mean(vals)
            S2 = 1. / (samp_size - 1) * sum([(v - est) ** 2 for v in vals])
            S = math.sqrt(S2)
            std = self.kn(samp_size) * S

        if samp_size > 2:
            _logger.debug('\tsampsize(%d)\t%.4f+-%.4f\t%.4f - %.4f',
                          samp_size, est, std,
                          self.global_bounds[0], self.global_bounds[1])

        return est, std, vals

    def weight(self, val):
        u = self.global_mean.value()
        std = self.global_std.value()
        if std == 0:
            return 1.

        max_std = 2.58
        #max_std = 1.6

        # weight grows quadratically with the number of stds above the mean,
        # shifted by 2 and clamped to [0, max_std]
        nstds = (val - u) / std
        nstds = min(max(0, nstds + 2), max_std)
        y = (nstds / max_std) ** 2
        return y

        # Alternative weighting schemes, kept for reference (unreachable):
        #
        # linear scale; hits its maximum around 2.58 - 0.5
        # r = 2.58 + 2.58 + 0.5  # why is a 0.5 here?
        # v = min(r, max(0., (val - u) / std - 0.5))
        # return 0.0001 + (v / r) * (1 - 0.0001)
        #
        # erf-based, rescaled to (0.001, 1)
        # w = .5 * (1 + math.erf((val - u) / math.sqrt(2 * std ** 2)))
        # return 0.001 + w * (1 - 0.001)

    def should_stop(self, tables, bad_stats, good_stats):
        if max(map(len, tables)) <= self.min_points:
            return True

        # previous all-rows-identical check, kept for reference:
        # val, allsame = None, True
        # for i, row in enumerate(table):
        #     if i == 0:
        #         val = tuple([row[aggcol].value for aggcol in self.aggcols])
        #     elif val != tuple([row[aggcol].value for aggcol in self.aggcols]):
        #         allsame = False
        #         break
        # if allsame or std == 0:
        #     return True

        if bad_stats.std == 0:
            return True

        weight = self.weight(max(bad_stats.vals))
        if weight == 0:
            return True

        # stop when the partition's spread (its value range, or a +-2.58 std
        # interval, whichever is wider) falls below a weighted fraction of the
        # global value range
        threshold = (self.global_bounds[1] - self.global_bounds[0]) * self.epsilon / weight
        bounds = max(bad_stats.vals) - min(bad_stats.vals)
        bounds = max(bounds, bad_stats.std * 2.58 * 2)
        return bounds < threshold

        # Alternative weighted-MSE criterion, kept for reference (unreachable):
        # w = self.weight(est + 2.58 * std)
        # wmse = np.mean([self.weight(v) * (abs(v - bad_stats.est)) ** 2
        #                 for v in bad_stats.vals])
        # return wmse < self.epsilon * (self.global_bounds[1] * 0.8)
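# A minimal sketch (not part of the original module) of what Evaluator.kn
# computes: the formula matches the c4(n) constant from normal sampling theory,
# for which the expected sample standard deviation of n normal draws satisfies
# E[S] = kn(n) * sigma. The helper below checks that empirically; its name and
# the trial counts are assumptions made for this illustration only.
def _demo_kn_factor(n=5, trials=50000, sigma=1.0):
    """Empirically verify E[S] ~= kn(n) * sigma for n normal draws."""
    kn = math.sqrt(2. / (n - 1)) * (math.gamma(n / 2.) / math.gamma((n - 1.) / 2.))
    samples = np.random.normal(0., sigma, (trials, n))
    mean_S = float(np.mean(np.std(samples, axis=1, ddof=1)))  # average sample std
    return mean_S, kn * sigma  # e.g. n=5 gives kn ~= 0.94; the values should match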