def regress(self, x, y):
    """Fit a simple linear regression y = b_0 + b_1*x, then F-test it.

    Side effects: mutates ``x`` in place (shifts it so min(x) becomes 0)
    and sets self.points/xmin/xmax/b_0/b_1/r_2/f_value/f_goal, calls
    setRegressionFormula() and one of the *Significance() callbacks.

    Returns early (computing nothing further) when there are fewer than
    5 points, or when every x value coincides (slope undefined).
    """
    self.points = len(x)
    if len(x) < 5:
        return
    self.xmin = min(x)
    self.xmax = max(x)
    # Shift regression to start with 0
    for i in range(0, self.points):
        x[i] = x[i] - self.xmin
    # 1. Compute the regression coefficients
    n = self.points
    x_mean = 0.0
    y_mean = 0.0
    for i in range(0, self.points):
        y_mean += y[i]
        x_mean += x[i]
    y_mean = y_mean / float(len(y))
    x_mean = x_mean / float(len(x))
    sum_xy_err = 0.0
    sum_x_2_err = 0.0
    for i in range(0, self.points):
        # BUG FIX: the original used plain '=' here, so only the LAST
        # point's contribution survived and the slope was wrong for any
        # real data set. These sums must accumulate over all points.
        sum_xy_err += (x[i] - x_mean) * (y[i] - y_mean)
        sum_x_2_err += (x[i] - x_mean) * (x[i] - x_mean)
    if sum_x_2_err == 0:
        # All the points are located at the same x coordinate, so the
        # slope is undefined; nothing sensible to fit.
        return
    self.b_1 = sum_xy_err / sum_x_2_err
    self.b_0 = y_mean - self.b_1 * x_mean
    self.setRegressionFormula()
    # Now, compute SSR, SSE for the F-test ANOVA
    SSE = 0.0
    SSR = 0.0
    n = len(x)  # number of observations
    p = 1       # number of predictor variables
    for i in range(0, len(x)):
        y_predict = self.b_1 * x[i] + self.b_0
        SSE += (y_predict - y[i]) * (y_predict - y[i])
        SSR += (y_predict - y_mean) * (y_predict - y_mean)
    # Now, r squared = SSR/(SSR + SSE)
    if SSE < POSITIVIE_ZERO:
        # Perfect fit: r_2 is maximal and the F value is forced well
        # above the table threshold so the test reports significance.
        self.r_2 = 1.0
        self.f_value = 0.1 + f_table_value(1, n - 2) * 10
    else:
        self.r_2 = SSR / (SSR + SSE)
        MSM = SSR / (1)       # mean square model: 1 predictor variable
        MSE = SSE / (n - 2)   # mean square error: n-2 degrees of freedom
        self.f_value = MSM / MSE
    # Perform an f-test
    self.f_goal = f_table_value(1, n - 2)
    if self.f_value > 10 * self.f_goal:
        self.veryStrongSignificance()
    elif self.f_value > self.f_goal:
        self.strongSignificance()
    else:
        self.weakSignificance()
def analyse(self, data):
    """Bin the observations by x level and run a one-way ANOVA F-test.

    Each datum contributes datum.y to the bin self.levels[datum.x]
    (assumes self.levels is pre-populated with the candidate x keys —
    TODO confirm against the initializer, which is not visible here).

    Side effects: updates self.ymin/ymax, prunes empty levels, sets
    self.means/alphas/f_value/f_goal/x_mean_effect, and invokes one of
    the *Significance() callbacks.
    """
    # TODO: complain if data set is empty!
    if len(data) < 5:
        return
    # Add the real value to the correct bin, tracking the y range.
    for datum in data:
        self.levels[datum.x].append(datum.y)
        self.ymax = self.ymax if datum.y < self.ymax else datum.y
        self.ymin = self.ymin if datum.y > self.ymin else datum.y
    # Prune away empty levels.
    # BUG FIX: materialize the filter before deleting. On Python 3,
    # filter() is lazy, so deleting from self.levels while the filter
    # is still iterating it raised "dictionary changed size during
    # iteration". list(...) is behavior-identical on Python 2.
    levelsToPrune = list(filter(lambda lvl: len(self.levels[lvl]) == 0,
                                self.levels))
    for lvlDel in levelsToPrune:
        del self.levels[lvlDel]
    # Find the grand mean. Since bins are probably unbalanced
    # (we're crowdsourcing, after all), weight each bin by its size.
    # While doing that, find column means just as well.
    means = {}
    weights = {}
    mu = 0
    total = 0
    for level in self.levels:
        means[level] = sum(self.levels[level]) / float(len(self.levels[level]))
        weights[level] = len(self.levels[level])
        total += weights[level]
    for level in self.levels:
        weights[level] = float(weights[level]) / total
        mu += weights[level] * means[level]
    self.means = means
    self.find1StdDevIntervals()
    # We need at least 2 classes in the data for a between-class test.
    if len(self.levels) > 1:
        # Ok, we have column and grand means: find bin effects (alphas).
        alphas = {}
        for level in self.levels:
            alphas[level] = means[level] - mu
        # Next, find SSA (between-class) and SSE (within-class) sums.
        SSA = 0
        SSE = 0
        for j in self.levels:
            r_j = len(self.levels[j])
            SSA += r_j * alphas[j] * alphas[j]
            for y_ij in self.levels[j]:
                e_ij = y_ij - mu - alphas[j]
                SSE += e_ij * e_ij
        # Now, we have SSA, SSE; compute MSA, MSE.
        # Note that the r computation is a bit sketchy: since we don't
        # have the same number of observations for each label, we have
        # to compute it as a weighted average of all classes, then
        # round and make an integer to get an f-table value.
        # NOTE(review): if r rounds to 1 this divides by zero — confirm
        # callers guarantee more than one observation per class.
        a = len(alphas)
        r = sum(map(lambda i: weights[i] * len(self.levels[i]), self.levels))
        r = int(round(r))
        MSA = SSA / (a - 1)
        MSE = SSE / (a * (r - 1))
        self.f_value = MSA / MSE
        self.f_goal = f_table_value(a - 1, a * (r - 1))
        self.alphas = alphas
        # BUG FIX: build a real list. On Python 3, map() returns a lazy
        # iterator, which would be exhausted after one read of
        # x_mean_effect; a list comprehension matches the Python 2
        # behavior on both versions.
        self.x_mean_effect = [[li, self.means[li], self.alphas[li]]
                              for li in self.levels]
        if self.f_value > 10 * self.f_goal:
            self.veryStrongSignificance()
        elif self.f_value > self.f_goal:
            self.strongSignificance()
        else:
            self.weakSignificance()