def header(header_cells):
    """
    Returns a data object after setting it up using the header.

    Parameters
    ----------
    header_cells : list of strings (or mapping of column index -> name)
        All the column names. A plain sequence is enumerated to obtain
        the column indices. An object exposing ``items()`` is iterated as
        (index, name) pairs, which is the only form the previous
        implementation accepted.

    Returns
    -------
    Data object
        Columns whose name starts with ``?`` are skipped. For every other
        column the source index and name are recorded, a ``Num`` is
        created when the name starts with ``<``, ``>`` or ``$`` and a
        ``Sym`` otherwise, the weight is set to -1 for ``<`` and 1 for
        ``>``, and a name starting with ``!`` sets ``class_col``.
    """
    data = Data()

    # The docstring promises a list, but the old code called .items()
    # unconditionally, which fails on a list. Support both shapes.
    if hasattr(header_cells, "items"):
        columns = header_cells.items()
    else:
        columns = enumerate(header_cells)

    for col_index, col_name in columns:
        if re.match(r'\?', col_name):
            continue  # '?' marks a column that is not used

        # NOTE(review): `data.use.len` is read as an attribute, exactly as
        # before. This only works if Data.use exposes `len` itself; if it
        # is a plain list/dict this should be len(data.use) - TODO confirm.
        c = data.use.len
        data.use[c] = col_index
        data.name[c] = col_name

        if re.match(r'[<>\$]', col_name):
            data.nums[c] = Num()
        else:
            data.syms[c] = Sym()

        if re.match(r'<', col_name):
            data.w[c] = -1
        elif re.match(r'>', col_name):
            data.w[c] = 1
        elif re.match(r'!', col_name):
            data.class_col = c
    return data
def header(self, cells):
    """Register the columns named in a header row.

    Parameters
    ----------
    cells : iterable of str
        Column names, in source order.

    Names starting with ``?`` are skipped. For every other name the
    source index is appended to ``self._use`` and the name to
    ``self.name``; ``c`` is the position of the column among the kept
    ones. A ``Num`` is stored in ``self.nums[c]`` when the name starts
    with ``<``, ``>`` or ``$``, otherwise a ``Sym`` is stored in
    ``self.syms[c]``. ``<`` sets ``self.w[c]`` to -1, ``>`` sets it to 1,
    ``!`` sets ``self.label_class``, and any other name has ``c``
    appended to ``self.indeps``.

    Returns
    -------
    None
    """
    for c0, x in enumerate(cells):
        # Raw string: '\?' in a normal literal is an invalid escape sequence.
        if re.match(r'\?', x):
            continue

        c = len(self._use)  # index of this column among the kept columns
        self._use.append(c0)
        self.name.append(x)

        if re.match(r'[<>\$]', x):
            self.nums[c] = Num()
        else:
            self.syms[c] = Sym()

        if re.match(r'<', x):
            self.w[c] = -1
        elif re.match(r'>', x):
            self.w[c] = 1
        elif re.match(r'!', x):
            self.label_class = c
        else:
            self.indeps.append(c)
def header(self, cells):
    """Register the columns named in a header row.

    Parameters
    ----------
    cells : iterable of str
        Column names, in source order.

    Names starting with ``?`` are skipped. For every other name, ``c`` is
    taken as ``len(self.use)`` and the source index and name are stored
    under ``self.use[c]`` and ``self.name[c]``. A ``Num(0)`` is stored in
    ``self.nums[c]`` when the name starts with ``<``, ``>`` or ``$``,
    otherwise a ``Sym`` is stored in ``self.syms[c]``. ``<`` sets
    ``self.w[c]`` to -1, ``>`` sets it to 1, ``!`` sets ``self.dclass``,
    and any other name has ``c`` appended to ``self.indeps``.

    Returns
    -------
    None
    """
    for i, x in enumerate(cells):
        # Raw string: '\?' in a normal literal is an invalid escape sequence.
        if re.match(r'\?', x):
            continue

        c = len(self.use)
        self.use[c] = i
        self.name[c] = x

        if re.match(r"[<>$]", x):
            self.nums[c] = Num(0)
        else:
            self.syms[c] = Sym()

        # `c` is computed per kept column (the size of `use` before this
        # column is added), so the goal weight below is recorded per
        # column, not once for the whole table.
        if re.match(r"<", x):
            self.w[c] = -1
        elif re.match(r">", x):
            self.w[c] = 1
        elif re.match(r"!", x):
            self.dclass = c
        else:
            self.indeps.append(c)
def argmin(c, lo, hi):
    """Look for the best place to split rows ``lo..hi`` on column ``c``.

    Reads ``rows`` and ``enough`` from the enclosing scope. All values of
    column ``c`` in rows ``lo`` to ``hi`` (inclusive) are first added to
    a right-hand ``Num``; they are then moved one at a time into a
    left-hand ``Num``. Whenever both sides hold at least ``enough``
    values, ``Num.numXpect(left, right) * 1.04`` is compared with the
    best score so far, which starts at the ``sd`` of the full range.

    Returns
    -------
    The row index of the last value placed on the left side for the best
    split found, or ``None`` when the range is not larger than
    ``2 * enough`` or no candidate beats the starting score.
    """
    cut = None
    if not (hi - lo > 2 * enough):
        return cut

    left, right = Num(0), Num(0)

    # Start with every value on the right-hand side.
    for idx in range(lo, hi + 1):
        right.numInc(rows[idx][c])
    best = right.sd

    # Shift values to the left one by one, scoring each candidate split.
    for idx in range(lo, hi + 1):
        value = rows[idx][c]
        left.numInc(value)
        right.numDec(value)
        if left.n < enough or right.n < enough:
            continue
        score = Num.numXpect(left, right) * 1.04
        if score < best:
            cut = idx
            best = score
    return cut
def argmin(self, c, lo, hi):
    """Look for the best place to split rows ``lo..hi`` on column ``c``.

    All values of column ``c`` in ``self.rows[lo]`` to ``self.rows[hi]``
    (inclusive) are first added to a right-hand ``Num``; they are then
    moved one at a time into a left-hand ``Num``. Whenever both sides
    hold at least ``self.enough`` values,
    ``Num.numXpect(left, right) * 1.05`` is compared with the best score
    so far, which starts at the ``sd`` of the full range.

    Returns
    -------
    The row index of the last value placed on the left side for the best
    split found, or ``False`` when the range is not larger than
    ``2 * self.enough`` or no candidate beats the starting score.
    """
    cut = False
    if not (hi - lo > 2 * self.enough):
        return cut

    left, right = Num(), Num()

    # Start with every value on the right-hand side.
    for idx in range(lo, hi + 1):
        right.numInc(self.rows[idx][c])
    best = right.sd

    # Shift values to the left one by one, scoring each candidate split.
    for idx in range(lo, hi + 1):
        value = self.rows[idx][c]
        left.numInc(value)
        right.numDec(value)
        if left.n < self.enough or right.n < self.enough:
            continue
        score = Num.numXpect(left, right) * 1.05  # magic constant
        if score < best:
            cut = idx
            best = score
    return cut
def argmin(c, lo, hi):
    """Look for the best place to split rows ``lo..hi-1`` on column ``c``.

    Reads ``rows`` and ``enough`` from the enclosing scope. The scan
    covers ``range(lo, hi)``, so row ``hi`` itself is not visited. All
    values are first added to a right-hand ``Num`` and then moved one at
    a time into a left-hand ``Num``. Whenever both sides hold at least
    ``enough`` values, ``Num.numXpect(left, right) * 1.05`` is compared
    with the best score so far, which starts at the ``sd`` of the full
    range.

    Returns
    -------
    The row index of the last value placed on the left side for the best
    split found, or ``None`` when ``hi - lo`` is not larger than
    ``2 * enough`` or no candidate beats the starting score.
    """
    cut = None
    if not (hi - lo > 2 * enough):
        return cut

    left = Num()   # values before the candidate cut
    right = Num()  # values after the candidate cut

    # Everything begins on the right, so its sd is the score to beat.
    for idx in range(lo, hi):
        right.numInc(rows[idx][c])
    best = right.sd

    # Move values across one at a time and remember the best split.
    for idx in range(lo, hi):
        value = rows[idx][c]
        left.numInc(value)
        right.numDec(value)
        if left.n < enough or right.n < enough:
            continue
        score = Num.numXpect(left, right) * 1.05
        if score < best:
            cut = idx
            best = score
    return cut
def argmin(c, lo, hi):
    """Look for a split of rows ``lo..hi-1`` that improves both x and y.

    Reads ``rows``, ``goal`` and ``enough`` from the enclosing scope.
    Column ``c`` (x) and column ``goal`` (y) of every row in
    ``range(lo, hi)`` are first added to right-hand ``Num`` objects; this
    happens regardless of the range size because the ``mu`` of the y
    values is always returned. When ``hi - lo > 2 * enough`` the values
    are then moved one at a time to left-hand ``Num`` objects, and a
    candidate is accepted only if both ``xl.numXpect(xr) * 1.05`` and
    ``yl.numXpect(yr) * 1.05`` are below the best x and y scores so far
    (which start at the ``sd`` of the full range).

    Returns
    -------
    (cut, mu)
        ``cut`` is the row index of the accepted split or ``None``;
        ``mu`` is ``yr.mu`` taken after all y values were added.
    """
    cut = None
    xl, yl = Num(), Num()  # left split for both feature and label cols
    xr, yr = Num(), Num()  # right split for both feature and label cols

    # Push everything to the right first.
    for i in range(lo, hi):
        xr.numInc(rows[i][c])
        yr.numInc(rows[i][goal])
    best_x = xr.sd  # all data is on the right, so this is the score to beat
    best_y = yr.sd
    mu = yr.mu

    # Move values to the left one by one and keep track of the best split.
    if hi - lo > 2 * enough:
        for i in range(lo, hi):
            x = rows[i][c]
            y = rows[i][goal]
            xl.numInc(x)
            yl.numInc(y)
            xr.numDec(x)
            yr.numDec(y)
            if xl.n >= enough and xr.n >= enough:
                tmp_x = xl.numXpect(xr) * 1.05
                tmp_y = yl.numXpect(yr) * 1.05
                try:
                    if tmp_x < best_x and tmp_y < best_y:
                        cut, best_x, best_y = i, tmp_x, tmp_y
                except TypeError:
                    # Only un-orderable scores are tolerated here. A bare
                    # `except:` would also hide KeyboardInterrupt and
                    # SystemExit. The diagnostic output is unchanged.
                    print(tmp_x, tmp_y)
    return cut, mu
def computeExpectedValue(self, lo, hi, n, dom_index):
    """Return ``sd * n_local / n`` for the data selected from ``self.rows``.

    A fresh ``Num`` is filled through ``bulkAdd`` with
    ``self.rows[lo:hi + 1][dom_index]`` and its ``sd`` multiplied by its
    own ``n`` and divided by the ``n`` argument.

    NOTE(review): ``dom_index`` is applied to the *slice*, not to each
    row. If ``self.rows`` is a list of rows this selects a single row
    (at offset ``dom_index`` inside the slice) rather than a column; it
    selects a column only for containers such as a DataFrame - confirm
    against the type of ``self.rows``.
    """
    stats = Num()
    window = self.rows[lo:hi + 1]
    selected = window[dom_index]
    stats.bulkAdd(selected)
    weighted = stats.sd * stats.n
    return weighted / n
def argmin(self, c, lo, hi):
    """Look for a split of rows ``lo..hi`` that improves both x and y.

    Column ``c`` (x) and column ``self.goal`` (y) of every row from
    ``lo`` to ``hi`` (inclusive) are first added to right-hand ``Num``
    objects; this happens regardless of the range size because the
    ``mu`` of the y values is always returned. When
    ``hi - lo > 2 * self.enough`` the values are then moved one at a time
    to left-hand ``Num`` objects, and a candidate is accepted only if
    both ``Num.numXpect(...) * 1.05`` scores are below the best x and y
    scores so far (which start at the ``sd`` of the full range).

    Returns
    -------
    (cut, mu)
        ``cut`` is the row index of the accepted split or ``False``;
        ``mu`` is the y ``mu`` taken after all y values were added.
    """
    cut = False
    x_left, x_right = Num(), Num()
    y_left, y_right = Num(), Num()

    # Load the whole range into the right-hand accumulators.
    for idx in range(lo, hi + 1):
        row = self.rows[idx]
        x_right.numInc(row[c])
        y_right.numInc(self.rows[idx][self.goal])
    best_x = x_right.sd
    best_y = y_right.sd
    mu = y_right.mu

    if not (hi - lo > 2 * self.enough):
        return cut, mu

    # Shift values left one at a time and score each candidate split.
    for idx in range(lo, hi + 1):
        x_val = self.rows[idx][c]
        y_val = self.rows[idx][self.goal]
        x_left.numInc(x_val)
        x_right.numDec(x_val)
        y_left.numInc(y_val)
        y_right.numDec(y_val)
        if x_left.n < self.enough or x_right.n < self.enough:
            continue
        score_x = Num.numXpect(x_left, x_right) * 1.05  # magic constant
        score_y = Num.numXpect(y_left, y_right) * 1.05
        if score_x < best_x and score_y < best_y:
            cut = idx
            best_x = score_x
            best_y = score_y
    return cut, mu