def header(header_cells):
  """
  Returns a data object after setting it up using the
  header.

  Column names starting with '?' are skipped. Every other column is
  registered in ``data.use`` (source column index) and ``data.name``
  under the next free slot. Names starting with '<', '>' or '$' get a
  ``Num`` in ``data.nums``; all others get a ``Sym`` in ``data.syms``.
  A leading '<' stores weight -1 in ``data.w``, a leading '>' stores
  weight 1, and a leading '!' records the slot as ``data.class_col``.

  Parameters
  ----------
  header_cells : list of strings, or mapping of column index -> name
    All the column names. A plain sequence is indexed by position;
    an object exposing ``items()`` is iterated as (index, name) pairs.

  Returns
  -------
  Data object
  """
  data = Data()
  # The documented input is a list, which has no .items(); fall back to
  # positional indices so both lists and index->name mappings work.
  if hasattr(header_cells, 'items'):
    indexed_cells = header_cells.items()
  else:
    indexed_cells = enumerate(header_cells)
  for col_index, col_name in indexed_cells:
    if re.match(r'\?', col_name):
      continue  # '?' marks a column to ignore
    # NOTE(review): assumes data.use exposes its current size as `.len`
    # -- confirm against the Data class.
    c = data.use.len
    data.use[c] = col_index
    data.name[c] = col_name
    if re.match(r'[<>\$]', col_name):
      data.nums[c] = Num()
    else:
      data.syms[c] = Sym()
    if re.match(r'<', col_name):
      data.w[c] = -1
    elif re.match(r'>', col_name):
      data.w[c] = 1
    elif re.match(r'!', col_name):
      data.class_col = c
  return data
示例#2
0
文件: data.py 项目: vivek7266/fss18
 def header(self, cells):
     """Register the table's columns from its header row.

     Names starting with '?' are skipped. For every other name the
     source index is appended to ``self._use`` and the name to
     ``self.name``; the position in ``self._use`` is the column's slot.
     Names starting with '<', '>' or '$' get a ``Num`` in ``self.nums``,
     all others a ``Sym`` in ``self.syms``. A leading '<' stores weight
     -1 in ``self.w``, a leading '>' stores weight 1, a leading '!'
     records the slot in ``self.label_class``, and any other column's
     slot is appended to ``self.indeps``.

     Parameters
     ----------
     cells : iterable of str
         Column names in source order.

     Returns
     -------
     None
     """
     for c0, x in enumerate(cells):
         # Raw string: '\?' in a plain literal is an invalid escape.
         if re.match(r'\?', x):
             continue
         c = len(self._use)  # slot this column will occupy
         self._use.append(c0)
         self.name.append(x)
         if re.match(r'[<>\$]', x):
             self.nums[c] = Num()
         else:
             self.syms[c] = Sym()
         if re.match(r'<', x):
             self.w[c] = -1
         elif re.match(r'>', x):
             self.w[c] = 1
         elif re.match(r'!', x):
             self.label_class = c
         else:
             self.indeps.append(c)
     return
示例#3
0
 def header(self, cells):
     """Register the table's columns from its header row.

     Names starting with '?' are skipped. Every other column gets the
     next free slot ``c`` (the current ``len(self.use)``), with its
     source index stored in ``self.use[c]`` and its name in
     ``self.name[c]``. Names starting with '<', '>' or '$' get a
     ``Num(0)`` in ``self.nums``, all others a ``Sym`` in ``self.syms``.
     A leading '<' stores weight -1 in ``self.w``, a leading '>' stores
     weight 1, a leading '!' records the slot in ``self.dclass``, and
     any other column's slot is appended to ``self.indeps``.

     Parameters
     ----------
     cells : iterable of str
         Column names in source order.
     """
     for i, x in enumerate(cells):
         # Raw string: '\?' in a plain literal is an invalid escape.
         if re.match(r'\?', x):
             continue
         # `c` is this column's slot among the kept columns, so the
         # settings below are stored per column, keyed by that slot.
         c = len(self.use)
         self.use[c] = i
         self.name[c] = x
         if re.match("[<>$]", x):
             self.nums[c] = Num(0)
         else:
             self.syms[c] = Sym()
         if re.match("<", x):
             self.w[c] = -1
         elif re.match(">", x):
             self.w[c] = 1
         elif re.match("!", x):
             self.dclass = c
         else:
             self.indeps.append(c)
示例#4
0
 def argmin(c, lo, hi):
     """Find a split index in rows[lo..hi] (inclusive) for column ``c``.

     Returns None when the span is not wider than ``2 * enough`` or when
     no candidate beats the starting score. Otherwise returns the row
     index ``i`` of the best candidate, where rows lo..i have been moved
     into the left accumulator.

     ``rows`` and ``enough`` come from the enclosing scope.
     """
     if not (hi - lo > 2 * enough):
         return None

     left, right = Num(0), Num(0)
     # Start with every value of the span in the right-hand accumulator.
     for idx in range(lo, hi + 1):
         right.numInc(rows[idx][c])

     chosen = None
     lowest = right.sd  # score to beat: sd of the unsplit span
     # Move values to the left one at a time, scoring each split.
     for idx in range(lo, hi + 1):
         value = rows[idx][c]
         left.numInc(value)
         right.numDec(value)
         if not (left.n >= enough and right.n >= enough):
             continue  # one side is still too small
         # 1.04 factor: a split must beat the current best by a margin.
         candidate = Num.numXpect(left, right) * 1.04
         if candidate < lowest:
             chosen, lowest = idx, candidate
     return chosen
 def argmin(self, c, lo, hi):
     """Find a split index in self.rows[lo..hi] (inclusive) for column ``c``.

     Returns False when the span is not wider than ``2 * self.enough``
     or when no candidate beats the starting score. Otherwise returns
     the row index ``i`` of the best candidate, where rows lo..i have
     been moved into the left accumulator.
     """
     if not (hi - lo > 2 * self.enough):
         return False

     left, right = Num(), Num()
     # Start with every value of the span in the right-hand accumulator.
     for idx in range(lo, hi + 1):
         right.numInc(self.rows[idx][c])

     chosen = False
     lowest = right.sd  # score to beat: sd of the unsplit span
     # Move values to the left one at a time, scoring each split.
     for idx in range(lo, hi + 1):
         value = self.rows[idx][c]
         left.numInc(value)
         right.numDec(value)
         if not (left.n >= self.enough and right.n >= self.enough):
             continue  # one side is still too small
         # 1.05 factor: a split must beat the current best by a margin.
         candidate = Num.numXpect(left, right) * 1.05
         if candidate < lowest:
             chosen, lowest = idx, candidate
     return chosen
示例#6
0
文件: data.py 项目: vivek7266/fss18
        def argmin(c, lo, hi):
            """Find a split index in rows[lo:hi] (``hi`` exclusive) for column ``c``.

            Returns None when the span is not wider than ``2 * enough`` or
            when no candidate beats the starting score. Otherwise returns
            the row index ``i`` of the best candidate, where rows lo..i
            have been moved into the left accumulator.

            ``rows`` and ``enough`` come from the enclosing scope.
            """
            if not (hi - lo > 2 * enough):
                return None

            left = Num()
            right = Num()

            # Load the whole span into the right-hand accumulator first.
            for idx in range(lo, hi):
                right.numInc(rows[idx][c])

            chosen = None
            lowest = right.sd  # nothing is split yet, so this is the baseline

            # Shift values leftwards one by one and remember the best split.
            for idx in range(lo, hi):
                value = rows[idx][c]
                left.numInc(value)
                right.numDec(value)
                if not (left.n >= enough and right.n >= enough):
                    continue  # one side is still too small
                candidate = Num.numXpect(left, right) * 1.05
                if candidate < lowest:
                    chosen, lowest = idx, candidate
            return chosen
示例#7
0
文件: data.py 项目: vivek7266/fss18
        def argmin(c, lo, hi):
            """Find a split index in rows[lo:hi] (``hi`` exclusive) for column ``c``.

            A candidate split is accepted only when it improves on the
            current best score for BOTH column ``c`` and the ``goal``
            column. ``rows``, ``goal`` and ``enough`` come from the
            enclosing scope.

            Returns
            -------
            (cut, mu)
                ``cut`` is the row index of the best accepted split, or
                None when the span is not wider than ``2 * enough`` or no
                split was accepted. ``mu`` is ``yr.mu`` read after the
                whole span's goal values were added.
            """
            cut = None
            xl, yl = Num(), Num()  # left split for feature and goal columns
            xr, yr = Num(), Num()  # right split for feature and goal columns

            # Push everything into the right-hand accumulators.
            for i in range(lo, hi):
                xr.numInc(rows[i][c])
                yr.numInc(rows[i][goal])

            # All data is on the right, so its sd is the score to beat.
            best_x = xr.sd
            best_y = yr.sd
            mu = yr.mu

            # Move rows to the left one by one and keep track of the best.
            if hi - lo > 2 * enough:
                for i in range(lo, hi):
                    x = rows[i][c]
                    y = rows[i][goal]
                    xl.numInc(x)
                    yl.numInc(y)
                    xr.numDec(x)
                    yr.numDec(y)
                    if xl.n >= enough and xr.n >= enough:
                        tmp_x = xl.numXpect(xr) * 1.05
                        tmp_y = yl.numXpect(yr) * 1.05
                        try:
                            if tmp_x < best_x and tmp_y < best_y:
                                cut, best_x, best_y = i, tmp_x, tmp_y
                        except TypeError:
                            # Scores that cannot be ordered are reported
                            # and skipped. Only TypeError is caught, so
                            # KeyboardInterrupt/SystemExit still propagate.
                            print(tmp_x, tmp_y)
            return cut, mu
 def computeExpectedValue(self, lo, hi, n, dom_index):
     """Return ``sd * count / n`` for column ``dom_index`` of rows lo..hi.

     Parameters
     ----------
     lo, hi : int
         Inclusive row range within ``self.rows``.
     n : number
         Divisor applied to the result.
     dom_index : int
         Column whose values are summarised.

     Returns
     -------
     ``x.sd * x.n / n`` where ``x`` is a ``Num`` fed the selected values.
     """
     x = Num()
     # Take column `dom_index` from every row in the span. Indexing the
     # slice directly (rows[lo:hi + 1][dom_index]) would select a single
     # row instead of a column.
     # NOTE(review): assumes Num.bulkAdd accepts a list of values -- confirm.
     x.bulkAdd([row[dom_index] for row in self.rows[lo:hi + 1]])
     return x.sd * x.n / n
 def argmin(self, c, lo, hi):
     """Find a split index in self.rows[lo..hi] (inclusive) for column ``c``.

     A candidate split is accepted only when it improves on the current
     best score for BOTH column ``c`` and the ``self.goal`` column.

     Returns
     -------
     (cut, mu)
         ``cut`` is the row index of the best accepted split, or False
         when the span is not wider than ``2 * self.enough`` or no split
         was accepted. ``mu`` is the goal accumulator's ``mu`` read
         after the whole span was added.
     """
     left_x, right_x = Num(), Num()
     left_y, right_y = Num(), Num()

     # Load the whole span into the right-hand accumulators.
     for idx in range(lo, hi + 1):
         right_x.numInc(self.rows[idx][c])
         right_y.numInc(self.rows[idx][self.goal])

     lowest_x = right_x.sd  # baselines: sd of the unsplit span
     lowest_y = right_y.sd
     mu = right_y.mu
     chosen = False

     if not (hi - lo > 2 * self.enough):
         return chosen, mu

     # Shift rows leftwards one at a time, scoring each split.
     for idx in range(lo, hi + 1):
         feature = self.rows[idx][c]
         target = self.rows[idx][self.goal]
         left_x.numInc(feature)
         right_x.numDec(feature)
         left_y.numInc(target)
         right_y.numDec(target)
         if not (left_x.n >= self.enough and right_x.n >= self.enough):
             continue  # one side is still too small
         # 1.05 factor: a split must beat the current best by a margin.
         score_x = Num.numXpect(left_x, right_x) * 1.05
         score_y = Num.numXpect(left_y, right_y) * 1.05
         if score_x < lowest_x and score_y < lowest_y:
             chosen, lowest_x, lowest_y = idx, score_x, score_y
     return chosen, mu