Example #1
def _test():
    tbl_loc = explore(dir='../data.dat/Seigmund/', name='Apache')
    tbl = csv2DF(tbl_loc)

    # Define Tree settings
    opt = Thing(min=1,
                maxLvL=10,
                infoPrune=0.5,
                klass=-1,
                prune=False,
                debug=True,
                verbose=True)

    # Build a tree
    tree = dtree(tbl, opt=opt)

    # Show the tree
    if opt.verbose: show(tree)

    # ----- Debug? -----
    if opt.debug: set_trace()
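Every example below relies on a `Thing` record from the host project. Its exact definition is not shown here; a minimal sketch that would be enough to run these snippets, under the assumption that `Thing` is a plain attribute bag, is:

class Thing:
    # Hypothetical stand-in for the project's Thing helper: it simply
    # stores whatever keyword arguments it is given as attributes
    # (opt.min, opt.maxLvL, here.kids, ...). The real class may do more.
    def __init__(self, **fields):
        self.__dict__.update(fields)

    def __repr__(self):
        pairs = ', '.join('%s=%r' % kv for kv in sorted(self.__dict__.items()))
        return 'Thing(%s)' % pairs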
Example #2
def dtree(tbl,
          rows=None,
          lvl=-1,
          asIs=10**32,
          up=None,
          klass=-1,
          branch=[],
          f=None,
          val=None,
          opt=None):
    if not opt:
        opt = Thing(min=1,
                    maxLvL=10,
                    infoPrune=0.5,
                    klass=-1,
                    prune=True,
                    debug=True,
                    verbose=True)

    here = Thing(t=tbl,
                 kids=[],
                 f=f,
                 val=val,
                 up=up,
                 lvl=lvl,
                 rows=rows,
                 modes={},
                 branch=branch)

    features = fWeight(tbl)

    if opt.prune and lvl < 0:
        features = fWeight(tbl)[:int(len(features) * opt.infoPrune)]

    name = features.pop(0)
    remaining = tbl[features + [tbl.columns[opt.klass]]]
    feature = tbl[name].values
    klass = tbl[tbl.columns[opt.klass]].values
    N = len(klass)
    here.score = np.mean(klass)
    splits = discretize(feature, klass)
    LO, HI = min(feature), max(feature)

    def pairs(lst):
        while len(lst) > 1:
            yield (lst.pop(0), lst[0])

    cutoffs = [t for t in pairs(sorted(list(set(splits + [LO, HI]))))]

    if lvl > (opt.maxLvL if opt.prune else int(len(features) * opt.infoPrune)):
        return here
    if asIs == 0:
        return here
    if len(features) < 1:
        return here

    def rows():
        for span in cutoffs:
            new = []
            for f, row in zip(feature, remaining.values.tolist()):
                if span[0] <= f < span[1]:
                    new.append(row)
                elif f == span[1] == HI:
                    new.append(row)
            yield pd.DataFrame(new, columns=remaining.columns), span

    def ent(x):
        C = Counter(x)
        N = len(x)
        return sum([-C[n] / N * np.log(C[n] / N) for n in C.keys()])

    for child, span in rows():
        n = child.shape[0]
        toBe = ent(child[child.columns[opt.klass]])
        if opt.min <= n < N:
            here.kids += [
                dtree(child,
                      lvl=lvl + 1,
                      asIs=toBe,
                      up=here,
                      branch=branch + [(name, span)],
                      f=name,
                      val=span,
                      opt=opt)
            ]

    return here
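The `ent` helper above is plain Shannon entropy with a natural logarithm over the class frequencies. A self-contained sanity check (assuming scipy is available, which the examples themselves do not require):

from collections import Counter

import numpy as np
from scipy.stats import entropy

def ent(x):
    # Same formula as in dtree: -sum(p * ln(p)) over the class frequencies.
    C = Counter(x)
    N = len(x)
    return sum(-C[n] / N * np.log(C[n] / N) for n in C)

labels = ['a', 'a', 'b', 'c', 'c', 'c']
# scipy.stats.entropy normalises raw counts and uses ln by default,
# so the two values should match.
assert np.isclose(ent(labels), entropy(list(Counter(labels).values())))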
Example #3
File: pyC45.py  Project: rahlk/RAAT
def dtree2(tbl, rows=None, lvl=-1, asIs=10**32, up=None, klass=-1, branch=[],
           f=None, val=None, opt=None):
  """
  Discrete independent variables
  """
  if not opt:
      opt = Thing(
         min=1,
         maxLvL=10,
         infoPrune=1,
         klass=-1,
         prune=True,
         debug=True,
         verbose=True)

  here = Thing(t=tbl, kids=[], f=f, val=val, up=up, lvl=lvl
              , rows=rows, modes={}, branch=branch)

  features = fWeight(tbl)

  if opt.prune and lvl<0:
    features = fWeight(tbl)[:int(len(features)*opt.infoPrune)]

  name = features.pop(0)
  remaining = tbl[features+[tbl.columns[opt.klass]]]
  feature = tbl[name].values
  klass = tbl[tbl.columns[opt.klass]].values
  N = len(klass)
  here.score = np.mean(klass)
  splits = discretize(feature, klass, discrete=True)
  LO, HI = min(feature), max(feature)
  def pairs(lst):
    while len(lst)>1:
      yield (lst.pop(0), lst[0])
  cutoffs = [LO, HI]

  # set_trace()
  if lvl>(opt.maxLvL if opt.prune else int(len(features)*opt.infoPrune)):
    return here
  if asIs == 0:
    return here
  if len(features)<1:
    return here

  def rows():
    for span in cutoffs:
      new=[]
      for f, row in zip(feature, remaining.values.tolist()):
        if f==span:
          new.append(row)
      yield pd.DataFrame(new,columns=remaining.columns), span

  def ent(x):
    C = Counter(x)
    N = len(x)
    return sum([-C[n]/N*np.log(C[n]/N) for n in C.keys()])

  for child, span in rows():
    # set_trace()
    n = child.shape[0]
    toBe = ent(child[child.columns[opt.klass]])
    if opt.min<=n<N:
      here.kids += [dtree2(child, lvl=lvl + 1, asIs=toBe, up=here
                          , branch= branch + [(name, span)]
                          , f=name, val=(span, span), opt=opt)]

  return here
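The practical difference between dtree and dtree2 is the row filter: dtree keeps rows whose feature value falls in a half-open numeric span, while dtree2 keeps rows whose value equals a discrete cutoff exactly. A toy illustration of the two filters (the column names here are made up for this sketch):

import pandas as pd

toy = pd.DataFrame({'f': [1, 1, 2, 3, 3],
                    'klass': [0, 0, 1, 1, 1]})

# dtree-style split: rows whose feature lies in the half-open span [lo, hi).
lo, hi = 1, 3
numeric_child = toy[(toy.f >= lo) & (toy.f < hi)]

# dtree2-style split: rows whose feature equals the discrete cutoff.
cutoff = 3
discrete_child = toy[toy.f == cutoff]

print(numeric_child)   # rows with f in {1, 1, 2}
print(discrete_child)  # rows with f == 3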
Example #4
File: pyC45.py  Project: rahlk/MAPGen
def dtree2(tbl,
           rows=None,
           lvl=-1,
           asIs=10**32,
           up=None,
           klass=-5,
           branch=[],
           f=None,
           val=None,
           opt=None,
           encode=True):
    """
    Discrete independent variables
    """
    if not opt:
        opt = Thing(min=1,
                    maxLvL=10,
                    infoPrune=1,
                    klass=-1,
                    prune=True,
                    debug=True,
                    verbose=True)

    features = fWeight(tbl)
    # if encode==True:
    #     encode(tbl, features, opt=opt)
    here = Thing(t=tbl,
                 kids=[],
                 f=f,
                 val=val,
                 up=up,
                 lvl=lvl,
                 rows=rows,
                 modes={},
                 branch=branch)

    if opt.prune and lvl < 0:
        features = fWeight(tbl)[:int(len(features) * opt.infoPrune)]

    name = features.pop(0)
    remaining = tbl[features + [tbl.columns[opt.klass]]]
    feature = tbl[name].values
    klass = tbl[tbl.columns[opt.klass:]].values
    N = len(klass)
    here.score = np.mean(klass, axis=0)
    # splits = discretize(feature, klass, discrete=True)
    LO, HI = min(feature), max(feature)

    def pairs(lst):
        while len(lst) > 1:
            yield (lst.pop(0), lst[0])

    cutoffs = [LO, HI]

    # set_trace()
    if lvl > (opt.maxLvL if opt.prune else int(len(features) * opt.infoPrune)):
        return here
    if asIs < 0.1:
        return here
    if len(features) < 1:
        return here

    def rows():
        for span in cutoffs:
            new = []
            for f, row in zip(feature, remaining.values.tolist()):
                if f == span:
                    new.append(row)
            yield pd.DataFrame(new, columns=remaining.columns), span

    # Shannon entropy of a label column (unused in this variant; children
    # are scored with sdv below).
    ent = lambda x: sum(-c / len(x) * np.log(c / len(x))
                        for c in Counter(x).values())
    # Mean per-column variance of the class block; lower means a purer child.
    sdv = lambda x: np.mean(np.var(x, axis=0))

    for child, span in rows():
        # set_trace()
        n = child.shape[0]
        toBe = sdv(child[child.columns[opt.klass]])
        if opt.min <= n < N:
            here.kids += [
                dtree2(child,
                       lvl=lvl + 1,
                       asIs=toBe,
                       up=here,
                       branch=branch + [(name, span)],
                       f=name,
                       val=(span, span),
                       opt=opt,
                       encode=False)
            ]

    return here
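Unlike the earlier variants, this version scores a candidate child with `sdv`, the mean per-column variance of its class block, rather than with entropy. A minimal, self-contained sketch of that criterion (the objective values are invented for illustration):

import numpy as np

# Two objective columns, five rows; a smaller score means a purer child.
objectives = np.array([[0.1, 1.0],
                       [0.2, 1.1],
                       [0.1, 0.9],
                       [0.3, 1.2],
                       [0.2, 1.0]])

score = np.mean(np.var(objectives, axis=0))
print(score)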
Example #5
    def _tree_builder(self,
                      tbl,
                      rows=None,
                      lvl=-1,
                      asIs=10**32,
                      up=None,
                      klass=-1,
                      branch=[],
                      f=None,
                      val=None,
                      opt=None):

        here = Thing(t=tbl,
                     kids=[],
                     f=f,
                     val=val,
                     up=up,
                     lvl=lvl,
                     rows=rows,
                     modes={},
                     branch=branch)

        features = fWeight(tbl)

        if self.prune and lvl < 0:
            features = fWeight(tbl)[:int(len(features) * self.infoPrune)]

        name = features.pop(0)
        remaining = tbl[features + [tbl.columns[self.klass]]]
        feature = tbl[name].values
        klass = tbl[tbl.columns[self.klass]].values
        N = len(klass)
        here.score = np.mean(klass)
        splits = discretize(feature, klass)
        lo, hi = min(feature), max(feature)

        def _pairs(lst):
            while len(lst) > 1:
                yield (lst.pop(0), lst[0])

        cutoffs = [t for t in _pairs(sorted(list(set(splits + [lo, hi]))))]

        if lvl > (self.max_level if self.prune else int(
                len(features) * self.infoPrune)):
            return here
        if asIs == 0:
            return here
        if len(features) < 1:
            return here

        def _rows():
            for span in cutoffs:
                new = []
                for f, row in zip(feature, remaining.values.tolist()):
                    if span[0] <= f < span[1]:
                        new.append(row)
                    elif f == span[1] == hi:
                        new.append(row)
                yield pd.DataFrame(new, columns=remaining.columns), span

        def _entropy(x):
            C = Counter(x)
            N = len(x)
            return sum([-C[n] / N * np.log(C[n] / N) for n in C.keys()])

        for child, span in _rows():
            n = child.shape[0]
            toBe = _entropy(child[child.columns[self.klass]])
            if self.min <= n < N:
                here.kids += [
                    self._tree_builder(child,
                                       lvl=lvl + 1,
                                       asIs=toBe,
                                       up=here,
                                       branch=branch + [(name, span)],
                                       f=name,
                                       val=span,
                                       opt=opt)
                ]

        return here
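Every node returned by these builders carries `f`, `val`, `lvl`, and `kids`, so the finished tree can be walked with a short recursive visitor. The project's own `show` is not reproduced here; a hedged sketch of what such a printer could look like:

def walk(node, depth=0):
    # Print the split (feature, value) for each node, indented by depth,
    # then recurse into its children.
    if node.f is not None:
        print('|  ' * depth + '%s = %s' % (node.f, node.val))
    for kid in node.kids:
        walk(kid, depth + 1)

# e.g. walk(dtree(tbl))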