Example #1
0
def dtree(tbl,
          rows=None,
          lvl=-1,
          asIs=10**32,
          up=None,
          klass=-1,
          branch=None,
          f=None,
          val=None,
          opt=None):
    """Recursively build a decision tree over continuous features.

    Parameters
    ----------
    tbl : pandas.DataFrame
        Data table; the class column is selected via ``opt.klass``.
    rows : list, optional
        Row payload stored on the node (not used for splitting here).
    lvl : int
        Current tree depth; the root call uses -1.
    asIs : float
        Entropy of the class labels in the parent split; 0 stops recursion.
    up : Thing, optional
        Parent node.
    klass : int
        Kept for interface compatibility; the class column index actually
        used is ``opt.klass`` (this name is rebound locally below).
    branch : list, optional
        (feature, span) pairs on the path from the root to this node.
    f : str, optional
        Name of the feature this node was split on.
    val : tuple, optional
        (lo, hi) range of ``f`` covered by this node.
    opt : Thing, optional
        Tree-building options; a default configuration is created if absent.

    Returns
    -------
    Thing
        Root node of the constructed (sub)tree.
    """
    # Fresh list per call: a mutable default argument (branch=[]) is shared
    # across invocations and would accumulate paths between separate trees.
    if branch is None:
        branch = []
    if not opt:
        opt = Thing(min=1,
                    maxLvL=10,
                    infoPrune=0.5,
                    klass=-1,
                    prune=True,
                    debug=True,
                    verbose=True)

    here = Thing(t=tbl,
                 kids=[],
                 f=f,
                 val=val,
                 up=up,
                 lvl=lvl,
                 rows=rows,
                 modes={},
                 branch=branch)

    features = fWeight(tbl)

    if opt.prune and lvl < 0:
        # Keep only the most informative fraction; reuse the ranking already
        # computed instead of calling fWeight(tbl) a second time.
        features = features[:int(len(features) * opt.infoPrune)]

    name = features.pop(0)
    remaining = tbl[features + [tbl.columns[opt.klass]]]
    feature = tbl[name].values
    klass = tbl[tbl.columns[opt.klass]].values
    N = len(klass)
    here.score = np.mean(klass)
    splits = discretize(feature, klass)
    LO, HI = min(feature), max(feature)

    def pairs(lst):
        # Yield consecutive (low, high) pairs from a sorted list.
        while len(lst) > 1:
            yield (lst.pop(0), lst[0])

    cutoffs = list(pairs(sorted(set(splits + [LO, HI]))))

    # Stopping criteria: too deep, parent split already pure, or no
    # features left to split on.
    if lvl > (opt.maxLvL if opt.prune else int(len(features) * opt.infoPrune)):
        return here
    if asIs == 0:
        return here
    if len(features) < 1:
        return here

    def subtables():
        # Partition the remaining columns by each [low, high) span; the
        # span ending at the global maximum also includes rows equal to it.
        # (Named subtables, not rows, to avoid shadowing the parameter.)
        for span in cutoffs:
            new = []
            for v, row in zip(feature, remaining.values.tolist()):
                if span[0] <= v < span[1]:
                    new.append(row)
                elif v == span[1] == HI:
                    new.append(row)
            yield pd.DataFrame(new, columns=remaining.columns), span

    def ent(x):
        # Shannon entropy (natural log) of the labels in x.
        C = Counter(x)
        n = len(x)
        return sum(-C[k] / n * np.log(C[k] / n) for k in C)

    for child, span in subtables():
        n = child.shape[0]
        toBe = ent(child[child.columns[opt.klass]])
        # Recurse only on splits that shrink the data but keep >= opt.min rows.
        if opt.min <= n < N:
            here.kids += [
                dtree(child,
                      lvl=lvl + 1,
                      asIs=toBe,
                      up=here,
                      branch=branch + [(name, span)],
                      f=name,
                      val=span,
                      opt=opt)
            ]

    return here
Example #2
0
def dtree2(tbl, rows=None, lvl=-1, asIs=10 ** 32, up=None, klass=-1, branch=None,
           f=None, val=None, opt=None):
    """Recursively build a decision tree over discrete independent variables.

    Parameters
    ----------
    tbl : pandas.DataFrame
        Data table; the class column is selected via ``opt.klass``.
    rows : list, optional
        Row payload stored on the node (not used for splitting here).
    lvl : int
        Current tree depth; the root call uses -1.
    asIs : float
        Entropy of the class labels in the parent split; 0 stops recursion.
    up : Thing, optional
        Parent node.
    klass : int
        Kept for interface compatibility; the class column index actually
        used is ``opt.klass`` (this name is rebound locally below).
    branch : list, optional
        (feature, value) pairs on the path from the root to this node.
    f : str, optional
        Name of the feature this node was split on.
    val : tuple, optional
        (value, value) pair of the discrete value covered by this node.
    opt : Thing, optional
        Tree-building options; a default configuration is created if absent.

    Returns
    -------
    Thing
        Root node of the constructed (sub)tree.
    """
    # Fresh list per call: a mutable default argument (branch=[]) is shared
    # across invocations and would accumulate paths between separate trees.
    if branch is None:
        branch = []
    if not opt:
        opt = Thing(min=1,
                    maxLvL=10,
                    infoPrune=1,
                    klass=-1,
                    prune=True,
                    debug=True,
                    verbose=True)

    here = Thing(t=tbl, kids=[], f=f, val=val, up=up, lvl=lvl,
                 rows=rows, modes={}, branch=branch)

    features = fWeight(tbl)

    if opt.prune and lvl < 0:
        # Keep only the most informative fraction; reuse the ranking already
        # computed instead of calling fWeight(tbl) a second time.
        features = features[:int(len(features) * opt.infoPrune)]

    name = features.pop(0)
    remaining = tbl[features + [tbl.columns[opt.klass]]]
    feature = tbl[name].values
    klass = tbl[tbl.columns[opt.klass]].values
    N = len(klass)
    here.score = np.mean(klass)
    splits = discretize(feature, klass, discrete=True)
    LO, HI = min(feature), max(feature)
    # Discrete features: split on the two extreme values only.
    # (The unused inner `pairs` generator from the continuous variant
    # has been removed -- it was dead code here.)
    cutoffs = [LO, HI]

    # Stopping criteria: too deep, parent split already pure, or no
    # features left to split on.
    if lvl > (opt.maxLvL if opt.prune else int(len(features) * opt.infoPrune)):
        return here
    if asIs == 0:
        return here
    if len(features) < 1:
        return here

    def subtables():
        # Group rows by exact equality with each cutoff value.
        # (Named subtables, not rows, to avoid shadowing the parameter.)
        for span in cutoffs:
            new = []
            for v, row in zip(feature, remaining.values.tolist()):
                if v == span:
                    new.append(row)
            yield pd.DataFrame(new, columns=remaining.columns), span

    def ent(x):
        # Shannon entropy (natural log) of the labels in x.
        C = Counter(x)
        n = len(x)
        return sum(-C[k] / n * np.log(C[k] / n) for k in C)

    for child, span in subtables():
        n = child.shape[0]
        toBe = ent(child[child.columns[opt.klass]])
        # Recurse only on splits that shrink the data but keep >= opt.min rows.
        if opt.min <= n < N:
            here.kids += [dtree2(child, lvl=lvl + 1, asIs=toBe, up=here,
                                 branch=branch + [(name, span)],
                                 f=name, val=(span, span), opt=opt)]

    return here
Example #3
0
    def _tree_builder(self, dframe, lvl=-1, as_is=float("inf"),
                      parent=None, branch=None, f=None, val=None):
        """
        Construct decision tree

        Parameters
        ----------
        dframe: <pandas.core.Frame.DataFrame>
            Raw data as a dataframe
        lvl: int (default -1)
            Level of the tree
        as_is: float (default "inf")
            Entropy of the class variable in the current rows
        parent: Thing (default None)
            Parent Node
        branch: List[Thing] (default None -> [])
            Parent nodes visited to reach current node
        f: str (default None)
            Name of the attribute represented by the current node
        val: Tuple(low, high)
            The minimum and maximum range of the attribute in the current node

        Returns
        -------
        Thing:
            The root node of the tree

        Notes
        -----
        + Thing is a generic container, in this case it's a node in the tree.
        + You'll find it in <src.tools.containers>
        """
        # Fresh list per call: a mutable default argument (branch=[]) is
        # shared across invocations and would accumulate state between trees.
        if branch is None:
            branch = []

        current = Thing(t=dframe, kids=[], f=f, val=val,
                        parent=parent, lvl=lvl, branch=branch)

        features = fWeight(dframe)

        if self.prune and lvl < 0:
            # Keep only the most informative fraction; reuse the ranking
            # already computed instead of calling fWeight a second time.
            features = features[:int(len(features) * self.info_prune)]

        name = features.pop(0)
        remaining = dframe[features + [dframe.columns[self.klass]]]
        feature = dframe[name].values
        dependent_var = dframe[dframe.columns[self.klass]].values
        N = len(dependent_var)
        current.score = np.mean(dependent_var)
        splits = discretize(feature, dependent_var)
        low = min(feature)
        high = max(feature)
        cutoffs = list(self.pairs(sorted(set(splits + [low, high]))))

        # Stopping criteria: too deep, current rows already pure, or no
        # features left to split on.
        if lvl > (self.max_levels if self.prune else int(
                len(features) * self.info_prune)):
            return current
        if as_is == 0:
            return current
        if len(features) < 1:
            return current

        def _rows():
            # Partition remaining columns by each [low, high) span; the span
            # ending at the global maximum also includes rows equal to it.
            for span in cutoffs:
                new = []
                for v, row in zip(feature, remaining.values.tolist()):
                    if span[0] <= v < span[1]:
                        new.append(row)
                    elif v == span[1] == high:
                        new.append(row)
                yield pd.DataFrame(new, columns=remaining.columns), span

        for child, span in _rows():
            n = child.shape[0]
            to_be = self._entropy(child[child.columns[self.klass]])
            # Recurse only on splits that shrink the data but keep enough rows.
            if self.min_levels <= n < N:
                current.kids += [
                    self._tree_builder(child, lvl=lvl + 1, as_is=to_be,
                                       parent=current, branch=branch +
                                       [(name, span)],
                                       f=name, val=span)]

        return current
Example #4
0
    def _tree_builder(self,
                      tbl,
                      rows=None,
                      lvl=-1,
                      asIs=10**32,
                      up=None,
                      klass=-1,
                      branch=None,
                      f=None,
                      val=None,
                      opt=None):
        """Recursively construct a decision tree over continuous features.

        Parameters
        ----------
        tbl : pandas.DataFrame
            Data table; the class column is selected via ``self.klass``.
        rows : list, optional
            Row payload stored on the node (not used for splitting here).
        lvl : int
            Current tree depth; the root call uses -1.
        asIs : float
            Entropy of the class labels in the parent split; 0 stops
            recursion.
        up : Thing, optional
            Parent node.
        klass : int
            Kept for interface compatibility; the class column index actually
            used is ``self.klass`` (this name is rebound locally below).
        branch : list, optional
            (feature, span) pairs on the path from the root to this node.
        f : str, optional
            Name of the feature this node was split on.
        val : tuple, optional
            (lo, hi) range of ``f`` covered by this node.
        opt : Thing, optional
            Passed through unchanged to recursive calls.

        Returns
        -------
        Thing
            The root node of the constructed (sub)tree.
        """
        # Fresh list per call: a mutable default argument (branch=[]) is
        # shared across invocations and would accumulate state between trees.
        if branch is None:
            branch = []

        here = Thing(t=tbl,
                     kids=[],
                     f=f,
                     val=val,
                     up=up,
                     lvl=lvl,
                     rows=rows,
                     modes={},
                     branch=branch)

        features = fWeight(tbl)

        if self.prune and lvl < 0:
            # Keep only the most informative fraction; reuse the ranking
            # already computed instead of calling fWeight(tbl) a second time.
            features = features[:int(len(features) * self.infoPrune)]

        name = features.pop(0)
        remaining = tbl[features + [tbl.columns[self.klass]]]
        feature = tbl[name].values
        klass = tbl[tbl.columns[self.klass]].values
        N = len(klass)
        here.score = np.mean(klass)
        splits = discretize(feature, klass)
        lo, hi = min(feature), max(feature)

        def _pairs(lst):
            # Yield consecutive (low, high) pairs from a sorted list.
            while len(lst) > 1:
                yield (lst.pop(0), lst[0])

        cutoffs = list(_pairs(sorted(set(splits + [lo, hi]))))

        # Stopping criteria: too deep, parent split already pure, or no
        # features left to split on.
        if lvl > (self.max_level if self.prune else int(
                len(features) * self.infoPrune)):
            return here
        if asIs == 0:
            return here
        if len(features) < 1:
            return here

        def _rows():
            # Partition remaining columns by each [low, high) span; the span
            # ending at the global maximum also includes rows equal to it.
            for span in cutoffs:
                new = []
                for v, row in zip(feature, remaining.values.tolist()):
                    if span[0] <= v < span[1]:
                        new.append(row)
                    elif v == span[1] == hi:
                        new.append(row)
                yield pd.DataFrame(new, columns=remaining.columns), span

        def _entropy(x):
            # Shannon entropy (natural log) of the labels in x.
            C = Counter(x)
            n = len(x)
            return sum(-C[k] / n * np.log(C[k] / n) for k in C)

        for child, span in _rows():
            n = child.shape[0]
            toBe = _entropy(child[child.columns[self.klass]])
            # Recurse only on splits that shrink the data but keep >=
            # self.min rows.
            if self.min <= n < N:
                here.kids += [
                    self._tree_builder(child,
                                       lvl=lvl + 1,
                                       asIs=toBe,
                                       up=here,
                                       branch=branch + [(name, span)],
                                       f=name,
                                       val=span,
                                       opt=opt)
                ]

        return here