예제 #1
0
    def create_inf_func(self, l):
      """Build a scoring function of ``c`` for the weight ``l``.

      ``self.inf_state`` is indexed here as a 4-item sequence: items 0 and 1
      are walked pairwise and the pairs in which both members pass
      ``valid_number`` are forwarded to ``compute_bad_score``; items 2 and 3
      are walked pairwise to find the largest item-2 value whose item-3
      partner is truthy (``maxg``, 0 when there is none).

      Args:
        l: weight applied to the bad score; ``1 - l`` is applied to ``maxg``.

      Returns:
        A one-argument callable ``f(c)`` that evaluates
        ``l * compute_bad_score(bds, bcs, c) - (1 - l) * maxg``.

      Raises:
        Exception: if ``self.inf_state`` is None.
      """
      inf_state = self.inf_state
      if inf_state is None:
        raise Exception("inf_state is None, cant' create inf_func")

      flagged = [gv for gv, gc in zip(inf_state[2], inf_state[3]) if gc]
      maxg = max(flagged) if flagged else 0

      # zip() replaces the xrange(len(...)) index loop so the method runs on
      # both Python 2 and Python 3.
      # NOTE(review): zip stops at the shorter sequence, whereas the index
      # loop raised IndexError when item 1 was shorter than item 0 -- assumes
      # the two are always the same length; confirm against the producer.
      bds, bcs = [], []
      for bd, bc in zip(inf_state[0], inf_state[1]):
        if valid_number(bd) and valid_number(bc):
          bds.append(bd)
          bcs.append(bc)

      def inf_func(c):
        return l * compute_bad_score(bds, bcs, c) - (1. - l) * maxg

      return inf_func
예제 #2
0
    def foo(self, rule, max_card=None):
        """Enumerate clause combinations and keep the best rules per ``c``.

        For every non-empty subset of ``self.cols`` with at most
        ``self.max_complexity`` columns, the clauses returned by
        ``self.get_all_clauses`` for each column are combined through
        ``self.dfs``.  Each combination becomes an ``SDRule`` built on
        ``rule.data``, is scored with ``self.influences`` once per value in
        ``self.cs``, and a clone carrying that score is kept in the bounded
        heap ``self.bests_per_c[c]`` (at most ``self.max_bests`` entries).

        ``self.n_rules_checked`` is used as a countdown: it is decremented by
        the number of clauses of each combination and, when it reaches zero
        or below, checkpoints are recorded, ``self.stop`` is refreshed from
        the elapsed time versus ``self.max_wait``, and the countdown is reset
        to 100.

        Args:
            rule: rule whose ``data`` attribute seeds every new ``SDRule``.
            max_card: optional threshold; when given, it is passed to
                ``get_all_clauses`` and combinations for which
                ``self.max_card_in_conds(clauses)`` is below it are skipped.

        Returns:
            None.  The method returns early as soon as ``self.stop`` is set;
            results are left in ``self.bests_per_c`` and
            ``self.checkpoints_per_c``.
        """
        for cols in powerset(self.cols):
            if not cols or len(cols) > self.max_complexity:
                continue

            _logger.debug(str(cols))
            all_clauses = [self.get_all_clauses(col, max_card) for col in cols]
            for clauses in self.dfs(*all_clauses):
                new_rule = SDRule(rule.data, None, clauses, None)

                self.n_rules_checked -= len(clauses)
                if self.n_rules_checked <= 0:
                    diff = time.time() - self.start
                    for c in self.cs:
                        # Record a checkpoint when there is none yet for this
                        # c, or the latest one is more than 10 time units
                        # (time.time() deltas) older than now.
                        if (not self.checkpoints_per_c or
                                not self.checkpoints_per_c[c] or
                                diff - self.checkpoints_per_c[c][-1][0] > 10):
                            bests = self.bests_per_c[c]
                            if bests:
                                best_rule = max(bests, key=lambda r: r.quality)
                                checkpoint = best_rule.clone()
                                # quality is re-assigned explicitly after
                                # cloning, as the original code did.
                                checkpoint.quality = best_rule.quality
                                self.checkpoints_per_c[c].append(
                                    (diff, checkpoint))
                    self.stop = diff > self.max_wait
                    self.n_rules_checked = 100

                    _logger.debug("%.4f\t%d",
                                  time.time() - self.start,
                                  self.n_rules_checked)
                    _logger.debug(str(new_rule))

                if self.stop:
                    return

                if (max_card is not None and
                        self.max_card_in_conds(clauses) < max_card):
                    continue

                influences = self.influences(new_rule, cs=self.cs)
                for c, influence in zip(self.cs, influences):
                    clone = new_rule.clone()
                    clone.quality = influence
                    clone.__examples__ = None

                    if not valid_number(clone.quality):
                        continue

                    bests = self.bests_per_c[c]
                    if len(bests) < self.max_bests:
                        heapq.heappush(bests, clone)
                    else:
                        # heappushpop drops the smallest of (heap + clone).
                        # heapreplace would always evict the current minimum,
                        # even when the incoming clone is worse than it.
                        # NOTE(review): assumes rules order by quality --
                        # confirm against SDRule's comparison methods.
                        heapq.heappushpop(bests, clone)
                new_rule.__examples__ = None
예제 #3
0
def filter_bad_clusters(clusters):
  """Return the usable clusters as a list.

  A cluster is kept when it is truthy and its ``error`` attribute passes
  ``valid_number``.  A concrete list is returned (instead of the result of
  ``filter``) so callers can iterate it more than once on Python 3, where
  ``filter`` yields a one-shot iterator.

  Args:
    clusters: iterable of cluster objects (falsy entries are dropped).

  Returns:
    A new list with the retained clusters, in iteration order.
  """
  return [c for c in clusters if c and valid_number(c.error)]
예제 #4
0
    def __call__(self, clusters, **kwargs):
        """Repeatedly merge clusters; return them sorted by error, descending.

        After applying ``kwargs`` through ``self.set_params`` and preparing
        ``self.setup_stats`` / ``self.adj_graph``, the state returned by
        ``self.load_from_cache`` is used as the starting point.  While more
        than ``self.min_clusters`` clusters remain, every mergable cluster is
        expanded with ``self.expand``; accepted merges replace the clusters
        they absorbed, and only the newly created clusters are candidates in
        the next round.  The loop ends when nothing is mergable or a round
        produces no new cluster.

        Args:
            clusters: collection of cluster objects (must support ``len``).
                An empty collection or None yields an empty list.
            **kwargs: forwarded unchanged to ``self.set_params``.

        Returns:
            A list of clusters sorted by ``error`` in descending order.
            Unless the cached state allowed an early return, the list is
            first passed through ``filter_bad_clusters`` and handed to
            ``self.cache_results``.
        """
        if not clusters:
            # Tolerate None as well as empty collections.
            return list(clusters) if clusters is not None else []

        def by_error_desc(items):
            return sorted(items, key=lambda c: c.error, reverse=True)

        _logger.debug("merging %d clusters", len(clusters))

        self.set_params(**kwargs)
        self.setup_stats(clusters)

        # adj_graph is used to track adjacent partitions
        _logger.debug("building adj graph")
        self.adj_graph = self.make_adjacency(clusters, self.partitions_complete)

        # rtree is static (computed once) to find base partitions within
        # a merged partition
        #_logger.debug("building rtree")
        #self.rtree = self.construct_rtree(clusters)

        # load state from cache
        can_stop, clusters_set, mergable_clusters = self.load_from_cache(clusters)
        if can_stop:
            return by_error_desc(clusters_set)

        _logger.debug("start merging!")
        while len(clusters_set) > self.min_clusters:
            cur_clusters = by_error_desc(clusters_set)

            _logger.debug("# mergable clusters: %d\tout of\t%d",
                          len(mergable_clusters),
                          len(cur_clusters))
            if not mergable_clusters:
                break

            merged_clusters, new_clusters = set(), set()
            seen = set()

            for cluster in mergable_clusters:
                if (cluster in merged_clusters or
                        cluster in new_clusters or cluster in seen):
                    continue

                # Skip a cluster that another candidate (or a cluster created
                # in this round) already contains.
                # NOTE(review): the .01 passed to contains() is presumably a
                # tolerance -- confirm against the cluster class.
                container = None
                for test in chain(new_clusters, mergable_clusters):
                    if test == cluster:
                        continue
                    if test.contains(cluster, .01):
                        container = test
                        break
                if container is not None:
                    _logger.debug("skipped\n\t%s\n\t%s",
                                  str(cluster), str(container))
                    continue

                merged, rms = self.expand(cluster, clusters)
                # any() replaces len(filter(...)): it works on Python 3 and
                # stops at the first containing cluster.
                if (not merged or merged == cluster or
                        any(c.contains(merged) for c in cur_clusters)):
                    seen.add(cluster)
                    continue
                if not valid_number(merged.error):
                    continue

                # NOTE(review): parents[0] is logged twice; the second value
                # was probably meant to be another parent -- left unchanged
                # because the number of parents is not visible here.
                _logger.debug("%.4f\t%.4f\t-> %.4f",
                              merged.parents[0].error,
                              merged.parents[0].error,
                              merged.error)
                seen.update(merged.parents)
                seen.update(rms)

                if merged not in cur_clusters:
                    new_clusters.add(merged)
                merged_clusters.update(rms)

            _logger.debug("merged %d\t%d new clusters\tout of %d",
                          len(merged_clusters),
                          len(new_clusters),
                          len(mergable_clusters))

            if not new_clusters:
                break

            # Explicit loops: map() is lazy on Python 3, so using it only for
            # side effects would silently skip the graph updates.
            for absorbed in merged_clusters:
                self.adj_graph.remove(absorbed)
            for created in new_clusters:
                self.adj_graph.insert(created)
            clusters_set.difference_update(merged_clusters)
            clusters_set.update(new_clusters)
            mergable_clusters = by_error_desc(new_clusters)

        # Materialise the filtered result: it is consumed twice below.
        clusters_set = list(filter_bad_clusters(clusters_set))
        self.cache_results(clusters_set, mergable_clusters)
        return by_error_desc(clusters_set)