def create_inf_func(self, l):
    if self.inf_state is None:
        raise Exception("inf_state is None, can't create inf_func")
    inf_state = self.inf_state

    # best "good" influence among the (value, count) pairs with a nonzero count
    vs = [gv for gv, gc in zip(inf_state[2], inf_state[3]) if gc]
    maxg = max(vs) if vs else 0

    # keep only the (bad value, bad count) pairs that are valid numbers
    bds, bcs = [], []
    for idx in xrange(len(inf_state[0])):
        bd, bc = inf_state[0][idx], inf_state[1][idx]
        if valid_number(bd) and valid_number(bc):
            bds.append(bd)
            bcs.append(bc)

    # influence(c) = l * badness(c) - (1 - l) * best good influence
    f = lambda c: l * compute_bad_score(bds, bcs, c) - (1. - l) * maxg
    return f
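# Usage sketch (hypothetical: `scorer` stands for whichever instance owns
# inf_state, and the 0.5 weight is arbitrary, not a repo default):
def _demo_create_inf_func(scorer):
    # returns (c, influence) pairs at a few tradeoff parameters
    inf_f = scorer.create_inf_func(0.5)
    return [(c, inf_f(c)) for c in (0., 0.25, 0.5)]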
def foo(self, rule, max_card=None):
    for cols in powerset(self.cols):
        if not cols:
            continue
        if len(cols) > self.max_complexity:
            continue
        _logger.debug(str(cols))

        all_clauses = [self.get_all_clauses(col, max_card) for col in cols]
        for clauses in self.dfs(*all_clauses):
            new_rule = SDRule(rule.data, None, clauses, None)

            # periodically checkpoint the best rule per c and check the
            # time budget; n_rules_checked counts down from 100
            self.n_rules_checked -= len(clauses)
            if self.n_rules_checked <= 0:
                diff = time.time() - self.start
                for c in self.cs:
                    # checkpoints_per_c is assumed to map each c to a list of
                    # (elapsed, rule) snapshots, taken at most every 10 seconds
                    if (not self.checkpoints_per_c or
                        not self.checkpoints_per_c[c] or
                        diff - self.checkpoints_per_c[c][-1][0] > 10):
                        bests = self.bests_per_c[c]
                        if bests:
                            best_rule = max(bests, key=lambda r: r.quality)
                            clone = best_rule.clone()
                            clone.quality = best_rule.quality
                            self.checkpoints_per_c[c].append((diff, clone))
                self.stop = diff > self.max_wait
                self.n_rules_checked = 100
                _logger.debug("%.4f\t%d", time.time() - self.start, self.n_rules_checked)

            _logger.debug(str(new_rule))
            if self.stop:
                return

            if max_card is not None and self.max_card_in_conds(clauses) < max_card:
                continue

            influences = self.influences(new_rule, cs=self.cs)
            for c, influence in zip(self.cs, influences):
                clone = new_rule.clone()
                clone.quality = influence
                clone.__examples__ = None
                if not valid_number(clone.quality):
                    continue
                # bounded top-k heap; assumes SDRule clones order by quality
                if len(self.bests_per_c[c]) < self.max_bests:
                    heapq.heappush(self.bests_per_c[c], clone)
                elif clone.quality > self.bests_per_c[c][0].quality:
                    # only evict the current worst rule (the heap root) when
                    # the new clone beats it, so bests_per_c keeps the top
                    # max_bests rules seen so far
                    heapq.heapreplace(self.bests_per_c[c], clone)
            new_rule.__examples__ = None
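# The heap bookkeeping at the end of foo() is the standard bounded top-k
# pattern. A self-contained illustration on plain floats (demo only):
def _demo_push_topk():
    import heapq
    heap, k = [], 3
    for q in [0.3, 0.9, 0.1, 0.7, 0.5]:
        if len(heap) < k:
            heapq.heappush(heap, q)
        elif q > heap[0]:
            # heap[0] is the smallest kept item, so the heap always holds
            # the k largest values seen so far
            heapq.heapreplace(heap, q)
    assert sorted(heap, reverse=True) == [0.9, 0.7, 0.5]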
def filter_bad_clusters(clusters):
    # drop clusters that are None/empty or whose error is not a valid number
    f = lambda c: c and valid_number(c.error)
    return filter(f, clusters)
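# Demo only: a stand-in Cluster with the one attribute filter_bad_clusters
# reads; assumes the repo's valid_number rejects None/NaN/inf.
def _demo_filter_bad_clusters():
    from collections import namedtuple
    Cluster = namedtuple("Cluster", ["error"])
    clusters = [Cluster(0.5), None, Cluster(float('nan')), Cluster(1.2)]
    return filter_bad_clusters(clusters)   # -> [Cluster(0.5), Cluster(1.2)]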
def __call__(self, clusters, **kwargs):
    if not clusters:
        return list(clusters)

    _logger.debug("merging %d clusters", len(clusters))
    self.set_params(**kwargs)
    self.setup_stats(clusters)

    # adj_graph is used to track adjacent partitions
    _logger.debug("building adj graph")
    self.adj_graph = self.make_adjacency(clusters, self.partitions_complete)

    # rtree is static (computed once) to find base partitions within
    # a merged partition
    #_logger.debug("building rtree")
    #self.rtree = self.construct_rtree(clusters)

    # load state from cache
    can_stop, clusters_set, mergable_clusters = self.load_from_cache(clusters)
    if can_stop:
        return sorted(clusters_set, key=lambda c: c.error, reverse=True)

    _logger.debug("start merging!")
    while len(clusters_set) > self.min_clusters:
        cur_clusters = sorted(clusters_set, key=lambda c: c.error, reverse=True)
        _logger.debug("# mergable clusters: %d\tout of\t%d",
                      len(mergable_clusters), len(cur_clusters))
        if not mergable_clusters:
            break

        merged_clusters, new_clusters = set(), set()
        seen = set()
        for cluster in mergable_clusters:
            if (cluster in merged_clusters or
                cluster in new_clusters or
                cluster in seen):
                continue

            # skip clusters already (approximately) contained in another
            canskip = False
            for test in chain(new_clusters, mergable_clusters):
                if test == cluster:
                    continue
                if test.contains(cluster, .01):
                    canskip = True
                    break
            if canskip:
                _logger.debug("skipped\n\t%s\n\t%s", str(cluster), str(test))
                continue

            merged, rms = self.expand(cluster, clusters)
            if (not merged or merged == cluster or
                any(c.contains(merged) for c in cur_clusters)):
                seen.add(cluster)
                continue
            if not valid_number(merged.error):
                continue

            # log both parents' errors and the merged cluster's error
            _logger.debug("%.4f\t%.4f\t-> %.4f",
                          merged.parents[0].error,
                          merged.parents[1].error,
                          merged.error)

            seen.update(merged.parents)
            seen.update(rms)
            if merged not in cur_clusters:
                new_clusters.add(merged)
            merged_clusters.update(rms)

        _logger.debug("merged %d\t%d new clusters\tout of %d",
                      len(merged_clusters), len(new_clusters),
                      len(mergable_clusters))
        if not new_clusters:
            break

        # keep the adjacency graph in sync with the surviving clusters
        for c in merged_clusters:
            self.adj_graph.remove(c)
        for c in new_clusters:
            self.adj_graph.insert(c)

        clusters_set.difference_update(merged_clusters)
        clusters_set.update(new_clusters)
        mergable_clusters = sorted(new_clusters, key=lambda c: c.error, reverse=True)
        # filter() returns a list in Python 2; re-wrap as a set so
        # difference_update/update keep working on the next iteration
        clusters_set = set(filter_bad_clusters(clusters_set))

    self.cache_results(clusters_set, mergable_clusters)
    return sorted(clusters_set, key=lambda c: c.error, reverse=True)
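# Hedged driver sketch: the merger object is callable and forwards kwargs to
# set_params(); min_clusters is assumed to be one of those params. How the
# merger itself is constructed is not shown here.
def _demo_run_merger(merger, clusters):
    top = merger(clusters, min_clusters=5)   # highest-error clusters first
    for c in top[:3]:
        _logger.debug("merged cluster error=%.4f", c.error)
    return top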