def discrete_vals_iter(self, table):
    """
    Yield one work unit per combination of discrete attribute values.

    @param table: Orange data table to partition on its discrete attributes
    @return generator of (partition_keys, cols, continuous_table) tuples:
            partition_keys maps each discrete attribute name to the list of
            value indexes selected for this partition (inputs for bottomup),
            cols is self.cols minus the removed discrete column names, and
            continuous_table is the partition stripped of discrete columns.
            partition_keys values are lists of indexes that will be attached
            to the resulting clusters.
    """
    # Collect (name, values) for every discrete attribute in the domain.
    discrete_vals = []
    for attr in table.domain:
        if attr.var_type == Orange.feature.Type.Discrete:
            discrete_vals.append((attr.name, attr.values))

    # Cardinality of each discrete attribute drives the combination iterator.
    cards = [len(vals) for name, vals in discrete_vals]
    if not cards:
        # No discrete attributes: nothing to partition on.
        return

    for bits in bitarray_iterator(cards):
        partition = self.get_partition(bits, discrete_vals, table)
        if not len(partition):
            # This value combination selects no rows; skip it.
            continue

        # Downstream clustering only handles continuous data, so strip the
        # discrete columns the caller asked about (self.cols).
        rmcols = [attr.name for attr in partition.domain
                  if attr.var_type == Orange.feature.Type.Discrete
                  and attr.name in self.cols]
        cols = [col for col in self.cols if col not in rmcols]
        continuous_table = rm_attr_from_domain(partition, rmcols)

        partition_keys = {}
        for bit, (name, vals) in zip(bits, discrete_vals):
            partition_keys[name] = [bit]
        yield partition_keys, cols, continuous_table
def __call__(self, table, **kwargs):
    """
    Partition table by its discrete attributes, run bottom-up clustering on
    each partition (in parallel when self.parallelize is set), accumulate
    per-partition timing stats, and merge the partial clusterings.

    Falls back to the plain bottom-up algorithm when the table has no
    discrete partitions. Returns the final cluster list.
    """
    # Set up the error function on a copy of the table with the discrete
    # columns removed — the error function only sees continuous data.
    rmcols = [attr.name for attr in table.domain
              if attr.var_type == Orange.feature.Type.Discrete
              and attr.name in self.cols]
    thin_table = rm_attr_from_domain(table, rmcols)
    self.params['err_func'].setup(thin_table)

    bottomup_func = DiscreteBottomUpF(self.params)
    if self.parallelize:
        start = time.time()
        pool = Pool(self.nprocesses)
        results = pool.map(bottomup_func, self.discrete_vals_iter(table))
        pool.close()
        self.merge_cost += time.time() - start
    else:
        results = [bottomup_func(args)
                   for args in self.discrete_vals_iter(table)]

    if not results:
        # No discrete partitions at all: defer to the base implementation.
        return super(DiscreteBottomUp, self).__call__(table)

    # Unpack once: each result is (stats_tuple, clusters).
    all_stats, clusters_list = zip(*results)
    kd_cost, sample_cost, initclusters_cost, merge_cost = zip(*all_stats)
    self.kd_cost += sum(kd_cost)
    self.sample_cost += sum(sample_cost)
    self.initclusters_cost += sum(initclusters_cost)
    self.merge_cost += sum(merge_cost)

    # Flatten the per-partition cluster lists.
    self.all_clusters = []
    for clusters in clusters_list:
        self.all_clusters.extend(clusters)

    BottomUp.setup(self, table)
    thresh = compute_clusters_threshold(self.all_clusters)
    final_clusters = self.normalize_results(
        self.all_clusters,
        is_mergable=lambda c: c.error >= thresh)
    self.final_clusters = final_clusters
    return final_clusters
def discrete_vals_iter(self, table):
    """
    Yield (partition_keys, cols, continuous_table) for each rule's partition.

    Each rule in self.rules selects a subset of table; the subset is stripped
    of the discrete columns named in self.cols, and partition_keys maps every
    discrete attribute mentioned in the rule's conditions to the integer
    value indexes it matches.
    """
    domain = table.domain
    for rule in self.rules:
        subset = rule.filter_table(table)
        if not len(subset):
            # Rule matched no rows — nothing to yield for it.
            continue

        discrete_names = [a.name for a in domain
                          if (a.var_type == Orange.feature.Type.Discrete
                              and a.name in self.cols)]
        remaining_cols = [c for c in self.cols if c not in discrete_names]
        continuous_table = rm_attr_from_domain(subset, discrete_names)

        partition_keys = {}
        for cond in rule.filter.conditions:
            attr = domain[cond.position]
            if attr.var_type == Orange.feature.Type.Discrete:
                partition_keys[attr.name] = map(int, cond.values)

        yield partition_keys, remaining_cols, continuous_table
def f(bad_tables, aggerr, klass, params, kwargs, queue):
    """
    Worker entry point: run the hybrid clusterer klass over bad_tables and
    report the result through queue.

    Puts (rules, cost, ncalls) on queue on success, or None on failure so
    the parent process can detect the error. All exceptions are caught and
    printed here because this runs in a child process.
    """
    try:
        cols = valid_table_cols(bad_tables[0], aggerr.agg.cols, kwargs)
        all_cols = cols + aggerr.agg.cols
        # Drop every attribute that is neither a candidate column nor an
        # aggregate column, so the clusterer only sees relevant data.
        torm = [attr.name for attr in bad_tables[0].domain
                if attr.name not in all_cols]
        bad_tables = [rm_attr_from_domain(t, torm) for t in bad_tables]
        good_tables = []
        _, full_table = reconcile_tables(bad_tables)

        start = time.time()
        hybrid = klass(**params)
        clusters = hybrid(full_table, bad_tables, good_tables)
        normalize_cluster_errors(clusters)
        rules = clusters_to_rules(clusters, full_table)
        cost = time.time() - start
        ncalls = 0

        queue.put((rules, cost, ncalls))
    except Exception:
        # Catch only real errors (not KeyboardInterrupt/SystemExit, which a
        # bare except would swallow) and signal failure with the None sentinel.
        traceback.print_exc()
        queue.put(None)
def serial_hybrid(obj, aggerr, **kwargs): costs = {} db = connect(obj.dbname) obj.db = db start = time.time() all_keys = list(chain(aggerr.keys, obj.goodkeys[aggerr.agg.shortname])) all_tables = get_provenance_split(obj, aggerr.agg.cols, all_keys) bad_tables = all_tables[:len(aggerr.keys)] good_tables = all_tables[len(aggerr.keys):] costs['data_load'] = time.time() - start _logger.debug("bad table counts: %s" % ', '.join(map(str, map(len, bad_tables)))) _logger.debug("good table counts: %s" % ', '.join(map(str, map(len, good_tables)))) print "agg error %s \t %s" % (aggerr.agg, aggerr.errtype) cost, ncalls = 0, 0 rules = [] try: full_start = time.time() start = time.time() cols = valid_table_cols(bad_tables[0], aggerr.agg.cols, kwargs) all_cols = cols + aggerr.agg.cols torm = [attr.name for attr in bad_tables[0].domain if attr.name not in all_cols] _logger.debug("valid cols: %s" % cols) bad_tables = [rm_attr_from_domain(t, torm) for t in bad_tables] good_tables = [rm_attr_from_domain(t, torm) for t in good_tables] all_full_table = union_tables(bad_tables, good_tables) full_table = union_tables(bad_tables) costs['data_setup'] = time.time() - start # make sure aggerr keys and tables are consistent one last time if len(bad_tables) != len(aggerr.keys): pdb.set_trace() raise RuntimeError("#badtables (%d) != #aggerr keys (%d)" % (len(bad_tables), len(aggerr.keys))) params = { 'aggerr':aggerr, 'cols':cols, 'c': obj.c, 'aggerr':aggerr, 'cols':cols, 'c': obj.c, 'c_range': [0.05, 1], 'l' : 0.6, 'msethreshold': 0.01, 'max_wait':5, 'DEBUG': False } # msethreshold=0.01, # k=10, # nprocesses=4, # parallelize=True, # complexity_multiplier=1.5} params.update(dict(kwargs)) if aggerr.agg.func.__class__ in (errfunc.SumErrFunc, errfunc.CountErrFunc): klass = MR params.update({ 'use_cache': False, 'use_mtuples': False, 'granularity': 100 }) params['c'] = params.get('c', .15) else: klass = BDT params.update({ 'use_cache': True, 'use_mtuples': False,#True, 'epsilon': 0.0015, 
'min_improvement': 0.01, 'tau': [0.08, 0.5], 'p': 0.7 }) params['c'] = params.get('c', .3) #klass = SVM #params.update({}) _logger.debug("c is set to: %.4f", params['c']) start = time.time() hybrid = klass(**params) clusters = hybrid(all_full_table, bad_tables, good_tables) rules = clusters_to_rules(clusters, full_table) print "nclusters: %d" % len(clusters) costs['rules_get'] = time.time() - start _logger.debug("clustering %d rules" % len(rules)) for r in rules[:10]: _logger.debug("%.4f\t%.4f - %.4f\t%s" % (r.quality, r.c_range[0], r.c_range[1], str(r))) clustered_rules = hybrid.group_rules(rules, 5) rules = clustered_rules costs['rules_cluster'] = time.time() - start ncalls = 0 except: traceback.print_exc() # return the best rules first in the list start = time.time() rules.sort(key=lambda r: r.c_range[0]) rules = [r.simplify(all_full_table) for r in rules[:10]] costs['rules_simplify'] = time.time() - start cost = time.time() - full_start print "found rules" for rule in rules[:5]: print "%.5f\t%s" % (rule.quality, rule) print "=== Costs ===" for key, cost in costs.iteritems(): print "%.5f\t%s" % (cost, key) return cost, ncalls, table, rules