def make_first_level(all_features, complete_x, loss, x_size, y_test, errors, loss_type, top_k, alpha, w):
    """Build and evaluate one single-feature slice node per entry of ``all_features``.

    First-level slices are enumerated in the "classic way": every node is
    materialised with ``process_slice`` and scored directly, without any
    bound analysis.

    :param all_features: iterable of feature identifiers; each one becomes a node.
    :param complete_x, loss, x_size, y_test, errors: passed through unchanged to ``Node``.
    :param loss_type: forwarded to ``Node.process_slice``.
    :param top_k: collection that receives nodes passing ``check_constraint``.
    :param alpha: forwarded to ``Node.check_constraint``.
    :param w: weight forwarded to ``opt_fun``.
    :return: ``(first_level, all_nodes)`` - the list of created nodes in
        feature order and a dict mapping each ``node.key`` to its node.
    """
    all_nodes = {}
    first_level = []
    for position, feature in enumerate(all_features):
        node = Node(complete_x, loss, x_size, y_test, errors)
        node.parents = [(feature, position)]
        node.attributes.append((feature, position))
        node.name = node.make_name()
        # The numeric part of the key is the count of nodes registered so far.
        node.key = node.make_key(len(all_nodes))
        all_nodes[node.key] = node
        node.process_slice(loss_type)
        node.score = opt_fun(node.loss, node.size, loss, x_size, w)
        # The score is concrete here, so it also serves as the node's c_upper.
        node.c_upper = node.score
        first_level.append(node)
        node.print_debug(top_k, 0)
        # Constraints for 1st level nodes to be problematic candidates;
        # add_new_top_slice updates the top-k slices if needed.
        if node.check_constraint(top_k, x_size, alpha):
            top_k.add_new_top_slice(node)
    return first_level, all_nodes
def make_first_level(all_features, complete_x, loss, x_size, y_test, errors, loss_type, w, alpha, top_k):
    """Create, evaluate and register the single-feature (first level) slices.

    Every feature yields one ``Node`` whose data is extracted immediately via
    ``process_slice``. Because the metrics are concrete at this level, the
    node's bounds are filled in straight from the measured values.

    :param all_features: iterable of feature identifiers; each one becomes a node.
    :param complete_x, loss, x_size, y_test, errors: passed through unchanged to ``Node``.
    :param loss_type: forwarded to ``Node.process_slice``.
    :param w: weight forwarded to ``opt_fun``.
    :param alpha: a node qualifies for ``top_k`` only if its size is at least ``x_size / alpha``.
    :param top_k: collection that receives qualifying nodes via ``add_new_top_slice``.
    :return: ``(first_level, all_nodes)`` - the list of created nodes in
        feature order and a dict mapping each ``node.key`` to its node.
    """
    first_level = []
    all_nodes = {}
    for index, feature in enumerate(all_features):
        node = Node(complete_x, loss, x_size, y_test, errors)
        node.parents = [(feature, index)]
        node.attributes.append((feature, index))
        node.name = node.make_name()
        node.key = node.make_key(len(all_nodes))
        all_nodes[node.key] = node
        node.process_slice(loss_type)

        # For first level nodes all bounds are strict, taken from concrete metrics.
        node.s_upper = node.size
        node.s_lower = 0
        node.e_upper = node.loss
        node.e_max_upper = node.e_max

        node.score = opt_fun(node.loss, node.size, loss, x_size, w)
        node.c_upper = node.score
        first_level.append(node)
        node.print_debug(top_k, 0)

        # Constraints for 1st level nodes to be problematic candidates.
        scores_high_enough = node.score > 1
        if scores_high_enough and node.size >= x_size / alpha:
            # This method updates the top k slices if needed.
            top_k.add_new_top_slice(node)
    return first_level, all_nodes
def join_enum(node_i, prev_lvl, complete_x, loss, x_size, y_test, errors, debug, alpha, w, loss_type, b_update, cur_lvl,
              all_nodes, top_k, cur_lvl_nodes):
    """Join ``prev_lvl[node_i]`` with the other nodes of the previous level.

    A pair is combined only when ``slice_name_nonsense`` returns a falsy value
    for it and the partner's ``key[0]`` is greater than that of
    ``prev_lvl[node_i]``. Each combination gets a fresh ``Node`` whose
    attributes are the ``union`` of both parents' attributes.

    :param node_i: index into ``prev_lvl`` of the node being extended.
    :param prev_lvl: indexable collection of previous-level nodes.
    :param complete_x, loss, x_size, y_test, errors: passed through unchanged to ``Node``.
    :param debug: when truthy, ``print_debug`` is called for newly registered nodes.
    :param alpha: forwarded to ``check_bounds`` / ``check_constraint``.
    :param w: weight forwarded to ``opt_fun`` and the bound calculations.
    :param loss_type: forwarded to ``Node.process_slice``.
    :param b_update: when truthy, bounds are recomputed for combinations whose
        name is already registered in ``all_nodes``.
    :param cur_lvl: current level index, forwarded to the bound calculations.
    :param all_nodes: dict of known nodes keyed by ``key[1]``; mutated in place.
    :param top_k: collection of best slices; may be updated.
    :param cur_lvl_nodes: list collecting nodes for the current level; mutated in place.
    :return: ``(cur_lvl_nodes, all_nodes)`` - the same objects that were passed in.
    """
    for node_j in range(len(prev_lvl)):
        base = prev_lvl[node_i]
        partner = prev_lvl[node_j]
        if slice_name_nonsense(base, partner, cur_lvl):
            continue
        # Only join with partners that come later, judged by the first key component.
        if not partner.key[0] > base.key[0]:
            continue

        candidate = Node(complete_x, loss, x_size, y_test, errors)
        lineage = set(candidate.parents)
        lineage.add(base)
        lineage.add(partner)
        candidate.parents = list(lineage)
        candidate.attributes = union(base.attributes, partner.attributes)
        candidate.name = candidate.make_name()
        candidate.key = candidate.make_key(len(all_nodes))

        if candidate.key[1] in all_nodes:
            # The combination is already known: only its bounds may be refreshed.
            known = all_nodes[candidate.key[1]]
            known.parents = set(known.parents)
            if b_update:
                # NOTE(review): the bounds are updated on the freshly built node,
                # not on the registered one - confirm this is intended.
                candidate.update_bounds(
                    candidate.calc_s_upper(cur_lvl),
                    candidate.calc_s_lower(cur_lvl),
                    candidate.calc_e_upper(),
                    candidate.calc_e_max_upper(cur_lvl),
                    w,
                )
            continue

        candidate.calc_bounds(cur_lvl, w)
        all_nodes[candidate.key[1]] = candidate
        # Concrete data is extracted only when the bounds check passes (score
        # upper bound and subset size are big enough).
        if candidate.check_bounds(top_k, x_size, alpha):
            candidate.process_slice(loss_type)
            candidate.score = opt_fun(candidate.loss, candidate.size, loss, x_size, w)
            # Decide, based on the score, whether the node joins the current
            # level (to form new combinations on the next one) and the top-k.
            if candidate.check_constraint(top_k, x_size, alpha) and candidate.key not in top_k.keys:
                cur_lvl_nodes.append(candidate)
                top_k.add_new_top_slice(candidate)
            elif candidate.check_bounds(top_k, x_size, alpha):
                cur_lvl_nodes.append(candidate)
        elif candidate.check_bounds(top_k, x_size, alpha):
            # NOTE(review): repeats the check that has just failed; kept as in the original.
            cur_lvl_nodes.append(candidate)
        if debug:
            candidate.print_debug(top_k, cur_lvl)
    return cur_lvl_nodes, all_nodes
def test_non_nonsense(self):
    """Check ``slice_name_nonsense`` for a joined slice against a hand-built one.

    A node over ``x1_2`` and ``x2_2`` is assembled manually, a second node is
    taken from the result of ``slicer.join_enum``, and the pair is expected to
    be reported as nonsense at level 2.
    """
    # Hand-built second-level node over x1_2 and x2_2.
    parent3 = Node(self.complete_x, self.loss, len(self.complete_x), self.y_test, self.errors)
    parent3.parents = [self.first_level_nodes[(4, 'x1_2')], self.first_level_nodes[(7, 'x2_2')]]
    parent3.attributes = [('x1_2', 4), ('x2_2', 7)]
    parent3.key = (8, 'x1_2 && x2_2')

    # Inputs for the enumeration step.
    slice_index = (2, 'x0_3')
    cur_lvl = 1
    b_update = True
    all_nodes = {}
    cur_lvl_nodes = {}
    combined = slicer.join_enum(
        slice_index, self.first_level_nodes, self.complete_x, self.loss, len(self.complete_x),
        self.y_test, self.errors, self.debug, self.alpha, self.w, self.loss_type,
        b_update, cur_lvl, all_nodes, self.top_k, cur_lvl_nodes)
    parent2 = combined[0]['x0_3 && x2_3']

    self.assertEqual(True, slicer.slice_name_nonsense(parent2, parent3, 2))
    print("check4")
def process(all_features, complete_x, loss, x_size, y_test, errors, debug, alpha, k, w, loss_type, b_update):
    """Enumerate slices level by level and print the selected top-k slices.

    The first level is built by ``make_first_level``. Every further level is
    formed by combining nodes of two earlier levels (``left`` and ``right``,
    with ``left + right == cur_lvl - 1``), skipping pairs for which
    ``check_attributes`` returns a truthy value. Enumeration stops as soon as
    a level produces no nodes.

    :param all_features: iterable of feature identifiers for the first level.
    :param complete_x, loss, x_size, y_test, errors: passed through unchanged to ``Node``.
    :param debug: when truthy, ``print_debug`` is called for newly registered nodes.
    :param alpha: forwarded to ``check_bounds`` / ``check_constraint``.
    :param k: forwarded to ``Topk``.
    :param w: weight forwarded to ``opt_fun`` and the bound calculations.
    :param loss_type: forwarded to ``Node.process_slice``.
    :param b_update: when truthy, bounds are recomputed for combinations whose
        name is already registered.
    :return: ``None``; progress and results are printed.
    """
    top_k = Topk(k)
    # First level slices are enumerated in a "classic way" (getting data and
    # not analyzing bounds).
    first_level_nodes, all_nodes = make_first_level(all_features, complete_x, loss, x_size, y_test, errors,
                                                    loss_type, w, alpha, top_k)
    # Each entry of ``levels`` is (nodes of that level, number of candidates),
    # so that the second level can be enumerated in the same way as the others.
    levels = [(first_level_nodes, len(all_features))]
    # cur_lvl - index of current level, correlates with number of slice forming features
    cur_lvl = 1
    # Seed the loop guard with the node list itself. Binding the whole
    # (nodes, all_nodes) tuple here would make the guard always true on entry.
    cur_lvl_nodes = first_level_nodes
    # currently for debug
    print("Level 1 had " + str(len(all_features)) + " candidates")
    print()
    print("Current topk are: ")
    top_k.print_topk()
    # DPSize algorithm approach of previous levels nodes combinations and
    # updating bounds for those that already exist.
    while len(cur_lvl_nodes) > 0:
        cur_lvl_nodes = []
        count = 0
        for left in range(int(cur_lvl / 2) + 1):
            right = cur_lvl - 1 - left
            for left_node in levels[left][0]:
                for right_node in levels[right][0]:
                    if check_attributes(left_node, right_node):
                        continue
                    new_node = Node(complete_x, loss, x_size, y_test, errors)
                    parents_set = set(new_node.parents)
                    parents_set.add(left_node)
                    parents_set.add(right_node)
                    new_node.parents = list(parents_set)
                    new_node.attributes = union(left_node.attributes, right_node.attributes)
                    new_node.name = new_node.make_name()
                    new_node.key = new_node.make_key(len(all_nodes))
                    if new_node.key[1] in all_nodes:
                        # Combination already known: only its bounds may be refreshed.
                        existing_item = all_nodes[new_node.key[1]]
                        existing_item.parents = set(existing_item.parents)
                        if b_update:
                            # NOTE(review): the bounds are updated on the freshly built
                            # node, not on the registered one - confirm this is intended.
                            s_upper = new_node.calc_s_upper(cur_lvl)
                            s_lower = new_node.calc_s_lower(cur_lvl)
                            e_upper = new_node.calc_e_upper()
                            e_max_upper = new_node.calc_e_max_upper(cur_lvl)
                            new_node.update_bounds(s_upper, s_lower, e_upper, e_max_upper, w)
                        continue
                    new_node.calc_bounds(cur_lvl, w)
                    all_nodes[new_node.key[1]] = new_node
                    # Concrete data is extracted only when the bounds check passes
                    # (score upper bound and subset size are big enough).
                    if new_node.check_bounds(top_k, x_size, alpha):
                        new_node.process_slice(loss_type)
                        new_node.score = opt_fun(new_node.loss, new_node.size, loss, x_size, w)
                        # Based on its score the node may enter the top-k; it joins the
                        # current level in order to form new combinations on the next one.
                        if new_node.check_constraint(top_k, x_size, alpha) and new_node.key not in top_k.keys:
                            top_k.add_new_top_slice(new_node)
                        cur_lvl_nodes.append(new_node)
                    if debug:
                        new_node.print_debug(top_k, cur_lvl)
            count = count + levels[left][1] * levels[right][1]
        print("Level " + str(cur_lvl) + " had " + str(count) + " candidates but after pruning only " +
              str(len(cur_lvl_nodes)) + " go to the next level")
        cur_lvl = cur_lvl + 1
        levels.append((cur_lvl_nodes, count))
        top_k.print_topk()
    print("Program stopped at level " + str(cur_lvl))
    print()
    print("Selected slices are: ")
    top_k.print_topk()