def join_enum(node_i, prev_lvl, complete_x, loss, x_size, y_test, errors, debug, alpha, w, loss_type, b_update,
              cur_lvl, all_nodes, top_k, cur_lvl_nodes):
    """Join ``prev_lvl[node_i]`` with the other nodes of ``prev_lvl`` to build next-level candidates.

    A pair is combined only when ``slice_name_nonsense`` does not reject it and the
    partner's ``key[0]`` is strictly greater than the anchor's ``key[0]``.

    Args:
        node_i: Index into ``prev_lvl`` of the anchor node.
        prev_lvl: Indexable collection of nodes from the previous level.
        complete_x, y_test, errors: Forwarded unchanged to the ``Node`` constructor.
        loss: Forwarded to the ``Node`` constructor and to ``opt_fun``.
        x_size: Forwarded to ``Node``, ``opt_fun``, ``check_bounds`` and ``check_constraint``.
        debug: When truthy, ``print_debug`` is called for every newly registered node.
        alpha: Forwarded to ``check_bounds`` / ``check_constraint``.
        w: Forwarded to the bound calculations and to ``opt_fun``.
        loss_type: Forwarded to ``process_slice``.
        b_update: When truthy, bounds are recomputed for a candidate whose name
            is already present in ``all_nodes``.
        cur_lvl: Level index forwarded to ``slice_name_nonsense`` and the bound calculations.
        all_nodes: Mapping from node name (``key[1]``) to node; new candidates are added in place.
        top_k: Top-k container; may be updated through ``add_new_top_slice``.
        cur_lvl_nodes: List that receives the surviving candidates (appended in place).

    Returns:
        The tuple ``(cur_lvl_nodes, all_nodes)`` - the same objects that were passed in.
    """
    # Positions are used on purpose (instead of iterating prev_lvl directly) so the
    # number of iterations is fixed up front and any indexable container keeps working.
    for node_j in range(len(prev_lvl)):
        anchor = prev_lvl[node_i]
        partner = prev_lvl[node_j]
        flag = slice_name_nonsense(anchor, partner, cur_lvl)
        if flag or not partner.key[0] > anchor.key[0]:
            continue

        new_node = Node(complete_x, loss, x_size, y_test, errors)
        parents_set = set(new_node.parents)
        parents_set.add(anchor)
        parents_set.add(partner)
        new_node.parents = list(parents_set)
        new_node.attributes = union(anchor.attributes, partner.attributes)
        new_node.name = new_node.make_name()
        new_node.key = new_node.make_key(len(all_nodes))
        node_name = new_node.key[1]

        if node_name in all_nodes:
            existing_item = all_nodes[node_name]
            # De-duplicate the parents but keep them a list: every node is created with a
            # list of parents, so storing a set here would make the attribute's type depend
            # on whether the candidate was seen twice.
            existing_item.parents = list(set(existing_item.parents))
            if b_update:
                # NOTE(review): the refreshed bounds are written to the temporary new_node,
                # which is not stored anywhere in this branch - confirm this is intended.
                s_upper = new_node.calc_s_upper(cur_lvl)
                s_lower = new_node.calc_s_lower(cur_lvl)
                e_upper = new_node.calc_e_upper()
                e_max_upper = new_node.calc_e_max_upper(cur_lvl)
                new_node.update_bounds(s_upper, s_lower, e_upper, e_max_upper, w)
            continue

        # First time this candidate is seen: compute bounds and register it.
        new_node.calc_bounds(cur_lvl, w)
        all_nodes[node_name] = new_node
        # Concrete data is extracted only for candidates whose bounds pass check_bounds.
        if new_node.check_bounds(top_k, x_size, alpha):
            new_node.process_slice(loss_type)
            new_node.score = opt_fun(new_node.loss, new_node.size, loss, x_size, w)
            # The candidate moves on to the next level either as a new top-k slice
            # or because its bounds still pass after the real score is known.
            if new_node.check_constraint(top_k, x_size, alpha) and new_node.key not in top_k.keys:
                cur_lvl_nodes.append(new_node)
                top_k.add_new_top_slice(new_node)
            elif new_node.check_bounds(top_k, x_size, alpha):
                cur_lvl_nodes.append(new_node)
        elif new_node.check_bounds(top_k, x_size, alpha):
            # NOTE(review): this repeats the check that has just failed; it is kept as-is
            # because check_bounds is not visible here and may not be side-effect free.
            cur_lvl_nodes.append(new_node)
        if debug:
            new_node.print_debug(top_k, cur_lvl)
    return cur_lvl_nodes, all_nodes
def test_non_nonsense(self):
    """Expect ``slice_name_nonsense`` to return True for a joined node and a hand-built node.

    The joined node ``x0_3 && x2_3`` comes from ``slicer.join_enum``; the other
    node is assembled manually from the first-level nodes ``x1_2`` and ``x2_2``.
    """
    row_count = len(self.complete_x)

    # Hand-built second-level node (created before the join, as in the original flow).
    manual_node = Node(self.complete_x, self.loss, row_count, self.y_test, self.errors)
    manual_node.parents = [
        self.first_level_nodes[(4, 'x1_2')],
        self.first_level_nodes[(7, 'x2_2')],
    ]
    manual_node.attributes = [('x1_2', 4), ('x2_2', 7)]

    joined = slicer.join_enum(
        (2, 'x0_3'),             # slice index of the anchor node
        self.first_level_nodes,
        self.complete_x,
        self.loss,
        row_count,
        self.y_test,
        self.errors,
        self.debug,
        self.alpha,
        self.w,
        self.loss_type,
        True,                    # b_update
        1,                       # cur_lvl
        {},                      # all_nodes
        self.top_k,
        {},                      # cur_lvl_nodes
    )
    joined_node = joined[0]['x0_3 && x2_3']
    manual_node.key = (8, 'x1_2 && x2_2')

    self.assertEqual(True, slicer.slice_name_nonsense(joined_node, manual_node, 2))
    print("check4")
def test_uppers(self):
    """Check the upper bounds computed for a node whose parents come from ``join_enum``.

    The node built from ``x0_3 && x1_3`` and ``x0_3 && x2_3`` is expected to get
    ``s_upper == 25`` and ``int(c_upper) == 398`` after ``calc_bounds(2, w)``.
    """
    row_count = len(self.complete_x)

    # This node is not used by the assertions below; it is still built so that the
    # constructor call and the first-level lookups happen exactly as before.
    spare_node = Node(self.complete_x, self.loss, row_count, self.y_test, self.errors)
    spare_node.parents = [
        self.first_level_nodes[(4, 'x1_2')],
        self.first_level_nodes[(7, 'x2_2')],
    ]
    spare_node.attributes = [('x1_2', 4), ('x2_2', 7)]

    joined = slicer.join_enum(
        (2, 'x0_3'),             # slice index of the anchor node
        self.first_level_nodes,
        self.complete_x,
        self.loss,
        row_count,
        self.y_test,
        self.errors,
        self.debug,
        self.alpha,
        self.w,
        self.loss_type,
        True,                    # b_update
        1,                       # cur_lvl
        {},                      # all_nodes
        self.top_k,
        {},                      # cur_lvl_nodes
    )
    level_nodes = joined[0]
    first_parent = level_nodes['x0_3 && x1_3']
    second_parent = level_nodes['x0_3 && x2_3']

    child = Node(self.complete_x, self.loss, row_count, self.y_test, self.errors)
    child.parents = [first_parent, second_parent]
    child.calc_bounds(2, self.w)

    self.assertEqual(25, child.s_upper)
    print("check5")
    self.assertEqual(398, int(child.c_upper))
    print("check6")
def test_nonsense(self):
    """Expect ``slice_name_nonsense`` to return True for ``x0_3 && x1_3`` and ``x0_3 && x2_2``.

    Both parents are taken from the result of ``slicer.join_enum``; a child node
    is also assembled from them (attributes and name) before the check.
    """
    row_count = len(self.complete_x)
    joined = slicer.join_enum(
        (2, 'x0_3'),             # slice index of the anchor node
        self.first_level_nodes,
        self.complete_x,
        self.loss,
        row_count,
        self.y_test,
        self.errors,
        self.debug,
        self.alpha,
        self.w,
        self.loss_type,
        True,                    # b_update
        1,                       # cur_lvl
        {},                      # all_nodes
        self.top_k,
        {},                      # cur_lvl_nodes
    )
    level_nodes = joined[0]
    first_parent = level_nodes['x0_3 && x1_3']
    second_parent = level_nodes['x0_3 && x2_2']

    child = Node(self.complete_x, self.loss, row_count, self.y_test, self.errors)
    child.parents = [first_parent, second_parent]
    child.attributes = slicer.union(first_parent.attributes, second_parent.attributes)
    child.name = child.make_name()

    is_nonsense = slicer.slice_name_nonsense(first_parent, second_parent, 2)
    self.assertEqual(True, is_nonsense)
    print("check3")
def process(all_features, complete_x, loss, x_size, y_test, errors, debug, alpha, k, w, loss_type, b_update):
    """Enumerate slice candidates level by level and print the selected top-k slices.

    The first level comes from ``make_first_level``. Every further level is built by
    combining nodes of two earlier levels whose indices add up to ``cur_lvl - 1``.
    Enumeration stops as soon as a level yields no surviving candidate.

    Args:
        all_features: Feature collection; passed to ``make_first_level`` and its
            length is reported as the number of level-1 candidates.
        complete_x, y_test, errors: Forwarded to ``make_first_level`` and ``Node``.
        loss: Forwarded to ``make_first_level``, ``Node`` and ``opt_fun``.
        x_size: Forwarded to ``make_first_level``, ``Node``, ``opt_fun`` and the
            bound / constraint checks.
        debug: When truthy, ``print_debug`` is called for each newly registered node.
        alpha: Forwarded to ``check_bounds`` / ``check_constraint``.
        k: Passed to the ``Topk`` constructor.
        w: Forwarded to the bound calculations and to ``opt_fun``.
        loss_type: Forwarded to ``make_first_level`` and ``process_slice``.
        b_update: When truthy, bounds are recomputed for a candidate whose name
            is already registered.

    Returns:
        None. Progress and the final top-k are written to stdout.
    """
    top_k = Topk(k)
    # First level slices are enumerated in the "classic way" (getting data, not analysing bounds).
    levels = []
    first_level = make_first_level(all_features, complete_x, loss, x_size, y_test, errors, loss_type, w, alpha,
                                   top_k)
    # Each entry of levels is (nodes of that level, number of candidates counted for it).
    levels.append((first_level[0], len(all_features)))
    all_nodes = first_level[1]
    # cur_lvl - index of the level that is filled next; correlates with the number of slice forming features.
    cur_lvl = 1
    # first_level is the (nodes, all_nodes) pair indexed above, so its length is non-zero
    # and the loop below is always entered once.
    cur_lvl_nodes = first_level
    # currently for debug
    print("Level 1 had " + str(len(all_features)) + " candidates")
    print()
    print("Current topk are: ")
    top_k.print_topk()
    # DPSize-style enumeration: combine nodes of previous levels and update bounds for those that already exist.
    while len(cur_lvl_nodes) > 0:
        cur_lvl_nodes = []
        count = 0
        # NOTE(review): for even cur_lvl this range visits both the (left, right) and the
        # (right, left) split; repeated candidates are caught by the all_nodes lookup but
        # still contribute to count - confirm this is intended.
        for left in range(cur_lvl // 2 + 1):
            right = cur_lvl - 1 - left
            left_nodes = levels[left][0]
            right_nodes = levels[right][0]
            for node_i in range(len(left_nodes)):
                for node_j in range(len(right_nodes)):
                    left_parent = left_nodes[node_i]
                    right_parent = right_nodes[node_j]
                    if check_attributes(left_parent, right_parent):
                        continue

                    new_node = Node(complete_x, loss, x_size, y_test, errors)
                    parents_set = set(new_node.parents)
                    parents_set.add(left_parent)
                    parents_set.add(right_parent)
                    new_node.parents = list(parents_set)
                    new_node.attributes = union(left_parent.attributes, right_parent.attributes)
                    new_node.name = new_node.make_name()
                    new_node.key = new_node.make_key(len(all_nodes))
                    node_name = new_node.key[1]

                    if node_name in all_nodes:
                        existing_item = all_nodes[node_name]
                        # De-duplicate the parents but keep them a list: every node is created
                        # with a list of parents, so storing a set here would make the
                        # attribute's type depend on whether the candidate was seen twice.
                        existing_item.parents = list(set(existing_item.parents))
                        if b_update:
                            # NOTE(review): the refreshed bounds are written to the temporary
                            # new_node, which is not stored in this branch - confirm intended.
                            s_upper = new_node.calc_s_upper(cur_lvl)
                            s_lower = new_node.calc_s_lower(cur_lvl)
                            e_upper = new_node.calc_e_upper()
                            e_max_upper = new_node.calc_e_max_upper(cur_lvl)
                            new_node.update_bounds(s_upper, s_lower, e_upper, e_max_upper, w)
                        continue

                    # First time this candidate is seen: compute bounds and register it.
                    new_node.calc_bounds(cur_lvl, w)
                    all_nodes[node_name] = new_node
                    # Concrete data is extracted only for candidates whose bounds pass check_bounds.
                    if new_node.check_bounds(top_k, x_size, alpha):
                        new_node.process_slice(loss_type)
                        new_node.score = opt_fun(new_node.loss, new_node.size, loss, x_size, w)
                        # A candidate that satisfies the constraint and is not yet in the
                        # top-k is recorded there; every processed candidate is kept for
                        # the next level.
                        if new_node.check_constraint(top_k, x_size, alpha) and new_node.key not in top_k.keys:
                            top_k.add_new_top_slice(new_node)
                        cur_lvl_nodes.append(new_node)
                    if debug:
                        new_node.print_debug(top_k, cur_lvl)
            # Candidate count for this split is the product of both levels' recorded counts.
            count = count + levels[left][1] * levels[right][1]
        print("Level " + str(cur_lvl) + " had " + str(count) + " candidates but after pruning only " +
              str(len(cur_lvl_nodes)) + " go to the next level")
        cur_lvl = cur_lvl + 1
        levels.append((cur_lvl_nodes, count))
        top_k.print_topk()
    print("Program stopped at level " + str(cur_lvl))
    print()
    print("Selected slices are: ")
    top_k.print_topk()