def join_enum(node_i, prev_lvl, complete_x, loss, x_size, y_test, errors, debug, alpha, w, loss_type, b_update,
              cur_lvl, all_nodes, top_k, cur_lvl_nodes):
    """Pair ``prev_lvl[node_i]`` with the other nodes of ``prev_lvl`` to build candidate nodes.

    A pair is skipped when ``slice_name_nonsense`` reports it as invalid or when ``key[0]`` of the
    partner is not greater than ``key[0]`` of the anchor node (presumably so that every unordered
    pair is produced only once -- confirm against ``Node.make_key``).

    For each accepted pair a new ``Node`` is created whose parents are the two joined nodes and
    whose attributes are the ``union`` of the parents' attributes. If its ``key[1]`` is not yet in
    ``all_nodes`` the node is registered there, its bounds are calculated, and it is appended to
    ``cur_lvl_nodes`` when it passes the bound/constraint checks.

    Args:
        node_i: Index into ``prev_lvl`` of the anchor node.
        prev_lvl: Indexable collection of nodes from the previous level.
        complete_x, loss, x_size, y_test, errors: Forwarded unchanged to the ``Node`` constructor;
            ``loss`` and ``x_size`` are also passed to ``opt_fun`` and the bound checks.
        debug: When truthy, ``print_debug`` is called on every newly registered node.
        alpha: Forwarded to ``check_bounds`` / ``check_constraint``.
        w: Forwarded to the bound computations and to ``opt_fun``.
        loss_type: Forwarded to ``Node.process_slice``.
        b_update: When truthy, bounds are recomputed for a combination whose key already exists.
        cur_lvl: Level index forwarded to the name check and the bound computations.
        all_nodes: Mapping from ``key[1]`` to node; extended in place.
        top_k: Top-k holder; extended in place via ``add_new_top_slice``.
        cur_lvl_nodes: List of nodes kept for the next level; extended in place.

    Returns:
        The tuple ``(cur_lvl_nodes, all_nodes)`` -- the same objects that were passed in.
    """
    # The index loop is kept on purpose: the range is fixed up front, so the iteration count
    # cannot change even if a caller hands in the same list as prev_lvl and cur_lvl_nodes.
    for node_j in range(len(prev_lvl)):
        left_parent = prev_lvl[node_i]
        right_parent = prev_lvl[node_j]
        if slice_name_nonsense(left_parent, right_parent, cur_lvl):
            continue
        if not (right_parent.key[0] > left_parent.key[0]):
            continue

        new_node = Node(complete_x, loss, x_size, y_test, errors)
        # Going through a set drops duplicates among the parents.
        parents_set = set(new_node.parents)
        parents_set.add(left_parent)
        parents_set.add(right_parent)
        new_node.parents = list(parents_set)
        new_node.attributes = union(left_parent.attributes, right_parent.attributes)
        new_node.name = new_node.make_name()
        new_node.key = new_node.make_key(len(all_nodes))
        node_key = new_node.key[1]

        if node_key in all_nodes:
            existing_item = all_nodes[node_key]
            # NOTE(review): this only converts the existing parents to a set; the parents of
            # new_node are not merged in. Behaviour kept as-is -- confirm the intent.
            existing_item.parents = set(existing_item.parents)
            if b_update:
                # NOTE(review): the refreshed bounds are stored on new_node, which is not kept
                # anywhere after this iteration; existing_item may have been intended.
                s_upper = new_node.calc_s_upper(cur_lvl)
                s_lower = new_node.calc_s_lower(cur_lvl)
                e_upper = new_node.calc_e_upper()
                e_max_upper = new_node.calc_e_max_upper(cur_lvl)
                new_node.update_bounds(s_upper, s_lower, e_upper, e_max_upper, w)
            continue

        new_node.calc_bounds(cur_lvl, w)
        all_nodes[node_key] = new_node
        # check if concrete data should be extracted or not (only for those that have score upper
        # bound big enough and if size of subset is big enough)
        to_slice = new_node.check_bounds(top_k, x_size, alpha)
        if to_slice:
            new_node.process_slice(loss_type)
            new_node.score = opt_fun(new_node.loss, new_node.size, loss, x_size, w)
            # decide, based on the score, whether the node joins the current level nodes (and so
            # takes part in the combinations of the next level)
            if new_node.check_constraint(top_k, x_size, alpha) and new_node.key not in top_k.keys:
                cur_lvl_nodes.append(new_node)
                top_k.add_new_top_slice(new_node)
            elif new_node.check_bounds(top_k, x_size, alpha):
                cur_lvl_nodes.append(new_node)
        elif new_node.check_bounds(top_k, x_size, alpha):
            # NOTE(review): check_bounds has just returned a falsy value for this node, so this
            # re-check only matters if check_bounds is not deterministic. Kept for parity.
            cur_lvl_nodes.append(new_node)
        if debug:
            new_node.print_debug(top_k, cur_lvl)
    return cur_lvl_nodes, all_nodes
def process(all_features, complete_x, loss, x_size, y_test, errors, debug, alpha, k, w, loss_type, b_update):
    """Enumerate slice candidates level by level and print the resulting top-k slices.

    The first level is produced by ``make_first_level``. Every following level is built by
    combining the nodes of two earlier levels ``left`` and ``right`` with
    ``left + right == cur_lvl - 1``. A combination is skipped when ``check_attributes`` returns a
    truthy value, and ignored for enumeration when its ``key[1]`` is already in ``all_nodes``.
    The loop ends as soon as a level contributes no node to ``cur_lvl_nodes``.

    Args:
        all_features: Collection of first-level features; only its length is used here, the
            object itself is forwarded to ``make_first_level``.
        complete_x, loss, x_size, y_test, errors: Forwarded unchanged to ``make_first_level`` and
            to the ``Node`` constructor; ``loss`` and ``x_size`` are also passed to ``opt_fun``
            and the bound checks.
        debug: When truthy, ``print_debug`` is called on every newly registered node.
        alpha: Forwarded to ``check_bounds`` / ``check_constraint``.
        k: Forwarded to the ``Topk`` constructor.
        w: Forwarded to the bound computations and to ``opt_fun``.
        loss_type: Forwarded to ``make_first_level`` and ``Node.process_slice``.
        b_update: When truthy, bounds are recomputed for a combination whose key already exists.

    Returns:
        None. Progress and the selected slices are written to standard output.
    """
    top_k = Topk(k)
    # First level slices are enumerated in a "classic way" (getting data and not analyzing bounds)
    levels = []
    first_level = make_first_level(all_features, complete_x, loss, x_size, y_test, errors, loss_type, w, alpha,
                                   top_k)
    # every entry of levels is a pair (nodes of that level, number of candidates of that level)
    levels.append((first_level[0], len(all_features)))
    all_nodes = first_level[1]
    # cur_lvl - index of current level, correlates with number of slice forming features
    cur_lvl = 1  # level that is planned to be filled later
    # Seed value only: its length is checked once to enter the loop below, after which the name
    # is rebound to a fresh list.
    cur_lvl_nodes = first_level
    # currently for debug
    print("Level 1 had " + str(len(all_features)) + " candidates")
    print()
    print("Current topk are: ")
    top_k.print_topk()
    # DPSize algorithm approach of previous levels nodes combinations and updating bounds for those
    # that already exist
    while len(cur_lvl_nodes) > 0:
        cur_lvl_nodes = []
        count = 0
        for left in range(cur_lvl // 2 + 1):
            right = cur_lvl - 1 - left
            # levels is only appended to after this loop, so both lists are stable while iterating.
            left_nodes = levels[left][0]
            right_nodes = levels[right][0]
            for left_parent in left_nodes:
                for right_parent in right_nodes:
                    if check_attributes(left_parent, right_parent):
                        continue

                    new_node = Node(complete_x, loss, x_size, y_test, errors)
                    # Going through a set drops duplicates among the parents.
                    parents_set = set(new_node.parents)
                    parents_set.add(left_parent)
                    parents_set.add(right_parent)
                    new_node.parents = list(parents_set)
                    new_node.attributes = union(left_parent.attributes, right_parent.attributes)
                    new_node.name = new_node.make_name()
                    new_node.key = new_node.make_key(len(all_nodes))
                    node_key = new_node.key[1]

                    if node_key in all_nodes:
                        existing_item = all_nodes[node_key]
                        # NOTE(review): this only converts the existing parents to a set; the
                        # parents of new_node are not merged in. Behaviour kept as-is.
                        existing_item.parents = set(existing_item.parents)
                        if b_update:
                            # NOTE(review): the refreshed bounds are stored on new_node, which is
                            # not kept after this iteration; existing_item may have been intended.
                            s_upper = new_node.calc_s_upper(cur_lvl)
                            s_lower = new_node.calc_s_lower(cur_lvl)
                            e_upper = new_node.calc_e_upper()
                            e_max_upper = new_node.calc_e_max_upper(cur_lvl)
                            new_node.update_bounds(s_upper, s_lower, e_upper, e_max_upper, w)
                        continue

                    new_node.calc_bounds(cur_lvl, w)
                    all_nodes[node_key] = new_node
                    # check if concrete data should be extracted or not (only for those that have
                    # score upper bound big enough and if size of subset is big enough)
                    to_slice = new_node.check_bounds(top_k, x_size, alpha)
                    if to_slice:
                        new_node.process_slice(loss_type)
                        new_node.score = opt_fun(new_node.loss, new_node.size, loss, x_size, w)
                        # only nodes that satisfy the constraint and are not in the top-k yet are
                        # added to it; every sliced node goes on to the next level
                        if new_node.check_constraint(top_k, x_size, alpha) and new_node.key not in top_k.keys:
                            top_k.add_new_top_slice(new_node)
                        cur_lvl_nodes.append(new_node)
                    if debug:
                        new_node.print_debug(top_k, cur_lvl)
            count += levels[left][1] * levels[right][1]
        # NOTE(review): cur_lvl is printed before it is incremented, so the first pass reports
        # "Level 1" again although the first level was already reported above -- confirm numbering.
        print("Level " + str(cur_lvl) + " had " + str(count) + " candidates but after pruning only " +
              str(len(cur_lvl_nodes)) + " go to the next level")
        cur_lvl += 1
        levels.append((cur_lvl_nodes, count))
        top_k.print_topk()
    print("Program stopped at level " + str(cur_lvl))
    print()
    print("Selected slices are: ")
    top_k.print_topk()