Example #1
def parallel_process(all_features, predictions, loss, sc, debug, alpha, k, w, loss_type):
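    # Level-wise slice enumeration on Spark: prediction rows are mapped into
    # per-slice buckets, the buckets are aggregated with combineByKey, scored,
    # pruned against the (alpha, w) bounds and the global top-k is updated
    # after every level.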
    top_k = Topk(k)
    cur_lvl = 0
    cur_lvl_nodes = list(all_features)
    pred_pandas = predictions.toPandas()
    x_size = len(pred_pandas)
    b_topk = SparkContext.broadcast(sc, top_k)
    b_cur_lvl = SparkContext.broadcast(sc, cur_lvl)
    buckets = {}
    for node in cur_lvl_nodes:
        bucket = Bucket(node, cur_lvl, w, x_size, loss)
        buckets[bucket.name] = bucket
    b_buckets = SparkContext.broadcast(sc, buckets)
    rows = predictions.rdd.map(lambda row: (row[1].indices, row[2]))\
        .map(lambda item: list(item))
    mapped = rows.map(lambda row: rows_mapper(row, b_buckets.value, loss_type))
    flattened = mapped.flatMap(lambda line: (line.items()))
    reduced = flattened.combineByKey(combiner, merge_values, merge_combiners)
    cur_lvl_nodes = reduced.values()\
        .map(lambda bucket: spark_utils.calc_bucket_metrics(bucket, loss, w, x_size, b_cur_lvl.value))
    if debug:
        cur_lvl_nodes.map(lambda bucket: bucket.print_debug(b_topk.value)).collect()
    cur_lvl = 1
    prev_level = cur_lvl_nodes.collect()
    top_k = top_k.buckets_top_k(prev_level, x_size, alpha, 1)
    while len(prev_level) > 0:
        b_cur_lvl_nodes = SparkContext.broadcast(sc, prev_level)
        b_topk = SparkContext.broadcast(sc, top_k)
        cur_min = top_k.min_score
        b_cur_lvl = SparkContext.broadcast(sc, cur_lvl)
        top_k.print_topk()
        buckets = join_enum(prev_level, cur_lvl, x_size, alpha, top_k, w, loss)
        b_buckets = SparkContext.broadcast(sc, buckets)
        to_slice = dict(filter(lambda bucket: bucket[1].check_bounds(x_size, alpha, top_k), buckets.items()))
        b_to_slice = SparkContext.broadcast(sc, to_slice)
        mapped = rows.map(lambda row: rows_mapper(row, b_to_slice.value, loss_type))
        flattened = mapped.flatMap(lambda line: (line.items()))
        to_process = flattened.combineByKey(combiner, merge_values, merge_combiners)
        if debug:
            to_process.values().map(lambda bucket: bucket.print_debug(b_topk.value)).collect()
        prev_level = to_process\
            .map(lambda bucket: spark_utils.calc_bucket_metrics(bucket[1], loss, w, x_size, b_cur_lvl.value))\
            .collect()
        cur_lvl += 1
        top_k = top_k.buckets_top_k(prev_level, x_size, alpha, cur_min)
        print("Level " + str(cur_lvl) + " had " + str(
            len(b_cur_lvl_nodes.value * (len(prev_level) - 1)))+" candidates but after pruning only " +
              str(len(prev_level)) + " go to the next level")
        top_k.print_topk()
    print()
    print("Program stopped at level " + str(cur_lvl - 1))
    print("Selected slices are: ")
    top_k.print_topk()
    return None
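
The combiner, merge_values and merge_combiners helpers used with combineByKey above are not part of this listing. As a reminder of the contract Spark expects from these three functions, here is a minimal, hypothetical sketch; the real helpers operate on the project's Bucket objects, so the rows and error fields below are assumptions rather than names taken from the code:

def combiner(value):
    # build the initial per-key aggregate from the first value seen for a key
    return value


def merge_values(agg, value):
    # fold another value from the same partition into the running aggregate
    agg.rows.extend(value.rows)      # hypothetical Bucket fields
    agg.error += value.error
    return agg


def merge_combiners(left, right):
    # merge partial aggregates produced on different partitions
    left.rows.extend(right.rows)
    left.error += right.error
    return left
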
Example #2
def process(all_features, complete_x, loss, x_size, y_test, errors, debug, alpha, k, w, loss_type, b_update):
    levels = []
    top_k = Topk(k)
    first_level = make_first_level(all_features, complete_x, loss, x_size, y_test, errors, loss_type, top_k, alpha, w)
    all_nodes = first_level[1]
    levels.append(first_level[0])
    # cur_lvl - index of the current level; it equals the number of features that form a slice
    cur_lvl = 1  # the level already filled by the initial iteration
    # currently for debug
    print("Level 1 had " + str(len(all_features)) + " candidates")
    print()
    print("Current topk are: ")
    top_k.print_topk()
    # combine each candidate of the previous level with every other one until no new pairs can be formed (a single node cannot make a pair)
    while len(levels[cur_lvl - 1]) > 1:
        cur_lvl_nodes = []
        prev_lvl = levels[cur_lvl - 1]
        for node_i in range(len(prev_lvl)):
            partial = join_enum(node_i, prev_lvl, complete_x, loss, x_size, y_test, errors, debug, alpha, w, loss_type,
                                b_update, cur_lvl, all_nodes, top_k, cur_lvl_nodes)
            cur_lvl_nodes = partial[0]
            all_nodes = partial[1]
        cur_lvl = cur_lvl + 1
        levels.append(cur_lvl_nodes)
        top_k.print_topk()
        print("Level " + str(cur_lvl) + " had " + str(len(prev_lvl) * (len(prev_lvl) - 1)) +
              " candidates but after pruning only " + str(len(cur_lvl_nodes)) + " go to the next level")
    print("Program stopped at level " + str(cur_lvl + 1))
    print()
    print("Selected slices are: ")
    top_k.print_topk()
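
A sketch of how process might be invoked. The argument values mirror the fixtures built in Example #11 (one-hot encoded rows enumerated into (index, row) pairs, per-row squared errors and the overall MSE as loss), so this is illustrative rather than canonical:

process(all_features, complete_x, loss, len(complete_x), y_test, errors,
        debug=True, alpha=4, k=5, w=0.5, loss_type=0, b_update=True)
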
Example #3
def process(all_features, complete_x, loss, x_size, y_test, errors, debug,
            alpha, k, w, loss_type, b_update):
    levels = []
    top_k = Topk(k)
    first_level = make_first_level(all_features, complete_x, loss, x_size,
                                   y_test, errors, loss_type, top_k, alpha, w)
    candidates = []
    pruned = []
    indexes = []
    indexes.append(1)
    candidates.append(len(first_level[0]))
    pruned.append(len(first_level[0]))
    all_nodes = first_level[1]
    levels.append(first_level[0])
    # cur_lvl - index of the current level; it equals the number of features that form a slice
    cur_lvl = 1  # the level already filled by the initial iteration
    print()
    print("Current topk are: ")
    top_k.print_topk()
    # combine each candidate of the previous level with every other one until no new pairs can be formed (a single node cannot make a pair)
    while len(levels[cur_lvl - 1]) > 1:
        cur_lvl_nodes = {}
        prev_lvl = levels[cur_lvl - 1]
        level_candidates = len(prev_lvl) * (len(prev_lvl) - 1)
        candidates.append(level_candidates)
        for node_i in prev_lvl:
            partial = join_enum(node_i, prev_lvl, complete_x, loss, x_size,
                                y_test, errors, debug, alpha, w, loss_type,
                                b_update, cur_lvl, all_nodes, top_k,
                                cur_lvl_nodes)
            cur_lvl_nodes = partial[0]
            all_nodes = partial[1]
        cur_lvl = cur_lvl + 1
        indexes.append(cur_lvl)
        levels.append(cur_lvl_nodes)
        print("Level " + str(cur_lvl) + " had " + str(candidates) +
              " candidates but after pruning only " + str(len(cur_lvl_nodes)) +
              " go to the next level")
        pruned.append(len(cur_lvl_nodes))
        print()
        print("Current topk are: ")
        top_k.print_topk()
    plt.plot(indexes, candidates, 'r--', indexes, pruned, 'g--')
    plt.xlabel('Level')
    plt.ylabel('Number of slices')
    plt.show()
    print("Program stopped at level " + str(cur_lvl))
    print()
    print("Selected slices are: ")
    top_k.print_topk()
    print("candidates:")
    print(candidates)
    print(">>>>>>>>>")
    print("pruned:")
    print(pruned)
    return top_k
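
Example #3 also plots the candidate and pruned counts per level, so it assumes the conventional matplotlib import at module level:

import matplotlib.pyplot as plt
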
Example #4
def process(all_features, predictions, loss, sc, debug, alpha, k, w, loss_type,
            enumerator):
    top_k = Topk(k)
    cur_lvl = 0
    levels = []
    all_features = list(all_features)
    first_level = {}
    first_tasks = sc.parallelize(all_features)
    b_topk = SparkContext.broadcast(sc, top_k)
    init_slices = first_tasks.mapPartitions(lambda features: spark_utils.make_first_level(features, predictions, loss,
                                                                                          b_topk.value, w, loss_type)) \
        .map(lambda node: (node.key, node)) \
        .collect()
    first_level.update(init_slices)
    update_top_k(first_level, top_k, alpha, predictions)
    prev_level = SparkContext.broadcast(sc, first_level)
    levels.append(prev_level)
    cur_lvl = 1
    top_k.print_topk()
    while len(levels[cur_lvl - 1].value) > 0:
        cur_lvl_res = {}
        b_topk = SparkContext.broadcast(sc, top_k)
        for left in range(int(cur_lvl / 2) + 1):
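            # pair levels whose indices sum to cur_lvl - 1: a slice with (left + 1)
            # attributes is joined with one having (right + 1) attributes, producing
            # candidates for the next level (DPSize-style enumeration)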
            right = cur_lvl - left - 1
            partitions = sc.parallelize(levels[left].value.values())
            mapped = partitions.mapPartitions(
                lambda nodes: spark_utils.nodes_enum(
                    nodes, levels[right].value.values(
                    ), predictions, loss, b_topk.value, alpha, k, w, loss_type,
                    cur_lvl, debug, enumerator))
            flattened = mapped.flatMap(lambda node: node)
            partial = flattened.map(lambda node: (node.key, node)).collect()
            cur_lvl_res.update(partial)
        prev_level = SparkContext.broadcast(sc, cur_lvl_res)
        levels.append(prev_level)
        update_top_k(cur_lvl_res, top_k, alpha, predictions)
        cur_lvl = cur_lvl + 1
        top_k.print_topk()
        print("Level " + str(cur_lvl) + " had " + str(
            len(levels[cur_lvl - 1].value) *
            (len(levels[cur_lvl - 1].value) - 1)) +
              " candidates but after pruning only " +
              str(len(prev_level.value)) + " go to the next level")
    print("Program stopped at level " + str(cur_lvl))
    print()
    print("Selected slices are: ")
    top_k.print_topk()
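
For reference, the SparkContext.broadcast(sc, obj) calls used throughout these examples are the unbound-method spelling of the usual PySpark broadcast idiom shown below; rdd and score_row are placeholders rather than names from the project:

b_topk = sc.broadcast(top_k)                                 # ship a read-only copy to every executor
scored = rdd.map(lambda row: score_row(row, b_topk.value))   # workers read it through .value
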
Example #5
def process(all_features, predictions, f_l2, sc, debug, alpha, k, w, loss_type, enumerator):
    top_k = Topk(k)
    cur_lvl = 0
    levels = []
    all_features = list(all_features)
    first_tasks = sc.parallelize(all_features)
    partitions = first_tasks.glom()
    SparkContext.broadcast(sc, top_k)
    first_level = partitions.map(lambda features: sparked_utils.make_first_level(features, predictions, f_l2, top_k,
                                                                                 alpha, k, w, loss_type))
    first_lvl_res = first_level.reduce(lambda a, b: a + b)
    update_top_k(first_lvl_res, top_k, alpha, predictions)
    SparkContext.broadcast(sc, top_k)
    levels.append(first_lvl_res)
    levels.append(first_lvl_res)
    SparkContext.broadcast(sc, levels)
    cur_lvl = 2
    top_k.print_topk()
    SparkContext.broadcast(sc, top_k)
    while len(levels[cur_lvl - 1]) > 0:
        cur_lvl_res = {}
        nodes_list = []
        for left in range(int(cur_lvl / 2)):
            right = cur_lvl - 1 - left
            partitions = sc.parallelize(levels[left])
            part = partitions.glom()
            print(levels[right])
            mapped = part.map(lambda nodes: sparked_utils.nodes_enum(nodes, levels[right], predictions, f_l2,
                                                                      top_k, alpha, k, w, loss_type, cur_lvl, debug, enumerator))
            partial_nodes = mapped.reduce(lambda a, b: a + b)
            partial_res = flatten(partial_nodes)
            result = update_nodes(partial_res, nodes_list, cur_lvl_res, w)
            cur_lvl_res = result[0]
            nodes_list = result[1]
        levels.append(nodes_list)
        SparkContext.broadcast(sc, levels)
        SparkContext.broadcast(sc, top_k)
        update_top_k(list(nodes_list), top_k, alpha, predictions)
        SparkContext.broadcast(sc, top_k)
        cur_lvl = cur_lvl + 1
        top_k.print_topk()
        print("Level " + str(cur_lvl) + " had " + str(len(levels) * (len(levels) - 1)) +
              " candidates but after pruning only " + str(len(cur_lvl_res)) + " go to the next level")
    print("Program stopped at level " + str(cur_lvl))
    print()
    print("Selected slices are: ")
    top_k.print_topk()
Example #6
def parallel_process(all_features, predictions, f_l2, sc, debug, alpha, k, w,
                     loss_type, enumerator):
    top_k = Topk(k)
    cur_lvl = 0
    levels = []
    all_features = list(all_features)
    first_tasks = sc.parallelize(all_features)
    partitions = first_tasks.glom()
    SparkContext.broadcast(sc, top_k)
    first_level = partitions.map(
        lambda features: sparked_utils.make_first_level(
            features, predictions, f_l2, top_k, alpha, k, w, loss_type))
    first_lvl_res = first_level.reduce(lambda a, b: a + b)
    update_top_k(first_lvl_res, top_k, alpha, predictions)
    SparkContext.broadcast(sc, top_k)
    levels = first_lvl_res
    SparkContext.broadcast(sc, levels)
    cur_lvl = cur_lvl + 1
    top_k.print_topk()
    SparkContext.broadcast(sc, top_k)
    # keep processing while the level has more than one node; a single remaining node cannot form a new pair
    while len(levels) > 1:
        partitions = sc.parallelize(levels)
        part = partitions.glom()
        mapped = part.map(lambda nodes: sparked_utils.nodes_enum(
            nodes, levels, predictions, f_l2, top_k, alpha, k, w, loss_type,
            cur_lvl, debug, enumerator))
        cur_lvl_nodes = mapped.reduce(lambda a, b: a + b)
        lvl_nodes_res = flatten(cur_lvl_nodes)
        update_top_k(list(lvl_nodes_res), top_k, alpha, predictions)
        prev_count = len(levels)  # size of the level that produced these candidates
        levels = lvl_nodes_res
        SparkContext.broadcast(sc, levels)
        SparkContext.broadcast(sc, top_k)
        cur_lvl = cur_lvl + 1
        top_k.print_topk()
        print("Level " + str(cur_lvl) + " had " +
              str(prev_count *
                  (prev_count - 1)) + " candidates but after pruning only " +
              str(len(lvl_nodes_res)) + " go to the next level")
    print("Program stopped at level " + str(cur_lvl))
    print()
    print("Selected slices are: ")
    top_k.print_topk()
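
Examples #5 and #6 also rely on a flatten helper that is not shown here. Judging by how its result is consumed, it collapses the nested per-partition lists returned by nodes_enum into a single list of nodes; the sketch below is an assumption about that behaviour, not the project's actual implementation:

def flatten(nested):
    # collapse a list of lists into one flat list of nodes
    return [node for part in nested for node in part]
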
Example #7
def parallel_process(all_features, predictions, loss, sc, debug, alpha, k, w,
                     loss_type, enumerator):
    top_k = Topk(k)
    cur_lvl = 0
    levels = []
    first_level = {}
    all_features = list(all_features)
    first_tasks = sc.parallelize(all_features)
    b_topk = SparkContext.broadcast(sc, top_k)
    init_slices = first_tasks.mapPartitions(lambda features: spark_utils.make_first_level(features, predictions, loss,
                                                                                          b_topk.value, w, loss_type)) \
        .map(lambda node: (node.key, node)).collect()
    first_level.update(init_slices)
    update_top_k(first_level, b_topk.value, alpha, predictions)
    prev_level = SparkContext.broadcast(sc, first_level)
    levels.append(prev_level)
    cur_lvl = cur_lvl + 1
    b_topk.value.print_topk()
    # keep processing while the previous level is non-empty; otherwise no candidates were generated and enumeration stops
    while len(levels[cur_lvl - 1].value) > 0:
        nodes_list = {}
        partitions = sc.parallelize(levels[cur_lvl - 1].value.values())
        mapped = partitions.mapPartitions(lambda nodes: spark_utils.nodes_enum(
            nodes, levels[cur_lvl - 1].value.values(), predictions, loss,
            b_topk.value, alpha, k, w, loss_type, cur_lvl, debug, enumerator))
        flattened = mapped.flatMap(lambda node: node)
        nodes_list.update(
            flattened.map(lambda node: (node.key, node)).distinct().collect())
        prev_level = SparkContext.broadcast(sc, nodes_list)
        levels.append(prev_level)
        update_top_k(nodes_list, b_topk.value, alpha, predictions)
        cur_lvl = cur_lvl + 1
        b_topk.value.print_topk()
        print("Level " + str(cur_lvl) + " had " + str(
            len(levels[cur_lvl - 1].value) *
            (len(levels[cur_lvl - 1].value) - 1)) +
              " candidates but after pruning only " + str(len(nodes_list)) +
              " go to the next level")
    print("Program stopped at level " + str(cur_lvl))
    print()
    print("Selected slices are: ")
    b_topk.value.print_topk()
Example #8
def test_extreme_target(self):
    test_dataset = pd.read_csv("/home/lana/diploma/project/slicing/datasets/toy_extreme_change.csv")
    y_test = test_dataset.iloc[:, self.attributes_amount - 1:self.attributes_amount].values
    x_test = test_dataset.iloc[:, 0:self.attributes_amount - 1].values
    y_pred = self.model.predict(x_test)
    print("Mean squared error: %.2f"
          % mean_squared_error(y_test, y_pred))
    print('r_2 statistic: %.2f' % r2_score(y_test, y_pred))
    # Now that we have trained the model, we can print the coefficient of x that it has predicted
    print('Coefficients: \n', self.model.coef_)
    enc = OneHotEncoder(handle_unknown='ignore')
    x = enc.fit_transform(x_test).toarray()
    complete_x = []
    complete_y = []
    counter = 0
    for item in x:
        complete_x.append((counter, item))
        complete_y.append((counter, y_test[counter]))
        counter = counter + 1
    all_features = enc.get_feature_names()
    loss = mean_squared_error(y_test, y_pred)
    devs = (y_pred - y_test) ** 2
    errors = []
    counter = 0
    for pred in devs:
        errors.append((counter, pred))
        counter = counter + 1
    k = 5
    w = 0.5
    alpha = 4
    top_k = Topk(k)
    debug = True
    b_update = True
    first_level = slicer.make_first_level(all_features, list(complete_x), loss, len(complete_x), y_test, errors,
                                          self.loss_type, top_k, alpha, w)
    first_level_nodes = first_level[0]
    slice_member = first_level_nodes[(7, 'x2_2')]
    self.assertGreater(slice_member.loss, self.slice_member.loss)
    print("check 1")
    self.assertGreater(slice_member.score, self.slice_member.score)
    print("check 2")
Example #9
def parallel_process(all_features, predictions, loss, sc, debug, alpha, k, w, loss_type):
    top_k = Topk(k)
    cur_lvl = 0
    levels = []
    cur_lvl_nodes = list(all_features)
    pred_pandas = predictions.toPandas()
    x_size = len(pred_pandas)
    b_topk = SparkContext.broadcast(sc, top_k)
    b_cur_lvl = SparkContext.broadcast(sc, cur_lvl)
    buckets = {}
    for node in cur_lvl_nodes:
        bucket = Bucket(node, cur_lvl, w, x_size, loss)
        buckets[bucket.name] = bucket
    b_buckets = SparkContext.broadcast(sc, buckets)
    # rows = predictions.rdd.map(lambda row: (row[0], row[1].indices, row[2])) \
    #     .map(lambda item: (item[0], item[1].tolist(), item[2]))
    rows = predictions.rdd.map(lambda row: row[1].indices) \
        .map(lambda item: list(item))
    mapped = rows.map(lambda row: rows_mapper(row, b_buckets.value, loss_type))
    flattened = mapped.flatMap(lambda line: (line.items()))
    reduced = flattened.combineByKey(combiner, join_data_parallel.merge_values, join_data_parallel.merge_combiners)
    cur_lvl_nodes = reduced.values() \
        .map(lambda bucket: spark_utils.calc_bucket_metrics(bucket, loss, w, x_size, b_cur_lvl.value))
    if debug:
        cur_lvl_nodes.map(lambda bucket: bucket.print_debug(b_topk.value)).collect()
    cur_lvl = 1
    prev_level = cur_lvl_nodes.collect()
    b_cur_lvl_nodes = SparkContext.broadcast(sc, prev_level)
    levels.append(b_cur_lvl_nodes)
    top_k = top_k.buckets_top_k(prev_level, x_size, alpha, 1)
    while len(prev_level) > 0:
        b_topk = SparkContext.broadcast(sc, top_k)
        cur_min = top_k.min_score
        b_cur_lvl = SparkContext.broadcast(sc, cur_lvl)
        top_k.print_topk()
        buckets = []
        for left in range(int(cur_lvl / 2) + 1):
            right = cur_lvl - left - 1
            nodes = union_enum(levels[left].value, levels[right].value, x_size, alpha, top_k, w, loss, cur_lvl)
            buckets.append(nodes)
        b_buckets = sc.parallelize(buckets)
        all_buckets = b_buckets.flatMap(lambda line: (line.items()))
        combined = dict(all_buckets.combineByKey(combiner, merge_values, merge_combiners).collect())
        b_buckets = SparkContext.broadcast(sc, combined)
        to_slice = dict(filter(lambda bucket: bucket[1].check_bounds(x_size, alpha, top_k), combined.items()))
        b_to_slice = SparkContext.broadcast(sc, to_slice)
        mapped = rows.map(lambda row: rows_mapper(row, b_to_slice.value, loss_type))
        flattened = mapped.flatMap(lambda line: (line.items()))
        partial = flattened.combineByKey(combiner, join_data_parallel.merge_values, join_data_parallel.merge_combiners)
        prev_level = partial\
            .map(lambda bucket: spark_utils.calc_bucket_metrics(bucket[1], loss, w, x_size, b_cur_lvl.value)).collect()
        top_k = top_k.buckets_top_k(prev_level, x_size, alpha, cur_min)
        b_topk = SparkContext.broadcast(sc, top_k)
        if debug:
            partial.values().map(lambda bucket: bucket.print_debug(b_topk.value)).collect()
        print("Level " + str(cur_lvl) + " had " + str(
            len(levels[cur_lvl - 1].value) * (len(levels[cur_lvl - 1].value) - 1)) +
              " candidates but after pruning only " + str(len(prev_level)) + " go to the next level")
        print("Program stopped at level " + str(cur_lvl))
        b_cur_lvl_nodes = SparkContext.broadcast(sc, prev_level)
        levels.append(b_cur_lvl_nodes)
        cur_lvl += 1
    print()
    print("Program stopped at level " + str(cur_lvl))
    print("Selected slices are: ")
    top_k.print_topk()
    return None
Example #10
def process(all_features, complete_x, loss, x_size, y_test, errors, debug,
            alpha, k, w, loss_type, b_update):
    top_k = Topk(k)
    # First-level slices are enumerated in the "classic" way (the data is materialized directly, without analyzing bounds)
    levels = []
    first_level = make_first_level(all_features, complete_x, loss, x_size,
                                   y_test, errors, loss_type, w, alpha, top_k)
    # first-level nodes are appended so that the second level can be enumerated in the same way as the later ones
    levels.append((first_level[0], len(all_features)))
    all_nodes = first_level[1]
    # cur_lvl - index of the current level; it equals the number of features that form a slice
    cur_lvl = 1  # the level that will be filled next
    cur_lvl_nodes = first_level[0]
    # currently for debug
    print("Level 1 had " + str(len(all_features)) + " candidates")
    print()
    print("Current topk are: ")
    top_k.print_topk()
    # DPSize-style enumeration: combine nodes from previous levels and update the bounds of combinations that already exist
    while len(cur_lvl_nodes) > 0:
        cur_lvl_nodes = []
        count = 0
        for left in range(int(cur_lvl / 2) + 1):
            right = cur_lvl - 1 - left
            for node_i in range(len(levels[left][0])):
                for node_j in range(len(levels[right][0])):
                    flag = check_attributes(levels[left][0][node_i],
                                            levels[right][0][node_j])
                    if not flag:
                        new_node = Node(complete_x, loss, x_size, y_test,
                                        errors)
                        parents_set = set(new_node.parents)
                        parents_set.add(levels[left][0][node_i])
                        parents_set.add(levels[right][0][node_j])
                        new_node.parents = list(parents_set)
                        parent1_attr = levels[left][0][node_i].attributes
                        parent2_attr = levels[right][0][node_j].attributes
                        new_node_attr = union(parent1_attr, parent2_attr)
                        new_node.attributes = new_node_attr
                        new_node.name = new_node.make_name()
                        new_id = len(all_nodes)
                        new_node.key = new_node.make_key(new_id)
                        if new_node.key[1] in all_nodes:
                            existing_item = all_nodes[new_node.key[1]]
                            parents_set = set(existing_item.parents)
                            existing_item.parents = parents_set
                            if b_update:
                                s_upper = new_node.calc_s_upper(cur_lvl)
                                s_lower = new_node.calc_s_lower(cur_lvl)
                                e_upper = new_node.calc_e_upper()
                                e_max_upper = new_node.calc_e_max_upper(
                                    cur_lvl)
                                new_node.update_bounds(s_upper, s_lower,
                                                       e_upper, e_max_upper, w)
                        else:
                            new_node.calc_bounds(cur_lvl, w)
                            all_nodes[new_node.key[1]] = new_node
                            # check whether the concrete data should be extracted (only for nodes whose
                            # score upper bound and subset size are large enough)
                            to_slice = new_node.check_bounds(
                                top_k, x_size, alpha)
                            if to_slice:
                                new_node.process_slice(loss_type)
                                new_node.score = opt_fun(
                                    new_node.loss, new_node.size, loss, x_size,
                                    w)
                                # decide whether to add the node to the current level (so it can form new
                                # combinations on the next level) based on its score value
                                if new_node.check_constraint(
                                        top_k, x_size, alpha
                                ) and new_node.key not in top_k.keys:
                                    top_k.add_new_top_slice(new_node)
                                cur_lvl_nodes.append(new_node)
                            if debug:
                                new_node.print_debug(top_k, cur_lvl)
            count = count + levels[left][1] * levels[right][1]
        print("Level " + str(cur_lvl) + " had " + str(count) +
              " candidates but after pruning only " + str(len(cur_lvl_nodes)) +
              " go to the next level")
        cur_lvl = cur_lvl + 1
        levels.append((cur_lvl_nodes, count))
        top_k.print_topk()
    print("Program stopped at level " + str(cur_lvl))
    print()
    print("Selected slices are: ")
    top_k.print_topk()
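
The union helper that merges the parents' attribute lists is not listed either. The expectation in test_parents_second of Example #11 (a merged list of [('x0_3', 2), ('x1_3', 5), ('x2_2', 7)]) suggests a duplicate-free merge ordered by feature index; a hypothetical sketch consistent with that test would be:

def union(attributes_a, attributes_b):
    # merge two (name, index) attribute lists, drop duplicates and
    # keep the result ordered by the numeric feature index
    return sorted(set(attributes_a) | set(attributes_b), key=lambda attr: attr[1])
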
Example #11
class SliceTests(unittest.TestCase):
    loss_type = 0
    # x, y = m.generate_dataset(10, 100)
    train_dataset = pd.read_csv("toy_train.csv")
    attributes_amount = len(train_dataset.values[0])
    model = linear_model.LinearRegression()
    y_train = train_dataset.iloc[:, attributes_amount - 1:attributes_amount].values
    x_train = train_dataset.iloc[:, 0:attributes_amount - 1].values
    model.fit(x_train, y_train)
    test_dataset = pd.read_csv("toy.csv")
    y_test = test_dataset.iloc[:, attributes_amount - 1:attributes_amount].values
    x_test = test_dataset.iloc[:, 0:attributes_amount - 1].values
    y_pred = model.predict(x_test)
    print("Mean squared error: %.2f"
          % mean_squared_error(y_test, y_pred))
    print('r_2 statistic: %.2f' % r2_score(y_test, y_pred))
    # Now that we have trained the model, we can print the coefficient of x that it has predicted
    print('Coefficients: \n', model.coef_)
    enc = OneHotEncoder(handle_unknown='ignore')
    x = enc.fit_transform(x_test).toarray()
    complete_x = []
    complete_y = []
    counter = 0
    for item in x:
        complete_x.append((counter, item))
        complete_y.append((counter, y_test[counter]))
        counter = counter + 1
    all_features = enc.get_feature_names()
    loss = mean_squared_error(y_test, y_pred)
    devs = (y_pred - y_test) ** 2
    errors = []
    counter = 0
    for pred in devs:
        errors.append((counter, pred))
        counter = counter + 1
    k = 5
    w = 0.5
    alpha = 4
    top_k = Topk(k)
    debug = True
    b_update = True
    first_level = slicer.make_first_level(all_features, list(complete_x), loss, len(complete_x), y_test, errors,
                                          loss_type, top_k, alpha, w)
    first_level_nodes = first_level[0]
    slice_member = first_level_nodes[(7, 'x2_2')]

    def test_attr_spark(self):
        conf = SparkConf().setAppName("toy_test").setMaster('local[2]')
        num_partitions = 2
        enumerator = "join"
        model_type = "regression"
        label = 'target'
        sparkContext = SparkContext(conf=conf)
        sqlContext = SQLContext(sparkContext)
        train_df = sqlContext.read.csv("toy_train.csv", header='true', inferSchema='true')
        test_df = sqlContext.read.csv("toy.csv", header='true', inferSchema='true')
        # initializing stages of main transformation pipeline
        stages = []
        # list of categorical features for further hot-encoding
        cat_features = ['a', 'b', 'c']
        for feature in cat_features:
            string_indexer = StringIndexer(inputCol=feature, outputCol=feature + "_index").setHandleInvalid("skip")
            encoder = OneHotEncoderEstimator(inputCols=[string_indexer.getOutputCol()], outputCols=[feature + "_vec"])
            encoder.setDropLast(False)
            stages += [string_indexer, encoder]
        assembler_inputs = [feature + "_vec" for feature in cat_features]
        assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="assembled_inputs")
        stages += [assembler]
        assembler_final = VectorAssembler(inputCols=["assembled_inputs"], outputCol="features")
        stages += [assembler_final]
        pipeline = Pipeline(stages=stages)
        train_pipeline_model = pipeline.fit(train_df)
        test_pipeline_model = pipeline.fit(test_df)
        train_df_transformed = train_pipeline_model.transform(train_df)
        test_df_transformed = test_pipeline_model.transform(test_df)
        train_df_transformed = train_df_transformed.withColumn('model_type', sf.lit(0))
        test_df_transformed = test_df_transformed.withColumn('model_type', sf.lit(0))
        decode_dict = {}
        counter = 0
        cat = 0
        for feature in cat_features:
            colIdx = test_df_transformed.select(feature, feature + "_index").distinct().rdd.collectAsMap()
            colIdx = {k: v for k, v in sorted(colIdx.items(), key=lambda item: item[1])}
            for item in colIdx:
                decode_dict[counter] = (cat, item, colIdx[item], counter)
                counter = counter + 1
            cat = cat + 1
        train_df_transform_fin = train_df_transformed.select('features', label, 'model_type')
        test_df_transform_fin = test_df_transformed.select('features', label, 'model_type')
        lr = LinearRegression(featuresCol='features', labelCol=label, maxIter=10, regParam=0.0, elasticNetParam=0.8)
        lr_model = lr.fit(train_df_transform_fin)
        evaluation = lr_model.evaluate(test_df_transform_fin)
        f_l2 = evaluation.meanSquaredError
        pred = evaluation.predictions
        pred_df_fin = pred.withColumn('error', spark_utils.calc_loss(pred[label], pred['prediction'], pred['model_type']))
        predictions = pred_df_fin.select('features', 'error').repartition(num_partitions)
        converter = IndexToString(inputCol='features', outputCol='cats')
        all_features = list(decode_dict)
        predictions = predictions.collect()
        spark_join = spark_slicer.parallel_process(all_features, predictions, f_l2, sparkContext,
                                                   debug=self.debug, alpha=self.alpha, k=self.k, w=self.w,
                                                   loss_type=self.loss_type, enumerator="join")
        spark_union = spark_union_slicer.process(all_features, predictions, f_l2, sparkContext,
                                                 debug=self.debug, alpha=self.alpha, k=self.k, w=self.w,
                                                 loss_type=self.loss_type, enumerator="union")
        self.assertEqual(3, len(spark_join.slices))
        print("check1")
        self.assertEqual(spark_join.min_score, spark_union.min_score)
        print("check2")
        self.assertEqual(spark_join.keys, spark_union.keys)
        print("check3")
        self.assertEqual(len(spark_join.slices), len(spark_union.slices))
        print("check4")
        idx = -1
        for sliced in spark_join.slices:
            idx += 1
            self.assertEqual(sliced.score, spark_union.slices[idx].score)
        print("check5")

    def test_features_number(self):
        self.assertEqual(len(self.all_features), 9)
        print("check 1")

    def test_base_first_level(self):
        self.assertEqual(9, len(self.first_level_nodes))
        print("check 2")

    def test_parents_first(self):
        self.assertIn(('x2_2', 7), self.slice_member.parents)
        print("check 3")

    def test_name(self):
        self.assertEqual('x2_2', self.slice_member.make_name())
        print("check 4")

    def test_size(self):
        self.assertEqual(36, self.slice_member.size)
        print("check 5")

    def test_e_upper(self):
        self.assertEqual(81, self.slice_member.e_upper)
        print("check 6")

    def test_loss(self):
        self.assertEqual(22, int(self.slice_member.loss))
        print("check 7")

    def test_opt_fun(self):
        self.slice_member.score = slicer.opt_fun(self.slice_member.loss, self.slice_member.size, self.loss, len(self.x_test), self.w)
        print("check 8")

    def test_score(self):
        self.assertEqual(1.2673015873015872, self.slice_member.score)
        print("check 9")

    def test_base_join_enum(self):
        cur_lvl_nodes = {}
        all_nodes = {}
        b_update = True
        cur_lvl = 1
        slice_index = (2, 'x0_3')
        combined = slicer.join_enum(slice_index, self.first_level_nodes, self.complete_x, self.loss,
                               len(self.complete_x), self.y_test, self.errors, self.debug, self.alpha, self.w,
                               self.loss_type, b_update, cur_lvl, all_nodes, self.top_k, cur_lvl_nodes)
        self.assertEqual(6, len(combined[0]))
        print("check1")

    def test_parents_second(self):
        cur_lvl_nodes = {}
        all_nodes = {}
        b_update = True
        cur_lvl = 1
        slice_index = (2, 'x0_3')
        combined = slicer.join_enum(slice_index, self.first_level_nodes, self.complete_x, self.loss,
                                    len(self.complete_x), self.y_test, self.errors, self.debug, self.alpha, self.w,
                                    self.loss_type, b_update, cur_lvl, all_nodes, self.top_k, cur_lvl_nodes)
        parent1 = combined[0][('x0_3 && x1_3')]
        parent2 = combined[0][('x0_3 && x2_2')]
        new_node = Node(self.complete_x, self.loss, len(self.complete_x), self.y_test, self.errors)
        new_node.parents = [parent1, parent2]
        parent1_attr = parent1.attributes
        parent2_attr = parent2.attributes
        new_node_attr = slicer.union(parent1_attr, parent2_attr)
        self.assertEqual(new_node_attr, [('x0_3', 2), ('x1_3', 5), ('x2_2', 7)])
        print("check2")

    def test_nonsense(self):
        cur_lvl_nodes = {}
        all_nodes = {}
        b_update = True
        cur_lvl = 1
        slice_index = (2, 'x0_3')
        combined = slicer.join_enum(slice_index, self.first_level_nodes, self.complete_x, self.loss,
                                    len(self.complete_x), self.y_test, self.errors, self.debug, self.alpha, self.w,
                                    self.loss_type, b_update, cur_lvl, all_nodes, self.top_k, cur_lvl_nodes)
        parent1 = combined[0][('x0_3 && x1_3')]
        parent2 = combined[0][('x0_3 && x2_2')]
        new_node = Node(self.complete_x, self.loss, len(self.complete_x), self.y_test, self.errors)
        new_node.parents = [parent1, parent2]
        parent1_attr = parent1.attributes
        parent2_attr = parent2.attributes
        new_node_attr = slicer.union(parent1_attr, parent2_attr)
        new_node.attributes = new_node_attr
        new_node.name = new_node.make_name()
        flagTrue = slicer.slice_name_nonsense(parent1, parent2, 2)
        self.assertEqual(True, flagTrue)
        print("check3")

    def test_non_nonsense(self):
        cur_lvl_nodes = {}
        all_nodes = {}
        b_update = True
        cur_lvl = 1
        slice_index = (2, 'x0_3')
        parent3 = Node(self.complete_x, self.loss, len(self.complete_x), self.y_test, self.errors)
        parent3.parents = [self.first_level_nodes[(4, 'x1_2')], self.first_level_nodes[(7, 'x2_2')]]
        parent3.attributes = [('x1_2', 4), ('x2_2', 7)]
        combined = slicer.join_enum(slice_index, self.first_level_nodes, self.complete_x, self.loss,
                                    len(self.complete_x), self.y_test, self.errors, self.debug, self.alpha, self.w,
                                    self.loss_type, b_update, cur_lvl, all_nodes, self.top_k, cur_lvl_nodes)
        parent2 = combined[0]['x0_3 && x2_3']
        parent3.key = (8, 'x1_2 && x2_2')
        flag_nonsense = slicer.slice_name_nonsense(parent2, parent3, 2)
        self.assertEqual(True, flag_nonsense)
        print("check4")

    def test_uppers(self):
        cur_lvl_nodes = {}
        all_nodes = {}
        b_update = True
        cur_lvl = 1
        slice_index = (2, 'x0_3')
        parent3 = Node(self.complete_x, self.loss, len(self.complete_x), self.y_test, self.errors)
        parent3.parents = [self.first_level_nodes[(4, 'x1_2')], self.first_level_nodes[(7, 'x2_2')]]
        parent3.attributes = [('x1_2', 4), ('x2_2', 7)]
        combined = slicer.join_enum(slice_index, self.first_level_nodes, self.complete_x, self.loss,
                                    len(self.complete_x), self.y_test, self.errors, self.debug, self.alpha, self.w,
                                    self.loss_type, b_update, cur_lvl, all_nodes, self.top_k, cur_lvl_nodes)
        parent1 = combined[0]['x0_3 && x1_3']
        parent2 = combined[0]['x0_3 && x2_3']
        new_node = Node(self.complete_x, self.loss, len(self.complete_x), self.y_test, self.errors)
        new_node.parents = [parent1, parent2]
        new_node.calc_bounds(2, self.w)
        self.assertEqual(25, new_node.s_upper)
        print("check5")
        self.assertEqual(398, int(new_node.c_upper))
        print("check6")

    def test_topk_slicing(self):
        join_top_k = slicer.process(self.all_features, self.complete_x, self.loss, len(self.complete_x),
                                    self.y_test, self.errors, self.debug, self.alpha, self.k, self.w,
                                    self.loss_type, self.b_update)
        union_top_k = union_slicer.process(self.all_features, self.complete_x, self.loss, len(self.complete_x),
                                           self.y_test, self.errors, self.debug, self.alpha, self.k, self.w,
                                           self.loss_type, self.b_update)
        self.assertEqual(join_top_k.min_score, union_top_k.min_score)
        print("check1")
        self.assertEqual(join_top_k.keys, union_top_k.keys)
        print("check2")
        self.assertEqual(len(join_top_k.slices), len(union_top_k.slices))
        print("check3")
        idx = -1
        for sliced in join_top_k.slices:
            idx += 1
            self.assertEqual(sliced.score, union_top_k.slices[idx].score)
        print("check4")

    def test_extreme_target(self):
        test_dataset = pd.read_csv("/home/lana/diploma/project/slicing/datasets/toy_extreme_change.csv")
        y_test = test_dataset.iloc[:, self.attributes_amount - 1:self.attributes_amount].values
        x_test = test_dataset.iloc[:, 0:self.attributes_amount - 1].values
        y_pred = self.model.predict(x_test)
        print("Mean squared error: %.2f"
              % mean_squared_error(y_test, y_pred))
        print('r_2 statistic: %.2f' % r2_score(y_test, y_pred))
        # Now that we have trained the model, we can print the coefficient of x that it has predicted
        print('Coefficients: \n', self.model.coef_)
        enc = OneHotEncoder(handle_unknown='ignore')
        x = enc.fit_transform(x_test).toarray()
        complete_x = []
        complete_y = []
        counter = 0
        for item in x:
            complete_x.append((counter, item))
            complete_y.append((counter, y_test[counter]))
            counter = counter + 1
        all_features = enc.get_feature_names()
        loss = mean_squared_error(y_test, y_pred)
        devs = (y_pred - y_test) ** 2
        errors = []
        counter = 0
        for pred in devs:
            errors.append((counter, pred))
            counter = counter + 1
        k = 5
        w = 0.5
        alpha = 4
        top_k = Topk(k)
        debug = True
        b_update = True
        first_level = slicer.make_first_level(all_features, list(complete_x), loss, len(complete_x), y_test, errors,
                                              self.loss_type, top_k, alpha, w)
        first_level_nodes = first_level[0]
        slice_member = first_level_nodes[(7, 'x2_2')]
        self.assertGreater(slice_member.loss, self.slice_member.loss)
        print("check 1")
        self.assertGreater(slice_member.score, self.slice_member.score)
        print("check 2")

    def test_error_significance(self):
        y_test = self.test_dataset.iloc[:, self.attributes_amount - 1:self.attributes_amount].values
        x_test = self.test_dataset.iloc[:, 0:self.attributes_amount - 1].values
        y_pred = self.model.predict(x_test)
        print("Mean squared error: %.2f"
              % mean_squared_error(y_test, y_pred))
        print('r_2 statistic: %.2f' % r2_score(y_test, y_pred))
        # Now that we have trained the model, we can print the coefficient of x that it has predicted
        print('Coefficients: \n', self.model.coef_)
        enc = OneHotEncoder(handle_unknown='ignore')
        x = enc.fit_transform(x_test).toarray()
        complete_x = []
        complete_y = []
        counter = 0
        for item in x:
            complete_x.append((counter, item))
            complete_y.append((counter, y_test[counter]))
            counter = counter + 1
        all_features = enc.get_feature_names()
        loss = mean_squared_error(y_test, y_pred)
        devs = (y_pred - y_test) ** 2
        errors = []
        counter = 0
        for pred in devs:
            errors.append((counter, pred))
            counter = counter + 1
        k = 5
        # Maximized size significance
        w = 0
        alpha = 4
        top_k = Topk(k)
        debug = True
        b_update = True
        first_level = slicer.make_first_level(all_features, list(complete_x), loss, len(complete_x), y_test, errors,
                                              self.loss_type, top_k, alpha, w)
        first_level_nodes = first_level[0]
        slice_member = first_level_nodes[(7, 'x2_2')]
        self.assertGreater(self.slice_member.score, slice_member.score)

    def test_size_significance(self):
        y_test = self.test_dataset.iloc[:, self.attributes_amount - 1:self.attributes_amount].values
        x_test = self.test_dataset.iloc[:, 0:self.attributes_amount - 1].values
        y_pred = self.model.predict(x_test)
        print("Mean squared error: %.2f"
                  % mean_squared_error(y_test, y_pred))
        print('r_2 statistic: %.2f' % r2_score(y_test, y_pred))
        # Now that we have trained the model, we can print the coefficient of x that it has predicted
        print('Coefficients: \n', self.model.coef_)
        enc = OneHotEncoder(handle_unknown='ignore')
        x = enc.fit_transform(x_test).toarray()
        complete_x = []
        complete_y = []
        counter = 0
        for item in x:
            complete_x.append((counter, item))
            complete_y.append((counter, y_test[counter]))
            counter = counter + 1
        all_features = enc.get_feature_names()
        loss = mean_squared_error(y_test, y_pred)
        devs = (y_pred - y_test) ** 2
        errors = []
        counter = 0
        for pred in devs:
            errors.append((counter, pred))
            counter = counter + 1
        k = 5
        # Maximized size significance
        w = 1
        alpha = 4
        top_k = Topk(k)
        debug = True
        b_update = True
        first_level = slicer.make_first_level(all_features, list(complete_x), loss, len(complete_x), y_test, errors,
                                              self.loss_type, top_k, alpha, w)
        first_level_nodes = first_level[0]
        slice_member = first_level_nodes[(7, 'x2_2')]
        self.assertGreater(slice_member.score, self.slice_member.score)
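
As with any unittest.TestCase module, the SliceTests class above can be run directly by adding the standard entry point at the bottom of the file:

if __name__ == '__main__':
    unittest.main()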