def parallel_process(all_features, predictions, loss, sc, debug, alpha, k, w, loss_type):
    top_k = Topk(k)
    cur_lvl = 0
    cur_lvl_nodes = list(all_features)
    pred_pandas = predictions.toPandas()
    x_size = len(pred_pandas)
    b_topk = SparkContext.broadcast(sc, top_k)
    b_cur_lvl = SparkContext.broadcast(sc, cur_lvl)
    buckets = {}
    for node in cur_lvl_nodes:
        bucket = Bucket(node, cur_lvl, w, x_size, loss)
        buckets[bucket.name] = bucket
    b_buckets = SparkContext.broadcast(sc, buckets)
    rows = predictions.rdd.map(lambda row: (row[1].indices, row[2])) \
        .map(lambda item: list(item))
    mapped = rows.map(lambda row: rows_mapper(row, b_buckets.value, loss_type))
    flattened = mapped.flatMap(lambda line: line.items())
    reduced = flattened.combineByKey(combiner, merge_values, merge_combiners)
    cur_lvl_nodes = reduced.values() \
        .map(lambda bucket: spark_utils.calc_bucket_metrics(bucket, loss, w, x_size, b_cur_lvl.value))
    if debug:
        cur_lvl_nodes.map(lambda bucket: bucket.print_debug(b_topk.value)).collect()
    cur_lvl = 1
    prev_level = cur_lvl_nodes.collect()
    top_k = top_k.buckets_top_k(prev_level, x_size, alpha, 1)
    while len(prev_level) > 0:
        b_cur_lvl_nodes = SparkContext.broadcast(sc, prev_level)
        b_topk = SparkContext.broadcast(sc, top_k)
        cur_min = top_k.min_score
        b_cur_lvl = SparkContext.broadcast(sc, cur_lvl)
        top_k.print_topk()
        buckets = join_enum(prev_level, cur_lvl, x_size, alpha, top_k, w, loss)
        b_buckets = SparkContext.broadcast(sc, buckets)
        to_slice = dict(filter(lambda bucket: bucket[1].check_bounds(x_size, alpha, top_k), buckets.items()))
        b_to_slice = SparkContext.broadcast(sc, to_slice)
        mapped = rows.map(lambda row: rows_mapper(row, b_to_slice.value, loss_type))
        flattened = mapped.flatMap(lambda line: line.items())
        to_process = flattened.combineByKey(combiner, merge_values, merge_combiners)
        if debug:
            to_process.values().map(lambda bucket: bucket.print_debug(b_topk.value)).collect()
        prev_level = to_process \
            .map(lambda bucket: spark_utils.calc_bucket_metrics(bucket[1], loss, w, x_size, b_cur_lvl.value)) \
            .collect()
        cur_lvl += 1
        top_k = top_k.buckets_top_k(prev_level, x_size, alpha, cur_min)
        print("Level " + str(cur_lvl) + " had " +
              str(len(b_cur_lvl_nodes.value) * (len(b_cur_lvl_nodes.value) - 1)) +
              " candidates but after pruning only " + str(len(prev_level)) + " go to the next level")
        top_k.print_topk()
    print()
    print("Program stopped at level " + str(cur_lvl - 1))
    print("Selected slices are: ")
    top_k.print_topk()
    return None
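# NOTE (illustrative sketch): the combineByKey calls above rely on a combiner /
# merge_values / merge_combiners trio defined elsewhere in this module. The shapes
# below are a minimal, self-contained guess at their contract; the repo's real
# Bucket carries more state (attributes, bounds, scores) than the SimpleBucket
# stand-in used here.
class SimpleBucket:
    def __init__(self, size=0, error=0.0):
        self.size = size      # number of rows that fall into this slice
        self.error = error    # loss accumulated over those rows

def combiner(bucket):
    # the first partial bucket seen for a key becomes the accumulator
    return bucket

def merge_values(acc, bucket):
    # fold another partial bucket for the same key (same partition) into the accumulator
    acc.size += bucket.size
    acc.error += bucket.error
    return acc

def merge_combiners(acc_a, acc_b):
    # merge accumulators produced for the same key on different partitions
    acc_a.size += acc_b.size
    acc_a.error += acc_b.error
    return acc_a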
def process(all_features, complete_x, loss, x_size, y_test, errors, debug, alpha, k, w, loss_type, b_update):
    levels = []
    top_k = Topk(k)
    first_level = make_first_level(all_features, complete_x, loss, x_size, y_test, errors, loss_type, top_k, alpha, w)
    all_nodes = first_level[1]
    levels.append(first_level[0])
    # cur_lvl - index of the current level; correlates with the number of slice-forming features
    cur_lvl = 1  # level already filled by the first init iteration
    # currently for debug
    print("Level 1 had " + str(len(all_features)) + " candidates")
    print()
    print("Current topk are: ")
    top_k.print_topk()
    # combine each candidate of the previous level with every other until no more
    # pairs can be formed (a single remaining node cannot make a pair)
    while len(levels[cur_lvl - 1]) > 1:
        cur_lvl_nodes = []
        prev_lvl = levels[cur_lvl - 1]
        for node_i in range(len(prev_lvl)):
            partial = join_enum(node_i, prev_lvl, complete_x, loss, x_size, y_test, errors, debug, alpha, w,
                                loss_type, b_update, cur_lvl, all_nodes, top_k, cur_lvl_nodes)
            cur_lvl_nodes = partial[0]
            all_nodes = partial[1]
        cur_lvl = cur_lvl + 1
        levels.append(cur_lvl_nodes)
        top_k.print_topk()
        print("Level " + str(cur_lvl) + " had " + str(len(prev_lvl) * (len(prev_lvl) - 1)) +
              " candidates but after pruning only " + str(len(cur_lvl_nodes)) + " go to the next level")
    print("Program stopped at level " + str(cur_lvl))
    print()
    print("Selected slices are: ")
    top_k.print_topk()
    return top_k
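# NOTE (assumed contract): join_enum is defined elsewhere in this module. As used
# here and in the unit tests further down, it takes one slice of the previous level
# (passed by index or key, depending on the variant), pairs it with every other,
# prunes candidates by their bounds, and returns the updated pair
# (cur_lvl_nodes, all_nodes), where cur_lvl_nodes is the list or name-keyed dict of
# surviving nodes for the level being built.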
def process(all_features, complete_x, loss, x_size, y_test, errors, debug, alpha, k, w, loss_type, b_update):
    levels = []
    top_k = Topk(k)
    first_level = make_first_level(all_features, complete_x, loss, x_size, y_test, errors, loss_type, top_k, alpha, w)
    candidates = []
    pruned = []
    indexes = [1]
    candidates.append(len(first_level[0]))
    pruned.append(len(first_level[0]))
    all_nodes = first_level[1]
    levels.append(first_level[0])
    # cur_lvl - index of the current level; correlates with the number of slice-forming features
    cur_lvl = 1  # level already filled by the first init iteration
    print()
    print("Current topk are: ")
    top_k.print_topk()
    # combine each candidate of the previous level with every other until no more
    # pairs can be formed (a single remaining node cannot make a pair)
    while len(levels[cur_lvl - 1]) > 1:
        cur_lvl_nodes = {}
        prev_lvl = levels[cur_lvl - 1]
        level_candidates = len(prev_lvl) * (len(prev_lvl) - 1)
        candidates.append(level_candidates)
        for node_i in prev_lvl:
            partial = join_enum(node_i, prev_lvl, complete_x, loss, x_size, y_test, errors, debug, alpha, w,
                                loss_type, b_update, cur_lvl, all_nodes, top_k, cur_lvl_nodes)
            cur_lvl_nodes = partial[0]
            all_nodes = partial[1]
        cur_lvl = cur_lvl + 1
        indexes.append(cur_lvl)
        levels.append(cur_lvl_nodes)
        print("Level " + str(cur_lvl) + " had " + str(level_candidates) +
              " candidates but after pruning only " + str(len(cur_lvl_nodes)) + " go to the next level")
        pruned.append(len(cur_lvl_nodes))
        print()
        print("Current topk are: ")
        top_k.print_topk()
    plt.plot(indexes, candidates, 'r--', indexes, pruned, 'g--')
    plt.xlabel('Level')
    plt.ylabel('Number of slices')
    plt.show()
    print("Program stopped at level " + str(cur_lvl))
    print()
    print("Selected slices are: ")
    top_k.print_topk()
    print("candidates:")
    print(candidates)
    print(">>>>>>>>>")
    print("pruned:")
    print(pruned)
    return top_k
def process(all_features, predictions, loss, sc, debug, alpha, k, w, loss_type, enumerator):
    top_k = Topk(k)
    cur_lvl = 0
    levels = []
    all_features = list(all_features)
    first_level = {}
    first_tasks = sc.parallelize(all_features)
    b_topk = SparkContext.broadcast(sc, top_k)
    init_slices = first_tasks.mapPartitions(
        lambda features: spark_utils.make_first_level(features, predictions, loss, b_topk.value, w, loss_type)) \
        .map(lambda node: (node.key, node)) \
        .collect()
    first_level.update(init_slices)
    update_top_k(first_level, top_k, alpha, predictions)
    prev_level = SparkContext.broadcast(sc, first_level)
    levels.append(prev_level)
    cur_lvl = 1
    top_k.print_topk()
    while len(levels[cur_lvl - 1].value) > 0:
        cur_lvl_res = {}
        b_topk = SparkContext.broadcast(sc, top_k)
        for left in range(int(cur_lvl / 2) + 1):
            right = cur_lvl - left - 1
            partitions = sc.parallelize(levels[left].value.values())
            mapped = partitions.mapPartitions(
                lambda nodes: spark_utils.nodes_enum(nodes, levels[right].value.values(), predictions, loss,
                                                     b_topk.value, alpha, k, w, loss_type, cur_lvl, debug,
                                                     enumerator))
            flattened = mapped.flatMap(lambda node: node)
            partial = flattened.map(lambda node: (node.key, node)).collect()
            cur_lvl_res.update(partial)
        prev_level = SparkContext.broadcast(sc, cur_lvl_res)
        levels.append(prev_level)
        update_top_k(cur_lvl_res, top_k, alpha, predictions)
        cur_lvl = cur_lvl + 1
        top_k.print_topk()
        print("Level " + str(cur_lvl) + " had " +
              str(len(levels[cur_lvl - 1].value) * (len(levels[cur_lvl - 1].value) - 1)) +
              " candidates but after pruning only " + str(len(prev_level.value)) + " go to the next level")
    print("Program stopped at level " + str(cur_lvl))
    print()
    print("Selected slices are: ")
    top_k.print_topk()
    return top_k
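# NOTE (illustrative sketch): update_top_k is imported from elsewhere in the repo.
# Its assumed contract, reconstructed from how the sequential slicer maintains the
# Topk heap: offer every node of a finished level to top_k if it passes the score
# constraint and is not already present. len(predictions) stands in for the dataset
# size here; the real helper may derive it differently.
def update_top_k(new_nodes, top_k, alpha, predictions):
    x_size = len(predictions)
    for node in new_nodes.values():
        if node.check_constraint(top_k, x_size, alpha) and node.key not in top_k.keys:
            top_k.add_new_top_slice(node)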
def process(all_features, predictions, f_l2, sc, debug, alpha, k, w, loss_type, enumerator):
    top_k = Topk(k)
    cur_lvl = 0
    levels = []
    all_features = list(all_features)
    first_tasks = sc.parallelize(all_features)
    partitions = first_tasks.glom()
    SparkContext.broadcast(sc, top_k)
    first_level = partitions.map(
        lambda features: sparked_utils.make_first_level(features, predictions, f_l2, top_k, alpha, k, w, loss_type))
    first_lvl_res = first_level.reduce(lambda a, b: a + b)
    update_top_k(first_lvl_res, top_k, alpha, predictions)
    SparkContext.broadcast(sc, top_k)
    # the first level is appended twice so that the second level is enumerated the
    # same way as all later ones
    levels.append(first_lvl_res)
    levels.append(first_lvl_res)
    SparkContext.broadcast(sc, levels)
    cur_lvl = 2
    top_k.print_topk()
    SparkContext.broadcast(sc, top_k)
    while len(levels[cur_lvl - 1]) > 0:
        cur_lvl_res = {}
        nodes_list = []
        for left in range(int(cur_lvl / 2)):
            right = cur_lvl - 1 - left
            partitions = sc.parallelize(levels[left])
            part = partitions.glom()
            print(levels[right])
            mapped = part.map(lambda nodes: sparked_utils.nodes_enum(nodes, levels[right], predictions, f_l2,
                                                                     top_k, alpha, k, w, loss_type, cur_lvl,
                                                                     debug, enumerator))
            partial_nodes = mapped.reduce(lambda a, b: a + b)
            partial_res = flatten(partial_nodes)
            result = update_nodes(partial_res, nodes_list, cur_lvl_res, w)
            cur_lvl_res = result[0]
            nodes_list = result[1]
        levels.append(nodes_list)
        SparkContext.broadcast(sc, levels)
        SparkContext.broadcast(sc, top_k)
        update_top_k(list(nodes_list), top_k, alpha, predictions)
        SparkContext.broadcast(sc, top_k)
        cur_lvl = cur_lvl + 1
        top_k.print_topk()
        print("Level " + str(cur_lvl) + " had " + str(len(levels) * (len(levels) - 1)) +
              " candidates but after pruning only " + str(len(cur_lvl_res)) + " go to the next level")
    print("Program stopped at level " + str(cur_lvl))
    print()
    print("Selected slices are: ")
    top_k.print_topk()
    return top_k
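# NOTE (illustrative sketch): flatten is assumed to collapse the per-partition
# lists returned by nodes_enum (a list of lists of nodes) into a single flat list.
def flatten(nested):
    return [node for partition in nested for node in partition]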
def parallel_process(all_features, predictions, f_l2, sc, debug, alpha, k, w, loss_type, enumerator):
    top_k = Topk(k)
    cur_lvl = 0
    levels = []
    all_features = list(all_features)
    first_tasks = sc.parallelize(all_features)
    partitions = first_tasks.glom()
    SparkContext.broadcast(sc, top_k)
    first_level = partitions.map(
        lambda features: sparked_utils.make_first_level(
            features, predictions, f_l2, top_k, alpha, k, w, loss_type))
    first_lvl_res = first_level.reduce(lambda a, b: a + b)
    update_top_k(first_lvl_res, top_k, alpha, predictions)
    SparkContext.broadcast(sc, top_k)
    levels = first_lvl_res
    SparkContext.broadcast(sc, levels)
    cur_lvl = cur_lvl + 1
    top_k.print_topk()
    SparkContext.broadcast(sc, top_k)
    # check the current level: if it is not empty, keep processing; otherwise no
    # elements were added and enumeration stops
    while len(levels) > 1:
        partitions = sc.parallelize(levels)
        part = partitions.glom()
        mapped = part.map(lambda nodes: sparked_utils.nodes_enum(nodes, levels, predictions, f_l2, top_k,
                                                                 alpha, k, w, loss_type, cur_lvl, debug,
                                                                 enumerator))
        cur_lvl_nodes = mapped.reduce(lambda a, b: a + b)
        lvl_nodes_res = flatten(cur_lvl_nodes)
        update_top_k(list(lvl_nodes_res), top_k, alpha, predictions)
        levels = lvl_nodes_res
        SparkContext.broadcast(sc, levels)
        SparkContext.broadcast(sc, top_k)
        cur_lvl = cur_lvl + 1
        top_k.print_topk()
        print("Level " + str(cur_lvl) + " had " + str(len(levels) * (len(levels) - 1)) +
              " candidates but after pruning only " + str(len(lvl_nodes_res)) + " go to the next level")
    print("Program stopped at level " + str(cur_lvl))
    print()
    print("Selected slices are: ")
    top_k.print_topk()
    return top_k
def parallel_process(all_features, predictions, loss, sc, debug, alpha, k, w, loss_type, enumerator):
    top_k = Topk(k)
    cur_lvl = 0
    levels = []
    first_level = {}
    all_features = list(all_features)
    first_tasks = sc.parallelize(all_features)
    b_topk = SparkContext.broadcast(sc, top_k)
    init_slices = first_tasks.mapPartitions(
        lambda features: spark_utils.make_first_level(features, predictions, loss, b_topk.value, w, loss_type)) \
        .map(lambda node: (node.key, node)).collect()
    first_level.update(init_slices)
    update_top_k(first_level, b_topk.value, alpha, predictions)
    prev_level = SparkContext.broadcast(sc, first_level)
    levels.append(prev_level)
    cur_lvl = cur_lvl + 1
    b_topk.value.print_topk()
    # check the current level: if it is not empty, keep processing; otherwise no
    # elements were added and enumeration stops
    while len(levels[cur_lvl - 1].value) > 0:
        nodes_list = {}
        partitions = sc.parallelize(levels[cur_lvl - 1].value.values())
        mapped = partitions.mapPartitions(
            lambda nodes: spark_utils.nodes_enum(nodes, levels[cur_lvl - 1].value.values(), predictions, loss,
                                                 b_topk.value, alpha, k, w, loss_type, cur_lvl, debug,
                                                 enumerator))
        flattened = mapped.flatMap(lambda node: node)
        nodes_list.update(flattened.map(lambda node: (node.key, node)).distinct().collect())
        prev_level = SparkContext.broadcast(sc, nodes_list)
        levels.append(prev_level)
        update_top_k(nodes_list, b_topk.value, alpha, predictions)
        cur_lvl = cur_lvl + 1
        b_topk.value.print_topk()
        print("Level " + str(cur_lvl) + " had " +
              str(len(levels[cur_lvl - 1].value) * (len(levels[cur_lvl - 1].value) - 1)) +
              " candidates but after pruning only " + str(len(nodes_list)) + " go to the next level")
    print("Program stopped at level " + str(cur_lvl))
    print()
    print("Selected slices are: ")
    b_topk.value.print_topk()
    return b_topk.value
def parallel_process(all_features, predictions, loss, sc, debug, alpha, k, w, loss_type):
    top_k = Topk(k)
    cur_lvl = 0
    levels = []
    cur_lvl_nodes = list(all_features)
    pred_pandas = predictions.toPandas()
    x_size = len(pred_pandas)
    b_topk = SparkContext.broadcast(sc, top_k)
    b_cur_lvl = SparkContext.broadcast(sc, cur_lvl)
    buckets = {}
    for node in cur_lvl_nodes:
        bucket = Bucket(node, cur_lvl, w, x_size, loss)
        buckets[bucket.name] = bucket
    b_buckets = SparkContext.broadcast(sc, buckets)
    # rows = predictions.rdd.map(lambda row: (row[0], row[1].indices, row[2])) \
    #     .map(lambda item: (item[0], item[1].tolist(), item[2]))
    rows = predictions.rdd.map(lambda row: row[1].indices) \
        .map(lambda item: list(item))
    mapped = rows.map(lambda row: rows_mapper(row, b_buckets.value, loss_type))
    flattened = mapped.flatMap(lambda line: line.items())
    reduced = flattened.combineByKey(combiner, join_data_parallel.merge_values, join_data_parallel.merge_combiners)
    cur_lvl_nodes = reduced.values() \
        .map(lambda bucket: spark_utils.calc_bucket_metrics(bucket, loss, w, x_size, b_cur_lvl.value))
    if debug:
        cur_lvl_nodes.map(lambda bucket: bucket.print_debug(b_topk.value)).collect()
    cur_lvl = 1
    prev_level = cur_lvl_nodes.collect()
    b_cur_lvl_nodes = SparkContext.broadcast(sc, prev_level)
    levels.append(b_cur_lvl_nodes)
    top_k = top_k.buckets_top_k(prev_level, x_size, alpha, 1)
    while len(prev_level) > 0:
        b_topk = SparkContext.broadcast(sc, top_k)
        cur_min = top_k.min_score
        b_cur_lvl = SparkContext.broadcast(sc, cur_lvl)
        top_k.print_topk()
        buckets = []
        for left in range(int(cur_lvl / 2) + 1):
            right = cur_lvl - left - 1
            nodes = union_enum(levels[left].value, levels[right].value, x_size, alpha, top_k, w, loss, cur_lvl)
            buckets.append(nodes)
        b_buckets = sc.parallelize(buckets)
        all_buckets = b_buckets.flatMap(lambda line: line.items())
        combined = dict(all_buckets.combineByKey(combiner, merge_values, merge_combiners).collect())
        b_buckets = SparkContext.broadcast(sc, combined)
        to_slice = dict(filter(lambda bucket: bucket[1].check_bounds(x_size, alpha, top_k), combined.items()))
        b_to_slice = SparkContext.broadcast(sc, to_slice)
        mapped = rows.map(lambda row: rows_mapper(row, b_to_slice.value, loss_type))
        flattened = mapped.flatMap(lambda line: line.items())
        partial = flattened.combineByKey(combiner, join_data_parallel.merge_values,
                                         join_data_parallel.merge_combiners)
        prev_level = partial \
            .map(lambda bucket: spark_utils.calc_bucket_metrics(bucket[1], loss, w, x_size,
                                                                b_cur_lvl.value)).collect()
        top_k = top_k.buckets_top_k(prev_level, x_size, alpha, cur_min)
        b_topk = SparkContext.broadcast(sc, top_k)
        if debug:
            partial.values().map(lambda bucket: bucket.print_debug(b_topk.value)).collect()
        print("Level " + str(cur_lvl) + " had " +
              str(len(levels[cur_lvl - 1].value) * (len(levels[cur_lvl - 1].value) - 1)) +
              " candidates but after pruning only " + str(len(prev_level)) + " go to the next level")
        b_cur_lvl_nodes = SparkContext.broadcast(sc, prev_level)
        levels.append(b_cur_lvl_nodes)
        cur_lvl += 1
    print("Program stopped at level " + str(cur_lvl))
    print()
    print("Selected slices are: ")
    top_k.print_topk()
    return None
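# NOTE (illustrative sketch): rows_mapper is the per-row workhorse both data-parallel
# variants assume. Given one row -- taken here as the (active one-hot indices, error)
# pair the join variant produces; the union variant passes indices alone -- it returns
# the partial buckets the row contributes to, keyed by bucket name. SimpleBucket is
# the stand-in defined in the sketch after the join variant above; bucket.attributes
# is an assumed field, not confirmed repo API.
def rows_mapper(row, buckets, loss_type):
    indices, error = row
    partial = {}
    for name, bucket in buckets.items():
        # a row belongs to a slice when every attribute of the slice is active in it
        if all(attr in indices for attr in bucket.attributes):
            acc = partial.setdefault(name, SimpleBucket())
            acc.size += 1
            acc.error += error  # squared error when loss_type is 0 (l2 loss)
    return partial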
def process(all_features, complete_x, loss, x_size, y_test, errors, debug, alpha, k, w, loss_type, b_update):
    top_k = Topk(k)
    # first-level slices are enumerated in the "classic" way (materializing the data,
    # not analyzing bounds)
    levels = []
    first_level = make_first_level(all_features, complete_x, loss, x_size, y_test, errors, loss_type, w, alpha, top_k)
    # the second level is enumerated from levels[0] joined with itself, so a single
    # append suffices here
    levels.append((first_level[0], len(all_features)))
    all_nodes = first_level[1]
    # cur_lvl - index of the current level; correlates with the number of slice-forming features
    cur_lvl = 1  # the level that is planned to be filled next
    cur_lvl_nodes = first_level  # seed the loop condition
    # currently for debug
    print("Level 1 had " + str(len(all_features)) + " candidates")
    print()
    print("Current topk are: ")
    top_k.print_topk()
    # DPSize-style combination of nodes from previous levels, updating bounds for
    # nodes that already exist
    while len(cur_lvl_nodes) > 0:
        cur_lvl_nodes = []
        count = 0
        for left in range(int(cur_lvl / 2) + 1):
            right = cur_lvl - 1 - left
            for node_i in range(len(levels[left][0])):
                for node_j in range(len(levels[right][0])):
                    flag = check_attributes(levels[left][0][node_i], levels[right][0][node_j])
                    if not flag:
                        new_node = Node(complete_x, loss, x_size, y_test, errors)
                        parents_set = set(new_node.parents)
                        parents_set.add(levels[left][0][node_i])
                        parents_set.add(levels[right][0][node_j])
                        new_node.parents = list(parents_set)
                        parent1_attr = levels[left][0][node_i].attributes
                        parent2_attr = levels[right][0][node_j].attributes
                        new_node_attr = union(parent1_attr, parent2_attr)
                        new_node.attributes = new_node_attr
                        new_node.name = new_node.make_name()
                        new_id = len(all_nodes)
                        new_node.key = new_node.make_key(new_id)
                        if new_node.key[1] in all_nodes:
                            existing_item = all_nodes[new_node.key[1]]
                            parents_set = set(existing_item.parents)
                            existing_item.parents = parents_set
                            if b_update:
                                s_upper = new_node.calc_s_upper(cur_lvl)
                                s_lower = new_node.calc_s_lower(cur_lvl)
                                e_upper = new_node.calc_e_upper()
                                e_max_upper = new_node.calc_e_max_upper(cur_lvl)
                                new_node.update_bounds(s_upper, s_lower, e_upper, e_max_upper, w)
                        else:
                            new_node.calc_bounds(cur_lvl, w)
                            all_nodes[new_node.key[1]] = new_node
                            # extract concrete data only if the score upper bound is
                            # big enough and the subset size is big enough
                            to_slice = new_node.check_bounds(top_k, x_size, alpha)
                            if to_slice:
                                new_node.process_slice(loss_type)
                                new_node.score = opt_fun(new_node.loss, new_node.size, loss, x_size, w)
                                # decide whether to add the node to the current level
                                # (to form new combinations on the next one) based on
                                # its score value
                                if new_node.check_constraint(top_k, x_size, alpha) \
                                        and new_node.key not in top_k.keys:
                                    top_k.add_new_top_slice(new_node)
                                cur_lvl_nodes.append(new_node)
                            if debug:
                                new_node.print_debug(top_k, cur_lvl)
            count = count + levels[left][1] * levels[right][1]
        print("Level " + str(cur_lvl) + " had " + str(count) +
              " candidates but after pruning only " + str(len(cur_lvl_nodes)) + " go to the next level")
        cur_lvl = cur_lvl + 1
        levels.append((cur_lvl_nodes, count))
        top_k.print_topk()
    print("Program stopped at level " + str(cur_lvl))
    print()
    print("Selected slices are: ")
    top_k.print_topk()
    return top_k
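# Worked example of the DPSize pairing rule above (illustration only, not part of
# the original module). Level index i holds slices formed from i + 1 features, and
# parents at cur_lvl come from levels[left] and levels[right] with
# right = cur_lvl - 1 - left, so the parents' feature counts always sum to cur_lvl + 1.
def _dpsize_pairs(cur_lvl):
    return [(left, cur_lvl - 1 - left) for left in range(int(cur_lvl / 2) + 1)]

# _dpsize_pairs(1) -> [(0, 0)]
# _dpsize_pairs(2) -> [(0, 1), (1, 0)]
# _dpsize_pairs(3) -> [(0, 2), (1, 1)]
# _dpsize_pairs(4) -> [(0, 3), (1, 2), (2, 1)]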
class SliceTests(unittest.TestCase):
    loss_type = 0
    # x, y = m.generate_dataset(10, 100)
    train_dataset = pd.read_csv("toy_train.csv")
    attributes_amount = len(train_dataset.values[0])
    model = linear_model.LinearRegression()
    y_train = train_dataset.iloc[:, attributes_amount - 1:attributes_amount].values
    x_train = train_dataset.iloc[:, 0:attributes_amount - 1].values
    model.fit(x_train, y_train)
    test_dataset = pd.read_csv("toy.csv")
    y_test = test_dataset.iloc[:, attributes_amount - 1:attributes_amount].values
    x_test = test_dataset.iloc[:, 0:attributes_amount - 1].values
    y_pred = model.predict(x_test)
    print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
    print('r_2 statistic: %.2f' % r2_score(y_test, y_pred))
    # now that the model is trained, print the coefficients it has estimated
    print('Coefficients: \n', model.coef_)
    enc = OneHotEncoder(handle_unknown='ignore')
    x = enc.fit_transform(x_test).toarray()
    complete_x = []
    complete_y = []
    counter = 0
    for item in x:
        complete_x.append((counter, item))
        complete_y.append((counter, y_test[counter]))
        counter = counter + 1
    all_features = enc.get_feature_names()
    loss = mean_squared_error(y_test, y_pred)
    devs = (y_pred - y_test) ** 2
    errors = []
    counter = 0
    for pred in devs:
        errors.append((counter, pred))
        counter = counter + 1
    k = 5
    w = 0.5
    alpha = 4
    top_k = Topk(k)
    debug = True
    b_update = True
    first_level = slicer.make_first_level(all_features, list(complete_x), loss, len(complete_x), y_test, errors,
                                          loss_type, top_k, alpha, w)
    first_level_nodes = first_level[0]
    slice_member = first_level_nodes[(7, 'x2_2')]

    def test_attr_spark(self):
        conf = SparkConf().setAppName("toy_test").setMaster('local[2]')
        num_partitions = 2
        enumerator = "join"
        model_type = "regression"
        label = 'target'
        sparkContext = SparkContext(conf=conf)
        sqlContext = SQLContext(sparkContext)
        train_df = sqlContext.read.csv("toy_train.csv", header='true', inferSchema='true')
        test_df = sqlContext.read.csv("toy.csv", header='true', inferSchema='true')
        # initializing the stages of the main transformation pipeline
        stages = []
        # list of categorical features for further one-hot encoding
        cat_features = ['a', 'b', 'c']
        for feature in cat_features:
            string_indexer = StringIndexer(inputCol=feature, outputCol=feature + "_index").setHandleInvalid("skip")
            encoder = OneHotEncoderEstimator(inputCols=[string_indexer.getOutputCol()],
                                             outputCols=[feature + "_vec"])
            encoder.setDropLast(False)
            stages += [string_indexer, encoder]
        assembler_inputs = [feature + "_vec" for feature in cat_features]
        assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="assembled_inputs")
        stages += [assembler]
        assembler_final = VectorAssembler(inputCols=["assembled_inputs"], outputCol="features")
        stages += [assembler_final]
        pipeline = Pipeline(stages=stages)
        train_pipeline_model = pipeline.fit(train_df)
        test_pipeline_model = pipeline.fit(test_df)
        train_df_transformed = train_pipeline_model.transform(train_df)
        test_df_transformed = test_pipeline_model.transform(test_df)
        train_df_transformed = train_df_transformed.withColumn('model_type', sf.lit(0))
        test_df_transformed = test_df_transformed.withColumn('model_type', sf.lit(0))
        decode_dict = {}
        counter = 0
        cat = 0
        for feature in cat_features:
            colIdx = test_df_transformed.select(feature, feature + "_index").distinct().rdd.collectAsMap()
            colIdx = {k: v for k, v in sorted(colIdx.items(), key=lambda item: item[1])}
            for item in colIdx:
                decode_dict[counter] = (cat, item, colIdx[item], counter)
                counter = counter + 1
            cat = cat + 1
        train_df_transform_fin = train_df_transformed.select('features', label, 'model_type')
        test_df_transform_fin = test_df_transformed.select('features', label, 'model_type')
        lr = LinearRegression(featuresCol='features', labelCol=label, maxIter=10, regParam=0.0, elasticNetParam=0.8)
        lr_model = lr.fit(train_df_transform_fin)
        lr_eval = lr_model.evaluate(test_df_transform_fin)
        f_l2 = lr_eval.meanSquaredError
        pred = lr_eval.predictions
        pred_df_fin = pred.withColumn('error',
                                      spark_utils.calc_loss(pred[label], pred['prediction'], pred['model_type']))
        predictions = pred_df_fin.select('features', 'error').repartition(num_partitions)
        converter = IndexToString(inputCol='features', outputCol='cats')
        all_features = list(decode_dict)
        predictions = predictions.collect()
        spark_join = spark_slicer.parallel_process(all_features, predictions, f_l2, sparkContext,
                                                   debug=self.debug, alpha=self.alpha, k=self.k, w=self.w,
                                                   loss_type=self.loss_type, enumerator="join")
        spark_union = spark_union_slicer.process(all_features, predictions, f_l2, sparkContext,
                                                 debug=self.debug, alpha=self.alpha, k=self.k, w=self.w,
                                                 loss_type=self.loss_type, enumerator="union")
        self.assertEqual(3, len(spark_join.slices))
        print("check1")
        self.assertEqual(spark_join.min_score, spark_union.min_score)
        print("check2")
        self.assertEqual(spark_join.keys, spark_union.keys)
        print("check3")
        self.assertEqual(len(spark_join.slices), len(spark_union.slices))
        print("check4")
        idx = -1
        for sliced in spark_join.slices:
            idx += 1
            self.assertEqual(sliced.score, spark_union.slices[idx].score)
        print("check5")

    def test_features_number(self):
        self.assertEqual(len(self.all_features), 9)
        print("check 1")

    def test_base_first_level(self):
        self.assertEqual(9, len(self.first_level_nodes))
        print("check 2")

    def test_parents_first(self):
        self.assertIn(('x2_2', 7), self.slice_member.parents)
        print("check 3")

    def test_name(self):
        self.assertEqual('x2_2', self.slice_member.make_name())
        print("check 4")

    def test_size(self):
        self.assertEqual(36, self.slice_member.size)
        print("check 5")

    def test_e_upper(self):
        self.assertEqual(81, self.slice_member.e_upper)
        print("check 6")

    def test_loss(self):
        self.assertEqual(22, int(self.slice_member.loss))
        print("check 7")

    def test_opt_fun(self):
        self.slice_member.score = slicer.opt_fun(self.slice_member.loss, self.slice_member.size, self.loss,
                                                 len(self.x_test), self.w)
        print("check 8")

    def test_score(self):
        self.assertEqual(1.2673015873015872, self.slice_member.score)
        print("check 9")

    def test_base_join_enum(self):
        cur_lvl_nodes = {}
        all_nodes = {}
        b_update = True
        cur_lvl = 1
        slice_index = (2, 'x0_3')
        combined = slicer.join_enum(slice_index, self.first_level_nodes, self.complete_x, self.loss,
                                    len(self.complete_x), self.y_test, self.errors, self.debug, self.alpha,
                                    self.w, self.loss_type, b_update, cur_lvl, all_nodes, self.top_k,
                                    cur_lvl_nodes)
        self.assertEqual(6, len(combined[0]))
        print("check1")

    def test_parents_second(self):
        cur_lvl_nodes = {}
        all_nodes = {}
        b_update = True
        cur_lvl = 1
        slice_index = (2, 'x0_3')
        combined = slicer.join_enum(slice_index, self.first_level_nodes, self.complete_x, self.loss,
                                    len(self.complete_x), self.y_test, self.errors, self.debug, self.alpha,
                                    self.w, self.loss_type, b_update, cur_lvl, all_nodes, self.top_k,
                                    cur_lvl_nodes)
        parent1 = combined[0]['x0_3 && x1_3']
        parent2 = combined[0]['x0_3 && x2_2']
        new_node = Node(self.complete_x, self.loss, len(self.complete_x), self.y_test, self.errors)
        new_node.parents = [parent1, parent2]
        parent1_attr = parent1.attributes
        parent2_attr = parent2.attributes
        new_node_attr = slicer.union(parent1_attr, parent2_attr)
        self.assertEqual(new_node_attr, [('x0_3', 2), ('x1_3', 5), ('x2_2', 7)])
        print("check2")

    def test_nonsense(self):
        cur_lvl_nodes = {}
        all_nodes = {}
        b_update = True
        cur_lvl = 1
        slice_index = (2, 'x0_3')
        combined = slicer.join_enum(slice_index, self.first_level_nodes, self.complete_x, self.loss,
                                    len(self.complete_x), self.y_test, self.errors, self.debug, self.alpha,
                                    self.w, self.loss_type, b_update, cur_lvl, all_nodes, self.top_k,
                                    cur_lvl_nodes)
        parent1 = combined[0]['x0_3 && x1_3']
        parent2 = combined[0]['x0_3 && x2_2']
        new_node = Node(self.complete_x, self.loss, len(self.complete_x), self.y_test, self.errors)
        new_node.parents = [parent1, parent2]
        parent1_attr = parent1.attributes
        parent2_attr = parent2.attributes
        new_node_attr = slicer.union(parent1_attr, parent2_attr)
        new_node.attributes = new_node_attr
        new_node.name = new_node.make_name()
        flagTrue = slicer.slice_name_nonsense(parent1, parent2, 2)
        self.assertEqual(True, flagTrue)
        print("check3")

    def test_non_nonsense(self):
        cur_lvl_nodes = {}
        all_nodes = {}
        b_update = True
        cur_lvl = 1
        slice_index = (2, 'x0_3')
        parent3 = Node(self.complete_x, self.loss, len(self.complete_x), self.y_test, self.errors)
        parent3.parents = [self.first_level_nodes[(4, 'x1_2')], self.first_level_nodes[(7, 'x2_2')]]
        parent3.attributes = [('x1_2', 4), ('x2_2', 7)]
        combined = slicer.join_enum(slice_index, self.first_level_nodes, self.complete_x, self.loss,
                                    len(self.complete_x), self.y_test, self.errors, self.debug, self.alpha,
                                    self.w, self.loss_type, b_update, cur_lvl, all_nodes, self.top_k,
                                    cur_lvl_nodes)
        parent2 = combined[0]['x0_3 && x2_3']
        parent3.key = (8, 'x1_2 && x2_2')
        flag_nonsense = slicer.slice_name_nonsense(parent2, parent3, 2)
        self.assertEqual(True, flag_nonsense)
        print("check4")

    def test_uppers(self):
        cur_lvl_nodes = {}
        all_nodes = {}
        b_update = True
        cur_lvl = 1
        slice_index = (2, 'x0_3')
        parent3 = Node(self.complete_x, self.loss, len(self.complete_x), self.y_test, self.errors)
        parent3.parents = [self.first_level_nodes[(4, 'x1_2')], self.first_level_nodes[(7, 'x2_2')]]
        parent3.attributes = [('x1_2', 4), ('x2_2', 7)]
        combined = slicer.join_enum(slice_index, self.first_level_nodes, self.complete_x, self.loss,
                                    len(self.complete_x), self.y_test, self.errors, self.debug, self.alpha,
                                    self.w, self.loss_type, b_update, cur_lvl, all_nodes, self.top_k,
                                    cur_lvl_nodes)
        parent1 = combined[0]['x0_3 && x1_3']
        parent2 = combined[0]['x0_3 && x2_3']
        new_node = Node(self.complete_x, self.loss, len(self.complete_x), self.y_test, self.errors)
        new_node.parents = [parent1, parent2]
        new_node.calc_bounds(2, self.w)
        self.assertEqual(25, new_node.s_upper)
        print("check5")
        self.assertEqual(398, int(new_node.c_upper))
        print("check6")

    def test_topk_slicing(self):
        join_top_k = slicer.process(self.all_features, self.complete_x, self.loss, len(self.complete_x),
                                    self.y_test, self.errors, self.debug, self.alpha, self.k, self.w,
                                    self.loss_type, self.b_update)
        union_top_k = union_slicer.process(self.all_features, self.complete_x, self.loss, len(self.complete_x),
                                           self.y_test, self.errors, self.debug, self.alpha, self.k, self.w,
                                           self.loss_type, self.b_update)
        self.assertEqual(join_top_k.min_score, union_top_k.min_score)
        print("check1")
        self.assertEqual(join_top_k.keys, union_top_k.keys)
        print("check2")
        self.assertEqual(len(join_top_k.slices), len(union_top_k.slices))
        print("check3")
        idx = -1
        for sliced in join_top_k.slices:
            idx += 1
            self.assertEqual(sliced.score, union_top_k.slices[idx].score)
        print("check4")
pd.read_csv("/home/lana/diploma/project/slicing/datasets/toy_extreme_change.csv") y_test = test_dataset.iloc[:, self.attributes_amount - 1:self.attributes_amount].values x_test = test_dataset.iloc[:, 0:self.attributes_amount - 1].values y_pred = self.model.predict(x_test) print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred)) print('r_2 statistic: %.2f' % r2_score(y_test, y_pred)) # Now that we have trained the model, we can print the coefficient of x that it has predicted print('Coefficients: \n', self.model.coef_) enc = OneHotEncoder(handle_unknown='ignore') x = enc.fit_transform(x_test).toarray() complete_x = [] complete_y = [] counter = 0 for item in x: complete_x.append((counter, item)) complete_y.append((counter, y_test[counter])) counter = counter + 1 all_features = enc.get_feature_names() loss = mean_squared_error(y_test, y_pred) devs = (y_pred - y_test) ** 2 errors = [] counter = 0 for pred in devs: errors.append((counter, pred)) counter = counter + 1 k = 5 w = 0.5 alpha = 4 top_k = Topk(k) debug = True b_update = True first_level = slicer.make_first_level(all_features, list(complete_x), loss, len(complete_x), y_test, errors, self.loss_type, top_k, alpha, w) first_level_nodes = first_level[0] slice_member = first_level_nodes[(7, 'x2_2')] self.assertGreater(slice_member.loss, self.slice_member.loss) print("check 1") self.assertGreater(slice_member.score, self.slice_member.score) print("check 2") def test_error_significance(self): y_test = self.test_dataset.iloc[:, self.attributes_amount - 1:self.attributes_amount].values x_test = self.test_dataset.iloc[:, 0:self.attributes_amount - 1].values y_pred = self.model.predict(x_test) print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred)) print('r_2 statistic: %.2f' % r2_score(y_test, y_pred)) # Now that we have trained the model, we can print the coefficient of x that it has predicted print('Coefficients: \n', self.model.coef_) enc = OneHotEncoder(handle_unknown='ignore') x = enc.fit_transform(x_test).toarray() complete_x = [] complete_y = [] counter = 0 for item in x: complete_x.append((counter, item)) complete_y.append((counter, y_test[counter])) counter = counter + 1 all_features = enc.get_feature_names() loss = mean_squared_error(y_test, y_pred) devs = (y_pred - y_test) ** 2 errors = [] counter = 0 for pred in devs: errors.append((counter, pred)) counter = counter + 1 k = 5 # Maximized size significance w = 0 alpha = 4 top_k = Topk(k) debug = True b_update = True first_level = slicer.make_first_level(all_features, list(complete_x), loss, len(complete_x), y_test, errors, self.loss_type, top_k, alpha, w) first_level_nodes = first_level[0] slice_member = first_level_nodes[(7, 'x2_2')] self.assertGreater(self.slice_member.score, slice_member.score) def test_size_significance(self): y_test = self.test_dataset.iloc[:, self.attributes_amount - 1:self.attributes_amount].values x_test = self.test_dataset.iloc[:, 0:self.attributes_amount - 1].values y_pred = self.model.predict(x_test) print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred)) print('r_2 statistic: %.2f' % r2_score(y_test, y_pred)) # Now that we have trained the model, we can print the coefficient of x that it has predicted print('Coefficients: \n', self.model.coef_) enc = OneHotEncoder(handle_unknown='ignore') x = enc.fit_transform(x_test).toarray() complete_x = [] complete_y = [] counter = 0 for item in x: complete_x.append((counter, item)) complete_y.append((counter, y_test[counter])) counter = counter + 1 all_features = 
        all_features = enc.get_feature_names()
        loss = mean_squared_error(y_test, y_pred)
        devs = (y_pred - y_test) ** 2
        errors = []
        counter = 0
        for pred in devs:
            errors.append((counter, pred))
            counter = counter + 1
        k = 5
        # Maximized error significance
        w = 1
        alpha = 4
        top_k = Topk(k)
        debug = True
        b_update = True
        first_level = slicer.make_first_level(all_features, list(complete_x), loss, len(complete_x), y_test,
                                              errors, self.loss_type, top_k, alpha, w)
        first_level_nodes = first_level[0]
        slice_member = first_level_nodes[(7, 'x2_2')]
        self.assertGreater(slice_member.score, self.slice_member.score)
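# Run the suite directly (dataset paths like "toy_train.csv" above are relative, so
# run from the directory that contains the toy datasets):
if __name__ == "__main__":
    unittest.main()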