def sample_examples(df, strategy, sample_n=10, random_state=42):
    """Print the repaired pipelines with the largest score improvement.

    Rows are restricted to ``strategy``, to entries that have a repaired
    graph, and to entries with a strictly positive ``score_diff``. The top
    ``sample_n`` rows by ``score_diff`` are then printed together with the
    non-match tree edits between the original and repaired graphs.

    Args:
        df: Data frame with ``strategy``, ``graph_orig``, ``graph_repaired``,
            ``score_diff``, ``dataset``, ``distance``,
            ``mean_test_score_orig`` and ``mean_test_score_repaired`` columns.
        strategy: Value of the ``strategy`` column to report on.
        sample_n: Maximum number of examples to print.
        random_state: Accepted but not used by this function.

    Returns:
        None. All output goes to stdout.
    """
    # work on a copy, narrowed to the requested strategy
    by_strategy = df.copy()
    by_strategy = by_strategy[by_strategy["strategy"] == strategy]

    print("Strategy:", strategy)
    print("Most improved examples:")

    # keep only rows that were repaired and actually improved
    with_repair = by_strategy[~pd.isnull(by_strategy["graph_repaired"])]
    improved = with_repair[with_repair["score_diff"] > 0]
    best = improved.sort_values("score_diff", ascending=False).head(sample_n)

    for _, example in best.iterrows():
        print("Dataset:", example.dataset)
        print(
            "Score: {} -> {}".format(
                example.mean_test_score_orig,
                example.mean_test_score_repaired,
            )
        )
        print("Distance:", example.distance)

        _, operations = pt.tree_edit_distance(
            example.graph_orig,
            example.graph_repaired,
            return_operations=True,
        )
        print("Edits:")
        # match edits leave the tree unchanged, so they are not reported
        for op in (o for o in operations if not is_match_edit(o)):
            print(edit_to_str(op))

        print("Orig pipeline:")
        print(pt.to_json(example.graph_orig))
        print("New pipeline:")
        print(pt.to_json(example.graph_repaired))
        print("\n\n")
def test_removes():
    """Component-removal and hyperparameter-removal rules must not be confused."""
    # --- a component removal: the MinMaxScaler step is dropped ---
    scaler_example = PairedExample(
        Pipeline([("s0", MinMaxScaler()),
                  ("clf", LogisticRegression(penalty="l2"))]),
        Pipeline([("clf", LogisticRegression(penalty="l2"))]),
    )
    component_edit = scaler_example.removes[0]
    assert local_rules.ComponentRemove.can_build_rule(component_edit)

    # --- a hyperparameter removal: delete the penalty param node by hand ---
    clf = LogisticRegression(penalty="l1")
    edited_tree = pt.to_tree(clf)
    edited_tree.annotate()
    clf_node = edited_tree.children[0]
    penalty_node = get_node_with_prefix(clf_node.children, "param_penalty")
    assert penalty_node is not None

    params_before = len(clf_node.children)
    clf_node.delete_child(penalty_node)
    params_after = len(clf_node.children)
    assert (params_before - params_after) == 1

    # diff the untouched tree against the edited one
    reference_tree = pt.to_tree(clf)
    reference_tree.annotate()
    dist, edits = pt.tree_edit_distance(
        reference_tree,
        edited_tree,
        return_operations=True,
    )
    assert dist == 1

    remove_edits = list(filter(local_rules.is_remove_edit, edits))
    assert len(remove_edits) == 1
    hyper_edit = remove_edits[0]

    # each edit can only build its own kind of rule
    assert local_rules.HyperparamRemove.can_build_rule(hyper_edit)
    assert not local_rules.ComponentRemove.can_build_rule(hyper_edit)
    assert not local_rules.HyperparamRemove.can_build_rule(component_edit)

    # apply them
    pipeline_tree = scaler_example.trees[0]
    comp_rule = local_rules.ComponentRemove(component_edit)
    comp_node = pipeline_tree.children[0].children[0]
    hyper_rule = local_rules.HyperparamRemove(hyper_edit)
    hyper_node = penalty_node

    assert comp_rule.can_apply(comp_node)
    assert comp_rule.apply(comp_node) is None
    assert hyper_rule.can_apply(hyper_node)
    assert hyper_rule.apply(hyper_node) is None

    # and neither rule applies to the other rule's node
    assert not comp_rule.can_apply(hyper_node)
    assert not hyper_rule.can_apply(comp_node)
def get_distance_and_edits(self, pre_ix, post_ix):
    """Return the tree edit distance and edit operations for a pipeline pair.

    Results are memoized per ordered ``(pre_ix, post_ix)`` pair in
    ``self.computed_mat``, ``self.dist_mat`` and ``self.edits``.

    Args:
        pre_ix: Positional row index of the source pipeline in
            ``self.pipelines_df``.
        post_ix: Positional row index of the target pipeline.

    Returns:
        A ``(distance, edits)`` tuple.
    """
    # distance is symmetric, but edit ops are not, so the cache key is ordered
    pair = (pre_ix, post_ix)

    if not self.computed_mat[pair]:
        # cache miss: compute from the stored graphs and remember the result
        source_graph = self.pipelines_df.iloc[pre_ix]["obj_graph"]
        target_graph = self.pipelines_df.iloc[post_ix]["obj_graph"]
        distance, edits = pt.tree_edit_distance(
            source_graph,
            target_graph,
            return_operations=True,
        )
        self.computed_mat[pair] = True
        self.dist_mat[pair] = distance
        self.edits[pair] = edits
        return distance, edits

    # cache hit
    return self.dist_mat[pair], self.edits[pair]
def main():
    """Enumerate variants of a logistic-regression pipeline and print them.

    Loads a pickled corpus from ``args.input``, builds a rule sampler and a
    tree enumerator from the command-line arguments, and prints each newly
    seen pipeline variant with its edit distance and non-match edits relative
    to the starting pipeline. Stops after ``args.bound_num_pipelines``
    distinct variants have been printed.
    """
    args = get_args()
    # NOTE(review): pickle.load can execute arbitrary code; only point
    # args.input at corpus files from a trusted source.
    with open(args.input, "rb") as fin:
        corpus = pickle.load(fin)

    pipeline = sklearn.pipeline.Pipeline([
        ("clf", sklearn.linear_model.LogisticRegression())
    ])

    rule_sampler = get_rule_sampler(args.rule_strategy, corpus, args.random_state)
    enumerator = get_tree_enumerator(args.enumeration_strategy, rule_sampler)
    orig_tree = pt.to_tree(pipeline)

    # Deduplicate on the hashable JSON form. The seed must use the same key
    # as the membership test below, otherwise a variant identical to the
    # starting pipeline would be reported as new.
    explored = {pt.to_hashable_json(orig_tree)}

    ix = 0
    for p in enumerator.enumerate(pipeline, args.bound_k):
        if ix >= args.bound_num_pipelines:
            break
        new_tree = pt.to_tree(p)
        h = pt.to_hashable_json(new_tree)
        if h in explored:
            continue
        explored.add(h)

        print("New pipeline", ix)
        dist, edits = pt.tree_edit_distance(
            orig_tree,
            new_tree,
            return_operations=True,
        )
        print("Distance", dist)
        for edit in edits:
            # match edits describe unchanged nodes; skip them
            if is_match_edit(edit):
                continue
            msg = "Edit: {} -> {}".format(
                get_safe_label(edit.arg1),
                get_safe_label(edit.arg2),
            )
            print(msg)
        print(pt.to_json(new_tree))
        ix += 1
def __init__(self, clf1, clf2):
    """Diff two pipelines and bucket the resulting edits by kind.

    Args:
        clf1: The "before" pipeline/estimator.
        clf2: The "after" pipeline/estimator.

    Sets ``pipelines``, ``trees`` (annotated), ``dist``, ``edits`` and the
    per-kind lists ``updates``, ``removes`` and ``inserts``. Edits that are
    none of those three kinds are left only in ``edits``.
    """
    self.pipelines = [clf1, clf2]
    self.trees = list(map(pt.to_tree, self.pipelines))
    for tree in self.trees:
        tree.annotate()

    before_tree, after_tree = self.trees
    self.dist, self.edits = pt.tree_edit_distance(
        before_tree,
        after_tree,
        return_operations=True,
    )

    self.updates, self.removes, self.inserts = [], [], []
    # checked in this order; the first matching predicate claims the edit
    buckets = (
        (local_rules.is_update_edit, self.updates),
        (local_rules.is_remove_edit, self.removes),
        (local_rules.is_insert_edit, self.inserts),
    )
    for edit in self.edits:
        for is_kind, bucket in buckets:
            if is_kind(edit):
                bucket.append(edit)
                break
def prepare_df(df, compute_dist=False):
    """Join original and repaired pipeline results into one analysis frame.

    Args:
        df: Results frame with ``type`` ("orig"/"repair"), ``failed``,
            ``dataset``, ``strategy``, ``id``, ``mean_test_score``, ``graph``
            and ``timestamp`` columns.
        compute_dist: When true, compute the tree edit distance between each
            original graph and its repaired graph; otherwise the ``distance``
            column is filled with NaN.

    Returns:
        One row per surviving original pipeline, left-joined with its repair
        (if any), plus the derived columns ``distance``, ``ts_quartile``,
        ``ts_decile``, ``score_diff``, ``improved``, ``improved_int``,
        ``has_repair`` and ``dummy``.
    """
    key_cols = ["dataset", "strategy", "id"]

    originals = df[df["type"] == "orig"]
    originals = originals[~originals["failed"]]

    # only keep dataset/id combinations whose original succeeded under
    # every strategy present in the input
    n_strategies = len(df["strategy"].unique())
    strategy_cts = (
        originals.groupby(["dataset", "id"])["strategy"]
        .agg(lambda x: len(set(x)))
        .to_frame(name="strategy_cts")
        .reset_index()
    )
    originals = pd.merge(
        originals, strategy_cts, how="left", on=["dataset", "id"])
    originals = originals[originals["strategy_cts"] == n_strategies]

    repairs = df[df["type"] == "repair"]
    repairs = repairs[~repairs["failed"]]
    repairs = repairs.sort_values("mean_test_score", ascending=False)
    # there should only be one score per dataset/id/strategy
    assert repairs.groupby(["dataset", "strategy", "id"]).size().max() == 1

    originals = originals[key_cols + ["mean_test_score", "graph", "timestamp"]]
    repairs = repairs[key_cols + ["mean_test_score", "graph"]]

    combined = pd.merge(
        originals,
        repairs,
        how="left",
        on=["dataset", "strategy", "id"],
        suffixes=("_orig", "_repaired"),
    )

    if compute_dist:
        graph_pairs = list(
            zip(combined["graph_orig"], combined["graph_repaired"]))
        dist = []
        for orig_graph, repaired_graph in tqdm.tqdm(graph_pairs):
            if pd.isnull(repaired_graph):
                # no repair for this original, so no distance
                dist.append(None)
            else:
                dist.append(pt.tree_edit_distance(orig_graph, repaired_graph))
    else:
        dist = np.nan
    combined["distance"] = dist

    # assign each row to a timestamp-based quartile
    combined["ts_quartile"] = add_timestamp_percentile(
        combined,
        [0.0, 0.25, 0.5, 0.75, 1.0],
        ["0-0.25", "0.25-0.5", "0.5-0.75", "0.75-1.0"],
    )

    # ... and to a timestamp-based decile, labelled "lo-hi"
    decile_edges = np.arange(0, 1.1, 0.1)
    decile_labels = [
        "{:.1f}-{:.1f}".format(lo, hi)
        for lo, hi in zip(decile_edges, decile_edges[1:])
    ]
    combined["ts_decile"] = add_timestamp_percentile(
        combined, decile_edges, decile_labels)

    combined["score_diff"] = (
        combined["mean_test_score_repaired"] - combined["mean_test_score_orig"]
    )
    score_diff = combined["score_diff"]
    combined["improved"] = (score_diff > 0) & (~pd.isnull(score_diff))
    combined["improved_int"] = combined["improved"].astype(int)
    combined["has_repair"] = ~pd.isnull(combined["mean_test_score_repaired"])
    combined["dummy"] = 1
    return combined
def build_paired_corpus(
        pipelines_df,
        num_pre,
        num_post,
        k,
        sample_method="approximate",
):
    """Build a corpus of (pre, post) pipeline tree pairs.

    Pipelines are first deduplicated on their hashable JSON form. For every
    selected "pre" pipeline, candidate "post" pipelines are the successful
    ones (any, if the pre pipeline failed; otherwise only those with a
    strictly higher ``external_score``). Candidates are narrowed by the
    chosen post-sampler and each resulting pair is pushed to a
    ``PairedCorpusBuilder``.

    Args:
        pipelines_df: Frame with at least ``obj_graph`` and
            ``external_score`` columns. It is not modified.
        num_pre: Number of pre pipelines to use (randomly chosen), or
            ``None`` to use all of them.
        num_post: Number of post pipelines to request from the sampler per
            pre pipeline, or ``None`` to use every valid candidate.
        k: Passed through to ``PairedCorpusBuilder``.
        sample_method: One of ``"random"``, ``"approximate"`` or ``"exact"``.

    Returns:
        A ``TreePairCorpus`` over the builder's entries, with annotated
        pre/post graphs.

    Raises:
        ValueError: If ``sample_method`` is not a known method.
    """
    start_time = time.time()
    builder = PairedCorpusBuilder(k)

    # make sure only unique pipelines; ``assign`` returns a new frame so the
    # caller's data frame does not gain a "json" column as a side effect
    pipelines_df = pipelines_df.assign(
        json=pipelines_df["obj_graph"].map(pt.to_hashable_json))
    column_types = pipelines_df.dtypes
    float_cols = column_types[column_types == float].index.values
    other_cols = column_types[column_types != float].index.values
    # duplicates are collapsed: float columns are averaged, everything else
    # takes the first value seen
    agg_ops = {col: np.mean for col in float_cols}
    agg_ops.update({col: (lambda x: x.values[0]) for col in other_cols})
    agg_ops.pop("json")
    pipelines_df = pipelines_df.groupby("json").agg(agg_ops)
    pipelines_df = pipelines_df.reset_index()

    pipelines_df["failed"] = pipelines_df["external_score"].isna()
    success = ~pipelines_df["failed"]
    success_ixs = np.where(success)[0].tolist()

    print("Building post-sampler of type: {}".format(sample_method))
    if sample_method == "random":
        post_sampler = RandomPostSampler(pipelines_df)
    elif sample_method == "approximate":
        post_sampler = ApproximatePostSampler(pipelines_df)
    elif sample_method == "exact":
        post_sampler = ExactPostSampler(pipelines_df)
    else:
        raise ValueError("Unknown sample method: {}".format(sample_method))

    n = pipelines_df.shape[0]
    ixs = np.arange(0, n)
    pre_ixs = ixs.tolist()

    if num_pre is not None:
        random.shuffle(pre_ixs)
        pre_ixs = pre_ixs[:num_pre]

    for pre_ix in tqdm.tqdm(pre_ixs):
        pre = pipelines_df.iloc[pre_ix]
        if pre.failed:
            # any succeeding pipeline can be post if pre is failure
            post_ixs = list(success_ixs)
        else:
            # only higher scores can be post
            higher_score = pipelines_df["external_score"] > pre.external_score
            post_ixs = (ixs[success & higher_score]).tolist()

        if num_post is not None:
            sampled_results = post_sampler.sample(pre_ix, num_post, post_ixs)
            # the exact sampler's results are (post_ix, distance, edits)
            # triples; other samplers' results are used as bare indices
            results_have_edits = sample_method == "exact"
        else:
            # no sampling requested: pair with every valid candidate and
            # compute distances on demand below
            sampled_results = post_ixs
            results_have_edits = False

        for res in tqdm.tqdm(sampled_results):
            if results_have_edits:
                post_ix, distance, edits = res
            else:
                post_ix, distance, edits = res, None, None

            if post_ix == pre_ix:
                continue

            post = pipelines_df.iloc[post_ix]
            assert not post.failed, "Post-tree can never be a failure"

            if distance is None:
                # only compute if needed
                distance, edits = pt.tree_edit_distance(
                    pre.obj_graph,
                    post.obj_graph,
                    return_operations=True,
                )

            entry = CorpusEntry(pre, post, distance, edits)
            builder.push(entry)

    end_time = time.time()

    entries = builder.get_entries()
    for entry in entries:
        # add parent/sibling info
        entry.pre.obj_graph.annotate()
        entry.post.obj_graph.annotate()

    corpus = TreePairCorpus(
        entries,
        compute_time=end_time - start_time,
        sample_method=sample_method,
    )
    return corpus
def test_beam_enumerator():
    """Exercise RepeatedBeamSearchEnumerator with a hand-built rule sampler.

    Builds four rules (two penalty updates, a scaler removal and a scaler
    insertion), wires them into a ``FakeRuleSampler`` with fixed
    probabilities, and checks rule ranking, variant derivation and the
    number of enumerated pipelines.
    """
    p1 = Pipeline([("clf", LogisticRegression(penalty="l2"))])
    p2 = Pipeline([("clf", LogisticRegression(penalty="l1"))])
    _, ops = pt.tree_edit_distance(p1, p2, return_operations=True)
    update_op = [o for o in ops if is_update_edit(o)][0]
    # rule 1: penalty=l2 -> penalty=l1
    r1 = HyperparamUpdate(update_op)

    p5 = Pipeline([("clf", LogisticRegression(penalty="elasticnet"))])
    _, ops = pt.tree_edit_distance(p1, p5, return_operations=True)
    update_op = [o for o in ops if is_update_edit(o)][0]
    # rule 1.5: penalty=l2 -> penalty=elasticnet
    r1_5 = HyperparamUpdate(update_op)

    p3 = Pipeline([("s0", MinMaxScaler()),
                   ("clf", LogisticRegression(penalty="l2"))])
    _, ops = pt.tree_edit_distance(p3, p1, return_operations=True)
    remove_op = [o for o in ops if is_remove_edit(o)][0]
    # rule 2: remove MinMaxScaler
    r2 = ComponentRemove(remove_op)

    p4 = Pipeline([("s0", StandardScaler()),
                   ("clf", LogisticRegression(penalty="l2"))])
    _, ops = pt.tree_edit_distance(p1, p4, return_operations=True)
    insert_op = [o for o in ops if is_insert_edit(o)][0]
    augedit = AugmentedEdit(insert_op, get_parent_match_edit(insert_op, ops))
    # rule 3: insert StandardScaler
    r3 = ComponentInsert(augedit)

    n1 = r1.pre
    n2 = r2.pre
    n3 = r3.pre

    rules = {
        get_node_key(n1): [r1, r1_5],
        get_node_key(n2): [r2],
        get_node_key(n3): [r3],
    }

    node_probs = {
        get_node_key(n1): 0.5,
        get_node_key(n2): 0.2,
        get_node_key(n3): 0.3,
    }
    cond_probs = {
        r1: 0.7,
        r1_5: 0.15,
        r2: 0.3,
        r3: 0.1,
    }
    rule_sampler = FakeRuleSampler(rules, node_probs, cond_probs)
    rs = rule_sampler.sample_rules(n1, return_proba=True)
    assert len(rs) == 2

    enumerator = tree_enumerator.RepeatedBeamSearchEnumerator(
        rule_sampler,
        force_apply=False,
    )
    # # should sort max to min prob by rules
    t1 = pt.to_tree(p1).annotate()
    opt_rules = enumerator.collect_node_rule_probs(t1, past_rules=[], acc={})
    flat_nodes = flatten(t1)
    assert len(opt_rules) == len(flat_nodes)
    # rule 1 is best for that node
    target_n = next(n for n in flat_nodes if n.label == r1.pre.label)
    opt_rule_and_prob = opt_rules[target_n]
    assert opt_rule_and_prob[0] == r1
    # we normalize the conditional probabilities to those that
    # can be applied:
    norm_cond_prob = cond_probs[r1] / (cond_probs[r1] + cond_probs[r1_5])
    expected_prob = node_probs[get_node_key(target_n)] * norm_cond_prob
    # compare with abs(): a signed difference would also accept any
    # probability that is too small
    assert abs(opt_rule_and_prob[1] - expected_prob) < 1e-5

    # if we collect optimal node/rules again after using r1, we should get r1_5
    # for that node
    opt_rules = enumerator.collect_node_rule_probs(t1, past_rules=[r1], acc={})
    opt_rule_and_prob = opt_rules[target_n]
    assert opt_rule_and_prob[0] == r1_5
    # we normalize the conditional probabilities to those that
    # can be applied:
    norm_cond_prob = cond_probs[r1_5] / (cond_probs[r1] + cond_probs[r1_5])
    expected_prob = node_probs[get_node_key(target_n)] * norm_cond_prob
    assert abs(opt_rule_and_prob[1] - expected_prob) < 1e-5

    new_trees, lineage = list(
        enumerator.derive_variant_trees(t1, k=5, past_rules=[]))
    # at most 2 (even though k = 5)
    # penalty=l2->l1, insert(StandardScaler)
    assert len(new_trees) == 2
    assert list(lineage[0])[0] == r1
    assert list(lineage[1])[0] == r3

    gen = enumerator.enumerate(p1, k=5)
    trees_t1 = list(gen)
    # l2->l1, insert(StandardScaler), l2->elastic
    assert len(trees_t1) == 3

    gen = enumerator.enumerate(p5, k=5)
    trees_t2 = list(gen)
    # insert(StandardScaler)
    assert len(trees_t2) == 1

    gen = enumerator.enumerate(p3, k=10)
    trees_t3 = list(gen)
    # (Overall possible rules): outcome pipeline
    # l2 -> l1: MinMax, LR(l1) (yes)
    # l2 -> elastic: MinMax, LR(elastic)
    # insert(StandardScaler): MinMax, SS, LR(l2) (yes)
    # remove(MinMaxScaler): LR(L2) (yes)
    # insert(SS), l2->l1: MinMax, SS, LR(l1) (yes)
    # insert(SS), l2->elastic: MinMax, SS, LR(elastic)
    # remove(MM), l2->l1: LR(l1) (yes)
    # remove(MM), l2->elastic: LR(elastic)
    # remove(MM), insert(SS) SS, LR(l2) (yes)
    # remove(MM), insert(SS), l2->l1: SS, LR(l1) (yes)
    # remove(MM), insert(SS), l2->elastic: SS, LR(elastic)
    assert len(trees_t3) < 11