def sample_examples(df, strategy, sample_n=10, random_state=42):
    """Print the most-improved repaired pipelines for a single strategy.

    Rows of ``df`` are kept when their ``strategy`` column equals
    ``strategy``, their ``graph_repaired`` is not null and their
    ``score_diff`` is strictly positive.  The ``sample_n`` rows with the
    largest ``score_diff`` are then reported: dataset, original and
    repaired scores, distance, every non-match tree edit, and both
    pipelines rendered as JSON.

    ``random_state`` is accepted but not used here.  The function works on
    a copy of ``df`` and returns nothing.
    """
    subset = df.copy()
    subset = subset[subset["strategy"] == strategy]
    print("Strategy:", strategy)
    print("Most improved examples:")

    repaired_mask = ~pd.isnull(subset["graph_repaired"])
    subset = subset[repaired_mask]
    subset = subset[subset["score_diff"] > 0]
    best = subset.sort_values("score_diff", ascending=False).head(sample_n)

    for _, example in best.iterrows():
        print("Dataset:", example.dataset)
        print("Score: {} -> {}".format(example.mean_test_score_orig,
                                       example.mean_test_score_repaired))
        print("Distance:", example.distance)

        # Only the edit operations are needed here; the distance itself
        # was already printed from the row.
        _, operations = pt.tree_edit_distance(
            example.graph_orig,
            example.graph_repaired,
            return_operations=True,
        )
        print("Edits:")
        for operation in operations:
            if is_match_edit(operation):
                continue
            print(edit_to_str(operation))

        print("Orig pipeline:")
        print(pt.to_json(example.graph_orig))

        print("New pipeline:")
        print(pt.to_json(example.graph_repaired))

        print("\n\n")
Пример #2
0
def test_removes():
    """Exercise building and applying component / hyperparameter remove rules.

    A component-level remove edit comes from a pipeline pair that differs
    by one scaler step; a hyperparameter-level remove edit comes from
    deleting the ``param_penalty`` node of a logistic-regression tree.
    Each rule class must accept only its own kind of edit and node.
    """
    # Pipeline pair whose only difference is the leading MinMaxScaler step.
    pair = PairedExample(
        Pipeline([("s0", MinMaxScaler()),
                  ("clf", LogisticRegression(penalty="l2"))]),
        Pipeline([("clf", LogisticRegression(penalty="l2"))]),
    )
    assert local_rules.ComponentRemove.can_build_rule(pair.removes[0])

    # Build a tree for a lone classifier and delete its penalty parameter.
    clf = LogisticRegression(penalty="l1")
    pruned_tree = pt.to_tree(clf)
    pruned_tree.annotate()

    first_child = pruned_tree.children[0]
    penalty_node = get_node_with_prefix(first_child.children,
                                        "param_penalty")
    assert penalty_node is not None
    count_before = len(first_child.children)
    first_child.delete_child(penalty_node)
    count_after = len(first_child.children)
    assert (count_before - count_after) == 1

    # A fresh, untouched tree of the same classifier to diff against.
    full_tree = pt.to_tree(clf)
    full_tree.annotate()

    dist, operations = pt.tree_edit_distance(
        full_tree,
        pruned_tree,
        return_operations=True,
    )
    assert dist == 1
    removals = list(filter(local_rules.is_remove_edit, operations))
    assert len(removals) == 1
    hyper_edit = removals[0]

    assert local_rules.HyperparamRemove.can_build_rule(hyper_edit)

    # Each rule type must reject the other kind of remove edit.
    assert not local_rules.ComponentRemove.can_build_rule(hyper_edit)
    assert not local_rules.HyperparamRemove.can_build_rule(pair.removes[0])

    # apply them
    pre_tree = pair.trees[0]
    component_rule = local_rules.ComponentRemove(pair.removes[0])
    component_node = pre_tree.children[0].children[0]

    hyperparam_rule = local_rules.HyperparamRemove(hyper_edit)
    hyperparam_node = penalty_node

    assert component_rule.can_apply(component_node)
    assert component_rule.apply(component_node) is None

    assert hyperparam_rule.can_apply(hyperparam_node)
    assert hyperparam_rule.apply(hyperparam_node) is None

    # Rules must not be applicable to nodes of the other kind.
    assert not component_rule.can_apply(hyperparam_node)
    assert not hyperparam_rule.can_apply(component_node)
Пример #3
0
 def get_distance_and_edits(self, pre_ix, post_ix):
     """Return ``(distance, edits)`` between two pipelines, with caching.

     ``pre_ix`` and ``post_ix`` are positional row indices into
     ``self.pipelines_df``.  Results are memoised per ordered pair in
     ``self.computed_mat`` (flag), ``self.dist_mat`` (distance) and
     ``self.edits`` (edit operations keyed by ``(pre_ix, post_ix)``).
     """
     # distance is symmetric, but edit ops are not
     if not self.computed_mat[pre_ix, post_ix]:
         source_graph = self.pipelines_df.iloc[pre_ix]["obj_graph"]
         target_graph = self.pipelines_df.iloc[post_ix]["obj_graph"]
         distance, edits = pt.tree_edit_distance(
             source_graph,
             target_graph,
             return_operations=True,
         )
         # Record the fresh result so the next lookup for this ordered
         # pair is served from the cache.
         self.computed_mat[pre_ix, post_ix] = True
         self.dist_mat[pre_ix, post_ix] = distance
         self.edits[(pre_ix, post_ix)] = edits
         return distance, edits

     cached_distance = self.dist_mat[pre_ix, post_ix]
     cached_edits = self.edits[(pre_ix, post_ix)]
     return cached_distance, cached_edits
Пример #4
0
def main():
    """Enumerate variants of a baseline pipeline and print each new one.

    Loads the pickled corpus named by ``args.input``, builds a rule sampler
    and a tree enumerator from the parsed arguments, and iterates over the
    pipelines enumerated from a single-step logistic-regression pipeline
    (bounded by ``args.bound_k``).  For every pipeline not seen before, up
    to ``args.bound_num_pipelines`` of them, it prints the edit distance
    from the baseline, each non-match edit, and the pipeline as JSON.
    """
    args = get_args()
    # NOTE(review): pickle.load can execute arbitrary code from the file;
    # only point this at corpora from a trusted source.
    with open(args.input, "rb") as fin:
        corpus = pickle.load(fin)

    pipeline = sklearn.pipeline.Pipeline([
        ("clf", sklearn.linear_model.LogisticRegression())
    ])

    rule_sampler = get_rule_sampler(args.rule_strategy, corpus,
                                    args.random_state)
    enumerator = get_tree_enumerator(args.enumeration_strategy, rule_sampler)
    orig_tree = pt.to_tree(pipeline)
    # Seed the seen-set with the same hashable-JSON key used for lookups
    # below, so the baseline pipeline itself counts as already explored.
    explored = {pt.to_hashable_json(orig_tree)}

    # ix counts only pipelines that were actually new, not every candidate.
    ix = 0
    for p in enumerator.enumerate(pipeline, args.bound_k):
        if ix >= args.bound_num_pipelines:
            break
        new_tree = pt.to_tree(p)
        h = pt.to_hashable_json(new_tree)
        if h in explored:
            continue
        explored.add(h)

        print("New pipeline", ix)
        dist, edits = pt.tree_edit_distance(
            orig_tree,
            new_tree,
            return_operations=True,
        )
        print("Distance", dist)
        for edit in edits:
            if is_match_edit(edit):
                continue
            msg = "Edit: {} -> {}".format(get_safe_label(edit.arg1),
                                          get_safe_label(edit.arg2))
            print(msg)
        print(pt.to_json(new_tree))
        ix += 1
Пример #5
0
 def __init__(self, clf1, clf2):
     """Build trees for two pipelines and bucket the edits between them.

     Stores the two inputs in ``self.pipelines``, their annotated trees in
     ``self.trees``, and the tree edit distance / operations from the
     first tree to the second in ``self.dist`` / ``self.edits``.  Each
     edit is placed in the first matching list of ``self.updates``,
     ``self.removes`` or ``self.inserts``; edits matching none of the
     three predicates are not recorded in any list.
     """
     self.pipelines = [clf1, clf2]
     self.trees = list(map(pt.to_tree, self.pipelines))
     for tree in self.trees:
         tree.annotate()

     pre_tree, post_tree = self.trees
     self.dist, self.edits = pt.tree_edit_distance(
         pre_tree,
         post_tree,
         return_operations=True,
     )

     self.updates = []
     self.removes = []
     self.inserts = []
     # Checked in order; an edit lands in the first bucket whose predicate
     # accepts it.
     buckets = (
         (local_rules.is_update_edit, self.updates),
         (local_rules.is_remove_edit, self.removes),
         (local_rules.is_insert_edit, self.inserts),
     )
     for edit in self.edits:
         for accepts, bucket in buckets:
             if accepts(edit):
                 bucket.append(edit)
                 break
def prepare_df(df, compute_dist=False):
    """Pair each successful original pipeline with its repaired counterpart.

    Parameters
    ----------
    df : pandas.DataFrame
        Results table.  The code reads the columns ``type`` (``"orig"`` or
        ``"repair"``), ``failed``, ``strategy``, ``dataset``, ``id``,
        ``mean_test_score``, ``graph`` and ``timestamp``.
    compute_dist : bool, default False
        When true, compute ``pt.tree_edit_distance`` between the original
        and repaired graph of every row that has a repair; otherwise the
        ``distance`` column is filled with NaN.

    Returns
    -------
    pandas.DataFrame
        One row per non-failed original (restricted to dataset/id pairs
        whose original succeeded under every strategy), left-joined with
        the non-failed repair for the same dataset/strategy/id.  Adds the
        columns ``distance``, ``ts_quartile``, ``ts_decile``,
        ``score_diff``, ``improved``, ``improved_int``, ``has_repair`` and
        ``dummy``.

    Raises
    ------
    AssertionError
        If more than one non-failed repair exists for a
        dataset/strategy/id combination.
    """
    df_orig = df[df["type"] == "orig"]
    df_orig = df_orig[~df_orig["failed"]]

    # make sure we only consider dataset/id where we have the orig
    # for all strategies
    unique_strategies = df["strategy"].unique()
    n_strategies = len(unique_strategies)
    strategy_cts = df_orig.groupby(["dataset", "id"
                                    ])["strategy"].agg(lambda x: len(set(x)))
    strategy_cts = strategy_cts.to_frame(name="strategy_cts").reset_index()
    df_orig = pd.merge(df_orig, strategy_cts, how="left", on=["dataset", "id"])
    df_orig = df_orig[df_orig["strategy_cts"] == n_strategies]

    df_repaired = df[df["type"] == "repair"]
    df_repaired = df_repaired[~df_repaired["failed"]]

    df_repaired = df_repaired.sort_values("mean_test_score", ascending=False)

    # there should only be one score per dataset/id/strategy
    # (an empty groupby has a NaN max, so the no-repairs case is allowed
    # explicitly instead of tripping the check)
    repairs_per_key = df_repaired.groupby(["dataset", "strategy", "id"]).size()
    assert repairs_per_key.empty or repairs_per_key.max() == 1, (
        "expected at most one repair per dataset/strategy/id")
    df_orig = df_orig[[
        "dataset",
        "strategy",
        "id",
        "mean_test_score",
        "graph",
        "timestamp",
    ]]
    df_repaired = df_repaired[[
        "dataset", "strategy", "id", "mean_test_score", "graph"
    ]]

    # Left join keeps originals without a repair; their *_repaired columns
    # are null.
    df_combined = pd.merge(
        df_orig,
        df_repaired,
        how="left",
        on=["dataset", "strategy", "id"],
        suffixes=("_orig", "_repaired"),
    )
    if compute_dist:
        # Materialised as a list so tqdm can show a total.
        graph_pairs = list(
            zip(df_combined["graph_orig"], df_combined["graph_repaired"]))
        dist = [
            None if pd.isnull(repaired) else pt.tree_edit_distance(
                orig, repaired)
            for orig, repaired in tqdm.tqdm(graph_pairs)
        ]
    else:
        dist = np.nan
    df_combined["distance"] = dist

    # assign "row" to timestamp-based quartile
    df_combined["ts_quartile"] = add_timestamp_percentile(
        df_combined,
        [0.0, 0.25, 0.5, 0.75, 1.0],
        ["0-0.25", "0.25-0.5", "0.5-0.75", "0.75-1.0"],
    )
    # same idea with ten bins; labels are built from consecutive edges
    decile_edges = np.arange(0, 1.1, 0.1)
    decile_labels = [
        "{:.1f}-{:.1f}".format(lower, upper)
        for lower, upper in zip(decile_edges, decile_edges[1:])
    ]
    df_combined["ts_decile"] = add_timestamp_percentile(
        df_combined, decile_edges, decile_labels)

    df_combined["score_diff"] = df_combined[
        "mean_test_score_repaired"] - df_combined["mean_test_score_orig"]
    df_combined["improved"] = (df_combined["score_diff"] >
                               0) & (~pd.isnull(df_combined["score_diff"]))
    df_combined["improved_int"] = df_combined["improved"].astype(int)
    df_combined["has_repair"] = ~pd.isnull(
        df_combined["mean_test_score_repaired"])
    df_combined["dummy"] = 1
    return df_combined
Пример #7
0
def build_paired_corpus(
        pipelines_df,
        num_pre,
        num_post,
        k,
        sample_method="approximate",
):
    """Build a corpus of (pre, post) pipeline tree pairs with their edits.

    Parameters
    ----------
    pipelines_df : pandas.DataFrame
        Must contain ``obj_graph`` (pipeline tree) and ``external_score``
        columns.  The frame passed in is not modified.
    num_pre : int or None
        Number of randomly chosen "pre" pipelines; ``None`` uses all.
    num_post : int or None
        Number of "post" pipelines requested from the sampler for each
        pre; ``None`` pairs each pre with every eligible post.
    k
        Passed unchanged to ``PairedCorpusBuilder``.
    sample_method : {"random", "approximate", "exact"}
        Which post-sampler class to use.

    Returns
    -------
    TreePairCorpus
        Corpus of the entries kept by the builder, with the elapsed
        pairing time and the sample method recorded.

    Raises
    ------
    ValueError
        If ``sample_method`` is not one of the supported names.
    """
    start_time = time.time()
    builder = PairedCorpusBuilder(k)
    # make sure only unique pipelines; assign() returns a new frame so the
    # caller's DataFrame is left untouched
    pipelines_df = pipelines_df.assign(
        json=pipelines_df["obj_graph"].map(pt.to_hashable_json))
    column_types = pipelines_df.dtypes
    float_cols = column_types[column_types == float].index.values
    other_cols = column_types[column_types != float].index.values
    # duplicates are collapsed: float columns are averaged, every other
    # column keeps the value from the first row of its group
    agg_ops = {col: "mean" for col in float_cols}
    agg_ops.update({col: (lambda x: x.values[0]) for col in other_cols})
    agg_ops.pop("json")
    pipelines_df = pipelines_df.groupby("json").agg(agg_ops)
    pipelines_df = pipelines_df.reset_index()
    # a pipeline without an external score is treated as failed
    pipelines_df["failed"] = pipelines_df["external_score"].isna()
    success_ixs = np.where(~pipelines_df["failed"])[0].tolist()

    print("Building post-sampler of type: {}".format(sample_method))
    if sample_method == "random":
        post_sampler = RandomPostSampler(pipelines_df)
    elif sample_method == "approximate":
        post_sampler = ApproximatePostSampler(pipelines_df)
    elif sample_method == "exact":
        post_sampler = ExactPostSampler(pipelines_df)
    else:
        raise ValueError("Unknown sample method: {}".format(sample_method))

    n = pipelines_df.shape[0]
    ixs = np.arange(0, n)
    pre_ixs = ixs.tolist()

    if num_pre is not None:
        random.shuffle(pre_ixs)
        pre_ixs = pre_ixs[:num_pre]

    for pre_ix in tqdm.tqdm(pre_ixs):
        pre = pipelines_df.iloc[pre_ix]

        if pre.failed:
            # any succeeding pipeline can be post if pre is failure
            post_ixs = list(success_ixs)
        else:
            # only higher scores can be post
            higher_score = pipelines_df["external_score"] > pre.external_score

            success = ~pipelines_df["failed"]
            post_ixs = (ixs[success & higher_score]).tolist()

        if num_post is not None:
            sampled_results = post_sampler.sample(pre_ix, num_post, post_ixs)
            # only the exact sampler returns (post_ix, distance, edits)
            results_carry_edits = sample_method == "exact"
        else:
            # no cap requested: pair with every eligible post pipeline
            sampled_results = post_ixs
            results_carry_edits = False

        for res in tqdm.tqdm(sampled_results):
            if results_carry_edits:
                post_ix, distance, edits = res
            else:
                post_ix, distance, edits = res, None, None

            if post_ix == pre_ix:
                continue
            post = pipelines_df.iloc[post_ix]
            assert not post.failed, "Post-tree can never be a failure"
            if distance is None:
                # only compute if needed
                distance, edits = pt.tree_edit_distance(
                    pre.obj_graph,
                    post.obj_graph,
                    return_operations=True,
                )
            entry = CorpusEntry(pre, post, distance, edits)
            builder.push(entry)

    # compute_time covers pairing only, not the annotation pass below
    end_time = time.time()
    entries = builder.get_entries()
    for entry in entries:
        # add parent/sibling info
        entry.pre.obj_graph.annotate()
        entry.post.obj_graph.annotate()

    corpus = TreePairCorpus(
        entries,
        compute_time=end_time - start_time,
        sample_method=sample_method,
    )
    return corpus
Пример #8
0
def test_beam_enumerator():
    """Check rule ranking and enumeration of RepeatedBeamSearchEnumerator.

    Four rules are derived from small pipeline pairs (two penalty updates,
    one scaler removal, one scaler insertion) and served through a
    ``FakeRuleSampler`` with fixed node and conditional probabilities.
    The test then checks the best rule and probability chosen per node,
    the variants derived from one tree, and the number of pipelines
    enumerated from three starting pipelines.
    """
    p1 = Pipeline([("clf", LogisticRegression(penalty="l2"))])
    p2 = Pipeline([("clf", LogisticRegression(penalty="l1"))])

    _, ops = pt.tree_edit_distance(p1, p2, return_operations=True)
    update_op = [o for o in ops if is_update_edit(o)][0]
    # rule 1: penalty=l2 -> penalty=l1
    r1 = HyperparamUpdate(update_op)

    p5 = Pipeline([("clf", LogisticRegression(penalty="elasticnet"))])
    _, ops = pt.tree_edit_distance(p1, p5, return_operations=True)
    update_op = [o for o in ops if is_update_edit(o)][0]
    # rule 1.5: penalty=l2 -> penalty=elasticnet
    r1_5 = HyperparamUpdate(update_op)

    p3 = Pipeline([("s0", MinMaxScaler()),
                   ("clf", LogisticRegression(penalty="l2"))])
    _, ops = pt.tree_edit_distance(p3, p1, return_operations=True)
    remove_op = [o for o in ops if is_remove_edit(o)][0]

    # rule 2: remove MinMaxScaler
    r2 = ComponentRemove(remove_op)

    p4 = Pipeline([("s0", StandardScaler()),
                   ("clf", LogisticRegression(penalty="l2"))])

    _, ops = pt.tree_edit_distance(p1, p4, return_operations=True)
    insert_op = [o for o in ops if is_insert_edit(o)][0]
    augedit = AugmentedEdit(insert_op, get_parent_match_edit(insert_op, ops))
    # rule 3: insert StandardScaler
    r3 = ComponentInsert(augedit)

    n1 = r1.pre
    n2 = r2.pre
    n3 = r3.pre
    rules = {
        get_node_key(n1): [r1, r1_5],
        get_node_key(n2): [r2],
        get_node_key(n3): [r3],
    }
    node_probs = {
        get_node_key(n1): 0.5,
        get_node_key(n2): 0.2,
        get_node_key(n3): 0.3,
    }
    cond_probs = {
        r1: 0.7,
        r1_5: 0.15,
        r2: 0.3,
        r3: 0.1,
    }

    rule_sampler = FakeRuleSampler(rules, node_probs, cond_probs)

    rs = rule_sampler.sample_rules(n1, return_proba=True)
    assert len(rs) == 2

    enumerator = tree_enumerator.RepeatedBeamSearchEnumerator(
        rule_sampler,
        force_apply=False,
    )

    # # should sort max to min prob by rules
    t1 = pt.to_tree(p1).annotate()
    opt_rules = enumerator.collect_node_rule_probs(t1, past_rules=[], acc={})
    flat_nodes = flatten(t1)
    assert len(opt_rules) == len(flat_nodes)

    # rule 1 is best for that node
    target_n = next(n for n in flat_nodes if n.label == r1.pre.label)
    opt_rule_and_prob = opt_rules[target_n]
    assert opt_rule_and_prob[0] == r1
    # we normalize the conditional probabilities to those that
    # can be applied:
    norm_cond_prob = cond_probs[r1] / (cond_probs[r1] + cond_probs[r1_5])
    expected_prob = node_probs[get_node_key(target_n)] * norm_cond_prob
    # two-sided tolerance: a probability far below the expected value
    # must fail as well
    assert abs(opt_rule_and_prob[1] - expected_prob) < 1e-5

    # if we collect optimal node/rules again after using r1, we should get r1_5
    # for that node
    opt_rules = enumerator.collect_node_rule_probs(t1, past_rules=[r1], acc={})
    opt_rule_and_prob = opt_rules[target_n]
    assert opt_rule_and_prob[0] == r1_5
    # we normalize the conditional probabilities to those that
    # can be applied:
    norm_cond_prob = cond_probs[r1_5] / (cond_probs[r1] + cond_probs[r1_5])
    expected_prob = node_probs[get_node_key(target_n)] * norm_cond_prob
    assert abs(opt_rule_and_prob[1] - expected_prob) < 1e-5

    new_trees, lineage = list(
        enumerator.derive_variant_trees(t1, k=5, past_rules=[]))
    # at most 2 (even though k = 5)
    # penalty=l2->l1, insert(StandardScaler)
    assert len(new_trees) == 2
    assert list(lineage[0])[0] == r1
    assert list(lineage[1])[0] == r3

    gen = enumerator.enumerate(p1, k=5)
    trees_t1 = list(gen)
    # l2->l1, insert(StandardScaler), l2->elastic
    assert len(trees_t1) == 3

    gen = enumerator.enumerate(p5, k=5)
    trees_t2 = list(gen)
    # insert(StandardScaler)
    assert len(trees_t2) == 1

    gen = enumerator.enumerate(p3, k=10)
    trees_t3 = list(gen)
    # (Overall possible rules): outcome pipeline
    # l2 -> l1:                            MinMax, LR(l1)           (yes)
    # l2 -> elastic:                       MinMax, LR(elastic)
    # insert(StandardScaler):              MinMax, SS, LR(l2)       (yes)
    # remove(MinMaxScaler):                LR(L2)                   (yes)
    # insert(SS), l2->l1:                  MinMax, SS, LR(l1)       (yes)
    # insert(SS), l2->elastic:             MinMax, SS, LR(elastic)
    # remove(MM), l2->l1:                  LR(l1)                   (yes)
    # remove(MM), l2->elastic:             LR(elastic)
    # remove(MM), insert(SS)               SS, LR(l2)               (yes)
    # remove(MM), insert(SS), l2->l1:      SS, LR(l1)               (yes)
    # remove(MM), insert(SS), l2->elastic: SS, LR(elastic)
    assert len(trees_t3) < 11