Example #1
0
def test_repair_deterministic():
    """Check that both repair strategies are deterministic for a fixed seed.

    Runs each repairer twice with the same seed (inside
    ``get_repair_hashes``) and asserts the two passes produce identical,
    non-trivial sequences of pipeline hashes.
    """
    # NOTE: removed a dead `rules = []` assignment that was immediately
    # shadowed before first use further below.
    seed = 42
    # Random-mutation: no rule corpus needed (None), and mutations must be
    # force-applied since there is nothing to weight them by.
    random_rule_sampler = get_rule_sampler(
        "mutation",
        None,
    )
    random_enumerator = get_tree_enumerator(
        "beam", random_rule_sampler, force_apply=True)
    random_repairer = PipelineRepairer(random_enumerator)
    print("Random-mutation")
    random_passes = get_repair_hashes(random_repairer, seed)
    # sanity: produced at least one repair, and repairs are not all identical
    assert len(random_passes[0]) > 0
    assert len(set(random_passes[0])) > 1
    assert random_passes[0] == random_passes[
        1], "random-mutation should be deterministic"

    # hack up some "fake" rules with random score deltas so the weighted
    # sampler has something to weight by
    rules = []
    for _, lineage in random_enumerator.statistics.trace:
        for r in lineage:
            r._score_delta = np.random.random()
            rules.append(r)
    weighted_rule_sampler = get_rule_sampler("weighted", rules)
    weighted_enumerator = get_tree_enumerator(
        "beam",
        weighted_rule_sampler,
        force_apply=False,
    )
    weighted_repairer = PipelineRepairer(weighted_enumerator)
    print("Weighted-transducer")
    weighted_passes = get_repair_hashes(weighted_repairer, seed)
    assert len(weighted_passes[0]) > 0
    assert len(set(weighted_passes[0])) > 1
    assert weighted_passes[0] == weighted_passes[
        1], "weighted-transducer should be deterministic"
def main():
    """Enumerate repaired variants of a toy pipeline and print their edits.

    Loads a rule corpus, enumerates up to ``args.bound_num_pipelines`` novel
    pipelines derived from a plain LogisticRegression pipeline, and prints
    each one's tree-edit distance and non-trivial edit operations.
    """
    args = get_args()
    with open(args.input, "rb") as fin:
        corpus = pickle.load(fin)

    pipeline = sklearn.pipeline.Pipeline([
        ("clf", sklearn.linear_model.LogisticRegression())
    ])

    rule_sampler = get_rule_sampler(args.rule_strategy, corpus,
                                    args.random_state)
    enumerator = get_tree_enumerator(args.enumeration_strategy, rule_sampler)
    orig_tree = pt.to_tree(pipeline)
    # BUG FIX: `explored` must hold hashable-json strings, since membership
    # below is tested with `h = pt.to_hashable_json(new_tree)`. The original
    # seeded it with the tree object itself, so a candidate identical to the
    # original pipeline was never deduplicated.
    explored = set([pt.to_hashable_json(orig_tree)])

    ix = 0
    for p in enumerator.enumerate(pipeline, args.bound_k):
        if ix >= args.bound_num_pipelines:
            break
        new_tree = pt.to_tree(p)
        h = pt.to_hashable_json(new_tree)
        if h in explored:
            continue
        explored.add(h)

        print("New pipeline", ix)
        dist, edits = pt.tree_edit_distance(
            orig_tree,
            new_tree,
            return_operations=True,
        )
        print("Distance", dist)
        ct_edits = 0
        for edit in edits:
            # "match" edits are no-ops; only report actual changes
            if is_match_edit(edit):
                continue
            msg = "Edit: {} -> {}".format(get_safe_label(edit.arg1),
                                          get_safe_label(edit.arg2))
            print(msg)
            ct_edits += 1
        print(pt.to_json(new_tree))
        ix += 1
Example #3
0
def main():
    """Render rule-distribution histograms for a rule corpus as PDFs."""
    args = get_args()
    out_dir = args.output_dir
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    rules = load_rules(args.input)

    df = df_from_rules(rules)
    # map internal rule-type names to the labels used in the paper
    df["type_str"] = df["type_str"].map(rule_name_for_paper)
    print("Total count of raw rules", df.shape[0])

    df_sampler = None
    if args.rule_sampler is not None:
        sampler = get_rule_sampler(args.rule_sampler,
                                   rules,
                                   random_state=args.seed)
        # flatten the sampler's rule groups into a single list
        sampler_rules = []
        for group in sampler.rule_map.values():
            sampler_rules.extend(group)
        df_sampler = df_from_rules(sampler_rules)
        print("Total count of summarized rules", df_sampler.shape[0])

    savefig(histogram_rule_type(df),
            os.path.join(out_dir, "rule_type.pdf"))

    if df_sampler is not None:
        df_sampler["strategy"] = "summarized"
        axis = histogram_rule_type(df_sampler)
        # remove y axis label (shared with other plot)
        axis.get_yaxis().set_visible(False)
        plt.tight_layout()
        savefig(axis, os.path.join(out_dir, "rule_type_summarized.pdf"))

    savefig(histogram_hyperrule_component(df),
            os.path.join(out_dir, "hyperrules.pdf"))

    savefig(histogram_comprule_component(df),
            os.path.join(out_dir, "comprules.pdf"))
def main():
    """Evaluate pipeline repair on sampled test pipelines and dump results.

    Resolves a predefined strategy (if any) into rule/enumeration
    strategies, loads rule corpora, samples test pipelines from a pickled
    frame, runs the evaluation, and pickles both the results dataframe and
    the enumerator statistics.
    """
    args = get_args()
    if args.random_state is not None:
        # seed both numpy and stdlib RNGs for reproducibility
        np.random.seed(args.random_state)
        random.seed(args.random_state)

    # A predefined strategy is shorthand for a (rule_strategy,
    # enumeration_strategy) pair and is mutually exclusive with setting
    # those flags directly.
    # BUG FIX: the dispatch below is now nested under this None check.
    # Previously the trailing `else` fired even when no predefined strategy
    # was supplied, raising TypeError from `str + None` instead of honoring
    # the explicitly provided rule/enumeration strategies.
    if args.predefined_strategy is not None:
        assert args.rule_strategy is None
        assert args.enumeration_strategy is None
        if args.predefined_strategy == "weighted-transducer":
            args.rule_strategy = "weighted"
            args.enumeration_strategy = "beam"
        elif args.predefined_strategy == "rf-transducer":
            args.rule_strategy = "predictive"
            args.enumeration_strategy = "beam"
        elif args.predefined_strategy == "random-mutation":
            args.rule_strategy = "mutation"
            args.enumeration_strategy = "beam"
            args.bound_k = args.bound_num_repaired_pipelines
        else:
            raise Exception("Unknown predefined_strategy: " +
                            args.predefined_strategy)

    rules = []
    if args.rule_strategy != "mutation":
        # mutation samples rules on the fly; all other strategies load
        # learned rules from the provided corpora
        for p in args.rules:
            with open(p, "rb") as fin:
                rule_corpus = pickle.load(fin)
                rules.extend(rule_corpus.rules)
    rule_sampler = get_rule_sampler(
        args.rule_strategy,
        rules,
        args.random_state,
    )
    enumerator = get_tree_enumerator(
        args.enumeration_strategy,
        rule_sampler,
        force_apply=(args.rule_strategy == "mutation"))

    df_test_data = pd.read_pickle(args.test)
    # only focus on pipelines that worked before
    df_test_data = df_test_data[~df_test_data["failed"]]
    df_test_data = df_test_data.reset_index(drop=True)

    # cap the number of evaluated pipelines, sampling without replacement
    if df_test_data.shape[0] > args.num_test_pipelines:
        df_test_data = df_test_data.sample(
            n=args.num_test_pipelines,
            replace=False,
            random_state=args.random_state,
        )

    df_test_data = df_test_data.sort_values("timestamp", ascending=True)
    # graph representation of the pipeline
    test_trees = df_test_data["obj_graph"].values
    timestamps = df_test_data["timestamp"].values
    timestamped_test_trees = zip(test_trees, timestamps)

    # all sampled pipelines are assumed to target the same dataset
    test_dataset_name = df_test_data.iloc[0]["dataset"]
    test_dataset = utils.get_dataset(
        test_dataset_name,
        replace_nan_and_inf=0.0,
    )

    with open(args.idx_search, "rb") as fin:
        idx_search = pickle.load(fin)

    df_results = run_evaluation(
        test_dataset,
        timestamped_test_trees,
        enumerator,
        args.bound_num_repaired_pipelines,
        idx_search=idx_search,
        dev_cv=args.dev_cv,
        bound_k=args.bound_k,
        cv=args.cv,
        scoring=args.scoring,
        random_state=args.random_state,
    )
    # annotate results with the configuration that produced them
    df_results["dataset"] = test_dataset_name
    if args.predefined_strategy is not None:
        df_results["strategy"] = args.predefined_strategy
    df_results["rule_strategy"] = args.rule_strategy
    df_results["enumeration_strategy"] = args.enumeration_strategy

    print("Dumping results to", args.output)
    df_results.to_pickle(args.output)

    print("Dumping enumerator statistics")
    with open(args.output + "-enumerator-statistics", "wb") as fout:
        enumerator.statistics.strategy = args.predefined_strategy
        pickle.dump(enumerator.statistics, fout)
def main():
    """Evaluate pipeline repair on a set of scripts and dump results.

    Resolves a predefined strategy (if any) into rule/enumeration
    strategies, loads rule corpora, runs the script-based evaluation, and
    pickles both the results dataframe and the enumerator statistics.
    """
    args = get_args()
    if args.random_state is not None:
        # seed both numpy and stdlib RNGs for reproducibility
        np.random.seed(args.random_state)
        random.seed(args.random_state)

    # A predefined strategy is shorthand for a (rule_strategy,
    # enumeration_strategy) pair and is mutually exclusive with setting
    # those flags directly.
    # BUG FIX: the dispatch below is now nested under this None check.
    # Previously the trailing `else` fired even when no predefined strategy
    # was supplied, raising TypeError from `str + None` instead of honoring
    # the explicitly provided rule/enumeration strategies.
    if args.predefined_strategy is not None:
        assert args.rule_strategy is None
        assert args.enumeration_strategy is None
        if args.predefined_strategy == "weighted-transducer":
            args.rule_strategy = "weighted"
            args.enumeration_strategy = "beam"
        elif args.predefined_strategy == "rf-transducer":
            args.rule_strategy = "predictive"
            args.enumeration_strategy = "beam"
        elif args.predefined_strategy == "random-mutation":
            args.rule_strategy = "mutation"
            args.enumeration_strategy = "beam"
            args.bound_k = args.bound_num_repaired_pipelines
        else:
            raise Exception("Unknown predefined_strategy: " +
                            args.predefined_strategy)

    rules = []
    if args.rule_strategy != "mutation":
        # mutation samples rules on the fly; all other strategies load
        # learned rules from the provided corpora
        for p in args.rules:
            with open(p, "rb") as fin:
                rule_corpus = pickle.load(fin)
                rules.extend(rule_corpus.rules)
    rule_sampler = get_rule_sampler(
        args.rule_strategy,
        rules,
        args.random_state,
    )
    enumerator = get_tree_enumerator(
        args.enumeration_strategy,
        rule_sampler,
        force_apply=(args.rule_strategy == "mutation"))

    df_results = run_evaluation(
        args.scripts,
        enumerator,
        args.bound_num_repaired_pipelines,
        scoring=args.scoring,
        max_size=args.max_size,
        dev_fraction=args.dev_fraction,
        dev_cv=args.dev_cv,
        bound_k=args.bound_k,
        cv=args.cv,
        random_state=args.random_state,
    )

    # annotate results with the configuration that produced them
    if args.predefined_strategy is not None:
        df_results["strategy"] = args.predefined_strategy
    df_results["rule_strategy"] = args.rule_strategy
    df_results["enumeration_strategy"] = args.enumeration_strategy
    print("Dumping results to", args.output)
    df_results.to_pickle(args.output)
    print("Dumping enumerator statistics")
    with open(args.output + "-enumerator-statistics", "wb") as fout:
        enumerator.statistics.strategy = args.predefined_strategy
        pickle.dump(enumerator.statistics, fout)