def test_repair_deterministic():
    """Check that both repair strategies yield reproducible results.

    Runs each repairer twice with the same seed (via ``get_repair_hashes``)
    and asserts the two passes produce identical, non-trivial hash sequences.
    Fix: removed a dead ``rules = []`` at the top of the function that was
    unconditionally shadowed by the later assignment.
    """
    seed = 42

    # --- random-mutation strategy -------------------------------------
    random_rule_sampler = get_rule_sampler("mutation", None)
    random_enumerator = get_tree_enumerator(
        "beam", random_rule_sampler, force_apply=True)
    random_repairer = PipelineRepairer(random_enumerator)
    print("Random-mutation")
    random_passes = get_repair_hashes(random_repairer, seed)
    assert len(random_passes[0]) > 0
    # more than one distinct hash: the repairer actually varied pipelines
    assert len(set(random_passes[0])) > 1
    assert random_passes[0] == random_passes[1], \
        "random-mutation should be deterministic"

    # hack up some "fake" rules: reuse the rules traced during the random
    # run, attaching random score deltas so the weighted sampler has weights
    rules = []
    for _, lineage in random_enumerator.statistics.trace:
        for r in lineage:
            r._score_delta = np.random.random()
            rules.append(r)

    # --- weighted-transducer strategy ---------------------------------
    weighted_rule_sampler = get_rule_sampler("weighted", rules)
    weighted_enumerator = get_tree_enumerator(
        "beam", weighted_rule_sampler, force_apply=False)
    weighted_repairer = PipelineRepairer(weighted_enumerator)
    print("Weighted-transducer")
    weighted_passes = get_repair_hashes(weighted_repairer, seed)
    assert len(weighted_passes[0]) > 0
    assert len(set(weighted_passes[0])) > 1
    assert weighted_passes[0] == weighted_passes[1], \
        "weighted-transducer should be deterministic"
def main():
    """Enumerate candidate repairs for a toy pipeline and print their edits.

    Loads a rule corpus, enumerates up to ``args.bound_num_pipelines`` novel
    pipelines for a LogisticRegression pipeline, and prints the tree-edit
    distance and the individual edits for each new pipeline.

    Fixes:
      * ``explored`` was seeded with the tree *object* while membership is
        tested against ``pt.to_hashable_json`` strings, so the original
        pipeline could never be recognized as already explored — seed it
        with its hashable form instead.
      * removed the unused local counter ``ct_edits``.
    """
    args = get_args()
    with open(args.input, "rb") as fin:
        corpus = pickle.load(fin)
    pipeline = sklearn.pipeline.Pipeline([
        ("clf", sklearn.linear_model.LogisticRegression()),
    ])
    rule_sampler = get_rule_sampler(
        args.rule_strategy, corpus, args.random_state)
    enumerator = get_tree_enumerator(args.enumeration_strategy, rule_sampler)
    orig_tree = pt.to_tree(pipeline)
    # track hashable-json forms so duplicates (incl. the original) are skipped
    explored = set([pt.to_hashable_json(orig_tree)])
    ix = 0
    for p in enumerator.enumerate(pipeline, args.bound_k):
        if ix >= args.bound_num_pipelines:
            break
        new_tree = pt.to_tree(p)
        h = pt.to_hashable_json(new_tree)
        if h in explored:
            continue
        explored.add(h)
        print("New pipeline", ix)
        dist, edits = pt.tree_edit_distance(
            orig_tree,
            new_tree,
            return_operations=True,
        )
        print("Distance", dist)
        for edit in edits:
            # matches are no-ops; only report real edits
            if is_match_edit(edit):
                continue
            print(f"Edit: {get_safe_label(edit.arg1)} -> "
                  f"{get_safe_label(edit.arg2)}")
        print(pt.to_json(new_tree))
        ix += 1
def main():
    """Produce rule-distribution histograms and save them as PDFs.

    Loads rules from ``args.input``, optionally summarizes them with a rule
    sampler, and writes rule-type / hyperrule / comprule histograms into
    ``args.output_dir``.
    """
    args = get_args()
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    raw_rules = load_rules(args.input)
    df = df_from_rules(raw_rules)
    df["type_str"] = df["type_str"].map(rule_name_for_paper)
    print("Total count of raw rules", df.shape[0])

    # optionally build a second frame from the sampler-summarized rules
    df_sampler = None
    if args.rule_sampler is not None:
        sampler = get_rule_sampler(
            args.rule_sampler, raw_rules, random_state=args.seed)
        flattened = [
            rule
            for rule_group in sampler.rule_map.values()
            for rule in rule_group
        ]
        df_sampler = df_from_rules(flattened)
        print("Total count of summarized rules", df_sampler.shape[0])

    ax = histogram_rule_type(df)
    savefig(ax, os.path.join(args.output_dir, "rule_type.pdf"))

    if df_sampler is not None:
        df_sampler["strategy"] = "summarized"
        ax = histogram_rule_type(df_sampler)
        # remove y axis label (shared with other plot)
        ax.get_yaxis().set_visible(False)
        plt.tight_layout()
        savefig(ax, os.path.join(args.output_dir, "rule_type_summarized.pdf"))

    ax = histogram_hyperrule_component(df)
    savefig(ax, os.path.join(args.output_dir, "hyperrules.pdf"))

    ax = histogram_comprule_component(df)
    savefig(ax, os.path.join(args.output_dir, "comprules.pdf"))
def main():
    """Evaluate pipeline repair on a held-out test set of pipelines.

    Resolves the (rule, enumeration) strategy pair, loads rules unless the
    strategy is pure mutation, samples and time-orders test pipelines, runs
    the evaluation, and pickles both the result frame and the enumerator
    statistics.

    Fix: removed a dead duplicate ``rules = []`` assignment (the list was
    re-initialized immediately before use).
    """
    args = get_args()
    # Seed both NumPy and stdlib RNGs so sampling/enumeration is reproducible.
    if args.random_state is not None:
        np.random.seed(args.random_state)
        random.seed(args.random_state)

    if args.predefined_strategy is not None:
        # A predefined strategy fully determines the strategy pair; the
        # individual flags must not also be set.
        assert args.rule_strategy is None
        assert args.enumeration_strategy is None
        if args.predefined_strategy == "weighted-transducer":
            args.rule_strategy = "weighted"
            args.enumeration_strategy = "beam"
        elif args.predefined_strategy == "rf-transducer":
            args.rule_strategy = "predictive"
            args.enumeration_strategy = "beam"
        elif args.predefined_strategy == "random-mutation":
            args.rule_strategy = "mutation"
            args.enumeration_strategy = "beam"
            # mutation explores exactly as many candidates as repairs asked for
            args.bound_k = args.bound_num_repaired_pipelines
        else:
            raise Exception(
                f"Unknown predefined_strategy: {args.predefined_strategy}")

    rules = []
    # mutation-based repair does not consume learned rules
    if args.rule_strategy != "mutation":
        for p in args.rules:
            with open(p, "rb") as fin:
                rule_corpus = pickle.load(fin)
            rules.extend(rule_corpus.rules)

    rule_sampler = get_rule_sampler(
        args.rule_strategy,
        rules,
        args.random_state,
    )
    enumerator = get_tree_enumerator(
        args.enumeration_strategy,
        rule_sampler,
        force_apply=(args.rule_strategy == "mutation"))

    df_test_data = pd.read_pickle(args.test)
    # only focus on pipelines that worked before
    df_test_data = df_test_data[~df_test_data["failed"]]
    df_test_data = df_test_data.reset_index(drop=True)
    if df_test_data.shape[0] > args.num_test_pipelines:
        df_test_data = df_test_data.sample(
            n=args.num_test_pipelines,
            replace=False,
            random_state=args.random_state,
        )
    df_test_data = df_test_data.sort_values("timestamp", ascending=True)
    # graph representation of the pipeline
    test_trees = df_test_data["obj_graph"].values
    timestamps = df_test_data["timestamp"].values
    timestamped_test_trees = zip(test_trees, timestamps)

    test_dataset_name = df_test_data.iloc[0]["dataset"]
    test_dataset = utils.get_dataset(
        test_dataset_name,
        replace_nan_and_inf=0.0,
    )
    with open(args.idx_search, "rb") as fin:
        idx_search = pickle.load(fin)

    df_results = run_evaluation(
        test_dataset,
        timestamped_test_trees,
        enumerator,
        args.bound_num_repaired_pipelines,
        idx_search=idx_search,
        dev_cv=args.dev_cv,
        bound_k=args.bound_k,
        cv=args.cv,
        scoring=args.scoring,
        random_state=args.random_state,
    )
    df_results["dataset"] = test_dataset_name
    if args.predefined_strategy is not None:
        df_results["strategy"] = args.predefined_strategy
    df_results["rule_strategy"] = args.rule_strategy
    df_results["enumeration_strategy"] = args.enumeration_strategy

    print("Dumping results to", args.output)
    df_results.to_pickle(args.output)
    print("Dumping enumerator statistics")
    with open(args.output + "-enumerator-statistics", "wb") as fout:
        enumerator.statistics.strategy = args.predefined_strategy
        pickle.dump(enumerator.statistics, fout)
def main():
    """Evaluate pipeline repair on a collection of scripts.

    Resolves the (rule, enumeration) strategy pair, loads rules unless the
    strategy is pure mutation, runs the evaluation over ``args.scripts``,
    and pickles both the result frame and the enumerator statistics.

    Fix: removed a dead duplicate ``rules = []`` assignment (the list was
    re-initialized immediately before use).
    """
    args = get_args()
    # Seed both NumPy and stdlib RNGs so sampling/enumeration is reproducible.
    if args.random_state is not None:
        np.random.seed(args.random_state)
        random.seed(args.random_state)

    if args.predefined_strategy is not None:
        # A predefined strategy fully determines the strategy pair; the
        # individual flags must not also be set.
        assert args.rule_strategy is None
        assert args.enumeration_strategy is None
        if args.predefined_strategy == "weighted-transducer":
            args.rule_strategy = "weighted"
            args.enumeration_strategy = "beam"
        elif args.predefined_strategy == "rf-transducer":
            args.rule_strategy = "predictive"
            args.enumeration_strategy = "beam"
        elif args.predefined_strategy == "random-mutation":
            args.rule_strategy = "mutation"
            args.enumeration_strategy = "beam"
            # mutation explores exactly as many candidates as repairs asked for
            args.bound_k = args.bound_num_repaired_pipelines
        else:
            raise Exception(
                f"Unknown predefined_strategy: {args.predefined_strategy}")

    rules = []
    # mutation-based repair does not consume learned rules
    if args.rule_strategy != "mutation":
        for p in args.rules:
            with open(p, "rb") as fin:
                rule_corpus = pickle.load(fin)
            rules.extend(rule_corpus.rules)

    rule_sampler = get_rule_sampler(
        args.rule_strategy,
        rules,
        args.random_state,
    )
    enumerator = get_tree_enumerator(
        args.enumeration_strategy,
        rule_sampler,
        force_apply=(args.rule_strategy == "mutation"))

    df_results = run_evaluation(
        args.scripts,
        enumerator,
        args.bound_num_repaired_pipelines,
        scoring=args.scoring,
        max_size=args.max_size,
        dev_fraction=args.dev_fraction,
        dev_cv=args.dev_cv,
        bound_k=args.bound_k,
        cv=args.cv,
        random_state=args.random_state,
    )
    if args.predefined_strategy is not None:
        df_results["strategy"] = args.predefined_strategy
    df_results["rule_strategy"] = args.rule_strategy
    df_results["enumeration_strategy"] = args.enumeration_strategy

    print("Dumping results to", args.output)
    df_results.to_pickle(args.output)
    print("Dumping enumerator statistics")
    with open(args.output + "-enumerator-statistics", "wb") as fout:
        enumerator.statistics.strategy = args.predefined_strategy
        pickle.dump(enumerator.statistics, fout)