def wf_single_fit(fworker, fit_name, pipe_config, name, df, target, tags=None):
    """
    Submit a dataset to be fit for a single pipeline (i.e., to train on a
    dataset for real predictions).
    """
    # todo this is not working probably
    warnings.warn(
        "Single fitted MatPipe not being stored in automatminer db "
        "collections. Please consult fw_spec to find the benchmark "
        "on {}".format(fworker)
    )
    if fworker not in VALID_FWORKERS:
        raise ValueError("fworker must be in {}".format(VALID_FWORKERS))

    # df is currently unused here; data_file is left as None (see todo above).
    data_file = None

    now = get_time_str()
    base_save_dir = now + "_single_fit"

    spec = {
        "pipe_config": pipe_config,
        "base_save_dir": base_save_dir,
        "data_file": data_file,
        "target": target,
        "automatminer_commit": get_last_commit(),
        "tags": tags if tags else [],
        "_fworker": fworker,
    }

    fw_name = "{} single fit".format(name)
    wf_name = "single fit: {} ({}) [{}]".format(name, fit_name, fworker)

    fw = Firework(RunSingleFit(), spec=spec, name=fw_name)
    wf = Workflow([fw], metadata={"tags": tags}, name=wf_name)
    return wf
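# Example (sketch) of how wf_single_fit might be submitted to a FireWorks
# LaunchPad. The worker name, pipe_config contents, dataset, and target column
# below are illustrative assumptions, not defined by this module.
#
#     from fireworks import LaunchPad
#     from matminer.datasets import load_dataset
#
#     df = load_dataset("matbench_jdft2d")
#     pipe_config = {
#         "learner_kwargs": {"n_jobs": 8},          # hypothetical contents
#         "autofeaturizer_kwargs": {"n_jobs": 8},
#     }
#     wf = wf_single_fit(
#         fworker="lrc",                            # assumed to be in VALID_FWORKERS
#         fit_name="jdft2d fit",
#         pipe_config=pipe_config,
#         name="jdft2d",
#         df=df,
#         target="exfoliation_en",
#         tags=["example"],
#     )
#     LaunchPad.auto_load().add_wf(wf)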
        else:
            raise ValueError(f"Worker {worker} not valid!")

        # `o` is produced earlier in this function (not shown in this
        # fragment); the last line of its decoded output, after dropping the
        # empty trailing entry, is used as the login password.
        o_utf = o.decode("utf-8")
        o_all = o_utf.split("\n")
        o_all.remove("")
        password = o_all[-1]

        # Copy the file to the remote home directory over SSH/SCP.
        ssh = SSHClient()
        ssh.load_system_host_keys()
        ssh.connect(host, username=user, password=password, look_for_keys=False)
        with SCPClient(ssh.get_transport()) as scp:
            scp.put(
                filepath,
                recursive=True,
                remote_path="/global/home/users/ardunn",
            )
    else:
        pass


if __name__ == "__main__":
    import pandas as pd
    from matminer.datasets import load_dataset

    from automatminer_dev.workflows.util import get_time_str

    df = load_dataset("matbench_jdft2d")
    transfer_data(df, "lrc", get_time_str())
def wf_benchmark(
    fworker,
    pipe_config,
    name,
    data_file,
    target,
    problem_type,
    clf_pos_label,
    cache=True,
    kfold_config=KFOLD_DEFAULT,
    tags=None,
    return_fireworks=False,
    add_dataset_to_names=True,
    build_id=None,
    prepend_name="",
):
    """
    Create a workflow benchmarking a single pipeline configuration on one
    dataset via k-fold cross-validation. Each fold runs as its own Firework,
    and a final Firework consolidates the per-fold results into a benchmark.
    """
    if fworker not in VALID_FWORKERS:
        raise ValueError("fworker must be in {}".format(VALID_FWORKERS))

    # if fworker == "cori":
    #     n_cori_jobs = 32
    #     warnings.warn(
    #         "Worker is cori. Overriding n_jobs to {}".format(n_cori_jobs))
    #     pipe_config["learner_kwargs"]["n_jobs"] = n_cori_jobs
    #     pipe_config["autofeaturizer_kwargs"]["n_jobs"] = n_cori_jobs

    # Single (run) hash is the combination of pipe configuration + last commit
    # + data_file
    last_commit = get_last_commit()
    benchmark_config_for_hash = copy.deepcopy(pipe_config)
    benchmark_config_for_hash["last_commit"] = last_commit
    benchmark_config_for_hash["data_file"] = data_file
    benchmark_config_for_hash["worker"] = fworker
    benchmark_config_for_hash = str(benchmark_config_for_hash).encode("UTF-8")
    benchmark_hash = hashlib.sha1(benchmark_config_for_hash).hexdigest()[:10]
    base_save_dir = get_time_str() + "_" + benchmark_hash

    common_spec = {
        "pipe_config": pipe_config,
        "base_save_dir": base_save_dir,
        "kfold_config": kfold_config,
        "data_file": data_file,
        "target": target,
        "clf_pos_label": clf_pos_label,
        "problem_type": problem_type,
        "automatminer_commit": last_commit,
        "name": name,
        "benchmark_hash": benchmark_hash,
        "tags": tags if tags else [],
        "cache": cache,
        "build_id": build_id,
        "_fworker": fworker,
    }

    dataset_name = "" if not add_dataset_to_names else name + " "

    # One Firework per fold; fold 0 additionally runs featurization when
    # caching is enabled so the remaining folds can reuse the cached features.
    fws_all_folds = []
    kfold = KFold(**kfold_config)
    for fold in range(kfold.n_splits):
        save_dir = os.path.join("fold_{}".format(fold))
        foldspec = copy.deepcopy(common_spec)
        foldspec["fold"] = fold
        foldspec["save_dir"] = save_dir

        if fold == 0 and cache:
            pipename = "{}fold {} + featurization ({})".format(
                dataset_name, fold, benchmark_hash
            )
        else:
            pipename = "{}fold {} ({})".format(dataset_name, fold, benchmark_hash)

        fws_all_folds.append(
            Firework(
                [RunPipe(), StorePipeResults()], spec=foldspec, name=pipename
            )
        )

    fw_consolidate = Firework(
        ConsolidatePipesToBenchmark(),
        spec=common_spec,
        name="bench merge ({})".format(benchmark_hash),
    )

    if cache:
        # Fold 0 must finish (and populate the cache) before the other folds
        # run; every fold then feeds into the consolidation Firework.
        fw_fold0 = fws_all_folds[0]
        fws_folds = fws_all_folds[1:]
        links = {fw: [fw_consolidate] for fw in fws_folds}
        links[fw_fold0] = fws_folds
        links[fw_consolidate] = []
    else:
        # Without caching, all folds run independently and feed consolidation.
        links = {fw: [fw_consolidate] for fw in fws_all_folds}
        links[fw_consolidate] = []
        fw_fold0 = fws_all_folds

    if return_fireworks:
        connected_to_top_wf = [fw_fold0] if cache else fw_fold0
        return links, connected_to_top_wf, fw_consolidate
    else:
        wf_name = "benchmark {}: ({}) [{}]".format(benchmark_hash, name, fworker)
        if prepend_name:
            wf_name = "<<{}>> {}".format(prepend_name, wf_name)

        wf = Workflow(
            list(links.keys()),
            links_dict=links,
            name=wf_name,
            metadata={"benchmark_hash": benchmark_hash, "tags": tags},
        )
        return wf
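# Example (sketch) of submitting a benchmark workflow to a FireWorks
# LaunchPad. The worker name, data file path, pipe_config contents, and target
# column below are illustrative assumptions, not defined by this module.
#
#     from fireworks import LaunchPad
#
#     pipe_config = {
#         "learner_kwargs": {"n_jobs": 8},          # hypothetical contents
#         "autofeaturizer_kwargs": {"n_jobs": 8},
#     }
#     wf = wf_benchmark(
#         fworker="lrc",                            # assumed to be in VALID_FWORKERS
#         pipe_config=pipe_config,
#         name="jdft2d",
#         data_file="jdft2d.json.gz",               # hypothetical path on the worker
#         target="exfoliation_en",
#         problem_type="regression",
#         clf_pos_label=None,
#         tags=["example"],
#     )
#     LaunchPad.auto_load().add_wf(wf)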