def _run_in_clean_lab(remote_url: str, add_in_name: str, full_hash: str, cuda_id: int) -> None:
    """Run one experiment from a fresh clone and tag the commit with the outcome.

    The repository is cloned into a temporary directory, ``full_hash`` is checked
    out (including submodules) and ``summer.run(add_in_name)`` is executed in a
    child Python process. The combined stdout/stderr of that process becomes the
    message of an annotated tag on ``full_hash``; the tag name starts with
    ``FAILED_ON`` or ``SUCCEEDED_ON`` depending on the child's return code and is
    pushed to ``origin``.

    note: the working directory of the calling process is changed while the
    experiment runs. It is restored before the temporary directory is removed, so
    the caller is not left inside a deleted directory.

    :param remote_url: of git repo
    :param add_in_name: name handed to ``summer.run`` in the child process
    :param full_hash: git commit hash to be run from
    :param cuda_id: value exported to the child process under ``CUDA_VISIBLE_DEVICES_NAME``
    :raises Exception: anything raised while cloning, running or tagging is logged
        and re-raised unchanged
    """
    try:
        previous_cwd = os.getcwd()
    except OSError:
        # The current directory no longer exists, so there is nothing to return to.
        previous_cwd = None

    with TemporaryDirectory() as lab:
        try:
            os.chdir(lab)
            logger.debug("running in clean lab: %s", lab)
            try:
                repo_name = f"summer_{full_hash}"
                pbs3.git.clone("--recurse-submodules", "-j8", remote_url, repo_name)
                os.chdir(repo_name)
                # Fetch the commit explicitly: it may not be reachable from the
                # refs that a plain clone brings along.
                pbs3.git("fetch", "origin", full_hash)
                pbs3.git("checkout", "--force", full_hash)
                pbs3.git("submodule", "update", "--recursive")

                # The child imports `summer` from the fresh clone and only sees
                # the requested CUDA device.
                test_env = os.environ.copy()
                test_env[PYTHON_PATH_NAME] = (
                    os.path.join(lab, repo_name)
                    + os.pathsep
                    + test_env.get(PYTHON_PATH_NAME, "")
                )
                test_env[CUDA_VISIBLE_DEVICES_NAME] = str(cuda_id)

                # NOTE(review): add_in_name is interpolated verbatim into Python
                # source; it must not contain double quotes or backslashes.
                cmd = ["python", "-c", f'import summer;summer.run("{add_in_name}")']
                out = subprocess.run(
                    cmd,
                    env=test_env,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.STDOUT,
                    start_new_session=True,
                )

                timestamp = datetime.now().strftime("%y-%m-%d_%H-%M")
                prefix = FAILED_ON if out.returncode else SUCCEEDED_ON
                tag = f"{prefix}{timestamp}"

                # Single quotes in the output are replaced because the message
                # itself is wrapped in single quotes.
                msg = out.stdout.decode("utf-8").replace("'", '"')
                pbs3.git("tag", "-a", tag, "-m", f"'{msg}'", full_hash)
                pbs3.git("push", "origin", tag)
                logger.debug("pushed tag '%s' to '%s' at %s", tag, "origin", full_hash)
            except Exception as e:
                logger.exception(e)
                raise
        finally:
            # Leave the lab before TemporaryDirectory deletes it.
            if previous_cwd is not None:
                os.chdir(previous_cwd)
def __init__(self):
    """Validate the logging configuration and record git metadata of the run.

    Sets ``log_config``, ``commit_hash`` and ``commit_subject``; fills in
    ``add_in_name`` from the current branch name when it is ``None``.

    :raises AssertionError: if a logging interval unit is neither "iterations"
        nor "epochs"
    :raises ValueError: if validation would use up every sample of a dataset
        that is also used for testing
    """
    self.log_config = LogConfig()
    # Both logging intervals carry their unit in position 1.
    for interval_name in ("log_scalars_every", "log_images_every"):
        unit = getattr(self.log_config, interval_name)[1]
        assert unit in ("iterations", "epochs"), unit

    head = pbs3.git("rev-parse", "--verify", "HEAD")
    self.commit_hash = head.stdout

    # Only the first line of the commit message is kept.
    full_message = pbs3.git.log("-1", "--pretty=%B").stdout
    self.commit_subject = full_message.split("\n")[0]

    if self.add_in_name is None:
        # Fall back to the current branch name with all quote characters removed.
        branch = pbs3.git("rev-parse", "--abbrev-ref", "HEAD").stdout.strip()
        for quote in ("'", '"'):
            branch = branch.replace(quote, "")
        self.add_in_name = branch

    same_dataset = self.valid_dataset == self.test_dataset
    if same_dataset and self.max_validation_samples >= len(self.test_dataset):
        raise ValueError("no samples for testing left")
parser.add_argument(
    "--rerun-failed",
    action="store_true",
    help="whether or not to rerun previously failed experiments",
)
parser.add_argument(
    "--rerun-succeeded",
    action="store_true",
    help="whether or not to rerun previously succeeded experiments",
)
args = parser.parse_args()

# The --cuda option takes precedence; otherwise fall back to the environment.
cuda_devices = args.cuda
if cuda_devices is None:
    cuda_devices = os.environ.get(CUDA_VISIBLE_DEVICES_NAME)
if cuda_devices is None:
    raise Exception(f"{CUDA_VISIBLE_DEVICES_NAME} not set")

# Resolve the requested start revision to a commit hash.
commit_hash_cmd = pbs3.git("rev-parse", "--verify", args.start)
current_commit_hash = commit_hash_cmd.stdout

# Branch names given without a remote are taken to live on "origin".
remote_branches = [
    bn if "/" in bn else "origin/" + bn for bn in args.remote_branch
]

experimenter(
    start_commit=current_commit_hash.strip(),
    experiment_identifier=args.exp,
    remote_branches=remote_branches,
    rerun_failed=args.rerun_failed,
    rerun_succeeded=args.rerun_succeeded,
)