def test_parse_overrides(self):
    assert parse_overrides("") == {}
    assert parse_overrides("{}") == {}

    override_dict = parse_overrides('{"train_data": "/train", "trainer.num_epochs": 10}')
    assert override_dict == {
            "train_data": "/train",
            "trainer": {
                    "num_epochs": 10
            }
    }

    params = with_fallback(
            preferred=override_dict,
            fallback={
                    "train_data": "/test",
                    "model": "bidaf",
                    "trainer": {"num_epochs": 100, "optimizer": "sgd"}
            })

    assert params == {
            "train_data": "/train",
            "model": "bidaf",
            "trainer": {"num_epochs": 10, "optimizer": "sgd"}
    }
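# A minimal sketch of the semantics exercised by the test above, assuming
# `parse_overrides` unflattens dotted keys from a JSON string and
# `with_fallback` performs a recursive merge preferring its first argument.
# This is a hypothetical re-implementation for illustration, not the library's
# actual code (the real parse_overrides evaluates Jsonnet, not just JSON).
import json
from typing import Any, Dict


def sketch_parse_overrides(serialized: str) -> Dict[str, Any]:
    if not serialized:
        return {}
    nested: Dict[str, Any] = {}
    for dotted_key, value in json.loads(serialized).items():
        node = nested
        *parents, leaf = dotted_key.split(".")
        for part in parents:
            # Walk/create intermediate dicts for each dotted segment.
            node = node.setdefault(part, {})
        node[leaf] = value
    return nested


def sketch_with_fallback(preferred: Dict[str, Any], fallback: Dict[str, Any]) -> Dict[str, Any]:
    merged = dict(fallback)
    for key, value in preferred.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            # Merge nested dicts recursively instead of replacing them wholesale.
            merged[key] = sketch_with_fallback(value, merged[key])
        else:
            merged[key] = value
    return merged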
def yaml_to_params(params_file: str, overrides: str = "") -> Params:
    # redirect to cache, if necessary
    params_file = cached_path(params_file)

    with open(params_file) as f:
        file_dict = yaml.safe_load(f)

    overrides_dict = parse_overrides(overrides)
    param_dict = with_fallback(preferred=overrides_dict, fallback=file_dict)

    return Params(param_dict)
def load_params(param_file, overrides):
    """Param loader with YAML support."""
    if not param_file.endswith(('.yaml', '.yml')):
        return Params.from_file(param_file, overrides)

    with open(param_file) as f:
        file_dict = yaml.safe_load(f)

    overrides_dict = parse_overrides(overrides)
    param_dict = with_fallback(preferred=overrides_dict, fallback=file_dict)
    return Params(param_dict)
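# Usage sketch for the two loaders above (file names are hypothetical).
# `load_params` dispatches on the extension: .yaml/.yml files take the same
# YAML path as `yaml_to_params`, while anything else falls through to
# Params.from_file (Jsonnet/JSON):
#
#   params = load_params("experiment.yml", '{"trainer.num_epochs": 1}')
#   params = load_params("experiment.jsonnet", "")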
def _get_predictor(args: argparse.Namespace) -> Predictor:
    check_for_gpu(args.cuda_device)
    archive = load_archive(args.archive_file,
                           weights_file=args.weights_file,
                           cuda_device=args.cuda_device,
                           overrides=args.overrides)

    ov = parse_overrides(args.overrides)
    paper_features_path = None
    try:
        paper_features_path = ov['dataset_reader']['paper_features_path']
    except KeyError:
        pass

    return predictor_from_archive(archive, args.predictor, paper_features_path)
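# An equivalent lookup without try/except, shown for illustration only
# (`ov` is the parsed overrides dict above; both forms tolerate a missing
# 'dataset_reader' section or a missing 'paper_features_path' key):
#
#   paper_features_path = ov.get('dataset_reader', {}).get('paper_features_path')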
def load_archive_from_folder(archive_file: str,
                             cuda_device: int = -1,
                             overrides: str = "",
                             weights_file: str = None) -> Archive:
    # redirect to the cache, if necessary
    resolved_archive_file = cached_path(archive_file)
    logger.info(f"loading model from directory {archive_file}")

    serialization_dir = resolved_archive_file

    # Check for supplemental files in archive
    fta_filename = os.path.join(serialization_dir, _FTA_NAME)
    if os.path.exists(fta_filename):
        with open(fta_filename, 'r') as fta_file:
            files_to_archive = json.loads(fta_file.read())

        # Add these replacements to overrides
        replacements_dict: Dict[str, Any] = {}
        for key, filename in files_to_archive.items():
            if not filename.startswith("/"):
                filename = os.path.join(serialization_dir, f"fta/{key}")
            replacements_dict[key] = filename

        overrides_dict = parse_overrides(overrides)
        combined_dict = with_fallback(preferred=unflatten(replacements_dict),
                                      fallback=overrides_dict)
        overrides = json.dumps(combined_dict)

    # Load config
    config = Params.from_file(os.path.join(serialization_dir, CONFIG_NAME), overrides)
    config.loading_from_archive = True

    if weights_file:
        weights_path = weights_file
    else:
        weights_path = os.path.join(serialization_dir, _WEIGHTS_NAME)

    # Instantiate model. Use a duplicate of the config, as it will get consumed.
    model = Model.load(config.duplicate(),
                       weights_file=weights_path,
                       serialization_dir=serialization_dir,
                       cuda_device=cuda_device)

    return Archive(model=model, config=config)
def train_func(config, reporter):
    logger.debug(f"CUDA_VISIBLE_DEVICES: {os.environ['CUDA_VISIBLE_DEVICES']}")

    for package_name in getattr(run_args, "include_package", ()):
        import_submodules(package_name)

    run_parameters = {k: json.dumps(v) for k, v in config.items()}
    file_dict = json.loads(
        _jsonnet.evaluate_snippet("config", parameter_file_snippet, tla_codes=run_parameters))

    if default_args.num_gpus == 0:
        logger.warning("No GPU specified, using CPU.")
        file_dict["trainer"]["cuda_device"] = -1

    overrides_dict = parse_overrides(run_args.overrides)
    params_dict = with_fallback(preferred=overrides_dict, fallback=file_dict)

    # Make sure paths are absolute (Ray workers do not share the driver's working dir)
    train_data_path = params_dict["train_data_path"]
    validation_data_path = params_dict.get("validation_data_path")
    if not os.path.isabs(train_data_path):
        params_dict["train_data_path"] = os.path.abspath(
            os.path.join(default_args.cwd, train_data_path))
    if validation_data_path and not os.path.isabs(validation_data_path):
        params_dict["validation_data_path"] = os.path.abspath(
            os.path.join(default_args.cwd, validation_data_path))

    params = Params(params_dict)
    logger.debug(f"AllenNLP Configuration: {params.as_dict()}")

    train_model(params=params, serialization_dir="./trial/")

    reporter(done=True)
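# Sketch of wiring train_func into Ray Tune's older, reporter-style function
# API (which matches the (config, reporter) signature above). The search space
# and resource numbers are hypothetical:
#
#   from ray import tune
#   tune.run(train_func,
#            config={"lr": tune.grid_search([1e-3, 1e-4])},
#            resources_per_trial={"cpu": 1, "gpu": default_args.num_gpus})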
def from_file(params_file: str, params_overrides: str = "", ext_vars: dict = None) -> 'Params':
    """
    Load a `Params` object from a configuration file.

    Parameters
    ----------
    params_file : ``str``
        The path to the configuration file to load.
    params_overrides : ``str``, optional
        A serialized dict of overrides that can be applied to the final object,
        e.g. '{"model.embedding_dim": 10}'.
    ext_vars : ``dict``, optional
        Our config files are Jsonnet, which allows specifying external variables
        for later substitution. Typically we substitute these using environment
        variables; however, you can also specify them here, in which case they
        take priority over environment variables.
        e.g. {"HOME_DIR": "/Users/allennlp/home"}
    """
    if ext_vars is None:
        ext_vars = {}
    options = ext_vars

    # Jsonnet external variables must be strings, so stringify any non-string values.
    ext_vars = {k: str(v) for k, v in ext_vars.items()}

    # redirect to cache, if necessary
    params_file = cached_path(params_file)
    ext_vars = {**_environment_variables(), **ext_vars}

    file_dict = json.loads(evaluate_file(params_file, ext_vars=ext_vars))

    overrides_dict = parse_overrides(params_overrides)
    param_dict = with_fallback(preferred=overrides_dict, fallback=file_dict)

    return OptionsParams(param_dict, options=options)
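# Example (hypothetical file): a Jsonnet config that reads an external variable,
#
#   // experiment.jsonnet
#   { "data_dir": std.extVar("HOME_DIR") + "/data" }
#
# loaded with an explicit ext_var, which takes priority over any environment
# variable of the same name:
#
#   params = from_file("experiment.jsonnet", ext_vars={"HOME_DIR": "/Users/allennlp/home"})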
def _load_archive(archive_file: str,
                  adapters_dir: str,
                  cuda_device: int = -1,
                  overrides: str = "",
                  weights_file: str = None):
    """
    Instantiates an Archive from an archived `tar.gz` file.

    Parameters
    ----------
    archive_file: ``str``
        The archive file to load the model from.
    adapters_dir: ``str``
        The directory to load adapter weights from (passed through to ``_load``).
    weights_file: ``str``, optional (default = None)
        The weights file to use. If unspecified, weights.th in the archive_file will be used.
    cuda_device: ``int``, optional (default = -1)
        If `cuda_device` is >= 0, the model will be loaded onto the
        corresponding GPU. Otherwise it will be loaded onto the CPU.
    overrides: ``str``, optional (default = "")
        JSON overrides to apply to the unarchived ``Params`` object.
    """
    # redirect to the cache, if necessary
    resolved_archive_file = cached_path(archive_file)

    if resolved_archive_file == archive_file:
        logger.info(f"loading archive file {archive_file}")
    else:
        logger.info(f"loading archive file {archive_file} from cache at {resolved_archive_file}")

    if os.path.isdir(resolved_archive_file):
        serialization_dir = resolved_archive_file
    else:
        # Extract archive to temp dir
        tempdir = tempfile.mkdtemp()
        logger.info(f"extracting archive file {resolved_archive_file} to temp dir {tempdir}")
        with tarfile.open(resolved_archive_file, 'r:gz') as archive:
            archive.extractall(tempdir)
        # Postpone cleanup until exit in case the unarchived contents are needed outside
        # this function.
        atexit.register(_cleanup_archive_dir, tempdir)

        serialization_dir = tempdir

    # Check for supplemental files in archive
    fta_filename = os.path.join(serialization_dir, "files_to_archive.json")
    if os.path.exists(fta_filename):
        with open(fta_filename, 'r') as fta_file:
            files_to_archive = json.loads(fta_file.read())

        # Add these replacements to overrides
        replacements_dict: Dict[str, Any] = {}
        for key, original_filename in files_to_archive.items():
            replacement_filename = os.path.join(serialization_dir, f"fta/{key}")
            if os.path.exists(replacement_filename):
                replacements_dict[key] = replacement_filename
            else:
                logger.warning(f"Archived file {replacement_filename} not found! At train time "
                               f"this file was located at {original_filename}. This may be "
                               "because you are loading a serialization directory. Attempting to "
                               "load the file from its train-time location.")

        overrides_dict = parse_overrides(overrides)
        combined_dict = with_fallback(preferred=overrides_dict,
                                      fallback=unflatten(replacements_dict))
        overrides = json.dumps(combined_dict)

    # Load config
    config = Params.from_file(os.path.join(serialization_dir, "config.json"), overrides)
    config.loading_from_archive = True

    if weights_file:
        weights_path = weights_file
    else:
        weights_path = os.path.join(serialization_dir, "weights.th")
        # Fallback for serialization directories.
        if not os.path.exists(weights_path):
            weights_path = os.path.join(serialization_dir, "best.th")

    # Instantiate model. Use a duplicate of the config, as it will get consumed.
    model = _load(config.duplicate(),
                  adapters_dir=adapters_dir,
                  weights_file=weights_path,
                  serialization_dir=serialization_dir,
                  cuda_device=cuda_device)

    return Archive(model=model, config=config)
def load_archive(archive_file: str,
                 cuda_device: int = -1,
                 overrides: str = "",
                 weights_file: str = None) -> Archive:
    """
    Instantiates an Archive from an archived `tar.gz` file.

    Parameters
    ----------
    archive_file: ``str``
        The archive file to load the model from.
    weights_file: ``str``, optional (default = None)
        The weights file to use. If unspecified, weights.th in the archive_file will be used.
    cuda_device: ``int``, optional (default = -1)
        If `cuda_device` is >= 0, the model will be loaded onto the
        corresponding GPU. Otherwise it will be loaded onto the CPU.
    overrides: ``str``, optional (default = "")
        JSON overrides to apply to the unarchived ``Params`` object.
    """
    # redirect to the cache, if necessary
    resolved_archive_file = cached_path(archive_file)

    if resolved_archive_file == archive_file:
        logger.info(f"loading archive file {archive_file}")
    else:
        logger.info(f"loading archive file {archive_file} from cache at {resolved_archive_file}")

    tempdir = None
    if os.path.isdir(resolved_archive_file):
        serialization_dir = resolved_archive_file
    else:
        # Extract archive to temp dir
        tempdir = tempfile.mkdtemp()
        logger.info(f"extracting archive file {resolved_archive_file} to temp dir {tempdir}")
        with tarfile.open(resolved_archive_file, 'r:gz') as archive:
            archive.extractall(tempdir)

        serialization_dir = tempdir

    # Check for supplemental files in archive
    fta_filename = os.path.join(serialization_dir, _FTA_NAME)
    if os.path.exists(fta_filename):
        with open(fta_filename, 'r') as fta_file:
            files_to_archive = json.loads(fta_file.read())

        # Add these replacements to overrides
        replacements_dict: Dict[str, Any] = {}
        for key in files_to_archive:
            replacement_filename = os.path.join(serialization_dir, f"fta/{key}")
            replacements_dict[key] = replacement_filename

        overrides_dict = parse_overrides(overrides)
        combined_dict = with_fallback(preferred=unflatten(replacements_dict),
                                      fallback=overrides_dict)
        overrides = json.dumps(combined_dict)

    # Load config
    config = Params.from_file(os.path.join(serialization_dir, CONFIG_NAME), overrides)
    config.loading_from_archive = True

    if weights_file:
        weights_path = weights_file
    else:
        weights_path = os.path.join(serialization_dir, _WEIGHTS_NAME)

    # Instantiate model. Use a duplicate of the config, as it will get consumed.
    model = Model.load(config.duplicate(),
                       weights_file=weights_path,
                       serialization_dir=serialization_dir,
                       cuda_device=cuda_device)

    if tempdir:
        # Clean up temp dir
        shutil.rmtree(tempdir)

    return Archive(model=model, config=config)
def load_archive(archive_file: str,
                 cuda_device: int = -1,
                 overrides: str = "",
                 weights_file: str = None) -> Archive:
    """
    Instantiates an Archive from an archived `tar.gz` file.

    Parameters
    ----------
    archive_file: ``str``
        The archive file to load the model from.
    weights_file: ``str``, optional (default = None)
        The weights file to use. If unspecified, weights.th in the archive_file will be used.
    cuda_device: ``int``, optional (default = -1)
        If `cuda_device` is >= 0, the model will be loaded onto the
        corresponding GPU. Otherwise it will be loaded onto the CPU.
    overrides: ``str``, optional (default = "")
        JSON overrides to apply to the unarchived ``Params`` object.
    """
    # redirect to the cache, if necessary
    resolved_archive_file = cached_path(archive_file)

    if resolved_archive_file == archive_file:
        logger.info(f"loading archive file {archive_file}")
    else:
        logger.info(f"loading archive file {archive_file} from cache at {resolved_archive_file}")

    if os.path.isdir(resolved_archive_file):
        serialization_dir = resolved_archive_file
    else:
        # Extract archive to temp dir
        tempdir = tempfile.mkdtemp()
        logger.info(f"extracting archive file {resolved_archive_file} to temp dir {tempdir}")
        with tarfile.open(resolved_archive_file, 'r:gz') as archive:
            archive.extractall(tempdir)
        # Postpone cleanup until exit in case the unarchived contents are needed outside
        # this function.
        atexit.register(_cleanup_archive_dir, tempdir)

        serialization_dir = tempdir

    # Check for supplemental files in archive
    fta_filename = os.path.join(serialization_dir, _FTA_NAME)
    if os.path.exists(fta_filename):
        with open(fta_filename, 'r') as fta_file:
            files_to_archive = json.loads(fta_file.read())

        # Add these replacements to overrides
        replacements_dict: Dict[str, Any] = {}
        for key in files_to_archive:
            replacement_filename = os.path.join(serialization_dir, f"fta/{key}")
            replacements_dict[key] = replacement_filename

        overrides_dict = parse_overrides(overrides)
        combined_dict = with_fallback(preferred=unflatten(replacements_dict),
                                      fallback=overrides_dict)
        overrides = json.dumps(combined_dict)

    # Load config
    config = Params.from_file(os.path.join(serialization_dir, CONFIG_NAME), overrides)
    config.loading_from_archive = True

    if weights_file:
        weights_path = weights_file
    else:
        weights_path = os.path.join(serialization_dir, _WEIGHTS_NAME)

    # Instantiate model. Use a duplicate of the config, as it will get consumed.
    model = Model.load(config.duplicate(),
                       weights_file=weights_path,
                       serialization_dir=serialization_dir,
                       cuda_device=cuda_device)

    return Archive(model=model, config=config)
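# Usage sketch for the archive loaders above (path, override, and predictor
# name are hypothetical): load a trained archive onto GPU 0, overriding a
# dataset reader setting at load time.
#
#   archive = load_archive("model.tar.gz",
#                          cuda_device=0,
#                          overrides='{"dataset_reader.lazy": true}')
#   predictor = Predictor.from_archive(archive, "my_predictor")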
def setup(args):
    """
    Create the blackbox function to optimize. This is a complex function that wraps
    the true parameter setting and training in subprocess calls to allennlp.
    """
    base_config = json.loads(_jsonnet.evaluate_file(args.base_config_path))
    search_config = json.loads(_jsonnet.evaluate_file(args.search_config_path))
    arg_overrides = parse_overrides(args.overrides)

    # Flatten configs and get shorthand mappings
    flat_base_config = flatten(base_config)
    flat_search_config = flatten(search_config)
    shorthands = get_shorthands(flat_search_config)

    # Extract any variable dimensions and the mapping to their keys
    search_space = extract_search_space(flat_search_config)
    lambdas = extract_lambdas(flat_search_config)
    dimensions = list(search_space.values())

    # We no longer use the base config as an initial point because the base config
    # needs to be minimal -- it cannot contain fields which aren't used by certain
    # hyperparameter configurations, since overrides cannot "delete" a field in the
    # base config.
    x0 = None  # get_x0(flat_base_config, search_space)

    trial_num = 0
    trial_paths = dict()

    # Construct f
    def f(x):
        nonlocal trial_num
        nonlocal trial_paths

        # Map x onto the config keys that need to be updated, converting numpy
        # scalars to native Python types so they serialize cleanly.
        newx = []
        for d, p in zip(dimensions, x):
            print(d.name, d, p, type(p))
            if 'numpy' in str(type(p)):
                p = p.item()
            newx.append(p)
        x = newx

        overrides = skopt.utils.point_asdict(search_space, x)
        overrides = fill_search_constants(overrides, flat_search_config)
        overrides = restrict_type_overrides(overrides, flat_search_config)
        # print(f'Overrides after fill and restrict: {json.dumps(overrides, indent=2)}')

        # Construct the trial serialization path
        trial_str = construct_trial_name(overrides, shorthands, trial_num)
        trial_path = os.path.join(args.serialization_dir, trial_str)
        trial_paths[trial_num] = trial_path

        # Construct the overrides string
        processed_overrides = format_overrides(overrides, lambdas, base_config, arg_overrides)
        print(f'Sampled config: {json.dumps(processed_overrides, indent=2)}')
        override_str = json.dumps(processed_overrides, indent=None)

        # Run the allennlp train subprocess
        cmd = (f"allennlp train {args.base_config_path} -f -s {trial_path} -o '{override_str}' "
               f"--file-friendly-logging --include-package {args.include_package}")
        print(f'CMD: {cmd}')
        try:
            subprocess.check_call(cmd, shell=True)
        except Exception as e:
            logger.error(e, exc_info=True)
            raise e

        trial_num += 1

        # Retrieve the best validation metric and return that value
        metrics = json.load(open(os.path.join(trial_path, 'metrics.json')))
        validation_metric = base_config['trainer']['validation_metric']
        negate = validation_metric.startswith('+')
        validation_metric = validation_metric.lstrip('+-')
        y = metrics[f'best_validation_{validation_metric}']
        if negate:
            y = -y
        return y

    # Construct a callback which maintains only the best weights/archive
    def delete_worse_files_cb(results):
        """Remove .th and .gz files for any trials that aren't the best so far."""
        nonlocal trial_num
        nonlocal trial_paths
        logger.info(f'DELETE WORSE FILES, trial num: {trial_num}')
        best_trial_num = np.argmin(results.func_vals).item()
        logger.info(f'Func values: {results.func_vals}, best is {best_trial_num} '
                    f'with path {trial_paths[best_trial_num]}')
        for i in range(trial_num):
            if i != best_trial_num:
                logger.info(f'Deleting .th and .gz files at {trial_paths[i]}')
                th_path = os.path.join(trial_paths[i], '*.th')
                gz_path = os.path.join(trial_paths[i], '*.gz')
                cmd = f"rm -f {th_path} && rm -f {gz_path}"
                try:
                    subprocess.check_call(cmd, shell=True)
                except Exception as e:
                    logger.error(e, exc_info=True)
                    raise e

    return f, dimensions, x0, trial_paths, delete_worse_files_cb