def load_experiment(path_to_yml_file):
    """
    Loads an experiment given the path to its .yml configuration file.

    If the config has no ``info.experiment_key`` yet, a new experiment is
    created: a comet.ml :class:`Experiment` when ``COMET_API_KEY`` is set,
    otherwise a random 20-character key. The key is exported as the
    ``EXPERIMENT_KEY`` environment variable and the config is re-loaded so
    any references to it in the .yml resolve, then saved back to disk.

    If the key already exists, the experiment was set up before; we try to
    reattach to the existing comet.ml experiment (best-effort).

    Args:
        path_to_yml_file (str): Path to the .yml file defining the experiment.

    Returns:
        tuple: 3-element tuple containing

        - config (*dict*): Loaded experiment configuration.
        - exp (:class:`comet_ml.Experiment`): Comet experiment object, or
          None if comet.ml is unavailable or reattachment failed.
        - path_to_yml_file (*str*): Path to the (possibly re-saved) .yml file.
    """
    config = load_yaml(path_to_yml_file)
    api_key = os.getenv('COMET_API_KEY', None)
    exp = None

    if not config['info']['experiment_key']:
        # First-time setup: mint an experiment key.
        if api_key:
            exp = Experiment(
                api_key=api_key,
                project_name=config['info']['project_name'])
            exp_key = exp.get_key()
        else:
            exp_key = make_random_string(20)

        # Expose the key so ${EXPERIMENT_KEY} references in the .yml resolve
        # on the second load.
        os.environ['EXPERIMENT_KEY'] = exp_key
        _env_variables = env_variables + ['EXPERIMENT_KEY']
        config = load_yaml(path_to_yml_file, _env_variables)
        config['info']['experiment_key'] = exp_key
        path_to_yml_file = save_experiment(config, exp)
    else:
        logging.info(
            f"Experiment is already set up @ {config['info']['output_folder']}!"
        )
        try:
            exp = ExistingExperiment(
                api_key=api_key,
                previous_experiment=config['info']['experiment_key'])
        except Exception:
            # Best-effort reattachment: continue without a comet.ml
            # experiment object, but record why it failed instead of
            # silently swallowing the error (was a bare `except: pass`).
            logging.warning(
                'Could not reattach to existing comet.ml experiment.',
                exc_info=True)

    return config, exp, path_to_yml_file
def main(path_to_yml_file):
    """
    Collects every per-file result .yml produced by an experiment into a
    single :class:`pandas.DataFrame`.

    Args:
        path_to_yml_file (str): Path to the .yml file defining the
            experiment whose results folder will be scanned.

    Returns:
        tuple: 3-element tuple containing

        - results (:class:`pandas.DataFrame`): One row per evaluated source,
          holding its mean metric values plus the flattened experiment config.
        - config (*dict*): The experiment configuration.
        - exp (:class:`comet_ml.Experiment`): Associated experiment, or None.
    """
    config, exp, path_to_yml_file = load_experiment(path_to_yml_file)

    result_files = glob.glob(
        os.path.join(config['info']['output_folder'], 'results', '**.yml'),
        recursive=True)

    rows = []
    for result_file in result_files:
        for entry in load_yaml(result_file, []):
            # 'permutation' is bookkeeping, not a source; skip it.
            source_keys = sorted(entry.keys())
            source_keys.remove('permutation')
            for source_key in source_keys:
                row = {
                    'experiment_key': config['info']['experiment_key'],
                    'notes': config['info']['notes'],
                    'file_name': result_file,
                    'dataset': config['datasets']['test']['folder'],
                    'source_name': source_key.split('/')[-1],
                }
                row.update(flatten(config))
                for metric, values in entry[source_key].items():
                    row[metric] = np.mean(values)
                rows.append(row)

    return pd.DataFrame(rows), config, exp
def sweep_experiment(path_to_yml_file):
    """
    Expands the ``sweep`` section of a base experiment .yml into one
    experiment per combination of swept values (Cartesian product), plus an
    optional cache-population experiment per sweep.

    Args:
        path_to_yml_file (str): Path to the base experiment .yml file.

    Returns:
        tuple: 2-element tuple containing

        - experiments (*list*): Paths to the generated experiment .yml files.
        - cache_experiments (*list*): Paths to the generated cache-population
          experiment .yml files, if any.
    """
    base_experiment = load_yaml(path_to_yml_file)
    sweep = base_experiment.pop('sweep', [])
    experiments = []
    cache_experiments = []

    for k, _sweep in enumerate(sweep):
        # Collect every list-valued key; these are the swept dimensions.
        lists = []
        keys = []
        for key in _sweep:
            if isinstance(_sweep[key], list):
                keys.append(key)
                lists.append(_sweep[key])

        _combos = list(itertools.product(*lists))
        combos = [{keys[i]: c[i] for i in range(len(c))} for c in _combos]

        # .get() so sweeps that omit 'populate_cache' don't raise KeyError
        # (the original indexed the key directly).
        if _sweep.get('populate_cache', False):
            # One zero-epoch experiment whose only job is building the cache.
            cache_config, cache_exp, cache_path_to_yml_file = load_experiment(
                path_to_yml_file)
            cache_config.pop('sweep')
            logging_str = (
                f"Creating cache population experiment {0}/{len(combos)} "
                f"for sweep {0}/{len(sweep)}")
            this_experiment = update_config_with_sweep(
                cache_config, _sweep, combos[0], logging_str)
            this_experiment['train_config']['num_epochs'] = 0
            this_experiment['dataset_config']['overwrite_cache'] = True
            if 'num_cache_workers' in _sweep:
                this_experiment['train_config']['num_workers'] = (
                    _sweep['num_cache_workers'])
            cache_experiments.append(
                save_experiment(this_experiment, cache_exp))

        for j, c in enumerate(combos):
            # One experiment per combination of swept values.
            config, exp, _path_to_yml_file = load_experiment(path_to_yml_file)
            config.pop('sweep')
            logging_str = (f"\n\tCreating experiment {j+1}/{len(combos)} "
                           f"for sweep {k+1}/{len(sweep)}")
            this_experiment = update_config_with_sweep(
                config, _sweep, c, logging_str)
            experiments.append(save_experiment(this_experiment, exp))

    return experiments, cache_experiments
def cmd(script_func, parser_func, exec_func=sequential_job_execution):
    """
    Builds a parser for any script in the scripts/ directory. Scripts should have two
    main functions: 1) a function that actually runs the script and 2) a build_parser
    function that builds up an ArgumentParser with informative help text for the script.
    This function allows the command line arguments to be passed to the script either
    through the command line as normal or through a YAML file which has matching keyword
    arguments for the script. Positional arguments are discouraged.

    The arguments in the YAML file are checked by passing them back into the command
    line parser function before giving them to the script. This also allows for default
    values to be defined in the script argument parser.

    A script can be called multiple times using a YAML file by having a top-level key
    called 'jobs'. 'jobs' should contain a list where each item in the list is a set of
    arguments to be passed to the script one by one.

    For each script, simply add this like so::

        if __name__ == "__main__":
            cmd(script_func, parser_func)

    Then to run a script, simply do::

        python -m scripts.[script_name] --yml [path_to_yml_file] # for yml
        python -m scripts.[script_name] [--arg val] # for cmd line

    Arguments:
        script_func (function): A function that will take in the arguments as keyword
            arguments and perform some action.
        parser_func (function): A function that will build up the argument parser for
            the script.
        exec_func (function): A function that executes `script_func` over the list of
            assembled jobs. Defaults to `sequential_job_execution`.
    """
    # first check if environment variables exist
    if not os.getenv('DATA_DIRECTORY'):
        logging.info(
            """
            ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            | It doesn't look like you sourced your environment variables! Make sure to       |
            | run 'source setup/environment/[machine_name]_local.sh' before running scripts,  |
            | as the scripts depend on the environment variables.                             |
            ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            """
        )
        return

    jobs = []
    # Two parsers: one that recognizes --yml/--help, one built by the script.
    yml_parser = build_parser_for_yml_script()
    cmd_parser = parser_func()
    args = vars(yml_parser.parse_known_args()[0])

    if args['help']:
        # Show help for both invocation styles, then bail out.
        print('Usage via YML file.')
        yml_parser_help = yml_parser.print_help()
        if cmd_parser:
            print('\nDirect usage via command line arguments.')
            cmd_parser_help = cmd_parser.print_help()
        return

    extra_args = {}
    if args['yml'] is None:
        # Plain command-line invocation: a single job from argv.
        args, unknown_args = cmd_parser.parse_known_args()
        # Pair up leftover "--key val" tokens into a dict of extras.
        unknown_args = [u.replace('--', '') for u in unknown_args]
        unknown_args = dict(zip(unknown_args[:-1:2], unknown_args[1::2]))
        args = vars(args)
        script_signature = inspect.getfullargspec(script_func)
        # Only forward extras if the script accepts **kwargs.
        if script_signature.varkw is not None:
            args.update(unknown_args)
        jobs.append(args)
    else:
        # YML invocation: possibly many jobs under a top-level 'jobs' key.
        _args = load_yaml(args['yml'])
        _jobs = []
        if 'jobs' in _args:
            _jobs = _args.pop('jobs')
            # Remaining top-level keys are passed to exec_func, not the script.
            extra_args = _args
        else:
            _jobs.append(_args)
        for job in _jobs:
            if cmd_parser:
                # Round-trip the YML values through the script's own parser so
                # they are validated and defaults are filled in.
                args = []
                for key, val in job.items():
                    if isinstance(val, bool):
                        # Booleans are store_true flags: present only if True.
                        if val:
                            args.append(f'--{key}')
                    else:
                        args.append(f'--{key}')
                        args.append(str(val))
                args, unknown_args = cmd_parser.parse_known_args(args)
                unknown_args = [u.replace('--', '') for u in unknown_args]
                unknown_args = dict(zip(unknown_args[:-1:2], unknown_args[1::2]))
                args = vars(args)
                script_signature = inspect.getfullargspec(script_func)
                if script_signature.varkw is not None:
                    args.update(unknown_args)
                # Prefer the original (untyped) YML values over the parser's
                # stringified round-trip for keys present in both.
                [job.pop(k) for k in args if k in job]
                args.update(job)
            else:
                args = job
            jobs.append(args)

    # Drop extra_args the executor's signature doesn't accept.
    exec_args = inspect.getfullargspec(exec_func)
    for key in extra_args.copy():
        if key not in exec_args.args:
            extra_args.pop(key)
    exec_func(script_func, jobs, **extra_args)
def analyze(path_to_yml_file, use_gsheet=False, upload_source_metrics=False):
    """
    Analyzes the metrics for all the files that were evaluated in the experiment.

    Args:
        path_to_yml_file (str): Path to the yml file that defines the experiment. The
            corresponding results folder for the experiment is what will be analyzed and put
            into a Pandas dataframe.
        use_gsheet (bool, optional): Whether or not to upload to the Google Sheet.
            Defaults to False.
        upload_source_metrics (bool): Uploads metrics for each source if True. Defaults to False.
            Can have interactions with the API limit on Google Sheets. If there are two many
            sources, then it will hit the limit and the script will break.

    Returns:
        tuple: 3-element tuple containing

        - results (:class:`pandas.DataFrame`): DataFrame containing all of the results
          for every file evaluated in the experiment. The DataFrame also has every
          key in the experiment configuration in flattened format.

          For example, model_config_recurrent_stack_args_embedding_size is a column in the DataFrame.

        - config (*dict*): A dictionary containing the configuration of the experiment.

        - exp (:class:`comet_ml.Experiment`): An instantiated experiment if comet.ml is
          needed, otherwise it is None.
    """
    config, exp, path_to_yml_file = load_experiment(path_to_yml_file)

    paths = glob.glob(
        os.path.join(config['info']['output_folder'], 'results', '**.yml'),
        recursive=True)

    results = []
    for _path in paths:
        data = load_yaml(_path, [])
        for _data in data:
            # 'permutation' is bookkeeping, not an evaluated source.
            keys = sorted(list(_data.keys()))
            keys.remove('permutation')
            for key in keys:
                flattened = {
                    'experiment_key': config['info']['experiment_key'],
                    'notes': config['info']['notes'],
                    'file_name': _path,
                    'dataset': config['datasets']['test']['folder'],
                    'source_name': key.split('/')[-1],
                }
                flattened.update(flatten(config))
                for metric in _data[key]:
                    flattened[metric] = np.mean(_data[key][metric])
                results.append(flattened)

    results = pd.DataFrame(results)

    # numeric_only=True: the frame mixes metric floats with string columns
    # (notes, file_name, ...); without it DataFrame.mean raises TypeError
    # on pandas >= 2.0.
    logging.info(results.mean(numeric_only=True))
    logging.info(config['info']['experiment_key'])

    if use_gsheet:
        upload_to_gsheet(results, config, exp, upload_source_metrics)

    return results, config, exp
def test_yaml(path_to_yml):
    """Smoke test: the experiment .yml file loads without raising."""
    # The parsed result is intentionally discarded; a parse error is the
    # only failure mode this test checks for.
    load_yaml(path_to_yml)
def create_experiments(path_to_yml_file):
    """
    The main logic of this script. Takes the path to the base experiment file and
    loads the configuration. It then goes through the sweep dictionary kept in that
    base experiment file. The sweep dictionary tells how to update the configuration.
    The Cartesian product of all the possible settings specified by sweep is taken.
    Each experiment is updated accordingly. The length of the Cartesian product of
    the sweep is the number of experiments that get created.

    Args:
        path_to_yml_file (str): Path to base experiment file.

    Returns:
        tuple: 2-element tuple containing

        - experiments (*list*): List of paths to .yml files that define the
          generated experiments.
        - cache_experiments (*list*): List of paths to .yml files that define the
          experiments used for creating caches if any.
    """
    base_experiment = load_yaml(path_to_yml_file)
    sweep = base_experiment.pop('sweep', [])
    experiments = []
    cache_experiments = []

    for k, _sweep in enumerate(sweep):
        # List-valued keys are the swept dimensions.
        lists = []
        keys = []
        for key in _sweep:
            if isinstance(_sweep[key], list):
                keys.append(key)
                lists.append(_sweep[key])

        _combos = list(itertools.product(*lists))
        combos = [{keys[i]: c[i] for i in range(len(c))} for c in _combos]

        # .get() so sweeps that omit 'populate_cache' don't raise KeyError
        # (the original indexed the key directly).
        if _sweep.get('populate_cache', False):
            # Create a single experiment for creating dataset caches.
            cache_config, cache_exp, cache_path_to_yml_file = load_experiment(
                path_to_yml_file)
            cache_config.pop('sweep')
            this_experiment = update_config_with_sweep(
                cache_config, _sweep, combos[0])
            # Cache-population runs don't train: zero epochs, force re-cache.
            this_experiment['train_config']['num_epochs'] = 0
            this_experiment['dataset_config']['overwrite_cache'] = True
            if 'num_cache_workers' in _sweep:
                this_experiment['train_config']['num_workers'] = (
                    _sweep['num_cache_workers'])
            cache_experiments.append(
                save_experiment(this_experiment, cache_exp))

        for j, c in enumerate(combos):
            # Sweep across all the possible combinations and update.
            config, exp, _path_to_yml_file = load_experiment(path_to_yml_file)
            config.pop('sweep')
            this_experiment = update_config_with_sweep(config, _sweep, c)
            experiments.append(save_experiment(this_experiment, exp))

    return experiments, cache_experiments
def _load_dataset(config, split):
    """Load the dataset for `split`, forcing a small throwaway test cache."""
    # Force re-caching into an isolated test directory and use only a
    # fraction of the data so the tests stay fast.
    config['dataset_config']['overwrite_cache'] = True
    config['dataset_config']['cache'] = 'tests/out/_test_dataset/'
    config['dataset_config']['fraction_of_dataset'] = .1
    dset = loaders.load_dataset(
        config['datasets'][split]['class'],
        config['datasets'][split]['folder'],
        config['dataset_config'],
    )
    return dset


# Parametrize the tests over every experiment config in the repo.
paths_to_yml = list(glob.glob('./experiments/**/*.yml', recursive=True))
configs = [load_yaml(path_to_yml) for path_to_yml in paths_to_yml]


@pytest.mark.parametrize("config", configs, ids=paths_to_yml)
def test_dataset(config):
    """Every split in every experiment config can be loaded and indexed."""
    for split in config['datasets']:
        dset = _load_dataset(config, split)
        # Indexing the first item exercises the full loading pipeline.
        dset[0]


@pytest.mark.parametrize("config", configs, ids=paths_to_yml)
def test_model(config):
    """Every experiment config with a model_config can build its model."""
    if 'model_config' in config:
        model = loaders.load_model(config['model_config'])