def load_experiment(path_to_yml_file):
    config = load_yaml(path_to_yml_file)
    api_key = os.getenv('COMET_API_KEY', None)
    exp = None

    if not config['info']['experiment_key']:
        # No experiment key yet: create a comet.ml experiment if an API key is
        # available, otherwise fall back to a random local key.
        if api_key:
            exp = Experiment(api_key=api_key,
                             project_name=config['info']['project_name'])
            exp_key = exp.get_key()
        else:
            exp_key = make_random_string(20)

        os.environ['EXPERIMENT_KEY'] = exp_key

        _env_variables = env_variables + ['EXPERIMENT_KEY']
        config = load_yaml(path_to_yml_file, _env_variables)
        config['info']['experiment_key'] = exp_key
        path_to_yml_file = save_experiment(config, exp)
    else:
        logging.info(
            f"Experiment is already set up @ {config['info']['output_folder']}!"
        )
        try:
            exp = ExistingExperiment(
                api_key=api_key,
                previous_experiment=config['info']['experiment_key'])
        except Exception:
            # comet.ml may be unreachable or the key may be invalid; continue
            # without an attached experiment.
            logging.warning('Could not resume the existing comet.ml experiment.')

    return config, exp, path_to_yml_file
Example #2
def main(path_to_yml_file):
    config, exp, path_to_yml_file = load_experiment(path_to_yml_file)

    paths = glob.glob(os.path.join(config['info']['output_folder'], 'results',
                                   '**.yml'),
                      recursive=True)

    results = []

    for _path in paths:
        data = load_yaml(_path, [])
        for _data in data:
            # 'permutation' is not a per-source key; drop it before iterating.
            keys = sorted(list(_data.keys()))
            keys.remove('permutation')
            for key in keys:
                flattened = {
                    'experiment_key': config['info']['experiment_key'],
                    'notes': config['info']['notes'],
                    'file_name': _path,
                    'dataset': config['datasets']['test']['folder'],
                    'source_name': key.split('/')[-1],
                }

                flattened.update(flatten(config))

                for metric in _data[key]:
                    flattened[metric] = np.mean(_data[key][metric])

                results.append(flattened)

    results = pd.DataFrame(results)
    return results, config, exp
Example #3
def sweep_experiment(path_to_yml_file):
    base_experiment = load_yaml(path_to_yml_file)
    sweep = base_experiment.pop('sweep', [])
    experiments = []
    cache_experiments = []

    for k, _sweep in enumerate(sweep):
        lists = []
        keys = []
        for key in _sweep:
            if isinstance(_sweep[key], list):
                keys.append(key)
                lists.append(_sweep[key])

        _combos = list(itertools.product(*lists))
        combos = []
        for c in _combos:
            combos.append({keys[i]: c[i] for i in range(len(c))})

        if _sweep.get('populate_cache'):
            # Create a single experiment for creating dataset caches.
            cache_config, cache_exp, cache_path_to_yml_file = load_experiment(
                path_to_yml_file)
            cache_config.pop('sweep')
            logging_str = (
                f"Creating cache population experiment "
                f"for sweep {k+1}/{len(sweep)}")
            this_experiment = update_config_with_sweep(cache_config, _sweep,
                                                       combos[0], logging_str)
            this_experiment['train_config']['num_epochs'] = 0
            this_experiment['dataset_config']['overwrite_cache'] = True

            if 'num_cache_workers' in _sweep:
                this_experiment['train_config']['num_workers'] = (
                    _sweep['num_cache_workers'])
            cache_experiments.append(
                save_experiment(this_experiment, cache_exp))

        for j, c in enumerate(combos):
            # Sweep across all the possible combinations and update.
            config, exp, _path_to_yml_file = load_experiment(path_to_yml_file)
            config.pop('sweep')

            logging_str = (f"\n\tCreating experiment {j+1}/{len(combos)} "
                           f"for sweep {k+1}/{len(sweep)}")
            this_experiment = update_config_with_sweep(config, _sweep, c,
                                                       logging_str)
            experiments.append(save_experiment(this_experiment, exp))

    return experiments, cache_experiments
Example #4
def cmd(script_func, parser_func, exec_func=sequential_job_execution):
    """
    Builds a parser for any script in the scripts/ directory. Scripts should have two
    main functions: 1) a function that actually runs the script and 2) a build_parser
    function that builds up an ArgumentParser with informative help text for the script.
    This function allows the command line arguments to be passed to the script either
    through the command line as normal or through a YAML file which has matching keyword
    arguments for the script. Positional arguments are discouraged.

    The arguments in the YAML file are checked by passing them back into the command 
    line parser function before giving them to the script. This also allows for default
    values to be defined in the script argument parser.

    A script can be called multiple times using a YAML file by having a top-level key
    called 'jobs'. 'jobs' should contain a list where each item in the list is a 
    set of arguments to be passed to the script one by one.

    For each script, simply add this like so::

        if __name__ == "__main__":
            cmd(script_func, parser_func)

    Then to run a script, simply do::

        python -m scripts.[script_name] --yml [path_to_yml_file] # for yml
        python -m scripts.[script_name] [--arg val] # for cmd line


    Arguments:
        script_func (function): A function that will take in the arguments as keyword
            arguments and perform some action.
        parser_func (function): A function that will build up the argument parser for
            the script.
        exec_func (function, optional): A function that takes the script function and the
            list of job argument dictionaries and executes each job. Defaults to
            sequential_job_execution.
    """

    # first check if environment variables exist
    if not os.getenv('DATA_DIRECTORY'):
        logging.info(
            """

            ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            | It doesn't look like you sourced your environment variables! Make sure to      |
            | run 'source setup/environment/[machine_name]_local.sh' before running scripts, | 
            | as the scripts depend on the environment variables.                            |
            ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            """
        )
        return

    jobs = []

    yml_parser = build_parser_for_yml_script()
    cmd_parser = parser_func()
    args = vars(yml_parser.parse_known_args()[0])
    if args['help']:
        print('Usage via YML file.')
        yml_parser.print_help()
        if cmd_parser:
            print('\nDirect usage via command line arguments.')
            cmd_parser.print_help()
        return
    
    extra_args = {}

    if args['yml'] is None:   
        args, unknown_args = cmd_parser.parse_known_args()

        unknown_args = [u.replace('--', '') for u in unknown_args]
        unknown_args = dict(zip(unknown_args[:-1:2], unknown_args[1::2]))
        args = vars(args)

        script_signature = inspect.getfullargspec(script_func)
        if script_signature.varkw is not None:
            args.update(unknown_args)

        jobs.append(args)
    else:
        _args = load_yaml(args['yml'])
        _jobs = []

        if 'jobs' in _args:
            _jobs = _args.pop('jobs')
            extra_args = _args
        else:
            _jobs.append(_args)
        
        for job in _jobs:
            if cmd_parser:
                args = []
                for key, val in job.items():
                    if isinstance(val, bool):
                        if val:
                            args.append(f'--{key}')
                    else:
                        args.append(f'--{key}')
                        args.append(str(val))
                args, unknown_args = cmd_parser.parse_known_args(args)

                unknown_args = [u.replace('--', '') for u in unknown_args]
                unknown_args = dict(zip(unknown_args[:-1:2], unknown_args[1::2]))
                args = vars(args)

                script_signature = inspect.getfullargspec(script_func)
                if script_signature.varkw is not None:
                    args.update(unknown_args)
                
                # Merge any job keys the parser did not consume into the parsed args.
                for k in args:
                    if k in job:
                        job.pop(k)
                args.update(job)
            else:
                args = job
            jobs.append(args)
    
    exec_args = inspect.getfullargspec(exec_func)
    for key in extra_args.copy():
        if key not in exec_args.args:
            extra_args.pop(key)

    exec_func(script_func, jobs, **extra_args)
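
The 'jobs' mechanism described in the docstring above expects a YAML file whose top-level 'jobs' key holds one set of script arguments per run. As a minimal sketch (file name, keys, and values are hypothetical, not from the repository), such a file would parse into a dict like this:

# Hypothetical contents of a multi-job YAML file, shown as the dict that
# load_yaml would return. Each entry under 'jobs' is validated by the script's
# own parser and then passed to script_func as keyword arguments, one at a time.
example_args = {
    'jobs': [
        {'path_to_yml_file': 'experiments/exp1.yml'},
        {'path_to_yml_file': 'experiments/exp2.yml'},
    ],
    # Any other top-level keys are forwarded to exec_func when its signature
    # accepts them (see the filtering at the end of cmd above).
}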
Example #5
def analyze(path_to_yml_file, use_gsheet=False, upload_source_metrics=False):
    """
    Analyzes the metrics for all the files that were evaluated in the experiment.
    
    Args:
        path_to_yml_file (str): Path to the yml file that defines the experiment. The
            corresponding results folder for the experiment is what will be analyzed and put
            into a Pandas dataframe.
        use_gsheet (bool, optional): Whether or not to upload to the Google Sheet. 
            Defaults to False.
        upload_source_metrics (bool): Uploads metrics for each source if True. Defaults to False.
            This can run up against the Google Sheets API limit; if there are too many
            sources, the limit will be hit and the script will break.
    
    Returns:
        tuple: 3-element tuple containing

            - results (:class:`pandas.DataFrame`): DataFrame containing all of the results 
              for every file evaluated in the experiment. The DataFrame also has every
              key in the experiment configuration in flattened format.
              
              For example, model_config_recurrent_stack_args_embedding_size is a column in the DataFrame.

            - config (*dict*):  A dictionary containing the configuration of the experiment. 

            - exp (:class:`comet_ml.Experiment`): An instantiated experiment if comet.ml
              is needed, otherwise None.
    """
    config, exp, path_to_yml_file = load_experiment(path_to_yml_file)

    paths = glob.glob(os.path.join(config['info']['output_folder'], 'results',
                                   '**.yml'),
                      recursive=True)

    results = []

    for _path in paths:
        data = load_yaml(_path, [])
        for _data in data:
            # 'permutation' is not a per-source key; drop it before iterating.
            keys = sorted(list(_data.keys()))
            keys.remove('permutation')
            for key in keys:
                flattened = {
                    'experiment_key': config['info']['experiment_key'],
                    'notes': config['info']['notes'],
                    'file_name': _path,
                    'dataset': config['datasets']['test']['folder'],
                    'source_name': key.split('/')[-1],
                }

                flattened.update(flatten(config))

                for metric in _data[key]:
                    flattened[metric] = np.mean(_data[key][metric])

                results.append(flattened)

    results = pd.DataFrame(results)

    logging.info(results.mean())
    logging.info(config['info']['experiment_key'])

    if use_gsheet:
        upload_to_gsheet(results, config, exp, upload_source_metrics)

    return results, config, exp
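
The flatten helper used above is imported from elsewhere in the codebase. As a rough, hypothetical sketch of the behavior the docstring describes (nested configuration keys joined into single column names such as model_config_recurrent_stack_args_embedding_size), something like the following would do, though the project's actual implementation may differ:

def flatten_config(nested, parent_key='', sep='_'):
    # Recursively join nested dict keys with underscores, e.g.
    # {'model_config': {'recurrent_stack_args': {'embedding_size': 50}}}
    # -> {'model_config_recurrent_stack_args_embedding_size': 50}
    flat = {}
    for key, value in nested.items():
        new_key = f'{parent_key}{sep}{key}' if parent_key else key
        if isinstance(value, dict):
            flat.update(flatten_config(value, new_key, sep=sep))
        else:
            flat[new_key] = value
    return flat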
Example #6
def test_yaml(path_to_yml):
    # Passes as long as the file parses without raising.
    load_yaml(path_to_yml)


def create_experiments(path_to_yml_file):
    """
    The main logic of this script. Takes the path to the base experiment file and
    loads the configuration. It then goes through the sweep dictionary kept in that
    base experiment file. The sweep dictionary tells how to update the configuration.
    The Cartesian product of all the possible settings specified by sweep is taken.
    Each experiment is updated accordingly. The length of the Cartesian product of
    the sweep is the number of experiments that get created. 
    
    Args:
        path_to_yml_file (str): Path to base experiment file.
    
    Returns:
        tuple: 2-element tuple containing

            - experiments (*list*):  List of paths to .yml files that define the generated
                experiments.
            - cache_experiments (*list*):  List of paths to .yml files that define the 
                experiments used for creating caches if any.
    """
    base_experiment = load_yaml(path_to_yml_file)
    sweep = base_experiment.pop('sweep', [])
    experiments = []
    cache_experiments = []

    for k, _sweep in enumerate(sweep):
        lists = []
        keys = []
        for key in _sweep:
            if isinstance(_sweep[key], list):
                keys.append(key)
                lists.append(_sweep[key])

        _combos = list(itertools.product(*lists))
        combos = []
        for c in _combos:
            combos.append({keys[i]: c[i] for i in range(len(c))})

        if _sweep.get('populate_cache'):
            # Create a single experiment for creating dataset caches.
            cache_config, cache_exp, cache_path_to_yml_file = load_experiment(
                path_to_yml_file)
            cache_config.pop('sweep')
            this_experiment = update_config_with_sweep(cache_config, _sweep,
                                                       combos[0])
            this_experiment['train_config']['num_epochs'] = 0
            this_experiment['dataset_config']['overwrite_cache'] = True

            if 'num_cache_workers' in _sweep:
                this_experiment['train_config']['num_workers'] = (
                    _sweep['num_cache_workers'])
            cache_experiments.append(
                save_experiment(this_experiment, cache_exp))

        for j, c in enumerate(combos):
            # Sweep across all the possible combinations and update.
            config, exp, _path_to_yml_file = load_experiment(path_to_yml_file)
            config.pop('sweep')

            this_experiment = update_config_with_sweep(config, _sweep, c)
            experiments.append(save_experiment(this_experiment, exp))

    return experiments, cache_experiments
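
To make the Cartesian-product step in the docstring concrete, here is a small self-contained sketch; the sweep keys and values below are made up for illustration, and only list-valued keys are expanded, mirroring the loop above:

import itertools

# Hypothetical sweep entry; only the list-valued keys are swept over.
_sweep = {
    'model_config.embedding_size': [100, 300],
    'train_config.learning_rate': [1e-3, 1e-4],
    'populate_cache': False,
}

keys = [k for k in _sweep if isinstance(_sweep[k], list)]
combos = [dict(zip(keys, values))
          for values in itertools.product(*(_sweep[k] for k in keys))]
# len(combos) == 4: each dict holds the overrides for one generated experiment,
# e.g. {'model_config.embedding_size': 100, 'train_config.learning_rate': 0.001}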

def _load_dataset(config, split):
    config['dataset_config']['overwrite_cache'] = True
    config['dataset_config']['cache'] = 'tests/out/_test_dataset/'
    config['dataset_config']['fraction_of_dataset'] = .1
    dset = loaders.load_dataset(
        config['datasets'][split]['class'],
        config['datasets'][split]['folder'],
        config['dataset_config'],
    )
    return dset


paths_to_yml = list(glob.glob('./experiments/**/*.yml', recursive=True))
configs = [load_yaml(path_to_yml) for path_to_yml in paths_to_yml]


@pytest.mark.parametrize("config", configs, ids=paths_to_yml)
def test_dataset(config):
    for split in config['datasets']:
        dset = _load_dataset(config, split)
        # Smoke test: fetching the first item should not raise.
        dset[0]


@pytest.mark.parametrize("config", configs, ids=paths_to_yml)
def test_model(config):
    if 'model_config' in config:
        # Smoke test: the model should build from the config without raising.
        model = loaders.load_model(config['model_config'])