Example #1
from pathlib import Path

from dpu_utils.utils import RichPath

import test  # assumed sibling module providing expand_data_path / compute_evaluation_metrics


def run(arguments):
    azure_info_path = arguments.get('--azure-info', None)

    # If no valid/test data paths are passed, default to the manifest files checked into the repo.
    if not arguments['VALID_DATA_PATH']:
        dir_path = Path(__file__).parent.absolute()
        print(f'Defaulting to data manifests in {dir_path}')
        arguments['VALID_DATA_PATH'] = str(dir_path / 'data_dirs_valid.txt')
        arguments['TEST_DATA_PATH'] = str(dir_path / 'data_dirs_test.txt')

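    # expand_data_path is assumed to read each manifest file and expand its
    # entries into data directories, resolving Azure-hosted paths via
    # azure_info_path.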
    valid_data_dirs = test.expand_data_path(arguments['VALID_DATA_PATH'],
                                            azure_info_path)
    test_data_dirs = test.expand_data_path(arguments['TEST_DATA_PATH'],
                                           azure_info_path)
    model_path = RichPath.create(arguments['MODEL_PATH'],
                                 azure_info_path=azure_info_path)
    test.compute_evaluation_metrics(model_path, arguments, azure_info_path,
                                    valid_data_dirs, test_data_dirs)
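
A minimal invocation sketch for the entry point above. This harness is hypothetical, not part of the original script; it assumes the docopt-style CLI implied by the argument keys used in run() ('MODEL_PATH', 'VALID_DATA_PATH', '--azure-info', ...).

if __name__ == '__main__':
    from docopt import docopt
    # __doc__ is assumed to hold a usage string declaring MODEL_PATH,
    # VALID_DATA_PATH, TEST_DATA_PATH and --azure-info.
    run(docopt(__doc__))
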
Example #2
import json
import os
import sys
from pathlib import Path

import wandb
from dpu_utils.utils import RichPath

import model_restore_helper  # assumed sibling module
import test  # assumed sibling module providing expand_data_path

# make_run_id, run_train, git_tag_run and compute_evaluation_metrics are
# assumed to be defined elsewhere in this module or its siblings.


def run(arguments, tag_in_vcs=False) -> None:
    azure_info_path = arguments.get('--azure-info', None)
    testrun = arguments.get('--testrun')
    no_eval = arguments.get('--no-eval')
    max_files_per_dir = arguments.get('--max-files-per-dir')

    dir_path = Path(__file__).parent.absolute()

    # If no train/valid/test data paths are passed, default to the manifest files checked into the repo.
    if not arguments['TRAIN_DATA_PATH']:
        arguments['TRAIN_DATA_PATH'] = str(dir_path / 'data_dirs_train.txt')
        arguments['VALID_DATA_PATH'] = str(dir_path / 'data_dirs_valid.txt')
        arguments['TEST_DATA_PATH'] = str(dir_path / 'data_dirs_test.txt')

    train_data_dirs = test.expand_data_path(arguments['TRAIN_DATA_PATH'],
                                            azure_info_path)
    valid_data_dirs = test.expand_data_path(arguments['VALID_DATA_PATH'],
                                            azure_info_path)
    test_data_dirs = test.expand_data_path(arguments['TEST_DATA_PATH'],
                                           azure_info_path)

    # default model save location
    if not arguments['SAVE_FOLDER']:
        arguments['SAVE_FOLDER'] = str(dir_path.parent /
                                       'resources/saved_models/')

    save_folder = arguments['SAVE_FOLDER']

    model_class = model_restore_helper.get_model_class_from_name(
        arguments['--model'])

    hyperparameters = model_class.get_default_hyperparameters()

    hyperparameters['max_epochs'] = int(arguments.get('--max-num-epochs'))

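    # --testrun requests a quick smoke test: cap training at two epochs and
    # read at most one file per data directory.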
    if testrun:
        hyperparameters['max_epochs'] = 2
        if not max_files_per_dir:
            max_files_per_dir = 1

    # Override hyperparameters from an inline JSON string or a JSON file, if provided.
    hypers_override = arguments.get('--hypers-override')
    if hypers_override is not None:
        hyperparameters.update(json.loads(hypers_override))
    elif arguments.get('--hypers-override-file') is not None:
        with open(arguments.get('--hypers-override-file')) as f:
            hyperparameters.update(json.load(f))

    os.makedirs(save_folder, exist_ok=True)

    if tag_in_vcs:
        # `run_name` is only assigned inside the training loop below, so derive
        # a run id here for the git tag to avoid a NameError.
        hyperparameters['git_commit'] = git_tag_run(make_run_id(arguments))

    # --dryrun disables wandb syncing so that nothing is logged remotely.
    if arguments.get('--dryrun'):
        os.environ["WANDB_MODE"] = 'dryrun'
    # Hyperparameters are logged to wandb below; values of type `set` must be
    # filtered out first, since sets are not JSON serializable.
    num_random_samples = int(arguments['--num-random-samples'])

    if num_random_samples > 1:
        # Give each resampling experiment its own data directory, derived from
        # the first training dir plus the run name.
        random_data_dir = str(train_data_dirs[0]) + (arguments.get('--run-name') or '')
    else:
        random_data_dir = None

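    # Train num_random_samples independent runs, each logged to wandb under its
    # own run id.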
    for i in range(num_random_samples):
        run_name = make_run_id(arguments)
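        # Name the wandb run after the generated run id; `set`-valued
        # hyperparameters are filtered out of the logged config since they are
        # not JSON serializable.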
        wandb.init(name=run_name,
                   config={
                       k: v
                       for k, v in hyperparameters.items()
                       if not isinstance(v, set)
                   })
        wandb.config.update({
            'model-class': arguments['--model'],
            'train_folder': str(train_data_dirs),
            'valid_folder': str(valid_data_dirs),
            'save_folder': str(save_folder),
            'test_folder': str(test_data_dirs),
            'CUDA_VISIBLE_DEVICES': os.environ.get('CUDA_VISIBLE_DEVICES',
                                                   'Not Set'),
            'run-name': arguments.get('--run-name'),
            'CLI-command': ' '.join(sys.argv)
        })

        if arguments.get('--evaluate-model'):
            model_path = RichPath.create(arguments['--evaluate-model'],
                                         azure_info_path=azure_info_path)
        else:
            model_path = run_train(model_class,
                                   train_data_dirs,
                                   valid_data_dirs,
                                   save_folder,
                                   hyperparameters,
                                   azure_info_path,
                                   run_name,
                                   arguments['--quiet'],
                                   max_files_per_dir=max_files_per_dir,
                                   parallelize=not (arguments['--sequential']),
                                   random_sample_size=int(
                                       arguments['--random-sample-size']),
                                   random_data_dir_name=random_data_dir)

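        # For a single run, record the best checkpoint path in the wandb config
        # and upload the checkpoint file itself.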
        if num_random_samples == 1:
            wandb.config['best_model_path'] = str(model_path)
            wandb.save(str(model_path.to_local_path()))

        if no_eval:
            continue
        # only limit files in test run if `--testrun` flag is passed by user.
        elif testrun:
            compute_evaluation_metrics(model_path, arguments, azure_info_path,
                                       valid_data_dirs, test_data_dirs,
                                       max_files_per_dir)
        else:
            compute_evaluation_metrics(model_path, arguments, azure_info_path,
                                       valid_data_dirs, test_data_dirs)
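
A hypothetical smoke-test invocation of the trainer above. The keys mirror those accessed in run(); every value here (including the model name) is an illustrative assumption, not the project's documented CLI.

if __name__ == '__main__':
    example_args = {
        'TRAIN_DATA_PATH': None, 'VALID_DATA_PATH': None,
        'TEST_DATA_PATH': None, 'SAVE_FOLDER': None,
        '--azure-info': None, '--testrun': True, '--no-eval': False,
        '--max-files-per-dir': None, '--model': 'neuralbow',  # assumed name
        '--max-num-epochs': '300', '--hypers-override': None,
        '--hypers-override-file': None, '--dryrun': True,
        '--num-random-samples': '1', '--run-name': None,
        '--evaluate-model': None, '--quiet': True, '--sequential': True,
        '--random-sample-size': '0',
    }
    run(example_args)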