Example #1
def run(arguments):
    azure_info_path = arguments.get('--azure-info', None)
    valid_data_dir = test.expand_data_path(arguments['VALID_DATA_PATH'],
                                           azure_info_path)
    test_data_dir = test.expand_data_path(arguments['TEST_DATA_PATH'],
                                          azure_info_path)
    model_paths = RichPath.create(
        arguments['MODEL_PATH'],
        azure_info_path=azure_info_path).get_filtered_files_in_dir('*.pkl.gz')
    alpha = float(arguments['--alpha'])

    with Pool(int(arguments['--processes'])) as pool:
        results = pool.map(
            functools.partial(test.compute_evaluation_metrics,
                              arguments=arguments,
                              azure_info_path=azure_info_path,
                              valid_data_dirs=valid_data_dir,
                              test_data_dirs=test_data_dir,
                              return_results=True,
                              languages=['java'],
                              test_valid=False), model_paths)

    docstring_mrrs = [x['java'][0] for x in results]
    func_name_mrrs = [x['java'][1] for x in results]

    docstring_confidence = get_confidence_interval(docstring_mrrs, alpha)
    func_name_confidence = get_confidence_interval(func_name_mrrs, alpha)

    print(
        f'{alpha*100}% confidence interval for mrr using docstring as the query: {docstring_confidence}'
    )
    print(
        f'{alpha*100}% confidence interval for mrr using function name as the query: {func_name_confidence}'
    )
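`get_confidence_interval` is used above but not shown. A minimal sketch of such a helper, assuming a two-sided Student's t interval over the per-model MRR samples (the implementation below is an assumption, not the project's actual code):

import numpy as np
from scipy import stats

def get_confidence_interval(samples, alpha):
    # Mean and standard error of the per-model MRR scores.
    mean = np.mean(samples)
    sem = stats.sem(samples)
    # Two-sided t interval at confidence level alpha (e.g. 0.95).
    return stats.t.interval(alpha, df=len(samples) - 1, loc=mean, scale=sem)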
Example #2
def run(arguments):
    max_num_examples = int(arguments.get('--max-num-examples')) if arguments.get('--max-num-examples') else None
    azure_info_path = arguments.get('--azure-info', None)
    test_data_dirs = expand_data_path(arguments['DATA_PATH'], azure_info_path)

    if arguments['--hypers-override'] is not None:
        hypers_override = json.loads(arguments['--hypers-override'])
    elif arguments['--hypers-override-file'] is not None:
        with open(arguments['--hypers-override-file']) as f:
            hypers_override = json.load(f)
    else:
        hypers_override = {}

    model_path = RichPath.create(arguments['MODEL_PATH'], azure_info_path=azure_info_path)

    tester = MrrSearchTester(model_path, test_batch_size=int(arguments['--test-batch-size']),
                             distance_metric=arguments['--distance-metric'], hypers_override=hypers_override)

    # Load dataset
    if arguments['--standard-dataset'] or arguments['--method2code-dataset']:
        data = model_test.get_dataset_from(test_data_dirs, use_func_names=arguments['--method2code-dataset'])
    else:
        raise Exception('No dataset option seems to have been passed in.')

    generate_html_error_report(tester=tester,
                               data=data,
                               max_num_examples=max_num_examples,
                               outfile=arguments['OUT_FILE'],
                               filter_language=arguments.get('--language-to-analyze'))
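These `run` functions appear to follow the docopt convention: a single dict keyed by positional names and CLI flags. An illustrative invocation of the report generator above, with all values hypothetical:

run({
    'MODEL_PATH': './resources/saved_models/model.pkl.gz',  # hypothetical path
    'DATA_PATH': './data_dirs_test.txt',                    # hypothetical path
    'OUT_FILE': 'error_report.html',
    '--azure-info': None,
    '--test-batch-size': '1000',
    '--distance-metric': 'cosine',
    '--hypers-override': None,
    '--hypers-override-file': None,
    '--standard-dataset': True,
    '--method2code-dataset': False,
    '--max-num-examples': '100',
    '--language-to-analyze': None,
})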
Example #3
def run(arguments):
    azure_info_path = arguments.get('--azure-info', None)

    # if no valid/test data paths are passed in, default to the files checked into the repo
    if not arguments['VALID_DATA_PATH']:
        dir_path = Path(__file__).parent.absolute()
        print(dir_path)
        arguments['VALID_DATA_PATH'] = str(dir_path / 'data_dirs_valid.txt')
        arguments['TEST_DATA_PATH'] = str(dir_path / 'data_dirs_test.txt')

    valid_data_dirs = test.expand_data_path(arguments['VALID_DATA_PATH'],
                                            azure_info_path)
    test_data_dirs = test.expand_data_path(arguments['TEST_DATA_PATH'],
                                           azure_info_path)
    test.compute_evaluation_metrics(
        RichPath.create(arguments['MODEL_PATH'],
                        azure_info_path=azure_info_path), arguments,
        azure_info_path, valid_data_dirs, test_data_dirs)
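`test.expand_data_path` resolves the path arguments into lists of data directories. A plausible sketch, assuming the data_dirs_*.txt files list one directory per line and that `RichPath` is the same dpu_utils helper already used in these examples (this implementation is an assumption):

from dpu_utils.utils import RichPath

def expand_data_path(data_path, azure_info_path):
    # Treat a .txt file as a manifest listing one data directory per line;
    # anything else is taken to be a single data directory.
    if data_path.endswith('.txt'):
        with open(data_path) as f:
            return [RichPath.create(line.strip(), azure_info_path=azure_info_path)
                    for line in f if line.strip()]
    return [RichPath.create(data_path, azure_info_path=azure_info_path)]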
Example #4
def run(arguments, tag_in_vcs=False) -> None:
    azure_info_path = arguments.get('--azure-info', None)
    testrun = arguments.get('--testrun')
    no_eval = arguments.get('--no-eval')
    max_files_per_dir = arguments.get('--max-files-per-dir')

    dir_path = Path(__file__).parent.absolute()

    # if no train/valid/test data paths are passed in, default to the files checked into the repo
    if not arguments['TRAIN_DATA_PATH']:
        arguments['TRAIN_DATA_PATH'] = str(dir_path / 'data_dirs_train.txt')
        arguments['VALID_DATA_PATH'] = str(dir_path / 'data_dirs_valid.txt')
        arguments['TEST_DATA_PATH'] = str(dir_path / 'data_dirs_test.txt')

    train_data_dirs = test.expand_data_path(arguments['TRAIN_DATA_PATH'],
                                            azure_info_path)
    valid_data_dirs = test.expand_data_path(arguments['VALID_DATA_PATH'],
                                            azure_info_path)
    test_data_dirs = test.expand_data_path(arguments['TEST_DATA_PATH'],
                                           azure_info_path)

    # default model save location
    if not arguments['SAVE_FOLDER']:
        arguments['SAVE_FOLDER'] = str(dir_path.parent /
                                       'resources/saved_models/')

    save_folder = arguments['SAVE_FOLDER']

    model_class = model_restore_helper.get_model_class_from_name(
        arguments['--model'])

    hyperparameters = model_class.get_default_hyperparameters()

    # NOTE: the wandb run name is set to the run id in the loop below (doesn't populate yet)
    hyperparameters['max_epochs'] = int(arguments.get('--max-num-epochs'))

    if testrun:
        hyperparameters['max_epochs'] = 2
        if not max_files_per_dir:
            max_files_per_dir = 1

    # override hyperparams if flag is passed
    hypers_override = arguments.get('--hypers-override')
    if hypers_override is not None:
        hyperparameters.update(json.loads(hypers_override))
    elif arguments.get('--hypers-override-file') is not None:
        with open(arguments.get('--hypers-override-file')) as f:
            hyperparameters.update(json.load(f))

    os.makedirs(save_folder, exist_ok=True)

    if tag_in_vcs:
        # run_name is otherwise only assigned inside the loop below, so derive it here for the tag
        run_name = make_run_id(arguments)
        hyperparameters['git_commit'] = git_tag_run(run_name)

    # turns off wandb if you don't want to log anything
    if arguments.get('--dryrun'):
        os.environ["WANDB_MODE"] = 'dryrun'
    # save hyperparams to logging;
    # sets are not JSON serializable, so filter them out of the logged config
    results = []
    num_random_samples = int(arguments['--num-random-samples'])

    if num_random_samples > 1:
        random_data_dir = str(train_data_dirs[0]) + arguments['--run-name']
    else:
        random_data_dir = None

    for i in range(num_random_samples):
        run_name = make_run_id(arguments)
        wandb.init(name=run_name,
                   config={
                       k: v
                       for k, v in hyperparameters.items()
                       if not isinstance(v, set)
                   })
        wandb.config.update({
            'model-class': arguments['--model'],
            'train_folder': str(train_data_dirs),
            'valid_folder': str(valid_data_dirs),
            'save_folder': str(save_folder),
            'test_folder': str(test_data_dirs),
            'CUDA_VISIBLE_DEVICES': os.environ.get("CUDA_VISIBLE_DEVICES", 'Not Set'),
            'run-name': arguments.get('--run-name'),
            'CLI-command': ' '.join(sys.argv)
        })

        if arguments.get('--evaluate-model'):
            model_path = RichPath.create(arguments['--evaluate-model'])
        else:
            model_path = run_train(model_class,
                                   train_data_dirs,
                                   valid_data_dirs,
                                   save_folder,
                                   hyperparameters,
                                   azure_info_path,
                                   run_name,
                                   arguments['--quiet'],
                                   max_files_per_dir=max_files_per_dir,
                                   parallelize=not (arguments['--sequential']),
                                   random_sample_size=int(
                                       arguments['--random-sample-size']),
                                   random_data_dir_name=random_data_dir)

        if num_random_samples == 1:
            wandb.config['best_model_path'] = str(model_path)
            wandb.save(str(model_path.to_local_path()))

        if no_eval:
            continue
        # only limit files in test run if `--testrun` flag is passed by user.
        elif testrun:
            compute_evaluation_metrics(model_path, arguments, azure_info_path,
                                       valid_data_dirs, test_data_dirs,
                                       max_files_per_dir)
        else:
            compute_evaluation_metrics(model_path, arguments, azure_info_path,
                                       valid_data_dirs, test_data_dirs)
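`make_run_id` names each wandb run inside the sampling loop. A hypothetical sketch consistent with how it is called here (both the fallback format and the reuse of --run-name are assumptions):

import time

def make_run_id(arguments):
    # Prefer an explicit --run-name; otherwise derive a unique id from the
    # model class and a timestamp so repeated random samples do not collide.
    run_name = arguments.get('--run-name')
    if run_name:
        return run_name
    return '{}-{}'.format(arguments['--model'], time.strftime('%Y-%m-%dT%H-%M-%S'))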