Example #1
def _single_k_fold(classifier, catalog, entity, k, dir_io, **kwargs):
    predictions, test_set = None, []
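    # Build the feature dataset and the index of known positive (matching) pairs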
    dataset, positive_samples_index = train.build_training_set(
        catalog, entity, dir_io)
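    # Stratified folds keep the positive/negative ratio constant across splits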
    k_fold, binary_target_variables = utils.prepare_stratified_k_fold(
        k, dataset, positive_samples_index)

    for train_index, test_index in k_fold.split(dataset,
                                                binary_target_variables):
        training, test = dataset.iloc[train_index], dataset.iloc[test_index]
        test_set.append(test)

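        # Train a fresh model on this fold's training split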
        model = utils.init_model(classifier, dataset.shape[1], **kwargs)
        model.fit(training, positive_samples_index & training.index)

        preds = model.predict(test)

        K.clear_session()  # Free memory

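        # Accumulate predicted matching pairs across folds via index union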
        if predictions is None:
            predictions = preds
        else:
            predictions |= preds

    test_set = concat(test_set)

    return (
        predictions,
        _compute_performance(positive_samples_index & test_set.index,
                             predictions, len(test_set)),
    )
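
_compute_performance is defined elsewhere in the module and is not shown above. As a rough guide, a helper like it could be sketched as follows, assuming the true and predicted links are pandas (Multi)Index objects of matching record pairs; the function name and exact return shape are illustrative, not the project's actual implementation.

def compute_performance_sketch(true_links, predicted_links, total):
    # true_links, predicted_links: pandas (Multi)Index objects of matching pairs
    # total: number of candidate pairs in the test set
    tp = len(true_links.intersection(predicted_links))  # correctly predicted matches
    fp = len(predicted_links) - tp                       # predicted matches that are wrong
    fn = len(true_links) - tp                            # real matches that were missed
    tn = total - tp - fp - fn                            # correctly rejected non-matches

    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    f_score = (2 * precision * recall / (precision + recall)
               if precision + recall else 0.0)
    return precision, recall, f_score, ((tp, fn), (fp, tn))
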
Example #2
def _nested_k_fold_with_grid_search(classifier, param_grid, catalog, entity, k,
                                    scoring, dir_io, **kwargs):
    dataset, positive_samples_index = train.build_training_set(
        catalog, entity, dir_io)
    model = utils.init_model(classifier, dataset.shape[1], **kwargs).kernel

    inner_k_fold, target = utils.prepare_stratified_k_fold(
        k, dataset, positive_samples_index)
    outer_k_fold = StratifiedKFold(n_splits=k, shuffle=True, random_state=1269)
    grid_search = GridSearchCV(
        model,
        param_grid,
        scoring=scoring,
        n_jobs=-1,
        cv=inner_k_fold,
        verbose=1,
    )
    result = []

    dataset = dataset.to_numpy()

    for fold, (train_index,
               test_index) in enumerate(outer_k_fold.split(dataset, target), 1):
        # Run grid search
        grid_search.fit(dataset[train_index], target[train_index])

        # Let grid search compute the test score
        test_score = grid_search.score(dataset[test_index], target[test_index])

        # No need to keep trained models in memory: dump each one to a file
        # and keep only the path.
        best_model = grid_search.best_estimator_

        model_path = os.path.join(
            dir_io,
            constants.LINKER_NESTED_CV_BEST_MODEL.format(
                catalog, entity, classifier, fold),
        )

        joblib.dump(best_model, model_path)

        LOGGER.info("Best model for fold %d dumped to '%s'", fold, model_path)

        # Use the grid search's best inner cross-validation score as the train score
        result.append({
            f'train_{scoring}': grid_search.best_score_,
            f'test_{scoring}': test_score,
            'best_model': model_path,
            'params': grid_search.best_params_,
        })

    return result
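
For reference, the nested pattern above (an inner GridSearchCV that picks hyper-parameters, wrapped in an outer StratifiedKFold that yields an unbiased test score) can be reproduced in plain scikit-learn. The sketch below uses a toy SVC on synthetic data instead of the project's linker models, so all names and parameters are illustrative.

from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.svm import SVC

X, y = make_classification(n_samples=500, random_state=0)

inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

grid_search = GridSearchCV(
    SVC(), {'C': [0.1, 1, 10]}, scoring='f1', cv=inner_cv, n_jobs=-1)

for fold, (train_idx, test_idx) in enumerate(outer_cv.split(X, y), 1):
    # Inner loop: pick hyper-parameters using only the outer training split
    grid_search.fit(X[train_idx], y[train_idx])
    # Outer loop: score on data the search never saw
    test_f1 = grid_search.score(X[test_idx], y[test_idx])
    print(f'Fold {fold}: best params {grid_search.best_params_}, '
          f'test F1 {test_f1:.3f}')
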
Example #3
def _average_k_fold(classifier, catalog, entity, k, dir_io, **kwargs):
    predictions, precisions, recalls, f_scores = None, [], [], []
    dataset, positive_samples_index = train.build_training_set(
        catalog, entity, dir_io)
    k_fold, binary_target_variables = utils.prepare_stratified_k_fold(
        k, dataset, positive_samples_index)

    for train_index, test_index in k_fold.split(dataset,
                                                binary_target_variables):
        training, test = dataset.iloc[train_index], dataset.iloc[test_index]

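        # Train a fresh model on this fold and predict on the held-out split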
        model = utils.init_model(classifier, dataset.shape[1], **kwargs)

        model.fit(training, positive_samples_index & training.index)

        preds = model.predict(test)

        K.clear_session()  # Free memory

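        # Per-fold precision, recall, and F-score against the known positive pairs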
        p, r, f, _ = _compute_performance(positive_samples_index & test.index,
                                          preds, len(test))

        if predictions is None:
            predictions = preds
        else:
            predictions |= preds

        precisions.append(p)
        recalls.append(r)
        f_scores.append(f)

    return (
        predictions,
        mean(precisions),
        std(precisions),
        mean(recalls),
        std(recalls),
        mean(f_scores),
        std(f_scores),
    )
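
The average-over-folds reporting used here can be illustrated without the project-specific helpers. The following sketch applies the same pattern with plain scikit-learn on synthetic data and reports each metric as its mean and standard deviation across folds; everything in it is illustrative.

from numpy import mean, std
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import StratifiedKFold

X, y = make_classification(n_samples=500, random_state=0)
precisions, recalls, f_scores = [], [], []

for train_idx, test_idx in StratifiedKFold(n_splits=5).split(X, y):
    model = LogisticRegression(max_iter=1000).fit(X[train_idx], y[train_idx])
    p, r, f, _ = precision_recall_fscore_support(
        y[test_idx], model.predict(X[test_idx]), average='binary')
    precisions.append(p)
    recalls.append(r)
    f_scores.append(f)

# Report each metric as mean and standard deviation across folds
print(f'Precision: {mean(precisions):.3f} +/- {std(precisions):.3f}')
print(f'Recall:    {mean(recalls):.3f} +/- {std(recalls):.3f}')
print(f'F-score:   {mean(f_scores):.3f} +/- {std(f_scores):.3f}')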