示例#1
0
    def __init__(self, num_features, **kwargs):
        """Assemble a super learner made of one base layer plus a meta layer.

        :param num_features: number of input features, forwarded to every
          base classifier
        :param kwargs: overrides for ``constants.GATED_ENSEMBLE_PARAMS``.
          ``folds`` (default 2) and ``meta_layer`` are consumed here;
          whatever is left is forwarded to ``utils.init_model``
        """
        super(GatedEnsembleClassifier, self).__init__()

        # Caller-supplied values win over the project defaults
        params = {**constants.GATED_ENSEMBLE_PARAMS, **kwargs}

        self.num_features = num_features
        self.num_folds = params.pop('folds', 2)
        self.meta_layer = params.pop('meta_layer')

        # One (name, kernel) pair per classifier taking part in the ensemble
        base_layer = [
            (
                name,
                utils.init_model(
                    name, num_features=self.num_features, **params
                ).kernel,
            )
            for name in constants.CLASSIFIERS_FOR_ENSEMBLE
        ]

        self.kernel = SuperLearner(verbose=2, n_jobs=1, folds=self.num_folds)

        # proba=True: the layer outputs class probabilities rather than
        # the predicted class itself
        self.kernel.add(base_layer, proba=True)

        # NOTE(review): the meta learner's input width is computed as
        # estimators * folds; confirm this matches the number of columns
        # the base layer actually emits when proba=True.
        meta_width = len(base_layer) * self.num_folds
        meta_model = utils.init_model(self.meta_layer, meta_width, **params)
        self.kernel.add_meta(meta_model.kernel, proba=True)
示例#2
0
def _single_k_fold(classifier, catalog, entity, k, dir_io, **kwargs):
    """Run a stratified k-fold and evaluate once over the pooled test folds.

    A fresh model is trained for every fold; the predictions of all folds
    are merged and the performance is computed a single time against the
    concatenation of all test folds.

    :param classifier: classifier identifier understood by
      ``utils.init_model``
    :param catalog: forwarded to ``train.build_training_set``
    :param entity: forwarded to ``train.build_training_set``
    :param k: number of folds, forwarded to
      ``utils.prepare_stratified_k_fold``
    :param dir_io: forwarded to ``train.build_training_set``
    :param kwargs: extra keyword arguments forwarded to ``utils.init_model``
    :return: pair of (merged predictions, result of
      ``_compute_performance`` over the pooled test folds)
    """
    predictions, test_set = None, []
    dataset, positive_samples_index = train.build_training_set(
        catalog, entity, dir_io)
    k_fold, binary_target_variables = utils.prepare_stratified_k_fold(
        k, dataset, positive_samples_index)

    for train_index, test_index in k_fold.split(dataset,
                                                binary_target_variables):
        training, test = dataset.iloc[train_index], dataset.iloc[test_index]
        test_set.append(test)

        model = utils.init_model(classifier, dataset.shape[1], **kwargs)

        # Explicit set method instead of the `&` operator: on pandas
        # indexes the operator form was deprecated as a set operation and
        # is an element-wise logical operation from pandas 2.0 onwards.
        model.fit(
            training, positive_samples_index.intersection(training.index)
        )

        preds = model.predict(test)

        K.clear_session()  # Free memory

        # assumes `model.predict` returns a pandas index of predicted
        # pairs, as the original `|=` accumulation implies - TODO confirm
        if predictions is None:
            predictions = preds
        else:
            predictions = predictions.union(preds)

    test_set = concat(test_set)

    return (
        predictions,
        _compute_performance(
            positive_samples_index.intersection(test_set.index),
            predictions,
            len(test_set),
        ),
    )
示例#3
0
    def __init__(self, num_features, **kwargs):
        """Assemble a super learner with two stacked layers and a meta layer.

        :param num_features: number of input features, forwarded to every
          classifier of the first layer
        :param kwargs: overrides for ``constants.STACKED_ENSEMBLE_PARAMS``.
          ``folds`` (default 2) and ``meta_layer`` are consumed here;
          whatever is left is forwarded to ``utils.init_model``
        """
        super(StackedEnsembleClassifier, self).__init__()

        # Caller-supplied values win over the project defaults
        params = {**constants.STACKED_ENSEMBLE_PARAMS, **kwargs}

        self.num_features = num_features
        self.num_folds = params.pop('folds', 2)
        self.meta_layer = params.pop('meta_layer')

        def init_estimators(num_features):
            """Build a fresh (name, kernel) pair for each ensemble classifier."""
            return [
                (
                    name,
                    utils.init_model(
                        name, num_features=num_features, **params
                    ).kernel,
                )
                for name in constants.CLASSIFIERS_FOR_ENSEMBLE
            ]

        self.kernel = SuperLearner(verbose=2, n_jobs=1, folds=self.num_folds)

        # Every layer is added with proba=True, i.e. it outputs class
        # probabilities instead of the predicted class.
        # NOTE(review): layer widths are computed as estimators * folds;
        # confirm this matches the number of columns a layer actually
        # emits when proba=True.
        first_layer = init_estimators(self.num_features)
        self.kernel.add(first_layer, proba=True)

        second_layer = init_estimators(len(first_layer) * self.num_folds)
        self.kernel.add(second_layer, proba=True)

        meta_width = len(second_layer) * self.num_folds
        meta_model = utils.init_model(self.meta_layer, meta_width, **params)
        self.kernel.add_meta(meta_model.kernel, proba=True)
示例#4
0
        def init_estimators(num_features):
            """Build a fresh (name, kernel) pair for each ensemble classifier.

            ``kwargs`` comes from the enclosing scope and is forwarded
            unchanged to ``utils.init_model``.
            """
            return [
                (
                    name,
                    utils.init_model(
                        name, num_features=num_features, **kwargs
                    ).kernel,
                )
                for name in constants.CLASSIFIERS_FOR_ENSEMBLE
            ]
示例#5
0
def _train(classifier, feature_vectors, positive_samples_index, **kwargs):
    """Initialize the requested classifier and fit it on the given data.

    :param classifier: classifier identifier understood by
      ``utils.init_model``
    :param feature_vectors: training data; its second dimension gives the
      number of features the model is initialized with
    :param positive_samples_index: passed to ``fit`` together with
      ``feature_vectors``
    :param kwargs: extra keyword arguments forwarded to ``utils.init_model``
    :return: the fitted model
    """
    num_features = feature_vectors.shape[1]
    trained = utils.init_model(classifier, num_features, **kwargs)

    LOGGER.info('Training a %s ...', classifier)
    trained.fit(feature_vectors, positive_samples_index)
    LOGGER.info('Training done')

    return trained
示例#6
0
def _nested_k_fold_with_grid_search(classifier, param_grid, catalog, entity, k,
                                    scoring, dir_io, **kwargs):
    """Run a nested cross-validation: grid search inside, k-fold outside.

    For every outer fold a grid search (cross-validated on the inner
    folds) is fitted on the outer training split, scored on the outer
    test split, and its best estimator is dumped to disk.

    :param classifier: classifier identifier understood by
      ``utils.init_model``
    :param param_grid: parameter grid handed to ``GridSearchCV``
    :param catalog: forwarded to ``train.build_training_set``
    :param entity: forwarded to ``train.build_training_set``
    :param k: number of folds used for both the inner and the outer split
    :param scoring: scoring name handed to ``GridSearchCV``; also used to
      build the keys of the result dictionaries
    :param dir_io: directory where the best model of each fold is dumped
    :param kwargs: extra keyword arguments forwarded to ``utils.init_model``
    :return: one dictionary per outer fold with the keys
      ``train_<scoring>``, ``test_<scoring>``, ``best_model`` and ``params``
    """
    dataset, positive_samples_index = train.build_training_set(
        catalog, entity, dir_io)
    estimator = utils.init_model(classifier, dataset.shape[1], **kwargs).kernel

    inner_k_fold, target = utils.prepare_stratified_k_fold(
        k, dataset, positive_samples_index)
    outer_k_fold = StratifiedKFold(n_splits=k, shuffle=True, random_state=1269)
    searcher = GridSearchCV(
        estimator,
        param_grid,
        scoring=scoring,
        n_jobs=-1,
        cv=inner_k_fold,
        verbose=1,
    )

    features = dataset.to_numpy()
    train_key, test_key = f'train_{scoring}', f'test_{scoring}'
    result = []

    outer_splits = outer_k_fold.split(features, target)
    for fold, (train_index, test_index) in enumerate(outer_splits, 1):
        # Fit the grid search on the outer training split ...
        searcher.fit(features[train_index], target[train_index])

        # ... and let it score the held-out outer test split
        test_score = searcher.score(features[test_index], target[test_index])

        # Trained models are not kept in memory: each one is dumped to a
        # file and only its path is stored in the result
        model_path = os.path.join(
            dir_io,
            constants.LINKER_NESTED_CV_BEST_MODEL.format(
                catalog, entity, classifier, fold),
        )
        joblib.dump(searcher.best_estimator_, model_path)

        LOGGER.info("Best model for fold %d dumped to '%s'", fold, model_path)

        # `best_score_` is the mean cross-validated score of the best
        # parameter setting on the inner folds; it is reported under the
        # train_ key
        result.append({
            train_key: searcher.best_score_,
            test_key: test_score,
            'best_model': model_path,
            'params': searcher.best_params_,
        })

    return result
示例#7
0
    def __init__(self, num_features, **kwargs):
        """Wrap a scikit-learn voting classifier over the ensemble classifiers.

        :param num_features: number of input features, forwarded to every
          member classifier
        :param kwargs: overrides for ``constants.VOTING_CLASSIFIER_PARAMS``.
          ``voting`` is consumed here; whatever is left is forwarded to
          ``utils.init_model``
        """
        super(VotingClassifier, self).__init__()

        # Caller-supplied values win over the project defaults
        params = {**constants.VOTING_CLASSIFIER_PARAMS, **kwargs}
        voting = params.pop('voting')

        self.num_features = num_features

        # One (name, kernel) pair per classifier taking part in the vote
        members = [
            (
                name,
                utils.init_model(
                    name, num_features=num_features, **params
                ).kernel,
            )
            for name in constants.CLASSIFIERS_FOR_ENSEMBLE
        ]

        # The kernel is the voting classifier shipped with scikit-learn
        self.kernel = SKVotingClassifier(
            estimators=members, voting=voting, n_jobs=None
        )
示例#8
0
def _grid_search(
    k: int,
    feature_vectors: pd.DataFrame,
    positive_samples_index: pd.MultiIndex,
    classifier: str,
    **kwargs,
) -> Dict:
    """Grid-search the hyperparameters of a classifier and return the best ones.

    The search is cross-validated on the stratified folds prepared by
    ``utils.prepare_stratified_k_fold`` and uses F1 as the score.

    :param k: number of folds
    :param feature_vectors: data the search is fitted on
    :param positive_samples_index: forwarded to
      ``utils.prepare_stratified_k_fold`` to derive the target
    :param classifier: classifier identifier; also the key used to look
      up the grid in ``constants.PARAMETER_GRIDS``
    :param kwargs: extra keyword arguments forwarded to ``utils.init_model``
    :return: the ``best_params_`` found by the search
    """
    num_features = feature_vectors.shape[1]
    k_fold, target = utils.prepare_stratified_k_fold(
        k, feature_vectors, positive_samples_index
    )

    estimator = utils.init_model(classifier, num_features, **kwargs).kernel
    search = GridSearchCV(
        estimator,
        constants.PARAMETER_GRIDS[classifier],
        scoring='f1',
        n_jobs=-1,
        cv=k_fold,
    )
    search.fit(feature_vectors.to_numpy(), target)

    return search.best_params_
示例#9
0
def _average_k_fold(classifier, catalog, entity, k, dir_io, **kwargs):
    """Run a stratified k-fold and average the per-fold performance.

    A fresh model is trained and evaluated for every fold; precision,
    recall and F-score are collected per fold and summarized with their
    mean and standard deviation.

    :param classifier: classifier identifier understood by
      ``utils.init_model``
    :param catalog: forwarded to ``train.build_training_set``
    :param entity: forwarded to ``train.build_training_set``
    :param k: number of folds, forwarded to
      ``utils.prepare_stratified_k_fold``
    :param dir_io: forwarded to ``train.build_training_set``
    :param kwargs: extra keyword arguments forwarded to ``utils.init_model``
    :return: 7-tuple of (merged predictions, mean precision, precision
      std, mean recall, recall std, mean F-score, F-score std)
    """
    predictions, precisions, recalls, f_scores = None, [], [], []
    dataset, positive_samples_index = train.build_training_set(
        catalog, entity, dir_io)
    k_fold, binary_target_variables = utils.prepare_stratified_k_fold(
        k, dataset, positive_samples_index)

    for train_index, test_index in k_fold.split(dataset,
                                                binary_target_variables):
        training, test = dataset.iloc[train_index], dataset.iloc[test_index]

        model = utils.init_model(classifier, dataset.shape[1], **kwargs)

        # Explicit set method instead of the `&` operator: on pandas
        # indexes the operator form was deprecated as a set operation and
        # is an element-wise logical operation from pandas 2.0 onwards.
        model.fit(
            training, positive_samples_index.intersection(training.index)
        )

        preds = model.predict(test)

        K.clear_session()  # Free memory

        # The fourth value returned by the performance helper is not used
        p, r, f, _ = _compute_performance(
            positive_samples_index.intersection(test.index),
            preds,
            len(test),
        )

        # assumes `model.predict` returns a pandas index of predicted
        # pairs, as the original `|=` accumulation implies - TODO confirm
        if predictions is None:
            predictions = preds
        else:
            predictions = predictions.union(preds)

        precisions.append(p)
        recalls.append(r)
        f_scores.append(f)

    return (
        predictions,
        mean(precisions),
        std(precisions),
        mean(recalls),
        std(recalls),
        mean(f_scores),
        std(f_scores),
    )