def test_iba_error_y_score_prob_error(score_loss):
    y_true, y_pred, _ = make_prediction(binary=True)

    aps = make_index_balanced_accuracy(
        alpha=0.5, squared=True)(score_loss)
    with pytest.raises(AttributeError):
        aps(y_true, y_pred)
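
The score_loss argument above is supplied by a pytest.mark.parametrize decorator that is not included in the snippet; a minimal sketch of what that parametrization could look like (the concrete metric list is an assumption, not taken from the snippet):

# Hypothetical parametrization for the test above; the metric list is an
# assumption, since the actual decorator is not part of the snippet.
import pytest
from sklearn.metrics import average_precision_score, brier_score_loss


@pytest.mark.parametrize(
    "score_loss", [average_precision_score, brier_score_loss]
)
def test_iba_error_y_score_prob_error(score_loss):
    ...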
Example #2
def classification_report_imbalanced_values(
    y_true, y_pred, labels, target_names=None, sample_weight=None, digits=2, alpha=0.1
):
    """Copy of imblearn.metrics.classification_report_imbalanced to have
    access to the raw values. The code is mostly the same except the
    formatting code and generation of the report which haven removed. Copied
    from version 0.4.3. The original code is living here:
    https://github.com/scikit-learn-contrib/imbalanced-learn/blob/b861b3a8e3414c52f40a953f2e0feca5b32e7460/imblearn/metrics/_classification.py#L790
    """
    labels = np.asarray(labels)

    if target_names is None:
        target_names = [str(label) for label in labels]

    # Compute the different metrics
    # Precision/recall/f1
    precision, recall, f1, support = precision_recall_fscore_support(
        y_true, y_pred, labels=labels, average=None, sample_weight=sample_weight
    )
    # Specificity
    specificity = specificity_score(
        y_true, y_pred, labels=labels, average=None, sample_weight=sample_weight
    )
    # Geometric mean
    geo_mean = geometric_mean_score(
        y_true, y_pred, labels=labels, average=None, sample_weight=sample_weight
    )
    # Index balanced accuracy
    iba_gmean = make_index_balanced_accuracy(alpha=alpha, squared=True)(
        geometric_mean_score
    )
    iba = iba_gmean(
        y_true, y_pred, labels=labels, average=None, sample_weight=sample_weight
    )

    result = {"targets": {}}

    for i, label in enumerate(labels):
        result["targets"][target_names[i]] = {
            "precision": precision[i],
            "recall": recall[i],
            "specificity": specificity[i],
            "f1": f1[i],
            "geo_mean": geo_mean[i],
            "iba": iba[i],
            "support": support[i],
        }

    result["average"] = {
        "precision": np.average(precision, weights=support),
        "recall": np.average(recall, weights=support),
        "specificity": np.average(specificity, weights=support),
        "f1": np.average(f1, weights=support),
        "geo_mean": np.average(geo_mean, weights=support),
        "iba": np.average(iba, weights=support),
        "support": np.sum(support),
    }

    return result
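
A minimal usage sketch for the helper above, with made-up labels and predictions; it assumes the same imports as the function itself (numpy plus the sklearn/imblearn metric functions):

# Illustrative call with toy data; the dict keys follow the structure built
# by classification_report_imbalanced_values above.
y_true = [0, 0, 0, 1, 1, 1]
y_pred = [0, 0, 1, 1, 1, 0]

report = classification_report_imbalanced_values(y_true, y_pred, labels=[0, 1])
print(report["targets"]["0"]["geo_mean"])  # per-class geometric mean
print(report["average"]["iba"])            # support-weighted IBA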
Example #3
def test_iba_error_y_score_prob():
    y_true, y_pred, _ = make_prediction(binary=True)

    aps = make_index_balanced_accuracy(alpha=0.5,
                                       squared=True)(average_precision_score)
    assert_raises(AttributeError, aps, y_true, y_pred)

    brier = make_index_balanced_accuracy(alpha=0.5,
                                         squared=True)(brier_score_loss)
    assert_raises(AttributeError, brier, y_true, y_pred)

    kappa = make_index_balanced_accuracy(alpha=0.5,
                                         squared=True)(cohen_kappa_score)
    assert_raises(AttributeError, kappa, y_true, y_pred)

    ras = make_index_balanced_accuracy(alpha=0.5, squared=True)(roc_auc_score)
    assert_raises(AttributeError, ras, y_true, y_pred)
def test_iba_geo_mean_binary():
    y_true, y_pred, _ = make_prediction(binary=True)

    iba_gmean = make_index_balanced_accuracy(
        alpha=0.5, squared=True)(geometric_mean_score)
    iba = iba_gmean(y_true, y_pred)

    assert_allclose(iba, 0.5948, rtol=R_TOL)
Example #6
def test_iba_geo_mean_binary():
    """Test to test the iba using the geometric mean"""
    y_true, y_pred, _ = make_prediction(binary=True)

    iba_gmean = make_index_balanced_accuracy(
        alpha=0.5, squared=True)(geometric_mean_score)
    iba = iba_gmean(y_true, y_pred)

    assert_almost_equal(iba, 0.54, 2)
def flat_iba(preds, labels):
  pred_flat = np.argmax(preds, axis=1).flatten()
  labels_flat = labels.flatten()
  geo_mean = geometric_mean_score(labels_flat, pred_flat, average=None, sample_weight=None)
  iba_gmean = make_index_balanced_accuracy(alpha=0.1, squared=True)(geometric_mean_score)
  iba = iba_gmean(labels_flat, pred_flat, average=None, sample_weight=None)
  _, _, _, support = precision_recall_fscore_support(labels_flat, pred_flat, average=None, sample_weight=None)
  res = np.average(iba, weights=support)
  return res
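
A small sketch of how flat_iba might be called on a batch of model outputs; the logits and labels below are made up to illustrate the expected shapes:

# preds holds one row of class scores per sample; labels is an integer array.
import numpy as np

preds = np.array([[2.1, -0.3], [0.2, 1.5], [1.0, 0.9], [-0.5, 0.8]])
labels = np.array([0, 1, 1, 1])

print(flat_iba(preds, labels))  # support-weighted IBA of the geometric mean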
Example #8
def test_iba_error_y_score_prob():
    """Test if an error is raised when a scoring metric take over parameters
    than y_pred"""
    y_true, y_pred, _ = make_prediction(binary=True)

    aps = make_index_balanced_accuracy(alpha=0.5,
                                       squared=True)(average_precision_score)
    assert_raises(AttributeError, aps, y_true, y_pred)

    brier = make_index_balanced_accuracy(alpha=0.5,
                                         squared=True)(brier_score_loss)
    assert_raises(AttributeError, brier, y_true, y_pred)

    kappa = make_index_balanced_accuracy(alpha=0.5,
                                         squared=True)(cohen_kappa_score)
    assert_raises(AttributeError, kappa, y_true, y_pred)

    ras = make_index_balanced_accuracy(alpha=0.5, squared=True)(roc_auc_score)
    assert_raises(AttributeError, ras, y_true, y_pred)
Example #9
def test_iba_sklearn_metrics():
    y_true, y_pred, _ = make_prediction(binary=True)

    acc = make_index_balanced_accuracy(alpha=0.5, squared=True)(accuracy_score)
    score = acc(y_true, y_pred)
    assert_equal(score, 0.54756)

    jss = make_index_balanced_accuracy(alpha=0.5,
                                       squared=True)(jaccard_similarity_score)
    score = jss(y_true, y_pred)
    assert_equal(score, 0.54756)

    pre = make_index_balanced_accuracy(alpha=0.5,
                                       squared=True)(precision_score)
    score = pre(y_true, y_pred)
    assert_equal(score, 0.65025)

    rec = make_index_balanced_accuracy(alpha=0.5, squared=True)(recall_score)
    score = rec(y_true, y_pred)
    assert_equal(score, 0.41616000000000009)
def test_iba_sklearn_metrics():
    y_true, y_pred, _ = make_prediction(binary=True)

    acc = make_index_balanced_accuracy(alpha=0.5, squared=True)(accuracy_score)
    score = acc(y_true, y_pred)
    assert score == approx(0.54756)

    jss = make_index_balanced_accuracy(
        alpha=0.5, squared=True)(jaccard_similarity_score)
    score = jss(y_true, y_pred)
    assert score == approx(0.54756)

    pre = make_index_balanced_accuracy(
        alpha=0.5, squared=True)(precision_score)
    score = pre(y_true, y_pred)
    assert score == approx(0.65025)

    rec = make_index_balanced_accuracy(alpha=0.5, squared=True)(recall_score)
    score = rec(y_true, y_pred)
    assert score == approx(0.41616000000000009)
def test_iba_error_y_score_prob():
    y_true, y_pred, _ = make_prediction(binary=True)

    aps = make_index_balanced_accuracy(
        alpha=0.5, squared=True)(average_precision_score)
    with raises(AttributeError):
        aps(y_true, y_pred)

    brier = make_index_balanced_accuracy(
        alpha=0.5, squared=True)(brier_score_loss)
    with raises(AttributeError):
        brier(y_true, y_pred)

    kappa = make_index_balanced_accuracy(
        alpha=0.5, squared=True)(cohen_kappa_score)
    with raises(AttributeError):
        kappa(y_true, y_pred)

    ras = make_index_balanced_accuracy(alpha=0.5, squared=True)(roc_auc_score)
    with raises(AttributeError):
        ras(y_true, y_pred)
Example #12
def test_iba_error_y_score_prob():
    y_true, y_pred, _ = make_prediction(binary=True)

    aps = make_index_balanced_accuracy(alpha=0.5,
                                       squared=True)(average_precision_score)
    with raises(AttributeError):
        aps(y_true, y_pred)

    brier = make_index_balanced_accuracy(alpha=0.5,
                                         squared=True)(brier_score_loss)
    with raises(AttributeError):
        brier(y_true, y_pred)

    kappa = make_index_balanced_accuracy(alpha=0.5,
                                         squared=True)(cohen_kappa_score)
    with raises(AttributeError):
        kappa(y_true, y_pred)

    ras = make_index_balanced_accuracy(alpha=0.5, squared=True)(roc_auc_score)
    with raises(AttributeError):
        ras(y_true, y_pred)
Example #13
    def plot_learning_curve(self,
                            estimator,
                            title,
                            X,
                            y,
                            train_sizes=np.linspace(0.1, 1.0, 5)):
        """
        Generate the training and test learning curves.
        """
        _, ax = plt.subplots(1, 1, figsize=(8, 6))

        ax.set_title(title)
        ax.set_xlabel("Training examples")
        ax.set_ylabel("Score")
        train_sizes, train_scores, test_scores = learning_curve(
            estimator,
            X,
            y,
            train_sizes=train_sizes,
            scoring=make_scorer(
                make_index_balanced_accuracy()(geometric_mean_score)),
            verbose=1,
        )
        pd.DataFrame({
            "train_size":
            np.array([[size] * train_scores.shape[1]
                      for size in train_sizes]).reshape(-1),
            "train_score":
            train_scores.reshape(-1),
            "test_score":
            test_scores.reshape(-1),
        }).to_csv(
            self._file_path /
            Path(f"../data/results/{title.replace(' ', '_')}_values.csv"))
        train_scores_mean = np.mean(train_scores, axis=1)
        train_scores_std = np.std(train_scores, axis=1)
        test_scores_mean = np.mean(test_scores, axis=1)
        test_scores_std = np.std(test_scores, axis=1)

        # Plot learning curve
        ax.grid()
        ax.fill_between(
            train_sizes,
            train_scores_mean - train_scores_std,
            train_scores_mean + train_scores_std,
            alpha=0.1,
            color="r",
        )
        ax.fill_between(
            train_sizes,
            test_scores_mean - test_scores_std,
            test_scores_mean + test_scores_std,
            alpha=0.1,
            color="g",
        )
        ax.plot(train_sizes,
                train_scores_mean,
                "o-",
                color="r",
                label="Training score")
        ax.plot(
            train_sizes,
            test_scores_mean,
            "o-",
            color="g",
            label="Cross-validation score",
        )
        ax.legend(loc="best")

        plt.tight_layout()
        plt.savefig(self._file_path /
                    Path(f"../data/results/{title.replace(' ', '_')}.pdf"))
Example #14
    def train_relevance_scoring(self):
        X, y = self.loader.labeled_texts()
        X_train, X_test, y_train, y_test = self.train_test_split(X, y)

        # BOW models
        grid_search_parameters = {
            "tfidf__ngram_range": [(1, 1), (1, 3)],
            "tfidf__use_idf": (True, False),
        }

        for model_name, model in [
            ("complement", ComplementNB),
            ("multinomial", MultinomialNB),
        ]:

            pipeline = Pipeline([
                ("norm", TextNormalizer()),
                (
                    "tfidf",
                    TfidfVectorizer(tokenizer=self._identity,
                                    preprocessor=None,
                                    lowercase=False),
                ),
                ("clf", model(alpha=0.001)),
            ])

            gs_model = GridSearchCV(
                pipeline,
                grid_search_parameters,
                scoring=make_scorer(
                    make_index_balanced_accuracy()(geometric_mean_score)),
                verbose=2,
            )

            start_time = time.time()
            gs_model = gs_model.fit(X_train, y_train)
            training_time = f"{int(time.time()-start_time)/60:.1f}"
            best_params = gs_model.best_params_

            predicted = gs_model.predict(X_test)
            report = classification_report_imbalanced(y_test, predicted)

            with open(
                    self._file_path /
                    Path(f"../data/results/relevance_{model_name}_report.txt"),
                    "w",
            ) as f:
                f.write(
                    str(best_params) + "\n\n" + report + "\n\n" +
                    f"training time: {training_time} min.")

            self.plot_confusion_matrix(
                confusion_matrix(y_test, predicted),
                ["irrelevant", "relevant"],
                f"Confusion matrix, relevance scoring, {model_name} NBC",
            )

            self.plot_learning_curve(
                pipeline.set_params(**best_params),
                f"Learning curve, relevance scoring, {model_name} NBC",
                X,
                y,
            )

            with open(
                    self._file_path /
                    Path(f"../data/results/relevance_{model_name}_model.pickle"
                         ),
                    "wb",
            ) as f:
                pickle.dump(gs_model.best_estimator_, f)

        # Embedding models without ADASYN
        embedder = MeanDocumentEmbedder()

        X_embedded = np.array(list(embedder.transform(X)))
        (X_train_embedded, X_test_embedded, y_train,
         y_test) = self.train_test_split(X_embedded, y)

        for model_name, model in [
            ("logistic regression", LogisticRegression()),
            ("k-nearest neighbors", KNeighborsClassifier()),
            ("support vector classifier", SVC()),
            ("multi layer perceptron", MLPClassifier()),
        ]:
            start_time = time.time()
            model.fit(X_train_embedded, y_train)
            training_time = f"{int(time.time()-start_time)/60:.1f}"
            predicted = model.predict(X_test_embedded)
            report = classification_report_imbalanced(y_test, predicted)

            with open(
                    self._file_path / Path(
                        f"../data/results/relevance_no_adasyn_{model_name}_report.txt"
                    ),
                    "w",
            ) as f:
                f.write(report + "\n\n" +
                        f"training time: {training_time} min.")
            self.plot_confusion_matrix(
                confusion_matrix(y_test, predicted),
                ["irrelevant", "relevant"],
                f"Confusion matrix (no ADASYN), relevance scoring, {model_name}",
            )

        # Embedding models with ADASYN
        adasyn = ADASYN(random_state=13353)
        X_resample, y_resample = adasyn.fit_sample(X_train_embedded, y_train)
        for model_name, model in [
            ("logistic regression", LogisticRegression),
            ("k-nearest neighbors", KNeighborsClassifier),
            ("support vector classifier", SVC),
            ("multi layer perceptron", MLPClassifier),
        ]:
            if model_name == "support vector classifier":
                clf = model(probability=True)
            else:
                clf = model()
            start_time = time.time()
            clf = clf.fit(X_resample, y_resample)
            training_time = f"{int(time.time()-start_time)/60:.1f}"
            predicted = clf.predict(X_test_embedded)
            report = classification_report_imbalanced(y_test, predicted)

            with open(
                    self._file_path /
                    Path(f"../data/results/relevance_{model_name}_report.txt"),
                    "w",
            ) as f:
                f.write(report + "\n\n" +
                        f"training time: {training_time} min.")

            with open(
                    self._file_path /
                    Path(f"../data/results/relevance_{model_name}_model.pickle"
                         ),
                    "wb",
            ) as f:
                pickle.dump(clf, f)

            self.plot_confusion_matrix(
                confusion_matrix(y_test, predicted),
                ["irrelevant", "relevant"],
                f"Confusion matrix, relevance scoring, {model_name}",
            )

            self.plot_learning_curve(
                model(),
                f"Learning curve, relevance scoring, {model_name}",
                X_resample,
                y_resample,
            )
Example #15
    def _train_key_entity_classification(self, X, y, entity):
        X_train, X_test, y_train, y_test = self.train_test_split(X,
                                                                 y,
                                                                 stratify=y)

        grid_search_parameters = {
            "tfidf__ngram_range": [(1, 1), (1, 3), (1, 4)],
            "tfidf__use_idf": (True, False),
            "clf__alpha": (0.01, 0.001),
        }

        for model_name, model in [
            ("Bernoulli", BernoulliNB),
            ("multinomial", MultinomialNB),
        ]:

            pipeline = Pipeline([
                ("norm", TextNormalizer()),
                (
                    "tfidf",
                    TfidfVectorizer(tokenizer=self._identity,
                                    preprocessor=None,
                                    lowercase=False),
                ),
                ("clf", model()),
            ])

            gs_model = GridSearchCV(
                pipeline,
                grid_search_parameters,
                scoring=make_scorer(
                    make_index_balanced_accuracy()(geometric_mean_score)),
                verbose=2,
            )

            start_time = time.time()
            gs_model = gs_model.fit(X_train, y_train)
            training_time = f"{int(time.time()-start_time)/60:.1f}"
            best_params = gs_model.best_params_

            with open(
                    self._file_path / Path(
                        f"../data/results/{entity}_{model_name}_model.pickle"),
                    "wb",
            ) as f:
                pickle.dump(gs_model.best_estimator_, f)

            predicted = gs_model.predict(X_test)
            report = classification_report_imbalanced(y_test, predicted)

            with open(
                    self._file_path /
                    Path(f"../data/results/{entity}_{model_name}_report.txt"),
                    "w",
            ) as f:
                f.write(
                    str(best_params) + "\n\n" + report + "\n\n" +
                    f"training time: {training_time} min.")

            self.plot_confusion_matrix(
                confusion_matrix(y_test, predicted),
                ["not key", "is key"],
                f"Confusion matrix, {entity} key entity, {model_name} NBC",
            )

            self.plot_learning_curve(
                pipeline.set_params(**best_params),
                f"Learning curve, {entity} key entity, {model_name} NBC",
                X,
                y,
            )
def test_iba_error_y_score_prob_error(score_loss):
    y_true, y_pred, _ = make_prediction(binary=True)

    aps = make_index_balanced_accuracy(alpha=0.5, squared=True)(score_loss)
    with pytest.raises(AttributeError):
        aps(y_true, y_pred)
Example #17
class Trainer:
    loader = DataLoader()
    iba = make_index_balanced_accuracy()(geometric_mean_score)
    train_test_split = partial(train_test_split,
                               test_size=0.25,
                               random_state=13353)
    _file_path = Path(__file__).parent.resolve()

    def train_relevance_scoring(self):
        X, y = self.loader.labeled_texts()
        X_train, X_test, y_train, y_test = self.train_test_split(X, y)

        # BOW models
        grid_search_parameters = {
            "tfidf__ngram_range": [(1, 1), (1, 3)],
            "tfidf__use_idf": (True, False),
        }

        for model_name, model in [
            ("complement", ComplementNB),
            ("multinomial", MultinomialNB),
        ]:

            pipeline = Pipeline([
                ("norm", TextNormalizer()),
                (
                    "tfidf",
                    TfidfVectorizer(tokenizer=self._identity,
                                    preprocessor=None,
                                    lowercase=False),
                ),
                ("clf", model(alpha=0.001)),
            ])

            gs_model = GridSearchCV(
                pipeline,
                grid_search_parameters,
                scoring=make_scorer(
                    make_index_balanced_accuracy()(geometric_mean_score)),
                verbose=2,
            )

            start_time = time.time()
            gs_model = gs_model.fit(X_train, y_train)
            training_time = f"{int(time.time()-start_time)/60:.1f}"
            best_params = gs_model.best_params_

            predicted = gs_model.predict(X_test)
            report = classification_report_imbalanced(y_test, predicted)

            with open(
                    self._file_path /
                    Path(f"../data/results/relevance_{model_name}_report.txt"),
                    "w",
            ) as f:
                f.write(
                    str(best_params) + "\n\n" + report + "\n\n" +
                    f"training time: {training_time} min.")

            self.plot_confusion_matrix(
                confusion_matrix(y_test, predicted),
                ["irrelevant", "relevant"],
                f"Confusion matrix, relevance scoring, {model_name} NBC",
            )

            self.plot_learning_curve(
                pipeline.set_params(**best_params),
                f"Learning curve, relevance scoring, {model_name} NBC",
                X,
                y,
            )

            with open(
                    self._file_path /
                    Path(f"../data/results/relevance_{model_name}_model.pickle"
                         ),
                    "wb",
            ) as f:
                pickle.dump(gs_model.best_estimator_, f)

        # Embedding models without ADASYN
        embedder = MeanDocumentEmbedder()

        X_embedded = np.array(list(embedder.transform(X)))
        (X_train_embedded, X_test_embedded, y_train,
         y_test) = self.train_test_split(X_embedded, y)

        for model_name, model in [
            ("logistic regression", LogisticRegression()),
            ("k-nearest neighbors", KNeighborsClassifier()),
            ("support vector classifier", SVC()),
            ("multi layer perceptron", MLPClassifier()),
        ]:
            start_time = time.time()
            model.fit(X_train_embedded, y_train)
            training_time = f"{int(time.time()-start_time)/60:.1f}"
            predicted = model.predict(X_test_embedded)
            report = classification_report_imbalanced(y_test, predicted)

            with open(
                    self._file_path / Path(
                        f"../data/results/relevance_no_adasyn_{model_name}_report.txt"
                    ),
                    "w",
            ) as f:
                f.write(report + "\n\n" +
                        f"training time: {training_time} min.")
            self.plot_confusion_matrix(
                confusion_matrix(y_test, predicted),
                ["irrelevant", "relevant"],
                f"Confusion matrix (no ADASYN), relevance scoring, {model_name}",
            )

        # Embedding models with ADASYN
        adasyn = ADASYN(random_state=13353)
        X_resample, y_resample = adasyn.fit_sample(X_train_embedded, y_train)
        for model_name, model in [
            ("logistic regression", LogisticRegression),
            ("k-nearest neighbors", KNeighborsClassifier),
            ("support vector classifier", SVC),
            ("multi layer perceptron", MLPClassifier),
        ]:
            if model_name == "support vector classifier":
                clf = model(probability=True)
            else:
                clf = model()
            start_time = time.time()
            clf = clf.fit(X_resample, y_resample)
            training_time = f"{int(time.time()-start_time)/60:.1f}"
            predicted = clf.predict(X_test_embedded)
            report = classification_report_imbalanced(y_test, predicted)

            with open(
                    self._file_path /
                    Path(f"../data/results/relevance_{model_name}_report.txt"),
                    "w",
            ) as f:
                f.write(report + "\n\n" +
                        f"training time: {training_time} min.")

            with open(
                    self._file_path /
                    Path(f"../data/results/relevance_{model_name}_model.pickle"
                         ),
                    "wb",
            ) as f:
                pickle.dump(clf, f)

            self.plot_confusion_matrix(
                confusion_matrix(y_test, predicted),
                ["irrelevant", "relevant"],
                f"Confusion matrix, relevance scoring, {model_name}",
            )

            self.plot_learning_curve(
                model(),
                f"Learning curve, relevance scoring, {model_name}",
                X_resample,
                y_resample,
            )

    def train_key_entity_classifications(self):
        X_date, y_date = self.loader.labeled_date_sentences()
        self._train_key_entity_classification(X_date, y_date, "date")

        X_count, y_count = self.loader.labeled_count_sentences()
        self._train_key_entity_classification(X_count, y_count, "count")

    def _train_key_entity_classification(self, X, y, entity):
        X_train, X_test, y_train, y_test = self.train_test_split(X,
                                                                 y,
                                                                 stratify=y)

        grid_search_parameters = {
            "tfidf__ngram_range": [(1, 1), (1, 3), (1, 4)],
            "tfidf__use_idf": (True, False),
            "clf__alpha": (0.01, 0.001),
        }

        for model_name, model in [
            ("Bernoulli", BernoulliNB),
            ("multinomial", MultinomialNB),
        ]:

            pipeline = Pipeline([
                ("norm", TextNormalizer()),
                (
                    "tfidf",
                    TfidfVectorizer(tokenizer=self._identity,
                                    preprocessor=None,
                                    lowercase=False),
                ),
                ("clf", model()),
            ])

            gs_model = GridSearchCV(
                pipeline,
                grid_search_parameters,
                scoring=make_scorer(
                    make_index_balanced_accuracy()(geometric_mean_score)),
                verbose=2,
            )

            start_time = time.time()
            gs_model = gs_model.fit(X_train, y_train)
            training_time = f"{int(time.time()-start_time)/60:.1f}"
            best_params = gs_model.best_params_

            with open(
                    self._file_path / Path(
                        f"../data/results/{entity}_{model_name}_model.pickle"),
                    "wb",
            ) as f:
                pickle.dump(gs_model.best_estimator_, f)

            predicted = gs_model.predict(X_test)
            report = classification_report_imbalanced(y_test, predicted)

            with open(
                    self._file_path /
                    Path(f"../data/results/{entity}_{model_name}_report.txt"),
                    "w",
            ) as f:
                f.write(
                    str(best_params) + "\n\n" + report + "\n\n" +
                    f"training time: {training_time} min.")

            self.plot_confusion_matrix(
                confusion_matrix(y_test, predicted),
                ["not key", "is key"],
                f"Confusion matrix, {entity} key entity, {model_name} NBC",
            )

            self.plot_learning_curve(
                pipeline.set_params(**best_params),
                f"Learning curve, {entity} key entity, {model_name} NBC",
                X,
                y,
            )

    def plot_confusion_matrix(
        self,
        cm,
        target_names,
        title,
    ):
        """
        Plot a sklearn confusion matrix (cm)

        Citation
        --------
        http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

        """

        misclass = 1 - np.trace(cm) / float(np.sum(cm))

        plt.figure(figsize=(8, 6))
        plt.imshow(cm, interpolation="nearest", cmap=plt.get_cmap("Blues"))
        plt.title(title)
        plt.colorbar()

        if target_names is not None:
            tick_marks = np.arange(len(target_names))
            plt.xticks(tick_marks, target_names, rotation=45)
            plt.yticks(tick_marks, target_names)

        thresh = cm.max() / 2
        for i, j in product(range(cm.shape[0]), range(cm.shape[1])):
            plt.text(
                j,
                i,
                "{:,}".format(cm[i, j]),
                horizontalalignment="right",
                color="white" if cm[i, j] > thresh else "black",
            )

        plt.ylabel("True label")
        plt.xlabel("Predicted label\nmisclass={:0.2f}".format(misclass))
        plt.tight_layout()
        plt.savefig(self._file_path /
                    Path(f"../data/results/{title.replace(' ', '_')}.pdf"))

    def plot_learning_curve(self,
                            estimator,
                            title,
                            X,
                            y,
                            train_sizes=np.linspace(0.1, 1.0, 5)):
        """
        Generate the training and test learning curves.
        """
        _, ax = plt.subplots(1, 1, figsize=(8, 6))

        ax.set_title(title)
        ax.set_xlabel("Training examples")
        ax.set_ylabel("Score")
        train_sizes, train_scores, test_scores = learning_curve(
            estimator,
            X,
            y,
            train_sizes=train_sizes,
            scoring=make_scorer(
                make_index_balanced_accuracy()(geometric_mean_score)),
            verbose=1,
        )
        pd.DataFrame({
            "train_size":
            np.array([[size] * train_scores.shape[1]
                      for size in train_sizes]).reshape(-1),
            "train_score":
            train_scores.reshape(-1),
            "test_score":
            test_scores.reshape(-1),
        }).to_csv(
            self._file_path /
            Path(f"../data/results/{title.replace(' ', '_')}_values.csv"))
        train_scores_mean = np.mean(train_scores, axis=1)
        train_scores_std = np.std(train_scores, axis=1)
        test_scores_mean = np.mean(test_scores, axis=1)
        test_scores_std = np.std(test_scores, axis=1)

        # Plot learning curve
        ax.grid()
        ax.fill_between(
            train_sizes,
            train_scores_mean - train_scores_std,
            train_scores_mean + train_scores_std,
            alpha=0.1,
            color="r",
        )
        ax.fill_between(
            train_sizes,
            test_scores_mean - test_scores_std,
            test_scores_mean + test_scores_std,
            alpha=0.1,
            color="g",
        )
        ax.plot(train_sizes,
                train_scores_mean,
                "o-",
                color="r",
                label="Training score")
        ax.plot(
            train_sizes,
            test_scores_mean,
            "o-",
            color="g",
            label="Cross-validation score",
        )
        ax.legend(loc="best")

        plt.tight_layout()
        plt.savefig(self._file_path /
                    Path(f"../data/results/{title.replace(' ', '_')}.pdf"))

    def _identity(self, text):
        return text
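
A hypothetical driver for the Trainer class above; it assumes the project-specific dependencies (DataLoader, TextNormalizer, MeanDocumentEmbedder, and the ../data/results directory) are available:

# Sketch only: runs both training entry points of the Trainer defined above.
if __name__ == "__main__":
    trainer = Trainer()
    trainer.train_relevance_scoring()
    trainer.train_key_entity_classifications()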
Example #18
def classificationReportDict(trueY,
                             predY,
                             labels=None,
                             targetNames=None,
                             sampleWeight=None,
                             alpha=0.1):
    report = dict()
    if labels is None:
        labels = unique_labels(trueY, predY)
    else:
        labels = np.asarray(labels)
    if targetNames is None:
        targetNames = [str(label) for label in labels]
    # Precision Recall F1 Support
    precision, recall, f1, support = \
        precision_recall_fscore_support(trueY, predY, labels=labels, average=None, sample_weight=sampleWeight)
    # Specificity
    specificity = specificity_score(trueY,
                                    predY,
                                    labels=labels,
                                    average=None,
                                    sample_weight=sampleWeight)
    # Geometric mean
    gMean = geometric_mean_score(trueY,
                                 predY,
                                 labels=labels,
                                 average=None,
                                 sample_weight=sampleWeight)
    # Index balanced accuracy
    ibaGMeanScore = make_index_balanced_accuracy(
        alpha=alpha, squared=True)(geometric_mean_score)
    ibaGMean = ibaGMeanScore(trueY,
                             predY,
                             labels=labels,
                             average=None,
                             sample_weight=sampleWeight)
    for i, label in enumerate(labels):
        targetName = targetNames[i]
        report[targetName] = {
            'Precision': precision[i],
            'Recall': recall[i],
            'F1': f1[i],
            'Specificity': specificity[i],
            'GMean': gMean[i],
            'IbaGMean': ibaGMean[i],
            'Support': support[i],
        }

    report['Weighted Avg'] = {
        'Precision': np.average(precision, weights=support),
        'Recall': np.average(recall, weights=support),
        'F1': np.average(f1, weights=support),
        'Specificity': np.average(specificity, weights=support),
        'GMean': np.average(gMean, weights=support),
        'IbaGMean': np.average(ibaGMean, weights=support),
        'Support': np.sum(support)
    }

    report['Macro Avg'] = {
        'Precision': np.average(precision),
        'Recall': np.average(recall),
        'F1': np.average(f1),
        'Specificity': np.average(specificity),
        'GMean': np.average(gMean),
        'IbaGMean': np.average(ibaGMean),
        'Support': np.sum(support)
    }

    # Accuracy
    accuracy = accuracy_score(trueY,
                              predY,
                              normalize=True,
                              sample_weight=sampleWeight)
    report['Accuracy'] = accuracy

    return report
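
A usage sketch for classificationReportDict with toy labels; it assumes the same imports as the function above (numpy plus the sklearn/imblearn metric helpers):

# Toy example; class labels are inferred with unique_labels when labels=None.
y_true = ["cat", "cat", "dog", "dog", "dog", "dog"]
y_pred = ["cat", "dog", "dog", "dog", "cat", "dog"]

report = classificationReportDict(y_true, y_pred)
print(report["cat"]["Recall"])             # per-class recall
print(report["Weighted Avg"]["IbaGMean"])  # support-weighted IBA
print(report["Accuracy"])
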
def test_iba_sklearn_metrics(score, expected_score):
    y_true, y_pred, _ = make_prediction(binary=True)

    score_iba = make_index_balanced_accuracy(alpha=0.5, squared=True)(score)
    score = score_iba(y_true, y_pred)
    assert score == pytest.approx(expected_score)
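
The (score, expected_score) pairs for this parametrized test are not shown in the snippet; a sketch of the decorator, reusing a subset of the metric/value pairs that appear explicitly in Example #9 above (treat the exact pairing as an assumption):

# Hypothetical parametrization; the values mirror the assertions in Example #9.
import pytest
from sklearn.metrics import accuracy_score, precision_score, recall_score


@pytest.mark.parametrize(
    "score, expected_score",
    [
        (accuracy_score, 0.54756),
        (precision_score, 0.65025),
        (recall_score, 0.41616000000000009),
    ],
)
def test_iba_sklearn_metrics(score, expected_score):
    ...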
Example #20
# The geometric mean corresponds to the square root of the product of the
# sensitivity and specificity. Combining the two metrics should account for
# the balancing of the dataset.

# %%
from imblearn.metrics import geometric_mean_score

print(f"The geometric mean is {geometric_mean_score(y_test, y_pred):.3f}")

# %% [markdown]
# The index balanced accuracy can transform any metric to be used in
# imbalanced learning problems.

# %%
from imblearn.metrics import make_index_balanced_accuracy

alpha = 0.1
geo_mean = make_index_balanced_accuracy(alpha=alpha,
                                        squared=True)(geometric_mean_score)

print(f"The IBA using alpha={alpha} and the geometric mean: "
      f"{geo_mean(y_test, y_pred):.3f}")

# %%
alpha = 0.5
geo_mean = make_index_balanced_accuracy(alpha=alpha,
                                        squared=True)(geometric_mean_score)

print(f"The IBA using alpha={alpha} and the geometric mean: "
      f"{geo_mean(y_test, y_pred):.3f}")
Example #21
R_TOL = 1e-2


@pytest.fixture
def data():
    X, y = make_blobs(random_state=0, centers=2)
    return train_test_split(X, y, random_state=0)


@pytest.mark.filterwarnings("ignore:Liblinear failed to converge")
@pytest.mark.parametrize(
    "score, expected_score",
    [(sensitivity_score, 0.92),
     (specificity_score, 0.92),
     (geometric_mean_score, 0.92),
     (make_index_balanced_accuracy()(geometric_mean_score), 0.85)]
)
@pytest.mark.parametrize("average",['macro', 'weighted', 'micro'])
def test_scorer_common_average(data, score, expected_score, average):
    X_train, X_test, y_train, _ = data

    scorer = make_scorer(score, pos_label=None, average=average)
    grid = GridSearchCV(LinearSVC(random_state=0), param_grid={'C': [1, 10]},
                        scoring=scorer, cv=3, iid=False)
    grid.fit(X_train, y_train).predict(X_test)

    assert grid.best_score_ == pytest.approx(expected_score, rel=R_TOL)


@pytest.fixture
def data():
    X, y = make_blobs(random_state=0, centers=2)
    return train_test_split(X, y, random_state=0)


@pytest.mark.filterwarnings("ignore:Liblinear failed to converge")
@pytest.mark.parametrize(
    "score, expected_score",
    [
        (sensitivity_score, 0.92),
        (specificity_score, 0.92),
        (geometric_mean_score, 0.92),
        (make_index_balanced_accuracy()(geometric_mean_score), 0.85),
    ],
)
@pytest.mark.parametrize("average", ["macro", "weighted", "micro"])
def test_scorer_common_average(data, score, expected_score, average):
    X_train, X_test, y_train, _ = data

    scorer = make_scorer(score, pos_label=None, average=average)
    grid = GridSearchCV(
        LinearSVC(random_state=0),
        param_grid={"C": [1, 10]},
        scoring=scorer,
        cv=3,
    )
    grid.fit(X_train, y_train).predict(X_test)

    assert grid.best_score_ == pytest.approx(expected_score, rel=R_TOL)
def test_imblearn_classification_scorers():
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = LinearSVC(random_state=0)
    clf.fit(X_train, y_train)

    # sensitivity scorer
    scorer = make_scorer(sensitivity_score, pos_label=None, average='macro')
    grid = GridSearchCV(LinearSVC(random_state=0), param_grid={'C': [1, 10]},
                        scoring=scorer)
    grid.fit(X_train, y_train).predict(X_test)
    assert_allclose(grid.best_score_, 0.92, rtol=R_TOL)

    scorer = make_scorer(sensitivity_score, pos_label=None, average='weighted')
    grid = GridSearchCV(LinearSVC(random_state=0), param_grid={'C': [1, 10]},
                        scoring=scorer)
    grid.fit(X_train, y_train).predict(X_test)
    assert_allclose(grid.best_score_, 0.92, rtol=R_TOL)

    scorer = make_scorer(sensitivity_score, pos_label=None, average='micro')
    grid = GridSearchCV(LinearSVC(random_state=0), param_grid={'C': [1, 10]},
                        scoring=scorer)
    grid.fit(X_train, y_train).predict(X_test)
    assert_allclose(grid.best_score_, 0.92, rtol=R_TOL)

    scorer = make_scorer(sensitivity_score, pos_label=1)
    grid = GridSearchCV(LinearSVC(random_state=0), param_grid={'C': [1, 10]},
                        scoring=scorer)
    grid.fit(X_train, y_train).predict(X_test)
    assert_allclose(grid.best_score_, 0.92, rtol=R_TOL)

    # specificity scorer
    scorer = make_scorer(specificity_score, pos_label=None, average='macro')
    grid = GridSearchCV(LinearSVC(random_state=0), param_grid={'C': [1, 10]},
                        scoring=scorer)
    grid.fit(X_train, y_train).predict(X_test)
    assert_allclose(grid.best_score_, 0.92, rtol=R_TOL)

    scorer = make_scorer(specificity_score, pos_label=None, average='weighted')
    grid = GridSearchCV(LinearSVC(random_state=0), param_grid={'C': [1, 10]},
                        scoring=scorer)
    grid.fit(X_train, y_train).predict(X_test)
    assert_allclose(grid.best_score_, 0.92, rtol=R_TOL)

    scorer = make_scorer(specificity_score, pos_label=None, average='micro')
    grid = GridSearchCV(LinearSVC(random_state=0), param_grid={'C': [1, 10]},
                        scoring=scorer)
    grid.fit(X_train, y_train).predict(X_test)
    assert_allclose(grid.best_score_, 0.92, rtol=R_TOL)

    scorer = make_scorer(specificity_score, pos_label=1)
    grid = GridSearchCV(LinearSVC(random_state=0), param_grid={'C': [1, 10]},
                        scoring=scorer)
    grid.fit(X_train, y_train).predict(X_test)
    assert_allclose(grid.best_score_, 0.95, rtol=R_TOL)

    # geometric_mean scorer
    scorer = make_scorer(geometric_mean_score, pos_label=None, average='macro')
    grid = GridSearchCV(LinearSVC(random_state=0), param_grid={'C': [1, 10]},
                        scoring=scorer)
    grid.fit(X_train, y_train).predict(X_test)
    assert_allclose(grid.best_score_, 0.92, rtol=R_TOL)

    scorer = make_scorer(
        geometric_mean_score, pos_label=None, average='weighted')
    grid = GridSearchCV(LinearSVC(random_state=0), param_grid={'C': [1, 10]},
                        scoring=scorer)
    grid.fit(X_train, y_train).predict(X_test)
    assert_allclose(grid.best_score_, 0.92, rtol=R_TOL)

    scorer = make_scorer(geometric_mean_score, pos_label=None, average='micro')
    grid = GridSearchCV(LinearSVC(random_state=0), param_grid={'C': [1, 10]},
                        scoring=scorer)
    grid.fit(X_train, y_train).predict(X_test)
    assert_allclose(grid.best_score_, 0.92, rtol=R_TOL)

    scorer = make_scorer(geometric_mean_score, pos_label=1)
    grid = GridSearchCV(LinearSVC(random_state=0), param_grid={'C': [1, 10]},
                        scoring=scorer)
    grid.fit(X_train, y_train).predict(X_test)
    assert_allclose(grid.best_score_, 0.92, rtol=R_TOL)

    # make an IBA metric before wrapping it in a scorer
    geo_mean_iba = make_index_balanced_accuracy()(geometric_mean_score)
    scorer = make_scorer(geo_mean_iba, pos_label=None, average='macro')
    grid = GridSearchCV(LinearSVC(random_state=0), param_grid={'C': [1, 10]},
                        scoring=scorer)
    grid.fit(X_train, y_train).predict(X_test)
    assert_allclose(grid.best_score_, 0.85, rtol=R_TOL)

    scorer = make_scorer(geo_mean_iba, pos_label=None, average='weighted')
    grid = GridSearchCV(LinearSVC(random_state=0), param_grid={'C': [1, 10]},
                        scoring=scorer)
    grid.fit(X_train, y_train).predict(X_test)
    assert_allclose(grid.best_score_, 0.85, rtol=R_TOL)

    scorer = make_scorer(geo_mean_iba, pos_label=None, average='micro')
    grid = GridSearchCV(LinearSVC(random_state=0), param_grid={'C': [1, 10]},
                        scoring=scorer)
    grid.fit(X_train, y_train).predict(X_test)
    assert_allclose(grid.best_score_, 0.85, rtol=R_TOL)

    scorer = make_scorer(geo_mean_iba, pos_label=1)
    grid = GridSearchCV(LinearSVC(random_state=0), param_grid={'C': [1, 10]},
                        scoring=scorer)
    grid.fit(X_train, y_train).predict(X_test)
    assert_allclose(grid.best_score_, 0.84, rtol=R_TOL)
Example #24
###############################################################################
# The geometric mean corresponds to the square root of the product of the
# sensitivity and specificity. Combining the two metrics should account for
# the balancing of the dataset.

print('The geometric mean is {}'.format(geometric_mean_score(
    y_test,
    y_pred_bal)))

###############################################################################
# The index balanced accuracy can transform any metric to be used in
# imbalanced learning problems.

alpha = 0.1
geo_mean = make_index_balanced_accuracy(alpha=alpha, squared=True)(
    geometric_mean_score)

print('The IBA using alpha = {} and the geometric mean: {}'.format(
    alpha, geo_mean(
        y_test,
        y_pred_bal)))

alpha = 0.5
geo_mean = make_index_balanced_accuracy(alpha=alpha, squared=True)(
    geometric_mean_score)

print('The IBA using alpha = {} and the geometric mean: {}'.format(
    alpha, geo_mean(
        y_test,
        y_pred_bal)))
def test_imblearn_classification_scorers():
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = LinearSVC(random_state=0)
    clf.fit(X_train, y_train)

    # sensitivity scorer
    scorer = make_scorer(sensitivity_score, pos_label=None, average='macro')
    grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer)
    grid.fit(X_train, y_train).predict(X_test)
    assert_allclose(grid.best_score_, 0.92, rtol=R_TOL)

    scorer = make_scorer(sensitivity_score, pos_label=None, average='weighted')
    grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer)
    grid.fit(X_train, y_train).predict(X_test)
    assert_allclose(grid.best_score_, 0.92, rtol=R_TOL)

    scorer = make_scorer(sensitivity_score, pos_label=None, average='micro')
    grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer)
    grid.fit(X_train, y_train).predict(X_test)
    assert_allclose(grid.best_score_, 0.92, rtol=R_TOL)

    scorer = make_scorer(sensitivity_score, pos_label=1)
    grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer)
    grid.fit(X_train, y_train).predict(X_test)
    assert_allclose(grid.best_score_, 0.92, rtol=R_TOL)

    # specificity scorer
    scorer = make_scorer(specificity_score, pos_label=None, average='macro')
    grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer)
    grid.fit(X_train, y_train).predict(X_test)
    assert_allclose(grid.best_score_, 0.92, rtol=R_TOL)

    scorer = make_scorer(specificity_score, pos_label=None, average='weighted')
    grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer)
    grid.fit(X_train, y_train).predict(X_test)
    assert_allclose(grid.best_score_, 0.92, rtol=R_TOL)

    scorer = make_scorer(specificity_score, pos_label=None, average='micro')
    grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer)
    grid.fit(X_train, y_train).predict(X_test)
    assert_allclose(grid.best_score_, 0.92, rtol=R_TOL)

    scorer = make_scorer(specificity_score, pos_label=1)
    grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer)
    grid.fit(X_train, y_train).predict(X_test)
    assert_allclose(grid.best_score_, 0.95, rtol=R_TOL)

    # geometric_mean scorer
    scorer = make_scorer(geometric_mean_score, pos_label=None, average='macro')
    grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer)
    grid.fit(X_train, y_train).predict(X_test)
    assert_allclose(grid.best_score_, 0.92, rtol=R_TOL)

    scorer = make_scorer(geometric_mean_score,
                         pos_label=None,
                         average='weighted')
    grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer)
    grid.fit(X_train, y_train).predict(X_test)
    assert_allclose(grid.best_score_, 0.92, rtol=R_TOL)

    scorer = make_scorer(geometric_mean_score, pos_label=None, average='micro')
    grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer)
    grid.fit(X_train, y_train).predict(X_test)
    assert_allclose(grid.best_score_, 0.92, rtol=R_TOL)

    scorer = make_scorer(geometric_mean_score, pos_label=1)
    grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer)
    grid.fit(X_train, y_train).predict(X_test)
    assert_allclose(grid.best_score_, 0.92, rtol=R_TOL)

    # make an IBA metric before wrapping it in a scorer
    geo_mean_iba = make_index_balanced_accuracy()(geometric_mean_score)
    scorer = make_scorer(geo_mean_iba, pos_label=None, average='macro')
    grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer)
    grid.fit(X_train, y_train).predict(X_test)
    assert_allclose(grid.best_score_, 0.85, rtol=R_TOL)

    scorer = make_scorer(geo_mean_iba, pos_label=None, average='weighted')
    grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer)
    grid.fit(X_train, y_train).predict(X_test)
    assert_allclose(grid.best_score_, 0.85, rtol=R_TOL)

    scorer = make_scorer(geo_mean_iba, pos_label=None, average='micro')
    grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer)
    grid.fit(X_train, y_train).predict(X_test)
    assert_allclose(grid.best_score_, 0.85, rtol=R_TOL)

    scorer = make_scorer(geo_mean_iba, pos_label=1)
    grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer)
    grid.fit(X_train, y_train).predict(X_test)
    assert_allclose(grid.best_score_, 0.84, rtol=R_TOL)