Python make_index_balanced_accuracy示例，imblearn.metrics.make_index_balanced_accuracy Python示例

示例#1

0

显示文件

文件： test_classification.py 项目： chkoar/imbalanced-learn

def test_iba_error_y_score_prob_error(score_loss):
    """Expect ``AttributeError`` when the IBA-wrapped ``score_loss`` is called.

    ``score_loss`` is injected by the test runner (its source is not visible
    in this snippet); it is wrapped with ``alpha=0.5`` and ``squared=True``.
    """
    targets, predictions, _ = make_prediction(binary=True)

    iba_decorator = make_index_balanced_accuracy(alpha=0.5, squared=True)
    wrapped_metric = iba_decorator(score_loss)
    with pytest.raises(AttributeError):
        wrapped_metric(targets, predictions)

示例#2

0

显示文件

文件： model.py 项目： mozilla/bugbug

def classification_report_imbalanced_values(
    y_true, y_pred, labels, target_names=None, sample_weight=None, digits=2, alpha=0.1
):
    """Return the raw numbers behind an imbalanced classification report.

    Adapted from ``imblearn.metrics.classification_report_imbalanced``
    (version 0.4.3). The metric computation is mostly the same; the text
    formatting and report generation have been removed so the values can be
    consumed directly. The original code lives here:
    https://github.com/scikit-learn-contrib/imbalanced-learn/blob/b861b3a8e3414c52f40a953f2e0feca5b32e7460/imblearn/metrics/_classification.py#L790

    ``target_names`` defaults to ``str(label)`` for every entry of ``labels``.
    ``alpha`` is forwarded to ``make_index_balanced_accuracy``. ``digits`` is
    accepted but not used by this function.

    The returned dict has two keys: ``"targets"`` maps each target name to its
    precision, recall, specificity, f1, geo_mean, iba and support, and
    ``"average"`` holds the support-weighted average of each metric with
    ``"support"`` being the summed support.
    """
    labels = np.asarray(labels)
    names = (
        target_names
        if target_names is not None
        else [str(label) for label in labels]
    )

    # Every metric below is evaluated per class with the same settings.
    shared_kwargs = {
        "labels": labels,
        "average": None,
        "sample_weight": sample_weight,
    }

    precision, recall, f1, support = precision_recall_fscore_support(
        y_true, y_pred, **shared_kwargs
    )
    specificity = specificity_score(y_true, y_pred, **shared_kwargs)
    geo_mean = geometric_mean_score(y_true, y_pred, **shared_kwargs)

    # Index balanced accuracy built on top of the geometric mean.
    iba_decorator = make_index_balanced_accuracy(alpha=alpha, squared=True)
    iba = iba_decorator(geometric_mean_score)(y_true, y_pred, **shared_kwargs)

    # Insertion order fixes the key order of the emitted dictionaries.
    per_class = {
        "precision": precision,
        "recall": recall,
        "specificity": specificity,
        "f1": f1,
        "geo_mean": geo_mean,
        "iba": iba,
        "support": support,
    }

    targets = {}
    for position, _ in enumerate(labels):
        targets[names[position]] = {
            metric: values[position] for metric, values in per_class.items()
        }

    # Support is summed, every other metric is weighted by support.
    average = {
        metric: np.average(values, weights=support)
        for metric, values in per_class.items()
        if metric != "support"
    }
    average["support"] = np.sum(support)

    return {"targets": targets, "average": average}

示例#3

0

显示文件

def test_iba_error_y_score_prob():
    """Each listed metric, once wrapped with IBA, must raise AttributeError."""
    targets, predictions, _ = make_prediction(binary=True)

    rejected_metrics = (
        average_precision_score,
        brier_score_loss,
        cohen_kappa_score,
        roc_auc_score,
    )
    for metric in rejected_metrics:
        wrapped = make_index_balanced_accuracy(alpha=0.5,
                                               squared=True)(metric)
        assert_raises(AttributeError, wrapped, targets, predictions)

示例#4

0

显示文件

文件： test_classification.py 项目： vishalbelsare/imbalanced-learn

def test_iba_geo_mean_binary():
    """Check the IBA-wrapped geometric mean on ``make_prediction(binary=True)``."""
    targets, predictions, _ = make_prediction(binary=True)

    decorate_with_iba = make_index_balanced_accuracy(alpha=0.5, squared=True)
    score = decorate_with_iba(geometric_mean_score)(targets, predictions)

    assert_allclose(score, 0.5948, rtol=R_TOL)

示例#5

0

显示文件

文件： test_classification.py 项目： chkoar/imbalanced-learn

def test_iba_geo_mean_binary():
    """The IBA of the geometric mean must be close to 0.5948 (``rtol=R_TOL``)."""
    truth, guess, _ = make_prediction(binary=True)

    wrapped_gmean = make_index_balanced_accuracy(alpha=0.5, squared=True)(
        geometric_mean_score
    )
    result = wrapped_gmean(truth, guess)

    assert_allclose(result, 0.5948, rtol=R_TOL)

示例#6

0

显示文件

def test_iba_geo_mean_binary():
    """Check the IBA computed from the geometric mean on binary predictions."""
    targets, predictions, _ = make_prediction(binary=True)

    iba_decorator = make_index_balanced_accuracy(alpha=0.5, squared=True)
    wrapped_gmean = iba_decorator(geometric_mean_score)
    score = wrapped_gmean(targets, predictions)

    # Compared to two decimal places.
    assert_almost_equal(score, 0.54, 2)

示例#7

0

显示文件

文件： frame_classifier_extra_features_with_filtering.py 项目： tradr-project/TRADR_frame_classifiers

def flat_iba(preds, labels):
  """Return the support-weighted index balanced accuracy of the geometric mean.

  ``preds`` is reduced with ``np.argmax`` along axis 1 to obtain one predicted
  class per row; ``labels`` is flattened. The per-class IBA
  (``alpha=0.1``, ``squared=True``) is then averaged using the per-class
  support returned by ``precision_recall_fscore_support`` as weights.

  The original also computed the plain geometric mean and discarded it.
  That call is dropped; the ``iba_gmean`` call below passes
  ``geometric_mean_score`` the same arguments.
  """
  pred_flat = np.argmax(preds, axis=1).flatten()
  labels_flat = labels.flatten()
  iba_gmean = make_index_balanced_accuracy(alpha=0.1, squared=True)(geometric_mean_score)
  iba = iba_gmean(labels_flat, pred_flat, average=None, sample_weight=None)
  # Only the support (4th return value) is needed for the weighting.
  _, _, _, support = precision_recall_fscore_support(labels_flat, pred_flat, average=None, sample_weight=None)
  return np.average(iba, weights=support)

示例#8

0

显示文件

def test_iba_error_y_score_prob():
    """Check that an error is raised when a scoring metric takes parameters
    other than y_pred"""
    truth, guess, _ = make_prediction(binary=True)

    for scorer in (average_precision_score, brier_score_loss,
                   cohen_kappa_score, roc_auc_score):
        iba_scorer = make_index_balanced_accuracy(alpha=0.5,
                                                  squared=True)(scorer)
        assert_raises(AttributeError, iba_scorer, truth, guess)

示例#9

0

显示文件

def test_iba_sklearn_metrics():
    """Compare IBA-wrapped scikit-learn metrics against fixed reference values."""
    targets, predictions, _ = make_prediction(binary=True)

    # (metric, expected IBA score) pairs, checked in this order.
    expectations = (
        (accuracy_score, 0.54756),
        (jaccard_similarity_score, 0.54756),
        (precision_score, 0.65025),
        (recall_score, 0.41616000000000009),
    )
    for metric, expected in expectations:
        wrapped = make_index_balanced_accuracy(alpha=0.5,
                                               squared=True)(metric)
        assert_equal(wrapped(targets, predictions), expected)

示例#10

0

显示文件

文件： test_classification.py 项目： bodycat/imbalanced-learn

def test_iba_sklearn_metrics():
    """IBA-wrapped scikit-learn metrics must approximately match references."""
    truth, guess, _ = make_prediction(binary=True)

    # Reference value for each wrapped metric, evaluated in this order.
    reference_scores = [
        (accuracy_score, 0.54756),
        (jaccard_similarity_score, 0.54756),
        (precision_score, 0.65025),
        (recall_score, 0.41616000000000009),
    ]
    for metric, reference in reference_scores:
        iba_metric = make_index_balanced_accuracy(
            alpha=0.5, squared=True)(metric)
        score = iba_metric(truth, guess)
        assert score == approx(reference)

示例#11

0

显示文件

文件： test_classification.py 项目： bodycat/imbalanced-learn

def test_iba_error_y_score_prob():
    """Calling any of the wrapped metrics below must raise AttributeError."""
    targets, predictions, _ = make_prediction(binary=True)

    unsupported = (
        average_precision_score,
        brier_score_loss,
        cohen_kappa_score,
        roc_auc_score,
    )
    for metric in unsupported:
        wrapped = make_index_balanced_accuracy(
            alpha=0.5, squared=True)(metric)
        with raises(AttributeError):
            wrapped(targets, predictions)

示例#12

0

显示文件

文件： test_classification.py 项目： zbn123/imbalanced-learn

def test_iba_error_y_score_prob():
    """Wrapping these metrics with IBA and calling them raises AttributeError."""
    truth, guess, _ = make_prediction(binary=True)

    for scorer in (average_precision_score, brier_score_loss,
                   cohen_kappa_score, roc_auc_score):
        iba_scorer = make_index_balanced_accuracy(alpha=0.5,
                                                  squared=True)(scorer)
        with raises(AttributeError):
            iba_scorer(truth, guess)

示例#13

0

显示文件

文件： classification.py 项目： aauss/EventEpi

    def plot_learning_curve(self,
                            estimator,
                            title,
                            X,
                            y,
                            train_sizes=np.linspace(0.1, 1.0, 5)):
        """
        Generate test and training learning curve.

        Runs ``learning_curve`` for ``estimator`` on ``X``/``y``, scored with
        the index balanced accuracy of the geometric mean. Two files are
        written below ``self._file_path``, with spaces in ``title`` replaced
        by underscores:

        * ``../data/results/<title>_values.csv`` -- one row per
          (train size, fold) with the train and test score;
        * ``../data/results/<title>.pdf`` -- mean score per train size with a
          one-standard-deviation band for training and cross-validation.

        The figure is closed once it has been saved, so repeated calls do
        not accumulate open matplotlib figures.
        """
        fig, ax = plt.subplots(1, 1, figsize=(8, 6))

        ax.set_title(title)
        ax.set_xlabel("Training examples")
        ax.set_ylabel("Score")
        # `train_sizes` is rebound to the sizes returned by learning_curve.
        train_sizes, train_scores, test_scores = learning_curve(
            estimator,
            X,
            y,
            train_sizes=train_sizes,
            scoring=make_scorer(
                make_index_balanced_accuracy()(geometric_mean_score)),
            verbose=1,
        )
        # Flatten the (size, fold) score matrices into long format; every
        # train size is repeated once per column of `train_scores`.
        pd.DataFrame({
            "train_size":
            np.array([[size] * train_scores.shape[1]
                      for size in train_sizes]).reshape(-1),
            "train_score":
            train_scores.reshape(-1),
            "test_score":
            test_scores.reshape(-1),
        }).to_csv(
            self._file_path /
            Path(f"../data/results/{title.replace(' ', '_')}_values.csv"))
        train_scores_mean = np.mean(train_scores, axis=1)
        train_scores_std = np.std(train_scores, axis=1)
        test_scores_mean = np.mean(test_scores, axis=1)
        test_scores_std = np.std(test_scores, axis=1)

        # Plot learning curve
        ax.grid()
        ax.fill_between(
            train_sizes,
            train_scores_mean - train_scores_std,
            train_scores_mean + train_scores_std,
            alpha=0.1,
            color="r",
        )
        ax.fill_between(
            train_sizes,
            test_scores_mean - test_scores_std,
            test_scores_mean + test_scores_std,
            alpha=0.1,
            color="g",
        )
        ax.plot(train_sizes,
                train_scores_mean,
                "o-",
                color="r",
                label="Training score")
        ax.plot(
            train_sizes,
            test_scores_mean,
            "o-",
            color="g",
            label="Cross-validation score",
        )
        ax.legend(loc="best")

        fig.tight_layout()
        fig.savefig(self._file_path /
                    Path(f"../data/results/{title.replace(' ', '_')}.pdf"))
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)

示例#14

0

显示文件

文件： classification.py 项目： aauss/EventEpi

    def train_relevance_scoring(self):
        """Train, evaluate and persist the relevance-scoring classifiers.

        Three families of models are fitted on the labeled texts returned by
        ``self.loader.labeled_texts()``:

        1. bag-of-words naive Bayes pipelines tuned with a grid search;
        2. classifiers on mean document embeddings, without resampling;
        3. the same classifier types on ADASYN-resampled embeddings.

        Reports, pickled models and plots are written below
        ``../data/results/`` relative to ``self._file_path``. Nothing is
        returned.
        """
        X, y = self.loader.labeled_texts()
        X_train, X_test, y_train, y_test = self.train_test_split(X, y)

        # BOW models
        grid_search_parameters = {
            "tfidf__ngram_range": [(1, 1), (1, 3)],
            "tfidf__use_idf": (True, False),
        }

        for model_name, model in [
            ("complement", ComplementNB),
            ("multinomial", MultinomialNB),
        ]:

            # The vectorizer receives pre-tokenized input: the tokenizer is
            # `self._identity` and preprocessing/lowercasing are disabled.
            pipeline = Pipeline([
                ("norm", TextNormalizer()),
                (
                    "tfidf",
                    TfidfVectorizer(tokenizer=self._identity,
                                    preprocessor=None,
                                    lowercase=False),
                ),
                ("clf", model(alpha=0.001)),
            ])

            # Model selection is scored with the IBA of the geometric mean.
            gs_model = GridSearchCV(
                pipeline,
                grid_search_parameters,
                scoring=make_scorer(
                    make_index_balanced_accuracy()(geometric_mean_score)),
                verbose=2,
            )

            start_time = time.time()
            gs_model = gs_model.fit(X_train, y_train)
            # Elapsed whole seconds converted to minutes, one decimal place.
            training_time = f"{int(time.time()-start_time)/60:.1f}"
            best_params = gs_model.best_params_

            predicted = gs_model.predict(X_test)
            report = classification_report_imbalanced(y_test, predicted)

            with open(
                    self._file_path /
                    Path(f"../data/results/relevance_{model_name}_report.txt"),
                    "w",
            ) as f:
                f.write(
                    str(best_params) + "\n\n" + report + "\n\n" +
                    f"training time: {training_time} min.")

            self.plot_confusion_matrix(
                confusion_matrix(y_test, predicted),
                ["irrelevant", "relevant"],
                f"Confusion matrix, relevance scoring, {model_name} NBC",
            )

            # The learning curve uses the full data set (X, y), with the
            # pipeline reconfigured in place to the best grid-search params.
            self.plot_learning_curve(
                pipeline.set_params(**best_params),
                f"Learning curve, relevance scoring, {model_name} NBC",
                X,
                y,
            )

            with open(
                    self._file_path /
                    Path(f"../data/results/relevance_{model_name}_model.pickle"
                         ),
                    "wb",
            ) as f:
                pickle.dump(gs_model.best_estimator_, f)

        # Embedding models without ADASYN
        embedder = MeanDocumentEmbedder()

        X_embedded = np.array(list(embedder.transform(X)))
        # NOTE: y_train / y_test are rebound here; the ADASYN section below
        # relies on this embedded split.
        (X_train_embedded, X_test_embedded, y_train,
         y_test) = self.train_test_split(X_embedded, y)

        # Already-instantiated estimators with default arguments; only the
        # report and confusion matrix are saved for them (no pickle).
        for model_name, model in [
            ("logistic regression", LogisticRegression()),
            ("k-nearest neighbors", KNeighborsClassifier()),
            ("support vector classifier", SVC()),
            ("multi layer perceptron", MLPClassifier()),
        ]:
            start_time = time.time()
            model.fit(X_train_embedded, y_train)
            training_time = f"{int(time.time()-start_time)/60:.1f}"
            predicted = model.predict(X_test_embedded)
            report = classification_report_imbalanced(y_test, predicted)

            with open(
                    self._file_path / Path(
                        f"../data/results/relevance_no_adasyn_{model_name}_report.txt"
                    ),
                    "w",
            ) as f:
                f.write(report + "\n\n" +
                        f"training time: {training_time} min.")
            self.plot_confusion_matrix(
                confusion_matrix(y_test, predicted),
                ["irrelevant", "relevant"],
                f"Confusion matrix (no ADASYN), relevance scoring, {model_name}",
            )

        # Embedding models with ADASYN
        adasyn = ADASYN(random_state=13353)
        # Only the training split is resampled; the test split is untouched.
        # NOTE(review): `fit_sample` looks like the older imbalanced-learn
        # name for `fit_resample` -- confirm against the pinned version.
        X_resample, y_resample = adasyn.fit_sample(X_train_embedded, y_train)
        # Classes (not instances) this time, so a fresh estimator can also be
        # handed to the learning curve.
        for model_name, model in [
            ("logistic regression", LogisticRegression),
            ("k-nearest neighbors", KNeighborsClassifier),
            ("support vector classifier", SVC),
            ("multi layer perceptron", MLPClassifier),
        ]:
            # The persisted SVC is built with probability estimates enabled.
            if model_name == "support vector classifier":
                clf = model(probability=True)
            else:
                clf = model()
            start_time = time.time()
            clf = clf.fit(X_resample, y_resample)
            training_time = f"{int(time.time()-start_time)/60:.1f}"
            predicted = clf.predict(X_test_embedded)
            report = classification_report_imbalanced(y_test, predicted)

            with open(
                    self._file_path /
                    Path(f"../data/results/relevance_{model_name}_report.txt"),
                    "w",
            ) as f:
                f.write(report + "\n\n" +
                        f"training time: {training_time} min.")

            with open(
                    self._file_path /
                    Path(f"../data/results/relevance_{model_name}_model.pickle"
                         ),
                    "wb",
            ) as f:
                pickle.dump(clf, f)

            self.plot_confusion_matrix(
                confusion_matrix(y_test, predicted),
                ["irrelevant", "relevant"],
                f"Confusion matrix, relevance scoring, {model_name}",
            )

            # Learning curve on the resampled training data with a
            # default-constructed estimator (no `probability=True` for SVC).
            self.plot_learning_curve(
                model(),
                f"Learning curve, relevance scoring, {model_name}",
                X_resample,
                y_resample,
            )

示例#15

0

显示文件

文件： classification.py 项目： aauss/EventEpi

    def _train_key_entity_classification(self, X, y, entity):
        """Grid-search naive Bayes pipelines for one key-entity task.

        ``X``/``y`` are split with stratification on ``y``. For a Bernoulli
        and a multinomial naive Bayes pipeline the best estimator is pickled,
        and a report, a confusion matrix and a learning curve are written
        below ``../data/results/`` relative to ``self._file_path``.
        ``entity`` is used in the output file names and plot titles.
        """
        X_train, X_test, y_train, y_test = self.train_test_split(X,
                                                                 y,
                                                                 stratify=y)

        grid_search_parameters = {
            "tfidf__ngram_range": [(1, 1), (1, 3), (1, 4)],
            "tfidf__use_idf": (True, False),
            "clf__alpha": (0.01, 0.001),
        }

        for model_name, model in [
            ("Bernoulli", BernoulliNB),
            ("multinomial", MultinomialNB),
        ]:

            # The vectorizer receives pre-tokenized input: the tokenizer is
            # `self._identity` and preprocessing/lowercasing are disabled.
            pipeline = Pipeline([
                ("norm", TextNormalizer()),
                (
                    "tfidf",
                    TfidfVectorizer(tokenizer=self._identity,
                                    preprocessor=None,
                                    lowercase=False),
                ),
                ("clf", model()),
            ])

            # Model selection is scored with the IBA of the geometric mean.
            gs_model = GridSearchCV(
                pipeline,
                grid_search_parameters,
                scoring=make_scorer(
                    make_index_balanced_accuracy()(geometric_mean_score)),
                verbose=2,
            )

            start_time = time.time()
            gs_model = gs_model.fit(X_train, y_train)
            # Elapsed whole seconds converted to minutes, one decimal place.
            training_time = f"{int(time.time()-start_time)/60:.1f}"
            best_params = gs_model.best_params_

            with open(
                    self._file_path / Path(
                        f"../data/results/{entity}_{model_name}_model.pickle"),
                    "wb",
            ) as f:
                pickle.dump(gs_model.best_estimator_, f)

            predicted = gs_model.predict(X_test)
            report = classification_report_imbalanced(y_test, predicted)

            with open(
                    self._file_path /
                    Path(f"../data/results/{entity}_{model_name}_report.txt"),
                    "w",
            ) as f:
                f.write(
                    str(best_params) + "\n\n" + report + "\n\n" +
                    f"training time: {training_time} min.")

            self.plot_confusion_matrix(
                confusion_matrix(y_test, predicted),
                ["not key", "is key"],
                f"Confusion matrix, {entity} key entity, {model_name} NBC",
            )

            # The learning curve uses the full data set (X, y), with the
            # pipeline reconfigured in place to the best grid-search params.
            self.plot_learning_curve(
                pipeline.set_params(**best_params),
                f"Learning curve, {entity} key entity, {model_name} NBC",
                X,
                y,
            )

示例#16

0

显示文件

文件： test_classification.py 项目： vishalbelsare/imbalanced-learn

def test_iba_error_y_score_prob_error(score_loss):
    """The IBA-decorated ``score_loss`` must raise ``AttributeError``."""
    truth, guess, _ = make_prediction(binary=True)

    decorate = make_index_balanced_accuracy(alpha=0.5, squared=True)
    iba_scorer = decorate(score_loss)
    with pytest.raises(AttributeError):
        iba_scorer(truth, guess)

示例#17

0

显示文件

文件： classification.py 项目： aauss/EventEpi

class Trainer:
    """Train, evaluate and persist relevance and key-entity classifiers.

    All artefacts (reports, pickled models, plots, learning-curve values)
    are written below ``../data/results/`` relative to the directory of this
    module. Both plotting helpers close their figure after saving, so the
    many plots produced by one training run do not pile up in pyplot.
    """

    loader = DataLoader()
    # NOTE(review): not referenced by the methods below, which rebuild the
    # same scorer inline.
    iba = make_index_balanced_accuracy()(geometric_mean_score)
    # Fixed test size and seed shared by every split in this class.
    train_test_split = partial(train_test_split,
                               test_size=0.25,
                               random_state=13353)
    _file_path = Path(__file__).parent.resolve()

    def train_relevance_scoring(self):
        """Train, evaluate and persist the relevance-scoring classifiers.

        Three families of models are fitted on the labeled texts returned by
        ``self.loader.labeled_texts()``:

        1. bag-of-words naive Bayes pipelines tuned with a grid search;
        2. classifiers on mean document embeddings, without resampling;
        3. the same classifier types on ADASYN-resampled embeddings.

        Nothing is returned; results are written to disk.
        """
        X, y = self.loader.labeled_texts()
        X_train, X_test, y_train, y_test = self.train_test_split(X, y)

        # BOW models
        grid_search_parameters = {
            "tfidf__ngram_range": [(1, 1), (1, 3)],
            "tfidf__use_idf": (True, False),
        }

        for model_name, model in [
            ("complement", ComplementNB),
            ("multinomial", MultinomialNB),
        ]:

            # The vectorizer receives pre-tokenized input: the tokenizer is
            # `self._identity` and preprocessing/lowercasing are disabled.
            pipeline = Pipeline([
                ("norm", TextNormalizer()),
                (
                    "tfidf",
                    TfidfVectorizer(tokenizer=self._identity,
                                    preprocessor=None,
                                    lowercase=False),
                ),
                ("clf", model(alpha=0.001)),
            ])

            # Model selection is scored with the IBA of the geometric mean.
            gs_model = GridSearchCV(
                pipeline,
                grid_search_parameters,
                scoring=make_scorer(
                    make_index_balanced_accuracy()(geometric_mean_score)),
                verbose=2,
            )

            start_time = time.time()
            gs_model = gs_model.fit(X_train, y_train)
            # Elapsed whole seconds converted to minutes, one decimal place.
            training_time = f"{int(time.time()-start_time)/60:.1f}"
            best_params = gs_model.best_params_

            predicted = gs_model.predict(X_test)
            report = classification_report_imbalanced(y_test, predicted)

            with open(
                    self._file_path /
                    Path(f"../data/results/relevance_{model_name}_report.txt"),
                    "w",
            ) as f:
                f.write(
                    str(best_params) + "\n\n" + report + "\n\n" +
                    f"training time: {training_time} min.")

            self.plot_confusion_matrix(
                confusion_matrix(y_test, predicted),
                ["irrelevant", "relevant"],
                f"Confusion matrix, relevance scoring, {model_name} NBC",
            )

            # The learning curve uses the full data set (X, y), with the
            # pipeline reconfigured in place to the best grid-search params.
            self.plot_learning_curve(
                pipeline.set_params(**best_params),
                f"Learning curve, relevance scoring, {model_name} NBC",
                X,
                y,
            )

            with open(
                    self._file_path /
                    Path(f"../data/results/relevance_{model_name}_model.pickle"
                         ),
                    "wb",
            ) as f:
                pickle.dump(gs_model.best_estimator_, f)

        # Embedding models without ADASYN
        embedder = MeanDocumentEmbedder()

        X_embedded = np.array(list(embedder.transform(X)))
        # NOTE: y_train / y_test are rebound here; the ADASYN section below
        # relies on this embedded split.
        (X_train_embedded, X_test_embedded, y_train,
         y_test) = self.train_test_split(X_embedded, y)

        # Already-instantiated estimators with default arguments; only the
        # report and confusion matrix are saved for them (no pickle).
        for model_name, model in [
            ("logistic regression", LogisticRegression()),
            ("k-nearest neighbors", KNeighborsClassifier()),
            ("support vector classifier", SVC()),
            ("multi layer perceptron", MLPClassifier()),
        ]:
            start_time = time.time()
            model.fit(X_train_embedded, y_train)
            training_time = f"{int(time.time()-start_time)/60:.1f}"
            predicted = model.predict(X_test_embedded)
            report = classification_report_imbalanced(y_test, predicted)

            with open(
                    self._file_path / Path(
                        f"../data/results/relevance_no_adasyn_{model_name}_report.txt"
                    ),
                    "w",
            ) as f:
                f.write(report + "\n\n" +
                        f"training time: {training_time} min.")
            self.plot_confusion_matrix(
                confusion_matrix(y_test, predicted),
                ["irrelevant", "relevant"],
                f"Confusion matrix (no ADASYN), relevance scoring, {model_name}",
            )

        # Embedding models with ADASYN
        adasyn = ADASYN(random_state=13353)
        # Only the training split is resampled; the test split is untouched.
        # NOTE(review): `fit_sample` looks like the older imbalanced-learn
        # name for `fit_resample` -- confirm against the pinned version.
        X_resample, y_resample = adasyn.fit_sample(X_train_embedded, y_train)
        # Classes (not instances) this time, so a fresh estimator can also be
        # handed to the learning curve.
        for model_name, model in [
            ("logistic regression", LogisticRegression),
            ("k-nearest neighbors", KNeighborsClassifier),
            ("support vector classifier", SVC),
            ("multi layer perceptron", MLPClassifier),
        ]:
            # The persisted SVC is built with probability estimates enabled.
            if model_name == "support vector classifier":
                clf = model(probability=True)
            else:
                clf = model()
            start_time = time.time()
            clf = clf.fit(X_resample, y_resample)
            training_time = f"{int(time.time()-start_time)/60:.1f}"
            predicted = clf.predict(X_test_embedded)
            report = classification_report_imbalanced(y_test, predicted)

            with open(
                    self._file_path /
                    Path(f"../data/results/relevance_{model_name}_report.txt"),
                    "w",
            ) as f:
                f.write(report + "\n\n" +
                        f"training time: {training_time} min.")

            with open(
                    self._file_path /
                    Path(f"../data/results/relevance_{model_name}_model.pickle"
                         ),
                    "wb",
            ) as f:
                pickle.dump(clf, f)

            self.plot_confusion_matrix(
                confusion_matrix(y_test, predicted),
                ["irrelevant", "relevant"],
                f"Confusion matrix, relevance scoring, {model_name}",
            )

            # Learning curve on the resampled training data with a
            # default-constructed estimator (no `probability=True` for SVC).
            self.plot_learning_curve(
                model(),
                f"Learning curve, relevance scoring, {model_name}",
                X_resample,
                y_resample,
            )

    def train_key_entity_classifications(self):
        """Train the key-entity classifiers for date and count sentences."""
        X_date, y_date = self.loader.labeled_date_sentences()
        self._train_key_entity_classification(X_date, y_date, "date")

        X_count, y_count = self.loader.labeled_count_sentences()
        self._train_key_entity_classification(X_count, y_count, "count")

    def _train_key_entity_classification(self, X, y, entity):
        """Grid-search naive Bayes pipelines for one key-entity task.

        ``X``/``y`` are split with stratification on ``y``. For a Bernoulli
        and a multinomial naive Bayes pipeline the best estimator is pickled,
        and a report, a confusion matrix and a learning curve are written.
        ``entity`` is used in the output file names and plot titles.
        """
        X_train, X_test, y_train, y_test = self.train_test_split(X,
                                                                 y,
                                                                 stratify=y)

        grid_search_parameters = {
            "tfidf__ngram_range": [(1, 1), (1, 3), (1, 4)],
            "tfidf__use_idf": (True, False),
            "clf__alpha": (0.01, 0.001),
        }

        for model_name, model in [
            ("Bernoulli", BernoulliNB),
            ("multinomial", MultinomialNB),
        ]:

            # The vectorizer receives pre-tokenized input: the tokenizer is
            # `self._identity` and preprocessing/lowercasing are disabled.
            pipeline = Pipeline([
                ("norm", TextNormalizer()),
                (
                    "tfidf",
                    TfidfVectorizer(tokenizer=self._identity,
                                    preprocessor=None,
                                    lowercase=False),
                ),
                ("clf", model()),
            ])

            # Model selection is scored with the IBA of the geometric mean.
            gs_model = GridSearchCV(
                pipeline,
                grid_search_parameters,
                scoring=make_scorer(
                    make_index_balanced_accuracy()(geometric_mean_score)),
                verbose=2,
            )

            start_time = time.time()
            gs_model = gs_model.fit(X_train, y_train)
            # Elapsed whole seconds converted to minutes, one decimal place.
            training_time = f"{int(time.time()-start_time)/60:.1f}"
            best_params = gs_model.best_params_

            with open(
                    self._file_path / Path(
                        f"../data/results/{entity}_{model_name}_model.pickle"),
                    "wb",
            ) as f:
                pickle.dump(gs_model.best_estimator_, f)

            predicted = gs_model.predict(X_test)
            report = classification_report_imbalanced(y_test, predicted)

            with open(
                    self._file_path /
                    Path(f"../data/results/{entity}_{model_name}_report.txt"),
                    "w",
            ) as f:
                f.write(
                    str(best_params) + "\n\n" + report + "\n\n" +
                    f"training time: {training_time} min.")

            self.plot_confusion_matrix(
                confusion_matrix(y_test, predicted),
                ["not key", "is key"],
                f"Confusion matrix, {entity} key entity, {model_name} NBC",
            )

            # The learning curve uses the full data set (X, y), with the
            # pipeline reconfigured in place to the best grid-search params.
            self.plot_learning_curve(
                pipeline.set_params(**best_params),
                f"Learning curve, {entity} key entity, {model_name} NBC",
                X,
                y,
            )

    def plot_confusion_matrix(
        self,
        cm,
        target_names,
        title,
    ):
        """
        Plot a sklearn confusion matrix (cm)

        The plot is saved as ``../data/results/<title>.pdf`` (spaces in
        ``title`` replaced by underscores) relative to ``self._file_path``.
        ``target_names`` supplies the tick labels; pass ``None`` to omit
        them. The misclassification rate (one minus trace over total) is
        shown below the x-axis label. The figure is closed after saving.

        Citation
        --------
        http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

        """

        misclass = 1 - np.trace(cm) / float(np.sum(cm))

        fig = plt.figure(figsize=(8, 6))
        plt.imshow(cm, interpolation="nearest", cmap=plt.get_cmap("Blues"))
        plt.title(title)
        plt.colorbar()

        if target_names is not None:
            tick_marks = np.arange(len(target_names))
            plt.xticks(tick_marks, target_names, rotation=45)
            plt.yticks(tick_marks, target_names)

        # Cells above half of the maximum get white text for contrast.
        thresh = cm.max() / 2
        for i, j in product(range(cm.shape[0]), range(cm.shape[1])):
            plt.text(
                j,
                i,
                "{:,}".format(cm[i, j]),
                horizontalalignment="right",
                color="white" if cm[i, j] > thresh else "black",
            )

        plt.ylabel("True label")
        plt.xlabel("Predicted label\nmisclass={:0.2f}".format(misclass))
        plt.tight_layout()
        plt.savefig(self._file_path /
                    Path(f"../data/results/{title.replace(' ', '_')}.pdf"))
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)

    def plot_learning_curve(self,
                            estimator,
                            title,
                            X,
                            y,
                            train_sizes=np.linspace(0.1, 1.0, 5)):
        """
        Generate test and training learning curve.

        Runs ``learning_curve`` for ``estimator`` on ``X``/``y``, scored with
        the index balanced accuracy of the geometric mean. Two files are
        written below ``self._file_path``, with spaces in ``title`` replaced
        by underscores:

        * ``../data/results/<title>_values.csv`` -- one row per
          (train size, fold) with the train and test score;
        * ``../data/results/<title>.pdf`` -- mean score per train size with a
          one-standard-deviation band for training and cross-validation.

        The figure is closed once it has been saved.
        """
        fig, ax = plt.subplots(1, 1, figsize=(8, 6))

        ax.set_title(title)
        ax.set_xlabel("Training examples")
        ax.set_ylabel("Score")
        # `train_sizes` is rebound to the sizes returned by learning_curve.
        train_sizes, train_scores, test_scores = learning_curve(
            estimator,
            X,
            y,
            train_sizes=train_sizes,
            scoring=make_scorer(
                make_index_balanced_accuracy()(geometric_mean_score)),
            verbose=1,
        )
        # Flatten the (size, fold) score matrices into long format; every
        # train size is repeated once per column of `train_scores`.
        pd.DataFrame({
            "train_size":
            np.array([[size] * train_scores.shape[1]
                      for size in train_sizes]).reshape(-1),
            "train_score":
            train_scores.reshape(-1),
            "test_score":
            test_scores.reshape(-1),
        }).to_csv(
            self._file_path /
            Path(f"../data/results/{title.replace(' ', '_')}_values.csv"))
        train_scores_mean = np.mean(train_scores, axis=1)
        train_scores_std = np.std(train_scores, axis=1)
        test_scores_mean = np.mean(test_scores, axis=1)
        test_scores_std = np.std(test_scores, axis=1)

        # Plot learning curve
        ax.grid()
        ax.fill_between(
            train_sizes,
            train_scores_mean - train_scores_std,
            train_scores_mean + train_scores_std,
            alpha=0.1,
            color="r",
        )
        ax.fill_between(
            train_sizes,
            test_scores_mean - test_scores_std,
            test_scores_mean + test_scores_std,
            alpha=0.1,
            color="g",
        )
        ax.plot(train_sizes,
                train_scores_mean,
                "o-",
                color="r",
                label="Training score")
        ax.plot(
            train_sizes,
            test_scores_mean,
            "o-",
            color="g",
            label="Cross-validation score",
        )
        ax.legend(loc="best")

        fig.tight_layout()
        fig.savefig(self._file_path /
                    Path(f"../data/results/{title.replace(' ', '_')}.pdf"))
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)

    def _identity(self, text):
        """Return ``text`` unchanged; used as the TF-IDF tokenizer."""
        return text

示例#18

0

显示文件

文件： Results.py 项目： I-Good-Vegetable/CuteDecisionTree

def classificationReportDict(trueY,
                             predY,
                             labels=None,
                             targetNames=None,
                             sampleWeight=None,
                             alpha=0.1):
    """Build a dictionary of per-class and aggregate classification metrics.

    Each target name maps to a dict holding 'Precision', 'Recall', 'F1',
    'Specificity', 'GMean', 'IbaGMean' and 'Support'. Three further entries
    are added afterwards: 'Weighted Avg' (means weighted by support),
    'Macro Avg' (unweighted means) and 'Accuracy' (a single score).

    :param trueY: ground-truth labels, forwarded to every metric function.
    :param predY: predicted labels, forwarded to every metric function.
    :param labels: labels to report on; when None they are obtained from
        ``unique_labels(trueY, predY)``.
    :param targetNames: report keys, indexed in step with ``labels``;
        defaults to ``str(label)`` for each label.
    :param sampleWeight: forwarded as ``sample_weight`` to every metric.
    :param alpha: forwarded to ``make_index_balanced_accuracy``.
    :return: the report dictionary described above.
    """
    labels = unique_labels(trueY, predY) if labels is None \
        else np.asarray(labels)
    if targetNames is None:
        targetNames = [str(label) for label in labels]

    # Keyword arguments shared by every per-class metric call.
    perClassKwargs = dict(labels=labels,
                          average=None,
                          sample_weight=sampleWeight)

    # Precision Recall F1 Support
    precision, recall, f1, support = \
        precision_recall_fscore_support(trueY, predY, **perClassKwargs)
    # Specificity
    specificity = specificity_score(trueY, predY, **perClassKwargs)
    # Geometric mean
    gMean = geometric_mean_score(trueY, predY, **perClassKwargs)
    # Index balanced accuracy
    ibaDecorator = make_index_balanced_accuracy(alpha=alpha, squared=True)
    ibaGMean = ibaDecorator(geometric_mean_score)(trueY, predY,
                                                  **perClassKwargs)

    # One column per metric; insertion order is the key order of each row.
    columns = {
        'Precision': precision,
        'Recall': recall,
        'F1': f1,
        'Specificity': specificity,
        'GMean': gMean,
        'IbaGMean': ibaGMean,
        'Support': support,
    }
    # 'Support' is summed rather than averaged in the aggregate rows.
    averagedKeys = [key for key in columns if key != 'Support']

    report = {}
    for i, _ in enumerate(labels):
        rowName = targetNames[i]
        report[rowName] = {key: column[i] for key, column in columns.items()}

    weightedRow = {
        key: np.average(columns[key], weights=support)
        for key in averagedKeys
    }
    weightedRow['Support'] = np.sum(support)
    report['Weighted Avg'] = weightedRow

    macroRow = {key: np.average(columns[key]) for key in averagedKeys}
    macroRow['Support'] = np.sum(support)
    report['Macro Avg'] = macroRow

    # Accuracy
    report['Accuracy'] = accuracy_score(trueY,
                                        predY,
                                        normalize=True,
                                        sample_weight=sampleWeight)

    return report

示例#19

0

显示文件

文件： test_classification.py 项目： vishalbelsare/imbalanced-learn

def test_iba_sklearn_metrics(score, expected_score):
    """Check the IBA-wrapped ``score`` on a binary prediction.

    The metric is wrapped with ``alpha=0.5`` and ``squared=True`` and the
    result is compared to ``expected_score`` via ``pytest.approx``.
    """
    y_true, y_pred, _ = make_prediction(binary=True)

    iba_decorator = make_index_balanced_accuracy(alpha=0.5, squared=True)
    iba_value = iba_decorator(score)(y_true, y_pred)
    assert iba_value == pytest.approx(expected_score)

示例#20

0

显示文件

文件： plot_metrics.py 项目： vishalbelsare/imbalanced-learn

# The geometric mean corresponds to the square root of the product of the
# sensitivity and specificity. Combining the two metrics should account for
# the balancing of the dataset.

# %%
from imblearn.metrics import geometric_mean_score

# NOTE(review): y_test and y_pred are not defined in this excerpt;
# presumably they are set by an earlier cell of the script -- confirm.
print(f"The geometric mean is {geometric_mean_score(y_test, y_pred):.3f}")

# %% [markdown]
# The index balanced accuracy can transform any metric to be used in
# imbalanced learning problems.

# %%
from imblearn.metrics import make_index_balanced_accuracy

alpha = 0.1
# Wrap geometric_mean_score with the IBA decorator; the same alpha is passed
# to make_index_balanced_accuracy and echoed in the printed message.
geo_mean = make_index_balanced_accuracy(alpha=alpha,
                                        squared=True)(geometric_mean_score)

print(f"The IBA using alpha={alpha} and the geometric mean: "
      f"{geo_mean(y_test, y_pred):.3f}")

# %%
alpha = 0.5
# Same computation with a different alpha; alpha and geo_mean are rebound.
geo_mean = make_index_balanced_accuracy(alpha=alpha,
                                        squared=True)(geometric_mean_score)

print(f"The IBA using alpha={alpha} and the geometric mean: "
      f"{geo_mean(y_test, y_pred):.3f}")

示例#21

0

显示文件

# Relative tolerance used when comparing grid-search scores with their
# expected values.
R_TOL = 1e-2


@pytest.fixture
def data():
    """Return a seeded train/test split of a two-centre blob dataset.

    The value is whatever ``train_test_split`` returns for the generated
    features and targets (both calls use ``random_state=0``).
    """
    features, targets = make_blobs(random_state=0, centers=2)
    split = train_test_split(features, targets, random_state=0)
    return split


@pytest.mark.filterwarnings("ignore:Liblinear failed to converge")
@pytest.mark.parametrize(
    "score, expected_score",
    [(sensitivity_score, 0.92),
     (specificity_score, 0.92),
     (geometric_mean_score, 0.92),
     (make_index_balanced_accuracy()(geometric_mean_score), 0.85)]
)
@pytest.mark.parametrize("average", ['macro', 'weighted', 'micro'])
def test_scorer_common_average(data, score, expected_score, average):
    """Grid-search a LinearSVC using ``score`` as an averaged scorer.

    ``score`` is wrapped with ``make_scorer`` using ``pos_label=None`` and
    the parametrized ``average``; the best cross-validated score must match
    ``expected_score`` within the relative tolerance ``R_TOL``.

    ``data`` is the fixture providing the train/test split.
    """
    X_train, X_test, y_train, _ = data

    scorer = make_scorer(score, pos_label=None, average=average)
    # The former ``iid=False`` argument is intentionally not passed: the
    # ``iid`` parameter was deprecated and then removed from scikit-learn's
    # GridSearchCV, so passing it raises TypeError on current releases.
    grid = GridSearchCV(LinearSVC(random_state=0), param_grid={'C': [1, 10]},
                        scoring=scorer, cv=3)
    grid.fit(X_train, y_train).predict(X_test)

    assert grid.best_score_ == pytest.approx(expected_score, rel=R_TOL)


@pytest.mark.filterwarnings("ignore:Liblinear failed to converge")
@pytest.mark.parametrize(

示例#22

0

显示文件

文件： test_score_objects.py 项目： centre-for-humanities-computing/corona-psychopathology


@pytest.fixture
def data():
    """Provide the result of a seeded train/test split on two blobs.

    Both the blob generation and the split use ``random_state=0``.
    """
    blobs_X, blobs_y = make_blobs(random_state=0, centers=2)
    partition = train_test_split(blobs_X, blobs_y, random_state=0)
    return partition


@pytest.mark.filterwarnings("ignore:Liblinear failed to converge")
@pytest.mark.parametrize(
    "score, expected_score",
    [
        (sensitivity_score, 0.92),
        (specificity_score, 0.92),
        (geometric_mean_score, 0.92),
        (make_index_balanced_accuracy()(geometric_mean_score), 0.85),
    ],
)
@pytest.mark.parametrize("average", ["macro", "weighted", "micro"])
def test_scorer_common_average(data, score, expected_score, average):
    """Grid-search a LinearSVC using ``score`` as an averaged scorer.

    ``score`` is wrapped with ``make_scorer`` using ``pos_label=None`` and
    the parametrized ``average``. The best cross-validated score is then
    compared with ``expected_score`` within the relative tolerance ``R_TOL``.

    ``data`` is the fixture providing the train/test split.
    """
    X_train, X_test, y_train, _ = data

    scorer = make_scorer(score, pos_label=None, average=average)
    grid = GridSearchCV(
        LinearSVC(random_state=0),
        param_grid={"C": [1, 10]},
        scoring=scorer,
        cv=3,
    )
    grid.fit(X_train, y_train).predict(X_test)

    # Without this check the test passes regardless of the score obtained
    # and the ``expected_score`` parameter is never used.
    assert grid.best_score_ == pytest.approx(expected_score, rel=R_TOL)

示例#23

0

显示文件

文件： test_score_objects.py 项目： glemaitre/imbalanced-learn

def test_imblearn_classification_scorers():
    """Check grid-search scores obtained with imblearn metrics as scorers.

    Sensitivity, specificity, geometric mean and the IBA-wrapped geometric
    mean are each turned into a scorer four ways (``average`` set to
    'macro', 'weighted' and 'micro' with ``pos_label=None``, then
    ``pos_label=1`` alone). For every scorer a LinearSVC grid search over
    ``C`` is fitted and its ``best_score_`` is compared to the expected
    value with relative tolerance ``R_TOL``.
    """
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = LinearSVC(random_state=0)
    clf.fit(X_train, y_train)

    def best_cv_score(metric, **scorer_kwargs):
        """Return ``best_score_`` of the grid search scored with ``metric``."""
        scorer = make_scorer(metric, **scorer_kwargs)
        search = GridSearchCV(LinearSVC(random_state=0),
                              param_grid={'C': [1, 10]},
                              scoring=scorer)
        search.fit(X_train, y_train).predict(X_test)
        return search.best_score_

    def check_metric(metric, averaged_expected, pos_label_expected):
        """Run the three averaged variants, then the ``pos_label=1`` one."""
        for average in ('macro', 'weighted', 'micro'):
            observed = best_cv_score(metric, pos_label=None, average=average)
            assert_allclose(observed, averaged_expected, rtol=R_TOL)
        observed = best_cv_score(metric, pos_label=1)
        assert_allclose(observed, pos_label_expected, rtol=R_TOL)

    # sensitivity scorer
    check_metric(sensitivity_score, 0.92, 0.92)

    # specificity scorer
    check_metric(specificity_score, 0.92, 0.95)

    # geometric_mean scorer
    check_metric(geometric_mean_score, 0.92, 0.92)

    # make a iba metric before a scorer
    geo_mean_iba = make_index_balanced_accuracy()(geometric_mean_score)
    check_metric(geo_mean_iba, 0.85, 0.84)

示例#24

0

显示文件

文件： plot_metrics.py 项目： bodycat/imbalanced-learn

###############################################################################
# The geometric mean corresponds to the square root of the product of the
# sensitivity and specificity. Combining the two metrics should account for
# the balancing of the dataset.

# NOTE(review): y_test and y_pred_bal are not defined in this excerpt;
# presumably they are set earlier in the script -- confirm.
print('The geometric mean is {}'.format(geometric_mean_score(
    y_test,
    y_pred_bal)))

###############################################################################
# The index balanced accuracy can transform any metric to be used in
# imbalanced learning problems.

alpha = 0.1
# Wrap geometric_mean_score with the IBA decorator; the same alpha is passed
# to make_index_balanced_accuracy and echoed in the printed message.
geo_mean = make_index_balanced_accuracy(alpha=alpha, squared=True)(
    geometric_mean_score)

print('The IBA using alpha = {} and the geometric mean: {}'.format(
    alpha, geo_mean(
        y_test,
        y_pred_bal)))

alpha = 0.5
# Same computation with a different alpha; alpha and geo_mean are rebound.
geo_mean = make_index_balanced_accuracy(alpha=alpha, squared=True)(
    geometric_mean_score)

print('The IBA using alpha = {} and the geometric mean: {}'.format(
    alpha, geo_mean(
        y_test,
        y_pred_bal)))

示例#25

0

显示文件

文件： test_score_objects.py 项目： weimengdong/imbalanced-learn

def test_imblearn_classification_scorers():
    """Check grid-search scores obtained with imblearn metrics as scorers.

    Every (metric, scorer keyword arguments, expected score) combination in
    the table below is turned into a scorer with ``make_scorer``. A LinearSVC
    grid search over ``C`` is fitted with it and ``best_score_`` is compared
    to the expected value with relative tolerance ``R_TOL``.

    The estimators handed to ``GridSearchCV`` are seeded with
    ``random_state=0`` so the asserted scores do not depend on the solver's
    random number generation.
    """
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, _ = train_test_split(X, y, random_state=0)

    # make a iba metric before a scorer
    geo_mean_iba = make_index_balanced_accuracy()(geometric_mean_score)

    macro = {'pos_label': None, 'average': 'macro'}
    weighted = {'pos_label': None, 'average': 'weighted'}
    micro = {'pos_label': None, 'average': 'micro'}
    positive = {'pos_label': 1}

    cases = [
        # sensitivity scorer
        (sensitivity_score, macro, 0.92),
        (sensitivity_score, weighted, 0.92),
        (sensitivity_score, micro, 0.92),
        (sensitivity_score, positive, 0.92),
        # specificity scorer
        (specificity_score, macro, 0.92),
        (specificity_score, weighted, 0.92),
        (specificity_score, micro, 0.92),
        (specificity_score, positive, 0.95),
        # geometric_mean scorer
        (geometric_mean_score, macro, 0.92),
        (geometric_mean_score, weighted, 0.92),
        (geometric_mean_score, micro, 0.92),
        (geometric_mean_score, positive, 0.92),
        # index balanced accuracy of the geometric mean
        (geo_mean_iba, macro, 0.85),
        (geo_mean_iba, weighted, 0.85),
        (geo_mean_iba, micro, 0.85),
        (geo_mean_iba, positive, 0.84),
    ]

    for metric, scorer_kwargs, expected in cases:
        scorer = make_scorer(metric, **scorer_kwargs)
        grid = GridSearchCV(LinearSVC(random_state=0),
                            param_grid={'C': [1, 10]},
                            scoring=scorer)
        grid.fit(X_train, y_train).predict(X_test)
        assert_allclose(grid.best_score_, expected, rtol=R_TOL)

示例#26

0

显示文件

文件： test_classification.py 项目： chkoar/imbalanced-learn

def test_iba_sklearn_metrics(score, expected_score):
    """Verify the index-balanced version of ``score`` on binary predictions.

    ``score`` is decorated with ``make_index_balanced_accuracy(alpha=0.5,
    squared=True)`` and the resulting value must equal ``expected_score``
    up to ``pytest.approx`` tolerance.
    """
    y_true, y_pred, _ = make_prediction(binary=True)

    balanced_metric = make_index_balanced_accuracy(
        alpha=0.5, squared=True)(score)
    observed = balanced_metric(y_true, y_pred)
    assert observed == pytest.approx(expected_score)