Example #1
    def _train(self, train_input: gobbli.io.TrainInput,
               context: ContainerTaskContext) -> gobbli.io.TrainOutput:
        if train_input.checkpoint is not None:
            warnings.warn(
                "SKLearnClassifier does not support training from an existing "
                "checkpoint, so the passed checkpoint will be ignored.")

        # Input must be a numpy array for the OneVsRestClassifier case
        X_train = np.array(train_input.X_train)
        X_valid = np.array(train_input.X_valid)

        y_train = train_input.y_train
        y_valid = train_input.y_valid
        labels = train_input.labels()
        if train_input.multilabel:
            self.estimator.base_estimator = OneVsRestClassifier(
                self.estimator.base_estimator)

            y_train = multilabel_to_indicator_df(
                train_input.y_train_multilabel, labels)
            y_valid = multilabel_to_indicator_df(
                train_input.y_valid_multilabel, labels)

        self.estimator.fit(X_train, y_train)

        if train_input.multilabel:
            # The fit method for OneVsRestClassifier uses LabelBinarizer to determine the
            # classes, which doesn't take string column names from a pandas DataFrame,
            # so the classes will come back as integer indexes.  Fix that manually here.
            # Use a numpy array to ensure compatibility with the automatically-created classes.
            np_labels = np.array(labels)
            self.estimator.classes_ = np_labels
            self.estimator.base_estimator.classes_ = np_labels
            self.estimator.base_estimator.label_binarizer_.classes_ = np_labels

        y_train_pred = self.estimator.predict(X_train)
        y_valid_pred = self.estimator.predict(X_valid)

        train_loss = -f1_score(
            y_train, y_train_pred, zero_division="warn", average="weighted")

        valid_loss = -f1_score(
            y_valid, y_valid_pred, zero_division="warn", average="weighted")

        valid_accuracy = accuracy_score(y_valid, y_valid_pred)

        checkpoint_path = (context.host_output_dir /
                           SKLearnClassifier._TRAIN_OUTPUT_CHECKPOINT)
        self._dump_estimator(self.estimator.base_estimator, checkpoint_path)

        return gobbli.io.TrainOutput(
            valid_loss=valid_loss,
            valid_accuracy=valid_accuracy,
            train_loss=train_loss,
            labels=labels,
            multilabel=train_input.multilabel,
            checkpoint=checkpoint_path,
        )
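
Every example on this page uses multilabel_to_indicator_df to turn a list of label lists into a one-hot indicator DataFrame. Its definition isn't shown here, so the following is only a minimal sketch of the behavior implied by the examples (one row per observation, one 0/1 column per label), not the library's actual implementation.

from typing import List, Sequence

import pandas as pd


def multilabel_to_indicator_df(y_multilabel: Sequence[Sequence[str]],
                               labels: List[str]) -> pd.DataFrame:
    # Hypothetical sketch: one row per observation, one column per label,
    # with 1 where the label applies to the observation and 0 otherwise.
    return pd.DataFrame(
        [[int(label in row_labels) for label in labels]
         for row_labels in y_multilabel],
        columns=labels,
    )


# For instance, multilabel_to_indicator_df([["a"], ["a", "b"], []], ["a", "b"])
# would yield rows [1, 0], [1, 1], [0, 0] under this assumption.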
Example #2
    def _train(self, train_input: gobbli.io.TrainInput,
               context: ContainerTaskContext) -> gobbli.io.TrainOutput:
        """
        Determine the majority class.
        """
        if train_input.multilabel:
            train_labels: List[str] = list(
                itertools.chain.from_iterable(train_input.y_train_multilabel))
        else:
            train_labels = train_input.y_train_multiclass

        unique_values, value_counts = np.unique(train_labels,
                                                return_counts=True)
        self.majority_class = unique_values[value_counts.argmax(axis=0)]

        labels = train_input.labels()
        y_train_pred_proba = self._make_pred_df(labels,
                                                len(train_input.y_train))
        y_valid_pred_proba = self._make_pred_df(labels,
                                                len(train_input.y_valid))

        if train_input.multilabel:
            y_train_indicator = multilabel_to_indicator_df(
                train_input.y_train_multilabel, labels)
            train_loss = ((y_train_pred_proba.subtract(y_train_indicator)
                           ).abs().to_numpy().sum())

            y_valid_indicator = multilabel_to_indicator_df(
                train_input.y_valid_multilabel, labels)
            valid_loss = ((y_valid_pred_proba.subtract(y_valid_indicator)
                           ).abs().to_numpy().sum())
            # Treat the complement of the cell-wise error rate as accuracy
            valid_accuracy = 1 - valid_loss / (y_valid_pred_proba.shape[0] *
                                               y_valid_pred_proba.shape[1])
        else:
            y_train_pred = pred_prob_to_pred_label(y_train_pred_proba)
            train_loss = np.sum(y_train_pred != train_input.y_train_multiclass)

            y_valid_pred = pred_prob_to_pred_label(y_valid_pred_proba)
            valid_loss = np.sum(y_valid_pred != train_input.y_valid_multiclass)
            valid_accuracy = 1 - valid_loss / len(y_valid_pred)

        return gobbli.io.TrainOutput(
            valid_loss=valid_loss,
            valid_accuracy=valid_accuracy,
            train_loss=train_loss,
            labels=train_input.labels(),
            multilabel=train_input.multilabel,
        )
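
Example #2 also calls pred_prob_to_pred_label to collapse a probability DataFrame into a single predicted label per row. A minimal sketch, assuming it simply takes the argmax over the label columns; the real implementation may differ.

from typing import List

import pandas as pd


def pred_prob_to_pred_label(pred_prob_df: pd.DataFrame) -> List[str]:
    # Hypothetical sketch: pick the label (column) with the highest
    # predicted probability for each row.
    return pred_prob_df.idxmax(axis=1).tolist()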
Example #3
    def _train(self, train_input: gobbli.io.TrainInput,
               context: ContainerTaskContext) -> gobbli.io.TrainOutput:
        self._write_input(
            train_input.X_train,
            train_input.y_train_multilabel,
            context.host_input_dir / FastText._TRAIN_INPUT_FILE,
        )
        self._write_input(
            train_input.X_valid,
            train_input.y_valid_multilabel,
            context.host_input_dir / FastText._VALID_INPUT_FILE,
        )

        container_validation_input_path = (context.container_input_dir /
                                           FastText._VALID_INPUT_FILE)
        train_logs, train_loss = self._run_supervised(
            train_input.checkpoint,
            context.container_input_dir / FastText._TRAIN_INPUT_FILE,
            context.container_output_dir / FastText._CHECKPOINT_BASE,
            context,
            train_input.num_train_epochs,
            autotune_validation_file_path=container_validation_input_path,
        )

        host_checkpoint_path = context.host_output_dir / FastText._CHECKPOINT_BASE

        labels = train_input.labels()

        # Calculate validation accuracy on our own, since the CLI only provides
        # precision/recall
        predict_logs, pred_prob_df = self._run_predict_prob(
            host_checkpoint_path, labels, container_validation_input_path,
            context)

        if train_input.multilabel:
            pred_labels = pred_prob_to_pred_multilabel(pred_prob_df)
            gold_labels = multilabel_to_indicator_df(
                train_input.y_valid_multilabel, labels)
        else:
            pred_labels = pred_prob_to_pred_label(pred_prob_df)
            gold_labels = train_input.y_valid_multiclass

        valid_accuracy = accuracy_score(gold_labels, pred_labels)

        # Not ideal, but fastText doesn't provide a way to get validation loss;
        # negate the validation accuracy instead
        valid_loss = -valid_accuracy

        return gobbli.io.TrainOutput(
            train_loss=train_loss,
            valid_loss=valid_loss,
            valid_accuracy=valid_accuracy,
            labels=labels,
            multilabel=train_input.multilabel,
            checkpoint=host_checkpoint_path,
            _console_output="\n".join((train_logs, predict_logs)),
        )
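
The multilabel branch in Example #3 relies on pred_prob_to_pred_multilabel. The expected predictions in Example #5 below (probability 0.6 counted as a positive, 0.4 not) suggest a simple 0.5 threshold per label; here is a sketch under that assumption.

import pandas as pd


def pred_prob_to_pred_multilabel(pred_prob_df: pd.DataFrame) -> pd.DataFrame:
    # Hypothetical sketch: predict each label independently wherever its
    # probability exceeds 0.5, producing a 0/1 indicator DataFrame with
    # the same shape and columns as the input.
    return (pred_prob_df > 0.5).astype(int)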
Example #4
    def y_true_multilabel(self) -> pd.DataFrame:
        return multilabel_to_indicator_df(
            as_multilabel(self.y_true, self.multilabel), self.labels)
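
Example #4 additionally depends on as_multilabel, which presumably normalizes multiclass input into single-element label lists and passes multilabel input through unchanged. A hypothetical sketch of that behavior:

from typing import Any, List


def as_multilabel(y: List[Any], multilabel: bool) -> List[List[Any]]:
    # Hypothetical sketch: wrap each multiclass label in its own list so
    # downstream code can treat both cases uniformly.
    if multilabel:
        return y
    return [[label] for label in y]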
Example #5
def test_classification_evaluation_multilabel():
    labels = ["a", "b"]
    results = ClassificationEvaluation(
        labels=labels,
        X=["a1", "a2", "ab1", "ab2", "b1", "b2", "01", "02", "03"],
        y_true=[["a"], ["a"], ["a", "b"], ["a", "b"], ["b"], ["b"], [], [],
                []],
        y_pred_proba=pd.DataFrame({
            "a": [0.7, 0.3, 0.4, 0.6, 0.9, 0.3, 0.6, 0.4, 0.3],
            "b": [0.3, 0.7, 0.6, 0.6, 0.1, 0.7, 0.4, 0.6, 0.3],
        }),
    )

    # Ensure predicted labels are calculated correctly
    expected_y_pred = multilabel_to_indicator_df(
        [["a"], ["b"], ["b"], ["a", "b"], ["a"], ["b"], ["a"], ["b"], []],
        labels)
    pd.testing.assert_frame_equal(results.y_pred_multilabel, expected_y_pred)

    # Ensure errors are calculated correctly
    # Error observations are a2, ab1, b1, 01, 02
    ea2 = ClassificationError(X="a2",
                              y_true=["a"],
                              y_pred_proba={
                                  "a": 0.3,
                                  "b": 0.7
                              })
    eab1 = ClassificationError(X="ab1",
                               y_true=["a", "b"],
                               y_pred_proba={
                                   "a": 0.4,
                                   "b": 0.6
                               })
    eb1 = ClassificationError(X="b1",
                              y_true=["b"],
                              y_pred_proba={
                                  "a": 0.9,
                                  "b": 0.1
                              })
    e01 = ClassificationError(X="01",
                              y_true=[],
                              y_pred_proba={
                                  "a": 0.6,
                                  "b": 0.4
                              })
    e02 = ClassificationError(X="02",
                              y_true=[],
                              y_pred_proba={
                                  "a": 0.4,
                                  "b": 0.6
                              })

    # Cut off at k=2 and ensure the correct errors are in there and ordered correctly
    errors = results.errors(k=2)

    # Expected errors
    a_false_positives = [eb1, e01]
    a_false_negatives = [ea2, eab1]
    b_false_positives = [ea2, e02]
    b_false_negatives = [eb1]

    a_errors = errors["a"]
    b_errors = errors["b"]
    assert a_errors[0] == a_false_positives
    assert a_errors[1] == a_false_negatives
    assert b_errors[0] == b_false_positives
    assert b_errors[1] == b_false_negatives
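
The assertions above imply that errors(k) returns, per label, the top-k false positives ordered by descending predicted probability and the top-k false negatives ordered by ascending predicted probability, i.e. the most confidently wrong observations first. The sketch below illustrates that ranking logic with hypothetical names; the real method builds ClassificationError objects rather than returning raw texts.

from typing import Dict, List, Tuple

import pandas as pd


def top_k_errors(X: List[str],
                 y_true_indicator: pd.DataFrame,
                 y_pred_indicator: pd.DataFrame,
                 y_pred_proba: pd.DataFrame,
                 k: int) -> Dict[str, Tuple[List[str], List[str]]]:
    # Hypothetical sketch of the per-label error ranking implied by the
    # test: false positives sorted by descending probability, false
    # negatives by ascending probability, each truncated to k entries.
    errors = {}
    for label in y_pred_proba.columns:
        proba = y_pred_proba[label]
        fp = (y_pred_indicator[label] == 1) & (y_true_indicator[label] == 0)
        fn = (y_pred_indicator[label] == 0) & (y_true_indicator[label] == 1)
        fp_idx = proba[fp].sort_values(ascending=False).index[:k]
        fn_idx = proba[fn].sort_values(ascending=True).index[:k]
        errors[label] = ([X[i] for i in fp_idx], [X[i] for i in fn_idx])
    return errors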