Example #1
    def multiclass_classification(target, predictions):

        # the probability columns appear to be named "prediction_<label>";
        # dropping the 11-character prefix recovers the class labels
        all_labels = [i[11:] for i in predictions.columns.tolist()[:-1]]

        ll = logloss(target, predictions[predictions.columns[:-1]])

        labels = {i: l for i, l in enumerate(all_labels)}
        predictions = predictions["label"]
        target = target["target"].map(labels)
        # Compute the confusion matrix
        conf_matrix = confusion_matrix(target, predictions, labels=all_labels)

        rows = [f"Predicted as {a}" for a in all_labels]
        cols = [f"Labeled as {a}" for a in all_labels]

        conf_matrix = pd.DataFrame(conf_matrix, columns=rows, index=cols)

        max_metrics = classification_report(target,
                                            predictions,
                                            digits=6,
                                            labels=all_labels,
                                            output_dict=True)
        max_metrics["logloss"] = ll

        return {
            "max_metrics": pd.DataFrame(max_metrics).transpose(),
            "confusion_matrix": conf_matrix,
        }
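The snippets on this page are class-method excerpts and omit their imports, the `logloss` helper they call, and the `AdditionalPlots` class used in Examples #2 and #4. A minimal sketch of the surrounding context, assuming `logloss` is just a thin wrapper around `sklearn.metrics.log_loss` (the real helper may differ):

    # Assumed shared context for the examples; the actual module may differ.
    import numpy as np
    import pandas as pd
    from sklearn.metrics import (
        accuracy_score,
        classification_report,
        confusion_matrix,
        f1_score,
        log_loss,
        matthews_corrcoef,
        precision_score,
        recall_score,
        roc_auc_score,
    )


    def logloss(y_true, y_predicted, sample_weight=None):
        # assumption: flatten the target, clip probabilities away from 0 and 1,
        # and delegate to sklearn's log_loss
        y_true = np.ravel(y_true)
        y_predicted = np.clip(y_predicted, 1e-6, 1 - 1e-6)
        return log_loss(y_true, y_predicted, sample_weight=sample_weight)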
Example #2
    def multiclass_classification(target, predictions, sample_weight=None):
        all_labels = [i[11:] for i in predictions.columns.tolist()[:-1]]

        predicted_probas = predictions[predictions.columns[:-1]]
        ll = logloss(target,
                     predictions[predictions.columns[:-1]],
                     sample_weight=sample_weight)

        if "target" in target.columns.tolist():
            # multiclass coding with integer
            labels = {i: l for i, l in enumerate(all_labels)}
            target = target["target"].map(labels)
        else:
            # multiclass coding with one-hot encoding
            old_columns = target.columns
            # decode one-hot columns back to labels; copy to avoid mutating the input
            t = target[old_columns[0]].copy()
            for l in all_labels:
                t[target[f"target_{l}"] == 1] = l

            target = pd.DataFrame({"target": t})

        # Compute the confusion matrix
        # keep the original predicted labels for the plots below;
        # `predictions` itself may be cast to str for sklearn
        predicted_labels = predictions["label"]
        predictions = predictions["label"]
        if not pd.api.types.is_string_dtype(predictions):
            predictions = predictions.astype(str)

        if not pd.api.types.is_string_dtype(target):
            target = target.astype(str)

        conf_matrix = confusion_matrix(target,
                                       predictions,
                                       labels=all_labels,
                                       sample_weight=sample_weight)

        rows = [f"Predicted as {a}" for a in all_labels]
        cols = [f"Labeled as {a}" for a in all_labels]

        conf_matrix = pd.DataFrame(conf_matrix, columns=rows, index=cols)

        max_metrics = classification_report(
            target,
            predictions,
            digits=6,
            labels=all_labels,
            output_dict=True,
            sample_weight=sample_weight,
        )
        max_metrics["logloss"] = ll

        return {
            "max_metrics":
            pd.DataFrame(max_metrics).transpose(),
            "confusion_matrix":
            conf_matrix,
            "additional_plots":
            AdditionalPlots.plots_multiclass(target, predicted_labels,
                                             predicted_probas),
        }
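Example #2's else-branch decodes a one-hot encoded target by looping over the labels. A hypothetical vectorized alternative (not the code above), assuming the one-hot columns are named `target_<label>`:

    # Hypothetical vectorized one-hot decode: pick the column holding the 1
    # in each row and strip the "target_" prefix.
    def decode_one_hot(target_one_hot):
        decoded = target_one_hot.idxmax(axis=1).str[len("target_"):]
        return pd.DataFrame({"target": decoded.values})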
Example #3
    def multiclass_classification(target, predictions):
        all_labels = [i[11:] for i in predictions.columns.tolist()[:-1]]

        ll = logloss(target, predictions[predictions.columns[:-1]])

        if "target" in target.columns.tolist():
            # multiclass coding with integer
            labels = {i: l for i, l in enumerate(all_labels)}
            target = target["target"].map(labels)
        else:
            # multiclass coding with one-hot encoding
            old_columns = target.columns
            # decode one-hot columns back to labels; copy to avoid mutating the input
            t = target[old_columns[0]].copy()
            for l in all_labels:
                t[target[f"target_{l}"] == 1] = l

            target = pd.DataFrame({"target": t})

        # Compute the confusion matrix
        predictions = predictions["label"]
        conf_matrix = confusion_matrix(target, predictions, labels=all_labels)

        rows = [f"Predicted as {a}" for a in all_labels]
        cols = [f"Labeled as {a}" for a in all_labels]

        conf_matrix = pd.DataFrame(conf_matrix, columns=rows, index=cols)

        max_metrics = classification_report(target,
                                            predictions,
                                            digits=6,
                                            labels=all_labels,
                                            output_dict=True)
        max_metrics["logloss"] = ll

        return {
            "max_metrics": pd.DataFrame(max_metrics).transpose(),
            "confusion_matrix": conf_matrix,
        }
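A hypothetical call of the `multiclass_classification` variant from Example #1 or #3, treating it as a standalone function (in the source it is likely a static method) and relying on the `logloss` sketch above. The synthetic frames follow the column conventions the code implies: per-class probability columns named `prediction_<label>`, a trailing `label` column with the predicted class, and an integer-coded `target` column:

    # Synthetic 3-class input following the assumed column naming.
    predictions = pd.DataFrame({
        "prediction_cat": [0.7, 0.2, 0.1, 0.6],
        "prediction_dog": [0.2, 0.7, 0.2, 0.3],
        "prediction_fox": [0.1, 0.1, 0.7, 0.1],
        "label": ["cat", "dog", "fox", "cat"],
    })
    target = pd.DataFrame({"target": [0, 1, 2, 1]})  # classes coded as integers

    result = multiclass_classification(target, predictions)
    print(result["max_metrics"])
    print(result["confusion_matrix"])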
Example #4
    def binary_classification(target, predictions, sample_weight=None):

        negative_label, positive_label = "0", "1"
        mapping = None
        try:
            # try to recover the original class labels from a prediction
            # column named like "..._0_for_<negative>_1_for_<positive>"
            pred_col = predictions.columns[0]
            if "_0_for_" in pred_col and "_1_for_" in pred_col:
                t = pred_col.split("_0_for_")[1]
                t = t.split("_1_for_")
                negative_label, positive_label = t[0], t[1]
                mapping = {0: negative_label, 1: positive_label}
        except Exception:
            pass

        predictions = np.array(predictions)
        sorted_predictions = np.sort(predictions)
        STEPS = 100  # fewer steps run faster but give a coarser threshold search
        details = {
            "threshold": [],
            "f1": [],
            "accuracy": [],
            "precision": [],
            "recall": [],
            "mcc": [],
        }
        samples_per_step = max(1, np.floor(predictions.shape[0] / STEPS))

        for i in range(STEPS):
            idx = int(i * samples_per_step)
            if idx + 1 >= predictions.shape[0]:
                break
            if i == 0:
                th = 0.9 * np.min(sorted_predictions)
            else:
                th = float(
                    0.5 *
                    (sorted_predictions[idx] + sorted_predictions[idx + 1]))

            if np.sum(predictions > th) < 1:
                break
            response = (predictions > th).astype(int)

            details["threshold"] += [th]
            details["f1"] += [
                f1_score(target, response, sample_weight=sample_weight)
            ]
            details["accuracy"] += [
                accuracy_score(target, response, sample_weight=sample_weight)
            ]
            details["precision"] += [
                precision_score(target, response, sample_weight=sample_weight)
            ]
            details["recall"] += [
                recall_score(target, response, sample_weight=sample_weight)
            ]
            if i == 0:
                details["mcc"] += [0.0]
            else:
                details["mcc"] += [
                    matthews_corrcoef(target,
                                      response,
                                      sample_weight=sample_weight)
                ]

        # max metrics
        max_metrics = {
            "logloss": {
                "score": logloss(target,
                                 predictions,
                                 sample_weight=sample_weight),
                "threshold": None,
            },  # there is no threshold for LogLoss
            "auc": {
                "score":
                roc_auc_score(target, predictions,
                              sample_weight=sample_weight),
                "threshold":
                None,
            },  # there is no threshold for AUC
            "f1": {
                "score": np.max(details["f1"]),
                "threshold": details["threshold"][np.argmax(details["f1"])],
            },
            "accuracy": {
                "score": np.max(details["accuracy"]),
                "threshold":
                details["threshold"][np.argmax(details["accuracy"])],
            },
            "precision": {
                "score": np.max(details["precision"]),
                "threshold":
                details["threshold"][np.argmax(details["precision"])],
            },
            "recall": {
                "score": np.max(details["recall"]),
                "threshold":
                details["threshold"][np.argmax(details["recall"])],
            },
            "mcc": {
                "score": np.max(details["mcc"]),
                "threshold": details["threshold"][np.argmax(details["mcc"])],
            },
        }

        threshold = float(max_metrics["accuracy"]["threshold"])

        # if sample_weight is not None:
        #    new_max_metrics = {}
        #    for k, v in max_metrics.items():
        #        new_max_metrics["weighted_" + k] = v
        #    max_metrics = new_max_metrics

        # confusion matrix

        conf_matrix = confusion_matrix(target,
                                       predictions > threshold,
                                       sample_weight=sample_weight)

        conf_matrix = pd.DataFrame(
            conf_matrix,
            columns=[
                f"Predicted as {negative_label}",
                f"Predicted as {positive_label}",
            ],
            index=[
                f"Labeled as {negative_label}", f"Labeled as {positive_label}"
            ],
        )

        predicted_labels = pd.Series(
            (predictions.ravel() > threshold).astype(int))
        predicted_probas = pd.DataFrame({
            "proba_0": 1 - predictions.ravel(),
            "proba_1": predictions.ravel(),
        })

        if mapping is not None:
            labeled_target = target["target"].map(mapping)
            predicted_labels = predicted_labels.map(mapping)
        else:
            labeled_target = target

        return {
            "metric_details":
            pd.DataFrame(details),
            "max_metrics":
            pd.DataFrame(max_metrics),
            "confusion_matrix":
            conf_matrix,
            "threshold":
            threshold,
            "additional_plots":
            AdditionalPlots.plots_binary(labeled_target, predicted_labels,
                                         predicted_probas),
        }
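Both binary variants search for operating thresholds the same way: sort the predicted probabilities, take roughly STEPS evenly spaced positions through them, use the midpoint between neighbouring sorted values as a candidate threshold, score every candidate, and keep the argmax per metric. A condensed, hypothetical restatement of that sweep for a single metric (not the library's API):

    # Hypothetical single-metric version of the threshold sweep used above.
    def best_threshold(target, probas, metric, steps=100):
        probas = np.asarray(probas).ravel()
        sorted_probas = np.sort(probas)
        step = max(1, probas.shape[0] // steps)
        best_score, best_th = -np.inf, None
        for idx in range(0, probas.shape[0] - 1, step):
            # candidate threshold: midpoint between neighbouring sorted values
            th = 0.5 * (sorted_probas[idx] + sorted_probas[idx + 1])
            if not np.any(probas > th):
                break
            score = metric(target, (probas > th).astype(int))
            if score > best_score:
                best_score, best_th = score, th
        return best_th, best_score

For instance, `best_threshold(y_true, y_proba, f1_score)` returns an F1-optimal cut, roughly matching `max_metrics["f1"]["threshold"]` computed above.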
Example #5
    def binary_classification(target, predictions):

        predictions = np.array(predictions)
        sorted_predictions = np.sort(predictions)
        STEPS = 100  # fewer steps run faster but give a coarser threshold search
        details = {
            "threshold": [],
            "f1": [],
            "accuracy": [],
            "precision": [],
            "recall": [],
            "mcc": [],
        }
        samples_per_step = max(1, np.floor(predictions.shape[0] / STEPS))

        for i in range(STEPS):
            idx = int(i * samples_per_step)
            if idx + 1 >= predictions.shape[0]:
                break
            if i == 0:
                th = 0.9 * np.min(sorted_predictions)
            else:
                th = float(
                    0.5 *
                    (sorted_predictions[idx] + sorted_predictions[idx + 1]))

            if np.sum(predictions > th) < 1:
                break
            response = (predictions > th).astype(int)

            details["threshold"] += [th]
            details["f1"] += [f1_score(target, response)]
            details["accuracy"] += [accuracy_score(target, response)]
            details["precision"] += [precision_score(target, response)]
            details["recall"] += [recall_score(target, response)]
            if i == 0:
                details["mcc"] += [0.0]
            else:
                details["mcc"] += [matthews_corrcoef(target, response)]

        # max metrics
        max_metrics = {
            "logloss": {
                "score": logloss(target, predictions),
                "threshold": None,
            },  # there is no threshold for LogLoss
            "auc": {
                "score": roc_auc_score(target, predictions),
                "threshold": None,
            },  # there is no threshold for AUC
            "f1": {
                "score": np.max(details["f1"]),
                "threshold": details["threshold"][np.argmax(details["f1"])],
            },
            "accuracy": {
                "score": np.max(details["accuracy"]),
                "threshold":
                details["threshold"][np.argmax(details["accuracy"])],
            },
            "precision": {
                "score": np.max(details["precision"]),
                "threshold":
                details["threshold"][np.argmax(details["precision"])],
            },
            "recall": {
                "score": np.max(details["recall"]),
                "threshold":
                details["threshold"][np.argmax(details["recall"])],
            },
            "mcc": {
                "score": np.max(details["mcc"]),
                "threshold": details["threshold"][np.argmax(details["mcc"])],
            },
        }
        # confusion matrix
        conf_matrix = confusion_matrix(
            target, predictions > max_metrics["f1"]["threshold"])
        conf_matrix = pd.DataFrame(
            conf_matrix,
            columns=["Predicted as negative", "Predicted as positive"],
            index=["Labeled as negative", "Labeled as positive"],
        )

        return {
            "metric_details": pd.DataFrame(details),
            "max_metrics": pd.DataFrame(max_metrics),
            "confusion_matrix": conf_matrix,
            "threshold": float(max_metrics["f1"]["threshold"]),
        }
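A hypothetical call of the Example #5 variant, again with synthetic data, treating the method as a standalone function and relying on the `logloss` sketch above; `predictions` is expected to hold positive-class probabilities:

    # Synthetic binary example: targets and positive-class probabilities.
    target = pd.Series([0, 0, 1, 1, 0, 1, 1, 0])
    predictions = pd.DataFrame(
        {"prediction": [0.1, 0.3, 0.8, 0.7, 0.4, 0.9, 0.6, 0.2]})

    result = binary_classification(target, predictions)
    print("chosen threshold:", result["threshold"])
    print(result["max_metrics"])
    print(result["confusion_matrix"])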