Example No. 1
    def test_evaluate(self):
        t = self.dtrain[self.target]
        c = self.model.predict(self.dtrain, "class")
        p = self.model.predict(self.dtrain, "probability_vector")
        ans_metrics = [
            "accuracy",
            "auc",
            "confusion_matrix",
            "f1_score",
            "log_loss",
            "precision",
            "recall",
            "roc_curve",
        ]

        self.sm_metrics = {
            "accuracy": evaluation.accuracy(t, c),
            "auc": evaluation.auc(t, p),
            "confusion_matrix": evaluation.confusion_matrix(t, c),
            "f1_score": evaluation.f1_score(t, c),
            "log_loss": evaluation.log_loss(t, p),
            "precision": evaluation.precision(t, c),
            "recall": evaluation.recall(t, c),
            "roc_curve": evaluation.roc_curve(t, p),
        }
        model = self.model

        def check_cf_matrix(ans):
            self.assertTrue(ans is not None)
            self.assertTrue("confusion_matrix" in ans)
            cf = ans["confusion_matrix"].sort(["target_label", "predicted_label"])
            ans_cf = self.sm_metrics["confusion_matrix"].sort(
                ["target_label", "predicted_label"]
            )
            self.assertEqual(list(cf["count"]), list(ans_cf["count"]))

        def check_roc_curve(ans):
            self.assertTrue(ans is not None)
            self.assertTrue("roc_curve" in ans)
            roc = ans["roc_curve"]
            self.assertEqual(type(roc), tc.SFrame)

        def check_metric(ans, metric):
            if metric == "confusion_matrix":
                check_cf_matrix(ans)
            elif metric == "roc_curve":
                check_roc_curve(ans)
            else:
                self.assertTrue(ans is not None)
                self.assertTrue(metric in ans)
                self.assertAlmostEqual(
                    ans[metric],
                    self.sm_metrics[metric],
                    places=4,
                    msg="%s = (%s,%s)" % (metric, ans[metric], self.sm_metrics[metric]),
                )

        # Default
        ans = model.evaluate(self.dtrain)
        self.assertEqual(sorted(ans.keys()), sorted(ans_metrics))
        for m in ans_metrics:
            check_metric(ans, m)

        # Individual
        for m in ans_metrics:
            ans = model.evaluate(self.dtrain, metric=m)
            check_metric(ans, m)

        # Test evaluate with new class
        test_data = self.dtrain.copy().head()
        test_data[self.target] = test_data[self.target].apply(lambda x: str(x) + "-new")
        for m in ans_metrics:
            ans = model.evaluate(test_data, metric=m)
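For reference, a minimal standalone sketch of the cross-check this test performs: compute metrics directly with `turicreate.evaluation` and compare them with the dictionary returned by `model.evaluate`. The names `model`, `data`, and the target column `'label'` are illustrative placeholders, not part of the fixture above.

import turicreate as tc

# Assumed placeholders: a trained Turi Create classifier `model`, an SFrame
# `data`, and a target column named 'label'.
classes = model.predict(data, output_type="class")
probs = model.predict(data, output_type="probability_vector")

expected = {
    "accuracy": tc.evaluation.accuracy(data["label"], classes),
    "f1_score": tc.evaluation.f1_score(data["label"], classes),
    "log_loss": tc.evaluation.log_loss(data["label"], probs),
}

results = model.evaluate(data)
for name, value in expected.items():
    # Scalar metrics should agree with evaluate() up to floating-point noise.
    assert abs(results[name] - value) < 1e-4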
Example No. 2
    def evaluate(self, dataset, metric='auto'):
        """
        Evaluate the model by making predictions of target values and comparing
        these to actual values.

        Parameters
        ----------
        dataset : SFrame
            Dataset of new observations. Must include columns with the same
            names as the session_id, target and features used for model training.
            Additional columns are ignored.

        metric : str, optional
            Name of the evaluation metric.  Possible values are:

            - 'auto'             : Returns all available metrics.
            - 'accuracy'         : Classification accuracy (micro average).
            - 'auc'              : Area under the ROC curve (macro average)
            - 'precision'        : Precision score (macro average)
            - 'recall'           : Recall score (macro average)
            - 'f1_score'         : F1 score (macro average)
            - 'log_loss'         : Log loss
            - 'confusion_matrix' : An SFrame with counts of possible
                                   prediction/true label combinations.
            - 'roc_curve'        : An SFrame containing information needed for an
                                   ROC curve

        Returns
        -------
        out : dict
            Dictionary of evaluation results where the key is the name of the
            evaluation metric (e.g. `accuracy`) and the value is the evaluation
            score.

        See Also
        ----------
        create, predict

        Examples
        ----------
        .. sourcecode:: python

          >>> results = model.evaluate(data)
          >>> print(results['accuracy'])
        """

        avail_metrics = [
            'accuracy', 'auc', 'precision', 'recall', 'f1_score', 'log_loss',
            'confusion_matrix', 'roc_curve'
        ]
        _tkutl._check_categorical_option_type('metric', metric,
                                              avail_metrics + ['auto'])

        if metric == 'auto':
            metrics = avail_metrics
        else:
            metrics = [metric]

        probs = self.predict(dataset, output_type='probability_vector')
        classes = self.predict(dataset, output_type='class')

        ret = {}
        if 'accuracy' in metrics:
            ret['accuracy'] = _evaluation.accuracy(dataset[self.target],
                                                   classes)
        if 'auc' in metrics:
            ret['auc'] = _evaluation.auc(dataset[self.target],
                                         probs,
                                         index_map=self._target_id_map)
        if 'precision' in metrics:
            ret['precision'] = _evaluation.precision(dataset[self.target],
                                                     classes)
        if 'recall' in metrics:
            ret['recall'] = _evaluation.recall(dataset[self.target], classes)
        if 'f1_score' in metrics:
            ret['f1_score'] = _evaluation.f1_score(dataset[self.target],
                                                   classes)
        if 'log_loss' in metrics:
            ret['log_loss'] = _evaluation.log_loss(
                dataset[self.target], probs, index_map=self._target_id_map)
        if 'confusion_matrix' in metrics:
            ret['confusion_matrix'] = _evaluation.confusion_matrix(
                dataset[self.target], classes)
        if 'roc_curve' in metrics:
            ret['roc_curve'] = _evaluation.roc_curve(
                dataset[self.target], probs, index_map=self._target_id_map)

        return ret
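A hedged usage sketch of the method above, assuming a trained activity classifier `model` and an SFrame `test_data` containing the session_id, target, and feature columns:

# Request a single metric; the confusion matrix is returned as an SFrame.
results = model.evaluate(test_data, metric='confusion_matrix')
cm = results['confusion_matrix'].sort(['target_label', 'predicted_label'])
print(cm)

# With metric='auto' (the default) every available metric is returned at once.
all_results = model.evaluate(test_data)
print(all_results['accuracy'], all_results['auc'])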
Example No. 3
    def evaluate(self, dataset, metric='auto', batch_size=256, verbose=True):
        """
        Evaluate the model by making predictions of target values and comparing
        these to actual values.

        Parameters
        ----------
        dataset : SFrame
            Dataset of new observations. Must include columns with the same
            names as the feature and target columns used for model training.
            Additional columns are ignored.

        metric : str, optional
            Name of the evaluation metric. Possible values are:

            - 'auto'             : Returns all available metrics.
            - 'accuracy'         : Classification accuracy (micro average).
            - 'auc'              : Area under the ROC curve (macro average)
            - 'precision'        : Precision score (macro average)
            - 'recall'           : Recall score (macro average)
            - 'f1_score'         : F1 score (macro average)
            - 'log_loss'         : Log loss
            - 'confusion_matrix' : An SFrame with counts of possible
                                   prediction/true label combinations.
            - 'roc_curve'        : An SFrame containing information needed for an
                                   ROC curve

        batch_size : int, optional
            If you are getting memory errors, try decreasing this value. If you
            have a powerful computer, increasing this value may improve
            performance.

        verbose : bool, optional
            If True, prints prediction progress.

        Returns
        -------
        out : dict
            Dictionary of evaluation results where the key is the name of the
            evaluation metric (e.g. `accuracy`) and the value is the evaluation
            score.

        See Also
        ----------
        create, predict

        Examples
        ----------
        .. sourcecode:: python

          >>> results = model.evaluate(data)
          >>> print(results['accuracy'])
        """
        import os, json, math

        if self.target not in dataset.column_names():
            raise _ToolkitError("Must provide ground truth column, '" +
                                self.target + "' in the evaluation dataset.")

        predicted = self._predict_with_probabilities(dataset, batch_size,
                                                     verbose)

        avail_metrics = [
            'accuracy', 'auc', 'precision', 'recall', 'f1_score',
            'confusion_matrix', 'roc_curve', 'log_loss'
        ]

        _tkutl._check_categorical_option_type('metric', metric,
                                              avail_metrics + ['auto'])

        metrics = avail_metrics if metric == 'auto' else [metric]

        labels = self.classes

        ret = {}
        if 'accuracy' in metrics:
            ret['accuracy'] = _evaluation.accuracy(dataset[self.target],
                                                   predicted[self.target])
        if 'auc' in metrics:
            ret['auc'] = _evaluation.auc(dataset[self.target],
                                         predicted['probability'],
                                         index_map=self._class_to_index)
        if 'precision' in metrics:
            ret['precision'] = _evaluation.precision(dataset[self.target],
                                                     predicted[self.target])
        if 'recall' in metrics:
            ret['recall'] = _evaluation.recall(dataset[self.target],
                                               predicted[self.target])
        if 'f1_score' in metrics:
            ret['f1_score'] = _evaluation.f1_score(dataset[self.target],
                                                   predicted[self.target])
        if 'confusion_matrix' in metrics:
            ret['confusion_matrix'] = _evaluation.confusion_matrix(
                dataset[self.target], predicted[self.target])
        if 'roc_curve' in metrics:
            ret['roc_curve'] = _evaluation.roc_curve(
                dataset[self.target],
                predicted['probability'],
                index_map=self._class_to_index)
        if 'log_loss' in metrics:
            ret['log_loss'] = _evaluation.log_loss(
                dataset[self.target],
                predicted['probability'],
                index_map=self._class_to_index)

        from .._evaluate_utils import (entropy, confidence,
                                       relative_confidence,
                                       get_confusion_matrix, hclusterSort,
                                       l2Dist)
        evaluation_result = {k: ret[k] for k in metrics}
        evaluation_result['num_test_examples'] = len(dataset)
        for k in [
                'num_classes', 'num_examples', 'training_loss',
                'training_time', 'max_iterations'
        ]:
            evaluation_result[k] = getattr(self, k)

        #evaluation_result['input_image_shape'] = getattr(self, 'input_image_shape')

        evaluation_result["model_name"] = "Drawing Classifier"
        extended_test = dataset.add_column(predicted["probability"], 'probs')
        extended_test['label'] = dataset[self.target]

        extended_test = extended_test.add_columns([
            extended_test.apply(
                lambda d: labels[d['probs'].index(confidence(d['probs']))]),
            extended_test.apply(lambda d: entropy(d['probs'])),
            extended_test.apply(lambda d: confidence(d['probs'])),
            extended_test.apply(lambda d: relative_confidence(d['probs']))
        ], ['predicted_label', 'entropy', 'confidence', 'relative_confidence'])

        extended_test = extended_test.add_column(
            extended_test.apply(lambda d: d['label'] == d['predicted_label']),
            'correct')

        sf_conf_mat = get_confusion_matrix(extended_test, labels)
        confidence_threshold = 0.5
        hesitant_threshold = 0.2
        evaluation_result['confidence_threshold'] = confidence_threshold
        evaluation_result['hesitant_threshold'] = hesitant_threshold
        evaluation_result['confidence_metric_for_threshold'] = 'relative_confidence'

        evaluation_result['conf_mat'] = list(sf_conf_mat)

        vectors = map(
            lambda l: {
                'name': l,
                'pos': list(sf_conf_mat[sf_conf_mat['target_label'] == l]
                            .sort('predicted_label')['norm_prob'])
            }, labels)
        evaluation_result['sorted_labels'] = hclusterSort(
            vectors, l2Dist)[0]['name'].split("|")

        per_l = extended_test.groupby(
            ['label'], {
                'count': _tc.aggregate.COUNT,
                'correct_count': _tc.aggregate.SUM('correct')
            })
        per_l['recall'] = per_l.apply(
            lambda l: l['correct_count'] * 1.0 / l['count'])

        per_pl = extended_test.groupby(
            ['predicted_label'], {
                'predicted_count': _tc.aggregate.COUNT,
                'correct_count': _tc.aggregate.SUM('correct')
            })
        per_pl['precision'] = per_pl.apply(
            lambda l: l['correct_count'] * 1.0 / l['predicted_count'])
        per_pl = per_pl.rename({'predicted_label': 'label'})
        evaluation_result['label_metrics'] = list(
            per_l.join(per_pl, on='label', how='outer').select_columns([
                'label', 'count', 'correct_count', 'predicted_count', 'recall',
                'precision'
            ]))
        evaluation_result['labels'] = labels

        extended_test = extended_test.add_row_number('__idx').rename(
            {'label': 'target_label'})

        evaluation_result['test_data'] = extended_test
        evaluation_result['feature'] = self.feature

        return _Evaluation(evaluation_result)
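The per-label metrics above come from two groupby passes over the extended predictions table. A minimal sketch of that pattern in isolation, assuming an SFrame `preds` with 'label', 'predicted_label', and an integer 'correct' column as built in the method above:

import turicreate as tc

# Recall per true label: correct predictions divided by how often the label occurs.
per_label = preds.groupby(['label'], {
    'count': tc.aggregate.COUNT,
    'correct_count': tc.aggregate.SUM('correct')
})
per_label['recall'] = per_label.apply(lambda r: r['correct_count'] / float(r['count']))

# Precision per predicted label: correct predictions divided by how often it was predicted.
per_pred = preds.groupby(['predicted_label'], {
    'predicted_count': tc.aggregate.COUNT,
    'correct_count': tc.aggregate.SUM('correct')
})
per_pred['precision'] = per_pred.apply(
    lambda r: r['correct_count'] / float(r['predicted_count']))
per_pred = per_pred.remove_column('correct_count').rename({'predicted_label': 'label'})

# Outer join so labels that never occur (or are never predicted) still appear.
label_metrics = per_label.join(per_pred, on='label', how='outer')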
    def evaluate(self, dataset, metric="auto", verbose=True, batch_size=64):
        """
        Evaluate the model by making predictions of target values and comparing
        these to actual values.

        Parameters
        ----------
        dataset : SFrame
            Dataset to use for evaluation, must include a column with the same
            name as the features used for model training. Additional columns
            are ignored.

        metric : str, optional
            Name of the evaluation metric.  Possible values are:

            - 'auto'             : Returns all available metrics.
            - 'accuracy'         : Classification accuracy (micro average).
            - 'auc'              : Area under the ROC curve (macro average)
            - 'precision'        : Precision score (macro average)
            - 'recall'           : Recall score (macro average)
            - 'f1_score'         : F1 score (macro average)
            - 'log_loss'         : Log loss
            - 'confusion_matrix' : An SFrame with counts of possible
                                   prediction/true label combinations.
            - 'roc_curve'        : An SFrame containing information needed for an
                                   ROC curve

        verbose : bool, optional
            If True, prints progress updates and model details.

        batch_size : int, optional
            If you are getting memory errors, try decreasing this value. If you
            have a powerful computer, increasing this value may improve performance.

        Returns
        -------
        out : dict
            Dictionary of evaluation results where the key is the name of the
            evaluation metric (e.g. `accuracy`) and the value is the evaluation
            score.

        See Also
        ----------
        classify, predict

        Examples
        ----------
        .. sourcecode:: python

          >>> results = model.evaluate(data)
          >>> print(results['accuracy'])
        """
        from turicreate.toolkits import evaluation

        # parameter checking
        if not isinstance(dataset, _tc.SFrame):
            raise TypeError("'dataset' parameter must be an SFrame")

        avail_metrics = [
            "accuracy",
            "auc",
            "precision",
            "recall",
            "f1_score",
            "log_loss",
            "confusion_matrix",
            "roc_curve",
        ]
        _tk_utils._check_categorical_option_type(
            "metric", metric, avail_metrics + ["auto"]
        )

        if metric == "auto":
            metrics = avail_metrics
        else:
            metrics = [metric]

        if _is_deep_feature_sarray(dataset[self.feature]):
            deep_features = dataset[self.feature]
        else:
            deep_features = get_deep_features(dataset[self.feature], verbose=verbose)
        data = _tc.SFrame({"deep features": deep_features})
        data = data.add_row_number()
        missing_ids = data.filter_by([[]], "deep features")["id"]

        if len(missing_ids) > 0:
            data = data.filter_by([[]], "deep features", exclude=True)
            # Remove the labels for entries without deep features
            _logging.warning(
                "Dropping %d examples which are less than 975ms in length."
                % len(missing_ids)
            )
            labels = dataset[[self.target]].add_row_number()
            labels = data.join(labels, how="left")[self.target]
        else:
            labels = dataset[self.target]
        assert len(labels) == len(data)

        if any([m in metrics for m in ("roc_curve", "log_loss", "auc")]):
            probs = self.predict(
                data["deep features"],
                output_type="probability_vector",
                verbose=verbose,
                batch_size=batch_size,
            )
        if any(
            [
                m in metrics
                for m in (
                    "accuracy",
                    "precision",
                    "recall",
                    "f1_score",
                    "confusion_matrix",
                )
            ]
        ):
            classes = self.predict(
                data["deep features"],
                output_type="class",
                verbose=verbose,
                batch_size=batch_size,
            )

        ret = {}
        if "accuracy" in metrics:
            ret["accuracy"] = evaluation.accuracy(labels, classes)
        if "auc" in metrics:
            ret["auc"] = evaluation.auc(
                labels, probs, index_map=self._class_label_to_id
            )
        if "precision" in metrics:
            ret["precision"] = evaluation.precision(labels, classes)
        if "recall" in metrics:
            ret["recall"] = evaluation.recall(labels, classes)
        if "f1_score" in metrics:
            ret["f1_score"] = evaluation.f1_score(labels, classes)
        if "log_loss" in metrics:
            ret["log_loss"] = evaluation.log_loss(
                labels, probs, index_map=self._class_label_to_id
            )
        if "confusion_matrix" in metrics:
            ret["confusion_matrix"] = evaluation.confusion_matrix(labels, classes)
        if "roc_curve" in metrics:
            ret["roc_curve"] = evaluation.roc_curve(
                labels, probs, index_map=self._class_label_to_id
            )

        return ret
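The label-alignment step above (drop rows that produced no deep features, then re-join labels by row number) is worth seeing in isolation. A minimal sketch with hypothetical data; the column names are placeholders:

import turicreate as tc

# Hypothetical input: the second row has an empty feature list and must be dropped.
dataset = tc.SFrame({
    'features': [[1.0, 2.0], [], [0.5, 0.5]],
    'label': ['a', 'b', 'a'],
})

data = tc.SFrame({'features': dataset['features']}).add_row_number()
# Filter out rows whose feature list is empty...
data = data.filter_by([[]], 'features', exclude=True)

# ...then re-align the labels by joining on the row number ('id') so labels and
# features stay in one-to-one correspondence.
labels = dataset[['label']].add_row_number()
labels = data.join(labels, how='left')['label']
assert len(labels) == len(data)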
Example No. 5
    def evaluate(self, dataset, metric='auto', verbose=True):
        """
        Evaluate the model by making predictions of target values and comparing
        these to actual values.

        Parameters
        ----------
        dataset : SFrame
            Dataset of new observations. Must include columns with the same
            names as the feature and target columns used for model training.
            Additional columns are ignored.

        metric : str, optional
            Name of the evaluation metric. Possible values are:

            - 'auto'             : Returns all available metrics.
            - 'accuracy'         : Classification accuracy (micro average).
            - 'auc'              : Area under the ROC curve (macro average)
            - 'precision'        : Precision score (macro average)
            - 'recall'           : Recall score (macro average)
            - 'f1_score'         : F1 score (macro average)
            - 'confusion_matrix' : An SFrame with counts of possible
                                   prediction/true label combinations.
            - 'roc_curve'        : An SFrame containing information needed for an
                                   ROC curve

        verbose : bool, optional
            If True, prints prediction progress.

        Returns
        -------
        out : dict
            Dictionary of evaluation results where the key is the name of the
            evaluation metric (e.g. `accuracy`) and the value is the evaluation
            score.

        See Also
        ----------
        create, predict

        Examples
        ----------
        .. sourcecode:: python

          >>> results = model.evaluate(data)
          >>> print(results['accuracy'])
        """

        if self.target not in dataset.column_names():
            raise _ToolkitError("Dataset provided to evaluate does not have " 
                + "ground truth in the " + self.target + " column.")

        predicted = self._predict_with_probabilities(dataset, verbose)

        avail_metrics = ['accuracy', 'auc', 'precision', 'recall',
                         'f1_score', 'confusion_matrix', 'roc_curve']

        _tkutl._check_categorical_option_type(
                        'metric', metric, avail_metrics + ['auto'])

        metrics = avail_metrics if metric == 'auto' else [metric]
        
        ret = {}
        if 'accuracy' in metrics:
            ret['accuracy'] = _evaluation.accuracy(
                dataset[self.target], predicted[self.target])
        if 'auc' in metrics:
            ret['auc'] = _evaluation.auc(
                dataset[self.target], predicted['probability'], 
                index_map=self._class_to_index)
        if 'precision' in metrics:
            ret['precision'] = _evaluation.precision(
                dataset[self.target], predicted[self.target])
        if 'recall' in metrics:
            ret['recall'] = _evaluation.recall(
                dataset[self.target], predicted[self.target])
        if 'f1_score' in metrics:
            ret['f1_score'] = _evaluation.f1_score(
                dataset[self.target], predicted[self.target])
        if 'confusion_matrix' in metrics:
            ret['confusion_matrix'] = _evaluation.confusion_matrix(
                dataset[self.target], predicted[self.target])
        if 'roc_curve' in metrics:
            ret['roc_curve'] = _evaluation.roc_curve(
                dataset[self.target], predicted['probability'], 
                index_map=self._class_to_index)
        
        return ret
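A brief hedged usage note for the guard at the top of this method: the evaluation data must carry the ground-truth column, so callers can check for it before calling evaluate. The names `model` and `frames` are placeholders:

# `model.target` is the ground-truth column the model was trained against.
if model.target not in frames.column_names():
    raise ValueError("Evaluation data must contain the '%s' column." % model.target)
results = model.evaluate(frames, metric='accuracy')
print(results['accuracy'])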
Example No. 6
    def evaluate(self, dataset, metric='auto', max_neighbors=10, radius=None):
        """
        Evaluate the model's predictive accuracy. This is done by predicting the
        target class for instances in a new dataset and comparing to known
        target values.

        Parameters
        ----------
        dataset : SFrame
            Dataset of new observations. Must include columns with the same
            names as the target and features used for model training. Additional
            columns are ignored.

        metric : str, optional
            Name of the evaluation metric.  Possible values are:

            - 'auto': Returns all available metrics.

            - 'accuracy': Classification accuracy.

            - 'confusion_matrix': An SFrame with counts of possible
              prediction/true label combinations.

            - 'roc_curve': An SFrame containing information needed for an roc
              curve (binary classification only).

        max_neighbors : int, optional
            Maximum number of neighbors to consider for each point.

        radius : float, optional
            Maximum distance from each point to a neighbor in the reference
            dataset.

        Returns
        -------
        out : dict
            Evaluation results. The dictionary keys are *accuracy* and
            *confusion_matrix* and *roc_curve* (if applicable).

        See also
        --------
        create, predict, predict_topk, classify

        Notes
        -----
        - Because the model randomly breaks ties between predicted classes, the
          results of repeated calls to `evaluate` method may differ.

        Examples
        --------
        >>> sf_train = turicreate.SFrame({'species': ['cat', 'dog', 'fossa', 'dog'],
        ...                             'height': [9, 25, 20, 23],
        ...                             'weight': [13, 28, 33, 22]})
        >>> m = turicreate.nearest_neighbor_classifier.create(sf_train, target='species')
        >>> ans = m.evaluate(sf_train, max_neighbors=2,
        ...                  metric='confusion_matrix')
        >>> print(ans['confusion_matrix'])
        +--------------+-----------------+-------+
        | target_label | predicted_label | count |
        +--------------+-----------------+-------+
        |     cat      |       dog       |   1   |
        |     dog      |       dog       |   2   |
        |    fossa     |       dog       |   1   |
        +--------------+-----------------+-------+
        """

        ## Validate the metric name
        _raise_error_evaluation_metric_is_valid(
            metric, ['auto', 'accuracy', 'confusion_matrix', 'roc_curve'])

        ## Make sure the input dataset has a target column with an appropriate
        #  type.
        target = self.target
        _raise_error_if_column_exists(dataset, target, 'dataset', target)

        if dataset[target].dtype not in (str, int):
            raise TypeError("The target column of the evaluation dataset must "
                            "contain integers or strings.")

        if self.num_classes != 2:
            if (metric == 'roc_curve') or (metric == ['roc_curve']):
                err_msg = "Currently, ROC curve is not supported for "
                err_msg += "multi-class classification in this model."
                raise _ToolkitError(err_msg)
            elif metric == 'auto':
                # Only warn when 'roc_curve' is implicitly requested via 'auto'.
                warn_msg = "WARNING: Ignoring `roc_curve`. "
                warn_msg += "Not supported for multi-class classification."
                print(warn_msg)

        ## Compute predictions with the input dataset.
        ystar = self.predict(dataset,
                             output_type='class',
                             max_neighbors=max_neighbors,
                             radius=radius)
        ystar_prob = self.predict(dataset,
                                  output_type='probability',
                                  max_neighbors=max_neighbors,
                                  radius=radius)

        ## Compile accuracy metrics
        results = {}

        if metric in ['accuracy', 'auto']:
            results['accuracy'] = _evaluation.accuracy(targets=dataset[target],
                                                       predictions=ystar)

        if metric in ['confusion_matrix', 'auto']:
            results['confusion_matrix'] = \
                _evaluation.confusion_matrix(targets=dataset[target],
                                             predictions=ystar)

        if self.num_classes == 2:
            if metric in ['roc_curve', 'auto']:
                results['roc_curve'] = \
                    _evaluation.roc_curve(targets=dataset[target],
                                          predictions=ystar_prob)
        return results
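Since this model only reports a ROC curve for binary problems, here is a short hedged sketch of what that metric looks like when computed directly, using hypothetical binary targets and probability scores:

import turicreate as tc

# Hypothetical binary targets and predicted probabilities of the positive class.
targets = tc.SArray([0, 1, 1, 0, 1])
probabilities = tc.SArray([0.1, 0.8, 0.6, 0.4, 0.9])

# roc_curve returns an SFrame of threshold/fpr/tpr rows; auc summarizes it as a scalar.
roc = tc.evaluation.roc_curve(targets, probabilities)
print(roc)
print(tc.evaluation.auc(targets, probabilities))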
Example No. 7
    def evaluate(self, dataset, metric='auto', batch_size=64):
        """
        Evaluate the model by making predictions of target values and comparing
        these to actual values.

        Parameters
        ----------
        dataset : SFrame
            Dataset to use for evaluation, must include a column with the same
            name as the features used for model training. Additional columns
            are ignored.

        metric : str, optional
            Name of the evaluation metric.  Possible values are:

            - 'auto'             : Returns all available metrics.
            - 'accuracy'         : Classification accuracy (micro average).
            - 'auc'              : Area under the ROC curve (macro average)
            - 'precision'        : Precision score (macro average)
            - 'recall'           : Recall score (macro average)
            - 'f1_score'         : F1 score (macro average)
            - 'log_loss'         : Log loss
            - 'confusion_matrix' : An SFrame with counts of possible
                                   prediction/true label combinations.
            - 'roc_curve'        : An SFrame containing information needed for an
                                   ROC curve

        batch_size : int, optional
            If you are getting memory errors, try decreasing this value. If you
            have a powerful computer, increasing this value may improve performance.

        Returns
        -------
        out : dict
            Dictionary of evaluation results where the key is the name of the
            evaluation metric (e.g. `accuracy`) and the value is the evaluation
            score.

        See Also
        ----------
        classify, predict

        Examples
        ----------
        .. sourcecode:: python

          >>> results = model.evaluate(data)
          >>> print(results['accuracy'])
        """
        from turicreate.toolkits import evaluation

        # parameter checking
        if not isinstance(dataset, _tc.SFrame):
            raise TypeError("'dataset' parameter must be an SFrame")
        if batch_size < 1:
            raise ValueError("'batch_size' must be greater than or equal to 1")

        avail_metrics = ['accuracy', 'auc', 'precision', 'recall',
                         'f1_score', 'log_loss', 'confusion_matrix', 'roc_curve']
        _tk_utils._check_categorical_option_type(
            'metric', metric, avail_metrics + ['auto'])

        if metric == 'auto':
            metrics = avail_metrics
        else:
            metrics = [metric]

        if any([m in metrics for m in ('roc_curve', 'log_loss', 'auc')]):
            probs = self.predict(dataset, output_type='probability_vector', batch_size=batch_size)
        if any([m in metrics for m in ('accuracy', 'precision', 'recall', 'f1_score', 'confusion_matrix')]):
            classes = self.predict(dataset, output_type='class', batch_size=batch_size)

        ret = {}
        if 'accuracy' in metrics:
            ret['accuracy'] = evaluation.accuracy(dataset[self.target], classes)
        if 'auc' in metrics:
            ret['auc'] = evaluation.auc(dataset[self.target], probs, index_map=self._class_label_to_id)
        if 'precision' in metrics:
            ret['precision'] = evaluation.precision(dataset[self.target], classes)
        if 'recall' in metrics:
            ret['recall'] = evaluation.recall(dataset[self.target], classes)
        if 'f1_score' in metrics:
            ret['f1_score'] = evaluation.f1_score(dataset[self.target], classes)
        if 'log_loss' in metrics:
            ret['log_loss'] = evaluation.log_loss(dataset[self.target], probs, index_map=self._class_label_to_id)
        if 'confusion_matrix' in metrics:
            ret['confusion_matrix'] = evaluation.confusion_matrix(dataset[self.target], classes)
        if 'roc_curve' in metrics:
            ret['roc_curve'] = evaluation.roc_curve(dataset[self.target], probs, index_map=self._class_label_to_id)

        return ret
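As the guards above show, requesting a single metric runs only the prediction pass that metric needs. A hedged usage sketch, assuming a trained sound classifier `model` and an SFrame `test_data` with the feature and target columns:

# Only the probability-vector prediction is computed for log_loss; the class
# prediction pass is skipped entirely.
print(model.evaluate(test_data, metric='log_loss')['log_loss'])

# Conversely, accuracy only needs class predictions.
print(model.evaluate(test_data, metric='accuracy', batch_size=32)['accuracy'])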