def fit(self, data):
    """
    Fits a transformer using the SFrame `data`.

    The `fit` phase does not train a deep learning model. It only checks
    that the configured feature column of `data` is compatible with the
    extractor (present and of type Image) and then selects an output
    column name that does not collide with any existing column of `data`.

    Parameters
    ----------
    data : SFrame
        The data used to fit the transformer.

    Returns
    -------
    self (A fitted object)

    Raises
    ------
    ToolkitError
        If the feature column is not of type Image.

    See Also
    --------
    transform, fit_transform

    Examples
    --------
    # Create data.
    >>> import graphlab as gl

    # Import data from MNIST
    >>> data = gl.SFrame('http://s3.amazonaws.com/dato-datasets/mnist/sframe/train6k')

    # Create a DeepFeatureExtractorObject
    >>> extractor = gl.feature_engineering.DeepFeatureExtractor(
    ...     features = 'image')

    # Fit the encoder for a given dataset.
    >>> extractor = extractor.fit(data)

    # Return the model used for the deep feature extraction.
    >>> extractor['model']
    """
    _mt._get_metric_tracker().track(self.__class__.__module__ + '.fit')

    # Run every validation before touching `_state`, so that a failed fit
    # leaves the transformer exactly as it was.
    _raise_error_if_not_of_type(data, [_SFrame])

    feature = self._state["features"]
    # NOTE(review): despite its name, this helper is used here to check that
    # the column IS in the SFrame -- confirm against its definition.
    _raise_error_if_column_exists(data, feature)

    if data[feature].dtype() != _Image:
        raise ToolkitError(
            "Feature `%s` must be of type Image." % feature)

    # Make sure the output column name is unique: append ".1", ".2", ...
    # to the configured name until it no longer clashes with a column of
    # `data`. The column names are read once since `data` is not modified.
    existing_columns = set(data.column_names())
    base_name = self._state["output_column_name"]
    output_column_name = base_name
    count = 1
    while output_column_name in existing_columns:
        output_column_name = "%s.%s" % (base_name, count)
        count += 1
    self._state["output_column_name"] = output_column_name

    return self
def evaluate(self, dataset, metric='auto', missing_value_action='auto'):
    """
    Evaluate the model on the given dataset.

    Parameters
    ----------
    dataset : SFrame
        Dataset in the same format used for training. The columns names and
        types of the dataset must be the same as that used in training.

    metric : str, optional
        Name of the evaluation metric. Possible values are:

        - 'auto'      : Compute all metrics.
        - 'rmse'      : Rooted mean squared error.
        - 'max_error' : Maximum error.

    missing_value_action : str, optional
        Action to perform when missing values are encountered. Can be
        one of:

        - 'auto': By default the model will treat missing value as is.
        - 'impute': Proceed with evaluation by filling in the missing
          values with the mean of the training data. Missing
          values are also imputed if an entire column of data is
          missing during evaluation.
        - 'error': Do not proceed with evaluation and terminate with
          an error message.

    Returns
    -------
    out : dict
        A dictionary containing the evaluation result.

    See Also
    --------
    create, predict

    Examples
    --------
    >>> results = model.evaluate(test_data, 'rmse')
    """
    _mt._get_metric_tracker().track(
        'toolkit.regression.random_forest_regression.evaluate')
    _raise_error_evaluation_metric_is_valid(
        metric, ['auto', 'rmse', 'max_error'])

    results = {}
    if metric in ['rmse', 'auto']:
        results = super(RandomForestRegression, self).evaluate(
            dataset, metric=metric,
            missing_value_action=missing_value_action)

    if metric in ['max_error', 'auto']:
        target = self.get('target')
        # Validate the target column *name* (the previous code passed the
        # predictions SArray here) and do so before the costly prediction.
        _raise_error_if_column_exists(dataset, target, 'dataset',
                                      target + '(target column)')
        predictions = self.predict(
            dataset, missing_value_action=missing_value_action)
        # Targets first, then predictions, matching the other evaluation
        # calls in this file.
        results['max_error'] = _graphlab.evaluation.max_error(
            dataset[target], predictions)

    return results
def evaluate(self, dataset, metric='auto'):
    """
    Evaluate the model on the given dataset.

    Parameters
    ----------
    dataset : SFrame
        Dataset in the same format used for training. The columns names and
        types of the dataset must be the same as that used in training.

    metric : str, optional
        Name of the evaluation metric. Possible values are:

        - 'auto'             : Returns all available metrics.
        - 'accuracy'         : Classification accuracy.
        - 'confusion_matrix' : An SFrame with counts of possible
          prediction/true label combinations.

    Returns
    -------
    out : dict
        A dictionary containing the evaluation result.

    See Also
    --------
    create, predict, classify

    Examples
    --------
    >>> results = model.evaluate(test_data)
    >>> results = model.evaluate(test_data, metric='accuracy')
    >>> results = model.evaluate(test_data, metric='confusion_matrix')

    Notes
    -----
    When evaluating for classification metrics (e.g. auc, confusion_matrix),
    the classification threshold is set to 0.5. For more flexible
    classification accuracy, please use functions in the
    :py:mod:`~graphlab.toolkits.evaluation` module.
    """
    _mt._get_metric_tracker().track(
        'toolkit.classifier.boosted_trees_classifier.evaluate')
    _raise_error_evaluation_metric_is_valid(
        metric, ['auto', 'accuracy', 'confusion_matrix'])

    results = {}
    if metric in ['auto', 'accuracy']:
        results = super(_Classifier, self).evaluate(dataset, metric=metric)

    if metric in ['confusion_matrix', 'auto']:
        target = self.get('target')
        # Validate the target column *name* (the previous code passed the
        # predictions SArray here) and do so before the costly prediction.
        _raise_error_if_column_exists(dataset, target, 'dataset',
                                      target + '(target column)')
        predictions = self.predict(dataset, output_type='class')
        # Targets first, then predictions; the reversed order swaps the
        # target and predicted labels in the resulting matrix.
        results['confusion_matrix'] = _graphlab.evaluation.confusion_matrix(
            dataset[target], predictions)

    return results
def evaluate(self, dataset, metric='auto'):
    """
    Evaluate the model on the given dataset.

    Parameters
    ----------
    dataset : SFrame
        Dataset in the same format used for training. The columns names and
        types of the dataset must be the same as that used in training.

    metric : str, optional
        Name of the evaluation metric. Possible values are:

        - 'auto'      : Compute all metrics.
        - 'rmse'      : Rooted mean squared error.
        - 'max_error' : Maximum error.

    Returns
    -------
    out : dict
        A dictionary containing the evaluation result.

    See Also
    --------
    create, predict

    Examples
    --------
    >>> results = model.evaluate(test_data, 'rmse')
    """
    _mt._get_metric_tracker().track(
        'toolkit.regression.boosted_trees_regression.evaluate')
    _raise_error_evaluation_metric_is_valid(
        metric, ['auto', 'rmse', 'max_error'])

    results = {}
    if metric in ['rmse', 'auto']:
        results = super(BoostedTreesRegression, self).evaluate(
            dataset, metric=metric)

    if metric in ['max_error', 'auto']:
        target = self.get('target')
        # Validate the target column *name* (the previous code passed the
        # predictions SArray here) and do so before the costly prediction.
        _raise_error_if_column_exists(dataset, target, 'dataset',
                                      target + '(target column)')
        predictions = self.predict(dataset)
        # Targets first, then predictions, matching the other evaluation
        # calls in this file.
        results['max_error'] = _graphlab.evaluation.max_error(
            dataset[target], predictions)

    return results
def fit(self, data):
    """
    Fit the transformer against the SFrame `data`.

    No deep learning model is trained here. Fitting only verifies that
    `data` is an SFrame and that every configured feature column passes the
    column check and holds Image values.

    Parameters
    ----------
    data : SFrame
        The data used to fit the transformer.

    Returns
    -------
    self (A fitted object)

    Raises
    ------
    ToolkitError
        If one of the feature columns is not of type Image.

    See Also
    --------
    transform, fit_transform

    Examples
    --------
    # Create data.
    >>> import graphlab as gl

    # Import data from MNIST
    >>> data = gl.SFrame('https://static.turi.com/datasets/mnist/sframe/train6k')

    # Create a DeepFeatureExtractorObject
    >>> extractor = gl.feature_engineering.DeepFeatureExtractor(features = 'image')

    # Fit the encoder for a given dataset.
    >>> extractor = extractor.fit(data)

    # Return the model used for the deep feature extraction.
    >>> extractor['model']
    """
    tracker = _mt._get_metric_tracker()
    tracker.track(self.__class__.__module__ + '.fit')

    _raise_error_if_not_of_type(data, [_SFrame])

    # Each configured feature is checked in turn; the first offending
    # column aborts the fit.
    for column_name in self._state["features"]:
        _raise_error_if_column_exists(data, column_name)
        column_type = data[column_name].dtype()
        if column_type != _Image:
            raise ToolkitError("Feature `%s` must be of type Image." % column_name)

    return self
def create(data, row_label=None, features=None, feature_model='auto',
           method='lsh', verbose=True):
    """
    Create a similarity search model, which can be used to quickly retrieve
    items similar to a query observation. In the case of images, this model
    automatically performs the appropriate feature engineering steps.

    NOTE: If you are using a CPU for the creation step with
    feature_model='auto', creation time may take a while. This is because
    extracting features for images on a CPU is expensive. With a GPU, one
    can expect large speedups.

    .. warning::

        The similarity search toolkit is currently in beta, and feedback is
        welcome!

    Parameters
    ----------
    data : SFrame
        The SFrame that represents the training data for the model,
        including at least one column of images.

    row_label : str, optional
        Name of the SFrame column with row id's. If 'row_label' is not
        specified, row numbers are used to identify reference dataset rows
        when the model is queried.

    features : str
        The name of an image column in `data`. Although the signature
        defaults to None, the value is type-checked against `str`, so a
        column name should always be supplied.

    feature_model : 'auto' | A model of type NeuralNetClassifier, optional
        A trained model for extracting features from raw data objects. By
        default ('auto'), we choose an appropriate model from our set of
        pre-trained models. See
        :class:`~graphlab.toolkits.feature_engineering.DeepFeatureExtractor`
        for more information.

    method : {'lsh', 'brute_force'}, optional
        The method used for nearest neighbor search. The 'lsh' option uses
        locality-sensitive hashing to find approximate results more quickly.

    verbose : bool, optional
        If True, print verbose output during model creation.

    Returns
    -------
    out : SimilaritySearchModel

    Raises
    ------
    _ToolkitError
        If the `features` column is not of type Image.

    See Also
    --------
    SimilaritySearchModel
    graphlab.toolkits.nearest_neighbors
    graphlab.toolkits.feature_engineering

    Notes
    -----
    The similarity search toolkit currently uses cosine distance to evaluate
    the similarity between each query and candidate results.

    Examples
    --------
    First, split data into reference and query.

    >>> import graphlab as gl
    >>> data = gl.SFrame('http://s3.amazonaws.com/dato-datasets/mnist/sframe/train6k')
    >>> reference, query = data.random_split(0.8)

    Build neuralnet feature extractor for images:

    >>> nn_model = gl.neuralnet_classifier.create(reference, target='label')

    Construct SimilaritySearchModel:

    >>> model = gl.similarity_search.create(reference, features= 'image',
    ...                                     feature_model=nn_model)

    Find the most similar items in the reference set for each item in the
    query set:

    >>> model.search(query)
    """
    _mt._get_metric_tracker().track(__name__ + '.create')

    # Input validation: container type, feature name type, column check.
    _raise_error_if_not_of_type(data, [_SFrame])
    _raise_error_if_not_of_type(features, [str])
    _raise_error_if_column_exists(data, features)

    feature_dtype = data[features].dtype()
    if feature_dtype != _Image:
        raise _ToolkitError("Feature `%s` must be of type Image" % features)

    # Note the model constructor names this argument `feature` (singular).
    model_options = dict(row_label=row_label,
                         feature=features,
                         feature_model=feature_model,
                         method=method,
                         verbose=verbose)
    return SimilaritySearchModel(data, **model_options)
def search(self, data, row_label=None, k=5):
    """
    Search for the nearest neighbors from the reference set for each
    element of the query set. The query SFrame must include columns with the
    same names as the row_label and feature columns used to create the
    SimilaritySearchModel.

    The query SFrame `data` itself is never modified.

    Parameters
    ----------
    data : SFrame
        Query data. Must contain columns with the same names and types as
        the features used to train the model. Additional columns are
        allowed, but ignored.

    row_label : string, optional
        Name of the query SFrame column with row id's. If 'row_label' is not
        specified, row numbers are used to identify query dataset rows in
        the output SFrame.

    k : int, optional
        Number of nearest neighbors to return from the reference set for
        each query observation. The default is 5 neighbors.

    Returns
    -------
    out
        A SFrame that contains all the nearest neighbors.

    Raises
    ------
    ValueError
        If the feature column of `data` does not have the same data type as
        the feature column of the reference set.

    Examples
    --------
    First, split data into reference and query:

    >>> import graphlab as gl
    >>> data = gl.SFrame('http://s3.amazonaws.com/dato-datasets/mnist/sframe/train6k')
    >>> reference, query = data.random_split(0.8)

    Build a neural net feature extractor for images:

    >>> nn_model = gl.neuralnet_classifier.create(reference, target='label')

    Construct the SimilaritySearchModel:

    >>> model = gl.similarity_search.create(reference, features='image',
    ...                                     feature_model=nn_model)

    Find the most similar items in the reference set for each query:

    >>> model.search(query)
    """
    _raise_error_if_not_of_type(row_label, [str, type(None)])

    feature = self._state['features']
    _raise_error_if_column_exists(data, feature)

    feature_dtype = data[feature].dtype()
    if feature_dtype != self._feature_type:
        raise ValueError('Feature columns must have same data type in both reference and query set')

    if row_label is not None:
        _raise_error_if_column_exists(data, row_label)

    if feature_dtype == _Image:
        # Images go through the feature extractor first.
        transformed_data = self._extractor.transform(data)
    else:
        # Non-image features are used as-is under the output column name.
        # Work on a shallow copy so the extra column is not added to the
        # caller's SFrame.
        transformed_data = data.copy()
        transformed_data[self._state['output_column_name']] = transformed_data[feature]

    return self._neighbors_model.query(transformed_data, label=row_label, k=k)
def evaluate(self, dataset, metric='auto', max_neighbors=10, radius=None):
    """
    Evaluate model accuracy by predicting target classes for a new dataset
    and comparing to actual target values.

    Parameters
    ----------
    dataset : SFrame
        Dataset of new observations. Must include columns with the same
        names as the target and features used for model training. Additional
        columns are ignored.

    metric : string, optional
        Name of the evaluation metric. Possible values are:

        - 'auto'             : Returns all available metrics.
        - 'accuracy'         : Classification accuracy.
        - 'confusion_matrix' : An SFrame with counts of possible
          prediction/true label combinations.
        - 'roc_curve'        : An SFrame containing information needed for
          an roc curve.

    max_neighbors : int, optional
        Maximum number of neighbors to consider for each point.

    radius : float, optional
        Maximum distance from each point to a neighbor in the reference
        dataset.

    Returns
    -------
    out : dict
        Evaluation results. The dictionary keys are *accuracy* and
        *confusion_matrix* and *roc_curve*; only the requested metrics are
        present.

    Raises
    ------
    TypeError
        If the target column of `dataset` holds neither integers nor
        strings.

    See also
    --------
    create, predict, predict_topk, classify

    Notes
    -----
    - Because the model randomly breaks ties between predicted classes, the
      results of repeated calls to `evaluate` method may differ.

    Examples
    --------
    >>> sf_train = graphlab.SFrame({'species': ['cat', 'dog', 'fossa', 'dog'],
    ...                             'height': [9, 25, 20, 23],
    ...                             'weight': [13, 28, 33, 22]})
    >>> m = graphlab.nearest_neighbor_classifier.create(sf_train, target='species')
    >>> ans = m.evaluate(sf_train, max_neighbors=2,
    ...                  metric='confusion_matrix')
    >>> print(ans['confusion_matrix'])
    +--------------+-----------------+-------+
    | target_label | predicted_label | count |
    +--------------+-----------------+-------+
    |     cat      |       dog       |   1   |
    |     dog      |       dog       |   2   |
    |    fossa     |       dog       |   1   |
    +--------------+-----------------+-------+
    """
    _mt._get_metric_tracker().track(
        'toolkit.classifier.nearest_neighbor_classifier.evaluate')

    ## Validate the metric name
    _raise_error_evaluation_metric_is_valid(
        metric, ['auto', 'accuracy', 'confusion_matrix', 'roc_curve'])

    ## Make sure the input dataset has a target column with an appropriate
    #  type.
    target = self.get('target')
    _raise_error_if_column_exists(dataset, target, 'dataset', target)

    if dataset[target].dtype() not in (str, int):
        raise TypeError("The target column of the evaluation dataset must "
                        "contain integers or strings.")

    targets = dataset[target]
    results = {}

    ## Class predictions are only computed when a metric that consumes them
    #  was requested.
    if metric in ['accuracy', 'confusion_matrix', 'auto']:
        ystar = self.predict(dataset, output_type='class',
                             max_neighbors=max_neighbors, radius=radius)

        if metric in ['accuracy', 'auto']:
            results['accuracy'] = _gl.evaluation.accuracy(
                targets=targets, predictions=ystar)

        if metric in ['confusion_matrix', 'auto']:
            results['confusion_matrix'] = _gl.evaluation.confusion_matrix(
                targets=targets, predictions=ystar)

    ## Probability predictions are only needed for the ROC curve.
    if metric in ['roc_curve', 'auto']:
        ystar_prob = self.predict(dataset, output_type='probability',
                                  max_neighbors=max_neighbors, radius=radius)
        results['roc_curve'] = _gl.evaluation.roc_curve(
            targets=targets, predictions=ystar_prob)

    return results
def create(data, row_label=None, features=None, feature_model='auto',
           method='lsh', verbose=True):
    """
    Create a similarity search model, which can be used to quickly retrieve
    items similar to a query observation. In the case of images, this model
    automatically performs the appropriate feature engineering steps.

    NOTE: If you are using a CPU for the creation step with
    feature_model='auto', creation time may take a while. This is because
    extracting features for images on a CPU is expensive. With a GPU, one
    can expect large speedups.

    Parameters
    ----------
    data : SFrame
        The SFrame that represents the training data for the model,
        including at least one column of images.

    row_label : str, optional
        Name of the SFrame column with row id's. If 'row_label' is not
        specified, row numbers are used to identify reference dataset rows
        when the model is queried.

    features : str
        The name of an image column in `data`. Although the signature
        defaults to None, the value is type-checked against `str`, so a
        column name should always be supplied.

    feature_model : 'auto' | A model of type NeuralNetClassifier, optional
        A trained model for extracting features from raw data objects. By
        default ('auto'), we choose an appropriate model from our set of
        pre-trained models. See
        :class:`~graphlab.toolkits.feature_engineering.DeepFeatureExtractor`
        for more information.

    method : {'lsh', 'brute_force'}, optional
        The method used for nearest neighbor search. The 'lsh' option uses
        locality-sensitive hashing to find approximate results more quickly.

    verbose : bool, optional
        If True, print verbose output during model creation.

    Returns
    -------
    out : SimilaritySearchModel

    Raises
    ------
    _ToolkitError
        If the `features` column is not of type Image.

    See Also
    --------
    SimilaritySearchModel
    graphlab.toolkits.nearest_neighbors
    graphlab.toolkits.feature_engineering

    Notes
    -----
    The similarity search toolkit currently uses cosine distance to evaluate
    the similarity between each query and candidate results.

    Examples
    --------
    First, split data into reference and query.

    >>> import graphlab as gl
    >>> data = gl.SFrame('https://static.turi.com/datasets/mnist/sframe/train6k')
    >>> reference, query = data.random_split(0.8)

    Build neuralnet feature extractor for images:

    >>> nn_model = gl.neuralnet_classifier.create(reference, target='label')

    Construct SimilaritySearchModel:

    >>> model = gl.similarity_search.create(reference, features= 'image',
    ...                                     feature_model=nn_model)

    Find the most similar items in the reference set for each item in the
    query set:

    >>> model.search(query)
    """
    _mt._get_metric_tracker().track(__name__ + '.create')

    # Input validation: container type, feature name type, column check.
    _raise_error_if_not_of_type(data, [_SFrame])
    _raise_error_if_not_of_type(features, [str])
    _raise_error_if_column_exists(data, features)

    image_column = data[features]
    if image_column.dtype() != _Image:
        raise _ToolkitError("Feature `%s` must be of type Image" % features)

    # Note the model constructor names this argument `feature` (singular).
    model_options = dict(row_label=row_label,
                         feature=features,
                         feature_model=feature_model,
                         method=method,
                         verbose=verbose)
    return SimilaritySearchModel(data, **model_options)
def search(self, data, row_label=None, k=5):
    """
    Search for the nearest neighbors from the reference set for each
    element of the query set. The query SFrame must include columns with the
    same names as the row_label and feature columns used to create the
    SimilaritySearchModel.

    The query SFrame `data` itself is never modified.

    Parameters
    ----------
    data : SFrame
        Query data. Must contain columns with the same names and types as
        the features used to train the model. Additional columns are
        allowed, but ignored.

    row_label : string, optional
        Name of the query SFrame column with row id's. If 'row_label' is not
        specified, row numbers are used to identify query dataset rows in
        the output SFrame.

    k : int, optional
        Number of nearest neighbors to return from the reference set for
        each query observation. The default is 5 neighbors.

    Returns
    -------
    out
        A SFrame that contains all the nearest neighbors.

    Raises
    ------
    ValueError
        If the feature column of `data` does not have the same data type as
        the feature column of the reference set.

    Examples
    --------
    First, split data into reference and query:

    >>> import graphlab as gl
    >>> data = gl.SFrame('https://static.turi.com/datasets/mnist/sframe/train6k')
    >>> reference, query = data.random_split(0.8)

    Build a neural net feature extractor for images:

    >>> nn_model = gl.neuralnet_classifier.create(reference, target='label')

    Construct the SimilaritySearchModel:

    >>> model = gl.similarity_search.create(reference, features='image',
    ...                                     feature_model=nn_model)

    Find the most similar items in the reference set for each query:

    >>> model.search(query)
    """
    _raise_error_if_not_of_type(row_label, [str, type(None)])

    feature = self._state['features']
    _raise_error_if_column_exists(data, feature)

    feature_dtype = data[feature].dtype()
    if feature_dtype != self._feature_type:
        raise ValueError('Feature columns must have same data type in both reference and query set')

    if row_label is not None:
        _raise_error_if_column_exists(data, row_label)

    if feature_dtype == _Image:
        # Images go through the feature extractor first.
        transformed_data = self._extractor.transform(data)
    else:
        # Non-image features are used as-is under the output column name.
        # Work on a shallow copy so the extra column is not added to the
        # caller's SFrame.
        transformed_data = data.copy()
        transformed_data[self._state['output_column_name']] = transformed_data[feature]

    return self._neighbors_model.query(transformed_data, label=row_label, k=k)
def evaluate(self, dataset, metric='auto', missing_value_action='auto'):
    """
    Evaluate the model on the given dataset.

    Parameters
    ----------
    dataset : SFrame
        Dataset in the same format used for training. The columns names and
        types of the dataset must be the same as that used in training.

    metric : str, optional
        Name of the evaluation metric. Possible values are:

        - 'auto'             : Returns all available metrics.
        - 'accuracy'         : Classification accuracy.
        - 'confusion_matrix' : An SFrame with counts of possible
          prediction/true label combinations.
        - 'roc_curve'        : An SFrame containing information needed for
          an roc curve

    missing_value_action : str, optional
        Action to perform when missing values are encountered. Can be
        one of:

        - 'auto': By default the model will treat missing value as is.
        - 'impute': Proceed with evaluation by filling in the missing
          values with the mean of the training data. Missing
          values are also imputed if an entire column of data is
          missing during evaluation.
        - 'error': Do not proceed with evaluation and terminate with
          an error message.

    Returns
    -------
    out : dict
        A dictionary containing the evaluation result.

    See Also
    --------
    create, predict, classify

    Examples
    --------
    >>> results = model.evaluate(test_data)
    >>> results = model.evaluate(test_data, metric='accuracy')
    >>> results = model.evaluate(test_data, metric='confusion_matrix')

    Notes
    -----
    When evaluating for classification metrics (e.g. auc, confusion_matrix),
    the classification threshold is set to 0.5. For more flexible
    classification accuracy, please use functions in the
    :py:mod:`~graphlab.toolkits.evaluation` module.
    """
    _mt._get_metric_tracker().track('toolkit.classifier.boosted_trees_classifier.evaluate')

    supported_metrics = ['auto', 'accuracy', 'confusion_matrix', 'roc_curve']
    _raise_error_evaluation_metric_is_valid(metric, supported_metrics)

    # Metrics computed by the parent implementation.
    results = {}
    if metric in ('auto', 'accuracy', 'roc_curve'):
        results = super(_Classifier, self).evaluate(
            dataset,
            metric=metric,
            missing_value_action=missing_value_action)

    # Nothing more to do unless the confusion matrix was requested.
    if metric not in ('confusion_matrix', 'auto'):
        return results

    # The confusion matrix is assembled here from class predictions.
    predicted_classes = self.predict(
        dataset,
        output_type='class',
        missing_value_action=missing_value_action)
    target_column = self.get('target')
    _raise_error_if_column_exists(dataset, target_column, 'dataset', target_column)
    results['confusion_matrix'] = _graphlab.evaluation.confusion_matrix(
        dataset[target_column], predicted_classes)
    return results
def evaluate(self, dataset, metric='auto', max_neighbors=10, radius=None):
    """
    Evaluate the model's predictive accuracy. This is done by predicting the
    target class for instances in a new dataset and comparing to known
    target values.

    Parameters
    ----------
    dataset : SFrame
        Dataset of new observations. Must include columns with the same
        names as the target and features used for model training. Additional
        columns are ignored.

    metric : str, optional
        Name of the evaluation metric. Possible values are:

        - 'auto': Returns all available metrics.
        - 'accuracy': Classification accuracy.
        - 'confusion_matrix': An SFrame with counts of possible
          prediction/true label combinations.
        - 'roc_curve': An SFrame containing information needed for an roc
          curve (binary classification only).

    max_neighbors : int, optional
        Maximum number of neighbors to consider for each point.

    radius : float, optional
        Maximum distance from each point to a neighbor in the reference
        dataset.

    Returns
    -------
    out : dict
        Evaluation results. The dictionary keys are *accuracy* and
        *confusion_matrix* and *roc_curve* (if applicable); only the
        requested metrics are present.

    Raises
    ------
    TypeError
        If the target column of `dataset` holds neither integers nor
        strings.
    _ToolkitError
        If `metric` is 'roc_curve' and the model is not a binary classifier.

    See also
    --------
    create, predict, predict_topk, classify

    Notes
    -----
    - Because the model randomly breaks ties between predicted classes, the
      results of repeated calls to `evaluate` method may differ.

    - For models with other than two classes, `metric='auto'` skips the ROC
      curve and prints a warning.

    Examples
    --------
    >>> sf_train = graphlab.SFrame({'species': ['cat', 'dog', 'fossa', 'dog'],
    ...                             'height': [9, 25, 20, 23],
    ...                             'weight': [13, 28, 33, 22]})
    >>> m = graphlab.nearest_neighbor_classifier.create(sf_train, target='species')
    >>> ans = m.evaluate(sf_train, max_neighbors=2,
    ...                  metric='confusion_matrix')
    >>> print(ans['confusion_matrix'])
    +--------------+-----------------+-------+
    | target_label | predicted_label | count |
    +--------------+-----------------+-------+
    |     cat      |       dog       |   1   |
    |     dog      |       dog       |   2   |
    |    fossa     |       dog       |   1   |
    +--------------+-----------------+-------+
    """
    _mt._get_metric_tracker().track(
        'toolkit.classifier.nearest_neighbor_classifier.evaluate')

    ## Validate the metric name
    _raise_error_evaluation_metric_is_valid(
        metric, ['auto', 'accuracy', 'confusion_matrix', 'roc_curve'])

    ## Make sure the input dataset has a target column with an appropriate
    #  type.
    target = self.get('target')
    _raise_error_if_column_exists(dataset, target, 'dataset', target)

    if dataset[target].dtype() not in (str, int):
        raise TypeError("The target column of the evaluation dataset must "
                        "contain integers or strings.")

    ## The ROC curve is only produced for binary classification.
    is_binary = self._state["num_classes"] == 2
    if not is_binary:
        if (metric == 'roc_curve') or (metric == ['roc_curve']):
            err_msg = "Currently, ROC curve is not supported for "
            err_msg += "multi-class classification in this model."
            raise _ToolkitError(err_msg)
        elif metric == 'auto':
            # Warn only when the ROC curve would otherwise have been part
            # of the output; 'accuracy' and 'confusion_matrix' requests
            # never asked for it.
            warn_msg = "WARNING: Ignoring `roc_curve`. "
            warn_msg += "Not supported for multi-class classification."
            print(warn_msg)

    targets = dataset[target]
    results = {}

    ## Class predictions are only computed when a metric that consumes them
    #  was requested.
    if metric in ['accuracy', 'confusion_matrix', 'auto']:
        ystar = self.predict(dataset, output_type='class',
                             max_neighbors=max_neighbors, radius=radius)

        if metric in ['accuracy', 'auto']:
            results['accuracy'] = _gl.evaluation.accuracy(
                targets=targets, predictions=ystar)

        if metric in ['confusion_matrix', 'auto']:
            results['confusion_matrix'] = _gl.evaluation.confusion_matrix(
                targets=targets, predictions=ystar)

    ## Probability predictions are only needed for the ROC curve.
    if is_binary and metric in ['roc_curve', 'auto']:
        ystar_prob = self.predict(dataset, output_type='probability',
                                  max_neighbors=max_neighbors, radius=radius)
        results['roc_curve'] = _gl.evaluation.roc_curve(
            targets=targets, predictions=ystar_prob)

    return results