def evaluate(self, dataset, metric="auto", missing_value_action='auto', options={}, **kwargs):
    """
    Score the model by predicting the target values of ``dataset`` and
    comparing those predictions with the actual values.

    Parameters
    ----------
    dataset : SFrame
        Dataset in the same format used for training. The column names and
        types must be the same as those used in training.

    metric : str, list[str]
        Evaluation metric(s) to be computed.

    missing_value_action : str, optional
        Action to perform when missing values are encountered. One of:

        - 'auto': Choose a model dependent missing value policy.
        - 'impute': Proceed with evaluation by filling in the missing
          values with the mean of the training data. Missing values are
          also imputed if an entire column of data is missing during
          evaluation.
        - 'none': Treat missing value as is. Model must be able to handle
          missing value.
        - 'error': Do not proceed with prediction and terminate with an
          error message.

    options : dict
        Additional options to be passed in to prediction.

    kwargs : dict
        Additional options to be passed into prediction.

    Returns
    -------
    out
        The toolkit's response to ``supervised_learning_evaluate``, after
        conversion by ``_map_unity_proxy_to_object``.
    """
    # Resolve the 'auto' placeholder into the model-specific policy first.
    if missing_value_action == 'auto':
        missing_value_action = select_default_missing_value_policy(
            self, 'evaluate')

    _raise_error_if_not_sframe(dataset, "dataset")

    # Build the request on a copy, so neither the caller's dict nor the
    # shared default ``{}`` is ever mutated. Precedence (lowest to highest):
    # ``options``, then ``kwargs``, then the fields required by the toolkit.
    request = options.copy()
    request.update(kwargs)
    required_fields = {
        'model': self.__proxy__,
        'dataset': dataset,
        'model_name': self.__name__,
        'missing_value_action': missing_value_action,
        'metric': metric,
    }
    request.update(required_fields)

    return _map_unity_proxy_to_object(
        _turicreate.toolkits._main.run('supervised_learning_evaluate', request))
def _training_stats(self):
    """
    Fetch the statistics that were collected while the model was trained.

    The same statistics are reachable through the ``get`` method; see the
    documentation of that method for a more detailed description.

    Returns
    -------
    out
        The toolkit's response to ``supervised_learning_get_train_stats``,
        after conversion by ``_map_unity_proxy_to_object`` (documented as a
        dictionary of training statistics).
    """
    request = {'model': self.__proxy__, 'model_name': self.__name__}
    return _map_unity_proxy_to_object(
        _turicreate.toolkits._main.run(
            "supervised_learning_get_train_stats", request))
def classify(self, dataset, missing_value_action='auto'):
    """
    Classify the observations in ``dataset`` with the trained
    supervised_learning model. Predictions are generated as class labels
    (0 or 1).

    Parameters
    ----------
    dataset : SFrame
        Dataset of new observations. Must include columns with the same
        names as the features used for model training, but does not require
        a target column. Additional columns are ignored.

        A ``list`` of rows or a single ``dict`` row is also accepted and is
        routed to the low latency ``_fast_classify`` extension.

    missing_value_action : str, optional
        Action to perform when missing values are encountered. One of:

        - 'auto': Choose model dependent missing value action.
        - 'impute': Proceed with evaluation by filling in the missing
          values with the mean of the training data. Missing values are
          also imputed if an entire column of data is missing during
          evaluation.
        - 'error': Do not proceed with prediction and terminate with an
          error message.

    Returns
    -------
    out : SFrame
        An SFrame with model predictions.
    """
    if missing_value_action == 'auto':
        missing_value_action = select_default_missing_value_policy(
            self, 'classify')

    # Low latency path: in-memory rows skip the batch toolkit call. A lone
    # dict is treated as a one-row list.
    if isinstance(dataset, (list, dict)):
        rows = dataset if isinstance(dataset, list) else [dataset]
        return _turicreate.extensions._fast_classify(
            self.__proxy__, rows, missing_value_action)

    # Batch path: anything else has to be an SFrame.
    _raise_error_if_not_sframe(dataset, "dataset")
    request = {
        'model': self.__proxy__,
        'model_name': self.__name__,
        'dataset': dataset,
        'missing_value_action': missing_value_action,
    }
    response = _turicreate.toolkits._main.run(
        'supervised_learning_classify', request)
    return _map_unity_proxy_to_object(response['classify'])
def _get(self, field):
    """
    Look up the current value of a single model field.

    Parameters
    ----------
    field : string
        Name of the field to be retrieved.

    Returns
    -------
    out : [various]
        The current value of the requested field.
    """
    request = {
        'model': self.__proxy__,
        'model_name': self.__name__,
        'field': field,
    }
    reply = _turicreate.toolkits._main.run(
        'supervised_learning_get_value', request)
    # Only the 'value' entry of the reply is handed back to the caller.
    return _map_unity_proxy_to_object(reply['value'])
def predict_topk(self, dataset, output_type="probability", k=3, missing_value_action='auto'):
    """
    Return the top-k predictions for ``dataset`` using the trained model.

    Predictions are returned as an SFrame with three columns: `id`,
    `class`, and one of `probability`, `margin`, or `rank`, depending on
    the ``output_type`` parameter. Input dataset size must be the same as
    for training of the model.

    Parameters
    ----------
    dataset : SFrame
        A dataset that has the same columns that were used during training.
        If the target column exists in ``dataset`` it will be ignored while
        making predictions.

        A ``list`` of rows or a single ``dict`` row is also accepted and is
        routed to the low latency ``_fast_predict_topk`` extension.

    output_type : {'probability', 'rank', 'margin'}, optional
        Choose the return type of the prediction:

        - `probability`: Probability associated with each label in the
          prediction.
        - `rank`: Rank associated with each label in the prediction.
        - `margin`: Margin associated with each label in the prediction.

    k : int, optional
        Number of classes to return for each input example.

    missing_value_action : str, optional
        Action to perform when missing values are encountered. One of:

        - 'auto': By default the model will treat missing value as is.
        - 'impute': Proceed with evaluation by filling in the missing
          values with the mean of the training data. Missing values are
          also imputed if an entire column of data is missing during
          evaluation.
        - 'error': Do not proceed with evaluation and terminate with an
          error message.

    Returns
    -------
    out : SFrame
        An SFrame with model predictions.

    See Also
    --------
    predict, classify, evaluate

    Examples
    --------
    >>> pred = m.predict_topk(validation_data, k=3)
    >>> pred
    +--------+-------+-------------------+
    |   id   | class |    probability    |
    +--------+-------+-------------------+
    |   0    |   4   |   0.995623886585  |
    |   0    |   9   |  0.0038311756216  |
    |   0    |   7   | 0.000301006948575 |
    |   1    |   1   |   0.928708016872  |
    |   1    |   3   |  0.0440889261663  |
    |   1    |   2   |  0.0176190119237  |
    |   2    |   3   |   0.996967732906  |
    |   2    |   2   |  0.00151345680933 |
    |   2    |   7   | 0.000637513934635 |
    |   3    |   1   |   0.998070061207  |
    |  ...   |  ...  |        ...        |
    +--------+-------+-------------------+
    [35688 rows x 3 columns]
    """
    _check_categorical_option_type(
        'output_type', output_type, ['rank', 'margin', 'probability'])

    # 'auto' is resolved with the policy registered for 'predict'.
    if missing_value_action == 'auto':
        missing_value_action = _sl.select_default_missing_value_policy(
            self, 'predict')

    # Low latency path: in-memory rows skip the batch toolkit call. A lone
    # dict is treated as a one-row list.
    if isinstance(dataset, (list, dict)):
        rows = dataset if isinstance(dataset, list) else [dataset]
        return _turicreate.extensions._fast_predict_topk(
            self.__proxy__, rows, output_type, missing_value_action, k)

    # Batch path.
    request = {
        'model': self.__proxy__,
        'model_name': self.__name__,
        'dataset': dataset,
        'output_type': output_type,
        'topk': k,
        'missing_value_action': missing_value_action,
    }
    response = _turicreate.toolkits._main.run(
        'supervised_learning_predict_topk', request)
    return _map_unity_proxy_to_object(response['predicted'])
def predict(self, dataset, missing_value_action='auto', output_type='', options={}, **kwargs):
    """
    Predict with the trained supervised_learning model on ``dataset``.
    Predictions are generated as class labels (0 or 1).

    Parameters
    ----------
    dataset : SFrame
        Dataset of new observations. Must include columns with the same
        names as the features used for model training, but does not require
        a target column. Additional columns are ignored.

        A ``list`` of rows or a single ``dict`` row is also accepted and is
        routed to the low latency ``_fast_predict`` extension; ``options``
        and ``kwargs`` are not used on that path.

    missing_value_action : str, optional
        Action to perform when missing values are encountered. One of:

        - 'auto': Choose a model dependent missing value policy.
        - 'impute': Proceed with evaluation by filling in the missing
          values with the mean of the training data. Missing values are
          also imputed if an entire column of data is missing during
          evaluation.
        - 'none': Treat missing value as is. Model must be able to handle
          missing value.
        - 'error': Do not proceed with prediction and terminate with an
          error message.

    output_type : str, optional
        Output type that may be needed by some of the toolkits.

    options : dict
        Additional options to be passed in to prediction.

    kwargs : dict
        Additional options to be passed into prediction.

    Returns
    -------
    out : SArray
        An SArray with model predictions.
    """
    if missing_value_action == 'auto':
        missing_value_action = select_default_missing_value_policy(
            self, 'predict')

    # Low latency path: in-memory rows skip the batch toolkit call. A lone
    # dict is treated as a one-row list.
    if isinstance(dataset, (list, dict)):
        rows = dataset if isinstance(dataset, list) else [dataset]
        return _turicreate.extensions._fast_predict(
            self.__proxy__, rows, output_type, missing_value_action)

    # Batch predictions path: anything else has to be an SFrame.
    _raise_error_if_not_sframe(dataset, "dataset")

    # Build the request on a copy, so neither the caller's dict nor the
    # shared default ``{}`` is ever mutated. Precedence (lowest to highest):
    # ``options``, then ``kwargs``, then the fields required by the toolkit.
    request = options.copy()
    request.update(kwargs)
    required_fields = {
        'model': self.__proxy__,
        'model_name': self.__name__,
        'dataset': dataset,
        'missing_value_action': missing_value_action,
        'output_type': output_type,
    }
    request.update(required_fields)

    response = _turicreate.toolkits._main.run(
        'supervised_learning_predict', request)
    return _map_unity_proxy_to_object(response['predicted'])
def extract_features(self, dataset, missing_value_action='auto'):
    """
    For each example in the dataset, extract the leaf indices of each tree
    as features.

    For multiclass classification, each leaf index contains #num_class
    numbers.

    The returned feature vectors can be used as input to train another
    supervised learning model such as a
    :py:class:`~turicreate.logistic_classifier.LogisticClassifier` or an
    :py:class:`~turicreate.svm_classifier.SVMClassifier`.

    Parameters
    ----------
    dataset : SFrame
        Dataset of new observations. Must include columns with the same
        names as the features used for model training, but does not require
        a target column. Additional columns are ignored.

    missing_value_action : str, optional
        Action to perform when missing values are encountered. One of:

        - 'auto': Choose a model dependent missing value policy.
        - 'impute': Proceed with evaluation by filling in the missing
          values with the mean of the training data. Missing values are
          also imputed if an entire column of data is missing during
          evaluation.
        - 'none': Treat missing value as is. Model must be able to handle
          missing value.
        - 'error': Do not proceed with prediction and terminate with an
          error message.

    Returns
    -------
    out : SArray
        An SArray of dtype array.array containing extracted features.

    Examples
    --------
    >>> data = turicreate.SFrame(
    ...     'https://static.turi.com/datasets/regression/houses.csv')

    >>> # Regression Tree Models
    >>> data['regression_tree_features'] = model.extract_features(data)

    >>> # Classification Tree Models
    >>> data['classification_tree_features'] = model.extract_features(data)
    """
    # Input is validated before the 'auto' policy is resolved, so a
    # non-SFrame argument is rejected first.
    _raise_error_if_not_sframe(dataset, "dataset")
    if missing_value_action == 'auto':
        missing_value_action = select_default_missing_value_policy(
            self, 'extract_features')

    options = {
        'model': self.__proxy__,
        'model_name': self.__name__,
        'missing_value_action': missing_value_action,
        'dataset': dataset,
    }
    target = _toolkits_main.run(
        'supervised_learning_feature_extraction', options)
    return _map_unity_proxy_to_object(target['extracted'])
def get_topics(self, topic_ids=None, num_words=5, cdf_cutoff=1.0, output_type='topic_probabilities'):
    """
    Get the words associated with a given topic. The score column is the
    probability of choosing that word given that you have chosen a
    particular topic.

    Parameters
    ----------
    topic_ids : list of int, optional
        The topics to retrieve words. Topic ids are zero-based.
        Throws an error if greater than or equal to m['num_topics'], or
        if the requested topic name is not present. When omitted, all
        topics are returned.

    num_words : int, optional
        The number of words to show.

    cdf_cutoff : float, optional
        Allows one to only show the most probable words whose cumulative
        probability is below this cutoff. For example if there exist
        three words where

        .. math::
           p(word_1 | topic_k) = .1

           p(word_2 | topic_k) = .2

           p(word_3 | topic_k) = .05

        then setting :math:`cdf_{cutoff}=.3` would return only
        :math:`word_1` and :math:`word_2` since
        :math:`p(word_1 | topic_k) + p(word_2 | topic_k) <= cdf_{cutoff}`

    output_type : {'topic_probabilities' | 'topic_words'}, optional
        Determine the type of desired output. See below.

    Returns
    -------
    out : SFrame
        If output_type is 'topic_probabilities', then the returned value is
        an SFrame with a column of words ranked by a column of scores for
        each topic. Otherwise, the returned value is an SFrame with a
        single column ``words``, ordered by topic, where each element is a
        list of that topic's words sorted from highest to lowest score.

    Raises
    ------
    AssertionError
        If ``topic_ids`` is given and is not a list.
    ValueError
        If ``topic_ids`` contains a string, or a value that is negative or
        not less than the number of topics of the model.

    Examples
    --------
    Get the highest ranked words for all topics.

    >>> docs = turicreate.SArray('https://static.turi.com/datasets/nips-text')
    >>> m = turicreate.topic_model.create(docs,
    ...                                   num_iterations=50)
    >>> m.get_topics()
    +-------+----------+-----------------+
    | topic |   word   |      score      |
    +-------+----------+-----------------+
    |   0   |   cell   |  0.028974400831 |
    |   0   |  input   | 0.0259470208503 |
    |   0   |  image   | 0.0215721599763 |
    |   0   |  visual  | 0.0173635081992 |
    |   0   |  object  | 0.0172447874156 |
    |   1   | function | 0.0482834508265 |
    |   1   |  input   | 0.0456270024091 |
    |   1   |  point   | 0.0302662839454 |
    |   1   |  result  | 0.0239474934631 |
    |   1   | problem  | 0.0231750116011 |
    |  ...  |   ...    |       ...       |
    +-------+----------+-----------------+

    Get the highest ranked words for topics 0 and 1 and show 15 words per
    topic.

    >>> m.get_topics([0, 1], num_words=15)
    +-------+----------+------------------+
    | topic |   word   |      score       |
    +-------+----------+------------------+
    |   0   |   cell   |  0.028974400831  |
    |   0   |  input   | 0.0259470208503  |
    |   0   |  image   | 0.0215721599763  |
    |   0   |  visual  | 0.0173635081992  |
    |   0   |  object  | 0.0172447874156  |
    |   0   | response | 0.0139740298286  |
    |   0   |  layer   | 0.0122585145062  |
    |   0   | features | 0.0115343177265  |
    |   0   | feature  | 0.0103530459301  |
    |   0   | spatial  | 0.00823387994361 |
    |  ...  |   ...    |       ...        |
    +-------+----------+------------------+

    If one wants to instead just get the top words per topic, one may
    change the format of the output as follows.

    >>> topics = m.get_topics(output_type='topic_words')
    >>> topics['words']
    dtype: list
    Rows: 10
    [['cell', 'image', 'input', 'object', 'visual'],
     ['algorithm', 'data', 'learning', 'method', 'set'],
     ['function', 'input', 'point', 'problem', 'result'],
     ['model', 'output', 'pattern', 'set', 'unit'],
     ['action', 'learning', 'net', 'problem', 'system'],
     ['error', 'function', 'network', 'parameter', 'weight'],
     ['information', 'level', 'neural', 'threshold', 'weight'],
     ['control', 'field', 'model', 'network', 'neuron'],
     ['hidden', 'layer', 'system', 'training', 'vector'],
     ['component', 'distribution', 'local', 'model', 'optimal']]
    """
    _check_categorical_option_type(
        'output_type', output_type, ['topic_probabilities', 'topic_words'])

    if topic_ids is None:
        topic_ids = list(range(self._get('num_topics')))

    # Raised explicitly instead of through an ``assert`` statement so the
    # check is not stripped when Python runs with -O. The exception type
    # and message are unchanged, so existing callers behave the same.
    if not isinstance(topic_ids, list):
        raise AssertionError("The provided topic_ids is not a list.")

    if any(isinstance(x, str) for x in topic_ids):
        raise ValueError(
            "Only integer topic_ids can be used at this point in time.")

    # Read the topic count once rather than once per requested id.
    num_topics = self.num_topics
    if not all(0 <= x < num_topics for x in topic_ids):
        raise ValueError(
            "Topic id values must be non-negative and less than the "
            "number of topics used to fit the model.")

    opts = {
        'model': self.__proxy__,
        'topic_ids': topic_ids,
        'num_words': num_words,
        'cdf_cutoff': cdf_cutoff,
    }
    response = _turicreate.toolkits._main.run('text_topicmodel_get_topic', opts)
    ret = _map_unity_proxy_to_object(response['top_words'])

    # ``output_type`` was validated above, so anything other than
    # 'topic_probabilities' is 'topic_words'.
    if output_type == 'topic_probabilities':
        return ret

    def sort_wordlist_by_prob(z):
        # ``z`` maps word -> score for one topic; order words by descending
        # score and drop the scores.
        ranked = sorted(z.items(), key=_operator.itemgetter(1), reverse=True)
        return [word for (word, _) in ranked]

    # Collapse the per-word rows into one word->score dict per topic, then
    # turn each dict into a ranked word list.
    grouped = ret.groupby(
        'topic', {'word': _turicreate.aggregate.CONCAT('word', 'score')})
    words = grouped.sort('topic')['word'].apply(sort_wordlist_by_prob)
    return _SFrame({'words': words})