def create(dataset, transformers):
    """
    Create a Transformer object to transform data for feature engineering.

    A single transformer is used as given; a list of transformers is first
    wrapped in a ``TransformerChain``. In both cases the resulting object is
    fitted on ``dataset`` before it is returned.

    Parameters
    ----------
    dataset : SFrame
        The dataset to use for training the model.

    transformers : Transformer | list[Transformer]
        A Transformer or a list of Transformers. List subclasses are treated
        as lists.

    Returns
    -------
    out : Transformer
        The fitted transformer (a ``TransformerChain`` when a list was
        supplied).

    Raises
    ------
    TypeError
        If ``transformers`` is neither a list nor an instance of
        ``TransformerBase``.

    See Also
    --------
    graphlab.toolkits.feature_engineering._feature_engineering._TransformerBase

    Examples
    --------

    .. sourcecode:: python

        # Create data.
        >>> sf = graphlab.SFrame({'a': [1,2,3], 'b' : [2,3,4]})

        >>> from graphlab.toolkits.feature_engineering import (
        ...     FeatureHasher, QuadraticFeatures, OneHotEncoder)

        # Create a single transformer.
        >>> encoder = graphlab.feature_engineering.create(sf,
        ...                 OneHotEncoder(max_categories = 10))

        # Create a chain of transformers.
        >>> chain = graphlab.feature_engineering.create(sf, [
        ...                 QuadraticFeatures(),
        ...                 FeatureHasher()
        ...               ])

        # Create a chain of transformers with names for each of the steps.
        >>> chain = graphlab.feature_engineering.create(sf, [
        ...                 ('quadratic', QuadraticFeatures()),
        ...                 ('hasher', FeatureHasher())
        ...               ])
    """
    err_msg = "The parameters 'transformers' must be a valid Transformer object."
    _raise_error_if_not_sframe(dataset, "dataset")

    if isinstance(transformers, list):
        # List of transformers: wrap the steps in a single chain object.
        transformers = TransformerChain(transformers)
    elif not isinstance(transformers, TransformerBase):
        # Anything else has to already be a transformer.
        raise TypeError(err_msg)

    # Fit and return
    transformers.fit(dataset)
    return transformers
def fit(self, data):
    """
    Fit this transformer to the SFrame `data`.

    The work is delegated to the ``fit`` method of the object stored in
    ``self.__proxy__``.

    Parameters
    ----------
    data : SFrame
        The data used to fit the transformer.

    Returns
    -------
    self (A fitted version of the object)

    See Also
    --------
    transform, fit_transform

    Examples
    --------

    .. sourcecode:: python

        {examples}
    """
    _raise_error_if_not_sframe(data, "data")

    # Report the call to the metric tracker as "<module of this class>.fit".
    tracker = _mt._get_metric_tracker()
    tracker.track(self.__class__.__module__ + '.fit')

    proxy = self.__proxy__
    proxy.fit(data)
    return self
def transform(self, data):
    """
    Apply a fitted transformer to the SFrame `data`.

    The work is delegated to the ``transform`` method of the object stored
    in ``self.__proxy__``.

    Parameters
    ----------
    data : SFrame
        The data to be transformed.

    Returns
    -------
    A transformed SFrame.

    See Also
    --------
    fit, fit_transform

    Examples
    --------

    .. sourcecode:: python

        {examples}
    """
    _raise_error_if_not_sframe(data, "data")

    # Report the call to the metric tracker as
    # "<module of this class>.transform".
    tracker = _mt._get_metric_tracker()
    tracker.track(self.__class__.__module__ + '.transform')

    transformed = self.__proxy__.transform(data)
    return transformed
def fit(self, data):
    """
    Fit this transformer to the SFrame `data`.

    The work is delegated to the ``fit`` method of the object stored in
    ``self.__proxy__``.

    Parameters
    ----------
    data : SFrame
        The data used to fit the transformer.

    Returns
    -------
    self (A fitted version of the object)

    See Also
    --------
    transform
    fit_transform

    Examples
    --------

    .. sourcecode:: python

        {examples}
    """
    _raise_error_if_not_sframe(data, "data")

    # Report the call to the metric tracker as "<module of this class>.fit".
    event_tracker = _mt._get_metric_tracker()
    event_tracker.track(self.__class__.__module__ + '.fit')

    fitter = self.__proxy__
    fitter.fit(data)
    return self
def fit_transform(self, data):
    """
    Fit this transformer to the SFrame `data`, then return a transformed
    version of that same `data`.

    The work is delegated to the ``fit_transform`` method of the object
    stored in ``self.__proxy__``.

    Parameters
    ----------
    data : SFrame
        The data used to fit the transformer. The same data is then also
        transformed.

    Returns
    -------
    Transformed SFrame.

    See Also
    --------
    fit, transform

    Notes
    ------
    - Fit transform modifies self.

    Examples
    --------

    .. sourcecode:: python

        {examples}
    """
    _raise_error_if_not_sframe(data, "data")

    # Report the call to the metric tracker as
    # "<module of this class>.fit_transform".
    tracker = _mt._get_metric_tracker()
    tracker.track(self.__class__.__module__ + '.fit_transform')

    transformed = self.__proxy__.fit_transform(data)
    return transformed
def transform(self, data):
    """
    Apply a fitted transformer to the SFrame `data`.

    The work is delegated to the ``transform`` method of the object stored
    in ``self.__proxy__``.

    Parameters
    ----------
    data : SFrame
        The data to be transformed.

    Returns
    -------
    A transformed SFrame.

    See Also
    --------
    fit
    fit_transform

    Examples
    --------

    .. sourcecode:: python

        {examples}
    """
    _raise_error_if_not_sframe(data, "data")

    # Report the call to the metric tracker as
    # "<module of this class>.transform".
    event_tracker = _mt._get_metric_tracker()
    event_tracker.track(self.__class__.__module__ + '.transform')

    result = self.__proxy__.transform(data)
    return result
def fit_transform(self, data):
    """
    Fit this transformer to the SFrame `data`, then return a transformed
    version of that same `data`.

    The work is delegated to the ``fit_transform`` method of the object
    stored in ``self.__proxy__``.

    Parameters
    ----------
    data : SFrame
        The data used to fit the transformer. The same data is then also
        transformed.

    Returns
    -------
    Transformed SFrame.

    See Also
    --------
    fit
    transform

    Notes
    ------
    - Fit transform modifies self.

    Examples
    --------

    .. sourcecode:: python

        {examples}
    """
    _raise_error_if_not_sframe(data, "data")

    # Report the call to the metric tracker as
    # "<module of this class>.fit_transform".
    event_tracker = _mt._get_metric_tracker()
    event_tracker.track(self.__class__.__module__ + '.fit_transform')

    result = self.__proxy__.fit_transform(data)
    return result
def create_regression_with_model_selector(dataset, target, model_selector,
    features = None, validation_set='auto', verbose = True):
    """
    Create a :class:`~graphlab.toolkits.SupervisedLearningModel`,

    This is generic function that allows you to create any model that
    implements SupervisedLearningModel. This function is normally not called,
    call specific model's create function instead.

    The feature columns (down-sampled with a fixed seed when there are more
    than 1e5 rows) are handed to ``model_selector``; the model name it returns
    is then passed to ``create_selected`` together with the full ``dataset``.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    target : string
        Name of the column containing the target variable. This function
        does not itself check the values of that column.

    model_selector: function
        Provide a model selector. It is called with an SFrame holding only
        the feature columns and must return the name of the model to create.

    features : list[string], optional
        List of feature names used by feature column. When None, every
        column of ``dataset`` except ``target`` is used.

    validation_set : optional
        Passed through unchanged to ``create_selected``.

    verbose : boolean
        whether print out messages during training

    Returns
    -------
    out
        Whatever ``create_selected`` returns for the selected model name.

    Raises
    ------
    TypeError
        If ``features`` is a string or not iterable, or if any feature name
        is not of type str.
    """

    # Error checking
    _raise_error_if_not_sframe(dataset, "training dataset")

    if features is None:
        # Default to every column except the target; build a new list rather
        # than editing the one returned by column_names() in place.
        features = [name for name in dataset.column_names() if name != target]

    # A bare string exposes __iter__ on Python 3 and would otherwise be
    # accepted as an iterable of one-character "feature names".
    if isinstance(features, str) or not hasattr(features, '__iter__'):
        raise TypeError("Input 'features' must be a list.")

    for name in features:
        if not isinstance(name, str):
            # Name the offending entry itself; the 1-tuple keeps the format
            # operator from unpacking a tuple-valued entry.
            raise TypeError(
                "Invalid feature %s: Feature names must be of type str"
                % (name,))

    # Sample the data
    features_sframe = _toolkits_select_columns(dataset, features)
    if features_sframe.num_rows() > 1e5:
        fraction = 1.0 * 1e5 / features_sframe.num_rows()
        features_sframe = features_sframe.sample(fraction, seed = 0)

    # Run the model selector.
    selected_model_name = model_selector(features_sframe)
    model = create_selected(selected_model_name, dataset, target, features,
                            validation_set, verbose)
    return model
def create_regression_with_model_selector(dataset, target, model_selector,
    features = None, validation_set=None, verbose = True):
    """
    Create a :class:`~graphlab.toolkits.SupervisedLearningModel`,

    This is generic function that allows you to create any model that
    implements SupervisedLearningModel. This function is normally not called,
    call specific model's create function instead.

    The feature columns (down-sampled with a fixed seed when there are more
    than 1e5 rows) are handed to ``model_selector``; the model name it returns
    is then passed to ``create_selected`` together with the full ``dataset``.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    target : string
        Name of the column containing the target variable. This function
        does not itself check the values of that column.

    model_selector: function
        Provide a model selector. It is called with an SFrame holding only
        the feature columns and must return the name of the model to create.

    features : list[string], optional
        List of feature names used by feature column. When None, every
        column of ``dataset`` except ``target`` is used.

    validation_set : optional
        Passed through unchanged to ``create_selected``.

    verbose : boolean
        whether print out messages during training

    Returns
    -------
    out
        Whatever ``create_selected`` returns for the selected model name.

    Raises
    ------
    TypeError
        If ``features`` is a string or not iterable, or if any feature name
        is not of type str.
    """

    # Error checking
    _raise_error_if_not_sframe(dataset, "training dataset")

    if features is None:
        # Default to every column except the target; build a new list rather
        # than editing the one returned by column_names() in place.
        features = [name for name in dataset.column_names() if name != target]

    # A bare string exposes __iter__ on Python 3 and would otherwise be
    # accepted as an iterable of one-character "feature names".
    if isinstance(features, str) or not hasattr(features, '__iter__'):
        raise TypeError("Input 'features' must be a list.")

    for name in features:
        if not isinstance(name, str):
            # Name the offending entry itself; the 1-tuple keeps the format
            # operator from unpacking a tuple-valued entry.
            raise TypeError(
                "Invalid feature %s: Feature names must be of type str"
                % (name,))

    # Sample the data
    features_sframe = _toolkits_select_columns(dataset, features)
    if features_sframe.num_rows() > 1e5:
        fraction = 1.0 * 1e5 / features_sframe.num_rows()
        features_sframe = features_sframe.sample(fraction, seed = 0)

    # Run the model selector.
    selected_model_name = model_selector(features_sframe)
    model = create_selected(selected_model_name, dataset, target, features,
                            validation_set, verbose)
    return model
def extract_features(self, dataset):
    """
    Use the mined patterns to convert itemsets to binary vectors.

    For each itemset in ``dataset``, extract_features returns a vector of
    binary indicator variables, marking which mined patterns contain the
    itemset.

    Parameters
    ----------
    dataset : SFrame
        A dataset that has the same columns that were used during training.
        If the item column exists in ``dataset`` it will be ignored
        while making predictions.

    Returns
    -------
    out : SFrame
        An SFrame of extracted features. The SFrame contains a row for each
        unique transaction in ``dataset``. Each row of the SFrame consists of
        the 'features' and

        * extracted_features - an array.array of binary indicator variables

    See Also
    --------
    predict

    Examples
    --------
    .. sourcecode:: python

        >>> features = model.extract_features(bakery_sf)
        >>> features
        Data:
        +---------+-------------------------------+
        | Receipt | extracted_features            |
        +---------+-------------------------------+
        | 21855   | [0.0, 1.0, 0.0, 0.0, 0.0, ... |
        | 63664   | [0.0, 0.0, 0.0, 0.0, 0.0, ... |
        | 7899    | [0.0, 0.0, 0.0, 0.0, 0.0, ... |
        | 25263   | [0.0, 0.0, 0.0, 0.0, 0.0, ... |
        | 30621   | [0.0, 0.0, 0.0, 0.0, 0.0, ... |
        | 43116   | [0.0, 0.0, 0.0, 1.0, 0.0, ... |
        | 27112   | [0.0, 0.0, 0.0, 0.0, 1.0, ... |
        | 26319   | [0.0, 1.0, 0.0, 0.0, 0.0, ... |
        | 26439   | [0.0, 0.0, 0.0, 0.0, 0.0, ... |
        | 62361   | [0.0, 0.0, 0.0, 0.0, 0.0, ... |
        +---------+-------------------------------+
        [75000 rows x 2 columns]
    """
    # The usage event is recorded before the input is validated.
    tracker = _mt._get_metric_tracker()
    tracker.track('toolkits.frequent_pattern_mining.extract_features')

    _raise_error_if_not_sframe(dataset, "dataset")

    extracted = self.__proxy__.extract_features(dataset)
    return extracted
def evaluate(self, dataset, metric="auto", missing_value_action='auto',
             options={}, **kwargs):
    """
    Evaluate the model by making predictions of target values and comparing
    these to actual values.

    Parameters
    ----------
    dataset : SFrame
        Dataset in the same format used for training. The columns names and
        types of the dataset must be the same as that used in training.

    metric : str, list[str]
        Evaluation metric(s) to be computed.

    missing_value_action: str, optional
        Action to perform when missing values are encountered. This can be
        one of:

        - 'auto': Choose a model dependent missing value policy.
        - 'impute': Proceed with evaluation by filling in the missing
                    values with the mean of the training data. Missing
                    values are also imputed if an entire column of data is
                    missing during evaluation.
        - 'none': Treat missing value as is. Model must be able to handle
                  missing value.
        - 'error' : Do not proceed with prediction and terminate with
                    an error message.

    options : dict
        additional options to be passed in to prediction

    kwargs : dict
        additional options to be passed into prediction

    Returns
    -------
    out
        The result of the 'supervised_learning_evaluate' toolkit call after
        it has been passed through ``_map_unity_proxy_to_object``.
    """
    if missing_value_action == 'auto':
        missing_value_action = select_default_missing_value_policy(
            self, 'evaluate')

    _raise_error_if_not_sframe(dataset, "dataset")

    # Work on a copy: neither the caller's dict nor the shared default
    # argument is modified. Keyword arguments override `options`, and the
    # entries set by this method override both.
    run_options = options.copy()
    run_options.update(kwargs)
    required = {
        'model': self.__proxy__,
        'dataset': dataset,
        'model_name': self.__name__,
        'missing_value_action': missing_value_action,
        'metric': metric,
    }
    run_options.update(required)

    results = _graphlab.toolkits._main.run(
        'supervised_learning_evaluate', run_options)
    return _map_unity_proxy_to_object(results)
def classify(self, dataset, missing_value_action='auto'):
    """
    Return predictions for ``dataset``, using the trained supervised_learning
    model. Predictions are generated as class labels (0 or 1).

    A list or a single dict given as ``dataset`` is routed to
    ``_fast_classify`` (the low latency path); a dict is wrapped in a
    one-element list first. Any other input goes through the
    'supervised_learning_classify' toolkit call.

    Parameters
    ----------
    dataset: SFrame
        Dataset of new observations. Must include columns with the same
        names as the features used for model training, but does not require
        a target column. Additional columns are ignored.

    missing_value_action: str, optional
        Action to perform when missing values are encountered. This can be
        one of:

        - 'auto': Choose model dependent missing value action
        - 'impute': Proceed with evaluation by filling in the missing
                    values with the mean of the training data. Missing
                    values are also imputed if an entire column of data is
                    missing during evaluation.
        - 'error': Do not proceed with prediction and terminate with
                   an error message.

    Returns
    -------
    out : SFrame
        An SFrame with model predictions.
    """
    if missing_value_action == 'auto':
        missing_value_action = select_default_missing_value_policy(
            self, 'classify')

    # Low latency path
    if isinstance(dataset, (list, dict)):
        rows = dataset if isinstance(dataset, list) else [dataset]
        return _graphlab.extensions._fast_classify(
            self.__proxy__, rows, missing_value_action)

    _raise_error_if_not_sframe(dataset, "dataset")

    options = {
        'model': self.__proxy__,
        'model_name': self.__name__,
        'dataset': dataset,
        'missing_value_action': missing_value_action,
    }
    target = _graphlab.toolkits._main.run(
        'supervised_learning_classify', options)
    return _map_unity_proxy_to_object(target['classify'])
def predict(self, dataset, missing_value_action = 'error',
            output_type='', options = {}, **kwargs):
    """
    Return predictions for ``dataset``, using the trained supervised_learning
    model. Predictions are generated as class labels (0 or 1).

    Parameters
    ----------
    dataset : SFrame
        Dataset of new observations. Must include columns with the same
        names as the features used for model training, but does not require
        a target column. Additional columns are ignored.

    missing_value_action: str, optional
        Action to perform when missing values are encountered. This can be
        one of:

        - 'impute': Proceed with evaluation by filling in the missing
                    values with the mean of the training data. Missing
                    values are also imputed if an entire column of data is
                    missing during evaluation.
        - 'error' : Do not proceed with prediction and terminate with
                    an error message.

    output_type : str, optional
        output type that maybe needed by some of the toolkits

    options : dict
        additional options to be passed in to prediction

    kwargs : dict
        additional options to be passed into prediction

    Returns
    -------
    out : SArray
        An SArray with model predictions.
    """
    _raise_error_if_not_sframe(dataset, "dataset")

    # Work on a copy: neither the caller's dict nor the shared default
    # argument is modified. Keyword arguments override `options`, and the
    # entries set by this method override both.
    run_options = options.copy()
    run_options.update(kwargs)
    required = {
        'model': self.__proxy__,
        'model_name': self.__name__,
        'dataset': dataset,
        'missing_value_action' : missing_value_action,
        'output_type' : output_type,
    }
    run_options.update(required)

    target = _graphlab.toolkits._main.run(
        'supervised_learning_predict', run_options)
    return _map_unity_proxy_to_object(target['predicted'])
def fit(self, data):
    """
    Fits the transformer using the given data.

    The feature columns are resolved with
    ``_internal_utils.get_column_names`` from ``data``, ``self._exclude``
    and ``self._features``, and are stored, together with a 'fitted' flag,
    via ``self.__proxy__.update``.

    Parameters
    ----------
    data : SFrame
        The data used to fit the transformer.

    Returns
    -------
    self

    Raises
    ------
    RuntimeError
        If no feature columns are resolved.
    """
    _raise_error_if_not_sframe(data, "data")

    columns = _internal_utils.get_column_names(
        data, self._exclude, self._features)
    if not columns:
        raise RuntimeError(
            "No valid feature columns specified in transformation.")

    self.__proxy__.update({'features': columns, 'fitted': True})
    return self
def create(dataset, target, features=None, distance=None, verbose=True):
    """
    Create a
    :class:`~graphlab.nearest_neighbor_classifier.NearestNeighborClassifier`.
    This model predicts the class of a query point by finding the most common
    class among the query's nearest neighbors.

    .. warning::

        The 'dot_product' distance is deprecated and will be removed in future
        versions of GraphLab Create. Please use 'transformed_dot_product'
        distance instead, although note that this is more than a name change;
        it is a *different* transformation of the dot product of two vectors.
        Please see the distances module documentation for more details.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    target : string
        Name of the column containing the target variable. The values in this
        column must be of string or integer type.

    features : list[string], optional
        Name of the columns with features to use in comparing records. 'None'
        (the default) indicates that all columns except the target variable
        should be used. Each column can be one of the following types:

        - *Numeric*: values of numeric type integer or float.

        - *Array*: array of numeric (integer or float) values. Each array
          element is treated as a separate variable in the model.

        - *Dictionary*: key-value pairs with numeric (integer or float) values.
          Each key indicates a separate variable in the model.

        - *String*: string values.

        Please note: if `distance` is specified as a composite distance, then
        that parameter controls which features are used in the model.

    distance : string, function, or list[list], optional
        Function to measure the distance between any two input data rows. This
        may be one of two types:

        - *String*: the name of a standard distance function. One of
          'euclidean', 'squared_euclidean', 'manhattan', 'levenshtein',
          'jaccard', 'weighted_jaccard', 'cosine', 'dot_product' (deprecated),
          or 'transformed_dot_product'. Please see the
          :mod:`~graphlab.toolkits.distances` module for more details.

        - *Function*: a function handle from the
          :mod:`~graphlab.toolkits.distances` module. Please see the
          documentation for that module for specific distance functions.

        - *Composite distance*: the weighted sum of several standard distance
          functions applied to various features. This is specified as a list of
          distance components, each of which is itself a list containing three
          items:

          1. list or tuple of feature names (strings)

          2. standard distance name (string)

          3. scaling factor (int or float)

        Note that for sparse vectors, missing keys are assumed to have value
        0.0.

        If distance is left unspecified or set to 'auto', then a composite
        distance is constructed automatically based on feature types.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : NearestNeighborClassifier
        A trained model of type
        :class:`~graphlab.nearest_neighbor_classifier.NearestNeighborClassifier`.

    See Also
    --------
    NearestNeighborClassifier
    graphlab.toolkits.nearest_neighbors
    graphlab.toolkits.distances

    References
    ----------
    - `Wikipedia - nearest neighbors classifier
      <http://en.wikipedia.org/wiki/Nearest_neighbour_classifiers>`_

    - Hastie, T., Tibshirani, R., Friedman, J. (2009). `The Elements of
      Statistical Learning <http://statweb.stanford.edu/~tibs/ElemStatLearn/>`_.
      Vol. 2. New York. Springer. pp. 463-481.

    Examples
    --------
    >>> sf = graphlab.SFrame({'species': ['cat', 'dog', 'fossa', 'dog'],
    ...                       'height': [9, 25, 20, 23],
    ...                       'weight': [13, 28, 33, 22]})
    ...
    >>> model = graphlab.nearest_neighbor_classifier.create(sf, target='species')

    As with the nearest neighbors toolkit, the nearest neighbor classifier
    accepts composite distance functions.

    >>> my_dist = [[('height', 'weight'), 'euclidean', 2.7],
    ...            [('height', 'weight'), 'manhattan', 1.6]]
    ...
    >>> model = graphlab.nearest_neighbor_classifier.create(sf, target='species',
    ...                                                     distance=my_dist)
    """

    ## Set up
    ## ------
    _mt._get_metric_tracker().track(
        'toolkit.classifier.nearest_neighbor_classifier.create')
    start_time = _time.time()

    ## Validation and preprocessing
    ## ----------------------------

    ## 'dataset' must be a non-empty SFrame
    _raise_error_if_not_sframe(dataset, "dataset")
    _raise_error_if_sframe_empty(dataset, "dataset")

    ## 'target' must be a string, in 'dataset', and the type of the target
    #  must be string or integer.
    if not (isinstance(target, str) and target in dataset.column_names()):
        raise _ToolkitError("The 'target' parameter must be the name of a "
                            "column in the input dataset.")

    target_dtype = dataset[target].dtype()
    if not (target_dtype == str or target_dtype == int):
        raise TypeError("The target column must contain integers or strings.")

    ## Warn that 'None' values in the target may lead to ambiguous
    #  predictions.
    if dataset[target].num_missing() > 0:
        _logging.warning(
            "Missing values detected in the target column. This "
            "may lead to ambiguous 'None' predictions, if the "
            "'radius' parameter is set too small in the prediction, "
            "classification, or evaluation methods.")

    ## Convert the features and distance arguments into a composite distance.
    ## NOTE: this is done here instead of in the nearest neighbors toolkit
    #  because the automatic distance construction may be different for the
    #  two toolkits.
    candidate_columns = dataset.column_names() if features is None else features
    _features = [name for name in candidate_columns if name != target]

    if isinstance(distance, list):
        # A composite distance is used as given; copy it so the caller's
        # list is not shared with the model.
        distance = _copy.deepcopy(distance)

    elif hasattr(distance, '__call__'):
        # A distance function: apply it to all selected features.
        distance = [[_features, distance, 1]]

    elif isinstance(distance, str) and not distance == 'auto':
        # A named standard distance: apply it to all selected features.
        distance = [[_features, distance, 1]]

    elif distance is None or distance == 'auto':
        col_types = dict(zip(dataset.column_names(), dataset.column_types()))
        distance = _construct_auto_distance(_features, col_types)

    else:
        raise TypeError("Input 'distance' not understood. The 'distance' "
                        "parameter must be a string or a composite distance, "
                        " or left unspecified.")

    ## Construct and query the nearest neighbors model
    ## -----------------------------------------------
    knn_model = _gl.nearest_neighbors.create(dataset, label=target,
                                             distance=distance,
                                             verbose=verbose)

    ## Postprocessing and formatting
    ## -----------------------------
    model = NearestNeighborClassifier(knn_model)

    # Entries are written one at a time, in this order.
    state_entries = [
        ('verbose', verbose),
        ('distance', knn_model['distance']),
        ('num_distance_components', knn_model['num_distance_components']),
        ('num_examples', dataset.num_rows()),
        ('features', knn_model['features']),
        ('target', target),
        ('num_classes', len(dataset[target].unique())),
        ('num_features', knn_model['num_features']),
        ('num_unpacked_features', knn_model['num_unpacked_features']),
    ]
    for key, value in state_entries:
        model._state[key] = value

    # Measured last so that it covers everything above.
    model._state['training_time'] = _time.time() - start_time

    return model
def predict_topk(self, dataset, max_neighbors=10, radius=None, k=3,
                 verbose=False):
    """
    Return top-k most likely predictions for each observation in
    ``dataset``. Predictions are returned as an SFrame with three columns:
    `row_id`, `class`, and `probability`.

    Parameters
    ----------
    dataset : SFrame
        Dataset of new observations. Must include the features used for
        model training, but does not require a target column. Additional
        columns are ignored.

    max_neighbors : int, optional
        Maximum number of neighbors to consider for each point.

    radius : float, optional
        Maximum distance from each point to a neighbor in the reference
        dataset.

    k : int, optional
        Number of classes to return for each input example.

    verbose : bool, optional
        Passed through to the nearest neighbors model's ``query`` method.

    Returns
    -------
    out : SFrame

    Raises
    ------
    TypeError
        If ``k`` is not an integer greater than 0.

    ValueError
        If ``max_neighbors`` is given and is not an integer larger than 0.

    See Also
    --------
    create, classify, predict

    Notes
    -----
    - If the 'radius' parameter is small, it is possible that a query point
      has no neighbors in the training dataset. In this case, the query is
      dropped from the SFrame output by this method. If all queries have no
      neighbors, then the result is an empty SFrame. If the target column in
      the training dataset has missing values, these predictions will be
      ambiguous.

    - Ties between predicted classes are broken randomly.

    Examples
    --------
    >>> sf_train = graphlab.SFrame({'species': ['cat', 'dog', 'fossa', 'dog'],
    ...                             'height': [9, 25, 20, 23],
    ...                             'weight': [13, 28, 33, 22]})
    ...
    >>> sf_new = graphlab.SFrame({'height': [26, 19],
    ...                           'weight': [25, 35]})
    ...
    >>> m = graphlab.nearest_neighbor_classifier.create(sf_train, target='species')
    >>> ystar = m.predict_topk(sf_new, max_neighbors=2)
    >>> print(ystar)
    +--------+-------+-------------+
    | row_id | class | probability |
    +--------+-------+-------------+
    | 0      | dog   | 1.0         |
    | 1      | fossa | 0.5         |
    | 1      | dog   | 0.5         |
    +--------+-------+-------------+
    """
    _mt._get_metric_tracker().track(
        'toolkit.classifier.nearest_neighbor_classifier.predict_topk')

    ## Validate the number of results to return. Note that the
    #  'max_neighbors' and 'radius' parameters are validated by the nearest
    #  neighbor model's query method.
    k_is_valid = isinstance(k, int) and k >= 1
    if not k_is_valid:
        raise TypeError(
            "The number of results to return for each point, "
            "'k', must be an integer greater than 0.")

    ## Validate the query dataset.
    _raise_error_if_not_sframe(dataset, "dataset")
    _raise_error_if_sframe_empty(dataset, "dataset")

    ## Validate neighborhood parameters 'max_neighbors'.
    # - NOTE: when the parameter name is changed in nearest neighbors, the
    #   query call will do this itself, and this block can be removed.
    if max_neighbors is not None:
        if not isinstance(max_neighbors, int):
            raise ValueError("Input 'max_neighbors' must be an integer.")

        if max_neighbors <= 0:
            raise ValueError(
                "Input 'max_neighbors' must be larger than 0.")

    ## Find the nearest neighbors for each query and count the number of
    #  votes for each class.
    knn = self._knn_model.query(dataset, k=max_neighbors, radius=radius,
                                verbose=verbose)

    ## If there are *no* results for *any* query make an empty SFrame.
    if knn.num_rows() == 0:
        empty = _gl.SFrame({'row_id': [], 'class': [], 'probability': []})
        empty['row_id'] = empty['row_id'].astype(int)
        empty['class'] = empty['class'].astype(str)
        return empty

    ## Find the classes with the top-k vote totals
    vote_counts = knn.groupby(['query_label', 'reference_label'],
                              _gl.aggregate.COUNT)

    ystar = vote_counts.unstack(column=['reference_label', 'Count'],
                                new_column_name='votes')

    votes_column = ystar['votes']
    ystar['topk'] = votes_column.apply(
        lambda votes: _sort_topk_votes(votes, k))
    ystar['total_votes'] = ystar['votes'].apply(
        lambda votes: sum(votes.values()))

    ## Re-stack, unpack, and rename the results
    ystar = ystar.stack('topk', new_column_name='topk')
    ystar = ystar.unpack('topk')
    # The return value of rename() is not used; `ystar` itself is read below
    # under the new column names.
    ystar.rename({'topk.class': 'class', 'query_label': 'row_id'})
    ystar['probability'] = ystar['topk.votes'] / ystar['total_votes']

    return ystar[['row_id', 'class', 'probability']]
def predict_topk(self, dataset, k=5):
    """
    Use the trained model to obtain top-k predictions for the most
    confident rules given a partial set of observations described in the
    ``dataset``.

    Parameters
    ----------
    dataset : SFrame
        A dataset that has the same columns that were used during training.
        If the item column exists in ``dataset`` it will be ignored
        while making predictions.

    k : int, optional
        Number of predictions to return for each input example.

    Returns
    -------
    out : SFrame
        An SFrame with the top scoring association rules for each itemset
        in the dataset. The SFrame contains a row for each unique
        transaction in ``dataset``. Each row of the SFrame consists of the
        'features' and

        * prefix - the 'antecedent' or 'left-hand side' of an association
          rule. It must be a frequent itemset and a subset of the
          associated itemset.
        * prediction - the 'consequent' or 'right-hand side' of the
          association rule. It must be disjoint of the prefix.
        * confidence - the confidence of the association rule defined as:
          ``confidence(prefix => prediction) = Support(prefix U prediction) / Support(prefix)``
        * prefix support - the frequency of the 'prefix' itemset in the
          training data
        * prediction support - the frequency of the 'prediction' itemset in
          the training data
        * joint support - the frequency of the co-occurrence
          ('prefix' + 'prediction') in the training data

        If there does not exist ``k`` valid association rules for an
        itemset, then ``predict_topk`` will return as many valid rules as
        possible.

    See Also
    --------
    get_frequent_patterns, extract_features, predict

    Notes
    -----
    Prediction can be slow when max_patterns is set to a large value because
    there are more rules to consider for predictions.

    Only the "confidence" score function is passed to the underlying proxy
    for now.

    References
    ----------

    - Wikipedia - Association Rule Learning
      <https://en.wikipedia.org/wiki/Association_rule_learning>
    - Han, Jiawei, Micheline Kamber, and Jian Pei. Data mining: concepts
      and techniques: concepts and techniques. Elsevier, 2011.

    Examples
    --------
    .. sourcecode:: python

        # For an SFrame
        >>> predictions = model.predict_topk(bakery_sf, k = 5)
        Columns:
            Receipt int
            prefix  list
            prediction  list
            confidence  float
            prefix support  int
            joint support   int

        Rows: 13283

        Data:
        +---------+-----------------+-------------------------------+-----------------+
        | Receipt | prefix          | prediction                    | confidence      |
        +---------+-----------------+-------------------------------+-----------------+
        | 13      | [CherrySoda]    | [AppleDanish]                 | 0.352077687444  |
        | 13      | [CherrySoda]    | [AppleTart]                   | 0.349593495935  |
        | 13      | [CherrySoda]    | [AppleCroissant]              | 0.349141824752  |
        | 13      | [CherrySoda]    | [AppleCroissant, AppleDanish] | 0.302619692864  |
        | 13      | [CherrySoda]    | [AppleCroissant, AppleTart]   | 0.301942186089  |
        | 42      | [ChocolateTart] | [VanillaFrappuccino]          | 0.461889374644  |
        | 42      | [ChocolateTart] | [WalnutCookie]                | 0.367990876259  |
        | 42      | [ChocolateTart] | [WalnutCookie, VanillaFrap... | 0.323322562251  |
        | 42      | []              | [CoffeeEclair]                | 0.104013695516  |
        | 42      | []              | [HotCoffee]                   | 0.0976340461956 |
        +---------+-----------------+-------------------------------+-----------------+
        +----------------+---------------+
        | prefix support | joint support |
        +----------------+---------------+
        | 4428           | 1559          |
        | 4428           | 1548          |
        | 4428           | 1546          |
        | 4428           | 1340          |
        | 4428           | 1337          |
        | 5261           | 2430          |
        | 5261           | 1936          |
        | 5261           | 1701          |
        | 74769          | 7777          |
        | 74769          | 7300          |
        +----------------+---------------+
        [13283 rows x 7 columns]

        # For a single itemset, e.g. ['HotCoffee', 'VanillaEclair']
        >>> new_itemset = gl.SFrame({'Receipt': [-1, -1],
        ...                          'Item': ['HotCoffee', 'VanillaEclair']})
        >>> model.predict_topk(new_itemset, k = 3)
        Data:
        +---------+-------------+-------------------------------+----------------+----------------+
        | Receipt | prefix      | prediction                    | score          | prefix support |
        +---------+-------------+-------------------------------+----------------+----------------+
        | -1      | [HotCoffee] | [ApricotCroissant]            | 0.344545454545 | 7700           |
        | -1      | [HotCoffee] | [BlueberryTart]               | 0.341298701299 | 7700           |
        | -1      | [HotCoffee] | [BlueberryTart, ApricotCro... | 0.31974025974  | 7700           |
        +---------+-------------+-------------------------------+----------------+----------------+
        +---------------+
        | joint support |
        +---------------+
        | 2653          |
        | 2628          |
        | 2462          |
        +---------------+
        [3 rows x 7 columns]
    """
    # The usage event is recorded before the input is validated.
    tracker = _mt._get_metric_tracker()
    tracker.track('toolkits.frequent_pattern_mining.predict_topk')

    _raise_error_if_not_sframe(dataset, "dataset")

    # For now, we only support confidence
    score_function = "confidence"
    top_rules = self.__proxy__.predict_topk(dataset, score_function, k)
    return top_rules
def extract_features(self, dataset, layer_id=None):
    """
    Takes an input dataset, propagates each example through the network,
    and returns an SArray of dense feature vectors, each of which is the
    concatenation of all the hidden unit values at layer[layer_id]. These
    feature vectors can be used as input to train another classifier such as
    a :py:class:`~graphlab.logistic_classifier.LogisticClassifier`, an
    :py:class:`~graphlab.svm_classifier.SVMClassifier`, another
    :py:class:`~graphlab.neuralnet_classifier.NeuralNetClassifier`, or a
    :py:class:`~graphlab.boosted_trees_classifier.BoostedTreesClassifier`.
    Input dataset size must be the same as for the training of the model,
    except for images which are automatically resized.

    We also are releasing a pre-trained model for ImageNet, as described by
    Alex Krizhevsky et. al. It is located at
    http://s3.amazonaws.com/dato-datasets/deeplearning/imagenet_model_iter45 .
    Using it requires 256 x 256 x 3 images.
    Please see Examples and References for more.

    Parameters
    ----------
    dataset : SFrame
        Dataset of new observations. Must include columns with the same
        names as the features used for model training, but does not require
        a target column. Additional columns are ignored.

    layer_id : int , optional
        The index of the layer in neuralnet at which the activations are
        taken to be a dense feature vector. Must be a fully-connected layer.
        Default is None, in which case the layer before the connection
        layer to the output is used.

    Returns
    -------
    out : SArray
        An SArray of dtype array.array containing extracted features.

    Raises
    ------
    ValueError
        If no layer up to and including ``layer_id`` has type "CONNECTION"
        or "TRANSITION".

    See Also
    --------
    graphlab.deeplearning.layers

    References
    ----------
    - Krizhevsky, Alex, Ilya Sutskever, and Geoffrey E. Hinton. "Imagenet
      classification with deep convolutional neural networks." Advances in
      neural information processing systems. 2012.

    Examples
    --------
    >>> data = graphlab.SFrame('http://s3.amazonaws.com/dato-datasets/mnist/sframe/train6k')
    >>> net = graphlab.deeplearning.get_builtin_neuralnet('mnist')
    >>> m = graphlab.neuralnet_classifier.create(data,
    ...                                          target='label',
    ...                                          network=net,
    ...                                          max_iterations=3)
    >>> # Now, let's extract features from the last layer
    >>> data['features'] = m.extract_features(data)
    >>> # Now, let's build a new classifier on top of extracted features
    >>> m = graphlab.classifier.create(data,
    ...                                features = ['features'],
    ...                                target='label')

    Now, let's see how to load the ImageNet model, and use it for
    extracting features after resizing the data:

    >>> imagenet_model = graphlab.load_model('http://s3.amazonaws.com/dato-datasets/deeplearning/imagenet_model_iter45')
    >>> data['image'] = graphlab.image_analysis.resize(data['image'], 256, 256, 3)
    >>> data['imagenet_features'] = imagenet_model.extract_features(data)
    """
    _mt._get_metric_tracker().track(
        'toolkit.classifier.neuralnet_classifier.extract_features')
    _raise_error_if_not_sframe(dataset, "dataset")

    layers = self.get('network').layers
    last_index = len(layers) - 1

    if layer_id is None:
        # Default layer: one before the last layer when the last layer is a
        # "CONNECTION" layer, otherwise two before it.
        if layers[last_index]._type == "CONNECTION":
            layer_id = last_index - 1
        else:
            layer_id = last_index - 2

    _numeric_param_check_range("layer_id", layer_id, 0, last_index)

    # At least one layer up to and including `layer_id` must be of type
    # "CONNECTION" or "TRANSITION" (see the error text for the user-facing
    # explanation).
    flattened = any(
        layers[index]._type in ("CONNECTION", "TRANSITION")
        for index in range(layer_id + 1))
    if not flattened:
        raise ValueError("Features must be extracted from either a network "
                "with non-image input or a layer after a FlattenLayer. "
                "Try extracting features from layer following a FlattenLayer.")

    options = {
        'model': self.__proxy__,
        'model_name': self.__name__,
        'dataset': dataset,
        'layer_id': layer_id,
    }
    target = _toolkits_main.run('supervised_learning_feature_extraction',
                                options)
    return _map_unity_proxy_to_object(target['extracted'])
def make_sgraph(vertex_sframe, edge_sframe, output_path,
                vid_field, src_field, dst_field,
                num_partitions=8, _distributed='auto'):
    """
    Build an SGraph from vertex and edge SFrames, save it to `output_path`,
    and return the graph loaded back from that path.

    Either input may be ``None`` or empty; it is then replaced by an empty
    placeholder SFrame and the matching field name(s) are switched to the
    placeholder column names.

    Parameters
    ----------
    vertex_sframe : SFrame
        SFrame of vertex data.

    edge_sframe : SFrame
        SFrame of edge data.

    output_path : str
        Path where the final graph is saved to.

    vid_field : str
        Column name of vertex id in the vertex sframe.

    src_field : str
        Column name of source vertex id in the edge sframe.

    dst_field : str
        Column name of target vertex id in the edge sframe.

    num_partitions : int
        Number of partitions for the final sgraph.

    _distributed : optional
        Forwarded as the ``env`` argument of the ingress toolkit call.

    Returns
    -------
    out : SGraph
        The graph loaded from `output_path` after ingress.

    Raises
    ------
    TypeError
        If `vid_field`, `src_field` or `dst_field` is not exactly a ``str``.

    ValueError
        If one of the (possibly substituted) field names is not a column of
        the corresponding SFrame.
    """
    if type(vid_field) is not str:
        raise TypeError('vid_field must be str')
    if type(src_field) is not str:
        raise TypeError('src_field must be str')
    if type(dst_field) is not str:
        raise TypeError('dst_field must be str')

    def _is_dummy(sframe):
        # An input counts as "dummy" when it is missing or has no rows.
        return sframe is None or len(sframe) == 0

    # Infer the vertex id type: prefer the vertex data, then the edge
    # sources, and fall back to int when neither input has rows.
    if not _is_dummy(vertex_sframe):
        vid_type = vertex_sframe[vid_field].dtype()
    elif not _is_dummy(edge_sframe):
        vid_type = edge_sframe[src_field].dtype()
    else:
        vid_type = int

    # Substitute an empty, correctly typed edge SFrame for a dummy input.
    if _is_dummy(edge_sframe):
        src_field, dst_field = '__src_id', '__dst_id'
        edge_sframe = gl.SFrame()
        edge_sframe['__src_id'] = gl.SArray([], vid_type)
        edge_sframe['__dst_id'] = gl.SArray([], vid_type)

    # Likewise for a dummy vertex input.
    if _is_dummy(vertex_sframe):
        vid_field = '__id'
        vertex_sframe = gl.SFrame()
        vertex_sframe['__id'] = gl.SArray([], vid_type)

    _raise_error_if_not_sframe(vertex_sframe, "vertex_data")
    _raise_error_if_not_sframe(edge_sframe, "edge_data")

    if vid_field not in vertex_sframe.column_names():
        raise ValueError('Column %s not found in vertex_data' % vid_field)
    edge_columns = edge_sframe.column_names()
    if src_field not in edge_columns:
        raise ValueError('Column %s not found in edge_data' % src_field)
    if dst_field not in edge_columns:
        raise ValueError('Column %s not found in edge_data' % dst_field)

    output_path = _make_internal_url(output_path)
    ingress_options = {
        'vertex_data': vertex_sframe,
        'edge_data': edge_sframe,
        'output_path': output_path,
        'vid_field': vid_field,
        'src_field': src_field,
        'dst_field': dst_field,
        'num_partitions': num_partitions,
    }
    run('distributed_graph_ingress', ingress_options, env=_distributed)

    # Imported at call time, as in the original implementation.
    from graphlab.data_structures.sgraph import load_sgraph
    return load_sgraph(output_path)
def link(self, dataset, k=5, radius=None, verbose=True):
    """
    Find matching records from the reference dataset (entered when the
    model was created) for each record in the `dataset` passed to this
    function.

    The query dataset must include columns with the same names as the label
    and feature columns used to create the RecordLinker model.

    Parameters
    ----------
    dataset : SFrame
        Query data. Must contain columns with the same names and types as
        the features used to train the model. Additional columns are
        allowed, but ignored. Please see the nearest neighbors
        :func:`~graphlab.nearest_neighbors.create` documentation for more
        detail on allowable data types.

    k : int, optional
        Maximum number of nearest neighbors to return from the reference
        set for each query observation. The default is 5, but setting it to
        ``None`` will return all neighbors within ``radius`` of the query
        point.

    radius : float, optional
        Only neighbors whose distance to a query point is smaller than this
        value are returned. The default is ``None``, in which case the
        ``k`` nearest neighbors are returned for each query point,
        regardless of distance.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : SFrame
        An SFrame with the k-nearest neighbors of each query observation.
        The result contains four columns: the first is the row label of the
        query observation, the second is the row label of the nearby
        reference observation, the third is the distance between the query
        and reference observations, and the fourth is the rank of the
        reference observation among the query's k-nearest neighbors.

    Notes
    -----
    - If both ``k`` and ``radius`` are set to ``None``, each query point
      returns all of the reference set. If the reference dataset has
      :math:`n` rows and the query dataset has :math:`m` rows, the output
      is an SFrame with :math:`nm` rows.

    Examples
    --------
    Assume we've created the model from the example in the RecordLinker
    'create' function.

    >>> queries = graphlab.SFrame({'sqft': [986, 1320],
    ...                            'street': ['fremont', 'phiney'],
    ...                            'city': ['sea', 'seattle'],
    ...                            'state': ['WA', 'WA']})
    ...
    >>> model.link(queries, k=2, radius=5.)
    +-------------+-----------------+----------+------+
    | query_label | reference_label | distance | rank |
    +-------------+-----------------+----------+------+
    |      0      |        0        |   4.0    |  1   |
    |      0      |        2        |   5.0    |  2   |
    |      1      |        0        |   0.0    |  1   |
    +-------------+-----------------+----------+------+
    """
    _mt._get_metric_tracker().track(self.__module__ + '.link_records')

    ## The query data must be a non-empty SFrame.
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    ## Make sure all of the necessary features are present at 'link' time.
    #  The selected columns themselves are not used afterwards.
    feature_names = self.get('features')
    _tkutl._toolkits_select_columns(dataset, feature_names)

    ## Clean and impute string data.
    #  TODO: think about consolidating this and the distance feature
    #  engineering below into a feature transformer.
    type_by_column = dict(zip(dataset.column_names(),
                              dataset.column_types()))

    # Work on copies so neither the caller's SFrame nor the distance stored
    # in the model state is modified.
    query_sf = _copy.copy(dataset)
    distance_spec = _copy.deepcopy(self._state['distance'])

    for name in feature_names:
        if type_by_column[name] != str:
            continue

        cleaned_name = '__clean.' + name
        query_sf[cleaned_name] = query_sf[name].fillna("").apply(
            lambda x: _dmutl.cleanse_string(x), dtype=str)

        # Point every distance component at the cleaned column instead of
        # the raw string column.
        for component in distance_spec:
            component[0] = [cleaned_name if col == name else col
                            for col in component[0]]

    ## Convert strings to dicts and concatenate string features.
    query_sf, _ = _engineer_distance_features(query_sf, distance_spec)

    ## Query the nearest neighbor model.
    return self._knn_model.query(query_sf, k=k, radius=radius,
                                 verbose=verbose)
def create(dataset, features=None, distance=None, radius=1.,
           min_core_neighbors=10, verbose=True):
    """
    Create a DBSCAN clustering model. The DBSCAN method partitions the input
    dataset into three types of points, based on the estimated probability
    density at each point.

    - **Core** points have a large number of points within a given
      neighborhood. Specifically, `min_core_neighbors` must be within
      distance `radius` of a point for it to be considered a core point.

    - **Boundary** points are within distance `radius` of a core point, but
      don't have sufficient neighbors of their own to be considered core.

    - **Noise** points comprise the remainder of the data. These points have
      too few neighbors to be considered core points, and are further than
      distance `radius` from all core points.

    Clusters are formed by connecting core points that are neighbors of each
    other, then assigning boundary points to their nearest core neighbor's
    cluster.

    Parameters
    ----------
    dataset : SFrame
        Training data, with each row corresponding to an observation. Must
        include all features specified in the `features` parameter, but may
        have additional columns as well.

    features : list[str], optional
        Name of the columns with features to use in comparing records. 'None'
        (the default) indicates that all columns of the input `dataset`
        should be used to train the model. All features must be numeric,
        i.e. integer or float types.

    distance : str or list[list], optional
        Function to measure the distance between any two input data rows.
        This may be one of two types:

        - *String*: the name of a standard distance function. One of
          'euclidean', 'squared_euclidean', 'manhattan', 'levenshtein',
          'jaccard', 'weighted_jaccard', 'cosine', 'dot_product'
          (deprecated), or 'transformed_dot_product'.

        - *Composite distance*: the weighted sum of several standard
          distance functions applied to various features. This is specified
          as a list of distance components, each of which is itself a list
          containing three items:

          1. list or tuple of feature names (str)

          2. standard distance name (str)

          3. scaling factor (int or float)

        For more information about GraphLab Create distance functions,
        please see the :py:mod:`~graphlab.toolkits.distances` module.

        For sparse vectors, missing keys are assumed to have value 0.0.

        If 'distance' is left unspecified, a composite distance is
        constructed automatically based on feature types.

    radius : int or float, optional
        Size of each point's neighborhood, with respect to the specified
        distance function.

    min_core_neighbors : int, optional
        Number of neighbors that must be within distance `radius` of a point
        in order for that point to be considered a "core point" of a
        cluster.

    verbose : bool, optional
        If True, print progress updates and model details during model
        creation.

    Returns
    -------
    out : DBSCANModel
        A model containing a cluster label for each row in the input
        `dataset`. Also contains the indices of the core points, cluster
        boundary points, and noise points. The 'cluster_id' entry of the
        model state is an SFrame with columns 'row_id', 'cluster_id' and
        'type'; noise rows are given a cluster id of None.

    Raises
    ------
    ValueError
        If `min_core_neighbors` is not a non-negative int, or `radius` is
        not a non-negative int or float.

    See Also
    --------
    DBSCANModel, graphlab.toolkits.distances

    Notes
    -----
    - Our implementation of DBSCAN first computes the similarity graph on
      the input dataset, which can be a computationally intensive process.
      In the current implementation, some distances are substantially
      faster than others; in particular "euclidean", "squared_euclidean",
      "cosine", and "transformed_dot_product" are quite fast, while
      composite distances can be slow.

    - Any distance function in the GL Create library may be used with
      DBSCAN but the results may be poor for distances that violate the
      standard metric properties, i.e. symmetry, non-negativity, triangle
      inequality, and identity of indiscernibles. In particular, the DBSCAN
      algorithm is based on the concept of connecting high-density points
      that are *close* to each other into a single cluster, but the notion
      of *close* may be very counterintuitive if the chosen distance
      function is not a valid metric. The distances "euclidean",
      "manhattan", "jaccard", and "levenshtein" will likely yield the best
      results.

    References
    ----------
    - Ester, M., et al. (1996) `A Density-Based Algorithm for Discovering
      Clusters in Large Spatial Databases with Noise
      <http://www.aaai.org/Papers/KDD/1996/KDD96-037>`_. In Proceedings of
      the Second International Conference on Knowledge Discovery and Data
      Mining. pp. 226-231.

    - `Wikipedia - DBSCAN <https://en.wikipedia.org/wiki/DBSCAN>`_

    - `Visualizing DBSCAN Clustering
      <http://www.naftaliharris.com/blog/visualizing-dbscan-clustering/>`_

    Examples
    --------
    >>> sf = graphlab.SFrame({
    ...     'x1': [0.6777, -9.391, 7.0385, 2.2657, 7.7864, -10.16, -8.162,
    ...            8.8817, -9.525, -9.153, 2.0860, 7.6619, 6.5511, 2.7020],
    ...     'x2': [5.6110, 8.5139, 5.3913, 5.4743, 8.3606, 7.8843, 2.7305,
    ...            5.1679, 6.7231, 3.7051, 1.7682, 7.4608, 3.1270, 6.5624]})
    ...
    >>> model = graphlab.dbscan.create(sf, radius=4.25, min_core_neighbors=3)
    >>> model['cluster_id'].print_rows(15)
    +--------+------------+----------+
    | row_id | cluster_id |   type   |
    +--------+------------+----------+
    |   8    |     0      |   core   |
    |   7    |     2      |   core   |
    |   0    |     1      |   core   |
    |   2    |     2      |   core   |
    |   3    |     1      |   core   |
    |   11   |     2      |   core   |
    |   4    |     2      |   core   |
    |   1    |     0      | boundary |
    |   6    |     0      | boundary |
    |   5    |     0      | boundary |
    |   9    |     0      | boundary |
    |   12   |     2      | boundary |
    |   10   |     1      | boundary |
    |   13   |     1      | boundary |
    +--------+------------+----------+
    [14 rows x 3 columns]
    """

    ## Record the call with the metric tracker, get a module logger, and
    #  start the training time clock.
    _mt._get_metric_tracker().track('{}.create'.format(__name__))

    logger = _logging.getLogger(__name__)
    start_time = _time.time()

    ## Validate the input dataset
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    ## Validate neighborhood parameters
    if not isinstance(min_core_neighbors, int) or min_core_neighbors < 0:
        raise ValueError("Input 'min_core_neighbors' must be a non-negative " +
                         "integer.")

    if not isinstance(radius, (int, float)) or radius < 0:
        raise ValueError("Input 'radius' must be a non-negative integer " +
                         "or float.")

    ## Compute all-point nearest neighbors within `radius` and count
    #  neighborhood sizes. `k=None` puts no cap on the number of neighbors,
    #  and self edges are excluded so a point is not counted as its own
    #  neighbor.
    knn_model = _gl.nearest_neighbors.create(dataset, features=features,
                                             distance=distance,
                                             method='brute_force',
                                             verbose=verbose)

    knn = knn_model.similarity_graph(k=None, radius=radius,
                                     include_self_edges=False,
                                     output_type='SFrame',
                                     verbose=verbose)

    neighbor_counts = knn.groupby('query_label', _agg.COUNT)

    ### NOTE: points with NO neighbors are already dropped here!

    ## Identify core points and boundary candidate points. Not all of the
    #  boundary candidates will be boundary points - some are in small isolated
    #  clusters.
    if verbose:
        logger.info("Identifying noise points and core points.")

    # Points with fewer than `min_core_neighbors` neighbors are boundary
    # candidates; all remaining points with at least one neighbor are core.
    # NOTE(review): `1 - boundary_mask` presumably relies on the comparison
    # producing a 0/1 SArray - confirm.
    boundary_mask = neighbor_counts['Count'] < min_core_neighbors
    core_mask = 1 - boundary_mask

    # this includes too small clusters
    boundary_idx = neighbor_counts[boundary_mask]['query_label']
    core_idx = neighbor_counts[core_mask]['query_label']

    ## Build a similarity graph on the core points
    ## NOTE: careful with singleton core points - the second filter removes them
    #  from the edge set so they have to be added separately as vertices.
    if verbose:
        logger.info("Constructing the core point similarity graph.")

    # `core_vertices`: edges whose query end is core; `core_edges`: the
    # subset whose reference end is core as well.
    core_vertices = knn.filter_by(core_idx, 'query_label')
    core_edges = core_vertices.filter_by(core_idx, 'reference_label')

    core_graph = _gl.SGraph()
    core_graph = core_graph.add_vertices(core_vertices[['query_label']],
                                         vid_field='query_label')
    core_graph = core_graph.add_edges(core_edges, src_field='query_label',
                                      dst_field='reference_label')

    ## Compute core point connected components and relabel to be consecutive
    #  integers
    cc = _gl.connected_components.create(core_graph, verbose=verbose)
    cc_labels = cc['component_size'].add_row_number('__label')
    core_assignments = cc['component_id'].join(cc_labels, on='component_id',
                                               how='left')[['__id', '__label']]
    core_assignments['type'] = 'core'

    ## Join potential boundary points to core cluster labels (points that aren't
    #  really on a boundary are implicitly dropped)
    if verbose:
        logger.info("Processing boundary points.")

    boundary_edges = knn.filter_by(boundary_idx, 'query_label')

    # separate real boundary points from points in small isolated clusters
    boundary_core_edges = boundary_edges.filter_by(core_idx, 'reference_label')

    # join a boundary point to its single closest core point.
    boundary_assignments = boundary_core_edges.groupby(
        'query_label',
        {'reference_label': _agg.ARGMIN('rank', 'reference_label')})

    boundary_assignments = boundary_assignments.join(
        core_assignments, on={'reference_label': '__id'})

    boundary_assignments = boundary_assignments.rename({'query_label': '__id'})
    boundary_assignments = boundary_assignments.remove_column(
        'reference_label')
    boundary_assignments['type'] = 'boundary'

    ## Identify boundary candidates that turned out to be in small clusters but
    #  not on real cluster boundaries
    small_cluster_idx = set(boundary_idx).difference(
        boundary_assignments['__id'])

    ## Identify individual noise points by the fact that they have no neighbors.
    #  Row numbers 0..num_rows-1 that never appear as a query label are noise.
    noise_idx = set(range(dataset.num_rows())).difference(
        neighbor_counts['query_label'])

    noise_idx = noise_idx.union(small_cluster_idx)

    # Noise rows get a missing cluster id.
    # NOTE(review): the cast to int presumably gives the all-None column the
    # same dtype as the core/boundary labels so the frames can be appended -
    # confirm.
    noise_assignments = _gl.SFrame(
        {'row_id': _gl.SArray(list(noise_idx), int)})
    noise_assignments['cluster_id'] = None
    noise_assignments['cluster_id'] = noise_assignments['cluster_id'].astype(
        int)
    noise_assignments['type'] = 'noise'

    ## Append core, boundary, and noise results to each other. Empty pieces
    #  are skipped; the number of clusters is the number of distinct core
    #  labels.
    master_assignments = _gl.SFrame()
    num_clusters = 0

    if core_assignments.num_rows() > 0:
        core_assignments = core_assignments.rename({
            '__id': 'row_id', '__label': 'cluster_id'
        })
        master_assignments = master_assignments.append(core_assignments)
        num_clusters = len(core_assignments['cluster_id'].unique())

    if boundary_assignments.num_rows() > 0:
        boundary_assignments = boundary_assignments.rename({
            '__id': 'row_id', '__label': 'cluster_id'
        })
        master_assignments = master_assignments.append(boundary_assignments)

    if noise_assignments.num_rows() > 0:
        master_assignments = master_assignments.append(noise_assignments)

    ## Post-processing and formatting: collect the inputs, the feature and
    #  distance details reported by the nearest neighbors model, the cluster
    #  assignments, and the elapsed training time.
    state = {
        'verbose': verbose,
        'radius': radius,
        'min_core_neighbors': min_core_neighbors,
        'distance': knn_model['distance'],
        'num_distance_components': knn_model['num_distance_components'],
        'num_examples': dataset.num_rows(),
        'features': knn_model['features'],
        'num_features': knn_model['num_features'],
        'unpacked_features': knn_model['unpacked_features'],
        'num_unpacked_features': knn_model['num_unpacked_features'],
        'cluster_id': master_assignments,
        'num_clusters': num_clusters,
        'training_time': _time.time() - start_time
    }

    return DBSCANModel(state)
def predict(self, dataset, output_type='cluster_id', verbose=True):
    """
    Return predicted cluster label for instances in the new 'dataset'.
    K-means predictions are made by assigning each new instance to the
    closest cluster center.

    Parameters
    ----------
    dataset : SFrame
        Dataset of new observations. Must include the features used for
        model training; additional columns are ignored.

    output_type : {'cluster_id', 'distance'}, optional
        Form of the prediction. 'cluster_id' (the default) returns the
        cluster label assigned to each input instance, while 'distance'
        returns the Euclidean distance between the instance and its
        assigned cluster's center.

    verbose : bool, optional
        If True, print progress updates to the screen.

    Returns
    -------
    out : SArray
        Model predictions. Depending on the specified `output_type`, either
        the assigned cluster label or the distance of each point to its
        closest cluster center. The order of the predictions is the same as
        order of the input data rows.

    Raises
    ------
    TypeError
        If `output_type` is not a string.

    ValueError
        If `output_type` is neither 'cluster_id' nor 'distance'.

    See Also
    --------
    create

    Examples
    --------
    >>> sf = graphlab.SFrame({
    ...     'x1': [0.6777, -9.391, 7.0385, 2.2657, 7.7864, -10.16, -8.162,
    ...            8.8817, -9.525, -9.153, 2.0860, 7.6619, 6.5511, 2.7020],
    ...     'x2': [5.6110, 8.5139, 5.3913, 5.4743, 8.3606, 7.8843, 2.7305,
    ...            5.1679, 6.7231, 3.7051, 1.7682, 7.4608, 3.1270, 6.5624]})
    ...
    >>> model = graphlab.kmeans.create(sf, num_clusters=3)
    ...
    >>> sf_new = graphlab.SFrame({'x1': [-5.6584, -1.0167, -9.6181],
    ...                           'x2': [-6.3803, -3.7937, -1.1022]})
    >>> clusters = model.predict(sf_new, output_type='cluster_id')
    >>> print(clusters)
    [1, 0, 1]
    """
    _mt._get_metric_tracker().track('toolkit.kmeans.predict')

    ## Validate the input dataset.
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    ## Validate the output type.
    if not isinstance(output_type, str):
        raise TypeError("The 'output_type' parameter must be a string.")

    # The message names the values that are actually accepted; it used to
    # advertise 'cluster_label', which this very check rejects.
    if output_type not in ('cluster_id', 'distance'):
        raise ValueError("The 'output_type' parameter must be either " +
                         "'cluster_id' or 'distance'.")

    ## Get model features.
    ref_features = self.get('features')
    sf_features = _tkutl._toolkits_select_columns(dataset, ref_features)

    ## Compute predictions.
    opts = {'model': self.__proxy__,
            'model_name': self.__name__,
            'dataset': sf_features}

    result = _gl.toolkits._main.run('kmeans_predict', opts, verbose)
    sf_result = _gl.SFrame(None, _proxy=result['predictions'])

    # `output_type` was validated above to be one of the two prediction
    # column names, so it selects the requested column directly.
    return sf_result[output_type]
def create(dataset, tag_name=None, features=None, verbose=True):
    """
    Create a :class:`NearestNeighborAutoTagger` model, which can be used to
    quickly apply tags from a reference set of text labels to a new query
    set using the ``tag`` method.

    Parameters
    ----------
    dataset : SFrame
        Reference data. This SFrame must contain at least one column. By
        default, only the ``tag_name`` column is used as the basis for
        tagging. You may optionally include additional columns with the
        ``features`` parameter.

    tag_name : string, optional
        Name of the column in ``dataset`` with the tags. This column must
        contain string values. If ``dataset`` contains more than one
        column, ``tag_name`` must be specified.

    features : list[string], optional
        Names of the columns with features to use as the basis for tagging.
        'None' (the default) indicates that only the column specified by
        the ``tag_name`` parameter should be used. Only str or list fields
        are allowed. If a column of type list is specified, all values must
        be either of type string or convertible to type string.

    verbose : bool, optional
        If True, print verbose output during model creation.

    Returns
    -------
    out : model
        A model for quickly tagging new query observations with entries
        from `dataset`. Currently, the only implementation is the
        following:

        - NearestNeighborAutoTagger

    Raises
    ------
    ToolkitError
        If `tag_name` is missing for a multi-column `dataset`, or if
        `tag_name` or one of `features` is not a column of `dataset`.

    TypeError
        If the tag column is not of type string, if `features` is given but
        is not a list of strings, or if a feature column is neither of type
        string nor of type list.

    See Also
    --------
    graphlab.nearest_neighbors.NearestNeighborsModel

    Examples
    --------
    First construct a toy `SFrame` of actor names, which will serve as the
    reference set for our autotagger model.

    >>> actors_sf = gl.SFrame(
            {"actor": ["Will Smith", "Tom Hanks", "Bradley Cooper",
                       "Tom Cruise", "Jude Law", "Robert Pattinson",
                       "Matt Damon", "Brad Pitt", "Johnny Depp",
                       "Leonardo DiCaprio", "Jennifer Aniston",
                       "Jessica Alba", "Emma Stone", "Cameron Diaz",
                       "Scarlett Johansson", "Mila Kunis", "Julia Roberts",
                       "Charlize Theron", "Marion Cotillard",
                       "Angelina Jolie"]})
    >>> m = gl.data_matching.nearest_neighbor_autotagger.create(
                actors_sf, tag_name="actor")

    Then we load some IMDB movie reviews into an `SFrame` and tag them
    using the model we created above. The score field in the output is a
    similarity score, indicating the strength of the match between the
    query data and the suggested reference tag.

    >>> reviews_sf = gl.SFrame(
            "https://static.turi.com/datasets/imdb_reviews/reviews.sframe")
    >>> m.tag(reviews_sf.head(10), query_name="review", verbose=False)
    +-----------+-------------------------------+------------------+-----------------+
    | review_id |             review            |      actor       |      score      |
    +-----------+-------------------------------+------------------+-----------------+
    |     0     | Story of a man who has unn... |   Cameron Diaz   | 0.0769230769231 |
    |     0     | Story of a man who has unn... |  Angelina Jolie  | 0.0666666666667 |
    |     0     | Story of a man who has unn... | Charlize Theron  |      0.0625     |
    |     0     | Story of a man who has unn... | Robert Pattinson | 0.0588235294118 |
    |     1     | Bromwell High is a cartoon... |   Jessica Alba   |      0.125      |
    |     1     | Bromwell High is a cartoon... | Jennifer Aniston |       0.1       |
    |     1     | Bromwell High is a cartoon... | Charlize Theron  |      0.05       |
    |     1     | Bromwell High is a cartoon... | Robert Pattinson |  0.047619047619 |
    |     1     | Bromwell High is a cartoon... | Marion Cotillard |  0.047619047619 |
    |     2     | Airport '77 starts as a br... |  Julia Roberts   | 0.0961538461538 |
    |    ...    |              ...              |       ...        |       ...       |
    +-----------+-------------------------------+------------------+-----------------+

    The initial results look a little noisy. To filter out obvious spurious
    matches, we can set the `tag` method's similarity_threshold parameter.

    >>> m.tag(reviews_sf.head(1000), query_name="review", verbose=False,
              similarity_threshold=.8)
    +-----------+-------------------------------+------------------+----------------+
    | review_id |             review            |      actor       |     score      |
    +-----------+-------------------------------+------------------+----------------+
    |    341    | I caught this film at a te... |  Julia Roberts   | 0.857142857143 |
    |    657    | Fairly funny Jim Carrey ve... | Jennifer Aniston | 0.882352941176 |
    |    668    | A very funny movie. It was... | Jennifer Aniston | 0.833333333333 |
    |    673    | This film is the best film... | Jennifer Aniston |     0.9375     |
    +-----------+-------------------------------+------------------+----------------+

    In this second example, you'll notice that the ``review_id`` column is
    much more sparse. This is because all results whose score was below the
    specified similarity threshold (.8) were excluded from the output.
    """
    # validate the 'dataset' input
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    # ensure that tag_name is provided if dataset has > 1 column
    if dataset.num_cols() > 1 and not tag_name:
        raise _ToolkitError("No tag_name parameter specified on dataset " \
                            "with %d columns" % dataset.num_cols())
    tag_name = tag_name or dataset.column_names()[0]

    # ensure that column with name tag_name exists
    if tag_name not in dataset.column_names():
        raise _ToolkitError('No column named "%s" in dataset' % tag_name)

    # ensure that column is of type string
    if dataset[tag_name].dtype() != str:
        raise TypeError("The column used as the tag name must be of type " \
                        "string.")

    # use reasonable default for general case
    distance = _gl.distances.weighted_jaccard

    # If additional features are specified, they must be a list AND every
    # entry must be a string. The previous condition combined the two tests
    # with `and`, so a list containing non-string entries was never
    # rejected here.
    # NOTE(review): `str` matches the string checks used elsewhere in this
    # function; under Python 2 this rejects `unicode` names - confirm that
    # callers pass `str`.
    if features and (not isinstance(features, list) or
                     not all(isinstance(x, str) for x in features)):
        raise TypeError("The feature parameter must be a list of strings " \
                        "and those strings must correspond to columns in " \
                        "`dataset`.")

    # at a minimum, this SFrame will contain the tags as features;
    # the tag column always comes first and is never listed twice
    features = features or []
    features = [tag_name] + [x for x in features if x != tag_name]

    # ensure that each specified feature column is either of type list or str
    column_names = set(dataset.column_names())
    for col_name in features:
        if col_name not in column_names:
            raise _ToolkitError("Specified feature column (%s) not found " \
                                "in dataset" % col_name)

        if dataset.select_column(col_name).dtype() not in (str, list):
            raise TypeError("Only string and list columns are allowed as " \
                            "features.")

    # concatenate the feature columns into a single column
    features_sf = dataset.select_columns(features)
    feature_col, features_sf = _concat_string_features(features_sf, features)

    # compute features
    if verbose:
        _logging.getLogger().info("Extracting features...")

    processed_sf = _preprocess(features_sf.select_column(feature_col))

    # group by tag_name to ensure that tags are unique; one value per
    # feature column is kept for each tag
    feature_cols = processed_sf.column_names()
    select_cols = {col_name: _gl.aggregate.SELECT_ONE(col_name)
                   for col_name in feature_cols}
    processed_sf.add_column(dataset[tag_name], tag_name)
    processed_sf = processed_sf.groupby(tag_name, select_cols)

    # create nearest neighbors model
    m = _gl.nearest_neighbors.create(
        processed_sf, label=tag_name, distance=distance,
        features=feature_cols, verbose=verbose)

    # add standard toolkit state attributes
    state = {"nearest_neighbors_model": m,
             "training_time": m.get("training_time"),
             "tag_name": tag_name,
             "verbose": verbose,
             "num_examples": len(processed_sf),
             "features": feature_cols,
             "num_features": len(feature_cols),
             "distance": m.get("distance")}

    model = NearestNeighborAutoTagger(state)
    return model
def create(datasets, row_label=None, features=None, grouping_features=None,
           distance=None, k=2, radius=None, verbose=True):
    """
    Create a deduplication model based on nearest neighbors and SGraph
    connected components.

    This method creates a :class:`NearestNeighborDeduplication` model by
    constructing a nearest neighbors similarity graph on all of the rows in
    the input 'datasets', then using the connected components tool in the
    :mod:`~graphlab.toolkits.graph_analytics` module to assign an entity label
    to each record. Records which share the same label are considered to be
    duplicates.

    .. warning::

        The 'dot_product' distance is deprecated and will be removed in future
        versions of GraphLab Create. Please use 'transformed_dot_product'
        distance instead, although note that this is more than a name change;
        it is a *different* transformation of the dot product of two vectors.
        Please see the distances module documentation for more details.

    Parameters
    ----------
    datasets : SFrame or list[SFrame] or dict(string: SFrame)
        Input datasets. Each SFrame in the list must include all of the
        features specified in the `features` or 'distance' parameters, but may
        have additional columns as well. SFrames can be input as values in a
        dictionary, where the keys are strings used in the output to identify
        the SFrame from which each record originated.

    row_label : string, optional
        Name of the SFrame column with row labels. If not specified, row
        numbers are used to identify rows in the output.

    features : list[string], optional
        Name of the columns with features to use in comparing records. 'None'
        (the default) indicates the intersection of columns over all SFrames in
        `datasets` should be used (except the label column, if specified). Each
        column can be one of the following types:

        - *Numeric*: values of numeric type integer or float.

        - *Array*: array of numeric (integer or float) values. Each array
          element is treated as a separate variable in the model.

        - *Dictionary*: key-value pairs with numeric (integer or float) values.
          Each key indicates a separate variable in the model.

        - *String*: string values.

        Please note: if `distance` is specified as a composite distance, then
        that parameter controls which features are used in the model. Any
        additional columns named in 'features' will be included in the model
        output but not used for distance computations.

    grouping_features : list[string], optional
        Names of features to use in grouping records before finding
        approximate matches. These columns must have string or integer type
        data. See the Notes section for more details on grouping.

    distance : string or list[list], optional
        Function to measure the distance between any two input data rows. This
        may be one of two types:

        - *String*: the name of a standard distance function. One of
          'euclidean', 'squared_euclidean', 'manhattan', 'levenshtein',
          'jaccard', 'weighted_jaccard', 'cosine', 'dot_product' (deprecated),
          or 'transformed_dot_product'.

        - *Composite distance*: the weighted sum of several standard distance
          functions applied to various features. This is specified as a list
          of distance components, each of which is itself a list containing
          three items:

          1. list or tuple of feature names (strings)

          2. standard distance name (string)

          3. scaling factor (int or float)

        For more information about GraphLab Create distance functions, please
        see the :py:mod:`~graphlab.toolkits.distances` module.

        For sparse vectors, missing keys are assumed to have value 0.0.

        If 'distance' is left unspecified or set to 'auto', a composite
        distance is constructed automatically based on feature types.

    k : int, optional
        Number of neighbors to consider for each point.

    radius : float, optional
        Maximum distance from each point to a potential duplicate.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : NearestNeighborDeduplication model
        The NearestNeighborDeduplication object contains a field 'entities'
        which shows the entity label for each input record. It also shows the
        features for each record that are used to construct the model, as well
        as the original SFrame and row label for each record. If the original
        `datasets` are passed in a list, the SFrame identifier is the index of
        the SFrame in that list.

    Raises
    ------
    ValueError
        If `datasets` is a dict whose keys are not all strings.

    TypeError
        If `datasets` is not an SFrame, list, or dict; if `row_label` is not a
        string; if `features` or `grouping_features` is not an iterable of
        strings; or if `distance` is neither a string, a list, nor None.

    ToolkitError
        If `row_label` is missing from any input dataset, or if the default
        row label 'row_number' collides with an existing column.

    See Also
    --------
    NearestNeighborDeduplication, graphlab.toolkits.nearest_neighbors,
    graphlab.SFrame.groupby

    Notes
    -----
    - Standardizing features is often a good idea with distance-based methods,
      but this model does *not* standardize features.

    - For datasets with more than about 10,000 records, *grouping* (also known
      as *blocking*) is a critical step to avoid computing distances between
      all pairs of records. The grouping step simply assigns each record to a
      group that has identical values for all `grouping_features`, and only
      looks for duplicates within each group.

    - Records with missing data in the `grouping_features` are removed from
      consideration as duplicates. These records are given the entity label
      "None".

    - For tasks that require *only* exact matches on certain features, it is
      generally more natural to use the SFrame `groupby` function.

    - For features that all have the same type, the distance parameter may be
      a single standard distance function name (e.g. "euclidean"). In the
      model, however, all distances are first converted to composite distance
      functions; as a result, the 'distance' field in the model is always a
      composite distance.

    References
    ----------
    - Christen, Peter. "Data matching: concepts and techniques for record
      linkage, entity resolution, and duplicate detection." Springer Science &
      Business Media, 2012.

    Examples
    --------
    >>> sf1 = graphlab.SFrame({'id': [0, 1, 2],
    ...                        'x0': [0.5, 0.5, 0.3],
    ...                        'x1': [1., 0.8, 0.6],
    ...                        'city': ['seattle', 'olympia', 'boston'],
    ...                        'state': ['WA', 'WA', 'MA']})
    ...
    ... # note: misspellings in the following dataset do not prevent correct
    ... # matches.
    >>> sf2 = graphlab.SFrame({'id': [9, 10],
    ...                        'x0': [0.35, 0.4],
    ...                        'x1': [0.65, 0.8],
    ...                        'city': ['bostan', 'seatle'],
    ...                        'state': ['MA', 'WA']})
    ...
    >>> dist = [[('city',), 'levenshtein', 2],
    ...         [('x0', 'x1'), 'euclidean', 1.5]]
    ...
    >>> m = graphlab.nearest_neighbor_deduplication.create({'a': sf1, 'b': sf2},
    ...                                                    row_label='id',
    ...                                                    grouping_features=['state'],
    ...                                                    distance=dist, k=None,
    ...                                                    radius=3)
    ...
    >>> print(m['entities'])
    +----------+----+----------+-------+------+---------+------+
    | __sframe | id | __entity | state |  x0  |   city  |  x1  |
    +----------+----+----------+-------+------+---------+------+
    |    a     | 1  |    0     |   WA  | 0.5  | olympia | 0.8  |
    |    a     | 0  |    1     |   WA  | 0.5  | seattle | 1.0  |
    |    b     | 10 |    1     |   WA  | 0.4  |  seatle | 0.8  |
    |    a     | 2  |    2     |   MA  | 0.3  |  boston | 0.6  |
    |    b     | 9  |    2     |   MA  | 0.35 |  bostan | 0.65 |
    +----------+----+----------+-------+------+---------+------+
    [5 rows x 7 columns]
    """

    ## Set up
    _mt._get_metric_tracker().track('{}.create'.format(__name__))
    start_time = _time.time()

    model = NearestNeighborDeduplication()
    model.__proxy__['verbose'] = verbose
    model.__proxy__['k'] = k
    model.__proxy__['radius'] = radius


    ### ----------------------------- ###
    ### Validation and preprocessing ###
    ### ----------------------------- ###

    ### Validate input datasets
    ### -----------------------

    ## If datasets is already a dict, check the keys are all strings. This must
    #  happen before the SFrame/list conversions below, which create integer
    #  keys on purpose.
    if isinstance(datasets, dict):
        if not all(isinstance(x, str) for x in datasets):
            raise ValueError("Keys in the 'datasets' dict must be strings.")

    ## Convert singleton SFrame dataset into a dict of datasets
    if isinstance(datasets, _gl.SFrame):
        _raise_error_if_sframe_empty(datasets, "dataset")
        datasets = {0: datasets}

    ## Convert a list of SFrames into a dict keyed by list position
    if isinstance(datasets, list):
        datasets = {idx: sf for idx, sf in enumerate(datasets)}

    ## At this point, 'datasets' must be dict. If it's not, something is wrong.
    if not isinstance(datasets, dict):
        raise TypeError("Input 'datasets' must be an SFrame, a list of SFrames, " +
                        "or a dictionary of (string, SFrame) pairs.")

    model.__proxy__['num_datasets'] = len(datasets)

    ## Ensure that all datasets are SFrames
    for d in datasets.values():
        _raise_error_if_not_sframe(d, "dataset")


    ### Validate row label
    ### ------------------

    ## Validate the label column
    if row_label:
        if not isinstance(row_label, str):
            raise TypeError("The 'row_label' parameter must be the name (string " +
                            "type) of a column in each of the input datasets.")

        for d in datasets.values():
            if row_label not in d.column_names():
                raise _ToolkitError("The specified row_label column does not " +
                                    " exist in all input datasets.")
    else:
        # No label given: fall back to a generated name, which must not clash
        # with a real column in any dataset.
        row_label = 'row_number'

        for d in datasets.values():
            if row_label in d.column_names():
                raise _ToolkitError("Input 'row_label' defaulted to " +
                                    "'row_number', which is already a column" +
                                    " in at least one input dataset. Please " +
                                    "specify a row label column manually.")

    model.__proxy__['row_label'] = row_label


    ### Validate 'features' and 'grouping_features' parameters
    ### ------------------------------------------------------
    if features is not None:
        if not hasattr(features, '__iter__'):
            raise TypeError("Input 'features' must be a list.")

        if not all([isinstance(x, str) for x in features]):
            raise TypeError("Input 'features' must contain only strings.")

    if grouping_features is not None:
        if not hasattr(grouping_features, '__iter__'):
            raise TypeError("Input 'grouping_features' must be a list.")

        if not all([isinstance(x, str) for x in grouping_features]):
            raise TypeError("Input 'grouping_features' must contain only strings.")


    ### Validate and preprocess the distance function
    ### ---------------------------------------------
    # - The form of the 'distance' controls how we interact with the 'features'
    #   parameter as well.

    ## Find the intersection of all feature sets and feature types. Column
    #  types are read from the first dataset only.
    first_sf = list(datasets.values())[0]
    col_types = dict(zip(first_sf.column_names(), first_sf.column_types()))

    all_features = [sf.column_names() for sf in datasets.values()]
    ftr_intersection = list(set(all_features[0]).intersection(*all_features))
    ftr_intersection = [x for x in ftr_intersection if x != row_label]

    ## Features to use when 'distance' does not itself name the features.
    default_features = features if features is not None else ftr_intersection

    ## Convert features and distance arguments into a composite distance.
    if isinstance(distance, list):
        distance = _copy.deepcopy(distance)

    elif distance is None or (isinstance(distance, str) and distance == 'auto'):
        # 'auto' is documented as equivalent to leaving 'distance' unspecified,
        # so it must be caught before the generic string branch below.
        distance = _construct_auto_distance(default_features, col_types)

    elif isinstance(distance, str):
        distance = [[default_features, distance, 1]]

    else:
        raise TypeError("Input 'distance' not understood. Note that for the " +
                        "data matching toolkit, 'distance' must be a string or " +
                        "a composite distance list.")

    ## Validate the form of the composite distance and add to the model
    allowed_dists = {
        'euclidean': [int, float, _array.array],
        'squared_euclidean': [int, float, _array.array],
        'manhattan': [int, float, _array.array],
        'levenshtein': [str],
        'jaccard': [str, dict],
        'weighted_jaccard': [str, dict],
        'cosine': [int, float, str, dict, _array.array],
        'dot_product': [int, float, str, dict, _array.array],
        'transformed_dot_product': [int, float, str, dict, _array.array]}

    distance = _dmutl.validate_composite_distance(distance, row_label,
                                                  list(allowed_dists.keys()),
                                                  verbose)

    # Store a copy: 'distance' is mutated below when string features are
    # replaced by their cleaned versions.
    model.__proxy__['distance'] = _copy.deepcopy(distance)

    ## Figure out which features are 'fuzzy', i.e. used for approximate
    #  matching, and set in the model state.
    fuzzy_features = _dmutl.extract_composite_features(distance)  # already has row_label removed

    model.__proxy__['features'] = fuzzy_features
    model.__proxy__['num_features'] = len(fuzzy_features)

    ## Compile a master list of all features. This includes grouping features,
    #  fuzzy features (the ones used for approximate matching), and "ancillary"
    #  features, which the user named in the 'features' parameter but which are
    #  not included in the 'distance' specification.
    if features is None:
        features = []
    else:
        features = [x for x in features if x != row_label]

    if grouping_features is None:
        grouping_features = []
    else:
        grouping_features = [x for x in grouping_features if x != row_label]

    model.__proxy__['grouping_features'] = grouping_features
    model.__proxy__['num_grouping_features'] = len(grouping_features)

    master_features = list(set(features + grouping_features + fuzzy_features))


    ### Consolidate data and engineer features
    ### --------------------------------------

    ## Consolidate multiple input datasets into a single SFrame, with a useful
    #  row label.
    sf_union = _dmutl.concatenate_sframes(datasets, row_label=row_label,
                                          features=master_features,
                                          sf_index_name='__sframe')

    overall_label = '__sframe.' + row_label
    sf_union[overall_label] = (sf_union['__sframe'].astype(str) + "." +
                               sf_union[row_label].astype(str))

    ## Validate the feature types in the consolidated dataset against the
    #  specified distance functions.
    _dmutl.validate_distance_feature_types(sf_union, distance, allowed_dists)

    ## Clean string-type features in the fuzzy feature set, and point each
    #  distance component at the cleaned column instead of the original.
    for ftr in fuzzy_features:
        if col_types[ftr] == str:
            new_ftr = '__clean.' + ftr
            sf_union[new_ftr] = sf_union[ftr].fillna("")
            sf_union[new_ftr] = sf_union[new_ftr].apply(
                lambda x: _dmutl.cleanse_string(x), dtype=str)

            for dist_comp in distance:
                dist_comp[0] = [new_ftr if x == ftr else x for x in dist_comp[0]]

    ## Feature engineering, distance-component-wise. Also update list of
    #  features and a map to their types.
    sf_union, distance = _engineer_distance_features(sf_union, distance)
    transformed_features = _dmutl.extract_composite_features(distance)


    ### -------------------------------------------- ###
    ### Main loop over blocks of neighbor candidates ###
    ### -------------------------------------------- ###

    ## Construct blocks on features that must match exactly
    if verbose:
        _logging.info("Constructing groups of records that match exactly on " +
                      "the 'grouping_features'.")

    sf_union, block_errors, blocks = \
        _dmutl.construct_exact_blocks(sf_union, grouping_features)

    if verbose and len(distance) > 0 and blocks['Count'].max() > 10000:
        _logging.warning("There are more than 10,000 records in the largest match " +
            "group. For many uses, approximate matches within each match group are " +
            "computed with brute force nearest neighbors, which may be slow. " +
            "Consider using smaller groups by requiring different features to " +
            "match exactly.")

    max_entity_number = 0
    sf_entity = _gl.SFrame()
    output_features = (master_features + [row_label, '__sframe', '__entity'])

    ## Main loop over blocks
    for i, block in enumerate(blocks):

        if verbose:
            _logging.info("Processing {} records in match group: {}/{}".format(block['Count'],
                                                                             i+1,
                                                                             len(blocks)))

        ## Retrieve records in the block and impute the mean for missing numeric
        #  values.
        records = sf_union[block['min_idx']:(block['max_idx'] + 1)]
        complete_records = _dmutl.impute_numeric_means(records, transformed_features)

        if len(distance) > 0:
            ## Run all-point nearest neighbors
            if verbose:
                _logging.info("Building the similarity graph....")

            m = _gl.nearest_neighbors.create(complete_records, label=overall_label,
                                             distance=distance, verbose=False)
            knn = m.query(complete_records, label=overall_label, k=k, radius=radius,
                          verbose=verbose)

            ## Construct similarity graph to resolve transitive closure
            sg = _gl.SGraph()
            sg = sg.add_vertices(records[[overall_label]], vid_field=overall_label)
            sg = sg.add_edges(knn, src_field='query_label',
                              dst_field='reference_label')

            ## Cut the similarity graph to establish an entity for each vertex
            if verbose:
                _logging.info("Finding duplicate records in the similarity graph....")

            cc = _gl.connected_components.create(sg, verbose=verbose)

            ## Relabel the component IDs to be consecutive integers starting with
            #  the max index of the previous block's entity labels.
            block_labels = cc['component_size'].add_row_number('__entity')
            block_labels['__entity'] += max_entity_number
            max_entity_number += block_labels.num_rows()
            block_entity_labels = cc['component_id'].join(block_labels,
                                                          on='component_id',
                                                          how='left')

            ## Join the entity labels for the block back to the block's records,
            #  then append to the master output
            records = records.join(block_entity_labels[['__id', '__entity']],
                                   on={overall_label: '__id'}, how='left')
            records = records.sort('__entity')

        else:  # no fuzzy features, so no nearest neighbors, just block ID
            # NOTE(review): 'max_entity_number' is not advanced in this branch,
            # so 'num_entities' stays 0 when there is no distance component --
            # confirm whether that is intended.
            records['__entity'] = _gl.SArray.from_const(i, len(records))

        sf_entity = sf_entity.append(records[output_features])


    ### ------------------------------------- ###
    ### Postprocessing and results formatting ###
    ### ------------------------------------- ###

    ## Add rows missing from the blocking back to the master results, with a
    #  missing (None) entity label.
    if len(block_errors) > 0:
        block_errors['__entity'] = _gl.SArray.from_const(None, len(block_errors)).astype(int)
        sf_entity = sf_entity.append(block_errors[output_features])

    ## Rearrange columns so the first three are '__sframe', the row label, and
    #  '__entity'.
    sf_entity.swap_columns('__sframe', sf_entity.column_names()[0])
    sf_entity.swap_columns(row_label, sf_entity.column_names()[1])
    sf_entity.swap_columns('__entity', sf_entity.column_names()[2])

    ## Finalize the model state
    model.__proxy__['training_time'] = _time.time() - start_time
    model.__proxy__['entities'] = sf_entity
    model.__proxy__['num_entities'] = max_entity_number

    return model
def query(self, dataset, label=None, k=5, radius=None, verbose=True):
    """
    Retrieve the nearest neighbors from the reference set for each element
    of the query set. The query SFrame must include columns with the same
    names as the label and feature columns used to create the
    NearestNeighborsModel.

    Parameters
    ----------
    dataset : SFrame
        Query data. Must contain columns with the same names and types as
        the features used to train the model. Additional columns are
        allowed, but ignored. Please see the nearest neighbors
        :func:`~graphlab.nearest_neighbors.create` documentation for more
        detail on allowable data types.

    label : string, optional
        Name of the query SFrame column with row labels. If 'label' is not
        specified, row numbers are used to identify query dataset rows in
        the output SFrame.

    k : int, optional
        Number of nearest neighbors to return from the reference set for
        each query observation. The default is 5 neighbors, but setting it
        to ``None`` will return all neighbors within ``radius`` of the query
        point.

    radius : float, optional
        Only neighbors whose distance to a query point is smaller than this
        value are returned. The default is ``None``, in which case the ``k``
        nearest neighbors are returned for each query point, regardless of
        distance.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : SFrame
        An SFrame with the k-nearest neighbors of each query observation.
        The result contains four columns: the first is the label of the
        query observation, the second is the label of the nearby reference
        observation, the third is the distance between the query and
        reference observations, and the fourth is the rank of the reference
        observation among the query's k-nearest neighbors.

    Raises
    ------
    ValueError
        If `label` is not a column of `dataset` or is one of the model's
        features; if `k` is not a positive integer; or if `radius` is not a
        non-negative number.

    TypeError
        If the label column does not contain integers or strings.

    Notes
    -----
    - If both ``k`` and ``radius`` are set to ``None``, each query point
      returns all of the reference set. If the reference dataset has
      :math:`n` rows and the query dataset has :math:`m` rows, the output is
      an SFrame with :math:`nm` rows.

    Examples
    --------
    First construct a toy SFrame and create a nearest neighbors model:

    >>> sf = graphlab.SFrame({'label': range(3),
    ...                       'feature1': [0.98, 0.62, 0.11],
    ...                       'feature2': [0.69, 0.58, 0.36]})
    >>> model = graphlab.nearest_neighbors.create(sf, 'label')

    A new SFrame contains query observations with same schema as the
    reference SFrame. This SFrame is passed to the ``query`` method.

    >>> queries = graphlab.SFrame({'label': range(3),
    ...                            'feature1': [0.05, 0.61, 0.99],
    ...                            'feature2': [0.06, 0.97, 0.86]})
    >>> model.query(queries, 'label', k=2)
    +-------------+-----------------+----------------+------+
    | query_label | reference_label |    distance    | rank |
    +-------------+-----------------+----------------+------+
    |      0      |        2        | 0.305941170816 |  1   |
    |      0      |        1        | 0.771556867638 |  2   |
    |      1      |        1        | 0.390128184063 |  1   |
    |      1      |        0        | 0.464004310325 |  2   |
    |      2      |        0        | 0.170293863659 |  1   |
    |      2      |        1        | 0.464004310325 |  2   |
    +-------------+-----------------+----------------+------+
    """

    _mt._get_metric_tracker().track(
        'toolkit.nearest_neighbors.query')

    ## Validate the 'dataset' input
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    ## Get model features
    ref_features = self.get('features')
    sf_features = _tkutl._toolkits_select_columns(dataset, ref_features)

    ## Validate and preprocess the 'label' input. The label column handed to
    #  the query is always named after the model's own label.
    ref_label = self.get('label')

    if label is None:
        # No label given: number the rows, keep that column as the label
        # SFrame, then drop it again from the features.
        sf_features = sf_features.add_row_number(column_name=ref_label)
        sf_label = sf_features[[ref_label]]
        sf_features.remove_column(ref_label)

    else:
        if label not in dataset.column_names():
            raise ValueError(
                "Input 'label' must be a string matching the name of a " +\
                "column in the reference SFrame 'dataset'.")

        label_dtype = dataset[label].dtype()
        if label_dtype != str and label_dtype != int:
            raise TypeError("The label column must contain integers or strings.")

        if label in ref_features:
            raise ValueError("The label column cannot be one of the features.")

        sf_label = _tkutl._toolkits_select_columns(dataset, [label])

        if label != ref_label:
            sf_label.rename({label: ref_label})

    ## Validate neighborhood parameters 'k' and 'radius'
    if k is not None:
        if not isinstance(k, int):
            raise ValueError("Input 'k' must be an integer.")

        if k <= 0:
            raise ValueError("Input 'k' must be larger than 0.")

    if radius is not None:
        if not isinstance(radius, (int, float)):
            raise ValueError("Input 'radius' must be an integer or float.")

        if radius < 0:
            raise ValueError("Input 'radius' must be non-negative.")

    ## Set k and radius to special values to indicate 'None'
    if k is None:
        k = -1

    if radius is None:
        radius = -1.0

    opts = {'model': self.__proxy__,
            'model_name': self.__name__,
            'features': sf_features,
            'label': sf_label,
            'k': k,
            'radius': radius}

    if verbose is True:
        # Parenthesized form prints the same text on Python 2 and is valid
        # syntax on Python 3, unlike the bare print statement.
        print("Starting model querying...")

    result = _graphlab.toolkits._main.run('_nearest_neighbors.query', opts,
                                          verbose)
    return _SFrame(None, _proxy=result['neighbors'])
def query(self, dataset, label=None, k=5, radius=None, verbose=True):
    """
    Look up, for every row of `dataset`, its nearest neighbors among the
    data stored in the model.

    The query dataset does not need to be the same as the reference data
    stored in the model.

    Parameters
    ----------
    dataset : SFrame
        Query data. Must contain columns with the same names and types as
        the features used to train the model. Extra columns are permitted
        and ignored. See the nearest neighbors
        :func:`~graphlab.nearest_neighbors.create` documentation for the
        allowable data types.

    label : str, optional
        Name of the column of `dataset` holding row labels. When omitted,
        row numbers identify the query rows in the output SFrame.

    k : int, optional
        How many nearest neighbors to return from the reference set per
        query observation. Defaults to 5; ``None`` returns every neighbor
        within ``radius`` of the query point.

    radius : float, optional
        Only neighbors closer to the query point than this value are
        returned. Defaults to ``None``, meaning the ``k`` nearest neighbors
        are returned regardless of distance.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : SFrame
        The k-nearest neighbors of each query observation, in four columns:
        the query observation's label, the nearby reference observation's
        label, the distance between the two, and the rank of the reference
        observation among the query's k-nearest neighbors.

    See Also
    --------
    similarity_graph

    Notes
    -----
    - The `dataset` input to this method *can* have missing values (in
      contrast to the reference dataset used to create the nearest
      neighbors model). Missing numeric values are imputed to be the mean
      of the corresponding feature in the reference dataset, and missing
      strings are imputed to be empty strings.

    - If both ``k`` and ``radius`` are ``None``, each query point returns
      all of the reference set. With :math:`n` reference rows and :math:`m`
      query rows the output is an SFrame with :math:`nm` rows.

    - For models created with the 'lsh' method, the query results may have
      fewer query labels than input query points. Because LSH is an
      approximate method, a query point may have fewer than 'k' neighbors.
      If LSH returns no neighbors at all for a query, the query point is
      omitted from the results.

    Examples
    --------
    Build a toy SFrame and a nearest neighbors model on it:

    >>> sf = graphlab.SFrame({'label': range(3),
    ...                       'feature1': [0.98, 0.62, 0.11],
    ...                       'feature2': [0.69, 0.58, 0.36]})
    >>> model = graphlab.nearest_neighbors.create(sf, 'label')

    Query observations live in a second SFrame with the same schema, which
    is handed to ``query``:

    >>> queries = graphlab.SFrame({'label': range(3),
    ...                            'feature1': [0.05, 0.61, 0.99],
    ...                            'feature2': [0.06, 0.97, 0.86]})
    >>> model.query(queries, 'label', k=2)
    +-------------+-----------------+----------------+------+
    | query_label | reference_label |    distance    | rank |
    +-------------+-----------------+----------------+------+
    |      0      |        2        | 0.305941170816 |  1   |
    |      0      |        1        | 0.771556867638 |  2   |
    |      1      |        1        | 0.390128184063 |  1   |
    |      1      |        0        | 0.464004310325 |  2   |
    |      2      |        0        | 0.170293863659 |  1   |
    |      2      |        1        | 0.464004310325 |  2   |
    +-------------+-----------------+----------------+------+
    """

    _mt._get_metric_tracker().track('toolkit.nearest_neighbors.query')

    ## Reject anything that is not a non-empty SFrame
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    ## Pull the model's feature columns out of the query data
    model_features = self.get('features')
    query_features = _tkutl._toolkits_select_columns(dataset, model_features)

    ## Work out the labels that identify each query row
    if label is not None:
        if label not in dataset.column_names():
            raise ValueError(
                "Input 'label' must be a string matching the name of a "
                "column in the reference SFrame 'dataset'.")

        if dataset[label].dtype() not in (str, int):
            raise TypeError(
                "The label column must contain integers or strings.")

        if label in model_features:
            raise ValueError(
                "The label column cannot be one of the features.")

        row_ids = dataset[label]
    else:
        # No label column: fall back to consecutive row numbers.
        row_ids = _graphlab.SArray.from_sequence(len(dataset))

    ## Check 'k', substituting the -1 sentinel that stands in for 'None'
    if k is None:
        k = -1
    elif not isinstance(k, int):
        raise ValueError("Input 'k' must be an integer.")
    elif k <= 0:
        raise ValueError("Input 'k' must be larger than 0.")

    ## Check 'radius', substituting the -1.0 sentinel that stands in for 'None'
    if radius is None:
        radius = -1.0
    elif not isinstance(radius, (int, float)):
        raise ValueError("Input 'radius' must be an integer or float.")
    elif radius < 0:
        raise ValueError("Input 'radius' must be non-negative.")

    ## Hand everything to the toolkit and wrap the returned proxy
    query_opts = {
        'model': self.__proxy__,
        'model_name': self.__name__,
        'features': query_features,
        'query_labels': row_ids,
        'k': k,
        'radius': radius,
    }
    response = _graphlab.toolkits._main.run('_nearest_neighbors.query',
                                            query_opts, verbose)
    return _SFrame(None, _proxy=response['neighbors'])
def create(dataset, label=None, features=None, distance=None, method='auto',
           verbose=True, **kwargs):
    """
    Create a nearest neighbor model, which can be searched efficiently and
    quickly for the nearest neighbors of a query observation. If the `method`
    argument is specified as `auto`, the type of model is chosen automatically
    based on the type of data in `dataset`.

    .. warning::

        The 'dot_product' distance is deprecated and will be removed in future
        versions of GraphLab Create. Please use 'transformed_dot_product'
        distance instead, although note that this is more than a name change;
        it is a *different* transformation of the dot product of two vectors.
        Please see the distances module documentation for more details.

    Parameters
    ----------
    dataset : SFrame
        Reference data. If the features for each observation are numeric, they
        may be in separate columns of 'dataset' or a single column with lists
        of values. The features may also be in the form of a column of sparse
        vectors (i.e. dictionaries), with string keys and numeric values.

    label : string, optional
        Name of the SFrame column with row labels. If 'label' is not specified,
        row numbers are used to identify reference dataset rows when the model
        is queried.

    features : list[string], optional
        Name of the columns with features to use in computing distances between
        observations and the query points. 'None' (the default) indicates that
        all columns except the label should be used as features. Each column
        can be one of the following types:

        - *Numeric*: values of numeric type integer or float.

        - *Array*: list of numeric (integer or float) values. Each list element
          is treated as a separate variable in the model.

        - *Dictionary*: key-value pairs with numeric (integer or float) values.
          Each key indicates a separate variable in the model.

        - *List*: list of integer or string values. Each element is treated as
          a separate variable in the model.

        - *String*: string values.

        Please note: if a composite distance is also specified, this parameter
        is ignored.

    distance : string, function, or list[list], optional
        Function to measure the distance between any two input data rows. This
        may be one of three types:

        - *String*: the name of a standard distance function. One of
          'euclidean', 'squared_euclidean', 'manhattan', 'levenshtein',
          'jaccard', 'weighted_jaccard', 'cosine', 'dot_product' (deprecated),
          or 'transformed_dot_product'.

        - *Function*: a function handle from the
          :mod:`~graphlab.toolkits.distances` module.

        - *Composite distance*: the weighted sum of several standard distance
          functions applied to various features. This is specified as a list of
          distance components, each of which is itself a list containing three
          items:

          1. list or tuple of feature names (strings)

          2. standard distance name (string)

          3. scaling factor (int or float)

        For more information about GraphLab Create distance functions, please
        see the :py:mod:`~graphlab.toolkits.distances` module.

        If 'distance' is left unspecified or set to 'auto', a composite
        distance is constructed automatically based on feature types.

    method : {'auto', 'ball_tree', 'brute_force', 'lsh'}, optional
        Method for computing nearest neighbors. The options are:

        - *auto* (default): the method is chosen automatically, based on the
          type of data and the distance. If the distance is 'manhattan' or
          'euclidean' and the features are numeric or vectors of numeric
          values (with at most 200 variables in total), then the 'ball_tree'
          method is used. Otherwise, the 'brute_force' method is used.

        - *ball_tree*: use a tree structure to find the k-closest neighbors to
          each query point. The ball tree model is slower to construct than the
          brute force model, but queries are faster than linear time. This
          method is not applicable for the cosine and dot product distances.
          See `Liu, et al (2004)
          <http://papers.nips.cc/paper/2666-an-investigation-of-practical-approximate-nearest-neighbor-algorithms>`_
          for implementation details.

        - *brute_force*: compute the distance from a query point to all
          reference observations. There is no computation time for model
          creation with the brute force method (although the reference data is
          held in the model), but each query takes linear time.

        - *lsh*: use Locality Sensitive Hashing (LSH) to find approximate
          nearest neighbors efficiently. The LSH model supports 'euclidean',
          'squared_euclidean', 'manhattan', 'cosine', 'jaccard', 'dot_product'
          (deprecated), and 'transformed_dot_product' distances. Two options
          are provided for LSH -- ``num_tables`` and
          ``num_projections_per_table``. See the notes below for details.

        If the (possibly automatically constructed) distance has more than one
        component, the 'brute_force' method is always used.

    verbose: bool, optional
        If True, print progress updates and model details.

    **kwargs : optional
        Options for the distance function and query method.

        - *leaf_size*: for the ball tree method, the number of points in each
          leaf of the tree. The default is to use the max of 1,000 and
          n/(2^11), which ensures a maximum tree depth of 12.

        - *num_tables*: For the LSH method, the number of hash tables
          constructed. The default value is 20. We recommend choosing values
          from 10 to 30.

        - *num_projections_per_table*: For the LSH method, the number of
          projections/hash functions for each hash table. The default value is
          4 for 'jaccard' distance, 16 for 'cosine' distance and 8 for other
          distances. We recommend using number 2 ~ 6 for 'jaccard' distance,
          8 ~ 20 for 'cosine' distance and 4 ~ 12 for other distances.

    Returns
    -------
    out : NearestNeighborsModel
        A structure for efficiently computing the nearest neighbors in
        'dataset' of new query points.

    Raises
    ------
    TypeError
        If 'features' is specified but is not a list, if the ball tree method
        is combined with a cosine or dot product distance, if 'distance' is
        not a string, function handle, or composite distance, or if a
        list-typed column is used with an unsupported distance or contains
        non-string values.

    ValueError
        If Levenshtein distance is requested for more than one column, or if
        'method' is not one of the supported options.

    ToolkitError
        If an unknown keyword argument is passed, or if the only available
        feature is the row label column.

    See Also
    --------
    NearestNeighborsModel.query, graphlab.toolkits.distances

    Notes
    -----
    - Missing data is not allowed in the 'dataset' provided to this function.
      Please use the :func:`graphlab.SFrame.fillna` and
      :func:`graphlab.SFrame.dropna` utilities to handle missing data before
      creating a nearest neighbors model.

    - Missing keys in sparse vectors are assumed to have value 0.

    - The `composite_params` parameter was removed as of GraphLab Create
      version 1.5. The `distance` parameter now accepts either standard or
      composite distances. Please see the :mod:`~graphlab.toolkits.distances`
      module documentation for more information on composite distances.

    - If the features should be weighted equally in the distance calculations
      but are measured on different scales, it is important to standardize the
      features. One way to do this is to subtract the mean of each column and
      divide by the standard deviation.

    **Locality Sensitive Hashing (LSH)**

    There are several efficient nearest neighbors search algorithms that work
    well for data with low dimensions :math:`d` (approximately 50). However,
    most of the solutions suffer from either space or query time that is
    exponential in :math:`d`. For large :math:`d`, they often provide little,
    if any, improvement over the 'brute_force' method. This is a well-known
    consequence of the phenomenon called `The Curse of Dimensionality`.

    `Locality Sensitive Hashing (LSH)
    <https://en.wikipedia.org/wiki/Locality-sensitive_hashing>`_ is an approach
    that is designed to efficiently solve the *approximate* nearest neighbor
    search problem for high dimensional data. The key idea of LSH is to hash
    the data points using several hash functions, so that the probability of
    collision is much higher for data points which are close to each other
    than those which are far apart.

    An LSH family is a family of functions :math:`h` which map points from the
    metric space to a bucket, so that

    - if :math:`d(p, q) \\leq R`, then :math:`h(p) = h(q)` with at least
      probability :math:`p_1`.

    - if :math:`d(p, q) \\geq cR`, then :math:`h(p) = h(q)` with probability
      at most :math:`p_2`.

    LSH for efficient approximate nearest neighbor search:

    - We define a new family of hash functions :math:`g`, where each
      function :math:`g` is obtained by concatenating :math:`k` functions
      :math:`h_1, ..., h_k`, i.e., :math:`g(p)=[h_1(p),...,h_k(p)]`.
      The algorithm constructs :math:`L` hash tables, each of which
      corresponds to a different randomly chosen hash function :math:`g`.
      There are :math:`k \\cdot L` hash functions used in total.

    - In the preprocessing step, we hash all :math:`n` reference points into
      each of the :math:`L` hash tables.

    - Given a query point :math:`q`, the algorithm iterates over the
      :math:`L` hash functions :math:`g`. For each :math:`g` considered, it
      retrieves the data points that are hashed into the same bucket as q.
      These data points from all the :math:`L` hash tables are considered as
      candidates that are then re-ranked by their real distances with the
      query data.

    **Note** that the number of tables :math:`L` and the number of hash
    functions per table :math:`k` are two main parameters. They can be set
    using the options ``num_tables`` and ``num_projections_per_table``
    respectively.

    Hash functions for different distances:

    - `euclidean` and `squared_euclidean`:
      :math:`h(q) = \\lfloor \\frac{a \\cdot q + b}{w} \\rfloor` where
      :math:`a` is a vector, of which the elements are independently
      sampled from normal distribution, and :math:`b` is a number
      uniformly sampled from :math:`[0, r]`. :math:`r` is a parameter for the
      bucket width. We set :math:`r` using the average all-pair `euclidean`
      distances from a small randomly sampled subset of the reference data.

    - `manhattan`: The hash function of `manhattan` is similar with that of
      `euclidean`. The only difference is that the elements of `a` are sampled
      from Cauchy distribution, instead of normal distribution.

    - `cosine`: Random Projection is designed to approximate the cosine
      distance between vectors. The hash function is
      :math:`h(q) = sgn(a \\cdot q)`, where :math:`a` is randomly sampled
      normal unit vector.

    - `jaccard`: We use a recently proposed method one permutation hashing by
      Shrivastava and Li. See the paper `[Shrivastava and Li, UAI 2014]
      <http://www.auai.org/uai2014/proceedings/individuals/225.pdf>`_ for
      details.

    - `dot_product`: The reference data points are first transformed to
      fixed-norm vectors, and then the minimum `dot_product` distance search
      problem can be solved via finding the reference data with smallest
      `cosine` distances. See the paper `[Neyshabur and Srebro, ICML 2015]
      <http://jmlr.org/proceedings/papers/v37/neyshabur15.html>`_ for details.

    References
    ----------
    - `Wikipedia - nearest neighbor
      search <http://en.wikipedia.org/wiki/Nearest_neighbor_search>`_

    - `Wikipedia - ball tree <http://en.wikipedia.org/wiki/Ball_tree>`_

    - Ball tree implementation: Liu, T., et al. (2004) `An Investigation of
      Practical Approximate Nearest Neighbor Algorithms
      <http://papers.nips.cc/paper/2666-an-investigation-of-practical-approximate-nearest-neighbor-algorithms>`_.
      Advances in Neural Information Processing Systems pp. 825-832.

    - `Wikipedia - Jaccard distance
      <http://en.wikipedia.org/wiki/Jaccard_index>`_

    - Weighted Jaccard distance: Chierichetti, F., et al. (2010) `Finding the
      Jaccard Median
      <http://theory.stanford.edu/~sergei/papers/soda10-jaccard.pdf>`_.
      Proceedings of the Twenty-First Annual ACM-SIAM Symposium on Discrete
      Algorithms. Society for Industrial and Applied Mathematics.

    - `Wikipedia - Cosine distance
      <http://en.wikipedia.org/wiki/Cosine_similarity>`_

    - `Wikipedia - Levenshtein distance
      <http://en.wikipedia.org/wiki/Levenshtein_distance>`_

    - Locality Sensitive Hashing : Chapter 3 of the book `Mining Massive
      Datasets <http://infolab.stanford.edu/~ullman/mmds/ch3.pdf>`_.

    Examples
    --------
    Construct a nearest neighbors model with automatically determined method
    and distance:

    >>> sf = graphlab.SFrame({'X1': [0.98, 0.62, 0.11],
    ...                       'X2': [0.69, 0.58, 0.36],
    ...                       'str_feature': ['cat', 'dog', 'fossa']})
    >>> model = graphlab.nearest_neighbors.create(sf, features=['X1', 'X2'])

    For datasets with a large number of rows and up to about 100 variables, the
    ball tree method often leads to much faster queries.

    >>> model = graphlab.nearest_neighbors.create(sf, features=['X1', 'X2'],
    ...                                           method='ball_tree')

    Often the final determination of a neighbor is based on several distance
    computations over different sets of features. Each part of this composite
    distance may have a different relative weight.

    >>> my_dist = [[['X1', 'X2'], 'euclidean', 2.],
    ...            [['str_feature'], 'levenshtein', 3.]]
    ...
    >>> model = graphlab.nearest_neighbors.create(sf, distance=my_dist)
    """

    ## Validate the 'dataset' input
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    ## Basic validation of the features input
    if features is not None and not isinstance(features, list):
        raise TypeError("If specified, input 'features' must be a list of " +
                        "strings.")

    ## Clean the method options and create the options dictionary
    allowed_kwargs = ['leaf_size', 'num_tables', 'num_projections_per_table']
    _method_options = {}

    for k, v in kwargs.items():
        if k in allowed_kwargs:
            _method_options[k] = v
        else:
            raise _ToolkitError(
                "'{}' is not a valid keyword argument".format(k) +
                " for the nearest neighbors model. Please " +
                "check for capitalization and other typos.")

    ## Exclude inappropriate combinations of method and distance
    if method == 'ball_tree' and (
            distance == 'cosine'
            or distance == _graphlab.distances.cosine
            or distance == 'dot_product'
            or distance == _graphlab.distances.dot_product
            or distance == 'transformed_dot_product'
            or distance == _graphlab.distances.transformed_dot_product):
        raise TypeError(
            "The ball tree method does not work with 'cosine', " +
            "'dot_product', or 'transformed_dot_product' distance. " +
            "Please use the 'brute_force' method for these distances.")

    ## Choose the distance-specific LSH default unless the caller set one.
    if method == 'lsh' and ('num_projections_per_table' not in _method_options):
        if distance == 'jaccard' or distance == _graphlab.distances.jaccard:
            _method_options['num_projections_per_table'] = 4
        elif distance == 'cosine' or distance == _graphlab.distances.cosine:
            _method_options['num_projections_per_table'] = 16
        else:
            _method_options['num_projections_per_table'] = 8

    ## Initial validation and processing of the label
    if label is None:
        # No label given: synthesize a row-number column with a name that does
        # not collide with any existing column.
        _label = _robust_column_name('__id', dataset.column_names())
        _dataset = dataset.add_row_number(_label)
    else:
        _label = label
        _dataset = _copy.copy(dataset)

    col_type_map = {c: _dataset[c].dtype() for c in _dataset.column_names()}
    _validate_row_label(_label, col_type_map)
    ref_labels = _dataset[_label]

    ## Determine the internal list of available feature names (may still
    #  include the row label name).
    if features is None:
        _features = _dataset.column_names()
    else:
        _features = _copy.deepcopy(features)

    ## Check if there's only one feature and it's the same as the row label.
    #  This would also be trapped by the composite distance validation, but the
    #  error message is not very informative for the user.
    free_features = set(_features).difference([_label])
    if len(free_features) < 1:
        raise _ToolkitError("The only available feature is the same as the " +
                            "row label column. Please specify features " +
                            "that are not also row labels.")

    ### Validate and preprocess the distance function
    ### ---------------------------------------------
    # - The form of the 'distance' controls how we interact with the 'features'
    #   parameter as well.
    # - At this point, the row label 'label' may still be in the list(s) of
    #   features.

    ## Convert any distance function input into a single composite distance.
    # distance is already a composite distance
    if isinstance(distance, list):
        distance = _copy.deepcopy(distance)

    # distance is a single name (except 'auto') or function handle.
    elif (hasattr(distance, '__call__') or
          (isinstance(distance, str) and not distance == 'auto')):
        distance = [[_features, distance, 1]]

    # distance is unspecified and needs to be constructed.
    elif distance is None or distance == 'auto':
        sample = _dataset.head()
        distance = _construct_auto_distance(_features,
                                            _dataset.column_names(),
                                            _dataset.column_types(),
                                            sample)

    else:
        raise TypeError("Input 'distance' not understood. The 'distance' " +
                        "argument must be a string, function handle, or " +
                        "composite distance.")

    ## Basic composite distance validation, remove the row label from all
    #  feature lists, and convert string distance names into distance
    #  functions.
    distance = _scrub_composite_distance_features(distance, [_label])
    distance = _convert_distance_names_to_functions(distance)
    _validate_composite_distance(distance)

    ## Raise an error if any distances are used with non-lists
    list_features_to_check = []
    sparse_distances = ['jaccard', 'weighted_jaccard', 'cosine',
                        'dot_product', 'transformed_dot_product']
    sparse_distances = [_graphlab.distances.__dict__[name]
                        for name in sparse_distances]

    for d in distance:
        feature_names, dist, _ = d
        list_features = [f for f in feature_names
                         if _dataset[f].dtype() == list]

        for f in list_features:
            if dist in sparse_distances:
                list_features_to_check.append(f)
            else:
                raise TypeError(
                    "The chosen distance cannot currently be used " +
                    "on list-typed columns.")

    for f in list_features_to_check:
        only_str_lists = _validate_lists(_dataset[f], [str])
        if not only_str_lists:
            raise TypeError("Distances for sparse data, such as jaccard " +
                            "and weighted_jaccard, can only be used on " +
                            "lists containing only strings. Please modify " +
                            "any list features accordingly before creating " +
                            "the nearest neighbors model.")

    ## Raise an error if a Levenshtein component spans more than one column.
    for d in distance:
        feature_names, dist, _ = d

        if (len(feature_names) > 1) and \
                (dist == _graphlab.distances.levenshtein):
            raise ValueError(
                "Levenshtein distance cannot be used with multiple " +
                "columns. Please concatenate strings into a single " +
                "column before creating the nearest neighbors model.")

    ## Get the union of feature names and make a clean dataset.
    clean_features = _get_composite_distance_features(distance)
    sf_clean = _tkutl._toolkits_select_columns(_dataset, clean_features)

    ## Decide which method to use
    ## - If more than one distance component (specified either directly or
    #  generated automatically because distance set to 'auto'), then do brute
    #  force.
    if len(distance) > 1:
        _method = 'brute_force'

        if method != 'brute_force' and verbose is True:
            print("Defaulting to brute force instead of ball tree because " +
                  "there are multiple distance components.")

    else:
        if method == 'auto':

            # get the total number of variables. Assume the number of elements
            # in array type columns does not change
            num_variables = sum([
                len(x) if hasattr(x, '__iter__') else 1
                for x in _six.itervalues(sf_clean[0])
            ])

            # flag if all the features in the single composite are of numeric
            # type.
            numeric_type_flag = all([
                x in [int, float, list, array.array]
                for x in sf_clean.column_types()
            ])

            ## Conditions necessary for ball tree to work and be worth it
            if ((distance[0][1] in ['euclidean',
                                    'manhattan',
                                    _graphlab.distances.euclidean,
                                    _graphlab.distances.manhattan])
                    and numeric_type_flag is True
                    and num_variables <= 200):
                _method = 'ball_tree'

            else:
                _method = 'brute_force'

        else:
            _method = method

    ## Pick the right model name for the method
    if _method == 'ball_tree':
        model_name = 'nearest_neighbors_ball_tree'
        _mt._get_metric_tracker().track(
            'toolkit.nearest_neighbors_balltree.create')

    elif _method == 'brute_force':
        model_name = 'nearest_neighbors_brute_force'
        _mt._get_metric_tracker().track(
            'toolkit.nearest_neighbors_brute.create')

    elif _method == 'lsh':
        model_name = 'nearest_neighbors_lsh'
        _mt._get_metric_tracker().track('toolkit.nearest_neighbors_lsh.create')

    else:
        raise ValueError(
            "Method must be 'auto', 'ball_tree', 'brute_force', " +
            "or 'lsh'.")

    ## Package the model options
    opts = {}
    opts.update(_method_options)
    opts.update({
        'model_name': model_name,
        'ref_labels': ref_labels,
        'label': label,
        'sf_features': sf_clean,
        'composite_params': distance
    })

    ## Construct the nearest neighbors model
    if not verbose:
        _mt.main.get_server().set_log_progress(False)

    # Restore progress logging even if training fails, so a failed quiet run
    # does not leave the server silenced for subsequent calls.
    try:
        result = _graphlab.extensions._nearest_neighbors.train(opts)
    finally:
        _mt.main.get_server().set_log_progress(True)

    model_proxy = result['model']
    model = NearestNeighborsModel(model_proxy)

    return model
def create(dataset, tag_name=None, features=None, verbose=True):
    """
    Create a NearestNeighborAutotagger model, which can be used to quickly apply
    tags from a reference set of text labels to a new query set using the
    ``tag`` method.

    Parameters
    ----------
    dataset : SFrame
        Reference data. This SFrame must contain at least one column. By
        default, only the ``tag_name`` column is used as the basis for
        tagging. You may optionally include additional columns with the
        ``features`` parameter.

    tag_name : string, optional
        Name of the column in ``dataset`` with the tags. This column must
        contain string values. If ``dataset`` contains more than one column,
        ``tag_name`` must be specified.

    features : list[string], optional
        Names of the columns with features to use as the basis for tagging.
        'None' (the default) indicates that only the column specified by the
        ``tag_name`` parameter should be used. Only str or list fields are
        allowed. If a column of type list is specified, all values must be
        either of type string or convertible to type string.

    verbose : bool, optional
        If True, print verbose output during model creation.

    Returns
    -------
    out : model
        A model for quickly tagging new query observations with entries from
        `dataset`. Currently, the only implementation is the following:

        - NearestNeighborAutoTagger

    Raises
    ------
    ToolkitError
        If ``dataset`` has more than one column and ``tag_name`` is not given,
        or if ``tag_name`` or any entry of ``features`` is not a column of
        ``dataset``.

    TypeError
        If the ``tag_name`` column is not of type string, if ``features`` is
        not a list of strings, or if a feature column is neither a string nor
        a list column.

    See Also
    --------
    NearestNeighborsModel

    Examples
    --------
    First construct a toy `SFrame` of actor names, which will serve as the
    reference set for our autotagger model.

    >>> actors_sf = gl.SFrame(
            {"actor": ["Will Smith", "Tom Hanks", "Bradley Cooper",
                       "Tom Cruise", "Jude Law", "Robert Pattinson",
                       "Matt Damon", "Brad Pitt", "Johnny Depp",
                       "Leonardo DiCaprio", "Jennifer Aniston",
                       "Jessica Alba", "Emma Stone", "Cameron Diaz",
                       "Scarlett Johansson", "Mila Kunis", "Julia Roberts",
                       "Charlize Theron", "Marion Cotillard",
                       "Angelina Jolie"]})
    >>> m = gl.data_matching.nearest_neighbor_autotagger.create(
                actors_sf, tag_name="actor")

    Then we load some IMDB movie reviews into an `SFrame` and tag them using
    the model we created above. The score field in the output is a
    similarity score, indicating the strength of the match between the query
    data and the suggested reference tag.

    >>> reviews_sf = gl.SFrame(
            "s3://dato-datasets/imdb_reviews/reviews.sframe")
    >>> m.tag(reviews_sf.head(10), query_name="review", verbose=False)
    +-----------+-------------------------------+------------------+-----------------+
    | review_id |             review            |      actor       |      score      |
    +-----------+-------------------------------+------------------+-----------------+
    |     0     | Story of a man who has unn... |   Cameron Diaz   | 0.0769230769231 |
    |     0     | Story of a man who has unn... |  Angelina Jolie  | 0.0666666666667 |
    |     0     | Story of a man who has unn... | Charlize Theron  |      0.0625     |
    |     0     | Story of a man who has unn... | Robert Pattinson | 0.0588235294118 |
    |     1     | Bromwell High is a cartoon... |   Jessica Alba   |      0.125      |
    |     1     | Bromwell High is a cartoon... | Jennifer Aniston |       0.1       |
    |     1     | Bromwell High is a cartoon... | Charlize Theron  |       0.05      |
    |     1     | Bromwell High is a cartoon... | Robert Pattinson |  0.047619047619 |
    |     1     | Bromwell High is a cartoon... | Marion Cotillard |  0.047619047619 |
    |     2     | Airport '77 starts as a br... |  Julia Roberts   | 0.0961538461538 |
    |    ...    |              ...              |       ...        |       ...       |
    +-----------+-------------------------------+------------------+-----------------+

    The initial results look a little noisy. To filter out obvious spurious
    matches, we can set the `tag` method's similarity_threshold parameter.

    >>> m.tag(reviews_sf.head(1000), query_name="review", verbose=False,
              similarity_threshold=.8)
    +-----------+-------------------------------+------------------+----------------+
    | review_id |             review            |      actor       |     score      |
    +-----------+-------------------------------+------------------+----------------+
    |    341    | I caught this film at a te... |  Julia Roberts   | 0.857142857143 |
    |    657    | Fairly funny Jim Carrey ve... | Jennifer Aniston | 0.882352941176 |
    |    668    | A very funny movie. It was... | Jennifer Aniston | 0.833333333333 |
    |    673    | This film is the best film... | Jennifer Aniston |     0.9375     |
    +-----------+-------------------------------+------------------+----------------+

    In this second example, you'll notice that the ``review_id`` column is much
    more sparse. This is because all results whose score was below the specified
    similarity threshold (.8) were excluded from the output.
    """
    # validate the 'dataset' input
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    # ensure that tag_name is provided if dataset has > 1 column
    if dataset.num_cols() > 1 and not tag_name:
        raise _ToolkitError("No tag_name parameter specified on dataset "
                            "with %d columns" % dataset.num_cols())
    tag_name = tag_name or dataset.column_names()[0]

    # ensure that column with name tag_name exists
    if tag_name not in dataset.column_names():
        raise _ToolkitError('No column named "%s" in dataset' % tag_name)

    # ensure that column is of type string
    if dataset[tag_name].dtype() != str:
        raise TypeError("The column used as the tag name must be of type "
                        "string.")

    # use reasonable default for general case
    distance = _gl.distances.weighted_jaccard

    # if additional features are specified, ensure they are of appropriate
    # types: reject anything that is not a list, and any list that contains a
    # non-string entry.
    if features and (not isinstance(features, list) or
                     not all(isinstance(x, str) for x in features)):
        raise TypeError("The feature parameter must be a list of strings "
                        "and those strings must correspond to columns in "
                        "`dataset`.")

    # at a minimum, this SFrame will contain the tags as features;
    features = features or []
    features = [tag_name] + [x for x in features if x != tag_name]

    # ensure that each specified feature column is either of type list or str
    column_names = set(dataset.column_names())
    for col_name in features:
        if col_name not in column_names:
            # Report the column that is actually missing.
            raise _ToolkitError("Specified feature column (%s) not found "
                                "in dataset" % col_name)

        if dataset.select_column(col_name).dtype() not in (str, list):
            raise TypeError("Only string and list columns are allowed as "
                            "features.")

    # concatenate the feature columns into a single column
    features_sf = dataset.select_columns(features)
    feature_col, features_sf = _concat_string_features(features_sf, features)

    # compute features
    if verbose:
        _logging.getLogger().info("Extracting features...")

    features = _preprocess(features_sf.select_column(feature_col))

    # group by tag_name to ensure that tags are unique
    feature_cols = features.column_names()
    select_cols = {col_name: _gl.aggregate.SELECT_ONE(col_name)
                   for col_name in feature_cols}
    features.add_column(dataset[tag_name], tag_name)
    features = features.groupby(tag_name, select_cols)

    # create nearest neighbors model
    m = _gl.nearest_neighbors.create(
        features, label=tag_name, distance=distance,
        features=feature_cols, verbose=verbose)

    # add standard toolkit state attributes
    state = {"training_time": m.get("training_time"),
             "tag_name": tag_name,
             "verbose": verbose,
             "num_examples": len(features),
             "features": feature_cols,
             "num_features": len(feature_cols),
             "distance": m.get("distance")}

    model = NearestNeighborAutoTagger(m, state)
    model.summary()

    return model
def predict_topk(self, dataset, max_neighbors=10, radius=None, k=3,
                 verbose=False):
    """
    Compute the ``k`` most likely classes for every row of ``dataset``.

    Each query row's nearest neighbors vote for their own class; the classes
    with the most votes are reported together with their share of the votes.
    The result is an SFrame with three columns: `row_id`, `class`, and
    `probability`.

    Parameters
    ----------
    dataset : SFrame
        Dataset of new observations. Must include the features used for
        model training, but does not require a target column. Additional
        columns are ignored.

    max_neighbors : int, optional
        Maximum number of neighbors to consider for each point.

    radius : float, optional
        Maximum distance from each point to a neighbor in the reference
        dataset.

    k : int, optional
        Number of classes to return for each input example.

    verbose : bool, optional
        Passed through to the underlying nearest neighbors query.

    Returns
    -------
    out : SFrame

    See Also
    --------
    create, classify, predict

    Notes
    -----
    - If the 'radius' parameter is small, it is possible that a query point
      has no neighbors in the training dataset. In this case, the query is
      dropped from the SFrame output by this method. If all queries have no
      neighbors, then the result is an empty SFrame. If the target column in
      the training dataset has missing values, these predictions will be
      ambiguous.

    - Ties between predicted classes are broken randomly.

    Examples
    --------
    >>> sf_train = graphlab.SFrame({'species': ['cat', 'dog', 'fossa', 'dog'],
    ...                             'height': [9, 25, 20, 23],
    ...                             'weight': [13, 28, 33, 22]})
    ...
    >>> sf_new = graphlab.SFrame({'height': [26, 19],
    ...                           'weight': [25, 35]})
    ...
    >>> m = graphlab.nearest_neighbor_classifier.create(sf_train, target='species')
    >>> ystar = m.predict_topk(sf_new, max_neighbors=2)
    >>> print(ystar)
    +--------+-------+-------------+
    | row_id | class | probability |
    +--------+-------+-------------+
    |   0    |  dog  |     1.0     |
    |   1    | fossa |     0.5     |
    |   1    |  dog  |     0.5     |
    +--------+-------+-------------+
    """
    _mt._get_metric_tracker().track(
        'toolkit.classifier.nearest_neighbor_classifier.predict_topk')

    # 'k' is checked here; 'radius' is left to the nearest neighbors query.
    k_is_valid = isinstance(k, int) and k >= 1
    if not k_is_valid:
        raise TypeError("The number of results to return for each point, 'k', "
                        "must be an integer greater than 0.")

    # The query data must be a non-empty SFrame.
    _raise_error_if_not_sframe(dataset, "dataset")
    _raise_error_if_sframe_empty(dataset, "dataset")

    # 'max_neighbors' is passed to the query under the name 'k', so it is
    # validated here to give an error message with the right parameter name.
    if max_neighbors is not None and not isinstance(max_neighbors, int):
        raise ValueError("Input 'max_neighbors' must be an integer.")
    if max_neighbors is not None and max_neighbors <= 0:
        raise ValueError("Input 'max_neighbors' must be larger than 0.")

    # Look up the neighbors of every query row.
    neighbors = self._knn_model.query(dataset, k=max_neighbors,
                                      radius=radius, verbose=verbose)

    # No neighbors for any query: hand back an empty, correctly typed SFrame.
    if neighbors.num_rows() == 0:
        empty = _gl.SFrame({'row_id': [], 'class': [], 'probability': []})
        empty['row_id'] = empty['row_id'].astype(int)
        empty['class'] = empty['class'].astype(str)
        return empty

    # Count votes per (query, class) pair and gather them per query.
    vote_counts = neighbors.groupby(['query_label', 'reference_label'],
                                    _gl.aggregate.COUNT)
    predictions = vote_counts.unstack(column=['reference_label', 'Count'],
                                      new_column_name='votes')

    predictions['topk'] = predictions['votes'].apply(
        lambda votes: _sort_topk_votes(votes, k))
    predictions['total_votes'] = predictions['votes'].apply(
        lambda votes: sum(votes.values()))

    # Expand the top-k entries to one row per (query, class) and tidy up the
    # column names.
    predictions = predictions.stack('topk', new_column_name='topk')
    predictions = predictions.unpack('topk')
    predictions.rename({'topk.class': 'class', 'query_label': 'row_id'})
    predictions['probability'] = \
        predictions['topk.votes'] / predictions['total_votes']

    return predictions[['row_id', 'class', 'probability']]
def tag(self, dataset, query_name=None, k=5, similarity_threshold=None,
        exclude_zeros=True, verbose=True):
    """
    Match the reference tags passed when a model is created to a new set of
    queries. This is a many-to-many match: each query may have any number of
    occurrences of a reference tag.

    Parameters
    ----------
    dataset : SFrame
        Query data to be tagged.

    query_name : string, optional
        Name of the column in ``dataset`` to be auto-tagged. If ``dataset``
        has more than one column, ``query_name`` must be specified.

    k : int, optional
        Number of results to return from the reference set for each query
        observation. The default is 5, but setting it to ``None`` will
        return all results whose score is greater than or equal to
        ``similarity_threshold``.

    similarity_threshold : float, optional
        Only results whose score is greater than or equal to the specified
        ``similarity_threshold`` are returned. The default is ``None``, in
        which case the ``k`` best results are returned for each query point.

    verbose : bool, optional
        If True, print progress updates and model details.

    exclude_zeros : boolean, optional
        If True, only entries for which there is a tag with a nonzero score
        are preserved in the output. This is the default behavior.

    Returns
    -------
    out : SFrame
        An SFrame with four columns:

        - row ID
        - column name specified as `tag_name` parameter to `create` method
        - column name specified as `query_name` parameter to `tag` method
        - a similarity score between 0 and 1, indicating the strength of the
          match between the query data and the suggested reference tag,
          where a score of zero indicates a poor match and a strength of 1
          corresponds to a perfect match

    Raises
    ------
    ToolkitError
        If neither ``k`` nor ``similarity_threshold`` is set, if
        ``query_name`` is missing for a multi-column ``dataset``, if the
        query column is not in ``dataset``, or if ``similarity_threshold``
        is not a number between 0 and 1.

    Notes
    -----
    - By default, only rows for which there is a tag with a nonzero score
      are included in the output. To guarantee at least one output row for
      every input row in ``dataset``, set the ``exclude_zeros`` parameter
      to False.

    - If both ``k`` and ``similarity_threshold`` are set to ``None``, a
      ToolkitError is raised.

    - ``k`` and ``similarity_threshold`` are tested for truthiness, so a
      value of 0 is treated the same as ``None`` for either parameter.

    Examples
    --------
    First construct a toy `SFrame` of actor names, which will serve as the
    reference set for our autotagger model.

    >>> actors_sf = gl.SFrame(
            {"actor": ["Will Smith", "Tom Hanks", "Bradley Cooper",
                       "Tom Cruise", "Jude Law", "Robert Pattinson",
                       "Matt Damon", "Brad Pitt", "Johnny Depp",
                       "Leonardo DiCaprio", "Jennifer Aniston",
                       "Jessica Alba", "Emma Stone", "Cameron Diaz",
                       "Scarlett Johansson", "Mila Kunis", "Julia Roberts",
                       "Charlize Theron", "Marion Cotillard",
                       "Angelina Jolie"]})
    >>> m = gl.data_matching.autotagger.create(actors_sf, tag_name="actor")

    Then we load some IMDB movie reviews into an `SFrame` and tag them using
    the model we created above. The score field in the output is a
    similarity score, indicating the strength of the match between the query
    data and the suggested reference tag.

    >>> reviews_sf = gl.SFrame(
            "https://static.turi.com/datasets/imdb_reviews/reviews.sframe")
    >>> m.tag(reviews_sf.head(10), query_name="review", verbose=False)
    +-----------+-------------------------------+------------------+-----------------+
    | review_id |             review            |      actor       |      score      |
    +-----------+-------------------------------+------------------+-----------------+
    |     0     | Story of a man who has unn... |   Cameron Diaz   | 0.0769230769231 |
    |     0     | Story of a man who has unn... |  Angelina Jolie  | 0.0666666666667 |
    |     0     | Story of a man who has unn... | Charlize Theron  |      0.0625     |
    |     0     | Story of a man who has unn... | Robert Pattinson | 0.0588235294118 |
    |     1     | Bromwell High is a cartoon... |   Jessica Alba   |      0.125      |
    |     1     | Bromwell High is a cartoon... | Jennifer Aniston |       0.1       |
    |     1     | Bromwell High is a cartoon... | Charlize Theron  |       0.05      |
    |     1     | Bromwell High is a cartoon... | Robert Pattinson |  0.047619047619 |
    |     1     | Bromwell High is a cartoon... | Marion Cotillard |  0.047619047619 |
    |     2     | Airport '77 starts as a br... |  Julia Roberts   | 0.0961538461538 |
    |    ...    |              ...              |       ...        |       ...       |
    +-----------+-------------------------------+------------------+-----------------+

    The initial results look a little noisy. To filter out obvious spurious
    matches, we can set the `tag` method's `similarity_threshold` parameter.

    >>> m.tag(reviews_sf.head(1000), query_name="review", verbose=False,
              similarity_threshold=.8)
    +-----------+-------------------------------+------------------+----------------+
    | review_id |             review            |      actor       |     score      |
    +-----------+-------------------------------+------------------+----------------+
    |    341    | I caught this film at a te... |  Julia Roberts   | 0.857142857143 |
    |    657    | Fairly funny Jim Carrey ve... | Jennifer Aniston | 0.882352941176 |
    |    668    | A very funny movie. It was... | Jennifer Aniston | 0.833333333333 |
    |    673    | This film is the best film... | Jennifer Aniston |     0.9375     |
    +-----------+-------------------------------+------------------+----------------+
    """
    _mt._get_metric_tracker().track(self.__module__ + '.tag')

    # validate the 'dataset' input
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    # ensure that either k or similarity_threshold is set
    if not (k or similarity_threshold):
        raise _ToolkitError("Either k or similarity_threshold parameters "
                            "must be set")

    # ensure that query_name is provided if dataset has > 1 column
    if dataset.num_cols() > 1 and not query_name:
        raise _ToolkitError("No query_name parameter specified on "
                            "dataset with %d columns" % dataset.num_cols())

    query_column = query_name or dataset.column_names()[0]

    # ensure that the query column exists in the dataset
    if query_column not in dataset.column_names():
        raise _ToolkitError('No column named "%s" in dataset'
                            % query_column)

    # Keep the raw query values next to a row id so they can be joined back
    # onto the nearest neighbor results below.
    query_sa = dataset.select_column(query_column)
    query_sf = _gl.SFrame({
        "id": range(len(query_sa)),
        query_column: query_sa
    })

    features = _preprocess(query_sa)
    features = features.add_row_number()

    if similarity_threshold:
        if not isinstance(similarity_threshold, (float, int)):
            # The original message was missing the space between the two
            # string fragments ("must be afloat or an int.").
            raise _ToolkitError("similarity_threshold parameter must be a "
                                "float or an int.")

        if similarity_threshold < 0 or similarity_threshold > 1:
            raise _ToolkitError("similarity_threshold parameter must be "
                                "between 0 and 1.")

    # The nearest neighbors query works with distances, so the similarity
    # threshold is converted into a search radius.
    radius = (1 - similarity_threshold) if similarity_threshold else None

    results = self.__proxy__['nearest_neighbors_model'].query(
        features, label="id", k=k, radius=radius, verbose=verbose)

    # return empty SFrame immediately if no NN results
    if len(results) == 0:
        return _gl.SFrame({
            query_column + "_id": [],
            query_column: [],
            self.get("tag_name"): [],
            "score": []
        })

    # Attach the original query text: the joined text column temporarily
    # takes over the 'query_label' name, while the row ids move to
    # '<query_column>_id'.
    results = results.join(query_sf, on={"query_label": "id"})
    results.rename({"query_label": query_column + "_id"})
    results.rename({query_column: "query_label"})

    # convert distances to similarity scores
    scores = _dists_to_sim_scores("weighted_jaccard", results)
    results.add_column(scores, "score")
    results.remove_column("distance")
    results.remove_column("rank")
    results.rename({
        "reference_label": self.get("tag_name"),
        "query_label": query_column
    })
    results.swap_columns(self.get("tag_name"), query_column)

    if exclude_zeros:
        try:
            results = results.filter_by(0.0, "score", exclude=True)
        except RuntimeError:
            # nothing to join
            # Logger.warn is a deprecated alias of Logger.warning.
            _logging.getLogger(__name__).warning(
                "Empty results after filtering scores of 0.")
            results = results.head(0)

    return results
def link(self, dataset, k=5, radius=None, verbose=True):
    """
    Find matching records from the reference dataset (entered when the
    model was created) for each record in the 'dataset' passed to this
    function. The query dataset must include columns with the same names as
    the label and feature columns used to create the RecordLinker model.

    Parameters
    ----------
    dataset : SFrame
        Query data. Must contain columns with the same names and types as
        the features used to train the model. Additional columns are
        allowed, but ignored. Please see the nearest neighbors
        :func:`~graphlab.nearest_neighbors.create` documentation for more
        detail on allowable data types.

    k : int, optional
        Maximum number of nearest neighbors to return from the reference set
        for each query observation. The default is 5, but setting it to
        ``None`` will return all neighbors within ``radius`` of the query
        point.

    radius : float, optional
        Only neighbors whose distance to a query point is smaller than this
        value are returned. The default is ``None``, in which case the ``k``
        nearest neighbors are returned for each query point, regardless of
        distance.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : SFrame
        An SFrame with the k-nearest neighbors of each query observation.
        The result contains four columns: the first is the row label of the
        query observation, the second is the row label of the nearby
        reference observation, the third is the distance between the query
        and reference observations, and the fourth is the rank of the
        reference observation among the query's k-nearest neighbors.

    Notes
    -----
    - If both ``k`` and ``radius`` are set to ``None``, each query point
      returns all of the reference set. If the reference dataset has
      :math:`n` rows and the query dataset has :math:`m` rows, the output is
      an SFrame with :math:`nm` rows.

    Examples
    --------
    Assume we've created the model from the example in the RecordLinker
    'create' function.

    >>> queries = graphlab.SFrame({'sqft': [986, 1320],
    ...                            'street': ['fremont', 'phiney'],
    ...                            'city': ['sea', 'seattle'],
    ...                            'state': ['WA', 'WA']})
    ...
    >>> model.link(queries, k=2, radius=5.)
    +-------------+-----------------+----------+------+
    | query_label | reference_label | distance | rank |
    +-------------+-----------------+----------+------+
    |      0      |        0        |   4.0    |  1   |
    |      0      |        2        |   5.0    |  2   |
    |      1      |        0        |   0.0    |  1   |
    +-------------+-----------------+----------+------+
    """
    _mt._get_metric_tracker().track(self.__module__ + '.link_records')

    ## Validate the 'dataset' input.
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    ## Make sure all of the necessary features are present at 'link' time.
    #  Only the check matters here (presumably the helper raises on a missing
    #  column - verify against _tkutl); the selected columns are not used.
    _tkutl._toolkits_select_columns(dataset, self.get('features'))

    ## Clean and impute string data. *** Think about consolidating this and
    #  the next step into a feature transformer.***
    col_types = dict(zip(dataset.column_names(), dataset.column_types()))

    # Work on copies so neither the caller's SFrame nor the model's stored
    # distance is modified.
    _dataset = _copy.copy(dataset)
    _distance = _copy.deepcopy(self.__proxy__['distance'])

    for ftr in self.get('features'):
        if col_types[ftr] != str:
            continue

        new_ftr = '__clean.' + ftr
        _dataset[new_ftr] = _dataset[ftr].fillna("")
        _dataset[new_ftr] = _dataset[new_ftr].apply(
            lambda x: _dmutl.cleanse_string(x), dtype=str)

        # Point every distance component at the cleaned column instead of
        # the raw string column.
        for component in _distance:
            component[0] = [new_ftr if name == ftr else name
                            for name in component[0]]

    ## Convert strings to dicts and concatenate string features.
    _dataset, _ = _engineer_distance_features(_dataset, _distance)

    ## Query the nearest neighbor model
    return self.__proxy__['nearest_neighbors_model'].query(
        _dataset, k=k, radius=radius, verbose=verbose)
def create(dataset, features=None, label=None, distance=None, num_neighbors=5,
           threshold_distances=True, verbose=True):
    """
    Create a :class:`LocalOutlierFactorModel`. This model contains local
    outlier factor (LOF) scores for the training data passed to this model,
    and can predict the LOF score for new observations.

    The LOF method scores each data instance by computing the ratio of the
    average densities of the instance's neighbors to the density of the
    instance itself. The higher the score, the more likely the instance is to
    be an outlier *relative to its neighbors*. A score of 1 or less means that
    an instance has a density similar (or higher) to its neighbors and is
    unlikely to be an outlier.

    The model created by this function contains an SFrame called 'scores' that
    contains the computed local outlier factors. The `scores` SFrame has four
    columns:

        - *row_id*: the row index of the instance in the input dataset. If a
          label column is passed, the labels (and the label name) are passed
          through to this column in the output.

        - *density*: the density of instance as estimated by the LOF
          procedure.

        - *neighborhood_radius*: the distance from the instance to its
          furthest neighbor (defined by 'num_neighbors', and used for
          predicting the LOF for new points). This column is only added when
          'threshold_distances' is True.

        - *anomaly_score*: the local outlier factor.

    For more information on the LOF method and the computation used for each of
    these columns, please see the Notes and References sections below.

    Parameters
    ----------
    dataset : SFrame
        Input dataset. The 'dataset' SFrame must include the features
        specified in the 'features' or 'distance' parameter (additional
        columns are ignored).

    features : list[string], optional
        Names of feature columns. 'None' (the default) indicates that all
        columns should be used. Each column can be one of the following types:

        - *Numeric*: values of numeric type integer or float.

        - *Array*: array of numeric (integer or float) values. Each array
          element is treated as a separate variable in the model.

        - *Dictionary*: key-value pairs with numeric (integer or float) values.
          Each key indicates a separate variable in the model.

        - *String*: string values.

        Please note: if 'distance' is specified as a composite distance, then
        that parameter controls which features are used in the model. Also note
        that the column of row labels is automatically removed from the
        features, if there is a conflict.

    label : str, optional
        Name of the input column containing row labels. The values in this
        column must be integers or strings. If not specified, row numbers are
        used by default.

    distance : string or list[list], optional
        Function to measure the distance between any two input data rows. If
        left unspecified, a distance function is automatically constructed
        based on the feature types. The distance may be specified by either a
        string or composite distance:

        - *String*: the name of a standard distance function. One of
          'euclidean', 'squared_euclidean', 'manhattan', 'levenshtein',
          'jaccard', 'weighted_jaccard', 'cosine', or 'dot_product'. Please see
          the :mod:`distances` module for more details.

        - *Composite distance*: the weighted sum of several standard distance
          functions applied to various features. This is specified as a list of
          distance components, each of which is itself a list containing three
          items:

          1. list or tuple of feature names (strings)

          2. standard distance name (string)

          3. scaling factor (int or float)

    num_neighbors : int, optional
        Number of neighbors to consider for each point. If larger than the
        number of rows in 'dataset', it is reset to the number of rows.

    threshold_distances : bool, optional
        If True (the default), the distance between two points is thresholded.
        This reduces noise and can improve the quality of results, but at the
        cost of slower computation. See the notes below for more detail.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    model : LocalOutlierFactorModel
        A trained :class:`LocalOutlierFactorModel`, which contains an SFrame
        called 'scores' that includes the 'anomaly score' for each input
        instance.

    Raises
    ------
    TypeError
        If 'num_neighbors' is not an integer.

    ValueError
        If 'num_neighbors' is not larger than 0.

    See Also
    --------
    LocalOutlierFactorModel, graphlab.toolkits.nearest_neighbors

    Notes
    -----
    - The LOF method scores each data instance by computing the ratio of the
      average densities of the instance's neighbors to the density of the
      instance itself. According to the LOF method, the estimated density of a
      point :math:`p` is the number of :math:`p`'s neighbors divided by the sum
      of distances to the instance's neighbors. In the following, suppose
      :math:`N(p)` is the set of neighbors of point :math:`p`, :math:`k` is the
      number of points in this set (i.e. the 'num_neighbors' parameter), and
      :math:`d(p, x)` is the distance between points :math:`p` and :math:`x`
      (also based on a user-specified distance function).

      .. math:: \\hat{f}(p) = \\frac{k}{\\sum_{x \\in N(p)} d(p, x)}

    - The LOF score for point :math:`p` is then the ratio of the average
      density of :math:`p`'s neighbors to :math:`p`'s own density:

      .. math:: LOF(p) = \\frac{\\frac{1}{k} \\sum_{x \\in N(p)} \\hat{f}(x)}{\\hat{f}(p)}

    - If the 'threshold_distances' flag is set to True, exact distances are
      replaced by "thresholded" distances. Suppose :math:`r_k(x)` is the
      distance from :math:`x` to its :math:`k`'th nearest neighbor. Then the
      thresholded distance from point :math:`p` to point :math:`x` is

      .. math:: d^*(p, x) = \\max\\{r_k(x), d(p, x)\\}

      This adaptive thresholding is used in the original LOF paper (see the
      References section) to reduce noise in the computed distances and improve
      the quality of the final LOF scores.

    - For features that all have the same type, the distance parameter may be a
      single standard distance function name (e.g. "euclidean"). In the model,
      however, all distances are first converted to composite distance
      functions; as a result, the 'distance' field in the model is always a
      composite distance.

    - Standardizing features is often a good idea with distance-based methods,
      but this model does *not* standardize features.

    - If there are several observations located at an identical position, the
      LOF values can be undefined. An LOF score of "nan" means that a point is
      either in or near a set of co-located points.

    - This implementation of LOF forces the neighborhood of each data instance
      to contain exactly 'num_neighbors' points, breaking ties arbitrarily.
      This differs from the original LOF paper (see References below), which
      allows neighborhoods to expand if there are multiple neighbors at exactly
      the same distance from an instance.

    References
    ----------
    - Breunig, M. M., Kriegel, H., Ng, R. T., & Sander, J. (2000). `LOF:
      Identifying Density-Based Local Outliers
      <http://people.cs.vt.edu/badityap/classes/cs6604-Fall13/readings/breunig-2000.pdf>`_,
      pp 1-12.

    Examples
    --------
    >>> sf = graphlab.SFrame({'x0': [0., 1., 1., 0., 1., 0., 5.],
    ...                       'x1': [2., 1., 0., 1., 2., 1.5, 2.5]})
    >>> lof = graphlab.local_outlier_factor.create(sf, num_neighbors=3)
    >>> lof['scores']
    +--------+----------------+----------------+---------------------+
    | row_id |    density     | anomaly_score  | neighborhood_radius |
    +--------+----------------+----------------+---------------------+
    |   0    | 0.927050983125 | 1.03785526045  |         1.0         |
    |   3    | 0.962144739546 | 0.919592692017 |         1.0         |
    |   1    | 0.765148090776 | 1.14822979837  |         1.0         |
    |   6    | 0.230412599692 | 3.52802012342  |    4.71699056603    |
    |   2    | 0.71140803489  | 1.26014768739  |    1.80277563773    |
    |   5    | 0.962144739546 | 0.919592692017 |    1.11803398875    |
    |   4    | 0.962144739546 | 0.919592692017 |    1.11803398875    |
    +--------+----------------+----------------+---------------------+
    [7 rows x 4 columns]
    """

    ## Start the training time clock and instantiate an empty model
    _mt._get_metric_tracker().track(
        'toolkit.anomaly_detection.local_outlier_factor.create')

    logger = _logging.getLogger(__name__)
    start_time = _time.time()

    ## Validate the input dataset
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    ## Validate the number of neighbors, mostly to make the error message use
    #  the right parameter name.
    if not isinstance(num_neighbors, int):
        raise TypeError("Input 'num_neighbors' must be an integer.")

    if num_neighbors <= 0:
        raise ValueError("Input 'num_neighbors' must be larger than 0.")

    # NOTE(review): self-edges are excluded from the similarity graph below,
    # so a point may have fewer than 'num_neighbors' neighbors when
    # 'num_neighbors' equals the number of rows - confirm this is intended.
    if num_neighbors > dataset.num_rows():
        num_neighbors = dataset.num_rows()

        if verbose:
            logger.info("Input 'num_neighbors' is larger than the number " +
                        "of rows in the input 'dataset'. Resetting " +
                        "'num_neighbors' to the dataset length.")

    ## Validate the row label against the features *using the nearest neighbors
    #  tool with only one row of data. This is a hack - we should encapsulate
    #  the validation steps in nearest neighbors and do them here first.
    validation_model = _gl.nearest_neighbors.create(dataset[:1], label=label,
                                                    features=features,
                                                    distance=distance,
                                                    method='brute_force',
                                                    verbose=False)

    ## Compute the similarity graph based on k and radius, without self-edges,
    #  but keep it in the form of an SFrame. Do this *without* the row label,
    #  because I need to sort on the row number, and row labels that aren't
    #  already in order will be screwed up.
    knn_model = _gl.nearest_neighbors.create(dataset,
                                             distance=validation_model.distance,
                                             method='brute_force',
                                             verbose=verbose)

    knn = knn_model.similarity_graph(k=num_neighbors, radius=None,
                                     include_self_edges=False,
                                     output_type='SFrame',
                                     verbose=verbose)

    ## Bias the distances by making them at least equal to the *reference*
    #  point's k'th neighbor radius. This is "reach-distance" in the original
    #  paper.
    if threshold_distances is True:
        radii = knn.groupby('query_label',
                            {'neighborhood_radius': _gl.aggregate.MAX('distance')})

        knn = knn.join(radii, on={'reference_label': 'query_label'},
                       how='left')

        knn['distance'] = knn.apply(
            lambda x: x['distance'] if x['distance'] > x['neighborhood_radius'] \
                 else x['neighborhood_radius'])

    ## Find the sum of distances from each point to its neighborhood, then
    #  compute the "local reachability density (LRD)". This is not remotely a
    #  valid density estimate, but it does have the form of mass / volume,
    #  where the mass is estimated by the number of neighbors in point x's
    #  neighborhood, and the volume is estimated by the sum of the distances
    #  between x and its neighbors.
    #
    ## NOTE: if a vertex is co-located with all of its neighbors, the sum of
    #  distances will be 0, in which case the inverse distance sum value is
    #  'inf'.
    scores = knn.groupby('query_label',
                         {'dist_sum': _gl.aggregate.SUM('distance')})

    scores['density'] = float(num_neighbors) / scores['dist_sum']

    ## Join the density of each point back to the nearest neighbors results,
    #  then get the average density of each point's neighbors' densities.
    knn = knn.join(scores, on={'reference_label': 'query_label'}, how='left')
    scores2 = knn.groupby('query_label',
                          {'average_neighbor_density': _gl.aggregate.AVG('density')})

    ## Combine each point's density and average neighbor density into one
    #  SFrame, then compute the local outlier factor (LOF).
    #  Both SFrames are sorted on 'query_label' first so that the column
    #  assignment below lines up row by row.
    scores = scores.sort('query_label')
    scores2 = scores2.sort('query_label')
    scores['anomaly_score'] = scores2['average_neighbor_density'] / scores['density']

    ## Add each point's neighborhood radius to the output SFrame.
    #  'radii' only exists when distances were thresholded above.
    if threshold_distances is True:
        radii = radii.sort('query_label')
        scores['neighborhood_radius'] = radii['neighborhood_radius']

    ## Remove the extraneous columns from the output SFrame and format.
    scores = scores.remove_column('dist_sum')

    ## Substitute in the row labels.
    if label is None:
        row_label_name = 'row_id'
        scores = scores.rename({'query_label': row_label_name})

    else:
        # The user's labels replace the row numbers; this relies on 'scores'
        # being sorted by row number (see the sort above), and the label
        # column is moved to the front.
        row_label_name = label
        scores = scores.remove_column('query_label')
        col_names = scores.column_names()
        scores[row_label_name] = dataset[label]
        scores = scores[[row_label_name] + col_names]

    ## Post-processing and formatting
    state = {
        'nearest_neighbors_model': knn_model,
        'verbose': verbose,
        'threshold_distances': threshold_distances,
        'num_neighbors': num_neighbors,
        'num_examples': dataset.num_rows(),
        'distance': knn_model['distance'],
        'num_distance_components': knn_model['num_distance_components'],
        'features': knn_model['features'],
        'row_label_name': row_label_name,
        'num_features': knn_model['num_features'],
        'unpacked_features': knn_model['unpacked_features'],
        'num_unpacked_features': knn_model['num_unpacked_features'],
        'scores': scores,
        'training_time': _time.time() - start_time}

    model = LocalOutlierFactorModel(state)
    return model
def create(dataset, target, model_name, env, features=None,
           validation_set='auto', verbose=True, **kwargs):
    """
    Assemble the training options for a supervised learning model and submit
    a "distributed_supervised_train" job through ``_dml.run``.

    Parameters
    ----------
    dataset : SFrame
        Training data. Must contain the target column and the feature
        columns.

    target : str
        Name of the target column in ``dataset``.

    model_name : str
        Name of the model to train; stored in the options and passed to
        ``_dml.run``.

    env : object
        Execution environment, passed through unchanged to ``_dml.run``.

    features : list[str], optional
        Names of the feature columns. ``None`` (the default) uses every
        column of ``dataset`` except ``target``.

    validation_set : SFrame or 'auto' or None, optional
        Validation data. With 'auto' (the default), a random 5 percent of
        ``dataset`` is split off when ``dataset`` has at least 100 rows;
        smaller datasets get no validation set. ``None`` disables
        validation.

    verbose : bool, optional
        If True, print a progress message when a validation set is split
        off automatically.

    **kwargs : optional
        Additional training options. Keys are lower-cased before being
        added to the job options.

    Returns
    -------
    out
        The object returned by ``_dml.run``.

    Raises
    ------
    TypeError
        If ``validation_set`` is a string other than 'auto' or is neither
        ``None`` nor an SFrame, if ``features`` is a single string or not
        iterable, or if any feature name is not of type str.
    """
    _raise_error_if_not_sframe(dataset, "training dataset")

    # Create a validation set
    if isinstance(validation_set, str):
        if validation_set == 'auto':
            if dataset.num_rows() >= 100:
                if verbose:
                    print(
                        "PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.\n"
                        " You can set ``validation_set=None`` to disable validation tracking.\n"
                    )
                dataset, validation_set = dataset.random_split(.95)
            else:
                validation_set = None
        else:
            raise TypeError('Unrecognized value for validation_set.')

    # Target
    target_sframe = _toolkits_select_columns(dataset, [target])

    # Features
    if features is None:
        features = dataset.column_names()
        features.remove(target)

    # A bare string is iterable on Python 3 and would otherwise be treated
    # as a list of one-character feature names.
    if isinstance(features, str) or not hasattr(features, '__iter__'):
        raise TypeError("Input 'features' must be a list.")

    # Report the first offending name explicitly. The original message
    # relied on the list-comprehension variable leaking out of its scope,
    # which is a NameError on Python 3 and the *last* element on Python 2.
    invalid_features = [x for x in features if not isinstance(x, str)]
    if invalid_features:
        raise TypeError(
            "Invalid feature %s: Feature names must be of type str"
            % (invalid_features[0],))

    features_sframe = _toolkits_select_columns(dataset, features)

    # User options first (keys lower-cased), then the required entries so
    # that they cannot be overridden through kwargs.
    options = {key.lower(): value for key, value in kwargs.items()}
    options.update({
        'target': target_sframe,
        'features': features_sframe,
        'model_name': model_name
    })

    if validation_set is not None:
        if not isinstance(validation_set, _graphlab.SFrame):
            raise TypeError(
                "validation_set must be either 'auto' or an SFrame matching the training data."
            )

        # Attempt to append the two datasets together to check schema
        validation_set.head().append(dataset.head())

        options.update({
            'features_validation':
                _toolkits_select_columns(validation_set, features),
            'target_validation':
                _toolkits_select_columns(validation_set, [target])
        })

    from . import _dml
    dml_obj = _dml.run("distributed_supervised_train", model_name, options,
                       env)

    return dml_obj
def predict(self, dataset, verbose=True):
    """
    Compute local outlier factors for new data. The LOF scores for new data
    instances are based on the neighborhood statistics for the data used
    when the model was created. Each new point is scored independently.

    Parameters
    ----------
    dataset : SFrame
        Dataset of new points to score with LOF against the training data
        already stored in the model.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : SArray
        LOF score for each new point. The output SArray is sorted to match
        the order of the 'dataset' input to this method.

    Examples
    --------
    >>> sf = graphlab.SFrame({'x0': [0., 1., 1., 0., 1., 0., 5.],
    ...                       'x1': [2., 1., 0., 1., 2., 1.5, 2.5]})
    >>> m = graphlab.local_outlier_factor.create(sf, num_neighbors=3)
    ...
    >>> sf_new = graphlab.SFrame({'x0': [0.5, 4.5],
    ...                           'x1': [1., 4.0]})
    >>> m.predict(sf_new)
    dtype: float
    Rows: 2
    [0.9317508614964032, 2.905646339288692]
    """
    _mt._get_metric_tracker().track(
        'toolkit.anomaly_detection.local_outlier_factor.predict')

    ## Validate the input dataset
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    state = self.__proxy__
    num_neighbors = state['num_neighbors']

    ## Query the knn model with the new points, then attach the reference
    #  data's neighborhood statistics to every returned neighbor.
    neighbors = state['nearest_neighbors_model'].query(
        dataset, k=num_neighbors, verbose=verbose)
    neighbors = neighbors.join(state['scores'],
                               on={'reference_label': 'row_id'},
                               how='left')

    ## Compute the reachability distance for each new point and its
    #  neighborhood: a distance is never smaller than the reference point's
    #  own neighborhood radius.
    if state['threshold_distances'] is True:
        neighbors['distance'] = neighbors.apply(
            lambda row: row['distance']
            if row['distance'] > row['neighborhood_radius']
            else row['neighborhood_radius'])

    ## Local reachability density of each query point: number of neighbors
    #  over the sum of distances to them.
    query_density = neighbors.groupby(
        'query_label', {'dist_sum': _gl.aggregate.SUM('distance')})
    query_density['density'] = \
        float(num_neighbors) / query_density['dist_sum']

    ## Average density of each query point's neighbors.
    neighbor_density = neighbors.groupby(
        'query_label',
        {'average_neighbor_density': _gl.aggregate.AVG('density')})

    ## Put both densities side by side and take their ratio, which is the
    #  local outlier factor.
    result = query_density.join(neighbor_density, on='query_label')
    result['anomaly_score'] = \
        result['average_neighbor_density'] / result['density']

    ## Order the scores by query label before returning them.
    result = result.sort('query_label', ascending=True)
    return result['anomaly_score']
def extract_features(self, dataset):
    """
    For each example in the dataset, extract the leaf indices of
    each tree as features.

    For multiclass classification, each leaf index contains #num_class
    numbers.

    The returned feature vectors can be used as input to train another
    supervised learning model such as a
    :py:class:`~graphlab.logistic_classifier.LogisticClassifier`,
    an :py:class:`~graphlab.svm_classifier.SVMClassifier`, or a
    :py:class:`~graphlab.neuralnet_classifier.NeuralNetClassifier`.

    Parameters
    ----------
    dataset : SFrame
        Dataset of new observations. Must include columns with the same
        names as the features used for model training, but does not require
        a target column. Additional columns are ignored.

    Returns
    -------
    out : SArray
        An SArray of dtype array.array containing extracted features.

    Examples
    --------
    >>> data = graphlab.SFrame('http://s3.amazonaws.com/dato-datasets/regression/houses.csv')

    >>> # Regression Tree Models
    >>> model = graphlab.boosted_trees_regression.create(data,
    ...                           target='price',
    ...                           features=['bath', 'bedroom', 'size'])
    >>> data['boosted_tree_features'] = model.extract_features(data)

    >>> model = graphlab.random_forest_regression.create(data,
    ...                           target='price',
    ...                           features=['bath', 'bedroom', 'size'])
    >>> data['random_forest_features'] = model.extract_features(data)

    >>> # Classification Tree Models
    >>> data['is_expensive'] = data['price'] > 30000
    >>> model = graphlab.boosted_trees_classifier.create(data,
    ...                           target='is_expensive',
    ...                           features=['bath', 'bedroom', 'size'])
    >>> data['boosted_tree_features'] = model.extract_features(data)

    >>> model = graphlab.random_forest_classifier.create(data,
    ...                           target='is_expensive',
    ...                           features=['bath', 'bedroom', 'size'])
    >>> data['random_forest_features'] = model.extract_features(data)
    """
    # Usage is tracked under "<module>.extract_features".
    _mt._get_metric_tracker().track(self.__module__ + '.extract_features')

    _raise_error_if_not_sframe(dataset, "dataset")

    # The toolkit call returns a dict; the features live under 'extracted'.
    options = {
        'model': self.__proxy__,
        'model_name': self.__name__,
        'dataset': dataset,
    }
    response = _toolkits_main.run('supervised_learning_feature_extraction',
                                  options)
    return _map_unity_proxy_to_object(response['extracted'])
def predict_topk(self, dataset, output_type="probability", k=3,
                 missing_value_action='auto'):
    """
    Return top-k predictions for the ``dataset``, using the trained model.
    Predictions are returned as an SFrame with three columns: `row_id`,
    `class`, and `probability`, `margin`, or `rank`, depending on the
    ``output_type`` parameter. Input dataset size must be the same as for
    training of the model.

    Parameters
    ----------
    dataset : SFrame | list | dict
        A dataset that has the same columns that were used during training.
        If the target column exists in ``dataset`` it will be ignored while
        making predictions. A list, or a single dict (which is wrapped in a
        one-element list), is sent through the low latency
        ``_fast_predict_topk`` extension instead of the SFrame path.

    output_type : {'probability', 'rank', 'margin'}, optional
        Choose the return type of the prediction:

        - `probability`: Probability associated with each label in the
          prediction.
        - `rank`       : Rank associated with each label in the prediction.
        - `margin`     : Margin associated with each label in the prediction.

    k : int, optional
        Number of classes to return for each input example.

    missing_value_action : str, optional
        Action to perform when missing values are encountered. Can be
        one of:

        - 'auto': Default to 'impute'
        - 'impute': Proceed with evaluation by filling in the missing
          values with the mean of the training data. Missing
          values are also imputed if an entire column of data is
          missing during evaluation.
        - 'error': Do not proceed with evaluation and terminate with
          an error message.

    Returns
    -------
    out : SFrame
        An SFrame with model predictions.

    See Also
    --------
    predict, classify, evaluate

    Examples
    --------
    >>> pred = m.predict_topk(validation_data, k=3)
    >>> pred
    +--------+-------+-------------------+
    | row_id | class |    probability    |
    +--------+-------+-------------------+
    |   0    |   4   |   0.995623886585  |
    |   0    |   9   |  0.0038311756216  |
    |   0    |   7   | 0.000301006948575 |
    |   1    |   1   |   0.928708016872  |
    |   1    |   3   |  0.0440889261663  |
    |   1    |   2   |  0.0176190119237  |
    |   2    |   3   |   0.996967732906  |
    |   2    |   2   |  0.00151345680933 |
    |   2    |   7   | 0.000637513934635 |
    |   3    |   1   |   0.998070061207  |
    |  ...   |  ...  |        ...        |
    +--------+-------+-------------------+
    [35688 rows x 3 columns]
    """
    _mt._get_metric_tracker().track(
        'toolkit.classifier.logistic_classifier.predict_topk')

    _check_categorical_option_type('output_type', output_type,
                                   ['rank', 'margin', 'probability'])
    _check_categorical_option_type('missing_value_action',
                                   missing_value_action,
                                   ['auto', 'impute', 'error'])

    # 'auto' always resolves to 'impute' for this model. A second
    # "== 'auto'" check that used to sit further down could never be reached
    # after this assignment and has been removed.
    if missing_value_action == 'auto':
        missing_value_action = 'impute'

    # Low latency path
    if isinstance(dataset, list):
        return _graphlab.extensions._fast_predict_topk(
            self.__proxy__, dataset, output_type, missing_value_action, k)

    if isinstance(dataset, dict):
        return _graphlab.extensions._fast_predict_topk(
            self.__proxy__, [dataset], output_type, missing_value_action, k)

    # Batch (SFrame) path
    _raise_error_if_not_sframe(dataset, "dataset")

    options = {
        'model': self.__proxy__,
        'model_name': self.__name__,
        'dataset': dataset,
        'output_type': output_type,
        'topk': k,
        'missing_value_action': missing_value_action,
    }
    target = _graphlab.toolkits._main.run(
        'supervised_learning_predict_topk', options)

    return _map_unity_proxy_to_object(target['predicted'])
def predict_topk(self, dataset, output_type="probability", k=3):
    """
    Compute the ``k`` best class predictions for every row of ``dataset``
    with the trained model.

    The result is an SFrame with three columns: `row_id`, `class`, and one
    of `probability`, `rank`, or `score`, selected by ``output_type``.
    Input dataset size must be the same as for training of the model,
    except for images which are automatically resized.

    Parameters
    ----------
    dataset : SFrame
        Dataset of new observations. Must include columns with the same
        names as the features used for model training, but does not
        require a target column. Additional columns are ignored.

    output_type : {'probability', 'rank', 'score'}, optional
        Choose the return type of the prediction:

        - `rank`: outputs rank along with class label.
        - `probability`: outputs learned probability along with class
          label.
        - `score`: Same as probability

    k : int, optional
        Number of classes to return for each input example.

    Returns
    -------
    out : SFrame
        An SFrame with model predictions.

    See Also
    --------
    predict, classify, evaluate

    Examples
    --------
    >>> data = graphlab.SFrame('http://s3.amazonaws.com/dato-datasets/mnist/sframe/train')
    >>> training_data, validation_data = data.random_split(0.8)
    >>> net = graphlab.deeplearning.get_builtin_neuralnet('mnist')
    >>> m = graphlab.neuralnet_classifier.create(training_data,
    ...                                          target='label',
    ...                                          network=net,
    ...                                          max_iterations=3)
    ...
    >>> pred = m.predict_topk(validation_data, k=3)
    >>> pred
    +--------+-------+-------------------+
    | row_id | class |    probability    |
    +--------+-------+-------------------+
    |   0    |   4   |   0.995623886585  |
    |   0    |   9   |  0.0038311756216  |
    |   0    |   7   | 0.000301006948575 |
    |   1    |   1   |   0.928708016872  |
    |   1    |   3   |  0.0440889261663  |
    |   1    |   2   |  0.0176190119237  |
    |   2    |   3   |   0.996967732906  |
    |   2    |   2   |  0.00151345680933 |
    |   2    |   7   | 0.000637513934635 |
    |   3    |   1   |   0.998070061207  |
    |  ...   |  ...  |        ...        |
    +--------+-------+-------------------+
    [35688 rows x 3 columns]
    """
    tracker = _mt._get_metric_tracker()
    tracker.track('toolkit.classifier.neuralnet_classifier.predict_topk')

    _raise_error_if_not_sframe(dataset, "dataset")

    # This model always asks the backend to fail on missing values.
    request = {
        'model': self.__proxy__,
        'model_name': self.__name__,
        'dataset': dataset,
        'output_type': output_type,
        'topk': k,
        'missing_value_action': 'error',
    }
    response = _toolkits_main.run('supervised_learning_predict_topk',
                                  request)
    predicted = response['predicted']
    return _map_unity_proxy_to_object(predicted)
def create(dataset, label=None, features=None, distance='auto', method='auto',
           composite_params=None, verbose=True, **kwargs):
    r"""
    Create a nearest neighbor model, which can be searched efficiently and
    quickly for the nearest neighbors of a query observation. If the `method`
    argument is specified as `auto`, the type of model is chosen automatically
    based on the type of data in `dataset`.

    Parameters
    ----------
    dataset : SFrame
        Reference data. If the features for each observation are numeric, they
        may be in separate columns of 'dataset' or a single column with lists
        of values. The features may also be in the form of a column of sparse
        vectors (i.e. dictionaries), with string keys and numeric values.

    label : string, optional
        Name of the SFrame column with row labels. If 'label' is not
        specified, row numbers are used to identify reference dataset rows
        when the model is queried. The row numbers are stored in a column
        named '__id'; if 'dataset' already has a column by that name, the
        existing column is used as the label instead.

    features : list[string], optional
        Name of the columns with features to use in computing distances
        between observations and the query points. 'None' (the default)
        indicates that all columns except the label should be used as
        features. Each column can be one of the following types:

        - *Numeric*: values of numeric type integer or float.

        - *Array*: list of numeric (integer or float) values. Each list
          element is treated as a separate variable in the model.

        - *Dictionary*: key-value pairs with numeric (integer or float)
          values. Each key indicates a separate variable in the model.

        - *String*: string values.

        Columns of type *list* are not supported. Convert them to array
        columns if all entries in the list are of numeric types.

        Please note: this parameter must be left as 'None' when
        `composite_params` is specified; otherwise a ValueError is raised.

    distance : string or function, optional
        Name of the function that measures the distances between two
        observations. Please see the notes and references for detailed
        descriptions of the distances. Note that for sparse vectors, missing
        keys are assumed to have value 0.0.

        Please note: this parameter must be left as 'auto' when
        `composite_params` is specified; otherwise a ValueError is raised.

        - *auto* (default): the model chooses a reasonable distance based on
          the data types in 'dataset'. Columns of type str will be compared
          using levenshtein distance, columns of type dict use jaccard
          distance, and columns of type float, int, or list will be combined
          and use euclidean distance. The set of column-specific distances
          are aggregated into a single composite distance.

        - *squared_euclidean*: works only with the `brute_force` method
          because it is not a metric.

        - *euclidean*

        - *manhattan*

        - *jaccard*: works only with variables in a dictionary feature, where
          the keys are treated as a set and the values are ignored.

        - *weighted_jaccard*: like jaccard distance, works only with
          variables in a dictionary feature. For the weighted version of
          jaccard, however, the values of the dictionary are used to weight
          the contribution of each key. This is done by taking the minimum of
          the two values for each key in the numerator and the maximum of the
          two values in the denominator.

        - *cosine*: works only with the 'brute_force' method because it is
          not a true metric. Please see `Wikipedia
          <http://en.wikipedia.org/wiki/Cosine_similarity>`_ for more detail.

        - *dot_product*: works only with the 'brute_force' method because it
          is not a true metric.

        - *levenshtein*: for a single column of string inputs.

    method : {'auto', 'ball_tree', 'brute_force'}, optional
        Method for computing nearest neighbors. The options are:

        - *auto* (default): the method is chosen automatically, based on the
          type of data and the distance. If the distance is 'manhattan' or
          'euclidean' and the features are numeric or vectors of numeric
          values, then the 'ball_tree' method is used. Otherwise, the
          'brute_force' method is used.

        - *ball_tree*: use a tree structure to find the k-closest neighbors
          to each query point. The ball tree model is slower to construct
          than the brute force model, but queries are faster than linear
          time. This method is not applicable for the cosine and dot product
          distances. See `Liu, et al (2004)
          <http://papers.nips.cc/paper/2666-an-investigation-of-practical-approximate-nearest-neighbor-algorithms>`_
          for implementation details.

        - *brute_force*: compute the distance from a query point to all
          reference observations. There is no computation time for model
          creation with the brute force method (although the reference data
          is held in the model), but each query takes linear time.

        Whenever there is more than one distance component (given through
        `composite_params` or generated for the 'auto' distance), the
        'brute_force' method is used regardless of this argument.

    composite_params : list [list [list [string], string or function, float]]
        Multiple sets of features and corresponding distance functions can be
        used as inputs to a composite distance function. Each element of this
        composite is specified by a list in this argument. Each inner list
        must include a list of feature names, the name of a distance
        function, and a relative weight. See the examples and notes sections
        below. If `composite_params` is specified, the standalone `features`
        and `distance` arguments must be left at their defaults. Keyword
        arguments are applied to each member of the composite distance
        computation.

    verbose: bool, optional
        If True, print progress updates and model details.

    **kwargs : optional
        Options for the distance function and query method.

        - *leaf_size*: for the ball tree method, the number of points in each
          leaf of the tree. The default is to use the max of 1,000 and
          n/(2^11), which ensures a maximum tree depth of 12. The default
          leaf size is indicated by a "0" in the
          :func:`~graphlab.nearest_neighbors.NearestNeighborsModel.get_default_options`
          method.

    Returns
    -------
    out : NearestNeighborsModel
        A structure for efficiently computing the nearest neighbors in
        'dataset' of new query points.

    Raises
    ------
    TypeError
        If the ball tree method is combined with the cosine or dot product
        distance, if the label column is not of integer or string type, or
        if a feature name is not a string.

    ValueError
        If 'label' is not a column of 'dataset', if `composite_params` is
        combined with `features` or `distance`, if a distance component is
        malformed, or if 'method' is not one of the documented options.

    See Also
    --------
    NearestNeighborsModel.query

    Notes
    -----
    - If the features should be weighted equally in the distance calculations
      but are measured on different scales, it is important to standardize
      the features. One way to do this is to subtract the mean of each column
      and divide by the standard deviation.

    - Distance definitions. Suppose :math:`u` and :math:`v` are observations
      with :math:`d` variables each.

        - `squared_euclidean`
            .. math:: D(u, v) = \sum_i^d (u_i - v_i)^2

        - `euclidean`
            .. math:: D(u, v) = \sqrt{\sum_i^d (u_i - v_i)^2}

        - `manhattan`
            .. math:: D(u, v) = \sum_i^d |u_i - v_i|

        - `cosine`
            .. math::

                D(u, v) = 1 - \frac{\sum_i^d u_i v_i}
                {\sqrt{\sum_i^d u_i^2}\sqrt{\sum_i^d v_i^2}}

        - `dot_product`
            .. math::

                D(u, v) = \frac{1}{\sum_i^d u_i v_i}

    - For the jaccard distances, suppose :math:`S` and :math:`T` are the sets
      of keys from two observations' dictionaries. For the weighted version
      of jaccard distance, suppose :math:`S_k` and :math:`T_k` are the values
      associated with key :math:`k` in the respective dictionaries. Typically
      these values are counts, i.e. of words or n-grams.

        - `jaccard`
            .. math:: D(S, T) = 1 - \frac{|S \cap T|}{|S \cup T|}

        - `weighted_jaccard`
            .. math::

                D(S, T) = 1 - \frac{\sum_{k \in S \cup T} \min\{S_k, T_k\}}
                {\sum_{k \in S \cup T} \max\{S_k, T_k\}}

    - Levenshtein distance is a type of edit distance for string types. The
      distance is the number of insertion, deletion, and substitution edits
      needed to transform string :math:`A` into string :math:`B`.

        .. math::

            D(A, B) = d(|A|, |B|)

        .. math ::

            d(i, j) = \max(i, j), \quad \mathrm{if } \min(i, j) = 0

        .. math ::

            d(i, j) = \min \Big \{d(i-1, j) + 1, \ d(i, j-1) + 1, \ d(i-1, j-1) + I(A_i \neq B_i) \Big \}, \quad \mathrm{else}

    - Composite distances are simply weighted sums of the above distances.
      The set of features input to each component distance may vary, and the
      weight on each component acts as a multiplier before each of the
      component distances is summed. For example, if ``composite_params`` is
      set to

      ``[[['X1', 'X2'], 'euclidean', 2], [['X2', 'X3'], 'manhattan', 3]]``,

      then the overall distance computation for rows :math:`a` and :math:`b`
      is:

        .. math::

            D(a, b) = 2 * D_{euclidean}(a[[X1, X2]], b[[X1, X2]]) +
            3 * D_{manhattan}(a[[X2, X3]], b[[X2, X3]])

    References
    ----------
    - `Wikipedia - nearest neighbor search
      <http://en.wikipedia.org/wiki/Nearest_neighbor_search>`_

    - `Wikipedia - ball tree <http://en.wikipedia.org/wiki/Ball_tree>`_

    - Ball tree implementation: Liu, T., et al. (2004) `An Investigation of
      Practical Approximate Nearest Neighbor Algorithms
      <http://papers.nips.cc/paper/2666-an-investigation-of-practical-approximate-nearest-neighbor-algorithms>`_.
      Advances in Neural Information Processing Systems pp. 825-832.

    - `Wikipedia - Jaccard distance
      <http://en.wikipedia.org/wiki/Jaccard_index>`_

    - Weighted Jaccard distance: Chierichetti, F., et al. (2010) `Finding the
      Jaccard Median
      <http://theory.stanford.edu/~sergei/papers/soda10-jaccard.pdf>`_.
      Proceedings of the Twenty-First Annual ACM-SIAM Symposium on Discrete
      Algorithms. Society for Industrial and Applied Mathematics.

    - `Wikipedia - Cosine distance
      <http://en.wikipedia.org/wiki/Cosine_similarity>`_

    - `Wikipedia - Levenshtein distance
      <http://en.wikipedia.org/wiki/Levenshtein_distance>`_

    Examples
    --------
    Construct a nearest neighbors model with automatically determined method
    and distance:

    >>> sf = graphlab.SFrame({'X1': [0.98, 0.62, 0.11],
    ...                       'X2': [0.69, 0.58, 0.36],
    ...                       'str_feature': ['cat', 'dog', 'fossa']})
    >>> model = graphlab.nearest_neighbors.create(sf, features=['X1', 'X2'])

    For datasets with a large number of rows and up to about 100 variables,
    the ball tree method often leads to much faster queries.

    >>> model = graphlab.nearest_neighbors.create(sf, features=['X1', 'X2'],
    ...                                           method='ball_tree')

    Often the final determination of a neighbor is based on several distance
    computations over different sets of features. Each part of this composite
    distance may have a different relative weight.

    >>> model = graphlab.nearest_neighbors.create(sf, composite_params=[
    ...     [['X1', 'X2'], 'euclidean', 2.],
    ...     [['str_feature'], 'levenshtein', 3.]])
    """

    ## Validate the 'dataset' input
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    ## Exclude inappropriate combinations of method and distance
    ball_tree_incompatible = ('cosine', _graphlab.distances.cosine,
                              'dot_product', _graphlab.distances.dot_product)
    if method == 'ball_tree' and distance in ball_tree_incompatible:
        raise TypeError("The ball tree method does not work with 'cosine' " +
                        "or 'dot_product' distance. Please use the 'brute_force' " +
                        "method for these distances.")

    ## Initial validation and processing of the label
    if label is None:
        _label = '__id'

        # Check for a name clash explicitly rather than wrapping
        # add_row_number in a catch-all handler: only the clash should fall
        # back to the existing column, any other failure must propagate.
        if _label in dataset.column_names():
            message = ("Tried to add a default label column '{}' ".format(_label) +
                       "but a column by this name already exists. Using the " +
                       "existing column as the label column.")
            print(message)
            _dataset = dataset
        else:
            _dataset = dataset.add_row_number(column_name=_label)

    else:
        if label not in dataset.column_names():
            raise ValueError(
                "Input 'label' must be a string matching the name of a " +
                "column in the reference SFrame 'dataset'.")

        if dataset[label].dtype() not in (str, int):
            raise TypeError("The label column must contain integers or strings.")

        _label = label
        _dataset = dataset

    sf_label = _tkutl._toolkits_select_columns(_dataset, [_label])

    ## Clean the method options and create the options dictionary
    _method_options = {k.lower(): v for k, v in kwargs.items()}

    ## If composite inputs aren't specified, formulate the standalone inputs
    #  as a composite input for code simplicity. If the standalone input
    #  doesn't specify features, choose all the features and make a set of
    #  distance components based on feature type.
    if composite_params is None:

        ## If no features are specified, use them all
        if features is None:
            _features = [x for x in _dataset.column_names() if x != _label]
        else:
            _features = features[:]

        ## If the distance argument is 'auto', turn it into a list of
        #  distance components by choosing an automatic distance for each
        #  feature based on its type.
        if distance == 'auto':
            _composites = choose_auto_distance(_features,
                                               dataset.column_names(),
                                               dataset.column_types())
        else:
            _composites = [[_features, distance, 1]]

    # Standalone inputs may not be mixed with explicit components
    else:
        if distance != 'auto':
            raise ValueError(
                "Either the 'distance' parameter or the 'composite_params' " +
                "parameter may be specified, but not both.")

        if features is not None:
            raise ValueError(
                "Either the 'features' parameter or the 'composite_params' " +
                "parameter may be specified, but not both.")

        if len(composite_params) == 0:
            raise ValueError(
                "'composite_params' was specified as an empty list. If " +
                "specified, this parameter must contain at least one distance " +
                "component, which is a list containing three elements: a list " +
                "of feature names, a distance name or function, and a relative " +
                "weight.")

        # Work on a copy so the caller's list is never modified below.
        _composites = copy.deepcopy(composite_params)

    ## Clean the list of features in each component of the composite inputs,
    #  and compile the union of the lists of features.
    all_features = []

    for component in _composites:

        if len(component) != 3:
            raise ValueError("Each element of 'composite_params' must be a " +
                             "list with three members.")

        feature_names = component[0]

        # set of features must be iterable (checked before len() is used)
        _tkutl._raise_error_if_not_iterable(feature_names, "features")

        if len(feature_names) == 0:
            raise ValueError("An empty list of features cannot be passed " +
                             "as part of a composite distance function.")

        # feature names must be strings
        if not all(isinstance(x, str) for x in feature_names):
            raise TypeError("Input 'features' must contain only strings.")

        # remove the label name from all of the features lists
        feature_names = [x for x in feature_names if x != _label]

        # ensure that string features are in single columns
        has_string_column = any([_dataset[x].dtype() is str
                                 for x in feature_names])
        if len(feature_names) > 1 and has_string_column:
            raise ValueError(
                "Multiple features have been entered, one of which is of string " +
                "type. If the input features for any distance component contain a " +
                "string column, that must be the only column for that component.")

        # ensure that relative weights are integers or floats
        if not isinstance(component[2], (int, float)):
            raise ValueError(
                "The weight of each distance component must be a single " +
                "integer or a float value.")

        # combine all features into a big list
        component[0] = feature_names
        all_features += feature_names

        # convert distance strings to distance functions
        temp_dist = component[1]
        if isinstance(temp_dist, str):
            component[1] = _graphlab.util._get_distance(temp_dist)

    # Pull out the relevant features from the input dataset (the union of
    # features over all distance components)
    all_features = list(set(all_features))
    sf_features = _tkutl._toolkits_select_columns(_dataset, all_features)

    ## Decide which method to use
    ## - If more than one distance component (specified either directly or
    #  generated automatically because distance set to 'auto'), then do brute
    #  force.
    if len(_composites) > 1:
        _method = 'brute_force'

        if method == 'ball_tree':
            print("Defaulting to brute force instead of ball tree because " +
                  "there are multiple distance components.")

    else:
        if method == 'auto':

            # get the total number of variables. Assume the number of
            # elements in array type columns does not change
            num_variables = sum([len(x) if hasattr(x, '__iter__') else 1
                                 for x in sf_features[0].values()])

            # flag if all the features in the single composite are of numeric
            # type.
            numeric_type_flag = all([x in [int, float, list, array.array]
                                     for x in sf_features.column_types()])

            ## Conditions necessary for ball tree to work and be worth it
            ball_tree_distances = ['euclidean', 'manhattan',
                                   _graphlab.distances.euclidean,
                                   _graphlab.distances.manhattan]
            if (_composites[0][1] in ball_tree_distances
                    and numeric_type_flag
                    and num_variables <= 100):
                _method = 'ball_tree'
            else:
                _method = 'brute_force'

        else:
            _method = method

    ## Pick the right model name for the method
    if _method == 'ball_tree':
        model_name = 'nearest_neighbors_ball_tree'
        _mt._get_metric_tracker().track('toolkit.nearest_neighbors_balltree.create')

    elif _method == 'brute_force':
        model_name = 'nearest_neighbors_brute_force'
        _mt._get_metric_tracker().track('toolkit.nearest_neighbors_brute.create')

    else:
        raise ValueError("Method must be 'brute_force', 'ball_tree', or 'auto'")

    ## Package the model options
    opts = {}
    opts.update(_method_options)
    opts.update(
        {'model_name': model_name,
         'sf_label': sf_label,
         'sf_features': sf_features,
         'composite_params': _composites})

    ## Construct the nearest neighbors model
    if verbose:
        print("Starting model construction...")

    result = _graphlab.extensions._nearest_neighbors.train(opts)
    model_proxy = result['model']
    model = NearestNeighborsModel(model_proxy)

    if verbose:
        model.summary()
        print("")

    return model
def predict_topk(self, dataset, output_type="probability", k=3,
                 missing_value_action='auto'):
    """
    Produce the ``k`` highest-ranked class predictions for each example in
    ``dataset`` using the trained model.

    The output is an SFrame with three columns: `row_id`, `class`, and one
    of `probability`, `margin`, or `rank`, as selected by ``output_type``.
    Input dataset size must be the same as for training of the model.

    Parameters
    ----------
    dataset : SFrame
        A dataset that has the same columns that were used during training.
        If the target column exists in ``dataset`` it will be ignored
        while making predictions.

    output_type : {'probability', 'rank', 'margin'}, optional
        Choose the return type of the prediction:

        - `probability`: Probability associated with each label in the
          prediction.
        - `rank`       : Rank associated with each label in the prediction.
        - `margin`     : Margin associated with each label in the
          prediction.

    k : int, optional
        Number of classes to return for each input example.

    missing_value_action : str, optional
        Action to perform when missing values are encountered. Can be
        one of:

        - 'auto': By default the model will treat missing value as is.
        - 'impute': Proceed with evaluation by filling in the missing
          values with the mean of the training data. Missing
          values are also imputed if an entire column of data is
          missing during evaluation.
        - 'error': Do not proceed with evaluation and terminate with
          an error message.

    Returns
    -------
    out : SFrame
        An SFrame with model predictions.

    See Also
    --------
    predict, classify, evaluate

    Examples
    --------
    >>> pred = m.predict_topk(validation_data, k=3)
    >>> pred
    +--------+-------+-------------------+
    | row_id | class |    probability    |
    +--------+-------+-------------------+
    |   0    |   4   |   0.995623886585  |
    |   0    |   9   |  0.0038311756216  |
    |   0    |   7   | 0.000301006948575 |
    |   1    |   1   |   0.928708016872  |
    |   1    |   3   |  0.0440889261663  |
    |   1    |   2   |  0.0176190119237  |
    |   2    |   3   |   0.996967732906  |
    |   2    |   2   |  0.00151345680933 |
    |   2    |   7   | 0.000637513934635 |
    |   3    |   1   |   0.998070061207  |
    |  ...   |  ...  |        ...        |
    +--------+-------+-------------------+
    [35688 rows x 3 columns]
    """
    tracker = _mt._get_metric_tracker()
    tracker.track('toolkit.classifier.decision_tree_classifier.predict_topk')

    allowed_outputs = ['rank', 'margin', 'probability']
    _check_categorical_option_type('output_type', output_type,
                                   allowed_outputs)

    # Resolve the 'auto' policy before choosing a prediction path.
    policy = missing_value_action
    if policy == 'auto':
        policy = _sl.select_default_missing_value_policy(self, 'predict')

    # Low latency path: a list is passed through as-is, a single dict is
    # wrapped in a one-element list.
    rows = None
    if isinstance(dataset, list):
        rows = dataset
    elif isinstance(dataset, dict):
        rows = [dataset]
    if rows is not None:
        return _graphlab.extensions._fast_predict_topk(
            self.__proxy__, rows, output_type, policy, k)

    # Fast path
    _raise_error_if_not_sframe(dataset, "dataset")
    request = {
        'model': self.__proxy__,
        'model_name': self.__name__,
        'dataset': dataset,
        'output_type': output_type,
        'topk': k,
        'missing_value_action': policy,
    }
    response = _graphlab.toolkits._main.run(
        'supervised_learning_predict_topk', request)
    return _map_unity_proxy_to_object(response['predicted'])
def extract_features(self, dataset, layer_id=None):
    """
    Propagate every example of ``dataset`` through the network and return
    an SArray of dense feature vectors, each of which is the concatenation
    of all the hidden unit values at layer[layer_id]. These feature vectors
    can be used as input to train another classifier such as a
    :py:class:`~graphlab.logistic_classifier.LogisticClassifier`, an
    :py:class:`~graphlab.svm_classifier.SVMClassifier`, another
    :py:class:`~graphlab.neuralnet_classifier.NeuralNetClassifier`, or a
    :py:class:`~graphlab.boosted_trees_classifier.BoostedTreesClassifier`.
    Input dataset size must be the same as for the training of the model,
    except for images which are automatically resized.

    We also are releasing a pre-trained model for ImageNet, as described by
    Alex Krizhevsky et. al. It is located at
    https://static.turi.com/products/graphlab-create/resources/models/python2.7/imagenet_model_iter45 .
    Using it requires 256 x 256 x 3 images.
    Please see Examples and References for more.

    Parameters
    ----------
    dataset : SFrame
        Dataset of new observations. Must include columns with the same
        names as the features used for model training, but does not
        require a target column. Additional columns are ignored.

    layer_id : int , optional
        The index of the layer in neuralnet at which the activations are
        taken to be a dense feature vector. Must be a fully-connected
        layer. Default is None, in which case the layer before the
        connection layer to the output is used.

    Returns
    -------
    out : SArray
        An SArray of dtype array.array containing extracted features.

    Raises
    ------
    ValueError
        If no layer of type "CONNECTION" or "TRANSITION" occurs at or
        before ``layer_id``.

    See Also
    --------
    graphlab.deeplearning.layers

    References
    ----------
    - Krizhevsky, Alex, Ilya Sutskever, and Geoffrey E. Hinton. "Imagenet
      classification with deep convolutional neural networks." Advances in
      neural information processing systems. 2012.

    Examples
    --------
    >>> data = graphlab.SFrame('https://static.turi.com/datasets/mnist/sframe/train6k')
    >>> net = graphlab.deeplearning.get_builtin_neuralnet('mnist')
    >>> m = graphlab.neuralnet_classifier.create(data,
    ...                                          target='label',
    ...                                          network=net,
    ...                                          max_iterations=3)
    >>> # Now, let's extract features from the last layer
    >>> data['features'] = m.extract_features(data)
    >>> # Now, let's build a new classifier on top of extracted features
    >>> m = graphlab.classifier.create(data,
    ...                                features = ['features'],
    ...                                target='label')

    Now, let's see how to load the ImageNet model, and use it for
    extracting features after resizing the data:

    >>> imagenet_model = graphlab.load_model('https://static.turi.com/products/graphlab-create/resources/models/python2.7/imagenet_model_iter45')
    >>> data['image'] = graphlab.image_analysis.resize(data['image'], 256, 256, 3, decode=True)
    >>> data['imagenet_features'] = imagenet_model.extract_features(data)
    """
    tracker = _mt._get_metric_tracker()
    tracker.track('toolkit.classifier.neuralnet_classifier.extract_features')
    _raise_error_if_not_sframe(dataset, "dataset")

    layers = self.get('network').layers
    last_index = len(layers) - 1

    # Default layer: one step back from the end when the final layer is a
    # connection layer, two steps back otherwise.
    if layer_id is None:
        step_back = 1 if layers[last_index]._type == "CONNECTION" else 2
        layer_id = last_index - step_back
    _numeric_param_check_range("layer_id", layer_id, 0, last_index)

    # Collect the type of every layer up to and including `layer_id`; at
    # least one of them has to be a connection or transition layer.
    layer_kinds = [layers[i]._type for i in range(layer_id + 1)]
    flattened = any(kind == "CONNECTION" or kind == "TRANSITION"
                    for kind in layer_kinds)
    if not flattened:
        raise ValueError(
            "Features must be extracted from either a network "
            "with non-image input or a layer after a FlattenLayer. "
            "Try extracting features from layer following a FlattenLayer.")

    request = {
        'model': self.__proxy__,
        'model_name': self.__name__,
        'dataset': dataset,
        'missing_value_action': "error",
        'layer_id': layer_id,
    }
    response = _toolkits_main.run('supervised_learning_feature_extraction',
                                  request)
    return _map_unity_proxy_to_object(response['extracted'])
def classify(self, dataset, max_neighbors=10, radius=None, verbose=True):
    """
    Return the predicted class for each observation in *dataset*. This
    prediction is made based on the closest neighbors stored in the nearest
    neighbors classifier model.

    Parameters
    ----------
    dataset : SFrame
        Dataset of new observations. Must include columns with the same
        names as the features used for model training, but does not require
        a target column. Additional columns are ignored.

    max_neighbors : int, optional
        Maximum number of neighbors to consider for each point. Must be a
        positive integer, or None.

    radius : float, optional
        Maximum distance from each point to a neighbor in the reference
        dataset.

    verbose : bool, optional
        If True, print progress updates.

    Returns
    -------
    out : SFrame
        An SFrame with model predictions. The first column is the most
        likely class according to the model, and the second column is the
        predicted probability for that class.

    Raises
    ------
    ValueError
        If 'max_neighbors' is not None and is not an integer larger than 0.

    See Also
    --------
    create, predict, predict_topk

    Notes
    -----
    - If the 'radius' parameter is small, it is possible that a query point
      has no qualified neighbors in the training dataset. In this case, the
      resulting class and probability for that query are 'None' in the
      SFrame output by this method. If the target column in the training
      dataset has missing values, these predictions will be ambiguous.

    - Ties between predicted classes are broken randomly.

    Examples
    --------
    >>> sf_train = graphlab.SFrame({'species': ['cat', 'dog', 'fossa', 'dog'],
    ...                             'height': [9, 25, 20, 23],
    ...                             'weight': [13, 28, 33, 22]})
    ...
    >>> sf_new = graphlab.SFrame({'height': [26, 19],
    ...                           'weight': [25, 35]})
    ...
    >>> m = graphlab.nearest_neighbor_classifier.create(sf_train, target='species')
    >>> ystar = m.classify(sf_new, max_neighbors=2)
    >>> print ystar
    +-------+-------------+
    | class | probability |
    +-------+-------------+
    |  dog  |     1.0     |
    | fossa |     0.5     |
    +-------+-------------+
    """
    _mt._get_metric_tracker().track(
        'toolkit.classifier.nearest_neighbor_classifier.classify')

    ## Validate the query 'dataset'. Note that the 'max_neighbors' and
    #  'radius' parameters are validated by the nearest neighbor model's
    #  query method.
    _raise_error_if_not_sframe(dataset, "dataset")
    _raise_error_if_sframe_empty(dataset, "dataset")
    n_query = dataset.num_rows()

    ## Validate neighborhood parameters 'max_neighbors'.
    # - NOTE: when the parameter name is changed in nearest neighbors, the
    #   query call will do this itself, and this block can be removed.
    if max_neighbors is not None:
        if not isinstance(max_neighbors, int):
            raise ValueError("Input 'max_neighbors' must be an integer.")

        if max_neighbors <= 0:
            raise ValueError(
                "Input 'max_neighbors' must be larger than 0.")

    ## Find the nearest neighbors for each query and count the number of
    #  votes for each class.
    knn = self._knn_model.query(dataset, k=max_neighbors, radius=radius,
                                verbose=verbose)

    ## If there are *no* results for *any* query make an SFrame of nothing.
    if knn.num_rows() == 0:
        # The probability column is typed float so that its dtype matches
        # the vote-ratio column produced by the branch below.
        ystar = _gl.SFrame({
            'class': _gl.SArray([None] * n_query, self._target_type),
            'probability': _gl.SArray([None] * n_query, float)
        })

    else:
        ## Find the class with the most votes for each query and postprocess.
        grp = knn.groupby(['query_label', 'reference_label'],
                          _gl.aggregate.COUNT)

        ystar = grp.groupby(
            'query_label', {
                'class': _gl.aggregate.ARGMAX('Count', 'reference_label'),
                'max_votes': _gl.aggregate.MAX('Count'),
                'total_votes': _gl.aggregate.SUM('Count')
            })

        ystar['probability'] = ystar['max_votes'] / ystar['total_votes']

        ## Fill in 'None' for query points that don't have any near
        #  neighbors.
        row_ids = _gl.SFrame({'query_label': list(range(n_query))})
        ystar = ystar.join(row_ids, how='right')

        ## Sort by row number (because row number is not returned) and
        #  return
        ystar = ystar.sort('query_label', ascending=True)
        ystar = ystar[['class', 'probability']]

    return ystar
def create(dataset, item, features=None, min_support=1, max_patterns=100,
           min_length=1):
    """
    Create a :class:`~graphlab.frequent_pattern_mining.FrequentPatternMiner`
    to extract the set of frequently occurring items in an event-series.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    item: string
        Name of the column containing the item. The values in this column
        must be of string or integer type.

    features : list[string], optional
        Names of the columns containing features. 'None' (the default)
        indicates that all columns except the item column should be used
        as features. The feature columns are the ones that together
        identify a unique transaction ID for the item.

    min_support : int, optional
        The minimum number of times that a pattern must occur in order for
        it to be considered `frequent`.

    max_patterns : int, optional
        The maximum number of frequent patterns to be mined.

    min_length: int, optional
        The minimum size (number of elements in the set) of each pattern
        being mined.

    Returns
    -------
    out : FrequentPatternMiner
        A trained model of type
        :class:`~graphlab.frequent_pattern_mining.FrequentPatternMiner`.

    Raises
    ------
    ValueError
        If 'features' is None and 'item' is not the name of a column in
        'dataset'.

    Notes
    -----
    Frequent closed itemsets are mined using the `top-k FP growth` algorithm.
    Mining occurs until the top max_patterns closed itemsets of size
    min_length and support greater than min_support are found.

    See Also
    --------
    FrequentPatternMiner

    References
    ----------
    - Wikipedia - Association Rule Learning
      <https://en.wikipedia.org/wiki/Association_rule_learning>
    - Han, Jiawei, et al. "Mining top-k frequent closed patterns without
      minimum support." Data Mining, 2002. ICDM 2003.
    - Wang, Jianyong, et al. "TFP: An efficient algorithm for mining top-k
      frequent closed itemsets." Knowledge and Data Engineering, IEEE
      Transactions on 17.5 (2005): 652-663.

    Examples
    --------
    .. sourcecode:: python

        >>> import graphlab as gl
        >>> bakery_sf = gl.SFrame("http://s3.amazonaws.com/dato-datasets/bakery.sf")
        >>> bakery_sf
        Data:
        +---------+-------------+-------+----------+----------+-----------------+
        | Receipt |   SaleDate  | EmpId | StoreNum | Quantity |       Item      |
        +---------+-------------+-------+----------+----------+-----------------+
        |    1    | 12-JAN-2000 |   20  |    20    |    1     |  GanacheCookie  |
        |    1    | 12-JAN-2000 |   20  |    20    |    5     |     ApplePie    |
        |    2    | 15-JAN-2000 |   35  |    10    |    1     |   CoffeeEclair  |
        |    2    | 15-JAN-2000 |   35  |    10    |    3     |     ApplePie    |
        |    2    | 15-JAN-2000 |   35  |    10    |    4     |   AlmondTwist   |
        |    2    | 15-JAN-2000 |   35  |    10    |    3     |    HotCoffee    |
        |    3    |  8-JAN-2000 |   13  |    13    |    5     |    OperaCake    |
        |    3    |  8-JAN-2000 |   13  |    13    |    3     |   OrangeJuice   |
        |    3    |  8-JAN-2000 |   13  |    13    |    3     | CheeseCroissant |
        |    4    | 24-JAN-2000 |   16  |    16    |    1     |   TruffleCake   |
        +---------+-------------+-------+----------+----------+-----------------+
        [266209 rows x 6 columns]

        >>> model = gl.frequent_pattern_mining.create(bakery_sf, 'Item',
        ...             features=['Receipt'], min_length=4, max_patterns=500)
        Model fields
        ------------
        Min support                   : 1
        Max patterns                  : 500
        Min pattern length            : 4

        Most frequent patterns
        ----------------------
        ['CoffeeEclair', 'HotCoffee', 'AlmondTwist', 'ApplePie']: 1704
        ['LemonLemonade', 'LemonCookie', 'RaspberryLemonade', 'RaspberryCookie']: 1565
        ['LemonLemonade', 'LemonCookie', 'RaspberryLemonade', 'GreenTea']: 1290
        ['LemonLemonade', 'RaspberryLemonade', 'RaspberryCookie', 'GreenTea']: 1289
        ['LemonLemonade', 'LemonCookie', 'RaspberryCookie', 'GreenTea']: 1279
        ['LemonCookie', 'RaspberryLemonade', 'RaspberryCookie', 'GreenTea']: 1279
        ['AppleTart', 'AppleDanish', 'AppleCroissant', 'CherrySoda']: 1253
        ['LemonLemonade', 'LemonCookie', 'RaspberryLemonade', 'RaspberryCookie', 'GreenTea']: 1221
        ['CherryTart', 'ApricotDanish', 'OperaCake', 'ApricotTart']: 61
        ['CherryTart', 'ApricotDanish', 'OperaCake', 'RaspberryLemonade']: 55
    """
    _mt._get_metric_tracker().track('toolkit.frequent_pattern_mining.create')

    # Type checking.
    _raise_error_if_not_sframe(dataset, "dataset")
    _raise_error_if_not_of_type(item, str, "item")
    _raise_error_if_not_of_type(features, [list, _types.NoneType], "features")
    _raise_error_if_not_of_type(min_support, [int, float], "min_support")
    _raise_error_if_not_of_type(max_patterns, [int, float], "max_patterns")
    _raise_error_if_not_of_type(min_length, [int, float], "min_length")

    # Value checking.
    column_names = dataset.column_names()

    # If features is None, then use all other column names than item
    if features is None:
        # Report a missing item column by name instead of letting
        # list.remove() fail with an unrelated-looking message.
        if item not in column_names:
            raise ValueError(
                "Input 'item' must be the name of a column in 'dataset'.")
        features = [name for name in column_names if name != item]

    # Call the C++ create function.
    proxy = _gl.extensions._pattern_mining_create(
        dataset, item, features, min_support, max_patterns, min_length)
    return FrequentPatternMiner(proxy)
def create(dataset, target, features=None, distance=None, verbose=True):
    """
    Create a
    :class:`~graphlab.nearest_neighbor_classifier.NearestNeighborClassifier`
    model. The model predicts the class of a query instance as the most
    common class among that query's nearest neighbors.

    .. warning::

        The 'dot_product' distance is deprecated and will be removed in future
        versions of GraphLab Create. Please use 'transformed_dot_product'
        distance instead, although note that this is more than a name change;
        it is a *different* transformation of the dot product of two vectors.
        Please see the distances module documentation for more details.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model. Must be non-empty.

    target : str
        Name of the column containing the target variable. The values in this
        column must be of string or integer type.

    features : list[str], optional
        Name of the columns with features to use in comparing records. 'None'
        (the default) indicates that all columns except the target variable
        should be used. Please note: if `distance` is specified as a composite
        distance, then that parameter controls which features are used in the
        model. Each column can be one of the following types:

        - *Numeric*: values of numeric type integer or float.

        - *Array*: array of numeric (integer or float) values. Each array
          element is treated as a separate variable in the model.

        - *Dictionary*: key-value pairs with numeric (integer or float)
          values. Each key indicates a separate variable in the model.

        - *String*: string values.

    distance : str, function, or list[list], optional
        Function to measure the distance between any two input data rows. This
        may be one of three types:

        - *String*: the name of a standard distance function. One of
          'euclidean', 'squared_euclidean', 'manhattan', 'levenshtein',
          'jaccard', 'weighted_jaccard', 'cosine', 'dot_product'
          (deprecated), or 'transformed_dot_product'.

        - *Function*: a function handle from the
          :mod:`~graphlab.toolkits.distances` module.

        - *Composite distance*: the weighted sum of several standard distance
          functions applied to various features. This is specified as a list
          of distance components, each of which is itself a list containing
          three items:

          1. list or tuple of feature names (str)

          2. standard distance name (str)

          3. scaling factor (int or float)

        For more information about GraphLab Create distance functions, please
        see the :py:mod:`~graphlab.toolkits.distances` module.

        For sparse vectors, missing keys are assumed to have value 0.0.

        If 'distance' is left unspecified or set to 'auto', a composite
        distance is constructed automatically based on feature types.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : NearestNeighborClassifier
        A trained model of type
        :class:`~graphlab.nearest_neighbor_classifier.NearestNeighborClassifier`.

    Raises
    ------
    ToolkitError
        If `target` is not a string naming a column of `dataset`.

    TypeError
        If the target column is neither string nor integer typed, or if
        `distance` is not a list, callable, string, or None.

    See Also
    --------
    NearestNeighborClassifier
    graphlab.toolkits.nearest_neighbors
    graphlab.toolkits.distances

    References
    ----------
    - `Wikipedia - nearest neighbors classifier
      <http://en.wikipedia.org/wiki/Nearest_neighbour_classifiers>`_

    - Hastie, T., Tibshirani, R., Friedman, J. (2009). `The Elements of
      Statistical Learning <http://statweb.stanford.edu/~tibs/ElemStatLearn/>`_.
      Vol. 2. New York. Springer. pp. 463-481.

    Examples
    --------
    >>> sf = graphlab.SFrame({'species': ['cat', 'dog', 'fossa', 'dog'],
    ...                       'height': [9, 25, 20, 23],
    ...                       'weight': [13, 28, 33, 22]})
    ...
    >>> model = graphlab.nearest_neighbor_classifier.create(sf, target='species')

    As with the nearest neighbors toolkit, the nearest neighbor classifier
    accepts composite distance functions.

    >>> my_dist = [[('height', 'weight'), 'euclidean', 2.7],
    ...            [('height', 'weight'), 'manhattan', 1.6]]
    ...
    >>> model = graphlab.nearest_neighbor_classifier.create(sf, target='species',
    ...                                                     distance=my_dist)
    """

    ## Bookkeeping: usage metric and wall-clock start.
    _mt._get_metric_tracker().track(
        'toolkit.classifier.nearest_neighbor_classifier.create')
    tic = _time.time()

    ## The reference data must be a non-empty SFrame.
    _raise_error_if_not_sframe(dataset, "dataset")
    _raise_error_if_sframe_empty(dataset, "dataset")

    ## The target must name an existing column...
    if not (isinstance(target, str) and target in dataset.column_names()):
        raise _ToolkitError("The 'target' parameter must be the name of a "
                            "column in the input dataset.")

    ## ...whose values are strings or integers.
    target_is_str = dataset[target].dtype() == str
    if not target_is_str and not dataset[target].dtype() == int:
        raise TypeError("The target column must contain integers or strings.")

    ## Missing target values make 'None' predictions ambiguous, so warn.
    if dataset[target].num_missing() > 0:
        _logging.warning(
            "Missing values detected in the target column. This " +
            "may lead to ambiguous 'None' predictions, if the " +
            "'radius' parameter is set too small in the prediction, " +
            "classification, or evaluation methods.")

    ## Turn 'features' and 'distance' into a composite distance. This happens
    #  here rather than in the nearest neighbors toolkit because the automatic
    #  distance construction may differ between the two toolkits.
    candidate_columns = dataset.column_names() if features is None else features
    _features = [name for name in candidate_columns if name != target]

    if isinstance(distance, list):
        # Already composite: copy so the caller's list is left untouched.
        distance = _copy.deepcopy(distance)

    elif hasattr(distance, '__call__') or (
            isinstance(distance, str) and not distance == 'auto'):
        # A single standard distance applied to every feature, weight 1.
        distance = [[_features, distance, 1]]

    elif distance is None or distance == 'auto':
        column_types = dict(zip(dataset.column_names(),
                                dataset.column_types()))
        distance = _construct_auto_distance(_features, column_types)

    else:
        raise TypeError(
            "Input 'distance' not understood. The 'distance' " +
            "parameter must be a string or a composite distance, " +
            " or left unspecified.")

    ## Build the underlying nearest neighbors model, labelled by the target.
    knn_model = _gl.nearest_neighbors.create(dataset, label=target,
                                             distance=distance,
                                             verbose=verbose)

    ## Wrap the nearest neighbors model and record the classifier's state.
    model = NearestNeighborClassifier(knn_model)

    state_entries = [
        ('verbose', verbose),
        ('distance', knn_model['distance']),
        ('num_distance_components', knn_model['num_distance_components']),
        ('num_examples', dataset.num_rows()),
        ('features', knn_model['features']),
        ('target', target),
        ('num_classes', len(dataset[target].unique())),
        ('num_features', knn_model['num_features']),
        ('num_unpacked_features', knn_model['num_unpacked_features']),
        ('training_time', _time.time() - tic),
    ]
    for field, value in state_entries:
        model._state[field] = value

    model._target_type = dataset[target].dtype()
    return model
def create(dataset, features=None, distance=None, method='auto', verbose=True,
           **kwargs):
    """
    Create a RecordLinker model to match query records to a reference dataset
    of records, assuming both sets have the same general form.

    Parameters
    ----------
    dataset : SFrame
        Reference data, against which to link new queries with the 'link'
        method. The 'dataset' SFrame must include at least the features
        specified in the 'features' or 'distance' parameter.

    features : list[string], optional
        Name of the columns with features to use in comparing records. 'None'
        (the default) indicates that all columns should be used. Each column
        can be one of the following types:

        - *Numeric*: values of numeric type integer or float.

        - *Array*: array of numeric (integer or float) values. Each array
          element is treated as a separate variable in the model.

        - *Dictionary*: key-value pairs with numeric (integer or float)
          values. Each key indicates a separate variable in the model.

        - *String*: string values.

        Please note: if 'distance' is specified as a composite distance, then
        that parameter controls which features are used in the model.

    distance : string or list[list], optional
        Function to measure the distance between any two input data rows. This
        may be one of two types:

        - *String*: the name of a standard distance function. One of
          'euclidean', 'squared_euclidean', 'manhattan', 'levenshtein',
          'jaccard', 'weighted_jaccard', 'cosine', or 'dot_product'. Please
          see the :mod:`distances` module for more details.

        - *Composite distance*: the weighted sum of several standard distance
          functions applied to various features. This is specified as a list
          of distance components, each of which is itself a list containing
          three items:

          1. list or tuple of feature names (strings)

          2. standard distance name (string)

          3. scaling factor (int or float)

        If left as None, a composite distance is constructed automatically
        from the feature types.

    method : {'auto', 'brute_force', 'lsh', 'ball_tree'}, optional
        Strategy for the nearest neighbors search. If not specified or 'auto',
        the search strategy is chosen automatically based on the data type and
        dimension.

    verbose : bool, optional
        If True, print progress updates and model details.

    **kwargs : optional
        Options passed through to the nearest_neighbors toolkit for particular
        nearest neighbors search strategies:

        - *leaf_size*: for the ball tree method, the number of points in each
          leaf of the tree. The default is to use the max of 1,000 and
          n/(2^11), which ensures a maximum tree depth of 12.

        - *num_tables*: For the LSH method, the number of hash tables
          constructed.

        - *num_projections_per_table*: For the LSH method, the number of
          projections for each hash table.

    Returns
    -------
    out : RecordLinker model.

    Raises
    ------
    TypeError
        If 'features' is not an iterable of strings, or if 'distance' is not
        a string, a list, or None.

    See Also
    --------
    RecordLinker, graphlab.toolkits.nearest_neighbors

    Notes
    -----
    - Standardizing features is often a good idea with distance-based methods,
      but this model does *not* standardize features.

    - For features that all have the same type, the distance parameter may be
      a single standard distance function name (e.g. "euclidean"). In the
      model, however, all distances are first converted to composite distance
      functions; as a result, the 'distance' field in the model is always a
      composite distance.

    References
    ----------
    - Christen, Peter. "Data matching: concepts and techniques for record
      linkage, entity resolution, and duplicate detection." Springer Science &
      Business Media, 2012.

    Examples
    --------
    >>> homes = graphlab.SFrame({'sqft': [1230, 875, 1745],
    ...                          'street': ['phinney', 'fairview', 'cottage'],
    ...                          'city': ['seattle', 'olympia', 'boston'],
    ...                          'state': ['WA', 'WA', 'MA']})
    ...
    >>> model = graphlab.record_linker.create(homes, features=['city'],
    ...                                       distance='levenshtein')
    """

    _mt._get_metric_tracker().track('{}.create'.format(__name__))
    start_time = _time.time()

    ## Validate the 'dataset' input.
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    ## Validate the features input.
    if features is not None:
        if not hasattr(features, '__iter__'):
            raise TypeError("Input 'features' must be a list.")

        if not all([isinstance(x, str) for x in features]):
            raise TypeError("Input 'features' must contain only strings.")

    else:
        features = dataset.column_names()

    ## Validate and preprocess the distance input.
    col_types = dict(zip(dataset.column_names(), dataset.column_types()))

    if isinstance(distance, list):
        # Copy so later edits never touch the caller's composite distance.
        distance = _copy.deepcopy(distance)

    elif isinstance(distance, str):
        # this will likely produce errors downstream if 'features' was not
        # specified by the user.
        distance = [[features, distance, 1]]

    elif distance is None:
        # Identity test: '==' may be overloaded by array-like inputs.
        distance = _construct_auto_distance(features, col_types)

    else:
        raise TypeError("Input 'distance' not understood. For the " +
                        "data matching toolkit, 'distance' must be a string or " +
                        "a composite distance list.")

    ## Validate the composite distance and set it in the model.
    allowed_dists = {
        'euclidean': [int, float, _array.array],
        'squared_euclidean': [int, float, _array.array],
        'manhattan': [int, float, _array.array],
        'levenshtein': [str],
        'jaccard': [str, dict],
        'weighted_jaccard': [str, dict],
        'cosine': [int, float, str, dict, _array.array],
        'dot_product': [int, float, str, dict, _array.array]}

    distance = _dmutl.validate_composite_distance(
        distance, row_label=None, allowed_dists=allowed_dists.keys(),
        verbose=verbose)

    ## Validate feature types against distance functions.
    _dmutl.validate_distance_feature_types(dataset, distance, allowed_dists)

    ## Clean and impute string data.
    #  *** NOTE: after this, the composite distance and feature set will be
    #      modified and useless to the user, so set the state here. ***
    state = {'distance': distance,
             'num_distance_components': len(distance)}

    union_features = _dmutl.extract_composite_features(distance)

    # Work on copies so the caller's SFrame and the reported distance keep
    # their original column names.
    _dataset = _copy.copy(dataset)
    _distance = _copy.deepcopy(distance)

    for ftr in union_features:
        if col_types[ftr] == str:
            new_ftr = '__clean.' + ftr
            _dataset[new_ftr] = _dataset[ftr].fillna("")
            _dataset[new_ftr] = _dataset[new_ftr].apply(
                lambda x: _dmutl.cleanse_string(x), dtype=str)

            # Point every distance component at the cleaned column instead.
            for dist_comp in _distance:
                dist_comp[0] = [new_ftr if x == ftr else x
                                for x in dist_comp[0]]

    ## Convert strings to dicts if the distance isn't levenshtein, and
    #  concatenate string columns within a distance component into a single
    #  feature.
    _dataset, _distance = _engineer_distance_features(_dataset, _distance)

    ## Create the nearest neighbors model and set in the model
    knn_model = _gl.nearest_neighbors.create(_dataset, distance=_distance,
                                             method=method, verbose=verbose,
                                             **kwargs)

    ## Postprocessing and formatting
    state.update({'verbose': verbose,
                  'num_examples': dataset.num_rows(),
                  'features': union_features,
                  'num_features': len(union_features),
                  'method': knn_model['method'],
                  'training_time': _time.time() - start_time})

    model = RecordLinker(knn_model, state)
    return model
def create(dataset, target, model_name, features=None,
           validation_set='auto', verbose=True, distributed='auto', **kwargs):
    """
    Create a :class:`~graphlab.toolkits.SupervisedLearningModel`.

    This is a generic function that allows you to create any model that
    implements SupervisedLearningModel. It is normally not called directly;
    call the specific model's create function instead.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    target : string
        Name of the column containing the target variable. The values in this
        column must be 0 or 1, of integer type.

    model_name : string
        Name of the model

    features : list[string], optional
        List of feature names used by feature column. If None, every column
        of `dataset` except `target` is used.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance.
        For each row of the progress table, the chosen metrics are computed
        for both the provided training dataset and the validation_set. The
        format of this SFrame must be the same as the training set.
        By default this argument is set to 'auto' and a validation set is
        automatically sampled and used for progress printing (a random 5%
        split, taken only when the dataset has at least 100 rows). If
        validation_set is set to None, then no additional metrics
        are computed. The default value is 'auto'.

    verbose : boolean
        whether print out messages during training

    distributed : env
        The distributed environment

    kwargs : dict
        Additional parameter options that can be passed. Keys are lowercased
        before being forwarded.

    Returns
    -------
    out : SupervisedLearningModel

    Raises
    ------
    TypeError
        If `validation_set` is a string other than 'auto' or is neither None
        nor an SFrame, or if `features` is not an iterable of strings.
    """

    _raise_error_if_not_sframe(dataset, "training dataset")

    # Create a validation set
    if isinstance(validation_set, str):
        if validation_set == 'auto':
            if dataset.num_rows() >= 100:
                if verbose:
                    print_validation_track_notification()
                dataset, validation_set = dataset.random_split(.95)
            else:
                validation_set = None
        else:
            # Call form of raise: valid on both Python 2 and Python 3.
            raise TypeError('Unrecognized value for validation_set.')

    # Target
    target_sframe = _toolkits_select_columns(dataset, [target])

    # Features
    if features is None:
        features = dataset.column_names()
        features.remove(target)
    if not hasattr(features, '__iter__'):
        raise TypeError("Input 'features' must be a list.")
    for feature in features:
        # Report the offending entry itself; a comprehension variable is not
        # visible outside the comprehension on Python 3.
        if not isinstance(feature, str):
            raise TypeError(
                "Invalid feature %s: Feature names must be of type str"
                % (feature,))
    features_sframe = _toolkits_select_columns(dataset, features)

    # Lowercased user options first, so the required entries below win.
    options = {}
    for key in kwargs:
        options[key.lower()] = kwargs[key]
    options.update({'target': target_sframe,
                    'features': features_sframe,
                    'model_name': model_name})

    if validation_set is not None:

        if not isinstance(validation_set, _graphlab.SFrame):
            raise TypeError("validation_set must be either 'auto' or an "
                            "SFrame matching the training data.")

        # Attempt to append the two datasets together to check schema
        validation_set.head().append(dataset.head())

        options.update({
            'features_validation': _toolkits_select_columns(validation_set,
                                                            features),
            'target_validation': _toolkits_select_columns(validation_set,
                                                          [target])})

    # Train locally unless a distributed environment is requested or active.
    execution_env = get_distributed_execution_environment()
    if distributed == 'auto' and execution_env is None:
        ret = _graphlab.toolkits._main.run("supervised_learning_train",
                                           options, verbose)
        model = SupervisedLearningModel(ret['model'], model_name)
    else:
        ret = _distributed_run("distributed_supervised_train", options,
                               env=distributed, verbose=verbose)
        model = SupervisedLearningModel(ret, model_name)

    return model
def create(dataset, target, model_name, features=None,
           validation_set='auto', verbose=True, distributed='auto', **kwargs):
    """
    Create a :class:`~graphlab.toolkits.SupervisedLearningModel`.

    This is a generic function that allows you to create any model that
    implements SupervisedLearningModel. It is normally not called directly;
    call the specific model's create function instead.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    target : string
        Name of the column containing the target variable. The values in this
        column must be 0 or 1, of integer type.

    model_name : string
        Name of the model

    features : list[string], optional
        List of feature names used by feature column. If None, every column
        of `dataset` except `target` is used.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance.
        For each row of the progress table, the chosen metrics are computed
        for both the provided training dataset and the validation_set. The
        format of this SFrame must be the same as the training set.
        By default this argument is set to 'auto' and a validation set is
        automatically sampled and used for progress printing (a random 5%
        split, taken only when the dataset has at least 100 rows). If
        validation_set is set to None, then no additional metrics
        are computed. The default value is 'auto'.

    verbose : boolean
        whether print out messages during training

    distributed : env
        The distributed environment. Accepted for interface compatibility;
        this function body does not read it.

    kwargs : dict
        Additional parameter options that can be passed. Keys are lowercased
        before being forwarded.

    Returns
    -------
    out : SupervisedLearningModel

    Raises
    ------
    TypeError
        If `validation_set` is a string other than 'auto' or is neither None
        nor an SFrame, or if `features` is not an iterable of strings.
    """

    _raise_error_if_not_sframe(dataset, "training dataset")

    # Create a validation set
    if isinstance(validation_set, str):
        if validation_set == 'auto':
            if dataset.num_rows() >= 100:
                if verbose:
                    print_validation_track_notification()
                dataset, validation_set = dataset.random_split(.95)
            else:
                validation_set = None
        else:
            raise TypeError('Unrecognized value for validation_set.')

    # Target
    target_sframe = _toolkits_select_columns(dataset, [target])

    # Features
    if features is None:
        features = dataset.column_names()
        features.remove(target)
    if not hasattr(features, '__iter__'):
        raise TypeError("Input 'features' must be a list.")
    for feature in features:
        # Report the offending entry itself; a comprehension variable is not
        # visible outside the comprehension on Python 3.
        if not isinstance(feature, str):
            raise TypeError(
                "Invalid feature %s: Feature names must be of type str"
                % (feature,))
    features_sframe = _toolkits_select_columns(dataset, features)

    # Lowercased user options first, so the required entries below win.
    options = {}
    for key in kwargs:
        options[key.lower()] = kwargs[key]
    options.update({'target': target_sframe,
                    'features': features_sframe,
                    'model_name': model_name})

    if validation_set is not None:

        if not isinstance(validation_set, _graphlab.SFrame):
            raise TypeError("validation_set must be either 'auto' or an "
                            "SFrame matching the training data.")

        # Attempt to append the two datasets together to check schema
        validation_set.head().append(dataset.head())

        options.update({
            'features_validation': _toolkits_select_columns(validation_set,
                                                            features),
            'target_validation': _toolkits_select_columns(validation_set,
                                                          [target])})

    ret = _graphlab.toolkits._main.run("supervised_learning_train",
                                       options, verbose)
    model = SupervisedLearningModel(ret['model'], model_name)

    return model
def create_classification_with_model_selector(dataset, target, model_selector,
                                              features=None,
                                              validation_set='auto',
                                              verbose=True):
    """
    Train every classifier proposed by `model_selector` and return the one
    with the best accuracy.

    This is a generic function that is normally not called directly; call a
    specific model's create function instead.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    target : string
        Name of the column containing the target variable.

    model_selector : function
        Called as ``model_selector(num_classes, features_sframe)``, where
        `features_sframe` holds the feature columns (sampled down to roughly
        1e5 rows for large datasets). Must return the internal names of the
        models to try.

    features : list[string], optional
        List of feature names used by feature column. If None, every column
        of `dataset` except `target` is used. The caller's list is not
        modified.

    validation_set : SFrame, optional
        Dataset used to compare the candidate models. With the default
        'auto', a random 5% split of `dataset` is held out when it has at
        least 100 rows; otherwise no validation set is used. None disables
        validation.

    verbose : boolean
        whether print out messages during training

    Returns
    -------
    out : model
        The trained model with the highest recorded accuracy; on ties the
        earliest candidate wins.

    Raises
    ------
    TypeError
        If `features` is not an iterable of strings, or if `validation_set`
        is a string other than 'auto'.

    ValueError
        If a trained model exposes neither a 'progress' nor a
        'validation_accuracy' field.
    """

    # Error checking
    _raise_error_if_not_sframe(dataset, "training dataset")
    if features is None:
        features = dataset.column_names()
    if not hasattr(features, '__iter__'):
        raise TypeError("Input 'features' must be a list.")
    # Work on a copy so dropping the target never edits the caller's list.
    features = list(features)
    if target in features:
        features.remove(target)
    for feature in features:
        # Report the offending entry itself; a comprehension variable is not
        # visible outside the comprehension on Python 3.
        if not isinstance(feature, str):
            raise TypeError(
                "Invalid feature %s: Feature names must be of type str"
                % (feature,))

    # Sample the data
    features_sframe = _toolkits_select_columns(dataset, features)
    if features_sframe.num_rows() > 1e5:
        fraction = 1.0 * 1e5 / features_sframe.num_rows()
        features_sframe = features_sframe.sample(fraction, seed=0)

    # Get available models for this dataset
    num_classes = dataset[target].unique().size()
    selected_model_names = model_selector(num_classes, features_sframe)

    # Create a validation set
    if isinstance(validation_set, str):
        if validation_set == 'auto':
            if dataset.num_rows() >= 100:
                if verbose:
                    print_validation_track_notification()
                dataset, validation_set = dataset.random_split(.95)
            else:
                validation_set = None
        else:
            raise TypeError('Unrecognized value for validation_set.')

    # Match C++ model names with user model names
    python_names = {'boosted_trees_classifier': 'BoostedTreesClassifier',
                    'random_forest_classifier': 'RandomForestClassifier',
                    'classifier_logistic_regression': 'LogisticClassifier',
                    'classifier_svm': 'SVMClassifier',
                    'neuralnet_classifier': 'NeuralNetClassifier',
                    'neuralnet_classifier_v2': 'NeuralNetClassifier'}

    # Print useful user-facing progress messages. Single-argument print(...)
    # behaves the same as a statement (Python 2) and a function (Python 3).
    print('PROGRESS: The following methods are available for this type of '
          'problem.')
    print('PROGRESS: ' + ', '.join([python_names[x]
                                    for x in selected_model_names]))
    if len(selected_model_names) > 1:
        print('PROGRESS: The returned model will be chosen according to '
              'validation accuracy.')

    models = {}
    metrics = {}
    for model_name in selected_model_names:

        # Fit each of the available models
        m = create_selected(model_name, dataset, target, features,
                            validation_set, verbose)
        models[model_name] = m

        # Get the last progress value or validation_accuracy, whichever is
        # there
        if 'progress' in m.list_fields():
            prog = m['progress']
            validation_column = 'Validation-accuracy'
            accuracy_column = 'Training-accuracy'
            if validation_column in prog.column_names():
                metrics[model_name] = float(prog[validation_column].tail(1)[0])
            else:
                metrics[model_name] = float(prog[accuracy_column].tail(1)[0])

        # Validation accuracy (for boosted trees.)
        elif 'validation_accuracy' in m.list_fields():
            metrics[model_name] = m['validation_accuracy']

        else:
            raise ValueError("Model does not have metrics that can be used "
                             "for model selection.")

    # Choose the model with the highest metric; a strict comparison keeps the
    # earliest candidate on ties.
    best_model = None
    best_acc = None
    for model_name in selected_model_names:
        if best_acc is None or best_acc < metrics[model_name]:
            best_model = model_name
            best_acc = metrics[model_name]

    ret = []
    width = 32
    if len(selected_model_names) > 1:
        ret.append('PROGRESS: Model selection based on validation accuracy:')
        ret.append('---------------------------------------------')
        key_str = '{:<{}}: {}'
        for model_name in selected_model_names:
            name = python_names[model_name]
            row = key_str.format(name, width, str(metrics[model_name]))
            ret.append(row)
        ret.append('---------------------------------------------')
        ret.append('Selecting ' + python_names[best_model] +
                   ' based on validation set performance.')
    print('\nPROGRESS: '.join(ret))
    return models[best_model]
def create_with_model_selector(dataset, target, model_selector,
                               features=None, verbose=True):
    """
    Train the single model chosen by `model_selector` and return it wrapped
    in its user-facing model class.

    This is a generic function that is normally not called directly; call a
    specific model's create function instead.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    target : string
        Name of the column containing the target variable.

    model_selector : function
        Called as ``model_selector(features_sframe)``, where
        `features_sframe` holds the feature columns (sampled down to roughly
        1e5 rows for large datasets). Must return the internal name of the
        model to train.

    features : list[string], optional
        List of feature names used by feature column. If None, every column
        of `dataset` except `target` is used. The caller's list is not
        modified.

    verbose : boolean
        whether print out messages during training

    Returns
    -------
    out : model
        A neural net classifier, boosted trees regression/classifier, linear
        regression, logistic classifier, or SVM classifier, depending on the
        selected model name.

    Raises
    ------
    TypeError
        If `features` is not an iterable of strings.

    ToolkitError
        If the selected model name is not one of the recognized names.
    """

    # Error checking
    _raise_error_if_not_sframe(dataset, "training dataset")
    if features is None:
        features = dataset.column_names()
    if not hasattr(features, '__iter__'):
        raise TypeError("Input 'features' must be a list.")
    # Work on a copy so dropping the target never edits the caller's list.
    features = list(features)
    if target in features:
        features.remove(target)
    for feature in features:
        # Report the offending entry itself; a comprehension variable is not
        # visible outside the comprehension on Python 3.
        if not isinstance(feature, str):
            raise TypeError(
                "Invalid feature %s: Feature names must be of type str"
                % (feature,))

    # Sample the data
    features_sframe = _toolkits_select_columns(dataset, features)
    if features_sframe.num_rows() > 1e5:
        fraction = 1.0 * 1e5 / features_sframe.num_rows()
        features_sframe = features_sframe.sample(fraction, seed=0)

    # Run the model selector.
    selected_model_name = model_selector(features_sframe)

    # Neural nets are trained through their own toolkit entry point.
    if selected_model_name == 'neuralnet_classifier':
        return _graphlab.classifier.neuralnet_classifier.create(
            dataset, target, features=features, verbose=verbose)

    # Multi-class through boosted trees
    if ('classifier' in selected_model_name) and \
            (dataset[target].unique().size() > 2):
        selected_model_name = 'boosted_trees_classifier'

    # Create the model
    model = create(dataset, target, selected_model_name,
                   features=features, verbose=verbose)

    # Wrap the generic model's proxy in the matching user-facing class.
    if selected_model_name == 'boosted_trees_regression':
        return _graphlab.boosted_trees_regression.BoostedTreesRegression(
            model.__proxy__)
    elif selected_model_name == 'regression_linear_regression':
        return _graphlab.linear_regression.LinearRegression(
            model.__proxy__)
    elif selected_model_name == 'boosted_trees_classifier':
        return _graphlab.boosted_trees_classifier.BoostedTreesClassifier(
            model.__proxy__)
    elif selected_model_name == 'classifier_logistic_regression':
        return _graphlab.logistic_classifier.LogisticClassifier(
            model.__proxy__)
    elif selected_model_name == 'classifier_svm':
        return _graphlab.svm_classifier.SVMClassifier(model.__proxy__)

    # Call form of raise: valid on both Python 2 and Python 3.
    raise ToolkitError("Internal error: Incorrect model returned.")
def predict_topk(self, dataset, output_type="probability", k=3):
    """
    Return top-k predictions for the ``dataset``, using the trained model.
    Predictions are returned as an SFrame with three columns: `row_id`,
    `class`, and one of `probability`, `rank`, or `score`, depending on the
    ``output_type`` parameter. Input dataset size must be the same as for
    training of the model, except for images which are automatically resized.

    Parameters
    ----------
    dataset : SFrame
        Dataset of new observations. Must include columns with the same
        names as the features used for model training, but does not require
        a target column. Additional columns are ignored.

    output_type : {'probability', 'rank', 'score'}, optional
        Choose the return type of the prediction:

        - `rank`: outputs rank along with class label.
        - `probability`: outputs learned probability along with class label.
        - `score`: Same as probability

    k : int, optional
        Number of classes to return for each input example.

    Returns
    -------
    out : SFrame
        An SFrame with model predictions.

    See Also
    --------
    predict, classify, evaluate

    Examples
    --------
    >>> data = graphlab.SFrame('https://static.turi.com/datasets/mnist/sframe/train')
    >>> training_data, validation_data = data.random_split(0.8)
    >>> net = graphlab.deeplearning.get_builtin_neuralnet('mnist')
    >>> m = graphlab.neuralnet_classifier.create(training_data,
    ...                                          target='label',
    ...                                          network=net,
    ...                                          max_iterations=3)
    ...
    >>> pred = m.predict_topk(validation_data, k=3)
    >>> pred
    +--------+-------+-------------------+
    | row_id | class |    probability    |
    +--------+-------+-------------------+
    |   0    |   4   |   0.995623886585  |
    |   0    |   9   |  0.0038311756216  |
    |   0    |   7   | 0.000301006948575 |
    |   1    |   1   |   0.928708016872  |
    |   1    |   3   |  0.0440889261663  |
    |   1    |   2   |  0.0176190119237  |
    |   2    |   3   |   0.996967732906  |
    |   2    |   2   |  0.00151345680933 |
    |   2    |   7   | 0.000637513934635 |
    |   3    |   1   |   0.998070061207  |
    |  ...   |  ...  |        ...        |
    +--------+-------+-------------------+
    [35688 rows x 3 columns]
    """
    _mt._get_metric_tracker().track(
        'toolkit.classifier.neuralnet_classifier.predict_topk')

    _raise_error_if_not_sframe(dataset, "dataset")

    # Request sent to the toolkit; missing values are treated as an error.
    request = {
        'model': self.__proxy__,
        'model_name': self.__name__,
        'dataset': dataset,
        'output_type': output_type,
        'topk': k,
        'missing_value_action': 'error',
    }
    response = _toolkits_main.run('supervised_learning_predict_topk', request)
    return _map_unity_proxy_to_object(response['predicted'])
def create_classification_with_model_selector(dataset, target, model_selector,
                                              features=None,
                                              validation_set='auto',
                                              verbose=True):
    """
    Train every classifier proposed by `model_selector` and return the one
    with the best accuracy.

    This is a generic function that is normally not called directly; call a
    specific model's create function instead.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    target : string
        Name of the column containing the target variable.

    model_selector : function
        Called as ``model_selector(num_classes, features_sframe)``, where
        `features_sframe` holds the feature columns (sampled down to roughly
        1e5 rows for large datasets). Must return the internal names of the
        models to try.

    features : list[string], optional
        List of feature names used by feature column. If None, every column
        of `dataset` except `target` is used. The caller's list is not
        modified.

    validation_set : SFrame, optional
        Dataset used to compare the candidate models. With the default
        'auto', a random 5% split of `dataset` is held out when it has at
        least 100 rows; otherwise no validation set is used. None disables
        validation.

    verbose : boolean
        whether print out messages during training

    Returns
    -------
    out : model
        The trained model with the highest recorded accuracy; on ties the
        earliest candidate wins.

    Raises
    ------
    TypeError
        If `features` is not an iterable of strings, or if `validation_set`
        is a string other than 'auto'.

    ValueError
        If a trained model exposes neither a 'validation_accuracy' nor a
        'progress' field.
    """

    # Error checking
    _raise_error_if_not_sframe(dataset, "training dataset")
    if features is None:
        features = dataset.column_names()
    if not hasattr(features, '__iter__'):
        raise TypeError("Input 'features' must be a list.")
    # Work on a copy so dropping the target never edits the caller's list.
    features = list(features)
    if target in features:
        features.remove(target)
    for feature in features:
        # Report the offending entry itself; a comprehension variable is not
        # visible outside the comprehension on Python 3.
        if not isinstance(feature, str):
            raise TypeError(
                "Invalid feature %s: Feature names must be of type str"
                % (feature,))

    # Sample the data
    features_sframe = _toolkits_select_columns(dataset, features)
    if features_sframe.num_rows() > 1e5:
        fraction = 1.0 * 1e5 / features_sframe.num_rows()
        features_sframe = features_sframe.sample(fraction, seed=0)

    # Get available models for this dataset
    num_classes = dataset[target].unique().size()
    selected_model_names = model_selector(num_classes, features_sframe)

    # Create a validation set
    if isinstance(validation_set, str):
        if validation_set == 'auto':
            if dataset.num_rows() >= 100:
                if verbose:
                    print_validation_track_notification()
                dataset, validation_set = dataset.random_split(.95)
            else:
                validation_set = None
        else:
            raise TypeError('Unrecognized value for validation_set.')

    # Match C++ model names with user model names
    python_names = {'boosted_trees_classifier': 'BoostedTreesClassifier',
                    'random_forest_classifier': 'RandomForestClassifier',
                    'decision_tree_classifier': 'DecisionTreeClassifier',
                    'classifier_logistic_regression': 'LogisticClassifier',
                    'classifier_svm': 'SVMClassifier',
                    'neuralnet_classifier': 'NeuralNetClassifier',
                    'neuralnet_classifier_v2': 'NeuralNetClassifier'}

    # Print useful user-facing progress messages
    if verbose:
        print('PROGRESS: The following methods are available for this type '
              'of problem.')
        print('PROGRESS: ' + ', '.join([python_names[x]
                                        for x in selected_model_names]))
        if len(selected_model_names) > 1:
            print('PROGRESS: The returned model will be chosen according to '
                  'validation accuracy.')

    models = {}
    metrics = {}
    for model_name in selected_model_names:

        # Fit each of the available models
        m = create_selected(model_name, dataset, target, features,
                            validation_set, verbose)
        models[model_name] = m

        if 'validation_accuracy' in m.list_fields():
            metrics[model_name] = m['validation_accuracy']

        # Most models have this.
        elif 'progress' in m.list_fields():
            prog = m['progress']
            validation_column = 'Validation-accuracy'
            accuracy_column = 'Training-accuracy'
            if validation_column in prog.column_names():
                metrics[model_name] = float(prog[validation_column].tail(1)[0])
            else:
                metrics[model_name] = float(prog[accuracy_column].tail(1)[0])
        else:
            raise ValueError("Model does not have metrics that can be used "
                             "for model selection.")

    # Choose the model with the highest metric; a strict comparison keeps the
    # earliest candidate on ties, and a None metric is never compared.
    best_model = None
    best_acc = None
    for model_name in selected_model_names:
        if best_acc is None or best_acc < metrics[model_name]:
            best_model = model_name
            best_acc = metrics[model_name]

    ret = []
    width = 32
    if len(selected_model_names) > 1:
        ret.append('PROGRESS: Model selection based on validation accuracy:')
        ret.append('---------------------------------------------')
        key_str = '{:<{}}: {}'
        for model_name in selected_model_names:
            name = python_names[model_name]
            row = key_str.format(name, width, str(metrics[model_name]))
            ret.append(row)
        ret.append('---------------------------------------------')
        ret.append('Selecting ' + python_names[best_model] +
                   ' based on validation set performance.')

    if verbose:
        print('\nPROGRESS: '.join(ret))
    return models[best_model]
def classify(self, dataset, max_neighbors=10, radius=None, verbose=True):
    """
    Predict the most likely class of every row in ``dataset`` by letting the
    nearest reference points stored in the model vote.

    Parameters
    ----------
    dataset : SFrame
        Dataset of new observations. Must include columns with the same
        names as the features used for model training, but does not require
        a target column. Additional columns are ignored.

    max_neighbors : int, optional
        Maximum number of neighbors to consider for each point.

    radius : float, optional
        Maximum distance from each point to a neighbor in the reference
        dataset.

    verbose : bool, optional
        If True, print progress updates.

    Returns
    -------
    out : SFrame
        An SFrame with model predictions. The first column is the most
        likely class according to the model, and the second column is the
        predicted probability for that class.

    Raises
    ------
    ValueError
        If ``max_neighbors`` is not None and is either not an integer or
        not strictly positive.

    See Also
    --------
    create, predict, predict_topk

    Notes
    -----
    - If the 'radius' parameter is small, it is possible that a query point
      has no qualified neighbors in the training dataset. In this case, the
      resulting class and probability for that query are 'None' in the
      SFrame output by this method. If the target column in the training
      dataset has missing values, these predictions will be ambiguous.

    - Ties between predicted classes are broken randomly.

    Examples
    --------
    >>> sf_train = graphlab.SFrame({'species': ['cat', 'dog', 'fossa', 'dog'],
    ...                             'height': [9, 25, 20, 23],
    ...                             'weight': [13, 28, 33, 22]})
    ...
    >>> sf_new = graphlab.SFrame({'height': [26, 19],
    ...                           'weight': [25, 35]})
    ...
    >>> m = graphlab.nearest_neighbor_classifier.create(sf_train,
    ...                                                 target='species')
    >>> ystar = m.classify(sf_new, max_neighbors=2)
    >>> print ystar
    +-------+-------------+
    | class | probability |
    +-------+-------------+
    |  dog  |     1.0     |
    | fossa |     0.5     |
    +-------+-------------+
    """
    _mt._get_metric_tracker().track(
        'toolkit.classifier.nearest_neighbor_classifier.classify')

    ## The query frame must be a non-empty SFrame. 'radius' is left for the
    #  nearest neighbors query call to validate.
    _raise_error_if_not_sframe(dataset, "dataset")
    _raise_error_if_sframe_empty(dataset, "dataset")
    num_queries = dataset.num_rows()

    ## 'max_neighbors' is checked here because the nearest neighbors query
    #  uses a different parameter name ('k').
    #  - NOTE: when the parameter name is changed in nearest neighbors, the
    #    query call will do this itself, and this block can be removed.
    if max_neighbors is not None:
        if not isinstance(max_neighbors, int):
            raise ValueError("Input 'max_neighbors' must be an integer.")

        if max_neighbors <= 0:
            raise ValueError("Input 'max_neighbors' must be larger than 0.")

    ## Retrieve the neighbors of every query point.
    neighbors = self._knn_model.query(dataset, k=max_neighbors,
                                      radius=radius, verbose=verbose)

    ## No neighbor for any query: every prediction is missing.
    if neighbors.num_rows() == 0:
        return _gl.SFrame({'class': [None] * num_queries,
                           'probability': [None] * num_queries})

    ## Count votes per (query, class) pair, then keep the winning class of
    #  each query along with its share of that query's votes.
    vote_counts = neighbors.groupby(['query_label', 'reference_label'],
                                    _gl.aggregate.COUNT)

    vote_summary = {
        'class': _gl.aggregate.ARGMAX('Count', 'reference_label'),
        'max_votes': _gl.aggregate.MAX('Count'),
        'total_votes': _gl.aggregate.SUM('Count')}
    predictions = vote_counts.groupby('query_label', vote_summary)
    predictions['probability'] = \
        predictions['max_votes'] / predictions['total_votes']

    ## A right join against every query row number restores the queries that
    #  had no neighbor (as 'None' rows).
    all_queries = _gl.SFrame({'query_label': range(num_queries)})
    predictions = predictions.join(all_queries, how='right')

    ## The row number is not part of the output, so restore the input order
    #  before dropping it.
    predictions = predictions.sort('query_label', ascending=True)
    return predictions[['class', 'probability']]
def predict(self, dataset, missing_value_action='auto', output_type='',
            options=None, **kwargs):
    """
    Return predictions for ``dataset``, using the trained
    supervised_learning model. Predictions are generated as class labels
    (0 or 1).

    Parameters
    ----------
    dataset : SFrame | list | dict
        Dataset of new observations. Must include columns with the same
        names as the features used for model training, but does not require
        a target column. Additional columns are ignored. A list, or a single
        dict (which is wrapped into a one-element list), is sent through the
        low-latency ``_fast_predict`` extension instead of the batch path.

    missing_value_action : str, optional
        Action to perform when missing values are encountered. This can be
        one of:

        - 'auto': Choose a model dependent missing value policy.
        - 'impute': Proceed with evaluation by filling in the missing
          values with the mean of the training data. Missing values are
          also imputed if an entire column of data is missing during
          evaluation.
        - 'none': Treat missing value as is. Model must be able to handle
          missing value.
        - 'error' : Do not proceed with prediction and terminate with an
          error message.

    output_type : str, optional
        output type that maybe needed by some of the toolkits

    options : dict, optional
        Additional options to be passed in to prediction. Only used on the
        batch (SFrame) path. The dictionary is copied, never modified.

    kwargs : dict
        Additional options to be passed into prediction. Only used on the
        batch (SFrame) path, where they override entries of ``options``.

    Returns
    -------
    out : SArray
        An SArray with model predictions.
    """
    if missing_value_action == 'auto':
        missing_value_action = select_default_missing_value_policy(
            self, 'predict')

    # Low latency path: rows given as plain Python objects.
    if isinstance(dataset, list):
        return _graphlab.extensions._fast_predict(
            self.__proxy__, dataset, output_type, missing_value_action)
    if isinstance(dataset, dict):
        return _graphlab.extensions._fast_predict(
            self.__proxy__, [dataset], output_type, missing_value_action)

    # Batch predictions path
    _raise_error_if_not_sframe(dataset, "dataset")

    # 'options' defaults to None rather than a shared mutable dict; work on
    # a private copy so the caller's dictionary is never modified.
    options = {} if options is None else options.copy()
    options.update(kwargs)

    # The reserved keys are written last so user options cannot override
    # them.
    options.update({'model': self.__proxy__,
                    'model_name': self.__name__,
                    'dataset': dataset,
                    'missing_value_action': missing_value_action,
                    'output_type': output_type})

    target = _graphlab.toolkits._main.run(
        'supervised_learning_predict', options)
    return _map_unity_proxy_to_object(target['predicted'])
def create(datasets, row_label=None, features=None, grouping_features=None,
           distance=None, k=2, radius=None, verbose=True):
    """
    Create a deduplication model based on nearest neighbors and SGraph
    connected components.

    This method creates a :class:`NearestNeighborDeduplication` model by
    constructing a nearest neighbors similarity graph on all of the rows in
    the input 'datasets', then using the connected components tool in the
    :mod:`~graphlab.toolkits.graph_analytics` module to assign an entity
    label to each record. Records which share the same label are considered
    to be duplicates.

    .. warning::

        The 'dot_product' distance is deprecated and will be removed in
        future versions of GraphLab Create. Please use
        'transformed_dot_product' distance instead, although note that this
        is more than a name change; it is a *different* transformation of
        the dot product of two vectors. Please see the distances module
        documentation for more details.

    Parameters
    ----------
    datasets : SFrame or list[SFrame] or dict(string: SFrame)
        Input datasets. Each SFrame in the list must include all of the
        features specified in the `features` or 'distance' parameters, but
        may have additional columns as well. SFrames can be input as values
        in a dictionary, where the keys are strings used in the output to
        identify the SFrame from which each record originated.

    row_label : string, optional
        Name of the SFrame column with row labels. If not specified, row
        numbers are used to identify rows in the output.

    features : list[string], optional
        Name of the columns with features to use in comparing records.
        'None' (the default) indicates the intersection of columns over all
        SFrames in `datasets` should be used (except the label column, if
        specified). Each column can be one of the following types:

        - *Numeric*: values of numeric type integer or float.

        - *Array*: array of numeric (integer or float) values. Each array
          element is treated as a separate variable in the model.

        - *Dictionary*: key-value pairs with numeric (integer or float)
          values. Each key indicates a separate variable in the model.

        - *String*: string values.

        Please note: if `distance` is specified as a composite distance,
        then that parameter controls which features are used in the model.
        Any additional columns named in 'features' will be included in the
        model output but not used for distance computations.

    grouping_features : list[string], optional
        Names of features to use in grouping records before finding
        approximate matches. These columns must have string or integer type
        data. See the Notes section for more details on grouping.

    distance : string or list[list], optional
        Function to measure the distance between any two input data rows.
        This may be one of two types:

        - *String*: the name of a standard distance function. One of
          'euclidean', 'squared_euclidean', 'manhattan', 'levenshtein',
          'jaccard', 'weighted_jaccard', 'cosine', 'dot_product'
          (deprecated), or 'transformed_dot_product'.

        - *Composite distance*: the weighted sum of several standard
          distance functions applied to various features. This is specified
          as a list of distance components, each of which is itself a list
          containing three items:

          1. list or tuple of feature names (strings)

          2. standard distance name (string)

          3. scaling factor (int or float)

        For more information about GraphLab Create distance functions,
        please see the :py:mod:`~graphlab.toolkits.distances` module.

        For sparse vectors, missing keys are assumed to have value 0.0.

        If 'distance' is left unspecified or set to 'auto', a composite
        distance is constructed automatically based on feature types.

    k : int, optional
        Number of neighbors to consider for each point.

    radius : float, optional
        Maximum distance from each point to a potential duplicate.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : NearestNeighborDeduplication model
        The NearestNeighborDeduplication object contains a field 'entities'
        which shows the entity label for each input record. It also shows
        the features for each record that are used to construct the model,
        as well as the original SFrame and row label for each record. If
        the original `datasets` are passed in a list, the SFrame identifier
        is the index of the SFrame in that list.

    Raises
    ------
    TypeError
        If 'datasets', 'row_label', 'features', 'grouping_features' or
        'distance' does not have one of the accepted types.
    ValueError
        If 'datasets' is a dict with non-string keys, or contains no SFrame.
    ToolkitError
        If 'row_label' is missing from one of the datasets, or if the
        default label 'row_number' collides with an existing column.

    See Also
    --------
    NearestNeighborDeduplication, graphlab.toolkits.nearest_neighbors,
    graphlab.SFrame.groupby

    Notes
    -----
    - Standardizing features is often a good idea with distance-based
      methods, but this model does *not* standardize features.

    - For datasets with more than about 10,000 records, *grouping* (also
      known as *blocking*) is a critical step to avoid computing distances
      between all pairs of records. The grouping step simply assigns each
      record to a group that has identical values for all
      `grouping_features`, and only looks for duplicates within each group.

    - Records with missing data in the `grouping_features` are removed from
      consideration as duplicates. These records are given the entity label
      "None".

    - For tasks that require *only* exact matches on certain features, it is
      generally more natural to use the SFrame `groupby` function.

    - For features that all have the same type, the distance parameter may
      be a single standard distance function name (e.g. "euclidean"). In the
      model, however, all distances are first converted to composite
      distance functions; as a result, the 'distance' field in the model is
      always a composite distance.

    References
    ----------
    - Christen, Peter. "Data matching: concepts and techniques for record
      linkage, entity resolution, and duplicate detection." Springer Science
      & Business Media, 2012.

    Examples
    --------
    >>> sf1 = graphlab.SFrame({'id': [0, 1, 2],
    ...                        'x0': [0.5, 0.5, 0.3],
    ...                        'x1': [1., 0.8, 0.6],
    ...                        'city': ['seattle', 'olympia', 'boston'],
    ...                        'state': ['WA', 'WA', 'MA']})
    ...
    ... # note: misspellings in the following dataset do not prevent correct
    ... # matches.
    >>> sf2 = graphlab.SFrame({'id': [9, 10],
    ...                        'x0': [0.35, 0.4],
    ...                        'x1': [0.65, 0.8],
    ...                        'city': ['bostan', 'seatle'],
    ...                        'state': ['MA', 'WA']})
    ...
    >>> dist = [[('city',), 'levenshtein', 2],
    ...         [('x0', 'x1'), 'euclidean', 1.5]]
    ...
    >>> m = graphlab.nearest_neighbor_deduplication.create({'a': sf1, 'b': sf2},
    ...                                                    row_label='id',
    ...                                                    grouping_features=['state'],
    ...                                                    distance=dist, k=None,
    ...                                                    radius=3)
    ...
    >>> print m['entities']
    +----------+----+----------+-------+------+---------+------+
    | __sframe | id | __entity | state |  x0  |   city  |  x1  |
    +----------+----+----------+-------+------+---------+------+
    |    a     | 1  |    0     |   WA  | 0.5  | olympia | 0.8  |
    |    a     | 0  |    1     |   WA  | 0.5  | seattle | 1.0  |
    |    b     | 10 |    1     |   WA  | 0.4  |  seatle | 0.8  |
    |    a     | 2  |    2     |   MA  | 0.3  |  boston | 0.6  |
    |    b     | 9  |    2     |   MA  | 0.35 |  bostan | 0.65 |
    +----------+----+----------+-------+------+---------+------+
    [5 rows x 7 columns]
    """

    ## Set up
    _mt._get_metric_tracker().track('{}.create'.format(__name__))
    start_time = _time.time()

    model = NearestNeighborDeduplication()
    model._state['verbose'] = verbose
    model._state['k'] = k
    model._state['radius'] = radius


    ### ----------------------------- ###
    ### Validation and preprocessing ###
    ### ----------------------------- ###

    ### Validate input datasets
    ### -----------------------

    ## If datasets is already a dict, check the keys are all strings
    if isinstance(datasets, dict):
        if not all([isinstance(x, str) for x in datasets.keys()]):
            raise ValueError("Keys in the 'datasets' dict must be strings.")

    ## Convert singleton SFrame dataset into a dict of datasets
    if isinstance(datasets, _gl.SFrame):
        _raise_error_if_sframe_empty(datasets, "dataset")
        datasets = {0: datasets}

    ## Convert a list of SFrames into a dict keyed by list position
    if isinstance(datasets, list):
        datasets = {idx: sf for idx, sf in enumerate(datasets)}

    ## At this point, 'datasets' must be dict. If it's not, something is wrong.
    if not isinstance(datasets, dict):
        raise TypeError("Input 'datasets' must be an SFrame, a list of SFrames, " +
                        "or a dictionary of (string, SFrame) pairs.")

    ## An empty collection would otherwise fail later with an opaque
    #  IndexError when the first dataset is inspected.
    if len(datasets) == 0:
        raise ValueError("Input 'datasets' must contain at least one SFrame.")

    model._state['num_datasets'] = len(datasets)

    ## Ensure that all datasets are SFrames
    for d in datasets.values():
        _raise_error_if_not_sframe(d, "dataset")


    ### Validate row label
    ### ------------------

    ## Validate the label column
    if row_label:
        if not isinstance(row_label, str):
            raise TypeError("The 'row_label' parameter must be the name (string " +
                            "type) of a column in each of the input datasets.")

        for d in datasets.values():
            if row_label not in d.column_names():
                raise _ToolkitError("The specified row_label column does not " +
                                    " exist in all input datasets.")
    else:
        row_label = 'row_number'

        for d in datasets.values():
            if row_label in d.column_names():
                raise _ToolkitError("Input 'row_label' defaulted to " +
                                    "'row_number', which is already a column" +
                                    " in at least one input dataset. Please " +
                                    "specify a row label column manually.")

    model._state['row_label'] = row_label


    ### Validate 'features' and 'grouping_features' parameters
    ### ------------------------------------------------------
    if features is not None:
        if not hasattr(features, '__iter__'):
            raise TypeError("Input 'features' must be a list.")

        if not all([isinstance(x, str) for x in features]):
            raise TypeError("Input 'features' must contain only strings.")

    if grouping_features is not None:
        if not hasattr(grouping_features, '__iter__'):
            raise TypeError("Input 'grouping_features' must be a list.")

        if not all([isinstance(x, str) for x in grouping_features]):
            raise TypeError("Input 'grouping_features' must contain only strings.")


    ### Validate and preprocess the distance function
    ### ---------------------------------------------
    # - The form of the 'distance' controls how we interact with the 'features'
    #   parameter as well.

    ## Find the intersection of all feature sets and feature types. Column
    #  types are taken from the first dataset only. 'values()' is
    #  materialized as a list because it is not indexable on Python 3.
    first_sf = list(datasets.values())[0]
    col_types = dict(zip(first_sf.column_names(), first_sf.column_types()))

    all_features = [sf.column_names() for sf in datasets.values()]
    ftr_intersection = list(set(all_features[0]).intersection(*all_features))
    ftr_intersection = [x for x in ftr_intersection if x != row_label]

    ## Convert features and distance arguments into a composite distance.
    #  Without an explicit 'features' list, the columns shared by every
    #  dataset are used.
    if features is not None:
        candidate_features = features
    else:
        candidate_features = ftr_intersection

    if isinstance(distance, list):
        distance = _copy.deepcopy(distance)

    elif distance is None or distance == 'auto':
        # 'auto' is documented as equivalent to leaving 'distance' unset.
        distance = _construct_auto_distance(candidate_features, col_types)

    elif isinstance(distance, str):
        distance = [[candidate_features, distance, 1]]

    else:
        raise TypeError("Input 'distance' not understood. Note that for the " +
                        "data matching toolkit, 'distance' must be a string or " +
                        "a composite distance list."   )

    ## Validate the form of the composite distance and add to the model
    allowed_dists = {
        'euclidean': [int, float, _array.array],
        'squared_euclidean': [int, float, _array.array],
        'manhattan': [int, float, _array.array],
        'levenshtein': [str],
        'jaccard': [str, dict],
        'weighted_jaccard': [str, dict],
        'cosine': [int, float, str, dict, _array.array],
        'dot_product': [int, float, str, dict, _array.array],
        'transformed_dot_product': [int, float, str, dict, _array.array]}

    distance = _dmutl.validate_composite_distance(distance, row_label,
                                                  list(allowed_dists.keys()),
                                                  verbose)
    model._state['distance'] = _copy.deepcopy(distance)

    ## Figure out which features are 'fuzzy', i.e. used for approximate
    #  matching, and set in the model state.
    fuzzy_features = _dmutl.extract_composite_features(distance)  # already has row_label removed

    model._state['features'] = fuzzy_features
    model._state['num_features'] = len(fuzzy_features)

    ## Compile a master list of all features. This includes grouping features,
    #  fuzzy features (the ones used for approximate matching), and "ancillary"
    #  features, which are named by the user in the 'features' parameter but
    #  not included in the composite distance specification.
    if features is None:
        features = []
    else:
        features = [x for x in features if x != row_label]

    if grouping_features is None:
        grouping_features = []
    else:
        grouping_features = [x for x in grouping_features if x != row_label]

    model._state['grouping_features'] = grouping_features
    model._state['num_grouping_features'] = len(grouping_features)

    master_features = list(set(features + grouping_features + fuzzy_features))


    ### Consolidate data and engineer features
    ### --------------------------------------

    ## Consolidate multiple input datasets into a single SFrame, with a useful
    #  row label.
    sf_union = _dmutl.concatenate_sframes(datasets, row_label=row_label,
                                          features=master_features,
                                          sf_index_name='__sframe')

    overall_label = '__sframe.' + row_label
    sf_union[overall_label] = (sf_union['__sframe'].astype(str) + "." +
                               sf_union[row_label].astype(str))

    ## Validate the feature types in the consolidated dataset against the
    #  specified distance functions.
    _dmutl.validate_distance_feature_types(sf_union, distance, allowed_dists)

    ## Clean string-type features in the fuzzy feature set. The cleaned copy
    #  goes into a new '__clean.' column and replaces the raw column name in
    #  every distance component.
    for ftr in fuzzy_features:
        if col_types[ftr] == str:
            new_ftr = '__clean.' + ftr
            sf_union[new_ftr] = sf_union[ftr].fillna("")
            sf_union[new_ftr] = sf_union[new_ftr].apply(
                lambda x: _dmutl.cleanse_string(x), dtype=str)

            for dist_comp in distance:
                dist_comp[0] = [new_ftr if x == ftr else x for x in dist_comp[0]]

    ## Feature engineering, distance-component-wise. Also update list of
    #  features and a map to their types.
    sf_union, distance = _engineer_distance_features(sf_union, distance)
    transformed_features = _dmutl.extract_composite_features(distance)


    ### -------------------------------------------- ###
    ### Main loop over blocks of neighbor candidates ###
    ### -------------------------------------------- ###

    ## Construct blocks on features that must match exactly
    if verbose:
        _logging.info("Constructing groups of records that match exactly on " +
                      "the 'grouping_features'.")

    sf_union, block_errors, blocks = \
        _dmutl.construct_exact_blocks(sf_union, grouping_features)

    if verbose and len(distance) > 0 and blocks['Count'].max() > 10000:
        _logging.warning("There are more than 10,000 records in the largest match " +
            "group. For many uses, approximate matches within each match group are " +
            "computed with brute force nearest neighbors, which may be slow. " +
            "Consider using smaller groups by requiring different features to " +
            "match exactly.")

    max_entity_number = 0
    sf_entity = _gl.SFrame()
    output_features = (master_features + [row_label, '__sframe', '__entity'])

    ## Main loop over blocks
    for i, block in enumerate(blocks):

        if verbose:
            _logging.info("Processing {} records in match group: {}/{}".format(block['Count'],
                                                                   i+1,
                                                                   len(blocks)))

        ## Retrieve records in the block and impute the mean for missing numeric
        #  values.
        records = sf_union[block['min_idx']:(block['max_idx'] + 1)]
        complete_records = _dmutl.impute_numeric_means(records, transformed_features)

        if len(distance) > 0:
            ## Run all-point nearest neighbors
            if verbose:
                _logging.info("Building the similarity graph....")

            m = _gl.nearest_neighbors.create(complete_records, label=overall_label,
                                             distance=distance, verbose=False)
            knn = m.query(complete_records, label=overall_label, k=k, radius=radius,
                          verbose=verbose)

            ## Construct similarity graph to resolve transitive closure
            sg = _gl.SGraph()
            sg = sg.add_vertices(records[[overall_label]], vid_field=overall_label)
            sg = sg.add_edges(knn, src_field='query_label',
                              dst_field='reference_label')

            ## Cut the similarity graph to establish an entity for each vertex
            if verbose:
                _logging.info("Finding duplicate records in the similarity graph....")

            cc = _gl.connected_components.create(sg, verbose=verbose)

            ## Relabel the component IDs to be consecutive integers starting with
            #  the max index of the previous block's entity labels.
            block_labels = cc['component_size'].add_row_number('__entity')
            block_labels['__entity'] += max_entity_number
            max_entity_number += block_labels.num_rows()
            block_entity_labels = cc['component_id'].join(block_labels,
                                                          on='component_id',
                                                          how='left')

            ## Join the entity labels for the block back to the block's records,
            #  then append to the master output
            records = records.join(block_entity_labels[['__id', '__entity']],
                                   on={overall_label: '__id'}, how='left')
            records = records.sort('__entity')

        else:  # no fuzzy features, so no nearest neighbors, just block ID
            records['__entity'] = _gl.SArray.from_const(i, len(records))
            # Each block is exactly one entity on this path; keep the running
            # total in step so 'num_entities' is not reported as 0.
            max_entity_number = i + 1

        sf_entity = sf_entity.append(records[output_features])


    ### ------------------------------------- ###
    ### Postprocessing and results formatting ###
    ### ------------------------------------- ###

    ## Add rows missing from the blocking back to the master results
    if len(block_errors) > 0:
        block_errors['__entity'] = _gl.SArray.from_const(None, len(block_errors)).astype(int)
        sf_entity = sf_entity.append(block_errors[output_features])

    ## Rearrange columns so the SFrame key, row label and entity come first
    sf_entity.swap_columns('__sframe', sf_entity.column_names()[0])
    sf_entity.swap_columns(row_label, sf_entity.column_names()[1])
    sf_entity.swap_columns('__entity', sf_entity.column_names()[2])

    ## Finalize the model state
    model._state['training_time'] = _time.time() - start_time
    model._state['entities'] = sf_entity
    model._state['num_entities'] = max_entity_number

    return model
def create(dataset, target, model_name, features=None,
           validation_set = None, verbose = True, **kwargs):
    """
    Create a :class:`~graphlab.toolkits.SupervisedLearningModel`,

    This is generic function that allows you to create any model that
    implements SupervisedLearningModel. This function is normally not
    called; call the specific model's create function instead.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    target : string
        Name of the column containing the target variable. The values in
        this column must be 0 or 1, of integer type.

    model_name : string
        Name of the model

    features : list[string], optional
        List of feature names used by feature column. 'None' (the default)
        selects every column of ``dataset`` except ``target``.

    validation_set : SFrame, optional
        The validation set that is used to watch the validation result as
        boosting progress. The same ``features`` and ``target`` columns are
        selected from it.

    verbose : boolean
        whether print out messages during training

    kwargs : dict
        Additional parameter options that can be passed. Option names are
        lower-cased before being handed to the training routine.

    Returns
    -------
    out : SupervisedLearningModel
        The model built from the result of the 'supervised_learning_train'
        toolkit call.

    Raises
    ------
    TypeError
        If ``features`` is not a list-like collection of strings.
    """
    _raise_error_if_not_sframe(dataset, "training dataset")

    # Target
    target_sframe = _toolkits_select_columns(dataset, [target])

    # Features
    if features is None:
        features = dataset.column_names()
        features.remove(target)

    # A bare string is iterable on Python 3, so reject it explicitly instead
    # of silently treating every character as a column name.
    if isinstance(features, str) or not hasattr(features, '__iter__'):
        raise TypeError("Input 'features' must be a list.")

    # Report the offending element itself. The previous message relied on a
    # leaked comprehension variable, which named the wrong element on
    # Python 2 and raised NameError on Python 3.
    invalid = [x for x in features if not isinstance(x, str)]
    if invalid:
        raise TypeError(
            "Invalid feature %s: Feature names must be of type str"
            % (invalid[0],))

    features_sframe = _toolkits_select_columns(dataset, features)

    # User options first (keys lower-cased), then the reserved entries so
    # they always take precedence.
    options = {}
    options.update({name.lower(): value for name, value in kwargs.items()})
    options.update({'target': target_sframe,
                    'features': features_sframe,
                    'model_name': model_name})

    if validation_set is not None:
        options.update({
            'features_validation': _toolkits_select_columns(validation_set,
                                                            features),
            'target_validation': _toolkits_select_columns(validation_set,
                                                          [target])})

    ret = _graphlab.toolkits._main.run("supervised_learning_train", options,
                                       verbose=verbose)
    model = SupervisedLearningModel(ret['model'], model_name)

    return model
def create(dataset, features=None, distance=None, method='auto', verbose=True,
           **kwargs):
    """
    Create a RecordLinker model to match query records to a reference
    dataset of records, assuming both sets have the same general form.

    Parameters
    ----------
    dataset : SFrame
        Reference data, against which to link new queries with the 'link'
        method. The 'dataset' SFrame must include at least the features
        specified in the 'features' or 'distance' parameter.

    features : list[string], optional
        Name of the columns with features to use in comparing records.
        'None' (the default) indicates that all columns should be used.
        Each column can be one of the following types:

        - *Numeric*: values of numeric type integer or float.

        - *Array*: array of numeric (integer or float) values. Each array
          element is treated as a separate variable in the model.

        - *Dictionary*: key-value pairs with numeric (integer or float)
          values. Each key indicates a separate variable in the model.

        - *String*: string values.

        Please note: if 'distance' is specified as a composite distance,
        then that parameter controls which features are used in the model.

    distance : string or list[list], optional
        Function to measure the distance between any two input data rows.
        This may be one of two types:

        - *String*: the name of a standard distance function. One of
          'euclidean', 'squared_euclidean', 'manhattan', 'levenshtein',
          'jaccard', 'weighted_jaccard', 'cosine', 'dot_product'
          (deprecated), or 'transformed_dot_product'.

        - *Composite distance*: the weighted sum of several standard
          distance functions applied to various features. This is specified
          as a list of distance components, each of which is itself a list
          containing three items:

          1. list or tuple of feature names (strings)

          2. standard distance name (string)

          3. scaling factor (int or float)

        For more information about GraphLab Create distance functions,
        please see the :py:mod:`~graphlab.toolkits.distances` module.

        For sparse vectors, missing keys are assumed to have value 0.0.

        If 'distance' is left unspecified or set to 'auto', a composite
        distance is constructed automatically based on feature types.

    method : {'auto', 'brute_force', 'lsh', 'ball_tree'}, optional
        Strategy for the nearest neighbors search. If not specified or
        'auto', the search strategy is chosen automatically based on the
        data type and dimension.

    verbose : bool, optional
        If True, print progress updates and model details.

    **kwargs : optional
        Options passed through to the nearest_neighbors toolkit for
        particular nearest neighbors search strategies:

        - *leaf_size*: for the ball tree method, the number of points in
          each leaf of the tree. The default is to use the max of 1,000 and
          n/(2^11), which ensures a maximum tree depth of 12.

        - *num_tables*: For the LSH method, the number of hash tables
          constructed.

        - *num_projections_per_table*: For the LSH method, the number of
          projections for each hash table.

    Returns
    -------
    out : RecordLinker model.

    Raises
    ------
    TypeError
        If 'features' is not a list-like collection of strings, or
        'distance' is neither a string nor a composite distance list.
    ToolkitError
        If an unsupported keyword argument is passed.

    See Also
    --------
    RecordLinker, graphlab.toolkits.nearest_neighbors

    Notes
    -----
    - Standardizing features is often a good idea with distance-based
      methods, but this model does *not* standardize features.

    - For features that all have the same type, the distance parameter may
      be a single standard distance function name (e.g. "euclidean"). In the
      model, however, all distances are first converted to composite
      distance functions; as a result, the 'distance' field in the model is
      always a composite distance.

    References
    ----------
    - Christen, Peter. "Data matching: concepts and techniques for record
      linkage, entity resolution, and duplicate detection." Springer Science
      & Business Media, 2012.

    Examples
    --------
    >>> homes = graphlab.SFrame({'sqft': [1230, 875, 1745],
    ...                          'street': ['phinney', 'fairview', 'cottage'],
    ...                          'city': ['seattle', 'olympia', 'boston'],
    ...                          'state': ['WA', 'WA', 'MA']})
    ...
    >>> model = graphlab.record_linker.create(homes, features=['city'],
    ...                                       distance='levenshtein')
    """
    _mt._get_metric_tracker().track('{}.create'.format(__name__))
    start_time = _time.time()

    ## Validate the 'dataset' input.
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    ## Clean the method options and create the options dictionary
    allowed_kwargs = ['leaf_size', 'num_tables', 'num_projections_per_table']
    _method_options = {}

    for k, v in kwargs.items():
        if k in allowed_kwargs:
            _method_options[k] = v
        else:
            raise _ToolkitError(
                "'{}' is not a valid keyword argument".format(k) +
                " for the nearest neighbors model. Please " +
                "check for capitalization and other typos.")

    ## Validate the features input.
    if features is not None:
        if not hasattr(features, '__iter__'):
            raise TypeError("Input 'features' must be a list.")

        if not all([isinstance(x, str) for x in features]):
            raise TypeError("Input 'features' must contain only strings.")

    else:
        features = dataset.column_names()

    ## Validate and preprocess the distance input.
    col_types = dict(zip(dataset.column_names(), dataset.column_types()))

    if isinstance(distance, list):
        distance = _copy.deepcopy(distance)

    elif distance is None or distance == 'auto':
        # 'auto' is documented as equivalent to leaving 'distance' unset.
        distance = _construct_auto_distance(features, col_types)

    elif isinstance(distance, str):
        # this will likely produce errors downstream if 'features' was not
        # specified by the user.
        distance = [[features, distance, 1]]

    else:
        raise TypeError(
            "Input 'distance' not understood. For the " +
            "data matching toolkit, 'distance' must be a string or " +
            "a composite distance list.")

    ## Validate the composite distance and set it in the model.
    allowed_dists = {
        'euclidean': [int, float, _array.array],
        'squared_euclidean': [int, float, _array.array],
        'manhattan': [int, float, _array.array],
        'levenshtein': [str],
        'jaccard': [str, dict],
        'weighted_jaccard': [str, dict],
        'cosine': [int, float, str, dict, _array.array],
        'dot_product': [int, float, str, dict, _array.array],
        'transformed_dot_product': [int, float, str, dict, _array.array]
    }

    distance = _dmutl.validate_composite_distance(
        distance, row_label=None,
        allowed_dists=list(allowed_dists.keys()),
        verbose=verbose)

    ## Validate feature types against distance functions.
    _dmutl.validate_distance_feature_types(dataset, distance, allowed_dists)

    ## Clean and impute string data.

    #  *** NOTE: after this, the composite distance and feature set will be
    #      modified and useless to the user, so set the state here. ***
    state = {'distance': distance, 'num_distance_components': len(distance)}

    union_features = _dmutl.extract_composite_features(distance)

    # Work on private copies so the user's SFrame and the distance stored in
    # 'state' are not changed by the cleaning and feature engineering below.
    _dataset = _copy.copy(dataset)
    _distance = _copy.deepcopy(distance)

    for ftr in union_features:
        if col_types[ftr] == str:
            new_ftr = '__clean.' + ftr
            _dataset[new_ftr] = _dataset[ftr].fillna("")
            _dataset[new_ftr] = _dataset[new_ftr].apply(
                lambda x: _dmutl.cleanse_string(x), dtype=str)

            for dist_comp in _distance:
                dist_comp[0] = [
                    new_ftr if x == ftr else x for x in dist_comp[0]
                ]

    ## Convert strings to dicts if the distance isn't levenshtein, and
    #  concatenate string columns within a distance component into a single
    #  feature.
    _dataset, _distance = _engineer_distance_features(_dataset, _distance)

    ## Create the nearest neighbors model and set in the model. Forward the
    #  validated option dictionary rather than the raw keyword arguments.
    nn_model = _gl.nearest_neighbors.create(_dataset,
                                            distance=_distance,
                                            method=method,
                                            verbose=verbose,
                                            **_method_options)

    ## Postprocessing and formatting
    state.update({
        'verbose': verbose,
        'num_examples': dataset.num_rows(),
        'features': union_features,
        'nearest_neighbors_model': nn_model,
        'num_features': len(union_features),
        'method': nn_model['method'],
        'training_time': _time.time() - start_time
    })

    model = RecordLinker(state)
    return model
def tag(self, dataset, query_name=None, k=5, similarity_threshold=None,
        exclude_zeros=True, verbose=True):
    """
    Match the reference tags passed when a model is created to a new set of
    queries. This is a many-to-many match: each query may have any number of
    occurrences of a reference tag.

    Parameters
    ----------
    dataset : SFrame
        Query data to be tagged.

    query_name : string, optional
        Name of the column in ``dataset`` to be auto-tagged. If ``dataset``
        has more than one column, ``query_name`` must be specified.

    k : int, optional
        Number of results to return from the reference set for each query
        observation. The default is 5, but setting it to ``None`` will
        return all results whose score is greater than or equal to
        ``similarity_threshold``.

    similarity_threshold : float, optional
        Only results whose score is greater than or equal to the specified
        ``similarity_threshold`` are returned. Must be between 0 and 1
        (inclusive). The default is ``None``, in which case the ``k`` best
        results are returned for each query point.

    exclude_zeros : boolean, optional
        If True, only entries for which there is a tag with a nonzero score
        are preserved in the output. This is the default behavior.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : SFrame
        An SFrame with four columns:

        - row ID of the query, named ``<query_name>_id``
        - the query data, named as the `query_name` parameter to the `tag`
          method
        - the matched reference tag, named as the `tag_name` parameter to
          the `create` method
        - ``score``: a similarity score between 0 and 1, indicating the
          strength of the match between the query data and the suggested
          reference tag, where a score of zero indicates a poor match and a
          strength of 1 corresponds to a perfect match

    Raises
    ------
    ToolkitError
        If neither ``k`` nor ``similarity_threshold`` is set, if
        ``dataset`` has more than one column and ``query_name`` is not
        given, if the query column is not present in ``dataset``, or if
        ``similarity_threshold`` is not a number between 0 and 1.

    Notes
    -----
    - By default, only rows for which there is a tag with a nonzero score
      are included in the output. To guarantee at least one output row for
      every input row in ``dataset``, set the ``exclude_zeros`` parameter
      to False.

    - If both ``k`` and ``similarity_threshold`` are set to ``None``, a
      ToolkitError is raised.

    Examples
    --------
    First construct a toy `SFrame` of actor names, which will serve as the
    reference set for our autotagger model.

    >>> actors_sf = gl.SFrame(
    ...     {"actor": ["Will Smith", "Tom Hanks", "Bradley Cooper",
    ...                "Tom Cruise", "Jude Law", "Robert Pattinson",
    ...                "Matt Damon", "Brad Pitt", "Johnny Depp",
    ...                "Leonardo DiCaprio", "Jennifer Aniston",
    ...                "Jessica Alba", "Emma Stone", "Cameron Diaz",
    ...                "Scarlett Johansson", "Mila Kunis", "Julia Roberts",
    ...                "Charlize Theron", "Marion Cotillard",
    ...                "Angelina Jolie"]})
    >>> m = gl.data_matching.autotagger.create(actors_sf, tag_name="actor")

    Then we load some IMDB movie reviews into an `SFrame` and tag them
    using the model we created above. The score field in the output is a
    similarity score, indicating the strength of the match between the
    query data and the suggested reference tag.

    >>> reviews_sf = gl.SFrame(
    ...     "s3://dato-datasets/imdb_reviews/reviews.sframe")
    >>> m.tag(reviews_sf.head(10), query_name="review", verbose=False)
    +-----------+-------------------------------+------------------+-----------------+
    | review_id |             review            |      actor       |      score      |
    +-----------+-------------------------------+------------------+-----------------+
    |     0     | Story of a man who has unn... |   Cameron Diaz   | 0.0769230769231 |
    |     0     | Story of a man who has unn... |  Angelina Jolie  | 0.0666666666667 |
    |     0     | Story of a man who has unn... | Charlize Theron  |      0.0625     |
    |     0     | Story of a man who has unn... | Robert Pattinson | 0.0588235294118 |
    |     1     | Bromwell High is a cartoon... |   Jessica Alba   |      0.125      |
    |     1     | Bromwell High is a cartoon... | Jennifer Aniston |       0.1       |
    |     1     | Bromwell High is a cartoon... | Charlize Theron  |       0.05      |
    |     1     | Bromwell High is a cartoon... | Robert Pattinson |  0.047619047619 |
    |     1     | Bromwell High is a cartoon... | Marion Cotillard |  0.047619047619 |
    |     2     | Airport '77 starts as a br... |  Julia Roberts   | 0.0961538461538 |
    |    ...    |              ...              |       ...        |       ...       |
    +-----------+-------------------------------+------------------+-----------------+

    The initial results look a little noisy. To filter out obvious spurious
    matches, we can set the `tag` method's `similarity_threshold` parameter.

    >>> m.tag(reviews_sf.head(1000), query_name="review", verbose=False,
    ...       similarity_threshold=.8)
    +-----------+-------------------------------+------------------+----------------+
    | review_id |             review            |      actor       |     score      |
    +-----------+-------------------------------+------------------+----------------+
    |    341    | I caught this film at a te... |  Julia Roberts   | 0.857142857143 |
    |    657    | Fairly funny Jim Carrey ve... | Jennifer Aniston | 0.882352941176 |
    |    668    | A very funny movie. It was... | Jennifer Aniston | 0.833333333333 |
    |    673    | This film is the best film... | Jennifer Aniston |     0.9375     |
    +-----------+-------------------------------+------------------+----------------+
    """
    _mt._get_metric_tracker().track(self.__module__ + '.tag')

    # validate the 'dataset' input
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    # ensure that either k or similarity_threshold is set. The threshold is
    # compared against None (not truthiness) so that an explicit threshold
    # of 0 -- a valid value -- counts as "set".
    if not k and similarity_threshold is None:
        raise _ToolkitError("Either k or similarity_threshold parameters "
                            "must be set")

    # ensure that query_name is provided if dataset has > 1 column
    if dataset.num_cols() > 1 and not query_name:
        raise _ToolkitError("No query_name parameter specified on "
                            "dataset with %d columns" % dataset.num_cols())

    query_column = query_name or dataset.column_names()[0]

    # ensure that the query column exists in the dataset
    if query_column not in dataset.column_names():
        raise _ToolkitError('No column named "%s" in dataset'
                            % query_column)

    query_sa = dataset.select_column(query_column)
    query_sf = _gl.SFrame({"id": range(len(query_sa)),
                           query_column: query_sa})

    features = _preprocess(query_sa)
    features = features.add_row_number()

    # validate the threshold whenever one was supplied (including 0)
    if similarity_threshold is not None:
        if not isinstance(similarity_threshold, (float, int)):
            raise _ToolkitError("similarity_threshold parameter must be a "
                                "float or an int.")

        if similarity_threshold < 0 or similarity_threshold > 1:
            raise _ToolkitError("similarity_threshold parameter must be "
                                "between 0 and 1.")

    # A similarity threshold maps to a distance radius of (1 - threshold).
    # A threshold of 0 (or None) places no constraint on the distance, so
    # the radius is left unset in that case.
    radius = (1 - similarity_threshold) if similarity_threshold else None

    results = self._nn_model.query(features, label="id", k=k,
                                   radius=radius, verbose=verbose)

    # return empty SFrame immediately if no NN results
    if len(results) == 0:
        return _gl.SFrame({query_column + "_id": [],
                           query_column: [],
                           self.get("tag_name"): [],
                           "score": []})

    # Attach the original query text to each result. The query row id takes
    # the "<query>_id" name, and the query text temporarily takes the
    # "query_label" name until the final rename below.
    results = results.join(query_sf, on={"query_label": "id"})
    results.rename({"query_label": query_column + "_id",
                    query_column: "query_label"})

    # convert distances to similarity scores
    scores = _dists_to_sim_scores("weighted_jaccard", results)
    results.add_column(scores, "score")
    results.remove_column("distance")
    results.remove_column("rank")
    results.rename({"reference_label": self.get("tag_name"),
                    "query_label": query_column})
    results.swap_columns(self.get("tag_name"), query_column)

    if exclude_zeros:
        try:
            results = results.filter_by(0.0, "score", exclude=True)
        except RuntimeError:  # nothing to join
            # Logger.warn is a deprecated alias; use Logger.warning.
            _logging.getLogger(__name__).warning(
                "Empty results after filtering scores of 0.")
            results = results.head(0)

    return results