def create_regression_with_model_selector(dataset, target, model_selector,
                                          features=None, validation_set=None,
                                          verbose=True):
    """
    Create a :class:`~graphlab.toolkits.SupervisedLearningModel` for a
    regression task, letting ``model_selector`` pick the model type.

    The feature columns of ``dataset`` are handed to ``model_selector``
    (randomly sampled with seed 0 at a fraction of ``1e5 / num_rows`` when
    the dataset has more than 100,000 rows). The model name it returns is
    then trained on the full ``dataset`` through ``create_selected``.

    This function is normally not called, call specific model's create
    function instead.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    target : string
        Name of the column containing the target variable.

    model_selector : function
        Callable that receives the (possibly sampled) SFrame of feature
        columns and returns the name of the model to train.

    features : list[string], optional
        Names of the columns used as features. When None (the default), all
        columns of ``dataset`` except ``target`` are used.

    validation_set : SFrame, optional
        Forwarded unchanged to ``create_selected``. The default is None.

    verbose : boolean, optional
        Whether to print out messages during training. Forwarded to
        ``create_selected``.

    Returns
    -------
    out
        The model object returned by ``create_selected``.

    Raises
    ------
    TypeError
        If ``features`` is not a list-like collection, or if any feature
        name is not of type str.
    """
    # Error checking
    _raise_error_if_not_sframe(dataset, "training dataset")
    if features is None:
        features = dataset.column_names()
        if target in features:
            features.remove(target)

    # A bare string is iterable on Python 3 but is never a valid feature
    # list, so reject it together with non-iterables.
    if isinstance(features, str) or not hasattr(features, '__iter__'):
        raise TypeError("Input 'features' must be a list.")

    # Name the entry that is actually invalid. Relying on a comprehension's
    # loop variable after the comprehension reports the wrong element on
    # Python 2 and is a NameError on Python 3.
    for feature in features:
        if not isinstance(feature, str):
            raise TypeError(
                "Invalid feature %s: Feature names must be of type str"
                % (feature,))

    # Sample the data so the selector never sees much more than 1e5 rows.
    features_sframe = _toolkits_select_columns(dataset, features)
    num_rows = features_sframe.num_rows()
    if num_rows > 1e5:
        fraction = 1.0 * 1e5 / num_rows
        features_sframe = features_sframe.sample(fraction, seed=0)

    # Run the model selector, then train the chosen model on the full data.
    selected_model_name = model_selector(features_sframe)
    return create_selected(selected_model_name, dataset, target, features,
                           validation_set, verbose)
def create_regression_with_model_selector(dataset, target, model_selector,
                                          features=None,
                                          validation_set='auto',
                                          verbose=True):
    """
    Create a :class:`~graphlab.toolkits.SupervisedLearningModel` for a
    regression task, letting ``model_selector`` pick the model type.

    The feature columns of ``dataset`` are handed to ``model_selector``
    (randomly sampled with seed 0 at a fraction of ``1e5 / num_rows`` when
    the dataset has more than 100,000 rows). The model name it returns is
    then trained on the full ``dataset`` through ``create_selected``.

    This function is normally not called, call specific model's create
    function instead.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    target : string
        Name of the column containing the target variable.

    model_selector : function
        Callable that receives the (possibly sampled) SFrame of feature
        columns and returns the name of the model to train.

    features : list[string], optional
        Names of the columns used as features. When None (the default), all
        columns of ``dataset`` except ``target`` are used.

    validation_set : SFrame or 'auto', optional
        Forwarded unchanged to ``create_selected``. The default is 'auto'.

    verbose : boolean, optional
        Whether to print out messages during training. Forwarded to
        ``create_selected``.

    Returns
    -------
    out
        The model object returned by ``create_selected``.

    Raises
    ------
    TypeError
        If ``features`` is not a list-like collection, or if any feature
        name is not of type str.
    """
    # Error checking
    _raise_error_if_not_sframe(dataset, "training dataset")
    if features is None:
        features = dataset.column_names()
        if target in features:
            features.remove(target)

    # A bare string is iterable on Python 3 but is never a valid feature
    # list, so reject it together with non-iterables.
    if isinstance(features, str) or not hasattr(features, '__iter__'):
        raise TypeError("Input 'features' must be a list.")

    # Name the entry that is actually invalid. Relying on a comprehension's
    # loop variable after the comprehension reports the wrong element on
    # Python 2 and is a NameError on Python 3.
    for feature in features:
        if not isinstance(feature, str):
            raise TypeError(
                "Invalid feature %s: Feature names must be of type str"
                % (feature,))

    # Sample the data so the selector never sees much more than 1e5 rows.
    features_sframe = _toolkits_select_columns(dataset, features)
    num_rows = features_sframe.num_rows()
    if num_rows > 1e5:
        fraction = 1.0 * 1e5 / num_rows
        features_sframe = features_sframe.sample(fraction, seed=0)

    # Run the model selector, then train the chosen model on the full data.
    selected_model_name = model_selector(features_sframe)
    return create_selected(selected_model_name, dataset, target, features,
                           validation_set, verbose)
def create(dataset, target, model_name, features=None,
           validation_set='auto', verbose=True, distributed='auto',
           **kwargs):
    """
    Create a :class:`~graphlab.toolkits.SupervisedLearningModel`,

    This is generic function that allows you to create any model that
    implements SupervisedLearningModel. This function is normally not
    called, call specific model's create function instead.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    target : string
        Name of the column containing the target variable.

    model_name : string
        Name of the model

    features : list[string], optional
        Names of the columns used as features. When None (the default), all
        columns of ``dataset`` except ``target`` are used.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance.
        For each row of the progress table, the chosen metrics are computed
        for both the provided training dataset and the validation_set. The
        format of this SFrame must be the same as the training set.
        By default this argument is set to 'auto': when ``dataset`` has at
        least 100 rows, it is randomly split 95/5 and the smaller part is
        used as the validation set; with fewer rows no validation set is
        used. If validation_set is set to None, then no additional metrics
        are computed. The default value is 'auto'.

    distributed : env
        The distributed environment. Training runs locally only when this
        is 'auto' and ``get_distributed_execution_environment()`` returns
        None; otherwise the value is passed as ``env`` to
        ``_distributed_run``.

    verbose : boolean
        whether print out messages during training

    kwargs : dict
        Additional parameter options that can be passed. Keys are
        lower-cased before being handed to the training backend.

    Returns
    -------
    out : SupervisedLearningModel
        The trained model wrapped with ``model_name``.

    Raises
    ------
    TypeError
        If ``validation_set`` is a string other than 'auto' or is neither
        None nor an SFrame, if ``features`` is not list-like, or if any
        feature name is not of type str.
    """
    _raise_error_if_not_sframe(dataset, "training dataset")

    # Create a validation set
    if isinstance(validation_set, str):
        if validation_set != 'auto':
            # Call-form raise: valid on both Python 2 and Python 3.
            raise TypeError('Unrecognized value for validation_set.')
        if dataset.num_rows() >= 100:
            if verbose:
                print_validation_track_notification()
            dataset, validation_set = dataset.random_split(.95)
        else:
            validation_set = None

    # Target
    target_sframe = _toolkits_select_columns(dataset, [target])

    # Features
    if features is None:
        features = dataset.column_names()
        features.remove(target)

    # A bare string is iterable on Python 3 but is never a valid feature
    # list, so reject it together with non-iterables.
    if isinstance(features, str) or not hasattr(features, '__iter__'):
        raise TypeError("Input 'features' must be a list.")

    # Name the entry that is actually invalid. Relying on a comprehension's
    # loop variable after the comprehension reports the wrong element on
    # Python 2 and is a NameError on Python 3.
    for feature in features:
        if not isinstance(feature, str):
            raise TypeError(
                "Invalid feature %s: Feature names must be of type str"
                % (feature,))
    features_sframe = _toolkits_select_columns(dataset, features)

    # User options first (keys normalised to lower case), then the required
    # entries so they cannot be overridden.
    options = {key.lower(): value for key, value in kwargs.items()}
    options.update({'target': target_sframe,
                    'features': features_sframe,
                    'model_name': model_name})

    if validation_set is not None:
        if not isinstance(validation_set, _graphlab.SFrame):
            raise TypeError("validation_set must be either 'auto' or an "
                            "SFrame matching the training data.")

        # Attempt to append the two datasets together to check schema
        validation_set.head().append(dataset.head())

        options.update({
            'features_validation':
                _toolkits_select_columns(validation_set, features),
            'target_validation':
                _toolkits_select_columns(validation_set, [target])})

    execution_env = get_distributed_execution_environment()
    if distributed == 'auto' and execution_env is None:
        ret = _graphlab.toolkits._main.run("supervised_learning_train",
                                           options, verbose)
        model = SupervisedLearningModel(ret['model'], model_name)
    else:
        ret = _distributed_run("distributed_supervised_train", options,
                               env=distributed, verbose=verbose)
        model = SupervisedLearningModel(ret, model_name)

    return model
def create_classification_with_model_selector(dataset, target,
                                              model_selector, features=None,
                                              validation_set='auto',
                                              verbose=True):
    """
    Create a :class:`~graphlab.toolkits.SupervisedLearningModel` for a
    classification task by training every model proposed by
    ``model_selector`` and returning the most accurate one.

    ``model_selector`` is called with the number of distinct values in the
    target column and the feature columns of ``dataset`` (randomly sampled
    with seed 0 at a fraction of ``1e5 / num_rows`` when the dataset has
    more than 100,000 rows). Each returned model name is trained through
    ``create_selected``. Models are compared on their
    ``validation_accuracy`` field when present; otherwise on the last row
    of their ``progress`` table ('Validation-accuracy' if that column
    exists, else 'Training-accuracy'). The first model with the highest
    score wins.

    This function is normally not called, call specific model's create
    function instead.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    target : string
        Name of the column containing the target variable.

    model_selector : function
        Callable ``model_selector(num_classes, features_sframe)`` returning
        a sequence of backend model names to try.

    features : list[string], optional
        Names of the columns used as features. When None (the default), all
        columns of ``dataset`` except ``target`` are used.

    validation_set : SFrame or 'auto', optional
        Validation data forwarded to ``create_selected``. With 'auto' (the
        default) and at least 100 rows, ``dataset`` is randomly split 95/5
        and the smaller part is used for validation; with fewer rows no
        validation set is used.

    verbose : boolean
        whether print out messages during training

    Returns
    -------
    out
        The best-scoring model among those returned by ``create_selected``.

    Raises
    ------
    TypeError
        If ``features`` is not list-like, if any feature name is not of
        type str, or if ``validation_set`` is a string other than 'auto'.
    ValueError
        If a trained model exposes neither a ``validation_accuracy`` nor a
        ``progress`` field.
    """
    # Error checking
    _raise_error_if_not_sframe(dataset, "training dataset")
    if features is None:
        features = dataset.column_names()
        if target in features:
            features.remove(target)

    # A bare string is iterable on Python 3 but is never a valid feature
    # list, so reject it together with non-iterables.
    if isinstance(features, str) or not hasattr(features, '__iter__'):
        raise TypeError("Input 'features' must be a list.")

    # Name the entry that is actually invalid. Relying on a comprehension's
    # loop variable after the comprehension reports the wrong element on
    # Python 2 and is a NameError on Python 3.
    for feature in features:
        if not isinstance(feature, str):
            raise TypeError(
                "Invalid feature %s: Feature names must be of type str"
                % (feature,))

    # Sample the data so the selector never sees much more than 1e5 rows.
    features_sframe = _toolkits_select_columns(dataset, features)
    num_rows = features_sframe.num_rows()
    if num_rows > 1e5:
        fraction = 1.0 * 1e5 / num_rows
        features_sframe = features_sframe.sample(fraction, seed=0)

    # Get available models for this dataset
    num_classes = dataset[target].unique().size()
    selected_model_names = model_selector(num_classes, features_sframe)

    # Create a validation set
    if isinstance(validation_set, str):
        if validation_set != 'auto':
            raise TypeError('Unrecognized value for validation_set.')
        if dataset.num_rows() >= 100:
            if verbose:
                print_validation_track_notification()
            dataset, validation_set = dataset.random_split(.95)
        else:
            validation_set = None

    # Match C++ model names with user model names
    python_names = {
        'boosted_trees_classifier': 'BoostedTreesClassifier',
        'random_forest_classifier': 'RandomForestClassifier',
        'decision_tree_classifier': 'DecisionTreeClassifier',
        'classifier_logistic_regression': 'LogisticClassifier',
        'classifier_svm': 'SVMClassifier',
        'neuralnet_classifier': 'NeuralNetClassifier',
        'neuralnet_classifier_v2': 'NeuralNetClassifier'}

    def display_name(backend_name):
        # Fall back to the backend name so an unmapped model cannot make
        # the progress output (and only the output) raise KeyError.
        return python_names.get(backend_name, backend_name)

    # Print useful user-facing progress messages
    if verbose:
        print('PROGRESS: The following methods are available for this type of problem.')
        print('PROGRESS: ' + ', '.join(
            [display_name(name) for name in selected_model_names]))
        if len(selected_model_names) > 1:
            print('PROGRESS: The returned model will be chosen according to validation accuracy.')

    models = {}
    metrics = {}
    for model_name in selected_model_names:

        # Fit each of the available models
        model = create_selected(model_name, dataset, target, features,
                                validation_set, verbose)
        models[model_name] = model

        fields = model.list_fields()
        if 'validation_accuracy' in fields:
            metrics[model_name] = model['validation_accuracy']
        elif 'progress' in fields:
            # Use the final row of the progress table; prefer validation
            # accuracy and fall back to training accuracy.
            progress = model['progress']
            column = 'Validation-accuracy'
            if column not in progress.column_names():
                column = 'Training-accuracy'
            metrics[model_name] = float(progress[column].tail(1)[0])
        else:
            raise ValueError("Model does not have metrics that can be used for model selection.")

    # Choose the first model with the strictly highest score.
    best_model = None
    best_acc = None
    for model_name in selected_model_names:
        accuracy = metrics[model_name]
        if best_acc is None or best_acc < accuracy:
            best_model = model_name
            best_acc = accuracy

    if verbose:
        ret = []
        width = 32
        if len(selected_model_names) > 1:
            ret.append('PROGRESS: Model selection based on validation accuracy:')
            ret.append('---------------------------------------------')
            key_str = '{:<{}}: {}'
            for model_name in selected_model_names:
                ret.append(key_str.format(display_name(model_name), width,
                                          str(metrics[model_name])))
            ret.append('---------------------------------------------')
            ret.append('Selecting ' + display_name(best_model) + ' based on validation set performance.')
        print('\nPROGRESS: '.join(ret))

    return models[best_model]
def create(dataset, target, model_name, features=None,
           validation_set='auto', verbose=True, distributed='auto',
           **kwargs):
    """
    Create a :class:`~graphlab.toolkits.SupervisedLearningModel`,

    This is generic function that allows you to create any model that
    implements SupervisedLearningModel. This function is normally not
    called, call specific model's create function instead.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    target : string
        Name of the column containing the target variable.

    model_name : string
        Name of the model

    features : list[string], optional
        Names of the columns used as features. When None (the default), all
        columns of ``dataset`` except ``target`` are used.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance.
        For each row of the progress table, the chosen metrics are computed
        for both the provided training dataset and the validation_set. The
        format of this SFrame must be the same as the training set.
        By default this argument is set to 'auto': when ``dataset`` has at
        least 100 rows, it is randomly split 95/5 and the smaller part is
        used as the validation set; with fewer rows no validation set is
        used. If validation_set is set to None, then no additional metrics
        are computed. The default value is 'auto'.

    distributed : env
        The distributed environment. Accepted for interface compatibility;
        this implementation does not read it and always trains through
        ``_graphlab.toolkits._main.run``.

    verbose : boolean
        whether print out messages during training

    kwargs : dict
        Additional parameter options that can be passed. Keys are
        lower-cased before being handed to the training backend.

    Returns
    -------
    out : SupervisedLearningModel
        The trained model wrapped with ``model_name``.

    Raises
    ------
    TypeError
        If ``validation_set`` is a string other than 'auto' or is neither
        None nor an SFrame, if ``features`` is not list-like, or if any
        feature name is not of type str.
    """
    _raise_error_if_not_sframe(dataset, "training dataset")

    # Create a validation set
    if isinstance(validation_set, str):
        if validation_set != 'auto':
            raise TypeError('Unrecognized value for validation_set.')
        if dataset.num_rows() >= 100:
            if verbose:
                print_validation_track_notification()
            dataset, validation_set = dataset.random_split(.95)
        else:
            validation_set = None

    # Target
    target_sframe = _toolkits_select_columns(dataset, [target])

    # Features
    if features is None:
        features = dataset.column_names()
        features.remove(target)

    # A bare string is iterable on Python 3 but is never a valid feature
    # list, so reject it together with non-iterables.
    if isinstance(features, str) or not hasattr(features, '__iter__'):
        raise TypeError("Input 'features' must be a list.")

    # Name the entry that is actually invalid. Relying on a comprehension's
    # loop variable after the comprehension reports the wrong element on
    # Python 2 and is a NameError on Python 3.
    for feature in features:
        if not isinstance(feature, str):
            raise TypeError(
                "Invalid feature %s: Feature names must be of type str"
                % (feature,))
    features_sframe = _toolkits_select_columns(dataset, features)

    # User options first (keys normalised to lower case), then the required
    # entries so they cannot be overridden.
    options = {key.lower(): value for key, value in kwargs.items()}
    options.update({'target': target_sframe,
                    'features': features_sframe,
                    'model_name': model_name})

    if validation_set is not None:
        if not isinstance(validation_set, _graphlab.SFrame):
            raise TypeError("validation_set must be either 'auto' or an SFrame matching the training data.")

        # Attempt to append the two datasets together to check schema
        validation_set.head().append(dataset.head())

        options.update({
            'features_validation':
                _toolkits_select_columns(validation_set, features),
            'target_validation':
                _toolkits_select_columns(validation_set, [target])})

    ret = _graphlab.toolkits._main.run("supervised_learning_train", options,
                                       verbose)
    return SupervisedLearningModel(ret['model'], model_name)
def query(self, dataset, label=None, k=5, radius=None, verbose=True):
    """
    For each row of the input 'dataset', retrieve the nearest neighbors
    from the model's stored data. The query dataset does not need to be the
    same as the reference data stored in the model.

    Parameters
    ----------
    dataset : SFrame
        Query data. Must contain columns with the same names and types as
        the features used to train the model. Additional columns are
        allowed, but ignored. Please see the nearest neighbors
        :func:`~graphlab.nearest_neighbors.create` documentation for more
        detail on allowable data types.

    label : str, optional
        Name of the query SFrame column with row labels. If 'label' is not
        specified, row numbers are used to identify query dataset rows in
        the output SFrame.

    k : int, optional
        Number of nearest neighbors to return from the reference set for
        each query observation. The default is 5 neighbors, but setting it
        to ``None`` will return all neighbors within ``radius`` of the
        query point.

    radius : float, optional
        Only neighbors whose distance to a query point is smaller than this
        value are returned. The default is ``None``, in which case the
        ``k`` nearest neighbors are returned for each query point,
        regardless of distance.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : SFrame
        An SFrame with the k-nearest neighbors of each query observation.
        The result contains four columns: the first is the label of the
        query observation, the second is the label of the nearby reference
        observation, the third is the distance between the query and
        reference observations, and the fourth is the rank of the reference
        observation among the query's k-nearest neighbors.

    Raises
    ------
    ValueError
        If 'label' is not a column of 'dataset' or is one of the model's
        features, if 'k' is not a positive integer, or if 'radius' is not a
        non-negative number.
    TypeError
        If the label column holds values other than integers or strings.

    See Also
    --------
    similarity_graph

    Notes
    -----
    - The `dataset` input to this method *can* have missing values (in
      contrast to the reference dataset used to create the nearest
      neighbors model). Missing numeric values are imputed to be the mean
      of the corresponding feature in the reference dataset, and missing
      strings are imputed to be empty strings.

    - If both ``k`` and ``radius`` are set to ``None``, each query point
      returns all of the reference set. If the reference dataset has
      :math:`n` rows and the query dataset has :math:`m` rows, the output
      is an SFrame with :math:`nm` rows.

    - For models created with the 'lsh' method, the query results may have
      fewer query labels than input query points. Because LSH is an
      approximate method, a query point may have fewer than 'k' neighbors.
      If LSH returns no neighbors at all for a query, the query point is
      omitted from the results.

    Examples
    --------
    First construct a toy SFrame and create a nearest neighbors model:

    >>> sf = graphlab.SFrame({'label': range(3),
    ...                       'feature1': [0.98, 0.62, 0.11],
    ...                       'feature2': [0.69, 0.58, 0.36]})
    >>> model = graphlab.nearest_neighbors.create(sf, 'label')

    A new SFrame contains query observations with same schema as the
    reference SFrame. This SFrame is passed to the ``query`` method.

    >>> queries = graphlab.SFrame({'label': range(3),
    ...                            'feature1': [0.05, 0.61, 0.99],
    ...                            'feature2': [0.06, 0.97, 0.86]})
    >>> model.query(queries, 'label', k=2)
    +-------------+-----------------+----------------+------+
    | query_label | reference_label |    distance    | rank |
    +-------------+-----------------+----------------+------+
    |      0      |        2        | 0.305941170816 |  1   |
    |      0      |        1        | 0.771556867638 |  2   |
    |      1      |        1        | 0.390128184063 |  1   |
    |      1      |        0        | 0.464004310325 |  2   |
    |      2      |        0        | 0.170293863659 |  1   |
    |      2      |        1        | 0.464004310325 |  2   |
    +-------------+-----------------+----------------+------+
    """
    _mt._get_metric_tracker().track('toolkit.nearest_neighbors.query')

    ## The query data must be a non-empty SFrame.
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    ## Keep only the columns the model was built on.
    model_features = self.get('features')
    query_features = _tkutl._toolkits_select_columns(dataset, model_features)

    ## Resolve the row labels: a user-named column, or plain row numbers.
    if label is not None:
        if label not in dataset.column_names():
            raise ValueError(
                "Input 'label' must be a string matching the name of a "
                "column in the reference SFrame 'dataset'.")

        if dataset[label].dtype() not in (str, int):
            raise TypeError("The label column must contain integers or strings.")

        if label in model_features:
            raise ValueError("The label column cannot be one of the features.")

        row_labels = dataset[label]
    else:
        row_labels = _graphlab.SArray.from_sequence(len(dataset))

    ## Check 'k'; None is encoded as -1 for the backend.
    if k is None:
        k = -1
    else:
        if not isinstance(k, int):
            raise ValueError("Input 'k' must be an integer.")
        if k <= 0:
            raise ValueError("Input 'k' must be larger than 0.")

    ## Check 'radius'; None is encoded as -1.0 for the backend.
    if radius is None:
        radius = -1.0
    else:
        if not isinstance(radius, (int, float)):
            raise ValueError("Input 'radius' must be an integer or float.")
        if radius < 0:
            raise ValueError("Input 'radius' must be non-negative.")

    options = {'model': self.__proxy__,
               'model_name': self.__name__,
               'features': query_features,
               'query_labels': row_labels,
               'k': k,
               'radius': radius}

    response = _graphlab.toolkits._main.run('_nearest_neighbors.query',
                                            options, verbose)
    neighbors_proxy = response['neighbors']
    return _SFrame(None, _proxy=neighbors_proxy)
def predict(self, dataset, output_type='cluster_id', verbose=True):
    """
    Return predicted cluster label for instances in the new 'dataset'.
    K-means predictions are made by assigning each new instance to the
    closest cluster center.

    Parameters
    ----------
    dataset : SFrame
        Dataset of new observations. Must include the features used for
        model training; additional columns are ignored.

    output_type : {'cluster_id', 'distance'}, optional
        Form of the prediction. 'cluster_id' (the default) returns the
        cluster label assigned to each input instance, while 'distance'
        returns the Euclidean distance between the instance and its
        assigned cluster's center.

    verbose : bool, optional
        If True, print progress updates to the screen.

    Returns
    -------
    out : SArray
        Model predictions. Depending on the specified `output_type`, either
        the assigned cluster label or the distance of each point to its
        closest cluster center. The order of the predictions is the same as
        order of the input data rows.

    Raises
    ------
    TypeError
        If 'output_type' is not a string.
    ValueError
        If 'output_type' is neither 'cluster_id' nor 'distance'.

    See Also
    --------
    create

    Examples
    --------
    >>> sf = graphlab.SFrame({
    ...     'x1': [0.6777, -9.391, 7.0385, 2.2657, 7.7864, -10.16, -8.162,
    ...            8.8817, -9.525, -9.153, 2.0860, 7.6619, 6.5511, 2.7020],
    ...     'x2': [5.6110, 8.5139, 5.3913, 5.4743, 8.3606, 7.8843, 2.7305,
    ...            5.1679, 6.7231, 3.7051, 1.7682, 7.4608, 3.1270, 6.5624]})
    ...
    >>> model = graphlab.kmeans.create(sf, num_clusters=3)
    ...
    >>> sf_new = graphlab.SFrame({'x1': [-5.6584, -1.0167, -9.6181],
    ...                           'x2': [-6.3803, -3.7937, -1.1022]})
    >>> clusters = model.predict(sf_new, output_type='cluster_id')
    >>> print(clusters)
    [1, 0, 1]
    """
    _mt._get_metric_tracker().track('toolkit.kmeans.predict')

    ## Validate the input dataset.
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    ## Validate the output type.
    if not isinstance(output_type, str):
        raise TypeError("The 'output_type' parameter must be a string.")

    if output_type not in ('cluster_id', 'distance'):
        # The message names the values that are actually accepted
        # (previously it advertised a non-existent 'cluster_label').
        raise ValueError("The 'output_type' parameter must be either " +
                         "'cluster_id' or 'distance'.")

    ## Get model features.
    ref_features = self.get('features')
    sf_features = _tkutl._toolkits_select_columns(dataset, ref_features)

    ## Compute predictions.
    opts = {'model': self.__proxy__,
            'model_name': self.__name__,
            'dataset': sf_features}

    result = _gl.toolkits._main.run('kmeans_predict', opts, verbose)
    sf_result = _gl.SFrame(None, _proxy=result['predictions'])

    # 'output_type' was validated above, so it is exactly the name of the
    # result column to return.
    return sf_result[output_type]
def query(self, dataset, label=None, k=5, radius=None, verbose=True):
    """
    Retrieve the nearest neighbors from the reference set for each element
    of the query set. The query SFrame must include columns with the same
    names as the feature columns used to create the NearestNeighborsModel;
    a label column is optional.

    Parameters
    ----------
    dataset : SFrame
        Query data. Must contain columns with the same names and types as
        the features used to train the model. Additional columns are
        allowed, but ignored. Please see the nearest neighbors
        :func:`~graphlab.nearest_neighbors.create` documentation for more
        detail on allowable data types.

    label : string, optional
        Name of the query SFrame column with row labels. If 'label' is not
        specified, row numbers are used to identify query dataset rows in
        the output SFrame.

    k : int, optional
        Number of nearest neighbors to return from the reference set for
        each query observation. The default is 5 neighbors, but setting it
        to ``None`` will return all neighbors within ``radius`` of the
        query point.

    radius : float, optional
        Only neighbors whose distance to a query point is smaller than this
        value are returned. The default is ``None``, in which case the
        ``k`` nearest neighbors are returned for each query point,
        regardless of distance.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : SFrame
        An SFrame with the k-nearest neighbors of each query observation.
        The result contains four columns: the first is the label of the
        query observation, the second is the label of the nearby reference
        observation, the third is the distance between the query and
        reference observations, and the fourth is the rank of the reference
        observation among the query's k-nearest neighbors.

    Raises
    ------
    ValueError
        If 'label' is not a column of 'dataset' or is one of the model's
        features, if 'k' is not a positive integer, or if 'radius' is not a
        non-negative number.
    TypeError
        If the label column holds values other than integers or strings.

    Notes
    -----
    - If both ``k`` and ``radius`` are set to ``None``, each query point
      returns all of the reference set. If the reference dataset has
      :math:`n` rows and the query dataset has :math:`m` rows, the output
      is an SFrame with :math:`nm` rows.

    Examples
    --------
    First construct a toy SFrame and create a nearest neighbors model:

    >>> sf = graphlab.SFrame({'label': range(3),
    ...                       'feature1': [0.98, 0.62, 0.11],
    ...                       'feature2': [0.69, 0.58, 0.36]})
    >>> model = graphlab.nearest_neighbors.create(sf, 'label')

    A new SFrame contains query observations with same schema as the
    reference SFrame. This SFrame is passed to the ``query`` method.

    >>> queries = graphlab.SFrame({'label': range(3),
    ...                            'feature1': [0.05, 0.61, 0.99],
    ...                            'feature2': [0.06, 0.97, 0.86]})
    >>> model.query(queries, 'label', k=2)
    +-------------+-----------------+----------------+------+
    | query_label | reference_label |    distance    | rank |
    +-------------+-----------------+----------------+------+
    |      0      |        2        | 0.305941170816 |  1   |
    |      0      |        1        | 0.771556867638 |  2   |
    |      1      |        1        | 0.390128184063 |  1   |
    |      1      |        0        | 0.464004310325 |  2   |
    |      2      |        0        | 0.170293863659 |  1   |
    |      2      |        1        | 0.464004310325 |  2   |
    +-------------+-----------------+----------------+------+
    """
    _mt._get_metric_tracker().track('toolkit.nearest_neighbors.query')

    ## Validate the 'dataset' input
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    ## Get model features
    ref_features = self.get('features')
    sf_features = _tkutl._toolkits_select_columns(dataset, ref_features)

    ## Validate and preprocess the 'label' input. The label SFrame handed
    ## to the backend always uses the model's own label column name.
    ref_label = self.get('label')

    if label is None:
        # No label given: number the rows, keep that column as the label
        # and drop it again from the features.
        # NOTE(review): relies on remove_column modifying sf_features in
        # place -- confirm against the SFrame version in use.
        sf_features = sf_features.add_row_number(column_name=ref_label)
        sf_label = sf_features[[ref_label]]
        sf_features.remove_column(ref_label)

    else:
        if label not in dataset.column_names():
            raise ValueError(
                "Input 'label' must be a string matching the name of a "
                "column in the reference SFrame 'dataset'.")

        if dataset[label].dtype() not in (str, int):
            raise TypeError("The label column must contain integers or strings.")

        if label in ref_features:
            raise ValueError("The label column cannot be one of the features.")

        sf_label = _tkutl._toolkits_select_columns(dataset, [label])

        if label != ref_label:
            # NOTE(review): relies on rename modifying sf_label in place --
            # confirm against the SFrame version in use.
            sf_label.rename({label: ref_label})

    ## Validate neighborhood parameters 'k' and 'radius'
    if k is not None:
        if not isinstance(k, int):
            raise ValueError("Input 'k' must be an integer.")

        if k <= 0:
            raise ValueError("Input 'k' must be larger than 0.")

    if radius is not None:
        if not isinstance(radius, (int, float)):
            raise ValueError("Input 'radius' must be an integer or float.")

        if radius < 0:
            raise ValueError("Input 'radius' must be non-negative.")

    ## Set k and radius to special values to indicate 'None'
    if k is None:
        k = -1

    if radius is None:
        radius = -1.0

    opts = {'model': self.__proxy__,
            'model_name': self.__name__,
            'features': sf_features,
            'label': sf_label,
            'k': k,
            'radius': radius}

    if verbose:
        # Parenthesised form prints the same text on Python 2 and Python 3.
        print("Starting model querying...")

    result = _graphlab.toolkits._main.run('_nearest_neighbors.query', opts,
                                          verbose)
    return _SFrame(None, _proxy=result['neighbors'])
def create(dataset, label=None, features=None, distance=None, method='auto',
           verbose=True, **kwargs):
    """
    Create a nearest neighbor model, which can be searched efficiently and
    quickly for the nearest neighbors of a query observation. If the `method`
    argument is specified as `auto`, the type of model is chosen automatically
    based on the type of data in `dataset`.

    .. warning::

        The 'dot_product' distance is deprecated and will be removed in future
        versions of GraphLab Create. Please use 'transformed_dot_product'
        distance instead, although note that this is more than a name change;
        it is a *different* transformation of the dot product of two vectors.
        Please see the distances module documentation for more details.

    Parameters
    ----------
    dataset : SFrame
        Reference data. If the features for each observation are numeric, they
        may be in separate columns of 'dataset' or a single column with lists
        of values. The features may also be in the form of a column of sparse
        vectors (i.e. dictionaries), with string keys and numeric values.

    label : string, optional
        Name of the SFrame column with row labels. If 'label' is not specified,
        row numbers are used to identify reference dataset rows when the model
        is queried.

    features : list[string], optional
        Name of the columns with features to use in computing distances between
        observations and the query points. 'None' (the default) indicates that
        all columns except the label should be used as features. Each column
        can be one of the following types:

        - *Numeric*: values of numeric type integer or float.

        - *Array*: list of numeric (integer or float) values. Each list element
          is treated as a separate variable in the model.

        - *Dictionary*: key-value pairs with numeric (integer or float) values.
          Each key indicates a separate variable in the model.

        - *List*: list of integer or string values. Each element is treated as
          a separate variable in the model.

        - *String*: string values.

        Please note: if a composite distance is also specified, this parameter
        is ignored.

    distance : string, function, or list[list], optional
        Function to measure the distance between any two input data rows. This
        may be one of three types:

        - *String*: the name of a standard distance function. One of
          'euclidean', 'squared_euclidean', 'manhattan', 'levenshtein',
          'jaccard', 'weighted_jaccard', 'cosine', 'dot_product' (deprecated),
          or 'transformed_dot_product'.

        - *Function*: a function handle from the
          :mod:`~graphlab.toolkits.distances` module.

        - *Composite distance*: the weighted sum of several standard distance
          functions applied to various features. This is specified as a list of
          distance components, each of which is itself a list containing three
          items:

          1. list or tuple of feature names (strings)

          2. standard distance name (string)

          3. scaling factor (int or float)

        For more information about GraphLab Create distance functions, please
        see the :py:mod:`~graphlab.toolkits.distances` module.

        If 'distance' is left unspecified or set to 'auto', a composite
        distance is constructed automatically based on feature types.

    method : {'auto', 'ball_tree', 'brute_force', 'lsh'}, optional
        Method for computing nearest neighbors. The options are:

        - *auto* (default): the method is chosen automatically, based on the
          type of data and the distance. If the distance is 'manhattan' or
          'euclidean' and the features are numeric or vectors of numeric
          values, then the 'ball_tree' method is used. Otherwise, the
          'brute_force' method is used.

        - *ball_tree*: use a tree structure to find the k-closest neighbors to
          each query point. The ball tree model is slower to construct than the
          brute force model, but queries are faster than linear time. This
          method is not applicable for the cosine and dot product distances.
          See `Liu, et al (2004)
          <http://papers.nips.cc/paper/2666-an-investigation-of-practical-approximate-nearest-neighbor-algorithms>`_
          for implementation details.

        - *brute_force*: compute the distance from a query point to all
          reference observations. There is no computation time for model
          creation with the brute force method (although the reference data is
          held in the model), but each query takes linear time.

        - *lsh*: use Locality Sensitive Hashing (LSH) to find approximate
          nearest neighbors efficiently. The LSH model supports 'euclidean',
          'squared_euclidean', 'manhattan', 'cosine', 'jaccard', 'dot_product'
          (deprecated), and 'transformed_dot_product' distances. Two options
          are provided for LSH -- ``num_tables`` and
          ``num_projections_per_table``. See the notes below for details.

    verbose: bool, optional
        If True, print progress updates and model details.

    **kwargs : optional
        Options for the distance function and query method.

        - *leaf_size*: for the ball tree method, the number of points in each
          leaf of the tree. The default is to use the max of 1,000 and
          n/(2^11), which ensures a maximum tree depth of 12.

        - *num_tables*: For the LSH method, the number of hash tables
          constructed. The default value is 20. We recommend choosing values
          from 10 to 30.

        - *num_projections_per_table*: For the LSH method, the number of
          projections/hash functions for each hash table. The default value is
          4 for 'jaccard' distance, 16 for 'cosine' distance and 8 for other
          distances. We recommend using number 2 ~ 6 for 'jaccard' distance,
          8 ~ 20 for 'cosine' distance and 4 ~ 12 for other distances.

    Returns
    -------
    out : NearestNeighborsModel
        A structure for efficiently computing the nearest neighbors in
        'dataset' of new query points.

    Raises
    ------
    TypeError
        If 'features' is specified but is not a list; if the 'ball_tree'
        method is combined with 'cosine', 'dot_product', or
        'transformed_dot_product' distance; if 'distance' is not a string,
        function handle, or list; or if a list-typed feature column is used
        with an unsupported distance or fails the string-list validation.

    ValueError
        If 'levenshtein' distance is applied to more than one column, or if
        'method' is not one of the recognized names (checked only when the
        distance has a single component).

    ToolkitError
        If an unrecognized keyword argument is passed, or if the only
        available feature is the row label column.

    See Also
    --------
    NearestNeighborsModel.query, graphlab.toolkits.distances

    Notes
    -----
    - Missing data is not allowed in the 'dataset' provided to this function.
      Please use the :func:`graphlab.SFrame.fillna` and
      :func:`graphlab.SFrame.dropna` utilities to handle missing data before
      creating a nearest neighbors model.

    - Missing keys in sparse vectors are assumed to have value 0.

    - The `composite_params` parameter was removed as of GraphLab Create
      version 1.5. The `distance` parameter now accepts either standard or
      composite distances. Please see the :mod:`~graphlab.toolkits.distances`
      module documentation for more information on composite distances.

    - If the features should be weighted equally in the distance calculations
      but are measured on different scales, it is important to standardize the
      features. One way to do this is to subtract the mean of each column and
      divide by the standard deviation.

    **Locality Sensitive Hashing (LSH)**

    There are several efficient nearest neighbors search algorithms that work
    well for data with low dimensions :math:`d` (approximately 50). However,
    most of the solutions suffer from either space or query time that is
    exponential in :math:`d`. For large :math:`d`, they often provide little,
    if any, improvement over the 'brute_force' method. This is a well-known
    consequence of the phenomenon called `The Curse of Dimensionality`.

    `Locality Sensitive Hashing (LSH)
    <https://en.wikipedia.org/wiki/Locality-sensitive_hashing>`_ is an approach
    that is designed to efficiently solve the *approximate* nearest neighbor
    search problem for high dimensional data. The key idea of LSH is to hash
    the data points using several hash functions, so that the probability of
    collision is much higher for data points which are close to each other
    than those which are far apart.

    An LSH family is a family of functions :math:`h` which map points from the
    metric space to a bucket, so that

    - if :math:`d(p, q) \\leq R`, then :math:`h(p) = h(q)` with at least
      probability :math:`p_1`.

    - if :math:`d(p, q) \\geq cR`, then :math:`h(p) = h(q)` with probability
      at most :math:`p_2`.

    LSH for efficient approximate nearest neighbor search:

    - We define a new family of hash functions :math:`g`, where each
      function :math:`g` is obtained by concatenating :math:`k` functions
      :math:`h_1, ..., h_k`, i.e., :math:`g(p)=[h_1(p),...,h_k(p)]`.
      The algorithm constructs :math:`L` hash tables, each of which
      corresponds to a different randomly chosen hash function :math:`g`.
      There are :math:`k \\cdot L` hash functions used in total.

    - In the preprocessing step, we hash all :math:`n` reference points into
      each of the :math:`L` hash tables.

    - Given a query point :math:`q`, the algorithm iterates over the
      :math:`L` hash functions :math:`g`. For each :math:`g` considered, it
      retrieves the data points that are hashed into the same bucket as q.
      These data points from all the :math:`L` hash tables are considered as
      candidates that are then re-ranked by their real distances with the
      query data.

    **Note** that the number of tables :math:`L` and the number of hash
    functions per table :math:`k` are two main parameters. They can be set
    using the options ``num_tables`` and ``num_projections_per_table``
    respectively.

    Hash functions for different distances:

    - `euclidean` and `squared_euclidean`:
      :math:`h(q) = \\lfloor \\frac{a \\cdot q + b}{w} \\rfloor` where
      :math:`a` is a vector, of which the elements are independently sampled
      from normal distribution, and :math:`b` is a number uniformly sampled
      from :math:`[0, r]`. :math:`r` is a parameter for the bucket width. We
      set :math:`r` using the average all-pair `euclidean` distances from a
      small randomly sampled subset of the reference data.

    - `manhattan`: The hash function of `manhattan` is similar with that of
      `euclidean`. The only difference is that the elements of `a` are
      sampled from Cauchy distribution, instead of normal distribution.

    - `cosine`: Random Projection is designed to approximate the cosine
      distance between vectors. The hash function is
      :math:`h(q) = sgn(a \\cdot q)`, where :math:`a` is randomly sampled
      normal unit vector.

    - `jaccard`: We use a recently proposed method one permutation hashing by
      Shrivastava and Li. See the paper `[Shrivastava and Li, UAI 2014]
      <http://www.auai.org/uai2014/proceedings/individuals/225.pdf>`_ for
      details.

    - `dot_product`: The reference data points are first transformed to
      fixed-norm vectors, and then the minimum `dot_product` distance search
      problem can be solved via finding the reference data with smallest
      `cosine` distances. See the paper `[Neyshabur and Srebro, ICML 2015]
      <http://jmlr.org/proceedings/papers/v37/neyshabur15.html>`_ for details.

    References
    ----------
    - `Wikipedia - nearest neighbor search
      <http://en.wikipedia.org/wiki/Nearest_neighbor_search>`_

    - `Wikipedia - ball tree <http://en.wikipedia.org/wiki/Ball_tree>`_

    - Ball tree implementation: Liu, T., et al. (2004) `An Investigation of
      Practical Approximate Nearest Neighbor Algorithms
      <http://papers.nips.cc/paper/2666-an-investigation-of-practical-approximate-nearest-neighbor-algorithms>`_.
      Advances in Neural Information Processing Systems pp. 825-832.

    - `Wikipedia - Jaccard distance
      <http://en.wikipedia.org/wiki/Jaccard_index>`_

    - Weighted Jaccard distance: Chierichetti, F., et al. (2010) `Finding the
      Jaccard Median
      <http://theory.stanford.edu/~sergei/papers/soda10-jaccard.pdf>`_.
      Proceedings of the Twenty-First Annual ACM-SIAM Symposium on Discrete
      Algorithms. Society for Industrial and Applied Mathematics.

    - `Wikipedia - Cosine distance
      <http://en.wikipedia.org/wiki/Cosine_similarity>`_

    - `Wikipedia - Levenshtein distance
      <http://en.wikipedia.org/wiki/Levenshtein_distance>`_

    - Locality Sensitive Hashing : Chapter 3 of the book `Mining Massive
      Datasets <http://infolab.stanford.edu/~ullman/mmds/ch3.pdf>`_.

    Examples
    --------
    Construct a nearest neighbors model with automatically determined method
    and distance:

    >>> sf = graphlab.SFrame({'X1': [0.98, 0.62, 0.11],
    ...                       'X2': [0.69, 0.58, 0.36],
    ...                       'str_feature': ['cat', 'dog', 'fossa']})
    >>> model = graphlab.nearest_neighbors.create(sf, features=['X1', 'X2'])

    For datasets with a large number of rows and up to about 100 variables, the
    ball tree method often leads to much faster queries.

    >>> model = graphlab.nearest_neighbors.create(sf, features=['X1', 'X2'],
    ...                                           method='ball_tree')

    Often the final determination of a neighbor is based on several distance
    computations over different sets of features. Each part of this composite
    distance may have a different relative weight.

    >>> my_dist = [[['X1', 'X2'], 'euclidean', 2.],
    ...            [['str_feature'], 'levenshtein', 3.]]
    ...
    >>> model = graphlab.nearest_neighbors.create(sf, distance=my_dist)
    """

    ## Validate the 'dataset' input
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    ## Basic validation of the features input
    if features is not None and not isinstance(features, list):
        raise TypeError("If specified, input 'features' must be a list of " +
                        "strings.")

    ## Clean the method options and create the options dictionary
    allowed_kwargs = ['leaf_size', 'num_tables', 'num_projections_per_table']
    _method_options = {}

    for k, v in kwargs.items():
        if k in allowed_kwargs:
            _method_options[k] = v
        else:
            raise _ToolkitError(
                "'{}' is not a valid keyword argument".format(k) +
                " for the nearest neighbors model. Please " +
                "check for capitalization and other typos.")

    ## Exclude inappropriate combinations of method and distance. This check
    #  only sees a *standalone* distance (name or function handle); composite
    #  distances given as lists never compare equal to these values.
    if method == 'ball_tree' and (
            distance == 'cosine' or
            distance == _graphlab.distances.cosine or
            distance == 'dot_product' or
            distance == _graphlab.distances.dot_product or
            distance == 'transformed_dot_product' or
            distance == _graphlab.distances.transformed_dot_product):
        raise TypeError(
            "The ball tree method does not work with 'cosine', " +
            "'dot_product', or 'transformed_dot_product' distance. " +
            "Please use the 'brute_force' method for these distances.")

    ## Fill in the distance-dependent LSH default only when the user did not
    #  supply 'num_projections_per_table' explicitly.
    if method == 'lsh' and ('num_projections_per_table' not in _method_options):
        if distance == 'jaccard' or distance == _graphlab.distances.jaccard:
            _method_options['num_projections_per_table'] = 4
        elif distance == 'cosine' or distance == _graphlab.distances.cosine:
            _method_options['num_projections_per_table'] = 16
        else:
            _method_options['num_projections_per_table'] = 8

    ## Initial validation and processing of the label
    if label is None:
        # No label given: add a row-number column under a name derived from
        # '__id' and the existing column names.
        _label = _robust_column_name('__id', dataset.column_names())
        _dataset = dataset.add_row_number(_label)
    else:
        _label = label
        _dataset = _copy.copy(dataset)

    col_type_map = {c: _dataset[c].dtype() for c in _dataset.column_names()}
    _validate_row_label(_label, col_type_map)
    ref_labels = _dataset[_label]

    ## Determine the internal list of available feature names (may still
    #  include the row label name).
    if features is None:
        _features = _dataset.column_names()
    else:
        _features = _copy.deepcopy(features)

    ## Check if there's only one feature and it's the same as the row label.
    #  This would also be trapped by the composite distance validation, but the
    #  error message is not very informative for the user.
    free_features = set(_features).difference([_label])
    if len(free_features) < 1:
        raise _ToolkitError("The only available feature is the same as the " +
                            "row label column. Please specify features " +
                            "that are not also row labels.")

    ### Validate and preprocess the distance function
    ### ---------------------------------------------
    # - The form of the 'distance' controls how we interact with the 'features'
    #   parameter as well.
    # - At this point, the row label 'label' may still be in the list(s) of
    #   features.

    ## Convert any distance function input into a single composite distance.
    # distance is already a composite distance
    if isinstance(distance, list):
        distance = _copy.deepcopy(distance)

    # distance is a single name (except 'auto') or function handle.
    elif (hasattr(distance, '__call__') or
          (isinstance(distance, str) and not distance == 'auto')):
        distance = [[_features, distance, 1]]

    # distance is unspecified and needs to be constructed.
    elif distance is None or distance == 'auto':
        sample = _dataset.head()
        distance = _construct_auto_distance(_features,
                                            _dataset.column_names(),
                                            _dataset.column_types(),
                                            sample)

    else:
        raise TypeError("Input 'distance' not understood. The 'distance' " +
                        "argument must be a string, function handle, or " +
                        "composite distance.")

    ## Basic composite distance validation, remove the row label from all
    #  feature lists, and convert string distance names into distance functions.
    distance = _scrub_composite_distance_features(distance, [_label])
    distance = _convert_distance_names_to_functions(distance)
    _validate_composite_distance(distance)

    ## List-typed columns are only accepted by the sparse-data distances below;
    #  any other distance applied to a list column is an error.
    list_features_to_check = []
    sparse_distances = ['jaccard', 'weighted_jaccard', 'cosine', 'dot_product',
                        'transformed_dot_product']
    sparse_distances = [_graphlab.distances.__dict__[k]
                        for k in sparse_distances]

    for d in distance:
        feature_names, dist, _ = d
        list_features = [f for f in feature_names
                         if _dataset[f].dtype() == list]

        for f in list_features:
            if dist in sparse_distances:
                list_features_to_check.append(f)
            else:
                raise TypeError(
                    "The chosen distance cannot currently be used " +
                    "on list-typed columns.")

    # The accepted list columns are additionally validated against [str].
    for f in list_features_to_check:
        only_str_lists = _validate_lists(_dataset[f], [str])
        if not only_str_lists:
            raise TypeError("Distances for sparse data, such as jaccard " +
                            "and weighted_jaccard, can only be used on " +
                            "lists containing only strings. Please modify " +
                            "any list features accordingly before creating " +
                            "the nearest neighbors model.")

    ## Levenshtein distance is defined on a single string column, so reject
    #  any levenshtein component that names more than one feature.
    for d in distance:
        feature_names, dist, _ = d

        if (len(feature_names) > 1) and \
                (dist == _graphlab.distances.levenshtein):
            raise ValueError(
                "Levenshtein distance cannot be used with multiple " +
                "columns. Please concatenate strings into a single " +
                "column before creating the nearest neighbors model.")

    ## Get the union of feature names and make a clean dataset.
    clean_features = _get_composite_distance_features(distance)
    sf_clean = _tkutl._toolkits_select_columns(_dataset, clean_features)

    ## Decide which method to use
    ## - If more than one distance component (specified either directly or
    #  generated automatically because distance set to 'auto'), then do brute
    #  force.
    if len(distance) > 1:
        _method = 'brute_force'

        # NOTE(review): this branch overrides *any* requested method
        # (including 'lsh' and unrecognized names), although the message only
        # mentions ball tree.
        if method != 'brute_force' and verbose is True:
            print("Defaulting to brute force instead of ball tree because " +
                  "there are multiple distance components.")

    else:
        if method == 'auto':

            # get the total number of variables. Assume the number of elements
            # in array type columns does not change
            num_variables = sum([
                len(x) if hasattr(x, '__iter__') else 1
                for x in _six.itervalues(sf_clean[0])
            ])

            # flag if all the features in the single composite are of numeric
            # type.
            numeric_type_flag = all([
                x in [int, float, list, array.array]
                for x in sf_clean.column_types()
            ])

            ## Conditions necessary for ball tree to work and be worth it
            if ((distance[0][1] in ['euclidean',
                                    'manhattan',
                                    _graphlab.distances.euclidean,
                                    _graphlab.distances.manhattan])
                    and numeric_type_flag is True
                    and num_variables <= 200):
                _method = 'ball_tree'

            else:
                _method = 'brute_force'

        else:
            _method = method

    ## Pick the right model name for the method
    if _method == 'ball_tree':
        model_name = 'nearest_neighbors_ball_tree'
        _mt._get_metric_tracker().track(
            'toolkit.nearest_neighbors_balltree.create')

    elif _method == 'brute_force':
        model_name = 'nearest_neighbors_brute_force'
        _mt._get_metric_tracker().track(
            'toolkit.nearest_neighbors_brute.create')

    elif _method == 'lsh':
        model_name = 'nearest_neighbors_lsh'
        _mt._get_metric_tracker().track('toolkit.nearest_neighbors_lsh.create')

    else:
        raise ValueError(
            "Method must be 'auto', 'ball_tree', 'brute_force', " +
            "or 'lsh'.")

    ## Package the model options
    opts = {}
    opts.update(_method_options)
    opts.update({
        'model_name': model_name,
        'ref_labels': ref_labels,
        'label': label,
        'sf_features': sf_clean,
        'composite_params': distance
    })

    ## Construct the nearest neighbors model
    if not verbose:
        _mt.main.get_server().set_log_progress(False)

    # Restore progress logging even if training raises, so that a failed
    # quiet call does not leave logging switched off.
    try:
        result = _graphlab.extensions._nearest_neighbors.train(opts)
    finally:
        _mt.main.get_server().set_log_progress(True)

    model_proxy = result['model']
    model = NearestNeighborsModel(model_proxy)

    return model
def link(self, dataset, k=5, radius=None, verbose=True):
    """
    Match each record in 'dataset' against the reference records supplied
    when the model was created. String feature columns of the query data are
    cleaned, the distance features are engineered, and the prepared data is
    handed to the underlying nearest neighbors model's ``query`` method.

    The query dataset must include columns with the same names as the label
    and feature columns used to create the RecordLinker model.

    Parameters
    ----------
    dataset : SFrame
        Query data. Must contain columns with the same names and types as the
        features used to train the model. Additional columns are allowed, but
        ignored. Please see the nearest neighbors
        :func:`~graphlab.nearest_neighbors.create` documentation for more
        detail on allowable data types.

    k : int, optional
        Maximum number of nearest neighbors to return from the reference set
        for each query observation. The default is 5, but setting it to
        ``None`` will return all neighbors within ``radius`` of the query
        point.

    radius : float, optional
        Only neighbors whose distance to a query point is smaller than this
        value are returned. The default is ``None``, in which case the ``k``
        nearest neighbors are returned for each query point, regardless of
        distance.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : SFrame
        An SFrame with the k-nearest neighbors of each query observation.
        The result contains four columns: the first is the row label of the
        query observation, the second is the row label of the nearby
        reference observation, the third is the distance between the query
        and reference observations, and the fourth is the rank of the
        reference observation among the query's k-nearest neighbors.

    Notes
    -----
    - If both ``k`` and ``radius`` are set to ``None``, each query point
      returns all of the reference set. If the reference dataset has
      :math:`n` rows and the query dataset has :math:`m` rows, the output is
      an SFrame with :math:`nm` rows.

    Examples
    --------
    Assume we've created the model from the example in the RecordLinker
    'create' function.

    >>> queries = graphlab.SFrame({'sqft': [986, 1320],
    ...                            'street': ['fremont', 'phiney'],
    ...                            'city': ['sea', 'seattle'],
    ...                            'state': ['WA', 'WA']})
    ...
    >>> model.link(queries, k=2, radius=5.)
    +-------------+-----------------+----------+------+
    | query_label | reference_label | distance | rank |
    +-------------+-----------------+----------+------+
    |      0      |        0        |   4.0    |  1   |
    |      0      |        2        |   5.0    |  2   |
    |      1      |        0        |   0.0    |  1   |
    +-------------+-----------------+----------+------+
    """
    _mt._get_metric_tracker().track(self.__module__ + '.link_records')

    # The query data must be a non-empty SFrame.
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    # Called only for its checking effect at 'link' time; the selected
    # columns themselves are not used below.
    _tkutl._toolkits_select_columns(dataset, self.get('features'))

    # Work on copies so neither the caller's SFrame nor the stored distance
    # specification is modified.
    type_by_column = dict(zip(dataset.column_names(), dataset.column_types()))
    query_data = _copy.copy(dataset)
    query_distance = _copy.deepcopy(self._state['distance'])

    # Clean and impute string data. *** Think about consolidating this and the
    # feature engineering step into a feature transformer. ***
    for feature in self.get('features'):
        if type_by_column[feature] != str:
            continue

        cleaned_name = '__clean.' + feature
        query_data[cleaned_name] = query_data[feature].fillna("").apply(
            lambda value: _dmutl.cleanse_string(value), dtype=str)

        # Point every distance component at the cleaned column instead of the
        # raw one.
        for component in query_distance:
            component[0] = [cleaned_name if name == feature else name
                            for name in component[0]]

    # Convert strings to dicts and concatenate string features.
    query_data, _ = _engineer_distance_features(query_data, query_distance)

    # Query the nearest neighbor model.
    return self._knn_model.query(query_data, k=k, radius=radius,
                                 verbose=verbose)
def create(dataset, label=None, features=None, distance='auto', method='auto', composite_params=None, verbose=True, **kwargs): """ Create a nearest neighbor model, which can be searched efficiently and quickly for the nearest neighbors of a query observation. If the `method` argument is specified as `auto`, the type of model is chosen automatically based on the type of data in `dataset`. Parameters ---------- dataset : SFrame Reference data. If the features for each observation are numeric, they may be in separate columns of 'dataset' or a single column with lists of values. The features may also be in the form of a column of sparse vectors (i.e. dictionaries), with string keys and numeric values. label : string, optional Name of the SFrame column with row labels. If 'label' is not specified, row numbers are used to identify reference dataset rows when the model is queried. features : list[string], optional Name of the columns with features to use in computing distances between observations and the query points. 'None' (the default) indicates that all columns except the label should be used as features. Each column can be one of the following types: - *Numeric*: values of numeric type integer or float. - *Array*: list of numeric (integer or float) values. Each list element is treated as a separate variable in the model. - *Dictionary*: key-value pairs with numeric (integer or float) values. Each key indicates a separate variable in the model. - *String*: string values. Columns of type *list* are not supported. Convert them to array columns if all entries in the list are of numeric types. Please note: if `composite_params` is also specified, this parameter is ignored. distance : string or function, optional Name of the function that measures the distances between two observations. Please see the notes and references for detailed descriptions of the distances. Note that for sparse vectors, missing keys are assumed to have value 0.0. 
Please note: if `composite_params` is also specified, this parameter is ignored. - *auto* (default): the model chooses a reasonable distance based on the data types in 'dataset'. Columns of type str will be compared using levenshtein distance, columns of type dict use jaccard distance, and columns of type float, int, or list will be combined and use euclidean distance. The set of column-specific distances are aggregated into a single composite distance. - *squared_euclidean*: works only with the `brute_force` method because it is not a metric. - *euclidean* - *manhattan* - *jaccard*: works only with variables in a dictionary feature, where the keys are treated as a set and the values are ignored. - *weighted_jaccard*: like jaccard distance, works only with variables in a dictionary feature. For the weighted version of jaccard, however, the values of the dictionary are used to weight the contribution of each key. This is done by taking the minimum of the two values for each key in the numerator and the maximum of the two values in the denominator. - *cosine*: works only with the 'brute_force' method because it is not a true metric. Please see `Wikipedia <http://en.wikipedia.org/wiki/Cosine_similarity>`_ for more detail. - *dot_product*: works only with the 'brute_force' method because it is not a true metric. - *levenshtein*: for a single column of string inputs. method : {'auto', 'ball_tree', 'brute_force'}, optional Method for computing nearest neighbors. The options are: - *auto* (default): the method is chosen automatically, based on the type of data and the distance. If the distance is 'manhattan' or 'euclidean' and the features are numeric or vectors of numeric values, then the 'ball_tree' method is used. Otherwise, the 'brute_force' method is used. - *ball_tree*: use a tree structure to find the k-closest neighbors to each query point. The ball tree model is slower to construct than the brute force model, but queries are faster than linear time. 
This method is not applicable for the cosine and dot product distances. See `Liu, et al (2004) <http://papers.nips.cc/paper/2666-an-investigation-of-p ractical-approximat e-nearest-neighbor-algorithms>`_ for implementation details. - *brute_force*: compute the distance from a query point to all reference observations. There is no computation time for model creation with the brute force method (although the reference data is held in the model, but each query takes linear time. composite_params : list [list [list [string], string or function, float]] Multiple sets of features and corresponding distance functions can be used as inputs to a composite distance function. Each element of this composite is specified by a list in this argument. Each inner list must include a list of feature names, the name of a distance function, and a relative weight. See the examples and notes sections below. If `composite_params` is specified, any standalone `features`, `distance`, and `method` arguments are ignored. Keyword arguments are applied to each member of the composite distance computation. verbose: bool, optional If True, print progress updates and model details. **kwargs : optional Options for the distance function and query method. - *leaf_size*: for the ball tree method, the number of points in each leaf of the tree. The default is to use the max of 1,000 and n/(2^11), which ensures a maximum tree depth of 12. The default leaf size is indicated by a "0" in the :func:`~graphlab.nearest_neighbors.NearestNeighborsModel.get_default_options` method. Returns ------- out : NearestNeighborsModel A structure for efficiently computing the nearest neighbors in 'dataset' of new query points. See Also -------- NearestNeighborsModel.query Notes ----- - If the features should be weighted equally in the distance calculations but are measured on different scales, it is important to standardize the features. 
One way to do this is to subtract the mean of each column and divide by the standard deviation. - Distance definitions. Suppose :math:`u` and :math:`v` are observations with :math:`d` variables each. - `squared_euclidean` .. math:: D(u, v) = \sum_i^d (u_i - v_i)^2 - `euclidean` .. math:: D(u, v) = \\sqrt{\sum_i^d (u_i - v_i)^2} - `manhattan` .. math:: D(u, v) = \\sum_i^d |u_i - v_i| - `cosine` .. math:: D(u, v) = 1 - \\frac{\sum_i^d u_i v_i} {\sqrt{\sum_i^d u_i^2}\sqrt{\sum_i^d v_i^2}} - `dot_product` .. math:: D(u, v) = \\frac{1}{\sum_i^d u_i v_i} - For the jaccard distances, suppose :math:`S` and :math:`T` are the sets of keys from two observations' dictionaries. For the weighted version of jaccard distance, suppose :math:`S_k` and :math:`T_k` are the values associated with key :math:`k` in the respective dictionaries. Typically these values are counts, i.e. of words or n-grams. - `jaccard` .. math:: D(S, T) = 1 - \\frac{|S \cap T|}{|S \cup T|} - `weighted_jaccard` .. math:: D(S, T) = 1 - \\frac{\sum_{k \in S \cup T} \min\{S_k, T_k\}} {\sum_{k \in S \cup T} \max\{S_k, T_k\}} - Levenshtein distance is a type of edit distance for string types. The distance is the number of insertion, deletion, and substituion edits needed to transform string :math:`A` into string :math:`B`. .. math:: D(A, B) = d(|A|, |B|) .. math :: d(i, j) = \max(i, j), \quad \mathrm{if } \min(i, j) = 0 .. math :: d(i, j) = \min \Big \{d(i-1, j) + 1, \ d(i, j-1) + 1, \ d(i-1, j-1) + I(A_i \\neq B_i) \Big \}, \quad \mathrm{else} - Composite distances are simply weighted sums of the above distances. The set of features input to each component distance may vary, and the weight on each component acts as a multiplier before each of the component distances is summed. For example, if ``composite_params`` is set to ``[[['X1', 'X2'], 'euclidean', 2], [['X2', 'X3'], 'manhattan', 3]]``, then the overall distance computation for rows :math:`a` and :math:`b` is: .. 
math:: D(a, b) = 2 * D_{euclidean}(a[[X1, X2]], b[[X1, X2]]) + 3 * D_{manhattan}(a[[X2, X3]], b[[X2, X3]]) References ---------- - `Wikipedia - nearest neighbor search <http://en.wikipedia.org/wiki/Nearest_neighbor_search>`_ - `Wikipedia - ball tree <http://en.wikipedia.org/wiki/Ball_tree>`_ - Ball tree implementation: Liu, T., et al. (2004) `An Investigation of Practical Approximate Nearest Neighbor Algorithms <http://papers.nips.cc/paper/2666-an-investigation-of-p ractical-approximat e-nearest-neighbor-algorithms>`_. Advances in Neural Information Processing Systems pp. 825-832. - `Wikipedia - Jaccard distance <http://en.wikipedia.org/wiki/Jaccard_index>`_ - Weighted Jaccard distance: Chierichetti, F., et al. (2010) `Finding the Jaccard Median <http://theory.stanford.edu/~sergei/papers/soda10-jaccard.pdf>`_. Proceedings of the Twenty-First Annual ACM-SIAM Symposium on Discrete Algorithms. Society for Industrial and Applied Mathematics. - `Wikipedia - Cosine distance <http://en.wikipedia.org/wiki/Cosine_similarity>`_ - `Wikipedia - Levenshtein distance <http://en.wikipedia.org/wiki/Levenshtein_distance>`_ Examples -------- Construct a nearest neighbors model with automatically determined method and distance: >>> sf = graphlab.SFrame({'X1': [0.98, 0.62, 0.11], ... 'X2': [0.69, 0.58, 0.36], 'str_feature': ['cat', 'dog', 'fossa']}) >>> model = graphlab.nearest_neighbors.create(sf, features=['X1', 'X2']) For datasets with a large number of rows and up to about 100 variables, the ball tree method often leads to much faster queries. >>> model = graphlab.nearest_neighbors.create(sf, features=['X1', 'X2'], ... method='ball_tree') Often the final determination of a neighbor is based on several distance computations over different sets of features. Each part of this composite distance may have a different relative weight. >>> model = graphlab.nearest_neighbors.create(sf, composite_params=[ ... [['X1', 'X2'], 'euclidean', 2.], ... 
[['str_feature'], 'levenshtein', 3.]]) """ ## Validate the 'dataset' input _tkutl._raise_error_if_not_sframe(dataset, "dataset") _tkutl._raise_error_if_sframe_empty(dataset, "dataset") ## Exclude inappropriate combinations of method an distance if method == 'ball_tree' and (distance == 'cosine' or distance == _graphlab.distances.cosine or distance == 'dot_product' or distance == _graphlab.distances.dot_product): raise TypeError("The ball tree method does not work with 'cosine' " +\ "or 'dot_product' distance. Please use the 'brute_force' " +\ "method for these distances.") ## Initial validation and processing of the label if label is None: _label = '__id' try: _dataset = dataset.add_row_number(column_name=_label) except: print "Tried to add a default label column '{}' ".format(_label) +\ "but a column by this name already exists. Using the " + \ "existing column as the label column." _dataset = dataset else: if not label in dataset.column_names(): raise ValueError( "Input 'label' must be a string matching the name of a " +\ "column in the reference SFrame 'dataset'.") if not dataset[label].dtype() == str and not dataset[label].dtype() == int: raise TypeError("The label column must contain integers or strings.") _label = label _dataset = dataset sf_label = _tkutl._toolkits_select_columns(_dataset, [_label]) ## Clean the method options and create the options dictionary if len(kwargs) > 0: _method_options = {k.lower(): v for k, v in kwargs.items()} else: _method_options = {} ## If composite inputs aren't specifed, formulate the standalone inputs as a # composite input for code simplicity. If the standalone input doesn't # specify features, choose all the features and make a set of distance # components based on feature type. 
if composite_params is None: ## If not features specified, use them all if features is None: _features = [x for x in _dataset.column_names() if x != _label] else: _features = features[:] ## If the distance argument is 'auto', turn it into a list of distance # components by choosing an automatic distance for each feature based # on its type. if distance == 'auto': _composites = choose_auto_distance(_features, dataset.column_names(), dataset.column_types()) else: _composites = [[_features, distance, 1]] # Ignore automatically generated components if components have been provided else: if distance != 'auto': raise ValueError( "Either the 'distance' parameter or the 'composite_params' " +\ "parameter may be specified, but not both.") if features is not None: raise ValueError( "Either the 'features' parameter or the 'composite_params' " +\ "parameter may be specified, but not both.") if len(composite_params) == 0: raise ValueError( "'composite_params' was specified as an empty list. If " +\ "specified, this parameter must contain at least one distance " +\ "component, which is a list containing three elements: a list " +\ "of feature names, a distance name or function, and a relative " +\ "weight.") _composites = copy.deepcopy(composite_params) ## Clean the list of features in each component of the composite inputs, and # compile the union of the lists of features. 
all_features = [] for i in range(len(_composites)): if len(_composites[i]) != 3: raise ValueError("Each element of 'composite_params' must be a " +\ "list with three members.") feature_names = _composites[i][0] if len(feature_names) == 0: raise ValueError("An empty list of features cannot be passed " +\ "as part of a composite distance function.") # set of features must be iterable _tkutl._raise_error_if_not_iterable(feature_names, "features") # feature names must be strings if not all([isinstance(x, str) for x in feature_names]): raise TypeError("Input 'features' must contain only strings.") # remove the label name from all of the features lists feature_names = [x for x in feature_names if x != _label] # ensure that string features are in single columns if len(feature_names) > 1 and any([_dataset[x].dtype() is str for x in feature_names]): raise ValueError( "Multiple features have been entered, one of which is of string " +\ "type. If the input features for any distance component contain a " +\ "string column, that must be the only column for that component.") # ensure that relative weights are integers or floats if not isinstance(_composites[i][2], (int, float)): raise ValueError( "The weight of each distance component must be a single " +\ "integer or a float value.") # combine all features into a big list _composites[i][0] = feature_names all_features += feature_names # convert distance strings to distance functions temp_dist = _composites[i][1] if isinstance(temp_dist, str): _composites[i][1] = _graphlab.util._get_distance(temp_dist) # Pull out the relevant features from the input dataset (the union of # features over all distance components) all_features = list(set(all_features)) sf_features = _tkutl._toolkits_select_columns(_dataset, all_features) ## Decide which method to use ## - If more than one distance component (specified either directly or # generated automatically because distance set to 'auto'), then do brute # force. 
if len(_composites) > 1: _method = 'brute_force' if method == 'ball_tree': print "Defaulting to brute force instead of ball tree because " +\ "there are multiple distance components." else: if method == 'auto': # get the total number of variables. Assume the number of elements in # array type columns does not change num_variables = sum([len(x) if hasattr(x, '__iter__') else 1 for x in sf_features[0].itervalues()]) # flag if all the features in the single composite are of numeric # type. numeric_type_flag = all([x in [int, float, list, array.array] for x in sf_features.column_types()]) ## Conditions necessary for ball tree to work and be worth it if ((_composites[0][1] in ['euclidean', 'manhattan', _graphlab.distances.euclidean, _graphlab.distances.manhattan]) and numeric_type_flag is True and num_variables <= 100): _method = 'ball_tree' else: _method = 'brute_force' else: _method = method ## Pick the right model name for the method if _method == 'ball_tree': model_name = 'nearest_neighbors_ball_tree' _mt._get_metric_tracker().track('toolkit.nearest_neighbors_balltree.create') elif _method == 'brute_force': model_name = 'nearest_neighbors_brute_force' _mt._get_metric_tracker().track('toolkit.nearest_neighbors_brute.create') else: raise ValueError("Method must be 'brute_force', 'ball_tree', or 'auto'") ## Package the model options opts = {} opts.update(_method_options) opts.update( {'model_name': model_name, 'sf_label': sf_label, 'sf_features': sf_features, 'composite_params': _composites}) ## Construct the nearest neighbors model if verbose: print "Starting model construction..." result = _graphlab.extensions._nearest_neighbors.train(opts) model_proxy = result['model'] model = NearestNeighborsModel(model_proxy) if verbose: model.summary() print return model
def create_with_model_selector(dataset, target, model_selector,
                               features=None, verbose=True):
    """
    Create a :class:`~graphlab.toolkits.SupervisedLearningModel` whose type is
    chosen by a model selector.

    The feature columns are down-sampled to roughly 100,000 rows, the sample
    is handed to ``model_selector``, and the model it names is then trained on
    the full ``dataset``.

    This function is normally not called, call specific model's create
    function instead.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    target : string
        Name of the column containing the target variable.

    model_selector : function
        Callable that receives the (possibly sampled) SFrame of feature
        columns and returns the name of the model to train.

    features : list[string], optional
        List of feature names used by feature column. If None, every column
        of ``dataset`` except ``target`` is used.

    verbose : boolean
        whether print out messages during training

    Returns
    -------
    out : model
        The result of the neural net classifier's ``create`` when the
        selector returns 'neuralnet_classifier'; otherwise a
        BoostedTreesRegression, LinearRegression, BoostedTreesClassifier,
        LogisticClassifier or SVMClassifier wrapping the trained model's
        proxy.

    Raises
    ------
    TypeError
        If ``features`` is not iterable or contains a non-string name.

    ToolkitError
        If the selected model name is not one of the names handled here.
    """
    # Error checking
    _raise_error_if_not_sframe(dataset, "training dataset")
    if features is None:
        features = dataset.column_names()
        if target in features:
            features.remove(target)
    if not hasattr(features, '__iter__'):
        raise TypeError("Input 'features' must be a list.")
    # Report the feature that actually failed the check. The previous code
    # interpolated a list-comprehension variable, which names the *last*
    # feature on Python 2 and is undefined on Python 3.
    for name in features:
        if not isinstance(name, str):
            raise TypeError(
                "Invalid feature %s: Feature names must be of type str"
                % (name,))

    # Sample the data: the selector only ever sees about 1e5 rows.
    features_sframe = _toolkits_select_columns(dataset, features)
    if features_sframe.num_rows() > 1e5:
        fraction = 1.0 * 1e5 / features_sframe.num_rows()
        features_sframe = features_sframe.sample(fraction, seed=0)

    # Run the model selector.
    selected_model_name = model_selector(features_sframe)

    # The neural net classifier is built through its own create function.
    if selected_model_name == 'neuralnet_classifier':
        return _graphlab.classifier.neuralnet_classifier.create(
            dataset, target, features=features, verbose=verbose)

    # Multi-class through boosted trees
    if ('classifier' in selected_model_name) and \
            (dataset[target].unique().size() > 2):
        selected_model_name = 'boosted_trees_classifier'

    # Create the model
    model = create(dataset, target, selected_model_name,
                   features=features, verbose=verbose)

    # Return the model, wrapped in its user-facing class
    if selected_model_name == 'boosted_trees_regression':
        return _graphlab.boosted_trees_regression.BoostedTreesRegression(
            model.__proxy__)
    elif selected_model_name == 'regression_linear_regression':
        return _graphlab.linear_regression.LinearRegression(
            model.__proxy__)
    elif selected_model_name == 'boosted_trees_classifier':
        return _graphlab.boosted_trees_classifier.BoostedTreesClassifier(
            model.__proxy__)
    elif selected_model_name == 'classifier_logistic_regression':
        return _graphlab.logistic_classifier.LogisticClassifier(
            model.__proxy__)
    elif selected_model_name == 'classifier_svm':
        return _graphlab.svm_classifier.SVMClassifier(model.__proxy__)
    else:
        # Call form of raise: valid on both Python 2 and Python 3.
        raise ToolkitError("Internal error: Incorrect model returned.")
def create(dataset, target, model_name, features=None,
           validation_set = None, verbose = True, **kwargs):
    """
    Create a :class:`~graphlab.toolkits.SupervisedLearningModel`,

    This is generic function that allows you to create any model that
    implements SupervisedLearningModel This function is normally not called,
    call specific model's create function instead

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    target : string
        Name of the column containing the target variable.

    model_name : string
        Name of the model

    features : list[string], optional
        List of feature names used by feature column. If None, every column
        of ``dataset`` except ``target`` is used.

    validation_set : SFrame, optional
        The validation set that is used to watch the validation result as
        boosting progress. Its ``features`` and ``target`` columns are
        selected and passed along with the training data.

    verbose : boolean
        whether print out messages during training

    kwargs : dict
        Additional parameter options that can be passed. Keys are
        lower-cased before being added to the training options.

    Returns
    -------
    out : SupervisedLearningModel
        Wrapper around the trained model returned by the
        "supervised_learning_train" toolkit call.

    Raises
    ------
    TypeError
        If ``features`` is not iterable or contains a non-string name.
    """
    _raise_error_if_not_sframe(dataset, "training dataset")

    # Target
    target_sframe = _toolkits_select_columns(dataset, [target])

    # Features
    if features is None:
        features = dataset.column_names()
        features.remove(target)
    if not hasattr(features, '__iter__'):
        raise TypeError("Input 'features' must be a list.")
    # Report the feature that actually failed the check. The previous code
    # interpolated a list-comprehension variable, which names the *last*
    # feature on Python 2 and is undefined on Python 3.
    for name in features:
        if not isinstance(name, str):
            raise TypeError(
                "Invalid feature %s: Feature names must be of type str"
                % (name,))
    features_sframe = _toolkits_select_columns(dataset, features)

    # User options first (lower-cased keys); the fixed entries below are
    # applied afterwards so they win over a same-named keyword argument.
    options = {key.lower(): value for key, value in kwargs.items()}
    options.update({'target': target_sframe,
                    'features': features_sframe,
                    'model_name': model_name})

    if validation_set is not None:
        options.update({
            'features_validation': _toolkits_select_columns(validation_set,
                                                            features),
            'target_validation': _toolkits_select_columns(validation_set,
                                                          [target])})

    ret = _graphlab.toolkits._main.run("supervised_learning_train", options,
                                       verbose=verbose)
    model = SupervisedLearningModel(ret['model'], model_name)
    return model
def create(dataset, target, model_name, env, features=None,
           validation_set='auto', verbose=True, **kwargs):
    """
    Assemble the options for a supervised learning model and submit them as
    a "distributed_supervised_train" job.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    target : string
        Name of the column containing the target variable.

    model_name : string
        Name of the model

    env : object
        Forwarded unchanged to ``_dml.run``.
        NOTE(review): presumably the distributed execution environment;
        confirm against ``_dml``.

    features : list[string], optional
        List of feature names. If None, every column of ``dataset`` except
        ``target`` is used.

    validation_set : SFrame, 'auto' or None, optional
        With 'auto' (the default) and at least 100 rows in ``dataset``, the
        data is split with ``dataset.random_split(.95)`` and the second part
        is used for validation; with fewer rows no validation set is used.
        ``None`` disables validation tracking.

    verbose : boolean
        If True, print a notice when a validation set is split off.

    kwargs : dict
        Additional parameter options. Keys are lower-cased before being
        added to the training options.

    Returns
    -------
    out
        Whatever ``_dml.run`` returns for the submitted job.

    Raises
    ------
    TypeError
        If ``validation_set`` is a string other than 'auto', or is neither
        None nor an SFrame; if ``features`` is not iterable or contains a
        non-string name.
    """
    _raise_error_if_not_sframe(dataset, "training dataset")

    # Create a validation set
    if isinstance(validation_set, str):
        if validation_set == 'auto':
            if dataset.num_rows() >= 100:
                if verbose:
                    print(
                        "PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.\n"
                        " You can set ``validation_set=None`` to disable validation tracking.\n"
                    )
                dataset, validation_set = dataset.random_split(.95)
            else:
                validation_set = None
        else:
            raise TypeError('Unrecognized value for validation_set.')

    # Target
    target_sframe = _toolkits_select_columns(dataset, [target])

    # Features
    if features is None:
        features = dataset.column_names()
        features.remove(target)
    if not hasattr(features, '__iter__'):
        raise TypeError("Input 'features' must be a list.")
    # Report the feature that actually failed the check. The previous code
    # interpolated a list-comprehension variable, which names the *last*
    # feature on Python 2 and is undefined on Python 3.
    for name in features:
        if not isinstance(name, str):
            raise TypeError(
                "Invalid feature %s: Feature names must be of type str"
                % (name,))
    features_sframe = _toolkits_select_columns(dataset, features)

    # User options first (lower-cased keys); the fixed entries below are
    # applied afterwards so they win over a same-named keyword argument.
    options = {key.lower(): value for key, value in kwargs.items()}
    options.update({
        'target': target_sframe,
        'features': features_sframe,
        'model_name': model_name
    })

    if validation_set is not None:
        if not isinstance(validation_set, _graphlab.SFrame):
            raise TypeError(
                "validation_set must be either 'auto' or an SFrame matching the training data."
            )

        # Attempt to append the two datasets together to check schema
        validation_set.head().append(dataset.head())

        options.update({
            'features_validation':
            _toolkits_select_columns(validation_set, features),
            'target_validation':
            _toolkits_select_columns(validation_set, [target])
        })

    from . import _dml
    dml_obj = _dml.run("distributed_supervised_train", model_name, options,
                       env)
    return dml_obj
def create_classification_with_model_selector(dataset, target, model_selector,
                                              features=None,
                                              validation_set='auto',
                                              verbose=True):
    """
    Train every classifier proposed by a model selector and return the one
    with the best accuracy.

    The feature columns are down-sampled to roughly 100,000 rows and passed,
    together with the number of distinct target values, to
    ``model_selector``. Each model it names is then fitted with
    ``create_selected`` and the fitted models are compared on their reported
    accuracy.

    This function is normally not called, call specific model's create
    function instead.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    target : string
        Name of the column containing the target variable.

    model_selector : function
        Called as ``model_selector(num_classes, features_sframe)``; returns
        the names of the candidate models. Each name must be one of
        'boosted_trees_classifier', 'random_forest_classifier',
        'classifier_logistic_regression', 'classifier_svm',
        'neuralnet_classifier' or 'neuralnet_classifier_v2'.

    features : list[string], optional
        List of feature names used by feature column. If None, every column
        of ``dataset`` except ``target`` is used.

    validation_set : SFrame, 'auto' or None, optional
        With 'auto' (the default) and at least 100 rows in ``dataset``, the
        data is split with ``dataset.random_split(.95)`` and the second part
        is used for validation; with fewer rows no validation set is used.
        Any other value is forwarded to ``create_selected`` unchanged.

    verbose : boolean
        whether print out messages during training

    Returns
    -------
    out : model
        The fitted model with the highest metric. The metric is the last
        'Validation-accuracy' value of the model's progress table when that
        column exists, else the last 'Training-accuracy' value, else the
        model's 'validation_accuracy' field. On ties the model listed first
        by the selector wins.

    Raises
    ------
    TypeError
        If ``features`` is not iterable or contains a non-string name, or if
        ``validation_set`` is a string other than 'auto'.

    ValueError
        If a fitted model exposes neither a 'progress' nor a
        'validation_accuracy' field.
    """
    # Error checking
    _raise_error_if_not_sframe(dataset, "training dataset")
    if features is None:
        features = dataset.column_names()
        if target in features:
            features.remove(target)
    if not hasattr(features, '__iter__'):
        raise TypeError("Input 'features' must be a list.")
    # Report the feature that actually failed the check. The previous code
    # interpolated a list-comprehension variable, which names the *last*
    # feature on Python 2 and is undefined on Python 3.
    for name in features:
        if not isinstance(name, str):
            raise TypeError(
                "Invalid feature %s: Feature names must be of type str"
                % (name,))

    # Sample the data: the selector only ever sees about 1e5 rows.
    features_sframe = _toolkits_select_columns(dataset, features)
    if features_sframe.num_rows() > 1e5:
        fraction = 1.0 * 1e5 / features_sframe.num_rows()
        features_sframe = features_sframe.sample(fraction, seed=0)

    # Get available models for this dataset
    num_classes = dataset[target].unique().size()
    selected_model_names = model_selector(num_classes, features_sframe)

    # Create a validation set
    if isinstance(validation_set, str):
        if validation_set == 'auto':
            if dataset.num_rows() >= 100:
                if verbose:
                    print_validation_track_notification()
                dataset, validation_set = dataset.random_split(.95)
            else:
                validation_set = None
        else:
            raise TypeError('Unrecognized value for validation_set.')

    # Match C++ model names with user model names
    python_names = {'boosted_trees_classifier': 'BoostedTreesClassifier',
                    'random_forest_classifier': 'RandomForestClassifier',
                    'classifier_logistic_regression': 'LogisticClassifier',
                    'classifier_svm': 'SVMClassifier',
                    'neuralnet_classifier': 'NeuralNetClassifier',
                    'neuralnet_classifier_v2': 'NeuralNetClassifier'}

    # Print useful user-facing progress messages. Each print takes a single
    # parenthesised argument so the output is the same on Python 2 and 3.
    print('PROGRESS: The following methods are available for this type of problem.')
    print('PROGRESS: ' + ', '.join([python_names[x] for x in selected_model_names]))
    if len(selected_model_names) > 1:
        print('PROGRESS: The returned model will be chosen according to validation accuracy.')

    models = {}
    metrics = {}
    for model_name in selected_model_names:

        # Fit each of the available models
        m = create_selected(model_name, dataset, target, features,
                            validation_set, verbose)
        models[model_name] = m

        # Get the last progress value or validation_accuracy, whichever is
        # there
        if 'progress' in m.list_fields():
            prog = m['progress']
            validation_column = 'Validation-accuracy'
            accuracy_column = 'Training-accuracy'
            if validation_column in prog.column_names():
                metrics[model_name] = float(prog[validation_column].tail(1)[0])
            else:
                metrics[model_name] = float(prog[accuracy_column].tail(1)[0])

        # Validation accuracy (for boosted trees.)
        elif 'validation_accuracy' in m.list_fields():
            metrics[model_name] = m['validation_accuracy']

        else:
            raise ValueError(
                "Model does not have metrics that can be used for model selection.")

    # Choose the model with the highest metric; the strict comparison keeps
    # the earliest candidate on ties.
    best_model = None
    best_acc = None
    for model_name in selected_model_names:
        if best_acc is None or best_acc < metrics[model_name]:
            best_model = model_name
            best_acc = metrics[model_name]

    # Summarise the comparison for the user.
    ret = []
    width = 32
    if len(selected_model_names) > 1:
        ret.append('PROGRESS: Model selection based on validation accuracy:')
        ret.append('---------------------------------------------')
        key_str = '{:<{}}: {}'
        for model_name in selected_model_names:
            name = python_names[model_name]
            row = key_str.format(name, width, str(metrics[model_name]))
            ret.append(row)
        ret.append('---------------------------------------------')
        ret.append('Selecting ' + python_names[best_model] +
                   ' based on validation set performance.')
    print('\nPROGRESS: '.join(ret))
    return models[best_model]
def query(self, dataset, label=None, k=5, radius=None, verbose=True):
    """
    Retrieve, for every row of 'dataset', its nearest neighbors among the
    reference data stored in the model.

    The query dataset does not have to be the same data the model was
    created from.

    Parameters
    ----------
    dataset : SFrame
        Query data. Must contain columns with the same names and types as
        the features used to train the model. Additional columns are
        allowed, but ignored. Please see the nearest neighbors
        :func:`~graphlab.nearest_neighbors.create` documentation for more
        detail on allowable data types.

    label : str, optional
        Name of the query SFrame column with row labels. The column must
        hold integers or strings and must not be one of the model's
        features. If 'label' is not specified, row numbers are used to
        identify query dataset rows in the output SFrame.

    k : int, optional
        Number of nearest neighbors to return from the reference set for
        each query observation. Must be a positive integer. The default is
        5 neighbors, but setting it to ``None`` will return all neighbors
        within ``radius`` of the query point.

    radius : float, optional
        Only neighbors whose distance to a query point is smaller than this
        value are returned. Must be non-negative. The default is ``None``,
        in which case the ``k`` nearest neighbors are returned for each
        query point, regardless of distance.

    verbose: bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : SFrame
        An SFrame with the k-nearest neighbors of each query observation.
        The result contains four columns: the first is the label of the
        query observation, the second is the label of the nearby reference
        observation, the third is the distance between the query and
        reference observations, and the fourth is the rank of the reference
        observation among the query's k-nearest neighbors.

    Raises
    ------
    ValueError
        If 'label' is not a column of 'dataset' or is one of the model's
        features, if 'k' is not a positive integer, or if 'radius' is not a
        non-negative number.

    TypeError
        If the label column holds neither integers nor strings.

    See Also
    --------
    similarity_graph

    Notes
    -----
    - The `dataset` input to this method *can* have missing values (in
      contrast to the reference dataset used to create the nearest
      neighbors model). Missing numeric values are imputed to be the mean
      of the corresponding feature in the reference dataset, and missing
      strings are imputed to be empty strings.

    - If both ``k`` and ``radius`` are set to ``None``, each query point
      returns all of the reference set. If the reference dataset has
      :math:`n` rows and the query dataset has :math:`m` rows, the output
      is an SFrame with :math:`nm` rows.

    - For models created with the 'lsh' method, the query results may have
      fewer query labels than input query points. Because LSH is an
      approximate method, a query point may have fewer than 'k' neighbors.
      If LSH returns no neighbors at all for a query, the query point is
      omitted from the results.

    Examples
    --------
    First construct a toy SFrame and create a nearest neighbors model:

    >>> sf = graphlab.SFrame({'label': range(3),
    ...                       'feature1': [0.98, 0.62, 0.11],
    ...                       'feature2': [0.69, 0.58, 0.36]})
    >>> model = graphlab.nearest_neighbors.create(sf, 'label')

    A new SFrame contains query observations with same schema as the
    reference SFrame. This SFrame is passed to the ``query`` method.

    >>> queries = graphlab.SFrame({'label': range(3),
    ...                            'feature1': [0.05, 0.61, 0.99],
    ...                            'feature2': [0.06, 0.97, 0.86]})
    >>> model.query(queries, 'label', k=2)
    +-------------+-----------------+----------------+------+
    | query_label | reference_label |    distance    | rank |
    +-------------+-----------------+----------------+------+
    |      0      |        2        | 0.305941170816 |  1   |
    |      0      |        1        | 0.771556867638 |  2   |
    |      1      |        1        | 0.390128184063 |  1   |
    |      1      |        0        | 0.464004310325 |  2   |
    |      2      |        0        | 0.170293863659 |  1   |
    |      2      |        1        | 0.464004310325 |  2   |
    +-------------+-----------------+----------------+------+
    """
    _mt._get_metric_tracker().track('toolkit.nearest_neighbors.query')

    ## The query data must be a non-empty SFrame.
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    ## Restrict the query data to the columns the model was built on.
    ref_features = self.get('features')
    sf_features = _tkutl._toolkits_select_columns(dataset, ref_features)

    ## Work out the row labels: a user column if given, row numbers if not.
    if label is not None:
        if label not in dataset.column_names():
            raise ValueError(
                "Input 'label' must be a string matching the name of a "
                "column in the reference SFrame 'dataset'.")

        if not (dataset[label].dtype() == str or
                dataset[label].dtype() == int):
            raise TypeError(
                "The label column must contain integers or strings.")

        if label in ref_features:
            raise ValueError(
                "The label column cannot be one of the features.")

        query_labels = dataset[label]
    else:
        query_labels = _graphlab.SArray.from_sequence(len(dataset))

    ## Check 'k', mapping None to the special value -1.
    if k is None:
        k = -1
    elif not isinstance(k, int):
        raise ValueError("Input 'k' must be an integer.")
    elif k <= 0:
        raise ValueError("Input 'k' must be larger than 0.")

    ## Check 'radius', mapping None to the special value -1.0.
    if radius is None:
        radius = -1.0
    elif not isinstance(radius, (int, float)):
        raise ValueError("Input 'radius' must be an integer or float.")
    elif radius < 0:
        raise ValueError("Input 'radius' must be non-negative.")

    ## Hand everything to the toolkit and wrap the returned proxy.
    opts = {}
    opts['model'] = self.__proxy__
    opts['model_name'] = self.__name__
    opts['features'] = sf_features
    opts['query_labels'] = query_labels
    opts['k'] = k
    opts['radius'] = radius

    result = _graphlab.toolkits._main.run('_nearest_neighbors.query', opts,
                                          verbose)
    neighbors = result['neighbors']
    return _SFrame(None, _proxy=neighbors)
def link(self, dataset, k=5, radius=None, verbose=True):
    """
    Match each record of 'dataset' against the reference records stored in
    the model when it was created.

    String features are cleaned into new '__clean.'-prefixed columns on a
    copy of the query data, the distance features are engineered from that
    copy, and the model's underlying nearest neighbors model is queried.

    Parameters
    ----------
    dataset : SFrame
        Query data. Must contain columns with the same names and types as
        the features used to train the model. Additional columns are
        allowed, but ignored. Please see the nearest neighbors
        :func:`~graphlab.nearest_neighbors.create` documentation for more
        detail on allowable data types.

    k : int, optional
        Maximum number of nearest neighbors to return from the reference
        set for each query observation. The default is 5, but setting it to
        ``None`` will return all neighbors within ``radius`` of the query
        point.

    radius : float, optional
        Only neighbors whose distance to a query point is smaller than this
        value are returned. The default is ``None``, in which case the
        ``k`` nearest neighbors are returned for each query point,
        regardless of distance.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : SFrame
        An SFrame with the k-nearest neighbors of each query observation.
        The result contains four columns: the first is the row label of the
        query observation, the second is the row label of the nearby
        reference observation, the third is the distance between the query
        and reference observations, and the fourth is the rank of the
        reference observation among the query's k-nearest neighbors.

    Notes
    -----
    - If both ``k`` and ``radius`` are set to ``None``, each query point
      returns all of the reference set. If the reference dataset has
      :math:`n` rows and the query dataset has :math:`m` rows, the output
      is an SFrame with :math:`nm` rows.

    Examples
    --------
    Assume we've created the model from the example in the RecordLinker
    'create' function.

    >>> queries = graphlab.SFrame({'sqft': [986, 1320],
    ...                            'street': ['fremont', 'phiney'],
    ...                            'city': ['sea', 'seattle'],
    ...                            'state': ['WA', 'WA']})
    ...
    >>> model.link(queries, k=2, radius=5.)
    +-------------+-----------------+----------+------+
    | query_label | reference_label | distance | rank |
    +-------------+-----------------+----------+------+
    |      0      |        0        |   4.0    |  1   |
    |      0      |        2        |   5.0    |  2   |
    |      1      |        0        |   0.0    |  1   |
    +-------------+-----------------+----------+------+
    """
    _mt._get_metric_tracker().track(self.__module__ + '.link_records')

    ## The query data must be a non-empty SFrame.
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    ## Select the model's feature columns from the query data. The selection
    #  itself is not used afterwards.
    #  NOTE(review): presumably kept so a missing feature column fails here,
    #  at 'link' time; confirm against _toolkits_select_columns.
    _tkutl._toolkits_select_columns(dataset, self.get('features'))

    ## Look-up table from column name to column type.
    type_by_column = dict(zip(dataset.column_names(),
                              dataset.column_types()))

    ## Work on copies so neither the caller's SFrame nor the model's stored
    #  distance is modified.
    working_sf = _copy.copy(dataset)
    components = _copy.deepcopy(self.__proxy__['distance'])

    ## Clean and impute string data. *** Think about consolidating this and
    #  the next step into a feature transformer.***
    for feature in self.get('features'):
        if type_by_column[feature] != str:
            continue

        cleaned_name = '__clean.' + feature
        working_sf[cleaned_name] = working_sf[feature].fillna("")
        working_sf[cleaned_name] = working_sf[cleaned_name].apply(
            lambda s: _dmutl.cleanse_string(s), dtype=str)

        # Point every distance component at the cleaned column instead of
        # the raw one.
        for component in components:
            component[0] = [cleaned_name if name == feature else name
                            for name in component[0]]

    ## Convert strings to dicts and concatenate string features.
    working_sf, _ = _engineer_distance_features(working_sf, components)

    ## Query the nearest neighbor model
    nn_model = self.__proxy__['nearest_neighbors_model']
    return nn_model.query(working_sf, k=k, radius=radius, verbose=verbose)
def create(dataset, label=None, features=None, distance=None, method='auto',
           verbose=True, **kwargs):
    """
    Create a nearest neighbor model, which can be searched efficiently and
    quickly for the nearest neighbors of a query observation. If the `method`
    argument is specified as `auto`, the type of model is chosen automatically
    based on the type of data in `dataset`.

    .. warning::

        The 'dot_product' distance is deprecated and will be removed in future
        versions of GraphLab Create. Please use 'transformed_dot_product'
        distance instead, although note that this is more than a name change;
        it is a *different* transformation of the dot product of two vectors.
        Please see the distances module documentation for more details.

    Parameters
    ----------
    dataset : SFrame
        Reference data. If the features for each observation are numeric, they
        may be in separate columns of 'dataset' or a single column with lists
        of values. The features may also be in the form of a column of sparse
        vectors (i.e. dictionaries), with string keys and numeric values.

    label : string, optional
        Name of the SFrame column with row labels. If 'label' is not specified,
        row numbers are used to identify reference dataset rows when the model
        is queried.

    features : list[string], optional
        Name of the columns with features to use in computing distances between
        observations and the query points. 'None' (the default) indicates that
        all columns except the label should be used as features. Each column
        can be one of the following types:

        - *Numeric*: values of numeric type integer or float.

        - *Array*: list of numeric (integer or float) values. Each list element
          is treated as a separate variable in the model.

        - *Dictionary*: key-value pairs with numeric (integer or float) values.
          Each key indicates a separate variable in the model.

        - *String*: string values.

        Columns of type *list* are not supported. Convert them to array columns
        if all entries in the list are of numeric types.

        Please note: if a composite distance is also specified, this parameter
        is ignored.

    distance : string, function, or list[list], optional
        Function to measure the distance between any two input data rows. This
        may be one of three types:

        - *String*: the name of a standard distance function. One of
          'euclidean', 'squared_euclidean', 'manhattan', 'levenshtein',
          'jaccard', 'weighted_jaccard', 'cosine', 'dot_product' (deprecated),
          or 'transformed_dot_product'.

        - *Function*: a function handle from the
          :mod:`~graphlab.toolkits.distances` module.

        - *Composite distance*: the weighted sum of several standard distance
          functions applied to various features. This is specified as a list of
          distance components, each of which is itself a list containing three
          items:

          1. list or tuple of feature names (strings)

          2. standard distance name (string)

          3. scaling factor (int or float)

        For more information about GraphLab Create distance functions, please
        see the :py:mod:`~graphlab.toolkits.distances` module.

        If 'distance' is left unspecified or set to 'auto', a composite
        distance is constructed automatically based on feature types.

    method : {'auto', 'ball_tree', 'brute_force', 'lsh'}, optional
        Method for computing nearest neighbors. The options are:

        - *auto* (default): the method is chosen automatically, based on the
          type of data and the distance. If there is a single distance
          component, the distance is 'manhattan' or 'euclidean', the features
          are numeric or vectors of numeric values, and there are at most 200
          variables, then the 'ball_tree' method is used. Otherwise, the
          'brute_force' method is used.

        - *ball_tree*: use a tree structure to find the k-closest neighbors to
          each query point. The ball tree model is slower to construct than the
          brute force model, but queries are faster than linear time. This
          method is not applicable for the cosine and dot product distances.
          See `Liu, et al (2004)
          <http://papers.nips.cc/paper/2666-an-investigation-of-practical-approximate-nearest-neighbor-algorithms>`_
          for implementation details.

        - *brute_force*: compute the distance from a query point to all
          reference observations. There is no computation time for model
          creation with the brute force method (although the reference data is
          held in the model), but each query takes linear time.

        - *lsh*: use Locality Sensitive Hashing (LSH) to find approximate
          nearest neighbors efficiently. The LSH model supports 'euclidean',
          'squared_euclidean', 'manhattan', 'cosine', 'jaccard', 'dot_product'
          (deprecated), and 'transformed_dot_product' distances. Two options
          are provided for LSH -- ``num_tables`` and
          ``num_projections_per_table``. See the notes below for details.

        Whenever the (given or automatically constructed) distance has more
        than one component, the 'brute_force' method is used regardless of
        this setting.

    verbose : bool, optional
        If True, print progress updates and model details.

    **kwargs : optional
        Options for the distance function and query method.

        - *leaf_size*: for the ball tree method, the number of points in each
          leaf of the tree. The default is to use the max of 1,000 and
          n/(2^11), which ensures a maximum tree depth of 12.

        - *num_tables*: For the LSH method, the number of hash tables
          constructed. The default value is 20. We recommend choosing values
          from 10 to 30.

        - *num_projections_per_table*: For the LSH method, the number of
          projections/hash functions for each hash table. The default value is
          4 for 'jaccard' distance, 16 for 'cosine' distance and 8 for other
          distances. We recommend using number 2 ~ 6 for 'jaccard' distance,
          8 ~ 20 for 'cosine' distance and 4 ~ 12 for other distances.

    Returns
    -------
    out : NearestNeighborsModel
        A structure for efficiently computing the nearest neighbors in
        'dataset' of new query points.

    Raises
    ------
    _ToolkitError
        If an unrecognized keyword argument is passed, or if the only
        available feature is the row label column.

    TypeError
        If the 'ball_tree' method is combined with the 'cosine',
        'dot_product' or 'transformed_dot_product' distance, or if 'distance'
        is not a string, function handle, or composite distance.

    ValueError
        If the Levenshtein distance is applied to more than one column, or if
        'method' is not one of the supported names.

    See Also
    --------
    NearestNeighborsModel.query, graphlab.toolkits.distances

    Notes
    -----
    - Missing data is not allowed in the 'dataset' provided to this function.
      Please use the :func:`graphlab.SFrame.fillna` and
      :func:`graphlab.SFrame.dropna` utilities to handle missing data before
      creating a nearest neighbors model.

    - Missing keys in sparse vectors are assumed to have value 0.

    - The `composite_params` parameter was removed as of GraphLab Create
      version 1.5. The `distance` parameter now accepts either standard or
      composite distances. Please see the :mod:`~graphlab.toolkits.distances`
      module documentation for more information on composite distances.

    - If the features should be weighted equally in the distance calculations
      but are measured on different scales, it is important to standardize the
      features. One way to do this is to subtract the mean of each column and
      divide by the standard deviation.

    **Locality Sensitive Hashing (LSH)**

    There are several efficient nearest neighbors search algorithms that work
    well for data with low dimensions :math:`d` (approximately 50). However,
    most of the solutions suffer from either space or query time that is
    exponential in :math:`d`. For large :math:`d`, they often provide little,
    if any, improvement over the 'brute_force' method. This is a well-known
    consequence of the phenomenon called `The Curse of Dimensionality`.

    `Locality Sensitive Hashing (LSH)
    <https://en.wikipedia.org/wiki/Locality-sensitive_hashing>`_ is an approach
    that is designed to efficiently solve the *approximate* nearest neighbor
    search problem for high dimensional data. The key idea of LSH is to hash
    the data points using several hash functions, so that the probability of
    collision is much higher for data points which are close to each other
    than those which are far apart.

    An LSH family is a family of functions :math:`h` which map points from the
    metric space to a bucket, so that

    - if :math:`d(p, q) \\leq R`, then :math:`h(p) = h(q)` with at least
      probability :math:`p_1`.

    - if :math:`d(p, q) \\geq cR`, then :math:`h(p) = h(q)` with probability
      at most :math:`p_2`.

    LSH for efficient approximate nearest neighbor search:

    - We define a new family of hash functions :math:`g`, where each
      function :math:`g` is obtained by concatenating :math:`k` functions
      :math:`h_1, ..., h_k`, i.e., :math:`g(p)=[h_1(p),...,h_k(p)]`.
      The algorithm constructs :math:`L` hash tables, each of which
      corresponds to a different randomly chosen hash function :math:`g`.
      There are :math:`k \\cdot L` hash functions used in total.

    - In the preprocessing step, we hash all :math:`n` reference points into
      each of the :math:`L` hash tables.

    - Given a query point :math:`q`, the algorithm iterates over the
      :math:`L` hash functions :math:`g`. For each :math:`g` considered, it
      retrieves the data points that are hashed into the same bucket as q.
      These data points from all the :math:`L` hash tables are considered as
      candidates that are then re-ranked by their real distances with the
      query data.

    **Note** that the number of tables :math:`L` and the number of hash
    functions per table :math:`k` are two main parameters. They can be set
    using the options ``num_tables`` and ``num_projections_per_table``
    respectively.

    Hash functions for different distances:

    - ``euclidean`` and ``squared_euclidean``:
      :math:`h(q) = \\lfloor \\frac{a \\cdot q + b}{w} \\rfloor` where
      :math:`a` is a vector, of which the elements are independently
      sampled from normal distribution, and :math:`b` is a number
      uniformly sampled from :math:`[0, r]`. :math:`r` is a parameter for the
      bucket width. We set :math:`r` using the average all-pair `euclidean`
      distances from a small randomly sampled subset of the reference data.

    - ``manhattan``: The hash function of ``manhattan`` is similar to that of
      ``euclidean``. The only difference is that the elements of `a` are
      sampled from Cauchy distribution, instead of normal distribution.

    - ``cosine``: Random Projection is designed to approximate the cosine
      distance between vectors. The hash function is
      :math:`h(q) = sgn(a \\cdot q)`, where :math:`a` is randomly sampled
      normal unit vector.

    - ``jaccard``: We use a recently proposed method, one permutation hashing,
      by Shrivastava and Li. See the paper `[Shrivastava and Li, UAI 2014]
      <http://www.auai.org/uai2014/proceedings/individuals/225.pdf>`_ for
      details.

    - ``dot_product``: The reference data points are first transformed to
      fixed-norm vectors, and then the minimum ``dot_product`` distance search
      problem can be solved via finding the reference data with smallest
      ``cosine`` distances. See the paper `[Neyshabur and Srebro, ICML 2015]
      <http://jmlr.org/proceedings/papers/v37/neyshabur15.html>`_ for details.

    References
    ----------
    - `Wikipedia - nearest neighbor
      search <http://en.wikipedia.org/wiki/Nearest_neighbor_search>`_

    - `Wikipedia - ball tree <http://en.wikipedia.org/wiki/Ball_tree>`_

    - Ball tree implementation: Liu, T., et al. (2004) `An Investigation of
      Practical Approximate Nearest Neighbor Algorithms
      <http://papers.nips.cc/paper/2666-an-investigation-of-practical-approximate-nearest-neighbor-algorithms>`_.
      Advances in Neural Information Processing Systems pp. 825-832.

    - `Wikipedia - Jaccard distance
      <http://en.wikipedia.org/wiki/Jaccard_index>`_

    - Weighted Jaccard distance: Chierichetti, F., et al. (2010) `Finding the
      Jaccard Median
      <http://theory.stanford.edu/~sergei/papers/soda10-jaccard.pdf>`_.
      Proceedings of the Twenty-First Annual ACM-SIAM Symposium on Discrete
      Algorithms. Society for Industrial and Applied Mathematics.

    - `Wikipedia - Cosine distance
      <http://en.wikipedia.org/wiki/Cosine_similarity>`_

    - `Wikipedia - Levenshtein distance
      <http://en.wikipedia.org/wiki/Levenshtein_distance>`_

    - Locality Sensitive Hashing : Chapter 3 of the book `Mining Massive
      Datasets <http://infolab.stanford.edu/~ullman/mmds/ch3.pdf>`_.

    Examples
    --------
    Construct a nearest neighbors model with automatically determined method
    and distance:

    >>> sf = graphlab.SFrame({'X1': [0.98, 0.62, 0.11],
    ...                       'X2': [0.69, 0.58, 0.36],
    ...                       'str_feature': ['cat', 'dog', 'fossa']})
    >>> model = graphlab.nearest_neighbors.create(sf, features=['X1', 'X2'])

    For datasets with a large number of rows and up to about 100 variables, the
    ball tree method often leads to much faster queries.

    >>> model = graphlab.nearest_neighbors.create(sf, features=['X1', 'X2'],
    ...                                           method='ball_tree')

    Often the final determination of a neighbor is based on several distance
    computations over different sets of features. Each part of this composite
    distance may have a different relative weight.

    >>> my_dist = [[['X1', 'X2'], 'euclidean', 2.],
    ...            [['str_feature'], 'levenshtein', 3.]]
    ...
    >>> model = graphlab.nearest_neighbors.create(sf, distance=my_dist)
    """

    ## Validate the 'dataset' input
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    ## Clean the method options and create the options dictionary
    allowed_kwargs = ['leaf_size', 'num_tables', 'num_projections_per_table']
    _method_options = {}

    for k, v in kwargs.items():
        if k in allowed_kwargs:
            _method_options[k] = v
        else:
            raise _ToolkitError("'{}' is not a valid keyword argument".format(k) +
                                " for the nearest neighbors model. Please " +
                                "check for capitalization and other typos.")

    ## Exclude inappropriate combinations of method and distance. The chain of
    #  comparisons is kept lazy so the distance handles are only looked up
    #  when the ball tree method was actually requested.
    if method == 'ball_tree' and (
            distance == 'cosine'
            or distance == _graphlab.distances.cosine
            or distance == 'dot_product'
            or distance == _graphlab.distances.dot_product
            or distance == 'transformed_dot_product'
            or distance == _graphlab.distances.transformed_dot_product):
        raise TypeError("The ball tree method does not work with 'cosine', " +
                        "'dot_product', or 'transformed_dot_product' distance. " +
                        "Please use the 'brute_force' method for these distances.")

    ## Fill in the distance-dependent default for the LSH projection count
    #  unless the caller supplied one explicitly.
    if method == 'lsh' and 'num_projections_per_table' not in _method_options:
        if distance == 'jaccard' or distance == _graphlab.distances.jaccard:
            _method_options['num_projections_per_table'] = 4
        elif distance == 'cosine' or distance == _graphlab.distances.cosine:
            _method_options['num_projections_per_table'] = 16
        else:
            _method_options['num_projections_per_table'] = 8

    ## Initial validation and processing of the label
    _dataset, _label = _tkutl._validate_row_label(dataset, label=label)
    ref_labels = _dataset[_label]

    ## Determine the internal list of available feature names (may still include
    #  the row label name). A deep copy keeps the caller's list untouched.
    if features is None:
        _features = _dataset.column_names()
    else:
        _features = _copy.deepcopy(features)

    ## Check if there's only one feature and it's the same as the row label.
    #  This would also be trapped by the composite distance validation, but the
    #  error message is not very informative for the user.
    free_features = set(_features).difference([_label])
    if not free_features:
        raise _ToolkitError("The only available feature is the same as the " +
                            "row label column. Please specify features " +
                            "that are not also row labels.")

    ### Validate and preprocess the distance function
    ### ---------------------------------------------
    # - The form of the 'distance' controls how we interact with the 'features'
    #   parameter as well.
    # - At this point, the row label 'label' may still be in the list(s) of
    #   features.

    ## Convert any distance function input into a single composite distance.
    # distance is already a composite distance
    if isinstance(distance, list):
        distance = _copy.deepcopy(distance)

    # distance is a single name (except 'auto') or function handle.
    elif (hasattr(distance, '__call__') or
            (isinstance(distance, str) and not distance == 'auto')):
        distance = [[_features, distance, 1]]

    # distance is unspecified and needs to be constructed.
    elif distance is None or distance == 'auto':
        distance = _construct_auto_distance(_features,
                                            _dataset.column_names(),
                                            _dataset.column_types())

    else:
        raise TypeError("Input 'distance' not understood. The 'distance' " +
                        "argument must be a string, function handle, or " +
                        "composite distance.")

    ## Basic composite distance validation, remove the row label from all
    #  feature lists, and convert string distance names into distance functions.
    distance = _scrub_composite_distance_features(distance, [_label])
    distance = _convert_distance_names_to_functions(distance)
    _validate_composite_distance(distance)

    ## Raise an error if any Levenshtein component spans more than one column.
    for d in distance:
        feature_names, dist, _ = d

        if (len(feature_names) > 1) and (dist == _graphlab.distances.levenshtein):
            raise ValueError("Levenshtein distance cannot be used with multiple " +
                             "columns. Please concatenate strings into a single " +
                             "column before creating the nearest neighbors model.")

    ## Get the union of feature names and make a clean dataset.
    clean_features = _get_composite_distance_features(distance)
    sf_clean = _tkutl._toolkits_select_columns(_dataset, clean_features)

    ## Decide which method to use
    ## - If more than one distance component (specified either directly or
    #  generated automatically because distance set to 'auto'), then do brute
    #  force.
    if len(distance) > 1:
        _method = 'brute_force'

        if method != 'brute_force' and verbose is True:
            # Single-argument call form prints identically on Python 2 and 3.
            print("Defaulting to brute force instead of ball tree because " +
                  "there are multiple distance components.")

    else:
        if method == 'auto':

            # get the total number of variables. Assume the number of elements in
            # array type columns does not change
            num_variables = sum(len(x) if hasattr(x, '__iter__') else 1
                                for x in sf_clean[0].values())

            # flag if all the features in the single composite are of numeric
            # type.
            numeric_type_flag = all(x in (int, float, list, array.array)
                                    for x in sf_clean.column_types())

            ## Conditions necessary for ball tree to work and be worth it
            if ((distance[0][1] in ['euclidean',
                                    'manhattan',
                                    _graphlab.distances.euclidean,
                                    _graphlab.distances.manhattan])
                    and numeric_type_flag is True
                    and num_variables <= 200):
                _method = 'ball_tree'

            else:
                _method = 'brute_force'

        else:
            _method = method

    ## Pick the right model name for the method
    if _method == 'ball_tree':
        model_name = 'nearest_neighbors_ball_tree'
        _mt._get_metric_tracker().track('toolkit.nearest_neighbors_balltree.create')

    elif _method == 'brute_force':
        model_name = 'nearest_neighbors_brute_force'
        _mt._get_metric_tracker().track('toolkit.nearest_neighbors_brute.create')

    elif _method == 'lsh':
        model_name = 'nearest_neighbors_lsh'
        _mt._get_metric_tracker().track('toolkit.nearest_neighbors_lsh.create')

    else:
        raise ValueError("Method must be 'auto', 'ball_tree', 'brute_force', " +
                         "or 'lsh'.")

    ## Package the model options
    opts = {}
    opts.update(_method_options)
    opts.update(
        {'model_name': model_name,
         'ref_labels': ref_labels,
         'label': label,
         'sf_features': sf_clean,
         'composite_params': distance})

    ## Construct the nearest neighbors model
    if not verbose:
        _mt.main.get_client().set_log_progress(False)

    # Restore progress logging even when training fails, so that one failed
    # quiet call does not silence every later toolkit call.
    try:
        result = _graphlab.extensions._nearest_neighbors.train(opts)
    finally:
        _mt.main.get_client().set_log_progress(True)

    model_proxy = result['model']
    model = NearestNeighborsModel(model_proxy)

    return model