def _combine(task):
    '''
    Task body that merges the per-parameter-set results into one SFrame per
    output, tagging every row with the parameters that produced it.

    Parameters
    ----------
    task : object
        Must provide ``get_outputs()``, an ``outputs`` mapping, and a
        ``params`` mapping whose ``_COMBINE_PARAMETER_NAME`` entry is an
        iterable of ``(params, path)`` pairs.
    '''
    # Every declared output starts out as an empty SFrame so that appending
    # below always has something to append to.
    for name in task.get_outputs():
        task.outputs[name] = _SFrame()

    for params, path in task.params[_COMBINE_PARAMETER_NAME]:
        for name in task.get_outputs():
            try:
                partial = _SFrame(_path_join(path, name))
            except IOError:
                # Nothing could be loaded for this combination; log and move on.
                _log.info("No output for %s with parameters: %s "
                          % (name, str(params)))
            else:
                # Tag each row with its parameters, then extend the
                # accumulated result for this output.
                partial['parameters'] = _SArray.from_const(params, len(partial))
                partial.__materialize__()
                task.outputs[name] = task.outputs[name].append(partial)
def get_default_options():
    """
    Return default options information for the similarity search toolkit.

    Returns
    -------
    out : SFrame
        One row per parameter, with columns for the name, default value,
        lower and upper bounds, description, and parameter type.
    """
    names = ['method', 'feature_model', 'verbose']
    defaults = ['lsh', 'auto', 'True']
    lower_bounds = [None, None, 0]
    upper_bounds = [None, None, 1]
    descriptions = [
        'Method for searching reference data',
        'Trained model for extracting features from raw data objects',
        'Whether progress output is printed']
    parameter_types = ['string', 'model', 'boolean']

    # Keys are listed in the same order as before so the resulting column
    # layout is unchanged.
    return _SFrame({'name': names,
                    'default_value': defaults,
                    'lower_bound': lower_bounds,
                    'upper_bound': upper_bounds,
                    'description': descriptions,
                    'parameter_type': parameter_types})
def get_default_options_for_model(output_type = 'sframe'):
    """
    Get the default options for the toolkit
    :class:`~graphlab.{module_name}.{python_class_name}`.

    Parameters
    ----------
    output_type : str, optional
        Format of the returned value.

        - `sframe`: A table describing each option used in the model.
        - `json`: A list of option dictionaries.

        | Each dictionary/row in the JSON/SFrame object describes the
          following parameters of the given model.

    +------------------+-------------------------------------------------------+
    |      Name        |                  Description                          |
    +==================+=======================================================+
    | name             | Name of the option used in the model.                 |
    +------------------+-------------------------------------------------------+
    | description      | A detailed description of the option used.            |
    +------------------+-------------------------------------------------------+
    | type             | Option type (REAL, BOOL, INTEGER or CATEGORICAL)      |
    +------------------+-------------------------------------------------------+
    | default_value    | The default value for the option.                     |
    +------------------+-------------------------------------------------------+
    | possible_values  | List of acceptable values (CATEGORICAL only)          |
    +------------------+-------------------------------------------------------+
    | lower_bound      | Smallest acceptable value for this option (REAL only) |
    +------------------+-------------------------------------------------------+
    | upper_bound      | Largest acceptable value for this option (REAL only)  |
    +------------------+-------------------------------------------------------+

    Returns
    -------
    out : JSON/SFrame

    See Also
    --------
    graphlab.{module_name}.{python_class_name}.get_current_options

    Examples
    --------
    .. sourcecode:: python

      >>> import graphlab

      # Returns an output as an SFrame
      >>> out_sframe = graphlab.{module_name}.get_default_options()

      # Returns the output as a JSON
      >>> out_json = graphlab.{module_name}.get_default_options('json')
    """
    # NOTE(review): the brace placeholders in the docstring above look like
    # they are substituted by a docstring-formatting step elsewhere; keep
    # them intact and avoid adding other braces there - confirm with caller.
    _mt._get_metric_tracker().track(
        'toolkit.%s.get_default_options' % module_name)
    response = _gl.extensions._toolkits_get_default_options(
        unity_server_model_name)

    def _parse_int(text):
        # Integers that only fit in a Python 2 ``long`` are handed back as
        # floats; everything else stays an int.
        value = int(text)
        if type(value) is long:
            return float(text)
        return value

    # Each entry of the response is a JSON-encoded string; decode in place.
    for k in response.keys():
        response[k] = json.loads(response[k], parse_int=_parse_int)

    if output_type == 'json':
        return response

    # One row per option, then flatten the nested option dictionary by
    # unpacking column 'X1' twice.
    json_list = [{'name': k, '': v} for k, v in response.items()]
    unpacked_once = _SFrame(json_list).unpack('X1', column_name_prefix='')
    return unpacked_once.unpack('X1', column_name_prefix='')
from array import array as _array import json from graphlab.data_structures.sframe import SArray as _SArray from graphlab.data_structures.sframe import SFrame as _SFrame from graphlab.data_structures.sgraph import SGraph as _SGraph from graphlab.data_structures.sgraph import Vertex as _Vertex from graphlab.data_structures.sgraph import Edge as _Edge from graphlab.cython.cy_sarray import UnitySArrayProxy from graphlab.cython.cy_sframe import UnitySFrameProxy from graphlab.cython.cy_graph import UnityGraphProxy from graphlab.toolkits._main import ToolkitError import logging as _logging _proxy_map = {UnitySFrameProxy: (lambda x: _SFrame(_proxy=x)), UnitySArrayProxy: (lambda x: _SArray(_proxy=x)), UnityGraphProxy: (lambda x: _SGraph(_proxy=x))} def _add_docstring(format_dict): """ Format a doc-string on the fly. @arg format_dict: A dictionary to format the doc-strings Example: @add_docstring({'context': __doc_string_context}) def predict(x): ''' {context} >> model.predict(data) '''
def get_topics(self, topic_ids=None, num_words=5, cdf_cutoff=1.0, output_type='topic_probabilities'):
    """
    Get the words associated with a given topic. The score column is the
    probability of choosing that word given that you have chosen a
    particular topic.

    Parameters
    ----------
    topic_ids : list of int, optional
        The topics to retrieve words. Topic ids are zero-based.
        Throws an error if any id is negative or greater than or equal to
        m['num_topics']. Defaults to all topics.

    num_words : int, optional
        The number of words to show.

    cdf_cutoff : float, optional
        Allows one to only show the most probable words whose cumulative
        probability is below this cutoff. For example if there exist
        three words where

        .. math::
           p(word_1 | topic_k) = .1

           p(word_2 | topic_k) = .2

           p(word_3 | topic_k) = .05

        then setting :math:`cdf_{cutoff}=.3` would return only
        :math:`word_1` and :math:`word_2` since
        :math:`p(word_1 | topic_k) + p(word_2 | topic_k) <= cdf_{cutoff}`

    output_type : {'topic_probabilities' | 'topic_words'}, optional
        Determine the type of desired output. See below.

    Returns
    -------
    out : SFrame
        If output_type is 'topic_probabilities', then the returned value is
        an SFrame with a column of words ranked by a column of scores for
        each topic. Otherwise, the returned value is an SFrame with a
        single column 'words', where each element is a list of the most
        probable words for a topic.

    Raises
    ------
    AssertionError
        If ``topic_ids`` is provided but is not a list.
    ValueError
        If ``topic_ids`` contains strings, or contains a value outside
        ``[0, num_topics)``.

    Examples
    --------
    Get the highest ranked words for all topics.

    >>> docs = graphlab.SArray('http://s3.amazonaws.com/GraphLab-Datasets/nips-text')
    >>> m = graphlab.topic_model.create(docs, num_iterations=50)
    >>> m.get_topics()
    +-------+----------+-----------------+
    | topic |   word   |      score      |
    +-------+----------+-----------------+
    |   0   |   cell   |  0.028974400831 |
    |   0   |  input   | 0.0259470208503 |
    |   0   |  image   | 0.0215721599763 |
    |   0   |  visual  | 0.0173635081992 |
    |   0   |  object  | 0.0172447874156 |
    |   1   | function | 0.0482834508265 |
    |   1   |  input   | 0.0456270024091 |
    |   1   |  point   | 0.0302662839454 |
    |   1   |  result  | 0.0239474934631 |
    |   1   | problem  | 0.0231750116011 |
    |  ...  |   ...    |       ...       |
    +-------+----------+-----------------+

    Get the highest ranked words for topics 0 and 1 and show 15 words per
    topic.

    >>> m.get_topics([0, 1], num_words=15)

    If one wants to instead just get the top words per topic, one may
    change the format of the output as follows.

    >>> topics = m.get_topics(output_type='topic_words')
    """
    _mt._get_metric_tracker().track('toolkit.text.topic_model.get_topics')
    _check_categorical_option_type('output_type', output_type,
                                   ['topic_probabilities', 'topic_words'])

    if topic_ids is None:
        # list(...) keeps the isinstance check below valid on interpreters
        # where range() is lazy rather than a list.
        topic_ids = list(range(self.get('num_topics')))

    # Kept as an assert so callers that rely on AssertionError still work.
    assert isinstance(topic_ids, list), \
        "The provided topic_ids is not a list."

    if any(isinstance(x, str) for x in topic_ids):
        raise ValueError(
            "Only integer topic_ids can be used at this point in time.")

    # Check every requested id. The previous check had no loop of its own
    # and only ever looked at the last element, via a leaked loop variable.
    num_topics = self['num_topics']
    if not all(0 <= x < num_topics for x in topic_ids):
        raise ValueError(
            "Topic id values must be non-negative and less than the " +
            "number of topics used to fit the model.")

    opts = {'model': self.__proxy__,
            'topic_ids': topic_ids,
            'num_words': num_words,
            'cdf_cutoff': cdf_cutoff}
    response = _graphlab.toolkits._main.run('text_topicmodel_get_topic',
                                            opts)
    ret = _map_unity_proxy_to_object(response['top_words'])

    if output_type != 'topic_probabilities':
        # Collapse the (word, score) rows into one dictionary per topic and
        # keep only the words.
        sa = ret.unstack(['word', 'score'], 'word')['word'].dict_keys()
        ret = _SFrame({'words': sa})

    return ret
def get(self, field):
    """
    Look up a single field of the model. The queryable fields are listed
    below, and can also be obtained with the
    :py:func:`~TopicModel.list_fields` method.

    +-----------------------+----------------------------------------------+
    |      Field            | Description                                  |
    +=======================+==============================================+
    | topics                | An SFrame containing a column with the unique|
    |                       | words observed during training, and a column |
    |                       | of arrays containing the probability values  |
    |                       | for each word given each of the topics.      |
    +-----------------------+----------------------------------------------+
    | vocabulary            | An SArray containing the words used. This is |
    |                       | same as the vocabulary column in the topics  |
    |                       | field above.                                 |
    +-----------------------+----------------------------------------------+

    Parameters
    ----------
    field : string
        Name of the field to be retrieved.

    Returns
    -------
    out
        Value of the requested field. 'vocabulary' is wrapped in an SArray
        and 'topics' in an SFrame; any other field is returned as received.

    See Also
    --------
    list_fields

    Examples
    --------
    >>> docs = graphlab.SArray('http://s3.amazonaws.com/GraphLab-Datasets/nips-text')
    >>> m = graphlab.topic_model.create(docs)
    >>> m.get('topics')
    +--------------------------------+------------+
    |      topic_probabilities       | vocabulary |
    +--------------------------------+------------+
    | array('d', [0.000514752462 ... |  limited   |
    | array('d', [6.120718939647 ... |  consider  |
    | array('d', [0.000337251613 ... | represent  |
    |              ...               |    ...     |
    +--------------------------------+------------+

    You may also do m['topics'].
    """
    _mt._get_metric_tracker().track('toolkit.text.topic_model.get')

    request = {'model': self.__proxy__, 'field': field}
    reply = _graphlab.toolkits._main.run("text_topicmodel_get_value", request)
    value = reply['value']

    # Two fields come back as proxies and need wrapping.
    if field == 'vocabulary':
        return _SArray(None, _proxy=value)
    if field == 'topics':
        return _SFrame(None, _proxy=value)
    return value
""" import json from graphlab.data_structures.sframe import SArray as _SArray from graphlab.data_structures.sframe import SFrame as _SFrame from graphlab.data_structures.sgraph import SGraph as _SGraph from graphlab.data_structures.sgraph import Vertex as _Vertex from graphlab.data_structures.sgraph import Edge as _Edge from graphlab.cython.cy_sarray import UnitySArrayProxy from graphlab.cython.cy_sframe import UnitySFrameProxy from graphlab.cython.cy_graph import UnityGraphProxy from graphlab.toolkits._main import ToolkitError import logging as _logging _proxy_map = {UnitySFrameProxy: (lambda x: _SFrame(_proxy=x)), UnitySArrayProxy: (lambda x: _SArray(_proxy=x)), UnityGraphProxy: (lambda x: _SGraph(_proxy=x))} def _add_docstring(format_dict): """ Format a doc-string on the fly. @arg format_dict: A dictionary to format the doc-strings Example: @add_docstring({'context': __doc_string_context}) def predict(x): ''' {context} >> model.predict(data) '''
def similarity_graph(self, k=5, radius=None, include_self_edges=False, output_type='SGraph', verbose=True):
    """
    Build the similarity graph of the reference dataset already stored in
    the model. Conceptually this resembles running `query` with the
    reference set as the query set, but it needs no second dataset and,
    by default, leaves out self-edges.

    Parameters
    ----------
    k : int, optional
        Maximum number of neighbors to return for each point in the
        dataset. ``None`` deactivates the constraint, so that all
        neighbors within ``radius`` of a given point are returned.

    radius : float, optional
        For a given point, only neighbors within this distance are
        returned. With the default of ``None`` the ``k`` nearest neighbors
        are returned for each point, regardless of distance.

    include_self_edges : bool, optional
        For most distance functions, each point in the model's reference
        dataset is its own nearest neighbor. If this parameter is False,
        that result is ignored and the nearest neighbors are returned
        *excluding* the point itself.

    output_type : {'SGraph', 'SFrame'}, optional
        By default the result is an SGraph in which each reference point is
        a vertex and an edge A -> B indicates that vertex B is a nearest
        neighbor of vertex A. With 'SFrame', the result has the same form
        as the output of the 'query' method: an SFrame with the query
        label, reference label, distance between the two points, and rank
        of the neighbor.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : SFrame or SGraph
        The type depends on the 'output_type' parameter; see above.

    Notes
    -----
    - If both ``k`` and ``radius`` are ``None``, each data point is matched
      to the entire dataset. If the reference dataset has :math:`n` rows,
      the output is an SFrame with :math:`n^2` rows (or an SGraph with
      :math:`n^2` edges).

    - For models created with the 'lsh' method, the output similarity graph
      may have fewer vertices than there are data points in the original
      reference set. Because LSH is an approximate method, a query point
      may have fewer than 'k' neighbors. If LSH returns no neighbors at all
      for a query and self-edges are excluded, the query point is omitted
      from the results.

    Examples
    --------
    First construct an SFrame and create a nearest neighbors model:

    >>> sf = graphlab.SFrame({'x1': [0.98, 0.62, 0.11],
    ...                       'x2': [0.69, 0.58, 0.36]})
    ...
    >>> model = graphlab.nearest_neighbors.create(sf, distance='euclidean')

    Unlike the ``query`` method, there is no need for a second dataset with
    ``similarity_graph``.

    >>> g = model.similarity_graph(k=1)  # an SGraph
    >>> g.show()
    >>> g.edges
    +----------+----------+----------------+------+
    | __src_id | __dst_id |    distance    | rank |
    +----------+----------+----------------+------+
    |    0     |    1     | 0.376430604494 |  1   |
    |    2     |    1     | 0.55542776308  |  1   |
    |    1     |    0     | 0.376430604494 |  1   |
    +----------+----------+----------------+------+
    """
    _mt._get_metric_tracker().track(
        'toolkit.nearest_neighbors.similarity_graph')

    ## Validate 'k', or swap None for the sentinel value passed to the
    #  toolkit.
    if k is None:
        k = -1
    else:
        if not isinstance(k, int):
            raise ValueError("Input 'k' must be an integer.")
        if k <= 0:
            raise ValueError("Input 'k' must be larger than 0.")

    ## Same treatment for 'radius'.
    if radius is None:
        radius = -1.0
    else:
        if not isinstance(radius, (int, float)):
            raise ValueError("Input 'radius' must be an integer or float.")
        if radius < 0:
            raise ValueError("Input 'radius' must be non-negative.")

    opts = {'model': self.__proxy__,
            'model_name': self.__name__,
            'k': k,
            'radius': radius,
            'include_self_edges': include_self_edges}
    result = _graphlab.toolkits._main.run(
        '_nearest_neighbors.similarity_graph', opts, verbose)
    neighbors = _SFrame(None, _proxy=result['neighbors'])

    if output_type == "SFrame":
        return neighbors

    # Any other output_type yields the graph form.
    return _SGraph(edges=neighbors, src_field='query_label',
                   dst_field='reference_label')
def get(self, field):
    """
    Fetch the value stored under a named field of the model. The queryable
    fields are listed below, and can also be obtained with the
    :py:func:`~TopicModel.list_fields` method.

    +-----------------------+----------------------------------------------+
    |      Field            | Description                                  |
    +=======================+==============================================+
    | topics                | An SFrame containing a column with the unique|
    |                       | words observed during training, and a column |
    |                       | of arrays containing the probability values  |
    |                       | for each word given each of the topics.      |
    +-----------------------+----------------------------------------------+
    | vocabulary            | An SArray containing the words used. This is |
    |                       | same as the vocabulary column in the topics  |
    |                       | field above.                                 |
    +-----------------------+----------------------------------------------+

    Parameters
    ----------
    field : string
        Name of the field to be retrieved.

    Returns
    -------
    out
        Value of the requested field. 'vocabulary' comes back as an SArray
        and 'topics' as an SFrame; other fields are returned unchanged.

    See Also
    --------
    list_fields

    Examples
    --------
    >>> docs = graphlab.SArray('https://static.turi.com/datasets/nips-text')
    >>> m = graphlab.topic_model.create(docs)
    >>> m.get('topics')
    +--------------------------------+------------+
    |      topic_probabilities       | vocabulary |
    +--------------------------------+------------+
    | array('d', [0.000514752462 ... |  limited   |
    | array('d', [6.120718939647 ... |  consider  |
    | array('d', [0.000337251613 ... | represent  |
    |              ...               |    ...     |
    +--------------------------------+------------+

    You may also do m['topics'].
    """
    query_opts = {'model': self.__proxy__, 'field': field}
    payload = _graphlab.toolkits._main.run("text_topicmodel_get_value",
                                           query_opts)['value']

    if field == 'topics':
        return _SFrame(None, _proxy=payload)
    if field == 'vocabulary':
        return _SArray(None, _proxy=payload)
    return payload
def get(self, field):
    """
    Return the value of a given field. The queryable fields are listed
    below, and can also be obtained with the ``list_fields`` method.

    +-----------------------+----------------------------------------------+
    |      Field            | Description                                  |
    +=======================+==============================================+
    | batch_size            | Number of randomly chosen examples to use in |
    |                       | each training iteration.                     |
    +-----------------------+----------------------------------------------+
    | cluster_id            | Cluster assignment for each data point and   |
    |                       | Euclidean distance to the cluster center     |
    +-----------------------+----------------------------------------------+
    | cluster_info          | Cluster centers, sum of squared Euclidean    |
    |                       | distances from each cluster member to the    |
    |                       | assigned center, and the number of data      |
    |                       | points belonging to the cluster              |
    +-----------------------+----------------------------------------------+
    | features              | Names of feature columns                     |
    +-----------------------+----------------------------------------------+
    | max_iterations        | Maximum number of iterations to perform      |
    +-----------------------+----------------------------------------------+
    | method                | Algorithm used to train the model.           |
    +-----------------------+----------------------------------------------+
    | num_clusters          | Number of clusters                           |
    +-----------------------+----------------------------------------------+
    | num_examples          | Number of examples in the dataset            |
    +-----------------------+----------------------------------------------+
    | num_features          | Number of feature columns used               |
    +-----------------------+----------------------------------------------+
    | num_unpacked_features | Number of features unpacked from the         |
    |                       | feature columns                              |
    +-----------------------+----------------------------------------------+
    | training_iterations   | Total number of iterations performed         |
    +-----------------------+----------------------------------------------+
    | training_time         | Total time taken to cluster the data         |
    +-----------------------+----------------------------------------------+
    | unpacked_features     | Names of features unpacked from the          |
    |                       | feature columns                              |
    +-----------------------+----------------------------------------------+
    | verbose               | True if model training should print progress |
    +-----------------------+----------------------------------------------+

    Parameters
    ----------
    field : str
        The name of the field to query.

    Returns
    -------
    out
        Value of the requested field. 'cluster_id' and 'cluster_info' are
        returned as SFrames.

    See Also
    --------
    list_fields

    Examples
    --------
    >>> model.get("cluster_info")
             d1        d2        d3        d4  sum_squared_distance  size
    0 -0.777484  1.048897  0.523926  0.487775              2.459470     4
    1  0.844906 -0.613151 -0.088785 -0.212908              3.651614     5
    2 -1.114592 -1.129836 -1.651781 -0.886557              0.000000     1

    [3 rows x 6 columns]
    """
    request = {'model': self.__proxy__,
               'model_name': self.__name__,
               'field': field}
    value = _gl.toolkits._main.run('kmeans_get_value', request)['value']

    # These two fields arrive as a unity SFrame proxy and are cast to SFrame.
    if field in ('cluster_id', 'cluster_info'):
        return _SFrame(None, _proxy=value)
    return value
def query(self, dataset, label=None, k=5, radius=None, verbose=True):
    """
    For each row of the input 'dataset', retrieve the nearest neighbors
    from the model's stored data. The query dataset does not need to be
    the same as the reference data stored in the model.

    Parameters
    ----------
    dataset : SFrame
        Query data. Must contain columns with the same names and types as
        the features used to train the model. Additional columns are
        allowed, but ignored. Please see the nearest neighbors
        :func:`~graphlab.nearest_neighbors.create` documentation for more
        detail on allowable data types.

    label : str, optional
        Name of the query SFrame column with row labels. If 'label' is not
        specified, row numbers are used to identify query dataset rows in
        the output SFrame.

    k : int, optional
        Number of nearest neighbors to return from the reference set for
        each query observation. The default is 5 neighbors, but setting it
        to ``None`` will return all neighbors within ``radius`` of the
        query point.

    radius : float, optional
        Only neighbors whose distance to a query point is smaller than this
        value are returned. The default is ``None``, in which case the
        ``k`` nearest neighbors are returned for each query point,
        regardless of distance.

    verbose: bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : SFrame
        An SFrame with the k-nearest neighbors of each query observation.
        The result contains four columns: the label of the query
        observation, the label of the nearby reference observation, the
        distance between the two, and the rank of the reference
        observation among the query's k-nearest neighbors.

    See Also
    --------
    similarity_graph

    Notes
    -----
    - The `dataset` input to this method *can* have missing values (in
      contrast to the reference dataset used to create the nearest
      neighbors model). Missing numeric values are imputed to be the mean
      of the corresponding feature in the reference dataset, and missing
      strings are imputed to be empty strings.

    - If both ``k`` and ``radius`` are set to ``None``, each query point
      returns all of the reference set. If the reference dataset has
      :math:`n` rows and the query dataset has :math:`m` rows, the output
      is an SFrame with :math:`nm` rows.

    - For models created with the 'lsh' method, the query results may have
      fewer query labels than input query points. Because LSH is an
      approximate method, a query point may have fewer than 'k' neighbors.
      If LSH returns no neighbors at all for a query, the query point is
      omitted from the results.

    Examples
    --------
    First construct a toy SFrame and create a nearest neighbors model:

    >>> sf = graphlab.SFrame({'label': range(3),
    ...                       'feature1': [0.98, 0.62, 0.11],
    ...                       'feature2': [0.69, 0.58, 0.36]})
    >>> model = graphlab.nearest_neighbors.create(sf, 'label')

    A new SFrame contains query observations with same schema as the
    reference SFrame. This SFrame is passed to the ``query`` method.

    >>> queries = graphlab.SFrame({'label': range(3),
    ...                            'feature1': [0.05, 0.61, 0.99],
    ...                            'feature2': [0.06, 0.97, 0.86]})
    >>> model.query(queries, 'label', k=2)
    +-------------+-----------------+----------------+------+
    | query_label | reference_label |    distance    | rank |
    +-------------+-----------------+----------------+------+
    |      0      |        2        | 0.305941170816 |  1   |
    |      0      |        1        | 0.771556867638 |  2   |
    |      1      |        1        | 0.390128184063 |  1   |
    |      1      |        0        | 0.464004310325 |  2   |
    |      2      |        0        | 0.170293863659 |  1   |
    |      2      |        1        | 0.464004310325 |  2   |
    +-------------+-----------------+----------------+------+
    """
    _mt._get_metric_tracker().track('toolkit.nearest_neighbors.query')

    ## The query data must be a non-empty SFrame.
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    ## Pull the model's feature columns out of the query data.
    ref_features = self.get('features')
    sf_features = _tkutl._toolkits_select_columns(dataset, ref_features)

    ## Work out the row labels: either the named column or row numbers.
    if label is not None:
        if label not in dataset.column_names():
            raise ValueError(
                "Input 'label' must be a string matching the name of a "
                "column in the reference SFrame 'dataset'.")

        label_type = dataset[label].dtype()
        if label_type != str and label_type != int:
            raise TypeError(
                "The label column must contain integers or strings.")

        if label in ref_features:
            raise ValueError(
                "The label column cannot be one of the features.")

        query_labels = dataset[label]
    else:
        query_labels = _graphlab.SArray.from_sequence(len(dataset))

    ## Validate 'k' and 'radius'; None becomes the sentinel value passed
    #  to the toolkit.
    if k is None:
        k = -1
    else:
        if not isinstance(k, int):
            raise ValueError("Input 'k' must be an integer.")
        if k <= 0:
            raise ValueError("Input 'k' must be larger than 0.")

    if radius is None:
        radius = -1.0
    else:
        if not isinstance(radius, (int, float)):
            raise ValueError("Input 'radius' must be an integer or float.")
        if radius < 0:
            raise ValueError("Input 'radius' must be non-negative.")

    opts = {'model': self.__proxy__,
            'model_name': self.__name__,
            'features': sf_features,
            'query_labels': query_labels,
            'k': k,
            'radius': radius}
    result = _graphlab.toolkits._main.run('_nearest_neighbors.query', opts,
                                          verbose)
    return _SFrame(None, _proxy=result['neighbors'])
def similarity_graph(self, k=5, radius=None, include_self_edges=False, output_type='SGraph', verbose=True):
    """
    Construct the similarity graph on the reference dataset, which is
    already stored in the model. This is conceptually close to calling
    `query` with the reference set, but no second dataset is needed and
    self-edges are left out unless requested.

    Parameters
    ----------
    k : int, optional
        Maximum number of neighbors to return for each point in the
        dataset. Setting this to ``None`` deactivates the constraint, so
        that all neighbors are returned within ``radius`` of a given point.

    radius : float, optional
        For a given point, only neighbors within this distance are
        returned. The default is ``None``, in which case the ``k`` nearest
        neighbors are returned for each query point, regardless of
        distance.

    include_self_edges : bool, optional
        For most distance functions, each point in the model's reference
        dataset is its own nearest neighbor. If this parameter is set to
        False, this result is ignored, and the nearest neighbors are
        returned *excluding* the point itself.

    output_type : {'SGraph', 'SFrame'}, optional
        By default, the results are returned in the form of an SGraph,
        where each point in the reference dataset is a vertex and an edge
        A -> B indicates that vertex B is a nearest neighbor of vertex A.
        If 'output_type' is set to 'SFrame', the output is in the same form
        as the results of the 'query' method: an SFrame with columns
        indicating the query label, reference label, distance between the
        two points, and the rank of the neighbor.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : SFrame or SGraph
        The type of the output object depends on the 'output_type'
        parameter. See the parameter description for more detail.

    Notes
    -----
    - If both ``k`` and ``radius`` are set to ``None``, each data point is
      matched to the entire dataset. If the reference dataset has
      :math:`n` rows, the output is an SFrame with :math:`n^2` rows (or an
      SGraph with :math:`n^2` edges).

    - For models created with the 'lsh' method, the output similarity graph
      may have fewer vertices than there are data points in the original
      reference set. Because LSH is an approximate method, a query point
      may have fewer than 'k' neighbors. If LSH returns no neighbors at all
      for a query and self-edges are excluded, the query point is omitted
      from the results.

    Examples
    --------
    First construct an SFrame and create a nearest neighbors model:

    >>> sf = graphlab.SFrame({'x1': [0.98, 0.62, 0.11],
    ...                       'x2': [0.69, 0.58, 0.36]})
    ...
    >>> model = graphlab.nearest_neighbors.create(sf, distance='euclidean')

    Unlike the ``query`` method, there is no need for a second dataset with
    ``similarity_graph``.

    >>> g = model.similarity_graph(k=1)  # an SGraph
    >>> g.show()
    >>> g.edges
    +----------+----------+----------------+------+
    | __src_id | __dst_id |    distance    | rank |
    +----------+----------+----------------+------+
    |    0     |    1     | 0.376430604494 |  1   |
    |    2     |    1     | 0.55542776308  |  1   |
    |    1     |    0     | 0.376430604494 |  1   |
    +----------+----------+----------------+------+
    """
    _mt._get_metric_tracker().track(
        'toolkit.nearest_neighbors.similarity_graph')

    ## Check the neighborhood parameters; a value of None is replaced by
    #  the sentinel value passed to the toolkit.
    if k is None:
        k = -1
    else:
        if not isinstance(k, int):
            raise ValueError("Input 'k' must be an integer.")
        if k <= 0:
            raise ValueError("Input 'k' must be larger than 0.")

    if radius is None:
        radius = -1.0
    else:
        if not isinstance(radius, (int, float)):
            raise ValueError("Input 'radius' must be an integer or float.")
        if radius < 0:
            raise ValueError("Input 'radius' must be non-negative.")

    opts = {
        'model': self.__proxy__,
        'model_name': self.__name__,
        'k': k,
        'radius': radius,
        'include_self_edges': include_self_edges
    }
    toolkit_result = _graphlab.toolkits._main.run(
        '_nearest_neighbors.similarity_graph', opts, verbose)
    edge_table = _SFrame(None, _proxy=toolkit_result['neighbors'])

    # Only the exact string "SFrame" selects the tabular form.
    if output_type != "SFrame":
        return _SGraph(edges=edge_table,
                       src_field='query_label',
                       dst_field='reference_label')
    return edge_table
def query(self, dataset, label=None, k=5, radius=None, verbose=True):
    """
    Retrieve the nearest neighbors from the reference set for each element
    of the query set. The query SFrame must include columns with the same
    names as the feature columns used to create the NearestNeighborsModel.

    Parameters
    ----------
    dataset : SFrame
        Query data. Must contain columns with the same names and types as
        the features used to train the model. Additional columns are
        allowed, but ignored. Please see the nearest neighbors
        :func:`~graphlab.nearest_neighbors.create` documentation for more
        detail on allowable data types.

    label : string, optional
        Name of the query SFrame column with row labels. If 'label' is not
        specified, row numbers are used to identify query dataset rows in
        the output SFrame.

    k : int, optional
        Number of nearest neighbors to return from the reference set for
        each query observation. The default is 5 neighbors, but setting it
        to ``None`` will return all neighbors within ``radius`` of the
        query point.

    radius : float, optional
        Only neighbors whose distance to a query point is smaller than this
        value are returned. The default is ``None``, in which case the
        ``k`` nearest neighbors are returned for each query point,
        regardless of distance.

    verbose: bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : SFrame
        An SFrame with the k-nearest neighbors of each query observation.
        The result contains four columns: the first is the label of the
        query observation, the second is the label of the nearby reference
        observation, the third is the distance between the query and
        reference observations, and the fourth is the rank of the reference
        observation among the query's k-nearest neighbors.

    Raises
    ------
    ValueError
        If 'label' is not a column of 'dataset' or is one of the model's
        features, if 'k' is not a positive integer, or if 'radius' is not
        a non-negative number.

    TypeError
        If the label column does not contain integers or strings.

    Notes
    -----
    - If both ``k`` and ``radius`` are set to ``None``, each query point
      returns all of the reference set. If the reference dataset has
      :math:`n` rows and the query dataset has :math:`m` rows, the output
      is an SFrame with :math:`nm` rows.

    Examples
    --------
    First construct a toy SFrame and create a nearest neighbors model:

    >>> sf = graphlab.SFrame({'label': range(3),
    ...                       'feature1': [0.98, 0.62, 0.11],
    ...                       'feature2': [0.69, 0.58, 0.36]})
    >>> model = graphlab.nearest_neighbors.create(sf, 'label')

    A new SFrame contains query observations with same schema as the
    reference SFrame. This SFrame is passed to the ``query`` method.

    >>> queries = graphlab.SFrame({'label': range(3),
    ...                            'feature1': [0.05, 0.61, 0.99],
    ...                            'feature2': [0.06, 0.97, 0.86]})
    >>> model.query(queries, 'label', k=2)
    +-------------+-----------------+----------------+------+
    | query_label | reference_label |    distance    | rank |
    +-------------+-----------------+----------------+------+
    |      0      |        2        | 0.305941170816 |  1   |
    |      0      |        1        | 0.771556867638 |  2   |
    |      1      |        1        | 0.390128184063 |  1   |
    |      1      |        0        | 0.464004310325 |  2   |
    |      2      |        0        | 0.170293863659 |  1   |
    |      2      |        1        | 0.464004310325 |  2   |
    +-------------+-----------------+----------------+------+
    """
    _mt._get_metric_tracker().track(
        'toolkit.nearest_neighbors.query')

    ## Validate the 'dataset' input
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    ## Get model features
    ref_features = self.get('features')
    sf_features = _tkutl._toolkits_select_columns(dataset, ref_features)

    ## Validate and preprocess the 'label' input
    ref_label = self.get('label')

    if label is None:
        # No label column supplied: number the query rows and use those
        # numbers as labels, stored under the model's own label column name.
        sf_features = sf_features.add_row_number(column_name=ref_label)
        sf_label = sf_features[[ref_label]]
        sf_features.remove_column(ref_label)
    else:
        if label not in dataset.column_names():
            raise ValueError(
                "Input 'label' must be a string matching the name of a "
                "column in the query SFrame 'dataset'.")

        label_type = dataset[label].dtype()
        if label_type not in (str, int):
            raise TypeError("The label column must contain integers or strings.")

        if label in ref_features:
            raise ValueError("The label column cannot be one of the features.")

        sf_label = _tkutl._toolkits_select_columns(dataset, [label])

        # The labels are sent under the model's label column name.
        if label != ref_label:
            sf_label.rename({label: ref_label})

    ## Validate neighborhood parameters 'k' and 'radius'
    if k is not None:
        if not isinstance(k, int):
            raise ValueError("Input 'k' must be an integer.")

        if k <= 0:
            raise ValueError("Input 'k' must be larger than 0.")

    if radius is not None:
        if not isinstance(radius, (int, float)):
            raise ValueError("Input 'radius' must be an integer or float.")

        if radius < 0:
            raise ValueError("Input 'radius' must be non-negative.")

    ## Set k and radius to special values to indicate 'None'
    if k is None:
        k = -1

    if radius is None:
        radius = -1.0

    opts = {'model': self.__proxy__,
            'model_name': self.__name__,
            'features': sf_features,
            'label': sf_label,
            'k': k,
            'radius': radius}

    if verbose:
        # Parenthesised single-argument form prints identically on
        # Python 2 and Python 3.
        print("Starting model querying...")

    result = _graphlab.toolkits._main.run('_nearest_neighbors.query', opts,
                                          verbose)
    return _SFrame(None, _proxy=result['neighbors'])
def create(dataset, target,
           loss_function='squared',
           quadratic=[],
           l1_penalty=0.0, l2_penalty=0.0,
           bigram=False,
           step_size=0.5,
           num_bits=18,
           verbose=False,
           max_iterations=1,
           command_line_args=''):
    """
    create(dataset, target, loss_function='squared', quadratic=list(), l1_penalty=0.0, l2_penalty=0.0, bigram=False, step_size=0.5, num_bits=18, verbose=False, max_iterations=1, command_line_args='')

    Learn a large linear model using Vowpal Wabbit.

    Parameters
    ----------
    dataset : SFrame
        A data set. Due to the way Vowpal Wabbit creates features from each
        entry, ':' and '|' characters are not allowed in any columns
        containing strings. Each row of the dataset is translated into a
        string and passed to Vowpal Wabbit. Currently, the upper bound on
        the size of the string is 1MB. Based on the type of the SArray
        column, the values are passed in the following ways.

        - *integer* or *float*: the value is passed directly to VW.

        - *str*: the name of the column is used as the namespace, followed
          by the entire string.

        - *dict*: the name of the column is used as the namespace, and each
          key-value pair is a feature. The keys of the dictionary must be
          string or numeric and the values must be numeric (integer or
          float).

        - *array*: the name of the column is used as the namespace, the
          index of the array element is used as the name of the feature,
          and only numeric elements in the array are passed to VW.

        - *list (recursive type)*: the name of the column is used as the
          namespace, the index of the list element is used as the name of
          the feature, and currently only numeric elements (integer or
          float) are passed to VW.

        See the `VW input format guidelines
        <https://github.com/JohnLangford/vowpal_wabbit/wiki/Input-format>`_
        for more details.

    target : string
        The name of the column in ``dataset`` that is the prediction
        target. This column must have a numeric type.

    loss_function : {'squared', 'hinge', 'logistic', 'quantile'}, optional
        This defines the `loss function
        <http://en.wikipedia.org/wiki/Loss_function>`_ used during
        optimization. Typical choices:

        - *real-valued target*: `squared error loss
          <http://en.wikipedia.org/wiki/Mean_squared_error>`_.

        - *binary target*: `logistic
          <http://en.wikipedia.org/wiki/Logistic_regression>`_. The target
          column must only contain -1 or 1.

        The `hinge loss <http://en.wikipedia.org/wiki/Hinge_loss>`_ is also
        used for classification, while `quantile loss
        <http://en.wikipedia.org/wiki/Quantile_regression>`_ can be good
        when one aims to predict quantities other than the mean.

    quadratic : list of pairs, optional
        This will add `interaction terms
        <http://en.wikipedia.org/wiki/Interaction_(statistics)>`_ to a
        linear model between a pair of columns. Quadratic terms add a
        parameter in the model for the product of two features, i.e. if we
        include an interaction between :math:`x_1` and :math:`x_2`, we can
        add a parameter :math:`b_3`.

        .. math::

            y_i = a + b_1 * x_{i1} + b_2 * x_{i2} + b_3 * x_{i1} * x_{i2}

        Multiple quadratic terms can be added by including multiple pairs,
        e.g. ``quadratic = [('a', 'b'), ('b', 'c')]`` would add interaction
        terms between columns names 'a' and 'b' as well as terms for
        interactions between 'b' and 'c'. Including ':' as one of the items
        in the pairs is a shortcut for adding quadratic terms for all pairs
        of features. Due to Vowpal Wabbit's implementation, quadratic terms
        are determined by the first letter of the column name.

    l1_penalty : float, optional
        This defines how strongly you want to keep parameters to be zero.

    l2_penalty : float, optional
        This defines how strongly you want to keep parameters near zero.
        Specifically it adds a penalty of :math:`.5 * \\lambda * |w|_2^2`
        to the weight vector w, where lambda is the provided regularization
        value.

    bigram : bool, optional
        Add bigram features. For columns containing the text "my name is
        bob" this will add bigram features for "my name", "name is",
        "is bob".

    step_size : float, optional
        Set the learning rate for online learning.

    num_bits : int, optional
        Forwarded unchanged to the Vowpal Wabbit backend as ``num_bits``.
        NOTE(review): presumably the number of bits of VW's feature hash
        table (VW's ``-b`` option) -- confirm against the backend.

    verbose : bool, optional
        Print first 10 rows as they are seen by VowpalWabbit. This is
        useful for debugging.

    max_iterations : int, optional
        Number of passes to take over the data set.

    command_line_args : string, optional
        Additional arguments to pass to Vowpal Wabbit, just as one would
        use when using VW via the command line.

    Returns
    -------
    out : VowpalWabbitModel
        A model that can be used for predicting new cases. The field
        'training_accuracy' is set on it when ``loss_function`` is
        'logistic', and 'training_rmse' otherwise.

    Raises
    ------
    TypeError
        If ``dataset`` is not an SFrame, or if ``loss_function`` is
        'logistic' and the target column holds values other than 1 and -1.

    AssertionError
        If ``target`` is not the name of a column in ``dataset``.

    See Also
    --------
    VowpalWabbitModel.predict, VowpalWabbitModel.evaluate

    Notes
    -----
    - Other desired command line arguments can be provided manually through
      the command_line_args keyword argument. See the `VW documentation
      <http://github.com/JohnLangford/vowpal_wabbit/wiki/Command-line-arguments>`_
      for more details.

    - Several Vowpal Wabbit features are not yet supported, including
      importance weighted learning.

    Examples
    --------
    >>> data = graphlab.SFrame('http://s3.amazonaws.com/GraphLab-Datasets/regression/houses.csv')
    >>> data['price'] = data['price'].apply(lambda x: 1 if x > 30000 else -1)
    >>> m = graphlab.vowpal_wabbit.create(data, 'price')

    To add quadratic terms between 'user' and 'movie' columns:

    >>> m = graphlab.vowpal_wabbit.create(sf, 'rating',
    ...                                   quadratic=[('user', 'movie')])

    If a column contains text, each space-separated word is used as a
    unique feature. Often times it is useful to also include bigrams as
    features. This can be done easily with the ``bigram`` argument:

    >>> m = graphlab.vowpal_wabbit.create(sf, 'rating', bigram=True)
    """
    # NOTE: the ``quadratic`` default is a shared list object; it is only
    # iterated below, never mutated, so sharing it across calls is harmless.

    _mt._get_metric_tracker().track('toolkit.vowpal_wabbit.create')

    if not isinstance(dataset, _SFrame):
        raise TypeError("Input 'dataset' must be an SFrame")

    # Instances of SFrame subclasses are converted to a plain SFrame.
    if type(dataset) != _SFrame:
        dataset = _SFrame(dataset)

    # Raised explicitly (rather than via ``assert``) so the check also runs
    # under ``python -O``; the exception type and message are unchanged.
    if target not in dataset.column_names():
        raise AssertionError("No target provided.")

    # VW uses first letter to describe namespace
    quadratic_command = ''.join(
        ' -q ' + feature_a[0] + feature_b[0]
        for (feature_a, feature_b) in quadratic)

    opts = {'verbose': verbose,
            'target': target,
            'loss_function': loss_function,
            'quadratic': quadratic_command,
            'step_size': step_size,
            'l1_penalty': l1_penalty,
            'l2_penalty': l2_penalty,
            'num_bits': num_bits,
            'max_iterations': max_iterations,
            'bigram': bigram,
            'extra_command_line_args': command_line_args}

    # Initialize the model with basic parameters
    response = _graphlab.toolkits._main.run("vw_init", opts)
    m = VowpalWabbitModel(response['model'])

    # Train the model on the given data set and retrieve predictions
    opts = {'model': m.__proxy__, 'data': dataset}
    response = _graphlab.toolkits._main.run("vw_train", opts)
    m = VowpalWabbitModel(response['model'])
    yhat = _SArray(None, _proxy=response['predictions'])

    # Evaluate model
    y = dataset[target]
    if loss_function == 'logistic':
        is_one_or_neg_one = y.apply(lambda x: x == 1 or x == -1)
        if not all(is_one_or_neg_one):
            raise TypeError('When using `logistic` as a loss function, the target column must contain only 1\'s and -1\'s.')
        # Map the labels -1 / 1 onto 0 / 1 before computing accuracy.
        y = y.apply(lambda x: int(x*.5 + .5))
        m = m._set('training_accuracy', _graphlab.evaluation.accuracy(y, yhat))
    else:
        m = m._set('training_rmse', _graphlab.evaluation.rmse(y, yhat))

    return m
def get(self, field):
    """
    Look up the value of a single model field by name.

    The queryable fields are listed below; the same list can be obtained
    with the ``list_fields`` method.

    +-----------------------+----------------------------------------------+
    | Field                 | Description                                  |
    +=======================+==============================================+
    | batch_size            | Number of randomly chosen examples to use in |
    |                       | each training iteration.                     |
    +-----------------------+----------------------------------------------+
    | cluster_id            | Cluster assignment for each data point and   |
    |                       | Euclidean distance to the cluster center     |
    +-----------------------+----------------------------------------------+
    | cluster_info          | Cluster centers, sum of squared Euclidean    |
    |                       | distances from each cluster member to the    |
    |                       | assigned center, and the number of data      |
    |                       | points belonging to the cluster              |
    +-----------------------+----------------------------------------------+
    | features              | Names of feature columns                     |
    +-----------------------+----------------------------------------------+
    | max_iterations        | Maximum number of iterations to perform      |
    +-----------------------+----------------------------------------------+
    | method                | Algorithm used to train the model.           |
    +-----------------------+----------------------------------------------+
    | num_clusters          | Number of clusters                           |
    +-----------------------+----------------------------------------------+
    | num_examples          | Number of examples in the dataset            |
    +-----------------------+----------------------------------------------+
    | num_features          | Number of feature columns used               |
    +-----------------------+----------------------------------------------+
    | num_unpacked_features | Number of features unpacked from the         |
    |                       | feature columns                              |
    +-----------------------+----------------------------------------------+
    | training_iterations   | Total number of iterations performed         |
    +-----------------------+----------------------------------------------+
    | training_time         | Total time taken to cluster the data         |
    +-----------------------+----------------------------------------------+
    | unpacked_features     | Names of features unpacked from the          |
    |                       | feature columns                              |
    +-----------------------+----------------------------------------------+
    | verbose               | True if model training should print progress |
    +-----------------------+----------------------------------------------+

    Parameters
    ----------
    field : str
        The name of the field to query.

    Returns
    -------
    out
        Value of the requested field. The 'cluster_id' and 'cluster_info'
        fields are returned wrapped in an SFrame; every other field is
        returned as received from the toolkit call.

    See Also
    --------
    list_fields

    Examples
    --------
    >>> model.get("cluster_info")
             d1        d2        d3        d4  sum_squared_distance  size
    0 -0.777484  1.048897  0.523926  0.487775              2.459470     4
    1  0.844906 -0.613151 -0.088785 -0.212908              3.651614     5
    2 -1.114592 -1.129836 -1.651781 -0.886557              0.000000     1

    [3 rows x 6 columns]
    """
    _mt._get_metric_tracker().track('toolkit.kmeans.get')

    request = {'model': self.__proxy__,
               'model_name': self.__name__,
               'field': field}
    value = _graphlab.toolkits._main.run('kmeans_get_value', request)['value']

    # These two fields come back as a unity SFrame proxy, so they are
    # wrapped in an SFrame before being handed to the caller.
    if field in ('cluster_id', 'cluster_info'):
        return _SFrame(None, _proxy=value)

    return value
def get_default_options_for_model(output_type='sframe'):
    """
    Get the default options for the toolkit
    :class:`~graphlab.{module_name}.{python_class_name}`.

    Parameters
    ----------
    output_type : str, optional

        The output can be of the following types.

        - `sframe`: A table describing each option used in the model.
        - `json`: A dictionary mapping each option name to its decoded
          JSON description.

        Any value other than `json` produces the SFrame output.

        | Each dictionary/row in the JSON/SFrame object describes the
          following parameters of the given model.

        +------------------+-------------------------------------------------------+
        | Name             | Description                                           |
        +==================+=======================================================+
        | name             | Name of the option used in the model.                 |
        +------------------+-------------------------------------------------------+
        | description      | A detailed description of the option used.            |
        +------------------+-------------------------------------------------------+
        | type             | Option type (REAL, BOOL, INTEGER or CATEGORICAL)      |
        +------------------+-------------------------------------------------------+
        | default_value    | The default value for the option.                     |
        +------------------+-------------------------------------------------------+
        | possible_values  | List of acceptable values (CATEGORICAL only)          |
        +------------------+-------------------------------------------------------+
        | lower_bound      | Smallest acceptable value for this option (REAL only) |
        +------------------+-------------------------------------------------------+
        | upper_bound      | Largest acceptable value for this option (REAL only)  |
        +------------------+-------------------------------------------------------+

    Returns
    -------
    out : JSON/SFrame

    See Also
    --------
    graphlab.{module_name}.{python_class_name}.get_current_options

    Examples
    --------
    .. sourcecode:: python

      >>> import graphlab

      # SFrame formatted output.
      >>> out_sframe = graphlab.{module_name}.get_default_options()

      # JSON formatted output.
      >>> out_json = graphlab.{module_name}.get_default_options('json')
    """
    # NOTE(review): the docstring above carries str.format-style placeholders
    # (module_name, python_class_name), presumably filled in by the enclosing
    # factory -- confirm, and avoid adding literal braces to it.
    # ``module_name``, ``sdk_model`` and ``unity_server_model_name`` are free
    # variables taken from the enclosing scope.

    def _parse_json_int(token):
        # Integers outside the signed 64-bit range are decoded as floats.
        # The original test (``type(int(x)) is long``) relied on Python 2
        # promoting such values to ``long`` -- which coincides with this
        # bound where a C long is 64 bits -- and fails with a NameError on
        # Python 3, where ``long`` does not exist.
        value = int(token)
        if -(2 ** 63) <= value <= 2 ** 63 - 1:
            return value
        return float(token)

    _mt._get_metric_tracker().track('toolkit.%s.get_default_options' % module_name)

    if sdk_model:
        response = _gl.extensions._toolkits_sdk_get_default_options(
            unity_server_model_name)
    else:
        response = _gl.extensions._toolkits_get_default_options(
            unity_server_model_name)

    # Each value arrives as a JSON string; decode it in place. The keys are
    # snapshotted first so the mapping is not iterated while being updated.
    for k in list(response.keys()):
        response[k] = json.loads(response[k], parse_int=_parse_json_int)

    if output_type == 'json':
        return response

    # One row per option; the two unpack calls expand the nested record
    # into top-level columns.
    json_list = [{'name': k, '': v} for k, v in response.items()]
    return (_SFrame(json_list)
            .unpack('X1', column_name_prefix='')
            .unpack('X1', column_name_prefix=''))
def get_topics(self, topic_ids=None, num_words=5, cdf_cutoff=1.0,
               output_type='topic_probabilities'):
    """
    Get the words associated with a given topic. The score column is the
    probability of choosing that word given that you have chosen a
    particular topic.

    Parameters
    ----------
    topic_ids : list of int, optional
        The topics to retrieve words. Topic ids are zero-based. The
        default, ``None``, retrieves every topic of the model.

    num_words : int, optional
        The number of words to show.

    cdf_cutoff : float, optional
        Allows one to only show the most probable words whose cumulative
        probability is below this cutoff. For example if there exist
        three words where

        .. math::
           p(word_1 | topic_k) = .1

           p(word_2 | topic_k) = .2

           p(word_3 | topic_k) = .05

        then setting :math:`cdf_{cutoff}=.3` would return only
        :math:`word_1` and :math:`word_2` since
        :math:`p(word_1 | topic_k) + p(word_2 | topic_k) <= cdf_{cutoff}`

    output_type : {'topic_probabilities' | 'topic_words'}, optional
        Determine the type of desired output. See below.

    Returns
    -------
    out : SFrame
        If output_type is 'topic_probabilities', then the returned value is
        an SFrame with a column of words ranked by a column of scores for
        each topic. Otherwise, the returned value is an SFrame with a
        single column, 'words', where each element is a list of the most
        probable words for a topic (one row per topic, in topic order).

    Raises
    ------
    AssertionError
        If ``topic_ids`` is given but is not a list.

    ValueError
        If ``topic_ids`` contains a string, or an id that is negative or
        greater than or equal to the model's 'num_topics'.

    Examples
    --------
    Get the highest ranked words for all topics.

    >>> docs = graphlab.SArray('https://static.turi.com/datasets/nips-text')
    >>> m = graphlab.topic_model.create(docs,
    ...                                 num_iterations=50)
    >>> m.get_topics()
    +-------+----------+-----------------+
    | topic |   word   |      score      |
    +-------+----------+-----------------+
    |   0   |   cell   |  0.028974400831 |
    |   0   |  input   | 0.0259470208503 |
    |   0   |  image   | 0.0215721599763 |
    |   0   |  visual  | 0.0173635081992 |
    |   0   |  object  | 0.0172447874156 |
    |   1   | function | 0.0482834508265 |
    |   1   |  input   | 0.0456270024091 |
    |   1   |  point   | 0.0302662839454 |
    |   1   |  result  | 0.0239474934631 |
    |   1   | problem  | 0.0231750116011 |
    |  ...  |   ...    |       ...       |
    +-------+----------+-----------------+

    Get the highest ranked words for topics 0 and 1 and show 15 words per
    topic.

    >>> m.get_topics([0, 1], num_words=15)
    +-------+----------+------------------+
    | topic |   word   |      score       |
    +-------+----------+------------------+
    |   0   |   cell   |  0.028974400831  |
    |   0   |  input   | 0.0259470208503  |
    |   0   |  image   | 0.0215721599763  |
    |   0   |  visual  | 0.0173635081992  |
    |   0   |  object  | 0.0172447874156  |
    |   0   | response | 0.0139740298286  |
    |   0   |  layer   | 0.0122585145062  |
    |   0   | features | 0.0115343177265  |
    |   0   | feature  | 0.0103530459301  |
    |   0   | spatial  | 0.00823387994361 |
    |  ...  |   ...    |       ...        |
    +-------+----------+------------------+

    If one wants to instead just get the top words per topic, one may
    change the format of the output as follows.

    >>> topics = m.get_topics(output_type='topic_words')
    >>> topics['words']
    dtype: list
    Rows: 10
    [['cell', 'image', 'input', 'object', 'visual'],
     ['algorithm', 'data', 'learning', 'method', 'set'],
     ['function', 'input', 'point', 'problem', 'result'],
     ['model', 'output', 'pattern', 'set', 'unit'],
     ['action', 'learning', 'net', 'problem', 'system'],
     ['error', 'function', 'network', 'parameter', 'weight'],
     ['information', 'level', 'neural', 'threshold', 'weight'],
     ['control', 'field', 'model', 'network', 'neuron'],
     ['hidden', 'layer', 'system', 'training', 'vector'],
     ['component', 'distribution', 'local', 'model', 'optimal']]
    """
    _mt._get_metric_tracker().track('toolkit.text.topic_model.get_topics')

    _check_categorical_option_type('output_type', output_type,
                                   ['topic_probabilities', 'topic_words'])

    if topic_ids is None:
        topic_ids = list(range(self.get('num_topics')))

    # Raised explicitly (rather than via ``assert``) so the check also runs
    # under ``python -O``; the exception type and message are unchanged.
    if not isinstance(topic_ids, list):
        raise AssertionError("The provided topic_ids is not a list.")

    if any(isinstance(x, str) for x in topic_ids):
        raise ValueError("Only integer topic_ids can be used at this point in time.")

    # Look the topic count up once instead of once per requested id.
    num_topics = self['num_topics']
    if not all(0 <= x < num_topics for x in topic_ids):
        raise ValueError("Topic id values must be non-negative and less than the "
                         "number of topics used to fit the model.")

    opts = {'model': self.__proxy__,
            'topic_ids': topic_ids,
            'num_words': num_words,
            'cdf_cutoff': cdf_cutoff}
    response = _graphlab.toolkits._main.run('text_topicmodel_get_topic',
                                            opts)
    ret = _map_unity_proxy_to_object(response['top_words'])

    if output_type == 'topic_probabilities':
        return ret

    def sort_wordlist_by_prob(z):
        # ``z`` maps word -> score for one topic; return the words ordered
        # from highest to lowest score.
        words = sorted(z.items(), key=_operator.itemgetter(1), reverse=True)
        return [word for (word, prob) in words]

    # 'topic_words': collapse each topic's rows into one word -> score
    # mapping, then keep only the words, ordered by score.
    ret = ret.groupby('topic',
                      {'word': _graphlab.aggregate.CONCAT('word', 'score')})
    words = ret.sort('topic')['word'].apply(sort_wordlist_by_prob)
    return _SFrame({'words': words})
def query(self, dataset, label=None, k=5, radius=None, verbose=True):
    """
    For each row of the input 'dataset', retrieve the nearest neighbors
    from the model's stored data. In general, the query dataset does not
    need to be the same as the reference data stored in the model. This
    method has no option for excluding results that match query points to
    themselves; see ``similarity_graph`` for querying the reference data
    against itself.

    Parameters
    ----------
    dataset : SFrame
        Query data. Must contain columns with the same names and types as
        the features used to train the model. Additional columns are
        allowed, but ignored. Please see the nearest neighbors
        :func:`~graphlab.nearest_neighbors.create` documentation for more
        detail on allowable data types.

    label : str, optional
        Name of the query SFrame column with row labels. If 'label' is not
        specified, row numbers are used to identify query dataset rows in
        the output SFrame.

    k : int, optional
        Number of nearest neighbors to return from the reference set for
        each query observation. The default is 5 neighbors, but setting it
        to ``None`` will return all neighbors within ``radius`` of the
        query point.

    radius : float, optional
        Only neighbors whose distance to a query point is smaller than this
        value are returned. The default is ``None``, in which case the
        ``k`` nearest neighbors are returned for each query point,
        regardless of distance.

    verbose: bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : SFrame
        An SFrame with the k-nearest neighbors of each query observation.
        The result contains four columns: the first is the label of the
        query observation, the second is the label of the nearby reference
        observation, the third is the distance between the query and
        reference observations, and the fourth is the rank of the reference
        observation among the query's k-nearest neighbors.

    Raises
    ------
    ValueError
        If 'label' is not a column of 'dataset' or is one of the model's
        features, if 'k' is not a positive integer, or if 'radius' is not
        a non-negative number.

    TypeError
        If the label column does not contain integers or strings.

    See Also
    --------
    similarity_graph

    Notes
    -----
    - The `dataset` input to this method *can* have missing values (in
      contrast to the reference dataset used to create the nearest
      neighbors model). Missing numeric values are imputed to be the mean
      of the corresponding feature in the reference dataset, and missing
      strings are imputed to be empty strings.

    - If both ``k`` and ``radius`` are set to ``None``, each query point
      returns all of the reference set. If the reference dataset has
      :math:`n` rows and the query dataset has :math:`m` rows, the output
      is an SFrame with :math:`nm` rows.

    - For models created with the 'lsh' method, the query results may have
      fewer query labels than input query points. Because LSH is an
      approximate method, a query point may have fewer than 'k' neighbors.
      If LSH returns no neighbors at all for a query, the query point is
      omitted from the results.

    Examples
    --------
    First construct a toy SFrame and create a nearest neighbors model:

    >>> sf = graphlab.SFrame({'label': range(3),
    ...                       'feature1': [0.98, 0.62, 0.11],
    ...                       'feature2': [0.69, 0.58, 0.36]})
    >>> model = graphlab.nearest_neighbors.create(sf, 'label')

    A new SFrame contains query observations with same schema as the
    reference SFrame. This SFrame is passed to the ``query`` method.

    >>> queries = graphlab.SFrame({'label': range(3),
    ...                            'feature1': [0.05, 0.61, 0.99],
    ...                            'feature2': [0.06, 0.97, 0.86]})
    >>> model.query(queries, 'label', k=2)
    +-------------+-----------------+----------------+------+
    | query_label | reference_label |    distance    | rank |
    +-------------+-----------------+----------------+------+
    |      0      |        2        | 0.305941170816 |  1   |
    |      0      |        1        | 0.771556867638 |  2   |
    |      1      |        1        | 0.390128184063 |  1   |
    |      1      |        0        | 0.464004310325 |  2   |
    |      2      |        0        | 0.170293863659 |  1   |
    |      2      |        1        | 0.464004310325 |  2   |
    +-------------+-----------------+----------------+------+
    """
    _mt._get_metric_tracker().track(
        'toolkit.nearest_neighbors.query')

    ## Validate the 'dataset' input
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    ## Get model features
    ref_features = self.get('features')
    sf_features = _tkutl._toolkits_select_columns(dataset, ref_features)

    ## Validate and preprocess the 'label' input
    if label is None:
        # No label column supplied: identify query rows by row number.
        query_labels = _graphlab.SArray.from_sequence(len(dataset))
    else:
        if label not in dataset.column_names():
            raise ValueError(
                "Input 'label' must be a string matching the name of a "
                "column in the query SFrame 'dataset'.")

        label_type = dataset[label].dtype()
        if label_type not in (str, int):
            raise TypeError("The label column must contain integers or strings.")

        if label in ref_features:
            raise ValueError("The label column cannot be one of the features.")

        query_labels = dataset[label]

    ## Validate neighborhood parameters 'k' and 'radius'
    if k is not None:
        if not isinstance(k, int):
            raise ValueError("Input 'k' must be an integer.")

        if k <= 0:
            raise ValueError("Input 'k' must be larger than 0.")

    if radius is not None:
        if not isinstance(radius, (int, float)):
            raise ValueError("Input 'radius' must be an integer or float.")

        if radius < 0:
            raise ValueError("Input 'radius' must be non-negative.")

    ## Set k and radius to special values to indicate 'None'
    if k is None:
        k = -1

    if radius is None:
        radius = -1.0

    opts = {'model': self.__proxy__,
            'model_name': self.__name__,
            'features': sf_features,
            'query_labels': query_labels,
            'k': k,
            'radius': radius}

    result = _graphlab.toolkits._main.run('_nearest_neighbors.query', opts,
                                          verbose)
    return _SFrame(None, _proxy=result['neighbors'])