def _get(self, field):
    """
    Return the value of a given field of the topic model.

    The list of all queryable fields can be obtained with the
    :py:func:`~TopicModel._list_fields` method. The fields that are wrapped
    before being returned are detailed below.

    +-----------------------+----------------------------------------------+
    |      Field            | Description                                  |
    +=======================+==============================================+
    | topics                | An SFrame containing a column with the unique|
    |                       | words observed during training, and a column |
    |                       | of arrays containing the probability values  |
    |                       | for each word given each of the topics.      |
    +-----------------------+----------------------------------------------+
    | vocabulary            | An SArray containing the words used. This is |
    |                       | same as the vocabulary column in the topics  |
    |                       | field above.                                 |
    +-----------------------+----------------------------------------------+

    Parameters
    ----------
    field : string
        Name of the field to be retrieved.

    Returns
    -------
    out
        Value of the requested field. 'vocabulary' is returned as an SArray
        and 'topics' as an SFrame; every other field is returned exactly as
        the toolkit call produced it.
    """
    request = {'model': self.__proxy__, 'field': field}
    reply = _turicreate.toolkits._main.run("text_topicmodel_get_value", request)
    value = reply['value']

    # These two fields are handed to the SArray/SFrame constructors as a
    # `_proxy`; anything else is passed through untouched.
    if field == 'vocabulary':
        return _SArray(None, _proxy=value)
    if field == 'topics':
        return _SFrame(None, _proxy=value)
    return value
def get_topics(self, topic_ids=None, num_words=5, cdf_cutoff=1.0,
               output_type='topic_probabilities'):
    """
    Get the words associated with a given topic. The score column is the
    probability of choosing that word given that you have chosen a
    particular topic.

    Parameters
    ----------
    topic_ids : list of int, optional
        The topics to retrieve words. Topic ids are zero-based.
        Throws an error if greater than or equal to m['num_topics'], or
        if the requested topic name is not present. When omitted, all
        topics are retrieved.

    num_words : int, optional
        The number of words to show.

    cdf_cutoff : float, optional
        Allows one to only show the most probable words whose cumulative
        probability is below this cutoff. For example if there exist
        three words where

        .. math::
           p(word_1 | topic_k) = .1

           p(word_2 | topic_k) = .2

           p(word_3 | topic_k) = .05

        then setting :math:`cdf_{cutoff}=.3` would return only
        :math:`word_1` and :math:`word_2` since
        :math:`p(word_1 | topic_k) + p(word_2 | topic_k) <= cdf_{cutoff}`

    output_type : {'topic_probabilities' | 'topic_words'}, optional
        Determine the type of desired output. See below.

    Returns
    -------
    out : SFrame
        If output_type is 'topic_probabilities', then the returned value is
        an SFrame with a column of words ranked by a column of scores for
        each topic. Otherwise, the returned value is an SFrame with a single
        column named 'words', holding one row per topic (ordered by topic
        id); each element is a list of the most probable words for that
        topic, sorted by decreasing score.

    Raises
    ------
    AssertionError
        If ``topic_ids`` is not a list.
    ValueError
        If ``topic_ids`` contains a string, or a value that is negative or
        not smaller than the number of topics of the model.

    Examples
    --------
    Get the highest ranked words for all topics.

    >>> docs = turicreate.SArray('https://static.turi.com/datasets/nips-text')
    >>> m = turicreate.topic_model.create(docs,
                                        num_iterations=50)
    >>> m.get_topics()
    +-------+----------+-----------------+
    | topic |   word   |      score      |
    +-------+----------+-----------------+
    |   0   |   cell   |  0.028974400831 |
    |   0   |  input   | 0.0259470208503 |
    |   0   |  image   | 0.0215721599763 |
    |   0   |  visual  | 0.0173635081992 |
    |   0   |  object  | 0.0172447874156 |
    |   1   | function | 0.0482834508265 |
    |   1   |  input   | 0.0456270024091 |
    |   1   |  point   | 0.0302662839454 |
    |   1   |  result  | 0.0239474934631 |
    |   1   | problem  | 0.0231750116011 |
    |  ...  |   ...    |       ...       |
    +-------+----------+-----------------+

    Get the highest ranked words for topics 0 and 1 and show 15 words per
    topic.

    >>> m.get_topics([0, 1], num_words=15)
    +-------+----------+------------------+
    | topic |   word   |      score       |
    +-------+----------+------------------+
    |   0   |   cell   |  0.028974400831  |
    |   0   |  input   | 0.0259470208503  |
    |   0   |  image   | 0.0215721599763  |
    |   0   |  visual  | 0.0173635081992  |
    |   0   |  object  | 0.0172447874156  |
    |   0   | response | 0.0139740298286  |
    |   0   |  layer   | 0.0122585145062  |
    |   0   | features | 0.0115343177265  |
    |   0   | feature  | 0.0103530459301  |
    |   0   | spatial  | 0.00823387994361 |
    |  ...  |   ...    |       ...        |
    +-------+----------+------------------+

    If one wants to instead just get the top words per topic, one may
    change the format of the output as follows.

    >>> topics = m.get_topics(output_type='topic_words')
    >>> topics['words']
    dtype: list
    Rows: 10
    [['cell', 'image', 'input', 'object', 'visual'],
     ['algorithm', 'data', 'learning', 'method', 'set'],
     ['function', 'input', 'point', 'problem', 'result'],
     ['model', 'output', 'pattern', 'set', 'unit'],
     ['action', 'learning', 'net', 'problem', 'system'],
     ['error', 'function', 'network', 'parameter', 'weight'],
     ['information', 'level', 'neural', 'threshold', 'weight'],
     ['control', 'field', 'model', 'network', 'neuron'],
     ['hidden', 'layer', 'system', 'training', 'vector'],
     ['component', 'distribution', 'local', 'model', 'optimal']]
    """
    _check_categorical_option_type('output_type', output_type,
                                   ['topic_probabilities', 'topic_words'])

    if topic_ids is None:
        topic_ids = list(range(self._get('num_topics')))

    # Raised explicitly instead of with an `assert` statement so the check
    # is not stripped when Python runs with -O. The exception type stays
    # AssertionError so existing callers see the same error as before.
    if not isinstance(topic_ids, list):
        raise AssertionError("The provided topic_ids is not a list.")

    # isinstance (rather than an exact type comparison) also rejects
    # subclasses of str with the intended message.
    if any(isinstance(x, str) for x in topic_ids):
        raise ValueError(
            "Only integer topic_ids can be used at this point in time.")

    # Read the model attribute once instead of once per requested topic.
    num_topics = self.num_topics
    if not all(x >= 0 and x < num_topics for x in topic_ids):
        raise ValueError(
            "Topic id values must be non-negative and less than the "
            "number of topics used to fit the model.")

    opts = {
        'model': self.__proxy__,
        'topic_ids': topic_ids,
        'num_words': num_words,
        'cdf_cutoff': cdf_cutoff,
    }
    response = _turicreate.extensions._text.topicmodel_get_topic(opts)
    ret = response['top_words']

    if output_type == 'topic_probabilities':
        return ret

    def sort_wordlist_by_prob(z):
        # z maps word -> score for one topic; keep only the words, most
        # probable first.
        words = sorted(z.items(), key=_operator.itemgetter(1), reverse=True)
        return [word for (word, prob) in words]

    # Collapse the per-(topic, word) rows into one word->score dictionary
    # per topic, then order each topic's words by score.
    ret = ret.groupby(
        'topic', {'word': _turicreate.aggregate.CONCAT('word', 'score')})
    words = ret.sort('topic')['word'].apply(sort_wordlist_by_prob)
    return _SFrame({'words': words})
def get_default_options_for_model(output_type='sframe'):
    """
    Get the default options for the toolkit
    :class:`~turicreate.{module_name}.{python_class_name}`.

    Parameters
    ----------
    output_type : str, optional

        The output can be of the following types.

        - `sframe`: A table description each option used in the model.
        - `json`: A list of option dictionaries suitable for JSON serialization.

        | Each dictionary/row in the dictionary/SFrame object describes the
          following parameters of the given model.

        +------------------+-------------------------------------------------------+
        |      Name        |                  Description                          |
        +==================+=======================================================+
        | name             | Name of the option used in the model.                 |
        +------------------+-------------------------------------------------------+
        | description      | A detailed description of the option used.            |
        +------------------+-------------------------------------------------------+
        | type             | Option type (REAL, BOOL, INTEGER or CATEGORICAL)      |
        +------------------+-------------------------------------------------------+
        | default_value    | The default value for the option.                     |
        +------------------+-------------------------------------------------------+
        | possible_values  | List of acceptable values (CATEGORICAL only)          |
        +------------------+-------------------------------------------------------+
        | lower_bound      | Smallest acceptable value for this option (REAL only) |
        +------------------+-------------------------------------------------------+
        | upper_bound      | Largest acceptable value for this option (REAL only)  |
        +------------------+-------------------------------------------------------+

    Returns
    -------
    out : dict/SFrame

    See Also
    --------
    turicreate.{module_name}.{python_class_name}.get_current_options

    Examples
    --------
    .. sourcecode:: python

      >>> import turicreate

      # SFrame formatted output.
      >>> out_sframe = turicreate.{module_name}.get_default_options()

      # dict formatted output suitable for JSON serialization.
      >>> out_json = turicreate.{module_name}.get_default_options('json')
    """
    # NOTE(review): the docstring above carries {module_name} and
    # {python_class_name} placeholders, presumably filled in with str.format
    # by the enclosing code -- keep it free of any other curly braces.

    # Pick the SDK or the regular toolkit entry point, then ask it for the
    # defaults of the model named by the enclosing scope.
    extensions = _tc.extensions
    if sdk_model:
        fetch_defaults = extensions._toolkits_sdk_get_default_options
    else:
        fetch_defaults = extensions._toolkits_get_default_options
    response = fetch_defaults(unity_server_model_name)

    if output_type == 'json':
        return response

    # One row per option: the option name plus its description dictionary,
    # which is then unpacked into columns.
    rows = [{'name': k, '': v} for k, v in response.items()]
    table = _SFrame(rows)
    return table.unpack('X1', column_name_prefix='')\
                .unpack('X1', column_name_prefix='')
def similarity_graph(self, k=5, radius=None, include_self_edges=False,
                     output_type='SGraph', verbose=True):
    """
    Construct the similarity graph on the reference dataset, which is
    already stored in the model. This is conceptually very similar to
    running `query` with the reference set, but this method is optimized
    for the purpose, syntactically simpler, and automatically removes
    self-edges.

    Parameters
    ----------
    k : int, optional
        Maximum number of neighbors to return for each point in the
        dataset. Setting this to ``None`` deactivates the constraint, so
        that all neighbors are returned within ``radius`` of a given point.

    radius : float, optional
        For a given point, only neighbors within this distance are
        returned. The default is ``None``, in which case the ``k`` nearest
        neighbors are returned for each query point, regardless of
        distance.

    include_self_edges : bool, optional
        For most distance functions, each point in the model's reference
        dataset is its own nearest neighbor. If this parameter is set to
        False, this result is ignored, and the nearest neighbors are
        returned *excluding* the point itself.

    output_type : {'SGraph', 'SFrame'}, optional
        By default, the results are returned in the form of an SGraph,
        where each point in the reference dataset is a vertex and an edge A
        -> B indicates that vertex B is a nearest neighbor of vertex A. If
        'output_type' is set to 'SFrame', the output is in the same form as
        the results of the 'query' method: an SFrame with columns
        indicating the query label (in this case the query data is the same
        as the reference data), reference label, distance between the two
        points, and the rank of the neighbor.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : SFrame or SGraph
        The type of the output object depends on the 'output_type'
        parameter. See the parameter description for more detail.

    Notes
    -----
    - If both ``k`` and ``radius`` are set to ``None``, each data point is
      matched to the entire dataset. If the reference dataset has
      :math:`n` rows, the output is an SFrame with :math:`n^2` rows (or an
      SGraph with :math:`n^2` edges).

    - For models created with the 'lsh' method, the output similarity graph
      may have fewer vertices than there are data points in the original
      reference set. Because LSH is an approximate method, a query point
      may have fewer than 'k' neighbors. If LSH returns no neighbors at all
      for a query and self-edges are excluded, the query point is omitted
      from the results.

    Examples
    --------
    First construct an SFrame and create a nearest neighbors model:

    >>> sf = turicreate.SFrame({'x1': [0.98, 0.62, 0.11],
    ...                       'x2': [0.69, 0.58, 0.36]})
    ...
    >>> model = turicreate.nearest_neighbors.create(sf, distance='euclidean')

    Unlike the ``query`` method, there is no need for a second dataset with
    ``similarity_graph``.

    >>> g = model.similarity_graph(k=1)  # an SGraph
    >>> g.edges
    +----------+----------+----------------+------+
    | __src_id | __dst_id |    distance    | rank |
    +----------+----------+----------------+------+
    |    0     |    1     | 0.376430604494 |  1   |
    |    2     |    1     | 0.55542776308  |  1   |
    |    1     |    0     | 0.376430604494 |  1   |
    +----------+----------+----------------+------+
    """

    ## Validate 'k'. A missing value is replaced by the special value -1,
    #  which indicates 'None' to the toolkit call below.
    if k is None:
        k = -1
    elif not isinstance(k, int):
        raise ValueError("Input 'k' must be an integer.")
    elif k <= 0:
        raise ValueError("Input 'k' must be larger than 0.")

    ## Validate 'radius' the same way; its special 'None' value is -1.0.
    if radius is None:
        radius = -1.0
    elif not isinstance(radius, (int, float)):
        raise ValueError("Input 'radius' must be an integer or float.")
    elif radius < 0:
        raise ValueError("Input 'radius' must be non-negative.")

    options = {
        'model': self.__proxy__,
        'model_name': self.__name__,
        'k': k,
        'radius': radius,
        'include_self_edges': include_self_edges,
    }
    outcome = _turicreate.toolkits._main.run(
        '_nearest_neighbors.similarity_graph', options, verbose)
    neighbors = _SFrame(None, _proxy=outcome['neighbors'])

    if output_type == "SFrame":
        return neighbors

    # Any other output_type yields a graph whose edges are the neighbor rows.
    return _SGraph(edges=neighbors, src_field='query_label',
                   dst_field='reference_label')
def query(self, dataset, label=None, k=5, radius=None, verbose=True):
    """
    For each row of the input 'dataset', retrieve the nearest neighbors
    from the model's stored data. In general, the query dataset does not
    need to be the same as the reference data stored in the model. To match
    the reference data against itself, use ``similarity_graph`` instead;
    its 'include_self_edges' parameter controls whether results that match
    points to themselves are excluded.

    Parameters
    ----------
    dataset : SFrame
        Query data. Must contain columns with the same names and types as
        the features used to train the model. Additional columns are
        allowed, but ignored. Please see the nearest neighbors
        :func:`~turicreate.nearest_neighbors.create` documentation for more
        detail on allowable data types.

    label : str, optional
        Name of the query SFrame column with row labels. If 'label' is not
        specified, row numbers are used to identify query dataset rows in
        the output SFrame.

    k : int, optional
        Number of nearest neighbors to return from the reference set for
        each query observation. The default is 5 neighbors, but setting it
        to ``None`` will return all neighbors within ``radius`` of the
        query point.

    radius : float, optional
        Only neighbors whose distance to a query point is smaller than this
        value are returned. The default is ``None``, in which case the
        ``k`` nearest neighbors are returned for each query point,
        regardless of distance.

    verbose: bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : SFrame
        An SFrame with the k-nearest neighbors of each query observation.
        The result contains four columns: the first is the label of the
        query observation, the second is the label of the nearby reference
        observation, the third is the distance between the query and
        reference observations, and the fourth is the rank of the reference
        observation among the query's k-nearest neighbors.

    Raises
    ------
    ValueError
        If 'label' is not a column of 'dataset' or is one of the model's
        features, if 'k' is not a positive integer, or if 'radius' is not a
        non-negative number.
    TypeError
        If the label column does not contain integers or strings.

    See Also
    --------
    similarity_graph

    Notes
    -----
    - The `dataset` input to this method *can* have missing values (in
      contrast to the reference dataset used to create the nearest
      neighbors model). Missing numeric values are imputed to be the mean
      of the corresponding feature in the reference dataset, and missing
      strings are imputed to be empty strings.

    - If both ``k`` and ``radius`` are set to ``None``, each query point
      returns all of the reference set. If the reference dataset has
      :math:`n` rows and the query dataset has :math:`m` rows, the output
      is an SFrame with :math:`nm` rows.

    - For models created with the 'lsh' method, the query results may have
      fewer query labels than input query points. Because LSH is an
      approximate method, a query point may have fewer than 'k' neighbors.
      If LSH returns no neighbors at all for a query, the query point is
      omitted from the results.

    Examples
    --------
    First construct a toy SFrame and create a nearest neighbors model:

    >>> sf = turicreate.SFrame({'label': range(3),
    ...                       'feature1': [0.98, 0.62, 0.11],
    ...                       'feature2': [0.69, 0.58, 0.36]})
    >>> model = turicreate.nearest_neighbors.create(sf, 'label')

    A new SFrame contains query observations with same schema as the
    reference SFrame. This SFrame is passed to the ``query`` method.

    >>> queries = turicreate.SFrame({'label': range(3),
    ...                            'feature1': [0.05, 0.61, 0.99],
    ...                            'feature2': [0.06, 0.97, 0.86]})
    >>> model.query(queries, 'label', k=2)
    +-------------+-----------------+----------------+------+
    | query_label | reference_label |    distance    | rank |
    +-------------+-----------------+----------------+------+
    |      0      |        2        | 0.305941170816 |  1   |
    |      0      |        1        | 0.771556867638 |  2   |
    |      1      |        1        | 0.390128184063 |  1   |
    |      1      |        0        | 0.464004310325 |  2   |
    |      2      |        0        | 0.170293863659 |  1   |
    |      2      |        1        | 0.464004310325 |  2   |
    +-------------+-----------------+----------------+------+
    """

    ## Validate the 'dataset' input
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    ## Get model features
    ref_features = self.features
    sf_features = _tkutl._toolkits_select_columns(dataset, ref_features)

    ## Validate and preprocess the 'label' input
    if label is None:
        # No label column given: identify query rows by their row number.
        query_labels = _turicreate.SArray.from_sequence(len(dataset))

    else:
        if label not in dataset.column_names():
            # 'dataset' holds the query rows, so the message names the
            # query SFrame rather than the model's reference data.
            raise ValueError(
                "Input 'label' must be a string matching the name of a "
                "column in the query SFrame 'dataset'.")

        if dataset[label].dtype not in (str, int):
            raise TypeError(
                "The label column must contain integers or strings.")

        if label in ref_features:
            raise ValueError(
                "The label column cannot be one of the features.")

        query_labels = dataset[label]

    ## Validate neighborhood parameters 'k' and 'radius'
    if k is not None:
        if not isinstance(k, int):
            raise ValueError("Input 'k' must be an integer.")

        if k <= 0:
            raise ValueError("Input 'k' must be larger than 0.")

    if radius is not None:
        if not isinstance(radius, (int, float)):
            raise ValueError("Input 'radius' must be an integer or float.")

        if radius < 0:
            raise ValueError("Input 'radius' must be non-negative.")

    ## Set k and radius to special values to indicate 'None'
    if k is None:
        k = -1

    if radius is None:
        radius = -1.0

    opts = {
        'model': self.__proxy__,
        'model_name': self.__name__,
        'features': sf_features,
        'query_labels': query_labels,
        'k': k,
        'radius': radius,
    }

    result = _turicreate.toolkits._main.run('_nearest_neighbors.query', opts,
                                            verbose)
    return _SFrame(None, _proxy=result['neighbors'])
def _get(self, field):
    """
    Return the value of a given field of the model.

    +-----------------------+----------------------------------------------+
    |      Field            | Description                                  |
    +=======================+==============================================+
    | batch_size            | Number of randomly chosen examples to use in |
    |                       | each training iteration.                     |
    +-----------------------+----------------------------------------------+
    | cluster_id            | Cluster assignment for each data point and   |
    |                       | Euclidean distance to the cluster center     |
    +-----------------------+----------------------------------------------+
    | cluster_info          | Cluster centers, sum of squared Euclidean    |
    |                       | distances from each cluster member to the    |
    |                       | assigned center, and the number of data      |
    |                       | points belonging to the cluster              |
    +-----------------------+----------------------------------------------+
    | features              | Names of feature columns                     |
    +-----------------------+----------------------------------------------+
    | max_iterations        | Maximum number of iterations to perform      |
    +-----------------------+----------------------------------------------+
    | method                | Algorithm used to train the model.           |
    +-----------------------+----------------------------------------------+
    | num_clusters          | Number of clusters                           |
    +-----------------------+----------------------------------------------+
    | num_examples          | Number of examples in the dataset            |
    +-----------------------+----------------------------------------------+
    | num_features          | Number of feature columns used               |
    +-----------------------+----------------------------------------------+
    | num_unpacked_features | Number of features unpacked from the         |
    |                       | feature columns                              |
    +-----------------------+----------------------------------------------+
    | training_iterations   | Total number of iterations performed         |
    +-----------------------+----------------------------------------------+
    | training_time         | Total time taken to cluster the data         |
    +-----------------------+----------------------------------------------+
    | unpacked_features     | Names of features unpacked from the          |
    |                       | feature columns                              |
    +-----------------------+----------------------------------------------+

    Parameters
    ----------
    field : str
        The name of the field to query.

    Returns
    -------
    out
        Value of the requested field. 'cluster_id' and 'cluster_info' are
        returned as SFrames; other fields are returned as produced by the
        toolkit call.
    """
    request = {
        'model': self.__proxy__,
        'model_name': self.__name__,
        'field': field,
    }
    value = _tc.toolkits._main.run('kmeans_get_value', request)['value']

    # cluster_id and cluster_info both come back as a unity SFrame, so they
    # are cast to an SFrame before being handed to the caller.
    if field in ('cluster_id', 'cluster_info'):
        return _SFrame(None, _proxy=value)
    return value