def _get(self, field):
        """
        Return the value of a given field. The list of all queryable fields is
        detailed below, and can be obtained with the
        :py:func:`~TopicModel._list_fields` method.

        +-----------------------+----------------------------------------------+
        |      Field            | Description                                  |
        +=======================+==============================================+
        | topics                | An SFrame containing a column with the unique|
        |                       | words observed during training, and a column |
        |                       | of arrays containing the probability values  |
        |                       | for each word given each of the topics.      |
        +-----------------------+----------------------------------------------+
        | vocabulary            | An SArray containing the words used. This is |
        |                       | same as the vocabulary column in the topics  |
        |                       | field above.                                 |
        +-----------------------+----------------------------------------------+

        Parameters
        ----------
        field : string
            Name of the field to be retrieved.

        Returns
        -------
        out
            Value of the requested field.
        """

        opts = {'model': self.__proxy__, 'field': field}
        response = _turicreate.toolkits._main.run("text_topicmodel_get_value",
                                                  opts)
        if field == 'vocabulary':
            return _SArray(None, _proxy=response['value'])
        elif field == 'topics':
            return _SFrame(None, _proxy=response['value'])
        return response['value']
示例#2
0
    def get_topics(self,
                   topic_ids=None,
                   num_words=5,
                   cdf_cutoff=1.0,
                   output_type='topic_probabilities'):
        """
        Get the words associated with a given topic. The score column is the
        probability of choosing that word given that you have chosen a
        particular topic.

        Parameters
        ----------
        topic_ids : list of int, optional
            The topics to retrieve words. Topic ids are zero-based.
            Throws an error if greater than or equal to m['num_topics'], or
            if the requested topic name is not present.

        num_words : int, optional
            The number of words to show.

        cdf_cutoff : float, optional
            Allows one to only show the most probable words whose cumulative
            probability is below this cutoff. For example if there exist
            three words where

            .. math::
               p(word_1 | topic_k) = .1

               p(word_2 | topic_k) = .2

               p(word_3 | topic_k) = .05

            then setting :math:`cdf_{cutoff}=.3` would return only
            :math:`word_1` and :math:`word_2` since
            :math:`p(word_1 | topic_k) + p(word_2 | topic_k) <= cdf_{cutoff}`

        output_type : {'topic_probabilities' | 'topic_words'}, optional
            Determine the type of desired output. See below.

        Returns
        -------
        out : SFrame
            If output_type is 'topic_probabilities', then the returned value is
            an SFrame with a column of words ranked by a column of scores for
            each topic. Otherwise, the returned value is a SArray where
            each element is a list of the most probable words for each topic.

        Examples
        --------
        Get the highest ranked words for all topics.

        >>> docs = turicreate.SArray('https://static.turi.com/datasets/nips-text')
        >>> m = turicreate.topic_model.create(docs,
                                            num_iterations=50)
        >>> m.get_topics()
        +-------+----------+-----------------+
        | topic |   word   |      score      |
        +-------+----------+-----------------+
        |   0   |   cell   |  0.028974400831 |
        |   0   |  input   | 0.0259470208503 |
        |   0   |  image   | 0.0215721599763 |
        |   0   |  visual  | 0.0173635081992 |
        |   0   |  object  | 0.0172447874156 |
        |   1   | function | 0.0482834508265 |
        |   1   |  input   | 0.0456270024091 |
        |   1   |  point   | 0.0302662839454 |
        |   1   |  result  | 0.0239474934631 |
        |   1   | problem  | 0.0231750116011 |
        |  ...  |   ...    |       ...       |
        +-------+----------+-----------------+

        Get the highest ranked words for topics 0 and 1 and show 15 words per
        topic.

        >>> m.get_topics([0, 1], num_words=15)
        +-------+----------+------------------+
        | topic |   word   |      score       |
        +-------+----------+------------------+
        |   0   |   cell   |  0.028974400831  |
        |   0   |  input   | 0.0259470208503  |
        |   0   |  image   | 0.0215721599763  |
        |   0   |  visual  | 0.0173635081992  |
        |   0   |  object  | 0.0172447874156  |
        |   0   | response | 0.0139740298286  |
        |   0   |  layer   | 0.0122585145062  |
        |   0   | features | 0.0115343177265  |
        |   0   | feature  | 0.0103530459301  |
        |   0   | spatial  | 0.00823387994361 |
        |  ...  |   ...    |       ...        |
        +-------+----------+------------------+

        If one wants to instead just get the top words per topic, one may
        change the format of the output as follows.

        >>> topics = m.get_topics(output_type='topic_words')
        dtype: list
        Rows: 10
        [['cell', 'image', 'input', 'object', 'visual'],
         ['algorithm', 'data', 'learning', 'method', 'set'],
         ['function', 'input', 'point', 'problem', 'result'],
         ['model', 'output', 'pattern', 'set', 'unit'],
         ['action', 'learning', 'net', 'problem', 'system'],
         ['error', 'function', 'network', 'parameter', 'weight'],
         ['information', 'level', 'neural', 'threshold', 'weight'],
         ['control', 'field', 'model', 'network', 'neuron'],
         ['hidden', 'layer', 'system', 'training', 'vector'],
         ['component', 'distribution', 'local', 'model', 'optimal']]
        """
        _check_categorical_option_type('output_type', output_type,
                                       ['topic_probabilities', 'topic_words'])

        if topic_ids is None:
            topic_ids = list(range(self._get('num_topics')))

        assert isinstance(topic_ids, list), \
            "The provided topic_ids is not a list."

        if any([type(x) == str for x in topic_ids]):
            raise ValueError(
                "Only integer topic_ids can be used at this point in time.")
        if not all([x >= 0 and x < self.num_topics for x in topic_ids]):
            raise ValueError("Topic id values must be non-negative and less than the " + \
                "number of topics used to fit the model.")

        opts = {
            'model': self.__proxy__,
            'topic_ids': topic_ids,
            'num_words': num_words,
            'cdf_cutoff': cdf_cutoff
        }
        response = _turicreate.extensions._text.topicmodel_get_topic(opts)
        ret = response['top_words']

        def sort_wordlist_by_prob(z):
            words = sorted(z.items(),
                           key=_operator.itemgetter(1),
                           reverse=True)
            return [word for (word, prob) in words]

        if output_type != 'topic_probabilities':
            ret = ret.groupby(
                'topic',
                {'word': _turicreate.aggregate.CONCAT('word', 'score')})
            words = ret.sort('topic')['word'].apply(sort_wordlist_by_prob)
            ret = _SFrame({'words': words})

        return ret
示例#3
0
    def get_default_options_for_model(output_type='sframe'):
        """
        Get the default options for the toolkit
        :class:`~turicreate.{module_name}.{python_class_name}`.

        Parameters
        ----------
        output_type : str, optional

            The output can be of the following types.

            - `sframe`: A table description each option used in the model.
            - `json`: A list of option dictionaries suitable for JSON serialization.

            | Each dictionary/row in the dictionary/SFrame object describes the
              following parameters of the given model.

            +------------------+-------------------------------------------------------+
            |      Name        |                  Description                          |
            +==================+=======================================================+
            | name             | Name of the option used in the model.                 |
            +------------------+---------+---------------------------------------------+
            | description      | A detailed description of the option used.            |
            +------------------+-------------------------------------------------------+
            | type             | Option type (REAL, BOOL, INTEGER or CATEGORICAL)      |
            +------------------+-------------------------------------------------------+
            | default_value    | The default value for the option.                     |
            +------------------+-------------------------------------------------------+
            | possible_values  | List of acceptable values (CATEGORICAL only)          |
            +------------------+-------------------------------------------------------+
            | lower_bound      | Smallest acceptable value for this option (REAL only) |
            +------------------+-------------------------------------------------------+
            | upper_bound      | Largest acceptable value for this option (REAL only)  |
            +------------------+-------------------------------------------------------+

        Returns
        -------
        out : dict/SFrame

        See Also
        --------
        turicreate.{module_name}.{python_class_name}.get_current_options

        Examples
        --------
        .. sourcecode:: python

          >>> import turicreate

          # SFrame formatted output.
          >>> out_sframe = turicreate.{module_name}.get_default_options()

          # dict formatted output suitable for JSON serialization.
          >>> out_json = turicreate.{module_name}.get_default_options('json')
        """
        if sdk_model:
            response = _tc.extensions._toolkits_sdk_get_default_options(
                unity_server_model_name)
        else:
            response = _tc.extensions._toolkits_get_default_options(
                unity_server_model_name)

        if output_type == 'json':
            return response
        else:
            json_list = [{'name': k, '': v} for k, v in response.items()]
            return _SFrame(json_list).unpack('X1', column_name_prefix='')\
                                     .unpack('X1', column_name_prefix='')
示例#4
0
    def similarity_graph(self,
                         k=5,
                         radius=None,
                         include_self_edges=False,
                         output_type='SGraph',
                         verbose=True):
        """
        Construct the similarity graph on the reference dataset, which is
        already stored in the model. This is conceptually very similar to
        running `query` with the reference set, but this method is optimized
        for the purpose, syntactically simpler, and automatically removes
        self-edges.

        Parameters
        ----------
        k : int, optional
            Maximum number of neighbors to return for each point in the
            dataset. Setting this to ``None`` deactivates the constraint, so
            that all neighbors are returned within ``radius`` of a given point.

        radius : float, optional
            For a given point, only neighbors within this distance are
            returned. The default is ``None``, in which case the ``k`` nearest
            neighbors are returned for each query point, regardless of
            distance.

        include_self_edges : bool, optional
            For most distance functions, each point in the model's reference
            dataset is its own nearest neighbor. If this parameter is set to
            False, this result is ignored, and the nearest neighbors are
            returned *excluding* the point itself.

        output_type : {'SGraph', 'SFrame'}, optional
            By default, the results are returned in the form of an SGraph,
            where each point in the reference dataset is a vertex and an edge A
            -> B indicates that vertex B is a nearest neighbor of vertex A. If
            'output_type' is set to 'SFrame', the output is in the same form as
            the results of the 'query' method: an SFrame with columns
            indicating the query label (in this case the query data is the same
            as the reference data), reference label, distance between the two
            points, and the rank of the neighbor.

        verbose : bool, optional
            If True, print progress updates and model details.

        Returns
        -------
        out : SFrame or SGraph
            The type of the output object depends on the 'output_type'
            parameter. See the parameter description for more detail.

        Notes
        -----
        - If both ``k`` and ``radius`` are set to ``None``, each data point is
          matched to the entire dataset. If the reference dataset has
          :math:`n` rows, the output is an SFrame with :math:`n^2` rows (or an
          SGraph with :math:`n^2` edges).

        - For models created with the 'lsh' method, the output similarity graph
          may have fewer vertices than there are data points in the original
          reference set. Because LSH is an approximate method, a query point
          may have fewer than 'k' neighbors. If LSH returns no neighbors at all
          for a query and self-edges are excluded, the query point is omitted
          from the results.

        Examples
        --------
        First construct an SFrame and create a nearest neighbors model:

        >>> sf = turicreate.SFrame({'x1': [0.98, 0.62, 0.11],
        ...                       'x2': [0.69, 0.58, 0.36]})
        ...
        >>> model = turicreate.nearest_neighbors.create(sf, distance='euclidean')

        Unlike the ``query`` method, there is no need for a second dataset with
        ``similarity_graph``.

        >>> g = model.similarity_graph(k=1)  # an SGraph
        >>> g.edges
        +----------+----------+----------------+------+
        | __src_id | __dst_id |    distance    | rank |
        +----------+----------+----------------+------+
        |    0     |    1     | 0.376430604494 |  1   |
        |    2     |    1     | 0.55542776308  |  1   |
        |    1     |    0     | 0.376430604494 |  1   |
        +----------+----------+----------------+------+
        """
        ## Validate inputs.
        if k is not None:
            if not isinstance(k, int):
                raise ValueError("Input 'k' must be an integer.")

            if k <= 0:
                raise ValueError("Input 'k' must be larger than 0.")

        if radius is not None:
            if not isinstance(radius, (int, float)):
                raise ValueError("Input 'radius' must be an integer or float.")

            if radius < 0:
                raise ValueError("Input 'radius' must be non-negative.")

        ## Set k and radius to special values to indicate 'None'
        if k is None:
            k = -1

        if radius is None:
            radius = -1.0

        opts = {
            'model': self.__proxy__,
            'model_name': self.__name__,
            'k': k,
            'radius': radius,
            'include_self_edges': include_self_edges
        }

        result = _turicreate.toolkits._main.run(
            '_nearest_neighbors.similarity_graph', opts, verbose)

        knn = _SFrame(None, _proxy=result['neighbors'])

        if output_type == "SFrame":
            return knn

        else:
            sg = _SGraph(edges=knn,
                         src_field='query_label',
                         dst_field='reference_label')
            return sg
示例#5
0
    def query(self, dataset, label=None, k=5, radius=None, verbose=True):
        """
        For each row of the input 'dataset', retrieve the nearest neighbors
        from the model's stored data. In general, the query dataset does not
        need to be the same as the reference data stored in the model, but if
        it is, the 'include_self_edges' parameter can be set to False to
        exclude results that match query points to themselves.

        Parameters
        ----------
        dataset : SFrame
            Query data. Must contain columns with the same names and types as
            the features used to train the model. Additional columns are
            allowed, but ignored. Please see the nearest neighbors
            :func:`~turicreate.nearest_neighbors.create` documentation for more
            detail on allowable data types.

        label : str, optional
            Name of the query SFrame column with row labels. If 'label' is not
            specified, row numbers are used to identify query dataset rows in
            the output SFrame.

        k : int, optional
            Number of nearest neighbors to return from the reference set for
            each query observation. The default is 5 neighbors, but setting it
            to ``None`` will return all neighbors within ``radius`` of the
            query point.

        radius : float, optional
            Only neighbors whose distance to a query point is smaller than this
            value are returned. The default is ``None``, in which case the
            ``k`` nearest neighbors are returned for each query point,
            regardless of distance.

        verbose: bool, optional
            If True, print progress updates and model details.

        Returns
        -------
        out : SFrame
            An SFrame with the k-nearest neighbors of each query observation.
            The result contains four columns: the first is the label of the
            query observation, the second is the label of the nearby reference
            observation, the third is the distance between the query and
            reference observations, and the fourth is the rank of the reference
            observation among the query's k-nearest neighbors.

        See Also
        --------
        similarity_graph

        Notes
        -----
        - The `dataset` input to this method *can* have missing values (in
          contrast to the reference dataset used to create the nearest
          neighbors model). Missing numeric values are imputed to be the mean
          of the corresponding feature in the reference dataset, and missing
          strings are imputed to be empty strings.

        - If both ``k`` and ``radius`` are set to ``None``, each query point
          returns all of the reference set. If the reference dataset has
          :math:`n` rows and the query dataset has :math:`m` rows, the output
          is an SFrame with :math:`nm` rows.

        - For models created with the 'lsh' method, the query results may have
          fewer query labels than input query points. Because LSH is an
          approximate method, a query point may have fewer than 'k' neighbors.
          If LSH returns no neighbors at all for a query, the query point is
          omitted from the results.

        Examples
        --------
        First construct a toy SFrame and create a nearest neighbors model:

        >>> sf = turicreate.SFrame({'label': range(3),
        ...                       'feature1': [0.98, 0.62, 0.11],
        ...                       'feature2': [0.69, 0.58, 0.36]})
        >>> model = turicreate.nearest_neighbors.create(sf, 'label')

        A new SFrame contains query observations with same schema as the
        reference SFrame. This SFrame is passed to the ``query`` method.

        >>> queries = turicreate.SFrame({'label': range(3),
        ...                            'feature1': [0.05, 0.61, 0.99],
        ...                            'feature2': [0.06, 0.97, 0.86]})
        >>> model.query(queries, 'label', k=2)
        +-------------+-----------------+----------------+------+
        | query_label | reference_label |    distance    | rank |
        +-------------+-----------------+----------------+------+
        |      0      |        2        | 0.305941170816 |  1   |
        |      0      |        1        | 0.771556867638 |  2   |
        |      1      |        1        | 0.390128184063 |  1   |
        |      1      |        0        | 0.464004310325 |  2   |
        |      2      |        0        | 0.170293863659 |  1   |
        |      2      |        1        | 0.464004310325 |  2   |
        +-------------+-----------------+----------------+------+
        """

        ## Validate the 'dataset' input
        _tkutl._raise_error_if_not_sframe(dataset, "dataset")
        _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

        ## Get model features
        ref_features = self.features
        sf_features = _tkutl._toolkits_select_columns(dataset, ref_features)

        ## Validate and preprocess the 'label' input
        if label is None:
            query_labels = _turicreate.SArray.from_sequence(len(dataset))

        else:
            if not label in dataset.column_names():
                raise ValueError(
                    "Input 'label' must be a string matching the name of a " +\
                    "column in the reference SFrame 'dataset'.")

            if not dataset[label].dtype == str and not dataset[
                    label].dtype == int:
                raise TypeError(
                    "The label column must contain integers or strings.")

            if label in ref_features:
                raise ValueError(
                    "The label column cannot be one of the features.")

            query_labels = dataset[label]

        ## Validate neighborhood parameters 'k' and 'radius'
        if k is not None:
            if not isinstance(k, int):
                raise ValueError("Input 'k' must be an integer.")

            if k <= 0:
                raise ValueError("Input 'k' must be larger than 0.")

        if radius is not None:
            if not isinstance(radius, (int, float)):
                raise ValueError("Input 'radius' must be an integer or float.")

            if radius < 0:
                raise ValueError("Input 'radius' must be non-negative.")

        ## Set k and radius to special values to indicate 'None'
        if k is None:
            k = -1

        if radius is None:
            radius = -1.0

        opts = {
            'model': self.__proxy__,
            'model_name': self.__name__,
            'features': sf_features,
            'query_labels': query_labels,
            'k': k,
            'radius': radius
        }

        result = _turicreate.toolkits._main.run('_nearest_neighbors.query',
                                                opts, verbose)
        return _SFrame(None, _proxy=result['neighbors'])
示例#6
0
    def _get(self, field):
        """
        Return the value of a given field.

        +-----------------------+----------------------------------------------+
        |      Field            | Description                                  |
        +=======================+==============================================+
        | batch_size            | Number of randomly chosen examples to use in |
        |                       | each training iteration.                     |
        +-----------------------+----------------------------------------------+
        | cluster_id            | Cluster assignment for each data point and   |
        |                       | Euclidean distance to the cluster center     |
        +-----------------------+----------------------------------------------+
        | cluster_info          | Cluster centers, sum of squared Euclidean    |
        |                       | distances from each cluster member to the    |
        |                       | assigned center, and the number of data      |
        |                       | points belonging to the cluster              |
        +-----------------------+----------------------------------------------+
        | features              | Names of feature columns                     |
        +-----------------------+----------------------------------------------+
        | max_iterations        | Maximum number of iterations to perform      |
        +-----------------------+----------------------------------------------+
        | method                | Algorithm used to train the model.           |
        +-----------------------+----------------------------------------------+
        | num_clusters          | Number of clusters                           |
        +-----------------------+----------------------------------------------+
        | num_examples          | Number of examples in the dataset            |
        +-----------------------+----------------------------------------------+
        | num_features          | Number of feature columns used               |
        +-----------------------+----------------------------------------------+
        | num_unpacked_features | Number of features unpacked from the         |
        |                       | feature columns                              |
        +-----------------------+----------------------------------------------+
        | training_iterations   | Total number of iterations performed         |
        +-----------------------+----------------------------------------------+
        | training_time         | Total time taken to cluster the data         |
        +-----------------------+----------------------------------------------+
        | unpacked_features     | Names of features unpacked from the          |
        |                       | feature columns                              |
        +-----------------------+----------------------------------------------+

        Parameters
        ----------
        field : str
            The name of the field to query.

        Returns
        -------
        out
            Value of the requested field
        """
        opts = {'model': self.__proxy__,
                'model_name': self.__name__,
                'field': field}
        response = _tc.toolkits._main.run('kmeans_get_value', opts)

        # cluster_id and cluster_info both return a unity SFrame. Cast to an SFrame.
        if field == 'cluster_id' or field == 'cluster_info':
            return _SFrame(None, _proxy=response['value'])
        else:
            return response['value']