示例#1
0
def create(graph, verbose=True):
    """
    Count, for every vertex, the triangles it takes part in (edge directions
    are ignored). A triangle is a complete subgraph on exactly three
    vertices. The returned model holds the total number of triangles along
    with the per-vertex triangle counts.

    Parameters
    ----------
    graph : SGraph
        The graph on which to compute triangle counts.

    verbose : bool, optional
        If True, print progress updates.

    Returns
    -------
    out : TriangleCountingModel

    Raises
    ------
    TypeError
        If ``graph`` is not an SGraph.

    References
    ----------
    - T. Schank. (2007) `Algorithmic Aspects of Triangle-Based Network Analysis
      <http://digbib.ubka.uni-karlsruhe.de/volltexte/documents/4541>`_.

    Examples
    --------
    If given an :class:`~turicreate.SGraph` ``g``, we can create a
    :class:`~turicreate.triangle_counting.TriangleCountingModel` as follows:

    >>> g = turicreate.load_sgraph('http://snap.stanford.edu/data/email-Enron.txt.gz',
    ...                            format='snap')
    >>> tc = turicreate.triangle_counting.create(g)

    The number of triangles each vertex of ``g`` belongs to is available as:

    >>> tc_out = tc['triangle_count']  # SFrame

    The new "triangle_count" field can be attached to the original graph g:

    >>> g.vertices['triangle_count'] = tc['graph'].vertices['triangle_count']

    Note that the task above does not require a join because the vertex
    ordering is preserved through ``create()``.

    See Also
    --------
    TriangleCountingModel
    """
    from turicreate._cython.cy_server import QuietProgress

    # Reject anything that is not an SGraph before touching the toolkit.
    if not isinstance(graph, _SGraph):
        raise TypeError("graph input must be a SGraph object.")

    # The toolkit call runs inside QuietProgress, which receives `verbose`.
    with QuietProgress(verbose):
        toolkit = _tc.extensions._toolkits.graph.triangle_counting
        response = toolkit.create({"graph": graph.__proxy__})

    # The toolkit response carries the model proxy under the "model" key.
    return TriangleCountingModel(response["model"])
示例#2
0
def create(graph, verbose=True):
    """
    Compute a coloring of the graph: every vertex receives a color such that
    no two adjacent vertices share one. The returned model holds the total
    number of colors used and the color ID assigned to each vertex.

    The algorithm is greedy and is not guaranteed to find the **minimum**
    graph coloring. It is also not deterministic, so successive runs may
    return different answers.

    Parameters
    ----------
    graph : SGraph
        The graph on which to compute the coloring.

    verbose : bool, optional
        If True, print progress updates.

    Returns
    -------
    out : GraphColoringModel

    Raises
    ------
    TypeError
        If ``graph`` is not an SGraph.

    References
    ----------
    - `Wikipedia - graph coloring <http://en.wikipedia.org/wiki/Graph_coloring>`_

    Examples
    --------
    If given an :class:`~turicreate.SGraph` ``g``, we can create
    a :class:`~turicreate.graph_coloring.GraphColoringModel` as follows:

    >>> g = turicreate.load_sgraph('http://snap.stanford.edu/data/email-Enron.txt.gz', format='snap')
    >>> gc = turicreate.graph_coloring.create(g)

    The ``color id`` assigned to each vertex of ``g`` is available as:

    >>> color_id = gc['color_id']  # SFrame

    The total number of colors needed to color ``g`` is available as:

    >>> num_colors = gc['num_colors']

    See Also
    --------
    GraphColoringModel
    """
    from turicreate._cython.cy_server import QuietProgress

    # Reject anything that is not an SGraph before touching the toolkit.
    if not isinstance(graph, _SGraph):
        raise TypeError('graph input must be a SGraph object.')

    # The toolkit call runs inside QuietProgress, which receives `verbose`.
    with QuietProgress(verbose):
        toolkit = _tc.extensions._toolkits.graph.graph_coloring
        response = toolkit.create({'graph': graph.__proxy__})

    # The toolkit response carries the model proxy under the 'model' key.
    return GraphColoringModel(response['model'])
示例#3
0
    def query(self, dataset, label=None, k=5, radius=None, verbose=True):
        """
        Retrieve, for each row of the input 'dataset', its nearest neighbors
        among the data stored in the model. The query dataset does not need
        to be the same as the reference data stored in the model.

        Parameters
        ----------
        dataset : SFrame
            Query data. Must contain columns with the same names and types as
            the features used to train the model. Additional columns are
            allowed, but ignored. Please see the nearest neighbors
            :func:`~turicreate.nearest_neighbors.create` documentation for more
            detail on allowable data types.

        label : str, optional
            Name of the query SFrame column with row labels. The column must
            hold integers or strings and must not be one of the model's
            features. If 'label' is not specified, row numbers are used to
            identify query dataset rows in the output SFrame.

        k : int, optional
            Number of nearest neighbors to return from the reference set for
            each query observation. Must be a positive integer. The default is
            5 neighbors, but setting it to ``None`` will return all neighbors
            within ``radius`` of the query point.

        radius : float, optional
            Only neighbors whose distance to a query point is smaller than this
            value are returned. Must be a non-negative int or float. The
            default is ``None``, in which case the ``k`` nearest neighbors are
            returned for each query point, regardless of distance.

        verbose: bool, optional
            If True, print progress updates and model details.

        Returns
        -------
        out : SFrame
            An SFrame with the k-nearest neighbors of each query observation.
            The result contains four columns: the first is the label of the
            query observation, the second is the label of the nearby reference
            observation, the third is the distance between the query and
            reference observations, and the fourth is the rank of the reference
            observation among the query's k-nearest neighbors.

        Raises
        ------
        ValueError
            If 'label' is not a column of 'dataset' or is one of the model's
            features, if 'k' is not a positive integer, or if 'radius' is not
            a non-negative number.

        TypeError
            If the label column holds neither integers nor strings.

        See Also
        --------
        similarity_graph

        Notes
        -----
        - The `dataset` input to this method *can* have missing values (in
          contrast to the reference dataset used to create the nearest
          neighbors model). Missing numeric values are imputed to be the mean
          of the corresponding feature in the reference dataset, and missing
          strings are imputed to be empty strings.

        - If both ``k`` and ``radius`` are set to ``None``, each query point
          returns all of the reference set. If the reference dataset has
          :math:`n` rows and the query dataset has :math:`m` rows, the output
          is an SFrame with :math:`nm` rows.

        - For models created with the 'lsh' method, the query results may have
          fewer query labels than input query points. Because LSH is an
          approximate method, a query point may have fewer than 'k' neighbors.
          If LSH returns no neighbors at all for a query, the query point is
          omitted from the results.

        Examples
        --------
        First construct a toy SFrame and create a nearest neighbors model:

        >>> sf = turicreate.SFrame({'label': range(3),
        ...                       'feature1': [0.98, 0.62, 0.11],
        ...                       'feature2': [0.69, 0.58, 0.36]})
        >>> model = turicreate.nearest_neighbors.create(sf, 'label')

        A new SFrame contains query observations with same schema as the
        reference SFrame. This SFrame is passed to the ``query`` method.

        >>> queries = turicreate.SFrame({'label': range(3),
        ...                            'feature1': [0.05, 0.61, 0.99],
        ...                            'feature2': [0.06, 0.97, 0.86]})
        >>> model.query(queries, 'label', k=2)
        +-------------+-----------------+----------------+------+
        | query_label | reference_label |    distance    | rank |
        +-------------+-----------------+----------------+------+
        |      0      |        2        | 0.305941170816 |  1   |
        |      0      |        1        | 0.771556867638 |  2   |
        |      1      |        1        | 0.390128184063 |  1   |
        |      1      |        0        | 0.464004310325 |  2   |
        |      2      |        0        | 0.170293863659 |  1   |
        |      2      |        1        | 0.464004310325 |  2   |
        +-------------+-----------------+----------------+------+
        """
        # NOTE(review): earlier documentation for this method referred to an
        # 'include_self_edges' parameter; this signature accepts no such
        # argument. Presumably it belongs to `similarity_graph` -- confirm.

        ## The query frame must be a non-empty SFrame
        _tkutl._raise_error_if_not_sframe(dataset, "dataset")
        _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

        ## Restrict the query frame to the model's feature columns
        model_features = self.features
        query_features = _tkutl._toolkits_select_columns(dataset,
                                                         model_features)

        ## Resolve the row labels: a validated user column, or row numbers
        if label is not None:
            if label not in dataset.column_names():
                raise ValueError(
                    "Input 'label' must be a string matching the name of a "
                    "column in the reference SFrame 'dataset'.")

            label_dtype = dataset[label].dtype
            if not (label_dtype == str or label_dtype == int):
                raise TypeError(
                    "The label column must contain integers or strings.")

            if label in model_features:
                raise ValueError(
                    "The label column cannot be one of the features.")

            query_labels = dataset[label]
        else:
            query_labels = _turicreate.SArray.from_sequence(len(dataset))

        ## Validate 'k'; ``None`` is passed on as the sentinel -1
        if k is None:
            k = -1
        elif not isinstance(k, int):
            raise ValueError("Input 'k' must be an integer.")
        elif k <= 0:
            raise ValueError("Input 'k' must be larger than 0.")

        ## Validate 'radius'; ``None`` is passed on as the sentinel -1.0
        if radius is None:
            radius = -1.0
        elif not isinstance(radius, (int, float)):
            raise ValueError("Input 'radius' must be an integer or float.")
        elif radius < 0:
            raise ValueError("Input 'radius' must be non-negative.")

        query_opts = {
            'model': self.__proxy__,
            'model_name': self.__name__,
            'features': query_features,
            'query_labels': query_labels,
            'k': k,
            'radius': radius
        }

        with QuietProgress(verbose):
            response = _turicreate.extensions._nearest_neighbors.query(
                query_opts)

        return response['neighbors']
示例#4
0
def create(dataset, target, model_name, features=None,
           validation_set='auto', distributed='auto',
           verbose=True, seed=None, **kwargs):
    """
    Create a :class:`~turicreate.toolkits.SupervisedLearningModel`.

    This is a generic function that can create any model implementing
    SupervisedLearningModel. It is normally not called directly; call the
    specific model's create function instead.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    target : string
        Name of the column containing the target variable. The values in this
        column must be 0 or 1, of integer type.

    model_name : string
        Name of the model. It is looked up as an attribute of
        ``turicreate.extensions`` and called to build the model instance.

    features : list[string], optional
        List of feature names used by feature column

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance.
        For each row of the progress table, the chosen metrics are computed
        for both the provided training dataset and the validation_set. The
        format of this SFrame must be the same as the training set.
        By default this argument is set to 'auto': when the training data has
        at least 100 rows, it is split 95%/5% and the smaller part is used as
        the validation set; with fewer rows an empty SFrame is used. If
        validation_set is set to None, then no additional metrics
        are computed. The default value is 'auto'.

    distributed: env
        The distributed environment. This argument is not referenced in the
        body of this function.

    verbose : boolean
        Whether to print out messages during training.

    seed : int, optional
        Seed for random number generation. Set this value to ensure that the
        same model is created every time.

    kwargs : dict
        Additional parameter options that can be passed. Option names are
        lower-cased before being handed to the model.

    Returns
    -------
    out : SupervisedLearningModel
        Wrapper around the trained model instance and ``model_name``.
    """

    # Error-check the inputs and restrict both frames to the requested columns
    dataset, validation_set = _validate_data(dataset, target, features,
                                             validation_set)

    # Settle on the validation data: none, or a sample of the training data
    if validation_set is None:
        validation_set = _turicreate.SFrame()
    elif isinstance(validation_set, str):
        assert validation_set == 'auto'
        if dataset.num_rows() < 100:
            # Too few rows to hold any out; train without validation data
            validation_set = _turicreate.SFrame()
        else:
            if verbose:
                print_validation_track_notification()
            dataset, validation_set = dataset.random_split(.95, seed=seed, exact=True)

    # Normalize model-specific option names to lower case
    options = {name.lower(): value for name, value in kwargs.items()}

    # Instantiate the requested model and train it
    trainer = getattr(_turicreate.extensions, model_name)()
    with QuietProgress(verbose):
        trainer.train(dataset, target, validation_set, options)

    return SupervisedLearningModel(trainer, model_name)
def create(observation_data,
           user_id='user_id',
           item_id='item_id',
           target=None,
           user_data=None,
           item_data=None,
           num_factors=32,
           regularization=1e-9,
           linear_regularization=1e-9,
           side_data_factorization=True,
           ranking_regularization=0.25,
           unobserved_rating_value=None,
           num_sampled_negative_examples=4,
           max_iterations=25,
           sgd_step_size=0,
           random_seed=0,
           binary_target=False,
           solver='auto',
           verbose=True,
           **kwargs):
    """Create a RankingFactorizationRecommender that learns latent factors for each
    user and item and uses them to make rating predictions.

    Parameters
    ----------
    observation_data : SFrame
        The dataset to use for training the model. It must contain a column of
        user ids and a column of item ids. Each row represents an observed
        interaction between the user and the item.  The (user, item) pairs
        are stored with the model so that they can later be excluded from
        recommendations if desired. It can optionally contain a target ratings
        column. All other columns are interpreted by the underlying model as
        side features for the observations.

        The user id and item id columns must be of type 'int' or 'str'. The
        target column must be of type 'int' or 'float'.

    user_id : string, optional
        The name of the column in `observation_data` that corresponds to the
        user id.

    item_id : string, optional
        The name of the column in `observation_data` that corresponds to the
        item id.

    target : string, optional
        The `observation_data` can optionally contain a column of scores
        representing ratings given by the users. If present, the name of this
        column may be specified with the `target` argument. When `target` is
        None, `binary_target` is forced to True.

    user_data : SFrame, optional
        Side information for the users.  This SFrame must have a column with
        the same name as what is specified by the `user_id` input parameter.
        `user_data` can provide any amount of additional user-specific
        information.

    item_data : SFrame, optional
        Side information for the items.  This SFrame must have a column with
        the same name as what is specified by the `item_id` input parameter.
        `item_data` can provide any amount of additional item-specific
        information.

    num_factors : int, optional
        Number of latent factors. Default: 32.

    regularization : float, optional
        L2 regularization for interaction terms. Default: 1e-9; a typical range
        for this parameter is between 1e-12 and 1. Setting this to 0 may cause
        numerical issues.

    linear_regularization : float, optional
        L2 regularization for linear term. Default: 1e-9; a typical range for this
        parameter is between 1e-12 and 1. Setting this to 0 may cause numerical issues.

    side_data_factorization : boolean, optional
        Use factorization for modeling any additional features beyond the user
        and item columns. If True, and side features or any additional columns are
        present, then a Factorization Machine model is trained. Otherwise, only
        the linear terms are fit to these features.  See
        :class:`turicreate.recommender.ranking_factorization_recommender.RankingFactorizationRecommender`
        for more information. Default: True.

    ranking_regularization : float, optional
        Penalize the predicted value of user-item pairs not in the
        training set. Larger values increase this penalization.
        Suggested values: 0, 0.1, 0.5, 1.  NOTE: if no target column
        is present, this parameter is ignored. Default: 0.25.

    unobserved_rating_value : float, optional
        Penalize unobserved items with a larger predicted score than this value.
        By default, the estimated 5% quantile is used (mean - 1.96*std_dev).
        The option is only forwarded to the model when it is not None.

    num_sampled_negative_examples : integer, optional
        For each (user, item) pair in the data, the ranking sgd solver evaluates
        this many randomly chosen unseen items for the negative example step.
        Increasing this can give better performance at the expense of speed,
        particularly when the number of items is large.  Default is 4.

    binary_target : boolean, optional
        Assume the target column is composed of 0's and 1's. If True, use
        logistic loss to fit the model.

    max_iterations : int, optional
        The training algorithm will make at most this many iterations through
        the observed data. Default: 25.

    sgd_step_size : float, optional
        Step size for stochastic gradient descent. Smaller values generally
        lead to more accurate models that take more time to train. The
        default setting of 0 means that the step size is chosen by trying
        several options on a small subset of the data.

    random_seed :  int, optional
        The random seed used to choose the initial starting point for
        model training. Note that some randomness in the training is
        unavoidable, so models trained with the same random seed may still
        differ. Default: 0.

    solver : string, optional
        Name of the solver to be used to solve the regression. See the
        references for more detail on each solver. The available solvers for
        this model are:

        - *auto (default)*: automatically chooses the best solver for the data
                              and model parameters.
        - *ials*:           Implicit Alternating Least Squares [1].
        - *adagrad*:        Adaptive Gradient Stochastic Gradient Descent.
        - *sgd*:            Stochastic Gradient Descent

    verbose : bool, optional
        Enables verbose output.

    kwargs : optional
        Optional advanced keyword arguments passed in to the model
        optimization procedure. These parameters do not typically
        need to be changed. Names that are not among the model's default
        options are rejected with a TypeError.

    Returns
    -------
    out : RankingFactorizationRecommender
        Wrapper around the trained model proxy.

    Raises
    ------
    TypeError
        If `observation_data` is not an SFrame, or if `kwargs` contains a
        name that is not a recognized option.

    Examples
    --------
    **Basic usage**

    When given just user and item pairs, one can create a RankingFactorizationRecommender
    as follows.

    >>> sf = turicreate.SFrame({'user_id': ["0", "0", "0", "1", "1", "2", "2", "2"],
    ...                       'item_id': ["a", "b", "c", "a", "b", "b", "c", "d"]})
    >>> from turicreate.recommender import ranking_factorization_recommender
    >>> m1 = ranking_factorization_recommender.create(sf)

    When a target column is present, one can include this to try and recommend
    items that are rated highly.

    >>> sf = turicreate.SFrame({'user_id': ["0", "0", "0", "1", "1", "2", "2", "2"],
    ...                       'item_id': ["a", "b", "c", "a", "b", "b", "c", "d"],
    ...                       'rating': [1, 3, 2, 5, 4, 1, 4, 3]})

    >>> m1 = ranking_factorization_recommender.create(sf, target='rating')


    **Including side features**

    >>> user_info = turicreate.SFrame({'user_id': ["0", "1", "2"],
    ...                              'name': ["Alice", "Bob", "Charlie"],
    ...                              'numeric_feature': [0.1, 12, 22]})
    >>> item_info = turicreate.SFrame({'item_id': ["a", "b", "c", "d"],
    ...                              'name': ["item1", "item2", "item3", "item4"],
    ...                              'dict_feature': [{'a' : 23}, {'a' : 13},
    ...                                               {'b' : 1},
    ...                                               {'a' : 23, 'b' : 32}]})
    >>> m2 = ranking_factorization_recommender.create(sf, target='rating',
    ...                                               user_data=user_info,
    ...                                               item_data=item_info)

    **Customizing ranking regularization**

    Create a model that pushes predicted ratings of unobserved user-item
    pairs toward 1 or below.

    >>> m3 = ranking_factorization_recommender.create(sf, target='rating',
    ...                                               ranking_regularization = 0.1,
    ...                                               unobserved_rating_value = 1)

    **Using the implicit alternating least squares model**

    Ranking factorization also implements implicit alternating least squares [1] as
    an alternative solver.  This is enabled using ``solver = 'ials'``.

    >>> m3 = ranking_factorization_recommender.create(sf, target='rating',
    ...                                               solver = 'ials')

    See Also
    --------
    :class:`turicreate.recommender.factorization_recommender.FactorizationRecommender`,
    :class:`turicreate.recommender.ranking_factorization_recommender.RankingFactorizationRecommender`

    References
    ----------

    [1] Collaborative Filtering for Implicit Feedback Datasets Hu, Y.; Koren,
        Y.; Volinsky, C. IEEE International Conference on Data Mining
        (ICDM 2008), IEEE (2008).

    """
    from turicreate._cython.cy_server import QuietProgress
    if not (isinstance(observation_data, _SFrame)):
        raise TypeError('observation_data input must be a SFrame')

    # The model proxy is initialized with an empty option set; the actual
    # training options are assembled below and handed to train().
    model_proxy = _turicreate.extensions.ranking_factorization_recommender()
    model_proxy.init_options({})

    # Missing side data is represented by empty SFrames.
    if user_data is None:
        user_data = _turicreate.SFrame()
    if item_data is None:
        item_data = _turicreate.SFrame()

    # Without a target column the binary-target setting is always switched on.
    if target is None:
        binary_target = True

    opts = {
        'user_id': user_id,
        'item_id': item_id,
        'target': target,
        'random_seed': random_seed,
        'num_factors': num_factors,
        'regularization': regularization,
        'linear_regularization': linear_regularization,
        'ranking_regularization': ranking_regularization,
        'binary_target': binary_target,
        'max_iterations': max_iterations,
        'side_data_factorization': side_data_factorization,
        'num_sampled_negative_examples': num_sampled_negative_examples,
        'solver': solver,

        # Has no effect here.
        'sgd_step_size': sgd_step_size
    }

    # Only forwarded when given, so the model's own default applies otherwise.
    if unobserved_rating_value is not None:
        opts["unobserved_rating_value"] = unobserved_rating_value

    if kwargs:
        # If the default options cannot be read, no extra keyword is
        # considered valid and every name in kwargs is rejected.
        try:
            possible_args = set(_get_default_options()["name"])
        except (RuntimeError, KeyError):
            possible_args = set()

        bad_arguments = set(kwargs.keys()).difference(possible_args)
        if bad_arguments:
            # Sorted so the error message is the same on every run.
            raise TypeError("Bad Keyword Arguments: " +
                            ', '.join(sorted(bad_arguments)))

        opts.update(kwargs)

    extra_data = {"nearest_items": _turicreate.SFrame()}
    with QuietProgress(verbose):
        model_proxy.train(observation_data, user_data, item_data, opts,
                          extra_data)

    return RankingFactorizationRecommender(model_proxy)
示例#6
0
def create(
    graph,
    reset_probability=0.15,
    threshold=1e-2,
    max_iterations=20,
    _single_precision=False,
    _distributed="auto",
    verbose=True,
):
    """
    Compute the PageRank of every vertex in the graph. The returned model
    holds the total PageRank together with the PageRank value of each vertex.

    Parameters
    ----------
    graph : SGraph
        The graph on which to compute the pagerank value.

    reset_probability : float, optional
        Probability that a random surfer jumps to an arbitrary page.

    threshold : float, optional
        Threshold for convergence, measured in the L1 norm
        (the sum of absolute value) of the delta of each vertex's
        pagerank value.

    max_iterations : int, optional
        The maximum number of iterations to run.

    _single_precision : bool, optional
        If true, running pagerank in single precision. The resulting
        pagerank values may not be accurate for large graph, but
        should run faster and use less memory.

    _distributed : distributed environment, internal
        Not referenced in the body of this function.

    verbose : bool, optional
        If True, print progress updates.


    Returns
    -------
    out : PagerankModel

    Raises
    ------
    TypeError
        If ``graph`` is not an SGraph.

    References
    ----------
    - `Wikipedia - PageRank <http://en.wikipedia.org/wiki/PageRank>`_
    - Page, L., et al. (1998) `The PageRank Citation Ranking: Bringing Order to
      the Web <http://ilpubs.stanford.edu:8090/422/1/1999-66.pdf>`_.

    Examples
    --------
    If given an :class:`~turicreate.SGraph` ``g``, we can create
    a :class:`~turicreate.pagerank.PagerankModel` as follows:

    >>> g = turicreate.load_sgraph('http://snap.stanford.edu/data/email-Enron.txt.gz', format='snap')
    >>> pr = turicreate.pagerank.create(g)

    The page rank of each vertex of ``g`` is available as:

    >>> pr_out = pr['pagerank']     # SFrame

    The new pagerank field can be attached to the original graph g:

    >>> g.vertices['pagerank'] = pr['graph'].vertices['pagerank']

    Note that the task above does not require a join because the vertex
    ordering is preserved through ``create()``.

    See Also
    --------
    PagerankModel
    """
    from turicreate._cython.cy_server import QuietProgress

    # Reject anything that is not an SGraph before touching the toolkit.
    if not isinstance(graph, _SGraph):
        raise TypeError("graph input must be a SGraph object.")

    # Toolkit options; the private `_single_precision` argument is passed
    # under the key "single_precision".
    toolkit_opts = dict(
        threshold=threshold,
        reset_probability=reset_probability,
        max_iterations=max_iterations,
        single_precision=_single_precision,
        graph=graph.__proxy__,
    )

    with QuietProgress(verbose):
        response = _tc.extensions._toolkits.graph.pagerank.create(toolkit_opts)

    # The toolkit response carries the model proxy under the "model" key.
    return PagerankModel(response["model"])
def create(observation_data,
           user_id="user_id",
           item_id="item_id",
           target=None,
           user_data=None,
           item_data=None,
           num_factors=8,
           regularization=1e-8,
           linear_regularization=1e-10,
           side_data_factorization=True,
           nmf=False,
           binary_target=False,
           max_iterations=50,
           sgd_step_size=0,
           random_seed=0,
           solver="auto",
           verbose=True,
           **kwargs):
    """Train a FactorizationRecommender on user/item rating data.

    The model learns a set of latent factors for every user and every item
    and combines them to predict ratings. Plain matrix factorization is
    covered, and so are factorization machines when side data for users
    and/or items is supplied.

    Parameters
    ----------
    observation_data : SFrame
        Training data. A user id column and an item id column are required;
        each row is one observed user/item interaction. The (user, item)
        pairs are kept with the model so they can be excluded from later
        recommendations if desired. A target ratings column is optional, and
        every remaining column is treated by the underlying model as a side
        feature of the observation.

        User and item id columns must have type 'int' or 'str'; the target
        column must have type 'int' or 'float'.

    user_id : string, optional
        Name of the user id column in `observation_data`.

    item_id : string, optional
        Name of the item id column in `observation_data`.

    target : string
        Name of the column in `observation_data` holding the ratings given
        by the users. When no such column exists, consider the ranking
        variant of this model instead,
        :class:`turicreate.recommender.ranking_factorization_recommender.RankingFactorizationRecommender`

    user_data : SFrame, optional
        Per-user side information. It must contain a column named as given
        by `user_id`; any number of further user-specific columns may be
        present. Defaults to an empty SFrame.

    item_data : SFrame, optional
        Per-item side information. It must contain a column named as given
        by `item_id`; any number of further item-specific columns may be
        present. Defaults to an empty SFrame.

    num_factors : int, optional
        Number of latent factors.

    regularization : float, optional
        L2 regularization applied to the interaction terms.
        Default: 1e-8; values between 1e-12 and 1 are typical.

    linear_regularization : float, optional
        Regularization applied to the linear term.
        Default: 1e-10; values between 1e-12 and 1 are typical.

    side_data_factorization : boolean, optional
        Whether features beyond the user and item columns are modeled with
        factorization. If True and side features or extra columns are
        present, a Factorization Machine model is trained; otherwise only
        linear terms are fit to those features. See
        :class:`turicreate.recommender.factorization_recommender.FactorizationRecommender`
        for details. Default: True.

    nmf : boolean, optional
        Use nonnegative matrix factorization, which constrains the factors
        to be nonnegative. Disables linear and intercept terms.

    binary_target : boolean, optional
        Treat the target column as consisting of 0's and 1's. If True, the
        model is fit with logistic loss.

    max_iterations : int, optional
        Upper bound on the number of passes the training algorithm makes
        over the observed data. Default: 50.

    sgd_step_size : float, optional
        Step size for stochastic gradient descent. Smaller values usually
        give more accurate models at the cost of longer training. The
        default of 0 means the step size is picked by trying several
        candidates on a small subset of the data.

    random_seed : int, optional
        Seed used to choose the initial starting point for training. Some
        randomness in training is unavoidable, so two models trained with
        the same seed may still differ slightly. Default: 0.

    solver : string, optional
        Name of the solver. See the references for details. Available
        solvers for this model:

        - *auto (default)*: automatically chooses the best solver for the
                              data and model parameters.
        - *sgd*:            Stochastic Gradient Descent.
        - *adagrad*:        Adaptive Gradient Stochastic Gradient Descent [1].
        - *als*:            Alternating Least Squares.

    verbose : bool, optional
        Enables verbose output.

    kwargs : optional
        Advanced keyword arguments forwarded to the model optimization
        procedure; these rarely need changing. Each name is checked against
        the "name" entries returned by `_get_default_options()`.

    Returns
    -------
    out : FactorizationRecommender
        A model wrapping the trained proxy.

    Raises
    ------
    TypeError
        If `observation_data` is not an SFrame, or if `kwargs` contains a
        name that is not a recognised option.

    Examples
    --------
    **Basic usage**

    >>> sf = turicreate.SFrame({'user_id': ["0", "0", "0", "1", "1", "2", "2", "2"],
    ...                       'item_id': ["a", "b", "c", "a", "b", "b", "c", "d"],
    ...                       'rating': [1, 3, 2, 5, 4, 1, 4, 3]})
    >>> m1 = turicreate.factorization_recommender.create(sf, target='rating')

    When a target column is present, :meth:`~turicreate.recommender.create`
    defaults to creating a :class:`~turicreate.recommender.factorization_recommender.FactorizationRecommender`.

    **Including side features**

    >>> user_info = turicreate.SFrame({'user_id': ["0", "1", "2"],
    ...                              'name': ["Alice", "Bob", "Charlie"],
    ...                              'numeric_feature': [0.1, 12, 22]})
    >>> item_info = turicreate.SFrame({'item_id': ["a", "b", "c", "d"],
    ...                              'name': ["item1", "item2", "item3", "item4"],
    ...                              'dict_feature': [{'a' : 23}, {'a' : 13},
    ...                                               {'b' : 1},
    ...                                               {'a' : 23, 'b' : 32}]})
    >>> m2 = turicreate.factorization_recommender.create(sf, target='rating',
    ...                                                user_data=user_info,
    ...                                                item_data=item_info)

    **Using the Alternating Least Squares (ALS) solver**

    Alternating least squares (ALS) is also available as a solver option.
    This solver does not support side columns or other similar features.

    >>> m3 = turicreate.factorization_recommender.create(sf, target='rating',
    ...                                                solver='als')

    See Also
    --------
    RankingFactorizationRecommender,
    :class:`turicreate.recommender.ranking_factorization_recommender.RankingFactorizationRecommender`

    References
    ----------

    [1] Duchi, John, Elad Hazan, and Yoram Singer. "Adaptive subgradient methods for online
    learning and stochastic optimization." The Journal of Machine Learning Research 12 (2011).

    """
    from turicreate._cython.cy_server import QuietProgress

    if not isinstance(observation_data, _SFrame):
        raise TypeError("observation_data input must be a SFrame")

    # The proxy is initialised with an empty option set; the actual training
    # options are handed over in the ``train`` call below.
    recommender_proxy = _turicreate.extensions.factorization_recommender()
    recommender_proxy.init_options({})

    # Absent side data is represented by an empty SFrame.
    user_data = _turicreate.SFrame() if user_data is None else user_data
    item_data = _turicreate.SFrame() if item_data is None else item_data

    train_options = {
        "user_id": user_id,
        "item_id": item_id,
        "target": target,
        "random_seed": random_seed,
        "num_factors": num_factors,
        "regularization": regularization,
        "linear_regularization": linear_regularization,
        "binary_target": binary_target,
        "max_iterations": max_iterations,
        "sgd_step_size": sgd_step_size,
        "solver": solver,
        "side_data_factorization": side_data_factorization,
        # NOTE (carried over from the original author): has no effect in the
        # c++ end; ignore.
        "nmf": nmf,
    }

    if kwargs:
        # If the option table cannot be retrieved, nothing is whitelisted and
        # every extra keyword is reported as bad.
        try:
            recognised = set(_get_default_options()["name"])
        except (RuntimeError, KeyError):
            recognised = set()

        unrecognised = set(kwargs) - recognised
        if unrecognised:
            raise TypeError("Bad Keyword Arguments: " +
                            ", ".join(unrecognised))

        train_options.update(kwargs)

    # This model is trained with an empty "nearest_items" table.
    auxiliary = {"nearest_items": _turicreate.SFrame()}

    with QuietProgress(verbose):
        recommender_proxy.train(observation_data, user_data, item_data,
                                train_options, auxiliary)

    return FactorizationRecommender(recommender_proxy)
示例#8
0
def create(graph, kmin=0, kmax=10, verbose=True):
    """
    Run a K-core decomposition on a graph. The returned model holds the total
    number of cores together with the core id assigned to every vertex.

    Parameters
    ----------
    graph : SGraph
        Graph to decompose.

    kmin : int, optional
        Lower bound for core ids. A vertex whose core id would be smaller
        than `kmin` is assigned core_id = `kmin`.

    kmax : int, optional
        Upper bound for core ids. A vertex whose core id would be larger
        than `kmax` is assigned core_id = `kmax`.

    verbose : bool, optional
        If True, print progress updates.

    Returns
    -------
    out : KcoreModel

    Raises
    ------
    TypeError
        If `graph` is not an SGraph.

    References
    ----------
    - Alvarez-Hamelin, J.I., et al. (2005) `K-Core Decomposition: A Tool for the
      Visualization of Large Networks <http://arxiv.org/abs/cs/0504107>`_.

    Examples
    --------
    Given an :class:`~turicreate.SGraph` ``g``, a
    :class:`~turicreate.kcore.KcoreModel` is created like this:

    >>> g = turicreate.load_sgraph('http://snap.stanford.edu/data/email-Enron.txt.gz', format='snap')
    >>> kc = turicreate.kcore.create(g)

    The ``core id`` of each vertex of ``g`` is available as:

    >>> kcore_id = kc['core_id']     # SFrame

    The core id field can be attached to the original graph g with:

    >>> g.vertices['core_id'] = kc['graph'].vertices['core_id']

    No join is needed for the step above because ``create()`` preserves the
    vertex ordering.

    See Also
    --------
    KcoreModel
    """
    from turicreate._cython.cy_server import QuietProgress

    if not isinstance(graph, _SGraph):
        raise TypeError('graph input must be a SGraph object.')

    # Arguments for the toolkit call; the graph is passed via its proxy.
    toolkit_args = dict(graph=graph.__proxy__, kmin=kmin, kmax=kmax)

    with QuietProgress(verbose):
        kcore_toolkit = _tc.extensions._toolkits.graph.kcore
        result = kcore_toolkit.create(toolkit_args)

    return KcoreModel(result['model'])
示例#9
0
def create(item_data, item_id,
           observation_data = None,
           user_id = None, target = None,
           weights = 'auto',
           similarity_metrics = 'auto',
           item_data_transform = 'auto',
           max_item_neighborhood_size = 64, verbose=True):

    """Create a content-based recommender model in which the similarity
    between the items recommended is determined by the content of
    those items rather than learned from user interaction data.

    The similarity score between two items is calculated by first
    computing the similarity between the item data for each column,
    then taking a weighted average of the per-column similarities to
    get the final similarity.  The recommendations are generated
    according to the average similarity of a candidate item to all the
    items in a user's set of rated items.

    Parameters
    ----------

    item_data : SFrame
        A non-empty SFrame giving the content of the items to use to
        learn the structure of similar items.  The SFrame must have one
        column that matches the name of the `item_id`; this gives a
        unique identifier that can then be used to make recommendations.
        The rest of the columns are then used in the distance
        calculations below.

    item_id : string
        The name of the column in item_data (and `observation_data`,
        if given) that represents the item ID.

    observation_data : None (optional)
        An SFrame giving user and item interaction data.  This
        information is stored in the model, and the recommender will
        recommend the items with the most similar content to the
        items that were present and/or highly rated for that user.
        If omitted, an empty observation SFrame is used, with a string
        user column and an item column of the same type as the item ids.

    user_id : None (optional)
        If observation_data is given, then this specifies the column
        name corresponding to the user identifier.  If omitted, the
        name "__implicit_user__" is used.

    target : None (optional)
        If observation_data is given, then this specifies the column
        name corresponding to the target or rating.

    weights : dict or 'auto' (optional)
        If given, then weights must be a dictionary mapping column
        names present in item_data (other than `item_id`) to the weight
        of that column.  Columns that are left out get weight 0.  The
        dictionary passed in is not modified.  If 'auto' is given, then
        all columns are weighted equally.

    similarity_metrics : 'auto' (optional)
        Accepted for interface compatibility; this function does not
        currently read it.

    item_data_transform : 'auto' or a feature engineering transformer (optional)
        Transformer that is fit to and applied on `item_data` before
        the similarities are computed.  With 'auto', an AutoVectorizer
        that excludes the `item_id` column is used.

    max_item_neighborhood_size : int, 64
        For each item, we hold this many similar items to use when
        aggregating models for predictions.  Decreasing this value
        decreases the memory required by the model and decreases the
        time required to generate recommendations, but it may also
        decrease recommendation accuracy.

    verbose : True or False (optional)
        If set to False, then less information is printed.

    Returns
    -------
    out : ItemContentRecommender

    Raises
    ------
    TypeError
        If `item_data` is not a non-empty SFrame, if `weights` is neither
        'auto' nor a dictionary, or if `item_data_transform` is neither
        'auto' nor a feature engineering transformer instance.
    ValueError
        If `item_id` is not a column of `item_data`, if `weights` names a
        column that is not a content column of `item_data`, or if the
        weights in use are all 0.

    Examples
    --------

      >>> item_data = tc.SFrame({"my_item_id" : range(4),
      ...                        "data_1" : [ [1, 0], [1, 0], [0, 1], [0.5, 0.5] ],
      ...                        "data_2" : [ [0, 1], [1, 0], [0, 1], [0.5, 0.5] ] })

      >>> m = tc.recommender.item_content_recommender.create(item_data, "my_item_id")
      >>> m.recommend_from_interactions([0])

      Columns:
              my_item_id      int
              score   float
              rank    int

      Rows: 3

      Data:
      +------------+----------------+------+
      | my_item_id |     score      | rank |
      +------------+----------------+------+
      |     3      | 0.707106769085 |  1   |
      |     1      |      0.5       |  2   |
      |     2      |      0.5       |  3   |
      +------------+----------------+------+
      [3 rows x 3 columns]

      >>> m.recommend_from_interactions([0, 1])

      Columns:
              my_item_id      int
              score   float
              rank    int

      Rows: 2

      Data:
      +------------+----------------+------+
      | my_item_id |     score      | rank |
      +------------+----------------+------+
      |     3      | 0.707106769085 |  1   |
      |     2      |      0.25      |  2   |
      +------------+----------------+------+
      [2 rows x 3 columns]

    """
    from turicreate._cython.cy_server import QuietProgress

    # item_data is correct type
    if not isinstance(item_data, _SFrame) or item_data.num_rows() == 0:
        raise TypeError("`item_data` argument must be a non-empty SFrame giving item data to use for similarities.")

    # Error checking on column names
    item_columns = set(item_data.column_names())

    if item_id not in item_columns:
        raise ValueError("Item column given as 'item_id = %s', but this is not found in `item_data` SFrame."
                         % item_id)

    # Now, get the set ready to test for other argument issues.  From here
    # on, item_columns holds only the content columns.
    item_columns.remove(item_id)

    if weights != 'auto':
        if not isinstance(weights, dict):
            raise TypeError("`weights` parameter must be 'auto' or a dictionary of column "
                            "names in `item_data` to weight values.")

        # Check the keys the caller supplied (not item_columns against
        # itself, which could never flag anything).
        bad_columns = [col_name for col_name in weights if col_name not in item_columns]
        if bad_columns:
            raise ValueError("Columns %s given in weights, but these are not found in item_data."
                             % ', '.join(str(col_name) for col_name in bad_columns))

        # Work on a copy so the defaults added below do not leak into the
        # caller's dictionary.
        weights = dict(weights)

        # Now, set any columns not given in the weights column to be
        # weight 0.
        for col_name in item_columns:
            weights.setdefault(col_name, 0)

    ################################################################################
    # Now, check the feature transformer stuff.

    # Pass it through a feature transformer.
    if item_data_transform == 'auto':
        item_data_transform = _turicreate.toolkits._feature_engineering.AutoVectorizer(excluded_features = [item_id])

    if not isinstance(item_data_transform, _turicreate.toolkits._feature_engineering.TransformerBase):
        raise TypeError("item_data_transform must be 'auto' or a valid feature_engineering transformer instance.")

    # Transform the input data.
    item_data = item_data_transform.fit_transform(item_data)

    # Translate any string columns to actually work in nearest
    # neighbors by making it a categorical list.  Also translate lists
    # into dicts, and normalize numeric columns.
    gaussian_kernel_metrics = set()

    for c in item_columns:
        if item_data[c].dtype is str:
            item_data[c] = item_data[c].apply(lambda s: {s : 1})
        elif item_data[c].dtype in [float, int]:
            # Standardize; the floor on the std avoids dividing by zero for
            # constant columns.
            item_data[c] = (item_data[c] - item_data[c].mean()) / max(item_data[c].std(), 1e-8)
            gaussian_kernel_metrics.add(c)

    if verbose:
        print("Applying transform:")
        print(item_data_transform)

    opts = {}
    model_proxy = _turicreate.extensions.item_content_recommender()
    model_proxy.init_options(opts)

    # The user_id is implicit if none is given.
    if user_id is None:
        user_id = "__implicit_user__"

    # Set the observation data.
    if observation_data is None:

        # In this case, it's important to make this a string type.  If
        # the user column is not given, it may be given at recommend
        # time, in which case it is cast to a string type and cast
        # back if necessary.
        empty_user = _turicreate.SArray([], dtype=str)
        empty_item = _turicreate.SArray([], dtype=item_data[item_id].dtype)
        observation_data = _turicreate.SFrame( {user_id : empty_user, item_id : empty_item} )

    # Now, work out stuff for the observation_data component
    normalization_factor = 1

    # 1 for the item_id column.
    if item_data.num_columns() >= 3:

        if weights == "auto":

            # TODO: automatically tune this.
            weights = {col_name : 1 for col_name in item_data.column_names() if col_name != item_id}

        # Use the abs value here in case users pass in weights with negative values.
        normalization_factor = sum(abs(v) for v in weights.values())
        if normalization_factor == 0:
            raise ValueError("Weights cannot all be set to 0.")

        # One distance component per weighted column: standardized numeric
        # columns use a gaussian kernel, everything else uses cosine.
        distance = [([col_name], ("gaussian_kernel" if col_name in gaussian_kernel_metrics else "cosine"), weight)
                      for col_name, weight in weights.items()]

    else:
        distance = "cosine"

    # Now, build the nearest neighbors model:
    nn = _turicreate.nearest_neighbors.create(item_data, label=item_id, distance = distance, verbose = verbose)
    graph = nn.query(item_data, label = item_id, k=max_item_neighborhood_size, verbose = verbose)
    graph = graph.rename({"query_label" : item_id,
                          "reference_label" : "similar",
                          "distance" : "score"}, inplace=True)

    def process_weights(x):
        # Turn a distance into a similarity score clamped to [-1, 1].
        return max(-1, min(1, 1 - x / normalization_factor))

    graph["score"] = graph["score"].apply(process_weights)

    opts = {'user_id': user_id,
            'item_id': item_id,
            'target': target,
            'similarity_type' : "cosine",
            'max_item_neighborhood_size' : max_item_neighborhood_size}

    user_data = _turicreate.SFrame()
    extra_data = {"nearest_items" : graph}
    with QuietProgress(verbose):
        model_proxy.train(observation_data, user_data, item_data, opts, extra_data)

    return ItemContentRecommender(model_proxy)
示例#10
0
def create(graph,
           source_vid,
           weight_field="",
           max_distance=1e30,
           verbose=True):
    """
    Compute single source shortest path distances from one source vertex to
    every vertex of the graph. SGraph is directed, so the shortest paths are
    directed as well; to obtain undirected shortest paths, add each edge to
    the SGraph in both directions. The returned model holds the distance of
    each vertex in the graph.

    Parameters
    ----------
    graph : SGraph
        Graph in which shortest paths are computed.

    source_vid : vertex ID
        ID of the source vertex.

    weight_field : string, optional
        Name of the edge field that holds the edge weights. If empty, unit
        weights are used.

    max_distance : float, optional
        Forwarded unchanged to the shortest path toolkit as "max_distance".
        Presumably an upper bound on the distances considered -- confirm
        against the toolkit documentation.

    verbose : bool, optional
        If True, print progress updates.

    Returns
    -------
    out : ShortestPathModel

    Raises
    ------
    TypeError
        If `graph` is not an SGraph.

    References
    ----------
    - `Wikipedia - ShortestPath <http://en.wikipedia.org/wiki/Shortest_path_problem>`_

    Examples
    --------
    Given an :class:`~turicreate.SGraph` ``g``, a
    :class:`~turicreate.shortest_path.ShortestPathModel` is created like this:

    >>> g = turicreate.load_sgraph('http://snap.stanford.edu/data/email-Enron.txt.gz', format='snap')
    >>> sp = turicreate.shortest_path.create(g, source_vid=1)

    The shortest path distance from the source vertex to each vertex of
    ``g`` is available as:

    >>> sp_sframe = sp['distance']   # SFrame

    The distance field can be attached to the original graph g with:

    >>> g.vertices['distance_to_1'] = sp['graph'].vertices['distance']

    No join is needed for the step above because ``create()`` preserves the
    vertex ordering.

    To get the actual path from the source vertex to any destination vertex:

    >>> path = sp.get_path(vid=10)

    An auxiliary graph carrying the shortest path information for each
    vertex of ``g`` is available as:

    >>> sp_graph = sp['graph']      # SGraph

    See Also
    --------
    ShortestPathModel
    """
    from turicreate._cython.cy_server import QuietProgress

    if not isinstance(graph, _SGraph):
        raise TypeError("graph input must be a SGraph object.")

    # Arguments for the toolkit call; the graph is passed via its proxy.
    sssp_args = dict(
        source_vid=source_vid,
        weight_field=weight_field,
        max_distance=max_distance,
        graph=graph.__proxy__,
    )

    with QuietProgress(verbose):
        sssp_toolkit = _tc.extensions._toolkits.graph.sssp
        result = sssp_toolkit.create(sssp_args)

    return ShortestPathModel(result["model"])
def create(observation_data,
           user_id="user_id",
           item_id="item_id",
           target=None,
           user_data=None,
           item_data=None,
           nearest_items=None,
           similarity_type="jaccard",
           threshold=0.001,
           only_top_k=64,
           verbose=True,
           target_memory_usage=8 * 1024 * 1024 * 1024,
           **kwargs):
    """
    Create a recommender that uses item-item similarities based on
    users in common.

    Parameters
    ----------
    observation_data : SFrame
        The dataset to use for training the model. It must contain a column of
        user ids and a column of item ids. Each row represents an observed
        interaction between the user and the item.  The (user, item) pairs
        are stored with the model so that they can later be excluded from
        recommendations if desired. It can optionally contain a target ratings
        column. All other columns are interpreted by the underlying model as
        side features for the observations.

        The user id and item id columns must be of type 'int' or 'str'. The
        target column must be of type 'int' or 'float'.

    user_id : string, optional
        The name of the column in `observation_data` that corresponds to the
        user id.

    item_id : string, optional
        The name of the column in `observation_data` that corresponds to the
        item id.

    target : string, optional
        The `observation_data` can optionally contain a column of scores
        representing ratings given by the users. If present, the name of this
        column may be specified via `target`.

    user_data : SFrame, optional
        Side information for the users.  This SFrame must have a column with
        the same name as what is specified by the `user_id` input parameter.
        `user_data` can provide any amount of additional user-specific
        information. (NB: This argument is currently ignored by this model.)

    item_data : SFrame, optional
        Side information for the items.  This SFrame must have a column with
        the same name as what is specified by the `item_id` input parameter.
        `item_data` can provide any amount of additional item-specific
        information. (NB: This argument is currently ignored by this model.)

    nearest_items : SFrame, optional
        A set of each item's nearest items. When provided, this overrides
        the similarity computed from the observation data.
        See Notes below and in the documentation for
        ItemSimilarityRecommender.
        Default: None.

    similarity_type : {'jaccard', 'cosine', 'pearson'}, optional
        Similarity metric to use. See ItemSimilarityRecommender for details.
        Default: 'jaccard'.

    threshold : float, optional
        Predictions ignore items below this similarity value.
        Default: 0.001.

    only_top_k : int, optional
        Number of similar items to store for each item. Default value is
        64.  Decreasing this decreases the amount of memory required for the
        model, but may also decrease the accuracy.

    verbose : bool, optional
        Passed to the progress-output context that wraps training.
        Default: True.

    target_memory_usage : int, optional
        The target memory usage for the processing buffers and lookup
        tables.  The actual memory usage may be higher or lower than this,
        but decreasing this decreases memory usage at the expense of
        training time, and increasing this can dramatically speed up the
        training time.  Default is 8GB = 8589934592.

    kwargs : optional
        Advanced options, listed below. Every name is checked against the
        "name" entries returned by `_get_default_options()`.

    seed_item_set_size : int, optional
        For users that have not yet rated any items, or have only
        rated uniquely occurring items with no similar item info,
        the model seeds the user's item set with the average
        ratings of the seed_item_set_size most popular items when
        making predictions and recommendations.  If set to 0, then
        recommendations based on either popularity (no target present)
        or average item score (target present) are made in this case.

    nearest_neighbors_interaction_proportion_threshold : (advanced) float
        Any item that was rated by more than this proportion of
        users is treated by doing a nearest neighbors search.  For
        frequent items, this is almost always faster, but it is slower
        for infrequent items.  Furthermore, decreasing this causes more
        items to be processed using the nearest neighbor path, which may
        decrease memory requirements.

    degree_approximation_threshold : (advanced) int, optional
        Users with more than this many item interactions may be
        approximated.  The approximation is done by a combination of
        sampling and choosing the interactions likely to have the most
        impact on the model.  Increasing this can increase the training time
        and may or may not increase the quality of the model.  Default = 4096.

    max_data_passes : (advanced) int, optional
        The maximum number of passes through the data allowed in
        building the similarity lookup tables.  If it is not possible to
        build the recommender in this many passes (calculated before
        that stage of training), then additional approximations are
        applied; namely decreasing degree_approximation_threshold.  If
        this is not possible, an error is raised.  To decrease the
        number of passes required, increase target_memory_usage or
        decrease nearest_neighbors_interaction_proportion_threshold.
        Default = 1024.

    Returns
    -------
    out : ItemSimilarityRecommender

    Raises
    ------
    TypeError
        If `observation_data` is not an SFrame, or if `kwargs` contains a
        name that is not a recognised option.

    Examples
    --------
    Given basic user-item observation data, an
    :class:`~turicreate.recommender.item_similarity_recommender.ItemSimilarityRecommender` is created:

    >>> sf = turicreate.SFrame({'user_id': ['0', '0', '0', '1', '1', '2', '2', '2'],
    ...                       'item_id': ['a', 'b', 'c', 'a', 'b', 'b', 'c', 'd']})
    >>> m = turicreate.item_similarity_recommender.create(sf)
    >>> recs = m.recommend()

    When a target is available, one can specify the desired similarity. For
    example we may choose to use a cosine similarity, and use it to make
    predictions or recommendations.

    >>> sf2 = turicreate.SFrame({'user_id': ['0', '0', '0', '1', '1', '2', '2', '2'],
    ...                        'item_id': ['a', 'b', 'c', 'a', 'b', 'b', 'c', 'd'],
    ...                        'rating': [1, 3, 2, 5, 4, 1, 4, 3]})
    >>> m2 = turicreate.item_similarity_recommender.create(sf2, target="rating",
    ...                                                  similarity_type='cosine')
    >>> m2.predict(sf)
    >>> m2.recommend()

    Notes
    -----
    Currently, :class:`~turicreate.recommender.item_similarity_recommender.ItemSimilarityRecommender`
    does not leverage the use of side features `user_data` and `item_data`.

    **Incorporating pre-defined similar items**

    For item similarity models, one may choose to provide user-specified
    nearest neighbors graph using the keyword argument `nearest_items`. This is
    an SFrame containing, for each item, the nearest items and the similarity
    score between them. If provided, these item similarity scores are used for
    recommendations. The SFrame must contain (at least) three columns:

    * 'item_id': a column with the same name as that provided to the `item_id`
      argument (which defaults to the string "item_id").
    * 'similar': a column containing the nearest items for the given item id.
      This should have the same type as the `item_id` column.
    * 'score': a numeric score measuring how similar these two items are.

    For example, suppose you first create an ItemSimilarityRecommender and use
    :class:`~turicreate.recommender.ItemSimilarityRecommender.get_similar_items`:

    >>> sf = turicreate.SFrame({'user_id': ["0", "0", "0", "1", "1", "2", "2", "2"],
    ...                       'item_id': ["a", "b", "c", "a", "b", "b", "c", "d"]})
    >>> m = turicreate.item_similarity_recommender.create(sf)
    >>> nn = m.get_similar_items()
    >>> m2 = turicreate.item_similarity_recommender.create(sf, nearest_items=nn)

    With the above code, the item similarities computed for model `m` can be
    used to create a new recommender object, `m2`. Note that we could have
    created `nn` from some other means, but now use `m2` to make
    recommendations via `m2.recommend()`.


    See Also
    --------
    ItemSimilarityRecommender

    """
    from turicreate._cython.cy_server import QuietProgress

    if not (isinstance(observation_data, _SFrame)):
        raise TypeError("observation_data input must be a SFrame")
    opts = {}
    model_proxy = _turicreate.extensions.item_similarity()
    model_proxy.init_options(opts)

    # Absent optional tables are represented by empty SFrames.
    if user_data is None:
        user_data = _turicreate.SFrame()
    if item_data is None:
        item_data = _turicreate.SFrame()
    if nearest_items is None:
        nearest_items = _turicreate.SFrame()

    opts = {
        "user_id": user_id,
        "item_id": item_id,
        "target": target,
        "similarity_type": similarity_type,
        "threshold": threshold,
        # The memory budget is handed to the model as a float.
        "target_memory_usage": float(target_memory_usage),
        "max_item_neighborhood_size": only_top_k,
    }

    if kwargs:
        # If the option table cannot be retrieved, nothing is whitelisted and
        # every extra keyword is reported as bad.
        try:
            possible_args = set(_get_default_options()["name"])
        except (RuntimeError, KeyError):
            possible_args = set()

        bad_arguments = set(kwargs.keys()).difference(possible_args)
        if bad_arguments:
            raise TypeError("Bad Keyword Arguments: " +
                            ", ".join(bad_arguments))

        # Only validated keyword arguments reach the option set.
        opts.update(kwargs)

    extra_data = {"nearest_items": nearest_items}

    with QuietProgress(verbose):
        model_proxy.train(observation_data, user_data, item_data, opts,
                          extra_data)

    return ItemSimilarityRecommender(model_proxy)
示例#12
0
def create(graph, verbose=True):
    """
    Find the weakly connected components of a graph.

    The returned model object holds the total number of weakly connected
    components together with the component ID assigned to every vertex in the
    graph.

    Parameters
    ----------
    graph : SGraph
        The graph on which to compute the connected components.

    verbose : bool, optional
        If True, print progress updates.

    Returns
    -------
    out : ConnectedComponentsModel

    Raises
    ------
    TypeError
        If ``graph`` is not an SGraph.

    References
    ----------
    - `Mathworld Wolfram - Weakly Connected Component
      <http://mathworld.wolfram.com/WeaklyConnectedComponent.html>`_

    Examples
    --------
    If given an :class:`~turicreate.SGraph` ``g``, we can create
    a :class:`~turicreate.connected_components.ConnectedComponentsModel` as
    follows:

    >>> g = turicreate.load_sgraph('http://snap.stanford.edu/data/email-Enron.txt.gz', format='snap')
    >>> cc = turicreate.connected_components.create(g)
    >>> cc.summary()

    The ``component id`` of each vertex in the graph ``g`` is available as:

    >>> cc_ids = cc['component_id']  # SFrame

    A graph carrying the ``component id`` of each vertex as an extra vertex
    field is available as:

    >>> cc_graph = cc['graph']      # SGraph

    The new component_id field can be added to the original graph g using:

    >>> g.vertices['component_id'] = cc['graph'].vertices['component_id']

    Note that the task above does not require a join because the vertex
    ordering is preserved through ``create()``.


    See Also
    --------
    ConnectedComponentsModel
    """
    from turicreate._cython.cy_server import QuietProgress

    graph_is_sgraph = isinstance(graph, _SGraph)
    if not graph_is_sgraph:
        raise TypeError('"graph" input must be a SGraph object.')

    # Run the toolkit inside QuietProgress so `verbose` controls its output.
    with QuietProgress(verbose):
        toolkit = _tc.extensions._toolkits.graph.connected_components
        result = toolkit.create({"graph": graph.__proxy__})

    model_proxy = result["model"]
    return ConnectedComponentsModel(model_proxy)
示例#13
0
def create(graph, verbose=True):
    """
    Count the in degree, out degree and total degree of every vertex.

    Parameters
    ----------
    graph : SGraph
        The graph on which to compute degree counts.

    verbose : bool, optional
        If True, print progress updates.

    Returns
    -------
    out : DegreeCountingModel

    Raises
    ------
    TypeError
        If ``graph`` is not an SGraph.

    Examples
    --------
    If given an :class:`~turicreate.SGraph` ``g``, we can create
    a :class:`~turicreate.degree_counting.DegreeCountingModel` as follows:

    >>> g = turicreate.load_sgraph('http://snap.stanford.edu/data/web-Google.txt.gz',
    ...                         format='snap')
    >>> m = turicreate.degree_counting.create(g)
    >>> g2 = m['graph']
    >>> g2
    SGraph({'num_edges': 5105039, 'num_vertices': 875713})
    Vertex Fields:['__id', 'in_degree', 'out_degree', 'total_degree']
    Edge Fields:['__src_id', '__dst_id']

    >>> g2.vertices.head(5)
    Columns:
        __id	int
        in_degree	int
        out_degree	int
        total_degree	int
    <BLANKLINE>
    Rows: 5
    <BLANKLINE>
    Data:
    +------+-----------+------------+--------------+
    | __id | in_degree | out_degree | total_degree |
    +------+-----------+------------+--------------+
    |  5   |     15    |     7      |      22      |
    |  7   |     3     |     16     |      19      |
    |  8   |     1     |     2      |      3       |
    |  10  |     13    |     11     |      24      |
    |  27  |     19    |     16     |      35      |
    +------+-----------+------------+--------------+

    See Also
    --------
    DegreeCountingModel
    """
    from turicreate._cython.cy_server import QuietProgress

    graph_is_sgraph = isinstance(graph, _SGraph)
    if not graph_is_sgraph:
        raise TypeError('"graph" input must be a SGraph object.')

    # Run the toolkit inside QuietProgress so `verbose` controls its output.
    with QuietProgress(verbose):
        toolkit = _tc.extensions._toolkits.graph.degree_count
        result = toolkit.create({'graph': graph.__proxy__})

    model_proxy = result['model']
    return DegreeCountingModel(model_proxy)
示例#14
0
def create(observation_data,
           user_id='user_id', item_id='item_id', target=None,
           user_data=None, item_data=None,
           random_seed=0,
           verbose=True):
    """
    Create a model that makes recommendations using item popularity. When no
    target column is provided, the popularity is determined by the number of
    observations involving each item. When a target is provided, popularity
    is computed using the item's mean target value. When the target column
    contains ratings, for example, the model computes the mean rating for
    each item and uses this to rank items for recommendations.

    Parameters
    ----------
    observation_data : SFrame
        The dataset to use for training the model. It must contain a column of
        user ids and a column of item ids. Each row represents an observed
        interaction between the user and the item.  The (user, item) pairs
        are stored with the model so that they can later be excluded from
        recommendations if desired. It can optionally contain a target ratings
        column. All other columns are interpreted by the underlying model as
        side features for the observations.

        The user id and item id columns must be of type 'int' or 'str'. The
        target column must be of type 'int' or 'float'.

    user_id : string, optional
        The name of the column in `observation_data` that corresponds to the
        user id.

    item_id : string, optional
        The name of the column in `observation_data` that corresponds to the
        item id.

    target : string, optional
        The `observation_data` can optionally contain a column of scores
        representing ratings given by the users. If present, the name of this
        column may be specified with `target`.

    user_data : SFrame, optional
        Side information for the users.  This SFrame must have a column with
        the same name as what is specified by the `user_id` input parameter.
        `user_data` can provide any amount of additional user-specific
        information.

    item_data : SFrame, optional
        Side information for the items.  This SFrame must have a column with
        the same name as what is specified by the `item_id` input parameter.
        `item_data` can provide any amount of additional item-specific
        information.

    random_seed : int, optional
        Seed forwarded to the underlying model as its ``random_seed`` option.

    verbose : bool, optional
        Enables verbose output.

    Returns
    -------
    out : PopularityRecommender

    Raises
    ------
    TypeError
        If `observation_data` is not an SFrame.

    Examples
    --------
    >>> sf = turicreate.SFrame({'user_id': ["0", "0", "0", "1", "1", "2", "2", "2"],
    ...                       'item_id': ["a", "b", "c", "a", "b", "b", "c", "d"],
    ...                       'rating': [1, 3, 2, 5, 4, 1, 4, 3]})
    >>> m = turicreate.popularity_recommender.create(sf, target='rating')

    See Also
    --------
    PopularityRecommender
    """
    from turicreate._cython.cy_server import QuietProgress
    if not (isinstance(observation_data, _SFrame)):
        raise TypeError('observation_data input must be a SFrame')

    model_proxy = _turicreate.extensions.popularity()
    model_proxy.init_options({})

    # Missing side data is represented by empty SFrames.
    if user_data is None:
        user_data = _turicreate.SFrame()
    if item_data is None:
        item_data = _turicreate.SFrame()

    # Forward the caller's seed; it used to be hard-coded to 1, which made
    # the `random_seed` argument a silent no-op.
    opts = {'user_id': user_id,
            'item_id': item_id,
            'target': target,
            'random_seed': random_seed}

    # No precomputed item neighbors are supplied; an empty SFrame is passed
    # under the 'nearest_items' key.
    extra_data = {"nearest_items": _turicreate.SFrame()}
    with QuietProgress(verbose):
        model_proxy.train(observation_data, user_data, item_data, opts, extra_data)

    return PopularityRecommender(model_proxy)
示例#15
0
def create(dataset,
           label=None,
           features=None,
           distance=None,
           method='auto',
           verbose=True,
           **kwargs):
    """
    Create a nearest neighbor model, which can be searched efficiently and
    quickly for the nearest neighbors of a query observation. If the `method`
    argument is specified as `auto`, the type of model is chosen automatically
    based on the type of data in `dataset`.

    .. warning::

        The 'dot_product' distance is deprecated and will be removed in future
        versions of Turi Create. Please use 'transformed_dot_product'
        distance instead, although note that this is more than a name change;
        it is a *different* transformation of the dot product of two vectors.
        Please see the distances module documentation for more details.

    Parameters
    ----------
    dataset : SFrame
        Reference data. If the features for each observation are numeric, they
        may be in separate columns of 'dataset' or a single column with lists
        of values. The features may also be in the form of a column of sparse
        vectors (i.e. dictionaries), with string keys and numeric values.

    label : string, optional
        Name of the SFrame column with row labels. If 'label' is not specified,
        row numbers are used to identify reference dataset rows when the model
        is queried.

    features : list[string], optional
        Name of the columns with features to use in computing distances between
        observations and the query points. 'None' (the default) indicates that
        all columns except the label should be used as features. Each column
        can be one of the following types:

        - *Numeric*: values of numeric type integer or float.

        - *Array*: list of numeric (integer or float) values. Each list element
          is treated as a separate variable in the model.

        - *Dictionary*: key-value pairs with numeric (integer or float) values.
          Each key indicates a separate variable in the model.

        - *List*: list of integer or string values. Each element is treated as
          a separate variable in the model.

        - *String*: string values.

        Please note: if a composite distance is also specified, this parameter
        is ignored.

    distance : string, function, or list[list], optional
        Function to measure the distance between any two input data rows. This
        may be one of three types:

        - *String*: the name of a standard distance function. One of
          'euclidean', 'squared_euclidean', 'manhattan', 'levenshtein',
          'jaccard', 'weighted_jaccard', 'cosine', 'dot_product' (deprecated),
          or 'transformed_dot_product'.

        - *Function*: a function handle from the
          :mod:`~turicreate.toolkits.distances` module.

        - *Composite distance*: the weighted sum of several standard distance
          functions applied to various features. This is specified as a list of
          distance components, each of which is itself a list containing three
          items:

          1. list or tuple of feature names (strings)

          2. standard distance name (string)

          3. scaling factor (int or float)

        For more information about Turi Create distance functions, please
        see the :py:mod:`~turicreate.toolkits.distances` module.

        If 'distance' is left unspecified or set to 'auto', a composite
        distance is constructed automatically based on feature types.

    method : {'auto', 'ball_tree', 'brute_force', 'lsh'}, optional
        Method for computing nearest neighbors. The options are:

        - *auto* (default): the method is chosen automatically, based on the
          type of data and the distance. If the distance is 'manhattan' or
          'euclidean' and the features are numeric or vectors of numeric
          values, then the 'ball_tree' method is used. Otherwise, the
          'brute_force' method is used.

        - *ball_tree*: use a tree structure to find the k-closest neighbors to
          each query point. The ball tree model is slower to construct than the
          brute force model, but queries are faster than linear time. This
          method is not applicable for the cosine and dot product distances.
          See `Liu, et al (2004)
          <http://papers.nips.cc/paper/2666-an-investigation-of-practical-approximate-nearest-neighbor-algorithms>`_
          for implementation details.

        - *brute_force*: compute the distance from a query point to all
          reference observations. There is no computation time for model
          creation with the brute force method (although the reference data is
          held in the model), but each query takes linear time.

        - *lsh*: use Locality Sensitive Hashing (LSH) to find approximate
          nearest neighbors efficiently. The LSH model supports 'euclidean',
          'squared_euclidean', 'manhattan', 'cosine', 'jaccard', 'dot_product'
          (deprecated), and 'transformed_dot_product' distances. Two options
          are provided for LSH -- ``num_tables`` and
          ``num_projections_per_table``. See the notes below for details.

    verbose : bool, optional
        If True, print progress updates and model details.

    **kwargs : optional
        Options for the distance function and query method.

        - *leaf_size*: for the ball tree method, the number of points in each
          leaf of the tree. The default is to use the max of 1,000 and
          n/(2^11), which ensures a maximum tree depth of 12.

        - *num_tables*: For the LSH method, the number of hash tables
          constructed. The default value is 20. We recommend choosing values
          from 10 to 30.

        - *num_projections_per_table*: For the LSH method, the number of
          projections/hash functions for each hash table. The default value is
          4 for 'jaccard' distance, 16 for 'cosine' distance and 8 for other
          distances. We recommend using number 2 ~ 6 for 'jaccard' distance, 8
          ~ 20 for 'cosine' distance and 4 ~ 12 for other distances.

    Returns
    -------
    out : NearestNeighborsModel
        A structure for efficiently computing the nearest neighbors in 'dataset'
        of new query points.

    Raises
    ------
    TypeError
        If 'features' is given but is not a list, if the ball tree method is
        combined with a cosine or dot-product distance, if 'distance' is of
        an unsupported type, or if a list-typed column is used with an
        unsuitable distance or contains non-string elements.

    ValueError
        If Levenshtein distance is applied to more than one column, or if
        'method' is not one of the supported names.

    ToolkitError
        If an unknown keyword argument is passed, or if the only available
        feature is the row label column.

    See Also
    --------
    NearestNeighborsModel.query, turicreate.toolkits.distances

    Notes
    -----
    - Missing data is not allowed in the 'dataset' provided to this function.
      Please use the :func:`turicreate.SFrame.fillna` and
      :func:`turicreate.SFrame.dropna` utilities to handle missing data before
      creating a nearest neighbors model.

    - Missing keys in sparse vectors are assumed to have value 0.

    - The `composite_params` parameter was removed as of Turi Create
      version 1.5. The `distance` parameter now accepts either standard or
      composite distances. Please see the :mod:`~turicreate.toolkits.distances`
      module documentation for more information on composite distances.

    - If the features should be weighted equally in the distance calculations
      but are measured on different scales, it is important to standardize the
      features. One way to do this is to subtract the mean of each column and
      divide by the standard deviation.

    **Locality Sensitive Hashing (LSH)**

    There are several efficient nearest neighbors search algorithms that work
    well for data with low dimensions :math:`d` (approximately 50). However,
    most of the solutions suffer from either space or query time that is
    exponential in :math:`d`. For large :math:`d`, they often provide little,
    if any, improvement over the 'brute_force' method. This is a well-known
    consequence of the phenomenon called `The Curse of Dimensionality`.

    `Locality Sensitive Hashing (LSH)
    <https://en.wikipedia.org/wiki/Locality-sensitive_hashing>`_ is an approach
    that is designed to efficiently solve the *approximate* nearest neighbor
    search problem for high dimensional data. The key idea of LSH is to hash
    the data points using several hash functions, so that the probability of
    collision is much higher for data points which are close to each other than
    those which are far apart.

    An LSH family is a family of functions :math:`h` which map points from the
    metric space to a bucket, so that

    - if :math:`d(p, q) \\leq R`, then :math:`h(p) = h(q)` with at least probability :math:`p_1`.
    - if :math:`d(p, q) \\geq cR`, then :math:`h(p) = h(q)` with probability at most :math:`p_2`.

    LSH for efficient approximate nearest neighbor search:

    - We define a new family of hash functions :math:`g`, where each
      function :math:`g` is obtained by concatenating :math:`k` functions
      :math:`h_1, ..., h_k`, i.e., :math:`g(p)=[h_1(p),...,h_k(p)]`.
      The algorithm constructs :math:`L` hash tables, each of which
      corresponds to a different randomly chosen hash function :math:`g`.
      There are :math:`k \\cdot L` hash functions used in total.

    - In the preprocessing step, we hash all :math:`n` reference points
      into each of the :math:`L` hash tables.

    - Given a query point :math:`q`, the algorithm iterates over the
      :math:`L` hash functions :math:`g`. For each :math:`g` considered, it
      retrieves the data points that are hashed into the same bucket as q.
      These data points from all the :math:`L` hash tables are considered as
      candidates that are then re-ranked by their real distances with the query
      data.

    **Note** that the number of tables :math:`L` and the number of hash
    functions per table :math:`k` are two main parameters. They can be set
    using the options ``num_tables`` and ``num_projections_per_table``
    respectively.

    Hash functions for different distances:

    - `euclidean` and `squared_euclidean`:
      :math:`h(q) = \\lfloor \\frac{a \\cdot q + b}{w} \\rfloor` where
      :math:`a` is a vector, of which the elements are independently
      sampled from normal distribution, and :math:`b` is a number
      uniformly sampled from :math:`[0, r]`. :math:`r` is a parameter for the
      bucket width. We set :math:`r` using the average all-pair `euclidean`
      distances from a small randomly sampled subset of the reference data.

    - `manhattan`: The hash function of `manhattan` is similar to that of
      `euclidean`. The only difference is that the elements of `a` are sampled
      from Cauchy distribution, instead of normal distribution.

    - `cosine`: Random Projection is designed to approximate the cosine
      distance between vectors. The hash function is :math:`h(q) = sgn(a \\cdot
      q)`, where :math:`a` is randomly sampled normal unit vector.

    - `jaccard`: We use one permutation hashing, a method proposed by
      Shrivastava and Li. See the paper `[Shrivastava and Li, UAI 2014]
      <http://www.auai.org/uai2014/proceedings/individuals/225.pdf>`_ for
      details.

    - `dot_product`: The reference data points are first transformed to
      fixed-norm vectors, and then the minimum `dot_product` distance search
      problem can be solved via finding the reference data with smallest
      `cosine` distances. See the paper `[Neyshabur and Srebro, ICML 2015]
      <http://proceedings.mlr.press/v37/neyshabur15.html>`_ for details.

    References
    ----------
    - `Wikipedia - nearest neighbor
      search <http://en.wikipedia.org/wiki/Nearest_neighbor_search>`_

    - `Wikipedia - ball tree <http://en.wikipedia.org/wiki/Ball_tree>`_

    - Ball tree implementation: Liu, T., et al. (2004) `An Investigation of
      Practical Approximate Nearest Neighbor Algorithms
      <http://papers.nips.cc/paper/2666-an-investigation-of-practical-approximate-nearest-neighbor-algorithms>`_.
      Advances in Neural Information Processing Systems pp. 825-832.

    - `Wikipedia - Jaccard distance
      <http://en.wikipedia.org/wiki/Jaccard_index>`_

    - Weighted Jaccard distance: Chierichetti, F., et al. (2010) `Finding the
      Jaccard Median
      <http://theory.stanford.edu/~sergei/papers/soda10-jaccard.pdf>`_.
      Proceedings of the Twenty-First Annual ACM-SIAM Symposium on Discrete
      Algorithms. Society for Industrial and Applied Mathematics.

    - `Wikipedia - Cosine distance
      <http://en.wikipedia.org/wiki/Cosine_similarity>`_

    - `Wikipedia - Levenshtein distance
      <http://en.wikipedia.org/wiki/Levenshtein_distance>`_

    - Locality Sensitive Hashing : Chapter 3 of the book `Mining Massive
      Datasets <http://infolab.stanford.edu/~ullman/mmds/ch3.pdf>`_.

    Examples
    --------
    Construct a nearest neighbors model with automatically determined method
    and distance:

    >>> sf = turicreate.SFrame({'X1': [0.98, 0.62, 0.11],
    ...                       'X2': [0.69, 0.58, 0.36],
    ...                       'str_feature': ['cat', 'dog', 'fossa']})
    >>> model = turicreate.nearest_neighbors.create(sf, features=['X1', 'X2'])

    For datasets with a large number of rows and up to about 100 variables, the
    ball tree method often leads to much faster queries.

    >>> model = turicreate.nearest_neighbors.create(sf, features=['X1', 'X2'],
    ...                                           method='ball_tree')

    Often the final determination of a neighbor is based on several distance
    computations over different sets of features. Each part of this composite
    distance may have a different relative weight.

    >>> my_dist = [[['X1', 'X2'], 'euclidean', 2.],
    ...            [['str_feature'], 'levenshtein', 3.]]
    ...
    >>> model = turicreate.nearest_neighbors.create(sf, distance=my_dist)
    """

    ## Validate the 'dataset' input
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    ## Basic validation of the features input
    if features is not None and not isinstance(features, list):
        raise TypeError("If specified, input 'features' must be a list of " +
                        "strings.")

    ## Clean the method options and create the options dictionary
    allowed_kwargs = ['leaf_size', 'num_tables', 'num_projections_per_table']
    _method_options = {}

    for k, v in kwargs.items():
        if k in allowed_kwargs:
            _method_options[k] = v
        else:
            raise _ToolkitError(
                "'{}' is not a valid keyword argument".format(k) +
                " for the nearest neighbors model. Please " +
                "check for capitalization and other typos.")

    ## Exclude inappropriate combinations of method and distance
    if method == 'ball_tree' and (
            distance == 'cosine' or distance == _turicreate.distances.cosine
            or distance == 'dot_product'
            or distance == _turicreate.distances.dot_product
            or distance == 'transformed_dot_product'
            or distance == _turicreate.distances.transformed_dot_product):
        # The pieces below are concatenated, so each one carries its own
        # separating comma/space.
        raise TypeError(
            "The ball tree method does not work with 'cosine', " +
            "'dot_product', or 'transformed_dot_product' distance. " +
            "Please use the 'brute_force' method for these distances.")

    ## Pick the default number of LSH projections when the caller gave none.
    if method == 'lsh' and ('num_projections_per_table'
                            not in _method_options):
        if distance == 'jaccard' or distance == _turicreate.distances.jaccard:
            _method_options['num_projections_per_table'] = 4
        elif distance == 'cosine' or distance == _turicreate.distances.cosine:
            _method_options['num_projections_per_table'] = 16
        else:
            _method_options['num_projections_per_table'] = 8

    ## Initial validation and processing of the label
    if label is None:
        # No label column given: add a row-number column under a name that
        # does not clash with the existing columns.
        _label = _robust_column_name('__id', dataset.column_names())
        _dataset = dataset.add_row_number(_label)
    else:
        _label = label
        _dataset = _copy.copy(dataset)

    col_type_map = {c: _dataset[c].dtype for c in _dataset.column_names()}
    _validate_row_label(_label, col_type_map)
    ref_labels = _dataset[_label]

    ## Determine the internal list of available feature names (may still include
    #  the row label name).
    if features is None:
        _features = _dataset.column_names()
    else:
        _features = _copy.deepcopy(features)

    ## Check if there's only one feature and it's the same as the row label.
    #  This would also be trapped by the composite distance validation, but the
    #  error message is not very informative for the user.
    free_features = set(_features).difference([_label])
    if len(free_features) < 1:
        raise _ToolkitError("The only available feature is the same as the " +
                            "row label column. Please specify features " +
                            "that are not also row labels.")

    ### Validate and preprocess the distance function
    ### ---------------------------------------------
    # - The form of the 'distance' controls how we interact with the 'features'
    #   parameter as well.
    # - At this point, the row label 'label' may still be in the list(s) of
    #   features.

    ## Convert any distance function input into a single composite distance.
    # distance is already a composite distance
    if isinstance(distance, list):
        distance = _copy.deepcopy(distance)

    # distance is a single name (except 'auto') or function handle.
    elif (hasattr(distance, '__call__')
          or (isinstance(distance, str) and not distance == 'auto')):
        distance = [[_features, distance, 1]]

    # distance is unspecified and needs to be constructed.
    elif distance is None or distance == 'auto':
        sample = _dataset.head()
        distance = _construct_auto_distance(_features, _dataset.column_names(),
                                            _dataset.column_types(), sample)

    else:
        raise TypeError("Input 'distance' not understood. The 'distance' " +
                        "argument must be a string, function handle, or " +
                        "composite distance.")

    ## Basic composite distance validation, remove the row label from all
    #  feature lists, and convert string distance names into distance functions.
    distance = _scrub_composite_distance_features(distance, [_label])
    distance = _convert_distance_names_to_functions(distance)
    _validate_composite_distance(distance)

    ## List-typed columns are only accepted with the distances named below,
    #  and only when every list element is a string.
    list_features_to_check = []
    sparse_distances = [
        'jaccard', 'weighted_jaccard', 'cosine', 'dot_product',
        'transformed_dot_product'
    ]
    sparse_distances = [
        getattr(_turicreate.distances, k) for k in sparse_distances
    ]
    for d in distance:
        feature_names, dist, _ = d
        list_features = [f for f in feature_names if _dataset[f].dtype == list]
        for f in list_features:
            if dist in sparse_distances:
                list_features_to_check.append(f)
            else:
                raise TypeError(
                    "The chosen distance cannot currently be used " +
                    "on list-typed columns.")
    for f in list_features_to_check:
        only_str_lists = _validate_lists(_dataset[f], [str])
        if not only_str_lists:
            raise TypeError("Distances for sparse data, such as jaccard " +
                            "and weighted_jaccard, can only be used on " +
                            "lists containing only strings. Please modify " +
                            "any list features accordingly before creating " +
                            "the nearest neighbors model.")

    ## Raise an error if Levenshtein distance is applied to several columns.
    for d in distance:
        feature_names, dist, _ = d

        if (len(feature_names) > 1) and (dist
                                         == _turicreate.distances.levenshtein):
            raise ValueError(
                "Levenshtein distance cannot be used with multiple " +
                "columns. Please concatenate strings into a single " +
                "column before creating the nearest neighbors model.")

    ## Get the union of feature names and make a clean dataset.
    clean_features = _get_composite_distance_features(distance)
    sf_clean = _tkutl._toolkits_select_columns(_dataset, clean_features)

    ## Decide which method to use
    ## - If more than one distance component (specified either directly or
    #  generated automatically because distance set to 'auto'), then do brute
    #  force.
    if len(distance) > 1:
        _method = 'brute_force'

        # NOTE(review): this message names "ball tree" even when the caller
        # asked for 'lsh' or 'auto'; text kept unchanged.
        if method != 'brute_force' and verbose is True:
            print("Defaulting to brute force instead of ball tree because " +\
                "there are multiple distance components.")

    else:
        if method == 'auto':

            # get the total number of variables. Assume the number of elements in
            # array type columns does not change
            num_variables = sum(
                len(x) if hasattr(x, '__iter__') else 1
                for x in _six.itervalues(sf_clean[0]))

            # flag if all the features in the single composite are of numeric
            # type.
            numeric_type_flag = all(
                x in [int, float, list, array.array]
                for x in sf_clean.column_types())

            ## Conditions necessary for ball tree to work and be worth it
            if ((distance[0][1] in [
                    'euclidean', 'manhattan', _turicreate.distances.euclidean,
                    _turicreate.distances.manhattan
            ]) and numeric_type_flag is True and num_variables <= 200):

                _method = 'ball_tree'

            else:
                _method = 'brute_force'

        else:
            _method = method

    ## Pick the right model name for the method
    if _method == 'ball_tree':
        model_name = 'nearest_neighbors_ball_tree'

    elif _method == 'brute_force':
        model_name = 'nearest_neighbors_brute_force'

    elif _method == 'lsh':
        model_name = 'nearest_neighbors_lsh'

    else:
        raise ValueError(
            "Method must be 'auto', 'ball_tree', 'brute_force', " +
            "or 'lsh'.")

    ## Package the model options
    opts = {}
    opts.update(_method_options)
    opts.update({
        'model_name': model_name,
        'ref_labels': ref_labels,
        'label': label,
        'sf_features': sf_clean,
        'composite_params': distance
    })

    ## Construct the nearest neighbors model
    with QuietProgress(verbose):
        result = _turicreate.extensions._nearest_neighbors.train(opts)

    model_proxy = result['model']
    model = NearestNeighborsModel(model_proxy)

    return model
示例#16
0
    def similarity_graph(self,
                         k=5,
                         radius=None,
                         include_self_edges=False,
                         output_type='SGraph',
                         verbose=True):
        """
        Build the similarity graph of the reference dataset already stored in
        the model. Conceptually this is very close to calling `query` with the
        reference set itself, but this method is optimized for the purpose,
        syntactically simpler, and automatically removes self-edges.

        Parameters
        ----------
        k : int, optional
            Maximum number of neighbors to return for each point in the
            dataset. Setting this to ``None`` deactivates the constraint, so
            that all neighbors are returned within ``radius`` of a given point.

        radius : float, optional
            For a given point, only neighbors within this distance are
            returned. The default is ``None``, in which case the ``k`` nearest
            neighbors are returned for each query point, regardless of
            distance.

        include_self_edges : bool, optional
            For most distance functions, each point in the model's reference
            dataset is its own nearest neighbor. If this parameter is set to
            False, this result is ignored, and the nearest neighbors are
            returned *excluding* the point itself.

        output_type : {'SGraph', 'SFrame'}, optional
            By default, the results are returned in the form of an SGraph,
            where each point in the reference dataset is a vertex and an edge A
            -> B indicates that vertex B is a nearest neighbor of vertex A. If
            'output_type' is set to 'SFrame', the output is in the same form as
            the results of the 'query' method: an SFrame with columns
            indicating the query label (in this case the query data is the same
            as the reference data), reference label, distance between the two
            points, and the rank of the neighbor.

        verbose : bool, optional
            If True, print progress updates and model details.

        Returns
        -------
        out : SFrame or SGraph
            The type of the output object depends on the 'output_type'
            parameter. See the parameter description for more detail.

        Raises
        ------
        ValueError
            If ``k`` is not an integer or is not positive, or if ``radius``
            is not an integer/float or is negative.

        Notes
        -----
        - If both ``k`` and ``radius`` are set to ``None``, each data point is
          matched to the entire dataset. If the reference dataset has
          :math:`n` rows, the output is an SFrame with :math:`n^2` rows (or an
          SGraph with :math:`n^2` edges).

        - For models created with the 'lsh' method, the output similarity graph
          may have fewer vertices than there are data points in the original
          reference set. Because LSH is an approximate method, a query point
          may have fewer than 'k' neighbors. If LSH returns no neighbors at all
          for a query and self-edges are excluded, the query point is omitted
          from the results.

        Examples
        --------
        First construct an SFrame and create a nearest neighbors model:

        >>> sf = turicreate.SFrame({'x1': [0.98, 0.62, 0.11],
        ...                       'x2': [0.69, 0.58, 0.36]})
        ...
        >>> model = turicreate.nearest_neighbors.create(sf, distance='euclidean')

        Unlike the ``query`` method, there is no need for a second dataset with
        ``similarity_graph``.

        >>> g = model.similarity_graph(k=1)  # an SGraph
        >>> g.edges
        +----------+----------+----------------+------+
        | __src_id | __dst_id |    distance    | rank |
        +----------+----------+----------------+------+
        |    0     |    1     | 0.376430604494 |  1   |
        |    2     |    1     | 0.55542776308  |  1   |
        |    1     |    0     | 0.376430604494 |  1   |
        +----------+----------+----------------+------+
        """
        # Validate 'k'; a missing value is encoded as the sentinel -1.
        if k is None:
            neighbor_limit = -1
        elif not isinstance(k, int):
            raise ValueError("Input 'k' must be an integer.")
        elif k <= 0:
            raise ValueError("Input 'k' must be larger than 0.")
        else:
            neighbor_limit = k

        # Validate 'radius'; a missing value is encoded as the sentinel -1.0.
        if radius is None:
            distance_limit = -1.0
        elif not isinstance(radius, (int, float)):
            raise ValueError("Input 'radius' must be an integer or float.")
        elif radius < 0:
            raise ValueError("Input 'radius' must be non-negative.")
        else:
            distance_limit = radius

        request = {
            'model': self.__proxy__,
            'model_name': self.__name__,
            'k': neighbor_limit,
            'radius': distance_limit,
            'include_self_edges': include_self_edges
        }

        with QuietProgress(verbose):
            response = _turicreate.extensions._nearest_neighbors.similarity_graph(
                request)

        neighbors = response['neighbors']

        # Anything other than "SFrame" yields the SGraph form.
        if output_type != "SFrame":
            return _SGraph(edges=neighbors,
                           src_field='query_label',
                           dst_field='reference_label')

        return neighbors
示例#17
0
def create(graph,
           label_field,
           threshold=1e-3,
           weight_field='',
           self_weight=1.0,
           undirected=False,
           max_iterations=None,
           _single_precision=False,
           _distributed='auto',
           verbose=True):
    """
    Run the "label propagation" algorithm on a weighted graph in which only
    some vertices carry an observed class label, and infer label
    probabilities for the remaining (unobserved) vertices.

    At every iteration the label probability of a vertex is replaced by a
    weighted sum of its own label probability and those of its neighbors;
    iteration continues until convergence. See
    :class:`turicreate.label_propagation.LabelPropagationModel` for the
    details of the algorithm.

    Notes: label propagation works well with a small number of labels, i.e.
    binary labels, or fewer than 1000 classes. The toolkit will throw an
    error if the number of classes exceeds the maximum value (1000).

    Parameters
    ----------
    graph : SGraph
        The graph on which to compute the label propagation.

    label_field : str
        Vertex field storing the initial vertex labels. The column must be
        integer typed and its values must lie in [0, num_classes). None
        values indicate unobserved vertex labels.

    threshold : float, optional
        Threshold for convergence, measured in the average L2 norm
        (the sum of squared values) of the delta of each vertex's
        label probability vector.

    weight_field : str, optional
        Name of the field holding the edge weights. If empty, all edges are
        assumed to have unit weight.

    self_weight : float, optional
        The weight for the self edge.

    undirected : bool, optional
        If true, treat each edge as undirected and propagate labels in
        both directions.

    max_iterations : int, optional
        The maximum number of iterations to run. Default is unlimited.
        If set, the algorithm terminates when either max_iterations
        or the convergence threshold is reached.

    _single_precision : bool, optional
        If true, run label propagation in single precision. The resulting
        probability values may be less accurate, but the computation should
        run faster and use less memory.

    _distributed : distributed environment, internal
        Accepted by the signature; this function does not forward it to
        the toolkit call.

    verbose : bool, optional
        If True, print progress updates.

    Returns
    -------
    out : LabelPropagationModel

    Raises
    ------
    TypeError
        If ``graph`` is not an SGraph, or if the ``label_field`` vertex
        column is not integer typed.

    References
    ----------
    - Zhu, X., & Ghahramani, Z. (2002). `Learning from labeled and unlabeled data
      with label propagation <http://www.cs.cmu.edu/~zhuxj/pub/CMU-CALD-02-107.pdf>`_.

    Examples
    --------
    If given an :class:`~turicreate.SGraph` ``g``, we can create
    a :class:`~turicreate.label_propagation.LabelPropagationModel` as follows:

    >>> g = turicreate.load_sgraph('http://snap.stanford.edu/data/email-Enron.txt.gz',
    ...                         format='snap')
    >>> # Initialize random classes for a subset of vertices.
    >>> # Leave the unobserved vertices with a None label.
    >>> import random
    >>> def init_label(vid):
    ...     x = random.random()
    ...     if x < 0.2:
    ...         return 0
    ...     elif x > 0.9:
    ...         return 1
    ...     else:
    ...         return None
    >>> g.vertices['label'] = g.vertices['__id'].apply(init_label, int)
    >>> m = turicreate.label_propagation.create(g, label_field='label')

    We can obtain for each vertex the predicted label and the probability of
    each label in the graph ``g`` using:

    >>> labels = m['labels']     # SFrame
    >>> labels
    +------+-------+-----------------+-------------------+----------------+
    | __id | label | predicted_label |         P0        |       P1       |
    +------+-------+-----------------+-------------------+----------------+
    |  5   |   1   |        1        |        0.0        |      1.0       |
    |  7   |  None |        0        |    0.8213214997   |  0.1786785003  |
    |  8   |  None |        1        | 5.96046447754e-08 | 0.999999940395 |
    |  10  |  None |        0        |   0.534984718273  | 0.465015281727 |
    |  27  |  None |        0        |   0.752801638549  | 0.247198361451 |
    |  29  |  None |        1        | 5.96046447754e-08 | 0.999999940395 |
    |  33  |  None |        1        | 5.96046447754e-08 | 0.999999940395 |
    |  47  |   0   |        0        |        1.0        |      0.0       |
    |  50  |  None |        0        |   0.788279032657  | 0.211720967343 |
    |  52  |  None |        0        |   0.666666666667  | 0.333333333333 |
    +------+-------+-----------------+-------------------+----------------+
    [36692 rows x 5 columns]

    See Also
    --------
    LabelPropagationModel
    """
    from turicreate._cython.cy_server import QuietProgress

    # Both field names have to be strings before they are used for lookups.
    for field_name in (label_field, weight_field):
        _raise_error_if_not_of_type(field_name, str)

    if not isinstance(graph, _SGraph):
        raise TypeError('graph input must be a SGraph object.')

    label_dtype = graph.vertices[label_field].dtype
    if label_dtype != int:
        raise TypeError('label_field %s must be integer typed.' % label_field)

    # Options handed to the toolkit; `_distributed` is intentionally absent,
    # matching the set of keys this function has always sent.
    toolkit_options = dict(
        label_field=label_field,
        threshold=threshold,
        weight_field=weight_field,
        self_weight=self_weight,
        undirected=undirected,
        max_iterations=max_iterations,
        single_precision=_single_precision,
        graph=graph.__proxy__,
    )

    # Progress output from the toolkit call is governed by `verbose`.
    with QuietProgress(verbose):
        toolkit_result = _tc.extensions._toolkits.graph.label_propagation.create(
            toolkit_options)

    return LabelPropagationModel(toolkit_result['model'])