Example No. 1
def create_regression_with_model_selector(dataset,
                                          target,
                                          model_selector,
                                          features=None,
                                          validation_set='auto',
                                          verbose=True):
    """
    Create a :class:`~turicreate.toolkits.SupervisedLearningModel`.

    This is a generic function that allows you to create any model that
    implements SupervisedLearningModel. This function is normally not called
    directly; use a specific model's create function instead.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    target : string
        Name of the column containing the target variable. The values in this
        column must be numeric (integer or float).

    model_selector : function
        Function that, given an SFrame of feature columns, returns the name of
        the model to create.

    features : list[string], optional
        Names of the feature columns used for training. If None, all columns
        except the target are used.

    validation_set : SFrame or 'auto', optional
        Dataset for monitoring the model's generalization performance. When set
        to 'auto', a validation set is chosen automatically.

    verbose : boolean, optional
        If True, print progress messages during training.

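    Examples
    --------
    A minimal sketch with a hypothetical selector and placeholder column names;
    the model-name string returned below is only illustrative, since the names
    accepted by ``create_selected`` are internal to the toolkit:

    >>> def always_linear(feature_sframe):
    ...     return 'regression_linear_regression'
    >>> model = create_regression_with_model_selector(
    ...     data, target='price', model_selector=always_linear)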
    """

    # Error checking
    _raise_error_if_not_sframe(dataset, "training dataset")
    if features is None:
        features = dataset.column_names()
        if target in features:
            features.remove(target)
    if not hasattr(features, '__iter__'):
        raise TypeError("Input 'features' must be a list.")
    for x in features:
        if not isinstance(x, str):
            raise TypeError(
                "Invalid feature %s: Feature names must be of type str" % x)

    # Sample the data
    features_sframe = _toolkits_select_columns(dataset, features)
    if features_sframe.num_rows() > 1e5:
        fraction = 1.0 * 1e5 / features_sframe.num_rows()
        features_sframe = features_sframe.sample(fraction, seed=0)

    # Run the model selector.
    selected_model_name = model_selector(features_sframe)
    model = create_selected(selected_model_name, dataset, target, features,
                            validation_set, verbose)

    return model
Example No. 2
def create(dataset,
           label=None,
           features=None,
           distance=None,
           method='auto',
           verbose=True,
           **kwargs):
    """
    Create a nearest neighbor model, which can be searched efficiently and
    quickly for the nearest neighbors of a query observation. If the `method`
    argument is specified as `auto`, the type of model is chosen automatically
    based on the type of data in `dataset`.

    .. warning::

        The 'dot_product' distance is deprecated and will be removed in future
        versions of Turi Create. Please use 'transformed_dot_product'
        distance instead, although note that this is more than a name change;
        it is a *different* transformation of the dot product of two vectors.
        Please see the distances module documentation for more details.

    Parameters
    ----------
    dataset : SFrame
        Reference data. If the features for each observation are numeric, they
        may be in separate columns of 'dataset' or a single column with lists
        of values. The features may also be in the form of a column of sparse
        vectors (i.e. dictionaries), with string keys and numeric values.

    label : string, optional
        Name of the SFrame column with row labels. If 'label' is not specified,
        row numbers are used to identify reference dataset rows when the model
        is queried.

    features : list[string], optional
        Name of the columns with features to use in computing distances between
        observations and the query points. 'None' (the default) indicates that
        all columns except the label should be used as features. Each column
        can be one of the following types:

        - *Numeric*: values of numeric type integer or float.

        - *Array*: list of numeric (integer or float) values. Each list element
          is treated as a separate variable in the model.

        - *Dictionary*: key-value pairs with numeric (integer or float) values.
          Each key indicates a separate variable in the model.

        - *List*: list of integer or string values. Each element is treated as
          a separate variable in the model.

        - *String*: string values.

        Please note: if a composite distance is also specified, this parameter
        is ignored.

    distance : string, function, or list[list], optional
        Function to measure the distance between any two input data rows. This
        may be one of three types:

        - *String*: the name of a standard distance function. One of
          'euclidean', 'squared_euclidean', 'manhattan', 'levenshtein',
          'jaccard', 'weighted_jaccard', 'cosine', 'dot_product' (deprecated),
          or 'transformed_dot_product'.

        - *Function*: a function handle from the
          :mod:`~turicreate.toolkits.distances` module.

        - *Composite distance*: the weighted sum of several standard distance
          functions applied to various features. This is specified as a list of
          distance components, each of which is itself a list containing three
          items:

          1. list or tuple of feature names (strings)

          2. standard distance name (string)

          3. scaling factor (int or float)

        For more information about Turi Create distance functions, please
        see the :py:mod:`~turicreate.toolkits.distances` module.

        If 'distance' is left unspecified or set to 'auto', a composite
        distance is constructed automatically based on feature types.

    method : {'auto', 'ball_tree', 'brute_force', 'lsh'}, optional
        Method for computing nearest neighbors. The options are:

        - *auto* (default): the method is chosen automatically, based on the
          type of data and the distance. If the distance is 'manhattan' or
          'euclidean' and the features are numeric or vectors of numeric
          values, then the 'ball_tree' method is used. Otherwise, the
          'brute_force' method is used.

        - *ball_tree*: use a tree structure to find the k-closest neighbors to
          each query point. The ball tree model is slower to construct than the
          brute force model, but queries are faster than linear time. This
          method is not applicable for the cosine and dot product distances.
          See `Liu, et al (2004)
          <http://papers.nips.cc/paper/2666-an-investigation-of-practical-approximate-nearest-neighbor-algorithms>`_
          for implementation details.

        - *brute_force*: compute the distance from a query point to all
          reference observations. There is no computation time for model
          creation with the brute force method (although the reference data is
          held in the model), but each query takes linear time.

        - *lsh*: use Locality Sensitive Hashing (LSH) to find approximate
          nearest neighbors efficiently. The LSH model supports 'euclidean',
          'squared_euclidean', 'manhattan', 'cosine', 'jaccard', 'dot_product'
          (deprecated), and 'transformed_dot_product' distances. Two options
          are provided for LSH -- ``num_tables`` and
          ``num_projections_per_table``. See the notes below for details.

    verbose: bool, optional
        If True, print progress updates and model details.

    **kwargs : optional
        Options for the distance function and query method.

        - *leaf_size*: for the ball tree method, the number of points in each
          leaf of the tree. The default is to use the max of 1,000 and
          n/(2^11), which ensures a maximum tree depth of 12.

        - *num_tables*: For the LSH method, the number of hash tables
          constructed. The default value is 20. We recommend choosing values
          from 10 to 30.

        - *num_projections_per_table*: For the LSH method, the number of
          projections/hash functions for each hash table. The default value is
          4 for 'jaccard' distance, 16 for 'cosine' distance, and 8 for other
          distances. We recommend using 2 to 6 projections for 'jaccard'
          distance, 8 to 20 for 'cosine' distance, and 4 to 12 for other
          distances.

    Returns
    -------
    out : NearestNeighborsModel
        A structure for efficiently computing the nearest neighbors in 'dataset'
        of new query points.

    See Also
    --------
    NearestNeighborsModel.query, turicreate.toolkits.distances

    Notes
    -----
    - Missing data is not allowed in the 'dataset' provided to this function.
      Please use the :func:`turicreate.SFrame.fillna` and
      :func:`turicreate.SFrame.dropna` utilities to handle missing data before
      creating a nearest neighbors model.

    - Missing keys in sparse vectors are assumed to have value 0.

    - The `composite_params` parameter was removed as of Turi Create
      version 1.5. The `distance` parameter now accepts either standard or
      composite distances. Please see the :mod:`~turicreate.toolkits.distances`
      module documentation for more information on composite distances.

    - If the features should be weighted equally in the distance calculations
      but are measured on different scales, it is important to standardize the
      features. One way to do this is to subtract the mean of each column and
      divide by the standard deviation.

    **Locality Sensitive Hashing (LSH)**

    There are several efficient nearest neighbors search algorithms that work
    well for data with low dimensions :math:`d` (approximately 50). However,
    most of the solutions suffer from either space or query time that is
    exponential in :math:`d`. For large :math:`d`, they often provide little,
    if any, improvement over the 'brute_force' method. This is a well-known
    consequence of the phenomenon called `The Curse of Dimensionality`.

    `Locality Sensitive Hashing (LSH)
    <https://en.wikipedia.org/wiki/Locality-sensitive_hashing>`_ is an approach
    that is designed to efficiently solve the *approximate* nearest neighbor
    search problem for high dimensional data. The key idea of LSH is to hash
    the data points using several hash functions, so that the probability of
    collision is much higher for data points which are close to each other than
    those which are far apart.

    An LSH family is a family of functions :math:`h` which map points from the
    metric space to a bucket, so that

    - if :math:`d(p, q) \\leq R`, then :math:`h(p) = h(q)` with at least probability :math:`p_1`.
    - if :math:`d(p, q) \\geq cR`, then :math:`h(p) = h(q)` with probability at most :math:`p_2`.

    LSH for efficient approximate nearest neighbor search:

    - We define a new family of hash functions :math:`g`, where each
      function :math:`g` is obtained by concatenating :math:`k` functions
      :math:`h_1, ..., h_k`, i.e., :math:`g(p)=[h_1(p),...,h_k(p)]`.
      The algorithm constructs :math:`L` hash tables, each of which
      corresponds to a different randomly chosen hash function :math:`g`.
      There are :math:`k \\cdot L` hash functions used in total.

    - In the preprocessing step, we hash all :math:`n` reference points
      into each of the :math:`L` hash tables.

    - Given a query point :math:`q`, the algorithm iterates over the
      :math:`L` hash functions :math:`g`. For each :math:`g` considered, it
      retrieves the data points that are hashed into the same bucket as q.
      These data points from all the :math:`L` hash tables are considered as
      candidates that are then re-ranked by their real distances with the query
      data.

    **Note** that the number of tables :math:`L` and the number of hash
    functions per table :math:`k` are two main parameters. They can be set
    using the options ``num_tables`` and ``num_projections_per_table``
    respectively.

    Hash functions for different distances:

    - `euclidean` and `squared_euclidean`:
      :math:`h(q) = \\lfloor \\frac{a \\cdot q + b}{r} \\rfloor` where
      :math:`a` is a vector whose elements are independently sampled from a
      normal distribution, and :math:`b` is a number uniformly sampled from
      :math:`[0, r]`. :math:`r` is a parameter for the bucket width. We set
      :math:`r` using the average all-pair `euclidean` distance from a small
      randomly sampled subset of the reference data.

    - `manhattan`: The hash function for `manhattan` is similar to that of
      `euclidean`. The only difference is that the elements of `a` are sampled
      from a Cauchy distribution instead of a normal distribution.

    - `cosine`: Random Projection is designed to approximate the cosine
      distance between vectors. The hash function is :math:`h(q) = sgn(a \\cdot
      q)`, where :math:`a` is a randomly sampled normal unit vector.

    - `jaccard`: We use one permutation hashing, a method recently proposed by
      Shrivastava and Li. See the paper `[Shrivastava and Li, UAI 2014]
      <http://www.auai.org/uai2014/proceedings/individuals/225.pdf>`_ for
      details.

    - `dot_product`: The reference data points are first transformed to
      fixed-norm vectors, and then the minimum `dot_product` distance search
      problem can be solved via finding the reference data with smallest
      `cosine` distances. See the paper `[Neyshabur and Srebro, ICML 2015]
      <http://proceedings.mlr.press/v37/neyshabur15.html>`_ for details.
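
    As a toy illustration of the `euclidean` hash family above (not the
    toolkit's implementation; the dimension and bucket width are arbitrary):

    >>> import numpy as np
    >>> d, r = 10, 4.0
    >>> a = np.random.normal(size=d)    # random projection direction
    >>> b = np.random.uniform(0, r)     # random offset in [0, r)
    >>> h = lambda q: int(np.floor((np.dot(a, q) + b) / r))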

    References
    ----------
    - `Wikipedia - nearest neighbor
      search <http://en.wikipedia.org/wiki/Nearest_neighbor_search>`_

    - `Wikipedia - ball tree <http://en.wikipedia.org/wiki/Ball_tree>`_

    - Ball tree implementation: Liu, T., et al. (2004) `An Investigation of
      Practical Approximate Nearest Neighbor Algorithms
      <http://papers.nips.cc/paper/2666-an-investigation-of-practical-approximate-nearest-neighbor-algorithms>`_.
      Advances in Neural Information Processing Systems pp. 825-832.

    - `Wikipedia - Jaccard distance
      <http://en.wikipedia.org/wiki/Jaccard_index>`_

    - Weighted Jaccard distance: Chierichetti, F., et al. (2010) `Finding the
      Jaccard Median
      <http://theory.stanford.edu/~sergei/papers/soda10-jaccard.pdf>`_.
      Proceedings of the Twenty-First Annual ACM-SIAM Symposium on Discrete
      Algorithms. Society for Industrial and Applied Mathematics.

    - `Wikipedia - Cosine distance
      <http://en.wikipedia.org/wiki/Cosine_similarity>`_

    - `Wikipedia - Levenshtein distance
      <http://en.wikipedia.org/wiki/Levenshtein_distance>`_

    - Locality Sensitive Hashing : Chapter 3 of the book `Mining Massive
      Datasets <http://infolab.stanford.edu/~ullman/mmds/ch3.pdf>`_.

    Examples
    --------
    Construct a nearest neighbors model with automatically determined method
    and distance:

    >>> sf = turicreate.SFrame({'X1': [0.98, 0.62, 0.11],
    ...                       'X2': [0.69, 0.58, 0.36],
    ...                       'str_feature': ['cat', 'dog', 'fossa']})
    >>> model = turicreate.nearest_neighbors.create(sf, features=['X1', 'X2'])

    For datasets with a large number of rows and up to about 100 variables, the
    ball tree method often leads to much faster queries.

    >>> model = turicreate.nearest_neighbors.create(sf, features=['X1', 'X2'],
    ...                                           method='ball_tree')

    Often the final determination of a neighbor is based on several distance
    computations over different sets of features. Each part of this composite
    distance may have a different relative weight.

    >>> my_dist = [[['X1', 'X2'], 'euclidean', 2.],
    ...            [['str_feature'], 'levenshtein', 3.]]
    ...
    >>> model = turicreate.nearest_neighbors.create(sf, distance=my_dist)
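
    The LSH method returns approximate neighbors and is useful for
    high-dimensional data. Its speed/accuracy trade-off is controlled by the
    ``num_tables`` and ``num_projections_per_table`` options described above
    (the values below are illustrative only):

    >>> model = turicreate.nearest_neighbors.create(sf, features=['X1', 'X2'],
    ...                                             method='lsh',
    ...                                             num_tables=20,
    ...                                             num_projections_per_table=8)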
    """

    ## Validate the 'dataset' input
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    ## Basic validation of the features input
    if features is not None and not isinstance(features, list):
        raise TypeError("If specified, input 'features' must be a list of " +
                        "strings.")

    ## Clean the method options and create the options dictionary
    allowed_kwargs = ['leaf_size', 'num_tables', 'num_projections_per_table']
    _method_options = {}

    for k, v in kwargs.items():
        if k in allowed_kwargs:
            _method_options[k] = v
        else:
            raise _ToolkitError(
                "'{}' is not a valid keyword argument".format(k) +
                " for the nearest neighbors model. Please " +
                "check for capitalization and other typos.")

    ## Exclude inappropriate combinations of method and distance
    if method == 'ball_tree' and (
            distance == 'cosine' or distance == _turicreate.distances.cosine
            or distance == 'dot_product'
            or distance == _turicreate.distances.dot_product
            or distance == 'transformed_dot_product'
            or distance == _turicreate.distances.transformed_dot_product):
        raise TypeError(
            "The ball tree method does not work with 'cosine', " +
            "'dot_product', or 'transformed_dot_product' distance. " +
            "Please use the 'brute_force' method for these distances.")

    if method == 'lsh' and ('num_projections_per_table'
                            not in _method_options):
        if distance == 'jaccard' or distance == _turicreate.distances.jaccard:
            _method_options['num_projections_per_table'] = 4
        elif distance == 'cosine' or distance == _turicreate.distances.cosine:
            _method_options['num_projections_per_table'] = 16
        else:
            _method_options['num_projections_per_table'] = 8

    ## Initial validation and processing of the label
    if label is None:
        _label = _robust_column_name('__id', dataset.column_names())
        _dataset = dataset.add_row_number(_label)
    else:
        _label = label
        _dataset = _copy.copy(dataset)

    col_type_map = {c: _dataset[c].dtype for c in _dataset.column_names()}
    _validate_row_label(_label, col_type_map)
    ref_labels = _dataset[_label]

    ## Determine the internal list of available feature names (may still include
    #  the row label name).
    if features is None:
        _features = _dataset.column_names()
    else:
        _features = _copy.deepcopy(features)

    ## Check if there's only one feature and it's the same as the row label.
    #  This would also be trapped by the composite distance validation, but the
    #  error message is not very informative for the user.
    free_features = set(_features).difference([_label])
    if len(free_features) < 1:
        raise _ToolkitError("The only available feature is the same as the " +
                            "row label column. Please specify features " +
                            "that are not also row labels.")

    ### Validate and preprocess the distance function
    ### ---------------------------------------------
    # - The form of the 'distance' controls how we interact with the 'features'
    #   parameter as well.
    # - At this point, the row label 'label' may still be in the list(s) of
    #   features.

    ## Convert any distance function input into a single composite distance.
    # distance is already a composite distance
    if isinstance(distance, list):
        distance = _copy.deepcopy(distance)

    # distance is a single name (except 'auto') or function handle.
    elif (hasattr(distance, '__call__')
          or (isinstance(distance, str) and not distance == 'auto')):
        distance = [[_features, distance, 1]]

    # distance is unspecified and needs to be constructed.
    elif distance is None or distance == 'auto':
        sample = _dataset.head()
        distance = _construct_auto_distance(_features, _dataset.column_names(),
                                            _dataset.column_types(), sample)

    else:
        raise TypeError("Input 'distance' not understood. The 'distance' " +
                        "argument must be a string, function handle, or " +
                        "composite distance.")

    ## Basic composite distance validation, remove the row label from all
    #  feature lists, and convert string distance names into distance functions.
    distance = _scrub_composite_distance_features(distance, [_label])
    distance = _convert_distance_names_to_functions(distance)
    _validate_composite_distance(distance)

    ## Raise an error if a non-sparse distance is applied to list-typed features
    list_features_to_check = []
    sparse_distances = [
        'jaccard', 'weighted_jaccard', 'cosine', 'dot_product',
        'transformed_dot_product'
    ]
    sparse_distances = [
        getattr(_turicreate.distances, k) for k in sparse_distances
    ]
    for d in distance:
        feature_names, dist, _ = d
        list_features = [f for f in feature_names if _dataset[f].dtype == list]
        for f in list_features:
            if dist in sparse_distances:
                list_features_to_check.append(f)
            else:
                raise TypeError(
                    "The chosen distance cannot currently be used " +
                    "on list-typed columns.")
    for f in list_features_to_check:
        only_str_lists = _validate_lists(_dataset[f], [str])
        if not only_str_lists:
            raise TypeError("Distances for sparse data, such as jaccard " +
                            "and weighted_jaccard, can only be used on " +
                            "lists containing only strings. Please modify " +
                            "any list features accordingly before creating " +
                            "the nearest neighbors model.")

    ## Raise an error if any component applies Levenshtein distance to multiple columns
    for d in distance:
        feature_names, dist, _ = d

        if (len(feature_names) > 1) and (dist
                                         == _turicreate.distances.levenshtein):
            raise ValueError(
                "Levenshtein distance cannot be used with multiple " +
                "columns. Please concatenate strings into a single " +
                "column before creating the nearest neighbors model.")

    ## Get the union of feature names and make a clean dataset.
    clean_features = _get_composite_distance_features(distance)
    sf_clean = _tkutl._toolkits_select_columns(_dataset, clean_features)

    ## Decide which method to use
    ## - If there is more than one distance component (specified directly or
    #    generated automatically because distance was set to 'auto'), use
    #    brute force.
    if len(distance) > 1:
        _method = 'brute_force'

        if method != 'brute_force' and verbose is True:
            print("Defaulting to brute force instead of ball tree because " +\
                "there are multiple distance components.")

    else:
        if method == 'auto':

            # get the total number of variables. Assume the number of elements in
            # array type columns does not change
            num_variables = sum([
                len(x) if hasattr(x, '__iter__') else 1
                for x in _six.itervalues(sf_clean[0])
            ])

            # flag if all the features in the single composite are of numeric
            # type.
            numeric_type_flag = all([
                x in [int, float, list, array.array]
                for x in sf_clean.column_types()
            ])

            ## Conditions necessary for ball tree to work and be worth it
            if ((distance[0][1] in [
                    'euclidean', 'manhattan', _turicreate.distances.euclidean,
                    _turicreate.distances.manhattan
            ]) and numeric_type_flag is True and num_variables <= 200):

                _method = 'ball_tree'

            else:
                _method = 'brute_force'

        else:
            _method = method

    ## Pick the right model name for the method
    if _method == 'ball_tree':
        model_name = 'nearest_neighbors_ball_tree'

    elif _method == 'brute_force':
        model_name = 'nearest_neighbors_brute_force'

    elif _method == 'lsh':
        model_name = 'nearest_neighbors_lsh'

    else:
        raise ValueError(
            "Method must be 'auto', 'ball_tree', 'brute_force', " +
            "or 'lsh'.")

    ## Package the model options
    opts = {}
    opts.update(_method_options)
    opts.update({
        'model_name': model_name,
        'ref_labels': ref_labels,
        'label': label,
        'sf_features': sf_clean,
        'composite_params': distance
    })

    ## Construct the nearest neighbors model
    with QuietProgress(verbose):
        result = _turicreate.extensions._nearest_neighbors.train(opts)

    model_proxy = result['model']
    model = NearestNeighborsModel(model_proxy)

    return model
Example No. 3
    def query(self, dataset, label=None, k=5, radius=None, verbose=True):
        """
        For each row of the input 'dataset', retrieve the nearest neighbors
        from the model's stored data. In general, the query dataset does not
        need to be the same as the reference data stored in the model, but if
        it is, the 'include_self_edges' parameter can be set to False to
        exclude results that match query points to themselves.

        Parameters
        ----------
        dataset : SFrame
            Query data. Must contain columns with the same names and types as
            the features used to train the model. Additional columns are
            allowed, but ignored. Please see the nearest neighbors
            :func:`~turicreate.nearest_neighbors.create` documentation for more
            detail on allowable data types.

        label : str, optional
            Name of the query SFrame column with row labels. If 'label' is not
            specified, row numbers are used to identify query dataset rows in
            the output SFrame.

        k : int, optional
            Number of nearest neighbors to return from the reference set for
            each query observation. The default is 5 neighbors, but setting it
            to ``None`` will return all neighbors within ``radius`` of the
            query point.

        radius : float, optional
            Only neighbors whose distance to a query point is smaller than this
            value are returned. The default is ``None``, in which case the
            ``k`` nearest neighbors are returned for each query point,
            regardless of distance.

        verbose: bool, optional
            If True, print progress updates and model details.

        Returns
        -------
        out : SFrame
            An SFrame with the k-nearest neighbors of each query observation.
            The result contains four columns: the first is the label of the
            query observation, the second is the label of the nearby reference
            observation, the third is the distance between the query and
            reference observations, and the fourth is the rank of the reference
            observation among the query's k-nearest neighbors.

        See Also
        --------
        similarity_graph

        Notes
        -----
        - The `dataset` input to this method *can* have missing values (in
          contrast to the reference dataset used to create the nearest
          neighbors model). Missing numeric values are imputed to be the mean
          of the corresponding feature in the reference dataset, and missing
          strings are imputed to be empty strings.

        - If both ``k`` and ``radius`` are set to ``None``, each query point
          returns all of the reference set. If the reference dataset has
          :math:`n` rows and the query dataset has :math:`m` rows, the output
          is an SFrame with :math:`nm` rows.

        - For models created with the 'lsh' method, the query results may have
          fewer query labels than input query points. Because LSH is an
          approximate method, a query point may have fewer than 'k' neighbors.
          If LSH returns no neighbors at all for a query, the query point is
          omitted from the results.

        Examples
        --------
        First construct a toy SFrame and create a nearest neighbors model:

        >>> sf = turicreate.SFrame({'label': range(3),
        ...                       'feature1': [0.98, 0.62, 0.11],
        ...                       'feature2': [0.69, 0.58, 0.36]})
        >>> model = turicreate.nearest_neighbors.create(sf, 'label')

        A new SFrame contains query observations with the same schema as the
        reference SFrame. This SFrame is passed to the ``query`` method.

        >>> queries = turicreate.SFrame({'label': range(3),
        ...                            'feature1': [0.05, 0.61, 0.99],
        ...                            'feature2': [0.06, 0.97, 0.86]})
        >>> model.query(queries, 'label', k=2)
        +-------------+-----------------+----------------+------+
        | query_label | reference_label |    distance    | rank |
        +-------------+-----------------+----------------+------+
        |      0      |        2        | 0.305941170816 |  1   |
        |      0      |        1        | 0.771556867638 |  2   |
        |      1      |        1        | 0.390128184063 |  1   |
        |      1      |        0        | 0.464004310325 |  2   |
        |      2      |        0        | 0.170293863659 |  1   |
        |      2      |        1        | 0.464004310325 |  2   |
        +-------------+-----------------+----------------+------+
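
        To retrieve all reference points within a fixed distance of each query
        point, rather than a fixed number of neighbors, set ``k=None`` and pass
        a ``radius`` (the threshold here is illustrative):

        >>> model.query(queries, 'label', k=None, radius=0.5)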
        """

        ## Validate the 'dataset' input
        _tkutl._raise_error_if_not_sframe(dataset, "dataset")
        _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

        ## Get model features
        ref_features = self.features
        sf_features = _tkutl._toolkits_select_columns(dataset, ref_features)

        ## Validate and preprocess the 'label' input
        if label is None:
            query_labels = _turicreate.SArray.from_sequence(len(dataset))

        else:
            if label not in dataset.column_names():
                raise ValueError(
                    "Input 'label' must be a string matching the name of a " +
                    "column in the query SFrame 'dataset'.")

            if dataset[label].dtype not in (str, int):
                raise TypeError(
                    "The label column must contain integers or strings.")

            if label in ref_features:
                raise ValueError(
                    "The label column cannot be one of the features.")

            query_labels = dataset[label]

        ## Validate neighborhood parameters 'k' and 'radius'
        if k is not None:
            if not isinstance(k, int):
                raise ValueError("Input 'k' must be an integer.")

            if k <= 0:
                raise ValueError("Input 'k' must be larger than 0.")

        if radius is not None:
            if not isinstance(radius, (int, float)):
                raise ValueError("Input 'radius' must be an integer or float.")

            if radius < 0:
                raise ValueError("Input 'radius' must be non-negative.")

        ## Set k and radius to special values to indicate 'None'
        if k is None:
            k = -1

        if radius is None:
            radius = -1.0

        opts = {
            'model': self.__proxy__,
            'model_name': self.__name__,
            'features': sf_features,
            'query_labels': query_labels,
            'k': k,
            'radius': radius
        }

        with QuietProgress(verbose):
            result = _turicreate.extensions._nearest_neighbors.query(opts)

        return result['neighbors']
Example No. 4
def create(dataset, session_id, target, features=None, prediction_window=100,
           validation_set='auto', max_iterations=10, batch_size=32, verbose=True):
    """
    Create an :class:`ActivityClassifier` model.

    Parameters
    ----------
    dataset : SFrame
        Input data which consists of `sessions` of data where each session is
        a sequence of data. The data must be in `stacked` format, grouped by
        session. Within each session, the data is assumed to be sorted
        temporally. Columns in `features` will be used to train a model that
        will make a prediction using labels in the `target` column.

    session_id : string
        Name of the column that contains a unique ID for each session.

    target : string
        Name of the column containing the target variable. The values in this
        column must be of string or integer type. Use `model.classes` to
        retrieve the order in which the classes are mapped.

    features : list[string], optional
        Name of the columns containing the input features that will be used
        for classification. If set to `None`, all columns except `session_id`
        and `target` will be used.

    prediction_window : int, optional
        Number of time units between predictions. For example, if your input
        data is sampled at 100Hz, and the `prediction_window` is set to 100,
        then this model will make a prediction every 1 second.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance to
        prevent the model from overfitting to the training data.

        For each row of the progress table, accuracy is measured over the
        provided training dataset and the `validation_set`. The format of this
        SFrame must be the same as the training set.

        When set to 'auto', a validation set is automatically sampled from the
        training data (if the training data has > 100 sessions). If
        validation_set is set to None, then all the data will be used for
        training.

    max_iterations : int , optional
        Maximum number of iterations/epochs made over the data during the
        training phase.

    batch_size : int, optional
        Number of sequence chunks used per training step. Must be greater than
        the number of GPUs in use.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : ActivityClassifier
        A trained :class:`ActivityClassifier` model.

    Examples
    --------
    .. sourcecode:: python

        >>> import turicreate as tc

        # Training on dummy data
        >>> data = tc.SFrame({
        ...    'accelerometer_x': [0.1, 0.2, 0.3, 0.4, 0.5] * 10,
        ...    'accelerometer_y': [0.5, 0.4, 0.3, 0.2, 0.1] * 10,
        ...    'accelerometer_z': [0.01, 0.01, 0.02, 0.02, 0.01] * 10,
        ...    'session_id': [0, 0, 0] * 10 + [1, 1] * 10,
        ...    'activity': ['walk', 'run', 'run'] * 10 + ['swim', 'swim'] * 10
        ... })

        # Create an activity classifier
        >>> model = tc.activity_classifier.create(data,
        ...     session_id='session_id', target='activity',
        ...     features=['accelerometer_x', 'accelerometer_y', 'accelerometer_z'])

        # Make predictions (as probability vector, or class)
        >>> predictions = model.predict(data)
        >>> predictions = model.predict(data, output_type='probability_vector')

        # Get both predictions and classes together
        >>> predictions = model.classify(data)

        # Get top-k predictions (instead of only the top one) if your labels
        # have more than 2 classes
        >>> predictions = model.predict_topk(data, k = 3)

        # Evaluate the model
        >>> results = model.evaluate(data)
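
        # Train with an explicit validation set, split by session (a sketch;
        # the random_split_by_session signature is assumed from the See Also
        # reference below)
        >>> train, valid = tc.activity_classifier.util.random_split_by_session(
        ...     data, session_id='session_id', fraction=0.9)
        >>> model = tc.activity_classifier.create(train, session_id='session_id',
        ...     target='activity', validation_set=valid)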

    See Also
    --------
    ActivityClassifier, util.random_split_by_session
    """
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    from ._model_architecture import _net_params
    from ._model_architecture import _define_model, _fit_model
    from ._sframe_sequence_iterator import SFrameSequenceIter as _SFrameSequenceIter
    from ._sframe_sequence_iterator import prep_data as _prep_data

    if not isinstance(target, str):
        raise _ToolkitError('target must be of type str')
    if not isinstance(session_id, str):
        raise _ToolkitError('session_id must be of type str')
    _tkutl._raise_error_if_sframe_empty(dataset, 'dataset')
    _tkutl._numeric_param_check_range('prediction_window', prediction_window, 1, 400)
    _tkutl._numeric_param_check_range('max_iterations', max_iterations, 0, _six.MAXSIZE)

    if features is None:
        features = _fe_tkutl.get_column_names(dataset,
                                              interpret_as_excluded=True,
                                              column_names=[session_id, target])
    if not hasattr(features, '__iter__'):
        raise TypeError("Input 'features' must be a list.")
    for x in features:
        if not isinstance(x, str):
            raise TypeError("Invalid feature %s: Feature names must be of type str." % x)
    if len(features) == 0:
        raise TypeError("Input 'features' must contain at least one column name.")

    start_time = _time.time()
    dataset = _tkutl._toolkits_select_columns(dataset, features + [session_id, target])
    _tkutl._raise_error_if_sarray_not_expected_dtype(dataset[target], target, [str, int])
    _tkutl._raise_error_if_sarray_not_expected_dtype(dataset[session_id], session_id, [str, int])

    # Encode the target column to numerical values
    use_target = target is not None
    dataset, target_map = _encode_target(dataset, target)

    predictions_in_chunk = 20
    chunked_data, num_sessions = _prep_data(dataset, features, session_id, prediction_window,
                                            predictions_in_chunk, target=target, verbose=verbose)

    if isinstance(validation_set, str) and validation_set == 'auto':
        if num_sessions < 100:
            validation_set = None
        else:
            dataset, validation_set = _random_split_by_session(dataset, session_id)

    # Create data iterators
    num_gpus = _mxnet_utils.get_num_gpus_in_use(max_devices=num_sessions)
    user_provided_batch_size = batch_size
    batch_size = max(batch_size, num_gpus, 1)
    data_iter = _SFrameSequenceIter(chunked_data, len(features),
                                    prediction_window, predictions_in_chunk,
                                    batch_size, use_target=use_target)

    if validation_set is not None:
        _tkutl._raise_error_if_not_sframe(validation_set, 'validation_set')
        _tkutl._raise_error_if_sframe_empty(validation_set, 'validation_set')
        validation_set = _tkutl._toolkits_select_columns(
            validation_set, features + [session_id, target])
        validation_set = validation_set.filter_by(target_map.keys(), target)
        validation_set, mapping = _encode_target(validation_set, target, target_map)
        chunked_validation_set, _ = _prep_data(validation_set, features, session_id, prediction_window,
                                            predictions_in_chunk, target=target, verbose=False)

        valid_iter = _SFrameSequenceIter(chunked_validation_set, len(features),
                                    prediction_window, predictions_in_chunk,
                                    batch_size, use_target=use_target)
    else:
        valid_iter = None

    # Define model architecture
    context = _mxnet_utils.get_mxnet_context(max_devices=num_sessions)
    loss_model, pred_model = _define_model(features, target_map, prediction_window,
                                           predictions_in_chunk, context)

    # Train the model
    log = _fit_model(loss_model, data_iter, valid_iter,
                     max_iterations, num_gpus, verbose)

    # Set up prediction model
    pred_model.bind(data_shapes=data_iter.provide_data, label_shapes=None,
                    for_training=False)
    arg_params, aux_params = loss_model.get_params()
    pred_model.init_params(arg_params=arg_params, aux_params=aux_params)

    # Save the model
    state = {
        '_pred_model': pred_model,
        'verbose': verbose,
        'training_time': _time.time() - start_time,
        'target': target,
        'classes': sorted(target_map.keys()),
        'features': features,
        'session_id': session_id,
        'prediction_window': prediction_window,
        'max_iterations': max_iterations,
        'num_examples': len(dataset),
        'num_sessions': num_sessions,
        'num_classes': len(target_map),
        'num_features': len(features),
        'training_accuracy': log['train_acc'],
        'training_log_loss': log['train_loss'],
        '_target_id_map': target_map,
        '_id_target_map': {v: k for k, v in target_map.items()},
        '_predictions_in_chunk': predictions_in_chunk,
        '_recalibrated_batch_size': data_iter.batch_size,
        'batch_size' : user_provided_batch_size
    }

    if validation_set is not None:
        state['valid_accuracy'] = log['valid_acc']
        state['valid_log_loss'] = log['valid_loss']

    model = ActivityClassifier(state)
    return model
Example No. 5
def create(
    dataset,
    session_id,
    target,
    features=None,
    prediction_window=100,
    validation_set="auto",
    max_iterations=10,
    batch_size=32,
    verbose=True,
):
    """
    Create an :class:`ActivityClassifier` model.

    Parameters
    ----------
    dataset : SFrame
        Input data which consists of `sessions` of data where each session is
        a sequence of data. The data must be in `stacked` format, grouped by
        session. Within each session, the data is assumed to be sorted
        temporally. Columns in `features` will be used to train a model that
        will make a prediction using labels in the `target` column.

    session_id : string
        Name of the column that contains a unique ID for each session.

    target : string
        Name of the column containing the target variable. The values in this
        column must be of string or integer type. Use `model.classes` to
        retrieve the order in which the classes are mapped.

    features : list[string], optional
        Name of the columns containing the input features that will be used
        for classification. If set to `None`, all columns except `session_id`
        and `target` will be used.

    prediction_window : int, optional
        Number of time units between predictions. For example, if your input
        data is sampled at 100Hz, and the `prediction_window` is set to 100,
        then this model will make a prediction every 1 second.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance to
        prevent the model from overfitting to the training data.

        For each row of the progress table, accuracy is measured over the
        provided training dataset and the `validation_set`. The format of this
        SFrame must be the same as the training set.

        When set to 'auto', a validation set is automatically sampled from the
        training data (if the training data has > 100 sessions). If
        validation_set is set to None, then all the data will be used for
        training.

    max_iterations : int , optional
        Maximum number of iterations/epochs made over the data during the
        training phase.

    batch_size : int, optional
        Number of sequence chunks used per training step. Must be greater than
        the number of GPUs in use.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : ActivityClassifier
        A trained :class:`ActivityClassifier` model.

    Examples
    --------
    .. sourcecode:: python

        >>> import turicreate as tc

        # Training on dummy data
        >>> data = tc.SFrame({
        ...    'accelerometer_x': [0.1, 0.2, 0.3, 0.4, 0.5] * 10,
        ...    'accelerometer_y': [0.5, 0.4, 0.3, 0.2, 0.1] * 10,
        ...    'accelerometer_z': [0.01, 0.01, 0.02, 0.02, 0.01] * 10,
        ...    'session_id': [0, 0, 0] * 10 + [1, 1] * 10,
        ...    'activity': ['walk', 'run', 'run'] * 10 + ['swim', 'swim'] * 10
        ... })

        # Create an activity classifier
        >>> model = tc.activity_classifier.create(data,
        ...     session_id='session_id', target='activity',
        ...     features=['accelerometer_x', 'accelerometer_y', 'accelerometer_z'])

        # Make predictions (as probability vector, or class)
        >>> predictions = model.predict(data)
        >>> predictions = model.predict(data, output_type='probability_vector')

        # Get both predictions and classes together
        >>> predictions = model.classify(data)

        # Get top-k predictions (instead of only the top one) if your labels
        # have more than 2 classes
        >>> predictions = model.predict_topk(data, k = 3)

        # Evaluate the model
        >>> results = model.evaluate(data)

    See Also
    --------
    ActivityClassifier, util.random_split_by_session
    """

    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    if not isinstance(target, str):
        raise _ToolkitError("target must be of type str")
    if not isinstance(session_id, str):
        raise _ToolkitError("session_id must be of type str")
    if not isinstance(batch_size, int):
        raise _ToolkitError("batch_size must be of type int")

    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")
    _tkutl._numeric_param_check_range("prediction_window", prediction_window, 1, 400)
    _tkutl._numeric_param_check_range("max_iterations", max_iterations, 0, _six.MAXSIZE)

    if features is None:
        features = _fe_tkutl.get_column_names(
            dataset, interpret_as_excluded=True, column_names=[session_id, target]
        )
    if not hasattr(features, "__iter__"):
        raise TypeError("Input 'features' must be a list.")
    for x in features:
        if not isinstance(x, str):
            raise TypeError("Invalid feature %s: Feature names must be of type str." % x)
    if len(features) == 0:
        raise TypeError("Input 'features' must contain at least one column name.")

    start_time = _time.time()
    dataset = _tkutl._toolkits_select_columns(dataset, features + [session_id, target])
    _tkutl._raise_error_if_sarray_not_expected_dtype(
        dataset[target], target, [str, int]
    )
    _tkutl._raise_error_if_sarray_not_expected_dtype(
        dataset[session_id], session_id, [str, int]
    )

    for feature in features:
        _tkutl._handle_missing_values(dataset, feature, "training_dataset")

    # Check for missing values for sframe validation set
    if isinstance(validation_set, _SFrame):
        _tkutl._raise_error_if_sframe_empty(validation_set, "validation_set")
        for feature in features:
            _tkutl._handle_missing_values(validation_set, feature, "validation_set")

    # C++ model
    name = "activity_classifier"

    import turicreate as _turicreate

    # Imports tensorflow
    import turicreate.toolkits.libtctensorflow

    model = _turicreate.extensions.activity_classifier()
    options = {}
    options["prediction_window"] = prediction_window
    options["batch_size"] = batch_size
    options["max_iterations"] = max_iterations
    options["verbose"] = verbose
    options["_show_loss"] = False

    model.train(dataset, target, session_id, validation_set, options)
    return ActivityClassifier(model_proxy=model, name=name)
Example No. 6
    def predict(self, dataset, output_type='cluster_id', verbose=True):
        """
        Return predicted cluster label for instances in the new 'dataset'.
        K-means predictions are made by assigning each new instance to the
        closest cluster center.

        Parameters
        ----------
        dataset : SFrame
            Dataset of new observations. Must include the features used for
            model training; additional columns are ignored.

        output_type : {'cluster_id', 'distance'}, optional
            Form of the prediction. 'cluster_id' (the default) returns the
            cluster label assigned to each input instance, while 'distance'
            returns the Euclidean distance between the instance and its
            assigned cluster's center.

        verbose : bool, optional
            If True, print progress updates to the screen.

        Returns
        -------
        out : SArray
            Model predictions. Depending on the specified `output_type`, either
            the assigned cluster label or the distance of each point to its
            closest cluster center. The order of the predictions is the same as
            order of the input data rows.

        See Also
        --------
        create

        Examples
        --------
        >>> sf = turicreate.SFrame({
        ...     'x1': [0.6777, -9.391, 7.0385, 2.2657, 7.7864, -10.16, -8.162,
        ...            8.8817, -9.525, -9.153, 2.0860, 7.6619, 6.5511, 2.7020],
        ...     'x2': [5.6110, 8.5139, 5.3913, 5.4743, 8.3606, 7.8843, 2.7305,
        ...            5.1679, 6.7231, 3.7051, 1.7682, 7.4608, 3.1270, 6.5624]})
        ...
        >>> model = turicreate.kmeans.create(sf, num_clusters=3)
        ...
        >>> sf_new = turicreate.SFrame({'x1': [-5.6584, -1.0167, -9.6181],
        ...                           'x2': [-6.3803, -3.7937, -1.1022]})
        >>> clusters = model.predict(sf_new, output_type='cluster_id')
        >>> print(clusters)
        [1, 0, 1]
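
        To get each point's distance to its assigned cluster center instead of
        the cluster label, request the 'distance' output type:

        >>> distances = model.predict(sf_new, output_type='distance')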
        """

        ## Validate the input dataset.
        _tkutl._raise_error_if_not_sframe(dataset, "dataset")
        _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

        ## Validate the output type.
        if not isinstance(output_type, str):
            raise TypeError("The 'output_type' parameter must be a string.")

        if output_type not in ('cluster_id', 'distance'):
            raise ValueError("The 'output_type' parameter must be either " +
                             "'cluster_id' or 'distance'.")

        ## Get model features.
        ref_features = self.features
        sf_features = _tkutl._toolkits_select_columns(dataset, ref_features)

        ## Compute predictions.
        opts = {'model': self.__proxy__,
                'model_name': self.__name__,
                'dataset': sf_features}

        result = _tc.toolkits._main.run('kmeans_predict', opts, verbose)
        sf_result = _tc.SFrame(None, _proxy=result['predictions'])

        if output_type == 'distance':
            return sf_result['distance']
        else:
            return sf_result['cluster_id']
Example No. 7
def create(dataset,
           session_id,
           target,
           features=None,
           prediction_window=100,
           validation_set='auto',
           max_iterations=10,
           batch_size=32,
           verbose=True,
           **kwargs):
    """
    Create an :class:`ActivityClassifier` model.

    Parameters
    ----------
    dataset : SFrame
        Input data which consists of `sessions` of data where each session is
        a sequence of data. The data must be in `stacked` format, grouped by
        session. Within each session, the data is assumed to be sorted
        temporally. Columns in `features` will be used to train a model that
        will make a prediction using labels in the `target` column.

    session_id : string
        Name of the column that contains a unique ID for each session.

    target : string
        Name of the column containing the target variable. The values in this
        column must be of string or integer type. Use `model.classes` to
        retrieve the order in which the classes are mapped.

    features : list[string], optional
        Name of the columns containing the input features that will be used
        for classification. If set to `None`, all columns except `session_id`
        and `target` will be used.

    prediction_window : int, optional
        Number of time units between predictions. For example, if your input
        data is sampled at 100Hz, and the `prediction_window` is set to 100,
        then this model will make a prediction every 1 second.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance to
        prevent the model from overfitting to the training data.

        For each row of the progress table, accuracy is measured over the
        provided training dataset and the `validation_set`. The format of this
        SFrame must be the same as the training set.

        When set to 'auto', a validation set is automatically sampled from the
        training data (if the training data has > 100 sessions). If
        validation_set is set to None, then all the data will be used for
        training.

    max_iterations : int , optional
        Maximum number of iterations/epochs made over the data during the
        training phase.

    batch_size : int, optional
        Number of sequence chunks used per training step. Must be greater than
        the number of GPUs in use.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : ActivityClassifier
        A trained :class:`ActivityClassifier` model.

    Examples
    --------
    .. sourcecode:: python

        >>> import turicreate as tc

        # Training on dummy data
        >>> data = tc.SFrame({
        ...    'accelerometer_x': [0.1, 0.2, 0.3, 0.4, 0.5] * 10,
        ...    'accelerometer_y': [0.5, 0.4, 0.3, 0.2, 0.1] * 10,
        ...    'accelerometer_z': [0.01, 0.01, 0.02, 0.02, 0.01] * 10,
        ...    'session_id': [0, 0, 0] * 10 + [1, 1] * 10,
        ...    'activity': ['walk', 'run', 'run'] * 10 + ['swim', 'swim'] * 10
        ... })

        # Create an activity classifier
        >>> model = tc.activity_classifier.create(data,
        ...     session_id='session_id', target='activity',
        ...     features=['accelerometer_x', 'accelerometer_y', 'accelerometer_z'])

        # Make predictions (as probability vector, or class)
        >>> predictions = model.predict(data)
        >>> predictions = model.predict(data, output_type='probability_vector')

        # Get both predictions and classes together
        >>> predictions = model.classify(data)

        # Get top-k predictions (instead of only the top one) if your labels
        # have more than 2 classes
        >>> predictions = model.predict_topk(data, k = 3)

        # Evaluate the model
        >>> results = model.evaluate(data)
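
        # Opt in to the TensorFlow backend through the advanced-parameters
        # hook (a sketch; the accepted keys are validated against an internal
        # whitelist in the parameter handling below)
        >>> model = tc.activity_classifier.create(data, session_id='session_id',
        ...     target='activity',
        ...     _advanced_parameters={'use_tensorflow': True})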

    See Also
    --------
    ActivityClassifier, util.random_split_by_session
    """
    from .._mxnet import _mxnet_utils
    from ._mx_model_architecture import _net_params
    from ._sframe_sequence_iterator import SFrameSequenceIter as _SFrameSequenceIter
    from ._sframe_sequence_iterator import prep_data as _prep_data
    from ._mx_model_architecture import _define_model_mxnet, _fit_model_mxnet
    from ._mps_model_architecture import _define_model_mps, _fit_model_mps
    from .._mps_utils import (use_mps as _use_mps, mps_device_name as
                              _mps_device_name, ac_weights_mps_to_mxnet as
                              _ac_weights_mps_to_mxnet)

    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    if not isinstance(target, str):
        raise _ToolkitError('target must be of type str')
    if not isinstance(session_id, str):
        raise _ToolkitError('session_id must be of type str')
    _tkutl._raise_error_if_sframe_empty(dataset, 'dataset')
    _tkutl._numeric_param_check_range('prediction_window', prediction_window,
                                      1, 400)
    _tkutl._numeric_param_check_range('max_iterations', max_iterations, 0,
                                      _six.MAXSIZE)

    if features is None:
        features = _fe_tkutl.get_column_names(
            dataset,
            interpret_as_excluded=True,
            column_names=[session_id, target])
    if not hasattr(features, '__iter__'):
        raise TypeError("Input 'features' must be a list.")
    for x in features:
        if not isinstance(x, str):
            raise TypeError(
                "Invalid feature %s: Feature names must be of type str." % x)
    if len(features) == 0:
        raise TypeError(
            "Input 'features' must contain at least one column name.")

    start_time = _time.time()
    dataset = _tkutl._toolkits_select_columns(dataset,
                                              features + [session_id, target])
    _tkutl._raise_error_if_sarray_not_expected_dtype(dataset[target], target,
                                                     [str, int])
    _tkutl._raise_error_if_sarray_not_expected_dtype(dataset[session_id],
                                                     session_id, [str, int])

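    # Internal backend knobs; they can only be overridden through the
    # '_advanced_parameters' keyword argument, which is validated against the
    # keys of this dict below.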
    params = {'use_tensorflow': False, 'show_deprecated_warnings': False}

    if '_advanced_parameters' in kwargs:
        # Make sure no additional parameters are provided
        new_keys = set(kwargs['_advanced_parameters'].keys())
        set_keys = set(params.keys())
        unsupported = new_keys - set_keys
        if unsupported:
            raise _ToolkitError(
                'Unknown advanced parameters: {}'.format(unsupported))

        params.update(kwargs['_advanced_parameters'])

    if params['use_tensorflow'] and not params['show_deprecated_warnings']:
        # Import TensorFlow lazily, only when the TensorFlow backend is requested
        import tensorflow as _tf
        from ._tf_model_architecture import ActivityTensorFlowModel, _fit_model_tf

        # Suppress TensorFlow logging below the ERROR level
        _tf.compat.v1.logging.set_verbosity(_tf.compat.v1.logging.ERROR)

    if isinstance(validation_set, str) and validation_set == 'auto':
        # Computing the number of unique sessions in this way is relatively
        # expensive. Ideally we'd incorporate this logic into the C++ code that
        # chunks the raw data by prediction window.
        # TODO: https://github.com/apple/turicreate/issues/991
        unique_sessions = _SFrame({'session': dataset[session_id].unique()})
        if len(unique_sessions) < _MIN_NUM_SESSIONS_FOR_SPLIT:
            print(
                "The dataset has fewer than the minimum of",
                _MIN_NUM_SESSIONS_FOR_SPLIT,
                "sessions required for a train-validation split. Continuing without a validation set."
            )
            validation_set = None
        else:
            dataset, validation_set = _random_split_by_session(
                dataset, session_id)

    for feature in features:
        _tkutl._handle_missing_values(dataset, feature, 'training_dataset')

    # Encode the target column to numerical values
    use_target = target is not None
    dataset, target_map = _encode_target(dataset, target)

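    # Number of prediction windows packed into each chunk produced by
    # _prep_data below, so a chunk covers at most
    # prediction_window * predictions_in_chunk rows of a session.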
    predictions_in_chunk = 20
    chunked_data, num_sessions = _prep_data(dataset,
                                            features,
                                            session_id,
                                            prediction_window,
                                            predictions_in_chunk,
                                            target=target,
                                            verbose=verbose)

    # Decide whether to use the MPS GPU, MXNet GPU(s), or the CPU
    num_mxnet_gpus = _mxnet_utils.get_num_gpus_in_use(max_devices=num_sessions)
    use_mps = _use_mps() and num_mxnet_gpus == 0 and not (
        params['use_tensorflow'])

    if verbose:
        if use_mps:
            print('Using GPU to create model ({})'.format(_mps_device_name()))
        elif num_mxnet_gpus == 1:
            print('Using GPU to create model (CUDA)')
        elif num_mxnet_gpus > 1:
            print(
                'Using {} GPUs to create model (CUDA)'.format(num_mxnet_gpus))
        elif params['use_tensorflow']:
            print('Using TensorFlow to create model')
        else:
            print('Using CPU to create model')

    # Create data iterators
    user_provided_batch_size = batch_size
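    # Guarantee at least one example per MXNet GPU in every batch; the value the
    # user asked for is kept so it can be reported in the model state below.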
    batch_size = max(batch_size, num_mxnet_gpus, 1)

    use_mx_data_batch = not (use_mps or params['use_tensorflow'])
    data_iter = _SFrameSequenceIter(chunked_data,
                                    len(features),
                                    prediction_window,
                                    predictions_in_chunk,
                                    batch_size,
                                    use_target=use_target,
                                    mx_output=use_mx_data_batch)

    if validation_set is not None:
        _tkutl._raise_error_if_not_sframe(validation_set, 'validation_set')
        _tkutl._raise_error_if_sframe_empty(validation_set, 'validation_set')
        validation_set = _tkutl._toolkits_select_columns(
            validation_set, features + [session_id, target])
        for feature in features:
            _tkutl._handle_missing_values(validation_set, feature, 'validation_set')
        validation_set = validation_set.filter_by(list(target_map.keys()),
                                                  target)
        validation_set, mapping = _encode_target(validation_set, target,
                                                 target_map)
        chunked_validation_set, _ = _prep_data(validation_set,
                                               features,
                                               session_id,
                                               prediction_window,
                                               predictions_in_chunk,
                                               target=target,
                                               verbose=False)

        valid_iter = _SFrameSequenceIter(chunked_validation_set,
                                         len(features),
                                         prediction_window,
                                         predictions_in_chunk,
                                         batch_size,
                                         use_target=use_target,
                                         mx_output=use_mx_data_batch)
    else:
        valid_iter = None

    # Define model architecture
    context = _mxnet_utils.get_mxnet_context(max_devices=num_sessions)

    # Always create the MXNet models, since pred_model is later saved in the state.
    # If MPS is used, loss_model will be overwritten below.
    loss_model, pred_model = _define_model_mxnet(len(target_map),
                                                 prediction_window,
                                                 predictions_in_chunk, context)

    if use_mps:
        loss_model = _define_model_mps(batch_size,
                                       len(features),
                                       len(target_map),
                                       prediction_window,
                                       predictions_in_chunk,
                                       is_prediction_model=False)

        log = _fit_model_mps(loss_model, data_iter, valid_iter, max_iterations,
                             verbose)
    else:

        if params['use_tensorflow']:
            net_params = _initialize_with_mxnet_weights(
                loss_model, chunked_data, features, prediction_window,
                predictions_in_chunk, batch_size, use_target)
            ac_model = ActivityTensorFlowModel(net_params, batch_size,
                                               len(features), len(target_map),
                                               prediction_window,
                                               predictions_in_chunk)
            # Train the model using TensorFlow
            log = _fit_model_tf(ac_model, net_params, data_iter, valid_iter,
                                max_iterations, verbose, 1e-3)
        else:
            # Train the model using MXNet
            log = _fit_model_mxnet(loss_model, data_iter, valid_iter,
                                   max_iterations, num_mxnet_gpus, verbose)

    # Set up prediction model
    pred_model.bind(data_shapes=data_iter.provide_data,
                    label_shapes=None,
                    for_training=False)

    if use_mps:
        mps_params = loss_model.export()
        arg_params, aux_params = _ac_weights_mps_to_mxnet(
            mps_params, _net_params['lstm_h'])

    elif params['use_tensorflow']:
        # Copy the weights back in the MXNet format
        arg_params, aux_params = ac_model.get_weights()

    else:
        arg_params, aux_params = loss_model.get_params()

    pred_model.init_params(arg_params=arg_params, aux_params=aux_params)

    # Save the model
    state = {
        '_pred_model': pred_model,
        'verbose': verbose,
        'training_time': _time.time() - start_time,
        'target': target,
        'classes': sorted(target_map.keys()),
        'features': features,
        'session_id': session_id,
        'prediction_window': prediction_window,
        'max_iterations': max_iterations,
        'num_examples': len(dataset),
        'num_sessions': num_sessions,
        'num_classes': len(target_map),
        'num_features': len(features),
        'training_accuracy': log['train_acc'],
        'training_log_loss': log['train_loss'],
        '_target_id_map': target_map,
        '_id_target_map': {v: k
                           for k, v in target_map.items()},
        '_predictions_in_chunk': predictions_in_chunk,
        '_recalibrated_batch_size': data_iter.batch_size,
        'batch_size': user_provided_batch_size
    }

    if validation_set is not None:
        state['valid_accuracy'] = log['valid_acc']
        state['valid_log_loss'] = log['valid_loss']

    model = ActivityClassifier(state)
    return model
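A hedged usage sketch (not part of the original listing): instead of relying on the
'auto' session split inside create(), the held-out sessions can be produced explicitly
with util.random_split_by_session and passed in as validation_set. This assumes the
dummy `data` SFrame from the docstring example above and the documented
(dataset, session_id, fraction, seed) signature of random_split_by_session.

import turicreate as tc

# Split whole sessions (not individual rows) into training and validation portions.
train, valid = tc.activity_classifier.util.random_split_by_session(
    data, session_id='session_id', fraction=0.9, seed=42)

# Pass the held-out sessions explicitly, bypassing the 'auto' split in create().
model = tc.activity_classifier.create(
    train,
    session_id='session_id',
    target='activity',
    features=['accelerometer_x', 'accelerometer_y', 'accelerometer_z'],
    validation_set=valid)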
Exemplo n.º 8
0
def create_classification_with_model_selector(dataset, target, model_selector,
    features=None, validation_set='auto', verbose=True):
    """
    Create a :class:`~turicreate.toolkits.SupervisedLearningModel`.

    This is a generic function that allows you to create any model that
    implements SupervisedLearningModel. This function is normally not called
    directly; call a specific model's create function instead.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    target : string
        Name of the column containing the target variable.

    model_selector : function
        Function that, given the number of classes and an SFrame of features,
        returns the names of the candidate models to train.

    features : list[string], optional
        List of feature column names. If None, all columns except the target
        are used.

    validation_set : SFrame or 'auto', optional
        A dataset for monitoring the model's generalization performance. If
        'auto', a validation set is automatically sampled from the training
        data.

    verbose : boolean
        If True, print progress messages during training.

    """

    # Error checking
    _raise_error_if_not_sframe(dataset, "training dataset")
    if features is None:
        features = dataset.column_names()
        if target in features:
            features.remove(target)
    if not hasattr(features, '__iter__'):
        raise TypeError("Input 'features' must be a list.")
    if not all([isinstance(x, str) for x in features]):
        raise TypeError(
            "Invalid feature %s: Feature names must be of type str."
            % [x for x in features if not isinstance(x, str)][0])

    # Sample the data
    features_sframe = _toolkits_select_columns(dataset, features)
    if features_sframe.num_rows() > 1e5:
        fraction = 1.0 * 1e5 / features_sframe.num_rows()
        features_sframe = features_sframe.sample(fraction, seed=0)

    # Get available models for this dataset
    num_classes = len(dataset[target].unique())
    selected_model_names = model_selector(num_classes, features_sframe)

    # Create a validation set
    if isinstance(validation_set, str):
        if validation_set == 'auto':
            if dataset.num_rows() >= 100:
                if verbose:
                    print_validation_track_notification()
                dataset, validation_set = dataset.random_split(.95)
            else:
                validation_set = None
        else:
            raise TypeError('Unrecognized value for validation_set.')

    # Match C++ model names with user model names
    python_names = {'boosted_trees_classifier': 'BoostedTreesClassifier',
                    'random_forest_classifier': 'RandomForestClassifier',
                    'decision_tree_classifier': 'DecisionTreeClassifier',
                    'classifier_logistic_regression': 'LogisticClassifier',
                    'classifier_svm': 'SVMClassifier'}

    # Print useful user-facing progress messages
    if verbose:
        print('PROGRESS: The following methods are available for this type of problem.')
        print('PROGRESS: ' + ', '.join([python_names[x] for x in selected_model_names]))
        if len(selected_model_names) > 1:
            print('PROGRESS: The returned model will be chosen according to validation accuracy.')

    models = {}
    metrics = {}
    for model_name in selected_model_names:

        # Fit each of the available models
        m = create_selected(model_name, dataset, target, features, validation_set, verbose)
        models[model_name] = m

        if 'validation_accuracy' in m._list_fields():
            metrics[model_name] = m.validation_accuracy

        # Otherwise fall back to the progress table, which most models expose.
        elif 'progress' in m._list_fields():
            prog = m.progress
            validation_column = 'Validation-accuracy'
            accuracy_column = 'Training-accuracy'
            if validation_column in prog.column_names():
                metrics[model_name] = float(prog[validation_column].tail(1)[0])
            else:
                metrics[model_name] = float(prog[accuracy_column].tail(1)[0])
        else:
            raise ValueError("Model does not have metrics that can be used for model selection.")

    # Choose the best model based on validation accuracy, when available.
    best_model = None
    best_acc = None
    for model_name in selected_model_names:
        if best_acc is None:
            best_model = model_name
            best_acc = metrics[model_name]
        if best_acc is not None and best_acc < metrics[model_name]:
            best_model = model_name
            best_acc = metrics[model_name]

    ret = []
    width = 32
    if len(selected_model_names) > 1:
        ret.append('PROGRESS: Model selection based on validation accuracy:')
        ret.append('---------------------------------------------')
        key_str = '{:<{}}: {}'
        for model_name in selected_model_names:
            name = python_names[model_name]
            row = key_str.format(name, width, str(metrics[model_name]))
            ret.append(row)
        ret.append('---------------------------------------------')
        ret.append('Selecting ' + python_names[best_model] + ' based on validation set performance.')

    if verbose:
        print('\nPROGRESS: '.join(ret))
    return models[best_model]
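A minimal, hypothetical sketch of how the model_selector hook is invoked by the
function above: it receives the number of classes and an SFrame of features, and
returns internal model names from the python_names table. The `train_data` SFrame
and its 'label' column are placeholders.

def toy_selector(num_classes, features_sframe):
    # Ignore the data and always propose two candidate classifiers.
    return ['classifier_logistic_regression', 'boosted_trees_classifier']

best = create_classification_with_model_selector(
    train_data, target='label', model_selector=toy_selector)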
Exemplo n.º 9
0
def create(dataset, target, model_name, features=None,
           validation_set='auto', distributed='auto',
           verbose=True, seed=None, **kwargs):
    """
    Create a :class:`~turicreate.toolkits.SupervisedLearningModel`.

    This is a generic function that allows you to create any model that
    implements SupervisedLearningModel. This function is normally not called
    directly; call a specific model's create function instead.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    target : string
        Name of the column containing the target variable. The values in this
        column must be 0 or 1, of integer type.

    model_name : string
        Name of the model to create.

    features : list[string], optional
        List of feature column names. If None, all columns except the target
        are used.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance.
        For each row of the progress table, the chosen metrics are computed
        for both the provided training dataset and the validation_set. The
        format of this SFrame must be the same as the training set.
        By default this argument is set to 'auto' and a validation set is
        automatically sampled and used for progress printing. If
        validation_set is set to None, then no additional metrics
        are computed. The default value is 'auto'.

    distributed : env
        The distributed environment.

    verbose : boolean
        If True, print progress messages during training.

    seed : int, optional
        Seed for random number generation. Set this value to ensure that the
        same model is created every time.

    kwargs : dict
        Additional model options passed through to the underlying model.
    """

    _raise_error_if_not_sframe(dataset, "training dataset")

    # Determine columns to keep
    if features is None:
        features = [feat for feat in dataset.column_names() if feat != target]
    if not hasattr(features, '__iter__'):
        raise TypeError("Input 'features' must be a list.")
    if not all([isinstance(x, str) for x in features]):
        raise TypeError(
            "Invalid feature %s: Feature names must be of type str."
            % [x for x in features if not isinstance(x, str)][0])

    # Create a validation set
    if isinstance(validation_set, str):
        if validation_set == 'auto':
            if dataset.num_rows() >= 100:
                if verbose:
                    print_validation_track_notification()
                dataset, validation_set = dataset.random_split(.95, seed=seed)
            else:
                validation_set = None
        else:
            raise TypeError('Unrecognized value for validation_set.')
    if validation_set is None:
        validation_set = _turicreate.SFrame()
    else:
        if not isinstance(validation_set, _turicreate.SFrame):
            raise TypeError("validation_set must be either 'auto' or an SFrame "
                            "matching the training data.")

        # Attempt to append the two datasets together to check schema
        validation_set.head().append(dataset.head())

        # Reduce validation set to requested columns
        validation_set = _toolkits_select_columns(
            validation_set, features + [target])

    # Reduce training set to requested columns
    dataset = _toolkits_select_columns(dataset, features + [target])

    # Sanitize model-specific options
    options = {k.lower(): kwargs[k] for k in kwargs}

    # Create a model instance and train it
    model = _turicreate.extensions.__dict__[model_name]()
    with QuietProgress(verbose):
        model.train(dataset, target, validation_set, options)

    return SupervisedLearningModel(model, model_name)
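A hedged usage sketch for this generic create(): toolkit-specific create functions
normally call it with the name of a C++ extension model such as
'boosted_trees_classifier' (one of the names seen in the selector example above).
The SFrame `sf`, its 'label' column, and the max_iterations option are placeholders;
extra keyword arguments are lower-cased and forwarded to model.train() as options.

m = create(sf,
           target='label',
           model_name='boosted_trees_classifier',
           validation_set='auto',
           max_iterations=10)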
Exemplo n.º 10
0
def create(dataset,
           target,
           model_name,
           features=None,
           validation_set='auto',
           verbose=True,
           distributed='auto',
           **kwargs):
    """
    Create a :class:`~turicreate.toolkits.SupervisedLearningModel`.

    This is a generic function that allows you to create any model that
    implements SupervisedLearningModel. This function is normally not called
    directly; call a specific model's create function instead.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    target : string
        Name of the column containing the target variable. The values in this
        column must be 0 or 1, of integer type.

    model_name : string
        Name of the model to create.

    features : list[string], optional
        List of feature column names. If None, all columns except the target
        are used.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance.
        For each row of the progress table, the chosen metrics are computed
        for both the provided training dataset and the validation_set. The
        format of this SFrame must be the same as the training set.
        By default this argument is set to 'auto' and a validation set is
        automatically sampled and used for progress printing. If
        validation_set is set to None, then no additional metrics
        are computed. The default value is 'auto'.

    distributed : env
        The distributed environment.

    verbose : boolean
        If True, print progress messages during training.

    kwargs : dict
        Additional model options passed through to the underlying model.
    """

    _raise_error_if_not_sframe(dataset, "training dataset")

    # Create a validation set
    if isinstance(validation_set, str):
        if validation_set == 'auto':
            if dataset.num_rows() >= 100:
                if verbose:
                    print_validation_track_notification()
                dataset, validation_set = dataset.random_split(.95)
            else:
                validation_set = None
        else:
            raise TypeError('Unrecognized value for validation_set.')

    # Target
    target_sframe = _toolkits_select_columns(dataset, [target])

    # Features
    if features is None:
        features = dataset.column_names()
        features.remove(target)
    if not hasattr(features, '__iter__'):
        raise TypeError("Input 'features' must be a list.")
    if not all([isinstance(x, str) for x in features]):
        raise TypeError(
            "Invalid feature %s: Feature names must be of type str."
            % [x for x in features if not isinstance(x, str)][0])
    features_sframe = _toolkits_select_columns(dataset, features)

    options = {}
    _kwargs = {}
    for k in kwargs:
        _kwargs[k.lower()] = kwargs[k]
    options.update(_kwargs)
    options.update({
        'target': target_sframe,
        'features': features_sframe,
        'model_name': model_name
    })

    if validation_set is not None:

        if not isinstance(validation_set, _turicreate.SFrame):
            raise TypeError(
                "validation_set must be either 'auto' or an SFrame matching the training data."
            )

        # Attempt to append the two datasets together to check schema
        validation_set.head().append(dataset.head())

        options.update({
            'features_validation':
            _toolkits_select_columns(validation_set, features),
            'target_validation':
            _toolkits_select_columns(validation_set, [target])
        })

    ret = _turicreate.toolkits._main.run("supervised_learning_train", options,
                                         verbose)
    model = SupervisedLearningModel(ret['model'], model_name)

    return model
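For reference, a toy illustration of the validation_set='auto' behavior shared by the
two create() variants above: when the training data has at least 100 rows, roughly 5%
of it is held out at random. The SFrame below is purely illustrative.

import turicreate as tc

sf = tc.SFrame({'x': list(range(200)), 'label': [0, 1] * 100})
train, valid = sf.random_split(.95, seed=1)  # ~190 training rows, ~10 held out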