Example #1
def random_split_by_session(dataset, session_id, fraction=0.9, seed=None):
    """
    Randomly split an SFrame into two SFrames based on the `session_id` such
    that one split contains data for a `fraction` of the sessions while the
    second split contains all data for the rest of the sessions.

    Parameters
    ----------
    dataset : SFrame
        Dataset to split. It must contain a column of session ids.

    session_id : string
        The name of the column in `dataset` that corresponds to a unique
        identifier for each session.

    fraction : float, optional
        Fraction of the sessions to fetch for the first returned SFrame.  Must
        be between 0 and 1. Once the sessions are split, all data from a single
        session is in the same SFrame.

    seed : int, optional
        Seed for the random number generator used to split.

    Examples
    --------

    .. sourcecode:: python

        # Split the data so that train has 90% of the users.
        >>> train, valid = tc.activity_classifier.util.random_split_by_session(
        ...     dataset, session_id='session_id', fraction=0.9)

        # For example: If dataset has 2055 sessions
        >>> len(dataset['session_id'].unique())
        2055

        # The training set now has 90% of the sessions
        >>> len(train['session_id'].unique())
        1850

        # The validation set has the remaining 10% of the sessions
        >>> len(valid['session_id'].unique())
        205
    """

    _raise_error_if_not_of_type(dataset, _SFrame, 'dataset')
    _raise_error_if_not_of_type(session_id, str, 'session_id')
    _raise_error_if_not_of_type(fraction, float, 'fraction')
    _raise_error_if_not_of_type(seed, [int, type(None)], 'seed')
    _numeric_param_check_range('fraction', fraction, 0, 1)

    if session_id not in dataset.column_names():
        raise _ToolkitError(
            'Input "dataset" must contain a column called %s.' % session_id)

    unique_sessions = _SFrame({'session': dataset[session_id].unique()})
    chosen, not_chosen = unique_sessions.random_split(fraction, seed)
    train = dataset.filter_by(chosen['session'], session_id)
    valid = dataset.filter_by(not_chosen['session'], session_id)
    return train, valid
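# A minimal usage sketch (assumes turicreate is importable and `data` is an
# SFrame with a 'session_id' column; the fraction and seed values are illustrative).
import turicreate as tc

train, valid = tc.activity_classifier.util.random_split_by_session(
    data, session_id='session_id', fraction=0.8, seed=42)

# Whole sessions land on one side of the split, so the session sets are disjoint.
train_sessions = set(train['session_id'].unique())
valid_sessions = set(valid['session_id'].unique())
assert train_sessions.isdisjoint(valid_sessions)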
    def classify(self, dataset, output_frequency='per_row'):
        """
        Return a classification for each ``prediction_window`` examples in the
        ``dataset``, using the trained activity classification model. The output
        SFrame contains predictions as both class labels and the probability
        that the predicted value is the associated label.

        Parameters
        ----------
        dataset : SFrame
            Dataset of new observations. Must include columns with the same
            names as the features and session id used for model training, but
            does not require a target column. Additional columns are ignored.

        output_frequency : {'per_row', 'per_window'}, optional
            The frequency of the predictions, which is one of:

            - 'per_row': Each prediction is returned ``prediction_window`` times.
            - 'per_window': Return a single prediction for each 
              ``prediction_window`` rows in ``dataset`` per ``session_id``.

        Returns
        -------
        out : SFrame
            An SFrame with model predictions, i.e., class labels and probabilities.

        See Also
        --------
        create, evaluate, predict

        Examples
        --------
        >>> classes = model.classify(data)
        """
        _tkutl._check_categorical_option_type('output_frequency',
                                              output_frequency,
                                              ['per_window', 'per_row'])
        id_target_map = self._id_target_map
        preds = self.predict(dataset,
                             output_type='probability_vector',
                             output_frequency=output_frequency)

        if output_frequency == 'per_row':
            return _SFrame({
                'class':
                preds.apply(lambda p: id_target_map[_np.argmax(p)]),
                'probability':
                preds.apply(_np.max)
            })
        elif output_frequency == 'per_window':
            preds['class'] = preds['probability_vector'].apply(
                lambda p: id_target_map[_np.argmax(p)])
            preds['probability'] = preds['probability_vector'].apply(_np.max)
            preds = preds.remove_column('probability_vector')
            return preds
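# Hedged usage sketch: `model` is assumed to be a trained ActivityClassifier and
# `data` an SFrame with the feature and session_id columns used during training.
per_row = model.classify(data)                                     # one row per input row
per_window = model.classify(data, output_frequency='per_window')   # one row per prediction window
# Both outputs carry a 'class' column and a 'probability' column for the predicted
# class; the 'per_window' output also keeps the session id and a 'prediction_id'.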
    def predict_topk(self, dataset, output_type='probability', k=3, output_frequency='per_row'):
        """
        Return top-k predictions for the ``dataset``, using the trained model.
        Predictions are returned as an SFrame with a row identifier column
        (`row_id` for 'per_row', `prediction_id` plus the session id for
        'per_window'), a `class` column, and either a `probability` or a `rank`
        column, depending on the ``output_type`` parameter.

        Parameters
        ----------
        dataset : SFrame
            Dataset of new observations. Must include columns with the same
            names as the features and session id used for model training, but
            does not require a target column. Additional columns are ignored.

        output_type : {'probability', 'rank'}, optional
            Choose the return type of the prediction:

            - `probability`: Probability associated with each label in the prediction.
            - `rank`       : Rank associated with each label in the prediction.

        k : int, optional
            Number of classes to return for each input example.

        output_frequency : {'per_row', 'per_window'}, optional
            The frequency of the predictions, which is one of:

            - 'per_row': Each prediction is returned ``prediction_window`` times.
            - 'per_window': Return a single prediction for each 
              ``prediction_window`` rows in ``dataset`` per ``session_id``.

        Returns
        -------
        out : SFrame
            An SFrame with model predictions.

        See Also
        --------
        predict, classify, evaluate

        Examples
        --------
        >>> pred = m.predict_topk(validation_data, k=3)
        >>> pred
        +---------------+-------+-------------------+
        |     row_id    | class |    probability    |
        +---------------+-------+-------------------+
        |       0       |   4   |   0.995623886585  |
        |       0       |   9   |  0.0038311756216  |
        |       0       |   7   | 0.000301006948575 |
        |       1       |   1   |   0.928708016872  |
        |       1       |   3   |  0.0440889261663  |
        |       1       |   2   |  0.0176190119237  |
        |       2       |   3   |   0.996967732906  |
        |       2       |   2   |  0.00151345680933 |
        |       2       |   7   | 0.000637513934635 |
        |       3       |   1   |   0.998070061207  |
        |      ...      |  ...  |        ...        |
        +---------------+-------+-------------------+
        """
        _tkutl._check_categorical_option_type('output_type', output_type, ['probability', 'rank'])
        id_target_map = self._id_target_map
        preds = self.predict(
            dataset, output_type='probability_vector', output_frequency=output_frequency)

        if output_frequency == 'per_row':
            probs = preds
        elif output_frequency == 'per_window':
            probs = preds['probability_vector']

        if output_type == 'rank':
            probs = probs.apply(lambda p: [
                {'class': id_target_map[i],
                 'rank': i}
                for i in reversed(_np.argsort(p)[-k:])]
            )
        elif output_type == 'probability':
            probs = probs.apply(lambda p: [
                {'class': id_target_map[i],
                 'probability': p[i]}
                for i in reversed(_np.argsort(p)[-k:])]
            )

        if output_frequency == 'per_row':
            output = _SFrame({'probs': probs})
            output = output.add_row_number(column_name='row_id')
        elif output_frequency == 'per_window':
            output = _SFrame({
                'probs': probs,
                self.session_id: preds[self.session_id],
                'prediction_id': preds['prediction_id']
            })

        output = output.stack('probs', new_column_name='probs')
        output = output.unpack('probs', column_name_prefix='')
        return output
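# Toy illustration (hypothetical data) of the stack/unpack pattern used above: a
# column holding a list of per-class dicts is exploded into one row per (row_id, class).
import turicreate as tc

toy = tc.SFrame({'probs': [[{'class': 'walk', 'probability': 0.7},
                            {'class': 'run',  'probability': 0.3}]]})
toy = toy.add_row_number(column_name='row_id')
toy = toy.stack('probs', new_column_name='probs')  # one row per dict in each list
toy = toy.unpack('probs', column_name_prefix='')   # dict keys become 'class'/'probability' columns
# toy now has columns: row_id, class, probability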
    def predict(self, dataset, output_type='class', output_frequency='per_row'):
        """
        Return predictions for ``dataset``, using the trained activity classifier.
        Predictions can be generated as class labels, or as a probability
        vector with probabilities for each class.

        The activity classifier generates a single prediction for each
        ``prediction_window`` rows in ``dataset``, per ``session_id``. Thus the
        number of predictions is smaller than the length of ``dataset``. By
        default, each prediction is replicated ``prediction_window`` times to return
        a prediction for each row of ``dataset``. Use ``output_frequency`` to
        get the unreplicated predictions.

        Parameters
        ----------
        dataset : SFrame
            Dataset of new observations. Must include columns with the same
            names as the features used for model training, but does not require
            a target column. Additional columns are ignored.

        output_type : {'class', 'probability_vector'}, optional
            Form of each prediction which is one of:

            - 'probability_vector': Prediction probability associated with each
              class as a vector. The probability of the first class (sorted
              alphanumerically by name of the class in the training set) is in
              position 0 of the vector, the second in position 1 and so on.
            - 'class': Class prediction. This returns the class with maximum
              probability.

        output_frequency : {'per_row', 'per_window'}, optional
            The frequency of the predictions, which is one of:

            - 'per_window': Return a single prediction for each
              ``prediction_window`` rows in ``dataset`` per ``session_id``.
            - 'per_row': Convenience option to make sure the number of
              predictions match the number of rows in the dataset. Each
              prediction from the model is repeated ``prediction_window``
              times during that window.

        Returns
        -------
        out : SArray | SFrame
            If ``output_frequency`` is 'per_row', return an SArray with
            predictions for each row in ``dataset``.
            If ``output_frequency`` is 'per_window', return an SFrame with one
            prediction per ``prediction_window`` rows in ``dataset``.

        See Also
        --------
        create, evaluate, classify

        Examples
        --------

        .. sourcecode:: python

            # One prediction per row
            >>> probability_predictions = model.predict(
            ...     data, output_type='probability_vector', output_frequency='per_row')[:4]
            >>> probability_predictions

            dtype: array
            Rows: 4
            [array('d', [0.01857384294271469, 0.0348394550383091, 0.026018327102065086]),
             array('d', [0.01857384294271469, 0.0348394550383091, 0.026018327102065086]),
             array('d', [0.01857384294271469, 0.0348394550383091, 0.026018327102065086]),
             array('d', [0.01857384294271469, 0.0348394550383091, 0.026018327102065086])]

            # One prediction per window
            >>> class_predictions = model.predict(
            ...     data, output_type='class', output_frequency='per_window')
            >>> class_predictions

            +---------------+------------+-----+
            | prediction_id | session_id |class|
            +---------------+------------+-----+
            |       0       |     3      |  5  |
            |       1       |     3      |  5  |
            |       2       |     3      |  5  |
            |       3       |     3      |  5  |
            |       4       |     3      |  5  |
            |       5       |     3      |  5  |
            |       6       |     3      |  5  |
            |       7       |     3      |  4  |
            |       8       |     3      |  4  |
            |       9       |     3      |  4  |
            |      ...      |    ...     | ... |
            +---------------+------------+-----+
        """
        _tkutl._raise_error_if_not_sframe(dataset, 'dataset')
        _tkutl._check_categorical_option_type(
            'output_frequency', output_frequency, ['per_window', 'per_row'])
        _tkutl._check_categorical_option_type(
            'output_type', output_type, ['probability_vector', 'class'])
        from ._sframe_sequence_iterator import SFrameSequenceIter as _SFrameSequenceIter
        from ._sframe_sequence_iterator import prep_data as _prep_data

        from ._sframe_sequence_iterator import _ceil_dev

        prediction_window = self.prediction_window
        chunked_dataset, _ = _prep_data(dataset, self.features, self.session_id, prediction_window,
                                     self._predictions_in_chunk, verbose=False)
        data_iter = _SFrameSequenceIter(chunked_dataset, len(self.features),
                                        prediction_window, self._predictions_in_chunk,
                                        self._recalibrated_batch_size, use_pad=True)

        chunked_data = data_iter.dataset
        preds = self._pred_model.predict(data_iter).asnumpy()

        if output_frequency == 'per_row':
            # Replicate each prediction times prediction_window
            preds = preds.repeat(prediction_window, axis=1)

            # Remove predictions for padded rows
            unpadded_len = chunked_data['chunk_len'].to_numpy()
            preds = [p[:unpadded_len[i]] for i, p in enumerate(preds)]

            # Reshape from (num_of_chunks, chunk_size, num_of_classes)
            # to (ceil(length / prediction_window), num_of_classes)
            # chunk_size is DIFFERENT between chunks - since padding was removed.
            out = _np.concatenate(preds)
            out = out.reshape((-1, len(self._target_id_map)))
            out = _SArray(out)

            if output_type == 'class':
                id_target_map = self._id_target_map
                out = out.apply(lambda c: id_target_map[_np.argmax(c)])

        elif output_frequency == 'per_window':
            # Calculate the number of expected predictions and
            # remove predictions for padded data
            unpadded_len = chunked_data['chunk_len'].apply(
                lambda l: _ceil_dev(l, prediction_window)).to_numpy()
            preds = [p[:unpadded_len[i]] for i, p in enumerate(preds)]

            out = _SFrame({
                self.session_id: chunked_data['session_id'],
                'preds': _SArray(preds, dtype=list)
            }).stack('preds', new_column_name='probability_vector')

            # Calculate the prediction index per session
            out = out.add_row_number(column_name='prediction_id')
            start_sess_idx = out.groupby(
                self.session_id, {'start_idx': _agg.MIN('prediction_id')})
            start_sess_idx = start_sess_idx.unstack(
                [self.session_id, 'start_idx'], new_column_name='idx')['idx'][0]

            if output_type == 'class':
                id_target_map = self._id_target_map
                out['probability_vector'] = out['probability_vector'].apply(
                    lambda c: id_target_map[_np.argmax(c)])
                out = out.rename({'probability_vector': 'class'})

        return out
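# Sanity-check sketch (assumes `model` is a trained ActivityClassifier and `data`
# the SFrame it will predict on): how the two output frequencies relate in length.
import math
import turicreate as tc

per_row = model.predict(data, output_frequency='per_row')
per_window = model.predict(data, output_frequency='per_window')

# 'per_row' replicates each window prediction, so its length matches the input.
assert len(per_row) == len(data)

# 'per_window' yields ceil(session_length / prediction_window) predictions per session.
session_lengths = data.groupby(model.session_id, {'rows': tc.aggregate.COUNT()})
expected = sum(math.ceil(n / model.prediction_window) for n in session_lengths['rows'])
assert len(per_window) == expected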
Example #5
def bm25(dataset, query, k1=1.5, b=.75):
    """
    For a given query and set of documents, compute the BM25 score for each
    document. If we have a query with words q_1, ..., q_n, the BM25 score for
    a document is:

        .. math:: \sum_{i=1}^n \mbox{IDF}(q_i)\\frac{f(q_i) * (k_1+1)}{f(q_i) + k_1 * (1-b+b*|D|/d_{avg})}

    where

    * :math:`\mbox{IDF}(q_i) = \log((N - n(q_i) + .5)/(n(q_i) + .5))`
    * :math:`f(q_i)` is the number of times q_i occurs in the document
    * :math:`n(q_i)` is the number of documents containing q_i
    * :math:`|D|` is the number of words in the document
    * :math:`d_{avg}` is the average number of words per document in the corpus
    * :math:`k_1` and :math:`b` are free parameters.

    Parameters
    ----------
    dataset : SArray of type dict, list, or str
        An SArray where each element represents a document in one of the
        following formats:

        * **dict** : a bag-of-words format, where each key is a word and each
          value is the number of times that word occurs in the document.

        * **list** : The list is converted to bag-of-words format, where the
          keys are the unique elements in the list and the values are the
          counts of those unique elements. After this step, the behaviour is
          identical to dict.

        * **string** : Behaves identically to a **dict**, where the dictionary
          is generated by converting the string into a bag-of-words format.
          For example, 'I really like really fluffy dogs' would get converted
          to {'I' : 1, 'really': 2, 'like': 1, 'fluffy': 1, 'dogs': 1}.

    query : A list, set, or SArray of type str
        A list, set or SArray where each element is a word.

    k1 : float, optional
        Free parameter which controls the relative importance of term
        frequencies. Recommended values are in the range [1.2, 2.0].

    b : float, optional
        Free parameter which controls how much to downweight scores for long
        documents. Recommended value is 0.75.

    Returns
    -------
    out : SFrame
        An SFrame containing the BM25 score for each document containing one of
        the query words. The doc_id column is the row number of the document.

    Examples
    --------
    .. sourcecode:: python

        >>> import turicreate

        >>> dataset = turicreate.SArray([
          {'a':5, 'b':7, 'c':10},
          {'a':3, 'c':1, 'd':2},
          {'a':10, 'b':3, 'e':5},
          {'a':1},
          {'f':5}])

        >>> query = ['a', 'b', 'c']
        >>> turicreate.text_analytics.bm25(dataset, query)


    References
    ----------
    .. [BM25] `"Okapi BM-25" <http://en.wikipedia.org/wiki/Okapi_BM25>`_
    """

    if type(dataset) != _turicreate.SArray:
        raise TypeError('bm25 requires an SArray of dict, list, or str type, ' +\
            'where each dictionary has words as keys and word ' + \
            'frequencies as values.')
    sf = _SFrame({'docs': dataset})

    if type(query) is dict:  # For backwards compatibility
        query = list(query.keys())
    if type(query) is _turicreate.SArray:
        query = list(query)
    if type(query) is set:
        query = list(query)
    if type(query) is not list:
        raise TypeError('The query must either be an SArray of str type, ' +\
            'a list of strings, or a set of strings.')

    # Calculate BM25
    sf = sf.add_row_number('doc_id')
    sf = sf.dropna('docs')  # Drop missing documents
    scores = _feature_engineering.BM25(
        'docs', query, k1, b, output_column_name='bm25').fit_transform(sf)

    # Find documents with query words

    if scores['docs'].dtype is dict:
        scores['doc_terms'] = scores['docs'].dict_keys()
    elif scores['docs'].dtype is list:
        scores['doc_terms'] = scores['docs'].apply(lambda x: list(set(x)))
    elif scores['docs'].dtype is str:
        scores['doc_terms'] = count_words(scores['docs']).dict_keys()
    else:
        # This should never occur (handled by BM25)
        raise TypeError('bm25 requires an SArray of dict, list, or str type')
    scores['doc_counts'] = scores['doc_terms'].apply(
        lambda x: len([word for word in query if word in x]))
    scores = scores[scores['doc_counts'] >
                    0]  # Drop documents without query word
    scores = scores.select_columns(['doc_id', 'bm25'])

    return scores
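# Worked sketch of the formula above in plain Python (independent of turicreate),
# using the tiny corpus from the docstring example; values are illustrative only.
import math

docs = [{'a': 5, 'b': 7, 'c': 10}, {'a': 3, 'c': 1, 'd': 2},
        {'a': 10, 'b': 3, 'e': 5}, {'a': 1}, {'f': 5}]
query = ['a', 'b', 'c']
k1, b = 1.5, 0.75

N = len(docs)                                    # number of documents
d_avg = sum(sum(d.values()) for d in docs) / N   # average document length

def idf(q):
    n_q = sum(1 for d in docs if q in d)         # documents containing q
    return math.log((N - n_q + 0.5) / (n_q + 0.5))

def bm25_score(doc):
    D = sum(doc.values())                        # document length |D|
    return sum(idf(q) * doc.get(q, 0) * (k1 + 1) /
               (doc.get(q, 0) + k1 * (1 - b + b * D / d_avg))
               for q in query)

scores = {doc_id: bm25_score(d) for doc_id, d in enumerate(docs)}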
Example #6
def random_split_by_session(dataset, session_id, fraction=0.9, seed=None):
    """
    Randomly split an SFrame into two SFrames based on the `session_id` such
    that one split contains data for a `fraction` of the sessions while the
    second split contains all data for the rest of the sessions.

    Parameters
    ----------
    dataset : SFrame
        Dataset to split. It must contain a column of session ids.

    session_id : string
        The name of the column in `dataset` that corresponds to a unique
        identifier for each session.

    fraction : float, optional
        Fraction of the sessions to fetch for the first returned SFrame.  Must
        be between 0 and 1. Once the sessions are split, all data from a single
        session is in the same SFrame.

    seed : int, optional
        Seed for the random number generator used to split.

    Examples
    --------

    .. sourcecode:: python

        # Split the data so that train has 90% of the users.
        >>> train, valid = tc.activity_classifier.util.random_split_by_session(
        ...     dataset, session_id='session_id', fraction=0.9)

        # For example: If dataset has 2055 sessions
        >>> len(dataset['session_id'].unique())
        2055

        # The training set now has 90% of the sessions
        >>> len(train['session_id'].unique())
        1850

        # The validation set has the remaining 10% of the sessions
        >>> len(valid['session_id'].unique())
        205
    """

    _raise_error_if_not_of_type(dataset, _SFrame, 'dataset')
    _raise_error_if_not_of_type(session_id, str, 'session_id')
    _raise_error_if_not_of_type(fraction, float, 'fraction')
    _raise_error_if_not_of_type(seed, [int, type(None)], 'seed')
    _numeric_param_check_range('fraction', fraction, 0, 1)

    if session_id not in dataset.column_names():
        raise _ToolkitError(
            'Input "dataset" must contain a column called %s.' % session_id)

    unique_sessions = _SFrame({'session': dataset[session_id].unique()})
    if len(unique_sessions) < _MIN_NUM_SESSIONS_FOR_SPLIT:
        print(
            "The dataset has less than the minimum of",
            _MIN_NUM_SESSIONS_FOR_SPLIT,
            "sessions required for train-validation split. Continuing without validation set"
        )
        return dataset, None

    if seed is None:
        # Include the nanosecond component as well.
        import time
        seed = abs(hash("%0.20f" % time.time())) % (2**31)

    # The cython bindings require this to be an int, so cast if we can.
    try:
        seed = int(seed)
    except ValueError:
        raise ValueError('The \'seed\' parameter must be of type int.')

    random = Random()

    # Create a random binary filter (boolean SArray), using the same probability across all rows
    # that belong to the same session. In expectation, the desired fraction of the sessions will
    # go to the training set.
    # Since boolean filters preserve order, there is no need to re-sort the rows within each session.
    # The boolean filter is a pseudorandom function of the session_id and the
    # global seed above, allowing the train-test split to vary across runs using
    # the same dataset.
    def random_session_pick(session_id_hash):
        random.seed(session_id_hash)
        return random.uniform(0, 1) < fraction

    chosen_filter = dataset[session_id].hash(seed).apply(random_session_pick)

    train = dataset[chosen_filter]
    valid = dataset[1 - chosen_filter]
    return train, valid
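# Toy illustration (hypothetical hash values) of the hash-and-threshold pick used
# above: every row of a session hashes to the same value, so the whole session
# receives the same train/validation decision.
from random import Random

rng = Random()

def pick(session_hash, fraction=0.9):
    rng.seed(session_hash)
    return rng.uniform(0, 1) < fraction

# Rows sharing a session hash always agree, so sessions never straddle the split.
assert pick(123456) == pick(123456)
assert pick(987654) == pick(987654)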
def create(dataset,
           session_id,
           target,
           features=None,
           prediction_window=100,
           validation_set='auto',
           max_iterations=10,
           batch_size=32,
           verbose=True):
    """
    Create an :class:`ActivityClassifier` model.

    Parameters
    ----------
    dataset : SFrame
        Input data which consists of `sessions` of data where each session is
        a sequence of data. The data must be in `stacked` format, grouped by
        session. Within each session, the data is assumed to be sorted
        temporally. Columns in `features` will be used to train a model that
        will make a prediction using labels in the `target` column.

    session_id : string
        Name of the column that contains a unique ID for each session.

    target : string
        Name of the column containing the target variable. The values in this
        column must be of string or integer type. Use `model.classes` to
        retrieve the order in which the classes are mapped.

    features : list[string], optional
        Name of the columns containing the input features that will be used
        for classification. If set to `None`, all columns except `session_id`
        and `target` will be used.

    prediction_window : int, optional
        Number of time units between predictions. For example, if your input
        data is sampled at 100Hz, and the `prediction_window` is set to 100,
        then this model will make a prediction every 1 second.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance to
        prevent the model from overfitting to the training data.

        For each row of the progress table, accuracy is measured over the
        provided training dataset and the `validation_set`. The format of this
        SFrame must be the same as the training set.

        When set to 'auto', a validation set is automatically sampled from the
        training data (if the training data has > 100 sessions). If
        validation_set is set to None, then all the data will be used for
        training.

    max_iterations : int , optional
        Maximum number of iterations/epochs made over the data during the
        training phase.

    batch_size : int, optional
        Number of sequence chunks used per training step. Must be greater than
        the number of GPUs in use.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : ActivityClassifier
        A trained :class:`ActivityClassifier` model.

    Examples
    --------
    .. sourcecode:: python

        >>> import turicreate as tc

        # Training on dummy data
        >>> data = tc.SFrame({
        ...    'accelerometer_x': [0.1, 0.2, 0.3, 0.4, 0.5] * 10,
        ...    'accelerometer_y': [0.5, 0.4, 0.3, 0.2, 0.1] * 10,
        ...    'accelerometer_z': [0.01, 0.01, 0.02, 0.02, 0.01] * 10,
        ...    'session_id': [0, 0, 0] * 10 + [1, 1] * 10,
        ...    'activity': ['walk', 'run', 'run'] * 10 + ['swim', 'swim'] * 10
        ... })

        # Create an activity classifier
        >>> model = tc.activity_classifier.create(data,
        ...     session_id='session_id', target='activity',
        ...     features=['accelerometer_x', 'accelerometer_y', 'accelerometer_z'])

        # Make predictions (as probability vector, or class)
        >>> predictions = model.predict(data)
        >>> predictions = model.predict(data, output_type='probability_vector')

        # Get both predictions and classes together
        >>> predictions = model.classify(data)

        # Get top-k predictions (instead of only the top-1) if your labels
        # have more than 2 classes
        >>> predictions = model.predict_topk(data, k = 3)

        # Evaluate the model
        >>> results = model.evaluate(data)

    See Also
    --------
    ActivityClassifier, util.random_split_by_session
    """
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    from .._mxnet import _mxnet_utils
    from ._mx_model_architecture import _net_params
    from ._sframe_sequence_iterator import SFrameSequenceIter as _SFrameSequenceIter
    from ._sframe_sequence_iterator import prep_data as _prep_data
    from ._mx_model_architecture import _define_model_mxnet, _fit_model_mxnet
    from ._mps_model_architecture import _define_model_mps, _fit_model_mps
    from .._mps_utils import (use_mps as _use_mps, mps_device_name as
                              _mps_device_name, ac_weights_mps_to_mxnet as
                              _ac_weights_mps_to_mxnet)

    if not isinstance(target, str):
        raise _ToolkitError('target must be of type str')
    if not isinstance(session_id, str):
        raise _ToolkitError('session_id must be of type str')
    _tkutl._raise_error_if_sframe_empty(dataset, 'dataset')
    _tkutl._numeric_param_check_range('prediction_window', prediction_window,
                                      1, 400)
    _tkutl._numeric_param_check_range('max_iterations', max_iterations, 0,
                                      _six.MAXSIZE)

    if features is None:
        features = _fe_tkutl.get_column_names(
            dataset,
            interpret_as_excluded=True,
            column_names=[session_id, target])
    if not hasattr(features, '__iter__'):
        raise TypeError("Input 'features' must be a list.")
    for x in features:
        if not isinstance(x, str):
            raise TypeError(
                "Invalid feature %s: Feature names must be of type str." % x)
    if len(features) == 0:
        raise TypeError(
            "Input 'features' must contain at least one column name.")

    start_time = _time.time()
    dataset = _tkutl._toolkits_select_columns(dataset,
                                              features + [session_id, target])
    _tkutl._raise_error_if_sarray_not_expected_dtype(dataset[target], target,
                                                     [str, int])
    _tkutl._raise_error_if_sarray_not_expected_dtype(dataset[session_id],
                                                     session_id, [str, int])

    if isinstance(validation_set, str) and validation_set == 'auto':
        # Computing the number of unique sessions in this way is relatively
        # expensive. Ideally we'd incorporate this logic into the C++ code that
        # chunks the raw data by prediction window.
        # TODO: https://github.com/apple/turicreate/issues/991
        unique_sessions = _SFrame({'session': dataset[session_id].unique()})
        if len(unique_sessions) < _MIN_NUM_SESSIONS_FOR_SPLIT:
            print(
                "The dataset has less than the minimum of",
                _MIN_NUM_SESSIONS_FOR_SPLIT,
                "sessions required for train-validation split. Continuing without validation set"
            )
            validation_set = None
        else:
            dataset, validation_set = _random_split_by_session(
                dataset, session_id)

    # Encode the target column to numerical values
    use_target = target is not None
    dataset, target_map = _encode_target(dataset, target)

    predictions_in_chunk = 20
    chunked_data, num_sessions = _prep_data(dataset,
                                            features,
                                            session_id,
                                            prediction_window,
                                            predictions_in_chunk,
                                            target=target,
                                            verbose=verbose)

    # Decide whether to use MPS GPU, MXnet GPU or CPU
    num_mxnet_gpus = _mxnet_utils.get_num_gpus_in_use(max_devices=num_sessions)
    use_mps = _use_mps() and num_mxnet_gpus == 0

    if verbose:
        if use_mps:
            print('Using GPU to create model ({})'.format(_mps_device_name()))
        elif num_mxnet_gpus == 1:
            print('Using GPU to create model (CUDA)')
        elif num_mxnet_gpus > 1:
            print(
                'Using {} GPUs to create model (CUDA)'.format(num_mxnet_gpus))
        else:
            print('Using CPU to create model')

    # Create data iterators
    user_provided_batch_size = batch_size
    batch_size = max(batch_size, num_mxnet_gpus, 1)
    use_mx_data_batch = not use_mps
    data_iter = _SFrameSequenceIter(chunked_data,
                                    len(features),
                                    prediction_window,
                                    predictions_in_chunk,
                                    batch_size,
                                    use_target=use_target,
                                    mx_output=use_mx_data_batch)

    if validation_set is not None:
        _tkutl._raise_error_if_not_sframe(validation_set, 'validation_set')
        _tkutl._raise_error_if_sframe_empty(validation_set, 'validation_set')
        validation_set = _tkutl._toolkits_select_columns(
            validation_set, features + [session_id, target])
        validation_set = validation_set.filter_by(list(target_map.keys()),
                                                  target)
        validation_set, mapping = _encode_target(validation_set, target,
                                                 target_map)
        chunked_validation_set, _ = _prep_data(validation_set,
                                               features,
                                               session_id,
                                               prediction_window,
                                               predictions_in_chunk,
                                               target=target,
                                               verbose=False)

        valid_iter = _SFrameSequenceIter(chunked_validation_set,
                                         len(features),
                                         prediction_window,
                                         predictions_in_chunk,
                                         batch_size,
                                         use_target=use_target,
                                         mx_output=use_mx_data_batch)
    else:
        valid_iter = None

    # Define model architecture
    context = _mxnet_utils.get_mxnet_context(max_devices=num_sessions)

    # Always create MXnet models, as the pred_model is later saved to the state
    # If MPS is used - the loss_model will be overwritten
    loss_model, pred_model = _define_model_mxnet(len(target_map),
                                                 prediction_window,
                                                 predictions_in_chunk, context)

    if use_mps:
        loss_model = _define_model_mps(batch_size,
                                       len(features),
                                       len(target_map),
                                       prediction_window,
                                       predictions_in_chunk,
                                       is_prediction_model=False)

        log = _fit_model_mps(loss_model, data_iter, valid_iter, max_iterations,
                             verbose)

    else:
        # Train the model using MXNet
        log = _fit_model_mxnet(loss_model, data_iter, valid_iter,
                               max_iterations, num_mxnet_gpus, verbose)

    # Set up prediction model
    pred_model.bind(data_shapes=data_iter.provide_data,
                    label_shapes=None,
                    for_training=False)

    if use_mps:
        mps_params = loss_model.export()
        arg_params, aux_params = _ac_weights_mps_to_mxnet(
            mps_params, _net_params['lstm_h'])
    else:
        arg_params, aux_params = loss_model.get_params()

    pred_model.init_params(arg_params=arg_params, aux_params=aux_params)

    # Save the model
    state = {
        '_pred_model': pred_model,
        'verbose': verbose,
        'training_time': _time.time() - start_time,
        'target': target,
        'classes': sorted(target_map.keys()),
        'features': features,
        'session_id': session_id,
        'prediction_window': prediction_window,
        'max_iterations': max_iterations,
        'num_examples': len(dataset),
        'num_sessions': num_sessions,
        'num_classes': len(target_map),
        'num_features': len(features),
        'training_accuracy': log['train_acc'],
        'training_log_loss': log['train_loss'],
        '_target_id_map': target_map,
        '_id_target_map': {v: k
                           for k, v in target_map.items()},
        '_predictions_in_chunk': predictions_in_chunk,
        '_recalibrated_batch_size': data_iter.batch_size,
        'batch_size': user_provided_batch_size
    }

    if validation_set is not None:
        state['valid_accuracy'] = log['valid_acc']
        state['valid_log_loss'] = log['valid_loss']

    model = ActivityClassifier(state)
    return model
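# Back-of-the-envelope sketch for choosing `prediction_window` (illustrative
# numbers, not defaults): at a 50 Hz sampling rate, a window of 50 rows yields
# one prediction per second, so a 10-minute session produces 600 predictions.
sampling_rate_hz = 50
prediction_window = 50
session_seconds = 10 * 60

rows_per_session = sampling_rate_hz * session_seconds              # 30000 rows
predictions_per_session = rows_per_session // prediction_window    # 600 predictions
seconds_per_prediction = prediction_window / sampling_rate_hz      # 1.0 second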
Example #8
def random_split_by_session(dataset, session_id, fraction=0.9, seed=None):
    """
    Randomly split an SFrame into two SFrames based on the `session_id` such
    that one split contains data for a `fraction` of the sessions while the
    second split contains all data for the rest of the sessions.

    Parameters
    ----------
    dataset : SFrame
        Dataset to split. It must contain a column of session ids.

    session_id : string
        The name of the column in `dataset` that corresponds to a unique
        identifier for each session.

    fraction : float, optional
        Fraction of the sessions to fetch for the first returned SFrame.  Must
        be between 0 and 1. Once the sessions are split, all data from a single
        session is in the same SFrame.

    seed : int, optional
        Seed for the random number generator used to split.

    Examples
    --------

    .. sourcecode:: python

        # Split the data so that train has 90% of the users.
        >>> train, valid = tc.activity_classifier.util.random_split_by_session(
        ...     dataset, session_id='session_id', fraction=0.9)

        # For example: If dataset has 2055 sessions
        >>> len(dataset['session_id'].unique())
        2055

        # The training set now has 90% of the sessions
        >>> len(train['session_id'].unique())
        1850

        # The validation set has the remaining 10% of the sessions
        >>> len(valid['session_id'].unique())
        205
    """

    _raise_error_if_not_of_type(dataset, _SFrame, 'dataset')
    _raise_error_if_not_of_type(session_id, str, 'session_id')
    _raise_error_if_not_of_type(fraction, float, 'fraction')
    _raise_error_if_not_of_type(seed, [int, type(None)], 'seed')
    _numeric_param_check_range('fraction', fraction, 0, 1)

    if session_id not in dataset.column_names():
        raise _ToolkitError(
            'Input "dataset" must contain a column called %s.' % session_id)

    unique_sessions = _SFrame({'session': dataset[session_id].unique()})
    if len(unique_sessions) < _MIN_NUM_SESSIONS_FOR_SPLIT:
        print ("The dataset has less than the minimum of", _MIN_NUM_SESSIONS_FOR_SPLIT, "sessions required for train-validation split. Continuing without validation set")
        return dataset, None

    # We need an actual seed number, which we will later use in the apply function (see below).
    # If the user didn't provide a seed - we can generate one based on current system time
    # (similar to the mechanism behind random.seed(None)).
    if seed is None:
        import time
        seed = int(time.time() * 256)
    
    random = Random()
    
    # Create a random binary filter (boolean SArray), using the same probability across all rows
    # that belong to the same session. In expectation, the desired fraction of the sessions will
    # go to the training set.
    # Since boolean filters preserve order, there is no need to re-sort the rows within each session.
    def random_session_pick(session_id):
        # If we will use only the session_id as the seed - the split will be constant for the
        # same dataset across different runs, which is of course undesired
        random.seed(hash(session_id) + seed)
        return random.uniform(0, 1) < fraction
    
    chosen_filter = dataset[session_id].apply(random_session_pick)
    train = dataset[chosen_filter]
    valid = dataset[1 - chosen_filter]
    return train, valid
Example #9
    def _get_summary_struct(self):
        """
        Returns a structured description of the model, including (where relevant)
        the schema of the training data, description of the training data,
        training statistics, and model hyperparameters.

        Returns
        -------
        sections : list (of list of tuples)
            A list of summary sections. Each section is a list, and each item
            in a section list is a tuple of the form ('<feature>', '<field>').
        section_titles : list
            A list of section titles. The order matches that of the
            'sections' object.
        """

        sections = []
        fields = []

        _features = _precomputed_field(
            _internal_utils.pretty_print_list(self.features))
        _exclude = _precomputed_field(
            _internal_utils.pretty_print_list(self.excluded_features))

        header_fields = [("Features", "features"),
                         ("Excluded Features", "excluded_features")]

        sections.append("Model Fields")
        fields.append(header_fields)

        if self.user_column_interpretations:
            sections.append("User Specified Interpretations")
            fields.append(
                list(sorted(self._get("user_column_interpretations").items())))

        column_interpretations = self._get("column_interpretations")
        features = self._get("features")

        if self._get("fitted") and features is not None:

            n_rows = len(features)
            transform_info = [None] * n_rows

            for i, f in enumerate(features):
                interpretation = column_interpretations[f]
                input_type = self.input_types[f]
                description, output_type = _get_interpretation_description_and_output_type(
                    interpretation, input_type)

                transform_info[i] = (f, input_type.__name__, interpretation,
                                     description, output_type.__name__)

            transform_table = _SFrame()
            transform_table["Column"] = [t[0] for t in transform_info]
            transform_table["Type"] = [t[1] for t in transform_info]
            transform_table["Interpretation"] = [t[2] for t in transform_info]
            transform_table["Transforms"] = [t[3] for t in transform_info]
            transform_table["Output Type"] = [t[4] for t in transform_info]

            fields[-1].append(transform_table)

        return fields, sections
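# Hypothetical consumption sketch: pair each section title with its field list
# (the method above returns the per-section field lists first, then the titles).
fields, section_titles = obj._get_summary_struct()
for title, section in zip(section_titles, fields):
    print(title)
    for item in section:
        print('   ', item)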