def random_split_by_session(dataset, session_id, fraction=0.9, seed=None):
    """
    Randomly split an SFrame into two SFrames based on the `session_id` such
    that one split contains data for a `fraction` of the sessions while the
    second split contains all data for the rest of the sessions.

    Parameters
    ----------
    dataset : SFrame
        Dataset to split. It must contain a column of session ids.

    session_id : string
        The name of the column in `dataset` that corresponds to a unique
        identifier for each session.

    fraction : float, optional
        Fraction of the sessions to fetch for the first returned SFrame. Must
        be between 0 and 1. Once the sessions are split, all data from a
        single session is in the same SFrame.

    seed : int, optional
        Seed for the random number generator used to split.

    Examples
    --------
    .. sourcecode:: python

        # Split the data so that train has 90% of the users.
        >>> train, valid = tc.activity_classifier.util.random_split_by_session(
        ...     dataset, session_id='session_id', fraction=0.9)

        # For example: If dataset has 2055 sessions
        >>> len(dataset['session_id'].unique())
        2055

        # The training set now has 90% of the sessions
        >>> len(train['session_id'].unique())
        1850

        # The validation set has the remaining 10% of the sessions
        >>> len(valid['session_id'].unique())
        205
    """
    _raise_error_if_not_of_type(dataset, _SFrame, 'dataset')
    _raise_error_if_not_of_type(session_id, str, 'session_id')
    _raise_error_if_not_of_type(fraction, float, 'fraction')
    _raise_error_if_not_of_type(seed, [int, type(None)], 'seed')
    _numeric_param_check_range('fraction', fraction, 0, 1)

    if session_id not in dataset.column_names():
        raise _ToolkitError(
            'Input "dataset" must contain a column called %s.' % session_id)

    # Split the set of unique session ids, then use filter_by so that all
    # rows belonging to a chosen session land in the same output SFrame.
    unique_sessions = _SFrame({'session': dataset[session_id].unique()})
    chosen, not_chosen = unique_sessions.random_split(fraction, seed)
    train = dataset.filter_by(chosen['session'], session_id)
    valid = dataset.filter_by(not_chosen['session'], session_id)
    return train, valid
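# A minimal sketch (hypothetical toy data) verifying the invariant the
# function above guarantees: no session id appears in both returned SFrames.
# Assumes turicreate is importable and random_split_by_session is in scope.
def _demo_session_split_invariant():
    import turicreate as tc
    data = tc.SFrame({'session_id': [0, 0, 1, 1, 2, 2],
                      'x': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]})
    train, valid = random_split_by_session(data, session_id='session_id',
                                           fraction=0.5, seed=42)
    # Sessions are split atomically, so the two id sets must be disjoint.
    assert not (set(train['session_id']) & set(valid['session_id']))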
def classify(self, dataset, output_frequency='per_row'):
    """
    Return a classification, for each ``prediction_window`` examples in the
    ``dataset``, using the trained activity classification model. The output
    SFrame contains predictions both as class labels and as the probability
    that the predicted value is the associated label.

    Parameters
    ----------
    dataset : SFrame
        Dataset of new observations. Must include columns with the same
        names as the features and session id used for model training, but
        does not require a target column. Additional columns are ignored.

    output_frequency : {'per_row', 'per_window'}, optional
        The frequency of the predictions which is one of:

        - 'per_row': Each prediction is returned ``prediction_window`` times.
        - 'per_window': Return a single prediction for each
          ``prediction_window`` rows in ``dataset`` per ``session_id``.

    Returns
    -------
    out : SFrame
        An SFrame with model predictions, i.e., class labels and
        probabilities.

    See Also
    --------
    create, evaluate, predict

    Examples
    --------
    >>> classes = model.classify(data)
    """
    _tkutl._check_categorical_option_type(
        'output_frequency', output_frequency, ['per_window', 'per_row'])
    id_target_map = self._id_target_map
    preds = self.predict(dataset,
                         output_type='probability_vector',
                         output_frequency=output_frequency)

    if output_frequency == 'per_row':
        return _SFrame({
            'class': preds.apply(lambda p: id_target_map[_np.argmax(p)]),
            'probability': preds.apply(_np.max)
        })
    elif output_frequency == 'per_window':
        preds['class'] = preds['probability_vector'].apply(
            lambda p: id_target_map[_np.argmax(p)])
        preds['probability'] = preds['probability_vector'].apply(_np.max)
        preds = preds.remove_column('probability_vector')
        return preds
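# A standalone sketch of the conversion classify performs per row: argmax
# picks the winning index in the probability vector, and the model's
# _id_target_map translates it back to a label. The map and vector below are
# made up for illustration.
def _demo_classify_row():
    import numpy as np
    id_target_map = {0: 'walk', 1: 'run', 2: 'swim'}
    p = np.array([0.2, 0.7, 0.1])
    return id_target_map[int(np.argmax(p))], float(np.max(p))   # ('run', 0.7)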
def predict_topk(self, dataset, output_type='probability', k=3,
                 output_frequency='per_row'):
    """
    Return top-k predictions for the ``dataset``, using the trained model.
    Predictions are returned as an SFrame with three columns: `row_id`,
    `class`, and `probability` or `rank`, depending on the ``output_type``
    parameter. When ``output_frequency`` is 'per_window', `row_id` is
    replaced by `prediction_id` and the session id column.

    Parameters
    ----------
    dataset : SFrame
        Dataset of new observations. Must include columns with the same
        names as the features and session id used for model training, but
        does not require a target column. Additional columns are ignored.

    output_type : {'probability', 'rank'}, optional
        Choose the return type of the prediction:

        - `probability`: Probability associated with each label in the
          prediction.
        - `rank`: Rank associated with each label in the prediction.

    k : int, optional
        Number of classes to return for each input example.

    output_frequency : {'per_row', 'per_window'}, optional
        The frequency of the predictions which is one of:

        - 'per_row': Each prediction is returned ``prediction_window`` times.
        - 'per_window': Return a single prediction for each
          ``prediction_window`` rows in ``dataset`` per ``session_id``.

    Returns
    -------
    out : SFrame
        An SFrame with model predictions.

    See Also
    --------
    predict, classify, evaluate

    Examples
    --------
    >>> pred = m.predict_topk(validation_data, k=3)
    >>> pred
    +---------------+-------+-------------------+
    |     row_id    | class |    probability    |
    +---------------+-------+-------------------+
    |       0       |   4   |   0.995623886585  |
    |       0       |   9   |  0.0038311756216  |
    |       0       |   7   | 0.000301006948575 |
    |       1       |   1   |   0.928708016872  |
    |       1       |   3   |  0.0440889261663  |
    |       1       |   2   |  0.0176190119237  |
    |       2       |   3   |   0.996967732906  |
    |       2       |   2   |  0.00151345680933 |
    |       2       |   7   | 0.000637513934635 |
    |       3       |   1   |   0.998070061207  |
    |      ...      |  ...  |        ...        |
    +---------------+-------+-------------------+
    """
    _tkutl._check_categorical_option_type(
        'output_type', output_type, ['probability', 'rank'])
    id_target_map = self._id_target_map
    preds = self.predict(dataset,
                         output_type='probability_vector',
                         output_frequency=output_frequency)

    if output_frequency == 'per_row':
        probs = preds
    elif output_frequency == 'per_window':
        probs = preds['probability_vector']

    if output_type == 'rank':
        # np.argsort is ascending, so the last k indices, reversed, are the
        # top-k classes; enumerate supplies the 0-based rank.
        probs = probs.apply(lambda p: [
            {'class': id_target_map[i], 'rank': rank}
            for rank, i in enumerate(reversed(_np.argsort(p)[-k:]))])
    elif output_type == 'probability':
        probs = probs.apply(lambda p: [
            {'class': id_target_map[i], 'probability': p[i]}
            for i in reversed(_np.argsort(p)[-k:])])

    if output_frequency == 'per_row':
        output = _SFrame({'probs': probs})
        output = output.add_row_number(column_name='row_id')
    elif output_frequency == 'per_window':
        output = _SFrame({
            'probs': probs,
            self.session_id: preds[self.session_id],
            'prediction_id': preds['prediction_id']
        })

    output = output.stack('probs', new_column_name='probs')
    output = output.unpack('probs', column_name_prefix='')
    return output
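# Sketch of the top-k extraction used in predict_topk: np.argsort returns
# indices in ascending order of probability, so the last k entries, reversed,
# are the k most probable classes, and enumerate supplies the 0-based rank.
# The probability values below are made up.
def _demo_topk_extraction():
    import numpy as np
    p = np.array([0.1, 0.6, 0.05, 0.25])
    k = 2
    top = list(reversed(np.argsort(p)[-k:]))          # [1, 3]
    return [{'class': int(i), 'rank': rank} for rank, i in enumerate(top)]
    # -> [{'class': 1, 'rank': 0}, {'class': 3, 'rank': 1}]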
def predict(self, dataset, output_type='class', output_frequency='per_row'):
    """
    Return predictions for ``dataset``, using the trained activity
    classifier. Predictions can be generated as class labels, or as a
    probability vector with probabilities for each class.

    The activity classifier generates a single prediction for each
    ``prediction_window`` rows in ``dataset``, per ``session_id``. Thus the
    number of predictions is smaller than the length of ``dataset``. By
    default each prediction is replicated by ``prediction_window`` to return
    a prediction for each row of ``dataset``. Use ``output_frequency`` to get
    the unreplicated predictions.

    Parameters
    ----------
    dataset : SFrame
        Dataset of new observations. Must include columns with the same
        names as the features used for model training, but does not require
        a target column. Additional columns are ignored.

    output_type : {'class', 'probability_vector'}, optional
        Form of each prediction which is one of:

        - 'probability_vector': Prediction probability associated with each
          class as a vector. The probability of the first class (sorted
          alphanumerically by name of the class in the training set) is in
          position 0 of the vector, the second in position 1 and so on.
        - 'class': Class prediction. This returns the class with maximum
          probability.

    output_frequency : {'per_row', 'per_window'}, optional
        The frequency of the predictions which is one of:

        - 'per_window': Return a single prediction for each
          ``prediction_window`` rows in ``dataset`` per ``session_id``.
        - 'per_row': Convenience option to make sure the number of
          predictions match the number of rows in the dataset. Each
          prediction from the model is repeated ``prediction_window`` times
          during that window.

    Returns
    -------
    out : SArray | SFrame
        If ``output_frequency`` is 'per_row' return an SArray with
        predictions for each row in ``dataset``. If ``output_frequency`` is
        'per_window' return an SFrame with predictions for
        ``prediction_window`` rows in ``dataset``.

    See Also
    --------
    create, evaluate, classify

    Examples
    --------
    .. sourcecode:: python

        # One prediction per row
        >>> probability_predictions = model.predict(
        ...     data, output_type='probability_vector',
        ...     output_frequency='per_row')[:4]
        >>> probability_predictions
        dtype: array
        Rows: 4
        [array('d', [0.01857384294271469, 0.0348394550383091, 0.026018327102065086]),
         array('d', [0.01857384294271469, 0.0348394550383091, 0.026018327102065086]),
         array('d', [0.01857384294271469, 0.0348394550383091, 0.026018327102065086]),
         array('d', [0.01857384294271469, 0.0348394550383091, 0.026018327102065086])]

        # One prediction per window
        >>> class_predictions = model.predict(
        ...     data, output_type='class', output_frequency='per_window')
        >>> class_predictions
        +---------------+------------+-------+
        | prediction_id | session_id | class |
        +---------------+------------+-------+
        |       0       |     3      |   5   |
        |       1       |     3      |   5   |
        |       2       |     3      |   5   |
        |       3       |     3      |   5   |
        |       4       |     3      |   5   |
        |       5       |     3      |   5   |
        |       6       |     3      |   5   |
        |       7       |     3      |   4   |
        |       8       |     3      |   4   |
        |       9       |     3      |   4   |
        |      ...      |    ...     |  ...  |
        +---------------+------------+-------+
    """
    _tkutl._raise_error_if_not_sframe(dataset, 'dataset')
    _tkutl._check_categorical_option_type(
        'output_frequency', output_frequency, ['per_window', 'per_row'])
    _tkutl._check_categorical_option_type(
        'output_type', output_type, ['probability_vector', 'class'])
    from ._sframe_sequence_iterator import SFrameSequenceIter as _SFrameSequenceIter
    from ._sframe_sequence_iterator import prep_data as _prep_data
    from ._sframe_sequence_iterator import _ceil_dev

    prediction_window = self.prediction_window
    chunked_dataset, _ = _prep_data(dataset, self.features, self.session_id,
                                    prediction_window,
                                    self._predictions_in_chunk, verbose=False)
    data_iter = _SFrameSequenceIter(chunked_dataset, len(self.features),
                                    prediction_window,
                                    self._predictions_in_chunk,
                                    self._recalibrated_batch_size,
                                    use_pad=True)

    chunked_data = data_iter.dataset
    preds = self._pred_model.predict(data_iter).asnumpy()

    if output_frequency == 'per_row':
        # Replicate each prediction prediction_window times
        preds = preds.repeat(prediction_window, axis=1)

        # Remove predictions for padded rows
        unpadded_len = chunked_data['chunk_len'].to_numpy()
        preds = [p[:unpadded_len[i]] for i, p in enumerate(preds)]

        # Reshape from (num_of_chunks, chunk_size, num_of_classes)
        # to (ceil(length / prediction_window), num_of_classes)
        # chunk_size is DIFFERENT between chunks - since padding was removed.
        out = _np.concatenate(preds)
        out = out.reshape((-1, len(self._target_id_map)))
        out = _SArray(out)

        if output_type == 'class':
            id_target_map = self._id_target_map
            out = out.apply(lambda c: id_target_map[_np.argmax(c)])

    elif output_frequency == 'per_window':
        # Calculate the number of expected predictions and
        # remove predictions for padded data
        unpadded_len = chunked_data['chunk_len'].apply(
            lambda l: _ceil_dev(l, prediction_window)).to_numpy()
        preds = [p[:unpadded_len[i]] for i, p in enumerate(preds)]

        out = _SFrame({
            self.session_id: chunked_data['session_id'],
            'preds': _SArray(preds, dtype=list)
        }).stack('preds', new_column_name='probability_vector')

        # Calculate the prediction index per session
        out = out.add_row_number(column_name='prediction_id')
        start_sess_idx = out.groupby(
            self.session_id, {'start_idx': _agg.MIN('prediction_id')})
        start_sess_idx = start_sess_idx.unstack(
            [self.session_id, 'start_idx'], new_column_name='idx')['idx'][0]

        if output_type == 'class':
            id_target_map = self._id_target_map
            out['probability_vector'] = out['probability_vector'].apply(
                lambda c: id_target_map[_np.argmax(c)])
            out = out.rename({'probability_vector': 'class'})

    return out
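# Sketch of the 'per_row' replication step in predict: each per-window
# prediction is repeated prediction_window times along the time axis, padded
# rows are trimmed using the true chunk lengths, and the result is flattened
# to one probability row per input row. Shapes and values are made up.
def _demo_per_row_replication():
    import numpy as np
    prediction_window = 3
    # 2 chunks x 2 windows x 2 classes
    preds = np.array([[[0.9, 0.1], [0.2, 0.8]],
                      [[0.6, 0.4], [0.5, 0.5]]])
    preds = preds.repeat(prediction_window, axis=1)   # shape (2, 6, 2)
    unpadded_len = np.array([6, 4])                   # true rows per chunk
    trimmed = [p[:unpadded_len[i]] for i, p in enumerate(preds)]
    return np.concatenate(trimmed).reshape((-1, 2))   # (10, 2): one row each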
def bm25(dataset, query, k1=1.5, b=.75):
    """
    For a given query and set of documents, compute the BM25 score for each
    document. If we have a query with words q_1, ..., q_n the BM25 score for
    a document is:

        .. math:: \\sum_{i=1}^n \\mbox{IDF}(q_i)\\frac{f(q_i) * (k_1+1)}{f(q_i) + k_1 * (1-b+b*|D|/d_{avg})}

    where

    * :math:`\\mbox{IDF}(q_i) = log((N - n(q_i) + .5)/(n(q_i) + .5))`
    * :math:`f(q_i)` is the number of times q_i occurs in the document
    * :math:`n(q_i)` is the number of documents containing q_i
    * :math:`|D|` is the number of words in the document
    * :math:`d_{avg}` is the average number of words per document in the corpus
    * :math:`k_1` and :math:`b` are free parameters.

    Parameters
    ----------
    dataset : SArray of type dict, list, or str
        An SArray where each element represents a document in one of the
        following formats:

        * **dict** : a bag-of-words format, where each key is a word and each
          value is the number of times that word occurs in the document.

        * **list** : The list is converted to a bag-of-words format, where
          the keys are the unique elements in the list and the values are
          the counts of those unique elements. After this step, the
          behaviour is identical to dict.

        * **string** : Behaves identically to a **dict**, where the
          dictionary is generated by converting the string into a
          bag-of-words format. For example, "I really like really fluffy
          dogs" would get converted to
          {'I' : 1, 'really': 2, 'like': 1, 'fluffy': 1, 'dogs': 1}.

    query : A list, set, or SArray of type str
        A list, set or SArray where each element is a word.

    k1 : float, optional
        Free parameter which controls the relative importance of term
        frequencies. Recommended values are [1.2, 2.0].

    b : float, optional
        Free parameter which controls how much to downweight scores for long
        documents. Recommended value is 0.75.

    Returns
    -------
    out : SFrame
        An SFrame containing the BM25 score for each document containing one
        of the query words. The doc_id column is the row number of the
        document.

    Examples
    --------
    .. sourcecode:: python

        >>> import turicreate

        >>> dataset = turicreate.SArray([
        ...     {'a':5, 'b':7, 'c':10},
        ...     {'a':3, 'c':1, 'd':2},
        ...     {'a':10, 'b':3, 'e':5},
        ...     {'a':1},
        ...     {'f':5}])

        >>> query = ['a', 'b', 'c']
        >>> turicreate.text_analytics.bm25(dataset, query)

    References
    ----------
    .. [BM25] `"Okapi BM-25" <http://en.wikipedia.org/wiki/Okapi_BM25>`_
    """
    if type(dataset) != _turicreate.SArray:
        raise TypeError('bm25 requires an SArray of dict, list, or str type, '
                        'where each dictionary\'s keys are words and whose '
                        'values are word frequencies.')
    sf = _SFrame({'docs': dataset})

    if type(query) is dict:  # For backwards compatibility
        query = list(query.keys())
    if type(query) is _turicreate.SArray:
        query = list(query)
    if type(query) is set:
        query = list(query)
    if type(query) is not list:
        raise TypeError('The query must either be an SArray of str type, '
                        'a list of strings, or a set of strings.')

    # Calculate BM25
    sf = sf.add_row_number('doc_id')
    sf = sf.dropna('docs')  # Drop missing documents
    scores = _feature_engineering.BM25(
        'docs', query, k1, b, output_column_name='bm25').fit_transform(sf)

    # Find documents with query words
    if scores['docs'].dtype is dict:
        scores['doc_terms'] = scores['docs'].dict_keys()
    elif scores['docs'].dtype is list:
        scores['doc_terms'] = scores['docs'].apply(lambda x: list(set(x)))
    elif scores['docs'].dtype is str:
        scores['doc_terms'] = count_words(scores['docs']).dict_keys()
    else:
        # This should never occur (handled by BM25)
        raise TypeError('bm25 requires an SArray of dict, list, or str type')
    scores['doc_counts'] = scores['doc_terms'].apply(
        lambda x: len([word for word in query if word in x]))

    # Drop documents without query word
    scores = scores[scores['doc_counts'] > 0]
    scores = scores.select_columns(['doc_id', 'bm25'])

    return scores
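# A pure-Python rendering of the BM25 formula from the docstring above, handy
# for sanity-checking the transformer's output on a toy corpus. Variable names
# mirror the math; the example corpus is the docstring's dummy data. This is a
# reference sketch, not the library's implementation.
def _bm25_reference(doc, query, corpus, k1=1.5, b=0.75):
    import math
    N = len(corpus)                                        # number of docs
    d_avg = sum(sum(d.values()) for d in corpus) / float(N)
    D = sum(doc.values())                                  # words in this doc
    score = 0.0
    for q in query:
        n_q = sum(1 for d in corpus if q in d)             # docs containing q
        idf = math.log((N - n_q + 0.5) / (n_q + 0.5))
        f_q = doc.get(q, 0)                                # term frequency
        score += idf * (f_q * (k1 + 1)) / (f_q + k1 * (1 - b + b * D / d_avg))
    return score

# Example: score the first document of the docstring's corpus for ['a','b','c']
# _corpus = [{'a': 5, 'b': 7, 'c': 10}, {'a': 3, 'c': 1, 'd': 2},
#            {'a': 10, 'b': 3, 'e': 5}, {'a': 1}, {'f': 5}]
# _bm25_reference(_corpus[0], ['a', 'b', 'c'], _corpus)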
def random_split_by_session(dataset, session_id, fraction=0.9, seed=None):
    """
    Randomly split an SFrame into two SFrames based on the `session_id` such
    that one split contains data for a `fraction` of the sessions while the
    second split contains all data for the rest of the sessions.

    Parameters
    ----------
    dataset : SFrame
        Dataset to split. It must contain a column of session ids.

    session_id : string
        The name of the column in `dataset` that corresponds to a unique
        identifier for each session.

    fraction : float, optional
        Fraction of the sessions to fetch for the first returned SFrame. Must
        be between 0 and 1. Once the sessions are split, all data from a
        single session is in the same SFrame.

    seed : int, optional
        Seed for the random number generator used to split.

    Examples
    --------
    .. sourcecode:: python

        # Split the data so that train has 90% of the users.
        >>> train, valid = tc.activity_classifier.util.random_split_by_session(
        ...     dataset, session_id='session_id', fraction=0.9)

        # For example: If dataset has 2055 sessions
        >>> len(dataset['session_id'].unique())
        2055

        # The training set now has 90% of the sessions
        >>> len(train['session_id'].unique())
        1850

        # The validation set has the remaining 10% of the sessions
        >>> len(valid['session_id'].unique())
        205
    """
    _raise_error_if_not_of_type(dataset, _SFrame, 'dataset')
    _raise_error_if_not_of_type(session_id, str, 'session_id')
    _raise_error_if_not_of_type(fraction, float, 'fraction')
    _raise_error_if_not_of_type(seed, [int, type(None)], 'seed')
    _numeric_param_check_range('fraction', fraction, 0, 1)

    if session_id not in dataset.column_names():
        raise _ToolkitError(
            'Input "dataset" must contain a column called %s.' % session_id)

    unique_sessions = _SFrame({'session': dataset[session_id].unique()})
    if len(unique_sessions) < _MIN_NUM_SESSIONS_FOR_SPLIT:
        print("The dataset has less than the minimum of",
              _MIN_NUM_SESSIONS_FOR_SPLIT,
              "sessions required for train-validation split. "
              "Continuing without validation set")
        return dataset, None

    if seed is None:
        # Include the nanosecond component as well.
        import time
        seed = abs(hash("%0.20f" % time.time())) % (2 ** 31)

    # The cython bindings require this to be an int, so cast if we can.
    try:
        seed = int(seed)
    except ValueError:
        raise ValueError("The 'seed' parameter must be of type int.")

    random = Random()

    # Create a random binary filter (boolean SArray), using the same
    # probability across all lines that belong to the same session. In
    # expectation, the desired fraction of the sessions will go to the
    # training set. Since boolean filters preserve order, there is no need to
    # re-sort the lines within each session.
    # The boolean filter is a pseudorandom function of the session_id and the
    # global seed above, so a fixed seed reproduces the same split, while an
    # unset seed lets the train-test split vary across runs on the same
    # dataset.
    def random_session_pick(session_id_hash):
        random.seed(session_id_hash)
        return random.uniform(0, 1) < fraction

    chosen_filter = dataset[session_id].hash(seed).apply(random_session_pick)
    train = dataset[chosen_filter]
    valid = dataset[1 - chosen_filter]
    return train, valid
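# Sketch of the per-session Bernoulli draw used above: seeding a fresh Random
# with the (already seed-salted) hash of a session id means every row of that
# session receives the identical coin flip, and a fixed (dataset, seed) pair
# reproduces the same split. The hash value below is illustrative.
def _demo_session_pick(session_id_hash=123456, fraction=0.9):
    from random import Random
    rng = Random()
    rng.seed(session_id_hash)
    first = rng.uniform(0, 1) < fraction
    rng.seed(session_id_hash)                 # re-seeding reproduces the draw
    assert first == (rng.uniform(0, 1) < fraction)
    return first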
def create(dataset, session_id, target, features=None, prediction_window=100,
           validation_set='auto', max_iterations=10, batch_size=32,
           verbose=True):
    """
    Create an :class:`ActivityClassifier` model.

    Parameters
    ----------
    dataset : SFrame
        Input data which consists of `sessions` of data where each session
        is a sequence of data. The data must be in `stacked` format, grouped
        by session. Within each session, the data is assumed to be sorted
        temporally. Columns in `features` will be used to train a model that
        will make a prediction using labels in the `target` column.

    session_id : string
        Name of the column that contains a unique ID for each session.

    target : string
        Name of the column containing the target variable. The values in
        this column must be of string or integer type. Use `model.classes`
        to retrieve the order in which the classes are mapped.

    features : list[string], optional
        Name of the columns containing the input features that will be used
        for classification. If set to `None`, all columns except
        `session_id` and `target` will be used.

    prediction_window : int, optional
        Number of time units between predictions. For example, if your input
        data is sampled at 100Hz, and the `prediction_window` is set to 100,
        then this model will make a prediction every 1 second.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance to
        prevent the model from overfitting to the training data.

        For each row of the progress table, accuracy is measured over the
        provided training dataset and the `validation_set`. The format of
        this SFrame must be the same as the training set.

        When set to 'auto', a validation set is automatically sampled from
        the training data (if the training data has > 100 sessions). If
        validation_set is set to None, then all the data will be used for
        training.

    max_iterations : int, optional
        Maximum number of iterations/epochs made over the data during the
        training phase.

    batch_size : int, optional
        Number of sequence chunks used per training step. Must be greater
        than the number of GPUs in use.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : ActivityClassifier
        A trained :class:`ActivityClassifier` model.

    Examples
    --------
    .. sourcecode:: python

        >>> import turicreate as tc

        # Training on dummy data
        >>> data = tc.SFrame({
        ...    'accelerometer_x': [0.1, 0.2, 0.3, 0.4, 0.5] * 10,
        ...    'accelerometer_y': [0.5, 0.4, 0.3, 0.2, 0.1] * 10,
        ...    'accelerometer_z': [0.01, 0.01, 0.02, 0.02, 0.01] * 10,
        ...    'session_id': [0, 0, 0] * 10 + [1, 1] * 10,
        ...    'activity': ['walk', 'run', 'run'] * 10 + ['swim', 'swim'] * 10
        ... })

        # Create an activity classifier
        >>> model = tc.activity_classifier.create(data,
        ...     session_id='session_id', target='activity',
        ...     features=['accelerometer_x', 'accelerometer_y', 'accelerometer_z'])

        # Make predictions (as probability vector, or class)
        >>> predictions = model.predict(data)
        >>> predictions = model.predict(data, output_type='probability_vector')

        # Get both predictions and classes together
        >>> predictions = model.classify(data)

        # Get top-k predictions (instead of only top-1) if your labels have
        # more than 2 classes
        >>> predictions = model.predict_topk(data, k=3)

        # Evaluate the model
        >>> results = model.evaluate(data)

    See Also
    --------
    ActivityClassifier, util.random_split_by_session
    """
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")

    from .._mxnet import _mxnet_utils
    from ._mx_model_architecture import _net_params
    from ._sframe_sequence_iterator import SFrameSequenceIter as _SFrameSequenceIter
    from ._sframe_sequence_iterator import prep_data as _prep_data
    from ._mx_model_architecture import _define_model_mxnet, _fit_model_mxnet
    from ._mps_model_architecture import _define_model_mps, _fit_model_mps
    from .._mps_utils import (use_mps as _use_mps,
                              mps_device_name as _mps_device_name,
                              ac_weights_mps_to_mxnet as _ac_weights_mps_to_mxnet)

    if not isinstance(target, str):
        raise _ToolkitError('target must be of type str')
    if not isinstance(session_id, str):
        raise _ToolkitError('session_id must be of type str')
    _tkutl._raise_error_if_sframe_empty(dataset, 'dataset')
    _tkutl._numeric_param_check_range(
        'prediction_window', prediction_window, 1, 400)
    _tkutl._numeric_param_check_range(
        'max_iterations', max_iterations, 0, _six.MAXSIZE)

    if features is None:
        features = _fe_tkutl.get_column_names(
            dataset, interpret_as_excluded=True,
            column_names=[session_id, target])
    if not hasattr(features, '__iter__'):
        raise TypeError("Input 'features' must be a list.")
    # Report the offending feature name instead of referencing an undefined
    # loop variable outside the comprehension.
    for x in features:
        if not isinstance(x, str):
            raise TypeError(
                "Invalid feature %s: Feature names must be of type str." % x)
    if len(features) == 0:
        raise TypeError(
            "Input 'features' must contain at least one column name.")

    start_time = _time.time()
    dataset = _tkutl._toolkits_select_columns(
        dataset, features + [session_id, target])
    _tkutl._raise_error_if_sarray_not_expected_dtype(
        dataset[target], target, [str, int])
    _tkutl._raise_error_if_sarray_not_expected_dtype(
        dataset[session_id], session_id, [str, int])

    if isinstance(validation_set, str) and validation_set == 'auto':
        # Computing the number of unique sessions in this way is relatively
        # expensive. Ideally we'd incorporate this logic into the C++ code
        # that chunks the raw data by prediction window.
        # TODO: https://github.com/apple/turicreate/issues/991
        unique_sessions = _SFrame({'session': dataset[session_id].unique()})
        if len(unique_sessions) < _MIN_NUM_SESSIONS_FOR_SPLIT:
            print("The dataset has less than the minimum of",
                  _MIN_NUM_SESSIONS_FOR_SPLIT,
                  "sessions required for train-validation split. "
                  "Continuing without validation set")
            validation_set = None
        else:
            dataset, validation_set = _random_split_by_session(
                dataset, session_id)

    # Encode the target column to numerical values
    use_target = target is not None
    dataset, target_map = _encode_target(dataset, target)

    predictions_in_chunk = 20
    chunked_data, num_sessions = _prep_data(
        dataset, features, session_id, prediction_window,
        predictions_in_chunk, target=target, verbose=verbose)

    # Decide whether to use MPS GPU, MXNet GPU, or CPU
    num_mxnet_gpus = _mxnet_utils.get_num_gpus_in_use(
        max_devices=num_sessions)
    use_mps = _use_mps() and num_mxnet_gpus == 0
    if verbose:
        if use_mps:
            print('Using GPU to create model ({})'.format(
                _mps_device_name()))
        elif num_mxnet_gpus == 1:
            print('Using GPU to create model (CUDA)')
        elif num_mxnet_gpus > 1:
            print('Using {} GPUs to create model (CUDA)'.format(
                num_mxnet_gpus))
        else:
            print('Using CPU to create model')

    # Create data iterators
    user_provided_batch_size = batch_size
    batch_size = max(batch_size, num_mxnet_gpus, 1)
    use_mx_data_batch = not use_mps
    data_iter = _SFrameSequenceIter(chunked_data, len(features),
                                    prediction_window, predictions_in_chunk,
                                    batch_size, use_target=use_target,
                                    mx_output=use_mx_data_batch)

    if validation_set is not None:
        _tkutl._raise_error_if_not_sframe(validation_set, 'validation_set')
        _tkutl._raise_error_if_sframe_empty(validation_set, 'validation_set')
        validation_set = _tkutl._toolkits_select_columns(
            validation_set, features + [session_id, target])
        validation_set = validation_set.filter_by(
            list(target_map.keys()), target)
        validation_set, mapping = _encode_target(
            validation_set, target, target_map)
        chunked_validation_set, _ = _prep_data(
            validation_set, features, session_id, prediction_window,
            predictions_in_chunk, target=target, verbose=False)

        valid_iter = _SFrameSequenceIter(chunked_validation_set,
                                         len(features), prediction_window,
                                         predictions_in_chunk, batch_size,
                                         use_target=use_target,
                                         mx_output=use_mx_data_batch)
    else:
        valid_iter = None

    # Define model architecture
    context = _mxnet_utils.get_mxnet_context(max_devices=num_sessions)

    # Always create MXNet models, as the pred_model is later saved to the
    # state. If MPS is used, the loss_model will be overwritten.
    loss_model, pred_model = _define_model_mxnet(
        len(target_map), prediction_window, predictions_in_chunk, context)

    if use_mps:
        loss_model = _define_model_mps(batch_size, len(features),
                                       len(target_map), prediction_window,
                                       predictions_in_chunk,
                                       is_prediction_model=False)
        log = _fit_model_mps(loss_model, data_iter, valid_iter,
                             max_iterations, verbose)
    else:
        # Train the model using MXNet
        log = _fit_model_mxnet(loss_model, data_iter, valid_iter,
                               max_iterations, num_mxnet_gpus, verbose)

    # Set up prediction model
    pred_model.bind(data_shapes=data_iter.provide_data, label_shapes=None,
                    for_training=False)

    if use_mps:
        mps_params = loss_model.export()
        arg_params, aux_params = _ac_weights_mps_to_mxnet(
            mps_params, _net_params['lstm_h'])
    else:
        arg_params, aux_params = loss_model.get_params()
    pred_model.init_params(arg_params=arg_params, aux_params=aux_params)

    # Save the model
    state = {
        '_pred_model': pred_model,
        'verbose': verbose,
        'training_time': _time.time() - start_time,
        'target': target,
        'classes': sorted(target_map.keys()),
        'features': features,
        'session_id': session_id,
        'prediction_window': prediction_window,
        'max_iterations': max_iterations,
        'num_examples': len(dataset),
        'num_sessions': num_sessions,
        'num_classes': len(target_map),
        'num_features': len(features),
        'training_accuracy': log['train_acc'],
        'training_log_loss': log['train_loss'],
        '_target_id_map': target_map,
        '_id_target_map': {v: k for k, v in target_map.items()},
        '_predictions_in_chunk': predictions_in_chunk,
        '_recalibrated_batch_size': data_iter.batch_size,
        'batch_size': user_provided_batch_size
    }

    if validation_set is not None:
        state['valid_accuracy'] = log['valid_acc']
        state['valid_log_loss'] = log['valid_loss']

    model = ActivityClassifier(state)
    return model
def random_split_by_session(dataset, session_id, fraction=0.9, seed=None):
    """
    Randomly split an SFrame into two SFrames based on the `session_id` such
    that one split contains data for a `fraction` of the sessions while the
    second split contains all data for the rest of the sessions.

    Parameters
    ----------
    dataset : SFrame
        Dataset to split. It must contain a column of session ids.

    session_id : string
        The name of the column in `dataset` that corresponds to a unique
        identifier for each session.

    fraction : float, optional
        Fraction of the sessions to fetch for the first returned SFrame. Must
        be between 0 and 1. Once the sessions are split, all data from a
        single session is in the same SFrame.

    seed : int, optional
        Seed for the random number generator used to split.

    Examples
    --------
    .. sourcecode:: python

        # Split the data so that train has 90% of the users.
        >>> train, valid = tc.activity_classifier.util.random_split_by_session(
        ...     dataset, session_id='session_id', fraction=0.9)

        # For example: If dataset has 2055 sessions
        >>> len(dataset['session_id'].unique())
        2055

        # The training set now has 90% of the sessions
        >>> len(train['session_id'].unique())
        1850

        # The validation set has the remaining 10% of the sessions
        >>> len(valid['session_id'].unique())
        205
    """
    _raise_error_if_not_of_type(dataset, _SFrame, 'dataset')
    _raise_error_if_not_of_type(session_id, str, 'session_id')
    _raise_error_if_not_of_type(fraction, float, 'fraction')
    _raise_error_if_not_of_type(seed, [int, type(None)], 'seed')
    _numeric_param_check_range('fraction', fraction, 0, 1)

    if session_id not in dataset.column_names():
        raise _ToolkitError(
            'Input "dataset" must contain a column called %s.' % session_id)

    unique_sessions = _SFrame({'session': dataset[session_id].unique()})
    if len(unique_sessions) < _MIN_NUM_SESSIONS_FOR_SPLIT:
        print("The dataset has less than the minimum of",
              _MIN_NUM_SESSIONS_FOR_SPLIT,
              "sessions required for train-validation split. "
              "Continuing without validation set")
        return dataset, None

    # We need an actual seed number, which we will later use in the apply
    # function (see below). If the user didn't provide a seed, generate one
    # based on the current system time (similar to the mechanism behind
    # random.seed(None)).
    if seed is None:
        import time
        seed = int(time.time() * 256)  # Python 3: 'long' no longer exists

    random = Random()

    # Create a random binary filter (boolean SArray), using the same
    # probability across all lines that belong to the same session. In
    # expectation, the desired fraction of the sessions will go to the
    # training set. Since boolean filters preserve order, there is no need to
    # re-sort the lines within each session.
    def random_session_pick(session_id):
        # If we were to use only the session_id as the seed, the split would
        # be constant for the same dataset across different runs, which is of
        # course undesired.
        random.seed(hash(session_id) + seed)
        return random.uniform(0, 1) < fraction

    chosen_filter = dataset[session_id].apply(random_session_pick)
    train = dataset[chosen_filter]
    valid = dataset[1 - chosen_filter]
    return train, valid
def _get_summary_struct(self):
    """
    Returns a structured description of the model, including (where
    relevant) the schema of the training data, description of the training
    data, training statistics, and model hyperparameters.

    Returns
    -------
    sections : list (of list of tuples)
        A list of summary sections. Each section is a list. Each item in a
        section list is a tuple of the form: ('<feature>', '<field>')

    section_titles : list
        A list of section titles. The order matches that of the 'sections'
        object.
    """
    sections = []
    fields = []

    _features = _precomputed_field(
        _internal_utils.pretty_print_list(self.features))
    _exclude = _precomputed_field(
        _internal_utils.pretty_print_list(self.excluded_features))

    header_fields = [("Features", "features"),
                     ("Excluded Features", "excluded_features")]

    sections.append("Model Fields")
    fields.append(header_fields)

    if self.user_column_interpretations:
        sections.append("User Specified Interpretations")
        fields.append(
            list(sorted(self._get("user_column_interpretations").items())))

    column_interpretations = self._get("column_interpretations")
    features = self._get("features")

    if self._get("fitted") and features is not None:
        n_rows = len(features)
        transform_info = [None] * n_rows

        for i, f in enumerate(features):
            interpretation = column_interpretations[f]
            input_type = self.input_types[f]
            description, output_type = \
                _get_interpretation_description_and_output_type(
                    interpretation, input_type)

            transform_info[i] = (f, input_type.__name__, interpretation,
                                 description, output_type.__name__)

        transform_table = _SFrame()
        transform_table["Column"] = [t[0] for t in transform_info]
        transform_table["Type"] = [t[1] for t in transform_info]
        transform_table["Interpretation"] = [t[2] for t in transform_info]
        transform_table["Transforms"] = [t[3] for t in transform_info]
        transform_table["Output Type"] = [t[4] for t in transform_info]

        fields[-1].append(transform_table)

    return fields, sections