class _SampleTransformer(Transformer): get_default_options = staticmethod( _get_default_options_wrapper('_SampleTransformer', '_SampleTransformer', '_SampleTransformer', True)) def __init__(self, features=None, constant=0.5): # Set up options opts = {} opts['features'] = features opts['constant'] = constant # Initialize object proxy = _gl.extensions._SampleTransformer() proxy.init_transformer(opts) super(_SampleTransformer, self).__init__(proxy, self.__class__) def _get_summary_struct(self): """ Returns a structured description of the model, including (where relevant) the schema of the training data, description of the training data, training statistics, and model hyperparameters. Returns ------- sections : list (of list of tuples) A list of summary sections. Each section is a list. Each item in a section list is a tuple of the form: ('<label>','<field>') section_titles: list A list of section titles. The order matches that of the 'sections' object. """ section = [] section_titles = ['Attributes'] for f in self.list_fields(): section.append(("%s" % f, "%s" % f)) return ([section], section_titles) def __repr__(self): (section, section_titles) = self._get_summary_struct() return _toolkit_repr_print(self, section, section_titles, width=30)
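# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the toolkit): _get_summary_struct above
# returns sections as lists of ('<label>', '<field>') tuples plus parallel
# section titles. The tiny renderer below is a plain-Python stand-in for
# _toolkit_repr_print, whose real signature and formatting may differ; the
# field values are looked up from an ordinary dict here instead of a model.
def render_summary(model_fields, sections, section_titles, width=30):
    lines = []
    for title, section in zip(section_titles, sections):
        lines.append(title)
        for label, field in section:
            value = model_fields.get(field, field)
            lines.append(('%-' + str(width) + 's: %s') % (label, value))
    return '\n'.join(lines)

# Example with made-up fields mirroring _SampleTransformer's options.
_example_fields = {'features': ['a', 'b'], 'constant': 0.5}
_example_sections = [[('Features', 'features'), ('Constant', 'constant')]]
print(render_summary(_example_fields, _example_sections, ['Attributes']))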
from graphlab.toolkits._model import Model as _Model from graphlab.data_structures.sframe import SFrame as _SFrame from graphlab.data_structures.sarray import SArray as _SArray from graphlab.toolkits.text_analytics._util import _check_input from graphlab.toolkits.text_analytics._util import random_split as _random_split from graphlab.toolkits._internal_utils import _check_categorical_option_type from graphlab.toolkits._internal_utils import _map_unity_proxy_to_object from itertools import izip as _izip import array as _array import json as _json from graphlab.toolkits._model import _get_default_options_wrapper get_default_options = _get_default_options_wrapper( 'cgs_topic_model', 'topic_model', 'TopicModel') def create(dataset, num_topics=10, initial_topics=None, alpha=None, beta=.1, num_iterations=10, associations=None, verbose=False, print_interval=10, validation_set=None, method='auto'): """ Create a topic model from the given data set. A topic model assumes each
import graphlab.toolkits._supervised_learning as _sl from graphlab.toolkits._supervised_learning import Classifier as _Classifier from graphlab.toolkits._model import _get_default_options_wrapper from graphlab.toolkits._internal_utils import _raise_error_if_not_sframe, \ _map_unity_proxy_to_object, \ _toolkit_repr_print, \ _numeric_param_check_range from graphlab.toolkits._model_workflow import _collect_model_workflow from graphlab.util import cloudpickle as _cloudpickle import logging as _logging from copy import copy as _copy get_default_options = _get_default_options_wrapper( 'neuralnet_classifier_v2', 'neuralnet_classifier', 'NeuralNetClassifier') _context_doc_string = ''' >>> data = graphlab.SFrame('http://s3.amazonaws.com/dato-datasets/mnist/sframe/train') >>> training_data, validation_data = data.random_split(0.8) >>> net = graphlab.deeplearning.get_builtin_neuralnet('mnist') >>> m = graphlab.neuralnet_classifier.create(training_data, ... target='label', ... network=net, ... max_iterations=3) ''' class NeuralNetClassifier(_Classifier): """
detection tool. """ import time as _time import graphlab as _gl import graphlab.connect as _mt from graphlab.toolkits._model import SDKModel as _SDKModel import graphlab.toolkits._internal_utils as _tkutl from graphlab.toolkits._private_utils import _summarize_accessible_fields from graphlab.toolkits._main import ToolkitError as _ToolkitError from graphlab.toolkits._model import _get_default_options_wrapper import datetime as _dt import logging as _logging get_default_options = _get_default_options_wrapper( '_BayesianOnlineChangepoint', '_BayesianOnlineChangepoint', '_BayesianOnlineChangepoint', True) def create(dataset, feature=None, expected_runlength=250, lag=7): """ Create a `BayesianChangepointsModel`. The changepoint detection calculates where there is a shift in mean or variance in a univariate timeseries. This model calculates a probability that a given point is changepoint, given the data up to the point. The BayesianChangepointsModel works with either TimeSeries, SArray, or SFrame inputs. The model created by this function contains a table `scores` that contains the computed anomaly scores. The type of `scores` matches the type of the input `dataset`, and the table contains 4 columns:
from graphlab.util import _make_internal_url _RANDOM_FOREST_MODEL_PARAMS_KEYS = [ 'max_depth', 'min_child_weight', 'min_loss_reduction', 'row_subsample' ] _RANDOM_FOREST_TRAINING_PARAMS_KEYS = [ 'objective', 'training_time', 'training_error', 'validation_error', 'evaluation_metric' ] _RANDOM_FOREST_TRAINING_DATA_PARAMS_KEYS = [ 'target', 'features', 'num_features', 'num_examples', 'num_validation_examples' ] get_default_options = _get_default_options_wrapper('random_forest_regression', 'random_forest_regression', 'RandomForestRegression') class RandomForestRegression(_SupervisedLearningModel, _TreeModelMixin): """ Encapsulates random forest models for regression tasks. The prediction is based on a collection of base learners, `regression trees <http://en.wikipedia.org/wiki/Decision_tree_learning>`_ and combines them through a technique called `random forest <http://en.wikipedia.org/wiki/Random_forest>`_. Different from linear models, e.g. linear regression, the random forests are able to model non-linear interactions between the features and the target using decision trees as the subroutine. It is good for handling numerical features and categorical features with
class FeatureHasher(Transformer):
    '''
    Hashes an input feature space to an n-bit feature space.

    Feature hashing is an efficient way of vectorizing features, and
    performing dimensionality reduction or expansion along the way. Supported
    types include array.array, list, dict, float, int, and string. The
    behavior for creating keys and values for different input data column
    types is given below.

    * **array.array** : Keys are created by 1) combining the index of an
      element and the column name, 2) hashing the combination of the two.
      Each element in the array becomes a value in the returned dictionary.

    * **list** : Behaves the same as array.array, but if an element is
      non-numerical, the element is combined with the column name and hashed,
      and 1 is used as the value.

    * **dict** : Each key in the dictionary is combined with the column name
      and hashed, and the value is kept. If the value is non-numerical, the
      element is combined with the column name and hashed, and 1 is used as
      the value.

    * **float** : The column name is hashed, and the column entry becomes the
      value.

    * **int** : Same behavior as float.

    * **string** : Hash the string and use it as a key, and use 1 as the
      value.

    The hashed values are collapsed into a single sparse representation of a
    vector, so all hashed columns are replaced by a single column with the
    name specified by 'output_column_name'.

    Parameters
    ----------
    features : list[str] | str | None, optional
        Name(s) of feature column(s) to be transformed. If set to None, then
        all columns are used.

    excluded_features : list[str] | str | None, optional
        Name(s) of feature columns in the input dataset to be ignored. Either
        `excluded_features` or `features` can be passed, but not both.

    num_bits : int, optional
        The number of bits to hash to. There will be :math:`2^{num\_bits}`
        indices in the resulting vector.

    output_column_name : str, optional
        The name of the output column. If the column already exists, then a
        suffix is appended to the name.

    Returns
    -------
    out : FeatureHasher
        A FeatureHasher object which is initialized with the defined
        parameters.

    Notes
    -----
    - Each time a key is hashed, the corresponding value is multiplied by
      either 1.0 or -1.0, chosen with equal probability. The final hashed
      feature value is the accumulation of values for all keys hashed to that
      bucket.

    References
    ----------
    - Collaborative Spam Filtering with the Hashing Trick. J. Attenberg,
      K. Q. Weinberger, A. Smola, A. Dasgupta, M. Zinkevich. Virus Bulletin
      (VB) 2009.

    See Also
    --------
    graphlab.toolkits.feature_engineering._feature_hasher.FeatureHasher
    graphlab.toolkits.feature_engineering.create

    Examples
    --------
    .. sourcecode:: python

        from graphlab.toolkits.feature_engineering import *

        # Hash the feature space ['a', 'b', 'c'] into a single space.
        >>> sf = graphlab.SFrame({'a': [1,2,3], 'b' : [2,3,4], 'c': [9,10,11]})
        >>> hasher = graphlab.feature_engineering.create(sf,
                FeatureHasher(features = ['a', 'b', 'c']))

        # Transform the data using the hasher.
        >>> hashed_sf = hasher.transform(sf)
        >>> hashed_sf
        Columns:
            hashed_features dict

        Rows: 3

        Data:
        +-------------------------------+
        |        hashed_features        |
        +-------------------------------+
        | {79785: -1, 188475: -2, 21... |
        | {79785: -2, 188475: -3, 21... |
        | {79785: -3, 188475: -4, 21... |
        +-------------------------------+
        [3 rows x 1 columns]

        # Save the transformer.
>>> hasher.save('save-path') ''' _fit_examples_doc = _fit_examples_doc _fit_transform_examples_doc = _fit_transform_examples_doc _transform_examples_doc = _transform_examples_doc get_default_options = staticmethod( _get_default_options_wrapper( '_FeatureHasher', 'toolkits.feature_engineering._feature_hasher', 'FeatureHasher', True)) def __init__(self, features=None, excluded_features=None, num_bits=18, output_column_name='hashed_features'): # Process and make a copy of the features, exclude. _features, _exclude = _internal_utils.process_features( features, excluded_features) # Type checking _raise_error_if_not_of_type(num_bits, [int]) _raise_error_if_not_of_type(output_column_name, [str]) # Set up options opts = { 'num_bits': num_bits, 'output_column_name': output_column_name, } if _exclude: opts['exclude'] = True opts['features'] = _exclude else: opts['exclude'] = False opts['features'] = _features # Initialize object proxy = _gl.extensions._FeatureHasher() proxy.init_transformer(opts) super(FeatureHasher, self).__init__(proxy, self.__class__) def _get_summary_struct(self): """ Returns a structured description of the model, including (where relevant) the schema of the training data, description of the training data, training statistics, and model hyperparameters. Returns ------- sections : list (of list of tuples) A list of summary sections. Each section is a list. Each item in a section list is a tuple of the form: ('<label>','<field>') section_titles: list A list of section titles. The order matches that of the 'sections' object. """ _features = _precomputed_field( _internal_utils.pretty_print_list(self.get('features'))) _exclude = _precomputed_field( _internal_utils.pretty_print_list(self.get('excluded_features'))) fields = [("Features", _features), ("Excluded features", _exclude), ("Output column name", 'output_column_name'), ("Number of bits", 'num_bits')] section_titles = ['Model fields'] return ([fields], section_titles) def __repr__(self): (sections, section_titles) = self._get_summary_struct() return _toolkit_repr_print(self, sections, section_titles, width=30) @classmethod def _get_instance_and_data(cls): sf = _gl.SFrame({'a': [1, 2, 3], 'b': [2, 3, 4]}) hasher = _gl.feature_engineering.FeatureHasher(features=['a', 'b']) return hasher.fit(sf), sf
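# ---------------------------------------------------------------------------
# A minimal, self-contained sketch of the signed hashing trick that the
# FeatureHasher docstring describes. This is NOT GraphLab's implementation:
# the hash function, key construction, and sign rule below are illustrative
# stand-ins, so the bucket indices will not match the toolkit's output.
import hashlib

def _bucket_and_sign(key, num_bits):
    """Map a key to a bucket index in [0, 2**num_bits) and a +/-1.0 sign."""
    digest = int(hashlib.md5(key.encode('utf-8')).hexdigest(), 16)
    index = digest % (2 ** num_bits)
    sign = 1.0 if (digest >> num_bits) & 1 else -1.0
    return index, sign

def hash_row(row, num_bits=18):
    """Collapse a {column_name: numeric_value} row into one sparse dict."""
    hashed = {}
    for column, value in row.items():
        index, sign = _bucket_and_sign(column, num_bits)
        # Values hashed to the same bucket accumulate, as in the docstring.
        hashed[index] = hashed.get(index, 0.0) + sign * value
    return hashed

# Three numeric columns collapse into a single sparse vector representation.
print(hash_row({'a': 1, 'b': 2, 'c': 9}))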
from graphlab.toolkits._internal_utils import _raise_error_evaluation_metric_is_valid from graphlab.toolkits._internal_utils import _raise_error_if_column_exists from graphlab.toolkits._model_workflow import _collect_model_workflow from graphlab.toolkits._tree_model_mixin import TreeModelMixin as _TreeModelMixin _RANDOM_FOREST_MODEL_PARAMS_KEYS = ['num_trees', 'step_size', 'max_depth', 'min_child_weight', 'min_loss_reduction', 'row_subsample'] _RANDOM_FOREST_TRAINING_PARAMS_KEYS = ['objective', 'training_time', 'training_error', 'validation_error', 'evaluation_metric'] _RANDOM_FOREST_TRAINING_DATA_PARAMS_KEYS = ['target', 'features', 'num_features', 'num_examples', 'num_validation_examples'] get_default_options = _get_default_options_wrapper( 'random_forest_regression', 'random_forest_regression', 'RandomForestRegression') class RandomForestRegression(_SupervisedLearningModel, _TreeModelMixin): """ Encapsulates random forest models for regression tasks. The prediction is based on a collection of base learners, `regression trees <http://en.wikipedia.org/wiki/Decision_tree_learning>`_ and combines them through a technique called `random forest <http://en.wikipedia.org/wiki/Random_forest>`_. Different from linear models, e.g. linear regression, the random forests are able to model non-linear interactions between the features and the target using decision trees as the subroutine. It is good for handling numerical features and categorical features with
class BM25(Transformer):
    '''
    Transform an SFrame into BM25 scores for a given query.

    If we have a query with words :math:`q_1, ..., q_n` the BM25 score for a
    document is:

    .. math:: \sum_{i=1}^n IDF(q_i)\\frac{f(q_i) * (k_1+1)}{f(q_i) + k_1 * (1-b+b*|D|/d_{avg})}

    where we use the natural logarithm and

    * :math:`\mbox{IDF}(q_i) = log((N - n(q_i) + .5)/(n(q_i) + .5))` is the
      inverse document frequency of :math:`q_i`
    * :math:`N` is the number of documents (in the training corpus)
    * :math:`n(q_i)` is the number of documents (in the training corpus)
      containing :math:`q_i`
    * :math:`f(q_i)` is the number of times :math:`q_i` occurs in the document
    * :math:`|D|` is the number of words in the document
    * :math:`d_{avg}` is the average number of words per document (in the
      training corpus)
    * :math:`k_1` and :math:`b` are free parameters.

    The transformed output is a column of type float with the BM25 score for
    each document.

    The behavior of BM25 for different input data column types is as follows:

    * **dict** : Each (key, value) pair is treated as a count associated with
      the key for this row. A common example is to have a dict element contain
      a bag-of-words representation of a document, where each key is a word
      and each value is the number of times that word occurs in the document.
      All non-numeric values are ignored.

    * **list** : The list is converted to bag-of-words format, where the keys
      are the unique elements in the list and the values are the counts of
      those unique elements. After this step, the behaviour is identical to
      dict.

    * **string** : Behaves identically to a **dict**, where the dictionary is
      generated by converting the string into a bag-of-words format. For
      example, "I really like really fluffy dogs" would get converted to
      {'I' : 1, 'really': 2, 'like': 1, 'fluffy': 1, 'dogs':1}.

    Parameters
    ----------
    feature : str
        Name of feature column to be transformed.

    query : A list, set, or SArray of type str
        A list, set or SArray where each element is a word.

    k1 : float, optional
        Free parameter which controls the relative importance of term
        frequencies. Recommended values are [1.2, 2.0]. Default is 1.5.

    b : float, optional
        Free parameter which controls how much to downweight scores for long
        documents. Recommended value is 0.75. Default is 0.75.

    max_document_frequency : float, optional
        The maximum ratio of document_frequency to num_documents that is
        encoded. All query terms with a document frequency higher than this
        are discarded. This value must be between 0 and 1.

    min_document_frequency : float, optional
        The minimum ratio of document_frequency to num_documents that is
        encoded. All query terms with a document frequency lower than this
        are discarded. This value must be between 0 and 1.

    output_column_name : str, optional
        The output column name of the transform. If specified, a new column
        with this name is added to the input SFrame. Otherwise, the 'feature'
        column is overwritten.

    Returns
    -------
    out : BM25
        A BM25 object which is initialized with the defined parameters.

    Notes
    -----
    - `None` values are treated as separate categories and are encoded along
      with the rest of the values.

    References
    ----------
    - For more details about BM25, see http://en.wikipedia.org/wiki/Okapi_BM25

    See Also
    --------
    graphlab.toolkits.feature_engineering._tfidf.TFIDF,
    graphlab.toolkits.feature_engineering.create

    Examples
    --------
    ..
sourcecode:: python >>> import graphlab as gl # Create data >>> sf = gl.SFrame( {'docs': [{'this': 1, 'is': 1, 'a': 2, 'sample': 1}, {'this': 1, 'is': 1, 'another': 2, 'example': 3}, {'final': 1, 'doc': 1, 'here': 2}]}) # Create a query set >>> query = ['a','query','example'] # Create a BM25 encoder >>> from graphlab.toolkits.feature_engineering import BM25 >>> encoder = gl.feature_engineering.create(dataset = sf, transformers = BM25('docs')) # Transform the data >>> transformed_sf = encoder.transform(data = sf) Data: +----------------+ | docs | +----------------+ | 0.744711615513 | | 0.789682123696 | | 0.0 | +----------------+ [3 rows x 1 columns] # Save the transformer. >>> encoder.save('save-path') # Return the indices in the encoding. >>> encoder['document_frequencies'] Data: +----------------+---------+--------------------+ | feature_column | term | document_frequency | +----------------+---------+--------------------+ | docs | a | 1 | | docs | example | 1 | +----------------+---------+--------------------+ ''' # Doc strings _fit_examples_doc = _fit_examples_doc _fit_transform_examples_doc = _fit_transform_examples_doc _transform_examples_doc = _transform_examples_doc # Default options get_default_options = staticmethod(_get_default_options_wrapper( '_BM25', 'toolkits.feature_engineering._bm25', 'BM25', True)) def __init__(self, feature, query, k1 = 1.5, b = 0.75, min_document_frequency = 0.0, max_document_frequency=1.0, output_column_name=None): # Convert query to list if necessary if isinstance(query, _gl.SArray): query = list(query) if isinstance(query, set): query = list(query) # Type checking _raise_error_if_not_of_type(feature, [str]) for q in query: _raise_error_if_not_of_type(q, [str]) # query must be list of strings _raise_error_if_not_of_type(k1, [float, int]) _raise_error_if_not_of_type(b, [float, int]) _raise_error_if_not_of_type(min_document_frequency, [float, int]) _raise_error_if_not_of_type(max_document_frequency, [float, int]) _raise_error_if_not_of_type(output_column_name, [str, type(None)]) # Set up options opts = { 'features': [feature], 'query': query, 'k1': k1, 'b': b, 'min_document_frequency': min_document_frequency, 'max_document_frequency': max_document_frequency, 'output_column_name' : output_column_name } # Initialize object proxy = _gl.extensions._BM25() proxy.init_transformer(opts) super(BM25, self).__init__(proxy, self.__class__) def _get_summary_struct(self): _features = _precomputed_field( _internal_utils.pretty_print_list(self.get('features'))) fields = [ ("Features", _features), ("query", 'query'), ("k1", 'k1'), ("b", 'b'), ("Minimimum Document Frequency", 'min_document_frequency'), ("Maximimum Document Frequency", 'max_document_frequency'), ("Output Column Name", 'output_column_name') ] section_titles = ['Model fields'] return ([fields], section_titles) def __repr__(self): (sections, section_titles) = self._get_summary_struct() return _toolkit_repr_print(self, sections, section_titles, 30) @classmethod def _get_instance_and_data(self): sf = _gl.SFrame({'docs': ["this is a test", "this is another test"]}) encoder = _gl.feature_engineering.BM25('docs', ['a', 'test']) encoder = encoder.fit(sf) return encoder, sf
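# ---------------------------------------------------------------------------
# A small, self-contained sketch of the BM25 formula quoted in the docstring
# above (natural-log IDF with +0.5 smoothing, free parameters k1 and b).
# It is not the toolkit's implementation and skips the document-frequency
# pruning options, so the scores will not match the example output exactly.
import math

def bm25_scores(documents, query, k1=1.5, b=0.75):
    """documents: list of {word: count} dicts; query: list of words."""
    N = len(documents)
    d_avg = sum(sum(doc.values()) for doc in documents) / float(N)

    def idf(word):
        n_q = sum(1 for doc in documents if word in doc)
        return math.log((N - n_q + 0.5) / (n_q + 0.5))

    scores = []
    for doc in documents:
        length = sum(doc.values())
        score = 0.0
        for word in query:
            f = doc.get(word, 0)
            denom = f + k1 * (1 - b + b * length / d_avg)
            score += idf(word) * f * (k1 + 1) / denom
        scores.append(score)
    return scores

docs = [{'this': 1, 'is': 1, 'a': 2, 'sample': 1},
        {'this': 1, 'is': 1, 'another': 2, 'example': 3},
        {'final': 1, 'doc': 1, 'here': 2}]
print(bm25_scores(docs, ['a', 'query', 'example']))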
class FeatureBinner(Transformer): ''' Feature binning is a method of turning continuous variables into categorical values. This is accomplished by grouping the values into a pre-defined number of bins. The continuous value then gets replaced by a string describing the bin that contains that value. FeatureBinner supports both 'logarithmic' and 'quantile' binning strategies for either int or float columns. Parameters ---------- features : list[str] , optional Column names of features to be transformed. If None, all columns are selected. excluded_features : list[str] | str | None, optional Column names of features to be ignored in transformation. Can be string or list of strings. Either 'excluded_features' or 'features' can be passed, but not both. strategy : 'logarithmic' | 'quantiles', optional If the strategy is 'logarithmic', bin break points are defined by :math:`10^i` for i in [0,...,num_bins-2]. For instance, if num_bins = 2, the bins become (-Inf, 1], (1, Inf]. If num_bins = 3, the bins become (-Inf, 1], (1, 10], (10, Inf]. If the strategy is 'quantile', the bin breaks are defined by the 'num_bins'-quantiles for that columns data. Quantiles are values that separate the data into roughly equal-sized subsets. num_bins : int, optional The number of bins to group the continuous variables into. output_column_prefix : str, optional The prefix to use for the column name of each transformed column. When provided, the transformation will add columns to the input data, where the new name is "`output_column_prefix`.original_column_name". If `output_column_prefix=None` (default), then the output column name is the same as the original feature column name. Returns ------- out : FeatureBinner A FeatureBinner object which is initialized with the defined parameters. See Also -------- graphlab.toolkits.feature_engineering._feature_binner.FeatureBinner graphlab.toolkits.feature_engineering.create Notes ----- - If the SFrame to be transformed already contains a column with the designated output column name, then that column will be replaced with the new output. In particular, this means that `output_column_prefix=None` will overwrite the original feature columns. Examples -------- .. sourcecode:: python >>> from graphlab.toolkits.feature_engineering import * # Construct a feature binner with default options. >>> sf = graphlab.SFrame({'a': [1,2,3], 'b' : [2,3,4], 'c': [9,10,11]}) >>> binner = graphlab.feature_engineering.create(sf, FeatureBinner(features = ['a', 'b', 'c'], strategy = 'quantile')) # Transform the data using the binner. >>> binned_sf = binner.transform(sf) # Save the transformer. >>> binner.save('save-path') # Return the details about the bins >>> binner['bins'] Columns: column str name str left float right float Rows: 30 Data: +--------+------+---------------------+--------------------+ | column | name | left | right | +--------+------+---------------------+--------------------+ | a | a_0 | -1.79769313486e+308 | 1.0 | | a | a_1 | 1.0 | 1.0 | | a | a_2 | 1.0 | 1.0 | | a | a_3 | 1.0 | 1.0 | | a | a_4 | 1.0 | 2.0 | | a | a_5 | 2.0 | 2.0 | | a | a_6 | 2.0 | 2.0 | | a | a_7 | 2.0 | 3.0 | | a | a_8 | 3.0 | 3.0 | | a | a_9 | 3.0 | 1.79769313486e+308 | +--------+------+---------------------+--------------------+ [30 rows x 4 columns] Note: Only the head of the SFrame is printed. You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns. 
''' _fit_examples_doc = _fit_examples_doc _transform_examples_doc = _transform_examples_doc _fit_transform_examples_doc = _fit_transform_examples_doc get_default_options = staticmethod( _get_default_options_wrapper( '_FeatureBinner', 'toolkits.feature_engineering._feature_binner', 'FeatureBinner', True)) def __init__(self, features=None, excluded_features=None, strategy='logarithmic', num_bins=10, output_column_prefix=None): # Process and make a copy of the features, exclude. _features, _exclude = _internal_utils.process_features( features, excluded_features) # Type checking _raise_error_if_not_of_type(num_bins, [int]) _raise_error_if_not_of_type(strategy, [str]) # Set up options opts = { 'strategy': strategy, 'num_bins': num_bins, 'output_column_prefix': output_column_prefix } if _exclude: opts['exclude'] = True opts['features'] = _exclude else: opts['exclude'] = False opts['features'] = _features # Initialize object proxy = _gl.extensions._FeatureBinner() proxy.init_transformer(opts) super(FeatureBinner, self).__init__(proxy, self.__class__) def _get_summary_struct(self): """ Returns a structured description of the model, including (where relevant) the schema of the training data, description of the training data, training statistics, and model hyperparameters. Returns ------- sections : list (of list of tuples) A list of summary sections. Each section is a list. Each item in a section list is a tuple of the form: ('<label>','<field>') section_titles: list A list of section titles. The order matches that of the 'sections' object. """ _features = _precomputed_field( _internal_utils.pretty_print_list(self.get('features'))) _exclude = _precomputed_field( _internal_utils.pretty_print_list(self.get('excluded_features'))) fields = [("Features", _features), ("Excluded_features", _exclude), ("Strategy for creating bins", 'strategy'), ("Number of bins to use", 'num_bins')] section_titles = ['Model fields'] return ([fields], section_titles) def __repr__(self): (sections, section_titles) = self._get_summary_struct() return _toolkit_repr_print(self, sections, section_titles, width=30) @classmethod def _get_instance_and_data(cls): sf = _gl.SFrame({'a': [1, 2, 3], 'b': [2, 3, 4]}) binner = _gl.feature_engineering.FeatureBinner(features=['a', 'b'], strategy='quantile') return binner.fit(sf), sf
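# ---------------------------------------------------------------------------
# A plain-Python sketch of the 'logarithmic' strategy documented above:
# break points at 10**i for i in [0, ..., num_bins - 2], giving num_bins
# half-open (left, right] intervals. Only the boundaries are illustrated;
# the bin names and the 'quantile' strategy of the real FeatureBinner differ.
def logarithmic_bins(num_bins):
    """Return the bins as a list of (left, right] boundary pairs."""
    breaks = [10.0 ** i for i in range(num_bins - 1)]
    edges = [float('-inf')] + breaks + [float('inf')]
    return list(zip(edges[:-1], edges[1:]))

def assign_bin(value, bins):
    """Return the index of the (left, right] bin containing the value."""
    for i, (left, right) in enumerate(bins):
        if left < value <= right:
            return i

bins = logarithmic_bins(3)   # boundaries (-inf, 1], (1, 10], (10, inf]
print(bins)
print(assign_bin(9, bins))   # 9 falls in (1, 10] -> bin index 1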
class NumericImputer(Transformer): ''' Impute missing values with feature means. Input columns to the NumericImputer must be of type *int*, *float*, *dict*, *list*, or *array.array*. For each column in the input, the transformed output is a column where the input is retained as is if: * there is no missing value. Inputs that do not satisfy the above are set to the mean value of that feature. The behavior for different input data column types is as follows: (see :func:`~graphlab.feature_engineering.NumericImputer.transform` for for examples). * **float** : If there is a missing value, it is replaced with the mean of that column. * **int** : Behaves the same way as *float*. * **list** : Each index of the list is treated as a feature column, and missing values are replaced with per-feature means. This is the same as unpacking, computing the mean, and re-packing. All elements must be of type *float*, *int*, or *None*. See :func:`~graphlab.SFrame.pack_columns` for more information. * **array** : Same behavior as *list* * **dict** : Same behavior as *list*, except keys not present in a particular row are implicitly interpreted as having the value 0. This makes the *dict* type a sparse representation of a vector. Parameters ---------- features : list[str] | str | None, optional Name(s) of feature column(s) to be transformed. If set to None, then all feature columns are used. excluded_features : list[str] | str | None, optional Name(s) of feature columns in the input dataset to be ignored. Either `excluded_features` or `features` can be passed, but not both. strategy: 'auto'|'mean', optional The strategy with which to perform imputation.Currently can be 'auto' or 'mean'. Both currently perform mean imputation. output_column_prefix : str, optional The prefix to use for the column name of each transformed column. When provided, the transformation will add columns to the input data, where the new name is "`output_column_prefix`.original_column_name". If `output_column_prefix=None` (default), then the output column name is the same as the original feature column name. Returns ------- out : NumericImputer A NumericImputer object which is initialized with the defined parameters. See Also -------- graphlab.toolkits.feature_engineering._numeric_imputer.NumericImputer graphlab.toolkits.feature_engineering.create Notes ----- - If the SFrame to be transformed already contains a column with the designated output column name, then that column will be replaced with the new output. In particular, this means that `output_column_prefix=None` will overwrite the original feature columns. Examples -------- .. sourcecode:: python # Create data. >>> sf = graphlab.SFrame({'a': [1,3], 'b' : [2,4]}) # Create a transformer. >>> from graphlab.toolkits.feature_engineering import NumericImputer >>> imputer = graphlab.feature_engineering.create(sf, NumericImputer(features = ['a', 'b'], strategy = 'mean')) # Transform the data. >>> new_sf = graphlab.SFrame({'a': [1,None,3], 'b' : [2, None,4]}) >>> transformed_sf = imputer.transform(new_sf) # Save the transformer. >>> imputer.save('save-path') # Return the means. 
>>> imputer['means'] Columns: a float b float Rows: 1 Data: +-----+-----+ | a | b | +-----+-----+ | 2.0 | 3.0 | +-----+-----+ [1 rows x 2 columns] ''' # Doc strings _fit_examples_doc = _fit_examples_doc _transform_examples_doc = _transform_examples_doc _fit_transform_examples_doc = _fit_transform_examples_doc # Default options get_default_options = staticmethod( _get_default_options_wrapper( '_MeanImputer', 'toolkits.feature_engineering._mean_imputer', 'MeanImputer', True)) def __init__(self, features=None, excluded_features=None, strategy='auto', output_column_prefix=None): # Process and make a copy of the features, exclude. _features, _exclude = _internal_utils.process_features( features, excluded_features) # Type checking _raise_error_if_not_of_type(strategy, [str]) # Set up options opts = { 'strategy': strategy, 'output_column_prefix': output_column_prefix } if _exclude: opts['exclude'] = True opts['features'] = _exclude else: opts['exclude'] = False opts['features'] = _features # Initialize object proxy = _gl.extensions._MeanImputer() proxy.init_transformer(opts) super(NumericImputer, self).__init__(proxy, self.__class__) def _get_summary_struct(self): _features = _precomputed_field( _internal_utils.pretty_print_list(self.get('features'))) _exclude = _precomputed_field( _internal_utils.pretty_print_list(self.get('excluded_features'))) fields = [ ("Features", _features), ("Excluded features", _exclude), ] section_titles = ['Model fields'] return ([fields], section_titles) def __repr__(self): (sections, section_titles) = self._get_summary_struct() return _toolkit_repr_print(self, sections, section_titles, 30) @classmethod def _get_instance_and_data(cls): sf = _gl.SFrame({'a': [1, 2, 3], 'b': [2, 3, 4]}) imputer = _gl.feature_engineering.NumericImputer(features=['a', 'b'], strategy='mean') return imputer.fit(sf), sf
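# ---------------------------------------------------------------------------
# A plain-Python sketch of the per-feature mean imputation described above
# for a list-typed column: each index is treated as its own feature, means
# are computed over the non-missing entries, and None values are replaced
# by the corresponding mean. Illustration only, not the toolkit's code.
def impute_list_column(rows):
    """rows: list of equal-length lists that may contain None entries."""
    width = len(rows[0])
    means = []
    for j in range(width):
        observed = [row[j] for row in rows if row[j] is not None]
        means.append(sum(observed) / float(len(observed)))
    return [[row[j] if row[j] is not None else means[j]
             for j in range(width)]
            for row in rows]

# Mirrors the example above: the column means are a = 2.0 and b = 3.0.
print(impute_list_column([[1.0, 2.0], [None, None], [3.0, 4.0]]))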
possible_args = set(get_default_options()["name"]) except (RuntimeError, KeyError): possible_args = set() bad_arguments = set(kwargs.keys()).difference(possible_args) if bad_arguments: raise TypeError("Bad Keyword Arguments: " + ', '.join(bad_arguments)) opts.update(kwargs) response = _graphlab.toolkits._main.run('recsys_train', opts, verbose) return FactorizationRecommender(response['model']) get_default_options = _get_default_options_wrapper( 'factorization_recommender', 'recommender.factorization_recommender', 'FactorizationRecommender') class FactorizationRecommender(_Recommender): r""" A FactorizationRecommender learns latent factors for each user and item and uses them to make rating predictions. FactorizationRecommender [Koren_et_al]_ contains a number of options that tailor to a variety of datasets and evaluation metrics, making this one of the most powerful model in the GraphLab Create recommender toolkit. **Side information** Side features may be provided via the `user_data` and `item_data` options when the model is created.
class CategoricalImputer(Transformer): ''' The purpose of this imputer is to fill missing values (None) in data sets that have categorical data. For instance, if the data set has a "feature" column where some rows have values, and some rows have None, this imputer will fill the Nones with values. It will also return a probability associated with the imputed value. This is accomplished by grouping the data based on provided reference_features (unsupervised clustering) then by assigning reference_features to the clusters following a graph walk among the resulting clusters. Parameters ---------- reference_features : list[str] , optional Column names of reference_features to be used for clustering. If None, all columns are selected. feature : 'feature', optional Name of the column to impute. This column should contain some categorical values, as well as rows with None. Those rows will be imputed. Returns ------- out : CategoricalImputer A CategoricalImputer object which is initialized with the defined parameters. See Also -------- graphlab.toolkits.feature_engineering._categorical_imputer.CategoricalImputer graphlab.toolkits.feature_engineering.create Examples -------- .. sourcecode:: python from graphlab.toolkits.feature_engineering import * # Impute the column "feature" using information from columns ['a', 'b'] >>> sf = graphlab.SFrame({'a' : [0,1,1], 'b' : [1,0,0], 'label' : [1,2,None]}) >>> imputer = graphlab.feature_engineering.CategoricalImputer( feature = 'label', reference_features = ['a', 'b']) >>> imputer.fit(sf) # Print the input data. >>> sf Columns: a int b int label int Rows: 3 Data: +---+---+-------+ | a | b | label | +---+---+-------+ | 0 | 1 | 1 | | 1 | 0 | 2 | | 1 | 0 | None | +---+---+-------+ [3 rows x 3 columns] # Transform the data using the imputer. >>> imputed_sf = imputer.transform(sf) # Retrieve the imputed data. >>> imputed_sf Columns: a int b int label int predicted_feature_label int feature_probability_label float Rows: 3 Data: +---+---+---------+-------------------------+---------------------------+ | a | b | feature | predicted_feature_label | feature_probability_label | +---+---+---------+-------------------------+---------------------------+ | 0 | 1 | 1 | 1 | 1.0 | | 1 | 0 | 2 | 2 | 1.0 | | 1 | 0 | None | 2 | 1.0 | +---+---+---------+-------------------------+---------------------------+ [3 rows x 5 columns] # Save the transformer. >>> imputer.save('save-path') # Bin only a single column 'a'. 
>>> imputer = graphlab.feature_engineering.create(sf, graphlab.feature_engineering.CategoricalImputer( reference_features = ['a'], feature='label')) ''' _fit_examples_doc = _fit_examples_doc _transform_examples_doc = _transform_examples_doc _fit_transform_examples_doc = _fit_transform_examples_doc get_default_options = staticmethod( _get_default_options_wrapper( '_CategoricalImputer', 'toolkits.feature_engineering._categorical_imputer', 'CategoricalImputer', True)) def __init__(self, reference_features=None, feature="feature", verbose=False): # Process and make a copy of the reference_features _reference_features, _exclude = _internal_utils.process_features( reference_features, None) # Type checking _raise_error_if_not_of_type(feature, [str]) # Set up options opts = { 'reference_features': reference_features, 'feature': feature, 'verbose': verbose } opts['reference_features'] = _reference_features # Initialize object proxy = _gl.extensions._CategoricalImputer() proxy.init_transformer(opts) super(CategoricalImputer, self).__init__(proxy, self.__class__) def _get_summary_struct(self): """ Returns a structured description of the model, including (where relevant) the schema of the training data, description of the training data, training statistics, and model hyperparameters. Returns ------- sections : list (of list of tuples) A list of summary sections. Each section is a list. Each item in a section list is a tuple of the form: ('<feature>','<field>') section_titles: list A list of section titles. The order matches that of the 'sections' object. """ _reference_features = _precomputed_field( _internal_utils.pretty_print_list(self.get('reference_features'))) fields = [("reference_features", _reference_features), ("Column to impute", 'feature')] section_titles = ['Model fields'] return ([fields], section_titles) def __repr__(self): (sections, section_titles) = self._get_summary_struct() return _toolkit_repr_print(self, sections, section_titles, width=30) @classmethod def _get_instance_and_data(cls): sf = _gl.SFrame({ 'a': [1, 1, 1], 'b': [1, 0, 1], 'feature': [1, 2, None] }) imputer = _gl.feature_engineering.CategoricalImputer( feature='feature', reference_features=['a', 'b']) return imputer.fit(sf), sf
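# ---------------------------------------------------------------------------
# The real CategoricalImputer clusters rows on the reference_features and
# assigns labels via a graph walk over the resulting clusters. The sketch
# below is only a loose, plain-Python approximation of that idea (NOT the
# actual algorithm): it groups rows by their reference_features and fills a
# missing label with the most frequent label seen in the same group.
from collections import Counter, defaultdict

def impute_by_group(rows, reference_features, feature):
    """rows: list of dicts; returns rows with a 'predicted_' + feature key."""
    groups = defaultdict(Counter)
    for row in rows:
        key = tuple(row[f] for f in reference_features)
        if row[feature] is not None:
            groups[key][row[feature]] += 1
    imputed = []
    for row in rows:
        key = tuple(row[f] for f in reference_features)
        label = row[feature]
        if label is None and groups[key]:
            label = groups[key].most_common(1)[0][0]
        new_row = dict(row)
        new_row['predicted_' + feature] = label
        imputed.append(new_row)
    return imputed

# Mirrors the docstring example: the missing label in the (a=1, b=0) group
# is filled with 2, the label observed for the other row of that group.
rows = [{'a': 0, 'b': 1, 'label': 1},
        {'a': 1, 'b': 0, 'label': 2},
        {'a': 1, 'b': 0, 'label': None}]
for r in impute_by_group(rows, ['a', 'b'], 'label'):
    print(r)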
from graphlab.toolkits._internal_utils import _map_unity_proxy_to_object from graphlab.toolkits._tree_model_mixin import TreeModelMixin as _TreeModelMixin _DECISION_TREE_MODEL_PARAMS_KEYS = [ 'max_depth', 'min_child_weight', 'min_loss_reduction' ] _DECISION_TREE_TRAINING_PARAMS_KEYS = [ 'objective', 'training_time', 'training_error', 'validation_error', 'evaluation_metric' ] _DECISION_TREE_TRAINING_DATA_PARAMS_KEYS = [ 'target', 'features', 'num_features', 'num_examples', 'num_validation_examples' ] get_default_options = _get_default_options_wrapper('decision_tree_classifier', 'decision_tree_classifier', 'DecisionTreeClassifier') __doc_string_context = ''' >>> url = 'https://static.turi.com/datasets/xgboost/mushroom.csv' >>> data = graphlab.SFrame.read_csv(url) >>> train, test = data.random_split(0.8) >>> model = graphlab.decision_tree_classifier.create(train, target='label') ''' class DecisionTreeClassifier(_Classifier, _TreeModelMixin): """ Special case of gradient boosted trees with the number of trees set to 1.
_SupervisedLearningModel from graphlab.toolkits._internal_utils import _toolkit_repr_print, \ _toolkit_get_topk_bottomk, \ _summarize_coefficients, \ _raise_error_evaluation_metric_is_valid from graphlab.toolkits._model_workflow import _collect_model_workflow from graphlab.toolkits._model import _get_default_options_wrapper _DEFAULT_SOLVER_OPTIONS = { 'convergence_threshold': 1e-2, 'step_size': 1.0, 'lbfgs_memory_level': 11, 'max_iterations': 10 } get_default_options = _get_default_options_wrapper( 'regression_linear_regression', 'linear_regression', 'LinearRegression') def create( dataset, target, features=None, l2_penalty=1e-2, l1_penalty=0.0, solver='auto', feature_rescaling=True, convergence_threshold=_DEFAULT_SOLVER_OPTIONS['convergence_threshold'], step_size=_DEFAULT_SOLVER_OPTIONS['step_size'], lbfgs_memory_level=_DEFAULT_SOLVER_OPTIONS['lbfgs_memory_level'], max_iterations=_DEFAULT_SOLVER_OPTIONS['max_iterations'], validation_set="auto",
possible_args = set() bad_arguments = set(kwargs.keys()).difference(possible_args) if bad_arguments: raise TypeError("Bad Keyword Arguments: " + ', '.join(bad_arguments)) opts.update(kwargs) response = _graphlab.toolkits._main.run('recsys_train', opts, verbose) return RankingFactorizationRecommender(response['model']) get_default_options = _get_default_options_wrapper( 'ranking_factorization_recommender', 'recommender.RankingFactorizationRecommender', 'RankingFactorizationRecommender') class RankingFactorizationRecommender(_Recommender): r""" A RankingFactorizationRecommender learns latent factors for each user and item and uses them to rank recommended items according to the likelihood of observing those (user, item) pairs. This is commonly desired when performing collaborative filtering for implicit feedback datasets or datasets with explicit ratings for which ranking prediction is desired. RankingFactorizationRecommender contains a number of options that tailor to a variety of datasets and evaluation metrics, making this one of the most powerful models in the GraphLab Create
_toolkit_get_topk_bottomk, \ _raise_error_if_not_sframe, \ _check_categorical_option_type, \ _map_unity_proxy_to_object, \ _raise_error_evaluation_metric_is_valid, \ _summarize_coefficients from graphlab.toolkits._model_workflow import _collect_model_workflow _DEFAULT_SOLVER_OPTIONS = { 'convergence_threshold': 1e-2, 'step_size': 1.0, 'lbfgs_memory_level': 11, 'max_iterations': 10} get_default_options = _get_default_options_wrapper( 'classifier_logistic_regression', 'logistic_classifier', 'LogisticClassifier') def create(dataset, target, features=None, l2_penalty=0.01, l1_penalty=0.0, solver='auto', feature_rescaling=True, convergence_threshold = _DEFAULT_SOLVER_OPTIONS['convergence_threshold'], step_size = _DEFAULT_SOLVER_OPTIONS['step_size'], lbfgs_memory_level = _DEFAULT_SOLVER_OPTIONS['lbfgs_memory_level'], max_iterations = _DEFAULT_SOLVER_OPTIONS['max_iterations'], class_weights = None, validation_set = 'auto', verbose=True): """ Create a :class:`~graphlab.logistic_classifier.LogisticClassifier` (using logistic regression as a classifier) to predict the class of a discrete
_BOOSTED_TREE_TRAINING_DATA_PARAMS_KEYS = ['target', 'features', 'num_features', 'num_examples', 'num_validation_examples'] DEFAULT_HYPER_PARAMETER_RANGE = { 'max_depth': [6, 8, 10], 'step_size': 0.3, 'min_loss_reduction': [0, 1, 10], 'min_child_weight': 0.1, 'row_subsample': 1, 'column_subsample': 1, 'max_iterations': [10, 50, 100] } get_default_options = _get_default_options_wrapper( 'boosted_trees_regression', 'boosted_trees_regression', 'BoostedTreesRegression') class BoostedTreesRegression(_SupervisedLearningModel): """ Encapsulates gradient boosted trees for regression tasks. The prediction is based on a collection of base learners, `regression trees <http://en.wikipedia.org/wiki/Decision_tree_learning>`_. Different from linear models, e.g. linear regression, the gradient boost trees model is able to model non-linear interactions between the features and the target using decision trees as the subroutine. It is good for handling numerical features and categorical features with
'user_id': user_id, 'item_id': item_id, 'target': target, 'user_data': user_data, 'item_data': item_data, 'nearest_items': _graphlab.SFrame(), 'model': model_proxy, 'random_seed': 1 } response = _graphlab.toolkits._main.run('recsys_train', opts, verbose) return PopularityRecommender(response['model']) get_default_options = _get_default_options_wrapper( 'popularity', 'recommender.popularity_recommender', 'PopularityRecommender') class PopularityRecommender(_Recommender): """ The Popularity Model ranks an item according to its overall popularity. When making recommendations, the items are scored by the number of times it is seen in the training set. The item scores are the same for all users. Hence the recommendations are not tailored for individuals. The Popularity Recommender is simple and fast and provides a reasonable baseline. It can work well when observation data is sparse. It can be used as a "background" model for new users.
class TFIDF(Transformer):
    '''
    Transform an SFrame into TF-IDF scores.

    The prototypical application of TF-IDF transformations involves document
    collections, where each element represents a document in bag-of-words
    format, i.e. a dictionary whose keys are words and whose values are the
    number of times the word occurs in the document. For more details, check
    the reference section for further reading.

    The TF-IDF transformation performs the following computation

    .. math:: \mbox{TF-IDF}(w, d) = tf(w, d) * log(N / f(w))

    where :math:`tf(w, d)` is the number of times word :math:`w` appeared in
    document :math:`d`, :math:`f(w)` is the number of documents word
    :math:`w` appeared in, :math:`N` is the number of documents, and we use
    the natural logarithm.

    The transformed output is a column of type dictionary where the keys are
    the words and the values are the corresponding TF-IDF scores.

    The behavior of TF-IDF for each supported input data column type is as
    follows. (see :func:`~graphlab.feature_engineering.TFIDF.transform` for
    examples of the same).

    * **dict** : Each (key, value) pair is treated as a count associated with
      the key for this row. A common example is to have a dict element contain
      a bag-of-words representation of a document, where each key is a word
      and each value is the number of times that word occurs in the document.
      All non-numeric values are ignored.

    * **list** : The list is converted to bag-of-words format, where the keys
      are the unique elements in the list and the values are the counts of
      those unique elements. After this step, the behaviour is identical to
      dict.

    * **string** : Behaves identically to a **dict**, where the dictionary is
      generated by converting the string into a bag-of-words format. For
      example, "I really like really fluffy dogs" would get converted to
      {'I' : 1, 'really': 2, 'like': 1, 'fluffy': 1, 'dogs':1}.

    Parameters
    ----------
    features : str
        Name of feature column to be transformed.

    max_document_frequency : float, optional
        The maximum ratio of document_frequency to num_documents that is
        encoded. All terms with a document frequency higher than this are
        discarded. This value must be between 0 and 1.

    min_document_frequency : float, optional
        The minimum ratio of document_frequency to num_documents that is
        encoded. All terms with a document frequency lower than this are
        discarded. This value must be between 0 and 1.

    output_column_prefix : str, optional
        The prefix to use for the column name of each transformed column.
        When provided, the transformation will add columns to the input data,
        where the new name is "`output_column_prefix`.original_column_name".
        If `output_column_prefix=None` (default), then the output column name
        is the same as the original feature column name.

    Returns
    -------
    out : TFIDF
        A TFIDF object which is initialized with the defined parameters.

    Notes
    -----
    - `None` values are treated as separate categories and are encoded along
      with the rest of the values.

    - If the SFrame to be transformed already contains a column with the
      designated output column name, then that column will be replaced with
      the new output. In particular, this means that
      `output_column_prefix=None` will overwrite the original feature columns.

    References
    ----------
    For more details about tf-idf, see
    http://en.wikipedia.org/wiki/Tf%E2%80%93idf

    See Also
    --------
    graphlab.toolkits.feature_engineering._tfidf.TFIDF,
    graphlab.toolkits.feature_engineering.create

    Examples
    --------
    ..
sourcecode:: python >>> import graphlab as gl # Create the data >>> sf = gl.SFrame( {'docs': [{'this': 1, 'is': 1, 'a': 2, 'sample': 1}, {'this': 1, 'is': 1, 'another': 2, 'example': 3}]}) # Create a TFIDF encoder object. >>> encoder = gl.feature_engineering.TFIDF('docs') # Fit the encoder for a given dataset. >>> encoder = encoder.fit(sf) >>> result = transformed_sf = encoder.transform(sf) >>> result.print_rows(max_column_width=60) +-------------------------------------------------------------+ | docs | +-------------------------------------------------------------+ | {'this': 0.0, 'a': 1.3862943611198906, 'is': 0.0, 'sampl... | | {'this': 0.0, 'is': 0.0, 'example': 2.0794415416798357, ... | +-------------------------------------------------------------+ ''' # Doc strings _fit_examples_doc = _fit_examples_doc _fit_transform_examples_doc = _fit_transform_examples_doc _transform_examples_doc = _transform_examples_doc # Default options get_default_options = staticmethod(_get_default_options_wrapper( '_TFIDF', 'toolkits.feature_engineering._tfidf', 'TFIDF', True)) def __init__(self, features=None, excluded_features=None, min_document_frequency=0.0, max_document_frequency=1.0, output_column_prefix=None): # Process and make a copy of the features, exclude. _features, _exclude = _internal_utils.process_features(features, excluded_features) # Type checking _raise_error_if_not_of_type(min_document_frequency, [float, int]) _raise_error_if_not_of_type(max_document_frequency, [float, int]) _raise_error_if_not_of_type(output_column_prefix, [str, type(None)]) # Set up options opts = { 'min_document_frequency': min_document_frequency, 'max_document_frequency': max_document_frequency, 'output_column_prefix' : output_column_prefix } if _exclude: opts['exclude'] = True opts['features'] = _exclude else: opts['exclude'] = False opts['features'] = _features # Initialize object proxy = _gl.extensions._TFIDF() proxy.init_transformer(opts) super(TFIDF, self).__init__(proxy, self.__class__) def _get_summary_struct(self): _features = _precomputed_field( _internal_utils.pretty_print_list(self.get('features'))) fields = [ ("Features", _features), ("Minimimum Document Frequency", 'min_document_frequency'), ("Maximimum Document Frequency", 'max_document_frequency'), ("Output Column Prefix", 'output_column_prefix') ] section_titles = ['Model fields'] return ([fields], section_titles) def __repr__(self): (sections, section_titles) = self._get_summary_struct() return _toolkit_repr_print(self, sections, section_titles, 30) @classmethod def _get_instance_and_data(self): sf = _gl.SFrame( {'docs': [{'this': 1, 'is': 1, 'a': 2, 'sample': 1}, {'this': 1, 'is': 1, 'another': 2, 'example': 3}]}) encoder = _gl.feature_engineering.TFIDF(features=['docs']) encoder = encoder.fit(sf) return encoder, sf
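# ---------------------------------------------------------------------------
# A plain-Python sketch of the formula quoted in the TFIDF docstring:
# TF-IDF(w, d) = tf(w, d) * ln(N / f(w)). With the two example documents
# above it reproduces the example scores (e.g. 'a' -> 2 * ln(2) = 1.386...),
# though the document-frequency pruning options are skipped here.
import math

def tfidf(documents):
    """documents: list of {word: count} dicts -> list of {word: score}."""
    N = len(documents)
    doc_freq = {}
    for doc in documents:
        for word in doc:
            doc_freq[word] = doc_freq.get(word, 0) + 1
    return [{word: count * math.log(float(N) / doc_freq[word])
             for word, count in doc.items()}
            for doc in documents]

docs = [{'this': 1, 'is': 1, 'a': 2, 'sample': 1},
        {'this': 1, 'is': 1, 'another': 2, 'example': 3}]
print(tfidf(docs))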
class WordCounter(Transformer): ''' __init__(features=None, excluded_features=None, to_lower=True, delimiters=["\\\\r", "\\\\v", "\\\\n", "\\\\f", "\\\\t", " "], output_column_prefix=None) Transform string/dict/list columns of an SFrame into their respective bag-of-words representation. Bag-of-words is a common text representation. An input text string is first tokenized. Each token is understood to be a word. The output is a dictionary of the count of the number of times each unique word appears in the text string. This dictionary is a sparse representation because most of the words in the vocabulary do not appear in every single sentence, hence their count is zero, which are not explicitly included in the dictionary. WordCounter can be applied to all the string-, dictionary-, and list-typed columns in a given SFrame. Its behavior for each supported input column type is as follows. (See :func:`~graphlab.feature_engineering.WordCounter.transform` for usage examples). * **string** : The string is first tokenized. By default, all letters are first converted to lower case, then tokenized by space characters. The user can specify a custom delimiter list, or use Penn tree-bank style tokenization (see input parameter description for details). Each token is taken to be a word, and a dictionary is generated where each key is a unique word that appears in the input text string, and the value is the number of times the word appears. For example, "I really like Really fluffy dogs" would get converted to {'i' : 1, 'really': 2, 'like': 1, 'fluffy': 1, 'dogs':1}. * **list** : Each element of the list must be a string, which is tokenized according to the input method and tokenization settings, followed by counting. The behavior is analogous to that of dict-type input, where the count of each list element is taken to be 1. For example, under default settings, an input list of ['alice bob Bob', 'Alice bob'] generates an output bag-of-words dictionary of {'alice': 2, 'bob': 3}. * **dict** : The method first obtains the list of keys in the dictionary. This list is processed as described above. Parameters ---------- features : list[str] | str | None, optional Name(s) of feature column(s) to be transformed. If set to None, then all feature columns are used. excluded_features : list[str] | str | None, optional Name(s) of feature columns in the input dataset to be ignored. Either `excluded_features` or `features` can be passed, but not both. to_lower : bool, optional Indicates whether to map the input strings to lower case before counting. delimiters: list[string], optional A list of delimiter characters for tokenization. By default, the list is defined to be the list of space characters. The user can define any custom list of single-character delimiters. Alternatively, setting `delimiters=None` will use a Penn treebank type tokenization, which is better at handling punctuations. (See reference below for details.) output_column_prefix : str, optional The prefix to use for the column name of each transformed column. When provided, the transformation will add columns to the input data, where the new name is "`output_column_prefix`.original_column_name". If `output_column_prefix=None` (default), then the output column name is the same as the original feature column name. Returns ------- out : WordCounter A WordCounter feature engineering object which is initialized with the defined parameters. 
Notes ----- If the SFrame to be transformed already contains a column with the designated output column name, then that column will be replaced with the new output. In particular, this means that `output_column_prefix=None` will overwrite the original feature columns. References ---------- - `Penn treebank tokenization <https://www.cis.upenn.edu/~treebank/tokenization.html>`_ See Also -------- graphlab.toolkits.text_analytics.count_words, graphlab.toolkits.feature_engineering._ngram_counter.NGramCounter, graphlab.toolkits.feature_engineering._tfidf.TFIDF, graphlab.toolkits.feature_engineering._tokenizer.Tokenizer, graphlab.toolkits.feature_engineering.create Examples -------- .. sourcecode:: python >>> import graphlab as gl # Create data. >>> sf = gl.SFrame({ ... 'string': ['sentences Sentences', 'another sentence'], ... 'dict': [{'bob': 1, 'Bob': 0.5}, {'a': 0, 'cat': 5}], ... 'list': [['one', 'two', 'three'], ['a', 'cat']]}) # Create a WordCounter transformer. >>> from graphlab.toolkits.feature_engineering import WordCounter >>> encoder = WordCounter() # Fit and transform the data. >>> transformed_sf = encoder.fit_transform(sf) Columns: dict dict list dict string dict Rows: 2 Data: +------------------------+----------------------------------+ | dict | list | +------------------------+----------------------------------+ | {'bob': 1.5} | {'one': 1, 'three': 1, 'two': 1} | | {'a': 0, 'cat': 5} | {'a': 1, 'cat': 1} | +------------------------+----------------------------------+ +-------------------------------+ | string | +-------------------------------+ | {'sentences': 2} | | {'another': 1, 'sentence': 1} | +-------------------------------+ [2 rows x 3 columns] # Penn treebank-style tokenization (recommended for smarter handling # of punctuations) >>> sf = gl.SFrame({'string': ['sentence $$one', 'sentence two...']}) >>> WordCounter(delimiters=None).fit_transform(sf) Columns: string dict Rows: 2 Data: +-----------------------------------+ | string | +-----------------------------------+ | {'sentence': 1, '$': 2, 'one': 1} | | {'sentence': 1, 'two': 1, '.': 3} | +-----------------------------------+ [2 rows x 1 columns] # Save the transformer. >>> encoder.save('save-path') ''' # Doc strings _fit_examples_doc = _fit_examples_doc _fit_transform_examples_doc = _fit_transform_examples_doc _transform_examples_doc = _transform_examples_doc # Default options get_default_options = staticmethod( _get_default_options_wrapper( '_WordCounter', 'toolkits.feature_engineering._word_counter', 'WordCounter', True)) def __init__(self, features=None, excluded_features=None, to_lower=True, delimiters=["\r", "\v", "\n", "\f", "\t", " "], output_column_prefix=None): # Process and make a copy of the features, exclude. 
_features, _exclude = _internal_utils.process_features( features, excluded_features) # Type checking _raise_error_if_not_of_type(features, [list, str, _NoneType]) _raise_error_if_not_of_type(excluded_features, [list, str, _NoneType]) _raise_error_if_not_of_type(to_lower, [bool]) _raise_error_if_not_of_type(delimiters, [list, _NoneType]) _raise_error_if_not_of_type(output_column_prefix, [str, _NoneType]) if delimiters != None: for delim in delimiters: _raise_error_if_not_of_type(delim, str, "delimiters") if (len(delim) != 1): raise ValueError( "Delimiters must be single-character strings") # Set up options opts = { 'features': features, 'to_lower': to_lower, 'delimiters': delimiters, 'output_column_prefix': output_column_prefix } if _exclude: opts['exclude'] = True opts['features'] = _exclude else: opts['exclude'] = False opts['features'] = _features # Initialize object proxy = _gl.extensions._WordCounter() proxy.init_transformer(opts) super(WordCounter, self).__init__(proxy, self.__class__) def _get_summary_struct(self): _features = _precomputed_field( _internal_utils.pretty_print_list(self.get('features'))) fields = [("Features", _features), ("Convert strings to lower case", 'to_lower'), ("Delimiters", "delimiters"), ("Output column prefix", 'output_column_prefix')] section_titles = ['Model fields'] return ([fields], section_titles) def __repr__(self): (sections, section_titles) = self._get_summary_struct() return _toolkit_repr_print(self, sections, section_titles, 30) @classmethod def _get_instance_and_data(self): sf = _gl.SFrame({ 'docs': [{ 'this': 1, 'is': 1, 'a': 2, 'sample': 1 }, { 'this': 1, 'is': 1, 'another': 2, 'example': 3 }] }) encoder = _gl.feature_engineering.WordCounter('docs') encoder = encoder.fit(sf) return encoder, sf
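# ---------------------------------------------------------------------------
# A plain-Python sketch of the bag-of-words behaviour documented above for
# string input: lower-case when to_lower is True, split on the default
# space-like delimiters, and count tokens. The Penn treebank tokenizer and
# the list/dict input paths of the real WordCounter are not reproduced here.
import re
from collections import Counter

DEFAULT_DELIMITERS = ["\r", "\v", "\n", "\f", "\t", " "]

def count_words(text, to_lower=True, delimiters=DEFAULT_DELIMITERS):
    if to_lower:
        text = text.lower()
    pattern = '[' + re.escape(''.join(delimiters)) + ']+'
    tokens = [tok for tok in re.split(pattern, text) if tok]
    return dict(Counter(tokens))

print(count_words("I really like Really fluffy dogs"))
# -> {'i': 1, 'really': 2, 'like': 1, 'fluffy': 1, 'dogs': 1}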
_DEFAULT_SOLVER_OPTIONS = { 'convergence_threshold': 1e-2, 'step_size': 1.0, 'lbfgs_memory_level': 11, 'mini_batch_size': 1, 'auto_tuning': True, 'max_iterations': 10} DEFAULT_HYPER_PARAMETER_RANGE = { 'l1_penalty' : [0.0, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0], 'l2_penalty' : [0.0, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0] } get_default_options = _get_default_options_wrapper( 'regression_linear_regression', 'linear_regression', 'LinearRegression') def create(dataset, target, features=None, l2_penalty=1e-2, l1_penalty=0.0, solver='auto', feature_rescaling=True, convergence_threshold = _DEFAULT_SOLVER_OPTIONS['convergence_threshold'], step_size = _DEFAULT_SOLVER_OPTIONS['step_size'], lbfgs_memory_level = _DEFAULT_SOLVER_OPTIONS['lbfgs_memory_level'], mini_batch_size = _DEFAULT_SOLVER_OPTIONS['mini_batch_size'], max_iterations = _DEFAULT_SOLVER_OPTIONS['max_iterations'], auto_tuning = _DEFAULT_SOLVER_OPTIONS['auto_tuning'], verbose=True): """ Create a :class:`~graphlab.linear_regression.LinearRegression` to predict a scalar target variable as a linear function of one or more features. In addition to standard numeric and categorical types, features
from graphlab.toolkits._supervised_learning import Classifier as _Classifier import graphlab.toolkits._supervised_learning as _sl from graphlab.toolkits._model_workflow import _collect_model_workflow from graphlab.toolkits._internal_utils import _toolkit_repr_print from graphlab.toolkits._supervised_learning import _show_model_tree from graphlab.toolkits._internal_utils import _raise_error_evaluation_metric_is_valid from graphlab.toolkits._internal_utils import _raise_error_if_not_sframe from graphlab.toolkits._internal_utils import _raise_error_if_column_exists from graphlab.toolkits._internal_utils import _check_categorical_option_type from graphlab.toolkits._internal_utils import _map_unity_proxy_to_object from graphlab.toolkits._tree_model_mixin import TreeModelMixin as _TreeModelMixin from graphlab.util import _make_internal_url import logging as _logging get_default_options = _get_default_options_wrapper('random_forest_classifier', 'random_forest_classifier', 'RandomForestClassifier') __doc_string_context = ''' >>> url = 'https://static.turi.com/datasets/xgboost/mushroom.csv' >>> data = graphlab.SFrame.read_csv(url) >>> train, test = data.random_split(0.8) >>> model = graphlab.random_forest_classifier.create(train, target='label') ''' class RandomForestClassifier(_Classifier, _TreeModelMixin): """ The random forest model can be used as a classifier for predictive tasks.
>>> import graphlab as gl >>> import datetime # Load a data set. >>> sf = gl.SFrame( ... 'https://static.turi.com/datasets/churn-prediction/online_retail.csv') # Convert InvoiceDate from string to datetime. >>> import dateutil >>> from dateutil import parser >>> sf['InvoiceDate'] = sf['InvoiceDate'].apply(parser.parse) # Convert SFrame into TimeSeries. >>> time_series = gl.TimeSeries(sf, 'InvoiceDate') # Create a train-test split. >>> train, valid = gl.churn_predictor.random_split(time_series, ... user_id='CustomerID', fraction=0.9) # Train a churn prediction model. >>> model = gl.churn_predictor.create(train, user_id='CustomerID', ... features = ['Quantity']) """ from ._churn_predictor import create from ._churn_predictor import ChurnPredictor from ._churn_predictor import random_split from graphlab.toolkits._model import _get_default_options_wrapper get_default_options = _get_default_options_wrapper( '_ChurnPredictor', 'churn_predictor', 'ChurnPredictor', True)
'item_data': item_data, 'nearest_items': nearest_items, 'model': model_proxy, 'random_seed': 1, 'similarity_type': similarity_type, 'training_method': training_method, 'threshold': threshold, 'only_top_k': only_top_k} response = _graphlab.toolkits._main.run('recsys_train', opts, verbose) return ItemSimilarityRecommender(response['model']) get_default_options = _get_default_options_wrapper( 'item_similarity', 'recommender.item_similarity', 'ItemSimilarityRecommender') class ItemSimilarityRecommender(_Recommender): """ A model that ranks an item according to its similarity to other items observed for the user in question. **Creating an ItemSimilarityRecommender** This model cannot be constructed directly. Instead, use :func:`graphlab.recommender.item_similarity_recommender.create` to create an instance of this model. A detailed list of parameter options and code samples are available in the documentation for the create function.
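# A minimal end-to-end usage sketch for the create() function referenced above.
# The SFrame and its column names ('user_id', 'item_id', 'rating') are made up
# for illustration; any observation data with user and item columns works.
import graphlab

sf = graphlab.SFrame({'user_id': ['a', 'a', 'b', 'c'],
                      'item_id': ['x', 'y', 'x', 'z'],
                      'rating': [1, 3, 2, 5]})
m = graphlab.recommender.item_similarity_recommender.create(
    sf, user_id='user_id', item_id='item_id', target='rating')
recs = m.recommend(users=['a'], k=2)    # top-2 unobserved items for user 'a'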
class CountFeaturizer(Transformer):
    '''
    Replaces a collection of categorical columns with counts of a target
    column.

    The CountFeaturizer is an efficient way of reducing high dimensional
    categorical columns into simple counts for the purpose of classification.
    Supported types are only str and int and both are interpreted
    categorically. The CountFeaturizer is effective for significantly
    accelerating downstream learning procedures without loss of accuracy for
    extremely large datasets.

    Assume we are going to try to predict column Y, which has K unique
    classes. Then for every column X, we replace it with 2 columns, "count_X"
    and "prob_X". The column count_X contains an array of length K which
    contains the counts of each unique value of Y where X is fixed. The column
    prob_X contains the normalized value of count_X dropping the last value.

    For instance, given the following SFrame:

    .. sourcecode:: python

        >>> sf = graphlab.SFrame({'a' : [1,1,2], 'y':[0,1,0]})
        +---+---+
        | a | y |
        +---+---+
        | 1 | 0 |
        | 1 | 1 |
        | 2 | 0 |
        +---+---+

    After fit_transform the output SFrame is

    .. sourcecode:: python

        >>> cf = graphlab.feature_engineering.CountFeaturizer(target = 'y', laplace_smearing=0)
        >>> cf.fit_transform(sf)
        +------------+--------+---+
        |  count_a   | prob_a | y |
        +------------+--------+---+
        | [1.0, 1.0] | [0.5]  | 0 |
        | [1.0, 1.0] | [0.5]  | 1 |
        | [1.0, 0.0] | [1.0]  | 0 |
        +------------+--------+---+
        [3 rows x 3 columns]

    Observe that in the original SFrame, there is 1 occurrence where a = 1 &
    y = 0 and 1 occurrence where a = 1 & y = 1. Thus in every row where a = 1,
    we output [1.0, 1.0] in the count_a column. Similarly, for the case of
    a = 2, we have a count of 1 where y = 0 & a = 2, and no occurrences of
    y = 1 & a = 2. Hence in every row where a = 2, we output [1.0, 0.0] in the
    count_a column. The prob_a column is just the count_a column, normalized
    to sum to 1, and dropping the last value.

    The laplace_smearing parameter controls the amount of noise added to the
    result, which may allow fit() and transform() to be performed on the same
    dataset. Tuning this parameter can be difficult in practice, however.
    Therefore it is highly recommended (and is the default behavior) to set
    laplace_smearing=0 and split the training dataset into two sets, where one
    set is used only in fit() and the other set used only in transform().

    Parameters
    ----------
    target : str, required
        The target column we are trying to predict.

    features : list[str] | str | None, optional
        Name(s) of feature column(s) to be transformed. If set to None, then
        all columns are used.

    excluded_features : list[str] | str | None, optional
        Name(s) of feature columns in the input dataset to be ignored. Either
        `excluded_features` or `features` can be passed, but not both.

    num_bits : int, optional
        This parameter is the size of the countmin sketch used to approximate
        the counts and controls the accuracy of the counts. The higher the
        value, the more accurate the counts, but takes up more memory.
        Defaults to 20.

    laplace_smearing : float, optional
        Defaults to 0. Adds some noise to the transform result to allow the
        same dataset to be used for both fit and transform. When the number of
        rows is small, this parameter can be reduced in value. If set to 0, it
        is recommended that the training set be split into two sets, where one
        set is used only in fit() and the other used only in transform().

    random_seed : int, optional
        A random seed. Fix this to get deterministic outcomes.
count_column_prefix : str, optional The prefix added to the input column name to produce the output column name containing the counts. Defaults to `count_` prob_column_prefix : str, optional The prefix added to the input column name to produce the output column name containing the normalized counts. Defaults to `prob_` Returns ------- out : CountFeaturizer A CountFeaturizer object which is initialized with the defined parameters. Notes ----- The prob_X columns have one value dropped to eliminate a linear dependency. References ---------- Implements the method described in `this blog <https://blogs.technet.microsoft.com/machinelearning/2015/02/17/big-learning-made-easy-with-counts/>`. Examples -------- .. sourcecode:: python >>> from graphlab.toolkits.feature_engineering import * # Perform Count Featurization on columns 'a' and 'b' with respect to # the target 'y' >>> sf = graphlab.SFrame({'a' : [1,1,2], 'b' : [2,2,3], 'y':[0,1,0]}) >>> cf = graphlab.feature_engineering.create(sf, ... graphlab.feature_engineering.CountFeaturizer( ... features = ['a', 'b'], target = 'y')) # Transform the data >>> out_sf = cf.fit_transform(sf) >>> out_sf +------------+--------+------------+--------+---+ | count_a | prob_a | count_b | prob_b | y | +------------+--------+------------+--------+---+ | [1.0, 1.0] | [0.5] | [1.0, 1.0] | [0.5] | 0 | | [1.0, 1.0] | [0.5] | [1.0, 1.0] | [0.5] | 1 | | [1.0, 0.0] | [1.0] | [1.0, 0.0] | [1.0] | 0 | +------------+--------+------------+--------+---+ [3 rows x 5 columns] # Save the transformer. >>> cf.save('save-path') ''' _fit_examples_doc = _fit_examples_doc _fit_transform_examples_doc = _fit_transform_examples_doc _transform_examples_doc = _transform_examples_doc get_default_options = staticmethod(_get_default_options_wrapper( '_CountFeaturizer', 'toolkits.feature_engineering._count_featurizer', 'CountFeaturizer', True)) _metric_handle = 'toolkits.feature_engineering.count_featurizer' def __init__(self, target, features=None, excluded_features=None, random_seed=None, laplace_smearing=0.0, num_bits=20, count_column_prefix='count_', prob_column_prefix='prob_'): _mt._get_metric_tracker().track(self._metric_handle + '.__init__') if count_column_prefix == prob_column_prefix: raise RuntimeError("count_column_prefix cannot be equal to prob_column_prefix") # Process and make a copy of the features, exclude. _features, _exclude = _internal_utils.process_features( features, excluded_features) # Type checking _raise_error_if_not_of_type(num_bits, [int]) # Set up options opts = { 'target':target, 'num_bits': num_bits, 'random_seed': random_seed, 'laplace_smearing':laplace_smearing, 'num_bits':num_bits, 'count_column_prefix':count_column_prefix, 'prob_column_prefix':prob_column_prefix } if _exclude: opts['exclude'] = True opts['features'] = _exclude else: opts['exclude'] = False opts['features'] = _features # Initialize object proxy = _gl.extensions._CountFeaturizer() proxy.init_transformer(opts) super(CountFeaturizer, self).__init__(proxy, self.__class__) def _get_summary_struct(self): """ Returns a structured description of the model, including (where relevant) the schema of the training data, description of the training data, training statistics, and model hyperparameters. Returns ------- sections : list (of list of tuples) A list of summary sections. Each section is a list. Each item in a section list is a tuple of the form: ('<label>','<field>') section_titles: list A list of section titles. The order matches that of the 'sections' object. 
""" _features = _precomputed_field( _internal_utils.pretty_print_list(self.get('features'))) _exclude = _precomputed_field( _internal_utils.pretty_print_list(self.get('excluded_features'))) fields = [ ("Target", "target"), ("Features", _features), ("Excluded features", _exclude), ("Number of bits", 'num_bits'), ("Random seed", 'random_seed'), ("Laplace Smearing", 'laplace_smearing'), ("Count Column Prefix", 'count_column_prefix'), ("Probability Column Prefix", 'prob_column_prefix') ] section_titles = [ 'Model fields' ] return ([fields], section_titles) def __repr__(self): (sections, section_titles) = self._get_summary_struct() return _toolkit_repr_print(self, sections, section_titles, width= 30) @classmethod def _get_instance_and_data(cls): sf = _gl.SFrame({'a' : [1,1,2], 'b' : [2,2,3], 'y':[0,1,0]}) cf = _gl.feature_engineering.CountFeaturizer(features = ['a', 'b'], target='y') return cf.fit(sf), sf
# Utils from graphlab.util import _raise_error_if_not_of_type from graphlab.toolkits._internal_utils import _toolkit_repr_print, \ _precomputed_field, \ _raise_error_if_not_sframe, \ _check_categorical_option_type from graphlab.toolkits._model import _get_default_options_wrapper from graphlab.toolkits._model import SDKModel as _SDKModel _DEFAULT_OPTIONS = { 'min_support': 1, 'max_patterns': 100, 'min_length': 1, } get_default_options = _get_default_options_wrapper( '_FPGrowth', 'frequent_pattern_mining', 'FrequentPatternMiner', True) def create(dataset, item, features=None, min_support=1, max_patterns=100, min_length=1): """ Create a :class:`~graphlab.frequent_pattern_mining.FrequentPatternMiner` to extract the set of frequently occurring items in an event-series. Parameters ---------- dataset : SFrame Dataset for training the model. item: string
except (RuntimeError, KeyError): possible_args = set() bad_arguments = set(kwargs.keys()).difference(possible_args) if bad_arguments: raise TypeError("Bad Keyword Arguments: " + ', '.join(bad_arguments)) opts.update(kwargs) response = _graphlab.toolkits._main.run('recsys_train', opts, verbose) return RankingFactorizationRecommender(response['model']) get_default_options = _get_default_options_wrapper( 'ranking_factorization_recommender', 'recommender.RankingFactorizationRecommender', 'RankingFactorizationRecommender') class RankingFactorizationRecommender(_Recommender): r""" A RankingFactorizationRecommender learns latent factors for each user and item and uses them to rank recommended items according to the likelihood of observing those (user, item) pairs. This is commonly desired when performing collaborative filtering for implicit feedback datasets or datasets with explicit ratings for which ranking prediction is desired. RankingFactorizationRecommender contains a number of options that tailor to a variety of datasets and evaluation metrics, making this one of the most powerful models in the GraphLab Create recommender toolkit.
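# A minimal usage sketch for implicit-feedback data. The SFrame and its column
# names ('user_id', 'item_id') are made up for illustration and num_factors=8
# is an arbitrary choice; the model is built through
# graphlab.recommender.ranking_factorization_recommender.create rather than by
# constructing RankingFactorizationRecommender directly.
import graphlab

sf = graphlab.SFrame({'user_id': ['a', 'a', 'b', 'c'],
                      'item_id': ['x', 'y', 'x', 'z']})
m = graphlab.recommender.ranking_factorization_recommender.create(
    sf, user_id='user_id', item_id='item_id', num_factors=8)
recs = m.recommend(users=['b'], k=2)    # rank unobserved items for user 'b'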
from graphlab.toolkits._model import _get_default_options_wrapper from graphlab.toolkits._supervised_learning import Classifier as _Classifier import graphlab.toolkits._supervised_learning as _sl from graphlab.toolkits._model_workflow import _collect_model_workflow from graphlab.toolkits._internal_utils import _toolkit_repr_print from graphlab.toolkits._supervised_learning import _show_model_tree from graphlab.toolkits._internal_utils import _raise_error_evaluation_metric_is_valid from graphlab.toolkits._internal_utils import _raise_error_if_not_sframe from graphlab.toolkits._internal_utils import _raise_error_if_column_exists from graphlab.toolkits._internal_utils import _check_categorical_option_type from graphlab.toolkits._internal_utils import _map_unity_proxy_to_object from graphlab.toolkits._tree_model_mixin import TreeModelMixin as _TreeModelMixin get_default_options = _get_default_options_wrapper( 'random_forest_classifier', 'random_forest_classifier', 'RandomForestClassifier') __doc_string_context = ''' >>> url = 'http://s3.amazonaws.com/gl-testdata/xgboost/mushroom.csv' >>> data = graphlab.SFrame.read_csv(url) >>> train, test = data.random_split(0.8) >>> model = graphlab.random_forest_classifier.create(train, target='label') ''' class RandomForestClassifier(_Classifier, _TreeModelMixin): """ The random forest model can be used as a classifier for predictive tasks.
| EmpId | stay_probability | +-------+------------------+ | 1 | 0.841130895119 | | 2 | 0.121616783954 | | 3 | 0.121616783954 | | 4 | 0.121616783954 | | 5 | 0.121616783954 | | 6 | 0.121616783954 | | 7 | 0.121616783954 | | 8 | 0.121616783954 | | 9 | 0.121616783954 | | 10 | 0.121616783954 | +-------+------------------+ [49 rows x 2 columns] # It is important to notice that the output of the model is the User Id, as well # as the Probability of the user Staying (not churning). This means that 100% # means the user will stay (not churn), and 0% means the user will definitely churn. >>> model.save("model_file") >>> load_model = gl.load_model("model_file") """ from _churn_predictor import create from _churn_predictor import ChurnPredictor from graphlab.toolkits._model import _get_default_options_wrapper get_default_options = _get_default_options_wrapper( '_ChurnPredictor', 'churn_predictor', 'ChurnPredictor', True)
class CountThresholder(Transformer): ''' Map infrequent categorical variables to a `new/separate` category. Input columns to the CountThresholder must be of type *int*, *string*, *dict*, or *list*. For each column in the input, the transformed output is a column where the input category is retained as is if: * it has occurred at least `threshold` times in the training data. categories that does not satisfy the above are set to `output_category_name`. The behaviour for different input data column types is as follows: (see :func:`~graphlab.feature_engineering.CountThresholder.transform` for for examples). * **string** : Strings are marked with the `output_category_name` if the threshold condition described above is not satisfied. * **int** : Behave the same way as *string*. If `output_category_name` is of type *string*, then the entire column is cast to string. * **list** : Each of the values in the list are mapped in the same way as a string value. * **dict** : They key of the dictionary is treated as a `namespace` and the value is treated as a `sub-category` in the `namespace`. The categorical variable passed through the transformer is a combination of the `namespace` and the `sub-category`. Parameters ---------- features : list[str] | str | None, optional Name(s) of feature column(s) to be transformed. If set to None, then all feature columns are used. excluded_features : list[str] | str | None, optional Name(s) of feature columns in the input dataset to be ignored. Either `excluded_features` or `features` can be passed, but not both. threshold : int, optional Ignore all categories that have not occurred at least `threshold` times. All categories that do not occur at least `threshold` times are mapped to the `output_category_name`. output_category_name : str | None, optional The value to use for the categories that do not satisfy the `threshold` condition. output_column_prefix : str, optional The prefix to use for the column name of each transformed column. When provided, the transformation will add columns to the input data, where the new name is "`output_column_prefix`.original_column_name". If `output_column_prefix=None` (default), then the output column name is the same as the original feature column name. Returns ------- out : CountThresholder A CountThresholder object which is initialized with the defined parameters. See Also -------- graphlab.toolkits.feature_engineering._count_thresholder.CountThresholder graphlab.toolkits.feature_engineering.create Notes ----- - If the SFrame to be transformed already contains a column with the designated output column name, then that column will be replaced with the new output. In particular, this means that `output_column_prefix=None` will overwrite the original feature columns. - If the `output_category_name` and input feature column are not of the same type, then the output column is cast to `str`. - `None` values are treated as separate categories and are encoded along with the rest of the values. Examples -------- .. sourcecode:: python # Create data. >>> sf = graphlab.SFrame({'a': [1,2,3], 'b' : [2,3,4]}) # Create a transformer. >>> from graphlab.toolkits.feature_engineering import CountThresholder >>> count_tr = graphlab.feature_engineering.create(sf, CountThresholder(features = ['a', 'b'], threshold = 1)) # Transform the data. >>> transformed_sf = count_tr.transform(sf) # Save the transformer. >>> count_tr.save('save-path') # Return the categories that are not discarded. 
>>> count_tr['categories'] Columns: feature str category str Rows: 6 Data: +---------+----------+ | feature | category | +---------+----------+ | a | 1 | | a | 2 | | a | 3 | | b | 2 | | b | 3 | | b | 4 | +---------+----------+ [6 rows x 2 columns] ''' # Doc strings _fit_examples_doc = _fit_examples_doc _transform_examples_doc = _transform_examples_doc _fit_transform_examples_doc = _fit_transform_examples_doc # Default options get_default_options = staticmethod( _get_default_options_wrapper( '_CountThresholder', 'toolkits.feature_engineering._count_thresholder', 'CountThresholder', True)) def __init__(self, features=None, excluded_features=None, threshold=1, output_category_name=None, output_column_prefix=None): # Process and make a copy of the features, exclude. _features, _exclude = _internal_utils.process_features( features, excluded_features) # Type checking _raise_error_if_not_of_type(threshold, [int, type(None)]) # Set up options opts = { 'threshold': threshold, 'output_category_name': output_category_name, 'output_column_prefix': output_column_prefix } if _exclude: opts['exclude'] = True opts['features'] = _exclude else: opts['exclude'] = False opts['features'] = _features # Initialize object proxy = _gl.extensions._CountThresholder() proxy.init_transformer(opts) super(CountThresholder, self).__init__(proxy, self.__class__) def _get_summary_struct(self): """ Returns a structured description of the model, including (where relevant) the schema of the training data, description of the training data, training statistics, and model hyperparameters. Returns ------- sections : list (of list of tuples) A list of summary sections. Each section is a list. Each item in a section list is a tuple of the form: ('<label>','<field>') section_titles: list A list of section titles. The order matches that of the 'sections' object. """ _features = _precomputed_field( _internal_utils.pretty_print_list(self.get('features'))) _exclude = _precomputed_field( _internal_utils.pretty_print_list(self.get('excluded_features'))) fields = [ ("Features", _features), ("Excluded features", _exclude), ("New category name", 'output_category_name'), ("Occurrence threshold", 'threshold'), ] section_titles = ['Model fields'] return ([fields], section_titles) def __repr__(self): (sections, section_titles) = self._get_summary_struct() return _toolkit_repr_print(self, sections, section_titles, 30) @classmethod def _get_instance_and_data(cls): sf = _gl.SFrame({'a': [1, 2, 3, 2, 3], 'b': [2, 3, 4, 2, 3]}) count_tr = _gl.feature_engineering.CountThresholder( features=['a', 'b'], threshold=2, output_category_name='junk') return count_tr.fit(sf), sf
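# The thresholding rule documented above can be sketched in plain Python for a
# single column: categories seen fewer than `threshold` times during fit are
# mapped to `output_category_name`. Illustrative only; the real work happens
# in the _gl.extensions._CountThresholder proxy.
from collections import Counter

def _sketch_count_threshold(train_values, new_values, threshold=2,
                            output_category_name='rare'):
    counts = Counter(train_values)                      # "fit": count categories
    keep = {v for v, c in counts.items() if c >= threshold}
    return [v if v in keep else output_category_name    # "transform": remap rare ones
            for v in new_values]

train = ['a', 'a', 'b', 'c']
print(_sketch_count_threshold(train, train))            # ['a', 'a', 'rare', 'rare']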
class RareWordTrimmer(Transformer): ''' Remove words that occur below a certain number of times in a given column. This is a common method of cleaning text before it is used, and can increase the quality and explainability of the models learned on the transformed data. RareWordTrimmer can be applied to all the string-, dictionary-, and list-typed columns in a given SFrame. Its behavior for each supported input column type is as follows. (See :func:`~graphlab.feature_engineering.RareWordTrimmer.transform` for usage examples). * **string** : The string is first tokenized. By default, all letters are first converted to lower case, then tokenized by space characters. Each token is taken to be a word, and the words occuring below a threshold number of times across the entire column are removed, then the remaining tokens are concatenated back into a string. * **list** : Each element of the list must be a string, where each element is assumed to be a token. The remaining tokens are then filtered by count occurences and a threshold value. * **dict** : The method first obtains the list of keys in the dictionary. This list is then processed as a standard list, except the value of each key must be of integer type and is considered to be the count of that key. Parameters ---------- features : list[str] | str | None, optional Name(s) of feature column(s) to be transformed. If set to None, then all feature columns are used. excluded_features : list[str] | str | None, optional Name(s) of feature columns in the input dataset to be ignored. Either `excluded_features` or `features` can be passed, but not both. threshold : int, optional The count below which words are removed from the input. stopwords: list[str], optional A manually specified list of stopwords, which are removed regardless of count. to_lower : bool, optional Indicates whether to map the input strings to lower case before counting. delimiters: list[string], optional A list of delimiter characters for tokenization. By default, the list is defined to be the list of space characters. The user can define any custom list of single-character delimiters. Alternatively, setting `delimiters=None` will use a Penn treebank type tokenization, which is better at handling punctuations. (See reference below for details.) output_column_prefix : str, optional The prefix to use for the column name of each transformed column. When provided, the transformation will add columns to the input data, where the new name is "`output_column_prefix`.original_column_name". If `output_column_prefix=None` (default), then the output column name is the same as the original feature column name. Returns ------- out : RareWordTrimmer A RareWordTrimmer feature engineering object which is initialized with the defined parameters. Notes ----- If the SFrame to be transformed already contains a column with the designated output column name, then that column will be replaced with the new output. In particular, this means that `output_column_prefix=None` will overwrite the original feature columns. References ---------- - `Penn treebank tokenization <https://www.cis.upenn.edu/~treebank/tokenization.html>`_ See Also -------- graphlab.toolkits.text_analytics.count_words, graphlab.toolkits.feature_engineering._ngram_counter.NGramCounter, graphlab.toolkits.feature_engineering._tfidf.TFIDF, graphlab.toolkits.feature_engineering._tokenizer.Tokenizer, graphlab.toolkits.feature_engineering.create Examples -------- .. sourcecode:: python >>> import graphlab as gl # Create data. 
>>> sf = gl.SFrame({ ... 'string': ['sentences Sentences', 'another sentence another year'], ... 'dict': [{'bob': 1, 'Bob': 2}, {'a': 0, 'cat': 5}], ... 'list': [['one', 'two', 'three', 'Three'], ['a', 'cat', 'Cat']]}) # Create a RareWordTrimmer transformer. >>> from graphlab.toolkits.feature_engineering import RareWordTrimmer >>> trimmer = RareWordTrimmer() # Fit and transform the data. >>> transformed_sf = trimmer.fit_transform(sf) Columns: dict dict list list string str Rows: 2 Data: +------------+----------------+---------------------+ | dict | list | string | +------------+----------------+---------------------+ | {'bob': 2} | [three, three] | sentences sentences | | {'cat': 5} | [cat, cat] | another another | +------------+----------------+---------------------+ [2 rows x 3 columns] # Save the transformer. >>> trimmer.save('save-path') ''' # Doc strings _fit_examples_doc = _fit_examples_doc _transform_examples_doc = _transform_examples_doc _fit_transform_examples_doc = _fit_transform_examples_doc # Default options get_default_options = staticmethod( _get_default_options_wrapper( '_RareWordTrimmer', 'toolkits.feature_engineering._word_trimmer', 'RareWordTrimmer', True)) def __init__(self, features=None, excluded_features=None, threshold=2, stopwords=None, to_lower=True, delimiters=["\r", "\v", "\n", "\f", "\t", " "], output_column_prefix=None): # Process and make a copy of the features, exclude. _features, _exclude = _internal_utils.process_features( features, excluded_features) # Type checking _raise_error_if_not_of_type(features, [list, str, type(None)]) _raise_error_if_not_of_type(threshold, [int, type(None)]) _raise_error_if_not_of_type(output_column_prefix, [str, type(None)]) _raise_error_if_not_of_type(stopwords, [list, type(None)]) _raise_error_if_not_of_type(to_lower, [bool]) _raise_error_if_not_of_type(delimiters, [list, type(None)]) if delimiters != None: for delim in delimiters: _raise_error_if_not_of_type(delim, str, "delimiters") if (len(delim) != 1): raise ValueError( "Delimiters must be single-character strings") # Set up options opts = { 'threshold': threshold, 'output_column_prefix': output_column_prefix, 'to_lower': to_lower, 'stopwords': stopwords, 'delimiters': delimiters } if _exclude: opts['exclude'] = True opts['features'] = _exclude else: opts['exclude'] = False opts['features'] = _features # Initialize object proxy = _gl.extensions._RareWordTrimmer() proxy.init_transformer(opts) super(RareWordTrimmer, self).__init__(proxy, self.__class__) def _get_summary_struct(self): """ Returns a structured description of the model, including (where relevant) the schema of the training data, description of the training data, training statistics, and model hyperparameters. Returns ------- sections : list (of list of tuples) A list of summary sections. Each section is a list. Each item in a section list is a tuple of the form: ('<label>','<field>') section_titles: list A list of section titles. The order matches that of the 'sections' object. 
""" _features = _precomputed_field( _internal_utils.pretty_print_list(self.get('features'))) _exclude = _precomputed_field( _internal_utils.pretty_print_list(self.get('excluded_features'))) _stopwords = _precomputed_field( _internal_utils.pretty_print_list(self.get('stopwords'))) fields = [("Features", _features), ("Excluded features", _exclude), ("Output column name", 'output_column_prefix'), ("Word count threshold", 'threshold'), ("Manually specified stopwords", _stopwords), ("Whether to convert to lowercase", "to_lower"), ("Delimiters", "delimiters")] section_titles = ['Model fields'] return ([fields], section_titles) def __repr__(self): """ Return a string description of the model, including a description of the training data, training statistics, and model hyper-parameters. Returns ------- out : string A description of the model. """ accessible_fields = { "vocabulary": "The vocabulary of the trimmed input." } (sections, section_titles) = self._get_summary_struct() out = _toolkit_repr_print(self, sections, section_titles, width=30) out2 = _summarize_accessible_fields(accessible_fields, width=30) return out + "\n" + out2 @classmethod def _get_instance_and_data(cls): sf = _gl.SFrame({ 'a': ['dog', 'dog', 'dog'], 'b': ['cat', 'one', 'one'] }) trimmer = _gl.feature_engineering.RareWordTrimmer(features=['a', 'b']) return trimmer.fit(sf), sf
class QuadraticFeatures(Transformer): ''' Calculates quadratic interaction terms between features. Adding interaction terms is a good way of injecting complex relationships between predictor variables while still using a simple learning algorithm (ie. Logistic Regression) that is easy to use and explain. The QuadraticFeatures transformer accomplishes this by taking a row of the SFrame, and multiplying the specified features together. If the features are of array.array or dictionary type, multiplications of all possible pairs are computed. If a non-numeric value is encountered, 1 is substituted for the value and the old string value becomes part of the interaction term name. Supported types are int, float, string, array.array, list, and dict. When the transformer is applied, an additional column with name specified by 'output_column_name' is added to the input SFrame. In this column of dictionary type, interactions are specified in the key names (by concatenating column names and keys/indices if applicable) and values are the multiplied values. Parameters ---------- features : list | str | tuple , optional Can be a list of tuples, a list of feature name strings, a feature name string, a tuple, or None. If it is a list of tuples containing two interaction terms, those are the calculated interaction terms. In the case of providing a list of feature_names, all pairs between those feature names are calculated. If the list is of size none, all feature pairs are calculated in the SFrame the transformer is applied to. excluded_features: list | str | tuple, optional Can be a list of tuples, a list of feature name strings, a feature name string, a tuple, or None. In the case of tuples, those particular interactions are excluded. In the case of feature names, all interactions with those features are excluded. Cannot set both 'exclude' and 'features'. output_column_name : str , optional The name of the output column Returns ------- out : QuadraticFeatures A QuadraticFeatures object which is initialized with the defined parameters. See Also -------- graphlab.toolkits.feature_engineering.QuadraticFeatures graphlab.toolkits.feature_engineering.create Examples -------- .. sourcecode:: python from graphlab.toolkits.feature_engineering import * # Construct a quadratic features transformer with default options. >>> sf = graphlab.SFrame({'a': [1,2,3], 'b' : [2,3,4], 'c': [9,10,11]}) >>> quadratic = graphlab.feature_engineering.create(sf, QuadraticFeatures(features = ['a', 'b', 'c'])) # Transform the data. >>> quadratic_sf = quadratic.transform(sf) # Save the transformer. >>> quadratic.save('save-path') ''' _fit_examples_doc = _fit_examples_doc _transform_examples_doc = _transform_examples_doc _fit_transform_examples_doc = _fit_transform_examples_doc get_default_options = staticmethod( _get_default_options_wrapper( '_QuadraticFeatures', 'toolkits.feature_engineering._quadratic_features', 'QuadraticFeatures', True)) def __init__(self, features=None, excluded_features=None, output_column_name='quadratic_features'): #Type checking _raise_error_if_not_of_type(output_column_name, [str]) # set up options opts = {'output_column_name': output_column_name} # Make a copy of the parameters. _features = _copy.copy(features) _exclude = _copy.copy(excluded_features) # Check of both are None or empty. if _features and _exclude: raise ValueError( "The parameters 'features' and 'exclude' cannot both be set." 
" Please set one or the other.") if _features == [] and not _exclude: raise ValueError("Features cannot be an empty list.") # Check types _raise_error_if_not_of_type(_features, [NoneType, list, str, tuple], 'features') _raise_error_if_not_of_type(_exclude, [NoneType, list, str, tuple], 'exclude') # Allow a single list _features = [ _features ] if type(_features) == str or type(_features) == tuple else _features _exclude = [ _exclude ] if type(_exclude) == str or type(_exclude) == tuple else _exclude # Type check each feature/exclude if _features: for f in _features: _raise_error_if_not_of_type(f, [str, tuple], "Feature names") if _exclude: for e in _exclude: _raise_error_if_not_of_type(e, [str, tuple], "Excluded feature names") if _exclude: opts['exclude'] = True unprocessed_features = _exclude else: opts['exclude'] = False unprocessed_features = _features pair_list = set() if unprocessed_features is not None: if type(unprocessed_features[0]) is tuple: for t in unprocessed_features: pair_list.add(tuple(sorted(t))) elif type(unprocessed_features[0]) is str: if _exclude: for t in unprocessed_features: pair_list.add(t) else: for t in unprocessed_features: for k in unprocessed_features: pair_list.add(tuple(sorted((t, k)))) if type(output_column_name) is not str: raise ValueError("'output_column_name' must be of type str") if unprocessed_features is not None: if type(unprocessed_features[0]) is str: opts['features'] = unprocessed_features if _exclude: opts['feature_pairs'] = list(pair_list) else: opts['feature_pairs'] = [list(x) for x in pair_list] else: opts['feature_pairs'] = [list(x) for x in pair_list] opts['features'] = [list(x) for x in unprocessed_features] else: opts['feature_pairs'] = None opts['features'] = None # initialize object proxy = _gl.extensions._QuadraticFeatures() proxy.init_transformer(opts) super(QuadraticFeatures, self).__init__(proxy, self.__class__) def _get_summary_struct(self): """ Returns a structured description of the model, including (where relevant) the schema of the training data, description of the training data, training statistics, and model hyperparameters. Returns ------- sections : list (of list of tuples) A list of summary sections. Each section is a list. Each item in a section list is a tuple of the form: ('<label>','<field>') section_titles: list A list of section titles. The order matches that of the 'sections' object. """ _features = _precomputed_field( _internal_utils.pretty_print_list(self.get('features'))) _exclude = _precomputed_field( _internal_utils.pretty_print_list(self.get('excluded_features'))) fields = [("Features", _features), ("Excluded features", _exclude), ("Output column name", 'output_column_name')] section_titles = ['Model fields'] return ([fields], section_titles) def __repr__(self): (sections, section_titles) = self._get_summary_struct() return _toolkit_repr_print(self, sections, section_titles, width=30) @classmethod def _get_instance_and_data(cls): sf = _gl.SFrame({'a': [1, 2, 3], 'b': [2, 3, 4]}) encoder = _gl.feature_engineering.QuadraticFeatures( features=['a', 'b']) return encoder.fit(sf), sf
get_default_options, list_fields, get Examples -------- >>> sf = graphlab.SFrame({'a' : [0.1, 8, 3.5], 'b':[-3, 7.6, 3]}) >>> model = graphlab.kmeans.create(sf, 2) >>> model.get_current_options() {'num_clusters': 2, 'max_iterations': 10} """ opts = {'model': self.__proxy__, 'model_name': self.__name__} return _gl.toolkits._main.run('kmeans_get_current_options', opts) get_default_options = _get_default_options_wrapper( 'kmeans', 'kmeans', 'KmeansModel') def create(dataset, num_clusters=None, features=None, label=None, initial_centers=None, max_iterations=10, batch_size=None, verbose=True): """ Create a k-means clustering model. The KmeansModel object contains the computed cluster centers and the cluster assignment for each instance in the input 'dataset'. Given a number of clusters, k-means iteratively chooses the best cluster centers and assigns nearby points to the best cluster. If no points change cluster membership between iterations, the algorithm terminates.
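# The iteration described above (assign each point to its nearest center,
# recompute centers, stop when no assignment changes) can be sketched in a few
# lines of plain Python over 1-D points. Purely illustrative of the k-means
# loop, not the toolkit's implementation.
def _sketch_lloyd(points, centers, max_iterations=10):
    assignments = None
    for _ in range(max_iterations):
        new_assign = [min(range(len(centers)),
                          key=lambda j: (p - centers[j]) ** 2) for p in points]
        if new_assign == assignments:        # no point changed cluster: converged
            break
        assignments = new_assign
        for j in range(len(centers)):        # recompute each center as a mean
            members = [p for p, a in zip(points, assignments) if a == j]
            if members:
                centers[j] = sum(members) / float(len(members))
    return centers, assignments

print(_sketch_lloyd([0.1, 8.0, 3.5], centers=[0.0, 8.0]))
# ([1.8, 8.0], [0, 1, 0])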
    bad_arguments = set(kwargs.keys()).difference(possible_args)
    if bad_arguments:
        raise TypeError("Bad Keyword Arguments: " + ', '.join(bad_arguments))

    opts.update(kwargs)

    response = _graphlab.toolkits._main.run('recsys_train', opts, verbose)
    return ItemSimilarityRecommender(response['model'])

get_default_options = _get_default_options_wrapper(
    'item_similarity', 'recommender.item_similarity', 'ItemSimilarityRecommender')

class ItemSimilarityRecommender(_Recommender):
    """
    A model that ranks an item according to its similarity to other items
    observed for the user in question.

    **Creating an ItemSimilarityRecommender**

    This model cannot be constructed directly. Instead, use
    :func:`graphlab.recommender.item_similarity_recommender.create` to create
    an instance of this model. A detailed list of parameter options and code
    samples are available in the documentation for the create function.
_DEFAULT_SOLVER_OPTIONS = { "convergence_threshold": 1e-2, "step_size": 1.0, "lbfgs_memory_level": 11, "max_iterations": 10, } DEFAULT_HYPER_PARAMETER_RANGE = { "l1_penalty": [0.0, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0], "l2_penalty": [0.0, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0], } get_default_options = _get_default_options_wrapper( "classifier_logistic_regression", "logistic_classifier", "LogisticClassifier" ) def create( dataset, target, features=None, l2_penalty=0.01, l1_penalty=0.0, solver="auto", feature_rescaling=True, convergence_threshold=_DEFAULT_SOLVER_OPTIONS["convergence_threshold"], step_size=_DEFAULT_SOLVER_OPTIONS["step_size"], lbfgs_memory_level=_DEFAULT_SOLVER_OPTIONS["lbfgs_memory_level"], max_iterations=_DEFAULT_SOLVER_OPTIONS["max_iterations"],
class OneHotEncoder(Transformer):
    '''
    Encode a collection of categorical features using a *1-of-K* encoding
    scheme.

    Input columns to the one-hot-encoder must be of type *int*, *string*,
    *dict*, or *list*. The transformed output is a column of type dictionary
    (`max_categories` per column dimension sparse vector) where the key
    corresponds to the index of the categorical variable and the value is `1`.

    The behaviour of the one-hot-encoder for each input data column type is as
    follows. (See :func:`~graphlab.feature_engineering.OneHotEncoder.transform`
    for examples.)

    * **string** : The key in the output dictionary is the string category and
      the value is 1.

    * **int** : Behaves the same way as *string* columns.

    * **list** : Each value in the list is treated like an individual string.
      Hence, a *list* of categorical variables can be used to represent a
      feature where all categories in the list are simultaneously `hot`.

    * **dict** : The key of the dictionary is treated as a `namespace` and the
      value is treated as a `sub-category` in the `namespace`. The categorical
      variable being encoded in this case is a combination of the `namespace`
      and the `sub-category`.

    Parameters
    ----------
    features : list[str] | str | None, optional
        Name(s) of feature column(s) to be transformed. If set to None, then
        all columns are used.

    excluded_features : list[str] | str | None, optional
        Name(s) of feature columns in the input dataset to be ignored. Either
        `excluded_features` or `features` can be passed, but not both.

    max_categories : int, optional
        The maximum number of categories (per feature column) to use in the
        encoding. If the number of unique categories in a column exceeds
        `max_categories`, then only the most frequently used categories are
        retained. If set to None, then all categories in the column are used.

    output_column_name : str, optional
        The name of the output column. If the column already exists, then a
        suffix is appended to the name.

    Returns
    -------
    out : OneHotEncoder
        A OneHotEncoder object which is initialized with the defined
        parameters.

    Notes
    -----
    - `None` values are treated as separate categories and are encoded along
      with the rest of the values.

    See Also
    --------
    graphlab.toolkits.feature_engineering._one_hot_encoder.OneHotEncoder,
    graphlab.toolkits.feature_engineering.create

    Examples
    --------
    .. sourcecode:: python

        # Create data.
        >>> sf = graphlab.SFrame({'a': [1,2,3], 'b' : [2,3,4]})

        # Create a one-hot encoder on the features ['a', 'b'].
        >>> from graphlab.toolkits.feature_engineering import OneHotEncoder
        >>> encoder = graphlab.feature_engineering.create(sf,
                          OneHotEncoder(features = ['a', 'b']))

        # Transform data.
        >>> transformed_sf = encoder.transform(sf)
        Columns:
            encoded_features dict
        Rows: 3
        Data:
        +------------------+
        | encoded_features |
        +------------------+
        |   {0: 1, 3: 1}   |
        |   {1: 1, 4: 1}   |
        |   {2: 1, 5: 1}   |
        +------------------+
        [3 rows x 1 columns]

        # Save the transformer.
        >>> encoder.save('save-path')

        # Return the indices in the encoding.
>>> encoder['feature_encoding'] Columns: feature str category str index int Rows: 6 Data: +---------+----------+-------+ | feature | category | index | +---------+----------+-------+ | a | 1 | 0 | | a | 2 | 1 | | a | 3 | 2 | | b | 2 | 3 | | b | 3 | 4 | | b | 4 | 5 | +---------+----------+-------+ ''' # Doc strings _fit_examples_doc = _fit_examples_doc _transform_examples_doc = _transform_examples_doc _fit_transform_examples_doc = _fit_transform_examples_doc # Default options get_default_options = staticmethod( _get_default_options_wrapper( '_OneHotEncoder', 'toolkits.feature_engineering._one_hot_encoder', 'OneHotEncoder', True)) def __init__(self, features=None, excluded_features=None, max_categories=None, output_column_name='encoded_features'): # Process and make a copy of the features, exclude. _features, _exclude = _internal_utils.process_features( features, excluded_features) # Type checking _raise_error_if_not_of_type(max_categories, [int, type(None)]) _raise_error_if_not_of_type(output_column_name, [str]) # Set up options opts = { 'max_categories': max_categories, 'output_column_name': output_column_name, } if _exclude: opts['exclude'] = True opts['features'] = _exclude else: opts['exclude'] = False opts['features'] = _features # Initialize object proxy = _gl.extensions._OneHotEncoder() proxy.init_transformer(opts) super(OneHotEncoder, self).__init__(proxy, self.__class__) def _get_summary_struct(self): """ Returns a structured description of the model, including (where relevant) the schema of the training data, description of the training data, training statistics, and model hyperparameters. Returns ------- sections : list (of list of tuples) A list of summary sections. Each section is a list. Each item in a section list is a tuple of the form: ('<label>','<field>') section_titles: list A list of section titles. The order matches that of the 'sections' object. """ _features = _precomputed_field( _internal_utils.pretty_print_list(self.get('features'))) _exclude = _precomputed_field( _internal_utils.pretty_print_list(self.get('excluded_features'))) fields = [ ("Features", _features), ("Excluded features", _exclude), ("Output column name", 'output_column_name'), ("Max categories per column", 'max_categories'), ] section_titles = ['Model fields'] return ([fields], section_titles) def __repr__(self): """ Return a string description of the model, including a description of the training data, training statistics, and model hyper-parameters. Returns ------- out : string A description of the model. """ (sections, section_titles) = self._get_summary_struct() return _toolkit_repr_print(self, sections, section_titles, width=30) @classmethod def _get_instance_and_data(cls): sf = _gl.SFrame({'a': [1, 2, 3, 2, 3], 'b': [2, 3, 4, 2, 3]}) encoder = _gl.feature_engineering.OneHotEncoder(features=['a', 'b'], max_categories=2) return encoder.fit(sf), sf
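# The 1-of-K index assignment documented above can be sketched in plain Python
# for int/str columns: each (column, category) pair gets a global index, and a
# row is encoded as {index: 1}. This only illustrates the mapping shown in the
# 'feature_encoding' example; max_categories and dict/list handling are omitted.
def _sketch_one_hot(rows, columns):
    index = {}                                           # (column, category) -> index
    for col in columns:
        for row in rows:
            key = (col, row[col])
            if key not in index:
                index[key] = len(index)
    return [{index[(col, row[col])]: 1 for col in columns} for row in rows]

rows = [{'a': 1, 'b': 2}, {'a': 2, 'b': 3}, {'a': 3, 'b': 4}]
print(_sketch_one_hot(rows, ['a', 'b']))
# [{0: 1, 3: 1}, {1: 1, 4: 1}, {2: 1, 5: 1}], matching the example output above.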
from graphlab.toolkits._supervised_learning import Classifier as _Classifier import graphlab.toolkits._supervised_learning as _sl import graphlab.toolkits._main as _toolkits_main from graphlab.toolkits._model_workflow import _collect_model_workflow from graphlab.toolkits._internal_utils import _toolkit_repr_print from graphlab.toolkits._supervised_learning import _show_model_tree from graphlab.toolkits._internal_utils import _raise_error_evaluation_metric_is_valid from graphlab.toolkits._internal_utils import _raise_error_if_not_sframe from graphlab.toolkits._internal_utils import _raise_error_if_column_exists from graphlab.toolkits._internal_utils import _check_categorical_option_type from graphlab.toolkits._internal_utils import _map_unity_proxy_to_object from graphlab.toolkits._tree_model_mixin import TreeModelMixin as _TreeModelMixin get_default_options = _get_default_options_wrapper( 'boosted_trees_classifier', 'boosted_trees_classifier', 'BoostedTreesClassifier') __doc_string_context = ''' >>> url = 'http://s3.amazonaws.com/gl-testdata/xgboost/mushroom.csv' >>> data = graphlab.SFrame.read_csv(url) >>> train, test = data.random_split(0.8) >>> model = graphlab.boosted_trees_classifier.create(train, target='label') ''' class BoostedTreesClassifier(_Classifier, _TreeModelMixin): """ The gradient boosted trees model can be used as a classifier for predictive tasks.
import graphlab.toolkits._supervised_learning as _sl from graphlab.toolkits._supervised_learning import Classifier as _Classifier from graphlab.toolkits._model import _get_default_options_wrapper from graphlab.toolkits._internal_utils import _raise_error_if_not_sframe, \ _map_unity_proxy_to_object, \ _toolkit_repr_print, \ _numeric_param_check_range from graphlab.toolkits._model_workflow import _collect_model_workflow from graphlab.util import cloudpickle as _cloudpickle import logging as _logging from copy import copy as _copy import six as _six get_default_options = _get_default_options_wrapper('neuralnet_classifier_v2', 'neuralnet_classifier', 'NeuralNetClassifier') _context_doc_string = ''' >>> data = graphlab.SFrame('https://static.turi.com/datasets/mnist/sframe/train') >>> training_data, validation_data = data.random_split(0.8) >>> net = graphlab.deeplearning.get_builtin_neuralnet('mnist') >>> m = graphlab.neuralnet_classifier.create(training_data, ... target='label', ... network=net, ... max_iterations=3) ''' class NeuralNetClassifier(_Classifier): """
class TransformToFlatDictionary(Transformer):
    '''
    Transforms column values into dictionaries with flat, non-nested string
    keys and numeric values. Each key in nested containers is a concatenation
    of the keys in each dictionary with `separator` separating them. For
    example, if ``separator = "."``, then {"a" : {"b" : 1}, "c" : 2} becomes
    {"a.b" : 1, "c" : 2}.

    - List and vector elements are handled by converting the index of the
      appropriate element to a string, then treating that as the key.

    - String values are handled by treating them as a single
      {"string_value" : 1} pair.

    - None values are handled by replacing them with the string contents of
      `none_tag`.

    - Image and datetime values are currently not supported and raise an
      error.

    Parameters
    ----------
    features : list, str
        Name(s) of feature column(s) to be transformed.

    excluded_features : list, str
        Names of feature column(s) to be excluded from the transformation.

    separator : str
        The separator string added between keys of nested dicts or lists.

    none_tag : str, optional
        The string used in place of None values.

    output_column_prefix : str, optional
        The prefix to use for the column name of each transformed column. When
        provided, the transformation will add columns to the input data, where
        the new name is "`output_column_prefix`.original_column_name". If
        `output_column_prefix=None` (default), then the output column name is
        the same as the original feature column name.

    Returns
    -------
    out : TransformToFlatDictionary
        A TransformToFlatDictionary object which is initialized with the
        defined parameters.

    Examples
    --------
    .. sourcecode:: python

        >>> import graphlab as gl

        # Create the data
        >>> sf = gl.SFrame(
                {'values': [{"a" : {"b" : 3}, "c": 2},
                            { "a" : { "b" : 3, "c" : 2.5 }, "c" : 2 },
                            {"a" : [1,2,4] , "c" : 2 },
                            { "a" : "b", "c" : 2 }]})

        # Create a TransformToFlatDictionary transformer object.
        >>> ft = gl.feature_engineering.TransformToFlatDictionary('values')

        # Fit the encoder for a given dataset.
        >>> ft = ft.fit(sf)

        >>> transformed_sf = ft.transform(sf)
        >>> transformed_sf.print_rows(max_column_width=60)
        +----------------------------------------------+
        |                    values                    |
        +----------------------------------------------+
        |              {'c': 2, 'a.b': 3}              |
        |        {'c': 2, 'a.b': 3, 'a.c': 2.5}        |
        | {'c': 2, 'a.0': 1.0, 'a.1': 2.0, 'a.2': 4.0} |
        |              {'c': 2, 'a.b': 1}              |
        +----------------------------------------------+
        [4 rows x 1 columns]
    '''

    # Doc strings
    _fit_examples_doc = _fit_examples_doc
    _fit_transform_examples_doc = _fit_transform_examples_doc
    _transform_examples_doc = _transform_examples_doc

    # Default options
    get_default_options = staticmethod(_get_default_options_wrapper(
        '_TransformToFlatDictionary',
        'toolkits.feature_engineering._transform_to_flat_dictionary',
        'TransformToFlatDictionary', True))

    def __init__(self, features=None, excluded_features=None,
                 separator=".", none_tag="__none__",
                 output_column_prefix=None):

        # Process and make a copy of the features, exclude.
        _features, _exclude = _internal_utils.process_features(features,
                                                               excluded_features)

        # Type checking
        _raise_error_if_not_of_type(output_column_prefix, [str, type(None)])

        if output_column_prefix is None:
            output_column_prefix = ''

        opts = {
            'separator': separator,
            'none_tag': none_tag,
            'output_column_prefix': output_column_prefix
        }

        if _exclude:
            opts['exclude'] = True
            opts['features'] = _exclude
        else:
            opts['exclude'] = False
            opts['features'] = _features

        # Initialize object
        proxy = _gl.extensions._TransformToFlatDictionary()
        proxy.init_transformer(opts)
        super(TransformToFlatDictionary, self).__init__(proxy, self.__class__)

    def _get_summary_struct(self):
        _features = _precomputed_field(
            _internal_utils.pretty_print_list(self.get('features')))
        _exclude = _precomputed_field(
            _internal_utils.pretty_print_list(self.get('excluded_features')))

        fields = [
            ("Features", _features),
            ("Excluded features", _exclude),
            ("Separator", "separator"),
            ("None Tag", "none_tag"),
            ("Output Column Prefix", 'output_column_prefix')
        ]
        section_titles = ['Model fields']
        return ([fields], section_titles)

    def __repr__(self):
        (sections, section_titles) = self._get_summary_struct()
        return _toolkit_repr_print(self, sections, section_titles, 30)

    @classmethod
    def _get_instance_and_data(cls):
        sf = _gl.SFrame(
            {'docs': [{'this': 1, 'is': 1, 'a': 2, 'sample': 1},
                      {'this': 1, 'is': 1, 'another': 2, 'example': 3}]})
        encoder = _gl.feature_engineering.TransformToFlatDictionary(
            features=['docs'])
        encoder = encoder.fit(sf)
        return encoder, sf
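# The flattening rule documented above can be sketched recursively in plain
# Python: nested dict keys are joined with `separator`, list elements use their
# index as the key, plain strings become a {"value": 1} entry, and None is
# replaced by `none_tag`. Illustrative only; datetime/image handling and type
# checks are omitted.
def _sketch_flatten(value, separator='.', none_tag='__none__', prefix=''):
    out = {}
    if isinstance(value, dict):
        for k, v in value.items():
            key = prefix + separator + str(k) if prefix else str(k)
            out.update(_sketch_flatten(v, separator, none_tag, key))
    elif isinstance(value, (list, tuple)):
        for i, v in enumerate(value):
            key = prefix + separator + str(i) if prefix else str(i)
            out.update(_sketch_flatten(v, separator, none_tag, key))
    elif isinstance(value, str):
        out[prefix + separator + value if prefix else value] = 1
    elif value is None:
        out[prefix + separator + none_tag if prefix else none_tag] = 1
    else:
        out[prefix] = value
    return out

print(_sketch_flatten({'a': {'b': 3}, 'c': 2}))   # {'a.b': 3, 'c': 2}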
from graphlab.toolkits._internal_utils import _toolkit_repr_print, \ _toolkit_get_topk_bottomk, \ _raise_error_evaluation_metric_is_valid, \ _summarize_coefficients from graphlab.toolkits._model_workflow import _collect_model_workflow from graphlab.toolkits._model import _get_default_options_wrapper _DEFAULT_SOLVER_OPTIONS = { 'convergence_threshold': 1e-2, 'max_iterations': 10, 'lbfgs_memory_level': 11, } get_default_options = _get_default_options_wrapper( 'classifier_svm', 'svm_classifier', 'SVMClassifier') def create(dataset, target, features=None, penalty=1.0, solver='auto', feature_rescaling=True, convergence_threshold = _DEFAULT_SOLVER_OPTIONS['convergence_threshold'], lbfgs_memory_level = _DEFAULT_SOLVER_OPTIONS['lbfgs_memory_level'], max_iterations = _DEFAULT_SOLVER_OPTIONS['max_iterations'], class_weights = None, validation_set = 'auto', verbose=True): """ Create a :class:`~graphlab.svm_classifier.SVMClassifier` to predict the class of a binary target variable based on a model of which side of a hyperplane the example falls on. In addition to standard numeric and categorical types, features
class NGramCounter(Transformer): ''' __init__(self, features=None, excluded_features=None, n=2, method="word", to_lower=True, ignore_punct=True, ignore_space=True, delimiters=["\\\\r", "\\\\v", "\\\\n", "\\\\f", "\\\\t", " ", \ "!", "#", "$", "%", "&", "'", "(", ")", \ "*", "+", ",", "-", ".", "/", ":", ";", \ "<", "=", ">", "?", "@", "[", "\\\\", "]", \ "^", "_", "`", "{", "|", "}", "~"], \ output_column_prefix=None) Transform string/dict/list columns of an SFrame into their respective bag-of-ngrams representation. An ngram is a sequence of n consecutive tokens. NGrams are often used to represent natural text. Text ngrams can be word-based or character-based. To formulate word-based ngrams, a text string is first tokenized into words. An ngram is then a sliding window of n words. For character ngrams, no tokenization is necessary, and the sliding window is taken directly over accepted characters. The output is a dictionary of the count of the number of times each unique ngram appears in the text string. This dictionary is a sparse representation because most of the ngrams do not appear in every single sentence, hence they have a zero count and are not explicitly included in the dictionary. NGramCounter can be applied to all the string-, dictionary-, and list-typed columns in a given SFrame. Its behavior for each supported input column type is as follows. (See :func:`~graphlab.feature_engineering.NGramCounter.transform` for usage examples). * **string** : By default, all letters are first converted to lower case. Then, if computing word ngrams, each string is tokenized by space and puncutation characters. (The user can specify a custom delimiter list, or use Penn tree-bank style tokenization. See input parameter description for details.) If computing character ngrams, then each accepted character is understood to be a token. What is accepted is determined based on the flags `ignore_punct` and `ignore_space`. A dictionary is generated where each key is a sequence of `n` tokens that appears in the input text string, and the value is the number of times the ngram appears. For example, based on default settings, the string "I really like Really fluffy dogs" would generate these 2-gram counts: {'i really': 1, 'really like': 1, 'like really': 1, 'really fluffy': 1, 'fluffy dogs': 1}. The string "aaa..hhh" would generate these character 2-gram counts: {'aa': 2, 'ah': 1, 'hh': 2}. * **dict** : Each (key, value) pair is treated as a string-count pair. The keys are tokenized according to either word or character tokenization methods. Input keys must be strings and input values numeric (integer or float). The output dictionary is a sum of the input values for the ngrams in the key string. For example, under default settings, the input dictionary {'alice bob Bob': 1, 'Alice bob': 2.5} would generate a word 2-gram dictionary of {'alice bob': 3.5, 'bob bob': 1}. * **list** : Each element of the list must be a string, which is tokenized according to the input method and tokenization settings, followed by ngram counting. The behavior is analogous to that of dict-type input, where the count of each list element is taken to be 1. For example, under the default settings, an input list of ['alice bob Bob', 'Alice bob'] generates an output word 2-gram dictionary of {'alice bob': 2, 'bob bob': 1}. Parameters ---------- features : list[str] | str | None, optional Name(s) of feature column(s) to be transformed. If set to None, then all feature columns are used. 
excluded_features : list[str] | str | None, optional Name(s) of feature columns in the input dataset to be ignored. Either `excluded_features` or `features` can be passed, but not both. n : int, optional The number of words in each n-gram. An ``n`` value of 1 returns word counts. method : {'word', 'character'}, optional If "word", the function performs a count of word n-grams. If "character", does a character n-gram count. to_lower : bool, optional If True, all strings are converted to lower case before counting. ignore_punct : bool, optional If method is "character", indicates if *punctuations* between words are counted as part of the n-gram. For instance, with the input SArray element of "fun.games", if this parameter is set to False one tri-gram would be 'n.g'. If ``ignore_punct`` is set to True, there would be no such tri-gram (there would still be 'nga'). This parameter has no effect if the method is set to "word". ignore_space : bool, optional If method is "character", indicates if *spaces* between words are counted as part of the n-gram. For instance, with the input SArray element of "fun games", if this parameter is set to False one tri-gram would be 'n g'. If ``ignore_space`` is set to True, there would be no such tri-gram (there would still be 'nga'). This parameter has no effect if the method is set to "word". delimiters: list[string], optional A list of delimiter characters for tokenization. By default, the list is defined to be the list of space and punctuation characters. The user can define any custom list of single-character delimiters. Alternatively, setting `delimiters=None` will use a Penn treebank type tokenization, which is better at handling punctuations. (See reference below for details.) output_column_prefix : str, optional The prefix to use for the column name of each transformed column. When provided, the transformation will add columns to the input data, where the new name is "`output_column_prefix`.original_column_name". If `output_column_prefix=None` (default), then the output column name is the same as the original feature column name. Returns ------- out : NGramCounter A NGramCounter feature engineering object which is initialized with the defined parameters. Notes ----- If the SFrame to be transformed already contains a column with the designated output column name, then that column will be replaced with the new output. In particular, this means that `output_column_prefix=None` will overwrite the original feature columns. A bag-of-words representation is essentially an ngram where `n=1`. Larger `n` generates more unique ngrams. Therefore the output dictionary will be more sparse, contain more unique keys, and will be more expensive to compute. Calling this function with large values `n` (larger than 3 or 4) should be done very carefully. References ---------- - `N-gram wikipedia article <http://en.wikipedia.org/wiki/N-gram>`_ - `Penn treebank tokenization <https://www.cis.upenn.edu/~treebank/tokenization.html>`_ See Also -------- graphlab.toolkits.text_analytics.count_ngrams, graphlab.toolkits.feature_engineering._ngram_counter.WordCounter, graphlab.toolkits.feature_engineering._tfidf.TFIDF, graphlab.toolkits.feature_engineering._tokenizer.Tokenizer, graphlab.toolkits.feature_engineering.create Examples -------- .. sourcecode:: python import graphlab as gl # Create data. >>> sf = gl.SFrame({ ... 'string': ['sent.ences Sent.ences', 'another sentence'], ... 'dict': [{'alice bob': 1, 'Bob alice': 0.5}, {'a dog': 0, 'a dog cat': 5}], ... 
'list': [['one', 'bar bah'], ['a dog', 'a dog cat']]}) # Create a NGramCounter transformer. >>> from graphlab.toolkits.feature_engineering import NGramCounter >>> encoder = NGramCounter() # Save the transformer. >>> encoder.save('save-path') # Fit and transform the data. >>> transformed_sf = encoder.fit_transform(sf) Columns: dict dict list dict string dict Rows: 2 Data: +------------------------------------+----------------------------+ | dict | list | +------------------------------------+----------------------------+ | {'bob alice': 0.5, 'alice bob': 1} | {'bar bah': 1} | | {'dog cat': 5, 'a dog': 5} | {'dog cat': 1, 'a dog': 2} | +------------------------------------+----------------------------+ +------------------------------------+ | string | +------------------------------------+ | {'sent ences': 2, 'ences sent': 1} | | {'another sentence': 1} | +------------------------------------+ [2 rows x 3 columns] # Penn treebank-style tokenization (recommended for smarter handling # of punctuations) >>> sf = gl.SFrame({'string': ['sentence $$one', 'sentence two...']}) >>> NGramCounter(delimiters=None).fit_transform(sf) Columns: string dict Rows: 2 Data: +-------------------------------------------+ | string | +-------------------------------------------+ | {'sentence $': 1, '$ $': 1, '$ one': 1} | | {'sentence two': 1, '. .': 2, 'two .': 1} | +-------------------------------------------+ [2 rows x 1 columns] # Character n-grams >>> sf = gl.SFrame({'string': ['aa$bb.', ' aa bb ']}) >>> NGramCounter(method='character').fit_transform(sf) Columns: string dict Rows: 2 Data: +-----------------------------+ | string | +-----------------------------+ | {'aa': 1, 'ab': 1, 'bb': 1} | | {'aa': 1, 'ab': 1, 'bb': 1} | +-----------------------------+ [2 rows x 1 columns] # Character n-grams, not skipping over spaces or punctuations >>> sf = gl.SFrame({'string': ['aa$bb.', ' aa bb ']}) >>> encoder = NGramCounter(method='character', ignore_punct=False, ignore_space=False) >>> encoder.fit_transform(sf) Columns: string dict Rows: 2 Data: +-----------------------------------------------------------------+ | string | +-----------------------------------------------------------------+ | {'aa': 1, 'b.': 1, '$b': 1, 'a$': 1, 'bb': 1} | | {' b': 1, 'aa': 1, ' ': 1, ' a': 1, 'b ': 1, 'bb': 1, 'a ': 1} | +-----------------------------------------------------------------+ [2 rows x 1 columns] ''' # Doc strings _fit_examples_doc = _fit_examples_doc _fit_transform_examples_doc = _fit_transform_examples_doc _transform_examples_doc = _transform_examples_doc # Default options get_default_options = staticmethod(_get_default_options_wrapper( '_NGramCounter', 'toolkits.feature_engineering._ngram_counter', 'NGramCounter', True)) def __init__(self, features=None, excluded_features=None, n=2, method="word", to_lower=True, ignore_punct=True, ignore_space=True, delimiters=["\r", "\v", "\n", "\f", "\t", " ", "!", "#", "$", "%", "&", "'", "(", ")", "*", "+", ",", "-", ".", "/", ":", ";", "<", "=", ">", "?", "@", "[", "\\", "]", "^", "_", "`", "{", "|", "}", "~"], output_column_prefix=None): # Process and make a copy of the features, exclude. 
        _features, _exclude = _internal_utils.process_features(features, excluded_features)

        # Type checking
        _raise_error_if_not_of_type(features, [list, str, _NoneType])
        _raise_error_if_not_of_type(excluded_features, [list, str, _NoneType])
        _raise_error_if_not_of_type(n, [int])
        _raise_error_if_not_of_type(method, [str])
        _raise_error_if_not_of_type(to_lower, [bool])
        _raise_error_if_not_of_type(ignore_punct, [bool])
        _raise_error_if_not_of_type(ignore_space, [bool])
        _raise_error_if_not_of_type(delimiters, [list, _NoneType])
        _raise_error_if_not_of_type(output_column_prefix, [str, _NoneType])

        if delimiters is not None:
            for delim in delimiters:
                _raise_error_if_not_of_type(delim, str, "delimiters")
                if len(delim) != 1:
                    raise ValueError("Delimiters must be single-character strings")

        if n < 1:
            raise ValueError("Input 'n' must be greater than 0")

        if n > 5 and method == 'word':
            warnings.warn("It is unusual for n-grams to be of size larger than 5.")

        if method != "word" and method != "character":
            raise ValueError("Invalid 'method' input value. Please input "
                             "either 'word' or 'character'")

        # Set up options
        opts = {
            'n': n,
            'features': features,
            'ngram_type': method,
            'to_lower': to_lower,
            'ignore_punct': ignore_punct,
            'ignore_space': ignore_space,
            'delimiters': delimiters,
            'output_column_prefix': output_column_prefix
        }
        if _exclude:
            opts['exclude'] = True
            opts['features'] = _exclude
        else:
            opts['exclude'] = False
            opts['features'] = _features

        # Initialize object
        proxy = _gl.extensions._NGramCounter()
        proxy.init_transformer(opts)
        super(NGramCounter, self).__init__(proxy, self.__class__)

    def _get_summary_struct(self):
        _features = _precomputed_field(
            _internal_utils.pretty_print_list(self.get('features')))
        fields = [
            ("NGram length", 'n'),
            ("NGram type (word or character)", 'ngram_type'),
            ("Convert strings to lower case", 'to_lower'),
            ("Ignore punctuation in character ngram", 'ignore_punct'),
            ("Ignore space in character ngram", 'ignore_space'),
            ("Delimiters", "delimiters"),
            ("Features", _features),
            ("Output column prefix", 'output_column_prefix')
        ]
        section_titles = ['Model fields']
        return ([fields], section_titles)

    def __repr__(self):
        (sections, section_titles) = self._get_summary_struct()
        return _toolkit_repr_print(self, sections, section_titles, 30)

    @classmethod
    def _get_instance_and_data(self):
        sf = _gl.SFrame(
            {'docs': [{'this': 1, 'is': 1, 'a': 2, 'sample': 1},
                      {'this': 1, 'is': 1, 'another': 2, 'example': 3}]})
        encoder = _gl.feature_engineering.NGramCounter('docs')
        encoder = encoder.fit(sf)
        return encoder, sf
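# --------------------------------------------------------------------------
# Illustration only, not part of the toolkit: a minimal pure-Python sketch of
# the word bag-of-ngrams counting described in the NGramCounter docstring
# above -- lower-case the text, tokenize on the default delimiter set, then
# count a sliding window of n tokens. The helper name and the simplified
# delimiter handling are hypothetical; character ngrams and the
# ignore_punct/ignore_space flags are not reproduced here.
def _word_ngram_counts_sketch(text, n=2,
                              delimiters=" \t\n\r\v\f!#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"):
    tokens, current = [], []
    for ch in text.lower():
        if ch in delimiters:
            if current:
                tokens.append(''.join(current))
                current = []
        else:
            current.append(ch)
    if current:
        tokens.append(''.join(current))

    counts = {}
    for i in range(len(tokens) - n + 1):
        key = ' '.join(tokens[i:i + n])
        counts[key] = counts.get(key, 0) + 1
    return counts

# Example, matching the 2-gram behavior described in the docstring:
# _word_ngram_counts_sketch("I really like Really fluffy dogs")
# -> {'i really': 1, 'really like': 1, 'like really': 1,
#     'really fluffy': 1, 'fluffy dogs': 1}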
        >>> sf = graphlab.SFrame({'a': [0.1, 8, 3.5], 'b': [-3, 7.6, 3]})
        >>> model = graphlab.kmeans.create(sf, 2)
        >>> model.get_current_options()
        {'num_clusters': 2, 'max_iterations': 10}
        """
        _mt._get_metric_tracker().track('toolkit.kmeans.get_current_options')
        opts = {'model': self.__proxy__,
                'model_name': self.__name__}
        return _graphlab.toolkits._main.run(
            'kmeans_get_current_options', opts)


get_default_options = _get_default_options_wrapper(
    'kmeans', 'kmeans', 'KmeansModel')


def create(dataset, num_clusters=None, features=None, initial_centers=None,
           max_iterations=10, batch_size=None, verbose=True):
    r"""
    Run the k-means++ clustering algorithm, returning a KmeansModel object
    that contains the cluster centers and the cluster assignment for each
    data point in the dataset.

    Given a number of clusters, k-means++ iteratively chooses the best
    cluster centers and assigns nearby points to the best cluster. If no
    points change cluster membership between iterations, the algorithm
    terminates.

    Parameters
class Tokenizer(Transformer): ''' __init__(features=None, excluded_features=None, to_lower=False, delimiters=["\\\\r", "\\\\v", "\\\\n", "\\\\f", "\\\\t", " "], output_column_prefix=None) Tokenizing is a method of breaking natural language text into its smallest standalone and meaningful components (in English, usually space-delimited words, but not always). By default, Tokenizer tokenizes strings by space characters. The user may specify a customized list of delimiters, or use Penn treebank-style tokenization. .. warning:: The default tokenization setting is now different from that of GraphLab Create v1.6. The old default was Penn treebank-style tokenization. (This is still available by setting `delimiters=None`.) The current default is to tokenize by space characters. Parameters ---------- features : list[str] | str | None, optional Name(s) of feature column(s) to be transformed. If set to None, then all feature columns are used. excluded_features : list[str] | str | None, optional Name(s) of feature columns in the input dataset to be ignored. Either `excluded_features` or `features` can be passed, but not both. to_lower : bool, optional Indicates whether to map the input strings to lower case before counting. delimiters: list[string], optional A list of delimiter characters for tokenization. By default, the list is defined to be the list of space characters. The user can define any custom list of single-character delimiters. Alternatively, setting `delimiters=None` will use a Penn treebank-style tokenization that separates individual punctuation marks and detects positive and negative real numbers, phone numbers with no spaces, urls, and emails. The Penn treebank-style tokenization also attempts to separate contractions and possessives. For instance, "don't" would be tokenized as ["do", "n\'t"]. output_column_prefix : str, optional The prefix to use for the column name of each transformed column. When provided, the transformation will add columns to the input data, where the new name is "`output_column_prefix`.original_column_name". If `output_column_prefix=None` (default), then the output column name is the same as the original feature column name. Returns ------- out : Tokenizer A Tokenizer object which is initialized with the defined parameters. Notes ----- This implementation of Tokenizer applies regular expressions to the natural language text to capture a high-recall set of valid text patterns. If the SFrame to be transformed already contains a column with the designated output column name, then that column will be replaced with the new output. In particular, this means that `output_column_prefix=None` will overwrite the original feature columns. References ---------- - `Penn treebank tokenization <https://www.cis.upenn.edu/~treebank/tokenization.html>`_ See Also -------- graphlab.toolkits.text_analytics.tokenize, graphlab.toolkits.feature_engineering._word_counter.WordCounter, graphlab.toolkits.feature_engineering._ngram_counter.NGramCounter, graphlab.toolkits.feature_engineering._tfidf.TFIDF, graphlab.toolkits.feature_engineering.create Examples -------- .. sourcecode:: python >>> import graphlab >>> from graphlab.toolkits.feature_engineering import * # Create a sample dataset >>> sf = graphlab.SFrame({ ... 'docs': ["This is a document!", "This one's also a document."]}) # Construct a tokenizer with default options. >>> tokenizer = Tokenizer() # Transform the data using the tokenizer. 
>>> tokenized_sf = tokenizer.fit_transform(sf) >>> tokenized_sf Columns: docs list Rows: 2 Data: +-----------------------------------+ | docs | +-----------------------------------+ | [This, is, a, document!] | | [This, one's, also, a, document.] | +-----------------------------------+ [2 rows x 1 columns] # Convert to lower case and use Penn treebank-style tokenization. >>> ptb_tokenizer = Tokenizer(to_lower=True, delimiters=None) >>> tokenized_sf = ptb_tokenizer.fit_transform(sf) >>> tokenized_sf Columns: docs list Rows: 2 Data: +---------------------------------------+ | docs | +---------------------------------------+ | [this, is, a, document, !] | | [this, one, 's, also, a, document, .] | +---------------------------------------+ [2 rows x 1 columns] # Tokenize only a single column 'docs'. >>> tokenizer = Tokenizer(features = ['docs']) >>> tokenizer['features'] ['docs'] # Tokenize all columns except 'docs'. >>> tokenizer = Tokenizer(excluded_features = ['docs']) >>> tokenizer['features'] # `features` are set to `None` ''' _fit_examples_doc = _fit_examples_doc _transform_examples_doc = _transform_examples_doc _fit_transform_examples_doc = _fit_transform_examples_doc get_default_options = staticmethod( _get_default_options_wrapper( '_Tokenizer', 'toolkits.feature_engineering._tokenizer', 'Tokenizer', True)) def __init__(self, features=None, excluded_features=None, to_lower=False, delimiters=["\r", "\v", "\n", "\f", "\t", " "], output_column_prefix=None): # Process and make a copy of the features, exclude. _features, _exclude = _internal_utils.process_features( features, excluded_features) # Type checking _raise_error_if_not_of_type(features, [list, str, _NoneType]) _raise_error_if_not_of_type(excluded_features, [list, str, _NoneType]) _raise_error_if_not_of_type(to_lower, [bool]) _raise_error_if_not_of_type(delimiters, [list, _NoneType]) _raise_error_if_not_of_type(output_column_prefix, [str, _NoneType]) if delimiters != None: for delim in delimiters: _raise_error_if_not_of_type(delim, str, "delimiters") if (len(delim) != 1): raise ValueError( "Delimiters must be single-character strings") # Set up options opts = { 'features': features, 'to_lower': to_lower, 'delimiters': delimiters, 'output_column_prefix': output_column_prefix } if _exclude: opts['exclude'] = True opts['features'] = _exclude else: opts['exclude'] = False opts['features'] = _features # Initialize object proxy = _gl.extensions._Tokenizer() proxy.init_transformer(opts) super(Tokenizer, self).__init__(proxy, self.__class__) def _get_summary_struct(self): """ Returns a structured description of the model, including (where relevant) the schema of the training data, description of the training data, training statistics, and model hyperparameters. Returns ------- sections : list (of list of tuples) A list of summary sections. Each section is a list. Each item in a section list is a tuple of the form: ('<label>','<field>') section_titles: list A list of section titles. The order matches that of the 'sections' object. 
""" _features = _precomputed_field( _internal_utils.pretty_print_list(self.get('features'))) fields = [("Features", _features), ("Convert strings to lower case", 'to_lower'), ("Delimiters", "delimiters"), ("Output column prefix", 'output_column_prefix')] section_titles = ['Model fields'] return ([fields], section_titles) def __repr__(self): (sections, section_titles) = self._get_summary_struct() return _toolkit_repr_print(self, sections, section_titles, width=30) @classmethod def _get_instance_and_data(self): sf = _gl.SFrame({'docs': ["this is a test", "this is another test"]}) encoder = _gl.feature_engineering.Tokenizer('docs') return encoder.fit(sf), sf
    opts = {'dataset': observation_data,
            'user_id': user_id,
            'item_id': item_id,
            'target': target,
            'user_data': user_data,
            'item_data': item_data,
            'nearest_items': _graphlab.SFrame(),
            'model': model_proxy,
            'random_seed': 1}

    response = _graphlab.toolkits._main.run('recsys_train', opts, verbose)
    return PopularityRecommender(response['model'])


get_default_options = _get_default_options_wrapper(
    'popularity', 'recommender.popularity_recommender', 'PopularityRecommender')


class PopularityRecommender(_Recommender):
    """
    The Popularity Model ranks an item according to its overall popularity.

    When making recommendations, each item is scored by the number of times
    it is seen in the training set. The item scores are the same for all
    users, hence the recommendations are not tailored to individuals.

    The Popularity Recommender is simple and fast and provides a reasonable
    baseline. It can work well when observation data is sparse. It can be
    used as a "background" model for new users.
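    As a plain-Python illustration of the scoring rule described above (a
    sketch only, not the toolkit implementation), item scores are simply
    occurrence counts over the training observations, so every user receives
    the same ranking:

    .. sourcecode:: python

        >>> from collections import Counter
        >>> observed_items = ['a', 'b', 'a', 'b', 'a']   # hypothetical toy data
        >>> Counter(observed_items).most_common()
        [('a', 3), ('b', 2)]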
_BOOSTED_TREES_MODEL_PARAMS_KEYS = [
    'step_size', 'max_depth', 'max_iterations', 'min_child_weight',
    'min_loss_reduction', 'row_subsample'
]
_BOOSTED_TREE_TRAINING_PARAMS_KEYS = [
    'objective', 'training_time', 'training_error', 'validation_error',
    'evaluation_metric'
]
_BOOSTED_TREE_TRAINING_DATA_PARAMS_KEYS = [
    'target', 'features', 'num_features', 'num_examples',
    'num_validation_examples'
]

get_default_options = _get_default_options_wrapper('boosted_trees_regression',
                                                   'boosted_trees_regression',
                                                   'BoostedTreesRegression')


class BoostedTreesRegression(_SupervisedLearningModel, _TreeModelMixin):
    """
    Encapsulates gradient boosted trees for regression tasks.

    The prediction is based on a collection of base learners, `regression trees
    <http://en.wikipedia.org/wiki/Decision_tree_learning>`_. Unlike linear
    models such as linear regression, the gradient boosted trees model is able
    to capture non-linear interactions between the features and the target by
    using decision trees as the base learners. It is good for handling
    numerical features and categorical features with
if _sys.version_info.major == 3:
    _izip = zip
    _xrange = range
else:
    from itertools import izip as _izip
    _xrange = xrange

import operator as _operator
import array as _array

from graphlab.toolkits._model import _get_default_options_wrapper

get_default_options = _get_default_options_wrapper(
    'cgs_topic_model', 'topic_model', 'TopicModel')


def create(dataset, num_topics=10, initial_topics=None, alpha=None, beta=.1,
           num_iterations=10, num_burnin=5, associations=None, verbose=False,
           print_interval=10, validation_set=None, method='auto'):
    """
from graphlab.toolkits.text_analytics._util import _check_input
from graphlab.toolkits.text_analytics._util import random_split as _random_split
from graphlab.toolkits._internal_utils import (
    _check_categorical_option_type,
    _map_unity_proxy_to_object,
    _precomputed_field,
    _toolkit_repr_print,
)
from graphlab.toolkits._model_workflow import _collect_model_workflow
from itertools import izip as _izip
import array as _array

from graphlab.toolkits._model import _get_default_options_wrapper

get_default_options = _get_default_options_wrapper("cgs_topic_model", "topic_model", "TopicModel")


def create(
    dataset,
    num_topics=10,
    initial_topics=None,
    alpha=None,
    beta=0.1,
    num_iterations=10,
    num_burnin=5,
    associations=None,
    verbose=False,
    print_interval=10,
    validation_set=None,
    method="auto",