def __init__(self, features=None, excluded_features=None,
             max_categories=None, output_column_name='encoded_features'):
    # Process and make a copy of the features, exclude.
    _features, _exclude = _internal_utils.process_features(
        features, excluded_features)

    # Type checking
    _raise_error_if_not_of_type(max_categories, [int, type(None)])
    _raise_error_if_not_of_type(output_column_name, [str])

    # Set up options
    opts = {
        'max_categories': max_categories,
        'output_column_name': output_column_name,
    }
    if _exclude:
        opts['exclude'] = True
        opts['features'] = _exclude
    else:
        opts['exclude'] = False
        opts['features'] = _features

    # Initialize object
    proxy = _gl.extensions._OneHotEncoder()
    proxy.init_transformer(opts)
    super(OneHotEncoder, self).__init__(proxy, self.__class__)
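# A minimal usage sketch for the encoder above (hypothetical data; assumes
# the class is exported as graphlab.feature_engineering.OneHotEncoder, as
# elsewhere in this toolkit):
#
# >>> import graphlab as gl
# >>> sf = gl.SFrame({'gender': ['M', 'F', 'F'], 'degree': ['BS', 'MS', 'BS']})
# >>> encoder = gl.feature_engineering.OneHotEncoder(max_categories=10)
# >>> encoded = encoder.fit_transform(sf)  # adds an 'encoded_features' column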
def __init__(self, features=None, excluded_features=None, num_bits=18,
             output_column_name='hashed_features'):
    # Process and make a copy of the features, exclude.
    _features, _exclude = _internal_utils.process_features(
        features, excluded_features)

    # Type checking
    _raise_error_if_not_of_type(num_bits, [int])
    _raise_error_if_not_of_type(output_column_name, [str])

    # Set up options
    opts = {
        'num_bits': num_bits,
        'output_column_name': output_column_name,
    }
    if _exclude:
        opts['exclude'] = True
        opts['features'] = _exclude
    else:
        opts['exclude'] = False
        opts['features'] = _features

    # Initialize object
    proxy = _gl.extensions._FeatureHasher()
    proxy.init_transformer(opts)
    super(FeatureHasher, self).__init__(proxy, self.__class__)
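# A minimal usage sketch for the hasher above (hypothetical data; assumes
# the class is exported as graphlab.feature_engineering.FeatureHasher):
#
# >>> import graphlab as gl
# >>> sf = gl.SFrame({'city': ['NYC', 'SF', 'NYC'], 'clicks': [3, 1, 4]})
# >>> hasher = gl.feature_engineering.FeatureHasher(num_bits=18)
# >>> hashed = hasher.fit_transform(sf)  # adds a 'hashed_features' column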
def query(self, query, num_results=10, expansion_k=5, expansion_epsilon=0.1,
          expansion_near_match_weight=0.5):
    """
    Search for text.

    Parameters
    ----------
    query : str
        A string of text.

    num_results : int
        The number of results to return.

    expansion_k : int
        Maximum number of nearest words to include from each query token.

    expansion_epsilon : float
        Maximum distance to allow between a query token and a nearby word
        when doing query expansion. Must be between 0 and 1.

    expansion_near_match_weight : float
        Multiplier to use on BM25 scores for documents indexed via an
        approximate match with a given token. This will be used for each of
        the `expansion_k` words that are considered an approximate match.
        Must be between 0 and 1.

    Returns
    -------
    out : SFrame
        The rows of the original SFrame along with a `score` column which
        contains the BM25 score between this query and the row.

    Examples
    --------
    >>> import graphlab as gl
    >>> sf = gl.SFrame({'text': ['Hello my friend', 'I love this burrito']})
    >>> s = gl.search.create(sf, features=['text'])
    >>> s.query('burrito')
    """
    if _sys.version_info.major == 2:
        _raise_error_if_not_of_type(query, [str, unicode])
    else:
        _raise_error_if_not_of_type(query, [str])

    q = query.split(' ')
    results = self.__proxy__.query_index(
        q,
        expansion_k=expansion_k,
        expansion_epsilon=expansion_epsilon,
        expansion_near_match_weight=expansion_near_match_weight)
    results = self.__proxy__.join_query_result(
        results, method='default', num_results=num_results)
    return results
def __init__(self, features=None, excluded_features=None,
             output_column_prefix=None, verbose=True):
    self._setup()
    _features, _exclude = process_features(features, excluded_features)

    # Type check
    _raise_error_if_not_of_type(output_column_prefix, [str, NoneType])
    _raise_error_if_not_of_type(verbose, [bool])

    state = {}
    state['output_column_prefix'] = output_column_prefix
    state['features'] = _features
    state['excluded_features'] = _exclude
    state['fitted'] = False
    state['verbose'] = verbose

    if _exclude:
        self._exclude = True
        self._features = _exclude
    else:
        self._exclude = False
        self._features = _features

    self.__proxy__.update(state)
def __init__(self, features=None, excluded_features=None, separator = ".", none_tag = "__none__", output_column_prefix = None): # Process and make a copy of the features, exclude. _features, _exclude = _internal_utils.process_features(features, excluded_features) # Type checking _raise_error_if_not_of_type(output_column_prefix, [str, type(None)]) if output_column_prefix is None: output_column_prefix = '' opts = { 'separator' : separator, 'none_tag' : none_tag, 'output_column_prefix' : output_column_prefix } if _exclude: opts['exclude'] = True opts['features'] = _exclude else: opts['exclude'] = False opts['features'] = _features # Initialize object proxy = _gl.extensions._TransformToFlatDictionary() proxy.init_transformer(opts) super(TransformToFlatDictionary, self).__init__(proxy, self.__class__)
def get_prediction_score(self, node_id):
    """
    Return the prediction score if the node is a leaf node, or None if it
    is an intermediate node.

    Parameters
    ----------
    node_id : int
        Id of the node for which to get the prediction value.

    Returns
    -------
    out : float or None
        The prediction value if the node is a leaf node, None otherwise.

    Examples
    --------
    .. sourcecode:: python

        >>> tree.get_prediction_score(120)  # Leaf node
        0.251092

        >>> tree.get_prediction_score(11)   # Not a leaf node
        None
    """
    _raise_error_if_not_of_type(node_id, [int, long], "node_id")
    _numeric_param_check_range("node_id", node_id, 0, self.num_nodes - 1)
    node = self.nodes[node_id]
    return node.value if node.is_leaf else None
def _validate_job_create_args(function, name, environment):
    """
    Validate the arguments for job.create and map_job.create.
    """
    __LOGGER__.info("Validating job.")
    _raise_error_if_not_of_type(
        environment, [type(None), str, _environment._Environment],
        'environment')
    _raise_error_if_not_of_type(name, [type(None), str], 'name')

    if name is not None and not _job_name_checker.match(name):
        raise ValueError(
            'Job name can only contain digits, characters, "-" and "_".')

    # Set up the environment.
    if not environment:
        try:
            environment = _gl.deploy.environments['async']
        except KeyError:
            __LOGGER__.info("Creating a LocalAsync environment called 'async'.")
            try:
                environment = _environment.LocalAsync('async')
            except KeyError:
                environment = _gl.deploy.environments['async']
    else:
        if isinstance(environment, str):
            __LOGGER__.debug("Loading environment: %s" % environment)
            environment = _gl.deploy.environments[environment]

    # Clone to prevent the user's environment from reflecting changes.
    return function, name, environment
def __init__(self, features=None, excluded_features=None,
             strategy='logarithmic', num_bins=10, output_column_prefix=None):
    # Process and make a copy of the features, exclude.
    _features, _exclude = _internal_utils.process_features(
        features, excluded_features)

    # Type checking
    _raise_error_if_not_of_type(num_bins, [int])
    _raise_error_if_not_of_type(strategy, [str])

    # Set up options
    opts = {
        'strategy': strategy,
        'num_bins': num_bins,
        'output_column_prefix': output_column_prefix
    }
    if _exclude:
        opts['exclude'] = True
        opts['features'] = _exclude
    else:
        opts['exclude'] = False
        opts['features'] = _features

    # Initialize object
    proxy = _gl.extensions._FeatureBinner()
    proxy.init_transformer(opts)
    super(FeatureBinner, self).__init__(proxy, self.__class__)
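# A minimal usage sketch for the binner above (hypothetical data; assumes
# the class is exported as graphlab.feature_engineering.FeatureBinner):
#
# >>> import graphlab as gl
# >>> sf = gl.SFrame({'age': [18, 35, 72], 'income': [25000, 64000, 31000]})
# >>> binner = gl.feature_engineering.FeatureBinner(strategy='logarithmic',
# ...                                               num_bins=10)
# >>> binned = binner.fit_transform(sf)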
def __init__(self, features=None, excluded_features=None, threshold=1,
             output_category_name=None, output_column_prefix=None):
    # Process and make a copy of the features, exclude.
    _features, _exclude = _internal_utils.process_features(
        features, excluded_features)

    # Type checking
    _raise_error_if_not_of_type(threshold, [int, type(None)])

    # Set up options
    opts = {
        'threshold': threshold,
        'output_category_name': output_category_name,
        'output_column_prefix': output_column_prefix
    }
    if _exclude:
        opts['exclude'] = True
        opts['features'] = _exclude
    else:
        opts['exclude'] = False
        opts['features'] = _features

    # Initialize object
    proxy = _gl.extensions._CountThresholder()
    proxy.init_transformer(opts)
    super(CountThresholder, self).__init__(proxy, self.__class__)
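# A minimal usage sketch for the thresholder above (hypothetical data;
# assumes the class is exported as
# graphlab.feature_engineering.CountThresholder). Categories that occur
# fewer than `threshold` times are mapped to `output_category_name`:
#
# >>> import graphlab as gl
# >>> sf = gl.SFrame({'word': ['cat', 'cat', 'dog', 'axolotl']})
# >>> thresholder = gl.feature_engineering.CountThresholder(
# ...     threshold=2, output_category_name='rare')
# >>> out = thresholder.fit_transform(sf)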
def __init__(self, features=None, excluded_features=None,
             output_column_prefix=None, transform_function=lambda x: x,
             transform_function_name="none"):
    self._setup()

    # Process and make a copy of the features, exclude.
    _features, _exclude = _internal_utils.process_features(
        features, excluded_features)

    # Type check
    _raise_error_if_not_of_type(output_column_prefix, [str, type(None)])

    state = {}
    state['output_column_prefix'] = output_column_prefix
    state['features'] = _features
    state['excluded_features'] = _exclude
    state['fitted'] = False
    state['transform_function'] = transform_function
    state['transform_function_name'] = transform_function_name

    if _exclude:
        self._exclude = True
        self._features = _exclude
    else:
        self._exclude = False
        self._features = _features

    self.__proxy__.update(state)
def fit(self, data):
    """
    Fits a transformer using the SFrame `data`. The `fit` phase does not
    train a deep learning model; it only checks that the trained model is
    compatible with the data provided. If the `auto` model is chosen, then
    the fit phase chooses the right model to extract features with.

    Parameters
    ----------
    data : SFrame
        The data used to fit the transformer.

    Returns
    -------
    self (A fitted object)

    See Also
    --------
    transform, fit_transform

    Examples
    --------
    >>> import graphlab as gl

    # Import data from MNIST
    >>> data = gl.SFrame('http://s3.amazonaws.com/dato-datasets/mnist/sframe/train6k')

    # Create a DeepFeatureExtractor object
    >>> extractor = gl.feature_engineering.DeepFeatureExtractor(features='image')

    # Fit the encoder for a given dataset.
    >>> extractor = extractor.fit(data)

    # Return the model used for the deep feature extraction.
    >>> extractor['model']
    """
    _mt._get_metric_tracker().track(self.__class__.__module__ + '.fit')

    # Check that the column is in the SFrame.
    _raise_error_if_not_of_type(data, [_SFrame])
    _raise_error_if_column_exists(data, self._state["features"])

    # Make sure the output column name does not collide with an existing
    # column.
    count = 1
    old_output_column_name = self._state["output_column_name"]
    output_column_name = old_output_column_name
    while output_column_name in data.column_names():
        output_column_name = "%s.%s" % (old_output_column_name, count)
        count = count + 1
    self._state["output_column_name"] = output_column_name

    if data[self._state["features"]].dtype() != _Image:
        raise ToolkitError(
            "Feature `%s` must be of type Image." % self._state["features"])
    return self
def item_history_count(self, item_id):
    '''
    Returns the number of interactions for the given item.
    '''
    _raise_error_if_not_of_type(item_id, self._allowed_item_types, 'item_id')
    item = _gl.SFrame({self.item_id_column: [item_id]})
    return item.join(self._item_counts, on=self.item_id_column)
def user_history_count(self, user_id):
    '''
    Returns the number of interactions for the given user.
    '''
    _raise_error_if_not_of_type(user_id, self._allowed_user_types, 'user_id')
    user = _gl.SFrame({self.user_id_column: [user_id]})
    return user.join(self._user_counts, on=self.user_id_column)
def user_details(self, user_id):
    '''
    Returns the row of the user table given a user id.
    '''
    _raise_error_if_not_of_type(user_id, self._allowed_user_types, 'user_id')
    user = _gl.SFrame({self.user_id_column: [user_id]})
    return user.join(self._users, on=self.user_id_column)
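# A usage sketch for the three join-based lookups above (hypothetical ids;
# assumes the model was created from interaction data with user and item id
# columns):
#
# >>> model.user_details('u_42')         # row of the user table
# >>> model.user_history_count('u_42')   # interaction count for this user
# >>> model.item_history_count('i_7')    # interaction count for this item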
def _set_one_input(self, name='input', value=None, from_task=None,
                   delete=False):
    """
    Set/update an input for this Task.

    Parameters
    ----------
    name : str
        Name for this input. This is how the code refers to this input at
        runtime. Default is 'input'.

    value : obj (supported by GL Pickle)
        Value for the object referred to using 'name'.

    from_task : Task | str
        Dependent Task to set as input. Tasks can be referred to either by
        name or by reference. For example, if the following is specified:

        >>> task._set_one_input(name='in', from_task='dep')

        then an input named 'in' will be defined on this Task, which has a
        dependency on the output of the Task named 'dep'.

    delete : bool, optional
        If delete is set to True then the named input is removed.
    """
    _raise_error_if_not_of_type(name, str, "name")
    _raise_error_if_not_of_type(from_task, [type(None), Task], "from_task")

    # Delete the input.
    if delete is True and name in self._data['inputs']:
        del self._data['inputs'][name]
        return self

    # Early binding: set the input directly to a value.
    if from_task is None:
        self._data['inputs'][name] = value
        self._set_dirty_bit()
        return self

    # Late binding: set an input from a task.
    elif isinstance(from_task, Task):
        task = from_task
        self._data['inputs'][name] = task
        self._set_dirty_bit()
        return self
def set_inputs(self, names):
    """
    Set input(s) for this Task. Inputs can be any object that can be
    pickled using GL-Pickle, but cannot come from the output of another
    task. For that, use the set_inputs_from_task function.

    Parameters
    ----------
    names : list [str] | dict [str, obj]
        If a dict is provided, then each key is considered a name for an
        input in this Task, and each value is considered the definition of
        the input.

        When a list is provided, then each entry is considered a name for
        an input in this Task, and the value for that slot is set to None.

    Returns
    -------
    self : Task

    See Also
    --------
    set_output

    Examples
    --------
    To define only input names for a task, use a list of strings:

    >>> # For late binding
    >>> t1 = graphlab.deploy._task.Task(my_func, 'set_inputs_ex1')
    >>> t1.set_inputs(['one', 'two', 'three'])

    >>> # For early binding
    >>> t3 = graphlab.deploy._task.Task(my_func, 'set_inputs_ex3')
    >>> t3.set_inputs({
    ...     'b' : 'set_inputs_ex2',
    ...     'c' : 'foo',
    ...     'd' : ('foo', 'bar')})
    """
    if names is None:
        raise TypeError('Names are required while binding two tasks.')
    _raise_error_if_not_of_type(names, [list, dict], 'names')

    if isinstance(names, list):
        for name in set(names):
            self._set_one_input(name=name, delete=False)
    elif isinstance(names, dict):
        for key, value in names.items():
            self._set_one_input(name=key, value=value, delete=False)
    return self
def __init__(self, func, name=None, description=None):
    """
    Create a new Task specifying its name and optionally a description.
    """
    # Must be a function
    _raise_error_if_not_function(func, "func")

    # Set the name
    name = func.__name__ if not name else name
    _raise_error_if_not_of_type(name, str, "name")
    self.name = name

    self._data = dict()
    self._data['code'] = None
    self._data['codestr'] = None
    self._data['inputs'] = dict()
    self._data['output'] = None
    self._data['packages'] = set()
    self._data['description'] = ''
    self._modified_since_last_saved = None

    if description is not None:
        self.set_description(description)

    # Inspect the function.
    specs = _inspect.getargspec(func)
    varargs = specs.varargs
    defaults = _copy.copy(specs.defaults)
    args = _copy.copy(specs.args)

    # Set the code to function arguments + *args + **kwargs
    self.set_code(func)

    # Set the inputs
    all_args = _copy.copy(args)
    if varargs:
        all_args.append(varargs)
    self.set_inputs(all_args)

    # Bind default values
    if defaults:
        for index, arg in enumerate(args[-len(defaults):]):
            self.set_inputs({arg: defaults[index]})

    # Set required packages (func_dict is the Python 2 spelling of
    # __dict__ on functions).
    if _sys.version_info.major == 3:
        func_dict = func.__dict__
    else:
        func_dict = func.func_dict
    if 'required_packages' in func_dict:
        self.set_required_packages(func_dict['required_packages'])
def fit(self, data):
    """
    Fits a transformer using the SFrame `data`. The `fit` phase does not
    train a deep learning model; it only checks that the trained model is
    compatible with the data provided. If the `auto` model is chosen, then
    the fit phase chooses the right model to extract features with.

    Parameters
    ----------
    data : SFrame
        The data used to fit the transformer.

    Returns
    -------
    self (A fitted object)

    See Also
    --------
    transform, fit_transform

    Examples
    --------
    >>> import graphlab as gl

    # Import data from MNIST
    >>> data = gl.SFrame('https://static.turi.com/datasets/mnist/sframe/train6k')

    # Create a DeepFeatureExtractor object
    >>> extractor = gl.feature_engineering.DeepFeatureExtractor(features='image')

    # Fit the encoder for a given dataset.
    >>> extractor = extractor.fit(data)

    # Return the model used for the deep feature extraction.
    >>> extractor['model']
    """
    _mt._get_metric_tracker().track(self.__class__.__module__ + '.fit')

    # Check that the columns are in the SFrame.
    _raise_error_if_not_of_type(data, [_SFrame])
    for feature in self._state["features"]:
        _raise_error_if_column_exists(data, feature)
        if data[feature].dtype() != _Image:
            raise ToolkitError("Feature `%s` must be of type Image." % feature)
    return self
def __init__(self, name, session_aware=True):
    """
    Constructor for base Environment, should not be instantiated directly.
    """
    if not name:
        raise TypeError("Name is required when creating an Environment.")
    _raise_error_if_not_of_type(name, [str, unicode], 'name')

    self._session = _gl.deploy._default_session
    self.name = name
    self._env_type = type(self).__name__
    self._modified_since_last_saved = None

    if session_aware:
        self._session.register(self)
def __init__(self, func, name=None, description=None):
    """
    Create a new Task specifying its name and optionally a description.
    """
    # Must be a function
    _raise_error_if_not_function(func, "func")

    # Set the name
    name = func.__name__ if not name else name
    _raise_error_if_not_of_type(name, str, "name")
    self.name = name

    self._data = dict()
    self._data['code'] = None
    self._data['codestr'] = None
    self._data['inputs'] = dict()
    self._data['output'] = None
    self._data['packages'] = set()
    self._data['description'] = ''
    self._modified_since_last_saved = None

    if description is not None:
        self.set_description(description)

    # Inspect the function.
    specs = _inspect.getargspec(func)
    varargs = specs.varargs
    defaults = _copy.copy(specs.defaults)
    args = _copy.copy(specs.args)

    # Set the code to function arguments + *args + **kwargs
    self.set_code(func)

    # Set the inputs
    all_args = _copy.copy(args)
    if varargs:
        all_args.append(varargs)
    self.set_inputs(all_args)

    # Bind default values
    if defaults:
        for index, arg in enumerate(args[-len(defaults):]):
            self.set_inputs({arg: defaults[index]})

    # Set required packages
    if 'required_packages' in func.func_dict:
        self.set_required_packages(func.func_dict['required_packages'])
def users_in_common(self, item_a, item_b, num_results=None):
    """
    Get data on the users in common between two items.

    Parameters
    ----------
    item_a : The id of one item.

    item_b : The id of the other item.

    num_results : int, optional
        The number of users in common to return.

    Returns
    -------
    out : dict
        A dictionary with the number of users that rated each of the two
        items, the number of users in common, and an SFrame with the data
        for the users in common.
    """
    _raise_error_if_not_of_type(item_a, self._allowed_item_types, 'item_id')
    _raise_error_if_not_of_type(item_b, self._allowed_item_types, 'item_id')

    item_a_history = self.item_history(item_a)
    item_b_history = self.item_history(item_b)

    item_a_users = item_a_history[self.user_id_column].unique()
    item_b_users = item_b_history[self.user_id_column].unique()
    users_in_common = set(item_a_users).intersection(set(item_b_users))

    users = _gl.SFrame({self.user_id_column: list(users_in_common)})
    users = users.join(self._users, on=self.user_id_column)
    if num_results is not None:
        users = users.head(num_results)

    result = {
        'item_a_count': len(item_a_users),
        'item_b_count': len(item_b_users),
        'in_common_count': len(users_in_common),
        'in_common_users': users
    }
    return result
def process_features(features, exclude):
    """
    Parameters
    ----------
    features : list[str] | str | None, optional
        Column names of features to be transformed. If None, all columns
        are selected. If a string, that column is transformed. If a list of
        strings, this list of column names is selected.

    exclude : list[str] | str | None, optional
        Column names of features to be ignored in transformation. Can be a
        string or a list of strings. Either 'exclude' or 'features' can be
        passed, but not both.

    Returns
    -------
    (features, exclude) that are processed.
    """
    # Check types
    _raise_error_if_not_of_type(features, [NoneType, str, list], 'features')
    _raise_error_if_not_of_type(exclude, [NoneType, str, list], 'exclude')

    # Make a copy of the parameters.
    _features = _copy.copy(features)
    _exclude = _copy.copy(exclude)

    # Check if both are set, or both are None/empty.
    if _features and _exclude:
        raise ValueError(
            "The parameters 'features' and 'exclude' cannot both be set."
            " Please set one or the other.")
    if _features == [] and not _exclude:
        raise ValueError("Features cannot be an empty list.")

    # Allow a single string in place of a list.
    _features = [_features] if type(_features) == str else _features
    _exclude = [_exclude] if type(_exclude) == str else _exclude

    # Type check each feature/exclude entry.
    if _features:
        for f in _features:
            _raise_error_if_not_of_type(f, str, "Feature names")
    if _exclude:
        for e in _exclude:
            _raise_error_if_not_of_type(e, str, "Excluded feature names")

    if _exclude is not None and _features is not None:
        feature_set = set(_features)
        for col_name in _exclude:
            if col_name in feature_set:
                raise ValueError(
                    "'%s' appears in both features and excluded_features."
                    % col_name)

    return _features, _exclude
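# A sketch of how the helper above behaves (assumes NoneType, _copy and
# _raise_error_if_not_of_type are in scope, as in this module):
#
# >>> process_features(['a', 'b'], None)   # select specific columns
# (['a', 'b'], None)
# >>> process_features('a', None)          # a single string is wrapped in a list
# (['a'], None)
# >>> process_features(None, 'c')          # exclude a single column
# (None, ['c'])
# >>> process_features(['a'], ['b'])       # both set: raises ValueError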
def set_description(self, description):
    """
    Set the description for this Task.

    Parameters
    ----------
    description : str
        A description for the Task.

    Returns
    -------
    self : Task
    """
    _raise_error_if_not_of_type(description, str, "description")
    self._data['description'] = description
    self._set_dirty_bit()
    return self
def user_history(self, user_id, num_results=None):
    """
    Returns the observation data relevant to the provided user.
    """
    _raise_error_if_not_of_type(user_id, self._allowed_user_types, 'user_id')

    try:
        result = self._observations_by_user.get_group(user_id)
    except RuntimeError:
        result = self._empty_observation_data

    if self.item_name_column:
        result = result.join(self._items, on=self.item_id_column)

    if num_results is not None:
        result = result.head(num_results)
    return result
def set_name(self, name):
    """
    Set the name of the Task, which must be unique.

    Parameters
    ----------
    name : str
        Name of the Task.

    Returns
    -------
    self : Task
    """
    _raise_error_if_not_of_type(name, str, "name")
    self.name = str(name)
    self._set_dirty_bit()
    return self
def __init__(self, name, session_aware=True):
    """
    Constructor for base Environment, should not be instantiated directly.
    """
    if not name:
        raise TypeError("Name is required when creating an Environment.")
    if _sys.version_info.major == 3:
        _raise_error_if_not_of_type(name, [str], 'name')
    else:
        _raise_error_if_not_of_type(name, [str, unicode], 'name')

    self._session = _gl.deploy._default_session
    self.name = name
    self._env_type = type(self).__name__
    self._modified_since_last_saved = None

    if session_aware:
        self._session.register(self)
def __init__(self, steps):
    """
    Parameters
    ----------
    steps : list[Transformer] | list[tuple(name, Transformer)]
        List of Transformers or (name, Transformer) tuples. These are
        chained in the order in which they are provided in the list.
    """
    # Basic type checking.
    _raise_error_if_not_of_type(steps, [list])

    # Split into (name, transformer) pairs. If the name is not present,
    # then use the index as the name.
    transformers = []
    index = 0
    for step in steps:
        if isinstance(step, tuple):
            name, tr = step
        else:
            tr = step
            name = index
        if isinstance(tr, list):
            tr = TransformerChain(tr)
        if not issubclass(tr.__class__, _TransformerBase):
            raise TypeError("Each step in the chain must be a Transformer.")
        transformers.append((name, tr))
        index = index + 1

    # Save into a dictionary for lookups by name and index.
    self._state = {}
    self._state["steps"] = steps
    self._state["steps_by_name"] = {}
    index = 0
    for name, tr in transformers:
        self._state["steps_by_name"][name] = tr
        index = index + 1

    # The transformers as (name, obj) tuples (used here for fitting and
    # transforming).
    self._transformers = transformers
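# A minimal chaining sketch (hypothetical data; assumes QuadraticFeatures
# and OneHotEncoder above are importable from graphlab.feature_engineering):
#
# >>> import graphlab as gl
# >>> from graphlab.feature_engineering import (
# ...     QuadraticFeatures, OneHotEncoder, TransformerChain)
# >>> sf = gl.SFrame({'a': [1, 2], 'b': [3, 4]})
# >>> chain = TransformerChain([('quadratic', QuadraticFeatures()),
# ...                           ('encoder', OneHotEncoder())])
# >>> transformed = chain.fit_transform(sf)  # applies the steps in order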
def __init__(self, reference_features=None, feature="feature", verbose=False): # Process and make a copy of the reference_features _reference_features, _exclude = _internal_utils.process_features(reference_features, None) # Type checking _raise_error_if_not_of_type(feature, [str]) # Set up options opts = { 'reference_features': reference_features, 'feature': feature, 'verbose': verbose } opts['reference_features'] = _reference_features # Initialize object proxy = _gl.extensions._CategoricalImputer() proxy.init_transformer(opts) super(CategoricalImputer, self).__init__(proxy, self.__class__)
def delete_inputs(self, names):
    """
    Delete input(s) from this Task.

    Parameters
    ----------
    names : list [str]
        Each entry is considered a name for an input in this Task, and is
        hence removed.

    Returns
    -------
    self : Task

    See Also
    --------
    delete_output

    Examples
    --------
    To delete inputs by name, use a list of strings:

    >>> t1 = graphlab.deploy._task.Task(my_func, 'set_inputs_ex1')
    >>> t1.delete_inputs(['one', 'two', 'three'])
    """
    if names is None:
        return self
    _raise_error_if_not_of_type(names, [list], 'names')

    for name in set(names):
        self._set_one_input(name=name, delete=True)
    return self
def item_details(self, item_id):
    """
    Obtain data for a given item.

    Parameters
    ----------
    item_id : int, str
        The id of the desired item.

    Returns
    -------
    out : SFrame
        Data for the desired item. If no row has the desired item_id, then
        an empty SFrame is returned.
    """
    _raise_error_if_not_of_type(item_id, self._allowed_item_types, 'item_id')
    item = _gl.SFrame({self.item_id_column: [item_id]})
    return item.join(self._items, on=self.item_id_column)
def fit(self, dataset):
    """
    Fits a transformer using the SFrame `dataset`.

    Parameters
    ----------
    dataset : SFrame
        The data used to fit the transformer.

    Returns
    -------
    self (A fitted object)

    See Also
    --------
    transform, fit_transform
    """
    _mt._get_metric_tracker().track(self.__class__.__module__ + '.fit')
    _raise_error_if_not_of_type(dataset, [_SFrame])

    fitted_state = {}
    feature_columns = get_column_names(dataset, self._exclude, self._features)
    feature_columns = select_valid_features(dataset, feature_columns,
                                            [str, list])
    fitted_state['features'] = feature_columns
    validate_feature_columns(dataset.column_names(), feature_columns)

    fitted_state['col_type_map'] = {
        col_name: col_type
        for (col_name, col_type) in zip(dataset.column_names(),
                                        dataset.column_types())
    }
    fitted_state['fitted'] = True
    self.__proxy__.update(fitted_state)
    return self
def __init__(self, features, model='auto', output_column_prefix=None):
    """
    Parameters
    ----------
    features : str | list[str]
        Name(s) of the image column(s) to extract features from.

    model : 'auto' | NeuralNetClassifier, optional
        A trained model to use for feature extraction, or 'auto' to load a
        pre-trained ImageNet model.

    output_column_prefix : str, optional
        Prefix for the output column(s). Defaults to 'deep_features'.
    """
    _raise_error_if_not_of_type(features, [str, list, type(None)])
    _raise_error_if_not_of_type(model, [str, _NeuralNetClassifier])
    _raise_error_if_not_of_type(output_column_prefix, [str, type(None)])

    if isinstance(features, str):
        features = [features]

    # Set the model.
    self._state = {}
    self._state["features"] = features
    if not output_column_prefix:
        output_column_prefix = "deep_features"
    self._state["output_column_prefix"] = output_column_prefix

    self._state['model'] = model
    if self._state["model"] == 'auto':
        model_path = (
            "https://static.turi.com/products/graphlab-create/resources/"
            "models/python2.7/imagenet_model_iter45")
        import graphlab as gl
        self._state['model'] = gl.load_model(model_path)

    if type(self._state['model']) is not _NeuralNetClassifier:
        raise ValueError(
            "Model parameters must be of type NeuralNetClassifier "
            "or string literal 'auto'")
def __init__(self, feature, model='auto', output_column_name=None):
    """
    Parameters
    ----------
    feature : str
        Name of the image column to extract features from.

    model : 'auto' | NeuralNetClassifier, optional
        A trained model to use for feature extraction, or 'auto' to load a
        pre-trained ImageNet model.

    output_column_name : str, optional
        Name of the output column. Defaults to 'deep_features_<feature>'.
    """
    _raise_error_if_not_of_type(feature, [str])
    _raise_error_if_not_of_type(model, [str, _NeuralNetClassifier])
    _raise_error_if_not_of_type(output_column_name, [str, _NoneType])

    # Set the model.
    self._state = {}
    self._state["features"] = feature
    if not output_column_name:
        self._state["output_column_name"] = "deep_features_%s" % feature
    else:
        self._state["output_column_name"] = output_column_name

    self._state['model'] = model
    if self._state["model"] == 'auto':
        model_path = \
            "http://s3.amazonaws.com/dato-datasets/deeplearning/imagenet_model_iter45"
        import graphlab as gl
        self._state['model'] = gl.load_model(model_path)

    if type(self._state['model']) is not _NeuralNetClassifier:
        raise ValueError(
            "Model parameters must be of type NeuralNetClassifier "
            "or string literal 'auto'")
def __init__(self, features=None, excluded_features=None,
             min_document_frequency=0.0, max_document_frequency=1.0,
             output_column_prefix=None):
    # Process and make a copy of the features, exclude.
    _features, _exclude = _internal_utils.process_features(
        features, excluded_features)

    # Type checking
    _raise_error_if_not_of_type(min_document_frequency, [float, int])
    _raise_error_if_not_of_type(max_document_frequency, [float, int])
    _raise_error_if_not_of_type(output_column_prefix, [str, type(None)])

    # Set up options
    opts = {
        'min_document_frequency': min_document_frequency,
        'max_document_frequency': max_document_frequency,
        'output_column_prefix': output_column_prefix
    }
    if _exclude:
        opts['exclude'] = True
        opts['features'] = _exclude
    else:
        opts['exclude'] = False
        opts['features'] = _features

    # Initialize object
    proxy = _gl.extensions._TFIDF()
    proxy.init_transformer(opts)
    super(TFIDF, self).__init__(proxy, self.__class__)
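# A minimal usage sketch for the transformer above (hypothetical data;
# assumes the class is exported as graphlab.feature_engineering.TFIDF).
# TF-IDF operates on bag-of-words dictionaries:
#
# >>> import graphlab as gl
# >>> sf = gl.SFrame({'docs': [{'this': 1, 'is': 1, 'a': 2, 'sample': 1},
# ...                          {'this': 1, 'is': 1, 'another': 2, 'example': 3}]})
# >>> tfidf = gl.feature_engineering.TFIDF()
# >>> out = tfidf.fit_transform(sf)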
def from_model(cls, model, tree_id=0):
    import graphlab as _gl
    from graphlab.toolkits import _supervised_learning as _sl
    import json as _json

    _raise_error_if_not_of_type(tree_id, [int, long], "tree_id")
    _numeric_param_check_range("tree_id", tree_id, 0, model.num_trees - 1)

    tree = DecisionTree()
    nodes = {}
    tree_str = _gl.extensions._xgboost_get_tree(model.__proxy__, tree_id)
    metadata_mapping = _gl.extensions._get_metadata_mapping(model.__proxy__)
    trees_json = _json.loads(tree_str)

    # Parse the tree from the JSON.
    tree._make_tree(trees_json, metadata_mapping)
    tree.root_id = 0

    # Keep track of the attributes.
    for key in {"num_examples", "num_features", "num_unpacked_features",
                "max_depth"}:
        setattr(tree, key, model[key])
    return tree
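# A usage sketch for the classmethod above (hypothetical data; assumes
# DecisionTree is the enclosing class and that the model comes from a tree
# ensemble such as gl.boosted_trees_regression.create):
#
# >>> import graphlab as gl
# >>> sf = gl.SFrame({'x': [1., 2., 3., 4.], 'y': [0., 0., 1., 1.]})
# >>> model = gl.boosted_trees_regression.create(sf, target='y')
# >>> tree = DecisionTree.from_model(model, tree_id=0)
# >>> tree.get_prediction_score(tree.root_id)  # None unless the root is a leaf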
def process_features(features, exclude):
    """
    Parameters
    ----------
    features : list[str] | str | None, optional
        Column names of features to be transformed. If None, all columns
        are selected. If a string, that column is transformed. If a list of
        strings, this list of column names is selected.

    exclude : list[str] | str | None, optional
        Column names of features to be ignored in transformation. Can be a
        string or a list of strings. Either 'exclude' or 'features' can be
        passed, but not both.

    Returns
    -------
    (features, exclude) that are processed.
    """
    # Make a copy of the parameters.
    _features = _copy.copy(features)
    _exclude = _copy.copy(exclude)

    # Check if both are set, or both are None/empty.
    if _features and _exclude:
        raise ValueError(
            "The parameters 'features' and 'exclude' cannot both be set."
            " Please set one or the other.")
    if _features == [] and not _exclude:
        raise ValueError("Features cannot be an empty list.")

    # Check types
    _raise_error_if_not_of_type(_features, [NoneType, str, list], 'features')
    _raise_error_if_not_of_type(_exclude, [NoneType, str, list], 'exclude')

    # Allow a single string in place of a list.
    _features = [_features] if type(_features) == str else _features
    _exclude = [_exclude] if type(_exclude) == str else _exclude

    # Type check each feature/exclude entry.
    if _features:
        for f in _features:
            _raise_error_if_not_of_type(f, str, "Feature names")
    if _exclude:
        for e in _exclude:
            _raise_error_if_not_of_type(e, str, "Excluded feature names")

    return _features, _exclude
def __init__(self, feature, min_document_frequency=0.0,
             max_document_frequency=1.0, output_column_name=None):
    # Type checking
    _raise_error_if_not_of_type(feature, [str])
    _raise_error_if_not_of_type(min_document_frequency, [float, int])
    _raise_error_if_not_of_type(max_document_frequency, [float, int])
    _raise_error_if_not_of_type(output_column_name, [str, _NoneType])

    # Set up options
    opts = {
        "features": [feature],
        "min_document_frequency": min_document_frequency,
        "max_document_frequency": max_document_frequency,
        "output_column_name": output_column_name,
    }

    # Initialize object
    proxy = _gl.extensions._TFIDF()
    proxy.init_transformer(opts)
    super(TFIDF, self).__init__(proxy, self.__class__)
def __init__(self, feature, query, k1=1.5, b=0.75,
             min_document_frequency=0.0, max_document_frequency=1.0,
             output_column_name=None):
    # Convert the query to a list if necessary.
    if isinstance(query, _gl.SArray):
        query = list(query)
    if isinstance(query, set):
        query = list(query)

    # Type checking
    _raise_error_if_not_of_type(feature, [str])
    for q in query:  # The query must be a list of strings.
        _raise_error_if_not_of_type(q, [str])
    _raise_error_if_not_of_type(k1, [float, int])
    _raise_error_if_not_of_type(b, [float, int])
    _raise_error_if_not_of_type(min_document_frequency, [float, int])
    _raise_error_if_not_of_type(max_document_frequency, [float, int])
    _raise_error_if_not_of_type(output_column_name, [str, _NoneType])

    # Set up options
    opts = {
        'features': [feature],
        'query': query,
        'k1': k1,
        'b': b,
        'min_document_frequency': min_document_frequency,
        'max_document_frequency': max_document_frequency,
        'output_column_name': output_column_name
    }

    # Initialize object
    proxy = _gl.extensions._BM25()
    proxy.init_transformer(opts)
    super(BM25, self).__init__(proxy, self.__class__)
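# A minimal usage sketch for the transformer above (hypothetical data;
# assumes the class is exported as graphlab.feature_engineering.BM25). The
# query is the set of terms to score each document against:
#
# >>> import graphlab as gl
# >>> sf = gl.SFrame({'docs': [{'this': 1, 'is': 1, 'a': 2, 'sample': 1},
# ...                          {'this': 1, 'is': 1, 'another': 2, 'example': 3}]})
# >>> bm25 = gl.feature_engineering.BM25('docs', query=['a', 'example'])
# >>> out = bm25.fit_transform(sf)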
def create(dataset, item, features=None, min_support=1, max_patterns=100,
           min_length=1):
    """
    Create a :class:`~graphlab.frequent_pattern_mining.FrequentPatternMiner`
    to extract the set of frequently occurring items in an event-series.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    item : string
        Name of the column containing the item. The values in this column
        must be of string or integer type.

    features : list[string], optional
        Names of the columns containing features. 'None' (the default)
        indicates that all columns except the item column should be used as
        features. The feature columns are the ones that together identify a
        unique transaction ID for the item.

    min_support : int, optional
        The minimum number of times that a pattern must occur in order for
        it to be considered `frequent`.

    max_patterns : int, optional
        The maximum number of frequent patterns to be mined.

    min_length : int, optional
        The minimum size (number of elements in the set) of each pattern
        being mined.

    Returns
    -------
    out : FrequentPatternMiner
        A trained model of type
        :class:`~graphlab.frequent_pattern_mining.FrequentPatternMiner`.

    Notes
    -----
    Frequent closed itemsets are mined using the `top-k FP growth`
    algorithm. Mining occurs until the top max_patterns closed itemsets of
    size at least min_length and support greater than min_support are found.

    See Also
    --------
    FrequentPatternMiner

    References
    ----------
    - Wikipedia - Association Rule Learning
      <https://en.wikipedia.org/wiki/Association_rule_learning>
    - Han, Jiawei, et al. "Mining top-k frequent closed patterns without
      minimum support." ICDM 2002.
    - Wang, Jianyong, et al. "TFP: An efficient algorithm for mining top-k
      frequent closed itemsets." Knowledge and Data Engineering, IEEE
      Transactions on 17.5 (2005): 652-663.

    Examples
    --------
    .. sourcecode:: python

        >>> import graphlab as gl
        >>> bakery_sf = gl.SFrame("http://s3.amazonaws.com/dato-datasets/bakery.sf")
        >>> bakery_sf
        Data:
        +---------+-------------+-------+----------+----------+-----------------+
        | Receipt |   SaleDate  | EmpId | StoreNum | Quantity |       Item      |
        +---------+-------------+-------+----------+----------+-----------------+
        |    1    | 12-JAN-2000 |   20  |    20    |    1     |  GanacheCookie  |
        |    1    | 12-JAN-2000 |   20  |    20    |    5     |     ApplePie    |
        |    2    | 15-JAN-2000 |   35  |    10    |    1     |   CoffeeEclair  |
        |    2    | 15-JAN-2000 |   35  |    10    |    3     |     ApplePie    |
        |    2    | 15-JAN-2000 |   35  |    10    |    4     |   AlmondTwist   |
        |    2    | 15-JAN-2000 |   35  |    10    |    3     |    HotCoffee    |
        |    3    |  8-JAN-2000 |   13  |    13    |    5     |    OperaCake    |
        |    3    |  8-JAN-2000 |   13  |    13    |    3     |   OrangeJuice   |
        |    3    |  8-JAN-2000 |   13  |    13    |    3     | CheeseCroissant |
        |    4    | 24-JAN-2000 |   16  |    16    |    1     |   TruffleCake   |
        +---------+-------------+-------+----------+----------+-----------------+
        [266209 rows x 6 columns]

        >>> model = gl.frequent_pattern_mining.create(bakery_sf, 'Item',
        ...             features=['Receipt'], min_length=4, max_patterns=500)

        Model fields
        ------------
        Min support        : 1
        Max patterns       : 500
        Min pattern length : 4

        Most frequent patterns
        ----------------------
        ['CoffeeEclair', 'HotCoffee', 'AlmondTwist', 'ApplePie']: 1704
        ['LemonLemonade', 'LemonCookie', 'RaspberryLemonade', 'RaspberryCookie']: 1565
        ['LemonLemonade', 'LemonCookie', 'RaspberryLemonade', 'GreenTea']: 1290
        ['LemonLemonade', 'RaspberryLemonade', 'RaspberryCookie', 'GreenTea']: 1289
        ['LemonLemonade', 'LemonCookie', 'RaspberryCookie', 'GreenTea']: 1279
        ['LemonCookie', 'RaspberryLemonade', 'RaspberryCookie', 'GreenTea']: 1279
        ['AppleTart', 'AppleDanish', 'AppleCroissant', 'CherrySoda']: 1253
        ['LemonLemonade', 'LemonCookie', 'RaspberryLemonade', 'RaspberryCookie', 'GreenTea']: 1221
        ['CherryTart', 'ApricotDanish', 'OperaCake', 'ApricotTart']: 61
        ['CherryTart', 'ApricotDanish', 'OperaCake', 'RaspberryLemonade']: 55
    """
    _mt._get_metric_tracker().track('toolkit.frequent_pattern_mining.create')

    # Type checking.
    _raise_error_if_not_sframe(dataset, "dataset")
    _raise_error_if_not_of_type(item, str, "item")
    _raise_error_if_not_of_type(features, [list, _types.NoneType], "features")
    _raise_error_if_not_of_type(min_support, [int, float], "min_support")
    _raise_error_if_not_of_type(max_patterns, [int, float], "max_patterns")
    _raise_error_if_not_of_type(min_length, [int, float], "min_length")

    # Value checking.
    column_names = dataset.column_names()

    # If features is None, then use all columns other than the item column.
    if features is None:
        features = column_names
        features.remove(item)

    # Call the C++ create function.
    proxy = _gl.extensions._pattern_mining_create(
        dataset, item, features, min_support, max_patterns, min_length)
    return FrequentPatternMiner(proxy)
def create(data, row_label=None, features=None, feature_model='auto',
           method='lsh', verbose=True):
    """
    Create a similarity search model, which can be used to quickly retrieve
    items similar to a query observation. In the case of images, this model
    automatically performs the appropriate feature engineering steps.

    NOTE: If you are using a CPU for the creation step with
    feature_model='auto', creation time may take a while. This is because
    extracting features for images on a CPU is expensive. With a GPU, one
    can expect large speedups.

    .. warning:: The similarity search toolkit is currently in beta, and
        feedback is welcome! Please send comments to [email protected].

    Parameters
    ----------
    data : SFrame
        The SFrame that represents the training data for the model,
        including at least one column of images.

    row_label : str, optional
        Name of the SFrame column with row ids. If 'row_label' is not
        specified, row numbers are used to identify reference dataset rows
        when the model is queried.

    features : str, optional
        The name of an image column in the input 'data' SFrame.

    feature_model : 'auto' | A model of type NeuralNetClassifier, optional
        A trained model for extracting features from raw data objects. By
        default ('auto'), we choose an appropriate model from our set of
        pre-trained models. See
        :class:`~graphlab.toolkits.feature_engineering.DeepFeatureExtractor`
        for more information.

    method : {'lsh', 'brute_force'}, optional
        The method used for nearest neighbor search. The 'lsh' option uses
        locality-sensitive hashing to find approximate results more quickly.

    verbose : bool, optional
        If True, print verbose output during model creation.

    Returns
    -------
    out : SimilaritySearchModel

    See Also
    --------
    SimilaritySearchModel
    graphlab.toolkits.nearest_neighbors
    graphlab.toolkits.feature_engineering

    Notes
    -----
    The similarity search toolkit currently uses cosine distance to
    evaluate the similarity between each query and candidate results.

    Examples
    --------
    First, split data into reference and query.

    >>> import graphlab as gl
    >>> data = gl.SFrame('http://s3.amazonaws.com/dato-datasets/mnist/sframe/train6k')
    >>> reference, query = data.random_split(0.8)

    Build a neural net feature extractor for images:

    >>> nn_model = gl.neuralnet_classifier.create(reference, target='label')

    Construct the SimilaritySearchModel:

    >>> model = gl.similarity_search.create(reference, features='image',
    ...                                     feature_model=nn_model)

    Find the most similar items in the reference set for each item in the
    query set:

    >>> model.search(query)
    """
    _mt._get_metric_tracker().track(__name__ + '.create')

    _raise_error_if_not_of_type(data, [_SFrame])
    _raise_error_if_not_of_type(features, [str])
    _raise_error_if_column_exists(data, features)

    if data[features].dtype() != _Image:
        raise _ToolkitError("Feature `%s` must be of type Image" % features)

    return SimilaritySearchModel(data, row_label=row_label, feature=features,
                                 feature_model=feature_model, method=method,
                                 verbose=verbose)
def search(self, data, row_label=None, k=5):
    """
    Search for the nearest neighbors from the reference set for each
    element of the query set. The query SFrame must include columns with
    the same names as the row_label and feature columns used to create the
    SimilaritySearchModel.

    Parameters
    ----------
    data : SFrame
        Query data. Must contain columns with the same names and types as
        the features used to train the model. Additional columns are
        allowed, but ignored.

    row_label : string, optional
        Name of the query SFrame column with row ids. If 'row_label' is
        not specified, row numbers are used to identify query dataset rows
        in the output SFrame.

    k : int, optional
        Number of nearest neighbors to return from the reference set for
        each query observation. The default is 5 neighbors.

    Returns
    -------
    out : SFrame
        An SFrame that contains all the nearest neighbors.

    Examples
    --------
    First, split data into reference and query:

    >>> import graphlab as gl
    >>> data = gl.SFrame('http://s3.amazonaws.com/dato-datasets/mnist/sframe/train6k')
    >>> reference, query = data.random_split(0.8)

    Build a neural net feature extractor for images:

    >>> nn_model = gl.neuralnet_classifier.create(reference, target='label')

    Construct the SimilaritySearchModel:

    >>> model = gl.similarity_search.create(reference, features='image',
    ...                                     feature_model=nn_model)

    Find the most similar items in the reference set for each query:

    >>> model.search(query)
    """
    _raise_error_if_not_of_type(row_label, [str, _NoneType])

    feature = self._state['features']
    _raise_error_if_column_exists(data, feature)

    if data[feature].dtype() != self._feature_type:
        raise ValueError('Feature columns must have the same data type in '
                         'both the reference and query sets.')
    if row_label is not None:
        _raise_error_if_column_exists(data, row_label)

    if data[feature].dtype() == _Image:
        transformed_data = self._extractor.transform(data)
    else:
        transformed_data = data
        transformed_data[self._state['output_column_name']] = \
            transformed_data[feature]

    return self._neighbors_model.query(transformed_data, label=row_label, k=k)
def __init__(self, name, stages=[[]], final_stage=None, environment=None,
             _exec_dir=None, _task_output_paths=None, _job_type='PIPELINE'):
    """
    Construct a job.

    Parameters
    ----------
    name : str
        Name of this Job, must be unique.

    stages : list[list[Task]]
        Collection of task(s) to be executed.

    final_stage : list[Task] | Task
        Collection of task(s) whose outputs are to be returned.

    environment : Environment, optional
        Environment used for this execution. See
        :py:class:`~graphlab.deploy.environment.LocalAsync` for an example
        environment.
    """
    _raise_error_if_not_of_type(name, [str], 'name')
    _raise_error_if_not_of_type(stages, [list], 'stages')
    _raise_error_if_not_of_type(final_stage, [list, _Task, type(None)],
                                'final_stage')

    self.name = name
    self.environment = environment
    self._stages = stages
    self._num_tasks = 0
    self._status = 'Pending'
    self._start_time = None
    self._end_time = None
    self._error = None
    self._job_type = _job_type

    # Set the packages
    self._packages = set()
    for task in self._stages:
        for t in task:
            self._num_tasks += 1
            self._packages.update(t.get_required_packages())

    self._final_stage = final_stage
    self._task_status = {}
    self._session = _gl.deploy._default_session

    if not _exec_dir:
        relative_path = "job-results-%s" % str(_uuid())
        self._exec_dir = self.get_path_join_method()(
            self._session.results_dir, relative_path)
    else:
        self._exec_dir = _exec_dir

    # Location where all the outputs for the tasks are saved.
    if not _task_output_paths:
        Job._update_exec_dir(self, self._exec_dir)
    else:
        self._task_output_paths = _task_output_paths
def __init__(self, features=None, excluded_features=None,
             output_column_name='quadratic_features'):
    # Type checking
    _raise_error_if_not_of_type(output_column_name, [str])

    # Set up options
    opts = {
        'output_column_name': output_column_name
    }

    # Make a copy of the parameters.
    _features = _copy.copy(features)
    _exclude = _copy.copy(excluded_features)

    # Check if both are set, or both are None/empty.
    if _features and _exclude:
        raise ValueError(
            "The parameters 'features' and 'exclude' cannot both be set."
            " Please set one or the other.")
    if _features == [] and not _exclude:
        raise ValueError("Features cannot be an empty list.")

    # Check types
    _raise_error_if_not_of_type(_features, [NoneType, list, str, tuple],
                                'features')
    _raise_error_if_not_of_type(_exclude, [NoneType, list, str, tuple],
                                'exclude')

    # Allow a single string or tuple in place of a list.
    _features = [_features] if type(_features) in (str, tuple) else _features
    _exclude = [_exclude] if type(_exclude) in (str, tuple) else _exclude

    # Type check each feature/exclude entry.
    if _features:
        for f in _features:
            _raise_error_if_not_of_type(f, [str, tuple], "Feature names")
    if _exclude:
        for e in _exclude:
            _raise_error_if_not_of_type(e, [str, tuple],
                                        "Excluded feature names")

    if _exclude:
        opts['exclude'] = True
        unprocessed_features = _exclude
    else:
        opts['exclude'] = False
        unprocessed_features = _features

    # Build the set of (sorted) feature pairs. Tuples are used as given;
    # plain column names are expanded to all pairs of the listed columns.
    pair_list = set()
    if unprocessed_features is not None:
        if type(unprocessed_features[0]) is tuple:
            for t in unprocessed_features:
                pair_list.add(tuple(sorted(t)))
        elif type(unprocessed_features[0]) is str:
            if _exclude:
                for t in unprocessed_features:
                    pair_list.add(t)
            else:
                for t in unprocessed_features:
                    for k in unprocessed_features:
                        pair_list.add(tuple(sorted((t, k))))

    if type(output_column_name) is not str:
        raise ValueError("'output_column_name' must be of type str")

    if unprocessed_features is not None:
        if type(unprocessed_features[0]) is str:
            opts['features'] = unprocessed_features
            if _exclude:
                opts['feature_pairs'] = list(pair_list)
            else:
                opts['feature_pairs'] = [list(x) for x in pair_list]
        else:
            opts['feature_pairs'] = [list(x) for x in pair_list]
            opts['features'] = [list(x) for x in unprocessed_features]
    else:
        opts['feature_pairs'] = None
        opts['features'] = None

    # Initialize object
    proxy = _gl.extensions._QuadraticFeatures()
    proxy.init_transformer(opts)
    super(QuadraticFeatures, self).__init__(proxy, self.__class__)
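# A minimal usage sketch for the transformer above (hypothetical data;
# assumes the class is exported as
# graphlab.feature_engineering.QuadraticFeatures). Pairs can be given
# explicitly as tuples, or all pairs of the listed columns are used:
#
# >>> import graphlab as gl
# >>> sf = gl.SFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]})
# >>> quad = gl.feature_engineering.QuadraticFeatures(features=[('a', 'b')])
# >>> out = quad.fit_transform(sf)  # adds a 'quadratic_features' column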
def sample(self, k, diversity=0.5, method=None, side_data=None, **kwargs):
    """
    After constructing a diverse sampler, sample a diverse set
    stochastically. The stochastic algorithm depends on the sampling method
    itself.

    Parameters
    ----------
    k : int
        The number of items to sample.

    diversity : double in [0, 1], optional
        A tunable parameter that trades off between quality and diversity.
        A diversity factor of 0 will only consider quality when building a
        set (equivalent to using the method "quality_only"), while a
        diversity factor of 1 will only consider item similarity and will
        ignore quality. A value between 0 and 1 will force the algorithm to
        trade off between quality and diversity. The actual effect of the
        diversity factor depends on the algorithm:

        - When method="weighted_vertex_cover", the diversity factor changes
          the number of nearest-neighbors to remove when sampling an item.
          Specifically, the number of neighbors is set to
          floor( (N-1)/(k-1) * diversity ).

        - When method="ipsen", the similarity values are scaled by
          diversity, and the quality values by (1 - diversity).

    method : {'random', 'quality_only', 'weighted_vertex_cover', 'ipsen'}, optional
        The sampling method to use. The options available are:

        - *"random"*: Return a completely random set of items, with no
          reference to item qualities or similarities. Note that the greedy
          method is undefined for a random sampler.

        - *"quality_only"*: Form a sampling distribution from the item
          qualities and return a set from this distribution. The sample()
          method samples a set from this distribution, while the greedy()
          method returns the top-k items according to item quality.

          Requirements: The column quality_feature must be present.

        - *"ipsen"*: Sample a diverse set using an approximation to the
          log-determinant. One method of sampling diverse sets is
          determinantal point process (DPP) sampling (see:
          http://arxiv.org/abs/1207.6083). Given any set of items, one
          measure of diversity is the log-determinant of the items'
          similarity matrix L. The diagonal entry L_{ii} corresponds to the
          quality of item i, while the off-diagonal entries L_{ij}
          correspond to the similarity between items i and j. The
          log-determinant of this matrix corresponds directly to the joint
          quality-diversity of the items that define L (high qualities
          increase the determinant, while large similarities diminish it).
          However, DPP sampling does not scale well, so the log-determinant
          can instead be approximated with more scalable methods. The Ipsen
          sampler uses the block approximation of the determinant given in
          http://arxiv.org/pdf/1105.0437v1.pdf in order to mimic DPP
          sampling in a scalable fashion.

          Requirements: The columns quality_feature and similarity_features
          must be present.

        - *"weighted_vertex_cover"*: Sample a set of high-quality items
          with no (or a minimal number of) nearest-neighbors also in the
          set. Given a graph with a quality field on the vertices and edges
          connecting similar items, for each item this algorithm either
          samples from a distribution formed by item qualities or selects
          the item with the maximum quality, and then "covers" (removes
          from consideration) that item's neighbors. There are two options,
          depending on whether you pass in an SGraph or an SFrame. You can:

          1. Define similarity by passing in an SGraph, where an edge
             between two vertices denotes that those items are neighbors.
             Any time a point is sampled, all of its neighbors in the graph
             are removed.

          2. Pass in an SFrame and the additional keyword argument
             "wvc_neighbors". Then, when a point is sampled, its
             wvc_neighbors nearest-neighbors are removed from
             consideration.

          Requirements: The parameters quality_feature and
          similarity_features must be defined (where they either match
          column names or vertex and edge fields).

        If no method is given, the default is the weighted_vertex_cover
        algorithm (with the default diversity factor of 0.5).

    side_data : SFrame, optional
        An ID-based subset of the original data to sample from. Sometimes
        you may wish to sample from only a subset of the original data -
        e.g., only provide a diverse sample of movies from a particular
        user's top recommendations. In addition, some features may not be
        available when the sampler object is created. To sample from a
        subset of IDs, with the option to add features, set side_data to an
        SFrame with a column of IDs and (optionally) additional features.
        Note that the model must be made aware of these features at
        creation time, by including their column names among the
        side-quality or side-similarity features. The sampler first subsets
        the ground set by the list of IDs passed, and then uses any updated
        or additional quality or similarity features in side_data. If some
        feature is not available in side_data, the sampler falls back to
        the original features in the SFrame or SGraph passed to create().
        If side_data is empty, the sampler returns subsets from the
        original SFrame or SGraph passed in with the data parameter used in
        create().

    similarity_function : string
        TODO: I haven't added this yet

    **kwargs : optional
        Additional method-specific parameters for fine-tuning.

        - *wvc_neighbors*: For method="weighted_vertex_cover" and a sampler
          constructed with an SFrame, remove this many nearest-neighbors
          when a point is sampled.

        - *greedy*: Instead of sampling stochastically, deterministically
          build the best possible set (see the Examples below).

    Examples
    --------
    Sample k items directly from the ground set passed in via create() with
    the default sampling method:

    >>> sf = graphlab.SFrame.read_csv(
    ...     'https://s3.amazonaws.com/dato-datasets/auto-mpg/auto-mpg.csv')
    >>> sampler = graphlab.diversity.diverse_sampler.create(data=sf,
    ...     item_id='name',
    ...     quality_feature='accel',
    ...     similarity_features=['mpg', 'displ', 'hp', 'weight'])
    >>> sampler.sample(k=5)
    +-----+-----+-------+-----+--------+-------+----+--------+----------------------+
    | mpg | cyl | displ | hp  | weight | accel | yr | origin |         name         |
    +-----+-----+-------+-----+--------+-------+----+--------+----------------------+
    | 15  |  8  | 318.0 | 150 |  3777  |  12.5 | 73 |   1    | dodge coronet custom |
    | 15  |  6  | 258.0 | 110 |  3730  |  19.0 | 75 |   1    | amc matador          |
    | 30  |  4  |  97.0 |  67 |  1985  |  16.4 | 77 |   3    | subaru dl            |
    | 34  |  4  |  86.0 |  65 |  1975  |  15.2 | 79 |   3    | maxda glc deluxe     |
    | 32  |  4  |  98.0 |  70 |  2120  |  15.5 | 80 |   1    | chevrolet chevette   |
    +-----+-----+-------+-----+--------+-------+----+--------+----------------------+

    This method returns an SFrame containing the sampled items. If the
    diverse sampler was created with an SGraph, the returned SFrame
    contains the sampled vertices and their associated fields.

    You can change the sampling method with the "method" keyword. The
    default algorithm is weighted vertex cover.

    >>> sf = sampler.sample(k=5, method='ipsen')
    +-----+-----+-------+-----+--------+-------+----+--------+-----------------------+
    | mpg | cyl | displ | hp  | weight | accel | yr | origin |          name         |
    +-----+-----+-------+-----+--------+-------+----+--------+-----------------------+
    | 15  |  8  | 350.0 | 165 |  3693  |  11.5 | 70 |   1    | buick skylark 320     |
    | 17  |  8  | 302.0 | 140 |  3449  |  10.5 | 70 |   1    | ford torino           |
    | 15  |  8  | 400.0 | 150 |  3761  |   9.5 | 70 |   1    | chevrolet monte carlo |
    | 22  |  6  | 198.0 |  95 |  2833  |  15.5 | 70 |   1    | plymouth duster       |
    | 19  |  6  | 232.0 | 100 |  2634  |  13.0 | 71 |   1    | amc gremlin           |
    +-----+-----+-------+-----+--------+-------+----+--------+-----------------------+

    Instead of stochastic sampling, you can also force the algorithm to try
    to form the best possible set by using the greedy method:

    >>> sf = sampler.sample(k=5, greedy=True)

    It's possible to tune the methods with the "diversity" keyword, which
    can range between 0 and 1. Larger values will favor reducing inter-item
    similarity (increasing diversity), while smaller values will favor
    high-quality items (decreasing diversity).

    >>> sf = sampler.sample(k=5, diversity=0.0, method='ipsen')
    +-----+-----+-------+-----+--------+-------+----+--------+--------------------+
    | mpg | cyl | displ | hp  | weight | accel | yr | origin |        name        |
    +-----+-----+-------+-----+--------+-------+----+--------+--------------------+
    | 14  |  8  | 440.0 | 215 |  4312  |   8.5 | 70 |   1    | plymouth fury iii  |
    | 15  |  8  | 390.0 | 190 |  3850  |   8.5 | 70 |   1    | amc ambassador dpl |
    | 18  |  6  | 199.0 |  97 |  2774  |  15.5 | 70 |   1    | amc hornet         |
    | 18  |  6  | 232.0 | 100 |  3288  |  15.5 | 71 |   1    | amc matador        |
    | 11  |  8  | 429.0 | 208 |  4633  |  11.0 | 72 |   1    | mercury marquis    |
    +-----+-----+-------+-----+--------+-------+----+--------+--------------------+

    >>> sf = sampler.sample(k=5, diversity=1.0, method='ipsen')
    +-----+-----+-------+-----+--------+-------+----+--------+---------------------------+
    | mpg | cyl | displ | hp  | weight | accel | yr | origin |            name           |
    +-----+-----+-------+-----+--------+-------+----+--------+---------------------------+
    | 18  |  8  | 307.0 | 130 |  3504  |  12.0 | 70 |   1    | chevrolet chevelle malibu |
    | 15  |  8  | 350.0 | 165 |  3693  |  11.5 | 70 |   1    | buick skylark 320         |
    | 18  |  8  | 318.0 | 150 |  3436  |  11.0 | 70 |   1    | plymouth satellite        |
    | 16  |  8  | 304.0 | 150 |  3433  |  12.0 | 70 |   1    | amc rebel sst             |
    | 18  |  6  | 171.0 |  97 |  2984  |  14.5 | 75 |   1    | ford pinto                |
    +-----+-----+-------+-----+--------+-------+----+--------+---------------------------+

    Finally, if you want to restrict the ground set to a smaller subset,
    you can pass in a list of IDs with the "side_data" keyword:

    >>> ford_sf = sf[sf['name'].apply(lambda x: 'ford' in x)]['name']
    >>> sampler.sample(k=5, side_data=graphlab.SFrame({'name': ford_sf}))
    +-----------------------+-----+-----+-------+-----+--------+-------+----+--------+
    |          name         | mpg | cyl | displ | hp  | weight | accel | yr | origin |
    +-----------------------+-----+-----+-------+-----+--------+-------+----+--------+
    | ford pinto runabout   | 21  |  4  | 122.0 |  86 |  2226  |  16.5 | 72 |   1    |
    | ford maverick         | 18  |  6  | 250.0 |  88 |  3021  |  16.5 | 73 |   1    |
    | ford gran torino (sw) | 14  |  8  | 302.0 | 140 |  4638  |  16.0 | 74 |   1    |
    | ford fairmont (auto)  | 20  |  6  | 200.0 |  85 |  2965  |  15.8 | 78 |   1    |
    | ford ltd landau       | 17  |  8  | 302.0 | 129 |  3725  |  13.4 | 79 |   1    |
    +-----------------------+-----+-----+-------+-----+--------+-------+----+--------+

    You can also add updated features, or even features that weren't
    passed in when creating the model (as long as they are among the
    features specified in "quality_feature" or "similarity_features").
    These new features will be joined to the original dataset. However, if
    any new features were not specified in the "similarity_features"
    parameter during sampler creation, they will not be included when
    computing similarity between items.
    """
    _raise_error_if_not_of_type(k, int)
    if side_data is not None:
        _raise_error_if_not_of_type(side_data, _gl.SFrame)

    opts = dict()
    if method is not None:
        opts["method"] = method

    if diversity < 0.0 or diversity > 1.0:
        raise ValueError("The diversity parameter must be between 0.0 and 1.0.")
    opts["diversity"] = diversity

    if "wvc_neighbors" in kwargs:
        opts["num_neighbors"] = kwargs["wvc_neighbors"]
    if "greedy" in kwargs:
        opts["greedy"] = kwargs["greedy"]

    if side_data is None:
        return self.__proxy__.sample_from_ground_set(k, opts)
    else:
        return self.__proxy__.sample_from_frame_ref_data(k, side_data, opts)
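# A minimal, self-contained sketch of the neighbor-count rule quoted in the
# docstring above for method="weighted_vertex_cover": the number of neighbors
# covered per sampled item is floor((N - 1) / (k - 1) * diversity). The helper
# name is hypothetical and is not part of the toolkit.
import math

def _wvc_neighbor_count_sketch(n_items, k, diversity):
    # k == 1 would zero the denominator; cover no neighbors in that case.
    if k <= 1:
        return 0
    return int(math.floor((n_items - 1) / float(k - 1) * diversity))

# For example, with N=100 items, k=5, and the default diversity of 0.5,
# floor(99 / 4 * 0.5) = 12 neighbors are covered each time an item is sampled.
assert _wvc_neighbor_count_sketch(100, 5, 0.5) == 12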
def create(graph, label_field, threshold=1e-3, weight_field='',
           self_weight=1.0, undirected=False, max_iterations=None,
           _single_precision=False, _distributed='auto', verbose=True):
    """
    Given a weighted graph with observed class labels on a subset of
    vertices, infer the label probability for the unobserved vertices using
    the "label propagation" algorithm. The algorithm iteratively updates
    each vertex's label probability as a weighted sum of its own and its
    neighbors' label probabilities, until convergence. See
    :class:`graphlab.label_propagation.LabelPropagationModel` for the
    details of the algorithm.

    Parameters
    ----------
    graph : SGraph
        The graph on which to compute the label propagation.

    label_field : str
        Vertex field storing the initial vertex labels. The values must be
        in [0, num_classes). None values indicate unobserved vertex labels.

    threshold : float, optional
        Threshold for convergence, measured as the average squared L2 norm
        (the sum of squared values) of the change in each vertex's label
        probability vector.

    weight_field : str, optional
        Edge field storing the edge weight. If empty, all edges are assumed
        to have unit weight.

    self_weight : float, optional
        The weight for self-edges, i.e., the weight of a vertex's own label
        probability in each update.

    undirected : bool, optional
        If true, treat each edge as undirected, and propagate labels in
        both directions.

    max_iterations : int, optional
        The maximum number of iterations to run. Default is unlimited. If
        set, the algorithm terminates when either max_iterations or the
        convergence threshold is reached.

    _single_precision : bool, optional
        If true, run label propagation in single precision. The resulting
        probability values may be less accurate, but the algorithm should
        run faster and use less memory.

    _distributed : distributed environment, internal

    verbose : bool, optional
        If True, print progress updates.

    Returns
    -------
    out : LabelPropagationModel

    References
    ----------
    - Zhu, X., & Ghahramani, Z. (2002). `Learning from labeled and
      unlabeled data with label propagation
      <http://www.cs.cmu.edu/~zhuxj/pub/CMU-CALD-02-107.pdf>`_.

    Examples
    --------
    If given an :class:`~graphlab.SGraph` ``g``, we can create a
    :class:`~graphlab.label_propagation.LabelPropagationModel` as follows:

    >>> g = graphlab.load_graph('http://snap.stanford.edu/data/email-Enron.txt.gz',
    ...                         format='snap')

    # Initialize random classes for a subset of vertices.
    # Leave the unobserved vertices with None label.
    >>> import random
    >>> def init_label(vid):
    ...     x = random.random()
    ...     if x < 0.2:
    ...         return 0
    ...     elif x > 0.9:
    ...         return 1
    ...     else:
    ...         return None
    >>> g.vertices['label'] = g.vertices['__id'].apply(init_label, int)
    >>> m = graphlab.label_propagation.create(g, label_field='label')

    We can obtain for each vertex the predicted label and the probability
    of each label in the graph ``g`` using:

    >>> labels = m['labels']   # SFrame
    >>> labels
    +------+-------+-----------------+-------------------+----------------+
    | __id | label | predicted_label |         P0        |       P1       |
    +------+-------+-----------------+-------------------+----------------+
    |  5   |   1   |        1        |        0.0        |      1.0       |
    |  7   |  None |        0        |    0.8213214997   |  0.1786785003  |
    |  8   |  None |        1        | 5.96046447754e-08 | 0.999999940395 |
    |  10  |  None |        0        |   0.534984718273  | 0.465015281727 |
    |  27  |  None |        0        |   0.752801638549  | 0.247198361451 |
    |  29  |  None |        1        | 5.96046447754e-08 | 0.999999940395 |
    |  33  |  None |        1        | 5.96046447754e-08 | 0.999999940395 |
    |  47  |   0   |        0        |        1.0        |      0.0       |
    |  50  |  None |        0        |   0.788279032657  | 0.211720967343 |
    |  52  |  None |        0        |   0.666666666667  | 0.333333333333 |
    +------+-------+-----------------+-------------------+----------------+
    [36692 rows x 5 columns]

    See Also
    --------
    LabelPropagationModel
    """
    _mt._get_metric_tracker().track('toolkit.graph_analytics.label_propagation.create')

    _raise_error_if_not_of_type(label_field, str)
    _raise_error_if_not_of_type(weight_field, str)

    if not isinstance(graph, _SGraph):
        raise TypeError('graph input must be an SGraph object.')

    if graph.vertices[label_field].dtype() != int:
        raise TypeError('label_field %s must be integer typed.' % label_field)

    opts = {'label_field': label_field,
            'threshold': threshold,
            'weight_field': weight_field,
            'self_weight': self_weight,
            'undirected': undirected,
            'max_iterations': max_iterations,
            'single_precision': _single_precision,
            'graph': graph.__proxy__}

    distributed_context = _get_distributed_execution_environment()
    if distributed_context is None:
        params = _main.run('label_propagation', opts, verbose)
        model = params['model']
    else:
        model = _distributed_run('distributed_labelprop', opts,
                                 env=_distributed, verbose=verbose)

    return LabelPropagationModel(model)
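# A minimal NumPy sketch of one iteration of the update rule described in the
# docstring above, assuming each vertex's distribution becomes a weighted sum
# of its own (scaled by self_weight) and its neighbors' (scaled by edge
# weight), with observed labels clamped per Zhu & Ghahramani. This is an
# illustration, not the toolkit's implementation.
import numpy as np

def _propagate_once_sketch(P, W, observed, self_weight=1.0):
    # P: (num_vertices, num_classes) current label probabilities.
    # W: (num_vertices, num_vertices) edge weights; W[i, j] == 0 if no edge.
    # observed: boolean mask of vertices whose labels are known (clamped).
    P_new = self_weight * P + W.dot(P)
    P_new /= P_new.sum(axis=1, keepdims=True)   # renormalize each row
    P_new[observed] = P[observed]               # keep observed labels fixed
    # Convergence check matching the `threshold` parameter: the average
    # squared L2 norm of each vertex's change in label probability.
    delta = ((P_new - P) ** 2).sum(axis=1).mean()
    return P_new, delta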
def sample(self, k, diversity=0.1, subset_ids=None, **kwargs):
    """
    After constructing a diverse sampler, sample a diverse set
    stochastically. The stochastic algorithm depends on the sampling method
    itself.

    Parameters
    ----------
    k : int
        The number of items to sample.

    diversity : double in [0, 1], optional
        A tunable parameter that trades off between quality and diversity.
        A diversity factor of 0 will only consider quality when building a
        set, while a diversity factor of 1 will only consider item
        similarity and will ignore quality. A value between 0 and 1 will
        force the algorithm to trade off between quality and diversity.
        Note that this keyword argument is only applicable if both quality
        and similarity features were passed to create(). The actual effect
        of the diversity factor depends on the algorithm:

        - When the method is vertex cover or weighted vertex cover, the
          diversity factor changes the number of nearest-neighbors to
          remove when sampling an item. Specifically, the number of
          neighbors is set to floor( (N-1)/(k-1) * diversity ).

    subset_ids : SArray, optional
        A list of IDs to sample from. Sometimes you may wish to sample from
        only a subset of the original data - e.g., only provide a diverse
        sample of movies from a particular user's top recommendations. If
        subset_ids is empty, the sampler returns subsets from the original
        SFrame or SGraph passed in with the data parameter used in
        create().

    **kwargs : optional
        Additional method-specific parameters for fine-tuning.

        - *greedy*: Use the greedy algorithm to generate a set. Instead of
          stochastically building a set from a distribution, take the mode
          of the current distribution at each step. For instance, if only
          quality features are being considered, the greedy option returns
          the top-k items. The greedy algorithm usually provides the
          highest-quality and most-diverse set, but for a given dataset and
          algorithm there is only one set that greedy can generate.

    The sampling method is chosen automatically, based on which features
    were given to create(). One of the following four algorithms is used:

    - *"random"*: If no quality or similarity features are given. Returns a
      completely random set of items, with no reference to item qualities
      or similarities. Note that the greedy option is undefined for a
      random sampler, so it is ignored.

    - *"quality-only"*: If only a quality feature is given. Generates a
      distribution over items based on their quality, and samples from this
      distribution. If greedy is specified, the top-k items in terms of
      quality are returned. (See the sketch after this function for an
      illustration.)

    - *"vertex-cover"*: If only similarity features are given. If an SFrame
      is given, an internal graph is generated in which each item is
      connected to its nearest neighbors, the number of which is determined
      by the diversity factor. When an item is sampled at random, its
      neighbors are removed from the candidate set. If an SGraph is given
      initially, all vertices connected to a sampled point are removed.
      Note that the greedy option is undefined for this algorithm, so it is
      ignored.

    - *"weighted_vertex_cover"*: The same as vertex cover, except each
      vertex has an associated quality field. The next point is sampled
      from a distribution over the remaining points' qualities. If greedy
      is specified, the next point is the remaining point with the highest
      quality.

    Examples
    --------
    Sample k items directly from the reference set passed in via create()
    with the default sampling method:

    >>> cars = graphlab.SFrame.read_csv(
    ...     'https://s3.amazonaws.com/dato-datasets/auto-mpg/auto-mpg.csv')
    >>> sampler = graphlab.diverse_sampler.create(data=cars,
    ...     item_id='name',
    ...     quality_feature='accel',
    ...     similarity_features=['mpg', 'displ', 'hp', 'weight', 'origin'])
    >>> sampler.sample(k=5)
    +-----+-----+-------+-----+--------+-------+----+--------+----------------+
    | mpg | cyl | displ | hp  | weight | accel | yr | origin |      name      |
    +-----+-----+-------+-----+--------+-------+----+--------+----------------+
    | 26  |  4  | 121.0 | 113 |  2234  |  12.5 | 70 |   2    | bmw 2002       |
    | 18  |  6  | 232.0 | 100 |  2945  |  16.0 | 73 |   1    | amc hornet     |
    | 24  |  4  | 116.0 |  75 |  2158  |  15.5 | 73 |   2    | opel manta     |
    | 36  |  4  |  98.0 |  70 |  2125  |  17.3 | 82 |   1    | mercury lynx l |
    | 44  |  4  |  97.0 |  52 |  2130  |  24.6 | 82 |   2    | vw pickup      |
    +-----+-----+-------+-----+--------+-------+----+--------+----------------+

    This method returns an SFrame containing the sampled items. If the
    diverse sampler was created with an SGraph, the returned SFrame
    contains the sampled vertices and their associated fields.

    Instead of stochastic sampling, you can also force the algorithm to try
    to form the best possible set by using the greedy method:

    >>> sampler.sample(k=5, greedy=True)
    +-----+-----+-------+----+--------+-------+----+--------+-------------------------------+
    | mpg | cyl | displ | hp | weight | accel | yr | origin |              name             |
    +-----+-----+-------+----+--------+-------+----+--------+-------------------------------+
    | 19  |  4  | 120.0 | 88 |  3270  |  21.9 | 76 |   2    | peugeot 504                   |
    | 27  |  4  | 141.0 | 71 |  3190  |  24.8 | 79 |   2    | peugeot 504                   |
    | 23  |  8  | 260.0 | 90 |  3420  |  22.2 | 79 |   1    | oldsmobile cutlass salon b... |
    | 43  |  4  |  90.0 | 48 |  2335  |  23.7 | 80 |   2    | vw dasher (diesel)            |
    | 44  |  4  |  97.0 | 52 |  2130  |  24.6 | 82 |   2    | vw pickup                     |
    +-----+-----+-------+----+--------+-------+----+--------+-------------------------------+

    In this example, two Peugeot cars were selected. Although they are
    somewhat different based on the original similarity features we
    specified, it's possible to get an even more diverse sample by
    increasing the "diversity" keyword (which can range between 0 and 1).
    Larger values will favor reducing inter-item similarity (increasing
    diversity), while smaller values will favor high-quality items
    (decreasing diversity).

    >>> sampler.sample(k=5, diversity=0.8, greedy=True)
    +-----+-----+-------+-----+--------+-------+----+--------+-------------------------------+
    | mpg | cyl | displ | hp  | weight | accel | yr | origin |              name             |
    +-----+-----+-------+-----+--------+-------+----+--------+-------------------------------+
    | 27  |  4  |  97.0 |  60 |  1834  |  19.0 | 71 |   2    | volkswagen model 111          |
    | 32  |  4  |  71.0 |  65 |  1836  |  21.0 | 74 |   3    | toyota corolla 1200           |
    | 17  |  6  | 231.0 | 110 |  3907  |  21.0 | 75 |   1    | buick century                 |
    | 27  |  4  | 141.0 |  71 |  3190  |  24.8 | 79 |   2    | peugeot 504                   |
    | 23  |  8  | 260.0 |  90 |  3420  |  22.2 | 79 |   1    | oldsmobile cutlass salon b... |
    +-----+-----+-------+-----+--------+-------+----+--------+-------------------------------+

    Finally, if you want to restrict the reference set to a smaller subset,
    you can pass in a list of IDs with the "subset_ids" keyword:

    >>> ford_names = graphlab.SArray([n for n in cars['name'] if 'ford' in n])
    >>> sampler.sample(5, diversity=1.0, subset_ids=ford_names)
    +-----------------------+-----+-----+-------+-----+--------+-------+----+--------+
    |          name         | mpg | cyl | displ | hp  | weight | accel | yr | origin |
    +-----------------------+-----+-----+-------+-----+--------+-------+----+--------+
    | ford gran torino (sw) | 13  |  8  | 302.0 | 140 |  4294  |  16.0 | 72 |   1    |
    | ford maverick         | 15  |  6  | 250.0 |  72 |  3158  |  19.5 | 75 |   1    |
    | ford fiesta           | 36  |  4  |  98.0 |  66 |  1800  |  14.4 | 78 |   1    |
    | ford escort 2h        | 29  |  4  |  98.0 |  65 |  2380  |  20.7 | 81 |   1    |
    | ford fairmont futura  | 24  |  4  | 140.0 |  92 |  2865  |  16.4 | 82 |   1    |
    +-----------------------+-----+-----+-------+-----+--------+-------+----+--------+
    """
    _raise_error_if_not_of_type(k, int)
    if k <= 0:
        raise ValueError("k must be greater than 0.")
    if subset_ids is not None:
        _raise_error_if_not_of_type(subset_ids, _gl.SArray)
    if diversity < 0.0 or diversity > 1.0:
        raise ValueError("The diversity parameter must be between 0.0 and 1.0.")

    opts = dict()
    opts["diversity"] = diversity
    if "wvc_neighbors" in kwargs:
        opts["num_neighbors"] = kwargs["wvc_neighbors"]
    if "greedy" in kwargs:
        opts["greedy"] = kwargs["greedy"]

    if subset_ids is None:
        return self.__proxy__.sample_from_ground_set(k, opts)
    else:
        return self.__proxy__.sample_from_id_subset(k, subset_ids, opts)
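# A short sketch of the "quality-only" strategy described in the docstring
# above: form a distribution proportional to item quality, then either sample
# from it or (greedy) take the top-k. Assumes NumPy; the helper name is
# illustrative only, not toolkit code.
import numpy as np

def _quality_only_sketch(qualities, k, greedy=False):
    q = np.asarray(qualities, dtype=float)   # qualities must be non-negative
    if greedy:
        return np.argsort(-q)[:k]            # top-k items by quality
    p = q / q.sum()                          # sampling distribution
    return np.random.choice(len(q), size=k, replace=False, p=p)

# e.g. _quality_only_sketch([0.1, 0.7, 0.2, 0.9], k=2) tends to pick items
# 3 and 1, and always does so when greedy=True.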
def create(data, features=None, bm25_k1=1.5, bm25_b=0.75, tfidf_threshold=0.01):
    """
    Create a searchable index of text columns in an SFrame.

    .. warning:: This toolkit is currently in beta, and feedback is
       welcome! Please send comments to [email protected].

    Parameters
    ----------
    data : SFrame
        An SFrame containing at least one str column containing text that
        should be indexed.

    features : list of str
        A list of column names that contain text that should be indexed.
        Default: all str columns in the provided dataset.

    bm25_k1 : float
        Tuning parameter for the relative importance of term frequencies
        when computing the BM25 score between a query token and a document.

    bm25_b : float
        Tuning parameter to downweight scores of long documents when
        computing the BM25 score between a query token and a document.

    tfidf_threshold : float
        Tuning parameter to skip indexing words that have a TF-IDF score
        below this value.

    Returns
    -------
    out : SearchModel

    See Also
    --------
    SearchModel.query

    Notes
    -----
    Query-expansion parameters (the maximum number of nearby words per
    query token, the maximum distance for a near match, and the near-match
    score multiplier) are set at query time; see
    :func:`SearchModel.query`.

    References
    ----------
    Christopher D. Manning, Prabhakar Raghavan, and Hinrich Schutze.
    Introduction to Information Retrieval.
    http://nlp.stanford.edu/IR-book/pdf/irbookonlinereading.pdf

    Examples
    --------
    >>> import graphlab as gl
    >>> sf = gl.SFrame({'text': ['Hello my friend', 'I love this burrito']})
    >>> m = gl._internal.search.create(sf)
    >>> print m.query('burrito')
    """
    # Input validation on data and features. Validate data first, since
    # _get_str_columns assumes an SFrame.
    _raise_error_if_not_of_type(data, [_gl.SFrame])

    if features is None:
        features = _get_str_columns(data)
    _raise_error_if_not_of_type(features, [list])

    for f in features:
        if data[f].dtype() != str:
            raise _ToolkitError("Feature `%s` must be of type str" % f)

    # Store options
    options = {}
    options['bm25_b'] = bm25_b
    options['bm25_k1'] = bm25_k1
    options['tfidf_threshold'] = tfidf_threshold

    # Construct model
    proxy = _gl.extensions._SearchIndex()
    proxy.init_options(options)
    proxy.init_indexer(data)

    # Index each requested feature column
    for f in features:
        proxy.index(f)

    return SearchModel(proxy)
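# A compact sketch of the standard (Okapi) BM25 score that the bm25_k1 and
# bm25_b parameters above tune: k1 governs term-frequency saturation, b
# governs document-length normalization. This shows the conventional formula
# and is an assumption about, not a copy of, the toolkit's internals.
import math

def _bm25_sketch(query_tokens, doc, corpus, k1=1.5, b=0.75):
    # doc: token list for one document; corpus: list of token lists.
    N = len(corpus)
    avgdl = sum(len(d) for d in corpus) / float(N)
    score = 0.0
    for t in set(query_tokens):
        df = sum(1 for d in corpus if t in d)      # document frequency
        if df == 0:
            continue
        idf = math.log(1.0 + (N - df + 0.5) / (df + 0.5))
        tf = doc.count(t)                          # term frequency in doc
        score += idf * tf * (k1 + 1.0) / \
            (tf + k1 * (1.0 - b + b * len(doc) / avgdl))
    return score

# e.g. with corpus = [['hello', 'my', 'friend'], ['i', 'love', 'this',
# 'burrito']], _bm25_sketch(['burrito'], corpus[1], corpus) is positive while
# _bm25_sketch(['burrito'], corpus[0], corpus) is 0.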
def __init__(self, features=None, excluded_features=None, n=2, method="word",
             to_lower=True, ignore_punct=True, ignore_space=True,
             delimiters=["\r", "\v", "\n", "\f", "\t", " ",
                         "!", "#", "$", "%", "&", "'", "(", ")",
                         "*", "+", ",", "-", ".", "/", ":", ";",
                         "<", "=", ">", "?", "@", "[", "\\", "]",
                         "^", "_", "`", "{", "|", "}", "~"],
             output_column_prefix=None):

    # Process and make a copy of the features, exclude.
    _features, _exclude = _internal_utils.process_features(features, excluded_features)

    # Type checking
    _raise_error_if_not_of_type(features, [list, str, _NoneType])
    _raise_error_if_not_of_type(excluded_features, [list, str, _NoneType])
    _raise_error_if_not_of_type(n, [int])
    _raise_error_if_not_of_type(method, [str])
    _raise_error_if_not_of_type(to_lower, [bool])
    _raise_error_if_not_of_type(ignore_punct, [bool])
    _raise_error_if_not_of_type(ignore_space, [bool])
    _raise_error_if_not_of_type(delimiters, [list, _NoneType])
    _raise_error_if_not_of_type(output_column_prefix, [str, _NoneType])

    if delimiters is not None:
        for delim in delimiters:
            _raise_error_if_not_of_type(delim, str, "delimiters")
            if len(delim) != 1:
                raise ValueError("Delimiters must be single-character strings")

    if n < 1:
        raise ValueError("Input 'n' must be greater than 0")

    if n > 5 and method == 'word':
        warnings.warn("It is unusual for n-grams to be of size larger than 5.")

    if method != "word" and method != "character":
        raise ValueError("Invalid 'method' input value. "
                         "Please input either 'word' or 'character'.")

    # Set up options. The 'features' key is filled in below, depending on
    # whether an exclusion list was given.
    opts = {
        'n': n,
        'ngram_type': method,
        'to_lower': to_lower,
        'ignore_punct': ignore_punct,
        'ignore_space': ignore_space,
        'delimiters': delimiters,
        'output_column_prefix': output_column_prefix
    }
    if _exclude:
        opts['exclude'] = True
        opts['features'] = _exclude
    else:
        opts['exclude'] = False
        opts['features'] = _features

    # Initialize object
    proxy = _gl.extensions._NGramCounter()
    proxy.init_transformer(opts)
    super(NGramCounter, self).__init__(proxy, self.__class__)
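# A minimal sketch of the word n-gram counting that the options above
# configure: lowercase, split on single-character delimiters, then count
# sliding windows of n tokens. It mirrors the documented behavior but is not
# the _NGramCounter implementation; the helper name is illustrative only.
from collections import Counter

def _word_ngrams_sketch(text, n=2, to_lower=True, delimiters=(" ", "\t", "\n")):
    if to_lower:
        text = text.lower()
    for d in delimiters:                 # each delimiter is one character
        text = text.replace(d, " ")
    tokens = text.split()
    return Counter(" ".join(tokens[i:i + n])
                   for i in range(len(tokens) - n + 1))

# _word_ngrams_sketch("The quick brown fox", n=2)
# -> Counter({'the quick': 1, 'quick brown': 1, 'brown fox': 1})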