def tag(self, dataset, query_name=None, k=5, similarity_threshold=None,
        exclude_zeros=True, verbose=True):
    """
    Match the reference tags passed when a model is created to a new set of
    queries. This is a many-to-many match: each query may have any number of
    occurrences of a reference tag.

    Parameters
    ----------
    dataset : SFrame
        Query data to be tagged.

    query_name : string, optional
        Name of the column in ``dataset`` to be auto-tagged. If ``dataset``
        has more than one column, ``query_name`` must be specified.

    k : int, optional
        Number of results to return from the reference set for each query
        observation. The default is 5, but setting it to ``None`` will
        return all results whose score is greater than or equal to
        ``similarity_threshold``.

    similarity_threshold : float, optional
        Only results whose score is greater than or equal to the specified
        ``similarity_threshold`` are returned. The default is ``None``, in
        which case the ``k`` best results are returned for each query point.

    verbose : bool, optional
        If True, print progress updates and model details.

    exclude_zeros : boolean, optional
        If True, only entries for which there is a tag with a nonzero score
        are preserved in the output. This is the default behavior.

    Returns
    -------
    out : SFrame
        An SFrame with four columns:

        - row ID
        - column name specified as `tag_name` parameter to `create` method
        - column name specified as `query_name` parameter to `tag` method
        - a similarity score between 0 and 1, indicating the strength of the
          match between the query data and the suggested reference tag,
          where a score of zero indicates a poor match and a strength of 1
          corresponds to a perfect match

    Notes
    -----
    - By default, only rows for which there is a tag with a nonzero score
      are included in the output. To guarantee at least one output row for
      every input row in ``dataset``, set the ``exclude_zeros`` parameter
      to False.

    - If both ``k`` and ``similarity_threshold`` are set to ``None``, a
      ToolkitError is raised.

    Examples
    --------
    First construct a toy `SFrame` of actor names, which will serve as the
    reference set for our autotagger model.

    >>> actors_sf = gl.SFrame(
            {"actor": ["Will Smith", "Tom Hanks", "Bradley Cooper",
                       "Tom Cruise", "Jude Law", "Robert Pattinson",
                       "Matt Damon", "Brad Pitt", "Johnny Depp",
                       "Leonardo DiCaprio", "Jennifer Aniston",
                       "Jessica Alba", "Emma Stone", "Cameron Diaz",
                       "Scarlett Johansson", "Mila Kunis", "Julia Roberts",
                       "Charlize Theron", "Marion Cotillard",
                       "Angelina Jolie"]})
    >>> m = gl.data_matching.autotagger.create(actors_sf, tag_name="actor")

    Then we load some IMDB movie reviews into an `SFrame` and tag them using
    the model we created above. The score field in the output is a
    similarity score, indicating the strength of the match between the query
    data and the suggested reference tag.

    >>> reviews_sf = gl.SFrame(
            "https://static.turi.com/datasets/imdb_reviews/reviews.sframe")
    >>> m.tag(reviews_sf.head(10), query_name="review", verbose=False)
    +-----------+-------------------------------+------------------+-----------------+
    | review_id |             review            |      actor       |      score      |
    +-----------+-------------------------------+------------------+-----------------+
    |     0     | Story of a man who has unn... |   Cameron Diaz   | 0.0769230769231 |
    |     0     | Story of a man who has unn... |  Angelina Jolie  | 0.0666666666667 |
    |     0     | Story of a man who has unn... | Charlize Theron  |      0.0625     |
    |     0     | Story of a man who has unn... | Robert Pattinson | 0.0588235294118 |
    |     1     | Bromwell High is a cartoon... |   Jessica Alba   |      0.125      |
    |     1     | Bromwell High is a cartoon... | Jennifer Aniston |       0.1       |
    |    ...    |              ...              |       ...        |       ...       |
    +-----------+-------------------------------+------------------+-----------------+

    The initial results look a little noisy. To filter out obvious spurious
    matches, we can set the `tag` method's `similarity_threshold` parameter.

    >>> m.tag(reviews_sf.head(1000), query_name="review", verbose=False,
              similarity_threshold=.8)
    +-----------+-------------------------------+------------------+----------------+
    | review_id |             review            |      actor       |     score      |
    +-----------+-------------------------------+------------------+----------------+
    |    341    | I caught this film at a te... |  Julia Roberts   | 0.857142857143 |
    |    657    | Fairly funny Jim Carrey ve... | Jennifer Aniston | 0.882352941176 |
    |    668    | A very funny movie. It was... | Jennifer Aniston | 0.833333333333 |
    |    673    | This film is the best film... | Jennifer Aniston |     0.9375     |
    +-----------+-------------------------------+------------------+----------------+
    """
    _mt._get_metric_tracker().track(self.__module__ + '.tag')

    # validate the 'dataset' input
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    # ensure that either k or similarity_threshold is set
    # NOTE(review): a similarity_threshold of exactly 0 is treated as unset
    # here (falsy) — confirm whether 0 should be a legal threshold.
    if not (k or similarity_threshold):
        raise _ToolkitError("Either k or similarity_threshold parameters "
                            "must be set")

    # ensure that query_name is provided if dataset has > 1 column
    if dataset.num_cols() > 1 and not query_name:
        raise _ToolkitError("No query_name parameter specified on "
                            "dataset with %d columns" % dataset.num_cols())

    query_column = query_name or dataset.column_names()[0]

    # ensure that a column with name query_column exists
    if query_column not in dataset.column_names():
        raise _ToolkitError('No column named "%s" in dataset'
                            % query_column)

    # keep the raw query text alongside a synthetic row id so results can be
    # joined back to the original queries after the NN lookup
    query_sa = dataset.select_column(query_column)
    query_sf = _gl.SFrame({"id": range(len(query_sa)),
                           query_column: query_sa})

    features = _preprocess(query_sa)
    features = features.add_row_number()

    if similarity_threshold:
        if not isinstance(similarity_threshold, (float, int)):
            # BUGFIX: original message concatenated to "must be afloat" —
            # a space was missing between the two string fragments.
            raise _ToolkitError("similarity_threshold parameter must be a "
                                "float or an int.")

        if similarity_threshold < 0 or similarity_threshold > 1:
            raise _ToolkitError("similarity_threshold parameter must be "
                                "between 0 and 1.")

    # similarity score s maps to NN distance 1 - s for weighted Jaccard
    radius = (1 - similarity_threshold) if similarity_threshold else None

    results = self.__proxy__['nearest_neighbors_model'].query(
        features, label="id", k=k, radius=radius, verbose=verbose)

    # return empty SFrame immediately if no NN results
    if len(results) == 0:
        return _gl.SFrame({query_column + "_id": [],
                           query_column: [],
                           self.get("tag_name"): [],
                           "score": []})

    # join the NN results to the original query text
    results = results.join(query_sf, on={"query_label": "id"})
    results.rename({"query_label": query_column + "_id"})
    results.rename({query_column: "query_label"})

    # convert distances to similarity scores
    scores = _dists_to_sim_scores("weighted_jaccard", results)

    results.add_column(scores, "score")
    results.remove_column("distance")
    results.remove_column("rank")
    results.rename({"reference_label": self.get("tag_name"),
                    "query_label": query_column})
    results.swap_columns(self.get("tag_name"), query_column)

    if exclude_zeros:
        try:
            results = results.filter_by(0.0, "score", exclude=True)
        except RuntimeError:  # nothing to join
            # use warning() — warn() is a deprecated alias in logging
            _logging.getLogger(__name__).warning(
                "Empty results after filtering scores of 0.")
            results = results.head(0)

    return results
def tag(self, dataset, query_name=None, k=5, similarity_threshold=None,
        exclude_zeros=True, verbose=True):
    """
    Match the reference tags passed when a model is created to a new set of
    queries. This is a many-to-many match: each query may have any number of
    occurrences of a reference tag.

    Parameters
    ----------
    dataset : SFrame
        Query data to be tagged.

    query_name : string, optional
        Name of the column in ``dataset`` to be auto-tagged. If ``dataset``
        has more than one column, ``query_name`` must be specified.

    k : int, optional
        Number of results to return from the reference set for each query
        observation. The default is 5, but setting it to ``None`` will
        return all results whose score is greater than or equal to
        ``similarity_threshold``.

    similarity_threshold : float, optional
        Only results whose score is greater than or equal to the specified
        ``similarity_threshold`` are returned. The default is ``None``, in
        which case the ``k`` best results are returned for each query point.

    verbose : bool, optional
        If True, print progress updates and model details.

    exclude_zeros : boolean, optional
        If True, only entries for which there is a tag with a nonzero score
        are preserved in the output. This is the default behavior.

    Returns
    -------
    out : SFrame
        An SFrame with four columns:

        - row ID
        - column name specified as `tag_name` parameter to `create` method
        - column name specified as `query_name` parameter to `tag` method
        - a similarity score between 0 and 1, indicating the strength of the
          match between the query data and the suggested reference tag,
          where a score of zero indicates a poor match and a strength of 1
          corresponds to a perfect match

    Notes
    -----
    - By default, only rows for which there is a tag with a nonzero score
      are included in the output. To guarantee at least one output row for
      every input row in ``dataset``, set the ``exclude_zeros`` parameter
      to False.

    - If both ``k`` and ``similarity_threshold`` are set to ``None``, a
      ToolkitError is raised.

    Examples
    --------
    First construct a toy `SFrame` of actor names, which will serve as the
    reference set for our autotagger model.

    >>> actors_sf = gl.SFrame(
            {"actor": ["Will Smith", "Tom Hanks", "Bradley Cooper",
                       "Tom Cruise", "Jude Law", "Robert Pattinson",
                       "Matt Damon", "Brad Pitt", "Johnny Depp",
                       "Leonardo DiCaprio", "Jennifer Aniston",
                       "Jessica Alba", "Emma Stone", "Cameron Diaz",
                       "Scarlett Johansson", "Mila Kunis", "Julia Roberts",
                       "Charlize Theron", "Marion Cotillard",
                       "Angelina Jolie"]})
    >>> m = gl.data_matching.autotagger.create(actors_sf, tag_name="actor")

    Then we load some IMDB movie reviews into an `SFrame` and tag them using
    the model we created above. The score field in the output is a
    similarity score, indicating the strength of the match between the query
    data and the suggested reference tag.

    >>> reviews_sf = gl.SFrame(
            "s3://dato-datasets/imdb_reviews/reviews.sframe")
    >>> m.tag(reviews_sf.head(10), query_name="review", verbose=False)
    +-----------+-------------------------------+------------------+-----------------+
    | review_id |             review            |      actor       |      score      |
    +-----------+-------------------------------+------------------+-----------------+
    |     0     | Story of a man who has unn... |   Cameron Diaz   | 0.0769230769231 |
    |     0     | Story of a man who has unn... |  Angelina Jolie  | 0.0666666666667 |
    |     0     | Story of a man who has unn... | Charlize Theron  |      0.0625     |
    |     0     | Story of a man who has unn... | Robert Pattinson | 0.0588235294118 |
    |     1     | Bromwell High is a cartoon... |   Jessica Alba   |      0.125      |
    |     1     | Bromwell High is a cartoon... | Jennifer Aniston |       0.1       |
    |    ...    |              ...              |       ...        |       ...       |
    +-----------+-------------------------------+------------------+-----------------+

    The initial results look a little noisy. To filter out obvious spurious
    matches, we can set the `tag` method's `similarity_threshold` parameter.

    >>> m.tag(reviews_sf.head(1000), query_name="review", verbose=False,
              similarity_threshold=.8)
    +-----------+-------------------------------+------------------+----------------+
    | review_id |             review            |      actor       |     score      |
    +-----------+-------------------------------+------------------+----------------+
    |    341    | I caught this film at a te... |  Julia Roberts   | 0.857142857143 |
    |    657    | Fairly funny Jim Carrey ve... | Jennifer Aniston | 0.882352941176 |
    |    668    | A very funny movie. It was... | Jennifer Aniston | 0.833333333333 |
    |    673    | This film is the best film... | Jennifer Aniston |     0.9375     |
    +-----------+-------------------------------+------------------+----------------+
    """
    _mt._get_metric_tracker().track(self.__module__ + '.tag')

    # validate the 'dataset' input
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    # ensure that either k or similarity_threshold is set
    # NOTE(review): a similarity_threshold of exactly 0 is treated as unset
    # here (falsy) — confirm whether 0 should be a legal threshold.
    if not (k or similarity_threshold):
        raise _ToolkitError("Either k or similarity_threshold parameters "
                            "must be set")

    # ensure that query_name is provided if dataset has > 1 column
    if dataset.num_cols() > 1 and not query_name:
        raise _ToolkitError("No query_name parameter specified on "
                            "dataset with %d columns" % dataset.num_cols())

    query_column = query_name or dataset.column_names()[0]

    # ensure that a column with name query_column exists
    if query_column not in dataset.column_names():
        raise _ToolkitError('No column named "%s" in dataset'
                            % query_column)

    # keep the raw query text alongside a synthetic row id so results can be
    # joined back to the original queries after the NN lookup
    query_sa = dataset.select_column(query_column)
    query_sf = _gl.SFrame({"id": range(len(query_sa)),
                           query_column: query_sa})

    features = _preprocess(query_sa)
    features = features.add_row_number()

    if similarity_threshold:
        if not isinstance(similarity_threshold, (float, int)):
            # BUGFIX: original message concatenated to "must be afloat" —
            # a space was missing between the two string fragments.
            raise _ToolkitError("similarity_threshold parameter must be a "
                                "float or an int.")

        if similarity_threshold < 0 or similarity_threshold > 1:
            raise _ToolkitError("similarity_threshold parameter must be "
                                "between 0 and 1.")

    # similarity score s maps to NN distance 1 - s for weighted Jaccard
    radius = (1 - similarity_threshold) if similarity_threshold else None

    results = self._nn_model.query(features, label="id", k=k,
                                   radius=radius, verbose=verbose)

    # return empty SFrame immediately if no NN results
    if len(results) == 0:
        return _gl.SFrame({query_column + "_id": [],
                           query_column: [],
                           self.get("tag_name"): [],
                           "score": []})

    # join the NN results to the original query text
    results = results.join(query_sf, on={"query_label": "id"})
    results.rename({"query_label": query_column + "_id",
                    query_column: "query_label"})

    # convert distances to similarity scores
    scores = _dists_to_sim_scores("weighted_jaccard", results)

    results.add_column(scores, "score")
    results.remove_column("distance")
    results.remove_column("rank")
    results.rename({"reference_label": self.get("tag_name"),
                    "query_label": query_column})
    results.swap_columns(self.get("tag_name"), query_column)

    if exclude_zeros:
        try:
            results = results.filter_by(0.0, "score", exclude=True)
        except RuntimeError:  # nothing to join
            # use warning() — warn() is a deprecated alias in logging
            _logging.getLogger(__name__).warning(
                "Empty results after filtering scores of 0.")
            results = results.head(0)

    return results
def create(dataset, tag_name=None, features=None, verbose=True):
    """
    Create a :class:`NearestNeighborAutoTagger` model, which can be used to
    quickly apply tags from a reference set of text labels to a new query
    set using the ``tag`` method.

    Parameters
    ----------
    dataset : SFrame
        Reference data. This SFrame must contain at least one column. By
        default, only the ``tag_name`` column is used as the basis for
        tagging. You may optionally include additional columns with the
        ``features`` parameter.

    tag_name : string, optional
        Name of the column in ``dataset`` with the tags. This column must
        contain string values. If ``dataset`` contains more than one column,
        ``tag_name`` must be specified.

    features : list[string], optional
        Names of the columns with features to use as the basis for tagging.
        'None' (the default) indicates that only the column specified by the
        ``tag_name`` parameter should be used. Only str or list fields are
        allowed. If a column of type list is specified, all values must be
        either of type string or convertible to type string.

    verbose : bool, optional
        If True, print verbose output during model creation.

    Returns
    -------
    out : model
        A model for quickly tagging new query observations with entries from
        `dataset`. Currently, the only implementation is the following:

        - NearestNeighborAutoTagger

    See Also
    --------
    graphlab.nearest_neighbors.NearestNeighborsModel

    Examples
    --------
    First construct a toy `SFrame` of actor names, which will serve as the
    reference set for our autotagger model.

    >>> actors_sf = gl.SFrame(
            {"actor": ["Will Smith", "Tom Hanks", "Bradley Cooper",
                       "Tom Cruise", "Jude Law", "Robert Pattinson",
                       "Matt Damon", "Brad Pitt", "Johnny Depp",
                       "Leonardo DiCaprio", "Jennifer Aniston",
                       "Jessica Alba", "Emma Stone", "Cameron Diaz",
                       "Scarlett Johansson", "Mila Kunis", "Julia Roberts",
                       "Charlize Theron", "Marion Cotillard",
                       "Angelina Jolie"]})
    >>> m = gl.data_matching.nearest_neighbor_autotagger.create(
            actors_sf, tag_name="actor")

    Then we load some IMDB movie reviews into an `SFrame` and tag them using
    the model we created above. The score field in the output is a
    similarity score, indicating the strength of the match between the query
    data and the suggested reference tag.

    >>> reviews_sf = gl.SFrame(
            "https://static.turi.com/datasets/imdb_reviews/reviews.sframe")
    >>> m.tag(reviews_sf.head(10), query_name="review", verbose=False)
    +-----------+-------------------------------+------------------+-----------------+
    | review_id |             review            |      actor       |      score      |
    +-----------+-------------------------------+------------------+-----------------+
    |     0     | Story of a man who has unn... |   Cameron Diaz   | 0.0769230769231 |
    |     0     | Story of a man who has unn... |  Angelina Jolie  | 0.0666666666667 |
    |     0     | Story of a man who has unn... | Charlize Theron  |      0.0625     |
    |     0     | Story of a man who has unn... | Robert Pattinson | 0.0588235294118 |
    |     1     | Bromwell High is a cartoon... |   Jessica Alba   |      0.125      |
    |     1     | Bromwell High is a cartoon... | Jennifer Aniston |       0.1       |
    |    ...    |              ...              |       ...        |       ...       |
    +-----------+-------------------------------+------------------+-----------------+

    The initial results look a little noisy. To filter out obvious spurious
    matches, we can set the `tag` method's similarity_threshold parameter.

    >>> m.tag(reviews_sf.head(1000), query_name="review", verbose=False,
              similarity_threshold=.8)
    +-----------+-------------------------------+------------------+----------------+
    | review_id |             review            |      actor       |     score      |
    +-----------+-------------------------------+------------------+----------------+
    |    341    | I caught this film at a te... |  Julia Roberts   | 0.857142857143 |
    |    657    | Fairly funny Jim Carrey ve... | Jennifer Aniston | 0.882352941176 |
    |    668    | A very funny movie. It was... | Jennifer Aniston | 0.833333333333 |
    |    673    | This film is the best film... | Jennifer Aniston |     0.9375     |
    +-----------+-------------------------------+------------------+----------------+

    In this second example, you'll notice that the ``review_id`` column is
    much more sparse. This is because all results whose score was below the
    specified similarity threshold (.8) were excluded from the output.
    """
    # validate the 'dataset' input
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    # ensure that tag_name is provided if dataset has > 1 column
    if dataset.num_cols() > 1 and not tag_name:
        raise _ToolkitError("No tag_name parameter specified on dataset "
                            "with %d columns" % dataset.num_cols())

    tag_name = tag_name or dataset.column_names()[0]

    # ensure that column with name tag_name exists
    if tag_name not in dataset.column_names():
        raise _ToolkitError('No column named "%s" in dataset' % tag_name)

    # ensure that column is of type string
    if dataset[tag_name].dtype() != str:
        raise TypeError("The column used as the tag name must be of type "
                        "string.")

    # use reasonable default for general case
    distance = _gl.distances.weighted_jaccard

    # if additional features are specified, ensure they form a list of
    # strings.
    # BUGFIX: the original condition (`not isinstance(...) and all(...)`)
    # only fired when `features` was a non-list whose elements were all
    # strings, and silently accepted a list containing non-strings.
    if features is not None and \
        (not isinstance(features, list) or
         not all(isinstance(x, str) for x in features)):
        raise TypeError("The features parameter must be a list of strings "
                        "and those strings must correspond to columns in "
                        "`dataset`.")

    # at a minimum, this SFrame will contain the tags as features;
    # the tag column always comes first and is never duplicated
    features = features or []
    features = [tag_name] + [x for x in features if x != tag_name]

    # ensure that each specified feature column is either of type list or str
    column_names = set(dataset.column_names())
    for col_name in features:
        if col_name not in column_names:
            raise _ToolkitError("Specified feature column (%s) not found "
                                "in dataset" % col_name)

        if dataset.select_column(col_name).dtype() not in (str, list):
            raise TypeError("Only string and list columns are allowed as "
                            "features.")

    # concatenate the feature columns into a single column
    features_sf = dataset.select_columns(features)
    feature_col, features_sf = _concat_string_features(features_sf, features)

    # compute features
    if verbose:
        _logging.getLogger().info("Extracting features...")

    features = _preprocess(features_sf.select_column(feature_col))

    # group by tag_name to ensure that tags are unique
    feature_cols = features.column_names()
    select_cols = {col_name: _gl.aggregate.SELECT_ONE(col_name)
                   for col_name in feature_cols}
    features.add_column(dataset[tag_name], tag_name)
    features = features.groupby(tag_name, select_cols)

    # create nearest neighbors model
    m = _gl.nearest_neighbors.create(features, label=tag_name,
                                     distance=distance,
                                     features=feature_cols,
                                     verbose=verbose)

    # add standard toolkit state attributes
    state = {"nearest_neighbors_model": m,
             "training_time": m.get("training_time"),
             "tag_name": tag_name,
             "verbose": verbose,
             "num_examples": len(features),
             "features": feature_cols,
             "num_features": len(feature_cols),
             "distance": m.get("distance")}

    model = NearestNeighborAutoTagger(state)
    return model
def create(dataset, tag_name=None, features=None, verbose=True):
    """
    Create a NearestNeighborAutotagger model, which can be used to quickly
    apply tags from a reference set of text labels to a new query set using
    the ``tag`` method.

    Parameters
    ----------
    dataset : SFrame
        Reference data. This SFrame must contain at least one column. By
        default, only the ``tag_name`` column is used as the basis for
        tagging. You may optionally include additional columns with the
        ``features`` parameter.

    tag_name : string, optional
        Name of the column in ``dataset`` with the tags. This column must
        contain string values. If ``dataset`` contains more than one column,
        ``tag_name`` must be specified.

    features : list[string], optional
        Names of the columns with features to use as the basis for tagging.
        'None' (the default) indicates that only the column specified by the
        ``tag_name`` parameter should be used. Only str or list fields are
        allowed. If a column of type list is specified, all values must be
        either of type string or convertible to type string.

    verbose : bool, optional
        If True, print verbose output during model creation.

    Returns
    -------
    out : model
        A model for quickly tagging new query observations with entries from
        `dataset`. Currently, the only implementation is the following:

        - NearestNeighborAutoTagger

    See Also
    --------
    NearestNeighborsModel

    Examples
    --------
    First construct a toy `SFrame` of actor names, which will serve as the
    reference set for our autotagger model.

    >>> actors_sf = gl.SFrame(
            {"actor": ["Will Smith", "Tom Hanks", "Bradley Cooper",
                       "Tom Cruise", "Jude Law", "Robert Pattinson",
                       "Matt Damon", "Brad Pitt", "Johnny Depp",
                       "Leonardo DiCaprio", "Jennifer Aniston",
                       "Jessica Alba", "Emma Stone", "Cameron Diaz",
                       "Scarlett Johansson", "Mila Kunis", "Julia Roberts",
                       "Charlize Theron", "Marion Cotillard",
                       "Angelina Jolie"]})
    >>> m = gl.data_matching.nearest_neighbor_autotagger.create(
            actors_sf, tag_name="actor")

    Then we load some IMDB movie reviews into an `SFrame` and tag them using
    the model we created above. The score field in the output is a
    similarity score, indicating the strength of the match between the query
    data and the suggested reference tag.

    >>> reviews_sf = gl.SFrame(
            "s3://dato-datasets/imdb_reviews/reviews.sframe")
    >>> m.tag(reviews_sf.head(10), query_name="review", verbose=False)
    +-----------+-------------------------------+------------------+-----------------+
    | review_id |             review            |      actor       |      score      |
    +-----------+-------------------------------+------------------+-----------------+
    |     0     | Story of a man who has unn... |   Cameron Diaz   | 0.0769230769231 |
    |     0     | Story of a man who has unn... |  Angelina Jolie  | 0.0666666666667 |
    |     0     | Story of a man who has unn... | Charlize Theron  |      0.0625     |
    |     0     | Story of a man who has unn... | Robert Pattinson | 0.0588235294118 |
    |     1     | Bromwell High is a cartoon... |   Jessica Alba   |      0.125      |
    |     1     | Bromwell High is a cartoon... | Jennifer Aniston |       0.1       |
    |    ...    |              ...              |       ...        |       ...       |
    +-----------+-------------------------------+------------------+-----------------+

    The initial results look a little noisy. To filter out obvious spurious
    matches, we can set the `tag` method's similarity_threshold parameter.

    >>> m.tag(reviews_sf.head(1000), query_name="review", verbose=False,
              similarity_threshold=.8)
    +-----------+-------------------------------+------------------+----------------+
    | review_id |             review            |      actor       |     score      |
    +-----------+-------------------------------+------------------+----------------+
    |    341    | I caught this film at a te... |  Julia Roberts   | 0.857142857143 |
    |    657    | Fairly funny Jim Carrey ve... | Jennifer Aniston | 0.882352941176 |
    |    668    | A very funny movie. It was... | Jennifer Aniston | 0.833333333333 |
    |    673    | This film is the best film... | Jennifer Aniston |     0.9375     |
    +-----------+-------------------------------+------------------+----------------+

    In this second example, you'll notice that the ``review_id`` column is
    much more sparse. This is because all results whose score was below the
    specified similarity threshold (.8) were excluded from the output.
    """
    # validate the 'dataset' input
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    # ensure that tag_name is provided if dataset has > 1 column
    if dataset.num_cols() > 1 and not tag_name:
        raise _ToolkitError("No tag_name parameter specified on dataset "
                            "with %d columns" % dataset.num_cols())

    tag_name = tag_name or dataset.column_names()[0]

    # ensure that column with name tag_name exists
    if tag_name not in dataset.column_names():
        raise _ToolkitError('No column named "%s" in dataset' % tag_name)

    # ensure that column is of type string
    if dataset[tag_name].dtype() != str:
        raise TypeError("The column used as the tag name must be of type "
                        "string.")

    # use reasonable default for general case
    distance = _gl.distances.weighted_jaccard

    # if additional features are specified, ensure they form a list of
    # strings.
    # BUGFIX: the original condition (`not isinstance(...) and all(...)`)
    # only fired when `features` was a non-list whose elements were all
    # strings, and silently accepted a list containing non-strings.
    if features is not None and \
        (not isinstance(features, list) or
         not all(isinstance(x, str) for x in features)):
        raise TypeError("The feature parameter must be a list of strings "
                        "and those strings must correspond to columns in "
                        "`dataset`.")

    # at a minimum, this SFrame will contain the tags as features;
    # the tag column always comes first and is never duplicated
    features = features or []
    features = [tag_name] + [x for x in features if x != tag_name]

    # ensure that each specified feature column is either of type list or str
    column_names = set(dataset.column_names())
    for col_name in features:
        if col_name not in column_names:
            # BUGFIX: original interpolated undefined name `x` here, which
            # raised NameError instead of the intended ToolkitError.
            raise _ToolkitError("Specified feature column (%s) not found "
                                "in dataset" % col_name)

        if dataset.select_column(col_name).dtype() not in (str, list):
            raise TypeError("Only string and list columns are allowed as "
                            "features.")

    # concatenate the feature columns into a single column
    features_sf = dataset.select_columns(features)
    feature_col, features_sf = _concat_string_features(features_sf, features)

    # compute features
    if verbose:
        _logging.getLogger().info("Extracting features...")

    features = _preprocess(features_sf.select_column(feature_col))

    # group by tag_name to ensure that tags are unique
    feature_cols = features.column_names()
    select_cols = {col_name: _gl.aggregate.SELECT_ONE(col_name)
                   for col_name in feature_cols}
    features.add_column(dataset[tag_name], tag_name)
    features = features.groupby(tag_name, select_cols)

    # create nearest neighbors model
    m = _gl.nearest_neighbors.create(
        features, label=tag_name, distance=distance,
        features=feature_cols, verbose=verbose)

    # add standard toolkit state attributes
    state = {"training_time": m.get("training_time"),
             "tag_name": tag_name,
             "verbose": verbose,
             "num_examples": len(features),
             "features": feature_cols,
             "num_features": len(feature_cols),
             "distance": m.get("distance")}

    model = NearestNeighborAutoTagger(m, state)
    model.summary()
    return model