Example #1
def _run_toolkit_function(fnname, arguments, args, kwargs):
    """
    Dispatches arguments to a toolkit function.

    Parameters
    ----------
    fnname : string
        The toolkit function to run

    arguments : list[string]
        The list of all the arguments the function takes.

    args : list
        The arguments that were passed

    kwargs : dictionary
        The keyword arguments that were passed
    """
    # scan for all the arguments in args
    num_args_got = len(args) + len(kwargs)
    num_args_required = len(arguments)
    if num_args_got != num_args_required:
        raise TypeError("Expecting " + str(num_args_required) +
                        " arguments, got " + str(num_args_got))

    ## fill the dict first with the regular args
    argument_dict = {}
    for i in range(len(args)):
        argument_dict[arguments[i]] = args[i]

    # now fill with the kwargs.
    for k in kwargs.keys():
        if k in argument_dict:
            raise TypeError("Got multiple values for keyword argument '" + k +
                            "'")
        argument_dict[k] = kwargs[k]

    argument_dict = _translate_function_arguments(argument_dict)
    # unwrap it
    with cython_context():
        ret = _gl.connect.main.get_unity().run_toolkit(fnname, argument_dict)
    # handle errors
    if ret[0] != True:
        if len(ret[1]) > 0:
            raise _ToolkitError(ret[1])
        else:
            raise _ToolkitError("Toolkit failed with unknown error")

    ret = _wrap_function_return(ret[2])
    if type(ret) == dict and 'return_value' in ret:
        return ret['return_value']
    else:
        return ret
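
For context, here is a hedged sketch of how this dispatcher might be called. The toolkit function name and its declared argument list below are hypothetical placeholders, not taken from a real toolkit registry.

# Hypothetical call: a toolkit function that declares two arguments.
# Positional values fill the argument dictionary first, then keyword values
# are merged in; duplicates or a wrong total count raise TypeError.
result = _run_toolkit_function(
    'graph_analytics.pagerank',       # fnname (hypothetical toolkit symbol)
    ['graph', 'max_iterations'],      # arguments: declared parameter names
    [my_graph],                       # args: positional values (my_graph is a placeholder)
    {'max_iterations': 20})           # kwargs: keyword values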
    def __init__(self, data, row_label=None, feature=None, feature_model='auto',
                 method='brute_force', verbose=False):

        start_time = _time.time()

        self._state = {'row_label': row_label,
                       'method': method,
                       'verbose': verbose,
                       'features': feature,
                       'num_examples': data.num_rows()}

        if row_label is not None:
            data_subset = data[[feature, row_label]]
        else:
            data_subset = data[[feature]]

        self._feature_type = data_subset[feature].dtype()

        if data_subset[feature].dtype() == _Image:
            prefix = 'extracted'
            extractor = _gl.feature_engineering.DeepFeatureExtractor(
                    features=feature, output_column_prefix=prefix,
                    model=feature_model)
            self._state['output_column_name'] = prefix + '.' + feature
            self._state['feature_model'] = extractor['model']
            self._extractor = extractor.fit(data_subset)
            self._data = self._extractor.transform(data_subset)
        else:
            raise _ToolkitError('Feature type not supported.')


        if method == 'brute_force':
            self._neighbors_model = _gl.toolkits.nearest_neighbors.create(
                self._data, label=row_label,
                features=[self._state['output_column_name']],
                distance='cosine', method='brute_force', verbose=verbose)

        elif method == 'lsh':
            num_tables = 20
            num_projections_per_table = 16
            self._neighbors_model = _gl.toolkits.nearest_neighbors.create(
                self._data, label=row_label,
                features=[self._state['output_column_name']],
                distance='cosine', method='lsh',
                num_tables=num_tables,
                num_projections_per_table=num_projections_per_table,
                verbose=verbose)

        else:
            raise _ToolkitError('Unsupported Method %s' % method)

        self._state['training_time'] = _time.time() - start_time
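
The snippet above is only a constructor; the class it belongs to is not shown on this page. A minimal instantiation sketch follows, assuming a hypothetical class name ImageSimilarityModel and an SFrame whose 'image' column holds graphlab.Image values.

# Hypothetical usage; 'ImageSimilarityModel' and 'image_sf' are assumptions.
model = ImageSimilarityModel(image_sf,
                             row_label='id',        # optional row identifier column
                             feature='image',       # image column to run deep feature extraction on
                             feature_model='auto',  # passed to DeepFeatureExtractor
                             method='lsh',          # or 'brute_force' (the default)
                             verbose=True)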
Example #5
    def __init__(self, *args, **kwargs):
        tkclass_name = getattr(self.__init__, "tkclass_name")
        _proxy = None
        if "_proxy" in kwargs:
            _proxy = kwargs['_proxy']
            del kwargs['_proxy']

        if _proxy:
            self.__dict__['_tkclass'] = _proxy
        elif tkclass_name:
            self.__dict__['_tkclass'] = _gl.connect.main.get_unity().create_toolkit_class(tkclass_name)
        try:
            # fill the functions and properties
            self.__dict__['_functions'] = self._tkclass.get('list_functions')
            self.__dict__['_get_properties'] = self._tkclass.get('list_get_properties')
            self.__dict__['_set_properties'] = self._tkclass.get('list_set_properties')
            # rewrite the doc string for this class
            try:
                self.__dict__['__doc__'] = self._tkclass.get('get_docstring', {'__symbol__':'__doc__'})
                self.__class__.__dict__['__doc__'] = self.__dict__['__doc__']
            except:
                pass
        except:
            raise _ToolkitError("Cannot create Toolkit Class for this class. "
                               "This class was not created with the new toolkit class system.")
        # for compatibility with older classes / models
        self.__dict__['__proxy__'] = self.__dict__['_tkclass']

        if '__init__' in self.__dict__['_functions']:
            self.__run_class_function("__init__", args, kwargs)
        elif len(args) != 0 or len(kwargs) != 0:
            raise TypeError("This constructor takes no arguments")
def validate_distance_feature_types(dataset, distance, allowed_types):
    """
    Check that the features passed to each standard distance function are
    allowed for that distance. NOTE: this function *does not* check that each
    distance function is one of the standard types; only that the feature types
    are correct if a distance function *is* standard.

    Parameters
    ----------
    dataset : SFrame
        Input dataset.

    distance : list[list]
        Composite distance.

    allowed_types : dict(string, list[type])
        Feature types allowed for each distance function.
    """

    for d in distance:
        ftr_names, dist, weight = d

        if dist in allowed_types.keys():
            for ftr in ftr_names:
                try:
                    ftr_type = dataset[ftr].dtype()
                except:
                    raise _ToolkitError("Feature '{}' could not be found in".format(ftr) +
                                        " the input dataset.")

                if not ftr_type in allowed_types[dist]:
                    raise TypeError("Feature '{}' is type '{}'".format(ftr, ftr_type.__name__) +
                                    " in the input dataset, which is not allowed " +
                                    "for distance function '{}'.".format(dist))
    def __init__(self, model, state=None):
        assert(isinstance(
            model, _gl.nearest_neighbors.NearestNeighborsModel))

        if model.get("distance") == "dot_product":
            raise _ToolkitError("%s is not a supported distance function for " \
                                "the NearestNeighborAutoTagger. Use %s " \
                                "instead." % ("dot_product", "cosine"))

        if model.get("distance") == "transformed_dot_product":
            raise _ToolkitError("%s is not a supported distance function for " \
                                "the NearestNeighborAutoTagger. Use %s " \
                                "instead." % ("transformed_dot_product", "cosine"))

        self._state = state or {}
        self._nn_model = model
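
Judging from the error messages and the `create` function near the end of this page, this appears to be the NearestNeighborAutoTagger constructor. A hedged sketch of direct construction follows; `features_sf` and the state dictionary contents are assumptions.

# Hypothetical direct construction. In practice the create() function shown
# later builds both the nearest neighbors model and the state dictionary.
nn_model = _gl.nearest_neighbors.create(features_sf, label='actor',
                                        distance='weighted_jaccard')
tagger = NearestNeighborAutoTagger(nn_model, state={'tag_name': 'actor'})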
def _validate_num_clusters(num_clusters, initial_centers, num_rows):
    """
    Validate the combination of the `num_clusters` and `initial_centers`
    parameters in the Kmeans model create function. If the combination is
    valid, determine and return the correct number of clusters.

    Parameters
    ----------
    num_clusters : int
        Specified number of clusters.

    initial_centers : SFrame
        Specified initial cluster center locations, in SFrame form. If the
        number of rows in this SFrame does not match `num_clusters`, there is a
        problem.

    num_rows : int
        Number of rows in the input dataset.

    Returns
    -------
    _num_clusters : int
        The correct number of clusters to use going forward
    """

    ## Basic validation
    if num_clusters is not None and not isinstance(num_clusters, int):
        raise _ToolkitError("Parameter 'num_clusters' must be an integer.")

    ## Determine the correct number of clusters.
    if initial_centers is None:
        if num_clusters is None:
            raise ValueError("Number of clusters cannot be determined from " +
                             "'num_clusters' or 'initial_centers'. You must " +
                             "specify one of these arguments.")
        else:
            _num_clusters = num_clusters

    else:
        num_centers = initial_centers.num_rows()

        if num_clusters is None:
            _num_clusters = num_centers
        else:
            if num_clusters != num_centers:
                raise ValueError("The value of 'num_clusters' does not match " +
                                 "the number of provided initial centers. " +
                                 "Please provide only one of these arguments " +
                                 "or ensure the values match.")
            else:
                _num_clusters = num_clusters

    if _num_clusters > num_rows:
        raise ValueError("The desired number of clusters exceeds the number " +
                         "of data points. Please set 'num_clusters' to be " +
                         "smaller than the number of data points.")

    return _num_clusters
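
A brief sketch of the accepted input combinations, assuming a hypothetical `centers_sf` SFrame with 5 rows of initial centers and a dataset of 100 rows.

k = _validate_num_clusters(num_clusters=5, initial_centers=None, num_rows=100)            # -> 5
k = _validate_num_clusters(num_clusters=None, initial_centers=centers_sf, num_rows=100)   # -> 5
k = _validate_num_clusters(num_clusters=5, initial_centers=centers_sf, num_rows=100)      # -> 5 (values agree)

# Each of these raises ValueError: both arguments left as None, num_clusters
# disagreeing with the number of provided centers, or more clusters requested
# than there are data points.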
Example #10
def _supervised_evaluation_error_checking(targets, predictions):
    """
    Perform basic error checking for the evaluation metrics. Check
    types and sizes of the inputs.
    """
    _raise_error_if_not_sarray(targets, "targets")
    _raise_error_if_not_sarray(predictions, "predictions")
    if (targets.size() != predictions.size()):
        raise _ToolkitError(
         "Input SArrays 'targets' and 'predictions' must be of the same length.")
Example #12
    def __init__(self, state={}):

        if 'nearest_neighbors_model' in state:
            model = state['nearest_neighbors_model']
        else:
            model = None

        assert (isinstance(model, _gl.nearest_neighbors.NearestNeighborsModel))

        if model.get("distance") == "dot_product":
            raise _ToolkitError("%s is not a supported distance function for " \
                                "the NearestNeighborAutoTagger. Use %s " \
                                "instead." % ("dot_product", "cosine"))

        if model.get("distance") == "transformed_dot_product":
            raise _ToolkitError("%s is not a supported distance function for " \
                                "the NearestNeighborAutoTagger. Use %s " \
                                "instead." % ("transformed_dot_product", "cosine"))

        self.__proxy__ = _PythonProxy(state)
 def __init__(self, tkclass_name=None, _proxy=None):
     if _proxy:
         self.__dict__['_tkclass'] = _proxy
     elif tkclass_name:
         self.__dict__['_tkclass'] = _gl.connect.main.get_unity().create_toolkit_class(tkclass_name)
     try:
         # fill the functions and properties
         self.__dict__['_functions'] = self._tkclass.get('list_functions')
         self.__dict__['_get_properties'] = self._tkclass.get('list_get_properties')
         self.__dict__['_set_properties'] = self._tkclass.get('list_set_properties')
         # rewrite the doc string for this class
         try:
             self.__dict__['__doc__'] = self._tkclass.get('get_docstring', {'__symbol__':'__doc__'})
             self.__class__.__dict__['__doc__'] = self.__dict__['__doc__']
         except:
             pass
     except:
         raise _ToolkitError("Cannot create Toolkit Class for this class. "
                            "This class was not created with the new toolkit class system.")
Example #14
    def __init__(self, *args, **kwargs):
        tkclass_name = getattr(self.__init__, "tkclass_name")
        _proxy = None
        if "_proxy" in kwargs:
            _proxy = kwargs['_proxy']
            del kwargs['_proxy']

        if _proxy:
            self.__dict__['_tkclass'] = _proxy
        elif tkclass_name:
            self.__dict__['_tkclass'] = \
                _gl.connect.main.get_unity().create_toolkit_class(tkclass_name)
        try:
            # fill the functions and properties
            self.__dict__['_functions'] = self._tkclass.get('list_functions')
            self.__dict__['_get_properties'] = self._tkclass.get(
                'list_get_properties')
            self.__dict__['_set_properties'] = self._tkclass.get(
                'list_set_properties')
            # rewrite the doc string for this class
            try:
                self.__dict__['__doc__'] = self._tkclass.get(
                    'get_docstring', {'__symbol__': '__doc__'})
                self.__class__.__dict__['__doc__'] = self.__dict__['__doc__']
            except:
                pass
        except:
            raise _ToolkitError(
                "Cannot create Toolkit Class for this class. "
                "This class was not created with the new toolkit class system."
            )
        # for compatibility with older classes / models
        self.__dict__['__proxy__'] = self.__dict__['_tkclass']

        if '__init__' in self.__dict__['_functions']:
            self.__run_class_function("__init__", args, kwargs)
        elif len(args) != 0 or len(kwargs) != 0:
            raise TypeError("This constructor takes no arguments")
def create(datasets, row_label=None, features=None, grouping_features=None,
           distance=None, k=2, radius=None, verbose=True):
    """
    Create a deduplication model based on nearest neighbors and SGraph connected
    components.

    This method creates a :class:`NearestNeighborDeduplication` model by
    constructing a nearest neighbors similarity graph on all of the rows in the
    input 'datasets', then using the connected components tool in the
    :mod:`~graphlab.toolkits.graph_analytics` module to assign an entity label
    to each record. Records which share the same label are considered to be
    duplicates.

    .. warning::

        The 'dot_product' distance is deprecated and will be removed in future
        versions of GraphLab Create. Please use 'transformed_dot_product'
        distance instead, although note that this is more than a name change; it
        is a *different* transformation of the dot product of two vectors.
        Please see the distances module documentation for more details.

    Parameters
    ----------
    datasets : SFrame or list[SFrame] or dict(string: SFrame)
        Input datasets. Each SFrame in the list must include all of the features
        specified in the `features` or 'distance' parameters, but may
        have additional columns as well. SFrames can be input as values in a
        dictionary, where the keys are strings used in the output to identify
        the SFrame from which each record originated.

    row_label : string, optional
        Name of the SFrame column with row labels. If not specified, row numbers
        are used to identify rows in the output.

    features : list[string], optional
        Name of the columns with features to use in comparing records. 'None'
        (the default) indicates the intersection of columns over all SFrames in
        `datasets` should be used (except the label column, if specified). Each
        column can be one of the following types:

        - *Numeric*: values of numeric type integer or float.

        - *Array*: array of numeric (integer or float) values. Each array
          element is treated as a separate variable in the model.

        - *Dictionary*: key-value pairs with numeric (integer or float) values.
          Each key indicates a separate variable in the model.

        - *String*: string values.

        Please note: if `distance` is specified as a composite distance, then
        that parameter controls which features are used in the model. Any
        additional columns named in 'features' will be included in the model
        output but not used for distance computations.

    grouping_features : list[string], optional
        Names of features to use in grouping records before finding approximate
        matches. These columns must have string or integer type data. See the
        Notes section for more details on grouping.

    distance : string or list[list], optional
        Function to measure the distance between any two input data rows. This
        may be one of two types:

        - *String*: the name of a standard distance function. One of
          'euclidean', 'squared_euclidean', 'manhattan', 'levenshtein',
          'jaccard', 'weighted_jaccard', 'cosine', 'dot_product' (deprecated),
          or 'transformed_dot_product'.

        - *Composite distance*: the weighted sum of several standard distance
          functions applied to various features. This is specified as a list of
          distance components, each of which is itself a list containing three
          items:

          1. list or tuple of feature names (strings)

          2. standard distance name (string)

          3. scaling factor (int or float)

        For more information about GraphLab Create distance functions, please
        see the :py:mod:`~graphlab.toolkits.distances` module.

        For sparse vectors, missing keys are assumed to have value 0.0.

        If 'distance' is left unspecified or set to 'auto', a composite distance
        is constructed automatically based on feature types.

    k : int, optional
        Number of neighbors to consider for each point.

    radius : float, optional
        Maximum distance from each point to a potential duplicate.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : NearestNeighborDeduplication model
        The NearestNeighborDeduplication object contains a field 'entities'
        which shows the entity label for each input record. It also shows the
        features for each record that are used to construct the model, as well
        as the original SFrame and row label for each record. If the original
        `datasets` are passed in a list, the SFrame identifier is the index of
        the SFrame in that list.

    See Also
    --------
    NearestNeighborDeduplication, graphlab.toolkits.nearest_neighbors,
    graphlab.SFrame.groupby

    Notes
    -----
    - Standardizing features is often a good idea with distance-based methods,
      but this model does *not* standardize features.

    - For datasets with more than about 10,000 records, *grouping* (also known
      as *blocking*) is a critical step to avoid computing distances between all
      pairs of records. The grouping step simply assigns each record to a group
      that has identical values for all `grouping_features`, and only looks for
      duplicates within each group.

    - Records with missing data in the `grouping_features` are removed from
      consideration as duplicates. These records are given the entity label
      "None".

    - For tasks that require *only* exact matches on certain features, it is
      generally more natural to use the SFrame `groupby` function.

    - For features that all have the same type, the distance parameter may be a
      single standard distance function name (e.g. "euclidean"). In the model,
      however, all distances are first converted to composite distance
      functions; as a result, the 'distance' field in the model is always a
      composite distance.

    References
    ----------
    - Christen, Peter. "Data matching: concepts and techniques for record
      linkage, entity resolution, and duplicate detection." Springer Science &
      Business Media, 2012.

    Examples
    --------
    >>> sf1 = graphlab.SFrame({'id': [0, 1, 2],
    ...                        'x0': [0.5, 0.5, 0.3],
    ...                        'x1': [1., 0.8, 0.6],
    ...                        'city': ['seattle', 'olympia', 'boston'],
    ...                        'state': ['WA', 'WA', 'MA']})
    ...
    ... # note: misspellings in the following dataset do not prevent correct
    ... # matches.
    >>> sf2 = graphlab.SFrame({'id': [9, 10],
    ...                        'x0': [0.35, 0.4],
    ...                        'x1': [0.65, 0.8],
    ...                        'city': ['bostan', 'seatle'],
    ...                        'state': ['MA', 'WA']})
    ...
    >>> dist = [[('city',), 'levenshtein', 2],
    ...         [('x0', 'x1'), 'euclidean', 1.5]]
    ...
    >>> m = graphlab.nearest_neighbor_deduplication.create({'a': sf1, 'b': sf2},
    ...                                                    row_label='id',
    ...                                                    grouping_features=['state'],
    ...                                                    distance=dist, k=None,
    ...                                                    radius=3)
    ...
    >>> print m['entities']
    +----------+----+----------+-------+------+---------+------+
    | __sframe | id | __entity | state |  x0  |   city  |  x1  |
    +----------+----+----------+-------+------+---------+------+
    |    a     | 1  |    0     |   WA  | 0.5  | olympia | 0.8  |
    |    a     | 0  |    1     |   WA  | 0.5  | seattle | 1.0  |
    |    b     | 10 |    1     |   WA  | 0.4  |  seatle | 0.8  |
    |    a     | 2  |    2     |   MA  | 0.3  |  boston | 0.6  |
    |    b     | 9  |    2     |   MA  | 0.35 |  bostan | 0.65 |
    +----------+----+----------+-------+------+---------+------+
    [5 rows x 7 columns]
    """

    ## Set up
    _mt._get_metric_tracker().track('{}.create'.format(__name__))
    start_time = _time.time()

    model = NearestNeighborDeduplication()
    model._state['verbose'] = verbose
    model._state['k'] = k
    model._state['radius'] = radius


    ### ----------------------------- ###
    ### Validation and preprocessing ###
    ### ----------------------------- ###

    ### Validate input datasets
    ### -----------------------

    ## If datasets is already a dict, check the keys are all strings
    if isinstance(datasets, dict):
        if not(all([isinstance(x, str) for x in datasets.keys()])):
            raise ValueError("Keys in the 'datasets' dict must be strings.")

    ## Convert singleton SFrame dataset into a list of datasets
    if isinstance(datasets, _gl.SFrame):
        _raise_error_if_sframe_empty(datasets, "dataset")
        datasets = {0: datasets}

    ## Convert a list of SFrames into a dict
    if isinstance(datasets, list):
        datasets = {k: sf for k, sf in enumerate(datasets)}

    ## At this point, 'datasets' must be dict. If it's not, something is wrong.
    if not isinstance(datasets, dict):
        raise TypeError("Input 'datasets' must be an SFrame, a list of SFrames, " +
                        "or a dictionary of (string, SFrame) pairs.")

    model._state['num_datasets'] = len(datasets)

    ## Ensure that all datasets are SFrames
    for d in datasets.values():
        _raise_error_if_not_sframe(d, "dataset")


    ### Validate row label
    ### ------------------

    ## Validate the label column
    if row_label:
        if not isinstance(row_label, str):
            raise TypeError("The 'row_label' parameter must be the name (string " +
                            "type) of a column in each of the input datasets.")

        for d in datasets.values():
            if row_label not in d.column_names():
                raise _ToolkitError("The specified row_label column does not " +
                                    " exist in all input datasets.")
    else:
        row_label = 'row_number'

        for d in datasets.values():
            if row_label in d.column_names():
                raise _ToolkitError("Input 'row_label' defaulted to " +
                                    "'row_number', which is already a column" +
                                    " in at least one input dataset. Please " +
                                    "specify a row label column manually.")

    model._state['row_label'] = row_label


    ### Validate 'features' and 'grouping_features' parameters
    ### ------------------------------------------------------
    if features is not None:
        if not hasattr(features, '__iter__'):
            raise TypeError("Input 'features' must be a list.")

        if not all([isinstance(x, str) for x in features]):
            raise TypeError("Input 'features' must contain only strings.")

    if grouping_features is not None:
        if not hasattr(grouping_features, '__iter__'):
            raise TypeError("Input 'grouping_features' must be a list.")

        if not all([isinstance(x, str) for x in grouping_features]):
            raise TypeError("Input 'grouping_features' must contain only strings.")


    ### Validate and preprocess the distance function
    ### ---------------------------------------------
    # - The form of the 'distance' controls how we interact with the 'features'
    #   parameter as well.

    ## Find the intersection of all feature sets and feature types
    col_types = {k: v for k, v in zip(datasets.values()[0].column_names(),
                                      datasets.values()[0].column_types())}

    all_features = [sf.column_names() for sf in datasets.values()]
    ftr_intersection = list(set(all_features[0]).intersection(*all_features))
    ftr_intersection = [x for x in ftr_intersection if x != row_label]


    ## Convert features and distance arguments into a composite distance.
    if isinstance(distance, list):
        distance = _copy.deepcopy(distance)

    elif isinstance(distance, str):
        if features is not None:
            distance = [[features, distance, 1]]
        else:
            distance = [[ftr_intersection, distance, 1]]

    elif distance is None:
        if features is not None:
            distance = _construct_auto_distance(features, col_types)
        else:
            distance = _construct_auto_distance(ftr_intersection, col_types)

    else:
        raise TypeError("Input 'distance' not understood. Note that for the " +
                         "data matching toolkit, 'distance' must be a string or " +
                         "a composite distance list."   )


    ## Validate the form of the composite distance and add to the model
    allowed_dists = {
        'euclidean': [int, float, _array.array],
        'squared_euclidean': [int, float, _array.array],
        'manhattan': [int, float, _array.array],
        'levenshtein': [str],
        'jaccard': [str, dict],
        'weighted_jaccard': [str, dict],
        'cosine': [int, float, str, dict, _array.array],
        'dot_product': [int, float, str, dict, _array.array],
        'transformed_dot_product': [int, float, str, dict, _array.array]}

    distance = _dmutl.validate_composite_distance(distance, row_label,
                                                  allowed_dists.keys(),
                                                  verbose)
    model._state['distance'] = _copy.deepcopy(distance)


    ## Figure out which features are 'fuzzy', i.e. used for approximate
    #  matching, and set in the model state.
    fuzzy_features = _dmutl.extract_composite_features(distance)  # already has row_label removed

    model._state['features'] = fuzzy_features
    model._state['num_features'] = len(fuzzy_features)


    ## Compile a master list of all features. This includes grouping features,
    #  fuzzy features (the ones used for approximate matching), and "ancillary"
    #  features, which are specified in the 'features' parameter but not
    #  included in the composite distance function.
    if features is None:
        features = []
    else:
        features = [x for x in features if x != row_label]

    if grouping_features is None:
        grouping_features = []
    else:
        grouping_features = [x for x in grouping_features if x != row_label]

    model._state['grouping_features'] = grouping_features
    model._state['num_grouping_features'] = len(grouping_features)

    master_features = list(set(features + grouping_features + fuzzy_features))


    ### Consolidate data and engineer features
    ### --------------------------------------

    ## Consolidate multiple input datasets into a single SFrame, with a useful
    #  row label.
    sf_union = _dmutl.concatenate_sframes(datasets, row_label=row_label,
                                   features=master_features,
                                   sf_index_name='__sframe')
    overall_label = '__sframe.' + row_label
    sf_union[overall_label] = (sf_union['__sframe'].astype(str) + "." +
                               sf_union[row_label].astype(str))


    ## Validate the feature types in the consolidated dataset against the
    #  specified distance functions.
    _dmutl.validate_distance_feature_types(sf_union, distance, allowed_dists)


    ## Clean string-type features in the fuzzy feature set.
    for ftr in fuzzy_features:
        if col_types[ftr] == str:
            new_ftr = '__clean.' + ftr
            sf_union[new_ftr] = sf_union[ftr].fillna("")
            sf_union[new_ftr] = sf_union[new_ftr].apply(
                lambda x: _dmutl.cleanse_string(x), dtype=str)

            for dist_comp in distance:
                dist_comp[0] = [new_ftr if x == ftr else x for x in dist_comp[0]]


    ## Feature engineering, distance-component-wise. Also update list of
    #  features and a map to their types.
    sf_union, distance = _engineer_distance_features(sf_union, distance)
    transformed_features = _dmutl.extract_composite_features(distance)

    ### -------------------------------------------- ###
    ### Main loop over blocks of neighbor candidates ###
    ### -------------------------------------------- ###

    ## Construct blocks on features that must match exactly
    if verbose:
        _logging.info("Constructing groups of records that match exactly on " +
                      "the 'grouping_features'.")

    sf_union, block_errors, blocks = \
        _dmutl.construct_exact_blocks(sf_union, grouping_features)

    if verbose and len(distance) > 0 and blocks['Count'].max() > 10000:
        _logging.warning("There are more than 10,000 records in the largest match " +
            "group. For many uses, approximate matches within each match group are " +
            "computed with brute force nearest neighbors, which may be slow. " +
            "Consider using smaller groups by requiring different features to " +
            "match exactly.")

    max_entity_number = 0
    sf_entity = _gl.SFrame()
    output_features = (master_features + [row_label, '__sframe', '__entity'])

    ## Main loop over blocks
    for i, block in enumerate(blocks):

        if verbose:
            _logging.info("Processing {} records in match group: {}/{}".format(block['Count'],
                                                                         i+1,
                                                                         len(blocks)))

        ## Retrieve records in the block and impute the mean for missing numeric
        #  values.
        records = sf_union[block['min_idx']:(block['max_idx'] + 1)]
        complete_records = _dmutl.impute_numeric_means(records, transformed_features)

        if len(distance) > 0:
            ## Run all-point nearest neighbors
            if verbose:
                _logging.info("Building the similarity graph....")

            m = _gl.nearest_neighbors.create(complete_records, label=overall_label,
                                             distance=distance, verbose=False)
            knn = m.query(complete_records, label=overall_label, k=k, radius=radius,
                          verbose=verbose)


            ## Construct similarity graph to resolve transitive closure
            sg = _gl.SGraph()
            sg = sg.add_vertices(records[[overall_label]], vid_field=overall_label)
            sg = sg.add_edges(knn, src_field='query_label',
                              dst_field='reference_label')


            ## Cut the similarity graph to establish an entity for each vertex
            if verbose:
                _logging.info("Finding duplicate records in the similarity graph....")

            cc = _gl.connected_components.create(sg, verbose=verbose)

            ## Relabel the component IDs to be consecutive integers starting with
            #  the max index of the previous block's entity labels.
            block_labels = cc['component_size'].add_row_number('__entity')
            block_labels['__entity'] += max_entity_number
            max_entity_number += block_labels.num_rows()
            block_entity_labels = cc['component_id'].join(block_labels,
                                                          on='component_id',
                                                          how='left')

            ## Join the entity labels for the block back to the block's records,
            #  then append to the master output
            records = records.join(block_entity_labels[['__id', '__entity']],
                                   on={overall_label: '__id'}, how='left')
            records = records.sort('__entity')

        else:  # no fuzzy features, so no nearest neighbors, just block ID
            records['__entity'] = _gl.SArray.from_const(i, len(records))


        sf_entity = sf_entity.append(records[output_features])


    ### ------------------------------------- ###
    ### Postprocessing and results formatting ###
    ### ------------------------------------- ###

    ## Add rows missing from the blocking back to the master results
    if len(block_errors) > 0:
        block_errors['__entity'] = _gl.SArray.from_const(None, len(block_errors)).astype(int)
        sf_entity = sf_entity.append(block_errors[output_features])

    ## Rearrange columns
    sf_entity.swap_columns('__sframe', sf_entity.column_names()[0])
    sf_entity.swap_columns(row_label, sf_entity.column_names()[1])
    sf_entity.swap_columns('__entity', sf_entity.column_names()[2])


    ## Finalize the model state
    model._state['training_time'] = _time.time() - start_time
    model._state['entities'] = sf_entity
    model._state['num_entities'] = max_entity_number

    return model
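
A hedged variant of the docstring example above: when `datasets` is passed as a list rather than a dict, the '__sframe' identifier in the output 'entities' table is the list index (0, 1, ...) instead of a user-chosen key. This sketch reuses `sf1`, `sf2`, and `dist` from the docstring example.

m = graphlab.nearest_neighbor_deduplication.create([sf1, sf2],
                                                   row_label='id',
                                                   grouping_features=['state'],
                                                   distance=dist, k=None,
                                                   radius=3)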
def create(dataset, tag_name=None, features=None, verbose=True):
    """
    Create a NearestNeighborAutotagger model, which can be used to quickly apply
    tags from a reference set of text labels to a new query set using the
    ``tag`` method.

    Parameters
    ----------
    dataset : SFrame
        Reference data. This SFrame must contain at least one column. By
        default, only the ``tag_name`` column is used as the basis for
        tagging. You may optionally include additional columns with the
        ``features`` parameter.

    tag_name : string, optional
        Name of the column in ``dataset`` with the tags. This column must
        contain string values. If ``dataset`` contains more than one column,
        ``tag_name`` must be specified.

    features : list[string], optional
        Names of the columns with features to use as the basis for tagging.
        'None' (the default) indicates that only the column specified by the
        ``tag_name`` parameter should be used. Only str or list fields are
        allowed. If a column of type list is specified, all values must be
        either of type string or convertible to type string.

    verbose : bool, optional
        If True, print verbose output during model creation.

    Returns
    -------
    out : model
        A model for quickly tagging new query observations with entries from
        `dataset`. Currently, the only implementation is the following:

        - NearestNeighborAutoTagger

    See Also
    --------
    NearestNeighborsModel

    Examples
    --------
    First construct a toy `SFrame` of actor names, which will serve as the
    reference set for our autotagger model.

    >>> actors_sf = gl.SFrame(
            {"actor": ["Will Smith", "Tom Hanks", "Bradley Cooper",
                       "Tom Cruise", "Jude Law", "Robert Pattinson",
                       "Matt Damon", "Brad Pitt", "Johnny Depp",
                       "Leonardo DiCaprio", "Jennifer Aniston",
                       "Jessica Alba", "Emma Stone", "Cameron Diaz",
                       "Scarlett Johansson", "Mila Kunis", "Julia Roberts",
                       "Charlize Theron", "Marion Cotillard",
                       "Angelina Jolie"]})
    >>> m = gl.data_matching.nearest_neighbor_autotagger.create(
                actors_sf, tag_name="actor")

    Then we load some IMDB movie reviews into an `SFrame` and tag them using
    the model we created above. The score field in the output is a
    similarity score, indicating the strength of the match between the query
    data and the suggested reference tag.

    >>> reviews_sf = gl.SFrame(
            "s3://dato-datasets/imdb_reviews/reviews.sframe")
    >>> m.tag(reviews_sf.head(10), query_name="review", verbose=False)
    +-----------+-------------------------------+------------------+-----------------+
    | review_id |             review            |      actor       |      score      |
    +-----------+-------------------------------+------------------+-----------------+
    |     0     | Story of a man who has unn... |   Cameron Diaz   | 0.0769230769231 |
    |     0     | Story of a man who has unn... |  Angelina Jolie  | 0.0666666666667 |
    |     0     | Story of a man who has unn... | Charlize Theron  |      0.0625     |
    |     0     | Story of a man who has unn... | Robert Pattinson | 0.0588235294118 |
    |     1     | Bromwell High is a cartoon... |   Jessica Alba   |      0.125      |
    |     1     | Bromwell High is a cartoon... | Jennifer Aniston |       0.1       |
    |     1     | Bromwell High is a cartoon... | Charlize Theron  |       0.05      |
    |     1     | Bromwell High is a cartoon... | Robert Pattinson |  0.047619047619 |
    |     1     | Bromwell High is a cartoon... | Marion Cotillard |  0.047619047619 |
    |     2     | Airport '77 starts as a br... |  Julia Roberts   | 0.0961538461538 |
    |    ...    |              ...              |       ...        |       ...       |
    +-----------+-------------------------------+------------------+-----------------+

    The initial results look a little noisy. To filter out obvious spurious
    matches, we can set the `tag` method's similarity_threshold parameter.

    >>> m.tag(reviews_sf.head(1000), query_name="review", verbose=False,
              similarity_threshold=.8)
    +-----------+-------------------------------+------------------+----------------+
    | review_id |             review            |      actor       |     score      |
    +-----------+-------------------------------+------------------+----------------+
    |    341    | I caught this film at a te... |  Julia Roberts   | 0.857142857143 |
    |    657    | Fairly funny Jim Carrey ve... | Jennifer Aniston | 0.882352941176 |
    |    668    | A very funny movie. It was... | Jennifer Aniston | 0.833333333333 |
    |    673    | This film is the best film... | Jennifer Aniston |     0.9375     |
    +-----------+-------------------------------+------------------+----------------+

    In this second example, you'll notice that the ``review_id`` column is much
    more sparse. This is because all results whose score was below the specified
    similarity threshold (.8) were excluded from the output.

    """
    # validate the 'dataset' input
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    # ensure that tag_name is provided if dataset has > 1 column
    if dataset.num_cols() > 1 and not tag_name:
        raise _ToolkitError("No tag_name parameter specified on dataset " \
                            "with %d columns" % dataset.num_cols())
    tag_name = tag_name or dataset.column_names()[0]

    # ensure that column with name tag_name exists
    if tag_name not in dataset.column_names():
        raise _ToolkitError('No column named "%s" in dataset' % tag_name)

    # ensure that column is of type string
    if dataset[tag_name].dtype() != str:
        raise TypeError("The column used as the tag name must be of type " \
                        "string.")

    # use reasonable default for general case
    distance = _gl.distances.weighted_jaccard

    # if additional features are specified, ensure they are of appropriate types
    if features is not None and \
       (not isinstance(features, list) or
        not all(isinstance(x, str) for x in features)):
        raise TypeError("The 'features' parameter must be a list of strings " \
                        "and those strings must correspond to columns in " \
                        "`dataset`.")

    # at a minimum, this SFrame will contain the tags as features;
    features = features or []
    features = [tag_name] + [x for x in features if x != tag_name]

    # ensure that each specified feature column is either of type list or str
    column_names = set(dataset.column_names())
    for col_name in features:
        if col_name not in column_names:
            raise _ToolkitError("Specified feature column (%s) not found " \
                                "in dataset" % x)

        if dataset.select_column(col_name).dtype() not in (str, list):
            raise TypeError("Only string and list columns are allowed as " \
                            "features.")

    # concatenate the feature columns into a single column
    features_sf = dataset.select_columns(features)
    feature_col, features_sf = _concat_string_features(features_sf, features)

    # compute features
    if verbose:
        _logging.getLogger().info("Extracting features...")

    features = _preprocess(features_sf.select_column(feature_col))

    # group by tag_name to ensure that tags are unique
    feature_cols = features.column_names()
    select_cols = {col_name: _gl.aggregate.SELECT_ONE(col_name) for col_name \
                   in feature_cols}
    features.add_column(dataset[tag_name], tag_name)
    features = features.groupby(tag_name, select_cols)

    # create nearest neighbors model
    m = _gl.nearest_neighbors.create(
        features, label=tag_name, distance=distance,
        features=feature_cols, verbose=verbose)

    # add standard toolkit state attributes
    state = {"training_time": m.get("training_time"),
             "tag_name": tag_name,
             "verbose": verbose,
             "num_examples": len(features),
             "features": feature_cols,
             "num_features": len(feature_cols),
             "distance": m.get("distance")}

    model = NearestNeighborAutoTagger(m, state)
    model.summary()

    return model
Example #17
def create(data,
           features=None,
           bm25_k1=1.5,
           bm25_b=0.75,
           tfidf_threshold=0.01,
           verbose=True):
    """
    Create a searchable index of text columns in an SFrame.

    Parameters
    ----------
    data : SFrame
      An SFrame containing at least one str column containing text that should
      be indexed.

    features : list of str
      A list of column names that contain text that should be indexed.
      Default: all str columns in the provided dataset.

    bm25_k1 : float
      Tuning parameter for the relative importance of term frequencies when
      computing the BM25 score between a query token and a document.

    bm25_b : float
      Tuning parameter to downweight scores of long documents when
      computing the BM25 score between a query token and a document.

    tfidf_threshold : float
      Tuning parameter to skip indexing words that have a TF-IDF score below
      this value.

    verbose : bool
      Controls whether or not to print progress during model creation.

    Returns
    -------
    out
       SearchModel

    See Also
    --------
    SearchModel.query

    References
    ----------

    Christopher D. Manning, Hinrich Schutze, and Prabhakar Raghavan.
    Introduction to information retrieval.
    http://nlp.stanford.edu/IR-book/pdf/irbookonlinereading.pdf

    Examples
    --------

    >>> import graphlab as gl
    >>> sf = gl.SFrame({'text': ['Hello my friend', 'I love this burrito']})
    >>> m = gl.toolkits._internal.search.create(sf)
    >>> print m.query('burrito')

    """

    # Input validation on data and features
    if features is None:
        features = _get_str_columns(data)

    _raise_error_if_not_of_type(data, [_gl.SFrame])
    _raise_error_if_not_of_type(features, [list])
    for f in features:
        if data[f].dtype() != str:
            raise _ToolkitError("Feature `%s` must be of type str" % f)

    # Store options
    options = {}
    options['bm25_b'] = bm25_b
    options['bm25_k1'] = bm25_k1
    options['tfidf_threshold'] = tfidf_threshold
    options['verbose'] = verbose
    options['features'] = features

    # Construct model
    proxy = _gl.extensions._SearchIndex()
    proxy.init_options(options)
    proxy.index(data)

    return SearchModel(proxy)
Example #18
def distances_to_similarity_scores(distance_fn, distances):
    """
    Convert distances to similarity scores.

    Parameters
    ----------
    distance_fn : str
        The name of the distance function.

    distances : SArray or SFrame
        An `SArray` or `SFrame` of distances to convert to similarity scores. If
        distances is an SFrame, it is expected to contain the following columns:
        "distance", "query_label", and "reference_label", of types float, str,
        and str respectively. If an SFrame is provided that does not contain
        these fields, a ToolkitError is raised.

    Returns
    -------
    out : SArray
        The converted similarity scores.

    Notes
    -----
    - To convert Levenshtein distances to similarities, the `distances`
      parameter must be an `SFrame`, since both of the strings being compared
      are needed in order to normalize.
    """
    if not (isinstance(distances, _gl.SFrame) or \
            isinstance(distances, _gl.SArray)):
        raise TypeError("distances parameter is of type %s must be an SFrame " \
                        "or an SArray" % type(distances))

    if isinstance(distances, _gl.SFrame):
        column_names = distances.column_names()
        required_names = ["distance", "query_label", "reference_label"]
        if not all([name in column_names for name in required_names]):
            raise _ToolkitError("distances SFrame is missing required " \
                                "columns; at a minimum, it should have the " \
                                "following columns: \"distance\", " \
                                "\"query_label\", and \"reference_label\"")

    if isinstance(distances, _gl.SArray):
        if distance_fn == "levenshtein":
            raise TypeError("Expected an SFrame but got a an SArray")

        distances = _gl.SFrame({"distance": distances})

    def levenshtein_sim(dist, s1, s2):
        return 1 - dist / max(len(s1), len(s2))

    scores = None

    if distance_fn == "levenshtein" and isinstance(distances, _gl.SFrame):
        scores = distances.apply(
            lambda x: levenshtein_sim(
                x["distance"], x["query_label"], x["reference_label"]))
    elif distance_fn in ("jaccard", "weighted_jaccard", "cosine"):
        scores = distances["distance"].apply(lambda dist: 1 - dist)
    elif distance_fn in ("manhattan", "euclidean", "squared_euclidean"):
        scores = distances["distance"].apply(
            lambda dist: 1 - dist / _MAX_SIMILARITY_RADIUS)
    else:
        raise _ToolkitError("Unsupported distance function: %s" % distance_fn)

    return scores
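
A short sketch of both accepted input forms, assuming `graphlab` is imported as `_gl` as in the surrounding module.

# SArray input is enough for distances other than Levenshtein.
cosine_dists = _gl.SArray([0.1, 0.4])
sims = distances_to_similarity_scores("cosine", cosine_dists)   # -> [0.9, 0.6]

# Levenshtein requires an SFrame so the score can be normalized by the length
# of the longer of the two compared strings: 1 - 2 / 7 here.
lev_sf = _gl.SFrame({"distance": [2.0],
                     "query_label": ["seatle"],
                     "reference_label": ["seattle"]})
sims = distances_to_similarity_scores("levenshtein", lev_sf)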
Example #19
    def evaluate(self, dataset, metric='auto', max_neighbors=10, radius=None):
        """
        Evaluate the model's predictive accuracy. This is done by predicting the
        target class for instances in a new dataset and comparing to known
        target values.

        Parameters
        ----------
        dataset : SFrame
            Dataset of new observations. Must include columns with the same
            names as the target and features used for model training. Additional
            columns are ignored.

        metric : str, optional
            Name of the evaluation metric.  Possible values are:

            - 'auto': Returns all available metrics.

            - 'accuracy': Classification accuracy.

            - 'confusion_matrix': An SFrame with counts of possible
              prediction/true label combinations.

            - 'roc_curve': An SFrame containing information needed for an roc
              curve (binary classification only).

        max_neighbors : int, optional
            Maximum number of neighbors to consider for each point.

        radius : float, optional
            Maximum distance from each point to a neighbor in the reference
            dataset.

        Returns
        -------
        out : dict
            Evaluation results. The dictionary keys are *accuracy* and
            *confusion_matrix* and *roc_curve* (if applicable).

        See also
        --------
        create, predict, predict_topk, classify

        Notes
        -----
        - Because the model randomly breaks ties between predicted classes, the
          results of repeated calls to `evaluate` method may differ.

        Examples
        --------
        >>> sf_train = graphlab.SFrame({'species': ['cat', 'dog', 'fossa', 'dog'],
        ...                             'height': [9, 25, 20, 23],
        ...                             'weight': [13, 28, 33, 22]})
        >>> m = graphlab.nearest_neighbor_classifier.create(sf, target='species')
        >>> ans = m.evaluate(sf_train, max_neighbors=2,
        ...                  metric='confusion_matrix')
        >>> print ans['confusion_matrix']
        +--------------+-----------------+-------+
        | target_label | predicted_label | count |
        +--------------+-----------------+-------+
        |     cat      |       dog       |   1   |
        |     dog      |       dog       |   2   |
        |    fossa     |       dog       |   1   |
        +--------------+-----------------+-------+
        """
        _mt._get_metric_tracker().track(
            'toolkit.classifier.nearest_neighbor_classifier.evaluate')

        ## Validate the metric name
        _raise_error_evaluation_metric_is_valid(
            metric, ['auto', 'accuracy', 'confusion_matrix', 'roc_curve'])

        ## Make sure the input dataset has a target column with an appropriate
        #  type.
        target = self.get('target')
        _raise_error_if_column_exists(dataset, target, 'dataset', target)

        if dataset[target].dtype() not in (str, int):
            raise TypeError("The target column of the evaluation dataset must "
                            "contain integers or strings.")

        if self._state["num_classes"] != 2:
            if (metric == 'roc_curve') or (metric == ['roc_curve']):
                err_msg = "Currently, ROC curve is not supported for "
                err_msg += "multi-class classification in this model."
                raise _ToolkitError(err_msg)
            elif metric == 'auto':
                warn_msg = "WARNING: Ignoring `roc_curve`. "
                warn_msg += "Not supported for multi-class classification."
                print(warn_msg)

        ## Compute predictions with the input dataset.
        ystar = self.predict(dataset,
                             output_type='class',
                             max_neighbors=max_neighbors,
                             radius=radius)
        ystar_prob = self.predict(dataset,
                                  output_type='probability',
                                  max_neighbors=max_neighbors,
                                  radius=radius)

        ## Compile accuracy metrics
        results = {}

        if metric in ['accuracy', 'auto']:
            results['accuracy'] = _gl.evaluation.accuracy(
                targets=dataset[target], predictions=ystar)

        if metric in ['confusion_matrix', 'auto']:
            results['confusion_matrix'] = \
                _gl.evaluation.confusion_matrix(targets=dataset[target],
                                                predictions=ystar)

        if self._state["num_classes"] == 2:
            if metric in ['roc_curve', 'auto']:
                results['roc_curve'] = \
                      _gl.evaluation.roc_curve(targets=dataset[target],
                                               predictions=ystar_prob)
        return results
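The method above delegates metric computation to graphlab.evaluation. As a reference for what those calls produce, here is a minimal sketch of the same accuracy and confusion-matrix bookkeeping on plain Python lists (the labels are taken from the docstring example):

# Standalone sketch of the accuracy / confusion-matrix compilation performed
# by evaluate() above, using plain lists instead of SArrays.
from collections import Counter

def compile_metrics(targets, predictions):
    accuracy = sum(t == p for t, p in zip(targets, predictions)) / float(len(targets))
    # Count every (target_label, predicted_label) combination that occurs.
    confusion = Counter(zip(targets, predictions))
    return {'accuracy': accuracy, 'confusion_matrix': confusion}

truth = ['cat', 'dog', 'fossa', 'dog']
preds = ['dog', 'dog', 'dog', 'dog']
print(compile_metrics(truth, preds)['accuracy'])  # 0.5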
def create(dataset, feature=None, expected_runlength=250, lag=7):
    """
    Create a `BayesianChangepointsModel`. Changepoint detection finds places
    where there is a shift in the mean or variance of a univariate
    timeseries. This model calculates the probability that a given point is a
    changepoint, given the data up to that point. The BayesianChangepointsModel
    works with TimeSeries, SArray, or SFrame inputs.

    The model created by this function contains a table `scores` that contains
    the computed anomaly scores. The type of `scores` matches the type of the
    input `dataset`, and the table contains 4 columns:

        - *row id/time*: ID of the corresponding row in the input `dataset`. If
          `dataset` is an SFrame, this is the row numbers of the input data; if
          `dataset` is a TimeSeries, it is the index of the time series.

        - *changepoint score*: The probability that the given point is a
          changepoint. This value is between 0 and 1.

        - *value*: input data. The name of this column matches the input
          `feature`.

        - *model update time*: time the model was updated. This is particularly
          useful if the `window_size` is larger than the number of rows in the
          input datasets, because the `scores` table has results from several
          updates.

    Note that any `None` values in dataset will have `changepoint_score` of
    `None`, and will be ignored in subsequent changepoint probability
    calculation.

    Parameters
    ----------
    dataset : SFrame, SArray, or TimeSeries
        Input data. The column named by 'feature' will be extracted for
        modeling.

    feature : str, optional
        Name of the column to model. Any data provided to the model in this
        function or with `BayesianChangepointsModel.update` must have a column
        with this name, unless the datasets are in SArray form.

    expected_runlength: int or float, optional
       The a priori expected number of samples between changepoints.
       This helps condition the model. Note that this parameter must be set
       to a value larger than 0.

    lag: int, optional
        The model waits `lag` samples before evaluating the probability of a
        change happening `lag` samples prior. This is useful because it can be
        difficult to evaluate a change after a single sample of a new
        distribution.

        Note that this causes the last `lag` samples to not have enough data to
        evaluate changepoint scores, so they are filled with 'None' values.
        Also note that this value cannot be larger than 100, because only the
        previous 100 points are kept in memory. The minimum lag is 0, which
        allows immediate detection of changepoints, but with less certainty.

    Returns
    -------
    out : BayesianChangepointsModel

    See Also
    --------
    MovingZScoreModel, graphlab.TimeSeries, local_outlier_factor

    References
    ----------
    - The model implemented is described in `'Bayesian Online Changepoint Detection'
      by Ryan Adams, <http://arxiv.org/pdf/0710.3742v1.pdf>`_.

    Examples
    --------
    >>> sf = graphlab.SFrame({'series': [100]*25 + [200]*25})
    >>> model = graphlab.anomaly_detection.bayesian_changepoints.create(sf,
    ...                                                         lag=5,
    ...                                                         feature='series')
    >>> model['scores'][24:28].print_rows(max_column_width=20)
    +--------+-------------------+--------+---------------------+
    | row_id | changepoint_score | series |  model_update_time  |
    +--------+-------------------+--------+---------------------+
    |   24   |   0.136735367681  |  100   | 2016-01-27 14:02... |
    |   25   |   0.831430606595  |  200   | 2016-01-27 14:02... |
    |   26   | 0.000347138442071 |  200   | 2016-01-27 14:02... |
    |   27   | 3.40869782692e-05 |  200   | 2016-01-27 14:02... |
    +--------+-------------------+--------+---------------------+
    [4 rows x 4 columns]
    """
    _mt._get_metric_tracker().track('{}.create'.format(__name__))

    start_time = _time.time()
    logger = _logging.getLogger(__name__)

    ## Validate required inputs by themselves.
    if not isinstance(dataset, (_gl.SFrame, _gl.TimeSeries)):
        raise TypeError("Input 'dataset' must be an SFrame or TimeSeries.")

    if len(dataset) < 1:
        raise _ToolkitError("Input 'dataset' is empty.")

    if feature is not None and not isinstance(feature, str):
        raise TypeError("Input 'feature' must be a string if specified.")

    if not isinstance(lag, int):
        raise TypeError("Input 'lag' must be an integer if specified.")

    if lag > 100 or lag < 0:
        raise ValueError(
            "Input 'lag' cannot be greater than 100 or less than 0")

    if type(expected_runlength) not in (int, float):
        raise TypeError(
            "'expected_runlength' must be either an integer or float")

    if expected_runlength < 1:
        raise ValueError("Input 'expected_runlength' must be greater than 0.")

    ## Determine the feature name if left unspecified.
    column_names = dataset.column_names() if isinstance(dataset, _gl.SFrame) \
        else dataset.value_col_names

    if feature is None:
        if len(column_names) == 1:
            feature = column_names[0]
        else:
            raise _ToolkitError("If the 'input' dataset has multiple " +
                                "columns, a 'feature' column name must be " +
                                "specified.")

    ## Extract the specified feature as an SArray.
    try:
        series = dataset[feature]
    except:
        raise _ToolkitError("The specified feature could not be found " +
                            "in the input 'dataset'.")

    ## Validate the type of the feature.
    if not series.dtype() in [int, float]:
        raise ValueError("The values in the specified feature must be " +
                         "integers or floats.")

    ## Initialize options
    opts = {}

    opts['expected_runlength'] = expected_runlength
    opts['lag'] = lag
    opts['feature'] = feature

    ## Create SDK proxy
    proxy = _gl.extensions._BayesianOnlineChangepoint()
    proxy.init_changepoint_detector(opts, False, series.dropna()[0])

    ## Construct python model from proxy
    model = BayesianChangepointsModel(proxy)

    ## Construct scores SFrame from calculated changepoints
    scores = _gl.SFrame()
    scores[feature] = series
    changepoints = model.__proxy__.calculate_changepoints(series)

    ## Append None's at the end, where there hasn't been enough data to determine
    ## whether there was a changepoint
    changepoints = changepoints.append(
        _gl.SArray([None] * (len(scores) - len(changepoints))))
    scores['changepoint_score'] = changepoints
    scores['model_update_time'] = _dt.datetime.now()

    scores = scores[[
        'changepoint_score',  # reorder the columns
        feature,
        'model_update_time'
    ]]

    # Add row_id to SFrame
    if isinstance(dataset, _gl.SFrame):
        if feature != 'row_id':
            scores = scores.add_row_number('row_id')
        else:
            logger.warning("Feature name is 'row_id', so the " +
                           "index in the model's 'scores' SFrame " +
                           "is called '_row_id'.")
            scores = scores.add_row_number('_row_id')

    ## Add index to timeseries
    if isinstance(dataset, _gl.TimeSeries):
        scores[dataset.index_col_name] = dataset[dataset.index_col_name]

    dataset_type = 'TimeSeries' if isinstance(dataset,
                                              _gl.TimeSeries) else 'SFrame'

    # Set up the model.
    state = {
        'dataset_type': dataset_type,
        'num_examples': len(dataset),
        'training_time': _time.time() - start_time
    }

    if isinstance(dataset, _gl.TimeSeries):
        model.__proxy__.set_index_col_name(dataset.index_col_name)

    model.__proxy__.set_state_sframe(scores, state)

    return model
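The `lag` parameter documented above means the trailing `lag` observations cannot yet be scored, so the function pads the score column with None values. A small sketch of that padding step, with made-up score values standing in for the proxy's output:

# Sketch of the lag padding performed above: the detector only returns scores
# for rows it has already been able to evaluate, and the trailing `lag` rows
# are filled with None. The numbers here are invented for illustration.
series_length = 8
lag = 3
changepoint_scores = [0.01, 0.02, 0.9, 0.05, 0.04]  # len == series_length - lag

padded = changepoint_scores + [None] * (series_length - len(changepoint_scores))
print(padded)  # [0.01, 0.02, 0.9, 0.05, 0.04, None, None, None]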
def create(data, row_label=None, features=None, feature_model='auto',
           method='lsh', verbose=True):
    """
    Create a similarity search model, which can be used to quickly retrieve
    items similar to a query observation. In the case of images, this model
    automatically performs the appropriate feature engineering steps. NOTE:
    If you are using a CPU for the creation step with feature_model='auto',
    creation time may take a while. This is because extracting features for
    images on a CPU is expensive. With a GPU, one can expect large speedups.

    .. warning::

        The similarity search toolkit is currently in beta, and feedback is
        welcome! Please send comments to [email protected].

    Parameters
    ----------
    data : SFrame
        The SFrame that represents the training data for the model, including at
        least one column of images.

    row_label : str, optional
        Name of the SFrame column with row id's. If 'row_label' is not
        specified, row numbers are used to identify reference dataset rows when
        the model is queried.

    features : str, optional
        The name of an image column in the input 'data' SFrame.

    feature_model : 'auto' | A model of type NeuralNetClassifier, optional
        A trained model for extracting features from raw data objects. By
        default ('auto'), we choose an appropriate model from our set of
        pre-trained models. See
        :class:`~graphlab.toolkits.feature_engineering.DeepFeatureExtractor` for
        more information.

    method : {'lsh', 'brute_force'}, optional
        The method used for nearest neighbor search. The 'lsh' option uses
        locality-sensitive hashing to find approximate results more quickly.

    verbose : bool, optional
        If True, print verbose output during model creation.

    Returns
    -------
    out : SimilaritySearchModel

    See Also
    --------
    SimilaritySearchModel
    graphlab.toolkits.nearest_neighbors
    graphlab.toolkits.feature_engineering

    Notes
    -----
    The similarity search toolkit currently uses cosine distance to evaluate the
    similarity between each query and candidate results.

    Examples
    --------
    First, split data into reference and query.

    >>> import graphlab as gl

    >>> data = gl.SFrame('http://s3.amazonaws.com/dato-datasets/mnist/sframe/train6k')
    >>> reference, query = data.random_split(0.8)

    Build neuralnet feature extractor for images:

    >>> nn_model = gl.neuralnet_classifier.create(reference, target='label')

    Construct SimilaritySearchModel:

    >>> model = gl.similarity_search.create(reference, features= 'image',
    ...                                     feature_model=nn_model)

    Find the most similar items in the reference set for each item in the query
    set:

    >>> model.search(query)
    """

    _mt._get_metric_tracker().track(__name__ + '.create')

    _raise_error_if_not_of_type(data, [_SFrame])
    _raise_error_if_not_of_type(features, [str])
    _raise_error_if_column_exists(data, features)

    if data[features].dtype() != _Image:
        raise _ToolkitError("Feature `%s` must be of type Image" \
                % features)

    return SimilaritySearchModel(data, row_label=row_label, feature=features,
            feature_model=feature_model, method=method, verbose=verbose)
Example #22
    def update(self, dataset, window_size=None, min_observations=None,
               verbose=True):
        """
        Create a new `MovingZScoreModel` with a new dataset. The `window_size`
        and `min_observations` parameters can also be updated with this method.

        The new model contains anomaly scores for each observation in the new
        `dataset`. In addition, the last `window_size` rows of the existing
        model's data and anomaly scores are prepended, for continuity and to
        show how the anomaly score is computed for the first few rows of the
        new `dataset`.

        Parameters
        ----------
        dataset : SFrame or TimeSeries
            New data to use for updating the model. The type of the input
            'dataset' must match the type of the data already in the model (if
            the model has data already).

        window_size : int, optional
            Length of the time window to use for defining the moving z-score
            value, in terms of number of observations. The window size will be
            the same as the current model's window size if a new window is not
            specified.

        min_observations : int, optional
            Minimum number of non-missing observations in the moving window
            required to compute the moving Z-score. If unspecified, the entire
            moving window preceding an observation must not contain any missing
            values in order for the observation to get an anomaly score. This
            parameter will be the same as the current model's value if not
            specified.

        verbose : bool, optional
            If True, print progress updates and model details.

        Returns
        -------
        out : MovingZScoreModel
            A *new* MovingZScoreModel, with an updated dataset and anomaly
            scores for the updated dataset. The `scores` field of the new model
            has the same schema as the `scores` field of the existing model,
            but data prepended from the existing results have a row ID of
            'None'.

        See Also
        --------
        create

        Examples
        --------
        >>> sf = graphlab.SFrame({'year': [2007, 2007, 2008, 2009, 2010, 2010],
        ...                       'value': [12.2, 11.7, 12.5, 21.4, 10.8, 11.2]})
        >>> model = graphlab.anomaly_detection.moving_zscore.create(sf,
        ...                                                         window_size=3,
        ...                                                         feature='value')
        ...
        >>> sf2 = graphlab.SFrame({'year': [2010, 2011, 2012, 2013],
        ...                        'value': [18.4, 12.1, 12.0, 3.6]})
        >>> model2 = model.update(sf2)
        >>> model2['scores'].print_rows(max_column_width=20)
        +--------+----------------+-------+----------------+---------------------+
        | row_id | anomaly_score  | value | moving_average |  model_update_time  |
        +--------+----------------+-------+----------------+---------------------+
        |  None  | 28.0822407386  |  21.4 | 12.1333333333  | 2016-01-04 16:58... |
        |  None  | 1.00086199482  |  10.8 |      15.2      | 2016-01-04 16:58... |
        |  None  | 0.795990414837 |  11.2 |      14.9      | 2016-01-04 16:58... |
        |   0    | 0.801849542822 |  18.4 | 14.4666666667  | 2016-01-04 16:58... |
        |   1    | 0.391346818515 |  12.1 | 13.4666666667  | 2016-01-04 16:58... |
        |   2    | 0.593171014002 |  12.0 |      13.9      | 2016-01-04 16:58... |
        |   3    | 3.52963789428  |  3.6  | 14.1666666667  | 2016-01-04 16:58... |
        +--------+----------------+-------+----------------+---------------------+
        [7 rows x 5 columns]
        """
        start_time = _time.time()
        _mt._get_metric_tracker().track(
                              'toolkit.anomaly_detection.moving_zscore.update')
        logger = _logging.getLogger(__name__)


        ## Validate the new dataset
        if not isinstance(dataset, (_gl.SFrame, _gl.TimeSeries)):
            raise TypeError("Input 'dataset' must be an SFrame or TimeSeries.")

        if len(dataset) < 1:
            raise TypeError("Input 'dataset' is empty.")

        if ((self.__proxy__['dataset_type'] == 'TimeSeries' and not isinstance(dataset, _gl.TimeSeries)) or
            (self.__proxy__['dataset_type'] == 'SFrame' and not isinstance(dataset, _gl.SFrame))):

            raise TypeError("New input 'dataset' must have the same type " +
                            "as the data already in the model.")

        ## Validate the new window size (if there is one), and figure out what
        #  the new window size will be.
        if window_size is None:
            window_size = self.__proxy__['window_size']

        else:
            if not isinstance(window_size, int):
                raise TypeError("Input 'window_size' must be an integer.")

            if window_size < 1:
                raise ValueError("Input 'window_size' must greater than or " +
                                 "equal to 1.")

        ## Validate and determine the `min_observations` parameter.
        if min_observations is None:
            min_observations = self.__proxy__['min_observations']

        else:
            if not isinstance(min_observations, int):
                raise TypeError("If specified, input 'min_observations' must " +
                                "be a positive integer.")

            if min_observations < 1:
                raise ValueError("If specified, input 'min_observations' must " +
                                 "be a positive integer.")


        ## TimeSeries-specific dataset validation.
        #  Make sure the new data occurs *after* the existing data.
        scores = self.__proxy__['scores']

        if isinstance(dataset, _gl.TimeSeries):
            first_new_timestamp = dataset[0][dataset.index_col_name]
            last_old_timestamp = scores[-1][scores.index_col_name]

            if first_new_timestamp < last_old_timestamp:
                raise _ToolkitError("The new dataset has data with " +
                                    "earlier timestamps than the existing " +
                                    "dataset. Please ensure that new data " +
                                    "occurs after existing data.")


        ## Extract the feature from the new dataset and validate it.
        feature = self.__proxy__['feature']

        try:
            series = dataset[feature]
        except:
            raise _ToolkitError("The feature specified by the original " +
                                "model could not be found in the input " +
                                "'dataset'.")

        if not series.dtype() in [int, float]:
            raise ValueError("The values in the specified feature must be " +
                             "integers or floats.")


        ## Create a new model and cut the old score object to the window size.
        new_state = {k: self.__proxy__[k]
            for k in ['verbose', 'feature', 'dataset_type']}

        new_state['window_size'] = window_size
        new_state['min_observations'] = min_observations

        new_model = MovingZScoreModel(new_state)


        ## Save just the old data needed for the moving statistics on the new
        #  data.
        if len(scores) < window_size:
            old_scores = scores[:]
        else:
            old_scores = scores[-window_size:]


        ## Compute Z-scores and anomaly scores.
        series = old_scores[feature].append(series)
        moving_average, moving_zscore, sufficient_data = \
            _moving_z_score(series, window_size, min_observations)

        anomaly_score = abs(moving_zscore)

        if not sufficient_data:
            logger.warning("The number of observations is smaller than " +
                           "the minimum number needed to compute a " +
                           "moving Z-score, so all anomaly scores are 'None'. " +
                           "Consider adding more data with the model's `update` " +
                           "method, or reducing the `window_size` or " +
                           "`min_observations` parameters.")

        ## General post-processing and formatting.
        scores = _gl.SFrame({feature: series,
                             'moving_average': moving_average,
                             'anomaly_score': anomaly_score})
        scores['model_update_time'] = _dt.datetime.now()

        scores = scores[[feature,  # reorder the columns
                         'moving_average',
                         'anomaly_score',
                         'model_update_time']]


        ## Replace the new Z-scores for the *old* data with the original
        #  Z-score for that data.
        num_new_examples = len(dataset)
        new_scores = scores[-num_new_examples:]

        if isinstance(dataset, _gl.TimeSeries):
            new_scores[dataset.index_col_name] = dataset[dataset.index_col_name]
            new_scores = _gl.TimeSeries(new_scores, index=dataset.index_col_name)

            ## The index column should have the same name in the old and new
            #  data. If it doesn't, change the name in the old scores.
            if dataset.index_col_name != old_scores.index_col_name:
                old_scores = old_scores.rename(
                           {old_scores.index_col_name: dataset.index_col_name})

                if verbose:
                    logger.warning("The new dataset's index column name " +
                                   "does not match the existing index " +
                                   "column name. The new name is used in " +
                                   "the new model.")

            final_scores = old_scores.union(new_scores)

        else:
            new_scores = new_scores.add_row_number('row_id')
            old_scores['row_id'] = None
            old_scores['row_id'] = old_scores['row_id'].astype(int)
            final_scores = old_scores.append(new_scores)


        ## Finalize and return the model.
        new_model.__proxy__['num_examples'] = len(scores)
        new_model.__proxy__['scores'] = final_scores
        new_model.__proxy__['training_time'] = _time.time() - start_time

        return new_model
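The `_moving_z_score` helper used above is not shown in this example. The sketch below approximates it in plain Python: each observation is scored against the mean and standard deviation of the preceding window, and observations whose window has fewer than `min_observations` non-missing values get None. With the docstring's numbers, the first scored row reproduces the 28.08 anomaly score shown above.

# Approximate sketch of a trailing-window moving z-score (the anomaly score in
# the model above is the absolute value of this z-score).
import math

def moving_z_score(values, window_size, min_observations):
    averages, zscores = [], []
    for i, x in enumerate(values):
        window = [v for v in values[max(0, i - window_size):i] if v is not None]
        if len(window) < min_observations or x is None:
            averages.append(None)
            zscores.append(None)
            continue
        mean = sum(window) / float(len(window))
        std = math.sqrt(sum((v - mean) ** 2 for v in window) / len(window))
        averages.append(mean)
        zscores.append((x - mean) / std if std > 0 else None)
    return averages, zscores

avgs, zs = moving_z_score([12.2, 11.7, 12.5, 21.4, 10.8],
                          window_size=3, min_observations=3)
print(zs[3])  # ~28.08, matching the example output above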
Example #23
    def evaluate(self, 
                 data,
                 methods=['average_similarity', 'average_quality', 'log_det']):
        """
        Objectively evaluate the quality and diversity of a data subset.

        There are several quantitative measures of the quality and diversity of
        a set. This method provides three:
            - Average quality: The average over the quality features of each of
              the items in data.
            - Average similarity: The average of the pairwise similarities
              between every item in data.
            - Log-determinant: This simultaneously measures both the quality and
              diversity of a set. To measure the log-determinant of a given set,
              we first form the similarity matrix L, where a diagonal entry L_ii
              corresponds to the quality of item i, and an off-diagonal entry
              L_ij corresponds to the similarity between items i and j. We then
              take the log of the determinant of this matrix. This type of
              matrix is also referred to as a Gramian matrix.

              The determinant of a Gramian matrix corresponds to the volume
              spanned by the vectors used to construct the matrix. If an item
              has a large quality, it corresponds to a longer vector, which will
              increase the volume (and determinant) of L. If two feature vectors
              are similar, then the volume decreases (because the vectors point
              in a similar direction), which correspondingly decreases the
              determinant. Thus, both quality and similarity are encapsulated by
              the log-determinant.

        Parameters
        ----------
        data: SFrame or SGraph
            The subset of data to evaluate.

        methods: list[string], {'average_similarity', 'average_quality', 'log_det'}
            The set of methods to measure. If methods is None, then all
            possible evaluation methods will be used.

        Returns
        -------
        out: dict
            Dictionary of values with keys corresponding to measurement types and values
            corresponding to the actual evaluation scores.

        Examples
        --------
        >>> cars = graphlab.SFrame.read_csv('https://s3.amazonaws.com/dato-datasets/auto-mpg/auto-mpg.csv')
        >>> sampler = graphlab.diverse_sampler.create(data=cars, 
                                                      item_id='name', 
                                                      quality_feature='accel', 
                                                      similarity_features=['mpg', 
                                                      'displ', 
                                                      'hp', 
                                                      'weight',
                                                      'origin'])

        >>> sf_simple_dd = gl.SFrame({'id': [0, 1, 2],
                                      'q':  [10, 10, 10],
                                      's1': [[1, 1, 1], [1, 1, 1], [1, 1, 1]]})
        >>> sampler = gl.diverse_sampler.create(data=sf_simple_dd,
                                                item_id='id',
                                                quality_feature='q',
                                                similarity_features=['s1'])
        >>> sf = sampler.sample(5, greedy=True, diversity=0.2)
        >>> sampler.evaluate(sf)
        {'log_det': 15.819720050211457, 'average_quality': 23.76, 
            'average_similarity': 0.999730969627407}
        """
        eval_frame = False
        if isinstance(data, _gl.SFrame):
            eval_frame = True
        elif not isinstance(data, _gl.SGraph):
            raise ValueError("Unknown data type " + str(type(data)) + ".")

        div_eval = _gl.extensions.diversity_eval()

        options = dict()
        options["eval_methods"] = methods

        if self._quality_feature is not None:
            options["quality_feature"] = self._quality_feature
        if self._similarity_features is not None:
            options["similarity_features"] = self._similarity_features

        if eval_frame:
            if not self._init_with_frame:
                raise _ToolkitError("Sampler initialized with SGraph, but eval "+ \
                                                        "was called with an SFrame.")
            return div_eval.evaluate_frame(data, options)
        else:
            if self._init_with_frame:
                raise _ToolkitError("Sampler initialized with SFrame, but eval "+ \
                                                        "was called with an SGraph.")
            return div_eval.evaluate_graph(data, options)
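A small numpy sketch of the log-determinant measure described in the docstring above: item qualities sit on the diagonal of L, pairwise similarities sit off the diagonal, and the log of the determinant grows with quality and shrinks with similarity. The quality and similarity numbers are invented for illustration.

# Sketch of the log-determinant diversity measure: build a Gramian-style
# matrix L from qualities (diagonal) and similarities (off-diagonal), then
# take the log of its determinant.
import numpy as np

quality = np.array([10.0, 10.0, 10.0])       # one quality value per item
similarity = np.array([[0.0, 0.2, 0.9],
                       [0.2, 0.0, 0.3],
                       [0.9, 0.3, 0.0]])      # symmetric pairwise similarities

L = similarity + np.diag(quality)
sign, log_det = np.linalg.slogdet(L)          # slogdet is numerically stable
print(log_det)
# Lowering the 0.9 similarity (i.e. making those two items more diverse)
# increases log_det; lowering the qualities decreases it.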
def create(dataset,
           label=None,
           features=None,
           distance=None,
           method='auto',
           verbose=True,
           **kwargs):
    """
    Create a nearest neighbor model, which can be searched efficiently and
    quickly for the nearest neighbors of a query observation. If the `method`
    argument is specified as `auto`, the type of model is chosen automatically
    based on the type of data in `dataset`.

    .. warning::

        The 'dot_product' distance is deprecated and will be removed in future
        versions of GraphLab Create. Please use 'transformed_dot_product'
        distance instead, although note that this is more than a name change;
        it is a *different* transformation of the dot product of two vectors.
        Please see the distances module documentation for more details.

    Parameters
    ----------
    dataset : SFrame
        Reference data. If the features for each observation are numeric, they
        may be in separate columns of 'dataset' or a single column with lists
        of values. The features may also be in the form of a column of sparse
        vectors (i.e. dictionaries), with string keys and numeric values.

    label : string, optional
        Name of the SFrame column with row labels. If 'label' is not specified,
        row numbers are used to identify reference dataset rows when the model
        is queried.

    features : list[string], optional
        Name of the columns with features to use in computing distances between
        observations and the query points. 'None' (the default) indicates that
        all columns except the label should be used as features. Each column
        can be one of the following types:

        - *Numeric*: values of numeric type integer or float.

        - *Array*: list of numeric (integer or float) values. Each list element
          is treated as a separate variable in the model.

        - *Dictionary*: key-value pairs with numeric (integer or float) values.
          Each key indicates a separate variable in the model.

        - *List*: list of integer or string values. Each element is treated as
          a separate variable in the model.

        - *String*: string values.

        Please note: if a composite distance is also specified, this parameter
        is ignored.

    distance : string, function, or list[list], optional
        Function to measure the distance between any two input data rows. This
        may be one of three types:

        - *String*: the name of a standard distance function. One of
          'euclidean', 'squared_euclidean', 'manhattan', 'levenshtein',
          'jaccard', 'weighted_jaccard', 'cosine', 'dot_product' (deprecated),
          or 'transformed_dot_product'.

        - *Function*: a function handle from the
          :mod:`~graphlab.toolkits.distances` module.

        - *Composite distance*: the weighted sum of several standard distance
          functions applied to various features. This is specified as a list of
          distance components, each of which is itself a list containing three
          items:

          1. list or tuple of feature names (strings)

          2. standard distance name (string)

          3. scaling factor (int or float)

        For more information about GraphLab Create distance functions, please
        see the :py:mod:`~graphlab.toolkits.distances` module.

        If 'distance' is left unspecified or set to 'auto', a composite
        distance is constructed automatically based on feature types.

    method : {'auto', 'ball_tree', 'brute_force', 'lsh'}, optional
        Method for computing nearest neighbors. The options are:

        - *auto* (default): the method is chosen automatically, based on the
          type of data and the distance. If the distance is 'manhattan' or
          'euclidean' and the features are numeric or vectors of numeric
          values, then the 'ball_tree' method is used. Otherwise, the
          'brute_force' method is used.

        - *ball_tree*: use a tree structure to find the k-closest neighbors to
          each query point. The ball tree model is slower to construct than the
          brute force model, but queries are faster than linear time. This
          method is not applicable for the cosine and dot product distances.
          See `Liu, et al (2004)
          <http://papers.nips.cc/paper/2666-an-investigation-of-practical-approximate-nearest-neighbor-algorithms>`_
          for implementation details.

        - *brute_force*: compute the distance from a query point to all
          reference observations. There is no computation time for model
          creation with the brute force method (although the reference data is
          held in the model), but each query takes linear time.

        - *lsh*: use Locality Sensitive Hashing (LSH) to find approximate
          nearest neighbors efficiently. The LSH model supports 'euclidean',
          'squared_euclidean', 'manhattan', 'cosine', 'jaccard', 'dot_product'
          (deprecated), and 'transformed_dot_product' distances. Two options
          are provided for LSH -- ``num_tables`` and
          ``num_projections_per_table``. See the notes below for details.

    verbose: bool, optional
        If True, print progress updates and model details.

    **kwargs : optional
        Options for the distance function and query method.

        - *leaf_size*: for the ball tree method, the number of points in each
          leaf of the tree. The default is to use the max of 1,000 and
          n/(2^11), which ensures a maximum tree depth of 12.

        - *num_tables*: For the LSH method, the number of hash tables
          constructed. The default value is 20. We recommend choosing values
          from 10 to 30.

        - *num_projections_per_table*: For the LSH method, the number of
          projections/hash functions for each hash table. The default value is
          4 for 'jaccard' distance, 16 for 'cosine' distance, and 8 for other
          distances. We recommend values from 2 to 6 for 'jaccard' distance,
          8 to 20 for 'cosine' distance, and 4 to 12 for other distances.

    Returns
    -------
    out : NearestNeighborsModel
        A structure for efficiently computing the nearest neighbors in 'dataset'
        of new query points.

    See Also
    --------
    NearestNeighborsModel.query, graphlab.toolkits.distances

    Notes
    -----
    - Missing data is not allowed in the 'dataset' provided to this function.
      Please use the :func:`graphlab.SFrame.fillna` and
      :func:`graphlab.SFrame.dropna` utilities to handle missing data before
      creating a nearest neighbors model.

    - Missing keys in sparse vectors are assumed to have value 0.

    - The `composite_params` parameter was removed as of GraphLab Create
      version 1.5. The `distance` parameter now accepts either standard or
      composite distances. Please see the :mod:`~graphlab.toolkits.distances`
      module documentation for more information on composite distances.

    - If the features should be weighted equally in the distance calculations
      but are measured on different scales, it is important to standardize the
      features. One way to do this is to subtract the mean of each column and
      divide by the standard deviation.
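
      For example, one way to put a numeric column on a common scale before
      creating the model (the column name is illustrative) is:

      >>> sf['X1'] = (sf['X1'] - sf['X1'].mean()) / sf['X1'].std()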

    **Locality Sensitive Hashing (LSH)**

    There are several efficient nearest neighbors search algorithms that work
    well for data of low dimension :math:`d` (up to approximately 50). However,
    most of the solutions suffer from either space or query time that is
    exponential in :math:`d`. For large :math:`d`, they often provide little,
    if any, improvement over the 'brute_force' method. This is a well-known
    consequence of the phenomenon called `The Curse of Dimensionality`.

    `Locality Sensitive Hashing (LSH)
    <https://en.wikipedia.org/wiki/Locality-sensitive_hashing>`_ is an approach
    that is designed to efficiently solve the *approximate* nearest neighbor
    search problem for high dimensional data. The key idea of LSH is to hash
    the data points using several hash functions, so that the probability of
    collision is much higher for data points which are close to each other than
    those which are far apart.

    An LSH family is a family of functions :math:`h` which map points from the
    metric space to a bucket, so that

    - if :math:`d(p, q) \\leq R`, then :math:`h(p) = h(q)` with at least probability :math:`p_1`.
    - if :math:`d(p, q) \\geq cR`, then :math:`h(p) = h(q)` with probability at most :math:`p_2`.

    LSH for efficient approximate nearest neighbor search:

    - We define a new family of hash functions :math:`g`, where each
      function :math:`g` is obtained by concatenating :math:`k` functions
      :math:`h_1, ..., h_k`, i.e., :math:`g(p)=[h_1(p),...,h_k(p)]`.
      The algorithm constructs :math:`L` hash tables, each of which
      corresponds to a different randomly chosen hash function :math:`g`.
      There are :math:`k \\cdot L` hash functions used in total.

    - In the preprocessing step, we hash all :math:`n` reference points
      into each of the :math:`L` hash tables.

    - Given a query point :math:`q`, the algorithm iterates over the
      :math:`L` hash functions :math:`g`. For each :math:`g` considered, it
      retrieves the data points that are hashed into the same bucket as q.
      These data points from all the :math:`L` hash tables are considered as
      candidates that are then re-ranked by their real distances with the query
      data.

    **Note** that the number of tables :math:`L` and the number of hash
    functions per table :math:`k` are two main parameters. They can be set
    using the options ``num_tables`` and ``num_projections_per_table``
    respectively.

    Hash functions for different distances:

    - `euclidean` and `squared_euclidean`:
      :math:`h(q) = \\lfloor \\frac{a \\cdot q + b}{w} \\rfloor` where
      :math:`a` is a vector, of which the elements are independently
      sampled from normal distribution, and :math:`b` is a number
      uniformly sampled from :math:`[0, r]`. :math:`r` is a parameter for the
      bucket width. We set :math:`r` using the average all-pair `euclidean`
      distances from a small randomly sampled subset of the reference data.

    - `manhattan`: The hash function for `manhattan` is similar to that of
      `euclidean`. The only difference is that the elements of `a` are sampled
      from a Cauchy distribution instead of a normal distribution.

    - `cosine`: Random projection is designed to approximate the cosine
      distance between vectors. The hash function is :math:`h(q) = sgn(a \\cdot
      q)`, where :math:`a` is a randomly sampled normal unit vector.

    - `jaccard`: We use the recently proposed one permutation hashing method of
      Shrivastava and Li. See the paper `[Shrivastava and Li, UAI 2014]
      <http://www.auai.org/uai2014/proceedings/individuals/225.pdf>`_ for
      details.

    - `dot_product`: The reference data points are first transformed to
      fixed-norm vectors, and then the minimum `dot_product` distance search
      problem can be solved via finding the reference data with smallest
      `cosine` distances. See the paper `[Neyshabur and Srebro, ICML 2015]
      <http://jmlr.org/proceedings/papers/v37/neyshabur15.html>`_ for details.

    References
    ----------
    - `Wikipedia - nearest neighbor
      search <http://en.wikipedia.org/wiki/Nearest_neighbor_search>`_

    - `Wikipedia - ball tree <http://en.wikipedia.org/wiki/Ball_tree>`_

    - Ball tree implementation: Liu, T., et al. (2004) `An Investigation of
      Practical Approximate Nearest Neighbor Algorithms
      <http://papers.nips.cc/paper/2666-an-investigation-of-practical-approximate-nearest-neighbor-algorithms>`_.
      Advances in Neural Information Processing Systems pp. 825-832.

    - `Wikipedia - Jaccard distance
      <http://en.wikipedia.org/wiki/Jaccard_index>`_

    - Weighted Jaccard distance: Chierichetti, F., et al. (2010) `Finding the
      Jaccard Median
      <http://theory.stanford.edu/~sergei/papers/soda10-jaccard.pdf>`_.
      Proceedings of the Twenty-First Annual ACM-SIAM Symposium on Discrete
      Algorithms. Society for Industrial and Applied Mathematics.

    - `Wikipedia - Cosine distance
      <http://en.wikipedia.org/wiki/Cosine_similarity>`_

    - `Wikipedia - Levenshtein distance
      <http://en.wikipedia.org/wiki/Levenshtein_distance>`_

    - Locality Sensitive Hashing : Chapter 3 of the book `Mining Massive
      Datasets <http://infolab.stanford.edu/~ullman/mmds/ch3.pdf>`_.

    Examples
    --------
    Construct a nearest neighbors model with automatically determined method
    and distance:

    >>> sf = graphlab.SFrame({'X1': [0.98, 0.62, 0.11],
    ...                       'X2': [0.69, 0.58, 0.36],
    ...                       'str_feature': ['cat', 'dog', 'fossa']})
    >>> model = graphlab.nearest_neighbors.create(sf, features=['X1', 'X2'])

    For datasets with a large number of rows and up to about 100 variables, the
    ball tree method often leads to much faster queries.

    >>> model = graphlab.nearest_neighbors.create(sf, features=['X1', 'X2'],
    ...                                           method='ball_tree')

    Often the final determination of a neighbor is based on several distance
    computations over different sets of features. Each part of this composite
    distance may have a different relative weight.

    >>> my_dist = [[['X1', 'X2'], 'euclidean', 2.],
    ...            [['str_feature'], 'levenshtein', 3.]]
    ...
    >>> model = graphlab.nearest_neighbors.create(sf, distance=my_dist)
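
    LSH-specific options such as ``num_tables`` and ``num_projections_per_table``
    can be passed directly as keyword arguments; the values here are purely
    illustrative:

    >>> model = graphlab.nearest_neighbors.create(sf, features=['X1', 'X2'],
    ...                                           method='lsh',
    ...                                           num_tables=20,
    ...                                           num_projections_per_table=16)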
    """

    ## Validate the 'dataset' input
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    ## Basic validation of the features input
    if features is not None and not isinstance(features, list):
        raise TypeError("If specified, input 'features' must be a list of " +
                        "strings.")

    ## Clean the method options and create the options dictionary
    allowed_kwargs = ['leaf_size', 'num_tables', 'num_projections_per_table']
    _method_options = {}

    for k, v in kwargs.items():
        if k in allowed_kwargs:
            _method_options[k] = v
        else:
            raise _ToolkitError(
                "'{}' is not a valid keyword argument".format(k) +
                " for the nearest neighbors model. Please " +
                "check for capitalization and other typos.")

    ## Exclude inappropriate combinations of method and distance
    if method == 'ball_tree' and (
            distance == 'cosine' or distance == _graphlab.distances.cosine
            or distance == 'dot_product'
            or distance == _graphlab.distances.dot_product
            or distance == 'transformed_dot_product'
            or distance == _graphlab.distances.transformed_dot_product):
        raise TypeError(
            "The ball tree method does not work with 'cosine', " +
            "'dot_product', or 'transformed_dot_product' distance. " +
            "Please use the 'brute_force' method for these distances.")

    if method == 'lsh' and ('num_projections_per_table'
                            not in _method_options):
        if distance == 'jaccard' or distance == _graphlab.distances.jaccard:
            _method_options['num_projections_per_table'] = 4
        elif distance == 'cosine' or distance == _graphlab.distances.cosine:
            _method_options['num_projections_per_table'] = 16
        else:
            _method_options['num_projections_per_table'] = 8

    ## Initial validation and processing of the label
    if label is None:
        _label = _robust_column_name('__id', dataset.column_names())
        _dataset = dataset.add_row_number(_label)
    else:
        _label = label
        _dataset = _copy.copy(dataset)

    col_type_map = {c: _dataset[c].dtype() for c in _dataset.column_names()}
    _validate_row_label(_label, col_type_map)
    ref_labels = _dataset[_label]

    ## Determine the internal list of available feature names (may still include
    #  the row label name).
    if features is None:
        _features = _dataset.column_names()
    else:
        _features = _copy.deepcopy(features)

    ## Check if there's only one feature and it's the same as the row label.
    #  This would also be trapped by the composite distance validation, but the
    #  error message is not very informative for the user.
    free_features = set(_features).difference([_label])
    if len(free_features) < 1:
        raise _ToolkitError("The only available feature is the same as the " +
                            "row label column. Please specify features " +
                            "that are not also row labels.")

    ### Validate and preprocess the distance function
    ### ---------------------------------------------
    # - The form of the 'distance' controls how we interact with the 'features'
    #   parameter as well.
    # - At this point, the row label 'label' may still be in the list(s) of
    #   features.

    ## Convert any distance function input into a single composite distance.
    # distance is already a composite distance
    if isinstance(distance, list):
        distance = _copy.deepcopy(distance)

    # distance is a single name (except 'auto') or function handle.
    elif (hasattr(distance, '__call__')
          or (isinstance(distance, str) and not distance == 'auto')):
        distance = [[_features, distance, 1]]

    # distance is unspecified and needs to be constructed.
    elif distance is None or distance == 'auto':
        sample = _dataset.head()
        distance = _construct_auto_distance(_features, _dataset.column_names(),
                                            _dataset.column_types(), sample)

    else:
        raise TypeError("Input 'distance' not understood. The 'distance' "
                        " argument must be a string, function handle, or " +
                        "composite distance.")

    ## Basic composite distance validation, remove the row label from all
    #  feature lists, and convert string distance names into distance functions.
    distance = _scrub_composite_distance_features(distance, [_label])
    distance = _convert_distance_names_to_functions(distance)
    _validate_composite_distance(distance)

    ## Validate distances used with list-typed feature columns: sparse
    #  distances require lists of strings; other distances cannot use lists.
    list_features_to_check = []
    sparse_distances = [
        'jaccard', 'weighted_jaccard', 'cosine', 'dot_product',
        'transformed_dot_product'
    ]
    sparse_distances = [
        _graphlab.distances.__dict__[k] for k in sparse_distances
    ]
    for d in distance:
        feature_names, dist, _ = d
        list_features = [
            f for f in feature_names if _dataset[f].dtype() == list
        ]
        for f in list_features:
            if dist in sparse_distances:
                list_features_to_check.append(f)
            else:
                raise TypeError(
                    "The chosen distance cannot currently be used " +
                    "on list-typed columns.")
    for f in list_features_to_check:
        only_str_lists = _validate_lists(_dataset[f], [str])
        if not only_str_lists:
            raise TypeError("Distances for sparse data, such as jaccard " +
                            "and weighted_jaccard, can only be used on " +
                            "lists containing only strings. Please modify " +
                            "any list features accordingly before creating " +
                            "the nearest neighbors model.")

    ## Raise an error if Levenshtein distance is used with multiple feature columns
    for d in distance:
        feature_names, dist, _ = d

        if (len(feature_names) > 1) and (dist
                                         == _graphlab.distances.levenshtein):
            raise ValueError(
                "Levenshtein distance cannot be used with multiple " +
                "columns. Please concatenate strings into a single " +
                "column before creating the nearest neighbors model.")

    ## Get the union of feature names and make a clean dataset.
    clean_features = _get_composite_distance_features(distance)
    sf_clean = _tkutl._toolkits_select_columns(_dataset, clean_features)

    ## Decide which method to use
    ## - If more than one distance component (specified either directly or
    #  generated automatically because distance set to 'auto'), then do brute
    #  force.
    if len(distance) > 1:
        _method = 'brute_force'

        if method != 'brute_force' and verbose is True:
            print("Defaulting to brute force instead of ball tree because " +\
                "there are multiple distance components.")

    else:
        if method == 'auto':

            # get the total number of variables. Assume the number of elements in
            # array type columns does not change
            num_variables = sum([
                len(x) if hasattr(x, '__iter__') else 1
                for x in _six.itervalues(sf_clean[0])
            ])

            # flag if all the features in the single composite are of numeric
            # type.
            numeric_type_flag = all([
                x in [int, float, list, array.array]
                for x in sf_clean.column_types()
            ])

            ## Conditions necessary for ball tree to work and be worth it
            if ((distance[0][1] in [
                    'euclidean', 'manhattan', _graphlab.distances.euclidean,
                    _graphlab.distances.manhattan
            ]) and numeric_type_flag is True and num_variables <= 200):

                _method = 'ball_tree'

            else:
                _method = 'brute_force'

        else:
            _method = method

    ## Pick the right model name for the method
    if _method == 'ball_tree':
        model_name = 'nearest_neighbors_ball_tree'
        _mt._get_metric_tracker().track(
            'toolkit.nearest_neighbors_balltree.create')

    elif _method == 'brute_force':
        model_name = 'nearest_neighbors_brute_force'
        _mt._get_metric_tracker().track(
            'toolkit.nearest_neighbors_brute.create')

    elif _method == 'lsh':
        model_name = 'nearest_neighbors_lsh'
        _mt._get_metric_tracker().track('toolkit.nearest_neighbors_lsh.create')

    else:
        raise ValueError(
            "Method must be 'auto', 'ball_tree', 'brute_force', " +
            "or 'lsh'.")

    ## Package the model options
    opts = {}
    opts.update(_method_options)
    opts.update({
        'model_name': model_name,
        'ref_labels': ref_labels,
        'label': label,
        'sf_features': sf_clean,
        'composite_params': distance
    })

    ## Construct the nearest neighbors model
    if not verbose:
        _mt.main.get_server().set_log_progress(False)

    result = _graphlab.extensions._nearest_neighbors.train(opts)

    _mt.main.get_server().set_log_progress(True)

    model_proxy = result['model']
    model = NearestNeighborsModel(model_proxy)

    return model
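To complement the LSH discussion in the docstring above, here is a minimal numpy sketch of the sign-of-random-projection hash family described for cosine distance (h(q) = sgn(a . q)). The table and projection counts are illustrative and this is not the toolkit's internal implementation.

# Sketch of sign-random-projection LSH for cosine distance: each table hashes
# a point to the sign pattern of several random projections, so points at a
# small cosine distance tend to land in the same bucket.
import numpy as np

np.random.seed(0)
num_tables, num_projections_per_table, dim = 4, 8, 16
tables = [np.random.randn(num_projections_per_table, dim)
          for _ in range(num_tables)]

def hash_point(x):
    # One bucket key per table: a tuple of projection signs.
    return [tuple(np.sign(np.dot(A, x)).astype(int)) for A in tables]

x = np.random.randn(dim)
y = x + 0.01 * np.random.randn(dim)   # a near-duplicate of x
z = np.random.randn(dim)              # an unrelated point

# The near-duplicate collides with x in (almost) every table; the unrelated
# point rarely does.
print(sum(hx == hy for hx, hy in zip(hash_point(x), hash_point(y))))  # likely 4
print(sum(hx == hz for hx, hz in zip(hash_point(x), hash_point(z))))  # likely 0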
Example #25
    def __init__(self, 
                 data=None,
                 item_id=None,
                 quality_feature=None,
                 similarity_features=None,
                 model_proxy = None,
                 _class = None):
        """ 
        Create a DiverseSampler object. This should never be called directly,
        because it is necessary to set up an SDK proxy prior to calling
        __init__.
        """
        if _class:
            self.__class__ = _class

        self._init_with_frame = False 

        self.__proxy__ = model_proxy
        self.__name__ = 'diverse_sampler'
        self._quality_feature = quality_feature
        self._similarity_features = similarity_features

        if data is None and model_proxy is None:
            raise ValueError("The diverse sampler must be initialized with a " +
                                             "reference SFrame or SGraph.")
        elif data is not None:
            if not (isinstance(data, _gl.SFrame) or isinstance(data, _gl.SGraph)):
                raise ValueError("Unknown data type " + str(type(data)) + ".")

        if item_id is None and model_proxy is None:
            # Note that for SGraphs, the __id vertex field is intrinsic to each
            # gl.Vertex, so we don't actually need to specify item_id
            if isinstance(data, _gl.SFrame):
                raise ValueError("An item_id must be specified.")

        if isinstance(data, _gl.SFrame):
            col_names = data.column_names()
        elif isinstance(data, _gl.SGraph):
            if similarity_features is not None and len(similarity_features) > 1:
                raise _ToolkitError("Only 1 similarity feature is supported for SGraph.")
            col_names = data.get_fields()

        if isinstance(data, _gl.SFrame) and item_id not in col_names:
            raise ValueError("Item ID "+item_id+" does not name " +
                                             "a column in the SFrame.")

        if quality_feature is not None and quality_feature not in col_names:
            raise ValueError("Quality feature "+quality_feature+" does not name " +
                                             "a column in the SFrame.")

        if similarity_features is not None:
            for sname in similarity_features:
                if sname not in col_names:
                    raise ValueError("Similarity feature "+sname+" does not name " +
                                                     "a column in the SFrame.")

        opts = dict()
        if item_id is None and isinstance(data, _gl.SGraph):
            item_id = "__id"
        opts["item_id"] = item_id

        if quality_feature is not None:
            opts["quality_feature"] = quality_feature
        if similarity_features is not None:
            opts["similarity_features"] = similarity_features

        if isinstance(data, _gl.SFrame):
            self._init_with_frame = True
            self.__proxy__.init_with_frame(data, opts)
        elif isinstance(data, _gl.SGraph):
            self._init_with_frame = False
            self.__proxy__.init_with_graph(data, opts)
    def evaluate(self, dataset, query_name=None, k=5, similarity_threshold=None,
                 exclude_zeros=True, verbose=True):
        """
        Match the reference tags to a set of queries labeled with their true
        tags, and then evaluate the model's performance on those queries.

        The true tags should be provided as an additional column in ``dataset``,
        and that column's name should be the same as the ``tag_name`` parameter
        specified when the model was created. The type of the tags column should
        be either string or list (of strings).

        Parameters
        ----------
        dataset : SFrame
            Query data to be tagged.

        query_name : string, optional
            Name of the column in ``dataset`` to be auto-tagged. If ``dataset``
            has more than one column, ``query_name`` must be specified.

        k : int, optional
            Number of results to return from the reference set for each query
            observation. The default is 5, but setting it to ``None`` will
            return all results whose score is greater than or equal to
            ``similarity_threshold``.

        similarity_threshold : float, optional
            Only results whose score is greater than or equal to the specified
            ``similarity_threshold`` are returned. The default is ``None``, in
            which case the ``k`` best results are returned for each query point
            regardless of score.

        exclude_zeros : boolean, optional
            If True, only entries for which there is a tag with a nonzero score
            are preserved in the output. This is the default behavior.

        verbose: bool, optional
            If True, print progress updates and model details.

        Returns
        -------
        out : dict
            A dictionary containing the entire confusion matrix, as well as the
            following evaluation metrics:

            - Precision
            - Recall
            - F1 score

        See Also
        --------
        tag, graphlab.evaluation.confusion_matrix

        Notes
        -----
        - Autotagging is a variation on multiclass classification, where in
          contrast to a multiclass classifier, an autotagger model can output
          zero tags for a particular query (either because there were no tags
          with non-zero scores, or as a result of specifying a value for the
          similarity_threshold parameter). As is standard practice in multiclass
          classification, we report Precision, Recall, and F1 score as our
          evaluation metrics. Specifically, we microaverage Precision and Recall
          by counting type I errors (false positives) and type II errors (false
          negatives) over the entire confusion matrix.

        References
        ----------
        - `Wikipedia - Precision and
          recall <http://en.wikipedia.org/wiki/Precision_and_recall>`_

        - Manning, C., Raghavan P., and Schutze H. (2008). Introduction to
          Information Retrieval.

        Examples
        --------
        Continuing with the actor autotagger model referenced in the previous
        example (for the ``tag`` method):

        >>> labeled_reviews_sf = gl.SFrame(
                "s3://dato-datasets/imdb_reviews/reviews.10.tagged.sframe")
        >>> labeled_reviews_sf
        +-------------------------------+---------------------+
        |             review            |        actor        |
        +-------------------------------+---------------------+
        | When I saw this movie I wa... | [Leonardo DiCaprio] |
        | I rented this movie last w... |     [Matt Damon]    |
        | You've gotta hand it to St... |   [Angelina Jolie]  |
        | I caught this film at a te... |   [Julia Roberts]   |
        | I took a flyer in renting ... |  [Jennifer Aniston] |
        | Frankly I'm rather incense... |          []         |
        | This movie looked as if it... |      [Jude Law]     |
        | My wife and I watch a film... |          []         |
        | A story of amazing disinte... |          []         |
        | I don't remember a movie w... |          []         |
        +-------------------------------+---------------------+

        >>> m.evaluate(labeled_reviews_sf, query_name="review", verbose=False,
                k=1)

        .. sourcecode:: python

            {'confusion_matrix': Columns
                    count	int
                    target_label	str
                    predicted_label	str

             Rows: 10

             Data:
             +-------+-------------------+-------------------+
             | count |    target_label   |  predicted_label  |
             +-------+-------------------+-------------------+
             |   1   | Leonardo DiCaprio | Leonardo DiCaprio |
             |   1   |     Matt Damon    |     Matt Damon    |
             |   1   |   Angelina Jolie  |   Angelina Jolie  |
             |   1   |   Julia Roberts   |   Julia Roberts   |
             |   1   |  Jennifer Aniston |  Jennifer Aniston |
             |   1   |      Jude Law     |      Jude Law     |
             |   1   |        None       |     Will Smith    |
             |   1   |        None       |     Emma Stone    |
             |   1   |        None       |  Jennifer Aniston |
             |   1   |        None       |  Charlize Theron  |
             +-------+-------------------+-------------------+
             [10 rows x 3 columns],
             'f1_score': 0.7499999999999999,
             'precision': 0.6,
             'recall': 1.0}

        >>> m.evaluate(labeled_reviews_sf, query_name="review", verbose=False,
                       k=1, similarity_threshold=.6)

        .. sourcecode:: python

            {'confusion_matrix': Columns:
                    count	int
                    target_label	str
                    predicted_label	str

             Rows: 7

             Data:
             +-------+-------------------+-------------------+
             | count |    target_label   |  predicted_label  |
             +-------+-------------------+-------------------+
             |   1   | Leonardo DiCaprio | Leonardo DiCaprio |
             |   1   |   Angelina Jolie  |   Angelina Jolie  |
             |   1   |   Julia Roberts   |   Julia Roberts   |
             |   4   |        None       |        None       |
             |   1   |      Jude Law     |      Jude Law     |
             |   1   |     Matt Damon    |        None       |
             |   1   |  Jennifer Aniston |        None       |
             +-------+-------------------+-------------------+
             [7 rows x 3 columns],
             'f1_score': 0.8,
             'precision': 1.0,
             'recall': 0.6666666666666666}

        """
        _mt._get_metric_tracker().track(self.__module__ + '.evaluate')

        tag_name = self.get("tag_name")
        true_tags = dataset.select_column(tag_name)

        if true_tags.dtype() not in (list, str):
            raise TypeError("The %s column must either be of type str or list" % tag_name)

        if true_tags.dtype() == str:
            true_tags = true_tags.apply(lambda x: [x] if x else [])

        true_tags = true_tags.fillna([])

        dataset = dataset.select_columns([x for x in dataset.column_names() if x != tag_name])

        if similarity_threshold:
            if not isinstance(similarity_threshold, (float, int)):
                raise _ToolkitError("similarity_threshold parameter must be a" \
                                    "float or an int.")

            if similarity_threshold < 0 or similarity_threshold > 1:
                raise _ToolkitError("similarity_threshold parameter must be " \
                                    "between 0 and 1.")

        results = self.tag(dataset, query_name=query_name, k=k,
                           similarity_threshold=similarity_threshold,
                           exclude_zeros=exclude_zeros, verbose=verbose)

        if len(results) == 0:
            raise ValueError("There is no data to evaluate. Try reducing the " \
                             "similarity_threshold or increasing k.")

        group_column = (query_name or dataset.column_names()[0]) + "_id"
        dataset = dataset.add_row_number(group_column)
        results = results.groupby(group_column, {"labels": _gl.aggregate.CONCAT(tag_name)})
        results = dataset.join(results, on={group_column: group_column}, how="left")
        results = results.fillna("labels", [])
        results = results.sort(group_column)

        def precision(tps, fps):
            return tps / float(tps + fps)

        def recall(tps, fns):
            return tps / float(tps + fns)

        def f1_score(p, r):
            return 2 * p * r / (p + r)

        confusion_matrix = _gl.evaluation.confusion_matrix(true_tags, results["labels"])
        confusion_matrix = confusion_matrix.stack("target_label", "target_label")

        # TO DO: this next line will be removed once .stack type-inference is fixed
        # or type_hint parameter is exposed
        confusion_matrix = _gl.SFrame({"predicted_label": [["stub"]],
                                       "count": [1], "target_label": ["stub"]})\
                              .append(confusion_matrix)

        confusion_matrix = confusion_matrix.stack("predicted_label", "predicted_label")

        # TO DO: remove this next line, per note above
        confusion_matrix = confusion_matrix[1:]

        tps = confusion_matrix[confusion_matrix.apply(
            lambda row: row["predicted_label"] is not None and \
            row["target_label"] == row["predicted_label"])]["count"].sum() or 0

        fps = confusion_matrix[confusion_matrix.apply(
            lambda row: row["predicted_label"] is not None and \
            row["target_label"] != row["predicted_label"])]["count"].sum() or 0

        fns = confusion_matrix[confusion_matrix.apply(
            lambda row: row["predicted_label"] is None and \
            row["target_label"] is not None)]["count"].sum() or 0

        p  = precision(tps, fps)
        r  = recall(tps, fns)
        f1 = f1_score(p, r)

        return {"precision": p, "recall": r, "f1_score": f1,
                'confusion_matrix': confusion_matrix}
    def tag(self, dataset, query_name=None, k=5, similarity_threshold=None,
            exclude_zeros=True, verbose=True):
        """
        Match the reference tags passed when a model is created to a new set of
        queries. This is a many-to-many match: each query may have any number of
        occurrences of a reference tag.

        Parameters
        ----------
        dataset : SFrame
            Query data to be tagged.

        query_name : string, optional
            Name of the column in ``dataset`` to be auto-tagged. If ``dataset``
            has more than one column, ``query_name`` must be specified.

        k : int, optional
            Number of results to return from the reference set for each query
            observation. The default is 5, but setting it to ``None`` will
            return all results whose score is greater than or equal to
            ``similarity_threshold``.

        similarity_threshold : float, optional
            Only results whose score is greater than or equal to the specified
            ``similarity_threshold`` are returned. The default is ``None``, in
            which case the ``k`` best results are returned for each query point.

        verbose : bool, optional
            If True, print progress updates and model details.

        exclude_zeros : boolean, optional
            If True, only entries for which there is a tag with a nonzero score
            are preserved in the output. This is the default behavior.

        Returns
        -------
        out : SFrame
            An SFrame with four columns:

            - row ID
            - column name specified as `tag_name` parameter to `create` method
            - column name specified as `query_name` parameter to `tag` method
            - a similarity score between 0 and 1, indicating the strength of the
              match between the query data and the suggested reference tag,
              where a score of zero indicates a poor match and a strength of 1
              corresponds to a perfect match

        Notes
        -----
        - By default, only rows for which there is a tag with a nonzero score
          are included in the output. To guarantee at least one output row for
          every input row in ``dataset``, set the ``exclude_zeros`` parameter
          to False.

        - If both ``k`` and ``similarity_threshold`` are set to ``None``, a
          ToolkitError is raised.

        Examples
        --------
        First construct a toy `SFrame` of actor names, which will serve as the
        reference set for our autotagger model.

        >>> actors_sf = gl.SFrame(
                {"actor": ["Will Smith", "Tom Hanks", "Bradley Cooper",
                           "Tom Cruise", "Jude Law", "Robert Pattinson",
                           "Matt Damon", "Brad Pitt", "Johnny Depp",
                           "Leonardo DiCaprio", "Jennifer Aniston",
                           "Jessica Alba", "Emma Stone", "Cameron Diaz",
                           "Scarlett Johansson", "Mila Kunis", "Julia Roberts",
                           "Charlize Theron", "Marion Cotillard",
                           "Angelina Jolie"]})
        >>> m = gl.data_matching.autotagger.create(actors_sf, tag_name="actor")

        Then we load some IMDB movie reviews into an `SFrame` and tag them using
        the model we created above. The score field in the output is a
        similarity score, indicating the strength of the match between the query
        data and the suggested reference tag.

        >>> reviews_sf = gl.SFrame(
                "s3://dato-datasets/imdb_reviews/reviews.sframe")
        >>> m.tag(reviews_sf.head(10), query_name="review", verbose=False)
        +-----------+-------------------------------+------------------+-----------------+
        | review_id |             review            |      actor       |      score      |
        +-----------+-------------------------------+------------------+-----------------+
        |     0     | Story of a man who has unn... |   Cameron Diaz   | 0.0769230769231 |
        |     0     | Story of a man who has unn... |  Angelina Jolie  | 0.0666666666667 |
        |     0     | Story of a man who has unn... | Charlize Theron  |      0.0625     |
        |     0     | Story of a man who has unn... | Robert Pattinson | 0.0588235294118 |
        |     1     | Bromwell High is a cartoon... |   Jessica Alba   |      0.125      |
        |     1     | Bromwell High is a cartoon... | Jennifer Aniston |       0.1       |
        |     1     | Bromwell High is a cartoon... | Charlize Theron  |       0.05      |
        |     1     | Bromwell High is a cartoon... | Robert Pattinson |  0.047619047619 |
        |     1     | Bromwell High is a cartoon... | Marion Cotillard |  0.047619047619 |
        |     2     | Airport '77 starts as a br... |  Julia Roberts   | 0.0961538461538 |
        |    ...    |              ...              |       ...        |       ...       |
        +-----------+-------------------------------+------------------+-----------------+

        The initial results look a little noisy. To filter out obvious spurious
        matches, we can set the `tag` method's `similarity_threshold` parameter.

        >>> m.tag(reviews_sf.head(1000), query_name="review", verbose=False,
                  similarity_threshold=.8)
        +-----------+-------------------------------+------------------+----------------+
        | review_id |             review            |      actor       |     score      |
        +-----------+-------------------------------+------------------+----------------+
        |    341    | I caught this film at a te... |  Julia Roberts   | 0.857142857143 |
        |    657    | Fairly funny Jim Carrey ve... | Jennifer Aniston | 0.882352941176 |
        |    668    | A very funny movie. It was... | Jennifer Aniston | 0.833333333333 |
        |    673    | This film is the best film... | Jennifer Aniston |     0.9375     |
        +-----------+-------------------------------+------------------+----------------+

        """
        _mt._get_metric_tracker().track(self.__module__ + '.tag')

        # validate the 'dataset' input
        _tkutl._raise_error_if_not_sframe(dataset, "dataset")
        _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

        # ensure that either k or similarity_threshold is set
        if not (k or similarity_threshold):
            raise _ToolkitError("Either k or similarity_threshold parameters " \
                                "must be set")

        # ensure that query_name is provided if dataset has > 1 column
        if dataset.num_cols() > 1 and not query_name:
            raise _ToolkitError("No query_name parameter specified on " \
                                "dataset with %d columns" % dataset.num_cols())

        query_column = query_name or dataset.column_names()[0]

        # ensure that column with name tag_name exists
        if query_column not in dataset.column_names():
            raise _ToolkitError('No column named "%s" in dataset' \
                                % query_column)

        query_sa = dataset.select_column(query_column)
        query_sf = _gl.SFrame({"id": range(len(query_sa)),
                               query_column: query_sa})

        features = _preprocess(query_sa)
        features = features.add_row_number()

        if similarity_threshold:
            if not isinstance(similarity_threshold, (float, int)):
                raise _ToolkitError("similarity_threshold parameter must be a" \
                                    "float or an int.")

            if similarity_threshold < 0 or similarity_threshold > 1:
                raise _ToolkitError("similarity_threshold parameter must be " \
                                    "between 0 and 1.")

        radius = (1 - similarity_threshold) if similarity_threshold else None

        results = self._nn_model.query(features, label="id", k=k,
                                       radius=radius,
                                       verbose=verbose)

        # return empty SFrame immediately if no NN results
        if len(results) == 0:
            return _gl.SFrame({query_column + "_id": [],
                               query_column: [],
                               self.get("tag_name"): [],
                               "score": []})

        results = results.join(query_sf, on={"query_label": "id"})
        results.rename({"query_label": query_column + "_id",
                        query_column: "query_label"})

        # convert distances to similarity scores
        scores = _dists_to_sim_scores("weighted_jaccard", results)

        results.add_column(scores, "score")
        results.remove_column("distance")
        results.remove_column("rank")
        results.rename({"reference_label": self.get("tag_name"),
                        "query_label": query_column})
        results.swap_columns(self.get("tag_name"), query_column)

        if exclude_zeros:
            try:
                results = results.filter_by(0.0, "score", exclude=True)
            except RuntimeError: # nothing to join
                _logging.getLogger(__name__).warn(
                    "Empty results after filtering scores of 0.")
                results = results.head(0)

        return results
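
# A minimal, hedged sketch (not part of the toolkit source) of the
# microaveraged metrics that the `evaluate` method above derives from the
# confusion matrix: true positives are correct tag predictions, false
# positives are predicted tags with no matching true tag, and false negatives
# are true tags that received no prediction.
def _micro_metrics_sketch(true_positives, false_positives, false_negatives):
    """Microaveraged precision, recall, and F1 from global error counts."""
    precision = true_positives / float(true_positives + false_positives)
    recall = true_positives / float(true_positives + false_negatives)
    f1 = 2 * precision * recall / (precision + recall)
    return {"precision": precision, "recall": recall, "f1_score": f1}

# With the counts from the first docstring example above (6 correct tags,
# 4 spurious tags, 0 missed tags):
# _micro_metrics_sketch(6, 4, 0) -> precision 0.6, recall 1.0, f1 ~0.75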
Example #28
0
    def tag(self,
            dataset,
            query_name=None,
            k=5,
            similarity_threshold=None,
            exclude_zeros=True,
            verbose=True):
        """
        Match the reference tags passed when a model is created to a new set of
        queries. This is a many-to-many match: each query may have any number of
        occurrences of a reference tag.

        Parameters
        ----------
        dataset : SFrame
            Query data to be tagged.

        query_name : string, optional
            Name of the column in ``dataset`` to be auto-tagged. If ``dataset``
            has more than one column, ``query_name`` must be specified.

        k : int, optional
            Number of results to return from the reference set for each query
            observation. The default is 5, but setting it to ``None`` will
            return all results whose score is greater than or equal to
            ``similarity_threshold``.

        similarity_threshold : float, optional
            Only results whose score is greater than or equal to the specified
            ``similarity_threshold`` are returned. The default is ``None``, in
            which case the ``k`` best results are returned for each query point.

        verbose : bool, optional
            If True, print progress updates and model details.

        exclude_zeros : boolean, optional
            If True, only entries for which there is a tag with a nonzero score
            are preserved in the output. This is the default behavior.

        Returns
        -------
        out : SFrame
            An SFrame with four columns:

            - row ID
            - column name specified as `tag_name` parameter to `create` method
            - column name specified as `query_name` parameter to `tag` method
            - a similarity score between 0 and 1, indicating the strength of the
              match between the query data and the suggested reference tag,
              where a score of zero indicates a poor match and a strength of 1
              corresponds to a perfect match

        Notes
        -----
        - By default, only rows for which there is a tag with a nonzero score
          are included in the output. To guarantee at least one output row for
          every input row in ``dataset``, set the ``exclude_zeros`` parameter
          to False.

        - If both ``k`` and ``similarity_threshold`` are set to ``None``, a
          ToolkitError is raised.

        Examples
        --------
        First construct a toy `SFrame` of actor names, which will serve as the
        reference set for our autotagger model.

        >>> actors_sf = gl.SFrame(
                {"actor": ["Will Smith", "Tom Hanks", "Bradley Cooper",
                           "Tom Cruise", "Jude Law", "Robert Pattinson",
                           "Matt Damon", "Brad Pitt", "Johnny Depp",
                           "Leonardo DiCaprio", "Jennifer Aniston",
                           "Jessica Alba", "Emma Stone", "Cameron Diaz",
                           "Scarlett Johansson", "Mila Kunis", "Julia Roberts",
                           "Charlize Theron", "Marion Cotillard",
                           "Angelina Jolie"]})
        >>> m = gl.data_matching.autotagger.create(actors_sf, tag_name="actor")

        Then we load some IMDB movie reviews into an `SFrame` and tag them using
        the model we created above. The score field in the output is a
        similarity score, indicating the strength of the match between the query
        data and the suggested reference tag.

        >>> reviews_sf = gl.SFrame(
                "https://static.turi.com/datasets/imdb_reviews/reviews.sframe")
        >>> m.tag(reviews_sf.head(10), query_name="review", verbose=False)
        +-----------+-------------------------------+------------------+-----------------+
        | review_id |             review            |      actor       |      score      |
        +-----------+-------------------------------+------------------+-----------------+
        |     0     | Story of a man who has unn... |   Cameron Diaz   | 0.0769230769231 |
        |     0     | Story of a man who has unn... |  Angelina Jolie  | 0.0666666666667 |
        |     0     | Story of a man who has unn... | Charlize Theron  |      0.0625     |
        |     0     | Story of a man who has unn... | Robert Pattinson | 0.0588235294118 |
        |     1     | Bromwell High is a cartoon... |   Jessica Alba   |      0.125      |
        |     1     | Bromwell High is a cartoon... | Jennifer Aniston |       0.1       |
        |     1     | Bromwell High is a cartoon... | Charlize Theron  |       0.05      |
        |     1     | Bromwell High is a cartoon... | Robert Pattinson |  0.047619047619 |
        |     1     | Bromwell High is a cartoon... | Marion Cotillard |  0.047619047619 |
        |     2     | Airport '77 starts as a br... |  Julia Roberts   | 0.0961538461538 |
        |    ...    |              ...              |       ...        |       ...       |
        +-----------+-------------------------------+------------------+-----------------+

        The initial results look a little noisy. To filter out obvious spurious
        matches, we can set the `tag` method's `similarity_threshold` parameter.

        >>> m.tag(reviews_sf.head(1000), query_name="review", verbose=False,
                  similarity_threshold=.8)
        +-----------+-------------------------------+------------------+----------------+
        | review_id |             review            |      actor       |     score      |
        +-----------+-------------------------------+------------------+----------------+
        |    341    | I caught this film at a te... |  Julia Roberts   | 0.857142857143 |
        |    657    | Fairly funny Jim Carrey ve... | Jennifer Aniston | 0.882352941176 |
        |    668    | A very funny movie. It was... | Jennifer Aniston | 0.833333333333 |
        |    673    | This film is the best film... | Jennifer Aniston |     0.9375     |
        +-----------+-------------------------------+------------------+----------------+

        """
        _mt._get_metric_tracker().track(self.__module__ + '.tag')

        # validate the 'dataset' input
        _tkutl._raise_error_if_not_sframe(dataset, "dataset")
        _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

        # ensure that either k or similarity_threshold is set
        if not (k or similarity_threshold):
            raise _ToolkitError("Either k or similarity_threshold parameters " \
                                "must be set")

        # ensure that query_name is provided if dataset has > 1 column
        if dataset.num_cols() > 1 and not query_name:
            raise _ToolkitError("No query_name parameter specified on " \
                                "dataset with %d columns" % dataset.num_cols())

        query_column = query_name or dataset.column_names()[0]

        # ensure that column with name tag_name exists
        if query_column not in dataset.column_names():
            raise _ToolkitError('No column named "%s" in dataset' \
                                % query_column)

        query_sa = dataset.select_column(query_column)
        query_sf = _gl.SFrame({
            "id": range(len(query_sa)),
            query_column: query_sa
        })

        features = _preprocess(query_sa)
        features = features.add_row_number()

        if similarity_threshold:
            if not isinstance(similarity_threshold, (float, int)):
                raise _ToolkitError("similarity_threshold parameter must be a" \
                                    "float or an int.")

            if similarity_threshold < 0 or similarity_threshold > 1:
                raise _ToolkitError("similarity_threshold parameter must be " \
                                    "between 0 and 1.")

        radius = (1 - similarity_threshold) if similarity_threshold else None

        results = self.__proxy__['nearest_neighbors_model'].query(
            features, label="id", k=k, radius=radius, verbose=verbose)

        # return empty SFrame immediately if no NN results
        if len(results) == 0:
            return _gl.SFrame({
                query_column + "_id": [],
                query_column: [],
                self.get("tag_name"): [],
                "score": []
            })

        results = results.join(query_sf, on={"query_label": "id"})
        results.rename({"query_label": query_column + "_id"})
        results.rename({query_column: "query_label"})

        # convert distances to similarity scores
        scores = _dists_to_sim_scores("weighted_jaccard", results)

        results.add_column(scores, "score")
        results.remove_column("distance")
        results.remove_column("rank")
        results.rename({
            "reference_label": self.get("tag_name"),
            "query_label": query_column
        })
        results.swap_columns(self.get("tag_name"), query_column)

        if exclude_zeros:
            try:
                results = results.filter_by(0.0, "score", exclude=True)
            except RuntimeError:  # nothing to join
                _logging.getLogger(__name__).warn(
                    "Empty results after filtering scores of 0.")
                results = results.head(0)

        return results
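
# A minimal sketch of the distance-to-similarity conversion assumed by `tag`
# above. The private `_dists_to_sim_scores` helper is not shown in this
# example, so this is an assumption: for a distance bounded in [0, 1], such as
# weighted Jaccard, similarity can be taken as 1 - distance. That is also why
# `tag` passes radius = 1 - similarity_threshold to the nearest-neighbors
# query: any result within that radius has a score of at least
# similarity_threshold.
def _dists_to_sim_scores_sketch(distance_sarray):
    """Hypothetical stand-in for _dists_to_sim_scores('weighted_jaccard', ...),
    applied to an SArray of distances."""
    return distance_sarray.apply(lambda d: 1.0 - d)

# Example: a query-reference pair at distance 0.125 gets a score of 0.875 and
# therefore survives similarity_threshold=0.8 (i.e. radius=0.2).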
Example #29
0
def create(dataset, window_size, feature=None, min_observations=None,
           verbose=True):
    """
    Create a :class:`MovingZScoreModel` model. This model fits a moving average
    to a univariate time series and identifies points that are far from the
    fitted curve. The MovingZScoreModel works with either TimeSeries or SFrame
    inputs. A uniform sampling rate is assumed and the data window must be
    defined in terms of number of observations.

    This model differs from other GraphLab Create models in that it can be
    created from an existing `MovingZScoreModel`. To create a new model in this
    fashion, use the existing model's `update` method.

    The model created by this function contains a table `scores` that contains
    the computed anomaly scores. The type of `scores` matches the type of the
    input `dataset`, and the table contains 5 columns:

        - *row id/time*: ID of the corresponding row in the input `dataset`. If
          `dataset` is an SFrame, this is the row numbers of the input data; if
          `dataset` is a TimeSeries, it is the index of the time series.

        - *anomaly score*: absolute value of the moving Z-score. A score of 0
          indicates the value is identical to the moving average. The higher
          the score, the more likely a point is to be an anomaly.

        - *value*: input data. The name of this column matches the input
          `feature`.

        - *moving average*: moving average of each point's preceding
          `window_size` values.

        - *model update time*: time the model was updated. This is particularly
          useful if the `window_size` is larger than the number of rows in the
          input datasets, because the `scores` table has results from several
          updates.

    Parameters
    ----------
    dataset : SFrame or TimeSeries
        Input data. The column named by the 'feature' parameter will be
        extracted for modeling.

    window_size : int
        Length of the time window to use for defining the moving z-score value,
        in terms of number of observations.

    feature : str, optional
        Name of the column to model. Any data provided to the model with either
        the `create` or `update` functions must have a column with this name.
        The feature name is not necessary if `dataset` is an SFrame with a
        single column or a TimeSeries with a single value column; it can be
        determined automatically in this case.

    min_observations : int, optional
        Minimum number of non-missing observations in the moving window
        required to compute the moving Z-score. If unspecified, the entire
        moving window preceding an observation must not contain any missing
        values in order for the observation to get an anomaly score.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : MovingZScoreModel
        A trained :class:`MovingZScoreModel`, which contains a table called
        `scores` that includes the anomaly score for each input data point. The
        type of the `scores` table matches the type of the input `dataset`.

    See Also
    --------
    MovingZScoreModel, MovingZScoreModel.update

    Notes
    -----
    - The moving Z-score for a data point :math:`x_t` is simply the value of
      :math:`x_t` standardized by subtracting the moving mean just prior to
      time :math:`t` and dividing by the moving standard deviation just prior
      to :math:`t`. Suppose :math:`w` stands for the `window_size` in terms of
      the number of observations. Then the moving Z-score is:

      .. math:: z(x_t) = \\frac{x_t - \\bar{x}_t}{s_t}

      where the moving average is:

      .. math:: \\bar{x}_t = (1/w) \sum_{i=t-w}^{t-1} x_i

      and the moving standard deviation is:

      .. math:: s_t = \sqrt{(1/w) \sum_{i=t-w}^{t-1} (x_i - \\bar{x}_t)^2}

    - The moving Z-score at points within `window_size` observations of the
      beginning of a series is not defined, because there are insufficient
      points to compute the moving average and moving standard deviation. This
      is represented by missing values.

    - Missing values in the input dataset are assigned missing values ('None')
      for their anomaly scores as well.

    - If there is no variation in the values preceding a given observation, the
      moving Z-score can be infinite or undefined. If the given observation is
      equal to the moving average, the anomaly score is coded as 'nan'; if the
      observation is *not* equal to the moving average, the anomaly score is
      'inf'.

    Examples
    --------
    >>> sf = graphlab.SFrame({'year': [2007, 2007, 2008, 2009, 2010, 2010],
    ...                       'value': [12.2, 11.7, 12.5, 21.4, 10.8, 11.2]})
    >>> model = graphlab.anomaly_detection.moving_zscore.create(sf,
    ...                                                         window_size=3,
    ...                                                         feature='value')
    >>> model['scores'].print_rows(max_column_width=20)
    +--------+----------------+-------+----------------+---------------------+
    | row_id | anomaly_score  | value | moving_average |  model_update_time  |
    +--------+----------------+-------+----------------+---------------------+
    |   0    |      None      |  12.2 |      None      | 2016-01-04 16:55... |
    |   1    |      None      |  11.7 |      None      | 2016-01-04 16:55... |
    |   2    |      None      |  12.5 |      None      | 2016-01-04 16:55... |
    |   3    | 28.0822407386  |  21.4 | 12.1333333333  | 2016-01-04 16:55... |
    |   4    | 1.00086199482  |  10.8 |      15.2      | 2016-01-04 16:55... |
    |   5    | 0.795990414837 |  11.2 |      14.9      | 2016-01-04 16:55... |
    +--------+----------------+-------+----------------+---------------------+
    [6 rows x 5 columns]
    """

    _mt._get_metric_tracker().track(
                              'toolkit.anomaly_detection.moving_zscore.create')

    start_time = _time.time()
    logger = _logging.getLogger(__name__)


    ## Validate required inputs by themselves.
    if not isinstance(dataset, (_gl.SFrame, _gl.TimeSeries)):
        raise TypeError("Input 'dataset' must be an SFrame or TimeSeries.")

    if len(dataset) < 1:
        raise _ToolkitError("Input 'dataset' is empty.")

    if not isinstance(window_size, int):
        raise TypeError("Input 'window_size' must be an integer.")

    if window_size < 1:
        raise ValueError("Input 'window_size' must be greater than or " +
                         "equal to 1.")

    if feature is not None and not isinstance(feature, str):
        raise TypeError("Input 'feature' must be a string if specified.")

    if min_observations is not None:
        if not isinstance(min_observations, int):
            raise TypeError("If specified, input 'min_observations' must " +
                            "be a positive integer.")

        if min_observations < 1:
            raise ValueError("If specified, input 'min_observations' must " +
                             "be a positive integer.")

    ## Determine the feature name if left unspecified.
    column_names = dataset.column_names() if isinstance(dataset, _gl.SFrame) \
        else dataset.value_col_names

    if feature is None:
        if len(column_names) == 1:
            feature = column_names[0]
        else:
            raise _ToolkitError("If the input 'dataset' has multiple " +
                                "columns, a 'feature' column name must be " +
                                "specified.")


    ## Extract the specified feature as an SArray.
    try:
        series = dataset[feature]
    except:
        raise _ToolkitError("The specified feature could not be found " +
                            "in the input 'dataset'.")


    ## Validate the type of the feature.
    if series.dtype() not in [int, float]:
        raise ValueError("The values in the specified feature must be " +
                         "integers or floats.")


    ## Compute the moving average, Z-score, and a final anomaly score. For all
    #  anomaly detection models, the final score should be in the range [0,
    #  \infty], with higher values indicating more outlier-ness.
    moving_average, moving_zscore, sufficient_data = \
        _moving_z_score(series, window_size, min_observations)

    anomaly_score = abs(moving_zscore)

    if not sufficient_data:
        logger.warning("The number of observations is smaller than " +
                       "the minimum number needed to compute a " +
                       "moving Z-score, so all anomaly scores are 'None'. " +
                       "Consider adding more data with the model's `update` " +
                       "method, or reducing the `window_size` or " +
                       "`min_observations` parameters.")

    ## Format the results.
    scores = _gl.SFrame({feature: series,
                         'moving_average': moving_average,
                         'anomaly_score': anomaly_score})
    scores['model_update_time'] = _dt.datetime.now()

    scores = scores[['anomaly_score', # reorder the columns
                     feature,
                     'moving_average',
                     'model_update_time']]

    if isinstance(dataset, _gl.SFrame):
        if feature != 'row_id':
            scores = scores.add_row_number('row_id')
        else:
            logger.warning("Feature name is 'row_id', so the " +
                           "index in the model's 'scores' SFrame " +
                           "is called '_row_id'.")
            scores = scores.add_row_number('_row_id')

    if isinstance(dataset, _gl.TimeSeries):
        scores[dataset.index_col_name] = dataset[dataset.index_col_name]
        scores = _gl.TimeSeries(scores, index=dataset.index_col_name)

    dataset_type = 'TimeSeries' if isinstance(dataset, _gl.TimeSeries) else 'SFrame'

    ## Set up the model.
    state = {
        'dataset_type': dataset_type,
        'verbose': verbose,
        'window_size': window_size,
        'min_observations': min_observations,
        'num_examples': len(dataset),
        'feature': feature,
        'training_time': _time.time() - start_time,
        'scores': scores}

    model = MovingZScoreModel(state)
    return model
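
# A minimal, hedged sketch (not the toolkit's private `_moving_z_score`
# helper) of the anomaly score defined in the Notes above: each point is
# standardized by the mean and population standard deviation of the
# `window_size` observations immediately preceding it, and the absolute value
# of that Z-score is the anomaly score. Missing values and the
# `min_observations` option are ignored here for brevity.
import math

def _moving_zscore_sketch(values, window_size):
    scores = []
    for t, x in enumerate(values):
        if t < window_size:
            # Not enough preceding points to define the moving statistics.
            scores.append(None)
            continue
        window = values[t - window_size:t]
        mean = sum(window) / float(window_size)
        std = math.sqrt(sum((v - mean) ** 2 for v in window) / float(window_size))
        if std > 0:
            scores.append(abs(x - mean) / std)
        else:
            # No variation in the window: nan if the point equals the moving
            # average, inf otherwise (see the Notes above).
            scores.append(float('nan') if x == mean else float('inf'))
    return scores

# _moving_zscore_sketch([12.2, 11.7, 12.5, 21.4, 10.8, 11.2], 3)
# -> [None, None, None, ~28.08, ~1.00, ~0.80], matching the docstring example.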
Example #30
0
    def evaluate(self,
                 dataset,
                 query_name=None,
                 k=5,
                 similarity_threshold=None,
                 exclude_zeros=True,
                 verbose=True):
        """
        Match the reference tags to a set of queries labeled with their true
        tags, and then evaluate the model's performance on those queries.

        The true tags should be provided as an additional column in ``dataset``,
        and that column's name should be the same as the ``tag_name`` parameter
        specified when the model was created. The type of the tags column should
        be either string or list (of strings).

        Parameters
        ----------
        dataset : SFrame
            Query data to be tagged.

        query_name : string, optional
            Name of the column in ``dataset`` to be auto-tagged. If ``dataset``
            has more than one column, ``query_name`` must be specified.

        k : int, optional
            Number of results to return from the reference set for each query
            observation. The default is 5, but setting it to ``None`` will
            return all results whose score is greater than or equal to
            ``similarity_threshold``.

        similarity_threshold : float, optional
            Only results whose score is greater than or equal to the specified
            ``similarity_threshold`` are returned. The default is ``None``, in
            which case the ``k`` best results are returned for each query point
            regardless of score.

        exclude_zeros : boolean, optional
            If True, only entries for which there is a tag with a nonzero score
            are preserved in the output. This is the default behavior.

        verbose: bool, optional
            If True, print progress updates and model details.

        Returns
        -------
        out : dict
            A dictionary containing the entire confusion matrix, as well as the
            following evaluation metrics:

            - Precision
            - Recall
            - F1 score

        See Also
        --------
        tag, graphlab.evaluation.confusion_matrix

        Notes
        -----
        - Autotagging is a variation on multiclass classification, where in
          contrast to a multiclass classifier, an autotagger model can output
          zero tags for a particular query (either because there were no tags
          with non-zero scores, or as a result of specifying a value for the
          similarity_threshold parameter). As is standard practice in multiclass
          classification, we report Precision, Recall, and F1 score as our
          evaluation metrics. Specifically, we microaverage Precision and Recall
          by counting type I errors (false positives) and type II errors (false
          negatives) over the entire confusion matrix.

        References
        ----------
        - `Wikipedia - Precision and
          recall <http://en.wikipedia.org/wiki/Precision_and_recall>`_

        - Manning, C., Raghavan P., and Schutze H. (2008). Introduction to
          Information Retrieval.

        Examples
        --------
        Continuing with the actor autotagger model referenced in the previous
        example (for the ``tag`` method):

        >>> labeled_reviews_sf = gl.SFrame(
                "https://static.turi.com/datasets/imdb_reviews/reviews.10.tagged.sframe")
        >>> labeled_reviews_sf
        +-------------------------------+---------------------+
        |             review            |        actor        |
        +-------------------------------+---------------------+
        | When I saw this movie I wa... | [Leonardo DiCaprio] |
        | I rented this movie last w... |     [Matt Damon]    |
        | You've gotta hand it to St... |   [Angelina Jolie]  |
        | I caught this film at a te... |   [Julia Roberts]   |
        | I took a flyer in renting ... |  [Jennifer Aniston] |
        | Frankly I'm rather incense... |          []         |
        | This movie looked as if it... |      [Jude Law]     |
        | My wife and I watch a film... |          []         |
        | A story of amazing disinte... |          []         |
        | I don't remember a movie w... |          []         |
        +-------------------------------+---------------------+

        >>> m.evaluate(labeled_reviews_sf, query_name="review", verbose=False,
                k=1)

        .. sourcecode:: python

            {'confusion_matrix': Columns
                    count	int
                    target_label	str
                    predicted_label	str

             Rows: 10

             Data:
             +-------+-------------------+-------------------+
             | count |    target_label   |  predicted_label  |
             +-------+-------------------+-------------------+
             |   1   | Leonardo DiCaprio | Leonardo DiCaprio |
             |   1   |     Matt Damon    |     Matt Damon    |
             |   1   |   Angelina Jolie  |   Angelina Jolie  |
             |   1   |   Julia Roberts   |   Julia Roberts   |
             |   1   |  Jennifer Aniston |  Jennifer Aniston |
             |   1   |      Jude Law     |      Jude Law     |
             |   1   |        None       |     Will Smith    |
             |   1   |        None       |     Emma Stone    |
             |   1   |        None       |  Jennifer Aniston |
             |   1   |        None       |  Charlize Theron  |
             +-------+-------------------+-------------------+
             [10 rows x 3 columns],
             'f1_score': 0.7499999999999999,
             'precision': 0.6,
             'recall': 1.0}

        >>> m.evaluate(labeled_reviews_sf, query_name="review", verbose=False,
                       k=1, similarity_threshold=.6)

        .. sourcecode:: python

            {'confusion_matrix': Columns:
                    count	int
                    target_label	str
                    predicted_label	str

             Rows: 7

             Data:
             +-------+-------------------+-------------------+
             | count |    target_label   |  predicted_label  |
             +-------+-------------------+-------------------+
             |   1   | Leonardo DiCaprio | Leonardo DiCaprio |
             |   1   |   Angelina Jolie  |   Angelina Jolie  |
             |   1   |   Julia Roberts   |   Julia Roberts   |
             |   4   |        None       |        None       |
             |   1   |      Jude Law     |      Jude Law     |
             |   1   |     Matt Damon    |        None       |
             |   1   |  Jennifer Aniston |        None       |
             +-------+-------------------+-------------------+
             [7 rows x 3 columns],
             'f1_score': 0.8,
             'precision': 1.0,
             'recall': 0.6666666666666666}

        """
        _mt._get_metric_tracker().track(self.__module__ + '.evaluate')

        tag_name = self.get("tag_name")
        true_tags = dataset.select_column(tag_name)

        if true_tags.dtype() not in (list, str):
            raise TypeError(
                "The %s column must either be of type str or list" % tag_name)

        if true_tags.dtype() == str:
            true_tags = true_tags.apply(lambda x: [x] if x else [])

        true_tags = true_tags.fillna([])

        dataset = dataset.select_columns(
            [x for x in dataset.column_names() if x != tag_name])

        if similarity_threshold:
            if not isinstance(similarity_threshold, (float, int)):
                raise _ToolkitError("similarity_threshold parameter must be a" \
                                    "float or an int.")

            if similarity_threshold < 0 or similarity_threshold > 1:
                raise _ToolkitError("similarity_threshold parameter must be " \
                                    "between 0 and 1.")

        results = self.tag(dataset,
                           query_name=query_name,
                           k=k,
                           similarity_threshold=similarity_threshold,
                           exclude_zeros=exclude_zeros,
                           verbose=verbose)

        if len(results) == 0:
            raise ValueError("There is no data to evaluate. Try reducing the " \
                             "similarity_threshold or increasing k.")

        group_column = (query_name or dataset.column_names()[0]) + "_id"
        dataset = dataset.add_row_number(group_column)
        results = results.groupby(group_column,
                                  {"labels": _gl.aggregate.CONCAT(tag_name)})
        results = dataset.join(results,
                               on={group_column: group_column},
                               how="left")
        results = results.fillna("labels", [])
        results = results.sort(group_column)

        def precision(tps, fps):
            return tps / float(tps + fps)

        def recall(tps, fns):
            return tps / float(tps + fns)

        def f1_score(p, r):
            return 2 * p * r / (p + r)

        confusion_matrix = _gl.evaluation.confusion_matrix(
            true_tags, results["labels"])
        confusion_matrix = confusion_matrix.stack("target_label",
                                                  "target_label")

        # TO DO: this next line will be removed once .stack type-inference is fixed
        # or type_hint parameter is exposed
        confusion_matrix = _gl.SFrame({"predicted_label": [["stub"]],
                                       "count": [1], "target_label": ["stub"]})\
                              .append(confusion_matrix)

        confusion_matrix = confusion_matrix.stack("predicted_label",
                                                  "predicted_label")

        # TO DO: remove this next line, per note above
        confusion_matrix = confusion_matrix[1:]

        tps = confusion_matrix[confusion_matrix.apply(
            lambda row: row["predicted_label"] is not None and \
            row["target_label"] == row["predicted_label"])]["count"].sum() or 0

        fps = confusion_matrix[confusion_matrix.apply(
            lambda row: row["predicted_label"] is not None and \
            row["target_label"] != row["predicted_label"])]["count"].sum() or 0

        fns = confusion_matrix[confusion_matrix.apply(
            lambda row: row["predicted_label"] is None and \
            row["target_label"] is not None)]["count"].sum() or 0

        p = precision(tps, fps)
        r = recall(tps, fns)
        f1 = f1_score(p, r)

        return {
            "precision": p,
            "recall": r,
            "f1_score": f1,
            'confusion_matrix': confusion_matrix
        }
Example #31
0
    def evaluate(self,
                 data,
                 methods=['average_similarity', 'average_quality', 'log_det']):
        """
        Objectively evaluate the quality and diversity of a data subset.

        There are several quantitative measures of the quality and diversity of
        a set of items. This method provides three:
            - Average quality: The average over the quality features of each of
              the items in data.
            - Average similarity: The average of the pairwise similarities
              between every item in data.
            - Log-determinant: This simultaneously measures both the quality and
              diversity of a set. To measure the log-determinant of a given set,
              we first form the similarity matrix L, where a diagonal entry L_ii
              corresponds to the quality of item i, and an off diagonal entry
              L_ij corresponds to the similarity between items i and j. We then
              take the log of the determinant of this matrix. This type of
              matrix is also referred to as a Gramian matrix.

              The determinant of a Gramian matrix corresponds to the volume
              spanned by the vectors used to construct the matrix. If an item
              has a large quality, it corresponds to a longer vector, which will
              increase the volume (and determinant) of L. If two feature vectors
              are similar, then the volume decreases (because the vectors point
              in a similar direction), which correspondingly decreases the
              determinant. Thus, both quality and similarity are encapsulated by
              the log-determinant.

        Parameters
        ----------
        data : SFrame or SGraph
            The subset of data to evaluate.

        methods : list[string], {'average_similarity', 'average_quality', 'log_det'}
            The set of methods to measure. If methods is None, then all
            possible evaluation methods will be used.

        Returns
        -------
        out : dict
            Dictionary of values with keys corresponding to measurement types and values
            corresponding to the actual evaluation scores.

        Examples
        --------
        >>> cars = graphlab.SFrame.read_csv('https://static.turi.com/datasets/auto-mpg/auto-mpg.csv')
        >>> sampler = graphlab.diverse_sampler.create(data=cars, 
                                                      item_id='name', 
                                                      quality_feature='accel', 
                                                      similarity_features=['mpg', 
                                                      'displ', 
                                                      'hp', 
                                                      'weight',
                                                      'origin'])

        >>> sf_simple_dd = gl.SFrame({'id': [0, 1, 2],
                                      'q':  [10, 10, 10],
                                      's1': [[1, 1, 1], [1, 1, 1], [1, 1, 1]]})
        >>> sampler = gl.diverse_sampler.create(data=sf_simple_dd,
                                                item_id='id',
                                                quality_feature='q',
                                                similarity_features=['s1'])
        >>> sf = sampler.sample(5, greedy=True, diversity=0.2)
        >>> sampler.evaluate(sf)
        {'log_det': 15.819720050211457, 'average_quality': 23.76, 
            'average_similarity': 0.999730969627407}
        """
        eval_frame = False
        if isinstance(data, _gl.SFrame):
            eval_frame = True
        elif not isinstance(data, _gl.SGraph):
            raise ValueError("Unknown data type " + str(type(data)) + ".")

        div_eval = _gl.extensions.diversity_eval()

        options = dict()
        options["eval_methods"] = methods

        if self._quality_feature is not None:
            options["quality_feature"] = self._quality_feature
        if self._similarity_features is not None:
            options["similarity_features"] = self._similarity_features

        if eval_frame:
            if not self._init_with_frame:
                raise _ToolkitError("Sampler initialized with SGraph, but eval "+ \
                                                        "was called with an SFrame.")
            return div_eval.evaluate_frame(data, options)
        else:
            if self._init_with_frame:
                raise _ToolkitError("Sampler initialized with SFrame, but eval "+ \
                                                        "was called with an SGraph.")
            return div_eval.evaluate_graph(data, options)
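
# A minimal numpy sketch (an assumption; the `diversity_eval` extension used
# above is native code and not shown here) of the three measures described in
# the docstring, given a quality value per sampled item and a pairwise
# similarity matrix for those items.
import numpy as np

def _evaluate_subset_sketch(quality, similarity):
    """quality: length-n sequence; similarity: n x n symmetric matrix with
    entries in [0, 1]; its diagonal is ignored."""
    q = np.asarray(quality, dtype=float)
    S = np.asarray(similarity, dtype=float)
    n = len(q)
    # Gramian-style matrix L: qualities on the diagonal, similarities elsewhere.
    L = S.copy()
    np.fill_diagonal(L, q)
    off_diag = ~np.eye(n, dtype=bool)
    sign, log_det = np.linalg.slogdet(L)
    return {"average_quality": float(q.mean()),
            "average_similarity": float(S[off_diag].mean()),
            "log_det": float(log_det)}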
def create(data, row_label=None, features=None, feature_model='auto',
           method='lsh', verbose=True):
    """
    Create a similarity search model, which can be used to quickly retrieve
    items similar to a query observation. In the case of images, this model
    automatically performs the appropriate feature engineering steps. NOTE:
    If you are using a CPU for the creation step with feature_model='auto',
    creation time may take a while. This is because extracting features for
    images on a CPU is expensive. With a GPU, one can expect large speedups.

    Parameters
    ----------
    data : SFrame
        The SFrame that represents the training data for the model, including at
        least one column of images.

    row_label : str, optional
        Name of the SFrame column with row id's. If 'row_label' is not
        specified, row numbers are used to identify reference dataset rows when
        the model is queried.

    features : str, optional
        The name of an image column in the input 'data' SFrame.

    feature_model : 'auto' | A model of type NeuralNetClassifier, optional
        A trained model for extracting features from raw data objects. By
        default ('auto'), we choose an appropriate model from our set of
        pre-trained models. See
        :class:`~graphlab.toolkits.feature_engineering.DeepFeatureExtractor` for
        more information.

    method : {'lsh', 'brute_force'}, optional
        The method used for nearest neighbor search. The 'lsh' option uses
        locality-sensitive hashing to find approximate results more quickly.

    verbose : bool, optional
        If True, print verbose output during model creation.

    Returns
    -------
    out : SimilaritySearchModel

    See Also
    --------
    SimilaritySearchModel
    graphlab.toolkits.nearest_neighbors
    graphlab.toolkits.feature_engineering

    Notes
    -----
    The similarity search toolkit currently uses cosine distance to evaluate the
    similarity between each query and candidate results.

    Examples
    --------
    First, split data into reference and query.

    >>> import graphlab as gl

    >>> data = gl.SFrame('https://static.turi.com/datasets/mnist/sframe/train6k')
    >>> reference, query = data.random_split(0.8)

    Build neuralnet feature extractor for images:

    >>> nn_model = gl.neuralnet_classifier.create(reference, target='label')

    Construct SimilaritySearchModel:

    >>> model = gl.similarity_search.create(reference, features= 'image',
    ...                                     feature_model=nn_model)

    Find the most similar items in the reference set for each item in the query
    set:

    >>> model.search(query)
    """

    _mt._get_metric_tracker().track(__name__ + '.create')

    _raise_error_if_not_of_type(data, [_SFrame])
    _raise_error_if_not_of_type(features, [str])
    _raise_error_if_column_exists(data, features)

    if data[features].dtype() != _Image:
        raise _ToolkitError("Feature `%s` must be of type Image" \
                % features)

    return SimilaritySearchModel(data, row_label=row_label, feature=features,
            feature_model=feature_model, method=method, verbose=verbose)
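
# A minimal sketch (an assumption about the exact arithmetic) of the cosine
# distance the similarity search model uses to rank candidates once deep
# features have been extracted: distance = 1 - cosine similarity, so identical
# feature vectors are at distance 0 and orthogonal vectors at distance 1.
import numpy as np

def _cosine_distance_sketch(a, b):
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    return 1.0 - np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))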
    def evaluate(self, dataset, metric='auto', max_neighbors=10, radius=None):
        """
        Evaluate the model's predictive accuracy. This is done by predicting the
        target class for instances in a new dataset and comparing to known
        target values.

        Parameters
        ----------
        dataset : SFrame
            Dataset of new observations. Must include columns with the same
            names as the target and features used for model training. Additional
            columns are ignored.

        metric : str, optional
            Name of the evaluation metric.  Possible values are:

            - 'auto': Returns all available metrics.

            - 'accuracy': Classification accuracy.

            - 'confusion_matrix': An SFrame with counts of possible
              prediction/true label combinations.

            - 'roc_curve': An SFrame containing information needed for an roc
              curve (binary classification only).

        max_neighbors : int, optional
            Maximum number of neighbors to consider for each point.

        radius : float, optional
            Maximum distance from each point to a neighbor in the reference
            dataset.

        Returns
        -------
        out : dict
            Evaluation results. The dictionary keys are *accuracy* and
            *confusion_matrix* and *roc_curve* (if applicable).

        See also
        --------
        create, predict, predict_topk, classify

        Notes
        -----
        - Because the model randomly breaks ties between predicted classes, the
          results of repeated calls to the `evaluate` method may differ.

        Examples
        --------
        >>> sf_train = graphlab.SFrame({'species': ['cat', 'dog', 'fossa', 'dog'],
        ...                             'height': [9, 25, 20, 23],
        ...                             'weight': [13, 28, 33, 22]})
        >>> m = graphlab.nearest_neighbor_classifier.create(sf_train, target='species')
        >>> ans = m.evaluate(sf_train, max_neighbors=2,
        ...                  metric='confusion_matrix')
        >>> print(ans['confusion_matrix'])
        +--------------+-----------------+-------+
        | target_label | predicted_label | count |
        +--------------+-----------------+-------+
        |     cat      |       dog       |   1   |
        |     dog      |       dog       |   2   |
        |    fossa     |       dog       |   1   |
        +--------------+-----------------+-------+
        """
        _mt._get_metric_tracker().track(
            'toolkit.classifier.nearest_neighbor_classifier.evaluate')

        ## Validate the metric name
        _raise_error_evaluation_metric_is_valid(metric,
                    ['auto', 'accuracy', 'confusion_matrix', 'roc_curve'])

        ## Make sure the input dataset has a target column with an appropriate
        #  type.
        target = self.get('target')
        _raise_error_if_column_exists(dataset, target, 'dataset', target)

        if not dataset[target].dtype() == str and not dataset[target].dtype() == int:
            raise TypeError("The target column of the evaluation dataset must "
                            "contain integers or strings.")

        if self._state["num_classes"] != 2:
            if (metric == 'roc_curve') or (metric == ['roc_curve']):
                err_msg  = "Currently, ROC curve is not supported for "
                err_msg += "multi-class classification in this model."
                raise _ToolkitError(err_msg)
            elif metric == 'auto':
                warn_msg  = "WARNING: Ignoring `roc_curve`. "
                warn_msg += "Not supported for multi-class classification."
                print(warn_msg)

        ## Compute predictions with the input dataset.
        ystar = self.predict(dataset, output_type='class',
                             max_neighbors=max_neighbors, radius=radius)
        ystar_prob = self.predict(dataset, output_type='probability',
                             max_neighbors=max_neighbors, radius=radius)


        ## Compile accuracy metrics
        results = {}

        if metric in ['accuracy', 'auto']:
            results['accuracy'] = _gl.evaluation.accuracy(targets=dataset[target],
                                                          predictions=ystar)

        if metric in ['confusion_matrix', 'auto']:
            results['confusion_matrix'] = \
                _gl.evaluation.confusion_matrix(targets=dataset[target],
                                                predictions=ystar)

        if self._state["num_classes"] == 2:
            if metric in ['roc_curve', 'auto']:
                results['roc_curve'] = \
                      _gl.evaluation.roc_curve(targets=dataset[target],
                                               predictions=ystar_prob)
        return results
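# Illustrative sketch (not part of the original snippet): requesting every
# available metric at once with metric='auto', using the same toy data as the
# docstring example above.
import graphlab

sf_train = graphlab.SFrame({'species': ['cat', 'dog', 'fossa', 'dog'],
                            'height': [9, 25, 20, 23],
                            'weight': [13, 28, 33, 22]})
m = graphlab.nearest_neighbor_classifier.create(sf_train, target='species')
results = m.evaluate(sf_train, max_neighbors=2, metric='auto')
print(results['accuracy'])                 # overall classification accuracy
results['confusion_matrix'].print_rows()   # prediction/true-label counts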
Example #34
0
def distances_to_similarity_scores(distance_fn, distances):
    """
    Convert distances to similarity scores.

    Parameters
    ----------
    distance_fn : str
        The name of the distance function.

    distances : SArray or SFrame
        An `SArray` or `SFrame` of distances to convert to similarity scores. If
        distances is an SFrame, it is expected to contain the following columns:
        "distance", "query_label", and "reference_label", of types float, str,
        and str respectively. If an SFrame is provided that does not contain
        these fields, a ToolkitError is raised.

    Returns
    -------
    out : SArray
        The converted similarity scores.

    Notes
    -----
    - To convert Levenshtein distances to similarities, the `distances`
      parameter must be an `SFrame`, since we require both of the strings being
      compared in order to normalize.
    """
    if not (isinstance(distances, _gl.SFrame) or \
            isinstance(distances, _gl.SArray)):
        raise TypeError("distances parameter is of type %s must be an SFrame " \
                        "or an SArray" % type(distances))

    if isinstance(distances, _gl.SFrame):
        column_names = distances.column_names()
        required_names = ["distance", "query_label", "reference_label"]
        if not all([name in column_names for name in required_names]):
            raise _ToolkitError("distances SFrame is missing required " \
                                "columns; at a minimum, it should have the " \
                                "following columns: \"distance\", " \
                                "\"query_label\", and \"reference_label\"")

    if isinstance(distances, _gl.SArray):
        if distance_fn == "levenshtein":
            raise TypeError("Expected an SFrame but got a an SArray")

        distances = _gl.SFrame({"distance": distances})

    def levenshtein_sim(dist, s1, s2):
        return 1 - dist / max(len(s1), len(s2))

    scores = None

    if distance_fn == "levenshtein" and isinstance(distances, _gl.SFrame):
        scores = distances.apply(
            lambda x: levenshtein_sim(
                x["distance"], x["query_label"], x["reference_label"]))
    elif distance_fn in ("jaccard", "weighted_jaccard", "cosine"):
        scores = distances["distance"].apply(lambda dist: 1 - dist)
    elif distance_fn in ("manhattan", "euclidean", "squared_euclidean"):
        scores = distances["distance"].apply(
            lambda dist: 1 - dist / _MAX_SIMILARITY_RADIUS)
    else:
        raise _ToolkitError("Unsupported distance function: %s" % distance_fn)

    return scores
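# Illustrative usage sketch (not part of the original snippet): converting an
# SArray of cosine distances, and a Levenshtein distance SFrame, into
# similarity scores with the helper above. The values are made up.
import graphlab as gl

cosine_scores = distances_to_similarity_scores(
    "cosine", gl.SArray([0.0, 0.25, 0.5]))            # -> [1.0, 0.75, 0.5]

lev = gl.SFrame({"distance": [3.0],
                 "query_label": ["kitten"],
                 "reference_label": ["sitting"]})
lev_scores = distances_to_similarity_scores("levenshtein", lev)
# 1 - 3 / max(len("kitten"), len("sitting")) -> ~0.571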
def _validate_features(features, column_type_map, valid_types, label):
    """
    Identify the subset of desired `features` that are valid for the Kmeans
    model. A warning is emitted for each feature that is excluded.

    Parameters
    ----------
    features : list[str]
        Desired feature names.

    column_type_map : dict[str, type]
        Dictionary mapping each column name to the type of values in the
        column.

    valid_types : list[type]
        Exclude features whose type is not in this list.

    label : str
        Name of the row label column.

    Returns
    -------
    valid_features : list[str]
        Names of features to include in the model.
    """
    # logger = _logging.getLogger(__name__)

    if not isinstance(features, list):
        raise TypeError("Input 'features' must be a list, if specified.")

    if len(features) == 0:
        raise ValueError("If specified, input 'features' must contain " +
                         "at least one column name.")

    ## Remove duplicates
    num_original_features = len(features)
    features = set(features)

    if len(features) < num_original_features:
        _logging.warning("Duplicates have been removed from the list of features")

    ## Remove the row label
    if label in features:
        features.remove(label)
        _logging.warning("The row label has been removed from the list of features.")

    ## Check the type of each feature against the list of valid types
    valid_features = []

    for ftr in features:
        if not isinstance(ftr, str):
            _logging.warning("Feature '{}' excluded. ".format(ftr) +
                           "Features must be specified as strings " +
                           "corresponding to column names in the input dataset.")

        elif ftr not in column_type_map.keys():
            _logging.warning("Feature '{}' excluded because ".format(ftr) +
                           "it is not in the input dataset.")

        elif column_type_map[ftr] not in valid_types:
            _logging.warning("Feature '{}' excluded because of its type. ".format(ftr) +
                           "Kmeans features must be int, float, dict, or array.array type.")

        else:
            valid_features.append(ftr)

    if len(valid_features) == 0:
        raise _ToolkitError("All specified features have been excluded. " +
                            "Please specify valid features.")

    return valid_features
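# Illustrative sketch (not part of the original snippet): how the helper above
# filters a Kmeans feature list. 'zip' is dropped because str is not among the
# valid types, and 'id' is dropped because it is the row label; a warning is
# logged for each exclusion.
import array

column_types = {'age': int, 'income': float, 'zip': str, 'id': str}
valid = _validate_features(['age', 'income', 'zip', 'id'],
                           column_types,
                           valid_types=[int, float, dict, array.array],
                           label='id')
# sorted(valid) -> ['age', 'income']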
    def update(self, dataset):
        """
        Create a new BayesianChangepointsModel using the same parameters, but
        an updated dataset. Knowledge about the data is retained from the
        previous model, and the new data is assumed to be a continuation of the
        previous model's data.

        Parameters
        ----------
        dataset : SFrame, SArray, or TimeSeries
            New data to use for an updated changepoint detection model. The
            type of the input 'dataset' must match the type of the data already
            in the model (if the model has data already).

        Returns
        -------
        out : BayesianChangepointsModel
            A *new* BayesianChangepointsModel, with an updated dataset and
            changepoint scores for the updated dataset. The `scores` field of
            the new model has the same schema as the `scores` field of the
            existing model. The last `lag` rows of the previous data are
            prepended, though, because there is now enough data to evaluate
            their changepoint probabilities.

        See Also
        --------
        create

        Examples
        --------
        >>> sf = graphlab.SFrame({'series': [100]*25})
        >>> model = graphlab.anomaly_detection.bayesian_changepoints.create(sf,
        ...                                                         lag=5,
        ...                                                         feature='series')
        >>> sf2 = graphlab.SFrame({'series': [200]*25})
        >>> model2 = model.update(sf2)
        >>> model2['scores'].print_rows(max_column_width=20)
        +-------------------+--------+---------------------+
        | changepoint_score | series |  model_update_time  |
        +-------------------+--------+---------------------+
        |   0.831430606595  |  200   | 2016-01-27 14:06... |
        | 0.000347138442071 |  200   | 2016-01-27 14:06... |
        | 3.40869782692e-05 |  200   | 2016-01-27 14:06... |
        | 1.40792637711e-05 |  200   | 2016-01-27 14:06... |
        | 7.50780005726e-06 |  200   | 2016-01-27 14:06... |
        | 4.49582032092e-06 |  200   | 2016-01-27 14:06... |
        | 2.90328065455e-06 |  200   | 2016-01-27 14:06... |
        | 1.98060675567e-06 |  200   | 2016-01-27 14:06... |
        | 1.40930691121e-06 |  200   | 2016-01-27 14:06... |
        | 1.03700199168e-06 |  200   | 2016-01-27 14:06... |
        +-------------------+--------+---------------------+
        [25 rows x 3 columns]
        """
        start_time = _time.time()
        _mt._get_metric_tracker().track(
            'toolkit.anomaly_detection.bayesian_changepoints.update')
        logger = _logging.getLogger(__name__)

        ## Validate the new dataset
        if not isinstance(dataset, (_gl.SFrame, _gl.TimeSeries)):
            raise TypeError("Input 'dataset' must be an SFrame or TimeSeries.")

        if len(dataset) < 1:
            raise TypeError("Input 'dataset' is empty.")

        if ((self.get('dataset_type') == 'TimeSeries'
             and not isinstance(dataset, _gl.TimeSeries))
                or (self.get('dataset_type') == 'SFrame'
                    and not isinstance(dataset, _gl.SFrame))):

            raise TypeError("New input 'dataset' must have the same type " +
                            "as the data already in the model.")

        ## TimeSeries-specific dataset validation
        ## Make the sure new data occurs *after* the existing data.
        scores = self.get('scores')

        if isinstance(dataset, _gl.TimeSeries):
            first_new_timestamp = dataset[0][dataset.index_col_name]
            last_old_timestamp = scores[-1][scores.index_col_name]

            if first_new_timestamp < last_old_timestamp:
                raise _ToolkitError("The new dataset has data with " +
                                    "earlier timestamps than the existing " +
                                    "dataset. Please ensure that new data " +
                                    "occurs after existing data.")

        ## Extract the feature from the new dataset and validate it.
        feature = self.get('feature')

        try:
            series = dataset[feature]
        except:
            raise _ToolkitError("The feature specified by the original " +
                                "model could not be found in the input " +
                                "'dataset'.")

        if not series.dtype() in [int, float]:
            raise ValueError("The values in the specified feature must be " +
                             "integers or floats.")

        ## Create a new model and initialize it.
        new_state = {k: self.get(k) for k in ['dataset_type']}

        opts = self.__proxy__.get_most_likely_hyperparams()

        proxy = _gl.extensions._BayesianOnlineChangepoint()

        ## Initialize the new model with state from the old model. This allows
        ## detecting changepoints using knowledge learned previously, and also
        ## computing changepoint probabilities for points which didn't yet
        ## have `lag` points following them.
        proxy.init_changepoint_detector(
            opts, True,
            self.get('scores')[feature].dropna()[0])

        new_model = BayesianChangepointsModel(proxy)

        ## Once again, calculate changepoints with information known from model
        ## creation. Prepend the last `lag` points from the previous dataset;
        ## we now have enough information to check whether they were
        ## changepoints.
        lag = self.get('lag')

        ## If `lag` is greater than 0, prepend the points for which a
        ## changepoint score could not be computed before, due to insufficient
        ## data. These are the last `lag` non-None points.
        if lag > 0:
            ## Grab previous scores
            if isinstance(dataset, _gl.SFrame):
                old_scores = self.get('scores')[[feature, 'model_update_time']]
            else:
                old_scores = self.get('scores').to_sframe()[[
                    feature, 'model_update_time'
                ]]
            # Copy the SFrame and select only the feature column.
            prepend_index_calc_temp_sf = old_scores[[feature]]
            # Rename, in case the feature column is named 'id'.
            prepend_index_calc_temp_sf.rename({feature: 'series'})
            # Identify the last `lag` points that are non-None.
            prepend_index_calc_temp_sf = \
                prepend_index_calc_temp_sf.add_row_number()
            prepend_index_calc_temp_sf = prepend_index_calc_temp_sf.dropna()
            # If `lag` is longer than scores, just take all previous points
            if lag >= len(prepend_index_calc_temp_sf):
                prepend_index = 0
            else:
                prepend_index = \
                    prepend_index_calc_temp_sf['id'][-(lag + 1)] + 1
            old_scores = old_scores[prepend_index:]
        ## Otherwise, we don't prepend anything, so the index can be the input
        ## data length
        else:
            prepend_index = len(series)

        ## Calculate changepoints

        scores = _gl.SFrame()
        scores[feature] = series
        changepoints = new_model.__proxy__.calculate_changepoints(series)
        scores['model_update_time'] = _dt.datetime.now()
        if lag > 0:
            scores = old_scores.append(scores)
        changepoints = changepoints.append(
            _gl.SArray([None] * (len(scores) - len(changepoints))))
        scores['changepoint_score'] = changepoints
        scores = scores[[
            'changepoint_score',  # reorder the columns
            feature,
            'model_update_time'
        ]]

        ## Add row_id to SFrame
        if isinstance(dataset, _gl.SFrame):
            if feature != 'row_id':
                scores = scores.add_row_number('row_id')
            else:
                logger.warning("Feature name is 'row_id', so the " +
                               "index in the model's 'scores' SFrame " +
                               "is called '_row_id'.")
                scores = scores.add_row_number('_row_id')

        ## Finalize and return the model.
        new_state['num_examples'] = len(scores)
        new_state['training_time'] = _time.time() - start_time

        ## If time-series index name has changed, rename old_timeseries
        ## index name
        if isinstance(dataset, _gl.TimeSeries):
            old_index_col_name = self.__proxy__.get_index_col_name()
            old_timeseries = self.get('scores')
            if dataset.index_col_name != old_index_col_name:
                old_timeseries = old_timeseries.rename(
                    {old_index_col_name: dataset.index_col_name})

                logger.warning("The new dataset's index column name " +
                               "does not match the existing index " +
                               "column name. The new name is used in " +
                               "the new model.")

            ## In model creation, the last `lag` points cannot be
            ## evaluated for changepoint probability. Now, there's more data,
            ## so that data is prepended.
            new_index = old_timeseries[
                dataset.index_col_name][prepend_index:].append(
                    dataset[dataset.index_col_name])
            scores[dataset.index_col_name] = new_index
            new_model.__proxy__.set_index_col_name(dataset.index_col_name)
            new_model.__proxy__.set_state_sframe(scores, new_state)
        else:
            new_model.__proxy__.set_state_sframe(scores, new_state)

        return new_model
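# Illustrative sketch (not part of the original snippet): `update` also accepts
# a TimeSeries, provided the model was originally created from one. The
# TimeSeries constructor call and the variable `ts_model` are assumptions, not
# part of the original example.
import datetime
import graphlab as gl

new_sf = gl.SFrame({
    'time': [datetime.datetime(2016, 1, 28) + datetime.timedelta(hours=i)
             for i in range(25)],
    'series': [200.0] * 25})
new_ts = gl.TimeSeries(new_sf, index='time')

# `ts_model` is assumed to be a BayesianChangepointsModel trained on an
# earlier TimeSeries with the same index column name.
updated_model = ts_model.update(new_ts)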
Example #37
0
    def __init__(self,
                 data=None,
                 item_id=None,
                 quality_feature=None,
                 similarity_features=None,
                 model_proxy=None,
                 _class=None):
        """ 
        Create a DiverseSampler object. This should never be called directly,
        because it is necessary to set up an SDK proxy prior to calling
        __init__.
        """
        if _class:
            self.__class__ = _class

        self._init_with_frame = False

        self.__proxy__ = model_proxy
        self.__name__ = 'diverse_sampler'
        self._quality_feature = quality_feature
        self._similarity_features = similarity_features

        if data is None and model_proxy is None:
            raise ValueError(
                "The diverse sampler must be initialized with a " +
                "reference SFrame or SGraph.")
        elif data is not None:
            if not (isinstance(data, _gl.SFrame)
                    or isinstance(data, _gl.SGraph)):
                raise ValueError("Unknown data type " + str(type(data)) + ".")

        if item_id is None and model_proxy is None:
            # Note that for SGraphs, the __id vertex field is intrinsic to each
            # gl.Vertex, so we don't actually need to specify item_id
            if isinstance(data, _gl.SFrame):
                raise ValueError("An item_id must be specified.")

        if isinstance(data, _gl.SFrame):
            col_names = data.column_names()
        elif isinstance(data, _gl.SGraph):
            if similarity_features is not None and len(
                    similarity_features) > 1:
                raise _ToolkitError(
                    "Only 1 similarity feature is supported for SGraph.")
            col_names = data.get_fields()

        if isinstance(data, _gl.SFrame) and item_id not in col_names:
            raise ValueError("Item ID " + item_id + " does not name " +
                             "a column in the SFrame.")

        if quality_feature is not None and quality_feature not in col_names:
            raise ValueError("Quality feature " + quality_feature +
                             " does not name " + "a column in the SFrame.")

        if similarity_features is not None:
            for sname in similarity_features:
                if sname not in col_names:
                    raise ValueError("Similarity feature " + sname +
                                     " does not name " +
                                     "a column in the SFrame.")

        opts = dict()
        if item_id is None and isinstance(data, _gl.SGraph):
            item_id = "__id"
        opts["item_id"] = item_id

        if quality_feature is not None:
            opts["quality_feature"] = quality_feature
        if similarity_features is not None:
            opts["similarity_features"] = similarity_features

        if isinstance(data, _gl.SFrame):
            self._init_with_frame = True
            self.__proxy__.init_with_frame(data, opts)
        elif isinstance(data, _gl.SGraph):
            self._init_with_frame = False
            self.__proxy__.init_with_graph(data, opts)
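# Illustrative sketch (not part of the original snippet): the options
# dictionary the constructor above assembles before handing the data to the
# SDK proxy. The column names here are hypothetical.
item_id = 'item'
quality_feature = 'rating'
similarity_features = ['genre', 'year']

opts = {'item_id': item_id}
if quality_feature is not None:
    opts['quality_feature'] = quality_feature
if similarity_features is not None:
    opts['similarity_features'] = similarity_features
# opts -> {'item_id': 'item', 'quality_feature': 'rating',
#          'similarity_features': ['genre', 'year']}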
Example #38
0
def create(dataset, target, features=None, distance=None, verbose=True):
    """
    Create a
    :class:`~graphlab.nearest_neighbor_classifier.NearestNeighborClassifier`
    model. This model predicts the class of a query instance by finding the most
    common class among the query's nearest neighbors.

    .. warning::

        The 'dot_product' distance is deprecated and will be removed in future
        versions of GraphLab Create. Please use 'transformed_dot_product'
        distance instead, although note that this is more than a name change; it
        is a *different* transformation of the dot product of two vectors.
        Please see the distances module documentation for more details.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    target : str
        Name of the column containing the target variable. The values in this
        column must be of string or integer type.

    features : list[str], optional
        Names of the columns with features to use in comparing records. 'None'
        (the default) indicates that all columns except the target variable
        should be used. Please note: if `distance` is specified as a composite
        distance, then that parameter controls which features are used in the
        model. Each column can be one of the following types:

        - *Numeric*: values of numeric type integer or float.

        - *Array*: array of numeric (integer or float) values. Each array
          element is treated as a separate variable in the model.

        - *Dictionary*: key-value pairs with numeric (integer or float) values.
          Each key indicates a separate variable in the model.

        - *String*: string values.

        Please note: if `distance` is specified as a composite distance, then
        that parameter controls which features are used in the model.

    distance : str, function, or list[list], optional
        Function to measure the distance between any two input data rows. This
        may be one of three types:

        - *String*: the name of a standard distance function. One of
          'euclidean', 'squared_euclidean', 'manhattan', 'levenshtein',
          'jaccard', 'weighted_jaccard', 'cosine', 'dot_product' (deprecated),
          or 'transformed_dot_product'.

        - *Function*: a function handle from the
          :mod:`~graphlab.toolkits.distances` module.

        - *Composite distance*: the weighted sum of several standard distance
          functions applied to various features. This is specified as a list of
          distance components, each of which is itself a list containing three
          items:

          1. list or tuple of feature names (str)

          2. standard distance name (str)

          3. scaling factor (int or float)

        For more information about GraphLab Create distance functions, please
        see the :py:mod:`~graphlab.toolkits.distances` module.

        For sparse vectors, missing keys are assumed to have value 0.0.

        If 'distance' is left unspecified or set to 'auto', a composite distance
        is constructed automatically based on feature types.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : NearestNeighborClassifier
        A trained model of type
        :class:`~graphlab.nearest_neighbor_classifier.NearestNeighborClassifier`.

    See Also
    --------
    NearestNeighborClassifier
    graphlab.toolkits.nearest_neighbors
    graphlab.toolkits.distances

    References
    ----------
    - `Wikipedia - nearest neighbors classifier
      <http://en.wikipedia.org/wiki/Nearest_neighbour_classifiers>`_

    - Hastie, T., Tibshirani, R., Friedman, J. (2009). `The Elements of
      Statistical Learning <http://statweb.stanford.edu/~tibs/ElemStatLearn/>`_.
      Vol. 2. New York. Springer. pp. 463-481.

    Examples
    --------
    >>> sf = graphlab.SFrame({'species': ['cat', 'dog', 'fossa', 'dog'],
    ...                       'height': [9, 25, 20, 23],
    ...                       'weight': [13, 28, 33, 22]})
    ...
    >>> model = graphlab.nearest_neighbor_classifier.create(sf, target='species')

    As with the nearest neighbors toolkit, the nearest neighbor classifier
    accepts composite distance functions.

    >>> my_dist = [[('height', 'weight'), 'euclidean', 2.7],
    ...            [('height', 'weight'), 'manhattan', 1.6]]
    ...
    >>> model = graphlab.nearest_neighbor_classifier.create(sf, target='species',
    ...                                                     distance=my_dist)
    """

    ## Set up
    ## ------
    _mt._get_metric_tracker().track(
        'toolkit.classifier.nearest_neighbor_classifier.create')
    start_time = _time.time()

    ## Validation and preprocessing
    ## ----------------------------

    ## 'dataset' must be a non-empty SFrame
    _raise_error_if_not_sframe(dataset, "dataset")
    _raise_error_if_sframe_empty(dataset, "dataset")

    ## 'target' must be a string, in 'dataset', and the type of the target must
    #  be string or integer.
    if not isinstance(target, str) or target not in dataset.column_names():
        raise _ToolkitError("The 'target' parameter must be the name of a "
                            "column in the input dataset.")

    if dataset[target].dtype() not in (str, int):
        raise TypeError("The target column must contain integers or strings.")

    ## Warn that 'None' values in the target may lead to ambiguous predictions.
    if dataset[target].num_missing() > 0:
        _logging.warning(
            "Missing values detected in the target column. This " +
            "may lead to ambiguous 'None' predictions, if the " +
            "'radius' parameter is set too small in the prediction, " +
            "classification, or evaluation methods.")

    ## convert features and distance arguments into a composite distance
    ## NOTE: this is done here instead of in the nearest neighbors toolkit
    #  because the automatic distance construction may be different for the two
    #  toolkits.
    if features is None:
        _features = [x for x in dataset.column_names() if x != target]
    else:
        _features = [x for x in features if x != target]

    if isinstance(distance, list):
        distance = _copy.deepcopy(distance)

    elif (hasattr(distance, '__call__')
          or (isinstance(distance, str) and not distance == 'auto')):
        distance = [[_features, distance, 1]]

    elif distance is None or distance == 'auto':
        col_types = {
            k: v
            for k, v in zip(dataset.column_names(), dataset.column_types())
        }
        distance = _construct_auto_distance(_features, col_types)

    else:
        raise TypeError(
            "Input 'distance' not understood. The 'distance' parameter must "
            "be a string, a function handle, a composite distance, or left "
            "unspecified.")

    ## Construct and query the nearest neighbors model
    ## -----------------------------------------------
    knn_model = _gl.nearest_neighbors.create(dataset,
                                             label=target,
                                             distance=distance,
                                             verbose=verbose)

    ## Postprocessing and formatting
    ## -----------------------------
    model = NearestNeighborClassifier(knn_model)
    model._state['verbose'] = verbose
    model._state['distance'] = knn_model['distance']
    model._state['num_distance_components'] = knn_model[
        'num_distance_components']
    model._state['num_examples'] = dataset.num_rows()
    model._state['features'] = knn_model['features']
    model._state['target'] = target
    model._state['num_classes'] = len(dataset[target].unique())
    model._state['num_features'] = knn_model['num_features']
    model._state['num_unpacked_features'] = knn_model['num_unpacked_features']
    model._state['training_time'] = _time.time() - start_time
    model._target_type = dataset[target].dtype()

    return model
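# Illustrative sketch (not part of the original snippet): the normalization
# performed above when 'distance' is passed as a plain string. The name is
# wrapped into a one-component composite distance over the feature columns,
# the same format as `my_dist` in the docstring example.
_features = ['height', 'weight']
distance = 'manhattan'
composite = [[_features, distance, 1]]
# composite -> [[['height', 'weight'], 'manhattan', 1]]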
def create(observation_data,
           user_id='user_id', timestamp='timestamp',
           user_data=None,
           time_unit=0,
           features=[],
           time_aggregate=_datetime.timedelta(days=1),
           lookback_feature_periods=[7, 14, 21, 60, 90],
           time_boundaries=[],
           tree_depth=100,
           verbose=True):
    """
    Create a model of type
    :class:`~graphlab.churn_predictor.ChurnPredictor` that performs
    churn analysis on provided user activity logs.

    The Churn Prediction toolkit allows predicting which users
    will churn (stop using) a product or website given user activity logs.

    Training datasets should contain columns with user id, time stamp, and user
    events. Given the same or a different data set, the toolkit computes the
    probability that each user will churn.

    For instance, given a dataset of the form:

    .. sourcecode:: python

        +-------------------------------+---------+---------------+------------+
        |            user_id            | action  |   timestamp   | product_id |
        +-------------------------------+---------+---------------+------------+
        | ONE                           | open    | 04/15/1981    | 205075200  |
        | ONE                           | bought  | 04/15/1981    |  88441100  |
        | ONE                           | clicked | 04/17/1981    | 205075200  |
        | TWO                           | clicked | 09/01/2015    | 205075200  |
        | TWO                           | bought  | 09/21/2015    |  88414900  |
        +-------------------------------+---------+---------------+------------+

    In this data set, the last time stamp is September 21st, 2015. Given that
    last date, user TWO is unlikely to churn, whereas user ONE, who was a
    customer in 1981 and has not come back for 34 years, is likely to churn
    (not come back).

    The toolkit does not use the current wall time; it uses the last provided
    time stamp for predictions (unless another time stamp is specified).

    This toolkit will look at users, time and activity types, form an internal
    feature set based on user behavior, and train a model to predict user churn.

    The prediction is provided as a stay probability, where 0% means the user
    will definitely churn and 100% means the user will definitely stay.

    The prediction is set by default to execute on the latest time stamp
    provided in the prediction set, but can also be set manually to some other
    date.

    Since, internally, training requires generating training labels, the last
    10% of the data is not used for training. Therefore, it is safe to reuse
    the training data set as a prediction data set to find out who is likely to
    churn. A trained model can also safely be used to predict on a new data set.

    Parameters
    ----------
    observation_data : SFrame
        The dataset to use for training the model. It must contain a column of
        user ids, a column of timestamps, along with one or more activity
        columns. Each row represents a user action at a given time.

        A user activity column can contain numeric data (length of visit in
        seconds, number of items in cart) or categorical data (item purchased,
        page visited).

        The user id must be of type 'int' or 'str'. The time column must be of
        type int or datetime.

        User activity columns of type 'int' and 'str' will be considered
        categorical. Columns of type 'float' will be considered numerical.

    user_id : string, optional
        The name of the column in `observation_data` that corresponds to the
        user id.
        Default: user_id

    timestamp : string, optional
        The name of the column in `observation_data` that corresponds to the
        timestamp.

        The column can be of datetime.datetime type, or int type. If the column
        contains ints the time_unit parameter can be used to define the unit of
        time represented by the column.
        Default: timestamp

    user_data : SFrame, optional
        Side information for the users.  This SFrame must have a column with
        the same name as what is specified by the `user_id` input parameter.
        `user_data` can provide any amount of additional user-specific
        information. The join performed is an inner join.

    features : List of string, optional
        If specified, only the features in the list will be used. Columns of
        type Integer and String will be treated as categorical, columns of type
        Float will be treated as numeric.

    time_unit : int, optional
        If the timestamp column is of integral type, this sets the unit interval
        of the column. For instance, if the value of the timestamp column is in
        milliseconds, the value would be 1000. If the timestamp column is in
        seconds, the value would be 1.
        Default: 0 (auto-detect)

    time_aggregate : datetime.timedelta, optional
        Internal time frame to roll up user actions. In order to make the inner
        computation efficient, the user actions are rolled-up by time. The
        default is to aggregate by day. This can be shortened if hourly rates of
        actions are of importance, or stretched if only weekly rates matter.
        The larger the roll-up, the faster the model will run. The timedelta
        must be positive.
        Default: 1 day

    lookback_feature_periods : list of int, optional
        Interval of time to look back for feature generation. Each number is in
        the unit of time_aggregate (the default being days as defined above).
        For instance, if the list contains [7, 14], it will generate features
        for weekly and biweekly patterns. If time_aggregate were set to hours,
        [7, 14] would generate 7-hour and 14-hour patterns.
        Default: [7, 14, 21, 60, 90]

    time_boundaries : list of datetime.datetime, optional
        List of time boundaries used to compute the training set. At each time
        boundary, users that are present before the boundary will be used to
        compute features, and their presence after the boundary will make up the
        label. By having multiple time boundaries, more training data can be
        generated. If an empty list is specified (the default), 10 evenly
        spaced boundaries are computed from the first and last timestamps of
        the observation_data.
        Default: [] (10 evenly spaced boundaries)

    tree_depth : integer, optional
        The depth of the decision tree built internally to train. By default,
        the model assumes a large complex dataset and will assign up to
        100 levels to the internal decision tree. This can be too many for
        smaller data sets with fewer columns.
        Default: 100

    verbose : bool, optional
        When set to True, more status output is generated.
        Default: True

    Returns
    -------
    out : ChurnPredictor
        A trained model of type
        :class:`~graphlab.churn_predictor.ChurnPredictor`.

    See Also
    --------
    ChurnPredictor


    Examples
    --------

    .. sourcecode:: python

        # Load a data set. The data set has 3 columns: user_id, timestamp, event
        # and must contain at least 100 rows of user activity.
        >>> sf = gl.SFrame('~/data/churn/actions_top_k.csv')

        # We set time_unit to 1000 because our data has timestamps in milliseconds.
        # Otherwise, we would omit it (or set it to 1).
        >>> model = gl.churn_predictor.create(sf,
                                              user_id="user_id",
                                              timestamp="timestamp",
                                              time_unit=1000)

        # For simplicity, we will predict on the input data set
        >>> model.predict(sf)

        # Output is in the form:
        +-------------------------------+----------------------+
        |            user_id            |   stay_probability   |
        +-------------------------------+----------------------+
        | ONE                           | 0.001                |
        | TWO                           | 99.99                |
        +-------------------------------+----------------------+

    """

    _mt._get_metric_tracker().track('{}.create'.format("toolkit.churn_predictor.create"))

    _raise_error_if_not_sframe(observation_data, "observation_data")
    _raise_error_if_not_of_type(user_id, [str])
    _raise_error_if_not_of_type(timestamp, [str])
    if user_data is not None:
        _raise_error_if_not_sframe(user_data, "user_data")

    time_aggregate_int = int(time_aggregate.total_seconds())
    if (time_aggregate_int <= 0):
        raise _ToolkitError("time_aggregate must be a positive time delta")

    if (observation_data.num_rows() < 100):
        raise _ToolkitError("This toolkit requires at least 100 rows of activity")

    # Cheap way to determine time units if not user-specified
    if time_unit == 0:
        # Set to 1 in case of using datetimes or other non-int formats
        time_unit = 1
        first_timestamp = observation_data[timestamp][0]
        if isinstance(first_timestamp, int):
            if (verbose): print("PROGRESS: Determining timestamp unit")
            max_timestamp = observation_data[timestamp].max()
            if len(str(max_timestamp)) >= 11:
                if (verbose): print("PROGRESS: Assuming timestamps are in milliseconds since 01/01/1970")
                time_unit = 1000
            else:
                if (verbose): print("PROGRESS: Assuming timestamps are in seconds since 01/01/1970")
                time_unit = 1

    if (verbose): print("PROGRESS: Initializing churn predictor")
    proxy = _gl.extensions._ChurnPredictor()
    proxy.define_columns(observation_data, timestamp, user_id, time_unit)
    proxy.define_columns3(observation_data, features)
    proxy.define_lookback_feature_periods(lookback_feature_periods)
    proxy.define_default_time_aggregate(time_aggregate_int)
    proxy.define_model_options(tree_depth)

    if (verbose): print("PROGRESS: Sorting input data by time order")
    sorted_data = proxy.sort_by_time(observation_data, "")

    if (verbose): print("PROGRESS: Aggregating input data by groups of " + str(time_aggregate))
    aggregated_by_time = proxy.aggregate_by_time(sorted_data, True, 0, "", "")

    if (time_boundaries):
        unix_timestamps = []
        for dt in time_boundaries: unix_timestamps.append(int(_time.mktime(dt.timetuple())))
        time_boundaries = unix_timestamps

    stpcnt = 10
    if (not time_boundaries and isinstance(sorted_data[timestamp][0], _datetime.datetime)):
        min_time = sorted_data[timestamp][0]
        max_time = sorted_data[timestamp][sorted_data[timestamp].size() - 1]
        step = (max_time - min_time) / stpcnt
        if (verbose): print("PROGRESS: No time boundaries specified, computing 10 boundaries from " + str(min_time) + " to " + str(max_time))
        time_boundaries = [(i+1) * step + min_time for i in range(stpcnt - 1)]

    if (not time_boundaries):
        min_time = sorted_data[timestamp][0] / time_unit
        max_time = sorted_data[timestamp][sorted_data[timestamp].size() - 1] / time_unit
        step = (max_time - min_time) / stpcnt
        if step <= 0:
            raise _ToolkitError("Not enough time in the training data. There should be more than " + str(stpcnt) + " units of time.")
        if (verbose): print("PROGRESS: No time boundaries specified, computing 10 boundaries from " + str(_datetime.datetime.fromtimestamp(min_time)) + " to " + str(_datetime.datetime.fromtimestamp(max_time)))
        time_boundaries = range(min_time + step, max_time, step - 1)

    if (not time_boundaries):
        raise _ToolkitError("Not enough time boundaries defined, at least one must be defined")

    big_user_aggregate = None
    for time_boundary in time_boundaries:

        if isinstance(time_boundary, _datetime.datetime):
            time_boundary = int(_time.mktime(time_boundary.timetuple()))

        if (verbose): print("PROGRESS: Generating user data for aggregate " + str(_datetime.datetime.fromtimestamp(time_boundary)))
        user_aggregate = proxy.per_user_aggregate(aggregated_by_time, time_boundary, "", lookback_feature_periods)

        if big_user_aggregate is None:
            big_user_aggregate = user_aggregate
        else:
            big_user_aggregate = big_user_aggregate.append(user_aggregate)

    if user_data is not None:
        if (verbose): print("PROGRESS: Joining with user data")
        big_user_aggregate = big_user_aggregate.join(user_data, on=user_id, how="inner")

    if (verbose): print("PROGRESS: Training model")
    proxy.train_model(big_user_aggregate, "")

    if (verbose): print("PROGRESS: All done!")
    return ChurnPredictor(proxy, big_user_aggregate)
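# Illustrative sketch (not part of the original snippet): training with an
# hourly roll-up and explicit time boundaries, per the parameters documented
# above. `sf` is assumed to be an activity-log SFrame like the one in the
# docstring example.
import datetime
import graphlab as gl

boundaries = [datetime.datetime(2015, 6, 1), datetime.datetime(2015, 9, 1)]
hourly_model = gl.churn_predictor.create(
    sf,
    user_id='user_id',
    timestamp='timestamp',
    time_aggregate=datetime.timedelta(hours=1),
    lookback_feature_periods=[24, 48, 168],
    time_boundaries=boundaries)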
Example #41
0
def create(dataset, tag_name=None, features=None, verbose=True):
    """
    Create a :class:`NearestNeighborAutoTagger`
    model, which can be used to quickly apply tags from a reference set of text
    labels to a new query set using the ``tag`` method.

    Parameters
    ----------
    dataset : SFrame
        Reference data. This SFrame must contain at least one column. By
        default, only the ``tag_name`` column is used as the basis for
        tagging. You may optionally include additional columns with the
        ``features`` parameter.

    tag_name : string, optional
        Name of the column in ``dataset`` with the tags. This column must
        contain string values. If ``dataset`` contains more than one column,
        ``tag_name`` must be specified.

    features : list[string], optional
        Names of the columns with features to use as the basis for tagging.
        'None' (the default) indicates that only the column specified by the
        ``tag_name`` parameter should be used. Only str or list fields are
        allowed. If a column of type list is specified, all values must be
        either of type string or convertible to type string.

    verbose : bool, optional
        If True, print verbose output during model creation.

    Returns
    -------
    out : model
        A model for quickly tagging new query observations with entries from
        `dataset`. Currently, the only implementation is the following:

        - NearestNeighborAutoTagger

    See Also
    --------
    graphlab.nearest_neighbors.NearestNeighborsModel

    Examples
    --------
    First construct a toy `SFrame` of actor names, which will serve as the
    reference set for our autotagger model.

    >>> actors_sf = gl.SFrame(
            {"actor": ["Will Smith", "Tom Hanks", "Bradley Cooper",
                       "Tom Cruise", "Jude Law", "Robert Pattinson",
                       "Matt Damon", "Brad Pitt", "Johnny Depp",
                       "Leonardo DiCaprio", "Jennifer Aniston",
                       "Jessica Alba", "Emma Stone", "Cameron Diaz",
                       "Scarlett Johansson", "Mila Kunis", "Julia Roberts",
                       "Charlize Theron", "Marion Cotillard",
                       "Angelina Jolie"]})
    >>> m = gl.data_matching.nearest_neighbor_autotagger.create(
                actors_sf, tag_name="actor")

    Then we load some IMDB movie reviews into an `SFrame` and tag them using
    the model we created above. The score field in the output is a
    similarity score, indicating the strength of the match between the query
    data and the suggested reference tag.

    >>> reviews_sf = gl.SFrame(
            "https://static.turi.com/datasets/imdb_reviews/reviews.sframe")
    >>> m.tag(reviews_sf.head(10), query_name="review", verbose=False)
    +-----------+-------------------------------+------------------+-----------------+
    | review_id |             review            |      actor       |      score      |
    +-----------+-------------------------------+------------------+-----------------+
    |     0     | Story of a man who has unn... |   Cameron Diaz   | 0.0769230769231 |
    |     0     | Story of a man who has unn... |  Angelina Jolie  | 0.0666666666667 |
    |     0     | Story of a man who has unn... | Charlize Theron  |      0.0625     |
    |     0     | Story of a man who has unn... | Robert Pattinson | 0.0588235294118 |
    |     1     | Bromwell High is a cartoon... |   Jessica Alba   |      0.125      |
    |     1     | Bromwell High is a cartoon... | Jennifer Aniston |       0.1       |
    |     1     | Bromwell High is a cartoon... | Charlize Theron  |       0.05      |
    |     1     | Bromwell High is a cartoon... | Robert Pattinson |  0.047619047619 |
    |     1     | Bromwell High is a cartoon... | Marion Cotillard |  0.047619047619 |
    |     2     | Airport '77 starts as a br... |  Julia Roberts   | 0.0961538461538 |
    |    ...    |              ...              |       ...        |       ...       |
    +-----------+-------------------------------+------------------+-----------------+

    The initial results look a little noisy. To filter out obvious spurious
    matches, we can set the `tag` method's similarity_threshold parameter.

    >>> m.tag(reviews_sf.head(1000), query_name="review", verbose=False,
              similarity_threshold=.8)
    +-----------+-------------------------------+------------------+----------------+
    | review_id |             review            |      actor       |     score      |
    +-----------+-------------------------------+------------------+----------------+
    |    341    | I caught this film at a te... |  Julia Roberts   | 0.857142857143 |
    |    657    | Fairly funny Jim Carrey ve... | Jennifer Aniston | 0.882352941176 |
    |    668    | A very funny movie. It was... | Jennifer Aniston | 0.833333333333 |
    |    673    | This film is the best film... | Jennifer Aniston |     0.9375     |
    +-----------+-------------------------------+------------------+----------------+

    In this second example, you'll notice that the ``review_id`` column is much
    more sparse. This is because all results whose score was below the specified
    similarity threshold (.8) were excluded from the output.

    """
    # validate the 'dataset' input
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    # ensure that tag_name is provided if dataset has > 1 column
    if dataset.num_cols() > 1 and not tag_name:
        raise _ToolkitError("No tag_name parameter specified on dataset " \
                            "with %d columns" % dataset.num_cols())
    tag_name = tag_name or dataset.column_names()[0]

    # ensure that column with name tag_name exists
    if tag_name not in dataset.column_names():
        raise _ToolkitError('No column named "%s" in dataset' % tag_name)

    # ensure that column is of type string
    if dataset[tag_name].dtype() != str:
        raise TypeError("The column used as the tag name must be of type " \
                        "string.")

    # use reasonable default for general case
    distance = _gl.distances.weighted_jaccard

    # if additional features are specified, ensure they are of appropriate types
    if features is not None and (not isinstance(features, list) or
                                 not all(isinstance(x, str) for x in features)):
        raise TypeError("The features parameter must be a list of strings " \
                        "and those strings must correspond to columns in " \
                        "`dataset`.")

    # at a minimum, this SFrame will contain the tags as features;
    features = features or []
    features = [tag_name] + [x for x in features if x != tag_name]

    # ensure that each specified feature column is either of type list or str
    column_names = set(dataset.column_names())
    for col_name in features:
        if col_name not in column_names:
            raise _ToolkitError("Specified feature column (%s) not found " \
                                "in dataset" % col_name)

        if dataset.select_column(col_name).dtype() not in (str, list):
            raise TypeError("Only string and list columns are allowed as " \
                            "features.")

    # concatenate the feature columns into a single column
    features_sf = dataset.select_columns(features)
    feature_col, features_sf = _concat_string_features(features_sf, features)

    # compute features
    if verbose:
        _logging.getLogger().info("Extracting features...")

    features = _preprocess(features_sf.select_column(feature_col))

    # group by tag_name to ensure that tags are unique
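    #  (aggregate.SELECT_ONE keeps one arbitrary feature row per tag value, so
    #  duplicated tags collapse into a single reference record)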
    feature_cols = features.column_names()
    select_cols = {col_name: _gl.aggregate.SELECT_ONE(col_name) for col_name \
                   in feature_cols}
    features.add_column(dataset[tag_name], tag_name)
    features = features.groupby(tag_name, select_cols)

    # create nearest neighbors model
    m = _gl.nearest_neighbors.create(features,
                                     label=tag_name,
                                     distance=distance,
                                     features=feature_cols,
                                     verbose=verbose)

    # add standard toolkit state attributes
    state = {
        "nearest_neighbors_model": m,
        "training_time": m.get("training_time"),
        "tag_name": tag_name,
        "verbose": verbose,
        "num_examples": len(features),
        "features": feature_cols,
        "num_features": len(feature_cols),
        "distance": m.get("distance")
    }

    model = NearestNeighborAutoTagger(state)
    return model
Example #42
0
def create(datasets, row_label=None, features=None, grouping_features=None,
           distance=None, k=2, radius=None, verbose=True):
    """
    Create a deduplication model based on nearest neighbors and SGraph connected
    components.

    This method creates a :class:`NearestNeighborDeduplication` model by
    constructing a nearest neighbors similarity graph on all of the rows in the
    input 'datasets', then using the connected components tool in the
    :mod:`~graphlab.toolkits.graph_analytics` module to assign an entity label
    to each record. Records which share the same label are considered to be
    duplicates.

    .. warning::

        The 'dot_product' distance is deprecated and will be removed in future
        versions of GraphLab Create. Please use 'transformed_dot_product'
        distance instead, although note that this is more than a name change; it
        is a *different* transformation of the dot product of two vectors.
        Please see the distances module documentation for more details.

    Parameters
    ----------
    datasets : SFrame or list[SFrame] or dict(string: SFrame)
        Input datasets. Each SFrame in the list must include all of the features
        specified in the `features` or 'distance' parameters, but may
        have additional columns as well. SFrames can be input as values in a
        dictionary, where the keys are strings used in the output to identify
        the SFrame from which each record originated.

    row_label : string, optional
        Name of the SFrame column with row labels. If not specified, row numbers
        are used to identify rows in the output.

    features : list[string], optional
        Name of the columns with features to use in comparing records. 'None'
        (the default) indicates the intersection of columns over all SFrames in
        `datasets` should be used (except the label column, if specified). Each
        column can be one of the following types:

        - *Numeric*: values of numeric type integer or float.

        - *Array*: array of numeric (integer or float) values. Each array
          element is treated as a separate variable in the model.

        - *Dictionary*: key-value pairs with numeric (integer or float) values.
          Each key indicates a separate variable in the model.

        - *String*: string values.

        Please note: if `distance` is specified as a composite distance, then
        that parameter controls which features are used in the model. Any
        additional columns named in 'features' will be included in the model
        output but not used for distance computations.

    grouping_features : list[string], optional
        Names of features to use in grouping records before finding approximate
        matches. These columns must have string or integer type data. See the
        Notes section for more details on grouping.

    distance : string or list[list], optional
        Function to measure the distance between any two input data rows. This
        may be one of two types:

        - *String*: the name of a standard distance function. One of
          'euclidean', 'squared_euclidean', 'manhattan', 'levenshtein',
          'jaccard', 'weighted_jaccard', 'cosine', 'dot_product' (deprecated),
          or 'transformed_dot_product'.

        - *Composite distance*: the weighted sum of several standard distance
          functions applied to various features. This is specified as a list of
          distance components, each of which is itself a list containing three
          items:

          1. list or tuple of feature names (strings)

          2. standard distance name (string)

          3. scaling factor (int or float)

        For more information about GraphLab Create distance functions, please
        see the :py:mod:`~graphlab.toolkits.distances` module.

        For sparse vectors, missing keys are assumed to have value 0.0.

        If 'distance' is left unspecified or set to 'auto', a composite distance
        is constructed automatically based on feature types.

    k : int, optional
        Number of neighbors to consider for each point.

    radius : float, optional
        Maximum distance from each point to a potential duplicate.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : NearestNeighborDeduplication model
        The NearestNeighborDeduplication object contains a field 'entities'
        which shows the entity label for each input record. It also shows the
        features for each record that are used to construct the model, as well
        as the original SFrame and row label for each record. If the original
        `datasets` are passed in a list, the SFrame identifier is the index of
        the SFrame in that list.

    See Also
    --------
    NearestNeighborDeduplication, graphlab.toolkits.nearest_neighbors,
    graphlab.SFrame.groupby

    Notes
    -----
    - Standardizing features is often a good idea with distance-based methods,
      but this model does *not* standardize features.

    - For datasets with more than about 10,000 records, *grouping* (also known
      as *blocking*) is a critical step to avoid computing distances between all
      pairs of records. The grouping step simply assigns each record to a group
      that has identical values for all `grouping_features`, and only looks for
      duplicates within each group.

    - Records with missing data in the `grouping_features` are removed from
      consideration as duplicates. These records are given the entity label
      "None".

    - For tasks that require *only* exact matches on certain features, it is
      generally more natural to use the SFrame `groupby` function.

    - For features that all have the same type, the distance parameter may be a
      single standard distance function name (e.g. "euclidean"). In the model,
      however, all distances are first converted to composite distance
      functions; as a result, the 'distance' field in the model is always a
      composite distance.

    References
    ----------
    - Christen, Peter. "Data matching: concepts and techniques for record
      linkage, entity resolution, and duplicate detection." Springer Science &
      Business Media, 2012.

    Examples
    --------
    >>> sf1 = graphlab.SFrame({'id': [0, 1, 2],
    ...                        'x0': [0.5, 0.5, 0.3],
    ...                        'x1': [1., 0.8, 0.6],
    ...                        'city': ['seattle', 'olympia', 'boston'],
    ...                        'state': ['WA', 'WA', 'MA']})
    ...
    ... # note: misspellings in the following dataset do not prevent correct
    ... # matches.
    >>> sf2 = graphlab.SFrame({'id': [9, 10],
    ...                        'x0': [0.35, 0.4],
    ...                        'x1': [0.65, 0.8],
    ...                        'city': ['bostan', 'seatle'],
    ...                        'state': ['MA', 'WA']})
    ...
    >>> dist = [[('city',), 'levenshtein', 2],
    ...         [('x0', 'x1'), 'euclidean', 1.5]]
    ...
    >>> m = graphlab.nearest_neighbor_deduplication.create({'a': sf1, 'b': sf2},
    ...                                                    row_label='id',
    ...                                                    grouping_features=['state'],
    ...                                                    distance=dist, k=None,
    ...                                                    radius=3)
    ...
    >>> print m['entities']
    +----------+----+----------+-------+------+---------+------+
    | __sframe | id | __entity | state |  x0  |   city  |  x1  |
    +----------+----+----------+-------+------+---------+------+
    |    a     | 1  |    0     |   WA  | 0.5  | olympia | 0.8  |
    |    a     | 0  |    1     |   WA  | 0.5  | seattle | 1.0  |
    |    b     | 10 |    1     |   WA  | 0.4  |  seatle | 0.8  |
    |    a     | 2  |    2     |   MA  | 0.3  |  boston | 0.6  |
    |    b     | 9  |    2     |   MA  | 0.35 |  bostan | 0.65 |
    +----------+----+----------+-------+------+---------+------+
    [5 rows x 7 columns]
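
    The datasets can also be passed as a list, in which case the '__sframe'
    column of the output holds each SFrame's index in that list. A sketch of
    the same call with list input:

    >>> m2 = graphlab.nearest_neighbor_deduplication.create(
    ...         [sf1, sf2], row_label='id', grouping_features=['state'],
    ...         distance=dist, k=None, radius=3)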
    """

    ## Set up
    _mt._get_metric_tracker().track('{}.create'.format(__name__))
    start_time = _time.time()

    model = NearestNeighborDeduplication()
    model.__proxy__['verbose'] = verbose
    model.__proxy__['k'] = k
    model.__proxy__['radius'] = radius


    ### ----------------------------- ###
    ### Validation and preprocessing ###
    ### ----------------------------- ###

    ### Validate input datasets
    ### -----------------------

    ## If datasets is already a dict, check the keys are all strings
    if isinstance(datasets, dict):
        if not(all([isinstance(x, str) for x in datasets.keys()])):
            raise ValueError("Keys in the 'datasets' dict must be strings.")

    ## Convert singleton SFrame dataset into a list of datasets
    if isinstance(datasets, _gl.SFrame):
        _raise_error_if_sframe_empty(datasets, "dataset")
        datasets = {0: datasets}

    ## Convert a list of SFrames into a dict
    if isinstance(datasets, list):
        datasets = {k: sf for k, sf in enumerate(datasets)}

    ## At this point, 'datasets' must be dict. If it's not, something is wrong.
    if not isinstance(datasets, dict):
        raise TypeError("Input 'datasets' must be an SFrame, a list of SFrames, " +
                        "or a dictionary of (string, SFrame) pairs.")

    model.__proxy__['num_datasets'] = len(datasets)

    ## Ensure that all datasets are SFrames
    for d in datasets.values():
        _raise_error_if_not_sframe(d, "dataset")


    ### Validate row label
    ### ------------------

    ## Validate the label column
    if row_label:
        if not isinstance(row_label, str):
            raise TypeError("The 'row_label' parameter must be the name (string " +
                            "type) of a column in each of the input datasets.")

        for d in datasets.values():
            if row_label not in d.column_names():
                raise _ToolkitError("The specified row_label column does not " +
                                    " exist in all input datasets.")
    else:
        row_label = 'row_number'

        for d in datasets.values():
            if row_label in d.column_names():
                raise _ToolkitError("Input 'row_label' defaulted to " +
                                    "'row_number', which is already a column" +
                                    " in at least one input dataset. Please " +
                                    "specify a row label column manually.")

    model.__proxy__['row_label'] = row_label


    ### Validate 'features' and 'grouping_features' parameters
    ### ------------------------------------------------------
    if features is not None:
        if not hasattr(features, '__iter__'):
            raise TypeError("Input 'features' must be a list.")

        if not all([isinstance(x, str) for x in features]):
            raise TypeError("Input 'features' must contain only strings.")

    if grouping_features is not None:
        if not hasattr(grouping_features, '__iter__'):
            raise TypeError("Input 'grouping_features' must be a list.")

        if not all([isinstance(x, str) for x in grouping_features]):
            raise TypeError("Input 'grouping_features' must contain only strings.")


    ### Validate and preprocess the distance function
    ### ---------------------------------------------
    # - The form of the 'distance' controls how we interact with the 'features'
    #   parameter as well.

    ## Find the intersection of all feature sets and feature types
    col_types = {k: v for k, v in zip(list(datasets.values())[0].column_names(),
                                      list(datasets.values())[0].column_types())}

    all_features = [sf.column_names() for sf in datasets.values()]
    ftr_intersection = list(set(all_features[0]).intersection(*all_features))
    ftr_intersection = [x for x in ftr_intersection if x != row_label]


    ## Convert features and distance arguments into a composite distance.
    if isinstance(distance, list):
        distance = _copy.deepcopy(distance)

    elif isinstance(distance, str) and distance != 'auto':
        if features is not None:
            distance = [[features, distance, 1]]
        else:
            distance = [[ftr_intersection, distance, 1]]

    elif distance is None or distance == 'auto':
        if features is not None:
            distance = _construct_auto_distance(features, col_types)
        else:
            distance = _construct_auto_distance(ftr_intersection, col_types)

    else:
        raise TypeError("Input 'distance' not understood. Note that for the " +
                         "data matching toolkit, 'distance' must be a string or " +
                         "a composite distance list."   )


    ## Validate the form of the composite distance and add to the model
    allowed_dists = {
        'euclidean': [int, float, _array.array],
        'squared_euclidean': [int, float, _array.array],
        'manhattan': [int, float, _array.array],
        'levenshtein': [str],
        'jaccard': [str, dict],
        'weighted_jaccard': [str, dict],
        'cosine': [int, float, str, dict, _array.array],
        'dot_product': [int, float, str, dict, _array.array],
        'transformed_dot_product': [int, float, str, dict, _array.array]}

    distance = _dmutl.validate_composite_distance(distance, row_label,
                                                  list(allowed_dists.keys()),
                                                  verbose)
    model.__proxy__['distance'] = _copy.deepcopy(distance)


    ## Figure out which features are 'fuzzy', i.e. used for approximate
    #  matching, and set in the model state.
    fuzzy_features = _dmutl.extract_composite_features(distance)  # already has row_label removed

    model.__proxy__['features'] = fuzzy_features
    model.__proxy__['num_features'] = len(fuzzy_features)


    ## Compile a master list of all features. This includes grouping features,
    #  fuzzy features (the ones used for approximate matching), and "ancillary"
    #  features, which are specified by the user in the 'features' parameter
    #  but not included in the 'distance' specification.
    if features is None:
        features = []
    else:
        features = [x for x in features if x != row_label]

    if grouping_features is None:
        grouping_features = []
    else:
        grouping_features = [x for x in grouping_features if x != row_label]

    model.__proxy__['grouping_features'] = grouping_features
    model.__proxy__['num_grouping_features'] = len(grouping_features)

    master_features = list(set(features + grouping_features + fuzzy_features))


    ### Consolidate data and engineer features
    ### --------------------------------------

    ## Consolidate multiple input datasets into a single SFrame, with a useful
    #  row label.
    sf_union = _dmutl.concatenate_sframes(datasets, row_label=row_label,
                                   features=master_features,
                                   sf_index_name='__sframe')
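    # The combined label has the form '<sframe key>.<row label>', e.g. 'a.0'.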
    overall_label = '__sframe.' + row_label
    sf_union[overall_label] = (sf_union['__sframe'].astype(str) + "." +
                               sf_union[row_label].astype(str))


    ## Validate the feature types in the consolidated dataset against the
    #  specified distance functions.
    _dmutl.validate_distance_feature_types(sf_union, distance, allowed_dists)


    ## Clean string-type features in the fuzzy feature set.
    for ftr in fuzzy_features:
        if col_types[ftr] == str:
            new_ftr = '__clean.' + ftr
            sf_union[new_ftr] = sf_union[ftr].fillna("")
            sf_union[new_ftr] = sf_union[new_ftr].apply(
                lambda x: _dmutl.cleanse_string(x), dtype=str)

            for dist_comp in distance:
                dist_comp[0] = [new_ftr if x == ftr else x for x in dist_comp[0]]


    ## Feature engineering, distance-component-wise. Also update list of
    #  features and a map to their types.
    sf_union, distance = _engineer_distance_features(sf_union, distance)
    transformed_features = _dmutl.extract_composite_features(distance)

    ### -------------------------------------------- ###
    ### Main loop over blocks of neighbor candidates ###
    ### -------------------------------------------- ###

    ## Construct blocks on features that must match exactly
    if verbose:
        _logging.info("Constructing groups of records that match exactly on " +
                      "the 'grouping_features'.")

    sf_union, block_errors, blocks = \
        _dmutl.construct_exact_blocks(sf_union, grouping_features)

    if verbose and len(distance) > 0 and blocks['Count'].max() > 10000:
        _logging.warning("There are more than 10,000 records in the largest match " +
            "group. For many uses, approximate matches within each match group are " +
            "computed with brute force nearest neighbors, which may be slow. " +
            "Consider using smaller groups by requiring different features to " +
            "match exactly.")

    max_entity_number = 0
    sf_entity = _gl.SFrame()
    output_features = (master_features + [row_label, '__sframe', '__entity'])

    ## Main loop over blocks
    for i, block in enumerate(blocks):

        if verbose:
            _logging.info("Processing {} records in match group: {}/{}".format(block['Count'],
                                                                         i+1,
                                                                         len(blocks)))

        ## Retrieve records in the block and impute the mean for missing numeric
        #  values.
        records = sf_union[block['min_idx']:(block['max_idx'] + 1)]
        complete_records = _dmutl.impute_numeric_means(records, transformed_features)

        if len(distance) > 0:
            ## Run all-point nearest neighbors
            if verbose:
                _logging.info("Building the similarity graph....")

            m = _gl.nearest_neighbors.create(complete_records, label=overall_label,
                                             distance=distance, verbose=False)
            knn = m.query(complete_records, label=overall_label, k=k, radius=radius,
                          verbose=verbose)


            ## Construct similarity graph to resolve transitive closure
            sg = _gl.SGraph()
            sg = sg.add_vertices(records[[overall_label]], vid_field=overall_label)
            sg = sg.add_edges(knn, src_field='query_label',
                              dst_field='reference_label')


            ## Cut the similarity graph to establish an entity for each vertex
            if verbose:
                _logging.info("Finding duplicate records in the similarity graph....")

            cc = _gl.connected_components.create(sg, verbose=verbose)

            ## Relabel the component IDs to be consecutive integers starting with
            #  the max index of the previous block's entity labels.
            block_labels = cc['component_size'].add_row_number('__entity')
            block_labels['__entity'] += max_entity_number
            max_entity_number += block_labels.num_rows()
            block_entity_labels = cc['component_id'].join(block_labels,
                                                          on='component_id',
                                                          how='left')

            ## Join the entity labels for the block back to the block's records,
            #  then append to the master output
            records = records.join(block_entity_labels[['__id', '__entity']],
                                   on={overall_label: '__id'}, how='left')
            records = records.sort('__entity')

        else:  # no fuzzy features, so no nearest neighbors, just block ID
            records['__entity'] = _gl.SArray.from_const(i, len(records))


        sf_entity = sf_entity.append(records[output_features])


    ### ------------------------------------- ###
    ### Postprocessing and results formatting ###
    ### ------------------------------------- ###

    ## Add rows missing from the blocking back to the master results
    if len(block_errors) > 0:
        block_errors['__entity'] = _gl.SArray.from_const(None, len(block_errors)).astype(int)
        sf_entity = sf_entity.append(block_errors[output_features])

    ## Rearrange columns
    sf_entity.swap_columns('__sframe', sf_entity.column_names()[0])
    sf_entity.swap_columns(row_label, sf_entity.column_names()[1])
    sf_entity.swap_columns('__entity', sf_entity.column_names()[2])


    ## Finalize the model state
    model.__proxy__['training_time'] = _time.time() - start_time
    model.__proxy__['entities'] = sf_entity
    model.__proxy__['num_entities'] = max_entity_number

    return model
Example #43
0
def create(dataset, features=None, verbose=True):
    """
    Create an anomaly detection model. Based on the type of the input data,
    this function automatically chooses the anomaly detection model and the
    type of anomalies to search for. Generally speaking, if the input data
    appears to be a time series (the dataset type is TimeSeries, one of the
    features has type datetime.datetime, or there is only a single feature),
    the toolkit chooses the moving Z-score model.

    Parameters
    ----------
    dataset : SFrame or TimeSeries
        Input dataset. Determines the type of anomaly detection model and types
        of anomalies to search for.

    features : list[str], optional
        Names of columns in the input 'dataset' to use as features.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    model : GraphLab Create model

    See Also
    --------
    local_outlier_factor.create, graphlab.toolkits.dbscan.create

    Examples
    --------
    >>> sf = graphlab.SFrame({'x0': [0., 1., 1., 0., 1., 0., 5.],
    ...                       'x1': [2., 1., 0., 1., 2., 1.5, 2.5]})
    ...
    >>> m = graphlab.anomaly_detection.create(sf)
    >>> type(m)
    graphlab.toolkits.anomaly_detection.local_outlier_factor.LocalOutlierFactorModel
    ...
    >>> m['scores']
    +--------+----------------------+
    | row_id | local_outlier_factor |
    +--------+----------------------+
    |   2    |    0.951567102896    |
    |   0    |    0.951567102896    |
    |   5    |    1.00783754045     |
    |   4    |    0.982224576307    |
    |   3    |    1.05829898642     |
    |   1    |    1.05829898642     |
    |   6    |    2.52792223974     |
    +--------+----------------------+
    [7 rows x 2 columns]
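
    If the data look like a time series (for example, because one column has
    datetime type), a moving Z-score model is created instead. A minimal sketch,
    assuming we add a hypothetical datetime column named 'time' to the same data:

    >>> import datetime
    >>> sf['time'] = graphlab.SArray(
    ...     [datetime.datetime(2016, 1, i + 1) for i in range(7)])
    >>> m2 = graphlab.anomaly_detection.create(sf, features=['time', 'x0'])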
    """
    _mt._get_metric_tracker().track('toolkit.anomaly_detection.create')

    ## Basic validation of the input dataset.
    if not isinstance(dataset, (_gl.SFrame, _gl.TimeSeries)):
        raise TypeError("Input 'dataset' must be an SFrame or TimeSeries.")

    if len(dataset) < 1 or len(dataset.column_names()) < 1:
        raise TypeError("Input 'dataset' is empty.")

    ## Figure out the features and do basic validation.
    if features is None:
        features = dataset.column_names()

    if (not isinstance(features, list)
            or not all([type(c) == str for c in features])):

        raise TypeError("If specified, input 'features' must be a list " +
                        "of strings.")

    if not all([c in dataset.column_names() for c in features]):
        raise _ToolkitError("The specified features could not all be found " +
                            "in the input 'dataset'.")

    ## If any valid features are datetime types LOF is not valid.
    ## If there is more than one feature Z-score is not valid.

    # Figure out if there is a datetime column.
    col_types = {
        k: v
        for k, v in zip(dataset.column_names(), dataset.column_types())
    }

    datetime_features = [c for c in features if col_types[c] == _dt.datetime]
    value_features = [c for c in features if col_types[c] != _dt.datetime]

    ## Decide which model to use.
    try_zscore = False

    if isinstance(dataset, _gl.TimeSeries):
        try_zscore = True

    else:  # dataset is an SFrame
        if len(datetime_features) > 0:
            try_zscore = True

        if len(value_features) == 1 and (col_types[value_features[0]]
                                         in (int, float)):
            try_zscore = True

    ## Create the relevant model.
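    #  'bandwidth' (5% of the rows, with a minimum of 1) is reused below as the
    #  moving Z-score window size or the local outlier factor neighborhood size.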
    bandwidth = max(1, int(0.05 * len(dataset)))

    if try_zscore:
        if len(value_features) != 1 or len(datetime_features) > 1:
            raise _ToolkitError(
                "Cannot select an appropriate anomaly " +
                "detection model. For a " +
                "local outlier factor model, please remove " +
                "any datetime-type features. For a moving" +
                "Z-score model, please identify one data" +
                "feature (integer- or float-type) and at most" +
                "one datetime column as an index (this indexing is done" +
                "automatically for TimeSeries objects)")

        if isinstance(dataset, _gl.SFrame) and len(datetime_features) == 1:
            _dataset = _gl.TimeSeries(dataset, index=datetime_features[0])
        else:
            _dataset = dataset[:]

        if verbose:
            print("Creating a moving Z-score anomaly detection model.")

        model = _gl.moving_zscore.create(dataset=_dataset,
                                         feature=value_features[0],
                                         window_size=bandwidth,
                                         verbose=verbose)

    ## If not doing the moving z-score, do local outlier factor.
    else:
        if verbose:
            print("Creating a local outlier factor model.")

        model = _gl.local_outlier_factor.create(dataset=dataset,
                                                features=features,
                                                num_neighbors=bandwidth,
                                                verbose=verbose)

    return model
Example #44
0
def create(dataset, num_clusters=None, features=None, initial_centers=None,
           max_iterations=10, batch_size=None, verbose=True):
    r"""
    Run the k-means++ clustering algorithm, returning a KmeansModel object
    that contains the cluster centers and the cluster assignment for
    each data point in the dataset.

    Given a number of clusters, k-means++ iteratively chooses the best cluster
    centers and assigns nearby points to the best cluster. If no points change
    cluster membership between iterations, the algorithm terminates.

    Parameters
    ----------
    dataset : SFrame
        Each row in the SFrame is an observation.

    num_clusters : int
        Number of clusters. This is the 'k' in k-means.

    features : list[string], optional
        Names of feature columns to use in computing distances between
        observations and cluster centers. 'None' (the default) indicates that
        all columns should be used as features. Columns may be of the following
        types:

        - *Numeric*: values of numeric type integer or float.

        - *Array*: list of numeric (integer or float) values. Each list element
          is treated as a distinct feature in the model.

        - *Dict*: dictionary of keys mapped to numeric values. Each unique key
          is treated as a distinct feature in the model.

        Note that columns of type *list* are not supported. Convert them to
        array columns if all entries in the list are of numeric types.

    initial_centers : SFrame, optional
        If None (default), k-means++ intelligently chooses initial cluster
        centers. Otherwise, the algorithm starts with the centers provided in
        this SFrame. If this SFrame is provided, the ``num_clusters`` parameter
        does not need to be specified. ``initial_centers`` must have the columns
        specified in the ``features`` argument.

    max_iterations : int, optional
        The maximum number of iterations to run. Prints a warning if the
        algorithm does not converge after max_iterations iterations. If set to
        0, the model returns clusters defined by the initial centers and
        assignments to those centers.

    batch_size : int, optional    
        Number of randomly-chosen data points to use in each iteration. If
        `None` (the default) or greater than the number of rows in `dataset`,
        then this parameter is ignored: all rows of `dataset` are used in each
        iteration and model training terminates once point assignments stop
        changing or `max_iterations` is reached.

    verbose : bool, optional
        If True, print model training progress to the screen.

    Returns
    -------
    out : KmeansModel
        A Model object containing a cluster id for each vertex, and the centers
        of the clusters.

    See Also
    --------
    KmeansModel

    References
    ----------
    - `Wikipedia - k-means clustering
      <http://en.wikipedia.org/wiki/K-means_clustering>`_

    - Arthur, D. and Vassilvitskii, S. (2007) `k-means++: The Advantages of
      Careful Seeding <http://ilpubs.stanford.edu:8090/778/1/2006-13.pdf>`_. In
      Proceedings of the Eighteenth Annual ACM-SIAM Symposium on Discrete
      Algorithms. pp. 1027-1035.

    - Elkan, C. (2003) `Using the triangle inequality to accelerate k-means
      <http://www.aaai.org/Papers/ICML/2003/ICML03-022.pdf>`_. In Proceedings of
      the Twentieth International Conference on Machine Learning, Volume 3, pp.
      147-153.

    - Sculley, D. (2010) `Web Scale K-Means Clustering
      <http://www.eecs.tufts.edu/~dsculley/papers/fastkmeans.pdf>`_. In
      Proceedings of the 19th International Conference on World Wide Web. pp.
      1177-1178

    Examples
    --------
    >>> sf = graphlab.SFrame({
        "d1": [ 0.46973508, 0.0063261, 0.14143399, 0.35025834,
                0.83728709, 0.81438336, 0.74205833, 0.36273747,
                0.00793858, 0.02298716],
        "d2": [ 0.51050977, 0.82167952, 0.61451765, 0.51179513,
                0.35223035, 0.59366481, 0.48848649, 0.90026032,
                0.78798728, 0.40125452],
        "d3": [ 0.71716265, 0.54163387, 0.55577274, 0.12619953,
                0.80172228, 0.21519973, 0.21014113, 0.54207596,
                0.65432528, 0.00754797],
        "d4": [ 0.69486673, 0.92585721, 0.95461882, 0.72658554,
                0.86590678, 0.18017175, 0.60361348, 0.89223113,
                0.37992791, 0.44700959]
        })

    It's important to standardize our columns to get the best results possible
    from the k-means algorithm.

    >>> for col in ['d1', 'd2', 'd3', 'd4']:
            sf[col] = (sf[col] - sf[col].mean()) / sf[col].std()
    >>> model = graphlab.kmeans.create(sf, num_clusters=3)
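
    For larger datasets, passing ``batch_size`` trains on a random subset of
    rows in each iteration (the web-scale k-means approach referenced above). A
    sketch with an assumed batch size of 5:

    >>> model = graphlab.kmeans.create(sf, num_clusters=3, batch_size=5)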
    """

    _mt._get_metric_tracker().track('toolkit.kmeans.create')

    opts = {'model_name': 'kmeans',
            'max_iterations': max_iterations,
            'verbose': verbose}

    ## Validate input dataset
    if not (isinstance(dataset, _SFrame)):
        raise TypeError("Input 'dataset' must be an SFrame.")

    if dataset.num_rows() == 0 or dataset.num_cols() == 0:
        raise ValueError("Input 'dataset' has no data.")

    ## Validate input initial centers
    if initial_centers is not None:
        if not (isinstance(initial_centers, _SFrame)):
            raise TypeError("Input 'initial_centers' must be an SFrame.")

        if initial_centers.num_rows() == 0 or initial_centers.num_cols() == 0:
            raise ValueError("An 'initial_centers' argument is provided " +\
                             "but has no data.")

    ## Validate number of clusters
    if initial_centers is None:
        if num_clusters is None:
            raise ValueError("Number of clusters cannot be determined from " +\
                             "'num_clusters' or 'initial_centers'. You must " +\
                             "specify one of these arguments.")
        else:
            _num_clusters = num_clusters

    else:
        num_centers = initial_centers.num_rows()

        if num_clusters is None:
            _num_clusters = num_centers
        else:
            if num_clusters != num_centers:
                raise ValueError("The value of 'num_clusters' does not match " +\
                                 "the number of provided initial centers. " +\
                                 "Please provide only one of these arguments " +\
                                 "or ensure the values match.")
            else:
                _num_clusters = num_clusters

    if not isinstance(_num_clusters, int):
        raise _ToolkitError("Parameter 'num_clusters' must be an integer.")

    if _num_clusters > dataset.num_rows():
        raise ValueError("The desired number of clusters exceeds the number " +
                         "of data points. Please set 'num_clusters' to be " +
                         "smaller than the number of data points.")

    opts['num_clusters'] = _num_clusters

    ## Validate the features in the dataset
    features = _select_valid_features(dataset, features, [_array, dict, int, float])
    sf_features = dataset.select_columns(features)
    opts['features'] = sf_features

    ## Validate the features in the initial centers (if provided)
    if initial_centers is not None:
        try:
            initial_centers = initial_centers.select_columns(features)
        except:
            raise ValueError("Specified features cannot be extracted from the " +\
                             "provided initial centers.")

        if initial_centers.column_types() != sf_features.column_types():
            raise TypeError("Feature types are different in the dataset and " +\
                             "initial centers.")

    else:
        initial_centers = _graphlab.SFrame()

    opts['initial_centers'] = initial_centers


    ## Validate the batch size and determine the training method.
    if batch_size is None:
        opts['method'] = 'elkan'
        opts['batch_size'] = dataset.num_rows()

    else:
        opts['method'] = 'minibatch'
        opts['batch_size'] = batch_size


    ## Create and return the model
    params = _graphlab.toolkits._main.run('kmeans_train', opts, verbose)
    return KmeansModel(params['model'])
def create(dataset, label=None, features=None, distance=None, method='auto',
           verbose=True, **kwargs):
    """
    Create a nearest neighbor model, which can be searched efficiently and
    quickly for the nearest neighbors of a query observation. If the `method`
    argument is specified as `auto`, the type of model is chosen automatically
    based on the type of data in `dataset`.

    .. warning::

        The 'dot_product' distance is deprecated and will be removed in future
        versions of GraphLab Create. Please use 'transformed_dot_product'
        distance instead, although note that this is more than a name change; it
        is a *different* transformation of the dot product of two vectors.
        Please see the distances module documentation for more details.

    Parameters
    ----------
    dataset : SFrame
        Reference data. If the features for each observation are numeric, they
        may be in separate columns of 'dataset' or a single column with lists
        of values. The features may also be in the form of a column of sparse
        vectors (i.e. dictionaries), with string keys and numeric values.

    label : string, optional
        Name of the SFrame column with row labels. If 'label' is not specified,
        row numbers are used to identify reference dataset rows when the model
        is queried.

    features : list[string], optional
        Name of the columns with features to use in computing distances between
        observations and the query points. 'None' (the default) indicates that
        all columns except the label should be used as features. Each column can
        be one of the following types:

        - *Numeric*: values of numeric type integer or float.

        - *Array*: list of numeric (integer or float) values. Each list element
          is treated as a separate variable in the model.

        - *Dictionary*: key-value pairs with numeric (integer or float) values.
          Each key indicates a separate variable in the model.

        - *String*: string values.

        Columns of type *list* are not supported. Convert them to array columns
        if all entries in the list are of numeric types. Please note: if a
        composite distance is also specified, this parameter is ignored.

    distance : string, function, or list[list], optional
        Function to measure the distance between any two input data rows. This
        may be one of three types:

        - *String*: the name of a standard distance function. One of
          'euclidean', 'squared_euclidean', 'manhattan', 'levenshtein',
          'jaccard', 'weighted_jaccard', 'cosine', 'dot_product' (deprecated),
          or 'transformed_dot_product'.

        - *Function*: a function handle from the
          :mod:`~graphlab.toolkits.distances` module.

        - *Composite distance*: the weighted sum of several standard distance
          functions applied to various features. This is specified as a list of
          distance components, each of which is itself a list containing three
          items:

          1. list or tuple of feature names (strings)

          2. standard distance name (string)

          3. scaling factor (int or float)

        For more information about GraphLab Create distance functions, please
        see the :py:mod:`~graphlab.toolkits.distances` module.

        If 'distance' is left unspecified or set to 'auto', a composite distance
        is constructed automatically based on feature types.

    method : {'auto', 'ball_tree', 'brute_force', 'lsh'}, optional
        Method for computing nearest neighbors. The options are:

        - *auto* (default): the method is chosen automatically, based on the
          type of data and the distance. If the distance is 'manhattan' or
          'euclidean' and the features are numeric or vectors of numeric values,
          then the 'ball_tree' method is used. Otherwise, the 'brute_force'
          method is used.

        - *ball_tree*: use a tree structure to find the k-closest neighbors to
          each query point. The ball tree model is slower to construct than the
          brute force model, but queries are faster than linear time. This
          method is not applicable for the cosine and dot product distances. See
          `Liu, et al (2004)
          <http://papers.nips.cc/paper/2666-an-investigation-of-practical-approximate-nearest-neighbor-algorithms>`_
          for implementation details.

        - *brute_force*: compute the distance from a query point to all
          reference observations. There is no computation time for model
          creation with the brute force method (although the reference data is
          held in the model), but each query takes linear time.

        - *lsh*: use Locality Sensitive Hashing (LSH) to find approximate
          nearest neighbors efficiently. The LSH model supports 'euclidean',
          'squared_euclidean', 'manhattan', 'cosine', 'jaccard',
          'dot_product' (deprecated), and 'transformed_dot_product' distances.
          Two options are provided for LSH -- ``num_tables`` and ``num_projections_per_table``.
          See the notes below for details.

    verbose : bool, optional
        If True, print progress updates and model details.

    **kwargs : optional
        Options for the distance function and query method.

        - *leaf_size*: for the ball tree method, the number of points in each
          leaf of the tree. The default is to use the max of 1,000 and n/(2^11),
          which ensures a maximum tree depth of 12.

        - *num_tables*: For the LSH method, the number of hash tables constructed.
          The default value is 20. We recommend choosing values from 10 to 30.

        - *num_projections_per_table*: For the LSH method, the number of
          projections/hash functions for each hash table. The default value
          is 4 for 'jaccard' distance, 16 for 'cosine' distance, and 8 for other
          distances. We recommend values from 2 to 6 for 'jaccard' distance,
          8 to 20 for 'cosine' distance, and 4 to 12 for other distances.

    Returns
    -------
    out : NearestNeighborsModel
        A structure for efficiently computing the nearest neighbors in 'dataset'
        of new query points.

    See Also
    --------
    NearestNeighborsModel.query, graphlab.toolkits.distances

    Notes
    -----
    - Missing data is not allowed in the 'dataset' provided to this function.
      Please use the :func:`graphlab.SFrame.fillna` and
      :func:`graphlab.SFrame.dropna` utilities to handle missing data before
      creating a nearest neighbors model.

    - Missing keys in sparse vectors are assumed to have value 0.

    - The `composite_params` parameter was removed as of GraphLab Create
      version 1.5. The `distance` parameter now accepts either standard or
      composite distances. Please see the :mod:`~graphlab.toolkits.distances`
      module documentation for more information on composite distances.

    - If the features should be weighted equally in the distance calculations
      but are measured on different scales, it is important to standardize the
      features. One way to do this is to subtract the mean of each column and
      divide by the standard deviation.

    **Locality Sensitive Hashing (LSH)**

    There are several efficient nearest neighbors search algorithms that work well
    for data of low dimension :math:`d` (approximately 50 or fewer). However, most of
    the solutions suffer from either space or query time that is exponential
    in :math:`d`. For large :math:`d`, they often provide little, if any,
    improvement over the 'brute_force' method. This is a well-known consequence
    of the phenomenon called `The Curse of Dimensionality`.

    `Locality Sensitive Hashing (LSH) <https://en.wikipedia.org/wiki/Locality-sensitive_hashing>`_
    is an approach that is designed to efficiently
    solve the *approximate* nearest neighbor search problem for high dimensional
    data. The key idea of LSH is to hash the data points using several hash
    functions, so that the probability of collision is much higher for data
    points which are close to each other than those which are far apart.

    An LSH family is a family of functions :math:`h` which map points from the
    metric space to a bucket, so that

    - if :math:`d(p, q) \\leq R`, then :math:`h(p) = h(q)` with at least probability :math:`p_1`.
    - if :math:`d(p, q) \\geq cR`, then :math:`h(p) = h(q)` with probability at most :math:`p_2`.

    LSH for efficient approximate nearest neighbor search:

    - We define a new family of hash functions :math:`g`, where each
      function :math:`g` is obtained by concatenating :math:`k` functions
      :math:`h_1, ..., h_k`, i.e., :math:`g(p)=[h_1(p),...,h_k(p)]`.
      The algorithm constructs :math:`L` hash tables, each of which
      corresponds to a different randomly chosen hash function :math:`g`.
      There are :math:`k \\cdot L` hash functions used in total.
    - In the preprocessing step, we hash all :math:`n` reference points
      into each of the :math:`L` hash tables.
    - Given a query point :math:`q`, the algorithm iterates over the
      :math:`L` hash functions :math:`g`.
      For each :math:`g` considered, it retrieves the data points that
      are hashed into the same bucket as q.
      These data points from all the :math:`L` hash tables are considered
      as candidates that are then re-ranked by their real distances with the
      query data.

    **Note** that the number of tables :math:`L` and the number of hash
    functions per table :math:`k` are two main parameters.
    They can be set using the options ``num_tables`` and
    ``num_projections_per_table`` respectively.

    Hash functions for different distances:

    - ``euclidean`` and ``squared_euclidean``:
      :math:`h(q) = \\lfloor \\frac{a \\cdot q + b}{w} \\rfloor` where
      :math:`a` is a vector, of which the elements are independently
      sampled from normal distribution, and :math:`b` is a number
      uniformly sampled from :math:`[0, r]`.
      :math:`r` is a parameter for the bucket width. We set :math:`r` using
      the average all-pair `euclidean` distances from a small randomly sampled
      subset of the reference data.
    - ``manhattan``:
      The hash function for ``manhattan`` is similar to that
      of ``euclidean``. The only difference is that the elements of `a`
      are sampled from a Cauchy distribution instead of a normal distribution.
    - ``cosine``: Random Projection is designed to approximate the cosine distance
      between vectors. The hash function is :math:`h(q) = sgn(a \\cdot q)`,
      where :math:`a` is a randomly sampled normal unit vector (see the sketch
      after this list).
    - ``jaccard``:
      We use a recently proposed method one permutation hashing by Shrivastava and Li.
      See the paper `[Shrivastava and Li, UAI 2014]
      <http://www.auai.org/uai2014/proceedings/individuals/225.pdf>`_ for details.
    - ``dot_product``:
      The reference data points are first transformed to
      fixed-norm vectors, and then the minimum ``dot_product`` distance search
      problem can be solved via finding the reference data with smallest
      ``cosine`` distances. See the paper `[Neyshabur and Srebro, ICML 2015]
      <http://jmlr.org/proceedings/papers/v37/neyshabur15.html>`_ for details.
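
    As a rough illustrative sketch in plain numpy (not GraphLab's
    implementation), the ``cosine`` hash above amounts to using the sign
    pattern of :math:`k` random projections as the bucket key:

    >>> import numpy as np
    >>> def cosine_lsh_key(q, A):
    ...     # each row of A is one random projection; the sign pattern is the key
    ...     return tuple((A.dot(q) >= 0).astype(int))
    >>> A = np.random.randn(16, 100)   # k = 16 projections, d = 100 features
    >>> key = cosine_lsh_key(np.random.randn(100), A)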

    References
    ----------
    - `Wikipedia - nearest neighbor
      search <http://en.wikipedia.org/wiki/Nearest_neighbor_search>`_

    - `Wikipedia - ball tree <http://en.wikipedia.org/wiki/Ball_tree>`_

    - Ball tree implementation: Liu, T., et al. (2004) `An Investigation of
      Practical Approximate Nearest Neighbor Algorithms
      <http://papers.nips.cc/paper/2666-an-investigation-of-p
      ractical-approximat e-nearest-neighbor-algorithms>`_. Advances in Neural
      Information Processing Systems pp. 825-832.

    - `Wikipedia - Jaccard distance
      <http://en.wikipedia.org/wiki/Jaccard_index>`_

    - Weighted Jaccard distance: Chierichetti, F., et al. (2010) `Finding the
      Jaccard Median
      <http://theory.stanford.edu/~sergei/papers/soda10-jaccard.pdf>`_.
      Proceedings of the Twenty-First Annual ACM-SIAM Symposium on Discrete
      Algorithms. Society for Industrial and Applied Mathematics.

    - `Wikipedia - Cosine distance
      <http://en.wikipedia.org/wiki/Cosine_similarity>`_

    - `Wikipedia - Levenshtein distance
      <http://en.wikipedia.org/wiki/Levenshtein_distance>`_

    - Locality Sensitive Hashing : Chapter 3 of the book `Mining Massive Datasets
      <http://infolab.stanford.edu/~ullman/mmds/ch3.pdf>`_.

    Examples
    --------
    Construct a nearest neighbors model with automatically determined method and
    distance:

    >>> sf = graphlab.SFrame({'X1': [0.98, 0.62, 0.11],
    ...                       'X2': [0.69, 0.58, 0.36],
    ...                       'str_feature': ['cat', 'dog', 'fossa']})
    >>> model = graphlab.nearest_neighbors.create(sf, features=['X1', 'X2'])

    For datasets with a large number of rows and up to about 100 variables, the
    ball tree method often leads to much faster queries.

    >>> model = graphlab.nearest_neighbors.create(sf, features=['X1', 'X2'],
    ...                                           method='ball_tree')

    Often the final determination of a neighbor is based on several distance
    computations over different sets of features. Each part of this composite
    distance may have a different relative weight.

    >>> my_dist = [[['X1', 'X2'], 'euclidean', 2.],
    ...            [['str_feature'], 'levenshtein', 3.]]
    ...
    >>> model = graphlab.nearest_neighbors.create(sf, distance=my_dist)
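
    To search for approximate neighbors with LSH instead, the method and its
    options described above can be passed explicitly. A sketch using the
    default number of tables and projections for numeric distances:

    >>> model = graphlab.nearest_neighbors.create(sf, features=['X1', 'X2'],
    ...                                           method='lsh', num_tables=20,
    ...                                           num_projections_per_table=8)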
    """

    ## Validate the 'dataset' input
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")


    ## Clean the method options and create the options dictionary
    allowed_kwargs = ['leaf_size', 'num_tables', 'num_projections_per_table']
    _method_options = {}

    for k, v in kwargs.items():
        if k in allowed_kwargs:
            _method_options[k] = v
        else:
            raise _ToolkitError("'{}' is not a valid keyword argument".format(k) +
                                " for the nearest neighbors model. Please " +
                                "check for capitalization and other typos.")


    ## Exclude inappropriate combinations of method and distance
    if method == 'ball_tree' and (distance == 'cosine'
                                  or distance == _graphlab.distances.cosine
                                  or distance == 'dot_product'
                                  or distance == _graphlab.distances.dot_product
                                  or distance == 'transformed_dot_product'
                                  or distance == _graphlab.distances.transformed_dot_product):
        raise TypeError("The ball tree method does not work with 'cosine' " +
                        "'dot_product', or 'transformed_dot_product' distance." +
                        "Please use the 'brute_force' method for these distances.")


    if method == 'lsh' and ('num_projections_per_table' not in _method_options):
        if distance == 'jaccard' or distance == _graphlab.distances.jaccard:
            _method_options['num_projections_per_table'] = 4
        elif distance == 'cosine' or distance == _graphlab.distances.cosine:
            _method_options['num_projections_per_table'] = 16
        else:
            _method_options['num_projections_per_table'] = 8

    ## Initial validation and processing of the label
    _dataset, _label = _tkutl._validate_row_label(dataset, label=label)
    ref_labels = _dataset[_label]


    ## Determine the internal list of available feature names (may still include
    #  the row label name).
    if features is None:
        _features = _dataset.column_names()
    else:
        _features = _copy.deepcopy(features)


    ## Check if there's only one feature and it's the same as the row label.
    #  This would also be trapped by the composite distance validation, but the
    #  error message is not very informative for the user.
    free_features = set(_features).difference([_label])
    if len(free_features) < 1:
        raise _ToolkitError("The only available feature is the same as the " +
                            "row label column. Please specify features " +
                            "that are not also row labels.")


    ### Validate and preprocess the distance function
    ### ---------------------------------------------
    # - The form of the 'distance' controls how we interact with the 'features'
    #   parameter as well.
    # - At this point, the row label 'label' may still be in the list(s) of
    #   features.

    ## Convert any distance function input into a single composite distance.
    # distance is already a composite distance
    if isinstance(distance, list):
        distance = _copy.deepcopy(distance)

    # distance is a single name (except 'auto') or function handle.
    elif (hasattr(distance, '__call__') or
        (isinstance(distance, str) and not distance == 'auto')):
        distance = [[_features, distance, 1]]

    # distance is unspecified and needs to be constructed.
    elif distance is None or distance == 'auto':
        distance = _construct_auto_distance(_features,
                                            _dataset.column_names(),
                                            _dataset.column_types())

    else:
        raise TypeError("Input 'distance' not understood. The 'distance' "
                        " argument must be a string, function handle, or " +
                        "composite distance.")

    ## Basic composite distance validation, remove the row label from all
    #  feature lists, and convert string distance names into distance functions.
    distance = _scrub_composite_distance_features(distance, [_label])
    distance = _convert_distance_names_to_functions(distance)
    _validate_composite_distance(distance)

    ## Raise an error if any component applies Levenshtein distance to more than one column
    for d in distance:
        feature_names, dist, _ = d

        if (len(feature_names) > 1) and (dist == _graphlab.distances.levenshtein):
            raise ValueError("Levenshtein distance cannot be used with multiple " +
                             "columns. Please concatenate strings into a single " +
                             "column before creating the nearest neighbors model.")

    ## Get the union of feature names and make a clean dataset.
    clean_features = _get_composite_distance_features(distance)
    sf_clean = _tkutl._toolkits_select_columns(_dataset, clean_features)


    ## Decide which method to use
    ## - If more than one distance component (specified either directly or
    #  generated automatically because distance set to 'auto'), then do brute
    #  force.
    if len(distance) > 1:
        _method = 'brute_force'

        if method != 'brute_force' and verbose is True:
            print "Defaulting to brute force instead of ball tree because " +\
                "there are multiple distance components."

    else:
        if method == 'auto':

            # get the total number of variables. Assume the number of elements in
            # array type columns does not change
            num_variables = sum([len(x) if hasattr(x, '__iter__') else 1
                for x in sf_clean[0].itervalues()])

            # flag if all the features in the single composite are of numeric
            # type.
            numeric_type_flag = all([x in [int, float, list, array.array]
                for x in sf_clean.column_types()])

            ## Conditions necessary for ball tree to work and be worth it
            if ((distance[0][1] in ['euclidean',
                                    'manhattan',
                                    _graphlab.distances.euclidean,
                                    _graphlab.distances.manhattan])
                    and numeric_type_flag is True
                    and num_variables <= 200):

                _method = 'ball_tree'

            else:
                _method = 'brute_force'

        else:
            _method = method


    ## Pick the right model name for the method
    if _method == 'ball_tree':
        model_name = 'nearest_neighbors_ball_tree'
        _mt._get_metric_tracker().track('toolkit.nearest_neighbors_balltree.create')

    elif _method == 'brute_force':
        model_name = 'nearest_neighbors_brute_force'
        _mt._get_metric_tracker().track('toolkit.nearest_neighbors_brute.create')

    elif _method == 'lsh':
        model_name = 'nearest_neighbors_lsh'
        _mt._get_metric_tracker().track('toolkit.nearest_neighbors_lsh.create')

    else:
        raise ValueError("Method must be 'auto', 'ball_tree', 'brute_force', " +
                         "or 'lsh'.")


    ## Package the model options
    opts = {}
    opts.update(_method_options)
    opts.update(
        {'model_name': model_name,
        'ref_labels': ref_labels,
        'label': label,
        'sf_features': sf_clean,
        'composite_params': distance})

    ## Construct the nearest neighbors model
    if not verbose:
        _mt.main.get_client().set_log_progress(False)

    result = _graphlab.extensions._nearest_neighbors.train(opts)

    _mt.main.get_client().set_log_progress(True)

    model_proxy = result['model']
    model = NearestNeighborsModel(model_proxy)

    return model