def create(dataset, transformers):
    """
    Create a Transformer object to transform data for feature engineering.

    Parameters
    ----------
    dataset : SFrame
        The dataset to use for training the model.

    transformers : Transformer | list[Transformer]
        A Transformer object or a list of Transformers.

    See Also
    --------
    graphlab.toolkits.feature_engineering._feature_engineering._TransformerBase

    Examples
    --------

    .. sourcecode:: python

        # Create data.
        >>> sf = graphlab.SFrame({'a': [1,2,3], 'b' : [2,3,4]})

        >>> from graphlab.toolkits.feature_engineering import FeatureHasher, \
                                               QuadraticFeatures, OneHotEncoder

        # Create a single transformer.
        >>> encoder = graphlab.feature_engineering.create(sf,
                                 OneHotEncoder(max_categories = 10))

        # Create a chain of transformers.
        >>> chain = graphlab.feature_engineering.create(sf, [
                                    QuadraticFeatures(),
                                    FeatureHasher()
                                  ])

        # Create a chain of transformers with names for each of the steps.
        >>> chain = graphlab.feature_engineering.create(sf, [
                                    ('quadratic', QuadraticFeatures()),
                                    ('hasher', FeatureHasher())
                                  ])


    """
    err_msg = "The parameters 'transformers' must be a valid Transformer object."
    cls = transformers.__class__

    _raise_error_if_not_sframe(dataset, "dataset")

    # List of transformers.
    if (cls == list):
        transformers = TransformerChain(transformers)
    # Transformer.
    else:
        if not issubclass(cls, TransformerBase):
            raise TypeError(err_msg)
    # Fit and return
    transformers.fit(dataset)
    return transformers
Example #2
    def fit(self, data):
        """
        Fit a transformer using the SFrame `data`.

        Parameters
        ----------
        data : SFrame
            The data used to fit the transformer.

        Returns
        -------
        self (A fitted version of the object)

        See Also
        --------
        transform, fit_transform

        Examples
        --------
        .. sourcecode:: python

        {examples}
        """

        _raise_error_if_not_sframe(data, "data")
        _mt._get_metric_tracker().track(self.__class__.__module__ + '.fit')
        self.__proxy__.fit(data)
        return self
Example #4
    def transform(self, data):
        """
        Transform the SFrame `data` using a fitted model.

        Parameters
        ----------
        data : SFrame
            The data  to be transformed.

        Returns
        -------
        A transformed SFrame.

        See Also
        --------
        fit, fit_transform

        Examples
        --------
        .. sourcecode:: python

        {examples}

        """
        _raise_error_if_not_sframe(data, "data")
        _mt._get_metric_tracker().track(self.__class__.__module__ +
                                        '.transform')
        return self.__proxy__.transform(data)
Example #5
    def fit(self, data):
        """
        Fit a transformer using the SFrame `data`.

        Parameters
        ----------
        data : SFrame
            The data used to fit the transformer.

        Returns
        -------
        self (A fitted version of the object)

        See Also
        --------
        transform, fit_transform

        Examples
        --------
        .. sourcecode:: python

        {examples}
        """

        _raise_error_if_not_sframe(data, "data")
        _mt._get_metric_tracker().track(self.__class__.__module__ + '.fit')
        self.__proxy__.fit(data)
        return self
Example #6
    def fit_transform(self, data):
        """
        First fit a transformer using the SFrame `data` and then return a
        transformed version of `data`.

        Parameters
        ----------
        data : SFrame
            The data used to fit the transformer. The same data is then also
            transformed.

        Returns
        -------
        Transformed SFrame.

        See Also
        --------
        fit, transform

        Notes
        ------
        - Fit transform modifies self.

        Examples
        --------
        .. sourcecode:: python

        {examples}
        """
        _raise_error_if_not_sframe(data, "data")
        _mt._get_metric_tracker().track(self.__class__.__module__ +
                                        '.fit_transform')
        return self.__proxy__.fit_transform(data)
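A minimal usage sketch of the fit/transform/fit_transform workflow defined by
the three methods above. The toy data and the QuadraticFeatures transformer
are illustrative; any Transformer from graphlab.toolkits.feature_engineering
follows the same pattern.

    >>> import graphlab
    >>> from graphlab.toolkits.feature_engineering import QuadraticFeatures
    >>> sf = graphlab.SFrame({'a': [1, 2, 3], 'b': [2, 3, 4]})
    >>> quad = graphlab.feature_engineering.create(sf, QuadraticFeatures())
    >>> out = quad.transform(sf)        # apply the already-fitted transformer
    >>> out = quad.fit_transform(sf)    # re-fit on sf and transform in one step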
Example #7
    def transform(self, data):
        """
        Transform the SFrame `data` using a fitted model.

        Parameters
        ----------
        data : SFrame
            The data  to be transformed.

        Returns
        -------
        A transformed SFrame.

        See Also
        --------
        fit, fit_transform

        Examples
        --------
        .. sourcecode:: python

        {examples}

        """
        _raise_error_if_not_sframe(data, "data")
        _mt._get_metric_tracker().track(self.__class__.__module__ + '.transform')
        return self.__proxy__.transform(data)
Example #8
    def fit_transform(self, data):
        """
        First fit a transformer using the SFrame `data` and then return a transformed
        version of `data`.

        Parameters
        ----------
        data : SFrame
            The data used to fit the transformer. The same data is then also
            transformed.

        Returns
        -------
        Transformed SFrame.

        See Also
        --------
        fit, transform

        Notes
        ------
        - Fit transform modifies self.

        Examples
        --------
        .. sourcecode:: python

        {examples}
        """
        _raise_error_if_not_sframe(data, "data")
        _mt._get_metric_tracker().track(self.__class__.__module__ + '.fit_transform')
        return self.__proxy__.fit_transform(data)
def create_regression_with_model_selector(dataset, target, model_selector,
    features = None, validation_set='auto', verbose = True):
    """
    Create a :class:`~graphlab.toolkits.SupervisedLearningModel`.

    This is a generic function that allows you to create any model that
    implements SupervisedLearningModel. This function is normally not called
    directly; call the specific model's create function instead.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    target : string
        Name of the column containing the target variable. The values in this
        column must be 0 or 1, of integer type.

    model_selector : function
        Function that, given an SFrame of feature columns, returns the name
        of the model to create.

    features : list[string], optional
        List of feature column names to use for training.

    verbose : boolean
        Whether to print out messages during training.

    """

    # Error checking
    _raise_error_if_not_sframe(dataset, "training dataset")
    if features is None:
        features = dataset.column_names()
        if target in features:
            features.remove(target)
    if not hasattr(features, '__iter__'):
        raise TypeError("Input 'features' must be a list.")
    for x in features:
        if not isinstance(x, str):
            raise TypeError("Invalid feature %s: Feature names must be of type str" % x)

    # Sample the data
    features_sframe = _toolkits_select_columns(dataset, features)
    if features_sframe.num_rows() > 1e5:
        fraction = 1.0 * 1e5 / features_sframe.num_rows()
        features_sframe = features_sframe.sample(fraction, seed = 0)

    # Run the model selector.
    selected_model_name = model_selector(features_sframe)
    model = create_selected(selected_model_name, dataset, target, features, validation_set, verbose)

    return model
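# A hypothetical sketch of the 'model_selector' contract assumed by
# create_regression_with_model_selector above: the selector receives an SFrame
# containing only the (possibly subsampled) feature columns and returns the
# name of the model to create. The size thresholds and model names below are
# illustrative assumptions, not GraphLab's actual selection logic.
def _example_model_selector(feature_sframe):
    """Pick a regression model name using a simple size heuristic."""
    num_rows = feature_sframe.num_rows()
    num_cols = feature_sframe.num_columns()
    # Small, narrow data: a plain linear model is usually sufficient.
    if num_rows < 10000 and num_cols < 50:
        return 'linear_regression'
    # Otherwise fall back to a tree ensemble.
    return 'boosted_trees_regression'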
Example #10
def create_regression_with_model_selector(dataset, target, model_selector,
    features = None, validation_set=None, verbose = True):
    """
    Create a :class:`~graphlab.toolkits.SupervisedLearningModel`.

    This is a generic function that allows you to create any model that
    implements SupervisedLearningModel. This function is normally not called
    directly; call the specific model's create function instead.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    target : string
        Name of the column containing the target variable. The values in this
        column must be 0 or 1, of integer type.

    model_selector : function
        Function that, given an SFrame of feature columns, returns the name
        of the model to create.

    features : list[string], optional
        List of feature column names to use for training.

    verbose : boolean
        Whether to print out messages during training.

    """

    # Error checking
    _raise_error_if_not_sframe(dataset, "training dataset")
    if features is None:
        features = dataset.column_names()
        if target in features:
            features.remove(target)
    if not hasattr(features, '__iter__'):
        raise TypeError("Input 'features' must be a list.")
    for x in features:
        if not isinstance(x, str):
            raise TypeError("Invalid feature %s: Feature names must be of type str" % x)

    # Sample the data
    features_sframe = _toolkits_select_columns(dataset, features)
    if features_sframe.num_rows() > 1e5:
        fraction = 1.0 * 1e5 / features_sframe.num_rows()
        features_sframe = features_sframe.sample(fraction, seed = 0)

    # Run the model selector.
    selected_model_name = model_selector(features_sframe)
    model = create_selected(selected_model_name, dataset, target, features, validation_set, verbose)

    return model
    def extract_features(self, dataset):
        """
        Use the mined patterns to convert itemsets to binary vectors.

        For each itemset in ``dataset``, extract_features returns a vector of
        binary indicator variables, marking which mined patterns contain the
        itemset.

        Parameters
        ----------
        dataset : SFrame
            A dataset that has the same columns that were used during training.
            If the item column exists in ``dataset`` it will be ignored
            while making predictions.

        Returns
        -------
        out : SFrame
            An SFrame of extracted features.
            The SFrame contains a row for each unique transaction in
            ``dataset``. Each row consists of the 'features' columns together
            with:

            * extracted_features - an array.array of binary indicator variables.

        See Also
        --------
        predict

        Examples
        --------
        .. sourcecode:: python

            >>> features = model.extract_features(bakery_sf)
            >>> features
            Data:
            +---------+-------------------------------+
            | Receipt |       extracted_features      |
            +---------+-------------------------------+
            |  21855  | [0.0, 1.0, 0.0, 0.0, 0.0, ... |
            |  63664  | [0.0, 0.0, 0.0, 0.0, 0.0, ... |
            |   7899  | [0.0, 0.0, 0.0, 0.0, 0.0, ... |
            |  25263  | [0.0, 0.0, 0.0, 0.0, 0.0, ... |
            |  30621  | [0.0, 0.0, 0.0, 0.0, 0.0, ... |
            |  43116  | [0.0, 0.0, 0.0, 1.0, 0.0, ... |
            |  27112  | [0.0, 0.0, 0.0, 0.0, 1.0, ... |
            |  26319  | [0.0, 1.0, 0.0, 0.0, 0.0, ... |
            |  26439  | [0.0, 0.0, 0.0, 0.0, 0.0, ... |
            |  62361  | [0.0, 0.0, 0.0, 0.0, 0.0, ... |
            +---------+-------------------------------+
            [75000 rows x 2 columns]
        """
        _mt._get_metric_tracker().track(
            'toolkits.frequent_pattern_mining.extract_features')
        _raise_error_if_not_sframe(dataset, "dataset")
        return self.__proxy__.extract_features(dataset)
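A toy, pure-Python sketch of the binary indicator idea behind extract_features
above. Here a mined pattern counts as a match when it is a subset of the
transaction's itemset; the exact matching rule is defined by the frequent
pattern mining backend, so treat this purely as an illustration.

    >>> patterns = [{'HotCoffee'}, {'AppleTart'}, {'HotCoffee', 'AppleTart'}]
    >>> transaction = {'HotCoffee', 'CherrySoda'}
    >>> [1.0 if p <= transaction else 0.0 for p in patterns]
    [1.0, 0.0, 0.0]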
    def evaluate(self, dataset, metric="auto",
                 missing_value_action='auto', options={}, **kwargs):
        """
        Evaluate the model by making predictions of target values and comparing
        these to actual values.

        Parameters
        ----------
        dataset : SFrame
            Dataset in the same format used for training. The column names and
            types of the dataset must be the same as that used in training.

        metric : str, list[str]
            Evaluation metric(s) to be computed.

        missing_value_action: str, optional
            Action to perform when missing values are encountered. This can be
            one of:

            - 'auto': Choose a model dependent missing value policy.
            - 'impute': Proceed with evaluation by filling in the missing
                        values with the mean of the training data. Missing
                        values are also imputed if an entire column of data is
                        missing during evaluation.
            - 'none': Treat missing values as is. The model must be able to
                      handle missing values.
            - 'error' : Do not proceed with prediction and terminate with
                        an error message.

        options : dict
            Additional options to be passed to the prediction step.

        kwargs : dict
            Additional options to be passed to the prediction step.
        """
        if missing_value_action == 'auto':
            missing_value_action = select_default_missing_value_policy(
                                                             self, 'evaluate')

        _raise_error_if_not_sframe(dataset, "dataset")
        options = options.copy()
        options.update(kwargs)

        options.update({'model': self.__proxy__,
                        'dataset': dataset,
                        'model_name': self.__name__,
                        'missing_value_action': missing_value_action,
                        'metric': metric
                        })
        results = _graphlab.toolkits._main.run(
                'supervised_learning_evaluate', options)
        return _map_unity_proxy_to_object(results)
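A hedged usage sketch for evaluate above. The model variable, the test data,
and the 'rmse' metric name are assumed for illustration; the metrics actually
available depend on the specific supervised learning model.

    >>> test_sf = graphlab.SFrame({'target': [1.2, 3.4], 'x': [0.5, 0.7]})
    >>> results = model.evaluate(test_sf)                        # default metrics
    >>> results = model.evaluate(test_sf, metric='rmse',
    ...                          missing_value_action='impute')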
    def classify(self, dataset, missing_value_action='auto'):
        """
        Return predictions for ``dataset``, using the trained supervised_learning
        model. Predictions are generated as class labels (0 or
        1).

        Parameters
        ----------
        dataset: SFrame
            Dataset of new observations. Must include columns with the same
            names as the features used for model training, but does not require
            a target column. Additional columns are ignored.

        missing_value_action: str, optional
            Action to perform when missing values are encountered. This can be
            one of:

            - 'auto': Choose model dependent missing value action
            - 'impute': Proceed with evaluation by filling in the missing
              values with the mean of the training data. Missing
              values are also imputed if an entire column of data is
              missing during evaluation.
            - 'error': Do not proceed with prediction and terminate with
              an error message.

        Returns
        -------
        out : SFrame
            An SFrame with model predictions.
        """
        if (missing_value_action == 'auto'):
            missing_value_action = select_default_missing_value_policy(self, 'classify')

        # Low latency path
        if isinstance(dataset, list):
            return _graphlab.extensions._fast_classify(self.__proxy__, dataset,
                    missing_value_action)
        if isinstance(dataset, dict):
            return _graphlab.extensions._fast_classify(self.__proxy__, [dataset],
                    missing_value_action)

        _raise_error_if_not_sframe(dataset, "dataset")
        options = {}
        options.update({'model': self.__proxy__,
                        'model_name': self.__name__,
                        'dataset': dataset,
                        'missing_value_action': missing_value_action,
                        })
        target = _graphlab.toolkits._main.run('supervised_learning_classify', options)
        return _map_unity_proxy_to_object(target['classify'])
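A usage sketch for classify above, including the low-latency path that accepts
a single dict or a list of dicts instead of an SFrame. The model and feature
names are assumed for illustration.

    >>> predictions = model.classify(test_sf)                          # SFrame input
    >>> predictions = model.classify({'height': 20, 'weight': 30})     # one record
    >>> predictions = model.classify([{'height': 20, 'weight': 30},
    ...                               {'height': 25, 'weight': 28}])   # small batch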
    def predict(self, dataset, missing_value_action = 'error',
                                      output_type='', options = {},
                                      **kwargs):
        """
        Return predictions for ``dataset``, using the trained supervised_learning
        model. Predictions are generated as class labels (0 or
        1).

        Parameters
        ----------
        dataset : SFrame
            Dataset of new observations. Must include columns with the same
            names as the features used for model training, but does not require
            a target column. Additional columns are ignored.

        missing_value_action : str, optional
            Action to perform when missing values are encountered. This can be
            one of:

            - 'impute': Proceed with evaluation by filling in the missing
                        values with the mean of the training data. Missing
                        values are also imputed if an entire column of data is
                        missing during evaluation.
            - 'error' : Do not proceed with prediction and terminate with
                        an error message.

        output_type : str, optional
            Output type that may be needed by some of the toolkits.

        options : dict
            Additional options to be passed to the prediction step.

        kwargs : dict
            Additional options to be passed to the prediction step.

        Returns
        -------
        out : SArray
            An SArray with model predictions.
        """

        _raise_error_if_not_sframe(dataset, "dataset")

        options = options.copy()
        options.update(kwargs)
        options.update({'model': self.__proxy__,
                        'model_name': self.__name__,
                        'dataset': dataset,
                        'missing_value_action' : missing_value_action,
                        'output_type' : output_type
                        })

        target = _graphlab.toolkits._main.run('supervised_learning_predict', options)
        return _map_unity_proxy_to_object(target['predicted'])
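A usage sketch for predict above. The model, the test data, and the
'probability' output type are assumed for illustration; supported output types
vary by model.

    >>> yhat = model.predict(test_sf, missing_value_action='impute')
    >>> yhat = model.predict(test_sf, output_type='probability')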
Example #18
    def fit(self, data):
        """
        Fits the transformer using the given data.
        """

        _raise_error_if_not_sframe(data, "data")

        fitted_state = {}
        feature_columns = _internal_utils.get_column_names(
            data, self._exclude, self._features)

        if not feature_columns:
            raise RuntimeError(
                "No valid feature columns specified in transformation.")

        fitted_state['features'] = feature_columns
        fitted_state['fitted'] = True

        self.__proxy__.update(fitted_state)

        return self
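# A hypothetical sketch of the column-selection behaviour assumed from
# _internal_utils.get_column_names in the fit method above: when 'exclude' is
# False the listed features are kept, and when it is True every column except
# the listed ones is kept. This is an illustration of the assumed contract,
# not GraphLab's actual implementation.
def _example_get_column_names(data, exclude, features):
    if features is None:
        return data.column_names()
    if exclude:
        return [c for c in data.column_names() if c not in features]
    return [c for c in data.column_names() if c in features]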
def create(dataset, target, features=None, distance=None, verbose=True):
    """
    Create a
    :class:`~graphlab.nearest_neighbor_classifier.NearestNeighborClassifier`.
    This model predicts the class of a query point by finding the most common
    class among the query's nearest neighbors.

    .. warning::

        The 'dot_product' distance is deprecated and will be removed in future
        versions of GraphLab Create. Please use 'transformed_dot_product'
        distance instead, although note that this is more than a name change; it
        is a *different* transformation of the dot product of two vectors.
        Please see the distances module documentation for more details.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    target : string
        Name of the column containing the target variable. The values in this
        column must be of string or integer type.

    features : list[string], optional
        Name of the columns with features to use in comparing records. 'None'
        (the default) indicates that all columns except the target variable
        should be used. Please note: if `distance` is specified as a composite
        distance, then that parameter controls which features are used in the
        model. Each column can be one of the following types:

        - *Numeric*: values of numeric type integer or float.

        - *Array*: array of numeric (integer or float) values. Each array
          element is treated as a separate variable in the model.

        - *Dictionary*: key-value pairs with numeric (integer or float) values.
          Each key indicates a separate variable in the model.

        - *String*: string values.

        Please note: if `distance` is specified as a composite distance, then
        that parameter controls which features are used in the model.

    distance : string, function, or list[list], optional
        Function to measure the distance between any two input data rows. This
        may be one of two types:

        - *String*: the name of a standard distance function. One of
          'euclidean', 'squared_euclidean', 'manhattan', 'levenshtein',
          'jaccard', 'weighted_jaccard', 'cosine', 'dot_product' (deprecated),
          or 'transformed_dot_product'. Please see the
          :mod:`~graphlab.toolkits.distances` module for more details.

        - *Function*: a function handle from the
          :mod:`~graphlab.toolkits.distances` module. Please see the
          documentation for that module for specific distance functions.

        - *Composite distance*: the weighted sum of several standard distance
          functions applied to various features. This is specified as a list of
          distance components, each of which is itself a list containing three
          items:

          1. list or tuple of feature names (strings)

          2. standard distance name (string)

          3. scaling factor (int or float)

        Note that for sparse vectors, missing keys are assumed to have value
        0.0. If distance is left unspecified or set to 'auto', then a composite
        distance is constructed automatically based on feature types.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : NearestNeighborClassifier
        A trained model of type
        :class:`~graphlab.nearest_neighbor_classifier.NearestNeighborClassifier`.

    See Also
    --------
    NearestNeighborClassifier
    graphlab.toolkits.nearest_neighbors
    graphlab.toolkits.distances

    References
    ----------
    - `Wikipedia - nearest neighbors classifier
      <http://en.wikipedia.org/wiki/Nearest_neighbour_classifiers>`_

    - Hastie, T., Tibshirani, R., Friedman, J. (2009). `The Elements of
      Statistical Learning <http://statweb.stanford.edu/~tibs/ElemStatLearn/>`_.
      Vol. 2. New York. Springer. pp. 463-481.

    Examples
    --------
    >>> sf = graphlab.SFrame({'species': ['cat', 'dog', 'fossa', 'dog'],
    ...                       'height': [9, 25, 20, 23],
    ...                       'weight': [13, 28, 33, 22]})
    ...
    >>> model = graphlab.nearest_neighbor_classifier.create(sf, target='species')

    As with the nearest neighbors toolkit, the nearest neighbor classifier
    accepts composite distance functions.

    >>> my_dist = [[('height', 'weight'), 'euclidean', 2.7],
    ...            [('height', 'weight'), 'manhattan', 1.6]]
    ...
    >>> model = graphlab.nearest_neighbor_classifier.create(sf, target='species',
    ...                                                     distance=my_dist)
    """

    ## Set up
    ## ------
    _mt._get_metric_tracker().track('toolkit.classifier.nearest_neighbor_classifier.create')
    start_time = _time.time()


    ## Validation and preprocessing
    ## ----------------------------

    ## 'dataset' must be a non-empty SFrame
    _raise_error_if_not_sframe(dataset, "dataset")
    _raise_error_if_sframe_empty(dataset, "dataset")


    ## 'target' must be a string, in 'dataset', and the type of the target must
    #  be string or integer.
    if not isinstance(target, str) or target not in dataset.column_names():
        raise _ToolkitError("The 'target' parameter must be the name of a "
                            "column in the input dataset.")

    if not dataset[target].dtype() == str and not dataset[target].dtype() == int:
        raise TypeError("The target column must contain integers or strings.")


    ## Warn that 'None' values in the target may lead to ambiguous predictions.
    if dataset[target].num_missing() > 0:
        _logging.warning("Missing values detected in the target column. This " +
                         "may lead to ambiguous 'None' predictions, if the " +
                         "'radius' parameter is set too small in the prediction, " +
                         "classification, or evaluation methods.")


    ## convert features and distance arguments into a composite distance
    ## NOTE: this is done here instead of in the nearest neighbors toolkit
    #  because the automatic distance construction may be different for the two
    #  toolkits.
    if features is None:
        _features = [x for x in dataset.column_names() if x != target]
    else:
        _features = [x for x in features if x != target]


    if isinstance(distance, list):
        distance = _copy.deepcopy(distance)

    elif (hasattr(distance, '__call__') or
        (isinstance(distance, str) and not distance == 'auto')):
        distance = [[_features, distance, 1]]

    elif distance is None or distance == 'auto':
        col_types = {k: v for k, v in zip(dataset.column_names(),
                                          dataset.column_types())}
        distance = _construct_auto_distance(_features, col_types)

    else:
        raise TypeError("Input 'distance' not understood. The 'distance' " +
                        "parameter must be a string or a composite distance, " +
                        " or left unspecified.")


    ## Construct and query the nearest neighbors model
    ## -----------------------------------------------
    knn_model = _gl.nearest_neighbors.create(dataset, label=target,
                                             distance=distance,
                                             verbose=verbose)


    ## Postprocessing and formatting
    ## -----------------------------
    model = NearestNeighborClassifier(knn_model)
    model._state['verbose'] = verbose
    model._state['distance'] = knn_model['distance']
    model._state['num_distance_components'] = knn_model['num_distance_components']
    model._state['num_examples'] = dataset.num_rows()
    model._state['features'] = knn_model['features']
    model._state['target'] = target
    model._state['num_classes'] = len(dataset[target].unique())
    model._state['num_features'] = knn_model['num_features']
    model._state['num_unpacked_features'] = knn_model['num_unpacked_features']
    model._state['training_time'] = _time.time() - start_time

    return model
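# A hypothetical sketch of the behaviour assumed from _construct_auto_distance
# in the create function above: build one composite-distance component per
# feature, choosing a standard distance name by column type. The per-type
# choices here are illustrative assumptions, not GraphLab's actual rules.
def _example_construct_auto_distance(features, column_types):
    composite = []
    for ftr in features:
        ctype = column_types[ftr]
        if ctype == str:
            composite.append([[ftr], 'levenshtein', 1])
        elif ctype == dict:
            composite.append([[ftr], 'jaccard', 1])
        else:  # numeric columns and numeric arrays
            composite.append([[ftr], 'euclidean', 1])
    return composite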
Example #20
    def predict_topk(self,
                     dataset,
                     max_neighbors=10,
                     radius=None,
                     k=3,
                     verbose=False):
        """
        Return top-k most likely predictions for each observation in
        ``dataset``. Predictions are returned as an SFrame with three columns:
        `row_id`, `class`, and `probability`.

        Parameters
        ----------
        dataset : SFrame
            Dataset of new observations. Must include the features used for
            model training, but does not require a target column. Additional
            columns are ignored.

        max_neighbors : int, optional
            Maximum number of neighbors to consider for each point.

        radius : float, optional
            Maximum distance from each point to a neighbor in the reference
            dataset.

        k : int, optional
            Number of classes to return for each input example.

        Returns
        -------
        out : SFrame

        See Also
        ----------
        create, classify, predict

        Notes
        -----
        - If the 'radius' parameter is small, it is possible that a query point
          has no neighbors in the training dataset. In this case, the query is
          dropped from the SFrame output by this method. If all queries have no
          neighbors, then the result is an empty SFrame. If the target column in
          the training dataset has missing values, these predictions will be
          ambiguous.

        - Ties between predicted classes are broken randomly.

        Examples
        --------
        >>> sf_train = graphlab.SFrame({'species': ['cat', 'dog', 'fossa', 'dog'],
        ...                             'height': [9, 25, 20, 23],
        ...                             'weight': [13, 28, 33, 22]})
        ...
        >>> sf_new = graphlab.SFrame({'height': [26, 19],
        ...                           'weight': [25, 35]})
        ...
        >>> m = graphlab.nearest_neighbor_classifier.create(sf_train, target='species')
        >>> ystar = m.predict_topk(sf_new, max_neighbors=2)
        >>> print ystar
        +--------+-------+-------------+
        | row_id | class | probability |
        +--------+-------+-------------+
        |   0    |  dog  |     1.0     |
        |   1    | fossa |     0.5     |
        |   1    |  dog  |     0.5     |
        +--------+-------+-------------+
        """
        _mt._get_metric_tracker().track(
            'toolkit.classifier.nearest_neighbor_classifier.predict_topk')

        ## Validate the number of results to return. Note that the
        #  'max_neighbors' and 'radius' parameters are validated by the nearest
        #  neighbor model's query method.
        if not isinstance(k, int) or k < 1:
            raise TypeError(
                "The number of results to return for each point, " +
                "'k', must be an integer greater than 0.")

        ## Validate the query dataset.
        _raise_error_if_not_sframe(dataset, "dataset")
        _raise_error_if_sframe_empty(dataset, "dataset")

        ## Validate neighborhood parameters 'max_neighbors'.
        # - NOTE: when the parameter name is changed in nearest neighbors, the
        #   query call will do this itself, and this block can be removed.
        if max_neighbors is not None:
            if not isinstance(max_neighbors, int):
                raise ValueError("Input 'max_neighbors' must be an integer.")

            if max_neighbors <= 0:
                raise ValueError(
                    "Input 'max_neighbors' must be larger than 0.")

        ## Find the nearest neighbors for each query and count the number of
        #  votes for each class.
        knn = self._knn_model.query(dataset,
                                    k=max_neighbors,
                                    radius=radius,
                                    verbose=verbose)

        ## If there are *no* results for *any* query make an empty SFrame.
        if knn.num_rows() == 0:
            ystar = _gl.SFrame({'row_id': [], 'class': [], 'probability': []})
            ystar['row_id'] = ystar['row_id'].astype(int)
            ystar['class'] = ystar['class'].astype(str)

        else:
            ## Find the classes with the top-k vote totals
            grp = knn.groupby(['query_label', 'reference_label'],
                              _gl.aggregate.COUNT)

            ystar = grp.unstack(column=['reference_label', 'Count'],
                                new_column_name='votes')

            ystar['topk'] = ystar['votes'].apply(
                lambda x: _sort_topk_votes(x, k))
            ystar['total_votes'] = ystar['votes'].apply(
                lambda x: sum(x.values()))

            ## Re-stack, unpack, and rename the results
            ystar = ystar.stack('topk', new_column_name='topk')
            ystar = ystar.unpack('topk')
            ystar.rename({'topk.class': 'class', 'query_label': 'row_id'})
            ystar['probability'] = ystar['topk.votes'] / ystar['total_votes']
            ystar = ystar[['row_id', 'class', 'probability']]

        return ystar
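# A hypothetical sketch of the helper assumed by predict_topk above:
# _sort_topk_votes takes the per-query dict of {class: vote count} produced by
# the groupby/unstack step and returns the k classes with the most votes as a
# list of {'class': ..., 'votes': ...} dicts, which is the shape the later
# stack/unpack calls expect. Tie-breaking order is left unspecified here.
def _example_sort_topk_votes(votes, k):
    top = sorted(votes.items(), key=lambda kv: kv[1], reverse=True)[:k]
    return [{'class': label, 'votes': count} for label, count in top]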
    def predict_topk(self, dataset, k=5):
        """
        Use the trained model to obtain top-k predictions for the most
        confident rules given a partial set of observations described in the
        ``dataset``.

        Parameters
        ----------
        dataset : SFrame
            A dataset that has the same columns that were used during training.
            If the item column exists in ``dataset`` it will be ignored
            while making predictions.

        k : int, optional
            Number of predictions to return for each input example.

        Returns
        -------
        out : SFrame
            An SFrame with the top scoring association rules for each itemset
            in the dataset.
            The SFrame contains a row for each unique transaction in
            ``dataset``. Each row consists of the 'features' columns together
            with:

            * prefix - the 'antecedent' or 'left-hand side' of an association
              rule. It must be a frequent itemset and a subset of the
              associated itemset.
            * prediction - the 'consequent' or 'right-hand side' of the
              association rule. It must be disjoint from the prefix.
            * confidence - the confidence of the association rule, defined as:
              ``confidence(prefix => prediction) = Support(prefix U prediction) / Support(prefix)``
            * prefix support - the frequency of the 'prefix' itemset in the
              training data
            * prediction support - the frequency of the 'prediction' itemset in
              the training data
            * joint support - the frequency of the co-occurrence
              ('prefix' + 'prediction') in the training data

            If there does not exist ``k`` valid association rules for an
            itemset, then ``predict_topk`` will return as many valid rules
            as possible.


        See Also
        --------
        get_frequent_patterns, extract_features, predict

        Notes
        -----
        Prediction can be slow when max_patterns is set to a large value because
        there are more rules to consider for predictions.

        References
        ----------

        - `Wikipedia - Association Rule Learning
          <https://en.wikipedia.org/wiki/Association_rule_learning>`_
        - Han, Jiawei, Micheline Kamber, and Jian Pei. Data Mining: Concepts
          and Techniques. Elsevier, 2011.

        Examples
        --------
        .. sourcecode:: python

            # For an SFrame
            >>> predictions = model.predict_topk(bakery_sf, k = 5)

            Columns:
                Receipt int
                prefix  list
                prediction  list
                confidence  float
                prefix support  int
                joint support   int

            Rows: 13283

            Data:
            +---------+-----------------+-------------------------------+-----------------+
            | Receipt |      prefix     |           prediction          |    confidence   |
            +---------+-----------------+-------------------------------+-----------------+
            |    13   |   [CherrySoda]  |         [AppleDanish]         |  0.352077687444 |
            |    13   |   [CherrySoda]  |          [AppleTart]          |  0.349593495935 |
            |    13   |   [CherrySoda]  |        [AppleCroissant]       |  0.349141824752 |
            |    13   |   [CherrySoda]  | [AppleCroissant, AppleDanish] |  0.302619692864 |
            |    13   |   [CherrySoda]  |  [AppleCroissant, AppleTart]  |  0.301942186089 |
            |    42   | [ChocolateTart] |      [VanillaFrappuccino]     |  0.461889374644 |
            |    42   | [ChocolateTart] |         [WalnutCookie]        |  0.367990876259 |
            |    42   | [ChocolateTart] | [WalnutCookie, VanillaFrap... |  0.323322562251 |
            |    42   |        []       |         [CoffeeEclair]        |  0.104013695516 |
            |    42   |        []       |          [HotCoffee]          | 0.0976340461956 |
            +---------+-----------------+-------------------------------+-----------------+
            +----------------+---------------+
            | prefix support | joint support |
            +----------------+---------------+
            |      4428      |      1559     |
            |      4428      |      1548     |
            |      4428      |      1546     |
            |      4428      |      1340     |
            |      4428      |      1337     |
            |      5261      |      2430     |
            |      5261      |      1936     |
            |      5261      |      1701     |
            |     74769      |      7777     |
            |     74769      |      7300     |
            +----------------+---------------+
            [13283 rows x 7 columns]

            # For a single itemset, e.g. ['HotCoffee', 'VanillaEclair']
            >>> new_itemset = gl.SFrame({'Receipt': [-1, -1],
                                         'Item': ['HotCoffee', 'VanillaEclair']})
            >>> model.predict_topk(new_itemset, k = 3)

            Data:
            +---------+-------------+-------------------------------+----------------+----------------+
            | Receipt |    prefix   |           prediction          |     score      | prefix support |
            +---------+-------------+-------------------------------+----------------+----------------+
            |    -1   | [HotCoffee] |       [ApricotCroissant]      | 0.344545454545 |      7700      |
            |    -1   | [HotCoffee] |        [BlueberryTart]        | 0.341298701299 |      7700      |
            |    -1   | [HotCoffee] | [BlueberryTart, ApricotCro... | 0.31974025974  |      7700      |
            +---------+-------------+-------------------------------+----------------+----------------+
            +---------------+
            | joint support |
            +---------------+
            |      2653     |
            |      2628     |
            |      2462     |
            +---------------+
            [3 rows x 7 columns]
        """

        _mt._get_metric_tracker().track(
                      'toolkits.frequent_pattern_mining.predict_topk')
        _raise_error_if_not_sframe(dataset, "dataset")
        score_function = "confidence" # For now, we only support confidence
        return self.__proxy__.predict_topk(dataset, score_function, k)
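A small worked example of the confidence formula quoted above, using the
support counts from the first row of the example output (prefix support 4428,
joint support 1559):

    >>> prefix_support = 4428
    >>> joint_support = 1559
    >>> round(joint_support / float(prefix_support), 6)
    0.352078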
Example #22
    def extract_features(self, dataset, layer_id=None):
        """
        Takes an input dataset, propagates each example through the network,
        and returns an SArray of dense feature vectors, each of which is the
        concatenation of all the hidden unit values at layer[layer_id]. These
        feature vectors can be used as input to train another classifier such
        as a :py:class:`~graphlab.logistic_classifier.LogisticClassifier`, an
        :py:class:`~graphlab.svm_classifier.SVMClassifier`, another
        :py:class:`~graphlab.neuralnet_classifier.NeuralNetClassifier`, or a
        :py:class:`~graphlab.boosted_trees_classifier.BoostedTreesClassifier`.
        The input dataset must have the same size as the data used to train
        the model, except for images, which are automatically resized.


        We are also releasing a pre-trained model for ImageNet, as described by
        Alex Krizhevsky et. al. It is located at
        http://s3.amazonaws.com/dato-datasets/deeplearning/imagenet_model_iter45 .
        Using it requires 256 x 256 x 3 images.
        Please see Examples and References for more.


        Parameters
        ----------
        dataset : SFrame
            Dataset of new observations. Must include columns with the same
            names as the features used for model training, but does not require
            a target column. Additional columns are ignored.

        layer_id : int , optional
            The index of the layer in neuralnet at which the activations are
            taken to be a dense feature vector. Must be a fully-connected layer.
            Default is None, in which case the layer before the connection
            layer to the output is used.


        Returns
        -------
        out : SArray
            An SArray of dtype array.array containing extracted features.

        See Also
        ------------
        graphlab.deeplearning.layers

        References
        ----------
        - Krizhevsky, Alex, Ilya Sutskever, and Geoffrey E. Hinton. "Imagenet
          classification with deep convolutional neural networks." Advances in
          neural information processing systems. 2012.

        Examples
        --------
        >>> data = graphlab.SFrame('http://s3.amazonaws.com/dato-datasets/mnist/sframe/train6k')
        >>> net = graphlab.deeplearning.get_builtin_neuralnet('mnist')
        >>> m = graphlab.neuralnet_classifier.create(data,
        ...                                          target='label',
        ...                                          network=net,
        ...                                          max_iterations=3)
        >>> # Now, let's extract features from the last layer
        >>> data['features'] = m.extract_features(data)
        >>> # Now, let's build a new classifier on top of extracted features
        >>> m = graphlab.classifier.create(data,
        ...                                          features = ['features'],
        ...                                          target='label')

        Now, let's see how to load the ImageNet model, and use it for extracting
        features after resizing the data:

        >>> imagenet_model = graphlab.load_model('http://s3.amazonaws.com/dato-datasets/deeplearning/imagenet_model_iter45')
        >>> data['image'] = graphlab.image_analysis.resize(data['image'], 256, 256, 3)
        >>> data['imagenet_features'] = imagenet_model.extract_features(data)

        """
        _mt._get_metric_tracker().track('toolkit.classifier.neuralnet_classifier.extract_features')
        _raise_error_if_not_sframe(dataset, "dataset")
        options = dict()

        net = self.get('network').layers
        network_size = len(net) - 1
        if layer_id is None:
            if net[network_size]._type == "CONNECTION":
                layer_id = network_size - 1
            else:
                layer_id = network_size - 2
        _numeric_param_check_range("layer_id", layer_id, 0, network_size)

        conv2flat = False
        for i in range(0, layer_id + 1):
            if net[i]._type == "CONNECTION" or net[i]._type == "TRANSITION":
                conv2flat = True

        if conv2flat is not True:
            raise ValueError("Features must be extracted from either a network "
                    "with non-image input or a layer after a FlattenLayer. "
                    "Try extracting features from layer following a FlattenLayer.")

        options.update({'model': self.__proxy__,
                        'model_name': self.__name__,
                        'dataset': dataset,
                        'layer_id': layer_id})
        target = _toolkits_main.run('supervised_learning_feature_extraction', options)
        return _map_unity_proxy_to_object(target['extracted'])
def make_sgraph(vertex_sframe,
                edge_sframe,
                output_path,
                vid_field,
                src_field,
                dst_field,
                num_partitions=8,
                _distributed='auto'):
    """
    Make an SGraph from the input vertex and edge SFrames, save the graph to
    output_path, and return the graph.

    Parameters
    ----------
    vertex_sframe : SFrame
        SFrame of vertex data

    edge_sframe : SFrame
        SFrame of edge data

    output_path : str
        Path where the final graph is saved to.

    vid_field : str
        Column name of vertex id in the vertex sframe.

    src_field : str
        Column name of source vertex id in the edge sframe.

    dst_field : str
        Column name of target vertex id in the edge sframe.

    num_partitions : int
        Number of partitions for the final sgraph.

    Returns
    -------
    out : SGraph
        The constructed graph.
    """
    if type(vid_field) is not str:
        raise TypeError('vid_field must be str')
    if type(src_field) is not str:
        raise TypeError('src_field must be str')
    if type(dst_field) is not str:
        raise TypeError('dst_field must be str')

    # Infer the vid type
    vid_type = None
    if (vertex_sframe is not None and len(vertex_sframe) > 0):
        vid_type = vertex_sframe[vid_field].dtype()
    elif (edge_sframe is not None and len(edge_sframe) > 0):
        vid_type = edge_sframe[src_field].dtype()
    else:
        vid_type = int

    # Create empty edge sframe if input is dummy
    if (edge_sframe is None or len(edge_sframe) == 0):
        edge_sframe = gl.SFrame()
        edge_sframe['__src_id'] = gl.SArray([], vid_type)
        edge_sframe['__dst_id'] = gl.SArray([], vid_type)
        src_field = '__src_id'
        dst_field = '__dst_id'

    # Create empty vertex sframe if input is dummy
    if (vertex_sframe is None or len(vertex_sframe) == 0):
        vertex_sframe = gl.SFrame()
        vertex_sframe['__id'] = gl.SArray([], vid_type)
        vid_field = '__id'

    _raise_error_if_not_sframe(vertex_sframe, "vertex_data")
    _raise_error_if_not_sframe(edge_sframe, "edge_data")

    if vid_field not in vertex_sframe.column_names():
        raise ValueError('Column %s not found in vertex_data' % vid_field)
    if src_field not in edge_sframe.column_names():
        raise ValueError('Column %s not found in edge_data' % src_field)
    if dst_field not in edge_sframe.column_names():
        raise ValueError('Column %s not found in edge_data' % dst_field)

    output_path = _make_internal_url(output_path)

    opts = {'vertex_data': vertex_sframe,
            'edge_data': edge_sframe,
            'output_path': output_path,
            'vid_field': vid_field,
            'src_field': src_field,
            'dst_field': dst_field,
            'num_partitions': num_partitions}

    run('distributed_graph_ingress', opts, env=_distributed)
    from graphlab.data_structures.sgraph import load_sgraph
    return load_sgraph(output_path)
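A hedged usage sketch for make_sgraph above. The column names, the toy data,
and the output path are illustrative.

    >>> import graphlab as gl
    >>> verts = gl.SFrame({'id': [1, 2, 3]})
    >>> edges = gl.SFrame({'src': [1, 2], 'dst': [2, 3]})
    >>> g = make_sgraph(verts, edges, '/tmp/example_sgraph',
    ...                 vid_field='id', src_field='src', dst_field='dst',
    ...                 num_partitions=4)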
Example #24
    def link(self, dataset, k=5, radius=None, verbose=True):
        """
        Find matching records from the reference dataset (entered when the model
        was created) for each record in the 'dataset' passed to this function.
        The query dataset must include columns with the same names as the label
        and feature columns used to create the RecordLinker
        model.

        Parameters
        ----------
        dataset : SFrame
            Query data. Must contain columns with the same names and types as
            the features used to train the model. Additional columns are
            allowed, but ignored. Please see the nearest neighbors
            :func:`~graphlab.nearest_neighbors.create` documentation for more
            detail on allowable data types.

        k : int, optional
            Maximum number of nearest neighbors to return from the reference set
            for each query observation. The default is 5, but setting it to
            ``None`` will return all neighbors within ``radius`` of the query
            point.

        radius : float, optional
            Only neighbors whose distance to a query point is smaller than this
            value are returned. The default is ``None``, in which case the ``k``
            nearest neighbors are returned for each query point, regardless of
            distance.

        verbose : bool, optional
            If True, print progress updates and model details.

        Returns
        -------
        out : SFrame
            An SFrame with the k-nearest neighbors of each query observation.
            The result contains four columns: the first is the row label of the
            query observation, the second is the row label of the nearby
            reference observation, the third is the distance between the query
            and reference observations, and the fourth is the rank of the
            reference observation among the query's k-nearest neighbors.

        Notes
        -----
        - If both ``k`` and ``radius`` are set to ``None``, each query point
          returns all of the reference set. If the reference dataset has
          :math:`n` rows and the query dataset has :math:`m` rows, the output is
          an SFrame with :math:`nm` rows.

        Examples
        --------
        Assume we've created the model from the example in the RecordLinker
        'create' function.

        >>> queries = graphlab.SFrame({'sqft': [986, 1320],
        ...                            'street': ['fremont', 'phiney'],
        ...                            'city': ['sea', 'seattle'],
        ...                            'state': ['WA', 'WA']})
        ...
        >>> model.link(queries, k=2, radius=5.)
        +-------------+-----------------+----------+------+
        | query_label | reference_label | distance | rank |
        +-------------+-----------------+----------+------+
        |      0      |        0        |   4.0    |  1   |
        |      0      |        2        |   5.0    |  2   |
        |      1      |        0        |   0.0    |  1   |
        +-------------+-----------------+----------+------+
        """
        _mt._get_metric_tracker().track(self.__module__ + '.link_records')

        ## Validate the 'dataset' input.
        _tkutl._raise_error_if_not_sframe(dataset, "dataset")
        _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

        ## Make sure all of the necessary features are present at 'link' time.
        sf_features = _tkutl._toolkits_select_columns(dataset, 
                                                      self.get('features'))

        ## Clean and impute string data. *** Think about consolidating this and
        #  the next step into a feature transformer.***
        col_types = {k: v for k, v in zip(dataset.column_names(),
                                          dataset.column_types())}
        _dataset = _copy.copy(dataset)
        _distance = _copy.deepcopy(self._state['distance'])

        for ftr in self.get('features'):
            if col_types[ftr] == str:
                new_ftr = '__clean.' + ftr
                _dataset[new_ftr] = _dataset[ftr].fillna("")
                _dataset[new_ftr] = _dataset[new_ftr].apply(
                    lambda x: _dmutl.cleanse_string(x), dtype=str)

                for dist_comp in _distance:
                    dist_comp[0] = [new_ftr if x == ftr else x for x in dist_comp[0]]


        ## Convert strings to dicts and concatenate string features.
        _dataset, _ = _engineer_distance_features(_dataset, _distance)


        ## Query the nearest neighbor model
        result = self._knn_model.query(_dataset, k=k, radius=radius,
                                      verbose=verbose)
        return result
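
# A small illustrative sketch of the string-cleaning step inside 'link' above.
# Assumptions: the composite-distance layout [[feature_list, distance_name,
# weight], ...] and a cleaned copy of each string feature stored under a
# '__clean.' prefix, as in the method body; the column names are hypothetical.
distance = [[['street', 'city'], 'levenshtein', 1.0],
            [['sqft'], 'euclidean', 1.0]]
ftr = 'city'
new_ftr = '__clean.' + ftr
for dist_comp in distance:
    dist_comp[0] = [new_ftr if x == ftr else x for x in dist_comp[0]]
# distance is now [[['street', '__clean.city'], 'levenshtein', 1.0],
#                  [['sqft'], 'euclidean', 1.0]]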
Example #25
def create(dataset,
           features=None,
           distance=None,
           radius=1.,
           min_core_neighbors=10,
           verbose=True):
    """
    Create a DBSCAN clustering model. The DBSCAN method partitions the input
    dataset into three types of points, based on the estimated probability
    density at each point.

    - **Core** points have a large number of points within a given neighborhood.
      Specifically, `min_core_neighbors` must be within distance `radius` of a
      point for it to be considered a core point.

    - **Boundary** points are within distance `radius` of a core point, but
      don't have sufficient neighbors of their own to be considered core.

    - **Noise** points comprise the remainder of the data. These points have too
      few neighbors to be considered core points, and are further than distance
      `radius` from all core points.

    Clusters are formed by connecting core points that are neighbors of each
    other, then assigning boundary points to their nearest core neighbor's
    cluster.

    Parameters
    ----------
    dataset : SFrame
        Training data, with each row corresponding to an observation. Must
        include all features specified in the `features` parameter, but may have
        additional columns as well.

    features : list[str], optional
        Names of the columns with features to use in comparing records. 'None'
        (the default) indicates that all columns of the input `dataset` should
        be used to train the model. All features must be numeric, i.e. integer
        or float types.

    distance : str or list[list], optional
        Function to measure the distance between any two input data rows. This
        may be one of two types:

        - *String*: the name of a standard distance function. One of
          'euclidean', 'squared_euclidean', 'manhattan', 'levenshtein',
          'jaccard', 'weighted_jaccard', 'cosine', 'dot_product' (deprecated),
          or 'transformed_dot_product'.

        - *Composite distance*: the weighted sum of several standard distance
          functions applied to various features. This is specified as a list of
          distance components, each of which is itself a list containing three
          items (see the short example below):

          1. list or tuple of feature names (str)

          2. standard distance name (str)

          3. scaling factor (int or float)

        For more information about GraphLab Create distance functions, please
        see the :py:mod:`~graphlab.toolkits.distances` module.

        For sparse vectors, missing keys are assumed to have value 0.0.

        If 'distance' is left unspecified, a composite distance is constructed
        automatically based on feature types.

    radius : int or float, optional
        Size of each point's neighborhood, with respect to the specified
        distance function.

    min_core_neighbors : int, optional
        Number of neighbors that must be within distance `radius` of a point in
        order for that point to be considered a "core point" of a cluster.

    verbose : bool, optional
        If True, print progress updates and model details during model creation.

    Returns
    -------
    out : DBSCANModel
        A model containing a cluster label for each row in the input `dataset`.
        Also contains the indices of the core points, cluster boundary points,
        and noise points.

    See Also
    --------
    DBSCANModel, graphlab.toolkits.distances

    Notes
    -----
    - Our implementation of DBSCAN first computes the similarity graph on the
      input dataset, which can be a computationally intensive process. In the
      current implementation, some distances are substantially faster than
      others; in particular "euclidean", "squared_euclidean", "cosine", and
      "transformed_dot_product" are quite fast, while composite distances can be
      slow.

    - Any distance function in the GraphLab Create library may be used with
      DBSCAN, but the results may be poor for distances that violate the
      standard metric
      properties, i.e. symmetry, non-negativity, triangle inequality, and
      identity of indiscernibles. In particular, the DBSCAN algorithm is based
      on the concept of connecting high-density points that are *close* to each
      other into a single cluster, but the notion of *close* may be very
      counterintuitive if the chosen distance function is not a valid metric.
      The distances "euclidean", "manhattan", "jaccard", and "levenshtein" will
      likely yield the best results.

    References
    ----------
    - Ester, M., et al. (1996) `A Density-Based Algorithm for Discovering
      Clusters in Large Spatial Databases with Noise
      <http://www.aaai.org/Papers/KDD/1996/KDD96-037>`_. In Proceedings of the
      Second International Conference on Knowledge Discovery and Data Mining.
      pp. 226-231.

    - `Wikipedia - DBSCAN <https://en.wikipedia.org/wiki/DBSCAN>`_

    - `Visualizing DBSCAN Clustering
      <http://www.naftaliharris.com/blog/visualizing-dbscan-clustering/>`_

    Examples
    --------
    >>> sf = graphlab.SFrame({
    ...     'x1': [0.6777, -9.391, 7.0385, 2.2657, 7.7864, -10.16, -8.162,
    ...            8.8817, -9.525, -9.153, 2.0860, 7.6619, 6.5511, 2.7020],
    ...     'x2': [5.6110, 8.5139, 5.3913, 5.4743, 8.3606, 7.8843, 2.7305,
    ...            5.1679, 6.7231, 3.7051, 1.7682, 7.4608, 3.1270, 6.5624]})
    ...
    >>> model = graphlab.dbscan.create(sf, radius=4.25, min_core_neighbors=3)
    >>> model['cluster_id'].print_rows(15)
    +--------+------------+----------+
    | row_id | cluster_id |   type   |
    +--------+------------+----------+
    |   8    |     0      |   core   |
    |   7    |     2      |   core   |
    |   0    |     1      |   core   |
    |   2    |     2      |   core   |
    |   3    |     1      |   core   |
    |   11   |     2      |   core   |
    |   4    |     2      |   core   |
    |   1    |     0      | boundary |
    |   6    |     0      | boundary |
    |   5    |     0      | boundary |
    |   9    |     0      | boundary |
    |   12   |     2      | boundary |
    |   10   |     1      | boundary |
    |   13   |     1      | boundary |
    +--------+------------+----------+
    [14 rows x 3 columns]
    """

    ## Start the training time clock and instantiate an empty model
    _mt._get_metric_tracker().track('{}.create'.format(__name__))
    logger = _logging.getLogger(__name__)
    start_time = _time.time()

    ## Validate the input dataset
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    ## Validate neighborhood parameters
    if not isinstance(min_core_neighbors, int) or min_core_neighbors < 0:
        raise ValueError("Input 'min_core_neighbors' must be a non-negative " +
                         "integer.")

    if not isinstance(radius, (int, float)) or radius < 0:
        raise ValueError("Input 'radius' must be a non-negative integer " +
                         "or float.")

    ## Compute all-point nearest neighbors within `radius` and count
    #  neighborhood sizes
    knn_model = _gl.nearest_neighbors.create(dataset,
                                             features=features,
                                             distance=distance,
                                             method='brute_force',
                                             verbose=verbose)

    knn = knn_model.similarity_graph(k=None,
                                     radius=radius,
                                     include_self_edges=False,
                                     output_type='SFrame',
                                     verbose=verbose)

    neighbor_counts = knn.groupby('query_label', _agg.COUNT)

    ### NOTE: points with NO neighbors are already dropped here!

    ## Identify core points and boundary candidate points. Not all of the
    #  boundary candidates will be boundary points - some are in small isolated
    #  clusters.
    if verbose:
        logger.info("Identifying noise points and core points.")

    boundary_mask = neighbor_counts['Count'] < min_core_neighbors
    core_mask = 1 - boundary_mask

    # this includes too small clusters
    boundary_idx = neighbor_counts[boundary_mask]['query_label']
    core_idx = neighbor_counts[core_mask]['query_label']

    ## Build a similarity graph on the core points
    ## NOTE: careful with singleton core points - the second filter removes them
    #  from the edge set so they have to be added separately as vertices.
    if verbose:
        logger.info("Constructing the core point similarity graph.")

    core_vertices = knn.filter_by(core_idx, 'query_label')
    core_edges = core_vertices.filter_by(core_idx, 'reference_label')

    core_graph = _gl.SGraph()
    core_graph = core_graph.add_vertices(core_vertices[['query_label']],
                                         vid_field='query_label')
    core_graph = core_graph.add_edges(core_edges,
                                      src_field='query_label',
                                      dst_field='reference_label')

    ## Compute core point connected components and relabel to be consecutive
    #  integers
    cc = _gl.connected_components.create(core_graph, verbose=verbose)
    cc_labels = cc['component_size'].add_row_number('__label')
    core_assignments = cc['component_id'].join(cc_labels,
                                               on='component_id',
                                               how='left')[['__id', '__label']]
    core_assignments['type'] = 'core'

    ## Join potential boundary points to core cluster labels (points that aren't
    #  really on a boundary are implicitly dropped)
    if verbose:
        logger.info("Processing boundary points.")

    boundary_edges = knn.filter_by(boundary_idx, 'query_label')

    # separate real boundary points from points in small isolated clusters
    boundary_core_edges = boundary_edges.filter_by(core_idx, 'reference_label')

    # join a boundary point to its single closest core point.
    boundary_assignments = boundary_core_edges.groupby(
        'query_label',
        {'reference_label': _agg.ARGMIN('rank', 'reference_label')})

    boundary_assignments = boundary_assignments.join(
        core_assignments, on={'reference_label': '__id'})

    boundary_assignments = boundary_assignments.rename({'query_label': '__id'})
    boundary_assignments = boundary_assignments.remove_column(
        'reference_label')
    boundary_assignments['type'] = 'boundary'

    ## Identify boundary candidates that turned out to be in small clusters but
    #  not on real cluster boundaries
    small_cluster_idx = set(boundary_idx).difference(
        boundary_assignments['__id'])

    ## Identify individual noise points by the fact that they have no neighbors.
    noise_idx = set(range(dataset.num_rows())).difference(
        neighbor_counts['query_label'])

    noise_idx = noise_idx.union(small_cluster_idx)

    noise_assignments = _gl.SFrame(
        {'row_id': _gl.SArray(list(noise_idx), int)})
    noise_assignments['cluster_id'] = None
    noise_assignments['cluster_id'] = noise_assignments['cluster_id'].astype(
        int)
    noise_assignments['type'] = 'noise'

    ## Append core, boundary, and noise results to each other.
    master_assignments = _gl.SFrame()
    num_clusters = 0

    if core_assignments.num_rows() > 0:
        core_assignments = core_assignments.rename({
            '__id': 'row_id',
            '__label': 'cluster_id'
        })
        master_assignments = master_assignments.append(core_assignments)
        num_clusters = len(core_assignments['cluster_id'].unique())

    if boundary_assignments.num_rows() > 0:
        boundary_assignments = boundary_assignments.rename({
            '__id': 'row_id',
            '__label': 'cluster_id'
        })
        master_assignments = master_assignments.append(boundary_assignments)

    if noise_assignments.num_rows() > 0:
        master_assignments = master_assignments.append(noise_assignments)

    ## Post-processing and formatting
    state = {
        'verbose': verbose,
        'radius': radius,
        'min_core_neighbors': min_core_neighbors,
        'distance': knn_model['distance'],
        'num_distance_components': knn_model['num_distance_components'],
        'num_examples': dataset.num_rows(),
        'features': knn_model['features'],
        'num_features': knn_model['num_features'],
        'unpacked_features': knn_model['unpacked_features'],
        'num_unpacked_features': knn_model['num_unpacked_features'],
        'cluster_id': master_assignments,
        'num_clusters': num_clusters,
        'training_time': _time.time() - start_time
    }

    return DBSCANModel(state)
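
# A toy sketch of the core/boundary split used in 'create' above. Assumes
# graphlab is imported as _gl and 'neighbor_counts' has the same columns
# ('query_label', 'Count') as the groupby result in the function body.
neighbor_counts = _gl.SFrame({'query_label': [0, 1, 2, 3],
                              'Count': [12, 3, 15, 1]})
min_core_neighbors = 10
boundary_mask = neighbor_counts['Count'] < min_core_neighbors
core_mask = 1 - boundary_mask
core_idx = neighbor_counts[core_mask]['query_label']          # rows 0 and 2
boundary_idx = neighbor_counts[boundary_mask]['query_label']  # rows 1 and 3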
    def predict(self, dataset, output_type='cluster_id', verbose=True):
        """
        Return predicted cluster label for instances in the new 'dataset'.
        K-means predictions are made by assigning each new instance to the
        closest cluster center.

        Parameters
        ----------
        dataset : SFrame
            Dataset of new observations. Must include the features used for
            model training; additional columns are ignored.

        output_type : {'cluster_id', 'distance'}, optional
            Form of the prediction. 'cluster_id' (the default) returns the
            cluster label assigned to each input instance, while 'distance'
            returns the Euclidean distance between the instance and its
            assigned cluster's center.

        verbose : bool, optional
            If True, print progress updates to the screen.

        Returns
        -------
        out : SArray
            Model predictions. Depending on the specified `output_type`, either
            the assigned cluster label or the distance of each point to its
            closest cluster center. The order of the predictions is the same as
            order of the input data rows.

        See Also
        --------
        create

        Examples
        --------
        >>> sf = graphlab.SFrame({
        ...     'x1': [0.6777, -9.391, 7.0385, 2.2657, 7.7864, -10.16, -8.162,
        ...            8.8817, -9.525, -9.153, 2.0860, 7.6619, 6.5511, 2.7020],
        ...     'x2': [5.6110, 8.5139, 5.3913, 5.4743, 8.3606, 7.8843, 2.7305,
        ...            5.1679, 6.7231, 3.7051, 1.7682, 7.4608, 3.1270, 6.5624]})
        ...
        >>> model = graphlab.kmeans.create(sf, num_clusters=3)
        ...
        >>> sf_new = graphlab.SFrame({'x1': [-5.6584, -1.0167, -9.6181],
        ...                           'x2': [-6.3803, -3.7937, -1.1022]})
        >>> clusters = model.predict(sf_new, output_type='cluster_id')
        >>> print clusters
        [1, 0, 1]
        """
        _mt._get_metric_tracker().track('toolkit.kmeans.predict')

        ## Validate the input dataset.
        _tkutl._raise_error_if_not_sframe(dataset, "dataset")
        _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

        ## Validate the output type.
        if not isinstance(output_type, str):
            raise TypeError("The 'output_type' parameter must be a string.")

        if output_type not in ('cluster_id', 'distance'):
            raise ValueError("The 'output_type' parameter must be either " +
                             "'cluster_id' or 'distance'.")

        ## Get model features.
        ref_features = self.get('features')
        sf_features = _tkutl._toolkits_select_columns(dataset, ref_features)

        ## Compute predictions.
        opts = {'model': self.__proxy__,
                'model_name': self.__name__,
                'dataset': sf_features}

        result = _gl.toolkits._main.run('kmeans_predict', opts, verbose)
        sf_result = _gl.SFrame(None, _proxy=result['predictions'])

        if output_type == 'distance':
            return sf_result['distance']
        else:
            return sf_result['cluster_id']
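
# Hedged usage sketch of the 'distance' output type documented above, reusing
# 'model' and 'sf_new' from the docstring example.
distances = model.predict(sf_new, output_type='distance')
# 'distances' is an SArray with each row's Euclidean distance to its assigned
# cluster center, in the same order as the rows of sf_new.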
Example #27
def create(dataset, tag_name=None, features=None, verbose=True):
    """
    Create a :class:`NearestNeighborAutoTagger`
    model, which can be used to quickly apply tags from a reference set of text
    labels to a new query set using the ``tag`` method.

    Parameters
    ----------
    dataset : SFrame
        Reference data. This SFrame must contain at least one column. By
        default, only the ``tag_name`` column is used as the basis for
        tagging. You may optionally include additional columns with the
        ``features`` parameter.

    tag_name : string, optional
        Name of the column in ``dataset`` with the tags. This column must
        contain string values. If ``dataset`` contains more than one column,
        ``tag_name`` must be specified.

    features : list[string], optional
        Names of the columns with features to use as the basis for tagging.
        'None' (the default) indicates that only the column specified by the
        ``tag_name`` parameter should be used. Only str or list fields are
        allowed. If a column of type list is specified, all values must be
        either of type string or convertible to type string.

    verbose : bool, optional
        If True, print verbose output during model creation.

    Returns
    -------
    out : model
        A model for quickly tagging new query observations with entries from
        `dataset`. Currently, the only implementation is the following:

        - NearestNeighborAutoTagger

    See Also
    --------
    graphlab.nearest_neighbors.NearestNeighborsModel

    Examples
    --------
    First construct a toy `SFrame` of actor names, which will serve as the
    reference set for our autotagger model.

    >>> actors_sf = gl.SFrame(
            {"actor": ["Will Smith", "Tom Hanks", "Bradley Cooper",
                       "Tom Cruise", "Jude Law", "Robert Pattinson",
                       "Matt Damon", "Brad Pitt", "Johnny Depp",
                       "Leonardo DiCaprio", "Jennifer Aniston",
                       "Jessica Alba", "Emma Stone", "Cameron Diaz",
                       "Scarlett Johansson", "Mila Kunis", "Julia Roberts",
                       "Charlize Theron", "Marion Cotillard",
                       "Angelina Jolie"]})
    >>> m = gl.data_matching.nearest_neighbor_autotagger.create(
                actors_sf, tag_name="actor")

    Then we load some IMDB movie reviews into an `SFrame` and tag them using
    the model we created above. The score field in the output is a
    similarity score, indicating the strength of the match between the query
    data and the suggested reference tag.

    >>> reviews_sf = gl.SFrame(
            "https://static.turi.com/datasets/imdb_reviews/reviews.sframe")
    >>> m.tag(reviews_sf.head(10), query_name="review", verbose=False)
    +-----------+-------------------------------+------------------+-----------------+
    | review_id |             review            |      actor       |      score      |
    +-----------+-------------------------------+------------------+-----------------+
    |     0     | Story of a man who has unn... |   Cameron Diaz   | 0.0769230769231 |
    |     0     | Story of a man who has unn... |  Angelina Jolie  | 0.0666666666667 |
    |     0     | Story of a man who has unn... | Charlize Theron  |      0.0625     |
    |     0     | Story of a man who has unn... | Robert Pattinson | 0.0588235294118 |
    |     1     | Bromwell High is a cartoon... |   Jessica Alba   |      0.125      |
    |     1     | Bromwell High is a cartoon... | Jennifer Aniston |       0.1       |
    |     1     | Bromwell High is a cartoon... | Charlize Theron  |       0.05      |
    |     1     | Bromwell High is a cartoon... | Robert Pattinson |  0.047619047619 |
    |     1     | Bromwell High is a cartoon... | Marion Cotillard |  0.047619047619 |
    |     2     | Airport '77 starts as a br... |  Julia Roberts   | 0.0961538461538 |
    |    ...    |              ...              |       ...        |       ...       |
    +-----------+-------------------------------+------------------+-----------------+

    The initial results look a little noisy. To filter out obvious spurious
    matches, we can set the `tag` method's similarity_threshold parameter.

    >>> m.tag(reviews_sf.head(1000), query_name="review", verbose=False,
              similarity_threshold=.8)
    +-----------+-------------------------------+------------------+----------------+
    | review_id |             review            |      actor       |     score      |
    +-----------+-------------------------------+------------------+----------------+
    |    341    | I caught this film at a te... |  Julia Roberts   | 0.857142857143 |
    |    657    | Fairly funny Jim Carrey ve... | Jennifer Aniston | 0.882352941176 |
    |    668    | A very funny movie. It was... | Jennifer Aniston | 0.833333333333 |
    |    673    | This film is the best film... | Jennifer Aniston |     0.9375     |
    +-----------+-------------------------------+------------------+----------------+

    In this second example, you'll notice that the ``review_id`` column is much
    more sparse. This is because all results whose score was below the specified
    similarity threshold (.8) were excluded from the output.

    """
    # validate the 'dataset' input
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    # ensure that tag_name is provided if dataset has > 1 column
    if dataset.num_cols() > 1 and not tag_name:
        raise _ToolkitError("No tag_name parameter specified on dataset " \
                            "with %d columns" % dataset.num_cols())
    tag_name = tag_name or dataset.column_names()[0]

    # ensure that column with name tag_name exists
    if tag_name not in dataset.column_names():
        raise _ToolkitError('No column named "%s" in dataset' % tag_name)

    # ensure that column is of type string
    if dataset[tag_name].dtype() != str:
        raise TypeError("The column used as the tag name must be of type " \
                        "string.")

    # use reasonable default for general case
    distance = _gl.distances.weighted_jaccard

    # if additional features are specified, ensure they are of appropriate types
    if features and (not isinstance(features, list) or
                     not all([isinstance(x, str) for x in features])):
        raise TypeError("The feature parameter must be a list of strings " \
                        "and those strings must correspond to columns in " \
                        "`dataset`.")

    # at a minimum, this SFrame will contain the tags as features;
    features = features or []
    features = [tag_name] + [x for x in features if x != tag_name]

    # ensure that each specified feature column is either of type list or str
    column_names = set(dataset.column_names())
    for col_name in features:
        if col_name not in column_names:
            raise _ToolkitError("Specified feature column (%s) not found " \
                                "in dataset" % col_name)

        if dataset.select_column(col_name).dtype() not in (str, list):
            raise TypeError("Only string and list columns are allowed as " \
                            "features.")

    # concatenate the feature columns into a single column
    features_sf = dataset.select_columns(features)
    feature_col, features_sf = _concat_string_features(features_sf, features)

    # compute features
    if verbose:
        _logging.getLogger().info("Extracting features...")

    features = _preprocess(features_sf.select_column(feature_col))

    # group by tag_name to ensure that tags are unique
    feature_cols = features.column_names()
    select_cols = {col_name: _gl.aggregate.SELECT_ONE(col_name) for col_name \
                   in feature_cols}
    features.add_column(dataset[tag_name], tag_name)
    features = features.groupby(tag_name, select_cols)

    # create nearest neighbors model
    m = _gl.nearest_neighbors.create(features,
                                     label=tag_name,
                                     distance=distance,
                                     features=feature_cols,
                                     verbose=verbose)

    # add standard toolkit state attributes
    state = {
        "nearest_neighbors_model": m,
        "training_time": m.get("training_time"),
        "tag_name": tag_name,
        "verbose": verbose,
        "num_examples": len(features),
        "features": feature_cols,
        "num_features": len(feature_cols),
        "distance": m.get("distance")
    }

    model = NearestNeighborAutoTagger(state)
    return model
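
# Illustrative sketch of the tag de-duplication step above: group the
# engineered features by the tag column and keep one row per tag with
# SELECT_ONE. Column names here are hypothetical; graphlab is assumed to be
# imported as _gl.
features = _gl.SFrame({'actor': ['Tom Hanks', 'Tom Hanks', 'Emma Stone'],
                       '3grams': [{'tom': 1}, {'tom': 1}, {'emm': 1}]})
select_cols = {col: _gl.aggregate.SELECT_ONE(col)
               for col in features.column_names() if col != 'actor'}
unique_tags = features.groupby('actor', select_cols)  # one row per unique tag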
Example #28
def create(datasets, row_label=None, features=None, grouping_features=None,
           distance=None, k=2, radius=None, verbose=True):
    """
    Create a deduplication model based on nearest neighbors and SGraph connected
    components.

    This method creates a :class:`NearestNeighborDeduplication` model by
    constructing a nearest neighbors similarity graph on all of the rows in the
    input 'datasets', then using the connected components tool in the
    :mod:`~graphlab.toolkits.graph_analytics` module to assign an entity label
    to each record. Records which share the same label are considered to be
    duplicates.

    .. warning::

        The 'dot_product' distance is deprecated and will be removed in future
        versions of GraphLab Create. Please use 'transformed_dot_product'
        distance instead, although note that this is more than a name change; it
        is a *different* transformation of the dot product of two vectors.
        Please see the distances module documentation for more details.

    Parameters
    ----------
    datasets : SFrame or list[SFrame] or dict(string: SFrame)
        Input datasets. Each SFrame in the list must include all of the features
        specified in the `features` or 'distance' parameters, but may
        have additional columns as well. SFrames can be input as values in a
        dictionary, where the keys are strings used in the output to identify
        the SFrame from which each record originated.

    row_label : string, optional
        Name of the SFrame column with row labels. If not specified, row numbers
        are used to identify rows in the output.

    features : list[string], optional
        Names of the columns with features to use in comparing records. 'None'
        (the default) indicates the intersection of columns over all SFrames in
        `datasets` should be used (except the label column, if specified). Each
        column can be one of the following types:

        - *Numeric*: values of numeric type integer or float.

        - *Array*: array of numeric (integer or float) values. Each array
          element is treated as a separate variable in the model.

        - *Dictionary*: key-value pairs with numeric (integer or float) values.
          Each key indicates a separate variable in the model.

        - *String*: string values.

        Please note: if `distance` is specified as a composite distance, then
        that parameter controls which features are used in the model. Any
        additional columns named in 'features' will be included in the model
        output but not used for distance computations.

    grouping_features : list[string], optional
        Names of features to use in grouping records before finding approximate
        matches. These columns must have string or integer type data. See the
        Notes section for more details on grouping.

    distance : string or list[list], optional
        Function to measure the distance between any two input data rows. This
        may be one of two types:

        - *String*: the name of a standard distance function. One of
          'euclidean', 'squared_euclidean', 'manhattan', 'levenshtein',
          'jaccard', 'weighted_jaccard', 'cosine', 'dot_product' (deprecated),
          or 'transformed_dot_product'.

        - *Composite distance*: the weighted sum of several standard distance
          functions applied to various features. This is specified as a list of
          distance components, each of which is itself a list containing three
          items:

          1. list or tuple of feature names (strings)

          2. standard distance name (string)

          3. scaling factor (int or float)

        For more information about GraphLab Create distance functions, please
        see the :py:mod:`~graphlab.toolkits.distances` module.

        For sparse vectors, missing keys are assumed to have value 0.0.

        If 'distance' is left unspecified or set to 'auto', a composite distance
        is constructed automatically based on feature types.

    k : int, optional
        Number of neighbors to consider for each point.

    radius : float, optional
        Maximum distance from each point to a potential duplicate.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : NearestNeighborDeduplication model
        The NearestNeighborDeduplication object contains a field 'entities'
        which shows the entity label for each input record. It also shows the
        features for each record that are used to construct the model, as well
        as the original SFrame and row label for each record. If the original
        `datasets` are passed in a list, the SFrame identifier is the index of
        the SFrame in that list.

    See Also
    --------
    NearestNeighborDeduplication, graphlab.toolkits.nearest_neighbors,
    graphlab.SFrame.groupby

    Notes
    -----
    - Standardizing features is often a good idea with distance-based methods,
      but this model does *not* standardize features.

    - For datasets with more than about 10,000 records, *grouping* (also known
      as *blocking*) is a critical step to avoid computing distances between all
      pairs of records. The grouping step simply assigns each record to a group
      that has identical values for all `grouping_features`, and only looks for
      duplicates within each group.

    - Records with missing data in the `grouping_features` are removed from
      consideration as duplicates. These records are given the entity label
      "None".

    - For tasks that require *only* exact matches on certain features, it is
      generally more natural to use the SFrame `groupby` function.

    - For features that all have the same type, the distance parameter may be a
      single standard distance function name (e.g. "euclidean"). In the model,
      however, all distances are first converted to composite distance
      functions; as a result, the 'distance' field in the model is always a
      composite distance.

    References
    ----------
    - Christen, Peter. "Data matching: concepts and techniques for record
      linkage, entity resolution, and duplicate detection." Springer Science &
      Business Media, 2012.

    Examples
    --------
    >>> sf1 = graphlab.SFrame({'id': [0, 1, 2],
    ...                        'x0': [0.5, 0.5, 0.3],
    ...                        'x1': [1., 0.8, 0.6],
    ...                        'city': ['seattle', 'olympia', 'boston'],
    ...                        'state': ['WA', 'WA', 'MA']})
    ...
    ... # note: misspellings in the following dataset do not prevent correct
    ... # matches.
    >>> sf2 = graphlab.SFrame({'id': [9, 10],
    ...                        'x0': [0.35, 0.4],
    ...                        'x1': [0.65, 0.8],
    ...                        'city': ['bostan', 'seatle'],
    ...                        'state': ['MA', 'WA']})
    ...
    >>> dist = [[('city',), 'levenshtein', 2],
    ...         [('x0', 'x1'), 'euclidean', 1.5]]
    ...
    >>> m = graphlab.nearest_neighbor_deduplication.create({'a': sf1, 'b': sf2},
    ...                                                    row_label='id',
    ...                                                    grouping_features=['state'],
    ...                                                    distance=dist, k=None,
    ...                                                    radius=3)
    ...
    >>> print m['entities']
    +----------+----+----------+-------+------+---------+------+
    | __sframe | id | __entity | state |  x0  |   city  |  x1  |
    +----------+----+----------+-------+------+---------+------+
    |    a     | 1  |    0     |   WA  | 0.5  | olympia | 0.8  |
    |    a     | 0  |    1     |   WA  | 0.5  | seattle | 1.0  |
    |    b     | 10 |    1     |   WA  | 0.4  |  seatle | 0.8  |
    |    a     | 2  |    2     |   MA  | 0.3  |  boston | 0.6  |
    |    b     | 9  |    2     |   MA  | 0.35 |  bostan | 0.65 |
    +----------+----+----------+-------+------+---------+------+
    [5 rows x 7 columns]
    """

    ## Set up
    _mt._get_metric_tracker().track('{}.create'.format(__name__))
    start_time = _time.time()

    model = NearestNeighborDeduplication()
    model.__proxy__['verbose'] = verbose
    model.__proxy__['k'] = k
    model.__proxy__['radius'] = radius


    ### ----------------------------- ###
    ### Validation and preprocessing ###
    ### ----------------------------- ###

    ### Validate input datasets
    ### -----------------------

    ## If datasets is already a dict, check the keys are all strings
    if isinstance(datasets, dict):
        if not(all([isinstance(x, str) for x in datasets.keys()])):
            raise ValueError("Keys in the 'datasets' dict must be strings.")

    ## Convert singleton SFrame dataset into a list of datasets
    if isinstance(datasets, _gl.SFrame):
        _raise_error_if_sframe_empty(datasets, "dataset")
        datasets = {0: datasets}

    ## Convert a list of SFrames into a dict
    if isinstance(datasets, list):
        datasets = {k: sf for k, sf in enumerate(datasets)}

    ## At this point, 'datasets' must be dict. If it's not, something is wrong.
    if not isinstance(datasets, dict):
        raise TypeError("Input 'datasets' must be an SFrame, a list of SFrames, " +
                        "or a dictionary of (string, SFrame) pairs.")

    model.__proxy__['num_datasets'] = len(datasets)

    ## Ensure that all datasets are SFrames
    for d in datasets.values():
        _raise_error_if_not_sframe(d, "dataset")


    ### Validate row label
    ### ------------------

    ## Validate the label column
    if row_label:
        if not isinstance(row_label, str):
            raise TypeError("The 'row_label' parameter must be the name (string " +
                            "type) of a column in each of the input datasets.")

        for d in datasets.values():
            if row_label not in d.column_names():
                raise _ToolkitError("The specified row_label column does not " +
                                    " exist in all input datasets.")
    else:
        row_label = 'row_number'

        for d in datasets.values():
            if row_label in d.column_names():
                raise _ToolkitError("Input 'row_label' defaulted to " +
                                    "'row_number', which is already a column" +
                                    " in at least one input dataset. Please " +
                                    "specify a row label column manually.")

    model.__proxy__['row_label'] = row_label


    ### Validate 'features' and 'grouping_features' parameters
    ### ------------------------------------------------------
    if features is not None:
        if not hasattr(features, '__iter__'):
            raise TypeError("Input 'features' must be a list.")

        if not all([isinstance(x, str) for x in features]):
            raise TypeError("Input 'features' must contain only strings.")

    if grouping_features is not None:
        if not hasattr(grouping_features, '__iter__'):
            raise TypeError("Input 'grouping_features' must be a list.")

        if not all([isinstance(x, str) for x in grouping_features]):
            raise TypeError("Input 'grouping_features' must contain only strings.")


    ### Validate and preprocess the distance function
    ### ---------------------------------------------
    # - The form of the 'distance' controls how we interact with the 'features'
    #   parameter as well.

    ## Find the intersection of all feature sets and feature types
    col_types = {k: v for k, v in zip(list(datasets.values())[0].column_names(),
                                      list(datasets.values())[0].column_types())}

    all_features = [sf.column_names() for sf in datasets.values()]
    ftr_intersection = list(set(all_features[0]).intersection(*all_features))
    ftr_intersection = [x for x in ftr_intersection if x != row_label]


    ## Convert features and distance arguments into a composite distance.
    if isinstance(distance, list):
        distance = _copy.deepcopy(distance)

    elif isinstance(distance, str):
        if features is not None:
            distance = [[features, distance, 1]]
        else:
            distance = [[ftr_intersection, distance, 1]]

    elif distance is None:
        if features is not None:
            distance = _construct_auto_distance(features, col_types)
        else:
            distance = _construct_auto_distance(ftr_intersection, col_types)

    else:
        raise TypeError("Input 'distance' not understood. Note that for the " +
                         "data matching toolkit, 'distance' must be a string or " +
                         "a composite distance list."   )


    ## Validate the form of the composite distance and add to the model
    allowed_dists = {
        'euclidean': [int, float, _array.array],
        'squared_euclidean': [int, float, _array.array],
        'manhattan': [int, float, _array.array],
        'levenshtein': [str],
        'jaccard': [str, dict],
        'weighted_jaccard': [str, dict],
        'cosine': [int, float, str, dict, _array.array],
        'dot_product': [int, float, str, dict, _array.array],
        'transformed_dot_product': [int, float, str, dict, _array.array]}

    distance = _dmutl.validate_composite_distance(distance, row_label,
                                                  list(allowed_dists.keys()),
                                                  verbose)
    model.__proxy__['distance'] = _copy.deepcopy(distance)


    ## Figure out which features are 'fuzzy', i.e. used for approximate
    #  matching, and set in the model state.
    fuzzy_features = _dmutl.extract_composite_features(distance)  # already has row_label removed

    model.__proxy__['features'] = fuzzy_features
    model.__proxy__['num_features'] = len(fuzzy_features)


    ## Compile a master list of all features. This includes grouping features,
    #  fuzzy features (the ones used for approximate matching), and "ancillary"
    #  features, which are specified by the user in the 'features' parameter
    #  but not included in the 'distance' specification.
    if features is None:
        features = []
    else:
        features = [x for x in features if x != row_label]

    if grouping_features is None:
        grouping_features = []
    else:
        grouping_features = [x for x in grouping_features if x != row_label]

    model.__proxy__['grouping_features'] = grouping_features
    model.__proxy__['num_grouping_features'] = len(grouping_features)

    master_features = list(set(features + grouping_features + fuzzy_features))


    ### Consolidate data and engineer features
    ### --------------------------------------

    ## Consolidate multiple input datasets into a single SFrame, with a useful
    #  row label.
    sf_union = _dmutl.concatenate_sframes(datasets, row_label=row_label,
                                   features=master_features,
                                   sf_index_name='__sframe')
    overall_label = '__sframe.' + row_label
    sf_union[overall_label] = (sf_union['__sframe'].astype(str) + "." +
                               sf_union[row_label].astype(str))


    ## Validate the feature types in the consolidated dataset against the
    #  specified distance functions.
    _dmutl.validate_distance_feature_types(sf_union, distance, allowed_dists)


    ## Clean string-type features in the fuzzy feature set.
    for ftr in fuzzy_features:
        if col_types[ftr] == str:
            new_ftr = '__clean.' + ftr
            sf_union[new_ftr] = sf_union[ftr].fillna("")
            sf_union[new_ftr] = sf_union[new_ftr].apply(
                lambda x: _dmutl.cleanse_string(x), dtype=str)

            for dist_comp in distance:
                dist_comp[0] = [new_ftr if x == ftr else x for x in dist_comp[0]]


    ## Feature engineering, distance-component-wise. Also update list of
    #  features and a map to their types.
    sf_union, distance = _engineer_distance_features(sf_union, distance)
    transformed_features = _dmutl.extract_composite_features(distance)

    ### -------------------------------------------- ###
    ### Main loop over blocks of neighbor candidates ###
    ### -------------------------------------------- ###

    ## Construct blocks on features that must match exactly
    if verbose:
        _logging.info("Constructing groups of records that match exactly on " +
                      "the 'grouping_features'.")

    sf_union, block_errors, blocks = \
        _dmutl.construct_exact_blocks(sf_union, grouping_features)

    if verbose and len(distance) > 0 and blocks['Count'].max() > 10000:
        _logging.warning("There are more than 10,000 records in the largest match " +
            "group. For many uses, approximate matches within each match group are " +
            "computed with brute force nearest neighbors, which may be slow. " +
            "Consider using smaller groups by requiring different features to " +
            "match exactly.")

    max_entity_number = 0
    sf_entity = _gl.SFrame()
    output_features = (master_features + [row_label, '__sframe', '__entity'])

    ## Main loop over blocks
    for i, block in enumerate(blocks):

        if verbose:
            _logging.info("Processing {} records in match group: {}/{}".format(block['Count'],
                                                                         i+1,
                                                                         len(blocks)))

        ## Retrieve records in the block and impute the mean for missing numeric
        #  values.
        records = sf_union[block['min_idx']:(block['max_idx'] + 1)]
        complete_records = _dmutl.impute_numeric_means(records, transformed_features)

        if len(distance) > 0:
            ## Run all-point nearest neighbors
            if verbose:
                _logging.info("Building the similarity graph....")

            m = _gl.nearest_neighbors.create(complete_records, label=overall_label,
                                             distance=distance, verbose=False)
            knn = m.query(complete_records, label=overall_label, k=k, radius=radius,
                          verbose=verbose)


            ## Construct similarity graph to resolve transitive closure
            sg = _gl.SGraph()
            sg = sg.add_vertices(records[[overall_label]], vid_field=overall_label)
            sg = sg.add_edges(knn, src_field='query_label',
                              dst_field='reference_label')


            ## Cut the similarity graph to establish an entity for each vertex
            if verbose:
                _logging.info("Finding duplicate records in the similarity graph....")

            cc = _gl.connected_components.create(sg, verbose=verbose)

            ## Relabel the component IDs to be consecutive integers starting with
            #  the max index of the previous block's entity labels.
            block_labels = cc['component_size'].add_row_number('__entity')
            block_labels['__entity'] += max_entity_number
            max_entity_number += block_labels.num_rows()
            block_entity_labels = cc['component_id'].join(block_labels,
                                                          on='component_id',
                                                          how='left')

            ## Join the entity labels for the block back to the block's records,
            #  then append to the master output
            records = records.join(block_entity_labels[['__id', '__entity']],
                                   on={overall_label: '__id'}, how='left')
            records = records.sort('__entity')

        else:  # no fuzzy features, so no nearest neighbors, just block ID
            records['__entity'] = _gl.SArray.from_const(i, len(records))


        sf_entity = sf_entity.append(records[output_features])


    ### ------------------------------------- ###
    ### Postprocessing and results formatting ###
    ### ------------------------------------- ###

    ## Add rows missing from the blocking back to the master results
    if len(block_errors) > 0:
        block_errors['__entity'] = _gl.SArray.from_const(None, len(block_errors)).astype(int)
        sf_entity = sf_entity.append(block_errors[output_features])

    ## Rearrange columns
    sf_entity.swap_columns('__sframe', sf_entity.column_names()[0])
    sf_entity.swap_columns(row_label, sf_entity.column_names()[1])
    sf_entity.swap_columns('__entity', sf_entity.column_names()[2])


    ## Finalize the model state
    model.__proxy__['training_time'] = _time.time() - start_time
    model.__proxy__['entities'] = sf_entity
    model.__proxy__['num_entities'] = max_entity_number

    return model
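
# Sketch of the combined record label built in 'create' above: the SFrame
# identifier and the per-SFrame row label are joined with a period so that
# records from different input SFrames get globally unique labels. Assumes
# graphlab imported as _gl; the column values here are hypothetical.
sf_union = _gl.SFrame({'__sframe': ['a', 'a', 'b'], 'id': [0, 1, 9]})
sf_union['__sframe.id'] = (sf_union['__sframe'].astype(str) + "." +
                           sf_union['id'].astype(str))
# sf_union['__sframe.id'] is now ['a.0', 'a.1', 'b.9']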
    def query(self, dataset, label=None, k=5, radius=None, verbose=True):
        """
        Retrieve the nearest neighbors from the reference set for each element
        of the query set. The query SFrame must include columns with the same
        names as the label and feature columns used to create the
        NearestNeighborsModel.

        Parameters
        ----------
        dataset : SFrame
            Query data. Must contain columns with the same names and types as
            the features used to train the model. Additional columns are
            allowed, but ignored. Please see the nearest neighbors
            :func:`~graphlab.nearest_neighbors.create` documentation for more
            detail on allowable data types.

        label : string, optional
            Name of the query SFrame column with row labels. If 'label' is not
            specified, row numbers are used to identify query dataset rows in
            the output SFrame.

        k : int, optional
            Number of nearest neighbors to return from the reference set for
            each query observation. The default is 5 neighbors, but setting it
            to ``None`` will return all neighbors within ``radius`` of the query
            point.

        radius : float, optional
            Only neighbors whose distance to a query point is smaller than this
            value are returned. The default is ``None``, in which case the ``k``
            nearest neighbors are returned for each query point, regardless of
            distance.

        verbose : bool, optional
            If True, print progress updates and model details.

        Returns
        -------
        out : SFrame
            An SFrame with the k-nearest neighbors of each query observation.
            The result contains four columns: the first is the label of the
            query observation, the second is the label of the nearby reference
            observation, the third is the distance between the query and
            reference observations, and the fourth is the rank of the reference
            observation among the query's k-nearest neighbors.

        Notes
        -----
        - If both ``k`` and ``radius`` are set to ``None``, each query point
          returns all of the reference set. If the reference dataset has
          :math:`n` rows and the query dataset has :math:`m` rows, the output is
          an SFrame with :math:`nm` rows.

        Examples
        --------
        First construct a toy SFrame and create a nearest neighbors model:

        >>> sf = graphlab.SFrame({'label': range(3),
        ...                       'feature1': [0.98, 0.62, 0.11],
        ...                       'feature2': [0.69, 0.58, 0.36]})
        >>> model = graphlab.nearest_neighbors.create(sf, 'label')

        A new SFrame contains query observations with same schema as the
        reference SFrame. This SFrame is passed to the ``query`` method.

        >>> queries = graphlab.SFrame({'label': range(3),
        ...                            'feature1': [0.05, 0.61, 0.99],
        ...                            'feature2': [0.06, 0.97, 0.86]})
        >>> model.query(queries, 'label', k=2)
        +-------------+-----------------+----------------+------+
        | query_label | reference_label |    distance    | rank |
        +-------------+-----------------+----------------+------+
        |      0      |        2        | 0.305941170816 |  1   |
        |      0      |        1        | 0.771556867638 |  2   |
        |      1      |        1        | 0.390128184063 |  1   |
        |      1      |        0        | 0.464004310325 |  2   |
        |      2      |        0        | 0.170293863659 |  1   |
        |      2      |        1        | 0.464004310325 |  2   |
        +-------------+-----------------+----------------+------+
        """

        _mt._get_metric_tracker().track(
            'toolkit.nearest_neighbors.query')

        ## Validate the 'dataset' input
        _tkutl._raise_error_if_not_sframe(dataset, "dataset")
        _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

        ## Get model features
        ref_features = self.get('features')
        sf_features = _tkutl._toolkits_select_columns(dataset, ref_features)

        ## Validate and preprocess the 'label' input
        ref_label = self.get('label')

        if label is None:
            sf_features = sf_features.add_row_number(column_name=ref_label)
            sf_label = sf_features[[ref_label]]
            sf_features.remove_column(ref_label)

        else:
            if not label in dataset.column_names():
                raise ValueError(
                    "Input 'label' must be a string matching the name of a " +\
                    "column in the reference SFrame 'dataset'.")

            if dataset[label].dtype() not in (str, int):
                raise TypeError("The label column must contain integers or strings.")

            if label in ref_features:
                raise ValueError("The label column cannot be one of the features.")

            sf_label = _tkutl._toolkits_select_columns(dataset, [label])

            if label != ref_label:
                sf_label.rename({label: ref_label})


        ## Validate neighborhood parameters 'k' and 'radius'
        if k is not None:
            if not isinstance(k, int):
                raise ValueError("Input 'k' must be an integer.")

            if k <= 0:
                raise ValueError("Input 'k' must be larger than 0.")

        if radius is not None:
            if not isinstance(radius, (int, float)):
                raise ValueError("Input 'radius' must be an integer or float.")

            if radius < 0:
                raise ValueError("Input 'radius' must be non-negative.")


        ## Set k and radius to special values to indicate 'None'
        if k is None:
            k = -1

        if radius is None:
            radius = -1.0

        opts = {'model': self.__proxy__,
                'model_name': self.__name__,
                'features': sf_features,
                'label': sf_label,
                'k': k,
                'radius': radius}

        if verbose is True:
            print "Starting model querying..."

        result = _graphlab.toolkits._main.run('_nearest_neighbors.query', opts,
                                             verbose)
        return _SFrame(None, _proxy=result['neighbors'])
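
# Hedged usage sketch of radius-only querying with the method above, reusing
# 'model' and 'queries' from the docstring example: k=None returns every
# reference point within the given radius of each query point.
nearest = model.query(queries, 'label', k=None, radius=0.5)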
    def query(self, dataset, label=None, k=5, radius=None, verbose=True):
        """
        For each row of the input 'dataset', retrieve the nearest neighbors
        from the model's stored data. In general, the query dataset does not
        need to be the same as the reference data stored in the model, but if
        it is, the 'include_self_edges' parameter can be set to False to
        exclude results that match query points to themselves.

        Parameters
        ----------
        dataset : SFrame
            Query data. Must contain columns with the same names and types as
            the features used to train the model. Additional columns are
            allowed, but ignored. Please see the nearest neighbors
            :func:`~graphlab.nearest_neighbors.create` documentation for more
            detail on allowable data types.

        label : str, optional
            Name of the query SFrame column with row labels. If 'label' is not
            specified, row numbers are used to identify query dataset rows in
            the output SFrame.

        k : int, optional
            Number of nearest neighbors to return from the reference set for
            each query observation. The default is 5 neighbors, but setting it
            to ``None`` will return all neighbors within ``radius`` of the
            query point.

        radius : float, optional
            Only neighbors whose distance to a query point is smaller than this
            value are returned. The default is ``None``, in which case the
            ``k`` nearest neighbors are returned for each query point,
            regardless of distance.

        verbose: bool, optional
            If True, print progress updates and model details.

        Returns
        -------
        out : SFrame
            An SFrame with the k-nearest neighbors of each query observation.
            The result contains four columns: the first is the label of the
            query observation, the second is the label of the nearby reference
            observation, the third is the distance between the query and
            reference observations, and the fourth is the rank of the reference
            observation among the query's k-nearest neighbors.

        See Also
        --------
        similarity_graph

        Notes
        -----
        - The `dataset` input to this method *can* have missing values (in
          contrast to the reference dataset used to create the nearest
          neighbors model). Missing numeric values are imputed to be the mean
          of the corresponding feature in the reference dataset, and missing
          strings are imputed to be empty strings.

        - If both ``k`` and ``radius`` are set to ``None``, each query point
          returns all of the reference set. If the reference dataset has
          :math:`n` rows and the query dataset has :math:`m` rows, the output
          is an SFrame with :math:`nm` rows.

        - For models created with the 'lsh' method, the query results may have
          fewer query labels than input query points. Because LSH is an
          approximate method, a query point may have fewer than 'k' neighbors.
          If LSH returns no neighbors at all for a query, the query point is
          omitted from the results.

        Examples
        --------
        First construct a toy SFrame and create a nearest neighbors model:

        >>> sf = graphlab.SFrame({'label': range(3),
        ...                       'feature1': [0.98, 0.62, 0.11],
        ...                       'feature2': [0.69, 0.58, 0.36]})
        >>> model = graphlab.nearest_neighbors.create(sf, 'label')

        A new SFrame contains query observations with the same schema as the
        reference SFrame. This SFrame is passed to the ``query`` method.

        >>> queries = graphlab.SFrame({'label': range(3),
        ...                            'feature1': [0.05, 0.61, 0.99],
        ...                            'feature2': [0.06, 0.97, 0.86]})
        >>> model.query(queries, 'label', k=2)
        +-------------+-----------------+----------------+------+
        | query_label | reference_label |    distance    | rank |
        +-------------+-----------------+----------------+------+
        |      0      |        2        | 0.305941170816 |  1   |
        |      0      |        1        | 0.771556867638 |  2   |
        |      1      |        1        | 0.390128184063 |  1   |
        |      1      |        0        | 0.464004310325 |  2   |
        |      2      |        0        | 0.170293863659 |  1   |
        |      2      |        1        | 0.464004310325 |  2   |
        +-------------+-----------------+----------------+------+
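
        Querying by radius instead of a fixed ``k`` returns all neighbors
        within the given distance of each query point (a sketch; the contents
        of the result depend on the data):

        >>> knn = model.query(queries, 'label', k=None, radius=0.5)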
        """

        _mt._get_metric_tracker().track('toolkit.nearest_neighbors.query')

        ## Validate the 'dataset' input
        _tkutl._raise_error_if_not_sframe(dataset, "dataset")
        _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

        ## Get model features
        ref_features = self.get('features')
        sf_features = _tkutl._toolkits_select_columns(dataset, ref_features)

        ## Validate and preprocess the 'label' input
        if label is None:
            query_labels = _graphlab.SArray.from_sequence(len(dataset))

        else:
            if not label in dataset.column_names():
                raise ValueError(
                    "Input 'label' must be a string matching the name of a " +\
                    "column in the reference SFrame 'dataset'.")

            if not dataset[label].dtype() == str and not dataset[label].dtype() == int:
                raise TypeError(
                    "The label column must contain integers or strings.")

            if label in ref_features:
                raise ValueError(
                    "The label column cannot be one of the features.")

            query_labels = dataset[label]

        ## Validate neighborhood parameters 'k' and 'radius'
        if k is not None:
            if not isinstance(k, int):
                raise ValueError("Input 'k' must be an integer.")

            if k <= 0:
                raise ValueError("Input 'k' must be larger than 0.")

        if radius is not None:
            if not isinstance(radius, (int, float)):
                raise ValueError("Input 'radius' must be an integer or float.")

            if radius < 0:
                raise ValueError("Input 'radius' must be non-negative.")

        ## Set k and radius to special values to indicate 'None'
        if k is None:
            k = -1

        if radius is None:
            radius = -1.0

        opts = {
            'model': self.__proxy__,
            'model_name': self.__name__,
            'features': sf_features,
            'query_labels': query_labels,
            'k': k,
            'radius': radius
        }

        result = _graphlab.toolkits._main.run('_nearest_neighbors.query', opts,
                                              verbose)
        return _SFrame(None, _proxy=result['neighbors'])
def create(dataset,
           label=None,
           features=None,
           distance=None,
           method='auto',
           verbose=True,
           **kwargs):
    """
    Create a nearest neighbor model, which can be searched efficiently and
    quickly for the nearest neighbors of a query observation. If the `method`
    argument is specified as `auto`, the type of model is chosen automatically
    based on the type of data in `dataset`.

    .. warning::

        The 'dot_product' distance is deprecated and will be removed in future
        versions of GraphLab Create. Please use 'transformed_dot_product'
        distance instead, although note that this is more than a name change;
        it is a *different* transformation of the dot product of two vectors.
        Please see the distances module documentation for more details.

    Parameters
    ----------
    dataset : SFrame
        Reference data. If the features for each observation are numeric, they
        may be in separate columns of 'dataset' or a single column with lists
        of values. The features may also be in the form of a column of sparse
        vectors (i.e. dictionaries), with string keys and numeric values.

    label : string, optional
        Name of the SFrame column with row labels. If 'label' is not specified,
        row numbers are used to identify reference dataset rows when the model
        is queried.

    features : list[string], optional
        Name of the columns with features to use in computing distances between
        observations and the query points. 'None' (the default) indicates that
        all columns except the label should be used as features. Each column
        can be one of the following types:

        - *Numeric*: values of numeric type integer or float.

        - *Array*: list of numeric (integer or float) values. Each list element
          is treated as a separate variable in the model.

        - *Dictionary*: key-value pairs with numeric (integer or float) values.
          Each key indicates a separate variable in the model.

        - *List*: list of integer or string values. Each element is treated as
          a separate variable in the model.

        - *String*: string values.

        Please note: if a composite distance is also specified, this parameter
        is ignored.

    distance : string, function, or list[list], optional
        Function to measure the distance between any two input data rows. This
        may be one of three types:

        - *String*: the name of a standard distance function. One of
          'euclidean', 'squared_euclidean', 'manhattan', 'levenshtein',
          'jaccard', 'weighted_jaccard', 'cosine', 'dot_product' (deprecated),
          or 'transformed_dot_product'.

        - *Function*: a function handle from the
          :mod:`~graphlab.toolkits.distances` module.

        - *Composite distance*: the weighted sum of several standard distance
          functions applied to various features. This is specified as a list of
          distance components, each of which is itself a list containing three
          items:

          1. list or tuple of feature names (strings)

          2. standard distance name (string)

          3. scaling factor (int or float)

        For more information about GraphLab Create distance functions, please
        see the :py:mod:`~graphlab.toolkits.distances` module.

        If 'distance' is left unspecified or set to 'auto', a composite
        distance is constructed automatically based on feature types.

    method : {'auto', 'ball_tree', 'brute_force', 'lsh'}, optional
        Method for computing nearest neighbors. The options are:

        - *auto* (default): the method is chosen automatically, based on the
          type of data and the distance. If the distance is 'manhattan' or
          'euclidean' and the features are numeric or vectors of numeric
          values, then the 'ball_tree' method is used. Otherwise, the
          'brute_force' method is used.

        - *ball_tree*: use a tree structure to find the k-closest neighbors to
          each query point. The ball tree model is slower to construct than the
          brute force model, but queries are faster than linear time. This
          method is not applicable for the cosine and dot product distances.
          See `Liu, et al (2004)
          <http://papers.nips.cc/paper/2666-an-investigation-of-practical-approximate-nearest-neighbor-algorithms>`_
          for implementation details.

        - *brute_force*: compute the distance from a query point to all
          reference observations. There is no computation time for model
          creation with the brute force method (although the reference data is
          held in the model), but each query takes linear time.

        - *lsh*: use Locality Sensitive Hashing (LSH) to find approximate
          nearest neighbors efficiently. The LSH model supports 'euclidean',
          'squared_euclidean', 'manhattan', 'cosine', 'jaccard', 'dot_product'
          (deprecated), and 'transformed_dot_product' distances. Two options
          are provided for LSH -- ``num_tables`` and
          ``num_projections_per_table``. See the notes below for details.

    verbose: bool, optional
        If True, print progress updates and model details.

    **kwargs : optional
        Options for the distance function and query method.

        - *leaf_size*: for the ball tree method, the number of points in each
          leaf of the tree. The default is to use the max of 1,000 and
          n/(2^11), which ensures a maximum tree depth of 12.

        - *num_tables*: For the LSH method, the number of hash tables
          constructed. The default value is 20. We recommend choosing values
          from 10 to 30.

        - *num_projections_per_table*: For the LSH method, the number of
          projections/hash functions for each hash table. The default value is
          4 for 'jaccard' distance, 16 for 'cosine' distance, and 8 for other
          distances. We recommend values of 2 to 6 for 'jaccard' distance, 8 to
          20 for 'cosine' distance, and 4 to 12 for other distances.

    Returns
    -------
    out : NearestNeighborsModel
        A structure for efficiently computing the nearest neighbors in 'dataset'
        of new query points.

    See Also
    --------
    NearestNeighborsModel.query, graphlab.toolkits.distances

    Notes
    -----
    - Missing data is not allowed in the 'dataset' provided to this function.
      Please use the :func:`graphlab.SFrame.fillna` and
      :func:`graphlab.SFrame.dropna` utilities to handle missing data before
      creating a nearest neighbors model.

    - Missing keys in sparse vectors are assumed to have value 0.

    - The `composite_params` parameter was removed as of GraphLab Create
      version 1.5. The `distance` parameter now accepts either standard or
      composite distances. Please see the :mod:`~graphlab.toolkits.distances`
      module documentation for more information on composite distances.

    - If the features should be weighted equally in the distance calculations
      but are measured on different scales, it is important to standardize the
      features. One way to do this is to subtract the mean of each column and
      divide by the standard deviation.
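
      For example, a minimal sketch of standardizing a single numeric column
      (the column name 'X1' is only illustrative):

      >>> sf['X1'] = (sf['X1'] - sf['X1'].mean()) / sf['X1'].std()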

    **Locality Sensitive Hashing (LSH)**

    There are several efficient nearest neighbors search algorithms that work
    well for data with low dimension :math:`d` (roughly 50 or fewer). However,
    most of the solutions suffer from either space or query time that is
    exponential in :math:`d`. For large :math:`d`, they often provide little,
    if any, improvement over the 'brute_force' method. This is a well-known
    consequence of the phenomenon called `The Curse of Dimensionality`.

    `Locality Sensitive Hashing (LSH)
    <https://en.wikipedia.org/wiki/Locality-sensitive_hashing>`_ is an approach
    that is designed to efficiently solve the *approximate* nearest neighbor
    search problem for high dimensional data. The key idea of LSH is to hash
    the data points using several hash functions, so that the probability of
    collision is much higher for data points which are close to each other than
    those which are far apart.

    An LSH family is a family of functions :math:`h` which map points from the
    metric space to a bucket, so that

    - if :math:`d(p, q) \\leq R`, then :math:`h(p) = h(q)` with probability at least :math:`p_1`.
    - if :math:`d(p, q) \\geq cR`, then :math:`h(p) = h(q)` with probability at most :math:`p_2`.

    LSH for efficient approximate nearest neighbor search:

    - We define a new family of hash functions :math:`g`, where each
      function :math:`g` is obtained by concatenating :math:`k` functions
      :math:`h_1, ..., h_k`, i.e., :math:`g(p)=[h_1(p),...,h_k(p)]`.
      The algorithm constructs :math:`L` hash tables, each of which
      corresponds to a different randomly chosen hash function :math:`g`.
      There are :math:`k \\cdot L` hash functions used in total.

    - In the preprocessing step, we hash all :math:`n` reference points
      into each of the :math:`L` hash tables.

    - Given a query point :math:`q`, the algorithm iterates over the
      :math:`L` hash functions :math:`g`. For each :math:`g` considered, it
      retrieves the data points that are hashed into the same bucket as
      :math:`q`. These data points from all the :math:`L` hash tables are
      considered as candidates and are then re-ranked by their real distances
      to the query point. (A toy sketch of this lookup follows below.)

    **Note** that the number of tables :math:`L` and the number of hash
    functions per table :math:`k` are two main parameters. They can be set
    using the options ``num_tables`` and ``num_projections_per_table``
    respectively.
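
    A toy sketch of the preprocessing and lookup steps above, independent of
    the toolkit's internal implementation (the bucketing function ``g`` used
    here is a stand-in, not a real LSH hash):

    >>> import random
    >>> random.seed(0)
    >>> refs = [random.random() for _ in range(100)]   # 1-d reference points
    >>> def g(x, t):                                   # stand-in hash for table t
    ...     return int(x * (t + 2))
    >>> L = 3
    >>> tables = [{} for _ in range(L)]
    >>> for i, x in enumerate(refs):                   # preprocessing: hash the references
    ...     for t in range(L):
    ...         tables[t].setdefault(g(x, t), []).append(i)
    >>> q = 0.42                                       # a query point
    >>> candidates = set()
    >>> for t in range(L):                             # gather bucket-mates as candidates
    ...     candidates.update(tables[t].get(g(q, t), []))
    >>> ranked = sorted(candidates, key=lambda i: abs(refs[i] - q))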

    Hash functions for different distances:

    - `euclidean` and `squared_euclidean`:
      :math:`h(q) = \\lfloor \\frac{a \\cdot q + b}{w} \\rfloor` where
      :math:`a` is a vector whose elements are sampled independently from a
      normal distribution, and :math:`b` is a number sampled uniformly from
      :math:`[0, w]`. :math:`w` is a parameter for the bucket width; we set it
      using the average of all pairwise `euclidean` distances in a small random
      sample of the reference data. (A small sketch follows this list.)

    - `manhattan`: The hash function for `manhattan` is similar to that for
      `euclidean`; the only difference is that the elements of :math:`a` are
      sampled from a Cauchy distribution instead of a normal distribution.

    - `cosine`: Random Projection is designed to approximate the cosine
      distance between vectors. The hash function is :math:`h(q) = sgn(a \\cdot
      q)`, where :math:`a` is a randomly sampled normal unit vector.

    - `jaccard`: We use one permutation hashing, a method recently proposed by
      Shrivastava and Li. See the paper `[Shrivastava and Li, UAI 2014]
      <http://www.auai.org/uai2014/proceedings/individuals/225.pdf>`_ for
      details.

    - `dot_product`: The reference data points are first transformed to
      fixed-norm vectors, and then the minimum `dot_product` distance search
      problem can be solved via finding the reference data with smallest
      `cosine` distances. See the paper `[Neyshabur and Srebro, ICML 2015]
      <http://jmlr.org/proceedings/papers/v37/neyshabur15.html>`_ for details.

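    As a minimal sketch of the `euclidean` hash family above (NumPy is used
    only for illustration; the toolkit's internal implementation may differ):

    >>> import numpy as np
    >>> np.random.seed(0)
    >>> d, w = 10, 4.0                        # dimension and bucket width
    >>> a = np.random.randn(d)                # elements drawn from a normal distribution
    >>> b = np.random.uniform(0.0, w)         # offset drawn uniformly from [0, w]
    >>> q = np.random.rand(d)                 # a query point
    >>> bucket = int(np.floor((np.dot(a, q) + b) / w))
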
    References
    ----------
    - `Wikipedia - nearest neighbor
      search <http://en.wikipedia.org/wiki/Nearest_neighbor_search>`_

    - `Wikipedia - ball tree <http://en.wikipedia.org/wiki/Ball_tree>`_

    - Ball tree implementation: Liu, T., et al. (2004) `An Investigation of
      Practical Approximate Nearest Neighbor Algorithms
      <http://papers.nips.cc/paper/2666-an-investigation-of-practical-approximate-nearest-neighbor-algorithms>`_.
      Advances in Neural Information Processing Systems, pp. 825-832.

    - `Wikipedia - Jaccard distance
      <http://en.wikipedia.org/wiki/Jaccard_index>`_

    - Weighted Jaccard distance: Chierichetti, F., et al. (2010) `Finding the
      Jaccard Median
      <http://theory.stanford.edu/~sergei/papers/soda10-jaccard.pdf>`_.
      Proceedings of the Twenty-First Annual ACM-SIAM Symposium on Discrete
      Algorithms. Society for Industrial and Applied Mathematics.

    - `Wikipedia - Cosine distance
      <http://en.wikipedia.org/wiki/Cosine_similarity>`_

    - `Wikipedia - Levenshtein distance
      <http://en.wikipedia.org/wiki/Levenshtein_distance>`_

    - Locality Sensitive Hashing : Chapter 3 of the book `Mining Massive
      Datasets <http://infolab.stanford.edu/~ullman/mmds/ch3.pdf>`_.

    Examples
    --------
    Construct a nearest neighbors model with automatically determined method
    and distance:

    >>> sf = graphlab.SFrame({'X1': [0.98, 0.62, 0.11],
    ...                       'X2': [0.69, 0.58, 0.36],
    ...                       'str_feature': ['cat', 'dog', 'fossa']})
    >>> model = graphlab.nearest_neighbors.create(sf, features=['X1', 'X2'])

    For datasets with a large number of rows and up to about 100 variables, the
    ball tree method often leads to much faster queries.

    >>> model = graphlab.nearest_neighbors.create(sf, features=['X1', 'X2'],
    ...                                           method='ball_tree')

    Often the final determination of a neighbor is based on several distance
    computations over different sets of features. Each part of this composite
    distance may have a different relative weight.

    >>> my_dist = [[['X1', 'X2'], 'euclidean', 2.],
    ...            [['str_feature'], 'levenshtein', 3.]]
    ...
    >>> model = graphlab.nearest_neighbors.create(sf, distance=my_dist)
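
    The LSH method and its options are passed in the same way (a sketch; the
    values for ``num_tables`` and ``num_projections_per_table`` are only
    illustrative):

    >>> model = graphlab.nearest_neighbors.create(sf, features=['X1', 'X2'],
    ...                                           method='lsh', num_tables=20,
    ...                                           num_projections_per_table=8)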
    """

    ## Validate the 'dataset' input
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    ## Basic validation of the features input
    if features is not None and not isinstance(features, list):
        raise TypeError("If specified, input 'features' must be a list of " +
                        "strings.")

    ## Clean the method options and create the options dictionary
    allowed_kwargs = ['leaf_size', 'num_tables', 'num_projections_per_table']
    _method_options = {}

    for k, v in kwargs.items():
        if k in allowed_kwargs:
            _method_options[k] = v
        else:
            raise _ToolkitError(
                "'{}' is not a valid keyword argument".format(k) +
                " for the nearest neighbors model. Please " +
                "check for capitalization and other typos.")

    ## Exclude inappropriate combinations of method and distance
    if method == 'ball_tree' and (
            distance == 'cosine' or distance == _graphlab.distances.cosine
            or distance == 'dot_product'
            or distance == _graphlab.distances.dot_product
            or distance == 'transformed_dot_product'
            or distance == _graphlab.distances.transformed_dot_product):
        raise TypeError(
            "The ball tree method does not work with 'cosine', " +
            "'dot_product', or 'transformed_dot_product' distance. " +
            "Please use the 'brute_force' method for these distances.")

    if method == 'lsh' and ('num_projections_per_table'
                            not in _method_options):
        if distance == 'jaccard' or distance == _graphlab.distances.jaccard:
            _method_options['num_projections_per_table'] = 4
        elif distance == 'cosine' or distance == _graphlab.distances.cosine:
            _method_options['num_projections_per_table'] = 16
        else:
            _method_options['num_projections_per_table'] = 8

    ## Initial validation and processing of the label
    if label is None:
        _label = _robust_column_name('__id', dataset.column_names())
        _dataset = dataset.add_row_number(_label)
    else:
        _label = label
        _dataset = _copy.copy(dataset)

    col_type_map = {c: _dataset[c].dtype() for c in _dataset.column_names()}
    _validate_row_label(_label, col_type_map)
    ref_labels = _dataset[_label]

    ## Determine the internal list of available feature names (may still include
    #  the row label name).
    if features is None:
        _features = _dataset.column_names()
    else:
        _features = _copy.deepcopy(features)

    ## Check if there's only one feature and it's the same as the row label.
    #  This would also be trapped by the composite distance validation, but the
    #  error message is not very informative for the user.
    free_features = set(_features).difference([_label])
    if len(free_features) < 1:
        raise _ToolkitError("The only available feature is the same as the " +
                            "row label column. Please specify features " +
                            "that are not also row labels.")

    ### Validate and preprocess the distance function
    ### ---------------------------------------------
    # - The form of the 'distance' controls how we interact with the 'features'
    #   parameter as well.
    # - At this point, the row label 'label' may still be in the list(s) of
    #   features.

    ## Convert any distance function input into a single composite distance.
    # distance is already a composite distance
    if isinstance(distance, list):
        distance = _copy.deepcopy(distance)

    # distance is a single name (except 'auto') or function handle.
    elif (hasattr(distance, '__call__')
          or (isinstance(distance, str) and not distance == 'auto')):
        distance = [[_features, distance, 1]]

    # distance is unspecified and needs to be constructed.
    elif distance is None or distance == 'auto':
        sample = _dataset.head()
        distance = _construct_auto_distance(_features, _dataset.column_names(),
                                            _dataset.column_types(), sample)

    else:
        raise TypeError("Input 'distance' not understood. The 'distance' "
                        " argument must be a string, function handle, or " +
                        "composite distance.")

    ## Basic composite distance validation, remove the row label from all
    #  feature lists, and convert string distance names into distance functions.
    distance = _scrub_composite_distance_features(distance, [_label])
    distance = _convert_distance_names_to_functions(distance)
    _validate_composite_distance(distance)

    ## Raise an error if any distances are used with non-lists
    list_features_to_check = []
    sparse_distances = [
        'jaccard', 'weighted_jaccard', 'cosine', 'dot_product',
        'transformed_dot_product'
    ]
    sparse_distances = [
        _graphlab.distances.__dict__[k] for k in sparse_distances
    ]
    for d in distance:
        feature_names, dist, _ = d
        list_features = [
            f for f in feature_names if _dataset[f].dtype() == list
        ]
        for f in list_features:
            if dist in sparse_distances:
                list_features_to_check.append(f)
            else:
                raise TypeError(
                    "The chosen distance cannot currently be used " +
                    "on list-typed columns.")
    for f in list_features_to_check:
        only_str_lists = _validate_lists(_dataset[f], [str])
        if not only_str_lists:
            raise TypeError("Distances for sparse data, such as jaccard " +
                            "and weighted_jaccard, can only be used on " +
                            "lists containing only strings. Please modify " +
                            "any list features accordingly before creating " +
                            "the nearest neighbors model.")

    ## Raise an error if any component uses Levenshtein distance with multiple columns
    for d in distance:
        feature_names, dist, _ = d

        if (len(feature_names) > 1) and (dist
                                         == _graphlab.distances.levenshtein):
            raise ValueError(
                "Levenshtein distance cannot be used with multiple " +
                "columns. Please concatenate strings into a single " +
                "column before creating the nearest neighbors model.")

    ## Get the union of feature names and make a clean dataset.
    clean_features = _get_composite_distance_features(distance)
    sf_clean = _tkutl._toolkits_select_columns(_dataset, clean_features)

    ## Decide which method to use
    ## - If more than one distance component (specified either directly or
    #  generated automatically because distance set to 'auto'), then do brute
    #  force.
    if len(distance) > 1:
        _method = 'brute_force'

        if method != 'brute_force' and verbose is True:
            print("Defaulting to brute force instead of ball tree because " +\
                "there are multiple distance components.")

    else:
        if method == 'auto':

            # get the total number of variables. Assume the number of elements in
            # array type columns does not change
            num_variables = sum([
                len(x) if hasattr(x, '__iter__') else 1
                for x in _six.itervalues(sf_clean[0])
            ])

            # flag if all the features in the single composite are of numeric
            # type.
            numeric_type_flag = all([
                x in [int, float, list, array.array]
                for x in sf_clean.column_types()
            ])

            ## Conditions necessary for ball tree to work and be worth it
            if ((distance[0][1] in [
                    'euclidean', 'manhattan', _graphlab.distances.euclidean,
                    _graphlab.distances.manhattan
            ]) and numeric_type_flag is True and num_variables <= 200):

                _method = 'ball_tree'

            else:
                _method = 'brute_force'

        else:
            _method = method

    ## Pick the right model name for the method
    if _method == 'ball_tree':
        model_name = 'nearest_neighbors_ball_tree'
        _mt._get_metric_tracker().track(
            'toolkit.nearest_neighbors_balltree.create')

    elif _method == 'brute_force':
        model_name = 'nearest_neighbors_brute_force'
        _mt._get_metric_tracker().track(
            'toolkit.nearest_neighbors_brute.create')

    elif _method == 'lsh':
        model_name = 'nearest_neighbors_lsh'
        _mt._get_metric_tracker().track('toolkit.nearest_neighbors_lsh.create')

    else:
        raise ValueError(
            "Method must be 'auto', 'ball_tree', 'brute_force', " +
            "or 'lsh'.")

    ## Package the model options
    opts = {}
    opts.update(_method_options)
    opts.update({
        'model_name': model_name,
        'ref_labels': ref_labels,
        'label': label,
        'sf_features': sf_clean,
        'composite_params': distance
    })

    ## Construct the nearest neighbors model
    if not verbose:
        _mt.main.get_server().set_log_progress(False)

    result = _graphlab.extensions._nearest_neighbors.train(opts)

    _mt.main.get_server().set_log_progress(True)

    model_proxy = result['model']
    model = NearestNeighborsModel(model_proxy)

    return model
def create(dataset, tag_name=None, features=None, verbose=True):
    """
    Create a NearestNeighborAutotagger model, which can be used to quickly apply
    tags from a reference set of text labels to a new query set using the
    ``tag`` method.

    Parameters
    ----------
    dataset : SFrame
        Reference data. This SFrame must contain at least one column. By
        default, only the ``tag_name`` column is used as the basis for
        tagging. You may optionally include additional columns with the
        ``features`` parameter.

    tag_name : string, optional
        Name of the column in ``dataset`` with the tags. This column must
        contain string values. If ``dataset`` contains more than one column,
        ``tag_name`` must be specified.

    features : list[string], optional
        Names of the columns with features to use as the basis for tagging.
        'None' (the default) indicates that only the column specified by the
        ``tag_name`` parameter should be used. Only str or list fields are
        allowed. If a column of type list is specified, all values must be
        either of type string or convertible to type string.

    verbose : bool, optional
        If True, print verbose output during model creation.

    Returns
    -------
    out : model
        A model for quickly tagging new query observations with entries from
        `dataset`. Currently, the only implementation is the following:

        - NearestNeighborAutoTagger

    See Also
    --------
    NearestNeighborsModel

    Examples
    --------
    First construct a toy `SFrame` of actor names, which will serve as the
    reference set for our autotagger model.

    >>> actors_sf = gl.SFrame(
            {"actor": ["Will Smith", "Tom Hanks", "Bradley Cooper",
                       "Tom Cruise", "Jude Law", "Robert Pattinson",
                       "Matt Damon", "Brad Pitt", "Johnny Depp",
                       "Leonardo DiCaprio", "Jennifer Aniston",
                       "Jessica Alba", "Emma Stone", "Cameron Diaz",
                       "Scarlett Johansson", "Mila Kunis", "Julia Roberts",
                       "Charlize Theron", "Marion Cotillard",
                       "Angelina Jolie"]})
    >>> m = gl.data_matching.nearest_neighbor_autotagger.create(
                actors_sf, tag_name="actor")

    Then we load some IMDB movie reviews into an `SFrame` and tag them using
    the model we created above. The score field in the output is a
    similarity score, indicating the strength of the match between the query
    data and the suggested reference tag.

    >>> reviews_sf = gl.SFrame(
            "s3://dato-datasets/imdb_reviews/reviews.sframe")
    >>> m.tag(reviews_sf.head(10), query_name="review", verbose=False)
    +-----------+-------------------------------+------------------+-----------------+
    | review_id |             review            |      actor       |      score      |
    +-----------+-------------------------------+------------------+-----------------+
    |     0     | Story of a man who has unn... |   Cameron Diaz   | 0.0769230769231 |
    |     0     | Story of a man who has unn... |  Angelina Jolie  | 0.0666666666667 |
    |     0     | Story of a man who has unn... | Charlize Theron  |      0.0625     |
    |     0     | Story of a man who has unn... | Robert Pattinson | 0.0588235294118 |
    |     1     | Bromwell High is a cartoon... |   Jessica Alba   |      0.125      |
    |     1     | Bromwell High is a cartoon... | Jennifer Aniston |       0.1       |
    |     1     | Bromwell High is a cartoon... | Charlize Theron  |       0.05      |
    |     1     | Bromwell High is a cartoon... | Robert Pattinson |  0.047619047619 |
    |     1     | Bromwell High is a cartoon... | Marion Cotillard |  0.047619047619 |
    |     2     | Airport '77 starts as a br... |  Julia Roberts   | 0.0961538461538 |
    |    ...    |              ...              |       ...        |       ...       |
    +-----------+-------------------------------+------------------+-----------------+

    The initial results look a little noisy. To filter out obvious spurious
    matches, we can set the `tag` method's similarity_threshold parameter.

    >>> m.tag(reviews_sf.head(1000), query_name="review", verbose=False,
              similarity_threshold=.8)
    +-----------+-------------------------------+------------------+----------------+
    | review_id |             review            |      actor       |     score      |
    +-----------+-------------------------------+------------------+----------------+
    |    341    | I caught this film at a te... |  Julia Roberts   | 0.857142857143 |
    |    657    | Fairly funny Jim Carrey ve... | Jennifer Aniston | 0.882352941176 |
    |    668    | A very funny movie. It was... | Jennifer Aniston | 0.833333333333 |
    |    673    | This film is the best film... | Jennifer Aniston |     0.9375     |
    +-----------+-------------------------------+------------------+----------------+

    In this second example, you'll notice that the ``review_id`` column is much
    more sparse. This is because all results whose score was below the specified
    similarity threshold (.8) were excluded from the output.

    """
    # validate the 'dataset' input
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    # ensure that tag_name is provided if dataset has > 1 column
    if dataset.num_cols() > 1 and not tag_name:
        raise _ToolkitError("No tag_name parameter specified on dataset " \
                            "with %d columns" % dataset.num_cols())
    tag_name = tag_name or dataset.column_names()[0]

    # ensure that column with name tag_name exists
    if tag_name not in dataset.column_names():
        raise _ToolkitError('No column named "%s" in dataset' % tag_name)

    # ensure that column is of type string
    if dataset[tag_name].dtype() != str:
        raise TypeError("The column used as the tag name must be of type " \
                        "string.")

    # use reasonable default for general case
    distance = _gl.distances.weighted_jaccard

    # if additional features are specified, ensure they are of appropriate types
    if features is not None and (not isinstance(features, list) or
                                 not all(isinstance(x, str) for x in features)):
        raise TypeError("The feature parameter must be a list of strings " \
                        "and those strings must correspond to columns in " \
                        "`dataset`.")

    # at a minimum, this SFrame will contain the tags as features;
    features = features or []
    features = [tag_name] + [x for x in features if x != tag_name]

    # ensure that each specified feature column is either of type list or str
    column_names = set(dataset.column_names())
    for col_name in features:
        if col_name not in column_names:
            raise _ToolkitError("Specified feature column (%s) not found " \
                                "in dataset" % col_name)

        if dataset.select_column(col_name).dtype() not in (str, list):
            raise TypeError("Only string and list columns are allowed as " \
                            "features.")

    # concatenate the feature columns into a single column
    features_sf = dataset.select_columns(features)
    feature_col, features_sf = _concat_string_features(features_sf, features)

    # compute features
    if verbose:
        _logging.getLogger().info("Extracting features...")

    features = _preprocess(features_sf.select_column(feature_col))

    # group by tag_name to ensure that tags are unique
    feature_cols = features.column_names()
    select_cols = {col_name: _gl.aggregate.SELECT_ONE(col_name) for col_name \
                   in feature_cols}
    features.add_column(dataset[tag_name], tag_name)
    features = features.groupby(tag_name, select_cols)

    # create nearest neighbors model
    m = _gl.nearest_neighbors.create(
        features, label=tag_name, distance=distance,
        features=feature_cols, verbose=verbose)

    # add standard toolkit state attributes
    state = {"training_time": m.get("training_time"),
             "tag_name": tag_name,
             "verbose": verbose,
             "num_examples": len(features),
             "features": feature_cols,
             "num_features": len(feature_cols),
             "distance": m.get("distance")}

    model = NearestNeighborAutoTagger(m, state)
    model.summary()

    return model
    def predict_topk(self, dataset,  max_neighbors=10, radius=None, k=3,
                     verbose=False):
        """
        Return top-k most likely predictions for each observation in
        ``dataset``. Predictions are returned as an SFrame with three columns:
        `row_id`, `class`, and `probability`.

        Parameters
        ----------
        dataset : SFrame
            Dataset of new observations. Must include the features used for
            model training, but does not require a target column. Additional
            columns are ignored.

        max_neighbors : int, optional
            Maximum number of neighbors to consider for each point.

        radius : float, optional
            Maximum distance from each point to a neighbor in the reference
            dataset.

        k : int, optional
            Number of classes to return for each input example.

        Returns
        -------
        out : SFrame
            An SFrame with the columns `row_id`, `class`, and `probability`,
            containing one row for each predicted class of each query
            observation.

        See Also
        ----------
        create, classify, predict

        Notes
        -----
        - If the 'radius' parameter is small, it is possible that a query point
          has no neighbors in the training dataset. In this case, the query is
          dropped from the SFrame output by this method. If all queries have no
          neighbors, then the result is an empty SFrame. If the target column in
          the training dataset has missing values, these predictions will be
          ambiguous.

        - Ties between predicted classes are broken randomly.

        Examples
        --------
        >>> sf_train = graphlab.SFrame({'species': ['cat', 'dog', 'fossa', 'dog'],
        ...                             'height': [9, 25, 20, 23],
        ...                             'weight': [13, 28, 33, 22]})
        ...
        >>> sf_new = graphlab.SFrame({'height': [26, 19],
        ...                           'weight': [25, 35]})
        ...
        >>> m = graphlab.nearest_neighbor_classifier.create(sf_train, target='species')
        >>> ystar = m.predict_topk(sf_new, max_neighbors=2)
        >>> print ystar
        +--------+-------+-------------+
        | row_id | class | probability |
        +--------+-------+-------------+
        |   0    |  dog  |     1.0     |
        |   1    | fossa |     0.5     |
        |   1    |  dog  |     0.5     |
        +--------+-------+-------------+
        """
        _mt._get_metric_tracker().track(
            'toolkit.classifier.nearest_neighbor_classifier.predict_topk')

        ## Validate the number of results to return. Note that the
        #  'max_neighbors' and 'radius' parameters are validated by the nearest
        #  neighbor model's query method.
        if not isinstance(k, int) or k < 1:
            raise TypeError("The number of results to return for each point, 'k', "
                            "must be an integer greater than 0.")


        ## Validate the query dataset.
        _raise_error_if_not_sframe(dataset, "dataset")
        _raise_error_if_sframe_empty(dataset, "dataset")

        ## Validate neighborhood parameters 'max_neighbors'.
        # - NOTE: when the parameter name is changed in nearest neighbors, the
        #   query call will do this itself, and this block can be removed.
        if max_neighbors is not None:
            if not isinstance(max_neighbors, int):
                raise ValueError("Input 'max_neighbors' must be an integer.")

            if max_neighbors <= 0:
                raise ValueError("Input 'max_neighbors' must be larger than 0.")


        ## Find the nearest neighbors for each query and count the number of
        #  votes for each class.
        knn = self._knn_model.query(dataset, k=max_neighbors, radius=radius,
                                    verbose=verbose)

        ## If there are *no* results for *any* query make an empty SFrame.
        if knn.num_rows() == 0:
            ystar = _gl.SFrame({'row_id': [], 'class': [], 'probability': []})
            ystar['row_id'] = ystar['row_id'].astype(int)
            ystar['class'] = ystar['class'].astype(str)


        else:
            ## Find the classes with the top-k vote totals
            grp = knn.groupby(['query_label', 'reference_label'], _gl.aggregate.COUNT)

            ystar = grp.unstack(column=['reference_label', 'Count'],
                                new_column_name='votes')

            ystar['topk'] = ystar['votes'].apply(lambda x: _sort_topk_votes(x, k))
            ystar['total_votes'] = ystar['votes'].apply(lambda x: sum(x.values()))

            ## Re-stack, unpack, and rename the results
            ystar = ystar.stack('topk', new_column_name='topk')
            ystar = ystar.unpack('topk')
            ystar.rename({'topk.class': 'class', 'query_label': 'row_id'})
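            # Each class's probability is its share of its query's total votes.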
            ystar['probability'] = ystar['topk.votes'] / ystar['total_votes']
            ystar = ystar[['row_id', 'class', 'probability']]

        return ystar
Пример #34
    def tag(self,
            dataset,
            query_name=None,
            k=5,
            similarity_threshold=None,
            exclude_zeros=True,
            verbose=True):
        """
        Match the reference tags passed when a model is created to a new set of
        queries. This is a many-to-many match: each query may have any number of
        occurrences of a reference tag.

        Parameters
        ----------
        dataset : SFrame
            Query data to be tagged.

        query_name : string, optional
            Name of the column in ``dataset`` to be auto-tagged. If ``dataset``
            has more than one column, ``query_name`` must be specified.

        k : int, optional
            Number of results to return from the reference set for each query
            observation. The default is 5, but setting it to ``None`` will
            return all results whose score is greater than or equal to
            ``similarity_threshold``.

        similarity_threshold : float, optional
            Only results whose score is greater than or equal to the specified
            ``similarity_threshold`` are returned. The default is ``None``, in
            which case the ``k`` best results are returned for each query point.

        exclude_zeros : boolean, optional
            If True, only entries for which there is a tag with a nonzero score
            are preserved in the output. This is the default behavior.

        verbose : bool, optional
            If True, print progress updates and model details.

        Returns
        -------
        out : SFrame
            An SFrame with four columns:

            - row ID
            - column name specified as `tag_name` parameter to `create` method
            - column name specified as `query_name` parameter to `tag` method
            - a similarity score between 0 and 1, indicating the strength of the
              match between the query data and the suggested reference tag,
              where a score of zero indicates a poor match and a strength of 1
              corresponds to a perfect match

        Notes
        -----
        - By default, only rows for which there is a tag with a nonzero score
          are included in the output. To guarantee at least one output row for
          every input row in ``dataset``, set the ``exclude_zeros`` parameter
          to False.

        - If both ``k`` and ``similarity_threshold`` are set to ``None``, a
          ToolkitError is raised.

        Examples
        --------
        First construct a toy `SFrame` of actor names, which will serve as the
        reference set for our autotagger model.

        >>> actors_sf = gl.SFrame(
                {"actor": ["Will Smith", "Tom Hanks", "Bradley Cooper",
                           "Tom Cruise", "Jude Law", "Robert Pattinson",
                           "Matt Damon", "Brad Pitt", "Johnny Depp",
                           "Leonardo DiCaprio", "Jennifer Aniston",
                           "Jessica Alba", "Emma Stone", "Cameron Diaz",
                           "Scarlett Johansson", "Mila Kunis", "Julia Roberts",
                           "Charlize Theron", "Marion Cotillard",
                           "Angelina Jolie"]})
        >>> m = gl.data_matching.autotagger.create(actors_sf, tag_name="actor")

        Then we load some IMDB movie reviews into an `SFrame` and tag them using
        the model we created above. The score field in the output is a
        similarity score, indicating the strength of the match between the query
        data and the suggested reference tag.

        >>> reviews_sf = gl.SFrame(
                "https://static.turi.com/datasets/imdb_reviews/reviews.sframe")
        >>> m.tag(reviews_sf.head(10), query_name="review", verbose=False)
        +-----------+-------------------------------+------------------+-----------------+
        | review_id |             review            |      actor       |      score      |
        +-----------+-------------------------------+------------------+-----------------+
        |     0     | Story of a man who has unn... |   Cameron Diaz   | 0.0769230769231 |
        |     0     | Story of a man who has unn... |  Angelina Jolie  | 0.0666666666667 |
        |     0     | Story of a man who has unn... | Charlize Theron  |      0.0625     |
        |     0     | Story of a man who has unn... | Robert Pattinson | 0.0588235294118 |
        |     1     | Bromwell High is a cartoon... |   Jessica Alba   |      0.125      |
        |     1     | Bromwell High is a cartoon... | Jennifer Aniston |       0.1       |
        |     1     | Bromwell High is a cartoon... | Charlize Theron  |       0.05      |
        |     1     | Bromwell High is a cartoon... | Robert Pattinson |  0.047619047619 |
        |     1     | Bromwell High is a cartoon... | Marion Cotillard |  0.047619047619 |
        |     2     | Airport '77 starts as a br... |  Julia Roberts   | 0.0961538461538 |
        |    ...    |              ...              |       ...        |       ...       |
        +-----------+-------------------------------+------------------+-----------------+

        The initial results look a little noisy. To filter out obvious spurious
        matches, we can set the `tag` method's `similarity_threshold` parameter.

        >>> m.tag(reviews_sf.head(1000), query_name="review", verbose=False,
                  similarity_threshold=.8)
        +-----------+-------------------------------+------------------+----------------+
        | review_id |             review            |      actor       |     score      |
        +-----------+-------------------------------+------------------+----------------+
        |    341    | I caught this film at a te... |  Julia Roberts   | 0.857142857143 |
        |    657    | Fairly funny Jim Carrey ve... | Jennifer Aniston | 0.882352941176 |
        |    668    | A very funny movie. It was... | Jennifer Aniston | 0.833333333333 |
        |    673    | This film is the best film... | Jennifer Aniston |     0.9375     |
        +-----------+-------------------------------+------------------+----------------+

        """
        _mt._get_metric_tracker().track(self.__module__ + '.tag')

        # validate the 'dataset' input
        _tkutl._raise_error_if_not_sframe(dataset, "dataset")
        _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

        # ensure that either k or similarity_threshold is set
        if not (k or similarity_threshold):
            raise _ToolkitError("Either the k or the similarity_threshold " \
                                "parameter must be set.")

        # ensure that query_name is provided if dataset has > 1 column
        if dataset.num_cols() > 1 and not query_name:
            raise _ToolkitError("No query_name parameter specified on " \
                                "dataset with %d columns" % dataset.num_cols())

        query_column = query_name or dataset.column_names()[0]

        # ensure that column with name tag_name exists
        if query_column not in dataset.column_names():
            raise _ToolkitError('No column named "%s" in dataset' \
                                % query_column)

        query_sa = dataset.select_column(query_column)
        query_sf = _gl.SFrame({
            "id": range(len(query_sa)),
            query_column: query_sa
        })

        features = _preprocess(query_sa)
        features = features.add_row_number()

        if similarity_threshold:
            if not isinstance(similarity_threshold, (float, int)):
                raise _ToolkitError("similarity_threshold parameter must be " \
                                    "a float or an int.")

            if similarity_threshold < 0 or similarity_threshold > 1:
                raise _ToolkitError("similarity_threshold parameter must be " \
                                    "between 0 and 1.")

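        # Weighted Jaccard similarity and weighted Jaccard distance sum to 1,
        # so a similarity threshold of s corresponds to a query radius of 1 - s.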
        radius = (1 - similarity_threshold) if similarity_threshold else None

        results = self.__proxy__['nearest_neighbors_model'].query(
            features, label="id", k=k, radius=radius, verbose=verbose)

        # return empty SFrame immediately if no NN results
        if len(results) == 0:
            return _gl.SFrame({
                query_column + "_id": [],
                query_column: [],
                self.get("tag_name"): [],
                "score": []
            })

        results = results.join(query_sf, on={"query_label": "id"})
        results.rename({"query_label": query_column + "_id"})
        results.rename({query_column: "query_label"})

        # convert distances to similarity scores
        scores = _dists_to_sim_scores("weighted_jaccard", results)

        results.add_column(scores, "score")
        results.remove_column("distance")
        results.remove_column("rank")
        results.rename({
            "reference_label": self.get("tag_name"),
            "query_label": query_column
        })
        results.swap_columns(self.get("tag_name"), query_column)

        if exclude_zeros:
            try:
                results = results.filter_by(0.0, "score", exclude=True)
            except RuntimeError:  # nothing to join
                _logging.getLogger(__name__).warn(
                    "Empty results after filtering scores of 0.")
                results = results.head(0)

        return results
    def link(self, dataset, k=5, radius=None, verbose=True):
        """
        Find matching records from the reference dataset (entered when the model
        was created) for each record in the 'dataset' passed to this function.
        The query dataset must include columns with the same names as the label
        and feature columns used to create the RecordLinker
        model.

        Parameters
        ----------
        dataset : SFrame
            Query data. Must contain columns with the same names and types as
            the features used to train the model. Additional columns are
            allowed, but ignored. Please see the nearest neighbors
            :func:`~graphlab.nearest_neighbors.create` documentation for more
            detail on allowable data types.

        k : int, optional
            Maximum number of nearest neighbors to return from the reference set
            for each query observation. The default is 5, but setting it to
            ``None`` will return all neighbors within ``radius`` of the query
            point.

        radius : float, optional
            Only neighbors whose distance to a query point is smaller than this
            value are returned. The default is ``None``, in which case the ``k``
            nearest neighbors are returned for each query point, regardless of
            distance.

        verbose : bool, optional
            If True, print progress updates and model details.

        Returns
        -------
        out : SFrame
            An SFrame with the k-nearest neighbors of each query observation.
            The result contains four columns: the first is the row label of the
            query observation, the second is the row label of the nearby
            reference observation, the third is the distance between the query
            and reference observations, and the fourth is the rank of the
            reference observation among the query's k-nearest neighbors.

        Notes
        -----
        - If both ``k`` and ``radius`` are set to ``None``, each query point
          returns all of the reference set. If the reference dataset has
          :math:`n` rows and the query dataset has :math:`m` rows, the output is
          an SFrame with :math:`nm` rows.

        Examples
        --------
        Assume we've created the model from the example in the RecordLinker
        'create' function.

        >>> queries = graphlab.SFrame({'sqft': [986, 1320],
        ...                            'street': ['fremont', 'phiney'],
        ...                            'city': ['sea', 'seattle'],
        ...                            'state': ['WA', 'WA']})
        ...
        >>> model.link(queries, k=2, radius=5.)
        +-------------+-----------------+----------+------+
        | query_label | reference_label | distance | rank |
        +-------------+-----------------+----------+------+
        |      0      |        0        |   4.0    |  1   |
        |      0      |        2        |   5.0    |  2   |
        |      1      |        0        |   0.0    |  1   |
        +-------------+-----------------+----------+------+
        """
        _mt._get_metric_tracker().track(self.__module__ + '.link_records')

        ## Validate the 'dataset' input.
        _tkutl._raise_error_if_not_sframe(dataset, "dataset")
        _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

        ## Make sure all of the necessary features are present at 'link' time.
        sf_features = _tkutl._toolkits_select_columns(dataset,
                                                      self.get('features'))

        ## Clean and impute string data. *** Think about consolidating this and
        #  the next step into a feature transformer.***
        col_types = {
            k: v
            for k, v in zip(dataset.column_names(), dataset.column_types())
        }
        _dataset = _copy.copy(dataset)
        _distance = _copy.deepcopy(self.__proxy__['distance'])

        for ftr in self.get('features'):
            if col_types[ftr] == str:
                new_ftr = '__clean.' + ftr
                _dataset[new_ftr] = _dataset[ftr].fillna("")
                _dataset[new_ftr] = _dataset[new_ftr].apply(
                    lambda x: _dmutl.cleanse_string(x), dtype=str)

                for dist_comp in _distance:
                    dist_comp[0] = [
                        new_ftr if x == ftr else x for x in dist_comp[0]
                    ]

        ## Convert strings to dicts and concatenate string features.
        _dataset, _ = _engineer_distance_features(_dataset, _distance)

        ## Query the nearest neighbor model
        result = self.__proxy__['nearest_neighbors_model'].query(
            _dataset, k=k, radius=radius, verbose=verbose)
        return result
def create(dataset, features=None, label=None, distance=None, num_neighbors=5,
           threshold_distances=True, verbose=True):
    """
    Create a :class:`LocalOutlierFactorModel`. This model contains local outlier
    factor (LOF) scores for the training data passed to it, and can predict the
    LOF score for new observations.

    The LOF method scores each data instance by computing the ratio of the
    average densities of the instance's neighbors to the density of the
    instance itself. The higher the score, the more likely the instance is to
    be an outlier *relative to its neighbors*. A score of 1 or less means that
    an instance has a density similar (or higher) to its neighbors and is
    unlikely to be an outlier.

    The model created by this function contains an SFrame called 'scores' that
    contains the computed local outlier factors. The `scores` SFrame has four
    columns:

        - *row_id*: the row index of the instance in the input dataset. If a
          label column is passed, the labels (and the label name) are passed
          through to this column in the output.

        - *density*: the density of the instance as estimated by the LOF
          procedure.

        - *neighborhood_radius*: the distance from the instance to its
          furthest neighbor (defined by 'num_neighbors', and used for
          predicting the LOF for new points).

        - *anomaly_score*: the local outlier factor.

    For more information on the LOF method and the computation used for each of
    these columns, please see the Notes and References sections below.
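
    In practice, anomalies are typically flagged by applying a cutoff to the
    'anomaly_score' column of this SFrame. For example (here ``model`` denotes
    the returned model object, and the cutoff of 2.0 is purely illustrative,
    not a recommended default):

    >>> suspects = model['scores'][model['scores']['anomaly_score'] > 2.0]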

    Parameters
    ----------
    dataset : SFrame
        Input dataset. The 'dataset' SFrame must include the features specified
        in the 'features' or 'distance' parameter (additional columns are
        ignored).

    features : list[string], optional
        Names of feature columns. 'None' (the default) indicates that all
        columns should be used. Each column can be one of the following types:

        - *Numeric*: values of numeric type integer or float.

        - *Array*: array of numeric (integer or float) values. Each array
          element is treated as a separate variable in the model.

        - *Dictionary*: key-value pairs with numeric (integer or float) values.
          Each key indicates a separate variable in the model.

        - *String*: string values.

        Please note: if 'distance' is specified as a composite distance, then
        that parameter controls which features are used in the model. Also note
        that the column of row labels is automatically removed from the
        features, if there is a conflict.

    label : str, optional
        Name of the input column containing row labels. The values in this
        column must be integers or strings. If not specified, row numbers are
        used by default.

    distance : string or list[list], optional
        Function to measure the distance between any two input data rows. If
        left unspecified, a distance function is automatically constructed
        based on the feature types. The distance may be specified by either a
        string or composite distance:

        - *String*: the name of a standard distance function. One of
          'euclidean', 'squared_euclidean', 'manhattan', 'levenshtein',
          'jaccard', 'weighted_jaccard', 'cosine', or 'dot_product'. Please see
          the :mod:`distances` module for more details.

        - *Composite distance*: the weighted sum of several standard distance
          functions applied to various features. This is specified as a list of
          distance components, each of which is itself a list containing three
          items:

          1. list or tuple of feature names (strings)

          2. standard distance name (string)

          3. scaling factor (int or float)
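
          For example, a composite distance over two numeric columns and one
          string column might be written as follows (the column names are
          purely illustrative)::

              [[['x0', 'x1'], 'euclidean', 1.0],
               [['address'], 'levenshtein', 2.0]]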

    num_neighbors : int, optional
        Number of neighbors to consider for each point.

    threshold_distances : bool, optional
        If True (the default), the distance between two points is thresholded.
        This reduces noise and can improve the quality of results, but at the
        cost of slower computation. See the notes below for more detail.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    model : LocalOutlierFactorModel
        A trained :class:`LocalOutlierFactorModel`, which contains an SFrame
        called 'scores' that includes the 'anomaly score' for each input
        instance.

    See Also
    --------
    LocalOutlierFactorModel, graphlab.toolkits.nearest_neighbors

    Notes
    -----
    - The LOF method scores each data instance by computing the ratio of the
      average densities of the instance's neighbors to the density of the
      instance itself. According to the LOF method, the estimated density of a
      point :math:`p` is the number of :math:`p`'s neighbors divided by the sum
      of distances to the instance's neighbors. In the following, suppose
      :math:`N(p)` is the set of neighbors of point
      :math:`p`, :math:`k` is the number of points in this set (i.e.
      the 'num_neighbors' parameter), and :math:`d(p, x)` is the distance
      between points :math:`p` and :math:`x` (based on the user-specified
      distance function).

      .. math:: \hat{f}(p) = \\frac{k}{\sum_{x \in N(p)} d(p, x)}

    - The LOF score for point :math:`p` is then the ratio of :math:`p`'s
      density to the average densities of :math:`p`'s neighbors:

      .. math:: LOF(p) = \\frac{\\frac{1}{k} \sum_{x \in N(p)} \hat{f}(x)}{\hat{f}(p)}

    - If the 'threshold_distances' flag is set to True, exact distances are
      replaced by "thresholded" distances. Let  Suppose :math:`r_k(x)` is the
      distance from :math:`x` to its :math:`k`'th nearest neighbor. Then the
      thresholded distance from point :math:`p` to point :math:`x_i` is

      .. math:: d^*(p, x) = \max\{r_k(x), d(p, x)\}

      This adaptive thresholding is used in the original LOF paper (see the
      References section) to reduce noise in the computed distances and improve
      the quality of the final LOF scores.

    - For features that all have the same type, the distance parameter may be a
      single standard distance function name (e.g. "euclidean"). In the model,
      however, all distances are first converted to composite distance
      functions; as a result, the 'distance' field in the model is always a
      composite distance.

    - Standardizing features is often a good idea with distance-based methods,
      but this model does *not* standardize features.

    - If there are several observations located at an identical position, the
      LOF values can be undefined. An LOF score of "nan" means that a point is
      either in or near a set of co-located points.

    - This implementation of LOF forces the neighborhood of each data instance
      to contain exactly 'num_neighbors' points, breaking ties arbitrarily.
      This differs from the original LOF paper (see References below), which
      allows neighborhoods to expand if there are multiple neighbors at exactly
      the same distance from an instance.

    References
    ----------
    - Breunig, M. M., Kriegel, H., Ng, R. T., & Sander, J. (2000). `LOF:
      Identifying Density-Based Local Outliers
      <http://people.cs.vt.edu/badityap/classes/cs6604-Fall13/readings/breunig-2000.pdf>`_,
      pp 1-12.

    Examples
    --------
    >>> sf = graphlab.SFrame({'x0': [0., 1., 1., 0., 1., 0., 5.],
    ...                       'x1': [2., 1., 0., 1., 2., 1.5, 2.5]})
    >>> lof = graphlab.local_outlier_factor.create(sf, num_neighbors=3)
    >>> lof['scores']
    +--------+----------------+----------------+---------------------+
    | row_id |    density     | anomaly_score  | neighborhood_radius |
    +--------+----------------+----------------+---------------------+
    |   0    | 0.927050983125 | 1.03785526045  |         1.0         |
    |   3    | 0.962144739546 | 0.919592692017 |         1.0         |
    |   1    | 0.765148090776 | 1.14822979837  |         1.0         |
    |   6    | 0.230412599692 | 3.52802012342  |    4.71699056603    |
    |   2    | 0.71140803489  | 1.26014768739  |    1.80277563773    |
    |   5    | 0.962144739546 | 0.919592692017 |    1.11803398875    |
    |   4    | 0.962144739546 | 0.919592692017 |    1.11803398875    |
    +--------+----------------+----------------+---------------------+
    [7 rows x 4 columns]
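
    As a rough cross-check of the formulas in the Notes section, the plain,
    un-thresholded LOF computation can be sketched with NumPy alone. This
    sketch only illustrates the math; because the toolkit thresholds distances
    and breaks ties differently, its values will not match the 'anomaly_score'
    column above exactly.

    >>> import numpy as np
    >>> X = np.array([[0., 2.], [1., 1.], [1., 0.], [0., 1.],
    ...               [1., 2.], [0., 1.5], [5., 2.5]])
    >>> k = 3
    >>> D = np.linalg.norm(X[:, None, :] - X[None, :, :], axis=-1)
    >>> np.fill_diagonal(D, np.inf)            # exclude self-edges
    >>> nbrs = np.argsort(D, axis=1)[:, :k]    # indices of the k nearest neighbors
    >>> dist_sum = np.take_along_axis(D, nbrs, axis=1).sum(axis=1)
    >>> density = k / dist_sum                 # estimated density of each point
    >>> lof = density[nbrs].mean(axis=1) / density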
    """

    ## Start the training time clock and instantiate an empty model
    _mt._get_metric_tracker().track(
        'toolkit.anomaly_detection.local_outlier_factor.create')

    logger = _logging.getLogger(__name__)
    start_time = _time.time()

    ## Validate the input dataset
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")


    ## Validate the number of neighbors, mostly to make the error message use
    #  the right parameter name.
    if not isinstance(num_neighbors, int):
        raise TypeError("Input 'num_neighbors' must be an integer.")

    if num_neighbors <= 0:
        raise ValueError("Input 'num_neighbors' must be larger than 0.")

    if num_neighbors > dataset.num_rows():
        num_neighbors = dataset.num_rows()

        if verbose:
            logger.info("Input 'num_neighbors' is larger than the number " +
                        "of rows in the input 'dataset'. Resetting " +
                        "'num_neighbors' to the dataset length.")

    ## Validate the row label against the features *using the nearest neighbors
    #  tool with only one row of data. This is a hack - we should encapsulate
    #  the validation steps in nearest neighbors and do them here first.
    validation_model = _gl.nearest_neighbors.create(dataset[:1], label=label,
                                                    features=features,
                                                    distance=distance,
                                                    method='brute_force',
                                                    verbose=False)

    ## Compute the similarity graph based on k and radius, without self-edges,
    #  but keep it in the form of an SFrame. Do this *without* the row label,
    #  because I need to sort on the row number, and row labels that aren't
    #  already in order will be screwed up.
    knn_model = _gl.nearest_neighbors.create(dataset,
                                             distance=validation_model.distance,
                                             method='brute_force',
                                             verbose=verbose)

    knn = knn_model.similarity_graph(k=num_neighbors, radius=None,
                                     include_self_edges=False,
                                     output_type='SFrame',
                                     verbose=verbose)

    ## Bias the distances by making them at least equal to the *reference*
    #  point's k'th neighbor radius. This is "reach-distance" in the original
    #  paper.
    if threshold_distances is True:
        radii = knn.groupby('query_label',
                        {'neighborhood_radius': _gl.aggregate.MAX('distance')})

        knn = knn.join(radii, on={'reference_label': 'query_label'},
                       how='left')

        knn['distance'] = knn.apply(
            lambda x: x['distance'] if x['distance'] > x['neighborhood_radius'] \
                 else x['neighborhood_radius'])


    ## Find the sum of distances from each point to its neighborhood, then
    #  compute the "local reachability density (LRD)". This is not remotely a
    #  valid density estimate, but it does have the form of mass / volume,
    #  where the mass is estimated by the number of neighbors in point x's
    #  neighborhood, and the volume is estimated by the sum of the distances
    #  between x and its neighbors.
    #
    ## NOTE: if a vertex is co-located with all of its neighbors, the sum of
    #  distances will be 0, in which case the inverse distance sum value is
    #  'inf'.
    scores = knn.groupby('query_label',
                         {'dist_sum': _gl.aggregate.SUM('distance')})

    scores['density'] = float(num_neighbors) / scores['dist_sum']


    ## Join the density of each point back to the nearest neighbors results,
    #  then get the average density of each point's neighbors' densities.
    knn = knn.join(scores, on={'reference_label': 'query_label'},
                   how='left')

    scores2 = knn.groupby('query_label',
                    {'average_neighbor_density': _gl.aggregate.AVG('density')})

    ## Combine each point's density and average neighbor density into one
    #  SFrame, then compute the local outlier factor (LOF).
    scores = scores.sort('query_label')
    scores2 = scores2.sort('query_label')
    scores['anomaly_score'] = scores2['average_neighbor_density'] / scores['density']


    ## Add each point's neighborhood radius to the output SFrame.
    if threshold_distances is True:
        radii = radii.sort('query_label')
        scores['neighborhood_radius'] = radii['neighborhood_radius']


    ## Remove the extraneous columns from the output SFrame and format.
    scores = scores.remove_column('dist_sum')


    ## Substitute in the row labels.
    if label is None:
        row_label_name = 'row_id'
        scores = scores.rename({'query_label': row_label_name})

    else:
        row_label_name = label
        scores = scores.remove_column('query_label')
        col_names = scores.column_names()
        scores[row_label_name] = dataset[label]
        scores = scores[[row_label_name] + col_names]


    ## Post-processing and formatting
    state = {
        'nearest_neighbors_model': knn_model,
        'verbose': verbose,
        'threshold_distances': threshold_distances,
        'num_neighbors': num_neighbors,
        'num_examples': dataset.num_rows(),
        'distance': knn_model['distance'],
        'num_distance_components': knn_model['num_distance_components'],
        'features': knn_model['features'],
        'row_label_name': row_label_name,
        'num_features': knn_model['num_features'],
        'unpacked_features': knn_model['unpacked_features'],
        'num_unpacked_features': knn_model['num_unpacked_features'],
        'scores': scores,
        'training_time': _time.time() - start_time}

    model = LocalOutlierFactorModel(state)
    return model
Example #37
def create(dataset,
           target,
           model_name,
           env,
           features=None,
           validation_set='auto',
           verbose=True,
           **kwargs):

    _raise_error_if_not_sframe(dataset, "training dataset")

    # Create a validation set
    if isinstance(validation_set, str):
        if validation_set == 'auto':
            if dataset.num_rows() >= 100:
                if verbose:
                    print(
                        "PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.\n"
                        "          You can set ``validation_set=None`` to disable validation tracking.\n"
                    )
                dataset, validation_set = dataset.random_split(.95)
            else:
                validation_set = None
        else:
            raise TypeError('Unrecognized value for validation_set.')

    # Target
    target_sframe = _toolkits_select_columns(dataset, [target])

    # Features
    if features is None:
        features = dataset.column_names()
        features.remove(target)
    if not hasattr(features, '__iter__'):
        raise TypeError("Input 'features' must be a list.")
    invalid_features = [x for x in features if not isinstance(x, str)]
    if invalid_features:
        raise TypeError(
            "Invalid feature %s: Feature names must be of type str"
            % invalid_features[0])
    features_sframe = _toolkits_select_columns(dataset, features)

    options = {}
    _kwargs = {}
    for k in kwargs:
        _kwargs[k.lower()] = kwargs[k]
    options.update(_kwargs)
    options.update({
        'target': target_sframe,
        'features': features_sframe,
        'model_name': model_name
    })

    if validation_set is not None:

        if not isinstance(validation_set, _graphlab.SFrame):
            raise TypeError(
                "validation_set must be either 'auto' or an SFrame matching the training data."
            )

        # Attempt to append the two datasets together to check schema
        validation_set.head().append(dataset.head())

        options.update({
            'features_validation':
            _toolkits_select_columns(validation_set, features),
            'target_validation':
            _toolkits_select_columns(validation_set, [target])
        })

    from . import _dml
    dml_obj = _dml.run("distributed_supervised_train", model_name, options,
                       env)

    return dml_obj
    def predict(self, dataset, verbose=True):
        """
        Compute local outlier factors for new data. The LOF scores for new data
        instances are based on the neighborhood statistics for the data used
        when the model was created. Each new point is scored independently.

        Parameters
        ----------
        dataset : SFrame
            Dataset of new points to score with LOF against the training data
            already stored in the model.

        verbose : bool, optional
            If True, print progress updates and model details.

        Returns
        -------
        out : SArray
            LOF score for each new point. The output SArray is sorted to match
            the order of the 'dataset' input to this method.

        Examples
        --------
        >>> sf = graphlab.SFrame({'x0': [0., 1., 1., 0., 1., 0., 5.],
        ...                       'x1': [2., 1., 0., 1., 2., 1.5, 2.5]})
        >>> m = graphlab.local_outlier_factor.create(sf, num_neighbors=3)
        ...
        >>> sf_new = graphlab.SFrame({'x0': [0.5, 4.5],
        ...                           'x1': [1., 4.0]})
        >>> m.predict(sf_new)
        dtype: float
        Rows: 2
        [0.9317508614964032, 2.905646339288692]
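
        The returned scores can be attached back to the query SFrame, for
        example:

        >>> sf_new['lof_score'] = m.predict(sf_new)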
        """
        _mt._get_metric_tracker().track(
            'toolkit.anomaly_detection.local_outlier_factor.predict')

        ## Validate the input dataset
        _tkutl._raise_error_if_not_sframe(dataset, "dataset")
        _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

        num_neighbors = self.__proxy__['num_neighbors']

        ## Query the knn model with the new points.
        knn = self.__proxy__['nearest_neighbors_model'].query(dataset, k=num_neighbors, verbose=verbose)

        ## Join the reference data's neighborhood statistics to the nearest
        #  neighbors results.
        knn = knn.join(self.__proxy__['scores'], on={'reference_label': 'row_id'},
                       how='left')

        # Compute reachability distance for each new point and its
        # neighborhood.
        if self.__proxy__['threshold_distances'] is True:
            knn['distance'] = knn.apply(
                lambda x: x['distance'] \
                    if x['distance'] > x['neighborhood_radius'] \
                    else x['neighborhood_radius'])

        ## Find the sum of distances from each point to its neighborhood, then
        #  compute the "local reachability density" for each query point.
        scores = knn.groupby('query_label',
                             {'dist_sum': _gl.aggregate.SUM('distance')})

        scores['density'] = float(num_neighbors) / scores['dist_sum']


        ## Find the average density for each query point's neighbors.
        scores2 = knn.groupby('query_label',
                    {'average_neighbor_density': _gl.aggregate.AVG('density')})

        ## Join the point densities and average neighbor densities into a
        #  single SFrame and compute the local outlier factor.
        scores = scores.join(scores2, on='query_label')
        scores['anomaly_score'] = \
            scores['average_neighbor_density'] / scores['density']

        ## Remove extraneous columns and format.
        scores = scores.sort('query_label', ascending=True)
        return scores['anomaly_score']
    def extract_features(self, dataset):
        """
        For each example in the dataset, extract the leaf indices of
        each tree as features.

        For multiclass classification, each iteration of trees contributes
        #num_class leaf indices, one per class.

        The returned feature vectors can be used as input to train another
        supervised learning model such as a
        :py:class:`~graphlab.logistic_classifier.LogisticClassifier`,
        an :py:class:`~graphlab.svm_classifier.SVMClassifier`, or a
        :py:class:`~graphlab.neuralnet_classifier.NeuralNetClassifier`.

        Parameters
        ----------
        dataset : SFrame
            Dataset of new observations. Must include columns with the same
            names as the features used for model training, but does not require
            a target column. Additional columns are ignored.

        Returns
        -------
        out : SArray
            An SArray of dtype array.array containing extracted features.

        Examples
        --------
        >>> data =  graphlab.SFrame('http://s3.amazonaws.com/dato-datasets/regression/houses.csv')

        >>> # Regression Tree Models
        >>> model = graphlab.boosted_trees_regression.create(data,
        ...                           target='price',
        ...                           features=['bath', 'bedroom', 'size'])
        >>> data['boosted_tree_features'] = model.extract_features(data)
        >>> model = graphlab.random_forest_regression.create(data,
        ...                           target='price',
        ...                           features=['bath', 'bedroom', 'size'])
        >>> data['random_forest_features'] = model.extract_features(data)

        >>> # Classification Tree Models
        >>> data['is_expensive'] = data['price'] > 30000
        >>> model = graphlab.boosted_trees_classifier.create(data,
        ...                           target='is_expensive',
        ...                           features=['bath', 'bedroom', 'size'])
        >>> data['boosted_tree_features'] = model.extract_features(data)

        >>> model = graphlab.random_forest_classifier.create(data,
        ...                           target='is_expensive',
        ...                           features=['bath', 'bedroom', 'size'])
        >>> data['random_forest_features'] = model.extract_features(data)
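
        >>> # As noted above, the extracted leaf features can themselves be
        >>> # used to train another model. A minimal sketch with a logistic
        >>> # classifier on the columns created above:
        >>> m2 = graphlab.logistic_classifier.create(data,
        ...                           target='is_expensive',
        ...                           features=['boosted_tree_features'])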
        """
        metric_name = '.'.join([self.__module__, 'extract_features'])
        _mt._get_metric_tracker().track(metric_name)
        _raise_error_if_not_sframe(dataset, "dataset")
        options = dict()
        options.update({'model': self.__proxy__,
                        'model_name': self.__name__,
                        'dataset': dataset})
        target = _toolkits_main.run('supervised_learning_feature_extraction', options)
        return _map_unity_proxy_to_object(target['extracted'])
    def predict_topk(self, dataset, output_type="probability", k=3, missing_value_action='auto'):
        """
        Return top-k predictions for the ``dataset``, using the trained model.
        Predictions are returned as an SFrame with three columns: `row_id`,
        `class`, and `probability`, `margin`,  or `rank`, depending on the ``output_type``
        parameter. Input dataset size must be the same as for training of the model.

        Parameters
        ----------
        dataset : SFrame
            A dataset that has the same columns that were used during training.
            If the target column exists in ``dataset`` it will be ignored
            while making predictions.

        output_type : {'probability', 'rank', 'margin'}, optional
            Choose the return type of the prediction:

            - `probability`: Probability associated with each label in the prediction.
            - `rank`       : Rank associated with each label in the prediction.
            - `margin`     : Margin associated with each label in the prediction.

        k : int, optional
            Number of classes to return for each input example.

        missing_value_action : str, optional
            Action to perform when missing values are encountered. Can be
            one of:

            - 'auto': Default to 'impute'
            - 'impute': Proceed with evaluation by filling in the missing
              values with the mean of the training data. Missing
              values are also imputed if an entire column of data is
              missing during evaluation.
            - 'error': Do not proceed with evaluation and terminate with
              an error message.

        Returns
        -------
        out : SFrame
            An SFrame with model predictions.

        See Also
        --------
        predict, classify, evaluate

        Examples
        --------
        >>> pred = m.predict_topk(validation_data, k=3)
        >>> pred
        +--------+-------+-------------------+
        | row_id | class |   probability     |
        +--------+-------+-------------------+
        |   0    |   4   |   0.995623886585  |
        |   0    |   9   |  0.0038311756216  |
        |   0    |   7   | 0.000301006948575 |
        |   1    |   1   |   0.928708016872  |
        |   1    |   3   |  0.0440889261663  |
        |   1    |   2   |  0.0176190119237  |
        |   2    |   3   |   0.996967732906  |
        |   2    |   2   |  0.00151345680933 |
        |   2    |   7   | 0.000637513934635 |
        |   3    |   1   |   0.998070061207  |
        |  ...   |  ...  |        ...        |
        +--------+-------+-------------------+
        [35688 rows x 3 columns]
        """
        _mt._get_metric_tracker().track('toolkit.classifier.logistic_classifier.predict_topk')
        _check_categorical_option_type('output_type', output_type,
                                       ['rank', 'margin', 'probability'])
        _check_categorical_option_type('missing_value_action', missing_value_action,
                                       ['auto', 'impute', 'error'])
        if missing_value_action == 'auto':
            missing_value_action = 'impute'

        # Low latency path
        if isinstance(dataset, list):
            return _graphlab.extensions._fast_predict_topk(self.__proxy__, dataset,
                    output_type, missing_value_action, k)
        if isinstance(dataset, dict):
            return _graphlab.extensions._fast_predict_topk(self.__proxy__, [dataset],
                    output_type, missing_value_action, k)
        # Fast path
        _raise_error_if_not_sframe(dataset, "dataset")
        options = dict()
        if (missing_value_action == 'auto'):
            missing_value_action = _sl.select_default_missing_value_policy(
                                                              self, 'predict')
        options.update({'model': self.__proxy__,
                        'model_name': self.__name__,
                        'dataset': dataset,
                        'output_type': output_type,
                        'topk': k,
                        'missing_value_action': missing_value_action})
        target = _graphlab.toolkits._main.run(
                  'supervised_learning_predict_topk', options)
        return _map_unity_proxy_to_object(target['predicted'])
Example #41
    def predict_topk(self, dataset, output_type="probability", k=3):
        """
        Return top-k predictions for the ``dataset``, using the trained model.
        Predictions are returned as an SFrame with three columns: `row_id`,
        `class`, and `probability`, `rank`, or `score`, depending on the ``output_type``
        parameter. Input dataset size must be the same as for training of the
        model, except for images which are automatically resized.

        Parameters
        ----------
        dataset : SFrame
            Dataset of new observations. Must include columns with the same
            names as the features used for model training, but does not require
            a target column. Additional columns are ignored.

        output_type : {'probability', 'rank', 'score'}, optional
            Choose the return type of the prediction:

            - `rank`: outputs rank along with class label.
            - `probability`: outputs learned probability along with class label.
            - `score`: Same as probability

        k : int, optional
            Number of classes to return for each input example.

        Returns
        -------
        out : SFrame
            An SFrame with model predictions.

        See Also
        --------
        predict, classify, evaluate

        Examples
        --------
        >>> data = graphlab.SFrame('http://s3.amazonaws.com/dato-datasets/mnist/sframe/train')
        >>> training_data, validation_data = data.random_split(0.8)
        >>> net = graphlab.deeplearning.get_builtin_neuralnet('mnist')
        >>> m = graphlab.neuralnet_classifier.create(training_data,
        ...                                          target='label',
        ...                                          network=net,
        ...                                          max_iterations=3)
        ...
        >>> pred = m.predict_topk(validation_data, k=3)
        >>> pred
        +--------+-------+-------------------+
        | row_id | class |    probability    |
        +--------+-------+-------------------+
        |   0    |   4   |   0.995623886585  |
        |   0    |   9   |  0.0038311756216  |
        |   0    |   7   | 0.000301006948575 |
        |   1    |   1   |   0.928708016872  |
        |   1    |   3   |  0.0440889261663  |
        |   1    |   2   |  0.0176190119237  |
        |   2    |   3   |   0.996967732906  |
        |   2    |   2   |  0.00151345680933 |
        |   2    |   7   | 0.000637513934635 |
        |   3    |   1   |   0.998070061207  |
        |  ...   |  ...  |        ...        |
        +--------+-------+-------------------+
        [35688 rows x 3 columns]
        """
        _mt._get_metric_tracker().track('toolkit.classifier.neuralnet_classifier.predict_topk')
        _raise_error_if_not_sframe(dataset, "dataset")
        options = dict()
        options.update({'model': self.__proxy__,
                        'model_name': self.__name__,
                        'dataset': dataset,
                        'output_type': output_type,
                        'topk': k,
                        'missing_value_action': 'error'})
        target = _toolkits_main.run('supervised_learning_predict_topk', options)
        return _map_unity_proxy_to_object(target['predicted'])
def create(dataset, label=None, features=None, distance='auto', method='auto',
    composite_params=None, verbose=True, **kwargs):
    """
    Create a nearest neighbor model, which can be searched efficiently and
    quickly for the nearest neighbors of a query observation. If the `method`
    argument is specified as `auto`, the type of model is chosen automatically
    based on the type of data in `dataset`.

    Parameters
    ----------
    dataset : SFrame
        Reference data. If the features for each observation are numeric, they
        may be in separate columns of 'dataset' or a single column with lists of
        values. The features may also be in the form of a column of sparse
        vectors (i.e. dictionaries), with string keys and numeric values.

    label : string, optional
        Name of the SFrame column with row labels. If 'label' is not specified,
        row numbers are used to identify reference dataset rows when the model
        is queried.

    features : list[string], optional
        Name of the columns with features to use in computing distances between
        observations and the query points. 'None' (the default) indicates that
        all columns except the label should be used as features. Each column can
        be one of the following types:

        - *Numeric*: values of numeric type integer or float.

        - *Array*: list of numeric (integer or float) values. Each list element
          is treated as a separate variable in the model.

        - *Dictionary*: key-value pairs with numeric (integer or float) values.
          Each key indicates a separate variable in the model.

        - *String*: string values.

        Columns of type *list* are not supported. Convert them to array columns
        if all entries in the list are of numeric types. Please note: if
        `composite_params` is also specified, this parameter is ignored.

    distance : string or function, optional
        Name of the function that measures the distances between two
        observations. Please see the notes and references for detailed
        descriptions of the distances. Note that for sparse vectors, missing
        keys are assumed to have value 0.0. Please note: if `composite_params`
        is also specified, this parameter is ignored.

        - *auto* (default): the model chooses a reasonable distance based on the
          data types in 'dataset'. Columns of type str will be compared using
          levenshtein distance, columns of type dict use jaccard distance, and
          columns of type float, int, or list will be combined and use
          euclidean distance. The set of column-specific distances are
          aggregated into a single composite distance.

        - *squared_euclidean*: works only with the `brute_force` method because
          it is not a metric.

        - *euclidean*

        - *manhattan*

        - *jaccard*: works only with variables in a dictionary feature, where
          the keys are treated as a set and the values are ignored.

        - *weighted_jaccard*: like jaccard distance, works only with variables
          in a dictionary feature. For the weighted version of jaccard, however,
          the values of the dictionary are used to weight the contribution of
          each key. This is done by taking the minimum of the two values for
          each key in the numerator and the maximum of the two values in the
          denominator.

        - *cosine*: works only with the 'brute_force' method because it is not a
          true metric. Please see `Wikipedia
          <http://en.wikipedia.org/wiki/Cosine_similarity>`_ for more detail.

        - *dot_product*: works only with the 'brute_force' method because it is
          not a true metric.

        - *levenshtein*: for a single column of string inputs.

    method : {'auto', 'ball_tree', 'brute_force'}, optional
        Method for computing nearest neighbors. The options are:

        - *auto* (default): the method is chosen automatically, based on the
          type of data and the distance. If the distance is 'manhattan' or
          'euclidean' and the features are numeric or vectors of numeric values,
          then the 'ball_tree' method is used. Otherwise, the 'brute_force'
          method is used.

        - *ball_tree*: use a tree structure to find the k-closest neighbors to
          each query point. The ball tree model is slower to construct than the
          brute force model, but queries are faster than linear time. This
          method is not applicable for the cosine and dot product distances. See
          `Liu, et al (2004)
          <http://papers.nips.cc/paper/2666-an-investigation-of-practical-approximate-nearest-neighbor-algorithms>`_ for
          implementation details.

        - *brute_force*: compute the distance from a query point to all
          reference observations. There is no computation time for model
          creation with the brute force method (although the reference data is
          held in the model), but each query takes linear time.

    composite_params : list [list [list [string], string or function, float]]
        Multiple sets of features and corresponding distance functions can be
        used as inputs to a composite distance function. Each element of this
        composite is specified by a list in this argument. Each inner list must
        include a list of feature names, the name of a distance function, and a
        relative weight. See the examples and notes sections below. If
        `composite_params` is specified, any standalone `features`, `distance`,
        and `method` arguments are ignored. Keyword arguments are applied to
        each member of the composite distance computation.

    verbose: bool, optional
        If True, print progress updates and model details.

    **kwargs : optional
        Options for the distance function and query method.

        - *leaf_size*: for the ball tree method, the number of points in each
          leaf of the tree. The default is to use the max of 1,000 and n/(2^11),
          which ensures a maximum tree depth of 12. The default leaf size is
          indicated by a "0" in the
          :func:`~graphlab.nearest_neighbors.NearestNeighborsModel.get_default_options`
          method.

    Returns
    -------
    out : NearestNeighborsModel
        A structure for efficiently computing the nearest neighbors in 'dataset'
        of new query points.

    See Also
    --------
    NearestNeighborsModel.query

    Notes
    -----
    - If the features should be weighted equally in the distance calculations
      but are measured on different scales, it is important to standardize the
      features. One way to do this is to subtract the mean of each column and
      divide by the standard deviation.

    - Distance definitions. Suppose :math:`u` and :math:`v` are observations
      with :math:`d` variables each.

        - `squared_euclidean`
            .. math:: D(u, v) = \sum_i^d (u_i - v_i)^2

        - `euclidean`
            .. math:: D(u, v) = \\sqrt{\sum_i^d (u_i - v_i)^2}

        - `manhattan`
            .. math:: D(u, v) = \\sum_i^d |u_i - v_i|

        - `cosine`
            .. math::

                D(u, v) = 1 - \\frac{\sum_i^d u_i v_i}
                {\sqrt{\sum_i^d u_i^2}\sqrt{\sum_i^d v_i^2}}

        - `dot_product`
            .. math::

                D(u, v) = \\frac{1}{\sum_i^d u_i v_i}

    - For the jaccard distances, suppose :math:`S` and :math:`T` are the sets of
      keys from two observations' dictionaries. For the weighted version of
      jaccard distance, suppose :math:`S_k` and :math:`T_k` are the values
      associated with key :math:`k` in the respective dictionaries. Typically
      these values are counts, i.e. of words or n-grams.

        - `jaccard`
            .. math:: D(S, T) = 1 - \\frac{|S \cap T|}{|S \cup T|}

        - `weighted_jaccard`
            .. math::

                D(S, T) = 1 - \\frac{\sum_{k \in S \cup T} \min\{S_k, T_k\}}
                {\sum_{k \in S \cup T} \max\{S_k, T_k\}}

    - Levenshtein distance is a type of edit distance for string types. The
      distance is the number of insertion, deletion, and substitution edits
      needed to transform string :math:`A` into string :math:`B`.

        .. math::

            D(A, B) = d(|A|, |B|)

        .. math ::

            d(i, j) = \max(i, j), \quad \mathrm{if } \min(i, j) = 0

        .. math ::

            d(i, j) = \min \Big \{d(i-1, j) + 1, \ d(i, j-1) + 1, \ d(i-1, j-1) + I(A_i \\neq B_j) \Big \}, \quad \mathrm{else}

    - Composite distances are simply weighted sums of the above distances. The
      set of features input to each component distance may vary, and the weight
      on each component acts as a multiplier before each of the component
      distances is summed. For example, if ``composite_params`` is set to

      ``[[['X1', 'X2'], 'euclidean', 2], [['X2', 'X3'], 'manhattan', 3]]``,

      then the overall distance computation for rows :math:`a` and :math:`b` is:

        .. math::

            D(a, b) = 2 * D_{euclidean}(a[[X1, X2]], b[[X1, X2]]) +
            3 * D_{manhattan}(a[[X2, X3]], b[[X2, X3]])
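
      As a quick sanity check of this weighting, the same composite can be
      evaluated by hand with plain Python (no toolkit calls; the values are
      only illustrative):

      >>> a = {'X1': 1., 'X2': 2., 'X3': 0.}
      >>> b = {'X1': 4., 'X2': 6., 'X3': 3.}
      >>> euclid = ((a['X1'] - b['X1'])**2 + (a['X2'] - b['X2'])**2) ** 0.5  # 5.0
      >>> manhat = abs(a['X2'] - b['X2']) + abs(a['X3'] - b['X3'])           # 7.0
      >>> 2. * euclid + 3. * manhat
      31.0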

    References
    ----------
    - `Wikipedia - nearest neighbor
      search <http://en.wikipedia.org/wiki/Nearest_neighbor_search>`_

    - `Wikipedia - ball tree <http://en.wikipedia.org/wiki/Ball_tree>`_

    - Ball tree implementation: Liu, T., et al. (2004) `An Investigation of
      Practical Approximate Nearest Neighbor Algorithms
      <http://papers.nips.cc/paper/2666-an-investigation-of-practical-approximate-nearest-neighbor-algorithms>`_. Advances in Neural
      Information Processing Systems pp. 825-832.

    - `Wikipedia - Jaccard distance
      <http://en.wikipedia.org/wiki/Jaccard_index>`_

    - Weighted Jaccard distance: Chierichetti, F., et al. (2010) `Finding the
      Jaccard Median
      <http://theory.stanford.edu/~sergei/papers/soda10-jaccard.pdf>`_.
      Proceedings of the Twenty-First Annual ACM-SIAM Symposium on Discrete
      Algorithms. Society for Industrial and Applied Mathematics.

    - `Wikipedia - Cosine distance
      <http://en.wikipedia.org/wiki/Cosine_similarity>`_

    - `Wikipedia - Levenshtein distance
      <http://en.wikipedia.org/wiki/Levenshtein_distance>`_

    Examples
    --------
    Construct a nearest neighbors model with automatically determined method and
    distance:

    >>> sf = graphlab.SFrame({'X1': [0.98, 0.62, 0.11],
    ...                       'X2': [0.69, 0.58, 0.36],
    ...                       'str_feature': ['cat', 'dog', 'fossa']})
    >>> model = graphlab.nearest_neighbors.create(sf, features=['X1', 'X2'])

    For datasets with a large number of rows and up to about 100 variables, the
    ball tree method often leads to much faster queries.

    >>> model = graphlab.nearest_neighbors.create(sf, features=['X1', 'X2'],
    ...                                           method='ball_tree')

    Often the final determination of a neighbor is based on several distance
    computations over different sets of features. Each part of this composite
    distance may have a different relative weight.

    >>> model = graphlab.nearest_neighbors.create(sf, composite_params=[
    ...                                          [['X1', 'X2'], 'euclidean', 2.],
    ...                                          [['str_feature'], 'levenshtein', 3.]])
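
    Once a model is constructed, the nearest reference rows for new points are
    retrieved with :func:`NearestNeighborsModel.query`; a minimal usage sketch:

    >>> knn = model.query(sf, k=2)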
    """

    ## Validate the 'dataset' input
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")


    ## Exclude inappropriate combinations of method and distance
    if method == 'ball_tree' and (distance == 'cosine'
                                  or distance == _graphlab.distances.cosine
                                  or distance == 'dot_product'
                                  or distance == _graphlab.distances.dot_product):
        raise TypeError("The ball tree method does not work with 'cosine' " +\
                        "or 'dot_product' distance. Please use the 'brute_force' " +\
                        "method for these distances.")


    ## Initial validation and processing of the label
    if label is None:
        _label = '__id'

        try:
            _dataset = dataset.add_row_number(column_name=_label)
        except:
            print "Tried to add a default label column '{}' ".format(_label) +\
                  "but a column by this name already exists. Using the " + \
                  "existing column as the label column."
            _dataset = dataset

    else:
        if not label in dataset.column_names():
            raise ValueError(
                "Input 'label' must be a string matching the name of a " +\
                "column in the reference SFrame 'dataset'.")

        if not dataset[label].dtype() == str and not dataset[label].dtype() == int:
            raise TypeError("The label column must contain integers or strings.")

        _label = label
        _dataset = dataset

    sf_label = _tkutl._toolkits_select_columns(_dataset, [_label])


    ## Clean the method options and create the options dictionary
    if len(kwargs) > 0:
        _method_options = {k.lower(): v for k, v in kwargs.items()}
    else:
        _method_options = {}



    ## If composite inputs aren't specified, formulate the standalone inputs as a
    #  composite input for code simplicity. If the standalone input doesn't
    #  specify features, choose all the features and make a set of distance
    #  components based on feature type.
    if composite_params is None:

        ## If no features are specified, use them all
        if features is None:
            _features = [x for x in _dataset.column_names() if x != _label]
        else:
            _features = features[:]

        ## If the distance argument is 'auto', turn it into a list of distance
        #  components by choosing an automatic distance for each feature based
        #  on its type.
        if distance == 'auto':
            _composites = choose_auto_distance(_features,
                                               dataset.column_names(),
                                               dataset.column_types())

        else:
            _composites = [[_features, distance, 1]]


    # Ignore automatically generated components if components have been provided
    else:
        if distance != 'auto':
            raise ValueError(
                "Either the 'distance' parameter or the 'composite_params' " +\
                "parameter may be specified, but not both.")

        if features is not None:
            raise ValueError(
                "Either the 'features' parameter or the 'composite_params' " +\
                "parameter may be specified, but not both.")

        if len(composite_params) == 0:
            raise ValueError(
                "'composite_params' was specified as an empty list. If " +\
                "specified, this parameter must contain at least one distance " +\
                "component, which is a list containing three elements: a list " +\
                "of feature names, a distance name or function, and a relative " +\
                "weight.")

        _composites = copy.deepcopy(composite_params)



    ## Clean the list of features in each component of the composite inputs, and
    #  compile the union of the lists of features.
    all_features = []

    for i in range(len(_composites)):

        if len(_composites[i]) != 3:
            raise ValueError("Each element of 'composite_params' must be a " +\
                             "list with three members.")

        feature_names = _composites[i][0]
        if len(feature_names) == 0:
            raise ValueError("An empty list of features cannot be passed " +\
                             "as part of a composite distance function.")

        # set of features must be iterable
        _tkutl._raise_error_if_not_iterable(feature_names, "features")

        # feature names must be strings
        if not all([isinstance(x, str) for x in feature_names]):
            raise TypeError("Input 'features' must contain only strings.")

        # remove the label name from all of the features lists
        feature_names = [x for x in feature_names if x != _label]

        # ensure that string features are in single columns
        if len(feature_names) > 1 and any([_dataset[x].dtype() is str for x in feature_names]):
            raise ValueError(                             
                "Multiple features have been entered, one of which is of string " +\
                "type. If the input features for any distance component contain a " +\
                "string column, that must be the only column for that component.")

        # ensure that relative weights are integers or floats
        if not isinstance(_composites[i][2], (int, float)):
            raise ValueError(
                "The weight of each distance component must be a single " +\
                "integer or a float value.")

        # combine all features into a big list
        _composites[i][0] = feature_names
        all_features += feature_names

        # convert distance strings to distance functions
        temp_dist = _composites[i][1]
        if isinstance(temp_dist, str):
            _composites[i][1] = _graphlab.util._get_distance(temp_dist)


    # Pull out the relevant features from the input dataset (the union of
    # features over all distance components)
    all_features = list(set(all_features))
    sf_features = _tkutl._toolkits_select_columns(_dataset, all_features)


    ## Decide which method to use
    ## - If more than one distance component (specified either directly or
    #  generated automatically because distance set to 'auto'), then do brute
    #  force.
    if len(_composites) > 1:
        _method = 'brute_force'

        if method == 'ball_tree':
            print "Defaulting to brute force instead of ball tree because " +\
                "there are multiple distance components."

    else:

        if method == 'auto':

            # get the total number of variables. Assume the number of elements in
            # array type columns does not change
            num_variables = sum([len(x) if hasattr(x, '__iter__') else 1
                for x in sf_features[0].itervalues()])

            # flag if all the features in the single composite are of numeric
            # type.
            numeric_type_flag = all([x in [int, float, list, array.array]
                for x in sf_features.column_types()])

            ## Conditions necessary for ball tree to work and be worth it
            if ((_composites[0][1] in ['euclidean',
                                       'manhattan',
                                       _graphlab.distances.euclidean,
                                       _graphlab.distances.manhattan])
                    and numeric_type_flag is True
                    and num_variables <= 100):
                _method = 'ball_tree'

            else:
                _method = 'brute_force'

        else:
            _method = method


    ## Pick the right model name for the method
    if _method == 'ball_tree':
        model_name = 'nearest_neighbors_ball_tree'
        _mt._get_metric_tracker().track('toolkit.nearest_neighbors_balltree.create')

    elif _method == 'brute_force':
        model_name = 'nearest_neighbors_brute_force'
        _mt._get_metric_tracker().track('toolkit.nearest_neighbors_brute.create')

    else:
        raise ValueError("Method must be 'brute_force', 'ball_tree', or 'auto'")


    ## Package the model options
    opts = {}
    opts.update(_method_options)
    opts.update(
        {'model_name': model_name,
        'sf_label': sf_label,
        'sf_features': sf_features,
        'composite_params': _composites})

    ## Construct the nearest neighbors model
    if verbose:
        print "Starting model construction..."

    result = _graphlab.extensions._nearest_neighbors.train(opts)
    model_proxy = result['model']
    model = NearestNeighborsModel(model_proxy)

    if verbose:
        model.summary()
        print

    return model
Example #43
    def predict_topk(self,
                     dataset,
                     output_type="probability",
                     k=3,
                     missing_value_action='auto'):
        """
        Return top-k predictions for the ``dataset``, using the trained model.
        Predictions are returned as an SFrame with three columns: `row_id`,
        `class`, and `probability`, `margin`,  or `rank`, depending on the ``output_type``
        parameter. Input dataset size must be the same as for training of the model.

        Parameters
        ----------
        dataset : SFrame
            A dataset that has the same columns that were used during training.
            If the target column exists in ``dataset`` it will be ignored
            while making predictions.

        output_type : {'probability', 'rank', 'margin'}, optional
            Choose the return type of the prediction:

            - `probability`: Probability associated with each label in the prediction.
            - `rank`       : Rank associated with each label in the prediction.
            - `margin`     : Margin associated with each label in the prediction.

        k : int, optional
            Number of classes to return for each input example.

        missing_value_action : str, optional
            Action to perform when missing values are encountered. Can be
            one of:

            - 'auto': By default, the model treats missing values as is.
            - 'impute': Proceed with evaluation by filling in the missing
              values with the mean of the training data. Missing
              values are also imputed if an entire column of data is
              missing during evaluation.
            - 'error': Do not proceed with evaluation and terminate with
              an error message.

        Returns
        -------
        out : SFrame
            An SFrame with model predictions.

        See Also
        --------
        predict, classify, evaluate

        Examples
        --------
        >>> pred = m.predict_topk(validation_data, k=3)
        >>> pred
        +--------+-------+-------------------+
        | row_id | class |   probability     |
        +--------+-------+-------------------+
        |   0    |   4   |   0.995623886585  |
        |   0    |   9   |  0.0038311756216  |
        |   0    |   7   | 0.000301006948575 |
        |   1    |   1   |   0.928708016872  |
        |   1    |   3   |  0.0440889261663  |
        |   1    |   2   |  0.0176190119237  |
        |   2    |   3   |   0.996967732906  |
        |   2    |   2   |  0.00151345680933 |
        |   2    |   7   | 0.000637513934635 |
        |   3    |   1   |   0.998070061207  |
        |  ...   |  ...  |        ...        |
        +--------+-------+-------------------+
        [35688 rows x 3 columns]
        """
        _mt._get_metric_tracker().track(
            'toolkit.classifier.decision_tree_classifier.predict_topk')
        _check_categorical_option_type('output_type', output_type,
                                       ['rank', 'margin', 'probability'])
        if missing_value_action == 'auto':
            missing_value_action = _sl.select_default_missing_value_policy(
                self, 'predict')

        # Low latency path
        if isinstance(dataset, list):
            return _graphlab.extensions._fast_predict_topk(
                self.__proxy__, dataset, output_type, missing_value_action, k)
        if isinstance(dataset, dict):
            return _graphlab.extensions._fast_predict_topk(
                self.__proxy__, [dataset], output_type, missing_value_action,
                k)
        # Fast path
        _raise_error_if_not_sframe(dataset, "dataset")
        options = dict()
        options.update({
            'model': self.__proxy__,
            'model_name': self.__name__,
            'dataset': dataset,
            'output_type': output_type,
            'topk': k,
            'missing_value_action': missing_value_action
        })
        target = _graphlab.toolkits._main.run(
            'supervised_learning_predict_topk', options)
        return _map_unity_proxy_to_object(target['predicted'])
    def extract_features(self, dataset, layer_id=None):
        """
        Takes an input dataset, propagates each example through the network,
        and returns an SArray of dense feature vectors, each of which is the
        concatenation of all the hidden unit values at layer[layer_id]. These
        feature vectors can be used as input to train another classifier such
        as a :py:class:`~graphlab.logistic_classifier.LogisticClassifier`, an
        :py:class:`~graphlab.svm_classifier.SVMClassifier`, another
        :py:class:`~graphlab.neuralnet_classifier.NeuralNetClassifier`, or a
        :py:class:`~graphlab.boosted_trees_classifier.BoostedTreesClassifier`.
        Input dataset size must be the same as for the training of the model,
        except for images, which are automatically resized.


        We are also releasing a pre-trained model for ImageNet, as described by
        Alex Krizhevsky et. al. It is located at
        https://static.turi.com/products/graphlab-create/resources/models/python2.7/imagenet_model_iter45 .
        Using it requires 256 x 256 x 3 images.
        Please see Examples and References for more.


        Parameters
        ----------
        dataset : SFrame
            Dataset of new observations. Must include columns with the same
            names as the features used for model training, but does not require
            a target column. Additional columns are ignored.

        layer_id : int , optional
            The index of the layer in neuralnet at which the activations are
            taken to be a dense feature vector. Must be a fully-connected layer.
            Default is None, in which case the layer before the connection
            layer to the output is used.


        Returns
        -------
        out : SArray
            An SArray of dtype array.array containing extracted features.

        See Also
        ------------
        graphlab.deeplearning.layers

        References
        ----------
        - Krizhevsky, Alex, Ilya Sutskever, and Geoffrey E. Hinton. "Imagenet
          classification with deep convolutional neural networks." Advances in
          neural information processing systems. 2012.

        Examples
        --------
        >>> data = graphlab.SFrame('https://static.turi.com/datasets/mnist/sframe/train6k')
        >>> net = graphlab.deeplearning.get_builtin_neuralnet('mnist')
        >>> m = graphlab.neuralnet_classifier.create(data,
        ...                                          target='label',
        ...                                          network=net,
        ...                                          max_iterations=3)
        >>> # Now, let's extract features from the last layer
        >>> data['features'] = m.extract_features(data)
        >>> # Now, let's build a new classifier on top of extracted features
        >>> m = graphlab.classifier.create(data,
        ...                                          features = ['features'],
        ...                                          target='label')

        Now, let's see how to load the ImageNet model, and use it for extracting
        features after resizing the data:

        >>> imagenet_model = graphlab.load_model('https://static.turi.com/products/graphlab-create/resources/models/python2.7/imagenet_model_iter45')
        >>> data['image'] = graphlab.image_analysis.resize(data['image'], 256, 256, 3, decode=True)
        >>> data['imagenet_features'] = imagenet_model.extract_features(data)

        """
        _mt._get_metric_tracker().track(
            'toolkit.classifier.neuralnet_classifier.extract_features')
        _raise_error_if_not_sframe(dataset, "dataset")
        options = dict()

        net = self.get('network').layers
        network_size = len(net) - 1
        if layer_id is None:
            if net[network_size]._type == "CONNECTION":
                layer_id = network_size - 1
            else:
                layer_id = network_size - 2
        _numeric_param_check_range("layer_id", layer_id, 0, network_size)

        conv2flat = False
        for i in range(0, layer_id + 1):
            if net[i]._type == "CONNECTION" or net[i]._type == "TRANSITION":
                conv2flat = True

        if not conv2flat:
            raise ValueError(
                "Features must be extracted from either a network "
                "with non-image input or a layer after a FlattenLayer. "
                "Try extracting features from layer following a FlattenLayer.")

        options.update({
            'model': self.__proxy__,
            'model_name': self.__name__,
            'dataset': dataset,
            'missing_value_action': "error",
            'layer_id': layer_id
        })
        target = _toolkits_main.run('supervised_learning_feature_extraction',
                                    options)
        return _map_unity_proxy_to_object(target['extracted'])
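# --- Illustrative sketch (not part of the original source) ---
# A hedged restatement of the default layer selection above as a standalone
# helper: pick the fully-connected layer just before the output connection.
# Assumes a trained NeuralNetClassifier exposing the same 'network' field used
# in extract_features above.
def _default_feature_layer_id(model):
    layers = model.get('network').layers
    last = len(layers) - 1
    # Mirror extract_features: step back one layer if the final layer is a
    # CONNECTION layer, otherwise two.
    return last - 1 if layers[last]._type == "CONNECTION" else last - 2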
Example #45
    def classify(self, dataset, max_neighbors=10, radius=None, verbose=True):
        """
        Return the predicted class for each observation in *dataset*. This
        prediction is made based on the closest neighbors stored in the nearest
        neighbors classifier model.

        Parameters
        ----------
        dataset : SFrame
            Dataset of new observations. Must include columns with the same
            names as the features used for model training, but does not require
            a target column. Additional columns are ignored.

        verbose : bool, optional
            If True, print progress updates.

        max_neighbors : int, optional
            Maximum number of neighbors to consider for each point.

        radius : float, optional
            Maximum distance from each point to a neighbor in the reference
            dataset.

        Returns
        -------
        out : SFrame
            An SFrame with model predictions. The first column is the most
            likely class according to the model, and the second column is the
            predicted probability for that class.

        See Also
        --------
        create, predict, predict_topk

        Notes
        -----
        - If the 'radius' parameter is small, it is possible that a query point
          has no qualified neighbors in the training dataset. In this case, the
          resulting class and probability for that query are 'None' in the
          SFrame output by this method. If the target column in the training
          dataset has missing values, these predictions will be ambiguous.

        - Ties between predicted classes are broken randomly.

        Examples
        --------
        >>> sf_train = graphlab.SFrame({'species': ['cat', 'dog', 'fossa', 'dog'],
        ...                             'height': [9, 25, 20, 23],
        ...                             'weight': [13, 28, 33, 22]})
        ...
        >>> sf_new = graphlab.SFrame({'height': [26, 19],
        ...                           'weight': [25, 35]})
        ...
        >>> m = graphlab.nearest_neighbor_classifier.create(sf_train, target='species')
        >>> ystar = m.classify(sf_new, max_neighbors=2)
        >>> print ystar
        +-------+-------------+
        | class | probability |
        +-------+-------------+
        |  dog  |     1.0     |
        | fossa |     0.5     |
        +-------+-------------+
        """
        _mt._get_metric_tracker().track(
            'toolkit.classifier.nearest_neighbor_classifier.classify')

        ## Validate the query 'dataset'. Note that the 'max_neighbors' and
        #  'radius' parameters are validated by the nearest neighbor model's
        #  query method.
        _raise_error_if_not_sframe(dataset, "dataset")
        _raise_error_if_sframe_empty(dataset, "dataset")
        n_query = dataset.num_rows()

        ## Validate neighborhood parameters 'max_neighbors'.
        # - NOTE: when the parameter name is changed in nearest neighbors, the
        #   query call will do this itself, and this block can be removed.
        if max_neighbors is not None:
            if not isinstance(max_neighbors, int):
                raise ValueError("Input 'max_neighbors' must be an integer.")

            if max_neighbors <= 0:
                raise ValueError(
                    "Input 'max_neighbors' must be larger than 0.")

        ## Find the nearest neighbors for each query and count the number of
        #  votes for each class.
        knn = self._knn_model.query(dataset,
                                    k=max_neighbors,
                                    radius=radius,
                                    verbose=verbose)

        ## If there are *no* results for *any* query make an SFrame of nothing.
        if knn.num_rows() == 0:
            ystar = _gl.SFrame({
                'class':
                _gl.SArray([None] * n_query, self._target_type),
                'probability':
                _gl.SArray([None] * n_query, float)
            })

        else:
            ## Find the class with the most votes for each query and postprocess.
            grp = knn.groupby(['query_label', 'reference_label'],
                              _gl.aggregate.COUNT)

            ystar = grp.groupby(
                'query_label', {
                    'class': _gl.aggregate.ARGMAX('Count', 'reference_label'),
                    'max_votes': _gl.aggregate.MAX('Count'),
                    'total_votes': _gl.aggregate.SUM('Count')
                })

            ystar['probability'] = ystar['max_votes'] / ystar['total_votes']

            ## Fill in 'None' for query points that don't have any near neighbors.
            row_ids = _gl.SFrame({'query_label': range(n_query)})
            ystar = ystar.join(row_ids, how='right')

            ## Sort by row number (because row number is not returned) and return
            ystar = ystar.sort('query_label', ascending=True)
            ystar = ystar[['class', 'probability']]

        return ystar
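# --- Illustrative sketch (not part of the original source) ---
# The vote-counting step above, replayed on a tiny hand-made "knn" result so
# the groupby/ARGMAX logic is easier to follow.  Column names mirror the
# nearest neighbors query output used in classify.
import graphlab as gl

_knn = gl.SFrame({'query_label':     [0, 0, 0, 1, 1, 1],
                  'reference_label': ['dog', 'dog', 'cat', 'cat', 'cat', 'dog']})

_grp = _knn.groupby(['query_label', 'reference_label'], gl.aggregate.COUNT)
_ystar = _grp.groupby('query_label',
                      {'class': gl.aggregate.ARGMAX('Count', 'reference_label'),
                       'max_votes': gl.aggregate.MAX('Count'),
                       'total_votes': gl.aggregate.SUM('Count')})
_ystar['probability'] = _ystar['max_votes'] / _ystar['total_votes']
print(_ystar[['query_label', 'class', 'probability']])
# Expected: query 0 -> 'dog' (2/3 of votes), query 1 -> 'cat' (2/3 of votes).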
def create(dataset, item, features=None, min_support=1, max_patterns=100,
           min_length=1):
    """
    Create a :class:`~graphlab.frequent_pattern_mining.FrequentPatternMiner` to
    extract the set of frequently occurring items in an event-series.

    Parameters
    ----------

    dataset : SFrame
        Dataset for training the model.

    item: string
        Name of the column containing the item. The values in this column must
        be of string or integer type.

    features : list[string], optional
        Names of the columns containing features. 'None' (the default) indicates
        that all columns except the target variable should be used as features.

        The feature columns are the ones that together identify a unique
        transaction ID for the item.

    min_support : int, optional
        The minimum number of times that a pattern must occur in order for it
        to be considered `frequent`.

    max_patterns : int, optional
        The maximum number of frequent patterns to be mined.

    min_length: int, optional
        The minimum size (number of elements in the set) of each pattern being
        mined.

    Returns
    -------
    out : FrequentPatternMiner
        A trained model of type
        :class:`~graphlab.frequent_pattern_mining.FrequentPatternMiner`.

    Notes
    -----
    Frequent closed itemsets are mined using the `top-k FP growth` algorithm.
    Mining occurs until the top max_patterns closed itemsets of size min_length
    and support greater than min_support are found.

    See Also
    --------
    FrequentPatternMiner

    References
    ----------

    - `Wikipedia - Association rule learning
      <https://en.wikipedia.org/wiki/Association_rule_learning>`_
    - Han, Jiawei, et al. "Mining top-k frequent closed patterns without minimum
      support." Proceedings of the 2002 IEEE International Conference on Data
      Mining (ICDM 2002).
    - Wang, Jianyong, et al. "TFP: An efficient algorithm for mining top-k
      frequent closed itemsets." Knowledge and Data Engineering, IEEE Transactions
      on 17.5 (2005): 652-663.

    Examples
    --------

    .. sourcecode:: python

        >>> import graphlab as gl
        >>> bakery_sf = gl.SFrame("http://s3.amazonaws.com/dato-datasets/bakery.sf")
        >>> bakery_sf
        Data:
        +---------+-------------+-------+----------+----------+-----------------+
        | Receipt |   SaleDate  | EmpId | StoreNum | Quantity |       Item      |
        +---------+-------------+-------+----------+----------+-----------------+
        |    1    | 12-JAN-2000 |   20  |    20    |    1     |  GanacheCookie  |
        |    1    | 12-JAN-2000 |   20  |    20    |    5     |     ApplePie    |
        |    2    | 15-JAN-2000 |   35  |    10    |    1     |   CoffeeEclair  |
        |    2    | 15-JAN-2000 |   35  |    10    |    3     |     ApplePie    |
        |    2    | 15-JAN-2000 |   35  |    10    |    4     |   AlmondTwist   |
        |    2    | 15-JAN-2000 |   35  |    10    |    3     |    HotCoffee    |
        |    3    |  8-JAN-2000 |   13  |    13    |    5     |    OperaCake    |
        |    3    |  8-JAN-2000 |   13  |    13    |    3     |   OrangeJuice   |
        |    3    |  8-JAN-2000 |   13  |    13    |    3     | CheeseCroissant |
        |    4    | 24-JAN-2000 |   16  |    16    |    1     |   TruffleCake   |
        +---------+-------------+-------+----------+----------+-----------------+
        [266209 rows x 6 columns]

        >>> model = gl.frequent_pattern_mining.create(bakery_sf, 'Item',
                         features=['Receipt'], min_length=4, max_patterns=500)
        Model fields
        ------------
        Min support                   : 1
        Max patterns                  : 500
        Min pattern length            : 4

        Most frequent patterns
        ----------------------
        ['CoffeeEclair', 'HotCoffee', 'AlmondTwist', 'ApplePie']: 1704
        ['LemonLemonade', 'LemonCookie', 'RaspberryLemonade', 'RaspberryCookie']: 1565
        ['LemonLemonade', 'LemonCookie', 'RaspberryLemonade', 'GreenTea']: 1290
        ['LemonLemonade', 'RaspberryLemonade', 'RaspberryCookie', 'GreenTea']: 1289
        ['LemonLemonade', 'LemonCookie', 'RaspberryCookie', 'GreenTea']: 1279
        ['LemonCookie', 'RaspberryLemonade', 'RaspberryCookie', 'GreenTea']: 1279
        ['AppleTart', 'AppleDanish', 'AppleCroissant', 'CherrySoda']: 1253
        ['LemonLemonade', 'LemonCookie', 'RaspberryLemonade', 'RaspberryCookie', 'GreenTea']: 1221
        ['CherryTart', 'ApricotDanish', 'OperaCake', 'ApricotTart']: 61
        ['CherryTart', 'ApricotDanish', 'OperaCake', 'RaspberryLemonade']: 55
    """
    _mt._get_metric_tracker().track('toolkit.frequent_pattern_mining.create')

    # Type checking.
    _raise_error_if_not_sframe(dataset, "dataset")
    _raise_error_if_not_of_type(item, str, "item")
    _raise_error_if_not_of_type(features, [list, _types.NoneType], "features")
    _raise_error_if_not_of_type(min_support, [int, float], "min_support")
    _raise_error_if_not_of_type(max_patterns, [int, float], "max_patterns")
    _raise_error_if_not_of_type(min_length, [int, float], "min_length")

    # Value checking.
    column_names = dataset.column_names()

    # If features is None, then use all other column names than item
    if features is None:
        features = column_names
        features.remove(item)

    # Call the C++ create function.
    proxy = _gl.extensions._pattern_mining_create(
            dataset, item, features, min_support, max_patterns, min_length)
    return FrequentPatternMiner(proxy)
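# --- Illustrative sketch (not part of the original source) ---
# A plain-Python illustration of what "frequent" means in the docstring above:
# an itemset of size >= min_length that occurs in at least min_support
# transactions.  The toolkit itself uses top-k FP-growth; this brute-force
# enumeration only makes the parameters concrete on toy data.
from itertools import combinations
from collections import Counter

def _frequent_itemsets(transactions, min_support=2, min_length=2, max_patterns=10):
    counts = Counter()
    for items in transactions:
        items = set(items)
        for size in range(min_length, len(items) + 1):
            for subset in combinations(sorted(items), size):
                counts[subset] += 1
    frequent = [(set(s), c) for s, c in counts.items() if c >= min_support]
    frequent.sort(key=lambda sc: -sc[1])
    return frequent[:max_patterns]

_transactions = [['HotCoffee', 'ApplePie'],
                 ['HotCoffee', 'ApplePie', 'AlmondTwist'],
                 ['HotCoffee', 'AlmondTwist'],
                 ['ApplePie', 'AlmondTwist', 'HotCoffee']]
# With min_support=3 only the two pairs occurring in three transactions survive.
print(_frequent_itemsets(_transactions, min_support=3, min_length=2))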
Example #47
def create(dataset, target, features=None, distance=None, verbose=True):
    """
    Create a
    :class:`~graphlab.nearest_neighbor_classifier.NearestNeighborClassifier`
    model. This model predicts the class of a query instance by finding the most
    common class among the query's nearest neighbors.

    .. warning::

        The 'dot_product' distance is deprecated and will be removed in future
        versions of GraphLab Create. Please use 'transformed_dot_product'
        distance instead, although note that this is more than a name change; it
        is a *different* transformation of the dot product of two vectors.
        Please see the distances module documentation for more details.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    target : str
        Name of the column containing the target variable. The values in this
        column must be of string or integer type.

    features : list[str], optional
        Names of the columns with features to use in comparing records. 'None'
        (the default) indicates that all columns except the target variable
        should be used. Please note: if `distance` is specified as a composite
        distance, then that parameter controls which features are used in the
        model. Each column can be one of the following types:

        - *Numeric*: values of numeric type integer or float.

        - *Array*: array of numeric (integer or float) values. Each array
          element is treated as a separate variable in the model.

        - *Dictionary*: key-value pairs with numeric (integer or float) values.
          Each key indicates a separate variable in the model.

        - *String*: string values.

    distance : str, function, or list[list], optional
        Function to measure the distance between any two input data rows. This
        may be one of three types:

        - *String*: the name of a standard distance function. One of
          'euclidean', 'squared_euclidean', 'manhattan', 'levenshtein',
          'jaccard', 'weighted_jaccard', 'cosine', 'dot_product' (deprecated),
          or 'transformed_dot_product'.

        - *Function*: a function handle from the
          :mod:`~graphlab.toolkits.distances` module.

        - *Composite distance*: the weighted sum of several standard distance
          functions applied to various features. This is specified as a list of
          distance components, each of which is itself a list containing three
          items:

          1. list or tuple of feature names (str)

          2. standard distance name (str)

          3. scaling factor (int or float)

        For more information about GraphLab Create distance functions, please
        see the :py:mod:`~graphlab.toolkits.distances` module.

        For sparse vectors, missing keys are assumed to have value 0.0.

        If 'distance' is left unspecified or set to 'auto', a composite distance
        is constructed automatically based on feature types.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : NearestNeighborClassifier
        A trained model of type
        :class:`~graphlab.nearest_neighbor_classifier.NearestNeighborClassifier`.

    See Also
    --------
    NearestNeighborClassifier
    graphlab.toolkits.nearest_neighbors
    graphlab.toolkits.distances

    References
    ----------
    - `Wikipedia - nearest neighbors classifier
      <http://en.wikipedia.org/wiki/Nearest_neighbour_classifiers>`_

    - Hastie, T., Tibshirani, R., Friedman, J. (2009). `The Elements of
      Statistical Learning <http://statweb.stanford.edu/~tibs/ElemStatLearn/>`_.
      Vol. 2. New York. Springer. pp. 463-481.

    Examples
    --------
    >>> sf = graphlab.SFrame({'species': ['cat', 'dog', 'fossa', 'dog'],
    ...                       'height': [9, 25, 20, 23],
    ...                       'weight': [13, 28, 33, 22]})
    ...
    >>> model = graphlab.nearest_neighbor_classifier.create(sf, target='species')

    As with the nearest neighbors toolkit, the nearest neighbor classifier
    accepts composite distance functions.

    >>> my_dist = [[('height', 'weight'), 'euclidean', 2.7],
    ...            [('height', 'weight'), 'manhattan', 1.6]]
    ...
    >>> model = graphlab.nearest_neighbor_classifier.create(sf, target='species',
    ...                                                     distance=my_dist)
    """

    ## Set up
    ## ------
    _mt._get_metric_tracker().track(
        'toolkit.classifier.nearest_neighbor_classifier.create')
    start_time = _time.time()

    ## Validation and preprocessing
    ## ----------------------------

    ## 'dataset' must be a non-empty SFrame
    _raise_error_if_not_sframe(dataset, "dataset")
    _raise_error_if_sframe_empty(dataset, "dataset")

    ## 'target' must be a string, in 'dataset', and the type of the target must
    #  be string or integer.
    if not isinstance(target, str) or target not in dataset.column_names():
        raise _ToolkitError("The 'target' parameter must be the name of a "
                            "column in the input dataset.")

    if dataset[target].dtype() not in (str, int):
        raise TypeError("The target column must contain integers or strings.")

    ## Warn that 'None' values in the target may lead to ambiguous predictions.
    if dataset[target].num_missing() > 0:
        _logging.warning(
            "Missing values detected in the target column. This " +
            "may lead to ambiguous 'None' predictions, if the " +
            "'radius' parameter is set too small in the prediction, " +
            "classification, or evaluation methods.")

    ## convert features and distance arguments into a composite distance
    ## NOTE: this is done here instead of in the nearest neighbors toolkit
    #  because the automatic distance construction may be different for the two
    #  toolkits.
    if features is None:
        _features = [x for x in dataset.column_names() if x != target]
    else:
        _features = [x for x in features if x != target]

    if isinstance(distance, list):
        distance = _copy.deepcopy(distance)

    elif (hasattr(distance, '__call__')
          or (isinstance(distance, str) and not distance == 'auto')):
        distance = [[_features, distance, 1]]

    elif distance is None or distance == 'auto':
        col_types = {
            k: v
            for k, v in zip(dataset.column_names(), dataset.column_types())
        }
        distance = _construct_auto_distance(_features, col_types)

    else:
        raise TypeError(
            "Input 'distance' not understood. The 'distance' " +
            "parameter must be a string or a composite distance, " +
            " or left unspecified.")

    ## Construct and query the nearest neighbors model
    ## -----------------------------------------------
    knn_model = _gl.nearest_neighbors.create(dataset,
                                             label=target,
                                             distance=distance,
                                             verbose=verbose)

    ## Postprocessing and formatting
    ## -----------------------------
    model = NearestNeighborClassifier(knn_model)
    model._state['verbose'] = verbose
    model._state['distance'] = knn_model['distance']
    model._state['num_distance_components'] = knn_model[
        'num_distance_components']
    model._state['num_examples'] = dataset.num_rows()
    model._state['features'] = knn_model['features']
    model._state['target'] = target
    model._state['num_classes'] = len(dataset[target].unique())
    model._state['num_features'] = knn_model['num_features']
    model._state['num_unpacked_features'] = knn_model['num_unpacked_features']
    model._state['training_time'] = _time.time() - start_time
    model._target_type = dataset[target].dtype()

    return model
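# --- Illustrative sketch (not part of the original source) ---
# The distance handling above normalizes every accepted form of the 'distance'
# argument into a composite distance: a list of [features, distance_name,
# weight] components.  A hedged standalone restatement of that normalization:
def _as_composite_distance(features, distance):
    if isinstance(distance, list):            # already a composite distance
        return distance
    if isinstance(distance, str) and distance != 'auto':
        return [[features, distance, 1]]      # one component over all features
    # 'auto' / None fall through to type-based construction
    # (_construct_auto_distance in the code above); not reproduced here.
    raise TypeError("distance must be a composite list or a distance name")

print(_as_composite_distance(['height', 'weight'], 'euclidean'))
# -> [[['height', 'weight'], 'euclidean', 1]]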
Example #48
def create(dataset, features=None, distance=None, method='auto', verbose=True,
           **kwargs):
    """
    Create a RecordLinker model to match query records to a reference dataset of
    records, assuming both sets have the same general form.

    Parameters
    ----------
    dataset : SFrame
        Reference data, against which to link new queries with the 'link'
        method. The 'dataset' SFrame must include at least the features
        specified in the 'features' or 'distance' parameter.

    features : list[string], optional
        Names of the columns with features to use in comparing records. 'None'
        (the default) indicates that all columns should be used. Each column can
        be one of the following types:

        - *Numeric*: values of numeric type integer or float.

        - *Array*: array of numeric (integer or float) values. Each array
          element is treated as a separate variable in the model.

        - *Dictionary*: key-value pairs with numeric (integer or float) values.
          Each key indicates a separate variable in the model.

        - *String*: string values.

        Please note: if 'distance' is specified as a composite distance, then
        that parameter controls which features are used in the model.

    distance : string or list[list], optional
        Function to measure the distance between any two input data rows. This
        may be one of two types:

        - *String*: the name of a standard distance function. One of
          'euclidean', 'squared_euclidean', 'manhattan', 'levenshtein',
          'jaccard', 'weighted_jaccard', 'cosine', or 'dot_product'. Please see
          the :mod:`distances` module for more details.

        - *Composite distance*: the weighted sum of several standard distance
          functions applied to various features. This is specified as a list of
          distance components, each of which is itself a list containing three
          items:

          1. list or tuple of feature names (strings)

          2. standard distance name (string)

          3. scaling factor (int or float)

    method : {'auto', 'brute_force', 'lsh', 'ball_tree'}, optional
        Strategy for the nearest neighbors search. If not specified or 'auto',
        the search strategy is chosen automatically based on the data type and
        dimension.

    verbose : bool, optional
        If True, print progress updates and model details.

    **kwargs : optional
        Options passed through to the nearest_neighbors toolkit for particular
        nearest neighbors search strategies:

        - *leaf_size*: for the ball tree method, the number of points in each
          leaf of the tree. The default is to use the max of 1,000 and n/(2^11),
          which ensures a maximum tree depth of 12.

        - *num_tables*: For the LSH method, the number of hash tables
          constructed.

        - *num_projections_per_table*: For the LSH method, the number of
          projections for each hash table.

    Returns
    -------
    out : RecordLinker model.

    See Also
    --------
    RecordLinker, graphlab.toolkits.nearest_neighbors

    Notes
    -----
    - Standardizing features is often a good idea with distance-based methods,
      but this model does *not* standardize features.

    - For features that all have the same type, the distance parameter may be a
      single standard distance function name (e.g. "euclidean"). In the model,
      however, all distances are first converted to composite distance
      functions; as a result, the 'distance' field in the model is always a
      composite distance.

    References
    ----------
    - Christen, Peter. "Data matching: concepts and techniques for record
      linkage, entity resolution, and duplicate detection." Springer Science &
      Business Media, 2012.

    Examples
    --------
    >>> homes = graphlab.SFrame({'sqft': [1230, 875, 1745],
    ...                          'street': ['phinney', 'fairview', 'cottage'],
    ...                          'city': ['seattle', 'olympia', 'boston'],
    ...                          'state': ['WA', 'WA', 'MA']})
    ...
    >>> model = graphlab.record_linker.create(homes, features=['city'],
    ...                                       distance='levenshtein') 
    """

    _mt._get_metric_tracker().track('{}.create'.format(__name__))
    start_time = _time.time()


    ## Validate the 'dataset' input.
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")


    ## Validate the features input.
    if features is not None:
        if not hasattr(features, '__iter__'):
            raise TypeError("Input 'features' must be a list.")

        if not all([isinstance(x, str) for x in features]):
            raise TypeError("Input 'features' must contain only strings.")

    else:
        features = dataset.column_names()


    ## Validate and preprocess the distance input.
    col_types = {k: v for k, v in zip(dataset.column_names(),
                                      dataset.column_types())}

    if isinstance(distance, list):
        distance = _copy.deepcopy(distance)

    elif isinstance(distance, str):
        # this will likely produce errors downstream if 'features' was not
        # specified by the user.
        distance = [[features, distance, 1]]

    elif distance is None:
        distance = _construct_auto_distance(features, col_types)

    else:
        raise TypeError("Input 'distance' not understood. For the " +
                         "data matching toolkit, 'distance' must be a string or " +
                         "a composite distance list."   )


    ## Validate the composite distance and set it in the model.
    allowed_dists = {
        'euclidean': [int, float, _array.array],
        'squared_euclidean': [int, float, _array.array],
        'manhattan': [int, float, _array.array],
        'levenshtein': [str],
        'jaccard': [str, dict],
        'weighted_jaccard': [str, dict],
        'cosine': [int, float, str, dict, _array.array],
        'dot_product': [int, float, str, dict, _array.array]}

    distance = _dmutl.validate_composite_distance(distance, row_label=None,
                                             allowed_dists=allowed_dists.keys(),
                                             verbose=verbose)


    ## Validate feature types against distance functions.
    _dmutl.validate_distance_feature_types(dataset, distance, allowed_dists)


    ## Clean and impute string data.

    #  *** NOTE: after this, the composite distance and feature set will be
    #      modified and useless to the user, so set the state here. ***
    state = {'distance': distance,
             'num_distance_components': len(distance)}

    union_features = _dmutl.extract_composite_features(distance)

    _dataset = _copy.copy(dataset)
    _distance = _copy.deepcopy(distance)

    for ftr in union_features:
        if col_types[ftr] == str:
            new_ftr = '__clean.' + ftr
            _dataset[new_ftr] = _dataset[ftr].fillna("")
            _dataset[new_ftr] = _dataset[new_ftr].apply(
                lambda x: _dmutl.cleanse_string(x), dtype=str)

            for dist_comp in _distance:
                dist_comp[0] = [new_ftr if x == ftr else x for x in dist_comp[0]]


    ## Convert strings to dicts if the distance isn't levenshtein, and
    #  concatenate string columns within a distance component into a single
    #  feature.
    _dataset, _distance = _engineer_distance_features(_dataset, _distance)


    ## Create the nearest neighbors model and set in the model
    knn_model = _gl.nearest_neighbors.create(_dataset, distance=_distance,
                                             method=method, verbose=verbose,
                                             **kwargs)


    ## Postprocessing and formatting
    state.update({'verbose': verbose,
                  'num_examples': dataset.num_rows(),
                  'features': union_features,
                  'num_features': len(union_features),
                  'method': knn_model['method'],
                  'training_time': _time.time() - start_time})

    model = RecordLinker(knn_model, state)
    return model
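# --- Illustrative sketch (not part of the original source) ---
# A hedged end-to-end use of the model created above.  The 'link' call is an
# assumption based only on the docstring's mention of linking "new queries
# with the 'link' method"; its exact signature is not shown in this snippet.
import graphlab

_homes = graphlab.SFrame({'city': ['seattle', 'olympia', 'boston']})
_queries = graphlab.SFrame({'city': ['seatle', 'bostin']})   # misspelled queries

_linker = graphlab.record_linker.create(_homes, features=['city'],
                                        distance='levenshtein')
# Hypothetical: find the closest reference record(s) for each query row.
_matches = _linker.link(_queries)
print(_matches)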
Example #49
def create(dataset, target, model_name, features=None,
           validation_set='auto', verbose=True, distributed='auto', **kwargs):
    """
    Create a :class:`~graphlab.toolkits.SupervisedLearningModel`.

    This is a generic function that allows you to create any model that
    implements SupervisedLearningModel. This function is normally not called
    directly; call a specific model's create function instead.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    target : string
        Name of the column containing the target variable. The values in this
        column must be 0 or 1, of integer type.

    model_name : string
        Name of the model

    features : list[string], optional
        List of feature names used by feature column

    validation_set : SFrame, optional

        A dataset for monitoring the model's generalization performance.
        For each row of the progress table, the chosen metrics are computed
        for both the provided training dataset and the validation_set. The
        format of this SFrame must be the same as the training set.
        By default this argument is set to 'auto' and a validation set is
        automatically sampled and used for progress printing. If
        validation_set is set to None, then no additional metrics
        are computed. The default value is 'auto'.

    distributed: env
        The distributed environment

    verbose : boolean
        Whether to print out messages during training.

    kwargs : dict
        Additional parameter options that can be passed
    """

    _raise_error_if_not_sframe(dataset, "training dataset")

    # Create a validation set
    if isinstance(validation_set, str):
        if validation_set == 'auto':
            if dataset.num_rows() >= 100:
                if verbose:
                    print_validation_track_notification()
                dataset, validation_set = dataset.random_split(.95)
            else:
                validation_set = None
        else:
            raise TypeError('Unrecognized value for validation_set.')

    # Target
    target_sframe = _toolkits_select_columns(dataset, [target])

    # Features
    if features is None:
        features = dataset.column_names()
        features.remove(target)
    if not hasattr(features, '__iter__'):
        raise TypeError("Input 'features' must be a list.")
    for x in features:
        if not isinstance(x, str):
            raise TypeError("Invalid feature %s: Feature names must be of type str" % x)
    features_sframe = _toolkits_select_columns(dataset, features)


    options = {}
    _kwargs = {}
    for k in kwargs:
        _kwargs[k.lower()] = kwargs[k]
    options.update(_kwargs)
    options.update({'target': target_sframe,
                    'features': features_sframe,
                    'model_name': model_name})

    if validation_set is not None:

        if not isinstance(validation_set, _graphlab.SFrame):
            raise TypeError, "validation_set must be either 'auto' or an SFrame matching the training data."

        # Attempt to append the two datasets together to check schema
        validation_set.head().append(dataset.head())

        options.update({
            'features_validation' : _toolkits_select_columns(validation_set, features),
            'target_validation' : _toolkits_select_columns(validation_set, [target])})

    execution_env = get_distributed_execution_environment()
    if distributed == 'auto' and execution_env is None:
        ret = _graphlab.toolkits._main.run("supervised_learning_train",
                                           options, verbose)
        model = SupervisedLearningModel(ret['model'], model_name)
    else:
        ret = _distributed_run("distributed_supervised_train",
                               options, env=distributed, verbose=verbose)
        model = SupervisedLearningModel(ret, model_name)

    return model
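# --- Illustrative sketch (not part of the original source) ---
# The validation_set='auto' branch above carves a validation set out of the
# training data only when there are at least 100 rows.  The same behavior,
# restated as a small helper (progress notification omitted):
def _resolve_validation_set(dataset, validation_set='auto', split=0.95):
    if validation_set == 'auto':
        if dataset.num_rows() >= 100:
            return dataset.random_split(split)   # (train, validation)
        return dataset, None
    return dataset, validation_set

# Usage (assuming `sf` is an SFrame): train, valid = _resolve_validation_set(sf)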
def create(dataset, target, model_name, features=None,
           validation_set='auto', verbose=True, distributed='auto', **kwargs):
    """
    Create a :class:`~graphlab.toolkits.SupervisedLearningModel`.

    This is a generic function that allows you to create any model that
    implements SupervisedLearningModel. This function is normally not called
    directly; call a specific model's create function instead.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    target : string
        Name of the column containing the target variable. The values in this
        column must be 0 or 1, of integer type.

    model_name : string
        Name of the model

    features : list[string], optional
        List of feature names used by feature column

    validation_set : SFrame, optional

        A dataset for monitoring the model's generalization performance.
        For each row of the progress table, the chosen metrics are computed
        for both the provided training dataset and the validation_set. The
        format of this SFrame must be the same as the training set.
        By default this argument is set to 'auto' and a validation set is
        automatically sampled and used for progress printing. If
        validation_set is set to None, then no additional metrics
        are computed. The default value is 'auto'.

    distributed: env
        The distributed environment

    verbose : boolean
        Whether to print out messages during training.

    kwargs : dict
        Additional parameter options that can be passed
    """

    _raise_error_if_not_sframe(dataset, "training dataset")

    # Create a validation set
    if isinstance(validation_set, str):
        if validation_set == 'auto':
            if dataset.num_rows() >= 100:
                if verbose:
                    print_validation_track_notification()
                dataset, validation_set = dataset.random_split(.95)
            else:
                validation_set = None
        else:
            raise TypeError('Unrecognized value for validation_set.')

    # Target
    target_sframe = _toolkits_select_columns(dataset, [target])

    # Features
    if features is None:
        features = dataset.column_names()
        features.remove(target)
    if not hasattr(features, '__iter__'):
        raise TypeError("Input 'features' must be a list.")
    for x in features:
        if not isinstance(x, str):
            raise TypeError("Invalid feature %s: Feature names must be of type str" % x)
    features_sframe = _toolkits_select_columns(dataset, features)


    options = {}
    _kwargs = {}
    for k in kwargs:
        _kwargs[k.lower()] = kwargs[k]
    options.update(_kwargs)
    options.update({'target': target_sframe,
                    'features': features_sframe,
                    'model_name': model_name})

    if validation_set is not None:

        if not isinstance(validation_set, _graphlab.SFrame):
            raise TypeError("validation_set must be either 'auto' or an SFrame matching the training data.")

        # Attempt to append the two datasets together to check schema
        validation_set.head().append(dataset.head())

        options.update({
            'features_validation' : _toolkits_select_columns(validation_set, features),
            'target_validation' : _toolkits_select_columns(validation_set, [target])})


    ret = _graphlab.toolkits._main.run("supervised_learning_train",
                                       options, verbose)
    model = SupervisedLearningModel(ret['model'], model_name)

    return model
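# --- Illustrative sketch (not part of the original source) ---
# The "append the two datasets together to check schema" trick above relies on
# SFrame.append raising when column names or types differ.  A small,
# self-contained demonstration of that behavior:
import graphlab

_train = graphlab.SFrame({'x': [1, 2, 3], 'y': ['a', 'b', 'c']})
_valid_ok = graphlab.SFrame({'x': [4], 'y': ['d']})
_valid_bad = graphlab.SFrame({'x': ['not-an-int'], 'y': ['d']})

_valid_ok.head().append(_train.head())        # succeeds: schemas match
try:
    _valid_bad.head().append(_train.head())   # raises: column 'x' types differ
except Exception as e:
    print('schema mismatch detected:', e)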
Example #51
def create_classification_with_model_selector(dataset, target, model_selector,
    features=None, validation_set='auto', verbose=True):
    """
    Create a :class:`~graphlab.toolkits.SupervisedLearningModel`.

    This is a generic function that allows you to create any model that
    implements SupervisedLearningModel. This function is normally not called
    directly; call a specific model's create function instead.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    target : string
        Name of the column containing the target variable. The values in this
        column must be 0 or 1, of integer type.

    model_selector: function
        Provide a model selector.

    features : list[string], optional
        List of feature names used by feature column

    verbose : boolean
        Whether to print out messages during training.

    """

    # Error checking
    _raise_error_if_not_sframe(dataset, "training dataset")
    if features is None:
        features = dataset.column_names()
        if target in features:
            features.remove(target)
    if not hasattr(features, '__iter__'):
        raise TypeError("Input 'features' must be a list.")
    for x in features:
        if not isinstance(x, str):
            raise TypeError("Invalid feature %s: Feature names must be of type str" % x)

    # Sample the data
    features_sframe = _toolkits_select_columns(dataset, features)
    if features_sframe.num_rows() > 1e5:
        fraction = 1.0 * 1e5 / features_sframe.num_rows()
        features_sframe = features_sframe.sample(fraction, seed = 0)

    # Get available models for this dataset
    num_classes = dataset[target].unique().size()
    selected_model_names = model_selector(num_classes, features_sframe)

    # Create a validation set
    if isinstance(validation_set, str):
        if validation_set == 'auto':
            if dataset.num_rows() >= 100:
                if verbose:
                    print_validation_track_notification()
                dataset, validation_set = dataset.random_split(.95)
            else:
                validation_set = None
        else:
            raise TypeError('Unrecognized value for validation_set.')

    # Match C++ model names with user model names
    python_names = {'boosted_trees_classifier': 'BoostedTreesClassifier',
                    'random_forest_classifier': 'RandomForestClassifier',
                    'classifier_logistic_regression': 'LogisticClassifier',
                    'classifier_svm': 'SVMClassifier',
                    'neuralnet_classifier': 'NeuralNetClassifier',
                    'neuralnet_classifier_v2': 'NeuralNetClassifier'}

    # Print useful user-facing progress messages
    print('PROGRESS: The following methods are available for this type of problem.')
    print('PROGRESS: ' + ', '.join([python_names[x] for x in selected_model_names]))
    if len(selected_model_names) > 1:
        print('PROGRESS: The returned model will be chosen according to validation accuracy.')

    models = {}
    metrics = {}
    for model_name in selected_model_names:

        # Fit each of the available models
        m = create_selected(model_name, dataset, target, features, validation_set, verbose)
        models[model_name] = m

        # Get the last progress value or validation_accuracy, whichever is there
        if 'progress' in m.list_fields():
            prog = m['progress']
            validation_column = 'Validation-accuracy'
            accuracy_column = 'Training-accuracy'
            if validation_column in prog.column_names():
                metrics[model_name] = float(prog[validation_column].tail(1)[0])
            else:
                metrics[model_name] = float(prog[accuracy_column].tail(1)[0])
        # Validation accuracy (for boosted trees.)
        elif 'validation_accuracy' in m.list_fields():
            metrics[model_name] = m['validation_accuracy']
        else:
            raise ValueError(
                "Model does not have metrics that can be used for model selection.")

    # Choose model based on either validation, if available.
    best_model = None
    best_acc = None
    for model_name in selected_model_names:
        if best_acc is None:
            best_model = model_name
            best_acc = metrics[model_name]
        if best_acc < metrics[model_name]:
            best_model = model_name
            best_acc = metrics[model_name]

    ret = []
    width = 32
    if len(selected_model_names) > 1:
        ret.append('PROGRESS: Model selection based on validation accuracy:')
        ret.append('---------------------------------------------')
        key_str = '{:<{}}: {}'
        for model_name in selected_model_names:
            name = python_names[model_name]
            row = key_str.format(name, width, str(metrics[model_name]))
            ret.append(row)
        ret.append('---------------------------------------------')
        ret.append('Selecting ' + python_names[best_model] + ' based on validation set performance.')
    print('\nPROGRESS: '.join(ret))
    return models[best_model]
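# --- Illustrative sketch (not part of the original source) ---
# The best-model scan above is equivalent to taking the argmax of the metrics
# dict; shown here on dummy accuracy numbers:
_metrics = {'boosted_trees_classifier': 0.91,
            'classifier_logistic_regression': 0.88,
            'classifier_svm': 0.86}
_best_model = max(_metrics, key=_metrics.get)
print(_best_model, _metrics[_best_model])   # boosted_trees_classifier 0.91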
def create_with_model_selector(dataset, target, model_selector,
    features = None, verbose = True):
    """
    Create a :class:`~graphlab.toolkits.SupervisedLearningModel`.

    This is a generic function that allows you to create any model that
    implements SupervisedLearningModel. This function is normally not called
    directly; call a specific model's create function instead.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    target : string
        Name of the column containing the target variable. The values in this
        column must be 0 or 1, of integer type.

    model_selector: function
        Provide a model selector.

    features : list[string], optional
        List of feature names used by feature column

    verbose : boolean
        Whether to print out messages during training.

    """

    # Error checking
    _raise_error_if_not_sframe(dataset, "training dataset")
    if features is None:
        features = dataset.column_names()
        if target in features:
            features.remove(target)
    if not hasattr(features, '__iter__'):
        raise TypeError("Input 'features' must be a list.")
    for x in features:
        if not isinstance(x, str):
            raise TypeError("Invalid feature %s: Feature names must be of type str" % x)

    # Sample the data
    features_sframe = _toolkits_select_columns(dataset, features)
    if features_sframe.num_rows() > 1e5:
        fraction = 1.0 * 1e5 / features_sframe.num_rows()
        features_sframe = features_sframe.sample(fraction, seed = 0)

    # Run the model selector.
    selected_model_name = model_selector(features_sframe)

    if (selected_model_name == 'neuralnet_classifier'):

      model = _graphlab.classifier.neuralnet_classifier.create(dataset,
                                target, features = features, verbose = verbose)
      return model

    else:

      # Multi-class through boosted trees
      if ('classifier' in selected_model_name) and \
                              (dataset[target].unique().size() > 2):
        selected_model_name = 'boosted_trees_classifier'

      # Create the model
      model = create(dataset,
                     target,
                     selected_model_name,
                     features = features,
                     verbose = verbose)

      # Return the model
      if selected_model_name == 'boosted_trees_regression':
          return _graphlab.boosted_trees_regression.BoostedTreesRegression(\
                                                                model.__proxy__)
      elif selected_model_name == 'regression_linear_regression':
          return _graphlab.linear_regression.LinearRegression(
                                                                model.__proxy__)
      elif selected_model_name == 'boosted_trees_classifier':
          return _graphlab.boosted_trees_classifier.BoostedTreesClassifier(\
                                                                model.__proxy__)
      elif selected_model_name == 'classifier_logistic_regression':
          return _graphlab.logistic_classifier.LogisticClassifier(\
                                                                model.__proxy__)
      elif selected_model_name == 'classifier_svm':
          return _graphlab.svm_classifier.SVMClassifier(model.__proxy__)
      else:
          raise ToolkitError, "Internal error: Incorrect model returned."
    def predict_topk(self, dataset, output_type="probability", k=3):
        """
        Return top-k predictions for the ``dataset``, using the trained model.
        Predictions are returned as an SFrame with three columns: `row_id`,
        `class`, and either `probability`, `rank`, or `score`, depending on the
        ``output_type`` parameter. The input dataset size must be the same as
        for training of the model, except for images, which are automatically
        resized.

        Parameters
        ----------
        dataset : SFrame
            Dataset of new observations. Must include columns with the same
            names as the features used for model training, but does not require
            a target column. Additional columns are ignored.

        output_type : {'probability', 'rank', 'score'}, optional
            Choose the return type of the prediction:

            - `rank`: outputs rank along with class label.
            - `probability`: outputs learned probability along with class label.
            - `score`: same as probability.

        k : int, optional
            Number of classes to return for each input example.

        Returns
        -------
        out : SFrame
            An SFrame with model predictions.

        See Also
        --------
        predict, classify, evaluate

        Examples
        --------
        >>> data = graphlab.SFrame('https://static.turi.com/datasets/mnist/sframe/train')
        >>> training_data, validation_data = data.random_split(0.8)
        >>> net = graphlab.deeplearning.get_builtin_neuralnet('mnist')
        >>> m = graphlab.neuralnet_classifier.create(training_data,
        ...                                          target='label',
        ...                                          network=net,
        ...                                          max_iterations=3)
        ...
        >>> pred = m.predict_topk(validation_data, k=3)
        >>> pred
        +--------+-------+-------------------+
        | row_id | class |    probability    |
        +--------+-------+-------------------+
        |   0    |   4   |   0.995623886585  |
        |   0    |   9   |  0.0038311756216  |
        |   0    |   7   | 0.000301006948575 |
        |   1    |   1   |   0.928708016872  |
        |   1    |   3   |  0.0440889261663  |
        |   1    |   2   |  0.0176190119237  |
        |   2    |   3   |   0.996967732906  |
        |   2    |   2   |  0.00151345680933 |
        |   2    |   7   | 0.000637513934635 |
        |   3    |   1   |   0.998070061207  |
        |  ...   |  ...  |        ...        |
        +--------+-------+-------------------+
        [35688 rows x 3 columns]
        """
        _mt._get_metric_tracker().track(
            'toolkit.classifier.neuralnet_classifier.predict_topk')
        _raise_error_if_not_sframe(dataset, "dataset")
        options = dict()
        options.update({
            'model': self.__proxy__,
            'model_name': self.__name__,
            'dataset': dataset,
            'output_type': output_type,
            'topk': k,
            'missing_value_action': 'error'
        })
        target = _toolkits_main.run('supervised_learning_predict_topk',
                                    options)
        return _map_unity_proxy_to_object(target['predicted'])
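# --- Illustrative sketch (not part of the original source) ---
# The SFrame returned above has one row per (row_id, class) pair.  To recover
# a single best class per input row, keep the row with the highest probability
# for each row_id.  Column names are taken from the docstring example above;
# treat this as a hedged helper, not part of the toolkit API.
def _top1_from_topk(pred):
    from graphlab import aggregate as agg
    top1 = pred.groupby('row_id',
                        {'class': agg.ARGMAX('probability', 'class'),
                         'probability': agg.MAX('probability')})
    return top1.sort('row_id')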
def create_classification_with_model_selector(dataset, target, model_selector,
    features=None, validation_set='auto', verbose=True):
    """
    Create a :class:`~graphlab.toolkits.SupervisedLearningModel`,

    This is generic function that allows you to create any model that
    implements SupervisedLearningModel. This function is normally not called, call
    specific model's create function instead.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    target : string
        Name of the column containing the target variable. The values in this
        column must be 0 or 1, of integer type.

    model_selector: function
        Provide a model selector.

    features : list[string], optional
        List of feature names used by feature column

    verbose : boolean
        Whether to print out messages during training.

    """

    # Error checking
    _raise_error_if_not_sframe(dataset, "training dataset")
    if features is None:
        features = dataset.column_names()
        if target in features:
            features.remove(target)
    if not hasattr(features, '__iter__'):
        raise TypeError("Input 'features' must be a list.")
    for x in features:
        if not isinstance(x, str):
            raise TypeError("Invalid feature %s: Feature names must be of type str" % x)

    # Sample the data
    features_sframe = _toolkits_select_columns(dataset, features)
    if features_sframe.num_rows() > 1e5:
        fraction = 1.0 * 1e5 / features_sframe.num_rows()
        features_sframe = features_sframe.sample(fraction, seed = 0)

    # Get available models for this dataset
    num_classes = dataset[target].unique().size()
    selected_model_names = model_selector(num_classes, features_sframe)

    # Create a validation set
    if isinstance(validation_set, str):
        if validation_set == 'auto':
            if dataset.num_rows() >= 100:
                if verbose:
                    print_validation_track_notification()
                dataset, validation_set = dataset.random_split(.95)
            else:
                validation_set = None
        else:
            raise TypeError('Unrecognized value for validation_set.')

    # Match C++ model names with user model names
    python_names = {'boosted_trees_classifier': 'BoostedTreesClassifier',
                    'random_forest_classifier': 'RandomForestClassifier',
                    'decision_tree_classifier': 'DecisionTreeClassifier',
                    'classifier_logistic_regression': 'LogisticClassifier',
                    'classifier_svm': 'SVMClassifier',
                    'neuralnet_classifier': 'NeuralNetClassifier',
                    'neuralnet_classifier_v2': 'NeuralNetClassifier'}

    # Print useful user-facing progress messages
    if verbose:
        print('PROGRESS: The following methods are available for this type of problem.')
        print('PROGRESS: ' + ', '.join([python_names[x] for x in selected_model_names]))
        if len(selected_model_names) > 1:
            print('PROGRESS: The returned model will be chosen according to validation accuracy.')

    models = {}
    metrics = {}
    for model_name in selected_model_names:

        # Fit each of the available models
        m = create_selected(model_name, dataset, target, features, validation_set, verbose)
        models[model_name] = m

        if 'validation_accuracy' in m.list_fields():
            metrics[model_name] = m['validation_accuracy']

        # Most models have this.
        elif 'progress' in m.list_fields():
            prog = m['progress']
            validation_column = 'Validation-accuracy'
            accuracy_column = 'Training-accuracy'
            if validation_column in prog.column_names():
                metrics[model_name] = float(prog[validation_column].tail(1)[0])
            else:
                metrics[model_name] = float(prog[accuracy_column].tail(1)[0])
        else:
            raise ValueError("Model does not have metrics that can be used for model selection.")

    # Choose model based on either validation, if available.
    best_model = None
    best_acc = None
    for model_name in selected_model_names:
        if best_acc is None:
            best_model = model_name
            best_acc = metrics[model_name]
        if best_acc is not None and best_acc < metrics[model_name]:
            best_model = model_name
            best_acc = metrics[model_name]

    ret = []
    width = 32
    if len(selected_model_names) > 1:
        ret.append('PROGRESS: Model selection based on validation accuracy:')
        ret.append('---------------------------------------------')
        key_str = '{:<{}}: {}'
        for model_name in selected_model_names:
            name = python_names[model_name]
            row = key_str.format(name, width, str(metrics[model_name]))
            ret.append(row)
        ret.append('---------------------------------------------')
        ret.append('Selecting ' + python_names[best_model] + ' based on validation set performance.')

    if verbose:
        print('\nPROGRESS: '.join(ret))
    return models[best_model]
    def classify(self, dataset, max_neighbors=10, radius=None, verbose=True):
        """
        Return the predicted class for each observation in ``dataset``, based on
        the closest neighbors stored in the nearest neighbors classifier model.

        Parameters
        ----------
        dataset : SFrame
            Dataset of new observations. Must include columns with the same
            names as the features used for model training, but does not require
            a target column. Additional columns are ignored.

        verbose : bool, optional
            If True, print progress updates.

        max_neighbors : int, optional
            Maximum number of neighbors to consider for each point.

        radius : float, optional
            Maximum distance from each point to a neighbor in the reference
            dataset.

        Returns
        -------
        out : SFrame
            An SFrame with model predictions. The first column is the most
            likely class according to the model, and the second column is the
            predicted probability for that class.

        See Also
        --------
        create, predict, predict_topk

        Notes
        -----
        - If the 'radius' parameter is small, it is possible that a query point
          has no qualified neighbors in the training dataset. In this case, the
          resulting class and probability for that query are 'None' in the
          SFrame output by this method. If the target column in the training
          dataset has missing values, these predictions will be ambiguous.

        - Ties between predicted classes are broken randomly.

        Examples
        --------
        >>> sf_train = graphlab.SFrame({'species': ['cat', 'dog', 'fossa', 'dog'],
        ...                             'height': [9, 25, 20, 23],
        ...                             'weight': [13, 28, 33, 22]})
        ...
        >>> sf_new = graphlab.SFrame({'height': [26, 19],
        ...                           'weight': [25, 35]})
        ...
        >>> m = graphlab.nearest_neighbor_classifier.create(sf_train, target='species')
        >>> ystar = m.classify(sf_new, max_neighbors=2)
        >>> print ystar
        +-------+-------------+
        | class | probability |
        +-------+-------------+
        |  dog  |     1.0     |
        | fossa |     0.5     |
        +-------+-------------+
        """
        _mt._get_metric_tracker().track(
            'toolkit.classifier.nearest_neighbor_classifier.classify')

        ## Validate the query 'dataset'. Note that the 'max_neighbors' and
        #  'radius' parameters are validated by the nearest neighbor model's
        #  query method.
        _raise_error_if_not_sframe(dataset, "dataset")
        _raise_error_if_sframe_empty(dataset, "dataset")
        n_query = dataset.num_rows()

        ## Validate neighborhood parameters 'max_neighbors'.
        # - NOTE: when the parameter name is changed in nearest neighbors, the
        #   query call will do this itself, and this block can be removed.
        if max_neighbors is not None:
            if not isinstance(max_neighbors, int):
                raise ValueError("Input 'max_neighbors' must be an integer.")

            if max_neighbors <= 0:
                raise ValueError("Input 'max_neighbors' must be larger than 0.")


        ## Find the nearest neighbors for each query and count the number of
        #  votes for each class.
        knn = self._knn_model.query(dataset, k=max_neighbors, radius=radius,
                                    verbose=verbose)

        ## If there are *no* results for *any* query make an SFrame of nothing.
        if knn.num_rows() == 0:
            ystar = _gl.SFrame({'class': [None] * n_query,
                                'probability': [None] * n_query})

        else:
            ## Find the class with the most votes for each query and postprocess.
            grp = knn.groupby(['query_label', 'reference_label'], _gl.aggregate.COUNT)

            ystar = grp.groupby('query_label',
                                {'class': _gl.aggregate.ARGMAX('Count', 'reference_label'),
                                 'max_votes': _gl.aggregate.MAX('Count'),
                                 'total_votes': _gl.aggregate.SUM('Count')})

            ystar['probability'] = ystar['max_votes'] / ystar['total_votes']
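
            # For example (hypothetical values): a query whose nearest reference
            # labels are ['dog', 'dog', 'cat'] gets class='dog', max_votes=2,
            # total_votes=3, and probability 2/3.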

            ## Fill in 'None' for query points that don't have any near neighbors.
            row_ids = _gl.SFrame({'query_label': range(n_query)})
            ystar = ystar.join(row_ids, how='right')

            ## Sort by row number (because row number is not returned) and return
            ystar = ystar.sort('query_label', ascending=True)
            ystar = ystar[['class', 'probability']]

        return ystar
    def predict(self, dataset, missing_value_action='auto',
                output_type='', options={}, **kwargs):
        """
        Return predictions for ``dataset``, using the trained supervised_learning
        model. For a binary classifier, predictions are generated as class
        labels (0 or 1).

        Parameters
        ----------
        dataset : SFrame
            Dataset of new observations. Must include columns with the same
            names as the features used for model training, but does not require
            a target column. Additional columns are ignored.

        missing_value_action : str, optional
            Action to perform when missing values are encountered. This can be
            one of:

            - 'auto': Choose a model dependent missing value policy.
            - 'impute': Proceed with evaluation by filling in the missing
                        values with the mean of the training data. Missing
                        values are also imputed if an entire column of data is
                        missing during evaluation.
            - 'none': Treat missing values as is. The model must be able to
                      handle missing values.
            - 'error': Do not proceed with prediction and terminate with
                       an error message.

        output_type : str, optional
            Output type that may be required by some toolkits.

        options : dict
            Additional options to be passed to the prediction call.

        kwargs : dict
            Additional keyword options to be passed to the prediction call.

        Returns
        -------
        out : SArray
            An SArray with model predictions.
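
        Examples
        --------
        A minimal usage sketch; ``model`` and ``sf_new`` below are hypothetical
        names for a trained supervised learning model and an SFrame containing
        the same feature columns used for training.

        >>> predictions = model.predict(sf_new)

        Single records may also be passed as a dict (or a list of dicts) to use
        the low-latency prediction path.

        >>> model.predict({'feature_1': 3.0, 'feature_2': 'a'})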
        """
        if missing_value_action == 'auto':
            missing_value_action = select_default_missing_value_policy(self, 'predict')

        # Low latency path
        if isinstance(dataset, list):
            return _graphlab.extensions._fast_predict(self.__proxy__, dataset,
                    output_type, missing_value_action)
        if isinstance(dataset, dict):
            return _graphlab.extensions._fast_predict(self.__proxy__, [dataset],
                    output_type, missing_value_action)

        # Batch predictions path
        else:
            _raise_error_if_not_sframe(dataset, "dataset")

            options = options.copy()
            options.update(kwargs)

            options.update({'model': self.__proxy__,
                            'model_name': self.__name__,
                            'dataset': dataset,
                            'missing_value_action' : missing_value_action,
                            'output_type' : output_type
                            })

            target = _graphlab.toolkits._main.run(
                      'supervised_learning_predict', options)
            return _map_unity_proxy_to_object(target['predicted'])
def create(datasets, row_label=None, features=None, grouping_features=None,
           distance=None, k=2, radius=None, verbose=True):
    """
    Create a deduplication model based on nearest neighbors and SGraph connected
    components.

    This method creates a :class:`NearestNeighborDeduplication` model by
    constructing a nearest neighbors similarity graph on all of the rows in the
    input 'datasets', then using the connected components tool in the
    :mod:`~graphlab.toolkits.graph_analytics` module to assign an entity label
    to each record. Records which share the same label are considered to be
    duplicates.

    .. warning::

        The 'dot_product' distance is deprecated and will be removed in future
        versions of GraphLab Create. Please use 'transformed_dot_product'
        distance instead, although note that this is more than a name change; it
        is a *different* transformation of the dot product of two vectors.
        Please see the distances module documentation for more details.

    Parameters
    ----------
    datasets : SFrame or list[SFrame] or dict(string: SFrame)
        Input datasets. Each SFrame in the list must include all of the features
        specified in the `features` or 'distance' parameters, but may
        have additional columns as well. SFrames can be input as values in a
        dictionary, where the keys are strings used in the output to identify
        the SFrame from which each record originated.

    row_label : string, optional
        Name of the SFrame column with row labels. If not specified, row numbers
        are used to identify rows in the output.

    features : list[string], optional
        Name of the columns with features to use in comparing records. 'None'
        (the default) indicates the intersection of columns over all SFrames in
        `datasets` should be used (except the label column, if specified). Each
        column can be one of the following types:

        - *Numeric*: values of numeric type integer or float.

        - *Array*: array of numeric (integer or float) values. Each array
          element is treated as a separate variable in the model.

        - *Dictionary*: key-value pairs with numeric (integer or float) values.
          Each key indicates a separate variable in the model.

        - *String*: string values.

        Please note: if `distance` is specified as a composite distance, then
        that parameter controls which features are used in the model. Any
        additional columns named in 'features' will be included in the model
        output but not used for distance computations.

    grouping_features : list[string], optional
        Names of features to use in grouping records before finding approximate
        matches. These columns must have string or integer type data. See the
        Notes section for more details on grouping.

    distance : string or list[list], optional
        Function to measure the distance between any two input data rows. This
        may be one of two types:

        - *String*: the name of a standard distance function. One of
          'euclidean', 'squared_euclidean', 'manhattan', 'levenshtein',
          'jaccard', 'weighted_jaccard', 'cosine', 'dot_product' (deprecated),
          or 'transformed_dot_product'.

        - *Composite distance*: the weighted sum of several standard distance
          functions applied to various features. This is specified as a list of
          distance components, each of which is itself a list containing three
          items:

          1. list or tuple of feature names (strings)

          2. standard distance name (string)

          3. scaling factor (int or float)

        For more information about GraphLab Create distance functions, please
        see the :py:mod:`~graphlab.toolkits.distances` module.

        For sparse vectors, missing keys are assumed to have value 0.0.

        If 'distance' is left unspecified or set to 'auto', a composite distance
        is constructed automatically based on feature types.

    k : int, optional
        Number of neighbors to consider for each point.

    radius : float, optional
        Maximum distance from each point to a potential duplicate.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : NearestNeighborDeduplication model
        The NearestNeighborDeduplication object contains a field 'entities'
        which shows the entity label for each input record. It also shows the
        features for each record that are used to construct the model, as well
        as the original SFrame and row label for each record. If the original
        `datasets` are passed in a list, the SFrame identifier is the index of
        the SFrame in that list.

    See Also
    --------
    NearestNeighborDeduplication, graphlab.toolkits.nearest_neighbors,
    graphlab.SFrame.groupby

    Notes
    -----
    - Standardizing features is often a good idea with distance-based methods,
      but this model does *not* standardize features.

    - For datasets with more than about 10,000 records, *grouping* (also known
      as *blocking*) is a critical step to avoid computing distances between all
      pairs of records. The grouping step simply assigns each record to a group
      that has identical values for all `grouping_features`, and only looks for
      duplicates within each group.

    - Records with missing data in the `grouping_features` are removed from
      consideration as duplicates. These records are given the entity label
      "None".

    - For tasks that require *only* exact matches on certain features, it is
      generally more natural to use the SFrame `groupby` function.

    - For features that all have the same type, the distance parameter may be a
      single standard distance function name (e.g. "euclidean"). In the model,
      however, all distances are first converted to composite distance
      functions; as a result, the 'distance' field in the model is always a
      composite distance.

    References
    ----------
    - Christen, Peter. "Data matching: concepts and techniques for record
      linkage, entity resolution, and duplicate detection." Springer Science &
      Business Media, 2012.

    Examples
    --------
    >>> sf1 = graphlab.SFrame({'id': [0, 1, 2],
    ...                        'x0': [0.5, 0.5, 0.3],
    ...                        'x1': [1., 0.8, 0.6],
    ...                        'city': ['seattle', 'olympia', 'boston'],
    ...                        'state': ['WA', 'WA', 'MA']})
    ...
    ... # note: misspellings in the following dataset do not prevent correct
    ... # matches.
    >>> sf2 = graphlab.SFrame({'id': [9, 10],
    ...                        'x0': [0.35, 0.4],
    ...                        'x1': [0.65, 0.8],
    ...                        'city': ['bostan', 'seatle'],
    ...                        'state': ['MA', 'WA']})
    ...
    >>> dist = [[('city',), 'levenshtein', 2],
    ...         [('x0', 'x1'), 'euclidean', 1.5]]
    ...
    >>> m = graphlab.nearest_neighbor_deduplication.create({'a': sf1, 'b': sf2},
    ...                                                    row_label='id',
    ...                                                    grouping_features=['state'],
    ...                                                    distance=dist, k=None,
    ...                                                    radius=3)
    ...
    >>> print m['entities']
    +----------+----+----------+-------+------+---------+------+
    | __sframe | id | __entity | state |  x0  |   city  |  x1  |
    +----------+----+----------+-------+------+---------+------+
    |    a     | 1  |    0     |   WA  | 0.5  | olympia | 0.8  |
    |    a     | 0  |    1     |   WA  | 0.5  | seattle | 1.0  |
    |    b     | 10 |    1     |   WA  | 0.4  |  seatle | 0.8  |
    |    a     | 2  |    2     |   MA  | 0.3  |  boston | 0.6  |
    |    b     | 9  |    2     |   MA  | 0.35 |  bostan | 0.65 |
    +----------+----+----------+-------+------+---------+------+
    [5 rows x 7 columns]
    """

    ## Set up
    _mt._get_metric_tracker().track('{}.create'.format(__name__))
    start_time = _time.time()

    model = NearestNeighborDeduplication()
    model._state['verbose'] = verbose
    model._state['k'] = k
    model._state['radius'] = radius


    ### ----------------------------- ###
    ### Validation and preprocessing ###
    ### ----------------------------- ###

    ### Validate input datasets
    ### -----------------------

    ## If datasets is already a dict, check the keys are all strings
    if isinstance(datasets, dict):
        if not(all([isinstance(x, str) for x in datasets.keys()])):
            raise ValueError("Keys in the 'datasets' dict must be strings.")

    ## Convert a single SFrame dataset into a dict with one entry
    if isinstance(datasets, _gl.SFrame):
        _raise_error_if_sframe_empty(datasets, "dataset")
        datasets = {0: datasets}

    ## Convert a list of SFrames into a dict
    if isinstance(datasets, list):
        datasets = {k: sf for k, sf in enumerate(datasets)}

    ## At this point, 'datasets' must be dict. If it's not, something is wrong.
    if not isinstance(datasets, dict):
        raise TypeError("Input 'datasets' must be an SFrame, a list of SFrames, " +
                        "or a dictionary of (string, SFrame) pairs.")

    model._state['num_datasets'] = len(datasets)

    ## Ensure that all datasets are SFrames
    for d in datasets.values():
        _raise_error_if_not_sframe(d, "dataset")


    ### Validate row label
    ### ------------------

    ## Validate the label column
    if row_label:
        if not isinstance(row_label, str):
            raise TypeError("The 'row_label' parameter must be the name (string " +
                            "type) of a column in each of the input datasets.")

        for d in datasets.values():
            if row_label not in d.column_names():
                raise _ToolkitError("The specified row_label column does not " +
                                    " exist in all input datasets.")
    else:
        row_label = 'row_number'

        for d in datasets.values():
            if row_label in d.column_names():
                raise _ToolkitError("Input 'row_label' defaulted to " +
                                    "'row_number', which is already a column" +
                                    " in at least one input dataset. Please " +
                                    "specify a row label column manually.")

    model._state['row_label'] = row_label


    ### Validate 'features' and 'grouping_features' parameters
    ### ------------------------------------------------------
    if features is not None:
        if not hasattr(features, '__iter__'):
            raise TypeError("Input 'features' must be a list.")

        if not all([isinstance(x, str) for x in features]):
            raise TypeError("Input 'features' must contain only strings.")

    if grouping_features is not None:
        if not hasattr(grouping_features, '__iter__'):
            raise TypeError("Input 'grouping_features' must be a list.")

        if not all([isinstance(x, str) for x in grouping_features]):
            raise TypeError("Input 'grouping_features' must contain only strings.")


    ### Validate and preprocess the distance function
    ### ---------------------------------------------
    # - The form of the 'distance' controls how we interact with the 'features'
    #   parameter as well.

    ## Find the intersection of all feature sets and feature types
    col_types = {k: v for k, v in zip(datasets.values()[0].column_names(),
                                      datasets.values()[0].column_types())}

    all_features = [sf.column_names() for sf in datasets.values()]
    ftr_intersection = list(set(all_features[0]).intersection(*all_features))
    ftr_intersection = [x for x in ftr_intersection if x != row_label]


    ## Convert features and distance arguments into a composite distance.
    if isinstance(distance, list):
        distance = _copy.deepcopy(distance)

    elif isinstance(distance, str):
        if features is not None:
            distance = [[features, distance, 1]]
        else:
            distance = [[ftr_intersection, distance, 1]]

    elif distance is None:
        if features is not None:
            distance = _construct_auto_distance(features, col_types)
        else:
            distance = _construct_auto_distance(ftr_intersection, col_types)

    else:
        raise TypeError("Input 'distance' not understood. Note that for the " +
                         "data matching toolkit, 'distance' must be a string or " +
                         "a composite distance list."   )


    ## Validate the form of the composite distance and add to the model
    allowed_dists = {
        'euclidean': [int, float, _array.array],
        'squared_euclidean': [int, float, _array.array],
        'manhattan': [int, float, _array.array],
        'levenshtein': [str],
        'jaccard': [str, dict],
        'weighted_jaccard': [str, dict],
        'cosine': [int, float, str, dict, _array.array],
        'dot_product': [int, float, str, dict, _array.array],
        'transformed_dot_product': [int, float, str, dict, _array.array]}

    distance = _dmutl.validate_composite_distance(distance, row_label,
                                                  allowed_dists.keys(),
                                                  verbose)
    model._state['distance'] = _copy.deepcopy(distance)


    ## Figure out which features are 'fuzzy', i.e. used for approximate
    #  matching, and set in the model state.
    fuzzy_features = _dmutl.extract_composite_features(distance)  # already has row_label removed

    model._state['features'] = fuzzy_features
    model._state['num_features'] = len(fuzzy_features)


    ## Compile a master list of all features. This includes grouping features,
    #  fuzzy features (the ones used for approximate matching), and "ancillary"
    #  features, which are specified by the user in the 'features' parameter
    #  but not included in the 'distance' specification.
    if features is None:
        features = []
    else:
        features = [x for x in features if x != row_label]

    if grouping_features is None:
        grouping_features = []
    else:
        grouping_features = [x for x in grouping_features if x != row_label]

    model._state['grouping_features'] = grouping_features
    model._state['num_grouping_features'] = len(grouping_features)

    master_features = list(set(features + grouping_features + fuzzy_features))


    ### Consolidate data and engineer features
    ### --------------------------------------

    ## Consolidate multiple input datasets into a single SFrame, with a useful
    #  row label.
    sf_union = _dmutl.concatenate_sframes(datasets, row_label=row_label,
                                   features=master_features,
                                   sf_index_name='__sframe')
    overall_label = '__sframe.' + row_label
    sf_union[overall_label] = (sf_union['__sframe'].astype(str) + "." +
                               sf_union[row_label].astype(str))
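    # For example (hypothetical row): the record with row label 3 from the SFrame
    # keyed 'b' receives the overall label 'b.3'.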


    ## Validate the feature types in the consolidated dataset against the
    #  specified distance functions.
    _dmutl.validate_distance_feature_types(sf_union, distance, allowed_dists)


    ## Clean string-type features in the fuzzy feature set.
    for ftr in fuzzy_features:
        if col_types[ftr] == str:
            new_ftr = '__clean.' + ftr
            sf_union[new_ftr] = sf_union[ftr].fillna("")
            sf_union[new_ftr] = sf_union[new_ftr].apply(
                lambda x: _dmutl.cleanse_string(x), dtype=str)

            for dist_comp in distance:
                dist_comp[0] = [new_ftr if x == ftr else x for x in dist_comp[0]]


    ## Feature engineering, distance-component-wise. Also update list of
    #  features and a map to their types.
    sf_union, distance = _engineer_distance_features(sf_union, distance)
    transformed_features = _dmutl.extract_composite_features(distance)

    ### -------------------------------------------- ###
    ### Main loop over blocks of neighbor candidates ###
    ### -------------------------------------------- ###

    ## Construct blocks on features that must match exactly
    if verbose:
        _logging.info("Constructing groups of records that match exactly on " +
                      "the 'grouping_features'.")

    sf_union, block_errors, blocks = \
        _dmutl.construct_exact_blocks(sf_union, grouping_features)
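    # For example, with grouping_features=['state'] (as in the docstring example),
    # all 'WA' records form one block and all 'MA' records another; approximate
    # matching is then restricted to records within the same block.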

    if verbose and len(distance) > 0 and blocks['Count'].max() > 10000:
        _logging.warning("There are more than 10,000 records in the largest match " +
            "group. For many uses, approximate matches within each match group are " +
            "computed with brute force nearest neighbors, which may be slow. " +
            "Consider using smaller groups by requiring different features to " +
            "match exactly.")

    max_entity_number = 0
    sf_entity = _gl.SFrame()
    output_features = (master_features + [row_label, '__sframe', '__entity'])

    ## Main loop over blocks
    for i, block in enumerate(blocks):

        if verbose:
            _logging.info("Processing {} records in match group: {}/{}".format(block['Count'],
                                                                         i+1,
                                                                         len(blocks)))

        ## Retrieve records in the block and impute the mean for missing numeric
        #  values.
        records = sf_union[block['min_idx']:(block['max_idx'] + 1)]
        complete_records = _dmutl.impute_numeric_means(records, transformed_features)

        if len(distance) > 0:
            ## Run all-point nearest neighbors
            if verbose:
                _logging.info("Building the similarity graph....")

            m = _gl.nearest_neighbors.create(complete_records, label=overall_label,
                                             distance=distance, verbose=False)
            knn = m.query(complete_records, label=overall_label, k=k, radius=radius,
                          verbose=verbose)


            ## Construct similarity graph to resolve transitive closure
            sg = _gl.SGraph()
            sg = sg.add_vertices(records[[overall_label]], vid_field=overall_label)
            sg = sg.add_edges(knn, src_field='query_label',
                              dst_field='reference_label')
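
            # Each nearest-neighbors result row becomes an edge between a query
            # record and one of its near neighbors, so records connected through
            # any chain of such edges fall into the same connected component and
            # hence the same entity.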


            ## Cut the similarity graph to establish an entity for each vertex
            if verbose:
                _logging.info("Finding duplicate records in the similarity graph....")

            cc = _gl.connected_components.create(sg, verbose=verbose)

            ## Relabel the component IDs to be consecutive integers starting with
            #  the max index of the previous block's entity labels.
            block_labels = cc['component_size'].add_row_number('__entity')
            block_labels['__entity'] += max_entity_number
            max_entity_number += block_labels.num_rows()
            block_entity_labels = cc['component_id'].join(block_labels,
                                                          on='component_id',
                                                          how='left')

            ## Join the entity labels for the block back to the block's records,
            #  then append to the master output
            records = records.join(block_entity_labels[['__id', '__entity']],
                                   on={overall_label: '__id'}, how='left')
            records = records.sort('__entity')

        else:  # no fuzzy features, so no nearest neighbors, just block ID
            records['__entity'] = _gl.SArray.from_const(i, len(records))


        sf_entity = sf_entity.append(records[output_features])


    ### ------------------------------------- ###
    ### Postprocessing and results formatting ###
    ### ------------------------------------- ###

    ## Add rows missing from the blocking back to the master results
    if len(block_errors) > 0:
        block_errors['__entity'] = _gl.SArray.from_const(None, len(block_errors)).astype(int)
        sf_entity = sf_entity.append(block_errors[output_features])

    ## Rearrange columns
    sf_entity.swap_columns('__sframe', sf_entity.column_names()[0])
    sf_entity.swap_columns(row_label, sf_entity.column_names()[1])
    sf_entity.swap_columns('__entity', sf_entity.column_names()[2])


    ## Finalize the model state
    model._state['training_time'] = _time.time() - start_time
    model._state['entities'] = sf_entity
    model._state['num_entities'] = max_entity_number

    return model
def create(dataset, target, model_name, features=None,
           validation_set = None, verbose = True, **kwargs):
    """
    Create a :class:`~graphlab.toolkits.SupervisedLearningModel`.

    This is a generic function that allows you to create any model that
    implements SupervisedLearningModel. This function is normally not called
    directly; use the specific model's create function instead.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    target : string
        Name of the column containing the target variable. The values in this
        column must be 0 or 1, of integer type.

    model_name : string
        Name of the model

    features : list[string], optional
        List of feature column names used for training. If None, all columns
        except the target are used.

    validation_set : SFrame, optional
        A validation set used to monitor validation metrics as training
        progresses.

    verbose : boolean
        Whether to print progress messages during training.

    kwargs : dict
        Additional parameter options passed to the training procedure.
    """

    _raise_error_if_not_sframe(dataset, "training dataset")

    # Target
    target_sframe = _toolkits_select_columns(dataset, [target])

    # Features
    if features is None:
        features = dataset.column_names()
        features.remove(target)
    if not hasattr(features, '__iter__'):
        raise TypeError("Input 'features' must be a list.")
    for x in features:
        if not isinstance(x, str):
            raise TypeError("Invalid feature %s: Feature names must be of type str" % x)
    features_sframe = _toolkits_select_columns(dataset, features)


    # Normalize keyword argument names to lowercase before merging them into
    # the toolkit options.
    options = {}
    _kwargs = {}
    for k in kwargs:
        _kwargs[k.lower()] = kwargs[k]
    options.update(_kwargs)
    options.update({'target': target_sframe,
                    'features': features_sframe,
                    'model_name': model_name})

    if validation_set is not None:
        options.update({
            'features_validation' : _toolkits_select_columns(validation_set, features),
            'target_validation' : _toolkits_select_columns(validation_set, [target])})

    ret = _graphlab.toolkits._main.run("supervised_learning_train", options, verbose=verbose)
    model = SupervisedLearningModel(ret['model'], model_name)
    return model
def create(dataset,
           features=None,
           distance=None,
           method='auto',
           verbose=True,
           **kwargs):
    """
    Create a RecordLinker model to match query records to a reference dataset of
    records, assuming both sets have the same general form.

    Parameters
    ----------
    dataset : SFrame
        Reference data, against which to link new queries with the 'link'
        method. The 'dataset' SFrame must include at least the features
        specified in the 'features' or 'distance' parameter.

    features : list[string], optional
        Name of the columns with features to use in comparing records. 'None'
        (the default) indicates that all columns should be used. Each column can
        be one of the following types:

        - *Numeric*: values of numeric type integer or float.

        - *Array*: array of numeric (integer or float) values. Each array
          element is treated as a separate variable in the model.

        - *Dictionary*: key-value pairs with numeric (integer or float) values.
          Each key indicates a separate variable in the model.

        - *String*: string values.

        Please note: if 'distance' is specified as a composite distance, then
        that parameter controls which features are used in the model.

    distance : string or list[list], optional
        Function to measure the distance between any two input data rows. This
        may be one of two types:

        - *String*: the name of a standard distance function. One of
          'euclidean', 'squared_euclidean', 'manhattan', 'levenshtein',
          'jaccard', 'weighted_jaccard', 'cosine', 'dot_product' (deprecated),
          or 'transformed_dot_product'.

        - *Composite distance*: the weighted sum of several standard distance
          functions applied to various features. This is specified as a list of
          distance components, each of which is itself a list containing three
          items:

          1. list or tuple of feature names (strings)

          2. standard distance name (string)

          3. scaling factor (int or float)

        For more information about GraphLab Create distance functions, please
        see the :py:mod:`~graphlab.toolkits.distances` module.

        For sparse vectors, missing keys are assumed to have value 0.0.

        If 'distance' is left unspecified or set to 'auto', a composite distance
        is constructed automatically based on feature types.

    method : {'auto', 'brute_force', 'lsh', 'ball_tree'}, optional
        Strategy for the nearest neighbors search. If not specified or 'auto',
        the search strategy is chosen automatically based on the data type and
        dimension.

    verbose : bool, optional
        If True, print progress updates and model details.

    **kwargs : optional
        Options passed through to the nearest_neighbors toolkit for particular
        nearest neighbors search strategies:

        - *leaf_size*: for the ball tree method, the number of points in each
          leaf of the tree. The default is to use the max of 1,000 and n/(2^11),
          which ensures a maximum tree depth of 12.

        - *num_tables*: For the LSH method, the number of hash tables
          constructed.

        - *num_projections_per_table*: For the LSH method, the number of
          projections for each hash table.

    Returns
    -------
    out : RecordLinker model.

    See Also
    --------
    RecordLinker, graphlab.toolkits.nearest_neighbors

    Notes
    -----
    - Standardizing features is often a good idea with distance-based methods,
      but this model does *not* standardize features.

    - For features that all have the same type, the distance parameter may be a
      single standard distance function name (e.g. "euclidean"). In the model,
      however, all distances are first converted to composite distance
      functions; as a result, the 'distance' field in the model is always a
      composite distance.

    References
    ----------
    - Christen, Peter. "Data matching: concepts and techniques for record
      linkage, entity resolution, and duplicate detection." Springer Science &
      Business Media, 2012.

    Examples
    --------
    >>> homes = graphlab.SFrame({'sqft': [1230, 875, 1745],
    ...                          'street': ['phinney', 'fairview', 'cottage'],
    ...                          'city': ['seattle', 'olympia', 'boston'],
    ...                          'state': ['WA', 'WA', 'MA']})
    ...
    >>> model = graphlab.record_linker.create(homes, features=['city'],
    ...                                       distance='levenshtein')
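
    A composite distance over several features may also be used; the component
    weights below are hypothetical, for illustration only.

    >>> dist = [[('city',), 'levenshtein', 2],
    ...         [('sqft',), 'euclidean', 1]]
    >>> model = graphlab.record_linker.create(homes, distance=dist)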
    """

    _mt._get_metric_tracker().track('{}.create'.format(__name__))
    start_time = _time.time()

    ## Validate the 'dataset' input.
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    ## Clean the method options and create the options dictionary
    allowed_kwargs = ['leaf_size', 'num_tables', 'num_projections_per_table']
    _method_options = {}

    for k, v in kwargs.items():
        if k in allowed_kwargs:
            _method_options[k] = v
        else:
            raise _ToolkitError(
                "'{}' is not a valid keyword argument".format(k) +
                " for the nearest neighbors model. Please " +
                "check for capitalization and other typos.")

    ## Validate the features input.
    if features is not None:
        if not hasattr(features, '__iter__'):
            raise TypeError("Input 'features' must be a list.")

        if not all([isinstance(x, str) for x in features]):
            raise TypeError("Input 'features' must contain only strings.")

    else:
        features = dataset.column_names()

    ## Validate and preprocess the distance input.
    col_types = {
        k: v
        for k, v in zip(dataset.column_names(), dataset.column_types())
    }

    if isinstance(distance, list):
        distance = _copy.deepcopy(distance)

    elif isinstance(distance, str):
        # this will likely produce errors downstream if 'features' was not
        # specified by the user.
        distance = [[features, distance, 1]]

    elif distance is None:
        distance = _construct_auto_distance(features, col_types)

    else:
        raise TypeError(
            "Input 'distance' not understood. For the " +
            "data matching toolkit, 'distance' must be a string or " +
            "a composite distance list.")

    ## Validate the composite distance and set it in the model.
    allowed_dists = {
        'euclidean': [int, float, _array.array],
        'squared_euclidean': [int, float, _array.array],
        'manhattan': [int, float, _array.array],
        'levenshtein': [str],
        'jaccard': [str, dict],
        'weighted_jaccard': [str, dict],
        'cosine': [int, float, str, dict, _array.array],
        'dot_product': [int, float, str, dict, _array.array],
        'transformed_dot_product': [int, float, str, dict, _array.array]
    }

    distance = _dmutl.validate_composite_distance(distance,
                                                  row_label=None,
                                                  allowed_dists=list(
                                                      allowed_dists.keys()),
                                                  verbose=verbose)

    ## Validate feature types against distance functions.
    _dmutl.validate_distance_feature_types(dataset, distance, allowed_dists)

    ## Clean and impute string data.

    #  *** NOTE: after this, the composite distance and feature set will be
    #      modified and useless to the user, so set the state here. ***
    state = {'distance': distance, 'num_distance_components': len(distance)}

    union_features = _dmutl.extract_composite_features(distance)

    _dataset = _copy.copy(dataset)
    _distance = _copy.deepcopy(distance)

    for ftr in union_features:
        if col_types[ftr] == str:
            new_ftr = '__clean.' + ftr
            _dataset[new_ftr] = _dataset[ftr].fillna("")
            _dataset[new_ftr] = _dataset[new_ftr].apply(
                lambda x: _dmutl.cleanse_string(x), dtype=str)

            for dist_comp in _distance:
                dist_comp[0] = [
                    new_ftr if x == ftr else x for x in dist_comp[0]
                ]

    ## Convert strings to dicts if the distance isn't levenshtein, and
    #  concatenate string columns within a distance component into a single
    #  feature.
    _dataset, _distance = _engineer_distance_features(_dataset, _distance)

    ## Create the nearest neighbors model and set in the model
    nn_model = _gl.nearest_neighbors.create(_dataset,
                                            distance=_distance,
                                            method=method,
                                            verbose=verbose,
                                            **kwargs)

    ## Postprocessing and formatting
    state.update({
        'verbose': verbose,
        'num_examples': dataset.num_rows(),
        'features': union_features,
        'nearest_neighbors_model': nn_model,
        'num_features': len(union_features),
        'method': nn_model['method'],
        'training_time': _time.time() - start_time
    })

    model = RecordLinker(state)
    return model
    def tag(self, dataset, query_name=None, k=5, similarity_threshold=None,
            exclude_zeros=True, verbose=True):
        """
        Match the reference tags passed when a model is created to a new set of
        queries. This is a many-to-many match: each query may have any number of
        occurrences of a reference tag.

        Parameters
        ----------
        dataset : SFrame
            Query data to be tagged.

        query_name : string, optional
            Name of the column in ``dataset`` to be auto-tagged. If ``dataset``
            has more than one column, ``query_name`` must be specified.

        k : int, optional
            Number of results to return from the reference set for each query
            observation. The default is 5, but setting it to ``None`` will
            return all results whose score is greater than or equal to
            ``similarity_threshold``.

        similarity_threshold : float, optional
            Only results whose score is greater than or equal to the specified
            ``similarity_threshold`` are returned. The default is ``None``, in
            which case the ``k`` best results are returned for each query point.

        exclude_zeros : boolean, optional
            If True, only entries for which there is a tag with a nonzero score
            are preserved in the output. This is the default behavior.

        verbose : bool, optional
            If True, print progress updates and model details.

        Returns
        -------
        out : SFrame
            An SFrame with four columns:

            - row ID
            - column name specified as `tag_name` parameter to `create` method
            - column name specified as `query_name` parameter to `tag` method
            - a similarity score between 0 and 1, indicating the strength of the
              match between the query data and the suggested reference tag,
              where a score of zero indicates a poor match and a score of 1
              corresponds to a perfect match

        Notes
        -----
        - By default, only rows for which there is a tag with a nonzero score
          are included in the output. To guarantee at least one output row for
          every input row in ``dataset``, set the ``exclude_zeros`` parameter
          to False.

        - If both ``k`` and ``similarity_threshold`` are set to ``None``, a
          ToolkitError is raised.

        Examples
        --------
        First construct a toy `SFrame` of actor names, which will serve as the
        reference set for our autotagger model.

        >>> actors_sf = gl.SFrame(
                {"actor": ["Will Smith", "Tom Hanks", "Bradley Cooper",
                           "Tom Cruise", "Jude Law", "Robert Pattinson",
                           "Matt Damon", "Brad Pitt", "Johnny Depp",
                           "Leonardo DiCaprio", "Jennifer Aniston",
                           "Jessica Alba", "Emma Stone", "Cameron Diaz",
                           "Scarlett Johansson", "Mila Kunis", "Julia Roberts",
                           "Charlize Theron", "Marion Cotillard",
                           "Angelina Jolie"]})
        >>> m = gl.data_matching.autotagger.create(actors_sf, tag_name="actor")

        Then we load some IMDB movie reviews into an `SFrame` and tag them using
        the model we created above. The score field in the output is a
        similarity score, indicating the strength of the match between the query
        data and the suggested reference tag.

        >>> reviews_sf = gl.SFrame(
                "s3://dato-datasets/imdb_reviews/reviews.sframe")
        >>> m.tag(reviews_sf.head(10), query_name="review", verbose=False)
        +-----------+-------------------------------+------------------+-----------------+
        | review_id |             review            |      actor       |      score      |
        +-----------+-------------------------------+------------------+-----------------+
        |     0     | Story of a man who has unn... |   Cameron Diaz   | 0.0769230769231 |
        |     0     | Story of a man who has unn... |  Angelina Jolie  | 0.0666666666667 |
        |     0     | Story of a man who has unn... | Charlize Theron  |      0.0625     |
        |     0     | Story of a man who has unn... | Robert Pattinson | 0.0588235294118 |
        |     1     | Bromwell High is a cartoon... |   Jessica Alba   |      0.125      |
        |     1     | Bromwell High is a cartoon... | Jennifer Aniston |       0.1       |
        |     1     | Bromwell High is a cartoon... | Charlize Theron  |       0.05      |
        |     1     | Bromwell High is a cartoon... | Robert Pattinson |  0.047619047619 |
        |     1     | Bromwell High is a cartoon... | Marion Cotillard |  0.047619047619 |
        |     2     | Airport '77 starts as a br... |  Julia Roberts   | 0.0961538461538 |
        |    ...    |              ...              |       ...        |       ...       |
        +-----------+-------------------------------+------------------+-----------------+

        The initial results look a little noisy. To filter out obvious spurious
        matches, we can set the `tag` method's `similarity_threshold` parameter.

        >>> m.tag(reviews_sf.head(1000), query_name="review", verbose=False,
                  similarity_threshold=.8)
        +-----------+-------------------------------+------------------+----------------+
        | review_id |             review            |      actor       |     score      |
        +-----------+-------------------------------+------------------+----------------+
        |    341    | I caught this film at a te... |  Julia Roberts   | 0.857142857143 |
        |    657    | Fairly funny Jim Carrey ve... | Jennifer Aniston | 0.882352941176 |
        |    668    | A very funny movie. It was... | Jennifer Aniston | 0.833333333333 |
        |    673    | This film is the best film... | Jennifer Aniston |     0.9375     |
        +-----------+-------------------------------+------------------+----------------+

        """
        _mt._get_metric_tracker().track(self.__module__ + '.tag')

        # validate the 'dataset' input
        _tkutl._raise_error_if_not_sframe(dataset, "dataset")
        _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

        # ensure that either k or similarity_threshold is set
        if not (k or similarity_threshold):
            raise _ToolkitError("Either k or similarity_threshold parameters " \
                                "must be set")

        # ensure that query_name is provided if dataset has > 1 column
        if dataset.num_cols() > 1 and not query_name:
            raise _ToolkitError("No query_name parameter specified on " \
                                "dataset with %d columns" % dataset.num_cols())

        query_column = query_name or dataset.column_names()[0]

        # ensure that the query column exists in the dataset
        if query_column not in dataset.column_names():
            raise _ToolkitError('No column named "%s" in dataset' \
                                % query_column)

        query_sa = dataset.select_column(query_column)
        query_sf = _gl.SFrame({"id": range(len(query_sa)),
                               query_column: query_sa})

        features = _preprocess(query_sa)
        features = features.add_row_number()

        if similarity_threshold:
            if not isinstance(similarity_threshold, (float, int)):
                raise _ToolkitError("similarity_threshold parameter must be a" \
                                    "float or an int.")

            if similarity_threshold < 0 or similarity_threshold > 1:
                raise _ToolkitError("similarity_threshold parameter must be " \
                                    "between 0 and 1.")

        radius = (1 - similarity_threshold) if similarity_threshold else None
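        # The similarity scores computed later are presumably 1 minus the
        # weighted Jaccard distance, so a similarity_threshold of s corresponds
        # to a nearest-neighbors query radius of 1 - s.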

        results = self._nn_model.query(features, label="id", k=k,
                                       radius=radius,
                                       verbose=verbose)

        # return empty SFrame immediately if no NN results
        if len(results) == 0:
            return _gl.SFrame({query_column + "_id": [],
                               query_column: [],
                               self.get("tag_name"): [],
                               "score": []})

        results = results.join(query_sf, on={"query_label": "id"})
        results.rename({"query_label": query_column + "_id",
                        query_column: "query_label"})

        # convert distances to similarity scores
        scores = _dists_to_sim_scores("weighted_jaccard", results)

        results.add_column(scores, "score")
        results.remove_column("distance")
        results.remove_column("rank")
        results.rename({"reference_label": self.get("tag_name"),
                        "query_label": query_column})
        results.swap_columns(self.get("tag_name"), query_column)

        if exclude_zeros:
            try:
                results = results.filter_by(0.0, "score", exclude=True)
            except RuntimeError: # nothing to join
                _logging.getLogger(__name__).warn(
                    "Empty results after filtering scores of 0.")
                results = results.head(0)

        return results