def create(dataset,
           features=None,
           distance=None,
           method='auto',
           verbose=True,
           **kwargs):
    """
    Create a RecordLinker model to match query records to a reference dataset of
    records, assuming both sets have the same general form.

    Parameters
    ----------
    dataset : SFrame
        Reference data, against which to link new queries with the 'link'
        method. The 'dataset' SFrame must include at least the features
        specified in the 'features' or 'distance' parameter.

    features : list[string], optional
        Name of the columns with features to use in comparing records. 'None'
        (the default) indicates that all columns should be used. Each column can
        be one of the following types:

        - *Numeric*: values of numeric type integer or float.

        - *Array*: array of numeric (integer or float) values. Each array
          element is treated as a separate variable in the model.

        - *Dictionary*: key-value pairs with numeric (integer or float) values.
          Each key indicates a separate variable in the model.

        - *String*: string values.

        Please note: if 'distance' is specified as a composite distance, then
        that parameter controls which features are used in the model.

    distance : string or list[list], optional
        Function to measure the distance between any two input data rows. This
        may be one of two types:

        - *String*: the name of a standard distance function. One of
          'euclidean', 'squared_euclidean', 'manhattan', 'levenshtein',
          'jaccard', 'weighted_jaccard', 'cosine', 'dot_product' (deprecated),
          or 'transformed_dot_product'.

        - *Composite distance*: the weighted sum of several standard distance
          functions applied to various features. This is specified as a list of
          distance components, each of which is itself a list containing three
          items:

          1. list or tuple of feature names (strings)

          2. standard distance name (string)

          3. scaling factor (int or float)

        For more information about GraphLab Create distance functions, please
        see the :py:mod:`~graphlab.toolkits.distances` module.

        For sparse vectors, missing keys are assumed to have value 0.0.

        If 'distance' is left unspecified or set to 'auto', a composite distance
        is constructed automatically based on feature types.

    method : {'auto', 'brute_force', 'lsh', 'ball_tree'}, optional
        Strategy for the nearest neighbors search. If not specified or 'auto',
        the search strategy is chosen automatically based on the data type and
        dimension.

    verbose : bool, optional
        If True, print progress updates and model details.

    **kwargs : optional
        Options passed through to the nearest_neighbors toolkit for particular
        nearest neighbors search strategies:

        - *leaf_size*: for the ball tree method, the number of points in each
          leaf of the tree. The default is to use the max of 1,000 and n/(2^11),
          which ensures a maximum tree depth of 12.

        - *num_tables*: For the LSH method, the number of hash tables
          constructed.

        - *num_projections_per_table*: For the LSH method, the number of
          projections for each hash table.

    Returns
    -------
    out : RecordLinker model.

    Raises
    ------
    TypeError
        If 'features' is not a list-like of strings, or if 'distance' is
        neither a string, a list, nor None.

    ToolkitError
        If a keyword argument other than 'leaf_size', 'num_tables', or
        'num_projections_per_table' is passed.

    See Also
    --------
    RecordLinker, graphlab.toolkits.nearest_neighbors

    Notes
    -----
    - Standardizing features is often a good idea with distance-based methods,
      but this model does *not* standardize features.

    - For features that all have the same type, the distance parameter may be a
      single standard distance function name (e.g. "euclidean"). In the model,
      however, all distances are first converted to composite distance
      functions; as a result, the 'distance' field in the model is always a
      composite distance.

    References
    ----------
    - Christen, Peter. "Data matching: concepts and techniques for record
      linkage, entity resolution, and duplicate detection." Springer Science &
      Business Media, 2012.

    Examples
    --------
    >>> homes = graphlab.SFrame({'sqft': [1230, 875, 1745],
    ...                          'street': ['phinney', 'fairview', 'cottage'],
    ...                          'city': ['seattle', 'olympia', 'boston'],
    ...                          'state': ['WA', 'WA', 'MA']})
    ...
    >>> model = graphlab.record_linker.create(homes, features=['city'],
    ...                                       distance='levenshtein')
    """

    _mt._get_metric_tracker().track('{}.create'.format(__name__))
    start_time = _time.time()

    ## Validate the 'dataset' input.
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    ## Clean the method options and create the options dictionary
    allowed_kwargs = ['leaf_size', 'num_tables', 'num_projections_per_table']
    _method_options = {}

    for k, v in kwargs.items():
        if k in allowed_kwargs:
            _method_options[k] = v
        else:
            raise _ToolkitError(
                "'{}' is not a valid keyword argument".format(k) +
                " for the nearest neighbors model. Please " +
                "check for capitalization and other typos.")

    ## Validate the features input.
    if features is not None:
        # A bare string is itself iterable (and has '__iter__' on Python 3), so
        # without the explicit check it would be accepted and treated as a
        # list of one-character column names.
        if isinstance(features, str) or not hasattr(features, '__iter__'):
            raise TypeError("Input 'features' must be a list.")

        if not all([isinstance(x, str) for x in features]):
            raise TypeError("Input 'features' must contain only strings.")

    else:
        features = dataset.column_names()

    ## Validate and preprocess the distance input.
    col_types = {
        k: v
        for k, v in zip(dataset.column_names(), dataset.column_types())
    }

    if isinstance(distance, list):
        # Copy so the caller's composite distance is never modified.
        distance = _copy.deepcopy(distance)

    elif distance is None or distance == 'auto':
        # 'auto' is documented as equivalent to leaving 'distance' unset; it
        # must be handled before the generic string branch, which would
        # otherwise treat it as the name of a standard distance function.
        distance = _construct_auto_distance(features, col_types)

    elif isinstance(distance, str):
        # this will likely produce errors downstream if 'features' was not
        # specified by the user.
        distance = [[features, distance, 1]]

    else:
        raise TypeError(
            "Input 'distance' not understood. For the " +
            "data matching toolkit, 'distance' must be a string or " +
            "a composite distance list.")

    ## Validate the composite distance and set it in the model.
    #  Maps each allowed distance name to the column types it accepts.
    allowed_dists = {
        'euclidean': [int, float, _array.array],
        'squared_euclidean': [int, float, _array.array],
        'manhattan': [int, float, _array.array],
        'levenshtein': [str],
        'jaccard': [str, dict],
        'weighted_jaccard': [str, dict],
        'cosine': [int, float, str, dict, _array.array],
        'dot_product': [int, float, str, dict, _array.array],
        'transformed_dot_product': [int, float, str, dict, _array.array]
    }

    distance = _dmutl.validate_composite_distance(distance,
                                                  row_label=None,
                                                  allowed_dists=list(
                                                      allowed_dists.keys()),
                                                  verbose=verbose)

    ## Validate feature types against distance functions.
    _dmutl.validate_distance_feature_types(dataset, distance, allowed_dists)

    ## Clean and impute string data.

    #  *** NOTE: after this, the composite distance and feature set will be
    #      modified and useless to the user, so set the state here. ***
    state = {'distance': distance, 'num_distance_components': len(distance)}

    union_features = _dmutl.extract_composite_features(distance)

    # Work on copies so the user's SFrame and the stored distance stay intact.
    _dataset = _copy.copy(dataset)
    _distance = _copy.deepcopy(distance)

    for ftr in union_features:
        if col_types[ftr] == str:
            # Add a cleaned copy of each string column and point every
            # distance component at the cleaned column instead of the raw one.
            new_ftr = '__clean.' + ftr
            _dataset[new_ftr] = _dataset[ftr].fillna("")
            _dataset[new_ftr] = _dataset[new_ftr].apply(
                lambda x: _dmutl.cleanse_string(x), dtype=str)

            for dist_comp in _distance:
                dist_comp[0] = [
                    new_ftr if x == ftr else x for x in dist_comp[0]
                ]

    ## Convert strings to dicts if the distance isn't levenshtein, and
    #  concatenate string columns within a distance component into a single
    #  feature.
    _dataset, _distance = _engineer_distance_features(_dataset, _distance)

    ## Create the nearest neighbors model and set in the model
    #  Forward the validated option dictionary built above.
    nn_model = _gl.nearest_neighbors.create(_dataset,
                                            distance=_distance,
                                            method=method,
                                            verbose=verbose,
                                            **_method_options)

    ## Postprocessing and formatting
    state.update({
        'verbose': verbose,
        'num_examples': dataset.num_rows(),
        'features': union_features,
        'nearest_neighbors_model': nn_model,
        'num_features': len(union_features),
        'method': nn_model['method'],
        'training_time': _time.time() - start_time
    })

    model = RecordLinker(state)
    return model
예제 #2
0
def create(dataset, features=None, distance=None, method='auto', verbose=True,
           **kwargs):
    """
    Create a RecordLinker model to match query records to a reference dataset of
    records, assuming both sets have the same general form.

    Parameters
    ----------
    dataset : SFrame
        Reference data, against which to link new queries with the 'link'
        method. The 'dataset' SFrame must include at least the features
        specified in the 'features' or 'distance' parameter.

    features : list[string], optional
        Name of the columns with features to use in comparing records. 'None'
        (the default) indicates that all columns should be used. Each column can
        be one of the following types:

        - *Numeric*: values of numeric type integer or float.

        - *Array*: array of numeric (integer or float) values. Each array
          element is treated as a separate variable in the model.

        - *Dictionary*: key-value pairs with numeric (integer or float) values.
          Each key indicates a separate variable in the model.

        - *String*: string values.

        Please note: if 'distance' is specified as a composite distance, then
        that parameter controls which features are used in the model.

    distance : string or list[list], optional
        Function to measure the distance between any two input data rows. This
        may be one of two types:

        - *String*: the name of a standard distance function. One of
          'euclidean', 'squared_euclidean', 'manhattan', 'levenshtein',
          'jaccard', 'weighted_jaccard', 'cosine', or 'dot_product'. Please see
          the :mod:`distances` module for more details.

        - *Composite distance*: the weighted sum of several standard distance
          functions applied to various features. This is specified as a list of
          distance components, each of which is itself a list containing three
          items:

          1. list or tuple of feature names (strings)

          2. standard distance name (string)

          3. scaling factor (int or float)

        If 'distance' is left as None, a composite distance is constructed
        automatically from the features and their column types.

    method : {'auto', 'brute_force', 'lsh', 'ball_tree'}, optional
        Strategy for the nearest neighbors search. If not specified or 'auto',
        the search strategy is chosen automatically based on the data type and
        dimension.

    verbose : bool, optional
        If True, print progress updates and model details.

    **kwargs : optional
        Options passed through to the nearest_neighbors toolkit for particular
        nearest neighbors search strategies:

        - *leaf_size*: for the ball tree method, the number of points in each
          leaf of the tree. The default is to use the max of 1,000 and n/(2^11),
          which ensures a maximum tree depth of 12.

        - *num_tables*: For the LSH method, the number of hash tables
          constructed.

        - *num_projections_per_table*: For the LSH method, the number of
          projections for each hash table.

    Returns
    -------
    out : RecordLinker model.

    Raises
    ------
    TypeError
        If 'features' is not a list-like of strings, or if 'distance' is
        neither a string, a list, nor None.

    See Also
    --------
    RecordLinker, graphlab.toolkits.nearest_neighbors

    Notes
    -----
    - Standardizing features is often a good idea with distance-based methods,
      but this model does *not* standardize features.

    - For features that all have the same type, the distance parameter may be a
      single standard distance function name (e.g. "euclidean"). In the model,
      however, all distances are first converted to composite distance
      functions; as a result, the 'distance' field in the model is always a
      composite distance.

    References
    ----------
    - Christen, Peter. "Data matching: concepts and techniques for record
      linkage, entity resolution, and duplicate detection." Springer Science &
      Business Media, 2012.

    Examples
    --------
    >>> homes = graphlab.SFrame({'sqft': [1230, 875, 1745],
    ...                          'street': ['phinney', 'fairview', 'cottage'],
    ...                          'city': ['seattle', 'olympia', 'boston'],
    ...                          'state': ['WA', 'WA', 'MA']})
    ...
    >>> model = graphlab.record_linker.create(homes, features=['city'],
    ...                                       distance='levenshtein')
    """

    _mt._get_metric_tracker().track('{}.create'.format(__name__))
    start_time = _time.time()


    ## Validate the 'dataset' input.
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")


    ## Validate the features input.
    if features is not None:
        # A bare string is itself iterable (and has '__iter__' on Python 3), so
        # without the explicit check it would be accepted and treated as a
        # list of one-character column names.
        if isinstance(features, str) or not hasattr(features, '__iter__'):
            raise TypeError("Input 'features' must be a list.")

        if not all([isinstance(x, str) for x in features]):
            raise TypeError("Input 'features' must contain only strings.")

    else:
        features = dataset.column_names()


    ## Validate and preprocess the distance input.
    col_types = {k: v for k, v in zip(dataset.column_names(),
                                      dataset.column_types())}

    if isinstance(distance, list):
        # Copy so the caller's composite distance is never modified.
        distance = _copy.deepcopy(distance)

    elif isinstance(distance, str):
        # this will likely produce errors downstream if 'features' was not
        # specified by the user.
        distance = [[features, distance, 1]]

    elif distance is None:
        distance = _construct_auto_distance(features, col_types)

    else:
        raise TypeError("Input 'distance' not understood. For the " +
                        "data matching toolkit, 'distance' must be a string or " +
                        "a composite distance list.")


    ## Validate the composite distance and set it in the model.
    #  Maps each allowed distance name to the column types it accepts.
    allowed_dists = {
        'euclidean': [int, float, _array.array],
        'squared_euclidean': [int, float, _array.array],
        'manhattan': [int, float, _array.array],
        'levenshtein': [str],
        'jaccard': [str, dict],
        'weighted_jaccard': [str, dict],
        'cosine': [int, float, str, dict, _array.array],
        'dot_product': [int, float, str, dict, _array.array]}

    # Pass a concrete list of names rather than a dictionary key view.
    distance = _dmutl.validate_composite_distance(
        distance, row_label=None, allowed_dists=list(allowed_dists.keys()),
        verbose=verbose)


    ## Validate feature types against distance functions.
    _dmutl.validate_distance_feature_types(dataset, distance, allowed_dists)


    ## Clean and impute string data.

    #  *** NOTE: after this, the composite distance and feature set will be
    #      modified and useless to the user, so set the state here. ***
    state = {'distance': distance,
             'num_distance_components': len(distance)}

    union_features = _dmutl.extract_composite_features(distance)

    # Work on copies so the user's SFrame and the stored distance stay intact.
    _dataset = _copy.copy(dataset)
    _distance = _copy.deepcopy(distance)

    for ftr in union_features:
        if col_types[ftr] == str:
            # Add a cleaned copy of each string column and point every
            # distance component at the cleaned column instead of the raw one.
            new_ftr = '__clean.' + ftr
            _dataset[new_ftr] = _dataset[ftr].fillna("")
            _dataset[new_ftr] = _dataset[new_ftr].apply(
                lambda x: _dmutl.cleanse_string(x), dtype=str)

            for dist_comp in _distance:
                dist_comp[0] = [new_ftr if x == ftr else x for x in dist_comp[0]]


    ## Convert strings to dicts if the distance isn't levenshtein, and
    #  concatenate string columns within a distance component into a single
    #  feature.
    _dataset, _distance = _engineer_distance_features(_dataset, _distance)


    ## Create the nearest neighbors model and set in the model
    knn_model = _gl.nearest_neighbors.create(_dataset, distance=_distance,
                                             method=method, verbose=verbose,
                                             **kwargs)


    ## Postprocessing and formatting
    state.update({'verbose': verbose,
                  'num_examples': dataset.num_rows(),
                  'features': union_features,
                  'num_features': len(union_features),
                  'method': knn_model['method'],
                  'training_time': _time.time() - start_time})

    model = RecordLinker(knn_model, state)
    return model
    def link(self, dataset, k=5, radius=None, verbose=True):
        """
        Find matching records from the reference dataset (entered when the model
        was created) for each record in the 'dataset' passed to this function.
        The query dataset must include columns with the same names as the label
        and feature columns used to create the RecordLinker
        model.

        Parameters
        ----------
        dataset : SFrame
            Query data. Must contain columns with the same names and types as
            the features used to train the model. Additional columns are
            allowed, but ignored. Please see the nearest neighbors
            :func:`~graphlab.nearest_neighbors.create` documentation for more
            detail on allowable data types.

        k : int, optional
            Maximum number of nearest neighbors to return from the reference set
            for each query observation. The default is 5, but setting it to
            ``None`` will return all neighbors within ``radius`` of the query
            point.

        radius : float, optional
            Only neighbors whose distance to a query point is smaller than this
            value are returned. The default is ``None``, in which case the ``k``
            nearest neighbors are returned for each query point, regardless of
            distance.

        verbose : bool, optional
            If True, print progress updates and model details.

        Returns
        -------
        out : SFrame
            An SFrame with the k-nearest neighbors of each query observation.
            The result contains four columns: the first is the row label of the
            query observation, the second is the row label of the nearby
            reference observation, the third is the distance between the query
            and reference observations, and the fourth is the rank of the
            reference observation among the query's k-nearest neighbors.

        Notes
        -----
        - If both ``k`` and ``radius`` are set to ``None``, each query point
          returns all of the reference set. If the reference dataset has
          :math:`n` rows and the query dataset has :math:`m` rows, the output is
          an SFrame with :math:`nm` rows.

        Examples
        --------
        Assume we've created the model from the example in the RecordLinker
        'create' function.

        >>> queries = graphlab.SFrame({'sqft': [986, 1320],
        ...                            'street': ['fremont', 'phiney'],
        ...                            'city': ['sea', 'seattle'],
        ...                            'state': ['WA', 'WA']})
        ...
        >>> model.link(queries, k=2, radius=5.)
        +-------------+-----------------+----------+------+
        | query_label | reference_label | distance | rank |
        +-------------+-----------------+----------+------+
        |      0      |        0        |   4.0    |  1   |
        |      0      |        2        |   5.0    |  2   |
        |      1      |        0        |   0.0    |  1   |
        +-------------+-----------------+----------+------+
        """
        _mt._get_metric_tracker().track(self.__module__ + '.link_records')

        ## Validate the 'dataset' input.
        _tkutl._raise_error_if_not_sframe(dataset, "dataset")
        _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

        ## Make sure all of the necessary features are present at 'link' time.
        #  The selected columns themselves are not needed below, so the result
        #  is discarded; the call is kept for whatever checking it performs.
        model_features = self.get('features')
        _tkutl._toolkits_select_columns(dataset, model_features)

        ## Clean and impute string data. *** Think about consolidating this and
        #  the next step into a feature transformer.***
        col_types = {
            k: v
            for k, v in zip(dataset.column_names(), dataset.column_types())
        }

        # Work on copies so neither the caller's SFrame nor the distance stored
        # in the model is modified.
        _dataset = _copy.copy(dataset)
        _distance = _copy.deepcopy(self.__proxy__['distance'])

        for ftr in model_features:
            if col_types[ftr] == str:
                # Add a cleaned copy of each string column and point every
                # distance component at the cleaned column.
                new_ftr = '__clean.' + ftr
                _dataset[new_ftr] = _dataset[ftr].fillna("")
                _dataset[new_ftr] = _dataset[new_ftr].apply(
                    lambda x: _dmutl.cleanse_string(x), dtype=str)

                for dist_comp in _distance:
                    dist_comp[0] = [
                        new_ftr if x == ftr else x for x in dist_comp[0]
                    ]

        ## Convert strings to dicts and concatenate string features.
        #  Only the transformed dataset is used; the returned distance is
        #  ignored.
        _dataset, _ = _engineer_distance_features(_dataset, _distance)

        ## Query the nearest neighbor model
        result = self.__proxy__['nearest_neighbors_model'].query(
            _dataset, k=k, radius=radius, verbose=verbose)
        return result
예제 #4
0
    def link(self, dataset, k=5, radius=None, verbose=True):
        """
        Find matching records from the reference dataset (entered when the model
        was created) for each record in the 'dataset' passed to this function.
        The query dataset must include columns with the same names as the label
        and feature columns used to create the RecordLinker
        model.

        Parameters
        ----------
        dataset : SFrame
            Query data. Must contain columns with the same names and types as
            the features used to train the model. Additional columns are
            allowed, but ignored. Please see the nearest neighbors
            :func:`~graphlab.nearest_neighbors.create` documentation for more
            detail on allowable data types.

        k : int, optional
            Maximum number of nearest neighbors to return from the reference set
            for each query observation. The default is 5, but setting it to
            ``None`` will return all neighbors within ``radius`` of the query
            point.

        radius : float, optional
            Only neighbors whose distance to a query point is smaller than this
            value are returned. The default is ``None``, in which case the ``k``
            nearest neighbors are returned for each query point, regardless of
            distance.

        verbose : bool, optional
            If True, print progress updates and model details.

        Returns
        -------
        out : SFrame
            An SFrame with the k-nearest neighbors of each query observation.
            The result contains four columns: the first is the row label of the
            query observation, the second is the row label of the nearby
            reference observation, the third is the distance between the query
            and reference observations, and the fourth is the rank of the
            reference observation among the query's k-nearest neighbors.

        Notes
        -----
        - If both ``k`` and ``radius`` are set to ``None``, each query point
          returns all of the reference set. If the reference dataset has
          :math:`n` rows and the query dataset has :math:`m` rows, the output is
          an SFrame with :math:`nm` rows.

        Examples
        --------
        Assume we've created the model from the example in the RecordLinker
        'create' function.

        >>> queries = graphlab.SFrame({'sqft': [986, 1320],
        ...                            'street': ['fremont', 'phiney'],
        ...                            'city': ['sea', 'seattle'],
        ...                            'state': ['WA', 'WA']})
        ...
        >>> model.link(queries, k=2, radius=5.)
        +-------------+-----------------+----------+------+
        | query_label | reference_label | distance | rank |
        +-------------+-----------------+----------+------+
        |      0      |        0        |   4.0    |  1   |
        |      0      |        2        |   5.0    |  2   |
        |      1      |        0        |   0.0    |  1   |
        +-------------+-----------------+----------+------+
        """
        _mt._get_metric_tracker().track(self.__module__ + '.link_records')

        ## Validate the 'dataset' input.
        _tkutl._raise_error_if_not_sframe(dataset, "dataset")
        _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

        ## Make sure all of the necessary features are present at 'link' time.
        #  The selected columns themselves are not needed below, so the result
        #  is discarded; the call is kept for whatever checking it performs.
        model_features = self.get('features')
        _tkutl._toolkits_select_columns(dataset, model_features)

        ## Clean and impute string data. *** Think about consolidating this and
        #  the next step into a feature transformer.***
        col_types = {k: v for k, v in zip(dataset.column_names(),
                                          dataset.column_types())}

        # Work on copies so neither the caller's SFrame nor the distance stored
        # in the model state is modified.
        _dataset = _copy.copy(dataset)
        _distance = _copy.deepcopy(self._state['distance'])

        for ftr in model_features:
            if col_types[ftr] == str:
                # Add a cleaned copy of each string column and point every
                # distance component at the cleaned column.
                new_ftr = '__clean.' + ftr
                _dataset[new_ftr] = _dataset[ftr].fillna("")
                _dataset[new_ftr] = _dataset[new_ftr].apply(
                    lambda x: _dmutl.cleanse_string(x), dtype=str)

                for dist_comp in _distance:
                    dist_comp[0] = [new_ftr if x == ftr else x for x in dist_comp[0]]


        ## Convert strings to dicts and concatenate string features.
        #  Only the transformed dataset is used; the returned distance is
        #  ignored.
        _dataset, _ = _engineer_distance_features(_dataset, _distance)


        ## Query the nearest neighbor model
        result = self._knn_model.query(_dataset, k=k, radius=radius,
                                       verbose=verbose)
        return result