示例#1
0
    def prob(self, comparison_vectors):
        """Compute the probabilities for each record pair.

        For each pair of records, estimate the probability of being a match.

        Parameters
        ----------
        comparison_vectors : pandas.DataFrame
            The dataframe with comparison vectors.

        Returns
        -------
        pandas.Series
            The probability of being a match for each record pair, indexed
            like ``comparison_vectors``.

        """

        logging.info("Classifying - compute probabilities")

        # ``DataFrame.as_matrix()`` was removed in pandas 1.0; ``.values``
        # yields the same numpy array on both old and new pandas versions.
        enc_vectors = self.algorithm._transform_vectors(
            comparison_vectors.values)

        return pandas.Series(self.algorithm._expectation(enc_vectors),
                             index=comparison_vectors.index)
示例#2
0
    def __init__(self, features=[], n_jobs=1, indexing_type='label', **kwargs):

        logging.info("comparing - initialize {} class".format(
            self.__class__.__name__))

        self.features = []
        self.add(features)

        # public
        self.n_jobs = n_jobs
        self.indexing_type = indexing_type  # label of position
        self.features = []

        # logging
        self._i = 1
        self._i_max = None
        self._n = []
        self._eta = []
        self._output_log_total = True

        # private
        self._compare_functions = []

        if isinstance(features, (pandas.MultiIndex, pandas.Index)):
            warnings.warn(
                "It seems you are using the older version of the Compare API, "
                "see the documentation about how to update to the new API. "
                "http://recordlinkage.readthedocs.io/"
                "en/latest/ref-compare.html", VisibleDeprecationWarning)
示例#3
0
    def predict(self, comparison_vectors, return_type='index'):
        """Classify record pairs as matches or non-matches.

        The work is delegated to ``self._predict``; the classifier has to be
        trained before this method is called.

        Parameters
        ----------
        comparison_vectors : pandas.DataFrame
            Dataframe with comparison vectors.
        return_type : 'index' (default), 'series', 'array'
            The format of the classification result. 'index' returns the
            pandas.MultiIndex of the matches, 'series' returns a
            pandas.Series with zeros (distinct) and ones (matches), and
            'array' returns a numpy.ndarray with zeros and ones.

        Returns
        -------
        pandas.MultiIndex, pandas.Series or numpy.ndarray
            Whatever ``self._predict`` returns for the requested
            ``return_type``.

        """

        logging.info("Classifying - predict matches and non-matches")

        classification = self._predict(comparison_vectors, return_type)
        return classification
示例#4
0
    def __init__(self, algorithms=[]):

        logging.info("Index - initialize {} class".format(
            self.__class__.__name__))

        self.algorithms = []
        self.add(algorithms)
示例#5
0
    def fit(self, comparison_vectors, match_index=None):
        """Train the classifier.

        Parameters
        ----------
        comparison_vectors : pandas.DataFrame
            The comparison vectors (or features) to train the model with.
        match_index : pandas.MultiIndex
            A pandas.MultiIndex object with the true matches.
            The MultiIndex contains only the true matches. Default None, in
            which case ``self._fit`` is called without labels.

        Raises
        ------
        LearningError
            If labelling the training data fails with an IndexError and none
            of the pairs in ``match_index`` occur in ``comparison_vectors``.
        ValueError
            If ``match_index`` is neither None nor a pandas index.

        Note
        ----

        A note in case of finding links within a single dataset (for example
        deduplication). Ensure that the training record pairs are from the
        lower triangular part of the dataset/matrix. See detailed information
        here: link.

        """

        logging.info("Classification - start training {}".format(
            self.__class__.__name__)
        )

        self._initialise_classifier(comparison_vectors)

        # start timing
        start_time = time.time()

        if isinstance(match_index, (pandas.MultiIndex, pandas.Index)):

            # Use an explicit set intersection: the ``&`` operator on index
            # objects no longer means "intersection" in pandas >= 2.0.
            matches = match_index.intersection(comparison_vectors.index)

            try:
                y = pandas.Series(0, index=comparison_vectors.index)
                y.loc[matches] = 1
            except IndexError:
                # ``pandas.IndexError`` does not exist; the builtin is what
                # has to be caught here.

                # There are no matches. So training is not possible.
                if len(matches) == 0:
                    raise LearningError(
                        "both matches and non-matches needed in the "
                        "trainingsdata, only non-matches found"
                    )
                raise

            self._fit(comparison_vectors.values, y.values)

        elif match_index is None:
            self._fit(comparison_vectors.values)
        else:
            raise ValueError(
                "'match_index' has incorrect type '{}'".format(
                    type(match_index)
                )
            )

        # log timing
        logf_time = "Classification - training computation time: ~{:.2f}s"
        logging.info(logf_time.format(time.time() - start_time))
示例#6
0
    def prob(self, comparison_vectors, return_type='series'):
        """Compute the probabilities for each record pair.

        For each pair of records, estimate the probability of being a match.

        Parameters
        ----------
        comparison_vectors : pandas.DataFrame
            The dataframe with comparison vectors.
        return_type : 'series' or 'array'
            Return a pandas series or numpy array. Default 'series'.

        Returns
        -------
        pandas.Series or numpy.ndarray
            The first column of ``self.classifier.predict_proba`` for each
            record pair.

        Raises
        ------
        ValueError
            If ``return_type`` is neither 'series' nor 'array'.

        """

        logging.info("Classifying - compute probabilities")

        # ``DataFrame.as_matrix()`` was removed in pandas 1.0; ``.values``
        # yields the same numpy array on both old and new pandas versions.
        probs = self.classifier.predict_proba(comparison_vectors.values)

        # NOTE(review): column 0 is taken as the match probability; which
        # class that column belongs to depends on the wrapped classifier --
        # confirm against its class ordering.
        if return_type == 'series':
            return pandas.Series(probs[:, 0], index=comparison_vectors.index)
        elif return_type == 'array':
            return probs[:, 0]
        else:
            # Only the two formats above are supported by this method, so the
            # message must not advertise 'index'.
            raise ValueError(
                "return_type {} unknown. Choose 'series' or "
                "'array'".format(return_type))
示例#7
0
    def predict(self, comparison_vectors):
        """Predict the class of the record pairs.

        Classify a set of record pairs based on their comparison vectors into
        matches, non-matches and possible matches. The classifier has to be
        trained to call this method.

        Parameters
        ----------
        comparison_vectors : pandas.DataFrame
            Dataframe with comparison vectors.

        Returns
        -------
        object
            The prediction as formatted by ``self._return_result``. The
            former ``return_type`` argument no longer exists; use
            `recordlinkage.set_option('classification.return_type', 'index')`
            instead.

        """

        logging.info("Classification - predict matches and non-matches")

        # make the prediction
        # (``DataFrame.as_matrix()`` was removed in pandas 1.0; ``.values``
        # yields the same numpy array.)
        prediction = self._predict(comparison_vectors.values)
        self._post_predict(prediction)

        # format and return the result
        return self._return_result(prediction, comparison_vectors)
示例#8
0
    def prob(self, comparison_vectors, return_type=None):
        """Compute the probabilities for each record pair.

        For each pair of records, estimate the probability of being a match.

        Parameters
        ----------
        comparison_vectors : pandas.DataFrame
            The dataframe with comparison vectors.
        return_type : str
            Deprecated. Any value other than None only triggers a
            deprecation warning; the result is always a pandas.Series.

        Returns
        -------
        pandas.Series
            The probability of being a match for each record pair, indexed
            like ``comparison_vectors``.

        """

        # deprecation
        if return_type is not None:
            warnings.warn(
                "The argument 'return_type' is removed. "
                "Default value is now 'series'.",
                VisibleDeprecationWarning,
                stacklevel=2)

        logging.info("Classification - compute probabilities")

        # ``DataFrame.as_matrix()`` was removed in pandas 1.0; ``.values``
        # yields the same numpy array on both old and new pandas versions.
        prob_match = self._prob_match(comparison_vectors.values)
        return pandas.Series(prob_match, index=comparison_vectors.index)
示例#9
0
    def exact(self, s1, s2, *args, **kwargs):
        """
        exact(s1, s2, agree_value=1, disagree_value=0, missing_value=0, label=None)

        Compare the record pairs exactly.

        Registers the exact comparison (``_compare_exact``) for the two
        fields through ``self._compare_vectorized``. The similarity is 1 in
        case of agreement and 0 otherwise.

        Parameters
        ----------

        s1 : str or int
            Field name to compare in left DataFrame.
        s2 : str or int
            Field name to compare in right DataFrame.
        agree_value : float, str, numpy.dtype
            The value when two records are identical. Default 1. If 'values'
            is passed, then the value of the record pair is passed.
        disagree_value : float, str, numpy.dtype
            The value when two records are not identical. Default 0.
        missing_value : float, str, numpy.dtype
            The value for a comparison with a missing value. Default 0.
        label : label
            The label of the column in the resulting dataframe.

        """

        # logging
        log_template = ("Comparing - initialize exact algorithm - compare "
                        "{l_left} with {l_right}")
        logging.info(log_template.format(l_left=s1, l_right=s2))

        # extra positional/keyword arguments travel on unchanged
        return self._compare_vectorized(
            _compare_exact, s1, s2, *args, **kwargs)
示例#10
0
    def __init__(self):

        logging.info("Classification - initialize {} class".format(
            self.__class__.__name__))

        # The actual classifier. Maybe this is slightly strange because of
        # inheritance.
        self.classifier = None
示例#11
0
    def learn(self, comparison_vectors, match_index, return_type='index'):
        """Train the classifier.

        Parameters
        ----------
        comparison_vectors : pandas.DataFrame
            The comparison vectors.
        match_index : pandas.MultiIndex
            The true matches.
        return_type : 'index' (default), 'series', 'array'
            The format to return the classification result. The argument value
            'index' will return the pandas.MultiIndex of the matches. The
            argument value 'series' will return a pandas.Series with zeros
            (distinct) and ones (matches). The argument value 'array' will
            return a numpy.ndarray with zeros and ones.

        Returns
        -------
        pandas.MultiIndex, pandas.Series or numpy.ndarray
            The result of ``self._predict`` on the training data, formatted
            according to ``return_type``.

        Raises
        ------
        ValueError
            If ``match_index`` is not a pandas index.
        LearningError
            If labelling the training data fails with an IndexError and none
            of the pairs in ``match_index`` occur in ``comparison_vectors``.

        """

        logging.info("Classifying - start learning {}".format(
            self.__class__.__name__))

        # start timing
        start_time = time.time()

        # Without this guard a wrong type fell through to the ``fit`` call
        # below and failed on the never-assigned ``train_series``.
        if not isinstance(match_index, (pandas.MultiIndex, pandas.Index)):
            raise ValueError(
                "'match_index' has incorrect type '{}'".format(
                    type(match_index)))

        # Boolean label per record pair: True marks a known match.
        train_series = pandas.Series(False, index=comparison_vectors.index)

        # Use an explicit set intersection: the ``&`` operator on index
        # objects no longer means "intersection" in pandas >= 2.0.
        matches = match_index.intersection(comparison_vectors.index)

        try:
            train_series.loc[matches] = True
        except IndexError:
            # ``pandas.IndexError`` does not exist; catch the builtin.

            # There are no matches. So training is not possible.
            if len(matches) == 0:
                raise LearningError(
                    "both matches and non-matches needed in the "
                    "trainingsdata, only non-matches found")
            raise

        # ``DataFrame.as_matrix()`` was removed in pandas 1.0.
        self.classifier.fit(comparison_vectors.values,
                            numpy.array(train_series))

        result = self._predict(comparison_vectors, return_type)

        # log timing
        logf_time = "Classifying - learning computation time: ~{:.2f}s"
        logging.info(logf_time.format(time.time() - start_time))

        return result
示例#12
0
    def _compute(self, pairs, x, x_link=None):
        """Compute every registered feature for the given record pairs.

        Parameters
        ----------
        pairs : index of record pairs
            Passed to ``frame_indexing`` and ``self._union``; its
            ``shape[0]`` is logged as the number of pairs.
        x : DataFrame-like
            Source of the left-hand columns and, when ``x_link`` is None,
            also of the right-hand columns.
        x_link : DataFrame-like, optional
            Source of the right-hand columns when given.

        Returns
        -------
        The value produced by ``self._union`` from the list of
        ``(result, label)`` tuples, one per feature in ``self.features``.

        """

        # start the timer for the comparing step
        t_start = time.time()

        left_labels = self._get_labels_left(validate=x)
        left_indexed = frame_indexing(x[left_labels], pairs, 0)

        # without a second frame the right-hand side also comes from ``x``
        right_source = x if x_link is None else x_link
        right_labels = self._get_labels_right(validate=right_source)
        right_indexed = frame_indexing(right_source[right_labels], pairs, 1)

        computed = []

        for feat in self.features:

            names_left = feat.labels_left
            names_right = feat.labels_right

            cols_left = tuple(left_indexed[nm] for nm in listify(names_left))
            cols_right = tuple(
                right_indexed[nm] for nm in listify(names_right))

            computed.append((feat._compute(cols_left, cols_right), feat.label))

        features = self._union(computed, pairs)

        # bookkeeping for the log lines below
        n_pairs = pairs.shape[0]
        elapsed = time.time() - t_start
        shown_max = self._i_max if self._i_max is not None else '?'
        self._eta.append(elapsed)
        self._n.append(n_pairs)

        # log
        logging.info("comparing [{:d}/{}] - time: {:.2f}s - pairs: {}".format(
            self._i, shown_max, elapsed, n_pairs))

        # log total
        if self._output_log_total:

            pairs_so_far = np.sum(self._n)
            time_so_far = np.sum(self._eta)

            logging.info(
                "comparing [{:d}/{}] - time: {:.2f}s - pairs_total: {}".format(
                    self._i, shown_max, time_so_far, pairs_so_far))

        self._i += 1

        return features
示例#13
0
    def __init__(self, verify_integrity=True):
        """Initialise the indexator.

        Parameters
        ----------
        verify_integrity : bool, optional
            Stored as ``self.verify_integrity``. Default True.

        """
        super(BaseIndexator, self).__init__()

        # bookkeeping lists, both start empty
        self._n, self._n_max = [], []

        self.verify_integrity = verify_integrity

        class_name = self.__class__.__name__
        logging.info("Indexing - initialize {} class".format(class_name))
示例#14
0
    def __init__(self, verify_integrity=True, suffixes=('_1', '_2')):
        """Initialise the index algorithm.

        Parameters
        ----------
        verify_integrity : bool, optional
            Stored as ``self.verify_integrity``. Default True.
        suffixes : tuple of str, optional
            Stored as ``self.suffixes``. Default ``('_1', '_2')``.

        """
        super(BaseIndexAlgorithm, self).__init__()

        self.suffixes = suffixes
        self.verify_integrity = verify_integrity

        # bookkeeping lists, both start empty
        self._n, self._n_max = [], []

        class_name = self.__class__.__name__
        logging.info("Indexing - initialize {} class".format(class_name))
    def learn(self, comparison_vectors, init='jaro', return_type='index'):
        """ Train the algorithm.

        Train the Expectation-Maximisation classifier. This method is well-
        known as the ECM-algorithm implementation in the context of record
        linkage.

        Parameters
        ----------
        comparison_vectors : pandas.DataFrame
            The dataframe with comparison vectors.
        init : str
            Accepted for backward compatibility; this method does not read
            it. Default 'jaro'.
        return_type : 'index' (default), 'series', 'array'
            The format to return the classification result. The argument value
            'index' will return the pandas.MultiIndex of the matches. The
            argument value 'series' will return a pandas.Series with zeros
            (distinct) and ones (matches). The argument value 'array' will
            return a numpy.ndarray with zeros and ones.

        Returns
        -------
        pandas.MultiIndex, pandas.Series or numpy.ndarray
            The prediction for the training data as formatted by
            ``self._return_result``.

        """

        logging.info("Classifying - start learning {}".format(
            self.__class__.__name__)
        )

        # start timing
        start_time = time.time()

        # ``DataFrame.as_matrix()`` was removed in pandas 1.0; ``.values``
        # yields the same numpy array on both old and new pandas versions.
        probs = self.algorithm.train(comparison_vectors.values)

        # The threshold is the probability of the n-th most likely pair,
        # where n is the expected number of matches (p * number of pairs).
        # NOTE(review): when ``n_matches`` is 0 the index below equals
        # ``len(probs)`` and raises IndexError -- decide what threshold is
        # wanted in that case.
        n_matches = int(self.algorithm.p * len(probs))
        self.p_threshold = numpy.sort(probs)[len(probs) - n_matches]

        prediction = self._decision_rule(probs, self.p_threshold)

        result = self._return_result(
            prediction, return_type, comparison_vectors
        )

        # log timing
        logf_time = "Classifying - learning computation time: ~{:.2f}s"
        logging.info(logf_time.format(time.time() - start_time))

        return result
示例#16
0
    def __init__(self, labels_left, labels_right, args=(), kwargs={}):

        self.labels_left = labels_left
        self.labels_right = labels_right
        self.args = args
        self.kwargs = kwargs
        self._f_compare_vectorized = None

        # logging
        logging.info("{} - initialize exact algorithm "
                     "- compare {l_left} with {l_right}".format(
                         self.__class__.__name__,
                         l_left=labels_left,
                         l_right=labels_right))
示例#17
0
    def __init__(self, algorithms=[]):

        logging.info("indexing - initialize {} class".format(
            self.__class__.__name__))

        self.algorithms = []
        self.add(algorithms)

        # logging
        self._i = 1
        self._i_max = None
        self._n = []
        self._n_max = []
        self._eta = []
        self._output_log_total = True
示例#18
0
    def predict(self,
                comparison_vectors,
                return_type='index',
                *args,
                **kwargs):
        """Predict the class of record pairs.

        Classify a set of record pairs based on their comparison vectors into
        matches, non-matches and possible matches. The classifier has to be
        trained to call this method (``self.p_threshold`` must be set).

        Parameters
        ----------
        comparison_vectors : pandas.DataFrame
            The dataframe with comparison vectors.
        return_type : 'index' (default), 'series', 'array'
            The format to return the classification result. The argument value
            'index' will return the pandas.MultiIndex of the matches. The
            argument value 'series' will return a pandas.Series with zeros
            (distinct) and ones (matches). The argument value 'array' will
            return a numpy.ndarray with zeros and ones.
        *args, **kwargs :
            Accepted but not used by this method.

        Returns
        -------
        pandas.MultiIndex, pandas.Series or numpy.ndarray
            The prediction as formatted by ``self._return_result``.

        Note
        ----
        Prediction is risky for this unsupervised learning method. Be aware
        that the sample from the population is valid.


        """

        logging.info("Classifying - predict matches and non-matches")

        # ``DataFrame.as_matrix()`` was removed in pandas 1.0; ``.values``
        # yields the same numpy array on both old and new pandas versions.
        enc_vectors = self.algorithm._transform_vectors(
            comparison_vectors.values)

        probs = self.algorithm._expectation(enc_vectors)

        prediction = self._decision_rule(probs, self.p_threshold)

        return self._return_result(prediction, return_type, comparison_vectors)
示例#19
0
    def __init__(self,
                 pairs=None,
                 df_a=None,
                 df_b=None,
                 low_memory=False,
                 block_size=1000000,
                 njobs=1,
                 indexing_type='label',
                 **kwargs):

        logging.info("Comparing - initialize {} class".format(
            self.__class__.__name__))

        # public
        self.indexing_type = indexing_type  # label of position

        # private
        self._compare_functions = []

        if isinstance(pairs, (pandas.MultiIndex, pandas.Index)):
            self.deprecated = True

            warnings.warn(
                "It seems you are using the older version of the Compare API, "
                "see the documentation about how to update to the new API. "
                "http://recordlinkage.readthedocs.io/"
                "en/latest/ref-compare.html", VisibleDeprecationWarning)
        else:
            self.deprecated = False

        # deprecated
        self.df_a = df_a
        self.df_b = df_b if df_b is not None else df_a

        self.pairs = pairs

        self.low_memory = low_memory
        self.block_size = block_size
        self.njobs = njobs

        self._df_a_indexed = None
        self._df_b_indexed = None

        self.vectors = pandas.DataFrame(index=pairs)
示例#20
0
    def date(self,
             s1,
             s2,
             swap_month_day=0.5,
             swap_months='default',
             *args,
             **kwargs):
        """
        date(self, s1, s2, swap_month_day=0.5, swap_months='default', missing_value=0, label=None)

        Compute the (partial) similarity between date values.

        Registers the date comparison (``_dates_internal``) for the two
        fields through ``self._compare_vectorized``.

        Parameters
        ----------
        s1 : str or int
            The name or position of the column in the left DataFrame.
        s2 : str or int
            The name or position of the column in the right DataFrame.
        swap_month_day : float
            The value if the month and day are swapped.
        swap_months : list of tuples
            A list of tuples with common errors caused by the translating of
            months into numbers, i.e. October is month 10. The format of the
            tuples is (month_good, month_bad, value). Default : swap_months =
            [(6, 7, 0.5), (7, 6, 0.5), (9, 10, 0.5), (10, 9, 0.5)]
        missing_value : numpy.dtype
            The value for a comparison with a missing value. Default 0.
        label : label
            The label of the column in the resulting dataframe.

        """

        # logging
        log_template = ("Comparing - initialize date algorithm - compare "
                        "{l_left} with {l_right}")
        logging.info(log_template.format(l_left=s1, l_right=s2))

        # extra positionals follow s2; the two swap settings go by keyword
        return self._compare_vectorized(
            _dates_internal, s1, s2, *args,
            swap_month_day=swap_month_day,
            swap_months=swap_months,
            **kwargs)
示例#21
0
    def compare_vectorized(self, comp_func, labels_left, labels_right, *args,
                           **kwargs):
        """Compute the similarity between values with a callable.

        Registers a custom function/callable for comparing values by
        forwarding everything to ``self._compare_vectorized``. The
        function/callable should accept numpy.ndarray's.

        Example
        -------

        >>> comp = recordlinkage.Compare()
        >>> comp.compare_vectorized(custom_callable, 'first_name', 'name')
        >>> comp.compare(PAIRS, DATAFRAME1, DATAFRAME2)

        Parameters
        ----------
        comp_func : function
            A comparison function. This function can be a built-in function
            or a user defined comparison function. The function should accept
            numpy.ndarray's as first two arguments.
        labels_left : label, pandas.Series, pandas.DataFrame
            The labels, Series or DataFrame to compare.
        labels_right : label, pandas.Series, pandas.DataFrame
            The labels, Series or DataFrame to compare.
        *args :
            Additional arguments to pass to callable comp_func.
        **kwargs :
            Additional keyword arguments to pass to callable comp_func.
            (keyword 'label' is reserved.)
        label : (list of) label(s)
            The name of the feature and the name of the column. IMPORTANT:
            This argument is a keyword argument.
        """

        message = ("Comparing - initialize user defined function - "
                   "compare {l_left} with {l_right}").format(
                       l_left=labels_left, l_right=labels_right)
        logging.info(message)

        outcome = self._compare_vectorized(
            comp_func, labels_left, labels_right, *args, **kwargs)
        return outcome
示例#22
0
    def __init__(self,
                 comp_func,
                 labels_left,
                 labels_right,
                 args=(),
                 kwargs={},
                 label=None,
                 name="",
                 description=""):

        self.comp_func = comp_func
        self.labels_left = labels_left
        self.labels_right = labels_right
        self.args = args
        self.kwargs = kwargs
        self.label = label
        self.description = description

        # logging
        logging.info("CompareFeature - initialize exact algorithm - compare "
                     "{l_left} with {l_right}".format(l_left=labels_left,
                                                      l_right=labels_right))
示例#23
0
    def _compute(self, *args):

        logging.info("Comparing - start comparing data")

        # start the timer for the comparing step
        start_time = time.time()

        c = self._compute_vectorized(*args)

        # log timing
        total_time = time.time() - start_time

        # log timing
        logging.info(
            "Comparing - computation time: ~{:.2f}s".format(total_time))

        # log results
        logf_result = "Comparing - summary shape={}"
        logging.info(logf_result.format(c.shape))

        return c
示例#24
0
    def _compute(self, pairs, x, x_link=None):
        """Compute all features and collect them in one DataFrame.

        Parameters
        ----------
        pairs : index of record pairs
            Used as the index of the result and passed to ``frame_indexing``.
        x : DataFrame-like
            Source of the left-hand columns and, when ``x_link`` is None,
            also of the right-hand columns.
        x_link : DataFrame-like, optional
            Source of the right-hand columns when given.

        Returns
        -------
        pandas.DataFrame
            One column per feature output column. Columns are named after
            the labels given in ``self.features``; unlabelled columns get a
            running integer.

        """

        logging.info("Comparing - start comparing data")

        # start the timer for the comparing step
        start_time = time.time()

        sublabels_left = self._get_labels_left(validate=x)
        df_a_indexed = frame_indexing(x[sublabels_left], pairs, 0)

        if x_link is None:
            sublabels_right = self._get_labels_right(validate=x)
            df_b_indexed = frame_indexing(x[sublabels_right], pairs, 1)
        else:
            sublabels_right = self._get_labels_right(validate=x_link)
            df_b_indexed = frame_indexing(x_link[sublabels_right], pairs, 1)

        # log timing
        index_time = time.time() - start_time

        results = pandas.DataFrame(index=pairs)
        label_num = 0  # make a label if label is None

        for feat, label in self.features:

            lbl1 = feat.labels_left
            lbl2 = feat.labels_right

            data1 = tuple([df_a_indexed[lbl] for lbl in listify(lbl1)])
            data2 = tuple([df_b_indexed[lbl] for lbl in listify(lbl2)])

            c = feat._compute(*tuple(data1 + data2))

            if is_pandas_like(c):
                c = c.values  # convert pandas into numpy

            if label is not None:
                label = listify(label)

            is_1d = len(c.shape) == 1
            n_cols = 1 if is_1d else c.shape[1]

            # One label per output column. The running counter advances for
            # every column, also when explicit labels are given.
            labels = []
            for i in range(0, n_cols):

                label_val = label[i] if label is not None else label_num
                label_num += 1

                labels.append(label_val)

            # Store every output column under its own label. Assigning the
            # whole array to the last label only worked for single-column
            # results and left the other labels unused.
            if is_1d:
                results[labels[0]] = c
            else:
                for i, label_val in enumerate(labels):
                    results[label_val] = c[:, i]

        # log timing
        total_time = time.time() - start_time

        # log timing
        logging.info("Comparing - computation time: ~{:.2f}s (from which "
                     "indexing: ~{:.2f}s)".format(total_time, index_time))

        # log results
        logf_result = "Comparing - summary shape={}"
        logging.info(logf_result.format(results.shape))

        return results
示例#25
0
    def numeric(self, s1, s2, method='linear', *args, **kwargs):
        """
        numeric(s1, s2, method='linear', offset, scale, origin=0, missing_value=0, label=None)

        Compute the (partial) similarity between numeric values.

        This method initialises the similarity measurement between numeric
        values. The implemented algorithms are: 'step', 'linear', 'exp',
        'gauss' or 'squared'. In case of agreement, the similarity is 1 and in
        case of complete disagreement it is 0. The implementation is similar
        with numeric comparing in ElasticSearch, a full-text search tool. The
        parameters are explained in the image below (source ElasticSearch, The
        Definitive Guide)

        .. image:: /images/elas_1705.png
            :width: 100%
            :target: https://www.elastic.co/guide/en/elasticsearch/guide/current/decay-functions.html
            :alt: Decay functions, like in ElasticSearch

        Parameters
        ----------
        s1 : str or int
            The name or position of the column in the left DataFrame.
        s2 : str or int
            The name or position of the column in the right DataFrame.
        method : str
            The metric used. Options 'step', 'linear' (alias 'lin'), 'exp'
            (alias 'exponential'), 'gauss' (alias 'gaussian') or 'squared'.
            Default 'linear'.
        offset : float
            The offset. See image above.
        scale : float
            The scale of the numeric comparison method. See the image above.
            This argument is not available for the 'step' algorithm.
        origin : str
            The shift of bias between the values. See image above.
        missing_value : numpy.dtype
            The value if one or both records have a missing value on the
            compared field. Default 0.
        label : label
            The label of the column in the resulting dataframe.

        Raises
        ------
        ValueError
            If ``method`` is not one of the known algorithm names.

        Note
        ----
        Numeric comparing can be an efficient way to compare date/time
        variables. This can be done by comparing the timestamps.

        """

        # translate the method name (or alias) into its similarity function
        if method == 'step':
            num_sim_alg = _step_sim
        elif method in ('linear', 'lin'):
            num_sim_alg = _linear_sim
        elif method == 'squared':
            num_sim_alg = _squared_sim
        elif method in ('exp', 'exponential'):
            num_sim_alg = _exp_sim
        elif method in ('gauss', 'gaussian'):
            num_sim_alg = _gauss_sim
        else:
            raise ValueError("The algorithm '{}' is not known.".format(method))

        # logging
        log_template = ("Comparing - initialize numeric '{method}' algorithm "
                        "- compare {l_left} with {l_right}")
        logging.info(
            log_template.format(l_left=s1, l_right=s2, method=method))

        # the similarity function travels as the first extra positional
        return self._compare_vectorized(
            _num_internal, s1, s2, num_sim_alg, *args, **kwargs)
示例#26
0
    def geo(self, lat1, lng1, lat2, lng2, method='linear', *args, **kwargs):
        """
        geo(lat1, lng1, lat2, lng2, method='linear', offset, scale, origin=0, missing_value=0, label=None)

        Compute the (partial) similarity between WGS84 coordinate values.

        Compare the geometric (haversine) distance between two WGS-
        coordinates. The similarity algorithms are 'step', 'linear', 'exp',
        'gauss' or 'squared'. The similarity functions are the same as in
        :meth:`recordlinkage.comparing.Compare.numeric`

        Parameters
        ----------
        lat1 : str or int
            The name or position of the column in the left DataFrame.
        lng1 : str or int
            The name or position of the column in the left DataFrame.
        lat2 : str or int
            The name or position of the column in the right DataFrame.
        lng2 : str or int
            The name or position of the column in the right DataFrame.
        method : str
            The metric used. Options 'step', 'linear' (alias 'lin'), 'exp'
            (alias 'exponential'), 'gauss' (alias 'gaussian') or 'squared'.
            Default 'linear'.
        offset : float
            The offset. See Compare.numeric.
        scale : float
            The scale of the numeric comparison method. See Compare.numeric.
            This argument is not available for the 'step' algorithm.
        origin : float
            The shift of bias between the values. See Compare.numeric.
        missing_value : numpy.dtype
            The value for a comparison with a missing value. Default 0.
        label : label
            The label of the column in the resulting dataframe.

        Raises
        ------
        ValueError
            If ``method`` is not one of the known algorithm names.

        """

        # translate the method name (or alias) into its similarity function
        if method == 'step':
            num_sim_alg = _step_sim
        elif method in ('linear', 'lin'):
            num_sim_alg = _linear_sim
        elif method == 'squared':
            num_sim_alg = _squared_sim
        elif method in ('exp', 'exponential'):
            num_sim_alg = _exp_sim
        elif method in ('gauss', 'gaussian'):
            num_sim_alg = _gauss_sim
        else:
            raise ValueError("The algorithm '{}' is not known.".format(method))

        # coordinates are handled as (latitude, longitude) pairs per side
        left_coords = (lat1, lng1)
        right_coords = (lat2, lng2)

        # logging
        log_template = ("Comparing - initialize geographic '{method}' "
                        "algorithm - compare {l_left} with {l_right}")
        logging.info(log_template.format(
            l_left=left_coords, l_right=right_coords, method=method))

        return self._compare_vectorized(
            _geo_internal, left_coords, right_coords, num_sim_alg,
            *args, **kwargs)
示例#27
0
    def compute(self, pairs, x, x_link=None):
        """Compare the records of each record pair.

        Calling this method starts the comparing of records. Every entry in
        ``self._compare_functions`` is called with the columns it asked for
        and its result is stored as one or more columns of the returned
        DataFrame.

        Parameters
        ----------
        pairs : pandas.MultiIndex
            A pandas MultiIndex with the record pairs to compare. The indices
            in the MultiIndex are indices of the DataFrame(s) to link.
        x : pandas.DataFrame
            The DataFrame to link. If `x_link` is given, the comparing is a
            linking problem. If `x_link` is not given, the problem is one of
            deduplication.
        x_link : pandas.DataFrame, optional
            The second DataFrame.

        Returns
        -------
        pandas.DataFrame
            A pandas DataFrame with feature vectors, i.e. the result of
            comparing each record pair. It is indexed by `pairs`. A column
            is named after the label stored with its compare function, or
            after the running column counter when no label was stored.

        Raises
        ------
        ValueError
            If `pairs` is not a pandas.MultiIndex, or if `x` or a given
            `x_link` is not a pandas.DataFrame.
        """

        if not isinstance(pairs, pandas.MultiIndex):
            raise ValueError(
                "expected pandas.MultiIndex with record pair indices "
                "as first argument")

        if not isinstance(x, pandas.DataFrame):
            raise ValueError("expected pandas.DataFrame as second argument")

        if x_link is not None and not isinstance(x_link, pandas.DataFrame):
            raise ValueError("expected pandas.DataFrame as third argument")

        logging.info("Comparing - start comparing data")

        # start the timer for the comparing step
        start_time = time.time()

        sublabels_left = self._get_labels_left(validate=x)
        df_a_indexed = self._loc2(x[sublabels_left], pairs, 0)

        # without a second frame the right-hand records come from x itself
        right = x if x_link is None else x_link
        sublabels_right = self._get_labels_right(validate=right)
        df_b_indexed = self._loc2(right[sublabels_right], pairs, 1)

        # time spent on aligning the records with the pairs
        index_time = time.time() - start_time

        results = pandas.DataFrame(index=pairs)
        label_num = 0  # running column counter, used when label is None

        for f, lbl1, lbl2, label, args, kwargs in self._compare_functions:

            data1 = tuple([df_a_indexed[lbl] for lbl in listify(lbl1)])
            data2 = tuple([df_b_indexed[lbl] for lbl in listify(lbl2)])

            c = f(*tuple(data1 + data2 + args), **kwargs)

            if isinstance(c, (pandas.Series, pandas.DataFrame)):
                c = c.values  # convert pandas into numpy

            if label is not None:
                label = listify(label)

            single_column = len(c.shape) == 1
            n_cols = 1 if single_column else c.shape[1]

            # one label per result column; the counter advances for every
            # column, labelled or not
            labels = []
            for i in range(n_cols):
                labels.append(label[i] if label is not None else label_num)
                label_num += 1

            if single_column:
                results[labels[0]] = c
            else:
                # Store each column under its own label. Assigning the whole
                # 2-D array to the last label alone would drop or mangle the
                # other columns.
                for i, label_val in enumerate(labels):
                    results[label_val] = c[:, i]

        # log timing
        total_time = time.time() - start_time
        logging.info("Comparing - computation time: ~{:.2f}s (from which "
                     "indexing: ~{:.2f}s)".format(total_time, index_time))

        # log results
        logf_result = "Comparing - summary shape={}"
        logging.info(logf_result.format(results.shape))

        return results
示例#28
0
    def index(self, x, x_link=None):
        """Make an index of record pairs.

        Use a custom function to make record pairs of one or two dataframes.
        Each function should return a pandas.MultiIndex with record pairs.

        Parameters
        ----------
        x: pandas.DataFrame
            A pandas DataFrame. When `x_link` is None, the algorithm makes
            record pairs within the DataFrame. When `x_link` is not empty,
            the algorithm makes pairs between `x` and `x_link`. A list or
            tuple of DataFrames is accepted as well when `x_link` is None.
        x_link: pandas.DataFrame, optional
            A second DataFrame to link with the DataFrame x.

        Returns
        -------
        pandas.MultiIndex
            A pandas.MultiIndex with record pairs. Each record pair contains
            the index labels of two records.

        Raises
        ------
        ValueError
            If `x` is None.

        Notes
        -----
        The number of pairs made and the value of ``max_pairs`` are appended
        to ``self._n`` and ``self._n_max``. The logged reduction ratios are
        reported as ``nan`` when ``max_pairs`` is zero.

        """

        if x is None:  # error
            raise ValueError("provide at least one dataframe")
        elif x_link is not None:  # linking (two arg)
            x = (x, x_link)
        elif isinstance(x, (list, tuple)):  # dedup or linking (single arg)
            x = tuple(x)
        else:  # dedup (single arg)
            x = (x, )

        if self.verify_integrity:

            for df in x:
                self._verify_integrety(df)

        # start timing
        start_time = time.time()

        # linking
        if not self._deduplication(x):
            logging.info("Indexing - start indexing two DataFrames")

            pairs = self._link_index(*x)
            names = self._make_index_names(x[0].index.name, x[1].index.name)

        # deduplication
        else:
            logging.info("Indexing - start indexing single DataFrame")

            pairs = self._dedup_index(*x)
            names = self._make_index_names(x[0].index.name, x[0].index.name)

        pairs.rename(names, inplace=True)

        # store the number of pairs; both values are computed before either
        # list is touched so the two lists stay the same length
        n = pairs.shape[0]
        n_max = max_pairs(x)
        self._n.append(n)
        self._n_max.append(n_max)

        # summary: the reduction ratio is undefined when no pair is possible
        # at all, so report nan instead of failing on a division by zero
        undefined = float('nan')
        rr = 1 - n / n_max if n_max else undefined

        n_max_total = np.sum(self._n_max)
        if n_max_total:
            rr_avg = 1 - np.sum(self._n) / n_max_total
        else:
            rr_avg = undefined

        # log timing
        logf_time = "Indexing - computation time: ~{:.2f}s"
        logging.info(logf_time.format(time.time() - start_time))

        # log results
        logf_result = "Indexing - summary n={:d}, " \
            "reduction_ratio={:0.5f}, reduction_ratio_mean={:0.5f}"
        logging.info(logf_result.format(n, rr, rr_avg))

        return pairs
示例#29
0
    def _compute(self, pairs, x, x_link=None):
        """Compute all features for `pairs` and log timing statistics.

        Parameters
        ----------
        pairs : pandas.MultiIndex
            The record pairs to compare (presumably a MultiIndex; only
            ``pairs.shape[0]`` is read directly here).
        x : pandas.DataFrame
            The frame the left-hand records are taken from. It also supplies
            the right-hand records when `x_link` is None.
        x_link : pandas.DataFrame, optional
            The frame the right-hand records are taken from.

        Returns
        -------
        The value of ``self._union`` applied to the list of
        ``(result, label)`` tuples, one per feature, and `pairs`.

        Notes
        -----
        Appends the elapsed time to ``self._eta`` and the number of pairs to
        ``self._n``, and increments ``self._i``.
        """

        # start the timer for the comparing step
        t_start = time.time()

        left_labels = self._get_labels_left(validate=x)
        left_frame = frame_indexing(x[left_labels], pairs, 0)

        # deduplication: the right-hand side is read from x as well
        right_source = x if x_link is None else x_link
        right_labels = self._get_labels_right(validate=right_source)
        right_frame = frame_indexing(right_source[right_labels], pairs, 1)

        def _select(frame, labels):
            # None: no data passed to func
            if labels is None:
                return tuple()
            # empty array: empty df with index passed to func
            if labels == []:
                return (frame[[]], )
            # else: subset columns and pass tuple of series
            return tuple([frame[lbl] for lbl in listify(labels)])

        computed = []
        for feat in self.features:
            left_data = _select(left_frame, feat.labels_left)
            right_data = _select(right_frame, feat.labels_right)
            computed.append(
                (feat._compute(left_data, right_data), feat.label))

        features = self._union(computed, pairs)

        # bookkeeping for the progress log
        n_pairs = pairs.shape[0]
        i_max = self._i_max if self._i_max is not None else '?'
        elapsed = time.time() - t_start
        self._eta.append(elapsed)
        self._n.append(n_pairs)

        # log this run
        logging.info("comparing [{:d}/{}] - time: {:.2f}s - pairs: {}".format(
            self._i, i_max, elapsed, n_pairs))

        # log the totals over all runs so far
        if self._output_log_total:
            logging.info(
                "comparing [{:d}/{}] - time: {:.2f}s - pairs_total: {}".format(
                    self._i, i_max, np.sum(self._eta), np.sum(self._n)))

        self._i += 1

        return features
示例#30
0
    def index(self, x, x_link=None):
        """Make an index of record pairs.

        Every algorithm in ``self.algorithms`` is asked for its record pairs
        and the union of all results is returned.

        Parameters
        ----------
        x: pandas.DataFrame
            A pandas DataFrame. When `x_link` is None, the algorithm makes
            record pairs within the DataFrame. When `x_link` is not empty,
            the algorithm makes pairs between `x` and `x_link`.
        x_link: pandas.DataFrame, optional
            A second DataFrame to link with the DataFrame x.

        Returns
        -------
        pandas.MultiIndex
            A pandas.MultiIndex with record pairs. Each record pair contains
            the index labels of two records.

        Raises
        ------
        ValueError
            If ``self.algorithms`` is empty.

        Notes
        -----
        The elapsed time, the number of pairs and the value of ``max_pairs``
        are appended to ``self._eta``, ``self._n`` and ``self._n_max``, and
        ``self._i`` is incremented. The logged reduction ratios are reported
        as ``nan`` when ``max_pairs`` is zero.

        """
        if not self.algorithms:
            raise ValueError("No algorithms given.")

        # start timing
        start_time = time.time()

        pairs = None
        for cl_alg in self.algorithms:
            pairs_i = cl_alg.index(x, x_link)

            # the first result seeds the union
            pairs = pairs_i if pairs is None else pairs.union(pairs_i)

        n_max = max_pairs(x if x_link is None else (x, x_link))

        # The reduction ratio is undefined when no pair is possible at all,
        # so report nan instead of failing on a division by zero.
        undefined = float('nan')

        # store the number of pairs
        n = pairs.shape[0]
        eta = time.time() - start_time
        rr = 1 - n / n_max if n_max else undefined
        i_max = '?' if self._i_max is None else self._i_max

        self._eta.append(eta)
        self._n.append(n)
        self._n_max.append(n_max)

        # log
        logging.info("indexing [{:d}/{}] - time: {:.2f}s - pairs: {:d}/{:d} - "
                     "rr: {:0.5f}".format(self._i, i_max, eta, n, n_max, rr))

        # log total
        if self._output_log_total:

            n_total = np.sum(self._n)
            n_max_total = np.sum(self._n_max)
            if n_max_total:
                rr_avg = 1 - n_total / n_max_total
            else:
                rr_avg = undefined
            eta_total = np.sum(self._eta)

            logging.info("indexing [{:d}/{}] - time: {:.2f}s - "
                         "pairs_total: {:d}/{:d} - rr_total: {:0.5f}".format(
                             self._i, i_max, eta_total, n_total, n_max_total,
                             rr_avg))

        self._i += 1

        return pairs