Пример #1
0
    def compare(self, comp_func, labels_a, labels_b, *args, **kwargs):
        """Compare two records.

        Core method to compare record pairs. This method takes a function and
        data from both records in the record pair. The data is compared with
        the compare function. The built-in methods also use this function.

        Example
        -------

        >>> comp = recordlinkage.Compare(PAIRS, DATAFRAME1, DATAFRAME2)
        >>> comp.exact('first_name', 'name')

        >>> # same as
        >>> comp.compare(recordlinkage._compare_exact, 'first_name', 'name')

        Parameters
        ----------
        comp_func : function
            A comparison function. This function can be a built-in function or
            a user defined comparison function. It is called with the data of
            ``labels_a``, then the data of ``labels_b``, then ``*args``, and
            the remaining ``**kwargs``.
        labels_a : label, pandas.Series, pandas.DataFrame
            The labels, Series or DataFrame to compare.
        labels_b : label, pandas.Series, pandas.DataFrame
            The labels, Series or DataFrame to compare.
        *args
            Extra positional arguments forwarded to ``comp_func``.
        name : label
            The name of the feature and the name of the column.
        store : bool, default True
            Store the result in the dataframe.
        **kwargs
            Extra keyword arguments forwarded to ``comp_func`` (``name`` and
            ``store`` are consumed here and not forwarded).

        Returns
        -------
        pandas.Series
            A pandas series with the result of comparing each record pair.

        Raises
        ------
        ValueError
            If there are no record pairs.
        KeyError
            If a requested label is not a column of the matching dataframe.

        """

        if len(self.pairs) == 0:
            raise ValueError(
                "need at least one record pair"
            )

        # the name and store arguments are for this method, not for comp_func
        name = kwargs.pop('name', None)
        store = kwargs.pop('store', True)

        def _collect(labels, frame, level, cache_attr, which):
            """Turn labels/array-likes of one side into a tuple of data."""
            collected = []

            for label in listify(labels):

                # the label is a numpy or pandas object: use it as is
                if is_numpy_like(label) or is_pandas_like(label):
                    collected.append(label)

                # check requested labels (for better error messages)
                elif label not in frame.columns:
                    raise KeyError(
                        "label '%s' is not found in the %s "
                        "dataframe" % (label, which))

                # low memory: only the requested column goes through _loc2
                elif self.low_memory:
                    collected.append(
                        self._loc2(frame[label], self.pairs, level))

                # not low memory: the whole frame goes through _loc2 once
                # and the result is cached on the instance for later calls
                else:
                    indexed = getattr(self, cache_attr)
                    if indexed is None:
                        indexed = self._loc2(frame, self.pairs, level)
                        setattr(self, cache_attr, indexed)
                    collected.append(indexed[label])

            return tuple(collected)

        data_a = _collect(labels_a, self.df_a, 0, '_df_a_indexed', 'first')
        data_b = _collect(labels_b, self.df_b, 1, '_df_b_indexed', 'second')
        data = data_a + data_b

        if self.njobs > 1:

            n_pairs = len(self.pairs)

            # rows per job, rounded up so that every record pair is covered
            chunk_size = int(np.ceil(n_pairs / float(self.njobs)))

            def _rows(obj, start, stop):
                """Positional row slice of a pandas or numpy-like object."""
                if isinstance(obj, (pandas.Series, pandas.DataFrame)):
                    return obj.iloc[start:stop]
                return obj[start:stop]

            # NOTE(review): assumes `mp` is the multiprocessing module (the
            # previous code used mp.Process) and that every data object has
            # one row per record pair, in the order of self.pairs - confirm.
            # A pool is used because a bare Process cannot hand back the
            # value returned by comp_func.
            pool = mp.Pool(processes=self.njobs)
            try:
                pending = [
                    pool.apply_async(
                        comp_func,
                        tuple(_rows(d, start, start + chunk_size)
                              for d in data) + args,
                        kwargs)
                    for start in range(0, n_pairs, chunk_size)
                ]
                parts = [job.get() for job in pending]
            finally:
                pool.close()
                pool.join()

            # merge parts, in the order of the chunks
            if all(isinstance(part, (pandas.Series, pandas.DataFrame))
                   for part in parts):
                c = pandas.concat(parts, axis=0)
            else:
                c = np.concatenate(
                    [np.asarray(part) for part in parts], axis=0)

        else:

            # Compute the comparison
            c = comp_func(*(data + args), **kwargs)

        # if a pandas series is returned, overwrite the index. The
        # returned index can be different than the MultiIndex passed to
        # the compare function.
        if isinstance(c, pandas.Series):
            c.index = self.vectors.index

        # append column to Compare.vectors
        if store:
            name_or_id = name if name else len(self.vectors.columns)
            self.vectors[name_or_id] = c

            return self.vectors[name_or_id].rename(name)

        # not stored: there is no column to read back, so return the result
        # itself as a named series
        if isinstance(c, pandas.Series):
            return c.rename(name)

        return pandas.Series(c, index=self.vectors.index, name=name)
Пример #2
0
    def _union(self, objs, index=None, column_i=0):
        """Make a union of the features.

        The term 'union' is based on the terminology of scikit-learn.

        Parameters
        ----------
        objs : iterable of (feature, label) pairs
            Each feature is a pandas.Series, pandas.DataFrame, 1d or 2d
            numpy-like array, or a tuple of such features (handled
            recursively). A label of None means the columns of that feature
            get consecutive integer labels starting at ``column_i``.
        index : optional
            When not None, it is passed to ``DataFrame.set_index`` on the
            concatenated result. It is not forwarded to recursive calls.
        column_i : int, default 0
            The integer label given to the first unlabelled column.

        Returns
        -------
        pandas.DataFrame
            The features concatenated column-wise.

        Raises
        ------
        ValueError
            If a feature is of an unsupported type.

        Notes
        -----
        Series and DataFrame features are modified in place: their index is
        reset and their name/columns are overwritten.

        """

        feat_conc = []

        for feat, label in objs:

            # result is tuple of results
            if isinstance(feat, tuple):
                if label is None:
                    label = [None] * len(feat)

                partial_result = self._union(zip(feat, label),
                                             column_i=column_i)
                feat_conc.append(partial_result)
                column_i = column_i + partial_result.shape[1]

            # result is pandas.Series.
            elif isinstance(feat, pandas.Series):
                # drop the index so all features align on position
                feat.reset_index(drop=True, inplace=True)
                if label is None:
                    label = column_i
                feat.rename(label, inplace=True)
                feat_conc.append(feat)
                column_i = column_i + 1

            # result is pandas.DataFrame
            elif isinstance(feat, pandas.DataFrame):
                # drop the index so all features align on position
                feat.reset_index(drop=True, inplace=True)
                if label is None:
                    label = np.arange(column_i, column_i + feat.shape[1])
                feat.columns = label
                feat_conc.append(feat)
                column_i = column_i + feat.shape[1]

            # result is numpy 1d array
            elif is_numpy_like(feat) and len(feat.shape) == 1:
                if label is None:
                    label = column_i
                f = pandas.Series(feat, name=label, copy=False)

                feat_conc.append(f)
                column_i = column_i + 1

            # result is numpy 2d array
            elif is_numpy_like(feat) and len(feat.shape) == 2:
                if label is None:
                    label = np.arange(column_i, column_i + feat.shape[1])
                # label is never None here, so the columns are always set
                # by the constructor
                feat_df = pandas.DataFrame(feat, columns=label, copy=False)
                feat_conc.append(feat_df)
                column_i = column_i + feat.shape[1]

            # other results are not (yet) supported
            else:
                raise ValueError("expected numpy.ndarray or "
                                 "pandas object to be returned, "
                                 "got '{}'".format(feat.__class__.__name__))

        result = pandas.concat(feat_conc, axis=1, copy=False)
        if index is not None:
            result.set_index(index, inplace=True)

        return result
Пример #3
0
    def compare(self, comp_func, labels_a, labels_b, *args, **kwargs):
        """[DEPRECATED] Compare two records.

        Core method to compare record pairs. This method takes a function and
        data from both records in the record pair. The data is compared with
        the compare function. The built-in methods also use this function.

        Example
        -------

        >>> comp = recordlinkage.Compare(PAIRS, DATAFRAME1, DATAFRAME2)
        >>> comp.exact('first_name', 'name')

        >>> # same as
        >>> comp.compare(recordlinkage._compare_exact, 'first_name', 'name')

        Parameters
        ----------
        comp_func : function
            A comparison function. This function can be a built-in function or
            a user defined comparison function. It is called with the data of
            ``labels_a``, then the data of ``labels_b``, then ``*args``, and
            the remaining ``**kwargs``.
        labels_a : label, pandas.Series, pandas.DataFrame
            The labels, Series or DataFrame to compare.
        labels_b : label, pandas.Series, pandas.DataFrame
            The labels, Series or DataFrame to compare.
        *args
            Extra positional arguments forwarded to ``comp_func``.
        name : label
            The name of the feature and the name of the column.
        store : bool, default True
            Store the result in the dataframe.
        **kwargs
            Extra keyword arguments forwarded to ``comp_func`` (``name`` and
            ``store`` are consumed here and not forwarded).

        Returns
        -------
        pandas.Series
            A pandas series with the result of comparing each record pair.

        Raises
        ------
        ValueError
            If a pandas.MultiIndex is passed as ``comp_func`` (new API usage)
            or if there are no record pairs.
        KeyError
            If a requested label is not a column of the matching dataframe.

        """

        # a MultiIndex as first argument means the caller follows the new api
        if isinstance(comp_func, pandas.MultiIndex):
            raise ValueError("see new api documentation: "
                             "use method 'compute' instead of 'compare'")

        if len(self.pairs) == 0:
            raise ValueError("need at least one record pair")

        # the name and store arguments are for this method, not for comp_func
        name = kwargs.pop('name', None)
        store = kwargs.pop('store', True)

        def _collect(labels, frame, level, cache_attr, which):
            """Turn labels/array-likes of one side into a tuple of data."""
            collected = []

            for label in listify(labels):

                # the label is a numpy or pandas object: use it as is
                if is_numpy_like(label) or is_pandas_like(label):
                    collected.append(label)

                # check requested labels (for better error messages)
                elif label not in frame.columns:
                    raise KeyError("label '{}' is not found in the {} "
                                   "dataframe".format(label, which))

                # low memory: only the requested column goes through _loc2
                elif self.low_memory:
                    collected.append(
                        self._loc2(frame[label], self.pairs, level))

                # not low memory: the whole frame goes through _loc2 once
                # and the result is cached on the instance for later calls
                else:
                    indexed = getattr(self, cache_attr)
                    if indexed is None:
                        indexed = self._loc2(frame, self.pairs, level)
                        setattr(self, cache_attr, indexed)
                    collected.append(indexed[label])

            return tuple(collected)

        data_a = _collect(labels_a, self.df_a, 0, '_df_a_indexed', 'first')
        data_b = _collect(labels_b, self.df_b, 1, '_df_b_indexed', 'second')

        # Compute the comparison
        c = comp_func(*(data_a + data_b + args), **kwargs)

        # if a pandas series is returned, overwrite the index. The
        # returned index can be different than the MultiIndex passed to
        # the compare function.
        if isinstance(c, pandas.Series):
            c.index = self.vectors.index

        # append column to Compare.vectors
        if store:
            name_or_id = name if name else len(self.vectors.columns)
            self.vectors[name_or_id] = c

            return self.vectors[name_or_id].rename(name)

        # not stored: there is no column to read back, so return the result
        # itself as a named series
        if isinstance(c, pandas.Series):
            return c.rename(name)

        return pandas.Series(c, index=self.vectors.index, name=name)