Example #1
    def _link_index(self, df_a, df_b):
        # Index name conflicts do not occur. They are handled in the
        # decorator.

        left_on = listify(self.left_on)
        right_on = listify(self.right_on)

        if self.on:
            left_on, right_on = listify(self.on), listify(self.on)

        if not left_on or not right_on:
            raise ValueError("no column labels given")

        if len(left_on) != len(right_on):
            raise ValueError(
                "length of left and right keys needs to be the same")

        blocking_keys = ["blocking_key_%d" % i for i, v in enumerate(left_on)]

        # make a dataset for the data on the left
        data_left = df_a[left_on].dropna(axis=0, how='any', inplace=False)
        data_left.columns = blocking_keys
        data_left['index_x'] = data_left.index

        # make a dataset for the data on the right
        data_right = df_b[right_on].dropna(axis=0, how='any', inplace=False)
        data_right.columns = blocking_keys
        data_right['index_y'] = data_right.index

        # merge the dataframes
        pairs = data_left.merge(data_right, how='inner',
                                on=blocking_keys).set_index(
                                    ['index_x', 'index_y'])

        return pairs.index.rename([df_a.index.name, df_b.index.name])
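
Every snippet below first normalizes its column arguments with `listify`. As a point of reference, here is a minimal sketch of such a helper with the behaviour these examples rely on (None becomes an empty list, a single label becomes a one-element list); the actual recordlinkage implementation may differ in details:

def listify(x):
    """Minimal sketch: normalize a label argument to a list."""
    # None -> [] (so `if not left_on` detects missing labels),
    # list/tuple -> list, single label -> one-element list
    if x is None:
        return []
    if isinstance(x, (list, tuple)):
        return list(x)
    return [x]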
Example #2
    def _link_index(self, df_a, df_b):

        left_on, right_on = self._get_left_and_right_on()
        left_on = listify(left_on)
        right_on = listify(right_on)

        blocking_keys = ["blocking_key_%d" % i for i, v in enumerate(left_on)]

        # make a dataset for the data on the left
        # 1. make a dataframe
        # 2. rename columns
        # 3. add index col
        # 4. drop na (last step to preserve the index)
        data_left = pandas.DataFrame(df_a[left_on], copy=False)
        data_left.columns = blocking_keys
        data_left['index_x'] = numpy.arange(len(df_a))
        data_left.dropna(axis=0, how='any', subset=blocking_keys, inplace=True)

        # make a dataset for the data on the right
        data_right = pandas.DataFrame(df_b[right_on], copy=False)
        data_right.columns = blocking_keys
        data_right['index_y'] = numpy.arange(len(df_b))
        data_right.dropna(axis=0,
                          how='any',
                          subset=blocking_keys,
                          inplace=True)

        # merge the dataframes
        pairs_df = data_left.merge(data_right, how='inner', on=blocking_keys)

        return pandas.MultiIndex(
            levels=[df_a.index.values, df_b.index.values],
            labels=[pairs_df['index_x'].values, pairs_df['index_y'].values],
            verify_integrity=False)
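
This variant keeps row positions in `index_x`/`index_y` and turns them into the codes of a two-level MultiIndex. A tiny standalone illustration of that constructor (recent pandas names the second argument `codes`; the snippet above uses the older `labels` keyword):

import pandas

levels = [['a1', 'a2'], ['b1', 'b2', 'b3']]   # index values of df_a and df_b
codes = [[0, 1], [2, 0]]                      # matching row positions per level
mi = pandas.MultiIndex(levels=levels, codes=codes, verify_integrity=False)
print(list(mi))  # [('a1', 'b3'), ('a2', 'b1')]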
Example #3
    def _compute(self, pairs, x, x_link=None):

        # start the timer for the comparing step
        start_time = time.time()

        sublabels_left = self._get_labels_left(validate=x)
        df_a_indexed = frame_indexing(x[sublabels_left], pairs, 0)

        if x_link is None:
            sublabels_right = self._get_labels_right(validate=x)
            df_b_indexed = frame_indexing(x[sublabels_right], pairs, 1)
        else:
            sublabels_right = self._get_labels_right(validate=x_link)
            df_b_indexed = frame_indexing(x_link[sublabels_right], pairs, 1)

        # log timing
        # index_time = time.time() - start_time

        features = []

        for feat in self.features:

            lbl1 = feat.labels_left
            lbl2 = feat.labels_right

            data1 = tuple([df_a_indexed[lbl] for lbl in listify(lbl1)])
            data2 = tuple([df_b_indexed[lbl] for lbl in listify(lbl2)])

            result = feat._compute(data1, data2)
            features.append((result, feat.label))

        features = self._union(features, pairs)

        # log timing
        n = pairs.shape[0]
        i_max = '?' if self._i_max is None else self._i_max
        eta = time.time() - start_time
        self._eta.append(eta)
        self._n.append(n)

        # log
        logging.info("comparing [{:d}/{}] - time: {:.2f}s - pairs: {}".format(
            self._i, i_max, eta, n))

        # log total
        if self._output_log_total:

            n_total = np.sum(self._n)
            eta_total = np.sum(self._eta)

            logging.info(
                "comparing [{:d}/{}] - time: {:.2f}s - pairs_total: {}".format(
                    self._i, i_max, eta_total, n_total))

        self._i += 1

        return features
Example #4
    def compute(self, pairs, x, x_link=None):
        """Compare the records of each record pair.

        Calling this method starts the comparing of records.

        Parameters
        ----------
        pairs : pandas.MultiIndex
            A pandas MultiIndex with the record pairs to compare. The indices
            in the MultiIndex are indices of the DataFrame(s) to link.
        x : pandas.DataFrame
            The DataFrame to link. If `x_link` is given, the comparing is a
            linking problem. If `x_link` is not given, the problem is one of
            deduplication.
        x_link : pandas.DataFrame, optional
            The second DataFrame.

        Returns
        -------
        pandas.DataFrame
            A pandas DataFrame with feature vectors, i.e. the result of
            comparing each record pair.
        """

        if not is_pandas_2d_multiindex(pairs):
            raise ValueError(
                "expected pandas.MultiIndex with record pair indices "
                "as first argument")

        if not isinstance(x, pandas.DataFrame):
            raise ValueError("expected pandas.DataFrame as second argument")

        if x_link is not None and not isinstance(x_link, pandas.DataFrame):
            raise ValueError("expected pandas.DataFrame as third argument")

        labels_left = listify(self.labels_left)
        labels_right = listify(self.labels_right)

        df_a = frame_indexing(x[labels_left], pairs, 0)

        if x_link is None:
            df_b = frame_indexing(x[labels_right], pairs, 1)
        else:
            df_b = frame_indexing(x_link[labels_right], pairs, 1)

        data1 = tuple([df_a[lbl] for lbl in listify(self.labels_left)])
        data2 = tuple([df_b[lbl] for lbl in listify(self.labels_right)])

        results = self._compute(*tuple(data1 + data2))

        return results
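
A minimal sketch of how a `compute` call like the one above is typically driven, assuming the public recordlinkage API referenced in these docstrings (exact class and method names can differ between library versions; the column names are illustrative):

import pandas
import recordlinkage

df_a = pandas.DataFrame({'first_name': ['jonathan', 'lisa'],
                         'surname': ['smith', 'smith']})
df_b = pandas.DataFrame({'name': ['john', 'lisa'],
                         'surname': ['smith', 'smith']})

# block on 'surname' to obtain the candidate record pairs as a MultiIndex
indexer = recordlinkage.Index()
indexer.block('surname')
pairs = indexer.index(df_a, df_b)

# compare the candidate pairs; the result is a DataFrame of feature vectors
comp = recordlinkage.Compare()
comp.exact('first_name', 'name', label='name_match')
features = comp.compute(pairs, df_a, df_b)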
Example #5
    def __init__(self,
                 left_on=None,
                 right_on=None,
                 max_nulls=0,
                 max_non_matches=0,
                 windows=1,
                 **kwargs):
        super(NeighbourhoodBlock, self).__init__(
            left_on=left_on, right_on=right_on, **kwargs)
        self.max_nulls = max_nulls
        self.max_non_matches = max_non_matches
        self.windows = listify(windows)
Example #6
    def _link_index(self, df_a, df_b):

        if self.on is not None:
            if self.left_on is not None or self.right_on is not None:
                raise IndexError('Can only pass argument "on" OR "left_on" '
                                 'and "right_on", not a combination of both.')
            left_on = right_on = listify(self.on)
        else:
            if self.left_on is None and self.right_on is None:
                raise IndexError('pass argument "on" OR "left_on" and '
                                 '"right_on" at class initialization.')
            elif self.left_on is None:
                raise IndexError('Argument "left_on" is missing '
                                 'at class initialization.')
            elif self.right_on is None:
                raise IndexError('Argument "right_on" is missing '
                                 'at class initialization.')
            else:
                left_on = listify(self.left_on)
                right_on = listify(self.right_on)

        blocking_keys = ["blocking_key_%d" % i for i, v in enumerate(left_on)]

        # make a dataset for the data on the left
        data_left = df_a[left_on].dropna(axis=0, how='any', inplace=False)
        data_left.columns = blocking_keys
        data_left['index_x'] = data_left.index

        # make a dataset for the data on the right
        data_right = df_b[right_on].dropna(axis=0, how='any', inplace=False)
        data_right.columns = blocking_keys
        data_right['index_y'] = data_right.index

        # merge the dataframes
        pairs = data_left.merge(
            data_right, how='inner', on=blocking_keys
        ).set_index(['index_x', 'index_y'])

        return pairs.index
Example #7
    def _get_labels_right(self, validate=None):
        """Get all labels of the right dataframe."""
        labels = []

        for compare_func in self.features:

            labels = labels + listify(compare_func.labels_right)

        # check requested labels (for better error messages)
        if not is_label_dataframe(labels, validate):
            error_msg = "label is not found in the dataframe"
            raise KeyError(error_msg)

        return unique(labels)
Example #8
        def get_normalized_linkage_params():
            def default_on_possibilities():
                yield self.left_on
                yield self.right_on
                yield [
                    c for c in dfs[0].columns
                    if all(c in df.columns for df in dfs)
                ]

            default_on = next(
                iter(
                    filter(lambda x: x is not None,
                           default_on_possibilities())))
            key_columns = [
                listify(side_on or default_on)
                for side_on in [self.left_on, self.right_on]
            ]
            key_cols = set(map(len, key_columns))
            n_key_cols = next(iter(key_cols))
            if (len(key_cols) > 1) or (n_key_cols == 0):
                raise IndexError('Invalid blocking keys')
            combined_ranks = numpy.vstack([
                pandas.concat([df[col] for df, col in zip(dfs, col_grp)]).rank(
                    method='dense',
                    na_option='keep').fillna(0).astype(int).values - 1
                for col_grp in zip(*key_columns)
            ]).astype(float).T
            combined_ranks[combined_ranks < 0] = numpy.nan
            blocks, indices = deduped_blocks_and_indices(
                blocks=combined_ranks,
                indices=split_to_match(numpy.arange(len(combined_ranks)), dfs))
            n_keys = blocks.shape[1]
            windows = self.windows + self.windows[-1:] * (
                n_keys - len(self.windows))
            if (len(windows) > n_keys) or not all(
                    isinstance(w, int) and (w > 0) and (w % 2 == 1)
                    for w in windows):
                raise ValueError(
                    'Windows must be positive odd integers and the maximum '
                    'number allowed is the number of blocking keys'
                )
            rank_distance_limits = (
                numpy.array(windows) // 2).astype(float).reshape((1, -1))
            return blocks, indices, rank_distance_limits
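
The window handling above turns each odd window size into a rank-distance limit of `window // 2` per blocking key; a small standalone illustration of that arithmetic:

import numpy

windows = [1, 3, 5]  # one odd window per blocking key
rank_distance_limits = (numpy.array(windows) // 2).astype(float).reshape((1, -1))
print(rank_distance_limits)  # [[0. 1. 2.]]
# window 1 pairs only equal ranks, 3 allows neighbouring ranks,
# 5 allows ranks up to two positions apart on that key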
Example #9
    def _get_labels(self, frame_i, validate=None):
        """Get all labels.

        Parameters
        ----------
        frame_i : str
            A string, 'left' or 'right', indicating the dataframe to collect
            labels from.

        """

        labels = []

        for compare_func in self._compare_functions:

            labels = labels + listify(compare_func[frame_i])

        # check requested labels (for better error messages)
        if not is_label_dataframe(labels, validate):
            error_msg = "label is not found in the dataframe"
            raise KeyError(error_msg)

        return unique(labels)
Example #10
    def compute(self, pairs, x, x_link=None):
        """Compare the records of each record pair.

        Calling this method starts the comparing of records.

        Parameters
        ----------
        pairs : pandas.MultiIndex
            A pandas MultiIndex with the record pairs to compare. The indices
            in the MultiIndex are indices of the DataFrame(s) to link.
        x : pandas.DataFrame
            The DataFrame to link. If `x_link` is given, the comparing is a
            linking problem. If `x_link` is not given, the problem is one of
            deduplication.
        x_link : pandas.DataFrame, optional
            The second DataFrame.

        Returns
        -------
        pandas.DataFrame
            A pandas DataFrame with feature vectors, i.e. the result of 
            comparing each record pair.
        """

        sublabels_left = self._get_labels_left(validate=x)
        df_a_indexed = self._loc2(x[sublabels_left], pairs, 0)

        if x_link is None:
            sublabels_right = self._get_labels_right(validate=x)
            df_b_indexed = self._loc2(x[sublabels_right], pairs, 1)
        else:
            sublabels_right = self._get_labels_right(validate=x_link)
            df_b_indexed = self._loc2(x_link[sublabels_right], pairs, 1)

        results = pandas.DataFrame(index=pairs)
        label_num = 0  # used to make a label if label is None

        for f, lbl1, lbl2, label, args, kwargs in self._compare_functions:

            data1 = tuple([df_a_indexed[lbl] for lbl in listify(lbl1)])
            data2 = tuple([df_b_indexed[lbl] for lbl in listify(lbl2)])

            c = f(*tuple(data1 + data2 + args), **kwargs)

            if isinstance(c, (pandas.Series, pandas.DataFrame)):
                c = c.values  # convert pandas into numpy

            if label is not None:
                label = listify(label)

            n_cols = 1 if len(c.shape) == 1 else c.shape[1]

            labels = []
            for i in range(0, n_cols):

                label_val = label[i] if label is not None else label_num
                label_num += 1

                labels.append(label_val)

            results[label_val] = c

        return results
Example #11
    def _link_index(self, df_a, df_b):

        if self.on is not None:
            if self.left_on is not None or self.right_on is not None:
                raise IndexError('Can only pass argument "on" OR "left_on" '
                                 'and "right_on", not a combination of both.')
            left_on = right_on = listify(self.on)
        else:
            if self.left_on is None and self.right_on is None:
                raise IndexError('pass argument "on" OR "left_on" and '
                                 '"right_on" at class initialization.')
            elif self.left_on is None:
                raise IndexError('Argument "left_on" is missing '
                                 'at class initialization.')
            elif self.right_on is None:
                raise IndexError('Argument "right_on" is missing '
                                 'at class initialization.')
            else:
                left_on = listify(self.left_on)
                right_on = listify(self.right_on)

        window = self.window

        # Check if window is an odd number
        if not isinstance(window, int) or (window < 0) or not bool(window % 2):
            raise ValueError(
                'window is not a positive and odd integer')

        # # sorting key is single column
        # if isinstance(self.on, (tuple, list, dict)):
        #     raise ValueError(
        #         "sorting key is not a label")

        # make blocking keys correct

        block_left_on = listify(self.block_left_on)
        block_right_on = listify(self.block_right_on)

        if self.block_on:
            block_left_on = listify(self.block_on)
            block_right_on = listify(self.block_on)

        # drop missing values and columns without relevant information
        data_left = df_a[listify(left_on) + block_left_on].dropna(
            axis=0, how='any', inplace=False
        )
        data_left.columns = ['sorting_key'] + \
            ["blocking_key_%d" % i for i, v in enumerate(block_left_on)]
        data_left['index_x'] = data_left.index

        data_right = df_b[listify(right_on) + block_right_on].dropna(
            axis=0, how='any', inplace=False
        )
        data_right.columns = ['sorting_key'] + \
            ["blocking_key_%d" % i for i, v in enumerate(block_right_on)]
        data_right['index_y'] = data_right.index

        # sorting_key_values is the terminology in Data Matching [Christen,
        # 2012]
        if self.sorting_key_values is None:

            self.sorting_key_values = self._get_sorting_key_values(
                data_left['sorting_key'].values,
                data_right['sorting_key'].values
            )

        sorting_key_factors = pandas.Series(
            numpy.arange(len(self.sorting_key_values)),
            index=self.sorting_key_values)

        data_left['sorting_key'] = data_left[
            'sorting_key'].map(sorting_key_factors)
        data_right['sorting_key'] = data_right[
            'sorting_key'].map(sorting_key_factors)

        # Internal window size
        _window = int((window - 1) / 2)

        def merge_lagged(x, y, w):
            """Merge two dataframes with a lag on in the sorting key."""

            y = y.copy()
            y['sorting_key'] = y['sorting_key'] + w

            return x.merge(y, how='inner')

        pairs_concat = [merge_lagged(data_left, data_right, w)
                        for w in range(-_window, _window + 1)]

        pairs = pandas.concat(pairs_concat, axis=0).set_index(
            ['index_x', 'index_y']
        ).index

        return pairs
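
The sorted neighbourhood trick in `merge_lagged` is easier to see on a toy example: with `window=3` the internal half-window is 1, so a left record pairs with every right record whose sorting-key rank differs by at most one. A standalone sketch of that idea (data and column values are illustrative):

import pandas

data_left = pandas.DataFrame({'sorting_key': [0, 1, 2], 'index_x': [10, 11, 12]})
data_right = pandas.DataFrame({'sorting_key': [0, 2], 'index_y': [20, 21]})

def merge_lagged(x, y, w):
    """Merge two dataframes with a lag on the sorting key."""
    y = y.copy()
    y['sorting_key'] = y['sorting_key'] + w
    return x.merge(y, how='inner')

window = 3
_window = (window - 1) // 2  # half-window of 1

pairs = pandas.concat(
    [merge_lagged(data_left, data_right, w) for w in range(-_window, _window + 1)],
    axis=0,
).set_index(['index_x', 'index_y']).index
# pairs contains (10, 20), (11, 20), (11, 21) and (12, 21): every left/right
# combination whose sorting-key ranks differ by at most one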
Example #12
File: index.py Project: kgpayne/book-tools
    def __init__(self, left_on, right_on=None, **kwargs):
        super().__init__(**kwargs)
        self.left_on = listify(left_on)
        self.right_on = listify(right_on) if right_on else self.left_on
        self.soundex = fuzzy.Soundex(4)
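
The initializer above builds its blocking key with a four-character Soundex encoder. A brief illustration of what that encoder produces, assuming the `fuzzy` package used here (the codes shown are the standard Soundex values):

import fuzzy

soundex = fuzzy.Soundex(4)
print(soundex('Robert'))  # 'R163'
print(soundex('Rupert'))  # 'R163' -- same code, so the two names end up in one block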
Example #13
    def _compute(self, pairs, x, x_link=None):

        # start the timer for the comparing step
        start_time = time.time()

        sublabels_left = self._get_labels_left(validate=x)
        df_a_indexed = frame_indexing(x[sublabels_left], pairs, 0)

        if x_link is None:
            sublabels_right = self._get_labels_right(validate=x)
            df_b_indexed = frame_indexing(x[sublabels_right], pairs, 1)
        else:
            sublabels_right = self._get_labels_right(validate=x_link)
            df_b_indexed = frame_indexing(x_link[sublabels_right], pairs, 1)

        # log timing
        # index_time = time.time() - start_time

        features = []

        for feat in self.features:

            # --- DATA1
            # None: no data passed to func
            if feat.labels_left is None:
                data1 = tuple()
            # empty array: empty df with index passed to func
            elif feat.labels_left == []:
                data1 = (df_a_indexed[[]], )
            # else: subset columns and pass tuple of series
            else:
                data1 = tuple(
                    [df_a_indexed[lbl] for lbl in listify(feat.labels_left)])

            # --- DATA2
            # None: no data passed to func
            if feat.labels_right is None:
                data2 = tuple()
            # empty array: empty df with index passed to func
            elif feat.labels_right == []:
                data2 = (df_b_indexed[[]], )
            # else: subset columns and pass tuple of series
            else:
                data2 = tuple(
                    [df_b_indexed[lbl] for lbl in listify(feat.labels_right)])

            result = feat._compute(data1, data2)
            features.append((result, feat.label))

        features = self._union(features, pairs)

        # log timing
        n = pairs.shape[0]
        i_max = '?' if self._i_max is None else self._i_max
        eta = time.time() - start_time
        self._eta.append(eta)
        self._n.append(n)

        # log
        logging.info("comparing [{:d}/{}] - time: {:.2f}s - pairs: {}".format(
            self._i, i_max, eta, n))

        # log total
        if self._output_log_total:

            n_total = np.sum(self._n)
            eta_total = np.sum(self._eta)

            logging.info(
                "comparing [{:d}/{}] - time: {:.2f}s - pairs_total: {}".format(
                    self._i, i_max, eta_total, n_total))

        self._i += 1

        return features
Example #14
    def compare(self, comp_func, labels_a, labels_b, *args, **kwargs):
        """Compare two records.

        Core method to compare record pairs. This method takes a function and
        data from both records in the record pair. The data is compared with
        the compare function. The built-in methods also use this function.

        Example
        -------

        >>> comp = recordlinkage.Compare(PAIRS, DATAFRAME1, DATAFRAME2)
        >>> comp.exact('first_name', 'name')

        >>> # same as
        >>> comp.compare(recordlinkage._compare_exact, 'first_name', 'name')

        Parameters
        ----------
        comp_func : function
            A comparison function. This function can be a built-in function or
            a user defined comparison function.
        labels_a : label, pandas.Series, pandas.DataFrame
            The labels, Series or DataFrame to compare.
        labels_b : label, pandas.Series, pandas.DataFrame
            The labels, Series or DataFrame to compare.
        name : label
            The name of the feature and the name of the column.
        store : bool, default True
            Store the result in the dataframe.

        Returns
        -------
        pandas.Series
            A pandas series with the result of comparing each record pair.

        """

        if len(self.pairs) == 0:
            raise ValueError(
                "need at least one record pair"
            )

        # the name and store arguments
        name = kwargs.pop('name', None)
        store = kwargs.pop('store', True)

        labels_a = listify(labels_a)
        labels_b = listify(labels_b)

        data_a = []

        for label_a in labels_a:

            # the label is a numpy or pandas object
            if is_numpy_like(label_a) or is_pandas_like(label_a):
                data_a.append(label_a)

            # check requested labels (for better error messages)
            elif label_a not in self.df_a.columns:
                raise KeyError("label '%s' is not found in the first"
                               "dataframe" % label_a)

            else:

                if self.low_memory:

                    df_a_label = self._loc2(self.df_a[label_a], self.pairs, 0)
                    data_a.append(df_a_label)

                # not low memory
                else:
                    if self._df_a_indexed is None:

                        self._df_a_indexed = self._loc2(
                            self.df_a, self.pairs, 0)

                    data_a.append(self._df_a_indexed[label_a])

        data_a = tuple(data_a)

        data_b = []

        for label_b in labels_b:

            # the label is a numpy or pandas object
            if is_numpy_like(label_b) or is_pandas_like(label_b):
                data_b.append(label_b)

            # check requested labels (for better error messages)
            elif label_b not in self.df_b.columns:

                raise KeyError("label '%s' is not found in the second"
                               "dataframe" % label_b)

            else:

                if self.low_memory:

                    df_b_label = self._loc2(self.df_b[label_b], self.pairs, 1)
                    data_b.append(df_b_label)

                # not low memory
                else:
                    if self._df_b_indexed is None:

                        self._df_b_indexed = self._loc2(
                            self.df_b, self.pairs, 1)

                    data_b.append(self._df_b_indexed[label_b])

        data_b = tuple(data_b)

        if self.njobs > 1:

            # split the record pairs over the jobs (note: the number of pairs
            # is divided by njobs, not the other way around)
            chunk_size = int(np.ceil(len(self.pairs) / self.njobs))

            def _take(d, i):
                # slice one chunk out of a pandas or numpy argument
                s = slice(i * chunk_size, (i + 1) * chunk_size)
                return d.iloc[s] if hasattr(d, 'iloc') else d[s]

            # the data arguments, one tuple of positional arguments per job
            job_args = [
                tuple(_take(d, i) for d in data_a + data_b) + args
                for i in range(0, self.njobs)
            ]

            # mp.Process does not return the result of comp_func, so the
            # chunks are computed with a process pool and merged afterwards
            from functools import partial
            with mp.Pool(self.njobs) as pool:
                parts = pool.starmap(partial(comp_func, **kwargs), job_args)

            # merge parts
            c = pandas.concat(parts, axis=0, copy=False)

        else:

            # # The data arguments
            # args_a = tuple(df_a_indexed.loc[:, da] for da in labels_a)
            # args_b = tuple(df_b_indexed.loc[:, db] for db in labels_b)

            # Compute the comparison
            c = comp_func(*tuple(data_a + data_b + args), **kwargs)

        # if a pandas series is returned, overwrite the index. The
        # returned index can be different than the MultiIndex passed to
        # the compare function.
        if isinstance(c, pandas.Series):
            c.index = self.vectors.index

        # append column to Compare.vectors
        if store:
            name_or_id = name if name else len(self.vectors.columns)
            self.vectors[name_or_id] = c

        return self.vectors[name_or_id].rename(name)
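
The parallel branch above splits the candidate pairs into `njobs` nearly equal chunks; a small standalone check of that chunking arithmetic:

import numpy as np

n_pairs, njobs = 10, 3
chunk_size = int(np.ceil(n_pairs / njobs))  # 4
slices = [(i * chunk_size, (i + 1) * chunk_size) for i in range(njobs)]
print(slices)  # [(0, 4), (4, 8), (8, 12)] -- the last slice is simply truncated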
Example #15
    def _link_index(self, df_a, df_b):

        left_on, right_on = self._get_left_and_right_on()
        left_on = listify(left_on)
        right_on = listify(right_on)

        window = self.window

        # Check if window is an odd number
        if not isinstance(window, int) or (window < 0) or not bool(window % 2):
            raise ValueError('window is not a positive and odd integer')

        # # sorting key is single column
        # if isinstance(self.on, (tuple, list, dict)):
        #     raise ValueError(
        #         "sorting key is not a label")

        # make blocking keys correct

        block_left_on = listify(self.block_left_on)
        block_right_on = listify(self.block_right_on)

        if self.block_on:
            block_left_on = listify(self.block_on)
            block_right_on = listify(self.block_on)

        blocking_keys = ['sorting_key'] + \
            ["blocking_key_%d" % i for i, v in enumerate(block_left_on)]

        # make a dataset for the data on the left
        # 1. make a dataframe
        # 2. rename columns
        # 3. add index col
        # 4. drop na (last step to preserve the index)
        data_left = pandas.DataFrame(df_a[listify(left_on) + block_left_on],
                                     copy=False)
        data_left.columns = blocking_keys
        data_left['index_x'] = numpy.arange(len(df_a))
        data_left.dropna(axis=0, how='any', subset=blocking_keys, inplace=True)

        data_right = pandas.DataFrame(df_b[listify(right_on) + block_right_on],
                                      copy=False)
        data_right.columns = blocking_keys
        data_right['index_y'] = numpy.arange(len(df_b))
        data_right.dropna(axis=0,
                          how='any',
                          subset=blocking_keys,
                          inplace=True)

        # sorting_key_values is the terminology in Data Matching [Christen,
        # 2012]
        if self.sorting_key_values is None:

            self.sorting_key_values = self._get_sorting_key_values(
                data_left['sorting_key'].values,
                data_right['sorting_key'].values)

        sorting_key_factors = pandas.Series(numpy.arange(
            len(self.sorting_key_values)),
                                            index=self.sorting_key_values)

        data_left['sorting_key'] = data_left['sorting_key'].map(
            sorting_key_factors)
        data_right['sorting_key'] = data_right['sorting_key'].map(
            sorting_key_factors)

        # Internal window size
        _window = int((window - 1) / 2)

        def merge_lagged(x, y, w):
            """Merge two dataframes with a lag on in the sorting key."""

            y = y.copy()
            y['sorting_key'] = y['sorting_key'] + w

            return x.merge(y, how='inner')

        pairs_concat = [
            merge_lagged(data_left, data_right, w)
            for w in range(-_window, _window + 1)
        ]

        pairs_df = pandas.concat(pairs_concat, axis=0)

        return pandas.MultiIndex(
            levels=[df_a.index.values, df_b.index.values],
            labels=[pairs_df['index_x'].values, pairs_df['index_y'].values],
            verify_integrity=False)
Example #16
    def compute(self, pairs, x, x_link=None):
        """Compare the records of each record pair.

        Calling this method starts the comparing of records.

        Parameters
        ----------
        pairs : pandas.MultiIndex
            A pandas MultiIndex with the record pairs to compare. The indices
            in the MultiIndex are indices of the DataFrame(s) to link.
        x : pandas.DataFrame
            The DataFrame to link. If `x_link` is given, the comparing is a
            linking problem. If `x_link` is not given, the problem is one of
            deduplication.
        x_link : pandas.DataFrame, optional
            The second DataFrame.

        Returns
        -------
        pandas.DataFrame
            A pandas DataFrame with feature vectors, i.e. the result of
            comparing each record pair.
        """

        if not isinstance(pairs, pandas.MultiIndex):
            raise ValueError(
                "expected pandas.MultiIndex with record pair indices "
                "as first argument")

        if not isinstance(x, pandas.DataFrame):
            raise ValueError("expected pandas.DataFrame as second argument")

        if x_link is not None and not isinstance(x_link, pandas.DataFrame):
            raise ValueError("expected pandas.DataFrame as third argument")

        logging.info("Comparing - start comparing data")

        # start the timer for the comparing step
        start_time = time.time()

        sublabels_left = self._get_labels_left(validate=x)
        df_a_indexed = self._loc2(x[sublabels_left], pairs, 0)

        if x_link is None:
            sublabels_right = self._get_labels_right(validate=x)
            df_b_indexed = self._loc2(x[sublabels_right], pairs, 1)
        else:
            sublabels_right = self._get_labels_right(validate=x_link)
            df_b_indexed = self._loc2(x_link[sublabels_right], pairs, 1)

        # log timing
        index_time = time.time() - start_time

        results = pandas.DataFrame(index=pairs)
        label_num = 0  # used to make a label if label is None

        for f, lbl1, lbl2, label, args, kwargs in self._compare_functions:

            data1 = tuple([df_a_indexed[lbl] for lbl in listify(lbl1)])
            data2 = tuple([df_b_indexed[lbl] for lbl in listify(lbl2)])

            c = f(*tuple(data1 + data2 + args), **kwargs)

            if isinstance(c, (pandas.Series, pandas.DataFrame)):
                c = c.values  # convert pandas into numpy

            if label is not None:
                label = listify(label)

            n_cols = 1 if len(c.shape) == 1 else c.shape[1]

            labels = []
            for i in range(0, n_cols):

                label_val = label[i] if label is not None else label_num
                label_num += 1

                labels.append(label_val)

            results[label_val] = c

        # log timing
        total_time = time.time() - start_time

        # log timing
        logging.info("Comparing - computation time: ~{:.2f}s (from which "
                     "indexing: ~{:.2f}s)".format(total_time, index_time))

        # log results
        logf_result = "Comparing - summary shape={}"
        logging.info(logf_result.format(results.shape))

        return results
Example #17
    def _link_index(self, df_a, df_b):

        # Index name conflicts do not occur. They are handled in the
        # decorator.

        left_on = listify(self.left_on)
        right_on = listify(self.right_on)

        if self.on:
            left_on = listify(self.on)
            right_on = listify(self.on)

        if not left_on or not right_on:
            raise ValueError("no column labels given")

        if len(left_on) != len(right_on):
            raise ValueError(
                "length of left and right keys needs to be the same")

        window = self.window

        # Check if window is an odd number
        if not isinstance(window, int) or (window < 0) or not bool(window % 2):
            raise ValueError('window is not a positive and odd integer')

        # # sorting key is single column
        # if isinstance(self.on, (tuple, list, dict)):
        #     raise ValueError(
        #         "sorting key is not a label")

        # make blocking keys correct

        block_left_on = listify(self.block_left_on)
        block_right_on = listify(self.block_right_on)

        if self.block_on:
            block_left_on = listify(self.block_on)
            block_right_on = listify(self.block_on)

        # drop missing values and columns without relevant information
        data_left = df_a[listify(left_on) + block_left_on].dropna(
            axis=0, how='any', inplace=False)
        data_left.columns = ['sorting_key'] + \
            ["blocking_key_%d" % i for i, v in enumerate(block_left_on)]
        data_left['index_x'] = data_left.index

        data_right = df_b[listify(right_on) + block_right_on].dropna(
            axis=0, how='any', inplace=False)
        data_right.columns = ['sorting_key'] + \
            ["blocking_key_%d" % i for i, v in enumerate(block_right_on)]
        data_right['index_y'] = data_right.index

        # sorting_key_values is the terminology in Data Matching [Christen,
        # 2012]
        if self.sorting_key_values is None:

            self.sorting_key_values = self._get_sorting_key_values(
                data_left['sorting_key'].values,
                data_right['sorting_key'].values)

        sorting_key_factors = pandas.Series(numpy.arange(
            len(self.sorting_key_values)),
                                            index=self.sorting_key_values)

        data_left['sorting_key'] = data_left['sorting_key'].map(
            sorting_key_factors)
        data_right['sorting_key'] = data_right['sorting_key'].map(
            sorting_key_factors)

        # Internal window size
        _window = int((window - 1) / 2)

        def merge_lagged(x, y, w):
            """Merge two dataframes with a lag on in the sorting key."""

            y = y.copy()
            y['sorting_key'] = y['sorting_key'] + w

            return x.merge(y, how='inner')

        pairs_concat = [
            merge_lagged(data_left, data_right, w)
            for w in range(-_window, _window + 1)
        ]

        pairs = pandas.concat(pairs_concat, axis=0).set_index(
            ['index_x',
             'index_y']).index.rename([df_a.index.name, df_b.index.name])

        return pairs
Example #18
    def compare(self, comp_func, labels_a, labels_b, *args, **kwargs):
        """[DEPRECATED] Compare two records.

        Core method to compare record pairs. This method takes a function and
        data from both records in the record pair. The data is compared with
        the compare function. The built-in methods also use this function.

        Example
        -------

        >>> comp = recordlinkage.Compare(PAIRS, DATAFRAME1, DATAFRAME2)
        >>> comp.exact('first_name', 'name')

        >>> # same as
        >>> comp.compare(recordlinkage._compare_exact, 'first_name', 'name')

        Parameters
        ----------
        comp_func : function
            A comparison function. This function can be a built-in function or
            a user defined comparison function.
        labels_a : label, pandas.Series, pandas.DataFrame
            The labels, Series or DataFrame to compare.
        labels_b : label, pandas.Series, pandas.DataFrame
            The labels, Series or DataFrame to compare.
        name : label
            The name of the feature and the name of the column.
        store : bool, default True
            Store the result in the dataframe.

        Returns
        -------
        pandas.Series
            A pandas series with the result of comparing each record pair.

        """

        if isinstance(comp_func, pandas.MultiIndex):
            raise ValueError("see new api documentation: "
                             "use method 'compute' instead of 'compare'")

        if len(self.pairs) == 0:
            raise ValueError("need at least one record pair")

        # the name and store arguments
        name = kwargs.pop('name', None)
        store = kwargs.pop('store', True)

        labels_a = listify(labels_a)
        labels_b = listify(labels_b)

        data_a = []

        for label_a in labels_a:

            # the label is a numpy or pandas object
            if is_numpy_like(label_a) or is_pandas_like(label_a):
                data_a.append(label_a)

            # check requested labels (for better error messages)
            elif label_a not in self.df_a.columns:
                raise KeyError("label '{}' is not found in the first"
                               "dataframe".format(label_a))

            else:

                if self.low_memory:

                    df_a_label = self._loc2(self.df_a[label_a], self.pairs, 0)
                    data_a.append(df_a_label)

                # not low memory
                else:
                    if self._df_a_indexed is None:

                        self._df_a_indexed = self._loc2(
                            self.df_a, self.pairs, 0)

                    data_a.append(self._df_a_indexed[label_a])

        data_a = tuple(data_a)

        data_b = []

        for label_b in labels_b:

            # the label is a numpy or pandas object
            if is_numpy_like(label_b) or is_pandas_like(label_b):
                data_b.append(label_b)

            # check requested labels (for better error messages)
            elif label_b not in self.df_b.columns:

                raise KeyError("label '{}' is not found in the second"
                               "dataframe".format(label_b))

            else:

                if self.low_memory:

                    df_b_label = self._loc2(self.df_b[label_b], self.pairs, 1)
                    data_b.append(df_b_label)

                # not low memory
                else:
                    if self._df_b_indexed is None:

                        self._df_b_indexed = self._loc2(
                            self.df_b, self.pairs, 1)

                    data_b.append(self._df_b_indexed[label_b])

        data_b = tuple(data_b)

        # Compute the comparison
        c = comp_func(*tuple(data_a + data_b + args), **kwargs)

        # if a pandas series is returned, overwrite the index. The
        # returned index can be different than the MultiIndex passed to
        # the compare function.
        if isinstance(c, pandas.Series):
            c.index = self.vectors.index

        # append column to Compare.vectors
        if store:
            name_or_id = name if name else len(self.vectors.columns)
            self.vectors[name_or_id] = c

        return self.vectors[name_or_id].rename(name)
Example #19
    def _compute(self, pairs, x, x_link=None):

        logging.info("Comparing - start comparing data")

        # start the timer for the comparing step
        start_time = time.time()

        sublabels_left = self._get_labels_left(validate=x)
        df_a_indexed = frame_indexing(x[sublabels_left], pairs, 0)

        if x_link is None:
            sublabels_right = self._get_labels_right(validate=x)
            df_b_indexed = frame_indexing(x[sublabels_right], pairs, 1)
        else:
            sublabels_right = self._get_labels_right(validate=x_link)
            df_b_indexed = frame_indexing(x_link[sublabels_right], pairs, 1)

        # log timing
        index_time = time.time() - start_time

        results = pandas.DataFrame(index=pairs)
        label_num = 0  # used to make a label if label is None

        for feat, label in self.features:

            lbl1 = feat.labels_left
            lbl2 = feat.labels_right

            data1 = tuple([df_a_indexed[lbl] for lbl in listify(lbl1)])
            data2 = tuple([df_b_indexed[lbl] for lbl in listify(lbl2)])

            c = feat._compute(*tuple(data1 + data2))

            if is_pandas_like(c):
                c = c.values  # convert pandas into numpy

            if label is not None:
                label = listify(label)

            n_cols = 1 if len(c.shape) == 1 else c.shape[1]

            labels = []
            for i in range(0, n_cols):

                label_val = label[i] if label is not None else label_num
                label_num += 1

                labels.append(label_val)

            results[label_val] = c

        # log timing
        total_time = time.time() - start_time

        # log timing
        logging.info("Comparing - computation time: ~{:.2f}s (from which "
                     "indexing: ~{:.2f}s)".format(total_time, index_time))

        # log results
        logf_result = "Comparing - summary shape={}"
        logging.info(logf_result.format(results.shape))

        return results