Пример #1
0
def max_pairs(shape):
    """[DEPRECATED] Compute the maximum number of record pairs possible.

    Parameters
    ----------
    shape : object, or tuple/list of objects
        Either a single dataset description, or a tuple/list of them. Every
        item is converted to a length with ``get_length``.

    Returns
    -------
    int or numpy.uint64
        For a single item (passed bare or as a one-element tuple/list) with
        length ``x``: ``x * (x - 1) / 2`` as a Python ``int``. For a
        tuple/list of any other size: the product of all lengths, computed
        by ``numpy.prod`` with ``dtype=numpy.uint64``.
    """

    is_sequence = isinstance(shape, (tuple, list))

    # Several datasets (or an empty sequence): product of the lengths.
    if is_sequence and len(shape) != 1:
        return numpy.prod(
            [get_length(xi) for xi in shape], dtype=numpy.uint64)

    # Single dataset, given bare or wrapped in a one-element sequence.
    x = get_length(shape[0] if is_sequence else shape)

    # x * (x - 1) is always even, so floor division is exact. True division
    # would go through a float and lose precision above 2**53.
    return int(x * (x - 1) // 2)
Пример #2
0
def full_index_size(*args):
    """Compute the number of records in a full index.

    Compute the number of records in a full index without building the index
    itself. The result is the maximum number of record pairs possible. This
    function is especially useful in measures like the `reduction_ratio`.

    Deduplication: Given a DataFrame A with length N, the full index size is
    N*(N-1)/2. Linking: Given a DataFrame A with length N and a DataFrame B
    with length M, the full index size is N*M.

    Parameters
    ----------
    *args: int, pandas.MultiIndex, pandas.Series, pandas.DataFrame
        A pandas object or an int representing the length of a dataset to
        link. When there is one argument, it is assumed that the record
        linkage is a deduplication process. A single list or tuple argument
        is unpacked and treated as if its items were passed separately.

    Returns
    -------
    int or numpy.uint64
        A Python ``int`` for deduplication (one dataset); otherwise the
        product of the dataset lengths as computed by ``numpy.prod`` with
        ``dtype=numpy.uint64``.

    Examples
    --------

    Use integers:
    >>> full_index_size(10)  # deduplication: 45 pairs
    >>> full_index_size(10, 10)  # linking: 100 pairs

    or pandas objects
    >>> full_index_size(DF)  # deduplication: len(DF)*(len(DF)-1)/2 pairs
    >>> full_index_size(DF, DF)  # linking: len(DF)*len(DF) pairs

    """

    # check if a list or tuple is passed as argument
    if len(args) == 1 and isinstance(args[0], (list, tuple)):
        args = tuple(args[0])

    if len(args) == 1:
        n = get_length(args[0])
        # n * (n - 1) is always even, so floor division is exact. True
        # division would go through a float and lose precision above 2**53.
        size = int(n * (n - 1) // 2)
    else:
        # The dtype must be ``numpy.uint64``; the former ``numpy.unit64``
        # does not exist and raised AttributeError on every linking call.
        size = numpy.prod([get_length(arg) for arg in args],
                          dtype=numpy.uint64)

    return size