def max_pairs(shape):
    """[DEPRECATED] Compute the maximum number of record pairs possible.

    Prefer ``full_index_size``, which computes the same quantity.

    Parameters
    ----------
    shape : object, or tuple/list of objects
        Either a single dataset descriptor, or a tuple/list of them. Each
        descriptor is turned into a length by ``get_length`` (defined
        elsewhere in this module).

    Returns
    -------
    int or numpy.uint64
        For a single descriptor (bare, or wrapped in a one-element
        tuple/list) with length ``x``: ``x * (x - 1) / 2`` as a Python int
        (deduplication). Otherwise the product of all lengths as a
        ``numpy.uint64`` (linking); an empty tuple/list yields ``1``.
    """
    if isinstance(shape, (tuple, list)):
        if len(shape) != 1:
            # Linking case: every record of each dataset pairs with every
            # record of the others.
            return numpy.prod(
                [get_length(xi) for xi in shape], dtype=numpy.uint64
            )
        # A one-element sequence is treated like a bare argument.
        shape = shape[0]

    x = get_length(shape)
    # x * (x - 1) is always even, so floor division is exact. True division
    # would go through a float and lose precision once the product exceeds
    # 2**53.
    return int(x * (x - 1) // 2)
def full_index_size(*args):
    """Compute the number of records in a full index.

    Compute the number of records in a full index without building the index
    itself. The result is the maximum number of record pairs possible. This
    function is especially useful in measures like the `reduction_ratio`.

    Deduplication: Given a DataFrame A with length N, the full index size is
    N*(N-1)/2. Linking: Given a DataFrame A with length N and a DataFrame B
    with length M, the full index size is N*M.

    Parameters
    ----------
    *args: int, pandas.MultiIndex, pandas.Series, pandas.DataFrame
        A pandas object or a int representing the length of a dataset to
        link. When there is one argument, it is assumed that the record
        linkage is a deduplication process. A single list or tuple argument
        is unpacked and treated as if its items had been passed separately.

    Returns
    -------
    int or numpy.uint64
        A Python int for deduplication (one argument); a ``numpy.uint64``
        product of the lengths otherwise.

    Examples
    --------

    Use integers:

    >>> full_index_size(10)  # deduplication: 45 pairs
    >>> full_index_size(10, 10)  # linking: 100 pairs

    or pandas objects

    >>> full_index_size(DF)  # deduplication: len(DF)*(len(DF)-1)/2 pairs
    >>> full_index_size(DF, DF)  # linking: len(DF)*len(DF) pairs

    """
    # check if a list or tuple is passed as argument
    if len(args) == 1 and isinstance(args[0], (list, tuple)):
        args = tuple(args[0])

    if len(args) == 1:
        n = get_length(args[0])
        # n * (n - 1) is always even, so floor division is exact; true
        # division would round through a float for very large n.
        return int(n * (n - 1) // 2)

    # Was `numpy.unit64` (typo), which raised AttributeError on every
    # linking call.
    return numpy.prod(
        [get_length(arg) for arg in args], dtype=numpy.uint64
    )