Пример #1
0
    # Parse CLI options and connect to the arkouda server before running checks.
    args = parser.parse_args()
    ak.verbose = False
    ak.connect(args.hostname, args.port)
    print("size = ", args.size)
    SIZE = args.size
    # Two random int arrays drawn from [0, 2*SIZE) so they partially overlap.
    a = ak.randint(0, 2 * SIZE, SIZE)
    b = ak.randint(0, 2 * SIZE, SIZE)

    set_union = ak.union1d(a, b)
    print("union1d = ", set_union.size, set_union)
    # elements in a or elements in b (or in both a and b)
    passed = ak.all(ak.in1d(set_union, a) | ak.in1d(set_union, b))
    print("union1d passed test: ", passed)

    set_intersection = ak.intersect1d(a, b)
    print("intersect1d = ", set_intersection.size, set_intersection)
    # elements in a and elements in b (elements in both a and b)
    passed = ak.all(
        ak.in1d(set_intersection, a) & ak.in1d(set_intersection, b))
    print("intersect1d passed test: ", passed)

    set_difference = ak.setdiff1d(a, b)
    print("setdiff1d = ", set_difference.size, set_difference)
    # elements in a and not in b
    # NOTE: the result must be stored in `passed` (not `passes`); otherwise the
    # print below would report the stale intersect1d result.
    passed = ak.all(
        ak.in1d(set_difference, a) & ak.in1d(set_difference, b, invert=True))
    print("setdiff1d passed test: ", passed)

    set_xor = ak.setxor1d(a, b)
    print("setxor1d = ", set_xor.size, set_xor)
Пример #2
0
def inner_join2(left, right, wherefunc=None, whereargs=None, forceDense=False):
    '''Perform inner join on values in <left> and <right>,
    using conditions defined by <wherefunc> evaluated on
    <whereargs>, returning indices of left-right pairs.

    Parameters
    ----------
    left : pdarray(int64)
        The left values to join
    right : pdarray(int64)
        The right values to join
    wherefunc : function, optional
        Function that takes two pdarray arguments and returns
        a pdarray(bool) used to filter the join. Results for
        which wherefunc is False will be dropped.
    whereargs : 2-tuple of pdarray
        The two pdarray arguments to wherefunc
    forceDense : bool, optional
        If True, always remap the join values to dense 0-up codes before
        counting. The remap is also applied automatically when the largest
        joined value is much bigger than the inputs, or when any joined
        value is negative (raw values are used as array indices otherwise).

    Returns
    -------
    leftInds : pdarray(int64)
        The left indices of pairs that meet the join condition
    rightInds : pdarray(int64)
        The right indices of pairs that meet the join condition

    Raises
    ------
    ValueError
        If left/right are not int64 pdarrays, if wherefunc does not accept
        exactly two arguments, if whereargs is not a 2-tuple whose sizes
        match left and right, or if wherefunc fails on a small sample.

    Notes
    -----
    The return values satisfy the following assertions

    `assert (left[leftInds] == right[rightInds]).all()`
    `assert wherefunc(whereargs[0][leftInds], whereargs[1][rightInds]).all()`

    Pairs are emitted left-major: for each left element (in left order), one
    pair per matching right element. If the values do not intersect, two
    empty int64 arrays are returned.

    '''
    if (not isinstance(left, ak.pdarray) or left.dtype != ak.int64
            or not isinstance(right, ak.pdarray) or right.dtype != ak.int64):
        raise ValueError("left and right must be pdarray(int64)")
    if wherefunc is not None:
        from inspect import signature
        sample = min((left.size, right.size, 5))
        if len(signature(wherefunc).parameters) != 2:
            raise ValueError("wherefunc must be a function that accepts exactly two arguments")
        if whereargs is None or len(whereargs) != 2:
            raise ValueError("whereargs must be a 2-tuple with left and right arg arrays")
        if whereargs[0].size != left.size:
            raise ValueError("Left whereargs must be same size as left join values")
        if whereargs[1].size != right.size:
            raise ValueError("Right whereargs must be same size as right join values")
        # Dry-run wherefunc on a few elements so errors surface before the join
        try:
            _ = wherefunc(whereargs[0][:sample], whereargs[1][:sample])
        except Exception as e:
            raise ValueError("Error evaluating wherefunc") from e
    # Only join on intersection
    inter = ak.intersect1d(left, right)
    if inter.size == 0:
        # Nothing joins; avoid reducing/grouping empty arrays below
        return ak.zeros(0, dtype=ak.int64), ak.zeros(0, dtype=ak.int64)
    # Indices of left values present in intersection
    leftInds = ak.arange(left.size)[ak.in1d(left, inter)]
    # Left vals in intersection
    leftFilt = left[leftInds]
    # Indices of right vals present in inter
    rightInds = ak.arange(right.size)[ak.in1d(right, inter)]
    # Right vals in inter
    rightFilt = right[rightInds]
    byRight = ak.GroupBy(rightFilt)
    maxVal = inter.max()
    # Raw values can only serve as table indices when they are non-negative
    # and not so large that the lookup tables dwarf the inputs.
    if forceDense or inter.min() < 0 or maxVal > 3*(left.size + right.size):
        # Remap intersection to dense, 0-up codes
        # Replace left values with dense codes
        byLeft = ak.GroupBy(leftFilt)
        uniqLeftVals = byLeft.unique_keys
        uniqLeftCodes = ak.arange(inter.size)[ak.in1d(inter, uniqLeftVals)]
        leftCodes = ak.zeros_like(leftFilt) - 1
        leftCodes[byLeft.permutation] = byLeft.broadcast(uniqLeftCodes, permute=False)
        # Dense codes of the unique right values
        uniqRightVals = byRight.unique_keys
        uniqRightCodes = ak.arange(inter.size)[ak.in1d(inter, uniqRightVals)]
        countSize = inter.size
    else:
        uniqRightCodes = byRight.unique_keys
        leftCodes = leftFilt
        countSize = maxVal + 1
    # Per-code lookup tables describing the grouped right side:
    # how many right elements carry each code, and where that code's
    # run starts in the grouped (byRight.permutation) ordering.
    rightCounts = ak.zeros(countSize, dtype=ak.int64)
    rightCounts[uniqRightCodes] = byRight.count()[1]
    rightStarts = ak.zeros(countSize, dtype=ak.int64)
    rightStarts[uniqRightCodes] = byRight.segments
    # Expand indices to product domain.
    # Each left element yields one output pair per matching right element;
    # every left code is in the intersection, so each run is non-empty.
    matchCounts = rightCounts[leftCodes]
    total = matchCounts.sum()
    outStarts = ak.cumsum(matchCounts) - matchCounts
    # Repeat each left index as many times as that code occurs in right
    leftFullInds = ak.broadcast(outStarts, leftInds, total)
    # Offset of each output slot within its left element's run
    offsets = ak.arange(total) - ak.broadcast(outStarts, outStarts, total)
    # Walk the matching right group for each left element so that output
    # slot k pairs leftFullInds[k] with a right element of the same value.
    rightGrouped = rightInds[byRight.permutation]
    rightFullInds = rightGrouped[ak.broadcast(outStarts, rightStarts[leftCodes], total) + offsets]
    # Evaluate where clause
    if wherefunc is None:
        return leftFullInds, rightFullInds
    # Gather whereargs
    leftWhere = whereargs[0][leftFullInds]
    rightWhere = whereargs[1][rightFullInds]
    # Evaluate wherefunc and keep only the pairs that satisfy it
    whereSatisfied = wherefunc(leftWhere, rightWhere)
    return leftFullInds[whereSatisfied], rightFullInds[whereSatisfied]
Пример #3
0
def intersect(a, b, positions=True, unique=False):
    """
    Find the intersection of two arkouda arrays.

    This function can be especially useful when `positions=True` so
    that the caller gets the positions of values present in both arrays.

    Parameters
    ----------
    a : ak.Strings or ak.pdarray
        First array. Must be the same kind of array as `b`.

    b : ak.Strings or ak.pdarray
        Second array. Must be the same kind of array as `a`.

    positions : bool (default=True)
        Return tuple of boolean pdarrays that indicate positions in a and b
        where the values are in the intersection.

    unique : bool (default=False)
        If the number of distinct values in `a` (and `b`) is equal to the size of
        `a` (and `b`), there is a more efficient method to compute the intersection.
        Only consulted for ak.Strings inputs.

    Returns
    -------
    (ak.pdarray, ak.pdarray)
        When `positions=True`: boolean masks over `a` and `b` that are True
        where the element occurs at least once in both arrays.
    ak.pdarray or ak.Strings
        When `positions=False`: the intersecting values. For pdarray inputs
        this is the result of `ak.intersect1d(a, b)`; for Strings inputs it
        is `a` filtered by its mask (so repeated values in `a` are kept).

    Raises
    ------
    TypeError
        If `a` and `b` are not both ak.pdarray or both ak.Strings.
    """

    # pdarrays can use the built-in set operations directly.
    if isinstance(a, ak.pdarray) and isinstance(b, ak.pdarray):
        intx = ak.intersect1d(a, b)
        if not positions:
            return intx
        return (ak.in1d(a, intx), ak.in1d(b, intx))

    # Anything other than a Strings/Strings pair is unsupported; fail loudly
    # instead of silently returning None.
    if not (isinstance(a, ak.Strings) and isinstance(b, ak.Strings)):
        raise TypeError(
            "a and b must both be ak.pdarray or both be ak.Strings")

    # It takes more effort to do this with ak.Strings arrays:
    # hash the two arrays first (each hash is a pair of pdarrays).
    hash_a00, hash_a01 = a.hash()
    hash_b00, hash_b01 = b.hash()

    if unique:
        # a and b do not have duplicate entries, so within each array the
        # hashes are distinct; a hash seen twice in the concatenation must
        # come from both arrays.
        gb = ak.GroupBy([ak.concatenate([hash_a00, hash_b00]),
                         ak.concatenate([hash_a01, hash_b01])])
        _, cnt = gb.count()

        # Per-element hash counts, in the original (concatenated) order
        in_both = gb.broadcast(cnt, permute=True) > 1

        # Split the concatenated mask back into its a and b parts
        maska = in_both[:a.size]
        maskb = in_both[a.size:]
    else:
        # a and b may have duplicate entries, so reduce each array to its
        # unique hash values first.
        gba = ak.GroupBy([hash_a00, hash_a01])
        gbb = ak.GroupBy([hash_b00, hash_b01])
        a0, a1 = gba.unique_keys
        b0, b1 = gbb.unique_keys

        # Group the concatenated unique hashes; a count above one means the
        # hash is present in both a and b.
        gb = ak.GroupBy([ak.concatenate([a0, b0]),
                         ak.concatenate([a1, b1])])
        _, cnt = gb.count()

        # Counts aligned with the concatenated unique keys
        counts = gb.broadcast(cnt, permute=True)

        # Broadcast back up one more level, from unique keys to the
        # original element order of a and b.
        maska = gba.broadcast(counts[:a0.size], permute=True) > 1
        maskb = gbb.broadcast(counts[a0.size:], permute=True) > 1

    if positions:
        return (maska, maskb)
    return a[maska]