# Smoke test for the arkouda set operations (union1d, intersect1d, setdiff1d,
# setxor1d) on two random integer arrays.
# NOTE(review): `parser` and `ak` are defined above this view.
args = parser.parse_args()
ak.verbose = False
ak.connect(args.hostname, args.port)

print("size = ", args.size)
SIZE = args.size
# Values are drawn from [0, 2*SIZE) so the two arrays overlap only partially.
a = ak.randint(0, 2 * SIZE, SIZE)
b = ak.randint(0, 2 * SIZE, SIZE)

set_union = ak.union1d(a, b)
print("union1d = ", set_union.size, set_union)
# elements in a or elements in b (or in both a and b)
passed = ak.all(ak.in1d(set_union, a) | ak.in1d(set_union, b))
print("union1d passed test: ", passed)

set_intersection = ak.intersect1d(a, b)
print("intersect1d = ", set_intersection.size, set_intersection)
# elements in a and elements in b (elements in both a and b)
passed = ak.all(
    ak.in1d(set_intersection, a) & ak.in1d(set_intersection, b))
print("intersect1d passed test: ", passed)

set_difference = ak.setdiff1d(a, b)
print("setdiff1d = ", set_difference.size, set_difference)
# elements in a and not in b
# (the result must be bound to `passed`; it was previously bound to a
# misspelled name, so the line below reported the intersect1d result)
passed = ak.all(
    ak.in1d(set_difference, a) & ak.in1d(set_difference, b, invert=True))
print("setdiff1d passed test: ", passed)

set_xor = ak.setxor1d(a, b)
print("setxor1d = ", set_xor.size, set_xor)
def inner_join2(left, right, wherefunc=None, whereargs=None, forceDense=False):
    '''Perform inner join on values in <left> and <right>,
    using conditions defined by <wherefunc> evaluated on
    <whereargs>, returning indices of left-right pairs.

    Parameters
    ----------
    left : pdarray(int64)
        The left values to join
    right : pdarray(int64)
        The right values to join
    wherefunc : function, optional
        Function that takes two pdarray arguments and returns
        a pdarray(bool) used to filter the join. Results for
        which wherefunc is False will be dropped.
    whereargs : 2-tuple of pdarray
        The two pdarray arguments to wherefunc. Required when
        wherefunc is given; whereargs[0] must have the size of
        <left> and whereargs[1] the size of <right>.
    forceDense : bool, optional
        If True, always remap the shared values to dense, 0-up
        codes before counting. If False (default), the remap is
        only done when the largest shared value exceeds
        3*(left.size + right.size) or a shared value is negative;
        otherwise the values themselves are used as codes.

    Returns
    -------
    leftInds : pdarray(int64)
        The left indices of pairs that meet the join condition
    rightInds : pdarray(int64)
        The right indices of pairs that meet the join condition

    Both arrays are empty when <left> and <right> share no value.

    Raises
    ------
    ValueError
        If left or right is not a pdarray(int64), if wherefunc does
        not accept exactly two arguments, if whereargs is missing or
        mis-sized, or if wherefunc fails on a small sample of whereargs.

    Notes
    -----
    The return values satisfy the following assertions

    `assert (left[leftInds] == right[rightInds]).all()`
    `assert wherefunc(whereargs[0][leftInds], whereargs[1][rightInds]).all()`
    '''
    if (not isinstance(left, ak.pdarray) or left.dtype != ak.int64
            or not isinstance(right, ak.pdarray) or right.dtype != ak.int64):
        raise ValueError("left and right must be pdarray(int64)")
    if wherefunc is not None:
        from inspect import signature
        # Dry-run wherefunc on at most 5 rows so that a broken predicate
        # fails here rather than after the full join has been expanded
        sample = min((left.size, right.size, 5))
        if len(signature(wherefunc).parameters) != 2:
            raise ValueError("wherefunc must be a function that accepts exactly two arguments")
        if whereargs is None or len(whereargs) != 2:
            raise ValueError("whereargs must be a 2-tuple with left and right arg arrays")
        if whereargs[0].size != left.size:
            raise ValueError("Left whereargs must be same size as left join values")
        if whereargs[1].size != right.size:
            raise ValueError("Right whereargs must be same size as right join values")
        try:
            _ = wherefunc(whereargs[0][:sample], whereargs[1][:sample])
        except Exception as e:
            raise ValueError("Error evaluating wherefunc") from e
    # Only join on intersection
    inter = ak.intersect1d(left, right)
    if inter.size == 0:
        # Nothing in common: there is no max value to size the count
        # tables with, and the join result is simply empty
        return ak.zeros(0, dtype=ak.int64), ak.zeros(0, dtype=ak.int64)
    # Indices of left values present in intersection
    leftInds = ak.arange(left.size)[ak.in1d(left, inter)]
    # Left vals in intersection
    leftFilt = left[leftInds]
    # Indices of right vals present in inter
    rightInds = ak.arange(right.size)[ak.in1d(right, inter)]
    # Right vals in inter
    rightFilt = right[rightInds]
    byRight = ak.GroupBy(rightFilt)
    maxVal = inter.max()
    # Raw values can only serve as codes when they are valid, reasonably
    # small, non-negative indices into the per-code tables below
    if forceDense or inter.min() < 0 or maxVal > 3*(left.size + right.size):
        # Remap intersection to dense, 0-up codes
        # Replace left values with dense codes
        byLeft = ak.GroupBy(leftFilt)
        uniqLeftVals = byLeft.unique_keys
        uniqLeftCodes = ak.arange(inter.size)[ak.in1d(inter, uniqLeftVals)]
        leftCodes = ak.zeros_like(leftFilt) - 1
        leftCodes[byLeft.permutation] = byLeft.broadcast(uniqLeftCodes, permute=False)
        # Dense code of each unique right value
        uniqRightVals = byRight.unique_keys
        uniqRightCodes = ak.arange(inter.size)[ak.in1d(inter, uniqRightVals)]
        countSize = inter.size
    else:
        uniqRightCodes = byRight.unique_keys
        leftCodes = leftFilt
        countSize = maxVal + 1
    # Per-code lookup tables for the right side: how many right rows carry
    # the code, and where that code's run starts in byRight's grouped order
    rightCounts = ak.zeros(countSize, dtype=ak.int64)
    rightCounts[uniqRightCodes] = byRight.count()[1]
    rightStarts = ak.zeros(countSize, dtype=ak.int64)
    rightStarts[uniqRightCodes] = byRight.segments
    # Expand indices to product domain
    # Repeat each left index as many times as that code occurs in right
    prodLeft = rightCounts[leftCodes]
    offsets = ak.cumsum(prodLeft) - prodLeft
    total = int(prodLeft.sum())
    leftFullInds = ak.broadcast(offsets, leftInds, total)
    # Output position p lies in the run of some left row i (starting at
    # offsets[i]); pair it with the (p - offsets[i])-th right row of the same
    # code. Expanding the right side independently of the left would not
    # line matching rows up position-by-position.
    rightPos = ak.arange(total) + ak.broadcast(offsets, rightStarts[leftCodes] - offsets, total)
    rightFullInds = rightInds[byRight.permutation[rightPos]]
    # Evaluate where clause
    if wherefunc is None:
        return leftFullInds, rightFullInds
    else:
        # Gather whereargs
        leftWhere = whereargs[0][leftFullInds]
        rightWhere = whereargs[1][rightFullInds]
        # Evaluate wherefunc and keep only the pairs that satisfy it
        whereSatisfied = wherefunc(leftWhere, rightWhere)
        return leftFullInds[whereSatisfied], rightFullInds[whereSatisfied]
def intersect(a, b, positions=True, unique=False):
    """
    Compute the intersection of two arkouda arrays.

    With `positions=True` the caller receives, for each input, a boolean
    mask marking the entries whose value also occurs in the other input.

    Parameters
    ----------
    a : ak.Strings or ak.pdarray
        First array.
    b : ak.Strings or ak.pdarray
        Second array; must be of the same kind as `a`.
    positions : bool (default=True)
        If True, return the pair of boolean masks `(maska, maskb)` over
        `a` and `b`. If False, return the shared values instead:
        `ak.intersect1d(a, b)` for pdarrays, `a[maska]` for Strings.
    unique : bool (default=False)
        Only consulted for ak.Strings inputs. Set it when neither `a` nor
        `b` contains duplicate entries; the per-array de-duplication step
        is then skipped.

    Returns
    -------
    (ak.pdarray, ak.pdarray) or ak.pdarray or ak.Strings
        See `positions`. If `a` and `b` are neither both ak.pdarray nor
        both ak.Strings, nothing is computed and None is returned.
    """
    def _expand_to_rows(grouping, per_group):
        # Give every row the value of its group (grouped order first),
        # then scatter through the GroupBy permutation so the result is
        # back in the order of the rows the GroupBy was built from.
        expanded = grouping.broadcast(per_group, permute=False)
        grouped_order = expanded[:]
        expanded[grouping.permutation] = grouped_order
        return expanded

    # pdarrays: the built-in set operations do all the work.
    if isinstance(a, ak.pdarray) and isinstance(b, ak.pdarray):
        common = ak.intersect1d(a, b)
        if positions:
            return (ak.in1d(a, common), ak.in1d(b, common))
        return common

    # Any other combination of argument types is unsupported and yields None.
    if not (isinstance(a, ak.Strings) and isinstance(b, ak.Strings)):
        return None

    # Strings are compared through their two-part hashes.
    a_h0, a_h1 = a.hash()
    b_h0, b_h1 = b.hash()

    if unique:
        # No duplicates inside a or b, so a hash seen more than once in the
        # concatenation must come from both arrays.
        merged = ak.GroupBy([ak.concatenate([a_h0, b_h0]),
                             ak.concatenate([a_h1, b_h1])])
        occurrences = _expand_to_rows(merged, merged.count()[1])
        shared = occurrences > 1
        mask_a = shared[:a.size]
        mask_b = shared[a.size:]
    else:
        # Duplicates are possible: reduce each array to its distinct hashes,
        # count across the two sets of distinct hashes, then push the counts
        # back down to the individual rows of each array.
        by_a = ak.GroupBy([a_h0, a_h1])
        by_b = ak.GroupBy([b_h0, b_h1])
        ua0, ua1 = by_a.unique_keys
        ub0, ub1 = by_b.unique_keys
        merged = ak.GroupBy([ak.concatenate([ua0, ub0]),
                             ak.concatenate([ua1, ub1])])
        occurrences = _expand_to_rows(merged, merged.count()[1])
        n_distinct_a = ua0.size
        mask_a = _expand_to_rows(by_a, occurrences[:n_distinct_a]) > 1
        mask_b = _expand_to_rows(by_b, occurrences[n_distinct_a:]) > 1

    if positions:
        return (mask_a, mask_b)
    return a[mask_a]