def zeros(size: int, dtype: type = np.float64) -> pdarray:
    """
    Create a pdarray filled with zeros.

    Parameters
    ----------
    size : int
        Size of the array (only rank-1 arrays supported)
    dtype : {float64, int64, bool}
        Type of resulting array, default float64

    Returns
    -------
    pdarray
        Zeros of the requested size and dtype

    Raises
    ------
    TypeError
        Raised if size is not a scalar (per ``np.isscalar``) or if the
        normalized dtype name is not one of the supported numeric dtypes

    See Also
    --------
    ones, zeros_like

    Examples
    --------
    >>> ak.zeros(5, dtype=ak.int64)
    array([0, 0, 0, 0, 0])
    >>> ak.zeros(5, dtype=ak.float64)
    array([0, 0, 0, 0, 0])
    >>> ak.zeros(5, dtype=ak.bool)
    array([False, False, False, False, False])
    """
    if not np.isscalar(size):
        offending = size.__class__.__name__
        raise TypeError("size must be a scalar, not {}".format(offending))

    # Normalize whatever the caller supplied before inspecting its name.
    resolved = akdtype(dtype)
    type_name = cast(np.dtype, resolved).name
    if type_name not in numericDTypes:
        raise TypeError("unsupported dtype {}".format(resolved))

    request = "create {} {}".format(type_name, size)
    return create_pdarray(generic_msg(request))
def ones(size: int, dtype: type = float64) -> pdarray:
    """
    Create a pdarray filled with ones.

    Parameters
    ----------
    size : int
        Size of the array (only rank-1 arrays supported)
    dtype : {float64, int64, bool}
        Resulting array type, default float64

    Returns
    -------
    pdarray
        Ones of the requested size and dtype

    Raises
    ------
    TypeError
        Raised if size is not a scalar (per ``np.isscalar``) or if the
        normalized dtype name is not one of the supported numeric dtypes

    See Also
    --------
    zeros, ones_like

    Examples
    --------
    >>> ak.ones(5, dtype=ak.int64)
    array([1, 1, 1, 1, 1])
    >>> ak.ones(5, dtype=ak.float64)
    array([1, 1, 1, 1, 1])
    >>> ak.ones(5, dtype=ak.bool)
    array([True, True, True, True, True])
    """
    if not np.isscalar(size):
        offending = size.__class__.__name__
        raise TypeError("size must be a scalar, not {}".format(offending))

    resolved = akdtype(dtype)  # normalize dtype
    type_name = resolved.name
    if type_name not in numericDTypes:
        raise TypeError("unsupported dtype {}".format(resolved))

    # The (kind, itemsize) result is not used here; the call is retained
    # because it may reject dtypes on its own -- TODO confirm and remove.
    translate_np_dtype(resolved)

    # The array is created first and then overwritten in place with ones.
    filled = create_pdarray(generic_msg("create {} {}".format(type_name, size)))
    filled.fill(1)
    return filled
def stick(self, other, delimiter="", toLeft=False):
    """
    Join the strings from another array onto one end of the strings of
    this array, optionally inserting a delimiter.

    Parameters
    ----------
    other : Strings
        The strings to join onto self's strings
    delimiter : str or bytes
        String inserted between self and other; bytes are decoded first
    toLeft : bool
        If true, join other strings to the left of self. By default,
        other is joined to the right of self.

    Returns
    -------
    Strings
        The array of joined strings

    Raises
    ------
    TypeError
        Raised if other is not a Strings object or if delimiter is
        neither a str nor bytes

    See Also
    --------
    lstick, peel, rpeel

    Examples
    --------
    >>> s = ak.array(['a', 'c', 'e'])
    >>> t = ak.array(['b', 'd', 'f'])
    >>> s.stick(t, delimiter='.')
    array(['a.b', 'c.d', 'e.f'])
    """
    if not isinstance(other, Strings):
        raise TypeError(
            "stick: not supported between Strings and {}".format(type(other)))

    if isinstance(delimiter, bytes):
        delimiter = delimiter.decode()
    if not isinstance(delimiter, str):
        raise TypeError(
            "Delimiter must be a string, not {}".format(type(delimiter)))

    # The delimiter travels JSON-encoded inside a one-element list.
    fields = (
        "stick",
        self.objtype,
        self.offsets.name,
        self.bytes.name,
        other.objtype,
        other.offsets.name,
        other.bytes.name,
        NUMBER_FORMAT_STRINGS['bool'].format(toLeft),
        json.dumps([delimiter]),
    )
    request = "segmentedBinopvv {} {} {} {} {} {} {} {} {}".format(*fields)
    reply = generic_msg(request)
    return Strings(*reply.split('+'))
def histogram(pda, bins=10):
    """
    Compute a histogram of evenly spaced bins over the range of an array.

    Parameters
    ----------
    pda : pdarray
        The values to histogram
    bins : int
        The number of equal-size bins to use (default: 10)

    Returns
    -------
    pdarray
        The number of values present in each bin

    Raises
    ------
    TypeError
        Raised if pda is not a pdarray or bins is not an int

    See Also
    --------
    value_counts

    Notes
    -----
    The bins are evenly spaced in the interval [pda.min(), pda.max()].
    Currently, the user must re-compute the bin edges, e.g. with
    np.linspace (see below) in order to plot the histogram.

    Examples
    --------
    >>> A = ak.arange(0, 10, 1)
    >>> nbins = 3
    >>> h = ak.histogram(A, bins=nbins)
    >>> h
    array([3, 3, 4])
    # Recreate the bin edges in NumPy
    >>> binEdges = np.linspace(A.min(), A.max(), nbins+1)
    >>> binEdges
    array([0., 3., 6., 9.])
    # To plot, use only the left edges, and export the histogram to NumPy
    >>> plt.plot(binEdges[:-1], h.to_ndarray())
    """
    arguments_ok = isinstance(pda, pdarray) and isinstance(bins, int)
    if not arguments_ok:
        raise TypeError(
            "must be pdarray {} and bins must be an int {}".format(pda, bins))

    reply = generic_msg("histogram {} {}".format(pda.name, bins))
    return create_pdarray(reply)
def suffix_array(strings: Strings) -> SArrays:
    """
    Return the suffix arrays of the given strings.

    The size/shape of each suffix array is the same as that of the
    corresponding string. As a simple example, the suffixes of the
    string "banana$" are::

        s[0]="banana$" s[1]="anana$" s[2]="nana$" s[3]="ana$"
        s[4]="na$"     s[5]="a$"     s[6]="$"

    The suffix array is the array of indices of the sorted suffixes::

        s[6]="$" s[5]="a$" s[3]="ana$" s[1]="anana$"
        s[0]="banana$" s[4]="na$" s[2]="nana$"

    so sa=[6,5,3,1,0,4,2].

    Parameters
    ----------
    strings : Strings
        The strings whose suffix arrays are requested

    Returns
    -------
    SArrays
        The suffix arrays of the given strings

    Raises
    ------
    RuntimeError
        Raised if there is a server-side error in executing the request
        or creating the object encapsulating the return message
    """
    request = "segmentedSuffixAry {} {} {}".format(
        strings.objtype, strings.offsets.name, strings.bytes.name)
    # The reply is '+'-separated; each piece is one SArrays argument.
    components = generic_msg(request).split('+')
    return SArrays(*components)
def random_strings_uniform(minlen: int, maxlen: int, size: int,
                           characters: str = 'uppercase',
                           seed: Union[None, int] = None) -> Strings:
    """
    Generate random strings with lengths uniformly distributed between
    minlen and maxlen, and with characters drawn from a specified set.

    Parameters
    ----------
    minlen : int
        The minimum allowed length of string
    maxlen : int
        The maximum allowed length of string
    size : int
        The number of strings to generate
    characters : (uppercase, lowercase, numeric, printable, binary)
        The set of characters to draw from
    seed : int, optional
        Sent unchanged as the final field of the server request
        (``None`` is sent as the text ``None``)

    Returns
    -------
    Strings
        The array of random strings

    Raises
    ------
    ValueError
        Raised if minlen < 0, maxlen < minlen, or size < 0

    See Also
    --------
    random_strings_lognormal, randint
    """
    if minlen < 0 or maxlen < minlen or size < 0:
        raise ValueError(
            ("Incompatible arguments: minlen < 0, maxlen < minlen, " +
             "or size < 0"))

    as_int64 = NUMBER_FORMAT_STRINGS['int64'].format
    request = "randomStrings {} {} {} {} {} {}".format(
        as_int64(size),
        "uniform",
        characters,
        as_int64(minlen),
        as_int64(maxlen),
        seed)
    reply = cast(str, generic_msg(request))
    return Strings(*reply.split('+'))
def unique(pda, return_counts=False):
    """
    Find the unique elements of an array.

    Returns the sorted unique elements of an array. Optionally also
    returns the number of times each unique value occurs in the input.

    Parameters
    ----------
    pda : pdarray
        Input array.
    return_counts : bool, optional
        If True, also return the number of times each unique item
        appears in `pda`.

    Returns
    -------
    unique : pdarray
        The sorted unique values.
    unique_counts : pdarray, optional
        The number of times each of the unique values comes up in the
        original array. Only provided if `return_counts` is True.

    Raises
    ------
    TypeError
        Raised if pda is not a pdarray

    Notes
    -----
    The computation happens on the server. The original documentation
    states that the server skips work when `pda` is already sorted and
    unique, and sorts it otherwise -- not verifiable from the client.

    Examples
    --------
    >>> A = ak.array([3, 2, 1, 1, 2, 3])
    >>> ak.unique(A)
    array([1, 2, 3])
    """
    if not isinstance(pda, pdarray):
        raise TypeError("must be pdarray {}".format(pda))

    reply = generic_msg("unique {} {}".format(pda.name, return_counts))
    if not return_counts:
        return create_pdarray(reply)

    # With counts requested the reply carries two '+'-joined descriptors.
    pieces = reply.split("+")
    if verbose:
        print(pieces)
    return create_pdarray(pieces[0]), create_pdarray(pieces[1])
def linspace(start: Union[float, int], stop: Union[float, int],
             length: int) -> pdarray:
    """
    Create a pdarray of linearly-spaced floats in a closed interval.

    Parameters
    ----------
    start : float or int
        Start of interval (inclusive)
    stop : float or int
        End of interval (inclusive)
    length : int
        Number of points

    Returns
    -------
    pdarray, float64
        Array of evenly spaced float values along the interval

    Raises
    ------
    TypeError
        Documented as raised if start or stop is not a float or int, or
        if length is not an int. NOTE(review): this body performs no
        such check itself -- presumably enforced elsewhere; confirm.

    See Also
    --------
    arange

    Notes
    -----
    If start is greater than stop, the pdarray values are generated in
    descending order.

    Examples
    --------
    >>> ak.linspace(0, 1, 5)
    array([0, 0.25, 0.5, 0.75, 1])
    >>> ak.linspace(start=1, stop=0, length=5)
    array([1, 0.75, 0.5, 0.25, 0])
    >>> ak.linspace(start=-5, stop=0, length=5)
    array([-5, -3.75, -2.5, -1.25, 0])
    """
    request = "linspace {} {} {}".format(start, stop, length)
    reply = generic_msg(request)
    return create_pdarray(reply)
def sort(pda: pdarray) -> pdarray:
    """
    Return a sorted copy of the array. Only sorts numeric arrays;
    for Strings, use argsort.

    Parameters
    ----------
    pda : pdarray
        The array to sort (int64 or float64)

    Returns
    -------
    pdarray, int64 or float64
        The sorted copy of pda

    Raises
    ------
    TypeError
        Raised if the parameter is not a pdarray
    ValueError
        Raised if sort attempted on a non-empty pdarray with an
        unsupported dtype such as bool

    See Also
    --------
    argsort

    Notes
    -----
    Uses a least-significant-digit radix sort, which is stable and
    resilient to non-uniformity in data but communication intensive.

    An empty input returns an empty array without contacting the sort
    command. Its dtype matches the input when that dtype is sortable,
    and falls back to int64 otherwise.

    Examples
    --------
    >>> a = ak.randint(0, 10, 10)
    >>> a_sorted = ak.sort(a)
    >>> a_sorted
    array([0, 1, 1, 3, 4, 5, 7, 8, 8, 9])
    """
    if pda.size == 0:
        # Previously this always produced int64, so sorting an empty
        # float64 array silently changed its dtype. Unsupported dtypes
        # keep the int64 fallback so no new error is introduced.
        empty_dtype = pda.dtype if pda.dtype in numeric_dtypes else int64
        return zeros(0, dtype=empty_dtype)
    if pda.dtype not in numeric_dtypes:
        raise ValueError("ak.sort supports float64 or int64, not {}".format(
            pda.dtype))
    repMsg = generic_msg(cmd="sort", args="{}".format(pda.name))
    return create_pdarray(cast(str, repMsg))
def flatten(self, delimiter: str,
            return_segments: bool = False) -> Union[Strings, Tuple]:
    """
    Unpack delimiter-joined substrings into a flat array.

    Parameters
    ----------
    delimiter : str
        Characters used to split strings into substrings
    return_segments : bool
        If True, also return mapping of original strings to first
        substring in return array.

    Returns
    -------
    Strings
        Flattened substrings with delimiters removed
    pdarray, int64 (optional)
        For each original string, the index of first corresponding
        substring in the return array

    See Also
    --------
    peel, rpeel

    Examples
    --------
    >>> orig = ak.array(['one|two', 'three|four|five', 'six'])
    >>> orig.flatten('|')
    array(['one', 'two', 'three', 'four', 'five', 'six'])
    >>> flat, map = orig.flatten('|', return_segments=True)
    >>> map
    array([0, 2, 5])
    """
    request = "segmentedFlatten {}+{} {} {} {}".format(
        self.offsets.name,
        self.bytes.name,
        self.objtype,
        return_segments,
        json.dumps([delimiter]))
    reply = cast(str, generic_msg(request))

    if not return_segments:
        # Two descriptors: offsets and bytes of the flattened Strings.
        parts = reply.split('+', maxsplit=1)
        return Strings(parts[0], parts[1])

    # A third descriptor carries the segment-start indices.
    parts = reply.split('+', maxsplit=2)
    return Strings(parts[0], parts[1]), create_pdarray(parts[2])
def _binop(self, other: Union[SArrays, np.int_], op: str) -> pdarray:
    """
    Executes the requested binop on this SArrays instance and the
    parameter object and returns the results within a pdarray object.

    Parameters
    ----------
    other : SArrays or int scalar
        The right-hand operand: another SArrays of the same size, or an
        integer scalar
    op : str
        name of the binary operation to be performed

    Returns
    -------
    pdarray
        encapsulating the results of the requested binop

    Raises
    ------
    ValueError
        Raised if (1) the op is not in the self.BinOps set, or (2) if
        the sizes of this and the other instance don't match, or (3) the
        other object is neither a segmented array nor an int scalar
    RuntimeError
        Raised if a server-side error is thrown while executing the
        binary operation
    """
    if op not in self.BinOps:
        raise ValueError("SArrays: unsupported operator: {}".format(op))

    # The original test was isinstance(other, Strings) only, so an SArrays
    # operand fell through to the "not supported" error below. Strings is
    # still accepted so existing callers keep working.
    if isinstance(other, (SArrays, Strings)):
        if self.size != other.size:
            raise ValueError("SArrays: size mismatch {} {}".format(
                self.size, other.size))
        msg = "segmentedBinopvvInt {} {} {} {} {} {} {}".format(
            op,
            self.objtype, self.offsets.name, self.bytes.name,
            other.objtype, other.offsets.name, other.bytes.name)
    elif resolve_scalar_dtype(other) == 'int':
        # Scalar operand is JSON-encoded inside a one-element list.
        msg = "segmentedBinopvsInt {} {} {} {} {} {}".format(
            op,
            self.objtype, self.offsets.name, self.bytes.name,
            self.objtype, json.dumps([other]))
    else:
        raise ValueError(
            "SArrays: {} not supported between SArrays and {}".format(
                op, other.__class__.__name__))

    repMsg = generic_msg(msg)
    return create_pdarray(cast(str, repMsg))
def argmaxk(pda, k):
    """
    Find the indices of the `k` maximum values of an array.

    Parameters
    ----------
    pda : pdarray
        Input array.
    k : integer
        The desired count of maximum values to be returned by the output.

    Returns
    -------
    pdarray, int
        The indices of the maximum `k` values from pda. When `k` is 0 an
        empty Python list is returned instead.

    Raises
    ------
    TypeError
        Raised if pda is not a pdarray, or if pda is empty and k != 0

    Notes
    -----
    This call is equivalent in value to:

        ak.argsort(a)[-k:]

    and generally outperforms this operation.

    This reduction will see a significant drop in performance as `k`
    grows beyond a certain value. This value is system dependent, but
    generally about a `k` of 5 million is where performance degradation
    has been observed.

    Examples
    --------
    >>> A = ak.array([10,5,1,3,7,2,9,0])
    >>> ak.argmaxk(A, 3)
    array([4, 6, 0])
    """
    if not isinstance(pda, pdarray):
        raise TypeError("must be pdarray {}".format(pda))

    if k == 0:
        return []
    if pda.size == 0:
        raise TypeError(
            "must be a non-empty pdarray {} of type int or float".format(pda))

    # The trailing True field distinguishes this from the plain maxk request.
    request = "maxk {} {} {}".format(pda.name, k, True)
    return create_pdarray(generic_msg(request))
def get_lengths(self) -> pdarray:
    """
    Return the length of each string in the array.

    Returns
    -------
    pdarray, int
        The length of each string

    Raises
    ------
    RuntimeError
        Raised if there is a server-side error thrown
    """
    request = "segmentLengths {} {} {}".format(
        self.objtype, self.offsets.name, self.bytes.name)
    reply = generic_msg(request)
    return create_pdarray(reply)
def setdiff1d(pda1, pda2, assume_unique=False):
    """
    Find the set difference of two arrays.

    Return the sorted, unique values in `pda1` that are not in `pda2`.

    Parameters
    ----------
    pda1 : pdarray
        Input array.
    pda2 : pdarray
        Input comparison array.
    assume_unique : bool
        If True, the input arrays are both assumed to be unique, which
        can speed up the calculation. Default is False.

    Returns
    -------
    pdarray
        Sorted 1D array of values in `pda1` that are not in `pda2`.

    Raises
    ------
    TypeError
        Raised if either pda1 or pda2 is not a pdarray

    See Also
    --------
    unique, setxor1d

    Examples
    --------
    >>> a = ak.array([1, 2, 3, 2, 4, 1])
    >>> b = ak.array([3, 4, 5, 6])
    >>> ak.setdiff1d(a, b)
    array([1, 2])
    """
    if not (isinstance(pda1, pdarray) and isinstance(pda2, pdarray)):
        raise TypeError("must be pdarray {} or {}".format(pda1, pda2))

    if pda1.size == 0:
        return pda1  # nothing to subtract from: zero-length result
    if pda2.size == 0:
        # Subtracting nothing. The original returned pda1 untouched, which
        # leaked duplicates when assume_unique is False; de-duplicate so
        # the shortcut agrees with the general path below.
        return pda1 if assume_unique else unique(pda1)

    if pda1.dtype == int and pda2.dtype == int:
        # Integer inputs are handled by a dedicated server command.
        repMsg = generic_msg("setdiff1d {} {} {}".format(
            pda1.name, pda2.name, assume_unique))
        return create_pdarray(repMsg)

    if not assume_unique:
        pda1 = unique(pda1)
        pda2 = unique(pda2)
    return pda1[in1d(pda1, pda2, invert=True)]
def argsort(pda: Union[pdarray, Strings, 'Categorical']) -> pdarray:
    """
    Return the permutation that sorts the array.

    Parameters
    ----------
    pda : pdarray or Strings or Categorical
        The array to sort (int64 or float64)

    Returns
    -------
    pdarray, int64
        The indices such that ``pda[indices]`` is sorted

    Raises
    ------
    TypeError
        Raised if the parameter is other than a pdarray or Strings

    See Also
    --------
    coargsort

    Notes
    -----
    Uses a least-significant-digit radix sort, which is stable and
    resilient to non-uniformity in data but communication intensive.

    Any object exposing its own ``argsort`` method is delegated to
    directly, before any other check.

    Examples
    --------
    >>> a = ak.randint(0, 10, 10)
    >>> perm = ak.argsort(a)
    >>> a[perm]
    array([0, 1, 1, 3, 4, 5, 7, 8, 8, 9])
    """
    if hasattr(pda, "argsort"):
        return pda.argsort()
    if pda.size == 0:
        return zeros(0, dtype=int64)

    # Strings are addressed by their offsets and bytes arrays joined by '+'.
    target = ('{}+{}'.format(pda.offsets.name, pda.bytes.name)
              if isinstance(pda, Strings) else pda.name)
    reply = generic_msg("argsort {} {}".format(pda.objtype, target))
    return create_pdarray(reply)
def split(self, maxsplit: int = 0, return_segments: bool = False):
    """
    Split string by the occurrences of pattern. If maxsplit is nonzero,
    at most maxsplit splits occur.

    Parameters
    ----------
    maxsplit : int
        Upper bound on the number of splits; 0 (default) means no bound
    return_segments : bool
        If True, also return a pdarray built from the third component
        of the server reply

    Returns
    -------
    Strings
        The substrings produced by the split
    pdarray (optional)
        Only returned when `return_segments` is True
    """
    from arkouda.strings import Strings

    # "legacy_placeholder" fills the third positional field of the request;
    # its purpose is not visible from this method.
    request_args = "{} {} {} {} {} {}".format(
        self.objtype,
        self.parent_entry_name,
        "legacy_placeholder",
        maxsplit,
        return_segments,
        json.dumps([self.pattern]))
    reply = cast(str, generic_msg(cmd="segmentedSplit", args=request_args))

    if not return_segments:
        return Strings.from_return_msg(reply)

    # First two components describe the Strings, the third the segments.
    parts = reply.split('+', maxsplit=2)
    strings_msg = "+".join(parts[0:2])
    return Strings.from_return_msg(strings_msg), create_pdarray(parts[2])
def standard_normal(size: Union[int, np.int64],
                    seed: Union[None, Union[int, np.int64]] = None) -> pdarray:
    """
    Draw real numbers from the standard normal distribution.

    Parameters
    ----------
    size : Union[int,np.int64]
        The number of samples to draw (size of the returned array)
    seed : Union[int,np.int64]
        Value used to initialize the random number generator

    Returns
    -------
    pdarray, float64
        The array of random numbers

    Raises
    ------
    TypeError
        Raised if size is not an int
    ValueError
        Raised if size < 0

    See Also
    --------
    randint

    Notes
    -----
    For random samples from :math:`N(\\mu, \\sigma^2)`, use:

    ``(sigma * standard_normal(size)) + mu``

    Examples
    --------
    >>> ak.standard_normal(3,1)
    array([-0.68586185091150265, 1.1723810583573375, 0.567584107142031])
    """
    if size < 0:
        # Zero is accepted, so the message must say ">= 0"; the previous
        # "> 0" wording contradicted the check.
        raise ValueError("The size parameter must be >= 0")
    msg = "randomNormal {} {}".format(
        NUMBER_FORMAT_STRINGS['int64'].format(size), seed)
    return create_pdarray(generic_msg(msg))
def suffix_array(filename: str) -> SArrays:
    """
    Return the suffix array of the given file's content, treated as a
    single string. Mainly used for testing correctness and performance.

    As a simple example, the suffixes of the string "banana$" are::

        s[0]="banana$" s[1]="anana$" s[2]="nana$" s[3]="ana$"
        s[4]="na$"     s[5]="a$"     s[6]="$"

    The suffix array is the array of indices of the sorted suffixes::

        s[6]="$" s[5]="a$" s[3]="ana$" s[1]="anana$"
        s[0]="banana$" s[4]="na$" s[2]="nana$"

    so sa=[6,5,3,1,0,4,2].

    Parameters
    ----------
    filename : str
        Name of the file, sent unchanged to the server

    Returns
    -------
    SArrays
        The suffix array of the file's content

    Raises
    ------
    RuntimeError
        Raised if there is a server-side error in executing the request
        or creating the object encapsulating the return message
    """
    request = "segmentedSAFile {}".format(filename)
    # The reply is '+'-separated; each piece is one SArrays argument.
    components = generic_msg(request).split('+')
    return SArrays(*components)
def attach_pda(user_defined_name: str) -> pdarray:
    """
    Return a pdarray attached to a registered name in the arkouda
    server which was registered using register_pda().

    Parameters
    ----------
    user_defined_name : str
        user defined name which the array was registered under

    Returns
    -------
    pdarray
        pdarray which points to the pdarray registered with the user
        defined name in the arkouda server

    Raises
    ------
    TypeError
        Raised if user_defined_name is not a str

    See also
    --------
    register_pda, unregister_pda

    Notes
    -----
    Registered names/pdarrays in the server are immune to deletion
    until they are unregistered.

    Examples
    --------
    >>> a = zeros(100)
    >>> r_pda = ak.register_pda(a, "my_zeros")
    >>> # potentially disconnect from server and reconnect to server
    >>> b = ak.attach_pda("my_zeros")
    >>> # ...other work...
    >>> ak.unregister_pda(b)
    """
    name_is_str = isinstance(user_defined_name, str)
    if not name_is_str:
        raise TypeError("user_defined_name must be a str")

    request = "attach {}".format(user_defined_name)
    return create_pdarray(generic_msg(request))
def aggregate(self, values, operator):
    """
    Using the permutation stored in the GroupBy instance, group another
    array of values and apply a reduction to each group's values.

    Parameters
    ----------
    values : pdarray
        The values to group and reduce
    operator : str
        The name of the reduction operator to use

    Returns
    -------
    unique_keys : (list of) pdarray or Strings
        The unique keys, in grouped order
    aggregates : pdarray
        One aggregate value per unique key in the GroupBy instance

    Raises
    ------
    TypeError
        Raised if values is not a pdarray
    ValueError
        Raised if values differs in size from the GroupBy keys or if
        operator is not one of self.Reductions
    """
    if not isinstance(values, pdarray):
        raise TypeError("<values> must be a pdarray")
    if values.size != self.size:
        raise ValueError(
            "Attempt to group array using key array of different length")
    if operator not in self.Reductions:
        raise ValueError(
            "Unsupported reduction: {}\nMust be one of {}".format(
                operator, self.Reductions))

    # Values are used as given only when the keys were declared pre-sorted.
    grouped = values if self.assume_sorted else values[self.permutation]
    cmd = "segmentedLocalRdx" if self.per_locale else "segmentedReduction"

    request = "{} {} {} {}".format(
        cmd, grouped.name, self.segments.name, operator)
    reply = generic_msg(request)
    if verbose:
        print(reply)

    if not operator.startswith('arg'):
        return self.unique_keys, create_pdarray(reply)
    # 'arg*' reductions have their result indexed through the permutation.
    return self.unique_keys, self.permutation[create_pdarray(reply)]
def union1d(pda1: pdarray, pda2: pdarray) -> pdarray:
    """
    Find the union of two arrays.

    Return the unique, sorted array of values that are in either of the
    two input arrays.

    Parameters
    ----------
    pda1 : pdarray
        Input array
    pda2 : pdarray
        Input array

    Returns
    -------
    pdarray
        Unique, sorted union of the input arrays.

    Raises
    ------
    TypeError
        Raised if either pda1 or pda2 is not a pdarray

    See Also
    --------
    intersect1d, unique

    Examples
    --------
    >>> ak.union1d(ak.array([-1, 0, 1]), ak.array([-2, 0, 2]))
    array([-2, -1, 0, 1, 2])
    """
    # When one side is empty the union is the other side's unique values.
    # The original returned the other array as-is, duplicates included.
    # An empty remainder is returned directly to avoid a pointless request.
    if pda1.size == 0:
        return pda2 if pda2.size == 0 else unique(pda2)
    if pda2.size == 0:
        return unique(pda1)

    if pda1.dtype == int and pda2.dtype == int:
        # Integer inputs are handled by a dedicated server command.
        repMsg = generic_msg("union1d {} {}".format(pda1.name, pda2.name))
        return create_pdarray(repMsg)

    return unique(concatenate((unique(pda1), unique(pda2))))
def abs(pda: pdarray) -> pdarray:
    """
    Return the element-wise absolute value of the array.

    Parameters
    ----------
    pda : pdarray
        The input array

    Returns
    -------
    pdarray
        A pdarray containing absolute values of the input array elements

    Raises
    ------
    TypeError
        Raised if the parameter is not a pdarray
    """
    request = "efunc {} {}".format("abs", pda.name)
    reply = generic_msg(request)
    return create_pdarray(reply)
def find_segments(self):
    """
    Ask the server for segment boundaries and unique-key indices of the
    permuted keys, then store the results on this instance.

    Sets ``self.segments``, ``self.unique_key_indices`` and
    ``self.unique_keys``. With a single key, ``unique_keys`` is that key
    array indexed by the unique-key indices; with several keys it is a
    list with one such indexed array per key.
    """
    cmd = "findLocalSegments" if self.per_locale else "findSegments"

    single_key = self.nkeys == 1
    if single_key:
        keynames = self.keys.name
    else:
        keynames = ' '.join(key.name for key in self.keys)

    request = "{} {} {:n} {:n} {}".format(
        cmd, self.permutation.name, self.nkeys, self.size, keynames)
    reply = generic_msg(request)

    # The reply must contain exactly two '+'-separated descriptors.
    seg_attr, uniq_attr = reply.split("+")
    if verbose:
        print(seg_attr, uniq_attr)

    self.segments = create_pdarray(seg_attr)
    self.unique_key_indices = create_pdarray(uniq_attr)

    if single_key:
        self.unique_keys = self.keys[self.unique_key_indices]
    else:
        self.unique_keys = [key[self.unique_key_indices] for key in self.keys]
def group(self):
    """
    Return the permutation that groups the array, placing equivalent
    strings together.

    This permutation does NOT sort the strings. All instances of the
    same string are guaranteed to lie in one contiguous block of the
    permuted array, but the blocks are not necessarily ordered.

    Returns
    -------
    pdarray
        The permutation that groups the array by value

    See Also
    --------
    GroupBy, unique
    """
    request = "segmentedGroup {} {} {}".format(
        self.objtype, self.offsets.name, self.bytes.name)
    reply = generic_msg(request)
    return create_pdarray(reply)
def binop(self, other, op):
    """
    Apply a binary operator between this Strings object and either
    another Strings object or a scalar string.

    Parameters
    ----------
    other : Strings or str scalar
        The right-hand operand
    op : str
        Name of the operator; must be a member of self.BinOps

    Returns
    -------
    pdarray
        The result built from the server reply

    Raises
    ------
    ValueError
        Raised if op is not in self.BinOps, if other is a Strings of a
        different size, or if other is neither a Strings nor a value
        that resolves to the 'str' scalar dtype
    """
    if op not in self.BinOps:
        raise ValueError("Strings: unsupported operator: {}".format(op))

    own_fields = (self.objtype, self.offsets.name, self.bytes.name)

    if isinstance(other, Strings):
        if self.size != other.size:
            raise ValueError("Strings: size mismatch {} {}".format(
                self.size, other.size))
        request = "segmentedBinopvv {} {} {} {} {} {} {}".format(
            op, *own_fields,
            other.objtype, other.offsets.name, other.bytes.name)
    elif resolve_scalar_dtype(other) == 'str':
        # The scalar is embedded in the request as-is (no encoding).
        request = "segmentedBinopvs {} {} {} {} {} {}".format(
            op, *own_fields, self.objtype, other)
    else:
        raise ValueError(
            "Strings: {} not supported between Strings and {}".format(
                op, type(other)))

    reply = generic_msg(request)
    return create_pdarray(reply)
def in1d(pda1, pda2, invert=False):
    """
    Test whether each element of a 1-D array is also present in a
    second array.

    Returns a boolean array the same length as `pda1` that is True where
    an element of `pda1` is in `pda2` and False otherwise.

    Parameters
    ----------
    pda1 : pdarray
        Input array.
    pda2 : pdarray
        The values against which to test each value of `pda1`.
    invert : bool, optional
        If True, the values in the returned array are inverted (that is,
        False where an element of `pda1` is in `pda2` and True
        otherwise). Default is False. ``ak.in1d(a, b, invert=True)`` is
        equivalent to (but is faster than) ``~ak.in1d(a, b)``.

    Returns
    -------
    pdarray, bool
        The values `pda1[in1d]` are in `pda2`.

    Raises
    ------
    TypeError
        Raised if either pda1 or pda2 is not a pdarray

    See Also
    --------
    unique, intersect1d, union1d

    Notes
    -----
    `in1d` can be considered as an element-wise function version of the
    python keyword `in`, for 1-D sequences. ``in1d(a, b)`` is logically
    equivalent to ``ak.array([item in b for item in a])``, but is much
    faster and scales to arbitrarily large ``a``.
    """
    both_pdarrays = isinstance(pda1, pdarray) and isinstance(pda2, pdarray)
    if not both_pdarrays:
        raise TypeError("must be pdarray {} or {}".format(pda1, pda2))

    request = "in1d {} {} {}".format(pda1.name, pda2.name, invert)
    return create_pdarray(generic_msg(request))
def random_strings_lognormal(logmean, logstd, size, characters='uppercase'):
    r"""
    Generate random strings with log-normally distributed lengths and
    with characters drawn from a specified set.

    Parameters
    ----------
    logmean : float
        The log-mean of the length distribution
    logstd : float
        The log-standard-deviation of the length distribution
    size : int
        The number of strings to generate
    characters : (uppercase, lowercase, numeric, printable, binary)
        The set of characters to draw from

    Returns
    -------
    Strings
        The array of random strings

    Raises
    ------
    ValueError
        Raised if logstd <= 0 or size < 0

    See Also
    --------
    random_strings_uniform, randint

    Notes
    -----
    The lengths of the generated strings are distributed
    $Lognormal(\mu, \sigma^2)$, with :math:`\mu = logmean` and
    :math:`\sigma = logstd`. Thus, the strings will have an average
    length of :math:`exp(\mu + 0.5*\sigma^2)`, a minimum length of zero,
    and a heavy tail towards longer strings.
    """
    # The docstring is a raw string because it contains LaTeX backslashes
    # (\mu, \sigma) that are invalid escape sequences otherwise.
    if logstd <= 0 or size < 0:
        raise ValueError("Incompatible arguments")
    msg = "randomStrings {} {} {} {} {}".format(
        NUMBER_FORMAT_STRINGS['int64'].format(size),
        "lognormal",
        characters,
        NUMBER_FORMAT_STRINGS['float64'].format(logmean),
        NUMBER_FORMAT_STRINGS['float64'].format(logstd))
    repMsg = generic_msg(msg)
    return Strings(*(repMsg.split('+')))
def sin(pda: pdarray) -> pdarray:
    """
    Return the element-wise sine of the array.

    Parameters
    ----------
    pda : pdarray
        The input array

    Returns
    -------
    pdarray
        A pdarray containing sin for each element of the original pdarray

    Raises
    ------
    TypeError
        Raised if the parameter is not a pdarray
    """
    request_args = "{} {}".format("sin", pda.name)
    reply = generic_msg(cmd="efunc", args=request_args)
    return create_pdarray(type_cast(str, reply))
def cos(pda: pdarray) -> pdarray:
    """
    Return the element-wise cosine of the array.

    Parameters
    ----------
    pda : pdarray
        The input array

    Returns
    -------
    pdarray
        A pdarray containing cosine for each element of the original
        pdarray

    Raises
    ------
    TypeError
        Raised if the parameter is not a pdarray
    """
    request = "efunc {} {}".format("cos", pda.name)
    reply = generic_msg(request)
    return create_pdarray(reply)
def hash(self):
    """
    Compute a 128-bit hash of each string.

    Returns
    -------
    (pdarray, pdarray)
        A pair of int64 pdarrays. The ith hash value is the
        concatenation of the ith values from each array.

    Notes
    -----
    The implementation uses SipHash128, a fast and balanced hash
    function (used by Python for dictionaries and sets). For realistic
    numbers of strings (up to about 10**15), the probability of a
    collision between two 128-bit hash values is negligible.
    """
    request = "segmentedHash {} {} {}".format(
        self.objtype, self.offsets.name, self.bytes.name)
    reply = generic_msg(request)
    # The reply must hold exactly two '+'-separated array descriptors.
    first_half, second_half = reply.split('+')
    return create_pdarray(first_half), create_pdarray(second_half)