def unregister_categorical_by_name(user_defined_name: str) -> None:
    """
    Unregister every server-side component of a Categorical that was
    previously registered with the arkouda server via register().

    Parameters
    ----------
    user_defined_name : str
        Name under which the Categorical object was registered

    Raises
    ------
    TypeError
        if user_defined_name is not a string
    RegistrationError
        if there is an issue attempting to unregister any underlying
        components

    See Also
    --------
    register, unregister, attach, is_registered
    """
    # Mandatory components: the category labels and the integer codes.
    Strings.unregister_strings_by_name(f"{user_defined_name}.categories")
    unregister_pdarray_by_name(f"{user_defined_name}.codes")

    # Optional components are removed only when the registry lists them.
    registered = list_registry()
    for suffix in ("permutation", "segments"):
        entry = f"{user_defined_name}.{suffix}"
        if entry in registered:
            unregister_pdarray_by_name(entry)
def unique(pda, return_counts=False):
    """
    Find the unique elements of an array.

    Returns the unique elements of an array, sorted if the values are
    integers. Optionally also returns the number of times each unique
    value occurs in the input array.

    Parameters
    ----------
    pda : pdarray or Strings or Categorical
        Input array.
    return_counts : bool, optional
        If True, also return the number of times each unique item appears
        in `pda`.

    Returns
    -------
    unique : pdarray or Strings
        The unique values. If input dtype is int64, return values will be
        sorted.
    unique_counts : pdarray, optional
        The number of times each of the unique values comes up in the
        original array. Only provided if `return_counts` is True.

    Raises
    ------
    TypeError
        Raised if pda has no ``unique`` method and is neither a pdarray
        nor a Strings object

    Notes
    -----
    For integer arrays, this function checks to see whether `pda` is sorted
    and, if so, whether it is already unique. This step can save
    considerable computation. Otherwise, this function will sort `pda`.

    Objects that expose their own ``unique`` method are delegated to
    directly; `return_counts` is not forwarded in that case.

    Examples
    --------
    >>> A = ak.array([3, 2, 1, 1, 2, 3])
    >>> ak.unique(A)
    array([1, 2, 3])
    """
    # Anything that knows how to deduplicate itself handles the request.
    if hasattr(pda, 'unique'):
        return pda.unique()

    if isinstance(pda, pdarray):
        reply = generic_msg(
            f"unique {pda.objtype} {pda.name} {return_counts}")
        if not return_counts:
            return create_pdarray(reply)
        pieces = reply.split("+")
        if verbose:
            print(pieces)
        return create_pdarray(pieces[0]), create_pdarray(pieces[1])

    if isinstance(pda, Strings):
        # A Strings object is addressed by its offsets and bytes arrays.
        joined_name = f"{pda.offsets.name}+{pda.bytes.name}"
        reply = generic_msg(
            f"unique {pda.objtype} {joined_name} {return_counts}")
        pieces = reply.split('+')
        if verbose:
            print(pieces)
        values = Strings(pieces[0], pieces[1])
        if return_counts:
            return values, create_pdarray(pieces[2])
        return values

    raise TypeError("must be pdarray or Strings {}".format(pda))
def random_strings_uniform(minlen, maxlen, size, characters='uppercase'):
    """
    Generate random strings with lengths uniformly distributed between
    minlen and maxlen, and with characters drawn from a specified set.

    Parameters
    ----------
    minlen : int
        The minimum allowed length of string
    maxlen : int
        The maximum allowed length of string
    size : int
        The number of strings to generate
    characters : (uppercase, lowercase, numeric, printable, binary)
        The set of characters to draw from

    Returns
    -------
    Strings
        The array of random strings

    Raises
    ------
    ValueError
        Raised if minlen < 0, maxlen < minlen, or size < 0

    See Also
    --------
    random_strings_lognormal, randint
    """
    if minlen < 0 or maxlen < minlen or size < 0:
        raise ValueError("Incompatible arguments")

    # Integers go through the shared int64 format string before being
    # placed in the request.
    int_fmt = NUMBER_FORMAT_STRINGS['int64']
    fields = (int_fmt.format(size), "uniform", characters,
              int_fmt.format(minlen), int_fmt.format(maxlen))
    reply = generic_msg("randomStrings {} {} {} {} {}".format(*fields))

    # The reply is '+'-separated; each piece is one Strings argument.
    pieces = reply.split('+')
    return Strings(*pieces)
def random_strings_lognormal(logmean: Union[float, int], logstd: float,
                             size: int,
                             characters: str = 'uppercase') -> Strings:
    r"""
    Generate random strings with log-normally distributed lengths and
    with characters drawn from a specified set.

    Parameters
    ----------
    logmean : Union[float, int]
        The log-mean of the length distribution
    logstd : float
        The log-standard-deviation of the length distribution
    size : int
        The number of strings to generate
    characters : (uppercase, lowercase, numeric, printable, binary)
        The set of characters to draw from

    Returns
    -------
    Strings
        The array of random strings

    Raises
    ------
    TypeError
        Raised if logmean is not a float or int, logstd is not a float,
        size is not an int, or if characters is not a str
    ValueError
        Raised if logstd <= 0 or size < 0

    See Also
    --------
    random_strings_uniform, randint

    Notes
    -----
    The lengths of the generated strings are distributed
    $Lognormal(\mu, \sigma^2)$, with :math:`\mu = logmean` and
    :math:`\sigma = logstd`. Thus, the strings will have an average length
    of :math:`exp(\mu + 0.5*\sigma^2)`, a minimum length of zero, and a
    heavy tail towards longer strings.
    """
    # NOTE: the docstring is a raw string so the LaTeX backslashes are not
    # parsed as (invalid) escape sequences.
    if not isinstance(logmean, (float, int)):
        raise TypeError("The logmean must be a float or int")
    if not isinstance(logstd, float):
        raise TypeError("The logstd must be a float")
    if not isinstance(size, int):
        raise TypeError("The size must be an integer")
    if not isinstance(characters, str):
        raise TypeError("characters must be a str")
    if logstd <= 0 or size < 0:
        raise ValueError("Incompatible arguments: logstd <= 0 or size < 0")

    float_fmt = NUMBER_FORMAT_STRINGS['float64']
    msg = "randomStrings {} {} {} {} {}".format(
        NUMBER_FORMAT_STRINGS['int64'].format(size),
        "lognormal",
        characters,
        float_fmt.format(logmean),
        float_fmt.format(logstd))
    repMsg = generic_msg(msg)
    # The reply is '+'-separated; each piece is one Strings argument.
    return Strings(*(repMsg.split('+')))
def concatenate(arrays):
    """
    Concatenate a sequence of ``pdarray`` (or ``Strings``) objects into one
    object of the same kind.

    Parameters
    ----------
    arrays : sequence of ``pdarray`` or Strings or Categorical
        The arrays to concatenate. Must all be of the same kind and, for
        pdarrays, the same dtype. Must support ``len()`` and indexing.

    Returns
    -------
    pdarray or Strings
        Single array containing all values, in original order. A
        one-element input returns that element unchanged. If the first
        element has its own ``concatenate`` method, its result is returned.

    Raises
    ------
    ValueError
        Raised if `arrays` is empty, contains something other than
        pdarray/Strings, mixes pdarrays and Strings, or holds pdarrays of
        differing dtype
    NotImplementedError
        Raised if the object type is neither "pdarray" nor "str"

    Examples
    --------
    >>> ak.concatenate([ak.array([1, 2, 3]), ak.array([4, 5, 6])])
    array([1, 2, 3, 4, 5, 6])
    """
    if len(arrays) < 1:
        raise ValueError("concatenate called on empty iterable")
    if len(arrays) == 1:
        return arrays[0]
    # Objects with their own concatenate method handle the remaining
    # arguments themselves.
    if hasattr(arrays[0], 'concatenate'):
        return arrays[0].concatenate(arrays[1:])

    size = 0
    objtype = None
    dtype = None
    names = []
    for a in arrays:
        if not isinstance(a, (pdarray, Strings)):
            raise ValueError(
                "Argument must be an iterable of pdarrays or Strings")
        if objtype is None:
            objtype = a.objtype
        elif a.objtype != objtype:
            # Without this guard a mixed input either trips the dtype check
            # with a misleading message or fails on a missing attribute.
            raise ValueError("All arrays must have the same object type")
        if objtype == "pdarray":
            if dtype is None:
                dtype = a.dtype
            elif dtype != a.dtype:
                raise ValueError("All pdarrays must have same dtype")
            names.append(a.name)
        elif objtype == "str":
            # Strings are addressed by their offsets and bytes arrays.
            names.append('{}+{}'.format(a.offsets.name, a.bytes.name))
        else:
            raise NotImplementedError(
                "concatenate not implemented for object type {}".format(
                    objtype))
        size += a.size

    # Nothing to concatenate: skip the server call.
    if size == 0:
        if objtype == "pdarray":
            return zeros_like(arrays[0])
        return arrays[0]

    repMsg = generic_msg("concatenate {} {} {}".format(
        len(arrays), objtype, ' '.join(names)))
    if objtype == "pdarray":
        return create_pdarray(repMsg)
    # objtype == "str": the reply is '+'-separated Strings arguments.
    return Strings(*(repMsg.split('+')))
def random_strings_uniform(
        minlen: Union[int, np.int64],
        maxlen: Union[int, np.int64],
        size: Union[int, np.int64],
        characters: str = 'uppercase',
        seed: Union[None, Union[int, np.int64]] = None) -> Strings:
    """
    Generate random strings with lengths uniformly distributed between
    minlen and maxlen, and with characters drawn from a specified set.

    Parameters
    ----------
    minlen : Union[int,np.int64]
        The minimum allowed length of string
    maxlen : Union[int,np.int64]
        The maximum allowed length of string
    size : Union[int,np.int64]
        The number of strings to generate
    characters : (uppercase, lowercase, numeric, printable, binary)
        The set of characters to draw from
    seed : Union[None, Union[int,np.int64]], optional
        Value used to initialize the random number generator

    Returns
    -------
    Strings
        The array of random strings

    Raises
    ------
    ValueError
        Raised if minlen < 0, maxlen < minlen, or size < 0

    See Also
    --------
    random_strings_lognormal, randint

    Examples
    --------
    >>> ak.random_strings_uniform(minlen=1, maxlen=5, seed=1, size=5)
    array(['TVKJ', 'EWAB', 'CO', 'HFMD', 'U'])

    >>> ak.random_strings_uniform(minlen=1, maxlen=5, seed=1, size=5,
    ... characters='printable')
    array(['+5"f', '-P]3', '4k', '~HFF', 'F'])
    """
    if minlen < 0 or maxlen < minlen or size < 0:
        raise ValueError(
            "Incompatible arguments: minlen < 0, maxlen < minlen, "
            "or size < 0")

    int_fmt = NUMBER_FORMAT_STRINGS['int64']
    # The seed is placed in the request verbatim (the text "None" when no
    # seed was given).
    request = "randomStrings {} {} {} {} {} {}".format(
        int_fmt.format(size),
        "uniform",
        characters,
        int_fmt.format(minlen),
        int_fmt.format(maxlen),
        seed)
    reply = cast(str, generic_msg(request))
    # The reply is '+'-separated; each piece is one Strings argument.
    pieces = reply.split('+')
    return Strings(*pieces)
def split(self, maxsplit: int = 0, return_segments: bool = False):
    """
    Split string by the occurrences of pattern. If maxsplit is nonzero,
    at most maxsplit splits occur.

    Parameters
    ----------
    maxsplit : int
        Upper bound on the number of splits; 0 means no limit
    return_segments : bool
        If True, also return the pdarray built from the third component
        of the server reply

    Returns
    -------
    Strings
        The split pieces
    pdarray (optional)
        Only returned when return_segments is True
    """
    from arkouda.strings import Strings

    fields = (self.objtype, self.parent_entry_name, "legacy_placeholder",
              maxsplit, return_segments, json.dumps([self.pattern]))
    reply = cast(str, generic_msg(cmd="segmentedSplit",
                                  args="{} {} {} {} {} {}".format(*fields)))
    if not return_segments:
        return Strings.from_return_msg(reply)

    # First two '+'-separated pieces describe the Strings, the rest the
    # segments array.
    pieces = reply.split('+', maxsplit=2)
    strings_msg = "+".join(pieces[:2])
    return Strings.from_return_msg(strings_msg), create_pdarray(pieces[2])
def cast(pda: Union[pdarray, Strings], dt: Union[np.dtype, str]) -> Union[pdarray, Strings]:
    """
    Cast an array to another dtype.

    Parameters
    ----------
    pda : pdarray or Strings
        The array of values to cast
    dt : np.dtype or str
        The target dtype to cast values to

    Returns
    -------
    pdarray or Strings
        Array of values cast to desired dtype

    Notes
    -----
    The cast is performed according to Chapel's casting rules and is NOT
    safe from overflows or underflows. The user must ensure that the
    target dtype has the precision and capacity to hold the desired result.

    Examples
    --------
    >>> ak.cast(ak.linspace(1.0,5.0,5), dt=ak.int64)
    array([1, 2, 3, 4, 5])

    >>> ak.cast(ak.arange(0,5), dt=ak.float64).dtype
    dtype('float64')

    >>> ak.cast(ak.arange(0,5), dt=ak.bool)
    array([False, True, True, True, True])

    >>> ak.cast(ak.linspace(0,4,5), dt=ak.bool)
    array([False, True, True, True, True])
    """
    if isinstance(pda, pdarray):
        name, objtype = pda.name, "pdarray"
    elif isinstance(pda, Strings):
        # A Strings object is addressed by its offsets and bytes arrays.
        name, objtype = '+'.join([pda.offsets.name, pda.bytes.name]), "str"
    # typechecked decorator guarantees no other case

    target = _as_dtype(dt)
    opt = ""  # empty option slot; keeps the trailing space in the request
    reply = generic_msg(cmd="cast",
                        args=f"{name} {objtype} {target.name} {opt}")
    text = type_cast(str, reply)
    if target.name.startswith("str"):
        return Strings(*text.split("+"))
    return create_pdarray(text)
def sub(self, repl: str, count: int = 0, return_num_subs: bool = False):
    """
    Return the Strings obtained by replacing non-overlapping occurrences
    of pattern with the replacement repl.

    Parameters
    ----------
    repl : str
        The replacement text
    count : int
        If nonzero, at most count substitutions occur
    return_num_subs : bool
        If True, also return the number of substitutions that occurred

    Returns
    -------
    Strings
        The Strings after substitution
    pdarray (optional)
        Only returned when return_num_subs is True
    """
    from arkouda.strings import Strings

    fields = (self.objtype, self.parent_entry_name, "legacy_placeholder",
              repl, count, return_num_subs, json.dumps([self.pattern]))
    reply = cast(str, generic_msg(
        cmd="segmentedSub", args="{} {} {} {} {} {} {}".format(*fields)))
    if not return_num_subs:
        return Strings.from_return_msg(reply)

    # First two '+'-separated pieces describe the Strings, the rest the
    # substitution counts.
    pieces = reply.split('+', maxsplit=2)
    strings_msg = "+".join(pieces[:2])
    return Strings.from_return_msg(strings_msg), create_pdarray(pieces[2])
def findall(self, return_match_origins: bool = False):
    """
    Return all non-overlapping matches of pattern in Strings as a new
    Strings object.

    Parameters
    ----------
    return_match_origins : bool
        If True, also return the pdarray built from the third component
        of the server reply

    Returns
    -------
    Strings
        The matches
    pdarray (optional)
        Only returned when return_match_origins is True
    """
    from arkouda.strings import Strings

    # Runs first; the location arrays read below are expected to be
    # available on self afterwards.
    self.find_locations()

    fields = (self.objtype, self.parent_entry_name, "legacy_placeholder",
              self.num_matches.name, self.starts.name, self.lengths.name,
              self.indices.name, return_match_origins)
    reply = cast(str, generic_msg(
        cmd="segmentedFindAll",
        args="{} {} {} {} {} {} {} {}".format(*fields)))
    if not return_match_origins:
        return Strings.from_return_msg(reply)

    # First two '+'-separated pieces describe the Strings, the rest the
    # match origins.
    pieces = reply.split('+', maxsplit=2)
    strings_msg = "+".join(pieces[:2])
    return Strings.from_return_msg(strings_msg), create_pdarray(pieces[2])
def find_matches(self, return_match_origins: bool = False):
    """
    Return all matches as a new Strings object

    Parameters
    ----------
    return_match_origins: bool
        If True, return a pdarray containing the index of the original
        string each pattern match is from

    Returns
    -------
    Strings
        Strings object containing only matches
    pdarray, int64 (optional)
        The index of the original string each pattern match is from

    Raises
    ------
    RuntimeError
        Raised if there is a server-side error thrown

    Examples
    --------
    >>> strings = ak.array(['1_2___', '____', '3', '__4___5____6___7', ''])
    >>> strings.search('_+').find_matches(return_match_origins=True)
    (array(['_', '____', '__']), array([0 1 3]))
    """
    from arkouda.strings import Strings

    # The match information already stored on this object is sent by name.
    fields = (self._objtype, self._parent_entry_name, "legacy_placeholder",
              self._matched.name, self._starts.name, self._lengths.name,
              self._indices.name, return_match_origins)
    reply = cast(str, generic_msg(
        cmd="segmentedFindAll",
        args="{} {} {} {} {} {} {} {}".format(*fields)))
    if not return_match_origins:
        return Strings.from_return_msg(reply)

    # First two '+'-separated pieces describe the Strings, the rest the
    # match origins.
    pieces = reply.split('+', maxsplit=2)
    strings_msg = "+".join(pieces[:2])
    return Strings.from_return_msg(strings_msg), create_pdarray(pieces[2])
def random_strings_uniform(minlen: int, maxlen: int, size: int,
                           characters: str = 'uppercase',
                           seed: Union[None, int] = None) -> Strings:
    """
    Generate random strings with lengths uniformly distributed between
    minlen and maxlen, and with characters drawn from a specified set.

    Parameters
    ----------
    minlen : int
        The minimum allowed length of string
    maxlen : int
        The maximum allowed length of string
    size : int
        The number of strings to generate
    characters : (uppercase, lowercase, numeric, printable, binary)
        The set of characters to draw from
    seed : int, optional
        Passed to the server verbatim as the last request field

    Returns
    -------
    Strings
        The array of random strings

    Raises
    ------
    ValueError
        Raised if minlen < 0, maxlen < minlen, or size < 0

    See Also
    --------
    random_strings_lognormal, randint
    """
    if minlen < 0 or maxlen < minlen or size < 0:
        raise ValueError(
            "Incompatible arguments: minlen < 0, maxlen < minlen, "
            "or size < 0")

    int_fmt = NUMBER_FORMAT_STRINGS['int64']
    request = "randomStrings {} {} {} {} {} {}".format(
        int_fmt.format(size),
        "uniform",
        characters,
        int_fmt.format(minlen),
        int_fmt.format(maxlen),
        seed)
    reply = cast(str, generic_msg(request))
    # The reply is '+'-separated; each piece is one Strings argument.
    pieces = reply.split('+')
    return Strings(*pieces)
def attach(user_defined_name: str) -> Categorical:
    """
    Return a Categorical object attached to the registered name in the
    arkouda server, which was registered using register().

    Parameters
    ----------
    user_defined_name : str
        user defined name which Categorical object was registered under

    Returns
    -------
    Categorical
        The Categorical object created by re-attaching to the
        corresponding server components

    Raises
    ------
    TypeError
        if user_defined_name is not a string

    See Also
    --------
    register, is_registered, unregister, unregister_categorical_by_name
    """
    # Required components are attached unconditionally.
    components = {}
    components["categories"] = Strings.attach(
        f"{user_defined_name}.categories")
    components["codes"] = pdarray.attach(f"{user_defined_name}.codes")

    # Optional components are attached only when the registry lists them.
    registered = list_registry()
    for optional in ("permutation", "segments"):
        entry = f"{user_defined_name}.{optional}"
        if entry in registered:
            components[optional] = pdarray.attach(entry)

    # No raw values are passed; the Categorical is built from its parts.
    result = Categorical(None, **components)
    result.name = user_defined_name  # Update our name
    return result
def random_strings_lognormal(logmean, logstd, size, characters='uppercase'):
    r"""
    Generate random strings with log-normally distributed lengths and
    with characters drawn from a specified set.

    Parameters
    ----------
    logmean : float
        The log-mean of the length distribution
    logstd : float
        The log-standard-deviation of the length distribution
    size : int
        The number of strings to generate
    characters : (uppercase, lowercase, numeric, printable, binary)
        The set of characters to draw from

    Returns
    -------
    Strings
        The array of random strings

    Raises
    ------
    ValueError
        Raised if logstd <= 0 or size < 0

    See Also
    --------
    random_strings_uniform, randint

    Notes
    -----
    The lengths of the generated strings are distributed
    $Lognormal(\mu, \sigma^2)$, with :math:`\mu = logmean` and
    :math:`\sigma = logstd`. Thus, the strings will have an average length
    of :math:`exp(\mu + 0.5*\sigma^2)`, a minimum length of zero, and a
    heavy tail towards longer strings.
    """
    # NOTE: the docstring is a raw string so the LaTeX backslashes are not
    # parsed as (invalid) escape sequences.
    if logstd <= 0 or size < 0:
        raise ValueError("Incompatible arguments")

    float_fmt = NUMBER_FORMAT_STRINGS['float64']
    msg = "randomStrings {} {} {} {} {}".format(
        NUMBER_FORMAT_STRINGS['int64'].format(size),
        "lognormal",
        characters,
        float_fmt.format(logmean),
        float_fmt.format(logstd))
    repMsg = generic_msg(msg)
    # The reply is '+'-separated; each piece is one Strings argument.
    return Strings(*(repMsg.split('+')))
def read_hdf(dsetName, filenames):
    """
    Read a single dataset from multiple HDF5 files into an arkouda
    pdarray (or Strings).

    Parameters
    ----------
    dsetName : str
        The name of the dataset (must be the same across all files)
    filenames : list or str
        Either a list of filenames or shell expression

    Returns
    -------
    pdarray or Strings
        An object pointing to the server-side data read in. A Strings is
        returned when the server reply contains a '+'.

    See Also
    --------
    get_datasets, ls_hdf, read_all, load, save

    Notes
    -----
    If filenames is a string, it is wrapped in a one-element list before
    being sent to the server.

    Use ``get_datasets`` to show the names of datasets in HDF5 files.

    If dsetName is not present in all files, a RuntimeError is raised.
    """
    if isinstance(filenames, str):
        filenames = [filenames]

    request = f"readhdf {dsetName} {len(filenames):n} {json.dumps(filenames)}"
    rep_msg = generic_msg(request)

    # This is a hack to detect a string return type
    # In the future, we should put the number and type into the return
    # message
    if '+' not in rep_msg:
        return create_pdarray(rep_msg)
    return Strings(*rep_msg.split('+'))
def cast(pda: Union[pdarray, Strings], dt) -> Union[pdarray, Strings]:
    """
    Cast an array to another dtype.

    Parameters
    ----------
    pda : pdarray or Strings
        The array of values to cast
    dt : np.dtype or str
        The target dtype to cast values to

    Returns
    -------
    pdarray or Strings
        Array of values cast to desired dtype

    Notes
    -----
    The cast is performed according to Chapel's casting rules and is NOT
    safe from overflows or underflows. The user must ensure that the
    target dtype has the precision and capacity to hold the desired result.
    """
    if isinstance(pda, pdarray):
        name, objtype = pda.name, "pdarray"
    elif isinstance(pda, Strings):
        # A Strings object is addressed by its offsets and bytes arrays.
        name, objtype = '+'.join([pda.offsets.name, pda.bytes.name]), "str"
    # typechecked decorator guarantees no other case

    target = _as_dtype(dt)
    opt = ""  # empty option slot; keeps the trailing space in the request
    reply = generic_msg(f"cast {name} {objtype} {target.name} {opt}")
    text = type_cast(str, reply)
    if target.name.startswith("str"):
        return Strings(*text.split("+"))
    return create_pdarray(text)
def array(a: Union[pdarray, np.ndarray, Iterable]) -> Union[pdarray, Strings]:
    """
    Convert an iterable to a pdarray or Strings object, sending the
    corresponding data to the arkouda server.

    Parameters
    ----------
    a : Union[pdarray, np.ndarray, Iterable]
        Rank-1 array of a supported dtype

    Returns
    -------
    pdarray or Strings
        A pdarray instance stored on arkouda server or Strings instance,
        which is composed of two pdarrays stored on arkouda server

    Raises
    ------
    TypeError
        Raised if a is not a pdarray, np.ndarray, or Python Iterable such
        as a list, array, tuple, or deque
    RuntimeError
        If a is not one-dimensional, nbytes > maxTransferBytes, a.dtype is
        not supported (not in DTypes), or if the product of a size and
        a.itemsize > maxTransferBytes

    See Also
    --------
    pdarray.to_ndarray

    Notes
    -----
    The number of bytes in the input array cannot exceed
    `arkouda.maxTransferBytes`, otherwise a RuntimeError will be raised.
    This is to protect the user from overwhelming the connection between
    the Python client and the arkouda server, under the assumption that it
    is a low-bandwidth connection. The user may override this limit by
    setting ak.maxTransferBytes to a larger value, but should proceed with
    caution.

    If the pdarray or ndarray is of type U, this method is called twice
    recursively to create the Strings object and the two corresponding
    pdarrays for string bytes and offsets, respectively.

    Examples
    --------
    >>> a = [3, 5, 7]
    >>> b = ak.array(a)
    >>> b
    array([3, 5, 7])

    >>> type(b)
    arkouda.pdarray
    """
    # If a is already a pdarray, do nothing
    if isinstance(a, pdarray):
        return a
    from arkouda.client import maxTransferBytes
    # If a is not already a numpy.ndarray, convert it
    if not isinstance(a, np.ndarray):
        try:
            a = np.array(a)
        except Exception as err:
            # Exception (not a bare except) so KeyboardInterrupt and
            # SystemExit still propagate; the cause is chained.
            raise TypeError(
                ('a must be a pdarray, np.ndarray, or convertible to' +
                 ' a numpy array')) from err
    # Only rank 1 arrays currently supported
    if a.ndim != 1:
        raise RuntimeError("Only rank-1 pdarrays or ndarrays supported")
    # Check if array of strings
    if 'U' in a.dtype.kind:
        encoded = [elem.encode() for elem in a]
        # Length of each string, plus null byte terminator. The explicit
        # dtype keeps an empty input integral.
        lengths = np.array([len(elem) for elem in encoded],
                           dtype=np.int64) + 1
        # Compute zero-up segment offsets
        offsets = np.cumsum(lengths) - lengths
        # Total payload size; summing (rather than indexing the last
        # element) also works when there are no strings at all.
        nbytes = int(lengths.sum())
        if nbytes > maxTransferBytes:
            raise RuntimeError(
                ("Creating pdarray would require transferring {} bytes," +
                 " which exceeds allowed transfer size. Increase " +
                 "ak.maxTransferBytes to force.").format(nbytes))
        # Null-terminated segments joined once, then viewed as uint8.
        values = np.frombuffer(
            b"".join(elem + b"\x00" for elem in encoded), dtype=np.uint8)
        # Recurse to create pdarrays for offsets and values, then return
        # Strings object
        return Strings(array(offsets), array(values))
    # If not strings, then check that dtype is supported in arkouda
    if a.dtype.name not in DTypes:
        raise RuntimeError("Unhandled dtype {}".format(a.dtype))
    # Do not allow arrays that are too large
    size = a.size
    if (size * a.itemsize) > maxTransferBytes:
        raise RuntimeError(("Array exceeds allowed transfer size. Increase " +
                            "ak.maxTransferBytes to allow"))
    # Pack binary array data into a bytes object with a command header
    # including the dtype and size
    fmt = ">{:n}{}".format(size, structDtypeCodes[a.dtype.name])
    req_msg = "array {} {:n} ".\
        format(a.dtype.name, size).encode() + struct.pack(fmt, *a)
    repMsg = generic_msg(req_msg, send_bytes=True)
    return create_pdarray(cast(str, repMsg))
def random_strings_lognormal(logmean: Union[float, int],
                             logstd: Union[float, int], size: int,
                             characters: str = 'uppercase',
                             seed: Union[None, int] = None) -> Strings:
    r"""
    Generate random strings with log-normally distributed lengths and
    with characters drawn from a specified set.

    Parameters
    ----------
    logmean : Union[float, int]
        The log-mean of the length distribution
    logstd : Union[float, int]
        The log-standard-deviation of the length distribution
    size : int
        The number of strings to generate
    characters : (uppercase, lowercase, numeric, printable, binary)
        The set of characters to draw from
    seed : int
        Value used to initialize the random number generator

    Returns
    -------
    Strings
        The Strings object encapsulating a pdarray of random strings

    Raises
    ------
    ValueError
        Raised if logstd <= 0 or size < 0

    See Also
    --------
    random_strings_uniform, randint

    Notes
    -----
    The lengths of the generated strings are distributed
    $Lognormal(\mu, \sigma^2)$, with :math:`\mu = logmean` and
    :math:`\sigma = logstd`. Thus, the strings will have an average length
    of :math:`exp(\mu + 0.5*\sigma^2)`, a minimum length of zero, and a
    heavy tail towards longer strings.

    Examples
    --------
    >>> ak.random_strings_lognormal(2, 0.25, 5, seed=1)
    array(['TVKJTE', 'ABOCORHFM', 'LUDMMGTB', 'KWOQNPHZ', 'VSXRRL'])

    >>> ak.random_strings_lognormal(2, 0.25, 5, seed=1, characters='printable')
    array(['+5"fp-', ']3Q4kC~HF', '=F=`,IE!', 'DjkBa'9(', '5oZ1)='])
    """
    if logstd <= 0 or size < 0:
        raise ValueError("Incompatible arguments: logstd <= 0 or size < 0")

    size_text = NUMBER_FORMAT_STRINGS['int64'].format(size)
    float_fmt = NUMBER_FORMAT_STRINGS['float64']
    request = "randomStrings {} {} {} {} {} {}".format(
        size_text,
        "lognormal",
        characters,
        float_fmt.format(logmean),
        float_fmt.format(logstd),
        seed)
    reply = cast(str, generic_msg(request))
    # The reply is '+'-separated; each piece is one Strings argument.
    pieces = reply.split('+')
    return Strings(*pieces)
def read_all(filenames, datasets=None, iterative=False):
    """
    Read datasets from HDF5 files.

    Parameters
    ----------
    filenames : list or str
        Either a list of filenames or shell expression
    datasets : list or str or None
        (List of) name(s) of dataset(s) to read (default: all available)
    iterative : boolean
        Iterative (True) or Single (False) function call(s) to server

    Returns
    -------
    pdarray, Strings, or dict
        With iterative == True, always a dictionary of
        {datasetName: pdarray or Strings}. Otherwise a dictionary when the
        server reply contains a ',', else a single Strings or pdarray.

    Raises
    ------
    ValueError
        Raised if a requested dataset is not among the datasets of the
        first file

    See Also
    --------
    read_hdf, get_datasets, ls_hdf

    Notes
    -----
    If filenames is a string, it is wrapped in a one-element list before
    being sent to the server.

    If iterative == True each dataset name and file names are passed to
    the server as independent sequential strings while if iterative == False
    all dataset names and file names are passed to the server in a single
    string.

    If datasets is None, infer the names of datasets from the first file
    and read all of them. Use ``get_datasets`` to show the names of datasets
    in HDF5 files.

    If not all datasets are present in all HDF5 files, a RuntimeError
    is raised.
    """
    if isinstance(filenames, str):
        filenames = [filenames]

    # Only the first file is inspected, both for inferring the dataset
    # names and for validating requested ones.
    available = get_datasets(filenames[0])
    inferred = datasets is None
    if inferred:
        datasets = available
    if isinstance(datasets, str):
        datasets = [datasets]
    if not inferred:
        # ensure dataset(s) exist
        nonexistent = set(datasets) - set(available)
        if len(nonexistent) > 0:
            raise ValueError("Dataset(s) not found: {}".format(nonexistent))

    if iterative == True:  # iterative calls to server readhdf
        return {dset: read_hdf(dset, filenames) for dset in datasets}

    # single call to server readAllHdf
    request = "readAllHdf {:n} {:n} {} | {}".format(
        len(datasets), len(filenames),
        json.dumps(datasets), json.dumps(filenames))
    rep_msg = generic_msg(request)

    if ',' in rep_msg:
        # One ' , '-separated sub-reply per dataset, in request order.
        result = dict()
        for dset, part in zip(datasets, rep_msg.split(' , ')):
            if '+' in part:  # String
                result[dset] = Strings(*part.split('+'))
            else:
                result[dset] = create_pdarray(part)
        return result
    if '+' in rep_msg:
        return Strings(*rep_msg.split('+'))
    return create_pdarray(rep_msg)
def group(self, group_num: int = 0, return_group_origins: bool = False):
    """
    Returns a new Strings containing the capture group corresponding to
    group_num. For the default, group_num=0, return the full match

    Parameters
    ----------
    group_num: int
        The index of the capture group to be returned
    return_group_origins: bool
        If True, return a pdarray containing the index of the original
        string each capture group is from

    Returns
    -------
    Strings
        Strings object containing only the capture groups corresponding to
        group_num
    pdarray, int64 (optional)
        The index of the original string each group is from

    Raises
    ------
    ValueError
        Raised if group_num is negative or exceeds regexMaxCaptures, or if
        the stored match type is not a recognised MatchType

    Examples
    --------
    >>> strings = ak.array(["Isaac Newton, physicist", '<--calculus-->', 'Gottfried Leibniz, mathematician'])
    >>> m = strings.search("(\\w+) (\\w+)")
    >>> m.group()
    array(['Isaac Newton', 'Gottfried Leibniz'])
    >>> m.group(1)
    array(['Isaac', 'Gottfried'])
    >>> m.group(2, return_group_origins=True)
    (array(['Newton', 'Leibniz']), array([0 2]))
    """
    from arkouda.strings import Strings
    from arkouda.client import regexMaxCaptures

    if group_num < 0:
        raise ValueError("group_num cannot be negative")
    if group_num > regexMaxCaptures:
        max_capture_flag = f'-e REGEX_MAX_CAPTURES={group_num}'
        raise ValueError(
            f"group_num={group_num} > regexMaxCaptures={regexMaxCaptures}. "
            f"To run group({group_num}), recompile the server with flag "
            f"'{max_capture_flag}'")

    # We don't cache the locations of groups, find the location info and
    # call findAll
    loc_fields = (self._objtype, self._parent_entry_name,
                  "legacy_placeholder", group_num, json.dumps([self.re]))
    loc_reply = cast(str, generic_msg(
        cmd="segmentedFindLoc", args="{} {} {} {} {}".format(*loc_fields)))
    created_map = json.loads(loc_reply)
    global_starts = create_pdarray(created_map["Starts"])
    global_lengths = create_pdarray(created_map["Lens"])
    global_indices = create_pdarray(created_map["Indices"])

    # The reply carries one Bool/Ind pair per match type; choose the pair
    # for the type this object was created with.
    if self._match_type == MatchType.SEARCH:
        prefix = "Search"
    elif self._match_type == MatchType.MATCH:
        prefix = "Match"
    elif self._match_type == MatchType.FULLMATCH:
        prefix = "FullMatch"
    else:
        raise ValueError(f"{self._match_type} is not a MatchType")
    matched = create_pdarray(created_map[prefix + "Bool"])
    indices = create_pdarray(created_map[prefix + "Ind"])

    # Restrict the start/length arrays to the entries that matched.
    starts = global_starts[global_indices[matched]]
    lengths = global_lengths[global_indices[matched]]

    find_fields = (self._objtype, self._parent_entry_name,
                   "legacy_placeholder", matched.name, starts.name,
                   lengths.name, indices.name, return_group_origins)
    reply = cast(str, generic_msg(
        cmd="segmentedFindAll",
        args="{} {} {} {} {} {} {} {}".format(*find_fields)))
    if not return_group_origins:
        return Strings.from_return_msg(reply)

    # First two '+'-separated pieces describe the Strings, the rest the
    # group origins.
    pieces = reply.split('+', maxsplit=2)
    strings_msg = "+".join(pieces[:2])
    return Strings.from_return_msg(strings_msg), create_pdarray(pieces[2])
def array(a: Union[pdarray, np.ndarray, Iterable]) -> Union[pdarray, Strings]:
    """
    Convert a Python or Numpy Iterable to a pdarray or Strings object,
    sending the corresponding data to the arkouda server.

    Parameters
    ----------
    a : Union[pdarray, np.ndarray, Iterable]
        Rank-1 array of a supported dtype

    Returns
    -------
    pdarray or Strings
        A pdarray instance stored on arkouda server or Strings instance,
        which is composed of two pdarrays stored on arkouda server

    Raises
    ------
    TypeError
        Raised if a is not a pdarray, np.ndarray, or Python Iterable such
        as a list, array, tuple, or deque
    RuntimeError
        Raised if a is not one-dimensional, nbytes > maxTransferBytes,
        a.dtype is not supported (not in DTypes), or if the product of a
        size and a.itemsize > maxTransferBytes
    ValueError
        Raised if the returned message is malformed or does not contain
        the fields required to generate the array.

    See Also
    --------
    pdarray.to_ndarray

    Notes
    -----
    The number of bytes in the input array cannot exceed
    `arkouda.maxTransferBytes`, otherwise a RuntimeError will be raised.
    This is to protect the user from overwhelming the connection between
    the Python client and the arkouda server, under the assumption that it
    is a low-bandwidth connection. The user may override this limit by
    setting ak.maxTransferBytes to a larger value, but should proceed with
    caution.

    If the input is of type U, every string is encoded, null-terminated
    and sent to the server as a single uint8 payload from which the
    Strings object is built.

    Examples
    --------
    >>> ak.array(np.arange(1,10))
    array([1, 2, 3, 4, 5, 6, 7, 8, 9])

    >>> ak.array(range(1,10))
    array([1, 2, 3, 4, 5, 6, 7, 8, 9])

    >>> strings = ak.array(['string {}'.format(i) for i in range(0,5)])
    >>> type(strings)
    <class 'arkouda.strings.Strings'>
    """
    # If a is already a pdarray, do nothing
    if isinstance(a, pdarray):
        return a
    from arkouda.client import maxTransferBytes
    # If a is not already a numpy.ndarray, convert it
    if not isinstance(a, np.ndarray):
        try:
            a = np.array(a)
        except Exception as err:
            # Exception (not a bare except) so KeyboardInterrupt and
            # SystemExit still propagate; the cause is chained.
            raise TypeError(
                ('a must be a pdarray, np.ndarray, or convertible to' +
                 ' a numpy array')) from err
    # Only rank 1 arrays currently supported
    if a.ndim != 1:
        raise RuntimeError("Only rank-1 pdarrays or ndarrays supported")
    # Check if array of strings
    if 'U' in a.dtype.kind:
        # encode each string and add a null byte terminator; chaining the
        # bytes objects yields the individual byte values as ints
        encoded = list(itertools.chain.from_iterable(
            elem.encode() + b"\x00" for elem in a))
        nbytes = len(encoded)
        if nbytes > maxTransferBytes:
            raise RuntimeError(
                ("Creating pdarray would require transferring {} bytes," +
                 " which exceeds allowed transfer size. Increase " +
                 "ak.maxTransferBytes to force.").format(nbytes))
        encoded_np = np.array(encoded, dtype=np.uint8)
        args = f"{encoded_np.dtype.name} {encoded_np.size} seg_string={True}"
        rep_msg = generic_msg(cmd='array', args=args,
                              payload=_array_memview(encoded_np),
                              send_binary=True)
        # Only the first two '+'-separated fields are used for the Strings.
        parts = cast(str, rep_msg).split('+', maxsplit=3)
        return Strings.from_parts(parts[0], parts[1])

    # If not strings, then check that dtype is supported in arkouda
    if a.dtype.name not in DTypes:
        raise RuntimeError("Unhandled dtype {}".format(a.dtype))
    # Do not allow arrays that are too large
    size = a.size
    if (size * a.itemsize) > maxTransferBytes:
        raise RuntimeError(("Array exceeds allowed transfer size. Increase " +
                            "ak.maxTransferBytes to allow"))
    # Send the raw array data as a binary payload alongside the dtype and
    # size. NOTE(review): the original comment says bytes are swapped to
    # the server's byte order when it differs; presumably _array_memview
    # does that -- confirm against its definition.
    aview = _array_memview(a)
    args = f"{a.dtype.name} {size} seg_strings={False}"
    rep_msg = generic_msg(cmd='array', args=args, payload=aview,
                          send_binary=True)
    return create_pdarray(rep_msg)
def unique(
    pda: Union[pdarray, Strings, 'Categorical'],  # type: ignore
    return_counts: bool = False
) -> Union[Union[pdarray, Strings, 'Categorical'],  # type: ignore
           Tuple[Union[pdarray, Strings, 'Categorical'], Optional[pdarray]]]:  # type: ignore
    """
    Find the unique elements of an array.

    Returns the unique elements of an array, sorted if the values are integers.
    There is an optional output in addition to the unique elements: the number
    of times each unique value comes up in the input array.

    Parameters
    ----------
    pda : pdarray or Strings or Categorical
        Input array.
    return_counts : bool, optional
        If True, also return the number of times each unique item appears
        in `pda`.

    Returns
    -------
    unique : pdarray or Strings
        The unique values. If input dtype is int64, return values will be
        sorted.
    unique_counts : pdarray, optional
        The number of times each of the unique values comes up in the
        original array. Only provided if `return_counts` is True.

    Raises
    ------
    TypeError
        Raised if pda is not a pdarray, Strings, or an object exposing its
        own ``unique`` method (e.g. Categorical)
    RuntimeError
        Raised if the pdarray or Strings dtype is unsupported

    Notes
    -----
    For integer arrays, this function checks to see whether `pda` is sorted
    and, if so, whether it is already unique. This step can save considerable
    computation. Otherwise, this function will sort `pda`.

    Any object that exposes a ``unique`` attribute is dispatched to that
    method with no arguments, so `return_counts` is not forwarded on that
    path.

    Examples
    --------
    >>> A = ak.array([3, 2, 1, 1, 2, 3])
    >>> ak.unique(A)
    array([1, 2, 3])
    """
    # Imported lazily; only needed for the cast below.
    from arkouda.categorical import Categorical as Categorical_

    # Duck-typed dispatch: objects that know how to de-duplicate themselves
    # (e.g. Categorical) handle the request on their own.
    if hasattr(pda, 'unique'):
        return cast(Categorical_, pda).unique()
    elif isinstance(pda, pdarray):
        repMsg = generic_msg(cmd="unique", args="{} {} {}".format(
            pda.objtype, pda.name, return_counts))
        if return_counts:
            # Reply is "<values>+<counts>"
            vc = cast(str, repMsg).split("+")
            logger.debug(vc)
            return create_pdarray(cast(str, vc[0])), create_pdarray(
                cast(str, vc[1]))
        else:
            return create_pdarray(cast(str, repMsg))
    elif isinstance(pda, Strings):
        name = '{}+{}'.format(pda.entry.name, "legacy_placeholder")
        repMsg = cast(str, generic_msg(cmd="unique", args="{} {} {}".format(
            pda.objtype, name, return_counts)))
        vc = repMsg.split('+')
        logger.debug(vc)
        if return_counts:
            # First two fields describe the Strings object, third the counts
            return Strings.from_return_msg("+".join(vc[0:2])), create_pdarray(
                cast(str, vc[2]))
        else:
            return Strings.from_return_msg(repMsg)
    else:
        # The original message left its "{}" placeholder unformatted; report
        # the offending type so the error is actionable.
        raise TypeError("must be pdarray, Strings, or Categorical {}".format(
            type(pda)))
def array(a: Union[pdarray, np.ndarray, Iterable]) -> Union[pdarray, Strings]:
    """
    Convert a Python or Numpy Iterable to a pdarray or Strings object, sending
    the corresponding data to the arkouda server.

    Parameters
    ----------
    a : Union[pdarray, np.ndarray]
        Rank-1 array of a supported dtype

    Returns
    -------
    pdarray or Strings
        A pdarray instance stored on arkouda server or Strings instance, which
        is composed of two pdarrays stored on arkouda server

    Raises
    ------
    TypeError
        Raised if a is not a pdarray, np.ndarray, or Python Iterable such as a
        list, array, tuple, or deque
    RuntimeError
        Raised if a is not one-dimensional, nbytes > maxTransferBytes, a.dtype
        is not supported (not in DTypes), or if the product of a size and
        a.itemsize > maxTransferBytes
    ValueError
        Raised if the returned message is malformed or does not contain the
        fields required to generate the array.

    See Also
    --------
    pdarray.to_ndarray

    Notes
    -----
    The number of bytes in the input array cannot exceed
    `arkouda.maxTransferBytes`, otherwise a RuntimeError will be raised. This
    is to protect the user from overwhelming the connection between the Python
    client and the arkouda server, under the assumption that it is a
    low-bandwidth connection. The user may override this limit by setting
    ak.maxTransferBytes to a larger value, but should proceed with caution.

    If the pdarray or ndarray is of type U, this method is called twice
    recursively to create the Strings object and the two corresponding
    pdarrays for string bytes and offsets, respectively.

    Examples
    --------
    >>> ak.array(np.arange(1,10))
    array([1, 2, 3, 4, 5, 6, 7, 8, 9])

    >>> ak.array(range(1,10))
    array([1, 2, 3, 4, 5, 6, 7, 8, 9])

    >>> strings = ak.array(['string {}'.format(i) for i in range(0,5)])
    >>> type(strings)
    <class 'arkouda.strings.Strings'>
    """
    # If a is already a pdarray, do nothing
    if isinstance(a, pdarray):
        return a
    from arkouda.client import maxTransferBytes

    # If a is not already a numpy.ndarray, convert it
    if not isinstance(a, np.ndarray):
        try:
            a = np.array(a)
        except Exception as err:
            # Narrowed from a bare except so KeyboardInterrupt/SystemExit
            # propagate; the original cause is chained for debugging.
            raise TypeError(
                ('a must be a pdarray, np.ndarray, or convertible to' +
                 ' a numpy array')) from err

    # Only rank 1 arrays currently supported
    if a.ndim != 1:
        raise RuntimeError("Only rank-1 pdarrays or ndarrays supported")

    # Check if array of strings
    if a.dtype.kind == 'U':
        # Encode once; all sizes below are in bytes, not code points
        encoded = [elem.encode() for elem in a]
        # Length of each string, plus null byte terminator. The explicit
        # dtype keeps an empty input integral instead of float64.
        lengths = np.array([len(elem) for elem in encoded],
                           dtype=np.int64) + 1
        # Compute zero-up segment offsets
        offsets = np.cumsum(lengths) - lengths
        # Total payload size; summing (rather than indexing the last
        # element) also works for an empty array of strings.
        nbytes = int(lengths.sum())
        if nbytes > maxTransferBytes:
            raise RuntimeError(
                ("Creating pdarray would require transferring {} bytes," +
                 " which exceeds allowed transfer size. Increase " +
                 "ak.maxTransferBytes to force.").format(nbytes))
        # Build the null-terminated byte segments in one pass instead of
        # assigning byte-by-byte from Python.
        values = np.frombuffer(
            b"".join(elem + b"\x00" for elem in encoded), dtype=np.uint8)
        # Recurse to create pdarrays for offsets and values, then return
        # Strings object
        return Strings(cast(pdarray, array(offsets)),
                       cast(pdarray, array(values)))

    # If not strings, then check that dtype is supported in arkouda
    if a.dtype.name not in DTypes:
        raise RuntimeError("Unhandled dtype {}".format(a.dtype))

    # Do not allow arrays that are too large
    size = a.size
    if (size * a.itemsize) > maxTransferBytes:
        raise RuntimeError(("Array exceeds allowed transfer size. Increase " +
                            "ak.maxTransferBytes to allow"))

    # Pack binary array data into a bytes object with a command header
    # including the dtype and size. If the server has a different byteorder
    # than our numpy array we need to swap to match since the server expects
    # native endian bytes
    if ((get_byteorder(a.dtype) == '<' and get_server_byteorder() == 'big') or
            (get_byteorder(a.dtype) == '>' and
             get_server_byteorder() == 'little')):
        abytes = a.byteswap().tobytes()
    else:
        abytes = a.tobytes()
    req_msg = "{} {:n} ".format(a.dtype.name, size).encode() + abytes
    repMsg = generic_msg(cmd='array', args=req_msg, send_bytes=True)
    return create_pdarray(repMsg)
def concatenate(arrays : Sequence[Union[pdarray,Strings]]) -> Union[pdarray,Strings]:
    """
    Concatenate an iterable of ``pdarray`` or ``Strings`` objects into
    one ``pdarray`` or ``Strings`` object, respectively.

    Parameters
    ----------
    arrays : Sequence[Union[pdarray,Strings]]
        The pdarrays or Strings to concatenate. For pdarrays, all must have
        same dtype.

    Returns
    -------
    Union[pdarray,Strings]
        Single pdarray or Strings object containing all values, returned in
        the original order

    Raises
    ------
    ValueError
        Raised if arrays is empty or if 1..n pdarrays have differing dtypes
    TypeError
        Raised if arrays is not a pdarrays or Strings iterable
    RuntimeError
        Raised if 1..n array elements are dtypes for which
        concatenate has not been implemented.

    Notes
    -----
    ak.concatenate is not supported for bool or float64 pdarrays

    A single-element sequence is returned as-is (no copy is made), and when
    every input is empty the result is ``zeros_like`` of the first pdarray,
    or the first Strings object itself.

    Examples
    --------
    >>> ak.concatenate([ak.array([1, 2, 3]), ak.array([4, 5, 6])])
    array([1, 2, 3, 4, 5, 6])

    >>> ak.concatenate([ak.array([True,False,True]),ak.array([False,True,True])])
    array([True, False, True, False, True, True])

    >>> ak.concatenate([ak.array(['one','two']),ak.array(['three','four','five'])])
    array(['one', 'two', 'three', 'four', 'five'])
    """
    size = 0
    objtype = None
    dtype = None
    names = []
    if len(arrays) < 1:
        raise ValueError("concatenate called on empty iterable")
    if len(arrays) == 1:
        # there are no arrays to concatenate, so just return arrays param
        return cast(Union[pdarray,Strings], arrays[0])
    for a in arrays:
        if not isinstance(a, pdarray) and not isinstance(a, Strings):
            raise TypeError(("arrays must be an iterable of pdarrays"
                             " or Strings"))
        # The object type of the first element decides how every element
        # is handled.
        if objtype is None:
            objtype = a.objtype
        if objtype == "pdarray":
            # Identity test, not ``==``: equality would go through the
            # dtype's own __eq__, which is not a reliable "unset" check.
            if dtype is None:
                dtype = a.dtype
            elif dtype != a.dtype:
                raise ValueError("All pdarrays must have same dtype")
            names.append(cast(pdarray, a).name)
        elif objtype == "str":
            names.append('{}+{}'.format(cast(Strings, a).offsets.name,
                                        cast(Strings, a).bytes.name))
        else:
            raise NotImplementedError(("concatenate not implemented " +
                                       "for object type {}".format(objtype)))
        size += a.size
    if size == 0:
        # Nothing to send to the server when every input is empty
        if objtype == "pdarray":
            return zeros_like(cast(pdarray, arrays[0]))
        else:
            return arrays[0]

    repMsg = generic_msg("concatenate {} {} {}".format(
        len(arrays), objtype, ' '.join(names)))
    if objtype == "pdarray":
        return create_pdarray(cast(str, repMsg))
    elif objtype == "str":
        return Strings(*(cast(str, repMsg).split('+')))
    else:
        raise TypeError('arrays must be an array of pdarray or Strings objects')
def array(a):
    """
    Convert an iterable to a pdarray or Strings object, sending data to the
    arkouda server.

    Parameters
    ----------
    a : array_like
        Rank-1 array of a supported dtype

    Returns
    -------
    pdarray or Strings
        Instance of pdarray stored on arkouda server, or a Strings object
        (built from an offsets pdarray and a bytes pdarray) when the input
        holds unicode strings. A pdarray input is returned unchanged.

    Raises
    ------
    TypeError
        Raised if `a` cannot be converted to a numpy array
    RuntimeError
        Raised if `a` is not rank-1, its dtype is not in DTypes, or the
        data to transfer exceeds maxTransferBytes

    See Also
    --------
    pdarray.to_ndarray

    Notes
    -----
    The number of bytes in the input array cannot exceed
    `arkouda.maxTransferBytes`, otherwise a RuntimeError will be raised. This
    is to protect the user from overwhelming the connection between the Python
    client and the arkouda server, under the assumption that it is a
    low-bandwidth connection. The user may override this limit by setting
    ak.maxTransferBytes to a larger value, but should proceed with caution.

    Examples
    --------
    >>> a = [3, 5, 7]
    >>> b = ak.array(a)
    >>> b
    array([3, 5, 7])

    >>> type(b)
    arkouda.pdarray
    """
    # If a is already a pdarray, do nothing
    if isinstance(a, pdarray):
        return a
    # If a is not already a numpy.ndarray, convert it
    if not isinstance(a, np.ndarray):
        try:
            a = np.array(a)
        except Exception as err:
            # Narrowed from a bare except so KeyboardInterrupt/SystemExit
            # are not swallowed.
            raise TypeError("Argument must be array-like") from err
    # Only rank 1 arrays currently supported
    if a.ndim != 1:
        raise RuntimeError("Only rank-1 arrays supported")
    # Check if array of strings
    if a.dtype.kind == 'U':
        # Encode first: segment sizes must be measured in encoded bytes.
        # Measuring len() of the str undercounts any non-ASCII text and
        # makes the written bytes overrun their segment.
        encoded = [elem.encode() for elem in a]
        # Length of each string, plus null byte terminator. The explicit
        # dtype keeps an empty input integral instead of float64.
        lengths = np.array([len(elem) for elem in encoded],
                           dtype=np.int64) + 1
        # Compute zero-up segment offsets
        offsets = np.cumsum(lengths) - lengths
        # Total payload size; summing also works for an empty input,
        # where indexing the last element would raise IndexError.
        nbytes = int(lengths.sum())
        if nbytes > maxTransferBytes:
            raise RuntimeError(
                "Creating pdarray would require transferring {} bytes, which exceeds allowed transfer size. Increase ak.maxTransferBytes to force."
                .format(nbytes))
        # Null-terminated segments laid out back to back
        values = np.frombuffer(
            b"".join(elem + b"\x00" for elem in encoded), dtype=np.uint8)
        # Recurse to create pdarrays for offsets and values, then return
        # Strings object
        return Strings(array(offsets), array(values))
    # If not strings, then check that dtype is supported in arkouda
    if a.dtype.name not in DTypes:
        raise RuntimeError("Unhandled dtype {}".format(a.dtype))
    # Do not allow arrays that are too large
    size = a.size
    if (size * a.itemsize) > maxTransferBytes:
        raise RuntimeError(
            "Array exceeds allowed transfer size. Increase ak.maxTransferBytes to allow"
        )
    # Pack binary array data into a bytes object with a command header
    # including the dtype and size ('>' packs the values big-endian)
    fmt = ">{:n}{}".format(size, structDtypeCodes[a.dtype.name])
    req_msg = "array {} {:n} ".format(a.dtype.name,
                                      size).encode() + struct.pack(fmt, *a)
    rep_msg = generic_msg(req_msg, send_bytes=True)
    return create_pdarray(rep_msg)
def concatenate(
    arrays: Sequence[Union[pdarray, Strings, 'Categorical']],  # type: ignore
    ordered: bool = True
) -> Union[pdarray, Strings, 'Categorical']:  # type: ignore
    """
    Concatenate a list or tuple of ``pdarray`` or ``Strings`` objects into
    one ``pdarray`` or ``Strings`` object, respectively.

    Parameters
    ----------
    arrays : Sequence[Union[pdarray,Strings,Categorical]]
        The arrays to concatenate. Must all have same dtype.
    ordered : bool
        If True (default), the arrays will be appended in the
        order given. If False, array data may be interleaved
        in blocks, which can greatly improve performance but
        results in non-deterministic ordering of elements.

    Returns
    -------
    Union[pdarray,Strings,Categorical]
        Single pdarray or Strings object containing all values, returned in
        the original order

    Raises
    ------
    ValueError
        Raised if arrays is empty or if 1..n pdarrays have differing dtypes
    TypeError
        Raised if arrays is not a pdarrays or Strings python Sequence such as a
        list or tuple
    RuntimeError
        Raised if 1..n array elements are dtypes for which
        concatenate has not been implemented.

    Notes
    -----
    A single-element sequence is returned as-is. If the first element
    exposes its own ``concatenate`` method (e.g. Categorical), the remaining
    elements are handed to that method instead of being processed here.

    Examples
    --------
    >>> ak.concatenate([ak.array([1, 2, 3]), ak.array([4, 5, 6])])
    array([1, 2, 3, 4, 5, 6])

    >>> ak.concatenate([ak.array([True,False,True]),ak.array([False,True,True])])
    array([True, False, True, False, True, True])

    >>> ak.concatenate([ak.array(['one','two']),ak.array(['three','four','five'])])
    array(['one', 'two', 'three', 'four', 'five'])
    """
    # Imported lazily; only needed for the casts below.
    from arkouda.categorical import Categorical as Categorical_
    size = 0
    objtype = None
    dtype = None
    names = []
    mode = 'append' if ordered else 'interleave'
    if len(arrays) < 1:
        raise ValueError("concatenate called on empty iterable")
    if len(arrays) == 1:
        return cast(Union[pdarray, Strings, Categorical_], arrays[0])

    # Duck-typed dispatch for objects that implement their own concatenate
    if hasattr(arrays[0], 'concatenate'):
        return cast(
            Sequence[Categorical_],
            cast(Categorical_, arrays[0]).concatenate(
                cast(Sequence[Categorical_], arrays[1:]), ordered=ordered))
    for a in arrays:
        if not isinstance(a, pdarray) and not isinstance(a, Strings):
            raise TypeError(("arrays must be an iterable of pdarrays"
                             " or Strings"))
        # The object type of the first element decides how every element
        # is handled.
        if objtype is None:
            objtype = a.objtype
        if objtype == "pdarray":
            # Identity test, not ``==``: equality would go through the
            # dtype's own __eq__, which is not a reliable "unset" check.
            if dtype is None:
                dtype = a.dtype
            elif dtype != a.dtype:
                raise ValueError("All pdarrays must have same dtype")
            names.append(cast(pdarray, a).name)
        elif objtype == "str":
            names.append('{}+{}'.format(
                cast(Strings, a).entry.name, "legacy_placeholder"))
        else:
            raise NotImplementedError(("concatenate not implemented " +
                                       "for object type {}".format(objtype)))
        size += a.size
    if size == 0:
        # Nothing to send to the server when every input is empty
        if objtype == "pdarray":
            return zeros_like(cast(pdarray, arrays[0]))
        else:
            return arrays[0]

    repMsg = generic_msg(cmd="concatenate", args="{} {} {} {}".format(
        len(arrays), objtype, mode, ' '.join(names)))
    if objtype == "pdarray":
        return create_pdarray(cast(str, repMsg))
    elif objtype == "str":
        # ConcatenateMsg returns created attrib(name)+created nbytes=123
        return Strings.from_return_msg(cast(str, repMsg))
    else:
        raise TypeError(
            'arrays must be an array of pdarray or Strings objects')
def read_parquet(filenames : Union[str, List[str]],
                 dsetname : Union[str, List[str]] = 'array',
                 strictTypes: bool=True, allow_errors:bool = False)\
        -> Union[pdarray, Strings, Mapping[str,Union[pdarray,Strings]]]:
    """
    Read one or more datasets from Parquet files into Arkouda objects.

    Parameters
    ----------
    filenames : list or str
        Either a list of filenames or shell expression
    dsetname : list or str
        The name(s) of the dataset(s) to read (must be the same across all
        files). Defaults to 'array'.
    strictTypes: bool
        If True (default), require all dtypes in all files to have the
        same precision and sign. If False, allow dtypes of different
        precision and sign across different files. For example, if one
        file contains a uint32 dataset and another contains an int64
        dataset, the contents of both will be read into an int64 pdarray.
    allow_errors: bool
        Default False, if True will allow files with read errors to be skipped
        instead of failing.  A RuntimeWarning is issued with the number of
        file errors and the sample error messages reported by the server.

    Returns
    -------
    pdarray, Strings, or dict
        A single pdarray or Strings object when the server returns one item;
        a dictionary of {dataset_name: pdarray or Strings} when it returns
        several.

    Raises
    ------
    TypeError
        Raised if the server returns an item of an unknown arkouda type
    RuntimeError
        Raised if no items are returned from the server

    See Also
    --------
    read_hdf, get_datasets, ls_hdf, read_all, load, save

    Notes
    -----
    If filenames is a string, it is interpreted as a shell expression
    (a single filename is a valid expression, so it will work) and is
    expanded with glob to read all matching files. Use ``get_datasets`` to
    show the names of datasets in Parquet files.
    """
    # Local import so the warning below does not depend on a module-level
    # import being present.
    import warnings

    def _from_item(item):
        # Build the client-side object described by one server reply item.
        if "pdarray" == item["arkouda_type"]:
            return create_pdarray(item["created"])
        elif "seg_string" == item["arkouda_type"]:
            return Strings(*item["created"].split("+"))
        else:
            raise TypeError(f"Unknown arkouda type:{item['arkouda_type']}")

    if isinstance(filenames, str):
        filenames = [filenames]
    if isinstance(dsetname, str):
        dsetname = [dsetname]

    rep_msg = generic_msg(
        cmd="readAllParquet",
        args=
        f"{strictTypes} {len(dsetname)} {len(filenames)} {allow_errors} {json.dumps(dsetname)} | {json.dumps(filenames)}"
    )
    rep = json.loads(rep_msg)  # reply is a JSON object
    items = rep["items"] if "items" in rep else []
    file_errors = rep["file_errors"] if "file_errors" in rep else []

    # Surface skipped files to the caller; previously file_errors was read
    # from the reply but never reported.
    if allow_errors and file_errors:
        file_error_count = rep[
            "file_error_count"] if "file_error_count" in rep else -1
        warnings.warn(
            f"There were {file_error_count} errors reading files on the server. "
            + f"Sample error messages {file_errors}", RuntimeWarning)

    # We have a couple possible return conditions
    # 1. We have multiple items returned i.e. multi pdarrays
    # 2. We have a single pdarray
    # TODO: add support for a string objects in Parquet
    if len(items) > 1:  # DataSets condition
        d: Dict[str, Union[pdarray, Strings]] = {}
        for item in items:
            d[item["dataset_name"]] = _from_item(item)
        return d
    elif len(items) == 1:
        return _from_item(items[0])
    else:
        raise RuntimeError("No items were returned")
def read_all(filenames : Union[str,List[str]],
             datasets : Optional[Union[str,List[str]]]=None,
             iterative : bool=False,
             strictTypes: bool=True) \
        -> Union[pdarray, Strings, Mapping[str,Union[pdarray,Strings]]]:
    """
    Read datasets from HDF5 files.

    Parameters
    ----------
    filenames : list or str
        Either a list of filenames or shell expression
    datasets : list or str or None
        (List of) name(s) of dataset(s) to read (default: all available)
    iterative : bool
        Iterative (True) or Single (False) function call(s) to server
    strictTypes: bool
        If True (default), require all dtypes of a given dataset to have the
        same precision and sign. If False, allow dtypes of different
        precision and sign across different files. For example, if one
        file contains a uint32 dataset and another contains an int64
        dataset with the same name, the contents of both will be read
        into an int64 pdarray.

    Returns
    -------
    For a single dataset returns an Arkouda pdarray or Arkouda Strings object
    and for multiple datasets returns a dictionary of Arkouda pdarrays or
    Arkouda Strings.
        Dictionary of {datasetName: pdarray or String}

    Raises
    ------
    ValueError
        Raised if a requested dataset is not present in the first file

    See Also
    --------
    read_hdf, get_datasets, ls_hdf

    Notes
    -----
    If filenames is a string, it is interpreted as a shell expression
    (a single filename is a valid expression, so it will work) and is
    expanded with glob to read all matching files.

    If iterative == True each dataset name and file names are passed to
    the server as independent sequential strings while if iterative == False
    all dataset names and file names are passed to the server in a single
    string.

    If datasets is None, infer the names of datasets from the first file
    and read all of them. Use ``get_datasets`` to show the names of datasets
    to HDF5 files.
    """
    def _from_reply(reply: str) -> Union[pdarray, Strings]:
        # A '+' in the reply marks a Strings object; anything else is
        # treated as a pdarray.
        if '+' in reply:
            return Strings(*reply.split('+'))
        return create_pdarray(reply)

    if isinstance(filenames, str):
        filenames = [filenames]

    # Dataset names supplied by the caller are validated against the first
    # file; inferred names are taken from that same file and need no check.
    names_given = datasets is not None
    if not names_given:
        datasets = get_datasets(filenames[0])
    if isinstance(datasets, str):
        datasets = [datasets]
    if names_given:
        nonexistent = set(datasets) - set(get_datasets(filenames[0]))
        if len(nonexistent) > 0:
            raise ValueError("Dataset(s) not found: {}".format(nonexistent))

    if iterative == True:  # iterative calls to server readhdf
        return {
            dset: read_hdf(dset, filenames, strictTypes=strictTypes)
            for dset in datasets
        }

    # single call to server readAllHdf
    rep_msg = generic_msg("readAllHdf {} {:n} {:n} {} | {}".format(
        strictTypes, len(datasets), len(filenames), json.dumps(datasets),
        json.dumps(filenames)))
    if ',' not in rep_msg:
        # One dataset came back: return the bare object
        return _from_reply(cast(str, rep_msg))

    # Several datasets: replies arrive in the same order as the names
    rep_msgs = cast(str, rep_msg).split(' , ')
    d: Dict[str, Union[pdarray, Strings]] = dict()
    for dset, rm in zip(datasets, rep_msgs):
        d[dset] = _from_reply(cast(str, rm))
    return d
def read_all(filenames : Union[str, List[str]],
             datasets: Optional[Union[str, List[str]]] = None,
             iterative: bool = False,
             strictTypes: bool = True,
             allow_errors: bool = False,
             calc_string_offsets = False)\
        -> Union[pdarray, Strings, Mapping[str,Union[pdarray,Strings]]]:
    """
    Read datasets from HDF5 files.

    Parameters
    ----------
    filenames : list or str
        Either a list of filenames or shell expression
    datasets : list or str or None
        (List of) name(s) of dataset(s) to read (default: all available)
    iterative : bool
        Iterative (True) or Single (False) function call(s) to server
    strictTypes: bool
        If True (default), require all dtypes of a given dataset to have the
        same precision and sign. If False, allow dtypes of different
        precision and sign across different files. For example, if one
        file contains a uint32 dataset and another contains an int64
        dataset with the same name, the contents of both will be read
        into an int64 pdarray.
    allow_errors: bool
        Default False, if True will allow files with read errors to be skipped
        instead of failing.  A warning will be included in the return containing
        the total number of files skipped due to failure and up to 10 filenames.
    calc_string_offsets: bool
        Default False, if True this will tell the server to calculate the
        offsets/segments array on the server versus loading them from HDF5 files.
        In the future this option may be set to True as the default.

    Returns
    -------
    For a single dataset returns an Arkouda pdarray or Arkouda Strings object
    and for multiple datasets returns a dictionary of Arkouda pdarrays or
    Arkouda Strings.
        Dictionary of {datasetName: pdarray or String}

    Raises
    ------
    ValueError
        Raised if all datasets are not present in all hdf5 files or if one or
        more of the specified files do not exist
    RuntimeError
        Raised if one or more of the specified files cannot be opened.
        If `allow_errors` is true this may be raised if no values are returned
        from the server.
    TypeError
        Raised if we receive an unknown arkouda_type returned from the server

    See Also
    --------
    read_hdf, get_datasets, ls_hdf

    Notes
    -----
    If filenames is a string, it is interpreted as a shell expression
    (a single filename is a valid expression, so it will work) and is
    expanded with glob to read all matching files.

    If iterative == True each dataset name and file names are passed to
    the server as independent sequential strings while if iterative == False
    all dataset names and file names are passed to the server in a single
    string.

    If datasets is None, infer the names of datasets from the first file
    and read all of them. Use ``get_datasets`` to show the names of datasets
    to HDF5 files.
    """
    def _available():
        # Dataset names used both to infer and to validate `datasets`
        if allow_errors:
            return get_datasets_allow_errors(filenames)
        return get_datasets(filenames[0])

    def _from_item(item):
        # Build the client-side object described by one server reply item
        kind = item["arkouda_type"]
        if "pdarray" == kind:
            return create_pdarray(item["created"])
        if "seg_string" == kind:
            return Strings.from_return_msg(item["created"])
        raise TypeError(f"Unknown arkouda type:{item['arkouda_type']}")

    if isinstance(filenames, str):
        filenames = [filenames]

    # Names supplied by the caller are validated; inferred names are not
    names_given = datasets is not None
    if not names_given:
        datasets = _available()
    if isinstance(datasets, str):
        datasets = [datasets]
    if names_given:
        nonexistent = set(datasets) - set(_available())
        if len(nonexistent) > 0:
            raise ValueError("Dataset(s) not found: {}".format(nonexistent))

    if iterative == True:  # iterative calls to server readhdf
        return {
            dset: read_hdf(dset,
                           filenames,
                           strictTypes=strictTypes,
                           allow_errors=allow_errors,
                           calc_string_offsets=calc_string_offsets)
            for dset in datasets
        }

    # single call to server readAllHdf
    rep_msg = generic_msg(
        cmd="readAllHdf",
        args=
        f"{strictTypes} {len(datasets)} {len(filenames)} {allow_errors} {calc_string_offsets} {json.dumps(datasets)} | {json.dumps(filenames)}"
    )
    # See GenSymIO._buildReadAllHdfMsgJson for json structure
    rep = json.loads(rep_msg)
    items = rep["items"] if "items" in rep else []
    file_errors = rep["file_errors"] if "file_errors" in rep else []
    if allow_errors and file_errors:
        file_error_count = rep[
            "file_error_count"] if "file_error_count" in rep else -1
        warnings.warn(
            f"There were {file_error_count} errors reading files on the server. "
            + f"Sample error messages {file_errors}", RuntimeWarning)

    # Possible return conditions:
    #   one item       -> the bare pdarray or Strings object
    #   several items  -> dictionary keyed by dataset name
    #   no items       -> error
    count = len(items)
    if count == 1:
        return _from_item(items[0])
    if count > 1:
        d: Dict[str, Union[pdarray, Strings]] = {}
        for item in items:
            created = _from_item(item)
            d[item["dataset_name"]] = created
        return d
    raise RuntimeError("No items were returned")