示例#1
0
    def unregister_categorical_by_name(user_defined_name: str) -> None:
        """
        Unregister every server-side component of a Categorical that was
        registered with the arkouda server via register().

        The ``.categories`` and ``.codes`` components are always unregistered.
        The ``.permutation`` and ``.segments`` components are unregistered only
        when their names appear in the registry returned by ``list_registry()``.

        Parameters
        ----------
        user_defined_name : str
            Name under which the Categorical object was registered

        Raises
        ------
        TypeError
            if user_defined_name is not a string
        RegistrationError
            if there is an issue attempting to unregister any underlying components

        See Also
        --------
        register, unregister, attach, is_registered
        """
        # Mandatory components: the category labels and the integer codes.
        Strings.unregister_strings_by_name(f"{user_defined_name}.categories")
        unregister_pdarray_by_name(f"{user_defined_name}.codes")

        # Optional components are removed only if the registry knows about them.
        registered_names = list_registry()
        for suffix in ("permutation", "segments"):
            component = f"{user_defined_name}.{suffix}"
            if component in registered_names:
                unregister_pdarray_by_name(component)
示例#2
0
def unique(pda, return_counts=False):
    """
    Find the unique elements of an array.

    Returns the unique elements of an array, sorted if the values are
    integers. Optionally also returns the number of times each unique value
    occurs in the input array.

    Parameters
    ----------
    pda : pdarray or Strings or Categorical
        Input array.
    return_counts : bool, optional
        If True, also return the number of times each unique item appears
        in `pda`.

    Returns
    -------
    unique : pdarray or Strings
        The unique values. If input dtype is int64, return values will be sorted.
    unique_counts : pdarray, optional
        The number of times each of the unique values comes up in the
        original array. Only provided if `return_counts` is True.

    Raises
    ------
    TypeError
        If `pda` has no ``unique`` method and is neither a pdarray nor a
        Strings object.

    Notes
    -----
    Any object exposing its own ``unique`` method is delegated to directly;
    `return_counts` is not forwarded on that path.

    For integer arrays, this function checks to see whether `pda` is sorted
    and, if so, whether it is already unique. This step can save considerable
    computation. Otherwise, this function will sort `pda`.

    Examples
    --------
    >>> A = ak.array([3, 2, 1, 1, 2, 3])
    >>> ak.unique(A)
    array([1, 2, 3])
    """
    # Objects that know how to compute their own unique values do so.
    if hasattr(pda, 'unique'):
        return pda.unique()

    if isinstance(pda, pdarray):
        request = "unique {} {} {}".format(pda.objtype, pda.name,
                                           return_counts)
        reply = generic_msg(request)
        if not return_counts:
            return create_pdarray(reply)
        # Reply carries "<values>+<counts>".
        parts = reply.split("+")
        if verbose:
            print(parts)
        return create_pdarray(parts[0]), create_pdarray(parts[1])

    if isinstance(pda, Strings):
        # A Strings object is addressed by its offsets and bytes arrays.
        combined_name = '{}+{}'.format(pda.offsets.name, pda.bytes.name)
        request = "unique {} {} {}".format(pda.objtype, combined_name,
                                           return_counts)
        parts = generic_msg(request).split('+')
        if verbose:
            print(parts)
        unique_strings = Strings(parts[0], parts[1])
        if return_counts:
            return unique_strings, create_pdarray(parts[2])
        return unique_strings

    raise TypeError("must be pdarray or Strings {}".format(pda))
示例#3
0
def random_strings_uniform(minlen, maxlen, size, characters='uppercase'):
    """
    Generate random strings whose lengths are uniformly distributed between
    minlen and maxlen, with characters drawn from a specified set.

    Parameters
    ----------
    minlen : int
        The minimum allowed length of string
    maxlen : int
        The maximum allowed length of string
    size : int
        The number of strings to generate
    characters : (uppercase, lowercase, numeric, printable, binary)
        The set of characters to draw from

    Returns
    -------
    Strings
        The array of random strings

    Raises
    ------
    ValueError
        If minlen < 0, maxlen < minlen, or size < 0

    See Also
    --------
    random_strings_lognormal, randint
    """
    if minlen < 0 or maxlen < minlen or size < 0:
        raise ValueError("Incompatible arguments")

    # All three numeric fields share the int64 wire format.
    int_format = NUMBER_FORMAT_STRINGS['int64']
    fields = (
        int_format.format(size),
        "uniform",
        characters,
        int_format.format(minlen),
        int_format.format(maxlen),
    )
    reply = generic_msg("randomStrings {} {} {} {} {}".format(*fields))
    # The reply is '+'-separated; each piece is passed positionally to Strings.
    return Strings(*reply.split('+'))
示例#4
0
def random_strings_lognormal(logmean: Union[float, int],
                             logstd: float,
                             size: int,
                             characters: str = 'uppercase') -> Strings:
    """
    Generate random strings with log-normally distributed lengths and
    with characters drawn from a specified set.

    Parameters
    ----------
    logmean : Union[float, int]
        The log-mean of the length distribution
    logstd : float
        The log-standard-deviation of the length distribution
    size : int
        The number of strings to generate
    characters : (uppercase, lowercase, numeric, printable, binary)
        The set of characters to draw from

    Returns
    -------
    Strings
        The array of random strings

    Raises
    ------
    TypeError
        Raised if logmean is not a float or int, logstd is not a float,
        size is not an int, or if characters is not a str
    ValueError
        Raised if logstd <= 0 or size < 0

    See Also
    --------
    random_strings_uniform, randint

    Notes
    -----
    The lengths of the generated strings are distributed $Lognormal(\\mu, \\sigma^2)$,
    with :math:`\\mu = logmean` and :math:`\\sigma = logstd`. Thus, the strings will
    have an average length of :math:`exp(\\mu + 0.5*\\sigma^2)`, a minimum length of
    zero, and a heavy tail towards longer strings.
    """
    # Argument types are validated before any value checks or server traffic.
    if not isinstance(logmean, (float, int)):
        raise TypeError("The logmean must be a float or int")
    if not isinstance(logstd, float):
        raise TypeError("The logstd must be a float")
    if not isinstance(size, int):
        raise TypeError("The size must be an integer")
    if not isinstance(characters, str):
        raise TypeError("characters must be a str")
    if logstd <= 0 or size < 0:
        raise ValueError("Incompatible arguments: logstd <= 0 or size < 0")

    float_format = NUMBER_FORMAT_STRINGS['float64']
    msg = "randomStrings {} {} {} {} {}".format(
        NUMBER_FORMAT_STRINGS['int64'].format(size),
        "lognormal",
        characters,
        float_format.format(logmean),
        float_format.format(logstd))
    repMsg = generic_msg(msg)
    # The reply is '+'-separated; each piece is passed positionally to Strings.
    return Strings(*(repMsg.split('+')))
示例#5
0
def concatenate(arrays):
    """
    Concatenate an iterable of ``pdarray`` objects into one ``pdarray``.

    Parameters
    ----------
    arrays : iterable of ``pdarray`` or Strings or Categorical
        The arrays to concatenate. Must all have same dtype. Any iterable is
        accepted; inputs that are not a list or tuple (e.g. generators) are
        materialized into a list first.

    Returns
    -------
    pdarray or Strings
        Single array containing all values, in original order. If only one
        array is supplied it is returned unchanged. If the first element
        exposes its own ``concatenate`` method, the result of calling that
        method with the remaining elements is returned.

    Raises
    ------
    ValueError
        If `arrays` is empty, contains an element that is neither a pdarray
        nor a Strings, mixes object types, or mixes pdarray dtypes.
    NotImplementedError
        If the elements report an object type other than "pdarray" or "str".

    Examples
    --------
    >>> ak.concatenate([ak.array([1, 2, 3]), ak.array([4, 5, 6])])
    array([1, 2, 3, 4, 5, 6])
    """
    # len() and indexing are used below, so one-shot iterables such as
    # generators must be materialized first.
    if not isinstance(arrays, (list, tuple)):
        arrays = list(arrays)
    if len(arrays) < 1:
        raise ValueError("concatenate called on empty iterable")
    if len(arrays) == 1:
        return arrays[0]
    # Types that implement their own concatenation handle it themselves.
    if hasattr(arrays[0], 'concatenate'):
        return arrays[0].concatenate(arrays[1:])

    size = 0
    objtype = None
    dtype = None
    names = []
    for a in arrays:
        if not isinstance(a, (pdarray, Strings)):
            raise ValueError(
                "Argument must be an iterable of pdarrays or Strings")
        if objtype is None:
            objtype = a.objtype
        elif a.objtype != objtype:
            # Fail clearly rather than via an attribute error further down.
            raise ValueError(
                "All arrays must have the same object type, got {} and {}"
                .format(objtype, a.objtype))
        if objtype == "pdarray":
            if dtype is None:
                dtype = a.dtype
            elif dtype != a.dtype:
                raise ValueError("All pdarrays must have same dtype")
            names.append(a.name)
        elif objtype == "str":
            # A Strings object is addressed by its offsets and bytes arrays.
            names.append('{}+{}'.format(a.offsets.name, a.bytes.name))
        else:
            raise NotImplementedError(
                "concatenate not implemented for object type {}".format(
                    objtype))
        size += a.size

    # Nothing to send to the server when every input is empty.
    if size == 0:
        if objtype == "pdarray":
            return zeros_like(arrays[0])
        else:
            return arrays[0]

    repMsg = generic_msg("concatenate {} {} {}".format(len(arrays), objtype,
                                                       ' '.join(names)))
    if objtype == "pdarray":
        return create_pdarray(repMsg)
    elif objtype == "str":
        return Strings(*(repMsg.split('+')))
示例#6
0
def random_strings_uniform(
        minlen: Union[int, np.int64],
        maxlen: Union[int, np.int64],
        size: Union[int, np.int64],
        characters: str = 'uppercase',
        seed: Union[None, Union[int, np.int64]] = None) -> Strings:
    """
    Generate random strings whose lengths are uniformly distributed between
    minlen and maxlen, with characters drawn from a specified set.

    Parameters
    ----------
    minlen : Union[int,np.int64]
        The minimum allowed length of string
    maxlen : Union[int,np.int64]
        The maximum allowed length of string
    size : Union[int,np.int64]
        The number of strings to generate
    characters : (uppercase, lowercase, numeric, printable, binary)
        The set of characters to draw from
    seed :  Union[None, Union[int,np.int64]], optional
        Value used to initialize the random number generator

    Returns
    -------
    Strings
        The array of random strings

    Raises
    ------
    ValueError
        Raised if minlen < 0, maxlen < minlen, or size < 0

    See Also
    --------
    random_strings_lognormal, randint

    Examples
    --------
    >>> ak.random_strings_uniform(minlen=1, maxlen=5, seed=1, size=5)
    array(['TVKJ', 'EWAB', 'CO', 'HFMD', 'U'])

    >>> ak.random_strings_uniform(minlen=1, maxlen=5, seed=1, size=5,
    ... characters='printable')
    array(['+5"f', '-P]3', '4k', '~HFF', 'F'])
    """
    if minlen < 0 or maxlen < minlen or size < 0:
        raise ValueError(
            "Incompatible arguments: minlen < 0, maxlen < minlen, or size < 0")

    # All three numeric fields share the int64 wire format; the seed is
    # appended as-is (it is rendered as "None" when not supplied).
    int_format = NUMBER_FORMAT_STRINGS['int64']
    fields = (
        int_format.format(size),
        "uniform",
        characters,
        int_format.format(minlen),
        int_format.format(maxlen),
        seed,
    )
    reply = generic_msg("randomStrings {} {} {} {} {} {}".format(*fields))
    # The reply is '+'-separated; each piece is passed positionally to Strings.
    return Strings(*cast(str, reply).split('+'))
示例#7
0
 def split(self, maxsplit: int = 0, return_segments: bool = False):
     """
     Split string by the occurrences of pattern. If maxsplit is nonzero, at most maxsplit splits occur

     When return_segments is True, a tuple is returned: the Strings built from
     the first two '+'-separated parts of the server reply, and a pdarray
     created from the third part. Otherwise only the Strings is returned.
     """
     from arkouda.strings import Strings
     fields = (
         self.objtype,
         self.parent_entry_name,
         "legacy_placeholder",
         maxsplit,
         return_segments,
         json.dumps([self.pattern]),
     )
     reply = cast(str, generic_msg(cmd="segmentedSplit",
                                   args="{} {} {} {} {} {}".format(*fields)))
     if not return_segments:
         return Strings.from_return_msg(reply)
     # Only the first two '+' are separators; the third piece is kept whole.
     pieces = reply.split('+', maxsplit=2)
     strings_msg = "+".join(pieces[0:2])
     return Strings.from_return_msg(strings_msg), create_pdarray(pieces[2])
示例#8
0
def cast(pda: Union[pdarray, Strings],
         dt: Union[np.dtype, str]) -> Union[pdarray, Strings]:
    """
    Cast an array to another dtype.

    Parameters
    ----------
    pda : pdarray or Strings
        The array of values to cast
    dt : np.dtype or str
        The target dtype to cast values to

    Returns
    -------
    pdarray or Strings
        Array of values cast to desired dtype

    Raises
    ------
    TypeError
        If pda is neither a pdarray nor a Strings object

    Notes
    -----
    The cast is performed according to Chapel's casting rules and is NOT safe
    from overflows or underflows. The user must ensure that the target dtype
    has the precision and capacity to hold the desired result.

    Examples
    --------
    >>> ak.cast(ak.linspace(1.0,5.0,5), dt=ak.int64)
    array([1, 2, 3, 4, 5])

    >>> ak.cast(ak.arange(0,5), dt=ak.float64).dtype
    dtype('float64')

    >>> ak.cast(ak.arange(0,5), dt=ak.bool)
    array([False, True, True, True, True])

    >>> ak.cast(ak.linspace(0,4,5), dt=ak.bool)
    array([False, True, True, True, True])
    """

    if isinstance(pda, pdarray):
        name = pda.name
        objtype = "pdarray"
    elif isinstance(pda, Strings):
        # A Strings object is addressed by its offsets and bytes arrays.
        name = '+'.join((pda.offsets.name, pda.bytes.name))
        objtype = "str"
    else:
        # Fail explicitly instead of hitting an unbound local below when no
        # runtime type checking is applied to this function.
        raise TypeError("pda must be a pdarray or Strings, not {}".format(
            type(pda).__name__))

    dt = _as_dtype(dt)
    opt = ""
    cmd = "cast"
    args = "{} {} {} {}".format(name, objtype, dt.name, opt)
    repMsg = generic_msg(cmd=cmd, args=args)
    # String targets come back as '+'-separated pieces for a Strings object.
    if dt.name.startswith("str"):
        return Strings(*(type_cast(str, repMsg).split("+")))
    else:
        return create_pdarray(type_cast(str, repMsg))
示例#9
0
 def sub(self, repl: str, count: int = 0, return_num_subs: bool = False):
     """
     Return the Strings obtained by replacing non-overlapping occurrences of pattern with the replacement repl.
     If count is nonzero, at most count substitutions occur
     If return_num_subs is True, return the number of substitutions that occurred

     With return_num_subs the result is a tuple: the Strings built from the
     first two '+'-separated parts of the server reply, and a pdarray created
     from the third part.
     """
     from arkouda.strings import Strings
     fields = (
         self.objtype,
         self.parent_entry_name,
         "legacy_placeholder",
         repl,
         count,
         return_num_subs,
         json.dumps([self.pattern]),
     )
     reply = cast(str, generic_msg(cmd="segmentedSub",
                                   args="{} {} {} {} {} {} {}".format(*fields)))
     if not return_num_subs:
         return Strings.from_return_msg(reply)
     # Only the first two '+' are separators; the third piece is kept whole.
     pieces = reply.split('+', maxsplit=2)
     strings_msg = "+".join(pieces[0:2])
     return Strings.from_return_msg(strings_msg), create_pdarray(pieces[2])
示例#10
0
 def findall(self, return_match_origins: bool = False):
     """
     Return all non-overlapping matches of pattern in Strings as a new Strings object

     With return_match_origins the result is a tuple: the Strings built from
     the first two '+'-separated parts of the server reply, and a pdarray
     created from the third part.
     """
     from arkouda.strings import Strings
     # Called before the match arrays are read below.
     self.find_locations()
     fields = (
         self.objtype,
         self.parent_entry_name,
         "legacy_placeholder",
         self.num_matches.name,
         self.starts.name,
         self.lengths.name,
         self.indices.name,
         return_match_origins,
     )
     reply = cast(str, generic_msg(cmd="segmentedFindAll",
                                   args="{} {} {} {} {} {} {} {}".format(*fields)))
     if not return_match_origins:
         return Strings.from_return_msg(reply)
     # Only the first two '+' are separators; the third piece is kept whole.
     pieces = reply.split('+', maxsplit=2)
     strings_msg = "+".join(pieces[0:2])
     return Strings.from_return_msg(strings_msg), create_pdarray(pieces[2])
示例#11
0
    def find_matches(self, return_match_origins: bool = False):
        """
        Return all matches as a new Strings object

        Parameters
        ----------
        return_match_origins: bool
            If True, return a pdarray containing the index of the original string each pattern match is from

        Returns
        -------
        Strings
            Strings object containing only matches
        pdarray, int64 (optional)
            The index of the original string each pattern match is from

        Raises
        ------
        RuntimeError
            Raised if there is a server-side error thrown

        Examples
        --------
        >>> strings = ak.array(['1_2___', '____', '3', '__4___5____6___7', ''])
        >>> strings.search('_+').find_matches(return_match_origins=True)
        (array(['_', '____', '__']), array([0 1 3]))
        """
        from arkouda.strings import Strings
        fields = (
            self._objtype,
            self._parent_entry_name,
            "legacy_placeholder",
            self._matched.name,
            self._starts.name,
            self._lengths.name,
            self._indices.name,
            return_match_origins,
        )
        reply = cast(str, generic_msg(
            cmd="segmentedFindAll",
            args="{} {} {} {} {} {} {} {}".format(*fields)))
        if not return_match_origins:
            return Strings.from_return_msg(reply)
        # Only the first two '+' are separators; the third piece is kept whole.
        pieces = reply.split('+', maxsplit=2)
        strings_msg = "+".join(pieces[0:2])
        return Strings.from_return_msg(strings_msg), create_pdarray(pieces[2])
示例#12
0
def random_strings_uniform(minlen: int,
                           maxlen: int,
                           size: int,
                           characters: str = 'uppercase',
                           seed: Union[None, int] = None) -> Strings:
    """
    Generate random strings whose lengths are uniformly distributed between
    minlen and maxlen, with characters drawn from a specified set.

    Parameters
    ----------
    minlen : int
        The minimum allowed length of string
    maxlen : int
        The maximum allowed length of string
    size : int
        The number of strings to generate
    characters : (uppercase, lowercase, numeric, printable, binary)
        The set of characters to draw from
    seed : int, optional
        Sent to the server as the last field of the request (rendered as
        "None" when not supplied)

    Returns
    -------
    Strings
        The array of random strings

    Raises
    ------
    ValueError
        Raised if minlen < 0, maxlen < minlen, or size < 0

    See Also
    --------
    random_strings_lognormal, randint
    """
    if minlen < 0 or maxlen < minlen or size < 0:
        raise ValueError(
            "Incompatible arguments: minlen < 0, maxlen < minlen, or size < 0")

    # All three numeric fields share the int64 wire format.
    int_format = NUMBER_FORMAT_STRINGS['int64']
    fields = (
        int_format.format(size),
        "uniform",
        characters,
        int_format.format(minlen),
        int_format.format(maxlen),
        seed,
    )
    reply = generic_msg("randomStrings {} {} {} {} {} {}".format(*fields))
    # The reply is '+'-separated; each piece is passed positionally to Strings.
    return Strings(*cast(str, reply).split('+'))
示例#13
0
    def attach(user_defined_name: str) -> Categorical:
        """
        Return a Categorical object attached to the components that were
        registered in the arkouda server under the given name via register()

        The ``.categories`` and ``.codes`` components are always attached. The
        ``.permutation`` and ``.segments`` components are attached only when
        their names appear in the registry returned by ``list_registry()``.

        Parameters
        ----------
        user_defined_name : str
            user defined name which Categorical object was registered under

        Returns
        -------
        Categorical
            The Categorical object created by re-attaching to the corresponding server components

        Raises
        ------
        TypeError
            if user_defined_name is not a string

        See Also
        --------
        register, is_registered, unregister, unregister_categorical_by_name
        """
        # Mandatory components, each attached through its own class.
        components = dict(
            categories=Strings.attach(f"{user_defined_name}.categories"),
            codes=pdarray.attach(f"{user_defined_name}.codes"),
        )

        # Optional components are attached only if the registry lists them.
        registered_names = list_registry()
        for optional in ("permutation", "segments"):
            entry = f"{user_defined_name}.{optional}"
            if entry in registered_names:
                components[optional] = pdarray.attach(entry)

        # Build the Categorical from the pieces, then record its registered name.
        attached = Categorical(None, **components)
        attached.name = user_defined_name
        return attached
示例#14
0
def random_strings_lognormal(logmean, logstd, size, characters='uppercase'):
    """
    Generate random strings with log-normally distributed lengths and
    with characters drawn from a specified set.

    Parameters
    ----------
    logmean : float
        The log-mean of the length distribution
    logstd : float
        The log-standard-deviation of the length distribution
    size : int
        The number of strings to generate
    characters : (uppercase, lowercase, numeric, printable, binary)
        The set of characters to draw from

    Returns
    -------
    Strings
        The array of random strings

    Raises
    ------
    ValueError
        Raised if logstd <= 0 or size < 0

    See Also
    --------
    random_strings_uniform, randint

    Notes
    -----
    The lengths of the generated strings are distributed $Lognormal(\\mu, \\sigma^2)$,
    with :math:`\\mu = logmean` and :math:`\\sigma = logstd`. Thus, the strings will have
    an average length of :math:`exp(\\mu + 0.5*\\sigma^2)`, a minimum length of zero, and
    a heavy tail towards longer strings.
    """
    if logstd <= 0 or size < 0:
        raise ValueError("Incompatible arguments")
    float_format = NUMBER_FORMAT_STRINGS['float64']
    msg = "randomStrings {} {} {} {} {}".format(
        NUMBER_FORMAT_STRINGS['int64'].format(size),
        "lognormal",
        characters,
        float_format.format(logmean),
        float_format.format(logstd))
    repMsg = generic_msg(msg)
    # The reply is '+'-separated; each piece is passed positionally to Strings.
    return Strings(*(repMsg.split('+')))
示例#15
0
def read_hdf(dsetName, filenames):
    """
    Read a single dataset from multiple HDF5 files into an arkouda pdarray.

    Parameters
    ----------
    dsetName : str
        The name of the dataset (must be the same across all files)
    filenames : list or str
        Either a list of filenames or shell expression

    Returns
    -------
    pdarray or Strings
        A pdarray instance pointing to the server-side data read in, or a
        Strings instance when the server reply contains a '+' separator

    See Also
    --------
    get_datasets, ls_hdf, read_all, load, save

    Notes
    -----
    If filenames is a string, it is interpreted as a shell expression
    (a single filename is a valid expression, so it will work) and is
    expanded with glob to read all matching files. Use ``get_datasets`` to
    show the names of datasets in HDF5 files.

    If dsetName is not present in all files, a RuntimeError is raised.
    """
    # A bare string is wrapped so the request always carries a JSON list.
    if isinstance(filenames, str):
        filenames = [filenames]
    request = "readhdf {} {:n} {}".format(dsetName, len(filenames),
                                          json.dumps(filenames))
    reply = generic_msg(request)
    # This is a hack to detect a string return type: a '+' in the reply means
    # it names more than one component. In the future, the number and type
    # should be put into the return message.
    looks_like_strings = '+' in reply
    if looks_like_strings:
        return Strings(*reply.split('+'))
    return create_pdarray(reply)
示例#16
0
def cast(pda: Union[pdarray, Strings], dt) -> Union[pdarray, Strings]:
    """
    Cast an array to another dtype.

    Parameters
    ----------
    pda : pdarray or Strings
        The array of values to cast
    dt : np.dtype or str
        The target dtype to cast values to

    Returns
    -------
    pdarray or Strings
        Array of values cast to desired dtype

    Raises
    ------
    TypeError
        If pda is neither a pdarray nor a Strings object

    Notes
    -----
    The cast is performed according to Chapel's casting rules and is NOT safe
    from overflows or underflows. The user must ensure that the target dtype
    has the precision and capacity to hold the desired result.
    """

    if isinstance(pda, pdarray):
        name = pda.name
        objtype = "pdarray"
    elif isinstance(pda, Strings):
        # A Strings object is addressed by its offsets and bytes arrays.
        name = '+'.join((pda.offsets.name, pda.bytes.name))
        objtype = "str"
    else:
        # Fail explicitly instead of hitting an unbound local below when no
        # runtime type checking is applied to this function.
        raise TypeError("pda must be a pdarray or Strings, not {}".format(
            type(pda).__name__))

    dt = _as_dtype(dt)
    opt = ""
    msg = "cast {} {} {} {}".format(name, objtype, dt.name, opt)
    repMsg = generic_msg(msg)
    # String targets come back as '+'-separated pieces for a Strings object.
    if dt.name.startswith("str"):
        return Strings(*(type_cast(str, repMsg).split("+")))
    else:
        return create_pdarray(type_cast(str, repMsg))
示例#17
0
def array(a: Union[pdarray, np.ndarray, Iterable]) -> Union[pdarray, Strings]:
    """
    Convert an iterable to a pdarray or Strings object, sending the corresponding
    data to the arkouda server.

    Parameters
    ----------
    a : Union[pdarray, np.ndarray, Iterable]
        Rank-1 array of a supported dtype

    Returns
    -------
    pdarray or Strings
        A pdarray instance stored on arkouda server or Strings instance, which
        is composed of two pdarrays stored on arkouda server

    Raises
    ------
    TypeError
        Raised if a is not a pdarray, np.ndarray, or Python Iterable such as a
        list, array, tuple, or deque
    RuntimeError
        If a is not one-dimensional, nbytes > maxTransferBytes, a.dtype is
        not supported (not in DTypes), or if the product of a size and
        a.itemsize > maxTransferBytes

    See Also
    --------
    pdarray.to_ndarray

    Notes
    -----
    The number of bytes in the input array cannot exceed `arkouda.maxTransferBytes`,
    otherwise a RuntimeError will be raised. This is to protect the user
    from overwhelming the connection between the Python client and the arkouda
    server, under the assumption that it is a low-bandwidth connection. The user
    may override this limit by setting ak.maxTransferBytes to a larger value,
    but should proceed with caution.

    If the pdarray or ndarray is of type U, this method is called twice recursively
    to create the Strings object and the two corresponding pdarrays for string
    bytes and offsets, respectively.

    Examples
    --------
    >>> a = [3, 5, 7]
    >>> b = ak.array(a)
    >>> b
    array([3, 5, 7])

    >>> type(b)
    arkouda.pdarray
    """
    # If a is already a pdarray, do nothing
    if isinstance(a, pdarray):
        return a
    from arkouda.client import maxTransferBytes
    # If a is not already a numpy.ndarray, convert it
    if not isinstance(a, np.ndarray):
        try:
            a = np.array(a)
        except Exception as err:
            # Keep the original failure as the cause; do not swallow
            # KeyboardInterrupt/SystemExit the way a bare except would.
            raise TypeError(
                ('a must be a pdarray, np.ndarray, or convertible to' +
                 ' a numpy array')) from err
    # Only rank 1 arrays currently supported
    if a.ndim != 1:
        raise RuntimeError("Only rank-1 pdarrays or ndarrays supported")
    # Check if array of strings
    if a.dtype.kind == 'U':
        encoded = [elem.encode() for elem in a]
        # Length of each string, plus null byte terminator. The dtype is
        # pinned so an empty input does not silently become float64.
        lengths = np.array([len(elem) for elem in encoded],
                           dtype=np.int64) + 1
        # Compute zero-up segment offsets
        offsets = np.cumsum(lengths) - lengths
        # Total bytes to transfer; summing is also safe for an empty array,
        # where indexing the last element would fail.
        nbytes = int(lengths.sum())
        if nbytes > maxTransferBytes:
            raise RuntimeError(
                ("Creating pdarray would require transferring {} bytes," +
                 " which exceeds allowed transfer size. Increase " +
                 "ak.maxTransferBytes to force.").format(nbytes))
        # Build the null-terminated byte segments in one pass instead of
        # assigning byte-by-byte in Python.
        values = np.frombuffer(
            b"".join(elem + b"\x00" for elem in encoded), dtype=np.uint8)
        # Recurse to create pdarrays for offsets and values, then return Strings object
        return Strings(array(offsets), array(values))
    # If not strings, then check that dtype is supported in arkouda
    if a.dtype.name not in DTypes:
        raise RuntimeError("Unhandled dtype {}".format(a.dtype))
    # Do not allow arrays that are too large
    size = a.size
    if (size * a.itemsize) > maxTransferBytes:
        raise RuntimeError(("Array exceeds allowed transfer size. Increase " +
                            "ak.maxTransferBytes to allow"))
    # Pack binary array data into a bytes object with a command header
    # including the dtype and size
    fmt = ">{:n}{}".format(size, structDtypeCodes[a.dtype.name])
    req_msg = "array {} {:n} ".\
                    format(a.dtype.name, size).encode() + struct.pack(fmt, *a)
    repMsg = generic_msg(req_msg, send_bytes=True)
    return create_pdarray(cast(str, repMsg))
示例#18
0
def random_strings_lognormal(logmean: Union[float, int],
                             logstd: Union[float, int],
                             size: int,
                             characters: str = 'uppercase',
                             seed: Union[None, int] = None) -> Strings:
    """
    Generate random strings with log-normally distributed lengths and
    with characters drawn from a specified set.

    Parameters
    ----------
    logmean : Union[float, int]
        The log-mean of the length distribution
    logstd : Union[float, int]
        The log-standard-deviation of the length distribution
    size : int
        The number of strings to generate
    characters : (uppercase, lowercase, numeric, printable, binary)
        The set of characters to draw from
    seed : int, optional
        Value used to initialize the random number generator

    Returns
    -------
    Strings
        The Strings object encapsulating a pdarray of random strings

    Raises
    ------
    ValueError
        Raised if logstd <= 0 or size < 0

    See Also
    --------
    random_strings_uniform, randint

    Notes
    -----
    The lengths of the generated strings are distributed $Lognormal(\\mu, \\sigma^2)$,
    with :math:`\\mu = logmean` and :math:`\\sigma = logstd`. Thus, the strings will
    have an average length of :math:`exp(\\mu + 0.5*\\sigma^2)`, a minimum length of
    zero, and a heavy tail towards longer strings.

    Examples
    --------
    >>> ak.random_strings_lognormal(2, 0.25, 5, seed=1)
    array(['TVKJTE', 'ABOCORHFM', 'LUDMMGTB', 'KWOQNPHZ', 'VSXRRL'])

    >>> ak.random_strings_lognormal(2, 0.25, 5, seed=1, characters='printable')
    array(['+5"fp-', ']3Q4kC~HF', '=F=`,IE!', 'DjkBa'9(', '5oZ1)='])
    """
    if logstd <= 0 or size < 0:
        raise ValueError("Incompatible arguments: logstd <= 0 or size < 0")

    # size uses the int64 wire format, the distribution parameters float64;
    # the seed is appended as-is (rendered as "None" when not supplied).
    float_format = NUMBER_FORMAT_STRINGS['float64']
    fields = (
        NUMBER_FORMAT_STRINGS['int64'].format(size),
        "lognormal",
        characters,
        float_format.format(logmean),
        float_format.format(logstd),
        seed,
    )
    reply = generic_msg("randomStrings {} {} {} {} {} {}".format(*fields))
    # The reply is '+'-separated; each piece is passed positionally to Strings.
    return Strings(*cast(str, reply).split('+'))
示例#19
0
def read_all(filenames, datasets=None, iterative=False):
    """
    Read datasets from HDF5 files.

    Parameters
    ----------
    filenames : list or str
        Either a list of filenames or a single string. A string is wrapped
        in a one-element list before being sent to the server.
    datasets : list or str or None
        (List of) name(s) of dataset(s) to read (default: all datasets
        found in the first file)
    iterative : boolean
        Iterative (True) or Single (False) function call(s) to server

    Returns
    -------
    pdarray, Strings, or dict
        With ``iterative`` set, always a dictionary of
        {datasetName: result of ``read_hdf``}. Otherwise a dictionary of
        {datasetName: pdarray or Strings} when the server reply lists
        several objects, or a single Strings / pdarray object.

    Raises
    ------
    ValueError
        Raised if any explicitly requested dataset is not present in the
        first file

    See Also
    --------
    read_hdf, get_datasets, ls_hdf

    Notes
    -----
    If iterative is true, each dataset name is read with its own
    ``read_hdf`` call, while otherwise all dataset names and file names are
    passed to the server in a single ``readAllHdf`` message.

    If datasets is None, the names of datasets are inferred from the first
    file and all of them are read. Use ``get_datasets`` to show the names
    of datasets in HDF5 files.

    Requested dataset names are only checked against the first file here.
    NOTE(review): expansion of shell expressions in ``filenames`` and the
    error raised when a dataset is missing from a later file are not
    visible in this function - presumably handled server-side; confirm.
    """
    if isinstance(filenames, str):
        filenames = [filenames]
    # The first file is the reference for which dataset names exist.
    available = get_datasets(filenames[0])
    if datasets is None:
        datasets = available
    else:
        # Validate every explicit request, including a single name given
        # as a plain string (previously that case skipped the check).
        if isinstance(datasets, str):
            datasets = [datasets]
        nonexistent = set(datasets) - set(available)
        if nonexistent:
            raise ValueError("Dataset(s) not found: {}".format(nonexistent))
    if iterative:  # iterative calls to server readhdf
        return {dset: read_hdf(dset, filenames) for dset in datasets}
    # single call to server readAllHdf
    rep_msg = generic_msg("readAllHdf {:n} {:n} {} | {}".format(
        len(datasets), len(filenames), json.dumps(datasets),
        json.dumps(filenames)))
    if ',' in rep_msg:
        # NOTE(review): the test looks for ',' but the split uses ' , ';
        # kept as-is because the server reply format is not visible here.
        d = dict()
        for dset, rm in zip(datasets, rep_msg.split(' , ')):
            if '+' in rm:  # a '+' marks a Strings reply
                d[dset] = Strings(*rm.split('+'))
            else:
                d[dset] = create_pdarray(rm)
        return d
    if '+' in rep_msg:
        return Strings(*rep_msg.split('+'))
    return create_pdarray(rep_msg)
示例#20
0
    def group(self, group_num: int = 0, return_group_origins: bool = False):
        """
        Build a Strings object holding one capture group of every match.

        ``group_num=0`` (the default) selects the full match.

        Parameters
        ----------
        group_num: int
            Index of the capture group to return
        return_group_origins: bool
            If True, also return a pdarray with the index of the original
            string each capture group came from

        Returns
        -------
        Strings
            The capture groups corresponding to group_num
        pdarray, int64 (optional)
            Index of the original string each group is from; only returned
            when return_group_origins is True

        Raises
        ------
        ValueError
            Raised if group_num is negative, exceeds regexMaxCaptures, or
            if the stored match type is not a known MatchType

        Examples
        --------
        >>> strings = ak.array(["Isaac Newton, physicist", '<--calculus-->', 'Gottfried Leibniz, mathematician'])
        >>> m = strings.search("(\\w+) (\\w+)")
        >>> m.group()
        array(['Isaac Newton', 'Gottfried Leibniz'])
        >>> m.group(1)
        array(['Isaac', 'Gottfried'])
        >>> m.group(2, return_group_origins=True)
        (array(['Newton', 'Leibniz']), array([0 2]))
        """
        from arkouda.strings import Strings
        from arkouda.client import regexMaxCaptures

        # Guard clauses: reject group numbers the server cannot serve.
        if group_num < 0:
            raise ValueError("group_num cannot be negative")
        if group_num > regexMaxCaptures:
            max_capture_flag = f'-e REGEX_MAX_CAPTURES={group_num}'
            e = f"group_num={group_num} > regexMaxCaptures={regexMaxCaptures}. To run group({group_num}), recompile the server with flag '{max_capture_flag}'"
            raise ValueError(e)

        # Group locations are not cached: ask the server for them first,
        # then request the matching substrings with findAll.
        loc_args = (f"{self._objtype} {self._parent_entry_name} "
                    f"legacy_placeholder {group_num} "
                    f"{json.dumps([self.re])}")
        loc_reply = cast(str, generic_msg(cmd="segmentedFindLoc",
                                          args=loc_args))
        created_map = json.loads(loc_reply)
        global_starts = create_pdarray(created_map["Starts"])
        global_lengths = create_pdarray(created_map["Lens"])
        global_indices = create_pdarray(created_map["Indices"])

        # Pick the reply entries that belong to this object's match type.
        reply_keys = None
        for kind, bool_key, ind_key in (
                (MatchType.SEARCH, "SearchBool", "SearchInd"),
                (MatchType.MATCH, "MatchBool", "MatchInd"),
                (MatchType.FULLMATCH, "FullMatchBool", "FullMatchInd")):
            if self._match_type == kind:
                reply_keys = (bool_key, ind_key)
                break
        if reply_keys is None:
            raise ValueError(f"{self._match_type} is not a MatchType")
        matched = create_pdarray(created_map[reply_keys[0]])
        indices = create_pdarray(created_map[reply_keys[1]])

        starts = global_starts[global_indices[matched]]
        lengths = global_lengths[global_indices[matched]]

        find_args = (f"{self._objtype} {self._parent_entry_name} "
                     f"legacy_placeholder {matched.name} {starts.name} "
                     f"{lengths.name} {indices.name} {return_group_origins}")
        find_reply = cast(str, generic_msg(cmd="segmentedFindAll",
                                           args=find_args))
        if not return_group_origins:
            return Strings.from_return_msg(find_reply)
        # The first two '+'-separated fields describe the Strings object,
        # the remainder describes the origin-index pdarray.
        pieces = find_reply.split('+', maxsplit=2)
        return (Strings.from_return_msg("+".join(pieces[0:2])),
                create_pdarray(pieces[2]))
示例#21
0
def array(a: Union[pdarray, np.ndarray, Iterable]) -> Union[pdarray, Strings]:
    """
    Convert a Python or Numpy Iterable to a pdarray or Strings object, sending
    the corresponding data to the arkouda server.

    Parameters
    ----------
    a : Union[pdarray, np.ndarray, Iterable]
        Rank-1 array of a supported dtype, or anything ``np.array`` can
        convert into one. A pdarray is returned unchanged.

    Returns
    -------
    pdarray or Strings
        A pdarray instance stored on arkouda server or Strings instance

    Raises
    ------
    TypeError
        Raised if a cannot be converted to a numpy array
    RuntimeError
        Raised if a is not one-dimensional, if the encoded string bytes
        exceed maxTransferBytes, if a.dtype is not supported (not in
        DTypes), or if the product of a.size and a.itemsize exceeds
        maxTransferBytes

    See Also
    --------
    pdarray.to_ndarray

    Notes
    -----
    The number of bytes in the input array cannot exceed `arkouda.maxTransferBytes`,
    otherwise a RuntimeError will be raised. This is to protect the user
    from overwhelming the connection between the Python client and the arkouda
    server, under the assumption that it is a low-bandwidth connection. The user
    may override this limit by setting ak.maxTransferBytes to a larger value,
    but should proceed with caution.

    If the ndarray is of unicode type (kind U), every string is encoded,
    terminated with a null byte, and all bytes are sent to the server in a
    single binary message.

    Examples
    --------
    >>> ak.array(np.arange(1,10))
    array([1, 2, 3, 4, 5, 6, 7, 8, 9])

    >>> ak.array(range(1,10))
    array([1, 2, 3, 4, 5, 6, 7, 8, 9])

    >>> strings = ak.array(['string {}'.format(i) for i in range(0,5)])
    >>> type(strings)
    <class 'arkouda.strings.Strings'>
    """
    # If a is already a pdarray, do nothing
    if isinstance(a, pdarray):
        return a
    from arkouda.client import maxTransferBytes
    # If a is not already a numpy.ndarray, convert it
    if not isinstance(a, np.ndarray):
        try:
            a = np.array(a)
        except Exception as err:
            # Only conversion failures become TypeError; KeyboardInterrupt
            # and SystemExit are no longer swallowed by a bare except.
            raise TypeError(
                ('a must be a pdarray, np.ndarray, or convertible to' +
                 ' a numpy array')) from err
    # Only rank 1 arrays currently supported
    if a.ndim != 1:
        raise RuntimeError("Only rank-1 pdarrays or ndarrays supported")
    # Check if array of strings
    if a.dtype.kind == 'U':
        # Encode each string and add a null byte terminator; joining bytes
        # avoids building a Python list with one int per byte.
        payload = b"".join(s.encode() + b"\x00" for s in a)
        nbytes = len(payload)
        if nbytes > maxTransferBytes:
            raise RuntimeError(
                ("Creating pdarray would require transferring {} bytes," +
                 " which exceeds allowed transfer size. Increase " +
                 "ak.maxTransferBytes to force.").format(nbytes))
        # Copy so the helper below still receives an owned, writable array.
        encoded_np = np.frombuffer(payload, dtype=np.uint8).copy()
        args = f"{encoded_np.dtype.name} {encoded_np.size} seg_string={True}"
        rep_msg = generic_msg(cmd='array',
                              args=args,
                              payload=_array_memview(encoded_np),
                              send_binary=True)
        parts = cast(str, rep_msg).split('+', maxsplit=3)
        return Strings.from_parts(parts[0], parts[1])

    # If not strings, then check that dtype is supported in arkouda
    if a.dtype.name not in DTypes:
        raise RuntimeError("Unhandled dtype {}".format(a.dtype))
    # Do not allow arrays that are too large
    size = a.size
    if (size * a.itemsize) > maxTransferBytes:
        raise RuntimeError(("Array exceeds allowed transfer size. Increase " +
                            "ak.maxTransferBytes to allow"))
    # Build the memory view of the array data that is sent as the binary
    # payload; the dtype and size travel in the text arguments.
    aview = _array_memview(a)
    # NOTE(review): this key is spelled "seg_strings" while the string
    # branch above sends "seg_string" - left unchanged; confirm which
    # spelling the server expects.
    args = f"{a.dtype.name} {size} seg_strings={False}"
    rep_msg = generic_msg(cmd='array',
                          args=args,
                          payload=aview,
                          send_binary=True)
    return create_pdarray(rep_msg)
示例#22
0
def unique(
    pda: Union[pdarray, Strings, 'Categorical'],  # type: ignore
    return_counts: bool = False
) -> Union[Union[pdarray, Strings, 'Categorical'],  # type: ignore
           Tuple[Union[pdarray, Strings, 'Categorical'],
                 Optional[pdarray]]]:  #type: ignore
    """
    Find the unique elements of an array.

    There is an optional output in addition to the unique elements: the
    number of times each unique value comes up in the input array.

    Parameters
    ----------
    pda : pdarray or Strings or Categorical
        Input array.
    return_counts : bool, optional
        If True, also return the number of times each unique item appears
        in `pda`. Not forwarded when `pda` provides its own ``unique``
        method (see Notes).

    Returns
    -------
    unique : pdarray or Strings or Categorical
        The unique values.
    unique_counts : pdarray, optional
        The number of times each of the unique values comes up in the
        original array. Only provided if `return_counts` is True and `pda`
        is a pdarray or Strings.

    Raises
    ------
    TypeError
        Raised if pda is not a pdarray or Strings object and has no
        ``unique`` method of its own

    Notes
    -----
    Any object exposing a ``unique`` attribute is handled by calling
    ``pda.unique()`` with no arguments, so `return_counts` has no effect on
    that path. For pdarray and Strings inputs the work is delegated to the
    server's "unique" command.
    NOTE(review): earlier documentation stated that int64 results come back
    sorted; that is decided server-side and is not visible here - confirm.

    Examples
    --------
    >>> A = ak.array([3, 2, 1, 1, 2, 3])
    >>> ak.unique(A)
    array([1, 2, 3])
    """
    from arkouda.categorical import Categorical as Categorical_
    if hasattr(pda, 'unique'):
        # Objects with their own unique() (e.g. Categorical) handle it.
        return cast(Categorical_, pda).unique()
    elif isinstance(pda, pdarray):
        repMsg = generic_msg(cmd="unique", args="{} {} {}".\
                             format(pda.objtype, pda.name, return_counts))
        if return_counts:
            # Reply is "<values>+<counts>"
            vc = cast(str, repMsg).split("+")
            logger.debug(vc)
            return create_pdarray(cast(str, vc[0])), create_pdarray(
                cast(str, vc[1]))
        else:
            return create_pdarray(cast(str, repMsg))
    elif isinstance(pda, Strings):
        name = '{}+{}'.format(pda.entry.name, "legacy_placeholder")
        repMsg = cast(str,generic_msg(cmd="unique", args="{} {} {}".\
                             format(pda.objtype, name, return_counts)))
        vc = repMsg.split('+')
        logger.debug(vc)
        if return_counts:
            # First two fields describe the Strings, the third the counts
            return Strings.from_return_msg("+".join(vc[0:2])), create_pdarray(
                cast(str, vc[2]))
        else:
            return Strings.from_return_msg(repMsg)
    else:
        # The placeholder used to be left unformatted; report the actual type.
        raise TypeError("must be pdarray, Strings, or Categorical, not {}"
                        .format(type(pda).__name__))
示例#23
0
def array(a: Union[pdarray, np.ndarray, Iterable]) -> Union[pdarray, Strings]:
    """
    Convert a Python or Numpy Iterable to a pdarray or Strings object, sending
    the corresponding data to the arkouda server.

    Parameters
    ----------
    a : Union[pdarray, np.ndarray, Iterable]
        Rank-1 array of a supported dtype, or anything ``np.array`` can
        convert into one. A pdarray is returned unchanged.

    Returns
    -------
    pdarray or Strings
        A pdarray instance stored on arkouda server or Strings instance, which
        is composed of two pdarrays stored on arkouda server

    Raises
    ------
    TypeError
        Raised if a cannot be converted to a numpy array
    RuntimeError
        Raised if a is not one-dimensional, if the encoded string bytes
        exceed maxTransferBytes, if a.dtype is not supported (not in
        DTypes), or if the product of a.size and a.itemsize exceeds
        maxTransferBytes

    See Also
    --------
    pdarray.to_ndarray

    Notes
    -----
    The number of bytes in the input array cannot exceed `arkouda.maxTransferBytes`,
    otherwise a RuntimeError will be raised. This is to protect the user
    from overwhelming the connection between the Python client and the arkouda
    server, under the assumption that it is a low-bandwidth connection. The user
    may override this limit by setting ak.maxTransferBytes to a larger value,
    but should proceed with caution.

    If the ndarray is of type U, this function calls itself twice to create
    the two pdarrays (segment offsets and null-terminated string bytes)
    that make up the Strings object.

    Examples
    --------
    >>> ak.array(np.arange(1,10))
    array([1, 2, 3, 4, 5, 6, 7, 8, 9])

    >>> ak.array(range(1,10))
    array([1, 2, 3, 4, 5, 6, 7, 8, 9])

    >>> strings = ak.array(['string {}'.format(i) for i in range(0,5)])
    >>> type(strings)
    <class 'arkouda.strings.Strings'>
    """
    # If a is already a pdarray, do nothing
    if isinstance(a, pdarray):
        return a
    from arkouda.client import maxTransferBytes
    # If a is not already a numpy.ndarray, convert it
    if not isinstance(a, np.ndarray):
        try:
            a = np.array(a)
        except Exception as err:
            # Only conversion failures become TypeError; KeyboardInterrupt
            # and SystemExit are no longer swallowed by a bare except.
            raise TypeError(
                ('a must be a pdarray, np.ndarray, or convertible to' +
                 ' a numpy array')) from err
    # Only rank 1 arrays currently supported
    if a.ndim != 1:
        raise RuntimeError("Only rank-1 pdarrays or ndarrays supported")
    # Check if array of strings
    if a.dtype.kind == 'U':
        # Keep the encoded strings in a plain list: a numpy bytes array
        # would strip trailing NUL bytes and so misreport their lengths.
        encoded = [elem.encode() for elem in a]
        # Length of each string, plus null byte terminator. The explicit
        # dtype keeps an empty input from turning into a float array.
        lengths = np.array([len(elem) for elem in encoded],
                           dtype=np.int64) + 1
        # Compute zero-up segment offsets
        offsets = np.cumsum(lengths) - lengths
        # Total byte count; summing also works for an empty input, where
        # indexing offsets[-1] used to raise IndexError.
        nbytes = int(lengths.sum())
        if nbytes > maxTransferBytes:
            raise RuntimeError(
                ("Creating pdarray would require transferring {} bytes," +
                 " which exceeds allowed transfer size. Increase " +
                 "ak.maxTransferBytes to force.").format(nbytes))
        # Lay the null-terminated segments out back to back in one pass
        # instead of assigning the bytes one at a time in Python.
        values = np.frombuffer(
            b"".join(segment + b"\x00" for segment in encoded),
            dtype=np.uint8)
        # Recurse to create pdarrays for offsets and values, then return Strings object
        return Strings(cast(pdarray, array(offsets)),
                       cast(pdarray, array(values)))
    # If not strings, then check that dtype is supported in arkouda
    if a.dtype.name not in DTypes:
        raise RuntimeError("Unhandled dtype {}".format(a.dtype))
    # Do not allow arrays that are too large
    size = a.size
    if (size * a.itemsize) > maxTransferBytes:
        raise RuntimeError(("Array exceeds allowed transfer size. Increase " +
                            "ak.maxTransferBytes to allow"))
    # Pack binary array data into a bytes object with a command header
    # including the dtype and size. If the server has a different byteorder
    # than our numpy array we need to swap to match since the server expects
    # native endian bytes
    if ((get_byteorder(a.dtype) == '<' and get_server_byteorder() == 'big')
            or (get_byteorder(a.dtype) == '>'
                and get_server_byteorder() == 'little')):
        abytes = a.byteswap().tobytes()
    else:
        abytes = a.tobytes()
    req_msg = "{} {:n} ".format(a.dtype.name, size).encode() + abytes
    repMsg = generic_msg(cmd='array', args=req_msg, send_bytes=True)
    return create_pdarray(repMsg)
示例#24
0
def concatenate(arrays : Sequence[Union[pdarray,Strings]]) -> Union[pdarray,Strings]:
    """
    Concatenate an iterable of ``pdarray`` or ``Strings`` objects into
    one ``pdarray`` or ``Strings`` object, respectively.

    Parameters
    ----------
    arrays : Sequence[Union[pdarray,Strings]]
        The pdarrays or Strings to concatenate. All elements must be of the
        same kind, and pdarrays must all have the same dtype.

    Returns
    -------
    Union[pdarray,Strings]
        Single pdarray or Strings object containing all values. A
        one-element sequence returns that element unchanged. If every
        input is empty, a zero-filled copy of the first pdarray (or the
        first Strings object itself) is returned.

    Raises
    ------
    ValueError
        Raised if arrays is empty, if pdarrays have differing dtypes, or
        if pdarray and Strings objects are mixed
    TypeError
        Raised if an element is neither a pdarray nor a Strings object
    NotImplementedError
        Raised if the elements report an object type other than
        "pdarray" or "str"

    Notes
    -----
    NOTE(review): earlier documentation stated that ak.concatenate is not
    supported for bool or float64 pdarrays, which contradicts the bool
    example below; support is decided server-side - confirm.

    Examples
    --------
    >>> ak.concatenate([ak.array([1, 2, 3]), ak.array([4, 5, 6])])
    array([1, 2, 3, 4, 5, 6])

    >>> ak.concatenate([ak.array([True,False,True]),ak.array([False,True,True])])
    array([True, False, True, False, True, True])

    >>> ak.concatenate([ak.array(['one','two']),ak.array(['three','four','five'])])
    array(['one', 'two', 'three', 'four', 'five'])
    """
    size = 0
    objtype = None
    dtype = None
    names = []
    if len(cast(list,arrays)) < 1:
        raise ValueError("concatenate called on empty iterable")
    if len(cast(list,arrays)) == 1:
        # there are no arrays to concatenate, so just return arrays param
        return cast(Union[pdarray,Strings],arrays[0])
    for a in arrays:
        if not isinstance(a, (pdarray, Strings)):
            raise TypeError(("arrays must be an iterable of pdarrays"
                             " or Strings"))
        # Identity tests: `== None` would defer to the operand's own
        # equality (dtype objects define one), which is not what is meant.
        if objtype is None:
            objtype = a.objtype
        elif a.objtype != objtype:
            # Mixing pdarray and Strings used to fail later with an
            # unrelated error; report it up front.
            raise ValueError(
                "All arrays must have the same object type, found "
                "{} and {}".format(objtype, a.objtype))
        if objtype == "pdarray":
            if dtype is None:
                dtype = a.dtype
            elif dtype != a.dtype:
                raise ValueError("All pdarrays must have same dtype")
            names.append(cast(pdarray,a).name)
        elif objtype == "str":
            # A Strings object is addressed by its two component arrays
            names.append('{}+{}'.format(cast(Strings,a).offsets.name,
                                        cast(Strings,a).bytes.name))
        else:
            raise NotImplementedError(("concatenate not implemented " +
                                    "for object type {}".format(objtype)))
        size += a.size
    if size == 0:
        # Nothing to concatenate: skip the server call
        if objtype == "pdarray":
            return zeros_like(cast(pdarray,arrays[0]))
        else:
            return arrays[0]
    repMsg = generic_msg("concatenate {} {} {}".\
                            format(len(cast(list,arrays)), objtype, ' '.join(names)))
    if objtype == "pdarray":
        return create_pdarray(cast(str,repMsg))
    elif objtype == "str":
        return Strings(*(cast(str,repMsg).split('+')))
    else:
        raise TypeError('arrays must be an array of pdarray or Strings objects')
示例#25
0
def array(a):
    """
    Convert an iterable to a pdarray, sending data to the arkouda server.

    Parameters
    ----------
    a : array_like
        Rank-1 array of a supported dtype. A pdarray is returned unchanged.

    Returns
    -------
    pdarray or Strings
        Instance of pdarray stored on arkouda server, or a Strings object
        built from two pdarrays when the input holds unicode strings

    Raises
    ------
    TypeError
        Raised if a cannot be converted to a numpy array
    RuntimeError
        Raised if a is not one-dimensional, if its dtype is not in DTypes,
        or if the data to transfer exceeds maxTransferBytes

    See Also
    --------
    pdarray.to_ndarray

    Notes
    -----
    The number of bytes in the input array cannot exceed `arkouda.maxTransferBytes`,
    otherwise a RuntimeError will be raised. This is to protect the user
    from overwhelming the connection between the Python client and the arkouda
    server, under the assumption that it is a low-bandwidth connection. The user
    may override this limit by setting ak.maxTransferBytes to a larger value,
    but should proceed with caution.

    String segments are sized by their encoded byte length, so text that
    encodes to more than one byte per character is laid out correctly.

    Examples
    --------
    >>> a = [3, 5, 7]
    >>> b = ak.array(a)
    >>> b
    array([3, 5, 7])

    >>> type(b)
    arkouda.pdarray
    """
    # If a is already a pdarray, do nothing
    if isinstance(a, pdarray):
        return a
    # If a is not already a numpy.ndarray, convert it
    if not isinstance(a, np.ndarray):
        try:
            a = np.array(a)
        except Exception as err:
            # Only conversion failures become TypeError; KeyboardInterrupt
            # and SystemExit are no longer swallowed by a bare except.
            raise TypeError("Argument must be array-like") from err
    # Only rank 1 arrays currently supported
    if a.ndim != 1:
        raise RuntimeError("Only rank-1 arrays supported")
    # Check if array of strings
    if a.dtype.kind == 'U':
        # Encode first: segment sizes must count bytes, not characters,
        # otherwise multi-byte text overruns its segment.
        encoded = [elem.encode() for elem in a]
        # Length of each string, plus null byte terminator. The explicit
        # dtype keeps an empty input from turning into a float array.
        lengths = np.array([len(elem) for elem in encoded],
                           dtype=np.int64) + 1
        # Compute zero-up segment offsets
        offsets = np.cumsum(lengths) - lengths
        # Total byte count; summing also works for an empty input, where
        # indexing offsets[-1] used to raise IndexError.
        nbytes = int(lengths.sum())
        if nbytes > maxTransferBytes:
            raise RuntimeError(
                "Creating pdarray would require transferring {} bytes, which exceeds allowed transfer size. Increase ak.maxTransferBytes to force."
                .format(nbytes))
        # Lay the null-terminated segments out back to back in one pass
        values = np.frombuffer(
            b"".join(segment + b"\x00" for segment in encoded),
            dtype=np.uint8)
        # Recurse to create pdarrays for offsets and values, then return Strings object
        return Strings(array(offsets), array(values))
    # If not strings, then check that dtype is supported in arkouda
    if a.dtype.name not in DTypes:
        raise RuntimeError("Unhandled dtype {}".format(a.dtype))
    # Do not allow arrays that are too large
    size = a.size
    if (size * a.itemsize) > maxTransferBytes:
        raise RuntimeError(
            "Array exceeds allowed transfer size. Increase ak.maxTransferBytes to allow"
        )
    # Pack binary array data into a bytes object with a command header
    # including the dtype and size; ">" packs the values big-endian
    fmt = ">{:n}{}".format(size, structDtypeCodes[a.dtype.name])
    req_msg = "array {} {:n} ".format(a.dtype.name,
                                      size).encode() + struct.pack(fmt, *a)
    rep_msg = generic_msg(req_msg, send_bytes=True)
    return create_pdarray(rep_msg)
示例#26
0
def concatenate(
    arrays: Sequence[Union[pdarray, Strings, 'Categorical']],  #type: ignore
    ordered: bool = True
) -> Union[pdarray, Strings, 'Categorical']:  #type: ignore
    """
    Concatenate a list or tuple of ``pdarray`` or ``Strings`` objects into
    one ``pdarray`` or ``Strings`` object, respectively.

    Parameters
    ----------
    arrays : Sequence[Union[pdarray,Strings,Categorical]]
        The arrays to concatenate. All elements must be of the same kind,
        and pdarrays must all have the same dtype.
    ordered : bool
        If True (default), the arrays will be appended in the
        order given. If False, array data may be interleaved
        in blocks, which can greatly improve performance but
        results in non-deterministic ordering of elements.

    Returns
    -------
    Union[pdarray,Strings,Categorical]
        Single object containing all values, in the original order when
        `ordered` is True. A one-element sequence returns that element
        unchanged. If every input is empty, a zero-filled copy of the
        first pdarray (or the first Strings object itself) is returned.

    Raises
    ------
    ValueError
        Raised if arrays is empty, if pdarrays have differing dtypes, or
        if pdarray and Strings objects are mixed
    TypeError
        Raised if an element is neither a pdarray nor a Strings object
        (and the first element has no ``concatenate`` method of its own)
    NotImplementedError
        Raised if the elements report an object type other than
        "pdarray" or "str"

    Notes
    -----
    When the first element exposes a ``concatenate`` method (e.g. a
    Categorical), the work is delegated to that method with the remaining
    elements.

    Examples
    --------
    >>> ak.concatenate([ak.array([1, 2, 3]), ak.array([4, 5, 6])])
    array([1, 2, 3, 4, 5, 6])

    >>> ak.concatenate([ak.array([True,False,True]),ak.array([False,True,True])])
    array([True, False, True, False, True, True])

    >>> ak.concatenate([ak.array(['one','two']),ak.array(['three','four','five'])])
    array(['one', 'two', 'three', 'four', 'five'])

    """
    from arkouda.categorical import Categorical as Categorical_
    size = 0
    objtype = None
    dtype = None
    names = []
    # Server-side concatenation mode
    mode = 'append' if ordered else 'interleave'
    if len(arrays) < 1:
        raise ValueError("concatenate called on empty iterable")
    if len(arrays) == 1:
        return cast(Union[pdarray, Strings, Categorical_], arrays[0])

    if hasattr(arrays[0], 'concatenate'):
        # Objects with their own concatenate() handle the remaining items
        return cast(Categorical_, arrays[0]).concatenate(
            cast(Sequence[Categorical_], arrays[1:]), ordered=ordered)
    for a in arrays:
        if not isinstance(a, (pdarray, Strings)):
            raise TypeError(("arrays must be an iterable of pdarrays"
                             " or Strings"))
        # Identity tests: `== None` would defer to the operand's own
        # equality (dtype objects define one), which is not what is meant.
        if objtype is None:
            objtype = a.objtype
        elif a.objtype != objtype:
            # Mixing pdarray and Strings used to fail later with an
            # unrelated error; report it up front.
            raise ValueError(
                "All arrays must have the same object type, found "
                "{} and {}".format(objtype, a.objtype))
        if objtype == "pdarray":
            if dtype is None:
                dtype = a.dtype
            elif dtype != a.dtype:
                raise ValueError("All pdarrays must have same dtype")
            names.append(cast(pdarray, a).name)
        elif objtype == "str":
            names.append('{}+{}'.format(
                cast(Strings, a).entry.name, "legacy_placeholder"))
        else:
            raise NotImplementedError(("concatenate not implemented " +
                                       "for object type {}".format(objtype)))
        size += a.size
    if size == 0:
        # Nothing to concatenate: skip the server call
        if objtype == "pdarray":
            return zeros_like(cast(pdarray, arrays[0]))
        else:
            return arrays[0]

    repMsg = generic_msg(cmd="concatenate", args="{} {} {} {}".\
                            format(len(arrays), objtype, mode, ' '.join(names)))
    if objtype == "pdarray":
        return create_pdarray(cast(str, repMsg))
    elif objtype == "str":
        # ConcatenateMsg returns created attrib(name)+created nbytes=123
        return Strings.from_return_msg(cast(str, repMsg))
    else:
        raise TypeError(
            'arrays must be an array of pdarray or Strings objects')
示例#27
0
def read_parquet(filenames : Union[str, List[str]],
                 dsetname : Union[str, List[str]]  = 'array',
                 strictTypes: bool=True, allow_errors:bool = False)\
             -> Union[pdarray, Strings, Mapping[str,Union[pdarray,Strings]]]:
    """
    Read one or more datasets from Parquet files into Arkouda objects.

    Parameters
    ----------
    filenames : list or str
        Either a list of filenames or a single string. A string is wrapped
        in a one-element list before being sent to the server.
    dsetname : list or str
        The name(s) of the dataset(s) to read. Defaults to 'array'.
    strictTypes: bool
        If True (default), require all dtypes in all files to have the
        same precision and sign. If False, allow dtypes of different
        precision and sign across different files. For example, if one
        file contains a uint32 dataset and another contains an int64
        dataset, the contents of both will be read into an int64 pdarray.
    allow_errors: bool
        Default False, if True will allow files with read errors to be skipped
        instead of failing. Any file errors reported by the server are
        surfaced as a RuntimeWarning.

    Returns
    -------
    pdarray, Strings, or dict
        A single pdarray or Strings object when the server returns one
        item, otherwise a dictionary of {dataset name: pdarray or Strings}

    Raises
    ------
    TypeError
        Raised if a returned item has an unknown arkouda type
    RuntimeError
        Raised if the server returns no items

    See Also
    --------
    read_hdf, get_datasets, ls_hdf, read_all, load, save

    Notes
    -----
    NOTE(review): expansion of shell expressions in ``filenames`` and the
    errors raised for missing datasets or unreadable files are not visible
    in this function - presumably produced server-side; confirm.
    """
    if isinstance(filenames, str):
        filenames = [filenames]
    if isinstance(dsetname, str):
        dsetname = [dsetname]

    rep_msg = generic_msg(
        cmd="readAllParquet",
        args=
        f"{strictTypes} {len(dsetname)} {len(filenames)} {allow_errors} {json.dumps(dsetname)} | {json.dumps(filenames)}"
    )
    rep = json.loads(
        rep_msg)  # See GenSymIO._buildReadAllHdfMsgJson for json structure
    items = rep.get("items", [])
    file_errors = rep.get("file_errors", [])

    if file_errors:
        # These were previously parsed and silently dropped; tell the
        # caller that some files could not be read.
        import warnings
        warnings.warn(
            "Errors were reported while reading files on the server. "
            f"Reported error messages: {file_errors}", RuntimeWarning)

    def _to_arkouda(item) -> Union[pdarray, Strings]:
        # Turn one reply item into the client-side object it describes.
        if "pdarray" == item["arkouda_type"]:
            return create_pdarray(item["created"])
        if "seg_string" == item["arkouda_type"]:
            return Strings(*item["created"].split("+"))
        raise TypeError(f"Unknown arkouda type:{item['arkouda_type']}")

    # We have a couple possible return conditions
    # 1. We have multiple items returned i.e. multi pdarrays
    # 2. We have a single pdarray
    # TODO: add support for a string objects in Parquet
    if len(items) > 1:  # DataSets condition
        d: Dict[str, Union[pdarray, Strings]] = {}
        for item in items:
            d[item["dataset_name"]] = _to_arkouda(item)
        return d
    if len(items) == 1:
        return _to_arkouda(items[0])
    raise RuntimeError("No items were returned")
示例#28
0
def read_all(filenames : Union[str,List[str]],
             datasets : Optional[Union[str,List[str]]]=None,
             iterative : bool=False,
             strictTypes: bool=True) \
             -> Union[pdarray, Strings, Mapping[str,Union[pdarray,Strings]]]:
    """
    Read datasets from HDF5 files.

    Parameters
    ----------
    filenames : list or str
        Either a list of filenames or shell expression
    datasets : list or str or None
        (List of) name(s) of dataset(s) to read (default: all available)
    iterative : bool
        Iterative (True) or Single (False) function call(s) to server
    strictTypes: bool
        If True (default), require all dtypes of a given dataset to have the
        same precision and sign. If False, allow dtypes of different
        precision and sign across different files. For example, if one
        file contains a uint32 dataset and another contains an int64
        dataset with the same name, the contents of both will be read
        into an int64 pdarray.

    Returns
    -------
    For a single dataset returns an Arkouda pdarray or Arkouda Strings object
    and for multiple datasets returns a dictionary of Arkouda pdarrays or
    Arkouda Strings.
        Dictionary of {datasetName: pdarray or String}
    When ``iterative`` is True a dictionary is always returned, even for a
    single dataset.

    Raises
    ------
    ValueError
        Raised if any explicitly requested dataset name (given as a string
        or as a list) is not reported by ``get_datasets`` for the first
        file. Only the first file is checked on the client side.

    See Also
    --------
    read_hdf, get_datasets, ls_hdf

    Notes
    -----
    If filenames is a string, it is wrapped in a one-element list and
    forwarded unchanged; it may be a shell expression (a single filename is
    a valid expression). Any glob expansion happens downstream of this
    function (presumably on the server).

    If iterative == True each dataset name and file names are passed to
    the server as independent sequential strings while if iterative == False
    all dataset names and file names are passed to the server in a single
    string.

    If datasets is None, infer the names of datasets from the first file
    and read all of them. Use ``get_datasets`` to show the names of datasets
    in HDF5 files.
    """
    if isinstance(filenames, str):
        filenames = [filenames]

    if datasets is None:
        # Names inferred from the first file exist there by construction,
        # so no second listing / validation round-trip is needed.
        datasets = get_datasets(filenames[0])
    else:
        # Normalise a single name to a list, then validate string and list
        # inputs alike against the first file.
        if isinstance(datasets, str):
            datasets = [datasets]
        nonexistent = set(datasets) - set(get_datasets(filenames[0]))
        if nonexistent:
            raise ValueError("Dataset(s) not found: {}".format(nonexistent))

    if iterative:  # iterative calls to server readhdf
        return {
            dset: read_hdf(dset, filenames, strictTypes=strictTypes)
            for dset in datasets
        }

    # Single call to server readAllHdf. Counts are formatted as plain
    # integers (not the locale-aware 'n' type) so the wire format can never
    # pick up thousands separators.
    rep_msg = generic_msg(
        f"readAllHdf {strictTypes} {len(datasets)} {len(filenames)} "
        f"{json.dumps(datasets)} | {json.dumps(filenames)}"
    )
    reply = cast(str, rep_msg)

    if ',' in reply:
        # Multiple datasets: one reply segment per requested dataset, in order.
        result: Dict[str, Union[pdarray, Strings]] = dict()
        for dset, segment in zip(datasets, reply.split(' , ')):
            if '+' in segment:  # a '+' joins the components of a Strings reply
                result[dset] = Strings(*segment.split('+'))
            else:
                result[dset] = create_pdarray(segment)
        return result
    if '+' in reply:
        return Strings(*reply.split('+'))
    return create_pdarray(reply)
示例#29
0
def read_all(filenames : Union[str, List[str]],
             datasets: Optional[Union[str, List[str]]] = None,
             iterative: bool = False,
             strictTypes: bool = True,
             allow_errors: bool = False,
             calc_string_offsets: bool = False)\
             -> Union[pdarray, Strings, Mapping[str,Union[pdarray,Strings]]]:
    """
    Read datasets from HDF5 files.

    Parameters
    ----------
    filenames : list or str
        Either a list of filenames or shell expression
    datasets : list or str or None
        (List of) name(s) of dataset(s) to read (default: all available)
    iterative : bool
        Iterative (True) or Single (False) function call(s) to server
    strictTypes: bool
        If True (default), require all dtypes of a given dataset to have the
        same precision and sign. If False, allow dtypes of different
        precision and sign across different files. For example, if one
        file contains a uint32 dataset and another contains an int64
        dataset with the same name, the contents of both will be read
        into an int64 pdarray.
    allow_errors: bool
        Default False, if True will allow files with read errors to be skipped
        instead of failing.  A RuntimeWarning is issued containing the number
        of file errors and the sample error messages reported by the server.
    calc_string_offsets: bool
        Default False, if True this will tell the server to calculate the
        offsets/segments array on the server versus loading them from HDF5 files.
        In the future this option may be set to True as the default.

    Returns
    -------
    For a single dataset returns an Arkouda pdarray or Arkouda Strings object
    and for multiple datasets returns a dictionary of Arkouda pdarrays or
    Arkouda Strings.
        Dictionary of {datasetName: pdarray or String}
    When ``iterative`` is True a dictionary is always returned, even for a
    single dataset.

    Raises
    ------
    ValueError
        Raised if any explicitly requested dataset name (given as a string
        or as a list) is not among the names reported by ``get_datasets``
        for the first file, or by ``get_datasets_allow_errors`` for the file
        list when `allow_errors` is True.
    RuntimeError
        Raised if the server reply contains no items. Failures to open files
        are presumably also surfaced by the server call; that is not visible
        in this function.
    TypeError
        Raised if we receive an unknown arkouda_type returned from the server

    See Also
    --------
    read_hdf, get_datasets, ls_hdf

    Notes
    -----
    If filenames is a string, it is wrapped in a one-element list and
    forwarded unchanged; it may be a shell expression (a single filename is
    a valid expression). Any glob expansion happens downstream of this
    function (presumably on the server).

    If iterative == True each dataset name and file names are passed to
    the server as independent sequential strings while if iterative == False
    all dataset names and file names are passed to the server in a single
    string.

    If datasets is None, infer the names of datasets from the first file
    (or from the file list when `allow_errors` is True) and read all of
    them. Use ``get_datasets`` to show the names of datasets in HDF5 files.
    """
    if isinstance(filenames, str):
        filenames = [filenames]

    if datasets is None:
        # Inferred names exist by construction, so they need no validation
        # (and no second listing round-trip).
        datasets = get_datasets_allow_errors(
            filenames) if allow_errors else get_datasets(filenames[0])
    else:
        # Normalise a single name to a list, then validate string and list
        # inputs alike against the names the files actually expose.
        if isinstance(datasets, str):
            datasets = [datasets]
        available = get_datasets_allow_errors(
            filenames) if allow_errors else get_datasets(filenames[0])
        nonexistent = set(datasets) - set(available)
        if nonexistent:
            raise ValueError("Dataset(s) not found: {}".format(nonexistent))

    if iterative:  # iterative calls to server readhdf
        return {
            dset: read_hdf(dset,
                           filenames,
                           strictTypes=strictTypes,
                           allow_errors=allow_errors,
                           calc_string_offsets=calc_string_offsets)
            for dset in datasets
        }

    # Single call to server readAllHdf
    rep_msg = generic_msg(
        cmd="readAllHdf",
        args=
        f"{strictTypes} {len(datasets)} {len(filenames)} {allow_errors} {calc_string_offsets} {json.dumps(datasets)} | {json.dumps(filenames)}"
    )
    # See GenSymIO._buildReadAllHdfMsgJson for json structure
    rep = json.loads(rep_msg)
    items = rep.get("items", [])
    file_errors = rep.get("file_errors", [])
    if allow_errors and file_errors:
        # -1 marks "count not supplied by the server"
        file_error_count = rep.get("file_error_count", -1)
        warnings.warn(
            f"There were {file_error_count} errors reading files on the server. "
            + f"Sample error messages {file_errors}", RuntimeWarning)

    def _to_arkouda(item) -> Union[pdarray, Strings]:
        # Convert one reply item into the client-side object it describes.
        kind = item["arkouda_type"]
        if kind == "seg_string":
            return Strings.from_return_msg(item["created"])
        if kind == "pdarray":
            return create_pdarray(item["created"])
        raise TypeError(f"Unknown arkouda type:{kind}")

    # Possible return conditions:
    # 1. Multiple items (any mix of pdarrays and strings) -> dictionary
    # 2. A single pdarray or a single strings object -> the bare object
    # 3. Nothing came back -> error
    if len(items) > 1:
        return {item["dataset_name"]: _to_arkouda(item) for item in items}
    if len(items) == 1:
        return _to_arkouda(items[0])
    raise RuntimeError("No items were returned")