示例#1
0
def str_slice(x, start=0, stop=None):  # TODO: support n
    """Extract a substring from every string element of a column.

    :param int start: The position at which each slice begins.
    :param int stop: The position at which each slice ends. When None, no stop
        position is passed and the open-ended slice routine is used instead.
    :returns: an expression containing the sliced substrings.

    Example:

    >>> import vaex
    >>> text = ['Something', 'very pretty', 'is coming', 'our', 'way.']
    >>> df = vaex.from_arrays(text=text)
    >>> df
      #  text
      0  Something
      1  very pretty
      2  is coming
      3  our
      4  way.

    >>> df.text.str.slice(start=2, stop=5)
    Expression = str_pandas_slice(text, start=2, stop=5)
    Length: 5 dtype: str (expression)
    ---------------------------------
    0  met
    1   ry
    2   co
    3    r
    4   y.
    """
    sequence = _to_string_sequence(x)
    if stop is not None:
        return sequence.slice_string(start, stop)
    # No upper bound requested: slice from `start` onwards.
    return sequence.slice_string_end(start)
示例#2
0
def str_cat(x, other):
    """Join two string columns element by element.

    :param expression other: The expression of the column whose strings are appended to those of `x`.
    :returns: an expression containing the concatenated columns.

    Example:

    >>> import vaex
    >>> text = ['Something', 'very pretty', 'is coming', 'our', 'way.']
    >>> df = vaex.from_arrays(text=text)
    >>> df
      #  text
      0  Something
      1  very pretty
      2  is coming
      3  our
      4  way.

    >>> df.text.str.cat(df.text)
    Expression = str_cat(text, text)
    Length: 5 dtype: str (expression)
    ---------------------------------
    0      SomethingSomething
    1  very prettyvery pretty
    2      is comingis coming
    3                  ourour
    4                way.way.
    """
    left = _to_string_sequence(x)
    right = _to_string_sequence(other)
    return column.ColumnStringArrow.from_string_sequence(left.concat(right))
示例#3
0
文件: cpu.py 项目: stjordanis/vaex
 def process(self, thread_index, i1, i2, filter_mask, ar):
     """Add the values of one chunk to this task's set and enforce the unique limit.

     The set is created lazily on the first call. `thread_index` and
     `filter_mask` are accepted but not used in this body.

     :param i1: Start row of the chunk, forwarded to the selection mask evaluation.
     :param i2: End row of the chunk, forwarded to the selection mask evaluation.
     :param ar: The chunk of values to add.
     :raises vaex.RowLimitException: if `self.unique_limit` is set and the set's count exceeds it.
     """
     from vaex.column import _to_string_sequence
     if self.set is None:
         self.set = self.ordered_set_type()
     if self.selection:
         selection_mask = self.df.evaluate_selection_mask(
             self.selection, i1=i1, i2=i2, cache=True)
         # NOTE(review): `filter` is whatever this module has in scope; presumably a
         # vaex helper that applies a boolean mask, not the builtin -- confirm.
         ar = filter(ar, selection_mask)
     if self.dtype.is_list and self.flatten:
         ar = ar.values
     ar = _to_string_sequence(ar) if self.dtype_item.is_string else vaex.array_types.to_numpy(ar)
     if np.ma.isMaskedArray(ar):
         self.set.update(ar, np.ma.getmaskarray(ar))
     else:
         self.set.update(ar)
     limit = self.unique_limit
     if limit is None:
         return
     # we skip null and nan here, since this is just an early bail out
     if self.set.count > limit:
         raise vaex.RowLimitException(
             f'Resulting set would have >= {self.unique_limit} unique combinations'
         )
示例#4
0
def str_isspace(x):
    """Test whether each string sample consists solely of whitespace characters.

    :returns: an expression evaluated to True if a sample contains only whitespaces, otherwise False.

    Example:

    >>> import vaex
    >>> text = ['Something', 'very pretty', 'is coming', '      ', ' ']
    >>> df = vaex.from_arrays(text=text)
    >>> df
      #  text
      0  Something
      1  very pretty
      2  is coming
      3
      4

    >>> df.text.str.isspace()
    Expression = str_isspace(text)
    Length: 5 dtype: bool (expression)
    ----------------------------------
    0  False
    1  False
    2  False
    3   True
    4   True
    """
    sequence = _to_string_sequence(x)
    return sequence.isspace()
示例#5
0
def str_isalpha(x):
    """Test whether each string sample consists solely of alphabetic characters.

    :returns: an expression evaluated to True if a sample contains only alphabetic characters, otherwise False.

    Example:

    >>> import vaex
    >>> text = ['Something', 'very pretty', 'is coming', 'our', 'way.']
    >>> df = vaex.from_arrays(text=text)
    >>> df
      #  text
      0  Something
      1  very pretty
      2  is coming
      3  our
      4  way.

    >>> df.text.str.isalpha()
    Expression = str_isalpha(text)
    Length: 5 dtype: bool (expression)
    ----------------------------------
    0   True
    1  False
    2  False
    3   True
    4  False
    """
    sequence = _to_string_sequence(x)
    return sequence.isalpha()
示例#6
0
def str_zfill(x, width):
    """Pad strings in a column by prepending "0" characters.

    :param int width: The minimum length of the resulting string. Strings shorter than `width` are prepended with zeros.
    :returns: an expression containing the modified strings.

    Example:

    >>> import vaex
    >>> text = ['Something', 'very pretty', 'is coming', 'our', 'way.']
    >>> df = vaex.from_arrays(text=text)
    >>> df
      #  text
      0  Something
      1  very pretty
      2  is coming
      3  our
      4  way.

    >>> df.text.str.zfill(width=12)
    Expression = str_zfill(text, width=12)
    Length: 5 dtype: str (expression)
    ---------------------------------
    0  000Something
    1  0very pretty
    2  000is coming
    3  000000000our
    4  00000000way.
    """
    # pad(width, fillchar, pad_left, pad_right): fill with '0' on the left only.
    padded = _to_string_sequence(x).pad(width, '0', True, False)
    return column.ColumnStringArrow(
        padded.bytes,
        padded.indices,
        padded.length,
        padded.offset,
        string_sequence=padded,
    )
示例#7
0
def str_upper(x):
    """Convert every string in a column to uppercase.

    :returns: an expression containing the converted strings.

    Example:

    >>> import vaex
    >>> text = ['Something', 'very pretty', 'is coming', 'our', 'way.']
    >>> df = vaex.from_arrays(text=text)
    >>> df
      #  text
      0  Something
      1  very pretty
      2  is coming
      3  our
      4  way.

    >>> df.text.str.upper()
    Expression = str_upper(text)
    Length: 5 dtype: str (expression)
    ---------------------------------
    0    SOMETHING
    1  VERY PRETTY
    2    IS COMING
    3          OUR
    4         WAY.
    """
    uppercased = _to_string_sequence(x).upper()
    return column.ColumnStringArrow(
        uppercased.bytes,
        uppercased.indices,
        uppercased.length,
        uppercased.offset,
        string_sequence=uppercased,
    )
示例#8
0
def str_startswith(x, pat):
    """Test whether each string sample begins with the given pattern.

    :param str pat: A string pattern. Regular expressions are not supported.
    :returns: an expression which is evaluated to True if the pattern is found at the start of a string sample, False otherwise.

    Example:

    >>> import vaex
    >>> text = ['Something', 'very pretty', 'is coming', 'our', 'way.']
    >>> df = vaex.from_arrays(text=text)
    >>> df
      #  text
      0  Something
      1  very pretty
      2  is coming
      3  our
      4  way.

    >>> df.text.str.startswith(pat='is')
    Expression = str_startswith(text, pat='is')
    Length: 5 dtype: bool (expression)
    ----------------------------------
    0  False
    1  False
    2   True
    3  False
    4  False
    """
    sequence = _to_string_sequence(x)
    return sequence.startswith(pat)
示例#9
0
def str_replace(x, pat, repl, n=-1, flags=0, regex=False):
    """Replace occurrences of a pattern/regex in a column with some other string.

    :param str pat: string or a regex pattern
    :param str repl: a replacement string
    :param int n: number of replacements to be made from the start. If -1 make all replacements.
    :param int flags: forwarded unchanged to the underlying replace routine; presumably regex flags -- TODO confirm.
    :param bool regex: forwarded unchanged to the underlying replace routine; presumably selects whether
        `pat` is treated as a regular expression -- TODO confirm.
    :returns: an expression containing the string replacements.

    Example:

    >>> import vaex
    >>> text = ['Something', 'very pretty', 'is coming', 'our', 'way.']
    >>> df = vaex.from_arrays(text=text)
    >>> df
      #  text
      0  Something
      1  very pretty
      2  is coming
      3  our
      4  way.

    >>> df.text.str.replace(pat='et', repl='__')
    Expression = str_replace(text, pat='et', repl='__')
    Length: 5 dtype: str (expression)
    ---------------------------------
    0    Som__hing
    1  very pr__ty
    2    is coming
    3          our
    4         way.
    """
    sequence = _to_string_sequence(x)
    replaced = sequence.replace(pat, repl, n, flags, regex)
    return column.ColumnStringArrow(
        replaced.bytes,
        replaced.indices,
        replaced.length,
        replaced.offset,
        string_sequence=replaced,
    )
示例#10
0
def str_repeat(x, repeats):
    """Repeat each string in a column a given number of times.

    :param int repeats: number of times each string sample is to be duplicated.
    :returns: an expression containing the duplicated strings

    Example:

    >>> import vaex
    >>> text = ['Something', 'very pretty', 'is coming', 'our', 'way.']
    >>> df = vaex.from_arrays(text=text)
    >>> df
      #  text
      0  Something
      1  very pretty
      2  is coming
      3  our
      4  way.

    >>> df.text.str.repeat(3)
    Expression = str_repeat(text, 3)
    Length: 5 dtype: str (expression)
    ---------------------------------
    0        SomethingSomethingSomething
    1  very prettyvery prettyvery pretty
    2        is comingis comingis coming
    3                          ourourour
    4                       way.way.way.
    """
    repeated = _to_string_sequence(x).repeat(repeats)
    return column.ColumnStringArrow(
        repeated.bytes,
        repeated.indices,
        repeated.length,
        repeated.offset,
        string_sequence=repeated,
    )
示例#11
0
def str_pad(x, width, side='left', fillchar=' '):
    """Pad the strings of a column up to a given width.

    :param int width: The total width of the string
    :param str side: 'left' pads on the left, 'right' pads on the right, 'both' pads on both sides.
        Any other value enables neither side.
    :param str fillchar: The character used for padding.
    :returns: an expression containing the padded strings.

    Example:

    >>> import vaex
    >>> text = ['Something', 'very pretty', 'is coming', 'our', 'way.']
    >>> df = vaex.from_arrays(text=text)
    >>> df
      #  text
      0  Something
      1  very pretty
      2  is coming
      3  our
      4  way.

    >>> df.text.str.pad(width=10, side='left', fillchar='!')
    Expression = str_pad(text, width=10, side='left', fillchar='!')
    Length: 5 dtype: str (expression)
    ---------------------------------
    0   !Something
    1  very pretty
    2   !is coming
    3   !!!!!!!our
    4   !!!!!!way.
    """
    pad_left = side in ['left', 'both']
    pad_right = side in ['right', 'both']
    padded = _to_string_sequence(x).pad(width, fillchar, pad_left, pad_right)
    return column.ColumnStringArrow(
        padded.bytes,
        padded.indices,
        padded.length,
        padded.offset,
        string_sequence=padded,
    )
示例#12
0
def str_capitalize(x):
    """Capitalize the first letter of each string sample.

    :returns: an expression containing the capitalized strings.

    Example:

    >>> import vaex
    >>> text = ['Something', 'very pretty', 'is coming', 'our', 'way.']
    >>> df = vaex.from_arrays(text=text)
    >>> df
      #  text
      0  Something
      1  very pretty
      2  is coming
      3  our
      4  way.

    >>> df.text.str.capitalize()
    Expression = str_capitalize(text)
    Length: 5 dtype: str (expression)
    ---------------------------------
    0    Something
    1  Very pretty
    2    Is coming
    3          Our
    4         Way.
    """
    capitalized = _to_string_sequence(x).capitalize()
    return column.ColumnStringArrow(
        capitalized.bytes,
        capitalized.indices,
        capitalized.length,
        capitalized.offset,
        string_sequence=capitalized,
    )
示例#13
0
def str_isupper(x):
    """Check if all characters in a string sample are uppercase characters.

    :returns: an expression evaluated to True if a sample contains only uppercase characters, otherwise False.

    Example:

    >>> import vaex
    >>> text = ['SOMETHING', 'very pretty', 'is coming', 'our', 'way.']
    >>> df = vaex.from_arrays(text=text)
    >>> df
      #  text
      0  SOMETHING
      1  very pretty
      2  is coming
      3  our
      4  way.

    >>> df.text.str.isupper()
    Expression = str_isupper(text)
    Length: 5 dtype: bool (expression)
    ----------------------------------
    0   True
    1  False
    2  False
    3  False
    4  False
    """
    sequence = _to_string_sequence(x)
    return sequence.isupper()
示例#14
0
def str_contains(x, pattern, regex=True):
    """Test whether a string pattern or regex occurs within each sample of a string column.

    :param str pattern: A string or regex pattern
    :param bool regex: Forwarded unchanged to the underlying search routine; presumably selects
        whether `pattern` is interpreted as a regular expression -- TODO confirm.
    :returns: an expression which is evaluated to True if the pattern is found in a given sample, and it is False otherwise.

    Example:

    >>> import vaex
    >>> text = ['Something', 'very pretty', 'is coming', 'our', 'way.']
    >>> df = vaex.from_arrays(text=text)
    >>> df
      #  text
      0  Something
      1  very pretty
      2  is coming
      3  our
      4  way.

    >>> df.text.str.contains('very')
    Expression = str_contains(text, 'very')
    Length: 5 dtype: bool (expression)
    ----------------------------------
    0  False
    1   True
    2  False
    3  False
    4  False
    """
    sequence = _to_string_sequence(x)
    return sequence.search(pattern, regex)
示例#15
0
def str_lstrip(x, to_strip=None):
    """Remove leading characters from a string sample.

    :param str to_strip: The string to be removed. None is forwarded to the underlying
        routine as an empty string, which it treats as "no explicit value"
        (presumably: strip whitespace -- TODO confirm). An explicit empty string
        means nothing is stripped and the strings are returned unchanged.
    :returns: an expression containing the modified string column.

    Example:

    >>> import vaex
    >>> text = ['Something', 'very pretty', 'is coming', 'our', 'way.']
    >>> df = vaex.from_arrays(text=text)
    >>> df
      #  text
      0  Something
      1  very pretty
      2  is coming
      3  our
      4  way.

    >>> df.text.str.lstrip(to_strip='very ')
    Expression = str_lstrip(text, to_strip='very ')
    Length: 5 dtype: str (expression)
    ---------------------------------
    0  Something
    1     pretty
    2  is coming
    3        our
    4       way.
    """
    sequence = _to_string_sequence(x)
    if to_strip == '':
        # Nothing to strip. Reuse the converted sequence rather than the raw
        # input `x`: the column constructor below needs a string sequence
        # (bytes/indices/length/offset), which `x` itself is not guaranteed to be.
        # NOTE(review): assumes the sequence returned by _to_string_sequence exposes
        # the same attributes as the sequences its methods return -- confirm for
        # non-arrow inputs.
        sl = sequence
    else:
        # in c++ we give empty string the same meaning as None
        sl = sequence.lstrip('' if to_strip is None else to_strip)
    return column.ColumnStringArrow(sl.bytes, sl.indices, sl.length, sl.offset, string_sequence=sl)
示例#16
0
def str_ljust(x, width, fillchar=' '):
    """Fills the right side of string samples with a specified character such that the strings are left-hand justified.

    :param int width: The minimal width of the strings.
    :param str fillchar: The character used for filling.
    :returns: an expression containing the filled strings.

    Example:

    >>> import vaex
    >>> text = ['Something', 'very pretty', 'is coming', 'our', 'way.']
    >>> df = vaex.from_arrays(text=text)
    >>> df
      #  text
      0  Something
      1  very pretty
      2  is coming
      3  our
      4  way.

    >>> df.text.str.ljust(width=10, fillchar='!')
    Expression = str_ljust(text, width=10, fillchar='!')
    Length: 5 dtype: str (expression)
    ---------------------------------
    0   Something!
    1  very pretty
    2   is coming!
    3   our!!!!!!!
    4   way.!!!!!!
    """
    # pad(width, fillchar, pad_left, pad_right): fill on the right only.
    justified = _to_string_sequence(x).pad(width, fillchar, False, True)
    return column.ColumnStringArrow(
        justified.bytes,
        justified.indices,
        justified.length,
        justified.offset,
        string_sequence=justified,
    )
示例#17
0
def str_lower(x):
    """Convert every string sample to lowercase.

    :returns: an expression containing the converted strings.

    Example:

    >>> import vaex
    >>> text = ['Something', 'very pretty', 'is coming', 'our', 'way.']
    >>> df = vaex.from_arrays(text=text)
    >>> df
      #  text
      0  Something
      1  very pretty
      2  is coming
      3  our
      4  way.

    >>> df.text.str.lower()
    Expression = str_lower(text)
    Length: 5 dtype: str (expression)
    ---------------------------------
    0    something
    1  very pretty
    2    is coming
    3          our
    4         way.
    """
    lowered = _to_string_sequence(x).lower()
    return column.ColumnStringArrow(
        lowered.bytes,
        lowered.indices,
        lowered.length,
        lowered.offset,
        string_sequence=lowered,
    )
示例#18
0
def str_find(x, sub, start=0, end=None):
    """Return, for each string in a column, the lowest index at which the provided substring is
    found. If the substring is not found, -1 is returned.

    :param str sub: A substring to be found in the samples
    :param int start: Forwarded to the underlying find routine; presumably the position where the
        search begins -- TODO confirm.
    :param int end: Forwarded to the underlying find routine; presumably the position where the
        search stops. When None, 0 is passed together with a flag saying no end was given.
    :returns: an expression containing the lowest indices specifying the start of the substring.

    Example:

    >>> import vaex
    >>> text = ['Something', 'very pretty', 'is coming', 'our', 'way.']
    >>> df = vaex.from_arrays(text=text)
    >>> df
      #  text
      0  Something
      1  very pretty
      2  is coming
      3  our
      4  way.

    >>> df.text.str.find(sub="et")
    Expression = str_find(text, sub='et')
    Length: 5 dtype: int64 (expression)
    -----------------------------------
    0   3
    1   7
    2  -1
    3  -1
    4  -1
    """
    no_end = end is None
    end_position = 0 if no_end else end
    sequence = _to_string_sequence(x)
    return sequence.find(sub, start, end_position, no_end, True)
示例#19
0
def str_byte_length(x):
    """Return the number of bytes in each string sample.

    :returns: an expression containing the number of bytes in each sample of a string column.

    Example:

    >>> import vaex
    >>> text = ['Something', 'very pretty', 'is coming', 'our', 'way.']
    >>> df = vaex.from_arrays(text=text)
    >>> df
      #  text
      0  Something
      1  very pretty
      2  is coming
      3  our
      4  way.

    >>> df.text.str.byte_length()
    Expression = str_byte_length(text)
    Length: 5 dtype: int64 (expression)
    -----------------------------------
    0   9
    1  11
    2   9
    3   3
    4   4
    """
    sequence = _to_string_sequence(x)
    return sequence.byte_length()
示例#20
0
def str_endswith(x, pat):
    """Test whether each string sample ends with the specified pattern.

    :param str pat: A string pattern. NOTE(review): the previous documentation also mentioned
        regex support; that is not evident from this wrapper -- confirm.
    :returns: an expression evaluated to True if the pattern is found at the end of a given sample, False otherwise.

    Example:

    >>> import vaex
    >>> text = ['Something', 'very pretty', 'is coming', 'our', 'way.']
    >>> df = vaex.from_arrays(text=text)
    >>> df
      #  text
      0  Something
      1  very pretty
      2  is coming
      3  our
      4  way.

    >>> df.text.str.endswith(pat="ing")
    Expression = str_endswith(text, pat='ing')
    Length: 5 dtype: bool (expression)
    ----------------------------------
    0   True
    1  False
    2   True
    3  False
    4  False
    """
    sequence = _to_string_sequence(x)
    return sequence.endswith(pat)
示例#21
0
def str_count(x, pat, regex=False):
    """Count the occurrences of a pattern in each sample of a string column.

    :param str pat: A string or regex pattern
    :param bool regex: Forwarded unchanged to the underlying count routine; presumably selects
        whether `pat` is interpreted as a regular expression -- TODO confirm.
    :returns: an expression containing the number of times a pattern is found in each sample.

    Example:

    >>> import vaex
    >>> text = ['Something', 'very pretty', 'is coming', 'our', 'way.']
    >>> df = vaex.from_arrays(text=text)
    >>> df
      #  text
      0  Something
      1  very pretty
      2  is coming
      3  our
      4  way.

    >>> df.text.str.count(pat="et", regex=False)
    Expression = str_count(text, pat='et', regex=False)
    Length: 5 dtype: int64 (expression)
    -----------------------------------
    0  1
    1  1
    2  0
    3  0
    4  0
    """
    sequence = _to_string_sequence(x)
    return sequence.count(pat, regex)
示例#22
0
def str_match(x, pattern):
    """Test whether each string sample matches a given regular expression.

    :param str pattern: a string or regex to match to a string sample.
    :returns: an expression which is evaluated to True if a match is found, False otherwise.

    Example:

    >>> import vaex
    >>> text = ['Something', 'very pretty', 'is coming', 'our', 'way.']
    >>> df = vaex.from_arrays(text=text)
    >>> df
      #  text
      0  Something
      1  very pretty
      2  is coming
      3  our
      4  way.

    >>> df.text.str.match(pattern='our')
    Expression = str_match(text, pattern='our')
    Length: 5 dtype: bool (expression)
    ----------------------------------
    0  False
    1  False
    2  False
    3   True
    4  False
    """
    sequence = _to_string_sequence(x)
    return sequence.match(pattern)
示例#23
0
def str_center(x, width, fillchar=' '):
    """Fill both the left and right side of the strings with additional characters, such that
    the sample has a total of `width` characters.

    :param int width: The total number of characters of the resulting string sample.
    :param str fillchar: The character used for filling.
    :returns: an expression containing the filled strings.

    Example:

    >>> import vaex
    >>> text = ['Something', 'very pretty', 'is coming', 'our', 'way.']
    >>> df = vaex.from_arrays(text=text)
    >>> df
      #  text
      0  Something
      1  very pretty
      2  is coming
      3  our
      4  way.

    >>> df.text.str.center(width=11, fillchar='!')
    Expression = str_center(text, width=11, fillchar='!')
    Length: 5 dtype: str (expression)
    ---------------------------------
    0  !Something!
    1  very pretty
    2  !is coming!
    3  !!!!our!!!!
    4  !!!!way.!!!
    """
    # pad(width, fillchar, pad_left, pad_right): fill on both sides.
    centered = _to_string_sequence(x).pad(width, fillchar, True, True)
    return column.ColumnStringArrow(
        centered.bytes,
        centered.indices,
        centered.length,
        centered.offset,
        string_sequence=centered,
    )
示例#24
0
def str_split(x, pattern=None):  # TODO: support n
    """Split each string of a column on a separator.

    :param str pattern: The separator to split on. None is forwarded to the underlying
        split routine as an empty string (presumably meaning "split on whitespace" --
        TODO confirm). An explicit empty string is rejected.
    :returns: the result of the underlying string sequence's `split` call.
    :raises ValueError: if `pattern` is an empty string.
    """
    sequence = _to_string_sequence(x)
    # StringArray sequences are converted to their arrow form before splitting.
    if isinstance(sequence, vaex.strings.StringArray):
        sequence = sequence.to_arrow()
    if pattern == '':
        raise ValueError('empty separator')
    separator = pattern if pattern is not None else ''
    return sequence.split(separator)
示例#25
0
    def process(self, thread_index, i1, i2, filter_mask, ar):
        """Add the values of one chunk to `self.set`, optionally recording the inverse mapping.

        The row limit is checked (via `self._check_row_limit()`) on entry, just
        before the set is updated, and again at the end. `thread_index` and
        `filter_mask` are accepted but not used in this body.

        :param i1: Start row of the chunk; used for the selection mask and stored with the chunk results.
        :param i2: End row of the chunk; used for the selection mask and stored with the chunk results.
        :param ar: The chunk of values to add. Nothing is done when it is empty (after applying the selection).
        """
        from vaex.column import _to_string_sequence
        self._check_row_limit()
        if self.selection:
            selection_mask = self.df.evaluate_selection_mask(self.selection,
                                                             i1=i1,
                                                             i2=i2,
                                                             cache=True)
            # NOTE(review): `filter` is whatever this module has in scope; presumably a
            # vaex helper that applies a boolean mask, not the builtin -- confirm.
            ar = filter(ar, selection_mask)
        if len(ar) == 0:
            return
        if self.dtype.is_list and self.flatten:
            ar = ar.values
        if self.dtype_item.is_string:
            ar = _to_string_sequence(ar)
        else:
            ar = vaex.array_types.to_numpy(ar)
            # NOTE(review): numpy strides are in bytes, so this equals (1,) only for
            # 1-byte item types; every other dtype is copied here -- confirm intended.
            if ar.strides != (1, ):
                ar = ar.copy()
        chunk_size = 1024 * 1024
        sizes = dict(chunk_size=chunk_size, bucket_size=chunk_size * 4)

        self._check_row_limit()
        # Masked arrays pass their mask as an extra positional argument; the
        # trailing -1 is passed in both cases.
        if np.ma.isMaskedArray(ar):
            update_args = (ar, np.ma.getmaskarray(ar), -1)
        else:
            update_args = (ar, -1)
        if self.return_inverse:
            values, map_index = self.set.update(
                *update_args, **sizes, return_values=self.return_inverse)
            self.chunks.append((i1, i2, values, map_index))
        else:
            self.set.update(*update_args, **sizes)
        # isEnabledFor honours the effective (inherited) level. Comparing
        # `logger.level >= logging.DEBUG` was true for INFO/WARNING loggers and
        # false for an unset logger that inherits DEBUG from its parent.
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(
                f"set uses {sys.getsizeof(self.set):,} bytes (offset {i1:,}, length {i2-i1:,})"
            )
        self._check_row_limit()
示例#26
0
文件: writer.py 项目: t-triobox/vaex
 def write(self, values):
     """Append a batch of strings to the output string column.

     The batch is copied into the slice of `self.to_array.string_sequence`
     starting at `self.to_offset`; the row offset and the running byte offset
     are then advanced. Once `self.to_offset` equals `self.count`, the final
     entry of the indices array is set to the running byte offset.

     :param values: The strings to append; an empty batch copies nothing.
     """
     batch_length = len(values)
     if batch_length:
         source = _to_string_sequence(values)
         first_row = self.to_offset
         target = self.to_array.string_sequence.slice(
             first_row, first_row + batch_length, self.string_byte_offset)
         self.string_byte_offset += target.fill_from(source)
         self.to_offset += batch_length
     if self.to_offset == self.count:
         # last offset
         self.to_array.indices[self.count] = self.string_byte_offset
示例#27
0
文件: hash.py 项目: t-triobox/vaex
 def isin(self, values):
     """Test membership of `values` via the internal structure's `isin`.

     String-like input (or any input when this object's item dtype is `str`)
     is converted to a string sequence first. For masked arrays the lookup
     runs on the underlying data and masked positions are reported as False.

     :param values: The values to look up.
     :returns: the result of `self._internal.isin`, with masked positions set to False.
     """
     if vaex.column._is_stringy(values) or self.dtype_item == str:
         string_values = _to_string_sequence(vaex.column._to_string_column(values))
         return self._internal.isin(string_values)
     if not np.ma.isMaskedArray(values):
         return self._internal.isin(values)
     found = self._internal.isin(values.data)
     found[values.mask] = False
     return found
示例#28
0
 def map(thread_index, i1, i2, ar):
     """Feed one chunk of values into the counter belonging to `thread_index`.

     The per-thread counter is created on first use. String chunks are
     converted to a string sequence before counting; `i1` and `i2` are
     accepted but not used in this body.

     :returns: always 0.
     """
     counter = counters[thread_index]
     if counter is None:
         counter = counters[thread_index] = counter_type()
     if dtype == str_type:
         original = ar
         ar = _to_string_sequence(original)
         if not transient:
             # for non-transient data the conversion must hand back the existing sequence
             assert ar is original.string_sequence
     if np.ma.isMaskedArray(ar):
         counter.update(ar, np.ma.getmaskarray(ar))
     else:
         counter.update(ar)
     return 0
 def map(thread_index, i1, i2, ar):
     """Feed one chunk of values into the counter belonging to `thread_index`.

     The per-thread counter is created on first use. String chunks are
     converted to a string sequence before counting; `i1` and `i2` are
     accepted but not used in this body.

     :returns: always 0.
     """
     counter = counters[thread_index]
     if counter is None:
         counter = counters[thread_index] = counter_type()
     if dtype == str_type:
         original = ar
         ar = _to_string_sequence(original)
         if not transient:
             # for non-transient data the conversion must hand back the existing sequence
             assert ar is original.string_sequence
     if np.ma.isMaskedArray(ar):
         counter.update(ar, np.ma.getmaskarray(ar))
     else:
         counter.update(ar)
     return 0
示例#30
0
 def map(thread_index, i1, i2, ar):
     """Feed one chunk of values into the counter belonging to `thread_index`.

     The per-thread counter is created on first use. List-typed chunks are
     replaced by their `.values` when flattening is requested; string items
     are converted to a string sequence, everything else to numpy. `i1` and
     `i2` are accepted but not used in this body.

     :returns: always 0.
     """
     counter = counters[thread_index]
     if counter is None:
         counter = counters[thread_index] = counter_type()
     if data_type.is_list and flatten:
         ar = ar.values
     ar = _to_string_sequence(ar) if data_type_item.is_string else vaex.array_types.to_numpy(ar)
     if np.ma.isMaskedArray(ar):
         counter.update(ar, np.ma.getmaskarray(ar))
     else:
         counter.update(ar)
     return 0
示例#31
0
文件: hash.py 项目: t-triobox/vaex
    def map(self, keys, check_missing=False):
        '''Map key values to unique integers.

        :param keys: The keys to look up. They are converted to a string sequence when they are
            not one of the supported array types or when this object's dtype is `str`;
            otherwise they are converted to numpy.
        :param bool check_missing: When True, the result is wrapped in a masked array that
            masks every index equal to -1.
        :returns: the indices from the internal `map_ordinal` lookup; positions that are masked
            in `keys` are set to `self.null_value`.
        '''
        from vaex.column import _to_string_sequence

        # sometimes the dtype can be object, but seen as an string array
        treat_as_strings = (
            not isinstance(keys, vaex.array_types.supported_array_types)
            or self.dtype == str
        )
        if treat_as_strings:
            keys = _to_string_sequence(keys)
        else:
            keys = vaex.array_types.to_numpy(keys)
        indices = self._internal.map_ordinal(keys)
        if np.ma.isMaskedArray(keys):
            indices[keys.mask] = self.null_value
        if not check_missing:
            return indices
        return np.ma.array(indices, mask=indices==-1)
示例#32
0
def _store_values(to_array, index, values, fill_value):
    """Assign `values` to `to_array[index]`, handling masked arrays on either side.

    - both masked: data and mask are written separately;
    - only `values` masked: the masked entries are filled with `fill_value` first;
    - otherwise: plain assignment.
    """
    values_masked = np.ma.isMaskedArray(values)
    if values_masked and np.ma.isMaskedArray(to_array):
        to_array.data[index] = values.filled(fill_value)
        to_array.mask[index] = values.mask
    elif values_masked:
        to_array[index] = values.filled(fill_value)
    else:
        to_array[index] = values


def _export_column(dataset_input, dataset_output, column_name, full_mask, shuffle, sort, selection, N,
    order_array, order_array_inverse, progress_status):
    """Copy one column from `dataset_input` into the matching column of `dataset_output`.

    The column is evaluated chunk by chunk and written into
    `dataset_output.columns[column_name]`. Relies on module-level names not
    visible here (`max_length`, `progress_lock`, `logger`, `str_type`,
    `ColumnStringArrow`, `_to_string_sequence`).

    :param dataset_input: Source dataset; evaluated per chunk.
    :param dataset_output: Destination dataset holding the column that is written to.
    :param column_name: Name of the column to copy.
    :param full_mask: Boolean mask over the unfiltered rows; only used when `selection` is truthy.
    :param shuffle: When truthy (or `sort`), values are first collected in an in-memory copy and
        written to the destination in one go at the end.
    :param sort: See `shuffle`.
    :param selection: When truthy, only rows where `full_mask` is set are exported and they are
        packed contiguously from offset 0.
    :param N: Output length; only used in the debug log message.
    :param order_array: Destination indices for each source row when shuffling/sorting
        (non-string columns without a selection).
    :param order_array_inverse: Index used to reorder string columns after they have been collected.
    :param progress_status: Object whose `value` is advanced per chunk and whose `cancelled`
        flag stops the export early.
    """
    block_scope = dataset_input._block_scope(0, vaex.execution.buffer_size_default)
    to_array = dataset_output.columns[column_name]
    dtype = dataset_input.dtype(column_name)
    is_string = dtype == str_type
    reorder = shuffle or sort
    if reorder:  # we need to create a in memory copy, otherwise we will do random writes which is VERY inefficient
        to_array_disk = to_array
        if np.ma.isMaskedArray(to_array):
            to_array = np.empty_like(to_array_disk)
        elif is_string:
            # we create an empty column copy
            to_array = to_array._zeros_like()
        else:
            to_array = np.zeros_like(to_array_disk)
    to_offset = 0  # we need this for selections
    count = len(dataset_input) if not selection else dataset_input.length_unfiltered()
    # TODO: if no filter, selection or mask, we can choose the quick path for str
    string_byte_offset = 0

    for i1, i2 in vaex.utils.subdivide(count, max_length=max_length):
        logger.debug("from %d to %d (total length: %d, output length: %d)", i1, i2, len(dataset_input), N)
        block_scope.move(i1, i2)
        if selection:
            mask = full_mask[i1:i2]
            values = dataset_input.evaluate(column_name, i1=i1, i2=i2, filtered=False)
            values = values[mask]
            no_values = len(values)
            if no_values:
                if is_string:
                    assert isinstance(to_array, ColumnStringArrow)
                    from_sequence = _to_string_sequence(values)
                    to_sequence = to_array.string_sequence.slice(to_offset, to_offset+no_values, string_byte_offset)
                    string_byte_offset += to_sequence.fill_from(from_sequence)
                else:
                    fill_value = np.nan if dtype.kind == "f" else None
                    if dtype.type == np.datetime64:
                        values = values.view(np.int64)
                    _store_values(to_array, slice(to_offset, to_offset + no_values), values, fill_value)
                to_offset += no_values
        else:
            values = dataset_input.evaluate(column_name, i1=i1, i2=i2)
            if is_string:
                # for strings, we don't take sorting/shuffling into account when building the structure
                assert isinstance(to_array, ColumnStringArrow)
                from_sequence = _to_string_sequence(values)
                to_sequence = to_array.string_sequence.slice(i1, i2, string_byte_offset)
                string_byte_offset += to_sequence.fill_from(from_sequence)
            else:
                assert len(values) == (i2-i1)
                fill_value = np.nan if dtype.kind == "f" else None
                if dtype.type == np.datetime64:
                    values = values.view(np.int64)
                # When reordering, rows are scattered to their final positions;
                # otherwise they are written to the same row range they came from.
                # The non-reordering case previously repeated the "both masked"
                # condition in its elif, so masked values were assigned unfilled
                # into a plain destination; the shared helper fills them first.
                index = order_array[i1:i2] if reorder else slice(i1, i2)
                _store_values(to_array, index, values, fill_value)
        with progress_lock:
            progress_status.value += i2 - i1
        if progress_status.cancelled:
            break
    if is_string:  # write out the last index
        if selection:
            to_array.indices[to_offset] = string_byte_offset
        else:
            to_array.indices[count] = string_byte_offset
    if reorder:  # write to disk in one go
        if is_string:  # strings are sorted afterwards
            view = to_array.string_sequence.lazy_index(order_array_inverse)
            to_array_disk.string_sequence.fill_from(view)
        elif np.ma.isMaskedArray(to_array) and np.ma.isMaskedArray(to_array_disk):
            to_array_disk.data[:] = to_array.data
            to_array_disk.mask[:] = to_array.mask
        else:
            to_array_disk[:] = to_array