예제 #1
0
def _to_string_sequence(x, force=True):
    """Convert ``x`` into a string-sequence object.

    Inputs are normalised and dispatched in this order:

    * ``pa.DictionaryArray`` is decoded by taking ``x.indices`` from
      ``x.dictionary``.
    * ``pa.ChunkedArray`` is combined into a single ``pa.Array``.
    * ``ColumnString``: its ``string_sequence`` attribute is returned.
    * ``pa.Array``: converted with
      ``vaex.arrow.convert.column_from_arrow_array`` and the result's
      ``string_sequence`` attribute is returned.
    * ``np.ndarray`` (optionally masked): object and ``U``/``S`` dtypes are
      wrapped in ``vaex.strings.StringArray``; for any other dtype see
      ``force``.
    * ``vaex.superstrings.StringList32`` / ``StringList64``: returned as is.
    * ``list`` / ``tuple``: turned into an ndarray and processed again.

    :param x: the value to convert (one of the types listed above).
    :param bool force: only consulted for ndarrays whose dtype is neither
        object nor ``U``/``S``. When true, a ``StringList64`` of ``len(x)``
        entries with an empty byte buffer is returned; when false a
        ``ValueError`` is raised.
    :return: a string-sequence object as described above.
    :raises ValueError: for an unsupported ndarray dtype with ``force=False``,
        or when ``x`` is of none of the supported types.
    """
    if isinstance(x, pa.DictionaryArray):
        x = x.dictionary.take(
            x.indices
        )  # equivalent to PyArrow 5.0.0's dictionary_decode() but backwards compatible
    if isinstance(x, pa.ChunkedArray):
        # turn into pa.Array, TODO: do we want this, this may result in a big mem copy
        table = pa.Table.from_arrays([x], ["single"])
        table_concat = table.combine_chunks()
        column = table_concat.columns[0]
        if column.num_chunks == 0:
            # nothing to take a chunk from: build an empty array of the same type
            x = pa.array([], type=column.type)
        else:
            assert column.num_chunks == 1
            x = column.chunk(0)

    if isinstance(x, ColumnString):
        return x.string_sequence
    elif isinstance(x, pa.Array):
        from vaex.arrow import convert
        return convert.column_from_arrow_array(x).string_sequence
    elif isinstance(x, np.ndarray):
        mask = None
        if np.ma.isMaskedArray(x):
            mask = np.ma.getmaskarray(x)
            x = x.data
        if x.dtype.kind in 'US':
            # fixed-width unicode/bytes arrays are handled as object arrays
            x = x.astype('O')
        if x.dtype == 'O':
            if mask is None:
                return vaex.strings.StringArray(x)
            return vaex.strings.StringArray(x, mask)
        # This path is only required because the str_pandas wrapper uses NaN for missing values
        # see pandas_wrapper in functions.py
        if not force:
            # the exception used to be constructed but never raised, so this
            # case silently returned None
            raise ValueError('unsupported dtype ' + str(x.dtype))
        length = len(x)
        # named string_bytes to avoid shadowing the builtin ``bytes``
        string_bytes = np.zeros((0, ), dtype=np.uint8)
        indices = np.zeros((length + 1, ), dtype=np.int64)
        # NOTE(review): every bitmap byte is set to 1; confirm this is the
        # null-bitmap encoding StringList64 expects for this placeholder.
        null_bitmap = np.ones(((length + 7) // 8, ), dtype=np.uint8)
        return vaex.strings.StringList64(string_bytes, indices, length, 0,
                                         null_bitmap, 0)
    elif isinstance(
            x,
        (vaex.superstrings.StringList32, vaex.superstrings.StringList64)):
        return x
    elif isinstance(x, (list, tuple)):
        # ``force`` is not forwarded here, so the nested call uses its default.
        return _to_string_sequence(np.array(x))
    else:
        raise ValueError('not a ColumnString or ndarray: ' + str(x))
예제 #2
0
파일: __init__.py 프로젝트: donrv/vaex
def string_column(strings):
    """Build a column from ``strings`` by way of a pyarrow array.

    ``strings`` is handed to ``pyarrow.array`` and the resulting array is
    passed to ``vaex.arrow.convert.column_from_arrow_array``, whose return
    value is returned unchanged.
    """
    from vaex.arrow import convert as arrow_convert
    import pyarrow

    arrow_strings = pyarrow.array(strings)
    return arrow_convert.column_from_arrow_array(arrow_strings)