def _to_string_sequence(x, force=True):
    """Convert ``x`` into a string-sequence object.

    Handled inputs, in the order they are checked:

    * ``pa.DictionaryArray`` -- decoded first via ``dictionary.take(indices)``.
    * ``pa.ChunkedArray`` -- combined into a single ``pa.Array`` first.
    * ``ColumnString`` -- its ``string_sequence`` attribute is returned.
    * ``pa.Array`` -- converted with ``convert.column_from_arrow_array`` and
      the result's ``string_sequence`` attribute is returned.
    * ``np.ndarray`` -- object arrays are wrapped in
      ``vaex.strings.StringArray``; arrays of dtype kind ``U``/``S`` are cast
      to object dtype first. For a masked array the mask is passed along as
      the second argument. Any other dtype is handled according to ``force``.
    * ``StringList32`` / ``StringList64`` -- returned unchanged.
    * ``list`` / ``tuple`` -- converted with ``np.array`` and processed again.

    :param x: the value to convert (see the list above).
    :param bool force: only consulted for ndarrays whose dtype is neither
        object nor ``U``/``S``. When true, a placeholder ``StringList64`` with
        ``len(x)`` entries and an empty byte buffer is returned; when false a
        ``ValueError`` is raised.
    :return: a string-sequence object as described above.
    :raises ValueError: if ``x`` is of an unsupported type, or is an ndarray
        of an unsupported dtype while ``force`` is false.
    """
    if isinstance(x, pa.DictionaryArray):
        # equivalent to PyArrow 5.0.0's dictionary_decode() but backwards compatible
        x = x.dictionary.take(x.indices)

    if isinstance(x, pa.ChunkedArray):
        # turn into pa.Array, TODO: do we want this, this may result in a big mem copy
        table = pa.Table.from_arrays([x], ["single"])
        column = table.combine_chunks().columns[0]
        if column.num_chunks == 0:
            # Nothing to take a chunk from: build an empty array of the same type.
            x = pa.array([], type=column.type)
        else:
            assert column.num_chunks == 1
            x = column.chunk(0)

    if isinstance(x, ColumnString):
        return x.string_sequence

    if isinstance(x, pa.Array):
        from vaex.arrow import convert
        return convert.column_from_arrow_array(x).string_sequence

    if isinstance(x, np.ndarray):
        mask = None
        if np.ma.isMaskedArray(x):
            mask = np.ma.getmaskarray(x)
            x = x.data
        if x.dtype.kind in 'US':
            # Fixed-width unicode/bytes arrays are handled as object arrays.
            x = x.astype('O')
        if x.dtype == 'O':
            if mask is None:
                return vaex.strings.StringArray(x)
            return vaex.strings.StringArray(x, mask)
        # This path is only required because the str_pandas wrapper uses NaN for missing values
        # see pandas_wrapper in functions.py
        if not force:
            # The original built this exception without raising it, so the
            # function silently returned None here.
            raise ValueError('unsupported dtype ' + str(x.dtype))
        length = len(x)
        byte_buffer = np.zeros((0, ), dtype=np.uint8)
        indices = np.zeros((length + 1, ), dtype=np.int64)
        null_bitmap = np.ones(((length + 7) // 8, ), dtype=np.uint8)
        return vaex.strings.StringList64(byte_buffer, indices, length, 0,
                                         null_bitmap, 0)

    if isinstance(
            x, (vaex.superstrings.StringList32, vaex.superstrings.StringList64)):
        return x

    if isinstance(x, (list, tuple)):
        # The original tested for `type` (class objects) here, which looks
        # like a typo for `tuple`.
        # NOTE(review): `force` is not forwarded to the recursive call, as in
        # the original -- confirm whether that is intended.
        return _to_string_sequence(np.array(x))

    raise ValueError('not a ColumnString or ndarray: ' + str(x))
def string_column(strings):
    """Build a column from ``strings`` by way of a PyArrow array.

    :param strings: any value accepted by ``pyarrow.array``.
    :return: the result of ``column_from_arrow_array`` applied to that array.
    """
    import pyarrow as pa
    from vaex.arrow.convert import column_from_arrow_array

    arrow_array = pa.array(strings)
    return column_from_arrow_array(arrow_array)