def test_chunked_array_asarray():
    data = [
        pa.array([0]),
        pa.array([1, 2, 3])
    ]
    chunked_arr = pa.chunked_array(data)

    np_arr = np.asarray(chunked_arr)
    assert np_arr.tolist() == [0, 1, 2, 3]
    assert np_arr.dtype == np.dtype('int64')

    # An optional type can be specified when calling np.asarray
    np_arr = np.asarray(chunked_arr, dtype='str')
    assert np_arr.tolist() == ['0', '1', '2', '3']

    # Types are modified when there are nulls
    data = [
        pa.array([1, None]),
        pa.array([1, 2, 3])
    ]
    chunked_arr = pa.chunked_array(data)

    np_arr = np.asarray(chunked_arr)
    elements = np_arr.tolist()
    assert elements[0] == 1.
    assert np.isnan(elements[1])
    assert elements[2:] == [1., 2., 3.]
    assert np_arr.dtype == np.dtype('float64')

def ne(xarrs, yarrs):
    if isinstance(xarrs, pa.ChunkedArray):
        x = xarrs
    else:
        x = pa.chunked_array(xarrs)
    if isinstance(yarrs, pa.ChunkedArray):
        y = yarrs
    else:
        y = pa.chunked_array(yarrs)
    assert not x.equals(y)
    assert not y.equals(x)

def test_table_pickle():
    data = [
        pa.chunked_array([[1, 2], [3, 4]], type=pa.uint32()),
        pa.chunked_array([["some", "strings", None, ""]], type=pa.string()),
    ]
    schema = pa.schema([pa.field('ints', pa.uint32()),
                        pa.field('strs', pa.string())],
                       metadata={b'foo': b'bar'})
    table = pa.Table.from_arrays(data, schema=schema)

    result = pickle.loads(pickle.dumps(table))
    result._validate()
    assert result.equals(table)

def eq(xarrs, yarrs):
    if isinstance(xarrs, pa.ChunkedArray):
        x = xarrs
    else:
        x = pa.chunked_array(xarrs)
    if isinstance(yarrs, pa.ChunkedArray):
        y = yarrs
    else:
        y = pa.chunked_array(yarrs)
    assert x.equals(y)
    assert y.equals(x)
    assert x == y
    assert x != str(y)

def test_table_from_pydict():
    table = pa.Table.from_pydict({})
    assert table.num_columns == 0
    assert table.num_rows == 0
    assert table.schema == pa.schema([])
    assert table.to_pydict() == {}

    # With arrays as values
    data = OrderedDict([('strs', pa.array([u'', u'foo', u'bar'])),
                        ('floats', pa.array([4.5, 5, None]))])
    schema = pa.schema([('strs', pa.utf8()), ('floats', pa.float64())])
    table = pa.Table.from_pydict(data)
    assert table.num_columns == 2
    assert table.num_rows == 3
    assert table.schema == schema

    # With chunked arrays as values
    data = OrderedDict([('strs', pa.chunked_array([[u''], [u'foo', u'bar']])),
                        ('floats', pa.chunked_array([[4.5], [5, None]]))])
    table = pa.Table.from_pydict(data)
    assert table.num_columns == 2
    assert table.num_rows == 3
    assert table.schema == schema

    # With lists as values
    data = OrderedDict([('strs', [u'', u'foo', u'bar']),
                        ('floats', [4.5, 5, None])])
    table = pa.Table.from_pydict(data)
    assert table.num_columns == 2
    assert table.num_rows == 3
    assert table.schema == schema
    assert table.to_pydict() == data

    # With metadata and inferred schema
    metadata = {b'foo': b'bar'}
    schema = schema.add_metadata(metadata)
    table = pa.Table.from_pydict(data, metadata=metadata)
    assert table.schema == schema
    assert table.schema.metadata == metadata
    assert table.to_pydict() == data

    # With explicit schema
    table = pa.Table.from_pydict(data, schema=schema)
    assert table.schema == schema
    assert table.schema.metadata == metadata
    assert table.to_pydict() == data

    # Cannot pass both schema and metadata
    with pytest.raises(ValueError):
        pa.Table.from_pydict(data, schema=schema, metadata=metadata)

def test_chunked_array_str():
    data = [
        pa.array([1, 2, 3]),
        pa.array([4, 5, 6])
    ]
    data = pa.chunked_array(data)
    assert str(data) == """[
  [
    1,
    2,
    3
  ],
  [
    4,
    5,
    6
  ]
]"""

def test_chunked_array_basics():
    data = pa.chunked_array([], type=pa.string())
    assert data.type == pa.string()
    assert data.to_pylist() == []

    with pytest.raises(ValueError):
        pa.chunked_array([])

    data = pa.chunked_array([
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]
    ])
    assert isinstance(data.chunks, list)
    assert all(isinstance(c, pa.lib.Int64Array) for c in data.chunks)
    assert all(isinstance(c, pa.lib.Int64Array) for c in data.iterchunks())
    assert len(data.chunks) == 3

def simple_dicts_table():
    dict_values = pa.array(["foo", "baz", "quux"], type=pa.utf8())
    data = [
        pa.chunked_array([
            pa.DictionaryArray.from_arrays([1, 0, None], dict_values),
            pa.DictionaryArray.from_arrays([2, 1], dict_values)]),
    ]
    return pa.Table.from_arrays(data, names=['some_dicts'])

def test_chunked_array_pickle(data, typ):
    arrays = []
    while data:
        arrays.append(pa.array(data[:2], type=typ))
        data = data[2:]
    array = pa.chunked_array(arrays, type=typ)
    result = pickle.loads(pickle.dumps(array))
    assert result.equals(array)

def test_chunked_array_equals():
    def eq(xarrs, yarrs):
        if isinstance(xarrs, pa.ChunkedArray):
            x = xarrs
        else:
            x = pa.chunked_array(xarrs)
        if isinstance(yarrs, pa.ChunkedArray):
            y = yarrs
        else:
            y = pa.chunked_array(yarrs)
        assert x.equals(y)
        assert y.equals(x)
        assert x == y
        assert x != str(y)

    def ne(xarrs, yarrs):
        if isinstance(xarrs, pa.ChunkedArray):
            x = xarrs
        else:
            x = pa.chunked_array(xarrs)
        if isinstance(yarrs, pa.ChunkedArray):
            y = yarrs
        else:
            y = pa.chunked_array(yarrs)
        assert not x.equals(y)
        assert not y.equals(x)
        assert x != y

    eq(pa.chunked_array([], type=pa.int32()),
       pa.chunked_array([], type=pa.int32()))
    ne(pa.chunked_array([], type=pa.int32()),
       pa.chunked_array([], type=pa.int64()))

    a = pa.array([0, 2], type=pa.int32())
    b = pa.array([0, 2], type=pa.int64())
    c = pa.array([0, 3], type=pa.int32())
    d = pa.array([0, 2, 0, 3], type=pa.int32())

    eq([a], [a])
    ne([a], [b])
    eq([a, c], [a, c])
    eq([a, c], [d])
    ne([c, a], [a, c])

    assert not pa.chunked_array([], type=pa.int32()).equals(None)

def test_column_pickle():
    arr = pa.chunked_array([[1, 2], [5, 6, 7]], type=pa.int16())
    field = pa.field("ints", pa.int16()).add_metadata({b"foo": b"bar"})
    col = pa.column(field, arr)

    result = pickle.loads(pickle.dumps(col))
    assert result.equals(col)
    assert result.data.num_chunks == 2
    assert result.field == field

def chunked_arrays(draw, type, min_chunks=0, max_chunks=None, chunk_size=None):
    if isinstance(type, st.SearchStrategy):
        type = draw(type)
    # TODO(kszucs): remove it, field metadata is not kept
    h.assume(not pa.types.is_struct(type))
    chunk = arrays(type, size=chunk_size)
    chunks = st.lists(chunk, min_size=min_chunks, max_size=max_chunks)
    return pa.chunked_array(draw(chunks), type=type)

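A minimal sketch of driving this strategy from a property-based test, assuming `chunked_arrays` is registered as a hypothesis composite strategy in its defining module (its `draw` parameter suggests an `@st.composite` decorator); the test name is illustrative:

import hypothesis as h
import pyarrow as pa

@h.given(chunked_arrays(pa.int64(), min_chunks=1, max_chunks=4))
def test_chunk_lengths_sum_to_total(arr):
    # invariant: a ChunkedArray's length is the sum of its chunk lengths
    assert len(arr) == sum(len(chunk) for chunk in arr.chunks)
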
def test_chunked_array_iter():
    data = [
        pa.array([0]),
        pa.array([1, 2, 3]),
        pa.array([4, 5, 6]),
        pa.array([7, 8, 9])
    ]
    arr = pa.chunked_array(data)

    for i, j in zip(range(10), arr):
        assert i == j

    assert isinstance(arr, Iterable)

def test_unique_simple():
    cases = [
        (pa.array([1, 2, 3, 1, 2, 3]), pa.array([1, 2, 3])),
        (pa.array(['foo', None, 'bar', 'foo']), pa.array(['foo', 'bar']))
    ]
    for arr, expected in cases:
        result = arr.unique()
        assert result.equals(expected)
        result = pa.column("column", arr).unique()
        assert result.equals(expected)
        result = pa.chunked_array([arr]).unique()
        assert result.equals(expected)

def test_dictionary_encode_simple():
    cases = [
        (pa.array([1, 2, 3, None, 1, 2, 3]),
         pa.DictionaryArray.from_arrays(
             pa.array([0, 1, 2, None, 0, 1, 2], type='int32'),
             [1, 2, 3])),
        (pa.array(['foo', None, 'bar', 'foo']),
         pa.DictionaryArray.from_arrays(
             pa.array([0, None, 1, 0], type='int32'),
             ['foo', 'bar']))
    ]
    for arr, expected in cases:
        result = arr.dictionary_encode()
        assert result.equals(expected)
        result = pa.column("column", arr).dictionary_encode()
        assert result.data.chunk(0).equals(expected)
        result = pa.chunked_array([arr]).dictionary_encode()
        assert result.chunk(0).equals(expected)

def test_chunked_array_flatten():
    ty = pa.struct([pa.field('x', pa.int16()), pa.field('y', pa.float32())])
    a = pa.array([(1, 2.5), (3, 4.5), (5, 6.5)], type=ty)
    carr = pa.chunked_array(a)
    x, y = carr.flatten()
    assert x.equals(pa.chunked_array(pa.array([1, 3, 5], type=pa.int16())))
    assert y.equals(
        pa.chunked_array(pa.array([2.5, 4.5, 6.5], type=pa.float32())))

    # Empty column
    a = pa.array([], type=ty)
    carr = pa.chunked_array(a)
    x, y = carr.flatten()
    assert x.equals(pa.chunked_array(pa.array([], type=pa.int16())))
    assert y.equals(pa.chunked_array(pa.array([], type=pa.float32())))

def _render_minimum_or_maximum(table, colnames, outcolname, fn):
    if not colnames:
        return ArrowRenderResult(table)

    out_np_arrays = []
    num_chunks = table[colnames[0]].num_chunks
    for chunk in range(num_chunks):
        in_np_arrays = [
            table[colname].chunk(chunk).to_numpy(zero_copy_only=False)
            for colname in colnames
        ]
        out_np_array = fn.reduce(in_np_arrays)
        out_np_arrays.append(out_np_array)

    if outcolname in table.column_names:
        table = table.remove_column(table.column_names.index(outcolname))
    table = table.append_column(
        outcolname, pa.chunked_array(out_np_arrays, pa.timestamp("ns")))
    return ArrowRenderResult(table)

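A note on `fn.reduce`: `fn` is presumably a NumPy binary ufunc such as `np.minimum` or `np.maximum`, whose `reduce` method folds the per-column arrays elementwise:

import numpy as np

# np.minimum.reduce folds a list of equal-length arrays elementwise
assert np.minimum.reduce([np.array([3, 1]), np.array([2, 5])]).tolist() == [2, 1]
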
def test_column_factory_function():
    # ARROW-1575
    arr = pa.array([0, 1, 2, 3, 4])
    arr2 = pa.array([5, 6, 7, 8])

    col1 = pa.Column.from_array('foo', arr)
    col2 = pa.Column.from_array(pa.field('foo', arr.type), arr)

    assert col1.equals(col2)

    col3 = pa.column('foo', [arr, arr2])
    chunked_arr = pa.chunked_array([arr, arr2])
    col4 = pa.column('foo', chunked_arr)
    assert col3.equals(col4)

    col5 = pa.column('foo', arr.to_pandas())
    assert col5.equals(pa.column('foo', arr))

    # Type mismatch
    with pytest.raises(ValueError):
        pa.Column.from_array(pa.field('foo', pa.string()), arr)

def __arrow_array__(self, type=None):
    # type: (pa.DataType,) -> pa.Array
    """
    Implement pyarrow array interface (requires pyarrow>=0.15).

    Returns
    -------
    pa.Array
    """
    if self._has_single_chunk:
        data = self.data.chunks[0]
    else:
        data = pa.concat_arrays(self.data.iterchunks())
        self.data = pa.chunked_array([data])  # modify a data pointer inplace
    if type is not None and type != data.type:
        return data.cast(type, safe=False)
    else:
        return data

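For context, `pa.array()` recognizes objects implementing this protocol. A standalone sketch with a hypothetical `Holder` class (pyarrow >= 0.15):

import pyarrow as pa

class Holder:
    # hypothetical container exposing its chunked data through the protocol
    def __init__(self, values):
        self.data = pa.chunked_array([pa.array(values)])

    def __arrow_array__(self, type=None):
        data = pa.concat_arrays(self.data.iterchunks())
        if type is not None and type != data.type:
            return data.cast(type, safe=False)
        return data

arr = pa.array(Holder([1, 2, 3]))  # pa.array() calls __arrow_array__
assert arr.to_pylist() == [1, 2, 3]
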
def __setitem__(self, key, value):
    if isinstance(value, (pd.Index, pd.Series)):
        value = value.to_numpy()

    key = check_array_indexer(self, key)
    scalar_key = is_scalar(key)

    # validate new items
    if scalar_key:
        if pd.isna(value):
            value = None
    elif not is_list_like(value):
        raise ValueError('Must provide list.')

    if self._use_arrow:
        array = np.asarray(self._arrow_array.to_pandas())
        array[key] = value
        self._arrow_array = pa.chunked_array(
            [pa.array(array, type=self.dtype.arrow_type)])
    else:
        self._ndarray[key] = value

def reader():
    record_batches = []
    for fragment in fragments:
        for scan_task in fragment.scan(batch_size=chunk_size,
                                       use_threads=False,
                                       columns=columns):
            for record_batch in scan_task.execute():
                record_batches.append(record_batch)
    dict_or_list_of_arrays = collections.defaultdict(list)
    for rb in record_batches:
        for name, array in zip(rb.schema.names, rb.columns):
            dict_or_list_of_arrays[name].append(array)
    chunks = {
        name: pa.chunked_array(arrays)
        for name, arrays in dict_or_list_of_arrays.items()
    }
    for name, chunk in chunks.items():
        assert len(chunk) == rows_planned, (
            f'Oops, got a chunk ({name}) of length {len(chunk)} '
            f'while it is expected to be of length {rows_planned}')
    return chunks

def fillna(self, value=None, method=None, limit=None):
    cls = type(self)

    if pa is None or self._force_use_pandas:
        # pyarrow not installed
        return cls(pd.Series(self.to_numpy()).fillna(
            value=value, method=method, limit=limit))

    chunks = []
    for chunk_array in self._arrow_array.chunks:
        array = chunk_array.to_pandas()
        if method is None:
            result_array = self._array_fillna(array, value)
        else:
            result_array = array.fillna(value=value, method=method,
                                        limit=limit)
        chunks.append(pa.array(result_array, from_pandas=True))
    return cls(pa.chunked_array(chunks), dtype=self._dtype)

def test_arrow_chunked_struct(self):
    if pyarrow is None:
        pytest.skip("unable to import pyarrow")
    else:
        a = pyarrow.chunked_array([
            pyarrow.array([{"x": 1, "y": 1.1},
                           {"x": 2, "y": 2.2},
                           {"x": 3, "y": 3.3}]),
            pyarrow.array([]),
            pyarrow.array([{"x": 4, "y": 4.4},
                           {"x": 5, "y": 5.5}])
        ])
        assert awkward.arrow.fromarrow(a).tolist() == [
            {"x": 1, "y": 1.1},
            {"x": 2, "y": 2.2},
            {"x": 3, "y": 3.3},
            {"x": 4, "y": 4.4},
            {"x": 5, "y": 5.5}
        ]

def test_chunked_array_getitem():
    data = [pa.array([1, 2, 3]),
            pa.array([4, 5, 6])]
    data = pa.chunked_array(data)
    assert data[1].as_py() == 2
    assert data[-1].as_py() == 6
    assert data[-6].as_py() == 1
    with pytest.raises(IndexError):
        data[6]
    with pytest.raises(IndexError):
        data[-7]
    # Ensure this works with numpy scalars
    assert data[np.int32(1)].as_py() == 2

    data_slice = data[2:4]
    assert data_slice.to_pylist() == [3, 4]

    data_slice = data[4:-1]
    assert data_slice.to_pylist() == [5]

    data_slice = data[99:99]
    assert data_slice.type == data.type
    assert data_slice.to_pylist() == []

def test_chunked_array_slice():
    data = [
        pa.array([1, 2, 3]),
        pa.array([4, 5, 6])
    ]
    data = pa.chunked_array(data)

    data_slice = data.slice(len(data))
    assert data_slice.type == data.type
    assert data_slice.to_pylist() == []

    data_slice = data.slice(len(data) + 10)
    assert data_slice.type == data.type
    assert data_slice.to_pylist() == []

    table = pa.Table.from_arrays([data], names=["a"])
    table_slice = table.slice(len(table))
    assert len(table_slice) == 0

    table = pa.Table.from_arrays([data], names=["a"])
    table_slice = table.slice(len(table) + 10)
    assert len(table_slice) == 0

def __getitem__(self, item):
    if isinstance(item, slice):
        chunks = []
        ds = self.ds.__getitem__(item)
        for chunk_start, chunk_end, reader in ds.chunk_iterator([self.name]):
            ar = reader()[self.name]
            if isinstance(ar, pa.ChunkedArray):
                chunks.extend(ar.chunks)
            else:
                chunks.append(ar)
        if len(chunks) == 1:
            return chunks[0]
        if any([isinstance(k, vaex.array_types.supported_arrow_array_types)
                for k in chunks]):
            return pa.chunked_array([k for k in chunks])
        else:
            return np.concatenate(chunks)
    else:
        raise NotImplementedError

def _set_via_chunk_iteration(self, indices: npt.NDArray[np.intp],
                             value: npt.NDArray[Any]) -> pa.ChunkedArray:
    """
    Loop through the array chunks and set the new values while
    leaving the chunking layout unchanged.
    """
    chunk_indices = self._within_chunk_indices(indices)
    new_data = []

    for i, chunk in enumerate(self._data.iterchunks()):
        c_ind = chunk_indices[i]
        n = len(c_ind)
        c_value, value = value[:n], value[n:]

        if n == 1:
            # fast path
            chunk = self._set_single_index_in_chunk(chunk, c_ind[0], c_value[0])
        elif n > 0:
            mask = np.zeros(len(chunk), dtype=np.bool_)
            mask[c_ind] = True
            if not pa_version_under5p0:
                if c_value is None or isna(np.array(c_value)).all():
                    chunk = pc.if_else(mask, None, chunk)
                else:
                    chunk = pc.replace_with_mask(chunk, mask, c_value)
            else:
                # The pyarrow compute functions were added in
                # version 5.0. For prior versions we implement
                # our own by converting to numpy and back.
                chunk = chunk.to_numpy(zero_copy_only=False)
                chunk[mask] = c_value
                chunk = pa.array(chunk, type=pa.string())

        new_data.append(chunk)

    return pa.chunked_array(new_data)

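A small standalone illustration of the `pc.replace_with_mask` path used above (available per the version guard in the code, i.e. pyarrow >= 5.0):

import pyarrow as pa
import pyarrow.compute as pc

chunk = pa.array(["a", "b", "c"])
mask = pa.array([False, True, False])
# one replacement value per True position in the mask
replaced = pc.replace_with_mask(chunk, mask, pa.array(["B"]))
assert replaced.to_pylist() == ["a", "B", "c"]
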
def _2(a: pa.Array, b: Any, op: Callable):
    """Apply a NumPy ufunc where at least one of the arguments is an Arrow
    structure."""
    if isinstance(b, pa.ChunkedArray):
        new_chunks = []
        offsets = _calculate_chunk_offsets(b)
        for chunk, offset in zip(b.iterchunks(), offsets):
            new_chunks.append(
                np_ufunc_op(a[offset:offset + len(chunk)], chunk, op))
        return pa.chunked_array(new_chunks)
    elif isinstance(b, pa.Array):
        np_arr_a = _extract_data_buffer_as_np_array(a)
        np_arr_b = _extract_data_buffer_as_np_array(b)
        if a.null_count > 0 and b.null_count > 0:
            # TODO: Combine them before extracting
            mask_a = extract_isnull_bytemap(a)
            mask_b = extract_isnull_bytemap(b)
            mask = mask_a | mask_b
        elif a.null_count > 0:
            mask = extract_isnull_bytemap(a)
        elif b.null_count > 0:
            mask = extract_isnull_bytemap(b)
        else:
            mask = None

        new_arr = op(np_arr_a, np_arr_b)
        # Don't set type as we might have valid casts like int->float in truediv
        return pa.array(new_arr, mask=mask)
    else:
        # b is non-masked, either array-like or scalar
        # numpy can handle all types of b from here
        np_arr = _extract_data_buffer_as_np_array(a)
        if a.null_count > 0:
            mask = extract_isnull_bytemap(a)
        else:
            mask = None
        new_arr = op(np_arr, b)
        # Don't set type as we might have valid casts like int->float in truediv
        return pa.array(new_arr, mask=mask)

def convert(x, type, default_type="numpy"):
    import vaex.column
    if type == "numpy":
        if isinstance(x, (list, tuple)):
            return concat([convert(k, type) for k in x])
        else:
            return to_numpy(x, strict=True)
    if type == "numpy-arrow":  # used internally, numpy if possible, otherwise arrow
        if isinstance(x, (list, tuple)):
            return concat([convert(k, type) for k in x])
        else:
            return to_numpy(x, strict=False)
    elif type == "arrow":
        if isinstance(x, (list, tuple)):
            return pa.chunked_array([convert(k, type) for k in x])
        else:
            return to_arrow(x)
    elif type == "xarray":
        return to_xarray(x)
    elif type in ['list', 'python']:
        return convert(x, 'numpy').tolist()
    elif type is None:
        if isinstance(x, (list, tuple)):
            chunks = [convert(k, type) for k in x]
            if isinstance(chunks[0], (pa.Array, pa.ChunkedArray,
                                      vaex.column.ColumnStringArrow)):
                return convert(chunks, "arrow")
            elif isinstance(chunks[0], np.ndarray):
                return convert(chunks, "numpy")
            else:
                raise ValueError("Unknown type: %r" % chunks[0])
        else:
            # return convert(x, default_type)
            return x
    else:
        raise ValueError("Unknown type: %r" % type)

def test_arrow_chunked_struct(self):
    if pyarrow is not None:
        a = pyarrow.chunked_array([
            pyarrow.array([{"x": 1, "y": 1.1},
                           {"x": 2, "y": 2.2},
                           {"x": 3, "y": 3.3}]),
            pyarrow.array([]),
            pyarrow.array([{"x": 4, "y": 4.4},
                           {"x": 5, "y": 5.5}])
        ])
        assert awkward.arrow.view(a).tolist() == [
            {"x": 1, "y": 1.1},
            {"x": 2, "y": 2.2},
            {"x": 3, "y": 3.3},
            {"x": 4, "y": 4.4},
            {"x": 5, "y": 5.5}
        ]

def test_cast_kernel_on_extension_arrays():
    # test array casting
    storage = pa.array([1, 2, 3, 4], pa.int64())
    arr = pa.ExtensionArray.from_storage(IntegerType(), storage)

    # test that no allocation happens during identity cast
    allocated_before_cast = pa.total_allocated_bytes()
    casted = arr.cast(pa.int64())
    assert pa.total_allocated_bytes() == allocated_before_cast

    cases = [
        (pa.int64(), pa.Int64Array),
        (pa.int32(), pa.Int32Array),
        (pa.int16(), pa.Int16Array),
        (pa.uint64(), pa.UInt64Array),
        (pa.uint32(), pa.UInt32Array),
        (pa.uint16(), pa.UInt16Array)
    ]
    for typ, klass in cases:
        casted = arr.cast(typ)
        assert casted.type == typ
        assert isinstance(casted, klass)

    # test chunked array casting
    arr = pa.chunked_array([arr, arr])
    casted = arr.cast(pa.int16())
    assert casted.type == pa.int16()
    assert isinstance(casted, pa.ChunkedArray)

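`IntegerType` here is a test fixture rather than a pyarrow built-in; a sketch of how such a minimal extension type looked with the `pa.PyExtensionType` API of that pyarrow era (newer releases replace it with `pa.ExtensionType` plus explicit serialize/deserialize hooks):

import pyarrow as pa

class IntegerType(pa.PyExtensionType):
    # extension type whose storage is plain int64
    def __init__(self):
        pa.PyExtensionType.__init__(self, pa.int64())

    def __reduce__(self):
        # make the type picklable, as PyExtensionType requires
        return IntegerType, ()
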
def test_chunked_array_getitem():
    data = [
        pa.array([1, 2, 3]),
        pa.array([4, 5, 6])
    ]
    data = pa.chunked_array(data)
    assert data[1].as_py() == 2
    assert data[-1].as_py() == 6
    assert data[-6].as_py() == 1
    with pytest.raises(IndexError):
        data[6]
    with pytest.raises(IndexError):
        data[-7]

    data_slice = data[2:4]
    assert data_slice.to_pylist() == [3, 4]

    data_slice = data[4:-1]
    assert data_slice.to_pylist() == [5]

    data_slice = data[99:99]
    assert data_slice.type == data.type
    assert data_slice.to_pylist() == []

def _2(a: pa.Array, b: Any, ops: Dict[str, Callable]):
    """Apply a NumPy ufunc where at least one of the arguments is an Arrow
    structure."""
    if isinstance(b, pa.ChunkedArray):
        if len(a) != len(b):
            raise ValueError("Inputs don't have the same length.")
        new_chunks = []
        offsets = _calculate_chunk_offsets(b)
        for chunk, offset in zip(b.iterchunks(), offsets):
            new_chunks.append(
                dispatch_chunked_binary_map(a[offset:offset + len(chunk)],
                                            chunk, ops))
        return pa.chunked_array(new_chunks)
    elif isinstance(b, pa.Array):
        if len(a) != len(b):
            raise ValueError("Inputs don't have the same length.")
        return ops.get("array_array", _not_implemented_path)(a, b)
    else:
        if np.isscalar(b):
            return ops.get("array_scalar", _not_implemented_path)(a, b)
        else:
            if len(a) != len(b):
                raise ValueError("Inputs don't have the same length.")
            return ops.get("array_nparray", _not_implemented_path)(a, b)

def _render_difference(table, colname1, colname2, unit, outcolname):
    if not colname1 or not colname2:
        return ArrowRenderResult(table)

    out_arrays = []
    if unit == "nanosecond":
        out_type = pa.int64()
        out_metadata = {"format": "{:,d}"}
    else:
        out_type = pa.float64()
        out_metadata = {"format": "{:,}"}

    num_chunks = table[colname1].num_chunks
    for chunk in range(num_chunks):
        chunk1 = table[colname1].chunk(chunk).cast(pa.int64())
        chunk2 = table[colname2].chunk(chunk).cast(pa.int64())
        # TODO subtract_checked and report error
        difference_in_ns = pa.compute.subtract(chunk2, chunk1)
        if unit == "nanosecond":
            # Nanosecond differences are integers
            out_array = difference_in_ns
        else:
            out_array = pa.compute.divide(
                difference_in_ns.cast(pa.float64(), safe=False),
                pa.scalar(_NS_PER_UNIT[unit], pa.float64()),
            )
        out_arrays.append(out_array)

    if outcolname in table.column_names:
        table = table.remove_column(table.column_names.index(outcolname))
    table = table.append_column(
        pa.field(outcolname, out_type, metadata=out_metadata),
        pa.chunked_array(out_arrays, out_type),
    )
    return ArrowRenderResult(table)

def test_combined_in_chunk_offsets():
    a = pa.chunked_array([[]])
    b = pa.chunked_array([[]])
    in_a_offsets, in_b_offsets = _combined_in_chunk_offsets(a, b)
    assert in_a_offsets == [(0, 0, 0)]
    assert in_b_offsets == [(0, 0, 0)]

    a = pa.chunked_array([[1]])
    b = pa.chunked_array([[2]])
    in_a_offsets, in_b_offsets = _combined_in_chunk_offsets(a, b)
    assert in_a_offsets == [(0, 0, 1)]
    assert in_b_offsets == [(0, 0, 1)]

    a = pa.chunked_array([[1, 2], [3, 4, 5]])
    b = pa.chunked_array([[1], [2, 3], [4, 5]])
    in_a_offsets, in_b_offsets = _combined_in_chunk_offsets(a, b)
    assert in_a_offsets == [(0, 0, 1), (0, 1, 1), (1, 0, 1), (1, 1, 2)]
    assert in_b_offsets == [(0, 0, 1), (1, 0, 1), (1, 1, 1), (2, 0, 2)]

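The helper under test is internal to the fletcher codebase, but its contract is fully determined by the assertions above: split two equal-length chunked arrays at the union of their chunk boundaries and describe every resulting piece as a `(chunk_index, offset_in_chunk, length)` triple. A hypothetical re-implementation consistent with those expectations:

import pyarrow as pa

def combined_in_chunk_offsets_sketch(a, b):
    def chunk_starts(arr):
        # global start position of each chunk, plus the total length
        starts, total = [], 0
        for chunk in arr.iterchunks():
            starts.append(total)
            total += len(chunk)
        return starts, total

    starts_a, total = chunk_starts(a)
    starts_b, _ = chunk_starts(b)
    if total == 0:
        return [(0, 0, 0)], [(0, 0, 0)]

    def locate(starts, pos):
        # chunk index and in-chunk offset of global position ``pos``
        i = 0
        while i + 1 < len(starts) and starts[i + 1] <= pos:
            i += 1
        return i, pos - starts[i]

    cuts = sorted(set(starts_a) | set(starts_b) | {total})
    offsets_a, offsets_b = [], []
    for begin, end in zip(cuts[:-1], cuts[1:]):
        ia, oa = locate(starts_a, begin)
        ib, ob = locate(starts_b, begin)
        offsets_a.append((ia, oa, end - begin))
        offsets_b.append((ib, ob, end - begin))
    return offsets_a, offsets_b
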
def test_reduce_op_no_identity(data, skipna, op, pandas_op):
    arrow = pa.array(data, type=pa.float64(), from_pandas=True)
    pandas = pd.Series(data, dtype=float)
    should_raise = arrow.null_count == len(arrow) and (skipna or len(arrow) == 0)

    if should_raise:
        with pytest.raises(ValueError):
            assert_allclose_na(op(arrow, skipna), pandas_op(pandas, skipna=skipna))
    else:
        assert_allclose_na(op(arrow, skipna), pandas_op(pandas, skipna=skipna))

    # Split in the middle and check whether this still works
    if len(data) > 2:
        arrow = pa.chunked_array(
            [
                pa.array(data[: len(data) // 2], type=pa.float64(), from_pandas=True),
                pa.array(data[len(data) // 2 :], type=pa.float64(), from_pandas=True),
            ]
        )
        if should_raise:
            with pytest.raises(ValueError):
                assert_allclose_na(op(arrow, skipna), pandas_op(pandas, skipna=skipna))
        else:
            assert_allclose_na(op(arrow, skipna), pandas_op(pandas, skipna=skipna))

def _take_on_chunks(self, indices, limits_idx, cum_lengths, sort_idx=None):
    def take_in_one_chunk(i_chunk):
        indices_chunk = indices[limits_idx[i_chunk]:limits_idx[i_chunk + 1]]
        indices_chunk -= cum_lengths[i_chunk]
        if (self.dtype.is_list
                and self.data.chunk(i_chunk).flatten().null_count == 0
                and self.data.chunk(i_chunk).null_count == 0
                and self.flatten().dtype._is_numeric):
            return take_indices_on_pyarrow_list(self.data.chunk(i_chunk),
                                                indices_chunk)
        else:
            # this is a pyarrow.Array
            return self.data.chunk(i_chunk).take(pa.array(indices_chunk))

    result = [take_in_one_chunk(i) for i in range(self.data.num_chunks)]
    # we know that self.data.num_chunks > 1

    if sort_idx is None:
        return FletcherArray(
            pa.chunked_array(filter(len, result), type=self.data.type))
    else:
        return FletcherArray(
            pa.concat_arrays(result).take(pa.array(sort_idx)))

def test_dictionary_array_automatically_read(use_legacy_dataset):
    # ARROW-3246
    # Make a large dictionary, a little over 4MB of data
    dict_length = 4000
    dict_values = pa.array([('x' * 1000 + '_{}'.format(i))
                            for i in range(dict_length)])

    num_chunks = 10
    chunk_size = 100
    chunks = []
    for i in range(num_chunks):
        indices = np.random.randint(0, dict_length,
                                    size=chunk_size).astype(np.int32)
        chunks.append(
            pa.DictionaryArray.from_arrays(pa.array(indices), dict_values))

    table = pa.table([pa.chunked_array(chunks)], names=['f0'])

    result = _simple_table_write_read(table, use_legacy_dataset)
    assert result.equals(table)

    # The only key in the metadata was the Arrow schema key
    assert result.schema.metadata is None

def test_chunked_array_basics():
    data = pa.chunked_array([], type=pa.string())
    assert data.to_pylist() == []

    with pytest.raises(ValueError):
        pa.chunked_array([])

def test_chunked_array_str():
    data = [pa.array([1, 2, 3]), pa.array([4, 5, 6])]
    data = pa.chunked_array(data)
    assert str(data) == """[
  [
    1,
    2,
    3
  ],
  [
    4,
    5,
    6
  ]
]"""

def test_chunked_array_mismatch_types():
    with pytest.raises(pa.ArrowInvalid):
        pa.chunked_array([pa.array([1, 2]), pa.array(['foo', 'bar'])])

def wrapper(array):
    if isinstance(array, pa.ChunkedArray):
        return pa.chunked_array([func(chunk) for chunk in array.chunks])
    else:
        return func(array)

def __init__(self, dfs, name=None):
    from vaex.column import ColumnConcatenatedLazy

    crs = np.array([df.geometry.crs.srs for df in dfs])
    crs = np.unique(crs)
    if len(crs) > 1:
        raise ValueError(
            'Concatenating dataframes where different crs not supported.')
    else:
        crs = crs[0] if len(crs) == 1 else None

    metadata = dfs[0]._metadata

    geoms = []
    for df in dfs:
        if isinstance(df.geometry._geometry, pa.Array):
            geoms.append(df.geometry._geometry)
        elif isinstance(df.geometry._geometry, pa.ChunkedArray):
            for chunk in df.geometry._geometry.chunks:
                geoms.append(chunk)
        else:
            geoms.append(pa.array(df.geometry._geometry))
    geometry = pa.chunked_array(geoms)

    super(GeoDataFrameConcatenated, self).__init__(geometry, crs=crs,
                                                   metadata=metadata)

    self.dfs = dfs = [df.extract() for df in dfs]
    self.name = name or "-".join(df.name for df in self.dfs)
    self.path = "-".join(df.path for df in self.dfs)

    first, tail = dfs[0], dfs[1:]
    for column_name in first.get_column_names(virtual=False, hidden=True,
                                              alias=False):
        if all([column_name in df.get_column_names(virtual=False, hidden=True,
                                                   alias=False)
                for df in tail]):
            self.column_names.append(column_name)
    self.columns = {}
    for column_name in self.get_column_names(virtual=False, hidden=True,
                                             alias=False):
        self.columns[column_name] = ColumnConcatenatedLazy(
            [df[column_name] for df in dfs])
        self._save_assign_expression(column_name)

    for name in list(first.virtual_columns.keys()):
        if all([first.virtual_columns[name] == df.virtual_columns.get(name, None)
                for df in tail]):
            self.add_virtual_column(name, first.virtual_columns[name])
        else:
            self.columns[name] = ColumnConcatenatedLazy(
                [df[name] for df in dfs])
            self.column_names.append(name)
        self._save_assign_expression(name)

    for df in tail:
        if first._column_aliases != df._column_aliases:
            raise ValueError(
                f'Concatenating dataframes where different column aliases '
                f'not supported: {first._column_aliases} != {df._column_aliases}')
    self._column_aliases = first._column_aliases.copy()

    for df in dfs[:1]:
        for name, value in list(df.variables.items()):
            if name not in self.variables:
                self.set_variable(name, value, write=False)
    # self.write_virtual_meta()

    self._length_unfiltered = sum(len(ds) for ds in self.dfs)
    self._length_original = self._length_unfiltered
    self._index_end = self._length_unfiltered

def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None:
    """Set one or more values inplace.

    Parameters
    ----------
    key : int, ndarray, or slice
        When called from, e.g. ``Series.__setitem__``, ``key`` will be
        one of

        * scalar int
        * ndarray of integers.
        * boolean ndarray
        * slice object

    value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object
        value or values to be set of ``key``.

    Returns
    -------
    None
    """
    key = check_array_indexer(self, key)

    if is_integer(key):
        key = cast(int, key)

        if not is_scalar(value):
            raise ValueError("Must pass scalars with scalar indexer")
        elif isna(value):
            value = None
        elif not isinstance(value, str):
            raise ValueError("Scalar must be NA or str")

        # Slice data and insert in-between
        new_data = [
            *self._data[0:key].chunks,
            pa.array([value], type=pa.string()),
            *self._data[(key + 1):].chunks,
        ]
        self._data = pa.chunked_array(new_data)
    else:
        # Convert to integer indices and iteratively assign.
        # TODO: Make a faster variant of this in Arrow upstream.
        #       This is probably extremely slow.

        # Convert all possible input key types to an array of integers
        if isinstance(key, slice):
            key_array = np.array(range(len(self))[key])
        elif is_bool_dtype(key):
            # TODO(ARROW-9430): Directly support setitem(booleans)
            key_array = np.argwhere(key).flatten()
        else:
            # TODO(ARROW-9431): Directly support setitem(integers)
            key_array = np.asanyarray(key)

        if is_scalar(value):
            value = np.broadcast_to(value, len(key_array))
        else:
            value = np.asarray(value)

        if len(key_array) != len(value):
            raise ValueError("Length of indexer and values mismatch")

        for k, v in zip(key_array, value):
            self[k] = v

def wrapper(arr: Union[pa.Array, pa.ChunkedArray], *args, **kwargs):
    if isinstance(arr, pa.ChunkedArray):
        return pa.chunked_array(
            [func(chunk, *args, **kwargs) for chunk in arr.chunks])
    else:
        return func(arr, *args, **kwargs)

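This `wrapper` is evidently the inner closure of a decorator; a self-contained sketch of the full pattern, with the illustrative names `chunkwise` and `add_constant`:

import functools
from typing import Union

import pyarrow as pa
import pyarrow.compute as pc

def chunkwise(func):
    # lift a per-Array function so it also accepts ChunkedArray input
    @functools.wraps(func)
    def wrapper(arr: Union[pa.Array, pa.ChunkedArray], *args, **kwargs):
        if isinstance(arr, pa.ChunkedArray):
            return pa.chunked_array(
                [func(chunk, *args, **kwargs) for chunk in arr.chunks])
        else:
            return func(arr, *args, **kwargs)
    return wrapper

@chunkwise
def add_constant(arr, value):
    return pc.add(arr, value)

result = add_constant(pa.chunked_array([[1, 2], [3]]), 10)
assert result.to_pylist() == [11, 12, 13]
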
def _concat_same_type(cls, to_concat):
    chunks = list(itertools.chain.from_iterable(
        x._data.chunks for x in to_concat))
    arr = pa.chunked_array(chunks)
    return cls(arr)

def test_np_ufunc_op_chunked_scalar():
    a = pa.chunked_array([[1, 2], [3, None]])
    b = 4
    expected = pa.array([5, 6, 7, None])
    check_np_ufunc(a, b, expected)

def from_scalars(cls, values):
    arr = pa.chunked_array([pa.array(np.asarray(values))])
    return cls(arr)

def from_array(cls, arr):
    assert isinstance(arr, pa.Array)
    return cls(pa.chunked_array([arr]))

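Both constructors rely on wrapping a single `pa.Array` into a one-chunk `pa.ChunkedArray`; that building block works standalone:

import numpy as np
import pyarrow as pa

arr = pa.array(np.asarray([1, 2, 3]))
chunked = pa.chunked_array([arr])
assert chunked.num_chunks == 1
assert chunked.to_pylist() == [1, 2, 3]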