def setup(self):
    """Build the integer and string fixtures used by the benchmarks.

    Seeds NumPy's global RNG so every run sees the same data, then stores
    on ``self``:

    * ``data`` / ``pd_int`` / ``fr_cont_int`` / ``fr_chunked_int``:
      2**24 random integers as plain pandas, continuous Fletcher and
      two-chunk Fletcher series.
    * ``pd_int_na`` / ``fr_cont_int_na`` / ``fr_chunked_int_na``: the same
      integers with a random mask applied.
    * ``data_small`` / ``data_small_missing``: 2**18 random integers, and a
      copy in which every second element (except possibly the last) is -1.
    * ``pd_str`` / ``fr_cont_str`` / ``fr_chunked_str``: the small data
      rendered as strings.
    """
    np.random.seed(93487)
    # TODO: Is it maybe faster to separate each type into its own Take* class?
    # It seems like the data is regenerated for each benchmark and thus
    # is quite some overhead here.
    self.data = np.random.randint(0, 2**20, size=2**24)
    self.pd_int = pd.Series(self.data)
    self.fr_cont_int = pd.Series(fr.FletcherContinuousArray(self.data))

    # Split at the midpoint. The second slice is open-ended: slicing up to
    # ``-1`` would drop the final element and leave the chunked series one
    # element shorter than its continuous counterpart.
    half = len(self.data) // 2
    chunked_data = pa.chunked_array([
        pa.array(self.data[:half]),
        pa.array(self.data[half:]),
    ])
    self.fr_chunked_int = pd.Series(fr.FletcherChunkedArray(chunked_data))

    mask = np.random.rand(len(self.data)) > 0.8
    self.pd_int_na = pd.Series(pd.arrays.IntegerArray(self.data, mask))
    self.fr_cont_int_na = pd.Series(
        fr.FletcherContinuousArray(pa.array(self.data, mask=mask)))
    self.fr_chunked_int_na = pd.Series(
        fr.FletcherChunkedArray(pa.array(self.data, mask=mask)))

    self.data_small = np.random.randint(0, 2**16, size=2**18)
    self.data_small_missing = self.data_small.copy()
    self.data_small_missing[0:-1:2] = -1

    data_small_str = self.data_small.astype(str)
    self.pd_str = pd.Series(data_small_str)
    self.fr_cont_str = pd.Series(
        fr.FletcherContinuousArray(data_small_str))

    # Same midpoint split as above, again keeping the last element.
    half_str = len(data_small_str) // 2
    data_small_str_chunked = pa.chunked_array([
        pa.array(data_small_str[:half_str]),
        pa.array(data_small_str[half_str:]),
    ])
    self.fr_chunked_str = pd.Series(
        fr.FletcherChunkedArray(data_small_str_chunked))
def test_take_list_arrays():
    """Check ``take`` on list-typed and large-list-typed Fletcher arrays.

    Covers a list array containing a null element, and list / large-list
    arrays spread over two chunks where the taken indices cross the chunk
    boundary.
    """
    indices = [0, 1, 4, 3, 5]
    indptr = [0, 2, 3, 5]
    list_array = pa.ListArray.from_arrays(indptr, indices)
    large_list_array = pa.LargeListArray.from_arrays(indptr, indices)

    test_with_null = fr.FletcherContinuousArray(
        pa.array([[1, 2], [None, 3], [4, 5]]))
    assert pa.array(test_with_null.take([1, 2, 1])).to_pylist() == [
        [None, 3],
        [4, 5],
        [None, 3],
    ]

    test = fr.FletcherContinuousArray(
        pa.chunked_array([list_array, list_array])).take([0, 5, 1])
    test_large = fr.FletcherContinuousArray(
        pa.chunked_array([large_list_array, large_list_array])).take([0, 5, 1])

    expected = [[0, 1], [3, 5], [4]]
    for taken in (test, test_large):
        assert len(taken) == len(expected)
        # ``expected`` is ragged, so it is converted row by row: building one
        # NumPy array from the whole nested list raises ValueError on
        # NumPy >= 1.24.
        for position, expected_row in enumerate(expected):
            assert np.array_equal(
                np.array(taken[position]), np.array(expected_row))
def setup(self):
    """Create the integer benchmark series, with and without a mask.

    Draws 2**24 random integers and a random boolean mask of the same
    length, then stores pandas, continuous-Fletcher and chunked-Fletcher
    series of both the plain and the masked data on ``self``.
    """
    n_rows = 2**24
    # The integers are drawn before the mask so the RNG is consumed in a
    # fixed order.
    values = np.random.randint(0, 2**20, size=n_rows)
    is_masked = np.random.rand(n_rows) > 0.8

    # Plain integer data.
    self.pd_int = pd.Series(values)
    self.fr_cont_int = pd.Series(fr.FletcherContinuousArray(values))
    self.fr_chunked_int = pd.Series(fr.FletcherChunkedArray(values))

    # Masked integer data.
    self.pd_int_na = pd.Series(pd.arrays.IntegerArray(values, is_masked))
    masked_for_cont = pa.array(values, mask=is_masked)
    self.fr_cont_int_na = pd.Series(fr.FletcherContinuousArray(masked_for_cont))
    masked_for_chunked = pa.array(values, mask=is_masked)
    self.fr_chunked_int_na = pd.Series(
        fr.FletcherChunkedArray(masked_for_chunked))
def _fr_series_from_data(data, fletcher_variant, dtype=pa.string()):
    """Return a pandas Series backed by a Fletcher array built from ``data``.

    ``data`` is converted to an Arrow array of type ``dtype``. It is wrapped
    in ``FletcherChunkedArray`` when ``fletcher_variant`` is ``"chunked"``
    and in ``FletcherContinuousArray`` for any other value.
    """
    array_cls = (
        fr.FletcherChunkedArray
        if fletcher_variant == "chunked"
        else fr.FletcherContinuousArray
    )
    return pd.Series(array_cls(pa.array(data, type=dtype)))
def _do_test_text_strip(
    str_accessor,
    fletcher_variant,
    fletcher_slice_offset,
    data,
    strip_method="strip",
):
    """Compare a Fletcher strip-style string method against pandas.

    ``data`` is prefixed with ``fletcher_slice_offset`` nulls before it is
    converted to Arrow, and the prefix is sliced off again, so the Fletcher
    array under test starts at a non-zero offset. ``strip_method`` names the
    method called on both the pandas ``.str`` accessor and the Fletcher
    accessor named by ``str_accessor``.
    """
    for value in data:
        if value and "\x00" in value:
            # pytest.skip("pandas cannot handle \\x00 characters in tests")
            # Skip is not working properly with hypothesis
            return

    pandas_series = pd.Series(data, dtype=str)

    padded = pa.array([None] * fletcher_slice_offset + data, type=pa.string())
    array_cls = (
        fr.FletcherChunkedArray
        if fletcher_variant == "chunked"
        else fr.FletcherContinuousArray
    )
    fletcher_series = pd.Series(array_cls(padded)[fletcher_slice_offset:])

    result_pd = getattr(pandas_series.str, strip_method)()
    fletcher_accessor = getattr(fletcher_series, str_accessor)
    result_fr = getattr(fletcher_accessor, strip_method)().astype(object)

    # Use np.nan as the missing-value marker on both sides before comparing.
    result_fr[result_fr.isna()] = np.nan
    result_pd[result_pd.isna()] = np.nan
    tm.assert_series_equal(result_fr, result_pd)
def test_awkward_accessor():
    """Round-trip a jagged column through ``.test.root`` and sum its rows.

    The per-row sums are checked twice: once through the ``ak`` accessor on
    the re-read column and once through ``awkward1.sum`` over the last axis.
    """
    jagged = fletcher.FletcherContinuousArray([[1.0, 2.0], [], [3.0, 4.0, 5.0]])
    zeros = np.zeros(len(jagged), dtype=float)
    written = pd.DataFrame({"x": jagged, "y": zeros})
    written.to_root(".test.root", compression_jagged=None)

    df = pd.read_root(".test.root")
    row_sums = [3.0, 0.0, 12.0]
    assert df["x"].ak(0).sum().tolist() == row_sums
    assert awkward1.sum(df["x"], axis=-1).tolist() == row_sums
def test_jagged():
    """Check a jagged column is unchanged by a ``.test.root`` round trip."""
    x_in = fletcher.FletcherContinuousArray([[1.0, 2.0], [], [3.0, 4.0, 5.0]])
    pd.DataFrame({"x": x_in}).to_root(".test.root", compression_jagged=None)

    x_out = pd.read_root(".test.root")["x"].values

    # Compare as plain nested Python lists.
    rows_in = [list(row) for row in x_in.data]
    rows_out = [list(row) for row in x_out.data]
    assert rows_in == rows_out
def test_flatten():
    """Check ``flatten`` on continuous and chunked list-typed arrays."""
    nested = pa.array([[1, 2], [3, 4]])

    flat_continuous = fr.FletcherContinuousArray(nested).flatten()
    npt.assert_array_equal(flat_continuous, [1, 2, 3, 4])

    # Two identical chunks: the flattened values should simply repeat.
    nested_chunked = pa.chunked_array([nested, nested])
    flat_chunked = fr.FletcherChunkedArray(nested_chunked).flatten()
    npt.assert_array_equal(flat_chunked, [1, 2, 3, 4, 1, 2, 3, 4])
def test_pandas_from_arrow():
    """Check ``fr.pandas_from_arrow`` for arrays, record batches and tables.

    Calls without ``continuous=True`` are compared against
    ``FletcherChunkedArray``-backed objects; calls with ``continuous=True``
    are compared against ``FletcherContinuousArray``-backed ones.
    """
    arr = pa.array(["a", "b", "c"], pa.string())

    # A single Arrow array converts to a Series.
    chunked_series = pd.Series(fr.FletcherChunkedArray(arr))
    pdt.assert_series_equal(chunked_series, fr.pandas_from_arrow(arr))

    continuous_series = pd.Series(fr.FletcherContinuousArray(arr))
    pdt.assert_series_equal(
        continuous_series, fr.pandas_from_arrow(arr, continuous=True)
    )

    # Record batches and tables convert to a DataFrame.
    rb = pa.RecordBatch.from_arrays([arr], ["column"])
    table = pa.Table.from_arrays([arr], ["column"])

    chunked_frame = pd.DataFrame({"column": fr.FletcherChunkedArray(arr)})
    for source in (rb, table):
        pdt.assert_frame_equal(chunked_frame, fr.pandas_from_arrow(source))

    continuous_frame = pd.DataFrame({"column": fr.FletcherContinuousArray(arr)})
    for source in (rb, table):
        pdt.assert_frame_equal(
            continuous_frame, fr.pandas_from_arrow(source, continuous=True)
        )
def array_to_fletcher_or_numpy(array):
    """Convert ``array`` to a Fletcher array or, when possible, a NumPy array.

    ``array`` is converted to Arrow with ``awkward1.to_arrow`` and wrapped
    in a ``FletcherContinuousArray``. That Fletcher array is returned when
    ``array.ndim >= 2``, when the Arrow data contains nulls, or when the
    Fletcher dtype string contains ``"list<"``. Otherwise the array's layout
    (or the layout's ``content`` attribute, if it has one) is returned as a
    NumPy array.
    """
    import fletcher

    arrow_array = awkward1.to_arrow(array)
    fletcher_array = fletcher.FletcherContinuousArray(arrow_array)

    if (array.ndim >= 2) or (fletcher_array.data.null_count > 0):
        return fletcher_array

    if "list<" not in str(fletcher_array.dtype):
        a = array.layout
        if hasattr(a, "content"):
            a = a.content
        # np.asarray avoids a copy when it can and copies otherwise. On
        # NumPy 2, np.array(..., copy=False) raises instead of copying.
        return np.asarray(a)

    return fletcher_array
def test_chunkdataframe():
    """Check that ``ChunkDataFrame`` only gains column ``x`` once it is accessed.

    Writes 300 jagged rows to ``.test.root``, opens entries 0-10 as a
    ``ChunkDataFrame``, and asserts that ``x`` is missing from ``columns``
    before the first ``df["x"]`` access and present afterwards.
    """
    jagged = fletcher.FletcherContinuousArray(
        100 * [[1.0, 2.0], [], [3.0, 4.0, 5.0]]
    )
    counter = np.arange(len(jagged), dtype=float)
    pd.DataFrame({"x": jagged, "y": counter}).to_root(
        ".test.root", compression_jagged=None
    )

    df = ChunkDataFrame(
        filename=".test.root", treename="t", entry_start=0, entry_stop=10
    )
    # The order of these three assertions matters: the middle one performs
    # the access that the other two bracket.
    assert "x" not in df.columns
    assert len(df["x"]) == 10
    assert "x" in df.columns
def test_text_cat(data, fletcher_variant, fletcher_variant_2):
    """Compare ``fr_text.cat`` of a series with itself against pandas.

    The left and right operands are built from the same Arrow data but may
    use different Fletcher array variants (``fletcher_variant`` and
    ``fletcher_variant_2``).
    """
    for value in data:
        if value and "\x00" in value:
            # pytest.skip("pandas cannot handle \\x00 characters in tests")
            # Skip is not working properly with hypothesis
            return

    def as_fletcher_series(variant, arrow_values):
        # "chunked" selects the chunked array; anything else is continuous.
        if variant == "chunked":
            return pd.Series(fr.FletcherChunkedArray(arrow_values))
        return pd.Series(fr.FletcherContinuousArray(arrow_values))

    pandas_series = pd.Series(data, dtype=str)
    arrow_data = pa.array(data, type=pa.string())
    left = as_fletcher_series(fletcher_variant, arrow_data)
    right = as_fletcher_series(fletcher_variant_2, arrow_data)

    result_pd = pandas_series.str.cat(pandas_series)
    result_fr = left.fr_text.cat(right).astype(object)

    # Pandas returns np.nan for NA values in cat, keep this in line
    result_fr[result_fr.isna()] = np.nan
    tm.assert_series_equal(result_fr, result_pd)
def test_chunkdataframe_subset():
    """Check that a column read after ``iloc`` slicing matches the slice.

    Column ``x`` is accessed on the full ``ChunkDataFrame`` first. The frame
    is then reduced with ``iloc`` and column ``y`` is accessed afterwards;
    its values must equal the same slice of the original ``y`` data.
    """
    jagged = fletcher.FletcherContinuousArray(
        100 * [[1.0, 2.0], [], [3.0, 4.0, 5.0]]
    )
    y = np.arange(len(jagged), dtype=float)
    pd.DataFrame({"x": jagged, "y": y}).to_root(
        ".test.root", compression_jagged=None
    )

    chunk = ChunkDataFrame(
        filename=".test.root", treename="t", entry_start=0, entry_stop=10
    )
    # Access one column before taking the subset.
    chunk["x"]

    every_other = slice(0, 10, 2)
    subset = chunk.iloc[every_other]

    # Now check that the right subset of y is read.
    subset["y"]
    assert (subset["y"] == y[every_other]).all()
def test_text_zfill(data, str_accessor, fletcher_variant):
    """Compare ``zfill`` on a Fletcher string accessor against pandas.

    The fill width is one more than the longest string length reported by
    ``_optional_len``, or 1 when that maximum is NA.
    """
    for value in data:
        if value and "\x00" in value:
            # pytest.skip("pandas cannot handle \\x00 characters in tests")
            # Skip is not working properly with hypothesis
            return

    pandas_series = pd.Series(data, dtype=str)
    longest = pandas_series.map(_optional_len).max()
    width = (0 if pd.isna(longest) else longest) + 1

    arrow_data = pa.array(data, type=pa.string())
    array_cls = (
        fr.FletcherChunkedArray
        if fletcher_variant == "chunked"
        else fr.FletcherContinuousArray
    )
    fletcher_series = pd.Series(array_cls(arrow_data))

    result_pd = pandas_series.str.zfill(width)
    result_fr = getattr(fletcher_series, str_accessor).zfill(width).astype(object)

    # Use np.nan for missing values on the Fletcher side before comparing.
    result_fr[result_fr.isna()] = np.nan
    tm.assert_series_equal(result_fr, result_pd)