# Imports assumed for this excerpt of the test module; module-level fixtures such as
# TEST_ARRAY, TEST_LIST and generate_test_array are defined elsewhere in the suite.
import numpy as np
import numpy.testing as npt
import pandas as pd
import pandas.testing as pdt
import pandas.util.testing as tm
import pyarrow as pa
import pytest

import fletcher as fr


def test_reduce_max_min_no_nulls():
    test = [1, 2, 3, -23, 75]
    fr_test_int = fr.FletcherArray(test, dtype=pa.int64())
    fr_test_float = fr.FletcherArray(test, dtype=pa.float64())
    fr_test_int_no_nulls = fr.FletcherArray(
        pa.Array.from_buffers(
            type=pa.int64(),
            length=len(fr_test_int),
            buffers=[None, fr_test_int.data.chunk(0).buffers()[1]],
        )
    )
    fr_test_float_no_nulls = fr.FletcherArray(
        pa.Array.from_buffers(
            type=pa.float64(),
            length=len(fr_test_float),
            buffers=[None, fr_test_float.data.chunk(0).buffers()[1]],
        )
    )

    result_int_max = fr_test_int_no_nulls._reduce("max")
    result_int_min = fr_test_int_no_nulls._reduce("min")
    result_float_max = fr_test_float_no_nulls._reduce("max")
    result_float_min = fr_test_float_no_nulls._reduce("min")

    expected_result_int_max = 75
    expected_result_int_min = -23
    expected_result_float_max = 75.0
    expected_result_float_min = -23.0

    assert result_int_max == expected_result_int_max
    assert result_int_min == expected_result_int_min
    assert result_float_max == expected_result_float_max
    assert result_float_min == expected_result_float_min


def test_take_list_arrays():
    indices = [0, 1, 4, 3, 5]
    indptr = [0, 2, 3, 5]
    list_array = pa.ListArray.from_arrays(indptr, indices)
    large_list_array = pa.LargeListArray.from_arrays(indptr, indices)

    test_with_null = fr.FletcherArray(pa.array([[1, 2], [None, 3], [4, 5]]))
    assert np.all(
        pa.array(test_with_null.take([1, 2, 1])).to_pylist()
        == [[None, 3], [4, 5], [None, 3]]
    )

    test = fr.FletcherArray(pa.chunked_array([list_array, list_array])).take([0, 5, 1])
    test_large = fr.FletcherArray(
        pa.chunked_array([large_list_array, large_list_array])
    ).take([0, 5, 1])
    expected = [[0, 1], [3, 5], [4]]
    assert np.all(
        list(
            map(
                lambda x: np.all(np.array(test[x]) == np.array(expected)[x]),
                range(0, len(test)),
            )
        )
    )
    assert np.all(
        list(
            map(
                lambda x: np.all(np.array(test_large[x]) == np.array(expected)[x]),
                range(0, len(test_large)),
            )
        )
    )


def test_take_on_chunks_with_many_chunks():
    test = [[1, 2, 3] for _ in range(100)]
    fr_test = fr.FletcherArray(pa.chunked_array(test))

    indices1 = np.array(
        [(30 * k + (k % 3)) for k in range(0, 10)]
    )  # bins will be already sorted
    indices2 = np.array([2, 5] * 100)  # bins will have to be sorted

    limits_idx1 = np.array([0] + [k // 10 for k in range(10, 110)])
    limits_idx2 = np.array([0] + [100] + [200] * 99)

    sort_idx1 = None
    sort_idx2 = np.array(
        [2 * k for k in range(0, 100)] + [2 * k + 1 for k in range(100)]
    )
    indices2 = indices2[sort_idx2]

    cum_lengths = np.array([3 * k for k in range(100)])

    for indices, limits_idx, cum_lengths, sort_idx in [
        (indices1, limits_idx1, cum_lengths, sort_idx1),
        (indices2, limits_idx2, cum_lengths, sort_idx2),
    ]:
        expected_result = fr.FletcherArray([np.concatenate(test)[e] for e in indices])
        result = fr_test._take_on_chunks(
            indices, limits_idx=limits_idx, cum_lengths=cum_lengths, sort_idx=sort_idx
        )
        npt.assert_array_equal(expected_result, result)


def test_concatenate_blocks():
    v1 = fr.FletcherArray(TEST_ARRAY)
    s = pd.Series(v1, index=pd.RangeIndex(3), fastpath=True)
    result = pd.concat([s, s], ignore_index=True)
    expected = pd.Series(
        fr.FletcherArray(pa.array(["Test", "string", None, "Test", "string", None]))
    )
    tm.assert_series_equal(result, expected)


def test_take():
    test = [[1, 2, 8, 3], [4, 1, 5, 6], [7, 8, 9]]
    indices = [4, 2, 8] * 100
    fr_test = fr.FletcherArray(pa.chunked_array(test))
    result = fr_test.take(indices)
    expected_result = fr.FletcherArray(
        pa.chunked_array([[4, 8, 7] for _ in range(100)])
    )
    npt.assert_array_equal(expected_result, result)


def test_take_on_concatenated_chunks():
    test = [[1, 2, 8, 3], [4, 1, 5, 6], [7, 8, 9]]
    indices = np.array([4, 2, 8])
    expected_result = fr.FletcherArray([np.concatenate(test)[e] for e in indices])
    result = fr.FletcherArray(pa.chunked_array(test))._take_on_concatenated_chunks(
        indices
    )
    npt.assert_array_equal(expected_result, result)


def setup(self):
    data = np.zeros(2 ** 24).astype(bool)
    self.fr_data = pd.Series(fr.FletcherArray(pa.array(data)))
    self.np_data = pd.Series(data.astype(np.float32))

    data_withna = np.zeros(2 ** 24).astype(bool).astype(object)
    data_withna[-1] = None
    self.fr_data_withna = pd.Series(fr.FletcherArray(pa.array(data_withna)))
    self.np_data_withna = pd.Series(data_withna.astype(np.float32))


def test_indices_dtype():
    arr1 = fr.FletcherArray(np.zeros(np.iinfo(np.int32()).max + 1))
    arr2 = fr.FletcherArray(np.zeros(np.iinfo(np.int32()).max + 2))
    for arr in [arr1, arr2]:
        npt.assert_equal(
            len(arr) - 1, np.array([len(arr) - 1], dtype=arr._indices_dtype)[0]
        )
    npt.assert_equal(arr1._indices_dtype, np.dtype(np.int32))
    npt.assert_equal(arr2._indices_dtype, np.dtype(np.int64))


def test_pandas_from_arrow():
    arr = pa.array(["a", "b", "c"], pa.string())

    expected_series_woutname = pd.Series(fr.FletcherArray(arr))
    pdt.assert_series_equal(expected_series_woutname, fr.pandas_from_arrow(arr))

    rb = pa.RecordBatch.from_arrays([arr], ["column"])
    expected_df = pd.DataFrame({"column": fr.FletcherArray(arr)})
    pdt.assert_frame_equal(expected_df, fr.pandas_from_arrow(rb))


def test_reduce_mean():
    test = [[1, 2, 3], [1, 2, None]]
    fr_test_int = fr.FletcherArray(pa.chunked_array(test), dtype=pa.int64())
    fr_test_float = fr.FletcherArray(pa.chunked_array(test), dtype=pa.float64())
    result_int = fr_test_int._reduce("mean")
    result_float = fr_test_float._reduce("mean")
    expected_result = 9 / 5
    assert result_int == expected_result
    assert result_float == expected_result


def test_take_on_chunks():
    test = [[1, 2, 8, 3], [4, 1, 5, 6], [7, 8, 9]]
    indices = np.array([2, 4, 8])
    limits_idx = np.array([0, 1, 2, 3])
    cum_lengths = np.array([0, 4, 8])
    expected_result = fr.FletcherArray([np.concatenate(test)[e] for e in indices])
    result = fr.FletcherArray(pa.chunked_array(test))._take_on_chunks(
        indices, limits_idx=limits_idx, cum_lengths=cum_lengths
    )
    npt.assert_array_equal(expected_result, result)


def test_take_on_concatenated_chunks_with_many_chunks():
    test = [[1, 2, 3] for _ in range(100)]
    fr_test = fr.FletcherArray(pa.chunked_array(test))
    indices1 = np.array([(30 * k + (k % 3)) for k in range(0, 10)])
    indices2 = np.array([2, 5] * 100)
    for indices in [indices1, indices2]:
        expected_result = fr.FletcherArray([np.concatenate(test)[e] for e in indices])
        result = fr_test._take_on_concatenated_chunks(indices)
        npt.assert_array_equal(expected_result, result)


def test_dataframe_from_series_no_dict():
    s = pd.Series(fr.FletcherArray(TEST_ARRAY))
    result = pd.DataFrame(s)
    expected = pd.DataFrame({0: s})
    tm.assert_frame_equal(result, expected)

    s = pd.Series(fr.FletcherArray(TEST_ARRAY), name="A")
    result = pd.DataFrame(s)
    expected = pd.DataFrame({"A": s})
    tm.assert_frame_equal(result, expected)


def test_flatten():
    test = [[1, 2], [], [3, 4]]
    empty_array = [[], [], [], []]
    fr_test = fr.FletcherArray(test)
    fr_test_empty_array = fr.FletcherArray(
        pa.chunked_array([empty_array, empty_array])
    )
    npt.assert_array_equal(fr_test.flatten(), fr.FletcherArray([1, 2, 3, 4]))
    npt.assert_array_equal(
        fr_test_empty_array.flatten(), np.array([], dtype=np.int64)
    )


def test_arrow_array_types(self):  # noqa: F811
    fr_arr = fr.FletcherArray(pa.array([3, None, 4.4]))
    # non-safe casting
    assert fr_arr.__arrow_array__(type=pa.int64()).equals(pa.array([3, None, 4]))
    assert fr_arr.data.chunk(0).equals(pa.array([3, None, 4.4]))

    fr_arr = fr.FletcherArray(pa.array(["3", "-2", "4.4"]))
    # non-safe casting
    assert fr_arr.__arrow_array__(type=pa.float64()).equals(pa.array([3, -2, 4.4]))
    assert fr_arr.data.chunk(0).equals(pa.array(["3", "-2", "4.4"]))


def test_fillna_chunked(test_array_chunked):
    ser = pd.Series(fr.FletcherArray(test_array_chunked))
    ser = ser.fillna("filled")

    expected_list = TEST_LIST[:2] + ["filled"]
    chunks = []
    for _ in range(10):
        chunks.append(pa.array(expected_list))
    chunked_exp = pa.chunked_array(chunks)
    expected = pd.Series(fr.FletcherArray(chunked_exp))

    tm.assert_series_equal(ser, expected)


def test_groupby():
    arr = fr.FletcherArray(["a", "a", "b", None])
    df = pd.DataFrame({"str": arr, "int": [10, 5, 24, 6]})
    result = df.groupby("str").sum()
    expected = pd.DataFrame(
        {"int": [15, 24]}, index=pd.Index(["a", "b"], name="str")
    )
    tm.assert_frame_equal(result, expected)


def setup(self, chunked, value, indices):
    # assert np.isscalar(value) or len(value) == len(indices)
    array = generate_test_array(self.n)

    if indices == "int":
        if value == "array_value":
            raise NotImplementedError()
        self.indexer = 50
    elif indices == "int_array":
        self.indexer = list(range(0, self.n, 5))
    elif indices == "bool_array":
        self.indexer = np.zeros(self.n, dtype=bool)
        self.indexer[list(range(0, self.n, 5))] = True
    elif indices == "slice":
        self.indexer = slice(0, self.n, 5)

    if value == "scalar_value":
        self.value = "setitem"
    elif value == "array_value":
        self.value = [str(x) for x in range(self.n)]
        self.value = np.array(self.value)[self.indexer]
        if len(self.value) == 1:
            self.value = self.value[0]

    self.df = pd.DataFrame({"str": array})
    if chunked:
        array = np.array_split(array, 1000)
    else:
        array = [array]
    self.df_ext = pd.DataFrame(
        {
            "str": fr.FletcherArray(
                pa.chunked_array([pa.array(chunk, pa.string()) for chunk in array])
            )
        }
    )


def test_setitem_chunked(test_array_chunked):
    ser = pd.Series(fr.FletcherArray(test_array_chunked))
    new_val = "new_value"
    old_val = ser[15]
    assert new_val != old_val
    ser[15] = new_val
    assert new_val == ser[15]


def test_max_min_with_offset():
    # pyarrow fills the buffer with value zero when there is a null,
    # so we do a test with only negative values.
    test = [[-30, None, -1, None], [-2, -15, -6]]
    fr_test = fr.FletcherArray(pa.chunked_array(test))
    assert fr_test[1:]._reduce("max") == -1
    assert fr_test[1:]._reduce("min") == -15


def test_nbytes():
    array = fr.FletcherArray(pa.array(["A", None, "CC"]))
    # Minimal storage usage:
    #   - 1 byte for the valid bitmap
    #   - 4 bytes for the offset array
    #   - 3 bytes for the actual string content
    assert array.nbytes >= 8


def test_reduce_sum():
    test = [[1, 2, 3], [1, 2, None]]
    fr_test_int = fr.FletcherArray(pa.chunked_array(test), dtype=pa.int64())
    fr_test_float = fr.FletcherArray(pa.chunked_array(test), dtype=pa.float64())
    result_int = fr_test_int._reduce("sum")
    result_float = fr_test_float._reduce("sum")
    expected_result_int = 9
    expected_result_float = 9.0
    assert result_int == expected_result_int
    assert result_float == expected_result_float
    assert fr.FletcherArray([], dtype=pa.int32())._reduce("sum") == 0


def test_series_attributes():
    s = pd.Series(fr.FletcherArray(TEST_ARRAY))
    assert s.ndim == 1
    assert s.size == 3
    assert s.values is not None
    # This line currently fails with pandas master:
    # https://github.com/pandas-dev/pandas/issues/22414
    assert (s.T == s).all()
    assert s.memory_usage() > 8


def test_dataframe_constructor():
    v = fr.FletcherArray(TEST_ARRAY)
    df = pd.DataFrame({"A": v})
    assert isinstance(df.dtypes["A"], fr.FletcherDtype)
    assert df.shape == (3, 1)

    # Test some calls to typical DataFrame functions
    str(df)
    df.info()


def test_text_cat(data):
    if any("\x00" in x for x in data if x):
        # pytest.skip("pandas cannot handle \\x00 characters in tests")
        # Skip is not working properly with hypothesis
        return
    ser_pd = pd.Series(data, dtype=str)
    arrow_data = pa.array(data, type=pa.string())
    fr_array = fr.FletcherArray(arrow_data)
    ser_fr = pd.Series(fr_array)
    fr_other_array = fr.FletcherArray(arrow_data)
    ser_fr_other = pd.Series(fr_other_array)

    result_pd = ser_pd.str.cat(ser_pd)
    result_fr = ser_fr.fr_text.cat(ser_fr_other)
    result_fr = result_fr.astype(object)
    # Pandas returns np.nan for NA values in cat, keep this in line
    result_fr[result_fr.isna()] = np.nan
    tm.assert_series_equal(result_fr, result_pd)


def test_factorize():
    arr = fr.FletcherArray(TEST_ARRAY)
    labels, uniques = arr.factorize()
    expected_labels, expected_uniques = pd.factorize(arr.astype(object))

    assert isinstance(uniques, fr.FletcherArray)
    uniques = uniques.astype(object)
    npt.assert_array_equal(labels, expected_labels)
    npt.assert_array_equal(uniques, expected_uniques)


def test_access_element_with_np_integers():
    inputs = fr.FletcherArray([1, 2, 5, 7])
    head = inputs[np.int64(0)]
    second = inputs[np.int32(1)]
    third = inputs[np.int16(2)]
    last = inputs[np.int8(3)]
    assert head == 1
    assert second == 2
    assert third == 5
    assert last == 7


def test_factorize_with_offset(test, test_with_nulls):
    fr_test = fr.FletcherArray(test)
    result_indices_with_offset, result_unique_with_offset = fr_test[1:].factorize()
    expected_indices_with_offset, expected_unique_with_offset = (
        [0, 0, 0],
        fr.FletcherArray(["b"]),
    )
    npt.assert_array_equal(result_indices_with_offset, expected_indices_with_offset)
    npt.assert_array_equal(result_unique_with_offset, expected_unique_with_offset)

    test_with_chunks_and_nulls = fr.FletcherArray(
        pa.chunked_array([test_with_nulls, test_with_nulls])
    )[1:]
    indices, unique = test_with_chunks_and_nulls.factorize()
    npt.assert_array_equal(
        test_with_chunks_and_nulls, unique.take(indices, allow_fill=True)
    )


def test_eq():
    test = [[1, 2, 3], [4, 5, 1, None]]
    fr_test = fr.FletcherArray(pa.chunked_array(test))
    df_test = pd.DataFrame({"a": fr.FletcherArray(pa.chunked_array(test))})["a"]

    result = fr_test == 1
    expected_result = np.array([True, False, False, False, False, True, False])
    npt.assert_array_equal(result, expected_result)

    npt.assert_array_equal(fr_test == fr_test, np.array([True] * 6 + [False]))
    npt.assert_array_equal(df_test == 2, np.array([False, True] + 5 * [False]))
    npt.assert_array_equal(
        df_test == np.array([1, 3, 2, 4, 5, 6, 7]),
        np.array([True, False, False, True, True, False, False]),
    )

    with pytest.raises(ValueError) as error_length:
        df_test == [1, 2]
    assert "Lengths must match to compare" == str(error_length.value)


def test_reduce_max_min():
    test = [[1, 2, 3], [-23, 75, None]]
    fr_test_int = fr.FletcherArray(pa.chunked_array(test), dtype=pa.int64())
    fr_test_float = fr.FletcherArray(pa.chunked_array(test), dtype=pa.float64())

    result_int_max = fr_test_int._reduce("max")
    result_int_min = fr_test_int._reduce("min")
    result_float_max = fr_test_float._reduce("max")
    result_float_min = fr_test_float._reduce("min")

    expected_result_int_max = 75
    expected_result_int_min = -23
    expected_result_float_max = 75.0
    expected_result_float_min = -23.0

    assert result_int_max == expected_result_int_max
    assert result_int_min == expected_result_int_min
    assert result_float_max == expected_result_float_max
    assert result_float_min == expected_result_float_min