示例#1
0
    def setup(self):
        """Create the integer and string series consumed by the benchmarks.

        Populates ``self`` with pandas- and Fletcher-backed series built from
        the same underlying data: 2**24 random integers (with and without a
        null mask) and 2**18 smaller integers rendered as strings.  The random
        seed is fixed so every run sees the same values.
        """
        np.random.seed(93487)
        # TODO: Is it maybe faster to separate each type into its own Take* class?
        #       It seems like the data is regenerated for each benchmark and thus
        #       is quite some overhead here.
        self.data = np.random.randint(0, 2**20, size=2**24)
        self.pd_int = pd.Series(self.data)
        self.fr_cont_int = pd.Series(fr.FletcherContinuousArray(self.data))
        # Two chunks that together cover *every* element.  The second slice is
        # open-ended on purpose: ending it at ``-1`` would drop the last value
        # and leave the chunked series one element shorter than the others.
        half = len(self.data) // 2
        chunked_data = pa.chunked_array([
            pa.array(self.data[:half]),
            pa.array(self.data[half:]),
        ])
        self.fr_chunked_int = pd.Series(fr.FletcherChunkedArray(chunked_data))

        # Boolean mask with one entry per element of ``self.data``.
        mask = np.random.rand(2**24) > 0.8
        self.pd_int_na = pd.Series(pd.arrays.IntegerArray(self.data, mask))
        self.fr_cont_int_na = pd.Series(
            fr.FletcherContinuousArray(pa.array(self.data, mask=mask)))
        # NOTE: this one is built from a plain ``pa.array`` rather than an
        # explicitly chunked array.
        self.fr_chunked_int_na = pd.Series(
            fr.FletcherChunkedArray(pa.array(self.data, mask=mask)))

        self.data_small = np.random.randint(0, 2**16, size=2**18)
        self.data_small_missing = self.data_small.copy()
        # Overwrite every second element (the even indices) with -1.
        self.data_small_missing[0:-1:2] = -1
        data_small_str = self.data_small.astype(str)
        self.pd_str = pd.Series(data_small_str)
        self.fr_cont_str = pd.Series(
            fr.FletcherContinuousArray(data_small_str))
        # Same full-coverage split as for the integer data above.
        half_small = len(data_small_str) // 2
        data_small_str_chunked = pa.chunked_array([
            pa.array(data_small_str[:half_small]),
            pa.array(data_small_str[half_small:]),
        ])
        self.fr_chunked_str = pd.Series(
            fr.FletcherChunkedArray(data_small_str_chunked))
示例#2
0
def test_take_list_arrays():
    """Check ``take`` on list-typed Fletcher arrays, with and without nulls.

    Covers a continuous array holding a null inside a list, and arrays built
    from chunked ``ListArray`` / ``LargeListArray`` data where the requested
    indices span both chunks.
    """
    indices = [0, 1, 4, 3, 5]
    indptr = [0, 2, 3, 5]
    list_array = pa.ListArray.from_arrays(indptr, indices)
    large_list_array = pa.LargeListArray.from_arrays(indptr, indices)

    test_with_null = fr.FletcherContinuousArray(
        pa.array([[1, 2], [None, 3], [4, 5]]))

    assert (pa.array(test_with_null.take([1, 2, 1])).to_pylist() ==
            [[None, 3], [4, 5], [None, 3]])

    test = fr.FletcherContinuousArray(
        pa.chunked_array([list_array, list_array])).take([0, 5, 1])
    test_large = fr.FletcherContinuousArray(
        pa.chunked_array([large_list_array, large_list_array])).take([0, 5, 1])
    expected = [[0, 1], [3, 5], [4]]
    for taken in (test, test_large):
        # Guard against a vacuous pass when ``take`` returns too few rows.
        assert len(taken) == len(expected)
        for position, expected_row in enumerate(expected):
            # ``expected`` is ragged, so convert row by row: turning the
            # whole nested list into one ndarray raises on recent NumPy.
            assert np.array_equal(np.array(taken[position]),
                                  np.array(expected_row))
示例#3
0
    def setup(self):
        """Create plain and null-masked integer series for pandas and Fletcher."""
        n_rows = 2**24
        values = np.random.randint(0, 2**20, size=n_rows)

        # Series without missing values.
        continuous = fr.FletcherContinuousArray(values)
        chunked = fr.FletcherChunkedArray(values)
        self.pd_int = pd.Series(values)
        self.fr_cont_int = pd.Series(continuous)
        self.fr_chunked_int = pd.Series(chunked)

        # Series sharing one boolean mask (True where the draw exceeds 0.8).
        na_mask = np.random.rand(n_rows) > 0.8
        self.pd_int_na = pd.Series(pd.arrays.IntegerArray(values, na_mask))
        continuous_na = fr.FletcherContinuousArray(
            pa.array(values, mask=na_mask))
        self.fr_cont_int_na = pd.Series(continuous_na)
        chunked_na = fr.FletcherChunkedArray(pa.array(values, mask=na_mask))
        self.fr_chunked_int_na = pd.Series(chunked_na)
示例#4
0
def _fr_series_from_data(data, fletcher_variant, dtype=pa.string()):
    """Return a pandas Series backed by a Fletcher array built from ``data``.

    ``fletcher_variant == "chunked"`` selects ``FletcherChunkedArray``; any
    other value selects ``FletcherContinuousArray``.  ``dtype`` is the Arrow
    type handed to ``pa.array``.
    """
    as_arrow = pa.array(data, type=dtype)
    array_cls = (fr.FletcherChunkedArray
                 if fletcher_variant == "chunked"
                 else fr.FletcherContinuousArray)
    return pd.Series(array_cls(as_arrow))
示例#5
0
def _do_test_text_strip(str_accessor,
                        fletcher_variant,
                        fletcher_slice_offset,
                        data,
                        strip_method="strip"):
    """Compare a Fletcher strip-style string method against pandas.

    ``strip_method`` names the method looked up both on ``Series.str`` and on
    the Fletcher accessor ``str_accessor``.  The Fletcher array is built with
    ``fletcher_slice_offset`` leading nulls that are sliced off again before
    the comparison.
    """
    # pytest.skip("pandas cannot handle \\x00 characters in tests")
    # Skip is not working properly with hypothesis, so bail out silently.
    for value in data:
        if value and "\x00" in value:
            return

    pandas_series = pd.Series(data, dtype=str)

    padded = pa.array([None] * fletcher_slice_offset + data, type=pa.string())
    array_cls = (fr.FletcherChunkedArray
                 if fletcher_variant == "chunked"
                 else fr.FletcherContinuousArray)
    fletcher_series = pd.Series(array_cls(padded)[fletcher_slice_offset:])

    expected = getattr(pandas_series.str, strip_method)()
    fletcher_method = getattr(getattr(fletcher_series, str_accessor),
                              strip_method)
    actual = fletcher_method().astype(object)
    # Normalise missing values on both sides to np.nan before comparing.
    actual[actual.isna()] = np.nan
    expected[expected.isna()] = np.nan
    tm.assert_series_equal(actual, expected)
示例#6
0
def test_awkward_accessor():
    """Write a jagged column with ``to_root``, read it back and sum each row.

    The per-row sums are checked both through the ``ak`` series accessor and
    through ``awkward1.sum``.
    """
    jagged = fletcher.FletcherContinuousArray(
        [[1.0, 2.0], [], [3.0, 4.0, 5.0]])
    zeros = np.zeros(len(jagged), dtype=float)
    frame = pd.DataFrame({"x": jagged, "y": zeros})
    frame.to_root(".test.root", compression_jagged=None)

    reloaded = pd.read_root(".test.root")
    accessor_sums = reloaded["x"].ak(0).sum().tolist()
    assert accessor_sums == [3.0, 0.0, 12.0]
    awkward_sums = awkward1.sum(reloaded["x"], axis=-1).tolist()
    assert awkward_sums == [3.0, 0.0, 12.0]
示例#7
0
def test_jagged():
    """A jagged column must survive a ``to_root`` / ``read_root`` round trip."""
    written = fletcher.FletcherContinuousArray(
        [[1.0, 2.0], [], [3.0, 4.0, 5.0]])
    pd.DataFrame({"x": written}).to_root(".test.root", compression_jagged=None)

    read_back = pd.read_root(".test.root")["x"].values

    # Compare as plain nested Python lists.
    rows_written = [list(row) for row in written.data]
    rows_read = [list(row) for row in read_back.data]
    assert rows_written == rows_read
示例#8
0
def test_flatten():
    """``flatten`` concatenates the list entries for both Fletcher array kinds."""
    nested = pa.array([[1, 2], [3, 4]])
    flat_continuous = fr.FletcherContinuousArray(nested).flatten()
    npt.assert_array_equal(flat_continuous, [1, 2, 3, 4])

    # Same nested data repeated as two chunks.
    nested_chunks = pa.chunked_array([nested, nested])
    flat_chunked = fr.FletcherChunkedArray(nested_chunks).flatten()
    npt.assert_array_equal(flat_chunked, [1, 2, 3, 4, 1, 2, 3, 4])
示例#9
0
def test_pandas_from_arrow():
    """``pandas_from_arrow`` accepts arrays, record batches and tables.

    Without ``continuous=True`` the result is expected to be backed by
    ``FletcherChunkedArray``; with it, by ``FletcherContinuousArray``.
    """
    arr = pa.array(["a", "b", "c"], pa.string())

    # A bare Arrow array converts to an unnamed Series.
    chunked_series = pd.Series(fr.FletcherChunkedArray(arr))
    pdt.assert_series_equal(chunked_series, fr.pandas_from_arrow(arr))

    continuous_series = pd.Series(fr.FletcherContinuousArray(arr))
    pdt.assert_series_equal(
        continuous_series, fr.pandas_from_arrow(arr, continuous=True)
    )

    # Record batches and tables convert to a DataFrame.
    record_batch = pa.RecordBatch.from_arrays([arr], ["column"])
    table = pa.Table.from_arrays([arr], ["column"])

    chunked_frame = pd.DataFrame({"column": fr.FletcherChunkedArray(arr)})
    for source in (record_batch, table):
        pdt.assert_frame_equal(chunked_frame, fr.pandas_from_arrow(source))

    continuous_frame = pd.DataFrame({"column": fr.FletcherContinuousArray(arr)})
    for source in (record_batch, table):
        pdt.assert_frame_equal(
            continuous_frame, fr.pandas_from_arrow(source, continuous=True)
        )
示例#10
0
def array_to_fletcher_or_numpy(array):
    """Convert ``array`` to a Fletcher array or, when possible, a NumPy array.

    ``array`` is first converted with ``awkward1.to_arrow`` and wrapped in a
    ``FletcherContinuousArray``.  That wrapper is returned when ``array`` has
    two or more dimensions, when the Arrow data contains nulls, or when the
    resulting dtype is a list type.  Otherwise the array's layout (or its
    ``content`` attribute, if present) is returned as a NumPy array.
    """
    import fletcher

    arrow_array = awkward1.to_arrow(array)
    fletcher_array = fletcher.FletcherContinuousArray(arrow_array)
    if (array.ndim >= 2) or (fletcher_array.data.null_count > 0):
        return fletcher_array
    if "list<" not in str(fletcher_array.dtype):
        a = array.layout
        if hasattr(a, "content"):
            a = a.content
        # ``np.asarray`` copies only when it has to.  ``np.array(a, copy=False)``
        # had the same meaning on NumPy 1.x but raises on NumPy 2.x whenever a
        # copy cannot be avoided.
        return np.asarray(a)
    return fletcher_array
示例#11
0
def test_chunkdataframe():
    """A ``ChunkDataFrame`` column is listed only after it has been accessed.

    The assertions are order-dependent: "x" must be absent from ``columns``
    before the first access and present afterwards.
    """
    jagged = fletcher.FletcherContinuousArray(
        [[1.0, 2.0], [], [3.0, 4.0, 5.0]] * 100)
    counter = np.arange(len(jagged), dtype=float)
    pd.DataFrame({"x": jagged, "y": counter}).to_root(
        ".test.root", compression_jagged=None)

    chunk = ChunkDataFrame(
        filename=".test.root",
        treename="t",
        entry_start=0,
        entry_stop=10,
    )
    assert "x" not in chunk.columns
    assert len(chunk["x"]) == 10
    assert "x" in chunk.columns
示例#12
0
def test_text_cat(data, fletcher_variant, fletcher_variant_2):
    """``fr_text.cat`` must match ``Series.str.cat`` for every variant pairing.

    The series is concatenated with a second Fletcher series built from the
    same data; ``fletcher_variant`` and ``fletcher_variant_2`` pick chunked or
    continuous storage for the left and right operand respectively.
    """
    # pytest.skip("pandas cannot handle \\x00 characters in tests")
    # Skip is not working properly with hypothesis, so bail out silently.
    for value in data:
        if value and "\x00" in value:
            return

    pandas_series = pd.Series(data, dtype=str)
    arrow_data = pa.array(data, type=pa.string())

    def as_fletcher_series(variant):
        # "chunked" selects the chunked array; anything else the continuous one.
        array_cls = (fr.FletcherChunkedArray
                     if variant == "chunked"
                     else fr.FletcherContinuousArray)
        return pd.Series(array_cls(arrow_data))

    left = as_fletcher_series(fletcher_variant)
    right = as_fletcher_series(fletcher_variant_2)

    expected = pandas_series.str.cat(pandas_series)
    actual = left.fr_text.cat(right).astype(object)
    # Pandas returns np.nan for NA values in cat, keep this in line
    actual[actual.isna()] = np.nan
    tm.assert_series_equal(actual, expected)
示例#13
0
def test_chunkdataframe_subset():
    """Slicing a ``ChunkDataFrame`` must restrict columns accessed afterwards.

    "x" is accessed before the frame is sliced with ``iloc``; "y" is accessed
    only after, and must equal the same slice of the values that were written.
    """
    jagged = fletcher.FletcherContinuousArray(
        [[1.0, 2.0], [], [3.0, 4.0, 5.0]] * 100)
    y = np.arange(len(jagged), dtype=float)
    pd.DataFrame({"x": jagged, "y": y}).to_root(
        ".test.root", compression_jagged=None)

    chunk = ChunkDataFrame(
        filename=".test.root",
        treename="t",
        entry_start=0,
        entry_stop=10,
    )

    # Access "x" before taking the subset.
    _ = chunk["x"]
    every_other = slice(0, 10, 2)
    subset = chunk.iloc[every_other]
    # now check that the right subset of y is read
    _ = subset["y"]
    matches = subset["y"] == y[every_other]
    assert matches.all()
示例#14
0
def test_text_zfill(data, str_accessor, fletcher_variant):
    """The Fletcher ``zfill`` must match ``Series.str.zfill``.

    The fill width is one more than the largest value ``_optional_len``
    yields over the data (treated as 0 when that maximum is NA).
    """
    # pytest.skip("pandas cannot handle \\x00 characters in tests")
    # Skip is not working properly with hypothesis, so bail out silently.
    for value in data:
        if value and "\x00" in value:
            return

    pandas_series = pd.Series(data, dtype=str)
    longest = pandas_series.map(_optional_len).max()
    width = (0 if pd.isna(longest) else longest) + 1

    array_cls = (fr.FletcherChunkedArray
                 if fletcher_variant == "chunked"
                 else fr.FletcherContinuousArray)
    fletcher_series = pd.Series(array_cls(pa.array(data, type=pa.string())))

    expected = pandas_series.str.zfill(width)
    actual = getattr(fletcher_series, str_accessor).zfill(width).astype(object)
    # Normalise missing values to np.nan so both sides use the same marker.
    actual[actual.isna()] = np.nan
    tm.assert_series_equal(actual, expected)