def test_arrow_dtype():
    """Exercise ArrowStringDtype and ArrowListDtype.

    Covers conversion from an arrow array, parsing from dtype strings,
    ``repr``, ``is_dtype``, inequality, ``kind`` and ``arrow_type``.
    """
    source = pa.array(['a', 'b'])

    # string dtype: conversion from arrow data and parsing from a string
    from_arrow = ArrowStringDtype().__from_arrow__(source)
    assert list(from_arrow) == list(ArrowStringArray(source))
    assert ArrowStringDtype() == \
        ArrowStringDtype.construct_from_string('Arrow[string]')

    # list dtype: nested construction must match the parsed form
    nested = ArrowListDtype(ArrowListDtype('string'))
    assert nested == \
        ArrowListDtype.construct_from_string('Arrow[List[string]]')

    int8_list = ArrowListDtype(np.int8)
    assert repr(int8_list) == 'Arrow[List[int8]]'

    # a non-list dtype string is rejected by the list dtype parser
    with pytest.raises(TypeError):
        ArrowListDtype.construct_from_string('Arrow[string]')

    # is_dtype accepts only the list spelling with the ``Arrow[...]`` wrapper
    assert ArrowListDtype.is_dtype('Arrow[List[uint8]]') is True
    assert ArrowListDtype.is_dtype('List[int8]') is False
    assert ArrowListDtype.is_dtype(ArrowStringDtype()) is False

    # comparison against another dtype and derived attributes
    assert int8_list != ArrowStringDtype()
    assert int8_list.kind == np.dtype(object).kind
    assert int8_list.arrow_type == pa.list_(pa.int8())
def testArrowDtype(self):
    """Exercise ArrowStringDtype and ArrowListDtype.

    Covers conversion from an arrow array, parsing from dtype strings,
    ``repr``, ``is_dtype``, inequality, ``kind`` and ``arrow_type``.
    """
    source = pa.array(['a', 'b'])

    # string dtype: conversion from arrow data and parsing from a string
    from_arrow = ArrowStringDtype().__from_arrow__(source)
    self.assertEqual(list(from_arrow), list(ArrowStringArray(source)))
    self.assertEqual(
        ArrowStringDtype(),
        ArrowStringDtype.construct_from_string('Arrow[string]'))

    # list dtype: nested construction must match the parsed form
    nested = ArrowListDtype(ArrowListDtype('string'))
    self.assertEqual(
        nested,
        ArrowListDtype.construct_from_string('Arrow[List[string]]'))

    int8_list = ArrowListDtype(np.int8)
    self.assertEqual(repr(int8_list), 'Arrow[List[int8]]')

    # a non-list dtype string is rejected by the list dtype parser
    with self.assertRaises(TypeError):
        ArrowListDtype.construct_from_string('Arrow[string]')

    # is_dtype accepts only the list spelling with the ``Arrow[...]`` wrapper
    self.assertTrue(ArrowListDtype.is_dtype('Arrow[List[uint8]]'))
    self.assertFalse(ArrowListDtype.is_dtype('List[int8]'))
    self.assertFalse(ArrowListDtype.is_dtype(ArrowStringDtype()))

    # comparison against another dtype and derived attributes
    self.assertNotEqual(int8_list, ArrowStringDtype())
    self.assertEqual(int8_list.kind, np.dtype(np.int8).kind)
    self.assertEqual(int8_list.arrow_type, pa.list_(pa.int8()))
def testArrowStringArrayCreation(self):
    """Check that each supported input yields a ChunkedArray-backed array.

    Inputs covered: pandas Series, pandas StringArray (when available),
    Python list, pyarrow Array, another ArrowStringArray, a ChunkedArray
    with ``copy=True``, and the ``from_scalars`` / ``_from_sequence`` /
    ``_from_sequence_of_strings`` constructors.
    """
    # create from pandas Series
    series = pd.Series(['a', 'bc', 'de'])
    array = ArrowStringArray(series)
    self.assertIsInstance(array._arrow_array, pa.ChunkedArray)

    # Detect StringArray by feature instead of comparing version strings:
    # ``pd.__version__ >= '1.0.0'`` is a lexicographic comparison, which
    # does not order version numbers correctly in general.
    string_array_type = getattr(
        getattr(pd, 'arrays', None), 'StringArray', None)
    if string_array_type is not None:
        # test create from StringArray which occurs in pandas 1.0
        s = string_array_type(np.array(['a', 'bc', 'de'], dtype=object))
        array = ArrowStringArray(s)
        self.assertIsInstance(array._arrow_array, pa.ChunkedArray)

    # create from list
    lst = ['a', 'bc', 'de']
    array = ArrowStringArray(lst)
    self.assertIsInstance(array._arrow_array, pa.ChunkedArray)

    # create from pyarrow Array
    a = pa.array(['a', 'bc', 'de'])
    array = ArrowStringArray(a)
    self.assertIsInstance(array._arrow_array, pa.ChunkedArray)

    # create from ArrowStringArray
    array2 = ArrowStringArray(array)
    self.assertIsInstance(array2._arrow_array, pa.ChunkedArray)

    # test copy: the copied array must not share the same arrow object
    arrow_array = array2._arrow_array
    array3 = ArrowStringArray(arrow_array, copy=True)
    self.assertIsNot(array3._arrow_array, arrow_array)

    # test from_scalars: non-string scalars end up in a string chunk
    array = ArrowStringArray.from_scalars([1, 2])
    self.assertIsInstance(array._arrow_array, pa.ChunkedArray)
    self.assertIsInstance(array._arrow_array.chunks[0], pa.StringArray)

    # test _from_sequence
    array = ArrowStringArray._from_sequence(['a', 'b', 'cc'])
    self.assertIsInstance(array._arrow_array, pa.ChunkedArray)

    # test _from_sequence_of_strings
    array = ArrowStringArray._from_sequence_of_strings(['a', 'b'])
    self.assertIsInstance(array._arrow_array, pa.ChunkedArray)
def test_arrow_string_array_functions():
    """Compare ArrowStringArray with pandas' StringArray, operation by operation.

    All checks run once for each value of the
    ``dataframe.arrow_array.pandas_only`` option.
    """
    raw = np.array(['abc', 'de', 'eee', '中文'], dtype=object)
    # pandas' StringArray supplies the expected answers
    expected = pd.arrays.StringArray(raw)
    arrow_with_na = ArrowStringArray(['abc', None, 'eee', '中文'])
    expected_with_na = pd.arrays.StringArray(
        np.array(['abc', pd.NA, 'eee', '中文'], dtype=object))

    def same_items(left, right):
        # element-wise comparison after materialising both sides as lists
        assert list(left) == list(right)

    for pandas_only in [False, True]:
        options = {'dataframe.arrow_array.pandas_only': pandas_only}
        with option_context(options):
            actual = ArrowStringArray(raw)

            # getitem, scalar
            for position in (1, -1):
                assert actual[position] == expected[position]

            # getitem, slice
            for window in (slice(None, 2), slice(1, -1),
                           slice(None, None, 2)):
                same_items(actual[window], expected[window])

            # getitem, boolean index
            mask = np.array([len(item) > 2 for item in raw])
            same_items(actual[mask], expected[mask])

            # getitem, fancy index (list and ndarray, with negatives)
            for picks in ([3, 1, 2], [3, -1, 2, -4],
                          np.array([3, -1, 2, -4])):
                same_items(actual[picks], expected[picks])

            # setitem: apply the same mutation to both copies, then compare
            arrow_copy = actual.copy()
            expected_copy = expected.copy()

            arrow_copy[0] = 'ss'
            expected_copy[0] = 'ss'
            same_items(arrow_copy, expected_copy)

            arrow_copy[1:3] = ['ss1', 'ss2']
            expected_copy[1:3] = ['ss1', 'ss2']
            same_items(arrow_copy, expected_copy)

            arrow_copy[1:3] = arrow_copy[2:4]
            expected_copy[1:3] = expected_copy[2:4]
            same_items(arrow_copy, expected_copy)

            arrow_copy[2:] = pd.Series(['ss3', 'ss4'])
            expected_copy[2:] = pd.Series(['ss3', 'ss4'])
            same_items(arrow_copy, expected_copy)

            with pytest.raises(ValueError):
                arrow_copy[0] = ['a', 'b']

            arrow_copy[-1] = None
            expected_copy[-1] = None
            # the trailing missing value is checked separately via pd.isna
            assert list(arrow_copy)[:-1] == list(expected_copy)[:-1]
            assert pd.isna(list(arrow_copy)[-1]) is True

            with pytest.raises(ValueError):
                arrow_copy[0] = 2
            with pytest.raises(ValueError):
                arrow_copy[:2] = [1, 2]

            # test to_numpy
            for kwargs in ({}, {'copy': True}):
                np.testing.assert_array_equal(
                    actual.to_numpy(**kwargs), expected.to_numpy(**kwargs))
            np.testing.assert_array_equal(
                arrow_with_na.to_numpy(copy=True, na_value='ss'),
                expected_with_na.to_numpy(copy=True, na_value='ss'))

            # test fillna
            arrow_filled = arrow_with_na.fillna('filled')
            expected_filled = expected_with_na.fillna('filled')
            same_items(arrow_filled, expected_filled)

            # test astype
            arrow_digits = ArrowStringArray(['1', '10', '100'])
            # pandas' StringArray supplies the expected answers
            expected_digits = pd.arrays.StringArray(
                np.array(['1', '10', '100'], dtype=object))
            np.testing.assert_array_equal(
                arrow_digits.astype(np.int64),
                expected_digits.astype(np.int64))
            np.testing.assert_almost_equal(
                arrow_digits.astype(float), expected_digits.astype(float))
            for copy_flag in (False, True):
                same_items(
                    arrow_digits.astype(ArrowStringDtype(), copy=copy_flag),
                    expected_digits.astype(pd.StringDtype(), copy=copy_flag))

            # test factorize
            arrow_codes, arrow_uniques = actual.factorize()
            expected_codes, expected_uniques = expected.factorize()
            same_items(arrow_codes, expected_codes)
            same_items(arrow_uniques, expected_uniques)

            # test nbytes
            object_series = pd.Series(expected.astype(object))
            assert actual.nbytes < object_series.memory_usage(
                deep=True, index=False)

            # test memory_usage
            if not pandas_only:
                assert actual.memory_usage(deep=True) == actual.nbytes
            else:
                assert actual.memory_usage(deep=False) == \
                    pd.Series(expected).memory_usage(index=False)

            # test isna, also with the ``_force_use_pandas`` flag set
            np.testing.assert_array_equal(
                arrow_with_na.isna(), expected_with_na.isna())
            forced_pandas = arrow_with_na.copy()
            forced_pandas._force_use_pandas = True
            np.testing.assert_array_equal(
                forced_pandas.isna(), expected_with_na.isna())

            # test take
            same_items(actual.take([1, 2, -1]), expected.take([1, 2, -1]))
            same_items(
                actual.take([1, 2, -1], allow_fill=True).fillna('aa'),
                expected.take([1, 2, -1], allow_fill=True).fillna('aa'))
            same_items(
                actual.take([1, 2, -1], allow_fill=True, fill_value='aa'),
                expected.take([1, 2, -1], allow_fill=True, fill_value='aa'))

            # test shift
            same_items(actual.shift(2, fill_value='aa'),
                       expected.shift(2, fill_value='aa'))

            # test value_counts
            same_items(actual.value_counts(), expected.value_counts())
            same_items(arrow_with_na.value_counts(dropna=True),
                       expected_with_na.value_counts(dropna=True))

            # test all any
            assert actual.all() == expected.all()
            assert actual.any() == expected.any()

            # test arithmetic
            same_items(actual + 's', expected + 's')
            same_items((actual + arrow_with_na).fillna('ss'),
                       (expected + expected_with_na).fillna('ss'))

            # test comparison
            np.testing.assert_array_equal(actual < 's', expected < 's')
            pd.testing.assert_series_equal(
                pd.Series(actual < arrow_with_na),
                pd.Series(expected < expected_with_na))

            # test repr
            assert 'ArrowStringArray' in repr(actual)

            # test concat empty
            empty = ArrowStringArray(pa.chunked_array([], type=pa.string()))
            joined = ArrowStringArray._concat_same_type([empty, empty])
            if not pandas_only:
                assert len(joined._arrow_array.chunks) == 1
            pd.testing.assert_series_equal(pd.Series(empty),
                                           pd.Series(joined))
def testArrowStringArrayFunctions(self):
    """Compare ArrowStringArray with pandas' StringArray, operation by operation."""
    raw = np.array(['abc', 'de', 'eee', '中文'], dtype=object)
    actual = ArrowStringArray(raw)
    # pandas' StringArray supplies the expected answers
    expected = pd.arrays.StringArray(raw)
    arrow_with_na = ArrowStringArray(['abc', None, 'eee', '中文'])
    expected_with_na = pd.arrays.StringArray(
        np.array(['abc', pd.NA, 'eee', '中文'], dtype=object))

    def same_items(left, right):
        # element-wise comparison after materialising both sides as lists
        self.assertListEqual(list(left), list(right))

    # getitem, scalar
    for position in (1, -1):
        self.assertEqual(actual[position], expected[position])

    # getitem, slice
    for window in (slice(None, 2), slice(1, -1), slice(None, None, 2)):
        same_items(actual[window], expected[window])

    # getitem, boolean index
    mask = np.array([len(item) > 2 for item in raw])
    same_items(actual[mask], expected[mask])

    # getitem, fancy index (list and ndarray, with negatives)
    for picks in ([3, 1, 2], [3, -1, 2, -4], np.array([3, -1, 2, -4])):
        same_items(actual[picks], expected[picks])

    # setitem: apply the same mutation to both copies, then compare
    arrow_copy = actual.copy()
    expected_copy = expected.copy()

    arrow_copy[0] = 'ss'
    expected_copy[0] = 'ss'
    same_items(arrow_copy, expected_copy)

    arrow_copy[1:3] = ['ss1', 'ss2']
    expected_copy[1:3] = ['ss1', 'ss2']
    same_items(arrow_copy, expected_copy)

    arrow_copy[1:3] = arrow_copy[2:4]
    expected_copy[1:3] = expected_copy[2:4]
    same_items(arrow_copy, expected_copy)

    arrow_copy[2:] = pd.Series(['ss3', 'ss4'])
    expected_copy[2:] = pd.Series(['ss3', 'ss4'])
    same_items(arrow_copy, expected_copy)

    with self.assertRaises(ValueError):
        arrow_copy[0] = ['a', 'b']

    arrow_copy[-1] = None
    expected_copy[-1] = None
    # the trailing missing value is checked separately via pd.isna
    self.assertListEqual(list(arrow_copy)[:-1], list(expected_copy)[:-1])
    self.assertTrue(pd.isna(list(arrow_copy)[-1]))

    with self.assertRaises(ValueError):
        arrow_copy[0] = 2
    with self.assertRaises(ValueError):
        arrow_copy[:2] = [1, 2]

    # test to_numpy
    for kwargs in ({}, {'copy': True}):
        np.testing.assert_array_equal(
            actual.to_numpy(**kwargs), expected.to_numpy(**kwargs))
    np.testing.assert_array_equal(
        arrow_with_na.to_numpy(copy=True, na_value='ss'),
        expected_with_na.to_numpy(copy=True, na_value='ss'))

    # test fillna
    arrow_filled = arrow_with_na.fillna('filled')
    expected_filled = expected_with_na.fillna('filled')
    same_items(arrow_filled, expected_filled)

    # test astype
    arrow_digits = ArrowStringArray(['1', '10', '100'])
    # pandas' StringArray supplies the expected answers
    expected_digits = pd.arrays.StringArray(
        np.array(['1', '10', '100'], dtype=object))
    np.testing.assert_array_equal(
        arrow_digits.astype(np.int64), expected_digits.astype(np.int64))
    np.testing.assert_almost_equal(
        arrow_digits.astype(float), expected_digits.astype(float))
    for copy_flag in (False, True):
        same_items(
            arrow_digits.astype(ArrowStringDtype(), copy=copy_flag),
            expected_digits.astype(pd.StringDtype(), copy=copy_flag))

    # test factorize
    arrow_codes, arrow_uniques = actual.factorize()
    expected_codes, expected_uniques = expected.factorize()
    same_items(arrow_codes, expected_codes)
    same_items(arrow_uniques, expected_uniques)

    # test nbytes
    self.assertLess(actual.nbytes,
                    pd.Series(expected).memory_usage(deep=True))

    # test memory_usage
    self.assertEqual(actual.memory_usage(deep=True), actual.nbytes)

    # test isna, also with the ``_force_use_pandas`` flag set
    np.testing.assert_array_equal(
        arrow_with_na.isna(), expected_with_na.isna())
    forced_pandas = arrow_with_na.copy()
    forced_pandas._force_use_pandas = True
    np.testing.assert_array_equal(
        forced_pandas.isna(), expected_with_na.isna())

    # test take
    same_items(actual.take([1, 2, -1]), expected.take([1, 2, -1]))
    same_items(
        actual.take([1, 2, -1], allow_fill=True).fillna('aa'),
        expected.take([1, 2, -1], allow_fill=True).fillna('aa'))
    same_items(
        actual.take([1, 2, -1], allow_fill=True, fill_value='aa'),
        expected.take([1, 2, -1], allow_fill=True, fill_value='aa'))

    # test shift
    same_items(actual.shift(2, fill_value='aa'),
               expected.shift(2, fill_value='aa'))

    # test value_counts
    same_items(actual.value_counts(), expected.value_counts())
    same_items(arrow_with_na.value_counts(dropna=True),
               expected_with_na.value_counts(dropna=True))

    # test all any
    self.assertEqual(actual.all(), expected.all())
    self.assertEqual(actual.any(), expected.any())

    # test arithmetic
    same_items(actual + 's', expected + 's')
    same_items((actual + arrow_with_na).fillna('ss'),
               (expected + expected_with_na).fillna('ss'))

    # test comparison
    np.testing.assert_array_equal(actual < 's', expected < 's')
    pd.testing.assert_series_equal(
        pd.Series(actual < arrow_with_na),
        pd.Series(expected < expected_with_na))

    # test repr
    self.assertIn('ArrowStringArray', repr(actual))

    # test concat empty
    empty = ArrowStringArray(pa.chunked_array([], type=pa.string()))
    joined = ArrowStringArray._concat_same_type([empty, empty])
    self.assertEqual(len(joined._arrow_array.chunks), 1)
    pd.testing.assert_series_equal(pd.Series(empty), pd.Series(joined))