def testArrowDtype(self):
    """Exercise the Arrow extension dtypes: building, parsing, repr and comparison."""
    # Building values through the dtype must give the same items as the
    # array constructor does.
    arrow_strings = pa.array(['a', 'b'])
    self.assertEqual(
        list(ArrowStringDtype().__from_arrow__(arrow_strings)),
        list(ArrowStringArray(arrow_strings)))

    # String aliases are parsed back into the matching dtype objects.
    self.assertEqual(
        ArrowStringDtype(),
        ArrowStringDtype.construct_from_string('Arrow[string]'))
    self.assertEqual(
        ArrowListDtype(ArrowListDtype('string')),
        ArrowListDtype.construct_from_string('Arrow[List[string]]'))

    int8_list_dtype = ArrowListDtype(np.int8)
    self.assertEqual(repr(int8_list_dtype), 'Arrow[List[int8]]')

    # A string alias is not a valid list dtype.
    with self.assertRaises(TypeError):
        ArrowListDtype.construct_from_string('Arrow[string]')

    # is_dtype only accepts list aliases carrying the ``Arrow[...]`` wrapper.
    is_dtype_cases = [
        ('Arrow[List[uint8]]', True),
        ('List[int8]', False),
        (ArrowStringDtype(), False),
    ]
    for candidate, expected in is_dtype_cases:
        check = self.assertTrue if expected else self.assertFalse
        check(ArrowListDtype.is_dtype(candidate))

    self.assertNotEqual(int8_list_dtype, ArrowStringDtype())
    self.assertEqual(int8_list_dtype.kind, np.dtype(np.int8).kind)
    self.assertEqual(int8_list_dtype.arrow_type, pa.list_(pa.int8()))
def test_arrow_dtype():
    """Exercise the Arrow extension dtypes: building, parsing, repr and comparison."""
    # Building values through the dtype must give the same items as the
    # array constructor does.
    arrow_strings = pa.array(['a', 'b'])
    built_by_dtype = list(ArrowStringDtype().__from_arrow__(arrow_strings))
    built_by_ctor = list(ArrowStringArray(arrow_strings))
    assert built_by_dtype == built_by_ctor

    # String aliases are parsed back into the matching dtype objects.
    assert ArrowStringDtype() == \
        ArrowStringDtype.construct_from_string('Arrow[string]')
    assert ArrowListDtype(ArrowListDtype('string')) == \
        ArrowListDtype.construct_from_string('Arrow[List[string]]')

    int8_list_dtype = ArrowListDtype(np.int8)
    assert repr(int8_list_dtype) == 'Arrow[List[int8]]'

    # A string alias is not a valid list dtype.
    with pytest.raises(TypeError):
        ArrowListDtype.construct_from_string('Arrow[string]')

    # is_dtype must return real booleans, hence the identity comparison.
    is_dtype_cases = [
        ('Arrow[List[uint8]]', True),
        ('List[int8]', False),
        (ArrowStringDtype(), False),
    ]
    for candidate, expected in is_dtype_cases:
        assert ArrowListDtype.is_dtype(candidate) is expected

    assert int8_list_dtype != ArrowStringDtype()
    assert int8_list_dtype.kind == np.dtype(object).kind
    assert int8_list_dtype.arrow_type == pa.list_(pa.int8())
def testToPandas(self):
    """Convert a multi-batch Arrow table to pandas and check the result.

    Covers the dtype chosen for the string column, the memory footprint
    compared with the source frame, a serialization round trip, and a few
    DataFrame / Series operations on the converted frame.
    """
    rs = np.random.RandomState(0)
    df = pd.DataFrame({'a': rs.rand(100),
                       'b': ['s' + str(i) for i in rs.randint(100, size=100)]})

    # Derive every batch from ``batch_size`` so the whole frame is covered
    # regardless of the value chosen (the divisor used to be a literal 15).
    batch_size = 15
    batches = [pa.RecordBatch.from_pandas(df[start:start + batch_size])
               for start in range(0, len(df), batch_size)]
    table = pa.Table.from_batches(batches)

    df2 = arrow_table_to_pandas_dataframe(table)
    self.assertEqual(df2.dtypes.iloc[1], ArrowStringDtype())
    self.assertLess(df2.memory_usage(deep=True).sum(),
                    df.memory_usage(deep=True).sum())

    # test serialize
    df3 = dataserializer.loads(dataserializer.dumps(df2))
    # Check the deserialized frame, not the one that was serialized.
    self.assertEqual(df3.dtypes.iloc[1], ArrowStringDtype())
    pd.testing.assert_frame_equal(df3, df2)

    # test df method
    df4 = df2.groupby('b').sum()
    expected = df.groupby('b').sum()
    pd.testing.assert_frame_equal(df4, expected)

    s = ('s' + df2['b']).astype('string')
    expected = ('s' + df['b']).astype('string')
    pd.testing.assert_series_equal(s, expected)

    s2 = df2['b'].str[:2]
    expected = df['b'].astype('string').str[:2]
    pd.testing.assert_series_equal(s2, expected)
def test_arrow_string_sort_values(setup):
    """Sorting a chunked DataFrame by an Arrow string column matches pandas."""
    rng = np.random.RandomState(0)
    # Draw the float column first, then the labels, so the random stream is
    # consumed in a fixed order.
    floats = rng.rand(10)
    labels = [f's{rng.randint(1000)}' for _ in range(10)]
    source = pd.DataFrame({'a': floats, 'b': labels})
    source['b'] = source['b'].astype(ArrowStringDtype())

    sorted_frame = DataFrame(source, chunk_size=3).sort_values(by='b')

    pd.testing.assert_frame_equal(sorted_frame.execute().fetch(),
                                  source.sort_values(by='b'))
def testArrowStringSortValues(self):
    """Sorting a chunked DataFrame by an Arrow string column matches pandas."""
    rng = np.random.RandomState(0)
    # Draw the float column first, then the labels, so the random stream is
    # consumed in a fixed order.
    floats = rng.rand(10)
    labels = [f's{rng.randint(1000)}' for _ in range(10)]
    source = pd.DataFrame({'a': floats, 'b': labels})
    source['b'] = source['b'].astype(ArrowStringDtype())

    sorted_frame = DataFrame(source, chunk_size=3).sort_values(by='b')
    executed = self.executor.execute_dataframe(sorted_frame, concat=True)

    pd.testing.assert_frame_equal(executed[0], source.sort_values(by='b'))
def df_type_to_np_type(df_type, use_arrow_dtype=False):
    """Map a ``df`` type to the dtype used to represent it.

    Resolution order:

    1. ``types.string`` gives ``ArrowStringDtype()`` when ``use_arrow_dtype``
       is truthy, otherwise the numpy object dtype.
    2. Any type present in ``_df_to_np_types`` gives its mapped value.
    3. ``types.timestamp`` gives the nanosecond ``datetime64`` dtype.
    4. Anything else falls back to the numpy object dtype.

    :param df_type: type to translate
    :param use_arrow_dtype: whether strings should use the Arrow string dtype
    :return: the dtype chosen by the rules above
    """
    from ...df import types
    from ...df.backends.pd.types import _df_to_np_types

    if df_type == types.string:
        return ArrowStringDtype() if use_arrow_dtype else np.dtype('object')

    if df_type in _df_to_np_types:
        return _df_to_np_types[df_type]

    if df_type == types.timestamp:
        return np.datetime64(0, 'ns').dtype

    return np.dtype('object')
def test_to_pandas():
    """Convert a multi-batch Arrow table to pandas and check the result.

    Covers conversion with and without Arrow dtypes, the memory footprint
    compared with the source frame, and a few DataFrame / Series operations
    on the converted frame.
    """
    rs = np.random.RandomState(0)
    df = pd.DataFrame({
        'a': rs.rand(100),
        'b': ['s' + str(i) for i in rs.randint(100, size=100)],
        'c': [['ss0' + str(i), 'ss1' + str(i)]
              for i in rs.randint(100, size=100)],
    })

    # Derive every batch from ``batch_size`` so the whole frame is covered
    # regardless of the value chosen (the divisor used to be a literal 15).
    batch_size = 15
    batches = [
        pa.RecordBatch.from_pandas(df[start:start + batch_size])
        for start in range(0, len(df), batch_size)
    ]
    table = pa.Table.from_batches(batches)

    # Without Arrow dtypes, string and list columns are plain object columns.
    df1 = arrow_table_to_pandas_dataframe(table, use_arrow_dtype=False)
    assert df1.dtypes.iloc[1] == np.dtype('O')
    assert df1.dtypes.iloc[2] == np.dtype('O')

    # By default the Arrow extension dtypes are used.
    df2 = arrow_table_to_pandas_dataframe(table)
    assert df2.dtypes.iloc[1] == ArrowStringDtype()
    assert df2.dtypes.iloc[2] == ArrowListDtype(str)
    assert df2.memory_usage(deep=True).sum() < df.memory_usage(deep=True).sum()

    # test df method
    df4 = df2.groupby('b').sum()
    expected = df.groupby('b').sum()
    pd.testing.assert_frame_equal(df4, expected)

    s = ('s' + df2['b']).astype('string')
    expected = ('s' + df['b']).astype('string')
    pd.testing.assert_series_equal(s, expected)

    s2 = df2['b'].str[:2]
    expected = df['b'].astype('string').str[:2]
    pd.testing.assert_series_equal(s2, expected)
def test_arrow_string_array_functions():
    """Compare ``ArrowStringArray`` against ``pd.arrays.StringArray``.

    The pandas string array acts as the reference answer. Every check is run
    twice, with the ``dataframe.arrow_array.pandas_only`` option switched off
    and on.
    """
    raw_values = np.array(['abc', 'de', 'eee', '中文'], dtype=object)
    # leverage string array to get the right answer
    reference = pd.arrays.StringArray(raw_values)
    arrow_with_na = ArrowStringArray(['abc', None, 'eee', '中文'])
    reference_with_na = pd.arrays.StringArray(
        np.array(['abc', pd.NA, 'eee', '中文'], dtype=object))

    for pandas_only in [False, True]:
        with option_context({'dataframe.arrow_array.pandas_only': pandas_only}):
            arrow_arr = ArrowStringArray(raw_values)

            # getitem, scalar
            for position in (1, -1):
                assert arrow_arr[position] == reference[position]

            # getitem, slice
            for window in (slice(None, 2), slice(1, -1), slice(None, None, 2)):
                assert list(arrow_arr[window]) == list(reference[window])

            # getitem, boolean index
            mask = np.array([len(item) > 2 for item in raw_values])
            assert list(arrow_arr[mask]) == list(reference[mask])

            # getitem, fancy index
            fancy_indexes = (
                [3, 1, 2],
                [3, -1, 2, -4],
                np.array([3, -1, 2, -4]),
            )
            for picked in fancy_indexes:
                assert list(arrow_arr[picked]) == list(reference[picked])

            # setitem: both copies receive the same writes, in the same order
            arrow_copy = arrow_arr.copy()
            reference_copy = reference.copy()
            for key, value in ((0, 'ss'), (slice(1, 3), ['ss1', 'ss2'])):
                arrow_copy[key] = value
                reference_copy[key] = value
                assert list(arrow_copy) == list(reference_copy)

            arrow_copy[1:3] = arrow_copy[2:4]
            reference_copy[1:3] = reference_copy[2:4]
            assert list(arrow_copy) == list(reference_copy)

            arrow_copy[2:] = pd.Series(['ss3', 'ss4'])
            reference_copy[2:] = pd.Series(['ss3', 'ss4'])
            assert list(arrow_copy) == list(reference_copy)

            with pytest.raises(ValueError):
                arrow_copy[0] = ['a', 'b']

            arrow_copy[-1] = None
            reference_copy[-1] = None
            # the last item is a missing value, so it is checked with isna
            assert list(arrow_copy)[:-1] == list(reference_copy)[:-1]
            assert pd.isna(list(arrow_copy)[-1]) is True

            with pytest.raises(ValueError):
                arrow_copy[0] = 2

            with pytest.raises(ValueError):
                arrow_copy[:2] = [1, 2]

            # test to_numpy
            np.testing.assert_array_equal(arrow_arr.to_numpy(),
                                          reference.to_numpy())
            np.testing.assert_array_equal(arrow_arr.to_numpy(copy=True),
                                          reference.to_numpy(copy=True))
            np.testing.assert_array_equal(
                arrow_with_na.to_numpy(copy=True, na_value='ss'),
                reference_with_na.to_numpy(copy=True, na_value='ss'))

            # test fillna
            arrow_filled = arrow_with_na.fillna('filled')
            reference_filled = reference_with_na.fillna('filled')
            assert list(arrow_filled) == list(reference_filled)

            # test astype
            arrow_digits = ArrowStringArray(['1', '10', '100'])
            # leverage string array to get the right answer
            reference_digits = pd.arrays.StringArray(
                np.array(['1', '10', '100'], dtype=object))
            np.testing.assert_array_equal(arrow_digits.astype(np.int64),
                                          reference_digits.astype(np.int64))
            np.testing.assert_almost_equal(arrow_digits.astype(float),
                                           reference_digits.astype(float))
            for copy_flag in (False, True):
                arrow_cast = arrow_digits.astype(
                    ArrowStringDtype(), copy=copy_flag)
                reference_cast = reference_digits.astype(
                    pd.StringDtype(), copy=copy_flag)
                assert list(arrow_cast) == list(reference_cast)

            # test factorize
            arrow_codes, arrow_uniques = arrow_arr.factorize()
            reference_codes, reference_uniques = reference.factorize()
            assert list(arrow_codes) == list(reference_codes)
            assert list(arrow_uniques) == list(reference_uniques)

            # test nbytes
            object_series = pd.Series(reference.astype(object))
            assert arrow_arr.nbytes < object_series.memory_usage(
                deep=True, index=False)

            # test memory_usage
            if pandas_only:
                assert arrow_arr.memory_usage(deep=False) == \
                    pd.Series(reference).memory_usage(index=False)
            else:
                assert arrow_arr.memory_usage(deep=True) == arrow_arr.nbytes

            # test isna
            np.testing.assert_array_equal(arrow_with_na.isna(),
                                          reference_with_na.isna())
            forced_pandas = arrow_with_na.copy()
            forced_pandas._force_use_pandas = True
            np.testing.assert_array_equal(forced_pandas.isna(),
                                          reference_with_na.isna())

            # test take
            assert list(arrow_arr.take([1, 2, -1])) == \
                list(reference.take([1, 2, -1]))
            assert list(arrow_arr.take([1, 2, -1], allow_fill=True).fillna('aa')) \
                == list(reference.take([1, 2, -1], allow_fill=True).fillna('aa'))
            assert list(arrow_arr.take([1, 2, -1], allow_fill=True, fill_value='aa')) \
                == list(reference.take([1, 2, -1], allow_fill=True, fill_value='aa'))

            # test shift
            assert list(arrow_arr.shift(2, fill_value='aa')) == \
                list(reference.shift(2, fill_value='aa'))

            # test value_counts
            assert list(arrow_arr.value_counts()) == \
                list(reference.value_counts())
            assert list(arrow_with_na.value_counts(dropna=True)) == \
                list(reference_with_na.value_counts(dropna=True))

            # test all any
            assert arrow_arr.all() == reference.all()
            assert arrow_arr.any() == reference.any()

            # test arithmetic
            assert list(arrow_arr + 's') == list(reference + 's')
            arrow_sum = (arrow_arr + arrow_with_na).fillna('ss')
            reference_sum = (reference + reference_with_na).fillna('ss')
            assert list(arrow_sum) == list(reference_sum)

            # test comparison
            np.testing.assert_array_equal(arrow_arr < 's', reference < 's')
            pd.testing.assert_series_equal(
                pd.Series(arrow_arr < arrow_with_na),
                pd.Series(reference < reference_with_na))

            # test repr
            assert 'ArrowStringArray' in repr(arrow_arr)

            # test concat empty
            empty_arrow = ArrowStringArray(
                pa.chunked_array([], type=pa.string()))
            concatenated = ArrowStringArray._concat_same_type(
                [empty_arrow, empty_arrow])
            if not pandas_only:
                assert len(concatenated._arrow_array.chunks) == 1
            pd.testing.assert_series_equal(pd.Series(empty_arrow),
                                           pd.Series(concatenated))
def testDataSerialize(self):
    """Round-trip many kinds of objects through ``dataserializer``.

    Covers numpy arrays under every serial / compress type combination,
    structured arrays, file based dump / load, sparse containers, wrapped
    groupby objects, categoricals, interval indexes, complex scalars, a
    picklable class, negatively strided arrays, Arrow backed frames and
    sparse pandas dtypes.
    """

    def roundtrip(obj, **dump_options):
        # serialize to bytes and immediately load the result back
        return dataserializer.loads(dataserializer.dumps(obj, **dump_options))

    # no explicit compression, then each compression explicitly
    compress_variants = (
        {},
        {'compress': dataserializer.CompressType.LZ4},
        {'compress': dataserializer.CompressType.GZIP},
    )

    serial_types = (None,) + tuple(dataserializer.SerialType.__members__.values())
    compress_types = (None,) + tuple(dataserializer.CompressType.__members__.values())
    for type_, compress in itertools.product(serial_types, compress_types):
        options = dict(serial_type=type_, compress=compress)

        array = np.random.rand(1000, 100)
        assert_array_equal(array, roundtrip(array, **options))

        array = np.random.rand(1000, 100)
        stream = BytesIO(dataserializer.dumps(array, **options))
        assert_array_equal(array, dataserializer.load(stream))

        array = np.random.rand(1000, 100).T  # test non c-contiguous
        assert_array_equal(array, roundtrip(array, **options))

        array = np.float64(0.2345)
        assert_array_equal(array, roundtrip(array, **options))

    # test non-serializable object
    if pyarrow:
        non_serial = type('non_serial', (object,), dict(nbytes=10))
        with self.assertRaises(SerializationFailed):
            dataserializer.dumps(non_serial())

    # test structured arrays.
    rec_dtype = np.dtype([('a', 'int64'), ('b', 'double'), ('c', '<U8')])
    array = np.ones((100,), dtype=rec_dtype)
    array_loaded = roundtrip(array)
    self.assertEqual(array.dtype, array_loaded.dtype)
    assert_array_equal(array, array_loaded)

    # test dump / load through a real file
    fn = os.path.join(tempfile.gettempdir(), f'test_dump_file_{id(self)}.bin')
    try:
        array = np.random.rand(1000, 100).T  # test non c-contiguous
        for dump_options in compress_variants:
            with open(fn, 'wb') as dump_file:
                dataserializer.dump(array, dump_file, **dump_options)
            with open(fn, 'rb') as dump_file:
                assert_array_equal(array, dataserializer.load(dump_file))
    finally:
        if os.path.exists(fn):
            os.unlink(fn)

    # test sparse
    if sps:
        mat = sparse.SparseMatrix(sps.random(100, 100, 0.1, format='csr'))
        for dump_options in compress_variants:
            des_mat = roundtrip(mat, **dump_options)
            self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)

        vector = sparse.SparseVector(sps.csr_matrix(np.random.rand(2)), shape=(2,))
        for dump_options in compress_variants:
            des_vector = roundtrip(vector, **dump_options)
            self.assertTrue((vector.spmatrix != des_vector.spmatrix).nnz == 0)

    # test groupby
    def check_groupby(grouped):
        restored = roundtrip(grouped)
        assert_groupby_equal(grouped, restored.groupby_obj)

    df1 = pd.DataFrame({'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
                        'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
                        'c': list('aabaaddce')})
    check_groupby(wrapped_groupby(df1, 'b'))
    check_groupby(wrapped_groupby(df1, 'b').c)

    grouped = wrapped_groupby(df1, 'b')
    getattr(grouped, 'indices')  # access ``indices`` before serializing
    check_groupby(grouped)

    check_groupby(wrapped_groupby(df1.b, lambda x: x % 2))

    grouped = wrapped_groupby(df1.b, lambda x: x % 2)
    getattr(grouped, 'indices')  # access ``indices`` before serializing
    check_groupby(grouped)

    # test categorical
    samples = np.random.RandomState(0).random(10)
    cat = pd.cut(samples, [0.3, 0.5, 0.8])
    self.assertIsInstance(cat, pd.Categorical)
    des_cat = roundtrip(cat)
    self.assertEqual(len(cat), len(des_cat))
    for original_item, restored_item in zip(cat, des_cat):
        np.testing.assert_equal(original_item, restored_item)

    # test IntervalIndex
    intervals = pd.interval_range(10, 100, 3)
    pd.testing.assert_index_equal(intervals, roundtrip(intervals))

    # test complex
    for scalar in (complex(10 + 5j), np.complex64(10 + 5j)):
        restored_scalar = roundtrip(scalar)
        self.assertIs(type(scalar), type(restored_scalar))
        self.assertEqual(scalar, restored_scalar)

    # test pickle
    d = ClassToPickle(dict(a=1, b='uvw'))
    dest_d = roundtrip(d)
    self.assertIs(type(d), type(dest_d))
    self.assertEqual(d.a, dest_d.a)

    # test ndarray with negative strides
    reversed_view = np.zeros((5, 6, 3))[:, :, ::-1]
    np.testing.assert_array_equal(reversed_view, roundtrip(reversed_view))

    # test ArrowArray
    df = pd.DataFrame({'a': ['s1', 's2', 's3'],
                       'b': [['s1', 's2'], ['s3'], ['s4', 's5']]})
    df['a'] = df['a'].astype(ArrowStringDtype())
    df['b'] = df['b'].astype(ArrowListDtype(str))
    dest_df = roundtrip(df)
    self.assertIs(type(df), type(dest_df))
    pd.testing.assert_frame_equal(df, dest_df)

    # test DataFrame with SparseDtype
    sparse_series = pd.Series([1, 2, np.nan, np.nan, 3]).astype(
        pd.SparseDtype(np.dtype(np.float64), np.nan))
    pd.testing.assert_series_equal(sparse_series, roundtrip(sparse_series))
    df = pd.DataFrame({'s': sparse_series})
    pd.testing.assert_frame_equal(df, roundtrip(df))
def testArrowStringArrayFunctions(self):
    """Compare ``ArrowStringArray`` against ``pd.arrays.StringArray``.

    The pandas string array acts as the reference answer for indexing,
    assignment, conversion, missing-value handling, reductions, operators
    and concatenation.
    """
    raw_values = np.array(['abc', 'de', 'eee', '中文'], dtype=object)
    arrow_arr = ArrowStringArray(raw_values)
    # leverage string array to get the right answer
    reference = pd.arrays.StringArray(raw_values)
    arrow_with_na = ArrowStringArray(['abc', None, 'eee', '中文'])
    reference_with_na = pd.arrays.StringArray(
        np.array(['abc', pd.NA, 'eee', '中文'], dtype=object))

    # getitem, scalar
    for position in (1, -1):
        self.assertEqual(arrow_arr[position], reference[position])

    # getitem, slice
    for window in (slice(None, 2), slice(1, -1), slice(None, None, 2)):
        self.assertListEqual(list(arrow_arr[window]), list(reference[window]))

    # getitem, boolean index
    mask = np.array([len(item) > 2 for item in raw_values])
    self.assertListEqual(list(arrow_arr[mask]), list(reference[mask]))

    # getitem, fancy index
    fancy_indexes = (
        [3, 1, 2],
        [3, -1, 2, -4],
        np.array([3, -1, 2, -4]),
    )
    for picked in fancy_indexes:
        self.assertListEqual(list(arrow_arr[picked]), list(reference[picked]))

    # setitem: both copies receive the same writes, in the same order
    arrow_copy = arrow_arr.copy()
    reference_copy = reference.copy()
    for key, value in ((0, 'ss'), (slice(1, 3), ['ss1', 'ss2'])):
        arrow_copy[key] = value
        reference_copy[key] = value
        self.assertListEqual(list(arrow_copy), list(reference_copy))

    arrow_copy[1:3] = arrow_copy[2:4]
    reference_copy[1:3] = reference_copy[2:4]
    self.assertListEqual(list(arrow_copy), list(reference_copy))

    arrow_copy[2:] = pd.Series(['ss3', 'ss4'])
    reference_copy[2:] = pd.Series(['ss3', 'ss4'])
    self.assertListEqual(list(arrow_copy), list(reference_copy))

    with self.assertRaises(ValueError):
        arrow_copy[0] = ['a', 'b']

    arrow_copy[-1] = None
    reference_copy[-1] = None
    # the last item is a missing value, so it is checked with isna
    self.assertListEqual(list(arrow_copy)[:-1], list(reference_copy)[:-1])
    self.assertTrue(pd.isna(list(arrow_copy)[-1]))

    with self.assertRaises(ValueError):
        arrow_copy[0] = 2

    with self.assertRaises(ValueError):
        arrow_copy[:2] = [1, 2]

    # test to_numpy
    np.testing.assert_array_equal(arrow_arr.to_numpy(), reference.to_numpy())
    np.testing.assert_array_equal(arrow_arr.to_numpy(copy=True),
                                  reference.to_numpy(copy=True))
    np.testing.assert_array_equal(
        arrow_with_na.to_numpy(copy=True, na_value='ss'),
        reference_with_na.to_numpy(copy=True, na_value='ss'))

    # test fillna
    arrow_filled = arrow_with_na.fillna('filled')
    reference_filled = reference_with_na.fillna('filled')
    self.assertListEqual(list(arrow_filled), list(reference_filled))

    # test astype
    arrow_digits = ArrowStringArray(['1', '10', '100'])
    # leverage string array to get the right answer
    reference_digits = pd.arrays.StringArray(
        np.array(['1', '10', '100'], dtype=object))
    np.testing.assert_array_equal(arrow_digits.astype(np.int64),
                                  reference_digits.astype(np.int64))
    np.testing.assert_almost_equal(arrow_digits.astype(float),
                                   reference_digits.astype(float))
    for copy_flag in (False, True):
        self.assertListEqual(
            list(arrow_digits.astype(ArrowStringDtype(), copy=copy_flag)),
            list(reference_digits.astype(pd.StringDtype(), copy=copy_flag)))

    # test factorize
    arrow_codes, arrow_uniques = arrow_arr.factorize()
    reference_codes, reference_uniques = reference.factorize()
    self.assertListEqual(list(arrow_codes), list(reference_codes))
    self.assertListEqual(list(arrow_uniques), list(reference_uniques))

    # test nbytes
    self.assertLess(arrow_arr.nbytes,
                    pd.Series(reference).memory_usage(deep=True))

    # test memory_usage
    self.assertEqual(arrow_arr.memory_usage(deep=True), arrow_arr.nbytes)

    # test isna
    np.testing.assert_array_equal(arrow_with_na.isna(),
                                  reference_with_na.isna())
    forced_pandas = arrow_with_na.copy()
    forced_pandas._force_use_pandas = True
    np.testing.assert_array_equal(forced_pandas.isna(),
                                  reference_with_na.isna())

    # test take
    self.assertListEqual(list(arrow_arr.take([1, 2, -1])),
                         list(reference.take([1, 2, -1])))
    self.assertListEqual(
        list(arrow_arr.take([1, 2, -1], allow_fill=True).fillna('aa')),
        list(reference.take([1, 2, -1], allow_fill=True).fillna('aa')))
    self.assertListEqual(
        list(arrow_arr.take([1, 2, -1], allow_fill=True, fill_value='aa')),
        list(reference.take([1, 2, -1], allow_fill=True, fill_value='aa')))

    # test shift
    self.assertListEqual(list(arrow_arr.shift(2, fill_value='aa')),
                         list(reference.shift(2, fill_value='aa')))

    # test value_counts
    self.assertListEqual(list(arrow_arr.value_counts()),
                         list(reference.value_counts()))
    self.assertListEqual(
        list(arrow_with_na.value_counts(dropna=True)),
        list(reference_with_na.value_counts(dropna=True)))

    # test all any
    self.assertEqual(arrow_arr.all(), reference.all())
    self.assertEqual(arrow_arr.any(), reference.any())

    # test arithmetic
    self.assertListEqual(list(arrow_arr + 's'), list(reference + 's'))
    self.assertListEqual(
        list((arrow_arr + arrow_with_na).fillna('ss')),
        list((reference + reference_with_na).fillna('ss')))

    # test comparison
    np.testing.assert_array_equal(arrow_arr < 's', reference < 's')
    pd.testing.assert_series_equal(
        pd.Series(arrow_arr < arrow_with_na),
        pd.Series(reference < reference_with_na))

    # test repr
    self.assertIn('ArrowStringArray', repr(arrow_arr))

    # test concat empty
    empty_arrow = ArrowStringArray(pa.chunked_array([], type=pa.string()))
    concatenated = ArrowStringArray._concat_same_type(
        [empty_arrow, empty_arrow])
    self.assertEqual(len(concatenated._arrow_array.chunks), 1)
    pd.testing.assert_series_equal(pd.Series(empty_arrow),
                                   pd.Series(concatenated))