예제 #1
0
    def testArrowDtype(self):
        """Exercise construction, string parsing, repr and comparison of Arrow dtypes."""
        arrow_strings = pa.array(['a', 'b'])
        via_dtype = list(ArrowStringDtype().__from_arrow__(arrow_strings))
        via_array = list(ArrowStringArray(arrow_strings))
        self.assertEqual(via_dtype, via_array)

        # a dtype name parses back into an equal dtype instance
        string_dtype = ArrowStringDtype()
        parsed_string = ArrowStringDtype.construct_from_string('Arrow[string]')
        self.assertEqual(string_dtype, parsed_string)

        # nested list dtypes are parsed as well
        nested_dtype = ArrowListDtype(ArrowListDtype('string'))
        parsed_nested = ArrowListDtype.construct_from_string(
            'Arrow[List[string]]')
        self.assertEqual(nested_dtype, parsed_nested)

        int8_list = ArrowListDtype(np.int8)
        self.assertEqual(repr(int8_list), 'Arrow[List[int8]]')

        # a plain string dtype name is rejected by the list dtype parser
        self.assertRaises(
            TypeError, ArrowListDtype.construct_from_string, 'Arrow[string]')

        self.assertTrue(ArrowListDtype.is_dtype('Arrow[List[uint8]]'))
        for not_a_list_dtype in ('List[int8]', ArrowStringDtype()):
            self.assertFalse(ArrowListDtype.is_dtype(not_a_list_dtype))

        self.assertNotEqual(ArrowListDtype(np.int8), ArrowStringDtype())
        expected_kind = np.dtype(np.int8).kind
        self.assertEqual(ArrowListDtype(np.int8).kind, expected_kind)

        expected_arrow_type = pa.list_(pa.int8())
        self.assertEqual(ArrowListDtype(np.int8).arrow_type,
                         expected_arrow_type)
예제 #2
0
def test_arrow_dtype():
    """Exercise construction, string parsing, repr and comparison of Arrow dtypes."""
    arrow_strings = pa.array(['a', 'b'])
    via_dtype = list(ArrowStringDtype().__from_arrow__(arrow_strings))
    via_array = list(ArrowStringArray(arrow_strings))
    assert via_dtype == via_array

    # a dtype name parses back into an equal dtype instance
    string_dtype = ArrowStringDtype()
    parsed_string = ArrowStringDtype.construct_from_string('Arrow[string]')
    assert string_dtype == parsed_string

    # nested list dtypes are parsed as well
    nested_dtype = ArrowListDtype(ArrowListDtype('string'))
    parsed_nested = ArrowListDtype.construct_from_string(
        'Arrow[List[string]]')
    assert nested_dtype == parsed_nested

    int8_list = ArrowListDtype(np.int8)
    assert repr(int8_list) == 'Arrow[List[int8]]'

    # a plain string dtype name is rejected by the list dtype parser
    with pytest.raises(TypeError):
        ArrowListDtype.construct_from_string('Arrow[string]')

    assert ArrowListDtype.is_dtype('Arrow[List[uint8]]') is True
    for not_a_list_dtype in ('List[int8]', ArrowStringDtype()):
        assert ArrowListDtype.is_dtype(not_a_list_dtype) is False

    assert ArrowListDtype(np.int8) != ArrowStringDtype()
    expected_kind = np.dtype(object).kind
    assert ArrowListDtype(np.int8).kind == expected_kind

    expected_arrow_type = pa.list_(pa.int8())
    assert ArrowListDtype(np.int8).arrow_type == expected_arrow_type
예제 #3
0
    def testToPandas(self):
        """Convert an Arrow table to pandas and verify the Arrow-backed frame.

        Checks, in order: the dtype of the string column, that deep memory
        usage is lower than the plain pandas frame, that a serialization
        round trip preserves the frame (including the Arrow string dtype),
        and that groupby, string concatenation and ``.str`` slicing give the
        same results as on the plain pandas frame.
        """
        rs = np.random.RandomState(0)
        df = pd.DataFrame({'a': rs.rand(100),
                           'b': ['s' + str(i) for i in rs.randint(100, size=100)]})

        # build the table from several record batches; stepping by
        # batch_size keeps the slicing tied to a single constant
        batch_size = 15
        batches = [pa.RecordBatch.from_pandas(df[start: start + batch_size])
                   for start in range(0, len(df), batch_size)]
        table = pa.Table.from_batches(batches)

        df2 = arrow_table_to_pandas_dataframe(table)
        self.assertEqual(df2.dtypes.iloc[1], ArrowStringDtype())
        self.assertLess(df2.memory_usage(deep=True).sum(),
                        df.memory_usage(deep=True).sum())

        # test serialize: the dtype check must look at the deserialized
        # frame (df3), not at the frame that was serialized
        df3 = dataserializer.loads(dataserializer.dumps(df2))
        self.assertEqual(df3.dtypes.iloc[1], ArrowStringDtype())
        pd.testing.assert_frame_equal(df3, df2)

        # test df method
        df4 = df2.groupby('b').sum()
        expected = df.groupby('b').sum()
        pd.testing.assert_frame_equal(df4, expected)

        s = ('s' + df2['b']).astype('string')
        expected = ('s' + df['b']).astype('string')
        pd.testing.assert_series_equal(s, expected)

        s2 = df2['b'].str[:2]
        expected = df['b'].astype('string').str[:2]
        pd.testing.assert_series_equal(s2, expected)
예제 #4
0
def test_arrow_string_sort_values(setup):
    """Sorting a chunked DataFrame by an Arrow string column matches pandas."""
    rs = np.random.RandomState(0)
    # draw the floats first, then the labels, so the random stream is
    # consumed in a fixed order
    floats = rs.rand(10)
    labels = [f's{rs.randint(1000)}' for _ in range(10)]
    raw = pd.DataFrame({'a': floats, 'b': labels})
    raw['b'] = raw['b'].astype(ArrowStringDtype())

    mdf = DataFrame(raw, chunk_size=3)
    sorted_df = mdf.sort_values(by='b')

    actual = sorted_df.execute().fetch()
    pd.testing.assert_frame_equal(actual, raw.sort_values(by='b'))
예제 #5
0
    def testArrowStringSortValues(self):
        """Sorting a chunked DataFrame by an Arrow string column matches pandas."""
        rs = np.random.RandomState(0)
        # draw the floats first, then the labels, so the random stream is
        # consumed in a fixed order
        floats = rs.rand(10)
        labels = [f's{rs.randint(1000)}' for _ in range(10)]
        raw = pd.DataFrame({'a': floats, 'b': labels})
        raw['b'] = raw['b'].astype(ArrowStringDtype())

        mdf = DataFrame(raw, chunk_size=3)
        sorted_df = mdf.sort_values(by='b')

        executed = self.executor.execute_dataframe(sorted_df, concat=True)
        actual = executed[0]
        pd.testing.assert_frame_equal(actual, raw.sort_values(by='b'))
def df_type_to_np_type(df_type, use_arrow_dtype=False):
    """Map a ``df`` column type to the dtype used on the pandas side.

    :param df_type: a type object from the ``df.types`` module
    :param use_arrow_dtype: when true, string columns map to
        ``ArrowStringDtype()`` instead of the numpy object dtype
    :return: ``ArrowStringDtype()`` or a numpy dtype; types that match no
        rule fall back to the object dtype
    """
    from ...df import types
    from ...df.backends.pd.types import _df_to_np_types

    object_dtype = np.dtype('object')

    # strings are decided first, before the generic mapping is consulted
    if df_type == types.string:
        return ArrowStringDtype() if use_arrow_dtype else object_dtype
    if df_type in _df_to_np_types:
        return _df_to_np_types[df_type]
    if df_type == types.timestamp:
        return np.dtype('datetime64[ns]')
    return object_dtype
예제 #7
0
def test_to_pandas():
    """Convert an Arrow table to pandas with and without Arrow dtypes.

    With ``use_arrow_dtype=False`` the string and list columns must come
    back as object dtype. With the default call they must come back as
    ``ArrowStringDtype`` / ``ArrowListDtype(str)`` and use less deep memory
    than the plain pandas frame. Groupby, string concatenation and ``.str``
    slicing on the Arrow-backed frame must match plain pandas.
    """
    rs = np.random.RandomState(0)
    df = pd.DataFrame({
        'a':
        rs.rand(100),
        'b': ['s' + str(i) for i in rs.randint(100, size=100)],
        'c':
        [['ss0' + str(i), 'ss1' + str(i)] for i in rs.randint(100, size=100)]
    })

    # build the table from several record batches; stepping by batch_size
    # keeps the slicing tied to a single constant instead of a repeated
    # literal
    batch_size = 15
    batches = [
        pa.RecordBatch.from_pandas(df[start:start + batch_size])
        for start in range(0, len(df), batch_size)
    ]
    table = pa.Table.from_batches(batches)

    df1 = arrow_table_to_pandas_dataframe(table, use_arrow_dtype=False)
    assert df1.dtypes.iloc[1] == np.dtype('O')
    assert df1.dtypes.iloc[2] == np.dtype('O')

    df2 = arrow_table_to_pandas_dataframe(table)
    assert df2.dtypes.iloc[1] == ArrowStringDtype()
    assert df2.dtypes.iloc[2] == ArrowListDtype(str)
    assert df2.memory_usage(deep=True).sum() < df.memory_usage(deep=True).sum()

    # test df method
    df4 = df2.groupby('b').sum()
    expected = df.groupby('b').sum()
    pd.testing.assert_frame_equal(df4, expected)

    s = ('s' + df2['b']).astype('string')
    expected = ('s' + df['b']).astype('string')
    pd.testing.assert_series_equal(s, expected)

    s2 = df2['b'].str[:2]
    expected = df['b'].astype('string').str[:2]
    pd.testing.assert_series_equal(s2, expected)
예제 #8
0
def test_arrow_string_array_functions():
    """Compare ``ArrowStringArray`` against ``pd.arrays.StringArray``.

    Every operation is applied to both array types and the results are
    compared, so the pandas string array serves as the reference answer.
    The whole sequence runs twice, once for each value of the
    ``dataframe.arrow_array.pandas_only`` option.
    """
    lst = np.array(['abc', 'de', 'eee', '中文'], dtype=object)
    # leverage string array to get the right answer
    string_array = pd.arrays.StringArray(lst)
    # NOTE: the arrays holding a missing value are built once, outside the
    # option context below, and are shared by both loop iterations
    has_na_arrow_array = ArrowStringArray(['abc', None, 'eee', '中文'])
    has_na_string_array = pd.arrays.StringArray(
        np.array(['abc', pd.NA, 'eee', '中文'], dtype=object))

    for pandas_only in [False, True]:
        # presumably the option selects a pandas-only fallback inside the
        # arrow array implementation -- confirm against the option's owner
        with option_context({'dataframe.arrow_array.pandas_only':
                             pandas_only}):
            arrow_array = ArrowStringArray(lst)

            # getitem, scalar
            assert arrow_array[1] == string_array[1]
            assert arrow_array[-1] == string_array[-1]
            # getitem, slice
            assert list(arrow_array[:2]) == list(string_array[:2])
            assert list(arrow_array[1:-1]) == list(string_array[1:-1])
            assert list(arrow_array[::2]) == list(string_array[::2])
            # getitem, boolean index
            cond = np.array([len(c) > 2 for c in lst])
            assert list(arrow_array[cond]) == list(string_array[cond])
            # getitem, fancy index
            selection = [3, 1, 2]
            assert list(arrow_array[selection]) == list(
                string_array[selection])
            selection = [3, -1, 2, -4]
            assert list(arrow_array[selection]) == list(
                string_array[selection])
            selection = np.array([3, -1, 2, -4])
            assert list(arrow_array[selection]) == list(
                string_array[selection])

            # setitem: work on copies so the shared arrays stay untouched;
            # the assignments below build on each other, so order matters
            arrow_array2 = arrow_array.copy()
            string_array2 = string_array.copy()
            arrow_array2[0] = 'ss'
            string_array2[0] = 'ss'
            assert list(arrow_array2) == list(string_array2)
            arrow_array2[1:3] = ['ss1', 'ss2']
            string_array2[1:3] = ['ss1', 'ss2']
            assert list(arrow_array2) == list(string_array2)
            arrow_array2[1:3] = arrow_array2[2:4]
            string_array2[1:3] = string_array2[2:4]
            assert list(arrow_array2) == list(string_array2)
            arrow_array2[2:] = pd.Series(['ss3', 'ss4'])
            string_array2[2:] = pd.Series(['ss3', 'ss4'])
            assert list(arrow_array2) == list(string_array2)
            # a list cannot be assigned to a scalar position
            with pytest.raises(ValueError):
                arrow_array2[0] = ['a', 'b']
            arrow_array2[-1] = None
            string_array2[-1] = None
            # the last element is missing in both arrays, so it is checked
            # with isna instead of being compared for equality
            assert list(arrow_array2)[:-1] == list(string_array2)[:-1]
            assert pd.isna(list(arrow_array2)[-1]) is True
            # non-string values are rejected
            with pytest.raises(ValueError):
                arrow_array2[0] = 2
            with pytest.raises(ValueError):
                arrow_array2[:2] = [1, 2]

            # test to_numpy
            np.testing.assert_array_equal(arrow_array.to_numpy(),
                                          string_array.to_numpy())
            np.testing.assert_array_equal(arrow_array.to_numpy(copy=True),
                                          string_array.to_numpy(copy=True))
            np.testing.assert_array_equal(
                has_na_arrow_array.to_numpy(copy=True, na_value='ss'),
                has_na_string_array.to_numpy(copy=True, na_value='ss'))

            # test fillna
            arrow_array3 = has_na_arrow_array.fillna('filled')
            string_array3 = has_na_string_array.fillna('filled')
            assert list(arrow_array3) == list(string_array3)

            # test astype
            arrow_array4 = ArrowStringArray(['1', '10', '100'])
            # leverage string array to get the right answer
            string_array4 = pd.arrays.StringArray(
                np.array(['1', '10', '100'], dtype=object))
            np.testing.assert_array_equal(arrow_array4.astype(np.int64),
                                          string_array4.astype(np.int64))
            np.testing.assert_almost_equal(arrow_array4.astype(float),
                                           string_array4.astype(float))
            assert list(arrow_array4.astype(
                ArrowStringDtype(), copy=False)) == list(
                    string_array4.astype(pd.StringDtype(), copy=False))
            assert list(arrow_array4.astype(
                ArrowStringDtype(), copy=True)) == list(
                    string_array4.astype(pd.StringDtype(), copy=True))

            # test factorize
            codes, unique = arrow_array.factorize()
            codes2, unique2 = string_array.factorize()
            assert list(codes) == list(codes2)
            assert list(unique) == list(unique2)

            # test nbytes: compared with the deep memory usage of the same
            # strings held as Python objects, index excluded
            assert arrow_array.nbytes < pd.Series(
                string_array.astype(object)).memory_usage(deep=True,
                                                          index=False)

            # test memory_usage: the expected value depends on the option
            if pandas_only:
                assert arrow_array.memory_usage(
                    deep=False) == pd.Series(string_array).memory_usage(
                        index=False)
            else:
                assert arrow_array.memory_usage(
                    deep=True) == arrow_array.nbytes

            # test isna, first as-is and then with the private
            # _force_use_pandas flag set on a copy
            np.testing.assert_array_equal(has_na_arrow_array.isna(),
                                          has_na_string_array.isna())
            has_na_arrow_array2 = has_na_arrow_array.copy()
            has_na_arrow_array2._force_use_pandas = True
            np.testing.assert_array_equal(has_na_arrow_array2.isna(),
                                          has_na_string_array.isna())

            # test take
            assert list(arrow_array.take([1, 2, -1])) == list(
                string_array.take([1, 2, -1]))
            assert list(arrow_array.take([1, 2, -1], allow_fill=True).fillna('aa')) \
                   == list(string_array.take([1, 2, -1], allow_fill=True).fillna('aa'))
            assert list(arrow_array.take([1, 2, -1], allow_fill=True, fill_value='aa')) \
                   == list(string_array.take([1, 2, -1], allow_fill=True, fill_value='aa'))

            # test shift
            assert list(arrow_array.shift(2, fill_value='aa')) == list(
                string_array.shift(2, fill_value='aa'))

            # test value_counts
            assert list(arrow_array.value_counts()) == list(
                string_array.value_counts())
            assert list(has_na_arrow_array.value_counts(dropna=True)) == list(
                has_na_string_array.value_counts(dropna=True))

            # test all any
            assert arrow_array.all() == string_array.all()
            assert arrow_array.any() == string_array.any()

            # test arithmetic
            assert list(arrow_array + 's') == list(string_array + 's')
            assert list(
                (arrow_array + has_na_arrow_array).fillna('ss')) == list(
                    (string_array + has_na_string_array).fillna('ss'))

            # test comparison
            np.testing.assert_array_equal(arrow_array < 's',
                                          string_array < 's')
            pd.testing.assert_series_equal(
                pd.Series(arrow_array < has_na_arrow_array),
                pd.Series(string_array < has_na_string_array))

            # test repr
            assert 'ArrowStringArray' in repr(arrow_array)

            # test concat empty
            arrow_array5 = ArrowStringArray(
                pa.chunked_array([], type=pa.string()))
            concatenated = ArrowStringArray._concat_same_type(
                [arrow_array5, arrow_array5])
            # the chunk count is only inspected when pandas_only is off
            if not pandas_only:
                assert len(concatenated._arrow_array.chunks) == 1
            pd.testing.assert_series_equal(pd.Series(arrow_array5),
                                           pd.Series(concatenated))
예제 #9
0
    def testDataSerialize(self):
        """Round-trip many kinds of objects through ``dataserializer``.

        Covers numpy arrays and scalars under every serial type / compress
        type combination, a non-serializable object, structured arrays,
        file-based dump/load, sparse matrices and vectors, wrapped groupby
        objects, categoricals, interval indexes, complex numbers, a
        picklable custom class, arrays with negative strides, Arrow-backed
        frames and sparse pandas dtypes.
        """
        # every serial type and compress type, plus None for each
        for type_, compress in itertools.product(
                (None,) + tuple(dataserializer.SerialType.__members__.values()),
                (None,) + tuple(dataserializer.CompressType.__members__.values())):
            array = np.random.rand(1000, 100)
            assert_array_equal(array, dataserializer.loads(
                dataserializer.dumps(array, serial_type=type_, compress=compress)))

            # same round trip, but loading from a file-like object
            array = np.random.rand(1000, 100)
            assert_array_equal(array, dataserializer.load(
                BytesIO(dataserializer.dumps(array, serial_type=type_, compress=compress))))

            array = np.random.rand(1000, 100).T  # test non c-contiguous
            assert_array_equal(array, dataserializer.loads(
                dataserializer.dumps(array, serial_type=type_, compress=compress)))

            # numpy scalar
            array = np.float64(0.2345)
            assert_array_equal(array, dataserializer.loads(
                dataserializer.dumps(array, serial_type=type_, compress=compress)))

        # test non-serializable object
        # only checked when pyarrow is truthy (presumably None when the
        # package is not installed -- confirm at the import site)
        if pyarrow:
            non_serial = type('non_serial', (object,), dict(nbytes=10))
            with self.assertRaises(SerializationFailed):
                dataserializer.dumps(non_serial())

        # test structured arrays.
        rec_dtype = np.dtype([('a', 'int64'), ('b', 'double'), ('c', '<U8')])
        array = np.ones((100,), dtype=rec_dtype)
        array_loaded = dataserializer.loads(dataserializer.dumps(array))
        self.assertEqual(array.dtype, array_loaded.dtype)
        assert_array_equal(array, array_loaded)

        # file-based dump/load: default settings, then LZ4, then GZIP; the
        # temporary file is removed in the finally clause
        fn = os.path.join(tempfile.gettempdir(), f'test_dump_file_{id(self)}.bin')
        try:
            array = np.random.rand(1000, 100).T  # test non c-contiguous
            with open(fn, 'wb') as dump_file:
                dataserializer.dump(array, dump_file)
            with open(fn, 'rb') as dump_file:
                assert_array_equal(array, dataserializer.load(dump_file))

            with open(fn, 'wb') as dump_file:
                dataserializer.dump(array, dump_file,
                                    compress=dataserializer.CompressType.LZ4)
            with open(fn, 'rb') as dump_file:
                assert_array_equal(array, dataserializer.load(dump_file))

            with open(fn, 'wb') as dump_file:
                dataserializer.dump(array, dump_file,
                                    compress=dataserializer.CompressType.GZIP)
            with open(fn, 'rb') as dump_file:
                assert_array_equal(array, dataserializer.load(dump_file))
        finally:
            if os.path.exists(fn):
                os.unlink(fn)

        # test sparse
        # only checked when sps is truthy (presumably None when scipy.sparse
        # is not available -- confirm at the import site)
        if sps:
            mat = sparse.SparseMatrix(sps.random(100, 100, 0.1, format='csr'))
            des_mat = dataserializer.loads(dataserializer.dumps(mat))
            # equal matrices have no differing entries
            self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)

            des_mat = dataserializer.loads(dataserializer.dumps(
                mat, compress=dataserializer.CompressType.LZ4))
            self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)

            des_mat = dataserializer.loads(dataserializer.dumps(
                mat, compress=dataserializer.CompressType.GZIP))
            self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)

            vector = sparse.SparseVector(sps.csr_matrix(np.random.rand(2)), shape=(2,))
            des_vector = dataserializer.loads(dataserializer.dumps(vector))
            self.assertTrue((vector.spmatrix != des_vector.spmatrix).nnz == 0)

            des_vector = dataserializer.loads(dataserializer.dumps(
                vector, compress=dataserializer.CompressType.LZ4))
            self.assertTrue((vector.spmatrix != des_vector.spmatrix).nnz == 0)

            des_vector = dataserializer.loads(dataserializer.dumps(
                vector, compress=dataserializer.CompressType.GZIP))
            self.assertTrue((vector.spmatrix != des_vector.spmatrix).nnz == 0)

        # test groupby
        df1 = pd.DataFrame({'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
                            'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
                            'c': list('aabaaddce')})
        grouped = wrapped_groupby(df1, 'b')
        restored = dataserializer.loads(dataserializer.dumps(grouped))
        assert_groupby_equal(grouped, restored.groupby_obj)

        # column selected from a frame groupby
        grouped = wrapped_groupby(df1, 'b').c
        restored = dataserializer.loads(dataserializer.dumps(grouped))
        assert_groupby_equal(grouped, restored.groupby_obj)

        grouped = wrapped_groupby(df1, 'b')
        # `indices` is accessed and the result discarded before dumping --
        # presumably to serialize the object after that attribute has been
        # computed; confirm against wrapped_groupby
        getattr(grouped, 'indices')
        restored = dataserializer.loads(dataserializer.dumps(grouped))
        assert_groupby_equal(grouped, restored.groupby_obj)

        # series groupby keyed by a lambda
        grouped = wrapped_groupby(df1.b, lambda x: x % 2)
        restored = dataserializer.loads(dataserializer.dumps(grouped))
        assert_groupby_equal(grouped, restored.groupby_obj)

        grouped = wrapped_groupby(df1.b, lambda x: x % 2)
        getattr(grouped, 'indices')
        restored = dataserializer.loads(dataserializer.dumps(grouped))
        assert_groupby_equal(grouped, restored.groupby_obj)

        # test categorical
        s = np.random.RandomState(0).random(10)
        cat = pd.cut(s, [0.3, 0.5, 0.8])
        self.assertIsInstance(cat, pd.Categorical)
        des_cat = dataserializer.loads(dataserializer.dumps(cat))
        self.assertEqual(len(cat), len(des_cat))
        # compared element by element
        for c, dc in zip(cat, des_cat):
            np.testing.assert_equal(c, dc)

        # test IntervalIndex
        s = pd.interval_range(10, 100, 3)
        dest_s = dataserializer.loads((dataserializer.dumps(s)))
        pd.testing.assert_index_equal(s, dest_s)

        # test complex: both the exact type and the value must survive
        s = complex(10 + 5j)
        dest_s = dataserializer.loads((dataserializer.dumps(s)))
        self.assertIs(type(s), type(dest_s))
        self.assertEqual(s, dest_s)

        s = np.complex64(10 + 5j)
        dest_s = dataserializer.loads((dataserializer.dumps(s)))
        self.assertIs(type(s), type(dest_s))
        self.assertEqual(s, dest_s)

        # test pickle
        d = ClassToPickle(dict(a=1, b='uvw'))
        dest_d = dataserializer.loads((dataserializer.dumps(d)))
        self.assertIs(type(d), type(dest_d))
        self.assertEqual(d.a, dest_d.a)

        # test ndarray with negative strides
        arr = np.zeros((5, 6, 3))
        arr2 = arr[:, :, ::-1]
        dest_arr2 = dataserializer.loads(dataserializer.dumps(arr2))
        np.testing.assert_array_equal(arr2, dest_arr2)

        # test ArrowArray
        df = pd.DataFrame({'a': ['s1', 's2', 's3'],
                           'b': [['s1', 's2'], ['s3'], ['s4', 's5']]})
        df['a'] = df['a'].astype(ArrowStringDtype())
        df['b'] = df['b'].astype(ArrowListDtype(str))
        dest_df = dataserializer.loads(dataserializer.dumps(df))
        self.assertIs(type(df), type(dest_df))
        pd.testing.assert_frame_equal(df, dest_df)

        # test DataFrame with SparseDtype
        s = pd.Series([1, 2, np.nan, np.nan, 3]).astype(
            pd.SparseDtype(np.dtype(np.float64), np.nan))
        dest_s = dataserializer.loads((dataserializer.dumps(s)))
        pd.testing.assert_series_equal(s, dest_s)
        df = pd.DataFrame({'s': s})
        dest_df = dataserializer.loads((dataserializer.dumps(df)))
        pd.testing.assert_frame_equal(df, dest_df)
예제 #10
0
    def testArrowStringArrayFunctions(self):
        """Compare ``ArrowStringArray`` against ``pd.arrays.StringArray``.

        Every operation is applied to both array types and the results are
        compared, so the pandas string array serves as the reference answer.
        """
        lst = np.array(['abc', 'de', 'eee', '中文'], dtype=object)
        arrow_array = ArrowStringArray(lst)
        # leverage string array to get the right answer
        string_array = pd.arrays.StringArray(lst)
        # a second pair of arrays that holds one missing value
        has_na_arrow_array = ArrowStringArray(['abc', None, 'eee', '中文'])
        has_na_string_array = pd.arrays.StringArray(
            np.array(['abc', pd.NA, 'eee', '中文'], dtype=object))

        # getitem, scalar
        self.assertEqual(arrow_array[1], string_array[1])
        self.assertEqual(arrow_array[-1], string_array[-1])
        # getitem, slice
        self.assertListEqual(list(arrow_array[:2]), list(string_array[:2]))
        self.assertListEqual(list(arrow_array[1:-1]), list(string_array[1:-1]))
        self.assertListEqual(list(arrow_array[::2]), list(string_array[::2]))
        # getitem, boolean index
        cond = np.array([len(c) > 2 for c in lst])
        self.assertListEqual(list(arrow_array[cond]), list(string_array[cond]))
        # getitem, fancy index
        selection = [3, 1, 2]
        self.assertListEqual(list(arrow_array[selection]),
                             list(string_array[selection]))
        selection = [3, -1, 2, -4]
        self.assertListEqual(list(arrow_array[selection]),
                             list(string_array[selection]))
        selection = np.array([3, -1, 2, -4])
        self.assertListEqual(list(arrow_array[selection]),
                             list(string_array[selection]))

        # setitem: work on copies so the original arrays stay untouched;
        # the assignments below build on each other, so order matters
        arrow_array2 = arrow_array.copy()
        string_array2 = string_array.copy()
        arrow_array2[0] = 'ss'
        string_array2[0] = 'ss'
        self.assertListEqual(list(arrow_array2), list(string_array2))
        arrow_array2[1:3] = ['ss1', 'ss2']
        string_array2[1:3] = ['ss1', 'ss2']
        self.assertListEqual(list(arrow_array2), list(string_array2))
        arrow_array2[1:3] = arrow_array2[2:4]
        string_array2[1:3] = string_array2[2:4]
        self.assertListEqual(list(arrow_array2), list(string_array2))
        arrow_array2[2:] = pd.Series(['ss3', 'ss4'])
        string_array2[2:] = pd.Series(['ss3', 'ss4'])
        self.assertListEqual(list(arrow_array2), list(string_array2))
        # a list cannot be assigned to a scalar position
        with self.assertRaises(ValueError):
            arrow_array2[0] = ['a', 'b']
        arrow_array2[-1] = None
        string_array2[-1] = None
        # the last element is missing in both arrays, so it is checked with
        # isna instead of being compared for equality
        self.assertListEqual(list(arrow_array2)[:-1], list(string_array2)[:-1])
        self.assertTrue(pd.isna(list(arrow_array2)[-1]))
        # non-string values are rejected
        with self.assertRaises(ValueError):
            arrow_array2[0] = 2
        with self.assertRaises(ValueError):
            arrow_array2[:2] = [1, 2]

        # test to_numpy
        np.testing.assert_array_equal(arrow_array.to_numpy(),
                                      string_array.to_numpy())
        np.testing.assert_array_equal(arrow_array.to_numpy(copy=True),
                                      string_array.to_numpy(copy=True))
        np.testing.assert_array_equal(
            has_na_arrow_array.to_numpy(copy=True, na_value='ss'),
            has_na_string_array.to_numpy(copy=True, na_value='ss'))

        # test fillna
        arrow_array3 = has_na_arrow_array.fillna('filled')
        string_array3 = has_na_string_array.fillna('filled')
        self.assertListEqual(list(arrow_array3), list(string_array3))

        # test astype
        arrow_array4 = ArrowStringArray(['1', '10', '100'])
        # leverage string array to get the right answer
        string_array4 = pd.arrays.StringArray(
            np.array(['1', '10', '100'], dtype=object))
        np.testing.assert_array_equal(arrow_array4.astype(np.int64),
                                      string_array4.astype(np.int64))
        np.testing.assert_almost_equal(arrow_array4.astype(float),
                                       string_array4.astype(float))
        self.assertListEqual(
            list(arrow_array4.astype(ArrowStringDtype(), copy=False)),
            list(string_array4.astype(pd.StringDtype(), copy=False)))
        self.assertListEqual(
            list(arrow_array4.astype(ArrowStringDtype(), copy=True)),
            list(string_array4.astype(pd.StringDtype(), copy=True)))

        # test factorize
        codes, unique = arrow_array.factorize()
        codes2, unique2 = string_array.factorize()
        self.assertListEqual(list(codes), list(codes2))
        self.assertListEqual(list(unique), list(unique2))

        # test nbytes: compared with the deep memory usage of a Series that
        # wraps the pandas string array
        self.assertLess(arrow_array.nbytes,
                        pd.Series(string_array).memory_usage(deep=True))

        # test memory_usage
        self.assertEqual(arrow_array.memory_usage(deep=True),
                         arrow_array.nbytes)

        # test isna, first as-is and then with the private
        # _force_use_pandas flag set on a copy
        np.testing.assert_array_equal(has_na_arrow_array.isna(),
                                      has_na_string_array.isna())
        has_na_arrow_array2 = has_na_arrow_array.copy()
        has_na_arrow_array2._force_use_pandas = True
        np.testing.assert_array_equal(has_na_arrow_array2.isna(),
                                      has_na_string_array.isna())

        # test take
        self.assertListEqual(list(arrow_array.take([1, 2, -1])),
                             list(string_array.take([1, 2, -1])))
        self.assertListEqual(
            list(arrow_array.take([1, 2, -1], allow_fill=True).fillna('aa')),
            list(string_array.take([1, 2, -1], allow_fill=True).fillna('aa')))
        self.assertListEqual(
            list(arrow_array.take([1, 2, -1], allow_fill=True,
                                  fill_value='aa')),
            list(
                string_array.take([1, 2, -1], allow_fill=True,
                                  fill_value='aa')))

        # test shift
        self.assertListEqual(list(arrow_array.shift(2, fill_value='aa')),
                             list(string_array.shift(2, fill_value='aa')))

        # test value_counts
        self.assertListEqual(list(arrow_array.value_counts()),
                             list(string_array.value_counts()))
        self.assertListEqual(
            list(has_na_arrow_array.value_counts(dropna=True)),
            list(has_na_string_array.value_counts(dropna=True)))

        # test all any
        self.assertEqual(arrow_array.all(), string_array.all())
        self.assertEqual(arrow_array.any(), string_array.any())

        # test arithmetic
        self.assertListEqual(list(arrow_array + 's'), list(string_array + 's'))
        self.assertListEqual(
            list((arrow_array + has_na_arrow_array).fillna('ss')),
            list((string_array + has_na_string_array).fillna('ss')))

        # test comparison
        np.testing.assert_array_equal(arrow_array < 's', string_array < 's')
        pd.testing.assert_series_equal(
            pd.Series(arrow_array < has_na_arrow_array),
            pd.Series(string_array < has_na_string_array))

        # test repr
        self.assertIn('ArrowStringArray', repr(arrow_array))

        # test concat empty: two empty arrays must concatenate into an array
        # whose underlying arrow array has exactly one chunk
        arrow_array5 = ArrowStringArray(pa.chunked_array([], type=pa.string()))
        concatenated = ArrowStringArray._concat_same_type(
            [arrow_array5, arrow_array5])
        self.assertEqual(len(concatenated._arrow_array.chunks), 1)
        pd.testing.assert_series_equal(pd.Series(arrow_array5),
                                       pd.Series(concatenated))