def test_read_multiple_parquet_files(self):
    """Read a directory of parquet files via ``self.hdfs.read_parquet``.

    A fresh, uniquely named directory is created under ``self.tmp_path``,
    populated by ``self._write_multiple_hdfs_pq_files`` and read back as
    a whole. The result must equal the table returned by that helper.
    """
    target_dir = pjoin(self.tmp_path, 'multi-parquet-' + guid())
    self.hdfs.mkdir(target_dir)

    expected_table = self._write_multiple_hdfs_pq_files(target_dir)
    actual_table = self.hdfs.read_parquet(target_dir)

    # Order the rows by the 'index' column and renumber them before
    # comparing (presumably because row order across the files is not
    # guaranteed on read -- confirm).
    actual_df = actual_table.to_pandas()
    actual_df = actual_df.sort_values(by='index')
    actual_df = actual_df.reset_index(drop=True)
    expected_df = expected_table.to_pandas()

    _pandas_api.assert_frame_equal(actual_df, expected_df)
def test_read_multiple_parquet_files_with_uri(self):
    """Read a directory of parquet files through ``pq.read_table`` by URI.

    The directory is created under ``self.tmp_path``, populated by
    ``self._write_multiple_hdfs_pq_files``, and then addressed through
    the value returned by ``_get_hdfs_uri`` rather than the plain path.
    """
    # NOTE(review): another method with this exact name is defined later
    # in this file; if both live in the same class, this definition is
    # shadowed and never collected -- confirm and rename one of them.
    import pyarrow.parquet as pq

    target_dir = pjoin(self.tmp_path, 'multi-parquet-uri-' + guid())
    self.hdfs.mkdir(target_dir)

    expected_table = self._write_multiple_hdfs_pq_files(target_dir)
    uri = _get_hdfs_uri(target_dir)
    actual_table = pq.read_table(uri)

    # Order the rows by the 'index' column and renumber them so that the
    # frame can be compared against the expected one.
    actual_df = actual_table.to_pandas()
    actual_df = actual_df.sort_values(by='index')
    actual_df = actual_df.reset_index(drop=True)
    expected_df = expected_table.to_pandas()

    _pandas_api.assert_frame_equal(actual_df, expected_df)
def test_read_multiple_parquet_files_with_uri(self):
    """Read a directory of parquet files by URI with the legacy dataset.

    The directory is created under ``self.tmp_path``, populated by
    ``self._write_multiple_hdfs_pq_files``, and then read with
    ``pq.read_table`` using the value returned by ``_get_hdfs_uri`` and
    ``use_legacy_dataset=True``.
    """
    # NOTE(review): an earlier method in this file carries the same name;
    # if both live in the same class, this definition replaces it --
    # confirm which variant is meant to be kept.
    import pyarrow.parquet as pq

    target_dir = pjoin(self.tmp_path, 'multi-parquet-uri-' + guid())
    self.hdfs.mkdir(target_dir)

    expected_table = self._write_multiple_hdfs_pq_files(target_dir)
    uri = _get_hdfs_uri(target_dir)

    # TODO for URI it should not be needed to pass this argument
    actual_table = pq.read_table(uri, use_legacy_dataset=True)

    # Order the rows by the 'index' column and renumber them so that the
    # frame can be compared against the expected one.
    actual_df = actual_table.to_pandas()
    actual_df = actual_df.sort_values(by='index')
    actual_df = actual_df.reset_index(drop=True)
    expected_df = expected_table.to_pandas()

    _pandas_api.assert_frame_equal(actual_df, expected_df)
def test_read_write_parquet_files_with_uri(self):
    """Round-trip one parquet file through a URI path on ``self.hdfs``.

    A small dataframe is converted to a table, written with
    ``pq.write_table`` to the location returned by ``_get_hdfs_uri``, and
    read back with ``pq.read_table``. Both calls are given
    ``filesystem=self.hdfs``. The frame read back must equal the frame
    that was written.
    """
    import pyarrow.parquet as pq

    target_dir = pjoin(self.tmp_path, 'uri-parquet-' + guid())
    self.hdfs.mkdir(target_dir)
    file_uri = _get_hdfs_uri(pjoin(target_dir, 'test.parquet'))

    num_rows = 5
    expected_df = test_parquet._test_dataframe(num_rows, seed=0)

    # Hack so that we don't have a dtype cast in v1 files
    widened = expected_df['uint32'].astype(np.int64)
    expected_df['uint32'] = widened

    source_table = pa.Table.from_pandas(expected_df, preserve_index=False)
    pq.write_table(source_table, file_uri, filesystem=self.hdfs)

    loaded_table = pq.read_table(file_uri, filesystem=self.hdfs)
    actual_df = loaded_table.to_pandas()

    _pandas_api.assert_frame_equal(actual_df, expected_df)