def test_hdfs_read_multiple_parquet_files(self):
    """Write several Parquet files into one directory and read them back.

    Each file carries a distinct, contiguous range of values in its
    ``index`` column, so sorting the directory read by ``index`` is
    expected to reproduce the concatenation of the written tables.
    """
    import pyarrow.parquet as pq

    nfiles = 10
    size = 5

    tmpdir = pjoin(self.tmp_path, 'multi-parquet-' + guid())
    self.hdfs.mkdir(tmpdir)

    test_data = []
    for i in range(nfiles):
        df = test_parquet._test_dataframe(size, seed=i)

        # Explicit ordering column; rows i*size .. (i+1)*size - 1 go to file i.
        df['index'] = np.arange(i * size, (i + 1) * size)

        # Hack so that we don't have a dtype cast in v1 files
        df['uint32'] = df['uint32'].astype(np.int64)

        path = pjoin(tmpdir, '{0}.parquet'.format(i))

        # Do not serialize the pandas index: the read-back frame is compared
        # after reset_index(drop=True), so a stored per-file index could make
        # the expected frame's index differ from the result's.
        table = pa.Table.from_pandas(df, preserve_index=False)
        with self.hdfs.open(path, 'wb') as f:
            pq.write_table(table, f)

        test_data.append(table)

    result = self.hdfs.read_parquet(tmpdir)
    expected = pa.concat_tables(test_data)

    pdt.assert_frame_equal(
        result.to_pandas().sort_values(by='index').reset_index(drop=True),
        expected.to_pandas())
def test_read_multiple_parquet_files(self):
    """Write several Parquet files into one directory and read them back.

    Each file carries a distinct, contiguous range of values in its
    ``index`` column, so sorting the directory read by ``index`` is
    expected to reproduce the concatenation of the written tables.
    """
    import pyarrow.parquet as pq

    nfiles = 10
    size = 5

    tmpdir = pjoin(self.tmp_path, 'multi-parquet-' + guid())
    self.hdfs.mkdir(tmpdir)

    test_data = []
    for i in range(nfiles):
        df = test_parquet._test_dataframe(size, seed=i)

        # Explicit ordering column; rows i*size .. (i+1)*size - 1 go to file i.
        df['index'] = np.arange(i * size, (i + 1) * size)

        # Hack so that we don't have a dtype cast in v1 files
        df['uint32'] = df['uint32'].astype(np.int64)

        path = pjoin(tmpdir, '{0}.parquet'.format(i))

        table = pa.Table.from_pandas(df, preserve_index=False)
        with self.hdfs.open(path, 'wb') as f:
            pq.write_table(table, f)

        test_data.append(table)

    result = self.hdfs.read_parquet(tmpdir)
    expected = pa.concat_tables(test_data)

    # The directory read gives no ordering guarantee this test relies on, so
    # sort by the explicit column and drop the resulting index before comparing.
    pdt.assert_frame_equal(
        result.to_pandas().sort_values(by='index').reset_index(drop=True),
        expected.to_pandas())
def test_read_write_parquet_files_with_uri(self):
    """Round-trip one DataFrame through Parquet at a URI-style path.

    The target path comes from ``_get_hdfs_uri``; both the write and the
    read pass ``filesystem=self.hdfs``. The frame read back must equal the
    frame that was written.
    """
    import pyarrow.parquet as pq

    work_dir = pjoin(self.tmp_path, 'uri-parquet-' + guid())
    self.hdfs.mkdir(work_dir)
    uri = _get_hdfs_uri(pjoin(work_dir, 'test.parquet'))

    frame = test_parquet._test_dataframe(5, seed=0)
    # Hack so that we don't have a dtype cast in v1 files
    frame['uint32'] = frame['uint32'].astype(np.int64)

    pq.write_table(
        pa.Table.from_pandas(frame, preserve_index=False),
        uri,
        filesystem=self.hdfs,
    )

    round_tripped = pq.read_table(uri, filesystem=self.hdfs)
    _pandas_api.assert_frame_equal(round_tripped.to_pandas(), frame)
def test_read_write_parquet_files_with_uri(self):
    """Round-trip one DataFrame through Parquet at a URI-style path.

    The target path comes from ``_get_hdfs_uri`` and is handed to
    ``pq.write_table`` / ``pq.read_table`` without a filesystem argument.
    The frame read back must equal the frame that was written.
    """
    import pyarrow.parquet as pq

    work_dir = pjoin(self.tmp_path, 'uri-parquet-' + guid())
    self.hdfs.mkdir(work_dir)
    uri = _get_hdfs_uri(pjoin(work_dir, 'test.parquet'))

    frame = test_parquet._test_dataframe(5, seed=0)
    # Hack so that we don't have a dtype cast in v1 files
    frame['uint32'] = frame['uint32'].astype(np.int64)

    pq.write_table(pa.Table.from_pandas(frame, preserve_index=False), uri)

    round_tripped = pq.read_table(uri)
    pdt.assert_frame_equal(round_tripped.to_pandas(), frame)
def _write_multiple_hdfs_pq_files(self, tmpdir):
    """Write ten five-row Parquet files under ``tmpdir``.

    Files are named ``0.parquet`` .. ``9.parquet`` and written through
    ``self.hdfs``. Each frame gets an ``index`` column holding a distinct
    contiguous range so callers can restore a deterministic order.

    Returns the concatenation (``pa.concat_tables``) of the tables that
    were written, in write order.
    """
    import pyarrow.parquet as pq

    file_count = 10
    rows_per_file = 5

    written = []
    for file_no in range(file_count):
        frame = test_parquet._test_dataframe(rows_per_file, seed=file_no)

        first_row = file_no * rows_per_file
        frame['index'] = np.arange(first_row, first_row + rows_per_file)

        # Hack so that we don't have a dtype cast in v1 files
        frame['uint32'] = frame['uint32'].astype(np.int64)

        target = pjoin(tmpdir, '{}.parquet'.format(file_no))
        piece = pa.Table.from_pandas(frame, preserve_index=False)
        with self.hdfs.open(target, 'wb') as sink:
            pq.write_table(piece, sink)

        written.append(piece)

    return pa.concat_tables(written)
def _write_multiple_hdfs_pq_files(self, tmpdir):
    """Populate ``tmpdir`` with ten five-row Parquet files.

    Files are named ``0.parquet`` .. ``9.parquet`` and written through
    ``self.hdfs``. Every frame is given an ``index`` column with its own
    contiguous range of values so the combined data can be re-sorted.

    Returns the concatenation (``pa.concat_tables``) of the tables that
    were written, in write order.
    """
    import pyarrow.parquet as pq

    file_count = 10
    rows_per_file = 5

    tables = []
    for n in range(file_count):
        frame = test_parquet._test_dataframe(rows_per_file, seed=n)

        offset = n * rows_per_file
        frame['index'] = np.arange(offset, offset + rows_per_file)

        # Hack so that we don't have a dtype cast in v1 files
        frame['uint32'] = frame['uint32'].astype(np.int64)

        destination = pjoin(tmpdir, '{0}.parquet'.format(n))
        chunk = pa.Table.from_pandas(frame, preserve_index=False)
        with self.hdfs.open(destination, 'wb') as out:
            pq.write_table(chunk, out)

        tables.append(chunk)

    return pa.concat_tables(tables)