assert rg_meta.num_columns == ncols + 1  # +1 for index


@parquet
@pytest.mark.parametrize(
    'data, dtype, min_value, max_value, null_count, num_values',
    [
        ([1, 2, 2, None, 4], np.uint8, u'1', u'4', 1, 4),
        ([1, 2, 2, None, 4], np.uint16, u'1', u'4', 1, 4),
        ([1, 2, 2, None, 4], np.uint32, u'1', u'4', 1, 4),
        ([1, 2, 2, None, 4], np.uint64, u'1', u'4', 1, 4),
        ([-1, 2, 2, None, 4], np.int16, u'-1', u'4', 1, 4),
        ([-1, 2, 2, None, 4], np.int32, u'-1', u'4', 1, 4),
        ([-1, 2, 2, None, 4], np.int64, u'-1', u'4', 1, 4),
        ([-1.1, 2.2, 2.3, None, 4.4], np.float32, u'-1.1', u'4.4', 1, 4),
        ([-1.1, 2.2, 2.3, None, 4.4], np.float64, u'-1.1', u'4.4', 1, 4),
        (
            [u'', u'b', unichar(1000), None, u'aaa'],
            str, u' ', frombytes((unichar(1000) + u' ').encode('utf-8')),
            1, 4
        ),
        # np.bool_ instead of the bare np.bool alias: the alias was
        # deprecated in NumPy 1.20 and is absent from NumPy 1.24-1.26, where
        # evaluating this decorator would raise AttributeError.  np.bool_
        # names the same scalar type on every NumPy release.
        ([True, False, False, True, True], np.bool_, u'0', u'1', 0, 5),
    ]
)
def test_parquet_column_statistics_api(data, dtype, min_value, max_value,
                                       null_count, num_values):
    """Set up column-chunk metadata for a single-column frame of each dtype.

    Every parametrized case supplies the values and dtype used to build a
    one-column DataFrame, followed by ``min_value``, ``max_value``,
    ``null_count`` and ``num_values``.

    NOTE(review): the last four parameters are presumably the expected
    column statistics; they are not consumed by the statements visible in
    this part of the file -- confirm against the assertions that use
    ``col_meta``.
    """
    df = pd.DataFrame({'data': data}, dtype=dtype)
    fileh = make_sample_file(df)

    # Drill down: file metadata -> first row group -> first column chunk.
    meta = fileh.metadata
    rg_meta = meta.row_group(0)
    col_meta = rg_meta.column(0)
@parquet @pytest.mark.parametrize( 'data, dtype, min_value, max_value, null_count, num_values', [ ([1, 2, 2, None, 4], np.uint8, u'1', u'4', 1, 4), ([1, 2, 2, None, 4], np.uint16, u'1', u'4', 1, 4), ([1, 2, 2, None, 4], np.uint32, u'1', u'4', 1, 4), ([1, 2, 2, None, 4], np.uint64, u'1', u'4', 1, 4), ([-1, 2, 2, None, 4], np.int16, u'-1', u'4', 1, 4), ([-1, 2, 2, None, 4], np.int32, u'-1', u'4', 1, 4), ([-1, 2, 2, None, 4], np.int64, u'-1', u'4', 1, 4), ([-1.1, 2.2, 2.3, None, 4.4], np.float32, u'-1.1', u'4.4', 1, 4), ([-1.1, 2.2, 2.3, None, 4.4], np.float64, u'-1.1', u'4.4', 1, 4), ( [u'', u'b', unichar(1000), None, u'aaa'], str, u' ', frombytes((unichar(1000) + u' ').encode('utf-8')), 1, 4 ), ([True, False, False, True, True], np.bool, u'0', u'1', 0, 5), ] ) def test_parquet_column_statistics_api( data, dtype, min_value, max_value, null_count, num_values): df = pd.DataFrame({'data': data}, dtype=dtype) fileh = make_sample_file(df)