def test_read_single_row_group():
    import pyarrow.parquet as pq
    # ARROW-471
    N, K = 10000, 4
    df = alltypes_sample(size=N)
    a_table = pa.Table.from_pandas(df)

    buf = io.BytesIO()
    # use integer division so row_group_size is an int under Python 3
    _write_table(a_table, buf, row_group_size=N // K,
                 compression='snappy', version='2.0')
    buf.seek(0)

    pf = pq.ParquetFile(buf)
    assert pf.num_row_groups == K

    row_groups = [pf.read_row_group(i) for i in range(K)]
    result = pa.concat_tables(row_groups)
    tm.assert_frame_equal(df, result.to_pandas())

def test_extract_parquet(self):
    file = BASE_DIR / 'amazon-reviews-1000.snappy.parquet'
    cell_value = '<td>TSD Airsoft/Paintball Full-Face Mask, Goggle Lens</td>'

    with patch('t4_lambda_shared.preview.get_available_memory') as mem_mock:
        mem_mock.return_value = 1
        with open(file, mode='rb') as parquet:
            body, info = extract_parquet(parquet)
            assert all(bracket in body for bracket in ('<', '>'))
            assert body.count('<') == body.count('>'), \
                'expected matching HTML tags'
            assert cell_value not in body, 'only expected columns'
            assert 'skipped rows' in info['warnings']

    with open(file, mode='rb') as parquet:
        body, info = extract_parquet(parquet, as_html=True)
        assert cell_value in body, 'missing expected HTML cell'

    with open(file, mode='rb') as parquet:
        body, info = extract_parquet(parquet, skip_rows=True)
        assert 'skipped rows' in info['warnings']
        assert cell_value not in body, 'only expected columns'

    with open(file, mode='rb') as parquet:
        body, info = extract_parquet(parquet, as_html=False)
        assert all(bracket not in body for bracket in ('<', '>')), \
            'did not expect HTML'

    parquet_file = pq.ParquetFile(file)
    assert all(
        column in info['schema']['names']
        for column in parquet_file.schema.names
    )
    assert [
        parquet_file.metadata.num_rows, parquet_file.metadata.num_columns
    ] == info['shape'], 'Unexpected number of rows or columns'

def test_read_common_metadata_files(tmpdir):
    import pyarrow.parquet as pq

    N = 100
    df = pd.DataFrame({
        'index': np.arange(N),
        'values': np.random.randn(N)
    }, columns=['index', 'values'])

    base_path = str(tmpdir)
    data_path = pjoin(base_path, 'data.parquet')

    table = pa.Table.from_pandas(df)
    _write_table(table, data_path)

    metadata_path = pjoin(base_path, '_metadata')
    pq.write_metadata(table.schema, metadata_path)

    dataset = pq.ParquetDataset(base_path)
    assert dataset.metadata_path == metadata_path

    pf = pq.ParquetFile(data_path)
    assert dataset.schema.equals(pf.schema)

    # handle list of one directory
    dataset2 = pq.ParquetDataset([base_path])
    assert dataset2.schema.equals(dataset.schema)

def temp_load(fname='df.gzip'):
    '''Load a Parquet file that was saved using temp_save;
    faster than re-loading and re-processing the CSV.'''
    _table = pq.ParquetFile(fname).read(use_pandas_metadata=True)
    # force strings to categoricals to save memory
    df = _table.to_pandas(strings_to_categorical=True)
    return df

def _generate_tables(self, files):
    schema = pa.schema(self.config.features.type) if self.config.features is not None else None
    if self.config.features is not None and self.config.columns is not None:
        if sorted(field.name for field in schema) != sorted(self.config.columns):
            raise ValueError(
                f"Tried to load parquet data with columns '{self.config.columns}' with mismatching features '{self.config.features}'"
            )
    for file_idx, file in enumerate(files):
        with open(file, "rb") as f:
            parquet_file = pq.ParquetFile(f)
            try:
                for batch_idx, record_batch in enumerate(
                    parquet_file.iter_batches(batch_size=self.config.batch_size, columns=self.config.columns)
                ):
                    pa_table = pa.Table.from_batches([record_batch])
                    if self.config.features is not None:
                        pa_table = pa.Table.from_arrays(
                            [pa_table[field.name] for field in schema], schema=schema
                        )
                    # Uncomment for debugging (will print the Arrow table size and elements)
                    # logger.warning(f"pa_table: {pa_table} num rows: {pa_table.num_rows}")
                    # logger.warning('\n'.join(str(pa_table.slice(i, 1).to_pydict()) for i in range(pa_table.num_rows)))
                    yield f"{file_idx}_{batch_idx}", pa_table
            except ValueError as e:
                logger.error(f"Failed to read file '{file}' with error {type(e)}: {e}")
                raise

def test_parquet_file_pass_directory_instead_of_file(tempdir):
    # ARROW-7208
    path = tempdir / 'directory'
    os.mkdir(str(path))

    with pytest.raises(IOError, match="Expected file path"):
        pq.ParquetFile(path)

def __init__(self, input: InputFile, expected_schema: Schema, options,
             filter_expr: Expression, case_sensitive: bool,
             start: int = None, end: int = None):
    self._stats: typing.Dict[str, int] = dict()
    self._input = input
    self._input_fo = input.new_fo()
    self._arrow_file = pq.ParquetFile(self._input_fo)
    self._file_schema = convert_parquet_to_iceberg(self._arrow_file)
    self._expected_schema = expected_schema
    self._file_to_expected_name_map = ParquetReader.get_field_map(
        self._file_schema, self._expected_schema)
    self._options = options
    self._filter = get_dataset_filter(
        filter_expr,
        ParquetReader.get_reverse_field_map(self._file_schema, self._expected_schema))
    self._case_sensitive = case_sensitive

    if start is not None or end is not None:
        raise NotImplementedError("Partial file reads are not yet supported")
    # self.start = start
    # self.end = end

    self.materialized_table = False
    self._table = None
    _logger.debug("Reader initialized for %s" % self._input.path)

def main(inputFile, outputFile):
    """Temporary hackarounds from using old pyarrow"""
    parquet_file = pq.ParquetFile(inputFile)
    df = parquet_file.read().to_pandas()

    # Brutally fill nulls with 0
    # It can be interpreted as boolean
    df.fillna(value=0, inplace=True)

    typeMapping = {
        'FLOAT': 'float32',
        'DOUBLE': 'float64',
        'BOOLEAN': 'bool',
        'BIGINT': 'int64',
        'INTEGER': 'int32',
        'TEXT': 'str',
    }

    filepath = os.path.join(getPackageDir("sdm_schemas"), 'yml', 'hsc.yaml')
    with open(filepath, 'r') as f:
        hscSchema = yaml.safe_load(f)['tables']

    objectSchema = [table for table in hscSchema if table['name'] == 'Object']
    hackDict = dict()
    for column in objectSchema[0]['columns']:
        sqlType = column['mysql:datatype']
        # Hack the types for now
        if sqlType in typeMapping:
            sqlType = typeMapping[sqlType]
        hackDict[column['name']] = sqlType

    print(hackDict)
    df = df.astype(hackDict, copy=False)

    table = pyarrow.Table.from_pandas(df)
    pq.write_table(table, outputFile, compression='none')

def is_match(cls, file_path, options=None):
    """
    Test the given file to check if the file has valid Parquet format.

    :param file_path: path to the file to be examined
    :type file_path: str
    :param options: parquet read options
    :type options: dict
    :return: is file a parquet file or not
    :rtype: bool
    """
    if options is None:
        options = dict()

    # get current position of stream
    if data_utils.is_stream_buffer(file_path):
        starting_location = file_path.tell()

    try:
        pfile = pq.ParquetFile(file_path)  # NOQA
        is_valid_parquet = True
    except Exception:
        is_valid_parquet = False

    # return to original position in stream
    if data_utils.is_stream_buffer(file_path):
        file_path.seek(starting_location, 0)

    return is_valid_parquet

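# Usage sketch (assumptions: the classmethod above lives on a hypothetical
# detector class, here called ParquetData, and data_utils treats a BytesIO
# object as a stream buffer). The key behavior it illustrates is that probing
# with pq.ParquetFile never moves the caller's stream position:
#
#     buf = io.BytesIO(raw_bytes)      # raw_bytes: contents of some file
#     buf.seek(16)
#     ok = ParquetData.is_match(buf)   # True only if the bytes parse as Parquet
#     assert buf.tell() == 16          # position restored before returning
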
def parquet_reader(folder):
    data = []
    for file in os.listdir(folder):
        if os.path.splitext(file)[1] == '.parquet':
            file_path = os.path.join(folder, file)
            chunk = pq.ParquetFile(file_path).read().to_pandas()
            data.append(chunk)
    return pd.concat(data, axis=0)

def dump(file_name):
    pqf = pq.ParquetFile(file_name)
    metadata = pqf.metadata
    dump_file(metadata)
    for row_group_index in range(metadata.num_row_groups):
        row_group = metadata.row_group(row_group_index)
        dump_row_group(row_group_index, row_group)

def __init__(self, filename, label):
    self.parquet = pq.ParquetFile(filename)
    # self.cols = None  # read all columns
    # self.cols = ['X_jet.list.item.list.item.list.item', 'am', 'apt', 'iphi', 'ieta']
    self.cols = [
        'X_jet.list.item.list.item.list.item',
        'am',
        'iphi',
        'ieta'
    ]
    self.label = label

def __init__(self, test_data_path, host):
    # read the test data (label, features) stored as a parquet file object
    self.parquet_file = pq.ParquetFile(test_data_path)
    print('-----------------------')
    # create the Kafka producer object
    self.producer = KafkaProducer(
        bootstrap_servers=[host],
        value_serializer=lambda x: dumps(x).encode('utf-8'))

def test_it_generates_new_file_without_matches(mock_delete):
    # Arrange
    column = {"Column": "customer_id", "MatchIds": ["12345", "23456"]}
    data = [{'customer_id': '12345'}, {'customer_id': '23456'}]
    df = pd.DataFrame(data)
    buf = BytesIO()
    df.to_parquet(buf)
    br = pa.BufferReader(buf.getvalue())
    f = pq.ParquetFile(br, memory_map=False)
    mock_delete.return_value = pd.DataFrame([{'customer_id': '12345'}])

    # Act
    out, stats = delete_matches_from_file(f, [column])

    assert isinstance(out, pa.BufferOutputStream)
    assert {"ProcessedRows": 2, "DeletedRows": 1} == stats
    res = pa.BufferReader(out.getvalue())
    newf = pq.ParquetFile(res, memory_map=False)
    assert 1 == newf.read().num_rows

def __init__(self, filename, label):
    self.parquet = pq.ParquetFile(filename)
    # self.cols = None  # read all columns
    self.cols = [
        'Xtz_aod.list.item.list.item.list.item',
        'm',
        'pt',
        'w',
        'iphi',
        'ieta'
    ]
    self.label = label

def get_parquet_schema(infile):
    """Get the schema from a parquet file."""
    try:
        parquet_schema = parquet.ParquetFile(infile)
        return parquet_schema.schema
    except BaseException as e:
        raise e

def make_sample_file(df):
    a_table = pa.Table.from_pandas(df, timestamps_to_ms=True)

    buf = io.BytesIO()
    pq.write_table(a_table, buf, compression='SNAPPY', version='2.0')

    buf.seek(0)
    return pq.ParquetFile(buf)

def test_pass_separate_metadata():
    # ARROW-471
    df = alltypes_sample(size=10000)

    a_table = pa.Table.from_pandas(df, timestamps_to_ms=True)

    buf = io.BytesIO()
    pq.write_table(a_table, buf, compression='snappy', version='2.0')
    buf.seek(0)

    metadata = pq.ParquetFile(buf).metadata
    buf.seek(0)

    fileh = pq.ParquetFile(buf, metadata=metadata)
    pdt.assert_frame_equal(df, fileh.read().to_pandas())

def read_groups(path, groups):
    pf = pq.ParquetFile(path)
    # read row groups into a pyarrow.Table and convert to a pandas DataFrame
    df = pf.read_row_groups(row_groups=groups).to_pandas()
    # clean up the parquet file handle
    del pf
    return df

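# Usage sketch (hypothetical path and indices): the group numbers are the
# 0-based row groups reported by pq.ParquetFile(path).num_row_groups.
#
#     df = read_groups('data/example.parquet', [0, 2])
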
def read_parquet():
    pf = pq.ParquetFile(TICKERS_DIR + '/ticker.parquet')
    for i in range(pf.metadata.num_row_groups):
        table = pf.read_row_group(i)
        columns = table.to_pydict()
        print(columns)

def test_pyarrow_compression():
    table = pv.read_csv("./data/people/people1.csv")
    pq.write_table(table, "./tmp/pyarrow_out/people1.parquet")
    parquet_file = pq.ParquetFile("./tmp/pyarrow_out/people1.parquet")
    # print(parquet_file.metadata)
    # print(parquet_file.metadata.row_group(0))
    # print(parquet_file.metadata.row_group(0).column(0))
    # print(parquet_file.metadata.row_group(0).column(0).statistics)
    assert parquet_file.metadata.row_group(0).column(0).compression == "SNAPPY"

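# Background note (my understanding of pyarrow, not part of the test): when no
# compression argument is passed, pq.write_table defaults to Snappy, which is
# why the column metadata above reports "SNAPPY".
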
def __init__(self, filename=None, dataFrame=None):
    if filename is not None:
        self._pf = pq.ParquetFile(filename)
        self._df = None
    elif dataFrame is not None:
        self._df = dataFrame
        self._pf = None
    else:
        raise ValueError('Either filename or dataFrame must be passed.')

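# Usage sketch (the enclosing class name is not shown above; "ParquetTable"
# below is a placeholder):
#
#     ParquetTable(filename='data.parquet')      # wraps a pq.ParquetFile
#     ParquetTable(dataFrame=some_pandas_frame)  # wraps an in-memory DataFrame
#     ParquetTable()                             # raises ValueError
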
def get_file_map(args):
    r = re.compile(args.parquet_genotype_pattern)
    files = os.listdir(args.parquet_genotype_folder)
    files = {int(r.search(f).groups()[0]): os.path.join(args.parquet_genotype_folder, f)
             for f in files if r.search(f)}
    p = {}
    for k, v in files.items():
        g = pq.ParquetFile(v)
        p[k] = g
    return p

def read_group_to_pandas(self, f, group_index, columns=None,
                         use_arrow_dtype=None, **kwargs):
    file = pq.ParquetFile(f)
    t = file.read_row_group(group_index, columns=columns, **kwargs)
    return self._table_to_pandas(t, use_arrow_dtype=use_arrow_dtype)

def _init_reader(self, file: Union[TextIO, BinaryIO]) -> ParquetFile:
    """Generates a new parquet reader

    Doc: https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetFile.html
    """
    options = self._select_options("buffer_size")  # type: ignore[arg-type]
    # Source is a file path and enabling memory_map can improve performance in some environments.
    options["memory_map"] = True
    return pq.ParquetFile(file, **options)

def __next__(self) -> pd.DataFrame:
    pq_reader = pq.ParquetFile(self.path)
    if self._current_row_group == pq_reader.num_row_groups:
        raise StopIteration
    table = pq_reader.read_row_group(self._current_row_group)
    self._current_row_group += 1
    return table.to_pandas()

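# Note (an observation, not part of the original class): pq.ParquetFile parses
# the file footer when constructed, so re-opening it on every __next__ call
# repeats that work. A common variant caches the reader once, e.g. in __init__:
#
#     self._pq_reader = pq.ParquetFile(self.path)
#
# and then reads row groups from self._pq_reader here.
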
def tile(cls, op):
    chunk_index = 0
    out_chunks = []
    out_df = op.outputs[0]

    dtypes = out_df.dtypes
    if op.use_arrow_dtype is None and not op.gpu and \
            options.dataframe.use_arrow_dtype:  # pragma: no cover
        # check if use_arrow_dtype set on the server side
        dtypes = to_arrow_dtypes(out_df.dtypes)

    shape = (np.nan, out_df.shape[1])
    paths = op.path if isinstance(op.path, (tuple, list)) else \
        glob(op.path, storage_options=op.storage_options)
    for pth in paths:
        if op.groups_as_chunks:
            for group_idx in range(pq.ParquetFile(pth).num_row_groups):
                chunk_op = op.copy().reset_key()
                chunk_op._path = pth
                chunk_op._group_index = group_idx
                new_chunk = chunk_op.new_chunk(
                    None, shape=shape, index=(chunk_index, 0),
                    index_value=out_df.index_value,
                    columns_value=out_df.columns_value,
                    dtypes=dtypes)
                out_chunks.append(new_chunk)
                chunk_index += 1
        else:
            chunk_op = op.copy().reset_key()
            chunk_op._path = pth
            new_chunk = chunk_op.new_chunk(
                None, shape=shape, index=(chunk_index, 0),
                index_value=out_df.index_value,
                columns_value=out_df.columns_value,
                dtypes=dtypes)
            out_chunks.append(new_chunk)
            chunk_index += 1

    if op.incremental_index:
        out_chunks = standardize_range_index(out_chunks)
    new_op = op.copy()
    nsplits = ((np.nan,) * len(out_chunks), (out_df.shape[1],))
    return new_op.new_dataframes(None, out_df.shape, dtypes=dtypes,
                                 index_value=out_df.index_value,
                                 columns_value=out_df.columns_value,
                                 chunks=out_chunks, nsplits=nsplits)

def _tile_no_partitioned(cls, op: "DataFrameReadParquet"):
    chunk_index = 0
    out_chunks = []
    out_df = op.outputs[0]

    dtypes = cls._to_arrow_dtypes(out_df.dtypes, op)
    shape = (np.nan, out_df.shape[1])
    paths = op.path if isinstance(op.path, (tuple, list)) else \
        glob(op.path, storage_options=op.storage_options)

    first_chunk_row_num, first_chunk_raw_bytes = None, None
    for i, pth in enumerate(paths):
        if i == 0:
            with open_file(pth, storage_options=op.storage_options) as f:
                first_chunk_row_num = get_engine(op.engine).get_row_num(f)
            first_chunk_raw_bytes = file_size(pth, storage_options=op.storage_options)

        if op.groups_as_chunks:
            num_row_groups = pq.ParquetFile(pth).num_row_groups
            for group_idx in range(num_row_groups):
                chunk_op = op.copy().reset_key()
                chunk_op._path = pth
                chunk_op._group_index = group_idx
                chunk_op._first_chunk_row_num = first_chunk_row_num
                chunk_op._first_chunk_raw_bytes = first_chunk_raw_bytes
                chunk_op._num_group_rows = num_row_groups
                new_chunk = chunk_op.new_chunk(
                    None, shape=shape, index=(chunk_index, 0),
                    index_value=out_df.index_value,
                    columns_value=out_df.columns_value,
                    dtypes=dtypes)
                out_chunks.append(new_chunk)
                chunk_index += 1
        else:
            chunk_op = op.copy().reset_key()
            chunk_op._path = pth
            chunk_op._first_chunk_row_num = first_chunk_row_num
            chunk_op._first_chunk_raw_bytes = first_chunk_raw_bytes
            new_chunk = chunk_op.new_chunk(
                None, shape=shape, index=(chunk_index, 0),
                index_value=out_df.index_value,
                columns_value=out_df.columns_value,
                dtypes=dtypes)
            out_chunks.append(new_chunk)
            chunk_index += 1

    if op.incremental_index:
        out_chunks = standardize_range_index(out_chunks)
    new_op = op.copy()
    nsplits = ((np.nan,) * len(out_chunks), (out_df.shape[1],))
    return new_op.new_dataframes(None, out_df.shape, dtypes=dtypes,
                                 index_value=out_df.index_value,
                                 columns_value=out_df.columns_value,
                                 chunks=out_chunks, nsplits=nsplits)

def read_parquet_metadata(path):
    """{docstring}"""
    pq_file = pq.ParquetFile(path)

    num_rows = pq_file.metadata.num_rows
    num_row_groups = pq_file.num_row_groups
    col_names = pq_file.schema.names

    return num_rows, num_row_groups, col_names

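# Minimal usage sketch, assuming pyarrow is installed; 'example.parquet' is a
# hypothetical scratch file written only so the metadata calls have input.
#
#     import pyarrow as pa
#     import pyarrow.parquet as pq
#
#     pq.write_table(pa.table({'a': [1, 2, 3]}), 'example.parquet')
#     rows, groups, names = read_parquet_metadata('example.parquet')
#     print(rows, groups, names)  # expected: 3 1 ['a']
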
def write_commonmetadata_file():
    with filesystem.open(os.path.join(path, "part.0.parquet")) as f:
        pf = pq.ParquetFile(f)

    all_metadata = copy.copy(pf.metadata.metadata)
    all_metadata[b'spatialpandas'] = b_spatial_metadata

    new_schema = pf.schema.to_arrow_schema().with_metadata(all_metadata)
    with filesystem.open(os.path.join(path, "_common_metadata"), 'wb') as f:
        pq.write_metadata(new_schema, f)