def test_read_single_row_group():
    # ARROW-471
    N, K = 10000, 4
    df = alltypes_sample(size=N)

    a_table = pa.Table.from_pandas(df, timestamps_to_ms=True)

    buf = io.BytesIO()
    pq.write_table(a_table, buf, row_group_size=N / K,
                   compression='snappy', version='2.0')
    buf.seek(0)

    pf = pq.ParquetFile(buf)

    assert pf.num_row_groups == K

    row_groups = [pf.read_row_group(i) for i in range(K)]
    result = pa.concat_tables(row_groups)
    pdt.assert_frame_equal(df, result.to_pandas())

    cols = df.columns[:2]
    row_groups = [pf.read_row_group(i, columns=cols) for i in range(K)]
    result = pa.concat_tables(row_groups)
    pdt.assert_frame_equal(df[cols], result.to_pandas())
def test_read_multiple_parquet_files(self):
    import pyarrow.parquet as pq

    nfiles = 10
    size = 5

    tmpdir = pjoin(self.tmp_path, 'multi-parquet-' + guid())
    self.hdfs.mkdir(tmpdir)

    test_data = []
    paths = []
    for i in range(nfiles):
        df = test_parquet._test_dataframe(size, seed=i)
        df['index'] = np.arange(i * size, (i + 1) * size)

        # Hack so that we don't have a dtype cast in v1 files
        df['uint32'] = df['uint32'].astype(np.int64)

        path = pjoin(tmpdir, '{0}.parquet'.format(i))

        table = pa.Table.from_pandas(df, preserve_index=False)
        with self.hdfs.open(path, 'wb') as f:
            pq.write_table(table, f)

        test_data.append(table)
        paths.append(path)

    result = self.hdfs.read_parquet(tmpdir)
    expected = pa.concat_tables(test_data)

    pdt.assert_frame_equal(result.to_pandas()
                           .sort_values(by='index').reset_index(drop=True),
                           expected.to_pandas())
def test_concat_tables():
    data = [
        list(range(5)),
        [-10., -5., 0., 5., 10.]
    ]
    data2 = [
        list(range(5, 10)),
        [1., 2., 3., 4., 5.]
    ]

    t1 = pa.Table.from_arrays([pa.from_pylist(x) for x in data],
                              names=('a', 'b'), name='table_name')
    t2 = pa.Table.from_arrays([pa.from_pylist(x) for x in data2],
                              names=('a', 'b'), name='table_name')

    result = pa.concat_tables([t1, t2], output_name='foo')
    assert result.name == 'foo'
    assert len(result) == 10

    expected = pa.Table.from_arrays([pa.from_pylist(x + y)
                                     for x, y in zip(data, data2)],
                                    names=('a', 'b'), name='foo')

    assert result.equals(expected)
def test_indexed_table_mixin():
    n_rows_per_chunk = 10
    n_chunks = 4
    pa_table = pa.Table.from_pydict({"col": [0] * n_rows_per_chunk})
    pa_table = pa.concat_tables([pa_table] * n_chunks)
    table = Table(pa_table)
    assert all(
        table._offsets.tolist() == np.cumsum([0] + [n_rows_per_chunk] * n_chunks))
    assert table.fast_slice(5) == pa_table.slice(5)
    assert table.fast_slice(2, 13) == pa_table.slice(2, 13)
def append(self, other_stream: "EventStream"):
    """
    Add another EventStream onto the calling one if they have the same name.

    :param other_stream: the other stream to add to the current one
    """
    if other_stream.name == self.name:
        self._data = pa.concat_tables([self._data, other_stream._data])
        self.timestamps_metadata = {**self.timestamps_metadata,
                                    **other_stream.timestamps_metadata}
        self.metadata = {**self.metadata, **other_stream.metadata}
        self._errors.extend_error(other_stream.errors())
def read_files(read_paths: List[str],
               fs: Union["pyarrow.fs.FileSystem", _S3FileSystemWrapper]):
    logger.debug(f"Reading {len(read_paths)} files.")
    if isinstance(fs, _S3FileSystemWrapper):
        fs = fs.unwrap()
    tables = []
    for read_path in read_paths:
        with fs.open_input_file(read_path) as f:
            tables.append(read_file(f, **reader_args))
    return pa.concat_tables(tables)
def loaf(self, func, chunksize=1_000_000):
    """
    ArrowLoaf: Generate DataFrames. Apply function to each frame.
    Loaf results together.

    Function must not change table schema.
    """
    chunks, schema = self.chunks, self.schema

    chunks = map(func, chunks(chunksize))
    chunks = concat_tables(build(x, schema=schema) for x in chunks)

    return type(self)(chunks)
def merge_sorted_blocks(
        blocks: List[Block[T]], key: SortKeyT,
        _descending: bool) -> Tuple[Block[T], BlockMetadata]:
    blocks = [b for b in blocks if b.num_rows > 0]
    if len(blocks) == 0:
        ret = pyarrow.Table.from_pydict({})
    else:
        ret = pyarrow.concat_tables(blocks, promote=True)
        indices = pyarrow.compute.sort_indices(ret, sort_keys=key)
        ret = ret.take(indices)
    return ret, ArrowBlockAccessor(ret).get_metadata(None)
def test_filter_table_ordering():
    table1 = pa.table({'a': [1, 2, 3, 4], 'b': ['a'] * 4})
    table2 = pa.table({'a': [1, 2, 3, 4], 'b': ['b'] * 4})
    table = pa.concat_tables([table1, table2])

    for _ in range(20):
        # 20 iterations seem to consistently cause errors when order is not
        # preserved. If the order problem is reintroduced, this test will
        # become flaky, which is still a signal that order is not preserved.
        r = ep._filter_table(table, pc.field('a') == 1)
        assert r["b"] == pa.chunked_array([["a"], ["b"]])
def build(self) -> Block:
    if self._columns:
        tables = [pyarrow.Table.from_pydict(self._columns)]
    else:
        tables = []
    tables.extend(self._tables)
    if len(tables) > 1:
        return pyarrow.concat_tables(tables)
    elif len(tables) > 0:
        return tables[0]
    else:
        return pyarrow.Table.from_pydict({})
def test_open_dataset_list_of_files(tempdir):
    tables, (path1, path2) = _create_directory_of_files(tempdir)
    table = pa.concat_tables(tables)

    datasets = [
        ds.dataset([path1, path2]),
        ds.dataset([str(path1), str(path2)])
    ]
    for dataset in datasets:
        assert dataset.schema.equals(table.schema)
        result = dataset.to_table()
        assert result.equals(table)
def build(self) -> "ArrowBlock[T]": if self._columns: tables = [pyarrow.Table.from_pydict(self._columns)] else: tables = [] tables.extend(self._tables) if len(tables) > 1: return ArrowBlock(pyarrow.concat_tables(tables)) elif len(tables) > 0: return ArrowBlock(tables[0]) else: return ArrowBlock(pyarrow.Table.from_pydict({}))
def merge_sorted_blocks( blocks: List[Block[T]], key: "SortKeyT", _descending: bool ) -> Tuple[Block[T], BlockMetadata]: stats = BlockExecStats.builder() blocks = [b for b in blocks if b.num_rows > 0] if len(blocks) == 0: ret = ArrowBlockAccessor._empty_table() else: ret = pyarrow.concat_tables(blocks, promote=True) indices = pyarrow.compute.sort_indices(ret, sort_keys=key) ret = ArrowBlockAccessor.take_table(ret, indices) return ret, ArrowBlockAccessor(ret).get_metadata(None, exec_stats=stats.build())
def test_open_dataset_list_of_files(tempdir):
    tables, (path1, path2) = _create_directory_of_files(tempdir)
    table = pa.concat_tables(tables)

    # A list of exact files needs to be passed to the source() function
    # (dataset() will interpret it as separate sources)
    for dataset in [
            ds.dataset(ds.source([path1, path2])),
            ds.dataset(ds.source([str(path1), str(path2)]))]:
        assert dataset.schema.equals(table.schema, check_metadata=False)
        result = dataset.new_scan().finish().to_table()
        assert result.replace_schema_metadata().equals(table)
def csv_read(read_paths: List[str]):
    logger.debug(f"Reading {len(read_paths)} files.")
    tables = []
    for read_path in read_paths:
        with filesystem.open_input_file(read_path) as f:
            tables.append(
                csv.read_csv(
                    f,
                    read_options=csv.ReadOptions(use_threads=False),
                    **arrow_csv_args))
    block = ArrowBlock(pa.concat_tables(tables))
    return block, block.get_metadata(input_files=read_paths)
def do_one_year(y, sales_promo=True):
    start = time.time()
    print("Processing Year:\t", y)
    stores_list = self.stores_df[
        self.stores_df.panel_year == y].store_code_uc.unique()
    out = pa.concat_tables([
        read_one_sales(f, stores_list, incl_promo=sales_promo)
        for f in self.sales_dict[y]
    ])
    end = time.time()
    print("in ", end - start, " seconds.")
    return out
def test_concat_tables_with_promotion():
    t1 = pa.Table.from_arrays(
        [pa.array([1, 2], type=pa.int64())], ["int64_field"])
    t2 = pa.Table.from_arrays(
        [pa.array([1.0, 2.0], type=pa.float32())], ["float_field"])

    result = pa.concat_tables([t1, t2], promote=True)

    assert result.equals(pa.Table.from_arrays([
        pa.array([1, 2, None, None], type=pa.int64()),
        pa.array([None, None, 1.0, 2.0], type=pa.float32()),
    ], ["int64_field", "float_field"]))
def fetch(self, verbose):
    ts = []
    for i, p in enumerate(self.dataset.pieces):
        if self.partition_check(self.partition_values[i], self.part_filters):
            ts.append(
                p.read(columns=[
                    c for c in self.columns_backward
                    if c not in self.partition_keys
                ], partitions=self.dataset.partitions))
    t = pa.concat_tables(ts)
    return (filters(t, self.value_filters) if self.value_filters else t)
def test_read_single_row_group_with_column_subset():
    N, K = 10000, 4
    df = alltypes_sample(size=N)
    a_table = pa.Table.from_pandas(df)

    buf = io.BytesIO()
    _write_table(a_table, buf, row_group_size=N / K,
                 compression='snappy', version='2.0')
    buf.seek(0)

    pf = pq.ParquetFile(buf)

    cols = list(df.columns[:2])
    row_groups = [pf.read_row_group(i, columns=cols) for i in range(K)]
    result = pa.concat_tables(row_groups)
    tm.assert_frame_equal(df[cols], result.to_pandas())

    # ARROW-4267: Selection of duplicate columns still leads to these columns
    # being read uniquely.
    row_groups = [pf.read_row_group(i, columns=cols + cols) for i in range(K)]
    result = pa.concat_tables(row_groups)
    tm.assert_frame_equal(df[cols], result.to_pandas())
def test_concat_tables(arrow_file, in_memory_pa_table):
    t0 = in_memory_pa_table
    t1 = InMemoryTable(t0)
    t2 = MemoryMappedTable.from_file(arrow_file)
    t3 = ConcatenationTable.from_blocks(t1)
    concatenated_table = concat_tables([t0, t1, t2, t3])
    assert concatenated_table.table == pa.concat_tables([t0] * 4)
    assert isinstance(concatenated_table, ConcatenationTable)
    assert len(concatenated_table.blocks) == 4
    assert isinstance(concatenated_table.blocks[0][0], InMemoryTable)
    assert isinstance(concatenated_table.blocks[1][0], InMemoryTable)
    assert isinstance(concatenated_table.blocks[2][0], MemoryMappedTable)
    assert isinstance(concatenated_table.blocks[3][0], InMemoryTable)
def gen_read(pieces: List[pq.ParquetDatasetPiece]):
    import pyarrow
    logger.debug("Reading {} parquet pieces".format(len(pieces)))
    tables = [
        piece.read(columns=columns, use_threads=False, partitions=partitions)
        for piece in pieces
    ]
    if len(tables) > 1:
        table = pyarrow.concat_tables(tables)
    else:
        table = tables[0]
    return ArrowBlock(table)
def test_open_dataset_list_of_files(tempdir):
    tables, (path1, path2) = _create_directory_of_files(tempdir)
    table = pa.concat_tables(tables)

    # A list of exact files needs to be passed to the source() function
    # (dataset() will interpret it as separate sources)
    for dataset in [
            ds.dataset(ds.source([path1, path2])),
            ds.dataset(ds.source([str(path1), str(path2)]))
    ]:
        assert dataset.schema.equals(table.schema, check_metadata=False)
        result = dataset.to_table(use_threads=False)  # deterministic row order
        assert result.equals(table, check_metadata=False)
def get_data(
    self, selector: SeriesSelector, start_date: datetime, end_date: datetime
) -> pa.Table:
    """Return the data for the given series in the given time frame,
    taking into account the request policy."""
    if start_date == end_date or selector.name is None:
        return pa.Table.from_pydict({"ts": [], "value": []})
    tables = [
        self.__source.data.get_data(selector, start, end)
        for start, end in self.__to_intervals(start_date, end_date)
    ]
    tables = [table for table in tables if len(table) > 0]
    if len(tables) == 0:
        return pa.Table.from_pydict({"ts": [], "value": []})
    return pa.concat_tables(tables)
def aggregate(self, data):
    names = list(data[0].keys())
    cols = {name: [] for name in names}
    for entry in data:
        for key in entry:
            cols[key].append(entry[key])
    arrays = [pa.array(cols[col]) for col in cols]
    table = pa.Table.from_arrays(arrays, names=names)
    if self.data is None:
        self.data = table
    else:
        # pa.concat_tables takes a list of tables, not separate arguments
        self.data = pa.concat_tables([self.data, table])
def test_concat_tables():
    data = [list(range(5)), [-10., -5., 0., 5., 10.]]
    data2 = [list(range(5, 10)), [1., 2., 3., 4., 5.]]

    t1 = pa.Table.from_arrays([pa.array(x) for x in data], names=('a', 'b'))
    t2 = pa.Table.from_arrays([pa.array(x) for x in data2], names=('a', 'b'))

    result = pa.concat_tables([t1, t2])
    assert len(result) == 10

    expected = pa.Table.from_arrays(
        [pa.array(x + y) for x, y in zip(data, data2)], names=('a', 'b'))

    assert result.equals(expected)
def read_pieces(pieces: List["pyarrow._dataset.ParquetFileFragment"]): import pyarrow as pa logger.debug(f"Reading {len(pieces)} parquet pieces") use_threads = reader_args.pop("use_threads", False) tables = [ piece.to_table(use_threads=use_threads, columns=columns, **reader_args) for piece in pieces ] if len(tables) > 1: table = pa.concat_tables(tables) else: table = tables[0] return table
def test_upcast_pyarrow_dicts() -> None:
    # 1752
    tbls = []
    for i in range(128):
        tbls.append(
            pa.table({
                "col_name": pa.array(["value_" + str(i)],
                                     pa.dictionary(pa.int8(), pa.string())),
            }))

    tbl = pa.concat_tables(tbls, promote=True)
    out = pl.from_arrow(tbl)
    assert out.shape == (128, 1)
def test_concatenation_table_from_tables(in_memory_pa_table):
    in_memory_table = InMemoryTable(in_memory_pa_table)
    concatenation_table = ConcatenationTable.from_blocks(in_memory_table)
    with assert_arrow_memory_doesnt_increase():
        table = ConcatenationTable.from_tables(
            [in_memory_pa_table, in_memory_table, concatenation_table])
    assert table.table == pa.concat_tables([in_memory_pa_table] * 3)
    assert isinstance(table, ConcatenationTable)
    assert len(table.blocks) == 3
    assert all(len(tables) == 1 for tables in table.blocks)
    assert all(
        isinstance(tables[0], InMemoryTable) for tables in table.blocks)
    assert all(tables[0].table == in_memory_pa_table for tables in table.blocks)
def flush(self, idx):
    if idx % get_world_size() == get_rank():
        input_tables = []
    num_samples_to_flush = 0
    while len(self._input_files) > 0:
        input_file = self._input_files.pop()
        num_samples_to_flush += input_file.num_samples
        if idx % get_world_size() == get_rank():
            input_tables.append(self._read_table_from_file(input_file))
    if num_samples_to_flush > 0:
        self._store(
            num_samples_to_flush,
            table=(pa.concat_tables(input_tables)
                   if (idx % get_world_size() == get_rank()) else None),
        )
def merge_parquets(self):
    """
    Merge all the parquet files (one per species) into a single larger one.
    """
    parquet_lst = glob(''.join([self.path_parquet, '*.parquet']))
    pq_tables = []

    for f in tqdm(parquet_lst):
        table = pq.read_table(f)
        pq_tables.append(table)
        os.remove(f)

    final_table = pa.concat_tables(pq_tables)
    pq.write_table(final_table, ''.join([self.path_file, '.parquet']),
                   use_dictionary=True, compression='snappy')
def _arrow_row_slice(self, row_numeric_idx):
    table = self._execute_arrow()
    if isinstance(row_numeric_idx, slice):
        start = 0 if row_numeric_idx.start is None else row_numeric_idx.start
        if start < 0:
            start = table.num_rows - start
        end = (table.num_rows
               if row_numeric_idx.stop is None else row_numeric_idx.stop)
        if end < 0:
            end = table.num_rows - end
        if row_numeric_idx.step is None or row_numeric_idx.step == 1:
            length = 0 if start >= end else end - start
            return table.slice(start, length)
        else:
            parts = []
            for i in range(start, end, row_numeric_idx.step):
                parts.append(table.slice(i, 1))
            return pyarrow.concat_tables(parts)

    start = None
    end = None
    parts = []
    for idx in row_numeric_idx:
        if start is None:
            start = idx
            end = idx
        elif idx == end + 1:
            end = idx
        else:
            if start:
                parts.append(table.slice(start, end - start + 1))
            start = idx
            end = idx
    parts.append(table.slice(start, end - start + 1))

    return pyarrow.concat_tables(parts)
def download_data_from_s3(bucket, key):
    s3_cli = boto3.client('s3')
    response = s3_cli.list_objects_v2(Bucket=bucket, Prefix=key)
    keys = [content['Key'] for content in response['Contents']
            if content['Key'][-8:] != '_SUCCESS']
    tables = []
    with tqdm.tqdm(total=len(keys), position=0,
                   mininterval=5, maxinterval=20) as pbar:
        for key in keys:
            obj = io.BytesIO()
            s3_cli.download_fileobj(bucket, key, obj)
            data = pyarrow.orc.ORCFile(obj)
            tables.append(data.read())
            pbar.update(1)
    meta_df = pyarrow.concat_tables(tables).to_pandas().fillna(0)
    return meta_df
def test_column_of_lists_chunked2(self):
    data1 = [[0, 1], [2, 3], [4, 5], [6, 7], [10, 11], [12, 13],
             [14, 15], [16, 17]]
    data2 = [[8, 9], [18, 19]]

    a1 = pa.array(data1)
    a2 = pa.array(data2)

    t1 = pa.Table.from_arrays([a1], names=['a'])
    t2 = pa.Table.from_arrays([a2], names=['a'])

    concatenated = pa.concat_tables([t1, t2])

    result = concatenated.to_pandas()
    expected = pd.DataFrame({'a': data1 + data2})

    tm.assert_frame_equal(result, expected)
def test_read_single_row_group_with_column_subset():
    import pyarrow.parquet as pq

    N, K = 10000, 4
    df = alltypes_sample(size=N)
    a_table = pa.Table.from_pandas(df, timestamps_to_ms=True)

    buf = io.BytesIO()
    _write_table(a_table, buf, row_group_size=N / K,
                 compression='snappy', version='2.0')
    buf.seek(0)

    pf = pq.ParquetFile(buf)

    cols = df.columns[:2]
    row_groups = [pf.read_row_group(i, columns=cols) for i in range(K)]
    result = pa.concat_tables(row_groups)
    tm.assert_frame_equal(df[cols], result.to_pandas())
def test_concat_tables_with_different_schema_metadata():
    import pandas as pd

    schema = pa.schema([
        pa.field('a', pa.string()),
        pa.field('b', pa.string()),
    ])

    values = list('abcdefgh')
    df1 = pd.DataFrame({'a': values, 'b': values})
    df2 = pd.DataFrame({'a': [np.nan] * 8, 'b': values})

    table1 = pa.Table.from_pandas(df1, schema=schema, preserve_index=False)
    table2 = pa.Table.from_pandas(df2, schema=schema, preserve_index=False)
    assert table1.schema.equals(table2.schema, check_metadata=False)
    assert not table1.schema.equals(table2.schema, check_metadata=True)

    table3 = pa.concat_tables([table1, table2])
    assert table1.schema.equals(table3.schema, check_metadata=True)
    assert table2.schema.equals(table3.schema, check_metadata=False)
def test_read_single_row_group():
    import pyarrow.parquet as pq

    # ARROW-471
    N, K = 10000, 4
    df = alltypes_sample(size=N)
    a_table = pa.Table.from_pandas(df)

    buf = io.BytesIO()
    _write_table(a_table, buf, row_group_size=N / K,
                 compression='snappy', version='2.0')
    buf.seek(0)

    pf = pq.ParquetFile(buf)

    assert pf.num_row_groups == K

    row_groups = [pf.read_row_group(i) for i in range(K)]
    result = pa.concat_tables(row_groups)

    tm.assert_frame_equal(df, result.to_pandas())
def _write_multiple_hdfs_pq_files(self, tmpdir):
    import pyarrow.parquet as pq

    nfiles = 10
    size = 5

    test_data = []
    for i in range(nfiles):
        df = test_parquet._test_dataframe(size, seed=i)
        df['index'] = np.arange(i * size, (i + 1) * size)

        # Hack so that we don't have a dtype cast in v1 files
        df['uint32'] = df['uint32'].astype(np.int64)

        path = pjoin(tmpdir, '{0}.parquet'.format(i))

        table = pa.Table.from_pandas(df, preserve_index=False)
        with self.hdfs.open(path, 'wb') as f:
            pq.write_table(table, f)

        test_data.append(table)

    expected = pa.concat_tables(test_data)
    return expected
def test_concat_tables():
    data = [
        list(range(5)),
        [-10., -5., 0., 5., 10.]
    ]
    data2 = [
        list(range(5, 10)),
        [1., 2., 3., 4., 5.]
    ]

    t1 = pa.Table.from_arrays([pa.array(x) for x in data],
                              names=('a', 'b'))
    t2 = pa.Table.from_arrays([pa.array(x) for x in data2],
                              names=('a', 'b'))

    result = pa.concat_tables([t1, t2])
    result._validate()
    assert len(result) == 10

    expected = pa.Table.from_arrays([pa.array(x + y)
                                     for x, y in zip(data, data2)],
                                    names=('a', 'b'))

    assert result.equals(expected)
def test_column_of_lists_chunked(self):
    # ARROW-1357
    df = pd.DataFrame({
        'lists': np.array([
            [1, 2],
            None,
            [2, 3],
            [4, 5],
            [6, 7],
            [8, 9]
        ], dtype=object)
    })

    schema = pa.schema([
        pa.field('lists', pa.list_(pa.int64()))
    ])

    t1 = pa.Table.from_pandas(df[:2], schema=schema)
    t2 = pa.Table.from_pandas(df[2:], schema=schema)

    table = pa.concat_tables([t1, t2])
    result = table.to_pandas()

    tm.assert_frame_equal(result, df)
def test_read_multiple_files(tmpdir):
    nfiles = 10
    size = 5

    dirpath = tmpdir.join(guid()).strpath
    os.mkdir(dirpath)

    test_data = []
    paths = []
    for i in range(nfiles):
        df = _test_dataframe(size, seed=i)

        # Hack so that we don't have a dtype cast in v1 files
        df['uint32'] = df['uint32'].astype(np.int64)

        path = pjoin(dirpath, '{0}.parquet'.format(i))

        table = pa.Table.from_pandas(df)
        pq.write_table(table, path)

        test_data.append(table)
        paths.append(path)

    result = pq.read_multiple_files(paths)
    expected = pa.concat_tables(test_data)

    assert result.equals(expected)

    # Read with provided metadata
    metadata = pq.ParquetFile(paths[0]).metadata
    result2 = pq.read_multiple_files(paths, metadata=metadata)
    assert result2.equals(expected)

    result3 = pa.localfs.read_parquet(dirpath, schema=metadata.schema)
    assert result3.equals(expected)

    # Read column subset
    to_read = [result[0], result[3], result[6]]
    result = pa.localfs.read_parquet(
        dirpath, columns=[c.name for c in to_read])
    expected = pa.Table.from_arrays(to_read)
    assert result.equals(expected)

    # Test failure modes with non-uniform metadata
    bad_apple = _test_dataframe(size, seed=i).iloc[:, :4]
    bad_apple_path = tmpdir.join('{0}.parquet'.format(guid())).strpath

    t = pa.Table.from_pandas(bad_apple)
    pq.write_table(t, bad_apple_path)

    bad_meta = pq.ParquetFile(bad_apple_path).metadata

    with pytest.raises(ValueError):
        pq.read_multiple_files(paths + [bad_apple_path])

    with pytest.raises(ValueError):
        pq.read_multiple_files(paths, metadata=bad_meta)

    mixed_paths = [bad_apple_path, paths[0]]

    with pytest.raises(ValueError):
        pq.read_multiple_files(mixed_paths, schema=bad_meta.schema)

    with pytest.raises(ValueError):
        pq.read_multiple_files(mixed_paths)
def test_read_multiple_files(tmpdir):
    import pyarrow.parquet as pq

    nfiles = 10
    size = 5

    dirpath = tmpdir.join(guid()).strpath
    os.mkdir(dirpath)

    test_data = []
    paths = []
    for i in range(nfiles):
        df = _test_dataframe(size, seed=i)

        # Hack so that we don't have a dtype cast in v1 files
        df['uint32'] = df['uint32'].astype(np.int64)

        path = pjoin(dirpath, '{0}.parquet'.format(i))

        table = pa.Table.from_pandas(df)
        _write_table(table, path)

        test_data.append(table)
        paths.append(path)

    # Write a _SUCCESS.crc file
    with open(pjoin(dirpath, '_SUCCESS.crc'), 'wb') as f:
        f.write(b'0')

    def read_multiple_files(paths, columns=None, nthreads=None, **kwargs):
        dataset = pq.ParquetDataset(paths, **kwargs)
        return dataset.read(columns=columns, nthreads=nthreads)

    result = read_multiple_files(paths)
    expected = pa.concat_tables(test_data)

    assert result.equals(expected)

    with pytest.raises(NotImplementedError):
        pq.read_pandas(dirpath)

    # Read with provided metadata
    metadata = pq.ParquetFile(paths[0]).metadata
    result2 = read_multiple_files(paths, metadata=metadata)
    assert result2.equals(expected)

    result3 = pa.localfs.read_parquet(dirpath, schema=metadata.schema)
    assert result3.equals(expected)

    # Read column subset
    to_read = [result[0], result[2], result[6],
               result[result.num_columns - 1]]

    result = pa.localfs.read_parquet(
        dirpath, columns=[c.name for c in to_read])
    expected = pa.Table.from_arrays(to_read, metadata=result.schema.metadata)
    assert result.equals(expected)

    # Read with multiple threads
    pa.localfs.read_parquet(dirpath, nthreads=2)

    # Test failure modes with non-uniform metadata
    bad_apple = _test_dataframe(size, seed=i).iloc[:, :4]
    bad_apple_path = tmpdir.join('{0}.parquet'.format(guid())).strpath

    t = pa.Table.from_pandas(bad_apple)
    _write_table(t, bad_apple_path)

    bad_meta = pq.ParquetFile(bad_apple_path).metadata

    with pytest.raises(ValueError):
        read_multiple_files(paths + [bad_apple_path])

    with pytest.raises(ValueError):
        read_multiple_files(paths, metadata=bad_meta)

    mixed_paths = [bad_apple_path, paths[0]]

    with pytest.raises(ValueError):
        read_multiple_files(mixed_paths, schema=bad_meta.schema)

    with pytest.raises(ValueError):
        read_multiple_files(mixed_paths)