def bench_read(self, niter=5):
    cases = [
        ('parquet (UNC)', 'arrow Table',
         lambda: pq.read_table(self.parquet_unc_path, memory_map=False)),
        ('parquet (UNC)', 'pandas',
         lambda: (pq.read_table(self.parquet_unc_path, memory_map=False)
                  .to_pandas())),
        ('parquet (SNAPPY)', 'arrow Table',
         lambda: pq.read_table(self.parquet_snappy_path, memory_map=False)),
        ('parquet (SNAPPY)', 'pandas',
         lambda: (pq.read_table(self.parquet_snappy_path, memory_map=False)
                  .to_pandas())),
        ('feather V2 (UNC)', 'pandas',
         lambda: feather.read_feather(self.feather_unc_path, memory_map=False)),
        ('feather V2 (LZ4)', 'pandas',
         lambda: feather.read_feather(self.feather_lz4_path, memory_map=False)),
        ('feather V2 (ZSTD)', 'pandas',
         lambda: feather.read_feather(self.feather_zstd_path, memory_map=False)),
        ('feather V2 (UNC)', 'arrow Table',
         lambda: feather.read_table(self.feather_unc_path, memory_map=False)),
        ('feather V2 (LZ4)', 'arrow Table',
         lambda: feather.read_table(self.feather_lz4_path, memory_map=False)),
        ('feather V2 (ZSTD)', 'arrow Table',
         lambda: feather.read_table(self.feather_zstd_path, memory_map=False)),
    ]
    return self._bench_cases(cases, niter)
def test_read_table(version):
    num_values = (100, 100)
    path = random_path()
    TEST_FILES.append(path)

    values = np.random.randint(0, 100, size=num_values)
    df = pd.DataFrame(values, columns=['col_' + str(i) for i in range(100)])
    write_feather(df, path, version=version)

    data = pd.DataFrame(values, columns=['col_' + str(i) for i in range(100)])
    table = pa.Table.from_pandas(data)

    result = read_table(path)
    assert_frame_equal(table.to_pandas(), result.to_pandas())

    # Test without memory mapping
    result = read_table(path, memory_map=False)
    assert_frame_equal(table.to_pandas(), result.to_pandas())

    result = read_feather(path, memory_map=False)
    assert_frame_equal(table.to_pandas(), result)
def read_feather(path, *args, **kwargs):
    """{docstring}"""
    warnings.warn("Using CPU via PyArrow to read feather dataset, this may "
                  "be GPU accelerated in the future")
    pa_table = feather.read_table(path, *args, **kwargs)

    return DataFrame.from_arrow(pa_table)
def test_read_column_duplicated_in_file(tempdir):
    # duplicated columns in feather file (only works for feather v2)
    table = pa.table([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                     names=['a', 'b', 'a'])
    path = str(tempdir / "data.feather")
    write_feather(table, path, version=2)

    # no selection works fine
    result = read_table(path)
    assert result.equals(table)

    # selection with indices works
    result = read_table(path, columns=[0, 2])
    assert result.column_names == ['a', 'a']

    # selection with column names errors
    with pytest.raises(ValueError):
        read_table(path, columns=['a', 'b'])
def test_read_column_duplicated_selection(tempdir, version):
    # duplicated columns in the column selection
    table = pa.table([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                     names=['a', 'b', 'c'])
    path = str(tempdir / "data.feather")
    write_feather(table, path, version=version)

    for col_selection in [['a', 'b', 'a'], [0, 1, 0]]:
        result = read_table(path, columns=col_selection)
        assert result.column_names == ['a', 'b', 'a']
def table(self):
    if self._table is None:
        path = self.temp_path("feather", "lz4")
        if path.exists():
            self._table = feather.read_table(path, memory_map=False)
        else:
            self._table = pyarrow.Table.from_pandas(
                self.dataframe,
                preserve_index=False,
            ).replace_schema_metadata(None)
    return self._table
def _check_arrow_roundtrip(table, path=None, compression=None):
    if path is None:
        path = random_path()
        TEST_FILES.append(path)

    write_feather(table, path, compression=compression)
    if not os.path.exists(path):
        raise Exception('file not written')

    result = read_table(path)
    assert result.equals(table)
def read_table(self, source: tp.BinaryIO,
               schema: tp.Optional[pa.Schema]) -> pa.Table:
    try:
        columns = schema.names if schema else None
        return pa_ft.read_table(source, columns)
    except pa.ArrowInvalid as e:
        err = "Arrow file decoding failed, content is garbled"
        self._log.exception(err)
        raise _ex.EDataCorruption(err) from e
def _read_col_from_path(self, path):
    np_path = path + '.npy'
    feather_path = path + '.feather'
    if os.path.exists(np_path):
        return np.load(np_path)
    elif os.path.exists(feather_path):
        # read the single-column feather file and return its only column
        df = pf.read_table(feather_path).to_pandas()
        return df[df.columns[0]]
    return None  # neither path exists
def test_feather_v017_experimental_compression_backward_compatibility(datadir):
    # ARROW-11163 - ensure newer pyarrow versions can read the old feather
    # files from version 0.17.0 with experimental compression support (before
    # it was officially added to IPC format in 1.0.0)

    # file generated with:
    #     table = pa.table({'a': range(5)})
    #     from pyarrow import feather
    #     feather.write_feather(
    #         table, "v0.17.0.version=2-compression=lz4.feather",
    #         compression="lz4", version=2)
    expected = pa.table({'a': range(5)})
    result = read_table(datadir / "v0.17.0.version=2-compression=lz4.feather")
    assert result.equals(expected)
def _get_benchmark_function(self, source, case):
    file_type, compression, output_type = case
    path = source.create_if_not_exists(file_type, compression)

    if file_type == "parquet" and output_type == "table":
        f = lambda: parquet.read_table(path, memory_map=False)
    elif file_type == "parquet" and output_type == "dataframe":
        f = lambda: parquet.read_table(path, memory_map=False).to_pandas()
    elif file_type == "feather" and output_type == "table":
        f = lambda: feather.read_table(path, memory_map=False)
    elif file_type == "feather" and output_type == "dataframe":
        f = lambda: feather.read_feather(path, memory_map=False)
    else:
        # avoid returning an unbound name for unrecognized cases
        raise ValueError(f"unsupported benchmark case: {case}")

    return f
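# For context, a sketch (not from the source) of the kind of case tuples the
# helper above expects; the exact values are assumptions inferred from the
# `file_type, compression, output_type = case` unpacking.
example_cases = [
    ("parquet", "snappy", "table"),
    ("feather", "lz4", "dataframe"),
]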
def convert_apache_arrow_feather_to_apache_parquet(
    data_path: InputPath('ApacheArrowFeather'),
    output_data_path: OutputPath('ApacheParquet'),
):
    '''Converts Apache Arrow Feather to Apache Parquet.

    [Apache Arrow Feather](https://arrow.apache.org/docs/python/feather.html)
    [Apache Parquet](https://parquet.apache.org/)

    Annotations:
        author: Alexey Volkov <*****@*****.**>
    '''
    from pyarrow import feather, parquet

    table = feather.read_table(data_path)
    parquet.write_table(table, output_data_path)
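# Hypothetical local usage of the converter above: the InputPath/OutputPath
# annotations are inert at call time, so plain file paths work when the
# function is called directly rather than through a Kubeflow pipeline.
# Both file names are made up for illustration.
convert_apache_arrow_feather_to_apache_parquet(
    data_path="example.feather",
    output_data_path="example.parquet",
)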
def test_feather_auto_chunked(self):
    from pyarrow.feather import read_table, write_feather

    x = np.arange(2048).reshape(1024, 2)
    s = TensorArray(x)
    df = pd.DataFrame({"i": list(range(len(s))), "tensor": s})
    table = pa.Table.from_pandas(df)

    # Write table to feather and read back as a DataFrame
    with tempfile.TemporaryDirectory() as dirpath:
        filename = os.path.join(dirpath, "tensor_array_chunked_test.feather")
        write_feather(table, filename, chunksize=512)
        table = read_table(filename)
        self.assertGreaterEqual(table.column("tensor").num_chunks, 2)
        df_read = pd.read_feather(filename)
        pd.testing.assert_frame_equal(df, df_read)
def test_smry2arrow(testdata_folder: Path, tmp_path: Path) -> None:
    eclbase = (testdata_folder / "01_drogon_ahm" / "realization-0" / "iter-0" /
               "eclipse" / "model" / "DROGON-0").resolve()
    assert eclbase.with_suffix(".UNSMRY").exists()

    ert_config_file = _create_minimal_ert_config_file(
        tmp_path, f"SMRY2ARROW(<ECLBASE>={eclbase})")
    output_file = (tmp_path / "output" / "share" / "results" / "tables" /
                   "unsmry.arrow")

    subprocess.check_output(["ert", "test_run", ert_config_file], cwd=tmp_path)  # nosec

    assert output_file.exists()

    table = feather.read_table(output_file)
    assert table.shape == (243, 921)

    sample_date = table["DATE"][0]
    assert sample_date.type == pa.timestamp("ms")

    schema = table.schema

    field = schema.field("FOPT")
    field_meta = json.loads(field.metadata[b"smry_meta"])
    assert field.type == pa.float32()
    assert field_meta["unit"] == "SM3"
    assert field_meta["is_total"] is True
    assert field_meta["is_rate"] is False
    assert field_meta["is_historical"] is False

    field = schema.field("FOPR")
    field_meta = json.loads(field.metadata[b"smry_meta"])
    assert field.type == pa.float32()
    assert field_meta["unit"] == "SM3/DAY"
    assert field_meta["is_total"] is False
    assert field_meta["is_rate"] is True
    assert field_meta["is_historical"] is False

    field = schema.field("FOPTH")
    field_meta = json.loads(field.metadata[b"smry_meta"])
    assert field.type == pa.float32()
    assert field_meta["unit"] == "SM3"
    assert field_meta["is_total"] is True
    assert field_meta["is_rate"] is False
    assert field_meta["is_historical"] is True
def from_feather(self, columns):
    """
    from_feather transforms a feather file into a Time_Series_Data
    or Time_Series_Collection

    Parameters
    ----------
    columns : list of str
        column names to fetch

    Returns
    -------
    Time_Series_Data or Time_Series_Collection
    """
    table = pf.read_table(
        source=self.dirPaths,
        columns=columns
    )
    return from_arrow_table(table, self.timeSeriesCol, self.mainCategoryCol)
def test_missing_metadata(tmp_path):
    df = geopandas.read_file(
        geopandas.datasets.get_path("naturalearth_lowres"))
    path = tmp_path / "test.feather"

    # convert to DataFrame with wkb -> writing to feather will have only
    # pandas metadata
    df = df.to_wkb()
    df.to_feather(path)

    with pytest.raises(ValueError, match="Missing geo metadata"):
        dask_geopandas.read_feather(path)

    # remove metadata completely
    from pyarrow import feather
    table = feather.read_table(path)
    feather.write_feather(table.replace_schema_metadata(), path)

    with pytest.raises(ValueError, match="Missing geo metadata"):
        dask_geopandas.read_feather(path)
def test_read_table(self):
    num_values = (100, 100)
    path = random_path()
    self.test_files.append(path)

    writer = FeatherWriter()
    writer.open(path)

    values = np.random.randint(0, 100, size=num_values)
    for i in range(100):
        writer.write_array('col_' + str(i), values[:, i])

    writer.close()

    data = pd.DataFrame(values,
                        columns=['col_' + str(i) for i in range(100)])
    table = pa.Table.from_pandas(data)

    result = read_table(path)
    assert_frame_equal(table.to_pandas(), result.to_pandas())
def test_use_threads(version):
    # ARROW-14470
    num_values = (10, 10)
    path = random_path()
    TEST_FILES.append(path)

    values = np.random.randint(0, 10, size=num_values)
    columns = ['col_' + str(i) for i in range(10)]
    table = pa.Table.from_arrays(values, columns)
    write_feather(table, path, version=version)

    result = read_feather(path)
    assert_frame_equal(table.to_pandas(), result)

    # Test read_feather with use_threads=False
    result = read_feather(path, use_threads=False)
    assert_frame_equal(table.to_pandas(), result)

    # Test read_table with use_threads=False
    result = read_table(path, use_threads=False)
    assert result.equals(table)
def _from_geofeather(path, columns=None):
    """Deserialize a pandas.DataFrame stored in a feather file.

    If the corresponding .crs file is found, it is used to set the CRS of
    the GeoDataFrame.

    Parameters
    ----------
    path : str
        path to feather file to read
    columns : list-like (optional, default: None)
        Subset of columns to read from the file, must include 'geometry'.
        If not provided, all columns are read.

    Returns
    -------
    tuple of (pandas.DataFrame, dict or str)
        DataFrame will contain a "geometry" or "wkb" column with WKB-encoded
        geometry data.  crs will be a dict or str depending on what was
        serialized.
    """
    crs = None
    crsfilename = "{}.crs".format(path)
    if os.path.exists(crsfilename):
        crs = json.loads(open(crsfilename).read())
        if "wkt" in crs:
            crs = crs["wkt"]
        elif "proj4" in crs:
            crs = crs["proj4"]
    else:
        warnings.warn(
            "{} coordinate reference system file is missing. "
            "No crs will be set for this GeoDataFrame.".format(crsfilename))

    # TODO: use geopandas feather I/O instead
    return read_table(path, columns=columns).to_pandas(), crs
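# A minimal usage sketch for the helper above, assuming a hypothetical
# "data.feather" file with an optional "data.feather.crs" sidecar next to it.
df, crs = _from_geofeather("data.feather", columns=["geometry", "name"])
# crs is a dict or str if the sidecar was found, otherwise None;
# df contains the WKB-encoded geometry column.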
def convert_feather_v1_to_v2_vice_versa(
    input_ct_db_filename: str,
    output_ct_db_filename: str,
    compression: Optional[str] = "zstd",
    compression_level: int = 6,
    to_version: int = 2,
):
    """
    Convert cisTarget Feather database from Feather v1 to v2 format
    (with or without compression) and vice versa.

    :param input_ct_db_filename: input cisTarget database filename.
    :param output_ct_db_filename: output cisTarget database filename.
    :param compression: Compression method: "zstd" (default), "lz4" or
        "uncompressed".
    :param compression_level: Compression level for "zstd" or "lz4".
    :param to_version: Output Feather file format version: 1 (legacy) or
        2 (default).
    :return:
    """
    if to_version != 2 and to_version != 1:
        raise ValueError(
            "Feather file version only supports 1 (legacy) or 2 (default).")

    if to_version == 1:
        # Compression is not supported in Feather v1 format.
        compression = "uncompressed"
        compression_level = None

    if compression not in {"zstd", "lz4", "uncompressed"}:
        raise ValueError(
            f'Unsupported compression value "{compression}". '
            f'Choose "zstd" (default), "lz4" or "uncompressed".')

    # Read input cisTarget database as a pyarrow Table.
    df_pa_table = pf.read_table(source=input_ct_db_filename)

    # Get all column names.
    all_column_names = df_pa_table.column_names

    try:
        # Check if we have an old database that still used a "features"
        # column and rename it.
        features_idx = all_column_names.index("features")

        # Get column which contains motif or track names.
        motifs_or_track_names = df_pa_table.column(features_idx)

        if pc.sum(pc.starts_with(motifs_or_track_names, "jaspar")).as_py() > 0:
            # It is a motif vs genes/regions database if JASPAR motif names
            # were found in the "features" column.
            all_column_names[features_idx] = "motifs"
        else:
            all_column_names[features_idx] = "tracks"

        # Rename features column in database to "motifs" or "tracks".
        df_pa_table = df_pa_table.rename_columns(all_column_names)
    except ValueError:
        # No old database (with "features" column).
        pass

    # Get database index column ("motifs", "tracks", "regions" or "genes"
    # depending on the database type).
    for column_idx, column_name in enumerate(all_column_names):
        if column_name in {"motifs", "tracks", "regions", "genes"}:
            index_column = df_pa_table.column(column_idx)
            break

    # Sort column names (non-index columns) and add index column as last
    # column.
    column_names_sorted_and_index = sorted([
        column_name
        for column_name in all_column_names
        if column_name != index_column._name
    ])
    column_names_sorted_and_index.append(index_column._name)

    # Create a new pyarrow Table with columns in the new order.
    df_pa_table = df_pa_table.select(column_names_sorted_and_index)

    # Write cisTarget database to a new Feather file with the requested
    # compression/version settings.
    pf.write_feather(
        df=df_pa_table,
        dest=output_ct_db_filename,
        compression=compression,
        compression_level=compression_level,
        version=to_version,
    )
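# A usage sketch with hypothetical database file names: recompress an
# existing cisTarget database as Feather v2 with zstd, then write a legacy
# v1 copy (compression is forced to "uncompressed" for v1 by the function).
convert_feather_v1_to_v2_vice_versa(
    input_ct_db_filename="motifs_vs_genes.rankings.feather",
    output_ct_db_filename="motifs_vs_genes.rankings.zstd.feather",
    compression="zstd",
    compression_level=6,
    to_version=2,
)
convert_feather_v1_to_v2_vice_versa(
    input_ct_db_filename="motifs_vs_genes.rankings.feather",
    output_ct_db_filename="motifs_vs_genes.rankings.v1.feather",
    to_version=1,
)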
def test_export_feather(tmpdir_factory):
    """Test export of DataFrame to feather"""
    Settings.tidy = False
    Settings.humanize = True
    Settings.si_units = False

    # Request data.
    request = DwdObservationRequest(
        parameter=DwdObservationDataset.CLIMATE_SUMMARY,
        resolution=DwdObservationResolution.DAILY,
        start_date="2019",
        end_date="2020",
    ).filter_by_station_id(
        station_id=[1048],
    )
    df = request.values.all().df

    # Save to Feather file.
    filename = tmpdir_factory.mktemp("data").join("observations.feather")
    ExportMixin(df=df).to_target(f"file://{filename}")

    # Read back Feather file.
    table = feather.read_table(filename)

    # Validate dimensions.
    assert table.num_columns == 19
    assert table.num_rows == 366

    # Validate column names.
    assert table.column_names == [
        "station_id",
        "dataset",
        "date",
        "qn_3",
        "wind_gust_max",
        "wind_speed",
        "qn_4",
        "precipitation_height",
        "precipitation_form",
        "sunshine_duration",
        "snow_depth",
        "cloud_cover_total",
        "pressure_vapor",
        "pressure_air_site",
        "temperature_air_mean_200",
        "humidity",
        "temperature_air_max_200",
        "temperature_air_min_200",
        "temperature_air_min_005",
    ]

    # Validate content.
    data = table.to_pydict()

    assert data["date"][0] == datetime.datetime(
        2019, 1, 1, 0, 0, tzinfo=datetime.timezone.utc)
    assert data["temperature_air_min_005"][0] == 1.5

    assert data["date"][-1] == datetime.datetime(
        2020, 1, 1, 0, 0, tzinfo=datetime.timezone.utc)
    assert data["temperature_air_min_005"][-1] == -4.6

    os.unlink(filename)
def test_feather_without_pandas(tempdir, version):
    # ARROW-8345
    table = pa.table([pa.array([1, 2, 3])], names=['f0'])
    write_feather(table, str(tempdir / "data.feather"), version=version)
    result = read_table(str(tempdir / "data.feather"))
    assert result.equals(table)
def _read_col_from_path(self, path):
    df = pf.read_table(path).to_pandas()
    return df[df.columns[0]]