def create_from_arrow_unsmry_presampled(
    self,
    ens_path: str,
    rel_file_pattern: str,
    sampling_frequency: Optional[Frequency],
) -> EnsembleSummaryProvider:
    """Create EnsembleSummaryProvider from per-realization unsmry data in .arrow format.

    The `rel_file_pattern` parameter must specify a relative (per realization) file pattern
    that will be used to find the wanted .arrow files within each realization. The file
    pattern is relative to each realization's `runpath`.
    Typically the file pattern will be: "share/results/unsmry/*.arrow"

    This factory method will sample the input data according to the specified
    `sampling_frequency` during import.

    The returned summary provider does not support lazy resampling, but will always
    return data with the above specified frequency.
    """

    timer = PerfTimer()

    freq_str = sampling_frequency.value if sampling_frequency else "raw"
    hash_str = _make_hash_string(ens_path + rel_file_pattern)
    storage_key = f"arrow_unsmry_presampled_{freq_str}__{hash_str}"
    provider = ProviderImplArrowPresampled.from_backing_store(
        self._storage_dir, storage_key
    )
    if provider:
        LOGGER.info(
            f"Loaded presampled summary provider from backing store in "
            f"{timer.elapsed_s():.2f}s ("
            f"sampling_frequency={sampling_frequency}, ens_path={ens_path})"
        )
        return provider

    # We can only import data from data source if storage writes are allowed
    if not self._allow_storage_writes:
        raise ValueError(
            f"Failed to load presampled summary provider for {ens_path}"
        )

    LOGGER.info(f"Importing/saving arrow summary data for: {ens_path}")

    timer.lap_s()
    per_real_tables = load_per_realization_arrow_unsmry_files(
        ens_path, rel_file_pattern
    )
    if not per_real_tables:
        raise ValueError(
            f"Could not find any .arrow unsmry files for ens_path={ens_path}"
        )
    et_import_smry_s = timer.lap_s()

    if sampling_frequency is not None:
        for real_num, table in per_real_tables.items():
            per_real_tables[real_num] = resample_single_real_table(
                table, sampling_frequency
            )
    et_resample_s = timer.lap_s()

    ProviderImplArrowPresampled.write_backing_store_from_per_realization_tables(
        self._storage_dir, storage_key, per_real_tables
    )
    et_write_s = timer.lap_s()

    provider = ProviderImplArrowPresampled.from_backing_store(
        self._storage_dir, storage_key
    )
    if not provider:
        raise ValueError(f"Failed to load/create provider for {ens_path}")

    LOGGER.info(
        f"Saved presampled summary provider to backing store in {timer.elapsed_s():.2f}s ("
        f"import_smry={et_import_smry_s:.2f}s, "
        f"resample={et_resample_s:.2f}s, "
        f"write={et_write_s:.2f}s, "
        f"ens_path={ens_path})"
    )

    return provider
def create_from_arrow_unsmry_lazy(
    self, ens_path: str, rel_file_pattern: str
) -> EnsembleSummaryProvider:
    """Create EnsembleSummaryProvider from per-realization unsmry data in .arrow format.

    The `rel_file_pattern` parameter must specify a relative (per realization) file pattern
    that will be used to find the wanted .arrow files within each realization. The file
    pattern is relative to each realization's `runpath`.
    Typically the file pattern will be: "share/results/unsmry/*.arrow"

    The returned summary provider supports lazy resampling.
    """

    timer = PerfTimer()

    storage_key = (
        f"arrow_unsmry_lazy__{_make_hash_string(ens_path + rel_file_pattern)}"
    )
    provider = ProviderImplArrowLazy.from_backing_store(
        self._storage_dir, storage_key
    )
    if provider:
        LOGGER.info(
            f"Loaded lazy summary provider from backing store in {timer.elapsed_s():.2f}s ("
            f"ens_path={ens_path})"
        )
        return provider

    # We can only import data from data source if storage writes are allowed
    if not self._allow_storage_writes:
        raise ValueError(f"Failed to load lazy summary provider for {ens_path}")

    LOGGER.info(f"Importing/saving arrow summary data for: {ens_path}")

    timer.lap_s()
    per_real_tables = load_per_realization_arrow_unsmry_files(
        ens_path, rel_file_pattern
    )
    if not per_real_tables:
        raise ValueError(
            f"Could not find any .arrow unsmry files for ens_path={ens_path}"
        )
    et_import_smry_s = timer.lap_s()

    try:
        ProviderImplArrowLazy.write_backing_store_from_per_realization_tables(
            self._storage_dir, storage_key, per_real_tables
        )
    except ValueError as exc:
        raise ValueError(f"Failed to write backing store for: {ens_path}") from exc
    et_write_s = timer.lap_s()

    provider = ProviderImplArrowLazy.from_backing_store(
        self._storage_dir, storage_key
    )
    if not provider:
        raise ValueError(f"Failed to load/create lazy provider for {ens_path}")

    LOGGER.info(
        f"Saved lazy summary provider to backing store in {timer.elapsed_s():.2f}s ("
        f"import_smry={et_import_smry_s:.2f}s, write={et_write_s:.2f}s, ens_path={ens_path})"
    )

    return provider
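
# Usage sketch (illustrative only): how the two factory methods above might be called.
# The enclosing factory class and its constructor are not shown in this excerpt, so
# `EnsembleSummaryProviderFactory(...)`, the paths and the `Frequency.MONTHLY` value
# below are assumptions; adjust to the actual factory and enum in the codebase.
#
#     factory = EnsembleSummaryProviderFactory(
#         storage_dir=Path("./provider_storage"), allow_storage_writes=True
#     )
#
#     # Presampled provider: data is resampled once, at import time
#     presampled_provider = factory.create_from_arrow_unsmry_presampled(
#         ens_path="/scratch/my_case/realization-*/iter-0",
#         rel_file_pattern="share/results/unsmry/*.arrow",
#         sampling_frequency=Frequency.MONTHLY,
#     )
#
#     # Lazy provider: raw data is stored and resampling happens on demand
#     lazy_provider = factory.create_from_arrow_unsmry_lazy(
#         ens_path="/scratch/my_case/realization-*/iter-0",
#         rel_file_pattern="share/results/unsmry/*.arrow",
#     )
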
def write_backing_store_from_ensemble_dataframe(
    storage_dir: Path, storage_key: str, ensemble_df: pd.DataFrame
) -> None:
    @dataclass
    class Elapsed:
        convert_date_s: float = -1
        table_from_pandas_s: float = -1
        find_and_store_min_max_s: float = -1
        sorting_s: float = -1
        write_s: float = -1

    elapsed = Elapsed()

    arrow_file_name = storage_dir / (storage_key + ".arrow")
    LOGGER.debug(
        f"Writing backing store from ensemble dataframe to arrow file: {arrow_file_name}"
    )
    timer = PerfTimer()

    # Force data type in the incoming DataFrame's DATE column to datetime.datetime objects
    # This is the first step in coercing pyarrow to always store DATEs as timestamps
    ensemble_df = make_date_column_datetime_object(ensemble_df)
    elapsed.convert_date_s = timer.lap_s()

    # By default, we'll now end up with a schema that has timestamp[ns] for the DATE column
    # We therefore modify the retrieved schema and specify usage of timestamp[ms] instead
    default_schema = pa.Schema.from_pandas(ensemble_df, preserve_index=False)
    schema_to_use = _set_date_column_type_to_timestamp_ms(default_schema)

    # For experimenting with conversion to float
    # timer.lap_s()
    # schema_to_use = _create_float_downcasting_schema(schema_to_use)
    # LOGGER.info(
    #     f"Created schema for float downcasting in : {timer.lap_s():.2f}s"
    # )

    timer.lap_s()
    table = pa.Table.from_pandas(
        ensemble_df, schema=schema_to_use, preserve_index=False
    )
    elapsed.table_from_pandas_s = timer.lap_s()

    # We're done with the dataframe
    del ensemble_df

    # Find per column min/max values and then store them as metadata on table's schema
    timer.lap_ms()
    per_vector_min_max = find_min_max_for_numeric_table_columns(table)
    table = add_per_vector_min_max_to_table_schema_metadata(
        table, per_vector_min_max
    )
    elapsed.find_and_store_min_max_s = timer.lap_s()

    table = _sort_table_on_date_then_real(table)
    elapsed.sorting_s = timer.lap_s()

    # feather.write_feather(table, dest=arrow_file_name)
    feather.write_feather(table, dest=arrow_file_name, compression="uncompressed")
    elapsed.write_s = timer.lap_s()

    LOGGER.debug(
        f"Wrote backing store to arrow file in: {timer.elapsed_s():.2f}s ("
        f"convert_date={elapsed.convert_date_s:.2f}s, "
        f"table_from_pandas={elapsed.table_from_pandas_s:.2f}s, "
        f"find_and_store_min_max={elapsed.find_and_store_min_max_s:.2f}s, "
        f"sorting={elapsed.sorting_s:.2f}s, "
        f"write={elapsed.write_s:.2f}s)"
    )
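
# Sketch (assumption): `_set_date_column_type_to_timestamp_ms` is referenced above but
# not part of this excerpt. A minimal version of the schema tweak described in the
# comments could look like this, using only documented pyarrow Schema methods:
#
#     def _set_date_column_type_to_timestamp_ms(schema: pa.Schema) -> pa.Schema:
#         idx = schema.get_field_index("DATE")
#         return schema.set(idx, pa.field("DATE", pa.timestamp("ms")))
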
def write_backing_store_from_per_realization_tables(
    storage_dir: Path, storage_key: str, per_real_tables: Dict[int, pa.Table]
) -> None:
    @dataclass
    class Elapsed:
        concat_tables_s: float = -1
        build_add_real_col_s: float = -1
        sorting_s: float = -1
        find_and_store_min_max_s: float = -1
        write_s: float = -1

    elapsed = Elapsed()

    arrow_file_name = storage_dir / (storage_key + ".arrow")
    LOGGER.debug(
        f"Writing backing store from per real tables to arrow file: {arrow_file_name}"
    )
    timer = PerfTimer()

    unique_column_names = set()
    for table in per_real_tables.values():
        unique_column_names.update(table.schema.names)
    LOGGER.debug(
        f"Concatenating {len(per_real_tables)} tables with "
        f"{len(unique_column_names)} unique column names"
    )

    timer.lap_s()
    full_table = pa.concat_tables(per_real_tables.values(), promote=True)
    elapsed.concat_tables_s = timer.lap_s()

    real_arr = np.empty(full_table.num_rows, np.int32)
    table_start_idx = 0
    for real_num, real_table in per_real_tables.items():
        real_arr[table_start_idx : table_start_idx + real_table.num_rows] = real_num
        table_start_idx += real_table.num_rows

    full_table = full_table.add_column(0, "REAL", pa.array(real_arr))
    elapsed.build_add_real_col_s = timer.lap_s()

    # Find per column min/max values and then store them as metadata on table's schema
    per_vector_min_max = find_min_max_for_numeric_table_columns(full_table)
    full_table = add_per_vector_min_max_to_table_schema_metadata(
        full_table, per_vector_min_max
    )
    elapsed.find_and_store_min_max_s = timer.lap_s()

    full_table = _sort_table_on_date_then_real(full_table)
    elapsed.sorting_s = timer.lap_s()

    # feather.write_feather(full_table, dest=arrow_file_name)
    feather.write_feather(
        full_table, dest=arrow_file_name, compression="uncompressed"
    )
    elapsed.write_s = timer.lap_s()

    LOGGER.debug(
        f"Wrote backing store to arrow file in: {timer.elapsed_s():.2f}s ("
        f"concat_tables={elapsed.concat_tables_s:.2f}s, "
        f"build_add_real_col={elapsed.build_add_real_col_s:.2f}s, "
        f"sorting={elapsed.sorting_s:.2f}s, "
        f"find_and_store_min_max={elapsed.find_and_store_min_max_s:.2f}s, "
        f"write={elapsed.write_s:.2f}s)"
    )
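
# Note on the concat step above (sketch, assuming the pyarrow versions where
# concat_tables still accepts promote=True): realization tables that lack some
# columns are still concatenated; the missing columns are filled with nulls.
#
#     t0 = pa.table({"DATE": pa.array([0, 1], pa.timestamp("ms")), "FOPR": [1.0, 2.0]})
#     t1 = pa.table({"DATE": pa.array([0, 1], pa.timestamp("ms")), "WOPR:A-1": [0.5, 0.7]})
#     combined = pa.concat_tables([t0, t1], promote=True)
#     # combined has columns DATE, FOPR, WOPR:A-1; FOPR is null for rows from t1
#     # and WOPR:A-1 is null for rows from t0.
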
def write_backing_store_from_per_realization_tables(
    storage_dir: Path, storage_key: str, per_real_tables: Dict[int, pa.Table]
) -> None:
    # pylint: disable=too-many-locals
    @dataclass
    class Elapsed:
        concat_tables_s: float = -1
        build_add_real_col_s: float = -1
        sorting_s: float = -1
        find_and_store_min_max_s: float = -1
        write_s: float = -1

    elapsed = Elapsed()

    arrow_file_name = storage_dir / (storage_key + ".arrow")
    LOGGER.debug(f"Writing backing store to arrow file: {arrow_file_name}")
    timer = PerfTimer()

    unique_column_names = set()
    for table in per_real_tables.values():
        unique_column_names.update(table.schema.names)

        if "REAL" in table.schema.names:
            raise ValueError("Input tables should not have REAL column")
        if table.schema.field("DATE").type != pa.timestamp("ms"):
            raise ValueError("DATE column must have timestamp[ms] data type")
        if not _is_date_column_sorted(table):
            raise ValueError("DATE column must be sorted")

    LOGGER.debug(
        f"Concatenating {len(per_real_tables)} tables with "
        f"{len(unique_column_names)} unique column names"
    )

    full_table = pa.concat_tables(per_real_tables.values(), promote=True)
    elapsed.concat_tables_s = timer.lap_s()

    real_arr = np.empty(full_table.num_rows, np.int32)
    table_start_idx = 0
    for real_num, real_table in per_real_tables.items():
        real_arr[table_start_idx : table_start_idx + real_table.num_rows] = real_num
        table_start_idx += real_table.num_rows

    full_table = full_table.add_column(0, "REAL", pa.array(real_arr))
    elapsed.build_add_real_col_s = timer.lap_s()

    # Must sort table on real since interpolations work per realization
    # and we utilize slicing for speed
    full_table = _sort_table_on_real_then_date(full_table)
    elapsed.sorting_s = timer.lap_s()

    # Find per column min/max values and store them as metadata on table's schema
    per_vector_min_max = find_min_max_for_numeric_table_columns(full_table)
    full_table = add_per_vector_min_max_to_table_schema_metadata(
        full_table, per_vector_min_max
    )
    elapsed.find_and_store_min_max_s = timer.lap_s()

    # feather.write_feather(full_table, dest=arrow_file_name)
    with pa.OSFile(str(arrow_file_name), "wb") as sink:
        with pa.RecordBatchFileWriter(sink, full_table.schema) as writer:
            writer.write_table(full_table)
    elapsed.write_s = timer.lap_s()

    LOGGER.debug(
        f"Wrote backing store to arrow file in: {timer.elapsed_s():.2f}s ("
        f"concat_tables={elapsed.concat_tables_s:.2f}s, "
        f"build_add_real_col={elapsed.build_add_real_col_s:.2f}s, "
        f"sorting={elapsed.sorting_s:.2f}s, "
        f"find_and_store_min_max={elapsed.find_and_store_min_max_s:.2f}s, "
        f"write={elapsed.write_s:.2f}s)"
    )
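
# Read-back sketch (assumption about intent): writing the table as an uncompressed
# Arrow IPC file via RecordBatchFileWriter allows the backing store to be
# memory-mapped later, so column data can be accessed without reading the whole
# file into memory. A minimal read of such a file could look like this:
#
#     with pa.memory_map(str(arrow_file_name), "r") as source:
#         reader = pa.ipc.RecordBatchFileReader(source)
#         table = reader.read_all()  # zero-copy view over the mapped file
#         dates = table.column("DATE")
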