def run(self, pipeline: Pipeline, catalog: DataCatalog) -> Dict[str, Any]:
    """Run the ``Pipeline`` using the ``DataSet``s provided by ``catalog``
    and save results back to the same objects.

    Args:
        pipeline: The ``Pipeline`` to run.
        catalog: The ``DataCatalog`` from which to fetch data.

    Raises:
        ValueError: Raised when ``Pipeline`` inputs cannot be satisfied.

    Returns:
        Any node outputs that cannot be processed by the ``DataCatalog``.
        These are returned in a dictionary, where the keys are defined
        by the node outputs.

    """
    catalog = catalog.shallow_copy()

    unsatisfied = pipeline.inputs() - set(catalog.list())
    if unsatisfied:
        raise ValueError(
            "Pipeline input(s) {} not found in the DataCatalog".format(unsatisfied)
        )

    free_outputs = pipeline.outputs() - set(catalog.list())
    unregistered_ds = pipeline.data_sets() - set(catalog.list())
    for ds_name in unregistered_ds:
        catalog.add(ds_name, self.create_default_data_set(ds_name))

    self._run(pipeline, catalog)

    self._logger.info("Pipeline execution completed successfully.")

    return {ds_name: catalog.load(ds_name) for ds_name in free_outputs}
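For context, this method is typically exercised through a concrete runner; a minimal sketch, assuming a Kedro version that still ships MemoryDataSet, in which the unregistered output "total" comes back as a free output:

from kedro.io import DataCatalog, MemoryDataSet
from kedro.pipeline import Pipeline, node
from kedro.runner import SequentialRunner

def total(xs):
    return sum(xs)

# "xs" is registered up front; "total" is not, so the runner creates a
# default dataset for it and returns it in the result dictionary.
catalog = DataCatalog({"xs": MemoryDataSet([1, 2, 3])})
pipeline = Pipeline([node(total, "xs", "total")])

print(SequentialRunner().run(pipeline, catalog))  # {'total': 6}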
def _run_node_sequential(node: Node, catalog: DataCatalog, run_id: str = None) -> Node:
    inputs = {}
    hook_manager = get_hook_manager()

    for name in node.inputs:
        hook_manager.hook.before_dataset_loaded(  # pylint: disable=no-member
            dataset_name=name
        )
        inputs[name] = catalog.load(name)
        hook_manager.hook.after_dataset_loaded(  # pylint: disable=no-member
            dataset_name=name, data=inputs[name]
        )

    is_async = False

    additional_inputs = _collect_inputs_from_hook(
        node, catalog, inputs, is_async, run_id=run_id
    )
    inputs.update(additional_inputs)

    outputs = _call_node_run(node, catalog, inputs, is_async, run_id=run_id)

    for name, data in outputs.items():
        hook_manager.hook.before_dataset_saved(  # pylint: disable=no-member
            dataset_name=name, data=data
        )
        catalog.save(name, data)
        hook_manager.hook.after_dataset_saved(  # pylint: disable=no-member
            dataset_name=name, data=data
        )
    return node
def _run_node_sequential(node: Node, catalog: DataCatalog, run_id: str = None) -> Node:
    inputs = {name: catalog.load(name) for name in node.inputs}
    hook_manager = get_hook_manager()
    is_async = False
    hook_manager.hook.before_node_run(  # pylint: disable=no-member
        node=node, catalog=catalog, inputs=inputs, is_async=is_async, run_id=run_id
    )
    try:
        outputs = node.run(inputs)
    except Exception as exc:
        hook_manager.hook.on_node_error(  # pylint: disable=no-member
            error=exc,
            node=node,
            catalog=catalog,
            inputs=inputs,
            is_async=is_async,
            run_id=run_id,
        )
        raise exc
    hook_manager.hook.after_node_run(  # pylint: disable=no-member
        node=node,
        catalog=catalog,
        inputs=inputs,
        outputs=outputs,
        is_async=is_async,
        run_id=run_id,
    )
    for name, data in outputs.items():
        catalog.save(name, data)
    return node
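The hook calls in the two runner variants above dispatch to user-registered implementations. A minimal sketch of such a hook class, assuming the pluggy-based hook API introduced around kedro 0.16 (implementations may accept any subset of the spec's arguments; registration details vary by kedro version):

from kedro.framework.hooks import hook_impl

class LoggingHooks:
    """Implements a subset of the hook specs invoked by the runners above."""

    @hook_impl
    def after_dataset_loaded(self, dataset_name, data):
        print(f"Loaded {dataset_name} ({type(data).__name__})")

    @hook_impl
    def before_node_run(self, node, inputs):
        print(f"Running {node.name} with inputs {sorted(inputs)}")

    @hook_impl
    def after_node_run(self, node, outputs):
        print(f"{node.name} produced {sorted(outputs)}")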
def after_pipeline_run(
    self,
    run_params: Dict[str, Any],
    pipeline: Pipeline,
    catalog: DataCatalog,
) -> None:
    """Hook to be invoked after a pipeline runs.

    Args:
        run_params: The params needed for the given run.
            Should be identical to the data logged by Journal.
            # @fixme: this needs to be modelled explicitly as code, instead of comment
            Schema: {
                "project_path": str,
                "env": str,
                "kedro_version": str,
                "tags": Optional[List[str]],
                "from_nodes": Optional[List[str]],
                "to_nodes": Optional[List[str]],
                "node_names": Optional[List[str]],
                "from_inputs": Optional[List[str]],
                "load_versions": Optional[List[str]],
                "pipeline_name": str,
                "extra_params": Optional[Dict[str, Any]],
            }
        pipeline: The ``Pipeline`` that was run.
        catalog: The ``DataCatalog`` used during the run.
    """
    if self._is_mlflow_enabled:
        if isinstance(pipeline, PipelineML):
            with TemporaryDirectory() as tmp_dir:
                # This will be removed at the end of the context manager,
                # but we need to log in mlflow before moving the folder
                kedro_pipeline_model = KedroPipelineModel(
                    pipeline=pipeline.inference,
                    catalog=catalog,
                    input_name=pipeline.input_name,
                    **pipeline.kpm_kwargs,
                )
                artifacts = kedro_pipeline_model.extract_pipeline_artifacts(
                    parameters_saving_folder=Path(tmp_dir)
                )

                log_model_kwargs = pipeline.log_model_kwargs.copy()
                model_signature = log_model_kwargs.pop("signature", None)
                if isinstance(model_signature, str):
                    if model_signature == "auto":
                        input_data = catalog.load(pipeline.input_name)
                        model_signature = infer_signature(model_input=input_data)

                mlflow.pyfunc.log_model(
                    python_model=kedro_pipeline_model,
                    artifacts=artifacts,
                    signature=model_signature,
                    **log_model_kwargs,
                )
        # Close the mlflow active run at the end of the pipeline
        # to avoid interactions with further runs
        mlflow.end_run()
    else:
        switch_catalog_logging(catalog, True)
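A standalone sketch of the "auto" signature branch above, assuming only that mlflow is installed; infer_signature derives a model signature from sample input data (the column names here are illustrative):

import pandas as pd
from mlflow.models.signature import infer_signature

sample_input = pd.DataFrame({"feature_a": [1.0, 2.0], "feature_b": [0.1, 0.2]})
signature = infer_signature(model_input=sample_input)
print(signature)  # prints the inferred input schema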
def test_add_save_and_load(self, data_set, dummy_dataframe):
    """Test adding and then saving and reloading the data set"""
    catalog = DataCatalog(data_sets={})
    catalog.add("test", data_set)
    catalog.save("test", dummy_dataframe)
    reloaded_df = catalog.load("test")
    assert_frame_equal(reloaded_df, dummy_dataframe)
def test_all_before_adding(self, fake_data_set, fake_transformer):
    catalog = DataCatalog()
    catalog.add_transformer(fake_transformer)
    catalog.add("test", fake_data_set)

    catalog.save("test", 42)
    assert catalog.load("test") == 44
    assert fake_data_set.log == [("save", 43), ("load", 43)]
    assert fake_transformer.log == [("save", 42), ("load", 43)]
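A sketch of the kind of transformer this test exercises, assuming the legacy AbstractTransformer API from kedro.io (removed in later kedro releases); it shifts the value by one on both save and load, which is why 42 is stored as 43 and read back as 44:

from kedro.io import AbstractTransformer

class PlusOne(AbstractTransformer):
    # Intercepts saves: the underlying dataset receives data + 1.
    def save(self, data_set_name, save, data):
        save(data + 1)

    # Intercepts loads: the caller receives the stored value + 1.
    def load(self, data_set_name, load):
        return load() + 1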
def after_catalog_created(self, catalog: DataCatalog) -> None:
    """Gets params, initializes the logger, and logs params."""
    logger.info("Initializing mlflow logger")
    logger.info(__package__)
    params = catalog.load("parameters")
    self.config = MLFlowLoggerConfig(**params)
    if self.config.enabled:
        self._log_params()
        self._set_tags()
def run_node(node: Node, catalog: DataCatalog) -> Node:
    """Run a single ``Node`` with inputs from and outputs to the ``catalog``.

    Args:
        node: The ``Node`` to run.
        catalog: A ``DataCatalog`` containing the node's inputs and outputs.

    Returns:
        The node argument.

    """
    inputs = {name: catalog.load(name) for name in node.inputs}
    outputs = node.run(inputs)
    for name, data in outputs.items():
        catalog.save(name, data)
    return node
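A minimal usage sketch for run_node, assuming a kedro version with MemoryDataSet; the output dataset must be registered before the node saves to it:

from kedro.io import DataCatalog, MemoryDataSet
from kedro.pipeline import node

def add(a, b):
    return a + b

catalog = DataCatalog({"a": MemoryDataSet(2), "b": MemoryDataSet(3)})
catalog.add("sum", MemoryDataSet())  # register the output ahead of time

run_node(node(add, ["a", "b"], "sum"), catalog)
print(catalog.load("sum"))  # 5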
def validate(parameters: Dict):
    # def validate(dummy2, parameters: Dict):
    import glob, os

    os.chdir(parameters["path_model_input"])
    files = []
    for file in glob.glob("*.csv"):
        files.append(file)
    filenames = []
    wells_data = []
    for file in files:
        filename, extension = file.split('.')
        filenames.append(filename)
    for file, filename in zip(files, filenames):
        io = DataCatalog({
            filename:
            CSVLocalDataSet(filepath=parameters["path_model_input"] + "/" + file),
        })
        well_data = io.load(filename)
        wells_data.append(well_data)
    Raw_Data_preprocessed = []
    for well, file, filename in zip(wells_data, files, filenames):
        well = well[[
            'Date', 'Injector Bottom Hole Pressure', 'Steam Flow Rate - Outer',
            'Bottom Hole Heel Temperature', 'Emulsion Pressure',
            'Producer Bottom Hole Pressure', 'ESP Speed', 'Emulsion Flow Rate'
        ]]
        for i in range(1, len(well.columns)):
            well[well.columns[i]] = pd.to_numeric(well[well.columns[i]],
                                                  errors='coerce')
        well = well.iloc[:1399]
        well = well.fillna(well.rolling(30, min_periods=1).median())
        well = well.fillna(well.median())
        well['Date'] = pd.to_datetime(well['Date'])
        well = well.set_index('Date')
        Raw_Data_preprocessed.append(well)
    os.chdir(parameters["path_val_stats"])
    static_files = []
    for static_file in glob.glob("*.csv"):
        static_files.append(static_file)
    static_filenames = []
    statics_data = []
    for static_file in static_files:
        static_filename, others = static_file.split('_')
        static_filenames.append(static_filename)
    for static_file, static_filename in zip(static_files, static_filenames):
        io = DataCatalog({
            static_filename:
            CSVLocalDataSet(filepath=parameters["path_val_stats"] + "/" +
                            static_file),
        })
        static_data = io.load(static_filename)
        statics_data.append(static_data)
    statics_data_new = []
    well_name_list = []
    for pad_static in statics_data:
        well_name = pad_static['WELLPAIR_NAME'].values
        well_name_list.append(well_name)
        pad_static = pad_static.set_index('WELLPAIR_NAME')
        pad_static = pad_static.drop(columns=['PLAN_NAME', 'HIGH_PRESSURE'])
        statics_data_new.append(pad_static)
    properties = []
    probabilities = []
    asset_names = []
    for pad_static, names in zip(statics_data_new, well_name_list):
        for well in names:
            prob = pad_static.loc[well, 'Forecast_Prob']
            probabilities.append(prob)
            pad_code = pad_static.loc[well, 'PAD_CODE']
            asset_name, pad = pad_code.split('_')
            asset_names.append(asset_name)
            property_ = pad_static.loc[
                well, 'SAGD_PRESSURE':'BOTTOM_WATER_THICKNESS'].values
            properties.append(property_)
    properties = np.array(properties)
    all_wells_input = []
    all_wells_labels = []
    for well_data, file in zip(Raw_Data_preprocessed, files):
        DWT_Aprox_coeff_input = []
        input_data, labels = split(well_data)
        input_columns = list(input_data.columns)
        for data_idx in input_columns:
            signal = well_data[data_idx].values
            thresh = parameters["thresh"] * np.nanmax(signal)
            coeff = pywt.wavedec(signal,
                                 wavelet=parameters["wavelet"],
                                 mode=parameters["mode1"],
                                 level=parameters["level"])
            coeff[1:] = (pywt.threshold(i,
                                        value=thresh,
                                        mode=str(parameters["mode2"]))
                         for i in coeff[1:])
            DWT_Aprox_coeff_input.append(coeff[0])
        DWT_Aprox_coeff_input = pd.DataFrame(
            np.transpose(DWT_Aprox_coeff_input), columns=input_columns)
        data_set_input = CSVLocalDataSet(
            filepath=parameters["path_val_pre_processed"] +
            "/validation_input_DWT_coeffs_" + file)
        data_set_input.save(DWT_Aprox_coeff_input)
        all_wells_input.append(DWT_Aprox_coeff_input.values)
        data_set_labels = CSVLocalDataSet(
            filepath=parameters["path_val_pre_processed"] +
            "/validation_labels_" + file)
        data_set_labels.save(labels)
        all_wells_labels.append(labels.values)

    # Standardize dynamic data coeffs
    data_set_scaler_coeffs = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/scaler_coeffs.pickle")
    scaler_coeffs = data_set_scaler_coeffs.load()
    all_wells_standardized_input = []
    all_wells_standardized_input_flattened = []
    for well_coeffs in all_wells_input:
        std_coeffs = scaler_coeffs.transform(well_coeffs)
        all_wells_standardized_input.append(std_coeffs)
        transposed_std_coeffs = np.transpose(std_coeffs)
        flattened_std_coeffs = transposed_std_coeffs.flatten()
        all_wells_standardized_input_flattened.append(flattened_std_coeffs)
    all_wells_standardized_input = np.array(all_wells_standardized_input)
    all_wells_standardized_input_flattened = np.array(
        all_wells_standardized_input_flattened)
    input_columns = list(DWT_Aprox_coeff_input.columns)
    for std_coeffs, file in zip(all_wells_standardized_input, files):
        std_coeffs = pd.DataFrame(std_coeffs, columns=input_columns)
        data_set = CSVLocalDataSet(
            filepath=parameters["path_val_pre_processed"] +
            "/validation_std_DWT_input_coeffs_" + file)
        data_set.save(std_coeffs)

    # Standardize static data
    data_set_scaler_static = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/scaler_static.pickle")
    scaler_static = data_set_scaler_static.load()
    all_wells_standardized_properties = scaler_static.fit_transform(properties)
    all_wells_coeffs_reservoir_data = []
    for flattened_std_coeffs, standardized_properties in zip(
            all_wells_standardized_input_flattened,
            all_wells_standardized_properties):
        flattened_std_coeffs = list(flattened_std_coeffs)
        standardized_properties = list(standardized_properties)
        for reservoir_property in standardized_properties:
            flattened_std_coeffs.append(
                reservoir_property
            )  # append reservoir data to dynamic data coeffs
        all_wells_coeffs_reservoir_data.append(flattened_std_coeffs)
    all_wells_coeffs_reservoir_data = np.array(all_wells_coeffs_reservoir_data)
    well_count = np.arange(len(all_wells_coeffs_reservoir_data))
    daily_timesteps = np.arange(len(all_wells_labels[0]))
    input_data = []
    for coeff_inputs in all_wells_coeffs_reservoir_data:
        for time_lapse in daily_timesteps:
            well_inputs = [time_lapse] + list(
                coeff_inputs)  # append time lapse to input data
            input_data.append(well_inputs)
    input_data = np.array(input_data)
    data_set_regressor_1 = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/regressor_1.pickle")
    regressor_1 = data_set_regressor_1.load()
    number_of_wells = len(well_count)
    wells_steam_rate_predicted = regressor_1.predict(input_data)
    wells_steam_rate_predicted = wells_steam_rate_predicted.reshape(
        (number_of_wells, 1399)).T

    # prediction inputs to model 2
    input_data_model_2 = []
    for coeff_inputs, well in zip(all_wells_coeffs_reservoir_data, well_count):
        for time_lapse in daily_timesteps:
            well_inputs = [time_lapse] + list(
                coeff_inputs)  # append time lapse to input data
            well_inputs_model_2 = [
                wells_steam_rate_predicted[time_lapse, well]
            ] + well_inputs
            input_data_model_2.append(well_inputs_model_2)
    input_data_model_2 = np.array(input_data_model_2)
    data_set_regressor_2 = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/regressor_2.pickle")
    regressor_2 = data_set_regressor_2.load()
    wells_emulsion_rate_predicted = regressor_2.predict(input_data_model_2)
    wells_emulsion_rate_predicted = wells_emulsion_rate_predicted.reshape(
        (number_of_wells, 1399)).T

    # actual targets
    all_wells_steam_data = []
    all_wells_emulsion_data = []
    for ID in well_count:
        well_steam_data = all_wells_labels[ID][:, 0]
        well_emulsion_data = all_wells_labels[ID][:, 1]
        all_wells_steam_data = all_wells_steam_data + list(well_steam_data)
        all_wells_emulsion_data = all_wells_emulsion_data + list(
            well_emulsion_data)
    all_wells_steam_data = np.array(all_wells_steam_data)
    all_wells_emulsion_data = np.array(all_wells_emulsion_data)
    wells_steam_rate_actual = all_wells_steam_data.reshape(
        (number_of_wells, 1399)).T
    wells_emulsion_rate_actual = all_wells_emulsion_data.reshape(
        (number_of_wells, 1399)).T
    print("Prediction Performance:\n")
    print("Steam Flow Rate:")
    for well, file in zip(well_count, files):
        steam_rate_predicted = wells_steam_rate_predicted[:, well]
        steam_rate_actual = wells_steam_rate_actual[:, well]
        steam_rate_actual_predicted = pd.DataFrame(
            np.vstack((steam_rate_actual, steam_rate_predicted)).T,
            columns=["steam rate actual", "steam rate predicted"])
        data_set_steam_rate = CSVLocalDataSet(
            filepath=parameters["path_model_output"] + "/steam_rate_" + file)
        data_set_steam_rate.save(steam_rate_actual_predicted)
        print(file + " R_squared: {0:.4f}".format(
            r2_score(steam_rate_actual, steam_rate_predicted)))
    print("\n")
    print("Emulsion Flow Rate:")
    for well, file in zip(well_count, files):
        emulsion_rate_predicted = wells_emulsion_rate_predicted[:, well]
        emulsion_rate_actual = wells_emulsion_rate_actual[:, well]
        emulsion_rate_actual_predicted = pd.DataFrame(
            np.vstack((emulsion_rate_actual, emulsion_rate_predicted)).T,
            columns=["emulsion rate actual", "emulsion rate predicted"])
        data_set_emulsion_rate = CSVLocalDataSet(
            filepath=parameters["path_model_output"] + "/emulsion_rate_" + file)
        data_set_emulsion_rate.save(emulsion_rate_actual_predicted)
        print(file + " R_squared: {0:.4f}".format(
            r2_score(emulsion_rate_actual, emulsion_rate_predicted)))
    dummy_validate = files
    return dummy_validate
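The core of the feature extraction in validate() is the discrete wavelet step. A standalone sketch with hypothetical parameter values (the wavelet, mode, and level below are illustrative, not the project's actual configuration), assuming PyWavelets is installed:

import numpy as np
import pywt

signal = np.random.rand(1399)  # stand-in for one well's sensor series
thresh = 0.1 * np.nanmax(signal)

# Multilevel DWT: coeff[0] holds approximation coefficients, the rest detail.
coeff = pywt.wavedec(signal, wavelet="db4", mode="symmetric", level=4)
# Threshold the detail coefficients to denoise, as validate() does.
coeff[1:] = [pywt.threshold(c, value=thresh, mode="soft") for c in coeff[1:]]
approx = coeff[0]  # kept as the model input features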
def test_load_from_unregistered(self):
    """Check the error when attempting to load unregistered data set"""
    catalog = DataCatalog(data_sets={})
    pattern = r"DataSet 'test' not found in the catalog"
    with pytest.raises(DataSetNotFoundError, match=pattern):
        catalog.load("test")
def preprocess_raw_data(parameters: Dict):
    import glob, os

    # os.chdir(parameters["path_raw"])
    os.chdir(parameters["path_raw_matlab"])
    files = []
    for file in glob.glob("*.csv"):
        files.append(file)
    filenames = []
    wells_data = []
    for file in files:
        filename, extension = file.split('.')
        filenames.append(filename)
    for file, filename in zip(files, filenames):
        io = DataCatalog({
            # filename: CSVLocalDataSet(filepath=parameters["path_raw"] + "/" + file),
            filename:
            CSVLocalDataSet(filepath=parameters["path_raw_matlab"] + "/" + file),
        })
        well_data = io.load(filename)
        wells_data.append(well_data)
    Raw_Data_preprocessed = []
    Raw_Data_dated = []
    wells_life = []
    wells_data_ = []
    for well in wells_data:
        # well = well[['Date', 'Injector Bottom Hole Pressure',
        #              'Producer Bottom Hole Pressure', 'ESP Speed',
        #              'Steam Flow Rate - Outer',
        #              'Emulsion Flow Rate']]
        well = well[[
            'Date', 'Speed [Hz]', 'Current [A]', 'IBHP', 'PBHP',
            'Co-Injection [E3m3/d]', 'Oil [bbl/d]', 'Steam [m3/d]',
            'Emulsion [m3/d]'
        ]]
        for i in range(1, len(well.columns)):
            well[well.columns[i]] = pd.to_numeric(well[well.columns[i]],
                                                  errors='coerce')
        well['Prod_Date'] = pd.to_datetime(well['Date'])
        well = well.set_index('Prod_Date')
        # well = well.resample('7D').mean()   # weekly data
        # well = well.resample('30D').mean()  # monthly data
        # well = well.rolling(30, min_periods=1).mean()
        data = well['Oil [bbl/d]'] / 6.28981
        well.insert(4, 'Oil [m3/d]', data)
        time_data = np.arange(len(well))
        well.insert(0, 'Timestep', time_data)
        wells_life.append(len(well))
        wells_data_.append(well)
    min_well_length = np.min(np.array(wells_life))
    print((min_well_length, np.argmin(np.array(wells_life))))
    timesteps = min_well_length  # 1008
    # timesteps = 371
    for well, file, filename in zip(wells_data_, files, filenames):
        well = well.iloc[:timesteps]  # daily, weekly, monthly data
        # well = well.fillna(0)
        # well = well.fillna(well.rolling(30, min_periods=1).median())
        # well = well.fillna(well.median())
        well["Well"] = filename  # create a column for well name
        well = well.reset_index(drop=True)  # remove date index
        data_set = CSVLocalDataSet(filepath=parameters["path_intermediate"] +
                                   "/pre_processed_data_" + file)
        data_set.save(well)
        Raw_Data_dated.append(well)
        well = well.drop(columns=['Date', 'Well'])
        Raw_Data_preprocessed.append(well)

    # stats_training = CSVLocalDataSet(filepath=parameters["path_raw_static"] +
    #                                  "/static_P50_data_training.csv")
    stats_training = CSVLocalDataSet(
        filepath=parameters["path_raw_static_matlab"] +
        "/static_P50_data_training_152_wells.csv")
    stats = stats_training.load()
    stats_ROIP = stats.loc[:, 'ROIP']
    stats = stats.loc[:, 'Effective_Length':'BottomWater_Oil_Saturation']
    # # using only rich geostats and no bottom water properties
    # stats = stats.loc[:, 'Effective_Length':'Rich_Oil_Saturation']
    # # Using "Effective_Rich_Pay_Thickness" to account for standoff and rich thickness
    # data = stats['Rich_Pay_Thickness'] - stats['Stand_Off']
    # stats.insert(3, 'Effective_Rich_Pay_Thickness', data)
    # stats = stats.drop(columns=['Rich_Pay_Thickness', 'Stand_Off'])
    property_names = list(stats.columns)
    properties = list(stats.values)
    properties = np.array(properties)
    return [
        timesteps, Raw_Data_preprocessed, Raw_Data_dated, files, filenames,
        properties, stats_ROIP, property_names
    ]
def preprocess_raw_data(parameters: Dict):
    import glob, os

    os.chdir(parameters["path_raw"])
    files = []
    for file in glob.glob("*.csv"):
        files.append(file)
    filenames = []
    wells_data = []
    for file in files:
        filename, extension = file.split('.')
        filenames.append(filename)
    for file, filename in zip(files, filenames):
        io = DataCatalog({
            filename:
            CSVLocalDataSet(filepath=parameters["path_raw"] + "/" + file),
        })
        well_data = io.load(filename)
        wells_data.append(well_data)
    Raw_Data_preprocessed = []
    Raw_Data_dated = []
    for well, file, filename in zip(wells_data, files, filenames):
        well = well[[
            'Date', 'Injector Bottom Hole Pressure', 'Steam Flow Rate - Outer',
            'Bottom Hole Heel Temperature', 'Emulsion Pressure',
            'Producer Bottom Hole Pressure', 'ESP Speed', 'Emulsion Flow Rate'
        ]]
        for i in range(1, len(well.columns)):
            well[well.columns[i]] = pd.to_numeric(well[well.columns[i]],
                                                  errors='coerce')
        well = well.iloc[:1399]
        well = well.fillna(well.rolling(30, min_periods=1).median())
        well = well.fillna(well.median())
        well_dated = well.copy()
        well_dated["Well"] = filename  # create a column for well name
        data_set = CSVLocalDataSet(filepath=parameters["path_intermediate"] +
                                   "/pre_processed_data_" + file)
        data_set.save(well_dated)
        Raw_Data_dated.append(well_dated)
        well['Date'] = pd.to_datetime(well['Date'])
        well = well.set_index('Date')
        Raw_Data_preprocessed.append(well)
    os.chdir(parameters["path_raw_static"])
    static_files = []
    for static_file in glob.glob("*.csv"):
        static_files.append(static_file)
    static_filenames = []
    statics_data = []
    for static_file in static_files:
        static_filename, others = static_file.split('_')
        static_filenames.append(static_filename)
    for static_file, static_filename in zip(static_files, static_filenames):
        io = DataCatalog({
            static_filename:
            CSVLocalDataSet(filepath=parameters["path_raw_static"] + "/" +
                            static_file),
        })
        static_data = io.load(static_filename)
        statics_data.append(static_data)
    statics_data_new = []
    well_name_list = []
    for pad_static in statics_data:
        well_name = pad_static['WELLPAIR_NAME'].values
        well_name_list.append(well_name)
        pad_static = pad_static.set_index('WELLPAIR_NAME')
        pad_static = pad_static.drop(columns=['PLAN_NAME', 'HIGH_PRESSURE'])
        statics_data_new.append(pad_static)
    properties = []
    probabilities = []
    asset_names = []
    for pad_static, names in zip(statics_data_new, well_name_list):
        for well in names:
            prob = pad_static.loc[well, 'Forecast_Prob']
            probabilities.append(prob)
            pad_code = pad_static.loc[well, 'PAD_CODE']
            asset_name, pad = pad_code.split('_')
            asset_names.append(asset_name)
            property_ = pad_static.loc[
                well, 'SAGD_PRESSURE':'BOTTOM_WATER_THICKNESS'].values
            properties.append(property_)
    properties = np.array(properties)
    return [
        Raw_Data_preprocessed, Raw_Data_dated, files, filenames, probabilities,
        asset_names, properties
    ]
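The gap-filling idiom shared by these preprocessing functions is worth isolating; a small standalone sketch of the two-stage fill (a 30-sample rolling median first, then the column median for anything still missing):

import numpy as np
import pandas as pd

s = pd.Series([1.0, np.nan, 3.0, np.nan, 5.0])
s = s.fillna(s.rolling(30, min_periods=1).median())  # local fill
s = s.fillna(s.median())                             # global fallback fill
print(s.tolist())  # no NaNs remain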
from kedro.io import DataCatalog
from kedro.extras.datasets.pandas import CSVDataSet

io = DataCatalog({"titanic_training_data": CSVDataSet(filepath="train.csv")})

# Load your file and print the output
df = io.load("titanic_training_data")
print(df.head())
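A follow-up sketch: a dataset added to the catalog at runtime can be saved to and reloaded through the same API (this assumes a kedro version where MemoryDataSet is importable from kedro.io, and "titanic_head" is a hypothetical name):

from kedro.io import MemoryDataSet

io.add("titanic_head", MemoryDataSet())
io.save("titanic_head", df.head())
print(io.load("titanic_head"))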
def load_well_validation_data(dummy2, timesteps, parameters: Dict):
    import glob, os

    os.chdir(parameters["path_model_input"])
    files_val = []
    for file in glob.glob("*.csv"):
        files_val.append(file)
    filenames_val = []
    wells_data = []
    for file in files_val:
        filename, extension = file.split('.')
        filenames_val.append(filename)
    for file, filename in zip(files_val, filenames_val):
        io = DataCatalog({
            filename:
            CSVLocalDataSet(filepath=parameters["path_model_input"] + "/" + file),
        })
        well_data = io.load(filename)
        wells_data.append(well_data)
    Raw_Data_preprocessed_val = []
    wells_life = []
    wells_data_ = []
    for well in wells_data:
        # well = well[['Date', 'Injector Bottom Hole Pressure',
        #              'Producer Bottom Hole Pressure', 'ESP Speed',
        #              'Steam Flow Rate - Outer',
        #              'Emulsion Flow Rate']]
        well = well[[
            'Date', 'Speed [Hz]', 'Current [A]', 'IBHP', 'PBHP',
            'Co-Injection [E3m3/d]', 'Oil [bbl/d]', 'Steam [m3/d]',
            'Emulsion [m3/d]'
        ]]
        for i in range(1, len(well.columns)):
            well[well.columns[i]] = pd.to_numeric(well[well.columns[i]],
                                                  errors='coerce')
        well['Prod_Date'] = pd.to_datetime(well['Date'])
        well = well.set_index('Prod_Date')
        # well = well.dropna(axis=0)  # may change
        # well = well.resample('7D').mean()   # weekly data
        # well = well.resample('30D').mean()  # monthly data
        # well = well.rolling(30, min_periods=1).mean()
        data = well['Oil [bbl/d]'] / 6.28981
        well.insert(4, 'Oil [m3/d]', data)
        time_data = np.arange(len(well))
        well.insert(0, 'Timestep', time_data)
        wells_life.append(len(well))
        wells_data_.append(well)
    min_well_length = np.min(np.array(wells_life))
    if min_well_length < timesteps:
        timesteps_validation = min_well_length
    else:
        timesteps_validation = timesteps
    for well, file, filename in zip(wells_data_, files_val, filenames_val):
        well = well.iloc[:timesteps_validation]  # daily data
        # well = well.fillna(0)
        # well = well.fillna(well.rolling(30, min_periods=1).median())
        # well = well.fillna(well.median())
        Raw_Data_preprocessed_val.append(well)
    stats_validation = CSVLocalDataSet(filepath=parameters["path_val_stats"] +
                                       "/static_P50_data_validation.csv")
    stats_val = stats_validation.load()
    stats_val_ROIP = stats_val.loc[:, 'ROIP']
    stats_val = stats_val.loc[:, 'Effective_Length':'BottomWater_Oil_Saturation']
    # # using only rich geostats and no bottom water properties
    # stats_val = stats_val.loc[:, 'Effective_Length':'Rich_Oil_Saturation']
    # # Using "Effective_Rich_Pay_Thickness" to account for standoff and rich thickness
    # data = stats_val['Rich_Pay_Thickness'] - stats_val['Stand_Off']
    # stats_val.insert(3, 'Effective_Rich_Pay_Thickness', data)
    # stats_val = stats_val.drop(columns=['Rich_Pay_Thickness', 'Stand_Off'])
    property_names_val = list(stats_val.columns)
    properties_val = list(stats_val.values)
    # properties_val = stats.loc[:, ['Effective_Length', 'Spacing',
    #                                'Effective_Rich_Pay_Thickness', 'Non_Rich_Pay_Thickness',
    #                                'Rich_Vertical_Permeability', 'Non_Rich_Vertical_Permeability',
    #                                'Rich_Porosity', 'Non_Rich_Porosity',
    #                                'Rich_Oil_Saturation', 'Non_Rich_Oil_Saturation']].values
    properties_val = np.array(properties_val)
    dummy11 = files_val
    return [
        dummy11, timesteps_validation, Raw_Data_preprocessed_val, files_val,
        filenames_val, properties_val, stats_val_ROIP, property_names_val
    ]
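The constant 6.28981 used above is the barrels-per-cubic-metre factor (1 m3 = 6.28981 bbl), so dividing a bbl/d rate by it yields m3/d; a quick standalone check:

oil_bbl_per_day = 628.981
oil_m3_per_day = oil_bbl_per_day / 6.28981  # 1 m3 = 6.28981 bbl
print(oil_m3_per_day)  # 100.0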
def preprocess_raw_data(parameters: Dict):
    import glob, os

    os.chdir(parameters["path_raw"])
    files = []
    for file in glob.glob("*.csv"):
        files.append(file)
    filenames = []
    wells_data = []
    for file in files:
        filename, extension = file.split('.')
        filenames.append(filename)
    for file, filename in zip(files, filenames):
        io = DataCatalog({
            filename:
            CSVLocalDataSet(filepath=parameters["path_raw"] + "/" + file),
        })
        well_data = io.load(filename)
        wells_data.append(well_data)
    Raw_Data_preprocessed = []
    Raw_Data_dated = []
    wells_life = []
    wells_data_ = []
    for well in wells_data:
        # well = well[['Date', 'Injector Bottom Hole Pressure', 'Steam Flow Rate - Outer',
        #              'Bottom Hole Heel Temperature', 'Emulsion Pressure',
        #              'Producer Bottom Hole Pressure', 'ESP Current', 'Emulsion Flow Rate']]
        # well = well[['Date', 'Injector Bottom Hole Pressure',
        #              'Steam Flow Rate - Outer', 'Emulsion Flow Rate']]
        well = well[['Date', 'IBHP', 'PBHP', 'Steam [m3/d]', 'Emulsion [m3/d]']]
        for i in range(1, len(well.columns)):
            well[well.columns[i]] = pd.to_numeric(well[well.columns[i]],
                                                  errors='coerce')
        well['Date'] = pd.to_datetime(well['Date'])
        well = well.set_index('Date')
        # well = well.resample('7D').mean()   # weekly data
        # well = well.resample('30D').mean()  # monthly data
        wells_life.append(len(well))
        wells_data_.append(well)
    min_well_length = np.min(np.array(wells_life))
    timesteps = 983
    for well, file, filename in zip(wells_data_, files, filenames):
        # well = well.iloc[:min_well_length]  # use minimum well life
        well = well.iloc[:timesteps]  # daily, weekly, monthly data
        # well = well.fillna(0)
        well = well.fillna(well.rolling(30, min_periods=1).median())
        well = well.fillna(well.median())
        well_dated = well.copy()
        well_dated["Well"] = filename  # create a column for well name
        data_set = CSVLocalDataSet(filepath=parameters["path_intermediate"] +
                                   "/pre_processed_data_" + file)
        data_set.save(well_dated)
        Raw_Data_dated.append(well_dated)
        Raw_Data_preprocessed.append(well)
    os.chdir(parameters["path_raw_static"])
    static_files = []
    for static_file in glob.glob("*.csv"):
        static_files.append(static_file)
    static_filenames = []
    statics_data = []
    for static_file in static_files:
        static_filename, others = static_file.split('_')
        static_filenames.append(static_filename)
    for static_file, static_filename in zip(static_files, static_filenames):
        io = DataCatalog({
            static_filename:
            CSVLocalDataSet(filepath=parameters["path_raw_static"] + "/" +
                            static_file),
        })
        static_data = io.load(static_filename)
        statics_data.append(static_data)
    statics_data_new = []
    well_name_list = []
    for pad_static in statics_data:
        well_name = pad_static['WELLPAIR_NAME'].values
        well_name_list.append(well_name)
        pad_static = pad_static.set_index('WELLPAIR_NAME')
        pad_static = pad_static.drop(columns=['PLAN_NAME', 'HIGH_PRESSURE'])
        statics_data_new.append(pad_static)
    properties = []
    probabilities = []
    asset_names = []
    for pad_static, names in zip(statics_data_new, well_name_list):
        for well in names:
            prob = pad_static.loc[well, 'Forecast_Prob']
            probabilities.append(prob)
            pad_code = pad_static.loc[well, 'PAD_CODE']
            asset_name, pad = pad_code.split('_')
            asset_names.append(asset_name)
            property_ = pad_static.loc[
                well, 'Effective_Length':'BottomWater_Oil_Saturation'].values
            properties.append(property_)
    properties = np.array(properties)
    return [
        timesteps, Raw_Data_preprocessed, Raw_Data_dated, files, filenames,
        probabilities, asset_names, properties
    ]