def create_master_table(Raw_Data_dated: List[pd.DataFrame],
                        parameters: Dict) -> pd.DataFrame:
    # Concatenate the per-well dated frames column-wise and persist the result.
    # (pd.concat requires an iterable of frames, so the input is a list, not a
    # single DataFrame.)
    master_table = pd.concat(Raw_Data_dated, axis=1, sort=False)
    data_set = CSVLocalDataSet(filepath=parameters["path_primary"] +
                               "/master_table.csv")
    data_set.save(master_table)
    return master_table

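# Illustrative sketch (not a pipeline node): how pd.concat(..., axis=1) builds
# the master table by aligning the per-well frames on a shared row index and
# placing their columns side by side. Well names and values here are made up.
def _example_concat_master_table():
    import pandas as pd

    well_a = pd.DataFrame({"Steam [m3/d]": [100.0, 110.0]})
    well_b = pd.DataFrame({"Emulsion [m3/d]": [80.0, 95.0]})
    # Rows align on the default integer index; columns are appended in order.
    master = pd.concat([well_a, well_b], axis=1, sort=False)
    return master  # 2 rows x 2 columns
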
def test_run_load_versions(self, tmp_path, dummy_context, dummy_dataframe,
                           mocker):
    class DummyContext(KedroContext):
        project_name = "bob"
        project_version = __version__

        def _get_pipelines(self) -> Dict[str, Pipeline]:
            return {"__default__": Pipeline([node(identity, "cars", "boats")])}

    mocker.patch("logging.config.dictConfig")
    dummy_context = DummyContext(str(tmp_path))
    filepath = str(dummy_context.project_path / "cars.csv")

    old_save_version = generate_timestamp()
    old_df = pd.DataFrame({"col1": [0, 0], "col2": [0, 0], "col3": [0, 0]})
    old_csv_data_set = CSVLocalDataSet(
        filepath=filepath,
        save_args={"sep": ","},
        version=Version(None, old_save_version),
    )
    old_csv_data_set.save(old_df)

    new_save_version = generate_timestamp()
    new_csv_data_set = CSVLocalDataSet(
        filepath=filepath,
        save_args={"sep": ","},
        version=Version(None, new_save_version),
    )
    new_csv_data_set.save(dummy_dataframe)

    load_versions = {"cars": old_save_version}
    dummy_context.run(load_versions=load_versions)
    assert not dummy_context.catalog.load("boats").equals(dummy_dataframe)
    assert dummy_context.catalog.load("boats").equals(old_df)

def test_sequential_save_and_load(self, dummy_dataframe, filepath):
    """Tests if the correct load version is logged when two datasets are saved
    sequentially."""
    dataset1 = CSVLocalDataSet(
        filepath=filepath,
        save_args={"sep": ","},
        version=Version(None, "2000-01-01"),
    )
    dataset2 = CSVLocalDataSet(
        filepath=filepath,
        save_args={"sep": ","},
        version=Version(None, "2001-01-01"),
    )

    dataset1.save(dummy_dataframe)
    last_save_version1 = dataset1.get_last_save_version()

    dataset2.save(dummy_dataframe)
    last_save_version2 = dataset2.get_last_save_version()

    dataset2.load()
    last_load_version = dataset2.get_last_load_version()
    assert last_save_version2 == last_load_version
    assert last_save_version1 != last_save_version2

def discrete_wavelet_transform(Raw_Data_preprocessed: List[pd.DataFrame],
                               parameters: Dict, files: List):
    for well_data, file in zip(Raw_Data_preprocessed, files):
        list_input_DWT_Aprox_coeff = []
        input_data, labels = split(well_data)
        input_columns = list(input_data.columns)
        for data_idx in input_columns:
            signal = well_data[data_idx].values
            thresh = parameters["thresh"] * np.nanmax(signal)
            coeff = pywt.wavedec(signal,
                                 wavelet=parameters["wavelet"],
                                 mode=parameters["mode1"],
                                 level=parameters["level"])
            # Threshold the detail coefficients; coeff[0] is the approximation.
            coeff[1:] = (pywt.threshold(i,
                                        value=thresh,
                                        mode=str(parameters["mode2"]))
                         for i in coeff[1:])
            list_input_DWT_Aprox_coeff.append(coeff[0])
        list_input_DWT_Aprox_coeff = pd.DataFrame(
            np.transpose(list_input_DWT_Aprox_coeff), columns=input_columns)
        data_set_input = CSVLocalDataSet(filepath=parameters["path_primary"] +
                                         "/input_DWT_coeffs_" + file)
        data_set_input.save(list_input_DWT_Aprox_coeff)
        data_set_labels = CSVLocalDataSet(filepath=parameters["path_primary"] +
                                          "/labels_" + file)
        data_set_labels.save(labels)
    dummy = labels  # dummy output used to enforce node ordering in the pipeline
    return dummy

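# Illustrative sketch of the denoising idea used above: decompose a signal with
# pywt.wavedec, threshold the detail coefficients, and keep only the
# approximation coefficients (coeff[0]). The wavelet, level, and threshold
# fraction below are demo assumptions, not the pipeline's configured parameters.
def _example_dwt_approximation():
    import numpy as np
    import pywt

    signal = np.sin(np.linspace(0, 8 * np.pi, 256)) + 0.1 * np.random.randn(256)
    coeff = pywt.wavedec(signal, wavelet="db4", mode="symmetric", level=3)
    thresh = 0.3 * np.nanmax(signal)
    # coeff[0] holds the approximation; coeff[1:] hold the detail coefficients.
    coeff[1:] = [pywt.threshold(c, value=thresh, mode="soft") for c in coeff[1:]]
    return coeff[0]
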
def test_load_options_csv(self, tmp_path, sample_pandas_df):
    filepath = str(tmp_path / "data")
    local_csv_data_set = CSVLocalDataSet(filepath=filepath)
    local_csv_data_set.save(sample_pandas_df)
    spark_data_set = SparkDataSet(filepath=filepath,
                                  file_format="csv",
                                  load_args={"header": True})
    spark_df = spark_data_set.load()
    assert spark_df.filter(col("Name") == "Alex").count() == 1

def test_load_options_csv(tmpdir):
    temp_path = str(tmpdir.join("data"))
    pandas_df = _get_sample_pandas_data_frame()
    local_csv_data_set = CSVLocalDataSet(filepath=temp_path)
    local_csv_data_set.save(pandas_df)
    spark_data_set = SparkDataSet(filepath=temp_path,
                                  file_format="csv",
                                  load_args={"header": True})
    spark_df = spark_data_set.load()
    assert spark_df.filter(col("Name") == "Alex").count() == 1

def standardisation(dummy, properties: np.ndarray, files: List,
                    parameters: Dict):
    from sklearn.preprocessing import StandardScaler

    all_wells_input = []
    all_wells_labels = []
    for file in files:
        data_set_input = CSVLocalDataSet(filepath=parameters["path_primary"] +
                                         "/input_DWT_coeffs_" + file)
        DWT_Aprox_coeff_input = data_set_input.load()
        all_wells_input.append(DWT_Aprox_coeff_input.values)
        data_set_labels = CSVLocalDataSet(filepath=parameters["path_primary"] +
                                          "/labels_" + file)
        labels = data_set_labels.load()
        all_wells_labels.append(labels.values)
    all_wells_input = np.array(all_wells_input)

    # Standardize dynamic data coeffs
    scaler_coeffs = StandardScaler()
    scaler_coeffs.fit(all_wells_input[0])  # fit based on first well record
    all_wells_standardized_input = []
    all_wells_standardized_input_flattened = []
    for well_coeffs in all_wells_input:
        std_coeffs = scaler_coeffs.transform(well_coeffs)
        all_wells_standardized_input.append(std_coeffs)
        transposed_std_coeffs = np.transpose(std_coeffs)
        flattened_std_coeffs = transposed_std_coeffs.flatten()
        all_wells_standardized_input_flattened.append(flattened_std_coeffs)
    all_wells_standardized_input = np.array(all_wells_standardized_input)
    all_wells_standardized_input_flattened = np.array(
        all_wells_standardized_input_flattened)
    data_set_scaler_coeffs = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/scaler_coeffs.pickle")
    data_set_scaler_coeffs.save(scaler_coeffs)

    input_columns = list(DWT_Aprox_coeff_input.columns)
    for std_coeffs, file in zip(all_wells_standardized_input, files):
        std_coeffs = pd.DataFrame(std_coeffs, columns=input_columns)
        data_set = CSVLocalDataSet(filepath=parameters["path_features"] +
                                   "/std_DWT_input_coeffs_" + file)
        data_set.save(std_coeffs)

    # Standardize static data
    scaler_static = StandardScaler()
    all_wells_standardized_properties = scaler_static.fit_transform(properties)
    data_set_scaler_static = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/scaler_static.pickle")
    data_set_scaler_static.save(scaler_static)
    return [
        all_wells_standardized_input_flattened,
        all_wells_standardized_properties, all_wells_labels
    ]

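# Illustrative sketch of the scaling convention above: StandardScaler is fitted
# on a single reference well, and that same fit is reused to transform every
# other well, so all wells share one set of per-column means and variances.
# Synthetic arrays stand in for the DWT coefficient matrices.
def _example_shared_scaler():
    import numpy as np
    from sklearn.preprocessing import StandardScaler

    reference_well = np.random.rand(50, 4)  # 50 coefficients x 4 input channels
    other_well = np.random.rand(50, 4)
    scaler = StandardScaler()
    scaler.fit(reference_well)  # fit once, on the reference well only
    std_reference = scaler.transform(reference_well)
    std_other = scaler.transform(other_well)  # same means/variances applied
    return std_reference, std_other
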
def save_well_data(Raw_Data_preprocessed, parameters, files):
    for well_data, file in zip(Raw_Data_preprocessed, files):
        steam_input_data, emulsion_input_data, labels = split(well_data)
        data_set_steam_input = CSVLocalDataSet(
            filepath=parameters["path_primary"] + "/steam_inputs_" + file)
        data_set_steam_input.save(steam_input_data)
        data_set_emulsion_input = CSVLocalDataSet(
            filepath=parameters["path_primary"] + "/emulsion_inputs_" + file)
        data_set_emulsion_input.save(emulsion_input_data)
        data_set_labels = CSVLocalDataSet(filepath=parameters["path_primary"] +
                                          "/labels_" + file)
        data_set_labels.save(labels)
    dummy = labels  # dummy output used to enforce node ordering in the pipeline
    return dummy

def save_well_validation_data(dummy11, Raw_Data_preprocessed_val, parameters,
                              files_val):
    # def save_well_validation_data(dummy12, Raw_Data_preprocessed_val_, parameters, files_val):
    all_wells_dates_input = []
    all_wells_steam_input_val = []
    all_wells_emulsion_input_val = []
    all_wells_labels_val = []
    for well_data, file in zip(Raw_Data_preprocessed_val, files_val):
        # for well_data, file in zip(Raw_Data_preprocessed_val_, files_val):
        steam_input_data, emulsion_input_data, labels = split(well_data)
        data_set_steam_input = CSVLocalDataSet(
            filepath=parameters["path_val_pre_processed"] +
            "/vali_steam_inputs_" + file)
        data_set_steam_input.save(steam_input_data)
        all_wells_dates_input.append(steam_input_data['Date'].values)
        steam_input_data = steam_input_data.drop(columns='Date')
        all_wells_steam_input_val.append(steam_input_data.values)
        data_set_emulsion_input = CSVLocalDataSet(
            filepath=parameters["path_val_pre_processed"] +
            "/vali_emulsion_inputs_" + file)
        data_set_emulsion_input.save(emulsion_input_data)
        all_wells_emulsion_input_val.append(emulsion_input_data.values)
        data_set_labels = CSVLocalDataSet(
            filepath=parameters["path_val_pre_processed"] +
            "/validation_labels_" + file)
        data_set_labels.save(labels)
        all_wells_labels_val.append(labels.values)
    steam_input_column = steam_input_data.columns
    emulsion_input_column = emulsion_input_data.columns
    labels_column = list(labels.columns)
    all_wells_dates_input = np.array(all_wells_dates_input)
    all_wells_steam_input_val = np.array(all_wells_steam_input_val)
    all_wells_emulsion_input_val = np.array(all_wells_emulsion_input_val)
    dummy13 = files_val
    return [
        dummy13, steam_input_column, emulsion_input_column, labels_column,
        all_wells_dates_input, all_wells_steam_input_val,
        all_wells_emulsion_input_val, all_wells_labels_val
    ]

def validate(parameters: Dict):
    # def validate(dummy2, parameters: Dict):
    import glob, os

    os.chdir(parameters["path_model_input"])
    files = []
    for file in glob.glob("*.csv"):
        files.append(file)
    filenames = []
    wells_data = []
    for file in files:
        filename, extension = file.split('.')
        filenames.append(filename)
    for file, filename in zip(files, filenames):
        io = DataCatalog({
            filename:
            CSVLocalDataSet(filepath=parameters["path_model_input"] + "/" +
                            file),
        })
        well_data = io.load(filename)
        wells_data.append(well_data)

    Raw_Data_preprocessed = []
    for well, file, filename in zip(wells_data, files, filenames):
        well = well[[
            'Date', 'Injector Bottom Hole Pressure', 'Steam Flow Rate - Outer',
            'Bottom Hole Heel Temperature', 'Emulsion Pressure',
            'Producer Bottom Hole Pressure', 'ESP Speed', 'Emulsion Flow Rate'
        ]]
        for i in range(1, len(well.columns)):
            well[well.columns[i]] = pd.to_numeric(well[well.columns[i]],
                                                  errors='coerce')
        well = well.iloc[:1399]
        well = well.fillna(well.rolling(30, min_periods=1).median())
        well = well.fillna(well.median())
        well['Date'] = pd.to_datetime(well['Date'])
        well = well.set_index('Date')
        Raw_Data_preprocessed.append(well)

    os.chdir(parameters["path_val_stats"])
    static_files = []
    for static_file in glob.glob("*.csv"):
        static_files.append(static_file)
    static_filenames = []
    statics_data = []
    for static_file in static_files:
        static_filename, others = static_file.split('_')
        static_filenames.append(static_filename)
    for static_file, static_filename in zip(static_files, static_filenames):
        io = DataCatalog({
            static_filename:
            CSVLocalDataSet(filepath=parameters["path_val_stats"] + "/" +
                            static_file),
        })
        static_data = io.load(static_filename)
        statics_data.append(static_data)

    statics_data_new = []
    well_name_list = []
    for pad_static in statics_data:
        well_name = pad_static['WELLPAIR_NAME'].values
        well_name_list.append(well_name)
        pad_static = pad_static.set_index('WELLPAIR_NAME')
        pad_static = pad_static.drop(columns=['PLAN_NAME', 'HIGH_PRESSURE'])
        statics_data_new.append(pad_static)
    properties = []
    probabilities = []
    asset_names = []
    for pad_static, names in zip(statics_data_new, well_name_list):
        for well in names:
            prob = pad_static.loc[well, 'Forecast_Prob']
            probabilities.append(prob)
            pad_code = pad_static.loc[well, 'PAD_CODE']
            asset_name, pad = pad_code.split('_')
            asset_names.append(asset_name)
            property_ = pad_static.loc[
                well, 'SAGD_PRESSURE':'BOTTOM_WATER_THICKNESS'].values
            properties.append(property_)
    properties = np.array(properties)

    all_wells_input = []
    all_wells_labels = []
    for well_data, file in zip(Raw_Data_preprocessed, files):
        DWT_Aprox_coeff_input = []
        input_data, labels = split(well_data)
        input_columns = list(input_data.columns)
        for data_idx in input_columns:
            signal = well_data[data_idx].values
            thresh = parameters["thresh"] * np.nanmax(signal)
            coeff = pywt.wavedec(signal,
                                 wavelet=parameters["wavelet"],
                                 mode=parameters["mode1"],
                                 level=parameters["level"])
            coeff[1:] = (pywt.threshold(i,
                                        value=thresh,
                                        mode=str(parameters["mode2"]))
                         for i in coeff[1:])
            DWT_Aprox_coeff_input.append(coeff[0])
        DWT_Aprox_coeff_input = pd.DataFrame(
            np.transpose(DWT_Aprox_coeff_input), columns=input_columns)
        data_set_input = CSVLocalDataSet(
            filepath=parameters["path_val_pre_processed"] +
            "/validation_input_DWT_coeffs_" + file)
        data_set_input.save(DWT_Aprox_coeff_input)
        all_wells_input.append(DWT_Aprox_coeff_input.values)
        data_set_labels = CSVLocalDataSet(
            filepath=parameters["path_val_pre_processed"] +
            "/validation_labels_" + file)
        data_set_labels.save(labels)
        all_wells_labels.append(labels.values)

    # Standardize dynamic data coeffs
    data_set_scaler_coeffs = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/scaler_coeffs.pickle")
    scaler_coeffs = data_set_scaler_coeffs.load()
    all_wells_standardized_input = []
    all_wells_standardized_input_flattened = []
    for well_coeffs in all_wells_input:
        std_coeffs = scaler_coeffs.transform(well_coeffs)
        all_wells_standardized_input.append(std_coeffs)
        transposed_std_coeffs = np.transpose(std_coeffs)
        flattened_std_coeffs = transposed_std_coeffs.flatten()
        all_wells_standardized_input_flattened.append(flattened_std_coeffs)
    all_wells_standardized_input = np.array(all_wells_standardized_input)
    all_wells_standardized_input_flattened = np.array(
        all_wells_standardized_input_flattened)
    input_columns = list(DWT_Aprox_coeff_input.columns)
    for std_coeffs, file in zip(all_wells_standardized_input, files):
        std_coeffs = pd.DataFrame(std_coeffs, columns=input_columns)
        data_set = CSVLocalDataSet(
            filepath=parameters["path_val_pre_processed"] +
            "/validation_std_DWT_input_coeffs_" + file)
        data_set.save(std_coeffs)

    # Standardize static data
    data_set_scaler_static = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/scaler_static.pickle")
    scaler_static = data_set_scaler_static.load()
    # NOTE: fit_transform refits the loaded scaler on the validation properties;
    # scaler_static.transform(properties) would reuse the training-time fit.
    all_wells_standardized_properties = scaler_static.fit_transform(properties)

    all_wells_coeffs_reservoir_data = []
    for flattened_std_coeffs, standardized_properties in zip(
            all_wells_standardized_input_flattened,
            all_wells_standardized_properties):
        flattened_std_coeffs = list(flattened_std_coeffs)
        standardized_properties = list(standardized_properties)
        for reservoir_property in standardized_properties:
            flattened_std_coeffs.append(
                reservoir_property
            )  # append reservoir data to dynamic data coeffs
        all_wells_coeffs_reservoir_data.append(flattened_std_coeffs)
    all_wells_coeffs_reservoir_data = np.array(all_wells_coeffs_reservoir_data)

    well_count = np.arange(len(all_wells_coeffs_reservoir_data))
    daily_timesteps = np.arange(len(all_wells_labels[0]))
    input_data = []
    for coeff_inputs in all_wells_coeffs_reservoir_data:
        for time_lapse in daily_timesteps:
            well_inputs = [time_lapse] + list(
                coeff_inputs)  # append time lapse to input data
            input_data.append(well_inputs)
    input_data = np.array(input_data)

    data_set_regressor_1 = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/regressor_1.pickle")
    regressor_1 = data_set_regressor_1.load()
    number_of_wells = len(well_count)
    wells_steam_rate_predicted = regressor_1.predict(input_data)
    wells_steam_rate_predicted = wells_steam_rate_predicted.reshape(
        (number_of_wells, 1399)).T

    # prediction inputs to model 2
    input_data_model_2 = []
    for coeff_inputs, well in zip(all_wells_coeffs_reservoir_data, well_count):
        for time_lapse in daily_timesteps:
            well_inputs = [time_lapse] + list(
                coeff_inputs)  # append time lapse to input data
            well_inputs_model_2 = [
                wells_steam_rate_predicted[time_lapse, well]
            ] + well_inputs
            input_data_model_2.append(well_inputs_model_2)
    input_data_model_2 = np.array(input_data_model_2)
    data_set_regressor_2 = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/regressor_2.pickle")
    regressor_2 = data_set_regressor_2.load()
    wells_emulsion_rate_predicted = regressor_2.predict(input_data_model_2)
    wells_emulsion_rate_predicted = wells_emulsion_rate_predicted.reshape(
        (number_of_wells, 1399)).T

    # actual targets
    all_wells_steam_data = []
    all_wells_emulsion_data = []
    for ID in well_count:
        well_steam_data = all_wells_labels[ID][:, 0]
        well_emulsion_data = all_wells_labels[ID][:, 1]
        all_wells_steam_data = all_wells_steam_data + list(well_steam_data)
        all_wells_emulsion_data = all_wells_emulsion_data + list(
            well_emulsion_data)
    all_wells_steam_data = np.array(all_wells_steam_data)
    all_wells_emulsion_data = np.array(all_wells_emulsion_data)
    wells_steam_rate_actual = all_wells_steam_data.reshape(
        (number_of_wells, 1399)).T
    wells_emulsion_rate_actual = all_wells_emulsion_data.reshape(
        (number_of_wells, 1399)).T

    print("Prediction Performance:\n")
    print("Steam Flow Rate:")
    for well, file in zip(well_count, files):
        steam_rate_predicted = wells_steam_rate_predicted[:, well]
        steam_rate_actual = wells_steam_rate_actual[:, well]
        steam_rate_actual_predicted = pd.DataFrame(
            np.vstack((steam_rate_actual, steam_rate_predicted)).T,
            columns=["steam rate actual", "steam rate predicted"])
        data_set_steam_rate = CSVLocalDataSet(
            filepath=parameters["path_model_output"] + "/steam_rate_" + file)
        data_set_steam_rate.save(steam_rate_actual_predicted)
        print(file + " R_squared: {0:.4f}".format(
            r2_score(steam_rate_actual, steam_rate_predicted)))
    print("\n")
    print("Emulsion Flow Rate:")
    for well, file in zip(well_count, files):
        emulsion_rate_predicted = wells_emulsion_rate_predicted[:, well]
        emulsion_rate_actual = wells_emulsion_rate_actual[:, well]
        emulsion_rate_actual_predicted = pd.DataFrame(
            np.vstack((emulsion_rate_actual, emulsion_rate_predicted)).T,
            columns=["emulsion rate actual", "emulsion rate predicted"])
        data_set_emulsion_rate = CSVLocalDataSet(
            filepath=parameters["path_model_output"] + "/emulsion_rate_" +
            file)
        data_set_emulsion_rate.save(emulsion_rate_actual_predicted)
        print(file + " R_squared: {0:.4f}".format(
            r2_score(emulsion_rate_actual, emulsion_rate_predicted)))
    dummy_validate = files
    return dummy_validate

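# Illustrative sketch of the reshape/transpose step used above: the regressor
# returns a flat vector ordered well by well, and reshape((wells, timesteps)).T
# turns it into a (timesteps x wells) matrix whose columns are per-well series.
# Sizes here are toy values, not the pipeline's 1399 timesteps.
def _example_unstack_predictions():
    import numpy as np

    number_of_wells, timesteps = 3, 4
    # Flat vector: well 0's 4 steps, then well 1's, then well 2's.
    flat = np.arange(number_of_wells * timesteps)
    per_well = flat.reshape((number_of_wells, timesteps)).T
    # per_well[:, 0] -> array([0, 1, 2, 3]), i.e. well 0's series.
    return per_well
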
def preprocess_raw_data(parameters: Dict):
    import glob, os

    # os.chdir(parameters["path_raw"])
    os.chdir(parameters["path_raw_matlab"])
    files = []
    for file in glob.glob("*.csv"):
        files.append(file)
    filenames = []
    wells_data = []
    for file in files:
        filename, extension = file.split('.')
        filenames.append(filename)
    for file, filename in zip(files, filenames):
        io = DataCatalog({
            # filename: CSVLocalDataSet(filepath=parameters["path_raw"]+"/"+file),
            filename:
            CSVLocalDataSet(filepath=parameters["path_raw_matlab"] + "/" +
                            file),
        })
        well_data = io.load(filename)
        wells_data.append(well_data)

    Raw_Data_preprocessed = []
    Raw_Data_dated = []
    wells_life = []
    wells_data_ = []
    for well in wells_data:
        # well = well[['Date', 'Injector Bottom Hole Pressure',
        #              'Producer Bottom Hole Pressure', 'ESP Speed',
        #              'Steam Flow Rate - Outer',
        #              'Emulsion Flow Rate']]
        well = well[[
            'Date', 'Speed [Hz]', 'Current [A]', 'IBHP', 'PBHP',
            'Co-Injection [E3m3/d]', 'Oil [bbl/d]', 'Steam [m3/d]',
            'Emulsion [m3/d]'
        ]]
        for i in range(1, len(well.columns)):
            well[well.columns[i]] = pd.to_numeric(well[well.columns[i]],
                                                  errors='coerce')
        well['Prod_Date'] = pd.to_datetime(well['Date'])
        well = well.set_index('Prod_Date')
        # well = well.resample('7D').mean()   # weekly data
        # well = well.resample('30D').mean()  # monthly data
        # well = well.rolling(30, min_periods=1).mean()
        data = well['Oil [bbl/d]'] / 6.28981  # convert barrels to cubic metres
        well.insert(4, 'Oil [m3/d]', data)
        time_data = np.arange(len(well))
        well.insert(0, 'Timestep', time_data)
        wells_life.append(len(well))
        wells_data_.append(well)
    min_well_length = np.min(np.array(wells_life))
    print((min_well_length, np.argmin(np.array(wells_life))))
    timesteps = min_well_length  # 1008
    # timesteps = 371
    for well, file, filename in zip(wells_data_, files, filenames):
        well = well.iloc[:timesteps]  # daily, weekly, monthly data
        # well = well.fillna(0)
        # well = well.fillna(well.rolling(30, min_periods=1).median())
        # well = well.fillna(well.median())
        well["Well"] = filename  # create a column for well name
        well = well.reset_index(drop=True)  # remove date index
        data_set = CSVLocalDataSet(filepath=parameters["path_intermediate"] +
                                   "/pre_processed_data_" + file)
        data_set.save(well)
        Raw_Data_dated.append(well)
        well = well.drop(columns=['Date', 'Well'])
        Raw_Data_preprocessed.append(well)

    # stats_training = CSVLocalDataSet(filepath=parameters["path_raw_static"]+"/static_P50_data_training.csv")
    stats_training = CSVLocalDataSet(
        filepath=parameters["path_raw_static_matlab"] +
        "/static_P50_data_training_152_wells.csv")
    stats = stats_training.load()
    stats_ROIP = stats.loc[:, 'ROIP']
    stats = stats.loc[:, 'Effective_Length':'BottomWater_Oil_Saturation']
    # # using only rich geostats and no bottom water properties
    # stats = stats.loc[:, 'Effective_Length':'Rich_Oil_Saturation']
    # # Using "Effective_Rich_Pay_Thickness" to account for standoff and rich thickness
    # data = stats['Rich_Pay_Thickness'] - stats['Stand_Off']
    # stats.insert(3, 'Effective_Rich_Pay_Thickness', data)
    # stats = stats.drop(columns=['Rich_Pay_Thickness', 'Stand_Off'])
    property_names = list(stats.columns)
    properties = list(stats.values)
    properties = np.array(properties)
    return [
        timesteps, Raw_Data_preprocessed, Raw_Data_dated, files, filenames,
        properties, stats_ROIP, property_names
    ]

def preprocess_raw_data(parameters: Dict):
    import glob, os

    os.chdir(parameters["path_raw"])
    files = []
    for file in glob.glob("*.csv"):
        files.append(file)
    filenames = []
    wells_data = []
    for file in files:
        filename, extension = file.split('.')
        filenames.append(filename)
    for file, filename in zip(files, filenames):
        io = DataCatalog({
            filename:
            CSVLocalDataSet(filepath=parameters["path_raw"] + "/" + file),
        })
        well_data = io.load(filename)
        wells_data.append(well_data)

    Raw_Data_preprocessed = []
    Raw_Data_dated = []
    for well, file, filename in zip(wells_data, files, filenames):
        well = well[[
            'Date', 'Injector Bottom Hole Pressure', 'Steam Flow Rate - Outer',
            'Bottom Hole Heel Temperature', 'Emulsion Pressure',
            'Producer Bottom Hole Pressure', 'ESP Speed', 'Emulsion Flow Rate'
        ]]
        for i in range(1, len(well.columns)):
            well[well.columns[i]] = pd.to_numeric(well[well.columns[i]],
                                                  errors='coerce')
        well = well.iloc[:1399]
        well = well.fillna(well.rolling(30, min_periods=1).median())
        well = well.fillna(well.median())
        well_dated = well.copy()
        well_dated["Well"] = filename  # create a column for well name
        data_set = CSVLocalDataSet(filepath=parameters["path_intermediate"] +
                                   "/pre_processed_data_" + file)
        data_set.save(well_dated)
        Raw_Data_dated.append(well_dated)
        well['Date'] = pd.to_datetime(well['Date'])
        well = well.set_index('Date')
        Raw_Data_preprocessed.append(well)

    os.chdir(parameters["path_raw_static"])
    static_files = []
    for static_file in glob.glob("*.csv"):
        static_files.append(static_file)
    static_filenames = []
    statics_data = []
    for static_file in static_files:
        static_filename, others = static_file.split('_')
        static_filenames.append(static_filename)
    for static_file, static_filename in zip(static_files, static_filenames):
        io = DataCatalog({
            static_filename:
            CSVLocalDataSet(filepath=parameters["path_raw_static"] + "/" +
                            static_file),
        })
        static_data = io.load(static_filename)
        statics_data.append(static_data)

    statics_data_new = []
    well_name_list = []
    for pad_static in statics_data:
        well_name = pad_static['WELLPAIR_NAME'].values
        well_name_list.append(well_name)
        pad_static = pad_static.set_index('WELLPAIR_NAME')
        pad_static = pad_static.drop(columns=['PLAN_NAME', 'HIGH_PRESSURE'])
        statics_data_new.append(pad_static)
    properties = []
    probabilities = []
    asset_names = []
    for pad_static, names in zip(statics_data_new, well_name_list):
        for well in names:
            prob = pad_static.loc[well, 'Forecast_Prob']
            probabilities.append(prob)
            pad_code = pad_static.loc[well, 'PAD_CODE']
            asset_name, pad = pad_code.split('_')
            asset_names.append(asset_name)
            property_ = pad_static.loc[
                well, 'SAGD_PRESSURE':'BOTTOM_WATER_THICKNESS'].values
            properties.append(property_)
    properties = np.array(properties)
    return [
        Raw_Data_preprocessed, Raw_Data_dated, files, filenames, probabilities,
        asset_names, properties
    ]

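# Illustrative sketch of the two-stage gap fill used above: NaNs are first
# patched with a 30-sample rolling median (local behaviour), and anything the
# rolling window cannot cover (e.g. a leading gap) falls back to the overall
# median. The series below is synthetic.
def _example_two_stage_fillna():
    import numpy as np
    import pandas as pd

    s = pd.Series([np.nan, 1.0, 3.0, np.nan, np.nan, 6.0])
    filled = s.fillna(s.rolling(30, min_periods=1).median())
    filled = filled.fillna(filled.median())  # global fallback for leading NaNs
    return filled
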
def save_predicted_data(dummy15, well_count, all_wells_dates_input,
                        wells_steam_rate_actual, wells_steam_rate_predicted,
                        wells_emulsion_rate_actual,
                        wells_emulsion_rate_predicted, steam_input_column,
                        all_wells_steam_input_val, emulsion_input_column,
                        all_wells_emulsion_input_val, labels_column,
                        parameters, files_val, scheme):
    # to input wells_RF_array for RF case
    # NOTE: schemes 3 and 4 reference wells_RF_array, which is not among the
    # arguments; it must be supplied before those branches can run.
    print("Prediction Performance:\n")
    print("Steam Flow Rate:")
    for well, file in zip(well_count, files_val):
        dates = all_wells_dates_input[well].T
        steam_input = all_wells_steam_input_val[well].T
        steam_rate_actual = wells_steam_rate_actual[:, well]
        steam_rate_predicted = wells_steam_rate_predicted[:, well]
        emulsion_rate_predicted = wells_emulsion_rate_predicted[:, well]
        if scheme == 1:
            steam_rate_actual_predicted = pd.DataFrame(
                np.vstack((dates, steam_input, steam_rate_actual,
                           steam_rate_predicted)).T,
                columns=["Date"] + list(steam_input_column) + [
                    labels_column[0] + " actual",
                    labels_column[0] + " predicted"
                ])
            data_set_steam_rate = CSVLocalDataSet(
                filepath=parameters["path_model_output_No_DWT_scheme1"] +
                "/steam_rate_" + file)
        elif scheme == 2:
            steam_rate_actual_predicted = pd.DataFrame(
                np.vstack((dates, steam_input, emulsion_rate_predicted,
                           steam_rate_actual, steam_rate_predicted)).T,
                columns=["Date"] + list(steam_input_column) + [
                    labels_column[1] + " predicted",
                    labels_column[0] + " actual",
                    labels_column[0] + " predicted"
                ])
            data_set_steam_rate = CSVLocalDataSet(
                filepath=parameters["path_model_output_No_DWT_scheme2"] +
                "/steam_rate_" + file)
        elif scheme == 3:
            RF_input = wells_RF_array[well].T
            steam_rate_actual_predicted = pd.DataFrame(
                np.vstack((dates, RF_input, steam_input, steam_rate_actual,
                           steam_rate_predicted)).T,
                columns=["Date", "RF"] + list(steam_input_column) + [
                    labels_column[0] + " actual",
                    labels_column[0] + " predicted"
                ])
            data_set_steam_rate = CSVLocalDataSet(
                filepath=parameters["path_model_output_No_DWT_RF_scheme1"] +
                "/steam_rate_" + file)
        else:
            RF_input = wells_RF_array[well].T
            steam_rate_actual_predicted = pd.DataFrame(
                np.vstack((dates, RF_input, steam_input,
                           emulsion_rate_predicted, steam_rate_actual,
                           steam_rate_predicted)).T,
                columns=["Date", "RF"] + list(steam_input_column) + [
                    labels_column[1] + " predicted",
                    labels_column[0] + " actual",
                    labels_column[0] + " predicted"
                ])
            data_set_steam_rate = CSVLocalDataSet(
                filepath=parameters["path_model_output_No_DWT_RF_scheme2"] +
                "/steam_rate_" + file)
        data_set_steam_rate.save(steam_rate_actual_predicted)
        print(file + " R_squared: {0:.4f}".format(
            r2_score(steam_rate_actual, steam_rate_predicted)))
    print("\n")
    print("Oil Rate:")
    for well, file in zip(well_count, files_val):
        dates = all_wells_dates_input[well].T
        emulsion_input = all_wells_emulsion_input_val[well].T
        emulsion_rate_actual = wells_emulsion_rate_actual[:, well]
        steam_rate_predicted = wells_steam_rate_predicted[:, well]
        emulsion_rate_predicted = wells_emulsion_rate_predicted[:, well]
        if scheme == 1:
            emulsion_rate_actual_predicted = pd.DataFrame(
                np.vstack((dates, emulsion_input, steam_rate_predicted,
                           emulsion_rate_actual, emulsion_rate_predicted)).T,
                columns=["Date"] + list(emulsion_input_column) + [
                    labels_column[0] + " predicted",
                    labels_column[1] + " actual",
                    labels_column[1] + " predicted"
                ])
            data_set_emulsion_rate = CSVLocalDataSet(
                filepath=parameters["path_model_output_No_DWT_scheme1"] +
                "/emulsion_rate_" + file)
        elif scheme == 2:
            emulsion_rate_actual_predicted = pd.DataFrame(
                np.vstack((dates, emulsion_input, emulsion_rate_actual,
                           emulsion_rate_predicted)).T,
                columns=["Date"] + list(emulsion_input_column) + [
                    labels_column[1] + " actual",
                    labels_column[1] + " predicted"
                ])
            data_set_emulsion_rate = CSVLocalDataSet(
                filepath=parameters["path_model_output_No_DWT_scheme2"] +
                "/emulsion_rate_" + file)
        elif scheme == 3:
            # cum_input = wells_cum_oil_array[well].T
            RF_input = wells_RF_array[well].T
            emulsion_rate_actual_predicted = pd.DataFrame(
                np.vstack((dates, RF_input, emulsion_input,
                           steam_rate_predicted, emulsion_rate_actual,
                           emulsion_rate_predicted)).T,
                columns=["Date", "RF"] + list(emulsion_input_column) + [
                    labels_column[0] + " predicted",
                    labels_column[1] + " actual",
                    labels_column[1] + " predicted"
                ])
            data_set_emulsion_rate = CSVLocalDataSet(
                filepath=parameters["path_model_output_No_DWT_RF_scheme1"] +
                "/emulsion_rate_" + file)
        else:
            RF_input = wells_RF_array[well].T
            emulsion_rate_actual_predicted = pd.DataFrame(
                np.vstack((dates, RF_input, emulsion_input,
                           emulsion_rate_actual, emulsion_rate_predicted)).T,
                columns=["Date", "RF"] + list(emulsion_input_column) + [
                    labels_column[1] + " actual",
                    labels_column[1] + " predicted"
                ])
            data_set_emulsion_rate = CSVLocalDataSet(
                filepath=parameters["path_model_output_No_DWT_RF_scheme2"] +
                "/emulsion_rate_" + file)
        data_set_emulsion_rate.save(emulsion_rate_actual_predicted)
        print(file + " R_squared: {0:.4f}".format(
            r2_score(emulsion_rate_actual, emulsion_rate_predicted)))
    print("\n")
    dummy_validate = files_val
    return dummy_validate

def preprocess_raw_data(parameters: Dict):
    import glob, os

    os.chdir(parameters["path_raw"])
    files = []
    for file in glob.glob("*.csv"):
        files.append(file)
    filenames = []
    wells_data = []
    for file in files:
        filename, extension = file.split('.')
        filenames.append(filename)
    for file, filename in zip(files, filenames):
        io = DataCatalog({
            filename:
            CSVLocalDataSet(filepath=parameters["path_raw"] + "/" + file),
        })
        well_data = io.load(filename)
        wells_data.append(well_data)

    Raw_Data_preprocessed = []
    Raw_Data_dated = []
    wells_life = []
    wells_data_ = []
    for well in wells_data:
        # well = well[['Date', 'Injector Bottom Hole Pressure', 'Steam Flow Rate - Outer',
        #              'Bottom Hole Heel Temperature', 'Emulsion Pressure',
        #              'Producer Bottom Hole Pressure', 'ESP Current', 'Emulsion Flow Rate']]
        # well = well[['Date', 'Injector Bottom Hole Pressure', 'Steam Flow Rate - Outer', 'Emulsion Flow Rate']]
        well = well[[
            'Date', 'IBHP', 'PBHP', 'Steam [m3/d]', 'Emulsion [m3/d]'
        ]]
        for i in range(1, len(well.columns)):
            well[well.columns[i]] = pd.to_numeric(well[well.columns[i]],
                                                  errors='coerce')
        well['Date'] = pd.to_datetime(well['Date'])
        well = well.set_index('Date')
        # well = well.resample('7D').mean()   # weekly data
        # well = well.resample('30D').mean()  # monthly data
        wells_life.append(len(well))
        wells_data_.append(well)
    min_well_length = np.min(np.array(wells_life))
    timesteps = 983
    for well, file, filename in zip(wells_data_, files, filenames):
        # well = well.iloc[:min_well_length]  # use minimum well life
        well = well.iloc[:timesteps]  # daily, weekly, monthly data
        # well = well.fillna(0)
        well = well.fillna(well.rolling(30, min_periods=1).median())
        well = well.fillna(well.median())
        well_dated = well.copy()
        well_dated["Well"] = filename  # create a column for well name
        data_set = CSVLocalDataSet(filepath=parameters["path_intermediate"] +
                                   "/pre_processed_data_" + file)
        data_set.save(well_dated)
        Raw_Data_dated.append(well_dated)
        Raw_Data_preprocessed.append(well)

    os.chdir(parameters["path_raw_static"])
    static_files = []
    for static_file in glob.glob("*.csv"):
        static_files.append(static_file)
    static_filenames = []
    statics_data = []
    for static_file in static_files:
        static_filename, others = static_file.split('_')
        static_filenames.append(static_filename)
    for static_file, static_filename in zip(static_files, static_filenames):
        io = DataCatalog({
            static_filename:
            CSVLocalDataSet(filepath=parameters["path_raw_static"] + "/" +
                            static_file),
        })
        static_data = io.load(static_filename)
        statics_data.append(static_data)

    statics_data_new = []
    well_name_list = []
    for pad_static in statics_data:
        well_name = pad_static['WELLPAIR_NAME'].values
        well_name_list.append(well_name)
        pad_static = pad_static.set_index('WELLPAIR_NAME')
        pad_static = pad_static.drop(columns=['PLAN_NAME', 'HIGH_PRESSURE'])
        statics_data_new.append(pad_static)
    properties = []
    probabilities = []
    asset_names = []
    for pad_static, names in zip(statics_data_new, well_name_list):
        for well in names:
            prob = pad_static.loc[well, 'Forecast_Prob']
            probabilities.append(prob)
            pad_code = pad_static.loc[well, 'PAD_CODE']
            asset_name, pad = pad_code.split('_')
            asset_names.append(asset_name)
            property_ = pad_static.loc[
                well, 'Effective_Length':'BottomWater_Oil_Saturation'].values
            properties.append(property_)
    properties = np.array(properties)
    return [
        timesteps, Raw_Data_preprocessed, Raw_Data_dated, files, filenames,
        probabilities, asset_names, properties
    ]

def validate(dummy6, regressor_1, regressor_2, input_data, parameters: Dict):
    # Slice four wells out of the flat input matrix (1399 timesteps per well).
    well_1_inputs = input_data[:1399]
    well_2_inputs = input_data[1399:2798]
    well_13_inputs = input_data[16788:18187]
    well_51_inputs = input_data[69950:71349]
    wells_inputs = list(well_1_inputs) + list(well_2_inputs) + list(
        well_13_inputs) + list(well_51_inputs)
    wells_steam_rate = regressor_1.predict(wells_inputs)
    # The predictions are stacked in the order the four wells were concatenated,
    # so each well occupies a consecutive 1399-row block. (The original slices
    # reused the indices into the full input matrix, which are out of range for
    # this 4 x 1399 prediction vector.)
    well_1_steam_rate = wells_steam_rate[:1399]
    well_2_steam_rate = wells_steam_rate[1399:2798]
    well_13_steam_rate = wells_steam_rate[2798:4197]
    well_51_steam_rate = wells_steam_rate[4197:5596]
    well_1_steam_rate_dataframe = pd.DataFrame(
        np.transpose(np.array(well_1_steam_rate)),
        columns=["well 1 steam rate"])
    well_2_steam_rate_dataframe = pd.DataFrame(
        np.transpose(np.array(well_2_steam_rate)),
        columns=["well 2 steam rate"])
    well_13_steam_rate_dataframe = pd.DataFrame(
        np.transpose(np.array(well_13_steam_rate)),
        columns=["well 13 steam rate"])
    well_51_steam_rate_dataframe = pd.DataFrame(
        np.transpose(np.array(well_51_steam_rate)),
        columns=["well 51 steam rate"])
    data_set = CSVLocalDataSet(filepath=parameters["path_model_output"] +
                               "/well_1_steam_rate_RF.csv")
    data_set.save(well_1_steam_rate_dataframe)
    data_set = CSVLocalDataSet(filepath=parameters["path_model_output"] +
                               "/well_2_steam_rate_RF.csv")
    data_set.save(well_2_steam_rate_dataframe)
    data_set = CSVLocalDataSet(filepath=parameters["path_model_output"] +
                               "/well_13_steam_rate_RF.csv")
    data_set.save(well_13_steam_rate_dataframe)
    data_set = CSVLocalDataSet(filepath=parameters["path_model_output"] +
                               "/well_51_steam_rate_RF.csv")
    data_set.save(well_51_steam_rate_dataframe)
    dummy7 = well_1_steam_rate
    return dummy7


# Cross_validation

# change array to image
# def array_to_image(dummy2, X_train: np.ndarray, X_test: np.ndarray, parameters: Dict, filenames: List):
#     for training_image, test_image, filename in zip(X_train, X_test, filenames):
#         plt.figure(figsize=(40, 10))
#         plt.imshow(training_image)
#         plt.imsave(parameters["path_features"] + "/training_image_" + filename + ".png", training_image)
#         plt.imshow(test_image)
#         plt.imsave(parameters["path_features"] + "/test_image_" + filename + ".png", test_image)
#     dummy3 = X_train
#     return dummy3

# CNN Node
# def train_model_CNN(dummy2, X_train: np.ndarray, y_train: np.ndarray, X_test: np.ndarray, y_test: np.ndarray, parameters: Dict):
#     tensorflow.set_random_seed(0)
#     np.random.seed(0)
#     y_training = []
#     y_testing = []
#     for y in y_train:
#         y_training.append(y[:, 1])
#     for y in y_test:
#         y_testing.append(y[:, 1])
#     y_training = np.array(y_training)
#     y_testing = np.array(y_testing)
#     X_training = X_train.reshape(len(X_train), X_train.shape[1], X_train.shape[2], 1)
#     X_testing = X_test.reshape(len(X_test), X_test.shape[1], X_test.shape[2], 1)
#     inputs = keras.Input(shape=(X_train.shape[1], X_train.shape[2], 1), name='input_image')  # input is greyscale "image"
#     filters = (16, 32)
#     for (i, filter_) in enumerate(filters):
#         if i == 0:
#             x = inputs
#         x = keras.layers.Conv2D(filter_, (3, 3), padding='same')(x)  # confirm what padding means
#         x = keras.layers.Activation('relu')(x)
#         x = keras.layers.BatchNormalization(axis=-1)(x)  # confirm what this means
#         x = keras.layers.MaxPooling2D(pool_size=(2, 2))(x)
#     x = keras.layers.Flatten()(x)
#     x = layers.Dense(parameters["fully_connected_layer_1"], activation='relu', name='fully_connected_layer_1')(x)
#     x = keras.layers.BatchNormalization(axis=-1)(x)
#     x = keras.layers.Dropout(0.5)(x)  # confirm what this means
#     x = layers.Dense(parameters["fully_connected_layer_2"], activation='relu', name='fully_connected_layer_2')(x)
#     x = layers.Dense(X_train.shape[1], activation='linear', name='outputs')(x)
#     model = keras.Model(inputs=inputs, outputs=x, name='CNN_regression_model')
#     model.summary()
#     model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])
#     model.fit(X_training, y_training, epochs=parameters["epochs_CNN"], batch_size=parameters["batch_size"])
#     test_loss = model.evaluate(X_testing, y_testing)
#     dummy3 = X_train  # dummy variable set to enable sequential run
#     model.save(parameters["path_models"] + "/CNN_network_model.h5")
#     return dummy3

# def k_means_clustering(labels: pd.DataFrame, parameters: Dict) -> List:
#     from sklearn.cluster import KMeans
#     kmeans = KMeans(n_clusters=parameters["n_clusters"])
#     data = labels.values
#     kmeans.fit(data)
#     y_kmeans = pd.DataFrame(kmeans.predict(data), columns=["y_kMeans"])
#     return [kmeans, y_kmeans]