def create_master_table(Raw_Data_dated: List[pd.DataFrame],
                        parameters: Dict) -> pd.DataFrame:
    # Concatenate the per-well dated frames column-wise and persist the result.
    # (pd.concat requires an iterable of frames, so the input is a list, not a
    # single DataFrame.)
    master_table = pd.concat(Raw_Data_dated, axis=1, sort=False)
    data_set = CSVLocalDataSet(filepath=parameters["path_primary"] +
                               "/master_table.csv")
    data_set.save(master_table)
    return master_table

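# Illustrative sketch (not a pipeline node): how pd.concat(..., axis=1) builds
# the master table by aligning the per-well frames on a shared row index and
# placing their columns side by side. Well names and values here are made up.
def _example_concat_master_table():
    import pandas as pd

    well_a = pd.DataFrame({"Steam [m3/d]": [100.0, 110.0]})
    well_b = pd.DataFrame({"Emulsion [m3/d]": [80.0, 95.0]})
    # Rows align on the default integer index; columns are appended in order.
    master = pd.concat([well_a, well_b], axis=1, sort=False)
    return master  # 2 rows x 2 columns
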
def test_run_load_versions(self, tmp_path, dummy_context, dummy_dataframe,
                           mocker):
    class DummyContext(KedroContext):
        project_name = "bob"
        project_version = __version__

        def _get_pipelines(self) -> Dict[str, Pipeline]:
            return {"__default__": Pipeline([node(identity, "cars", "boats")])}

    mocker.patch("logging.config.dictConfig")
    dummy_context = DummyContext(str(tmp_path))
    filepath = str(dummy_context.project_path / "cars.csv")

    old_save_version = generate_timestamp()
    old_df = pd.DataFrame({"col1": [0, 0], "col2": [0, 0], "col3": [0, 0]})
    old_csv_data_set = CSVLocalDataSet(
        filepath=filepath,
        save_args={"sep": ","},
        version=Version(None, old_save_version),
    )
    old_csv_data_set.save(old_df)

    new_save_version = generate_timestamp()
    new_csv_data_set = CSVLocalDataSet(
        filepath=filepath,
        save_args={"sep": ","},
        version=Version(None, new_save_version),
    )
    new_csv_data_set.save(dummy_dataframe)

    load_versions = {"cars": old_save_version}
    dummy_context.run(load_versions=load_versions)
    assert not dummy_context.catalog.load("boats").equals(dummy_dataframe)
    assert dummy_context.catalog.load("boats").equals(old_df)

def test_sequential_save_and_load(self, dummy_dataframe, filepath):
    """Tests if the correct load version is logged when two datasets are saved
    sequentially."""
    dataset1 = CSVLocalDataSet(
        filepath=filepath,
        save_args={"sep": ","},
        version=Version(None, "2000-01-01"),
    )
    dataset2 = CSVLocalDataSet(
        filepath=filepath,
        save_args={"sep": ","},
        version=Version(None, "2001-01-01"),
    )

    dataset1.save(dummy_dataframe)
    last_save_version1 = dataset1.get_last_save_version()

    dataset2.save(dummy_dataframe)
    last_save_version2 = dataset2.get_last_save_version()

    dataset2.load()
    last_load_version = dataset2.get_last_load_version()
    assert last_save_version2 == last_load_version
    assert last_save_version1 != last_save_version2

def discrete_wavelet_transform(Raw_Data_preprocessed: List[pd.DataFrame],
                               parameters: Dict, files: List):
    for well_data, file in zip(Raw_Data_preprocessed, files):
        list_input_DWT_Aprox_coeff = []
        input_data, labels = split(well_data)
        input_columns = list(input_data.columns)
        for data_idx in input_columns:
            signal = well_data[data_idx].values
            thresh = parameters["thresh"] * np.nanmax(signal)
            coeff = pywt.wavedec(signal,
                                 wavelet=parameters["wavelet"],
                                 mode=parameters["mode1"],
                                 level=parameters["level"])
            # Threshold the detail coefficients; coeff[0] is the approximation.
            coeff[1:] = (pywt.threshold(i,
                                        value=thresh,
                                        mode=str(parameters["mode2"]))
                         for i in coeff[1:])
            list_input_DWT_Aprox_coeff.append(coeff[0])
        list_input_DWT_Aprox_coeff = pd.DataFrame(
            np.transpose(list_input_DWT_Aprox_coeff), columns=input_columns)
        data_set_input = CSVLocalDataSet(filepath=parameters["path_primary"] +
                                         "/input_DWT_coeffs_" + file)
        data_set_input.save(list_input_DWT_Aprox_coeff)
        data_set_labels = CSVLocalDataSet(filepath=parameters["path_primary"] +
                                          "/labels_" + file)
        data_set_labels.save(labels)
    dummy = labels  # dummy output used to enforce node ordering in the pipeline
    return dummy

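# Illustrative sketch of the denoising idea used above: decompose a signal with
# pywt.wavedec, threshold the detail coefficients, and keep only the
# approximation coefficients (coeff[0]). The wavelet, level, and threshold
# fraction below are demo assumptions, not the pipeline's configured parameters.
def _example_dwt_approximation():
    import numpy as np
    import pywt

    signal = np.sin(np.linspace(0, 8 * np.pi, 256)) + 0.1 * np.random.randn(256)
    coeff = pywt.wavedec(signal, wavelet="db4", mode="symmetric", level=3)
    thresh = 0.3 * np.nanmax(signal)
    # coeff[0] holds the approximation; coeff[1:] hold the detail coefficients.
    coeff[1:] = [pywt.threshold(c, value=thresh, mode="soft") for c in coeff[1:]]
    return coeff[0]
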
def test_load_options_csv(self, tmp_path, sample_pandas_df):
    filepath = str(tmp_path / "data")
    local_csv_data_set = CSVLocalDataSet(filepath=filepath)
    local_csv_data_set.save(sample_pandas_df)
    spark_data_set = SparkDataSet(filepath=filepath,
                                  file_format="csv",
                                  load_args={"header": True})
    spark_df = spark_data_set.load()
    assert spark_df.filter(col("Name") == "Alex").count() == 1

def test_load_options_csv(tmpdir):
    temp_path = str(tmpdir.join("data"))
    pandas_df = _get_sample_pandas_data_frame()
    local_csv_data_set = CSVLocalDataSet(filepath=temp_path)
    local_csv_data_set.save(pandas_df)
    spark_data_set = SparkDataSet(filepath=temp_path,
                                  file_format="csv",
                                  load_args={"header": True})
    spark_df = spark_data_set.load()
    assert spark_df.filter(col("Name") == "Alex").count() == 1

def standardisation(dummy, properties: np.ndarray, files: List,
                    parameters: Dict):
    from sklearn.preprocessing import StandardScaler

    all_wells_input = []
    all_wells_labels = []
    for file in files:
        data_set_input = CSVLocalDataSet(filepath=parameters["path_primary"] +
                                         "/input_DWT_coeffs_" + file)
        DWT_Aprox_coeff_input = data_set_input.load()
        all_wells_input.append(DWT_Aprox_coeff_input.values)
        data_set_labels = CSVLocalDataSet(filepath=parameters["path_primary"] +
                                          "/labels_" + file)
        labels = data_set_labels.load()
        all_wells_labels.append(labels.values)
    all_wells_input = np.array(all_wells_input)

    # Standardize dynamic data coeffs
    scaler_coeffs = StandardScaler()
    scaler_coeffs.fit(all_wells_input[0])  # fit based on first well record
    all_wells_standardized_input = []
    all_wells_standardized_input_flattened = []
    for well_coeffs in all_wells_input:
        std_coeffs = scaler_coeffs.transform(well_coeffs)
        all_wells_standardized_input.append(std_coeffs)
        transposed_std_coeffs = np.transpose(std_coeffs)
        flattened_std_coeffs = transposed_std_coeffs.flatten()
        all_wells_standardized_input_flattened.append(flattened_std_coeffs)
    all_wells_standardized_input = np.array(all_wells_standardized_input)
    all_wells_standardized_input_flattened = np.array(
        all_wells_standardized_input_flattened)
    data_set_scaler_coeffs = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/scaler_coeffs.pickle")
    data_set_scaler_coeffs.save(scaler_coeffs)

    input_columns = list(DWT_Aprox_coeff_input.columns)
    for std_coeffs, file in zip(all_wells_standardized_input, files):
        std_coeffs = pd.DataFrame(std_coeffs, columns=input_columns)
        data_set = CSVLocalDataSet(filepath=parameters["path_features"] +
                                   "/std_DWT_input_coeffs_" + file)
        data_set.save(std_coeffs)

    # Standardize static data
    scaler_static = StandardScaler()
    all_wells_standardized_properties = scaler_static.fit_transform(properties)
    data_set_scaler_static = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/scaler_static.pickle")
    data_set_scaler_static.save(scaler_static)
    return [
        all_wells_standardized_input_flattened,
        all_wells_standardized_properties, all_wells_labels
    ]

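# Illustrative sketch of the scaling convention above: StandardScaler is fitted
# on a single reference well, and that same fit is reused to transform every
# other well, so all wells share one set of per-column means and variances.
# Synthetic arrays stand in for the DWT coefficient matrices.
def _example_shared_scaler():
    import numpy as np
    from sklearn.preprocessing import StandardScaler

    reference_well = np.random.rand(50, 4)  # 50 coefficients x 4 input channels
    other_well = np.random.rand(50, 4)
    scaler = StandardScaler()
    scaler.fit(reference_well)  # fit once, on the reference well only
    std_reference = scaler.transform(reference_well)
    std_other = scaler.transform(other_well)  # same means/variances applied
    return std_reference, std_other
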
def save_well_data(Raw_Data_preprocessed, parameters, files):
    for well_data, file in zip(Raw_Data_preprocessed, files):
        steam_input_data, emulsion_input_data, labels = split(well_data)
        data_set_steam_input = CSVLocalDataSet(
            filepath=parameters["path_primary"] + "/steam_inputs_" + file)
        data_set_steam_input.save(steam_input_data)
        data_set_emulsion_input = CSVLocalDataSet(
            filepath=parameters["path_primary"] + "/emulsion_inputs_" + file)
        data_set_emulsion_input.save(emulsion_input_data)
        data_set_labels = CSVLocalDataSet(filepath=parameters["path_primary"] +
                                          "/labels_" + file)
        data_set_labels.save(labels)
    dummy = labels  # dummy output used to enforce node ordering in the pipeline
    return dummy

def save_well_validation_data(dummy11, Raw_Data_preprocessed_val, parameters,
                              files_val):
    # def save_well_validation_data(dummy12, Raw_Data_preprocessed_val_, parameters, files_val):
    all_wells_dates_input = []
    all_wells_steam_input_val = []
    all_wells_emulsion_input_val = []
    all_wells_labels_val = []
    for well_data, file in zip(Raw_Data_preprocessed_val, files_val):
        # for well_data, file in zip(Raw_Data_preprocessed_val_, files_val):
        steam_input_data, emulsion_input_data, labels = split(well_data)
        data_set_steam_input = CSVLocalDataSet(
            filepath=parameters["path_val_pre_processed"] +
            "/vali_steam_inputs_" + file)
        data_set_steam_input.save(steam_input_data)
        all_wells_dates_input.append(steam_input_data['Date'].values)
        steam_input_data = steam_input_data.drop(columns='Date')
        all_wells_steam_input_val.append(steam_input_data.values)
        data_set_emulsion_input = CSVLocalDataSet(
            filepath=parameters["path_val_pre_processed"] +
            "/vali_emulsion_inputs_" + file)
        data_set_emulsion_input.save(emulsion_input_data)
        all_wells_emulsion_input_val.append(emulsion_input_data.values)
        data_set_labels = CSVLocalDataSet(
            filepath=parameters["path_val_pre_processed"] +
            "/validation_labels_" + file)
        data_set_labels.save(labels)
        all_wells_labels_val.append(labels.values)
    steam_input_column = steam_input_data.columns
    emulsion_input_column = emulsion_input_data.columns
    labels_column = list(labels.columns)
    all_wells_dates_input = np.array(all_wells_dates_input)
    all_wells_steam_input_val = np.array(all_wells_steam_input_val)
    all_wells_emulsion_input_val = np.array(all_wells_emulsion_input_val)
    dummy13 = files_val
    return [
        dummy13, steam_input_column, emulsion_input_column, labels_column,
        all_wells_dates_input, all_wells_steam_input_val,
        all_wells_emulsion_input_val, all_wells_labels_val
    ]

def validate(parameters: Dict):
    # def validate(dummy2, parameters: Dict):
    import glob, os

    os.chdir(parameters["path_model_input"])
    files = []
    for file in glob.glob("*.csv"):
        files.append(file)
    filenames = []
    wells_data = []
    for file in files:
        filename, extension = file.split('.')
        filenames.append(filename)
    for file, filename in zip(files, filenames):
        io = DataCatalog({
            filename:
            CSVLocalDataSet(filepath=parameters["path_model_input"] + "/" +
                            file),
        })
        well_data = io.load(filename)
        wells_data.append(well_data)

    Raw_Data_preprocessed = []
    for well, file, filename in zip(wells_data, files, filenames):
        well = well[[
            'Date', 'Injector Bottom Hole Pressure', 'Steam Flow Rate - Outer',
            'Bottom Hole Heel Temperature', 'Emulsion Pressure',
            'Producer Bottom Hole Pressure', 'ESP Speed', 'Emulsion Flow Rate'
        ]]
        for i in range(1, len(well.columns)):
            well[well.columns[i]] = pd.to_numeric(well[well.columns[i]],
                                                  errors='coerce')
        well = well.iloc[:1399]
        well = well.fillna(well.rolling(30, min_periods=1).median())
        well = well.fillna(well.median())
        well['Date'] = pd.to_datetime(well['Date'])
        well = well.set_index('Date')
        Raw_Data_preprocessed.append(well)

    os.chdir(parameters["path_val_stats"])
    static_files = []
    for static_file in glob.glob("*.csv"):
        static_files.append(static_file)
    static_filenames = []
    statics_data = []
    for static_file in static_files:
        static_filename, others = static_file.split('_')
        static_filenames.append(static_filename)
    for static_file, static_filename in zip(static_files, static_filenames):
        io = DataCatalog({
            static_filename:
            CSVLocalDataSet(filepath=parameters["path_val_stats"] + "/" +
                            static_file),
        })
        static_data = io.load(static_filename)
        statics_data.append(static_data)

    statics_data_new = []
    well_name_list = []
    for pad_static in statics_data:
        well_name = pad_static['WELLPAIR_NAME'].values
        well_name_list.append(well_name)
        pad_static = pad_static.set_index('WELLPAIR_NAME')
        pad_static = pad_static.drop(columns=['PLAN_NAME', 'HIGH_PRESSURE'])
        statics_data_new.append(pad_static)
    properties = []
    probabilities = []
    asset_names = []
    for pad_static, names in zip(statics_data_new, well_name_list):
        for well in names:
            prob = pad_static.loc[well, 'Forecast_Prob']
            probabilities.append(prob)
            pad_code = pad_static.loc[well, 'PAD_CODE']
            asset_name, pad = pad_code.split('_')
            asset_names.append(asset_name)
            property_ = pad_static.loc[
                well, 'SAGD_PRESSURE':'BOTTOM_WATER_THICKNESS'].values
            properties.append(property_)
    properties = np.array(properties)

    all_wells_input = []
    all_wells_labels = []
    for well_data, file in zip(Raw_Data_preprocessed, files):
        DWT_Aprox_coeff_input = []
        input_data, labels = split(well_data)
        input_columns = list(input_data.columns)
        for data_idx in input_columns:
            signal = well_data[data_idx].values
            thresh = parameters["thresh"] * np.nanmax(signal)
            coeff = pywt.wavedec(signal,
                                 wavelet=parameters["wavelet"],
                                 mode=parameters["mode1"],
                                 level=parameters["level"])
            coeff[1:] = (pywt.threshold(i,
                                        value=thresh,
                                        mode=str(parameters["mode2"]))
                         for i in coeff[1:])
            DWT_Aprox_coeff_input.append(coeff[0])
        DWT_Aprox_coeff_input = pd.DataFrame(
            np.transpose(DWT_Aprox_coeff_input), columns=input_columns)
        data_set_input = CSVLocalDataSet(
            filepath=parameters["path_val_pre_processed"] +
            "/validation_input_DWT_coeffs_" + file)
        data_set_input.save(DWT_Aprox_coeff_input)
        all_wells_input.append(DWT_Aprox_coeff_input.values)
        data_set_labels = CSVLocalDataSet(
            filepath=parameters["path_val_pre_processed"] +
            "/validation_labels_" + file)
        data_set_labels.save(labels)
        all_wells_labels.append(labels.values)

    # Standardize dynamic data coeffs
    data_set_scaler_coeffs = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/scaler_coeffs.pickle")
    scaler_coeffs = data_set_scaler_coeffs.load()
    all_wells_standardized_input = []
    all_wells_standardized_input_flattened = []
    for well_coeffs in all_wells_input:
        std_coeffs = scaler_coeffs.transform(well_coeffs)
        all_wells_standardized_input.append(std_coeffs)
        transposed_std_coeffs = np.transpose(std_coeffs)
        flattened_std_coeffs = transposed_std_coeffs.flatten()
        all_wells_standardized_input_flattened.append(flattened_std_coeffs)
    all_wells_standardized_input = np.array(all_wells_standardized_input)
    all_wells_standardized_input_flattened = np.array(
        all_wells_standardized_input_flattened)
    input_columns = list(DWT_Aprox_coeff_input.columns)
    for std_coeffs, file in zip(all_wells_standardized_input, files):
        std_coeffs = pd.DataFrame(std_coeffs, columns=input_columns)
        data_set = CSVLocalDataSet(
            filepath=parameters["path_val_pre_processed"] +
            "/validation_std_DWT_input_coeffs_" + file)
        data_set.save(std_coeffs)

    # Standardize static data
    data_set_scaler_static = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/scaler_static.pickle")
    scaler_static = data_set_scaler_static.load()
    # NOTE: fit_transform refits the loaded scaler on the validation properties;
    # scaler_static.transform(properties) would reuse the training-time fit.
    all_wells_standardized_properties = scaler_static.fit_transform(properties)

    all_wells_coeffs_reservoir_data = []
    for flattened_std_coeffs, standardized_properties in zip(
            all_wells_standardized_input_flattened,
            all_wells_standardized_properties):
        flattened_std_coeffs = list(flattened_std_coeffs)
        standardized_properties = list(standardized_properties)
        for reservoir_property in standardized_properties:
            flattened_std_coeffs.append(
                reservoir_property
            )  # append reservoir data to dynamic data coeffs
        all_wells_coeffs_reservoir_data.append(flattened_std_coeffs)
    all_wells_coeffs_reservoir_data = np.array(all_wells_coeffs_reservoir_data)

    well_count = np.arange(len(all_wells_coeffs_reservoir_data))
    daily_timesteps = np.arange(len(all_wells_labels[0]))
    input_data = []
    for coeff_inputs in all_wells_coeffs_reservoir_data:
        for time_lapse in daily_timesteps:
            well_inputs = [time_lapse] + list(
                coeff_inputs)  # append time lapse to input data
            input_data.append(well_inputs)
    input_data = np.array(input_data)

    data_set_regressor_1 = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/regressor_1.pickle")
    regressor_1 = data_set_regressor_1.load()
    number_of_wells = len(well_count)
    wells_steam_rate_predicted = regressor_1.predict(input_data)
    wells_steam_rate_predicted = wells_steam_rate_predicted.reshape(
        (number_of_wells, 1399)).T

    # prediction inputs to model 2
    input_data_model_2 = []
    for coeff_inputs, well in zip(all_wells_coeffs_reservoir_data, well_count):
        for time_lapse in daily_timesteps:
            well_inputs = [time_lapse] + list(
                coeff_inputs)  # append time lapse to input data
            well_inputs_model_2 = [
                wells_steam_rate_predicted[time_lapse, well]
            ] + well_inputs
            input_data_model_2.append(well_inputs_model_2)
    input_data_model_2 = np.array(input_data_model_2)
    data_set_regressor_2 = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/regressor_2.pickle")
    regressor_2 = data_set_regressor_2.load()
    wells_emulsion_rate_predicted = regressor_2.predict(input_data_model_2)
    wells_emulsion_rate_predicted = wells_emulsion_rate_predicted.reshape(
        (number_of_wells, 1399)).T

    # actual targets
    all_wells_steam_data = []
    all_wells_emulsion_data = []
    for ID in well_count:
        well_steam_data = all_wells_labels[ID][:, 0]
        well_emulsion_data = all_wells_labels[ID][:, 1]
        all_wells_steam_data = all_wells_steam_data + list(well_steam_data)
        all_wells_emulsion_data = all_wells_emulsion_data + list(
            well_emulsion_data)
    all_wells_steam_data = np.array(all_wells_steam_data)
    all_wells_emulsion_data = np.array(all_wells_emulsion_data)
    wells_steam_rate_actual = all_wells_steam_data.reshape(
        (number_of_wells, 1399)).T
    wells_emulsion_rate_actual = all_wells_emulsion_data.reshape(
        (number_of_wells, 1399)).T

    print("Prediction Performance:\n")
    print("Steam Flow Rate:")
    for well, file in zip(well_count, files):
        steam_rate_predicted = wells_steam_rate_predicted[:, well]
        steam_rate_actual = wells_steam_rate_actual[:, well]
        steam_rate_actual_predicted = pd.DataFrame(
            np.vstack((steam_rate_actual, steam_rate_predicted)).T,
            columns=["steam rate actual", "steam rate predicted"])
        data_set_steam_rate = CSVLocalDataSet(
            filepath=parameters["path_model_output"] + "/steam_rate_" + file)
        data_set_steam_rate.save(steam_rate_actual_predicted)
        print(file + " R_squared: {0:.4f}".format(
            r2_score(steam_rate_actual, steam_rate_predicted)))
    print("\n")
    print("Emulsion Flow Rate:")
    for well, file in zip(well_count, files):
        emulsion_rate_predicted = wells_emulsion_rate_predicted[:, well]
        emulsion_rate_actual = wells_emulsion_rate_actual[:, well]
        emulsion_rate_actual_predicted = pd.DataFrame(
            np.vstack((emulsion_rate_actual, emulsion_rate_predicted)).T,
            columns=["emulsion rate actual", "emulsion rate predicted"])
        data_set_emulsion_rate = CSVLocalDataSet(
            filepath=parameters["path_model_output"] + "/emulsion_rate_" +
            file)
        data_set_emulsion_rate.save(emulsion_rate_actual_predicted)
        print(file + " R_squared: {0:.4f}".format(
            r2_score(emulsion_rate_actual, emulsion_rate_predicted)))
    dummy_validate = files
    return dummy_validate

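# Illustrative sketch of the reshape/transpose step used above: the regressor
# returns a flat vector ordered well by well, and reshape((wells, timesteps)).T
# turns it into a (timesteps x wells) matrix whose columns are per-well series.
# Sizes here are toy values, not the pipeline's 1399 timesteps.
def _example_unstack_predictions():
    import numpy as np

    number_of_wells, timesteps = 3, 4
    # Flat vector: well 0's 4 steps, then well 1's, then well 2's.
    flat = np.arange(number_of_wells * timesteps)
    per_well = flat.reshape((number_of_wells, timesteps)).T
    # per_well[:, 0] -> array([0, 1, 2, 3]), i.e. well 0's series.
    return per_well
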
def preprocess_raw_data(parameters: Dict):
    import glob, os

    # os.chdir(parameters["path_raw"])
    os.chdir(parameters["path_raw_matlab"])
    files = []
    for file in glob.glob("*.csv"):
        files.append(file)
    filenames = []
    wells_data = []
    for file in files:
        filename, extension = file.split('.')
        filenames.append(filename)
    for file, filename in zip(files, filenames):
        io = DataCatalog({
            # filename: CSVLocalDataSet(filepath=parameters["path_raw"]+"/"+file),
            filename:
            CSVLocalDataSet(filepath=parameters["path_raw_matlab"] + "/" +
                            file),
        })
        well_data = io.load(filename)
        wells_data.append(well_data)

    Raw_Data_preprocessed = []
    Raw_Data_dated = []
    wells_life = []
    wells_data_ = []
    for well in wells_data:
        # well = well[['Date', 'Injector Bottom Hole Pressure',
        #              'Producer Bottom Hole Pressure', 'ESP Speed',
        #              'Steam Flow Rate - Outer',
        #              'Emulsion Flow Rate']]
        well = well[[
            'Date', 'Speed [Hz]', 'Current [A]', 'IBHP', 'PBHP',
            'Co-Injection [E3m3/d]', 'Oil [bbl/d]', 'Steam [m3/d]',
            'Emulsion [m3/d]'
        ]]
        for i in range(1, len(well.columns)):
            well[well.columns[i]] = pd.to_numeric(well[well.columns[i]],
                                                  errors='coerce')
        well['Prod_Date'] = pd.to_datetime(well['Date'])
        well = well.set_index('Prod_Date')
        # well = well.resample('7D').mean()   # weekly data
        # well = well.resample('30D').mean()  # monthly data
        # well = well.rolling(30, min_periods=1).mean()
        data = well['Oil [bbl/d]'] / 6.28981  # convert barrels to cubic metres
        well.insert(4, 'Oil [m3/d]', data)
        time_data = np.arange(len(well))
        well.insert(0, 'Timestep', time_data)
        wells_life.append(len(well))
        wells_data_.append(well)
    min_well_length = np.min(np.array(wells_life))
    print((min_well_length, np.argmin(np.array(wells_life))))
    timesteps = min_well_length  # 1008
    # timesteps = 371
    for well, file, filename in zip(wells_data_, files, filenames):
        well = well.iloc[:timesteps]  # daily, weekly, monthly data
        # well = well.fillna(0)
        # well = well.fillna(well.rolling(30, min_periods=1).median())
        # well = well.fillna(well.median())
        well["Well"] = filename  # create a column for well name
        well = well.reset_index(drop=True)  # remove date index
        data_set = CSVLocalDataSet(filepath=parameters["path_intermediate"] +
                                   "/pre_processed_data_" + file)
        data_set.save(well)
        Raw_Data_dated.append(well)
        well = well.drop(columns=['Date', 'Well'])
        Raw_Data_preprocessed.append(well)

    # stats_training = CSVLocalDataSet(filepath=parameters["path_raw_static"]+"/static_P50_data_training.csv")
    stats_training = CSVLocalDataSet(
        filepath=parameters["path_raw_static_matlab"] +
        "/static_P50_data_training_152_wells.csv")
    stats = stats_training.load()
    stats_ROIP = stats.loc[:, 'ROIP']
    stats = stats.loc[:, 'Effective_Length':'BottomWater_Oil_Saturation']
    # # using only rich geostats and no bottom water properties
    # stats = stats.loc[:, 'Effective_Length':'Rich_Oil_Saturation']
    # # Using "Effective_Rich_Pay_Thickness" to account for standoff and rich thickness
    # data = stats['Rich_Pay_Thickness'] - stats['Stand_Off']
    # stats.insert(3, 'Effective_Rich_Pay_Thickness', data)
    # stats = stats.drop(columns=['Rich_Pay_Thickness', 'Stand_Off'])
    property_names = list(stats.columns)
    properties = list(stats.values)
    properties = np.array(properties)
    return [
        timesteps, Raw_Data_preprocessed, Raw_Data_dated, files, filenames,
        properties, stats_ROIP, property_names
    ]

def preprocess_raw_data(parameters: Dict):
    import glob, os

    os.chdir(parameters["path_raw"])
    files = []
    for file in glob.glob("*.csv"):
        files.append(file)
    filenames = []
    wells_data = []
    for file in files:
        filename, extension = file.split('.')
        filenames.append(filename)
    for file, filename in zip(files, filenames):
        io = DataCatalog({
            filename:
            CSVLocalDataSet(filepath=parameters["path_raw"] + "/" + file),
        })
        well_data = io.load(filename)
        wells_data.append(well_data)

    Raw_Data_preprocessed = []
    Raw_Data_dated = []
    for well, file, filename in zip(wells_data, files, filenames):
        well = well[[
            'Date', 'Injector Bottom Hole Pressure', 'Steam Flow Rate - Outer',
            'Bottom Hole Heel Temperature', 'Emulsion Pressure',
            'Producer Bottom Hole Pressure', 'ESP Speed', 'Emulsion Flow Rate'
        ]]
        for i in range(1, len(well.columns)):
            well[well.columns[i]] = pd.to_numeric(well[well.columns[i]],
                                                  errors='coerce')
        well = well.iloc[:1399]
        well = well.fillna(well.rolling(30, min_periods=1).median())
        well = well.fillna(well.median())
        well_dated = well.copy()
        well_dated["Well"] = filename  # create a column for well name
        data_set = CSVLocalDataSet(filepath=parameters["path_intermediate"] +
                                   "/pre_processed_data_" + file)
        data_set.save(well_dated)
        Raw_Data_dated.append(well_dated)
        well['Date'] = pd.to_datetime(well['Date'])
        well = well.set_index('Date')
        Raw_Data_preprocessed.append(well)

    os.chdir(parameters["path_raw_static"])
    static_files = []
    for static_file in glob.glob("*.csv"):
        static_files.append(static_file)
    static_filenames = []
    statics_data = []
    for static_file in static_files:
        static_filename, others = static_file.split('_')
        static_filenames.append(static_filename)
    for static_file, static_filename in zip(static_files, static_filenames):
        io = DataCatalog({
            static_filename:
            CSVLocalDataSet(filepath=parameters["path_raw_static"] + "/" +
                            static_file),
        })
        static_data = io.load(static_filename)
        statics_data.append(static_data)

    statics_data_new = []
    well_name_list = []
    for pad_static in statics_data:
        well_name = pad_static['WELLPAIR_NAME'].values
        well_name_list.append(well_name)
        pad_static = pad_static.set_index('WELLPAIR_NAME')
        pad_static = pad_static.drop(columns=['PLAN_NAME', 'HIGH_PRESSURE'])
        statics_data_new.append(pad_static)
    properties = []
    probabilities = []
    asset_names = []
    for pad_static, names in zip(statics_data_new, well_name_list):
        for well in names:
            prob = pad_static.loc[well, 'Forecast_Prob']
            probabilities.append(prob)
            pad_code = pad_static.loc[well, 'PAD_CODE']
            asset_name, pad = pad_code.split('_')
            asset_names.append(asset_name)
            property_ = pad_static.loc[
                well, 'SAGD_PRESSURE':'BOTTOM_WATER_THICKNESS'].values
            properties.append(property_)
    properties = np.array(properties)
    return [
        Raw_Data_preprocessed, Raw_Data_dated, files, filenames, probabilities,
        asset_names, properties
    ]

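# Illustrative sketch of the two-stage gap fill used above: NaNs are first
# patched with a 30-sample rolling median (local behaviour), and anything the
# rolling window cannot cover (e.g. a leading gap) falls back to the overall
# median. The series below is synthetic.
def _example_two_stage_fillna():
    import numpy as np
    import pandas as pd

    s = pd.Series([np.nan, 1.0, 3.0, np.nan, np.nan, 6.0])
    filled = s.fillna(s.rolling(30, min_periods=1).median())
    filled = filled.fillna(filled.median())  # global fallback for leading NaNs
    return filled
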
def save_predicted_data(dummy15, well_count, all_wells_dates_input,
                        wells_steam_rate_actual, wells_steam_rate_predicted,
                        wells_emulsion_rate_actual,
                        wells_emulsion_rate_predicted, steam_input_column,
                        all_wells_steam_input_val, emulsion_input_column,
                        all_wells_emulsion_input_val, labels_column,
                        parameters, files_val, scheme):
    # to input wells_RF_array for RF case
    # NOTE: schemes 3 and 4 reference wells_RF_array, which is not among the
    # arguments; it must be supplied before those branches can run.
    print("Prediction Performance:\n")
    print("Steam Flow Rate:")
    for well, file in zip(well_count, files_val):
        dates = all_wells_dates_input[well].T
        steam_input = all_wells_steam_input_val[well].T
        steam_rate_actual = wells_steam_rate_actual[:, well]
        steam_rate_predicted = wells_steam_rate_predicted[:, well]
        emulsion_rate_predicted = wells_emulsion_rate_predicted[:, well]
        if scheme == 1:
            steam_rate_actual_predicted = pd.DataFrame(
                np.vstack((dates, steam_input, steam_rate_actual,
                           steam_rate_predicted)).T,
                columns=["Date"] + list(steam_input_column) + [
                    labels_column[0] + " actual",
                    labels_column[0] + " predicted"
                ])
            data_set_steam_rate = CSVLocalDataSet(
                filepath=parameters["path_model_output_No_DWT_scheme1"] +
                "/steam_rate_" + file)
        elif scheme == 2:
            steam_rate_actual_predicted = pd.DataFrame(
                np.vstack((dates, steam_input, emulsion_rate_predicted,
                           steam_rate_actual, steam_rate_predicted)).T,
                columns=["Date"] + list(steam_input_column) + [
                    labels_column[1] + " predicted",
                    labels_column[0] + " actual",
                    labels_column[0] + " predicted"
                ])
            data_set_steam_rate = CSVLocalDataSet(
                filepath=parameters["path_model_output_No_DWT_scheme2"] +
                "/steam_rate_" + file)
        elif scheme == 3:
            RF_input = wells_RF_array[well].T
            steam_rate_actual_predicted = pd.DataFrame(
                np.vstack((dates, RF_input, steam_input, steam_rate_actual,
                           steam_rate_predicted)).T,
                columns=["Date", "RF"] + list(steam_input_column) + [
                    labels_column[0] + " actual",
                    labels_column[0] + " predicted"
                ])
            data_set_steam_rate = CSVLocalDataSet(
                filepath=parameters["path_model_output_No_DWT_RF_scheme1"] +
                "/steam_rate_" + file)
        else:
            RF_input = wells_RF_array[well].T
            steam_rate_actual_predicted = pd.DataFrame(
                np.vstack((dates, RF_input, steam_input,
                           emulsion_rate_predicted, steam_rate_actual,
                           steam_rate_predicted)).T,
                columns=["Date", "RF"] + list(steam_input_column) + [
                    labels_column[1] + " predicted",
                    labels_column[0] + " actual",
                    labels_column[0] + " predicted"
                ])
            data_set_steam_rate = CSVLocalDataSet(
                filepath=parameters["path_model_output_No_DWT_RF_scheme2"] +
                "/steam_rate_" + file)
        data_set_steam_rate.save(steam_rate_actual_predicted)
        print(file + " R_squared: {0:.4f}".format(
            r2_score(steam_rate_actual, steam_rate_predicted)))
    print("\n")
    print("Oil Rate:")
    for well, file in zip(well_count, files_val):
        dates = all_wells_dates_input[well].T
        emulsion_input = all_wells_emulsion_input_val[well].T
        emulsion_rate_actual = wells_emulsion_rate_actual[:, well]
        steam_rate_predicted = wells_steam_rate_predicted[:, well]
        emulsion_rate_predicted = wells_emulsion_rate_predicted[:, well]
        if scheme == 1:
            emulsion_rate_actual_predicted = pd.DataFrame(
                np.vstack((dates, emulsion_input, steam_rate_predicted,
                           emulsion_rate_actual, emulsion_rate_predicted)).T,
                columns=["Date"] + list(emulsion_input_column) + [
                    labels_column[0] + " predicted",
                    labels_column[1] + " actual",
                    labels_column[1] + " predicted"
                ])
            data_set_emulsion_rate = CSVLocalDataSet(
                filepath=parameters["path_model_output_No_DWT_scheme1"] +
                "/emulsion_rate_" + file)
        elif scheme == 2:
            emulsion_rate_actual_predicted = pd.DataFrame(
                np.vstack((dates, emulsion_input, emulsion_rate_actual,
                           emulsion_rate_predicted)).T,
                columns=["Date"] + list(emulsion_input_column) + [
                    labels_column[1] + " actual",
                    labels_column[1] + " predicted"
                ])
            data_set_emulsion_rate = CSVLocalDataSet(
                filepath=parameters["path_model_output_No_DWT_scheme2"] +
                "/emulsion_rate_" + file)
        elif scheme == 3:
            # cum_input = wells_cum_oil_array[well].T
            RF_input = wells_RF_array[well].T
            emulsion_rate_actual_predicted = pd.DataFrame(
                np.vstack((dates, RF_input, emulsion_input,
                           steam_rate_predicted, emulsion_rate_actual,
                           emulsion_rate_predicted)).T,
                columns=["Date", "RF"] + list(emulsion_input_column) + [
                    labels_column[0] + " predicted",
                    labels_column[1] + " actual",
                    labels_column[1] + " predicted"
                ])
            data_set_emulsion_rate = CSVLocalDataSet(
                filepath=parameters["path_model_output_No_DWT_RF_scheme1"] +
                "/emulsion_rate_" + file)
        else:
            RF_input = wells_RF_array[well].T
            emulsion_rate_actual_predicted = pd.DataFrame(
                np.vstack((dates, RF_input, emulsion_input,
                           emulsion_rate_actual, emulsion_rate_predicted)).T,
                columns=["Date", "RF"] + list(emulsion_input_column) + [
                    labels_column[1] + " actual",
                    labels_column[1] + " predicted"
                ])
            data_set_emulsion_rate = CSVLocalDataSet(
                filepath=parameters["path_model_output_No_DWT_RF_scheme2"] +
                "/emulsion_rate_" + file)
        data_set_emulsion_rate.save(emulsion_rate_actual_predicted)
        print(file + " R_squared: {0:.4f}".format(
            r2_score(emulsion_rate_actual, emulsion_rate_predicted)))
    print("\n")
    dummy_validate = files_val
    return dummy_validate

def preprocess_raw_data(parameters: Dict):
    import glob, os

    os.chdir(parameters["path_raw"])
    files = []
    for file in glob.glob("*.csv"):
        files.append(file)
    filenames = []
    wells_data = []
    for file in files:
        filename, extension = file.split('.')
        filenames.append(filename)
    for file, filename in zip(files, filenames):
        io = DataCatalog({
            filename:
            CSVLocalDataSet(filepath=parameters["path_raw"] + "/" + file),
        })
        well_data = io.load(filename)
        wells_data.append(well_data)

    Raw_Data_preprocessed = []
    Raw_Data_dated = []
    wells_life = []
    wells_data_ = []
    for well in wells_data:
        # well = well[['Date', 'Injector Bottom Hole Pressure', 'Steam Flow Rate - Outer',
        #              'Bottom Hole Heel Temperature', 'Emulsion Pressure',
        #              'Producer Bottom Hole Pressure', 'ESP Current', 'Emulsion Flow Rate']]
        # well = well[['Date', 'Injector Bottom Hole Pressure', 'Steam Flow Rate - Outer', 'Emulsion Flow Rate']]
        well = well[[
            'Date', 'IBHP', 'PBHP', 'Steam [m3/d]', 'Emulsion [m3/d]'
        ]]
        for i in range(1, len(well.columns)):
            well[well.columns[i]] = pd.to_numeric(well[well.columns[i]],
                                                  errors='coerce')
        well['Date'] = pd.to_datetime(well['Date'])
        well = well.set_index('Date')
        # well = well.resample('7D').mean()   # weekly data
        # well = well.resample('30D').mean()  # monthly data
        wells_life.append(len(well))
        wells_data_.append(well)
    min_well_length = np.min(np.array(wells_life))
    timesteps = 983
    for well, file, filename in zip(wells_data_, files, filenames):
        # well = well.iloc[:min_well_length]  # use minimum well life
        well = well.iloc[:timesteps]  # daily, weekly, monthly data
        # well = well.fillna(0)
        well = well.fillna(well.rolling(30, min_periods=1).median())
        well = well.fillna(well.median())
        well_dated = well.copy()
        well_dated["Well"] = filename  # create a column for well name
        data_set = CSVLocalDataSet(filepath=parameters["path_intermediate"] +
                                   "/pre_processed_data_" + file)
        data_set.save(well_dated)
        Raw_Data_dated.append(well_dated)
        Raw_Data_preprocessed.append(well)

    os.chdir(parameters["path_raw_static"])
    static_files = []
    for static_file in glob.glob("*.csv"):
        static_files.append(static_file)
    static_filenames = []
    statics_data = []
    for static_file in static_files:
        static_filename, others = static_file.split('_')
        static_filenames.append(static_filename)
    for static_file, static_filename in zip(static_files, static_filenames):
        io = DataCatalog({
            static_filename:
            CSVLocalDataSet(filepath=parameters["path_raw_static"] + "/" +
                            static_file),
        })
        static_data = io.load(static_filename)
        statics_data.append(static_data)

    statics_data_new = []
    well_name_list = []
    for pad_static in statics_data:
        well_name = pad_static['WELLPAIR_NAME'].values
        well_name_list.append(well_name)
        pad_static = pad_static.set_index('WELLPAIR_NAME')
        pad_static = pad_static.drop(columns=['PLAN_NAME', 'HIGH_PRESSURE'])
        statics_data_new.append(pad_static)
    properties = []
    probabilities = []
    asset_names = []
    for pad_static, names in zip(statics_data_new, well_name_list):
        for well in names:
            prob = pad_static.loc[well, 'Forecast_Prob']
            probabilities.append(prob)
            pad_code = pad_static.loc[well, 'PAD_CODE']
            asset_name, pad = pad_code.split('_')
            asset_names.append(asset_name)
            property_ = pad_static.loc[
                well, 'Effective_Length':'BottomWater_Oil_Saturation'].values
            properties.append(property_)
    properties = np.array(properties)
    return [
        timesteps, Raw_Data_preprocessed, Raw_Data_dated, files, filenames,
        probabilities, asset_names, properties
    ]

def validate(dummy6, regressor_1, regressor_2, input_data, parameters: Dict):
    # Slice four wells out of the flat input matrix (1399 timesteps per well).
    well_1_inputs = input_data[:1399]
    well_2_inputs = input_data[1399:2798]
    well_13_inputs = input_data[16788:18187]
    well_51_inputs = input_data[69950:71349]
    wells_inputs = list(well_1_inputs) + list(well_2_inputs) + list(
        well_13_inputs) + list(well_51_inputs)
    wells_steam_rate = regressor_1.predict(wells_inputs)
    # The predictions are stacked in the order the four wells were concatenated,
    # so each well occupies a consecutive 1399-row block. (The original slices
    # reused the indices into the full input matrix, which are out of range for
    # this 4 x 1399 prediction vector.)
    well_1_steam_rate = wells_steam_rate[:1399]
    well_2_steam_rate = wells_steam_rate[1399:2798]
    well_13_steam_rate = wells_steam_rate[2798:4197]
    well_51_steam_rate = wells_steam_rate[4197:5596]
    well_1_steam_rate_dataframe = pd.DataFrame(
        np.transpose(np.array(well_1_steam_rate)),
        columns=["well 1 steam rate"])
    well_2_steam_rate_dataframe = pd.DataFrame(
        np.transpose(np.array(well_2_steam_rate)),
        columns=["well 2 steam rate"])
    well_13_steam_rate_dataframe = pd.DataFrame(
        np.transpose(np.array(well_13_steam_rate)),
        columns=["well 13 steam rate"])
    well_51_steam_rate_dataframe = pd.DataFrame(
        np.transpose(np.array(well_51_steam_rate)),
        columns=["well 51 steam rate"])
    data_set = CSVLocalDataSet(filepath=parameters["path_model_output"] +
                               "/well_1_steam_rate_RF.csv")
    data_set.save(well_1_steam_rate_dataframe)
    data_set = CSVLocalDataSet(filepath=parameters["path_model_output"] +
                               "/well_2_steam_rate_RF.csv")
    data_set.save(well_2_steam_rate_dataframe)
    data_set = CSVLocalDataSet(filepath=parameters["path_model_output"] +
                               "/well_13_steam_rate_RF.csv")
    data_set.save(well_13_steam_rate_dataframe)
    data_set = CSVLocalDataSet(filepath=parameters["path_model_output"] +
                               "/well_51_steam_rate_RF.csv")
    data_set.save(well_51_steam_rate_dataframe)
    dummy7 = well_1_steam_rate
    return dummy7


# Cross_validation

# change array to image
# def array_to_image(dummy2, X_train: np.ndarray, X_test: np.ndarray, parameters: Dict, filenames: List):
#     for training_image, test_image, filename in zip(X_train, X_test, filenames):
#         plt.figure(figsize=(40, 10))
#         plt.imshow(training_image)
#         plt.imsave(parameters["path_features"] + "/training_image_" + filename + ".png", training_image)
#         plt.imshow(test_image)
#         plt.imsave(parameters["path_features"] + "/test_image_" + filename + ".png", test_image)
#     dummy3 = X_train
#     return dummy3

# CNN Node
# def train_model_CNN(dummy2, X_train: np.ndarray, y_train: np.ndarray, X_test: np.ndarray, y_test: np.ndarray, parameters: Dict):
#     tensorflow.set_random_seed(0)
#     np.random.seed(0)
#     y_training = []
#     y_testing = []
#     for y in y_train:
#         y_training.append(y[:, 1])
#     for y in y_test:
#         y_testing.append(y[:, 1])
#     y_training = np.array(y_training)
#     y_testing = np.array(y_testing)
#     X_training = X_train.reshape(len(X_train), X_train.shape[1], X_train.shape[2], 1)
#     X_testing = X_test.reshape(len(X_test), X_test.shape[1], X_test.shape[2], 1)
#     inputs = keras.Input(shape=(X_train.shape[1], X_train.shape[2], 1), name='input_image')  # input is greyscale "image"
#     filters = (16, 32)
#     for (i, filter_) in enumerate(filters):
#         if i == 0:
#             x = inputs
#         x = keras.layers.Conv2D(filter_, (3, 3), padding='same')(x)  # confirm what padding means
#         x = keras.layers.Activation('relu')(x)
#         x = keras.layers.BatchNormalization(axis=-1)(x)  # confirm what this means
#         x = keras.layers.MaxPooling2D(pool_size=(2, 2))(x)
#     x = keras.layers.Flatten()(x)
#     x = layers.Dense(parameters["fully_connected_layer_1"], activation='relu', name='fully_connected_layer_1')(x)
#     x = keras.layers.BatchNormalization(axis=-1)(x)
#     x = keras.layers.Dropout(0.5)(x)  # confirm what this means
#     x = layers.Dense(parameters["fully_connected_layer_2"], activation='relu', name='fully_connected_layer_2')(x)
#     x = layers.Dense(X_train.shape[1], activation='linear', name='outputs')(x)
#     model = keras.Model(inputs=inputs, outputs=x, name='CNN_regression_model')
#     model.summary()
#     model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])
#     model.fit(X_training, y_training, epochs=parameters["epochs_CNN"], batch_size=parameters["batch_size"])
#     test_loss = model.evaluate(X_testing, y_testing)
#     dummy3 = X_train  # dummy variable set to enable sequential run
#     model.save(parameters["path_models"] + "/CNN_network_model.h5")
#     return dummy3

# def k_means_clustering(labels: pd.DataFrame, parameters: Dict) -> List:
#     from sklearn.cluster import KMeans
#     kmeans = KMeans(n_clusters=parameters["n_clusters"])
#     data = labels.values
#     kmeans.fit(data)
#     y_kmeans = pd.DataFrame(kmeans.predict(data), columns=["y_kMeans"])
#     return [kmeans, y_kmeans]