Example #1
    def test_sequential_save_and_load(self, dummy_dataframe, filepath):
        """Tests if the correct load version is logged when two datasets are saved
        sequentially."""

        dataset1 = CSVLocalDataSet(
            filepath=filepath,
            save_args={"sep": ","},
            version=Version(None, "2000-01-01"),
        )

        dataset2 = CSVLocalDataSet(
            filepath=filepath,
            save_args={"sep": ","},
            version=Version(None, "2001-01-01"),
        )

        dataset1.save(dummy_dataframe)
        last_save_version1 = dataset1.get_last_save_version()

        dataset2.save(dummy_dataframe)
        last_save_version2 = dataset2.get_last_save_version()

        dataset2.load()
        last_load_version = dataset2.get_last_load_version()
        assert last_save_version2 == last_load_version
        assert last_save_version1 != last_save_version2
Example #2
def load_data(dummy, files: List, parameters: Dict):
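    """Load the per-well steam-input, emulsion-input and label CSVs for every entry in
    ``files`` from ``parameters["path_primary"]``. Column names are taken from the last
    file loaded; the unused ``dummy`` argument appears to exist only to enforce node
    ordering in the pipeline."""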
    all_wells_steam_input = []
    all_wells_emulsion_input = []
    all_wells_labels = []

    for file in files:
        data_set_steam_input = CSVLocalDataSet(
            filepath=parameters["path_primary"] + "/steam_inputs_" + file)
        steam_input_data = data_set_steam_input.load()
        all_wells_steam_input.append(steam_input_data.values)
        data_set_emulsion_input = CSVLocalDataSet(
            filepath=parameters["path_primary"] + "/emulsion_inputs_" + file)
        emulsion_input_data = data_set_emulsion_input.load()
        all_wells_emulsion_input.append(emulsion_input_data.values)
        data_set_labels = CSVLocalDataSet(filepath=parameters["path_primary"] +
                                          "/labels_" + file)
        labels = data_set_labels.load()
        all_wells_labels.append(labels.values)

    steam_input_names = steam_input_data.columns
    emulsion_input_names = emulsion_input_data.columns
    all_wells_steam_input = np.array(all_wells_steam_input)
    all_wells_emulsion_input = np.array(all_wells_emulsion_input)
    return [
        all_wells_steam_input, all_wells_emulsion_input, all_wells_labels,
        steam_input_names, emulsion_input_names
    ]
Example #3
def standardisation(dummy, properties: np.ndarray, files: List,
                    parameters: Dict):
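    """Standardise the per-well DWT coefficient CSVs with a StandardScaler fitted on the
    first well, persist the fitted scalers as pickles, save the standardised coefficients
    per well, and standardise the static ``properties``."""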
    from sklearn.preprocessing import StandardScaler
    all_wells_input = []
    all_wells_labels = []

    for file in files:
        data_set_input = CSVLocalDataSet(filepath=parameters["path_primary"] +
                                         "/input_DWT_coeffs_" + file)
        DWT_Aprox_coeff_input = data_set_input.load()
        all_wells_input.append(DWT_Aprox_coeff_input.values)
        data_set_labels = CSVLocalDataSet(filepath=parameters["path_primary"] +
                                          "/labels_" + file)
        labels = data_set_labels.load()
        all_wells_labels.append(labels.values)
    all_wells_input = np.array(all_wells_input)

    # Standardize dynamic data coeffs
    scaler_coeffs = StandardScaler()
    scaler_coeffs.fit(all_wells_input[0])  # fit based on first well record
    all_wells_standardized_input = []
    all_wells_standardized_input_flattened = []
    for well_coeffs in all_wells_input:
        std_coeffs = scaler_coeffs.transform(well_coeffs)
        all_wells_standardized_input.append(std_coeffs)
        transposed_std_coeffs = np.transpose(std_coeffs)
        flattened_std_coeffs = transposed_std_coeffs.flatten()
        all_wells_standardized_input_flattened.append(flattened_std_coeffs)

    all_wells_standardized_input = np.array(all_wells_standardized_input)
    all_wells_standardized_input_flattened = np.array(
        all_wells_standardized_input_flattened)
    data_set_scaler_coeffs = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/scaler_coeffs.pickle")
    data_set_scaler_coeffs.save(scaler_coeffs)

    input_columns = list(DWT_Aprox_coeff_input.columns)
    for std_coeffs, file in zip(all_wells_standardized_input, files):
        std_coeffs = pd.DataFrame(std_coeffs, columns=input_columns)
        data_set = CSVLocalDataSet(filepath=parameters["path_features"] +
                                   "/std_DWT_input_coeffs_" + file)
        data_set.save(std_coeffs)


    # Standardize static data
    scaler_static = StandardScaler()
    all_wells_standardized_properties = scaler_static.fit_transform(properties)
    data_set_scaler_static = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/scaler_static.pickle")
    data_set_scaler_static.save(scaler_static)
    return [
        all_wells_standardized_input_flattened,
        all_wells_standardized_properties, all_wells_labels
    ]
Example #4
def dynamic_time_warping(Raw_Data_preprocessed, parameters):
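    """Align each preprocessed well's 'Oil [m3/d]' series to the reference well B03-1P
    using fastdtw with a Euclidean distance, averaging rows that map to the same
    reference index, and return the DTW distances together with the aligned wells."""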
    reference_well = CSVLocalDataSet(filepath=parameters["path_raw"] +
                                     "/B03-1P.csv")
    well_ref = reference_well.load()
    data = well_ref['Oil [bbl/d]'] / 6.28981  # convert bbl/d to m3/d (1 m3 = 6.28981 bbl)
    well_ref.insert(4, 'Oil [m3/d]', data)
    well_ref_oil_data = well_ref['Oil [m3/d]'].values

    Raw_Data_preprocessed_ = []
    distance_array = []
    for well_data in Raw_Data_preprocessed:
        well_oil_data = well_data['Oil [m3/d]'].values

        distance, path = fastdtw(well_ref_oil_data,
                                 well_oil_data,
                                 dist=euclidean)
        distance_array.append(distance)
        path = np.array(path)
        index_well = path[..., 1]
        index_ref_well = path[..., 0]
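        # Map this well onto the reference timeline: take the rows matched by DTW
        # and average duplicates that map to the same reference index.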
        well = well_data.iloc[index_well]
        well.insert(0, 'index_ref', index_ref_well)
        well = well.groupby('index_ref').mean()
        #         well = well.reset_index(drop=True)
        Raw_Data_preprocessed_.append(well)

    distance_array = np.array(distance_array)
    return [distance_array, Raw_Data_preprocessed_]
Example #5
    def test_save_options_csv(self, tmp_path, sample_spark_df):
        # To cross-check the correct Spark save operation, we save to a
        # single Spark partition in CSV format and retrieve it with the
        # Kedro CSVLocalDataSet.
        temp_dir = Path(str(tmp_path / "test_data"))
        spark_data_set = SparkDataSet(
            filepath=str(temp_dir),
            file_format="csv",
            save_args={
                "sep": "|",
                "header": True
            },
        )
        spark_df = sample_spark_df.coalesce(1)
        spark_data_set.save(spark_df)

        single_csv_file = [
            f for f in temp_dir.iterdir() if f.is_file() and f.suffix == ".csv"
        ][0]

        csv_local_data_set = CSVLocalDataSet(filepath=str(single_csv_file),
                                             load_args={"sep": "|"})
        pandas_df = csv_local_data_set.load()

        assert pandas_df[pandas_df["name"] == "Alex"]["age"][0] == 31
Example #6
def test_save_options_csv():
    # To cross-check the correct Spark save operation, we save to a
    # single Spark partition in CSV format and retrieve it with the
    # Kedro CSVLocalDataSet.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = join(temp_dir, "test_data")
        spark_data_set = SparkDataSet(
            filepath=temp_path,
            file_format="csv",
            save_args={
                "sep": "|",
                "header": True
            },
        )
        spark_df = _get_sample_spark_data_frame().coalesce(1)
        spark_data_set.save(spark_df)

        single_csv_file = [
            join(temp_path, f) for f in listdir(temp_path) if f.endswith("csv")
        ][0]

        csv_local_data_set = CSVLocalDataSet(filepath=single_csv_file,
                                             load_args={"sep": "|"})
        pandas_df = csv_local_data_set.load()

        assert pandas_df[pandas_df["name"] == "Alex"]["age"][0] == 31
Example #7
def preprocess_raw_data(parameters: Dict):
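    """Read every raw well CSV from ``path_raw_matlab``, keep the selected columns,
    coerce them to numeric, add 'Oil [m3/d]' and 'Timestep' columns, truncate all wells
    to the shortest well length, save the pre-processed data per well, and load the
    static P50 training properties."""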
    import glob, os
    #     os.chdir(parameters["path_raw"])
    os.chdir(parameters["path_raw_matlab"])
    files = []
    for file in glob.glob("*.csv"):
        files.append(file)
    filenames = []
    wells_data = []
    for file in files:
        filename, extension = file.split('.')
        filenames.append(filename)
    for file, filename in zip(files, filenames):
        io = DataCatalog({
            #                 filename: CSVLocalDataSet(filepath=parameters["path_raw"]+"/"+file),
            filename:
            CSVLocalDataSet(filepath=parameters["path_raw_matlab"] + "/" +
                            file),
        })

        well_data = io.load(filename)
        wells_data.append(well_data)
    Raw_Data_preprocessed = []
    Raw_Data_dated = []
    wells_life = []
    wells_data_ = []
    for well in wells_data:
        #         well = well[['Date', 'Injector Bottom Hole Pressure',
        #                    'Producer Bottom Hole Pressure', 'ESP Speed',
        #                    'Steam Flow Rate - Outer',
        #                    'Emulsion Flow Rate']]

        well = well[[
            'Date', 'Speed [Hz]', 'Current [A]', 'IBHP', 'PBHP',
            'Co-Injection [E3m3/d]', 'Oil [bbl/d]', 'Steam [m3/d]',
            'Emulsion [m3/d]'
        ]]

        for i in range(1, len(well.columns)):
            well[well.columns[i]] = pd.to_numeric(well[well.columns[i]],
                                                  errors='coerce')
        well['Prod_Date'] = pd.to_datetime(well['Date'])
        well = well.set_index('Prod_Date')
        #         well = well.resample('7D').mean()   # weekly data
        #         well = well.resample('30D').mean()   # monthly data
        #         well = well.rolling(30, min_periods=1).mean()

        data = well['Oil [bbl/d]'] / 6.28981  # convert bbl/d to m3/d
        well.insert(4, 'Oil [m3/d]', data)
        time_data = np.arange(len(well))
        well.insert(0, 'Timestep', time_data)

        wells_life.append(len(well))
        wells_data_.append(well)
    min_well_length = np.min(np.array(wells_life))
    print((min_well_length, np.argmin(np.array(wells_life))))
    timesteps = min_well_length  # 1008
    #     timesteps = 371

    for well, file, filename in zip(wells_data_, files, filenames):
        well = well.iloc[:timesteps]  # daily, weekly, monthly data
        #         well = well.fillna(0)
        #         well = well.fillna(well.rolling(30, min_periods=1).median())
        #         well = well.fillna(well.median())

        well["Well"] = filename  # create a column for well name
        well = well.reset_index(drop=True)  # remove date index
        data_set = CSVLocalDataSet(filepath=parameters["path_intermediate"] +
                                   "/pre_processed_data_" + file)
        data_set.save(well)
        Raw_Data_dated.append(well)
        well = well.drop(columns=['Date', 'Well'])
        Raw_Data_preprocessed.append(well)


    # stats_training = CSVLocalDataSet(filepath=parameters["path_raw_static"]+"/static_P50_data_training.csv")
    stats_training = CSVLocalDataSet(
        filepath=parameters["path_raw_static_matlab"] +
        "/static_P50_data_training_152_wells.csv")

    stats = stats_training.load()
    stats_ROIP = stats.loc[:, 'ROIP']
    stats = stats.loc[:, 'Effective_Length':'BottomWater_Oil_Saturation']

    # #     using only rich geostats and no bottom water properties
    #     stats = stats.loc[:, 'Effective_Length':'Rich_Oil_Saturation']

    # #     Using "Effective_Rich_Pay_Thickness" to account for standoff and rich thickness
    #     data = stats['Rich_Pay_Thickness'] - stats['Stand_Off']
    #     stats.insert(3, 'Effective_Rich_Pay_Thickness', data)
    #     stats = stats.drop(columns = ['Rich_Pay_Thickness', 'Stand_Off'])

    property_names = list(stats.columns)
    properties = list(stats.values)

    properties = np.array(properties)
    return [
        timesteps, Raw_Data_preprocessed, Raw_Data_dated, files, filenames,
        properties, stats_ROIP, property_names
    ]
Example #8
def load_well_validation_data(dummy2, timesteps, parameters: Dict):
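    """Load and pre-process the validation well CSVs from ``path_model_input`` in the
    same way as the training data, truncate them to the shorter of the training and
    validation well lengths, and load the static P50 validation properties."""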
    import glob, os
    os.chdir(parameters["path_model_input"])
    files_val = []
    for file in glob.glob("*.csv"):
        files_val.append(file)
    filenames_val = []
    wells_data = []
    for file in files_val:
        filename, extension = file.split('.')
        filenames_val.append(filename)
    for file, filename in zip(files_val, filenames_val):
        io = DataCatalog({
            filename:
            CSVLocalDataSet(filepath=parameters["path_model_input"] + "/" +
                            file),
        })
        well_data = io.load(filename)
        wells_data.append(well_data)
    Raw_Data_preprocessed_val = []
    wells_life = []
    wells_data_ = []
    for well in wells_data:
        #         well = well[['Date', 'Injector Bottom Hole Pressure',
        #                    'Producer Bottom Hole Pressure', 'ESP Speed',
        #                    'Steam Flow Rate - Outer',
        #                    'Emulsion Flow Rate']]

        well = well[[
            'Date', 'Speed [Hz]', 'Current [A]', 'IBHP', 'PBHP',
            'Co-Injection [E3m3/d]', 'Oil [bbl/d]', 'Steam [m3/d]',
            'Emulsion [m3/d]'
        ]]

        for i in range(1, len(well.columns)):
            well[well.columns[i]] = pd.to_numeric(well[well.columns[i]],
                                                  errors='coerce')
        well['Prod_Date'] = pd.to_datetime(well['Date'])
        well = well.set_index('Prod_Date')
        #         well = well.dropna(axis=0)   # may change
        #         well = well.resample('7D').mean()   # weekly data
        #         well = well.resample('30D').mean()   # monthly data
        #         well = well.rolling(30, min_periods=1).mean()

        data = well['Oil [bbl/d]'] / 6.28981  # convert bbl/d to m3/d
        well.insert(4, 'Oil [m3/d]', data)
        time_data = np.arange(len(well))
        well.insert(0, 'Timestep', time_data)

        wells_life.append(len(well))
        wells_data_.append(well)
    min_well_length = np.min(np.array(wells_life))
    if min_well_length < timesteps:
        timesteps_validation = min_well_length
    else:
        timesteps_validation = timesteps

    for well, file, filename in zip(wells_data_, files_val, filenames_val):
        well = well.iloc[:timesteps_validation]  # daily data
        #         well = well.fillna(0)
        #         well = well.fillna(well.rolling(30, min_periods=1).median())
        #         well = well.fillna(well.median())
        Raw_Data_preprocessed_val.append(well)

    stats_validation = CSVLocalDataSet(filepath=parameters["path_val_stats"] +
                                       "/static_P50_data_validation.csv")
    stats_val = stats_validation.load()
    stats_val_ROIP = stats_val.loc[:, 'ROIP']
    stats_val = stats_val.loc[:,
                              'Effective_Length':'BottomWater_Oil_Saturation']

    # #     using only rich geostats and no bottom water properties
    #     stats_val = stats_val.loc[:, 'Effective_Length':'Rich_Oil_Saturation']

    # #     Using "Effective_Rich_Pay_Thickness" to account for standoff and rich thickness
    #     data = stats_val['Rich_Pay_Thickness'] - stats_val['Stand_Off']
    #     stats_val.insert(3, 'Effective_Rich_Pay_Thickness', data)
    #     stats_val = stats_val.drop(columns = ['Rich_Pay_Thickness', 'Stand_Off'])

    property_names_val = list(stats_val.columns)
    properties_val = list(stats_val.values)

    #     properties_val = stats.loc[:, ['Effective_Length', 'Spacing', 'Effective_Rich_Pay_Thickness', 'Non_Rich_Pay_Thickness',
    #                               'Rich_Vertical_Permeability','Non_Rich_Vertical_Permeability', 'Rich_Porosity',
    #                                       'Non_Rich_Porosity', 'Rich_Oil_Saturation', 'Non_Rich_Oil_Saturation']].values
    properties_val = np.array(properties_val)
    dummy11 = files_val
    return [
        dummy11, timesteps_validation, Raw_Data_preprocessed_val, files_val,
        filenames_val, properties_val, stats_val_ROIP, property_names_val
    ]