Example No. 1
    def run(self, pipeline: Pipeline, catalog: DataCatalog) -> Dict[str, Any]:
        """Run the ``Pipeline`` using the ``DataSet``s provided by ``catalog``
        and save results back to the same objects.

        Args:
            pipeline: The ``Pipeline`` to run.
            catalog: The ``DataCatalog`` from which to fetch data.

        Raises:
            ValueError: Raised when ``Pipeline`` inputs cannot be satisfied.

        Returns:
            Any node outputs that cannot be processed by the ``DataCatalog``.
            These are returned in a dictionary, where the keys are defined
            by the node outputs.

        """

        catalog = catalog.shallow_copy()

        unsatisfied = pipeline.inputs() - set(catalog.list())
        if unsatisfied:
            raise ValueError("Pipeline input(s) {} not found in the "
                             "DataCatalog".format(unsatisfied))

        free_outputs = pipeline.outputs() - set(catalog.list())
        unregistered_ds = pipeline.data_sets() - set(catalog.list())
        for ds_name in unregistered_ds:
            catalog.add(ds_name, self.create_default_data_set(ds_name))

        self._run(pipeline, catalog)

        self._logger.info("Pipeline execution completed successfully.")

        return {ds_name: catalog.load(ds_name) for ds_name in free_outputs}
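The method above is the base-runner entry point: it validates that all pipeline inputs exist in the catalog, registers default datasets for anything unregistered, delegates execution to _run, and returns the free outputs. A minimal sketch of driving it through SequentialRunner, assuming a Kedro 0.16/0.17-style API (the double node and the dataset names are illustrative):

from kedro.io import DataCatalog, MemoryDataSet
from kedro.pipeline import Pipeline, node
from kedro.runner import SequentialRunner


def double(x):
    return x * 2


catalog = DataCatalog({"x": MemoryDataSet(21)})
pipeline = Pipeline([node(double, inputs="x", outputs="y")])

# "y" is not registered in the catalog, so run() returns it as a free output.
result = SequentialRunner().run(pipeline, catalog)
print(result)  # {'y': 42}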
Example No. 2
def _run_node_sequential(node: Node, catalog: DataCatalog, run_id: str = None) -> Node:
    inputs = {}
    hook_manager = get_hook_manager()

    for name in node.inputs:
        hook_manager.hook.before_dataset_loaded(  # pylint: disable=no-member
            dataset_name=name
        )
        inputs[name] = catalog.load(name)
        hook_manager.hook.after_dataset_loaded(  # pylint: disable=no-member
            dataset_name=name, data=inputs[name]
        )

    is_async = False

    additional_inputs = _collect_inputs_from_hook(
        node, catalog, inputs, is_async, run_id=run_id
    )
    inputs.update(additional_inputs)

    outputs = _call_node_run(node, catalog, inputs, is_async, run_id=run_id)

    for name, data in outputs.items():
        hook_manager.hook.before_dataset_saved(  # pylint: disable=no-member
            dataset_name=name, data=data
        )
        catalog.save(name, data)
        hook_manager.hook.after_dataset_saved(  # pylint: disable=no-member
            dataset_name=name, data=data
        )
    return node
Example No. 3
def _run_node_sequential(node: Node, catalog: DataCatalog, run_id: str = None) -> Node:
    inputs = {name: catalog.load(name) for name in node.inputs}
    hook_manager = get_hook_manager()
    is_async = False
    hook_manager.hook.before_node_run(  # pylint: disable=no-member
        node=node, catalog=catalog, inputs=inputs, is_async=is_async, run_id=run_id
    )
    try:
        outputs = node.run(inputs)
    except Exception as exc:
        hook_manager.hook.on_node_error(  # pylint: disable=no-member
            error=exc,
            node=node,
            catalog=catalog,
            inputs=inputs,
            is_async=is_async,
            run_id=run_id,
        )
        raise exc
    hook_manager.hook.after_node_run(  # pylint: disable=no-member
        node=node,
        catalog=catalog,
        inputs=inputs,
        outputs=outputs,
        is_async=is_async,
        run_id=run_id,
    )

    for name, data in outputs.items():
        catalog.save(name, data)
    return node
Example No. 4
    def after_pipeline_run(
        self,
        run_params: Dict[str, Any],
        pipeline: Pipeline,
        catalog: DataCatalog,
    ) -> None:
        """Hook to be invoked after a pipeline runs.
        Args:
            run_params: The params needed for the given run.
                Should be identical to the data logged by Journal.
                # @fixme: this needs to be modelled explicitly as code, instead of comment
                Schema: {
                    "project_path": str,
                    "env": str,
                    "kedro_version": str,
                    "tags": Optional[List[str]],
                    "from_nodes": Optional[List[str]],
                    "to_nodes": Optional[List[str]],
                    "node_names": Optional[List[str]],
                    "from_inputs": Optional[List[str]],
                    "load_versions": Optional[List[str]],
                    "pipeline_name": str,
                    "extra_params": Optional[Dict[str, Any]],
                }
            pipeline: The ``Pipeline`` that was run.
            catalog: The ``DataCatalog`` used during the run.
        """
        if self._is_mlflow_enabled:
            if isinstance(pipeline, PipelineML):
                with TemporaryDirectory() as tmp_dir:
                    # This will be removed at the end of the context manager,
                    # but we need to log in mlflow before moving the folder
                    kedro_pipeline_model = KedroPipelineModel(
                        pipeline=pipeline.inference,
                        catalog=catalog,
                        input_name=pipeline.input_name,
                        **pipeline.kpm_kwargs,
                    )
                    artifacts = kedro_pipeline_model.extract_pipeline_artifacts(
                        parameters_saving_folder=Path(tmp_dir))

                    log_model_kwargs = pipeline.log_model_kwargs.copy()
                    model_signature = log_model_kwargs.pop("signature", None)
                    if isinstance(model_signature, str):
                        if model_signature == "auto":
                            input_data = catalog.load(pipeline.input_name)
                            model_signature = infer_signature(
                                model_input=input_data)

                    mlflow.pyfunc.log_model(
                        python_model=kedro_pipeline_model,
                        artifacts=artifacts,
                        signature=model_signature,
                        **log_model_kwargs,
                    )
            # Close the mlflow active run at the end of the pipeline to avoid interactions with further runs
            mlflow.end_run()
        else:
            switch_catalog_logging(catalog, True)
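The hook above is kedro-mlflow's after_pipeline_run implementation. For context, here is a minimal sketch of how a custom hook exposing the same entry point could be defined and registered, assuming Kedro >= 0.17 conventions (the class name and logging behaviour are illustrative, not the kedro-mlflow code):

import logging
from typing import Any, Dict

from kedro.framework.hooks import hook_impl
from kedro.io import DataCatalog
from kedro.pipeline import Pipeline


class PipelineMonitoringHooks:
    @hook_impl
    def after_pipeline_run(
        self, run_params: Dict[str, Any], pipeline: Pipeline, catalog: DataCatalog
    ) -> None:
        # Log a single summary line once the run has finished.
        logging.getLogger(__name__).info(
            "Pipeline '%s' finished with %d free output dataset(s)",
            run_params.get("pipeline_name"),
            len(pipeline.outputs()),
        )


# Registered in src/<package_name>/settings.py:
# HOOKS = (PipelineMonitoringHooks(),)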
Example No. 5
    def test_add_save_and_load(self, data_set, dummy_dataframe):
        """Test adding and then saving and reloading the data set"""
        catalog = DataCatalog(data_sets={})
        catalog.add("test", data_set)
        catalog.save("test", dummy_dataframe)
        reloaded_df = catalog.load("test")

        assert_frame_equal(reloaded_df, dummy_dataframe)
Example No. 6
    def test_all_before_adding(self, fake_data_set, fake_transformer):
        catalog = DataCatalog()
        catalog.add_transformer(fake_transformer)
        catalog.add("test", fake_data_set)

        catalog.save("test", 42)
        assert catalog.load("test") == 44
        assert fake_data_set.log == [("save", 43), ("load", 43)]
        assert fake_transformer.log == [("save", 42), ("load", 43)]
Example No. 7
    def after_catalog_created(self, catalog: DataCatalog) -> None:
        """gets params, initializes logger, and logs params."""
        logger.info("Initializing mlflow logger")
        logger.info(__package__)

        params = catalog.load("parameters")
        self.config = MLFlowLoggerConfig(**params)
        if self.config.enabled:
            self._log_params()
            self._set_tags()
Example No. 8
def run_node(node: Node, catalog: DataCatalog) -> Node:
    """Run a single `Node` with inputs from and outputs to the `catalog`.

    Args:
        node: The ``Node`` to run.
        catalog: A ``DataCatalog`` containing the node's inputs and outputs.

    Returns:
        The node argument.

    """
    inputs = {name: catalog.load(name) for name in node.inputs}
    outputs = node.run(inputs)
    for name, data in outputs.items():
        catalog.save(name, data)
    return node
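A hedged usage sketch for run_node as defined above; the greet node and the dataset names are illustrative:

from kedro.io import DataCatalog, MemoryDataSet
from kedro.pipeline import node


def greet(name):
    return f"Hello, {name}!"


catalog = DataCatalog(
    {"name": MemoryDataSet("world"), "greeting": MemoryDataSet()}
)
# Loads "name" from the catalog, runs the node, saves "greeting" back.
run_node(node(greet, inputs="name", outputs="greeting"), catalog)
print(catalog.load("greeting"))  # Hello, world!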
Example No. 9
def validate(parameters: Dict):
    # def validate(dummy2, parameters: Dict):
    import glob, os
    os.chdir(parameters["path_model_input"])
    files = []
    for file in glob.glob("*.csv"):
        files.append(file)
    filenames = []
    wells_data = []
    for file in files:
        filename, extension = file.split('.')
        filenames.append(filename)
    for file, filename in zip(files, filenames):
        io = DataCatalog({
            filename:
            CSVLocalDataSet(filepath=parameters["path_model_input"] + "/" +
                            file),
        })
        well_data = io.load(filename)
        wells_data.append(well_data)
    Raw_Data_preprocessed = []
    for well, file, filename in zip(wells_data, files, filenames):
        well = well[[
            'Date', 'Injector Bottom Hole Pressure', 'Steam Flow Rate - Outer',
            'Bottom Hole Heel Temperature', 'Emulsion Pressure',
            'Producer Bottom Hole Pressure', 'ESP Speed', 'Emulsion Flow Rate'
        ]]
        for i in range(1, len(well.columns)):
            well[well.columns[i]] = pd.to_numeric(well[well.columns[i]],
                                                  errors='coerce')

        well = well.iloc[:1399]
        well = well.fillna(well.rolling(30, min_periods=1).median())
        well = well.fillna(well.median())

        well['Date'] = pd.to_datetime(well['Date'])
        well = well.set_index('Date')
        Raw_Data_preprocessed.append(well)

    os.chdir(parameters["path_val_stats"])
    static_files = []
    for static_file in glob.glob("*.csv"):
        static_files.append(static_file)
    static_filenames = []
    statics_data = []
    for static_file in static_files:
        static_filename, others = static_file.split('_')
        static_filenames.append(static_filename)
    for static_file, static_filename in zip(static_files, static_filenames):
        io = DataCatalog({
            static_filename:
            CSVLocalDataSet(filepath=parameters["path_val_stats"] + "/" +
                            static_file),
        })
        static_data = io.load(static_filename)
        statics_data.append(static_data)
    statics_data_new = []
    well_name_list = []
    for pad_static in statics_data:
        well_name = pad_static['WELLPAIR_NAME'].values
        well_name_list.append(well_name)
        pad_static = pad_static.set_index('WELLPAIR_NAME')
        pad_static = pad_static.drop(columns=['PLAN_NAME', 'HIGH_PRESSURE'])
        statics_data_new.append(pad_static)
    properties = []
    probabilities = []
    asset_names = []
    for pad_static, names in zip(statics_data_new, well_name_list):
        for well in names:
            prob = pad_static.loc[well, 'Forecast_Prob']
            probabilities.append(prob)
            pad_code = pad_static.loc[well, 'PAD_CODE']
            asset_name, pad = pad_code.split('_')
            asset_names.append(asset_name)
            property_ = pad_static.loc[
                well, 'SAGD_PRESSURE':'BOTTOM_WATER_THICKNESS'].values
            properties.append(property_)
    properties = np.array(properties)

    all_wells_input = []
    all_wells_labels = []
    for well_data, file in zip(Raw_Data_preprocessed, files):
        DWT_Aprox_coeff_input = []
        input_data, labels = split(well_data)

        input_columns = list(input_data.columns)
        for data_idx in input_columns:
            signal = well_data[data_idx].values
            thresh = parameters["thresh"] * np.nanmax(signal)
            coeff = pywt.wavedec(signal,
                                 wavelet=parameters["wavelet"],
                                 mode=parameters["mode1"],
                                 level=parameters["level"])
            coeff[1:] = (pywt.threshold(i,
                                        value=thresh,
                                        mode=str(parameters["mode2"]))
                         for i in coeff[1:])
            DWT_Aprox_coeff_input.append(coeff[0])
        DWT_Aprox_coeff_input = pd.DataFrame(
            np.transpose(DWT_Aprox_coeff_input), columns=input_columns)
        data_set_input = CSVLocalDataSet(
            filepath=parameters["path_val_pre_processed"] +
            "/validation_input_DWT_coeffs_" + file)
        data_set_input.save(DWT_Aprox_coeff_input)
        all_wells_input.append(DWT_Aprox_coeff_input.values)
        data_set_labels = CSVLocalDataSet(
            filepath=parameters["path_val_pre_processed"] +
            "/validation_labels_" + file)
        data_set_labels.save(labels)
        all_wells_labels.append(labels.values)

    #     Standardize dynamic data coeffs
    data_set_scaler_coeffs = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/scaler_coeffs.pickle")
    scaler_coeffs = data_set_scaler_coeffs.load()
    all_wells_standardized_input = []
    all_wells_standardized_input_flattened = []
    for well_coeffs in all_wells_input:
        std_coeffs = scaler_coeffs.transform(well_coeffs)
        all_wells_standardized_input.append(std_coeffs)
        transposed_std_coeffs = np.transpose(std_coeffs)
        flattened_std_coeffs = transposed_std_coeffs.flatten()
        all_wells_standardized_input_flattened.append(flattened_std_coeffs)
    all_wells_standardized_input = np.array(all_wells_standardized_input)
    all_wells_standardized_input_flattened = np.array(
        all_wells_standardized_input_flattened)
    input_columns = list(DWT_Aprox_coeff_input.columns)
    for std_coeffs, file in zip(all_wells_standardized_input, files):
        std_coeffs = pd.DataFrame(std_coeffs, columns=input_columns)
        data_set = CSVLocalDataSet(
            filepath=parameters["path_val_pre_processed"] +
            "/validation_std_DWT_input_coeffs_" + file)
        data_set.save(std_coeffs)
#     Standardize static data

    data_set_scaler_static = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/scaler_static.pickle")
    scaler_static = data_set_scaler_static.load()
    all_wells_standardized_properties = scaler_static.fit_transform(properties)
    all_wells_coeffs_reservoir_data = []
    for flattened_std_coeffs, standardized_properties in zip(
            all_wells_standardized_input_flattened,
            all_wells_standardized_properties):
        flattened_std_coeffs = list(flattened_std_coeffs)
        standardized_properties = list(standardized_properties)
        for reservoir_property in standardized_properties:
            flattened_std_coeffs.append(
                reservoir_property
            )  # append reservoir data to dynamic data coeffs
        all_wells_coeffs_reservoir_data.append(flattened_std_coeffs)
    all_wells_coeffs_reservoir_data = np.array(all_wells_coeffs_reservoir_data)

    well_count = np.arange(len(all_wells_coeffs_reservoir_data))
    daily_timesteps = np.arange(len(all_wells_labels[0]))
    input_data = []
    for coeff_inputs in all_wells_coeffs_reservoir_data:
        for time_lapse in daily_timesteps:
            well_inputs = [time_lapse] + list(
                coeff_inputs)  # append time lapse to input data
            input_data.append(well_inputs)
    input_data = np.array(input_data)
    data_set_regressor_1 = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/regressor_1.pickle")
    regressor_1 = data_set_regressor_1.load()
    number_of_wells = len(well_count)
    wells_steam_rate_predicted = regressor_1.predict(input_data)
    wells_steam_rate_predicted = wells_steam_rate_predicted.reshape(
        (number_of_wells, 1399)).T

    # prediction inputs to model 2
    input_data_model_2 = []
    for coeff_inputs, well in zip(all_wells_coeffs_reservoir_data, well_count):
        for time_lapse in daily_timesteps:
            well_inputs = [time_lapse] + list(
                coeff_inputs)  # append time lapse to input data
            well_inputs_model_2 = [
                wells_steam_rate_predicted[time_lapse, well]
            ] + well_inputs
            input_data_model_2.append(well_inputs_model_2)
    input_data_model_2 = np.array(input_data_model_2)

    data_set_regressor_2 = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/regressor_2.pickle")
    regressor_2 = data_set_regressor_2.load()
    wells_emulsion_rate_predicted = regressor_2.predict(input_data_model_2)
    wells_emulsion_rate_predicted = wells_emulsion_rate_predicted.reshape(
        (number_of_wells, 1399)).T

    # actual targets
    all_wells_steam_data = []
    all_wells_emulsion_data = []
    for ID in well_count:
        well_steam_data = all_wells_labels[ID][:, 0]
        well_emulsion_data = all_wells_labels[ID][:, 1]
        all_wells_steam_data = all_wells_steam_data + list(well_steam_data)
        all_wells_emulsion_data = all_wells_emulsion_data + list(
            well_emulsion_data)
    all_wells_steam_data = np.array(all_wells_steam_data)
    all_wells_emulsion_data = np.array(all_wells_emulsion_data)
    wells_steam_rate_actual = all_wells_steam_data.reshape(
        (number_of_wells, 1399)).T
    wells_emulsion_rate_actual = all_wells_emulsion_data.reshape(
        (number_of_wells, 1399)).T

    print("Prediction Performance:\n")
    print("Steam Flow Rate:")
    for well, file in zip(well_count, files):
        steam_rate_predicted = wells_steam_rate_predicted[:, well]
        steam_rate_actual = wells_steam_rate_actual[:, well]
        steam_rate_actual_predicted = pd.DataFrame(
            np.vstack((steam_rate_actual, steam_rate_predicted)).T,
            columns=["steam rate actual", "steam rate predicted"])
        data_set_steam_rate = CSVLocalDataSet(
            filepath=parameters["path_model_output"] + "/steam_rate_" + file)
        data_set_steam_rate.save(steam_rate_actual_predicted)
        print(file + " R_squared: {0:.4f}".format(
            r2_score(steam_rate_actual, steam_rate_predicted)))

    print("\n")
    print("Emulsion Flow Rate:")
    for well, file in zip(well_count, files):
        emulsion_rate_predicted = wells_emulsion_rate_predicted[:, well]
        emulsion_rate_actual = wells_emulsion_rate_actual[:, well]
        emulsion_rate_actual_predicted = pd.DataFrame(
            np.vstack((emulsion_rate_actual, emulsion_rate_predicted)).T,
            columns=["emulsion rate actual", "emulsion rate predicted"])
        data_set_emulsion_rate = CSVLocalDataSet(
            filepath=parameters["path_model_output"] + "/emulsion_rate_" +
            file)
        data_set_emulsion_rate.save(emulsion_rate_actual_predicted)
        print(file + " R_squared: {0:.4f}".format(
            r2_score(emulsion_rate_actual, emulsion_rate_predicted)))

    dummy_validate = files
    return dummy_validate
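The function above packs several stages into one node: per-column wavelet feature extraction, scaling of dynamic and static features, two chained regressors, and per-well R² reporting. Below is a standalone sketch of just the discrete wavelet transform step, with illustrative values ("db4", "symmetric", "soft", level=3, 0.1) standing in for parameters["wavelet"], parameters["mode1"], parameters["mode2"], parameters["level"] and parameters["thresh"]:

import numpy as np
import pywt

# Synthetic stand-in for one input column of well data.
signal = np.sin(np.linspace(0, 8 * np.pi, 1399)) + 0.1 * np.random.randn(1399)
thresh = 0.1 * np.nanmax(signal)

# Multilevel DWT, then soft-threshold the detail coefficients.
coeffs = pywt.wavedec(signal, wavelet="db4", mode="symmetric", level=3)
coeffs[1:] = [pywt.threshold(c, value=thresh, mode="soft") for c in coeffs[1:]]

# The node above keeps only the approximation coefficients for each column.
approx = coeffs[0]
print(approx.shape)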
Example No. 10
    def test_load_from_unregistered(self):
        """Check the error when attempting to load unregistered data set"""
        catalog = DataCatalog(data_sets={})
        pattern = r"DataSet 'test' not found in the catalog"
        with pytest.raises(DataSetNotFoundError, match=pattern):
            catalog.load("test")
Example No. 11
def preprocess_raw_data(parameters: Dict):
    import glob, os
    #     os.chdir(parameters["path_raw"])
    os.chdir(parameters["path_raw_matlab"])
    files = []
    for file in glob.glob("*.csv"):
        files.append(file)
    filenames = []
    wells_data = []
    for file in files:
        filename, extension = file.split('.')
        filenames.append(filename)
    for file, filename in zip(files, filenames):
        io = DataCatalog({
            #                 filename: CSVLocalDataSet(filepath=parameters["path_raw"]+"/"+file),
            filename:
            CSVLocalDataSet(filepath=parameters["path_raw_matlab"] + "/" +
                            file),
        })

        well_data = io.load(filename)
        wells_data.append(well_data)
    Raw_Data_preprocessed = []
    Raw_Data_dated = []
    wells_life = []
    wells_data_ = []
    for well in wells_data:
        #         well = well[['Date', 'Injector Bottom Hole Pressure',
        #                    'Producer Bottom Hole Pressure', 'ESP Speed',
        #                    'Steam Flow Rate - Outer',
        #                    'Emulsion Flow Rate']]

        well = well[[
            'Date', 'Speed [Hz]', 'Current [A]', 'IBHP', 'PBHP',
            'Co-Injection [E3m3/d]', 'Oil [bbl/d]', 'Steam [m3/d]',
            'Emulsion [m3/d]'
        ]]

        for i in range(1, len(well.columns)):
            well[well.columns[i]] = pd.to_numeric(well[well.columns[i]],
                                                  errors='coerce')
        well['Prod_Date'] = pd.to_datetime(well['Date'])
        well = well.set_index('Prod_Date')
        #         well = well.resample('7D').mean()   # weekly data
        #         well = well.resample('30D').mean()   # monthly data
        #         well = well.rolling(30, min_periods=1).mean()

        data = well['Oil [bbl/d]'] / 6.28981
        well.insert(4, 'Oil [m3/d]', data)
        time_data = np.arange(len(well))
        well.insert(0, 'Timestep', time_data)

        wells_life.append(len(well))
        wells_data_.append(well)
    min_well_length = np.min(np.array(wells_life))
    print((min_well_length, np.argmin(np.array(wells_life))))
    timesteps = min_well_length  # 1008
    #     timesteps = 371

    for well, file, filename in zip(wells_data_, files, filenames):
        well = well.iloc[:timesteps]  # daily, weekly, monthly data
        #         well = well.fillna(0)
        #         well = well.fillna(well.rolling(30, min_periods=1).median())
        #         well = well.fillna(well.median())

        well["Well"] = filename  # create a column for well name
        well = well.reset_index(drop=True)  # remove date index
        data_set = CSVLocalDataSet(filepath=parameters["path_intermediate"] +
                                   "/pre_processed_data_" + file)
        data_set.save(well)
        Raw_Data_dated.append(well)
        well = well.drop(columns=['Date', 'Well'])
        Raw_Data_preprocessed.append(well)


#     stats_training = CSVLocalDataSet(filepath=parameters["path_raw_static"]+"/static_P50_data_training.csv")
    stats_training = CSVLocalDataSet(
        filepath=parameters["path_raw_static_matlab"] +
        "/static_P50_data_training_152_wells.csv")

    stats = stats_training.load()
    stats_ROIP = stats.loc[:, 'ROIP']
    stats = stats.loc[:, 'Effective_Length':'BottomWater_Oil_Saturation']

    # #     using only rich geostats and no bottom water properties
    #     stats = stats.loc[:, 'Effective_Length':'Rich_Oil_Saturation']

    # #     Using "Effective_Rich_Pay_Thickness" to account for standoff and rich thickness
    #     data = stats['Rich_Pay_Thickness'] - stats['Stand_Off']
    #     stats.insert(3, 'Effective_Rich_Pay_Thickness', data)
    #     stats = stats.drop(columns = ['Rich_Pay_Thickness', 'Stand_Off'])

    property_names = list(stats.columns)
    properties = list(stats.values)

    properties = np.array(properties)
    return [
        timesteps, Raw_Data_preprocessed, Raw_Data_dated, files, filenames,
        properties, stats_ROIP, property_names
    ]
Example No. 12
def preprocess_raw_data(parameters: Dict):
    import glob, os
    os.chdir(parameters["path_raw"])
    files = []
    for file in glob.glob("*.csv"):
        files.append(file)
    filenames = []
    wells_data = []
    for file in files:
        filename, extension = file.split('.')
        filenames.append(filename)
    for file, filename in zip(files, filenames):
        io = DataCatalog({
            filename:
            CSVLocalDataSet(filepath=parameters["path_raw"] + "/" + file),
        })
        well_data = io.load(filename)
        wells_data.append(well_data)
    Raw_Data_preprocessed = []
    Raw_Data_dated = []
    for well, file, filename in zip(wells_data, files, filenames):
        well = well[[
            'Date', 'Injector Bottom Hole Pressure', 'Steam Flow Rate - Outer',
            'Bottom Hole Heel Temperature', 'Emulsion Pressure',
            'Producer Bottom Hole Pressure', 'ESP Speed', 'Emulsion Flow Rate'
        ]]
        for i in range(1, len(well.columns)):
            well[well.columns[i]] = pd.to_numeric(well[well.columns[i]],
                                                  errors='coerce')

        well = well.iloc[:1399]
        well = well.fillna(well.rolling(30, min_periods=1).median())
        well = well.fillna(well.median())

        well_dated = well.copy()
        well_dated["Well"] = filename  # create a column for well name
        data_set = CSVLocalDataSet(filepath=parameters["path_intermediate"] +
                                   "/pre_processed_data_" + file)
        data_set.save(well_dated)
        Raw_Data_dated.append(well_dated)

        well['Date'] = pd.to_datetime(well['Date'])
        well = well.set_index('Date')
        Raw_Data_preprocessed.append(well)

    os.chdir(parameters["path_raw_static"])
    static_files = []
    for static_file in glob.glob("*.csv"):
        static_files.append(static_file)
    static_filenames = []
    statics_data = []
    for static_file in static_files:
        static_filename, others = static_file.split('_')
        static_filenames.append(static_filename)
    for static_file, static_filename in zip(static_files, static_filenames):
        io = DataCatalog({
            static_filename:
            CSVLocalDataSet(filepath=parameters["path_raw_static"] + "/" +
                            static_file),
        })
        static_data = io.load(static_filename)
        statics_data.append(static_data)
    statics_data_new = []
    well_name_list = []
    for pad_static in statics_data:
        well_name = pad_static['WELLPAIR_NAME'].values
        well_name_list.append(well_name)
        pad_static = pad_static.set_index('WELLPAIR_NAME')
        pad_static = pad_static.drop(columns=['PLAN_NAME', 'HIGH_PRESSURE'])
        statics_data_new.append(pad_static)
    properties = []
    probabilities = []
    asset_names = []
    for pad_static, names in zip(statics_data_new, well_name_list):
        for well in names:
            prob = pad_static.loc[well, 'Forecast_Prob']
            probabilities.append(prob)
            pad_code = pad_static.loc[well, 'PAD_CODE']
            asset_name, pad = pad_code.split('_')
            asset_names.append(asset_name)
            property_ = pad_static.loc[
                well, 'SAGD_PRESSURE':'BOTTOM_WATER_THICKNESS'].values
            properties.append(property_)
    properties = np.array(properties)
    return [
        Raw_Data_preprocessed, Raw_Data_dated, files, filenames, probabilities,
        asset_names, properties
    ]
Example No. 13
from kedro.io import DataCatalog
from kedro.extras.datasets.pandas import CSVDataSet

io = DataCatalog({"titanic_training_data": CSVDataSet(filepath="train.csv")})

# Load your file and print the output
df = io.load("titanic_training_data")
print(df.head())
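Saving works symmetrically: register a dataset for the output and call save. A hedged follow-up sketch (the "titanic_predictions" entry and the predictions.csv path are illustrative):

# Register an output dataset and write a DataFrame back to disk.
io.add("titanic_predictions", CSVDataSet(filepath="predictions.csv"))
io.save("titanic_predictions", df.head(10))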
Example No. 14
def load_well_validation_data(dummy2, timesteps, parameters: Dict):
    import glob, os
    os.chdir(parameters["path_model_input"])
    files_val = []
    for file in glob.glob("*.csv"):
        files_val.append(file)
    filenames_val = []
    wells_data = []
    for file in files_val:
        filename, extension = file.split('.')
        filenames_val.append(filename)
    for file, filename in zip(files_val, filenames_val):
        io = DataCatalog({
            filename:
            CSVLocalDataSet(filepath=parameters["path_model_input"] + "/" +
                            file),
        })
        well_data = io.load(filename)
        wells_data.append(well_data)
    Raw_Data_preprocessed_val = []
    wells_life = []
    wells_data_ = []
    for well in wells_data:
        #         well = well[['Date', 'Injector Bottom Hole Pressure',
        #                    'Producer Bottom Hole Pressure', 'ESP Speed',
        #                    'Steam Flow Rate - Outer',
        #                    'Emulsion Flow Rate']]

        well = well[[
            'Date', 'Speed [Hz]', 'Current [A]', 'IBHP', 'PBHP',
            'Co-Injection [E3m3/d]', 'Oil [bbl/d]', 'Steam [m3/d]',
            'Emulsion [m3/d]'
        ]]

        for i in range(1, len(well.columns)):
            well[well.columns[i]] = pd.to_numeric(well[well.columns[i]],
                                                  errors='coerce')
        well['Prod_Date'] = pd.to_datetime(well['Date'])
        well = well.set_index('Prod_Date')
        #         well = well.dropna(axis=0)   # may change
        #         well = well.resample('7D').mean()   # weekly data
        #         well = well.resample('30D').mean()   # monthly data
        #         well = well.rolling(30, min_periods=1).mean()

        data = well['Oil [bbl/d]'] / 6.28981
        well.insert(4, 'Oil [m3/d]', data)
        time_data = np.arange(len(well))
        well.insert(0, 'Timestep', time_data)

        wells_life.append(len(well))
        wells_data_.append(well)
    min_well_length = np.min(np.array(wells_life))
    if min_well_length < timesteps:
        timesteps_validation = min_well_length
    else:
        timesteps_validation = timesteps

    for well, file, filename in zip(wells_data_, files_val, filenames_val):
        well = well.iloc[:timesteps_validation]  # daily data
        #         well = well.fillna(0)
        #         well = well.fillna(well.rolling(30, min_periods=1).median())
        #         well = well.fillna(well.median())
        Raw_Data_preprocessed_val.append(well)

    stats_validation = CSVLocalDataSet(filepath=parameters["path_val_stats"] +
                                       "/static_P50_data_validation.csv")
    stats_val = stats_validation.load()
    stats_val_ROIP = stats_val.loc[:, 'ROIP']
    stats_val = stats_val.loc[:,
                              'Effective_Length':'BottomWater_Oil_Saturation']

    # #     using only rich geostats and no bottom water properties
    #     stats_val = stats_val.loc[:, 'Effective_Length':'Rich_Oil_Saturation']

    # #     Using "Effective_Rich_Pay_Thickness" to account for standoff and rich thickness
    #     data = stats_val['Rich_Pay_Thickness'] - stats_val['Stand_Off']
    #     stats_val.insert(3, 'Effective_Rich_Pay_Thickness', data)
    #     stats_val = stats_val.drop(columns = ['Rich_Pay_Thickness', 'Stand_Off'])

    property_names_val = list(stats_val.columns)
    properties_val = list(stats_val.values)

    #     properties_val = stats.loc[:, ['Effective_Length', 'Spacing', 'Effective_Rich_Pay_Thickness', 'Non_Rich_Pay_Thickness',
    #                               'Rich_Vertical_Permeability','Non_Rich_Vertical_Permeability', 'Rich_Porosity',
    #                                       'Non_Rich_Porosity', 'Rich_Oil_Saturation', 'Non_Rich_Oil_Saturation']].values
    properties_val = np.array(properties_val)
    dummy11 = files_val
    return [
        dummy11, timesteps_validation, Raw_Data_preprocessed_val, files_val,
        filenames_val, properties_val, stats_val_ROIP, property_names_val
    ]
Example No. 15
def preprocess_raw_data(parameters: Dict):
    import glob, os
    os.chdir(parameters["path_raw"])
    files = []
    for file in glob.glob("*.csv"):
        files.append(file)
    filenames = []
    wells_data = []
    for file in files:
        filename, extension = file.split('.')
        filenames.append(filename)
    for file, filename in zip(files, filenames):
        io = DataCatalog({
            filename:
            CSVLocalDataSet(filepath=parameters["path_raw"] + "/" + file),
        })
        well_data = io.load(filename)
        wells_data.append(well_data)
    Raw_Data_preprocessed = []
    Raw_Data_dated = []
    wells_life = []
    wells_data_ = []
    for well in wells_data:
        #         well = well[['Date', 'Injector Bottom Hole Pressure', 'Steam Flow Rate - Outer',
        #                                     'Bottom Hole Heel Temperature', 'Emulsion Pressure', 'Producer Bottom Hole Pressure',
        #                                     'ESP Current', 'Emulsion Flow Rate']]

        #         well = well[['Date', 'Injector Bottom Hole Pressure', 'Steam Flow Rate - Outer', 'Emulsion Flow Rate']]

        well = well[[
            'Date', 'IBHP', 'PBHP', 'Steam [m3/d]', 'Emulsion [m3/d]'
        ]]

        for i in range(1, len(well.columns)):
            well[well.columns[i]] = pd.to_numeric(well[well.columns[i]],
                                                  errors='coerce')
        well['Date'] = pd.to_datetime(well['Date'])
        well = well.set_index('Date')
        #         well = well.resample('7D').mean()   # weekly data
        #         well = well.resample('30D').mean()   # monthly data
        wells_life.append(len(well))
        wells_data_.append(well)
    min_well_length = np.min(np.array(wells_life))
    timesteps = 983

    for well, file, filename in zip(wells_data_, files, filenames):
        #         well = well.iloc[:min_well_length]     # use minimum well life
        well = well.iloc[:timesteps]  # daily, weekly, monthly data
        #         well = well.fillna(0)
        well = well.fillna(well.rolling(30, min_periods=1).median())
        well = well.fillna(well.median())

        well_dated = well.copy()
        well_dated["Well"] = filename  # create a column for well name
        data_set = CSVLocalDataSet(filepath=parameters["path_intermediate"] +
                                   "/pre_processed_data_" + file)
        data_set.save(well_dated)
        Raw_Data_dated.append(well_dated)
        Raw_Data_preprocessed.append(well)

    os.chdir(parameters["path_raw_static"])
    static_files = []
    for static_file in glob.glob("*.csv"):
        static_files.append(static_file)
    static_filenames = []
    statics_data = []
    for static_file in static_files:
        static_filename, others = static_file.split('_')
        static_filenames.append(static_filename)
    for static_file, static_filename in zip(static_files, static_filenames):
        io = DataCatalog({
            static_filename:
            CSVLocalDataSet(filepath=parameters["path_raw_static"] + "/" +
                            static_file),
        })
        static_data = io.load(static_filename)
        statics_data.append(static_data)
    statics_data_new = []
    well_name_list = []
    for pad_static in statics_data:
        well_name = pad_static['WELLPAIR_NAME'].values
        well_name_list.append(well_name)
        pad_static = pad_static.set_index('WELLPAIR_NAME')
        pad_static = pad_static.drop(columns=['PLAN_NAME', 'HIGH_PRESSURE'])
        statics_data_new.append(pad_static)
    properties = []
    probabilities = []
    asset_names = []
    for pad_static, names in zip(statics_data_new, well_name_list):
        for well in names:
            prob = pad_static.loc[well, 'Forecast_Prob']
            probabilities.append(prob)
            pad_code = pad_static.loc[well, 'PAD_CODE']
            asset_name, pad = pad_code.split('_')
            asset_names.append(asset_name)
            property_ = pad_static.loc[
                well, 'Effective_Length':'BottomWater_Oil_Saturation'].values
            properties.append(property_)
    properties = np.array(properties)
    return [
        timesteps, Raw_Data_preprocessed, Raw_Data_dated, files, filenames,
        probabilities, asset_names, properties
    ]