Пример #1
0
    def fit(self, df, method: str = "min_max", columns: list = None, **kwargs):
        r"""
        Build the scaler registered under *method* and fit it on *df*.

        **Parameters**

        * **:param df:** data to fit. It must expose ``.values`` and, when
        *columns* is given, label-based selection (``df[columns]``).
        * **:param method:** (str) key of ``self.methods``; matched
        case-insensitively.
        * **:param columns:** (list) column names to fit on.
            * If None or empty the whole *df* is used.
        * **:param kwargs:** forwarded unchanged to the scaler constructor.

        **returns**

        * Whatever the ``fit`` method of the selected scaler returns.

        **raises**

        * **TypeError:** if *method* is not a key of ``self.methods``.
        """
        method_key = method.lower()

        if method_key not in self.methods:

            # The lookup table lives on the instance; a bare ``methods`` name
            # would raise NameError instead of the intended TypeError.
            raise TypeError(
                "{} method not available, availables methods: {}".format(
                    method, self.methods.keys()))

        # A fresh list per call: a shared default list would be stored on the
        # instance and could leak between calls.
        columns = [] if columns is None else columns

        self.__scaler = self.methods[method_key](**kwargs)
        self.__columns = columns

        if not columns:

            column_name = Utils.get_column_names(df)
            if len(column_name) == 1:
                # Single feature: hand the scaler a 2-D column vector.
                df = df.values.reshape(-1, 1)
            return self.__scaler.fit(df)

        if len(columns) == 1:
            # Fit on the requested column only. Reshaping the whole frame
            # would mix every column into one long feature. 1-D input (e.g. a
            # Series) has nothing to select from and is reshaped as before.
            selected = df[columns] if getattr(df, "ndim", 1) > 1 else df
            return self.__scaler.fit(selected.values.reshape(-1, 1))

        return self.__scaler.fit(df[columns])
Пример #2
0
    def detect(self,
               df: pd.DataFrame,
               win_size: int = 30,
               step: int = 1,
               conf: float = 0.95,
               cols: list = None) -> pd.DataFrame:
        """
        Run outlier detection over sliding windows of a copy of *df*.

        The input is copied into ``self._df_``, ``self.detected`` is reset,
        the windows are built with ``Utils().get_windows`` and the per-column
        work is delegated to a private first-step helper. According to the
        original documentation the detected outliers are imputed; that logic
        lives in the helper and is not visible in this method.

        **Parameters**

        * **:param df:** (pandas.DataFrame) data to inspect; it is copied first
        * **:param win_size:** (int) window size passed to the window builder
        * **:param step:** (int) step between consecutive windows
        * **:param conf:** (float) accepted for interface compatibility; it is
        not read in this method
        * **:param cols:** (list) columns to process
            * If None or empty, all column names reported by *Utils* are used

        **returns**

        * **df:** (pandas.DataFrame) the working copy held in ``self._df_``

        ______
        ### **Snippet code**

        ```python
        >>> import matplotlib.pyplot as plt
        >>> from rackio_AI import Outliers
        >>> df = pd.DataFrame(np.random.randn(1000,2), columns=["a", "b"])
        >>> out = Outliers()
        >>> df = out.add(df, percent=1)
        >>> df_imputed = out.detect(df, win_size=30)
        >>> ax = plt.plot(df["a"], '-r', df["b"], '-b', out.outliers["a"]["locs"], out.outliers["a"]["values"], 'rD', out.outliers["b"]["locs"], out.outliers["b"]["values"], 'bo', out.detected["a"]["locs"], out.detected["a"]["values"], 'kD', out.detected["b"]["locs"], out.detected["b"]["values"], 'ko')
        >>> ax = plt.legend(["a", "b", "a outliers", "b outliers", "a dectected", "b detected"])
        >>> plt.show()

        ```
        ![Detect Outlier](../img/impute_outliers.png)

        """
        # Work on a private copy so the caller's frame is left alone here.
        self._df_ = df.copy()
        self.detected = {}

        target_cols = cols if cols else Utils.get_column_names(self._df_)

        self._serie_list_ = Utils().get_windows(self._df_, win_size, step=step)

        self.__first_step_detect(target_cols, win_size=win_size, step=step)

        return self._df_
Пример #3
0
    def add(self,
            df: pd.DataFrame,
            percent: float = 5,
            method: str = "tf",
            cols: list = None):
        """
        Inject outlier values into a copy of *df*.

        ``self.outliers`` is reset, the input is copied into ``self._df_`` and
        the per-column work is delegated to a private first-step helper, which
        receives *percent* and *method* as keyword options.

        **Parameters**

        * **:param df:** (pandas.DataFrame) Data to add outliers to
        * **:param percent:** (float) outliers percent
        * **:param method:** (str) custom function name to calculate outlier
            * "tf": tukey-fence method
        * **:param cols:** (list) column names to add outliers, default None
            * If None or empty, outliers will be added to all columns

        **returns**

        * **df:** (pandas.DataFrame) the working copy held in ``self._df_``

        ______
        ### **Snippet code**

        ```python
        >>> import matplotlib.pyplot as plt
        >>> from rackio_AI import Outliers
        >>> df = pd.DataFrame(np.random.randn(100,2), columns=["a", "b"])
        >>> out = Outliers()
        >>> df = out.add(df)
        >>> ax = plt.plot(df["a"], '-r', df["b"], '-b', out.outliers["a"]["locs"], out.outliers["a"]["values"], 'rD', out.outliers["b"]["locs"], out.outliers["b"]["values"], 'bD')
        >>> ax = plt.legend(["a", "b", "a outliers", "b outliers"])
        >>> plt.show()

        ```
        ![Add Outlier](../img/add_outliers.png)

        """
        self.outliers = {}
        self._df_ = df.copy()

        # Column names are taken from the caller's frame, not from the copy.
        target_cols = cols if cols else Utils.get_column_names(df)

        self.__first_step_add(target_cols, percent=percent, method=method)

        return self._df_
Пример #4
0
    def inverse(self, df):
        r"""
        Undo the scaling applied by the fitted scaler.

        **Parameters**

        * **:param df:** data to transform back. A pandas.DataFrame is
        re-wrapped after the transformation; any other input is passed through
        to the scaler as is.

        **returns**

        * For DataFrame input: a new pandas.DataFrame built from the scaler
        output, using the column names reported by *Utils* for the input.
        * Otherwise: the raw result of the scaler's ``inverse_transform``.
        """
        if not isinstance(df, pd.DataFrame):

            return self.__scaler.inverse_transform(df)

        names = Utils.get_column_names(df)
        restored = self.__scaler.inverse_transform(df)

        return pd.DataFrame(restored, columns=names)
Пример #5
0
    def add(self,
            df: pd.DataFrame,
            win_size: int = 30,
            method: str = "rhinehardt",
            cols: list = None,
            std_factor: float = 0.001) -> pd.DataFrame:
        """
        Add gaussian noise over subsequence windows of a copy of *df*.

        The input is copied into ``self._df_`` and the per-column work is
        delegated to a private first-step helper, which receives *win_size*,
        *method* and *std_factor* as keyword options.

        **Parameters**

        * **:param df:** (pandas.DataFrame)
        * **:param win_size:** (int) window size to apply gaussian noise
        * **:param method:** (str) method to base gaussian noise
            * *rhinehardt* or *rh*
        * **:param cols:** (list) column names to add gaussian noise.
            * If None or empty, all column names reported by *Utils* are used
        * **:param std_factor:** (float) forwarded to the noise helper;
        presumably scales the noise standard deviation (not visible here)

        **returns**

        * **df** (pandas.DataFrame) the working copy held in ``self._df_``

        ______
        ### **Snippet code**

        ```python
        >>> import matplotlib.pyplot as plt
        >>> from rackio_AI import Noise 
        >>> df = pd.DataFrame(np.random.randn(100,2), columns=["a", "b"])
        >>> noise = Noise()
        >>> df_noisy = noise.add(df, win_size=10)
        >>> ax = plt.plot(df.index, df["a"], '-r', df.index, df["b"], '-b', df_noisy.index, df_noisy["a"], '--r', df_noisy.index, df_noisy["b"], '--b')
        >>> ax = plt.legend(["a", "b", "noisy a", "noisy b"])
        >>> plt.show()

        ```
        ![Add rhinehardt noise](../img/rhinehardt_noise.png)
        """
        self._df_ = df.copy()

        target_cols = cols if cols else Utils.get_column_names(self._df_)

        self.__first_step_add(target_cols,
                              win_size=win_size,
                              method=method,
                              std_factor=std_factor)

        return self._df_
Пример #6
0
    def data(self):
        """
        Return the loaded data, refreshing the cached column names first.

        Every access recomputes ``self.columns_name`` from ``self._data``
        through *Utils* before handing the stored object back.

        **Parameters**

        None

        **:return:**

        * **data:** (pandas.DataFrame) the object stored in ``self._data``

        """
        names = Utils.get_column_names(self._data)
        self.columns_name = names

        return self._data
Пример #7
0
    def __change_columns(self, column_name):
        """
        Copy one column from ``self._data_`` into ``self.data``.

        Nothing happens when *column_name* is not among the column names that
        *Utils* reports for ``self.data``. Per the original documentation this
        is the per-item step of *change_columns* and is run behind a
        progress-bar decorator (the decorator is not visible here).

        **Parameters**

        * **:param column_name:** column label to look up in ``self.data``
        and to copy from ``self._data_``

        **returns**

        None
        """
        if column_name not in Utils.get_column_names(self.data):

            return

        source_values = self._data_.loc[:, column_name]
        self.data.loc[:, column_name] = source_values

        return
Пример #8
0
    def data(self, value):
        """
        Store *value* as the loaded data.

        * On first assignment (no ``_data`` attribute yet) the column names
        are read from *value* through *Utils* and *value* is stored as is.
        * A numpy array assigned later is wrapped in a DataFrame that reuses
        the current ``self.columns_name``.
        * A DataFrame assigned later is stored as is; when the data held so
        far has MultiIndex columns, ``self.columns_name`` is first rebuilt as
        a MultiIndex with levels ``tag``, ``variable`` and ``unit``.

        **Parameters**

        * **:param value:** (pd.DataFrame or np.ndarray)

        **:return:**

        None

        **raises**

        * **TypeError:** if *value* is neither a DataFrame nor an ndarray
        """
        if not isinstance(value, (pd.DataFrame, np.ndarray)):

            raise TypeError('value must be a pd.DataFrame or np.ndarray')

        if not hasattr(self, '_data'):

            # First assignment: learn the column names from the value itself.
            self.columns_name = Utils.get_column_names(value)
            self._data = value
            return

        if isinstance(value, np.ndarray):

            self._data = pd.DataFrame(value, columns=self.columns_name)
            return

        if isinstance(self._data.columns, pd.MultiIndex):

            level_names = ['tag', 'variable', 'unit']
            self.columns_name = pd.MultiIndex.from_tuples(self.columns_name,
                                                          names=level_names)

        self._data = value
Пример #9
0
    def __best(self, _iterator, **kwargs):
        """
        Score one ``(win_size, step)`` candidate and record the result.

        Runs ``self.detect`` with the candidate, then, for every column,
        compares the detected locations against the known outlier locations
        (``self.outliers``) and computes the area under the precision-recall
        curve. One dict per call is appended to ``self.optimizer_result``.
        Per the original documentation this is the per-item step of
        *best_win_size_step*, run behind a progress-bar decorator (the
        decorator is not visible here).

        **Parameters**

        * **:param _iterator:** pair ``(win_size, step)`` to evaluate
        * **:param kwargs:** must contain ``df``, the DataFrame to run the
        detection on

        **returns**

        None
        """
        frame = kwargs['df']
        win_size, step = _iterator
        self.detect(frame, win_size=win_size, step=step)

        scores = {}

        for name in Utils.get_column_names(frame):

            row_index = frame[name].index

            # Binary indicator per row: 1 where an outlier was detected.
            predicted = pd.Series(0, index=row_index)
            predicted.loc[predicted.index.isin(
                self.detected[name]['locs'])] = 1

            # Binary indicator per row: 1 where an outlier is known to be.
            actual = pd.Series(0, index=row_index)
            actual.loc[actual.index.isin(self.outliers[name]['locs'])] = 1

            precision, recall, _ = precision_recall_curve(
                actual.values, predicted.values)

            scores[name] = {
                "win_size": win_size,
                "step": step,
                "auc": auc(recall, precision)
            }

        self.optimizer_result.append(scores)

        return
Пример #10
0
    def fixnan(self,
               df: pd.DataFrame,
               key: str = "median",
               neighbors: int = 3,
               _round: bool = False,
               down: bool = False,
               decimals: int = 5) -> pd.DataFrame:
        """
        Fix nan values in dataframe columns using a key function.

        The key is validated first (case-insensitively). *df* is then stored
        in ``self._df_`` without copying, and the work is delegated to two
        private helpers: a first step that receives the column names and all
        options, and a last step that receives the keys collected in
        ``self._dict_nonan_``.

        **Parameters**

        * **:param df:** (pandas.DataFrame)
        * **:param key:** (str) Function's name to fix nan
            * *median*
            * *mean*
            * *std*
            * *var*
        * **:param neighbors:** (int) neighbors values to apply key function
        * **:param _round:** (bool)
            * If True the value fixed is rounded
        * **:param down:** (bool) round down if *_round* is True otherwise round up
        * **:param decimals:** (int): If *_round* is True, the value is rounded with
        these decimals

        **returns**

        * **df** (pandas.DataFrame) the object held in ``self._df_`` after
        both helper steps have run

        **raises**

        * **TypeError:** if *key* is not one of the supported function names
        ___

        ### **Snippet code**

        ```python
        >>> import pandas as pd
        >>> import numpy as np
        >>> from rackio_AI import RackioAI
        >>> EDA = RackioAI.get(name="EDA core", _type='EDA')
        >>> df = pd.DataFrame(np.random.randn(10, 3), index=['a', 'b', 'c', 'd', 'f', 'g', 'h', 'i', 'j', 'k'], columns=['one', 'two', 'three'])
        >>> df2 = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k'])
        >>> df_fixed = EDA.fixnan(df2, _round=True)

        ```
        """
        supported = ["median", "mean", "std", "var"]

        if key.lower() not in supported:

            message = "{} is key not valid, use: ['median', 'mean', 'std', 'var']"
            raise TypeError(message.format(key))

        self._dict_nonan_ = {}
        self._df_ = df

        columns = Utils.get_column_names(df)

        # The key is forwarded exactly as given (not lower-cased).
        self.__first_step_fixnan(columns,
                                 key=key,
                                 neighbors=neighbors,
                                 _round=_round,
                                 down=down,
                                 decimals=decimals)

        self.__last_step_fixnan(list(self._dict_nonan_.keys()))

        return self._df_
Пример #11
0
    def load(self,
             pathname: str,
             ext: str = ".tpl",
             reset_index=False,
             **kwargs):
        """
        Load data from a file or a directory into a DataFrame.

        The path is resolved with ``Utils.check_path``, the content is read by
        ``self.reader.read`` and the result is stored in ``self._data``;
        ``self.columns_name`` is refreshed from the loaded data.

        * **.tpl:** Is an [OLGA](https://www.petromehras.com/petroleum-software-directory/production-engineering-software/olga-dynamic-multiphase-flow-simulator)
        extension file.
        * **.pkl:** Numpy arrays or Pandas.DataFrame saved in pickle format.

        ___
        **Parameters**

        * **:param pathname:** (str) Filename or directory.
            * If the *pathname* is a directory, it will load all the files with extension *ext*.
            * If the *pathname* is a filename, it will load the file with a supported extension.
        * **:param ext:** (str) filename extension, it's necessary if pathname is a directory.
        Extensions supported are:
            * *.tpl*  [OLGA](https://www.petromehras.com/petroleum-software-directory/production-engineering-software/olga-dynamic-multiphase-flow-simulator)
        extension file.
            * *.xls*
            * *.xlsx*
            * *.xlsm*
            * *.xlsb*
            * *.odf*
            * *.ods*
            * *.odt*
            * *.csv*
            * *.pkl* (Only if the pkl saved is a DataFrame)
        * **:param reset_index:** (bool) if True the index of the loaded data
        is replaced by a default one. The same happens automatically whenever
        the loaded index contains duplicates.
        * **:param kwargs:** forwarded unchanged to the reader

        **:return:**

        * **data:** (pandas.DataFrame)

        ___
        ## Snippet code

        ```python
        >>> import os
        >>> from rackio_AI import RackioAI, get_directory
        >>> filename = os.path.join(get_directory('Leak'), 'Leak01.tpl')
        >>> df = RackioAI.load(filename)
        >>> print(df.head())
        tag      TIME_SERIES PT_SECTION_BRANCH_TUBERIA_PIPE_Pipe60_NR_1  ... CONTR_CONTROLLER_CONTROL_FUGA     file
        variable                                               Pressure  ...             Controller_output filename
        unit               S                                         PA  ...                                   .tpl
        0           0.000000                                   568097.3  ...                           0.0   Leak01
        1           0.502732                                   568098.2  ...                           0.0   Leak01
        2           1.232772                                   568783.2  ...                           0.0   Leak01
        3           1.653696                                   569367.3  ...                           0.0   Leak01
        4           2.200430                                   569933.5  ...                           0.0   Leak01
        <BLANKLINE>
        [5 rows x 12 columns]

        **Example loading a directory with .tpl files**

        >>> directory = os.path.join(get_directory('Leak'))
        >>> df = RackioAI.load(directory)
        >>> print(df.head())
        tag      TIME_SERIES PT_SECTION_BRANCH_TUBERIA_PIPE_Pipe60_NR_1  ... CONTR_CONTROLLER_CONTROL_FUGA     file
        variable                                               Pressure  ...             Controller_output filename
        unit               S                                         PA  ...                                   .tpl
        0           0.000000                                   568097.3  ...                           0.0   Leak01
        1           0.502732                                   568098.2  ...                           0.0   Leak01
        2           1.232772                                   568783.2  ...                           0.0   Leak01
        3           1.653696                                   569367.3  ...                           0.0   Leak01
        4           2.200430                                   569933.5  ...                           0.0   Leak01
        <BLANKLINE>
        [5 rows x 12 columns]

        **Example loading a directory with .csv files**

        >>> directory = os.path.join(get_directory('csv'), "Hysys")
        >>> df = RackioAI.load(directory, ext=".csv", _format="hysys")
        >>> print(df.head())
          (Time, [seconds]) (PIC-118 - PV, [kPa]) (PIC-118 - OP, [%]) (SPRDSHT-1 - Cell Matrix (G-16), []) (UIC-101 - OP, [%])
        1                 0               294.769                  42                              37.6105                  10
        2               0.3               294.769                  42                              37.6105                  10
        3               0.6               294.769                  42                              37.6105                  10
        4               0.9               294.769                  42                              37.6105                  10
        5               1.1               294.769                  42                              37.6105                  10

        >>> directory = os.path.join(get_directory('csv'), "VMGSim")
        >>> df = RackioAI.load(directory, ext=".csv", _format="vmgsim")
        >>> print(df.head())
          (time, s) (/Bed-1.In.MoleFlow, kmol/h) (/Bed-1.In.P, kPa)  ... (/Sep2.In.P, kPa) (/Sep3.In.P, kPa) (/Tail_Gas.In.T, C)
        1         1                  2072.582713        285.9299038  ...       315.8859771       291.4325134                 159
        2         2                  2081.622826        286.9027793  ...       315.8953772       292.3627861                 159
        3         3                   2085.98973        287.5966429  ...       316.0995398       293.0376745                 159
        4         4                  2089.323383        288.1380485  ...       316.3974799       293.5708836                 159
        5         5                  2092.214077         288.591646  ...       316.7350299       294.0200778                 159
        <BLANKLINE>
        [5 rows x 16 columns]

        **Example loading a .pkl with pandas.dataFrame**

        >>> filename = os.path.join(get_directory('pkl_files'), 'test_data.pkl')
        >>> df = RackioAI.load(filename)
        >>> print(df.head())
           Pipe-60 Totalmassflow_(KG/S)  Pipe-151 Totalmassflow_(KG/S)  Pipe-60 Pressure_(PA)  Pipe-151 Pressure_(PA)
        0                      37.83052                       37.83052               568097.3                352683.3
        1                      37.83918                       37.70243               568098.2                353449.8
        2                      37.83237                       37.67011               568783.2                353587.3
        3                      37.80707                       37.67344               569367.3                353654.8
        4                      37.76957                       37.69019               569933.5                353706.8

        ```
        """
        resolved_path, resolved_ext = Utils.check_path(pathname, ext=ext)

        loaded = self.reader.read(resolved_path, ext=resolved_ext, **kwargs)

        self.columns_name = Utils.get_column_names(loaded)

        # A duplicated index is always replaced; otherwise only on request.
        if loaded.index.has_duplicates or reset_index:

            loaded = loaded.reset_index(drop=True)

        self.columns_name = Utils.get_column_names(loaded)
        self._data = loaded

        return loaded
Пример #12
0
    def split_sequences(self,
                        df: pd.DataFrame,
                        timesteps,
                        stepsize: int = 1,
                        input_cols: list = None,
                        output_cols: list = None,
                        maxlen=None,
                        dtype: str = 'int32',
                        padding: str = 'pre',
                        truncating: str = 'pre',
                        value: float = 0.):
        """
        Splits dataframe in a 3D numpy array format supported by LSTM architectures using sliding windows concept.

        This method resolves the column selection, normalises *timesteps*,
        allocates the zero-filled result arrays ``self.x_sequences`` and
        ``self.y_sequences`` and then delegates the filling of each window to
        a private helper.

        **Parameters**

        * **:param df:** (pandas.DataFrame) Contains inputs and outputs data
        * **:param timesteps:** (list or int) Timestep for each input variable.
            * If timestep is an int value, all input columns will be the same timestep
            * If timestep is a list, must be same length that input_cols argument
        * **:param stepsize:** (int, default = 1) step size for the sliding window
        * **:param input_cols:** (list, default = None) Column names that represents the input variables to LSTM
            * If input_cols is None the method assumes that inputs are all column except the last one.
        * **:param output_cols:** (list, default = None) Column names that represents the output variables to LSTM
            * If output_cols is None the method assumes that output is the last column.

        The rest of parameters represent the parameters for *pad_sequences* method, see its description.

        **returns**

        * **x_sequences** (3D numpy array) dimensions (windows, max(timesteps), number of input columns)
        * **y_sequences** (3D numpy array) dimensions (windows, 1, number of output columns)

        **raises**

        * **ValueError:** if *timesteps* is a list whose length differs from
        the number of input columns

        ```python
        >>> import numpy as np
        >>> from rackio_AI import RackioAI
        >>> a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90]).reshape(-1,1)
        >>> b = np.array([15, 25, 35, 45, 55, 65, 75, 85, 95]).reshape(-1,1)
        >>> c = np.array([a[i]+b[i] for i in range(len(a))]).reshape(-1,1)
        >>> data = np.hstack((a,b,c))
        >>> data
        array([[ 10,  15,  25],
               [ 20,  25,  45],
               [ 30,  35,  65],
               [ 40,  45,  85],
               [ 50,  55, 105],
               [ 60,  65, 125],
               [ 70,  75, 145],
               [ 80,  85, 165],
               [ 90,  95, 185]])
        >>> df = pd.DataFrame(data, columns=['a', 'b', 'c'])
        >>> preprocess = RackioAI.get("Preprocessing", _type="Preprocessing")
        >>> x, y = preprocess.lstm_data_preparation.split_sequences(df, 2)
        >>> x.shape
        (8, 2, 2)
        >>> x
        array([[[10., 15.],
                [20., 25.]],
        <BLANKLINE>
               [[20., 25.],
                [30., 35.]],
        <BLANKLINE>
               [[30., 35.],
                [40., 45.]],
        <BLANKLINE>
               [[40., 45.],
                [50., 55.]],
        <BLANKLINE>
               [[50., 55.],
                [60., 65.]],
        <BLANKLINE>
               [[60., 65.],
                [70., 75.]],
        <BLANKLINE>
               [[70., 75.],
                [80., 85.]],
        <BLANKLINE>
               [[80., 85.],
                [90., 95.]]])
        >>> y.shape
        (8, 1, 1)
        >>> y
        array([[[ 45.]],
        <BLANKLINE>
               [[ 65.]],
        <BLANKLINE>
               [[ 85.]],
        <BLANKLINE>
               [[105.]],
        <BLANKLINE>
               [[125.]],
        <BLANKLINE>
               [[145.]],
        <BLANKLINE>
               [[165.]],
        <BLANKLINE>
               [[185.]]])

        ```
        """

        if not input_cols:

            # Default inputs: every column except the last one.
            input_cols = Utils.get_column_names(df)[:-1]

        if not output_cols:

            # Default output: the last column only.
            output_cols = [Utils.get_column_names(df)[-1]]

        if not isinstance(timesteps, list):

            # A scalar timestep applies to every input column.
            timesteps = [timesteps] * len(input_cols)

        elif len(timesteps) != len(input_cols):

            raise ValueError(
                'timesteps and input_cols arguments must be same length')

        input_data = df.loc[:, input_cols].values
        output_data = df.loc[:, output_cols].values

        longest = max(timesteps)
        last_start = input_data.shape[0] - longest + stepsize
        iteration = list(range(0, last_start, stepsize))
        n_windows = len(iteration)

        # Zero-filled containers; the private helper writes into them.
        self.x_sequences = np.zeros((n_windows, longest, len(input_cols)))
        self.y_sequences = np.zeros((n_windows, 1, len(output_cols)))

        self.start = 0

        self.__split_sequences(iteration,
                               output_data=output_data,
                               input_data=input_data,
                               timesteps=timesteps,
                               maxlen=maxlen,
                               dtype=dtype,
                               padding=padding,
                               truncating=truncating,
                               value=value)

        return self.x_sequences, self.y_sequences