def rms(self, s, axis=None, dtype=None, out=None, keepdims=np._NoValue, initial=np._NoValue):
    r"""
    Root Mean Square

    One of the most important basic features that can be extracted directly from the
    time-domain signal is the RMS, which describes the energy of the signal. It is defined
    as the square root of the average squared value of the signal and can also be called
    the normalized energy of the signal.

    $RMS = \sqrt{\frac{1}{n}\sum_{i=0}^{n-1}s_{i}^{2}}$

    Especially in vibration analysis the RMS is used to perform fault detection, i.e.
    triggering an alarm whenever the RMS surpasses a level that depends on the size of the
    machine, the nature of the signal (for instance velocity or acceleration), the position
    of the accelerometer, and so on. After the detection of the existence of a failure,
    fault diagnosis is performed relying on more sophisticated features. For instance the
    ISO 2372 (VDI 2056) norms define three different velocity RMS alarm levels for four
    different machine classes divided by power and foundations of the rotating machines.

    RMS of array elements over a given axis.

    **Parameters**

    * **s:** (2d array_like) Elements to get RMS. It is first normalised with
    `Utils.check_dataset_shape`.
    * **axis:** (None or int or tuple of ints, optional) Axis or axes along which the RMS is
    computed. The default, axis=None, computes the RMS of all the elements of the input
    array. If axis is negative it counts from the last to the first axis. If axis is a tuple
    of ints, the RMS is computed over all of the axes specified in the tuple.
    * **dtype:** (dtype, optional) Type of the accumulator in which the squared elements are
    summed (forwarded to `numpy.sum`).
    * **out:** (ndarray, optional) Forwarded to `numpy.sum`: it receives the intermediate
    *sum of squares*. The RMS itself is returned as a new object.
    * **keepdims:** (bool, optional) If this is set to True, the axes which are reduced are
    left in the result as dimensions with size one (forwarded to `numpy.sum`).
    * **initial:** (scalar, optional) Starting value for the sum of squares. It is added to
    the sum but is not counted in the divisor $n$.

    **Returns**

    * **RMS_along_axis:** (ndarray or scalar) RMS values with the specified axis removed.
    If `axis` is None a scalar is returned.

    ## Snippet code

    ```python
    >>> from rackio_AI import RackioAIFE
    >>> feature_extraction = RackioAIFE()
    >>> feature_extraction.stats.rms(np.array([0.5, 1.5]))
    1.118033988749895
    >>> feature_extraction.stats.rms(np.array([0.5, 0.7, 0.2, 1.5]), dtype=np.int32)
    0.7071067811865476
    >>> feature_extraction.stats.rms(np.array([[3, 4], [0, 0]]))
    2.5
    >>> feature_extraction.stats.rms(np.array([[0, 1], [0, 5]]), axis=0)
    array([0.        , 3.60555128])
    >>> feature_extraction.stats.rms(np.array([[0, 1], [0, 5]]), axis=1)
    array([0.70710678, 3.53553391])

    ```

    You can also start the sum with a value other than zero:

    ```python
    >>> feature_extraction.stats.rms(np.array([2, 7, 10]), initial=5)
    7.2571803523590805

    ```
    """
    s = Utils.check_dataset_shape(s)

    # The mean must be taken over exactly the elements that were summed. Dividing by
    # s.shape[0] unconditionally is only right when the reduction runs over axis 0.
    if axis is None:
        n_reduced = s.size
    else:
        reduced_axes = axis if isinstance(axis, tuple) else (axis,)
        n_reduced = 1
        for ax in reduced_axes:
            n_reduced *= s.shape[ax]

    sum_of_squares = np.sum(
        s**2, axis=axis, dtype=dtype, out=out, keepdims=keepdims, initial=initial
    )

    return (sum_of_squares / n_reduced)**0.5
def peak(self, s, ref=None, axis=0, rate=None, **kwargs):
    r"""
    Peak value of a signal relative to a reference level.

    If we consider only the maximum amplitude relative to a reference level $s_{ref}$,
    we get the peak value

    $peak = \max\left(s_{i}-ref\right)$

    Often the peak is used in conjunction with other statistical parameters, for instance
    the peak-to-average rate

    $rate = \frac{\max\left(s_{i}-ref\right)}{\frac{1}{N}\sum_{i=0}^{N-1}s_{i}}$

    or the peak-to-median rate.

    **Parameters**

    * **s:** (array_like) Signal(s). It is first normalised with `Utils.check_dataset_shape`
    and then indexed as a 2d array.
    * **ref:** (scalar or array_like, optional) Reference level subtracted from `s` before
    taking the maximum. If None (default), the first row `s[0, :]` is used as reference.
    * **axis:** (int, default=0) Axis along which the maximum is taken.
    * **rate:** (str, optional) If None (default) the plain peak is returned. `'average'`
    returns the peak divided by `self.mean(s, **kwargs)`; `'median'` returns the peak
    divided by `self.median(s, **kwargs)`. The comparison is case-insensitive.
    * **kwargs:** Extra keyword arguments forwarded to `self.mean` / `self.median`.

    **Returns**

    * **peak:** (ndarray) Peak value, or peak-to-average / peak-to-median rate.

    **Raises**

    * **ValueError:** If `rate` is neither None, `'average'` nor `'median'`.

    ## Snippet code

    ```python
    >>> from scipy.stats import norm
    >>> from rackio_AI import RackioAIFE
    >>> feature_extraction = RackioAIFE()
    >>> s = norm.rvs(size=1000, random_state=3)
    >>> feature_extraction.stats.peak(s)
    array([1.91382976])
    >>> s = norm.rvs(size=(1000,2), random_state=3)
    >>> feature_extraction.stats.peak(s)
    array([1.0232499 , 3.26594839])

    ```
    """
    s = Utils.check_dataset_shape(s)

    # Identity test on purpose: `ref == None` is evaluated element-wise when ref is an
    # array, and truth-testing that result raises for more than one element.
    reference = s[0, :] if ref is None else ref
    _peak = np.max(s - reference, axis=axis)

    if rate is None:
        return _peak

    rate_kind = rate.lower()

    if rate_kind == 'average':
        return _peak / self.mean(s, **kwargs)

    if rate_kind == 'median':
        return _peak / self.median(s, **kwargs)

    # An unknown rate used to fall through and silently return None.
    raise ValueError(
        "rate must be None, 'average' or 'median', got {!r}".format(rate)
    )
def skew(self, s, axis=0, bias=True, nan_policy='propagate'):
    r"""
    Compute the sample skewness of a data set.

    For normally distributed data, the skewness should be about zero. For unimodal
    continuous distributions, a skewness value greater than zero means that there is more
    weight in the right tail of the distribution. The function `skewtest` can be used to
    determine if the skewness value is close enough to zero, statistically speaking.

    **Parameters**

    * **s:** (ndarray) Input array. It is first normalised with `Utils.check_dataset_shape`.
    * **axis:** (int or None, optional) Axis along which skewness is calculated. Default is 0.
    If None, compute over the whole array `s`.
    * **bias:** (bool, optional) If False, then the calculations are corrected for
    statistical bias.
    * **nan_policy:** ({'propagate', 'raise', 'omit'}, optional) Defines how to handle when
    input contains nan. The following options are available (default is 'propagate'):

        * 'propagate': returns nan
        * 'raise': throws an error
        * 'omit': performs the calculations ignoring nan values

    **Returns**

    * **skewness:** (ndarray) The skewness of values along an axis, returning 0 where all
    values are equal.

    ## Notes

    The sample skewness is computed as the Fisher-Pearson coefficient of skewness, i.e.

    $g_1=\frac{m_3}{m_2^{3/2}}$

    where

    $m_i=\frac{1}{N}\sum_{n=1}^N(x[n]-\bar{x})^i$

    is the biased sample $i\texttt{th}$ central moment, and $\bar{x}$ is the sample mean.
    If $bias$ is False, the calculations are corrected for bias and the value computed is
    the adjusted Fisher-Pearson standardized moment coefficient, i.e.

    $G_1=\frac{k_3}{k_2^{3/2}}=\frac{\sqrt{N(N-1)}}{N-2}\frac{m_3}{m_2^{3/2}}.$

    ## References

    .. [1] Zwillinger, D. and Kokoska, S. (2000). CRC Standard Probability and Statistics
    Tables and Formulae. Chapman & Hall: New York. 2000. Section 2.2.24.1

    ## Snippet code

    ```python
    >>> import numpy as np
    >>> from rackio_AI import RackioAIFE
    >>> feature_extraction = RackioAIFE()
    >>> s = np.array([1, 2, 3, 4, 5])
    >>> feature_extraction.stats.skew(s)
    array([0.])
    >>> s = np.array([2, 8, 0, 4, 1, 9, 9, 0])
    >>> feature_extraction.stats.skew(s)
    array([0.26505541])

    ```
    """
    dataset = Utils.check_dataset_shape(s)

    # Inside the method body `skew` resolves to the module-level function, not this method.
    skewness = skew(dataset, axis=axis, bias=bias, nan_policy=nan_policy)

    return skewness
def std(self, s, axis=None, dtype=None, out=None, ddof=0, keepdims=np._NoValue):
    r"""
    Compute the standard deviation along the specified axis.

    Returns the standard deviation, a measure of the spread of a distribution, of the array
    elements. The standard deviation is computed for the flattened array by default,
    otherwise over the specified axis.

    **Parameters**

    * **s:** (2d array_like) Calculate the standard deviation of these values. It is first
    normalised with `Utils.check_dataset_shape`.
    * **axis:** (None or int or tuple of ints, optional) Axis or axes along which the
    standard deviation is computed. The default is to compute the standard deviation of the
    flattened array. If this is a tuple of ints, a standard deviation is performed over
    multiple axes, instead of a single axis or all the axes as before.
    * **dtype:** (dtype, optional) Type to use in computing the standard deviation. For
    arrays of integer type the default is float64, for arrays of float types it is the same
    as the array type.
    * **out:** (ndarray, optional) Alternative output array in which to place the result. It
    must have the same shape as the expected output but the type (of the calculated values)
    will be cast if necessary.
    * **ddof:** (int, optional) Means Delta Degrees of Freedom. The divisor used in
    calculations is $N - ddof$, where $N$ represents the number of elements. By default
    `ddof` is zero.
    * **keepdims:** (bool, optional) If this is set to True, the axes which are reduced are
    left in the result as dimensions with size one. With this option, the result will
    broadcast correctly against the input array. If the default value is passed, then
    `keepdims` will not be passed through to the `std` method of sub-classes of `ndarray`,
    however any non-default value will be. If the sub-class' method does not implement
    `keepdims` any exceptions will be raised.

    **Returns**

    * **standard_deviation:** (ndarray) If `out` is None, return a new array containing the
    standard deviation, otherwise return a reference to the output array.

    ## Notes

    The standard deviation is the square root of the average of the squared deviations from
    the mean, i.e.

    $\mu = \frac{1}{N}\sum_{i=1}^{n}s_{i}$

    $std = \sqrt{\frac{1}{N}\sum_{i=1}^{n}|s_{i}-\mu|^2}$

    ## Snippet code

    ```python
    >>> import numpy as np
    >>> from rackio_AI import RackioAIFE
    >>> feature_extraction = RackioAIFE()
    >>> s = np.array([[1, 2], [3, 4]])
    >>> feature_extraction.stats.std(s, axis=0)
    array([1., 1.])
    >>> feature_extraction.stats.std(s, axis=1)
    array([0.5, 0.5])

    ```

    ### In single precision, std() can be inaccurate

    ```python
    >>> s = np.zeros((2, 512*512), dtype=np.float32)
    >>> s[0, :] = 1.0
    >>> s[1, :] = 0.1
    >>> feature_extraction.stats.std(s)
    0.45000005
    >>> s = np.array([[14, 8, 11, 10], [7, 9, 10, 11], [10, 15, 5, 10]])
    >>> feature_extraction.stats.std(s)
    2.614064523559687

    ```
    """
    s = Utils.check_dataset_shape(s)

    # `out` must receive the caller's output array. It was previously wired to `dtype`,
    # which ignored `out` and broke any call that passed a dtype.
    return np.std(s, axis=axis, dtype=dtype, out=out, ddof=ddof, keepdims=keepdims)
def load(self, pathname: str, ext: str = ".tpl", reset_index=False, **kwargs):
    """
    Load data into DataFrame format:

    * **.tpl:** Is an [OLGA](https://www.petromehras.com/petroleum-software-directory/production-engineering-software/olga-dynamic-multiphase-flow-simulator)
    extension file.
    * **.pkl:** Numpy arrays or Pandas.DataFrame saved in pickle format.

    ___
    **Parameters**

    * **:param pathname:** (str) Filename or directory.
        * If the *pathname* is a directory, it will load all the files with extension *ext*.
        * If the *pathname* is a filename, it will load the file with a supported extension.
    * **:param ext:** (str) filename extension, it's necessary if pathname is a directory.
    Extensions supported are:
        * *.tpl* [OLGA](https://www.petromehras.com/petroleum-software-directory/production-engineering-software/olga-dynamic-multiphase-flow-simulator)
        extension file.
        * *.xls*
        * *.xlsx*
        * *.xlsm*
        * *.xlsb*
        * *.odf*
        * *.ods*
        * *.odt*
        * *.csv*
        * *.pkl* (Only if the pkl saved is a DataFrame)
    * **:param reset_index:** (bool, default=False) If True the index of the loaded data is
    replaced by a fresh one (`reset_index(drop=True)`). The index is also replaced whenever
    it contains duplicates.
    * **:param kwargs:** Extra keyword arguments forwarded to `self.reader.read`.

    **:return:**

    * **data:** (pandas.DataFrame) The loaded data. It is also stored in `self._data`, and
    its column names in `self.columns_name`.

    ___
    ## Snippet code

    ```python
    >>> import os
    >>> from rackio_AI import RackioAI, get_directory
    >>> filename = os.path.join(get_directory('Leak'), 'Leak01.tpl')
    >>> df = RackioAI.load(filename)
    >>> print(df.head())
    tag      TIME_SERIES PT_SECTION_BRANCH_TUBERIA_PIPE_Pipe60_NR_1  ... CONTR_CONTROLLER_CONTROL_FUGA     file
    variable                                               Pressure  ...             Controller_output filename
    unit               S                                         PA  ...                                   .tpl
    0           0.000000                                   568097.3  ...                           0.0   Leak01
    1           0.502732                                   568098.2  ...                           0.0   Leak01
    2           1.232772                                   568783.2  ...                           0.0   Leak01
    3           1.653696                                   569367.3  ...                           0.0   Leak01
    4           2.200430                                   569933.5  ...                           0.0   Leak01
    <BLANKLINE>
    [5 rows x 12 columns]

    **Example loading a directory with .tpl files**

    >>> directory = os.path.join(get_directory('Leak'))
    >>> df = RackioAI.load(directory)
    >>> print(df.head())
    tag      TIME_SERIES PT_SECTION_BRANCH_TUBERIA_PIPE_Pipe60_NR_1  ... CONTR_CONTROLLER_CONTROL_FUGA     file
    variable                                               Pressure  ...             Controller_output filename
    unit               S                                         PA  ...                                   .tpl
    0           0.000000                                   568097.3  ...                           0.0   Leak01
    1           0.502732                                   568098.2  ...                           0.0   Leak01
    2           1.232772                                   568783.2  ...                           0.0   Leak01
    3           1.653696                                   569367.3  ...                           0.0   Leak01
    4           2.200430                                   569933.5  ...                           0.0   Leak01
    <BLANKLINE>
    [5 rows x 12 columns]

    **Example loading a directory with .csv files**

    >>> directory = os.path.join(get_directory('csv'), "Hysys")
    >>> df = RackioAI.load(directory, ext=".csv", _format="hysys")
    >>> print(df.head())
      (Time, [seconds]) (PIC-118 - PV, [kPa]) (PIC-118 - OP, [%]) (SPRDSHT-1 - Cell Matrix (G-16), []) (UIC-101 - OP, [%])
    1                 0               294.769                  42                              37.6105                  10
    2               0.3               294.769                  42                              37.6105                  10
    3               0.6               294.769                  42                              37.6105                  10
    4               0.9               294.769                  42                              37.6105                  10
    5               1.1               294.769                  42                              37.6105                  10

    >>> directory = os.path.join(get_directory('csv'), "VMGSim")
    >>> df = RackioAI.load(directory, ext=".csv", _format="vmgsim")
    >>> print(df.head())
      (time, s) (/Bed-1.In.MoleFlow, kmol/h) (/Bed-1.In.P, kPa)  ... (/Sep2.In.P, kPa) (/Sep3.In.P, kPa) (/Tail_Gas.In.T, C)
    1         1                  2072.582713        285.9299038  ...       315.8859771       291.4325134                 159
    2         2                  2081.622826        286.9027793  ...       315.8953772       292.3627861                 159
    3         3                   2085.98973        287.5966429  ...       316.0995398       293.0376745                 159
    4         4                  2089.323383        288.1380485  ...       316.3974799       293.5708836                 159
    5         5                  2092.214077         288.591646  ...       316.7350299       294.0200778                 159
    <BLANKLINE>
    [5 rows x 16 columns]

    **Example loading a .pkl with pandas.dataFrame**

    >>> filename = os.path.join(get_directory('pkl_files'), 'test_data.pkl')
    >>> df = RackioAI.load(filename)
    >>> print(df.head())
       Pipe-60 Totalmassflow_(KG/S)  Pipe-151 Totalmassflow_(KG/S)  Pipe-60 Pressure_(PA)  Pipe-151 Pressure_(PA)
    0                      37.83052                       37.83052               568097.3                352683.3
    1                      37.83918                       37.70243               568098.2                353449.8
    2                      37.83237                       37.67011               568783.2                353587.3
    3                      37.80707                       37.67344               569367.3                353654.8
    4                      37.76957                       37.69019               569933.5                353706.8

    ```
    """
    filename, ext = Utils.check_path(pathname, ext=ext)
    frame = self.reader.read(filename, ext=ext, **kwargs)
    self.columns_name = Utils.get_column_names(frame)

    # Rebuild the index when it is ambiguous (duplicated labels) or when the caller
    # explicitly asks for it; the old index is discarded, not kept as a column.
    if frame.index.has_duplicates or reset_index:
        frame = frame.reset_index(drop=True)

    self.columns_name = Utils.get_column_names(frame)
    self._data = frame

    return frame
def split(self, *arrays, **options):
    """
    Split arrays or matrices into train and test (and optionally validation) subsets

    **Parameters**

    * **:*arrays:** (sequence of indexables with the same length / shape[0]) Allowed inputs
    are lists, numpy arrays, scipy-sparse matrices or pandas DataFrame. DataFrames are
    converted to their underlying `.values` before splitting.
    * **:train_size:** (float or int, default=None) If float, should be between 0.0 and 1.0
    and represent the proportion of the dataset to include in the train split. If int,
    represents the absolute number of train samples. If None, the value is automatically
    set to the complement of the test size.
    * **:test_size:** (float or int, default=None) If float, should be between 0.0 and 1.0
    and represent the proportion of the dataset to include in the test split. If int,
    represents the absolute number of test samples. If None, the value is set to the
    complement of the train size. If *train_size* is also None, it will be set to 0.30.
    * **:validation_size:** (float or int, default=None) If float, should be between 0.0 and
    1.0 and represent the proportion of the dataset to include in the validation split. If
    int, represents the absolute number of validation samples. If None, no validation
    subset is produced. The value is stored in `self.validation_size`.
    * **:random_state:** (int or RandomState instance, default=None) Controls the shuffling
    applied to the data before applying split. Pass an int for reproducible output across
    multiple function calls.
    See [Glossary](https://scikit-learn.org/stable/glossary.html#term-random-state)
    * **:shuffle:** (bool, default=False) Whether or not to shuffle the data before
    splitting. If shuffle=False then stratify must be None.
    * **:stratify:** (array-like, default=None) If not None, data is split in a stratified
    fashion, using this as the class labels.

    A train/test split is performed when any of *train_size*, *test_size* or
    *validation_size* is None, or when *train_size + test_size* already adds up to 1;
    otherwise a train/test/validation split is performed.

    **:return:**

    * **splitting:** (list) train-test split of inputs (length=2 * len(arrays)) or
    train-test-validation split of inputs (length=3 * len(arrays)).

    ___
    ## Snippet code

    ```python
    >>> from rackio_AI import RackioAI
    >>> import numpy as np
    >>> preprocess = RackioAI.get("Preprocessing", _type="Preprocessing")
    >>> X, y = np.arange(20).reshape((10, 2)), range(10)
    >>> X
    array([[ 0,  1],
           [ 2,  3],
           [ 4,  5],
           [ 6,  7],
           [ 8,  9],
           [10, 11],
           [12, 13],
           [14, 15],
           [16, 17],
           [18, 19]])
    >>> list(y)
    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

    ```

    ## Snippet code 2

    ```python
    >>> X_train, X_test, X_validation, y_train, y_test, y_validation = preprocess.splitter.split(X, y, train_size=0.6, test_size=0.2, validation_size=0.2, random_state=0)
    >>> X_train
    array([[ 0,  1],
           [ 2,  3],
           [ 4,  5],
           [ 6,  7],
           [ 8,  9],
           [10, 11]])
    >>> X_test
    array([[12, 13],
           [14, 15]])
    >>> X_validation
    array([[16, 17],
           [18, 19]])
    >>> y_train
    [0, 1, 2, 3, 4, 5]
    >>> y_test
    [6, 7]
    >>> y_validation
    [8, 9]

    ```

    ## Snippet code 3

    ```python
    >>> X_train, X_test, y_train, y_test = preprocess.splitter.split(X, y, train_size=0.6, test_size=0.4, random_state=0)
    >>> X_train
    array([[ 0,  1],
           [ 2,  3],
           [ 4,  5],
           [ 6,  7],
           [ 8,  9],
           [10, 11]])
    >>> X_test
    array([[12, 13],
           [14, 15],
           [16, 17],
           [18, 19]])
    >>> y_train
    [0, 1, 2, 3, 4, 5]
    >>> y_test
    [6, 7, 8, 9]

    ```
    """
    # Function-scope import: the file's import block is not visible from this method.
    import math

    default_options = {
        'train_size': None,
        'test_size': None,
        'validation_size': None,
        'random_state': None,
        'shuffle': False,
        'stratify': None
    }

    data = [
        array.values if isinstance(array, pd.DataFrame) else array
        for array in arrays
    ]

    options = Utils.check_default_kwargs(default_options, options)

    # validation_size is removed from the options forwarded to the private splitter and
    # kept on the instance instead.
    self.validation_size = options.pop('validation_size')
    sizes = (options['train_size'], options['test_size'], self.validation_size)

    # Tolerant comparison: fractions such as 0.7 + 0.3 do not sum to exactly 1.0 in
    # floating point, which used to send them down the three-way split path.
    only_train_test = any(size is None for size in sizes) or math.isclose(
        options['train_size'] + options['test_size'], 1
    )

    if only_train_test:

        return self.__split(TRAIN_TEST_SPLIT, *data, **options)

    return self.__split(TRAIN_TEST_VALIDATION_SPLIT, *data, **options)
def split_sequences(
    self,
    df: pd.DataFrame,
    timesteps,
    stepsize: int = 1,
    input_cols: list = None,
    output_cols: list = None,
    maxlen=None,
    dtype: str = 'int32',
    padding: str = 'pre',
    truncating: str = 'pre',
    value: float = 0.
):
    """
    Splits a dataframe into the 3D numpy array format supported by LSTM architectures,
    using the sliding window concept.

    **Parameters**

    * **:param df:** (pandas.DataFrame) Contains inputs and outputs data
    * **:param timesteps:** (list or int) Timestep for each input variable.
        * If timesteps is an int value, all input columns will use the same timestep
        * If timesteps is a list, it must be the same length as the input_cols argument
    * **:param stepsize:** (int, default = 1) step size for the sliding window
    * **:param input_cols:** (list, default = None) Column names that represent the input
    variables to LSTM
        * If input_cols is None (or empty) the method assumes that inputs are all columns
        except the last one.
    * **:param output_cols:** (list, default = None) Column names that represent the output
    variables to LSTM
        * If output_cols is None (or empty) the method assumes that output is the last column.

    The rest of the parameters (*maxlen*, *dtype*, *padding*, *truncating*, *value*) are the
    parameters for the *pad_sequences* method, see its description.

    **Raises**

    * **ValueError:** If *timesteps* is a list whose length differs from *input_cols*.

    **returns**

    * **sequences:** (tuple of 3D numpy arrays) `(x_sequences, y_sequences)` with shapes
    (windows, max(timesteps), len(input_cols)) and (windows, 1, len(output_cols)), where
    windows is the number of window start positions
    `range(0, df.shape[0] - max(timesteps) + stepsize, stepsize)`. Both arrays are also
    stored on the instance as `self.x_sequences` and `self.y_sequences`.

    ```python
    >>> import numpy as np
    >>> import pandas as pd
    >>> from rackio_AI import RackioAI
    >>> a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90]).reshape(-1,1)
    >>> b = np.array([15, 25, 35, 45, 55, 65, 75, 85, 95]).reshape(-1,1)
    >>> c = np.array([a[i]+b[i] for i in range(len(a))]).reshape(-1,1)
    >>> data = np.hstack((a,b,c))
    >>> data
    array([[ 10,  15,  25],
           [ 20,  25,  45],
           [ 30,  35,  65],
           [ 40,  45,  85],
           [ 50,  55, 105],
           [ 60,  65, 125],
           [ 70,  75, 145],
           [ 80,  85, 165],
           [ 90,  95, 185]])
    >>> df = pd.DataFrame(data, columns=['a', 'b', 'c'])
    >>> preprocess = RackioAI.get("Preprocessing", _type="Preprocessing")
    >>> x, y = preprocess.lstm_data_preparation.split_sequences(df, 2)
    >>> x.shape
    (8, 2, 2)
    >>> x
    array([[[10., 15.],
            [20., 25.]],
    <BLANKLINE>
           [[20., 25.],
            [30., 35.]],
    <BLANKLINE>
           [[30., 35.],
            [40., 45.]],
    <BLANKLINE>
           [[40., 45.],
            [50., 55.]],
    <BLANKLINE>
           [[50., 55.],
            [60., 65.]],
    <BLANKLINE>
           [[60., 65.],
            [70., 75.]],
    <BLANKLINE>
           [[70., 75.],
            [80., 85.]],
    <BLANKLINE>
           [[80., 85.],
            [90., 95.]]])
    >>> y.shape
    (8, 1, 1)
    >>> y
    array([[[ 45.]],
    <BLANKLINE>
           [[ 65.]],
    <BLANKLINE>
           [[ 85.]],
    <BLANKLINE>
           [[105.]],
    <BLANKLINE>
           [[125.]],
    <BLANKLINE>
           [[145.]],
    <BLANKLINE>
           [[165.]],
    <BLANKLINE>
           [[185.]]])

    ```
    """
    # Column defaults: every column but the last feeds the model, the last is the target.
    if not input_cols:
        input_cols = Utils.get_column_names(df)[:-1]

    if not output_cols:
        output_cols = [Utils.get_column_names(df)[-1]]

    # A scalar timestep is broadcast to one entry per input column.
    if not isinstance(timesteps, list):
        timesteps = [timesteps] * len(input_cols)
    elif len(timesteps) != len(input_cols):
        raise ValueError(
            'timesteps and input_cols arguments must be same length')

    input_data = df.loc[:, input_cols].values
    output_data = df.loc[:, output_cols].values

    # The widest window fixes both the sequence length and the last valid start row.
    window = max(timesteps)
    iteration = list(range(0, input_data.shape[0] - window + stepsize, stepsize))
    n_windows = len(iteration)

    # Zero-initialised buffers; they are handed to the private helper through `self`.
    self.x_sequences = np.zeros((n_windows, window, len(input_cols)))
    self.y_sequences = np.zeros((n_windows, 1, len(output_cols)))
    self.start = 0

    self.__split_sequences(
        iteration,
        output_data=output_data,
        input_data=input_data,
        timesteps=timesteps,
        maxlen=maxlen,
        dtype=dtype,
        padding=padding,
        truncating=truncating,
        value=value
    )

    return self.x_sequences, self.y_sequences