def predict_er(X, E, window=0.21, step=10, q=5, use_box_cox=True):
    """Compute the rolling relative error of a Huber regression, ordered by ``E``.

    A ``HuberRegressor`` is fitted on ``(X, E)``. The samples are sorted by
    ``E``, ``E`` is mapped through a ``QuantileTransformer`` (optionally after
    taking its logarithm) and the relative residual ``(E - E_pred) / E`` is
    handed to ``rolling_window_er`` together with ``window`` and ``step``.
    The x values returned by ``rolling_window_er`` are mapped back to the
    original scale of ``E`` before returning.

    :param X: feature array; must support indexing with an integer index array.
    :param E: 1-D target array. With ``use_box_cox=True`` its logarithm is
        taken, so values are presumably expected to be positive.
    :param window: forwarded to ``rolling_window_er``.
    :param step: forwarded to ``rolling_window_er``.
    :param q: number of quantiles used by the ``QuantileTransformer``.
    :param use_box_cox: if true, apply ``np.log`` before the quantile
        transformation and ``np.exp`` after inverting it.
    :return: tuple ``(x, y)`` as produced by ``rolling_window_er``, with ``x``
        converted back to the scale of ``E``.
    """
    qt = QuantileTransformer(n_quantiles=q, random_state=0)
    lr = HuberRegressor()
    lr.fit(X, E)

    # Sort the samples by target value, then predict once on the sorted
    # features (predicting before sorting and again afterwards is redundant).
    idx_sorted = np.argsort(E)
    X = X[idx_sorted]
    E = E[idx_sorted]
    E_pred = lr.predict(X)

    # log + quantile transformation so that the data lies uniformly in [0, 1]
    E_scaled = np.log(E) if use_box_cox else E
    E_quantile = qt.fit_transform(E_scaled.reshape(-1, 1)).reshape(-1)

    x, y = rolling_window_er(E_quantile, (E - E_pred) / E, window=window, step=step)

    # Undo the quantile transformation (and the log, if it was applied).
    x = qt.inverse_transform(x.reshape(-1, 1)).reshape(-1)
    if use_box_cox:
        x = np.exp(x)
    return x, y
def fit(self):
    """Cross-validate an OLS model on a quantile-normalised target.

    The target column ``self.target`` of ``self.train_df`` is mapped to a
    normal distribution with a ``QuantileTransformer``. For each of 10
    shuffled folds an ``sm.OLS`` model is fitted on ``self.convert_x`` of the
    training row positions; its predictions for the held-out rows and for the
    test row positions are mapped back to the original target scale.

    :return: tuple ``(test_preds, oof_preds, res)`` where ``test_preds`` is the
        fold-averaged test prediction, ``oof_preds`` the out-of-fold
        prediction for every training row and ``res`` the fitted result of
        the last fold.
    """
    n_train = len(self.train_df)
    splitter = KFold(n_splits=10, random_state=11, shuffle=True)

    target_column = self.train_df[self.target]
    oof_preds = np.zeros((n_train))

    qt = QuantileTransformer(output_distribution='normal', random_state=11)
    qt.fit(target_column.values.reshape((-1, 1)))
    y = qt.transform(target_column.values.reshape((-1, 1)))

    # The regressors are derived from the row positions: training rows come
    # first, test rows continue the numbering.
    X = np.asarray(list(range(0, n_train)))
    X_test = np.asarray(list(range(n_train, n_train + len(self.test_df))))
    X_test = self.convert_x(X_test)

    def _to_target_scale(pred):
        # Map predictions from the normalised space back to the raw target.
        return qt.inverse_transform(pred.reshape((-1, 1))).ravel()

    fold_test_preds = []
    for tr_idx, val_idx in splitter.split(X):
        design_tr = self.convert_x(X[tr_idx])
        design_val = self.convert_x(X[val_idx])
        res = sm.OLS(y[tr_idx], design_tr).fit()
        oof_preds[val_idx] = _to_target_scale(res.predict(design_val))
        fold_test_preds.append(_to_target_scale(res.predict(X_test)))

    test_preds = np.mean(fold_test_preds, axis=0)
    return test_preds, oof_preds, res
def test_transform_default_params():
    """Compare QuantileTransformerTF (default arguments) with sklearn.

    Fits an sklearn ``QuantileTransformer`` on four random columns, checks
    its own round trip, and then checks that the TensorFlow wrapper yields
    the same forward values and the same round trip.
    """
    n_rows = 1000
    rng = np.random.RandomState(22922)
    # The list order fixes the order of the RNG draws.
    columns = [
        rng.lognormal(10, 5, n_rows),
        rng.uniform(-10, 0, n_rows),
        rng.normal(10, 10, n_rows),
        rng.normal(-1, 1, n_rows),
    ]
    data = np.stack(columns, axis=1)

    sk_transformer = QuantileTransformer(output_distribution="normal",
                                         random_state=3434)
    sk_forward = sk_transformer.fit_transform(data)
    sk_roundtrip = sk_transformer.inverse_transform(sk_forward)
    np.testing.assert_allclose(data, sk_roundtrip)

    tf_transformer = QuantileTransformerTF(sk_transformer)
    # Second positional argument: False for the forward pass, True for the
    # pass back (presumably an "inverse" flag -- confirm against the wrapper).
    forward_op = tf_transformer.transform(data.astype(np.float64), False)
    roundtrip_op = tf_transformer.transform(forward_op, True)

    with tf.Session() as session:
        tf_forward, tf_roundtrip = session.run([forward_op, roundtrip_op])

    np.testing.assert_allclose(sk_forward, tf_forward)
    np.testing.assert_allclose(data, tf_roundtrip)
class GaussRankScaler(object):
    """
    So-called "Gauss Rank" scaling.
    Forces a transformation, uses bins to perform inverse mapping.

    Uses sklearn QuantileTransformer to work. All methods accept an array
    that is reshaped to a single column before being handed to the
    transformer, and return a flat 1-D array.
    """

    def __init__(self):
        self.transformer = QuantileTransformer(output_distribution='normal')

    def fit(self, x):
        """Fit the underlying transformer on ``x``.

        :param x: array whose values are treated as one feature column.
        :return: self, so that calls can be chained.
        """
        self.transformer.fit(x.reshape(-1, 1))
        return self

    def transform(self, x):
        """Map ``x`` with the fitted transformer and return a 1-D array."""
        column = x.reshape(-1, 1)
        return self.transformer.transform(column).reshape(-1)

    def inverse_transform(self, x):
        """Map transformed values back and return a 1-D array."""
        column = x.reshape(-1, 1)
        return self.transformer.inverse_transform(column).reshape(-1)

    def fit_transform(self, x):
        """Fit on ``x`` and return its transformed values as a 1-D array."""
        return self.fit(x).transform(x)
def test_transform():
    """Compare QuantileTransformerTF on a column subset with sklearn.

    The sklearn transformer is fitted on four columns; the TensorFlow
    wrapper is asked for columns ``[1, 2, 3]`` only and is fed
    ``data[:, 1:]``, so its output must match the last three sklearn columns.
    """
    n_rows = 10000
    rng = np.random.RandomState(223532)
    # Drawn first so the RNG sequence matches; repeated four times below to
    # build a column with duplicated values.
    repeated_part = rng.normal(0, 1, n_rows // 4)
    columns = [
        rng.uniform(-10, 10, n_rows),
        rng.lognormal(10, 5, n_rows),
        np.concatenate([repeated_part] * 4),
        rng.normal(-1, 1, n_rows),
    ]
    data = np.stack(columns, axis=1)

    sk_transformer = QuantileTransformer(output_distribution="normal",
                                         random_state=34214)
    sk_forward = sk_transformer.fit_transform(data)
    sk_roundtrip = sk_transformer.inverse_transform(sk_forward)
    np.testing.assert_allclose(data, sk_roundtrip)

    # To test that QuantileTransformerTF picks up the right columns
    # we ask it only for [1, 2, 3] columns and when testing use data[:, 1:]
    subset = data[:, 1:]
    tf_transformer = QuantileTransformerTF(sk_transformer, [1, 2, 3],
                                           dtype=np.float64)
    forward_op = tf_transformer.transform(subset.astype(np.float64), False)
    roundtrip_op = tf_transformer.inverse_transform(forward_op)

    with tf.Session() as session:
        tf_forward, tf_roundtrip = session.run([forward_op, roundtrip_op])

    np.testing.assert_allclose(sk_forward[:, 1:], tf_forward)
    np.testing.assert_allclose(subset, tf_roundtrip)
class Distributed:
    """Named scaling strategy with a forward and a backward mapping.

    ``my_name`` selects the behaviour:

    * ``'Nothing'`` - data is copied unchanged;
    * ``'Simple'``  - standardisation with the fitted mean and sample std;
    * ``'Normal'`` / ``'Uniform'`` - sklearn ``QuantileTransformer`` with the
      matching output distribution.

    Any other name raises ``Exception("Not Yet!")``. Every non-trivial result
    is passed through the module-level ``stop_out`` helper.
    """

    def __init__(self, my_name):
        self.my_name = my_name
        # Fitted state: None, a dict with 'mean'/'std', or a QuantileTransformer.
        self.store = None

    def say_my_name(self):
        """Return the strategy name given at construction."""
        return self.my_name

    def fit(self, array):
        """Learn the parameters of the selected strategy from ``array``."""
        kind = self.my_name
        if kind == 'Nothing':
            return
        if kind == 'Simple':
            self.store = {'mean': array.mean(), 'std': array.std(ddof=1)}
            return
        if kind in ('Normal', 'Uniform'):
            from sklearn.preprocessing import QuantileTransformer
            # 'Normal' -> 'normal', 'Uniform' -> 'uniform'
            self.store = QuantileTransformer(output_distribution=kind.lower())
            self.store.fit(array.copy().astype(dtype=numpy.float64))
            return
        raise Exception("Not Yet!")

    def forward(self, array):
        """Apply the fitted mapping to a copy of ``array``."""
        kind = self.my_name
        if kind == 'Nothing':
            return array.copy()
        if kind not in ('Simple', 'Normal', 'Uniform'):
            raise Exception("Not Yet!")

        values = array.copy().astype(dtype=numpy.float64)
        if kind == 'Simple':
            values = (values - self.store['mean']) / self.store['std']
        else:
            values = self.store.transform(values)
        return stop_out(values)

    def backward(self, array):
        """Undo the fitted mapping on a copy of ``array``."""
        kind = self.my_name
        if kind == 'Nothing':
            return array.copy()
        if kind not in ('Simple', 'Normal', 'Uniform'):
            raise Exception("Not Yet!")

        values = array.copy().astype(dtype=numpy.float64)
        if kind == 'Simple':
            values = values * self.store['std'] + self.store['mean']
        else:
            values = self.store.inverse_transform(values)
        return stop_out(values)
class CustomQuantileTransformer(TransformerMixin, BaseEstimator):
    """QuantileTransformer wrapper that works on selected DataFrame columns.

    For a ``pandas.DataFrame`` only the columns in ``cols`` are transformed;
    they are appended after the untouched columns in the result. For a
    ``numpy.ndarray`` the whole array is transformed.
    """

    def __init__(
        self,
        cols=None,
        n_quantiles=1000,
        output_distribution="normal",
        random_state=42,
        **kwargs,
    ):
        """
        cols: column name or list of column names to transform (DataFrame input)
        n_quantiles: forwarded to QuantileTransformer
        output_distribution: forwarded to QuantileTransformer
        random_state: forwarded to QuantileTransformer
        kwargs: accepted for call compatibility; not used
        """
        # A single column name is normalised to a one-element list.
        self.cols = [cols] if isinstance(cols, str) else cols
        self.n_quantiles = n_quantiles
        self.output_distribution = output_distribution
        self.random_state = random_state

    def fit(self, X, y=None):
        """
        Fit the underlying QuantileTransformer.

        X: DataFrame (uses ``cols``) or ndarray (uses every column)
        y: ignored
        Returns self. Raises ValueError for any other input type.
        """
        self.quant_trans = QuantileTransformer(
            n_quantiles=self.n_quantiles,
            output_distribution=self.output_distribution,
            random_state=self.random_state,
        )
        if isinstance(X, pd.DataFrame):
            self.quant_trans.fit(X[self.cols])
        elif isinstance(X, np.ndarray):
            self.quant_trans.fit(X)
        else:
            raise ValueError("input should be DataFrame or array")
        return self

    def _apply(self, X, func):
        """Run ``func`` (transform or inverse_transform) on the right part of X."""
        if isinstance(X, pd.DataFrame):
            values = func(X[self.cols])
            # Reuse X's index: pd.concat(axis=1) aligns on the index, so a
            # fresh RangeIndex would misalign rows and introduce NaNs whenever
            # X does not carry the default 0..n-1 index.
            converted = pd.DataFrame(values, columns=self.cols, index=X.index)
            return pd.concat([X.drop(self.cols, axis=1), converted], axis=1)
        if isinstance(X, np.ndarray):
            return func(X)
        raise ValueError("input should be DataFrame or array")

    def transform(self, X):
        """
        Transform X with the fitted QuantileTransformer.

        Returns a DataFrame for DataFrame input and an ndarray for ndarray
        input. Raises ValueError for any other input type.
        """
        return self._apply(X, self.quant_trans.transform)

    def inverse_transform(self, X):
        """
        Map transformed values in X back to the original distribution.

        Returns a DataFrame for DataFrame input and an ndarray for ndarray
        input. Raises ValueError for any other input type.
        """
        return self._apply(X, self.quant_trans.inverse_transform)
# Run the model on samples from `dataset` and dump adjacency, node features and
# image to ./data/output; stops once `counter` reaches 6.
for i, n, a in dataset:
    print(counter)
    dim = int(math.sqrt(int(a.shape[0])))

    # Bring every raw sample into its expected shape and move it to the GPU.
    i = torch.Tensor(np.reshape(i, (1, 3, 512, 512))).cuda()
    n = torch.Tensor(np.reshape(n, (dim, 2))).cuda()
    a = torch.Tensor(np.reshape(a, (dim, dim))).cuda()

    inputi = torch.cat((i, arr), 1)
    metric, nodes_out, edges_out = train_step(inputi, n, a, dim, optimizer)

    # Back to numpy; node features are un-scaled with node_a / node_b and then
    # mapped back through the quantile transformer.
    node_features = nodes_out.cpu().detach().numpy()
    new_adj = edges_out.cpu().detach().numpy()
    i = i.cpu().detach().numpy()
    node_features = qt.inverse_transform((node_features - node_b) / node_a)
    #new_adj = np.where(new_adj>.5, 1.0 , 0)

    file_tag = str(counter)
    np.savetxt('./data/output/adj' + file_tag + '.txt', new_adj)
    np.savetxt('./data/output/node' + file_tag + '.txt', node_features)

    image = np.asarray(np.reshape(i * 255.0, (512, 512, 3)), dtype=np.uint8)
    cv2.imwrite('./data/output/img' + file_tag + '.png', image)

    counter += 1
    if counter == 6:
        break
class KDEQuantileTransformer(TransformerMixin, BaseEstimator):
    """Quantile transformer class using for each variable the CDF obtained with kernel density estimation
    """

    def __init__(self, n_quantiles=1000, output_distribution='uniform', smooth_peaks=True,
                 mirror_left=None, mirror_right=None, rho=0.5, n_adaptive=1,
                 x_min=None, x_max=None, n_integral_bins=1000, use_KDE=True,
                 use_inverse_qt=False, random_state=0, copy=True):
        """Parameters with the class KDEQuantileTransformer

        KDEQuantileTransformer is a quantile transformer class using for each variable the CDF obtained with
        kernel density estimation. Besides normal transformation functions, the class also provides the jacobian
        and inverse jacobian of the transformation and inverse transformation respectively.

        The KDE quantile transformation happens in four steps, two of which are transformations:

        1. First KDE PDFs and CDFs are formed for all marginalized input variables.
        2. Using the (smooth) CDFs, all input variables are transformed to uniform distributions.
        3. Using the existing quantile transformer of sklearn, these uniform distributions are then transformed to
           normal distributions.
        4. The KDE PDFs are used to calculate the (inverse) jacobian of the transformation.

        Concerning KDE evaluation of the PDF and CDF, the adaptive bandwidths are evaluated with the eqns
        described in:
        Cranmer KS, Kernel Estimation in High-Energy Physics.
        Computer Physics Communications 136:198-207, 2001 e-Print Archive: hep ex/0011057

        In theory both transformations could be combined into one, but there are practical advantages of using two.
        Essentially the second transformation is a backup against the first one, to smooth out residual bumps.
        For certain edge case distributions, for example those with strange discrete peaks in them at the edge
        of a distribution, it may happen that a single transformation fails, in which case doing two quantile
        transformations catches any potential imperfections in the first.

        In the inverse transformation, by default the two transformations are combined into one however, b/c else
        the impact of KDE smoothing is cancelled.

        :param int n_quantiles: number of quantiles/bins used in output histogram. If greater than number of
            samples, this is reset to number of samples. Default is 1000.
        :param str output_distribution: 'uniform' or 'normal' distribution.
        :param bool smooth_peaks: if False, do not smear peaks of non-unique values.
        :param mirror_left: array. Mirror the data on a value on the left to counter signal leakage.
            Default is None, which is no mirroring.
        :param mirror_right: array. Mirror the data on a value on the right to counter signal leakage.
            Default is None, which is no mirroring.
        :param float rho: KDE bandwidth scale parameter. default is 0.5.
        :param int n_adaptive: KDE number of adaptive iterations to be applied to improve the band width.
            default is 1.
        :param x_min: array. minimum value of pdf's x range. default is None (= - inf)
        :param x_max: array. maximum value of pdf's x range. default is None (= + inf)
        :param int n_integral_bins: for internal evaluation, number of integration bins beyond x-range.
            default is 1000.
        :param bool use_KDE: Default is True. If false, KDE smoothing is off, using default quantile transformation.
        :param bool use_inverse_qt: Default is False. If true, KDE is not used in inverse transformation.
        :param int random_state: when an integer, the seed given random generator.
        :param copy: Copy the data before transforming. Default is True.
        :raises ValueError: if n_quantiles, output_distribution, rho or n_adaptive has an invalid value.
        """
        self.n_quantiles = n_quantiles
        self.output_distribution = output_distribution
        self.smooth_peaks = smooth_peaks
        self.n_adaptive = n_adaptive
        self.copy = copy
        self.use_inverse_qt = use_inverse_qt
        self.use_KDE = use_KDE
        self.n_integral_bins = max(n_integral_bins, 1000)
        self.random_state = random_state

        array_like = (list, tuple, np.ndarray)

        # integration range
        self.x_min = np.array(x_min) if isinstance(x_min, array_like) else None
        self.x_max = np.array(x_max) if isinstance(x_max, array_like) else None

        # left and right-hand mirror points
        self.mirror_left = np.array(mirror_left) if isinstance(mirror_left, array_like) else None
        self.mirror_right = np.array(mirror_right) if isinstance(mirror_right, array_like) else None

        # copy x ranges if mirror points not set
        self.mirror_left = self.x_min if self.mirror_left is None else self.mirror_left
        self.mirror_right = self.x_max if self.mirror_right is None else self.mirror_right

        # bandwidth rescaling factor
        self.rho = np.array(rho) if isinstance(rho, array_like) else rho

        # basic checks on attributes
        if self.n_quantiles <= 0:
            raise ValueError("Invalid value for 'n_quantiles': %d. The number of quantiles must be at least one."
                             % self.n_quantiles)
        if self.output_distribution not in ('normal', 'uniform'):
            # %-formatting needs a %s placeholder; a '{}' placeholder here would
            # raise TypeError instead of the intended ValueError.
            raise ValueError("'output_distribution' has to be either 'normal' or 'uniform'. Got '%s' instead."
                             % self.output_distribution)
        # Integer rho values are validated too, and %s is used because %f
        # cannot format an array with more than one element.
        if isinstance(self.rho, (int, float, np.number, np.ndarray)) and np.any(np.asarray(self.rho) <= 0):
            raise ValueError("Invalid value(s) for 'rho': %s. The number(s) must be greater than zero."
                             % (self.rho,))
        if self.n_adaptive < 0:
            raise ValueError("Invalid value for 'n_adaptive': %d. Must be positive." % self.n_adaptive)

    def _match_n_features(self, name, n_features, default):
        """Check a per-feature attribute against the data, or expand it to one entry per feature.

        :param str name: attribute name ('rho', 'mirror_left', ...).
        :param int n_features: number of columns in the data.
        :param default: value repeated ``n_features`` times when the attribute is not an array.
        :raises ValueError: if the attribute is an array of the wrong length.
        """
        value = getattr(self, name)
        if isinstance(value, np.ndarray):
            if value.shape[0] != n_features:
                raise ValueError("Invalid size of '%s': %d. The number should match the data: %d."
                                 % (name, value.shape[0], n_features))
        else:
            setattr(self, name, np.array([default] * n_features))

    def fit(self, X, y=None):
        """Compute the kde-based quantiles used for transforming.

        Note that fitting expands ``rho``, ``mirror_left``, ``mirror_right``, ``x_min`` and ``x_max`` to
        per-feature arrays on the instance, and may lower ``n_quantiles`` to the number of samples.

        :param X: ndarray or sparse matrix, shape (n_samples, n_features)
            The data used to scale along the features axis.
        :param y: Ignored
        :return: self : object
        """
        X = check_array(X, copy=False, dtype=FLOAT_DTYPES, force_all_finite="allow-nan")

        # sample profiles
        n_samples, n_features = X.shape

        # continuation of basic checks, now that we know X
        self._match_n_features('rho', n_features, self.rho)
        self._match_n_features('mirror_left', n_features, None)
        self._match_n_features('mirror_right', n_features, None)
        self._match_n_features('x_min', n_features, None)
        self._match_n_features('x_max', n_features, None)

        # number of quantiles cannot be higher than number of data points. If so, reset.
        if self.n_quantiles > n_samples:
            warnings.warn("n_quantiles (%s) is greater than the total number "
                          "of samples (%s). n_quantiles is set to "
                          "n_samples."
                          % (self.n_quantiles, n_samples))
        self.n_quantiles = max(1, min(self.n_quantiles, n_samples))

        # set the (x_min, x_max) transformation range
        # if not set, by default widen the range beyond min/max to account for signal leakage
        if any([x is None for x in self.x_min]) or any([x is None for x in self.x_max]):
            gstd = np.std(X, axis=0)
            bw = np.power(4 / 3, 0.2) * gstd * np.power(n_samples, -0.2)
            min_orig = np.min(X, axis=0) - 10 * bw
            max_orig = np.max(X, axis=0) + 10 * bw
            for i in range(n_features):
                # constant columns (zero spread) keep their unset range
                self.x_min[i] = min_orig[i] if (self.x_min[i] is None and gstd[i] > 0) else self.x_min[i]
                self.x_max[i] = max_orig[i] if (self.x_max[i] is None and gstd[i] > 0) else self.x_max[i]

        if self.use_KDE:
            # Do the actual KDE fit (to uniform distributions)
            self._kde_fit(X)
            # prepare X to do quantile transformer fit.
            # add extreme points so QT knows the true edges for inverse transformation after sampling
            X = self._kde_transform(X)
            low = np.array([[0] * X.shape[1]])
            high = np.array([[1] * X.shape[1]])
            X = np.concatenate([X, low, high], axis=0)
        else:
            if self.smooth_peaks:
                X = self._smooth_peaks(X)
            # create pdf for quantile transformation. Built only when KDE is off,
            # so it never overwrites the KDE pdfs stored by _kde_fit, and the
            # jacobian methods always find a pdf.
            self._qt_pdf(X)

        # perform quantile transformation to smooth out any residual imperfections after kde
        # standard quantile transformer helps to smooth out any residual imperfections after kde transformation,
        # and does conversion to normal.
        self.qt_ = QuantileTransformer(
            n_quantiles=self.n_quantiles,
            output_distribution=self.output_distribution,
            copy=self.copy,
            random_state=self.random_state,
        )
        self.qt_.fit(X)

        return self

    def _qt_pdf(self, X, min_pdf_value=1e-20):
        """Internal function to make quantile transformer pdf

        Is only run when use_KDE=False

        :param X: ndarray or sparse matrix, shape (n_samples, n_features)
            The data used to scale along the features axis.
        :param float min_pdf_value: pdf value returned outside the range of the bin edges.
        """
        self.pdf_ = []

        n_samples, n_features = X.shape
        ps = np.linspace(0, 1, self.n_quantiles + 1)

        # calculate quantiles and pdf
        for i in range(n_features):
            x = X[:, i]
            bin_edges = np.quantile(x, ps)
            # every quantile bin holds the same probability mass
            bin_entries = [1./self.n_quantiles] * self.n_quantiles
            bin_diffs = np.diff(bin_edges)
            # zero-width bins get density 0 instead of a division by zero
            pdf_norm = np.divide(bin_entries, bin_diffs, out=np.zeros_like(bin_entries), where=bin_diffs != 0)
            # ensure interpolate works up to last bin edge, somehow ignored otherwise
            pdf_norm = np.concatenate([pdf_norm, [pdf_norm[-1]]])
            fast_pdf = interpolate.interp1d(bin_edges, pdf_norm, kind='previous', bounds_error=False,
                                            fill_value=(min_pdf_value, min_pdf_value))
            self.pdf_.append({'fast': fast_pdf, 'bin_edges': bin_edges, 'bin_entries': bin_entries})

    def _kde_fit(self, X):
        """Internal function to compute the kde-based quantiles used for transforming.

        :param X: ndarray or sparse matrix, shape (n_samples, n_features)
            The data used to scale along the features axis.
        :return: self : object
        """
        # reset
        self.pdf_ = []
        self.cdf_ = []

        n_features = X.shape[1]

        for i in range(n_features):
            # do kde fit, store each pdf
            bin_entries, bin_mean = kde_process_data(X[:, i], self.n_quantiles, self.smooth_peaks,
                                                     self.mirror_left[i], self.mirror_right[i],
                                                     random_state=self.random_state)
            band_width = kde_bw(bin_mean, bin_entries, self.rho[i], self.n_adaptive)
            # transformers to uniform distribution and back
            fast_pdf, F, Finv, kde_norm = kde_make_transformers(bin_mean, bin_entries, band_width,
                                                                x_min=self.x_min[i], x_max=self.x_max[i],
                                                                n_bins=self.n_integral_bins)
            # store cdf, inverse-cdf, and pdf.
            self.cdf_.append((F, Finv))
            pdf = {'bin_entries': bin_entries, 'bin_mean': bin_mean, 'band_width': band_width,
                   'norm': kde_norm, 'fast': fast_pdf}
            self.pdf_.append(pdf)

        return self

    def _smooth_peaks(self, X):
        """Internal function to smooth non-unique peaks

        :param X: ndarray or sparse matrix, shape (n_samples, n_features)
            The data used to scale along the features axis.
        :return: ndarray or sparse matrix, shape (n_samples, n_features)
            The transformed data
        """
        X = check_array(X, copy=self.copy, dtype=FLOAT_DTYPES, force_all_finite="allow-nan")

        n_features = X.shape[1]
        for feature_idx in range(n_features):
            x = X[:, feature_idx]
            # smooth peaks - note: this adds a random component to the data
            # applying smoothing to data that's already been smoothed has no impact, b/c all peaks are already gone.
            x = kde_smooth_peaks_1dim(x, self.mirror_left[feature_idx], self.mirror_right[feature_idx],
                                      copy=False, random_state=self.random_state, smoothing_width=1e-5)
            X[:, feature_idx] = x

        return X

    def _kde_transform(self, X):
        """Internal function to transform the data

        :param X: ndarray or sparse matrix, shape (n_samples, n_features)
            The data used to scale along the features axis.
        :return: ndarray or sparse matrix, shape (n_samples, n_features)
            The transformed data
        """
        X = check_array(X, copy=self.copy, dtype=FLOAT_DTYPES, force_all_finite="allow-nan")

        n_features = X.shape[1]
        for feature_idx in range(n_features):
            x = X[:, feature_idx]
            # smooth peaks - note: this adds a random component to the data
            # applying smoothing to data that's already been smoothed has no impact, b/c all peaks are already gone.
            if self.smooth_peaks:
                x = kde_smooth_peaks_1dim(x, self.mirror_left[feature_idx], self.mirror_right[feature_idx],
                                          copy=False, random_state=self.random_state)
            # transform distribution to uniform
            y = self.cdf_[feature_idx][0](x)
            # transform uniform [0,1] distribution to normal
            # X[:, feature_idx] = np.sqrt(2.) * erfinv(2. * y - 1.) if self.output_distribution == 'normal' else y
            X[:, feature_idx] = y

        return X

    def transform(self, X):
        """Transform the data

        :param X: ndarray or sparse matrix, shape (n_samples, n_features)
            The data used to scale along the features axis.
        :return: ndarray or sparse matrix, shape (n_samples, n_features)
            The transformed data
        """
        # 1. kde transformation to uniform.
        if self.use_KDE:
            X = self._kde_transform(X)
        elif self.smooth_peaks:
            X = self._smooth_peaks(X)

        # 2. quantile transformation to smooth out residual bumps and do conversion to normal distribution
        return self.qt_.transform(X)

    def _kde_inverse_transform(self, X):
        """Internal function to inverse transform the data

        :param X: ndarray or sparse matrix, shape (n_samples, n_features)
            The data used to inverse scale along the features axis.
        :return: ndarray or sparse matrix, shape (n_samples, n_features)
            The inverse-transformed data
        """
        # Honour `copy` (and convert to float) like the forward transform does;
        # otherwise the caller's array would be overwritten in place.
        X = check_array(X, copy=self.copy, dtype=FLOAT_DTYPES, force_all_finite="allow-nan")

        n_features = X.shape[1]
        for feature_idx in range(n_features):
            x = X[:, feature_idx]
            # transform normal back to uniform [0,1]
            if not self.use_inverse_qt:
                x = (0.5 + 0.5 * erf(x/np.sqrt(2.))) if self.output_distribution == 'normal' else x
            # transform uniform back to original distribution
            X[:, feature_idx] = self.cdf_[feature_idx][1](x)

        return X

    def inverse_transform(self, X):
        """Inverse transform the data

        :param X: ndarray or sparse matrix, shape (n_samples, n_features)
            The data used to inverse scale along the features axis.
        :return: ndarray or sparse matrix, shape (n_samples, n_features)
            The inverse-transformed data
        """
        # 1. quantile transformation back to kde
        if self.use_inverse_qt or not self.use_KDE:
            X = self.qt_.inverse_transform(X)

        # 2. inverse kde transformation
        return self._kde_inverse_transform(X) if self.use_KDE else X

    def jacobian(self, X):
        """Provide the Jacobian of the transformation

        :param X: ndarray or sparse matrix, shape (n_samples, n_features)
            The data used to scale along the features axis.
        :return: ndarray or sparse matrix, shape (n_samples, )
            An array with the jacobian of each data point
        """
        X = check_array(X, copy=self.copy, dtype=FLOAT_DTYPES, force_all_finite="allow-nan")

        # smoothing of peaks
        if self.smooth_peaks:
            X = self._smooth_peaks(X)

        # divide by the fitted pdf of every feature, evaluated in the input space
        jac = 1.0
        for idx in range(X.shape[1]):
            kdfi = self.pdf_[idx]['fast']
            jac /= kdfi(X[:, idx])

        # for normal output, multiply by the normal pdf of the transformed values
        if self.output_distribution == 'normal':
            X = self.transform(X)
            for idx in range(X.shape[1]):
                jac *= norm.pdf(X[:, idx])

        return jac

    def inverse_jacobian(self, X):
        """Provide the Jacobian of the inverse transformation

        :param X: ndarray or sparse matrix, shape (n_samples, n_features)
            The data used to inverse scale along the features axis.
        :return: ndarray or sparse matrix, shape (n_samples, )
            An array with the jacobian of the inverse transformation of each input data point
        """
        X = check_array(X, copy=self.copy, dtype=FLOAT_DTYPES, force_all_finite="allow-nan")

        # for normal output, divide by the normal pdf of the (transformed) input values
        inv_jac = 1.0
        if self.output_distribution == 'normal':
            for idx in range(X.shape[1]):
                inv_jac /= norm.pdf(X[:, idx])

        # multiply by the fitted pdf of every feature, evaluated in the original space
        X = self.inverse_transform(X)
        for idx in range(X.shape[1]):
            kdfi = self.pdf_[idx]['fast']
            inv_jac *= kdfi(X[:, idx])

        return inv_jac
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold

# random_state only has a meaning together with shuffle=True: older
# scikit-learn releases ignored it here and current ones raise ValueError.
# Leaving it out keeps the original unshuffled folds; pass shuffle=True and a
# random_state instead if shuffled folds are wanted.
kfold = KFold(n_splits=10)
results = cross_val_score(pipeline, X_train, y_train_norm, cv=kfold)
print("Standardized: %.15f (%.15f) MSE" % (results.mean(), results.std()))
predictions_train_trans = cross_val_predict(pipeline, X_train, y_train_norm,
                                            cv=kfold)

#To be able to save the model and evaluate it
#we have to fit again because cross val does not store the fit parameters
pipeline.fit(X_train, y_train_norm)

# Predictions are made in the normalised target space and mapped back with
# the quantile transformer before scoring against the raw targets.
predictions_train_new = quantile_transformer.inverse_transform(
    pipeline.predict(X_train))
predictions_test = quantile_transformer.inverse_transform(
    pipeline.predict(X_test))

#Score of the regression
from sklearn.metrics import mean_squared_error
mse_train = mean_squared_error(y_train, predictions_train_new)
mse_test = mean_squared_error(y_test, predictions_test)
print("MSE train : %.15f " % mse_train)
print("MSE test : %.15f " % mse_test)

# Append the scores to the running results log.
with open('Results.txt', 'a+', newline='\n') as f:
    f.write('Results for 1 layer with 13 neurons \n')
    f.write('Standardized: %.15f (%.15f) MSEi \n' %
            (results.mean(), results.std()))
    f.write('MSE train : %.15f \n' % mse_train)
#Get the Test data finalPreds = [] for i in range(14): x_test = [] x_test.append(scaled_data[-60:]) #format test data x_test = np.array(x_test) x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1)) #make prediction predictions = model.predict(x_test) test = random.randint(0, 1) scaled_data = np.append(scaled_data, predictions[0] + random.uniform(-0.35, 0.35)) scaled_data = scaled_data.transpose() predictions = scaler.inverse_transform(predictions) finalPreds.append(predictions) #Plot the results train = data[training_data_len - 60:training_data_len] valid = data[len(scaled_data) - len(finalPreds):] valid['Predictions'] = finalPreds plt.figure(figsize=(16, 8)) plt.title('COVID-19 Predicted Cases(Training)') plt.xticks(np.arange(0, 1000, 5)) plt.xlabel('Date', fontsize=8) plt.ylabel('Number of Cases', fontsize=18) plt.plot(train['Cases']) plt.plot(valid['Predictions']) plt.legend(['Train', 'Val', 'Predictions'], loc='lower right') plt.show()
class Dataset:
    # Constructs dataset to train and test the following:
    # 1) Pressure to Acoustic converter: converts the output pressures of tubetalker to acoustic parameters.
    # 2) Feedforward controller: Controller that accepts acoustic parameters and estimates muscle parameters,
    #    i.e. inputs to tubetalker.
    # 3) Feedback controllers:
    #    a) Somatosensory feedback controller: controller that accepts somatosensory error
    #       (targets - predictions of talker) and estimates muscle parameters (inputs to tubetalker).
    #    b) Acoustic feedback controller: controller that accepts acoustic error
    #       (targets - predictions of PtoA converter) and estimates muscle parameters (inputs to tubetalker).
    # The datasets consist of targets for 1, 2 & 3 mentioned above, i.e., targets for muscle, acoustic and
    # somatosensory parameters.

    # Names of the four acoustic parameters, in column order.
    _ACOUSTIC_NAMES = ("fo", "SPL", "SC", "SNR")

    def __init__(self, type, parameters):
        """Load the files named in ``parameters`` and build the datasets.

        :param type: dataset type, "PtoA" or "FBControl".
        :param parameters: dict with the keys "acoustics-file" and "controllers-file"; for "PtoA" also
            ``parameters["PtoA"]["perform"]`` ("training" or "testing") and the matching
            "pressure-training-file" / "pressure-testing-file".
        :raises ValueError: if ``type`` or the "perform" mode is not supported.
        """
        super(Dataset, self).__init__()
        self.training_data, self.testing_data = self.readFile(type, parameters)
        print("training data shape: {}, testing data shape: {}".format(
            self.training_data.shape, self.testing_data.shape))

    def readFile(self, type, parameters):
        """Read the acoustics and controllers files and assemble training/testing arrays.

        :param type: dataset type, "PtoA" or "FBControl".
        :param parameters: see ``__init__``.
        :return: tuple ``(training_data, testing_data)``, each reshaped to (rows, 1, columns).
        :raises ValueError: if ``type`` or the "perform" mode is not supported.
        """
        # Validate before any file is read: an unsupported value would
        # otherwise only fail after loading everything, with an
        # UnboundLocalError on training_data.
        if type not in ("PtoA", "FBControl"):
            raise ValueError(
                "Unsupported dataset type: {!r} (expected 'PtoA' or 'FBControl')".format(type))
        if type == "PtoA" and parameters["PtoA"]["perform"] not in ("training", "testing"):
            raise ValueError(
                "Unsupported PtoA mode: {!r} (expected 'training' or 'testing')".format(
                    parameters["PtoA"]["perform"]))

        # Reads the PtoA converter acoustics file and controllers parameters
        # (muscle, acoustic and somatosensory parameters) file
        ptoa_acoustics_data = np.asarray(pd.read_csv(
            parameters["acoustics-file"]).to_numpy(), dtype=np.float32)
        controllers_data = np.asarray(pd.read_csv(
            parameters["controllers-file"]).to_numpy(), dtype=np.float32)
        controllers_acoustics_data = controllers_data[:, 8:12]
        ptoa_data_length = ptoa_acoustics_data.shape[0]
        total_length = ptoa_data_length + controllers_acoustics_data.shape[0]

        # Gather all acoustic data and apply normal transformation to the data.
        # This will normalize the data distribution and better trains the neural network controllers.
        acoustics_data = np.concatenate(
            (ptoa_acoustics_data, controllers_acoustics_data), axis=0)
        acoustics_data_normal = self.transform_to_normal_dist(acoustics_data)

        # Split the normally distributed acoustic data into PtoA acoustic data and controller acoustic data.
        ptoa_acoustics_data_normal = acoustics_data_normal[0:ptoa_data_length, :]
        controllers_acoustics_data_normal = acoustics_data_normal[
            ptoa_data_length:total_length, :]

        # Testing and training datasets are constructed according to the type given by the user.
        if type == "PtoA":
            self.pressure_maximum = 35681.0  # Max value for max normalization.
            training_length = 1500000  # Total length of training data.
            if parameters["PtoA"]["perform"] == "training":
                pressure_training_data = np.asarray(
                    pd.read_csv(parameters["pressure-training-file"]).to_numpy(),
                    dtype=np.float32)  # Import data from training file
                training_data = np.concatenate(
                    (pressure_training_data / self.pressure_maximum,
                     ptoa_acoustics_data_normal[0:training_length, :]),
                    axis=-1)  # Select training data as per training_length
                # Placeholder: no testing data is loaded in training mode.
                testing_data = np.zeros((10, 1106), dtype=np.float32)
            else:  # "testing"
                pressure_testing_data = np.asarray(
                    pd.read_csv(parameters["pressure-testing-file"]).to_numpy(),
                    dtype=np.float32)
                # Placeholder: no training data is loaded in testing mode.
                training_data = np.zeros((10, 1106), dtype=np.float32)
                testing_data = np.concatenate(
                    (pressure_testing_data / self.pressure_maximum,
                     ptoa_acoustics_data_normal[training_length:, :]),
                    axis=-1)
        else:  # "FBControl": feedback controller training.
            self.pressure_maximum = 35681.0
            training_length = 40000
            # These are views into controllers_data; the log below is written
            # back into columns 5:8 of that array.
            som_training_data = controllers_data[0:training_length, 4:8]
            som_testing_data = controllers_data[training_length:, 4:8]
            som_training_data[:, 1:] = np.log(som_training_data[:, 1:])
            som_testing_data[:, 1:] = np.log(som_testing_data[:, 1:])

            # Min-max bounds are taken over training and testing rows together.
            som_all = np.concatenate((som_training_data, som_testing_data), axis=0)
            self.somatosensory_maximums = np.amax(som_all, axis=0)
            self.somatosensory_minimums = np.amin(som_all, axis=0)
            som_range = self.somatosensory_maximums - self.somatosensory_minimums

            training_data = np.concatenate(
                (controllers_data[0:training_length, 0:4],
                 (som_training_data - self.somatosensory_minimums) / som_range,
                 controllers_acoustics_data_normal[0:training_length, :]),
                axis=-1)
            testing_data = np.concatenate(
                (controllers_data[training_length:, 0:4],
                 (som_testing_data - self.somatosensory_minimums) / som_range,
                 controllers_acoustics_data_normal[training_length:, :]),
                axis=-1)
            print("pressure max: {}, somatosensory maxs: {}, mins: {}".format(
                self.pressure_maximum, self.somatosensory_maximums,
                self.somatosensory_minimums))

        # Insert a singleton middle axis: (rows, columns) -> (rows, 1, columns).
        training_data = np.reshape(
            training_data, (training_data.shape[0], 1, training_data.shape[-1]))
        testing_data = np.reshape(
            testing_data, (testing_data.shape[0], 1, testing_data.shape[-1]))
        print("training data shape: {}, testing data shape: {}".format(
            training_data.shape, testing_data.shape))
        return training_data, testing_data

    def transform_to_normal_dist(self, acoustics):
        """Transform acoustics data to normally distributed, min-max scaled data.

        Each acoustic parameter (fo, SPL, SC, SNR) is transformed independently with its own
        QuantileTransformer (``self.qt1`` .. ``self.qt4``). The minima/maxima of the transformed values
        are stored in ``self.minima`` / ``self.maxima`` for later denormalization.

        :param acoustics: array whose last axis is split into the four acoustic parameters.
        :return: array of the same layout with every parameter scaled to [0, 1].
        """
        self.qt1, self.qt2, self.qt3, self.qt4 = (
            QuantileTransformer(n_quantiles=1000,
                                output_distribution='normal',
                                random_state=0)
            for _ in self._ACOUSTIC_NAMES)
        transformers = (self.qt1, self.qt2, self.qt3, self.qt4)

        self.maxima = {}
        self.minima = {}
        normalized = []
        columns = np.split(acoustics, 4, axis=-1)
        for name, qt, column in zip(self._ACOUSTIC_NAMES, transformers, columns):
            gaussian = qt.fit_transform(column)
            # Bounds kept for normalization here and denormalization later.
            self.maxima[name] = np.amax(gaussian, axis=0)
            self.minima[name] = np.amin(gaussian, axis=0)
            # minmax normalization of the parameter.
            normalized.append((gaussian - self.minima[name]) /
                              (self.maxima[name] - self.minima[name]))
        return np.concatenate(normalized, axis=-1)

    def inverse_transform_to_original_dist(self, acoustics):
        """Convert acoustic predictions back to their original distribution.

        Uses the same transformers and min/max bounds that ``transform_to_normal_dist`` stored.

        :param acoustics: array whose last axis is split into the four acoustic parameters; each part is
            reshaped to (rows, columns) using its first and last axis sizes.
        :return: array with a singleton middle axis, parameters concatenated on the last axis.
        """
        transformers = (self.qt1, self.qt2, self.qt3, self.qt4)
        restored = []
        parts = np.split(acoustics, 4, axis=-1)
        for name, qt, part in zip(self._ACOUSTIC_NAMES, transformers, parts):
            # Undo the min-max scaling on a 2-D view of this parameter.
            flat = np.reshape(part, (part.shape[0], part.shape[-1])) * (
                self.maxima[name] - self.minima[name]) + self.minima[name]
            original = qt.inverse_transform(flat)
            restored.append(
                np.reshape(original, (flat.shape[0], 1, flat.shape[-1])))
        return np.concatenate(restored, axis=-1)

    def getFullShuffledDataset(self, data):
        """Shuffle ``data`` in place along its first axis and return it."""
        np.random.shuffle(data)
        return data
x = (x * 256 - .5).int() quintiles = np.percentile(x.numpy(), [0, 50]) q = quintiles.searchsorted(x.numpy()) # print(quintiles) # print(q, q.min(), q.max()) # 1/0 qt = QuantileTransformer() xt = torch.tensor(qt.fit_transform(x.view(x.size(0), -1).numpy())).float().view(x.size()) # print(x) print(xt.min(), xt.max(), xt.size(), x.size()) print(quintiles) for buckets in [2, 4, 8, 16]: xd = (xt * buckets).int().float() / buckets xr = torch.tensor(qt.inverse_transform(xd.numpy() * 0 + 1)).float() quintiles = np.percentile(x.numpy(), 100 * np.linspace(0, 1, buckets + 1)[:-1]) print(quintiles) out = torch.zeros_like(xr) print(list(set(xt[0].numpy()))) print(list(set(xd[0].numpy()))) #print(list(set(x[0].numpy()))) print(sorted(list(set(xr[0].numpy())))) 1/0 for deq in [1, 2, 2, 4, 8, 16, 32, 64, 128]: xd = ((x // deq) * deq).float()
plt.xlabel('Acceleration (Total)')
plt.ylabel('Frequency')

plt.subplot(1, 2, 2)
plt.hist(X_normal[:, 1])
plt.xlabel('Gyration (Total)')
plt.ylabel('Frequency')

plt.tight_layout(pad=3)
plt.show()


# In[150]:


# Inverse transform: histogram both columns again to verify the round trip.
X = normalizer.inverse_transform(X_normal)

plt.figure(figsize=(8, 4))
for column, axis_label in enumerate(('Acceleration (Total)',
                                     'Gyration (Total)')):
    plt.subplot(1, 2, column + 1)
    plt.hist(X[:, column])
    plt.xlabel(axis_label)
    plt.ylabel('Frequency')

plt.tight_layout(pad=3)
plt.show()
def test_transform_test():
    """Compare QuantileTransformerTF with scikit-learn on training and held-out data.

    Checks the forward transform on the training set and on a held-out set
    (which includes two hand-written extreme rows), the training round trip,
    and the inverse transform of fresh standard-normal samples.
    """
    n_rows = 10000

    def gen_data(seed):
        # The second column repeats a quarter-length uniform draw four times.
        rng = np.random.RandomState(seed)
        quarter = rng.uniform(-10, 0, n_rows // 4)
        columns = [
            rng.lognormal(10, 5, n_rows),
            np.concatenate([quarter] * 4),
            rng.normal(10, 10, n_rows),
            rng.normal(-1, 1, n_rows),
        ]
        return np.stack(columns, axis=1)

    # scikit-learn reference, fitted on the training data.
    train = gen_data(23342)
    sk_transformer = QuantileTransformer(output_distribution="normal",
                                         n_quantiles=100,
                                         random_state=3434)
    train_fwd_sk = sk_transformer.fit_transform(train)
    train_roundtrip_sk = sk_transformer.inverse_transform(train_fwd_sk)
    np.testing.assert_allclose(train, train_roundtrip_sk)

    # TensorFlow graph built from the fitted scikit-learn transformer.
    tf_transformer = QuantileTransformerTF(sk_transformer)
    train_fwd_tf = tf_transformer.transform(train.astype(np.float64), False)
    train_roundtrip_tf = tf_transformer.transform(train_fwd_tf, True)

    # Held-out data plus two hand-written extreme rows.
    extreme_rows = [[100, 100, 100, 111], [-100, -124, -241, -1]]
    held_out = np.vstack([gen_data(1321), extreme_rows])
    held_out_fwd_sk = sk_transformer.transform(held_out)
    # Result is not compared against anything below; the call is kept as is.
    held_out_roundtrip_sk = sk_transformer.inverse_transform(held_out_fwd_sk)
    held_out_fwd_tf = tf_transformer.transform(
        held_out.astype(np.float64), False)
    # NOTE(review): this inverts the *training* tensor, not `held_out_fwd_tf`,
    # and its value is never asserted -- confirm whether that is intended.
    held_out_roundtrip_tf = tf_transformer.inverse_transform(train_fwd_tf)

    # Inverse transform of arbitrary standard-normal samples.
    rng = np.random.RandomState(223532)
    gaussian_samples = rng.normal(size=[n_rows, 4])
    inverse_sk = sk_transformer.inverse_transform(gaussian_samples)
    inverse_tf = tf_transformer.inverse_transform(gaussian_samples)

    fetches = [
        held_out_fwd_tf,
        held_out_roundtrip_tf,
        train_fwd_tf,
        train_roundtrip_tf,
        inverse_tf,
    ]
    with tf.Session() as session:
        (held_out_fwd_tf_val,
         held_out_roundtrip_tf_val,
         train_fwd_tf_val,
         train_roundtrip_tf_val,
         inverse_tf_val) = session.run(fetches)

        np.testing.assert_allclose(train_fwd_sk, train_fwd_tf_val)
        np.testing.assert_allclose(train, train_roundtrip_sk)
        np.testing.assert_allclose(train, train_roundtrip_tf_val)
        np.testing.assert_allclose(held_out_fwd_sk, held_out_fwd_tf_val)
        np.testing.assert_allclose(inverse_sk, inverse_tf_val)
class DFQuantileTransformer(BaseEstimator, TransformerMixin):
    """Apply a ``QuantileTransformer`` to selected columns of a DataFrame.

    Parameters
    ----------
    columns : list-like, optional
        Columns to transform.  When ``None``, the columns of the frame passed
        to the first ``fit`` call are used (and stored on ``self.columns``).
    **kwargs
        Forwarded unchanged to ``QuantileTransformer``.

    Attributes
    ----------
    model : QuantileTransformer
        The wrapped transformer.
    transform_cols : list or None
        Columns of the fitted frame that are in ``columns``; ``None`` until
        ``fit`` has been called.
    stat_df : DataFrame or None
        Per-column ``Skewness`` / ``Kurtosis`` of the data seen by ``fit``.
        After ``transform`` it additionally holds ``Skewness (Transformed)``
        and ``Kurtosis (Transformed)`` of the most recent transform output.
    """

    _FIT_STAT_COLUMNS = ['Skewness', 'Kurtosis']

    def __init__(self, columns=None, **kwargs):
        self.columns = columns
        self.model = QuantileTransformer(**kwargs)
        self.transform_cols = None
        self.stat_df = None

    @staticmethod
    def _shape_stats(frame, skew_name, kurt_name):
        """Return a frame of per-column skewness and kurtosis of ``frame``."""
        skew_df = frame.skew().to_frame(name=skew_name)
        kurt_df = frame.kurt().to_frame(name=kurt_name)
        return skew_df.merge(kurt_df,
                             left_index=True,
                             right_index=True,
                             how='left')

    def _check_is_fitted(self):
        """Raise ``NotFittedError`` when ``fit`` has not been called yet."""
        if self.transform_cols is None:
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator."
            )

    def fit(self, X, y=None):
        """Fit the quantile transformer on the selected columns of ``X``.

        Also records the skewness and kurtosis of those columns in
        ``self.stat_df``.  ``y`` is ignored.  Returns ``self``.
        """
        self.columns = X.columns if self.columns is None else self.columns
        self.transform_cols = [x for x in X.columns if x in self.columns]
        self.model.fit(X[self.transform_cols])

        # Reference: https://help.gooddata.com/doc/en/reporting-and-dashboards/maql-analytical-query-language/maql-expression-reference/aggregation-functions/statistical-functions/predictive-statistical-use-cases/normality-testing-skewness-and-kurtosis
        # Highly skewed:           Skewness < -1  or  Skewness > 1
        # Moderately skewed:       -1 < Skewness < -0.5  or  0.5 < Skewness < 1
        # Approximately symmetric: -0.5 < Skewness < 0.5
        #
        # pandas `kurt()` reports excess kurtosis (Fisher's definition), so a
        # normal distribution scores 0 here, not 3.
        self.stat_df = self._shape_stats(X[self.transform_cols],
                                         'Skewness', 'Kurtosis')
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns quantile-transformed.

        ``self.stat_df`` is refreshed with the skewness / kurtosis of the
        transformed columns.

        Raises
        ------
        NotFittedError
            If ``fit`` has not been called.
        """
        self._check_is_fitted()

        new_X = X.copy()
        new_X[self.transform_cols] = self.model.transform(
            X[self.transform_cols])

        # Transformed skewness & kurtosis
        transformed_stats = self._shape_stats(new_X[self.transform_cols],
                                              'Skewness (Transformed)',
                                              'Kurtosis (Transformed)')
        # Merge onto the fit-time columns only.  Merging onto the whole of
        # `stat_df` made every repeated `transform` call pile up `_x` / `_y`
        # suffixed duplicates of the "(Transformed)" columns.
        fit_stats = self.stat_df[self._FIT_STAT_COLUMNS]
        self.stat_df = fit_stats.merge(transformed_stats,
                                       left_index=True,
                                       right_index=True,
                                       how='left')
        return new_X

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return the transformed copy of ``X``."""
        return self.fit(X).transform(X)

    def inverse_transform(self, X):
        """Return a copy of ``X`` with the selected columns mapped back.

        Raises
        ------
        NotFittedError
            If ``fit`` has not been called.
        """
        self._check_is_fitted()

        new_X = X.copy()
        new_X[self.transform_cols] = self.model.inverse_transform(
            X[self.transform_cols])
        return new_X
class Transform(object):
    """Fit / apply / invert a variance-stabilising transform of a 1-D series.

    Supported ``method`` values: ``'yeo-johnson'``, ``'box-cox'``,
    ``'quantile'``, ``'logistic'``, ``'log'``, ``'anscombe'`` and ``None``
    (no transform).  Any other value logs a warning and falls back to no
    transform.

    Most failures are reported by printing a warning through ``su.my_print``
    and returning ``None`` rather than by raising.

    Yeo-Johnson is an extension of the Box-Cox transformation that can handle
    both positive and negative values.
    References:
      Weisberg, Yeo-Johnson Power Transformations. https://www.stat.umn.edu/arc/yjpower.pdf
      Yeo + Johnson, A new family of power transformations to improve normality or symmetry, http://www.stat.wisc.edu/sites/default/files/tr1002.pdf
      Anscombe transform: https://en.wikipedia.org/wiki/Anscombe_transform

    Bias removal:
      Without bias removal the inverse transform returns the median rather
      than the mean.
      see https://robjhyndman.com/hyndsight/backtransforming/
      see http://davegiles.blogspot.co.uk/2013/08/forecasting-from-log-linear-regressions.html
      see http://data.princeton.edu/wws509/notes/c2s10.html
    """

    def __init__(self, method, nqs, ceiling=None, floor=None, unbias=False):
        """Create the underlying transformer object for ``method``.

        Parameters
        ----------
        method : str or None
            Name of the transform (see the class docstring).
        nqs : number
            Cast to ``int`` and used as ``n_quantiles`` when ``method`` is
            ``'quantile'``; unused otherwise.
        ceiling, floor : optional
            Passed to ``Linearizer`` when ``method`` is ``'logistic'``.
        unbias : bool
            Whether inverse transforms should apply the second-order bias
            correction.
        """
        self.method = method
        self.ceiling = ceiling
        self.floor = floor
        self.lmbda = None
        self.name = method
        self.xf_done = False
        self.unbias = unbias  # not implemented
        self.lbl = None
        if method == 'yeo-johnson' or method == 'box-cox':
            # MUST have standardize = False
            self.xobj = PowerTransformer(method=method,
                                         standardize=False,
                                         copy=False)
        elif method == 'quantile':
            self.xobj = QuantileTransformer(n_quantiles=int(nqs),
                                            output_distribution='normal',
                                            copy=False)
        elif method == 'logistic':
            self.xobj = Linearizer(ceiling, floor, self.unbias)
        elif method == 'log':
            self.xobj = LogTransform(self.unbias)
        elif method == 'anscombe':
            self.xobj = Anscombe()
        elif method is None:
            self.method = None
            self.xobj = NoTransform()
        else:
            self._warn(' WARNING: set_xform: invalid method: ' + str(method))
            self.method = None
            self.xobj = NoTransform()

    @staticmethod
    def _warn(msg):
        """Print ``msg`` prefixed with the current process id."""
        su.my_print('pid: ' + str(os.getpid()) + msg)

    def check_input(self, y):
        """Validate ``y`` and return it as an ``(n, 1)`` column, or ``None``.

        A scalar is wrapped into a one-element array.  ``None`` is returned
        (after printing a warning) when ``y`` is not an ndarray, has more
        than one dimension, is constant, or contains negative values while
        the method is ``'box-cox'``, ``'log'`` or ``'anscombe'``.
        """
        # np.floating / np.integer cover NumPy scalars; the np.float / np.int
        # aliases used previously no longer exist in current NumPy releases.
        if isinstance(y, (float, int, np.floating, np.integer)):
            y = np.array([y])
        if not isinstance(y, np.ndarray):
            self._warn(' WARNING: invalid type: ' + str(type(y)))
            return None

        yc = np.copy(y)
        if len(np.shape(yc)) > 1:
            self._warn(' WARNING: invalid input shape: ' + str(np.shape(yc)))
            return None

        yx = np.reshape(yc, (1, -1))[0]
        if np.max(yx) == np.min(yx):
            self._warn(' WARNING: constant series: ' + str(np.min(yc)))
            return None

        # The range test applies to all three methods.  It used to be written
        # as `a and b or c or d`, which rejected every input for 'log' and
        # 'anscombe' regardless of its values.
        if np.min(yx) < 0.0 and self.method in ('box-cox', 'log', 'anscombe'):
            self._warn(' WARNING: invalid range for method: ' + self.method +
                       ' min: ' + str(np.min(yx)) + ' lbda: ' +
                       str(self.lmbda))
            return None
        return np.reshape(yx, (-1, 1))

    def transform(self, y):
        """Apply the fitted transform to ``y``; ``None`` if not fitted or invalid."""
        if self.xf_done is False:
            # str() keeps the message printable when method is None.
            self._warn(' ' + str(self.method) + ' : must fit before transform')
            return None
        if self.method == 'logistic' or self.method == 'log':
            return self.xobj.transform(y)
        return self._transform(y)

    def _transform(self, y):
        """Transform through ``check_input`` / ``check_output`` validation."""
        if self.method is None:
            return y
        ys = self.check_input(y)
        if ys is None:
            return None
        yt = self.xobj.transform(ys)
        ya = np.reshape(yt, (1, -1))[0]
        return self.check_output(ya, y)

    def fit(self, y):
        """Fit the transform on ``y``.

        Returns ``self`` on success and ``None`` when the object was already
        fitted or the input is rejected.  The fitted flag is set before the
        fit is attempted.
        """
        if self.xf_done is True:
            self._warn(' ' + str(self.method) +
                       ' : already fit. Create new object')
            return None
        self.xf_done = True
        if self.method == 'logistic' or self.method == 'log':
            self.xobj.fit(y)
            return self
        return self._fit(y)

    def _fit(self, y):
        """Validate ``y``, fit the transformer and record its lambda, if any."""
        ys = self.check_input(y)
        if ys is None:
            return None
        try:
            self.xobj.fit(ys)
        except ValueError:
            return None
        try:
            self.lmbda = self.xobj.lambdas_[0]
        except AttributeError:
            # Transformer has no `lambdas_` attribute.
            self.lmbda = None
        return self

    def fit_transform(self, y):
        """Fit on ``y`` and return its transform; ``None`` on any failure."""
        if self.xf_done is True:
            self._warn(' ' + str(self.method) +
                       ' : already fit. Create new Transform instance')
            return None
        r = self.fit(y)
        return None if r is None else self.transform(y)

    def check_output(self, ya, y):
        """Return ``ya``, or retry with a quantile transform if it is constant."""
        if len(np.unique(ya)) == 1:  # xform failed: over/underflow?
            su.my_print(' WARNING: transform ' + str(self.method) +
                        ' failed with lambda ' + str(self.lmbda) +
                        ' and label: ' + str(self.lbl) + ' Trying Quantile')
            return self.reset_xform(y)
        return ya

    def reset_xform(self, y):
        """Fall back to a quantile transform after a failed power transform.

        Only applies when the current method is ``'yeo-johnson'`` or
        ``'box-cox'``; otherwise ``y`` is returned untouched.
        """
        if self.method == 'yeo-johnson' or self.method == 'box-cox':
            self.method = 'quantile'
            del self.xobj  # drops lmbda also
            self.xobj = QuantileTransformer(output_distribution='normal',
                                            copy=False)
            self.xf_done = False
            return self.fit_transform(y)
        return y

    def fcast_var(self, rdf, w):
        """Estimate the forecast variance from the prediction-interval bounds.

        Used to unbias inverse transforms,
        see https://robjhyndman.com/hyndsight/backtransforming/
        ``rdf`` contains yhat_upr and yhat_lwr, ``w`` is the quantile used for
        the yhat bounds: y_upr/lwr = yhat +/- q * sig.

        Returns a one-column frame named ``'var'``, or ``0.0`` when it cannot
        be computed.
        """
        q = 1.0 - (1.0 - w) / 2.0
        # NOTE(review): both the interval width and the normal quantile are
        # halved here, so the ratio looks like 2 * sig rather than sig --
        # confirm before relying on this value.
        qval = norm.ppf(q) / 2.0
        try:
            diff = np.abs(rdf.diff(axis=1).dropna(axis=1)) / 2.0
        except TypeError:
            return 0.0
        if len(diff) > 0 and len(diff.columns) == 1:
            diff.columns = ['var']
            return (diff / qval)**2  # y_var
        return 0.0

    def _u_inverse_transform(self, y_mean, y_var):
        """Second-order unbiased inverse transform.

        Returns f(y_mean) + (y_var / 2) * f''(y_mean), where f is the inverse
        transform function.  With ``unbias`` off, ``y_mean`` is returned
        unchanged.
        """
        if self.unbias is False:
            return y_mean
        if y_mean is None:
            return None

        y_mean_ = np.reshape(y_mean, (1, -1))[0]
        y_var_ = 0.0 if y_var is None else np.reshape(y_var, (1, -1))[0]
        if self.method == 'box-cox':
            fy, d2fy = self._box_cox_unbias(y_mean_)
        elif self.method == 'yeo-johnson':
            fy, d2fy = self._yeo_johnson_unbias(y_mean_)
        elif self.method == 'quantile':
            fy, d2fy = self._quantile_unbias(y_mean_)
        else:
            fy, d2fy = y_mean_, 0.0

        if fy is None:
            return None
        if d2fy is None:
            return fy
        return fy + (y_var_ / 2.0) * d2fy

    def _box_cox_unbias(self, y_mean):
        """Inverse Box-Cox and its second derivative evaluated at ``y_mean``."""
        if np.abs(self.lmbda) > 1.0e-02:
            z = 1 + self.lmbda * y_mean
            fy = np.power(z, 1.0 / self.lmbda)
            fy = self.interpolate_(fy, y_mean, nan_pct=0.2)
            d2fy = (1 - self.lmbda) * np.power(z, (1.0 / self.lmbda) - 2.0)
            d2fy = self.interpolate_(d2fy, y_mean, nan_pct=0.2)
        else:
            # lambda ~ 0: the transform is a log, so the inverse is exp.
            fy = np.exp(y_mean)
            d2fy = fy
        return fy, d2fy

    def _yeo_johnson_unbias(self, y_mean):
        """Inverse Yeo-Johnson and its second derivative evaluated at ``y_mean``."""
        # Float buffers, so integer input is not truncated on assignment.
        fy = np.zeros_like(y_mean, dtype=float)
        d2fy = np.zeros_like(y_mean, dtype=float)
        pos = y_mean >= 0  # binary mask

        if np.abs(self.lmbda) < 1.0e-02:
            fy[pos] = np.exp(y_mean[pos]) - 1
            d2fy[pos] = np.exp(y_mean[pos])
        else:
            z = 1 + self.lmbda * y_mean[pos]
            fy[pos] = np.power(z, (1.0 / self.lmbda)) - 1.0
            # d2/dy2 of (1 + l*y)**(1/l) - 1 is (1 - l) * z**(1/l - 2).  This
            # agrees with the lambda -> 0 branch above and with the Box-Cox
            # case; the factor was previously written as (l - 1).
            d2fy[pos] = (1 - self.lmbda) * np.power(
                z, (1.0 / self.lmbda) - 2.0)

        # NOTE(review): this threshold is 0.1 while the other near-zero tests
        # use 0.01 -- confirm it is intentional.
        if np.abs(2 - self.lmbda) < 10.e-02:
            fy[~pos] = 1.0 - np.exp(-y_mean[~pos])
            d2fy[~pos] = -np.exp(-y_mean[~pos])
        else:
            theta = 2 - self.lmbda
            z = 1 - theta * y_mean[~pos]
            fy[~pos] = 1 - np.power(z, 1.0 / theta)
            d2fy[~pos] = (theta - 1) * np.power(z, (1.0 / theta) - 2.0)
        return fy, d2fy

    def _quantile_unbias(self, y_mean):
        """No bias correction for the quantile transform: value and zero curvature."""
        return y_mean, 0.0

    def inverse_transform(self, y, y_var, lbl=None):
        """Map transformed values ``y`` back to the natural scale.

        Parameters
        ----------
        y : ndarray, Series or DataFrame
            Values in the transformed scale.
        y_var : ndarray, Series, DataFrame or None
            Variance of ``y``, used only by the unbiased inverse.
        lbl : optional
            Label stored on ``self.lbl`` and used in warning messages.

        Returns ``None`` when ``y`` is ``None`` or of an unsupported type,
        when the object is not fitted, or when the inverse fails.
        """
        self.lbl = lbl
        if y is None:
            return None
        if isinstance(y, (pd.core.series.Series, pd.core.frame.DataFrame)):
            y = y.values
        # Convert y_var based on its own type.  The test used to look at `y`,
        # which is already an array at this point, so a pandas `y_var` was
        # never converted.
        if isinstance(y_var,
                      (pd.core.series.Series, pd.core.frame.DataFrame)):
            y_var = y_var.values

        if not isinstance(y, np.ndarray):
            self._warn(' WARNING: invalid type: ' + str(type(y)))
            return None
        if self.xf_done is False:
            self._warn(' WARNING: cannot inverse_transform before fit is done')
            return None

        yc = copy.deepcopy(y)
        if self.method == 'logistic' or self.method == 'log':
            yt = self.xobj.inverse_transform(y, y_var, lbl=lbl)
            yt = self.interpolate_(yt, yc, nan_pct=0.2)
            if yt is None:
                self._warn(' WARNING: inverse transform failed for label: ' +
                           str(self.lbl) + ' (method: ' + str(self.method))
                return None
            return yt
        if self.method is None:
            return y

        # box-cox, yj, quantile, anscombe
        yt = self._inverse_transform(y, yc, y_var)
        if yt is None:
            self._warn(' WARNING: inverse transform failed for label: ' +
                       str(self.lbl) + ' (method: ' + str(self.method) +
                       ' and lambda: ' + str(self.lmbda) + ')')
            return None
        return np.reshape(yt, (1, -1))[0]

    def _inverse_transform(self, y, yc, y_var):
        """Invert via the transformer (or the unbiased formula) and patch NaNs."""
        if self.unbias is False:
            ys = np.reshape(y, (-1, 1))
            yt = self.xobj.inverse_transform(
                ys)  # box-cox returns NaN on failure
        else:
            # unbiased inverse transform
            yt = self._u_inverse_transform(copy.deepcopy(y), y_var)
        return self.interpolate_(yt, yc, nan_pct=0.2)

    def interpolate_(self, y, yt, nan_pct=0.2):
        """Fill a limited number of NaNs in ``y`` by interpolation.

        y: inverse-transformed values (values in natural scale)
        yt: pre-inverse transform (values in transformed scale)

        Returns ``y`` unchanged when it has no NaNs, the interpolated 1-D
        array when the NaN count is at most ``nan_pct * len(y)``, and ``None``
        (after saving a debug frame through ``p_ut.save_df``) when there are
        more.
        """
        if y is None:
            return None

        yx = np.reshape(y, (1, -1))[0] if self.method is not None else y
        nulls = pd.Series(yx).isnull().sum()
        pct = 100.0 * np.round(nulls / len(yx), 2)
        max_nulls = nan_pct * np.ceil(len(yx))

        if nulls > max_nulls:
            su.my_print('WARNING: Too many NaN to interpolate for label ' +
                        str(self.lbl) + ': ' + str(nulls) + ' out of ' +
                        str(len(yx)) + ' (' + str(pct) +
                        '%) data points and lambda ' + str(self.lmbda))
            f = pd.DataFrame({'yt': list(yt), 'yx': list(yx)})
            f['lmbda'] = self.lmbda
            p_ut.save_df(f, '~/my_tmp/interpolDF')
            return None
        if 0 < nulls <= max_nulls:  # interpolate yhat if some NaNs
            su.my_print('WARNING: interpolating for label ' + str(self.lbl) +
                        ': ' + str(nulls) + ' NaNs out of ' + str(len(yx)) +
                        ' data points (' + str(pct) + '%)')
            filled = pd.Series(yx).interpolate(limit_direction='both')
            return np.reshape(filled.values, (1, -1))[0]
        return y  # all OK
def _fit_and_predict_fold(params, seed, targets, decode_val, decode_test,
                          full_fit_verbose=0):
    """Train one Keras model for fold ``seed`` and return decoded test predictions.

    Reads the module-level ``X``, ``y``, ``X_test``, ``cvlist`` and ``valid``.

    Parameters
    ----------
    params : sequence
        Passed to ``keras_mercari_model``; its last three entries are the
        learning rates for epochs 0, 1 and 2.
    seed : int
        Seeds NumPy and TensorFlow and selects the split ``cvlist[seed]``.
    targets : array
        Training targets, indexable by the fold indices.
    decode_val : callable
        ``decode_val(raw_predictions, val_idx)`` maps the raw validation
        predictions to the scale of ``y[:, 0]``.
    decode_test : callable
        ``decode_test(raw_predictions)`` does the same for the test
        predictions.
    full_fit_verbose : int
        Keras ``verbose`` level used when fitting on all data
        (``valid`` false).
    """
    # Single-threaded, CPU-only session.
    num_cores = 1
    config = tf.ConfigProto(intra_op_parallelism_threads=num_cores,
                            inter_op_parallelism_threads=num_cores,
                            allow_soft_placement=True,
                            device_count={'CPU': 1, 'GPU': 0})
    session = tf.Session(config=config)
    K.set_session(session)
    try:
        batchsize = 2000
        epochs = 3
        np.random.seed(seed)
        tf.set_random_seed(seed)

        model = keras_mercari_model(seed, params)
        train_idx, val_idx = cvlist[seed]
        X_tr = [x[train_idx] for x in X]
        X_val = [x[val_idx] for x in X]

        # One learning rate per epoch.
        lr1, lr2, lr3 = params[-3:]
        lrs = [lr1, lr2, lr3]

        def schedule(epoch):
            return lrs[epoch]

        lr_schedule = LearningRateScheduler(schedule)
        gc.collect()

        if valid:
            model.fit(X_tr, targets[train_idx],
                      batch_size=batchsize,
                      epochs=epochs,
                      verbose=0,
                      validation_data=(X_val, targets[val_idx]),
                      shuffle=True,
                      callbacks=[lr_schedule])
            y_val = y[val_idx, 0]
            y_pred = decode_val(model.predict(X_val), val_idx)
            # Validation RMSE on the original target scale.
            print(np.sqrt(metrics.mean_squared_error(y_val, y_pred)))
        else:
            model.fit(X, targets,
                      batch_size=batchsize,
                      epochs=epochs,
                      verbose=full_fit_verbose,
                      shuffle=True,
                      callbacks=[lr_schedule])

        return decode_test(model.predict(X_test))
    finally:
        # Release the graph/session even when training or prediction fails.
        K.clear_session()


def train_model(params, seed, model_num):
    """Train model variant ``model_num`` for fold ``seed`` and predict the test set.

    Variants (all share the same network, split and learning-rate schedule):

    * 0 and 3 -- fit directly on ``y``.
    * 1 -- fit on ``ynorm``; predictions are mapped back with ``* std + mean``.
    * 2 -- fit on the quantile-normalised ``yrel``; predictions are inverted,
      incremented by one and multiplied by the ``cat_price`` column.

    Returns the test predictions, or ``None`` for any other ``model_num``.
    """
    if model_num == 0 or model_num == 3:
        return _fit_and_predict_fold(
            params, seed, y,
            decode_val=lambda raw, val_idx: raw[:, 0],
            decode_test=lambda raw: raw[:, 0])

    if model_num == 1:
        return _fit_and_predict_fold(
            params, seed, ynorm,
            decode_val=lambda raw, val_idx: raw[:, 0] * std + mean,
            decode_test=lambda raw: raw[:, 0] * std + mean,
            full_fit_verbose=1)

    if model_num == 2:
        # Fitted before the seeds are set, as in the original ordering.
        normll = QuantileTransformer(output_distribution='normal')
        ynorm2 = normll.fit_transform(yrel)

        def decode_val(raw, val_idx):
            relative = normll.inverse_transform(raw)[:, 0] + 1
            return relative * train_data['cat_price'].values[val_idx]

        def decode_test(raw):
            relative = normll.inverse_transform(raw)[:, 0] + 1
            return relative * test_data['cat_price'].values

        return _fit_and_predict_fold(params, seed, ynorm2,
                                     decode_val=decode_val,
                                     decode_test=decode_test)

    return None
class QuantileMapper(BaseEstimator, TransformerMixin):
    """ Transform features using quantile mapping.

    Parameters
    ----------
    detrend : boolean, optional
        If True, detrend the data before quantile mapping and add the trend
        back after transforming. Default is False.
    lt_kwargs : dict, optional
        Dictionary of keyword arguments to pass to the LinearTrendTransformer.
        ``None`` (the default) means no extra arguments.
    qt_kwargs : dict, optional
        Dictionary of keyword arguments to pass to the QuantileTransformer.
        ``None`` (the default) means no extra arguments.  When it does not
        set ``n_quantiles``, the number of samples is used.

    Attributes
    ----------
    x_cdf_fit_ : QuantileTransformer
        QuantileTransformer fitted on the (optionally detrended) data given
        to ``fit``.
    """

    # None defaults: a `{}` default would be a single dict shared by every
    # instance created without arguments.
    def __init__(self, detrend=False, lt_kwargs=None, qt_kwargs=None):
        self.lt_kwargs = lt_kwargs
        self.qt_kwargs = qt_kwargs
        self.detrend = detrend

    def _lt_kws(self):
        """Return a copy of the LinearTrendTransformer keyword arguments."""
        return dict(self.lt_kwargs or {})

    def _qt_kws(self, X):
        """Return the quantile keyword arguments, defaulting n_quantiles to len(X)."""
        qt_kws = dict(self.qt_kwargs or {})
        if "n_quantiles" not in qt_kws:
            qt_kws["n_quantiles"] = len(X)
        return qt_kws

    def fit(self, X, y=None):
        """ Fit the quantile mapping model.

        Parameters
        ----------
        X : array-like, shape  [n_samples, n_features]
            Training data.
        y : ignored
            Accepted so the estimator works with ``fit_transform(X, y)`` and
            inside pipelines.

        Returns
        -------
        self
        """
        X = ensure_samples_features(X)
        qt_kws = self._qt_kws(X)

        # maybe detrend the input datasets
        if self.detrend:
            x_to_cdf = LinearTrendTransformer(
                **self._lt_kws()).fit_transform(X)
        else:
            x_to_cdf = X

        # calculate the cdfs for X
        # TODO: replace this transformer with something that uses robust
        # empirical cdf plotting positions
        self.x_cdf_fit_ = QuantileTransformer(**qt_kws).fit(x_to_cdf)

        return self

    def transform(self, X):
        """ Perform the quantile mapping.

        The quantiles of ``X`` are computed from ``X`` itself and then mapped
        through the inverse of the transformer fitted in ``fit``.

        Parameters
        ----------
        X : array_like, shape [n_samples, n_features]
            Samples.

        Returns
        -------
        x_qmapped : array
            The quantile-mapped samples, with the trend of ``X`` added back
            when ``detrend`` is set.
        """
        X = ensure_samples_features(X)

        # maybe detrend the datasets
        if self.detrend:
            x_trend = LinearTrendTransformer(**self._lt_kws()).fit(X)
            x_to_cdf = x_trend.transform(X)
        else:
            x_to_cdf = X

        # do the final mapping
        qt_kws = self._qt_kws(X)
        x_quantiles = quantile_transform(x_to_cdf, copy=True, **qt_kws)
        x_qmapped = self.x_cdf_fit_.inverse_transform(x_quantiles)

        # add the trend back
        if self.detrend:
            x_qmapped = x_trend.inverse_transform(x_qmapped)

        return x_qmapped
class PowerForecaster: """ Check out the class spec at https://docs.google.com/document/d/1-ceuHfJ2bNbgmKddLTUCS0HJ1juE5t0042Mts_yEUD8v sample data is in https://drive.google.com/uc?export=download&id=1z2MBYJ8k4M5J3udlFVc2d8opE_f-S4BK """ def __init__(self, df, model=Models.PROPHET, upsample_freq=None, train_test_split_ratio=Constants.TRAIN_TEST_SPLIT_RATIO.value, epochs=Constants.EPOCHS.value, initial_epoch=Constants.INITIAL_EPOCH.value, batch_size=Constants.BATCH_SIZE.value, sliding_window_size_or_time_steps=Constants.SLIDING_WINDOW_SIZE_OR_TIME_STEPS.value, do_shuffle=True): logging.info("resample: {}. future_prediction: {}, epochs: {}, batch_size: {}," " window_size: {}, eurons: {}" .format(Constants.RESAMPLING_FREQ.value , Constants.SHIFT_IN_TIME_STEP_TO_PREDICT.value , epochs , batch_size , sliding_window_size_or_time_steps , Constants.NEURONS.value )) if logging.getLogger().isEnabledFor(logging.INFO): explore_data(df) # first step is to create a timestamp column as index to turn it to a TimeSeries data df.index = pd.to_datetime(df[ColumnNames.DATE.value] + df[ColumnNames.TIME.value], format='%Y-%m-%d%H:%M:%S', errors='raise') if 'Unnamed: 0' in df.columns: df.drop('Unnamed: 0', axis=1, inplace=True) # keep a copy of original dataset for future comparison self.df_original = df.copy() # we interpolate temperature using prophet to use it in a multivariate forecast temperature = ColumnNames.TEMPERATURE.value interpolated_df = facebook_prophet_filter(df, temperature, Constants.FORECASTED_TEMPERATURE_FILE.value) interpolated_df.index = df.index df[[temperature]] = interpolated_df[[ColumnNames.FORECAST.value]] # lets also interpolate missing kwh using facebook prophet (or we could simply drop them) # now turn to kwh and make the format compatible with prophet power = ColumnNames.POWER.value interpolated_df = facebook_prophet_filter(df, power, Constants.FORECASTED_POWER_FILE.value) interpolated_df.index = df.index df[[power]] = 
interpolated_df[[ColumnNames.FORECAST.value]] df = df.rename(columns={power: ColumnNames.LABEL.value}) df.drop(columns=[ColumnNames.DATE.value, ColumnNames.TIME.value, ColumnNames.DAY_OF_WEEK.value, ColumnNames.MONTH.value], inplace=True ) if upsample_freq is not None: df = df.resample(upsample_freq).mean() # for any regression or forecasting it is better to work with normalized data self.transformer = QuantileTransformer() # handle outliers better than MinMaxScalar features = ColumnNames.FEATURES.value normalized = normalize(df, features, transformer=self.transformer) # we use the last part (after 12/1/2013) that doesnt have temperature for testing cutoff_date = Constants.CUTOFF_DATE.value self.df = normalized[normalized.index < cutoff_date] self.testing = normalized[normalized.index >= cutoff_date] self.df[ColumnNames.DATE_STAMP.value] = self.df.index self.df_blocked = None self.train_test_split_ratio = train_test_split_ratio self.model_type = model self.train_X, self.test_X, self.train_test_split_index = self.train_test_split(self.df[features]) self.train_y, self.test_y, _ = self.train_test_split(self.df[ColumnNames.LABELS.value]) self.model_fit = None self.epochs = epochs self.initial_epoch = initial_epoch self.batch_size = batch_size self.history = None # following is defines in sliding_window self.do_shuffle = do_shuffle self.val_idx = None self.shuffled_X = None self.shuffled_y = None self.train = None self.label = None self.train_size = None self.val_size = None if logging.getLogger().isEnabledFor(logging.INFO): explore_data(self.df) def train_test_split(self, df): split_index = int(self.train_test_split_ratio * df.shape[0]) train = df.iloc[:split_index, :] test = df.iloc[split_index:, :] return train, test, split_index def stationary_test(self): dataset = self.test_y.dropna() seasonal_dataset = sm.tsa.seasonal_decompose(dataset, freq=365) fig = seasonal_dataset.plot() fig.set_figheight(8) fig.set_figwidth(15) fig.show() def p_value(dataset): # 
ADF-test(Original-time-series) dataset.dropna() p_value = sm.tsa.adfuller(dataset, regression='ct') logging.debug('p-value:{}'.format(p_value)) p_value = sm.tsa.adfuller(dataset, regression='c') logging.debug('p-value:{}'.format(p_value)) p_value(self.train_y) p_value(self.test_y) # Test works for only 12 variables, check the eigenvalues johnsen_test = coint_johansen(self.df[ColumnNames.FEATURES.value].dropna(), -1, 1).eig return johnsen_test def seasonal_prediction(self): from statsmodels.tsa.api import SimpleExpSmoothing y_hat_avg = self.test_y.copy() fit2 = SimpleExpSmoothing(np.asarray(self.train_y['Count'])).fit(smoothing_level=0.6, optimized=False) y_hat_avg['SES'] = fit2.forecast(len(self.test_y)) plt.figure(figsize=(16, 8)) plt.plot(self.train_y['Count'], label='Train') plt.plot(self.test_y['Count'], label='Test') plt.plot(y_hat_avg['SES'], label='SES') plt.legend(loc='best') plt.show() def fit(self): if self.model_type == Models.PROPHET: self.prophet_fit() elif self.model_type == Models.ARIMA: self.arima_fit() elif self.model_type == Models.VAR: self.var_fit() elif self.model_type == Models.LSTM: self.lstm_fit() else: raise ValueError("{} is not defined".format(self.model_type)) def evaluate(self): self.loss_metrics = self.model_type.value.evaluate( self.val_X, self.val_y, batch_size=self.batch_size, verbose=0 ) logging.info("Metric names:{}".format(self.model_type.value.metrics_names)) logging.info("Loss Metrics:{}".format(self.loss_metrics)) def resultToDataFrame(self, data, start_index, end_index, do_scale_back=False): label_column = ColumnNames.LABEL.value df = self.df.iloc[start_index:end_index] df[label_column] = data if do_scale_back: features = ColumnNames.FEATURES.value df[features] = self.transformer.inverse_transform(df[features]) return df[[label_column]] def block_after_date(self, start_block_date_st): index, _ = find_index(self.df, start_block_date_st) logging.debug("Index of block is {} with length of {}".format(index, len(self.df) - index)) 
self.df_blocked = self.df.iloc[index:] self.df_blocked.reindex() logging.info("Blocked from {} to {} fromo training and validation" .format(self.df_blocked.index[0], self.df_blocked.index[-1])) def adjust_index_and_training_shift(self, start_date_in_labeling_st , training_duration_in_frequency = None , start_date_training_st = None ): logging.debug("Original range data of data: [{}-{}]".format(self.df.index[0], self.df.index[-1])) index_start_labeling, _ = find_index(self.df, start_date_in_labeling_st) if start_date_training_st is not None: index_start_training, _ = find_index(self.df, start_date_training_st) if index_start_labeling < index_start_training: raise ValueError("Labeling should be after training") self.shift = index_start_labeling - index_start_training else: index_start_training = 0 self.shift = index_start_labeling if training_duration_in_frequency is None: logging.info("Shift is set to be {}".format(self.shift)) else: final_index = index_start_training + training_duration_in_frequency + self.shift logging.debug("start index: {}, final_index: {}".format(index_start_training, final_index)) self.df = self.df.iloc[index_start_training:index_start_training + training_duration_in_frequency + self.shift] logging.info("Shift is set to be {}, we picked the slice of [{} : {}] for trainig".format( self.shift, self.df.index[0] , self.df.index[-1] )) def lstm_predict(self, model , start_date_to_predict_st=None , duration_in_freq = None , do_scale_back = False ): X, true_y = self.get_whole() if start_date_to_predict_st is not None: y_index_i, _ = find_index(self.df, start_date_to_predict_st) x_index_i = 0 if y_index_i <= self.shift else y_index_i - self.shift x_index_f = x_index_i + duration_in_freq y_index_f = y_index_i + duration_in_freq logging.info("Predicting time slice [{} : {}] from [{} : {}]".format( self.df.index[y_index_i],self.df.index[y_index_f] , self.df.index[x_index_i], self.df.index[x_index_f] )) X = X[x_index_i:x_index_f] true_y = 
true_y[y_index_i:y_index_f] predicted = model.predict(X) logging.debug("Predicted Labels shape: {}".format(predicted.shape)) plt.plot(predicted, 'r') plt.plot(true_y, 'b') plt.show() df_predicted = self.resultToDataFrame(predicted, x_index_i + self.shift , x_index_f + self.shift, do_scale_back) return df_predicted def scale_back(self, df_predicted, start_index, end_index): label_column = ColumnNames.LABEL.value features = ColumnNames.FEATURES.value df = self.df[features].iloc[start_index:end_index] df[label_column] = df_predicted[label_column] scaled_predicted = self.transformer.inverse_transform(df[features]) df[features] = scaled_predicted return df def prophet_fit(self): past = self.train_y.copy() past[ColumnNames.DATE_STAMP.value] = self.train_y.index self.model_type.value.fit(past) def arima_fit(self): model = sm.tsa.statespace.SARIMAX(self.train_y, order=Constants.SARIMAX_ORDER.value, seasonal_order=Constants.SARIMAX_SEASONAL_ORDER.value) # ,enforce_stationarity=False, enforce_invertibility=False, freq='15T') logging.debug("SARIMAX fitting ....") self.model_fit = self.model_type.value.fit() self.model_fit.summary() logging.debug("SARIMAX forecast", self.model_fit.forecast()) def var_fit(self): logging.debug("making VAR model") model = VAR(endog=self.train_X[ColumnNames.FEATURES.value].dropna()) logging.debug("VAR fitting ....") self.model_fit = model.fit() print(self.model_fit.summary()) def lstm_fit(self): if logging.getLogger().isEnabledFor(logging.INFO): print(self.model_type.value.summary()) callbacks = Callbacks(Constants.MODEL_NAME.value, self.batch_size, self.epochs) X, y = self.get_shuff_train_label() self.history = self.model_type.value.fit( X, y, epochs=self.epochs, batch_size=self.batch_size, validation_split=0.35, verbose=0, callbacks=callbacks.getDefaultCallbacks(), initial_epoch=self.initial_epoch, ) logging.debug("history of performance:{}".format(self.history.history)) def predict(self, feature_set=None): future = feature_set if feature_set is 
not None \ else Constants.DEFAULT_FUTURE_PERIODS.value if self.model_type == Models.PROPHET: self.future = self.model_type.value.make_future_dataframe(periods=future, freq=Constants.DEFAULT_FUTURE_FREQ.value, include_history=False) if self.model_type == Models.PROPHET: predicted = self.model_type.value.predict(self.future) predicted[ColumnNames.LABEL.value] = predicted[ColumnNames.FORECAST.value] elif self.model_type == Models.ARIMA: predicted = self.arima_predict(future) elif self.model_type == Models.VAR: predicted = self.var_predict(future) elif self.model_type == Models.LSTM: return self.lstm_predict(self.model.value, start_date_to_predict_st="2013-6-01", duration_in_freq=3 * 30) else: raise ValueError("{} is not defined".format(self.model_type)) df_predicted = self.resultToDataFrame(predicted, self.train_test_split_index , self.train_test_split_index + len(predicted)) return df_predicted def arima_predict(self, future): end = str(self.train_y.index[-1]) start = str(self.train_y.index[-future]) print(start, end) predicted = self.model_fit.predict(start=start[:10], end=end[:10], dynamic=True) return predicted def var_predict(self, future): predicted_array = self.model_fit.forecast(self.model_fit.y, future) predicted = pd.DataFrame(predicted_array) predicted.columns = ColumnNames.FEATURES.value predicted.index = self.test_y.index[:len(predicted)] return predicted def sliding_window(self): # Generate the data matrix length0 = self.df.shape[0] window_size = Constants.SLIDING_WINDOW_SIZE_OR_TIME_STEPS.value future_time_steps = Constants.SHIFT_IN_TIME_STEP_TO_PREDICT.value features_column = ColumnNames.FEATURES.value label_column = ColumnNames.LABEL.value sliding_window_feature = np.zeros((length0 - window_size - future_time_steps, window_size, len(features_column))) sliding_window_label = np.zeros((length0 - window_size - future_time_steps, 1)) for counter in range(length0 - window_size - future_time_steps): sliding_window_label[counter, :] = 
self.df[label_column][counter + window_size + future_time_steps] for counter in range(length0 - window_size - future_time_steps): sliding_window_feature[counter, :] = self.df[features_column][ counter: counter + window_size] if self.do_shuffle: logging.debug('Random shuffeling') length = sliding_window_feature.shape[0] if self.df_blocked is not None: length -= len(self.df_blocked) logging.info("length of data reduced by {} due to blocking. The last date is {}" .format(len(self.df_blocked), self.df.index[length])) logging.debug("sliding window length: {}".format(length)) split_ratio = Constants.TRAIN_TEST_SPLIT_RATIO.value idx = np.random.choice(length, length, replace=False) if self.do_shuffle else np.arange(length) self.val_idx = idx[int(split_ratio * length):] feature_window_shuffled = sliding_window_feature[idx, :] label_window_shuffled = sliding_window_label[idx, :] self.shuffled_X = feature_window_shuffled self.shuffled_y = label_window_shuffled self.train = sliding_window_feature self.label = sliding_window_label self.train_X = self.shuffled_X[:int(split_ratio * length), :] self.train_y = self.shuffled_y[:int(split_ratio * length), :] self.train_size = int(split_ratio * length) self.val_X = self.shuffled_X[int(split_ratio * length):, :] self.val_y = self.shuffled_y[int(split_ratio * length):, :] self.val_size = length - self.train_size def get_shuff_train_label(self): X = self.shuffled_X # np.expand_dims(self.shuffled_X, axis=-1) Y = self.shuffled_y return X, Y def evaluate_performance(self): # make a prediction X = self.test_X # np.expand_dims(self.test_X, axis=-1) yhat = self.model_type.value.predict(X) test_X = self.test_X.reshape((self.test_X.shape[0], self.test_X.shape[2])) # invert scaling for forecast inv_yhat = pd.concatenate((yhat, test_X[:, 1:]), axis=1) inv_yhat = self.transformer.inverse_transform(inv_yhat) inv_yhat = inv_yhat[:, 0] # invert scaling for actual test_y = self.test_y.reshape((len(self.test_y), 1)) inv_y = pd.concatenate((test_y, 
test_X[:, 1:]), axis=1) inv_y = self.transformer.inverse_transform(inv_y) inv_y = inv_y[:, 0] # calculate RMSE rmse = sqrt(mean_squared_error(inv_y, inv_yhat)) logging.debug('Test RMSE: %.3f' % rmse) def plot_future(self, predicted): self.model_type.value.plot(predicted, xlabel='Date', ylabel='KWH') self.model_type.value.plot_components(predicted) # by_dow.plot(xticks=ticks, style=style, title='Averaged on Days of the Week') # plt.show() def visual_inspection(self): style = [':', '--', '-'] pd.plotting.register_matplotlib_converters() df = self.df self.df_original[ColumnNames.ORIGINAL_FEATURES.value].plot(style=style, title='Original Data') plt.show() self.df[ColumnNames.FEATURES.value].plot(style=style, title='Normalized Data') plt.show() sampled = df.resample('M').sum()[ColumnNames.FEATURES.value] sampled.plot(style=style, title='Aggregated Monthly') plt.show() sampled = df.resample('W').sum()[ColumnNames.FEATURES.value] sampled.plot(style=style, title='Aggregated Weekly') plt.show() sampled = df.resample('D').sum()[ColumnNames.FEATURES.value] sampled.rolling(30, center=True).sum().plot(style=style, title='Aggregated Daily') plt.show() by_time = df.groupby(by=df.index.time).mean()[ColumnNames.FEATURES.value] ticks = 4 * 60 * 60 * np.arange(6) by_time.plot(xticks=ticks, style=style, title='Averaged Hourly') plt.show() days = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"] def tick(x): if x % 24 == 12: return days[int(x) // 24] else: return "" # ax.xaxis.set_major_formatter(NullFormatter()) # ax.xaxis.set_minor_formatter(FuncFormatter(tick)) # ax.tick_params(which="major", axis="x", length=10, width=1.5) #by_dow = df.groupby(by=df.dow).mean()[ColumnNames.FEATURES.value] #ticks = 4 * 60 * 60 * np.arange(6) def plot_prediction(self, start_index, end_index): style = [':', '--', '-'] pd.plotting.register_matplotlib_converters() label_column = ColumnNames.LABELS.value # import pdb; pdb.set_trace() t = self.train.index.iloc[start_index:end_index] X = 
self.train.iloc[start_index: end_index] true_y = self.label.iloc[start_index, end_index] y = self.model_type.value.predict(X) plt.plot(t, y, true_y, style=style) plt.show() def plot_history(self): plt.plot(np.arange(self.epochs - self.initial_epoch), self.history.history['loss'], label='train') plt.plot(np.arange(self.epochs - self.initial_epoch), self.history.history['val_loss'], label='validation') plt.legend() plt.title('model accuracy') plt.ylabel('accuracy') plt.xlabel('epoch') plt.legend(['train', 'test'], loc='upper left') plt.show() def get_next_train_batch(self): # getting the next train batch if self.pointer + self.batchsize >= self.train_size: end = self.train_size start = self.pointer self.pointer = 0 self.epoch += 1 else: end = self.pointer + self.batchsize start = self.pointer self.pointer += self.batchsize X = self.train_data[start:end, :] Y = self.train_label[start:end, :] return X, Y def get_val(self): X = np.expand_dims(self.val_data, axis=-1) return X, self.val_label[:] def get_whole(self): # get whole, for validation set X = self.train[:, :] # np.expand_dims(self.train[:, :], axis=-1) Y = self.label[:, :] return X, Y def reset(self): self.pointer = 0 self.epoch = 0