def trainandTest(X_train, y_train, X_test, testindex, output_path):
    rbY = RobustScaler()
    y_log = y_train.values.reshape(-1, 1)  # keep the unscaled (log-price) targets for error reporting
    y_train = rbY.fit_transform(y_log)
    model = xgb.XGBRegressor(n_estimators=210, subsample=0.7, max_depth=3,
                             min_child_weight=1, seed=0, colsample_bytree=0.8,
                             # learning_rate=0.21,
                             gamma=0.14, reg_alpha=0.015, reg_lambda=0.002,
                             silent=1, objective='reg:linear')
    model.fit(X_train, y_train)

    ans = model.predict(X_test)
    ans = rbY.inverse_transform(ans.reshape(-1, 1))
    ans = np.exp(ans)
    result = testindex
    result['SalePrice'] = ans
    result.to_csv(output_path, index=None)

    # Plot feature importances
    # plot_importance(model)
    # plt.show()

    # Training-set error, computed in the original price space
    bns = np.exp(rbY.inverse_transform(model.predict(X_train).reshape(-1, 1)))
    error = []
    for i, j in zip(bns, np.exp(y_log)):
        error.append(abs(i - j))
    print(bns.tolist())
    print('Training set error')
    print(sqrt(sum(error) / len(error)))
def trainandTestLa(X_train, y_train, X_test, testindex, output_path):
    print(type(X_train))
    rbY = RobustScaler()
    y_log = y_train.values.reshape(-1, 1)  # keep the unscaled (log-price) targets for error reporting
    y_train = rbY.fit_transform(y_log)
    model = Lasso(alpha=0.06, max_iter=2000, selection='random', tol=0.001, normalize=False)
    model.fit(X_train, y_train)
    print(type(X_train))

    # model.coef_ holds the learned coefficient for every feature
    coef = pd.Series(model.coef_, index=X_train.columns)
    print("Lasso picked " + str(sum(coef != 0)) + " variables and eliminated the other "
          + str(sum(coef == 0)) + " variables")
    coef_all = coef[coef != 0].sort_values()
    print(set(coef_all.index.tolist()) & set(added_features))
    print(coef_all.tail(20))
    print(coef_all.head(20))

    ans = model.predict(X_test)
    ans = rbY.inverse_transform(ans.reshape(-1, 1))
    ans = np.exp(ans)
    result = testindex
    result['SalePrice'] = ans
    result.to_csv(output_path, index=None)

    # Training-set error, computed in the original price space
    bns = np.exp(rbY.inverse_transform(model.predict(X_train).reshape(-1, 1)))
    error = []
    for i, j in zip(bns, np.exp(y_log)):
        error.append(abs(i - j))
    print(bns.tolist())
    print('Training set error')
    print(sqrt(sum(error) / len(error)))
class Trainer(metaclass=ABCMeta):
    def __init__(self, df_daily, df_monthly):
        self.df_daily = df_daily
        self.df_monthly = df_monthly
        self.scalerX = RobustScaler(quantile_range=(10, 90))
        self.scalerY = RobustScaler(quantile_range=(10, 90))

    def load_data(self, val_size, test_size, target_column):
        df = FeatureSelection().add_prod_delay_correlation(
            dataframe=self.df_daily,
            df_month=self.df_monthly.copy(),
            target=target_column)
        bcb = BCB()
        bcb = bcb.get_dataframe(df.index[0], df.index[-1])
        if not bcb.empty:
            bcb.set_index(df.index, inplace=True)
            df = pd.concat((df, bcb), axis=1, join='inner')

        # Move the target column to the last position
        columns = list(df)
        columns[-1], columns[columns.index(target_column)] = columns[
            columns.index(target_column)], columns[-1]
        df = df.reindex(columns=columns)

        df.iloc[:, -1:] = remove_outliers(df.iloc[:, -1:])
        df = sum_days(df, past_days=31, prevision_days=31)
        df.drop('NUM_VENDEDOR', axis=1, inplace=True)

        y_total = df.iloc[:, -1:].values
        x_total = df.iloc[:, :-1].values

        y_test = y_total[-test_size:, :]
        x_test = x_total[-test_size:, :]
        y_train = y_total[:-val_size - test_size, :]
        x_train = x_total[:-val_size - test_size, :]
        y_val = y_total[-val_size - test_size - 1:-test_size, :]
        x_val = x_total[-val_size - test_size - 1:-test_size, :]

        x_train = self.scalerX.fit_transform(x_train)
        y_train = self.scalerY.fit_transform(y_train)
        x_val = self.scalerX.transform(x_val)
        y_val = self.scalerY.transform(y_val)
        x_test = self.scalerX.transform(x_test)
        y_test = self.scalerY.transform(y_test)

        return x_train, y_train, x_val, y_val, x_test, y_test

    # @abstractmethod
    # def train(self):
    #     pass
    #
    # @abstractmethod
    # def predict(self):
    #     pass

    def inverse_transformX(self, df):
        return self.scalerX.inverse_transform(df)

    def inverse_transformY(self, df):
        return self.scalerY.inverse_transform(df)
class FloatCode(IntCode):
    def __init__(
        self,
        col_name: str,
        code_len: int,
        start_id: int,
        fillall: bool = True,
        base: int = 100,
        hasnan: bool = True,
        transform: str = 'quantile',
    ):
        super().__init__(col_name, code_len, start_id, fillall, base, hasnan)
        if transform == 'yeo-johnson':
            self.scaler = PowerTransformer(standardize=True)
        elif transform == 'quantile':
            self.scaler = QuantileTransformer(output_distribution='uniform')
        elif transform == 'robust':
            self.scaler = RobustScaler()
        else:
            raise ValueError(
                'Supported data transformations are "yeo-johnson", "quantile", and "robust"')

    def convert_to_int(self, val: float) -> int:
        val = np.expand_dims(np.array(val), axis=0)
        values = self.scaler.transform(val[:, None])[:, 0] - self.mval
        values = (values * self.base**self.extra_digits).astype(int)
        output = values[0]
        return output

    def array_convert_to_int(self, val: ndarray):
        values = self.scaler.fit_transform(val[:, None])[:, 0]
        self.mval = values.min()
        values = values - self.mval
        digits = int(math.log(values.max(), self.base)) + 1
        # extra digits used for the fractional part of the number
        extra_digits = self.code_len - digits
        if extra_digits < 0:
            raise ValueError("need a larger code_len to encode the number")
        self.extra_digits = extra_digits
        values = (values * self.base**self.extra_digits).astype(int)
        return values

    def reverse_convert_to_int(self, val: int) -> float:
        val = val / self.base**self.extra_digits
        val = np.expand_dims(np.array(val), axis=0)
        v = self.scaler.inverse_transform(val[:, None] + self.mval)[0, 0]
        return v

    def decode(self, ids: List[int]) -> str:
        if self.hasnan and ids[0] == self.NA_token_id[0]:
            return self.NA_token
        v = 0
        for i in reversed(range(self.code_len)):
            digit = int(self.digits_id_to_item[i][ids[self.code_len - i - 1]])
            v += digit * self.base**i
        v = self.reverse_convert_to_int(v)
        accuracy = max(int(abs(np.log10(0.1 / self.base**self.extra_digits))), 1)
        return f"{v:.{accuracy}f}"
def modeling(input):
    dataset = np.loadtxt("models/First_trial_regression.csv", delimiter=",")
    # separate the data from the target attributes
    X = dataset[:, 0:12]
    Y = dataset[:, 12]

    rbX = RobustScaler()
    X = rbX.fit_transform(X)
    rbY = RobustScaler()
    Y = np.expand_dims(Y, 0)
    Y = Y.T
    Y = rbY.fit_transform(Y)

    clf = SVR(C=1.000, epsilon=0.2, kernel="rbf")
    clf.fit(X, Y)
    joblib.dump(clf, 'models/SMOreg.pkl')

    clf2 = joblib.load('models/SMOreg.pkl')
    audience_num = clf2.predict(rbX.transform(input))
    audience_num = np.expand_dims(audience_num, 0)
    audience_num = audience_num.T
    audience_num = rbY.inverse_transform(audience_num)
    audience_num = np.squeeze(audience_num)

    return {'audience_num': audience_num, 'rbX': rbX, 'rbY': rbY}
class DFRobustScaler(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None, **kwargs):
        self.columns = columns
        self.model = RobustScaler(**kwargs)
        self.transform_cols = None

    def fit(self, X, y=None):
        self.columns = X.columns if self.columns is None else self.columns
        self.transform_cols = [x for x in X.columns if x in self.columns]
        self.model.fit(X[self.transform_cols])
        return self

    def transform(self, X):
        if self.transform_cols is None:
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. "
                f"Call 'fit' with appropriate arguments before using this estimator.")
        new_X = X.copy()
        new_X[self.transform_cols] = self.model.transform(X[self.transform_cols])
        return new_X

    def fit_transform(self, X, y=None):
        return self.fit(X).transform(X)

    def inverse_transform(self, X):
        if self.transform_cols is None:
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. "
                f"Call 'fit' with appropriate arguments before using this estimator.")
        new_X = X.copy()
        new_X[self.transform_cols] = self.model.inverse_transform(X[self.transform_cols])
        return new_X
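# --- Usage sketch (not from the original source): a minimal, hypothetical example of the
# DFRobustScaler wrapper above; the DataFrame and column names are made up for illustration.
import pandas as pd

df_demo = pd.DataFrame({'price': [10.0, 12.5, 300.0, 11.0],
                        'qty': [1, 2, 50, 3],
                        'group': ['a', 'b', 'a', 'b']})
demo_scaler = DFRobustScaler(columns=['price', 'qty'], quantile_range=(25.0, 75.0))
df_scaled = demo_scaler.fit_transform(df_demo)          # 'group' passes through untouched
df_restored = demo_scaler.inverse_transform(df_scaled)  # round-trips back to the original values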
def denormalize(df, norm_data):
    # df = df['s3'].values.reshape(-1,1)
    # norm_data = norm_data.reshape(-1,1)
    scl = RobustScaler()
    # .as_matrix() was removed from pandas; .values returns the same ndarray
    a = scl.fit(df['s3'].values.reshape(-1, 1))
    new = scl.inverse_transform(norm_data)
    return new
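# --- Illustration (not from the original source): denormalize() above refits a fresh
# RobustScaler on df['s3'], so the inverse transform only recovers the original values if
# norm_data was produced by a scaler fitted on that same column. A minimal round trip:
import numpy as np
from sklearn.preprocessing import RobustScaler

raw = np.array([[1.0], [2.0], [3.0], [100.0]])   # one column with an outlier
scl = RobustScaler().fit(raw)                    # learns the column's median and IQR
restored = scl.inverse_transform(scl.transform(raw))
assert np.allclose(raw, restored)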
def trainandTestKR(X_train, y_train, X_test, testindex, output_path):
    rbY = RobustScaler()
    y_log = y_train.values.reshape(-1, 1)  # keep the unscaled (log-price) targets for error reporting
    y_train = rbY.fit_transform(y_log)
    model = KernelRidge(alpha=0.19, kernel='laplacian', coef0=0)
    model.fit(X_train, y_train)

    ans = model.predict(X_test)
    ans = rbY.inverse_transform(ans.reshape(-1, 1))
    ans = np.exp(ans)
    result = testindex
    result['SalePrice'] = ans
    result.to_csv(output_path, index=None)

    # Training-set error, computed in the original price space
    bns = np.exp(rbY.inverse_transform(model.predict(X_train).reshape(-1, 1)))
    error = []
    for i, j in zip(bns, np.exp(y_log)):
        error.append(abs(i - j))
    print(bns.tolist())
    print('Training set error')
    print(sqrt(sum(error) / len(error)))
class Scaler(object):
    """If we have `hypers.scale=True`, we use this class to scale everything (price-actions,
    rewards, etc). Using this instead of TForce's built-in preprocessing
    (http://tensorforce.readthedocs.io/en/latest/preprocessing.html) since this gives more
    flexibility, but it's basically the same thing. Someone may want to check me on that
    statement by reading those docs and trying TForce's preprocessing instead of this.

    One important bit here is the use of RobustScaler with a quantile_range. This allows us to
    handle outliers, which abound in the data. Sometimes we have a timeseries hole, and suddenly
    we're up a billion percent. Sometimes whales pump-and-dump to screw with the market.
    RobustScaler lets us "ignore" those moments.

    TODO someone will want to double-check my work on this scaling approach in general. Best of
    my knowledge, but I'm a newb.
    """

    # ~300k samples should be enough data to safely say "I've seen it all, just scale (don't fit) going forward"
    STOP_AT = 3e5
    SKIP = 15

    def __init__(self):
        self.reward_scaler = RobustScaler(quantile_range=(5., 95.))
        self.state_scaler = RobustScaler(quantile_range=(5., 95.))
        self.rewards = []
        self.states = []
        self.done = False
        self.i = 0

    def _should_skip(self):
        # After we've fitted enough (see STOP_AT), start returning direct transforms for a
        # performance improvement. Also skip every few fittings: each individual fit doesn't
        # contribute much anyway, and costs a lot.
        return self.done or (self.i % self.SKIP != 0 and self.i > self.SKIP)

    def transform_state(self, state):
        self.i += 1
        if self._should_skip():
            return self.state_scaler.transform([state])[-1]
        # Fit, transform, return
        self.states.append(state)
        ret = self.state_scaler.fit_transform(self.states)[-1]
        if self.i >= self.STOP_AT:
            # Clear up memory; the fitted scalers have all the info we need.
            # done=True only needs to be set in one of these functions.
            del self.rewards
            del self.states
            self.done = True
        return ret

    def transform_reward(self, reward):
        if self._should_skip():
            return self.reward_scaler.transform([[reward]])[-1][0]
        self.rewards.append([reward])
        return self.reward_scaler.fit_transform(self.rewards)[-1][0]

    def avg_reward(self):
        if self.i < self.SKIP:
            return 20
        reward = self.reward_scaler.inverse_transform([[0]])[-1][0]
        return abs(reward)
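# --- Usage sketch (not from the original source): a hypothetical driver loop for the Scaler
# class above, using synthetic states/rewards just to show the fit-then-skip behaviour.
import numpy as np

demo_scaler = Scaler()
for step in range(100):
    state = np.random.randn(4) * (1000.0 if step == 50 else 1.0)  # occasional outlier
    reward = float(np.random.randn())
    s = demo_scaler.transform_state(state)    # refits only every SKIP-th call once warmed up
    r = demo_scaler.transform_reward(reward)
print(demo_scaler.avg_reward())               # |inverse_transform(0)| of the reward scaler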
def graph_forecast():
    data, last_timestamp = ml.get_all_data()

    # Prediction
    # print(last_timestamp)
    # data.reverse()
    # train = data[:-24]
    # test = data[len(data) - 24:]
    # train = list(ml.chunk(train, 24))

    # Test
    train = data[:-48]
    test = data[len(data) - 48:len(data) - 24]
    actual = data[len(data) - 24:]
    train = list(ml.chunk(train, 24))

    real_temp = []
    for a in range(0, len(actual)):
        real_temp.append(actual[a][0])
    print(real_temp)

    # Train model and predict the new temperature
    train_X = []
    train_Y = []
    for k in range(0, len(train) - 2):
        for j in range(0, 24):
            train_X.append(train[k][j])
            train_Y.append(train[k + 1][j][0])

    rbX = RobustScaler()
    X = rbX.fit_transform(train_X)
    rbY = RobustScaler()
    # RobustScaler expects 2-D input, so wrap the 1-D target list and flatten afterwards
    Y = rbY.fit_transform([[v] for v in train_Y]).ravel()

    svm = SVR(kernel='rbf', C=1e3, gamma=0.0001)
    svm.fit(X, Y)
    svm_pred = svm.predict(rbX.transform(test))
    new_temp = list(rbY.inverse_transform(svm_pred.reshape(-1, 1)).ravel() / 1000)
    print(new_temp)

    # Get the timestamps for the next 24 hours
    timestamp = []
    last_timestamp = last_timestamp[:-5]
    for i in range(1, 25):
        last_timestamp_dt = datetime.strptime(last_timestamp, '%Y-%m-%dT%H:%M:%S')
        last_timestamp_dt = last_timestamp_dt.replace(tzinfo=None)
        correct_time = last_timestamp_dt + timedelta(hours=int(i))
        timestamp.append(str(correct_time))

    # Plot the prediction (and verification)
    line_chart = pygal.Line(x_label_rotation=30)
    line_chart.title = '24 hour forecast of indoor temperature in degrees Celsius'
    line_chart.x_labels = map(str, list(timestamp))
    line_chart.add('Prediction', list(new_temp))
    real_temp[:] = [x / 1000 for x in real_temp]
    line_chart.add('Actual', list(real_temp))

    return Response(response=line_chart.render(), content_type='image/svg+xml')
def trainandTestLR(X_train, y_train, X_test, testindex, output_path):
    rbY = RobustScaler()
    y_train = rbY.fit_transform(y_train.values.reshape(-1, 1))
    model = LinearRegression()
    model.fit(X_train, y_train)

    ans = model.predict(X_test)
    ans = rbY.inverse_transform(ans.reshape(-1, 1))
    ans = np.exp(ans)
    result = testindex
    result['SalePrice'] = ans
    result.to_csv(output_path, index=None)
def predict(n=0):
    h5f = h5py.File('D:/total_data_ro.h5', 'r')
    test_data = h5f['total_2019'][:]
    test_data_gen = h5f['gen_2019'][:]
    h5f.close()

    sc = RobustScaler()
    sc = joblib.load('D:/scaler_gen.pkl')
    test_data_gen = test_data_gen.reshape(-1, 1)

    img_input = Input(shape=(None, 75, 75, 10), name='images')
    convlstm2d = layers.ConvLSTM2D(filters=20, kernel_size=(3, 3),
                                   input_shape=(None, 75, 75, 10),
                                   data_format='channels_last',
                                   return_sequences=True,
                                   padding='same')(img_input)
    batch_norm = layers.BatchNormalization()(convlstm2d)
    convlstm2d_1x1 = layers.ConvLSTM2D(filters=1, kernel_size=(3, 3),
                                       data_format='channels_last',
                                       padding='same')(batch_norm)
    batch_norm = layers.BatchNormalization()(convlstm2d_1x1)
    flatten = layers.Flatten()(batch_norm)
    Dense_1_list = [layers.Dense(units=1)(flatten) for i in range(72)]

    model = Model(img_input, Dense_1_list)
    model.compile(optimizer=rmsprop(lr=0.001), loss=['mae' for i in range(72)])
    model.load_weights('D:/gen_pred_72hours.h5')

    test_sample, gen_sample = generator(test_data, test_data_gen, 120, 72, 0, None,
                                        shuffle=True, batch_size=1)
    test_sample = np.rollaxis(test_sample, 2, 5)

    y_pred = model.predict(test_sample)
    y_pred = np.array(y_pred).reshape(-1, 1)
    # y_test = np.array(gen_sample)
    y_pred = sc.inverse_transform(y_pred)

    sc = MinMaxScaler(feature_range=(0, 40))
    plt.plot(sc.fit_transform(y_pred), label='pred')
    # plt.plot(gen_sample, label='real')
    plt.legend()
    plt.savefig("media/test" + str(n) + ".png")
    print('save img')
    plt.clf()
    plt.cla()
    plt.close()
    # plt.show()
class Scaler:
    def __init__(self, column):
        self.column = column.values.reshape(-1, 1)
        self.scalar = RobustScaler()
        self.scaled_column = None

    @staticmethod
    def get_new_scaler():
        return RobustScaler()

    def transform(self):
        self.scaled_column = self.scalar.fit_transform(self.column)
        return self.scaled_column.flatten()

    def inv_transform(self):
        return self.scalar.inverse_transform(self.scaled_column).flatten()
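# --- Usage sketch (not from the original source): the column wrapper above applied to a
# made-up pandas Series.
import pandas as pd

prices = pd.Series([10.0, 12.0, 11.5, 500.0])  # hypothetical column with an outlier
col_scaler = Scaler(prices)
scaled = col_scaler.transform()        # 1-D array of robust-scaled values
original = col_scaler.inv_transform()  # recovers the original prices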
X_train = rfe.fit_transform(X_train, y_train)
X_test = rfe.transform(X_test)

from sklearn.preprocessing import RobustScaler

rb_X = RobustScaler(with_centering=True, quantile_range=(23.0, 74.0))
rb_y = RobustScaler(with_centering=False, quantile_range=(23.0, 74.0))
X_train = rb_X.fit_transform(X_train)
X_test = rb_X.transform(X_test)
y_train = rb_y.fit_transform(y_train.reshape(-1, 1))[:, 0]

# Applying Grid Search to find the best model and the best parameters
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import r2_score, make_scorer

r2_scorer = make_scorer(r2_score, greater_is_better=True)

# Fitting the kernel ridge regressor to the dataset
regressor = KernelRidge(alpha=1.7e-2, kernel='laplacian', gamma=1.44929e-4)
# regressor = RandomForestRegressor(n_estimators=300, max_depth=3)

# Final solution
regressor.fit(X_train, y_train)
y_test_pred = rb_y.inverse_transform(regressor.predict(X_test).reshape(-1, 1))
y_test_pred = np.rint(y_test_pred)

sol = np.append(arr=ids_test.reshape(-1, 1), values=y_test_pred.reshape(-1, 1), axis=1)
fsol = pd.DataFrame(sol)
fsol.rename(columns={0: 'id', 1: 'y'}, inplace=True)
fsol.to_csv('last_ditch_sol.csv', encoding='utf-8', index=False)
print("DONE")
class ANN(): def __init__(self, df, output_column, train_raw, val_raw, test_raw, continuous_cols): self.output_column = output_column self.train = train_raw self.val = val_raw self.test = test_raw self.continous = continuous_cols self.df = df self.train_size = train_raw.size #keras.backend.set_epsilon(1) def scale(self, train, test, val): #scale features cs = RobustScaler() return (trainX, valX, testX) def prepare(self): self.cs = RobustScaler() self.trainX = self.cs.fit_transform(self.train[self.continous]) self.valX = self.cs.transform(self.val[self.continous]) self.testX = self.cs.transform(self.test[self.continous]) #self.trainX, self.valX, self.testX = self.scale(self.train, self.test, self.val) self.trainY = self.cs.fit_transform(self.train[[self.output_column]]) self.testY = self.cs.transform(self.test[[self.output_column]]) self.valY = self.cs.transform(self.val[[self.output_column]]) return self.trainX, self.trainY, self.valX, self.valY def create_model(self, neurons=256): model = Sequential() initializer = keras.initializers.HeNormal() model.add( Dense(neurons, input_dim=self.trainX.shape[1], name='InputLayer', activation='relu', kernel_initializer=initializer)) model.add(keras.layers.BatchNormalization()) model.add(keras.layers.Dropout(rate=0.36)) neurons = neurons / 2 model.add( Dense(neurons, kernel_initializer=initializer, activation='relu', name='H1')) model.add(keras.layers.BatchNormalization()) model.add(keras.layers.Dropout(rate=0.37)) neurons = neurons / 2 model.add( Dense(neurons, kernel_initializer=initializer, activation='relu', name='H2')) model.add(keras.layers.BatchNormalization()) model.add(keras.layers.Dropout(rate=0.16)) model.add( Dense(1, activation='linear', name="OutputLayer", kernel_initializer=initializer)) self.model = model return model def get_callbacks(self): return [ keras.callbacks.EarlyStopping(monitor='val_loss', patience=50), keras.callbacks.History() ] def train_trials(self, n_trials): study = optuna.create_study() study.optimize(self.objective, n_trials=n_trials) return study.best_params def objective(self, trial): K.clear_session() model = Sequential() neurons = 512 initializer = keras.initializers.HeNormal() initializer = keras.initializers.HeNormal() model.add( Dense(neurons, input_dim=self.trainX.shape[1], name='InputLayer', activation='relu', kernel_initializer=initializer)) model.add(keras.layers.BatchNormalization()) model.add(keras.layers.Dropout(rate=0.36)) neurons = neurons / 2 model.add( Dense(neurons, kernel_initializer=initializer, activation='relu', name='H1')) model.add(keras.layers.BatchNormalization()) model.add(keras.layers.Dropout(rate=0.37)) neurons = neurons / 2 model.add( Dense(neurons, kernel_initializer=initializer, activation='relu', name='H2')) model.add(keras.layers.BatchNormalization()) model.add(keras.layers.Dropout(rate=0.16)) model.add( Dense(neurons, kernel_initializer=initializer, activation='relu', name='H3')) model.add( Dense(1, activation='linear', name="OutputLayer", kernel_initializer=initializer)) trainY = self.trainY testY = self.testY trainX = self.trainX model = self.model opt = Adam(lr=trial.suggest_float('lr', 1e-5, 1e-3, log=True), decay=trial.suggest_float('decay', 1e-5, 0.1, log=True)) model.compile(loss='mae', optimizer=opt) history = model.fit(x=trainX, y=trainY, validation_data=(self.valX, self.valY), epochs=15, verbose=2, batch_size=trial.suggest_int('batchsize', 68, 512, step=12)) return history.history["val_loss"][-1] def train_model(self, hparams, filename, historyname): opt = 
Adam(lr=hparams['lr'], decay=hparams['decay_rate']) #opt = tfa.optimizers.MovingAverage(opt) #keras.backend.set_epsilon(1e-7) print("[INFO] processing data") trainY = self.trainY testY = self.testY trainX = self.trainX model = self.model model.compile(loss='mae', optimizer=opt, metrics=[root_mean_squared_error]) model.summary() print("[INFO] training model...") history = model.fit(x=trainX, y=trainY, validation_data=(self.valX, self.valY), verbose=2, epochs=hparams["epochs"], batch_size=hparams["batch"], callbacks=self.get_callbacks()) model.save(filename) hist_df = pd.DataFrame(history.history) hist_csv_file = f'{filename}/{historyname}.csv' with open(hist_csv_file, mode='w') as f: hist_df.to_csv(f) self.model = model plot_model(model, to_file=f'{filename}/model_archi.png', show_shapes=True, show_layer_names=True) return history, model def predict(self): model = self.model print("[INFO] predicting trade value...") preds = model.predict(self.testX) #print(preds) df = self.df # compute the difference between the *predicted* *actual* , then compute the percentage difference and # the absolute percentage difference diff = preds.flatten() - self.testY percentDiff = (diff / self.testY) * 100 absPercentDiff = np.abs(percentDiff) # compute the mean and standard deviation of the absolute percentage difference mean = np.mean(absPercentDiff) std = np.std(absPercentDiff) locale.setlocale(locale.LC_ALL, "en_US.UTF-8") print("[INFO] avg. trade value: {}, std trade value: {}".format( locale.currency(df[self.output_column].mean(), grouping=True), locale.currency(df[self.output_column].std(), grouping=True))) print("[INFO] mean: {:.2f}%, std: {:.2f}%".format(mean, std)) return self.cs.inverse_transform(preds)
def run(news_df, snp_df, split, stopwords, using_text): data = news_df.groupby('date').sum().join(snp_df.set_index('Date')).dropna() X_temp = data.values X_pre_price = generate_price_features(data) X_price = data['Open'].values y = generate_regression_label(data) y_cls = generate_classification_label(data) tscv = TimeSeriesSplit(n_splits=split) for train_index, test_index in tscv.split(X_temp): start_date = data.index[train_index[0]] split_date = data.index[test_index[0]] end_date = data.index[test_index[-1]] print(start_date, split_date, end_date) if using_text: bag_of_words, vectorizer = generate_bag_of_words(news_df, start_date, split_date, end_date, stopwords) X = data[['Open']].join(bag_of_words, how='inner').drop('Open', axis=1).values word_size = X.shape[1] X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] y_cls_train, y_cls_test = y_cls[train_index], y_cls[test_index] X_train_price = X_price[train_index] X_test_price = X_price[test_index] X_train_pre_price = X_pre_price[train_index] X_test_pre_price = X_pre_price[test_index] # Normalization and Scaling scaler = RobustScaler() scaler.fit(X_train_pre_price[:,0].reshape(-1, 1)) #x_train_price_t = scaler.transform(X_train_price.reshape(-1, 1)) #x_test_price_t = scaler.transform(X_test_price.reshape(-1, 1)) x_train_pre_price_t = scaler.transform(X_train_pre_price.reshape(-1, 1)).reshape(-1, X_train_pre_price.shape[1]) x_test_pre_price_t = scaler.transform(X_test_pre_price.reshape(-1, 1)).reshape(-1, X_test_pre_price.shape[1]) y_train_t = scaler.transform(y_train.reshape(-1, 1)).reshape(-1, ) if using_text: x_text_train_t = normalize(X_train) x_text_test_t = normalize(X_test) x_train_t = np.concatenate((x_text_train_t, x_train_pre_price_t), axis=1) x_test_t = np.concatenate((x_text_test_t, x_test_pre_price_t), axis=1) else: x_train_t = x_train_pre_price_t x_test_t = x_test_pre_price_t # Modeling cls_clf = LogisticRegression(penalty='l2', C=0.5, verbose=0, max_iter=100) cls_clf.fit(x_train_t, y_cls_train) y_train_cls_clf = cls_clf.predict(x_train_t) y_test_cls_clf = cls_clf.predict(x_test_t) #clf = SVR(kernel='linear', C=0.0005, verbose=0) #clf = LinearRegression() clf = linear_model.Ridge(alpha=1.0) clf.fit(x_train_t, y_train_t) y_train_clf = clf.predict(x_train_t) y_test_clf = clf.predict(x_test_t) y_train_hat = scaler.inverse_transform(y_train_clf.reshape(-1, 1)).reshape(-1, ) y_test_hat = scaler.inverse_transform(y_test_clf.reshape(-1, 1)).reshape(-1, ) #ipdb.set_trace() # Evaluation train_acc = accuracy_score(y_cls_train, y_train_cls_clf) test_acc = accuracy_score(y_cls_test, y_test_cls_clf) print("Accuracy ", train_acc, test_acc) train_mse = mean_squared_error(y_train, y_train_hat) test_mas = mean_squared_error(y_test, y_test_hat) print("MSE", train_mse, test_mas) train_return = evaluate_return(X_train_price, y_train_hat, y_train) test_return = evaluate_return(X_test_price, y_test_hat, y_test) print("Return", train_return, test_return) # Words analysis if using_text: print("LR analysis") positive_terms, negative_terms = analysis(x_train_t.shape[1], cls_clf.coef_[0]) print("\tPositive terms: ", vectorizer.inverse_transform(positive_terms[:word_size].reshape(1, -1))[0]) print("\tNegative terms: ", vectorizer.inverse_transform(negative_terms[:word_size].reshape(1, -1))[0]) print("Ridge analysis") positive_terms, negative_terms = analysis(x_train_t.shape[1], clf.coef_) print("\tPositive terms: ", vectorizer.inverse_transform(positive_terms[:word_size].reshape(1, -1))[0]) print("\tNegative 
terms: ", vectorizer.inverse_transform(negative_terms[:word_size].reshape(1, -1))[0]) print("\nBayes analysis") bayes_result = analysis_bay(X_train, y_cls_train) show_terms(['negative', 'positive'], vectorizer, bayes_result[0], bayes_result[1]) print("\n")
def least_square_reference( inst, empty_room=None, max_times_samples=2000, bad_channels=None, scaler=None, mrk=None, elp=None, hsp=None ): """ Fits and applies Least Square projection of the reference channels (potentially from an empty room) and removes the corresponding component from the recordings of a subject. Parameters ---------- inst : Raw | str Raw instance or path to raw data. empty_room : str | None Path to raw data acquired in empty room. max_times_samples : int Number of time sample to use for pinv. Defautls to 2000 bad_channels : list | array, shape (n_chans) of strings Lists bad channels scaler : function | None Scaler functions to normalize data. Defaults to sklearn.preprocessing.RobustScaler. Returns ------- inst : Raw adapted from Adeen Flinker 6/2013 (<*****@*****.**>) LSdenoise.m Main EHN - Automatically detects channel types. - Allows flexible scaler; Robust by default. - The data is projected back in Tesla. - Allows memory control. TODO: - Allow other kind of MNE-Python inst - Allow baseline selection (pre-stim instead of empty room) - Clean up memory - Allow fancy solver (l1, etc) """ from scipy.linalg import pinv from mne.io import read_raw_kit from mne.io import _BaseRaw # Least square can be fitted on empty room or on subject's data if empty_room is None: if not isinstance(inst, _BaseRaw): raw = read_raw_kit(inst, preload=True) else: raw = inst else: if not isinstance(empty_room, _BaseRaw): raw = read_raw_kit(empty_room, preload=True) else: raw = empty_room # Parameters n_chans, n_times = raw._data.shape chan_info = raw.info["chs"] # KIT: axial gradiometers (equiv to mag) ch_mag = np.where([ch["coil_type"] == 6001 for ch in chan_info])[0] # KIT: ref magnetometer ch_ref = np.where([ch["coil_type"] == 6002 for ch in chan_info])[0] # Other channels ch_misc = np.where([ch["coil_type"] not in [6001, 6002] for ch in chan_info])[0] # Bad channel ch_bad = np.empty(0) if (bad_channels is not None) and len(bad_channels): if np.all([isinstance(ch, int) for ch in bad_channels]): bad_channels = np.array(bad_channels) elif np.all([isinstance(ch, str) for ch in bad_channels]): bad_channels = [ii for ii, ch in enumerate(raw.ch_names) if ch in bad_channels] else: raise ValueError("bad_channels needs array of int or array of str") else: bad_channels = [] default_bad_channels = [ii for ii, ch in enumerate(raw.ch_names) if ch in raw.info["bads"]] bad_channels = np.array(default_bad_channels + bad_channels, int) print("bad channels:", [raw.ch_names[bad] for bad in bad_channels]) # To avoid memory error, let's subsample across time sel_times = slice(0, n_times, int(np.ceil(n_times // max_times_samples))) # Whiten data if scaler is None: from sklearn.preprocessing import RobustScaler scaler = RobustScaler() data_bsl = scaler.fit_transform(raw._data.T) # Fit Least Square coefficients on baseline data empty_sensors = data_bsl[:, ch_mag] if len(ch_bad): empty_sensors[:, ch_bad] = 0 # remove bad channels coefs = np.dot(pinv(data_bsl[sel_times, ch_ref]), empty_sensors[sel_times, :]) empty_sensors, data_bsl = None, None # clear memory # Apply correction on subject data if empty_room is not None: del raw raw = read_raw_kit(inst, preload=True) data_subject = scaler.transform(raw._data.T) subject_sensors = data_subject[:, ch_mag] - np.dot(data_subject[:, ch_ref], coefs) # Remove bad channels if len(ch_bad): subject_sensors[:, ch_bad] = 0 # Reproject baseline new_ref = np.dot(subject_sensors, pinv(coefs)) # Un-whiten data to get physical units back data = np.concatenate((subject_sensors, new_ref, 
raw._data[ch_misc, :].T), axis=1) data = scaler.inverse_transform(data) # Output raw._data = data.T return raw
class ActiveLearningClient: def keras_model(self): """ This function compiles and returns a Keras model. Should be passed to KerasClassifier in the Keras scikit-learn API. """ print("KERAS MODEL GENERATION") model = Sequential() model.add(Dense(units=20, input_dim=5, activation='relu')) model.add(Dense(units=20, activation='relu')) model.add(Dense(units=20, activation='sigmoid')) # compile keras model # model.compile(loss=binary_crossentropy, optimizer='adam', metrics=['binary_accuracy', self.full_multi_label_metric]) model.compile(loss=binary_crossentropy, optimizer='adam', metrics=['binary_accuracy']) return model def load_AL_models(self): n_members = 2 # initializing number of Committee members learner_list = list() if not Blobby.objects.exists(): # participant number == 1 # below for loop would only be launched for the very first participant... print("FIRST PARTICIPANT!") self.csv_url = settings.STATICFILES_DIRS[0] + "/um_{}.csv".format(DATASIZE) # default, But MUST BE CHANGED for the actual learning. try: df = pd.read_csv(self.csv_url, header=None, sep=',') tmp_dataset = np.array(df) except Exception as e: print(e) return tmp_sc_x = RobustScaler() cpy_xpool = tmp_sc_x.fit_transform(tmp_dataset[:, :5]) Y = pd.DataFrame(tmp_dataset[:, 5:]) encoded_Y = self.to_ordinal(Y) cpy_ypool = encoded_Y.reshape([np.size(Y,0), 20]) n_initial = 300 # number of initial training data ~ this determines the ratio between user_model vs. human input to inquiry for member_idx in range(n_members): train_idx = np.random.choice(range(cpy_xpool.shape[0]), size=n_initial, replace=False) X_train, y_train = cpy_xpool[train_idx], cpy_ypool[train_idx] cpy_xpool, cpy_ypool = np.delete(cpy_xpool, train_idx, axis=0), np.delete(cpy_ypool, train_idx, axis=0) learner = ActiveLearner( estimator = self.keras_model(), X_training = X_train, y_training = y_train, query_strategy = avg_score ) learner_list.append(learner) else: # participant number > 1, we load models dir_flag = settings.BASE_DIR orig_urls, mod_urls = [dir_flag+"/originalfirst.h5", dir_flag+"/originalsecond.h5"], [dir_flag+"/modifiedfirst.h5", dir_flag+"/modifiedsecond.h5"] for member_idx in range(n_members): model_url = orig_urls[member_idx] if Path(orig_urls[member_idx]).is_file() else mod_urls[member_idx] print("\tLoaded Models from: {}\n".format(model_url)) model = keras.models.load_model(model_url) # load the classifier # when model was loaded, we don't train extra x and y learner_list.append(ActiveLearner(estimator=model, query_strategy=avg_score)) return Committee(learner_list=learner_list, given_classes=np.array([1,2,3,4,5])) def __init__(self): self.learner = self.load_AL_models() def get_data_for_hearing_group(self): dir_flag = settings.STATICFILES_DIRS[0] deafend, deaf, hoh = 'I am Deafened', 'I identify as Deaf', 'I am Hard of Hearing' q = Question.objects.filter(text="1. What statement best describes your relationship to the Deaf and/or Hard of Hearing Communities?")[0] deaf_or_hoh = AnswerRadio.objects.filter(question=q).last().body self.csv_url = dir_flag + "/um_{}.csv".format(DATASIZE) # DEFAULT LOAD if deaf_or_hoh in (deaf, deafend): self.csv_url = dir_flag + "/deaf_{}_used.csv".format(DATASIZE) elif hoh == deaf_or_hoh: self.csv_url = dir_flag + "/hoh_{}_used.csv".format(DATASIZE) else: self.csv_url = dir_flag + "/deaf_{}_used.csv".format(DATASIZE) print("HEARING GROUP:{}, loading:{}".format(deaf_or_hoh, self.csv_url)) try: df = pd.read_csv(self.csv_url, header=None, sep=',') dataset = np.array(df) # this sets the class variable... 
except Exception as e: print(e) return self.sc_x = RobustScaler() self.X_pool = self.sc_x.fit_transform(dataset[:, :5]) Y = pd.DataFrame(dataset[:, 5:]) encoded_Y = self.to_ordinal(Y) self.Y_pool = encoded_Y.reshape([np.size(Y,0), 20]) def to_ordinal(self, y, num_classes=None, dtype='float32'): y = np.array(y, dtype='int') input_shape = y.shape if input_shape and input_shape[-1] == 1 and len(input_shape) > 1: input_shape = tuple(input_shape[:-1]) y = y.ravel() if not num_classes: num_classes = np.max(y) + 1 n = y.shape[0] ordinal = np.zeros((n, num_classes - 1), dtype=dtype) for i, yi in enumerate(y): ordinal[i, :yi] = 1 output_shape = input_shape + (num_classes - 1,) ordinal = np.reshape(ordinal, output_shape) return ordinal def make_preds(self): query_idx, q_instance = self.learner.query(self.X_pool) queried_vals = self.sc_x.inverse_transform(q_instance) # get machine prediction to be displayed machine_prediction = list(np.array(self.learner.predict(q_instance)) + 1) # add 1 to show in 1-5 scale print("machine prediction:", machine_prediction) # these values are +1 from what's predicted. self.test_printing(query_idx, queried_vals[0]) return (q_instance, machine_prediction, queried_vals[0]) def train_learner(self, q_instance, ratings): tmp_queried_val = self.sc_x.inverse_transform(q_instance).astype(int) np_ratings = np.zeros(shape=(1, 20)) # in the shape of multiple columns, padd with zeros for c in [0,1,2,3]: tmp_start = c*5 tmp_i = c*5 + ratings[c] for w in range(tmp_start, tmp_i): np_ratings[0, w] = 1 # User ratings [1,3,4,2]: # [[1. 0. 0. 0. 0. | 1. 1. 1. 0. 0. | 1. 1. 1. 1. 0. | 1. 1. 0. 0. 0.]] # we convert form to (1,20) not (,20) # therefore, the 0-4 range for index doesn't really matter because we convert from 1-5 range to 1,20 anyways. self.learner.teach(q_instance, np_ratings, epochs=100, verbose=0) print("Cappy learning the ratings:{} for q_instance:{}\nwhich is {}".format(ratings, q_instance, tmp_queried_val)) return self.learner, tmp_queried_val def test_printing(self, query_idx, queried_vals): pf_val = "Paraphrased" if queried_vals[3] == 1 else "Verbatim" hearing_group = "Deaf" if queried_vals[4] == 1 else "Hard of Hearing" tmp_str = "The machine selected index {} with raw values:\n\t" \ + "Delay of {} ms\n\tSpeed of {} WPM\n\tMissing {} words and is {}.\n" \ + "Predicting ratings by {}." print(tmp_str.format(query_idx, queried_vals[0], queried_vals[1], queried_vals[2], pf_val, hearing_group))
y = y.reshape(-1, 1)
scaler = RobustScaler()
y3 = scaler.fit_transform(y)
y3 = y3.reshape(1, -1)[0]

print("Training regressor ...")
reg = ensemble.RandomForestRegressor(verbose=1, n_estimators=1000, n_jobs=3)
# pickle.dump(clf, open("reg.pickle", "wb"))
# reg = pickle.load(open("rfr_reg_iter1000_alldata.pickle", "r"))
reg.fit(x3, y3)

print("Predicting regression ...")
reg_result = reg.predict(t2)
reg_result = reg_result.reshape(-1, 1)
# scaler = pickle.load(open("scaler.pickle", "r"))
reg_result = scaler.inverse_transform(reg_result)
reg_result = np.exp(reg_result)
reg_result = reg_result.reshape(1, -1)[0]

#############################
#
# Submission Output
#
#############################

print("Output submission ...")
res_merged = merge(clf_result, reg_result)
res_merged = np.round(res_merged, 2)
test_id = test_id.astype(int)
test_loss = pd.DataFrame(res_merged, columns=['loss'])
class PredictorTrainer: DATA_PATH = 'data/training.csv' MODEL_PATH = 'data/models/' SCALER_PATH = 'data/models/scaler.pkl' TRAINED_MODEL_PATH = 'data/models/fee-predictor-model.h5' BATCH_SIZE = 256 TRAIN_STEPS = 10 TRAIN_DATA_PERCENT = 0.9 def __init__(self, batch_size=BATCH_SIZE, train_steps=TRAIN_STEPS): self.initialize_scaler() def initialize_scaler(self): path = Path(PredictorTrainer.SCALER_PATH) if not path.is_file(): print('Scaler model not found. Initializing.') #self.scaler = MinMaxScaler(feature_range=(0, 1)) self.scaler = RobustScaler() data = self.load_data() self.scaler.fit(data.values[:, 1:]) path.parent.mkdir(parents=True, exist_ok=True) joblib.dump(self.scaler, PredictorTrainer.SCALER_PATH) print('Scaler initialized and saved.') else: print('Found scaler model. Loading.') self.scaler = joblib.load(PredictorTrainer.SCALER_PATH) print('Scaler loaded.') def scale_data(self, data): return self.scaler.transform(data) # splits the data onto training and test set def split_data(self, data, n): train_start = 0 train_end = int(np.floor(0.8 * n)) test_start = train_end + 1 test_end = n return data[train_start:train_end], data[test_start:test_end] # loads the file with default data def load_file(self): return pd.read_csv(PredictorTrainer.DATA_PATH) # there are helper fields in data, this function left only ones which needed to train the model def get_learning_data(self, dataframe): return dataframe.drop(['block_median_fee_per_byte', 'block_id'], axis='columns') # sometimes fee_per_byte is enormous, so we take care of having the normal one here def filter_out_outliners(self, dataframe): return dataframe.query('fee_per_byte < block_median_fee_per_byte') # do all transformation needed to get info suitable for training def load_data(self): data = self.load_file() data = self.filter_out_outliners(data) return self.get_learning_data(data) def train(self): data = self.load_data() n = data.shape[0] data = data.values data_train, data_test = self.split_data(data, n) x_train = self.scale_data(data_train[:, 1:]) y_train = data_train[:, 0] x_test = self.scale_data(data_test[:, 1:]) y_test = data_test[:, 0] model = keras.Sequential([ keras.layers.Dense(3, kernel_initializer='normal', input_dim=3), keras.layers.Dense(1024, kernel_initializer='normal'), keras.layers.PReLU(), keras.layers.Dropout(0.1), keras.layers.Dense(512, kernel_initializer='normal'), keras.layers.PReLU(), keras.layers.Dropout(0.1), keras.layers.Dense(256, kernel_initializer='normal'), keras.layers.PReLU(), keras.layers.Dropout(0.1), keras.layers.Dense(128, kernel_initializer='normal'), keras.layers.PReLU(), keras.layers.Dropout(0.1), keras.layers.Dense( 64, kernel_initializer='normal', ), keras.layers.PReLU(), keras.layers.Dropout(0.1), keras.layers.Dense(32, kernel_initializer='normal'), keras.layers.PReLU(), keras.layers.Dropout(0.1), keras.layers.Dense(1, kernel_initializer='normal') ]) model.compile(optimizer='adam', loss=tf.losses.huber_loss) model.fit(x_train, y_train, epochs=10, batch_size=250) model.save(PredictorTrainer.TRAINED_MODEL_PATH) def load_model(self, model_name): return keras.models.load_model( model_name, custom_objects={'huber_loss': tf.losses.huber_loss}) def evaluate_block(self, model_name, test_file): model = self.load_model(model_name) data_raw = pd.read_csv(test_file) min_fee = data_raw[['fee_per_byte']].min().values[0] median_fee = data_raw[['block_median_fee_per_byte']].values[0][0] data = data_raw.query('confirmation_speed == 0') data = self.get_learning_data(data) data_y = data[:, 0] data_x = 
self.scale_data(data[:, 1:]) predicted = model.predict(data_x).flatten() hit = np.where(predicted > min_fee)[0].size out = np.where(predicted > median_fee)[0].size total_good = np.where((min_fee < predicted) & (predicted < median_fee))[0].size print('hit', hit) print('out', out) print('total_good', total_good) total_fee_loss = 0 sizes = data_raw.query('confirmation_speed == 0')[['vsize' ]].values.flatten() for i in range(0, data_y.size): total_fee_loss += sizes[i] * (data_y[i] - predicted[i]) print('total_fee_loss', total_fee_loss) return # evaluates the model predictions and write down values to file for further analisys def evaluate(self): # idea is to check how well we predict fee so that transaction were added to the first block after they appear in mempool model = self.load_model(PredictorTrainer.TRAINED_MODEL_PATH) data_raw = self.load_file() # looking for blocks which wasn't used during training so that get legitimate result # the first step is get training set the same way as we did this during training session data = self.filter_out_outliners(data_raw) data_train, data_test = self.split_data(data, data.shape[0]) data_train_blocks = set(data_train['block_id'].values.flatten() ) # block ids which were used during training all_blocks = set( data_raw['block_id'].values.flatten()) # all block ids in our data block_indexes_to_evaluate = list( all_blocks.difference(data_train_blocks) ) # this difference are block ids which wasn't used by training process data = data_raw[( data_raw['block_id'].isin(block_indexes_to_evaluate) )] # filter the data which wasn't used in training so we can use it to evaluate data = data.query( 'confirmation_speed == 0' ) # we looking only for results where transaction were added to the first next block after it added to mempool #collecting the statistics output = pd.DataFrame(columns=[ 'block_id', 'min_fee', 'median_fee', 'predicted_mean_fee', 'predicted_median_fee' ]) for name, group in data.groupby('block_id'): min_fee = group['fee_per_byte'].min() median_fee = group['fee_per_byte'].median() learning_data = self.get_learning_data(group) x_test = self.scale_data(learning_data.values[:, 1:]) y_predicted = model.predict(x_test).flatten() predicted_mean_fee = float(np.mean(y_predicted)) predicted_median_fee = float(np.median(y_predicted)) output = output.append( { 'block_id': name, 'min_fee': min_fee, 'median_fee': median_fee, 'predicted_mean_fee': predicted_mean_fee, 'predicted_median_fee': predicted_median_fee }, ignore_index=True) output.to_csv( os.path.join(PredictorTrainer.MODEL_PATH, 'evaluation_output.csv')) def predict(self, predict, expected, model_name): predict_scaled = self.scale_data(predict)[:, 1:] sess, x, y, out = self.load_model( os.path.join(PredictorTrainer.MODEL_PATH, model_name)) predictions = sess.run(out, feed_dict={x: predict_scaled}) template = 'Prediction is "{}", expected "{}"\n' output = [] i = 0 for pred, expec in zip(predictions[0, :], expected): inversed = self.scaler.inverse_transform( np.array([[pred, predict[i][1], predict[i][2], predict[i][3]]])) pred = inversed[0, 0] print(template.format(pred, expec)) output.append({ 'mempool_megabytes': predict[i][1], 'mempool_tx_count': predict[i][2], 'confirmation_speed': predict[i][3], 'prediction': pred }) i += 1 return output
y1 = y[:, 0:1]
y2 = y[:, 1:2]
rbY1 = RobustScaler()
y1 = rbY1.fit_transform(y1)
rbY2 = RobustScaler()
y2 = rbY2.fit_transform(y2)

C = 1e3  # SVM regularization parameter
svc1 = svm.SVR(kernel='rbf', C=C, gamma=0.1).fit(X_scaled, [x[0] for x in y1])
svc2 = svm.SVR(kernel='rbf', C=C, gamma=0.1).fit(X_scaled, [x[0] for x in y2])

svm_pred = svc1.predict(rbX.transform(X_val))
svm_pred = np.reshape(svm_pred, (-1, 1))
y1_pred = rbY1.inverse_transform(svm_pred)

svm_pred = svc2.predict(rbX.transform(X_val))
svm_pred = np.reshape(svm_pred, (-1, 1))
y2_pred = rbY2.inverse_transform(svm_pred)

predicted = np.concatenate((y1_pred, y2_pred), axis=1)
dist_err = np.array(
    list(map(lambda x: geo_dist(x[0], x[1]), zip(predicted, y_val))))
err_mean = np.mean(dist_err)

## Random Forest
regr = RandomForestRegressor(random_state=0, n_estimators=1000, oob_score=True)
regr.fit(X, y)
# saving the NN results as .json and .h5
# model_json = model.to_json()
# with open("model.json", "w") as json_file:
#     json_file.write(model_json)
# model.save_weights("model.h5")

# saving the NN model
# import joblib
# filename = 'predictor101.sav'
# joblib.dump(model, filename)
# load_model = joblib.load(filename)

yhat = model.predict(
    np.array(df.tail(n_per_in)).reshape(1, n_per_in, n_features))

# Transforming the predicted values back to their original format
yhat = close_scaler.inverse_transform(yhat)[0]

# Creating a DF of the predicted prices
preds = pd.DataFrame(yhat,
                     index=pd.date_range(start=df.index[-1] + timedelta(days=1),
                                         periods=len(yhat),
                                         freq="B"),
                     columns=[df.columns[0]])

# Number of periods back to plot the actual values
pers = n_per_in

# Transforming the actual values to their original price
actual = pd.DataFrame(close_scaler.inverse_transform(df[["Close"]].tail(pers)),
                      index=df.Close.tail(pers).index,
                      columns=Xtest.columns)

Ytrain = pd.DataFrame(
    rbsy.transform(Ytrain))  # , index=Ytrain.index, columns=Ytrain.columns)

if not os.path.isfile('.\\model.dat'):
    model = LinearRegression()
    model.fit(Xtrain, Ytrain)
    pickle.dump(model, open("model.dat", "wb"))
else:
    model = pickle.load(open("model.dat", "rb"))

ypred = model.predict(Xtest)
ypred = ypred.reshape((-1, 1))
ypred = rbsy.inverse_transform(ypred)

rmse = sqrt(mean_squared_error(Ytest, ypred))
print(rmse)

Ytest_np = Ytest.to_numpy()
ypred = ypred.flatten()

fig = go.Figure()
fig.add_trace(go.Scatter(y=Ytest_np, mode='lines', name='Ytest'))
fig.add_trace(go.Scatter(y=ypred, mode='lines', name='ypred'))
fig.write_html(f'.\\Ytest.html')

new = 2
chromepath = 'C:/Program Files (x86)/Google/Chrome/Application/chrome.exe %s'
webbrowser.get(chromepath).open(f'.\\Ytest.html', new=new)
                   verbose=True,
                   learning_rate='adaptive',
                   tol=0.0,
                   warm_start=True,
                   solver='adam')
reg.fit(X_train, Y_train)

pred_y = reg.predict(X_test)
plt.plot(pred_y.flatten(), label='predict')
plt.plot(Y_test.flatten(), label='real')
plt.legend()
plt.show()

pred = rob_sca.inverse_transform(pred_y.reshape(-1, 1))
test = rob_sca.inverse_transform(Y_test.reshape(-1, 1))
err = abs(pred - test) / test

plt.plot(pred.flatten(), label='predict')
plt.plot(test.flatten(), label='real')
plt.legend()
plt.show()

plt.plot(err, label='err')
plt.legend()
plt.show()

# Error variance
re_err = abs(pred - test)
valor_arrecadacao_serie_temporal_lstm_treino = LSTMUtil.cria_intervalos_temporais(valor_treino_rbt)
valor_arrecadacao_serie_temporal_lstm_teste = LSTMUtil.cria_intervalos_temporais(valor_teste_rbt)

model = LSTMUnivariada(df_treino)
checkpoint = ModelCheckpoint('checkpoint_regressor_' + tributo + '_teste_robust_scaler.hdf5',
                             monitor='loss', verbose=2, save_best_only=True,
                             save_weights_only=False, mode='auto', period=1)
model.compile(optimizer=ko.Adam(lr=0.1), loss='mse')
model.fit([np_dia_mes_treino, valor_arrecadacao_serie_temporal_lstm_treino], saida_treino,
          validation_data=([np_dia_mes_teste, valor_arrecadacao_serie_temporal_lstm_teste], saida_teste),
          epochs=100, batch_size=50, callbacks=[checkpoint])

# Load the best model saved by the checkpoint
model.load_weights('checkpoint_regressor_' + tributo + '_teste_robust_scaler.hdf5')

rbt_pred = model.predict([np_dia_mes_teste, valor_arrecadacao_serie_temporal_lstm_teste])
mae_rbt = mean_absolute_error(rbt_scaler.inverse_transform(saida_teste),
                              rbt_scaler.inverse_transform(rbt_pred))
print('The MAE for tax "' + tributo + '" using the Robust Scaler was ' + str(mae_rbt))
comparativo.loc[tributo, 'RobustScaler'] = mae_rbt

# Power Transformer (yeo-johnson)
pwr_scaler = PowerTransformer()
valor_treino_pwr = pwr_scaler.fit_transform(df_treino['Valor'].values.reshape(-1, 1))
valor_teste_pwr = pwr_scaler.transform(df_teste['Valor'].values.reshape(-1, 1))

# The output (label) is the revenue collected on the day after the last day of the sequence
saida_treino = valor_treino_pwr[5:]
saida_teste = valor_teste_pwr[5:]

valor_arrecadacao_serie_temporal_lstm_treino = LSTMUtil.cria_intervalos_temporais(valor_treino_pwr)
valor_arrecadacao_serie_temporal_lstm_teste = LSTMUtil.cria_intervalos_temporais(valor_teste_pwr)
def anomallyDet(revDF, resName, valid=None, test=0.25, plot=True, figsize=(15, 5), n_steps=30, units=64, dropout=0.2, optimizer='adam', metrics='accuracy', batch_size=32, loss='mae', epochs=100): """ FORECAST FUTURE SALES WITH THE USE OF ARIMA MODELING Inputs: :param revDF: Generated and clustered restaurant revenue dataframe :param resName: Name of restuarnt in interest of analyzing :param valid: Valid dataframe size :param test: Test dataframe size, DEFAULT: 0.25 :param plot: If the function should plot, DEFAULT: True :param figsize: Plot figure size, DEFAULT: (15, 5) :param n_steps: Sequence for n_steps of days for historical data :param units: Dimensionality of the output space :param dropout: Fraction of the units to drop for the linear transformation of the inputs, DEFAULT = 0.2 :param optimizier: Updating modle in response to the output of the loss function, DEFAULT: adam :param loss: Compute the quantity that a model should seek to minimize, DEFAULT: mae :param metrics: Function used to judge the performance of the LSTM model ,DEFAULT: accuracy :param batch_size: Number of samples per gradient update, DEFAULT: 32 :param epochs: Number of epochs to train LSTM model, DEFAULT: 100 """ revCopy = revDF.copy() resName = resName.lower() print('**** SPLICING GENERATED DATAFRAME ****') revCopy = revCopy.reset_index() revCopy = revCopy[['Date', resName]] revCopy['Date'] = revCopy['Date'].astype('datetime64') first_idx = revCopy[resName].first_valid_index() revCopy = revCopy.loc[first_idx:] revCopy = revCopy.reset_index(drop=True) revCopy = revCopy.groupby('Date').sum() if valid is None: print('**** SPLITING INTO TRAIN AND TEST **** \n') trainDF, testDF = split_train_test(revCopy, valid=valid, test=test, plot=plot, figsize=figsize) print('**** ROBUST SCALING TRAIN AND TEST DATA **** \n') robust = RobustScaler(quantile_range=(25, 75)).fit(trainDF) trainDF_scaled = robust.transform(trainDF) testDF_scaled = robust.transform(testDF) ## HELPER FUNCTION def create_dataset(X, y, time_steps=1): a, b = [], [] for i in range(len(X) - time_steps): v = X[i:(i + time_steps)] a.append(v) b.append(y[i + time_steps]) return np.array(a), np.array(b) ## CREATE SEQUENCES WITH N_STEPS DAYS OF HISTORICAL DATA n_steps = n_steps print('**** RESHAPING DATA INTO 3D FOR LSTM MODEL **** \n') ## RESHAPE TO 3D [n_samples, n_steps, n_features] X_train, y_train = create_dataset(trainDF_scaled, trainDF_scaled, n_steps) X_test, y_test = create_dataset(testDF_scaled, testDF_scaled, n_steps) print('X_train shape:', X_train.shape) print('y_train:', y_train.shape) print('X_test shape:', X_test.shape) print('y_test:', y_test.shape) print('**** BUILDING LSTM MODEL ****') units = units dropout = dropout optimizer = optimizer loss = loss epochs = epochs model = Sequential() model.add( LSTM(units=units, input_shape=(X_train.shape[1], X_train.shape[2]))) model.add(Dropout(rate=dropout)) model.add(RepeatVector(n=X_train.shape[1])) model.add(LSTM(units=units, return_sequences=True)) model.add(Dropout(rate=dropout)) model.add(TimeDistributed(Dense(units=X_train.shape[2]))) print(model.summary()) print('\n **** COMPILING AND FITTING LSTM MODEL **** \n') model.compile(loss=loss, optimizer=optimizer, metrics=metrics) history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.2, shuffle=False) if plot is True: print('**** PLOT MODEL LOSS OVER EPOCHS ****') plt.figure(figsize=figsize) plt.plot(history.history['loss']) plt.plot(history.history['val_loss']) plt.title('Model Loss') plt.ylabel('Loss') 
plt.xlabel('Epoch') plt.legend(['Train', 'Test'], loc='best') plt.grid(True) plt.show() else: pass print('\n **** PREDICTING ON TEST DATAFRAME ****') y_pred = model.predict(X_test) mae = np.mean(np.abs(y_pred - X_test), axis=1) ## RESHAPING PREDICTION pred = y_pred.reshape((y_pred.shape[0] * y_pred.shape[1]), y_pred.shape[2]) ## RESHAPING TEST DATA X_test = X_test.reshape((X_test.shape[0] * X_test.shape[1]), X_test.shape[2]) ## ERROR COMPUTATION errors = X_test - pred print('Error:', errors.shape) ## RMSE DATA RMSE = math.sqrt(mean_squared_error(X_test, pred)) print(f'Test RMSE: {RMSE} \n') ## DETECTING ANOMALIES print('**** DETECTING ANOMALIES IN SALES ****') dist = np.linalg.norm(X_test - pred, axis=1) scores = dist.copy() scores.sort() cut_off = int(0.8 * len(scores)) threshold = scores[cut_off] score = pd.DataFrame(index=testDF[n_steps:].index) score['Loss'] = mae score['Threshold'] = threshold score['Anomaly'] = score['Loss'] > score['Threshold'] score[resName] = testDF[n_steps:][resName] anomalies = score[score['Anomaly'] == True] x = pd.DataFrame(anomalies[resName]) x = pd.DataFrame(robust.inverse_transform(x)) x.index = anomalies.index x.rename(columns={0: 'Revenue'}, inplace=True) anomalies = anomalies.join(x, how='left') anomalies = anomalies.drop(columns=[resName], axis=1) test_inv = pd.DataFrame(robust.inverse_transform(testDF[n_steps:])) test_inv.index = testDF[n_steps:].index test_inv.rename(columns={0: resName}, inplace=True) if plot is True: print('**** PLOTTING ANOMALLY DETECTION ****') plt.figure(figsize=figsize) plt.plot(test_inv.index, test_inv[resName], color='gray', label=resName) sns.scatterplot(anomalies.index, anomalies['Revenue'], color='red', s=55, label='Anomaly') plt.xticks(rotation=90) plt.xlabel('Date') plt.ylabel('Sales') plt.legend(loc='best') plt.grid(True) plt.show() print('\n **** SAVING ANOMALY MODEL PREDICTIONS LOCALLY ****') resFileName = resName.replace(' ', '_') fileName = f'{resFileName.upper()}_ANOMALY_PREDICTIONS.csv' anomalies.to_csv(fileName) else: print('\n **** SAVING ANOMALY MODEL PREDICTIONS LOCALLY ****') resFileName = resName.replace(' ', '_') fileName = f'{resFileName.upper()}_ANOMALY_PREDICTIONS.csv' anomalies.to_csv(fileName) else: print('**** SPLITING INTO TRAIN, VALID, AND TEST **** \n') trainDF, validDF, testDF = split_train_test(revCopy, valid=valid, test=test, plot=plot, figsize=figsize) print('**** ROBUST SCALING TRAIN, VALID, TEST DATA **** \n') robust = RobustScaler(quantile_range=(25, 75)).fit(trainDF) trainDF_scaled = robust.transform(trainDF) validDF_scaled = robust.transform(validDF) testDF_scaled = robust.transform(testDF) ## HELPER FUNCTION def create_dataset(X, y, time_steps=1): a, b = [], [] for i in range(len(X) - time_steps): v = X[i:(i + time_steps)] a.append(v) b.append(y[i + time_steps]) return np.array(a), np.array(b) ## CREATE SEQUENCES WITH N_STEPS DAYS OF HISTORICAL DATA n_steps = n_steps print('**** RESHAPING DATA INTO 3D FOR LSTM MODEL **** \n') ## RESHAPE TO 3D [n_samples, n_steps, n_features] X_train, y_train = create_dataset(trainDF_scaled, trainDF_scaled, n_steps) X_valid, y_valid = create_dataset(validDF_scaled, validDF_scaled, n_steps) X_test, y_test = create_dataset(testDF_scaled, testDF_scaled, n_steps) print('X_train shape:', X_train.shape) print('y_train:', y_train.shape) print('X_test shape:', X_test.shape) print('y_test:', y_test.shape) print('**** BUILDING LSTM MODEL ****') units = units dropout = dropout optimizer = optimizer loss = loss epochs = epochs model = Sequential() model.add( 
LSTM(units=units, input_shape=(X_train.shape[1], X_train.shape[2]))) model.add(Dropout(rate=dropout)) model.add(RepeatVector(n=X_train.shape[1])) model.add(LSTM(units=units, return_sequences=True)) model.add(Dropout(rate=dropout)) model.add(TimeDistributed(Dense(units=X_train.shape[2]))) print(model.summary()) print('\n **** COMPILING AND FITTING LSTM MODEL **** \n') model.compile(loss=loss, optimizer=optimizer, metrics=metrics) history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_valid, y_valid), shuffle=False) if plot is True: print('**** PLOT MODEL LOSS OVER EPOCHS ****') plt.figure(figsize=figsize) plt.plot(history.history['loss']) plt.plot(history.history['val_loss']) plt.title('Model Loss') plt.ylabel('Loss') plt.xlabel('Epoch') plt.legend(['Train', 'Test'], loc='best') plt.grid(True) plt.show() else: pass print('\n **** PREDICTING ON TEST DATAFRAME ****') y_pred = model.predict(X_test) mae = np.mean(np.abs(y_pred - X_test), axis=1) ## RESHAPING PREDICTION pred = y_pred.reshape((y_pred.shape[0] * y_pred.shape[1]), y_pred.shape[2]) ## RESHAPING TEST DATA X_test = X_test.reshape((X_test.shape[0] * X_test.shape[1]), X_test.shape[2]) ## ERROR COMPUTATION errors = X_test - pred print('Error:', errors.shape) ## RMSE DATA RMSE = math.sqrt(mean_squared_error(X_test, pred)) print(F'Test RMSE: {RMSE}') ## DETECTING ANOMALIES print('\n **** DETECTING ANOMALIES IN SALES ****') dist = np.linalg.norm(X_test - pred, axis=1) scores = dist.copy() scores.sort() cut_off = int(0.8 * len(scores)) threshold = scores[cut_off] score = pd.DataFrame(index=testDF[n_steps:].index) score['Loss'] = mae score['Threshold'] = threshold score['Anomaly'] = score['Loss'] > score['Threshold'] score[resName] = testDF[n_steps:][resName] anomalies = score[score['Anomaly'] == True] x = pd.DataFrame(anomalies[resName]) x = pd.DataFrame(robust.inverse_transform(x)) x.index = anomalies.index x.rename(columns={0: 'Revenue'}, inplace=True) anomalies = anomalies.join(x, how='left') anomalies = anomalies.drop(columns=[resName], axis=1) test_inv = pd.DataFrame(robust.inverse_transform(testDF[n_steps:])) test_inv.index = testDF[n_steps:].index test_inv.rename(columns={0: resName}, inplace=True) if plot is True: print('**** PLOTTING ANOMALLY DETECTION ****') plt.figure(figsize=figsize) plt.plot(test_inv.index, test_inv[resName], color='gray', label=resName) sns.scatterplot(anomalies.index, anomalies['Revenue'], color='red', s=55, label='Anomaly') plt.xticks(rotation=90) plt.xlabel('Date') plt.ylabel('Sales') plt.legend(loc='best') plt.grid(True) plt.show() print('\n **** SAVING ANOMALY MODEL PREDICTIONS LOCALLY ****') resFileName = resName.replace(' ', '_') fileName = f'{resFileName.upper()}_ANOMALY_PREDICTIONS.csv' anomalies.to_csv(fileName) else: print('\n **** SAVING ANOMALY MODEL PREDICTIONS LOCALLY ****') resFileName = resName.replace(' ', '_') fileName = f'{resFileName.upper()}_ANOMALY_PREDICTIONS.csv' anomalies.to_csv(fileName)
                    epochs=50,
                    batch_size=72,
                    validation_split=0.1,
                    shuffle=False)

scores = model.evaluate(X_test1, y_test1, verbose=0)
print(scores)

plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()

y_pred1 = model.predict(X_test1)
y_train_inv1 = cnt_transformer.inverse_transform(y_train1.reshape(1, -1))
y_test_inv1 = cnt_transformer.inverse_transform(y_test1.reshape(1, -1))
y_pred_inv1 = cnt_transformer.inverse_transform(y_pred1)
print(y_pred_inv1)

from matplotlib import pyplot
pyplot.plot(history.history['loss'])
pyplot.plot(history.history['val_loss'])
pyplot.title('model train vs validation loss')
pyplot.ylabel('loss')
pyplot.xlabel('epoch')
pyplot.legend(['train', 'validation'], loc='upper right')
pyplot.show()

import pickle
                    epochs=epochs,
                    batch_size=batch_size,
                    validation_split=0.1,
                    shuffle=False,
                    callbacks=[earlystop_callback],
                    verbose=2)

# save model
# my_model_path = os.path.dirname('saved_model/my_model')
# model.save(my_model_path)

# plt.plot(history.history['loss'], label='train')
# plt.plot(history.history['val_loss'], label='validation')
# plt.legend()

# evaluate model on testing data
y_pred = model.predict(X_test)
y_train_inv = label_column_max_transformer.inverse_transform(y_train.reshape(1, -1))
y_test_inv = label_column_max_transformer.inverse_transform(y_test.reshape(1, -1))
y_pred_inv = label_column_max_transformer.inverse_transform(y_pred)

f2 = plt.figure()
plot_prediction(f2, y_test_inv.flatten(), y_pred_inv.flatten())

# print R2, MAE and MSE on the scaled labels
print(metrics.r2_score(y_test, y_pred))
print(metrics.mean_absolute_error(y_test, y_pred))
print(metrics.mean_squared_error(y_test, y_pred))
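# --- Hedged example (toy data, not the variables above) ----------------------
# The metrics above are computed on the scaled labels. R2 is unchanged by a
# per-column affine scaling, but MAE and MSE are not, so applying
# inverse_transform first reports them in the original label units.
# Sketch under assumed toy data:
import numpy as np
from sklearn import metrics
from sklearn.preprocessing import RobustScaler

rng = np.random.default_rng(1)
y_orig = rng.uniform(100.0, 200.0, size=(50, 1))            # labels in original units
scaler = RobustScaler().fit(y_orig)
y_true_s = scaler.transform(y_orig)
y_pred_s = y_true_s + rng.normal(scale=0.05, size=y_true_s.shape)

y_true = scaler.inverse_transform(y_true_s)
y_pred = scaler.inverse_transform(y_pred_s)
print('R2 scaled   :', metrics.r2_score(y_true_s, y_pred_s))   # same value as below
print('R2 original :', metrics.r2_score(y_true, y_pred))
print('MAE original:', metrics.mean_absolute_error(y_true, y_pred))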
for i in range(len(x_scaled) - history_period_size - future_period_predict):
    sequential_data.append([
        x_scaled[i:(i + history_period_size)],
        y_scaled[i + history_period_size + future_period_predict - 1]
    ])

x, y = [], []
for seq, target in sequential_data:
    x.append(seq)
    y.append(target)

# Predict
x_pred = np.array(x)
y_pred = np.array(y)
y_inverse = y_transformer.inverse_transform(y_pred)
predicted = model.predict(x_pred)
predicted_inverse = y_transformer.inverse_transform(predicted)

# print(output_features)
# print('Predicted:')
# print(np.round(predicted_inverse[0], 1))
# print('Original:')
# print(records.iloc[i_pred + history_period_size][output_features].to_numpy())

plt.figure(figsize=(10, 6))
plt.plot(y_inverse[:200, 2], 'b', label='Measured')
plt.plot(predicted_inverse[:200, 2], 'r', label='Predicted')
plt.ylabel('Min Temp (C)')
plt.legend()
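# --- Hedged example (illustrative names and toy data) ------------------------
# The loop above pairs `history_period_size` consecutive rows of inputs with
# the target `future_period_predict` steps ahead. Small sketch of the same
# windowing that makes the resulting 3-D/2-D shapes explicit:
import numpy as np

def make_windows(x, y, history=7, horizon=1):
    """Pair `history` rows of x with the y value `horizon` steps later."""
    xs, ys = [], []
    for i in range(len(x) - history - horizon):
        xs.append(x[i:i + history])
        ys.append(y[i + history + horizon - 1])
    return np.array(xs), np.array(ys)

x_demo = np.arange(40.0).reshape(20, 2)   # 20 timesteps, 2 features
y_demo = np.arange(20.0).reshape(20, 1)   # 1 target per timestep
X_seq, y_seq = make_windows(x_demo, y_demo, history=7, horizon=1)
print(X_seq.shape, y_seq.shape)           # (12, 7, 2) (12, 1)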
batch_size = 128
g = generator.flow(Xt, yt, batch_size=batch_size, shuffle=True)
steps_per_epoch = 10000 / batch_size

reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5,
                              patience=10, min_lr=1e-6, verbose=1)
history = model.fit_generator(g,
                              steps_per_epoch=len(Xt) // batch_size,
                              epochs=150,
                              validation_data=(X_test, y_test_s),
                              callbacks=[reduce_lr])

name = "Chemception_like_demo"
model.save("data/%s.h5" % name)

hist = history.history
import pickle
# file() is a Python 2 built-in; use open() in binary mode for pickling.
with open("data/%s_history.pickle" % name, "wb") as fh:
    pickle.dump(hist, fh)

y_pred_t = rbs.inverse_transform(model.predict(X_train))
y_pred = rbs.inverse_transform(model.predict(X_test))
corr2 = np.corrcoef(np.log(y_test).reshape(1, -1), y_pred.reshape(1, -1))[0][1]**2
rmse = np.mean((np.log(y_test) - y_pred)**2)**0.5
print("R2 : %0.2F" % corr2)
print("RMSE : %0.2F" % rmse)
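# --- Hedged example (toy arrays) ----------------------------------------------
# The R2 above is taken as the squared Pearson correlation between the log
# targets and the predictions: np.corrcoef returns the 2x2 correlation matrix
# and the off-diagonal entry is squared. Sketch with assumed toy data:
import numpy as np

rng = np.random.default_rng(2)
y_true_log = rng.normal(size=100)
y_hat = y_true_log + rng.normal(scale=0.3, size=100)
r2 = np.corrcoef(y_true_log.reshape(1, -1), y_hat.reshape(1, -1))[0][1] ** 2
rmse = np.mean((y_true_log - y_hat) ** 2) ** 0.5
print("R2 : %0.2F" % r2)
print("RMSE : %0.2F" % rmse)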
def least_square_reference(inst, empty_room=None, max_times_samples=2000,
                           bad_channels=None, scaler=None, mrk=None, elp=None,
                           hsp=None):
    """Fit and apply a least-square projection of the reference channels.

    Downloaded (function least_square_reference) from
    https://github.com/kingjr/jr-tools/blob/master/jr/meg/kit.py and added to
    base_funcs.

    Fits a least-square projection of the reference channels (potentially from
    an empty-room recording) and removes the corresponding component from the
    subject's recordings.

    Parameters
    ----------
    inst : Raw | str
        Raw instance or path to raw data.
    empty_room : str | None
        Path to raw data acquired in an empty room.
    max_times_samples : int
        Number of time samples to use for pinv. Defaults to 2000.
    bad_channels : list | array, shape (n_chans,) of str
        List of bad channels.
    scaler : function | None
        Scaler function to normalize the data. Defaults to
        sklearn.preprocessing.RobustScaler.

    Returns
    -------
    inst : Raw

    Notes
    -----
    Adapted from Adeen Flinker's LSdenoise.m (6/2013, <*****@*****.**>).

    Main enhancements:
    - Automatically detects channel types.
    - Allows a flexible scaler; Robust by default.
    - The data is projected back in Tesla.
    - Allows memory control.

    TODO:
    - Allow other kinds of MNE-Python inst.
    - Allow baseline selection (pre-stim instead of empty room).
    - Clean up memory.
    - Allow fancy solvers (l1, etc.).
    """
    from scipy.linalg import pinv
    from mne.io import read_raw_fif
    from mne.io import BaseRaw

    # Least squares can be fitted on the empty room or on the subject's data
    if empty_room is None:
        if not isinstance(inst, BaseRaw):
            raw = read_raw_fif(inst, preload=True)
        else:
            raw = inst
    else:
        if not isinstance(empty_room, BaseRaw):
            raw = read_raw_fif(empty_room, preload=True)
        else:
            raw = empty_room

    # Parameters
    n_chans, n_times = raw._data.shape
    chan_info = raw.info['chs']

    # KIT: axial gradiometers (equivalent to mag)
    ch_mag = np.where([ch['coil_type'] == 6001 for ch in chan_info])[0]
    # KIT: reference magnetometers
    ch_ref = np.where([ch['coil_type'] == 6002 for ch in chan_info])[0]
    # Other channels
    ch_misc = np.where(
        [ch['coil_type'] not in [6001, 6002] for ch in chan_info])[0]
    # Check that the refs are included
    assert len(ch_ref) != 0, \
        "MEG refs are not among the channels! They are needed for denoise!"
    # Bad channels
    if (bad_channels is not None) and len(bad_channels):
        if np.all([isinstance(ch, int) for ch in bad_channels]):
            bad_channels = np.array(bad_channels)
        elif np.all([isinstance(ch, str) for ch in bad_channels]):
            bad_channels = [ii for ii, ch in enumerate(raw.ch_names)
                            if ch in bad_channels]
        else:
            raise ValueError('bad_channels needs array of int or array of str')
    else:
        bad_channels = []
    default_bad_channels = [ii for ii, ch in enumerate(raw.ch_names)
                            if ch in raw.info['bads']]
    bad_channels = np.array(default_bad_channels + list(bad_channels), int)
    print('bad channels:', [raw.ch_names[bad] for bad in bad_channels])
    # Positions of the bad channels within the magnetometer block
    ch_bad = np.where(np.in1d(ch_mag, bad_channels))[0]

    # To avoid memory errors, subsample across time
    # (keep the step at least 1 when n_times < max_times_samples)
    sel_times = slice(0, n_times,
                      max(1, int(np.ceil(n_times / max_times_samples))))

    # Whiten data
    if scaler is None:
        from sklearn.preprocessing import RobustScaler
        scaler = RobustScaler()
    data_bsl = scaler.fit_transform(raw._data.T)

    # Fit least-square coefficients on baseline data
    empty_sensors = data_bsl[:, ch_mag]
    if len(ch_bad):
        empty_sensors[:, ch_bad] = 0  # remove bad channels
    coefs = np.dot(pinv(data_bsl[sel_times, ch_ref]),
                   empty_sensors[sel_times, :])
    empty_sensors, data_bsl = None, None  # clear memory

    # Apply the correction on the subject's data
    if empty_room is not None:
        del raw
        raw = read_raw_fif(inst, preload=True)
    data_subject = scaler.transform(raw._data.T)
    subject_sensors = (data_subject[:, ch_mag] -
                       np.dot(data_subject[:, ch_ref], coefs))

    # Remove bad channels
    if len(ch_bad):
        subject_sensors[:, ch_bad] = 0

    # Reproject baseline
    new_ref = np.dot(subject_sensors, pinv(coefs))

    # Un-whiten data to get physical units back
    data = np.concatenate((subject_sensors, new_ref, raw._data[ch_misc, :].T),
                          axis=1)
    data = scaler.inverse_transform(data)

    # Output
    raw._data = data.T
    return raw
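# --- Hedged usage sketch (placeholder file names) -----------------------------
# Example of calling least_square_reference on a KIT recording with an
# empty-room measurement. When empty_room is given, the function re-reads
# `inst` from disk with read_raw_fif, so a path rather than a Raw object is
# passed. The channel name and file names below are illustrative assumptions.
denoised = least_square_reference('subject_raw.fif',
                                  empty_room='empty_room_raw.fif',
                                  bad_channels=['MEG 043'],
                                  max_times_samples=2000)
denoised.save('subject_denoised_raw.fif', overwrite=True)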