def trainandTest(X_train, y_train, X_test, testindex, output_path):
    rbY = RobustScaler()
    y_train = rbY.fit_transform(y_train.values.reshape(-1,1))
    model = xgb.XGBRegressor(n_estimators=210,
                             subsample=0.7, max_depth=3, min_child_weight=1, seed=0,
                             colsample_bytree=0.8,
                             #learning_rate=0.21, gamma=0.14, reg_alpha=0.015, reg_lambda=0.002,
                             silent=1, objective='reg:linear')
    model.fit(X_train, y_train)
    ans = model.predict(X_test)
    ans = rbY.inverse_transform(ans.reshape(-1,1))
    ans = np.exp(ans)
    result = testindex
    result['SalePrice'] = ans
    result.to_csv(output_path, index=None)

    # Plot feature importances
    # plot_importance(model)
    # plt.show()

    bns = np.exp(rbY.inverse_transform(model.predict(X_train).reshape(-1, 1)))
    y_true = np.exp(rbY.inverse_transform(y_train))  # back to the original price scale
    error = []
    for i, j in zip(bns, y_true):
        error.append((i - j) ** 2)
    print(bns.tolist())
    print('Training set RMSE')
    print(sqrt(sum(error) / len(error)))
def trainandTestLa(X_train, y_train, X_test, testindex, output_path):
    print(type(X_train))
    rbY = RobustScaler()
    y_train = rbY.fit_transform(y_train.values.reshape(-1,1))
    model = Lasso(alpha=0.06, max_iter=2000, selection='random', tol=0.001, normalize=False)
    model.fit(X_train, y_train)
    print(type(X_train))
    coef = pd.Series(model.coef_, index=X_train.columns)  # .coef_ returns the learned coefficients for all features.
    print("Lasso picked " + str(sum(coef != 0)) + " variables and eliminated the other " +  str(sum(coef == 0)) + " variables")
    coef_all = coef[coef!=0].sort_values()
    print(set(coef_all.index.tolist()) & set(added_features))
    print(coef_all.tail(20))
    print(coef_all.head(20))
    ans = model.predict(X_test)
    ans = rbY.inverse_transform(ans.reshape(-1,1))
    ans = np.exp(ans)
    result = testindex
    result['SalePrice'] = ans
    result.to_csv(output_path, index=None)

    bns = np.exp(rbY.inverse_transform(model.predict(X_train).reshape(-1, 1)))
    y_true = np.exp(rbY.inverse_transform(y_train))  # back to the original price scale
    error = []
    for i, j in zip(bns, y_true):
        error.append((i - j) ** 2)
    print(bns.tolist())
    print('Training set RMSE')
    print(sqrt(sum(error) / len(error)))
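The helpers above (and the KernelRidge/LinearRegression variants further down) all share the same target pipeline: fit a RobustScaler on the log-transformed SalePrice, train on the scaled target, then inverse-transform and exponentiate the predictions. A minimal, self-contained sketch of that round trip on toy data (column names and hyperparameters are made up for illustration):

import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.preprocessing import RobustScaler

X = pd.DataFrame({'f1': [1.0, 2.0, 3.0, 4.0], 'f2': [0.5, 0.1, 0.7, 0.2]})
prices = pd.Series([100000.0, 150000.0, 200000.0, 250000.0])

rbY = RobustScaler()
y_scaled = rbY.fit_transform(np.log(prices.values).reshape(-1, 1))  # scale the log target

model = Lasso(alpha=0.01)
model.fit(X, y_scaled.ravel())

pred_scaled = model.predict(X).reshape(-1, 1)
pred_price = np.exp(rbY.inverse_transform(pred_scaled))  # undo scaling, then undo the log
print(pred_price.ravel())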
Example #3
class Trainer(metaclass=ABCMeta):
    def __init__(self, df_daily, df_monthly):
        self.df_daily = df_daily
        self.df_monthly = df_monthly
        self.scalerX = RobustScaler(quantile_range=(10, 90))
        self.scalerY = RobustScaler(quantile_range=(10, 90))

    def load_data(self, val_size, test_size, target_column):
        df = FeatureSelection().add_prod_delay_correlation(
            dataframe=self.df_daily,
            df_month=self.df_monthly.copy(),
            target=target_column)

        bcb = BCB()
        bcb = bcb.get_dataframe(df.index[0], df.index[-1])
        if not bcb.empty:
            bcb.set_index(df.index, inplace=True)
            df = pd.concat((df, bcb), axis=1, join='inner')

        columns = list(df)
        columns[-1], columns[columns.index(target_column)] = columns[
            columns.index(target_column)], columns[-1]
        df = df.reindex(columns=columns)
        df.iloc[:, -1:] = remove_outliers(df.iloc[:, -1:])
        df = sum_days(df, past_days=31, prevision_days=31)
        df.drop('NUM_VENDEDOR', axis=1, inplace=True)

        y_total = df.iloc[:, -1:].values
        x_total = df.iloc[:, :-1].values
        y_test = y_total[-test_size:, :]
        x_test = x_total[-test_size:, :]
        y_train = y_total[:-val_size - test_size, :]
        x_train = x_total[:-val_size - test_size, :]
        y_val = y_total[-val_size - test_size:-test_size, :]
        x_val = x_total[-val_size - test_size:-test_size, :]

        x_train = self.scalerX.fit_transform(x_train)
        y_train = self.scalerY.fit_transform(y_train)
        x_val = self.scalerX.transform(x_val)
        y_val = self.scalerY.transform(y_val)
        x_test = self.scalerX.transform(x_test)
        y_test = self.scalerY.transform(y_test)
        return x_train, y_train, x_val, y_val, x_test, y_test

    # @abstractmethod
    # def train(self):
    #     pass
    #
    # @abstractmethod
    # def predict(self):
    #     pass

    def inverse_transformX(self, df):
        return self.scalerX.inverse_transform(df)

    def inverse_transformY(self, df):
        return self.scalerY.inverse_transform(df)
Example #4
class FloatCode(IntCode):
    def __init__(
        self,
        col_name: str,
        code_len: int,
        start_id: int,
        fillall: bool = True,
        base: int = 100,
        hasnan: bool = True,
        transform: str = 'quantile',
    ):
        super().__init__(col_name, code_len, start_id, fillall, base, hasnan)
        if transform == 'yeo-johnson':
            self.scaler = PowerTransformer(standardize=True)
        elif transform == 'quantile':
            self.scaler = QuantileTransformer(output_distribution='uniform')
        elif transform == 'robust':
            self.scaler = RobustScaler()
        else:
            raise ValueError(
                'Supported data transformations are "yeo-johnson", "quantile", and "robust"'
            )

    def convert_to_int(self, val: float) -> int:
        val = np.expand_dims(np.array(val), axis=0)
        values = self.scaler.transform(val[:, None])[:, 0] - self.mval
        values = (values * self.base**self.extra_digits).astype(int)
        output = values[0]
        return output

    def array_convert_to_int(self, val: ndarray):
        values = self.scaler.fit_transform(val[:, None])[:, 0]
        self.mval = values.min()
        values = values - self.mval
        digits = int(math.log(values.max(), self.base)) + 1
        # extra digits used for 'float' part of the number
        extra_digits = self.code_len - digits
        if extra_digits < 0:
            raise ValueError("need a larger code length to encode the number")
        self.extra_digits = extra_digits
        values = (values * self.base**self.extra_digits).astype(int)
        return values

    def reverse_convert_to_int(self, val: int) -> float:
        val = val / self.base**self.extra_digits
        val = np.expand_dims(np.array(val), axis=0)
        v = self.scaler.inverse_transform(val[:, None] + self.mval)[0, 0]
        return v

    def decode(self, ids: List[int]) -> str:
        if self.hasnan and ids[0] == self.NA_token_id[0]:
            return self.NA_token
        v = 0
        for i in reversed(range(self.code_len)):
            digit = int(self.digits_id_to_item[i][ids[self.code_len - i - 1]])
            v += digit * self.base**i
        v = self.reverse_convert_to_int(v)
        accuracy = max(int(abs(np.log10(0.1 / self.base**self.extra_digits))),
                       1)
        return f"{v:.{accuracy}f}"
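The decode method above reassembles a value from code_len base-`base` digits; a toy round trip of just that positional arithmetic (constants chosen for illustration, independent of the IntCode machinery):

base, code_len = 100, 3
int_code = 123456                      # an integer such as convert_to_int would produce
# split into code_len base-100 digits, most significant first (mirrors decode())
digits = [(int_code // base**i) % base for i in reversed(range(code_len))]
print(digits)                          # [12, 34, 56]
recovered = sum(d * base**i for d, i in zip(digits, reversed(range(code_len))))
assert recovered == int_code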
Example #5
def modeling(input):

    dataset = np.loadtxt("models/First_trial_regression.csv", delimiter=",")
    # separate the data from the target attributes
    X = dataset[:, 0:12]
    Y = dataset[:, 12]

    rbX = RobustScaler()
    X = rbX.fit_transform(X)

    rbY = RobustScaler()
    Y = np.expand_dims(Y, 0)
    Y = Y.T
    Y = rbY.fit_transform(Y)

    clf = SVR(C=1.000, epsilon=0.2, kernel="rbf")
    clf.fit(X, Y)

    joblib.dump(clf, 'models/SMOreg.pkl')
    clf2 = joblib.load('models/SMOreg.pkl')

    audience_num = clf2.predict(rbX.transform(input))
    audience_num = np.expand_dims(audience_num, 0)
    audience_num = audience_num.T
    audience_num = rbY.inverse_transform(audience_num)
    audience_num = np.squeeze(audience_num)

    return {'audience_num': audience_num, 'rbX': rbX, 'rbY': rbY}
Example #6
class DFRobustScaler(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None, **kwargs):
        self.columns        = columns
        self.model          = RobustScaler(**kwargs)
        self.transform_cols = None
        
    def fit(self, X, y=None):
        self.columns        = X.columns if self.columns is None else self.columns
        self.transform_cols = [x for x in X.columns if x in self.columns]
        self.model.fit(X[self.transform_cols])

        return self
    
    def transform(self, X):
        if self.transform_cols is None:
            raise NotFittedError(f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.")

        new_X = X.copy()
        new_X[self.transform_cols] = self.model.transform(X[self.transform_cols])

        return new_X
    
    def fit_transform(self, X, y=None):
        return self.fit(X).transform(X)
    
    def inverse_transform(self, X):
        if self.transform_cols is None:
            raise NotFittedError(f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.")

        new_X = X.copy()
        new_X[self.transform_cols] = self.model.inverse_transform(X[self.transform_cols])

        return new_X
def denormalize(df, norm_data):
    # df = df['s3'].values.reshape(-1,1)
    # norm_data = norm_data.reshape(-1,1)
    scl = RobustScaler()
    scl.fit(df['s3'].values.reshape(-1, 1))  # .as_matrix() is removed in modern pandas
    new = scl.inverse_transform(norm_data)
    return new
def trainandTestKR(X_train, y_train, X_test, testindex, output_path):
    rbY = RobustScaler()
    y_train = rbY.fit_transform(y_train.values.reshape(-1,1))
    model = KernelRidge(alpha=0.19, kernel='laplacian', coef0=0)
    model.fit(X_train, y_train)
    ans = model.predict(X_test)
    ans = rbY.inverse_transform(ans.reshape(-1,1))
    ans = np.exp(ans)
    result = testindex
    result['SalePrice'] = ans
    result.to_csv(output_path, index=None)

    bns = np.exp(rbY.inverse_transform(model.predict(X_train).reshape(-1, 1)))
    y_true = np.exp(rbY.inverse_transform(y_train))  # back to the original price scale
    error = []
    for i, j in zip(bns, y_true):
        error.append((i - j) ** 2)
    print(bns.tolist())
    print('Training set RMSE')
    print(sqrt(sum(error) / len(error)))
Example #9
class Scaler(object):
    """If we have `hypers.scale=True`, we use this class to scale everything (price-actions, rewards, etc). Using this
    instead of TForce's built-in preprocessing (http://tensorforce.readthedocs.io/en/latest/preprocessing.html) since
    this gives more flexibility, but it's basically the same thing. Someone may want to check me on that statement by
    reading those docs and trying TForce's preprocessing instead of this.

    One important bit here is the use of RobustScaler with a quantile_range. This allows us to handle outliers, which
    abound in the data. Sometimes we have a timeseries hole, and suddenly we're up a billion percent. Sometimes whales
    pump-and-dump to screw with the market. RobustScaler lets us "ignore" those moments (a short standalone sketch of this follows the class).

    TODO someone will want to double-check my work on this scaling approach in general. Best of my knowledge, but I'm
    a newb.
    """

    # ~300k samples (STOP_AT) should be enough data to safely say "I've seen it all, just scale (don't fit) going forward"
    STOP_AT = 3e5
    SKIP = 15

    def __init__(self):
        self.reward_scaler = RobustScaler(quantile_range=(5., 95.))
        self.state_scaler = RobustScaler(quantile_range=(5., 95.))
        self.rewards = []
        self.states = []
        self.done = False
        self.i = 0

    def _should_skip(self):
        # After we've fitted enough (see STOP_AT), start returning direct-transforms for performance improvement
        # Skip every few fittings. Each individual doesn't contribute a whole lot anyway, and costs a lot
        return self.done or (self.i % self.SKIP != 0 and self.i > self.SKIP)

    def transform_state(self, state):
        self.i += 1
        if self._should_skip():
            return self.state_scaler.transform([state])[-1]
        # Fit, transform, return
        self.states.append(state)
        ret = self.state_scaler.fit_transform(self.states)[-1]
        if self.i >= self.STOP_AT:
            # Clear up memory, fitted scalers have all the info we need. stop=True only needed in one of these functions
            del self.rewards
            del self.states
            self.done = True
        return ret

    def transform_reward(self, reward):
        if self._should_skip():
            return self.reward_scaler.transform([[reward]])[-1][0]
        self.rewards.append([reward])
        return self.reward_scaler.fit_transform(self.rewards)[-1][0]

    def avg_reward(self):
        if self.i < self.SKIP: return 20
        reward = self.reward_scaler.inverse_transform([[0]])[-1][0]
        return abs(reward)
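A minimal sketch of why the quantile_range matters here, using only NumPy and scikit-learn: with one huge spike in the data, the robust quantile-based statistics barely move, so the inliers keep a usable spread, whereas mean/std scaling squashes them into a narrow band.

import numpy as np
from sklearn.preprocessing import RobustScaler, StandardScaler

rng = np.random.default_rng(0)
data = np.concatenate([rng.normal(0.0, 1.0, size=99), [1e6]]).reshape(-1, 1)  # one huge spike
robust = RobustScaler(quantile_range=(5.0, 95.0)).fit_transform(data)
standard = StandardScaler().fit_transform(data)
print(np.abs(robust[:-1]).max())    # inliers keep an O(1) spread: the quantiles ignore the spike
print(np.abs(standard[:-1]).max())  # inliers are squashed near a constant: the spike dominates mean/std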
Example #10
def graph_forecast():
    data, last_timestamp = ml.get_all_data()

    # Prediction
    # print(last_timestamp)
    # data.reverse()
    # train = data[:-24]
    # test = data[len(data) - 24:]
    # train = list(ml.chunk(train, 24))

    # Test
    train = data[:-48]
    test = data[len(data) - 48:len(data) - 24]
    actual = data[len(data) - 24:]
    train = list(ml.chunk(train, 24))
    real_temp = []
    for a in range(0, len(actual)):
        real_temp.append(actual[a][0])
    print(real_temp)

    # Train model and predict the new temperature
    train_X = []
    train_Y = []
    for k in range(0, len(train) - 2):
        for j in range(0, 24):
            train_X.append(train[k][j])
            train_Y.append(train[k + 1][j][0])
    rbX = RobustScaler()
    X = rbX.fit_transform(train_X)
    rbY = RobustScaler()
    Y = rbY.fit_transform([[v] for v in train_Y]).ravel()  # the scaler expects a 2D column
    svm = SVR(kernel='rbf', C=1e3, gamma=0.0001)
    svm.fit(X, Y)
    svm_pred = svm.predict(rbX.transform(test))
    new_temp = list(rbY.inverse_transform(svm_pred.reshape(-1, 1)).ravel() / 1000)
    print(new_temp)

    # Get the timestamp for the next 24 hours
    timestamp = []
    last_timestamp = last_timestamp[:-5]
    for i in range(1, 25):
        last_timestamp_dt = datetime.strptime(last_timestamp,
                                              '%Y-%m-%dT%H:%M:%S')
        last_timestamp_dt = last_timestamp_dt.replace(tzinfo=None)
        correct_time = last_timestamp_dt + timedelta(hours=int(i))
        timestamp.append(str(correct_time))

    # Plot the prediction (and verification)
    line_chart = pygal.Line(x_label_rotation=30)
    line_chart.title = '24 hour forecast of indoor temperature in degrees Celsius'
    line_chart.x_labels = map(str, list(timestamp))
    line_chart.add('Prediction', list(new_temp))
    real_temp[:] = [x / 1000 for x in real_temp]
    line_chart.add('Actual', list(real_temp))
    return Response(response=line_chart.render(), content_type='image/svg+xml')
def trainandTestLR(X_train, y_train, X_test, testindex, output_path):
    rbY = RobustScaler()
    y_train = rbY.fit_transform(y_train.values.reshape(-1,1))
    model = LinearRegression()
    model.fit(X_train, y_train)
    ans = model.predict(X_test)
    ans = rbY.inverse_transform(ans.reshape(-1,1))
    ans = np.exp(ans)
    result = testindex
    result['SalePrice'] = ans
    result.to_csv(output_path, index=None)
Example #12
def predict(n = 0):

    h5f = h5py.File('D:/total_data_ro.h5','r')
    test_data = h5f['total_2019'][:]
    test_data_gen = h5f['gen_2019'][:]
    h5f.close()

    sc = RobustScaler()
    sc = joblib.load('D:/scaler_gen.pkl')
    test_data_gen = test_data_gen.reshape(-1, 1)

    img_input = Input(shape=(None, 75, 75, 10), name='images')
    convlstm2d = layers.ConvLSTM2D(filters=20, kernel_size=(3, 3),
                                   input_shape=(None, 75, 75, 10),
                                   data_format='channels_last', return_sequences=True,
                                   padding='same')(img_input)
    batch_norm = layers.BatchNormalization()(convlstm2d)

    convlstm2d_1x1 = layers.ConvLSTM2D(filters=1, kernel_size=(3, 3),
                        data_format='channels_last',padding='same')(batch_norm)
    batch_norm = layers.BatchNormalization()(convlstm2d_1x1)

    flatten = layers.Flatten()(batch_norm)

    Dense_1_list = [layers.Dense(units=1)(flatten) for i in range(72)]

    model = Model(img_input, Dense_1_list)

    model.compile(optimizer=rmsprop(lr=0.001), loss=['mae' for i in range(72)])
    model.load_weights('D:/gen_pred_72hours.h5')

    test_sample, gen_sample = generator(test_data, test_data_gen, 120, 72, 0, None, shuffle=True, batch_size=1)
    test_sample = np.rollaxis(test_sample, 2, 5)

    y_pred = model.predict(test_sample)
    y_pred = np.array(y_pred).reshape(-1,1)
    #y_test = np.array(gen_sample)

    y_pred = sc.inverse_transform(y_pred)

    sc = MinMaxScaler(feature_range=(0, 40))
    plt.plot(sc.fit_transform(y_pred), label='pred')
    #plt.plot(gen_sample, label='real')
    plt.legend()
    plt.savefig("media/test" + str(n) + ".png")
    print('save img')
    plt.clf()
    plt.cla()
    plt.close()
    #plt.show() 
Example #13
class Scaler:
    def __init__(self, column):
        self.column = column.values.reshape(-1, 1)
        self.scalar = RobustScaler()
        self.scaled_column = None

    @staticmethod
    def get_new_scaler():
        return RobustScaler()

    def transform(self):
        self.scaled_column = self.scalar.fit_transform(self.column)
        return self.scaled_column.flatten()

    def inv_transform(self):
        return self.scalar.inverse_transform(self.scaled_column).flatten()
Example #14
File: last_ditch.py  Project: lod531/aml
X_train = rfe.fit_transform(X_train, y_train)
X_test = rfe.transform(X_test)

from sklearn.preprocessing import RobustScaler
rb_X = RobustScaler(with_centering=True, quantile_range=(23.0, 74.0))
rb_y = RobustScaler(with_centering=False, quantile_range=(23.0, 74.0))
X_train = rb_X.fit_transform(X_train)
X_test = rb_X.transform(X_test)
y_train = rb_y.fit_transform(y_train.reshape(-1, 1))[:, 0]

# Applying Grid Search to find the best model and the best parameters
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import r2_score, make_scorer
r2_scorer = make_scorer(r2_score, greater_is_better=True)

# Fitting the SVR to the dataset
regressor = KernelRidge(alpha=1.7e-2, kernel='laplacian', gamma=1.44929e-4)
#regressor = RandomForestRegressor(n_estimators = 300, max_depth = 3)

# Final solution
regressor.fit(X_train, y_train)
y_test_pred = rb_y.inverse_transform(regressor.predict(X_test).reshape(-1, 1))
y_test_pred = np.rint(y_test_pred)
sol = np.append(arr=ids_test.reshape(-1, 1),
                values=y_test_pred.reshape(-1, 1),
                axis=1)
fsol = pd.DataFrame(sol)
fsol.rename(columns={0: 'id', 1: 'y'}, inplace=True)
fsol.to_csv('last_ditch_sol.csv', encoding='utf-8', index=False)
print("DONE")
Example #15
class ANN():
    def __init__(self, df, output_column, train_raw, val_raw, test_raw,
                 continuous_cols):
        self.output_column = output_column
        self.train = train_raw
        self.val = val_raw
        self.test = test_raw
        self.continous = continuous_cols
        self.df = df
        self.train_size = train_raw.size
        #keras.backend.set_epsilon(1)

    def scale(self, train, test, val):
        # scale the continuous features with a RobustScaler fitted on the training split
        cs = RobustScaler()
        trainX = cs.fit_transform(train[self.continous])
        valX = cs.transform(val[self.continous])
        testX = cs.transform(test[self.continous])
        return (trainX, valX, testX)

    def prepare(self):
        self.cs = RobustScaler()
        self.trainX = self.cs.fit_transform(self.train[self.continous])
        self.valX = self.cs.transform(self.val[self.continous])
        self.testX = self.cs.transform(self.test[self.continous])
        #self.trainX, self.valX, self.testX = self.scale(self.train, self.test, self.val)
        self.trainY = self.cs.fit_transform(self.train[[self.output_column]])
        self.testY = self.cs.transform(self.test[[self.output_column]])
        self.valY = self.cs.transform(self.val[[self.output_column]])
        return self.trainX, self.trainY, self.valX, self.valY

    def create_model(self, neurons=256):
        model = Sequential()
        initializer = keras.initializers.HeNormal()
        model.add(
            Dense(neurons,
                  input_dim=self.trainX.shape[1],
                  name='InputLayer',
                  activation='relu',
                  kernel_initializer=initializer))
        model.add(keras.layers.BatchNormalization())
        model.add(keras.layers.Dropout(rate=0.36))

        neurons = neurons / 2
        model.add(
            Dense(neurons,
                  kernel_initializer=initializer,
                  activation='relu',
                  name='H1'))
        model.add(keras.layers.BatchNormalization())
        model.add(keras.layers.Dropout(rate=0.37))

        neurons = neurons / 2
        model.add(
            Dense(neurons,
                  kernel_initializer=initializer,
                  activation='relu',
                  name='H2'))
        model.add(keras.layers.BatchNormalization())
        model.add(keras.layers.Dropout(rate=0.16))

        model.add(
            Dense(1,
                  activation='linear',
                  name="OutputLayer",
                  kernel_initializer=initializer))
        self.model = model
        return model

    def get_callbacks(self):
        return [
            keras.callbacks.EarlyStopping(monitor='val_loss', patience=50),
            keras.callbacks.History()
        ]

    def train_trials(self, n_trials):
        study = optuna.create_study()
        study.optimize(self.objective, n_trials=n_trials)
        return study.best_params

    def objective(self, trial):
        K.clear_session()
        model = Sequential()
        neurons = 512
        initializer = keras.initializers.HeNormal()
        model.add(
            Dense(neurons,
                  input_dim=self.trainX.shape[1],
                  name='InputLayer',
                  activation='relu',
                  kernel_initializer=initializer))
        model.add(keras.layers.BatchNormalization())
        model.add(keras.layers.Dropout(rate=0.36))

        neurons = neurons / 2
        model.add(
            Dense(neurons,
                  kernel_initializer=initializer,
                  activation='relu',
                  name='H1'))
        model.add(keras.layers.BatchNormalization())
        model.add(keras.layers.Dropout(rate=0.37))

        neurons = neurons / 2
        model.add(
            Dense(neurons,
                  kernel_initializer=initializer,
                  activation='relu',
                  name='H2'))
        model.add(keras.layers.BatchNormalization())
        model.add(keras.layers.Dropout(rate=0.16))

        model.add(
            Dense(neurons,
                  kernel_initializer=initializer,
                  activation='relu',
                  name='H3'))
        model.add(
            Dense(1,
                  activation='linear',
                  name="OutputLayer",
                  kernel_initializer=initializer))
        trainY = self.trainY
        testY = self.testY
        trainX = self.trainX
        opt = Adam(lr=trial.suggest_float('lr', 1e-5, 1e-3, log=True),
                   decay=trial.suggest_float('decay', 1e-5, 0.1, log=True))
        model.compile(loss='mae', optimizer=opt)
        history = model.fit(x=trainX,
                            y=trainY,
                            validation_data=(self.valX, self.valY),
                            epochs=15,
                            verbose=2,
                            batch_size=trial.suggest_int('batchsize',
                                                         68,
                                                         512,
                                                         step=12))
        return history.history["val_loss"][-1]

    def train_model(self, hparams, filename, historyname):
        opt = Adam(lr=hparams['lr'], decay=hparams['decay_rate'])
        #opt = tfa.optimizers.MovingAverage(opt)
        #keras.backend.set_epsilon(1e-7)
        print("[INFO] processing data")
        trainY = self.trainY
        testY = self.testY
        trainX = self.trainX
        model = self.model
        model.compile(loss='mae',
                      optimizer=opt,
                      metrics=[root_mean_squared_error])
        model.summary()
        print("[INFO] training model...")
        history = model.fit(x=trainX,
                            y=trainY,
                            validation_data=(self.valX, self.valY),
                            verbose=2,
                            epochs=hparams["epochs"],
                            batch_size=hparams["batch"],
                            callbacks=self.get_callbacks())
        model.save(filename)
        hist_df = pd.DataFrame(history.history)
        hist_csv_file = f'{filename}/{historyname}.csv'
        with open(hist_csv_file, mode='w') as f:
            hist_df.to_csv(f)
        self.model = model
        plot_model(model,
                   to_file=f'{filename}/model_archi.png',
                   show_shapes=True,
                   show_layer_names=True)
        return history, model

    def predict(self):
        model = self.model
        print("[INFO] predicting trade value...")
        preds = model.predict(self.testX)
        #print(preds)
        df = self.df
        # compute the difference between the *predicted* and *actual* values, then compute the percentage difference
        # and the absolute percentage difference
        diff = preds.flatten() - self.testY.flatten()
        percentDiff = (diff / self.testY.flatten()) * 100
        absPercentDiff = np.abs(percentDiff)
        # compute the mean and standard deviation of the absolute percentage difference
        mean = np.mean(absPercentDiff)
        std = np.std(absPercentDiff)
        locale.setlocale(locale.LC_ALL, "en_US.UTF-8")
        print("[INFO] avg. trade value: {}, std trade value: {}".format(
            locale.currency(df[self.output_column].mean(), grouping=True),
            locale.currency(df[self.output_column].std(), grouping=True)))
        print("[INFO] mean: {:.2f}%, std: {:.2f}%".format(mean, std))
        return self.cs.inverse_transform(preds)
Example #16
def run(news_df, snp_df, split, stopwords, using_text):
    data = news_df.groupby('date').sum().join(snp_df.set_index('Date')).dropna()

    X_temp = data.values
    X_pre_price = generate_price_features(data)
    X_price = data['Open'].values

    y = generate_regression_label(data)
    y_cls = generate_classification_label(data)

    tscv = TimeSeriesSplit(n_splits=split)
    for train_index, test_index in tscv.split(X_temp):
        start_date = data.index[train_index[0]]
        split_date = data.index[test_index[0]]
        end_date = data.index[test_index[-1]]

        print(start_date, split_date, end_date)

        if using_text:
            bag_of_words, vectorizer = generate_bag_of_words(news_df, start_date, split_date, end_date, stopwords)
            X = data[['Open']].join(bag_of_words, how='inner').drop('Open', axis=1).values

            word_size = X.shape[1]

            X_train, X_test = X[train_index], X[test_index]

        y_train, y_test = y[train_index], y[test_index]
        y_cls_train, y_cls_test = y_cls[train_index], y_cls[test_index]

        X_train_price = X_price[train_index]
        X_test_price = X_price[test_index]
        X_train_pre_price = X_pre_price[train_index]
        X_test_pre_price = X_pre_price[test_index]

        # Normalization and Scaling
        scaler = RobustScaler()
        scaler.fit(X_train_pre_price[:,0].reshape(-1, 1))
        #x_train_price_t = scaler.transform(X_train_price.reshape(-1, 1))
        #x_test_price_t = scaler.transform(X_test_price.reshape(-1, 1))
        x_train_pre_price_t = scaler.transform(X_train_pre_price.reshape(-1, 1)).reshape(-1, X_train_pre_price.shape[1])
        x_test_pre_price_t = scaler.transform(X_test_pre_price.reshape(-1, 1)).reshape(-1, X_test_pre_price.shape[1])
        y_train_t = scaler.transform(y_train.reshape(-1, 1)).reshape(-1, )

        if using_text:
            x_text_train_t = normalize(X_train)
            x_text_test_t = normalize(X_test)
            x_train_t = np.concatenate((x_text_train_t, x_train_pre_price_t), axis=1)
            x_test_t = np.concatenate((x_text_test_t, x_test_pre_price_t), axis=1)
        else:
            x_train_t = x_train_pre_price_t
            x_test_t = x_test_pre_price_t

        # Modeling
        cls_clf = LogisticRegression(penalty='l2', C=0.5, verbose=0, max_iter=100)
        cls_clf.fit(x_train_t, y_cls_train)
        y_train_cls_clf = cls_clf.predict(x_train_t)
        y_test_cls_clf = cls_clf.predict(x_test_t)

        #clf = SVR(kernel='linear', C=0.0005, verbose=0)
        #clf = LinearRegression()
        clf = linear_model.Ridge(alpha=1.0)
        clf.fit(x_train_t, y_train_t)
        y_train_clf = clf.predict(x_train_t)
        y_test_clf = clf.predict(x_test_t)
        y_train_hat = scaler.inverse_transform(y_train_clf.reshape(-1, 1)).reshape(-1, )
        y_test_hat = scaler.inverse_transform(y_test_clf.reshape(-1, 1)).reshape(-1, )

        #ipdb.set_trace()

        # Evaluation
        train_acc = accuracy_score(y_cls_train, y_train_cls_clf)
        test_acc = accuracy_score(y_cls_test, y_test_cls_clf)
        print("Accuracy ", train_acc, test_acc)

        train_mse = mean_squared_error(y_train, y_train_hat)
        test_mas = mean_squared_error(y_test, y_test_hat)
        print("MSE", train_mse, test_mas)

        train_return = evaluate_return(X_train_price, y_train_hat, y_train)
        test_return = evaluate_return(X_test_price, y_test_hat, y_test)
        print("Return", train_return, test_return)


        # Words analysis
        if using_text:
            print("LR analysis")
            positive_terms, negative_terms = analysis(x_train_t.shape[1], cls_clf.coef_[0])
            print("\tPositive terms: ", vectorizer.inverse_transform(positive_terms[:word_size].reshape(1, -1))[0])
            print("\tNegative terms: ", vectorizer.inverse_transform(negative_terms[:word_size].reshape(1, -1))[0])

            print("Ridge analysis")
            positive_terms, negative_terms = analysis(x_train_t.shape[1], clf.coef_)
            print("\tPositive terms: ", vectorizer.inverse_transform(positive_terms[:word_size].reshape(1, -1))[0])
            print("\tNegative terms: ", vectorizer.inverse_transform(negative_terms[:word_size].reshape(1, -1))[0])

            print("\nBayes analysis")
            bayes_result = analysis_bay(X_train, y_cls_train)
            show_terms(['negative', 'positive'], vectorizer, bayes_result[0], bayes_result[1])
        print("\n")
Example #17
File: kit.py  Project: kingjr/jr-tools
def least_square_reference(
    inst, empty_room=None, max_times_samples=2000, bad_channels=None, scaler=None, mrk=None, elp=None, hsp=None
):
    """
    Fits and applies Least Square projection of the reference channels
    (potentially from an empty room) and removes the corresponding component
    from the recordings of a subject.

    Parameters
    ----------
        inst : Raw | str
            Raw instance or path to raw data.
        empty_room : str | None
            Path to raw data acquired in empty room.
        max_times_samples : int
            Number of time samples to use for pinv. Defaults to 2000.
        bad_channels : list | array, shape (n_chans) of strings
            Lists bad channels
        scaler : function | None
            Scaler functions to normalize data. Defaults to
            sklearn.preprocessing.RobustScaler.

    Returns
    -------
        inst : Raw

    adapted from Adeen Flinker 6/2013 (<*****@*****.**>) LSdenoise.m

    Main EHN
        - Automatically detects channel types.
        - Allows flexible scaler; Robust by default.
        - The data is projected back in Tesla.
        - Allows memory control.
    TODO:
        - Allow other kind of MNE-Python inst
        - Allow baseline selection (pre-stim instead of empty room)
        - Clean up memory
        - Allow fancy solver (l1, etc)
    """
    from scipy.linalg import pinv
    from mne.io import read_raw_kit
    from mne.io import _BaseRaw

    # Least square can be fitted on empty room or on subject's data
    if empty_room is None:
        if not isinstance(inst, _BaseRaw):
            raw = read_raw_kit(inst, preload=True)
        else:
            raw = inst
    else:
        if not isinstance(empty_room, _BaseRaw):
            raw = read_raw_kit(empty_room, preload=True)
        else:
            raw = empty_room

    # Parameters
    n_chans, n_times = raw._data.shape
    chan_info = raw.info["chs"]

    # KIT: axial gradiometers (equiv to mag)
    ch_mag = np.where([ch["coil_type"] == 6001 for ch in chan_info])[0]
    # KIT: ref magnetometer
    ch_ref = np.where([ch["coil_type"] == 6002 for ch in chan_info])[0]
    # Other channels
    ch_misc = np.where([ch["coil_type"] not in [6001, 6002] for ch in chan_info])[0]
    # Bad channel
    ch_bad = np.empty(0)
    if (bad_channels is not None) and len(bad_channels):
        if np.all([isinstance(ch, int) for ch in bad_channels]):
            bad_channels = np.array(bad_channels)
        elif np.all([isinstance(ch, str) for ch in bad_channels]):
            bad_channels = [ii for ii, ch in enumerate(raw.ch_names) if ch in bad_channels]
        else:
            raise ValueError("bad_channels needs array of int or array of str")
    else:
        bad_channels = []
    default_bad_channels = [ii for ii, ch in enumerate(raw.ch_names) if ch in raw.info["bads"]]
    bad_channels = np.array(default_bad_channels + bad_channels, int)

    print("bad channels:", [raw.ch_names[bad] for bad in bad_channels])
    # To avoid memory error, let's subsample across time
    sel_times = slice(0, n_times, int(np.ceil(n_times / max_times_samples)))

    # Whiten data
    if scaler is None:
        from sklearn.preprocessing import RobustScaler

        scaler = RobustScaler()
    data_bsl = scaler.fit_transform(raw._data.T)

    # Fit Least Square coefficients on baseline data
    empty_sensors = data_bsl[:, ch_mag]
    if len(ch_bad):
        empty_sensors[:, ch_bad] = 0  # remove bad channels
    coefs = np.dot(pinv(data_bsl[sel_times, ch_ref]), empty_sensors[sel_times, :])
    empty_sensors, data_bsl = None, None  # clear memory

    # Apply correction on subject data
    if empty_room is not None:
        del raw
        raw = read_raw_kit(inst, preload=True)

    data_subject = scaler.transform(raw._data.T)
    subject_sensors = data_subject[:, ch_mag] - np.dot(data_subject[:, ch_ref], coefs)

    # Remove bad channels
    if len(ch_bad):
        subject_sensors[:, ch_bad] = 0

    # Reproject baseline
    new_ref = np.dot(subject_sensors, pinv(coefs))

    # Un-whiten data to get physical units back
    data = np.concatenate((subject_sensors, new_ref, raw._data[ch_misc, :].T), axis=1)
    data = scaler.inverse_transform(data)

    # Output
    raw._data = data.T
    return raw
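A toy sketch of the projection step that least_square_reference performs, on synthetic data (NumPy/SciPy only): regress the sensor channels onto the reference channels, then subtract the part of the signal the references can explain.

import numpy as np
from scipy.linalg import pinv

rng = np.random.default_rng(0)
n_times, n_ref, n_sens = 1000, 3, 8
ref = rng.normal(size=(n_times, n_ref))                 # reference (noise) channels
leak = rng.normal(size=(n_ref, n_sens))                 # how the noise leaks into the sensors
signal = rng.normal(scale=0.1, size=(n_times, n_sens))  # signal of interest
sensors = signal + ref @ leak                           # contaminated recordings

coefs = pinv(ref) @ sensors       # least-squares fit of the sensors on the references
cleaned = sensors - ref @ coefs   # remove the reference-explained component
print(np.abs(cleaned - signal).mean())  # small residual: most of the leakage is gone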
Example #18
class ActiveLearningClient:

    def keras_model(self):
        """
            This function compiles and returns a Keras model.
            Should be passed to KerasClassifier in the Keras scikit-learn API.
        """
        print("KERAS MODEL GENERATION")
        model = Sequential()
        model.add(Dense(units=20, input_dim=5, activation='relu'))
        model.add(Dense(units=20, activation='relu'))
        model.add(Dense(units=20, activation='sigmoid')) 
        # compile keras model
        # model.compile(loss=binary_crossentropy, optimizer='adam', metrics=['binary_accuracy', self.full_multi_label_metric])
        model.compile(loss=binary_crossentropy, optimizer='adam', metrics=['binary_accuracy'])
        return model

    def load_AL_models(self):
        n_members = 2 # initializing number of Committee members
        learner_list = list()

        if not Blobby.objects.exists(): # participant number == 1
            # below for loop would only be launched for the very first participant...
            print("FIRST PARTICIPANT!")
            self.csv_url = settings.STATICFILES_DIRS[0] + "/um_{}.csv".format(DATASIZE) # default, But MUST BE CHANGED for the actual learning.
            try:
                df = pd.read_csv(self.csv_url, header=None, sep=',')            
                tmp_dataset = np.array(df)
            except Exception as e:
                print(e)
                return
            tmp_sc_x = RobustScaler()
            cpy_xpool = tmp_sc_x.fit_transform(tmp_dataset[:, :5])
            Y = pd.DataFrame(tmp_dataset[:, 5:])
            encoded_Y = self.to_ordinal(Y)
            cpy_ypool = encoded_Y.reshape([np.size(Y,0), 20])
            
            n_initial = 300  # number of initial training data ~ this determines the ratio between user_model vs. human input to inquiry

            for member_idx in range(n_members):
                train_idx = np.random.choice(range(cpy_xpool.shape[0]), size=n_initial, replace=False)
                X_train, y_train = cpy_xpool[train_idx], cpy_ypool[train_idx]
                cpy_xpool, cpy_ypool = np.delete(cpy_xpool, train_idx, axis=0), np.delete(cpy_ypool, train_idx, axis=0)
                learner = ActiveLearner(
                    estimator = self.keras_model(),
                    X_training = X_train, y_training = y_train,
                    query_strategy = avg_score
                )
                learner_list.append(learner)

        else: # participant number > 1, we load models
            dir_flag = settings.BASE_DIR
            orig_urls, mod_urls = [dir_flag+"/originalfirst.h5", dir_flag+"/originalsecond.h5"], [dir_flag+"/modifiedfirst.h5", dir_flag+"/modifiedsecond.h5"]

            for member_idx in range(n_members):
                model_url = orig_urls[member_idx] if Path(orig_urls[member_idx]).is_file() else mod_urls[member_idx]
                print("\tLoaded Models from: {}\n".format(model_url))
                model = keras.models.load_model(model_url)  # load the classifier
                # when model was loaded, we don't train extra x and y
                learner_list.append(ActiveLearner(estimator=model, query_strategy=avg_score))
        return Committee(learner_list=learner_list, given_classes=np.array([1,2,3,4,5]))

    def __init__(self):
        self.learner = self.load_AL_models()
                    
    def get_data_for_hearing_group(self):
        dir_flag = settings.STATICFILES_DIRS[0]
        deafend, deaf, hoh = 'I am Deafened', 'I identify as Deaf', 'I am Hard of Hearing'

        q = Question.objects.filter(text="1. What statement best describes your relationship to the Deaf and/or Hard of Hearing Communities?")[0]
        deaf_or_hoh = AnswerRadio.objects.filter(question=q).last().body
        
        self.csv_url = dir_flag + "/um_{}.csv".format(DATASIZE) # DEFAULT LOAD

        if deaf_or_hoh in (deaf, deafend):
            self.csv_url = dir_flag + "/deaf_{}_used.csv".format(DATASIZE)            
        elif hoh == deaf_or_hoh:
            self.csv_url = dir_flag + "/hoh_{}_used.csv".format(DATASIZE)
        else:
            self.csv_url = dir_flag + "/deaf_{}_used.csv".format(DATASIZE)
        
        print("HEARING GROUP:{}, loading:{}".format(deaf_or_hoh, self.csv_url))
        try:
            df = pd.read_csv(self.csv_url, header=None, sep=',')
            dataset = np.array(df) # this sets the class variable...
        except Exception as e:
            print(e)
            return
        
        self.sc_x = RobustScaler()
        self.X_pool = self.sc_x.fit_transform(dataset[:, :5])
        Y = pd.DataFrame(dataset[:, 5:])
        encoded_Y = self.to_ordinal(Y)
        self.Y_pool = encoded_Y.reshape([np.size(Y,0), 20])


    def to_ordinal(self, y, num_classes=None, dtype='float32'):
        y = np.array(y, dtype='int')
        input_shape = y.shape
        if input_shape and input_shape[-1] == 1 and len(input_shape) > 1:
            input_shape = tuple(input_shape[:-1])
        y = y.ravel()
        if not num_classes:
            num_classes = np.max(y) + 1
        n = y.shape[0]
        ordinal = np.zeros((n, num_classes - 1), dtype=dtype)
        for i, yi in enumerate(y):
            ordinal[i, :yi] = 1
        output_shape = input_shape + (num_classes - 1,)
        ordinal = np.reshape(ordinal, output_shape)
        return ordinal
        
    def make_preds(self):
        query_idx, q_instance = self.learner.query(self.X_pool)
        queried_vals = self.sc_x.inverse_transform(q_instance)
        # get machine prediction to be displayed
        machine_prediction = list(np.array(self.learner.predict(q_instance)) + 1) # add 1 to show in 1-5 scale
        
        print("machine prediction:", machine_prediction) # these values are +1 from what's predicted.
        self.test_printing(query_idx, queried_vals[0])
        return (q_instance, machine_prediction, queried_vals[0])

    def train_learner(self, q_instance, ratings):
        tmp_queried_val = self.sc_x.inverse_transform(q_instance).astype(int)
        np_ratings = np.zeros(shape=(1, 20)) # in the shape of multiple columns, pad with zeros
        for c in [0,1,2,3]:
            tmp_start = c*5
            tmp_i = c*5 + ratings[c]
            for w in range(tmp_start, tmp_i):
                np_ratings[0, w] = 1

        # User ratings [1,3,4,2] become:
        # [[1. 0. 0. 0. 0. | 1. 1. 1. 0. 0. | 1. 1. 1. 1. 0. | 1. 1. 0. 0. 0.]]
        # We use shape (1, 20), not (20,); the 0-4 index range doesn't really matter because
        # the 1-5 ratings are expanded into the (1, 20) encoding anyway (a standalone sketch
        # of this encoding appears after this class).
        self.learner.teach(q_instance, np_ratings, epochs=100, verbose=0)
        
        print("Cappy learning the ratings:{} for q_instance:{}\nwhich is {}".format(ratings, q_instance, tmp_queried_val))

        return self.learner, tmp_queried_val

    def test_printing(self, query_idx, queried_vals):
        pf_val = "Paraphrased" if queried_vals[3] == 1 else "Verbatim"
        hearing_group = "Deaf" if queried_vals[4] == 1 else "Hard of Hearing"
        tmp_str = "The machine selected index {} with raw values:\n\t" \
            + "Delay of {} ms\n\tSpeed of {} WPM\n\tMissing {} words and is {}.\n" \
            + "Predicting ratings by {}."
        print(tmp_str.format(query_idx, queried_vals[0], queried_vals[1], queried_vals[2], pf_val, hearing_group))
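A quick standalone illustration of the ordinal rating encoding used above (the to_ordinal helper and the manual np_ratings loop both produce this cumulative one-hot layout); toy values only:

import numpy as np

ratings = [1, 3, 4, 2]                    # four user ratings on a 1-5 scale
np_ratings = np.zeros((1, 20))
for c, r in enumerate(ratings):
    np_ratings[0, c * 5:c * 5 + r] = 1    # the first r slots of each 5-slot block are set
print(np_ratings)
# [[1. 0. 0. 0. 0. 1. 1. 1. 0. 0. 1. 1. 1. 1. 0. 1. 1. 0. 0. 0.]]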
y = y.reshape(-1, 1)
scaler = RobustScaler()
y3 = scaler.fit_transform(y)
y3 = y3.reshape(1, -1)[0]

print("Training regressor ...")
reg = ensemble.RandomForestRegressor(verbose=1, n_estimators=1000, n_jobs=3)
# pickle.dump(clf, open("reg.pickle", "wb"))
# reg = pickle.load(open("rfr_reg_iter1000_alldata.pickle", "r"))
reg.fit(x3, y3)

print("Predicting regression ...")
reg_result = reg.predict(t2)
reg_result = reg_result.reshape(-1, 1)
# scaler = pickle.load(open("scaler.pickle", "r"))
reg_result = scaler.inverse_transform(reg_result)
reg_result = np.exp(reg_result)
reg_result = reg_result.reshape(1, -1)[0]

#############################
#
# Submission Output
#
#############################
print("Output submission ...")
res_merged = merge(clf_result, reg_result)
res_merged = np.round(res_merged, 2)

test_id = test_id.astype(int)
test_loss = pd.DataFrame(res_merged, columns=['loss'])
Example #20
class PredictorTrainer:
    DATA_PATH = 'data/training.csv'
    MODEL_PATH = 'data/models/'
    SCALER_PATH = 'data/models/scaler.pkl'
    TRAINED_MODEL_PATH = 'data/models/fee-predictor-model.h5'
    BATCH_SIZE = 256
    TRAIN_STEPS = 10
    TRAIN_DATA_PERCENT = 0.9

    def __init__(self, batch_size=BATCH_SIZE, train_steps=TRAIN_STEPS):
        self.initialize_scaler()

    def initialize_scaler(self):
        path = Path(PredictorTrainer.SCALER_PATH)

        if not path.is_file():
            print('Scaler model not found. Initializing.')
            #self.scaler = MinMaxScaler(feature_range=(0, 1))
            self.scaler = RobustScaler()
            data = self.load_data()
            self.scaler.fit(data.values[:, 1:])
            path.parent.mkdir(parents=True, exist_ok=True)
            joblib.dump(self.scaler, PredictorTrainer.SCALER_PATH)
            print('Scaler initialized and saved.')
        else:
            print('Found scaler model. Loading.')
            self.scaler = joblib.load(PredictorTrainer.SCALER_PATH)
            print('Scaler loaded.')

    def scale_data(self, data):
        return self.scaler.transform(data)

    # splits the data into training and test sets
    def split_data(self, data, n):
        train_start = 0
        train_end = int(np.floor(0.8 * n))
        test_start = train_end + 1
        test_end = n
        return data[train_start:train_end], data[test_start:test_end]

    # loads the file with default data
    def load_file(self):
        return pd.read_csv(PredictorTrainer.DATA_PATH)

    # the data contains helper fields; this function keeps only the ones needed to train the model
    def get_learning_data(self, dataframe):
        return dataframe.drop(['block_median_fee_per_byte', 'block_id'],
                              axis='columns')

    # sometimes fee_per_byte is enormous, so we keep only the reasonable rows here
    def filter_out_outliners(self, dataframe):
        return dataframe.query('fee_per_byte < block_median_fee_per_byte')

    # does all the transformations needed to get data suitable for training
    def load_data(self):
        data = self.load_file()
        data = self.filter_out_outliners(data)
        return self.get_learning_data(data)

    def train(self):
        data = self.load_data()
        n = data.shape[0]
        data = data.values

        data_train, data_test = self.split_data(data, n)

        x_train = self.scale_data(data_train[:, 1:])
        y_train = data_train[:, 0]
        x_test = self.scale_data(data_test[:, 1:])
        y_test = data_test[:, 0]

        model = keras.Sequential([
            keras.layers.Dense(3, kernel_initializer='normal', input_dim=3),
            keras.layers.Dense(1024, kernel_initializer='normal'),
            keras.layers.PReLU(),
            keras.layers.Dropout(0.1),
            keras.layers.Dense(512, kernel_initializer='normal'),
            keras.layers.PReLU(),
            keras.layers.Dropout(0.1),
            keras.layers.Dense(256, kernel_initializer='normal'),
            keras.layers.PReLU(),
            keras.layers.Dropout(0.1),
            keras.layers.Dense(128, kernel_initializer='normal'),
            keras.layers.PReLU(),
            keras.layers.Dropout(0.1),
            keras.layers.Dense(
                64,
                kernel_initializer='normal',
            ),
            keras.layers.PReLU(),
            keras.layers.Dropout(0.1),
            keras.layers.Dense(32, kernel_initializer='normal'),
            keras.layers.PReLU(),
            keras.layers.Dropout(0.1),
            keras.layers.Dense(1, kernel_initializer='normal')
        ])

        model.compile(optimizer='adam', loss=tf.losses.huber_loss)
        model.fit(x_train, y_train, epochs=10, batch_size=250)

        model.save(PredictorTrainer.TRAINED_MODEL_PATH)

    def load_model(self, model_name):
        return keras.models.load_model(
            model_name, custom_objects={'huber_loss': tf.losses.huber_loss})

    def evaluate_block(self, model_name, test_file):
        model = self.load_model(model_name)
        data_raw = pd.read_csv(test_file)
        min_fee = data_raw[['fee_per_byte']].min().values[0]
        median_fee = data_raw[['block_median_fee_per_byte']].values[0][0]
        data = data_raw.query('confirmation_speed == 0')
        data = self.get_learning_data(data)
        data_y = data.values[:, 0]
        data_x = self.scale_data(data.values[:, 1:])
        predicted = model.predict(data_x).flatten()

        hit = np.where(predicted > min_fee)[0].size
        out = np.where(predicted > median_fee)[0].size
        total_good = np.where((min_fee < predicted)
                              & (predicted < median_fee))[0].size

        print('hit', hit)
        print('out', out)
        print('total_good', total_good)

        total_fee_loss = 0
        sizes = data_raw.query('confirmation_speed == 0')[['vsize']].values.flatten()
        for i in range(0, data_y.size):
            total_fee_loss += sizes[i] * (data_y[i] - predicted[i])
        print('total_fee_loss', total_fee_loss)
        return

    # evaluates the model predictions and writes the values to a file for further analysis
    def evaluate(self):
        # the idea is to check how well we predict the fee for transactions that were added to the first block after they appeared in the mempool
        model = self.load_model(PredictorTrainer.TRAINED_MODEL_PATH)
        data_raw = self.load_file()
        # looking for blocks which weren't used during training so that we get a legitimate result
        # the first step is to build the training set the same way we did during the training session
        data = self.filter_out_outliners(data_raw)
        data_train, data_test = self.split_data(data, data.shape[0])

        data_train_blocks = set(data_train['block_id'].values.flatten()
                                )  # block ids which were used during training
        all_blocks = set(
            data_raw['block_id'].values.flatten())  # all block ids in our data
        block_indexes_to_evaluate = list(
            all_blocks.difference(data_train_blocks)
        )  # this difference is the set of block ids which weren't used by the training process
        data = data_raw[(
            data_raw['block_id'].isin(block_indexes_to_evaluate)
        )]  # filter the data which wasn't used in training so we can use it to evaluate
        data = data.query(
            'confirmation_speed == 0'
        )  # we look only at results where the transaction was added to the very next block after it entered the mempool

        #collecting the statistics
        output = pd.DataFrame(columns=[
            'block_id', 'min_fee', 'median_fee', 'predicted_mean_fee',
            'predicted_median_fee'
        ])
        for name, group in data.groupby('block_id'):
            min_fee = group['fee_per_byte'].min()
            median_fee = group['fee_per_byte'].median()
            learning_data = self.get_learning_data(group)
            x_test = self.scale_data(learning_data.values[:, 1:])
            y_predicted = model.predict(x_test).flatten()
            predicted_mean_fee = float(np.mean(y_predicted))
            predicted_median_fee = float(np.median(y_predicted))
            output = output.append(
                {
                    'block_id': name,
                    'min_fee': min_fee,
                    'median_fee': median_fee,
                    'predicted_mean_fee': predicted_mean_fee,
                    'predicted_median_fee': predicted_median_fee
                },
                ignore_index=True)

        output.to_csv(
            os.path.join(PredictorTrainer.MODEL_PATH, 'evaluation_output.csv'))

    def predict(self, predict, expected, model_name):
        predict_scaled = self.scale_data(predict)[:, 1:]
        model = self.load_model(
            os.path.join(PredictorTrainer.MODEL_PATH, model_name))
        predictions = model.predict(predict_scaled).flatten()

        template = 'Prediction is "{}", expected "{}"\n'
        output = []
        i = 0

        for pred, expec in zip(predictions, expected):
            inversed = self.scaler.inverse_transform(
                np.array([[pred, predict[i][1], predict[i][2],
                           predict[i][3]]]))
            pred = inversed[0, 0]
            print(template.format(pred, expec))
            output.append({
                'mempool_megabytes': predict[i][1],
                'mempool_tx_count': predict[i][2],
                'confirmation_speed': predict[i][3],
                'prediction': pred
            })

            i += 1

        return output
Example #21
y1 = y[:, 0:1]
y2 = y[:, 1:2]

rbY1 = RobustScaler()
y1 = rbY1.fit_transform(y1)

rbY2 = RobustScaler()
y2 = rbY2.fit_transform(y2)

C = 1e3  # SVM regularization parameter
svc1 = svm.SVR(kernel='rbf', C=C, gamma=0.1).fit(X_scaled, [x[0] for x in y1])
svc2 = svm.SVR(kernel='rbf', C=C, gamma=0.1).fit(X_scaled, [x[0] for x in y2])

svm_pred = svc1.predict(rbX.transform(X_val))
svm_pred = np.reshape(svm_pred, (-1, 1))
y1_pred = rbY1.inverse_transform(svm_pred)

svm_pred = svc2.predict(rbX.transform(X_val))
svm_pred = np.reshape(svm_pred, (-1, 1))
y2_pred = rbY2.inverse_transform(svm_pred)

predicted = np.concatenate((y1_pred, y2_pred), axis=1)

dist_err = np.array(
    list(map(lambda x: geo_dist(x[0], x[1]), zip(predicted, y_val))))
err_mean = np.mean(dist_err)

## Random Forest

regr = RandomForestRegressor(random_state=0, n_estimators=1000, oob_score=True)
regr.fit(X, y)
#saving the NN results as .json and .h5
#model_json = model.to_json()
#with open("model.json", "w") as json_file:
#json_file.write(model_json)
#model.save_weights("model.h5")

#saving the NN model
#import joblib
#filename = 'predictor101.sav'
#joblib.dump(model, filename)
#load_model = joblib.load(filename)
yhat = model.predict(
    np.array(df.tail(n_per_in)).reshape(1, n_per_in, n_features))

# Transforming the predicted values back to their original format
yhat = close_scaler.inverse_transform(yhat)[0]

# Creating a DF of the predicted prices
preds = pd.DataFrame(yhat,
                     index=pd.date_range(start=df.index[-1] +
                                         timedelta(days=1),
                                         periods=len(yhat),
                                         freq="B"),
                     columns=[df.columns[0]])

# Number of periods back to plot the actual values
pers = n_per_in

# Transforming the actual values to their original price
actual = pd.DataFrame(close_scaler.inverse_transform(df[["Close"]].tail(pers)),
                      index=df.Close.tail(pers).index,
Example #23
                     columns=Xtest.columns)
Ytrain = pd.DataFrame(
    rbsy.transform(Ytrain))  # , index=Ytrain.index, columns=Ytrain.columns)

if not os.path.isfile('.\\model.dat'):
    model = LinearRegression()
    model.fit(Xtrain, Ytrain)
    pickle.dump(model, open("model.dat", "wb"))

else:
    model = pickle.load(open("model.dat", "rb"))

ypred = model.predict(Xtest)
ypred = ypred.reshape((-1, 1))

ypred = rbsy.inverse_transform(ypred)
rmse = sqrt(mean_squared_error(Ytest, ypred))
print(rmse)

Ytest_np = Ytest.to_numpy()
ypred = ypred.flatten()

fig = go.Figure()
fig.add_trace(go.Scatter(y=Ytest_np, mode='lines', name='Ytest'))
fig.add_trace(go.Scatter(y=ypred, mode='lines', name='ypred'))
fig.write_html(f'.\\Ytest.html')

new = 2
chromepath = 'C:/Program Files (x86)/Google/Chrome/Application/chrome.exe %s'
webbrowser.get(chromepath).open(f'.\\Ytest.html', new=new)
Example #24
                   verbose=True,
                   learning_rate='adaptive',
                   tol=0.0,
                   warm_start=True,
                   solver='adam')

reg.fit(X_train, Y_train)

pred_y = reg.predict(X_test)

plt.plot(pred_y.flatten(), label='predict')
plt.plot(Y_test.flatten(), label='real')
plt.legend()
plt.show()

pred = rob_sca.inverse_transform(pred_y.reshape(-1, 1))
test = rob_sca.inverse_transform(Y_test.reshape(-1, 1))

err = abs(pred - test) / test

plt.plot(pred.flatten(), label='predict')
plt.plot(test.flatten(), label='real')
plt.legend()
plt.show()

plt.plot(err, label='err')
plt.legend()
plt.show()

# error variance
re_err = abs(pred - test)
Example #25
    valor_arrecadacao_serie_temporal_lstm_treino = LSTMUtil.cria_intervalos_temporais(valor_treino_rbt)
    valor_arrecadacao_serie_temporal_lstm_teste = LSTMUtil.cria_intervalos_temporais(valor_teste_rbt)

    model = LSTMUnivariada(df_treino)
    checkpoint = ModelCheckpoint('checkpoint_regressor_'+tributo+'_teste_robust_scaler.hdf5', monitor='loss', verbose=2,
                                save_best_only=True, save_weights_only=False,
                                mode='auto', period=1)
    model.compile(optimizer=ko.Adam(lr=0.1), loss='mse')
    model.fit([np_dia_mes_treino, valor_arrecadacao_serie_temporal_lstm_treino], saida_treino, validation_data=([np_dia_mes_teste, valor_arrecadacao_serie_temporal_lstm_teste], saida_teste), 
              epochs=100, batch_size=50, callbacks=[checkpoint])
    
    # Load the best model saved by the Checkpoint
    model.load_weights('checkpoint_regressor_'+tributo+'_teste_robust_scaler.hdf5')
    
    rbt_pred = model.predict([np_dia_mes_teste, valor_arrecadacao_serie_temporal_lstm_teste])    
    mae_rbt = mean_absolute_error(rbt_scaler.inverse_transform(saida_teste), rbt_scaler.inverse_transform(rbt_pred))
    print('The MAE for tax ' + tributo + ' using the "Robust Scaler" was ' + str(mae_rbt))
    
    comparativo.loc[tributo, 'RobustScaler'] = mae_rbt

    # Power Transformer (yeo-johnson)
    pwr_scaler = PowerTransformer()
    valor_treino_pwr = pwr_scaler.fit_transform(df_treino['Valor'].values.reshape(-1, 1))
    valor_teste_pwr = pwr_scaler.transform(df_teste['Valor'].values.reshape(-1, 1))
    
    # The output (label) is the revenue of the day following the last day of the sequence
    saida_treino = valor_treino_pwr[5:]
    saida_teste = valor_teste_pwr[5:]

    valor_arrecadacao_serie_temporal_lstm_treino = LSTMUtil.cria_intervalos_temporais(valor_treino_pwr)
    valor_arrecadacao_serie_temporal_lstm_teste = LSTMUtil.cria_intervalos_temporais(valor_teste_pwr)
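
# LSTMUtil.cria_intervalos_temporais is not defined in this snippet. A minimal,
# hedged sketch of what such a windowing helper might look like, assuming
# 5-step windows so that each window lines up with the next-day label taken as
# valor[5:] above (the window length is an assumption, not confirmed here):
import numpy as np

def cria_intervalos_temporais(valores, janela=5):
    # Stack overlapping windows of `janela` consecutive values, shaped
    # (n_samples, janela, 1) for a univariate LSTM input.
    seqs = [valores[i:i + janela] for i in range(len(valores) - janela)]
    return np.array(seqs).reshape(-1, janela, 1)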
Example #26
def anomallyDet(revDF,
                resName,
                valid=None,
                test=0.25,
                plot=True,
                figsize=(15, 5),
                n_steps=30,
                units=64,
                dropout=0.2,
                optimizer='adam',
                metrics='accuracy',
                batch_size=32,
                loss='mae',
                epochs=100):
    """
    DETECT SALES ANOMALIES WITH AN LSTM AUTOENCODER TRAINED ON SCALED REVENUE DATA

    Inputs:
        :param revDF: Generated and clustered restaurant revenue dataframe
        :param resName: Name of the restaurant to analyze
        :param valid: Validation dataframe size, DEFAULT: None
        :param test: Test dataframe size, DEFAULT: 0.25
        :param plot: Whether the function should plot, DEFAULT: True
        :param figsize: Plot figure size, DEFAULT: (15, 5)
        :param n_steps: Number of days of historical data in each input sequence, DEFAULT: 30
        :param units: Dimensionality of the LSTM output space, DEFAULT: 64
        :param dropout: Fraction of the units to drop for the linear transformation of the inputs, DEFAULT: 0.2
        :param optimizer: Optimizer that updates the model in response to the loss function, DEFAULT: adam
        :param loss: Quantity that the model seeks to minimize, DEFAULT: mae
        :param metrics: Function used to judge the performance of the LSTM model, DEFAULT: accuracy
        :param batch_size: Number of samples per gradient update, DEFAULT: 32
        :param epochs: Number of epochs to train the LSTM model, DEFAULT: 100
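
    A hedged usage sketch on synthetic data follows this function definition.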
    """

    revCopy = revDF.copy()
    resName = resName.lower()

    print('**** SPLICING GENERATED DATAFRAME ****')
    revCopy = revCopy.reset_index()
    revCopy = revCopy[['Date', resName]]
    revCopy['Date'] = revCopy['Date'].astype('datetime64[ns]')
    first_idx = revCopy[resName].first_valid_index()
    revCopy = revCopy.loc[first_idx:]
    revCopy = revCopy.reset_index(drop=True)
    revCopy = revCopy.groupby('Date').sum()

    if valid is None:
        print('**** SPLITTING INTO TRAIN AND TEST **** \n')
        trainDF, testDF = split_train_test(revCopy,
                                           valid=valid,
                                           test=test,
                                           plot=plot,
                                           figsize=figsize)

        print('**** ROBUST SCALING TRAIN AND TEST DATA **** \n')
        robust = RobustScaler(quantile_range=(25, 75)).fit(trainDF)
        trainDF_scaled = robust.transform(trainDF)
        testDF_scaled = robust.transform(testDF)

        ## HELPER FUNCTION
        def create_dataset(X, y, time_steps=1):
            a, b = [], []
            for i in range(len(X) - time_steps):
                v = X[i:(i + time_steps)]
                a.append(v)
                b.append(y[i + time_steps])
            return np.array(a), np.array(b)

        ## CREATE SEQUENCES WITH N_STEPS DAYS OF HISTORICAL DATA
        n_steps = n_steps

        print('**** RESHAPING DATA INTO 3D FOR LSTM MODEL **** \n')
        ## RESHAPE TO 3D [n_samples, n_steps, n_features]
        X_train, y_train = create_dataset(trainDF_scaled, trainDF_scaled,
                                          n_steps)
        X_test, y_test = create_dataset(testDF_scaled, testDF_scaled, n_steps)
        print('X_train shape:', X_train.shape)
        print('y_train:', y_train.shape)
        print('X_test shape:', X_test.shape)
        print('y_test:', y_test.shape)

        print('**** BUILDING LSTM MODEL ****')
        units = units
        dropout = dropout
        optimizer = optimizer
        loss = loss
        epochs = epochs
        model = Sequential()
        model.add(
            LSTM(units=units,
                 input_shape=(X_train.shape[1], X_train.shape[2])))
        model.add(Dropout(rate=dropout))
        model.add(RepeatVector(n=X_train.shape[1]))
        model.add(LSTM(units=units, return_sequences=True))
        model.add(Dropout(rate=dropout))
        model.add(TimeDistributed(Dense(units=X_train.shape[2])))
        print(model.summary())

        print('\n **** COMPILING AND FITTING LSTM MODEL **** \n')
        model.compile(loss=loss, optimizer=optimizer, metrics=metrics)
        history = model.fit(X_train,
                            y_train,
                            epochs=epochs,
                            batch_size=batch_size,
                            validation_split=0.2,
                            shuffle=False)

        if plot is True:
            print('**** PLOT MODEL LOSS OVER EPOCHS ****')
            plt.figure(figsize=figsize)
            plt.plot(history.history['loss'])
            plt.plot(history.history['val_loss'])
            plt.title('Model Loss')
            plt.ylabel('Loss')
            plt.xlabel('Epoch')
            plt.legend(['Train', 'Validation'], loc='best')
            plt.grid(True)
            plt.show()
        else:
            pass

        print('\n **** PREDICTING ON TEST DATAFRAME ****')
        y_pred = model.predict(X_test)
        mae = np.mean(np.abs(y_pred - X_test), axis=1)
        ## RESHAPING PREDICTION
        pred = y_pred.reshape((y_pred.shape[0] * y_pred.shape[1]),
                              y_pred.shape[2])
        ## RESHAPING TEST DATA
        X_test = X_test.reshape((X_test.shape[0] * X_test.shape[1]),
                                X_test.shape[2])
        ## ERROR COMPUTATION
        errors = X_test - pred
        print('Error:', errors.shape)
        ## RMSE DATA
        RMSE = math.sqrt(mean_squared_error(X_test, pred))
        print(f'Test RMSE: {RMSE} \n')

        ## DETECTING ANOMALIES
        print('**** DETECTING ANOMALIES IN SALES ****')
        dist = np.linalg.norm(X_test - pred, axis=1)
        scores = dist.copy()
        scores.sort()
        cut_off = int(0.8 * len(scores))
        threshold = scores[cut_off]
        score = pd.DataFrame(index=testDF[n_steps:].index)
        score['Loss'] = mae
        score['Threshold'] = threshold
        score['Anomaly'] = score['Loss'] > score['Threshold']
        score[resName] = testDF[n_steps:][resName]
        anomalies = score[score['Anomaly'] == True]
        x = pd.DataFrame(anomalies[resName])
        x = pd.DataFrame(robust.inverse_transform(x))
        x.index = anomalies.index
        x.rename(columns={0: 'Revenue'}, inplace=True)
        anomalies = anomalies.join(x, how='left')
        anomalies = anomalies.drop(columns=[resName], axis=1)

        test_inv = pd.DataFrame(robust.inverse_transform(testDF[n_steps:]))
        test_inv.index = testDF[n_steps:].index
        test_inv.rename(columns={0: resName}, inplace=True)

        if plot is True:
            print('**** PLOTTING ANOMALY DETECTION ****')
            plt.figure(figsize=figsize)
            plt.plot(test_inv.index,
                     test_inv[resName],
                     color='gray',
                     label=resName)
            sns.scatterplot(anomalies.index,
                            anomalies['Revenue'],
                            color='red',
                            s=55,
                            label='Anomaly')
            plt.xticks(rotation=90)
            plt.xlabel('Date')
            plt.ylabel('Sales')
            plt.legend(loc='best')
            plt.grid(True)
            plt.show()

            print('\n **** SAVING ANOMALY MODEL PREDICTIONS LOCALLY ****')
            resFileName = resName.replace(' ', '_')
            fileName = f'{resFileName.upper()}_ANOMALY_PREDICTIONS.csv'
            anomalies.to_csv(fileName)
        else:
            print('\n **** SAVING ANOMALY MODEL PREDICTIONS LOCALLY ****')
            resFileName = resName.replace(' ', '_')
            fileName = f'{resFileName.upper()}_ANOMALY_PREDICTIONS.csv'
            anomalies.to_csv(fileName)

    else:
        print('**** SPLITTING INTO TRAIN, VALID, AND TEST **** \n')
        trainDF, validDF, testDF = split_train_test(revCopy,
                                                    valid=valid,
                                                    test=test,
                                                    plot=plot,
                                                    figsize=figsize)

        print('**** ROBUST SCALING TRAIN, VALID, TEST DATA **** \n')
        robust = RobustScaler(quantile_range=(25, 75)).fit(trainDF)
        trainDF_scaled = robust.transform(trainDF)
        validDF_scaled = robust.transform(validDF)
        testDF_scaled = robust.transform(testDF)

        ## HELPER FUNCTION
        def create_dataset(X, y, time_steps=1):
            a, b = [], []
            for i in range(len(X) - time_steps):
                v = X[i:(i + time_steps)]
                a.append(v)
                b.append(y[i + time_steps])
            return np.array(a), np.array(b)

        ## CREATE SEQUENCES WITH N_STEPS DAYS OF HISTORICAL DATA
        n_steps = n_steps

        print('**** RESHAPING DATA INTO 3D FOR LSTM MODEL **** \n')
        ## RESHAPE TO 3D [n_samples, n_steps, n_features]
        X_train, y_train = create_dataset(trainDF_scaled, trainDF_scaled,
                                          n_steps)
        X_valid, y_valid = create_dataset(validDF_scaled, validDF_scaled,
                                          n_steps)
        X_test, y_test = create_dataset(testDF_scaled, testDF_scaled, n_steps)
        print('X_train shape:', X_train.shape)
        print('y_train:', y_train.shape)
        print('X_test shape:', X_test.shape)
        print('y_test:', y_test.shape)

        print('**** BUILDING LSTM MODEL ****')
        units = units
        dropout = dropout
        optimizer = optimizer
        loss = loss
        epochs = epochs
        model = Sequential()
        model.add(
            LSTM(units=units,
                 input_shape=(X_train.shape[1], X_train.shape[2])))
        model.add(Dropout(rate=dropout))
        model.add(RepeatVector(n=X_train.shape[1]))
        model.add(LSTM(units=units, return_sequences=True))
        model.add(Dropout(rate=dropout))
        model.add(TimeDistributed(Dense(units=X_train.shape[2])))
        print(model.summary())

        print('\n **** COMPILING AND FITTING LSTM MODEL **** \n')
        model.compile(loss=loss, optimizer=optimizer, metrics=metrics)
        history = model.fit(X_train,
                            y_train,
                            epochs=epochs,
                            batch_size=batch_size,
                            validation_data=(X_valid, y_valid),
                            shuffle=False)

        if plot is True:
            print('**** PLOT MODEL LOSS OVER EPOCHS ****')
            plt.figure(figsize=figsize)
            plt.plot(history.history['loss'])
            plt.plot(history.history['val_loss'])
            plt.title('Model Loss')
            plt.ylabel('Loss')
            plt.xlabel('Epoch')
            plt.legend(['Train', 'Validation'], loc='best')
            plt.grid(True)
            plt.show()
        else:
            pass

        print('\n **** PREDICTING ON TEST DATAFRAME ****')
        y_pred = model.predict(X_test)
        mae = np.mean(np.abs(y_pred - X_test), axis=1)
        ## RESHAPING PREDICTION
        pred = y_pred.reshape((y_pred.shape[0] * y_pred.shape[1]),
                              y_pred.shape[2])
        ## RESHAPING TEST DATA
        X_test = X_test.reshape((X_test.shape[0] * X_test.shape[1]),
                                X_test.shape[2])
        ## ERROR COMPUTATION
        errors = X_test - pred
        print('Error:', errors.shape)
        ## RMSE DATA
        RMSE = math.sqrt(mean_squared_error(X_test, pred))
        print(F'Test RMSE: {RMSE}')

        ## DETECTING ANOMALIES
        print('\n **** DETECTING ANOMALIES IN SALES ****')
        dist = np.linalg.norm(X_test - pred, axis=1)
        scores = dist.copy()
        scores.sort()
        cut_off = int(0.8 * len(scores))
        threshold = scores[cut_off]
        score = pd.DataFrame(index=testDF[n_steps:].index)
        score['Loss'] = mae
        score['Threshold'] = threshold
        score['Anomaly'] = score['Loss'] > score['Threshold']
        score[resName] = testDF[n_steps:][resName]
        anomalies = score[score['Anomaly'] == True]
        x = pd.DataFrame(anomalies[resName])
        x = pd.DataFrame(robust.inverse_transform(x))
        x.index = anomalies.index
        x.rename(columns={0: 'Revenue'}, inplace=True)
        anomalies = anomalies.join(x, how='left')
        anomalies = anomalies.drop(columns=[resName], axis=1)

        test_inv = pd.DataFrame(robust.inverse_transform(testDF[n_steps:]))
        test_inv.index = testDF[n_steps:].index
        test_inv.rename(columns={0: resName}, inplace=True)

        if plot is True:
            print('**** PLOTTING ANOMALY DETECTION ****')
            plt.figure(figsize=figsize)
            plt.plot(test_inv.index,
                     test_inv[resName],
                     color='gray',
                     label=resName)
            sns.scatterplot(anomalies.index,
                            anomalies['Revenue'],
                            color='red',
                            s=55,
                            label='Anomaly')
            plt.xticks(rotation=90)
            plt.xlabel('Date')
            plt.ylabel('Sales')
            plt.legend(loc='best')
            plt.grid(True)
            plt.show()

            print('\n **** SAVING ANOMALY MODEL PREDICTIONS LOCALLY ****')
            resFileName = resName.replace(' ', '_')
            fileName = f'{resFileName.upper()}_ANOMALY_PREDICTIONS.csv'
            anomalies.to_csv(fileName)
        else:
            print('\n **** SAVING ANOMALY MODEL PREDICTIONS LOCALLY ****')
            resFileName = resName.replace(' ', '_')
            fileName = f'{resFileName.upper()}_ANOMALY_PREDICTIONS.csv'
            anomalies.to_csv(fileName)
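
# A hedged usage sketch for anomallyDet (not part of the original example):
# it builds a small synthetic revenue dataframe indexed by Date, with one
# lower-cased column per restaurant, which is what the function expects.
# The column name 'demo diner', the synthetic gamma-distributed revenues, and
# the reduced epochs are assumptions for illustration only; split_train_test
# must be available from the same module as anomallyDet.
import numpy as np
import pandas as pd

dates = pd.date_range('2019-01-01', periods=400, freq='D', name='Date')
demo_rev = pd.DataFrame(
    {'demo diner': np.random.default_rng(0).gamma(2.0, 500.0, len(dates))},
    index=dates)

anomallyDet(demo_rev, resName='Demo Diner', test=0.25, n_steps=30, epochs=5)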
    epochs=50, 
    batch_size=72, 
    validation_split=0.1,
    shuffle=False
)

scores = model.evaluate(X_test1, y_test1, verbose=0)
print(scores)

plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend();

y_pred1 = model.predict(X_test1)

y_train_inv1 = cnt_transformer.inverse_transform(y_train1.reshape(1, -1))
y_test_inv1 = cnt_transformer.inverse_transform(y_test1.reshape(1, -1))
y_pred_inv1 = cnt_transformer.inverse_transform(y_pred1)

print(y_pred_inv1)

from matplotlib import pyplot
pyplot.plot(history.history['loss'])
pyplot.plot(history.history['val_loss'])
pyplot.title('model train vs validation loss')
pyplot.ylabel('loss')
pyplot.xlabel('epoch')
pyplot.legend(['train', 'validation'], loc='upper right')
pyplot.show()

import pickle
                    epochs=epochs,
                    batch_size=batch_size,
                    validation_split=0.1,
                    shuffle=False,
                    callbacks=[earlystop_callback],
                    verbose=2)

#save model
#my_model_path = os.path.dirname('saved_model/my_model')
#model.save(my_model_path)
#
#plt.plot(history.history['loss'], label='train')
#plt.plot(history.history['val_loss'], label = 'validation')
#plt.legend()
##
#evaluate model on testing data
y_pred = model.predict(X_test)
y_train_inv = label_column_max_transformer.inverse_transform(
    y_train.reshape(1, -1))
y_test_inv = label_column_max_transformer.inverse_transform(
    y_test.reshape(1, -1))
y_pred_inv = label_column_max_transformer.inverse_transform(y_pred)

f2 = plt.figure()
plot_prediction(f2, y_test_inv.flatten(), y_pred_inv.flatten())

#print r2
print(metrics.r2_score(y_test, y_pred))
print(metrics.mean_absolute_error(y_test, y_pred))
print(metrics.mean_squared_error(y_test, y_pred))
Example #29
for i in range(len(x_scaled) - history_period_size - future_period_predict):
    sequential_data.append([
        x_scaled[i:(i + history_period_size)],
        y_scaled[i + history_period_size + future_period_predict - 1]
    ])

x, y = [], []
for seq, target in sequential_data:
    x.append(seq)
    y.append(target)

# Predict
x_pred = np.array(x)
y_pred = np.array(y)
y_inverse = y_transformer.inverse_transform(y_pred)

predicted = model.predict(x_pred)
predicted_inverse = y_transformer.inverse_transform(predicted)

# print(output_features)
# print('Predicted:')
# print(np.round(predicted_inverse[0], 1))
# print('Original:')
# print(records.iloc[i_pred + history_period_size][output_features].to_numpy())

plt.figure(figsize=(10, 6))
plt.plot(y_inverse[:200, 2], 'b', label='Measured')
plt.plot(predicted_inverse[:200, 2], 'r', label='Predicted')
plt.ylabel('Min Temp (C)')
plt.legend()
batch_size = 128
g = generator.flow(Xt, yt, batch_size=batch_size, shuffle=True)
steps_per_epoch = 10000 / batch_size

reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                              factor=0.5,
                              patience=10,
                              min_lr=1e-6,
                              verbose=1)
history = model.fit_generator(g,
                              steps_per_epoch=len(Xt) // batch_size,
                              epochs=150,
                              validation_data=(X_test, y_test_s),
                              callbacks=[reduce_lr])

name = "Chemception_like_demo"
model.save("data/%s.h5" % name)

hist = history.history
import pickle
with open("data/%s_history.pickle" % name, "wb") as fh:
    pickle.dump(hist, fh)

y_pred_t = rbs.inverse_transform(model.predict(X_train))
y_pred = rbs.inverse_transform(model.predict(X_test))

corr2 = np.corrcoef(np.log(y_test).reshape(1, -1), y_pred.reshape(1,
                                                                  -1))[0][1]**2
rmse = np.mean((np.log(y_test) - y_pred)**2)**0.5
print("R2 : %0.2F" % corr2)
print("RMSE : %0.2F" % rmse)
def least_square_reference(inst,
                           empty_room=None,
                           max_times_samples=2000,
                           bad_channels=None,
                           scaler=None,
                           mrk=None,
                           elp=None,
                           hsp=None):
    """
    # downloaded function least_square_reference from https://github.com/kingjr/jr-tools/blob/master/jr/meg/kit.py and added to base_funcs
    Fits and applies Least Square projection of the reference channels
    (potentially from an empty room) and removes the corresponding component
    from the recordings of a subject.
    Parameters
    ----------
        inst : Raw | str
            Raw instance or path to raw data.
        empty_room : str | None
            Path to raw data acquired in empty room.
        max_times_samples : int
            Number of time samples to use for pinv. Defaults to 2000.
        bad_channels : list | array of str, shape (n_chans)
            Lists bad channels.
        scaler : function | None
            Scaler function used to normalize the data. Defaults to
            sklearn.preprocessing.RobustScaler.
    Returns
    -------
        inst : Raw
    adapted from Adeen Flinker 6/2013 (<*****@*****.**>) LSdenoise.m
    Main ENH
        - Automatically detects channel types.
        - Allows flexible scaler; Robust by default.
        - The data is projected back in Tesla.
        - Allows memory control.
    TODO:
        - Allow other kind of MNE-Python inst
        - Allow baseline selection (pre-stim instead of empty room)
        - Clean up memory
        - Allow fancy solver (l1, etc)
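
    A small synthetic numerical sketch of this projection follows the function.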
    """
    from scipy.linalg import pinv
    from mne.io import read_raw_fif
    from mne.io import BaseRaw

    # Least square can be fitted on empty room or on subject's data
    if empty_room is None:
        if not isinstance(inst, BaseRaw):
            raw = read_raw_fif(inst, preload=True)
        else:
            raw = inst
    else:
        if not isinstance(empty_room, BaseRaw):
            raw = read_raw_fif(empty_room, preload=True)
        else:
            raw = empty_room

    # Parameters
    n_chans, n_times = raw._data.shape
    chan_info = raw.info['chs']

    # KIT: axial gradiometers (equiv to mag)
    ch_mag = np.where([ch['coil_type'] == 6001 for ch in chan_info])[0]
    # KIT: ref magnetometer
    ch_ref = np.where([ch['coil_type'] == 6002 for ch in chan_info])[0]
    # Other channels
    ch_misc = np.where(
        [ch['coil_type'] not in [6001, 6002] for ch in chan_info])[0]

    # check if refs is included
    assert len(
        ch_ref
    ) != 0, "MEG refs are not among the channels! They are needed for denoise!"

    # Bad channel
    ch_bad = np.empty(0)
    if (bad_channels is not None) and len(bad_channels):
        if np.all([isinstance(ch, int) for ch in bad_channels]):
            bad_channels = np.array(bad_channels)
        elif np.all([isinstance(ch, str) for ch in bad_channels]):
            bad_channels = [
                ii for ii, ch in enumerate(raw.ch_names) if ch in bad_channels
            ]
        else:
            raise ValueError('bad_channels needs array of int or array of str')
    else:
        bad_channels = []
    default_bad_channels = [
        ii for ii, ch in enumerate(raw.ch_names) if ch in raw.info['bads']
    ]
    bad_channels = np.array(default_bad_channels + bad_channels, int)

    print('bad channels:', [raw.ch_names[bad] for bad in bad_channels])
    # To avoid memory error, let's subsample across time
    sel_times = slice(0, n_times, max(1, int(np.ceil(n_times / max_times_samples))))

    # Whiten data
    if scaler is None:
        from sklearn.preprocessing import RobustScaler
        scaler = RobustScaler()
    data_bsl = scaler.fit_transform(raw._data.T)

    # Fit Least Square coefficients on baseline data
    empty_sensors = data_bsl[:, ch_mag]
    if len(ch_bad):
        empty_sensors[:, ch_bad] = 0  # remove bad channels
    coefs = np.dot(pinv(data_bsl[sel_times, ch_ref]),
                   empty_sensors[sel_times, :])
    empty_sensors, data_bsl = None, None  # clear memory

    # Apply correction on subject data
    if empty_room is not None:
        del raw
        raw = read_raw_fif(inst, preload=True)

    data_subject = scaler.transform(raw._data.T)
    subject_sensors = (data_subject[:, ch_mag] -
                       np.dot(data_subject[:, ch_ref], coefs))

    # Remove bad channels
    if len(ch_bad):
        subject_sensors[:, ch_bad] = 0

    # Reproject baseline
    new_ref = np.dot(subject_sensors, pinv(coefs))

    # Un-whiten data to get physical units back
    data = np.concatenate((subject_sensors, new_ref, raw._data[ch_misc, :].T),
                          axis=1)
    data = scaler.inverse_transform(data)

    # Output
    raw._data = data.T
    return raw
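
# A hedged, self-contained numerical sketch of the least-squares projection the
# function above performs (synthetic data, not MEG recordings): the reference
# channels record environmental noise that also leaks into the sensor channels,
# pinv gives the least-squares coefficients of that leakage, and subtracting
# the reference-explained part denoises the sensors.
import numpy as np
from scipy.linalg import pinv

rng = np.random.default_rng(0)
n_times, n_ref, n_mag = 2000, 3, 8
noise = rng.normal(size=(n_times, n_ref))      # what the reference channels see
leak = rng.normal(size=(n_ref, n_mag))         # coupling of that noise into the sensors
signal = rng.normal(size=(n_times, n_mag))     # signal of interest
sensors = signal + noise @ leak                # contaminated sensor recordings
refs = noise                                   # reference magnetometer recordings

coefs = pinv(refs) @ sensors                   # least-squares fit, as in the function
cleaned = sensors - refs @ coefs               # remove the reference-explained component

# Residual error w.r.t. the true signal should drop substantially after cleaning.
print(np.std(sensors - signal), np.std(cleaned - signal))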