Example #1
class Data(object):
    def __init__(self, fname, random_state=42):
        self.fname = fname
        self.random_state = random_state
        self._scaler = None

    def load(self, subset=None):
        df = pd.read_csv(self.fname, index_col=0,
                         skipinitialspace=True).iloc[:, 1:]
        if subset:
            df = df[subset + ["state_code"]]
        x = df.iloc[:, :-1].values
        y = df.iloc[:, -1].values
        return x, y

    def normalize(self, x, algorithm="min-max"):
        if algorithm == "min-max":
            self._scaler = MinMaxScaler()
        elif algorithm == "standard":
            self._scaler = StandardScaler()
        elif algorithm == "robust":
            self._scaler = RobustScaler(quantile_range=(25, 78))
        self._scaler.fit(x)
        return self._scaler.transform(x)

    def split(self, x, y, test_ratio=0.2, random_state=None):
        if random_state is None:
            random_state = self.random_state
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=test_ratio, stratify=y, random_state=random_state)
        return x_train, x_test, y_train, y_test
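A brief usage sketch (the file name and subset columns below are hypothetical; pandas, the sklearn scalers, and train_test_split are assumed to be imported):

data = Data("states.csv")                     # hypothetical CSV
x, y = data.load(subset=["pop", "income"])    # hypothetical columns
x = data.normalize(x, algorithm="robust")
x_train, x_test, y_train, y_test = data.split(x, y, test_ratio=0.2)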
Example #2
def gen_splits(X, scale=True, exclude_features=None, k=5, test_size=.1):
    X, y = separate_X_y(X, exclude_features)

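    # NB: the split below runs only for a truthy test_size; otherwise X_train
    # and X_test are never defined before they are used further down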
    if test_size:
        X_train, X_test, y_train, y_test = model_selection.train_test_split(
            X, y, test_size=test_size, random_state=RANDOM_STATE)

    kf = KFold(n_splits=k, shuffle=True, random_state=RANDOM_STATE)
    folds = []
    k_idx = 0
    for train_index, val_index in kf.split(X_train):
        k_idx += 1
        X_train_cv, X_val = X_train[train_index].copy(), X_train[val_index].copy()
        y_train_cv, y_val = y_train[train_index].copy(), y_train[val_index].copy()
        if scale:
            scaler = RobustScaler()
            scaler.fit(X_train_cv)
            X_train_cv = scaler.transform(X_train_cv)
            # fitted on the training fold; transform the validation fold to avoid leakage
            X_val = scaler.transform(X_val)

        folds.append((X_train_cv, X_val, y_train_cv, y_val))

    # Fit the scaler on the full training set only after the folds are built,
    # so no validation-fold statistics leak into the cross-validation
    if scale:
        scaler = RobustScaler()
        scaler.fit(X_train)
        X_train = scaler.transform(X_train)
        # fitted on train; transform the test set to avoid leakage
        X_test = scaler.transform(X_test)

    return folds, (X_train, X_test, y_train, y_test)
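A usage sketch, assuming the module-level RANDOM_STATE and separate_X_y from the original project and a hypothetical frame df whose last column is the target:

folds, (X_train, X_test, y_train, y_test) = gen_splits(df, scale=True, k=5)
for i, (X_tr, X_val, y_tr, y_val) in enumerate(folds, 1):
    print("fold %d: train=%s val=%s" % (i, X_tr.shape, X_val.shape))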
Example #3
class CustomRobustScaler:
    """RobustScaler that passes a labeled pandas dataframe"""
    def __init__(self, debug=False, strategy="median"):
        self._scaler = None
        self._column_list = []
        self.d = debug
        self.colnames = None

    def fit(self, X, y=None):
        self._scaler = RobustScaler()
        self._scaler.fit(X)
        return self

    def transform(self, X):
        debug_print(X=X, debug=self.d)
        result_X = self._scaler.transform(X)
        self._column_list = list(X.columns)
        # preserve the original index so rows stay aligned with the target
        X = pd.DataFrame(result_X, index=X.index, columns=self._column_list)
        self.colnames = X.columns.tolist()
        return X

    def get_feature_names(self):
        return self.colnames
Example #4
class ScalingTransformer(BaseEstimator, TransformerMixin):
    """
    transform dataframe first by RobustScaler to lower down the influence of outliers,
    then transform it by MinMaxScaler to range (0,1)
    """
    def __init__(self, featureList, quantile_range=(25, 75)):
        # scaling continuous value features with RobustScaler
        self.quantile_range = quantile_range
        self.robust_scaler = RobustScaler(quantile_range=self.quantile_range)
        # further scaling data to range (0, 1)
        self.min_max_scaler = MinMaxScaler()
        self.featureList = featureList

    def fit(self, X, y=None):
        #print(X['starRating'].head())
        X_ = X.copy()
        self.robust_scaler.fit(X_[self.featureList])
        X_train_robust = self.robust_scaler.transform(X_[self.featureList])
        self.min_max_scaler.fit(X_train_robust)
        return self

    def transform(self, X):
        X_ = X.copy()
        X_train_robust = self.robust_scaler.transform(X_[self.featureList])
        X_[self.featureList] = self.min_max_scaler.transform(X_train_robust)
        return X_
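A hedged sketch of how ScalingTransformer could sit in an sklearn Pipeline; the column names are hypothetical:

from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge

numeric_cols = ["price", "starRating"]  # hypothetical feature names
pipe = Pipeline([
    ("scale", ScalingTransformer(featureList=numeric_cols)),
    ("model", Ridge()),
])
# pipe.fit(X_train, y_train); pipe.predict(X_test)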
Example #5
def modeling(
    dataset: pd.DataFrame,
    hyperparams: Hyperparameters,
) -> float:
    y_target = dataset["Product_Supermarket_Sales"].tolist()
    # drop the target without mutating the caller's DataFrame
    X = dataset.drop(["Product_Supermarket_Sales"], axis=1)

    X_train, X_test, y_train, _ = train_test_split(X,
                                                   y_target,
                                                   test_size=0.3)

    scaler = RobustScaler()

    scaler.fit(X_train)

    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    gb_model = GradientBoostingRegressor(
        n_estimators=hyperparams.n_estimators,
        max_depth=hyperparams.max_depth,
        max_features=hyperparams.max_features,
        min_samples_split=hyperparams.min_samples_split,
        random_state=hyperparams.random_state,
    )

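    # X_test is scaled above but unused below; cross_validate is assumed to be
    # a project-local helper (sklearn's signature is (estimator, X, y, cv=...))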
    return cross_validate(gb_model, hyperparams.nfolds, X_train, y_train)
Example #6
class df_scaler(TransformerMixin, BaseEstimator):
    '''
    DataFrame-preserving wrapper around StandardScaler or RobustScaler
    '''
    def __init__(self, method='standard'):
        self.scl = None
        self.scale_ = None
        self.method = method
        if self.method == 'standard':
            self.mean_ = None
        elif method == 'robust':
            self.center_ = None
        self.columns = None  # useful when this is the last pipeline step before the model

    def fit(self, X, y=None):
        if self.method == 'standard':
            self.scl = StandardScaler()
            self.scl.fit(X)
            self.mean_ = pd.Series(self.scl.mean_, index=X.columns)
        elif self.method == 'robust':
            self.scl = RobustScaler()
            self.scl.fit(X)
            self.center_ = pd.Series(self.scl.center_, index=X.columns)
        self.scale_ = pd.Series(self.scl.scale_, index=X.columns)
        return self

    def transform(self, X):
        # assumes X is a DataFrame
        Xscl = self.scl.transform(X)
        Xscaled = pd.DataFrame(Xscl, index=X.index, columns=X.columns)
        self.columns = X.columns
        return Xscaled

    def get_feature_names(self):
        return list(self.columns)
Example #7
class RobustScalerPrim(primitive):
    def __init__(self, random_state=0):
        super(RobustScalerPrim, self).__init__(name='RobustScaler')
        self.id = 9
        self.hyperparams = []
        self.type = 'feature preprocess'
        self.description = (
            "Scale features using statistics that are robust to outliers. "
            "This scaler removes the median and scales the data according to "
            "the quantile range (defaults to IQR: Interquartile Range), i.e. "
            "the range between the 1st quartile (25th percentile) and the 3rd "
            "quartile (75th percentile). Centering and scaling happen "
            "independently on each feature, using statistics computed from the "
            "samples in the training set; the median and interquartile range "
            "are then stored and applied to later data via the transform "
            "method. Standardization is a common requirement for many machine "
            "learning estimators and is typically done by removing the mean "
            "and scaling to unit variance, but outliers can distort the "
            "sample mean and variance; in such cases the median and the "
            "interquartile range often give better results.")
        self.hyperparams_run = {'default': True}
        self.scaler = RobustScaler()
        self.accept_type = 'c_t'

    def can_accept(self, data):
        return self.can_accept_c(data)

    def is_needed(self, data):
        # data = handle_data(data)
        # Update
        return True

    def fit(self, data):
        data = handle_data(data)
        self.scaler.fit(data['X'])

    def produce(self, data):
        output = handle_data(data)
        cols = list(output['X'].columns)
        cols = ["{}_rbstscale".format(x) for x in cols]
        output['X'] = pd.DataFrame(self.scaler.transform(output['X']),
                                   columns=cols)
        final_output = {0: output}
        return final_output
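The description above reduces to (x - median) / IQR; a self-contained check (a sketch, not part of the original project) confirms the arithmetic:

import numpy as np
from sklearn.preprocessing import RobustScaler

X = np.array([[1.0], [2.0], [3.0], [4.0], [100.0]])  # one outlier
scaled = RobustScaler().fit_transform(X)

median = np.median(X, axis=0)
q1, q3 = np.percentile(X, [25, 75], axis=0)
assert np.allclose(scaled, (X - median) / (q3 - q1))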
Example #8
def FeatureProcessing(new_df):
    dev_id = new_df['dev_id']
    Feature = new_df.loc[:, new_df.columns != 'dev_id']
    scaler = RobustScaler()
    scaler.fit(Feature)
    Feature = scaler.transform(Feature)
    return dev_id, Feature
Example #9
def transform_robust_scale(train, test):
    features = train.loc[:, train.columns.str.contains('^g-|^c-')].columns
    scaler = RobustScaler()
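    # Caution: fitting on the concatenation of train and test leaks test-set
    # statistics into the scaler; common in competitions, unsafe in production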
    scaler.fit(pd.concat([train[features], test[features]], axis=0))
    train[features] = scaler.transform(train[features])
    test[features] = scaler.transform(test[features])
    return train, test
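A leak-free variant (a sketch under the same 'g-'/'c-' column convention) would compute the statistics from the training rows only:

def transform_robust_scale_no_leak(train, test):
    features = train.loc[:, train.columns.str.contains('^g-|^c-')].columns
    scaler = RobustScaler()
    scaler.fit(train[features])  # statistics from train only
    train[features] = scaler.transform(train[features])
    test[features] = scaler.transform(test[features])
    return train, test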
Example #10
File: data3.py Project: yyk/temp1
def process_field(a, column):
    p = a[column]

    price_scaler = MinMaxScaler(feature_range=(0, 1))
    a['%s_ln' % column] = np.log(p)
    price_scaler.fit(a['%s_ln' % column].values.reshape(-1, 1))
    a['%s_ln_scaled' % column] = price_scaler.transform(
        a['%s_ln' % column].values.reshape(-1, 1))

    intervals = [1, 5, 10, 20]
    roc_columns = []
    for interval in intervals:
        column_name = '%s_roc%d_ln' % (column, interval)
        a[column_name] = a['%s_ln' % column].diff(interval).fillna(0)
        roc_columns.append(column_name)
    roc_scaler = RobustScaler(quantile_range=(5.0, 95.0))
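    # NB: fitted on the interval-5 ROC column only (roc_columns[1]) and then
    # reused for every interval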
    roc_scaler.fit(a[roc_columns[1]].values.reshape(-1, 1))
    for interval in intervals:
        a[('%s_roc%d_ln_scaled' % (column, interval))] = roc_scaler.transform(
            a['%s_roc%d_ln' % (column, interval)].values.reshape(-1, 1))

    intervals = [5, 10, 20]
    for interval in intervals:
        # sma
        a[('%s_sma%d_ln' % (column, interval))] = np.log(
            p.rolling(interval).mean()).fillna(0)
        a[('%s_sma%d_ln_scaled' %
           (column, interval))] = price_scaler.transform(
               a['%s_sma%d_ln' % (column, interval)].values.reshape(-1, 1))
        # ema
        a[('%s_ema%d_ln' % (column, interval))] = np.log(
            p.ewm(interval).mean()).fillna(0)
        a[('%s_ema%d_ln_scaled' %
           (column, interval))] = price_scaler.transform(
               a['%s_ema%d_ln' % (column, interval)].values.reshape(-1, 1))
Example #11
def fselect_v1(h5_path, scaler_type, use_gmean, out_path):
    ''' Run feature selection for preprocess HDF5 (v1)
    '''
    logger.info('Loading training data ...')
    Xtrain_df, ytrain_df, Ntrain = load_hdf5(h5_path)
    train_columns = Xtrain_df.columns.values
    Xtrain_mat = Xtrain_df.to_numpy()  # .as_matrix() was removed in pandas 1.0
    # scale if needed
    if scaler_type == 'robust':
        logger.info('Use robust scaler')
        scaler = RobustScaler()
        scaler.fit(Xtrain_mat)
        Xtrain_mat = scaler.transform(Xtrain_mat)
    elif scaler_type == 'standard':
        logger.info('Use standard scaler')
        scaler = StandardScaler()
        scaler.fit(Xtrain_mat)
        Xtrain_mat = scaler.transform(Xtrain_mat)
    ytrain2d = make2dy(ytrain_df, Ntrain, use_gmean)
    lassocv = run_lasso(Xtrain_mat, ytrain2d)
    rndlasso = run_rndlasso(Xtrain_mat, ytrain2d, alpha=lassocv.alpha_)
    fscores = rndlasso.scores_
    res = pd.DataFrame(fscores, index=train_columns, columns=['rndlasso'])
    res.index.name = 'fname'
    res.to_csv(out_path, sep='\t')
Example #12
def linear_train(train_log, train_label, valid_log, valid_label, time_name):
    train_log = train_log.fillna(0)
    valid_log = valid_log.fillna(0)
    scaler = RobustScaler(with_centering=True,
                          with_scaling=True,
                          quantile_range=(1.0, 99.0),
                          copy=True)
    scaler.fit(train_log.values)
    Classifier.set_scaler(scaler)

    normal_train = scaler.transform(train_log.values)
    normal_valid = scaler.transform(valid_log.values)

    classifier = linear_model.LogisticRegression(class_weight='balanced',
                                                 solver="sag",
                                                 max_iter=5000,
                                                 verbose=1,
                                                 n_jobs=2)
    classifier.fit(normal_train, train_label)

    y_valid = classifier.predict(normal_valid)
    y_train = classifier.predict(normal_train)

    fpr, tpr, thresholds = metrics.roc_curve(train_label, y_train, pos_label=1)
    a = metrics.auc(fpr, tpr)
    print("train auc", a)

    fpr, tpr, thresholds = metrics.roc_curve(valid_label, y_valid, pos_label=1)
    a = metrics.auc(fpr, tpr)
    print("valid auc", a)

    return classifier, a
Example #13
def prepare_data_mean():
    data = pd.read_csv('titanic_train_500_age_passengerclass.csv', sep=',', header=0)
    yvalues = pd.DataFrame(dict(Survived=[]), dtype=int)
    yvalues["Survived"] = data["Survived"].copy()

    data.drop('Survived', axis=1, inplace=True)
    data.drop('PassengerId', axis=1, inplace=True)

    x_train = data.head(400)
    x_train = x_train.fillna(x_train.mean())

    x_test = data.tail(100)
    x_test = x_test.fillna(x_test.mean())

    y_train = yvalues.head(400)
    y_test = yvalues.tail(100)

    # Scale the data: fit on the training split only and reuse the same
    # scaler for the test split, so no test statistics leak into training
    scaler = RobustScaler()
    scaler.fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)

    return x_train, x_test, y_train, y_test
Example #14
def getScaler_fromFile(trainFile):
    # skip the header row and the first two columns of each line
    with open(trainFile) as f:
        raw_train = [l.strip().split('\t')[2:] for l in f][1:]
    trainTable = [[float(i) for i in r] for r in raw_train]
    X = np.array(trainTable)
    scaler = RobustScaler()
    scaler.fit(X)
    return scaler
Example #15
def split_xy(df, y_var='apow', step=1):
    # define X and Y
    y_col = [col for col in df.columns if col.split('.')[0] == y_var]
    X = df.iloc[:-1, :]
    Y = df[y_col].iloc[1:, 0]

    if step > 1:
        # for multi forecast
        mX = X[:-step + 1]
        mY = pd.DataFrame(Y)
        for i in range(step - 1):
            mY = pd.concat([Y.shift(i + 1), mY], axis=1)
        mY = pd.DataFrame(mY.values[step - 1:, :], index=mY.index[:-step + 1])
        x_train, x_test, y_train, y_test = train_test_split(mX,
                                                            mY,
                                                            test_size=0.1,
                                                            random_state=42)
    else:
        # single forecast
        x_train, x_test, y_train, y_test = train_test_split(X,
                                                            Y,
                                                            test_size=0.2,
                                                            random_state=42)

    # Feature Scaling
    scaler = RobustScaler()
    scaler.fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)

    #    with open('../forecast_models/scaler_new.sav', 'wb') as sc:
    #        pickle.dump(scaler, sc)

    return x_train, x_test, y_train, y_test
Example #16
class PandasRobustScaler(BaseEstimator, TransformerMixin):
    def __init__(self, with_centering=True, with_scaling=True, quantile_range=(25.0, 75.0), prefix='', suffix='__rbstscale'):
        self.scaler = None
        self.with_centering = with_centering
        self.with_scaling = with_scaling
        self.quantile_range = quantile_range
        self.center_ = None
        self.scale_ = None
        self.prefix = prefix
        self.suffix = suffix

    def fit(self, X, y=None, **fitparams):
        X = validate_dataframe(X)
        self.scaler = RobustScaler(with_centering=self.with_centering,
                                   with_scaling=self.with_scaling,
                                   quantile_range=self.quantile_range)
        self.scaler.fit(X)
        self.center_ = pd.Series(self.scaler.center_, index=X.columns)
        self.scale_ = pd.Series(self.scaler.scale_, index=X.columns)
        return self

    def transform(self, X, **transformparams):
        X = validate_dataframe(X)
        X = X.copy()
        Xrs = self.scaler.transform(X)
        Xscaled = pd.DataFrame(Xrs, index=X.index, columns=self.prefix + X.columns + self.suffix)
        return Xscaled
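A usage sketch (assuming the project's validate_dataframe helper passes a plain DataFrame through unchanged; the column values are hypothetical):

import pandas as pd

df = pd.DataFrame({"age": [22.0, 35.0, 58.0], "fare": [7.25, 71.3, 512.3]})
out = PandasRobustScaler().fit(df).transform(df)
print(out.columns.tolist())  # ['age__rbstscale', 'fare__rbstscale']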
Example #17
def scaler_dummy(dataset):

  scaler_mm = MinMaxScaler() 
  scaler_ma = MaxAbsScaler()
  scaler_sd = StandardScaler()
  scaler_rb = RobustScaler()

  numerical = list(dataset.columns)
  # copy() so the four transformed frames do not share data with the input
  data_transform_mm = dataset.copy()
  data_transform_ma = dataset.copy()
  data_transform_sd = dataset.copy()
  data_transform_rb = dataset.copy()


  scaler_mm.fit(dataset[numerical])
  scaler_ma.fit(dataset[numerical])
  scaler_sd.fit(dataset[numerical])
  scaler_rb.fit(dataset[numerical])


  data_transform_mm[numerical] = scaler_mm.transform(dataset[numerical])
  data_transform_ma[numerical] = scaler_ma.transform(dataset[numerical])
  data_transform_sd[numerical] = scaler_sd.transform(dataset[numerical])
  data_transform_rb[numerical] = scaler_rb.transform(dataset[numerical])


  ## get dummies

  features_final_mm = pd.get_dummies(data_transform_mm)
  features_final_ma = pd.get_dummies(data_transform_ma)
  features_final_sd = pd.get_dummies(data_transform_sd)
  features_final_rb = pd.get_dummies(data_transform_rb)

  return features_final_mm, features_final_ma, features_final_sd, features_final_rb
Example #18
def flatten_ts(train, test):
    new_train, new_test = [], []

    for _, row in train.iterrows():
        new_list = []
        for i in row.index:
            row[i] = row[i].dropna()
            for j in range(len(row[i])):
                new_list.append(row[i][j])
        new_train.append(new_list)

    for _, row in test.iterrows():
        new_list = []
        for i in row.index:
            row[i] = row[i].dropna()
            for j in range(len(row[i])):
                new_list.append(row[i][j])
        new_test.append(new_list)

    train_df = pd.DataFrame(new_train)
    test_df = pd.DataFrame(
        pad_sequences(new_test, maxlen=train_df.shape[1], dtype='float32'))

    scaler = RobustScaler()
    scaler.fit(train_df)
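    # RobustScaler ignores NaNs when fitting (scikit-learn >= 0.20); dropna()
    # below drops the shorter, NaN-padded rows before transforming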

    return scaler.transform(train_df.dropna()), scaler.transform(
        test_df.dropna())
Example #19
def fn_y_nhanes(y, reference_y=None):

    scaler = RobustScaler(with_centering=False, quantile_range=(0, 95))

    if reference_y is None:
        scaler.fit(y)
    else:
        scaler.fit(reference_y)

    # build the mask before transforming: m flags positive entries, and yc
    # replaces non-positive entries with 1 so the transform stays finite
    m = (y > 0).astype(float)
    yc = np.where(y > 0, y, 1.0)

    yt = scaler.transform(yc)

    # convert to torch
    yp = torch.from_numpy(yt)
    mp = torch.from_numpy(m)

    return yp, mp
Example #20
class DFRobustScaler(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None, **kwargs):
        self.columns        = columns
        self.model          = RobustScaler(**kwargs)
        self.transform_cols = None
        
    def fit(self, X, y=None):
        self.columns        = X.columns if self.columns is None else self.columns
        self.transform_cols = [x for x in X.columns if x in self.columns]
        self.model.fit(X[self.transform_cols])

        return self
    
    def transform(self, X):
        if self.transform_cols is None:
            raise NotFittedError(f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.")

        new_X = X.copy()
        new_X[self.transform_cols] = self.model.transform(X[self.transform_cols])

        return new_X
    
    def fit_transform(self, X, y=None):
        return self.fit(X).transform(X)
    
    def inverse_transform(self, X):
        if self.transform_cols is None:
            raise NotFittedError(f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.")

        new_X = X.copy()
        new_X[self.transform_cols] = self.model.inverse_transform(X[self.transform_cols])

        return new_X
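A short usage sketch (hypothetical column names) showing selective scaling and the inverse round trip:

import pandas as pd

df = pd.DataFrame({"amount": [1.0, 2.0, 100.0], "flag": [0, 1, 0]})
scaler = DFRobustScaler(columns=["amount"])    # scale 'amount' only
scaled = scaler.fit_transform(df)
restored = scaler.inverse_transform(scaled)
assert (restored["flag"] == df["flag"]).all()  # untouched column preserved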
Example #21
def get_data_split(data):

    rows = data.shape[0]  # rows
    cols = data.shape[1]  # columns

    data = data.values  # .values contains data as a Numpy Array

    # Training and test data
    train_start = 0
    train_end = int(np.floor(0.8 * rows))  # 80% of data is training data
    # test_start = train_end + 1            # remaining is for testing
    test_start = rows - 100
    test_end = rows
    data_train = data[np.arange(train_start, train_end), :]
    data_test = data[np.arange(test_start, test_end), :]

    # Scale data to handle outliers
    from sklearn.preprocessing import RobustScaler
    scaler = RobustScaler()
    scaler.fit(data_train)
    data_train = scaler.transform(data_train)
    data_test = scaler.transform(data_test)

    # Build X and y
    X_train = data_train[:, 1:]
    y_train = data_train[:, 0]  # 0th column has labels
    X_test = data_test[:, 1:]
    y_test = data_test[:, 0]  # 0th column has labels

    return X_train, y_train, X_test, y_test
Example #22
def bestRandomForest(X, y, split=0.7, ntrials=100):
    means = np.zeros(ntrials)
    max_accuracy = 0
    best_classifier = None
    for trial in range(ntrials):
        xTr, yTr, xTe, yTe, trIdx, teIdx = trteSplitEven(X, y, split, trial)
        # Train
        scaler = RobustScaler()
        scaler.fit(xTr)
        xTr = scaler.transform(xTr)
        xTe = scaler.transform(xTe)
        forest = RandomForestClassifier(class_weight="balanced")
        n_estimators = [100, 300, 600]
        max_depth = [3, xTr.shape[1] // 2 + 1, 25, 100, 300]  # max_depth must be an int
        min_samples_split = [2, 5, 10]
        min_samples_leaf = [1, 2, 10]
        hyperF = dict(n_estimators=n_estimators, max_depth=max_depth,
                      min_samples_split=min_samples_split,
                      min_samples_leaf=min_samples_leaf)

        gridF = GridSearchCV(forest, hyperF, cv=3, verbose=1, n_jobs=-1)
        gridF.fit(xTr, yTr)
        best_params = gridF.best_params_
        print(best_params)
        # Predict
        yPr = gridF.predict(xTe)
        # Compute classification error
        print("Trial:", trial, "Accuracy", "%.3g" % (100 * np.mean((yPr == yTe).astype(float))))
        means[trial] = 100 * np.mean((yPr == yTe).astype(float))
        if means[trial] > max_accuracy:
            max_accuracy = means[trial]
            best_classifier = gridF

    print("best accuracy is ", max_accuracy)
    print("best parameters are ", best_classifier.get_params())
    return best_classifier
Example #23
    def scale(self, X_train, X_test, scaler_type):
        # 'scaler_type' renamed from 'type' to avoid shadowing the builtin
        if scaler_type == "StandardScaler":
            scaler = StandardScaler()
        elif scaler_type == "MinMaxScaler":
            scaler = MinMaxScaler()
        elif scaler_type == "MaxScaler":  # key kept as-is, though it builds a MaxAbsScaler
            scaler = MaxAbsScaler()
        elif scaler_type == "RobustScaler":
            scaler = RobustScaler()
        else:
            raise ValueError("unknown scaler type: %s" % scaler_type)
        scaler.fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)
        return X_train, X_test
Example #24
class Preprocessor:
    def __init__(self, params: dict):
        self.params = params
        if params["scaler"] == "standert_scaler":
            self.scaler = StandardScaler()
        elif params["scaler"] == "robust_scaler":
            self.scaler = RobustScaler()
        else:
            print("wrong scaler parameters")
            raise KeyError

        self.encoder = {}
        if params["encoder"] == "label_encoder":
            self.base_encoder = LabelEncoder
        else:
            print("wrong encoder parameters")
            raise KeyError

    def fit_transform(self, X_old):
        X = X_old.copy()
        for var_name in X.select_dtypes(include=['object']):
            encoder = self.base_encoder()  # instantiate a fresh encoder per column
            X[var_name] = encoder.fit_transform(X[var_name].astype(str))
            self.encoder[var_name] = encoder
        self.scaler.fit(X)
        return self.scaler.transform(X)

    def transform(self, X_old):
        X = X_old.copy()
        for var_name in X.select_dtypes(include=['object']):
            # reuse the encoders fitted in fit_transform
            X[var_name] = self.encoder[var_name].transform(X[var_name].astype(str))
        return self.scaler.transform(X)
Example #25
def fit_evaluate(regr, X_train, X_val, y_train, y_val, log_y=False, scale=False, exclude_features=None):
    print("Evaluating ...")
    if y_val is None:
        X_train, y_train = separate_X_y(X_train, exclude_features)
        X_val, y_val = separate_X_y(X_val, exclude_features)

    if scale:
        scaler = RobustScaler()
        scaler.fit(X_train)
        X_train = scaler.transform(X_train)
        # Fit on train, transforming the test, avoid data leak
        X_val = scaler.transform(X_val)

    if regr:
        regr.verbose = False
        if log_y:
            regr.fit(X_train, np.log(y_train))
            y_pred = np.exp(
                np.array(regr.predict(X_val), dtype=np.float128))
        else:
            regr.fit(X_train, y_train)
            y_pred = regr.predict(X_val)

    else:
        if log_y:
            theta = normal_equation.normal_equation(
                X_train, np.log(y_train))
            y_pred = np.exp(customSGD.predict(theta, X_val))
        else:
            theta = normal_equation.normal_equation(X_train, y_train)
            y_pred = customSGD.predict(theta, X_val)

    evaluate(y_val, y_pred)
Example #26
def prepare(processing):
    # Scale
    pd.options.mode.chained_assignment = None  # disable false warning for copying

    # alternatives: StandardScaler(), MinMaxScaler(feature_range=(-1, 1))
    x_transformer = RobustScaler()
    x_transformer = x_transformer.fit(processing[input_features].to_numpy())
    x_scaled = x_transformer.transform(processing[input_features].to_numpy())

    y_transformer = RobustScaler()
    y_transformer = y_transformer.fit(processing[output_features].to_numpy())
    y_scaled = y_transformer.transform(processing[output_features].to_numpy())

    # Shuffle
    sequential_data = []

    for i in range(
            len(x_scaled) - history_period_size - future_period_predict):
        sequential_data.append([
            x_scaled[i:(i + history_period_size)],
            y_scaled[i + history_period_size + future_period_predict - 1]
        ])

    random.shuffle(sequential_data)

    # Split
    x, y = [], []

    for seq, target in sequential_data:
        x.append(seq)
        y.append(target)

    return np.array(x), np.array(y)
Example #27
def normalize_data(dataframe, mode):
    if mode == 'abs':
        from sklearn.preprocessing import MaxAbsScaler
        max_abs = MaxAbsScaler(copy=True)  #save for retransform later
        max_abs.fit(dataframe)
        data_norm = max_abs.transform(dataframe)

        return data_norm, max_abs

    if mode == 'robust':
        from sklearn.preprocessing import RobustScaler
        robust = RobustScaler(copy=True)  #save for retransform later
        robust.fit(dataframe)
        data_norm = robust.transform(dataframe)

        return data_norm, robust

    if mode == 'min_max':
        from sklearn.preprocessing import MinMaxScaler
        minmax = MinMaxScaler(feature_range=(0, 1),
                              copy=True)  #save for retransform later
        minmax.fit(dataframe)
        data_norm = minmax.transform(dataframe)

        return data_norm, minmax
    if mode == 'std':
        from sklearn.preprocessing import StandardScaler
        stdscaler = StandardScaler(copy=True, with_mean=True, with_std=True)
        stdscaler.fit(dataframe)
        data_norm = stdscaler.transform(dataframe)

        return data_norm, stdscaler
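The returned scaler enables the "retransform later" noted in the comments; a brief sketch with hypothetical values:

import numpy as np

data = np.array([[1.0], [2.0], [50.0]])
data_norm, robust = normalize_data(data, mode='robust')
assert np.allclose(robust.inverse_transform(data_norm), data)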
Example #28
File: data.py Project: als95/nntest
 def normalize(self, data):
     new_axis_data = data[np.newaxis, :]
     new_raw_data = np.append(self.raw, new_axis_data, axis=0)
     scaler = RobustScaler()
     scaler.fit(new_raw_data)
     new_normalize = scaler.transform(new_raw_data)
     return new_normalize[-1]
Example #29
def interquartile_scale(X_train, X_valid, X_test):
    scaler = RobustScaler(quantile_range=(25.0, 75.0))
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_valid = scaler.transform(X_valid) if X_valid is not None else None
    X_test = scaler.transform(X_test) if X_test is not None else None
    return X_train, X_valid, X_test
Example #30
 def transform(self, X):
     data = X.copy()
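     # NB: re-fits the scaler on every call to transform instead of reusing
     # statistics learned during fit, so results depend on the batch passed in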
     rscaler = RobustScaler()
     rscaler.fit(X=data[data.columns.intersection(self.columns)])
     data[data.columns.intersection(self.columns)] = rscaler.transform(
         data[data.columns.intersection(self.columns)])
     return data