class Data(object):
    def __init__(self, fname, random_state=42):
        self.fname = fname
        self.random_state = random_state
        self._scaler = None

    def load(self, subset=None):
        # Drop the index column and the first feature column on read.
        df = pd.read_csv(self.fname, index_col=0, skipinitialspace=True).iloc[:, 1:]
        if subset:
            df = df[subset + ["state_code"]]
        x = df.iloc[:, :-1].values
        y = df.iloc[:, -1].values
        return x, y

    def normalize(self, x, algorithm="min-max"):
        if algorithm == "min-max":
            self._scaler = MinMaxScaler()
        elif algorithm == "standard":
            self._scaler = StandardScaler()
        elif algorithm == "robust":
            self._scaler = RobustScaler(quantile_range=(25, 75))  # interquartile range
        self._scaler.fit(x)
        return self._scaler.transform(x)

    def split(self, x, y, test_ratio=0.2, random_state=None):
        if random_state is None:
            random_state = self.random_state
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=test_ratio, stratify=y, random_state=random_state)
        return x_train, x_test, y_train, y_test
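# A minimal usage sketch for the Data class above; the CSV path and the
# feature name are hypothetical, and the pandas / scikit-learn imports used
# by the class are assumed to be in scope.
data = Data("startups.csv")                      # hypothetical file
x, y = data.load(subset=["funding_total_usd"])   # hypothetical feature subset
x = data.normalize(x, algorithm="robust")
x_train, x_test, y_train, y_test = data.split(x, y, test_ratio=0.2)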
def gen_splits(X, scale=True, exclude_features=None, k=5, test_size=.1):
    X, y = separate_X_y(X, exclude_features)
    # Hold out a final test set first; the k folds are built from the
    # remaining training portion only.
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=test_size, random_state=RANDOM_STATE)
    # shuffle=True is required whenever a random_state is passed to KFold.
    kf = KFold(n_splits=k, shuffle=True, random_state=RANDOM_STATE)
    folds = []
    for train_index, val_index in kf.split(X_train):
        X_train_cv, X_val = X_train[train_index].copy(), X_train[val_index].copy()
        y_train_cv, y_val = y_train[train_index].copy(), y_train[val_index].copy()
        if scale:
            scaler = RobustScaler()
            scaler.fit(X_train_cv)
            X_train_cv = scaler.transform(X_train_cv)
            # Fit on the fold's train split only, then transform the
            # validation split, to avoid data leakage.
            X_val = scaler.transform(X_val)
        folds.append((X_train_cv, X_val, y_train_cv, y_val))
    # The scaler for the full training set runs only after the folds are
    # computed, so the cross-validation folds never see its statistics.
    if scale:
        scaler = RobustScaler()
        scaler.fit(X_train)
        X_train = scaler.transform(X_train)
        # Fit on train, transform the test split, to avoid data leakage.
        X_test = scaler.transform(X_test)
    return folds, (X_train, X_test, y_train, y_test)
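# A hedged usage sketch for gen_splits; it assumes the module defines
# RANDOM_STATE and the separate_X_y helper referenced above, plus the
# model_selection / KFold / RobustScaler imports. Ridge is only illustrative.
from sklearn.linear_model import Ridge

folds, (X_train, X_test, y_train, y_test) = gen_splits(df, scale=True, k=5, test_size=0.1)
for X_tr, X_val, y_tr, y_val in folds:
    model = Ridge().fit(X_tr, y_tr)
    print(model.score(X_val, y_val))  # per-fold validation R^2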
class CustomRobustScaler:
    """RobustScaler wrapper that returns a labeled pandas DataFrame."""

    def __init__(self, debug=False, strategy="median"):
        self._scaler = None
        self._column_list = []
        self.d = debug
        self.colnames = None

    def fit(self, X, y=None):
        self._scaler = RobustScaler()
        self._scaler.fit(X)
        return self

    def transform(self, X):
        debug_print(X=X, debug=self.d)
        result_X = self._scaler.transform(X)
        # Rebuild the DataFrame so column labels and the index survive
        # the transform.
        self._column_list = list(X.columns)
        X = pd.DataFrame(result_X, index=X.index, columns=self._column_list)
        self.colnames = X.columns.tolist()
        return X

    def get_feature_names(self):
        return self.colnames
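# A minimal usage sketch; the toy DataFrame is hypothetical, and the
# debug_print helper called in transform is assumed to exist in the module.
df = pd.DataFrame({"a": [1.0, 2.0, 100.0], "b": [3.0, 4.0, 5.0]})
crs = CustomRobustScaler().fit(df)
scaled = crs.transform(df)        # still a labeled DataFrame
print(crs.get_feature_names())    # ['a', 'b']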
class ScalingTransformer(BaseEstimator, TransformerMixin):
    """Scale continuous features in two stages: first with RobustScaler to
    damp the influence of outliers, then with MinMaxScaler to map the result
    into the (0, 1) range.
    """

    def __init__(self, featureList, quantile_range=(25, 75)):
        # Scale continuous-valued features with RobustScaler ...
        self.quantile_range = quantile_range
        self.robust_scaler = RobustScaler(quantile_range=self.quantile_range)
        # ... then rescale the result into (0, 1).
        self.min_max_scaler = MinMaxScaler()
        self.featureList = featureList

    def fit(self, X, y=None):
        X_ = X.copy()
        self.robust_scaler.fit(X_[self.featureList])
        X_train_robust = self.robust_scaler.transform(X_[self.featureList])
        self.min_max_scaler.fit(X_train_robust)
        return self

    def transform(self, X):
        X_ = X.copy()
        X_train_robust = self.robust_scaler.transform(X_[self.featureList])
        X_[self.featureList] = self.min_max_scaler.transform(X_train_robust)
        return X_
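# A hedged sketch of ScalingTransformer inside a scikit-learn Pipeline; the
# toy data, column names, and Ridge estimator are illustrative assumptions.
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge

X_demo = pd.DataFrame({"price": [10.0, 12.0, 200.0],
                       "starRating": [3.0, 4.5, 5.0]})
y_demo = [1.0, 2.0, 3.0]
pipe = Pipeline([
    ("scale", ScalingTransformer(featureList=["price", "starRating"])),
    ("model", Ridge()),
])
pipe.fit(X_demo, y_demo)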
def modeling(
    dataset: pd.DataFrame,
    hyperparams: Hyperparameters,
) -> float:
    y_target = dataset["Product_Supermarket_Sales"].tolist()
    # Drop the target without mutating the caller's DataFrame.
    dataset = dataset.drop(["Product_Supermarket_Sales"], axis=1)
    X_train, X_test, y_train, _ = train_test_split(dataset, y_target, test_size=0.3)
    # Fit the scaler on the training split only, then apply it to both splits.
    scaler = RobustScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    gb_model = GradientBoostingRegressor(
        n_estimators=hyperparams.n_estimators,
        max_depth=hyperparams.max_depth,
        max_features=hyperparams.max_features,
        min_samples_split=hyperparams.min_samples_split,
        random_state=hyperparams.random_state,
    )
    return cross_validate(gb_model, hyperparams.nfolds, X_train, y_train)
class df_scaler(TransformerMixin, BaseEstimator):
    '''Wrapper of StandardScaler or RobustScaler that keeps DataFrame labels.'''

    def __init__(self, method='standard'):
        self.scl = None
        self.scale_ = None
        self.method = method
        if self.method == 'standard':
            self.mean_ = None
        elif method == 'robust':
            self.center_ = None
        # Keeping the column names is useful when this is the last step
        # of a pipeline before the model.
        self.columns = None

    def fit(self, X, y=None):
        if self.method == 'standard':
            self.scl = StandardScaler()
            self.scl.fit(X)
            self.mean_ = pd.Series(self.scl.mean_, index=X.columns)
        elif self.method == 'robust':
            self.scl = RobustScaler()
            self.scl.fit(X)
            self.center_ = pd.Series(self.scl.center_, index=X.columns)
        self.scale_ = pd.Series(self.scl.scale_, index=X.columns)
        return self

    def transform(self, X):
        # Assumes X is a DataFrame.
        Xscl = self.scl.transform(X)
        Xscaled = pd.DataFrame(Xscl, index=X.index, columns=X.columns)
        self.columns = X.columns
        return Xscaled

    def get_feature_names(self):
        return list(self.columns)
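# A minimal usage sketch; the toy DataFrame is hypothetical. The fitted
# statistics come back as pandas Series labeled by column name.
df = pd.DataFrame({"age": [22.0, 35.0, 58.0], "fare": [7.25, 71.28, 8.05]})
scaler = df_scaler(method='robust').fit(df)
print(scaler.center_)          # per-column medians, indexed by column name
print(scaler.scale_)           # per-column IQRs, indexed by column name
scaled = scaler.transform(df)  # still a labeled DataFrame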
class RobustScalerPrim(primitive):
    def __init__(self, random_state=0):
        super(RobustScalerPrim, self).__init__(name='RobustScaler')
        self.id = 9
        self.hyperparams = []
        self.type = 'feature preprocess'
        self.description = (
            "Scale features using statistics that are robust to outliers. "
            "This Scaler removes the median and scales the data according to "
            "the quantile range (defaults to IQR: Interquartile Range). The "
            "IQR is the range between the 1st quartile (25th quantile) and "
            "the 3rd quartile (75th quantile). Centering and scaling happen "
            "independently on each feature by computing the relevant "
            "statistics on the samples in the training set. Median and "
            "interquartile range are then stored to be used on later data "
            "using the transform method. Standardization of a dataset is a "
            "common requirement for many machine learning estimators. "
            "Typically this is done by removing the mean and scaling to unit "
            "variance. However, outliers can often influence the sample mean "
            "/ variance in a negative way. In such cases, the median and the "
            "interquartile range often give better results.")
        self.hyperparams_run = {'default': True}
        self.scaler = RobustScaler()
        self.accept_type = 'c_t'

    def can_accept(self, data):
        return self.can_accept_c(data)

    def is_needed(self, data):
        return True

    def fit(self, data):
        data = handle_data(data)
        self.scaler.fit(data['X'])

    def produce(self, data):
        output = handle_data(data)
        # Suffix the column names so downstream steps can tell scaled
        # features apart from the originals.
        cols = ["{}_rbstscale".format(x) for x in output['X'].columns]
        output['X'] = pd.DataFrame(self.scaler.transform(output['X']), columns=cols)
        return {0: output}
def FeatureProcessing(new_df):
    dev_id = new_df['dev_id']
    # Scale every column except the device identifier.
    features = new_df.loc[:, new_df.columns != 'dev_id']
    scaler = RobustScaler()
    scaler.fit(features)
    features = scaler.transform(features)
    return dev_id, features
def transform_robust_scale(train, test):
    # Select the columns whose names start with 'g-' or 'c-'.
    features = train.loc[:, train.columns.str.contains('^g-|^c-')].columns
    scaler = RobustScaler()
    # Note: fitting on train and test together leaks test-set statistics
    # into the transform; fit on train alone if that matters for your setup.
    scaler.fit(pd.concat([train[features], test[features]], axis=0))
    train[features] = scaler.transform(train[features])
    test[features] = scaler.transform(test[features])
    return train, test
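# A hedged alternative that fits only on the training frame, for pipelines
# where test statistics must not influence the scaler; same signature as above.
def transform_robust_scale_train_only(train, test):
    features = train.loc[:, train.columns.str.contains('^g-|^c-')].columns
    scaler = RobustScaler()
    scaler.fit(train[features])
    train[features] = scaler.transform(train[features])
    test[features] = scaler.transform(test[features])
    return train, test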
def process_field(a, column):
    p = a[column]
    price_scaler = MinMaxScaler(feature_range=(0, 1))
    # Log-price, scaled into (0, 1); assumes strictly positive prices.
    a['%s_ln' % column] = np.log(p)
    price_scaler.fit(a['%s_ln' % column].values.reshape(-1, 1))
    a['%s_ln_scaled' % column] = price_scaler.transform(
        a['%s_ln' % column].values.reshape(-1, 1))
    # Rate-of-change (log differences) over several horizons.
    intervals = [1, 5, 10, 20]
    roc_columns = []
    for interval in intervals:
        column_name = '%s_roc%d_ln' % (column, interval)
        a[column_name] = a['%s_ln' % column].diff(interval).fillna(0)
        roc_columns.append(column_name)
    # One robust scaler, fitted on the 5-step ROC column, is shared across
    # all horizons so they stay on a comparable scale.
    roc_scaler = RobustScaler(quantile_range=(5.0, 95.0))
    roc_scaler.fit(a[roc_columns[1]].values.reshape(-1, 1))
    for interval in intervals:
        a['%s_roc%d_ln_scaled' % (column, interval)] = roc_scaler.transform(
            a['%s_roc%d_ln' % (column, interval)].values.reshape(-1, 1))
    # Simple and exponential moving averages of the price, in log space,
    # reusing the scaler fitted on the log-price itself.
    intervals = [5, 10, 20]
    for interval in intervals:
        # SMA
        a['%s_sma%d_ln' % (column, interval)] = np.log(
            p.rolling(interval).mean()).fillna(0)
        a['%s_sma%d_ln_scaled' % (column, interval)] = price_scaler.transform(
            a['%s_sma%d_ln' % (column, interval)].values.reshape(-1, 1))
        # EMA
        a['%s_ema%d_ln' % (column, interval)] = np.log(
            p.ewm(interval).mean()).fillna(0)
        a['%s_ema%d_ln_scaled' % (column, interval)] = price_scaler.transform(
            a['%s_ema%d_ln' % (column, interval)].values.reshape(-1, 1))
def fselect_v1(h5_path, scaler_type, use_gmean, out_path):
    '''Run feature selection for preprocessed HDF5 (v1).'''
    logger.info('Loading training data ...')
    Xtrain_df, ytrain_df, Ntrain = load_hdf5(h5_path)
    train_columns = Xtrain_df.columns.values
    # .as_matrix() was removed in pandas 1.0; use .to_numpy() instead.
    Xtrain_mat = Xtrain_df.to_numpy()
    # Scale if requested.
    if scaler_type == 'robust':
        logger.info('Use robust scaler')
        scaler = RobustScaler()
        scaler.fit(Xtrain_mat)
        Xtrain_mat = scaler.transform(Xtrain_mat)
    elif scaler_type == 'standard':
        logger.info('Use standard scaler')
        scaler = StandardScaler()
        scaler.fit(Xtrain_mat)
        Xtrain_mat = scaler.transform(Xtrain_mat)
    ytrain2d = make2dy(ytrain_df, Ntrain, use_gmean)
    # LassoCV picks the regularisation strength; randomized lasso then
    # scores the features at that alpha.
    lassocv = run_lasso(Xtrain_mat, ytrain2d)
    rndlasso = run_rndlasso(Xtrain_mat, ytrain2d, alpha=lassocv.alpha_)
    fscores = rndlasso.scores_
    res = pd.DataFrame(fscores, index=train_columns, columns=['rndlasso'])
    res.index.name = 'fname'
    res.to_csv(out_path, sep='\t')
def linear_train(train_log, train_label, valid_log, valid_label, time_name):
    train_log = train_log.fillna(0)
    valid_log = valid_log.fillna(0)
    # A wide quantile range (1st-99th percentile) keeps all but the most
    # extreme outliers inside the scale estimate.
    scaler = RobustScaler(with_centering=True, with_scaling=True,
                          quantile_range=(1.0, 99.0), copy=True)
    scaler.fit(train_log.values)
    Classifier.set_scaler(scaler)
    normal_train = scaler.transform(train_log.values)
    normal_valid = scaler.transform(valid_log.values)
    classifier = linear_model.LogisticRegression(class_weight='balanced',
                                                 solver="sag", max_iter=5000,
                                                 verbose=1, n_jobs=2)
    classifier.fit(normal_train, train_label)
    y_valid = classifier.predict(normal_valid)
    y_train = classifier.predict(normal_train)
    fpr, tpr, thresholds = metrics.roc_curve(train_label, y_train, pos_label=1)
    a = metrics.auc(fpr, tpr)
    print("train auc", a)
    fpr, tpr, thresholds = metrics.roc_curve(valid_label, y_valid, pos_label=1)
    a = metrics.auc(fpr, tpr)
    print("valid auc", a)
    return classifier, a
def prepare_data_mean():
    data = pd.read_csv('titanic_train_500_age_passengerclass.csv', sep=',', header=0)
    yvalues = pd.DataFrame(dict(Survived=[]), dtype=int)
    yvalues["Survived"] = data["Survived"].copy()
    data.drop('Survived', axis=1, inplace=True)
    data.drop('PassengerId', axis=1, inplace=True)
    x_train = data.head(400)
    x_train = x_train.fillna(x_train.mean())
    x_test = data.tail(100)
    x_test = x_test.fillna(x_test.mean())
    y_train = yvalues.head(400)
    y_test = yvalues.tail(100)
    # Scale our data: fit on the training split only and reuse that fit for
    # the test split, rather than re-fitting on the test data.
    scaler = RobustScaler()
    scaler.fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)
    return x_train, x_test, y_train, y_test
def getScaler_fromFile(trainFile):
    # Skip the header row and the first two columns (assumed identifiers),
    # then parse the remaining tab-separated fields as floats.
    with open(trainFile) as f:
        raw_train = [l.strip().split('\t')[2:] for l in f][1:]
    trainTable = [[float(i) for i in r] for r in raw_train]
    X = np.array(trainTable)
    scaler = RobustScaler()
    scaler.fit(X)
    return scaler
def split_xy(df, y_var='apow', step=1):
    # Define X and Y: predict the next-step target from the current row.
    y_col = [col for col in df.columns if col.split('.')[0] == y_var]
    X = df.iloc[:-1, :]
    Y = df[y_col].iloc[1:, 0]
    if step > 1:
        # Multi-step forecast: stack `step` shifted copies of the target.
        mX = X[:-step + 1]
        mY = pd.DataFrame(Y)
        for i in range(step - 1):
            mY = pd.concat([Y.shift(i + 1), mY], axis=1)
        mY = pd.DataFrame(mY.values[step - 1:, :], index=mY.index[:-step + 1])
        x_train, x_test, y_train, y_test = train_test_split(
            mX, mY, test_size=0.1, random_state=42)
    else:
        # Single-step forecast.
        x_train, x_test, y_train, y_test = train_test_split(
            X, Y, test_size=0.2, random_state=42)
    # Feature scaling: fit on the training split only.
    scaler = RobustScaler()
    scaler.fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)
    # with open('../forecast_models/scaler_new.sav', 'wb') as sc:
    #     pickle.dump(scaler, sc)
    return x_train, x_test, y_train, y_test
class PandasRobustScaler(BaseEstimator, TransformerMixin):
    def __init__(self, with_centering=True, with_scaling=True,
                 quantile_range=(25.0, 75.0), prefix='', suffix='__rbstscale'):
        self.scaler = None
        self.with_centering = with_centering
        self.with_scaling = with_scaling
        self.quantile_range = quantile_range
        self.center_ = None
        self.scale_ = None
        self.prefix = prefix
        self.suffix = suffix

    def fit(self, X, y=None, **fitparams):
        X = validate_dataframe(X)
        self.scaler = RobustScaler(with_centering=self.with_centering,
                                   with_scaling=self.with_scaling,
                                   quantile_range=self.quantile_range)
        self.scaler.fit(X)
        # Expose the fitted statistics as Series labeled by column name.
        self.center_ = pd.Series(self.scaler.center_, index=X.columns)
        self.scale_ = pd.Series(self.scaler.scale_, index=X.columns)
        return self

    def transform(self, X, **transformparams):
        X = validate_dataframe(X)
        X = X.copy()
        Xrs = self.scaler.transform(X)
        # Decorate the output column names with the configured prefix/suffix.
        Xscaled = pd.DataFrame(Xrs, index=X.index,
                               columns=self.prefix + X.columns + self.suffix)
        return Xscaled
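# A minimal usage sketch; the toy frame is hypothetical, and
# validate_dataframe is assumed to be the module's own DataFrame check.
df = pd.DataFrame({"x1": [1.0, 2.0, 9.0], "x2": [0.1, 0.2, 0.3]})
prs = PandasRobustScaler(suffix='__rbstscale').fit(df)
out = prs.transform(df)
print(out.columns.tolist())  # ['x1__rbstscale', 'x2__rbstscale']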
def scaler_dummy(dataset):
    scaler_mm = MinMaxScaler()
    scaler_ma = MaxAbsScaler()
    scaler_sd = StandardScaler()
    scaler_rb = RobustScaler()
    numerical = list(dataset.columns)
    # Use explicit copies; pd.DataFrame(data=dataset) shares the underlying
    # data, so the four "copies" would silently overwrite each other.
    data_transform_mm = dataset.copy()
    data_transform_ma = dataset.copy()
    data_transform_sd = dataset.copy()
    data_transform_rb = dataset.copy()
    scaler_mm.fit(dataset[numerical])
    scaler_ma.fit(dataset[numerical])
    scaler_sd.fit(dataset[numerical])
    scaler_rb.fit(dataset[numerical])
    data_transform_mm[numerical] = scaler_mm.transform(dataset[numerical])
    data_transform_ma[numerical] = scaler_ma.transform(dataset[numerical])
    data_transform_sd[numerical] = scaler_sd.transform(dataset[numerical])
    data_transform_rb[numerical] = scaler_rb.transform(dataset[numerical])
    # Get dummies for any categorical columns.
    features_final_mm = pd.get_dummies(data_transform_mm)
    features_final_ma = pd.get_dummies(data_transform_ma)
    features_final_sd = pd.get_dummies(data_transform_sd)
    features_final_rb = pd.get_dummies(data_transform_rb)
    return features_final_mm, features_final_ma, features_final_sd, features_final_rb
def flatten_ts(train, test):
    # Flatten each row of nested series into a single flat list of values.
    new_train, new_test = [], []
    for _, row in train.iterrows():
        new_list = []
        for i in row.index:
            row[i] = row[i].dropna()
            for j in range(len(row[i])):
                new_list.append(row[i][j])
        new_train.append(new_list)
    for _, row in test.iterrows():
        new_list = []
        for i in row.index:
            row[i] = row[i].dropna()
            for j in range(len(row[i])):
                new_list.append(row[i][j])
        new_test.append(new_list)
    train_df = pd.DataFrame(new_train)
    # Pad the test sequences to the training width before scaling.
    test_df = pd.DataFrame(
        pad_sequences(new_test, maxlen=train_df.shape[1], dtype='float32'))
    scaler = RobustScaler()
    scaler.fit(train_df)
    return scaler.transform(train_df.dropna()), scaler.transform(test_df.dropna())
def fn_y_nhanes(y, reference_y=None):
    scaler = RobustScaler(with_centering=False, quantile_range=(0, 95))
    if reference_y is None:
        scaler.fit(y)
    else:
        scaler.fit(reference_y)
    # Get the mask before transforming: 1 where a value is observed
    # (positive); non-positive entries are replaced with 1 so the scaler
    # leaves them harmless.
    m = (y > 0).astype(np.float64)
    yc = np.where(y > 0, y, 1.0)
    yt = scaler.transform(yc)
    # Convert to torch tensors.
    yp = torch.from_numpy(yt)
    mp = torch.from_numpy(m)
    return yp, mp
class DFRobustScaler(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None, **kwargs):
        self.columns = columns
        self.model = RobustScaler(**kwargs)
        self.transform_cols = None

    def fit(self, X, y=None):
        self.columns = X.columns if self.columns is None else self.columns
        self.transform_cols = [x for x in X.columns if x in self.columns]
        self.model.fit(X[self.transform_cols])
        return self

    def transform(self, X):
        if self.transform_cols is None:
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. "
                f"Call 'fit' with appropriate arguments before using this estimator.")
        new_X = X.copy()
        new_X[self.transform_cols] = self.model.transform(X[self.transform_cols])
        return new_X

    def fit_transform(self, X, y=None):
        return self.fit(X).transform(X)

    def inverse_transform(self, X):
        if self.transform_cols is None:
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. "
                f"Call 'fit' with appropriate arguments before using this estimator.")
        new_X = X.copy()
        new_X[self.transform_cols] = self.model.inverse_transform(X[self.transform_cols])
        return new_X
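# A minimal usage sketch of DFRobustScaler on a subset of columns; the frame
# and column names are hypothetical. NotFittedError is sklearn.exceptions'.
df = pd.DataFrame({"a": [1.0, 2.0, 50.0], "b": [10.0, 20.0, 30.0]})
dfrs = DFRobustScaler(columns=["a"]).fit(df)
scaled = dfrs.transform(df)                # only column "a" is scaled
restored = dfrs.inverse_transform(scaled)  # round-trips back to the input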
def get_data_split(data):
    rows = data.shape[0]
    data = data.values  # .values exposes the data as a NumPy array
    # Training and test data: 80% of the rows are training data, and the
    # test window is pinned to the last 100 rows.
    train_start = 0
    train_end = int(np.floor(0.8 * rows))
    # test_start = train_end + 1  # alternative: everything after training
    test_start = rows - 100
    test_end = rows
    data_train = data[np.arange(train_start, train_end), :]
    data_test = data[np.arange(test_start, test_end), :]
    # Scale data to handle outliers, fitting on the training window only.
    from sklearn.preprocessing import RobustScaler
    scaler = RobustScaler()
    scaler.fit(data_train)
    data_train = scaler.transform(data_train)
    data_test = scaler.transform(data_test)
    # Build X and y: the 0th column holds the labels.
    X_train = data_train[:, 1:]
    y_train = data_train[:, 0]
    X_test = data_test[:, 1:]
    y_test = data_test[:, 0]
    return X_train, y_train, X_test, y_test
def bestRandomForest(X, y, split=0.7, ntrials=100):
    means = np.zeros(ntrials,)
    max_accuracy = 0
    best_classifier = None
    for trial in range(ntrials):
        xTr, yTr, xTe, yTe, trIdx, teIdx = trteSplitEven(X, y, split, trial)
        # Train: scale with statistics fitted on the training split only.
        scaler = RobustScaler()
        scaler.fit(xTr)
        xTr = scaler.transform(xTr)
        xTe = scaler.transform(xTe)
        forest = RandomForestClassifier(class_weight="balanced")
        # max_depth must be an int, hence the floor division.
        n_estimators = [100, 300, 600]
        max_depth = [3, xTr.shape[1] // 2 + 1, 25, 100, 300]
        min_samples_split = [2, 5, 10]
        min_samples_leaf = [1, 2, 10]
        hyperF = dict(n_estimators=n_estimators, max_depth=max_depth,
                      min_samples_split=min_samples_split,
                      min_samples_leaf=min_samples_leaf)
        gridF = GridSearchCV(forest, hyperF, cv=3, verbose=1, n_jobs=-1)
        gridF.fit(xTr, yTr)
        print(gridF.best_params_)
        # Predict and compute the classification accuracy.
        yPr = gridF.predict(xTe)
        means[trial] = 100 * np.mean((yPr == yTe).astype(float))
        print("Trial:", trial, "Accuracy", "%.3g" % means[trial])
        if means[trial] > max_accuracy:
            max_accuracy = means[trial]
            best_classifier = gridF
    print("best accuracy is ", max_accuracy)
    print("best parameters are ", best_classifier.get_params())
    return best_classifier
def scale(self, X_train, X_test, scaler_type):
    # `scaler_type` selects the scikit-learn scaler by name (renamed from
    # `type` to avoid shadowing the builtin); the chosen scaler is fitted on
    # the training split and applied to both splits.
    if scaler_type == "StandardScaler":
        scaler = StandardScaler()
    elif scaler_type == "MinMaxScaler":
        scaler = MinMaxScaler()
    elif scaler_type == "MaxScaler":
        scaler = MaxAbsScaler()
    elif scaler_type == "RobustScaler":
        scaler = RobustScaler()
    else:
        raise ValueError(f"Unknown scaler type: {scaler_type}")
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test
class Preprocessor:
    def __init__(self, params: dict):
        self.params = params
        if params["scaler"] == "standard_scaler":
            self.scaler = StandardScaler()
        elif params["scaler"] == "robust_scaler":
            self.scaler = RobustScaler()
        else:
            print("wrong scaler parameters")
            raise KeyError
        self.encoder = {}
        if params["encoder"] == "label_encoder":
            self.base_encoder = LabelEncoder
        else:
            print("wrong encoder parameters")
            raise KeyError

    def fit_transform(self, X_old):
        X = X_old.copy()
        # Fit one label encoder per object-typed column and keep it for reuse.
        for var_name in X.select_dtypes(include=['object']):
            encoder = self.base_encoder()
            X[var_name] = encoder.fit_transform(X[var_name].astype(str))
            self.encoder[var_name] = encoder
        self.scaler.fit(X)
        return self.scaler.transform(X)

    def transform(self, X_old):
        X = X_old.copy()
        # Reuse the encoders and the scaler fitted in fit_transform.
        for var_name in X.select_dtypes(include=['object']):
            X[var_name] = self.encoder[var_name].transform(X[var_name].astype(str))
        return self.scaler.transform(X)
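# A minimal usage sketch for Preprocessor; the params dict keys mirror the
# strings checked in __init__, and the toy DataFrame is hypothetical.
df = pd.DataFrame({"city": ["a", "b", "a"], "income": [10.0, 20.0, 15.0]})
prep = Preprocessor({"scaler": "robust_scaler", "encoder": "label_encoder"})
train_arr = prep.fit_transform(df)  # fits encoders + scaler on df
new_arr = prep.transform(df)        # reuses the fitted encoders + scaler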
def fit_evaluate(regr, X_train, X_val, y_train, y_val, log_y=False,
                 scale=False, exclude_features=None):
    print("Evaluating ...")
    if y_val is None:
        X_train, y_train = separate_X_y(X_train, exclude_features)
        X_val, y_val = separate_X_y(X_val, exclude_features)
    if scale:
        scaler = RobustScaler()
        scaler.fit(X_train)
        X_train = scaler.transform(X_train)
        # Fit on train, transform the validation split to avoid data leakage.
        X_val = scaler.transform(X_val)
    if regr:
        regr.verbose = False
        if log_y:
            # Train on log targets; the extended precision guards against
            # overflow when exponentiating the predictions back.
            # (np.float128 is unavailable on some platforms, e.g. Windows.)
            regr.fit(X_train, np.log(y_train))
            y_pred = np.exp(np.array(regr.predict(X_val), dtype=np.float128))
        else:
            regr.fit(X_train, y_train)
            y_pred = regr.predict(X_val)
    else:
        # Fall back to the closed-form normal equation with a custom predictor.
        if log_y:
            theta = normal_equation.normal_equation(X_train, np.log(y_train))
            y_pred = np.exp(customSGD.predict(theta, X_val))
        else:
            theta = normal_equation.normal_equation(X_train, y_train)
            y_pred = customSGD.predict(theta, X_val)
    evaluate(y_val, y_pred)
def prepare(processing):
    # Scale; assumes module-level input_features, output_features,
    # history_period_size and future_period_predict.
    pd.options.mode.chained_assignment = None  # disable a false copy warning
    # Alternatives: StandardScaler(), MinMaxScaler(feature_range=(-1, 1))
    x_transformer = RobustScaler()
    x_transformer = x_transformer.fit(processing[input_features].to_numpy())
    x_scaled = x_transformer.transform(processing[input_features].to_numpy())
    y_transformer = RobustScaler()
    y_transformer = y_transformer.fit(processing[output_features].to_numpy())
    y_scaled = y_transformer.transform(processing[output_features].to_numpy())
    # Shuffle: build (history window, future target) pairs, then shuffle them.
    sequential_data = []
    for i in range(len(x_scaled) - history_period_size - future_period_predict):
        sequential_data.append([
            x_scaled[i:(i + history_period_size)],
            y_scaled[i + history_period_size + future_period_predict - 1]
        ])
    random.shuffle(sequential_data)
    # Split into model inputs and targets.
    x, y = [], []
    for seq, target in sequential_data:
        x.append(seq)
        y.append(target)
    return np.array(x), np.array(y)
def normalize_data(dataframe, mode):
    # Each branch returns the transformed data plus the fitted scaler,
    # which is saved so the transform can be inverted later.
    if mode == 'abs':
        from sklearn.preprocessing import MaxAbsScaler
        max_abs = MaxAbsScaler(copy=True)
        max_abs.fit(dataframe)
        data_norm = max_abs.transform(dataframe)
        return data_norm, max_abs
    if mode == 'robust':
        from sklearn.preprocessing import RobustScaler
        robust = RobustScaler(copy=True)
        robust.fit(dataframe)
        data_norm = robust.transform(dataframe)
        return data_norm, robust
    if mode == 'min_max':
        from sklearn.preprocessing import MinMaxScaler
        minmax = MinMaxScaler(feature_range=(0, 1), copy=True)
        minmax.fit(dataframe)
        data_norm = minmax.transform(dataframe)
        return data_norm, minmax
    if mode == 'std':
        from sklearn.preprocessing import StandardScaler
        stdscaler = StandardScaler(copy=True, with_mean=True, with_std=True)
        stdscaler.fit(dataframe)
        data_norm = stdscaler.transform(dataframe)
        return data_norm, stdscaler
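# A hedged usage sketch: the returned scaler can undo the normalization via
# inverse_transform. The toy DataFrame is hypothetical.
df = pd.DataFrame({"v": [1.0, 2.0, 100.0]})
data_norm, scaler = normalize_data(df, mode='robust')
restored = scaler.inverse_transform(data_norm)  # back to the original values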
def normalize(self, data):
    # Append the new sample to the stored raw data, re-fit the scaler on the
    # combined set, and return only the scaled version of the new sample.
    new_axis_data = data[np.newaxis, :]
    new_raw_data = np.append(self.raw, new_axis_data, axis=0)
    scaler = RobustScaler()
    scaler.fit(new_raw_data)
    new_normalize = scaler.transform(new_raw_data)
    return new_normalize[-1]
def interquartile_scale(X_train, X_valid, X_test):
    # Fit on the training split; the validation and test splits are optional.
    scaler = RobustScaler(quantile_range=(25.0, 75.0))
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_valid = scaler.transform(X_valid) if X_valid is not None else None
    X_test = scaler.transform(X_test) if X_test is not None else None
    return X_train, X_valid, X_test
def transform(self, X):
    data = X.copy()
    # Note: the scaler is fitted inside transform, so every call re-fits on
    # the incoming data; move the fit into a fit() method if train/test
    # consistency matters.
    rscaler = RobustScaler()
    rscaler.fit(X=data[data.columns.intersection(self.columns)])
    data[data.columns.intersection(self.columns)] = rscaler.transform(
        data[data.columns.intersection(self.columns)])
    return data