def get_scaler(scale_method='StandardScaler'):
    """
    Build a fresh scikit-learn scaler selected by its class name.

    :param scale_method: one of 'StandardScaler', 'MinMaxScaler',
        'MaxAbsScaler', 'RobustScaler', 'QuantileTransformer', 'Normalizer'
        or 'PowerTransformer'
    :returns: a new instance created with default arguments, or ``None``
        (after printing a notice) when the name is not recognised
    :raises: none
    """
    supported = (
        'StandardScaler',
        'MinMaxScaler',
        'MaxAbsScaler',
        'RobustScaler',
        'QuantileTransformer',
        'Normalizer',
        'PowerTransformer',
    )
    # Linear ``==`` scan: the class attribute is only looked up for a match.
    for candidate in supported:
        if scale_method == candidate:
            return getattr(preprocessing, candidate)()

    print(scale_method, ' not found')
    return None
def norm_and_zscore_ML_array(ML_array, robust=False, decomp=False, gauss=False):
    """
    Normalise a 3-D ML array after flattening it to 2-D.

    The default preprocessing is min-max scaling.

    ML_array : array of shape (Cut Trials, Features, Frames), where Cut Trials
        is the number of trials in either the training or the testing split
        (split the data beforehand; do not call this on the full data set).
    robust : boolean flag, use scikit-learn's RobustScaler instead of min-max.
    decomp : boolean flag, post-processing step that projects the scaled data
        onto its first 20 whitened PCA components to remove linear dependence.
        Needs at least 20 trials and 20 flattened features.
    gauss : boolean flag, use a Box-Cox PowerTransformer (no standardisation);
        only consulted when ``robust`` is False.

    Returns the scaled array of shape (Cut Trials, Features * Frames), or
    (Cut Trials, 20) when ``decomp`` is set.
    """
    if robust:
        # ``robust_scale`` is a plain function that needs the data up front;
        # the estimator class is what offers ``fit_transform``.
        pt = preprocessing.RobustScaler()
    elif gauss:
        pt = preprocessing.PowerTransformer(method='box-cox', standardize=False)
    else:
        pt = preprocessing.MinMaxScaler()

    n_trials = ML_array.shape[0]
    flat_width = ML_array.shape[1] * ML_array.shape[2]
    # Scaling is applied column-wise over the flattened feature axis.
    r_ML_array = pt.fit_transform(ML_array.reshape(n_trials, flat_width))

    if decomp:
        # Decorrelate linearly dependent features, if any exist.
        pca = decomposition.PCA(n_components=20, whiten=True)
        # ``fit`` alone returns the estimator; ``fit_transform`` returns the
        # projected data.
        r_ML_array = pca.fit_transform(r_ML_array)
    return r_ML_array
def NormaliseColumnValues(df, normFactor):
    '''
    Normalise every column of ``df`` except ``Contig``.

    normFactor:
      - 'none'       : return ``df`` untouched.
      - 'unit'       : scale the numeric columns in place with
                       ``preprocessing.scale`` (axis=0) and return ``df``.
      - 'yeojohnson' : return a NEW frame holding the standardised Yeo-Johnson
                       transform of the numeric columns, with ``Contig``
                       re-inserted as the first column.
      - anything else: return ``df`` unchanged.

    Raises ValueError if ``df`` has no ``Contig`` column (unless normFactor is
    'none').
    '''
    if normFactor == 'none':
        return df

    # Only consider numeric data: everything except the contig name column.
    colsToTransform = list(df.columns)
    colsToTransform.remove('Contig')

    if normFactor == 'unit':
        # Straight out of the preprocessing.scale documentation.
        df[colsToTransform] = preprocessing.scale(df[colsToTransform], axis=0)
        return df

    if normFactor == 'yeojohnson':
        # Taken from https://scikit-learn.org/stable/modules/preprocessing.html
        pt = preprocessing.PowerTransformer(method='yeo-johnson', standardize=True)
        # Select by name, not position, so the transformed columns always
        # match ``colsToTransform`` wherever ``Contig`` sits.
        normArray = pt.fit_transform(df[colsToTransform])

        # Keep the caller's index; otherwise inserting ``Contig`` (aligned by
        # index) yields NaN/misplaced names for non-default indexes.
        normDf = pd.DataFrame(normArray, columns=colsToTransform, index=df.index)
        normDf.insert(loc=0, column='Contig', value=df['Contig'])
        return normDf

    # Unrecognised option: hand the frame back unchanged.
    return df
def scale_features(X, Y, name='Standard'):
    """Fit the scaler selected by ``name`` and return the transformed features.

    Parameters
    ----------
    X : list
        Features to scale
    Y : list
        Targets, forwarded to ``fit_transform`` alongside ``X``
    name : str
        'MinMax', 'Robust', 'Quantile', 'Power' or 'Standard'; any other value
        prints a warning and falls back to 'Standard'

    Returns
    -------
    list
        Scaled features
    """
    known = (
        ('MinMax', 'MinMaxScaler'),
        ('Robust', 'RobustScaler'),
        ('Quantile', 'QuantileTransformer'),
        ('Power', 'PowerTransformer'),
        ('Standard', 'StandardScaler'),
    )
    class_name = next((cls for key, cls in known if name == key), None)
    if class_name is None:
        print('Invalid scale name, defaulting to Standard')
        class_name = 'StandardScaler'

    chosen = getattr(scalers, class_name)()
    return chosen.fit_transform(X, Y)
def statistic_transform(df, **params):
    """
    Box-Cox transform the columns listed in ``DefaultConfig.outlier_columns``.

    Before the transform, missing values in each column are replaced by that
    column's minimum and exact zeros are replaced by ``1e-5``.

    :param df: DataFrame containing the outlier columns; modified in place
    :param params: accepted for interface compatibility, unused
    :return: the same ``df`` with the outlier columns transformed
    """
    from sklearn import preprocessing

    outlier_columns = DefaultConfig.outlier_columns

    for column in outlier_columns:
        # ``x == np.nan`` is never true, so the old element-wise NaN branch
        # was dead code; fill with the column minimum explicitly.
        df[column] = df[column].fillna(df[column].min())
        # Replace exact zeros with a small positive value before Box-Cox.
        df[column] = df[column].mask(df[column] == 0, 1e-5)

    pt = preprocessing.PowerTransformer(method='box-cox', standardize=False)
    # Assign the raw array (positional). Wrapping it in a DataFrame would
    # align on a fresh RangeIndex and scramble rows of a re-indexed ``df``.
    df[outlier_columns] = pt.fit_transform(df[outlier_columns])
    return df
def normal_distribution_transformer(df):
    '''
    Non-linear transformation: fit a Yeo-Johnson PowerTransformer on ``df``
    and return the transformed values as a new DataFrame that reuses the
    original column labels (the input index is not passed on).
    '''
    yeo_johnson_pt = preprocessing.PowerTransformer(method='yeo-johnson')
    transformed_values = yeo_johnson_pt.fit_transform(df)
    return pd.DataFrame(transformed_values, columns=df.columns)
def test_inv_transform_ct_22(self):
    """
    inverse_transform on a ColumnTransformer that applies a scikit-learn
    PowerTransformer to the numeric columns and passes the rest through.
    """
    target = pd.DataFrame(data=[0, 1], columns=['y'])
    fit_frame = pd.DataFrame({'num1': [0, 1],
                              'num2': [0, 2],
                              'other': ['A', 'B']})

    power_step = ('power', skp.PowerTransformer(), ['num1', 'num2'])
    enc = ColumnTransformer(transformers=[power_step],
                            remainder='passthrough')
    enc.fit(fit_frame, target)

    new_frame = pd.DataFrame({'num1': [0, 1, 1],
                              'num2': [0, 2, 3],
                              'other': ['A', 'B', 'C']})
    expected = pd.DataFrame({'power_num1': [0.0, 1.0, 1.0],
                             'power_num2': [0.0, 1.9999999997665876, 3.000000000169985],
                             'other': ['A', 'B', 'C']})

    encoded = pd.DataFrame(enc.transform(new_frame))
    encoded.columns = ['col1', 'col2', 'other']

    recovered = inverse_transform(encoded, enc)
    pd.testing.assert_frame_equal(recovered, expected)
def PowerTransformer(train_df, test_df, HP):
    """
    Power-transform the feature columns of a train/test pair.

    The last column of each frame is treated as the label; every other column
    is a feature. The transformer is fitted on the training features only and
    then applied to both splits.

    :param train_df: training frame, label in the last column
    :param test_df: test frame, label in the last column
    :param HP: hyper-parameter mapping; ``HP['PowerTransformer']`` must provide
        'method', 'standardize' and 'copy'
    :return: (train, test) frames with transformed features under their
        original names plus a trailing ``label`` column
    """
    method = HP['PowerTransformer']['method']
    standardize = HP['PowerTransformer']['standardize']
    copy = HP['PowerTransformer']['copy']

    def _rebuild(feature_frame, transformed, label_frame):
        # Restore the feature names on the raw array, then attach the labels.
        rebuilt = pd.DataFrame(transformed)
        rebuilt.columns = list(feature_frame.columns)
        return rebuilt.assign(label=label_frame.values)

    train_labels = train_df.iloc[:, -1:]
    test_labels = test_df.iloc[:, -1:]
    # Work on copies so the transformer never touches the caller's frames.
    train_features = train_df.iloc[:, :-1].copy()
    test_features = test_df.iloc[:, :-1].copy()

    transformer = preprocessing.PowerTransformer(method=method,
                                                 standardize=standardize,
                                                 copy=copy)
    train_values = transformer.fit_transform(train_features)
    test_values = transformer.transform(test_features)  # TODO check here

    train_df_transformed = _rebuild(train_features, train_values, train_labels)
    test_df_transformed = _rebuild(test_features, test_values, test_labels)
    return train_df_transformed, test_df_transformed
def standardizing(df, methods):
    '''
    Standardize a dataframe with the requested method and return the result
    as a new DataFrame (built from the transformed values only, so the
    original labels are not carried over).

    The methods are:
    - z: z-scores (StandardScaler)
    - mm: min-max (MinMaxScaler)
    - robust: RobustScaler
    - anything else (e.g. gauss): standardized Yeo-Johnson PowerTransformer
    '''
    plain_scalers = (
        ('z', 'StandardScaler'),
        ('mm', 'MinMaxScaler'),
        ('robust', 'RobustScaler'),
    )
    class_name = next((cls for key, cls in plain_scalers if methods == key), None)

    if class_name is not None:
        fitted = getattr(preprocessing, class_name)().fit(df)
        values = fitted.transform(df)
    else:
        power = preprocessing.PowerTransformer(method='yeo-johnson',
                                               standardize=True)
        values = power.fit_transform(df)

    return pd.DataFrame(values)
def calculateCrimeRates(crime_weather_agg, cr_type=False):
    '''
    Calculate crime rates per 100,000 people and, when ``cr_type`` is False,
    also standardize (Box-Cox) and classify them.

    Rows without a ``TotalPop`` value are dropped and the index is reset, so
    the input frame itself is left untouched. Standardization and
    classification cover all crimes together, not individual crime types.

    Returns the new frame with ``crime_rate`` and, for ``cr_type == False``,
    the extra ``crs`` and ``crime_rate_cat`` columns.
    '''
    # Remove blank Census data.
    with_population = crime_weather_agg.dropna(
        axis='index', how='any', subset=['TotalPop'])
    rates = with_population.reset_index(drop=True)

    # Crime rate per 100,000 people.
    per_person = rates['crime_counts'] / rates['TotalPop']
    rates['crime_rate'] = per_person * 100000

    # Equality (not truthiness) on purpose: 0 counts as False, None does not.
    if not (cr_type == False):  # noqa: E712
        return rates

    # Standardize crime rates.
    box_cox = preprocessing.PowerTransformer(method='box-cox',
                                             standardize=False)
    rates['crs'] = box_cox.fit_transform(rates[['crime_rate']])

    # Summary statistics that drive the classification.
    summary = rates[['crs']].describe().transpose()

    # Classify standardized crime rates row by row.
    rates['crime_rate_cat'] = rates[['crs']].apply(
        classifyCrimeRates, args=(summary[['mean', 'std']], ), axis=1)
    return rates
def _box_cox_transform(self, columns):
    """
    Fit a Box-Cox PowerTransformer on ``columns`` of ``self.output_df``,
    overwrite those columns with the transformed values and register the
    fitted transformer under ``self.scalers['box-cox']``.
    """
    box_cox = preprocessing.PowerTransformer('box-cox')
    self.scalers['box-cox'] = box_cox

    transformed = box_cox.fit_transform(self.output_df[columns])
    self.output_df[columns] = transformed
def _yeo_johnson_transform(self, column):
    """
    Fit a Yeo-Johnson PowerTransformer on ``column`` of ``self.output_df``
    and overwrite that selection with the transformed values.
    """
    yeo_johnson = preprocessing.PowerTransformer('yeo-johnson')
    # NOTE(review): the transformer is registered under the key 'robust',
    # not 'yeo-johnson' -- looks like a copy-paste leftover; confirm before
    # any lookup relies on it.
    self.scalers['robust'] = yeo_johnson

    transformed = yeo_johnson.fit_transform(self.output_df[column])
    self.output_df[column] = transformed
def __init__(self, column, dataframe, settings):
    """
    Initialise the base preprocessor, build the PowerTransformer from the
    settings and hand the dataframe to ``pickle_process``.

    Settings read from ``self.settings`` (with fallbacks): 'method'
    ('yeo-johnson'), 'standardize' (True) and 'copy' (True).
    """
    APreprocessor.__init__(self, column, dataframe, settings)

    options = self.settings
    transformer_kwargs = {
        'method': options.get('method', 'yeo-johnson'),
        'standardize': options.get('standardize', True),
        'copy': options.get('copy', True),
    }
    self.scaler = preprocessing.PowerTransformer(**transformer_kwargs)

    self.pickle_process(dataframe)
def _power_transform(self):
    """
    Power-transform every feature in ``self.num_feats`` independently.

    For each feature a default PowerTransformer is fitted on that column of
    ``self.df``; the transformed values are written into ``self.output_df``
    and the fitted transformer is stored in ``self.power_transform_encoder``.

    Returns ``(self.output_df, self.power_transform_encoder)``.
    """
    for feature in self.num_feats:
        # scikit-learn expects a 2-D (n_samples, 1) input for one feature.
        column_2d = self.df[feature].values.reshape(-1, 1)

        encoder = preprocessing.PowerTransformer()
        encoder.fit(column_2d)

        self.output_df.loc[:, feature] = encoder.transform(column_2d)
        self.power_transform_encoder[feature] = encoder

    return self.output_df, self.power_transform_encoder
def scaleData(houseData):
    """
    Return the data in three flavours: ``(original, standardized, power)``.

    * original     -- ``houseData`` exactly as passed in
    * standardized -- output of ``preprocessing.scale`` (outliers are kept)
    * power        -- output of a default ``PowerTransformer``, used to
                      reduce skewness
    """
    standardized = preprocessing.scale(houseData)
    power_transformed = preprocessing.PowerTransformer().fit_transform(houseData)
    return houseData, standardized, power_transformed
def scaler(self, method='yeo-johnson'):
    '''
    Fit the requested scaler on ``self.dataframe`` and replace
    ``self.dataframe`` with the transformed data.

    Parameters
    ----------
    method : string, optional
        Scaling method. One of 'standard', 'minmax', 'maxabs', 'robust',
        'quantile' (normal output distribution), 'l1', 'l2', 'max'
        (row-wise Normalizer with that norm), 'box-cox' or 'yeo-johnson'
        (PowerTransformer). The default is 'yeo-johnson'.

    Returns
    -------
    dataframe :
        The transformed data, exactly as returned by the scaler's
        ``transform`` (also stored back on ``self.dataframe``).
    scaler : object
        The fitted scaler.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    '''
    if method == 'standard':
        fitted = preprocessing.StandardScaler()
    elif method == 'minmax':
        fitted = preprocessing.MinMaxScaler()
    elif method == 'maxabs':
        fitted = preprocessing.MaxAbsScaler()
    elif method == 'robust':
        fitted = preprocessing.RobustScaler()
    elif method == 'quantile':
        fitted = preprocessing.QuantileTransformer(
            output_distribution='normal')
    elif method in ('l1', 'l2', 'max'):
        # ``preprocessing.normalize`` is a function that expects data; the
        # estimator with fit/transform is ``Normalizer(norm=...)``.
        fitted = preprocessing.Normalizer(norm=method)
    elif method in ('box-cox', 'yeo-johnson'):
        # Only build a PowerTransformer when one was asked for. The previous
        # ``or feature_sign == ...`` tests replaced every other choice with
        # ``PowerTransformer('<that method>')``, which is invalid.
        fitted = preprocessing.PowerTransformer(method=method)
    else:
        raise ValueError('Unknown scaling method: {!r}'.format(method))

    fitted.fit(self.dataframe)
    self.dataframe = fitted.transform(self.dataframe)
    return self.dataframe, fitted
def gaussian_scaler(train, test):
    """
    Fit a Yeo-Johnson PowerTransformer on ``train`` and apply it to both
    frames.

    Returns ``(scaler, train_scaled, test_scaled)``; the scaled frames keep
    the columns and index of their respective inputs.
    """
    scaler = skl.PowerTransformer(method='yeo-johnson')
    scaler.fit(train)

    def _scaled_like(frame):
        # Re-wrap the raw array with the labels of the frame it came from.
        return pd.DataFrame(scaler.transform(frame),
                            columns=frame.columns,
                            index=frame.index)

    train_scaled = _scaled_like(train)
    test_scaled = _scaled_like(test)
    return scaler, train_scaled, test_scaled
def scale_input_data(data):
    """
    Power-transform a 1-D series and rescale it to the range [-1, 1].

    The values are treated as a single feature column, passed through a
    default ``PowerTransformer`` and then through
    ``MinMaxScaler(feature_range=(-1, 1))``.

    :param data: sequence of numbers (anything ``np.array`` accepts)
    :return: list of Python floats, one per input value
    """
    column = np.array(data).reshape(-1, 1)

    # Feed the power-transformed values into the min-max step. Previously the
    # transformed array was overwritten by the MinMaxScaler instance and the
    # raw input was scaled, so the power transform had no effect.
    gaussianised = preprocessing.PowerTransformer().fit_transform(column)
    rescaled = preprocessing.MinMaxScaler(
        feature_range=(-1, 1)).fit_transform(gaussianised)

    # ``float()`` on a 1-element array is deprecated in recent NumPy;
    # flatten and convert in one step instead.
    return rescaled.ravel().tolist()
def allScalers():
    """
    Return the mapping of scaler name -> fresh scaler instance to evaluate.

    Only the PowerTransformer is currently enabled. Alternatives that are
    switched off: None, QuantileTransformer (uniform / normal output),
    RobustScaler, MaxAbsScaler, MinMaxScaler, Normalizer, StandardScaler.
    """
    enabled = {}
    enabled["PowerTransformer"] = preprocessing.PowerTransformer()
    return enabled
def DataNormalize(data, method="StandardScaler"):
    """
    Fit the named scikit-learn scaler on ``data`` and return the transformed
    data.

    :param data: 2-D array-like accepted by scikit-learn scalers
    :param method: case-insensitive scaler name: 'StandardScaler',
        'MinMaxScaler', 'MaxAbsScaler', 'RobustScaler' or 'PowerTransformer'
        (Yeo-Johnson, standardised)
    :return: the transformed data
    :raises ValueError: if ``method`` is not one of the supported names
        (previously this surfaced as an UnboundLocalError)
    """
    # Class name plus constructor keywords; classes are resolved lazily so an
    # unknown name is rejected before anything is built.
    recipes = {
        "standardscaler": ("StandardScaler", {}),
        "minmaxscaler": ("MinMaxScaler", {}),
        "maxabsscaler": ("MaxAbsScaler", {}),
        "robustscaler": ("RobustScaler", {}),
        "powertransformer": ("PowerTransformer",
                             {"method": "yeo-johnson", "standardize": True}),
    }

    key = method.lower()
    if key not in recipes:
        raise ValueError(
            "Unknown normalisation method {!r}; expected one of {}".format(
                method, sorted(recipes)))

    class_name, kwargs = recipes[key]
    scaler = getattr(preprocessing, class_name)(**kwargs).fit(data)
    return scaler.transform(data)
def get_scaler(self, scalername=None):
    """
    Return a fresh scaler instance for ``scalername``.

    Falls back to ``self.scaler`` when no name is given. 'NoScaler' and any
    unrecognised name yield ``None``.
    """
    requested = self.scaler if scalername is None else scalername

    scaler_classes = {
        'MinMaxScaler': preprocessing.MinMaxScaler,
        'MaxAbsScaler': preprocessing.MaxAbsScaler,
        'StandardScaler': preprocessing.StandardScaler,
        'RobustScaler': preprocessing.RobustScaler,
        'Normalizer': preprocessing.Normalizer,
        'QuantileTransformer': preprocessing.QuantileTransformer,
        'PowerTransformer': preprocessing.PowerTransformer,
    }
    # 'NoScaler' is deliberately absent: it maps to None like unknown names.
    chosen = scaler_classes.get(requested)
    if chosen is None:
        return None
    return chosen()
def feature_scale(feature_scaler, X_train, y_train):
    """
    Build the (unfitted) preprocessing model selected by ``feature_scaler``.

    More information about these scalers can be found at
    https://scikit-learn.org/stable/modules/preprocessing.html

    :param feature_scaler: one of 'binarizer', 'one_hot_encoder', 'maxabs',
        'minmax', 'normalize', 'poly', 'power_transformer',
        'quantile_transformer_normal', 'robust', 'standard_scaler'
    :param X_train: unused; kept for interface compatibility
    :param y_train: unused; kept for interface compatibility
    :return: a new, unfitted model instance
    :raises ValueError: if ``feature_scaler`` is not a supported name
        (previously this surfaced as an UnboundLocalError)
    """
    # Factories are lazy so only the requested model is constructed.
    factories = {
        'binarizer': lambda: preprocessing.Binarizer(),
        # Intended for the y values, e.g.
        # >>> enc.transform([['female', 'from US', 'uses Safari'],
        # ...                ['male', 'from Europe', 'uses Safari']]).toarray()
        # array([[1., 0., 0., 1., 0., 1.], [0., 1., 1., 0., 0., 1.]])
        'one_hot_encoder': lambda: preprocessing.OneHotEncoder(
            handle_unknown='ignore'),
        'maxabs': lambda: preprocessing.MaxAbsScaler(),
        'minmax': lambda: preprocessing.MinMaxScaler(),
        # L2 normalization
        'normalize': lambda: preprocessing.Normalizer(),
        'poly': lambda: PolynomialFeatures(2),
        'power_transformer': lambda: preprocessing.PowerTransformer(
            method='yeo-johnson'),
        'quantile_transformer_normal': lambda: preprocessing.QuantileTransformer(
            output_distribution='normal'),
        'robust': lambda: preprocessing.RobustScaler(quantile_range=(25, 75)),
        'standard_scaler': lambda: preprocessing.StandardScaler(),
    }

    try:
        build = factories[feature_scaler]
    except KeyError:
        raise ValueError(
            'Unknown feature scaler {!r}; expected one of {}'.format(
                feature_scaler, sorted(factories))) from None
    return build()
def impute_method_setup(
        self,
        random_state=DEFAULT_IMPUTER_RANDOM_STATE,
        add_indicator=DEFAULT_IMPUTER_ADD_INDICATOR,
        initial_strategy=DEFAULT_IMPUTER_INITIAL_STRATEGY,
        max_iter=DEFAULT_IMPUTER_MAX_ITER,
        estimator=DEFAULT_IMPUTER_ESTIMATOR,
        output_distribution=DEFAULT_TRANSFORMER_OUTPUT_DISTRIBUTION,
        transformer_method=DEFAULT_TRANSFORMER_METHOD,
        transformer_standardize=DEFAULT_TRANSFORMER_STANDARDIZE):
    """
    Create the IterativeImputer, QuantileTransformer and PowerTransformer
    used when missing data is to be imputed, and store them on ``self`` as
    ``imputer``, ``transformer_quantile`` and ``transformer_power``.

    Every argument is forwarded unchanged to the sklearn class named next to
    it; refer to the sklearn documentation for their exact meaning.

    IterativeImputer:
    https://scikit-learn.org/stable/modules/generated/sklearn.impute.IterativeImputer.html
    QuantileTransformer:
    https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.QuantileTransformer.html
    PowerTransformer:
    https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PowerTransformer.html

    Args:
        random_state: (int) (IterativeImputer & QuantileTransformer) seed for
            the pseudo random number generator
        add_indicator: (boolean) (IterativeImputer) if True adds a
            `MissingIndicator` transform to the stack
        initial_strategy: (str) (IterativeImputer) strategy used to
            initialise missing values
        max_iter: (int) (IterativeImputer) maximum number of imputation
            rounds to perform
        estimator: (IterativeImputer) estimator to be used
        output_distribution: (str) (QuantileTransformer) marginal
            distribution for the transformed data
        transformer_method: (str) (PowerTransformer) method to use
        transformer_standardize: (boolean) (PowerTransformer) whether
            zero-mean, unit-variance normalisation is applied

    Returns:
        None
    """
    # Imputer options; verbosity follows the instance setting.
    imputer_options = dict(
        random_state=random_state,
        add_indicator=add_indicator,
        initial_strategy=initial_strategy,
        max_iter=max_iter,
        verbose=self.verbose,
        estimator=estimator,
    )
    self.imputer = IterativeImputer(**imputer_options)

    # Quantile transform options (shares the imputer's random seed).
    self.transformer_quantile = preprocessing.QuantileTransformer(
        output_distribution=output_distribution,
        random_state=random_state)

    # Power transform options.
    self.transformer_power = preprocessing.PowerTransformer(
        method=transformer_method,
        standardize=transformer_standardize)
def train_scaler(data, preprocessing_type, n_channels, seq_length, toscale='channels'):
    """Fit a scaler for the requested preprocessing mode.

    :param data: the dataset as array
    :type data: ndarray
    :param preprocessing_type: scaling mode: 'standard' (RobustScaler),
        'gaussian' (PowerTransformer), 'zero2one' (MinMaxScaler),
        'neg_one2one' or 'fourier_samples' (MinMaxScaler with range (-1, 1))
    :type preprocessing_type: str
    :param n_channels: the number of channels
    :type n_channels: int
    :param seq_length: the length of a sequence of one single sensor
    :type seq_length: int
    :param toscale: 'channels' concatenates the samples via
        ``prep.concatenate_samples`` and transposes the result before
        fitting; any other value (e.g. 'samples') fits on ``data`` as given
    :type toscale: str
    :returns: the fitted scaler, or None for an unknown preprocessing_type
    """
    if toscale == 'channels':
        data = prep.concatenate_samples(data, n_channels, seq_length)
        data = data.transpose()

    if preprocessing_type == 'standard':
        # Centre on the median, scale component-wise by the interquartile range.
        return skpreprocessing.RobustScaler().fit(data)
    if preprocessing_type == 'gaussian':
        return skpreprocessing.PowerTransformer().fit(data)
    if preprocessing_type == 'zero2one':
        # Scale to range [0, 1].
        return skpreprocessing.MinMaxScaler().fit(data)
    if preprocessing_type in ('neg_one2one', 'fourier_samples'):
        # Scale to range [-1, 1].
        return skpreprocessing.MinMaxScaler(feature_range=(-1, 1)).fit(data)
    return None
def feature_transform(df, **params):
    """
    Feature distribution transform: Box-Cox transform (no standardisation)
    the columns listed in ``DefaultConfig.outlier_columns``.

    :param df: DataFrame containing the outlier columns; modified in place
    :param params: accepted for interface compatibility, unused
    :return: the same ``df`` with the outlier columns transformed
    """
    from sklearn import preprocessing

    columns = DefaultConfig.outlier_columns
    pt = preprocessing.PowerTransformer(method='box-cox', standardize=False)
    # Assign the raw array (positional). Wrapping it in a DataFrame would
    # align on a fresh RangeIndex and produce NaN/misplaced rows whenever
    # ``df`` does not carry the default 0..n-1 index.
    df[columns] = pt.fit_transform(df[columns])
    return df
def add_power_transform_scaling(self, data: object, gaussian_like_scale_columns: List[str] = None, gaussian_like_method: str = "yeo-johnson", standardize=True):
    """
    Queue a feature-wise power transform that makes data more Gaussian-like.

    Nothing is fitted here: the configured transformer is appended, together
    with its target columns, to ``self._normalization_steps``.
    https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PowerTransformer.html

    :param data: Input data; not used by this method.
    :param gaussian_like_scale_columns: Columns to which we apply the scaling.
    :param gaussian_like_method: yeo-johnson or box-cox
    :param standardize: Set to True to apply zero-mean, unit-variance normalization to the output.
    :return: None
    """
    power_transformer = preprocessing.PowerTransformer(
        method=gaussian_like_method,
        standardize=standardize,
    )
    step = (power_transformer, gaussian_like_scale_columns)
    self._normalization_steps.append(step)
def map_to_Gaussian(df, methodType):
    '''
    Map a non-Gaussian distribution towards a Gaussian one.

    Input:
        df: DataFrame
        methodType: 'yeo-johnson' or 'box-cox'
    Output:
        The mapped data, exactly as returned by
        ``PowerTransformer.fit_transform`` (no standardisation applied)
    '''
    mapper = preprocessing.PowerTransformer(method=methodType,
                                            standardize=False)
    return mapper.fit_transform(df)
def scaleFeatures(data, opt='standard', **kwargs):
    """
    Scale every column of ``data`` with the selected method.

    :param data: DataFrame of features
    :param opt: 'standard', 'robust', 'minmax', 'norm', 'gaussian'
        (Yeo-Johnson PowerTransformer) or 'quantile' (QuantileTransformer
        with normal output)
    :param kwargs: forwarded to the scaler constructor for 'standard',
        'robust', 'minmax' and 'norm'; ignored for 'gaussian' and 'quantile'
    :return: new DataFrame with the scaled values and the original column
        names
    :raises ValueError: if ``opt`` is not a supported option (previously this
        surfaced as an UnboundLocalError)
    """
    supported = ('standard', 'robust', 'minmax', 'norm', 'gaussian', 'quantile')
    # Validate first so a typo fails fast with a clear message.
    if opt not in supported:
        raise ValueError(
            'Unknown scaling option {!r}; expected one of {}'.format(
                opt, supported))

    from sklearn import preprocessing

    if opt == 'standard':
        scl = preprocessing.StandardScaler(**kwargs)
    elif opt == 'robust':
        scl = preprocessing.RobustScaler(**kwargs)
    elif opt == 'minmax':
        scl = preprocessing.MinMaxScaler(**kwargs)
    elif opt == 'norm':
        scl = preprocessing.Normalizer(**kwargs)
    elif opt == 'gaussian':
        # NOTE(review): the original author marked this option as
        # "doesn't work! no idea why" -- cause not established here.
        scl = preprocessing.PowerTransformer(method='yeo-johnson')
    else:  # 'quantile'
        scl = preprocessing.QuantileTransformer(output_distribution='normal')

    out = pd.DataFrame(scl.fit_transform(data), columns=data.columns)
    print("Features scaled using", opt, "scaling method!")
    return out
def normalize_numeric_columns(training_data, test_data):
    """
    Normalise every feature column, choosing the method per column.

    Columns other than ``target_column`` / ``id_column`` are split into
    three groups using the training data only:

    * normality-test p-value above 0.05      -> StandardScaler
    * otherwise, all training values > 0     -> Box-Cox PowerTransformer
                                                (standardised)
    * everything else                        -> MinMaxScaler to (0, 1)

    Each transformer is fitted on the training columns and then applied to
    both frames, which are modified in place and returned as
    ``(training_data, test_data)``.
    """
    gaussian_cols, positive_cols, remaining_cols = [], [], []

    # Everything should be numeric at this point, so loop over all columns.
    for col in training_data.columns:
        if col == target_column or col == id_column:
            continue
        _, p_value = stats.normaltest(training_data[col])
        if p_value > .05:
            gaussian_cols.append(col)
        elif (training_data[col] > 0).all():
            positive_cols.append(col)
        else:
            remaining_cols.append(col)

    def _fit_on_train_apply_to_both(make_transformer, cols):
        # Empty groups are skipped entirely (nothing is even constructed).
        if not cols:
            return
        fitted = make_transformer().fit(training_data[cols])
        training_data[cols] = fitted.transform(training_data[cols])
        test_data[cols] = fitted.transform(test_data[cols])

    _fit_on_train_apply_to_both(
        lambda: preprocessing.StandardScaler(), gaussian_cols)
    _fit_on_train_apply_to_both(
        lambda: preprocessing.PowerTransformer(method='box-cox',
                                               standardize=True),
        positive_cols)
    _fit_on_train_apply_to_both(
        lambda: preprocessing.MinMaxScaler(feature_range=(0, 1)),
        remaining_cols)

    return training_data, test_data
def yeo_johnson(df):
    """
    Wrapper for sklearn's preprocessing.PowerTransformer (Yeo-Johnson
    option), which can handle negative values. No standardisation is applied.

    Parameters
    ----------
    df : DataFrame

    Returns
    -------
    DataFrame
        Yeo-Johnson transformed values, built from the transformed array
        alone (the input's labels are not passed on)
    """
    assert isinstance(df, pd.DataFrame)

    transformer = preprocessing.PowerTransformer(method='yeo-johnson',
                                                 standardize=False)
    transformed = transformer.fit_transform(df)
    return pd.DataFrame(transformed)