def normalise_data(data_list):
    # Box-Cox requires strictly positive input values.
    X = PowerTransformer(method='box-cox').fit_transform(data_list)
    # Re-weight the three features.
    X[:, 0] = X[:, 0] * 2.0
    X[:, 1] = X[:, 1] * 2.0
    X[:, 2] = X[:, 2] * 1.6
    symmetry = X[:, 0]
    border = X[:, 1]
    colour = X[:, 2]
    # Convert the numpy columns to plain Python lists, one per feature.
    list_symmetry = symmetry.tolist()
    list_border = border.tolist()
    list_colour = colour.tolist()
    return list_symmetry, list_border, list_colour
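# Usage sketch for normalise_data() above. The data and names here are
# illustrative assumptions, not from the source: a strictly positive
# three-column array, since Box-Cox rejects zeros and negatives.
import numpy as np
from sklearn.preprocessing import PowerTransformer

demo = np.random.default_rng(0).lognormal(size=(50, 3))  # always > 0
sym, border, colour = normalise_data(demo)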
def get_preprocessed_data(pre_process_choice, data_to_change):
    if pre_process_choice == "none":
        process_data = data_to_change
    elif pre_process_choice == "normalise":
        process_data = normalize(data_to_change)
    elif pre_process_choice == "standardscaler":
        process_data = StandardScaler().fit_transform(data_to_change)
    elif pre_process_choice == "minmaxscaler":
        process_data = MinMaxScaler().fit_transform(data_to_change)
    elif pre_process_choice == "powertransformer":
        process_data = PowerTransformer(
            method='yeo-johnson').fit_transform(data_to_change)
    elif pre_process_choice == "quantiletransformer":
        process_data = QuantileTransformer(
            output_distribution='uniform').fit_transform(data_to_change)
    elif pre_process_choice == "x_pca_reduced":  # This part is only relevant to Q4 and Q5
        X = MinMaxScaler().fit_transform(
            data_to_change)  # Please see Q4 below for explanation
        process_data = PCA(n_components=121).fit_transform(X)
    else:
        # Fail loudly instead of returning an undefined name.
        raise ValueError(f"Unknown pre-processing choice: {pre_process_choice}")
    return process_data
def make_forecasts_ensure_positive(
    train: TimeSeries,
    n_pred: int,
    predictions_to_make: ModelsToMake,
) -> Dict[str, TimeSeries]:
    """Wrap transformations around make_forecasts() to ensure positive forecasts.

    Avoids negative values in forecasts by adding 1 to all values (ensuring no
    zeros) and applying a Box-Cox transformation/inversion before/after
    forecasting.
    """
    scaler_wrapper = ScalerWrapper(  # type: ignore
        scaler=PowerTransformer(method="box-cox"),
    )
    # box-cox doesn't accept zeros (or negatives), so add one to everything
    scaled_train = scaler_wrapper.fit_transform(train + 1)
    forecasts = make_forecasts(scaled_train, n_pred, predictions_to_make)
    # invert the transformation
    return {
        name: scaler_wrapper.inverse_transform(forecast) - 1
        for (name, forecast) in forecasts.items()
    }
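# A minimal sketch of the shift trick above using sklearn directly (synthetic
# data; the TimeSeries/ScalerWrapper machinery is not needed to see the
# effect): Box-Cox raises on non-positive input, so shift by +1 before
# fitting and shift back after inverting.
import numpy as np
from sklearn.preprocessing import PowerTransformer

counts = np.array([[0.0], [1.0], [4.0], [9.0]])  # contains a zero
pt = PowerTransformer(method="box-cox")
# pt.fit_transform(counts) would raise ValueError (strictly positive data required)
scaled = pt.fit_transform(counts + 1)
restored = pt.inverse_transform(scaled) - 1  # round-trips to the original values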
def load_scaler(use_scaler=None):
    """Return the scaler named by `use_scaler`, falling back to the config."""
    if use_scaler:
        method = use_scaler
    else:
        method = config["adu"]["feature_selection"]["scaling_method"]
    if method == "Robust":
        scaler = RobustScaler()
    elif method == "Power":
        scaler = PowerTransformer()
    elif method == "MinMax":
        scaler = MinMaxScaler()
    elif method == "Standard":
        scaler = StandardScaler()
    elif method == "QuantileUniform":
        scaler = QuantileTransformer(output_distribution="uniform")
    elif method == "QuantileGaussian":
        scaler = QuantileTransformer(output_distribution="normal")
    else:
        # Raise instead of a silent exit(1) so the caller sees what went wrong.
        raise ValueError(f"Unknown scaling method: {method}")
    return scaler
def __init__(
    self,
    col_name: str,
    code_len: int,
    start_id: int,
    fillall: bool = True,
    base: int = 100,
    hasnan: bool = True,
    transform: str = 'quantile',
):
    super().__init__(col_name, code_len, start_id, fillall, base, hasnan)
    if transform == 'yeo-johnson':
        self.scaler = PowerTransformer(standardize=True)
    elif transform == 'quantile':
        self.scaler = QuantileTransformer(output_distribution='uniform')
    elif transform == 'robust':
        self.scaler = RobustScaler()
    else:
        raise ValueError(
            'Supported data transformations are "yeo-johnson", "quantile", and "robust"'
        )
def doPCA(data, featureColumns, targetColumn, applyTransfer=False,
          outDF=False, analyzePCA=False, standardize=False):
    dataSet = DataSet()
    # Remove target column
    df = data.iloc[:, :12]
    dataSet.target = data[targetColumn].to_numpy()
    dataSet.data = df
    if applyTransfer:
        df = PowerTransformer(standardize=False).fit_transform(df)
    if standardize:
        df = StandardScaler().fit_transform(df)
        # Caution: standardized data is zero-mean, so values <= -1 make
        # np.log(df + 1) produce NaNs; this assumes values stay above -1.
        df = np.log(df + 1)
    pca = PCA(n_components=0.95, svd_solver='full')
    pca.fit(df)
    principalComponents = pca.transform(df)
    if analyzePCA:
        # Note: this refits PCA on the raw data and overwrites the reduced model.
        pca = PCA().fit(dataSet.data)
        print(np.cumsum(pca.explained_variance_ratio_))
        plt.plot(np.cumsum(pca.explained_variance_ratio_))
        plt.xlabel('number of components')
        plt.ylabel('cumulative explained variance')
        plt.show()
    if outDF:
        principalDf = pd.DataFrame(data=principalComponents)
        dataSet.components = principalDf
    else:
        dataSet.components = principalComponents
    # print(dataSet.data.shape)
    # print(len(dataSet.target))
    return dataSet, pca
def transform_data(df, ev, tsne=False):
    '''
    Apply PowerTransformer(), PCA(), and optionally TSNE() sequentially on a dataframe

    Args:
        df (pd.DataFrame): subject dataframe
        ev (int, float, None, str): explained variance; corresponds to the
            `n_components` parameter of the PCA() class and hence inherits its arguments
        tsne (bool) [default=False]: when True, apply TSNE() on the dataframe

    Return:
        X (array): transformed dataframe
    '''
    X = PCA(ev, random_state=42).fit_transform(
        PowerTransformer().fit_transform(df))
    if tsne:
        perplexity = int(X.shape[0]**0.5)
        X = TSNE(perplexity=perplexity, random_state=42).fit_transform(X)
    return X
def ensure_normality(self, features_of_type='numerical', return_series=False):
    """
    Ensures that the numerical features in the dataset (or any other subset
    named by `features_of_type`) fit a normal distribution by applying the
    Yeo-Johnson transform.

    :param features_of_type: Subset selection primitive
    :param return_series: Return the normalized series
    :return: the subset fitted to a normal distribution.
    """
    assert features_of_type in self.meta_tags
    subset = self.select(features_of_type)
    mapper = DataFrameMapper([(subset.columns,
                               PowerTransformer(method='yeo-johnson',
                                                standardize=False))])
    normed_features = mapper.fit_transform(subset.copy())
    self.features[self.names(features_of_type)] = pd.DataFrame(
        normed_features, index=subset.index, columns=subset.columns)
    self.metainfo()
    if return_series is True:
        return self.features[self.names(features_of_type)]
def PreProcessing(self, scale_cols, one_hot_cols):  # , ordinal_cols):
    """Establishes preprocessing and a pipeline for modeling

    Parameters
    ----------
    scale_cols : list
        A list of numerical columns to be scaled
    one_hot_cols : list
        A list of categorical columns that will be one hot encoded
    """
    # set transformer methods
    # (note: sparse= was renamed sparse_output= and SimpleImputer's verbose=
    # was removed in recent scikit-learn releases)
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=True)
    label_encode = LabelEncoder()  # note: defined but not used below
    imputer = SimpleImputer(add_indicator=True, verbose=1)
    scaler = PowerTransformer()
    # Make Transformer
    self.preprocessing = make_column_transformer(
        (ohe, one_hot_cols),
        (make_pipeline(imputer, scaler), scale_cols),
        remainder='drop')
def normalize_data(data, scaler):
    """
    Data normalization using a chosen scaler

    :param data: input data frame
    :param scaler: string naming the chosen scaler
    :return: tuple of (scaled data, fitted scaler)
    """
    # Switch between different scalers
    # Helpful source: https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html
    switcher = {
        "max_abs": MaxAbsScaler(),
        "standard": StandardScaler(),
        "min_max": MinMaxScaler(),
        "robust_scaler": RobustScaler(),
        "quantile_transformer": QuantileTransformer(output_distribution='normal'),
        "power_transform": PowerTransformer(method='yeo-johnson')
    }
    # Unknown names fall back to StandardScaler
    scaler = switcher.get(scaler, StandardScaler())
    return scaler.fit_transform(data), scaler
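# Usage sketch for normalize_data() above (synthetic data; assumes the sklearn
# scaler imports are in scope): unknown names fall back to StandardScaler via
# switcher.get(), and the fitted scaler is returned for reuse on new data.
import numpy as np

X = np.random.default_rng(0).lognormal(size=(100, 3))
X_scaled, fitted_scaler = normalize_data(X, "power_transform")
X_again = fitted_scaler.transform(X)  # reuse without refitting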
def normalize_df(df, numeric_data):
    from sklearn.preprocessing import (StandardScaler, MinMaxScaler,
                                       RobustScaler, QuantileTransformer,
                                       PowerTransformer)
    norm_method = st.sidebar.selectbox(
        'Choose normalization method',
        ('None', 'StandardScaler', 'MinMaxScaler', 'RobustScaler',
         'QuantileTransformer', 'PowerTransformer'))
    if norm_method == 'None':
        return df[numeric_data]
    else:
        if norm_method == 'StandardScaler':
            scaler = StandardScaler()
        elif norm_method == 'MinMaxScaler':
            scaler = MinMaxScaler()
        elif norm_method == 'RobustScaler':
            scaler = RobustScaler()
        elif norm_method == 'QuantileTransformer':
            scaler = QuantileTransformer()
        elif norm_method == 'PowerTransformer':
            scaler = PowerTransformer()
        df_scaled = scaler.fit_transform(df[numeric_data])
        st.success('Done!')
        return df_scaled
def __init__(self, *, method='yeo-johnson', standardize=False, lmd=None,
             tolerance=(-np.inf, np.inf), on_err=None):
    """
    Parameters
    ----------
    method: 'yeo-johnson' or 'box-cox'
        'yeo-johnson' works with positive and negative values;
        'box-cox' only works with strictly positive values.
    standardize: boolean
        Whether to normalize to standard normal. Using a separate
        `standard` function instead of this option is recommended.
    lmd: list or 1-dim ndarray
        You may assign each input xs a specific lmd yourself. Leave
        None (default) to use an inferred value. See `PowerTransformer`_
        for details.
    tolerance: tuple
        Tolerance of lmd. Set None to accept any value. Default is
        **(-np.inf, np.inf)**, but **(-2, 2)** is recommended for the
        Box-Cox transform.
    on_err: None or str
        Error handling when trying to infer lambda. Can be None or the
        strings **log**, **nan**, or **raise**. **log** returns the
        logarithmic transform of xs after a minimum shift to 1. **nan**
        returns an ``ndarray`` with shape xs.shape filled with ``np.nan``.
        **raise** raises a FloatingPointError, which you can catch
        yourself. The default (None) returns the input series without a
        scale transform.

    .. _PowerTransformer:
        https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PowerTransformer.html#sklearn.preprocessing.PowerTransformer
    """
    self._tolerance = tolerance
    self._pt = PT(method=method, standardize=standardize)
    self._lmd = lmd
    self._shape = None
    self._on_err = on_err
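# For reference, the lambda that `tolerance` above guards can be read from a
# fitted sklearn PowerTransformer via its `lambdas_` attribute (one value per
# feature); a minimal sketch with synthetic data:
import numpy as np
from sklearn.preprocessing import PowerTransformer

X = np.random.default_rng(0).lognormal(size=(200, 1))
pt = PowerTransformer(method='box-cox', standardize=False).fit(X)
print(pt.lambdas_)  # a value near 0 indicates a log-like transform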
def infer(self):
    train_pred = self.model.predict(self.X_train)
    val_pred = self.model.predict(self.X_val)
    test_pred = self.model.predict(self.X_test)
    print("-----------------------------------------------------------------")
    print("Training results", "\n")
    if self.transform is not None:
        scaler = PowerTransformer(method="box-cox")
        scaler.fit(np.array(self.train.actual_load).reshape(-1, 1))
        inv_train_pred = scaler.inverse_transform(
            np.array(train_pred).reshape(-1, 1))
        inv_val_pred = scaler.inverse_transform(
            np.array(val_pred).reshape(-1, 1))
        inv_test_pred = scaler.inverse_transform(
            np.array(test_pred).reshape(-1, 1))
        print(
            "Training error: ",
            mse(self.train.actual_load, inv_train_pred, squared=False),
        )
        print(
            "Validation error: ",
            mse(self.val.actual_load, inv_val_pred, squared=False),
        )
        print("Test error: ", mse(self.y_test, inv_test_pred, squared=False))
        print(
            "Note: the error printed above is calculated after the inverse Box-Cox transform"
        )
    else:
        print("Training error: ",
              mse(self.y_train, train_pred, squared=False))
        print("Validation error: ", mse(self.y_val, val_pred, squared=False))
        print("Test error: ", mse(self.y_test, test_pred, squared=False))
def update(self, event, instance):
    """Observer method that adds a row for every iteration to the DF."""
    if event == Events.OPTIMIZATION_STEP:
        result = instance.res[-1]
        row = {
            "mean_test_rmse": result["target"],
            "param_regression__regressor__max_depth":
                int(result["params"]["max_depth"]),
            "param_regression__regressor__n_estimators":
                int(result["params"]["n_estimators"]),
            "param_regression__transformer":
                "None" if result["params"]["transformer"] < 0.5
                else PowerTransformer(),
        }
        # Note: DataFrame.append was removed in pandas 2.0; pd.concat is the
        # modern replacement.
        self.data = self.data.append(row, ignore_index=True)
    if event == Events.OPTIMIZATION_END:
        self.data["rank_test_rmse"] = self.data.sort_values(
            "mean_test_rmse", ascending=False)[["mean_test_rmse"]].apply(
                lambda x: pd.Series(np.arange(len(x)) + 1, x.index))
        self._update_tracker(event, instance)
def power_normalization(
    data: np.ndarray,
    return_scaler: bool = False,
    scaler: Optional[object] = None
) -> Union[np.ndarray, Tuple[np.ndarray, object]]:
    """Normalizes provided data via power normalization.
    More: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PowerTransformer.html
    Uses a sklearn.preprocessing.PowerTransformer() object.

    :param data: np.ndarray
        data in 2d-format (n_samples, n_features) or 1d-format (n_samples,)
    :param return_scaler: bool
        Whether the function should return the fitted scaler.
    :param scaler: object (sklearn.preprocessing.PowerTransformer)
        If not None, the provided scaler will be used as the normalizer.
    :return: np.ndarray or (np.ndarray, scaler)
        If return_scaler == False, returns the normalized data;
        otherwise returns the normalized data and the fitted scaler.
    """
    # check that data is in an appropriate format
    if len(data.shape) > 2:
        raise AttributeError(
            'The supplied data should be 1- or 2-dimensional. Got %i.' %
            (len(data.shape)))
    # if data is 1-dimensional, convert it to 2-dimensional by adding a dimension
    if len(data.shape) == 1:
        data = data[..., np.newaxis]
    # if no scaler is supplied, create one and fit it
    if scaler is None:
        scaler = PowerTransformer()
        scaler.fit(data)
    # transform data
    data = scaler.transform(data)
    # return the scaler if needed
    if return_scaler:
        return data, scaler
    return data
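# Usage sketch for power_normalization() above: fit on training data, then
# normalize held-out data with the returned scaler (synthetic arrays for
# illustration).
import numpy as np

train = np.random.default_rng(0).exponential(size=(100, 4))
test = np.random.default_rng(1).exponential(size=(20, 4))
train_norm, fitted = power_normalization(train, return_scaler=True)
test_norm = power_normalization(test, scaler=fitted)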
def Standard_(self, dfa, scale='S'):
    Scalers = {
        'S': StandardScaler(),
        'R': RobustScaler(quantile_range=tuple(self.arg.QuantileRange)),
        'M': MinMaxScaler(),
        'MA': MaxAbsScaler(),
        'OE': OrdinalEncoder(),
        'OH': OneHotEncoder(),
        'NL': Normalizer(),
        'QT': QuantileTransformer(),
        'PT': PowerTransformer(),
        'N': FunctionTransformer(validate=False),
    }
    Sca_map = [Scalers[i] for i in scale]
    Xa = list(dfa.columns)
    mapper = DataFrameMapper([(Xa, Sca_map)])
    clfit = mapper.fit(dfa)
    self.log.CIF('Standardization Processing'.center(45, '-'))
    self.log.NIF('Scale parameters:\n%s' % clfit)
    self.log.CIF(45 * '-')
    return clfit
def transform_data(X, do_diff=True, do_power_transform=True):

    def diff(X, y=None):
        return np.diff(X, axis=0)

    def diff2(X, y=None):
        return diff(diff(X))

    first_diff = ('first_difference',
                  FunctionTransformer(func=diff, validate=True))
    power_transform = ('power_transform',
                       PowerTransformer(method='yeo-johnson',
                                        standardize=True))
    pipeline = []
    if do_diff:
        pipeline.append(first_diff)
    if do_power_transform:
        pipeline.append(power_transform)
    if len(pipeline) == 0:
        Z = X
    else:
        pipeline = Pipeline(pipeline)
        Z = pipeline.fit_transform(X)
    return Z
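# Usage sketch for transform_data() above (synthetic random walk): each
# differencing step drops one row, so a (100, 3) input comes back as (99, 3).
import numpy as np

X = np.cumsum(np.random.default_rng(0).normal(size=(100, 3)), axis=0)
Z = transform_data(X, do_diff=True, do_power_transform=True)
print(Z.shape)  # (99, 3)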
def power_transform_feature(data, col):
    df = data.orderBy("Date").toPandas()
    df['pwr_tf'] = df[col]
    # Scale into [1, 2] first so the input is strictly positive, as Box-Cox requires.
    sc = MinMaxScaler(feature_range=(1, 2))
    pt = PowerTransformer(method='box-cox')
    pipeline = Pipeline(steps=[('s', sc), ('p', pt)])
    df[['pwr_tf']] = pipeline.fit_transform(df[['pwr_tf']])
    fig, axs = plt.subplots(2, 2, figsize=(10, 7))
    fig.suptitle(col)
    axs[0, 0].plot(df["Date"], df[col])
    axs[0, 0].set(xlabel='Date', ylabel='Num Cases')
    axs[0, 0].set_title('Daily Cases')
    axs[0, 1].hist(df[col])
    axs[0, 1].set(xlabel='Num Cases', ylabel='Freq')
    axs[0, 1].set_title('Histogram Daily Cases')
    axs[1, 0].plot(df["Date"], df['pwr_tf'])
    axs[1, 0].set(xlabel='Date', ylabel='Num Cases')
    axs[1, 0].set_title('After Power Transform')
    axs[1, 1].hist(df['pwr_tf'])
    axs[1, 1].set(xlabel='Num Cases', ylabel='Freq')
    axs[1, 1].set_title('Histogram After PT')
    for ax in axs.flat:
        ax.set(xlabel='Date', ylabel='Num Cases')
    # Hide x labels and tick labels for top plots and y ticks for right plots.
    for ax in axs.flat:
        ax.label_outer()
    plt.tight_layout()
def transform_amplitude(inputfile, scale=True):
    # np.float was removed from NumPy; the builtin float (i.e. float64) replaces it.
    amplitudes = np.fromfile(inputfile, dtype=float)
    n_samples = amplitudes.shape[0]
    amplitudes = amplitudes.reshape((n_samples, -1))
    # Note: Box-Cox will raise if any amplitude is zero or negative.
    bc = PowerTransformer(method='box-cox')
    yj = PowerTransformer(method='yeo-johnson')
    qt = QuantileTransformer(n_quantiles=n_samples,
                             output_distribution='normal')
    min_max_scaler = MinMaxScaler()
    bc_amplitudes = bc.fit_transform(amplitudes)
    yj_amplitudes = yj.fit_transform(amplitudes)
    qt_amplitudes = qt.fit_transform(amplitudes)
    if scale:
        bc_amplitudes = min_max_scaler.fit_transform(bc_amplitudes)
        yj_amplitudes = min_max_scaler.fit_transform(yj_amplitudes)
        qt_amplitudes = min_max_scaler.fit_transform(qt_amplitudes)
    return amplitudes, bc_amplitudes, yj_amplitudes, qt_amplitudes
def scaling_data(df, sismic_data_labeled, geo_data_labeled):
    # scaler = StandardScaler()
    scaler = PowerTransformer()
    sismic_data_scaled = scaler.fit_transform(sismic_data_labeled)
    scaler = PowerTransformer()
    # scaler = StandardScaler()
    geo_data_scaled = scaler.fit_transform(geo_data_labeled)
    data_scaled = np.concatenate((sismic_data_scaled, geo_data_scaled),
                                 axis=-1)
    # Determine index for Arenito samples and assign weights
    weight = np.mean(data_scaled.max(axis=0) - data_scaled.min(axis=0)) / 2
    isarenito = []
    for i in range(len(df['Geology'])):
        if 'A' in str(df['Geology'].iloc[i]):
            isarenito.append(weight)
        else:
            isarenito.append(-weight)
    isarenito = np.array(isarenito)
    isarenito = np.expand_dims(isarenito, axis=-1)
    data2evaluate = np.concatenate((data_scaled, isarenito), axis=-1)
    df_scaled = pd.DataFrame(data2evaluate,
                             columns=['DT', 'GR', 'NPHI', 'RHOB',
                                      'Permeabilidade', 'Porosidade',
                                      'RQI', 'FZI', 'isArenito'])
    return df_scaled
from sklearn.preprocessing import (StandardScaler, Normalizer,
                                   QuantileTransformer, PowerTransformer,
                                   OneHotEncoder, FunctionTransformer)
from collections import Counter
import numpy as np
import streamlit as st

transformer = {
    'Standard Scaler': StandardScaler(),
    'Normalizer (Unit Norm)': Normalizer(),
    'Quantile-Transformer': QuantileTransformer(output_distribution='normal'),
    'Box-Cox': PowerTransformer(method='box-cox'),
    'Yeo-Johnson': PowerTransformer(),
    'Custom': 'PlaceHolder'  # add a new transformer of choice
}


def tform(df, key, func):
    if func:
        return FunctionTransformer(func).fit_transform(df)
    else:
        return transformer[key].fit_transform(df)


def topk_encoder(X, k=10):
    counts = Counter(X)
    most_common = dict(counts.most_common(k))
    topk = list(most_common.keys())
    return OneHotEncoder(categories=[topk], handle_unknown='ignore')
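# Usage sketch for tform() above (synthetic frame, illustrative names): a
# truthy `func` selects the FunctionTransformer branch; otherwise the named
# entry of `transformer` is applied.
import pandas as pd

demo_df = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': [10.0, 20.0, 30.0]})
yj_out = tform(demo_df, 'Yeo-Johnson', func=None)  # named transformer
log_out = tform(demo_df, 'Custom', func=np.log1p)  # custom function path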
plt.ylim([0, 0.05])
plt.xlabel(r'Original NO$_2$ data ($\mu g/m^3$)')
plt.ylabel(r'Normalized frequency')
plt.tight_layout()
plt.savefig("data_no2_hist.pdf", format='pdf')

plt.figure(figsize=(5, 4))
# `normed` was removed from matplotlib; `density` is the current keyword.
plt.hist(pm10[0], bins=50, density=True, color='gray')
plt.xlim([0, 300])
plt.ylim([0, 0.04])
plt.xlabel(r'Original PM$_{10}$ data ($\mu g/m^3$)')
plt.ylabel(r'Normalized frequency')
plt.tight_layout()
plt.savefig("data_pm10_hist.pdf", format='pdf')

pt_no2 = PowerTransformer()
pt_no2.fit(no2[0])
plt.figure(figsize=(5, 4))
plt.hist(pt_no2.transform(no2[0][no2[0] != 0].reshape(-1, 1)),
         bins=50, density=True, color='gray')
plt.xlim([-5, 5])
plt.ylim([0, 0.5])
plt.xlabel(r'Power transformed NO$_2$ data')
plt.ylabel(r'Normalized frequency')
plt.tight_layout()
plt.savefig("data_no2_hist_trans.pdf", format='pdf')

pt_pm10 = PowerTransformer()
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import PowerTransformer, minmax_scale

print(__doc__)

N_SAMPLES = 3000
FONT_SIZE = 6
BINS = 100

pt = PowerTransformer(method='box-cox', standardize=False)
rng = np.random.RandomState(304)
size = (N_SAMPLES, 1)

# lognormal distribution
X_lognormal = rng.lognormal(size=size)

# chi-squared distribution
df = 3
X_chisq = rng.chisquare(df=df, size=size)

# weibull distribution
a = 50
X_weibull = rng.weibull(a=a, size=size)
# Take only 2 features to make visualization easier
# Feature 0 has a long tail distribution.
# Feature 5 has a few but very large outliers.
X = X_full[:, [0, 5]]

distributions = [
    ('Unscaled data', X),
    ('Data after standard scaling',
     StandardScaler().fit_transform(X)),
    ('Data after min-max scaling',
     MinMaxScaler().fit_transform(X)),
    ('Data after max-abs scaling',
     MaxAbsScaler().fit_transform(X)),
    ('Data after robust scaling',
     RobustScaler(quantile_range=(25, 75)).fit_transform(X)),
    ('Data after power transformation (Yeo-Johnson)',
     PowerTransformer(method='yeo-johnson').fit_transform(X)),
    ('Data after power transformation (Box-Cox)',
     PowerTransformer(method='box-cox').fit_transform(X)),
    ('Data after quantile transformation (gaussian pdf)',
     QuantileTransformer(output_distribution='normal').fit_transform(X)),
    ('Data after quantile transformation (uniform pdf)',
     QuantileTransformer(output_distribution='uniform').fit_transform(X)),
    ('Data after sample-wise L2 normalizing',
     Normalizer().fit_transform(X)),
]

# scale the output between 0 and 1 for the colorbar
y = minmax_scale(y_full)

# plasma does not exist in matplotlib < 1.5
cmap = getattr(cm, 'plasma_r', cm.hot_r)
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import QuantileTransformer
from sklearn.model_selection import train_test_split

print(__doc__)

N_SAMPLES = 1000
FONT_SIZE = 6
BINS = 30

rng = np.random.RandomState(304)
bc = PowerTransformer(method='box-cox')
yj = PowerTransformer(method='yeo-johnson')
qt = QuantileTransformer(output_distribution='normal', random_state=rng)
size = (N_SAMPLES, 1)

# lognormal distribution
X_lognormal = rng.lognormal(size=size)

# chi-squared distribution
df = 3
X_chisq = rng.chisquare(df=df, size=size)

# weibull distribution
a = 50
X_weibull = rng.weibull(a=a, size=size)
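# A minimal continuation sketch (not from the original example): apply the
# three transformers defined above to the lognormal sample, which is strictly
# positive and therefore valid for Box-Cox; each output keeps shape (N, 1).
X_bc = bc.fit_transform(X_lognormal)
X_yj = yj.fit_transform(X_lognormal)
X_qt = qt.fit_transform(X_lognormal)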
# Note: seaborn's distplot is deprecated (histplot/displot replace it), and
# the savefig keywords papertype/frameon were removed from matplotlib, so
# only arguments with an effect are kept below.
maxabs_fig, ax = plt.subplots()
sn.distplot(MaxAbsScaler().fit_transform(X)).set_title('MaxAbsScaler')
maxabs_fig.savefig('Transformation-MaxAbsScaler.pdf', bbox_inches='tight',
                   facecolor='w', edgecolor='b', orientation='portrait',
                   transparent=True, pad_inches=0.25)

PowerTrans_fig, ax = plt.subplots()
sn.distplot(PowerTransformer(
    method='yeo-johnson').fit_transform(X)).set_title('PowerTransformer')
PowerTrans_fig.savefig('Transformation-PowerTransformer.pdf',
                       bbox_inches='tight', facecolor='w', edgecolor='b',
                       orientation='portrait', transparent=True,
                       pad_inches=0.25)

Robust_fig, ax = plt.subplots()
sn.distplot(RobustScaler().fit_transform(X)).set_title('RobustScaler')
Robust_fig.savefig('Transformation-RobustScaler.pdf',
# Seems we want to strip the 4 first columns of MFCC and 24 last of chroma
cleaned_x = pd.concat([rhythm, chroma_cleaned, mfcc_cleaned],
                      axis='columns', ignore_index=True)

# Outlier detection: replace values more than 3 standard deviations from
# the column mean with the mean.
threshold = 3
for col in range(cleaned_x.shape[1]):
    mean = np.mean(cleaned_x.iloc[:, col])
    z = np.abs(stats.zscore(cleaned_x.iloc[:, col]))
    # np.where returns a tuple; take the index array before iterating.
    rows = np.where(z > threshold)[0]
    for row in rows:
        cleaned_x.at[row, col] = mean

# Scaling
scaler = PowerTransformer()
scaled_data = scaler.fit_transform(cleaned_x)
scaled_df = pd.DataFrame(scaled_data)
cleaned_data = pd.concat([df_labels, scaled_df],
                         axis='columns', ignore_index=True)

# Split dataset into train and test
train, test = train_test_split(cleaned_data, test_size=0.3, random_state=0)
x_train = train.drop(labels=0, axis='columns')
y_train = np.ravel(train[[0]])
x_test = test.drop(labels=0, axis='columns')
y_test = np.ravel(test[[0]])

# Build a bagging meta-estimator: best result was ~57 % with 1000 estimators and 10 max features
        scaled[:, col] = [(x - min_) / (max_ - min_) * rng + min_val
                          for x in arr[:, col]]
    return scaled


data_scaled_5_10 = fit_range(data)
plot_hist(data_scaled_5_10)

# NON-LINEAR TRANSFORMATION
data_scaled_quantile = QuantileTransformer(n_quantiles=100, random_state=0) \
    .fit_transform(data)
plot_hist(data_scaled_quantile)

data_scaled_quantile_normal = QuantileTransformer(
    n_quantiles=100, random_state=0, output_distribution='normal') \
    .fit_transform(data)
plot_hist(data_scaled_quantile_normal)

data_scaled_power = PowerTransformer().fit_transform(data)
plot_hist(data_scaled_power)

# FEATURE DISCRETIZATION (binning with KBinsDiscretizer)
sampler = KBinsDiscretizer(n_bins=[3, 4, 3, 10, 2, 4], encode='ordinal')
data_discrete = sampler.fit_transform(data)
print(sampler.bin_edges_)
plot_hist(data_discrete)
def __init__(self, getter: FeatureGetter = None, core_type: str = 'linear-regression', output_transform: str = 'log', imputation_method: str = "basic", scaling=None, age_half_life: float = 65.0): if getter is None: getter = FeatureGetter() self.core_type = core_type self.output_transform = output_transform self.age_half_life = 65.0 # these things need to be fit. self.feature_mergers = [ # FeatureMerger(produces='area', source_columns=['TotalBsmtSF', 'GrLivArea'], # getter=getter, operation='sum'), FeatureMerger(produces='baths', source_columns=['BsmtFullBath', 'FullBath'], getter=getter, operation='sum'), FeatureMerger(produces='half_baths', source_columns=['BsmtHalfBath', 'HalfBath'], getter=getter, operation='sum'), FeatureMerger(produces='access', source_columns=['LotFrontage', 'PavedDrive'], getter=getter), # FeatureMerger(produces='shape', source_columns=['LotShape', 'LandContour', 'LandSlope'], # feature_names=self.feature_names), FeatureMerger(produces='utilities', source_columns=['Heating', 'HeatingQC'], getter=getter), FeatureMerger( produces='quality', source_columns=['OverallQual', 'OverallCond', 'KitchenQual'], getter=getter, morph="exp", morph_slope=0.1), FeatureMerger(produces='exterior', source_columns=[ 'LotArea', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterCond' ], getter=getter), FeatureMerger( produces='porch', source_columns=['OpenPorchSF', 'EnclosedPorch', 'ScreenPorch'], getter=getter, operation='sum'), FeatureMerger(produces='garage', source_columns=[ 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond' ], getter=getter), FeatureMerger(produces='fireplaces', source_columns=['Fireplaces', 'FireplaceQu'], getter=getter), FeatureMerger(produces='basement', source_columns=[ 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2' ], getter=getter), # FeatureMerger(produces='conditions', source_columns=['Condition1', 'Condition2'], # feature_names=self.feature_names, operation='sum') ] self.imputation_method = imputation_method if self.imputation_method == 'neighbors': df = DistanceFunc(getter=getter, feature_weights={ "Neighborhood": 100.0, "GrLivArea": 0.01, "FullBath": 5, "BedroomAbvGr": 7, "YrSold": 10, }) self.imputer = KNNImputer(metric=df.get_distance) else: self.imputer = SimpleImputer(strategy='most_frequent') self.categorical_mapping = load_categorical_mapping() self.categorical_transform = CategoricalTransformer( feature_names=getter.feature_names, mapping=self.categorical_mapping) self.model = None self.is_fit = False self.getter = getter self.scaling = scaling self.scaler = None if self.scaling == 'robust': self.scaler = RobustScaler() elif self.scaling == 'power': self.scaler = PowerTransformer()
class MyModel(BaseEstimator, RegressorMixin):

    def __init__(self,
                 getter: FeatureGetter = None,
                 core_type: str = 'linear-regression',
                 output_transform: str = 'log',
                 imputation_method: str = "basic",
                 scaling=None,
                 age_half_life: float = 65.0):
        if getter is None:
            getter = FeatureGetter()
        self.core_type = core_type
        self.output_transform = output_transform
        # Use the constructor argument instead of the hard-coded 65.0,
        # so the value reported by get_params() is actually applied.
        self.age_half_life = age_half_life
        # these things need to be fit.
        self.feature_mergers = [
            # FeatureMerger(produces='area', source_columns=['TotalBsmtSF', 'GrLivArea'],
            #               getter=getter, operation='sum'),
            FeatureMerger(produces='baths',
                          source_columns=['BsmtFullBath', 'FullBath'],
                          getter=getter, operation='sum'),
            FeatureMerger(produces='half_baths',
                          source_columns=['BsmtHalfBath', 'HalfBath'],
                          getter=getter, operation='sum'),
            FeatureMerger(produces='access',
                          source_columns=['LotFrontage', 'PavedDrive'],
                          getter=getter),
            # FeatureMerger(produces='shape', source_columns=['LotShape', 'LandContour', 'LandSlope'],
            #               feature_names=self.feature_names),
            FeatureMerger(produces='utilities',
                          source_columns=['Heating', 'HeatingQC'],
                          getter=getter),
            FeatureMerger(produces='quality',
                          source_columns=['OverallQual', 'OverallCond',
                                          'KitchenQual'],
                          getter=getter, morph="exp", morph_slope=0.1),
            FeatureMerger(produces='exterior',
                          source_columns=[
                              'LotArea', 'RoofStyle', 'RoofMatl',
                              'Exterior1st', 'Exterior2nd', 'MasVnrType',
                              'MasVnrArea', 'ExterCond'
                          ],
                          getter=getter),
            FeatureMerger(produces='porch',
                          source_columns=['OpenPorchSF', 'EnclosedPorch',
                                          'ScreenPorch'],
                          getter=getter, operation='sum'),
            FeatureMerger(produces='garage',
                          source_columns=[
                              'GarageType', 'GarageYrBlt', 'GarageFinish',
                              'GarageCars', 'GarageArea', 'GarageQual',
                              'GarageCond'
                          ],
                          getter=getter),
            FeatureMerger(produces='fireplaces',
                          source_columns=['Fireplaces', 'FireplaceQu'],
                          getter=getter),
            FeatureMerger(produces='basement',
                          source_columns=[
                              'BsmtQual', 'BsmtCond', 'BsmtExposure',
                              'BsmtFinType1', 'BsmtFinType2'
                          ],
                          getter=getter),
            # FeatureMerger(produces='conditions', source_columns=['Condition1', 'Condition2'],
            #               feature_names=self.feature_names, operation='sum')
        ]
        self.imputation_method = imputation_method
        if self.imputation_method == 'neighbors':
            df = DistanceFunc(getter=getter,
                              feature_weights={
                                  "Neighborhood": 100.0,
                                  "GrLivArea": 0.01,
                                  "FullBath": 5,
                                  "BedroomAbvGr": 7,
                                  "YrSold": 10,
                              })
            self.imputer = KNNImputer(metric=df.get_distance)
        else:
            self.imputer = SimpleImputer(strategy='most_frequent')
        self.categorical_mapping = load_categorical_mapping()
        self.categorical_transform = CategoricalTransformer(
            feature_names=getter.feature_names,
            mapping=self.categorical_mapping)
        self.model = None
        self.is_fit = False
        self.getter = getter
        self.scaling = scaling
        self.scaler = None
        if self.scaling == 'robust':
            self.scaler = RobustScaler()
        elif self.scaling == 'power':
            self.scaler = PowerTransformer()

    def get_params(self, deep=True):
        return {
            "core_type": self.core_type,
            "output_transform": self.output_transform,
            "age_half_life": self.age_half_life,
        }

    def predict(self, x):
        x2 = self.transform(x)
        y2 = self.model.predict(x2)
        y = y2
        if self.output_transform == 'log':
            y = np.exp(y2)
        return y

    def _setup(self, x):
        x0 = x.copy() if not isinstance(x, pd.DataFrame) else x.to_numpy()
        x1 = self.categorical_transform.transform(
            x0, exclude_columns=['YrSold', 'MoSold', 'YearBuilt'])
        return x1

    def _preprocess(self, x: np.ndarray, y: np.ndarray = None,
                    operation: str = 'fit'):
        overall_quality = self.getter.get_column('OverallQual', x)
        overall_quality = overall_quality.astype(float)
        x = self._setup(x)
        if operation == 'fit':
            x = self.imputer.fit_transform(x)
        else:
            x = self.imputer.transform(x)
        year_sold = self.getter.get_column('YrSold', x)
        month_sold = self.getter.get_column('MoSold', x)
        year_built = self.getter.get_column('YearBuilt', x)
        age = year_sold - year_built
        # TODO a column for the seasonal effect of things
        merged_columns = []
        for merger in self.feature_mergers:
            op = 'fit_transform' if 'fit' in operation else 'transform'
            merged_column = merger._process(x, operation=op)
            merged_columns.append(merged_column)
        merged_columns = np.array(merged_columns).T
        bedrooms = self.getter.get_column("BedroomAbvGr", x)
        year_built = normalize(year_built)
        age = normalize(age)
        bedrooms = normalize(bedrooms)
        # foundation seems to make it worse.
        misc = self.getter.get_column('MiscVal', x)
        misc = normalize(misc)
        neighborhood = self.getter.get_column('Neighborhood', x)
        # zoning seems to help the prediction
        zoning = self.getter.get_column('MSZoning', x)
        pool = self.getter.get_column('PoolArea', x)
        has_pool = np.array(list(map(lambda t: 1.0 if t > 0.0 else 0.0,
                                     pool)))
        building_type = self.getter.get_column('BldgType', x)
        house_style = self.getter.get_column('HouseStyle', x)
        functional = self.getter.get_column('Functional', x)
        central_air = self.getter.get_column("CentralAir", x)
        electrical = self.getter.get_column("Electrical", x)
        twoflr = self.getter.get_column("2ndFlrSF", x)
        has_two_floors = np.array(
            list(map(lambda t: 1.0 if t > 0.0 else 0.0, twoflr)))
        grla = self.getter.get_column('GrLivArea', x)
        bsma = self.getter.get_column('TotalBsmtSF', x)
        total_area = (grla + bsma)
        area_feature = total_area / 6000.0
        tmp = overall_quality / 10.0
        quality_feature = np.exp(tmp) / math.e
        yr_sold = self.getter.get_column('YrSold', x)
        yr_sold = yr_sold.astype(float)
        yr_built = self.getter.get_column('YearBuilt', x)
        yr_built = yr_built.astype(float)
        age = yr_sold - yr_built
        age_feature = np.exp(-age / self.age_half_life)
        combined = age_feature * quality_feature * area_feature
        combined2 = quality_feature * area_feature
        input_data = [
            combined, combined2, age_feature, quality_feature, area_feature,
            neighborhood, bedrooms, merged_columns, misc, building_type,
            house_style
        ]  # , functional, central_air, electrical, month_sold,
        #    has_pool, zoning, has_two_floors]
        x3 = np.column_stack(input_data)
        # The robust scaler seems to be helping the accuracy.
        if self.scaler is not None:
            if 'fit' in operation:
                self.scaler.fit(x3)
            if 'transform' in operation:
                x3 = self.scaler.transform(x3)
        if operation == 'transform':
            return x3
        # if self.core_type == 'linear-regression':
        #     core = LinearRegression()
        # elif self.core_type == 'elastic-net':
        #     core = ElasticNet()
        # elif self.core_type == 'perceptron':
        #     core = MLPRegressor(max_iter=1500)
        core1 = ElasticNet()
        self.model = GradientBoostingRegressor(init=core1, n_estimators=100,
                                               loss='huber')
        y2 = y.copy()
        if self.output_transform == 'log':
            y2 = np.log(y2)
        self.model.fit(x3, y2)
        self.is_fit = True
        return self

    def fit(self, x, y):
        return self._preprocess(x, y, operation='fit')

    def transform(self, x):
        return self._preprocess(x, operation='transform')
def plot_simulations_pca(sims, ax='', figsize=(8, 8), target='',
                         feature_set='', loadings=False, nsims=1000,
                         select='', tol='', title='', outfile='',
                         colorbar=True, verbose=False):
    """
    Plot summary statistics for simulations projected into PC space.

    :param str sims:
    :param matplotlib.pyplot.axis ax:
    :param tuple figsize:
    :param str target:
    :param list feature_set:
    :param bool loadings: BROKEN! Whether to plot the loadings in the figure.
    :param int nsims:
    :param int/float select:
    :param int/float tol:
    :param str title:
    :param str outfile:
    :param bool verbose:

    :return: Return the `matplotlib.pyplot.axis` on which the simulations
        are plotted.
    """
    if not ax:
        fig, ax = plt.subplots(figsize=figsize)
    ## Filter and downsample the simulations
    sim_df = _filter_sims(sims, feature_set=feature_set, nsims=nsims,
                          select=select, tol=tol, verbose=verbose)
    ## Have to retain the targets because we drop them prior to PCA
    target_df = sim_df[default_targets]
    sim_df = sim_df.drop(default_targets, axis=1)
    ## These are also left over from mess and not sure they are needed.
    # sim_df = StandardScaler().fit_transform(sim_df)
    sim_df = PowerTransformer(method='yeo-johnson').fit_transform(sim_df)
    pca = PCA(n_components=2)
    dat = pca.fit_transform(sim_df)
    if not target:
        target = "zeta"
    target_values = target_df[target].values
    sc = ax.scatter(dat[:, 0], dat[:, 1], label=target_df[target],
                    c=target_values)
    if colorbar:
        plt.colorbar(sc)
    ## Remove a bunch of visual noise
    ax.set_yticklabels([])
    ax.set_xticklabels([])
    ax.tick_params(top='off', bottom='off', left='off', right='off')
    var_expl = pca.explained_variance_ratio_
    ax.set_xlabel("Variance explained {:.3}%".format(var_expl[0] * 100),
                  fontsize=15)
    ax.set_ylabel("Variance explained {:.3}%".format(var_expl[1] * 100),
                  fontsize=15)
    if title:
        ax.set_title(title)
    ## TODO: Doesn't work how I'd like.
    ## print("Explained variance", pca.explained_variance_ratio_)
    ## if loadings:
    ##     for i, comp in enumerate(pca.components_.T):
    ##         plt.arrow(0, 0, pca.components_.T[i, 0], pca.components_.T[i, 1],
    ##                   color='r', alpha=0.5)
    ##         plt.text(pca.components_.T[i, 0] * 1.5, pca.components_.T[i, 1] * 1.5,
    ##                  dat[i + 2], color='black', ha='center', va='center')
    ## If writing to file then don't plot to screen.
    if outfile:
        try:
            plt.savefig(outfile)
            if verbose:
                print("Wrote figure to: {}".format(outfile))
        except Exception as inst:
            raise Exception("Failed saving figure: {}".format(inst))
        plt.close()
    return ax