def select_kbest_freg_scaled(X_train, y_train, k):
    """Min-max scale the features and target, then select the top-k features
    using the univariate F-test (f_regression).

    Parameters
    ----------
    X_train : DataFrame of candidate features
    y_train : target values
    k : number of features to keep

    Returns
    -------
    tuple
        ((str(count), 'selected features'), selected_column_names, f_scores)
    """
    scaled_features = split_scale.min_max_scaler(X_train)[0]
    scaled_target = split_scale.min_max_scaler(y_train)[0]

    # Fit the univariate selector on the scaled data.
    selector = SelectKBest(f_regression, k=k).fit(scaled_features, scaled_target)

    # Map the boolean support mask back onto the original column labels.
    support_mask = selector.get_support()
    chosen = X_train.loc[:, support_mask].columns.tolist()

    return (str(len(chosen)), 'selected features'), chosen, selector.scores_
def prep_titanic(df):
    """Prepare the titanic dataframe for modeling.

    Steps: drop the sparse 'deck' column, fill missing embarkation values,
    split into train/test, one-hot encode 'embarked', mean-impute 'age',
    and min-max scale 'age' and 'fare' in place.

    NOTE: mutates the caller's `df` (drop/fillna happen in place).

    Returns:
        (train, test): the two prepared DataFrames.
    """
    # 'deck' is dropped outright — presumably mostly missing; TODO confirm.
    df.drop(columns=['deck'], inplace=True)
    # Fill the few missing embarkation records with the most common port
    # (Southampton / 'S').
    df.embark_town = df.embark_town.fillna('Southampton')
    df.embarked = df.embarked.fillna('S')
    # 80/20 train/test split via the project helper.
    train, test = split_scale.split_my_data(df, .8)
    # One-hot encode 'embarked'; fit on train only to avoid leakage,
    # then apply the same encoding to test.
    encoder = OneHotEncoder(sparse=False)
    encoder.fit(train[['embarked']])
    cols = [c for c in encoder.categories_[0]]
    m_train = encoder.transform(train[['embarked']])
    m_test = encoder.transform(test[['embarked']])
    # Re-attach the original row index so concat aligns correctly.
    encoded_train = pd.DataFrame(m_train, columns=cols, index=train.index)
    encoded_test = pd.DataFrame(m_test, columns=cols, index=test.index)
    train = pd.concat([train, encoded_train], axis=1).drop(columns='embarked')
    test = pd.concat([test, encoded_test], axis=1).drop(columns='embarked')
    # Mean-impute 'age'; again fit on train only.
    imputer = SimpleImputer(strategy='mean')
    imputer.fit(train[['age']])
    train.age = imputer.transform(train[['age']])
    test.age = imputer.transform(test[['age']])
    # Min-max scale the two numeric columns; scaler is fit inside the helper.
    train_to_scale = train[['age', 'fare']]
    test_to_scale = test[['age', 'fare']]
    scaler, train_scaled, test_scaled = \
        split_scale.min_max_scaler(train_to_scale, test_to_scale)
    # Overwrite the unscaled 'age'/'fare' values in place (index-aligned).
    train.update(train_scaled)
    test.update(test_scaled)
    return train, test
def ols_backward_elimination(X_train, y_train):
    """Backward feature elimination using OLS p-values.

    Repeatedly fits an OLS model on the (min-max scaled) features and drops
    the feature with the largest p-value until every remaining feature is
    significant at the 0.05 level.

    Parameters
    ----------
    X_train : DataFrame of candidate features
    y_train : target values

    Returns
    -------
    list[str]
        The surviving feature names.
    """
    # Fixed defects vs. the original: removed the dead `p = []` assignment,
    # the redundant `pmax = 1` initialization, and the no-op `[0:,]` slice.
    X_train_scaled = split_scale.min_max_scaler(X_train)[0]
    y_train_scaled = split_scale.min_max_scaler(y_train)[0]

    cols = list(X_train_scaled.columns)
    while cols:
        model = sm.OLS(y_train_scaled, X_train_scaled[cols]).fit()
        pvalues = pd.Series(model.pvalues.values, index=cols)
        worst_feature = pvalues.idxmax()
        if pvalues.max() > 0.05:
            # Least significant feature fails the threshold — eliminate it.
            cols.remove(worst_feature)
        else:
            # All remaining features are significant; stop.
            break
    return cols
def scale_titanic(train, test):
    """Min-max scale the 'age' and 'fare' columns of train and test.

    The input frames are modified in place (the scaled values overwrite the
    originals) and returned along with the fitted scaler.

    NOTE(review): a second `scale_titanic` with a different return contract
    appears later in this file and will shadow this definition at import time.

    Returns:
        (scaler, train, test)
    """
    numeric_cols = ["age", "fare"]
    scaler, scaled_train, scaled_test = split_scale.min_max_scaler(
        train[numeric_cols], test[numeric_cols])
    # Write the scaled values back column by column.
    for col in numeric_cols:
        train[col] = scaled_train[col]
        test[col] = scaled_test[col]
    return scaler, train, test
X_train_gaussian_scaled.head() # ### Gaussian Scale Inverse X_train_gaussian_unscaled, X_test_gaussian_unscaled = scale_inverse( X_train_gaussian_scaled, X_test_gaussian_scaled, gaussian_scaler) X_train_gaussian_unscaled.head() # ### Min-Max Scaler # # # $$x' = {\frac {x-{\text{min}}(x)}{{\text{max}}(x)-{\text{min}}(x)}}$$ # # $$x' = a+{\frac {x-{\text{min}}(x))(b-a)}{{\text{max}}(x)-{\text{min}}(x)}}$$ X_train_min_max_scaled, X_test_min_max_scaled, min_max_scaler = min_max_scaler( X_train, X_test) X_train_min_max_scaled.head() # ### Min-Max Scale Inverse X_train_min_max_unscaled, X_test_min_max_unscaled = scale_inverse( X_train_min_max_scaled, X_test_min_max_scaled, min_max_scaler) X_train_min_max_unscaled.head() # ### IRQ Robust Scaler # - With a lot of outliers, scaling using the mean and variance is not going to work very well. # # - Using RobustScaler, the median is removed (instead of mean) and data is scaled according to a quantile range (the IQR is default) X_train_iqr_robust_scaled, X_test_iqr_robust_scaled, iqr_robust_scaler = iqr_robust_scaler( X_train, X_test)
def scale_titanic(train, test):
    """Min-max scale the 'age' and 'fare' columns and return only those frames.

    NOTE(review): this duplicates the name of an earlier `scale_titanic` in
    this file (which returns the full frames) and shadows it at import time.

    Returns:
        (scaler, scaled_train, scaled_test) — two-column scaled DataFrames.
    """
    feature_cols = ['age', 'fare']
    subset_train = train[feature_cols]
    subset_test = test[feature_cols]
    scaler, scaled_train, scaled_test = split_scale.min_max_scaler(
        subset_train, subset_test)
    return scaler, scaled_train, scaled_test