Example #1
import split_scale
from sklearn.feature_selection import SelectKBest, f_regression

def select_kbest_freg_scaled(X_train, y_train, k):
    # Scale features and target to [0, 1] before ranking.
    X_train_scaled = split_scale.min_max_scaler(X_train)[0]
    y_train_scaled = split_scale.min_max_scaler(y_train)[0]
    # Score each feature's F-statistic against the target and keep the top k.
    f_selector = SelectKBest(f_regression, k=k).fit(X_train_scaled, y_train_scaled)
    f_support = f_selector.get_support()
    f_feature = X_train.loc[:, f_support].columns.tolist()
    return (str(len(f_feature)), 'selected features'), f_feature, f_selector.scores_
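A quick way to sanity-check the selection step on its own is to run SelectKBest directly on a toy frame; the data below is made up for illustration.

import pandas as pd
from sklearn.feature_selection import SelectKBest, f_regression

# Hypothetical toy data: y tracks 'a' closely but not 'noise'.
X = pd.DataFrame({'a': [1, 2, 3, 4, 5], 'noise': [3, 1, 4, 1, 5]})
y = pd.Series([2.1, 3.9, 6.2, 8.1, 9.8])

selector = SelectKBest(f_regression, k=1).fit(X, y)
print(X.columns[selector.get_support()].tolist())  # ['a']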
Example #2

import pandas as pd
import split_scale
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

def prep_titanic(df):
    # Drop the mostly-null deck column and fill the few missing embarkation values.
    df.drop(columns=['deck'], inplace=True)
    df.embark_town = df.embark_town.fillna('Southampton')
    df.embarked = df.embarked.fillna('S')

    # 80/20 train/test split.
    train, test = split_scale.split_my_data(df, .8)

    # One-hot encode embarked, fitting on train only to avoid leakage.
    # (On sklearn >= 1.2, use sparse_output=False instead of sparse=False.)
    encoder = OneHotEncoder(sparse=False)
    encoder.fit(train[['embarked']])
    cols = list(encoder.categories_[0])
    m_train = encoder.transform(train[['embarked']])
    m_test = encoder.transform(test[['embarked']])

    encoded_train = pd.DataFrame(m_train, columns=cols, index=train.index)
    encoded_test = pd.DataFrame(m_test, columns=cols, index=test.index)

    train = pd.concat([train, encoded_train], axis=1).drop(columns='embarked')
    test = pd.concat([test, encoded_test], axis=1).drop(columns='embarked')

    # Impute missing ages with the train-set mean.
    imputer = SimpleImputer(strategy='mean')
    imputer.fit(train[['age']])
    train.age = imputer.transform(train[['age']])
    test.age = imputer.transform(test[['age']])

    # Min-max scale the numeric columns, again fitting on train only.
    train_to_scale = train[['age', 'fare']]
    test_to_scale = test[['age', 'fare']]
    scaler, train_scaled, test_scaled = \
        split_scale.min_max_scaler(train_to_scale, test_to_scale)

    train.update(train_scaled)
    test.update(test_scaled)

    return train, test
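As a hedged usage sketch, assuming the input matches seaborn's titanic dataset (which has the deck, embark_town, embarked, age, and fare columns this function expects):

import seaborn as sns

df = sns.load_dataset('titanic')
train, test = prep_titanic(df)
print(train.shape, test.shape)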
Example #3
import pandas as pd
import split_scale
import statsmodels.api as sm

def ols_backward_elimination(X_train, y_train):
    # Scale features and target to [0, 1] before fitting.
    X_train_scaled = split_scale.min_max_scaler(X_train)[0]
    y_train_scaled = split_scale.min_max_scaler(y_train)[0]
    cols = list(X_train_scaled.columns)
    # Refit OLS repeatedly, dropping the least significant feature each pass,
    # until every remaining feature is significant at p <= 0.05.
    while len(cols) > 0:
        x_1 = X_train_scaled[cols]
        model = sm.OLS(y_train_scaled, x_1).fit()
        p = pd.Series(model.pvalues.values, index=cols)
        pmax = max(p)
        feature_with_pmax = p.idxmax()
        if pmax > 0.05:
            cols.remove(feature_with_pmax)
        else:
            break
    return cols
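The heart of the loop is the per-coefficient p-values that statsmodels exposes; a standalone sketch on made-up data:

import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = np.random.default_rng(0)
X = pd.DataFrame({'signal': rng.normal(size=100), 'noise': rng.normal(size=100)})
y = 3 * X['signal'] + rng.normal(scale=0.5, size=100)

model = sm.OLS(y, X).fit()
print(model.pvalues)  # 'signal' near zero, 'noise' typically above 0.05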
Example #4
def scale_titanic(train, test):
    # Min-max scale age and fare (fit on train), then write the scaled
    # values back onto the original frames.
    X_train = train[["age", "fare"]]
    X_test = test[["age", "fare"]]
    scaler, train_scaled, test_scaled = split_scale.min_max_scaler(
        X_train, X_test)
    train["age"] = train_scaled["age"]
    train["fare"] = train_scaled["fare"]
    test["age"] = test_scaled["age"]
    test["fare"] = test_scaled["fare"]
    return scaler, train, test
X_train_gaussian_scaled.head()

# ### Gaussian Scale Inverse

X_train_gaussian_unscaled, X_test_gaussian_unscaled = scale_inverse(
    X_train_gaussian_scaled, X_test_gaussian_scaled, gaussian_scaler)
X_train_gaussian_unscaled.head()
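# A minimal sketch of what the scale_inverse helper used above might look
# like (hypothetical name and body; the real definition lives elsewhere in
# this file), assuming it wraps the fitted scaler's inverse_transform:

import pandas as pd

def scale_inverse_sketch(train_scaled, test_scaled, scaler):
    # Hypothetical stand-in for scale_inverse: undo the scaling while
    # preserving column names and indexes.
    train = pd.DataFrame(scaler.inverse_transform(train_scaled),
                         columns=train_scaled.columns, index=train_scaled.index)
    test = pd.DataFrame(scaler.inverse_transform(test_scaled),
                        columns=test_scaled.columns, index=test_scaled.index)
    return train, test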

# ### Min-Max Scaler
#
#
# $$x' = {\frac {x-{\text{min}}(x)}{{\text{max}}(x)-{\text{min}}(x)}}$$
#
# $$x' = a + \frac{(x - \text{min}(x))(b - a)}{\text{max}(x) - \text{min}(x)}$$
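
# A quick check of the first formula against sklearn's MinMaxScaler, on
# made-up numbers:

import numpy as np
from sklearn.preprocessing import MinMaxScaler

_x = np.array([[2.0], [4.0], [10.0]])
_manual = (_x - _x.min()) / (_x.max() - _x.min())
assert np.allclose(_manual, MinMaxScaler().fit_transform(_x))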

X_train_min_max_scaled, X_test_min_max_scaled, min_max_scaler = min_max_scaler(
    X_train, X_test)
X_train_min_max_scaled.head()

# ### Min-Max Scale Inverse

X_train_min_max_unscaled, X_test_min_max_unscaled = scale_inverse(
    X_train_min_max_scaled, X_test_min_max_scaled, min_max_scaler)
X_train_min_max_unscaled.head()

# ### IQR Robust Scaler
# - With many outliers, scaling by the mean and variance does not work well.
#
# - RobustScaler removes the median (instead of the mean) and scales the data according to a quantile range (the IQR by default); a sketch of the helper follows.
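
# A minimal sketch of what the iqr_robust_scaler helper called below might
# look like (hypothetical name and body; the real definition lives elsewhere
# in this file), assuming it returns (train_scaled, test_scaled, scaler):

import pandas as pd
from sklearn.preprocessing import RobustScaler

def iqr_robust_scaler_sketch(train, test):
    # Fit median/IQR on train only, then transform both splits.
    scaler = RobustScaler().fit(train)
    train_scaled = pd.DataFrame(scaler.transform(train),
                                columns=train.columns, index=train.index)
    test_scaled = pd.DataFrame(scaler.transform(test),
                               columns=test.columns, index=test.index)
    return train_scaled, test_scaled, scaler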

X_train_iqr_robust_scaled, X_test_iqr_robust_scaled, iqr_robust_scaler = iqr_robust_scaler(
    X_train, X_test)
def scale_titanic(train, test):
    # Variant that returns only the scaled age/fare columns rather than
    # the full train/test frames.
    train_scaled = train[['age', 'fare']]
    test_scaled = test[['age', 'fare']]
    scaler, train_scaled, test_scaled = split_scale.min_max_scaler(
        train_scaled, test_scaled)
    return scaler, train_scaled, test_scaled