def select_kbest_freg_scaled(X_train, y_train, k):
    # standard_scaler expects a train and a test set; pass the train set twice
    # and keep only the scaled copy we need
    X_scaler, X_train_scaled_data, _ = split_scale.standard_scaler(X_train, X_train)
    y_scaler, y_train_scaled_data, _ = split_scale.standard_scaler(y_train, y_train)

    f_selector = SelectKBest(f_regression, k=k)
    f_selector.fit(X_train_scaled_data, y_train_scaled_data)

    f_support = f_selector.get_support()
    f_feature = X_train_scaled_data.loc[:, f_support].columns.tolist()
    return f_feature
def split_scale_df(df):
    train, test = split_scale.split_my_data(df, train_ratio=.8, seed=123)

    scaler, train, test = split_scale.standard_scaler(train, test)

    X_train = train.drop(columns='tax_value')
    y_train = train[['tax_value']]
    X_test = test.drop(columns='tax_value')
    y_test = test[['tax_value']]

    # the formula needs actual column names, and predictions come from the features
    formula = 'tax_value ~ ' + ' + '.join(X_train.columns)
    ols_model = ols(formula, data=train).fit()
    train['yhat'] = ols_model.predict(X_train)
    return train, test, X_train, y_train, X_test, y_test, ols_model
Example #3
data = data.set_index(data.id)

#sns.pairplot(data=data)

train, test = split_scale.split_my_data(data)

X_train = train.drop(columns=["id", "taxvaluedollarcnt"])
y_train = train[['taxvaluedollarcnt']]

X_test = test.drop(columns=["id", "taxvaluedollarcnt"])
y_test = test[['taxvaluedollarcnt']]

scaler, X_train_scaled, X_test_scaled = split_scale.standard_scaler(X_train, X_test)
#sns.heatmap(data.corr(), cmap='Blues', annot=True)

predictions = pd.DataFrame({
    'actual': y_train.taxvaluedollarcnt
}).reset_index(drop=True)

# model 1 using square feet only
lm1 = LinearRegression()
lm1.fit(X_train_scaled[['calculatedfinishedsquarefeet']], y_train)
lm1_predictions = lm1.predict(X_train_scaled[['calculatedfinishedsquarefeet']])
predictions['lm1'] = lm1_predictions

# model 2 using square feet and bedroom count
lm2 = LinearRegression()
lm2.fit(X_train_scaled[['calculatedfinishedsquarefeet', 'bedroomcnt']], y_train)
lm2_predictions = lm2.predict(X_train_scaled[['calculatedfinishedsquarefeet', 'bedroomcnt']])
predictions['lm2'] = lm2_predictions
Example #4
df = wrangle.wrangle_telco()
df.dtypes

df = df[['monthly_charges', 'tenure', 'total_charges']]

sns.pairplot(data=df)

# split into train and test
train, test = split_scale.split_my_data(df)

# For feature engineering methods, we want to use the scaled data,
# so scale the train and test sets with the standard scaler
scaler, train_scaled_data, test_scaled_data = \
    split_scale.standard_scaler(train, test)

# add a constant placeholder column to the scaled and unscaled sets
train_scaled_data['filler'] = -1
test_scaled_data['filler'] = -1

train['filler'] = -1
test['filler'] = -1

X_train = train[['monthly_charges', 'tenure', 'filler']]
y_train = train[['total_charges']]

X_train_scaled_data = train_scaled_data[[
    'monthly_charges', 'tenure', 'filler'
]]
y_train_scaled_data = train_scaled_data[['total_charges']]
# to return to the original values, invert the scaling with the fitted scaler:
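# A minimal sketch of that inverse step, assuming the scaler returned by
# split_scale.standard_scaler is a fitted sklearn StandardScaler; the hand-added
# 'filler' column is dropped first because the scaler was fit before it existed.
unscaled_cols = train_scaled_data.drop(columns='filler')
train_unscaled = pd.DataFrame(
    scaler.inverse_transform(unscaled_cols),
    columns=unscaled_cols.columns,
    index=unscaled_cols.index,
)
train_unscaled.head()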
Example #5
# Acquire and prep data
df = wrangle.wrangle_telco()
df.head()
df.info()
df.drop(columns=['customer_id'], inplace=True)
df.head()

# Explore data
sns.pairplot(data=df)

# split data
train, test = split_scale.split_my_data(data=df, train_ratio=.80, seed=123)

# Scale

scaler, train_scaled, test_scaled = split_scale.standard_scaler(train, test)

# Separate into X and y dataframes

X_train = train.drop(columns=['total_charges'])
y_train = train[['total_charges']]

X_test = test.drop(columns=['total_charges'])
y_test = test[['total_charges']]

X_train_scaled = train_scaled.drop(columns=['total_charges'])
y_train_scaled = train_scaled[['total_charges']]

X_test_scaled = test_scaled.drop(columns=['total_charges'])
y_test_scaled = test_scaled[['total_charges']]
Example #6
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import warnings
warnings.filterwarnings("ignore")

import env
import wrangle
import split_scale

# 1. Write a function, plot_variable_pairs(dataframe) that plots all of the pairwise relationships along with the regression line for each pair.
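# A minimal sketch of exercise 1, assuming seaborn's pairplot with kind='reg'
# is acceptable for drawing each pairwise scatter with its regression line:
def plot_variable_pairs(dataframe):
    sns.pairplot(dataframe, kind='reg',
                 plot_kws={'line_kws': {'color': 'red'}})
    plt.show()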

# scale the X and y splits (train_x/test_x/train_y/test_y come from an earlier split)
scaler_x, train_x_scaled_data, test_x_scaled_data = split_scale.standard_scaler(train_x, test_x)
scaler_y, train_y_scaled_data, test_y_scaled_data = split_scale.standard_scaler(train_y, test_y)

df_train_x = pd.DataFrame(train_x_scaled_data)
df_train_y = pd.DataFrame(train_y_scaled_data)
df_test_x = pd.DataFrame(test_x_scaled_data)
df_test_y = pd.DataFrame(test_y_scaled_data)

df_train_x['target'] = df_train_y
df_test_x['target'] = df_test_y
df_train_x.head()
df_test_x.head()

df_train_x = df_train_x.rename(columns={0: "monthly_charges", 1: "tenure"})
df_test_x = df_test_x.rename(columns={0: "monthly_charges", 1: "tenure"})

g = sns.PairGrid(df_train_x)
g.map_diag(plt.hist)
col_names = ['customer_id', 'tenure', 'monthly_charges', 'total_charges']

X_train_scaled = df.copy()

X = df[['tenure', 'monthly_charges']]
y = df[['total_charges']]
train_pct = .25
X_train, X_test, y_train, y_test = split_my_data(X, y, train_pct)

assert X_train.shape[0] == y_train.shape[0]
assert X_test.shape[0] == y_test.shape[0]

# ### Standard Scaler
# $$x' = \frac{x - \bar{x}}{\sigma}$$

X_train_standard_scaled, X_test_standard_scaled, standard_scaler = standard_scaler(
    X_train, X_test)

X_train_standard_scaled, X_test_standard_scaled

X_train_standard_scaled.head()
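# An illustrative check (assuming standard_scaler wraps sklearn's StandardScaler):
# the scaled values should match the formula above computed by hand, where
# sigma is the population standard deviation (ddof=0)
manual = (X_train - X_train.mean()) / X_train.std(ddof=0)
(X_train_standard_scaled - manual).abs().max()  # expect values near zero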

# ### Standard Scale Inverse

X_train_standard_unscaled, X_test_standard_unscaled = scale_inverse(
    X_train_standard_scaled, X_test_standard_scaled, standard_scaler)

X_train_standard_unscaled.head()

# ### Uniform Scaler
#
# - It smooths out unusual distributions: it spreads out the most frequent values and reduces the impact of (marginal) outliers, making it a robust preprocessing scheme.
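# A minimal sketch of a uniform scaler, assuming sklearn's QuantileTransformer
# with a uniform output distribution is the implementation meant here
# (the variable names below are illustrative, not from split_scale):
from sklearn.preprocessing import QuantileTransformer

# keep n_quantiles at or below the number of training rows
uniform_scaler = QuantileTransformer(n_quantiles=100,
                                     output_distribution='uniform',
                                     random_state=123)
X_train_uniform_scaled = pd.DataFrame(
    uniform_scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test_uniform_scaled = pd.DataFrame(
    uniform_scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)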
Example #8
def select_kbest_freg(x_train, y_train, k):
    f_selector = SelectKBest(f_regression, k=k).fit(x_train, y_train)
    f_support = f_selector.get_support()
    f_feature = x_train.loc[:, f_support].columns.tolist()
    return f_feature


f_feature = select_kbest_freg(x_train, y_train, 2)
#print(str(len(f_feature)), 'selected features')
#print(f_feature)

plt.figure(figsize=(6, 5))
cor = x_train.corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
plt.show()

# ## Write a function, select_kbest_freg() that takes X_train, y_train (scaled) and k as input and returns a list of the top k features.

scaler, train_x_scaled_data, test_x_scaled_data = split_scale.standard_scaler(
    x_train, x_test)


def select_kbest_freg(train_x_scaled_data, y_train, k):
    f_selector = SelectKBest(f_regression, k=k)

    f_selector.fit(train_x_scaled_data, y_train)

    f_support = f_selector.get_support()
    f_feature = train_x_scaled_data.loc[:, f_support].columns.tolist()

    return f_feature


select_kbest_freg(train_x_scaled_data, y_train, 2)
Example #9
# Write a function, select_kbest_freg_unscaled() that takes X_train, y_train and k as input (X_train and y_train should not be scaled!) and returns a list of the top k features.

X_train = train.drop(columns = "total_charges")
y_train = train["total_charges"]
X_test = test.drop(columns = "total_charges")
y_test = test["total_charges"]

def select_kbest_freg_unscaled(X_train, y_train, k):
    f_selector = SelectKBest(f_regression, k=k).fit(X_train, y_train)
    f_support = f_selector.get_support()
    f_feature = X_train.loc[:,f_support].columns.tolist()
    return f_feature

# Write a function, select_kbest_freg_scaled() that takes X_train, y_train (scaled) and k as input and returns a list of the top k features.

scaler, X_train_scaled, X_test_scaled = split_scale.standard_scaler(X_train, X_test)

def select_kbest_freg_scaled(X_train, y_train, k):
    f_selector = SelectKBest(f_regression, k=k).fit(X_train, y_train)
    f_support = f_selector.get_support()
    f_feature = X_train.loc[:,f_support].columns.tolist()
    return f_feature

# Write a function, ols_backward_elimination() that takes X_train and y_train (scaled) as input and returns selected features based on the OLS backward elimination method.
def ols_backward_elimination(X_train, y_train):
    cols = list(X_train.columns)
    while len(cols) > 0:
        X_1 = X_train[cols]
        ols_model = sm.OLS(y_train, X_1).fit()
        # drop the least significant feature and refit until all p-values clear .05
        pvalues = ols_model.pvalues
        if pvalues.max() > .05:
            cols.remove(pvalues.idxmax())
        else:
            break
    return cols
Example #10
def select_kbest_freg_scaled(X_train, y_train, k):
    # scale X and y, then run the plain selector on the scaled copies
    # (calling select_kbest_freg_scaled here would recurse forever)
    x_scale = ss.standard_scaler(X_train, X_train)[1]
    y_scale = ss.standard_scaler(y_train, y_train)[1]
    return select_kbest_freg_unscaled(x_scale, y_scale, k)
Example #11
def optimal_feature_n(X_train, y_train):
    # try every candidate feature count with RFE and keep the best-scoring one
    high_score = 0
    number_of_features = 0
    for n in range(1, len(X_train.columns) + 1):
        model = LinearRegression()
        rfe = RFE(model, n_features_to_select=n)
        X_rfe = rfe.fit_transform(X_train, y_train)
        model.fit(X_rfe, y_train)
        score = model.score(X_rfe, y_train)
        if score > high_score:
            high_score = score
            number_of_features = n
    return number_of_features


def top_n_features(X_train, y_train, n, model):
    cols = X_train.columns
    rfe = RFE(model, n_features_to_select=n)
    X_rfe = rfe.fit_transform(X_train, y_train)
    model.fit(X_rfe, y_train)
    features = list(cols[rfe.support_])
    return features


if __name__ == '__main__':
    seed = 43

    telco = wrangle.wrangle_telco()

    train, test = ss.split_my_data(telco, .8, seed)
    X_train = train.drop(columns='total_charges').set_index('customer_id')
    y_train = train[['customer_id', 'total_charges']].set_index('customer_id')
    X_test = test.drop(columns='total_charges').set_index('customer_id')
    y_test = test[['customer_id', 'total_charges']].set_index('customer_id')

    select_kbest_freg_unscaled(X_train, y_train, 1)
    x_scale = ss.standard_scaler(X_train, X_train)[1]
    y_scale = ss.standard_scaler(y_train, y_train)[1]
    select_kbest_freg_scaled(X_train, y_train, 1)
    ols_backward_elimination(x_scale, y_scale)
    lasso_cv_coef(x_scale, y_train)
    n = optimal_feature_n(X_train, y_train)
    top_n_features(X_train, y_train, n, LinearRegression())