# (this excerpt begins mid-definition; the 'preprocess' ColumnTransformer step
# referenced by the grid below is assumed to precede these stages)
complete_pipeline = pipeline.Pipeline([
    ('feature_selector',
     feature_selection.SelectFromModel(linear_model.Lasso())),
    ('pca', decomposition.PCA()),
    ('regressor', neighbors.KNeighborsRegressor())
])
pipeline_grid = {
    'preprocess__num__imputer__strategy': ['mean', 'median'],
    'pca__n_components': [0.90, 0.95],
    'regressor__n_neighbors': list(range(5, 15))
}

#build model with pipeline
scoring = metrics.make_scorer(log_rmse, greater_is_better=False)
pipeline_generated = utils.grid_search_best_model(complete_pipeline,
                                                  pipeline_grid,
                                                  house_train1,
                                                  house_train['SalePrice'],
                                                  scoring=scoring)

#read test data
house_test = pd.read_csv(os.path.join(path, "test.csv"))
house_test.shape
house_test.info()
house_test['SalePrice'] = None

#apply preprocessing required before pipeline
utils.cast_to_cat(house_test, features_to_cast)
house_test1 = utils.drop_features(house_test, features_to_drop)
house_test1.info()

#get predictions on test data with constructed pipeline
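# the prediction code is missing from this excerpt; a minimal sketch, assuming
# pipeline_generated and house_test1 from the steps above:
house_test['SalePrice'] = pipeline_generated.predict(house_test1)
house_test.to_csv(os.path.join(path, "submission.csv"),
                  columns=["Id", "SalePrice"],
                  index=False)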
# ---- Example 2 ----
#X, y = cutils.generate_nonlinear_synthetic_data_classification2(n_samples=1000, noise=0.1)
X, y = cutils.generate_nonlinear_synthetic_data_classification3(n_samples=1000, noise=0.1)

cutils.plot_data_2d_classification(X, y)

X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=1)
cutils.plot_data_2d_classification(X_train, y_train)

#perceptron algorithm
stages = [
            ('features', preprocessing.PolynomialFeatures()),
            ('clf', linear_model.Perceptron(max_iter=1000))
        ]
perceptron_pipeline = pipeline.Pipeline(stages)
# PolynomialFeatures has no 'gamma' parameter; its tunable is 'degree'
perceptron_pipeline_grid = {'features__degree': [2, 3, 5]}
pipeline_object = comutils.grid_search_best_model(perceptron_pipeline, perceptron_pipeline_grid, X_train, y_train)
final_estimator = pipeline_object.named_steps['clf']
print(final_estimator.intercept_)
print(final_estimator.coef_)
cutils.plot_model_2d_classification(pipeline_object, X_train, y_train)

#logistic regression algorithm
stages = [
            ('features', preprocessing.PolynomialFeatures()),
            ('clf', linear_model.LogisticRegression())
        ]

lr_pipeline = pipeline.Pipeline(stages)
# PolynomialFeatures has no 'gamma' parameter; its tunable is 'degree'
lr_pipeline_grid = {'features__degree': [2, 3, 5, 10]}
pipeline_object = comutils.grid_search_best_model(lr_pipeline, lr_pipeline_grid, X_train, y_train)
final_estimator = pipeline_object.named_steps['clf']
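# the excerpt cuts off here; a hedged completion, mirroring the inspection and
# plot calls from the perceptron pipeline above:
print(final_estimator.intercept_)
print(final_estimator.coef_)
cutils.plot_model_2d_classification(pipeline_object, X_train, y_train)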
# ---- Example 3 ----
scoring = metrics.make_scorer(rutils.rmse, greater_is_better=False)

#nonlinear pattern in 2d
X, y = rutils.generate_nonlinear_synthetic_data_regression(n_samples=200,
                                                           n_features=1)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.1, random_state=1)
rutils.plot_data_2d_regression(X_train, y_train)

lr_pipeline = pipeline.Pipeline([('features',
                                  preprocessing.PolynomialFeatures()),
                                 ('reg', linear_model.LinearRegression())])
lr_pipeline_grid = {'features__degree': [2, 3, 5, 10]}
pipeline_object = utils.grid_search_best_model(lr_pipeline,
                                               lr_pipeline_grid,
                                               X_train,
                                               y_train,
                                               scoring=scoring)
final_linear_model = pipeline_object.named_steps['reg']
print(final_linear_model.coef_)
print(final_linear_model.intercept_)
rutils.plot_model_2d_regression(pipeline_object, X_train, y_train)
rutils.regression_performance(pipeline_object, X_test, y_test)

lr_pipeline = pipeline.Pipeline([('features', kutils.GaussianFeatures()),
                                 ('reg', linear_model.LinearRegression())])
lr_pipeline_grid = {'features__n_centres': [15, 20, 30, 36]}
pipeline_object = utils.grid_search_best_model(lr_pipeline,
                                               lr_pipeline_grid,
                                               X_train,
                                               y_train,
                                               scoring=scoring)
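# hedged follow-up, mirroring the inspection calls on the polynomial pipeline:
final_linear_model = pipeline_object.named_steps['reg']
print(final_linear_model.coef_)
rutils.plot_model_2d_regression(pipeline_object, X_train, y_train)
rutils.regression_performance(pipeline_object, X_test, y_test)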
# ---- Example 4 ----
# (this excerpt begins mid-script; lpca and X_train1 come from earlier, elided
# feature-selection and PCA steps, as in Example 11 below)
pca_data = lpca.transform(X_train1)
print(pca_data.shape)

tsne = manifold.TSNE(n_components=2)
tsne_data = tsne.fit_transform(pca_data)
rutils.plot_data_3d_regression(tsne_data, y_train)

scoring = metrics.make_scorer(log_rmse, greater_is_better=False)

sns.distplot(y_train)
y_trans = np.log1p(y_train)
sns.distplot(y_trans)

knn_estimator = neighbors.KNeighborsRegressor()
knn_grid = {'n_neighbors': list(range(5, 15))}
final_model = utils.grid_search_best_model(knn_estimator,
                                           knn_grid,
                                           pca_data,
                                           y_trans,
                                           scoring=scoring)

X_test = house3[house_train.shape[0]:]
X_test1 = utils.select_features(lasso_selector, X_test)
pca_test_data = lpca.transform(X_test1)
pca_test_data.shape

house_test['SalePrice'] = np.expm1(final_model.predict(pca_test_data))
house_test.to_csv("C:\\Users\\Algorithmica\\Downloads\\submission.csv",
                  columns=["Id", "SalePrice"],
                  index=False)
# ---- (separate example) ----
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.1, random_state=1)
rutils.plot_data_2d_regression(X_train, y_train)

#add outliers in features
X_train[::10] = 4
#add outliers in target
y_train[::10] = 250
rutils.plot_data_2d_regression(X_train, y_train)

# Fit linear model
lr_estimator = linear_model.LinearRegression()
lr_grid = {'normalize': [True, False]}
lr_model = utils.grid_search_best_model(lr_estimator,
                                        lr_grid,
                                        X_train,
                                        y_train,
                                        scoring=scoring)
rutils.plot_model_2d_regression(lr_model,
                                X_train,
                                y_train,
                                title="LinearRegression")
rutils.regression_performance(lr_model, X_test, y_test)

# Robustly fit linear model with Huber Regressor algorithm
hr_estimator = linear_model.HuberRegressor()
hr_grid = {'epsilon': [1.1, 1.2, 1.3, 1.5]}
hr_model = utils.grid_search_best_model(hr_estimator,
                                        hr_grid,
                                        X_train,
                                        y_train,
                                        scoring=scoring)
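# hedged follow-up, mirroring the OLS evaluation above:
rutils.plot_model_2d_regression(hr_model,
                                X_train,
                                y_train,
                                title="HuberRegressor")
rutils.regression_performance(hr_model, X_test, y_test)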
# ---- (separate example) ----
X, y = cutils.generate_linear_synthetic_data_classification(n_samples=1000,
                                                            n_features=2,
                                                            n_classes=2,
                                                            weights=[0.4, 0.6],
                                                            class_sep=1.5)
#X, y = cutils.generate_nonlinear_synthetic_data_classification2(n_samples=1000, noise=0.1)

X_train, X_eval, y_train, y_eval = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=1)
cutils.plot_data_2d_classification(X_train, y_train)

lr_estimator = linear_model.LogisticRegression()
lr_grid = {'penalty': ['l1', 'l2'], 'C': [0.01, 0.001, 0.1, 0.3, 0.5, 0.7, 1]}
final_estimator = comutils.grid_search_best_model(lr_estimator,
                                                  lr_grid,
                                                  X_train,
                                                  y_train,
                                                  scoring='accuracy')
print(final_estimator.intercept_)
print(final_estimator.coef_)
cutils.plot_model_2d_classification(final_estimator, X_train, y_train)
cutils.performance_metrics_hard_binary_classification(final_estimator, X_eval,
                                                      y_eval)

#multi class classification
X, y = cutils.generate_linear_synthetic_data_classification(
    n_samples=1000,
    n_features=2,
    n_classes=4,
    weights=[0.3, 0.3, 0.2, 0.2],
    class_sep=1.5)
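# the excerpt ends here; a minimal sketch of the multiclass fit, assuming the
# same helpers as the binary case above:
final_estimator = comutils.grid_search_best_model(lr_estimator,
                                                  lr_grid,
                                                  X,
                                                  y,
                                                  scoring='accuracy')
cutils.plot_model_2d_classification(final_estimator, X, y)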
# ---- Example 7 ----
import common_utils as comutils
import classification_utils as cutils
from sklearn import model_selection, naive_bayes, preprocessing
import seaborn as sns

#2-d classification pattern
#X, y = cutils.generate_linear_synthetic_data_classification(n_samples=1000,
#                                                            n_features=2,
#                                                            n_classes=2,
#                                                            weights=[0.5, 0.5],
#                                                            class_sep=2)
X, y = cutils.generate_nonlinear_synthetic_data_classification2(n_samples=1000,
                                                                noise=0.1)
cutils.plot_data_2d_classification(X, y)

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=1)
cutils.plot_data_2d_classification(X_train, y_train)

sns.distplot(X_train[:, 0], hist=False)
sns.distplot(X_train[:, 1], hist=False)

#grid search for parameter values
gnb_estimator = naive_bayes.GaussianNB()
gnb_grid = {'priors': [None]}
final_estimator = comutils.grid_search_best_model(gnb_estimator, gnb_grid,
                                                  X_train, y_train)
cutils.plot_model_2d_classification(final_estimator, X_train, y_train)

final_estimator.predict_proba(X_test)
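# hedged follow-up, mirroring the evaluation helper used in the earlier
# logistic-regression example:
cutils.performance_metrics_hard_binary_classification(final_estimator, X_test,
                                                      y_test)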
# ---- (separate example; tsne and pca_data come from elided earlier steps) ----
tsne_data = tsne.fit_transform(pca_data)
rutils.plot_data_3d_regression(tsne_data, y_train)

scoring = metrics.make_scorer(log_rmse, greater_is_better=False)

sns.distplot(y_train)
y_trans = np.log1p(y_train)
sns.distplot(y_trans)

kernel_svm = svm.SVR(kernel="rbf")
kernel_svm_grid = {
    'C': [0.2, 0.5, 10, 20, 50],
    'gamma': [0.0001, 0.0005, 0.001, 0.005]
}
final_kernel_svm_model = utils.grid_search_best_model(kernel_svm,
                                                      kernel_svm_grid,
                                                      pca_data,
                                                      y_trans,
                                                      scoring=scoring)

X_test = house3[house_train.shape[0]:]
X_test1 = utils.select_features(lasso_selector, X_test)
pca_test_data = lpca.transform(X_test1)
pca_test_data.shape

house_test['SalePrice'] = np.expm1(
    final_kernel_svm_model.predict(pca_test_data))
house_test.to_csv("C:\\Users\\Algorithmica\\Downloads\\submission.csv",
                  columns=["Id", "SalePrice"],
                  index=False)
# ---- Example 9 ----
scoring = metrics.make_scorer(rutils.rmse, greater_is_better=False)

#linear pattern in 2d
X, y = rutils.generate_linear_synthetic_data_regression(n_samples=100,
                                                        n_features=1,
                                                        n_informative=1,
                                                        noise=200)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.1, random_state=1)
rutils.plot_data_2d_regression(X_train, y_train)

linear_estimator = linear_model.LinearRegression()
linear_grid = {'normalize': [False]}
final_linear_model = utils.grid_search_best_model(linear_estimator,
                                                  linear_grid,
                                                  X_train,
                                                  y_train,
                                                  scoring=scoring)
print(final_linear_model.coef_)
print(final_linear_model.intercept_)
rutils.plot_model_2d_regression(final_linear_model, X_train, y_train)
rutils.regression_performance(final_linear_model, X_test, y_test)

lasso_estimator = linear_model.Lasso(max_iter=5000)
lasso_grid = {'alpha': [0, 0.1, 0.5, 1.0, 10]}
final_lasso_model = utils.grid_search_best_model(lasso_estimator,
                                                 lasso_grid,
                                                 X_train,
                                                 y_train,
                                                 scoring=scoring)
print(final_lasso_model.coef_)
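# hedged follow-up, mirroring the OLS evaluation above:
rutils.plot_model_2d_regression(final_lasso_model, X_train, y_train)
rutils.regression_performance(final_lasso_model, X_test, y_test)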
# ---- Example 10 ----
import sys
sys.path.append("E:/utils")

import common_utils as comutils
import classification_utils as cutils
from sklearn import preprocessing, linear_model, pipeline

#X, y = cutils.generate_nonlinear_synthetic_data_classification2(n_samples=1000,
#                                                                noise=0.1)
X, y = cutils.generate_nonlinear_synthetic_data_classification3(n_samples=1000,
                                                                noise=0.1)

cutils.plot_data_2d_classification(X, y)

stages = [('features', preprocessing.PolynomialFeatures()),
          ('perceptron', linear_model.Perceptron(max_iter=1000))]
perceptron_pipeline = pipeline.Pipeline(stages)
perceptron_pipeline_grid = {
    'perceptron__penalty': ['l1'],
    'perceptron__alpha': [0, 0.1, 0.3, 0.5],
    'features__degree': [2, 3]
}
pipeline_object = comutils.grid_search_best_model(perceptron_pipeline,
                                                  perceptron_pipeline_grid, X,
                                                  y)
final_estimator = pipeline_object.named_steps['perceptron']
print(final_estimator.intercept_)
print(final_estimator.coef_)
cutils.plot_model_2d_classification(pipeline_object, X, y)
# ---- Example 11 ----
# (lasso_selector and X_train come from elided earlier steps)
X_train1 = utils.select_features(lasso_selector, X_train)

utils.corr_heatmap(X_train1)
lpca = decomposition.PCA(0.95)
lpca.fit(X_train1)
print(np.cumsum(lpca.explained_variance_ratio_))
pca_data = lpca.transform(X_train1)
print(pca_data.shape)

tsne = manifold.TSNE(n_components=2)
tsne_data = tsne.fit_transform(pca_data)
rutils.plot_data_3d_regression(tsne_data, y_train)

scoring = metrics.make_scorer(log_rmse, greater_is_better=False)

sns.distplot(y_train)
y_trans = np.log1p(y_train)
sns.distplot(y_trans)

rf_estimator = ensemble.RandomForestRegressor(random_state=100)
rf_grid = {
    'n_estimators': list(range(100, 501, 200)),
    'max_features': [8, 10, 15],
    'max_depth': [3, 5, 7]
}
final_rf_model = utils.grid_search_best_model(rf_estimator,
                                              rf_grid,
                                              pca_data,
                                              y_trans,
                                              scoring=scoring)

X_test = house3[house_train.shape[0]:]
X_test1 = utils.select_features(lasso_selector, X_test)
pca_test_data = lpca.transform(X_test1)
pca_test_data.shape

house_test['SalePrice'] = np.expm1(final_rf_model.predict(pca_test_data))
house_test.to_csv("C:\\Users\\Algorithmica\\Downloads\\submission.csv",
                  columns=["Id", "SalePrice"],
                  index=False)
# ---- Example 12 ----
#nonlinear pattern in 2d
X, y = rutils.generate_nonlinear_synthetic_data_regression(n_samples=200,
                                                           n_features=1)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.1, random_state=1)
rutils.plot_data_2d_regression(X_train, y_train)

rf_estimator = ensemble.RandomForestRegressor()
rf_grid = {
    'n_estimators': list(range(10, 200, 20)),
    'max_depth': list(range(3, 6))
}
final_rf_model = utils.grid_search_best_model(rf_estimator,
                                              rf_grid,
                                              X_train,
                                              y_train,
                                              scoring=scoring)
rutils.plot_model_2d_regression(final_rf_model, X_train, y_train)
rutils.regression_performance(final_rf_model, X_test, y_test)

et_estimator = ensemble.ExtraTreesRegressor()
et_grid = {
    'n_estimators': list(range(10, 200, 20)),
    'max_depth': list(range(3, 6))
}
final_et_model = utils.grid_search_best_model(et_estimator,
                                              et_grid,
                                              X_train,
                                              y_train,
                                              scoring=scoring)
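# hedged follow-up, mirroring the random-forest evaluation above:
rutils.plot_model_2d_regression(final_et_model, X_train, y_train)
rutils.regression_performance(final_et_model, X_test, y_test)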
# ---- Example 13 ----
# (the first call is truncated in this excerpt; reconstructed from Example 7
# and left commented out, since the nonlinear call below overrides it)
#X, y = cutils.generate_linear_synthetic_data_classification(n_samples=1000,
#                                                            n_features=2,
#                                                            n_classes=2,
#                                                            weights=[0.5, 0.5],
#                                                            class_sep=2)
X, y = cutils.generate_nonlinear_synthetic_data_classification2(n_samples=1000,
                                                                noise=0.1)
cutils.plot_data_2d_classification(X, y)

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=1)
cutils.plot_data_2d_classification(X_train, y_train)

#grid search for parameter values
dt_estimator = tree.DecisionTreeClassifier()
dt_grid = {'criterion': ['gini', 'entropy'], 'max_depth': list(range(1, 9))}
final_estimator = comutils.grid_search_best_model(dt_estimator, dt_grid,
                                                  X_train, y_train)
cutils.plot_model_2d_classification(final_estimator, X_train, y_train)

knn_estimator = neighbors.KNeighborsClassifier()
knn_grid = {
    'n_neighbors': list(range(1, 21)),
    'weights': ['uniform', 'distance']
}
final_estimator = comutils.grid_search_best_model(knn_estimator, knn_grid,
                                                  X_train, y_train)
cutils.plot_model_2d_classification(final_estimator, X_train, y_train)

rf_estimator = ensemble.RandomForestClassifier()
rf_grid = {
    'max_depth': list(range(5, 10)),
    'n_estimators': list(range(1, 100, 20))
}
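# the excerpt cuts off mid-call; a minimal completion sketch, mirroring the
# grid-search pattern above:
final_estimator = comutils.grid_search_best_model(rf_estimator, rf_grid,
                                                  X_train, y_train)
cutils.plot_model_2d_classification(final_estimator, X_train, y_train)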
# ---- (separate example) ----
X, y = datasets.make_classification(n_samples=100,
                                    n_features=20,
                                    n_informative=2,
                                    n_redundant=4,
                                    n_repeated=0,
                                    n_classes=2)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=1)

np.corrcoef(X_train, rowvar=False)

#overfit
perceptron_estimator = linear_model.Perceptron(max_iter=1000)
perceptron_grid = {'alpha': [0]}
final_estimator = cutils.grid_search_best_model(perceptron_estimator,
                                                perceptron_grid, X_train,
                                                y_train)
print(final_estimator.intercept_)
print(final_estimator.coef_)

#overfit control
perceptron_estimator = linear_model.Perceptron(max_iter=1000)
perceptron_grid = {
    'penalty': ['l1', 'l2'],
    'alpha': [0, 0.00001, 0.0001, 0.0005, 0.001, 0.01, 0.1, 0.2, 0.5, 1, 3]
}
final_estimator = cutils.grid_search_best_model(perceptron_estimator,
                                                perceptron_grid, X_train,
                                                y_train)
print(final_estimator.intercept_)
print(final_estimator.coef_)
# ---- (separate example) ----
# (the split call is truncated in this excerpt; reconstructed from the names
# used below)
X_train, X_eval, y_train, y_eval = model_selection.train_test_split(
    train1, y, test_size=0.1, random_state=1)

#logistic regression pipeline
stages = [('imputer', impute.SimpleImputer()),
          ('zv_filter', feature_selection.VarianceThreshold()),
          ('classifier', linear_model.LogisticRegression())]
clf_pipeline = pipeline.Pipeline(stages)  # renamed so the 'pipeline' module is not shadowed
pipeline_grid = {
    'imputer__strategy': ['mean', 'median'],
    'zv_filter__threshold': [0, 0.5],
    'classifier__C': [0.001, 0.01, 0.1, 0.2, 0.5],
    'classifier__penalty': ['l1', 'l2']
}
pipeline_generated = utils.grid_search_best_model(clf_pipeline,
                                                  pipeline_grid,
                                                  X_train,
                                                  y_train,
                                                  scoring="roc_auc")
final_estimator = pipeline_generated.named_steps['classifier']
print(pipeline_generated.score(X_eval, y_eval))

test = pd.read_csv(os.path.join(dir, 'test.csv'))
print(test.info())
print(test.columns)

test1 = test.iloc[:, 1:]
test['Survived'] = np.round(pipeline_generated.predict_proba(test1)[:, 1], 2)
test.to_csv(os.path.join(dir, 'submission.csv'),
            columns=['PassengerId', 'Survived'],
            index=False)
# ---- (separate example) ----
import kernel_utils as kutils
from sklearn import metrics, linear_model, svm, model_selection, preprocessing, pipeline

scoring = metrics.make_scorer(rutils.rmse, greater_is_better=False)

X, y = rutils.generate_nonlinear_synthetic_data_regression(n_samples=200, n_features=1)
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.1, random_state=1)
rutils.plot_data_2d_regression(X_train, y_train)

stages = [
            ('features', preprocessing.PolynomialFeatures()),
            ('reg', linear_model.LinearRegression())
        ]
lr_pipeline = pipeline.Pipeline(stages)
lr_pipeline_grid  = {'reg__normalize':[True, False], 'features__degree':[2,3,5,10]}
pipeline_object = utils.grid_search_best_model(lr_pipeline, lr_pipeline_grid, X_train, y_train, scoring = scoring)
final_linear_model = pipeline_object.named_steps['reg']
print(final_linear_model.coef_)
print(final_linear_model.intercept_)
rutils.plot_model_2d_regression(pipeline_object, X_train, y_train)
rutils.regression_performance(pipeline_object, X_test, y_test)

stages = [
            ('features', kutils.GaussianFeatures() ),
            ('reg', linear_model.LinearRegression())
        ]
lr_pipeline = pipeline.Pipeline(stages)
lr_pipeline_grid  = {'reg__normalize':[True, False], 'features__n_centres':[15, 20, 30, 36] }
pipeline_object = utils.grid_search_best_model(lr_pipeline, lr_pipeline_grid, X_train, y_train, scoring = scoring)
final_linear_model = pipeline_object.named_steps['reg']
print(final_linear_model.coef_)
# ---- (separate example) ----
scoring = metrics.make_scorer(rutils.rmse, greater_is_better=False)

X, y = rutils.generate_nonlinear_synthetic_data_regression(n_samples=200,
                                                           n_features=1)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.1, random_state=1)
rutils.plot_data_2d_regression(X_train, y_train)

kernel_lr = kernel_ridge.KernelRidge(kernel="rbf")
kernel_lr_grid = {
    'alpha': [0.0001, 0.01, 0.05, 0.2, 0.5, 1],
    'gamma': [0.01, 0.1, 1, 2, 3, 4, 5, 10]
}
final_kernel_lr_model = utils.grid_search_best_model(kernel_lr,
                                                     kernel_lr_grid,
                                                     X_train,
                                                     y_train,
                                                     scoring=scoring)
rutils.plot_model_2d_regression(final_kernel_lr_model, X_train, y_train)
rutils.regression_performance(final_kernel_lr_model, X_test, y_test)

kernel_svm = svm.SVR(kernel="rbf")
kernel_svm_grid = {
    'C': [0.2, 0.5, 10, 20, 50, 100],
    'gamma': [0.01, 0.1, 1, 2, 3, 4, 5, 10]
}
final_kernel_svm_model = utils.grid_search_best_model(kernel_svm,
                                                      kernel_svm_grid,
                                                      X_train,
                                                      y_train,
                                                      scoring=scoring)
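# hedged follow-up, mirroring the kernel-ridge evaluation above:
rutils.plot_model_2d_regression(final_kernel_svm_model, X_train, y_train)
rutils.regression_performance(final_kernel_svm_model, X_test, y_test)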
# ---- Example 18 ----
#one-hot encoding
#One-hot encoding is used when a categorical column has no ordinal relationship.
#Ordinal variables are categories with a natural order, so they can be ranked
#from smallest to largest on some characteristic; one-hot columns carry no such order.
X_train = cutils.ohe(titanic_train1, cat_features)
#get_dummies converts a categorical variable into dummy/indicator variables.
#A dummy (indicator) variable is a numeric 0/1 variable that represents
#categorical data such as gender, race, or political affiliation.
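# a toy illustration of indicator encoding (hypothetical frame, not the
# Titanic data):
demo = pd.get_dummies(pd.DataFrame({'Sex': ['male', 'female', 'male']}))
print(demo)  # 0/1 columns Sex_female and Sex_male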
y_train = titanic_train['Survived']

#build model
knn_pipelines_stages = [('scaler', preprocessing.StandardScaler()),
                        ('knn', neighbors.KNeighborsClassifier())]
knn_pipeline = pipeline.Pipeline(knn_pipelines_stages)
knn_pipeline_grid = {'knn__n_neighbors': list(range(1, 10))}
knn_pipeline_model = cutils.grid_search_best_model(knn_pipeline,
                                                   knn_pipeline_grid, X_train,
                                                   y_train)

titanic_test = pd.read_csv(os.path.join(dir, 'test.csv'))
titanic_test1 = cutils.drop_features(
    titanic_test, ['PassengerId', 'Name', 'Ticket', 'Cabin'])
cutils.cast_to_cat(titanic_test1, ['Sex', 'Embarked', 'Pclass'])
cont_features = cutils.get_continuous_features(titanic_test1)
cat_features = cutils.get_categorical_features(titanic_test1)
titanic_test1[cat_features] = cat_imputers.transform(
    titanic_test1[cat_features])
titanic_test1[cont_features] = con_imputers.transform(
    titanic_test1[cont_features])
X_test = cutils.ohe(titanic_test1, cat_features)
titanic_test['Survived'] = knn_pipeline_model.predict(X_test)
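# a hedged final step, mirroring the submission pattern used earlier:
titanic_test.to_csv(os.path.join(dir, 'submission.csv'),
                    columns=['PassengerId', 'Survived'],
                    index=False)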