X, y = cutils.generate_linear_synthetic_data_classification(n_samples=1000,
                                                            n_features=2,
                                                            n_classes=2,
                                                            weights=[0.8, 0.2],
                                                            class_sep=1.0)

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=1)
cutils.plot_data_2d_classification(X_train, y_train)

lr_estimator = linear_model.LogisticRegression()
lr_grid = {'penalty': ['l1', 'l2'], 'C': [0.01, 0.001, 0.1, 0.3, 0.5, 0.7, 1]}
final_estimator = cutils.grid_search_best_model(lr_estimator,
                                                lr_grid,
                                                X_train,
                                                y_train,
                                                scoring='roc_auc')
print(final_estimator.intercept_)
print(final_estimator.coef_)
cutils.plot_model_2d_classification(final_estimator, X_train, y_train)

final_estimator.predict_proba(X_test)
cutils.performance_metrics_soft_binary_classification(final_estimator, X_test,
                                                      y_test)
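
Since grid_search_best_model comes from the local classification_utils module, its body is not shown in these snippets. A minimal sketch of what such a helper plausibly wraps (assumed behavior: a refitting GridSearchCV that prints and returns the best estimator; the cv=10 default is an assumption):

from sklearn import model_selection

def grid_search_best_model_sketch(estimator, grid, X, y, scoring='accuracy', cv=10):
    #hypothetical stand-in for cutils.grid_search_best_model
    grid_estimator = model_selection.GridSearchCV(estimator, grid,
                                                  scoring=scoring, cv=cv,
                                                  refit=True)
    grid_estimator.fit(X, y)
    print(grid_estimator.best_params_)
    print(grid_estimator.best_score_)
    return grid_estimator.best_estimator_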

#imbalanced binary classification
X, y = cutils.generate_linear_synthetic_data_classification(
    n_samples=1000,
    n_features=2,
    n_classes=2,
    weights=[0.9, 0.1],  #assumed values; the original snippet is truncated here
    class_sep=1.0)
Example #2
import sys
sys.path.append("E:/New Folder/utils")

import classification_utils as cutils
from sklearn import model_selection, linear_model, dummy

#binary classification
X, y = cutils.generate_linear_synthetic_data_classification(n_samples=1000, n_features=2, n_classes=2, weights=[0.4,0.6], class_sep=1.5)
#X, y = cutils.generate_nonlinear_synthetic_data_classification2(n_samples=1000, noise=0.1)


X_train, X_eval, y_train, y_eval = model_selection.train_test_split(X, y, test_size=0.2, random_state=1)
cutils.plot_data_2d_classification(X_train, y_train)

lr_estimator = linear_model.LogisticRegression()
lr_grid  = {'penalty':['l1', 'l2'], 'C':[0.01, 0.001, 0.1, 0.3, 0.5, 0.7, 1] }
final_estimator = cutils.grid_search_best_model(lr_estimator, lr_grid, X_train, y_train, scoring='accuracy')
print(final_estimator.intercept_)
print(final_estimator.coef_)
cutils.plot_model_2d_classification(final_estimator, X_train, y_train)
cutils.performance_metrics_hard_binary_classification(final_estimator, X_eval, y_eval)
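
plot_model_2d_classification is also a local helper; a plausible minimal version draws the decision regions of a fitted 2-D classifier on a mesh grid (hypothetical sketch, not the actual utility):

import numpy as np
import matplotlib.pyplot as plt

def plot_model_2d_sketch(estimator, X, y, step=0.05):
    #hypothetical stand-in for cutils.plot_model_2d_classification
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, step),
                         np.arange(y_min, y_max, step))
    Z = estimator.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    plt.contourf(xx, yy, Z, alpha=0.3)
    plt.scatter(X[:, 0], X[:, 1], c=y, edgecolors='k')
    plt.show()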

#multi class classification
X, y = cutils.generate_linear_synthetic_data_classification(n_samples=1000, n_features=2, n_classes=4, weights=[0.3,0.3,0.2,0.2], class_sep=1.5)
#X, y = cutils.generate_nonlinear_synthetic_data_classification2(n_samples=1000, noise=0.1)


X_train, X_eval, y_train, y_eval = model_selection.train_test_split(X, y, test_size=0.2, random_state=1)
cutils.plot_data_2d_classification(X_train, y_train)

lr_estimator = linear_model.LogisticRegression()
lr_grid  = {'penalty':['l1', 'l2'], 'C':[0.01, 0.001, 0.1, 0.3, 0.5, 0.7, 1] }
Example #3
cat_imputers = utils.get_categorical_imputers(titanic, cat_features)
titanic[cat_features] = cat_imputers.transform(titanic[cat_features])
cont_imputers = utils.get_continuous_imputers(titanic, cont_features)
titanic[cont_features] = cont_imputers.transform(titanic[cont_features])

#one hot encoding
titanic = utils.ohe(titanic, cat_features)

#scale the data
scaler = preprocessing.StandardScaler()
tmp = scaler.fit_transform(titanic)
titanic = pd.DataFrame(tmp, columns=titanic.columns)

titanic_train1 = titanic[:titanic_train.shape[0]]
y_train = titanic_train['Survived']

#feature selection
rf_estimator = ensemble.RandomForestClassifier()
rf_grid  = {'max_depth':list(range(1,9)), 'n_estimators':list(range(1,300,100)) }
rf_final_estimator = cutils.grid_search_best_model(rf_estimator, rf_grid, titanic_train1, y_train)
X_train = utils.select_features(rf_final_estimator, titanic_train1, threshold='median')

kernel_svm_estimator = svm.SVC(kernel='rbf')
kernel_svm_grid = {'gamma':[0.001, 0.01, 0.05, 0.1, 1], 'C':[10, 100] }
svm_final_estimator = cutils.grid_search_best_model(kernel_svm_estimator, kernel_svm_grid, X_train, y_train)

titanic_test1 = titanic[titanic_train.shape[0]:]
X_test = utils.select_features(rf_final_estimator, titanic_test1, threshold='median')

titanic_test['Survived'] = svm_final_estimator.predict(X_test)
titanic_test.to_csv(os.path.join(dir, 'submission.csv'), columns=['PassengerId', 'Survived'], index=False)
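
utils.select_features above is another local helper; assuming it wraps sklearn's SelectFromModel on a prefit estimator, a minimal sketch would be:

from sklearn import feature_selection

def select_features_sketch(fitted_estimator, X, threshold='median'):
    #hypothetical stand-in for utils.select_features
    selector = feature_selection.SelectFromModel(fitted_estimator,
                                                 prefit=True,
                                                 threshold=threshold)
    return selector.transform(X)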
Example #4
cv_scores = model_selection.cross_val_score(knn_estimator,
                                            X_train,
                                            y_train,
                                            cv=10)
print(np.mean(cv_scores))
train_score = knn_estimator.score(X_train, y_train)
print(train_score)
cutils.plot_model_2d_classification(knn_estimator, X_train, y_train)

#overfitted learning in knn (k=1 memorizes the training data)
knn_estimator = neighbors.KNeighborsClassifier(n_neighbors=1)
knn_estimator.fit(X_train, y_train)
cv_scores = model_selection.cross_val_score(knn_estimator,
                                            X_train,
                                            y_train,
                                            cv=10)
print(np.mean(cv_scores))
train_score = knn_estimator.score(X_train, y_train)
print(train_score)
cutils.plot_model_2d_classification(knn_estimator, X_train, y_train)

#grid search tuning
knn_estimator = neighbors.KNeighborsClassifier()
knn_grid = {'n_neighbors': list(range(1, 200))}
cutils.grid_search_plot_one_parameter_curves(knn_estimator, knn_grid, X_train,
                                             y_train)
cutils.grid_search_plot_models_classification(knn_estimator, knn_grid, X_train,
                                              y_train)
final_estimator = cutils.grid_search_best_model(knn_estimator, knn_grid,
                                                X_train, y_train)
Example #5
train = pd.read_csv(os.path.join(dir, 'train.csv'))
print(train.info())
print(train.columns)

sns.countplot(x='target', data=train)

#drop the id and target columns
train1 = train.iloc[:, 2:]
y = train['target'].astype(int)

X_train, X_eval, y_train, y_eval = model_selection.train_test_split(train1, y, test_size=0.1, random_state=1)

stages = [('imputer', preprocessing.Imputer()),  #pre-0.20 sklearn API; newer versions use sklearn.impute.SimpleImputer
          ('zv_filter', feature_selection.VarianceThreshold()),
          ('feature_selector', feature_selection.SelectKBest(score_func=feature_selection.f_classif)),
          ('classifier', linear_model.LogisticRegression())]
pipeline_ml = pipeline.Pipeline(stages)
pipeline_grid = {
    'feature_selector__k': [70, 75, 100],
    'classifier__C': [0.001, 0.01, 0.1, 0.2, 0.5],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__class_weight': ['balanced', None]
}
pipeline_generated = cutils.grid_search_best_model(pipeline_ml, pipeline_grid, X_train, y_train, scoring="roc_auc")
final_estimator = pipeline_generated.named_steps['classifier']
print(pipeline_generated.score(X_eval, y_eval))
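
A follow-up worth doing here (assumed step, not in the original snippet) is checking which columns SelectKBest actually kept. The mask has to be applied after the VarianceThreshold step, since that step may already have dropped columns:

zv = pipeline_generated.named_steps['zv_filter']
kept_after_zv = train1.columns[zv.get_support()]
kbest = pipeline_generated.named_steps['feature_selector']
print(kept_after_zv[kbest.get_support()])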

test = pd.read_csv(os.path.join(dir, 'test.csv'))
print(test.info())
print(test.columns)

test1 = test.iloc[:,1:] 
test['target'] = np.round(pipeline_generated.predict_proba(test1)[:,1], 2)
test.to_csv(os.path.join(dir, 'submission.csv'), columns=['id', 'target'], index=False)
Example #6
X, y = cutils.generate_linear_synthetic_data_classification(
    n_samples=1000,
    n_features=2,
    n_classes=2,
    weights=[0.05, 0.95],
    class_sep=0.1)

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=1)
cutils.plot_data_2d_classification(X_train, y_train)

lr_estimator = linear_model.LogisticRegression()
lr_grid = {'penalty': ['l1', 'l2'], 'C': [0.01, 0.001, 0.1, 0.3, 0.5, 0.7, 1]}
final_estimator = cutils.grid_search_best_model(lr_estimator,
                                                lr_grid,
                                                X_train,
                                                y_train,
                                                scoring='f1')
print(final_estimator.intercept_)
print(final_estimator.coef_)
cutils.plot_model_2d_classification(final_estimator, X_train, y_train)

cutils.performance_metrics_hard_binary_classification(final_estimator, X_test,
                                                      y_test)

dummy_estimator = dummy.DummyClassifier(strategy='most_frequent')
dummy_estimator.fit(X_train, y_train)
dummy_estimator.predict(X_test)
cutils.performance_metrics_hard_binary_classification(dummy_estimator, X_test,
                                                      y_test)
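
performance_metrics_hard_binary_classification is local to cutils; a plausible minimal version is sketched below. On this 95/5 class balance it makes clear why the most-frequent dummy baseline scores deceptively well on accuracy while failing on the minority class:

from sklearn import metrics

def hard_binary_metrics_sketch(estimator, X, y):
    #hypothetical stand-in for cutils.performance_metrics_hard_binary_classification
    y_pred = estimator.predict(X)
    print(metrics.confusion_matrix(y, y_pred))
    print(metrics.classification_report(y, y_pred))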
Example #7
X, y = cutils.generate_nonlinear_synthetic_data_classification3(n_samples=1000,
                                                                noise=0.1)

cutils.plot_data_2d_classification(X, y)

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=1)
cutils.plot_data_2d_classification(X_train, y_train)

#perceptron algorithm
stages = [('features', preprocessing.PolynomialFeatures()),
          ('clf', linear_model.Perceptron(max_iter=1000))]
perceptron_pipeline = pipeline.Pipeline(stages)
perceptron_pipeline_grid = {'features__degree': [2, 3]}  #gamma is not a PolynomialFeatures parameter; degree is its tunable knob (cf. the KernelTransformer variant in Example #15)
pipeline_object = cutils.grid_search_best_model(perceptron_pipeline,
                                                perceptron_pipeline_grid,
                                                X_train, y_train)
final_estimator = pipeline_object.named_steps['clf']
print(final_estimator.intercept_)
print(final_estimator.coef_)
cutils.plot_model_2d_classification(pipeline_object, X_train, y_train)

#logistic regression algorithm
stages = [('features', preprocessing.PolynomialFeatures()),
          ('clf', linear_model.LogisticRegression())]

lr_pipeline = pipeline.Pipeline(stages)
lr_pipeline_grid = {'features__degree': [2, 3]}  #gamma is not a PolynomialFeatures parameter
pipeline_object = cutils.grid_search_best_model(lr_pipeline, lr_pipeline_grid,
                                                X_train, y_train)
final_estimator = pipeline_object.named_steps['clf']
Example #8
titanic = utils.ohe(titanic, cat_features)

#scale the data
scaler = preprocessing.StandardScaler()
tmp = scaler.fit_transform(titanic)
titanic = pd.DataFrame(tmp, columns=titanic.columns)

titanic_train1 = titanic[:titanic_train.shape[0]]
y_train = titanic_train['Survived']

rf_estimator = ensemble.RandomForestClassifier()
rf_grid = {
    'max_depth': list(range(1, 9)),
    'n_estimators': list(range(1, 300, 100))
}
rf_final_estimator = cutils.grid_search_best_model(rf_estimator, rf_grid,
                                                   titanic_train1, y_train)
X_train = utils.select_features(rf_final_estimator,
                                titanic_train1,
                                threshold='mean')

tpot_estimator = tpot.TPOTClassifier(
    generations=10,
    population_size=40,
    verbosity=2,
    early_stop=3,
    random_state=100,
    cv=5,
    scoring='accuracy',
    periodic_checkpoint_folder='E:/checkpoint')
tpot_estimator.fit(X_train, y_train)
print(tpot_estimator.score(X_train, y_train))
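
TPOT can also write out the winning pipeline as a standalone script via its standard export API (the file name here is illustrative):

tpot_estimator.export('tpot_best_pipeline.py')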
Example #9
import classification_utils as cutils
from sklearn import model_selection, ensemble, tree
import xgboost as xgb

X_train, y_train = cutils.generate_nonlinear_synthetic_data_classification3(
    n_samples=500, noise=0.25)
cutils.plot_data_2d_classification(X_train, y_train)

dt_estimator = tree.DecisionTreeClassifier()
ada_estimator = ensemble.AdaBoostClassifier(dt_estimator)
ada_grid = {
    'base_estimator__max_depth': [1, 2, 3],
    'n_estimators': list(range(50, 150, 30)),
    'learning_rate': [0.1, 0.2, 0.5, 1.0]
}
final_estimator = cutils.grid_search_best_model(ada_estimator, ada_grid,
                                                X_train, y_train)

gb_estimator = ensemble.GradientBoostingClassifier()
gb_grid = {
    'max_depth': [1, 2, 3],
    'n_estimators': list(range(50, 150, 30)),
    'learning_rate': [0.1, 0.2, 0.5, 1.0]
}
final_estimator = cutils.grid_search_best_model(gb_estimator, gb_grid, X_train,
                                                y_train)

xgb_estimator = xgb.XGBClassifier()
xgb_grid = {
    'max_depth': [1, 2, 3],
    'n_estimators': list(range(50, 150, 30)),
    'learning_rate': [0.1, 0.2, 0.5, 1.0],
}

Example #10

#snippet start truncated; assumed opening, parallel to the cross_val_score calls in Example #4
cv_scores = model_selection.cross_val_score(dt_estimator,
                                            X_train,
                                            y_train,
                                            cv=10)
print(np.mean(cv_scores))
train_score = dt_estimator.score(X_train, y_train)
print(train_score)

#visualize the decision tree
X_df = pd.DataFrame(X_train, columns=['X0', 'X1'])
dot_data = io.StringIO()
tree.export_graphviz(dt_estimator,
                     out_file=dot_data,
                     feature_names=X_df.columns)
graph = pydot.graph_from_dot_data(dot_data.getvalue())[0]
dir = 'E:/'
graph.write_pdf(os.path.join(dir, "tree.pdf"))

dt_estimator = tree.DecisionTreeClassifier()
dt_grid = {'max_depth': list(range(1, 9))}
cutils.grid_search_plot_one_parameter_curves(dt_estimator, dt_grid, X_train,
                                             y_train)
cutils.grid_search_plot_models_classification(dt_estimator, dt_grid, X_train,
                                              y_train)
final_estimator = cutils.grid_search_best_model(dt_estimator, dt_grid, X_train,
                                                y_train)

dt_grid_estimator = model_selection.GridSearchCV(dt_estimator,
                                                 dt_grid,
                                                 cv=10,
                                                 refit=True)
dt_grid_estimator.fit(X_train, y_train)
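
After the manual GridSearchCV fit, the usual follow-up (assumed here, not part of the original snippet) is to read off the tuned model:

print(dt_grid_estimator.best_params_)
print(dt_grid_estimator.best_score_)
final_estimator = dt_grid_estimator.best_estimator_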
Example #11
import classification_utils as cutils
from sklearn import model_selection, naive_bayes, preprocessing
import seaborn as sns

#2-d classification pattern
#X, y = cutils.generate_linear_synthetic_data_classification(n_samples=1000, n_features=2, n_classes=2, weights=[0.5, 0.5], class_sep=2)
X, y = cutils.generate_nonlinear_synthetic_data_classification2(n_samples=1000,
                                                                noise=0.1)
cutils.plot_data_2d_classification(X, y)

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=1)
cutils.plot_data_2d_classification(X_train, y_train)

sns.distplot(X_train[:, 0], hist=False)
sns.distplot(X_train[:, 1], hist=False)

#grid search for parameter values
gnb_estimator = naive_bayes.GaussianNB()
gnb_grid = {'priors': [None]}
final_estimator = cutils.grid_search_best_model(gnb_estimator, gnb_grid,
                                                X_train, y_train)
cutils.plot_model_2d_classification(final_estimator, X_train, y_train)

final_estimator.predict_proba(X_test)
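
The class probabilities from predict_proba can be scored directly; for instance (assumed follow-up, not in the original snippet):

from sklearn import metrics
proba = final_estimator.predict_proba(X_test)[:, 1]
print(metrics.roc_auc_score(y_test, proba))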
Example #12
import sys
sys.path.append("E:/New Folder/utils")

import classification_utils as cutils
from sklearn import preprocessing, linear_model, pipeline

#X, y = cutils.generate_nonlinear_synthetic_data_classification2(n_samples=1000, noise=0.1)
X, y = cutils.generate_nonlinear_synthetic_data_classification3(n_samples=1000,
                                                                noise=0.1)

cutils.plot_data_2d_classification(X, y)

stages = [('features', preprocessing.PolynomialFeatures()),
          ('perceptron', linear_model.Perceptron(max_iter=1000))]
perceptron_pipeline = pipeline.Pipeline(stages)
perceptron_pipeline_grid = {
    'perceptron__penalty': ['l1'],
    'perceptron__alpha': [0, 0.1, 0.3, 0.5],
    'features__degree': [2, 3]
}
pipeline_object = cutils.grid_search_best_model(perceptron_pipeline,
                                                perceptron_pipeline_grid, X, y)
final_estimator = pipeline_object.named_steps['perceptron']
print(final_estimator.intercept_)
print(final_estimator.coef_)
cutils.plot_model_2d_classification(pipeline_object, X, y)
Example #13

#snippet start truncated; reconstructed from the parallel imputation code in Example #17
cont_imputers = utils.get_continuous_imputers(titanic_train1, cont_features)
titanic_train1[cont_features] = cont_imputers.transform(
    titanic_train1[cont_features])

#adding new levels
#titanic_train['Pclass'] = titanic_train['Pclass'].cat.add_categories([4,5])

#one hot encoding
X_train = utils.ohe(titanic_train1, cat_features)
y_train = titanic_train['Survived']

#build model
knn_pipeline_stages = [('scaler', preprocessing.StandardScaler()),
                       ('knn', neighbors.KNeighborsClassifier())]
knn_pipeline = pipeline.Pipeline(knn_pipeline_stages)
knn_pipeline_grid = {'knn__n_neighbors': list(range(1, 10))}
knn_pipeline_model = cutils.grid_search_best_model(knn_pipeline,
                                                   knn_pipeline_grid, X_train,
                                                   y_train)

titanic_test = pd.read_csv(os.path.join(dir, 'test.csv'))

print(titanic_test.shape)
print(titanic_test.info())

titanic_test1 = utils.drop_features(titanic_test,
                                    ['PassengerId', 'Name', 'Ticket', 'Cabin'])

utils.cast_to_cat(titanic_test1, ['Sex', 'Pclass', 'Embarked'])

cat_features = utils.get_categorical_features(titanic_test1)
print(cat_features)
cont_features = utils.get_continuous_features(titanic_test1)
Example #14
#snippet start truncated; assumed opening based on the similar generators elsewhere in these snippets
X, y = cutils.generate_linear_synthetic_data_classification(n_samples=1000,
                                                            n_features=2,
                                                            n_classes=2,
                                                            weights=[0.5, 0.5],
                                                            class_sep=2)
#X, y = cutils.generate_nonlinear_synthetic_data_classification2(n_samples=1000, noise=0.1)
cutils.plot_data_2d_classification(X, y)

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=1)
cutils.plot_data_2d_classification(X_train, y_train)

#Perceptron algorithm
perceptron_estimator = linear_model.Perceptron()
perceptron_grid = {
    'penalty': ['l1', 'l2'],
    'alpha': [0, 0.01, 0.02, 0.1, 0.3, 0.5, 0.7, 1]
}
final_estimator = cutils.grid_search_best_model(perceptron_estimator,
                                                perceptron_grid, X_train,
                                                y_train)
print(final_estimator.intercept_)
print(final_estimator.coef_)
cutils.plot_model_2d_classification(final_estimator, X_train, y_train)

#predict distances and classes for test data
print(final_estimator.decision_function(X_test))
print(final_estimator.predict(X_test))

#logistic regression algorithm
lr_estimator = linear_model.LogisticRegression()
lr_grid = {'penalty': ['l1', 'l2'], 'C': [0.01, 0.001, 0.1, 0.3, 0.5, 0.7, 1]}
final_estimator = cutils.grid_search_best_model(lr_estimator, lr_grid, X_train,
                                                y_train)
print(final_estimator.intercept_)
Example #15

X, y = cutils.generate_nonlinear_synthetic_data_classification3(n_samples=1000,
                                                                noise=0.1)

cutils.plot_data_2d_classification(X, y)

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=1)
cutils.plot_data_2d_classification(X_train, y_train)

#perceptron algorithm
stages = [('features', kutils.KernelTransformer('rbf')),
          ('clf', linear_model.Perceptron(max_iter=1000))]
perceptron_pipeline = pipeline.Pipeline(stages)
perceptron_pipeline_grid = {'features__gamma': [0.1, 0.01, 0.2]}
pipeline_object = cutils.grid_search_best_model(perceptron_pipeline,
                                                perceptron_pipeline_grid,
                                                X_train, y_train)
final_estimator = pipeline_object.named_steps['clf']
print(final_estimator.intercept_)
print(final_estimator.coef_)
cutils.plot_model_2d_classification(pipeline_object, X_train, y_train)

#logistic regression algorithm
stages = [('features', kutils.KernelTransformer('rbf')),
          ('clf', linear_model.LogisticRegression())]

lr_pipeline = pipeline.Pipeline(stages)
lr_pipeline_grid = {'features__gamma': [0.1, 1, 5, 10]}
pipeline_object = cutils.grid_search_best_model(lr_pipeline, lr_pipeline_grid,
                                                X_train, y_train)
final_estimator = pipeline_object.named_steps['clf']
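
kutils.KernelTransformer is a local transformer whose source is not shown. A plausible minimal implementation computes pairwise kernel features against the training points, so the linear model on top behaves like a kernel machine and features__gamma becomes a tunable pipeline parameter (hypothetical sketch; the local implementation may differ):

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics.pairwise import pairwise_kernels

class KernelTransformerSketch(BaseEstimator, TransformerMixin):
    #hypothetical stand-in for kutils.KernelTransformer
    def __init__(self, kernel='rbf', gamma=1.0):
        self.kernel = kernel
        self.gamma = gamma

    def fit(self, X, y=None):
        self.X_fit_ = X
        return self

    def transform(self, X):
        return pairwise_kernels(X, self.X_fit_, metric=self.kernel,
                                gamma=self.gamma)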
Example #16
#snippet start truncated; the assumed linear-data opening is kept commented since the nonlinear data below overwrites it
#X, y = cutils.generate_linear_synthetic_data_classification(n_samples=1000, n_features=2, n_classes=2, weights=[0.5, 0.5], class_sep=2)
X, y = cutils.generate_nonlinear_synthetic_data_classification2(n_samples=1000,
                                                                noise=0.1)
cutils.plot_data_2d_classification(X, y)

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=1)
cutils.plot_data_2d_classification(X_train, y_train)

#grid search for parameter values
dt_estimator = tree.DecisionTreeClassifier()
dt_grid = {'criterion': ['gini', 'entropy'], 'max_depth': list(range(1, 9))}
final_estimator = cutils.grid_search_best_model(dt_estimator, dt_grid, X_train,
                                                y_train)
cutils.plot_model_2d_classification(final_estimator, X_train, y_train)

knn_estimator = neighbors.KNeighborsClassifier()
knn_grid = {
    'n_neighbors': list(range(1, 21)),
    'weights': ['uniform', 'distance']
}
final_estimator = cutils.grid_search_best_model(knn_estimator, knn_grid,
                                                X_train, y_train)
cutils.plot_model_2d_classification(final_estimator, X_train, y_train)

rf_estimator = ensemble.RandomForestClassifier()
rf_grid = {
    'max_depth': list(range(5, 10)),
    'n_estimators': list(range(1, 100, 20))
}

Example #17
#snippet start truncated; reconstructed from the parallel imputation code in Example #18
cat_imputers = utils.get_categorical_imputers(titanic_train1, cat_features)
titanic_train1[cat_features] = cat_imputers.transform(
    titanic_train1[cat_features])
cont_imputers = utils.get_continuous_imputers(titanic_train1, cont_features)
titanic_train1[cont_features] = cont_imputers.transform(
    titanic_train1[cont_features])

#one hot encoding
X_train = utils.ohe(titanic_train1, cat_features)
y_train = titanic_train['Survived']

#embedded feature selectors
rf_estimator = ensemble.RandomForestClassifier()
rf_grid = {
    'max_depth': list(range(1, 9)),
    'n_estimators': list(range(1, 300, 100))
}
rf_final_estimator = cutils.grid_search_best_model(rf_estimator, rf_grid,
                                                   X_train, y_train)
embedded_selector = feature_selection.SelectFromModel(rf_final_estimator,
                                                      prefit=True,
                                                      threshold='mean')
X_train1 = embedded_selector.transform(X_train)
utils.plot_feature_importances(rf_final_estimator, X_train)
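
utils.plot_feature_importances is another local helper; a plausible minimal version, assuming X is a DataFrame whose columns name the features:

import numpy as np
import matplotlib.pyplot as plt

def plot_feature_importances_sketch(estimator, X):
    #hypothetical stand-in for utils.plot_feature_importances
    importances = estimator.feature_importances_
    order = np.argsort(importances)
    plt.barh(np.arange(len(order)), importances[order])
    plt.yticks(np.arange(len(order)), np.asarray(X.columns)[order])
    plt.xlabel('feature importance')
    plt.show()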

gb_estimator = ensemble.GradientBoostingClassifier()
gb_grid = {
    'max_depth': [1, 2, 3],
    'n_estimators': list(range(50, 300, 100)),
    'learning_rate': [0.001, 0.1, 1.0]
}
gb_final_estimator = cutils.grid_search_best_model(gb_estimator, gb_grid,
                                                   X_train, y_train)
embedded_selector = feature_selection.SelectFromModel(gb_final_estimator,
                                                      prefit=True,
                                                      threshold='mean')  #assumed args, parallel to the RF selector above; snippet truncated here
Example #18
#handle missing data (imputation)
cat_imputers = utils.get_categorical_imputers(titanic_train1, cat_features)
titanic_train1[cat_features] = cat_imputers.transform(titanic_train1[cat_features])
cont_imputers = utils.get_continuous_imputers(titanic_train1, cont_features)
titanic_train1[cont_features] = cont_imputers.transform(titanic_train1[cont_features])

#one hot encoding
titanic_train2 = utils.ohe(titanic_train1, cat_features)
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(titanic_train2)
y_train = titanic_train['Survived']

#build model
knn_estimator = neighbors.KNeighborsClassifier()
knn_grid = { 'n_neighbors': list(range(1,10))  }
knn_final_estimator =  cutils.grid_search_best_model(knn_estimator, knn_grid, X_train, y_train)

kernel_svm_estimator = svm.SVC(kernel='rbf')
kernel_svm_grid = {'gamma':[0.001, 0.01, 0.1, 1], 'C':[0.001, 0.01, 1, 10, 100] }
svm_final_estimator = cutils.grid_search_best_model(kernel_svm_estimator, kernel_svm_grid, X_train, y_train)

stages = [('features', kutils.KernelTransformer('rbf')) ,
          ('clf', linear_model.LogisticRegression())
          ]
lr_pipeline = pipeline.Pipeline(stages)
lr_pipeline_grid  = {'features__gamma':[0.001, 0.01, 0.1, 1], 'clf__C':[0.001, 0.01, 1, 10, 100]}
lr_final_pipeline_estimator = cutils.grid_search_best_model(lr_pipeline, lr_pipeline_grid, X_train, y_train)

rf_estimator = ensemble.RandomForestClassifier()
rf_grid  = {'max_depth':list(range(5,10)), 'n_estimators':list(range(1,500, 100)) }
rf_final_estimator = cutils.grid_search_best_model(rf_estimator, rf_grid, X_train, y_train)
#snippet truncated here; gb_estimator and the list opening below are assumed
gb_estimator = ensemble.GradientBoostingClassifier()
estimators = [('gb', gb_estimator),
              ('knn', knn_estimator),
              ('rf', rf_estimator),
              ('svm', kernel_svm_estimator),
              ('lr', lr_pipeline)]

hvoting_estimator = ensemble.VotingClassifier(estimators)
hvoting_grid = {
    'gb__max_depth': [2],
    'gb__n_estimators': list(range(300, 500, 100)),
    'gb__learning_rate': [0.1, 0.2, 0.5, 1.0],
    'knn__n_neighbors': list(range(6, 10)),
    'rf__max_depth': list(range(6, 8)),
    'rf__n_estimators': list(range(200, 400, 100)),
    'lr__features__gamma': [0.001, 0.01],
    'lr__clf__C': [1, 10],
    'svm__gamma': [0.001, 0.01],
    'svm__C': [0.001, 0.01, 1, 10]
}
voting_final_estimator = cutils.grid_search_best_model(hvoting_estimator, hvoting_grid, X_train, y_train)


titanic_test = pd.read_csv(os.path.join(dir, 'test.csv'))

print(titanic_test.shape)
print(titanic_test.info())

titanic_test1 = utils.drop_features(titanic_test, ['PassengerId', 'Name', 'Ticket', 'Cabin'])

utils.cast_to_cat(titanic_test1, ['Sex', 'Pclass', 'Embarked'])

cat_features = utils.get_categorical_features(titanic_test1)
print(cat_features)
cont_features = utils.get_continuous_features(titanic_test1)
print(cont_features)
Example #19

#adding new levels
#titanic_train['Pclass'] = titanic_train['Pclass'].cat.add_categories([4,5])

#one hot encoding
titanic_train2 = utils.ohe(titanic_train1, cat_features)
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(titanic_train2)
y_train = titanic_train['Survived']

kernel_svm_estimator = svm.SVC(kernel='rbf')
kernel_svm_grid = {
    'gamma': [0.001, 0.01, 0.1, 1],
    'C': [0.001, 0.01, 1, 10, 100]
}
svm_final_estimator = cutils.grid_search_best_model(kernel_svm_estimator,
                                                    kernel_svm_grid, X_train,
                                                    y_train)

titanic_test = pd.read_csv(os.path.join(dir, 'test.csv'))

print(titanic_test.shape)
print(titanic_test.info())

titanic_test1 = utils.drop_features(titanic_test,
                                    ['PassengerId', 'Name', 'Ticket', 'Cabin'])

utils.cast_to_cat(titanic_test1, ['Sex', 'Pclass', 'Embarked'])

cat_features = utils.get_categorical_features(titanic_test1)
print(cat_features)
cont_features = utils.get_continuous_features(titanic_test1)
Example #20

#snippet start truncated; the stage-1 estimators are assumed to be defined as in Example #18
stage1_estimators = [gb_estimator,
                     knn_estimator,
                     rf_estimator,
                     kernel_svm_estimator,
                     lr_pipeline]
stage2_estimator = linear_model.LogisticRegression()

stacking_estimator = mlxtnd.StackingClassifier(stage1_estimators, stage2_estimator)
stacking_grid = {
    'gradientboostingclassifier__max_depth': [2],
    'gradientboostingclassifier__n_estimators': list(range(300, 500, 100)),
    'gradientboostingclassifier__learning_rate': [0.1, 0.2, 0.5, 1.0],
    'kneighborsclassifier__n_neighbors': list(range(6, 10)),
    'randomforestclassifier__max_depth': list(range(6, 8)),
    'randomforestclassifier__n_estimators': list(range(200, 400, 100)),
    'svc__gamma': [0.001, 0.01],
    'svc__C': [0.001, 0.01, 1, 10],
    'meta_classifier__C': [0.1, 10.0]
}
stacking_final_estimator = cutils.grid_search_best_model(stacking_estimator, stacking_grid, X_train, y_train)
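
Note the key naming above: mlxtend's StackingClassifier exposes each stage-1 estimator under its lowercased class name and the stage-2 model as meta_classifier, unlike the explicit names given to VotingClassifier earlier. A quick sanity check of the fitted stack (assumed follow-up):

print(stacking_final_estimator.score(X_train, y_train))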


titanic_test = pd.read_csv(os.path.join(dir, 'test.csv'))

print(titanic_test.shape)
print(titanic_test.info())

titanic_test1 = utils.drop_features(titanic_test, ['PassengerId', 'Name', 'Ticket', 'Cabin'])

utils.cast_to_cat(titanic_test1, ['Sex', 'Pclass', 'Embarked'])

cat_features = utils.get_categorical_features(titanic_test1)
print(cat_features)
cont_features = utils.get_continuous_features(titanic_test1)
print(cont_features)