# Tail of a Keras training script: builds a small MLP for binary
# classification, fits it on the training split, reports normalized gini on
# the validation split, then predicts on the test set and writes a submission.
# Relies on names defined earlier in the file (model, cfg, n, X_train,
# y_train, X_val, y_val, pipe, logger, start, gini_normalized, load_file,
# write_submission_file).
model.add(Dense(units=64, input_dim=n))
model.add(Activation('relu'))
model.add(Dropout(cfg["dropout"]))
# FIX: input_dim is only meaningful on the first layer (Keras ignores it on
# subsequent layers); dropped here so the code no longer implies a second
# input of width n.
model.add(Dense(units=64))
model.add(Activation('relu'))
model.add(Dropout(cfg["dropout"]))
model.add(Dense(units=1))
model.add(Activation('sigmoid'))
# FIX: the original compiled with loss='mse'.  For a sigmoid probability
# output on a binary target (scored with normalized gini, i.e. rank order of
# probabilities), binary cross-entropy is the appropriate training objective.
model.compile(loss='binary_crossentropy', optimizer='adam',
              metrics=['accuracy'])

logger.info("Fitting model on X_train...")
model.fit(X_train, y_train, epochs=cfg["epochs"],
          batch_size=cfg["batch_size"])

logger.info("Predicting on X_val...")
results_val = model.predict(X_val)
score = gini_normalized(y_val, results_val)
logger.info("normalized gini score on validation set is {}".format(score))

logger.info("Loading and predicting on Test set...")
test = load_file("test")
# The training features carried a constant bias column; reproduce it before
# running the fitted preprocessing pipeline on the test frame.
test["bias"] = 1
X_test = pipe.transform(test)
results_test = model.predict(X_test)
test['target'] = results_test
write_submission_file(test, columns=['target'], name='keras-v1')
logger.info("Finished with time {}".format(datetime.now() - start))
# MVP script: pick the top chi2-ranked features, fit a scaled logistic
# regression, and write a test-set submission.  Relies on names defined
# elsewhere in the file (load_file, get_bin_cat_features,
# convert_columns_to_int, make_pipeline, StandardScaler, LogisticRegression,
# write_submission_file).
chi2_df = load_file("chi2")


def n_best(chdf, n=15):
    """Return the names of the *n* features with the highest chi2 scores.

    chdf is expected to carry 'feature' and 'chi2' columns
    (as produced by the chi2 ranking step loaded above).
    """
    # FIX: the original bound the sorted frame to the name `sorted`,
    # shadowing the builtin; use a distinct local instead.
    ranked = chdf.sort_values('chi2', axis=0, ascending=False)
    return ranked['feature'][:n]


columns = n_best(chi2_df, n=20)

# training data
train = load_file()
bit_columns = get_bin_cat_features(train)
bit_columns.append('target')
train = convert_columns_to_int(train, bit_columns)
X = train[columns]
y = train.target

# make a pipeline
pipe = make_pipeline(StandardScaler(), LogisticRegression())
pipe.fit(X, y)

# test data (same integer conversion as training; 'target' is absent there,
# presumably convert_columns_to_int tolerates the missing column — verify)
test = convert_columns_to_int(load_file("test"), bit_columns)
X_test = test[columns]

# predict — keep the positive-class probability column
y_test_pred = pipe.predict_proba(X_test)
test['target'] = y_test_pred[:, 1]
write_submission_file(test, columns=['target'], name='mvp')
# Tail of an ensemble script: drops uninformative columns, one-hot encodes
# categoricals, and soft-votes XGB + LightGBM + RandomForest.  Relies on
# names defined earlier in the file (train, drop_cols, get_cat_features_idx,
# the sklearn/xgboost/lightgbm classes, logger, start, gini_normalized,
# load_file, write_submission_file, cross_val_predict).
X = train.drop('target', axis=1)
# FIX: the original rebound the helper name `drop_cols` to its own return
# value (`drop_cols = drop_cols(...)`), destroying the callable; bind the
# result to a distinct local instead.
cols_to_drop = drop_cols(X, names=True)
X.drop(cols_to_drop, axis=1, inplace=True)
y = train.target
cat_columns = get_cat_features_idx(X)

logger.info("Making Ensemble...")
classifiers = [
    ('xgb', XGBClassifier(learning_rate=0.07, reg_alpha=8, reg_lambda=0.75,
                          max_depth=4, n_estimators=800, gamma=3)),
    ('lgbm', LGBMClassifier(learning_rate=0.018, max_depth=6, num_leaves=11,
                            colsample_bytree=0.85)),
    ('rf', RandomForestClassifier(n_estimators=200, criterion='gini')),
]
model = Pipeline([
    # -1 is the dataset's missing-value sentinel
    ('impute', Imputer(missing_values=-1, strategy="most_frequent")),
    ('encode', OneHotEncoder(categorical_features=cat_columns,
                             handle_unknown='ignore')),
    ('ensemble', VotingClassifier(estimators=classifiers, voting='soft')),
])

logger.info("Fitting model on X...")
model.fit(X, y)

logger.info("Predicting score (w/Cross-Val) on X...")
results = cross_val_predict(model, X, y, cv=3, method='predict_proba')[:, 1]
score = gini_normalized(y, results)
logger.info("normalized gini score on training set is {}".format(score))

logger.info("Loading and predicting on Test set...")
test = load_file("test")
test.drop(cols_to_drop, axis=1, inplace=True)
test['target'] = model.predict_proba(test)[:, 1]
write_submission_file(test, columns=['target'], name='ensemble-v1')
logger.info("Finished with time {}".format(datetime.now() - start))
# Tail of a grid-search script: impute -> one-hot encode -> densify -> PCA
# -> logistic regression, tuned over C via GridSearchCV, then scored with
# out-of-fold predictions and applied to the test set.  Relies on names
# defined earlier in the file (X, y, cat_columns, logger, start, the sklearn
# classes, gini_normalized, load_file, write_submission_file,
# cross_val_predict).
base_pipe = Pipeline([
    ('impute', Imputer(missing_values=-1)),
    ('encode', OneHotEncoder(categorical_features=cat_columns,
                             handle_unknown='ignore')),
    # OneHotEncoder emits a sparse matrix; PCA needs a dense array.
    ('to_dense', FunctionTransformer(lambda x: x.todense(),
                                     accept_sparse=True)),
    ('decompose', PCA()),
    ('model', LogisticRegression()),
])

# Single-valued entries pin the strategy/components; only C is truly swept.
search_space = {
    'impute__strategy': ["most_frequent"],
    'decompose__n_components': [30],
    'model': [LogisticRegression()],
    'model__C': [.4, .5, .6],
    'model__n_jobs': [1],
}
model = GridSearchCV(base_pipe, search_space, scoring='roc_auc')

logger.info("Fitting model on X...")
model.fit(X, y)
logger.info("Best Params: {}".format(model.best_params_))

logger.info("Predicting score (w/Cross-Val) on X...")
oof_preds = cross_val_predict(model.best_estimator_, X, y, cv=3,
                              method='predict_proba')[:, 1]
score = gini_normalized(y, oof_preds)
logger.info("normalized gini score on training set is {}".format(score))

logger.info("Loading and predicting on Test set...")
test = load_file("test")
test['target'] = model.predict_proba(test)[:, 1]
write_submission_file(test, columns=['target'], name='ohe-cv-pipe')
logger.info("Finished with time {}".format(datetime.now() - start))
# Full script: univariate feature selection (top 40) feeding an XGBoost
# classifier; out-of-fold gini on the training set, then a test-set
# submission.  Relies on helpers defined elsewhere in the file (get_logger,
# load_file, gini_normalized, write_submission_file, Pipeline, SelectKBest,
# cross_val_predict).
from xgboost import XGBClassifier
import numpy as np
from datetime import datetime

start = datetime.now()
logger = get_logger()

logger.info("Loading training data into X and y...")
train = load_file()
X = train.drop('target', axis=1)
y = train.target
n = X.shape[1]

model = Pipeline([
    ('features', SelectKBest(k=40)),
    ('model', XGBClassifier(learning_rate=.095, reg_alpha=.35,
                            reg_lambda=.76, max_depth=5)),
])

logger.info("Predicting score (w/Cross-Val) on X...")
oof_preds = cross_val_predict(model, X, y, cv=2,
                              method='predict_proba')[:, 1]
score = gini_normalized(y, oof_preds)
logger.info("normalized gini score on training set is {}".format(score))

# Refit on the full training set before scoring the test set.
logger.info("Fitting model on X...")
model.fit(X, y)

logger.info("Loading and predicting on Test set...")
test = load_file("test")
test['target'] = model.predict_proba(test)[:, 1]
write_submission_file(test, columns=['target'], name='xgb-rfe')
logger.info("Finished with time {}".format(datetime.now() - start))
# Tail of an XGBoost run on preprocessed/upscaled data: out-of-fold gini on
# the (already transformed) training matrix, refit on all of it, then apply
# the fitted encoder `tenc` to the test frame and write a submission.
# Relies on names defined earlier in the file (X, y, cfg, drop_cols, tenc,
# logger, start, gini_normalized, load_file, write_submission_file,
# cross_val_predict, time).
logger.info("Making Pipeline...")
model = Pipeline([
    ('model', XGBClassifier(n_estimators=800, learning_rate=0.07,
                            reg_alpha=8, reg_lambda=0.75, gamma=3,
                            max_depth=4)),
])

logger.info("Predicting score (w/Cross-Val) on X...")
oof_preds = cross_val_predict(model, X, y, cv=cfg["folds"],
                              method='predict_proba')[:, 1]
score = gini_normalized(y, oof_preds)
logger.info("normalized gini score on training set is {}".format(score))

logger.info("Fitting model on upscaled X...")
model.fit(X, y)

logger.info("Loading and predicting on Test set...")
test = load_file("test")
# Mirror the training preprocessing: drop the same columns, then run the
# already-fitted encoder over the test frame.
test.drop(drop_cols, axis=1, inplace=True)
test = tenc.transform(test)
test['target'] = model.predict_proba(test)[:, 1]
write_submission_file(test, columns=['target'], name='xgb-imp-ohe-ups2')
# NOTE(review): `start` appears to be a time.time() float here (unlike the
# datetime-based scripts above) — confirm where it is set.
logger.info("Finished with time {:.3f} minutes".format(
    (time.time() - start) / 60.0))