Exemplo n.º 1
0
# Two-hidden-layer MLP for binary classification.
# NOTE(review): assumes `model`, `cfg`, `n`, `X_train`, `y_train`, `X_val`,
# `y_val`, `pipe`, `logger` and `start` are defined earlier in this script.
model.add(Dense(units=64, input_dim=n))
model.add(Activation('relu'))
model.add(Dropout(cfg["dropout"]))

# Fix: `input_dim` is only meaningful on the first layer (Keras infers the
# shapes of subsequent layers), so the copy-pasted kwarg is dropped here.
model.add(Dense(units=64))
model.add(Activation('relu'))
model.add(Dropout(cfg["dropout"]))

# Single sigmoid unit -> probability of the positive class.
model.add(Dense(units=1))
model.add(Activation('sigmoid'))

# Fix: binary cross-entropy instead of MSE -- the network ends in a sigmoid
# and is evaluated with accuracy/normalized gini, i.e. binary classification.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

logger.info("Fitting model on X_train...")
model.fit(X_train, y_train, epochs=cfg["epochs"], batch_size=cfg["batch_size"])

logger.info("Predicting on X_val...")
results_val = model.predict(X_val)
score = gini_normalized(y_val, results_val)
logger.info("normalized gini score on validation set is {}".format(score))

logger.info("Loading and predicting on Test set...")
test = load_file("test")
test["bias"] = 1  # match the constant bias column used at training time
X_test = pipe.transform(test)
results_test = model.predict(X_test)
test['target'] = results_test
write_submission_file(test, columns=['target'], name='keras-v1')

logger.info("Finished with time {}".format(datetime.now() - start))
Exemplo n.º 2
0
# Per-feature chi2 scores, precomputed elsewhere and persisted to disk.
chi2_df = load_file("chi2")


def n_best(chdf, n=15):
    """Return the names of the *n* features with the highest chi2 scores.

    chdf: DataFrame with 'feature' and 'chi2' columns.
    n: number of top features to return (default 15).

    Returns a Series of feature names ordered by descending chi2.
    """
    # Fix: the original bound this to a local named `sorted`, shadowing
    # the builtin; use a descriptive name instead.
    ranked = chdf.sort_values('chi2', axis=0, ascending=False)
    return ranked['feature'][:n]


# Select the 20 strongest features by chi2 score.
columns = n_best(chi2_df, n=20)

# training data
train = load_file()
# Cast the binary/categorical columns (plus the label) to int before modeling.
bit_columns = get_bin_cat_features(train)
bit_columns.append('target')
train = convert_columns_to_int(train, bit_columns)
X = train[columns]
y = train.target

# make a pipeline
# Scale, then fit a plain logistic-regression baseline (the "mvp" submission).
pipe = make_pipeline(StandardScaler(), LogisticRegression())
pipe.fit(X, y)

# test data
# NOTE(review): bit_columns still contains 'target', which the test set
# lacks -- assumes convert_columns_to_int tolerates missing columns; verify.
test = convert_columns_to_int(load_file("test"), bit_columns)
X_test = test[columns]

# predict
# predict_proba returns shape (n_samples, 2); column 1 is P(target == 1).
y_test_pred = pipe.predict_proba(X_test)
test['target'] = y_test_pred[:, 1]
write_submission_file(test, columns=['target'], name='mvp')
Exemplo n.º 3
0
# Soft-voting ensemble (XGBoost + LightGBM + RandomForest) over imputed,
# one-hot-encoded features.
# NOTE(review): assumes `train`, `logger`, `start` and the helpers
# `drop_cols`, `get_cat_features_idx`, `gini_normalized`, `load_file` and
# `write_submission_file` are defined earlier in this script.
X = train.drop('target', axis = 1)
# Fix: the original rebound the helper *function* name `drop_cols` to its
# own return value, destroying the callable; use a distinct local name.
cols_to_drop = drop_cols(X, names = True)
X.drop(cols_to_drop, axis = 1, inplace = True)
y = train.target
cat_columns = get_cat_features_idx(X)

logger.info("Making Ensemble...")
classifiers = [('xgb', XGBClassifier(learning_rate=0.07, reg_alpha=8, reg_lambda=0.75, max_depth=4, n_estimators = 800, gamma = 3)),
               ('lgbm', LGBMClassifier(learning_rate = 0.018, max_depth = 6, num_leaves = 11, colsample_bytree=0.85)),
               ('rf', RandomForestClassifier(n_estimators = 200, criterion = 'gini'))]

# -1 marks missing values in this dataset: impute with the mode, one-hot
# encode the categorical columns, then vote with averaged probabilities.
model = Pipeline([('impute', Imputer(missing_values = -1, strategy = "most_frequent")),
                  ('encode', OneHotEncoder(categorical_features=cat_columns, handle_unknown = 'ignore')),
                  ('ensemble', VotingClassifier(estimators = classifiers, voting = 'soft'))])

logger.info("Fitting model on X...")
model.fit(X, y)

logger.info("Predicting score (w/Cross-Val) on X...")
# cross_val_predict clones the pipeline, so the fit above is untouched.
results = cross_val_predict(model, X, y, cv = 3, method = 'predict_proba')[:, 1]
score = gini_normalized(y, results)
logger.info("normalized gini score on training set is {}".format(score))

logger.info("Loading and predicting on Test set...")
test = load_file("test")
test.drop(cols_to_drop, axis = 1, inplace = True)
test['target'] = model.predict_proba(test)[:, 1]
write_submission_file(test, columns = ['target'], name = 'ensemble-v1')

logger.info("Finished with time {}".format(datetime.now() - start))
Exemplo n.º 4
0
# Grid-searched pipeline: impute (-1 = missing), one-hot encode, densify
# (PCA needs a dense matrix), PCA, then logistic regression.
# NOTE(review): assumes `cat_columns`, `X`, `y`, `logger` and `start` are
# defined earlier in this script.
pipe = Pipeline([('impute', Imputer(missing_values = -1)),
                 ('encode', OneHotEncoder(categorical_features=cat_columns, handle_unknown = 'ignore')),
                 # OneHotEncoder emits a sparse matrix; PCA requires dense.
                 ('to_dense', FunctionTransformer(lambda x: x.todense(), accept_sparse=True)),
                 ('decompose', PCA()),
                 ('model', LogisticRegression())])
# Only the regularization strength C is really swept; the other entries
# pin single values so they are recorded in best_params_.
param_grid = {
    'impute__strategy': ["most_frequent"],
    'decompose__n_components': [30],
    'model': [LogisticRegression()],
    'model__C': [.4, .5, .6],
    'model__n_jobs': [1]
}

model = GridSearchCV(pipe, param_grid, scoring = 'roc_auc')

logger.info("Fitting model on X...")
model.fit(X, y)
logger.info("Best Params: {}".format(model.best_params_))

logger.info("Predicting score (w/Cross-Val) on X...")
# Score the winning configuration with out-of-fold predictions.
results = cross_val_predict(model.best_estimator_, X, y, cv = 3, method = 'predict_proba')[:, 1]
score = gini_normalized(y, results)
logger.info("normalized gini score on training set is {}".format(score))

logger.info("Loading and predicting on Test set...")
test = load_file("test")
# GridSearchCV refits the best estimator on all of X by default, so the
# search object itself can predict.
test['target'] = model.predict_proba(test)[:, 1]
write_submission_file(test, columns = ['target'], name = 'ohe-cv-pipe')

logger.info("Finished with time {}".format(datetime.now() - start))
Exemplo n.º 5
0
from xgboost import XGBClassifier
import numpy as np
from datetime import datetime

start = datetime.now()  # wall-clock start, reported at the end of the run
logger = get_logger()

logger.info("Loading training data into X and y...")
train = load_file()
X = train.drop('target', axis = 1)
y = train.target
n = X.shape[1]  # feature count (not used below; kept for parity)

# Univariate feature selection (top 40) feeding an XGBoost classifier.
model = Pipeline([('features', SelectKBest(k = 40)),
                 ('model', XGBClassifier(learning_rate = .095, reg_alpha = .35, reg_lambda = .76, max_depth = 5))])

logger.info("Predicting score (w/Cross-Val) on X...")
# Out-of-fold probabilities for the positive class; the pipeline is cloned
# per fold, so the final fit below is independent of this.
results = cross_val_predict(model, X, y, cv = 2, method = 'predict_proba')[:, 1]
score = gini_normalized(y, results)
logger.info("normalized gini score on training set is {}".format(score))

logger.info("Fitting model on X...")
model.fit(X, y)

logger.info("Loading and predicting on Test set...")
test = load_file("test")
test['target'] = model.predict_proba(test)[:, 1]
# NOTE(review): submission is named 'xgb-rfe' but the selector is
# SelectKBest, not RFE -- confirm the intended name.
write_submission_file(test, columns = ['target'], name = 'xgb-rfe')

logger.info("Finished with time {}".format(datetime.now() - start))
Exemplo n.º 6
0
# Single-model XGBoost run wrapped in a one-step Pipeline.
# NOTE(review): assumes `X`, `y`, `cfg`, `drop_cols`, `tenc`, `logger` and
# `start` are defined earlier; `start` must be a time.time() float here
# (the final log divides the difference by 60).
logger.info("Making Pipeline...")
model = Pipeline([('model',
                   XGBClassifier(n_estimators=800,
                                 learning_rate=0.07,
                                 reg_alpha=8,
                                 reg_lambda=0.75,
                                 gamma=3,
                                 max_depth=4))])

logger.info("Predicting score (w/Cross-Val) on X...")
# Out-of-fold positive-class probabilities; fold count comes from config.
results = cross_val_predict(model,
                            X,
                            y,
                            cv=cfg["folds"],
                            method='predict_proba')[:, 1]
score = gini_normalized(y, results)
logger.info("normalized gini score on training set is {}".format(score))

logger.info("Fitting model on upscaled X...")
model.fit(X, y)

logger.info("Loading and predicting on Test set...")
test = load_file("test")
# Apply the same column drops and encoder transform used on the
# training data so the test matrix matches the fitted model.
test.drop(drop_cols, axis=1, inplace=True)
test = tenc.transform(test)
test['target'] = model.predict_proba(test)[:, 1]
write_submission_file(test, columns=['target'], name='xgb-imp-ohe-ups2')

logger.info("Finished with time {:.3f} minutes".format(
    (time.time() - start) / 60.0))