# NOTE(review): this chunk begins mid-statement -- the opening of the `clfs`
# list (and possibly earlier entries) lies outside this view.
    (LogisticRegression(**utils.read_estimator_params(s, "lr")), "lr"),
    (RandomForestClassifier(**utils.read_estimator_params(s, "rf")), "rf")
]
results = []
# Train every candidate classifier and record (short-name, fitted model,
# validation log loss) so the best one can be picked below.
for clf in clfs:
    ts = time.time()
    model, log_loss = train_model(X_train, y_train, X_val, y_val, clf[0])
    results.append((clf[1], model, log_loss))
    logger.info("Trained {} in {:.2f} seconds, Log loss : {:.6f}"
                .format(type(clf[0]).__name__, (time.time() - ts), log_loss))

# Sort by log_loss
results.sort(key=lambda tup: tup[2])

# Prepare the DataFrame containing from the predicted_probabilities
model = results[0][1]  # best model = lowest log loss after the sort
predicted_probabilities = model.predict_proba(test_sp)
df = pd.DataFrame(predicted_probabilities)
# Class names are taken from the sample submission's header (all columns
# except the leading device_id).
# NOTE(review): np.str is deprecated/removed in NumPy >= 1.20/1.24 --
# confirm the pinned NumPy version.
subm = pd.read_csv(os.path.join("data", "sample_submission.csv.gz"),
                   dtype={"device_id": np.str})
classes = subm.columns.values.tolist()[1:]
df["device_id"] = subm["device_id"]
# Reorder columns: device_id first, then the 12 class-probability columns
# (integer column labels 0..11 produced by pd.DataFrame above), then rename
# those integer labels to the class names.
df = df[["device_id"] + np.arange(0, 12).tolist()]
new_names = dict(zip(np.arange(0, 12).tolist(), classes))
df.rename(columns=new_names, inplace=True)

# Submission file
logger.info(tabulate(zip([r[0] for r in results], [r[2] for r in results]),
                     floatfmt=".4f", headers=("model", "log_loss")))
utils.make_submission_file(model, df, "%s_" % results[0][0])
# NOTE(review): this chunk begins mid-call -- the logger.info(...) whose
# final argument ends on the next line starts outside this view.
            df_all.shape[1])

# Separating the train and test
train = df_all[df_all["ID"].isin(id_train)]
test = df_all[df_all["ID"].isin(id_test)]
logger.info("Training model. Train dataset shape : %s" % str(train.shape))

# 60/40 split of the training rows for local hold-out evaluation.
X, X_eval, y, y_eval = cv.train_test_split(train, target, test_size=0.4)
preds = None
clf = ensemble.RandomForestClassifier()
clf.set_params(**cfg[section]["best_estimator"])
# NOTE(review): `== True` comparison -- PEP 8 prefers plain truthiness, but
# left unchanged because the config value's actual type is not visible here.
if cfg[section]["find_best"] == True:
    model = utils.find_best_estimator(clf, X, y, cfg, section=section,
                                      grid_search_params_key="param_dist",
                                      scoring="f1", verbosity=2)
else:
    model = clf.fit(X, y)

# Hold-out evaluation: positive-class probabilities scored with log loss.
preds = model.predict_proba(X_eval)[:, 1]
log_loss = metrics.log_loss(y_eval, preds)
logger.info("Trained model %s" % model)
logger.info("Log loss : %.6f" % log_loss)

logger.info("Making predictions..")
predicted_probabilities = model.predict_proba(test)[:, 1]
utils.make_submission_file(predicted_probabilities, "simple-randomforest")
# Optionally warm-start the network from a previously pickled model
# (args, conv_net, X, y, exp_name, sample_size, images_id are defined
# outside this view).
if args['load']:
    with open(args['load'], 'rb') as f:
        loaded_net = cPickle.load(f)
    conv_net.load_params_from(loaded_net)

conv_net.fit(X, y)

# Persist both the full pickled network and its raw parameter file,
# tagged with the experiment name and today's date.
name = exp_name + '_' + str(date.today())
with open('models/conv_net_'+name+'.pkl', 'wb') as f:
    cPickle.dump(conv_net, f, -1)
conv_net.save_params_to('models/params_'+name)

# ----- Train set ----
# Training-set predictions are saved for inspection (first sample_size rows).
train_predictions = conv_net.predict_proba(X)
make_submission_file(train_predictions[:sample_size], images_id[:sample_size],
                     output_filepath='models/training_'+name+'.csv')

# ----- Test set ----
X_test, _, images_id_test = load_numpy_arrays(args['test_file'])
print "Test:"
print "X_test.shape:", X_test.shape
predictions = conv_net.predict_proba(X_test)
make_submission_file(predictions, images_id_test,
                     output_filepath='submissions/submission_'+name+'.csv')

# ----- Make plots ----
# Loss curve plus the first convolutional layer's weights (layers_[1];
# layers_[0] is presumably the input layer -- confirm against the net def).
plot_loss(conv_net, "models/loss_"+name+".png", show=False)
plot_conv_weights(conv_net.layers_[1], figsize=(4, 4))
plt.savefig('models/weights_'+name+'.png')
# Method 2: repeat the stacked-generalization blend n_blends times, keeping
# every (blender, blended-test-features, log-loss) triple.
# (results_1, clfs, train, target, test come from outside this view.)
for i in xrange(cfg[s]["n_blends"]):
    print("Iteration {}".format(i))
    bclf, b_t, log_loss = run_stacked_generalization(clfs, train, target)
    results_2.append((bclf, b_t, log_loss))
    logger.info("Iteration {}, Log loss : {:.4f}".format(i, log_loss))

# Sort by log_loss
results_2.sort(key=lambda tup: tup[2])

# Prepare the DataFrame containing from the predicted_probabilities
# Compare the best result of each method and predict with the winner.
# NOTE: results_1 tuples are (name, model, log_loss) while results_2 tuples
# are (model, blend_features, log_loss) -- hence the different indexing.
log_loss_1, log_loss_2 = results_1[0][2], results_2[0][2]
model, predicted_probabilities = None, None
if log_loss_1 < log_loss_2:
    logger.info("Method 1 has lower log loss {:.4f}".format(log_loss_1))
    model = results_1[0][1]
    predicted_probabilities = model.predict_proba(test)
else:
    logger.info("Method 2 has lower log loss {:.4f}".format(log_loss_2))
    model = results_2[0][0]
    blend_test = results_2[0][1]  # blended features the blender was trained on
    predicted_probabilities = model.predict_proba(blend_test)

df = pd.DataFrame(predicted_probabilities)
df["device_id"] = test["device_id"]
# device_id first, then the 12 probability columns, renamed to class labels.
df = df[["device_id"] + np.arange(0, 12).tolist()]
new_names = dict(zip(np.arange(0, 12).tolist(), model.classes_.tolist()))
df.rename(columns=new_names, inplace=True)

# Submission file
prefix = utils.get_key(type(model).__name__)
utils.make_submission_file(model, df, "{}_".format(prefix))
#!/usr/bin/env python # -*- coding: utf-8 -*- from utils import make_submission_file from utils import load_numpy_arrays from datetime import date import cPickle import sys conv_net = cPickle.load(open(str(sys.argv[1]),'rb')) # ----- Test set ---- X_test, _, images_id_test = load_numpy_arrays('test.npz') print "Test:" print "X_test.shape:", X_test.shape predictions = conv_net.predict_proba(X_test) make_submission_file(predictions, images_id_test, output_filepath='submissions/submission_'+str(date.today)+'.csv')
# Drop the remaining categorical columns (ones we did not convert) col_names = list(df_all.columns.values) logger.info("Categorical columns not converted : %s" % remaining_cols) df_all = df_all.drop(remaining_cols, axis=1) logger.info("%d columns after dropping remaining categorical columns." % df_all.shape[1]) # Separating the train and test train = df_all[df_all["ID"].isin(id_train)] test = df_all[df_all["ID"].isin(id_test)] logger.info("Training model. Train dataset shape : %s" % str(train.shape)) X, X_eval, y, y_eval = cv.train_test_split(train, target, test_size=0.4) preds = None clf = ensemble.RandomForestClassifier() clf.set_params(**cfg[section]["best_estimator"]) if cfg[section]["find_best"] == True: model = utils.find_best_estimator(clf, X, y, cfg, section=section, grid_search_params_key="param_dist", scoring="f1", verbosity=2) else: model = clf.fit(X, y) preds = model.predict_proba(X_eval)[:, 1] log_loss = metrics.log_loss(y_eval, preds) logger.info("Trained model %s" % model) logger.info("Log loss : %.6f" % log_loss) logger.info("Making predictions..") predicted_probabilities = model.predict_proba(test)[:, 1] utils.make_submission_file(predicted_probabilities, "simple-randomforest")
# For numeric columns, replace missing values with -999 tmp_len = len(train[a_vals.isnull()]) if tmp_len > 0: train.loc[a_vals.isnull(), a] = -999 tmp_len = len(test[b_vals.isnull()]) if tmp_len > 0: test.loc[b_vals.isnull(), b] = -999 # Training t0 = time.time() clf = ExtraTreesClassifier() clf.set_params(**cfg[s]["estimator_params_etc"]) X, X_eval, y, y_eval = cv.train_test_split(train, target, test_size=0.4) if cfg[s]["find_best"] == True: model = utils.find_best_estimator(clf, X, y, cfg, section=s, grid_search_params_key="gs_params_etc", scoring="log_loss", verbosity=2) logger.info(model) else: model = clf.fit(X, y) logger.info("%.2f seconds to train %s" % ((time.time() - t0), model)) preds = model.predict_proba(X_eval)[:, 1] log_loss = metrics.log_loss(y_eval, preds) logger.info("Log loss : %.6f" % log_loss) logger.info("Making predictions..") y_pred = model.predict_proba(test) utils.make_submission_file(y_pred[:, 1], "etc_")
test.loc[b_vals.isnull(), b] = -999 # Training t0 = time.time() clf = ExtraTreesClassifier() clf.set_params(**cfg[s]["estimator_params_etc"]) X, X_eval, y, y_eval = cv.train_test_split(train, target, test_size=0.4) if cfg[s]["find_best"] == True: model = utils.find_best_estimator( clf, X, y, cfg, section=s, grid_search_params_key="gs_params_etc", scoring="log_loss", verbosity=2) logger.info(model) else: model = clf.fit(X, y) logger.info("%.2f seconds to train %s" % ((time.time() - t0), model)) preds = model.predict_proba(X_eval)[:, 1] log_loss = metrics.log_loss(y_eval, preds) logger.info("Log loss : %.6f" % log_loss) logger.info("Making predictions..") y_pred = model.predict_proba(test) utils.make_submission_file(y_pred[:, 1], "etc_")