def rrf(series, n_folds, clfparams, featureparams, aggregateparams, refineparams,
        include, exclude, save_test_predictions, save_oob_predictions,
        skip_cross_validation, _run):
    """Cross-validate a refined random forest (RF fitted, then refined by RRF).

    Stores per-fold train/test log losses in ``_run.info`` and optionally
    writes out-of-fold and test-set probability predictions to CSV files.
    Returns the overall multiclass log loss of the out-of-fold predictions
    (999. when cross-validation is skipped).
    """
    data = TelstraData(include=include, exclude=exclude, **featureparams)
    # Timestamp used to build unique output file names.
    time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    pred_cols = ['predict_{}'.format(i) for i in range(3)]
    # NOTE(review): best_pruning is assigned but never used in this function.
    best_pruning = refineparams['n_prunings']
    if skip_cross_validation:
        loss = 999.  # sentinel loss when CV is skipped
    else:
        y = data.get_y()
        kf = StratifiedKFold(y.values, n_folds=n_folds, shuffle=True)
        # Out-of-fold predicted class probabilities, one column per class.
        pred = pd.DataFrame(0., index=y.index, columns=pred_cols)
        i = 1
        _run.info['loss'] = []
        _run.info['trainloss'] = []
        for itrain, itest in kf:
            Xtr, ytr, Xte, yte = data.get_train_test_features(
                itrain, itest, **aggregateparams)
            # Fit a plain random forest, then refine it with RRF.
            clf = RF(**clfparams)
            clf.fit(Xtr, ytr)
            rrf = RRF(clf, **refineparams)
            rrf.fit(Xtr, ytr)
            loss2tr = multiclass_log_loss(ytr.values, rrf.predict_proba(Xtr))
            loss2te = multiclass_log_loss(yte.values, rrf.predict_proba(Xte))
            _run.info['loss'].append(loss2te)
            _run.info['trainloss'].append(loss2tr)
            print("Fold {} mlogloss train: {:.4f}, test: {:.4f}".format(
                i, loss2tr, loss2te))
            pred.iloc[itest, :] = rrf.predict_proba(Xte)
            i += 1
        # Overall loss over the assembled out-of-fold predictions.
        loss = multiclass_log_loss(y.values, pred.values)
        _run.info['features'] = list(Xtr.columns)
    # Optionally save oob predictions
    if save_oob_predictions:
        filename = '{}_{}.csv'.format(series, time)
        pred.to_csv(filename, index_label='id')
    # Optionally generate test predictions
    if save_test_predictions:
        filename = '{}_test_{}.csv'.format(series, time)
        # Refit on the full training set before predicting the test set.
        Xtr, ytr, Xte, yte = data.get_train_test_features(**aggregateparams)
        # # weights = np.concatenate((np.ones(ytr.shape[0]),0.3*np.ones(semilabels.shape[0])))
        # Xtr = pd.concat((Xtr, Xtest), axis=0)
        # ytr = pd.concat((ytr, semilabels))
        clf = RF(**clfparams)
        clf.fit(Xtr, ytr)  # ,weights)
        rrf = RRF(clf, **refineparams)
        rrf.fit(Xtr, ytr)
        predtest = pd.DataFrame(rrf.predict_proba(Xte),
                                index=yte.index, columns=pred_cols)
        predtest.to_csv(filename, index_label='id')
    return loss
def et(series, n_folds, clfparams, featureparams, aggregateparams,
       include, exclude, save_test_predictions, save_oob_predictions,
       skip_cross_validation, _run):
    """Cross-validate an extra-trees classifier.

    Records per-fold train/test log losses and fold-averaged feature
    importances in ``_run.info``; optionally writes out-of-fold and
    test-set predictions to CSV.  Returns the overall multiclass log loss
    of the out-of-fold predictions (999. when cross-validation is skipped).
    """
    data = TelstraData(include=include, exclude=exclude, **featureparams)
    # Timestamp (with microseconds) used to build unique output file names.
    time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S-%f")
    pred_cols = ['predict_{}'.format(i) for i in range(3)]
    if skip_cross_validation:
        loss = 999.  # sentinel loss when CV is skipped
    else:
        y = data.get_y()
        kf = StratifiedKFold(y.values, n_folds=n_folds, shuffle=True)
        # Out-of-fold predicted class probabilities.
        pred = pd.DataFrame(0., index=y.index, columns=pred_cols)
        i = 1
        _run.info['loss'] = []
        _run.info['trainloss'] = []
        feature_importances_ = 0
        for itrain, itest in kf:
            Xtr, ytr, Xte, yte = data.get_train_test_features(
                itrain, itest, **aggregateparams)
            clf = ET(**clfparams)
            clf.fit(Xtr, ytr)
            pred.iloc[itest, :] = clf.predict_proba(Xte)
            trainloss = multiclass_log_loss(ytr, clf.predict_proba(Xtr))
            _run.info['trainloss'].append(trainloss)
            loss = multiclass_log_loss(yte, pred.iloc[itest].values)
            _run.info['loss'].append(loss)
            # Running average of feature importances across folds.
            if i == 1:
                feature_importances_ = clf.feature_importances_ / n_folds
            else:
                feature_importances_ += clf.feature_importances_ / n_folds
            i += 1
        # Overall loss over the assembled out-of-fold predictions.
        loss = multiclass_log_loss(y, pred.values)
        _run.info['features'] = list(Xtr.columns)
        _run.info['feature_importances'] = list(feature_importances_)
    # Optionally save oob predictions
    if save_oob_predictions:
        filename = '{}_{}.csv'.format(series, time)
        pred.to_csv(filename, index_label='id')
    # Optionally generate test predictions
    if save_test_predictions:
        filename = '{}_test_{}.csv'.format(series, time)
        # Refit on the full training set before predicting the test set.
        Xtr, ytr, Xte, yte = data.get_train_test_features(**aggregateparams)
        clf = ET(**clfparams)
        clf.fit(Xtr, ytr)
        predtest = pd.DataFrame(clf.predict_proba(Xte),
                                index=yte.index, columns=pred_cols)
        predtest.to_csv(filename, index_label='id')
    return loss
def rrf(series, n_folds, clfparams, featureparams, aggregateparams, refineparams,
        include, exclude, save_test_predictions, save_oob_predictions,
        skip_cross_validation, _run):
    """Cross-validate a refined random forest (RF + RRF).

    NOTE(review): this is a duplicate definition of ``rrf`` also present
    earlier in this file; at import time the definition that appears last
    wins.  Consider removing one copy.

    Returns the overall multiclass log loss of the out-of-fold predictions
    (999. when cross-validation is skipped).
    """
    data = TelstraData(include = include, exclude = exclude, **featureparams)
    # Timestamp used to build unique output file names.
    time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    pred_cols = ['predict_{}'.format(i) for i in range(3)]
    # NOTE(review): best_pruning is assigned but never used here.
    best_pruning = refineparams['n_prunings']
    if skip_cross_validation:
        loss = 999.  # sentinel loss when CV is skipped
    else:
        y = data.get_y()
        kf = StratifiedKFold(y.values, n_folds=n_folds, shuffle=True)
        # Out-of-fold predicted class probabilities.
        pred = pd.DataFrame(0., index = y.index, columns = pred_cols)
        i = 1
        _run.info['loss'] = []
        _run.info['trainloss'] = []
        for itrain, itest in kf:
            Xtr, ytr, Xte, yte = data.get_train_test_features(itrain, itest, **aggregateparams)
            # Fit a plain random forest, then refine it with RRF.
            clf = RF(**clfparams)
            clf.fit(Xtr, ytr)
            rrf = RRF(clf, **refineparams)
            rrf.fit(Xtr, ytr)
            loss2tr = multiclass_log_loss(ytr.values, rrf.predict_proba(Xtr))
            loss2te = multiclass_log_loss(yte.values, rrf.predict_proba(Xte))
            _run.info['loss'].append(loss2te)
            _run.info['trainloss'].append(loss2tr)
            print("Fold {} mlogloss train: {:.4f}, test: {:.4f}".format(i, loss2tr, loss2te))
            pred.iloc[itest,:] = rrf.predict_proba(Xte)
            i+=1
        # Overall loss over the assembled out-of-fold predictions.
        loss = multiclass_log_loss(y.values, pred.values)
        _run.info['features'] = list(Xtr.columns)
    # Optionally save oob predictions
    if save_oob_predictions:
        filename = '{}_{}.csv'.format(series, time)
        pred.to_csv(filename, index_label='id')
    # Optionally generate test predictions
    if save_test_predictions:
        filename = '{}_test_{}.csv'.format(series, time)
        # Refit on the full training set before predicting the test set.
        Xtr, ytr, Xte, yte = data.get_train_test_features(**aggregateparams)
        # # weights = np.concatenate((np.ones(ytr.shape[0]),0.3*np.ones(semilabels.shape[0])))
        # Xtr = pd.concat((Xtr, Xtest), axis=0)
        # ytr = pd.concat((ytr, semilabels))
        clf = RF(**clfparams)
        clf.fit(Xtr, ytr)#,weights)
        rrf = RRF(clf, **refineparams)
        rrf.fit(Xtr, ytr)
        predtest = pd.DataFrame(rrf.predict_proba(Xte), index = yte.index, columns = pred_cols)
        predtest.to_csv(filename, index_label='id')
    return loss
def nnrun(_run, modelparams, nb_epoch, early_stopping_rounds, series, n_folds,
          featureparams, aggregateparams, include, exclude,
          save_test_predictions, save_oob_predictions, skip_cross_validation):
    """Train and evaluate a neural network with early stopping.

    The best epoch count is determined by early stopping in the first fold
    and then reused.  Fold histories are stored in ``_run.info``.  Returns
    the multiclass log loss over the ``pred`` frame (999. when CV is
    skipped).
    """
    data = TelstraData(include=include, exclude=exclude, **featureparams)
    # Timestamp used to build unique output file names.
    time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    pred_cols = ['predict_{}'.format(i) for i in range(3)]
    best_epoch = nb_epoch
    if skip_cross_validation:
        loss = 999.  # sentinel loss when CV is skipped
    else:
        y = data.get_y()
        kf = StratifiedKFold(y.values, n_folds=n_folds, shuffle=True)
        pred = pd.DataFrame(0., index=y.index, columns=pred_cols)
        i = 1
        _run.info['history'] = []
        for itrain, itest in kf:
            Xtr, ytr, Xte, yte = data.get_train_test_features(
                itrain, itest, **aggregateparams)
            # find best nb_epoch by early stopping in the first fold
            params = {"nb_epoch": best_epoch}
            if i == 1:
                params["early_stopping_rounds"] = early_stopping_rounds
            model = NN(modelparams)
            history = model.fit_val(Xtr, ytr, Xte, yte, **params)
            _run.info['history'].append(history)
            pred.iloc[itest, :] = model.predict_proba(Xte)
            if i == 1:
                best_epoch = len(history['loss']) - early_stopping_rounds
                _run.info['best_epoch'] = best_epoch
            loss = multiclass_log_loss(yte, pred.iloc[itest].values)
            print('Fold {:02d}: Loss = {:.5f}'.format(i, loss))
            i += 1
            # NOTE(review): this break stops after the FIRST fold only, so the
            # "overall" loss below is computed with the remaining folds still
            # zero-filled — presumably a deliberate shortcut; confirm intent.
            break
        loss = multiclass_log_loss(y.values, pred.values)
    # Optionally save oob predictions
    if save_oob_predictions:
        filename = '{}_{}.csv'.format(series, time)
        pred.to_csv(filename, index_label='id')
    # Optionally generate test predictions
    if save_test_predictions:
        filename = '{}_test_{}.csv'.format(series, time)
        # Refit on the full training set with the discovered best epoch count.
        Xtr, ytr, Xte, yte = data.get_train_test_features(**aggregateparams)
        model = NN(modelparams)
        history = model.fit(Xtr, ytr, nb_epoch=best_epoch)
        predtest = pd.DataFrame(model.predict_proba(Xte),
                                index=yte.index, columns=pred_cols)
        predtest.to_csv(filename, index_label='id')
    return loss
def rf(series, n_folds, clfparams, featureparams, aggregateparams,
       include, exclude, save_test_predictions, save_oob_predictions,
       skip_cross_validation, _run):
    """Cross-validate a random forest classifier.

    Records per-fold train/test log losses and fold-averaged feature
    importances in ``_run.info``; optionally writes out-of-fold and
    test-set predictions to CSV.  Returns the overall multiclass log loss
    of the out-of-fold predictions (999. when cross-validation is skipped).
    """
    data = TelstraData(include=include, exclude=exclude, **featureparams)
    # Timestamp used to build unique output file names.
    time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    pred_cols = ['predict_{}'.format(i) for i in range(3)]
    if skip_cross_validation:
        loss = 999.  # sentinel loss when CV is skipped
    else:
        y = data.get_y()
        kf = StratifiedKFold(y.values, n_folds=n_folds, shuffle=True)
        # Out-of-fold predicted class probabilities.
        pred = pd.DataFrame(0., index=y.index, columns=pred_cols)
        i = 1
        _run.info['loss'] = []
        _run.info['trainloss'] = []
        feature_importances_ = 0
        for itrain, itest in kf:
            Xtr, ytr, Xte, yte = data.get_train_test_features(
                itrain, itest, **aggregateparams)
            clf = RF(**clfparams)
            clf.fit(Xtr, ytr)  # , weights)
            pred.iloc[itest, :] = clf.predict_proba(Xte)
            trainloss = multiclass_log_loss(ytr, clf.predict_proba(Xtr))
            _run.info['trainloss'].append(trainloss)
            loss = multiclass_log_loss(yte, pred.iloc[itest].values)
            _run.info['loss'].append(loss)
            # Running average of feature importances across folds.
            if i == 1:
                feature_importances_ = clf.feature_importances_ / n_folds
            else:
                feature_importances_ += clf.feature_importances_ / n_folds
            i += 1
        # Overall loss over the assembled out-of-fold predictions.
        loss = multiclass_log_loss(y, pred.values)
        _run.info['features'] = list(Xtr.columns)
        _run.info['feature_importances'] = list(feature_importances_)
    # Optionally save oob predictions
    if save_oob_predictions:
        filename = '{}_{}.csv'.format(series, time)
        pred.to_csv(filename, index_label='id')
    # Optionally generate test predictions
    if save_test_predictions:
        filename = '{}_test_{}.csv'.format(series, time)
        # Refit on the full training set before predicting the test set.
        Xtr, ytr, Xte, yte = data.get_train_test_features(**aggregateparams)
        clf = RF(**clfparams)
        clf.fit(Xtr, ytr)  # ,weights)
        predtest = pd.DataFrame(
            clf.predict_proba(Xte), index=yte.index, columns=pred_cols)
        predtest.to_csv(filename, index_label='id')
    return loss
def nnrun(_run, modelparams, nb_epoch, early_stopping_rounds, series, n_folds,
          featureparams, aggregateparams, include, exclude,
          save_test_predictions, save_oob_predictions, skip_cross_validation):
    """Train and evaluate a neural network with early stopping.

    NOTE(review): duplicate definition of ``nnrun`` also present earlier in
    this file; the definition appearing last wins at import time.

    Returns the multiclass log loss over the ``pred`` frame (999. when CV
    is skipped).
    """
    data = TelstraData(include = include, exclude = exclude, **featureparams)
    # Timestamp used to build unique output file names.
    time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    pred_cols = ['predict_{}'.format(i) for i in range(3)]
    best_epoch = nb_epoch
    if skip_cross_validation:
        loss = 999.  # sentinel loss when CV is skipped
    else:
        y = data.get_y()
        kf = StratifiedKFold(y.values, n_folds=n_folds, shuffle=True)
        pred = pd.DataFrame(0., index = y.index, columns = pred_cols)
        i = 1
        _run.info['history'] = []
        for itrain, itest in kf:
            Xtr, ytr, Xte, yte = data.get_train_test_features(itrain, itest, **aggregateparams)
            # find best nb_epoch by early stopping in the first fold
            params = {"nb_epoch":best_epoch}
            if i == 1:
                params["early_stopping_rounds"] = early_stopping_rounds
            model = NN(modelparams)
            history = model.fit_val(Xtr, ytr, Xte, yte, **params)
            _run.info['history'].append(history)
            pred.iloc[itest, :] = model.predict_proba(Xte)
            if i == 1:
                best_epoch = len(history['loss']) - early_stopping_rounds
                _run.info['best_epoch'] = best_epoch
            loss = multiclass_log_loss(yte, pred.iloc[itest].values)
            print('Fold {:02d}: Loss = {:.5f}'.format(i, loss))
            i += 1
            # NOTE(review): this break stops after the FIRST fold only; the
            # loss below is computed with remaining folds zero-filled —
            # presumably a deliberate shortcut; confirm intent.
            break
        loss = multiclass_log_loss(y.values, pred.values)
    # Optionally save oob predictions
    if save_oob_predictions:
        filename = '{}_{}.csv'.format(series, time)
        pred.to_csv(filename, index_label='id')
    # Optionally generate test predictions
    if save_test_predictions:
        filename = '{}_test_{}.csv'.format(series, time)
        # Refit on the full training set with the discovered best epoch count.
        Xtr, ytr, Xte, yte = data.get_train_test_features(**aggregateparams)
        model = NN(modelparams)
        history = model.fit(Xtr, ytr, nb_epoch = best_epoch)
        predtest = pd.DataFrame(model.predict_proba(Xte), index = yte.index, columns = pred_cols)
        predtest.to_csv(filename, index_label='id')
    return loss
def ret(series, n_folds, clfparams, featureparams, aggregateparams, refineparams,
        include, exclude, save_test_predictions, save_oob_predictions,
        skip_cross_validation, _run):
    """Cross-validate an extra-trees forest refined by iterative pruning.

    For each fold the forest is refined one pruning step at a time; pruning
    stops early after 5 non-improving steps on the fold's validation loss.
    The best validation loss, the matching train loss and the pruning count
    are stored in ``_run.info``.  Out-of-fold predictions and the fold-
    averaged test-set predictions are always written to CSV.  Returns the
    overall multiclass log loss of the out-of-fold predictions.
    """
    data = TelstraData(include=include, exclude=exclude, **featureparams)
    X, y, Xtest, ytest = data.get_train_test_features(**aggregateparams)
    # Timestamp used to build unique output file names.
    time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    pred_cols = ['predict_{}'.format(i) for i in range(3)]
    # Refine a single pruning per fit() call so the loss can be evaluated
    # after every pruning step.
    rrfparams = dict(refineparams)
    rrfparams['n_prunings'] = 1
    kf = StratifiedKFold(y.values, n_folds=n_folds, shuffle=True)
    pred = pd.DataFrame(0., index=y.index, columns=pred_cols)
    testpreds = []  # list for storing per-fold test predictions
    i = 1
    _run.info['loss'] = []
    _run.info['trainloss'] = []
    _run.info['best_pruning'] = []
    for itrain, itest in kf:
        Xtr, ytr, Xte, yte = data.get_train_test_features(
            itrain, itest, **aggregateparams)
        clf = ET(**clfparams)
        clf.fit(Xtr, ytr)
        rrf = RRF(clf, **rrfparams)
        best_loss = 1000.
        train_loss = 1000.
        no_improvement_in = 0
        # FIX: seed testpred / the fold's oof rows with the unrefined forest's
        # predictions.  Previously, an IndexError raised by the very first
        # rrf.fit left `testpred` unbound and crashed with a NameError at
        # testpreds.append(...) below.
        testpred = clf.predict_proba(Xtest)
        pred.iloc[itest, :] = clf.predict_proba(Xte)
        k = -1  # FIX: keeps best_pruning well-defined when n_prunings == 0
        for k in range(refineparams['n_prunings']):
            try:
                rrf.fit(Xtr, ytr)  # fit and do 1 pruning
            except IndexError:
                # sometimes an unfortunate tree gets cut down to the root;
                # stop and use the best-so-far prediction in this case
                print('IndexError')
                break
            loss2tr = multiclass_log_loss(ytr.values, rrf.predict_proba(Xtr))
            loss2te = multiclass_log_loss(yte.values, rrf.predict_proba(Xte))
            print("Fold {} Pruning {} mlogloss train: {:.4f}, test: {:.4f}".
                  format(i, k, loss2tr, loss2te))
            if loss2te < best_loss:  # performance is better
                no_improvement_in = 0
                best_loss = loss2te + 0.  # save new best loss
                # predict oof samples with new model
                pred.iloc[itest, :] = rrf.predict_proba(Xte)
                # predict test with new model
                testpred = rrf.predict_proba(Xtest)
                # record current train loss
                train_loss = loss2tr + 0.
            else:
                no_improvement_in += 1
                if no_improvement_in >= 5:
                    break
        # Append current testpred to testpreds list
        testpreds.append(
            pd.DataFrame(testpred, index=ytest.index, columns=pred_cols))
        # Save loss and train loss from current fold
        _run.info['loss'].append(best_loss)
        _run.info['trainloss'].append(train_loss)
        _run.info['best_pruning'].append(k + 1)
        print("Fold {} mlogloss train: {:.4f}, test: {:.4f}".format(
            i, train_loss, best_loss))
        i += 1
    # Overall loss over the assembled out-of-fold predictions.
    loss = multiclass_log_loss(y.values, pred.values)
    _run.info['features'] = list(Xtr.columns)
    filename = '{}_{}_cv_{:.4f}.csv'.format(series, time, loss)
    pred.to_csv(filename, index_label='id')
    # Average the per-fold test predictions and save them.
    testpred = sum(testpreds) / len(testpreds)
    filename = '{}_test_{}_cv_{:.4f}.csv'.format(series, time, loss)
    testpred.to_csv(filename, index_label='id')
    return loss
def rf(series, n_folds, clfparams, featureparams, aggregateparams, refineparams,
       include, exclude, save_test_predictions, save_oob_predictions,
       skip_cross_validation, _run):
    """Cross-validate a random forest refined by iterative pruning (RRF).

    NOTE(review): this shadows the earlier, non-refined ``rf`` definition in
    this file (and has a different signature); the last definition wins at
    import time.

    For each fold the forest is refined one pruning step at a time; pruning
    stops early after 5 non-improving steps on the fold's validation loss.
    Best losses and pruning counts go to ``_run.info``; out-of-fold and
    fold-averaged test predictions are always written to CSV.  Returns the
    overall multiclass log loss of the out-of-fold predictions.
    """
    data = TelstraData(include=include, exclude=exclude, **featureparams)
    X, y, Xtest, ytest = data.get_train_test_features(**aggregateparams)
    # Timestamp used to build unique output file names.
    time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    pred_cols = ['predict_{}'.format(i) for i in range(3)]
    # Refine a single pruning per fit() call so the loss can be evaluated
    # after every pruning step.
    rrfparams = dict(refineparams)
    rrfparams['n_prunings'] = 1
    kf = StratifiedKFold(y.values, n_folds=n_folds, shuffle=True)
    pred = pd.DataFrame(0., index=y.index, columns=pred_cols)
    testpreds = []  # list for storing per-fold test predictions
    i = 1
    _run.info['loss'] = []
    _run.info['trainloss'] = []
    _run.info['best_pruning'] = []
    for itrain, itest in kf:
        Xtr, ytr, Xte, yte = data.get_train_test_features(
            itrain, itest, **aggregateparams)
        clf = RF(**clfparams)
        clf.fit(Xtr, ytr)
        rrf = RRF(clf, **rrfparams)
        best_loss = 1000.
        train_loss = 1000.
        no_improvement_in = 0
        # FIX: seed testpred / the fold's oof rows with the unrefined forest's
        # predictions.  Previously, an IndexError raised by the very first
        # rrf.fit left `testpred` unbound and crashed with a NameError at
        # testpreds.append(...) below.
        testpred = clf.predict_proba(Xtest)
        pred.iloc[itest, :] = clf.predict_proba(Xte)
        k = -1  # FIX: keeps best_pruning well-defined when n_prunings == 0
        for k in range(refineparams['n_prunings']):
            try:
                rrf.fit(Xtr, ytr)  # fit and do 1 pruning
            except IndexError:
                # an unfortunate tree may get pruned down to the root;
                # stop and use the best-so-far prediction in this case
                print('IndexError')
                break
            loss2tr = multiclass_log_loss(ytr.values, rrf.predict_proba(Xtr))
            loss2te = multiclass_log_loss(yte.values, rrf.predict_proba(Xte))
            print("Fold {} Pruning {} mlogloss train: {:.4f}, test: {:.4f}".format(
                i, k, loss2tr, loss2te))
            if loss2te < best_loss:  # performance is better
                no_improvement_in = 0
                best_loss = loss2te + 0.  # save new best loss
                # predict oof samples with new model
                pred.iloc[itest, :] = rrf.predict_proba(Xte)
                # predict test with new model
                testpred = rrf.predict_proba(Xtest)
                # record current train loss
                train_loss = loss2tr + 0.
            else:
                no_improvement_in += 1
                if no_improvement_in >= 5:
                    break
        # Append current testpred to testpreds list
        testpreds.append(
            pd.DataFrame(testpred, index=ytest.index, columns=pred_cols))
        # Save loss and train loss from current fold
        _run.info['loss'].append(best_loss)
        _run.info['trainloss'].append(train_loss)
        _run.info['best_pruning'].append(k + 1)
        print("Fold {} mlogloss train: {:.4f}, test: {:.4f}".format(
            i, train_loss, best_loss))
        i += 1
    # Overall loss over the assembled out-of-fold predictions.
    loss = multiclass_log_loss(y.values, pred.values)
    _run.info['features'] = list(Xtr.columns)
    filename = '{}_{}_cv_{:.4f}.csv'.format(series, time, loss)
    pred.to_csv(filename, index_label='id')
    # Average the per-fold test predictions and save them.
    testpred = sum(testpreds) / len(testpreds)
    filename = '{}_test_{}_cv_{:.4f}.csv'.format(series, time, loss)
    testpred.to_csv(filename, index_label='id')
    return loss
def xgbrun(series, n_folds, clfparams, featureparams, num_trees,
           early_stopping_rounds, verbose_eval, _seed, _run, aggregateparams,
           include, exclude, save_test_predictions, save_oob_predictions,
           skip_cross_validation):
    """Cross-validate an xgboost model, early-stopping in the first fold.

    The first fold (when ``early_stopping_rounds > 0``) determines the
    boosting round count reused by the remaining folds and by the final
    test-set model.  Returns the overall multiclass log loss of the
    out-of-fold predictions (999. when cross-validation is skipped).
    """
    clfparams['seed'] = _seed
    data = TelstraData(include = include, exclude = exclude, **featureparams)
    # Timestamp used to build unique output file names.
    time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    pred_cols = ['predict_{}'.format(i) for i in range(3)]
    # Copy so the early-stopped round count can overwrite it below.
    num_rounds = num_trees + 0
    params = {"verbose_eval":verbose_eval}
    if skip_cross_validation:
        loss = 999.  # sentinel loss when CV is skipped
    else:
        y = data.get_y()
        kf = StratifiedKFold(y.values, n_folds=n_folds, shuffle=True)
        pred = pd.DataFrame(0., index = y.index, columns = pred_cols)
        i = 1
        _run.info['loss'] = []
        _run.info['trainloss'] = []
        for itrain, itest in kf:
            Xtr, ytr, Xte, yte = data.get_train_test_features(itrain, itest, **aggregateparams)
            dtrain = xgb.DMatrix(Xtr, ytr)
            dvalid = xgb.DMatrix(Xte, yte)
            # Reset params each fold; only fold 1 gets the watchlist.
            params = {"verbose_eval":verbose_eval}
            if (i == 1) and (early_stopping_rounds > 0) :
                watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
                params["evals"] = watchlist
                params["early_stopping_rounds"] = early_stopping_rounds
            gbm = xgb.train(clfparams, dtrain, num_rounds, **params)
            if i == 1:
                # NOTE(review): best_ntree_limit is only set by xgboost when
                # early stopping actually ran — confirm early_stopping_rounds
                # is always > 0 for callers of this function.
                num_rounds = gbm.best_ntree_limit
                _run.info['num_rounds'] = num_rounds
            pred.iloc[itest, :] = gbm.predict(dvalid,ntree_limit=num_rounds).reshape(yte.shape[0],len(pred_cols))
            predtrain = gbm.predict(dtrain,ntree_limit=num_rounds).reshape(ytr.shape[0],len(pred_cols))
            loss = multiclass_log_loss(yte, pred.iloc[itest].values)
            trainloss = multiclass_log_loss(ytr, predtrain)
            #print("Fold {:02d}: trainloss = {:.4f}, testloss = {:.4f}".format(i,trainloss, loss))
            _run.info['loss'].append(loss)
            _run.info['trainloss'].append(trainloss)
            i += 1
        # Overall loss over the assembled out-of-fold predictions.
        loss = multiclass_log_loss(y, pred.values)
        _run.info['features'] = list(Xtr.columns)
    # Optionally save oob predictions
    if save_oob_predictions:
        filename = '{}_{}.csv'.format(series, time)
        pred.to_csv(filename, index_label='id')
    # Optionally generate test predictions
    if save_test_predictions:
        filename = '{}_test_{}.csv'.format(series, time)
        # Refit on the full training set before predicting the test set.
        Xtr, ytr, Xte, yte = data.get_train_test_features(**aggregateparams)
        dtrain = xgb.DMatrix(Xtr, ytr)
        dtest = xgb.DMatrix(Xte)
        gbm = xgb.train(clfparams, dtrain, num_rounds, **params)
        predtest = pd.DataFrame(gbm.predict(dtest).reshape(yte.shape[0],len(pred_cols)), index = yte.index, columns = pred_cols)
        predtest.to_csv(filename, index_label='id')
    return loss
def xgbrun(series, n_folds, clfparams, featureparams, num_trees,
           early_stopping_rounds, verbose_eval, _seed, _run, aggregateparams,
           include, exclude, save_test_predictions, save_oob_predictions,
           skip_cross_validation):
    """Cross-validate an xgboost model, early-stopping in the first fold.

    NOTE(review): duplicate definition of ``xgbrun`` also present earlier in
    this file; the definition appearing last wins at import time.

    Returns the overall multiclass log loss of the out-of-fold predictions
    (999. when cross-validation is skipped).
    """
    clfparams['seed'] = _seed
    data = TelstraData(include=include, exclude=exclude, **featureparams)
    # Timestamp used to build unique output file names.
    time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    pred_cols = ['predict_{}'.format(i) for i in range(3)]
    # Copy so the early-stopped round count can overwrite it below.
    num_rounds = num_trees + 0
    params = {"verbose_eval": verbose_eval}
    if skip_cross_validation:
        loss = 999.  # sentinel loss when CV is skipped
    else:
        y = data.get_y()
        kf = StratifiedKFold(y.values, n_folds=n_folds, shuffle=True)
        pred = pd.DataFrame(0., index=y.index, columns=pred_cols)
        i = 1
        _run.info['loss'] = []
        _run.info['trainloss'] = []
        for itrain, itest in kf:
            Xtr, ytr, Xte, yte = data.get_train_test_features(
                itrain, itest, **aggregateparams)
            dtrain = xgb.DMatrix(Xtr, ytr)
            dvalid = xgb.DMatrix(Xte, yte)
            # Reset params each fold; only fold 1 gets the watchlist.
            params = {"verbose_eval": verbose_eval}
            if (i == 1) and (early_stopping_rounds > 0):
                watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
                params["evals"] = watchlist
                params["early_stopping_rounds"] = early_stopping_rounds
            gbm = xgb.train(clfparams, dtrain, num_rounds, **params)
            if i == 1:
                # NOTE(review): best_ntree_limit is only set by xgboost when
                # early stopping actually ran — confirm early_stopping_rounds
                # is always > 0 for callers of this function.
                num_rounds = gbm.best_ntree_limit
                _run.info['num_rounds'] = num_rounds
            pred.iloc[itest, :] = gbm.predict(dvalid, ntree_limit=num_rounds).reshape(
                yte.shape[0], len(pred_cols))
            predtrain = gbm.predict(dtrain, ntree_limit=num_rounds).reshape(
                ytr.shape[0], len(pred_cols))
            loss = multiclass_log_loss(yte, pred.iloc[itest].values)
            trainloss = multiclass_log_loss(ytr, predtrain)
            #print("Fold {:02d}: trainloss = {:.4f}, testloss = {:.4f}".format(i,trainloss, loss))
            _run.info['loss'].append(loss)
            _run.info['trainloss'].append(trainloss)
            i += 1
        # Overall loss over the assembled out-of-fold predictions.
        loss = multiclass_log_loss(y, pred.values)
        _run.info['features'] = list(Xtr.columns)
    # Optionally save oob predictions
    if save_oob_predictions:
        filename = '{}_{}.csv'.format(series, time)
        pred.to_csv(filename, index_label='id')
    # Optionally generate test predictions
    if save_test_predictions:
        filename = '{}_test_{}.csv'.format(series, time)
        # Refit on the full training set before predicting the test set.
        Xtr, ytr, Xte, yte = data.get_train_test_features(**aggregateparams)
        dtrain = xgb.DMatrix(Xtr, ytr)
        dtest = xgb.DMatrix(Xte)
        gbm = xgb.train(clfparams, dtrain, num_rounds, **params)
        predtest = pd.DataFrame(gbm.predict(dtest).reshape(
            yte.shape[0], len(pred_cols)), index=yte.index, columns=pred_cols)
        predtest.to_csv(filename, index_label='id')
    return loss