def rrf(series, n_folds, clfparams, featureparams, aggregateparams, refineparams,
        include, exclude, save_test_predictions, save_oob_predictions,
        skip_cross_validation, _run):
    """Cross-validate a refined random forest (RF + RRF) on the Telstra data.

    For each stratified fold a RandomForest is fit, then refined/pruned by RRF,
    and the multiclass log loss is recorded in ``_run.info``. Optionally writes
    out-of-fold predictions and/or test-set predictions as CSV files named
    after *series* and a timestamp.

    Parameters mirror the sacred experiment config:
      series                -- label used in output CSV filenames
      n_folds               -- number of stratified CV folds
      clfparams             -- kwargs for the RF classifier
      featureparams         -- kwargs for TelstraData feature construction
      aggregateparams       -- kwargs for get_train_test_features
      refineparams          -- kwargs for RRF (forest refinement/pruning)
      include, exclude      -- feature selection lists passed to TelstraData
      save_test_predictions -- write test-set predictions CSV if True
      save_oob_predictions  -- write out-of-fold predictions CSV if True
      skip_cross_validation -- skip CV entirely (loss reported as 999.)
      _run                  -- sacred Run object; metrics stored in _run.info

    Returns the out-of-fold multiclass log loss (999. if CV was skipped).
    """
    data = TelstraData(include=include, exclude=exclude, **featureparams)
    # Timestamp used to make output filenames unique.
    time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    pred_cols = ['predict_{}'.format(i) for i in range(3)]
    if skip_cross_validation:
        loss = 999.  # sentinel: no CV estimate available
    else:
        y = data.get_y()
        # NOTE(review): old sklearn.cross_validation StratifiedKFold API
        # (labels as first positional argument) — confirm sklearn version.
        kf = StratifiedKFold(y.values, n_folds=n_folds, shuffle=True)
        pred = pd.DataFrame(0., index=y.index, columns=pred_cols)
        fold = 1
        _run.info['loss'] = []
        _run.info['trainloss'] = []
        for itrain, itest in kf:
            Xtr, ytr, Xte, yte = data.get_train_test_features(
                itrain, itest, **aggregateparams)
            clf = RF(**clfparams)
            clf.fit(Xtr, ytr)
            # Refine the fitted forest; local renamed so it does not shadow
            # this function's own name.
            refined = RRF(clf, **refineparams)
            refined.fit(Xtr, ytr)
            loss2tr = multiclass_log_loss(ytr.values, refined.predict_proba(Xtr))
            loss2te = multiclass_log_loss(yte.values, refined.predict_proba(Xte))
            _run.info['loss'].append(loss2te)
            _run.info['trainloss'].append(loss2tr)
            print("Fold {} mlogloss train: {:.4f}, test: {:.4f}".format(
                fold, loss2tr, loss2te))
            pred.iloc[itest, :] = refined.predict_proba(Xte)
            fold += 1
        # Overall out-of-fold loss across all folds.
        loss = multiclass_log_loss(y.values, pred.values)
        _run.info['features'] = list(Xtr.columns)
        # Optionally save oob predictions (pred only exists when CV ran).
        if save_oob_predictions:
            filename = '{}_{}.csv'.format(series, time)
            pred.to_csv(filename, index_label='id')
    # Optionally generate test predictions on the full training set.
    if save_test_predictions:
        filename = '{}_test_{}.csv'.format(series, time)
        Xtr, ytr, Xte, yte = data.get_train_test_features(**aggregateparams)
        clf = RF(**clfparams)
        clf.fit(Xtr, ytr)
        refined = RRF(clf, **refineparams)
        refined.fit(Xtr, ytr)
        predtest = pd.DataFrame(refined.predict_proba(Xte),
                                index=yte.index, columns=pred_cols)
        predtest.to_csv(filename, index_label='id')
    return loss
def rrf(series, n_folds, clfparams, featureparams, aggregateparams, refineparams,
        include, exclude, save_test_predictions, save_oob_predictions,
        skip_cross_validation, _run):
    """Cross-validate a refined random forest (RF + RRF) on the Telstra data.

    NOTE(review): this is a byte-for-byte duplicate of an earlier ``rrf``
    definition in this file; at import time this later definition shadows
    the earlier one. Consider deleting one of the two.

    For each stratified fold a RandomForest is fit, refined by RRF, and the
    multiclass log loss recorded in ``_run.info``. Optionally writes
    out-of-fold and/or test-set predictions as timestamped CSV files.

    Returns the out-of-fold multiclass log loss (999. if CV was skipped).
    """
    data = TelstraData(include=include, exclude=exclude, **featureparams)
    # Timestamp used to make output filenames unique.
    time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    pred_cols = ['predict_{}'.format(i) for i in range(3)]
    if skip_cross_validation:
        loss = 999.  # sentinel: no CV estimate available
    else:
        y = data.get_y()
        # Old sklearn.cross_validation StratifiedKFold API (labels first).
        kf = StratifiedKFold(y.values, n_folds=n_folds, shuffle=True)
        pred = pd.DataFrame(0., index=y.index, columns=pred_cols)
        fold = 1
        _run.info['loss'] = []
        _run.info['trainloss'] = []
        for itrain, itest in kf:
            Xtr, ytr, Xte, yte = data.get_train_test_features(
                itrain, itest, **aggregateparams)
            clf = RF(**clfparams)
            clf.fit(Xtr, ytr)
            # Local renamed so it does not shadow this function's own name.
            refined = RRF(clf, **refineparams)
            refined.fit(Xtr, ytr)
            loss2tr = multiclass_log_loss(ytr.values, refined.predict_proba(Xtr))
            loss2te = multiclass_log_loss(yte.values, refined.predict_proba(Xte))
            _run.info['loss'].append(loss2te)
            _run.info['trainloss'].append(loss2tr)
            print("Fold {} mlogloss train: {:.4f}, test: {:.4f}".format(
                fold, loss2tr, loss2te))
            pred.iloc[itest, :] = refined.predict_proba(Xte)
            fold += 1
        # Overall out-of-fold loss across all folds.
        loss = multiclass_log_loss(y.values, pred.values)
        _run.info['features'] = list(Xtr.columns)
        # Optionally save oob predictions (pred only exists when CV ran).
        if save_oob_predictions:
            filename = '{}_{}.csv'.format(series, time)
            pred.to_csv(filename, index_label='id')
    # Optionally generate test predictions on the full training set.
    if save_test_predictions:
        filename = '{}_test_{}.csv'.format(series, time)
        Xtr, ytr, Xte, yte = data.get_train_test_features(**aggregateparams)
        clf = RF(**clfparams)
        clf.fit(Xtr, ytr)
        refined = RRF(clf, **refineparams)
        refined.fit(Xtr, ytr)
        predtest = pd.DataFrame(refined.predict_proba(Xte),
                                index=yte.index, columns=pred_cols)
        predtest.to_csv(filename, index_label='id')
    return loss
def ret(series, n_folds, clfparams, featureparams, aggregateparams, refineparams,
        include, exclude, save_test_predictions, save_oob_predictions,
        skip_cross_validation, _run):
    """Cross-validate refined ExtraTrees with pruning-by-pruning early stopping.

    For each stratified fold an ExtraTrees classifier is fit, then an RRF is
    pruned one step at a time (rrfparams['n_prunings'] forced to 1 so each
    ``fit`` performs a single pruning). After each pruning the held-out loss
    is evaluated; the best-so-far model's out-of-fold and test predictions
    are kept, and the inner loop stops after 5 prunings without improvement
    or on IndexError (a tree pruned down to its root). Out-of-fold and
    averaged test predictions are always written to timestamped CSVs.

    Parameters mirror the sacred experiment config (see module); metrics are
    stored in ``_run.info`` ('loss', 'trainloss', 'best_pruning', 'features').
    ``save_*`` and ``skip_cross_validation`` are accepted for config
    compatibility but not consulted in this function.

    Returns the out-of-fold multiclass log loss.
    """
    data = TelstraData(include=include, exclude=exclude, **featureparams)
    X, y, Xtest, ytest = data.get_train_test_features(**aggregateparams)
    # Timestamp used to make output filenames unique.
    time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    pred_cols = ['predict_{}'.format(i) for i in range(3)]
    # One pruning per fit so we can evaluate after every pruning step.
    rrfparams = dict(refineparams)
    rrfparams['n_prunings'] = 1
    # Old sklearn.cross_validation StratifiedKFold API (labels first).
    kf = StratifiedKFold(y.values, n_folds=n_folds, shuffle=True)
    pred = pd.DataFrame(0., index=y.index, columns=pred_cols)
    testpreds = []  # list for storing per-fold test predictions
    fold = 1
    _run.info['loss'] = []
    _run.info['trainloss'] = []
    _run.info['best_pruning'] = []
    for itrain, itest in kf:
        Xtr, ytr, Xte, yte = data.get_train_test_features(
            itrain, itest, **aggregateparams)
        clf = ET(**clfparams)
        clf.fit(Xtr, ytr)
        rrf = RRF(clf, **rrfparams)
        best_loss = 1000.
        train_loss = 1000.
        no_improvement_in = 0
        for k in range(refineparams['n_prunings']):
            try:
                rrf.fit(Xtr, ytr)  # fit and do 1 pruning
            except IndexError:
                # Sometimes an unfortunate tree gets pruned down to the
                # root; stop and use the best-so-far prediction.
                print('IndexError')
                break
            loss2tr = multiclass_log_loss(ytr.values, rrf.predict_proba(Xtr))
            loss2te = multiclass_log_loss(yte.values, rrf.predict_proba(Xte))
            print("Fold {} Pruning {} mlogloss train: {:.4f}, test: {:.4f}".
                  format(fold, k, loss2tr, loss2te))
            if loss2te < best_loss:
                # Performance improved: keep this model's predictions.
                no_improvement_in = 0
                best_loss = loss2te
                pred.iloc[itest, :] = rrf.predict_proba(Xte)
                testpred = rrf.predict_proba(Xtest)
                train_loss = loss2tr
            else:
                no_improvement_in += 1
                if no_improvement_in >= 5:
                    break
        # Append the best test prediction found in this fold.
        # NOTE(review): if the very first pruning of the first fold raises
        # IndexError, testpred is unbound here (NameError); on later folds
        # the previous fold's prediction would be reused — confirm intended.
        testpreds.append(
            pd.DataFrame(testpred, index=ytest.index, columns=pred_cols))
        # Save loss and train loss from current fold.
        _run.info['loss'].append(best_loss)
        _run.info['trainloss'].append(train_loss)
        _run.info['best_pruning'].append(k + 1)
        print("Fold {} mlogloss train: {:.4f}, test: {:.4f}".format(
            fold, train_loss, best_loss))
        fold += 1
    loss = multiclass_log_loss(y.values, pred.values)
    _run.info['features'] = list(Xtr.columns)
    filename = '{}_{}_cv_{:.4f}.csv'.format(series, time, loss)
    pred.to_csv(filename, index_label='id')
    # Average the per-fold test predictions and save them.
    testpred = sum(testpreds) / len(testpreds)
    filename = '{}_test_{}_cv_{:.4f}.csv'.format(series, time, loss)
    testpred.to_csv(filename, index_label='id')
    return loss
def rf(series, n_folds, clfparams, featureparams, aggregateparams, refineparams,
       include, exclude, save_test_predictions, save_oob_predictions,
       skip_cross_validation, _run):
    """Cross-validate refined RandomForest with pruning-by-pruning early stopping.

    Identical procedure to ``ret`` but with a RandomForest base classifier:
    per fold, fit RF, then prune the RRF one step at a time
    (rrfparams['n_prunings'] forced to 1), keeping the best-so-far model's
    out-of-fold and test predictions. The inner loop stops after 5 prunings
    without improvement or on IndexError (a tree pruned down to its root).
    Out-of-fold and averaged test predictions are written to timestamped CSVs.

    Metrics are stored in ``_run.info`` ('loss', 'trainloss', 'best_pruning',
    'features'). ``save_*`` and ``skip_cross_validation`` are accepted for
    config compatibility but not consulted in this function.

    Returns the out-of-fold multiclass log loss.
    """
    data = TelstraData(include=include, exclude=exclude, **featureparams)
    X, y, Xtest, ytest = data.get_train_test_features(**aggregateparams)
    # Timestamp used to make output filenames unique.
    time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    pred_cols = ['predict_{}'.format(i) for i in range(3)]
    # One pruning per fit so we can evaluate after every pruning step.
    rrfparams = dict(refineparams)
    rrfparams['n_prunings'] = 1
    # Old sklearn.cross_validation StratifiedKFold API (labels first).
    kf = StratifiedKFold(y.values, n_folds=n_folds, shuffle=True)
    pred = pd.DataFrame(0., index=y.index, columns=pred_cols)
    testpreds = []  # list for storing per-fold test predictions
    fold = 1
    _run.info['loss'] = []
    _run.info['trainloss'] = []
    _run.info['best_pruning'] = []
    for itrain, itest in kf:
        Xtr, ytr, Xte, yte = data.get_train_test_features(
            itrain, itest, **aggregateparams)
        clf = RF(**clfparams)
        clf.fit(Xtr, ytr)
        rrf = RRF(clf, **rrfparams)
        best_loss = 1000.
        train_loss = 1000.
        no_improvement_in = 0
        for k in range(refineparams['n_prunings']):
            try:
                rrf.fit(Xtr, ytr)  # fit and do 1 pruning
            except IndexError:
                # Sometimes an unfortunate tree gets pruned down to the
                # root; stop and use the best-so-far prediction.
                print('IndexError')
                break
            loss2tr = multiclass_log_loss(ytr.values, rrf.predict_proba(Xtr))
            loss2te = multiclass_log_loss(yte.values, rrf.predict_proba(Xte))
            print("Fold {} Pruning {} mlogloss train: {:.4f}, test: {:.4f}".
                  format(fold, k, loss2tr, loss2te))
            if loss2te < best_loss:
                # Performance improved: keep this model's predictions.
                no_improvement_in = 0
                best_loss = loss2te
                pred.iloc[itest, :] = rrf.predict_proba(Xte)
                testpred = rrf.predict_proba(Xtest)
                train_loss = loss2tr
            else:
                no_improvement_in += 1
                if no_improvement_in >= 5:
                    break
        # Append the best test prediction found in this fold.
        # NOTE(review): if the very first pruning of the first fold raises
        # IndexError, testpred is unbound here (NameError); on later folds
        # the previous fold's prediction would be reused — confirm intended.
        testpreds.append(
            pd.DataFrame(testpred, index=ytest.index, columns=pred_cols))
        # Save loss and train loss from current fold.
        _run.info['loss'].append(best_loss)
        _run.info['trainloss'].append(train_loss)
        _run.info['best_pruning'].append(k + 1)
        print("Fold {} mlogloss train: {:.4f}, test: {:.4f}".format(
            fold, train_loss, best_loss))
        fold += 1
    loss = multiclass_log_loss(y.values, pred.values)
    _run.info['features'] = list(Xtr.columns)
    filename = '{}_{}_cv_{:.4f}.csv'.format(series, time, loss)
    pred.to_csv(filename, index_label='id')
    # Average the per-fold test predictions and save them.
    testpred = sum(testpreds) / len(testpreds)
    filename = '{}_test_{}_cv_{:.4f}.csv'.format(series, time, loss)
    testpred.to_csv(filename, index_label='id')
    return loss