def validate_classifier(clf, X, Y_train, Y_validate, cv, use_predProb_instead,
                        scoreFunc=roc_auc_score, n_jobs=16, test_size=0.25):
    """
    @param cv: an object (list of (trainInds, testInds)) or an integer (number of folds)
    @return: list of cv scores
    """
    cvObj = StratifiedShuffleSplit(Y_validate, n_iter=cv, test_size=test_size) if isinstance(cv, int) else cv
    scores = jjcross_val_score(clf, X, Y_train, scoreFunc, cvObj, Y_validate,
                               n_jobs=n_jobs, use_predProb_instead=use_predProb_instead)
    return scores
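# Hedged usage sketch (not from the original code): how validate_classifier might be called
# with an integer fold count, so that a StratifiedShuffleSplit is built internally. The
# RandomForestClassifier, the data arguments and the job count are illustrative placeholders.
def _example_validate_classifier(X, Y_train, Y_validate):
    from sklearn.ensemble import RandomForestClassifier
    clf = RandomForestClassifier(n_estimators=50)
    # cv=5 -> 5 stratified shuffle splits over Y_validate; use_predProb_instead=True scores
    # predicted probabilities, which is what the default scoreFunc (roc_auc_score) expects
    scores = validate_classifier(clf, X, Y_train, Y_validate, cv=5,
                                 use_predProb_instead=True, n_jobs=4)
    print 'CV scores:', scores
    return scores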
def gridSearch(clf, cvOutputFname, x_train, y_train, weights, innerclf=False, num_folds=10):
    """
    :param clf: estimator (or wrapper) whose alpha and tol parameters are grid-searched
    :param cvOutputFname: path of the file the CV results are written to
    :param x_train: training design matrix
    :param y_train: training targets
    :param weights: sample weights passed to the CV scorer
    :param innerclf: if true, the clf parameter is an "outer" object that wraps a "regressor"
                     attribute of type Ridge; if false, clf is the Ridge itself
    :param num_folds: number of CV folds
    :return: dict mapping (alpha, tolerance) to (mean score, std of scores)
    """
    print '================== Grid Search for the Best Parameter =================='
    cvOutputFile = open(cvOutputFname, 'w')
    res = {}
    cvObj = KFold(len(y_train), n_folds=num_folds, shuffle=True, random_state=0)

    for tolerance in [0.00001, 0.0001, 0.001, 0.01, 0.1, 0.2, 0.5]:
        for alpha in np.arange(0.01, 5, 0.1):
            print '>>>> alpha =', alpha, ', tolerance =', tolerance

            if innerclf:
                clf.regressor.set_params(alpha=alpha, tol=tolerance)
            else:
                clf.set_params(alpha=alpha, tol=tolerance)

            scores = jjcross_val_score(clf, x_train, y_train, normalized_weighted_gini, cvObj,
                                       weights=weights, verbose=False)
            meanScore = np.mean(scores)
            stdScore = np.std(scores)

            s = 'alpha = %f, tolerance = %f, mean = %f, std = %f\n' % (alpha, tolerance, meanScore, stdScore)
            print s
            res[(alpha, tolerance)] = (meanScore, stdScore)
            cvOutputFile.write(s)

    print '>>>>>> Result sorted by mean score:'
    pprint(sorted(res.items(), key=lambda x: -x[1][0]))
    cvOutputFile.close()

    return res
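# Hedged usage sketch (not part of the original script): a minimal gridSearch call on a bare
# Ridge regressor (innerclf=False). The output path is an illustrative placeholder.
def _example_gridSearch(x_train, y_train, weights):
    from sklearn.linear_model import Ridge
    ridge = Ridge()
    res = gridSearch(ridge, '/tmp/ridge_grid_search.txt', x_train, y_train, weights,
                     innerclf=False, num_folds=10)
    # res maps (alpha, tolerance) -> (mean normalized weighted gini, std across folds)
    bestParams = max(res.items(), key=lambda kv: kv[1][0])[0]
    print 'best (alpha, tolerance):', bestParams
    return bestParams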
                                          parentsProportion=0.4, mutationProportion=0.1,
                                          mutationProbability=0.1, mutationStdDev=None,
                                          populationSize=6)

    # keep the best-scoring pipeline and its parameters
    if cur_bestScore > bestScore:
        bestScore = cur_bestScore
        bestPipe = clone(pipe)
        bestPipe.set_params(**cur_bestParams)
        bestParams = cur_bestParams

    indivClfs.append(bestPipe)
    print '---->', col, '<----', bestScore
    pprint(bestParams)

# combine the individual classifiers and check the overall CV accuracy
combinedClf = CombinedClassifier(indivClfs)
print 'OVERALL CV SCORE:', np.mean(jjcross_val_score(combinedClf, X_cal, y_cal, accuracy_score,
                                                     cv=5, n_jobs=N_JOBS))

# validate classifier

print '====== TRAINING'
_, inputTable_train, outputTable_train, _ = condense_data(trainingName, isTraining=True, readFromFiles=True)
pdf(inputTable_train)
temp = riskFactorImp.fit_transform(inputTable_train)
assert np.isnan(temp.risk_factor).sum() == 0
X_train = Normalizer().fit_transform(Imputer().fit_transform(temp))
# X_train = Normalizer().fit_transform(Imputer().fit_transform(inputTable_train))
y_train = CombinedClassifier.combine_outputs(np.array(outputTable_train))
combinedClf.fit(X_train, y_train)
#                     verbose=1)
# clf = SVR()

# ================== CORRELATION ==================
# print '================== CORRELATION =================='
# print x_train.shape
# numFields = 30
# x_train, newCols = create_new_features(x_train, columns=columns_train)
# corrs = calculate_y_corrs(x_train, y_train)[0]
# ord = corrs.argsort()[::-1][:numFields]
# x_train = x_train[:, ord]

# ================== CV ==================
print '================== CV =================='
scores = jjcross_val_score(regressor, x_train, y_train, normalized_weighted_gini,
                           KFold(len(y_train), n_folds=5, shuffle=True, random_state=0),
                           weights=weights)  #, n_jobs=1

# ================== Grid Search for the Best Parameter ==================
# gridSearch(clf, '/home/jj/code/Kaggle/Fire/cvRes/RidgeGroupThenRegress.txt', x_train, y_train, weights, innerclf=True)

# ================== train ==================
# print '================== train =================='
# clf.fit(x_train, y_train, sample_weight=weights)

# ================== predict ==================
# print '================== predict =================='
# x_test, _, ids_pred, _, _, _ = process_data('/home/jj/code/Kaggle/Fire/Data/test.csv',
#                                             impute=True, imputeDataDir='/home/jj/code/Kaggle/Fire/intermediateOutput',
#                                             imputeStrategy='median', fieldsToUse=columns_train)
# pred = clf.predict(x_test)
# pandas.DataFrame({'id': ids_pred, 'target': pred}).\
from multiprocessing import cpu_count

from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.cross_validation import KFold
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

from Kaggle.utilities import plot_histogram, plot_feature_importances, jjcross_val_score
from globalVars import *
from evaluation import normalized_weighted_gini
from utilities import process_data
from correlations import *


x_train, y_regress, _, columns_train, weights, y_class = \
    process_data('/home/jj/code/Kaggle/Fire/Data/train.csv',
                 impute=True, imputeDataDir='/home/jj/code/Kaggle/Fire/intermediateOutput',
                 imputeStrategy='median', fieldsToUse=FIELDS_CLASS_GBC_TOP100[:5])

# print '==================== feature importances =================='
# plot_feature_importances(x_train, y_class, columns_train, numTopFeatures=0.95, numEstimators=50, num_jobs=11)

print '==================== CV =================='
# clf = GradientBoostingClassifier(learning_rate=0.1, loss='deviance')
clf = RandomForestClassifier(n_estimators=50, n_jobs=cpu_count() - 2)
# clf.fit(x_train, y_class)

jjcross_val_score(clf, x_train, y_class, roc_auc_score,
                  KFold(len(y_class), n_folds=5, shuffle=True, random_state=0),
                  weights=weights)  #, n_jobs=1
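# Hedged sketch (not the implementation in evaluation.py): the weighted normalized Gini as it
# is commonly defined, included only to document what normalized_weighted_gini is scoring in
# the CV calls above. The (y_true, y_pred, weights) argument order is an assumption.
def _sketch_normalized_weighted_gini(y_true, y_pred, weights):
    import numpy as np

    def _weighted_gini(act, pred, w):
        act, pred, w = np.asarray(act, dtype=float), np.asarray(pred), np.asarray(w, dtype=float)
        order = np.argsort(pred)[::-1]                   # rank by prediction, largest first
        act, w = act[order], w[order]
        cum_w = np.cumsum(w) / w.sum()                   # cumulative share of weight
        cum_loss = np.cumsum(act * w) / (act * w).sum()  # cumulative share of weighted loss
        return np.sum((cum_loss - cum_w) * w)

    return _weighted_gini(y_true, y_pred, weights) / _weighted_gini(y_true, y_true, weights)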
def quick_score(clf, X, y, cv=5, n_jobs=20):
    """ Return the mean CV score (mean absolute error across folds) of a classifier. """
    return jjcross_val_score(clf, X, y, mean_absolute_error, cv, n_jobs=n_jobs).mean()
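# Hedged usage sketch (not from the original code): quick_score with an explicit fold count.
# The GradientBoostingRegressor and the data names are illustrative placeholders.
def _example_quick_score(X, y):
    from sklearn.ensemble import GradientBoostingRegressor
    reg = GradientBoostingRegressor(n_estimators=100)
    # lower is better, since the score function is mean_absolute_error
    print 'mean CV MAE:', quick_score(reg, X, y, cv=5, n_jobs=4)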