max_features=None, min_density=None, min_samples_leaf=1, min_samples_split=2, random_state=None, splitter='best'), n_estimators=150, learning_rate=.5, loss='linear', random_state=None) #clf = gaussian_process.GaussianProcess(corr='cubic', theta0=1e-2, thetaL=1e-4, thetaU=1e-1, random_start=100) #clf = neighbors.KNeighborsRegressor(100, weights='uniform', algorithm = 'auto');clf_name='KNN_200' ################################################################################################ #---Different methods of cross validation---# #May require mtxTrn.toarray() cv_preds = train.cross_validate(hstack([sparse.csr_matrix(dfTrn.urlid.values).transpose(),mtxTrn]),mtxTrnTarget.ravel(), folds=10,SEED=42,test_size=.1,clf=clf,clf_name=clf_name,pred_fg=True) train.cross_validate(mtxTrn,mtxTrnTarget.ravel(),folds=8,SEED=888,test_size=.1,clf=clf,clf_name=clf_name,pred_fg=False) train.cross_validate_temporal(mtxTrn,mtxTest,mtxTrnTarget.ravel(),mtxTestTarget.ravel(),clf=clf, clf_name=clf_name,pred_fg=False) train.cross_validate_using_benchmark('global_mean',dfTrn, mtxTrn,mtxTrnTarget,folds=20) ################################################################################################ #---Calculate the degree of variance between ground truth and the mean of the CV predictions.----# #---Returns a list of all training records with their average variance---# train.calc_cv_preds_var(dfTrn,cv_preds) ################################################################################################ #--Use estimator for manual predictions--# dfTest, clf = train.predict(mtxTrn,mtxTrnTarget.ravel(),mtxTest,dfTest,clf,clf_name) #may require mtxTest.toarray() dfTest, clf = train.predict(mtxTrn.todense(),mtxTrnTarget.ravel(),mtxTest.todense(),dfTest,clf,clf_name) #may require mtxTest.toarray() ################################################################################################ #--Save feature matrices in svm format for external modeling--#
#--Save predictions to file--# train.save_predictions(dfTest_ML, clf_name, '_All_1_5_KitchenSink', submission_no) #---------End Machine Learning Section-------------# #------------------------------Optional Steps----------------------------------# #--Memory cleanup prior to running the memory intensive classifiers--# dfTrn, dfTest, dfAll = utils.data_garbage_collection(dfTrn, dfTest, dfAll) #--use a benchmark instead of a classifier--# benchmark_preds = train.cross_validate_using_benchmark( '3.5', dfTrn, dfTrn[0].merge(dfTrn[1], how='inner', on='business_id').as_matrix(), dfTrn[0].ix[:, ['rev_stars']].as_matrix(), folds=3, SEED=42, test_size=.15) benchmark_preds = train.cross_validate_using_benchmark( 'global_mean', dfTrn, dfTrn[0].merge(dfTrn[1], how='inner', on='business_id').as_matrix(), dfTrn[0].ix[:, ['rev_stars']].as_matrix(), folds=3, SEED=42, test_size=.15) benchmark_preds = train.cross_validate_using_benchmark( 'business_mean', dfTrn, dfTrn[0].merge(dfTrn[1], how='inner', on='business_id').as_matrix(),
# Cross-validate the current estimator on the training matrix.
train.cross_validate(mtxTrn, mtxTarget, clf, folds=10, SEED=42, test_size=.2)  #may require mtxTrn.toarray()

#--Use classifier for predictions--#
dfTest_ML, clf = train.predict(mtxTrn, mtxTarget, mtxTest, dfTest_ML, clf, clf_name)  #may require mtxTest.toarray()

#--Save predictions to file--#
train.save_predictions(dfTest_ML, clf_name, '_All_1_5_KitchenSink', submission_no)
#---------End Machine Learning Section-------------#

#------------------------------Optional Steps----------------------------------#

#--Memory cleanup prior to running the memory intensive classifiers--#
dfTrn, dfTest, dfAll = utils.data_garbage_collection(dfTrn, dfTest, dfAll)

#--use a benchmark instead of a classifier--#
# dfTrn is indexed as a sequence of frames here: [0] is merged with [1] on
# 'business_id' and with [2] on 'user_id'.
# `.values` / `.loc` replace `.as_matrix()` / `.ix`, which were removed in
# pandas 1.0; both replacements also exist in older pandas releases.
# Each call is kept self-contained so any single line can be run on its own.
# NOTE(review): the three business-level calls take the target from the
# unmerged dfTrn[0], while the user-level call takes it from the merged frame.
# If the inner merge drops or reorders rows the features and targets would not
# line up -- confirm this is intended.
benchmark_preds = train.cross_validate_using_benchmark(
    '3.5', dfTrn,
    dfTrn[0].merge(dfTrn[1], how='inner', on='business_id').values,
    dfTrn[0].loc[:, ['rev_stars']].values,
    folds=3, SEED=42, test_size=.15)
benchmark_preds = train.cross_validate_using_benchmark(
    'global_mean', dfTrn,
    dfTrn[0].merge(dfTrn[1], how='inner', on='business_id').values,
    dfTrn[0].loc[:, ['rev_stars']].values,
    folds=3, SEED=42, test_size=.15)
benchmark_preds = train.cross_validate_using_benchmark(
    'business_mean', dfTrn,
    dfTrn[0].merge(dfTrn[1], how='inner', on='business_id').values,
    dfTrn[0].loc[:, ['rev_stars']].values,
    folds=3, SEED=42, test_size=.15)

# The review/user merge feeds both the feature matrix and the target column,
# so build it once instead of merging twice inside the call.
dfUsrMerged = dfTrn[0].merge(dfTrn[2], how='inner', on='user_id')
benchmark_preds = train.cross_validate_using_benchmark(
    'usr_mean', dfTrn,
    dfUsrMerged.values,
    dfUsrMerged.loc[:, ['rev_stars']].values,
    folds=3, SEED=22, test_size=.15)
del dfUsrMerged  # release the temporary merge; this section is memory-sensitive

#--predict using a benchmark--#
train.save_predictions_benchmark(dfTest_Benchmark_BusMean, 'bus_mean', submission_no)
train.save_predictions_benchmark(dfTest_Benchmark_UsrMean, 'usr_mean', submission_no)
train.save_predictions_benchmark(dfTest_Benchmark_BusUsrMean, 'bus_usr_mean', submission_no)

#--Save model to joblib file--#
train.save_model(clf, clf_name)

#--Save a dataframe to CSV--#
# Timestamped output path (day-month-year_HHMM) so successive runs do not
# overwrite each other.
filename = 'Data/' + datetime.now().strftime("%d-%m-%y_%H%M") + '--FinalDataset--OldUserTest' + '.csv'
#del dfTest_Master['business_id'];del dfTest_Master['user_id'];
folds=8, SEED=888, test_size=.1, clf=clf, clf_name=clf_name, pred_fg=False) train.cross_validate_temporal(mtxTrn, mtxTest, mtxTrnTarget.ravel(), mtxTestTarget.ravel(), clf=clf, clf_name=clf_name, pred_fg=False) train.cross_validate_using_benchmark('global_mean', dfTrn, mtxTrn, mtxTrnTarget, folds=20) ################################################################################################ #---Calculate the degree of variance between ground truth and the mean of the CV predictions.----# #---Returns a list of all training records with their average variance---# train.calc_cv_preds_var(dfTrn, cv_preds) ################################################################################################ #--Use estimator for manual predictions--# dfTest, clf = train.predict(mtxTrn, mtxTrnTarget.ravel(), mtxTest, dfTest, clf, clf_name) #may require mtxTest.toarray() dfTest, clf = train.predict(mtxTrn.todense(), mtxTrnTarget.ravel(), mtxTest.todense(), dfTest, clf, clf_name) #may require mtxTest.toarray()