Пример #1
0
                                                                                   max_features=None, min_density=None, min_samples_leaf=1,
                                                                                   min_samples_split=2, random_state=None, splitter='best'),
                                 n_estimators=150, learning_rate=.5, loss='linear', random_state=None)
#clf = gaussian_process.GaussianProcess(corr='cubic', theta0=1e-2, thetaL=1e-4, thetaU=1e-1, random_start=100)
#clf = neighbors.KNeighborsRegressor(100, weights='uniform', algorithm = 'auto');clf_name='KNN_200'


################################################################################################
#---Different methods of cross validation---#
# Some estimators may need a dense input: try mtxTrn.toarray() in that case.

# Variant 1: a column built from dfTrn.urlid is stacked in front of mtxTrn. This is the
# only call in this section whose return value is kept (cv_preds).
cv_preds = train.cross_validate(
    hstack([sparse.csr_matrix(dfTrn.urlid.values).transpose(), mtxTrn]),
    mtxTrnTarget.ravel(),
    folds=10,
    SEED=42,
    test_size=0.1,
    clf=clf,
    clf_name=clf_name,
    pred_fg=True,
)

# Variant 2: plain feature matrix, different fold count and seed, pred_fg switched off.
train.cross_validate(
    mtxTrn,
    mtxTrnTarget.ravel(),
    folds=8,
    SEED=888,
    test_size=0.1,
    clf=clf,
    clf_name=clf_name,
    pred_fg=False,
)

# Variant 3: train and test matrices/targets are handed over separately.
train.cross_validate_temporal(
    mtxTrn,
    mtxTest,
    mtxTrnTarget.ravel(),
    mtxTestTarget.ravel(),
    clf=clf,
    clf_name=clf_name,
    pred_fg=False,
)

# Variant 4: 'global_mean' benchmark instead of an estimator. The target is passed
# without .ravel() here, unlike the calls above.
train.cross_validate_using_benchmark(
    'global_mean', dfTrn, mtxTrn, mtxTrnTarget, folds=20)


################################################################################################
#---Calculate the degree of variance between ground truth and the mean of the CV predictions.----#
#---Returns a list of all training records with their average variance---#
# NOTE(review): the return value is not captured here -- confirm the helper reports it itself.
train.calc_cv_preds_var(dfTrn, cv_preds)


################################################################################################
#--Use estimator for manual predictions--#
# Sparse-input variant; switch to mtxTest.toarray() if the estimator needs dense data.
dfTest, clf = train.predict(
    mtxTrn,
    mtxTrnTarget.ravel(),
    mtxTest,
    dfTest,
    clf,
    clf_name,
)
# Dense-input variant: the same call with both matrices converted through .todense().
# NOTE(review): run as written, this rebinds the dfTest and clf produced by the call above.
dfTest, clf = train.predict(
    mtxTrn.todense(),
    mtxTrnTarget.ravel(),
    mtxTest.todense(),
    dfTest,
    clf,
    clf_name,
)

################################################################################################
#--Save feature matrices in svm format for external modeling--#
#--Save predictions to file--#
# NOTE(review): the predict calls above assign to dfTest, while dfTest_ML is what gets
# saved here -- confirm which frame is intended.
train.save_predictions(
    dfTest_ML, clf_name, '_All_1_5_KitchenSink', submission_no)

#---------End Machine Learning Section-------------#

#------------------------------Optional Steps----------------------------------#
#--Memory cleanup prior to running the memory intensive classifiers--#
dfTrn, dfTest, dfAll = utils.data_garbage_collection(dfTrn, dfTest, dfAll)

#--use a benchmark instead of a classifier--#
# dfTrn is indexed positionally below: dfTrn[0] supplies the 'rev_stars' target column
# and is inner-joined to dfTrn[1] on 'business_id' to build the feature matrix.
# DataFrame.as_matrix() and the .ix indexer were deprecated and then removed in
# pandas 1.0; .values and .loc give the same arrays and also work on older releases.
# Each call rebinds benchmark_preds, so only the most recent result is kept.
benchmark_preds = train.cross_validate_using_benchmark(
    '3.5',
    dfTrn,
    dfTrn[0].merge(dfTrn[1], how='inner', on='business_id').values,
    dfTrn[0].loc[:, ['rev_stars']].values,
    folds=3,
    SEED=42,
    test_size=.15)
benchmark_preds = train.cross_validate_using_benchmark(
    'global_mean',
    dfTrn,
    dfTrn[0].merge(dfTrn[1], how='inner', on='business_id').values,
    dfTrn[0].loc[:, ['rev_stars']].values,
    folds=3,
    SEED=42,
    test_size=.15)
benchmark_preds = train.cross_validate_using_benchmark(
    'business_mean',
    dfTrn,
    dfTrn[0].merge(dfTrn[1], how='inner', on='business_id').as_matrix(),
# Cross-validate the estimator; clf is passed positionally as the third argument here.
# May require mtxTrn.toarray() if the estimator needs dense input.
train.cross_validate(
    mtxTrn,
    mtxTarget,
    clf,
    folds=10,
    SEED=42,
    test_size=0.2,
)

#--Use classifier for predictions--#
# May require mtxTest.toarray() if the estimator needs dense input.
dfTest_ML, clf = train.predict(
    mtxTrn,
    mtxTarget,
    mtxTest,
    dfTest_ML,
    clf,
    clf_name,
)

#--Save predictions to file--#
train.save_predictions(
    dfTest_ML,
    clf_name,
    '_All_1_5_KitchenSink',
    submission_no,
)

#---------End Machine Learning Section-------------#

#------------------------------Optional Steps----------------------------------#
#--Memory cleanup prior to running the memory intensive classifiers--#
dfTrn, dfTest, dfAll = utils.data_garbage_collection(dfTrn, dfTest, dfAll)

#--use a benchmark instead of a classifier--#
# dfTrn is indexed positionally below: dfTrn[0] carries 'rev_stars' and is inner-joined
# to dfTrn[1] on 'business_id' or to dfTrn[2] on 'user_id'.
# DataFrame.as_matrix() and the .ix indexer were deprecated and then removed in
# pandas 1.0; .values and .loc give the same arrays and also work on older releases.
# Each call rebinds benchmark_preds, so only the most recent result is kept.
benchmark_preds = train.cross_validate_using_benchmark(
    '3.5',
    dfTrn,
    dfTrn[0].merge(dfTrn[1], how='inner', on='business_id').values,
    dfTrn[0].loc[:, ['rev_stars']].values,
    folds=3,
    SEED=42,
    test_size=.15)
benchmark_preds = train.cross_validate_using_benchmark(
    'global_mean',
    dfTrn,
    dfTrn[0].merge(dfTrn[1], how='inner', on='business_id').values,
    dfTrn[0].loc[:, ['rev_stars']].values,
    folds=3,
    SEED=42,
    test_size=.15)
benchmark_preds = train.cross_validate_using_benchmark(
    'business_mean',
    dfTrn,
    dfTrn[0].merge(dfTrn[1], how='inner', on='business_id').values,
    dfTrn[0].loc[:, ['rev_stars']].values,
    folds=3,
    SEED=42,
    test_size=.15)
# The user benchmark takes its target from the merged frame (not from dfTrn[0] directly)
# and uses a different seed than the calls above.
benchmark_preds = train.cross_validate_using_benchmark(
    'usr_mean',
    dfTrn,
    dfTrn[0].merge(dfTrn[2], how='inner', on='user_id').values,
    dfTrn[0].merge(dfTrn[2], how='inner', on='user_id').loc[:, ['rev_stars']].values,
    folds=3,
    SEED=22,
    test_size=.15)

#--predict using a benchmark--#
# One save per benchmark frame; the second argument is the label passed along with it.
train.save_predictions_benchmark(
    dfTest_Benchmark_BusMean, 'bus_mean', submission_no)
train.save_predictions_benchmark(
    dfTest_Benchmark_UsrMean, 'usr_mean', submission_no)
train.save_predictions_benchmark(
    dfTest_Benchmark_BusUsrMean, 'bus_usr_mean', submission_no)

#--Save model to joblib file--#
train.save_model(clf, clf_name)

#--Save a dataframe to CSV--#
# Target path: 'Data/' + current local time as dd-mm-yy_HHMM + a fixed suffix.
filename = ''.join([
    'Data/',
    datetime.now().strftime("%d-%m-%y_%H%M"),
    '--FinalDataset--OldUserTest',
    '.csv',
])
#del dfTest_Master['business_id'];del dfTest_Master['user_id'];
Пример #4
0
                     folds=8,
                     SEED=888,
                     test_size=.1,
                     clf=clf,
                     clf_name=clf_name,
                     pred_fg=False)
# Validation call that receives the train and test matrices/targets separately;
# the return value is not kept.
train.cross_validate_temporal(
    mtxTrn, mtxTest,
    mtxTrnTarget.ravel(), mtxTestTarget.ravel(),
    clf=clf, clf_name=clf_name, pred_fg=False,
)
# 'global_mean' benchmark run; the target is passed without .ravel() here.
train.cross_validate_using_benchmark(
    'global_mean', dfTrn, mtxTrn, mtxTrnTarget, folds=20)

################################################################################################
#---Calculate the degree of variance between ground truth and the mean of the CV predictions.----#
#---Returns a list of all training records with their average variance---#
# NOTE(review): the return value is not captured here -- confirm the helper reports it itself.
train.calc_cv_preds_var(dfTrn, cv_preds)

################################################################################################
#--Use estimator for manual predictions--#
# Sparse-input variant; may require mtxTest.toarray() for estimators needing dense data.
dfTest, clf = train.predict(
    mtxTrn,
    mtxTrnTarget.ravel(),
    mtxTest,
    dfTest,
    clf,
    clf_name,
)
# Dense-input variant: both matrices go through .todense() first.
# NOTE(review): run as written, this rebinds the dfTest and clf produced by the call above.
dfTest, clf = train.predict(
    mtxTrn.todense(),
    mtxTrnTarget.ravel(),
    mtxTest.todense(),
    dfTest,
    clf,
    clf_name,
)