Example #1
# NOTE: assumes numpy, hyperopt, and the surrounding project's cross_validate
# helper (plus X and y) are available in scope.
import numpy as np
from datetime import datetime
from hyperopt import STATUS_OK

def train_wrapper(params):
    cv_losses, _ = cross_validate(params, X, y)
    # return a dict to be recorded in hyperopt Trials for future use
    return {
        'loss': np.mean(cv_losses),
        'status': STATUS_OK,
        'eval_time': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        'params': params
    }
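
A usage sketch (not part of the original snippet): a wrapper returning this dict shape is what hyperopt's fmin consumes, and every returned dict is recorded in the Trials object; the search space below is purely illustrative.

from hyperopt import fmin, tpe, hp, Trials

trials = Trials()
space = {'max_depth': hp.quniform('max_depth', 2, 10, 1)}  # hypothetical space
best_params = fmin(fn=train_wrapper, space=space, algo=tpe.suggest,
                   max_evals=50, trials=trials)
# trials.results now holds every dict returned by train_wrapper
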
Example #2
#----For smaller data sets------#  (These do not work, or have very long training times, on large sparse datasets; they require .todense())
#clf = ensemble.RandomForestRegressor(n_estimators=50); clf_name='RFReg_50'
#clf = ensemble.ExtraTreesRegressor(n_estimators=30)  #n_jobs = -1 if running in a main() loop
#clf = ensemble.GradientBoostingRegressor(n_estimators=700, learning_rate=.1, max_depth=1, random_state=888, loss='ls');clf_name='GBM'
clf = ensemble.AdaBoostRegressor(base_estimator=tree.DecisionTreeRegressor(max_depth=3),
                                 n_estimators=150, learning_rate=.5, loss='linear',
                                 random_state=None); clf_name='AdaBoost_150'
#clf = gaussian_process.GaussianProcess(corr='cubic', theta0=1e-2, thetaL=1e-4, thetaU=1e-1, random_start=100)
#clf = neighbors.KNeighborsRegressor(100, weights='uniform', algorithm='auto'); clf_name='KNN_100'


################################################################################################
#---Different methods of cross validation---#
#May require mtxTrn.toarray()
cv_preds = train.cross_validate(hstack([sparse.csr_matrix(dfTrn.urlid.values).transpose(),mtxTrn]),mtxTrnTarget.ravel(),
                                folds=10,SEED=42,test_size=.1,clf=clf,clf_name=clf_name,pred_fg=True)
train.cross_validate(mtxTrn,mtxTrnTarget.ravel(),folds=8,SEED=888,test_size=.1,clf=clf,clf_name=clf_name,pred_fg=False)
train.cross_validate_temporal(mtxTrn,mtxTest,mtxTrnTarget.ravel(),mtxTestTarget.ravel(),clf=clf,
                              clf_name=clf_name,pred_fg=False)
train.cross_validate_using_benchmark('global_mean',dfTrn, mtxTrn,mtxTrnTarget,folds=20)
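
#--A minimal sketch of what a helper like train.cross_validate might do;--#
#--train.cross_validate is project code, so the body below is an assumption--#
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

def cross_validate_sketch(X, y, folds, SEED, clf):
    losses = []
    for trn_idx, val_idx in KFold(n_splits=folds, shuffle=True,
                                  random_state=SEED).split(X):
        clf.fit(X[trn_idx], y[trn_idx])  # fit on the training fold
        losses.append(mean_squared_error(y[val_idx], clf.predict(X[val_idx])))
    return losses  # one loss per fold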


################################################################################################
#---Calculate the degree of variance between ground truth and the mean of the CV predictions.----#
#---Returns a list of all training records with their average variance---#
train.calc_cv_preds_var(dfTrn,cv_preds)
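
#--A rough sketch of that calculation (calc_cv_preds_var is project code; the--#
#--shape of cv_preds as a list of per-fold prediction arrays is an assumption)--#
#mean_preds = np.mean(np.column_stack(cv_preds), axis=1)
#per_record_var = (mtxTrnTarget.ravel() - mean_preds) ** 2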


################################################################################################
#--Use estimator for manual predictions--#
dfTest, clf = train.predict(mtxTrn,mtxTrnTarget.ravel(),mtxTest,dfTest,clf,clf_name) #may require mtxTest.toarray()
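
#--train.predict is project code; presumably it fits on the full training set--#
#--and attaches predictions to the test dataframe, roughly:--#
#clf.fit(mtxTrn, mtxTrnTarget.ravel())
#dfTest['predictions_'+clf_name] = clf.predict(mtxTest)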
#--------------Machine Learning (woohoo, we finally got to the good stuff)------------------------#
#quant_features = ['user_average_stars','user_review_count','calc_total_checkins','bus_stars','bus_review_count']
quant_features = ['bus_stars','user_average_stars','bus_review_count', 'user_review_count','calc_total_checkins','calc_cat_avg']
dfTrn_ML = dfTrn_All_5_8
dfTest_ML = dfTest_All_1_5
mtxTrn,mtxTest = features.standardize(dfTrn_ML,dfTest_ML,quant_features)
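#--features.standardize is project code; a minimal sketch of the likely behavior,--#
#--fitting the scaler on the training set only and applying it to both sets:--#
#from sklearn.preprocessing import StandardScaler
#scaler = StandardScaler().fit(dfTrn_ML[quant_features])
#mtxTrn = scaler.transform(dfTrn_ML[quant_features])
#mtxTest = scaler.transform(dfTest_ML[quant_features])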
#--Combine the standardized quant features and the vectorized categorical features--#
#mtxTrn = hstack([mtxTrn,vecTrn_BusOpen])  #vecTrn_BusOpen,vecTrn_Cats,vecTrn_Zip,
#mtxTest = hstack([mtxTest,vecTest_BusOpen]) #vecTest_Master_Cats,vecTest_Master_Zip,
#--Test without the vecZip and vecCats--#
#mtxTrn = hstack([mtxTrn,vecTrn_BusOpen])
#mtxTest = hstack([mtxTest,vecTest_Master_BusOpen])
#--select target--#
mtxTarget = dfTrn_ML.ix[:,['rev_stars']].as_matrix()

#--Use classifier for cross validation--#
train.cross_validate(mtxTrn,mtxTarget,clf,folds=10,SEED=42,test_size=.2)  #may require mtxTrn.toarray()

#--Use classifier for predictions--#
dfTest_ML, clf = train.predict(mtxTrn,mtxTarget,mtxTest,dfTest_ML,clf,clf_name) #may require mtxTest.toarray()

#--Save predictions to file--#
train.save_predictions(dfTest_ML,clf_name,'_All_1_5_KitchenSink',submission_no)
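#--train.save_predictions is project code; a plausible minimal version, following--#
#--the timestamped-CSV pattern used elsewhere in this codebase:--#
#from datetime import datetime
#def save_predictions(df, clf_name, note, submission_no):
#    stamp = datetime.now().strftime('%m-%d-%y_%H%M')
#    df.to_csv('Submits/%s--%s%s_%s.csv' % (stamp, clf_name, note, submission_no), index=False)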

#---------End Machine Learning Section-------------#

#------------------------------Optional Steps----------------------------------#
#--Memory cleanup prior to running the memory intensive classifiers--#
dfTrn,dfTest,dfAll = utils.data_garbage_collection(dfTrn,dfTest,dfAll)
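#--utils.data_garbage_collection is project code; a plausible minimal equivalent--#
#--drops unused references and forces a collection:--#
#import gc
#del dfAll          # release frames no longer needed
#gc.collect()       # prompt Python to return freed memory sooner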

#--use a benchmark instead of a classifier--#
benchmark_preds = train.cross_validate_using_benchmark('3.5', dfTrn,
                      dfTrn[0].merge(dfTrn[1], how='inner', on='business_id').as_matrix(),
                      dfTrn[0].ix[:,['rev_stars']].as_matrix(),
                      folds=3, SEED=42, test_size=.15)
#----------------------------------------#
#--------- Machine Learning--------------#
#----------------------------------------#
#--select target--#
mtxTarget = frmTrn_All.ix[:,['rev_votes_useful']].as_matrix()

#--select classifier--#
##  Common options:  ensemble -- RandomForestClassifier, RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
##                   linear_model -- SGDRegressor, Lasso
#clf = linear_model.LassoCV(cv=3)
#clf = linear_model.ElasticNet()
#clf = ensemble.RandomForestRegressor(n_estimators=50)
clf = linear_model.SGDRegressor(alpha=0.001, n_iter=1000, shuffle=True); clfname='SGD_001_1000'
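#--Note: in modern scikit-learn, SGDRegressor's n_iter parameter was replaced--#
#--by max_iter; the equivalent present-day call would be:--#
#clf = linear_model.SGDRegressor(alpha=0.001, max_iter=1000, shuffle=True)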

#--Use classifier for cross validation--#
train.cross_validate(mtxTrn,mtxTarget,clf,folds=10,SEED=42,test_size=.15)

#--Use classifier for predictions--#
frmTest_All = train.predict(mtxTrn,mtxTarget,mtxTest,frmTest_All,clf,clfname)
#frmTest_NoVotes = train.predict(mtxTrn,mtxTarget,mtxTest,frmTest_NoVotes,clf,clfname)
#frmTest_NoUser = train.predict(mtxTrn,mtxTarget,mtxTest,frmTest_NoUser,clf,clfname)

#--Save predictions to file--#
train.save_predictions(frmTest_All,clfname)
#train.save_predictions(frmTest_NoVotes,clfname)
#train.save_predictions(frmTest_NoUser,clfname)

#--Save model to joblib file--#
train.save_model(clf,clfname)

#--Load model from joblib file--#
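#--(train.save_model/load are project code; with joblib the round trip is typically:)--#
#from sklearn.externals import joblib   # in modern scikit-learn: import joblib
#joblib.dump(clf, clfname + '.pkl')     # what save_model presumably does
#clf = joblib.load(clfname + '.pkl')    # the matching load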
Example #6
def main():
    log.info('********New program instance started********')

    #-------------Load Environment----------------------#
    #Get program settings and model settings from SETTINGS.json file in root directory
    settings, model_settings = utils.load_settings()

    #If not using cached data, then load raw data, clean/munge it, create hand-crafted features, slice it for CV
    if settings['use_cached_data'] == 'y':
        log.info('==========LOADING CACHED FEATURES===========')
        dfTrn = data_io.load_cached_object('dfTrn')
        dfTest = data_io.load_cached_object('dfTest')
        dfCV = data_io.load_flatfile_to_df('Data/CV.csv')
    else:
        #-------Data Loading/Cleaning/Munging------------#
        #Load the data
        log.info('===============LOADING DATA=================')
        dfTrn = data_io.load_flatfile_to_df(settings['file_data_train'])
        dfTest = data_io.load_flatfile_to_df(settings['file_data_test'])
        dfCV = data_io.load_flatfile_to_df('Data/CV.csv')

        #Clean/Munge the data
        log.info('=======CLEANING AND MUNGING DATA============')
        dfTrn = munge.clean(dfTrn)
        dfTest = munge.clean(dfTest)

        #-------Feature creation-------------------------#
        #Add all currently used hand crafted features to dataframes
        log.info('====CREATING HAND-CRAFTED DATA FEATURES=====')
        features.add(dfTrn)
        features.add(dfTest)

        #---------Data slicing/parsing--------------------------#
        #Split data for CV
        if settings['generate_cv_score'] == 'y':
            log.info('=====SPLITTING DATA FOR CROSS-VALIDATION====')
            if settings['cv_method'] == 'april':
                dfTrnCV, dfTestCV = munge.temporal_split(dfTrn, (2013, 4, 1))
            elif settings['cv_method'] == 'march':
                #take an additional week from February because of the lack of remote_api source data in March
                dfTrnCV, dfTestCV = munge.temporal_split(dfTrn, (2013, 2, 21))
            elif settings['cv_method'] == 'list_split':
                #load stored list of data points and use those for CV
                dfCVlist = pd.DataFrame({'id': data_io.load_cached_object('Cache/cv_issue_ids.pkl'), 'dummy': 0})
                dfTrnCV, dfTestCV = munge.list_split(dfTrn, dfCVlist)

    #--------------Modeling-------------------------#
    #If cached models exist, load them for reuse into segment_models.  Then run through model_settings and,
    # for each model where 'use_cached_model' is 'n', replace the cached model with a freshly initialized one
    log.info('=========LOADING CACHED MODELS==============')
    segment_models = data_io.load_cached_object('segment_models')
    if segment_models is None:
        log.info('=========CACHED MODELS NOT LOADED===========')
        for model in model_settings:
            model_settings[model]['use_cached_model'] = 'n'
        segment_models = []
    #Initialize new model for models not set to use cache
    log.info('=======INITIALIZING UN-CACHED MODELS========')
    index = 0
    for model in model_settings:
        if model_settings[model]['use_cached_model'] == 'n':
            new_model = ensembles.Model(model_name=model,target=model_settings[model]['target'],
                                        segment=model_settings[model]['segment'],
                                        estimator_class=model_settings[model]['estimator_class'],
                                        estimator_params=model_settings[model]['estimator_params'],
                                        features=model_settings[model]['features'],
                                        postprocess_scalar=model_settings[model]['postprocess_scalar'])
            #Flag the model as not cached, so that it does not get skipped when running the modeling process
            new_model.use_cached_model='n'
            #Project specific model attributes not part of base class
            new_model.KNN_neighborhood_threshold=model_settings[model]['KNN_neighborhood_threshold']
            new_model.sub_zip_neighborhood=model_settings[model]['sub_zip_neighborhood']
            #Assign by index when overwriting a cached model, append when the list is fresh
            if index < len(segment_models):
                segment_models[index] = new_model
            else:
                segment_models.append(new_model)
            log.info('Model %s initialized at index %i' % (model, index))
        index += 1

    #Cross validate all segment models (optional)
    if settings['export_cv_predictions_all_models'] == 'y' or settings['export_cv_predictions_new_models'] == 'y':
        log.info('============CROSS VALIDATION================')
        for model in segment_models[:]:
            #If model has cached CV predictions then skip predicting and just export them (if selected in settings)
            if hasattr(model,'dfCVPredictions'):
                log.info('Cached CV predictions found.  Using cached CV predictions.')
                if settings['export_cv_predictions_all_models'] == 'y':
                    data_io.save_predictions(model.dfCVPredictions,model.target,model_name=model.model_name,
                                             directory=settings['dir_submissions'],
                                             estimator_class=model.estimator_class, note='CV_list')
            else:
                print_model_header(model)
                #Prepare segment model:  segment and create feature vectors for the CV data set
                dfTrn_Segment, dfTest_Segment = prepare_segment_model(dfTrnCV,dfTestCV,model)
                #Generate CV predictions
                train.cross_validate(model, settings, dfTrn_Segment, dfTest_Segment)
                #Cache the CV predictions as a dataframe stored in each segment model
                model.dfCVPredictions = dfTest_Segment.ix[:,['id',model.target]]
                if settings['export_cv_predictions_new_models'] == 'y':
                    data_io.save_predictions(model.dfCVPredictions,model.target,model_name=model.model_name,
                                             directory=settings['dir_submissions'],
                                             estimator_class=model.estimator_class, note='CV_list')

    #Generate predictions on test set for all segment models (optional)
    if settings['export_predictions_all_models'] == 'y' or settings['export_predictions_new_models'] == 'y'\
        or settings['export_predictions_total'] == 'y':
        log.info('=======GENERATING TEST PREDICTIONS==========')
        for model in segment_models[:]:
            #If model has cached test predictions then skip predicting and just export them (if selected in settings)
            if hasattr(model,'dfPredictions'):
                log.info('Cached test predictions found for model %s.  Using cached predictions.' % model.model_name)
                if settings['export_predictions_all_models'] == 'y':
                    data_io.save_predictions(model.dfPredictions,model.target,model_name=model.model_name,
                             directory=settings['dir_submissions'],
                             estimator_class=model.estimator_class,note='TESTset')
            else:
                print_model_header(model)
                #Prepare segment model:  segment and create feature vectors for the full TEST data set
                dfTrn_Segment, dfTest_Segment = prepare_segment_model(dfTrn,dfTest,model)
                #Generate TEST set predictions
                model.predict(dfTrn_Segment, dfTest_Segment)
                if settings['export_predictions_all_models'] == 'y' or settings['export_predictions_new_models'] == 'y':
                    data_io.save_predictions(model.dfPredictions,model.target,model_name=model.model_name,
                                             directory=settings['dir_submissions'],
                                             estimator_class=model.estimator_class,note='TESTset')
                log.info(utils.line_break())

    #Cache the trained models and predictions to file (optional)
    if settings['export_cached_models'] == 'y':
        log.info('==========EXPORTING CACHED MODELS===========')
        data_io.save_cached_object(segment_models,'segment_models')

    #Merge each segment model's CV predictions into a master dataframe and export it (optional)----#
    if settings['export_cv_predictions_total'] == 'y':
        log.info('====MERGING CV PREDICTIONS FROM SEGMENTS====')
        dfTestPredictionsTotal = merge_segment_predictions(segment_models, dfTestCV, cv=True)
        #---Apply post process rules to master dataframe---#
        #Set all votes and comments for remote_api segment to 1 and 0
        dfTestPredictionsTotal = dfTestPredictionsTotal.merge(dfTest[['source','id']], on='id', how='left')
        #use .loc to avoid chained-assignment writes that pandas may silently drop
        mask = dfTestPredictionsTotal['source'] == 'remote_api_created'
        dfTestPredictionsTotal.loc[mask, 'num_votes'] = 1
        dfTestPredictionsTotal.loc[mask, 'num_comments'] = 0
        #Export
        timestamp = datetime.now().strftime('%m-%d-%y_%H%M')
        filename = 'Submits/'+timestamp+'--bryan_CV_predictions.csv'
        dfTestPredictionsTotal.to_csv(filename)


    #Merge each segment model's TEST predictions into a master dataframe and export it (optional)----#
    if settings['export_predictions_total'] == 'y':
        log.info('===MERGING TEST PREDICTIONS FROM SEGMENTS===')
        dfTestPredictionsTotal = merge_segment_predictions(segment_models, dfTest)
        #---Apply post process rules to master dataframe---#
        #Set all votes and comments for remote_api segment to 1 and 0
        dfTestPredictionsTotal = dfTestPredictionsTotal.merge(dfTest[['source','id']], on='id', how='left')
        mask = dfTestPredictionsTotal['source'] == 'remote_api_created'
        dfTestPredictionsTotal.loc[mask, 'num_votes'] = 1
        dfTestPredictionsTotal.loc[mask, 'num_comments'] = 0
        del dfTestPredictionsTotal['source']
        #Export
        filename = 'bryan_test_predictions.csv'
        data_io.save_combined_predictions(dfTestPredictionsTotal, settings['dir_submissions'], filename)

    #End main
    log.info('********Program ran successfully. Exiting********')
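
A sketch of what a load_settings helper like the one called at the top of main() might look like (utils.load_settings is project code; the SETTINGS.json layout assumed here, with model settings under their own key, is an assumption):

import json

def load_settings(path='SETTINGS.json'):
    with open(path) as f:
        settings = json.load(f)
    return settings, settings.get('model_settings', {})
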
Example #7
def cross_validate_pipeline(X, y):
    """Trains and validates a model"""
    train.cross_validate(X, y)
Example #8
clf = ensemble.AdaBoostRegressor(base_estimator=tree.DecisionTreeRegressor(max_depth=3),
                                 n_estimators=150,
                                 learning_rate=.5,
                                 loss='linear',
                                 random_state=None)
clf_name='AdaBoost_150'
#clf = gaussian_process.GaussianProcess(corr='cubic', theta0=1e-2, thetaL=1e-4, thetaU=1e-1, random_start=100)
#clf = neighbors.KNeighborsRegressor(100, weights='uniform', algorithm='auto'); clf_name='KNN_100'

################################################################################################
#---Different methods of cross validation---#
#May require mtxTrn.toarray()
cv_preds = train.cross_validate(hstack(
    [sparse.csr_matrix(dfTrn.urlid.values).transpose(), mtxTrn]),
                                mtxTrnTarget.ravel(),
                                folds=10,
                                SEED=42,
                                test_size=.1,
                                clf=clf,
                                clf_name=clf_name,
                                pred_fg=True)
train.cross_validate(mtxTrn,
                     mtxTrnTarget.ravel(),
                     folds=8,
                     SEED=888,
                     test_size=.1,
                     clf=clf,
                     clf_name=clf_name,
                     pred_fg=False)
train.cross_validate_temporal(mtxTrn,
                              mtxTest,
                              mtxTrnTarget.ravel(),