def test_model(model, league, params, csv_test_file_path, num_of_last_games_list, cv=5):
    """Grid-search ``model`` over ``params`` and score the fitted search on test data.

    A ``SoccerPredictModel`` wrapper (built around the module-level
    ``fake_model``) is used only as the source of training data and parsed
    data; the estimator that is actually tuned is ``model``.

    Args:
        model: Estimator handed to ``GridSearchCV`` as ``estimator``.
        league: League identifier forwarded to ``SoccerPredictModel``.
        params: Parameter grid handed to ``GridSearchCV`` as ``param_grid``.
        csv_test_file_path: Path forwarded to ``get_test_data``.
        num_of_last_games_list: Iterable of values; each one is forwarded to
            ``get_test_data`` and yields one scored entry.
        cv: Cross-validation setting forwarded to ``GridSearchCV``.

    Returns:
        A tuple ``(result, best_estimator)`` where ``result`` is a list of
        ``[num, score]`` pairs (one per entry of ``num_of_last_games_list``)
        and ``best_estimator`` is ``grid_search.best_estimator_``.
    """
    soccer_model = SoccerPredictModel(fake_model, redwoodParser, data_manger,
                                      league, data_predict_org)
    searcher = GridSearchCV(estimator=model, param_grid=params, n_jobs=-1, cv=cv)

    # Fetch the training vectors from the wrapper and fit the search on them.
    features, labels = get_train_data(soccer_model)
    print('train the model : ')
    searcher.fit(numpy.array(features), numpy.array(labels))

    print('grid_search : {}'.format(searcher))
    print('grid_search, best_estimator_ : {}'.format(searcher.best_estimator_))
    print('grid_search, best_params_ : {}'.format(searcher.best_params_))
    print('grid_search, best_score_ : {}'.format(searcher.best_score_))

    scored = []
    for window in num_of_last_games_list:
        # Build the test vectors for this window size.
        test_features, test_labels = get_test_data(
            csv_test_file_path, soccer_model.get_data_after_parse(), window)
        # Score the fitted search on them.
        window_score = searcher.score(numpy.array(test_features),
                                      numpy.array(test_labels))
        print('score for num of last games: {} score : {}'.format(window, window_score))
        scored.append([window, window_score])

    print('grid_search.cv_results_ : {}'.format(searcher.cv_results_))
    return scored, searcher.best_estimator_
def get(self, league, home_team, away_team):
    """Load the stored model for ``league`` and predict one fixture.

    Args:
        league: One of 'England', 'Spain', 'Italy' or 'Germany'; any other
            value is rejected.
        home_team: Home team name; passed through ``mapper.map`` before
            prediction.
        away_team: Away team name; passed through ``mapper.map`` before
            prediction.

    Returns:
        ``(prediction, 200)`` on success, or
        ``("league : <league> is not supported", 404)`` for an unknown league.
    """
    print('get : {}/{}/{}'.format(league, home_team, away_team))

    # Reject unknown leagues before touching the model store.
    if league not in ('England', 'Spain', 'Italy', 'Germany'):
        return "league : {} is not supported".format(league), 404

    model_path = 'modelForProd/' + league.lower() + '_sp_model.joblib'
    print('load {}'.format(model_path))
    stored_model = load(model_path)

    connector = mongo_API()
    manager = FileDataManger(connector)
    parser = RedWoodParser()
    organizer = Data_Predict_Organizer()
    predictor = SoccerPredictModel(stored_model, parser, manager, league, organizer)

    home = mapper.map(home_team)
    print(home)
    away = mapper.map(away_team)
    print(away)

    prediction = predictor.predict(home, away, 6)
    print(prediction)
    return prediction, 200
def test_basic_model_english_random_forest(self):
    """Train a default ``RandomForestClassifier`` for 'England' and run ``test`` on it.

    The trained wrapper is passed to ``test`` together with the value 10 and
    ``self.english_csv_file``.
    """
    manager = DataManger(mongo_API())
    parser = RedWoodParser()
    classifier = RandomForestClassifier()
    organizer = Data_Predict_Organizer()

    predictor = SoccerPredictModel(classifier, parser, manager, 'England', organizer)
    predictor.train(7, 0.1)

    test(predictor, 10, self.english_csv_file)
def test_basic_model_italy(self):
    """Train a linear-booster ``XGBClassifier`` for 'Italy' and run ``test`` on it.

    The trained wrapper is passed to ``test`` together with the value 7 and
    ``self.italy_csv_file``.
    """
    manager = DataManger(mongo_API())
    parser = RedWoodParser()
    classifier = XGBClassifier(max_depth=3, booster='gblinear')
    organizer = Data_Predict_Organizer()

    italy_predictor = SoccerPredictModel(classifier, parser, manager, 'Italy', organizer)
    italy_predictor.train(7, 0.1)

    test(italy_predictor, 7, self.italy_csv_file)
def test_basic_model_english_KNN(self):
    """Train a 3-neighbour ``KNeighborsClassifier`` for 'England' and run ``test`` on it.

    The trained wrapper is passed to ``test`` together with the value 6 and
    ``self.english_csv_file``.
    """
    manager = DataManger(mongo_API())
    parser = RedWoodParser()
    classifier = KNeighborsClassifier(n_neighbors=3)
    organizer = Data_Predict_Organizer()

    predictor = SoccerPredictModel(classifier, parser, manager, 'England', organizer)
    predictor.train(7, 0.1)

    test(predictor, 6, self.english_csv_file)
def test_basic_model_english_xg(self):
    """Train a linear-booster ``XGBClassifier`` for 'England' with an ignore list set.

    The parser is given an ignore list of six column names before the model
    wrapper is built. The trained wrapper is passed to ``test`` together with
    the value 6 and ``self.english_csv_file``.
    """
    manager = DataManger(mongo_API())

    parser = RedWoodParser()
    parser.set_ignore_list([
        'Date',
        'TournamentName',
        'SeasonName',
        'RoundId',
        'HomeTeamName',
        'AwayTeamName',
    ])

    classifier = XGBClassifier(max_depth=5, booster='gblinear')
    organizer = Data_Predict_Organizer()

    predictor = SoccerPredictModel(classifier, parser, manager, 'England', organizer)
    predictor.train(7, 0.1)

    test(predictor, 6, self.english_csv_file)
def search_cat_boost(league, test_data):
    """Exhaustively search CatBoost hyper-parameters and persist each new best model.

    Every combination of the ``iterations``, ``learning_rate``,
    ``depth_cat_boost`` and ``boosting_type_cat_boost`` entries of
    ``parameters_cat_boost`` is trained once; the trained model is then
    evaluated against ``test_data`` for every value in ``num_of_last_games``.
    Whenever an evaluation strictly beats the best accuracy seen so far, the
    underlying model is dumped to ``joblib_save_path``.

    Args:
        league: League identifier forwarded to ``SoccerPredictModel`` and used
            in the saved file name.
        test_data: Sized iterable of games, each indexable by ``home_team``,
            ``away_team`` and ``real_winner``.

    Returns:
        ``(results, final_most_acc)``: ``results`` holds one
        ``[iterations, learning_rate, depth, boosting_type, n_games, accuracy]``
        row per evaluation; ``final_most_acc`` is the comma-joined row of the
        best evaluation ('' when nothing scored above 0).

    Raises:
        ZeroDivisionError: If ``test_data`` is empty.
    """
    # Local import so this block does not depend on the file's import header.
    from itertools import product

    results = []
    current_max_accurate = 0
    final_most_acc = ''

    # product() yields combinations in the same order as the equivalent
    # nested loops (right-most factor varies fastest).
    grid = product(parameters_cat_boost[iterations],
                   parameters_cat_boost[learning_rate],
                   parameters_cat_boost[depth_cat_boost],
                   parameters_cat_boost[boosting_type_cat_boost])

    # `n_iter` rather than `iter`, so the builtin is not shadowed.
    for n_iter, l_rate, dep, b_type in grid:
        model = CatBoostClassifier(iterations=n_iter,
                                   learning_rate=l_rate,
                                   depth=dep,
                                   boosting_type=b_type,
                                   loss_function='MultiClass',
                                   silent=True)
        sp_model_cat_boost = SoccerPredictModel(model, redwoodParser,
                                                data_manger, league,
                                                data_predict_org)
        sp_model_cat_boost.train(7, 0.1)

        for n_games in num_of_last_games:
            accurate_result = 0
            for game in test_data:
                res__proba = sp_model_cat_boost.predict(
                    game[home_team], game[away_team], n_games)
                res = get_winner_from_prob(res__proba)
                if res == game[real_winner]:
                    accurate_result += 1
            accurate_percentage = accurate_result / len(test_data)

            results.append(
                [n_iter, l_rate, dep, b_type, n_games, accurate_percentage])
            # The fourth field is the boosting type; it was previously
            # mislabelled 'obj' in this log line.
            print(
                'iter : {}, l_rate : {}, dep : {}, b_type : {}, n_games : {}, acc : {} '
                .format(n_iter, l_rate, dep, b_type, n_games,
                        accurate_percentage))

            if accurate_percentage > current_max_accurate:
                current_max_accurate = accurate_percentage
                final_most_acc = '{},{},{},{},{},{}'.format(
                    n_iter, l_rate, dep, b_type, n_games, accurate_percentage)
                print('save model : {}'.format(final_most_acc))
                # The accuracy is embedded in the file name with ',' in place
                # of '.'.
                dump(
                    sp_model_cat_boost.get_model(),
                    joblib_save_path.format(
                        league, 'CatBoostClassifier',
                        str(accurate_percentage).replace('.', ',')))

    return results, final_most_acc
def search_lgbm(league, test_data):
    """Exhaustively search LightGBM hyper-parameters and persist each new best model.

    Every combination of the ``boosting_type``, ``learning_rate``,
    ``n_estimators`` and ``objective_lgbm`` entries of ``parameters_LGBM`` is
    trained once and then evaluated against ``test_data`` for every value in
    ``num_of_last_games``. A model is dumped to ``joblib_save_path`` each time
    an evaluation strictly beats the best accuracy seen so far.

    Args:
        league: League identifier forwarded to ``SoccerPredictModel`` and used
            in the saved file name.
        test_data: Sized iterable of games, each indexable by ``home_team``,
            ``away_team`` and ``real_winner``.

    Returns:
        ``(results, final_most_acc)``: one
        ``[boosting_type, learning_rate, n_estimators, objective, n_games, accuracy]``
        row per evaluation, plus the comma-joined row of the best evaluation
        ('' when nothing scored above 0).
    """
    rows = []
    best_accuracy = 0
    best_summary = ''

    for boosting in parameters_LGBM[boosting_type]:
        for rate in parameters_LGBM[learning_rate]:
            for trees in parameters_LGBM[n_estimators]:
                for target in parameters_LGBM[objective_lgbm]:
                    classifier = LGBMClassifier(boosting_type=boosting,
                                                learning_rate=rate,
                                                n_estimators=trees,
                                                objective=target)
                    predictor = SoccerPredictModel(classifier, redwoodParser,
                                                   data_manger, league,
                                                   data_predict_org)
                    predictor.train(7, 0.1)

                    for window in num_of_last_games:
                        # Count the games whose predicted winner matches the
                        # recorded one.
                        hits = sum(
                            1 for game in test_data
                            if get_winner_from_prob(
                                predictor.predict(game[home_team],
                                                  game[away_team],
                                                  window)) == game[real_winner])
                        accuracy = hits / len(test_data)

                        rows.append(
                            [boosting, rate, trees, target, window, accuracy])
                        print(
                            'b_type : {}, l_rate : {}, n_est : {}, obj : {}, n_games : {}, acc : {} '
                            .format(boosting, rate, trees, target, window,
                                    accuracy))

                        if accuracy > best_accuracy:
                            best_accuracy = accuracy
                            best_summary = '{},{},{},{},{},{}'.format(
                                boosting, rate, trees, target, window, accuracy)
                            print('save model : {}'.format(best_summary))
                            dump(
                                predictor.get_model(),
                                joblib_save_path.format(
                                    league, 'LGBMClassifier',
                                    str(accuracy).replace('.', ',')))

    return rows, best_summary
def search_RF(league, test_data):
    """Exhaustively search random-forest hyper-parameters and persist each new best model.

    Every combination of the ``n_estimators_RF``, ``criterion``, ``bootstrap``
    and ``warm_start`` entries of ``parameters_random_forest`` is trained once
    and then evaluated against ``test_data`` for every value in
    ``num_of_last_games``. A model is dumped to ``joblib_save_path`` each time
    an evaluation strictly beats the best accuracy seen so far.

    Args:
        league: League identifier forwarded to ``SoccerPredictModel`` and used
            in the saved file name.
        test_data: Sized iterable of games, each indexable by ``home_team``,
            ``away_team`` and ``real_winner``.

    Returns:
        ``(results, final_most_acc)``: one
        ``[n_estimators, criterion, bootstrap, warm_start, n_games, accuracy]``
        row per evaluation, plus the comma-joined row of the best evaluation
        ('' when nothing scored above 0).
    """
    rows = []
    best_accuracy = 0
    best_summary = ''

    for trees in parameters_random_forest[n_estimators_RF]:
        for split_rule in parameters_random_forest[criterion]:
            for use_bootstrap in parameters_random_forest[bootstrap]:
                for warm in parameters_random_forest[warm_start]:
                    classifier = RandomForestClassifier(
                        n_estimators=trees,
                        criterion=split_rule,
                        bootstrap=use_bootstrap,
                        warm_start=warm)
                    predictor = SoccerPredictModel(classifier, redwoodParser,
                                                   data_manger, league,
                                                   data_predict_org)
                    predictor.train(7, 0.1)

                    for window in num_of_last_games:
                        # Count the games whose predicted winner matches the
                        # recorded one.
                        hits = sum(
                            1 for game in test_data
                            if get_winner_from_prob(
                                predictor.predict(game[home_team],
                                                  game[away_team],
                                                  window)) == game[real_winner])
                        accuracy = hits / len(test_data)

                        rows.append([
                            trees, split_rule, use_bootstrap, warm, window,
                            accuracy
                        ])
                        print(
                            'n_est : {}, crit : {}, bootstrap_param : {}, warm_s : {}, n_games : {}, acc : {} '
                            .format(trees, split_rule, use_bootstrap, warm,
                                    window, accuracy))

                        if accuracy > best_accuracy:
                            best_accuracy = accuracy
                            best_summary = '{},{},{},{},{},{}'.format(
                                trees, split_rule, use_bootstrap, warm, window,
                                accuracy)
                            print('save model : {}'.format(best_summary))
                            dump(
                                predictor.get_model(),
                                joblib_save_path.format(
                                    league, 'RandomForestClassifier',
                                    str(accuracy).replace('.', ',')))

    return rows, best_summary
def search_KNN(league, test_data):
    """Exhaustively search k-nearest-neighbours hyper-parameters and persist each new best model.

    Every combination of the ``n_neighbors``, ``weights``, ``algorithm`` and
    ``p`` entries of ``parameters_KNN`` is trained once and then evaluated
    against ``test_data`` for every value in ``num_of_last_games``. A model is
    dumped to ``joblib_save_path`` each time an evaluation strictly beats the
    best accuracy seen so far.

    Args:
        league: League identifier forwarded to ``SoccerPredictModel`` and used
            in the saved file name.
        test_data: Sized iterable of games, each indexable by ``home_team``,
            ``away_team`` and ``real_winner``.

    Returns:
        ``(results, final_most_acc)``: one
        ``[n_neighbors, weights, algorithm, p, n_games, accuracy]`` row per
        evaluation, plus the comma-joined row of the best evaluation
        ('' when nothing scored above 0).
    """
    rows = []
    best_accuracy = 0
    best_summary = ''

    for k in parameters_KNN[n_neighbors]:
        for weighting in parameters_KNN[weights]:
            for search_algo in parameters_KNN[algorithm]:
                for minkowski_p in parameters_KNN[p]:
                    classifier = KNeighborsClassifier(n_neighbors=k,
                                                      weights=weighting,
                                                      algorithm=search_algo,
                                                      p=minkowski_p)
                    predictor = SoccerPredictModel(classifier, redwoodParser,
                                                   data_manger, league,
                                                   data_predict_org)
                    predictor.train(7, 0.1)

                    for window in num_of_last_games:
                        # Count the games whose predicted winner matches the
                        # recorded one.
                        hits = sum(
                            1 for game in test_data
                            if get_winner_from_prob(
                                predictor.predict(game[home_team],
                                                  game[away_team],
                                                  window)) == game[real_winner])
                        accuracy = hits / len(test_data)

                        rows.append([
                            k, weighting, search_algo, minkowski_p, window,
                            accuracy
                        ])
                        print(
                            'n_neighbor : {}, weight : {}, algo : {}, power : {}, n_games : {}, acc : {} '
                            .format(k, weighting, search_algo, minkowski_p,
                                    window, accuracy))

                        if accuracy > best_accuracy:
                            best_accuracy = accuracy
                            best_summary = '{},{},{},{},{},{}'.format(
                                k, weighting, search_algo, minkowski_p, window,
                                accuracy)
                            print('save model : {}'.format(best_summary))
                            dump(
                                predictor.get_model(),
                                joblib_save_path.format(
                                    league, 'KNeighborsClassifier',
                                    str(accuracy).replace('.', ',')))

    return rows, best_summary
def search_XGBClassifier(league, test_data, silent_param=True):
    """Exhaustively search XGBoost hyper-parameters and persist each new best model.

    Every combination of the ``n_estimators``, ``max_depth``, ``booster``,
    ``objective``, ``learning_rate`` and ``base_score`` entries of
    ``parameters_XGB`` is trained once and then evaluated against
    ``test_data`` for every value in ``num_of_last_games``. A model is dumped
    to ``joblib_save_path`` each time an evaluation strictly beats the best
    accuracy seen so far.

    Args:
        league: League identifier forwarded to ``SoccerPredictModel`` and used
            in the saved file name.
        test_data: Sized iterable of games, each indexable by ``home_team``,
            ``away_team`` and ``real_winner``.
        silent_param: Forwarded to ``XGBClassifier`` as ``silent``.

    Returns:
        ``(results, final_most_acc)``: one
        ``[n_estimators, max_depth, booster, objective, learning_rate,
        base_score, n_games, accuracy]`` row per evaluation, plus the
        comma-joined row of the best evaluation ('' when nothing scored
        above 0).

    Raises:
        ZeroDivisionError: If ``test_data`` is empty.
    """
    # Local import so this block does not depend on the file's import header.
    from itertools import product

    results = []
    current_max_accurate = 0
    final_most_acc = ''

    # product() yields combinations in the same order as the equivalent
    # nested loops (right-most factor varies fastest).
    grid = product(parameters_XGB[n_estimators],
                   parameters_XGB[max_depth],
                   parameters_XGB[booster],
                   parameters_XGB[objective],
                   parameters_XGB[learning_rate],
                   parameters_XGB[base_score])

    for estimator, m_depth, boost, obj, l_rate, b_score in grid:
        # The tree count must be passed as `n_estimators`; it was previously
        # passed as `estimator=`, which is not the constructor's parameter
        # for the number of boosting rounds, so the searched value was never
        # applied.
        model = XGBClassifier(max_depth=m_depth,
                              n_estimators=estimator,
                              booster=boost,
                              objective=obj,
                              learning_rate=l_rate,
                              silent=silent_param,
                              base_score=b_score)
        sp_model = SoccerPredictModel(model, redwoodParser, data_manger,
                                      league, data_predict_org)
        sp_model.train(7, 0.1)

        for n_games in num_of_last_games:
            accurate_result = 0
            for game in test_data:
                res__proba = sp_model.predict(game[home_team],
                                              game[away_team], n_games)
                res = get_winner_from_prob(res__proba)
                if res == -1:
                    print('error -1')
                if res == game[real_winner]:
                    accurate_result += 1
            accurate_percentage = accurate_result / len(test_data)

            results.append([
                estimator, m_depth, boost, obj, l_rate, b_score, n_games,
                accurate_percentage
            ])
            print(
                'estimator : {}, m_depth : {}, boost : {}, obj : {}, l_rate : {}, b_score : {}, n_games : {}, acc : {} '
                .format(estimator, m_depth, boost, obj, l_rate, b_score,
                        n_games, accurate_percentage))

            if accurate_percentage > current_max_accurate:
                current_max_accurate = accurate_percentage
                final_most_acc = '{},{},{},{},{},{},{},{} '.format(
                    estimator, m_depth, boost, obj, l_rate, b_score, n_games,
                    accurate_percentage)
                print('save model : {}'.format(final_most_acc))
                # The accuracy is embedded in the file name with ',' in place
                # of '.'.
                dump(
                    sp_model.get_model(),
                    joblib_save_path.format(
                        league, 'XGBClassifier',
                        str(accurate_percentage).replace('.', ',')))

    return results, final_most_acc
# Optional classifier backends, currently disabled:
# from lightgbm import LGBMClassifier
# from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Shared collaborators used to build the England model.
db_connector = mongo_API()
data_manger = DataManger(db_connector)
redwoodParser = RedWoodParser()

# Classifier under evaluation. Alternatives that were tried here:
# RandomForestClassifier(), CatBoostClassifier(), LGBMClassifier(),
# KNeighborsClassifier().
model = XGBClassifier(max_depth=3, booster='gblinear')

data_predict_org = Data_Predict_Organizer()
sp_model_english = SoccerPredictModel(model, redwoodParser, data_manger,
                                      'England', data_predict_org)
sp_model_english.train(7, 0.1)

number_of_last_game = 6

# One probability prediction per (home, away) fixture, evaluated in the
# listed order and bound to the matching result name.
(
    res_eng_12_4_19_1,
    res_eng_13_4_19_1,
    res_eng_13_4_19_2,
    res_eng_13_4_19_3,
    res_eng_13_4_19_4,
) = (
    sp_model_english.predict_proba(home, away, number_of_last_game)
    for home, away in (
        ('Leicester', 'Newcastle'),
        ('Tottenham', 'Huddersfield'),
        ('Burnley', 'Cardiff'),
        ('Brighton', 'Bournemouth'),
        ('Fulham', 'Everton'),
    )
)
def test_grid_search_english_model_xgboost(self):
    """Brute-force an ``XGBClassifier`` parameter grid for 'England' and save the results.

    For every parameter combination a model is trained once and ``test`` is
    called for each value in ``range(0, 15)``; one result dict is collected
    per call. After the sweep the collected dicts are written to
    ``self.english_xg_result`` as CSV (a header row followed by one row per
    result).

    NOTE(review): the grid below spans on the order of 10**12 combinations,
    and every result is kept in memory until the end, so this cannot finish
    as written. Narrow the ranges before running it.
    """
    # Local imports so this block does not depend on the file's import header.
    import csv
    from itertools import product

    # general things:
    db_connector = mongo_API()
    data_manger = DataManger(db_connector)
    redwoodParser = RedWoodParser()
    data_predict_org = Data_Predict_Organizer()

    # model params
    max_depths = range(1, 22, 1)
    learning_rates = [x / 20 for x in range(1, 21)]
    n_estimators = range(50, 400, 50)
    # "binary:logistic" was listed twice, which only repeated identical runs;
    # each objective now appears once.
    objectives = [
        "binary:logistic", "reg:linear", "reg:logistic", "binary:logitraw",
        "count:poisson", "multi:softmax", "multi:softprob", "rank:pairwise"
    ]
    boosters = ['gbtree', 'gblinear']
    gammas = [x / 20 for x in range(0, 105, 5)]
    min_child_weights = [x / 2 for x in range(0, 21)]
    max_delta_steps = range(0, 10, 1)
    subsamples = [x / 20 for x in range(0, 21)]
    colsample_bytrees = [x / 20 for x in range(0, 21)]
    # reg_alpha = 0
    # reg_lambda = 1
    base_score = [x / 20 for x in range(1, 21)]
    num_of_games = range(0, 15)

    test_results = []

    # product() is lazy and yields combinations in the same order as the
    # equivalent nested loops (right-most factor varies fastest).
    grid = product(max_depths, learning_rates, n_estimators, objectives,
                   boosters, gammas, min_child_weights, max_delta_steps,
                   subsamples, colsample_bytrees, base_score)

    for (md, lr, ne, obj, booster, g, min_child_w, max_delta_step,
         sub_sample, cb, bs) in grid:
        model = XGBClassifier(max_depth=md,
                              learning_rate=lr,
                              n_estimators=ne,
                              objective=obj,
                              booster=booster,
                              gamma=g,
                              min_child_weight=min_child_w,
                              max_delta_step=max_delta_step,
                              subsample=sub_sample,
                              colsample_bytree=cb,
                              base_score=bs)
        sp_model = SoccerPredictModel(model, redwoodParser, data_manger,
                                      'England', data_predict_org)
        sp_model.train(7, 0.1)

        for num in num_of_games:
            res = test(sp_model, num, self.english_csv_file)
            test_results.append({
                'max_depth': md,
                'learning_rate': lr,
                'n_estimators': ne,
                'objective': obj,
                'booster': booster,
                'gamma': g,
                'min_child_weight': min_child_w,
                'max_delta_step': max_delta_step,
                'subsample': sub_sample,
                'colsample_bytree': cb,
                'base_score': bs,
                'res': res
            })

    print(test_results)

    # The file must be opened for writing (it was opened read-only before).
    # csv.DictWriter stringifies non-string values and terminates each row,
    # which the previous manual `v + ','` writes did not do.
    with open(self.english_xg_result, 'w', newline='') as res_file:
        if test_results:
            writer = csv.DictWriter(res_file,
                                    fieldnames=list(test_results[0]))
            writer.writeheader()
            writer.writerows(test_results)