def predict(self, league, market):
    """Predict the unplayed games of ``league`` for one betting ``market``.

    The persisted classifier ``tools/league_models/<league>_<market>`` is
    loaded and applied to the feature columns supplied by
    ``self.unplayed_df``.  Numeric class labels are then translated into
    their readable form and team names are mapped back through the inverse
    of the league's ``team_mapping`` config.

    :param league: league identifier used to locate the model and the team
        mapping.
    :param market: market to predict; ``"wdw"``, ``"ou25"`` and ``"dc"``
        get their labels translated, any other value is left numeric.
    :return: DataFrame with the columns ``date``, ``time``, ``home``,
        ``away``, ``<market>`` and ``league``; ``None`` when
        ``self.unplayed_df`` returns ``None`` or when any error occurs
        (the error is logged, not raised).
    """
    self.log.info("Predicting {} {}".format(league, market))
    unplayed_data_columns = self.unplayed_df(league=league, market=market)
    if unplayed_data_columns is None:
        return None

    # Work on a private copy: the frame handed over may be a filtered slice
    # of a larger frame, and writing into such a slice is unreliable.
    unplayed_data = unplayed_data_columns[0].copy()
    columns = unplayed_data_columns[1]
    features = unplayed_data[columns]
    try:
        clf = joblib.load(
            get_analysis_root_path('tools/league_models/{}_{}'.format(
                league, market)))
        unplayed_data[market] = clf.predict(features)

        # Label translation is assigned back explicitly.  An in-place
        # ``replace`` on a selected column may operate on a temporary copy
        # and silently leave the frame untouched.
        if market == "wdw":
            outcome_probs = clf.predict_proba(features)
            # NOTE(review): the probability columns are not part of the
            # returned frame (see the column selection below) - confirm
            # whether they should be.
            unplayed_data["d_prob"] = outcome_probs[:, 0]
            unplayed_data["a_prob"] = outcome_probs[:, 1]
            unplayed_data["h_prob"] = outcome_probs[:, 2]
            unplayed_data[market] = unplayed_data[market].replace(
                self.result_map)
        elif market == "ou25":
            unplayed_data[market] = unplayed_data[market].replace(
                self.inverse_ou_class)
        elif market == "dc":
            unplayed_data[market] = unplayed_data[market].replace({
                0: '1X',
                1: '12'
            })
        unplayed_data["league"] = league

        prediction = unplayed_data[[
            "Date", "Time", "HomeTeam", "AwayTeam", market, "league"
        ]].rename(index=str,
                  columns={
                      "Date": "date",
                      "Time": "time",
                      "HomeTeam": "home",
                      "AwayTeam": "away"
                  })

        team_mapping = get_config(file="team_mapping/{}".format(league))
        team_mapping_inv = {v: k for k, v in team_mapping.items()}
        prediction["home"] = prediction["home"].replace(team_mapping_inv)
        prediction["away"] = prediction["away"].replace(team_mapping_inv)
        return prediction
    except Exception as e:
        # Best effort: a failure for one league/market must not abort the
        # caller's loop over the remaining ones.
        self.log.error(msg="The following error occurred: {}".format(e))
        return None
def unplayed_games(league: str, market: str):
    """Return a league's unplayed games together with the model columns.

    :param league: league identifier; selects both the column config and the
        cleaned team-trend CSV.
    :param market: market name; selects the ``<market>_columns`` config.
    :return: tuple ``(unplayed_rows, columns)`` where ``unplayed_rows`` are
        the CSV rows whose ``played`` value is 0, or ``None`` when there is
        no such row.
    """
    model_columns = get_config(file="{}_columns/{}".format(market, league))
    csv_path = get_analysis_root_path(
        'tools/data/clean_data/team_trend/{}.csv'.format(league))
    league_frame = pd.read_csv(csv_path, index_col=False)

    pending = league_frame[league_frame["played"] == 0]
    if len(pending) == 0:
        return None
    return pending, model_columns
def fetch_all_league_fixtures(self, league):
    """Load a league's fixtures limited to the configured date window.

    The fixture CSV is read (``Date``, ``Time``, ``HomeTeam`` and
    ``AwayTeam`` only), passed through ``team_translation`` and then sliced
    on ``Date`` between the start and end dates returned by
    ``get_start_and_end_dates(end_days=self.days)``.

    :param league: league identifier; selects the fixture CSV.
    :return: DataFrame of the fixtures inside the date window, with ``Date``
        restored as a regular column.
    """
    self.log.info("Getting {} league fixture".format(league))

    fixtures_path = get_analysis_root_path(
        'tools/data/fixtures/all_fixtures/{}.csv'.format(league))
    fixtures = pd.read_csv(
        fixtures_path, usecols=['Date', 'Time', 'HomeTeam', 'AwayTeam'])
    fixtures = team_translation(data=fixtures, league=league)

    start_date, end_date = get_start_and_end_dates(end_days=self.days)
    # Label-based slice on the Date index; both bounds are inclusive.
    return fixtures.set_index('Date').loc[start_date:end_date].reset_index()
def save_prediction(self, league, market):
    """Predict one league/market and persist the result.

    The predictions are sorted by date, time and league, written to the
    ``wdw_football`` collection of the ``sports_prediction`` database and
    finally dumped to ``tools/data/predictions/<league>_<market>.csv``.

    For every prediction the collection is queried on
    league/home/away/time/date:

    * no matching document  -> the prediction is queued and bulk-inserted;
    * exactly one match     -> that document is updated with ``$set``;
    * more than one match   -> nothing is written for that prediction.

    Database errors are logged and swallowed so the CSV is still written.
    Nothing at all happens when ``self.predict`` returns ``None``.

    :param league: league identifier.
    :param market: market name, forwarded to ``self.predict``.
    :return: ``None``.
    """
    match_predictions = self.predict(league=league, market=market)
    if match_predictions is None:
        return

    self.log.info(
        "{} prediction dataframe sorted by date, time and league".format(
            league))
    preds = match_predictions.sort_values(['date', 'time', 'league'])

    pred_list = []
    client = None
    try:
        self.log.info("Connecting to the database")
        client = MongoClient(mongodb_uri, connectTimeoutMS=30000)
        db = client.get_database("sports_prediction")
        wdw_football = db.wdw_football

        self.log.info("Inserting predictions")
        for _, row in preds.iterrows():
            pred = dict(row)
            # Fields that identify one fixture in the collection.
            exist = {
                'league': pred.get('league'),
                'home': pred.get('home'),
                'away': pred.get('away'),
                'time': pred.get('time'),
                'date': pred.get('date')
            }
            wdw_count = wdw_football.count_documents(exist)
            if wdw_count == 0:
                pred_list.append(pred)
            elif wdw_count == 1:
                wdw_football.update_one(exist, {'$set': pred})

        if pred_list:
            wdw_football.insert_many(pred_list)
        self.log.info("Done!!!")
    except Exception as e:
        self.log.error(
            "Could not save {} {} into the database: \n{}".format(
                league, market, str(e)))
    finally:
        # Release the connection pool; previously the client was leaked on
        # every call.
        if client is not None:
            client.close()

    preds.to_csv(get_analysis_root_path(
        'tools/data/predictions/{}_{}.csv'.format(league, market)),
                 index=False)
@author: tola """ import pandas as pd from sklearn.linear_model import LogisticRegression from sklearn.externals import joblib from imblearn.over_sampling import SMOTE from utils import get_analysis_root_path, get_config from te_logger.logger import log leagues_data = get_config(file="league") leagues = list(leagues_data.keys()) for league in leagues: log.info(msg="Building model for league: {}".format(league)) lg_data_path = get_analysis_root_path( 'tools/data/clean_data/team_trend/{}.csv'.format(league)) try: games = pd.read_csv(lg_data_path) games = games.dropna(how='any') model_columns = get_config(file="wdw_columns/{}".format(league)) played_data = games.loc[ (games.Season.isin([1415, 1516, 1617, 1718, 1819])) & (games.played == 1)] target = played_data.FTR.map({"D": 0, "A": 1, "H": 2}) # Select significant columns data = played_data[model_columns] model = LogisticRegression(C=1e5)
wdw_count = wdw_football.count_documents(exist) if wdw_count == 0: pred_list.append(pred) elif wdw_count == 1: wdw_football.update_one(exist, {'$set': pred}) if len(pred_list) != 0: wdw_football.insert_many(pred_list) self.log.info("Done!!!") except Exception as e: self.log.error( "Could not save {} {} into the database: \n{}".format( league, market, str(e))) preds.to_csv(get_analysis_root_path( 'tools/data/predictions/{}_{}.csv'.format(league, market)), index=False) if __name__ == '__main__': dr = Predictors() for lg in get_config().keys(): if os.path.exists( get_analysis_root_path( 'tools/data/fixtures/selected_fixtures/{}.csv'.format( lg))): for mkt in ["wdw", "dc", "ou25"]: dr.save_prediction(league=lg, market=mkt) else: log.warning("{} has no new games".format(lg).upper())
def __init__(self):
    """Resolve the path template of the cleaned team-trend CSV files."""
    # The ``{}`` placeholder is deliberately left unformatted so the
    # attribute can later be ``.format()``-ed with a league name.
    path_template = 'tools/data/clean_data/team_trend/{}.csv'
    self.clean_team_trend_data_directory = get_analysis_root_path(
        path_template)
def store_significant_columns(self, lg="england_premiership"):
    """Build the cleaned data set of a league and store its model columns.

    Only runs when ``tools/data/fixtures/selected_fixtures/<lg>.csv``
    exists; otherwise a warning is logged and nothing else happens.

    Steps:

    1. Read the upcoming fixtures and give them placeholder results
       (``played`` = 0).
    2. Read the played games from the ``sports_prediction`` database
       (collection ``lg``), translate team names and derive ``BTTS`` (both
       sides scored), ``HSc`` / ``ASc`` (home / away side scored).
    3. Concatenate both, enrich them through
       ``self.compute_last_point_ave_goals_and_goals_conceded`` and derive
       ``UO25`` (1 when the goal total exceeds 2.5).
    4. For the ``wdw``, ``dc`` and ``ou25`` targets, keep the columns of the
       played games whose absolute correlation with the target exceeds
       0.05 and store them with ``save_league_model_attr``.  When no column
       qualifies, every column except ``Date``, ``played`` and ``Time`` is
       stored instead.
    5. Write the aggregated frame to the cleaned team-trend CSV.

    :param lg: league identifier.
    :return: ``None``.
    """
    self.log = log
    self.log.info("Processing {} data".format(lg))
    fix_path = get_analysis_root_path(
        'tools/data/fixtures/selected_fixtures/{}.csv'.format(lg))
    if os.path.exists(fix_path):
        # Upcoming fixtures: placeholder outcome values, flagged unplayed.
        fix_data = pd.read_csv(fix_path)
        fix_data.loc[:, "FTHG"] = 0
        fix_data.loc[:, "FTAG"] = 0
        fix_data.loc[:, "FTR"] = 'D'
        fix_data.loc[:, "Season"] = 1819
        fix_data.loc[:, "played"] = 0
        fix_data.loc[:, "BTTS"] = 0
        fix_data.loc[:, "HSc"] = 0
        fix_data.loc[:, "ASc"] = 0

        # Played games.  The cursor is fully read before the client is
        # closed; previously the client was never closed.
        client = MongoClient(mongodb_uri, connectTimeoutMS=30000)
        try:
            db = client.get_database("sports_prediction")
            lg_data = db[lg]
            records = list(lg_data.find({}))
        finally:
            client.close()
        data = pd.DataFrame(records,
                            columns=[
                                "Date", "HomeTeam", "AwayTeam", "FTHG",
                                "FTAG", "FTR", "Season"
                            ])
        data = team_translation(data=data, league=lg)
        data.loc[:, "played"] = 1
        data.loc[:, "FTHG"] = pd.to_numeric(data.FTHG.values)
        data.loc[:, "FTAG"] = pd.to_numeric(data.FTAG.values)
        data.loc[(data.FTHG > 0) & (data.FTAG > 0), 'BTTS'] = 1
        data.loc[(data.FTHG == 0) | (data.FTAG == 0), 'BTTS'] = 0
        data.loc[:, 'HSc'] = list(
            np.where(pd.to_numeric(data.FTHG.values) > 0, 1, 0))
        data.loc[:, 'ASc'] = list(
            np.where(pd.to_numeric(data.FTAG.values) > 0, 1, 0))

        agg_data = pd.concat([data, fix_data], ignore_index=True, sort=False)
        # NOTE(review): presumably adds the trend features (including the
        # HLM / ALM columns dropped below) - confirm against the helper.
        agg_data = self.compute_last_point_ave_goals_and_goals_conceded(
            data=agg_data, lg=lg)
        agg_data = agg_data.fillna(0)
        agg_data.loc[:, 'UO25'] = list(
            np.where((pd.to_numeric(agg_data.FTHG.values) +
                      pd.to_numeric(agg_data.FTAG.values)) > 2.5, 1, 0))

        #TODO: make sure that played_data is used to find significant columns
        played_data = agg_data[agg_data["played"] == 1]
        target_real = played_data.FTR.map({"A": -3, "D": 0, "H": 3})
        dc_real = played_data.FTR.map({"A": 1, "D": 0, "H": 0})
        ou25_target = played_data.UO25
        # Remove the outcome columns so they cannot be selected as features.
        played_data = played_data.drop([
            'FTR', 'FTHG', 'FTAG', 'UO25', "HLM", "ALM", 'BTTS', "HSc", "ASc"
        ],
                                       axis=1)

        def _significant_columns(target):
            """Return the columns correlated with ``target``, or all of them."""
            coef_data = played_data.corrwith(target)
            fallback_cols = list(
                played_data.drop(["Date", "played", "Time"], axis=1).columns)
            sig_data = coef_data.where(coef_data.abs() > 0.05).dropna()
            if len(sig_data.index) != 0:
                return list(sig_data.index)
            return fallback_cols

        # Same selection for each market, stored in the original order.
        for model_name, target in (("wdw_columns", target_real),
                                   ("dc_columns", dc_real),
                                   ("ou25_columns", ou25_target)):
            save_league_model_attr(model=model_name,
                                   league=lg,
                                   cols=_significant_columns(target))

        # date without time
        agg_data["Date"] = [
            pd.to_datetime(str(d)).date() for d in agg_data.Date.values
        ]
        agg_data = agg_data.drop(['FTHG', 'FTAG', "HSc", "ASc"], axis=1)
        agg_data.to_csv(self.clean_team_trend_data_directory.format(lg),
                        index=False)
        self.log.info("{} data saved in clean folder".format(lg.upper()))
    else:
        self.log.warning(
            "{} not processed as there are no fixtures for the next 3 days"
            .format(lg).upper())