예제 #1
0
    def predict(self, league, market):
        """Predict one market for the unplayed games of a league.

        Loads the persisted classifier stored under
        ``tools/league_models/<league>_<market>``, applies it to the feature
        columns supplied by ``self.unplayed_df`` and turns the predicted
        classes into labels.

        :param league: league identifier; used in the model path and in the
            ``team_mapping/<league>`` config lookup.
        :param market: market name. ``"wdw"`` is mapped through
            ``self.result_map``, ``"ou25"`` through ``self.inverse_ou_class``
            and ``"dc"`` to ``'1X'``/``'12'``; any other value keeps the raw
            classifier output.
        :return: DataFrame with the columns ``date``, ``time``, ``home``,
            ``away``, ``<market>`` and ``league``; ``None`` when
            ``self.unplayed_df`` returns ``None`` or when loading the model,
            predicting or post-processing raises (the error is logged).
        """
        self.log.info("Predicting {} {}".format(league, market))
        unplayed_data_columns = self.unplayed_df(league=league, market=market)
        if unplayed_data_columns is None:
            return None
        # Work on a private copy: columns are added below and the source frame
        # may be a filtered view of a larger one.
        unplayed_data = unplayed_data_columns[0].copy()
        columns = unplayed_data_columns[1]
        unplayed_df = unplayed_data[columns]

        try:
            clf = joblib.load(
                get_analysis_root_path('tools/league_models/{}_{}'.format(
                    league, market)))

            unplayed_data[market] = clf.predict(unplayed_df)

            # Replacements are assigned back instead of using chained
            # ``inplace=True`` calls, which operate on a temporary object and
            # leave the frame untouched under pandas Copy-on-Write.
            if market == "wdw":
                # NOTE(review): column order 0/1/2 = draw/away/home presumably
                # mirrors the class encoding used at training time - confirm.
                # These probability columns are not part of the returned frame.
                outcome_probs = clf.predict_proba(unplayed_df)
                unplayed_data["d_prob"] = outcome_probs[:, 0]
                unplayed_data["a_prob"] = outcome_probs[:, 1]
                unplayed_data["h_prob"] = outcome_probs[:, 2]
                unplayed_data[market] = unplayed_data[market].replace(
                    self.result_map)
            elif market == "ou25":
                unplayed_data[market] = unplayed_data[market].replace(
                    self.inverse_ou_class)
            elif market == "dc":
                unplayed_data[market] = unplayed_data[market].replace({
                    0: '1X',
                    1: '12'
                })

            unplayed_data["league"] = league

            # ``rename`` returns a new frame, so the later column assignments
            # do not write into a slice of ``unplayed_data``.
            prediction = unplayed_data[[
                "Date", "Time", "HomeTeam", "AwayTeam", market, "league"
            ]].rename(index=str,
                      columns={
                          "Date": "date",
                          "Time": "time",
                          "HomeTeam": "home",
                          "AwayTeam": "away"
                      })

            # The config maps key -> value; predictions hold the values, so
            # the inverted mapping converts them back to the config keys.
            team_mapping = get_config(file="team_mapping/{}".format(league))
            team_mapping_inv = {v: k for k, v in team_mapping.items()}

            prediction["home"] = prediction["home"].replace(team_mapping_inv)
            prediction["away"] = prediction["away"].replace(team_mapping_inv)
            return prediction

        except Exception as e:
            # Best-effort: a failing league/market must not abort the caller's
            # loop over the remaining leagues and markets.
            self.log.error(msg="The following error occurred: {}".format(e))
            return None
예제 #2
0
def unplayed_games(league: str, market: str):
    """Return a league's unplayed games together with the market's columns.

    Reads the column list from the ``<market>_columns/<league>`` config and
    the league's cleaned team-trend CSV, then keeps the rows whose ``played``
    value is 0.

    :param league: league identifier used in the config and CSV file names.
    :param market: market name used to pick the column config.
    :return: ``(unplayed_rows, columns)`` when at least one unplayed row
        exists, otherwise ``None``.
    """
    columns = get_config(file="{}_columns/{}".format(market, league))

    trend_csv = get_analysis_root_path(
        'tools/data/clean_data/team_trend/{}.csv'.format(league))
    league_frame = pd.read_csv(trend_csv, index_col=False)

    not_played = league_frame["played"] == 0
    pending = league_frame[not_played]

    # Nothing left to predict for this league.
    if len(pending) == 0:
        return None
    return pending, columns
예제 #3
0
    def fetch_all_league_fixtures(self, league):
        """Load a league's fixtures restricted to a date window.

        Reads the ``Date``, ``Time``, ``HomeTeam`` and ``AwayTeam`` columns of
        ``tools/data/fixtures/all_fixtures/<league>.csv``, passes the frame
        through ``team_translation`` and keeps the rows whose ``Date`` lies
        between the start and end dates returned by
        ``get_start_and_end_dates(end_days=self.days)``.

        :param league: league identifier used in the fixture file name.
        :return: DataFrame of the fixtures inside the window, with ``Date``
            restored as a regular column.
        """
        self.log.info("Getting {} league fixture".format(league))

        fixtures_csv = get_analysis_root_path(
            'tools/data/fixtures/all_fixtures/{}.csv'.format(league))
        fixtures = pd.read_csv(
            fixtures_csv, usecols=['Date', 'Time', 'HomeTeam', 'AwayTeam'])
        fixtures = team_translation(data=fixtures, league=league)

        start_date, end_date = get_start_and_end_dates(end_days=self.days)

        # Label-slice on the Date index, then turn Date back into a column.
        by_date = fixtures.set_index('Date')
        in_window = by_date.loc[start_date:end_date]
        return in_window.reset_index()
예제 #4
0
    def save_prediction(self, league, market):
        """Predict a league/market and persist the result.

        The predictions from ``self.predict`` are sorted by date, time and
        league and written to the ``wdw_football`` collection of the
        ``sports_prediction`` database: a fixture with no matching document is
        inserted, a fixture with exactly one matching document is updated with
        ``$set``. Fixtures matching more than one document are left untouched.
        If anything in the database step raises, the error is logged and the
        sorted predictions are written to
        ``tools/data/predictions/<league>_<market>.csv`` instead.

        :param league: league identifier passed on to ``self.predict``.
        :param market: market name passed on to ``self.predict``.
        :return: ``None``.
        """
        match_predictions = self.predict(league=league, market=market)
        if match_predictions is None:
            return

        self.log.info(
            "{} prediction dataframe sorted by date, time and league".
            format(league))
        preds = match_predictions.sort_values(['date', 'time', 'league'])
        pred_list = []

        # Created inside the ``try`` so that a failing connection still falls
        # back to the CSV; tracked here so ``finally`` can always close it.
        client = None
        try:
            self.log.info("Connecting to the database")
            client = MongoClient(mongodb_uri, connectTimeoutMS=30000)
            db = client.get_database("sports_prediction")

            wdw_football = db.wdw_football

            self.log.info("Inserting predictions")
            for idx, pred in preds.iterrows():

                pred = dict(pred)
                # Fields that identify one fixture in the collection.
                exist = {
                    'league': pred.get('league'),
                    'home': pred.get('home'),
                    'away': pred.get('away'),
                    'time': pred.get('time'),
                    'date': pred.get('date')
                }
                wdw_count = wdw_football.count_documents(exist)

                if wdw_count == 0:
                    # New fixture: collect it for one bulk insert below.
                    pred_list.append(pred)
                elif wdw_count == 1:
                    wdw_football.update_one(exist, {'$set': pred})

            if pred_list:
                wdw_football.insert_many(pred_list)
            self.log.info("Done!!!")
        except Exception as e:
            self.log.error(
                "Could not save {} {} into the database: \n{}".format(
                    league, market, str(e)))
            preds.to_csv(get_analysis_root_path(
                'tools/data/predictions/{}_{}.csv'.format(league, market)),
                         index=False)
        finally:
            # Release the connection on success and on failure alike.
            if client is not None:
                client.close()
@author: tola
"""
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.externals import joblib
from imblearn.over_sampling import SMOTE

from utils import get_analysis_root_path, get_config
from te_logger.logger import log

leagues_data = get_config(file="league")
leagues = list(leagues_data.keys())

for league in leagues:
    log.info(msg="Building model for league: {}".format(league))
    lg_data_path = get_analysis_root_path(
        'tools/data/clean_data/team_trend/{}.csv'.format(league))
    try:
        games = pd.read_csv(lg_data_path)
        games = games.dropna(how='any')

        model_columns = get_config(file="wdw_columns/{}".format(league))
        played_data = games.loc[
            (games.Season.isin([1415, 1516, 1617, 1718, 1819]))
            & (games.played == 1)]

        target = played_data.FTR.map({"D": 0, "A": 1, "H": 2})

        # Select significant columns
        data = played_data[model_columns]

        model = LogisticRegression(C=1e5)
예제 #6
0
                    wdw_count = wdw_football.count_documents(exist)

                    if wdw_count == 0:
                        pred_list.append(pred)
                    elif wdw_count == 1:
                        wdw_football.update_one(exist, {'$set': pred})

                if len(pred_list) != 0:
                    wdw_football.insert_many(pred_list)
                self.log.info("Done!!!")
            except Exception as e:
                self.log.error(
                    "Could not save {} {} into the database: \n{}".format(
                        league, market, str(e)))
                preds.to_csv(get_analysis_root_path(
                    'tools/data/predictions/{}_{}.csv'.format(league, market)),
                             index=False)


# Script entry point: for every configured league that has a selected-fixtures
# file, store the predictions of each supported market; other leagues only get
# a warning.
if __name__ == '__main__':
    dr = Predictors()
    markets = ["wdw", "dc", "ou25"]
    for lg in get_config().keys():
        fixture_file = get_analysis_root_path(
            'tools/data/fixtures/selected_fixtures/{}.csv'.format(lg))
        if not os.path.exists(fixture_file):
            log.warning("{} has no new games".format(lg).upper())
            continue
        for mkt in markets:
            dr.save_prediction(league=lg, market=mkt)
 def __init__(self):
     """Store the path template of the cleaned team-trend CSV files.

     The template keeps a ``{}`` placeholder that is filled in with
     ``str.format`` when a file is written.
     """
     trend_template = get_analysis_root_path(
         'tools/data/clean_data/team_trend/{}.csv')
     self.clean_team_trend_data_directory = trend_template
    def store_significant_columns(self, lg="england_premiership"):
        """Build a league's cleaned trend data and store its model columns.

        When ``tools/data/fixtures/selected_fixtures/<lg>.csv`` exists, the
        method:

        1. loads those fixtures and fills the result columns with placeholder
           values (``played`` = 0);
        2. loads the league's results from the ``sports_prediction`` MongoDB
           database (``played`` = 1) and derives ``BTTS``, ``HSc`` and ``ASc``;
        3. concatenates both, runs
           ``self.compute_last_point_ave_goals_and_goals_conceded`` and derives
           ``UO25``;
        4. for each of the ``wdw``, ``dc`` and ``ou25`` targets stores, via
           ``save_league_model_attr``, the numeric columns of the played games
           whose absolute correlation with the target exceeds 0.05 (or every
           remaining column when none qualifies);
        5. writes the aggregated frame to the cleaned team-trend CSV.

        When the fixture file is missing only a warning is logged.

        :param lg: league identifier used for file names, the MongoDB
            collection name and the stored column configs.
        :return: ``None``.
        """
        self.log = log

        self.log.info("Processing {} data".format(lg))

        fix_path = get_analysis_root_path(
            'tools/data/fixtures/selected_fixtures/{}.csv'.format(lg))
        if not os.path.exists(fix_path):
            self.log.warning(
                "{} not processed as there are no fixtures for the next 3 days"
                .format(lg).upper())
            return

        # Upcoming fixtures: result columns get placeholders so the rows can
        # be concatenated with the played games.
        fix_data = pd.read_csv(fix_path)
        fix_data.loc[:, "FTHG"] = 0
        fix_data.loc[:, "FTAG"] = 0
        fix_data.loc[:, "FTR"] = 'D'
        fix_data.loc[:, "Season"] = 1819
        fix_data.loc[:, "played"] = 0
        fix_data.loc[:, "BTTS"] = 0
        fix_data.loc[:, "HSc"] = 0
        fix_data.loc[:, "ASc"] = 0

        # The cursor is materialised by ``list`` while the client is open, so
        # the connection can be closed right after the read.
        client = MongoClient(mongodb_uri, connectTimeoutMS=30000)
        try:
            db = client.get_database("sports_prediction")
            lg_data = db[lg]
            data = pd.DataFrame(list(lg_data.find({})),
                                columns=[
                                    "Date", "HomeTeam", "AwayTeam", "FTHG",
                                    "FTAG", "FTR", "Season"
                                ])
        finally:
            client.close()

        data = team_translation(data=data, league=lg)
        data.loc[:, "played"] = 1
        data.loc[:, "FTHG"] = pd.to_numeric(data.FTHG.values)
        data.loc[:, "FTAG"] = pd.to_numeric(data.FTAG.values)
        # BTTS: 1 when both sides scored, 0 when at least one did not.
        data.loc[(data.FTHG > 0) & (data.FTAG > 0), 'BTTS'] = 1
        data.loc[(data.FTHG == 0) | (data.FTAG == 0), 'BTTS'] = 0
        # HSc / ASc: 1 when the home / away side scored.
        data.loc[:, 'HSc'] = list(
            np.where(pd.to_numeric(data.FTHG.values) > 0, 1, 0))
        data.loc[:, 'ASc'] = list(
            np.where(pd.to_numeric(data.FTAG.values) > 0, 1, 0))

        agg_data = pd.concat([data, fix_data],
                             ignore_index=True,
                             sort=False)

        agg_data = self.compute_last_point_ave_goals_and_goals_conceded(
            data=agg_data, lg=lg)
        agg_data = agg_data.fillna(0)
        # UO25: 1 when the total number of goals exceeds 2.5.
        agg_data.loc[:, 'UO25'] = list(
            np.where((pd.to_numeric(agg_data.FTHG.values) +
                      pd.to_numeric(agg_data.FTAG.values)) > 2.5, 1, 0))

        #TODO: make sure that played_data is used to find significant columns
        played_data = agg_data[agg_data["played"] == 1]

        # (config name, target series) for every supported market.
        market_targets = [
            ("wdw_columns", played_data.FTR.map({"A": -3, "D": 0, "H": 3})),
            ("dc_columns", played_data.FTR.map({"A": 1, "D": 0, "H": 0})),
            ("ou25_columns", played_data.UO25),
        ]
        played_data = played_data.drop([
            'FTR', 'FTHG', 'FTAG', 'UO25', "HLM", "ALM", 'BTTS', "HSc",
            "ASc"
        ],
                                       axis=1)

        # Used when no column passes the correlation threshold.
        fallback_cols = list(
            played_data.drop(["Date", "played", "Time"], axis=1).columns)
        # Correlate numeric/bool columns only: ``corrwith`` raises on string
        # columns (team names, dates) when ``numeric_only`` is False, the
        # default since pandas 2.0.
        numeric_features = played_data.select_dtypes(
            include=[np.number, "bool"])

        for model_name, target in market_targets:
            coef_data = numeric_features.corrwith(target)
            # Constant columns correlate to NaN and are dropped here as well.
            sig_data = coef_data.where(coef_data.abs() > 0.05).dropna()
            sig_cols = list(sig_data.index)
            if not sig_cols:
                sig_cols = list(fallback_cols)
            save_league_model_attr(model=model_name,
                                   league=lg,
                                   cols=sig_cols)

        # date without time
        agg_data["Date"] = [
            pd.to_datetime(str(d)).date() for d in agg_data.Date.values
        ]

        agg_data = agg_data.drop(['FTHG', 'FTAG', "HSc", "ASc"], axis=1)
        agg_data.to_csv(self.clean_team_trend_data_directory.format(lg),
                        index=False)
        self.log.info("{} data saved in clean folder".format(lg.upper()))