def get_info(self, home_id, away_id, date, season): """Given the data and home/away team id's, get model features""" h_manager = get_manager(team_id=home_id, date=date) a_manager = get_manager(team_id=away_id, date=date) # Check that data was retrieved (catch the error sooner to speed up debugging) assert len(h_manager) > 0, 'No data returned for home manager' assert len(a_manager) > 0, 'No data returned for away manager' # Get the max date from the database conn, cursor = connect_to_db() max_date = run_query(cursor, 'select max(date) from main_fixtures') max_date = pd.to_datetime(max_date.iloc[0, 0]) # set the fixture_id to be 1 higher than the max fixture_id for that season max_fixture = run_query( cursor, "select max(fixture_id) id from main_fixtures " "where date = '{}'".format(str(max_date))) max_fixture = max_fixture.iloc[0, 0] info_dict = { "date": date, "home_id": home_id, "home_team": fetch_name(home_id), "away_id": away_id, "away_team": fetch_name(away_id), "fixture_id": max_fixture, "home_manager_start": h_manager.loc[0, "start_date"], "away_manager_start": a_manager.loc[0, "start_date"], "season": season } output = pd.DataFrame() output = output.append(pd.DataFrame(info_dict, index=[0])) conn.close() return output
def find_closest_match(team_name): # Hardcoded some of the matches because it was too difficult to match accurately if team_name == 'Man City': output = { "team_name": 'Manchester City', "team_id": fetch_id("Manchester City") } return output elif team_name == 'Spurs': output = {"team_name": 'Tottenham', "team_id": fetch_id("Tottenham")} return output else: conn, cursor = connect_to_db() df = run_query(cursor, "Select team_name, team_id from team_ids") df['l_dist'] = df['team_name'].apply( lambda x: Levenshtein.ratio(x, team_name)) max_similarity = max(df['l_dist']) closest_match = df.loc[df['l_dist'] == max_similarity, ['team_name', 'team_id']].reset_index(drop=True) output = { "team_name": closest_match.loc[0, 'team_name'], "team_id": closest_match.loc[0, 'team_id'] } conn.close() return output
async def get_fpl_teams(): """Download upcoming fixture data from the Fantasy Premier League website""" # Connect to the sqlite3 DB conn, cursor = connect_to_db() df_teams = pd.DataFrame() i=0 # Get all of the team IDs from the fpl_fixtures table teams = run_query(cursor, 'select distinct team_h from fpl_fixtures') n_teams = len(teams) assert n_teams == 20, 'The number of returned teams should be 20, ' \ 'but is actually {}'.format(n_teams) async with aiohttp.ClientSession() as session: fpl = FPL(session) team_data = await fpl.get_teams(return_json=True) for team in team_data: df_teams = df_teams.append(pd.DataFrame(team, index=[i])) i += 1 # Find the closest matching team names to the ones in our data matches = list(df_teams['name'].apply(lambda x: find_closest_match(x))) df_matches = pd.DataFrame() for match in matches: df_matches = df_matches.append(pd.DataFrame(match, index=[0])) df_teams = pd.concat([df_teams, df_matches.reset_index(drop=True)], axis=1) # Upload the data to a table in the database run_query(cursor, 'DROP TABLE IF EXISTS fpl_teams', return_data=False) df_teams.to_sql('fpl_teams', conn) conn.close()
def get_feature_data(min_training_data_date='2013-08-01'): conn, cursor = connect_to_db() df = run_query(cursor, """select t1.*, m_h.manager home_manager, m_h.start_date home_manager_start, m_a.manager away_manager, m_a.start_date away_manager_start from main_fixtures t1 left join managers m_h on t1.home_id = m_h.team_id and (t1.date between m_h.start_date and date(m_h.end_date, '+1 day') or t1.date > m_h.start_date and m_h.end_date is NULL) left join managers m_a on t1.away_id = m_a.team_id and (t1.date between m_a.start_date and date(m_a.end_date, '+1 day') or t1.date > m_a.start_date and m_a.end_date is NULL) where t1.date > '{}'""".format(min_training_data_date)) df=get_manager_features(df) df2 = run_query(cursor, "select * from team_fixtures where date > '{}'".format( min_training_data_date)) conn.close() df2['date'] = pd.to_datetime(df2['date']) df2 = pd.merge( df2, df[['date', 'season', 'fixture_id', 'home_manager_age', 'away_manager_age', 'home_manager_new', 'away_manager_new']], on=['date', 'season', 'fixture_id'], how="left") return df2
def get_defaut_collection(): server = server_carrier.get_server() uri = server.config.get('MONGODB_URI') db_conn = connect_to_db(DatabaseTypes.MONGODB, uri, DatabaseNames.python_test_db.value) member_coll = db_conn.get_collection(Collections.member.value) return member_coll
async def get_player_data(): """Download all data from the Fantasy Premier League website""" # Connect to the sqlite3 DB conn, cursor = connect_to_db() summary_df = pd.DataFrame() async with aiohttp.ClientSession() as session: fpl = FPL(session) players = await fpl.get_players() i = 0 for player in players: id = player.id i += 1 print("Loading data for player {} of {}".format(i, len(players))) player_summary = await fpl.get_player_summary(id) player_history = pd.DataFrame(player_summary.history) player_history['web_name'] = player.web_name player_history['id'] = player.id player_history['team'] = player.team player_history['team_code'] = player.team_code summary_df = summary_df.append(player_history) # Upload the data to a table in the database run_query(cursor, 'DROP TABLE IF EXISTS player_data', return_data=False) summary_df.to_sql('player_data', conn)
def fetch_name(team_id): """Get name from team ID""" conn, cursor = connect_to_db() cursor.execute("Select team_name from team_ids where team_id == ?", [team_id]) output = cursor.fetchone()[0] conn.close() return output
def fetch_id(team_name): """Get team ID from name""" conn, cursor = connect_to_db() cursor.execute( "Select team_id from team_ids where team_name = '{}' or alternate_name = '{}'" .format(team_name, team_name)) output = cursor.fetchone()[0] conn.close() return output
def get_upcoming_games(n=10): """Get the next n upcoming games in the Premier League""" conn, cursor = connect_to_db() query = """select kickoff_time, t2.team_id home_id, t2.team_name home_name, t3.team_id away_id, t3.team_name away_name from fpl_fixtures t1 left join fpl_teams t2 on t1.team_h = t2.id left join fpl_teams t3 on t1.team_a = t3.id where started = 0 order by kickoff_time limit {}""".format(n) df = run_query(cursor, query) return df
def get_teams_from_wiki(): # Connect to database conn, cursor = connect_to_db() website_url = requests.get( 'https://en.wikipedia.org/wiki/List_of_Premier_League_clubs').text soup = BeautifulSoup(website_url) # print(soup.prettify()) My_table = soup.find('div', {'class': 'timeline-wrapper'}) links = My_table.findAll('area') teams = [] for link in links: team = link.get('alt').encode('UTF-8').decode() # Convert &26 to & team = team.replace('%26', '&') # Remove ' A.F.C' and ' F.C' team = team.replace('A.F.C', '') team = team.replace('F.C', '') # Remove any '.'s team = team.replace('.', '') # Remove white space from the start and end team = team.lstrip().rstrip() teams.append(team) df = pd.DataFrame() df['team_name'] = teams df = df.sort_values('team_name') df['team_id'] = np.arange(len(df)) + 1 # Load the names into the database run_query(cursor, 'drop table if exists team_ids', return_data=False) run_query( cursor, 'create table team_ids (team_name TEXT, team_id INTEGER, alternate_name)', return_data=False) for row in df.iterrows(): params = [row[1]['team_name'], row[1]['team_id']] params.append(fetch_alternative_name(row[1]['team_name'])) run_query( cursor, 'insert into team_ids(team_name, team_id, alternate_name) values(?, ?, ?)', params=params, return_data=False) conn.commit() conn.close()
async def get_fpl_fixtures(): """Download upcoming fixture data from the Fantasy Premier League website""" # Connect to the sqlite3 DB conn, cursor = connect_to_db() fixture_list = pd.DataFrame() async with aiohttp.ClientSession() as session: fpl = FPL(session) fixtures = await fpl.get_fixtures(return_json=True) for fixture in fixtures: del fixture['stats'] fixture_list = fixture_list.append(pd.DataFrame(fixture, index=[0])) # Upload the data to a table in the database run_query(cursor, 'DROP TABLE IF EXISTS fpl_fixtures', return_data=False) fixture_list.to_sql('fpl_fixtures', conn)
def get_manager(team_id, date): """Find the sitting manager for a given team_id and date""" conn, cursor = connect_to_db() # Get the latest date in the db max_date = run_query(cursor, "select max(end_date) date from managers").loc[0, 'date'] # If we are predicting past our data if date > max_date: # Take the latest manager for the team query = """select * from managers where end_date = '{}' and team_id = {}""".format(max_date, team_id) else: query = """select * from managers where team_id = {} and '{}' between start_date and end_date""".format(team_id, date) df = run_query(cursor, query) rows = len(df) conn.close() if rows != 1: logger.warning("get_manager: Expected 1 row but got {}. Is the manager " "info up to date?".format(rows)) return df
def train_model(self, X, y): """Train a model on 90% of the data and predict 10% using KFold validation, such that a prediction is made for all data""" logger.info("Training model.") kf = KFold(n_splits=10) model_predictions = pd.DataFrame() for train_index, test_index in kf.split(X): xgb_model = xgb.XGBClassifier().fit( X=np.array(X.iloc[train_index, :][self.model_features]), y=np.array(y.iloc[train_index])) predictions = xgb_model.predict(np.array(X.iloc[test_index, :][self.model_features])) actuals = y.iloc[test_index] model_predictions = model_predictions.append( pd.concat([ X.iloc[test_index, :], pd.DataFrame(predictions, columns=['pred'], index=X.iloc[test_index, :].index), actuals], axis=1)) # Assess the model performance using the first performance metric main_performance_metric = self.performance_metrics[0].__name__ performance = self.performance_metrics[0](actuals, predictions) # If the model performs better than the previous model, save it # ToDo: Returning 0 when there is no performance score only works # for performance scores where higher is better if performance > self.performance.get(main_performance_metric, 0): self.trained_model = xgb_model for metric in self.performance_metrics: metric_name = metric.__name__ self.performance[metric_name] = metric(actuals, predictions) # Upload the predictions to the model_predictions table conn, cursor = connect_to_db() # Add model ID so we can compare model performances model_predictions['model_id'] = self.model_id # Add profit made if we bet on the game model_predictions['profit'] = model_predictions.apply(lambda x: get_profit(x), axis=1) run_query(cursor, "drop table if exists historic_predictions", return_data=False) if (not self.test_mode ) or self.upload_historic_predictions: model_predictions.to_sql( 'historic_predictions', con=conn, if_exists='append') conn.close()
def get_historic_predictions(self): conn, cursor = connect_to_db() df = run_query( cursor, "select * from historic_predictions where " "model_id = '{}'".format(self.model_id)) return df
import numpy as np from sklearn.preprocessing import QuantileTransformer from keras import regularizers from keras.activations import tanh from keras.layers import Dense from keras.models import Sequential from keras.layers import LSTM, Masking from keras.optimizers import adam from keras.callbacks import History import tensorflow as tf import talos as ta from sklearn.metrics import accuracy_score from sklearn.utils.class_weight import compute_class_weight # Connect to database conn, cursor = connect_to_db() # Get all fixtures after game week 8, excluding the last game week df = run_query( cursor, "select t1.*, m_h.manager home_manager, m_h.start home_manager_start, " "m_a.manager away_manager, m_a.start away_manager_start " "from main_fixtures t1 " "left join managers m_h " "on t1.home_id = m_h.team_id " "and (t1.date between m_h.start and date(m_h.end, '+1 day') or t1.date > m_h.start and m_h.end is NULL) " "left join managers m_a " "on t1.away_id = m_a.team_id " "and (t1.date between m_a.start and date(m_a.end, '+1 day') or t1.date > m_a.start and m_a.end is NULL) " "where t1.date > '2013-08-01'")
def get_training_data(self): conn, cursor = connect_to_db() # Get all fixtures after game week 8, excluding the last game week df = run_query(cursor, self.training_data_query) return df
def combine_team_poisson_probabilities(): # Connect to database conn, cursor = connect_to_db() # Extract data, join the poisson_team_odds table on itself to combine the home and away teams. query = ''' select -- IDENTIFIERS fixture_id, date, season, home_team, away_team, -- SCORE PROBABILITIES 0.5 * (gf_0_h + ga_0_a) * 0.5 * (gf_0_a + ga_0_h) p0_0, 0.5 * (gf_1_h + ga_1_a) * 0.5 * (gf_0_a + ga_0_h) p1_0, 0.5 * (gf_2_h + ga_2_a) * 0.5 * (gf_0_a + ga_0_h) p2_0, 0.5 * (gf_3_h + ga_3_a) * 0.5 * (gf_0_a + ga_0_h) p3_0, 0.5 * (gf_0_h + ga_0_a) * 0.5 * (gf_1_a + ga_1_h) p0_1, 0.5 * (gf_1_h + ga_1_a) * 0.5 * (gf_1_a + ga_1_h) p1_1, 0.5 * (gf_2_h + ga_2_a) * 0.5 * (gf_1_a + ga_1_h) p2_1, 0.5 * (gf_3_h + ga_3_a) * 0.5 * (gf_1_a + ga_1_h) p3_1, 0.5 * (gf_0_h + ga_0_a) * 0.5 * (gf_2_a + ga_2_h) p0_2, 0.5 * (gf_1_h + ga_1_a) * 0.5 * (gf_2_a + ga_2_h) p1_2, 0.5 * (gf_2_h + ga_2_a) * 0.5 * (gf_2_a + ga_2_h) p2_2, 0.5 * (gf_3_h + ga_3_a) * 0.5 * (gf_2_a + ga_2_h) p3_2, 0.5 * (gf_0_h + ga_0_a) * 0.5 * (gf_3_a + ga_3_h) p0_3, 0.5 * (gf_1_h + ga_1_a) * 0.5 * (gf_3_a + ga_3_h) p1_3, 0.5 * (gf_2_h + ga_2_a) * 0.5 * (gf_3_a + ga_3_h) p2_3, 0.5 * (gf_3_h + ga_3_a) * 0.5 * (gf_3_a + ga_3_h) p3_3 from ( select -- IDENTIFIERS t1.fixture_id, t1.date, t1.season, t1.team_name home_team, t2.team_name away_team, -- HOME TEAM PROBABILITIES t1.gf_prob_0 gf_0_h, t1.gf_prob_1 gf_1_h, t1.gf_prob_2 gf_2_h, t1.gf_prob_3 gf_3_h, t1.ga_prob_0 ga_0_h, t1.ga_prob_1 ga_1_h, t1.ga_prob_2 ga_2_h, t1.ga_prob_3 ga_3_h, -- AWAY TEAM PROBABILITIES t2.gf_prob_0 gf_0_a, t2.gf_prob_1 gf_1_a, t2.gf_prob_2 gf_2_a, t2.gf_prob_3 gf_3_a, t2.ga_prob_0 ga_0_a, t2.ga_prob_1 ga_1_a, t2.ga_prob_2 ga_2_a, t2.ga_prob_3 ga_3_a from (select * from poisson_team_odds where is_home = 1) t1 -- home_teams left join (select * from poisson_team_odds where is_home = 0) t2 -- away_teams on t1.fixture_id = t2.fixture_id and t1.season = t2.season and t1.team_name is not t2.team_name )''' df = run_query(cursor, query) df['prob_sum'] = df.select_dtypes('float64').apply(lambda x: sum(x), axis=1) # Round floats to 2 decimal places float_columns = df.select_dtypes(include='float64').columns df[float_columns] = round(df.select_dtypes(include='float64'), 4) # Create the DB table to store data cursor.execute("DROP TABLE IF EXISTS poisson_match_probabilities") cursor.execute("""CREATE TABLE poisson_match_probabilities (fixture_id INT, date DATE, season TEXT, home_team TEXT, away_team TEXT, p0_0 FLOAT, p1_0 FLOAT, p2_0 FLOAT, p3_0 FLOAT, p0_1 FLOAT, p1_1 FLOAT, p2_1 FLOAT, p3_1 FLOAT, p0_2 FLOAT, p1_2 FLOAT, p2_2 FLOAT, p3_2 FLOAT, p0_3 FLOAT, p1_3 FLOAT, p2_3 FLOAT, p3_3 FLOAT, prob_sum FLOAT)""") # Load data into the DB for row in df.iterrows(): params = [row[1][0], str(row[1][1]), str(row[1][2]), str(row[1][3]), row[1][4], row[1][5], row[1][6], row[1][7], row[1][8], row[1][9], row[1][10], row[1][11], row[1][12], row[1][13], row[1][14], row[1][15], row[1][16], row[1][17], row[1][18], row[1][19], row[1][20], row[1][21]] cursor.execute('insert into poisson_match_probabilities VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)', params) conn.commit()
def create_team_poisson_probabilities(): # Connect to database conn, cursor = connect_to_db() # Extract data query = 'select fixture_id, team_name, date, season, is_home, goals_for, goals_against from team_fixtures' df = run_query(cursor, query) # Sort the data by season, team_name and date df = df.sort_values(['season', 'team_name', 'date']) # Calculate the moving average # NOTE: The moving average value uses the previous games data, not the goals_for and goals_against on that row. df['goals_for_mavg'] = df.groupby(['team_name', 'season'])['goals_for'].\ transform(lambda x: x.rolling(8, 8).mean()).shift(1) df['goals_against_mavg'] = df.groupby(['team_name', 'season'])['goals_against'].\ transform(lambda x: x.rolling(8, 8).mean()).shift(1) # Calculate the moving standard deviation #df['goals_for_sdavg'] = df.groupby(['team_name', 'season'])['goals_for'].\ # transform(lambda x: x.rolling(8, 8).std()).shift(1) #df['goals_against_sdavg'] = df.groupby(['team_name', 'season'])['goals_against'].\ # transform(lambda x: x.rolling(8, 8).std()).shift(1) # Get goals_for probabilities df['gf_prob_0'] = df['goals_for_mavg'].apply(lambda x: scipy.stats.distributions.poisson.pmf(0, x)) df['gf_prob_1'] = df['goals_for_mavg'].apply(lambda x: scipy.stats.distributions.poisson.pmf(1, x)) df['gf_prob_2'] = df['goals_for_mavg'].apply(lambda x: scipy.stats.distributions.poisson.pmf(2, x)) df['gf_prob_3'] = df['goals_for_mavg'].apply(lambda x: scipy.stats.distributions.poisson.pmf(3, x)) df['gf_prob_other'] = df.apply(lambda x: 1-(x['gf_prob_0'] + x['gf_prob_1'] + x['gf_prob_2'] + x['gf_prob_3']), axis=1) # Get goals_against probabilities df['ga_prob_0'] = df['goals_against_mavg'].apply(lambda x: scipy.stats.distributions.poisson.pmf(0, x)) df['ga_prob_1'] = df['goals_against_mavg'].apply(lambda x: scipy.stats.distributions.poisson.pmf(1, x)) df['ga_prob_2'] = df['goals_against_mavg'].apply(lambda x: scipy.stats.distributions.poisson.pmf(2, x)) df['ga_prob_3'] = df['goals_against_mavg'].apply(lambda x: scipy.stats.distributions.poisson.pmf(3, x)) df['ga_prob_other'] = df.apply(lambda x: 1 - (x['ga_prob_0'] + x['ga_prob_1'] + x['ga_prob_2'] + x['ga_prob_3']), axis=1) # ToDo: Get the data also in key-value pair, e.g type: goals_for/goals_against, amount:, value: for easier plotting # Create the DB table to store data cursor.execute("DROP TABLE IF EXISTS poisson_team_odds") cursor.execute("""CREATE TABLE poisson_team_odds (fixture_id INT, team_name TEXT, date DATE, season TEXT, is_home INT, goals_for FLOAT, goals_against FLOAT, goals_for_mavg FLOAT, goals_against_mavg FLOAT, gf_prob_0 FLOAT, gf_prob_1 FLOAT, gf_prob_2 FLOAT, gf_prob_3 FLOAT, ga_prob_0 FLOAT, ga_prob_1 FLOAT, ga_prob_2 FLOAT, ga_prob_3 FLOAT)""") # Remove Nans (the first 8 games for each team) df = df.dropna() # Round floats to 2 decimal places float_columns = df.select_dtypes(include='float64').columns df[float_columns] = round(df.select_dtypes(include='float64'), 4) # Load data into the DB for row in df.iterrows(): params = [row[1][0], str(row[1][1]), str(row[1][2]), str(row[1][3]), row[1][4], row[1][5], row[1][6], row[1][7], row[1][8], row[1][9], row[1][10], row[1][11], row[1][12], row[1][13], row[1][14], row[1][15], row[1][16]] cursor.execute('''insert into poisson_team_odds VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)''', params) conn.commit()