async def get_fpl_teams():
    """Download team data from the Fantasy Premier League website"""
    # Connect to the sqlite3 DB
    conn, cursor = connect_to_db()
    df_teams = pd.DataFrame()
    i = 0
    # Get all of the team IDs from the fpl_fixtures table
    teams = run_query(cursor, 'select distinct team_h from fpl_fixtures')
    n_teams = len(teams)
    assert n_teams == 20, 'The number of returned teams should be 20, ' \
                          'but is actually {}'.format(n_teams)
    async with aiohttp.ClientSession() as session:
        fpl = FPL(session)
        team_data = await fpl.get_teams(return_json=True)
        for team in team_data:
            df_teams = df_teams.append(pd.DataFrame(team, index=[i]))
            i += 1
    # Find the closest matching team names to the ones in our data
    matches = list(df_teams['name'].apply(lambda x: find_closest_match(x)))
    df_matches = pd.DataFrame()
    for match in matches:
        df_matches = df_matches.append(pd.DataFrame(match, index=[0]))
    df_teams = pd.concat([df_teams, df_matches.reset_index(drop=True)], axis=1)
    # Upload the data to a table in the database
    run_query(cursor, 'DROP TABLE IF EXISTS fpl_teams', return_data=False)
    df_teams.to_sql('fpl_teams', conn)
    conn.close()
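# NOTE: `connect_to_db` and `run_query` are project helpers that are not shown
# in these snippets. A minimal sketch consistent with how they are called here;
# the real implementations and the DB path may differ (one snippet also calls
# run_query(query=...) without a cursor, so the project may have more than one
# variant of this helper):
import sqlite3
import pandas as pd

def connect_to_db(db_path='data.sqlite'):  # hypothetical DB path
    """Open the project's sqlite3 database and return (connection, cursor)."""
    conn = sqlite3.connect(db_path)
    return conn, conn.cursor()

def run_query(cursor, query, params=None, return_data=True):
    """Execute a query; return the result set as a DataFrame when requested."""
    cursor.execute(query, params or [])
    if not return_data:
        return None
    columns = [col[0] for col in cursor.description]
    return pd.DataFrame(cursor.fetchall(), columns=columns)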
async def get_player_data():
    """Download all player data from the Fantasy Premier League website"""
    # Connect to the sqlite3 DB
    conn, cursor = connect_to_db()
    summary_df = pd.DataFrame()
    async with aiohttp.ClientSession() as session:
        fpl = FPL(session)
        players = await fpl.get_players()
        i = 0
        for player in players:
            player_id = player.id  # renamed from `id` to avoid shadowing the builtin
            i += 1
            print("Loading data for player {} of {}".format(i, len(players)))
            player_summary = await fpl.get_player_summary(player_id)
            player_history = pd.DataFrame(player_summary.history)
            player_history['web_name'] = player.web_name
            player_history['id'] = player.id
            player_history['team'] = player.team
            player_history['team_code'] = player.team_code
            summary_df = summary_df.append(player_history)
    # Upload the data to a table in the database
    run_query(cursor, 'DROP TABLE IF EXISTS player_data', return_data=False)
    summary_df.to_sql('player_data', conn)
    conn.close()
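# Usage sketch: the async downloaders are driven with asyncio (assumes Python
# 3.7+ for asyncio.run). `refresh_fpl_data` is a hypothetical driver, not part
# of the original code; note that get_fpl_teams reads the fpl_fixtures table,
# so the fixtures download (defined further below) must run first.
import asyncio

def refresh_fpl_data():
    """Hypothetical driver: refresh all FPL tables in dependency order."""
    asyncio.run(get_fpl_fixtures())  # defined further below
    asyncio.run(get_fpl_teams())     # reads fpl_fixtures, so runs second
    asyncio.run(get_player_data())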
def get_info(self, home_id, away_id, date, season):
    """Given the date and home/away team IDs, get model features"""
    h_manager = get_manager(team_id=home_id, date=date)
    a_manager = get_manager(team_id=away_id, date=date)
    # Check that data was retrieved (catch the error sooner to speed up debugging)
    assert len(h_manager) > 0, 'No data returned for home manager'
    assert len(a_manager) > 0, 'No data returned for away manager'
    # Get the max date from the database
    conn, cursor = connect_to_db()
    max_date = run_query(cursor, 'select max(date) from main_fixtures')
    max_date = pd.to_datetime(max_date.iloc[0, 0])
    # Use the max fixture_id from the most recent date in main_fixtures
    max_fixture = run_query(
        cursor,
        "select max(fixture_id) id from main_fixtures "
        "where date = '{}'".format(str(max_date)))
    max_fixture = max_fixture.iloc[0, 0]
    info_dict = {
        "date": date,
        "home_id": home_id,
        "home_team": fetch_name(home_id),
        "away_id": away_id,
        "away_team": fetch_name(away_id),
        "fixture_id": max_fixture,
        "home_manager_start": h_manager.loc[0, "start_date"],
        "away_manager_start": a_manager.loc[0, "start_date"],
        "season": season
    }
    output = pd.DataFrame(info_dict, index=[0])
    conn.close()
    return output
def get_feature_data(min_training_data_date='2013-08-01'):
    conn, cursor = connect_to_db()
    df = run_query(cursor, """
        select t1.*,
               m_h.manager home_manager, m_h.start_date home_manager_start,
               m_a.manager away_manager, m_a.start_date away_manager_start
        from main_fixtures t1
        left join managers m_h
            on t1.home_id = m_h.team_id
            and (t1.date between m_h.start_date and date(m_h.end_date, '+1 day')
                 or t1.date > m_h.start_date and m_h.end_date is NULL)
        left join managers m_a
            on t1.away_id = m_a.team_id
            and (t1.date between m_a.start_date and date(m_a.end_date, '+1 day')
                 or t1.date > m_a.start_date and m_a.end_date is NULL)
        where t1.date > '{}'""".format(min_training_data_date))
    df = get_manager_features(df)
    df2 = run_query(cursor, "select * from team_fixtures where date > '{}'".format(
        min_training_data_date))
    conn.close()
    df2['date'] = pd.to_datetime(df2['date'])
    df2 = pd.merge(
        df2,
        df[['date', 'season', 'fixture_id', 'home_manager_age', 'away_manager_age',
            'home_manager_new', 'away_manager_new']],
        on=['date', 'season', 'fixture_id'],
        how="left")
    return df2
def find_closest_match(team_name):
    # Some matches are hardcoded because fuzzy matching cannot resolve them reliably
    if team_name == 'Man City':
        return {
            "team_name": 'Manchester City',
            "team_id": fetch_id("Manchester City")
        }
    elif team_name == 'Spurs':
        return {"team_name": 'Tottenham', "team_id": fetch_id("Tottenham")}
    else:
        conn, cursor = connect_to_db()
        df = run_query(cursor, "select team_name, team_id from team_ids")
        df['l_dist'] = df['team_name'].apply(
            lambda x: Levenshtein.ratio(x, team_name))
        max_similarity = max(df['l_dist'])
        closest_match = df.loc[df['l_dist'] == max_similarity,
                               ['team_name', 'team_id']].reset_index(drop=True)
        output = {
            "team_name": closest_match.loc[0, 'team_name'],
            "team_id": closest_match.loc[0, 'team_id']
        }
        conn.close()
        return output
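# Illustration of the fuzzy matching above: python-Levenshtein's ratio returns
# a similarity score between 0 and 1, so abbreviations that share little text
# with the full club name score poorly, which is why 'Spurs' and 'Man City'
# are handled as hardcoded special cases.
import Levenshtein

Levenshtein.ratio('Manchester United', 'Man United')  # fairly high: long shared substrings
Levenshtein.ratio('Tottenham', 'Spurs')               # low: almost no overlap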
async def get_fpl_fixtures():
    """Download upcoming fixture data from the Fantasy Premier League website"""
    # Connect to the sqlite3 DB
    conn, cursor = connect_to_db()
    fixture_list = pd.DataFrame()
    async with aiohttp.ClientSession() as session:
        fpl = FPL(session)
        fixtures = await fpl.get_fixtures(return_json=True)
        for fixture in fixtures:
            del fixture['stats']
            fixture_list = fixture_list.append(pd.DataFrame(fixture, index=[0]))
    # Upload the data to a table in the database
    run_query(cursor, 'DROP TABLE IF EXISTS fpl_fixtures', return_data=False)
    fixture_list.to_sql('fpl_fixtures', conn)
    conn.close()
def get_upcoming_games(n=10):
    """Get the next n upcoming games in the Premier League"""
    conn, cursor = connect_to_db()
    query = """select kickoff_time,
                      t2.team_id home_id, t2.team_name home_name,
                      t3.team_id away_id, t3.team_name away_name
               from fpl_fixtures t1
               left join fpl_teams t2 on t1.team_h = t2.id
               left join fpl_teams t3 on t1.team_a = t3.id
               where started = 0
               order by kickoff_time
               limit {}""".format(n)
    df = run_query(cursor, query)
    conn.close()
    return df
def get_manager(team_id, date):
    """Find the sitting manager for a given team_id and date"""
    conn, cursor = connect_to_db()
    # Get the latest date in the db
    max_date = run_query(
        cursor, "select max(end_date) date from managers").loc[0, 'date']
    # If we are predicting past our data
    if date > max_date:
        # Take the latest manager for the team
        query = """select * from managers
                   where end_date = '{}' and team_id = {}""".format(max_date, team_id)
    else:
        query = """select * from managers
                   where team_id = {}
                   and '{}' between start_date and end_date""".format(team_id, date)
    df = run_query(cursor, query)
    rows = len(df)
    conn.close()
    if rows != 1:
        logger.warning("get_manager: Expected 1 row but got {}. Is the manager "
                       "info up to date?".format(rows))
    return df
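# Usage sketch: dates are passed and compared as ISO-format strings
# ('YYYY-MM-DD'), which order correctly under plain string comparison. The
# team_id below is illustrative only.
sitting_manager = get_manager(team_id=1, date='2020-01-18')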
def train_model(self, X, y):
    """Train a model on 90% of the data and predict the held-out 10% using
    KFold validation, such that a prediction is made for all data"""
    logger.info("Training model.")
    kf = KFold(n_splits=10)
    model_predictions = pd.DataFrame()
    for train_index, test_index in kf.split(X):
        xgb_model = xgb.XGBClassifier().fit(
            X=np.array(X.iloc[train_index, :][self.model_features]),
            y=np.array(y.iloc[train_index]))
        predictions = xgb_model.predict(
            np.array(X.iloc[test_index, :][self.model_features]))
        actuals = y.iloc[test_index]
        model_predictions = model_predictions.append(
            pd.concat([
                X.iloc[test_index, :],
                pd.DataFrame(predictions, columns=['pred'],
                             index=X.iloc[test_index, :].index),
                actuals], axis=1))
        # Assess the model performance using the first performance metric
        main_performance_metric = self.performance_metrics[0].__name__
        performance = self.performance_metrics[0](actuals, predictions)
        # If the model performs better than the previous model, save it
        # ToDo: Returning 0 when there is no performance score only works
        #  for performance scores where higher is better
        if performance > self.performance.get(main_performance_metric, 0):
            self.trained_model = xgb_model
            for metric in self.performance_metrics:
                metric_name = metric.__name__
                self.performance[metric_name] = metric(actuals, predictions)
    # Upload the predictions to the model_predictions table
    conn, cursor = connect_to_db()
    # Add model ID so we can compare model performances
    model_predictions['model_id'] = self.model_id
    # Add profit made if we bet on the game
    model_predictions['profit'] = model_predictions.apply(
        lambda x: get_profit(x), axis=1)
    run_query(cursor, "drop table if exists historic_predictions",
              return_data=False)
    if (not self.test_mode) or self.upload_historic_predictions:
        model_predictions.to_sql(
            'historic_predictions', con=conn, if_exists='append')
    conn.close()
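# `get_profit` is not shown in these snippets. A minimal sketch of the idea,
# assuming each prediction row carries the predicted result, the actual result
# and the bookmaker odds for the predicted outcome. All column names below are
# hypothetical; the real function may differ.
def get_profit(row, stake=1.0):
    """Return the profit from a unit stake on the model's predicted outcome."""
    if row['pred'] == row['full_time_result']:  # hypothetical actuals column
        return stake * (row['odds'] - 1)        # hypothetical odds column
    return -stake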
def get_teams_from_wiki():
    # Connect to database
    conn, cursor = connect_to_db()
    website_url = requests.get(
        'https://en.wikipedia.org/wiki/List_of_Premier_League_clubs').text
    soup = BeautifulSoup(website_url, features="html.parser")
    my_table = soup.find('div', {'class': 'timeline-wrapper'})
    links = my_table.findAll('area')
    teams = []
    for link in links:
        team = link.get('alt').encode('UTF-8').decode()
        # Convert %26 to &
        team = team.replace('%26', '&')
        # Remove ' A.F.C' and ' F.C'
        team = team.replace('A.F.C', '')
        team = team.replace('F.C', '')
        # Remove any '.'s
        team = team.replace('.', '')
        # Remove white space from the start and end
        team = team.strip()
        teams.append(team)
    df = pd.DataFrame()
    df['team_name'] = teams
    df = df.sort_values('team_name')
    df['team_id'] = np.arange(len(df)) + 1
    # Load the names into the database
    run_query(cursor, 'drop table if exists team_ids', return_data=False)
    run_query(
        cursor,
        'create table team_ids (team_name TEXT, team_id INTEGER, alternate_name TEXT)',
        return_data=False)
    for row in df.iterrows():
        params = [row[1]['team_name'], row[1]['team_id']]
        params.append(fetch_alternative_name(row[1]['team_name']))
        run_query(
            cursor,
            'insert into team_ids (team_name, team_id, alternate_name) values (?, ?, ?)',
            params=params, return_data=False)
    conn.commit()
    conn.close()
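# `fetch_id` and `fetch_name` (used throughout these snippets) are also not
# shown. A plausible minimal pair reading the team_ids table built above; the
# real helpers may additionally fall back to alternate_name:
def fetch_id(team_name):
    """Look up a team_id from its canonical name."""
    conn, cursor = connect_to_db()
    df = run_query(cursor, "select team_id from team_ids where team_name = ?",
                   params=[team_name])
    conn.close()
    return df.loc[0, 'team_id']

def fetch_name(team_id):
    """Look up a team's canonical name from its team_id."""
    conn, cursor = connect_to_db()
    df = run_query(cursor, "select team_name from team_ids where team_id = ?",
                   params=[team_id])
    conn.close()
    return df.loc[0, 'team_name']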
def get_training_data(self) -> pd.DataFrame:
    """Retrieve training data (by querying a DB or otherwise)"""
    df = run_query(query=self.training_data_query)
    return df
from keras.callbacks import History
import tensorflow as tf
import talos as ta
from sklearn.metrics import accuracy_score
from sklearn.utils.class_weight import compute_class_weight

# Connect to database
conn, cursor = connect_to_db()

# Get all fixtures after game week 8, excluding the last game week
df = run_query(
    cursor,
    "select t1.*, m_h.manager home_manager, m_h.start home_manager_start, "
    "m_a.manager away_manager, m_a.start away_manager_start "
    "from main_fixtures t1 "
    "left join managers m_h "
    "on t1.home_id = m_h.team_id "
    "and (t1.date between m_h.start and date(m_h.end, '+1 day') "
    "or t1.date > m_h.start and m_h.end is NULL) "
    "left join managers m_a "
    "on t1.away_id = m_a.team_id "
    "and (t1.date between m_a.start and date(m_a.end, '+1 day') "
    "or t1.date > m_a.start and m_a.end is NULL) "
    "where t1.date > '2013-08-01'")

# Get additional features (time as manager)
df['date'] = pd.to_datetime(df['date'])
df['home_manager_start'] = pd.to_datetime(df['home_manager_start'])
df['home_manager_age'] = df.apply(
    lambda x: np.log10(round((x['date'] - x['home_manager_start']).days)),
    axis=1)
df['away_manager_start'] = pd.to_datetime(df['away_manager_start'])
df['away_manager_age'] = df.apply(
    lambda x: np.log10(round((x['date'] - x['away_manager_start']).days)),
    axis=1)
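# Worked example of the manager-age transform above: a fixture on 2014-01-01
# where the manager started on 2013-09-23 gives 100 days in charge, so the
# feature value is log10(100) = 2.0.
import numpy as np
import pandas as pd

days_in_charge = (pd.Timestamp('2014-01-01') - pd.Timestamp('2013-09-23')).days  # 100
np.log10(days_in_charge)  # 2.0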
def combine_team_poisson_probabilities():
    # Connect to database
    conn, cursor = connect_to_db()
    # Extract data, joining poisson_team_odds to itself to combine home and away teams
    query = '''
    select
        -- IDENTIFIERS
        fixture_id, date, season, home_team, away_team,
        -- SCORE PROBABILITIES
        0.5 * (gf_0_h + ga_0_a) * 0.5 * (gf_0_a + ga_0_h) p0_0,
        0.5 * (gf_1_h + ga_1_a) * 0.5 * (gf_0_a + ga_0_h) p1_0,
        0.5 * (gf_2_h + ga_2_a) * 0.5 * (gf_0_a + ga_0_h) p2_0,
        0.5 * (gf_3_h + ga_3_a) * 0.5 * (gf_0_a + ga_0_h) p3_0,
        0.5 * (gf_0_h + ga_0_a) * 0.5 * (gf_1_a + ga_1_h) p0_1,
        0.5 * (gf_1_h + ga_1_a) * 0.5 * (gf_1_a + ga_1_h) p1_1,
        0.5 * (gf_2_h + ga_2_a) * 0.5 * (gf_1_a + ga_1_h) p2_1,
        0.5 * (gf_3_h + ga_3_a) * 0.5 * (gf_1_a + ga_1_h) p3_1,
        0.5 * (gf_0_h + ga_0_a) * 0.5 * (gf_2_a + ga_2_h) p0_2,
        0.5 * (gf_1_h + ga_1_a) * 0.5 * (gf_2_a + ga_2_h) p1_2,
        0.5 * (gf_2_h + ga_2_a) * 0.5 * (gf_2_a + ga_2_h) p2_2,
        0.5 * (gf_3_h + ga_3_a) * 0.5 * (gf_2_a + ga_2_h) p3_2,
        0.5 * (gf_0_h + ga_0_a) * 0.5 * (gf_3_a + ga_3_h) p0_3,
        0.5 * (gf_1_h + ga_1_a) * 0.5 * (gf_3_a + ga_3_h) p1_3,
        0.5 * (gf_2_h + ga_2_a) * 0.5 * (gf_3_a + ga_3_h) p2_3,
        0.5 * (gf_3_h + ga_3_a) * 0.5 * (gf_3_a + ga_3_h) p3_3
    from (
        select
            -- IDENTIFIERS
            t1.fixture_id, t1.date, t1.season,
            t1.team_name home_team, t2.team_name away_team,
            -- HOME TEAM PROBABILITIES
            t1.gf_prob_0 gf_0_h, t1.gf_prob_1 gf_1_h,
            t1.gf_prob_2 gf_2_h, t1.gf_prob_3 gf_3_h,
            t1.ga_prob_0 ga_0_h, t1.ga_prob_1 ga_1_h,
            t1.ga_prob_2 ga_2_h, t1.ga_prob_3 ga_3_h,
            -- AWAY TEAM PROBABILITIES
            t2.gf_prob_0 gf_0_a, t2.gf_prob_1 gf_1_a,
            t2.gf_prob_2 gf_2_a, t2.gf_prob_3 gf_3_a,
            t2.ga_prob_0 ga_0_a, t2.ga_prob_1 ga_1_a,
            t2.ga_prob_2 ga_2_a, t2.ga_prob_3 ga_3_a
        from (select * from poisson_team_odds where is_home = 1) t1 -- home teams
        left join (select * from poisson_team_odds where is_home = 0) t2 -- away teams
            on t1.fixture_id = t2.fixture_id
            and t1.season = t2.season
            and t1.team_name is not t2.team_name
    )'''
    df = run_query(cursor, query)
    # Sanity check: the 16 cell probabilities should sum to just under 1
    # (scores above 3 goals are not covered by the grid)
    df['prob_sum'] = df.select_dtypes('float64').sum(axis=1)
    # Round floats to 4 decimal places
    float_columns = df.select_dtypes(include='float64').columns
    df[float_columns] = round(df.select_dtypes(include='float64'), 4)
    # Create the DB table to store the data
    cursor.execute("DROP TABLE IF EXISTS poisson_match_probabilities")
    cursor.execute("""CREATE TABLE poisson_match_probabilities (
        fixture_id INT, date DATE, season TEXT, home_team TEXT, away_team TEXT,
        p0_0 FLOAT, p1_0 FLOAT, p2_0 FLOAT, p3_0 FLOAT,
        p0_1 FLOAT, p1_1 FLOAT, p2_1 FLOAT, p3_1 FLOAT,
        p0_2 FLOAT, p1_2 FLOAT, p2_2 FLOAT, p3_2 FLOAT,
        p0_3 FLOAT, p1_3 FLOAT, p2_3 FLOAT, p3_3 FLOAT,
        prob_sum FLOAT)""")
    # Load data into the DB
    for row in df.iterrows():
        params = [row[1][0], str(row[1][1]), str(row[1][2]), str(row[1][3]),
                  str(row[1][4])] + list(row[1][5:22])
        cursor.execute(
            'insert into poisson_match_probabilities '
            'values (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)',
            params)
    conn.commit()
    conn.close()
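# Worked example of one scoreline cell from the query above. Each cell blends
# the home team's attack with the away team's defence (and vice versa): with
# home P(score 1) = 0.30, away P(concede 1) = 0.26, away P(score 0) = 0.20 and
# home P(concede 0) = 0.24 (illustrative numbers), the 1-0 probability is:
p1_0 = 0.5 * (0.30 + 0.26) * 0.5 * (0.20 + 0.24)  # 0.28 * 0.22 = 0.0616
# Match-result probabilities then follow by summing cells, e.g. a home win is
# the sum of every p{h}_{a} with h > a.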
def create_team_poisson_probabilities():
    # Connect to database
    conn, cursor = connect_to_db()
    # Extract data
    query = ('select fixture_id, team_name, date, season, is_home, '
             'goals_for, goals_against from team_fixtures')
    df = run_query(cursor, query)
    # Sort the data by season, team_name and date
    df = df.sort_values(['season', 'team_name', 'date'])
    # Calculate the moving average.
    # NOTE: The moving average must use previous games' data only, not the
    # goals_for/goals_against on the current row, so shift(1) is applied inside
    # the groupby transform (applying it outside would shift values across
    # team/season boundaries)
    df['goals_for_mavg'] = df.groupby(['team_name', 'season'])['goals_for'].\
        transform(lambda x: x.rolling(8, 8).mean().shift(1))
    df['goals_against_mavg'] = df.groupby(['team_name', 'season'])['goals_against'].\
        transform(lambda x: x.rolling(8, 8).mean().shift(1))
    # Calculate the moving standard deviation
    # df['goals_for_sdavg'] = df.groupby(['team_name', 'season'])['goals_for'].\
    #     transform(lambda x: x.rolling(8, 8).std().shift(1))
    # df['goals_against_sdavg'] = df.groupby(['team_name', 'season'])['goals_against'].\
    #     transform(lambda x: x.rolling(8, 8).std().shift(1))
    # Get goals_for probabilities
    df['gf_prob_0'] = df['goals_for_mavg'].apply(
        lambda x: scipy.stats.distributions.poisson.pmf(0, x))
    df['gf_prob_1'] = df['goals_for_mavg'].apply(
        lambda x: scipy.stats.distributions.poisson.pmf(1, x))
    df['gf_prob_2'] = df['goals_for_mavg'].apply(
        lambda x: scipy.stats.distributions.poisson.pmf(2, x))
    df['gf_prob_3'] = df['goals_for_mavg'].apply(
        lambda x: scipy.stats.distributions.poisson.pmf(3, x))
    df['gf_prob_other'] = df.apply(
        lambda x: 1 - (x['gf_prob_0'] + x['gf_prob_1'] +
                       x['gf_prob_2'] + x['gf_prob_3']), axis=1)
    # Get goals_against probabilities
    df['ga_prob_0'] = df['goals_against_mavg'].apply(
        lambda x: scipy.stats.distributions.poisson.pmf(0, x))
    df['ga_prob_1'] = df['goals_against_mavg'].apply(
        lambda x: scipy.stats.distributions.poisson.pmf(1, x))
    df['ga_prob_2'] = df['goals_against_mavg'].apply(
        lambda x: scipy.stats.distributions.poisson.pmf(2, x))
    df['ga_prob_3'] = df['goals_against_mavg'].apply(
        lambda x: scipy.stats.distributions.poisson.pmf(3, x))
    df['ga_prob_other'] = df.apply(
        lambda x: 1 - (x['ga_prob_0'] + x['ga_prob_1'] +
                       x['ga_prob_2'] + x['ga_prob_3']), axis=1)
    # ToDo: Get the data also in key-value pairs, e.g. type: goals_for/goals_against,
    #  amount:, value: for easier plotting
    # Create the DB table to store data
    cursor.execute("DROP TABLE IF EXISTS poisson_team_odds")
    cursor.execute("""CREATE TABLE poisson_team_odds (
        fixture_id INT, team_name TEXT, date DATE, season TEXT, is_home INT,
        goals_for FLOAT, goals_against FLOAT,
        goals_for_mavg FLOAT, goals_against_mavg FLOAT,
        gf_prob_0 FLOAT, gf_prob_1 FLOAT, gf_prob_2 FLOAT, gf_prob_3 FLOAT,
        ga_prob_0 FLOAT, ga_prob_1 FLOAT, ga_prob_2 FLOAT, ga_prob_3 FLOAT)""")
    # Remove NaNs (the first 8 games for each team)
    df = df.dropna()
    # Round floats to 4 decimal places
    float_columns = df.select_dtypes(include='float64').columns
    df[float_columns] = round(df.select_dtypes(include='float64'), 4)
    # Load data into the DB. Select the insert columns by name: df also holds
    # gf_prob_other/ga_prob_other, so positional indexing would misalign values
    insert_columns = ['fixture_id', 'team_name', 'date', 'season', 'is_home',
                      'goals_for', 'goals_against', 'goals_for_mavg',
                      'goals_against_mavg', 'gf_prob_0', 'gf_prob_1',
                      'gf_prob_2', 'gf_prob_3', 'ga_prob_0', 'ga_prob_1',
                      'ga_prob_2', 'ga_prob_3']
    for row in df[insert_columns].iterrows():
        params = [row[1][0], str(row[1][1]), str(row[1][2]), str(row[1][3])] + \
                 list(row[1][4:])
        cursor.execute('insert into poisson_team_odds '
                       'values (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)', params)
    conn.commit()
    conn.close()
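# Worked example of the Poisson step above: with an 8-game moving average of
# 1.5 goals scored, the probabilities of scoring exactly 0-3 goals come
# straight from the Poisson pmf (they sum to ~0.93, hence gf_prob_other).
from scipy.stats import poisson

[round(poisson.pmf(k, 1.5), 4) for k in range(4)]
# -> [0.2231, 0.3347, 0.251, 0.1255]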
def get_manager_data():
    """Download HTML content from Wikipedia and break it down with BeautifulSoup"""
    url = "https://en.wikipedia.org/wiki/List_of_Premier_League_managers"
    website_url = requests.get(url).text
    soup = BeautifulSoup(website_url, features="html.parser")
    my_table = soup.find("table", {"class": "wikitable sortable plainrowheaders"})
    # All data can be found in 'a' and 'span' tags
    links = my_table.findAll("a")
    links2 = my_table.findAll("span")
    # Load data from links
    link1_data = [name.get("title") for name in links]
    # Load data from links2
    link2_data = [name.get("data-sort-value") for name in links2]
    # Remove Nulls
    link2_data = [i for i in link2_data if i]
    link1_data = [i for i in link1_data if i]
    # Find the indexes of manager names (the entries that contain no digits)
    name_indexes = []
    for i in range(0, len(link2_data)):
        if not hasNumbers(link2_data[i]):
            name_indexes.append(i)
    # link1_data cycles through manager, country and team
    manager = []
    country = []
    team = []
    for i in range(0, len(link1_data)):
        if i % 3 == 0:
            manager.append(link1_data[i])
        if i % 3 == 1:
            country.append(link1_data[i])
        if i % 3 == 2:
            team.append(link1_data[i])
    # Create a DataFrame to store all of the scraped data
    managers = pd.DataFrame()
    for i in range(0, len(name_indexes)):
        manager_index = name_indexes[i]
        try:
            next_manager = name_indexes[i + 1]
        except IndexError:
            next_manager = len(name_indexes) + 1
        num_elements = next_manager - manager_index
        manager_name = link2_data[manager_index]
        manager_sd = link2_data[manager_index + 1]
        # Strip the sort-key padding, leaving just the date
        manager_sd = manager_sd[8:-5]
        managers.loc[i, "manager"] = " ".join(str.split(manager[i])[0:2])
        managers.loc[i, "country"] = country[i]
        managers.loc[i, "team"] = fix_team_name(team[i])
        managers.loc[i, "from"] = manager_sd
        # If the manager has left, there will be a second date we must capture
        if num_elements == 3:
            manager_ed = link2_data[manager_index + 2]
            # Strip the sort-key padding, leaving just the date
            manager_ed = manager_ed[8:-5]
            managers.loc[i, "until"] = manager_ed
    # Replace club names with those used in the rest of the project
    # (e.g. Manchester City instead of Man City). Also get the team_id
    managers["team_id"] = managers["team"].apply(lambda x: fetch_id(x))
    managers["team"] = managers["team_id"].apply(lambda x: fetch_name(x))
    # Keep only the columns we need, in order
    managers = managers[["manager", "team", "team_id", "from", "until"]]
    managers['from'] = pd.to_datetime(managers['from'])
    managers['until'] = pd.to_datetime(managers['until'])
    # If the until date of one manager equals the from date of the next,
    # subtract 1 day from the until date of the previous manager
    managers['next_from'] = managers.groupby('team')['from'].shift(-1)
    managers['until'] = managers.apply(
        lambda x: x['until'] if x['until'] != x['next_from']
        else x['until'] - dt.timedelta(days=1), axis=1)
    # Expand each managerial spell into one row per day
    df_dates = pd.DataFrame()
    for row in managers.iterrows():
        until = row[1]['until'] if \
            isinstance(row[1]['until'], pd._libs.tslibs.timestamps.Timestamp) \
            else dt.datetime.today()
        dates_between = pd.DataFrame(
            [row[1]['from'] + dt.timedelta(days=x)
             for x in range((until - row[1]['from']).days + 1)],
            columns=['date'])
        dates_between['manager'] = row[1]['manager']
        dates_between['team'] = row[1]['team']
        dates_between['team_id'] = row[1]['team_id']
        df_dates = df_dates.append(dates_between)
    # Concatenate manager names when two managers have managed at once
    df_dates = df_dates.groupby(['date', 'team', 'team_id'])['manager'].apply(
        lambda x: ' & '.join(x)).reset_index()
    # Get the number of days between each row to identify missing data
    df_dates['date_lag'] = df_dates.groupby(
        ['team', 'team_id'])['date'].apply(lambda x: x.shift(1))
    df_dates['date_diff'] = df_dates.apply(
        lambda x: (x['date'] - x['date_lag']).days, axis=1)
    # Create rows for missing data and add them to df_dates
    missing_data = df_dates[df_dates['date_diff'] > 1]
    missing_dates = pd.DataFrame()
    for row in missing_data.dropna().iterrows():
        dates_between = pd.DataFrame(
            [row[1]['date_lag'] + dt.timedelta(days=x + 1)
             for x in range((row[1]['date'] -
                             (row[1]['date_lag'] + dt.timedelta(days=1))).days)],
            columns=['date'])
        dates_between['manager'] = 'No Manager'
        dates_between['team'] = row[1]['team']
        dates_between['team_id'] = row[1]['team_id']
        missing_dates = missing_dates.append(dates_between)
    # Drop unnecessary columns and add the missing data rows
    df_dates.drop(['date_lag', 'date_diff'], axis=1, inplace=True)
    df_dates = df_dates.append(missing_dates).reset_index(drop=True).sort_values('date')
    # Create an indicator that tells us each time a team changes manager
    df_dates['last_manager'] = df_dates.groupby(
        ['team', 'team_id'])['manager'].apply(lambda x: x.shift(1))
    df_dates['manager_change'] = df_dates.apply(
        lambda x: 1 if x['manager'] != x['last_manager'] else 0, axis=1)
    df_dates['manager_num'] = df_dates.groupby(
        ['team', 'team_id'])['manager_change'].cumsum()
    # Aggregate the manager data to get a start/end date for each managerial
    # spell (sitting managers are covered because df_dates runs up to today)
    min_dates = df_dates.groupby(
        ['team', 'team_id', 'manager', 'manager_num'])['date'].min().reset_index()
    max_dates = df_dates.groupby(
        ['team', 'team_id', 'manager', 'manager_num'])['date'].max().reset_index()
    manager_dates = pd.merge(
        min_dates, max_dates,
        on=['team', 'team_id', 'manager', 'manager_num']).reset_index(drop=True)
    manager_dates.columns = ['team', 'team_id', 'manager', 'manager_num',
                             'from', 'until']
    manager_dates = manager_dates.groupby(
        ['team', 'team_id', 'manager', 'manager_num', 'from'])['until'].max().reset_index()
    manager_dates = manager_dates.groupby(
        ['team', 'team_id', 'manager', 'manager_num', 'until'])['from'].min().reset_index()
    manager_dates = manager_dates.groupby(
        ['team', 'team_id', 'manager', 'manager_num', 'from'])['until'].max().reset_index()
    # Drop and recreate the table we are going to populate
    conn, cursor = connect_to_db()
    run_query(cursor, "DROP TABLE IF EXISTS managers", return_data=False)
    run_query(cursor, "CREATE TABLE managers (manager TEXT, team TEXT, "
                      "team_id INTEGER, start_date DATE, end_date DATE)",
              return_data=False)
    for row in manager_dates.iterrows():
        params = [
            str(row[1]["manager"]),
            str(row[1]["team"]),
            int(row[1]["team_id"]),
            str(row[1]["from"].date()),
            str(row[1]["until"].date()),
        ]
        run_query(
            cursor,
            "INSERT INTO managers (manager, team, team_id, start_date, end_date) "
            "VALUES (?, ?, ?, ?, ?)",
            params, return_data=False)
    conn.commit()
    conn.close()
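# `hasNumbers` and `fix_team_name` (used in get_manager_data) are not shown.
# hasNumbers is presumably a simple digit check, along these lines;
# fix_team_name is project-specific name normalisation and is not sketched:
def hasNumbers(input_string):
    """Return True if the string contains any digit."""
    return any(char.isdigit() for char in input_string)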
def get_historic_predictions(self):
    conn, cursor = connect_to_db()
    df = run_query(
        cursor,
        "select * from historic_predictions where "
        "model_id = '{}'".format(self.model_id))
    conn.close()
    return df
def get_training_data(self):
    conn, cursor = connect_to_db()
    # Get all fixtures after game week 8, excluding the last game week
    df = run_query(cursor, self.training_data_query)
    conn.close()
    return df