async def get_fpl_teams():
    """Download team data from the Fantasy Premier League website"""
    # Connect to the sqlite3 DB
    conn, cursor = connect_to_db()
    # Get all of the team IDs from the fpl_fixtures table
    teams = run_query(cursor, 'select distinct team_h from fpl_fixtures')
    n_teams = len(teams)
    assert n_teams == 20, 'The number of returned teams should be 20, ' \
                          'but is actually {}'.format(n_teams)
    async with aiohttp.ClientSession() as session:
        fpl = FPL(session)
        team_data = await fpl.get_teams(return_json=True)
    # Build one row per team (DataFrame.append was removed in pandas 2.0)
    df_teams = pd.DataFrame(team_data)
    # Find the closest matching team names to the ones in our data
    matches = list(df_teams['name'].apply(lambda x: find_closest_match(x)))
    df_matches = pd.DataFrame(matches)
    df_teams = pd.concat([df_teams, df_matches.reset_index(drop=True)], axis=1)
    # Upload the data to a table in the database
    run_query(cursor, 'DROP TABLE IF EXISTS fpl_teams', return_data=False)
    df_teams.to_sql('fpl_teams', conn)
    conn.close()
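These downloaders are coroutines, so they need an event loop to run; nothing in the snippet shows how they are invoked. A minimal sketch, assuming the standard-library asyncio drives them:

import asyncio

# Run one downloader on the default event loop (asyncio.run requires Python 3.7+)
asyncio.run(get_fpl_teams())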
Example #2
async def get_player_data():
    """Download all player data from the Fantasy Premier League website"""
    # Connect to the sqlite3 DB
    conn, cursor = connect_to_db()
    history_frames = []

    async with aiohttp.ClientSession() as session:
        fpl = FPL(session)
        players = await fpl.get_players()
        for i, player in enumerate(players, start=1):
            print("Loading data for player {} of {}".format(i, len(players)))
            player_summary = await fpl.get_player_summary(player.id)
            player_history = pd.DataFrame(player_summary.history)
            player_history['web_name'] = player.web_name
            player_history['id'] = player.id
            player_history['team'] = player.team
            player_history['team_code'] = player.team_code
            history_frames.append(player_history)
    # DataFrame.append was removed in pandas 2.0; concatenate once instead
    summary_df = pd.concat(history_frames, ignore_index=True)

    # Upload the data to a table in the database
    run_query(cursor, 'DROP TABLE IF EXISTS player_data', return_data=False)
    summary_df.to_sql('player_data', conn)
    conn.close()
Example #3
 def get_info(self, home_id, away_id, date, season):
     """Given the date and home/away team IDs, get model features"""
     h_manager = get_manager(team_id=home_id, date=date)
     a_manager = get_manager(team_id=away_id, date=date)
     # Check that data was retrieved (catch the error sooner to speed up debugging)
     assert len(h_manager) > 0, 'No data returned for home manager'
     assert len(a_manager) > 0, 'No data returned for away manager'
     # Get the max date from the database
     conn, cursor = connect_to_db()
     max_date = run_query(cursor, 'select max(date) from main_fixtures')
     max_date = pd.to_datetime(max_date.iloc[0, 0])
     # Use the max fixture_id for the latest date in the data
     max_fixture = run_query(
         cursor,
         "select max(fixture_id) id from main_fixtures "
         "where date = '{}'".format(str(max_date)))
     max_fixture = max_fixture.iloc[0, 0]
     info_dict = {
         "date": date,
         "home_id": home_id,
         "home_team": fetch_name(home_id),
         "away_id": away_id,
         "away_team": fetch_name(away_id),
         "fixture_id": max_fixture,
         "home_manager_start": h_manager.loc[0, "start_date"],
         "away_manager_start": a_manager.loc[0, "start_date"],
         "season": season
     }
     # Build the single-row output directly (DataFrame.append is deprecated)
     output = pd.DataFrame(info_dict, index=[0])
     conn.close()
     return output
Example #4
def get_feature_data(min_training_data_date='2013-08-01'):
    """Join manager data onto fixtures and return the features used for modelling"""
    conn, cursor = connect_to_db()
    df = run_query(cursor, """select t1.*, m_h.manager home_manager,
     m_h.start_date home_manager_start, 
     m_a.manager away_manager, m_a.start_date away_manager_start 
     from main_fixtures t1 
     left join managers m_h 
     on t1.home_id = m_h.team_id 
     and (t1.date between m_h.start_date and date(m_h.end_date, '+1 day') 
     or t1.date > m_h.start_date and m_h.end_date is NULL) 
     left join managers m_a 
     on t1.away_id = m_a.team_id 
     and (t1.date between m_a.start_date and date(m_a.end_date, '+1 day') 
     or t1.date > m_a.start_date and m_a.end_date is NULL) 
     where t1.date > '{}'""".format(min_training_data_date))
    df = get_manager_features(df)
    df2 = run_query(cursor, "select * from team_fixtures where date > '{}'".format(
        min_training_data_date))
    conn.close()
    df2['date'] = pd.to_datetime(df2['date'])
    df2 = pd.merge(
        df2,
        df[['date', 'season', 'fixture_id', 'home_manager_age', 'away_manager_age',
            'home_manager_new', 'away_manager_new']],
        on=['date', 'season', 'fixture_id'],
        how="left")
    return df2
Example #5
def find_closest_match(team_name):
    # Some matches are hardcoded because fuzzy matching was too unreliable for them
    if team_name == 'Man City':
        output = {
            "team_name": 'Manchester City',
            "team_id": fetch_id("Manchester City")
        }
        return output
    elif team_name == 'Spurs':
        output = {"team_name": 'Tottenham', "team_id": fetch_id("Tottenham")}
        return output
    else:
        conn, cursor = connect_to_db()
        df = run_query(cursor, "Select team_name, team_id from team_ids")
        df['l_dist'] = df['team_name'].apply(
            lambda x: Levenshtein.ratio(x, team_name))
        max_similarity = max(df['l_dist'])
        closest_match = df.loc[df['l_dist'] == max_similarity,
                               ['team_name', 'team_id']].reset_index(drop=True)
        output = {
            "team_name": closest_match.loc[0, 'team_name'],
            "team_id": closest_match.loc[0, 'team_id']
        }
        conn.close()
        return output
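Levenshtein.ratio returns a similarity score between 0 and 1, so the closest match is simply the candidate with the highest ratio. A minimal sketch of the matching step in isolation (the candidate names below are illustrative, not taken from the team_ids table):

import Levenshtein

candidates = ['Manchester United', 'Manchester City', 'Newcastle United']
scores = {name: Levenshtein.ratio(name, 'Man Utd') for name in candidates}
best = max(scores, key=scores.get)  # the candidate with the highest similarity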
Example #6
async def get_fpl_fixtures():
    """Download upcoming fixture data from the Fantasy Premier League website"""
    # Connect to the sqlite3 DB
    conn, cursor = connect_to_db()

    async with aiohttp.ClientSession() as session:
        fpl = FPL(session)
        fixtures = await fpl.get_fixtures(return_json=True)
    # Drop the nested stats payload from each fixture
    for fixture in fixtures:
        del fixture['stats']
    # Build one row per fixture (DataFrame.append was removed in pandas 2.0)
    fixture_list = pd.DataFrame(fixtures)

    # Upload the data to a table in the database
    run_query(cursor, 'DROP TABLE IF EXISTS fpl_fixtures', return_data=False)
    fixture_list.to_sql('fpl_fixtures', conn)
    conn.close()
Example #7
def get_upcoming_games(n=10):
    """Get the next n upcoming games in the Premier League"""
    conn, cursor = connect_to_db()
    query = """select kickoff_time, t2.team_id home_id, t2.team_name home_name, 
    t3.team_id away_id, t3.team_name away_name
    from fpl_fixtures t1 left join fpl_teams t2 on t1.team_h = t2.id left 
    join fpl_teams t3 on t1.team_a = t3.id where started = 0 order by 
    kickoff_time limit {}""".format(n)
    df = run_query(cursor, query)
    conn.close()
    return df
Example #8
def get_manager(team_id, date):
    """Find the sitting manager for a given team_id and date"""
    conn, cursor = connect_to_db()
    # Get the latest date in the db
    max_date = run_query(cursor, "select max(end_date) date from managers").loc[0, 'date']
    # If we are predicting past our data
    if date > max_date:
        # Take the latest manager for the team
        query = """select * from managers where end_date = '{}' and 
        team_id = {}""".format(max_date, team_id)
    else:
        query = """select * from managers where team_id = {} 
                    and '{}' between start_date and end_date""".format(team_id, date)
    df = run_query(cursor, query)
    rows = len(df)
    conn.close()
    if rows != 1:
        logger.warning("get_manager: Expected 1 row but got {}. Is the manager "
                       "info up to date?".format(rows))
    return df
Example #9
 def train_model(self, X, y):
     """Train a model on 90% of the data and predict the held-out 10% using KFold
     validation, such that a prediction is made for every row"""
     logger.info("Training model.")
     kf = KFold(n_splits=10)
     fold_frames = []
     for train_index, test_index in kf.split(X):
         xgb_model = xgb.XGBClassifier().fit(
             X=np.array(X.iloc[train_index, :][self.model_features]),
             y=np.array(y.iloc[train_index]))
         predictions = xgb_model.predict(np.array(X.iloc[test_index, :][self.model_features]))
         actuals = y.iloc[test_index]
         fold_frames.append(
             pd.concat([
                 X.iloc[test_index, :],
                 pd.DataFrame(predictions, columns=['pred'], index=X.iloc[test_index, :].index),
                 actuals], axis=1))
     # DataFrame.append was removed in pandas 2.0; concatenate the folds once
     model_predictions = pd.concat(fold_frames)
     # Assess the model performance using the first performance metric
     # NOTE: actuals and predictions here come from the final fold only
     main_performance_metric = self.performance_metrics[0].__name__
     performance = self.performance_metrics[0](actuals, predictions)
     # If the model performs better than the previous model, save it
     # ToDo: Returning 0 when there is no performance score only works
     #  for performance scores where higher is better
     if performance > self.performance.get(main_performance_metric, 0):
         self.trained_model = xgb_model
         for metric in self.performance_metrics:
             metric_name = metric.__name__
             self.performance[metric_name] = metric(actuals, predictions)
     # Upload the predictions to the model_predictions table
     conn, cursor = connect_to_db()
     # Add model ID so we can compare model performances
     model_predictions['model_id'] = self.model_id
     # Add the profit made if we bet on the game
     model_predictions['profit'] = model_predictions.apply(lambda x: get_profit(x), axis=1)
     run_query(cursor, "drop table if exists historic_predictions",
               return_data=False)
     if (not self.test_mode) or self.upload_historic_predictions:
         model_predictions.to_sql(
             'historic_predictions', con=conn, if_exists='append')
     conn.close()
Example #10
def get_teams_from_wiki():
    """Scrape the list of Premier League clubs from Wikipedia and assign team IDs"""
    # Connect to database
    conn, cursor = connect_to_db()

    website_url = requests.get(
        'https://en.wikipedia.org/wiki/List_of_Premier_League_clubs').text

    soup = BeautifulSoup(website_url, features="html.parser")

    My_table = soup.find('div', {'class': 'timeline-wrapper'})

    links = My_table.findAll('area')

    teams = []

    for link in links:
        team = link.get('alt').encode('UTF-8').decode()
        # Convert the URL-encoded %26 to &
        team = team.replace('%26', '&')
        # Remove ' A.F.C' and ' F.C'
        team = team.replace('A.F.C', '')
        team = team.replace('F.C', '')
        # Remove any '.'s
        team = team.replace('.', '')
        # Remove whitespace from the start and end
        team = team.strip()
        teams.append(team)

    df = pd.DataFrame()
    df['team_name'] = teams
    df = df.sort_values('team_name')
    df['team_id'] = np.arange(len(df)) + 1

    # Load the names into the database
    run_query(cursor, 'drop table if exists team_ids', return_data=False)
    run_query(
        cursor,
        'create table team_ids (team_name TEXT, team_id INTEGER, alternate_name)',
        return_data=False)

    for row in df.iterrows():
        params = [row[1]['team_name'], row[1]['team_id']]
        params.append(fetch_alternative_name(row[1]['team_name']))
        run_query(
            cursor,
            'insert into team_ids(team_name, team_id, alternate_name) values(?, ?, ?)',
            params=params,
            return_data=False)

    conn.commit()
    conn.close()
Example #11
    def get_training_data(self) -> pd.DataFrame:
        """Retrieve training data (by querying a DB or otherwise)"""

        df = run_query(query=self.training_data_query)
        return df
Example #12
from keras.callbacks import History
import tensorflow as tf
import talos as ta
from sklearn.metrics import accuracy_score
from sklearn.utils.class_weight import compute_class_weight

# Connect to database
conn, cursor = connect_to_db()

# Get all fixtures after game week 8, excluding the last game week
df = run_query(
    cursor,
    "select t1.*, m_h.manager home_manager, m_h.start home_manager_start, "
    "m_a.manager away_manager, m_a.start away_manager_start "
    "from main_fixtures t1 "
    "left join managers m_h "
    "on t1.home_id = m_h.team_id "
    "and (t1.date between m_h.start and date(m_h.end, '+1 day') or t1.date > m_h.start and m_h.end is NULL) "
    "left join managers m_a "
    "on t1.away_id = m_a.team_id "
    "and (t1.date between m_a.start and date(m_a.end, '+1 day') or t1.date > m_a.start and m_a.end is NULL) "
    "where t1.date > '2013-08-01'")

# Get additional features (time as manager)
df['date'] = pd.to_datetime(df['date'])
df['home_manager_start'] = pd.to_datetime(df['home_manager_start'])
df['home_manager_age'] = df.apply(
    lambda x: np.log10(round((x['date'] - x['home_manager_start']).days)),
    axis=1)
df['away_manager_start'] = pd.to_datetime(df['away_manager_start'])
df['away_manager_age'] = df.apply(
    lambda x: np.log10(round((x['date'] - x['away_manager_start']).days)),
    axis=1)
Example #13
def combine_team_poisson_probabilities():

    # Connect to database
    conn, cursor = connect_to_db()

    # Extract data, join the poisson_team_odds table on itself to combine the home and away teams.
    query = '''
    select
        -- IDENTIFIERS
        fixture_id, date, season, home_team, away_team,
        -- SCORE PROBABILITIES
        0.5 * (gf_0_h + ga_0_a) * 0.5 * (gf_0_a + ga_0_h) p0_0,
        0.5 * (gf_1_h + ga_1_a) * 0.5 * (gf_0_a + ga_0_h) p1_0,
        0.5 * (gf_2_h + ga_2_a) * 0.5 * (gf_0_a + ga_0_h) p2_0,
        0.5 * (gf_3_h + ga_3_a) * 0.5 * (gf_0_a + ga_0_h) p3_0,
        0.5 * (gf_0_h + ga_0_a) * 0.5 * (gf_1_a + ga_1_h) p0_1,
        0.5 * (gf_1_h + ga_1_a) * 0.5 * (gf_1_a + ga_1_h) p1_1,
        0.5 * (gf_2_h + ga_2_a) * 0.5 * (gf_1_a + ga_1_h) p2_1,
        0.5 * (gf_3_h + ga_3_a) * 0.5 * (gf_1_a + ga_1_h) p3_1,
        0.5 * (gf_0_h + ga_0_a) * 0.5 * (gf_2_a + ga_2_h) p0_2,
        0.5 * (gf_1_h + ga_1_a) * 0.5 * (gf_2_a + ga_2_h) p1_2,
        0.5 * (gf_2_h + ga_2_a) * 0.5 * (gf_2_a + ga_2_h) p2_2,
        0.5 * (gf_3_h + ga_3_a) * 0.5 * (gf_2_a + ga_2_h) p3_2,
        0.5 * (gf_0_h + ga_0_a) * 0.5 * (gf_3_a + ga_3_h) p0_3,
        0.5 * (gf_1_h + ga_1_a) * 0.5 * (gf_3_a + ga_3_h) p1_3,
        0.5 * (gf_2_h + ga_2_a) * 0.5 * (gf_3_a + ga_3_h) p2_3,
        0.5 * (gf_3_h + ga_3_a) * 0.5 * (gf_3_a + ga_3_h) p3_3
    from
        (
        select 
            -- IDENTIFIERS
            t1.fixture_id, t1.date, t1.season, t1.team_name home_team, t2.team_name away_team,
            -- HOME TEAM PROBABILITIES
            t1.gf_prob_0 gf_0_h, t1.gf_prob_1 gf_1_h, t1.gf_prob_2 gf_2_h, t1.gf_prob_3 gf_3_h, 
            t1.ga_prob_0 ga_0_h, t1.ga_prob_1 ga_1_h, t1.ga_prob_2 ga_2_h, t1.ga_prob_3 ga_3_h, 
            -- AWAY TEAM PROBABILITIES
            t2.gf_prob_0 gf_0_a, t2.gf_prob_1 gf_1_a, t2.gf_prob_2 gf_2_a, t2.gf_prob_3 gf_3_a, 
            t2.ga_prob_0 ga_0_a, t2.ga_prob_1 ga_1_a, t2.ga_prob_2 ga_2_a, t2.ga_prob_3 ga_3_a
        from 
            (select * from poisson_team_odds where is_home = 1) t1 -- home_teams
        left join 
            (select * from poisson_team_odds where is_home = 0) t2 -- away_teams
        on t1.fixture_id = t2.fixture_id 
        and t1.season = t2.season 
        and t1.team_name is not t2.team_name
        )'''

    df = run_query(cursor, query)

    df['prob_sum'] = df.select_dtypes('float64').sum(axis=1)

    # Round floats to 4 decimal places
    float_columns = df.select_dtypes(include='float64').columns
    df[float_columns] = round(df.select_dtypes(include='float64'), 4)

    # Create the DB table to store data
    cursor.execute("DROP TABLE IF EXISTS poisson_match_probabilities")
    cursor.execute("""CREATE TABLE poisson_match_probabilities (fixture_id INT, date DATE, season TEXT, 
    home_team TEXT, away_team TEXT, p0_0 FLOAT, p1_0 FLOAT, p2_0 FLOAT, p3_0 FLOAT, p0_1 FLOAT, p1_1 FLOAT, 
    p2_1 FLOAT, p3_1 FLOAT, p0_2 FLOAT, p1_2 FLOAT, p2_2 FLOAT, p3_2 FLOAT, p0_3 FLOAT, p1_3 FLOAT, p2_3 FLOAT, 
    p3_3 FLOAT, prob_sum FLOAT)""")

    # Load data into the DB
    for row in df.iterrows():
        params = [row[1][0], str(row[1][1]), str(row[1][2]), str(row[1][3]), row[1][4], row[1][5], row[1][6],
                  row[1][7], row[1][8], row[1][9], row[1][10], row[1][11], row[1][12], row[1][13], row[1][14],
                  row[1][15], row[1][16], row[1][17], row[1][18], row[1][19], row[1][20], row[1][21]]
        cursor.execute('insert into poisson_match_probabilities VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)', params)

    conn.commit()
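The query above estimates P(home scores i) as the average of the home side's goals-for probability and the away side's goals-against probability, then multiplies the home and away estimates as if the two scorelines were independent. A minimal numeric sketch of the p1_0 term, using made-up moving-average rates:

from scipy.stats import poisson

# Illustrative moving-average goal rates, not real data
home_attack, home_defence = 1.6, 1.0
away_attack, away_defence = 1.1, 1.3

# P(home scores 1): average the home goals-for and away goals-against estimates
p_home_1 = 0.5 * (poisson.pmf(1, home_attack) + poisson.pmf(1, away_defence))
# P(away scores 0): average the away goals-for and home goals-against estimates
p_away_0 = 0.5 * (poisson.pmf(0, away_attack) + poisson.pmf(0, home_defence))
p1_0 = p_home_1 * p_away_0  # corresponds to the p1_0 column in the query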
Example #14
def create_team_poisson_probabilities():

    # Connect to database
    conn, cursor = connect_to_db()

    # Extract data
    query = 'select fixture_id, team_name, date, season, is_home, goals_for, goals_against from team_fixtures'
    df = run_query(cursor, query)

    # Sort the data by season, team_name and date
    df = df.sort_values(['season', 'team_name', 'date'])

    # Calculate the moving average
    # NOTE: The moving average uses only the previous games' data, not the goals_for
    # and goals_against on that row; shifting inside the group keeps it per-team.
    df['goals_for_mavg'] = df.groupby(['team_name', 'season'])['goals_for'].\
        transform(lambda x: x.rolling(8, min_periods=8).mean().shift(1))
    df['goals_against_mavg'] = df.groupby(['team_name', 'season'])['goals_against'].\
        transform(lambda x: x.rolling(8, min_periods=8).mean().shift(1))

    # Calculate the moving standard deviation
    #df['goals_for_sdavg'] = df.groupby(['team_name', 'season'])['goals_for'].\
    #    transform(lambda x: x.rolling(8, 8).std()).shift(1)
    #df['goals_against_sdavg'] = df.groupby(['team_name', 'season'])['goals_against'].\
    #    transform(lambda x: x.rolling(8, 8).std()).shift(1)

    # Get goals_for probabilities
    df['gf_prob_0'] = df['goals_for_mavg'].apply(lambda x: scipy.stats.distributions.poisson.pmf(0, x))
    df['gf_prob_1'] = df['goals_for_mavg'].apply(lambda x: scipy.stats.distributions.poisson.pmf(1, x))
    df['gf_prob_2'] = df['goals_for_mavg'].apply(lambda x: scipy.stats.distributions.poisson.pmf(2, x))
    df['gf_prob_3'] = df['goals_for_mavg'].apply(lambda x: scipy.stats.distributions.poisson.pmf(3, x))
    df['gf_prob_other'] = df.apply(lambda x: 1-(x['gf_prob_0'] + x['gf_prob_1'] +
                                                x['gf_prob_2'] + x['gf_prob_3']), axis=1)

    # Get goals_against probabilities
    df['ga_prob_0'] = df['goals_against_mavg'].apply(lambda x: scipy.stats.distributions.poisson.pmf(0, x))
    df['ga_prob_1'] = df['goals_against_mavg'].apply(lambda x: scipy.stats.distributions.poisson.pmf(1, x))
    df['ga_prob_2'] = df['goals_against_mavg'].apply(lambda x: scipy.stats.distributions.poisson.pmf(2, x))
    df['ga_prob_3'] = df['goals_against_mavg'].apply(lambda x: scipy.stats.distributions.poisson.pmf(3, x))
    df['ga_prob_other'] = df.apply(lambda x: 1 - (x['ga_prob_0'] + x['ga_prob_1'] +
                                                  x['ga_prob_2'] + x['ga_prob_3']), axis=1)

    # ToDo: Get the data also in key-value pair, e.g type: goals_for/goals_against, amount:, value: for easier plotting
    
    # Create the DB table to store data
    cursor.execute("DROP TABLE IF EXISTS poisson_team_odds")
    cursor.execute("""CREATE TABLE poisson_team_odds (fixture_id INT, team_name TEXT, date DATE, season TEXT, 
    is_home INT, goals_for FLOAT, goals_against FLOAT, goals_for_mavg FLOAT, goals_against_mavg FLOAT, gf_prob_0 FLOAT, 
    gf_prob_1 FLOAT, gf_prob_2 FLOAT, gf_prob_3 FLOAT, ga_prob_0 FLOAT, ga_prob_1 FLOAT, ga_prob_2 FLOAT, 
    ga_prob_3 FLOAT)""")

    # Remove Nans (the first 8 games for each team)
    df = df.dropna()

    # Round floats to 4 decimal places
    float_columns = df.select_dtypes(include='float64').columns
    df[float_columns] = round(df.select_dtypes(include='float64'), 4)

    # Load data into the DB
    for row in df.iterrows():
        params = [row[1][0], str(row[1][1]), str(row[1][2]), str(row[1][3]), row[1][4], row[1][5], row[1][6],
                  row[1][7], row[1][8], row[1][9], row[1][10], row[1][11], row[1][12], row[1][13], row[1][14],
                  row[1][15], row[1][16]]
        cursor.execute('''insert into poisson_team_odds VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)''', params)

    conn.commit()
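To make the moving-average behaviour concrete: with a window of 8 and min_periods=8, the first eight games of each team/season group come out as NaN once the result is shifted, so every row only ever sees strictly earlier games. A toy sketch with made-up goal counts:

import pandas as pd

goals = pd.DataFrame({
    'team_name': ['A'] * 10,
    'season': ['19/20'] * 10,
    'goals_for': [0, 2, 1, 3, 0, 1, 2, 2, 1, 0],
})
goals['goals_for_mavg'] = goals.groupby(['team_name', 'season'])['goals_for'] \
    .transform(lambda x: x.rolling(8, min_periods=8).mean().shift(1))
# Rows 0-7 are NaN; row 8 averages games 0-7, row 9 averages games 1-8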
Example #15
def get_manager_data():
    """Download the Premier League managers table from Wikipedia and break it down with BeautifulSoup"""
    url = "https://en.wikipedia.org/wiki/List_of_Premier_League_managers"
    website_url = requests.get(url).text
    soup = BeautifulSoup(website_url, features="html.parser")
    My_table = soup.find("table", {"class": "wikitable sortable plainrowheaders"})
    # All data can be found in 'a' and 'span' tags
    links = My_table.findAll("a")
    links2 = My_table.findAll("span")

    # Load data from links1
    link1_data = []
    for name in links:
        link1_data.append(name.get("title"))

    # Load data from links2
    link2_data = []
    for name in links2:
        link2_data.append(name.get("data-sort-value"))


    # Remove Nulls
    link2_data = [i for i in link2_data if i]
    link1_data = [i for i in link1_data if i]

    # Find the indexes of manager names within link2_data
    name_indexes = []
    for i in range(0, len(link2_data)):
        if not hasNumbers(link2_data[i]):
            name_indexes.append(i)

    # Split link1_data into manager, country and team lists
    manager = []
    country = []
    team = []
    for i in range(0, len(link1_data)):
        if i % 3 == 0:
            manager.append(link1_data[i])
        if i % 3 == 1:
            country.append(link1_data[i])
        if i % 3 == 2:
            team.append(link1_data[i])

    # Create a DataFrame to store all of the scraped data
    managers = pd.DataFrame()
    for i in range(0, len(name_indexes)):
        manager_index = name_indexes[i]
        try:
            next_manager = name_indexes[i + 1]
        except IndexError:
            next_manager = len(name_indexes) + 1
        num_elements = next_manager - manager_index
        manager_name = link2_data[manager_index]
        manager_sd = link2_data[manager_index + 1]

        # Strip the sort-key padding from manager_sd (first 8 and last 5 characters)
        manager_sd = manager_sd[8:-5]
        managers.loc[i, "manager"] = " ".join(str.split(manager[i])[0:2])
        managers.loc[i, "country"] = country[i]
        managers.loc[i, "team"] = fix_team_name(team[i])
        managers.loc[i, "from"] = manager_sd
        # If the manager has left, there will be a second row we must capture
        if num_elements == 3:
            manager_ed = link2_data[manager_index + 2]
            # Strip the sort-key padding from manager_ed, as above
            manager_ed = manager_ed[8:-5]
            managers.loc[i, "until"] = manager_ed

    # Replace club names with those used in the rest of the project
    # (e.g. Use Manchester City instead of Man City). Also get the team_id
    managers["team_id"] = managers["team"].apply(lambda x: fetch_id(x))
    managers["team"] = managers["team_id"].apply(lambda x: fetch_name(x))
    # Select and order the columns used downstream
    managers = managers[["manager", "team", "team_id", "from", "until"]]

    managers['from'] = pd.to_datetime(managers['from'])
    managers['until'] = pd.to_datetime(managers['until'])
    # If the until date of the last manager and the from date of the next manager
    # are the same, subtract 1 day from the until of the previous manager
    managers['next_from'] = managers.groupby('team')['from'].shift(-1)
    managers['until'] = managers.apply(lambda x: x['until']
    if x['until'] != x['next_from'] else x['until'] - dt.timedelta(days=1), axis=1)

    date_frames = []
    for row in managers.iterrows():
        until = row[1]['until'] if \
            isinstance(row[1]['until'], pd.Timestamp) \
            else dt.datetime.today()
        dates_between = pd.DataFrame([row[1]['from'] + dt.timedelta(days=x)
             for x in range((until - row[1]['from']).days + 1)], columns=['date'])
        dates_between['manager'] = row[1]['manager']
        dates_between['team'] = row[1]['team']
        dates_between['team_id'] = row[1]['team_id']
        date_frames.append(dates_between)
    # DataFrame.append was removed in pandas 2.0; concatenate once instead
    df_dates = pd.concat(date_frames)

    # Concatenate manager names when two managers have managed at once
    df_dates = df_dates.groupby(['date', 'team', 'team_id'])['manager'].apply(
        lambda x: ' & '.join(x)).reset_index()

    # Get the number of days between each row to identify missing data
    df_dates['date_lag'] = df_dates.groupby(['team', 'team_id'])['date'].shift(1)
    df_dates['date_diff'] = df_dates.apply(lambda x: (x['date'] - x['date_lag']).days, axis=1)
    # Create rows for missing data and add them to df_dates
    missing_data = df_dates[df_dates['date_diff'] > 1]
    missing_frames = []
    for row in missing_data.dropna().iterrows():
        dates_between = pd.DataFrame(
            [row[1]['date_lag'] + dt.timedelta(days=x+1)
             for x in range((row[1]['date'] - (row[1]['date_lag'] + dt.timedelta(days=1))).days)],
            columns=['date'])
        dates_between['manager'] = 'No Manager'
        dates_between['team'] = row[1]['team']
        dates_between['team_id'] = row[1]['team_id']
        missing_frames.append(dates_between)
    missing_dates = pd.concat(missing_frames) if missing_frames else pd.DataFrame()
    # Drop unnecessary columns and add the missing data rows
    df_dates.drop(['date_lag', 'date_diff'], axis=1, inplace=True)
    df_dates = pd.concat([df_dates, missing_dates]).reset_index(drop=True).sort_values('date')

    # Create an indicator that tells us each time a team changes manager
    df_dates['last_manager'] = df_dates.groupby(['team', 'team_id'])['manager'].shift(1)
    df_dates['manager_change'] = df_dates.apply(
        lambda x: 1 if x['manager'] != x['last_manager'] else 0, axis=1)
    df_dates['manager_num'] = df_dates.groupby(['team', 'team_id'])['manager_change'].cumsum()

    # Aggregate the manager data to get a start/end date for each managerial spell
    min_dates = df_dates.groupby(
        ['team', 'team_id', 'manager', 'manager_num'])['date'].min().reset_index()
    max_dates = df_dates.groupby(
        ['team', 'team_id', 'manager', 'manager_num'])['date'].max().reset_index()


    manager_dates = pd.merge(min_dates, max_dates, on=['team', 'team_id', 'manager', 'manager_num']).reset_index(drop=True)
    manager_dates.columns = ['team', 'team_id', 'manager', 'manager_num', 'from', 'until']
    manager_dates = manager_dates.groupby(
        ['team', 'team_id', 'manager', 'manager_num', 'from'])['until'].max().reset_index()
    manager_dates = manager_dates.groupby(
        ['team', 'team_id', 'manager', 'manager_num', 'until'])['from'].min().reset_index()
    manager_dates = manager_dates.groupby(
        ['team', 'team_id', 'manager', 'manager_num', 'from'])['until'].max().reset_index()

    # Drop and recreate the table we are going to populate
    run_query(cursor, "DROP TABLE IF EXISTS managers", return_data=False)
    run_query(cursor, "CREATE TABLE managers (manager INT, team TEXT, "
                      "team_id INTEGER, start_date DATE, end_date DATE)",
              return_data=False)

    for row in manager_dates.iterrows():
        params = [
            str(row[1]["manager"]),
            str(row[1]["team"]),
            int(row[1]["team_id"]),
            str(row[1]["from"].date()),
            str(row[1]["until"].date()),
        ]
        run_query(
            cursor,
            "INSERT INTO managers (manager, team, team_id, start_date, end_date) VALUES("
            "?, ?, ?, ?, ?)",
            params, return_data=False
        )

    conn.commit()
    conn.close()
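The expand-to-daily-rows trick above is what lets joint managers, caretaker gaps and back-to-back spells be handled uniformly before re-aggregating into start/end dates. A toy sketch of the round trip on made-up spells:

import pandas as pd

spells = pd.DataFrame({
    'manager': ['Smith', 'Jones'],
    'from': pd.to_datetime(['2019-08-01', '2019-10-01']),
    'until': pd.to_datetime(['2019-09-30', '2019-12-31']),
})
# Expand each spell to one row per day...
daily = pd.concat(
    pd.DataFrame({'date': pd.date_range(r['from'], r['until']),
                  'manager': r['manager']})
    for _, r in spells.iterrows())
# ...then recover the spell boundaries by aggregating per manager
recovered = daily.groupby('manager')['date'].agg(['min', 'max'])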
Example #16
 def get_historic_predictions(self):
     conn, cursor = connect_to_db()
     df = run_query(
         cursor, "select * from historic_predictions where "
                 "model_id = '{}'".format(self.model_id))
     conn.close()
     return df
Example #17
 def get_training_data(self):
     conn, cursor = connect_to_db()
     # Get all fixtures after game week 8, excluding the last game week
     df = run_query(cursor, self.training_data_query)
     conn.close()
     return df