async def get_player_data():
    """Download all data from the Fantasy Premier League website"""
    # Connect to the sqlite3 DB
    conn = connect_to_db()
    summary_df = pd.DataFrame()

    async with aiohttp.ClientSession() as session:
        fpl = FPL(session)
        players = await fpl.get_players()
        i = 0
        for player in players:
            id = player.id
            i += 1
            print("Loading data for player {} of {}".format(i, len(players)))
            player_summary = await fpl.get_player_summary(id)
            player_history = pd.DataFrame(player_summary.history)
            player_history['web_name'] = player.web_name
            player_history['id'] = player.id
            player_history['team'] = player.team
            player_history['team_code'] = player.team_code
            summary_df = summary_df.append(player_history)

    # Upload the data to a table in the database
    run_query(query='DROP TABLE IF EXISTS player_data', return_data=False)
    summary_df.to_sql('player_data', conn)
Пример #2
0
def find_closest_match(team_name):
    # Hardcoded some of the matches because it was too difficult to match accurately
    if team_name == 'Man City':
        output = {
            "team_name": 'Manchester City',
            "team_id": fetch_id("Manchester City")
        }
        return output
    elif team_name == 'Spurs':
        output = {"team_name": 'Tottenham', "team_id": fetch_id("Tottenham")}
        return output
    else:
        conn = connect_to_db()
        df = run_query("Select team_name, team_id from team_ids")
        df['l_dist'] = df['team_name'].apply(
            lambda x: Levenshtein.ratio(x, team_name))
        max_similarity = max(df['l_dist'])
        closest_match = df.loc[df['l_dist'] == max_similarity,
                               ['team_name', 'team_id']].reset_index(drop=True)
        output = {
            "team_name": closest_match.loc[0, 'team_name'],
            "team_id": closest_match.loc[0, 'team_id']
        }
        conn.close()
        return output
Пример #3
0
async def get_fpl_teams():
    """Download upcoming fixture data from the Fantasy Premier League website"""
    # Connect to the sqlite3 DB
    conn = connect_to_db()
    df_teams = pd.DataFrame()
    i = 0
    # Get all of the team IDs from the fpl_fixtures table
    teams = run_query('select distinct team_h from fpl_fixtures')
    n_teams = len(teams)
    assert n_teams == 20, 'The number of returned teams should be 20, ' \
                          'but is actually {}'.format(n_teams)
    async with aiohttp.ClientSession() as session:
        fpl = FPL(session)
        team_data = await fpl.get_teams(return_json=True)

    for team in team_data:
        df_teams = df_teams.append(pd.DataFrame(team, index=[i]))
        i += 1
    # Find the closest matching team names to the ones in our data
    matches = list(df_teams['name'].apply(lambda x: find_closest_match(x)))
    df_matches = pd.DataFrame()
    for match in matches:
        df_matches = df_matches.append(pd.DataFrame(match, index=[0]))
    df_teams = pd.concat([df_teams, df_matches.reset_index(drop=True)], axis=1)
    # Upload the data to a table in the database
    run_query('DROP TABLE IF EXISTS fpl_teams', return_data=False)
    df_teams.to_sql('fpl_teams', conn)
    conn.close()
Пример #4
0
def fetch_name(team_id):
    """Get name from team ID"""
    conn = connect_to_db()
    cursor = conn.cursor()
    cursor.execute("Select team_name from team_ids where team_id == ?",
                   [team_id])
    output = cursor.fetchone()[0]
    conn.close()
    return output
Пример #5
0
def fetch_id(team_name):
    """Get team ID from name"""
    conn = connect_to_db()
    cursor = conn.cursor()
    cursor.execute(
        "Select team_id from team_ids where team_name = '{}' or alternate_name = '{}'"
        .format(team_name, team_name))
    output = cursor.fetchone()[0]
    conn.close()
    return output
Пример #6
0
async def get_fpl_fixtures():
    """Download upcoming fixture data from the Fantasy Premier League website"""
    # Connect to the sqlite3 DB
    conn = connect_to_db()
    fixture_list = pd.DataFrame()

    async with aiohttp.ClientSession() as session:
        fpl = FPL(session)
        fixtures = await fpl.get_fixtures(return_json=True)
    for fixture in fixtures:
        del fixture['stats']
        fixture_list = fixture_list.append(pd.DataFrame(fixture, index=[0]))

    # Upload the data to a table in the database
    run_query(query='DROP TABLE IF EXISTS fpl_fixtures', return_data=False)
    fixture_list.to_sql('fpl_fixtures', conn)
Пример #7
0
def make_predictions():
    # Update tables
    update_tables()
    # Get the current season
    current_season = run_query(
        query='select max(season) from main_fixtures').iloc[0, 0]
    # Load the in-production model
    model = MatchResultXGBoost(load_trained_model=True,
                               problem_name='match-predict-base')
    # Get upcoming games (that we haven't predicted)
    df = get_upcoming_games()
    logger.info(f'Making predictions for upcoming games: {df}')
    # Make predictions
    predictions_df = pd.DataFrame(columns=[
        'date', 'home_id', 'away_id', 'season', 'home_odds', 'draw_odds',
        'away_odds', 'H', 'D', 'A', 'model', 'version', 'creation_time'
    ])
    for row in df.iterrows():
        input_dict = {
            "date": row[1]['kickoff_time'],
            "home_id": row[1]['home_id'],
            "away_id": row[1]['away_id'],
            "season": current_season,
            "home_odds": row[1]['home_odds'],
            "draw_odds": row[1]['draw_odds'],
            "away_odds": row[1]['away_odds'],
        }
        # Get predictions from model
        predictions = model.predict(**input_dict)
        # Combine predictions with input data
        output_dict = dict(input_dict, **predictions)
        # Convert dictionary to DataFrame
        output_df = pd.DataFrame(output_dict,
                                 columns=predictions_df.columns,
                                 index=len(predictions_df))
        # Add row to output DataFrame
        predictions_df = predictions_df.append(output_df)
        predictions_df["model"] = model.model_id,
        predictions_df["version"] = __version__,
        predictions_df["creation_time"] = str(dt.datetime.today())
    with connect_to_db() as conn:
        # TODO: Upload to S3 if LOCAL is False
        # Upload output DataFrame to DB
        predictions_df.to_sql('latest_predictions', conn, if_exists='replace')
 def upload_to_table(self, df, table_name, model_id=None):
     # Upload the predictions to the model_predictions table
     conn = connect_to_db()
     if 'creation_time' not in df.columns:
         df['creation_time'] = dt.datetime.now()
     try:
         model_ids = run_query(
             query=f'select distinct model_id from {table_name}')
         model_ids = list(model_ids['model_id'].unique())
     except sqlite3.OperationalError as e:
         model_ids = []
     if model_id in model_ids:
         logger.warning(
             f'Model id: {model_id} already exists in the table {table_name}!'
         )
     # Add model ID so we can compare model performances
     if model_id is not None:
         df['model_id'] = model_id
     df.to_sql(table_name, con=conn, if_exists='append', index=False)
     conn.close()
Пример #9
0
def update_fixtures_from_fbd(table_name='main_fixtures'):
    # Connect to the database
    conn = connect_to_db()
    cursor = conn.cursor()
    # Drop and recrease the table we are going to populate
    cursor.execute("DROP TABLE IF EXISTS {tn}".format(tn=table_name))
    cursor.execute("""CREATE TABLE {tn} (fixture_id INTEGER, home_team TEXT, 
    home_id INTEGER, away_team TEXT, away_id INTEGER, date DATE, home_score INTEGER, 
    away_score INTEGER, home_shots INTEGER, away_shots INTEGER, full_time_result TEXT, 
    b365_home_odds REAL, b365_draw_odds REAL, b365_away_odds REAL, referee TEXT, 
    over_2p5_odds REAL, under_2p5_odds REAL, ah_home_odds REAL, ah_away_odds REAL, 
    ah_home_handicap REAL, season TEXT, home_yellow_cards INTEGER, 
    away_yellow_cards INTEGER, home_red_cards INTEGER, 
    away_red_cards INTEGER)""".format(tn=table_name))
    conn.commit()
    # Create the list of urls to get data from
    urls = [
        'http://www.football-data.co.uk/mmz4281/0304/E0.csv',
        'http://www.football-data.co.uk/mmz4281/0405/E0.csv',
        'http://www.football-data.co.uk/mmz4281/0506/E0.csv',
        'http://www.football-data.co.uk/mmz4281/0607/E0.csv',
        'http://www.football-data.co.uk/mmz4281/0708/E0.csv',
        'http://www.football-data.co.uk/mmz4281/0809/E0.csv',
        'http://www.football-data.co.uk/mmz4281/0910/E0.csv',
        'http://www.football-data.co.uk/mmz4281/1011/E0.csv',
        'http://www.football-data.co.uk/mmz4281/1112/E0.csv',
        'http://www.football-data.co.uk/mmz4281/1213/E0.csv',
        'http://www.football-data.co.uk/mmz4281/1314/E0.csv',
        'http://www.football-data.co.uk/mmz4281/1415/E0.csv',
        'http://www.football-data.co.uk/mmz4281/1516/E0.csv',
        'http://www.football-data.co.uk/mmz4281/1617/E0.csv',
        'http://www.football-data.co.uk/mmz4281/1718/E0.csv',
        'http://www.football-data.co.uk/mmz4281/1819/E0.csv',
        'http://www.football-data.co.uk/mmz4281/1920/E0.csv'
    ]

    for url in urls:
        extract_data_from_fbd(url, table_name)

    # Close the connection
    conn.close()
def get_teams_from_wiki():
    # Connect to database
    conn = connect_to_db()
    website_url = requests.get(
        'https://en.wikipedia.org/wiki/List_of_Premier_League_clubs').text
    soup = BeautifulSoup(website_url, features="lxml")
    My_table = soup.find('div', {'class': 'timeline-wrapper'})
    links = My_table.findAll('area')
    teams = []
    for link in links:
        team = link.get('alt').encode('UTF-8').decode()
        # Convert &26 to &
        team = team.replace('%26', '&')
        # Remove ' A.F.C' and ' F.C'
        team = team.replace('A.F.C', '')
        team = team.replace('F.C', '')
        # Remove any '.'s
        team = team.replace('.', '')
        # Remove white space from the start and end
        team = team.lstrip().rstrip()
        teams.append(team)
    df = pd.DataFrame()
    df['team_name'] = teams
    df = df.sort_values('team_name')
    df['team_id'] = np.arange(len(df))+1
    # Load the names into the database
    run_query(query='drop table if exists team_ids', return_data=False)
    run_query(query='create table team_ids (team_name TEXT, team_id INTEGER, '
              'alternate_name TEXT, alternate_name2 TEXT)',
              return_data=False)
    for row in df.iterrows():
        params = [row[1]['team_name'], row[1]['team_id']]
        params.append(fetch_alternative_name(row[1]['team_name']))
        params.append(fetch_alternative_name2(row[1]['team_name']))
        run_query(query='insert into team_ids(team_name, team_id, '
                  'alternate_name, alternate_name2) values(?, ?, ?, ?)',
                  params=params, return_data=False)
    conn.commit()
    conn.close()
 def get_info(home_id, away_id, date, season, home_odds, draw_odds,
              away_odds):
     """Given the data and home/away team id's, get model features"""
     conn = connect_to_db()
     h_manager = get_manager(team_id=home_id, date=date)
     a_manager = get_manager(team_id=away_id, date=date)
     # Check that data was retrieved (catch the error sooner to speed up debugging)
     assert len(h_manager) > 0, 'No data returned for home manager'
     assert len(a_manager) > 0, 'No data returned for away manager'
     # Get the max date from the database
     max_date = run_query(query='select max(date) from main_fixtures')
     max_date = pd.to_datetime(max_date.iloc[0, 0])
     # set the fixture_id to be 1 higher than the max fixture_id for that season
     max_fixture = run_query(
         query=
         "select max(fixture_id) id from main_fixtures where date = '{}'".
         format(str(max_date)))
     max_fixture = max_fixture.iloc[0, 0]
     info_dict = {
         "date": date,
         "home_id": home_id,
         "home_team": fetch_name(home_id),
         "away_id": away_id,
         "away_team": fetch_name(away_id),
         "fixture_id": max_fixture,
         "home_manager_start": h_manager.loc[0, "start_date"],
         "away_manager_start": a_manager.loc[0, "start_date"],
         "season": season,
         "home_odds": home_odds,
         "draw_odds": draw_odds,
         "away_odds": away_odds
     }
     output = pd.DataFrame()
     output = output.append(pd.DataFrame(info_dict, index=[0]))
     conn.close()
     return output
def create_team_fixtures():
    # Connect to database
    conn = connect_to_db()
    cursor = conn.cursor()
    # Calculate the home team stats
    cursor.execute("drop table if exists team_fixtures_home")
    cursor.execute("""CREATE TABLE team_fixtures_home as SELECT 
        fixture_id, 
        date, 
        home_team as team_name, 
        home_id as team_id,
        season, 
        home_score goals_for, 
        away_score goals_against, 
        home_score-away_score goal_difference,
        home_yellow_cards yellow_cards, 
        home_red_cards red_cards, 
        home_shots shots_for,
        away_shots shots_against,
        home_shots-away_shots shot_difference,
        1 as is_home,
        b365_home_odds as b365_win_odds,
        case 
            when home_score > away_score then 'W'
            when home_score < away_score then 'L' 
            when home_score = away_score then 'D' 
        end result,
        case when home_score > away_score then 3 
            when home_score < away_score then 0 
            when home_score = away_score then 1 
        end points
        FROM main_fixtures""")

    # Calculate the table for away only
    cursor.execute("drop table if exists team_fixtures_away")
    cursor.execute("""CREATE TABLE team_fixtures_away as SELECT 
        fixture_id, 
        date, 
        away_team as team_name, 
        away_id as team_id,
        season, 
        away_score goals_for, 
        home_score goals_against, 
        away_score-home_score goal_difference,
        away_yellow_cards yellow_cards, 
        away_red_cards red_cards, 
        away_shots shots_for,
        home_shots shots_against,
        away_shots-home_shots shot_difference,
        0 as is_home,
        b365_away_odds as b365_win_odds,
        case 
            when home_score > away_score then 'L'
            when home_score < away_score then 'W' 
            when home_score = away_score then 'D' 
        end result,
        case when home_score > away_score then 0 
            when home_score < away_score then 3 
            when home_score = away_score then 1 
        end points
        FROM main_fixtures""")

    # Combine the home and away table to get a full table
    cursor.execute("drop table if exists team_fixtures")
    cursor.execute(
        """CREATE TABLE team_fixtures as select * from team_fixtures_home 
    UNION ALL select * from team_fixtures_away""")
    conn.commit()
    conn.close()
def get_latest_fixtures():
    """Get the latest premier league fixtures from the Betfair Exchange API"""
    trading = connect_to_betfair()
    trading.login()
    # Define what type of data to retrieve from the API
    market_projection = [
        "MARKET_START_TIME", "RUNNER_DESCRIPTION", "COMPETITION", "EVENT",
        "MARKET_DESCRIPTION"
    ]
    # Create a market filter
    event_filter = betfairlightweight.filters.market_filter(
        event_type_ids=[1],
        market_countries=['GB'],
        market_type_codes=['MATCH_ODDS'],
        text_query='English Premier League')
    # Get market catalogue with event info
    market_catalogues = trading.betting.list_market_catalogue(
        filter=event_filter,
        market_projection=market_projection,
        max_results=200,
    )
    # Filter for premier league markets
    prem_markets = [
        market for market in market_catalogues
        if market.competition.name == 'English Premier League'
    ]
    # Populate a DataFrame with the catalogue info
    df_odds = pd.DataFrame(columns=['market_id', 'home_team', 'away_team'])
    for market in prem_markets:
        names = market.event.name.split(' v ')
        # Get the selection IDs for home/draw/away
        home_id = [
            runner for runner in market.runners
            if runner.runner_name == names[0]
        ][0].selection_id
        away_id = [
            runner for runner in market.runners
            if runner.runner_name == names[1]
        ][0].selection_id
        # Add everything to a DataFrame
        df_odds = df_odds.append(
            pd.DataFrame(
                {
                    "market_id": market.market_id,
                    'start_time': market.market_start_time,
                    "home_team": names[0],
                    "home_id": home_id,
                    "away_team": names[1],
                    "away_id": away_id
                },
                index=[len(df_odds)]))
    # Get market books (betting info) of the premier league markets
    market_books = trading.betting.list_market_book(
        market_ids=[market.market_id for market in prem_markets])
    # Populate a DataFrame of the market book information
    df_mb = pd.DataFrame(columns=['market_id', 'selection_id', 'odds'])
    for book in market_books:
        for runner in book.runners:
            sid = {
                'market_id':
                book.market_id,
                'selection_id':
                runner.selection_id,
                'odds':
                runner.last_price_traded
                if runner.last_price_traded is not None else None
            }
            df_mb = df_mb.append(pd.DataFrame(sid, index=[len(df_mb)]))
    output_cols = [
        'market_id', 'market_start_time', 'home_team', 'home_id', 'away_id',
        'away_team', 'home_odds', 'draw_odds', 'away_odds'
    ]
    # Check whether there are any markets open
    if len(df_odds) == 0 or len(df_mb) == 0:
        df_output = pd.DataFrame(columns=output_cols)
    else:
        # Merge the DataFrames containing market catalogue and market book info
        df_output = pd.merge(df_odds,
                             df_mb,
                             left_on=['market_id', 'home_id'],
                             right_on=['market_id', 'selection_id'])
        df_output = pd.merge(df_output,
                             df_mb[df_mb['selection_id'] == 58805],
                             on='market_id')
        df_output = pd.merge(df_output,
                             df_mb,
                             left_on=['market_id', 'away_id'],
                             right_on=['market_id', 'selection_id'])
        df_output = df_output.drop(
            ['selection_id', 'selection_id_x', 'selection_id_y'], axis=1)
        df_output.columns = [
            'away_id', 'away_team', 'home_id', 'home_team', 'market_id',
            'market_start_time', 'home_odds', 'draw_odds', 'away_odds'
        ]
        df_output = df_output[[
            'market_id', 'market_start_time', 'home_team', 'home_id',
            'away_id', 'away_team', 'home_odds', 'draw_odds', 'away_odds'
        ]]
        # ToDo: Check how accurate last price traded is, look into getting odds the normal way
        #  if these odds arent similar to the betfair odds
    with connect_to_db() as conn:
        df_output.to_sql('bfex_latest_fixtures', conn, if_exists='replace')
def get_manager_data():
    """Download html content from wikipedia, break down with BeautifulSoup
    """
    # Connect to database
    conn = connect_to_db()
    url = "https://en.wikipedia.org/wiki/List_of_Premier_League_managers"
    website_url = requests.get(url).text
    soup = BeautifulSoup(website_url, features="html.parser")
    My_table = soup.find("table",
                         {"class": "wikitable sortable plainrowheaders"})
    # All data can be found in 'a' and 'span' tags
    links = My_table.findAll("a")
    links2 = My_table.findAll("span")
    # Load data from links1
    link1_data = []
    for name in links:
        link1_data.append(name.get("title"))
    # Load data from links2
    link2_data = []
    for name in links2:
        link2_data.append(name.get("data-sort-value"))
    # Remove Nulls
    link2_data = [i for i in link2_data if i]
    link1_data = [i for i in link1_data if i]
    # Test2 manager name indexes
    name_indexes = []
    for i in range(0, len(link2_data)):
        if not hasNumbers(link2_data[i]):
            name_indexes.append(i)
    # Test3 manager name indexes
    manager = []
    country = []
    team = []
    for i in range(0, len(link1_data)):
        if i % 3 == 0:
            manager.append(link1_data[i])
        if i % 3 == 1:
            country.append(link1_data[i])
        if i % 3 == 2:
            team.append(link1_data[i])
    # Create a DataFrame to store all of the scraped data
    managers = pd.DataFrame()
    for i in range(0, len(name_indexes)):
        manager_index = name_indexes[i]
        try:
            next_manager = name_indexes[i + 1]
        except IndexError:
            next_manager = len(name_indexes) + 1
        num_elements = next_manager - manager_index
        manager_name = link2_data[manager_index]
        manager_sd = link2_data[manager_index + 1]

        # Remove the first 0 zeros from manager_sd
        manager_sd = manager_sd[8:-5]
        managers.loc[i, "manager"] = " ".join(str.split(manager[i])[0:2])
        managers.loc[i, "country"] = country[i]
        managers.loc[i, "team"] = fix_team_name(team[i])
        managers.loc[i, "from"] = manager_sd
        # If the manager has left, there will be a second row we must capture
        if num_elements == 3:
            manager_ed = link2_data[manager_index + 2]
            # Remove the first 0 zeros from manager_ed
            manager_ed = manager_ed[8:-5]
            managers.loc[i, "until"] = manager_ed
    # Replace club names with those used in the rest of the project
    # (e.g. Use Manchester City instead of Man City). Also get the team_id
    managers["team_id"] = managers["team"].apply(lambda x: fetch_id(x))
    managers["team"] = managers["team_id"].apply(lambda x: fetch_name(x))
    # Rename columns
    managers = managers[["manager", "team", "team_id", "from", "until"]]
    managers['from'] = pd.to_datetime(managers['from'])
    managers['until'] = pd.to_datetime(managers['until'])
    # If the until date of the last manager and from date of the next manager are the same,
    # # subtract 1 day from until of the previous manager
    managers['next_from'] = managers.groupby('team')['from'].shift(-1)
    managers['until'] = managers.apply(
        lambda x: x['until']
        if x['until'] != x['next_from'] else x['until'] - dt.timedelta(days=1),
        axis=1)
    df_dates = pd.DataFrame()
    for row in managers.iterrows():
        until = row[1]['until'] if \
            isinstance(row[1]['until'], pd._libs.tslibs.timestamps.Timestamp) \
            else dt.datetime.today()
        dates_between = pd.DataFrame([
            row[1]['from'] + dt.timedelta(days=x)
            for x in range((until - row[1]['from']).days + 1)
        ],
                                     columns=['date'])
        dates_between['manager'] = row[1]['manager']
        dates_between['team'] = row[1]['team']
        dates_between['team_id'] = row[1]['team_id']
        df_dates = df_dates.append(dates_between)
    # Concatenate manager names when two managers have managed at once
    df_dates = df_dates.groupby(
        ['date', 'team',
         'team_id'])['manager'].apply(lambda x: ' & '.join(x)).reset_index()
    # Get the number of days between each row to identify missing data
    df_dates['date_lag'] = df_dates.groupby(
        ['team', 'team_id'])['date'].apply(lambda x: x.shift(1))
    df_dates['date_diff'] = df_dates.apply(lambda x:
                                           (x['date'] - x['date_lag']).days,
                                           axis=1)
    # Create rows for missing data and add them to df_dates
    missing_data = df_dates[df_dates['date_diff'] > 1]
    missing_dates = pd.DataFrame()
    for row in missing_data.dropna().iterrows():
        dates_between = pd.DataFrame([
            row[1]['date_lag'] + dt.timedelta(days=x + 1)
            for x in range((row[1]['date'] -
                            (row[1]['date_lag'] + dt.timedelta(days=1))).days)
        ],
                                     columns=['date'])
        dates_between['manager'] = 'No Manager'
        dates_between['team'] = row[1]['team']
        dates_between['team_id'] = row[1]['team_id']
        missing_dates = missing_dates.append(dates_between)
    # Drop unnecessary columns and add the missing data rows
    df_dates.drop(['date_lag', 'date_diff'], axis=1, inplace=True)
    df_dates = df_dates.append(missing_dates).reset_index(
        drop=True).sort_values('date')
    # Create an indicator that tells us each time a team changes manager
    df_dates['last_manager'] = df_dates.groupby(
        ['team', 'team_id'])['manager'].apply(lambda x: x.shift(1))
    df_dates['manager_change'] = df_dates.apply(
        lambda x: 1 if x['manager'] != x['last_manager'] else 0, axis=1)
    df_dates['manager_num'] = df_dates.groupby(['team', 'team_id'
                                                ])['manager_change'].cumsum()
    # Aggregate the manager data to get a start/end date for each managerial spell
    min_dates = df_dates.groupby(['team', 'team_id', 'manager',
                                  'manager_num'])['date'].min().reset_index()
    max_dates = df_dates.groupby(['team', 'team_id', 'manager',
                                  'manager_num'])['date'].max().reset_index()
    # Add on managers who are still in power
    df_current = df_dates.groupby(
        ['team', 'team_id', 'manager', 'manager_num'])['date']
    manager_dates = pd.merge(min_dates,
                             max_dates,
                             on=['team', 'team_id', 'manager',
                                 'manager_num']).reset_index(drop=True)
    manager_dates.columns = [
        'team', 'team_id', 'manager', 'manager_num', 'from', 'until'
    ]
    manager_dates = manager_dates.groupby(
        ['team', 'team_id', 'manager', 'manager_num',
         'from'])['until'].max().reset_index()
    manager_dates = manager_dates.groupby(
        ['team', 'team_id', 'manager', 'manager_num',
         'until'])['from'].min().reset_index()
    manager_dates = manager_dates.groupby(
        ['team', 'team_id', 'manager', 'manager_num',
         'from'])['until'].max().reset_index()
    # Drop and recreate the table we are going to populate
    run_query(query="DROP TABLE IF EXISTS managers", return_data=False)
    run_query(query="CREATE TABLE managers (manager INT, team TEXT, "
              "team_id INTEGER, start_date DATE, end_date DATE)",
              return_data=False)
    for row in manager_dates.iterrows():
        params = [
            str(row[1]["manager"]),
            str(row[1]["team"]),
            int(row[1]["team_id"]),
            str(row[1]["from"].date()),
            str(row[1]["until"].date()),
        ]
        run_query(
            query=
            "INSERT INTO managers (manager, team, team_id, start_date, end_date) "
            "VALUES(?, ?, ?, ?, ?)",
            params=params,
            return_data=False)
    conn.commit()
    conn.close()
Пример #15
0
def extract_data_from_fbd(url, table_name, connection_url=None):
    # Connect to database
    conn = connect_to_db(connection_url)
    cursor = conn.cursor()
    # Pull the csv into a pandas data frame
    fixtureData = pd.read_csv(url,
                              skipinitialspace=True,
                              error_bad_lines=False,
                              keep_default_na=False).dropna()
    # Get the length of the data frame
    sLength = len(fixtureData['HomeTeam'])
    # Create an additional column for fixture_id, home_id and away_id. Full with random values for now
    fixtureData['fixture_id'] = 0
    fixtureData['home_id'] = 0
    fixtureData['away_id'] = 0
    # Replace dodgy column names.
    fixtureData.columns = fixtureData.columns.str.replace('>', '_greater_than_').\
        str.replace('<', '_less_than_').str.replace('.', 'p')
    # Loop through each row of the data
    for i in range(0, len(fixtureData.index)):
        # Check if the row is empty
        if fixtureData.loc[i, 'HomeTeam'] != "":
            # Change the game date format to a more appropriate format
            try:
                date_corrected = datetime.strptime(
                    fixtureData.Date[i], '%d/%m/%y').strftime('%Y-%m-%d')
            except:
                date_corrected = datetime.strptime(
                    fixtureData.Date[i], '%d/%m/%Y').strftime('%Y-%m-%d')
            date_corrected = datetime.strptime(date_corrected, '%Y-%m-%d')
            # The next section determines which season the game is in, e.g. 16/17.
            test = fixtureData.Date[i]
            # Extract the year and month
            year = date_corrected.year
            month = date_corrected.month

            # If we are in the second half of the season
            if month in [1, 2, 3, 4, 5, 6]:
                # The string is last_year / this_year
                season = str('{0:02d}'.format(int(str(year)[-2:]) -
                                              1)) + '/' + str(year)[-2:]
            else:
                # The string is this_year / next_year
                season = str(year)[-2:] + '/' + str(
                    '{0:02d}'.format(int(str(year)[-2:]) + 1))

            # Get the over/under 2.5 odds (they are named differently in older .csv's)
            try:
                # Try the newer format
                o2p5 = fixtureData.BbMx_greater_than_2p5[i]
                u2p5 = fixtureData.BbMx_less_than_2p5[i]
            except:
                # Try the older format
                o2p5 = fixtureData.B365_greater_than_2p5[i]
                u2p5 = fixtureData.B365_less_than_2p5[i]

            # Get the Asian Handicap odds
            try:
                # Try the newer format
                ahHome = fixtureData.BbMxAHH[i]
                ahAway = fixtureData.BbMxAHA[i]
                ahHandicap = fixtureData.BbAHh[i]
            except:
                # Try the older format
                ahHome = fixtureData.B365AHH[i]
                ahAway = fixtureData.B365AHA[i]
                try:
                    ahHandicap = fixtureData.B365AH[i]
                except AttributeError:
                    ahHandicap = None

            # Load all parameters into the main_fixtures table in SQLite.
        #  try:
        # Define the parameters
            params = [
                i + 1,  # Fixture ID
                fixtureData.HomeTeam[i],  # Home team name
                fetch_id(fixtureData.HomeTeam[i]),  # Home team ID
                fixtureData.AwayTeam[i],  # Away team name
                fetch_id(fixtureData.AwayTeam[i]),  # Away team ID
                date_corrected,  # Fixture date
                int(fixtureData.FTHG[i]),  # Home goals (full time)
                int(fixtureData.FTAG[i]),  # Away goals (full time)
                int(fixtureData.HS[i]),  # Home shots
                int(fixtureData.AS[i]),  # Away shots
                fixtureData.FTR[i],  # Full time result
                float(fixtureData.B365H[i]),  # Bet 365 home odds
                float(fixtureData.B365D[i]),  # Bet 365 draw odds
                float(fixtureData.B365A[i]),  # Bet 365 away odds
                fixtureData.Referee[i],  # Referee name
                o2p5,  # Over 2.5 goal odds
                u2p5,  # Under 2.5 goal odds
                ahHome,  # Asian Handicap home odds
                ahAway,  # Asian Handicap away odds
                ahHandicap,  #  Asian Handicap
                season,  # Season name (e.g. 16/17)
                int(fixtureData.HY[i]),  # Home yellow cards
                int(fixtureData.AY[i]),  # Away yellow cards
                int(fixtureData.HR[i]),  # Home red cards
                int(fixtureData.AR[i])
            ]  # Away red cards

            # Load the parameters into the table
            query = """
                INSERT INTO {tn} (
                    fixture_id, home_team, home_id, away_team, away_id, date, 
                    home_score, away_score, home_shots, away_shots, full_time_result, 
                    b365_home_odds, b365_draw_odds, b365_away_odds, referee, 
                    over_2p5_odds, under_2p5_odds, ah_home_odds, ah_away_odds, 
                    ah_home_handicap, season, home_yellow_cards, away_yellow_cards, 
                    home_red_cards, away_red_cards) VALUES (
                    ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 
                    ?, ?, ?, ?, ?, ?)"""
            cursor.execute(query.format(tn=table_name), params)

            # Commit the changes
            conn.commit()