Example #1
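The three snippets below appear to come from the same scraping module and rely on a few imports and module-level helpers whose definitions are not shown. The block below is a minimal sketch of what they assume; team_id_dict, sql_max_int, and db_error_cleanup are reconstructions for illustration, not the original definitions.

import hashlib
import sqlite3
from datetime import datetime
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup
from lxml import html
from requests.exceptions import HTTPError

team_id_dict = {'NYY': 1, 'BOS': 2}  # assumed: maps team codes to database ids (real mapping not shown)
sql_max_int = 2**63 - 1              # assumed: SQLite's largest signed 64-bit integer

def db_error_cleanup(conn, e):
    # Assumed helper: roll back the open transaction and report the error
    conn.rollback()
    print(f'Database error: {e}')
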
def insert_team_data(conn, team_code, year):
    # Retrieve the team's game log page from Baseball Reference
    url = f'https://www.baseball-reference.com/teams/tgl.cgi?team={team_code}&t=b&year={year}'
    try:
        r = requests.get(url)
        # If the response was successful, no exception will be raised
        r.raise_for_status()
    except HTTPError:
        # Let a bad status code propagate to the caller
        raise
    
    # Pull the team's win-loss record from the parsed HTML tree; the summary
    # block contains a line whose first token looks like '91-71,'
    tree = html.fromstring(r.content)
    record = tree.xpath('//div/div/div/div[contains(@data-template, "Partials/Teams/Summary")]'
                        '/p/text()[contains(., "-")]')[0]
    wins, losses = record.split()[0].split('-')
    losses = losses.rstrip(',')  # drop the trailing comma

    # SQLite insert team
    query = 'INSERT INTO TeamSeason (team_id, season, wins, losses) VALUES (?,?,?,?)'
    try:
        conn.execute(query, (team_id_dict[team_code], year, wins, losses))
    except sqlite3.IntegrityError:
        print(f'Team Season exists: {team_code} {year}')
        return True
    except Exception as e:
        db_error_cleanup(conn, e)
        return False
    return True
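
A minimal sketch of how this might be called (the database filename here is hypothetical):

conn = sqlite3.connect('baseball.db')
if insert_team_data(conn, 'NYY', 2019):
    conn.commit()
conn.close()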
Example #2
def insert_batting_game_data(conn, team_code, year):
    team_id = team_id_dict[team_code]

    # Retrieve team batting data from Baseball Reference
    url = f'https://www.baseball-reference.com/teams/tgl.cgi?team={team_code}&t=b&year={year}'
    try:
        r = requests.get(url)
        # If the response was successful, no exception will be raised
        r.raise_for_status()
    except HTTPError:
        # Let a bad status code propagate to the caller
        raise

    # Retrieve the batting table from the HTML page and load it into a DataFrame
    soup = BeautifulSoup(r.content, "lxml")
    table = soup.find('table', attrs=dict(id='team_batting_gamelogs'))
    data = pd.read_html(StringIO(str(table)))[0]

    #   Rename some of the columns
    data.rename(columns={'Unnamed: 3':'HomeAway',
                        'Thr':'OppStarterThr',
                        'Opp':'opp_id',
                        'Date': 'game_date'}, inplace=True)

    #   Drop the repeated header rows that act as placeholders
    data.drop(data[data.OBP == 'OBP'].index, inplace=True)

    #   Fix Home/Away column values ('@' marks away games; blank means home)
    data.replace({'HomeAway': {'@':'A'}}, inplace=True)
    data['HomeAway'] = data['HomeAway'].fillna('H')

    #   Split result column into multiple columns
    data[['Result', 'RunsAgainst']] = data.Rslt.str.split(',', expand=True)
    data.RunsAgainst = data.RunsAgainst.str.split('-').str[1]

    #   Fix the date column for doubleheaders and make it SQLite-compatible
    data.game_date = data.game_date.str.encode('ascii', 'ignore').str.decode('ascii').str.strip()  # strip non-ASCII characters
    data.game_date = data.game_date.str.replace('susp', '')            # drop suspended-game markers
    data.game_date = data.game_date.str.split('(').str[0] + str(year)  # drop '(1)'/'(2)' doubleheader suffixes, append the year
    data.game_date = data.game_date.str.replace(' ', '')
    data.game_date = data.game_date.apply(lambda x: datetime.strptime(x, '%b%d%Y').strftime('%Y-%m-%d'))
    data['season'] = year

    #   Generate a unique game id for each game and make it the index.
    #   The id string is always 'date + away team + away runs + home team + home runs',
    #   so both teams' rows for the same game hash to the same id (home/away independent).
    game_ids = [data.game_date.iloc[i]
                + (data.opp_id.iloc[i] + data.RunsAgainst.iloc[i] + team_code + data.R.iloc[i]
                   if data.HomeAway.iloc[i] == 'H'
                   else team_code + data.R.iloc[i] + data.opp_id.iloc[i] + data.RunsAgainst.iloc[i])
                for i in range(len(data.index))]
    game_ids = [int(hashlib.sha256(s.encode('utf-8')).hexdigest(), 16) % sql_max_int for s in game_ids]
    data['game_id'] = game_ids

    # Add team ids
    data['team_id'] = team_id
    data.opp_id = data.opp_id.apply(lambda x: team_id_dict[x])

    #   Drop unnecessary columns and reorder the rest
    data.drop(columns=['Rslt', 'Rk', 'Gtm', '#', 'Opp. Starter (GmeSc)'], inplace=True)
    data = data[['game_id', 'team_id', 'opp_id', 'game_date', 'season', 'HomeAway', 'OppStarterThr', 'Result', 'RunsAgainst', 'PA', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'BB',
                'IBB', 'SO', 'HBP', 'SH', 'SF', 'ROE', 'GDP', 'SB', 'CS', 'LOB', 'BA', 'OBP', 'SLG', 'OPS']]
                
    #   Convert numeric columns to numeric types
    data = data.copy()  # take a fresh copy so the slice assignment below is safe
    data.loc[:, 'RunsAgainst':] = data.loc[:, 'RunsAgainst':].apply(pd.to_numeric)

    data.set_index(['game_id','team_id'], inplace=True)

    try:
        data.to_sql('TeamBattingGame', conn, if_exists='append')
    except sqlite3.IntegrityError:
        return True
    except Exception as e:
        db_error_cleanup(conn, e)
        return False
    return True
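
The game id scheme above is worth a closer look: whichever team's game log a row comes from, the id string is built as date + away team + away runs + home team + home runs, so both teams' rows hash to the same id. A standalone sketch of that invariant (the team codes, scores, and date are made up):

import hashlib

sql_max_int = 2**63 - 1  # assumed SQLite max signed integer, as above

def id_string(date, team, runs, opp, opp_runs, home_away):
    # Mirrors the branch inside insert_batting_game_data
    if home_away == 'H':
        return date + opp + opp_runs + team + runs
    return date + team + runs + opp + opp_runs

s_home = id_string('2019-06-01', 'NYY', '5', 'BOS', '3', 'H')  # home team's row
s_away = id_string('2019-06-01', 'BOS', '3', 'NYY', '5', 'A')  # away team's row
assert s_home == s_away  # both are '2019-06-01BOS3NYY5'
game_id = int(hashlib.sha256(s_home.encode('utf-8')).hexdigest(), 16) % sql_max_int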
Example #3
def insert_pitching_season_data(conn, team_code, year):
    team_id = team_id_dict[team_code]

    # Retrieve team pitching data from Baseball Reference
    url = f'https://www.baseball-reference.com/teams/{team_code}/{year}-pitching.shtml'
    try:
        r = requests.get(url)
        # If the response was successful, no exception will be raised
        r.raise_for_status()
    except HTTPError:
        # Let a bad status code propagate to the caller
        raise

    # Retrieve the pitching table from the HTML page and load it into a DataFrame
    soup = BeautifulSoup(r.content, "lxml")
    table = soup.find('table', attrs=dict(id='team_pitching'))
    data = pd.read_html(StringIO(str(table)))[0]

    #   Get team's overall stats for the season
    team_pitching_data = data.loc[data['Name'] == 'Team Totals'].copy()
    team_pitching_data.drop(columns=['Rk', 'Pos', 'Name', 'SO/W', 'ERA+', 'W', 'L', 'W-L%'], inplace=True)
    team_pitching_data = list(pd.to_numeric(team_pitching_data.iloc[0]).round(3))

    #   Rename some of the columns
    data.rename(columns={'W':'wins',
                        'L':'losses'}, inplace=True)

    #   Drop repeated header rows and summary rows that have no rank
    data.drop(data[data.ERA == 'ERA'].index, inplace=True)
    data.dropna(axis=0, subset=['Rk'], inplace=True)

    #   Fill NaN values with 0 (e.g. win-loss % for pitchers with no decisions)
    data.fillna(.000, inplace=True)

    #   Add player ids (Baseball Reference exposes them in the data-append-csv attribute)
    player_elements = table.find_all('td', attrs={'data-stat':'player'})
    player_ids = {}
    for row in player_elements:
        player_id = row.get('data-append-csv', None)
        if player_id is None:
            continue
        player_ids[row.get_text()] = player_id
    data.insert(0, 'player_id', data.Name.map(player_ids))
    data.insert(1, 'season', year)
    data.insert(2, 'team_id', team_id)

    #   Create a new df to insert into the Pitchers table; Baseball Reference
    #   suffixes names with '*' for left-handers and '#' for switch-handers
    players = data[['player_id', 'Name']].copy()
    players.loc[players.Name.str.contains(r'\*'), 'Handedness'] = 'L'
    players.loc[players.Name.str.contains('#'), 'Handedness'] = 'S'
    players['Handedness'] = players['Handedness'].fillna('R')
    players.Name = players.Name.str.rstrip('*#')
    players.set_index('player_id', inplace=True)

    #   Drop unnecessary columns
    data.drop(columns=['Rk', 'Name', 'Pos', 'W-L%', 'SO/W', 'ERA+'], inplace=True)
                
    #   Convert numeric columns to numeric types (slice columns, not rows)
    data.loc[:, 'Age':] = data.loc[:, 'Age':].apply(pd.to_numeric)

    data.set_index(['player_id', 'season', 'team_id'], inplace=True)

    # Insert pitchers into the Pitchers table if they're not already there
    query = 'INSERT INTO Pitchers (id, Name, Handedness) VALUES (?,?,?)'
    for i, row in players.iterrows():
        try:
            conn.execute(query, (i, *row))
        except sqlite3.IntegrityError:
            print(f'\t\t\tPlayer already inserted: {row["Name"]}')

    try:
        # Insert player and team's season pitching data
        data.to_sql('PlayerPitchingSeason', conn, if_exists='append')
        query = 'INSERT INTO TeamPitchingSeason (team_id, season, Age, ERA, G, GS, GF, CG, SHO, SV, IP, H, R, ER, HR, BB, IBB, SO, HBP, BK, WP, BF, FIP, WHIP, H9, HR9, BB9, SO9) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)'
        conn.execute(query, (team_id, year, *team_pitching_data))
    except sqlite3.IntegrityError:
        return True
    except Exception as e:
        db_error_cleanup(conn, e)
        return False
    return True
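
Taken together, the three loaders could be driven by a loop like the following sketch (the year range and database path are hypothetical, and throttling requests to Baseball Reference is left out):

conn = sqlite3.connect('baseball.db')
for year in range(2015, 2020):
    for team_code in team_id_dict:
        ok = (insert_team_data(conn, team_code, year)
              and insert_batting_game_data(conn, team_code, year)
              and insert_pitching_season_data(conn, team_code, year))
        if not ok:
            print(f'Failed to load {team_code} {year}')
    conn.commit()
conn.close()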