def get_bbref_pitch(url, year=2015):
    """
    Download a baseball-reference.com pitching game log as a pandas dataframe.

    usage: get_bbref_pitch(url)

    url:  page containing a table with id "pitching_gamelogs"
    year: season, passed to bbref_date_to_gdl_date when building 'GDL_Date'

    Columns that can be parsed as numbers are converted; the rest are left
    unchanged.  Adds 'GDL_Date', a converted 'IP', 'WHIP' and 'ERIP' columns.

    Raises ValueError if the page has no "pitching_gamelogs" table.
    """
    from io import StringIO

    import Baseball
    import requests
    from bs4 import BeautifulSoup

    response = requests.get(url)
    soup = BeautifulSoup(response.text, "lxml")
    tbl = soup.find("table", id="pitching_gamelogs")
    if tbl is None:
        # Without this check the string "None" would be handed to read_html
        raise ValueError(
            "No 'pitching_gamelogs' table found at %s (HTTP status %s)"
            % (url, response.status_code))

    # Wrap in StringIO: passing literal HTML to read_html is deprecated
    bbref = pd.read_html(StringIO(str(tbl)))[0]

    # Drop the last row = summary row
    bbref = bbref.iloc[:-1]
    # Compare as strings so the filter works whatever dtype 'Gcar' was parsed as
    bbref = bbref[bbref["Gcar"].astype(str) != "Tm"]
    # copy() so the column assignments below act on an independent frame
    bbref = bbref.dropna(subset=["Gcar"], axis=0).copy()

    for param in bbref.columns:
        try:
            bbref[param] = pd.to_numeric(bbref[param])
        except (ValueError, TypeError):
            # Non-numeric column: keep as-is (replaces deprecated errors="ignore")
            pass

    bbref["GDL_Date"] = bbref["Date"].apply(
        lambda x: bbref_date_to_gdl_date(x, year))
    bbref["IP"] = bbref["IP"].apply(convert_bbref_ip)
    bbref["WHIP"] = Baseball.get_whip(bbref)
    bbref["ERIP"] = Baseball.get_erip(bbref)
    return bbref
def get_ops_pfx(df):
    """
    Compute OPS from a pitchab dataframe.

    OPS = SLG + OBP, where SLG comes from Baseball.get_slg_pfx and
    OBP from Baseball.get_obp_pfx.
    """
    slugging = Baseball.get_slg_pfx(df)
    on_base_pct = Baseball.get_obp_pfx(df)
    ops = slugging + on_base_pct
    return ops
def get_slg_pfx(df):
    """
    Slugging percentage for a pitchab dataframe.

    SLG = (Total bases) / (At Bats), using Baseball.get_tb for the numerator
    and Baseball.get_atbats_count_pfx for the denominator.
    """
    total_bases = Baseball.get_tb(df)
    at_bats = Baseball.get_atbats_count_pfx(df)
    slugging = total_bases / at_bats
    return slugging
def get_obp_pfx(df):
    """
    On-base percentage for a pitchab dataframe.

    OBP = (Hits + Walks + Hit by Pitch)
          / (At Bats + Walks + Hit by Pitch + Sacrifice Flies)

    The numerator counts distinct (gameday_link, num) pairs whose event is an
    on-base event; the denominator comes from Baseball.get_pa_for_obp.
    """
    ob_events = ['Single', 'Double', 'Triple', 'Home Run',
                 'Walk', 'Intent Walk', 'Hit By Pitch']
    reached_rows = df[df['event'].isin(ob_events)]
    # One row per plate appearance, so multiple pitches are not double counted
    times_on_base = len(reached_rows.groupby(['gameday_link', 'num']).first())
    denominator = Baseball.get_pa_for_obp(df)
    return times_on_base / denominator
def hits_per_pitch_subzone(df):
    """
    Hits per pitch in each 0.6 x 0.6 sub-region of the strike zone.

    Input:  dataframe of pitches with 'px' and 'pz' columns
    Output: dataframe indexed by the lower 'pz' edge of each sub-region, with
            one column per lower 'px' edge; values are hits per pitch in that
            sub-region (0 where the sub-region has no pitches)
    """
    side = 0.6
    rates_by_px = {}
    for px_lo in np.arange(-1.5, 1.5, side):
        inside_px = (df['px'] >= px_lo) & (df['px'] < px_lo + side)
        column = []
        for pz_lo in np.arange(1, 4, side):
            inside_pz = (df['pz'] >= pz_lo) & (df['pz'] < pz_lo + side)
            counts = Baseball.hits_tb_per_pitch(df[inside_px & inside_pz])
            try:
                rate = counts['Hits'] / counts['Pitches']
            except ZeroDivisionError:
                # Empty sub-region
                rate = 0
            column.append(rate)
        rates_by_px[px_lo] = column
    return pd.DataFrame(rates_by_px, index=np.arange(1, 4, side))
def get_pa_count_pfx(df):
    """
    Number of plate appearances in a pitchab dataframe.

    PA = AB + BB + HBP + SH + SF + Times Reached on Defensive Interference

    Computed as the at-bat count from Baseball.get_atbats_count_pfx plus the
    number of distinct (gameday_link, num) pairs whose event is one of the
    extra plate-appearance events listed below.
    """
    at_bats = Baseball.get_atbats_count_pfx(df)
    pa_extras = ['Walk', 'Sac Fly', 'Hit By Pitch', 'Intent Walk', 'Sac Bunt',
                 'Catcher Interference', 'Fan interference',
                 'Batter Interference', 'Sac Fly DP', 'Sacrifice Bunt DP']
    extra_rows = df[df['event'].isin(pa_extras)]
    extra_count = len(extra_rows.groupby(['gameday_link', 'num']).first())
    return at_bats + extra_count
def get_slg_pfx(df):
    """
    Slugging percentage for a pitchab dataframe.

    Takes the at-bat rows from Baseball.get_atbats_df_pfx, reduces them to one
    event per (gameday_link, num) pair, and divides the total bases of the
    hits (1/2/3/4 for Single/Double/Triple/Home Run) by the number of at-bats.
    """
    at_bat_rows = Baseball.get_atbats_df_pfx(df)
    one_per_ab = at_bat_rows.groupby(['gameday_link', 'num']).first()
    events = list(one_per_ab['event'].values)
    bases_for = {'Single': 1, 'Double': 2, 'Triple': 3, 'Home Run': 4}
    total_bases = sum(events.count(name) * bases
                      for name, bases in bases_for.items())
    return total_bases / len(events)
def get_pa_for_obp(df):
    """
    Denominator for on-base percentage from a pitchab dataframe.

    At Bats + Walks + Hit by Pitch + Sacrifice Flies

    Computed as the at-bat count from Baseball.get_atbats_count_pfx plus the
    number of distinct (gameday_link, num) pairs whose event is in the list
    below (the list also contains the interference events and
    'Sacrifice Bunt DP').
    """
    at_bats = Baseball.get_atbats_count_pfx(df)
    obp_pa_extras = ['Walk', 'Sac Fly', 'Hit By Pitch', 'Intent Walk',
                     'Catcher Interference', 'Fan interference',
                     'Batter Interference', 'Sac Fly DP', 'Sacrifice Bunt DP']
    extra_rows = df[df['event'].isin(obp_pa_extras)]
    extra_count = len(extra_rows.groupby(['gameday_link', 'num']).first())
    return at_bats + extra_count
def get_obp_pfx(df):
    """
    Calculate on-base percentage from a pitchab dataframe

    OBP = (Hits + Walks + Hit by Pitch)
          / (At Bats + Walks + Hit by Pitch + Sacrifice Flies)

    The numerator counts distinct (gameday_link, num) pairs whose event is an
    on-base event.  The denominator comes from Baseball.get_pa_for_obp rather
    than Baseball.get_pa_count_pfx: the full plate-appearance count also
    includes 'Sac Bunt', which is not part of the OBP denominator above.
    """
    ob_events = ['Single', 'Double', 'Triple', 'Home Run',
                 'Walk', 'Intent Walk', 'Hit By Pitch']
    # One row per plate appearance so multi-pitch appearances count once
    on_base = len(df[df['event'].isin(ob_events)]
                  .groupby(['gameday_link', 'num']).first())
    obp_pa = Baseball.get_pa_for_obp(df)
    return on_base / obp_pa
def get_pa_count_pfx(df):
    """
    Count the official plate appearances in a pitchab dataframe.

    PA = AB + BB + HBP + SH + SF + Times Reached on Defensive Interference

    The at-bats are counted first (Baseball.get_atbats_count_pfx); then the
    distinct (gameday_link, num) pairs whose event is one of the additional
    plate-appearance events are added on top.
    """
    pa_extras = ['Walk', 'Sac Fly', 'Hit By Pitch', 'Intent Walk', 'Sac Bunt',
                 'Catcher Interference', 'Fan interference',
                 'Batter Interference', 'Sac Fly DP', 'Sacrifice Bunt DP']
    official_at_bats = Baseball.get_atbats_count_pfx(df)
    is_extra = df['event'].isin(pa_extras)
    additional = len(df[is_extra].groupby(['gameday_link', 'num']).first())
    return official_at_bats + additional
def get_box_info(df):
    """
    Summarise pitches and hits in sub-regions ("boxes") of the zone.

    Takes a pitchFX dataframe with 'px' and 'pz' columns.  Box edges and step
    sizes come from Baseball.official_zone_25_boxes().  For every box the
    pitches with x <= px < x + x_step and y <= pz < y + y_step are selected.

    Returns a dict keyed by the box centre (ctr_x, ctr_y), each value a dict:
        'pitch_pct':      pitches in box / all pitches * 100
        'slgs':           total bases per hit in box (0 if no hits)
        'hits':           total hits in box
        'hits_per_pitch': hits in box / pitches in box
    Boxes without pitches get 0 for all four entries.

    Raises AssertionError ('Empty dataframe') when df has no rows.
    """
    (top, bottom, left, right, x_step, y_step) = Baseball.official_zone_25_boxes()
    assert len(df) > 0, 'Empty dataframe'

    box_infoD = {}
    # NOTE(review): linspace(..., 6) yields 6 lower-left corners per axis, so
    # 36 boxes are visited -- confirm the corners at `right`/`top` are intended.
    for x in np.linspace(left, right, 6):
        in_x_range = (df['px'] >= x) & (df['px'] < x + x_step)
        for y in np.linspace(bottom, top, 6):
            ctr_x, ctr_y = get_center_point(x, y, x_step, y_step)
            key = (ctr_x, ctr_y)
            box = df[in_x_range & (df['pz'] >= y) & (df['pz'] < y + y_step)]

            if len(box) == 0:
                box_infoD[key] = {'slgs': 0,
                                  'hits': 0,
                                  'hits_per_pitch': 0,
                                  'pitch_pct': 0}
                continue

            (tb_in_box, hits_in_box) = get_slg_in_box(box)
            info = {'hits_per_pitch': hits_in_box / len(box),
                    'pitch_pct': len(box) / len(df) * 100}
            if hits_in_box == 0:
                info['slgs'] = 0
                info['hits'] = 0
            else:
                info['slgs'] = tb_in_box / hits_in_box
                info['hits'] = hits_in_box
            box_infoD[key] = info
    return box_infoD
def hits_tb_per_pitch(df):
    """
    Summarise hits and total bases for a pitchab dataframe.

    Returns a dict with
        'Pitches':      number of rows in df
        'Hits':         number of rows returned by get_hits(df)
        'TB':           total bases from Baseball.get_tb(df)
        'TB_per_pitch': TB / Pitches (0 when df is empty)
    """
    total_bases = Baseball.get_tb(df)
    hit_rows = get_hits(df)
    n_pitches = len(df)
    # Guard the division for an empty dataframe
    per_pitch = 0 if n_pitches == 0 else total_bases / n_pitches
    return {'Pitches': n_pitches,
            'Hits': len(hit_rows),
            'TB': total_bases,
            'TB_per_pitch': per_pitch}
def zone_as_polygon(year):
    """
    Converts a strike zone (as a series of points) into matplotlib polygon paths

    Allows use of path.contains_points to test if pitches are inside a
    strike zone.

    year: season passed to Baseball.get_50pct_zone (previously ignored in
          favour of a hard-coded 2016)

    Returns {'R': path, 'L': path}, built from the 'xRs'/'yRs' and
    'xLs'/'yLs' entries of the zone dict respectively.
    """
    import matplotlib.patches as patches

    zone_dict = Baseball.get_50pct_zone(year)

    paths = {}
    for side, x_key, y_key in (('R', 'xRs', 'yRs'), ('L', 'xLs', 'yLs')):
        # The final point of each series is left out of the outline.
        # NOTE(review): presumably it repeats the first point -- confirm.
        xs = np.array(zone_dict[x_key][:-1])
        ys = np.array(zone_dict[y_key][:-1])
        xy = np.array([np.array(pt) for pt in zip(xs, ys)])
        # Only the path is used, so no colour/alpha is set on the patch
        paths[side] = patches.Polygon(xy, closed=True).get_path()
    return paths
def get_bbref_bat(url, year=2015):
    """
    Download a baseball-reference.com batting game log as a pandas dataframe.

    usage: get_bbref_bat(url)

    url:  page containing a table with id "batting_gamelogs"
    year: season, passed to Baseball.bbref_date_to_gdl_date for 'GDL_Date'

    Rows whose 'Gcar' value contains "Tm" or "Gcar" and rows with a missing
    'Gcar' are removed.  Columns that can be parsed as numbers are converted;
    the rest are left unchanged.

    Raises ValueError if the page has no "batting_gamelogs" table.
    """
    from io import StringIO

    import requests
    from bs4 import BeautifulSoup

    response = requests.get(url)
    soup = BeautifulSoup(response.text, "lxml")
    tbl = soup.find("table", id="batting_gamelogs")
    if tbl is None:
        # Without this check the string "None" would be handed to read_html
        raise ValueError(
            "No 'batting_gamelogs' table found at %s (HTTP status %s)"
            % (url, response.status_code))

    # Wrap in StringIO: passing literal HTML to read_html is deprecated
    bbref = pd.read_html(StringIO(str(tbl)))[0]

    # Drop the last row = summary row
    bbref = bbref.iloc[:-1]
    # Cast to str first: with NaN or numeric 'Gcar' values the original .str
    # filter raised and was silently skipped, leaving the header rows in place
    gcar = bbref["Gcar"].astype(str)
    bbref = bbref[~gcar.str.contains("Tm|Gcar")]
    # copy() so the column assignments below act on an independent frame
    bbref = bbref.dropna(subset=["Gcar"], axis=0).copy()

    for param in bbref.columns:
        try:
            bbref[param] = pd.to_numeric(bbref[param])
        except (ValueError, TypeError):
            # Non-numeric column: keep as-is (replaces deprecated errors="ignore")
            pass

    bbref["GDL_Date"] = bbref["Date"].apply(
        lambda x: Baseball.bbref_date_to_gdl_date(x, year))
    return bbref
def tb_per_pitch_subzone(df):
    """
    Total bases per pitch in each 0.6 x 0.6 sub-region of the strike zone.

    Input:  dataframe of pitches with 'px' and 'pz' columns
    Output: dataframe indexed by the lower 'pz' edge of each sub-region, with
            one column per lower 'px' edge; values are the 'TB_per_pitch'
            entry of Baseball.hits_tb_per_pitch for that sub-region
    """
    side = 0.6
    tb_by_px = {}
    for px_lo in np.arange(-1.5, 1.5, side):
        inside_px = (df['px'] >= px_lo) & (df['px'] < px_lo + side)
        column = []
        for pz_lo in np.arange(1, 4, side):
            inside_pz = (df['pz'] >= pz_lo) & (df['pz'] < pz_lo + side)
            stats = Baseball.hits_tb_per_pitch(df[inside_px & inside_pz])
            column.append(stats['TB_per_pitch'])
        tb_by_px[px_lo] = column
    return pd.DataFrame(tb_by_px, index=np.arange(1, 4, side))
def get_con(year, dbFolder="/Users/iayork/Documents/Baseball/PitchFX", db=False):
    """
    Open a connection to a PitchFX SQLite database.

    year:     season, used to build the default file name 'pitchfx<year>.db'
    dbFolder: folder containing the database file
    db:       database file name; when falsy the default name is used

    Prints the database path and returns the SQLAlchemy connection.
    """
    if not db:
        db = 'pitchfx%s.db' % year
    db_path = os.path.join(dbFolder, db)
    print(db_path)
    engine = create_engine('sqlite:///%s' % db_path)
    return engine.connect()


def _regular_season_gdls(con):
    """Return the gameday_link ids ('gid_...') of games with game_type 'R'."""
    games = pd.read_sql("select gameday_link from game where game_type = 'R'", con)
    return ['gid_%s' % x for x in games['gameday_link'].values]


def get_pitchab(con, reg=True):
    """
    Get everything from pitch and atbat, merge on gameday_link + num

    usage: get_pitchab(con, reg=True)
    set "reg=False" to get spring training, all-star, post-season games

    Columns present in both tables are kept once (the pitch-table version).
    'break_angle', 'break_length' and 'break_y' are converted to numeric.
    """
    atbat = pd.read_sql("select * from atbat", con)
    pitch = pd.read_sql("select * from pitch", con)
    pitchab = pitch.merge(atbat, on=['gameday_link', 'num'],
                          suffixes=('', '_duplicate_delete'))
    if reg:
        pitchab = pitchab[pitchab['gameday_link'].isin(_regular_season_gdls(con))]

    # Drop the atbat-side copies of columns that exist in both tables
    drop_cols = [x for x in pitchab.columns if '_duplicate_delete' in x]
    pitchab = pitchab.drop(drop_cols, axis=1)
    for param in ('break_angle', 'break_length', 'break_y'):
        pitchab[param] = pd.to_numeric(pitchab[param])
    return pitchab


def get_pitchab_for_pitcher(pitcher_name, con, reg=True):
    """
    Get everything from pitch and atbat for a specific pitcher,
    merge on gameday_link + num

    usage: get_pitchab_for_pitcher(pitcher_name, con, reg=True)
    set "reg=False" to get spring training, all-star, post-season games

    Rows without a 'px' value are dropped.  'break_angle', 'break_length'
    and 'break_y' are converted to numeric.
    """
    # The name is embedded as a standard single-quoted SQL literal; embedded
    # single quotes are doubled so names such as O'Day stay valid.
    # NOTE(review): bound parameters would be preferable, but the placeholder
    # style depends on the kind of connection passed in.
    name_literal = "'%s'" % pitcher_name.replace("'", "''")
    atbat_sql = "select * from atbat where pitcher_name = %s" % name_literal
    pitch_sql = ("select * from pitch where gameday_link in "
                 "(select gameday_link from atbat where pitcher_name = %s)"
                 % name_literal)
    atbat = pd.read_sql(atbat_sql, con)
    pitch = pd.read_sql(pitch_sql, con)

    pitchab = pitch.merge(atbat, on=['gameday_link', 'num'])
    pitchab = pitchab.dropna(subset=['px'])
    if reg:
        pitchab = pitchab[pitchab['gameday_link'].isin(_regular_season_gdls(con))]

    # copy() so the column assignments below act on an independent frame
    pitchab = pitchab.copy()
    for param in ('break_angle', 'break_length', 'break_y'):
        pitchab[param] = pd.to_numeric(pitchab[param])
    return pitchab


# -------- Convert between formats ----------

def bbref_date_to_gdl_date(bbref_date, year=2016):
    """
    take date in format "Apr 8", "Jul 7(1)" or "May 6 (1)" and convert to
    "04-08-16" format

    usage: bbref_date_to_gdl_date(bbref_date, year)
    year default = 2016; a four-character year is cut to its last two digits

    Raises KeyError for a month abbreviation outside Mar..Nov.
    """
    dateD = {'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6, 'Jul': 7,
             'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11}
    if len(str(year)) == 4:
        year = str(year)[-2:]
    parts = bbref_date.split()
    # split('(') strips a game-number suffix attached to the day, e.g. "7(1)"
    day = int(parts[1].split('(')[0])
    return '%02d-%02d-%s' % (dateD[parts[0]], day, year)


def bbref_dates_to_gdls(df, year=2016):
    """
    Convert baseball-reference.com date and team information to a series of
    gameday_link, e.g. gid_2015_05_02_nyamlb_bosmlb_1

    df must have 'Date', 'Tm', 'Unnamed: 4' and 'Opp' columns; a value of
    '@' in 'Unnamed: 4' marks a game where 'Tm' is the away team.

    Return a list of gameday_link (away team first, then home team)
    """
    gdls = []
    monthD = {'Mar': '03', 'Apr': '04', 'May': '05', 'Jun': '06', 'Jul': '07',
              'Aug': '08', 'Sep': '09', 'Oct': '10', 'Nov': '11'}
    teamD = {'ATL': 'atl', 'BAL': 'bal', 'BOS': 'bos', 'CHW': 'cha',
             'DET': 'det', 'HOU': 'hou', 'KCR': 'kca', 'LAA': 'ana',
             'MIN': 'min', 'NYY': 'nya', 'OAK': 'oak', 'SEA': 'sea',
             'TBR': 'tba', 'TEX': 'tex', 'TOR': 'tor', 'MIA': 'mia',
             'NYM': 'nyn', 'PHI': 'phi', 'CLE': 'cle', 'WSN': 'was',
             'CHC': 'chn', 'PIT': 'pit', 'STL': 'sln', 'MIL': 'mil',
             'CIN': 'cin', 'ARI': 'ari', 'COL': 'col', 'LAD': 'lan',
             'SDP': 'sdn', 'SFG': 'sfn', 'FLA': 'flo'}

    for (date, tm, at_, opp) in df[['Date', 'Tm', 'Unnamed: 4', 'Opp']].values:
        parts = date.split()
        month = monthD[parts[0]]
        # Strip a game number attached to the day ("7(1)") before padding
        day = parts[1].split('(')[0].zfill(2)
        if '(' in date:
            game_no = date.split('(')[1].replace(')', '')
        else:
            game_no = '1'
        if at_ == '@':
            away, home = teamD[tm], teamD[opp]
        else:
            away, home = teamD[opp], teamD[tm]
        gdls.append('gid_%s_%s_%s_%smlb_%smlb_%s'
                    % (year, month, day, away, home, game_no))
    return gdls


def gdl_to_date(gdl):
    """Take a gameday_link and return a date as a string like 05-01-16."""
    fields = gdl.split('_')
    y, m, d = fields[1], fields[2], fields[3]
    return '%s-%s-%s' % (m, d, y[-2:])


def gdl_to_datetime(x):
    """Take a gameday_link and return a date as a datetime."""
    return pd.to_datetime(gdl_to_date(x), format='%m-%d-%y')


def convert_bbref_ip(x):
    """
    convert an innings-pitched value in ".1", ".2" format
    to ".33", ".67" format (e.g. 6.1 -> 6.33)
    """
    whole = int(x)
    return whole + (x - whole) / 0.3


# ----------------- Misc --------------------

def _read_bbref_gamelog(url, table_id):
    """Fetch url and return the table with the given id, minus its last row."""
    from io import StringIO

    import requests
    from bs4 import BeautifulSoup

    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    tbl = soup.find('table', id=table_id)
    if tbl is None:
        # Without this check the string "None" would be handed to read_html
        raise ValueError("No '%s' table found at %s (HTTP status %s)"
                         % (table_id, url, response.status_code))
    # Wrap in StringIO: passing literal HTML to read_html is deprecated
    bbref = pd.read_html(StringIO(str(tbl)))[0]
    # Drop the last row = summary row
    return bbref.iloc[:-1]


def _coerce_numeric_columns(bbref):
    """Convert every column that parses as numeric; leave the others as-is."""
    for param in bbref.columns:
        try:
            bbref[param] = pd.to_numeric(bbref[param])
        except (ValueError, TypeError):
            # Non-numeric column (replaces deprecated errors='ignore')
            pass
    return bbref


def get_bbref_pitch(url, year=2016):
    """
    returns a pandas dataframe containing bbref pitching game-log info
    (columns that cannot be parsed as numbers are left unchanged)

    usage get_bbref_pitch(url)

    Adds 'GDL_Date', a converted 'IP', 'WHIP' and 'ERIP' columns.
    Raises ValueError if the page has no 'pitching_gamelogs' table.
    """
    import Baseball

    bbref = _read_bbref_gamelog(url, 'pitching_gamelogs')
    # Compare as strings so the filter works whatever dtype 'Gcar' was parsed as
    bbref = bbref[bbref['Gcar'].astype(str) != 'Tm']
    # copy() so the column assignments below act on an independent frame
    bbref = bbref.dropna(subset=['Gcar'], axis=0).copy()
    bbref = _coerce_numeric_columns(bbref)

    bbref['GDL_Date'] = bbref['Date'].apply(
        lambda x: bbref_date_to_gdl_date(x, year))
    bbref['IP'] = bbref['IP'].apply(convert_bbref_ip)
    bbref['WHIP'] = Baseball.get_whip(bbref)
    bbref['ERIP'] = Baseball.get_erip(bbref)
    return bbref


def get_bbref_bat(url, year=2016):
    """
    returns a pandas dataframe containing bbref batting game-log info
    (columns that cannot be parsed as numbers are left unchanged)

    usage get_bbref_bat(url)

    Rows whose 'Gcar' value contains 'Tm' or 'Gcar' and rows with a missing
    'Gcar' are removed.  Adds a 'GDL_Date' column.
    Raises ValueError if the page has no 'batting_gamelogs' table.
    """
    bbref = _read_bbref_gamelog(url, 'batting_gamelogs')
    # Cast to str first: with NaN or numeric 'Gcar' values the original .str
    # filter raised and was silently skipped, leaving the header rows in place
    gcar = bbref['Gcar'].astype(str)
    bbref = bbref[~gcar.str.contains('Tm|Gcar')]
    bbref = bbref.dropna(subset=['Gcar'], axis=0).copy()
    bbref = _coerce_numeric_columns(bbref)

    bbref['GDL_Date'] = bbref['Date'].apply(
        lambda x: bbref_date_to_gdl_date(x, year))
    return bbref
def get_atbats_count_pfx(df):
    """
    Count the official at-bats in a pitchab dataframe.

    The at-bat rows come from Baseball.get_atbats_df_pfx; each distinct
    (gameday_link, num) pair among them is counted once.
    """
    at_bat_rows = Baseball.get_atbats_df_pfx(df)
    one_row_per_ab = at_bat_rows.groupby(['gameday_link', 'num']).first()
    return len(one_row_per_ab)
def get_slg_per_atbat_pfx(df):
    """
    Calculate total bases per at-bat from a pitchab dataframe.

    Total bases come from Baseball.get_tb and the at-bat count from
    Baseball.get_atbats_count_pfx (one count per distinct
    (gameday_link, num) pair).  The original divided by len(events), a name
    that was never defined in this function, so every call raised NameError.

    Raises ZeroDivisionError when df contains no at-bats.
    """
    tb = Baseball.get_tb(df)
    ab = Baseball.get_atbats_count_pfx(df)
    return tb / ab
def get_ops_pfx(df):
    """
    Return OPS for a pitchab dataframe: the sum of Baseball.get_slg_pfx(df)
    and Baseball.get_obp_pfx(df).
    """
    # Operands are evaluated left to right: slugging first, then on-base
    return Baseball.get_slg_pfx(df) + Baseball.get_obp_pfx(df)
def get_atbats_count_pfx(df):
    """
    Return the number of at-bats in a pitchab dataframe, i.e. the number of
    distinct (gameday_link, num) pairs among the rows returned by
    Baseball.get_atbats_df_pfx(df).
    """
    grouped = Baseball.get_atbats_df_pfx(df).groupby(['gameday_link', 'num'])
    per_at_bat = grouped.first()
    return len(per_at_bat)