def main():
    """Rebuild the ``batting`` table from season-by-season batting stats.

    Drops any existing ``batting`` table, recreates it, then fetches every
    season from 1970 through 2019 with ``batting_stats`` and inserts the
    selected columns. Rows are inserted in one batch per season and
    committed once per season. The connection is always closed, even if a
    download or insert fails part-way through.
    """
    # DataFrame columns to store, in the same order as the table columns.
    source_columns = (
        'Season', 'Name', 'Team', 'Age', 'G', 'AB', 'PA', 'H', '1B', '2B',
        '3B', 'HR', 'RBI', 'SB', 'AVG', 'BB%', 'K%', 'BB/K', 'OBP', 'SLG',
        'OPS', 'ISO', 'BABIP', 'GB/FB', 'LD%', 'GB%', 'FB%', 'IFFB%', 'wOBA',
        'WAR', 'wRC+',
    )
    # One "?" placeholder per stored column.
    insert_sql = 'INSERT into batting VALUES ({})'.format(
        ', '.join('?' * len(source_columns)))

    db = Database()
    conn = db.create_connection(DB_NAME)
    try:
        c = conn.cursor()
        c.execute('DROP table if exists batting')
        conn.commit()
        c.execute('CREATE table if not exists batting(Season real, Name text, Team text, ' +
                  'Age real, G real, AB real, PA real, H real, h_1B real, h_2B real, ' +
                  'h_3B real, HR real, RBI real, SB real, AVG real, BB_p real, K_p real, ' +
                  'BB_K real, OBP real, SLG real, OPS real, ISO real, BABIP real, ' +
                  'GB_FB real, LD_p real, GB_p real, FB_p real, IFFB_p real, wOBA real, ' +
                  'WAR real, wRC_p real)')
        conn.commit()

        for season in range(1970, 2020):
            data = batting_stats(season)
            # Distinct loop names: the original reused ``i`` for both the
            # season and the row index, shadowing the outer variable.
            rows = [
                tuple(row[col] for col in source_columns)
                for _, row in data.iterrows()
            ]
            # A single batched call replaces one executemany per row.
            c.executemany(insert_sql, rows)
            conn.commit()
    finally:
        conn.close()
def load_hitting_data():
    """Return 2019 batting stats for ``handVar='L'`` and ``handVar='R'``.

    Both frames are fetched with ``league='all'``, ``qual=1`` and ``ind=0``
    and then reduced to rows with at least 30 at-bats (``AB >= 30``).

    Returns:
        A ``(left, right)`` tuple of filtered DataFrames.
    """
    # Fetch both splits first, then filter, mirroring the original call order.
    raw_frames = [
        batting_stats(2019, end_season=None, league='all', qual=1, ind=0,
                      handVar=hand)
        for hand in ('L', 'R')
    ]
    left, right = (frame[frame['AB'] >= 30] for frame in raw_frames)
    return left, right
def test_cache(monkeypatch: MonkeyPatch, cache_type: str, thrower: Callable) -> None:
    """Check that a repeated ``batting_stats`` call is answered from the cache.

    The first call populates the cache. ``requests.get`` is then replaced by
    ``thrower`` so the second call can only succeed if it does not go through
    ``requests.get``. Both results must be equal frames.
    """
    with patch('pybaseball.cache.config.cache_type', cache_type):
        # Start from an empty cache so the first read is genuinely uncached.
        pybaseball.cache.purge()

        uncached = pybaseball.batting_stats(2019)  # type: ignore

        # From here on any call to requests.get goes to ``thrower`` instead.
        monkeypatch.setattr(requests, 'get', thrower)

        cached = pybaseball.batting_stats(2019)  # type: ignore

        pd.testing.assert_frame_equal(uncached, cached)

        # Leave no cached data behind for other tests.
        pybaseball.cache.purge()
def make_period_dicts(dictionary):
    """Fetch batting and pitching stats for every key of ``dictionary``.

    Each key is converted with ``int()`` and passed as the season to
    ``pyb.batting_stats`` / ``pyb.pitching_stats`` with ``qual=False``.

    Returns:
        ``(batter_df, pitcher_df)``: two dicts keyed by the original keys,
        holding the batting and pitching results respectively.
    """
    batter_df = {}
    for period in dictionary.keys():
        batter_df[period] = pyb.batting_stats(int(period), qual=False)

    pitcher_df = {}
    for period in dictionary.keys():
        pitcher_df[period] = pyb.pitching_stats(int(period), qual=False)

    return batter_df, pitcher_df
def get_fangraphs_data(path, fn, startyear, endyear, force_API):
    """Return batting data, read from a CSV cache when one is available.

    The cache file is ``path + fn``. When ``force_API == True`` or the file
    does not exist, the data is fetched with
    ``batting_stats(startyear, endyear)``, reduced to a fixed set of columns,
    written to the cache file and returned. Otherwise the cache file is read
    with ``pd.read_csv`` and returned.
    """
    cache_file = path + fn

    # ``== True`` is kept on purpose: only values equal to True force a refresh.
    if not (force_API == True) and os.path.exists(cache_file):
        return pd.read_csv(cache_file)

    wanted_columns = [
        'Name', 'G', 'AB', 'H', '2B', '3B', 'HR', 'SF', 'BB', 'HBP',
        'OBP', 'SLG', 'OPS',
    ]
    data = batting_stats(startyear, endyear)[wanted_columns]
    data.to_csv(cache_file, index=None, header=True)
    return data
def cmd_batting_upload(start_year, end_year, db_username, db_password, db_hostname, db_name, db_tablename):
    """Pull batting stats for a range of seasons and upload them to a table.

    Creates the engine with ``initdb_batting``, fetches
    ``batting_stats(start_year, end_year)``, strips ``%`` from the column
    names and hands the frame to ``upload_block``. Any exception raised while
    fetching or uploading is reported through ``click.echo`` rather than
    propagated; on success the number of rows is echoed.
    """
    click.echo('[[[ PULLING BATTING DATAFRAME ]]]')
    engine = initdb_batting(db_username, db_password, db_hostname, db_name,
                            db_tablename)
    click.echo('Pulling data...')

    try:
        data = batting_stats(start_year, end_year)
        # Remove "%" from column names before the frame is uploaded.
        data.columns = data.columns.str.replace('%', '')
        upload_block(data, engine, db_tablename)
    except Exception as exc:
        click.echo(f"ERROR pulling down data - Error was = {exc}")
    else:
        record_count = data.shape[0]
        click.echo(f'SUCCESS, pulled {record_count} records')
def retrieve_stats(team, year):
    """Print one team's batting and pitching tables for ``year``.

    Batting rows are those whose ``Team`` equals ``team``, sorted by ``PA``
    descending; pitching rows are sorted by ``IP`` descending. Both tables
    are indexed by ``Name`` before printing. Nothing is returned.
    """
    batting = batting_stats(year)
    pitching = pitching_stats(year)

    batting_columns = [
        'Name', 'PA', 'WAR', 'wRC+', 'AVG', 'OBP', 'SLG', 'OPS', 'HR',
    ]
    team_batting = batting[batting['Team'] == team][batting_columns]
    team_batting = team_batting.sort_values('PA', ascending=False)
    team_batting = team_batting.set_index('Name')
    print("***** BATTING STATS *****")
    print(team_batting)
    print("")

    pitching_columns = ['Name', 'IP', 'WAR', 'FIP']
    team_pitching = pitching[pitching['Team'] == team][pitching_columns]
    team_pitching = team_pitching.sort_values('IP', ascending=False)
    team_pitching = team_pitching.set_index('Name')
    print("***** PITCHING STATS *****")
    print(team_pitching)
# Pick the season to load and, when applicable, a trailing 7-day window.
currentDate = datetime.now()
# Zero-padded month string; it is converted back to int for the comparison.
currentMonth = f"{currentDate.month:02d}"
if int(currentMonth) > 3:
    # After March: use the current year and a window ending today.
    currentYear = currentDate.year
    currentDay = date.today()
    recentDay = currentDay - timedelta(days=7)
    # Both ends of the window are passed on as strings.
    currentDay = str(currentDay)
    recentDay = str(recentDay)
else:
    # January through March: fall back to the previous year, with no window.
    currentYear = currentDate.year -1
    currentDay = None
    recentDay = None
# Full-season data for the selected year.
pitchingData = pitching_stats(currentYear)
battingData = batting_stats(currentYear)
if currentDay is not None:
    # Stats restricted to the recentDay..currentDay window.
    recentPitchingData = pitching_stats_range(recentDay, currentDay)
    recentBattingData = batting_stats_range(recentDay, currentDay)
    # The to_json -> loads -> dumps round trip re-serialises the frame with indent=2.
    recentPitchingDataFile = open("../public/json/pitcherRankingsRecent.json", "w")
    recentPitchingDataFile.write(json.dumps(json.loads(recentPitchingData.reset_index().to_json(orient='index')), indent=2))
    recentPitchingDataFile.close()
    recentBattingDataFile = open("../public/json/batterRankingsRecent.json", "w")
    recentBattingDataFile.write(json.dumps(json.loads(recentBattingData.reset_index().to_json(orient='index')), indent=2))
    recentBattingDataFile.close()
else:
    # No window: the file is still opened for writing (which truncates it).
    # NOTE(review): the remainder of this branch is not visible here.
    recentPitchingDataFile = open("../public/json/pitcherRankingsRecent.json", "w")
def yearGrab(currentSeason):
    """Download one season of stats and write them out as CSV files.

    Writes player and team batting/pitching stats, Statcast exit-velocity
    data (with a leading ``Season`` column) and the top-prospects list under
    ``data/YearlyData``. Pitching, team-batting and team-pitching data are
    downloaded once each and written to both their ``*stats.csv`` and
    ``fan_*.csv`` files; the original issued identical requests twice.

    Args:
        currentSeason: Season passed to every per-season pybaseball call.
    """
    # Single function-scope import instead of several scattered ones.
    import pybaseball

    # Batting Stats
    BattingStats_Year = pybaseball.batting_stats(currentSeason, qual=1)
    BattingStats_Year.to_csv('data/YearlyData/temp/bstats.csv')

    # Team Batting Stats
    BattingStats_Team_Year = pybaseball.team_batting(currentSeason)
    BattingStats_Team_Year.to_csv('data/YearlyData/temp/team_bstats.csv')

    # Pitching Stats
    PitchingStats_Year = pybaseball.pitching_stats(currentSeason)
    PitchingStats_Year.to_csv('data/YearlyData/temp/pstats.csv')
    print(str(currentSeason) + " : Stats Grab: Successful")

    # Team Pitching Stats
    PitchingStats_Team_Year = pybaseball.team_pitching(currentSeason)
    PitchingStats_Team_Year.to_csv('data/YearlyData/temp/team_pstats.csv')

    # Disabled: Year Standings
    # if currentSeason >=1969:
    #     from pybaseball import standings
    #     # get the end-of-season division standings for each season
    #     Standings = standings(currentSeason)
    #     Standings.insert(0, 'Season', currentSeason)
    #     Standings.to_csv('data/YearlyData/temp/team_standings.csv')

    # Disabled: Amateur Draft Data
    # if currentSeason >= 1965:
    #     from pybaseball import amateur_draft
    #     # Get amateur Draft Results
    #     Number_Rounds = 30
    #     RoundRange = list(range(1, Number_Rounds + 1))
    #     for i in RoundRange:
    #         # Pull season draft data 1 round at a time
    #         Amateur_Draft = amateur_draft(currentSeason, i)
    #         # Add Draft Year
    #         Amateur_Draft.insert(0, 'Draft_Year', currentSeason)
    #         # print(Amateur_Draft.head()) # Test
    #         # Read file with all draft files so far
    #         A_Draft = pd.read_csv('data/YearlyData/temp/amateur_draft.csv')
    #         # Add New Round
    #         A_Draft = A_Draft.append(Amateur_Draft, ignore_index=True)
    #         # Save new round to File
    #         A_Draft.to_csv('data/YearlyData/temp/amateur_draft.csv', sep=',', index=False, encoding='utf-8')

    # Exit Velocity Data
    Exit_Velocity = pybaseball.statcast_batter_exitvelo_barrels(currentSeason)
    Exit_Velocity.insert(0, 'Season', currentSeason)
    Exit_Velocity.to_csv('data/YearlyData/temp/statcast_exit_velocity.csv')

    # FanGraph Data
    # Individual Batting Stats: fetched separately because this call uses the
    # default ``qual`` rather than ``qual=1``.
    fan_bat = pybaseball.batting_stats(currentSeason)
    fan_bat.to_csv('data/YearlyData/temp/fan_bat.csv')

    # The remaining FanGraphs files use exactly the same requests as above,
    # so the frames already in memory are written again.
    PitchingStats_Year.to_csv('data/YearlyData/temp/fan_pit.csv')
    BattingStats_Team_Year.to_csv('data/YearlyData/temp/fan_team_bat.csv')
    PitchingStats_Team_Year.to_csv('data/YearlyData/temp/fan_team_pit.csv')

    # Get top overall prospects leaguewide
    topProspects = pybaseball.top_prospects()
    topProspects.to_csv('data/YearlyData/Top_Prospects.csv')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pybaseball as pyb
from pybaseball import statcast_batter
from pybaseball import playerid_lookup

# Pandas options, each set to None.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("chained_assignment", None)

# Pitching and batting stats for 2018-2020; map() is consumed left to right,
# so the requests are issued in the same order as three separate calls.
pitchers_2018, pitchers_2019, pitchers_2020 = map(pyb.pitching_stats, (2018, 2019, 2020))
df_2018, df_2019, df_2020 = map(pyb.batting_stats, (2018, 2019, 2020))

#cards_df = df_2019.loc[df_2019['Team'] == "Cardinals"]
#cards_pitchers = pitchers_2019.loc[pitchers_2019['Team'] == "Cardinals"]

# Disabled example:
# winker_stats = statcast_batter('2020-08-01', '2020-08-03', 608385)
# print(winker_stats)

# Sorting the Cardinals (2019) by WAR (top 30)
# And ONLY showing the WAR and name columns.
#print(cards_df.sort_values(by = 'WAR', ascending = False).head(30))
#print(cards_pitchers)
This is the data visualization part. This will allow us to see graphics of the data, rather than having to read through all the numbers. Here on line 5 we see something that is usually non-Pythonic: we have two statements on one line, separated by a semicolon. This is how you can have two statements on one line in Python; otherwise this would not be valid code. """ import pandas as pd pd.set_option('display.max_columns', None) import seaborn as sns from matplotlib import pyplot as plt import pybaseball as pyb import warnings warnings.filterwarnings('ignore') #This sets a variable data to the 2019 batting stats, then copies the data to the batting variable made on line 15, and then prints the first 5 entries. data = pyb.batting_stats(2019) batting = data.copy() #print(batting.head(5)) """ In the following code we will be plotting 2019 stats for how many home runs per at-bat players have; we will filter out players with fewer than 50 ABs. Many of these seaborn functions require us to pass in our DataFrame's columns. In some cases we will only need to provide a single column, but in other cases where x/y data is required, we will provide two. First we will plot this data using seaborn, then we will do it in matplotlib. They do exactly the same thing, but in some cases one is more useful than the other. Usually matplotlib takes more lines of code to produce the same plot. They also look different, so you can choose whichever floats your boat. Seaborn documentation for scatter plots ---> https://seaborn.pydata.org/generated/seaborn.scatterplot.html """ #setting the style for visualizations #sns.set_style('whitegrid')
def points(currentSeason):
    """Score every batter and pitcher for a season and write ranked CSVs.

    Batting stats (``qual=1``) and pitching stats are fetched for
    ``currentSeason``. Each stat column is multiplied by a module-level
    weight and the products are added into a ``Points`` column, rounded to
    0 decimals. The weights are read only after ``pointAmountsBat()`` /
    ``pointAmountsPit()`` have been called. Selected columns are sorted by
    ``Points`` descending and written to ``data/bstats.csv`` and
    ``data/pstats.csv``.
    """
    def _add_up(terms):
        # Left-to-right addition, same order as one long "a + b + c" expression.
        running = terms[0]
        for term in terms[1:]:
            running = running + term
        return running

    # ---- Batting -------------------------------------------------------
    from pybaseball import batting_stats
    data = batting_stats(currentSeason, qual=1)
    pointAmountsBat()

    total_bases = data['1B'] + (2 * data['2B']) + (3 * data['3B']) + (4 * data['HR'])
    extra_base_hits = data['2B'] + data['3B'] + data['HR']
    batting_terms = [
        R * data['R'],
        Single * data['1B'],
        Double * data['2B'],
        Triple * data['3B'],
        HR * data['HR'],
        TB * total_bases,
        RBI * data['RBI'],
        BB * data['BB'],
        K * data['SO'],
        SB * data['SB'],
        AB * data['AB'],
        Hits * data['H'],
        XBH * extra_base_hits,
        IBB * data['IBB'],
        HBP * data['HBP'],
        CS * data['CS'],
    ]  # missing Sac and GWRBI and everything past CS
    data['Points'] = _add_up(batting_terms)
    data['Points'] = data['Points'].round(decimals=0)

    batting_columns = [
        'Season', 'Name', 'Team', 'Age', 'Points', 'G', 'PA', 'AB', 'AVG',
        'H', '1B', '2B', '3B', 'HR', 'R', 'RBI', 'BB', 'IBB', 'SO', 'HBP',
        'SB', 'CS', 'Pitches',
    ]
    BattingStats = data[batting_columns]
    BattingStats = BattingStats.sort_values('Points', ascending=False).reset_index(drop=True)
    BattingStats.to_csv('data/bstats.csv')

    # ---- Pitching ------------------------------------------------------
    from pybaseball import pitching_stats
    pdata = pitching_stats(currentSeason)
    pointAmountsPit()

    pitching_terms = [
        IP * pdata['IP'],
        ER * pdata['ER'],
        K * pdata['SO'],
        SO * pdata['ShO'],
        W * pdata['W'],
        L * pdata['L'],
        SV * pdata['SV'],
        BS * pdata['BS'],
        G * pdata['G'],
        GS * pdata['GS'],
        H * pdata['H'],
        RA * pdata['R'],
        HR * pdata['HR'],
        BB * pdata['BB'],
        HB * pdata['HBP'],
        IBB * pdata['IBB'],
        B * pdata['BK'],
        # PKO * pdata['PKO'],
        # NOTE(review): QS is applied to the 'BK' column, the same column as
        # the term above; kept as-is, confirm whether that is intended.
        QS * pdata['BK'],
        CG * pdata['CG'],
        # NH * pdata['NH'],
        # PG * pdata['PG'],
        # BF * pdata['BF'],
        PC * pdata['Pitches'],
        # SOP * pdata['SOP'],
        # HD * pdata['HD'],
    ]
    pdata['Points'] = _add_up(pitching_terms)
    pdata['Points'] = pdata['Points'].round(decimals=0)

    pitching_columns = [
        'Season', 'Name', 'Team', 'Age', 'Points', 'W', 'L', 'ERA', 'ER',
        'WAR', 'G', 'GS', 'CG', 'ShO', 'SV', 'BS', 'IP', 'H', 'R', 'HR',
        'BB', 'IBB', 'HBP', 'BK', 'SO', 'TBF', 'Pitches',
    ]
    PitchingStats = pdata[pitching_columns]
    PitchingStats = PitchingStats.sort_values('Points', ascending=False).reset_index(drop=True)
    PitchingStats.to_csv('data/pstats.csv')
'Fld', 'Rep', 'Pos', 'RAR', 'WAR', 'Spd', 'wRC+'] # %% url = 'https://baseballsavant.mlb.com/statcast_search/csv?hfPT=&hfAB=&hfBBT=&hfPR=&hfZ=&stadium=&hfBBL=&hfNewZones=&hfGT=R%7C&hfC=&hfSea=2018%7C&hfSit=&player_type=batter&hfOuts=&opponent=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt=&game_date_lt=&hfInfield=&team=&position=&hfOutfield=&hfRO=&home_road=&hfFlag=&hfPull=&metric_1=&hfInn=&min_pitches=0&min_results=0&group_by=name&sort_col=pitches&player_event_sort=h_launch_speed&sort_order=desc&min_pas=150#results' s = requests.get(url).content xwOBA_df = pd.read_csv(io.StringIO(s.decode('utf-8'))) #xwOBA_df = pd.read_csv('xWAR/savant_data_xwOBA_20180506.csv') fg_df = pb.batting_stats(2018) pf_df = pd.read_csv('mlb/xWAR/park_factors.csv', usecols=['Team', 'Basic']) guts_df = pd.read_csv('mlb/xWAR/fg_guts.csv', usecols=['Season', 'wOBA', 'wOBAScale', 'R/PA', 'R/W']) fg_df['League'] = np.where(fg_df.Team.isin(nl_teams), 'NL', 'AL') pf_df['dec_pf'] = pf_df.Basic / 100 guts_df.rename(columns={'wOBA': 'lg_wOBA', 'wOBAScale': 'wOBA_scale'}, inplace=True) # %% df = xwOBA_df.merge(fg_df, left_on='player_name', right_on='Name')[comb_fields] df = df.merge(pf_df, how='left', on='Team') df = df.merge(guts_df, on='Season') # %% al_r = df[df.League == 'AL'].wRC.sum() / df[df.League == 'AL'].PA.sum()
'Age Rng': 'Age_Rng', 'K-BB%': 'K_BB_pct', 'K/9+': 'K_9_plus', 'BB/9+': 'BB_9_plus', 'H/9+': 'H_9_plus', 'HR/9+': 'HR_9_plus', 'LOB%+': 'LOB_pct_plus', 'WHIP+': 'WHIP_plus' } for year in years: for side in sides: table_name = side + '_' + str(year) table_id = "fangraphs." + table_name if side == 'batting': df = batting_stats(year, qual=0) elif side == 'pitching': df = pitching_stats(year, qual=0) df.rename(columns=renamed_columns, inplace=True) print('Loading ' + table_id) table_schema = [] for column in df.columns: table_schema.append({'name': column, 'type': 'STRING'}) pandas_gbq.to_gbq(df, table_id, project_id=project_id, if_exists='replace', table_schema=table_schema, chunksize=10000) # try: # if side == 'batting':
def profile_batting_stats():
    """Call ``batting_stats`` once for 2019 and discard the result."""
    season = 2019
    batting_stats(season)
ax.set_ylabel('Projected Hits') ax.set_xlabel('Last Years Hits') sns.regplot(data = df, x = 'hits_last_year', y = 'hits_next_year') #plt.show() """ #predicting hits section pd.set_option('display.max_columns', None) pd.set_option('display.max_rows', None) pd.set_option('chained_assignment', None) print('Loading ten years of data...this may take a few minutes') #loading df from 2010-2020 hits_df = pyb.batting_stats(2010, 2021) hits_2019 = pyb.batting_stats(2018, 2019) print('data loaded') hits_df_copy = hits_df.copy() hits_df_copy = hits_df_copy.loc[:, [ 'Season', 'Name', 'AB', 'HR', 'SLG', 'LD%', 'wOBA', 'Contact%', 'Soft%', 'Med%', 'Hard%' ]] hits_df_copy['HR_Next_Year'] = hits_df_copy.sort_values( ['Name', 'Season'], ascending=False).groupby('Name')['HR'].shift() hits_df_copy = hits_df_copy.loc[hits_df_copy['AB'] > 300]