def pitchingFangraphsData(dateRange, stats):
    """Download FanGraphs pitching stats and keep only the requested columns.

    Args:
        dateRange: Two-item sequence of date strings whose first four
            characters are the year; item 0 is the start date and item 1
            the end date. Only the year of each date is used.
        stats: Iterable of column names to keep.

    Returns:
        A DataFrame with only those columns of the downloaded table whose
        names appear in ``stats``, in the table's original column order.
        Names in ``stats`` that the table does not contain are ignored.
    """
    # extracting year from start date
    start = int(dateRange[0][:4])
    # extracting year from end date
    end = int(dateRange[1][:4])

    # get pitching_stats for specific range given
    print("Gathering Data from Fangraphs")
    pitchStats = bball.pitching_stats(start, end)
    print("Data Gathering Complete")

    pitchStatsDF = pd.DataFrame(pitchStats)

    # Keep (not drop) the columns named in `stats`: a single membership mask
    # replaces the nested position search + np.delete of the original.
    keep = pitchStatsDF.columns.isin(list(stats))
    return pitchStatsDF.loc[:, keep]
def load_pitching_data():
    """Load the 2019 pitching tables for ``handVar='L'`` and ``handVar='R'``.

    Returns:
        Tuple ``(pitching_2019_L, pitching_2019_R)``; each table is limited
        to rows with at least 9 ``IP``.
    """
    # presumably handVar selects a handedness split — confirm against the
    # pitching_stats implementation this file uses
    raw_tables = [
        pitching_stats(2019, end_season=None, league='all', qual=1, ind=0, handVar=hand)
        for hand in ('L', 'R')
    ]

    # Restrict data to only players with at least 9 IP
    filtered = [table[table['IP'] >= 9] for table in raw_tables]
    return filtered[0], filtered[1]
def make_period_dicts(dictionary):
    """Download batting and pitching stats for every key of ``dictionary``.

    Each key is converted with ``int`` and used as the season argument.

    Returns:
        Tuple ``(batter_df, pitcher_df)`` of dicts keyed like ``dictionary``,
        holding the ``pyb.batting_stats`` and ``pyb.pitching_stats`` results.
    """
    # All batting tables are requested first, then all pitching tables.
    batter_df = {}
    for period in dictionary.keys():
        batter_df[period] = pyb.batting_stats(int(period), qual=False)

    pitcher_df = {}
    for period in dictionary.keys():
        pitcher_df[period] = pyb.pitching_stats(int(period), qual=False)

    return batter_df, pitcher_df
def preprocess_data():
    """Build the Cy Young data set and write it to ``train_data.csv``.

    Steps:
      1. Read ``Data/CY_<league>_<season>.txt`` for AL/NL, 2006-2015, and
         flag the row with ``Rank == 1`` of each file as the winner.
      2. Download pitching stats for 2006 through ``last_season`` and pass
         them through ``create_var``.
      3. Outer-merge both tables on ``Name`` and ``Season``; rows without
         an award record get ``CY = 0``; the 2016 and 2017 winners are
         entered by hand.
      4. Drop every column that has a missing value, drop ``Team``,
         ``Dollars`` and ``Age Rng``, index by ``Season`` and write
         ``<data_dir>/train_data.csv``.

    Uses the module-level names ``qual``, ``last_season``, ``data_dir`` and
    ``create_var``. Returns ``None``.
    """
    # read in CY historical data
    print("\nload cy young award winner data")
    append_data = []
    # Cy Young voting data is available from 2006 to 2015
    for season in range(2006, 2016):
        for league in ['AL', 'NL']:
            file_name = "CY_" + str(league) + "_" + str(season) + ".txt"
            tmp = pd.read_table(os.path.join('Data', file_name), sep=",")
            tmp['Season'] = season
            tmp['League'] = league
            append_data.append(tmp)
    old_cy = pd.concat(append_data, axis=0)

    # winner of CY has Rank 1; to_numpy() keeps the assignment positional
    # because the concatenated frame has a repeating index
    old_cy['CY'] = (old_cy['Rank'] == 1).astype(int).to_numpy()
    cy = old_cy[['Name', 'Season', 'CY']]

    # download baseball data: start from 2006 because more advanced stats
    # are recorded since
    print("\nstart downloading data")
    stats1 = pb.pitching_stats(start_season=2006, end_season=2010, qual=qual)
    stats2 = pb.pitching_stats(start_season=2011, end_season=last_season, qual=qual)
    df = create_var(pd.concat([stats1, stats2], axis=0))
    data = pd.merge(df, cy, on=['Name', 'Season'], how='outer')

    # Assign the result back: an in-place fillna on a column selection is
    # not guaranteed to modify `data` (pandas Copy-on-Write).
    data['CY'] = data['CY'].fillna(0)

    # enter 2016 and 2017 winners
    winners_2016 = data['Name'].isin(['Max Scherzer', 'Rick Porcello']) & (data['Season'] == 2016)
    winners_2017 = data['Name'].isin(['Max Scherzer', 'Corey Kluber']) & (data['Season'] == 2017)
    data.loc[winners_2016, 'CY'] = 1
    data.loc[winners_2017, 'CY'] = 1

    # drop columns if it has at least one missing value
    print("\npreprocess data")
    mod_data = data.dropna(axis=1)
    # set Season as index so the season can be extracted for splitting later
    dout = mod_data.drop(['Team', 'Dollars', 'Age Rng'], axis=1).set_index('Season')

    print("\nwrite data to files\n")
    os.makedirs(data_dir, exist_ok=True)
    dout.to_csv(os.path.join(data_dir, 'train_data.csv'), index=True)
def cmd_pitching_upload(start_year, end_year, db_username, db_password, db_hostname, db_name, db_tablename):
    """Pull pitching stats for ``start_year``..``end_year`` and upload them.

    The engine comes from ``initdb_pitching``; ``%`` is stripped from the
    column names before the frame is handed to ``upload_block``. Progress
    and the outcome are reported with ``click.echo``; any exception raised
    while pulling, renaming or uploading is reported, not re-raised.
    """
    click.echo('[[[ PULLING PITCHING DATAFRAME ]]]')
    engine = initdb_pitching(db_username, db_password, db_hostname, db_name, db_tablename)

    click.echo('Pulling data...')
    try:
        data = pitching_stats(start_year, end_year)
        data.columns = data.columns.str.replace('%', '')
        upload_block(data, engine, db_tablename)
    except Exception as exc:
        click.echo("ERROR pulling down data - Error was = {}".format(str(exc)))
    else:
        record_count = data.shape[0]
        click.echo('SUCCESS, pulled {} records'.format(str(record_count)))
def retrieve_stats(team, year):
    """Print one team's batting and pitching tables for ``year``.

    Batting rows are sorted by ``PA`` and pitching rows by ``IP`` (both
    descending); each table is indexed by ``Name``. Nothing is returned.
    """
    batting = batting_stats(year)
    pitching = pitching_stats(year)

    batting_columns = ['Name', 'PA', 'WAR', 'wRC+', 'AVG', 'OBP', 'SLG', 'OPS', 'HR']
    on_team_batting = batting['Team'] == team
    team_batting = batting[on_team_batting][batting_columns]
    team_batting = team_batting.sort_values('PA', ascending=False).set_index('Name')

    print("***** BATTING STATS *****")
    print(team_batting)
    print("")

    pitching_columns = ['Name', 'IP', 'WAR', 'FIP']
    on_team_pitching = pitching['Team'] == team
    team_pitching = pitching[on_team_pitching][pitching_columns]
    team_pitching = team_pitching.sort_values('IP', ascending=False).set_index('Name')

    print("***** PITCHING STATS *****")
    print(team_pitching)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pybaseball as pyb
from pybaseball import statcast_batter
from pybaseball import playerid_lookup

# Print frames without row/column truncation and turn off the
# chained-assignment warning.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("chained_assignment", None)

# Season pitching tables for 2018-2020 (one download per season).
pitchers_2018 = pyb.pitching_stats(2018)
pitchers_2019 = pyb.pitching_stats(2019)
pitchers_2020 = pyb.pitching_stats(2020)

# Season batting tables for 2018-2020.
df_2018 = pyb.batting_stats(2018)
df_2019 = pyb.batting_stats(2019)
df_2020 = pyb.batting_stats(2020)

# Disabled: Cardinals-only subsets of the 2019 tables.
#cards_df = df_2019.loc[df_2019['Team'] == "Cardinals"]
#cards_pitchers = pitchers_2019.loc[pitchers_2019['Team'] == "Cardinals"]

# The two bare string literals below are no-op statements used as block
# comments (a disabled statcast query and a note about the sort below).
""" winker_stats = statcast_batter('2020-08-01', '2020-08-03', 608385) print(winker_stats) """
""" Sorting the Cardinals (2019) by WAR (top 30) And ONLY showing the WAR and name columns. """
#print(cards_df.sort_values(by = 'WAR', ascending = False).head(30))
#print(cards_pitchers)
# FanGraphs display names whose lookup parts need a manual correction,
# mapped to (first, last). ``None`` keeps the automatically derived part.
# These are the mismatches found so far; other seasons may need more.
_NAME_OVERRIDES = {
    'Zack Wheeler': ('Zach', None),
    'Matthew Boyd': ('Matt', None),
    'C.J. Wilson': ('c. j.', None),
    'R.A. Dickey': ('R. A.', None),
    'Jon Niese': ('Jonathon', None),
    'A.J. Burnett': ('A. J.', None),
    'Jorge De La Rosa': ('Jorge', 'De La Rosa'),
    'Rubby de la Rosa': ('Rubby', 'de la Rosa'),
    'Cole DeVries': (None, 'De Vries'),
    'Samuel Deduno': ('Sam', None),
    'JC Ramirez': ('J. C.', None),
    'Nathan Karns': ('Nate', None),
    'Daniel Ponce de Leon': (None, 'Ponce de Leon'),
    'Chi Chi Gonzalez': ('Chi Chi', 'Gonzalez'),
    'Josh A. Smith': ('Josh', 'Smith'),
    'Joel De La Cruz': (None, 'De La Cruz'),
}

# Pitch-type codes summarised per player.
_PITCH_TYPES = ['SL', 'FF', 'CU', 'FT', 'CH', 'FC', 'KC', 'SI', 'PO', 'FS', 'EP', 'SC', 'KN']

# Statcast columns ("stats of interest") summarised per pitch type.
_STATS_OF_INTEREST = ['release_speed', 'release_pos_x', 'release_pos_z', 'pfx_x', 'pfx_z',
                      'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'effective_speed',
                      'release_spin_rate']


def _lookup_name_parts(name):
    """Return the ``(first, last)`` strings used to look ``name`` up."""
    parts = name.split(' ')
    # "C.J." becomes "C. J.": a space is inserted after the first dot.
    first = parts[0].replace('.', '. ', 1)
    last = parts[1]
    first_fix, last_fix = _NAME_OVERRIDES.get(name, (None, None))
    if first_fix is not None:
        first = first_fix
    if last_fix is not None:
        last = last_fix
    return first, last


def _zero_if_nan(value):
    """Return 0 when ``value`` is NaN, otherwise ``value`` unchanged."""
    return 0 if math.isnan(value) else value


def _summarize_pitches(name, player):
    """Return a one-row frame of std/mean/min/max per pitch type and stat."""
    row = {'Name': name}
    for pitch in _PITCH_TYPES:
        pitches = player[player['pitch_type'] == pitch][_STATS_OF_INTEREST]
        for stat in _STATS_OF_INTEREST:
            values = pitches[stat]
            prefix = pitch + "_" + stat
            # A pitch type the player never threw yields NaN; store 0.
            row[prefix + '_std'] = _zero_if_nan(np.std(values))
            row[prefix + '_mean'] = _zero_if_nan(np.mean(values))
            row[prefix + '_min'] = _zero_if_nan(np.min(values))
            row[prefix + '_max'] = _zero_if_nan(np.max(values))
    # Build the row in one go instead of inserting 676 columns one by one.
    return pd.DataFrame([row])


def get_data(year = 2018, minimum_starts = 5):
    """Write ``<year>.csv`` with per-pitch-type summaries for each starter.

    Season stats are cached in ``<year>/Players_Stats_<year>.csv`` and each
    pitcher's pitch-level data in ``<year>/player/<name>-<year>.csv``; the
    data is only downloaded when the cache file is missing.

    Args:
        year: Season to process.
        minimum_starts: Only pitchers with ``GS`` strictly greater than
            this value are kept when the season stats are downloaded.

    Returns:
        None. With no pitchers to process, a notice is printed and no
        ``<year>.csv`` is written.
    """
    season_dir = str(year)
    player_dir = os.path.join(season_dir, 'player')
    # Creates the season directory as well.
    os.makedirs(player_dir, exist_ok=True)

    stats_csv = os.path.join(season_dir, "Players_Stats_" + str(year) + ".csv")
    if not os.path.exists(stats_csv):
        player_stats = pitching_stats(year, year)
        player_stats = player_stats[player_stats['GS'] > minimum_starts]
        player_stats.to_csv(stats_csv)
    else:
        player_stats = pd.read_csv(stats_csv)

    rows = []
    for name in player_stats['Name']:
        player_csv = os.path.join(player_dir, name + '-' + str(year) + '.csv')
        if not os.path.exists(player_csv):
            first, last = _lookup_name_parts(name)
            player_id = playerid_lookup(last, first)
            print(year)
            # Keep only players whose career covers this season.
            player_id = player_id[player_id['mlb_played_first'] <= year]
            player_id = player_id[player_id['mlb_played_last'] >= year]
            print(player_id)
            print(len(player_id))
            if len(player_id) != 1:
                # Zero or ambiguous matches: the name probably needs an
                # entry in _NAME_OVERRIDES.
                print(player_id)
                print(name)
                print("Concerning")
            # Raises IndexError when the lookup found no match.
            player = statcast_pitcher(str(year) + '-1-01', str(year) + '-12-31',
                                      player_id['key_mlbam'].iloc[0])
            player.to_csv(player_csv)
        else:
            player = pd.read_csv(player_csv)
        rows.append(_summarize_pitches(name, player))

    if not rows:
        print("No pitchers to summarize for " + str(year))
        return
    out = pd.concat(rows)
    out.to_csv(str(year) + ".csv")
#Create Empty set to use for export Result = [] #Import Files # last three years of player data MarcelTable = pd.read_csv('data/marcel/MarcelTable.csv') # League Averages by Year lgAVG = pd.read_csv('data/marcel/lgAVG.csv') Year = lgAVG.index[lgAVG["Season_bat"] == 2019.0] Year = lgAVG[lgAVG['Season_bat'] == Year1].index.values.astype(int)[0] print(Year) #marcelCalculations() # Import pandas package import pandas as pd from pybaseball import pitching_stats data = pitching_stats(2019) # making data frame #data = pd.read_csv("data/marcel/MarcelTable.csv") # iterating the columns for col in data.columns: print(col)
from pybaseball import pitching_stats
import pathlib
import time

# The 50 seasons 2019, 2018, ..., 1970 as strings, newest first.
years = [str(2019 - x) for x in range(50)]

# Output folder inside the current working directory.
folder_path = pathlib.Path.cwd() / 'pitching_status'
if not folder_path.exists():
    folder_path.mkdir()

# Download each season once; an existing CSV counts as already done.
for year in years:
    print(f'Start with {year}')
    filename = f'{year}_pitching_status.csv'
    file_path = folder_path / filename
    if not file_path.exists():
        # NOTE(review): the season is passed as a string — confirm that
        # pitching_stats accepts it.
        data = pitching_stats(year)
        data.to_csv(file_path, encoding='utf_8_sig')
        print(f'Done with {year}')
    else:
        print(f'{year} already downloaded !')
"""PyBaseball.ipynb Automatically generated by Colaboratory. Original file is located at https://colab.research.google.com/drive/1BZ3IYkf_Kka410P_Hne0RCZRT84UWPFm """ !pip install pybaseball from pybaseball import statcast data = statcast(start_dt='2017-06-24', end_dt='2017-06-27') data.head(2) from pybaseball import pitching_stats data = pitching_stats(2012, 2016) data.head() from pybaseball import playerid_lookup from pybaseball import statcast_pitcher import pandas as pd csv = '2019pitchers.csv' df = pd.read_csv(csv) print(df) import pandas as pd alldata = statcast_pitcher('2019-03-27', '2019-11-01', df.get_value(0, 'MLBID')) for i in range(1, 121): data = statcast_pitcher('2019-03-27', '2019-11-01', df.get_value(i, 'MLBID')) data = data[::-1] alldata = pd.concat([alldata, data])
from pybaseball import batting_stats
from pybaseball import pitching_stats

# Disabled: the matching batting download (batting_stats is only used here).
#hitter_data = batting_stats(2015, 2017)

# Download pitching stats for the 2015-2017 range and print the table.
pitcher_data = pitching_stats(2015, 2017)
print(pitcher_data)
'K/9+': 'K_9_plus', 'BB/9+': 'BB_9_plus', 'H/9+': 'H_9_plus', 'HR/9+': 'HR_9_plus', 'LOB%+': 'LOB_pct_plus', 'WHIP+': 'WHIP_plus' }

# Upload one table per (side, year) pair. The destination table id is
# "fangraphs.<side>_<year>" and an existing table is replaced.
for year in years:
    for side in sides:
        table_name = side + '_' + str(year)
        table_id = "fangraphs." + table_name
        # NOTE(review): a `side` other than 'batting'/'pitching' leaves `df`
        # unset (or holding the previous iteration's frame) — confirm that
        # `sides` only contains these two values.
        if side == 'batting':
            df = batting_stats(year, qual=0)
        elif side == 'pitching':
            df = pitching_stats(year, qual=0)
        # presumably `renamed_columns` is the mapping whose last entries
        # appear above — confirm
        df.rename(columns=renamed_columns, inplace=True)
        print('Loading ' + table_id)
        # Every column is declared as STRING in the uploaded schema.
        table_schema = []
        for column in df.columns:
            table_schema.append({'name': column, 'type': 'STRING'})
        pandas_gbq.to_gbq(df, table_id, project_id=project_id, if_exists='replace', table_schema=table_schema, chunksize=10000)
        # try:
        # if side == 'batting':
        # df = batting_stats(year)
        # elif side == 'pitching':
def profile_pitching_stats():
    """Run a single ``pitching_stats`` query for 2019; the result is discarded."""
    season = 2019
    pitching_stats(season)
def yearGrab(currentSeason):
    """Download one season of stats and write them to CSV files.

    Files written under ``data/YearlyData/temp/``: ``bstats.csv``,
    ``team_bstats.csv``, ``pstats.csv``, ``team_pstats.csv``,
    ``statcast_exit_velocity.csv``, ``fan_bat.csv``, ``fan_pit.csv``,
    ``fan_team_bat.csv`` and ``fan_team_pit.csv``. The top-prospects table
    goes to ``data/YearlyData/Top_Prospects.csv``.

    Tables produced by an identical query are downloaded once and written
    to both of their files. The target directories must already exist.

    Args:
        currentSeason: Season passed to every per-season query.

    Returns:
        None.
    """
    import pybaseball
    from pybaseball import batting_stats
    from pybaseball import pitching_stats
    from pybaseball import statcast_batter_exitvelo_barrels
    from pybaseball import top_prospects

    # Batting Stats
    BattingStats_Year = batting_stats(currentSeason, qual=1)
    # print(BattingStats_Year.head()) # Test
    BattingStats_Year.to_csv('data/YearlyData/temp/bstats.csv')

    # Team Batting Stats
    BattingStats_Team_Year = pybaseball.team_batting(currentSeason)
    BattingStats_Team_Year.to_csv('data/YearlyData/temp/team_bstats.csv')

    # Pitching Stats
    PitchingStats_Year = pitching_stats(currentSeason)
    # print(PitchingStats_Year.head()) # Test
    PitchingStats_Year.to_csv('data/YearlyData/temp/pstats.csv')
    print(str(currentSeason) + " : Stats Grab: Successful")

    # Team Pitching Stats
    PitchingStats_Team_Year = pybaseball.team_pitching(currentSeason)
    PitchingStats_Team_Year.to_csv('data/YearlyData/temp/team_pstats.csv')

    # Disabled: Year Standings
    # if currentSeason >=1969:
    #     from pybaseball import standings
    #     # get the end-of-season division standings for each season
    #     Standings = standings(currentSeason)
    #     Standings.insert(0, 'Season', currentSeason)
    #     Standings.to_csv('data/YearlyData/temp/team_standings.csv')

    # Disabled: Amateur Draft Data
    # if currentSeason >= 1965:
    #     from pybaseball import amateur_draft
    #     # Get amateur Draft Results
    #     Number_Rounds = 30
    #     RoundRange = list(range(1, Number_Rounds + 1))
    #     for i in RoundRange:
    #         #Pull season draft data 1 round at a time
    #         Amateur_Draft = amateur_draft(currentSeason, i)
    #         #Add Draft Year
    #         Amateur_Draft.insert(0, 'Draft_Year', currentSeason)
    #         # print(Amateur_Draft.head()) # Test
    #         # Read file with all draft files so far
    #         A_Draft = pd.read_csv('data/YearlyData/temp/amateur_draft.csv')
    #         # Add New Round
    #         A_Draft = A_Draft.append(Amateur_Draft, ignore_index=True)
    #         #Save new round to File
    #         A_Draft.to_csv('data/YearlyData/temp/amateur_draft.csv', sep=',', index=False, encoding='utf-8')

    # Exit Velocity Data
    Exit_Velocity = statcast_batter_exitvelo_barrels(currentSeason)
    Exit_Velocity.insert(0, 'Season', currentSeason)
    Exit_Velocity.to_csv('data/YearlyData/temp/statcast_exit_velocity.csv')

    # FanGraph Data
    # Individual Batting Stats (default qualifier, so not the same query as
    # bstats.csv above and downloaded separately)
    fan_bat = pybaseball.batting_stats(currentSeason)
    fan_bat.to_csv('data/YearlyData/temp/fan_bat.csv')

    # The remaining three tables repeat queries already made above; reuse
    # the frames instead of downloading them a second time.
    # Individual Pitching Stats
    fan_pit = PitchingStats_Year
    fan_pit.to_csv('data/YearlyData/temp/fan_pit.csv')
    # Team Batting Stats
    fan_team_bat = BattingStats_Team_Year
    fan_team_bat.to_csv('data/YearlyData/temp/fan_team_bat.csv')
    # Team Pitching Stats
    fan_team_pit = PitchingStats_Team_Year
    fan_team_pit.to_csv('data/YearlyData/temp/fan_team_pit.csv')

    # Get top overall prospects leaguewide
    topProspects = top_prospects()
    topProspects.to_csv('data/YearlyData/Top_Prospects.csv')
def points(currentSeason):
    """Score every batter and pitcher of a season and write both rankings.

    Each stat column is multiplied by its point weight and the products
    are summed, rounded to whole points and stored in a ``Points`` column.
    The tables are sorted by ``Points`` (descending) and written to
    ``data/bstats.csv`` and ``data/pstats.csv``. Nothing is returned.

    The weights (``R``, ``Single``, ``IP``, ...) are names from outside
    this function; presumably ``pointAmountsBat()`` / ``pointAmountsPit()``
    set them — confirm. Each call is made before its weights are read.
    """
    from pybaseball import batting_stats
    data = batting_stats(currentSeason, qual=1)
    pointAmountsBat()

    total_bases = data['1B'] + (2 * data['2B']) + (3 * data['3B']) + (4 * data['HR'])
    extra_base_hits = data['2B'] + data['3B'] + data['HR']
    batting_terms = [
        R * data['R'],
        Single * data['1B'],
        Double * data['2B'],
        Triple * data['3B'],
        HR * data['HR'],
        TB * total_bases,
        RBI * data['RBI'],
        BB * data['BB'],
        K * data['SO'],
        SB * data['SB'],
        AB * data['AB'],
        Hits * data['H'],
        XBH * extra_base_hits,
        IBB * data['IBB'],
        HBP * data['HBP'],
        CS * data['CS'],
    ]  # missing Sac and GWRBI and everything past CS
    # Added left to right, starting from the first term.
    batting_points = sum(batting_terms[1:], batting_terms[0])
    data['Points'] = batting_points
    data['Points'] = data['Points'].round(decimals=0)

    batting_columns = ['Season', 'Name', 'Team', 'Age', 'Points', 'G', 'PA', 'AB', 'AVG',
                       'H', '1B', '2B', '3B', 'HR', 'R', 'RBI', 'BB', 'IBB', 'SO', 'HBP',
                       'SB', 'CS', 'Pitches']
    BattingStats = data[batting_columns].sort_values('Points', ascending=False)
    BattingStats = BattingStats.reset_index(drop=True)
    BattingStats.to_csv('data/bstats.csv')

    from pybaseball import pitching_stats
    pdata = pitching_stats(currentSeason)
    pointAmountsPit()

    pitching_terms = [
        IP * pdata['IP'],
        ER * pdata['ER'],
        K * pdata['SO'],
        SO * pdata['ShO'],
        W * pdata['W'],
        L * pdata['L'],
        SV * pdata['SV'],
        BS * pdata['BS'],
        G * pdata['G'],
        GS * pdata['GS'],
        H * pdata['H'],
        RA * pdata['R'],
        HR * pdata['HR'],
        BB * pdata['BB'],
        HB * pdata['HBP'],
        IBB * pdata['IBB'],
        B * pdata['BK'],
        # PKO * pdata['PKO'],
        # NOTE(review): the QS weight is applied to the 'BK' column, which
        # the B weight above already uses — confirm the intended column.
        QS * pdata['BK'],
        CG * pdata['CG'],
        # NH * pdata['NH'],
        # PG * pdata['PG'],
        # BF * pdata['BF'],
        PC * pdata['Pitches'],
        # SOP * pdata['SOP'],
        # HD * pdata['HD'],
    ]
    pitching_points = sum(pitching_terms[1:], pitching_terms[0])
    pdata['Points'] = pitching_points
    pdata['Points'] = pdata['Points'].round(decimals=0)

    pitching_columns = ['Season', 'Name', 'Team', 'Age', 'Points', 'W', 'L', 'ERA', 'ER',
                        'WAR', 'G', 'GS', 'CG', 'ShO', 'SV', 'BS', 'IP', 'H', 'R', 'HR',
                        'BB', 'IBB', 'HBP', 'BK', 'SO', 'TBF', 'Pitches']
    PitchingStats = pdata[pitching_columns].sort_values('Points', ascending=False)
    PitchingStats = PitchingStats.reset_index(drop=True)
    PitchingStats.to_csv('data/pstats.csv')
import json

currentDate = datetime.now()
currentMonth = f"{currentDate.month:02d}"
# From April on, use the current year plus a 7-day "recent" window ending
# today; from January to March fall back to the previous year with no
# recent window.
if int(currentMonth) > 3:
    currentYear = currentDate.year
    currentDay = date.today()
    recentDay = currentDay - timedelta(days=7)
    # Both dates are converted to strings before being passed on.
    currentDay = str(currentDay)
    recentDay = str(recentDay)
else:
    currentYear = currentDate.year -1
    currentDay = None
    recentDay = None

# Full-season tables for the selected year.
pitchingData = pitching_stats(currentYear)
battingData = batting_stats(currentYear)

if currentDay is not None:
    # Stats for the last 7 days only.
    recentPitchingData = pitching_stats_range(recentDay, currentDay)
    recentBattingData = batting_stats_range(recentDay, currentDay)
    # Each frame is serialised with orient='index', re-parsed and dumped
    # again so the file is written with a 2-space indent.
    recentPitchingDataFile = open("../public/json/pitcherRankingsRecent.json", "w")
    recentPitchingDataFile.write(json.dumps(json.loads(recentPitchingData.reset_index().to_json(orient='index')), indent=2))
    recentPitchingDataFile.close()
    recentBattingDataFile = open("../public/json/batterRankingsRecent.json", "w")
    recentBattingDataFile.write(json.dumps(json.loads(recentBattingData.reset_index().to_json(orient='index')), indent=2))
    recentBattingDataFile.close()
else:
## grammar of graphics module
from plotnine import *
## further modules
import calendar
from datetime import date

#%% get data
today = date.today()  # today as reference

## first, standard pitching stats for last 5 years
# NOTE(review): if both bounds are inclusive this range covers six seasons
# (today.year - 5 through today.year) — confirm the intended span.
end_1 = today.year
start_1 = today.year - 5  # year as integer
pitching = pitching_stats(start_1, end_1)  # takes a while

# give success message and summary stats of data set
print()
print("Sucessfully sampled pitching stats of the last 5 years:")
print()
#list(pitching.columns) #list variable names
pitching.describe()  # summary stats of pitching data frame

## then, more detailed pitching stats for last month
# Date of one month ago. January rolls back to December of the PREVIOUS
# year, and the day is clamped to the length of the target month so that
# e.g. March 31 maps to the last day of February instead of raising.
if today.month == 1:  # check if it is January
    _prev_year, _prev_month = today.year - 1, 12
else:
    _prev_year, _prev_month = today.year, today.month - 1
_last_day = calendar.monthrange(_prev_year, _prev_month)[1]
one_month_ago = today.replace(year=_prev_year, month=_prev_month,
                              day=min(today.day, _last_day))