def hit_odds(self, batter, pitcher, bt, pt, xbo):
    """Resolve batter/pitcher identifiers and compute hit odds.

    Arguments
        batter: MLBAM id, or a [first, last] name pair to look up
        pitcher: MLBAM id, or a [first, last] name pair to look up
        bt: batter's team label, used to find the game's betting line
        pt: pitcher-side parameter forwarded to hit_odds_work
        xbo: a single option value, or a list of option values

    Returns
        hit_odds_work's result, or a dict keyed by option when xbo is a
        list; None (after printing a message) when a name lookup fails.
    """
    if isinstance(batter, list):
        try:
            batter_id = playerid_lookup(batter[1], batter[0])
            if len(batter_id) > 1:
                # Ambiguous name: let the user pick the row interactively.
                print(
                    "Choose the index corresponding to the desired player:"
                )
                print(batter_id)
                desired_index = input("Index: ")
                batter_id = batter_id.loc[int(desired_index), 'key_mlbam']
            else:
                batter_id = batter_id.loc[0, 'key_mlbam']
        except Exception:
            print("Invalid batter name.")
            return
    else:
        batter_id = batter
    if isinstance(pitcher, list):
        try:
            # NOTE(review): silently takes the first match; ambiguous
            # pitcher names are not disambiguated like batters are.
            pitcher_id = playerid_lookup(pitcher[1],
                                         pitcher[0]).loc[0, 'key_mlbam']
        except Exception:
            print("Invalid pitcher name.")
            return
    else:
        pitcher_id = pitcher
    # The game-line lookup does not depend on the option, so do it once
    # instead of twice per hit_odds_work call.
    game = self.mlb_games.query("Team == @bt").reset_index()
    moneyline = game.loc[0, 'moneyline']
    total = game.loc[0, 'total']
    if isinstance(xbo, list):
        return {
            option: self.hit_odds_work(batter_id, pitcher_id, pt, option,
                                       moneyline, total)
            for option in xbo
        }
    return self.hit_odds_work(batter_id, pitcher_id, pt, xbo, moneyline,
                              total)
def get_mlbam():
    """Build a name -> MLBAM id crosswalk for the global `names` list and
    export it to CSV.

    Only names whose playerid_lookup returns exactly one row are kept;
    everything else is reported and skipped.  The old `count` guard
    (`if count <= len(names)`) was always true inside the loop, so it and
    the counter bookkeeping have been removed.
    """
    name_dict = {}
    # Known lookup quirks (kept from the original notes):
    #   J.A. Happ x
    #   Ronald Acuna Jr. => Ronald Acuna
    #   Henderson Alvarez III => Henderson Alvarez
    for name in names:
        first_name = name.split()[0].lower()
        last_name = name.split()[1].lower()
        temp_id = playerid_lookup(last_name, first_name)
        if len(temp_id) == 1:
            name_dict[name] = int(temp_id['key_mlbam'])
        else:
            # Zero or multiple matches: skip rather than guess.
            print(name, "Not added")
    df = pd.DataFrame.from_dict(name_dict, orient='index', columns=['mlbam'])
    df.reset_index(level=0, inplace=True)
    df.rename(index=str, columns={"index": "name"}, inplace=True)
    print(df)
    df.to_csv(
        r'/Users/blaisepage/Documents/CUBoulder/Sabermetrics/export_dataframe.csv'
    )
def get_data(first_name, last_name, start_date, end_date):
    """Fetch and tidy a pitcher's statcast pitches between two dates.

    Arguments
        first_name, last_name: pitcher's name, passed to playerid_lookup
        start_date, end_date: date strings forwarded to statcast_pitcher

    Returns a DataFrame sorted in pitch order with renamed columns.
    Raises ValueError when the id lookup finds no match.
    """
    try:
        key = pb.playerid_lookup(
            last_name, first_name)["key_mlbam"].values[0]  # unique pitcher identifier
    except IndexError:
        # BUG FIX: the original `except: pass` left `key` undefined and the
        # next line raised a confusing NameError; fail loudly instead.
        raise ValueError(
            "No MLBAM id found for {} {}".format(first_name, last_name))
    data = pb.statcast_pitcher(start_date, end_date,
                               key)  # dataset of pitches thrown by pitcher
    data = data.sort_values(["pitch_number"
                             ])  # sort pitches by order thrown, earliest first
    data = data.dropna(subset=[
        "pitch_type", "des", "description", "release_spin_rate"
    ])  # make sure dataset does not contain nulls
    data["order"] = data.reset_index(
    ).index  # create new column with pitch order
    df = pd.DataFrame(data)
    df = df.rename(
        {
            "des": "Play by Play",
            "description": "Result of Pitch",
            "order": "Pitch Number",
            "pitch_name": "Pitch Type",
            "release_speed": "Pitch Speed",
        },
        axis=1,
    )
    return df
def get_atbats(first, last):
    """Collect an AtBat object (and its array form) for every strikeout
    recorded by the given pitcher across their scraped career."""
    # Resolve the player and the span of seasons to pull.
    info = playerid_lookup(last, first)
    player_id = info["key_mlbam"].iloc[0]  # assume only one line
    first_year = int(info["mlb_played_first"].iloc[0])
    last_year = int(info["mlb_played_last"].iloc[0])
    if last_year == 2019:  # ignore this year
        last_year = 2018
    # Scrape the full span in one statcast call.
    start_date = "{0}-01-01".format(first_year)
    end_date = "{0}-12-31".format(last_year)
    print("Scraping from {0} to {1}".format(start_date, end_date))
    d_all_stats = statcast_pitcher(start_date, end_date, player_id)
    d_features = d_all_stats[features]
    # Build one AtBat per strikeout event.
    at_bats, ab_arrays = [], []
    for idx in d_all_stats.index[d_all_stats["events"] == "strikeout"].to_list():
        ab = AtBat(d_features, idx)
        at_bats.append(ab)
        ab_arrays.append(ab.np)
    return at_bats, ab_arrays
def get_number(last, first, year=2019):
    """Return the MLBAM key of the player named last/first whose final
    MLB season was `year`.

    Generalized: the season used to be hard-coded to 2019; the default
    keeps the old behavior for existing callers.
    Raises KeyError when no matching player survives the filter.
    """
    playerTable = playerid_lookup(last, first)
    # Keep only players whose last season matches, then renumber the rows
    # so positional access below is stable.
    playerTable = playerTable.loc[playerTable['mlb_played_last'].isin([year])]
    playerTable.index = range(len(playerTable['mlb_played_last']))
    number = playerTable['key_mlbam']
    number = number[0]
    return number
def filter_batting_player(league_data='all_outcomes_2018.csv',
                          player_last='Vogelbach',
                          player_first='Daniel',
                          fname=None):
    """
    Filter league-wide batting data down to one player.

    Use *after* get_league_batting_data(); with league data already on
    disk this is faster than get_batting_player() because it never
    re-queries statcast.

    Arguments
        league_data: any .csv file returned from get_league_batting_data
        player_last: last name of player
        player_first: first name of player
        fname: name of .csv file to export
            defaults to [player_last]_[player_first].csv

    Returns a dataframe; also saves to 'fname' when fname is not None.
    """
    # Resolve the player's mlbam id (mlb advanced metrics id).
    # note: for players with the same first+last name this picks the one
    # who entered the league first -- pick unique names for now,
    # sorry Chris Davis
    mlbam_id = playerid_lookup(player_last,
                               player_first)['key_mlbam'].values[0]
    season = pd.read_csv(league_data)
    subset = season[season['batter'] == mlbam_id]
    subset.reset_index(inplace=True, drop=True)
    if fname is not None:
        subset.to_csv(fname, index=False)
    return subset
def get_player_batted_balls(playerFirst, playerLast, dateGT, dateLT):
    """Download every batted ball for one player between dateGT and dateLT
    (inclusive strings, 'YYYY-MM-DD') from the Baseball Savant CSV search,
    split into one request per calendar month, and return the
    concatenated DataFrame.

    NOTE(review): yearLT is parsed but never used -- a range that crosses
    a year boundary (monthLT < monthGT, or yearLT > yearGT) falls through
    both branches and pd.concat([]) raises; presumably callers only pass
    same-year ranges.  Also assumes playerid_lookup's first row (index 0)
    is the desired player -- TODO confirm.
    """
    results = []
    #all batted balls for a single player
    url = 'https://baseballsavant.mlb.com/statcast_search/csv?all=true&hfPT=&hfAB=single%7Cdouble%7Ctriple%7Chome%5C.%5C.run%7Cfield%5C.%5C.out%7Cdouble%5C.%5C.play%7Cfield%5C.%5C.error%7Cgrounded%5C.%5C.into%5C.%5C.double%5C.%5C.play%7Cfielders%5C.%5C.choice%7Cfielders%5C.%5C.choice%5C.%5C.out%7Cforce%5C.%5C.out%7Csac%5C.%5C.bunt%7Csac%5C.%5C.bunt%5C.%5C.double%5C.%5C.play%7Csac%5C.%5C.fly%7Csac%5C.%5C.fly%5C.%5C.double%5C.%5C.play%7Ctriple%5C.%5C.play%7C&hfBBT=&hfPR=&hfZ=&stadium=&hfBBL=&hfNewZones=&hfGT=R%7C&hfC=&hfSea=&hfSit=&player_type=batter&hfOuts=&opponent=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt={}&game_date_lt={}&hfInfield=&team=&position=&hfOutfield=&hfRO=&home_road=&batters_lookup%5B%5D={}&hfFlag=&hfPull=&metric_1=&hfInn=&min_pitches=0&min_results=0&group_by=name-event&sort_col=pitches&player_event_sort=h_launch_speed&sort_order=desc&min_pas=0&type=details&'
    #convert playername into Savant PlayerID for search
    playerIDtable = pybaseball.playerid_lookup(playerLast, playerFirst)
    player = playerIDtable.loc[0].key_mlbam
    #convert year, month, and day into individual integers
    monthGT = int(dateGT[5:7])
    monthLT = int(dateLT[5:7])
    yearGT = int(dateGT[0:4])
    yearLT = int(dateLT[0:4])  # NOTE(review): never used below
    dayGT = int(dateGT[8:10])
    dayLT = int(dateLT[8:10])
    #if spanning more than a month get needed days of first month, then months in between, then needed days of last month
    if monthLT > monthGT:
        # First partial month: dayGT through the end of monthGT.
        dateGT = datetime.date(yearGT, monthGT, dayGT)
        dateGT = dateGT.strftime("%Y-%m-%d")
        dateLT = datetime.date(yearGT, monthGT + 1,
                               1) - datetime.timedelta(days=1)
        dateLT = dateLT.strftime("%Y-%m-%d")
        data = requests.get(url.format(dateGT, dateLT, player)).content
        df = pd.read_csv(io.StringIO(data.decode('utf-8')))
        results.append(df)
        if ((monthLT - monthGT) > 1):
            # Whole months strictly between the endpoints.
            for month in range((monthGT + 1), monthLT):
                dateGT = datetime.date(yearGT, month, 1)
                dateGT = dateGT.strftime("%Y-%m-%d")
                dateLT = datetime.date(yearGT, month + 1,
                                       1) - datetime.timedelta(days=1)
                dateLT = dateLT.strftime("%Y-%m-%d")
                data = requests.get(url.format(dateGT, dateLT, player)).content
                df = pd.read_csv(io.StringIO(data.decode('utf-8')))
                results.append(df)
        # Last partial month: the 1st of monthLT through dayLT.
        dateGT = datetime.date(yearGT, monthLT, 1)
        dateGT = dateGT.strftime("%Y-%m-%d")
        dateLT = datetime.date(yearGT, monthLT, dayLT)
        dateLT = dateLT.strftime("%Y-%m-%d")
        data = requests.get(url.format(dateGT, dateLT, player)).content
        df = pd.read_csv(io.StringIO(data.decode('utf-8')))
        results.append(df)
    #if months equal just grab the days needed
    if monthLT == monthGT:
        dateGT = datetime.date(yearGT, monthGT, dayGT)
        dateGT = dateGT.strftime("%Y-%m-%d")
        dateLT = datetime.date(yearGT, monthLT, dayLT)
        dateLT = dateLT.strftime("%Y-%m-%d")
        data = requests.get(url.format(dateGT, dateLT, player)).content
        df = pd.read_csv(io.StringIO(data.decode('utf-8')))
        results.append(df)
    return pd.concat(results)
def collect_statcast(sample_size, target, features, pitcher_names):
    """Scrapes the Statcast data for each pitcher based on specified
    criteria.

    Arguments:
        sample_size {int} -- the number of pitches to collect for each pitcher
        target {list} -- the pitch-result categories to keep
        features {list} -- the feature columns to keep in the result
        pitcher_names {list} -- pitcher names from the read_pitchers function

    Returns:
        pandas dataframe -- one row per sampled pitch, one column per
        entry in `features`.
    """
    #loop through all the names
    print('Begin scraping \n')
    final_data = pd.DataFrame(columns=features)
    for i, pitcher in enumerate(pitcher_names):
        if len(pitcher) == 2:
            fname, lname = pitcher[0], pitcher[1]
        elif len(pitcher) >= 3:
            # multi-part last names ("De La Rosa", etc.)
            fname, lname = pitcher[0], " ".join(pitcher[1:])
        else:
            # BUG FIX: was `pass`, which fell through and re-scraped the
            # PREVIOUS pitcher's name for this entry.
            continue
        print(
            f'\n Pitcher Name: {fname} {lname}, #: {i+1}/{len(pitcher_names)} \n'
        )
        #grab the unique identifier of the pitcher
        player = playerid_lookup(lname, fname)
        # If the ID grab / scrape / sample fails (e.g. too few pitches for
        # `sample_size`), skip this pitcher and move on.
        try:
            ID = player['key_mlbam'].iloc[player['key_mlbam'].argmax()]
            df = statcast_pitcher('2018-03-29', '2018-09-30', player_id=ID)
            df = df[df['description'].isin(target)].sample(sample_size,
                                                           random_state=2019)
            # pd.concat replaces DataFrame.append (removed in pandas 2.0).
            final_data = pd.concat([final_data, df[features]],
                                   ignore_index=True)
        except ValueError:
            pass
    print('Finished Scraping')  # typo fix ("Finsihed")
    return final_data
def player(first_name, last_name, start_date, end_date):
    """Return a batter's statcast data between start_date and end_date.

    Looks up the player's MLBAM id by name, then pulls statcast_batter
    data and resets the index.
    """
    player_info = pybaseball.playerid_lookup(last_name, first_name)
    player_id = player_info['key_mlbam'][0]
    # BUG FIX: this call previously passed `player_info[0]` -- a column
    # lookup on the DataFrame that raises KeyError -- instead of the
    # `player_id` computed above.
    data = pybaseball.statcast_batter(start_dt=start_date,
                                      end_dt=end_date,
                                      player_id=player_id)
    data = data.reset_index(drop=True)
    return data
def process_data(time, firstname, lastname, pos):
    """Pull statcast data for a player over a time spec.

    Arguments
        time: either a single date string, or 'YYYY-MM-DD to YYYY-MM-DD'
        firstname, lastname: player name for the id lookup
        pos: 'batter' or 'pitcher'; anything else returns None

    Returns the statcast DataFrame, or None for an unknown `pos`.
    """
    # Reject unknown positions before touching the lookup result, so the
    # original None-return is preserved even when the lookup is empty.
    if pos not in ('batter', 'pitcher'):
        return None
    playerid = playerid_lookup(lastname, firstname)
    # Hoisted: the id conversion used to be repeated in every branch.
    pid = int(playerid['key_mlbam'][0])
    if "to" in time:
        # Range form: first 10 chars are the start, chars after ' to '
        # are the end.
        start, end = time[0:10], time[14:]
        if pos == 'batter':
            return statcast_batter(start, end, player_id=pid)
        return statcast_pitcher(start, end, player_id=pid)
    if pos == 'batter':
        return statcast_batter(time, player_id=pid)
    return statcast_pitcher(time, player_id=pid)
def getplayer(first, last):
    """Return a dict of display labels -> statcast ids for players
    matching the given name (for building a selection list)."""
    # An empty first name means "match on last name only".
    lookup_first = None if first == '' else first
    candidates = playerid_lookup(last, lookup_first)
    candidates = candidates[candidates.mlb_played_first.notnull()]
    # TODO: figure out how to deal with players with the same name
    choices = {}
    for _, row in candidates.iterrows():
        sid = row['key_mlbam']  # statcast ID
        label = f"{row['name_first']} {row['name_last']} - {sid}".title()
        choices[label] = sid
    return choices
def get_mlbam_id(player_dict):
    """Resolve the MLBAM id for a player dict, or None for non-MLB
    players and failed lookups."""
    if not player_dict["is_mlb"]:
        return None
    first_name, last_name = clean_name(
        player_dict["Player Name"]).split(maxsplit=1)
    if "." in first_name:
        # lookup has "A.J." as "A. J." for some reason
        first_name = first_name.replace(".", ". ").strip()
    matches = playerid_lookup(last_name, first_name)
    n_matches = matches.shape[0]
    if n_matches == 0:
        # how could this happen?
        return None
    if n_matches == 1:
        return matches.key_mlbam.values[0]
    # Several matches: prefer the most recently active player.
    newest = matches.mlb_played_last.max()
    return matches.loc[matches.mlb_played_last == newest].key_mlbam.values[0]
def getPlayerIDs(players):
    """Look up the baseball-reference id for each 'First Last' name.

    Returns the ids in input order; names with no usable id map to
    "PLAYER NOT FOUND".  (The list is now returned as well as printed.)
    """
    listOfIDs = []
    for i in range(0, len(players)):
        splitPlayer = players[i].split(' ')
        first = splitPlayer[0]
        last = splitPlayer[1]
        ident = playerid_lookup(last, first)
        identList = ident['key_bbref'].tolist()
        # Walk back from the end, skipping NaN ids.
        # BUG FIX: the bound is now tested BEFORE indexing; the old order
        # (`identList[counter] == 'nan' and counter > -1`) let counter
        # reach -1 and silently wrap to identList[-1], so an all-NaN list
        # appended a NaN instead of reporting the player as missing.
        counter = len(identList) - 1
        while counter >= 0 and str(identList[counter]) == 'nan':
            counter -= 1
        if counter >= 0:
            listOfIDs.append(identList[counter])
        else:
            listOfIDs.append("PLAYER NOT FOUND")
        print('added player ' + str(i))
    print(listOfIDs)
    return listOfIDs
def get_fangraphs_id(player_dict):
    """Figure out a way to refactor this and get_mlbam_id()"""
    # Minor leaguers have no MLB lookup entry.
    if not player_dict["is_mlb"]:
        return None
    name = clean_name(player_dict["Player Name"])
    first, last = name.split(maxsplit=1)
    if "." in first:
        # lookup has "A.J." as "A. J." for some reason
        first = first.replace(".", ". ").strip()
    matches = playerid_lookup(last, first)
    count = matches.shape[0]
    if count > 1:
        # Duplicate names: take the one whose career ended most recently.
        latest = matches.mlb_played_last.max()
        return matches.loc[
            matches.mlb_played_last == latest].key_fangraphs.values[0]
    if count == 1:
        return matches.key_fangraphs.values[0]
    # how could this happen?
    return None
def data_from_name(last, first, year1=2020, num_years=1):
    """Pull a batter's non-home-run balls in play for the seasons
    [year1, year1 + num_years).

    Returns a statcast DataFrame with launch/spray nulls dropped, or
    None when the player cannot be uniquely identified (the original
    fell through with `mlb_id` undefined and raised NameError).
    """
    years = range(year1, year1 + num_years)
    lookup = pybaseball.playerid_lookup(last, first)
    if len(lookup) > 1:
        print('Multiple players found, determining player by years.')
        # Score each candidate by the overlap between their career span
        # and the requested seasons.
        # BUG FIX: +1 so the final season counts -- range() excludes its
        # end, which dropped players whose only overlap was their last year.
        lookup['int'] = lookup.apply(lambda row: len(
            set(
                range(int(row['mlb_played_first']),
                      int(row['mlb_played_last']) + 1)) & set(years)),
                                     axis=1)
        lookup = lookup[lookup['int'] == max(lookup['int'])]
    if len(lookup) != 1:
        # Covers both "still ambiguous" and "no match at all".
        print('Unable to determine player')
        return None
    mlb_id = int(lookup['key_mlbam'].iloc[0])
    data = pybaseball.statcast_batter(f'{year1}-01-01',
                                      f'{year1+num_years}-01-01', mlb_id)
    data = data[data.apply(
        lambda row: 'hit_into_play' in row['description'], axis=1)]
    data = data[data['events'] != 'home_run']
    data = data.dropna(
        how='any', subset=['launch_angle', 'launch_speed', 'hc_x', 'hc_y'])
    return data
# https://pypi.org/project/baseball-scraper/ # https://pypi.org/project/vigorish/ # Useful source to compare packages: # https://snyk.io/advisor/python ############## Statcast data ############## data = statcast(start_dt='2021-04-01', end_dt='2021-04-02', team='SEA') data.loc[data.game_date == '2021-04-01', :] data.head() ############## Team Crosswalk ############# team_cross = teams() team_cross = team_ids() ############# Player Crosswalk ############ player = playerid_lookup('Sheffield', 'Justus') ############## Date Range ################# dt_lst = pd.date_range(start='2021-04-01', end='2021-05-31', freq='D') dt_range = [] for i in dt_lst: dt_range.append(i.strftime('%Y-%m-%d')) ############### Pitcher Profile ############## # Game by Game # TODO remove data.GS filter if want to look at all pitchers ; for now just looking at starters table_lst = [] for x in dt_range: data = pitching_stats_range(start_dt=x, end_dt=x) data = data.loc[data.GS == 1, :] table_lst.append(data)
def get_data(first_name, last_name):
    """Build train/test pitch datasets for one pitcher.

    Training data is 2015-2017, test data 2018-2019.  Both are cached
    under Data/ as CSVs keyed by the player's name; cached files are
    reused on later runs.  Returns (train_inputs, train_labels, test).
    """
    train_filename = 'Data/' + str(last_name) + "_" + str(
        first_name) + "_train.csv"
    test_filename = 'Data/' + str(last_name) + "_" + str(
        first_name) + "_test.csv"
    if os.path.isfile(train_filename) and os.path.isfile(
            test_filename):  #If we've already gotten the data, read it in
        train_data = pd.read_csv(train_filename)
        test_data = pd.read_csv(test_filename)
    else:  #If we haven't, get it off the web and store it for future runs
        # BUG FIX: the lookup previously hardcoded ('sale', 'chris') and
        # ignored the name arguments entirely.  Look the player up once.
        player_id = int(
            playerid_lookup(last_name.lower(),
                            first_name.lower())['key_mlbam'])
        #training is done on data from 2015 through 2017
        train_data = statcast_pitcher(start_dt='2015-01-01',
                                      end_dt='2017-12-31',
                                      player_id=player_id)
        train_data.to_csv(train_filename)
        #testing is done on data from the beginning of 2018 to present
        test_data = statcast_pitcher(start_dt='2018-01-01',
                                     end_dt='2019-12-31',
                                     player_id=player_id)
        test_data.to_csv(test_filename)
    #Get all of the pitch types that a pitcher throws, then encode them using our system
    train_data = train_data[train_data['pitch_type'].isin(pitcher_pitches)]
    train_data = train_data.dropna(subset=['pitch_type'])
    train_data['pitch_code'] = train_data.apply(
        lambda row: get_pitch_code(row, pitcher_pitches), axis=1)
    #Do the same as above but for the testing data in case they added a new pitch
    test_data = test_data[test_data['pitch_type'].isin(pitcher_pitches)]
    test_data = test_data.dropna(subset=['pitch_type'])
    #Encode all the pitch type/location info to a unique int
    test_data['pitch_code'] = test_data.apply(
        lambda row: get_pitch_code(row, pitcher_pitches), axis=1)
    train_data = get_prev_pitch(train_data)
    test_data = get_prev_pitch(test_data)
    #Fills the Na values, turns the batter ID for the player on base into a bool value
    for col in ('on_3b', 'on_2b', 'on_1b'):
        train_data[col] = train_data[col].fillna(
            value=0).astype(bool).astype(int)
        test_data[col] = test_data[col].fillna(
            value=0).astype(bool).astype(int)
    #Get the data we need and drop any null values (which is why it double selects)
    feature_cols = [
        'prev_pitch_3', 'prev_pitch_2', 'prev_pitch_1', 'balls', 'strikes',
        'stand', 'on_3b', 'on_2b', 'on_1b', 'outs_when_up', 'pitch_number'
    ]
    train_data_input = train_data[feature_cols + ['pitch_code']].dropna()
    train_data_result = train_data_input[['pitch_code']]
    train_data_input = train_data_input[feature_cols]
    test_data = test_data[feature_cols + ['pitch_code']].dropna()
    return train_data_input, train_data_result, test_data
def get_batting_player(start_dt=None,
                       end_dt=None,
                       player_last='Vogelbach',
                       player_first='Daniel',
                       fname_all=None,
                       fname_bb=None,
                       features=None):
    """
    Pull player statcast batting data from baseballsavant using pybaseball
    https://github.com/jldbc/pybaseball
    https://baseballsavant.mlb.com/statcast_search

    Arguments
        start_dt: get data from start_dt forward
        end_dt: get data up to end_dt
        player_last: player's last name
        player_first: player's first name
        fname_all: export csv of all statcast at bat outcomes to this file
            **must be .csv**
        fname_bb: export csv of all outcomes with a batted ball to this file
            **must be .csv**
        features: columns to keep; None selects the standard set below

    Returns (all_outcomes, batted_balls) tuple of dataframes
    Saves to files 'fname_all' and 'fname_bb' if fname is not None
    """
    # BUG FIX (idiom): the default used to be a mutable list literal,
    # which is shared across calls; use the None-sentinel pattern.
    if features is None:
        features = [
            'events', 'description', 'batter', 'stand', 'launch_angle',
            'launch_speed', 'hc_x', 'hc_y', 'pitcher', 'p_throws',
            'pitch_type', 'release_speed', 'release_spin_rate'
        ]
    # get player's mlbam_id (mlb advanced metrics id)
    # note: for players the same first+last name, this will get the
    # player who entered the league first
    # need to fix -- for now pick players with unique names
    # sorry Chris Davis :p
    player_id = playerid_lookup(player_last, player_first)['key_mlbam'].values[0]
    # get statcast data (this can take awhile)
    print('Querying batting stats for {} {}'.format(player_first, player_last))
    df = statcast_batter(start_dt, end_dt, player_id)
    # discard null events
    all_outcomes = df[df['events'].notnull()]
    # get the specified features only
    all_outcomes = all_outcomes[features]
    if fname_all is not None:
        # export to csv
        all_outcomes.to_csv(fname_all, index=False)
        print('Exported: {}'.format(fname_all))
    # get batted balls only
    batted_balls = filter_batted_balls(all_outcomes)
    if fname_bb is not None:
        # export data
        batted_balls.to_csv(fname_bb, index=False)
        print('Exported: {}'.format(fname_bb))
    return (all_outcomes, batted_balls)
df = pd.read_csv('gather_ids.csv')
df_noIDs = df[df['mlb_id'].isnull()]
df_noIDs = df_noIDs[['First','Last']]
df_noIDs.to_csv('df_noIDs.csv')
'''
# Loading the hitter dataframe; will ammend each individual here and then re-save and overwrite the loaded file
df_hitters = pd.read_csv('./data/gather_ids.csv')
'''
I have loaded each individual player missing IDs below, commenting each of their names out and I will go through them one-by-one and update their IDs...
'''
# NOTE(review): the quoting above is reconstructed from a collapsed
# source; presumably the df_hitters load is live code and the prose note
# sits inside a triple-quoted string -- verify against the original file.
#16,J.P.,Arencibia
# Single-row lookup by last name only; .item() assumes exactly one match.
jp_arencibia = pybaseball.playerid_lookup(last='arencibia')
df_hitters['mlb_id'] = np.where(
    ((df_hitters.First == 'J.P.') & (df_hitters.Last == 'Arencibia')),
    int(jp_arencibia['key_mlbam'].item()),
    df_hitters['mlb_id']
)
df_hitters['retro_id'] = np.where(
    ((df_hitters.First == 'J.P.') & (df_hitters.Last == 'Arencibia')),
    str(jp_arencibia['key_retro'].item()),
    df_hitters['retro_id']
)
df_hitters['bbref_id'] = np.where(
    ((df_hitters.First == 'J.P.') & (df_hitters.Last == 'Arencibia')),
    str(jp_arencibia['key_bbref'].item()),
    df_hitters['bbref_id']
)
df_hitters['fangraphs_id'] = np.where(
    ((df_hitters.First == 'J.P.') & (df_hitters.Last == 'Arencibia')),
    int(jp_arencibia['key_fangraphs'].item()),
    df_hitters['fangraphs_id']
)
df_hitters['first_played'] = np.where(
    ((df_hitters.First == 'J.P.') & (df_hitters.Last == 'Arencibia')),
    int(jp_arencibia['mlb_played_first'].item()),
    df_hitters['first_played']
)
df_hitters['last_played'] = np.where(
    ((df_hitters.First == 'J.P.') & (df_hitters.Last == 'Arencibia')),
    int(jp_arencibia['mlb_played_last'].item()),
    df_hitters['last_played']
)
print(df_hitters[df_hitters['Last'] == 'Arencibia'])
#28,Jose,Bautista
# .iloc[1] deliberately takes the SECOND Jose Bautista returned by the
# lookup (duplicate names) -- note this is a Series, hence no .item()
# on the retro/bbref keys below.
jose_bautista = pybaseball.playerid_lookup(last='bautista', first='jose').iloc[1]
df_hitters['mlb_id'] = np.where(
    ((df_hitters.First == 'Jose') & (df_hitters.Last == 'Bautista')),
    int(jose_bautista['key_mlbam'].item()),
    df_hitters['mlb_id']
)
df_hitters['retro_id'] = np.where(
    ((df_hitters.First == 'Jose') & (df_hitters.Last == 'Bautista')),
    str(jose_bautista['key_retro']),
    df_hitters['retro_id']
)
df_hitters['bbref_id'] = np.where(
    ((df_hitters.First == 'Jose') & (df_hitters.Last == 'Bautista')),
    str(jose_bautista['key_bbref']),
    df_hitters['bbref_id']
)
df_hitters['fangraphs_id'] = np.where(
    ((df_hitters.First == 'Jose') & (df_hitters.Last == 'Bautista')),
    int(jose_bautista['key_fangraphs'].item()),
    df_hitters['fangraphs_id']
)
df_hitters['first_played'] = np.where(
    ((df_hitters.First == 'Jose') & (df_hitters.Last == 'Bautista')),
    int(jose_bautista['mlb_played_first'].item()),
    df_hitters['first_played']
)
df_hitters['last_played'] = np.where(
    ((df_hitters.First == 'Jose') & (df_hitters.Last == 'Bautista')),
    int(jose_bautista['mlb_played_last'].item()),
    df_hitters['last_played']
)
# Interactive driver: simulate a batting-average distribution from a
# player's expected-BA (xBA) per batted ball.
print("Enter player's first name: ")
firstName = input()
print("Enter player's last name: ")
lastName = input()
print("Enter Start Date (YYYY-MM-DD): ")
fromDate = input()
print("Enter End Date (YYYY-MM-DD): ")
toDate = input()
playerBattedBalls = get_player_batted_balls(firstName, lastName, fromDate, toDate)
playerBattedBalls = playerBattedBalls.reset_index(drop=True)
# Per-batted-ball hit probability from Savant's speed/angle model.
xBA = playerBattedBalls["estimated_ba_using_speedangle"]
playerIDtable = pybaseball.playerid_lookup(lastName, firstName)
# NOTE(review): assumes the first lookup row is the desired player.
player = playerIDtable.loc[0].key_mlbam
playerBattedBallsMore = pybaseball.statcast_batter(fromDate, toDate, player)
# Monte Carlo: 100,000 simulated seasons.  Each batted ball becomes a
# hit with probability xBA[i]; the denominator adds strikeouts so BA is
# per at-bat, not per batted ball -- TODO confirm other outs (e.g.
# non-batted-ball outs) are intentionally excluded.
BAlist = []
for x in range(100000):
    hit = 0
    for i in range(0, len(xBA)):
        rand = random.uniform(0, 1)
        if rand < xBA[i]:
            hit = hit + 1
    BA = hit / ((len(xBA) + len(playerBattedBallsMore.events[
        playerBattedBallsMore.events == 'strikeout'])))
    BAlist.append(BA)
def main():
    """Scrape an Ottoneu transactions page, enrich each added player with
    salary info, MLBAM id, and (for hitters) exit-velocity metrics.

    NOTE(review): result is only printed (first player), not returned or
    saved -- presumably a work-in-progress driver.
    """
    league_id = "953"  # put this in env
    base_url = "https://ottoneu.fangraphs.com"
    current_year = 2020
    auction_url = f"{base_url}/{league_id}/transactions"
    # for testing -- this overwrites the URL built above
    auction_url = "https://ottoneu.fangraphs.com/953/transactions?filters%5B%5D=cut&filters%5B%5D=increase"
    resp = requests.get(auction_url)
    soup = BeautifulSoup(resp.content, "html.parser")
    table = soup.find("table")
    # Header row text becomes the keys of each player dict.
    thead = [th.get_text() for th in table.find("thead").find_all("th")]
    auction_players = list()
    for tr in tqdm(table.find("tbody").find_all("tr")):
        player_data = [td.get_text().strip() for td in tr.find_all("td")]
        # The row's playercard link carries the ottoneu player id.
        player_page_url = [
            a["href"] for a in tr.find_all("a") if "playercard" in a["href"]
        ].pop()
        player_dict = dict(zip(thead, player_data))
        # Only "add" transactions are of interest here.
        if player_dict["Transaction Type"] != "add":
            continue
        player_dict["ottoneu_id"] = player_page_url.rsplit("=")[1]
        player_salary_dict = get_ottoneu_player_page(player_dict["ottoneu_id"],
                                                     league_id)
        player_dict.update(player_salary_dict)
        player_name = clean_name(player_dict["Player Name"])
        first_name, last_name = player_name.split(maxsplit=1)
        if player_dict["is_mlb"]:
            if "." in first_name:
                # lookup has "A.J." as "A. J." for some reason
                first_name = first_name.replace(".", ". ").strip()
            id_lookup = playerid_lookup(last_name, first_name)
            if id_lookup.shape[0] > 1:
                # Duplicate names: take the most recently active player.
                player_dict["mlbam_id"] = id_lookup.loc[
                    id_lookup.mlb_played_last ==
                    id_lookup.mlb_played_last.max()].key_mlbam.values[0]
            else:
                # NOTE(review): an empty lookup raises IndexError here --
                # TODO confirm every is_mlb player resolves.
                player_dict["mlbam_id"] = id_lookup.key_mlbam.values[0]
        is_hitter, is_pitcher = get_position_group(player_dict["positions"])
        player_dict["is_hitter"] = is_hitter
        player_dict["is_pitcher"] = is_pitcher
        auction_players.append(player_dict)
    hitters = [player for player in auction_players if player["is_hitter"]]
    pitchers = [player for player in auction_players if player["is_pitcher"]]
    if hitters:
        # setting minBBE = 0 to avoid not getting someone
        # get rid of this indentation and just pull exit velo #s regardless?
        exit_velo_data = statcast_batter_exitvelo_barrels(current_year,
                                                          minBBE=0)
        for player in hitters:
            if not player["is_mlb"]:  # avoid index error for minor leaguers
                continue
            player_exit_velo = (exit_velo_data.loc[
                exit_velo_data.player_id == player["mlbam_id"]].to_dict(
                    "records").pop())
            # add anything else?
            player["avg_exit_velo"] = player_exit_velo["avg_hit_speed"]
            player["max_exit_velo"] = player_exit_velo["max_hit_speed"]
            player["barrel_pa_rate"] = player_exit_velo["brl_pa"]
            player["barrel_bbe_rate"] = player_exit_velo["brl_percent"]
    if pitchers:
        # currently pybaseball only has individual pitcher data
        pass
    print(auction_players[0])
def get_data(year = 2018, minimum_starts = 5):
    """Build a per-pitcher feature table (pitch-type summary stats) for
    one season and write it to '<year>.csv'.

    Season-level pitching stats and each pitcher's raw statcast data are
    cached on disk under '<year>/' and reused on later runs.
    """
    if not os.path.exists(str(year)):
        os.mkdir(str(year))
    # Season stats: fetch + filter by games started, or load from cache.
    if not os.path.exists(os.path.join(str(year), "Players_Stats_"+str(year)+".csv")):
        player_stats = pitching_stats(year, year)
        player_stats = player_stats[player_stats['GS']>minimum_starts]
        player_stats.to_csv(os.path.join(str(year), "Players_Stats_"+str(year)+".csv"))
    else:
        player_stats = pd.read_csv(os.path.join(str(year), "Players_Stats_"+str(year)+".csv"))
    out = None
    for name in player_stats['Name']:
        if not os.path.exists(os.path.join(str(year),'player')):
            os.mkdir(os.path.join(str(year),'player'))
        splitname = name.split(' ')
        # Database is really good and has some mistakes, so when we go to the lookup table for MLB Player IDs sometimes
        # it doesn't match up. This corrects the issues that I've found. Obviously this won't work for every year
        # out of the box because of this.
        splitname[0] = splitname[0].replace('.', '. ', 1)
        # print(splitname[0])
        # NOTE(review): the replace above already turns 'J.A.' into
        # 'J. A.', so this branch looks unreachable -- verify.
        if splitname[0] == 'J.A.':
            splitname[0] = 'J. A.'
        if name == 'Zack Wheeler':
            splitname[0] = 'Zach'
        if name == 'Matthew Boyd':
            splitname[0] = 'Matt'
        if name == 'C.J. Wilson':
            splitname[0] = 'c. j.'
        if name == 'R.A. Dickey':
            splitname[0] = 'R. A.'
        if name == 'Jon Niese':
            splitname[0] = 'Jonathon'
        if name == 'A.J. Burnett':
            splitname[0] = 'A. J.'
        if name == 'Jorge De La Rosa':
            splitname[0] = 'Jorge'
            splitname[1] = 'De La Rosa'
        if name == 'Rubby de la Rosa':
            splitname[0] = 'Rubby'
            splitname[1] = 'de la Rosa'
        if name == 'Cole DeVries':
            splitname[1] = 'De Vries'
        if name == 'Samuel Deduno':
            splitname[0] = 'Sam'
        if name == 'JC Ramirez':
            splitname[0] = 'J. C.'
        if name == 'Nathan Karns':
            splitname[0] = 'Nate'
        if name == 'Daniel Ponce de Leon':
            splitname[1] = 'Ponce de Leon'
        if name == 'Chi Chi Gonzalez':
            splitname[0] = 'Chi Chi'
            splitname[1] = 'Gonzalez'
        if name == 'Josh A. Smith':
            splitname[0] = 'Josh'
            splitname[1] = 'Smith'
        if name == 'Joel De La Cruz':
            splitname[1] = 'De La Cruz'
        # Per-pitcher statcast data: scrape once, then reuse the CSV.
        if not os.path.exists(os.path.join(str(year), 'player', name+'-'+str(year)+'.csv')):
            player_id = playerid_lookup(splitname[1], splitname[0])
            print(year)
            # Narrow duplicate-name matches to players active in `year`.
            player_id = player_id[player_id['mlb_played_first'] <= year]
            player_id = player_id[player_id['mlb_played_last'] >= year]
            print(player_id)
            print(len(player_id))
            if len(player_id) != 1:
                print(player_id)
                print(name)
                print("Concerning")
            # NOTE(review): .iloc(0)[0] is an unusual spelling of
            # .iloc[0] on the key_mlbam column -- verify intent.
            player = statcast_pitcher(str(year)+'-1-01', str(year)+'-12-31', player_id['key_mlbam'].iloc(0)[0])
            player.to_csv(os.path.join(str(year), 'player', name+'-'+str(year)+'.csv'))
        else:
            player = pd.read_csv(os.path.join(str(year), 'player', name+'-'+str(year)+'.csv'))
        # ['SL' 'FF' 'CU' 'FT' 'CH' nan 'FC' 'KC' 'SI' 'PO' 'FS' 'EP' 'SC']
        player_row = pd.DataFrame({'Name':[name]})
        pitch_types = ['SL','FF','CU','FT','CH','FC','KC','SI','PO','FS','EP','SC','KN']
        # "Stats of interest": summarized per pitch type below.
        soi = ['release_speed','release_pos_x','release_pos_z','pfx_x','pfx_z','vx0','vy0','vz0','ax','ay','az','effective_speed','release_spin_rate']
        for pitch in pitch_types:
            pitches = player[player['pitch_type'] == pitch]
            pitches = pitches[soi]
            # mean/std/min/max per stat; NaN (pitch type never thrown)
            # collapses to 0.  NOTE: `min`/`max` shadow the builtins here.
            for stat in soi:
                mean = np.mean(pitches[stat])
                if math.isnan(mean):
                    mean = 0
                std = np.std(pitches[stat])+0
                if math.isnan(std):
                    std = 0
                min = np.min(pitches[stat])+0
                if math.isnan(min):
                    min = 0
                max = np.max(pitches[stat])+0
                if math.isnan(max):
                    max = 0
                player_row[pitch+"_"+stat + '_std'] = std
                player_row[pitch+"_"+stat + '_mean'] = mean
                player_row[pitch + "_" + stat + '_min'] = min
                player_row[pitch + "_" + stat + '_max'] = max
        if out is None:
            out = player_row
        else:
            out = pd.concat([out,player_row])
    out  # NOTE(review): no-op expression statement
    out.to_csv(str(year)+".csv")
''' Example from pybaseball github on gathering ids: pid = pybaseball.playerid_lookup('kershaw', 'clayton') print(pid) print(pid.key_bbref.item()) ''' # Iterate over dataframe to gather first & last names and then get their ids for idx, row in hitters.iterrows(): first = str(row['First']) last = str(row['Last']) # How to set values while iterating: hitters.at[idx, ''] pid = pybaseball.playerid_lookup(last, first) if len(pid) == 1: hitters.at[idx, 'mlb_id'] = int(pid.key_mlbam.item()) hitters.at[idx, 'retro_id'] = str(pid.key_retro.item()) hitters.at[idx, 'bbref_id'] = str(pid.key_bbref.item()) hitters.at[idx, 'fangraphs_id'] = int(pid.key_fangraphs.item()) hitters.at[idx, 'first_played'] = int(pid.mlb_played_first.item()) hitters.at[idx, 'last_played'] = int(pid.mlb_played_last.item()) else: continue # Save the updated dataframe to a new csv file hitters.to_csv('./data/gather_ids.csv') print(hitters.head())
from pybaseball import statcast
import pandas as pd
import numpy as np
from pybaseball import statcast_pitcher
from pybaseball import playerid_lookup
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from keras.utils import to_categorical
from tensorflow import feature_column
from tensorflow.keras import layers
import tensorflow as tf

# Resolve Jose Berrios' MLBAM id (first lookup row assumed correct).
pid = playerid_lookup('berrios', 'jose')["key_mlbam"][0]
print(pid)
# get all available data
data = statcast_pitcher('2017-03-01', '2019-10-10', player_id=pid)
# Keep only the game-state / pitch columns used as model features.
data = data[[
    "pitch_type", "bat_score", "fld_score", "on_3b", "on_2b", "on_1b",
    "outs_when_up", "inning", "inning_topbot", "pitch_number", "p_throws",
    "balls", "strikes", "stand", "batter", "release_speed", "description"
]]
# Drop eephus and pitch-out rows (rare, non-competitive pitches).
data = data[data.pitch_type != 'EP']
data = data[data.pitch_type != 'PO']
# Base-runner columns hold batter ids when occupied; NaN means empty base.
data[["on_3b", "on_2b",
      "on_1b"]] = data[["on_3b", "on_2b", "on_1b"]].replace(np.nan, 0)
from pybaseball import statcast_batter
from pybaseball import playerid_lookup
from pybaseball import statcast_pitcher

# Demo: look up Rhys Hoskins, then pull his batter statcast data.
hoskins_id = playerid_lookup('hoskins', 'rhys')
print(hoskins_id)
print("statcast stats from march 1st to april 1st")
# NOTE(review): the id 656555 is hardcoded rather than taken from the
# lookup above -- confirm it matches hoskins_id's key_mlbam.
hoskins_statcast = statcast_batter('2019-03-01', '2019-04-01', 656555)
print(hoskins_statcast)
"""
kersh = playerid_lookup('kershaw', 'clayton')
kershaw_stats = statcast_pitcher('2017-06-01', '2017-07-01', kersh)
print(kershaw_stats.head(5))
"""
def main(input_files):
    """Build train/test splits from Statcast CSVs, train a TF estimator on
    batted-ball plays, and tally predicted-vs-actual total bases per fielder.

    Relies on module-level names not visible in this chunk: ``csv``,
    ``numpy``, ``tf``, ``playerid_lookup``, ``target_position`` (a list that
    is mutated here), and the helpers ``neural_net``, ``train``, ``test`` and
    ``predict`` -- TODO confirm their definitions elsewhere in the file.

    :param input_files: paths to yearly Statcast CSV exports; the four
        characters before each extension must be the year (``...2015.csv``),
        since ``input_file[-8:-4]`` is used to key into ``years``.
    :returns: tuple ``(last, yearly_results)`` where ``last`` is the target
        player's last name and ``yearly_results`` maps year ->
        {(fielder_id, position): {"total_bases": ..., "opportunities": ...}}.
    """
    # get the player that we'll be excluding from the neural network
    first = input('First Name: ')
    last = input('Last Name: ')
    possibles = (playerid_lookup(last, first))
    print(possibles)
    index = int(input('Which player do you want?'))
    target_id = possibles['key_mlbam'][
        index]  # sometimes need to change index depending on player name
    inputs = []    # every usable play, regardless of split
    training = []  # plays fielded by anyone except the target player
    testing = []   # plays fielded by the target player
    # per-year map of (fielder_id, position) -> list of plays
    years = {'2015': {}, '2016': {}, '2017': {}, '2018': {}}
    # Statcast column names for the nine defensive positions (index = pos - 1)
    diamond = [
        "fielder_1", "fielder_2", "fielder_3", "fielder_4", "fielder_5",
        "fielder_6", "fielder_7", "fielder_8", "fielder_9"
    ]
    # team abbreviation -> integer code (declared but not used below;
    # see the commented-out home_team feature in new_input)
    team_dict = {
        "ARI": 0, "ATL": 1, "BAL": 2, "BOS": 3, "CHC": 4, "CWS": 5,
        "CIN": 6, "CLE": 7, "COL": 8, "DET": 9, "MIA": 10, "HOU": 11,
        "KC": 12, "LAA": 13, "LAD": 14, "MIL": 15, "MIN": 16, "NYM": 17,
        "NYY": 18, "OAK": 19, "PHI": 20, "PIT": 21, "SD": 22, "SF": 23,
        "SEA": 24, "STL": 25, "TB": 26, "TEX": 27, "TOR": 28, "WSH": 29
    }
    print("begin")
    for input_file in input_files:
        with open(input_file) as file_r:
            reader = csv.DictReader(file_r)
            missing = 0  # rows skipped for incomplete data
            for row in reader:
                # sets the on_base variables to binary
                on_1 = 0 if row['on_1b'] == "" else 1
                on_2 = 0 if row['on_2b'] == "" else 1
                on_3 = 0 if row['on_3b'] == "" else 1
                # handles missing values: skip the row if any feature is blank
                if row['hc_x'] == "" or row['hc_y'] == "" or row[
                        'launch_angle'] == "" or row['launch_speed'] == "" or row[
                            'estimated_ba_using_speedangle'] == "" or row[
                                'outs_when_up'] == "" or row[
                                    'total_bases'] == "" or row[
                                        'hit_location'] == "":
                    # add or row['home_team'] == ""
                    missing += 1
                else:
                    # creates input to be fed into the NN; the label
                    # (total_bases) rides along as the last element, shifted
                    # by +6 so the class ids are non-negative
                    new_input = [
                        float(row['hc_x']),
                        float(row['hc_y']),
                        float(row['launch_angle']),
                        float(row['launch_speed']),
                        float(row['estimated_ba_using_speedangle']),
                        int(float(row['outs_when_up'])), on_1, on_2, on_3,
                        int(float(row['total_bases'])) + 6
                    ]
                    # row['home_team'],
                    # team_dict[row['home_team']],
                    inputs.append(new_input)
                    if int(float(row['total_bases'])) > 6:
                        print("Above 6", int(float(row['total_bases'])))
                    # keeps track of the players associated with each event
                    # print(row['hit_location'])
                    location = int(float(row['hit_location'])) - 1
                    # for some reason, we don't have pitcher and catcher id's in here???
                    if location != 0 and location != 1:
                        player = row[diamond[location]]
                        if player != "":
                            # Add to training and testing set: target player's
                            # plays go to testing, everyone else's to training
                            if float(player) == target_id:
                                testing.append(new_input)
                                # record each position the target appeared at
                                # (mutates the module-level target_position list)
                                if int(float(row["hit_location"])
                                       ) not in target_position:
                                    target_position.append(
                                        int(float(row["hit_location"])))
                                # print("Added to testing")
                            else:
                                training.append(new_input)
                                # bucket the play under (fielder, position)
                                # for this file's year
                                current_dict = years[input_file[-8:-4]]
                                if (player, location + 1) in current_dict.keys():
                                    current_dict[(player,
                                                  location + 1)].append(new_input)
                                else:
                                    current_dict[(player,
                                                  location + 1)] = [new_input]
            file_r.close()  # redundant: the with-block already closes the file
        # print(years['2015'].keys())
        print(input_file, "missing data:", missing)
        print("closed file")
    print("completed split")
    # Split each play into features (all but last element) and label (last).
    train_x = []
    train_y = []
    test_x = []
    test_y = []
    # handled by just adding 6 when setting inputs . . .
    for row in training:
        train_x.append(row[:-1])
        train_y.append(row[-1])  # shifted by 6 bases, CHANGE THIS!!!
    for row in testing:
        test_x.append(row[:-1])
        test_y.append(row[-1])  # shifted by 6 bases, CHANGE THIS!!!
    print("completed x/y var separation")
    train_data = numpy.asarray(train_x)
    train_labels = numpy.asarray(train_y)
    eval_data = numpy.asarray(test_x)
    eval_labels = numpy.asarray(test_y)
    print("---------------CREATING CLASSIFIER----------------")
    # create estimator
    model = tf.estimator.Estimator(model_fn=neural_net, model_dir="checkpoints/")
    print("---------------TRAINING CLASSIFIER----------------")
    # train the classifier
    model = train(train_data, train_labels, model)
    print("---------------EVALUATING CLASSIFIER----------------")
    # evaluate effectiveness
    results, pred_gen = test(eval_data, eval_labels, model)
    print(results)
    #print(test_y)
    # print(pred)
    # pred_gen = list(pred_gen)
    # predict = []
    # for row in pred_gen:
    #     predict.append(row["classes"])
    #print(pred)
    # matrix = tf.math.confusion_matrix(test_y, predict)
    # print(matrix)
    # with tf.Session() as sess:
    #     confusion_matrix = tf.confusion_matrix(labels=test_y, predictions=predict)
    #     confusion_matrix_to_Print = sess.run(confusion_matrix)
    #     print(confusion_matrix_to_Print)
    # start evaluating players: score every (fielder, position) bucket with the
    # trained model and compare predicted vs. actual total bases
    yearly_results = {}
    for year, players in years.items():
        print("Processing", year)
        individual_results = {}
        for key, value in players.items():
            # only score positions the target player actually appeared at
            if int(key[1]) not in target_position:
                continue
            # key is (player id, location_played)
            # value is all of the plays identified with them
            curr_x = []
            curr_y = []
            for row in value:
                curr_x.append(row[:-1])
                curr_y.append(row[-1])
            individual_predictions = list(
                predict(numpy.asarray(curr_x), numpy.asarray(curr_y), model))
            # undo the +6 label shift on the predicted classes
            actual_pred = []
            for row in individual_predictions:
                actual_pred.append(row['classes'] - 6)
            # net difference between predicted and actual total bases
            tally = 0
            for i in range(len(curr_y)):
                # overlook drastically missed predictions
                # if actual_pred[i] < -1:
                #     continue
                tally += actual_pred[i] - (curr_y[i] - 6)
                # print(tally)
            individual_results[key] = {
                "total_bases": tally,
                "opportunities": len(curr_y)
            }
        yearly_results[year] = individual_results
    return (last, yearly_results)
# Base directory for raw per-player stat files (not used in this chunk).
playerDatapath = '../../playerStats/data_raw/'

# Backfill MLBAM ids starting at row 3980 -- presumably earlier rows were
# filled in a previous run; TODO confirm.
for row in range(3980, df.shape[0]):
    #df = pd.read_csv('./player_lookup/players.csv')
    ID = df['RetroID'][row]
    relevant = df[df['RetroID'] == ID]
    #print(relevant.shape)
    LastName = relevant['LastName'].values[0]
    FirstName = relevant['FirstName'].values[0]
    ndf.iloc[row, 1] = LastName
    ndf.iloc[row, 2] = FirstName
    # Look the player up by name, then keep only the row whose Retrosheet id
    # matches, and reduce to the MLBAM id column.
    playerInfo = pb.playerid_lookup(last=LastName,
                                    first=FirstName)  #['key_mlbam']
    playerInfo = playerInfo[playerInfo['key_retro'] == ID]
    playerInfo = playerInfo['key_mlbam']
    # NOTE(review): the id is written into df, but the CSV below saves ndf --
    # looks inconsistent; confirm which frame should carry column 3.
    df.iloc[row, 3] = playerInfo.values
    print(row)
    if row % 100 == 0:
        print('row ' + str(row) + ' of ' + str(ndf.shape[0]))
#ndf.to_csv('player_lookup/players.csv', index = False)
ndf.to_csv('player_lookup/players.csv', index=False)

# Strip the "[...]" array formatting left behind by the .values assignment
# above and convert to int.
# NOTE(review): the except clause of this try lies beyond this chunk.
for row in range(0, playerdf.shape[0]):
    try:
        playerdf['MLBAM'][row] = int(playerdf['MLBAM'][row].replace(
            ']', '').replace('[', ''))
# Made by Noah Mitchem for MLB Pitchers # Vertical pitch breaks seem off, don't know what other data can be used import plotly.graph_objs as go from plotly import tools from plotly.offline import plot import numpy as np from matplotlib import cm from pybaseball import playerid_lookup from pybaseball import statcast_pitcher file = statcast_pitcher("2019-03-25", "2019-10-01", playerid_lookup("scherzer", "max")["key_mlbam"][0]) def colorcode(speed): speed1 = int((speed - 50) * 4.3) co = np.array(cm.magma(speed1)) * 255 return "rgb(" + str(int(co[0])) + "," + str(int(co[1])) + "," + str( int(co[2])) + ")" data = [] data1 = [] pitchTrack = 0 breaks = 0 x = {} extremes = [] differentPitches = file["pitch_type"].unique().size totalPitches = file.index.size color = [ "rgb(102, 204, 0)", "rgb(0, 214, 214)", "rgb(204, 0, 0)", "rgb(255, 153, 0)", "rgb(153, 0, 255)"
    # NOTE(review): this chunk begins mid-statement -- the opening of the DB
    # connection call (credentials redacted) lies above this view.
    user="******", database="baseball")

#get players for both team
players = [("corey", "kluber"), ("yan", "gomes"), ("yonder", "alonso"),
           ("jose", "ramirez"), ("josh", "donaldson"), ("francisco", "lindor"),
           ("melky", "cabrera"), ("jason", "kipnis"), ("michael", "brantley"),
           ("luis", "castillo"), ("tucker", "barnhart"), ("joey", "votto"),
           ("scooter", "gennett"), ("eugenio", "suarez"), ("mason", "williams"),
           ("billy", "hamilton"), ("preston", "tucker"), ("jose", "peraza")]
mycursor = mydb.cursor()
#get player data
for player in players:
    # player is (first, last); playerid_lookup takes (last, first)
    id = playerid_lookup(player[1], player[0])
    print(len(id))
    # only proceed on an unambiguous single registry match
    if len(id) == 1:
        stats = statcast_batter('2018-3-29', '2018-10-02',
                                id.key_mlbam.iloc[0])
        hr = 0   # home runs
        bip = 0  # balls in play (contact events other than home runs)
        tot = 0  # total events seen
        for event in stats.events:
            tot = tot + 1
            if event == 'home_run':
                hr = hr + 1
            # missing events come through as float NaN; also exclude
            # non-contact outcomes when counting balls in play
            if type(
                    event
            ) != float and event != "strikeout" and event != "walk" and event != "home_run":
                bip = bip + 1
from pybaseball import statcast
from pybaseball import playerid_lookup
import pandas as pd

# Collect every Statcast pitch from the 2021 ALCS window (15OCT21-22OCT21)
# and filter it down to at-bats by Red Sox batters.

# First we have to find all of the player_ids for the Red Sox batters to
# filter to just their at-bats.
redsox_batters = [('christian', 'vazquez'), ('christian', 'arroyo'),
                  ('xander', 'bogaerts'), ('rafael', 'devers'),
                  ('alex', 'verdugo'), ('enrique', 'hernandez'),
                  ('hunter', 'renfroe'), ('kyle', 'schwarber'),
                  ('kevin', 'plawecki')]
redsox_batters_dict = {
    "jd+martinez": 502110.0
}  # Had to look up jd martinez manually because I could not figure out what search parameter to use for his first name

for first, last in redsox_batters:
    lookup = playerid_lookup(last, first)
    # Guard against a miss: float() on an empty Series would raise.
    if lookup.empty:
        print(f"No MLBAM id found for {first} {last}; skipping")
        continue
    # Take the first match explicitly -- float(Series) breaks when the
    # lookup is ambiguous (multiple rows) and is deprecated in pandas.
    redsox_batters_dict[f"{first}+{last}"] = float(lookup['key_mlbam'].iloc[0])

# Persist the name -> MLBAM id map for reuse.
pd.DataFrame.from_dict(data=redsox_batters_dict,
                       orient='index').to_csv('redsox_batters.csv',
                                              header=False)

# Now, collect all statcast data from the ALCS dates (15OCT21 thru 22OCT21)
# and filter based on player_ids found above.
data = statcast(start_dt="2021-10-15", end_dt="2021-10-22")
data = data.loc[(data['batter'].isin(redsox_batters_dict.values()))]
data.to_csv('unfilteredAtBats.csv')