def process_data(time, firstname, lastname, pos):
    """Fetch Statcast rows for one player, either as a batter or a pitcher.

    Args:
        time: A date string. When it contains the substring ``"to"`` it is
            treated as a range: characters ``[0:10]`` are the start date and
            everything from index 14 onward is the end date.
        firstname: Player first name, passed to ``playerid_lookup``.
        lastname: Player last name, passed to ``playerid_lookup``.
        pos: ``'batter'`` or ``'pitcher'``.

    Returns:
        Whatever ``statcast_batter`` / ``statcast_pitcher`` returns, or
        ``None`` when ``pos`` is neither of the two accepted values.
    """
    playerid = playerid_lookup(lastname, firstname)
    is_range = "to" in time

    if pos == 'batter':
        fetch = statcast_batter
    elif pos == 'pitcher':
        fetch = statcast_pitcher
    else:
        return None

    # Positional date arguments: (start, end) for a range, (date,) otherwise.
    dates = (time[0:10], time[14:]) if is_range else (time,)
    return fetch(*dates, player_id=int(playerid['key_mlbam'][0]))
def get_data(first_name, last_name, start_date, end_date):
    """Return one pitcher's Statcast pitches with display-friendly columns.

    Args:
        first_name: Pitcher first name.
        last_name: Pitcher last name.
        start_date: Start date passed to ``pb.statcast_pitcher``.
        end_date: End date passed to ``pb.statcast_pitcher``.

    Returns:
        A DataFrame sorted by ``pitch_number``, without rows that are null in
        ``pitch_type``, ``des``, ``description`` or ``release_spin_rate``,
        with an added 0-based running counter and several renamed columns.

    Raises:
        ValueError: If the player lookup yields no ``key_mlbam`` value.
    """
    lookup = pb.playerid_lookup(last_name, first_name)
    try:
        key = lookup["key_mlbam"].values[0]  # unique pitcher identifier
    except (KeyError, IndexError) as err:
        # Fail here with a clear message instead of silently continuing and
        # crashing later on an unbound variable.
        raise ValueError(
            f"No MLBAM id found for {first_name} {last_name}") from err

    data = pb.statcast_pitcher(start_date, end_date, key)
    data = data.sort_values(["pitch_number"])
    data = data.dropna(
        subset=["pitch_type", "des", "description", "release_spin_rate"])

    # Running position of each remaining row after sorting and filtering.
    data["order"] = range(len(data))

    return data.rename(
        {
            "des": "Play by Play",
            "description": "Result of Pitch",
            "order": "Pitch Number",
            "pitch_name": "Pitch Type",
            "release_speed": "Pitch Speed",
        },
        axis=1,
    )
def get_atbats(first, last):
    """Build ``AtBat`` objects for every strikeout row of a pitcher's career.

    Looks the player up by name, downloads pitcher Statcast data from
    January 1 of the first listed season through December 31 of the last
    (2019 is capped to 2018), and wraps each row whose ``events`` value is
    ``"strikeout"`` in an ``AtBat``.

    Returns:
        A pair ``(at_bats, ab_arrays)``: the ``AtBat`` objects and, in the
        same order, their ``.np`` attributes.
    """
    lookup = playerid_lookup(last, first)
    # Only the first lookup row is used; a single match is assumed.
    player_id = lookup["key_mlbam"].iloc[0]
    first_season = int(lookup["mlb_played_first"].iloc[0])
    last_season = int(lookup["mlb_played_last"].iloc[0])

    # The 2019 season is deliberately left out.
    if last_season == 2019:
        last_season = 2018

    start_date = f"{first_season}-01-01"
    end_date = f"{last_season}-12-31"
    print(f"Scraping from {start_date} to {end_date}")

    pitches = statcast_pitcher(start_date, end_date, player_id)
    feature_frame = pitches[features]

    is_strikeout = pitches["events"] == "strikeout"
    at_bats, ab_arrays = [], []
    for row_label in pitches.index[is_strikeout].to_list():
        at_bat = AtBat(feature_frame, row_label)
        at_bats.append(at_bat)
        ab_arrays.append(at_bat.np)
    return at_bats, ab_arrays
def statcastData(pitcherId, stats, dateRange):
    """Return the requested Statcast columns for one pitcher.

    Args:
        pitcherId: Pitcher identifier passed to ``bball.statcast_pitcher``.
        stats: Column label(s) to keep from the downloaded data.
        dateRange: Indexable whose items 0 and 1 are the start and end dates.

    Returns:
        The downloaded data restricted to ``stats``.

    Raises:
        ValueError: If ``pitcherId`` is ``None``.
    """
    if pitcherId is None:
        # Reject the missing id up front rather than sending a query for it.
        raise ValueError("pitcherId must not be None")

    data = bball.statcast_pitcher(dateRange[0], dateRange[1], pitcherId)
    statcastDF = pd.DataFrame(data)
    return statcastDF[stats]
def pand(df):
    """Download and stack pitcher Statcast data for every row of ``df``.

    For each index label of ``df`` the ``key_mlbam`` value at that label is
    used as the ``player_id`` of a ``statcast_pitcher`` request covering
    2018-02-01 through 2018-12-01. The per-player frames are concatenated
    in row order and returned.
    """
    frames = [
        statcast_pitcher('2018-02-01', '2018-12-01',
                         player_id=df['key_mlbam'][label])
        for label in df.index
    ]
    return pd.concat(frames)
def dataGrab(number, start, end):
    """Download a pitcher's Statcast data and keep a fixed set of columns.

    Args:
        number: Player id passed as ``player_id``.
        start: Start date passed as ``start_dt``.
        end: End date passed as ``end_dt``.

    Returns:
        The selected columns, re-indexed 0..n-1.
    """
    wanted = [
        'pitch_type', 'release_speed', 'effective_speed', 'release_pos_x',
        'plate_x', 'release_pos_z', 'plate_z', 'release_extension', 'zone',
        'launch_speed', 'launch_angle', 'estimated_woba_using_speedangle',
    ]
    pitches = statcast_pitcher(start_dt=start, end_dt=end, player_id=number)
    selected = pitches[wanted]
    # Replace the downloaded index with a plain positional one.
    selected.index = range(len(selected['pitch_type']))
    return selected
def import_data(number, start, end):
    """Download a pitcher's Statcast data and keep a fixed set of columns.

    Args:
        number: Player id passed as ``player_id``.
        start: Start date passed as ``start_dt``.
        end: End date passed as ``end_dt``.

    Returns:
        The selected columns, re-indexed 0..n-1.
    """
    wanted = [
        'pitch_type', 'release_speed', 'release_pos_x', 'release_pos_z',
        'pfx_x', 'pfx_z', 'release_spin_rate', 'plate_x', 'plate_z',
        'estimated_woba_using_speedangle', 'woba_value', 'description',
        'launch_speed_angle', 'launch_angle', 'launch_speed', 'bb_type',
        'effective_speed', 'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az',
        'release_extension',
    ]
    pitches = statcast_pitcher(start_dt=start, end_dt=end, player_id=number)
    selected = pitches[wanted]
    # Replace the downloaded index with a plain positional one.
    selected.index = range(len(selected['pitch_type']))
    return selected
def collect_statcast(sample_size, target, features, pitcher_names):
    """Scrape a fixed-size Statcast sample for each pitcher.

    Arguments:
        sample_size {int} -- the number of pitches to sample for each pitcher
        target {list} -- the ``description`` values a pitch must have to be
            eligible for sampling
        features {list} -- the columns to keep in the resulting data
        pitcher_names {list} -- one sequence of name parts per pitcher; the
            first part is the first name and the remaining parts, joined by
            spaces, form the last name

    Returns:
        pandas dataframe -- one row per sampled pitch with the columns in
            ``features``; empty (but with those columns) when nothing could
            be collected.

    Pitchers whose name has fewer than two parts are skipped. Pitchers for
    which the id lookup, the download or the sampling raises ``ValueError``
    (for example fewer eligible pitches than ``sample_size``) are skipped
    too.
    """
    print('Begin scraping \n')
    collected = []
    for i, pitcher in enumerate(pitcher_names):
        if len(pitcher) < 2:
            # Without both a first and a last name there is nothing to look
            # up; skip instead of reusing the previous pitcher's name.
            print(f'\n Skipping malformed pitcher name {pitcher!r}, '
                  f'#: {i+1}/{len(pitcher_names)} \n')
            continue
        fname, lname = pitcher[0], " ".join(pitcher[1:])
        print(
            f'\n Pitcher Name: {fname} {lname}, #: {i+1}/{len(pitcher_names)} \n'
        )

        # Grab the lookup row(s) for the pitcher.
        player = playerid_lookup(lname, fname)

        try:
            # When several rows match, the largest key_mlbam is used.
            ID = player['key_mlbam'].iloc[player['key_mlbam'].argmax()]
            df = statcast_pitcher('2018-03-29', '2018-09-30', player_id=ID)
            df = df[df['description'].isin(target)].sample(sample_size,
                                                           random_state=2019)
        except ValueError:
            # Best effort: move on to the next pitcher.
            continue
        collected.append(df[features])

    print('Finsihed Scraping')
    if not collected:
        return pd.DataFrame(columns=features)
    # One concat at the end; DataFrame.append no longer exists in pandas 2.
    return pd.concat(collected, ignore_index=True)
def statcast_pitcher_spin(start_dt=None, end_dt=None, player_id=None):
    """Return pitcher Statcast data with four derived spin columns added.

    The release/velocity/acceleration/spin-rate columns are copied out,
    passed through ``find_intermediate_values``, and the resulting ``Mx``,
    ``Mz``, ``phi`` and ``theta`` columns are written back onto the
    downloaded frame, which is returned.
    """
    input_cols = [
        'release_extension', 'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az',
        'release_spin_rate',
    ]
    derived_cols = ['Mx', 'Mz', 'phi', 'theta']

    pitcher_data = statcast_pitcher(start_dt, end_dt, player_id)
    # Work on a copy so the helper cannot modify the downloaded frame.
    spin_df = find_intermediate_values(pitcher_data[input_cols].copy())
    pitcher_data[derived_cols] = spin_df[derived_cols].copy()
    return pitcher_data
def player(first_name, last_name, start_date, end_date):
    """Download pitcher Statcast data for the named player.

    Args:
        first_name: Player first name.
        last_name: Player last name.
        start_date: Start date passed as ``start_dt``.
        end_date: End date passed as ``end_dt``.

    Returns:
        The downloaded frame with its index reset to 0..n-1.

    Raises:
        ValueError: If the name lookup returns no rows.
    """
    player_info = pybaseball.playerid_lookup(last_name, first_name)
    if len(player_info) == 0:
        raise ValueError(f"No player found for {first_name} {last_name}")

    # The first lookup row is used when several players match.
    player_id = player_info['key_mlbam'].iloc[0]

    # Pass the id itself; indexing the lookup frame with [0] would look for
    # a column named 0 and fail.
    data = pybaseball.statcast_pitcher(start_dt=start_date,
                                       end_dt=end_date,
                                       player_id=player_id)
    return data.reset_index(drop=True)
def get_player_stats(id):
    """
    Download a pitcher's Statcast data for 2019-03-28 through 2019-09-29,
    keep only the columns used for the analysis, and return the rows whose
    ``type`` is ``'X'`` (noted by the original author as pitches put in
    play).
    """
    analysis_cols = [
        'pitch_type', 'release_speed', 'release_spin_rate',
        'if_fielding_alignment', 'launch_angle', 'launch_speed', 'hc_x',
        'hc_y', 'stand', 'type', 'events',
    ]
    season = statcast_pitcher('2019-03-28', '2019-09-29', id)
    trimmed = season[analysis_cols]
    in_play = trimmed['type'] == 'X'
    return trimmed[in_play]
def pitcher(self, name, team):
    """Cluster a pitcher's pitches and report each cluster's usage share.

    The pitcher is found in ``self.fgp`` by case-insensitive name and team,
    mapped to an MLBAM id, and his pitches from 2015-03-28 through
    2019-09-29 are downloaded. Rows lacking any model input are dropped and
    each remaining pitch is assigned a cluster by the scaler/model pair for
    the pitcher's throwing hand.

    Returns:
        ``(pid, pitchdict, throws)``: the MLBAM id, a dict mapping cluster
        number to usage percentage (one decimal) for clusters above 5% of
        pitches, and ``'R'`` or ``'L'``.
    """
    Xcols = ['pfx_x', 'pfx_z', 'release_speed', 'release_spin_rate']

    roster = self.fgp
    name_match = roster.Name.str.lower() == name.lower()
    team_match = roster.Team.str.lower() == team.lower()
    fangraphs_id = roster[name_match & team_match].playerid
    pid = int(playerid_reverse_lookup(fangraphs_id, 'fangraphs').key_mlbam)

    pitch = statcast_pitcher(start_dt='2015-03-28',
                             end_dt='2019-09-29',
                             player_id=pid)

    # Handedness is decided before incomplete rows are dropped; anything
    # other than exclusively 'R' is treated as left-handed.
    if set(pitch.p_throws) == {'R'}:
        throws, scaler, kmeans = 'R', self.scalerR, self.modelR
    else:
        throws, scaler, kmeans = 'L', self.scalerL, self.modelL

    pitch.dropna(subset=Xcols, inplace=True)
    pitch.reset_index(drop=True, inplace=True)
    pitch['p_type'] = kmeans.predict(scaler.transform(pitch[Xcols]))

    # One cluster is folded into another per hand: 12 into 7 for right-
    # handers, 4 into 0 for left-handers. The folded cluster reports zero.
    kept, folded = (7, 12) if throws == 'R' else (0, 4)

    pitchdict = {}
    for cluster in range(13):
        if cluster == folded:
            n_pitches = 0
        elif cluster == kept:
            in_either = (pitch.p_type == kept) | (pitch.p_type == folded)
            n_pitches = len(pitch[in_either])
        else:
            n_pitches = len(pitch[pitch.p_type == cluster])

        share = n_pitches / len(pitch)
        if share > (1 / 20):
            pitchdict[cluster] = round((share * 100), 1)
    return pid, pitchdict, throws
# Notebook-style exploration script: sample Statcast and pitching queries,
# then a combined frame of pitcher data for the ids listed in a CSV file.
data = statcast(start_dt='2017-06-24', end_dt='2017-06-27')
data.head(2)

from pybaseball import pitching_stats

data = pitching_stats(2012, 2016)
data.head()

from pybaseball import playerid_lookup
from pybaseball import statcast_pitcher
import pandas as pd

csv = '2019pitchers.csv'
df = pd.read_csv(csv)
print(df)

# Rows 0..120 of the CSV are fetched. Row 0 is kept in download order; every
# later frame is reversed before being stacked.
# DataFrame.get_value no longer exists in current pandas; .at is the
# label-based scalar accessor that replaces it.
frames = [statcast_pitcher('2019-03-27', '2019-11-01', df.at[0, 'MLBID'])]
for i in range(1, 121):
    data = statcast_pitcher('2019-03-27', '2019-11-01', df.at[i, 'MLBID'])
    data = data[::-1]
    frames.append(data)
# Concatenate once instead of re-copying the growing frame on every pass.
alldata = pd.concat(frames)
print(alldata)

kershaw = pd.DataFrame(alldata)
print(kershaw)

# Split the rows at at-bat number 18.
a = 18
df1 = kershaw[kershaw['at_bat_number'] <= a]
df1_ = kershaw[kershaw['at_bat_number'] > a]
print(df1)
print(df1_)
def _spin_rate_row(name, df_data):
    """Summarise one pitcher's Statcast frame into a single result row."""
    import numpy as np

    total_pitches = len(df_data)
    total_woba = np.mean(df_data.woba_value)

    FF_data = df_data[(df_data.pitch_type == 'FF')]
    CU_data = df_data[(df_data.pitch_type == 'KC') |
                      (df_data.pitch_type == 'CU')]
    FT_data = df_data[(df_data.pitch_type == 'FT') |
                      (df_data.pitch_type == 'SI')]

    FF_spin = np.mean(FF_data.release_spin_rate)
    CU_spin = np.mean(CU_data.release_spin_rate)
    FT_spin = np.mean(FT_data.release_spin_rate)
    # An empty download has no usage share; report 0.0, matching the
    # zero-fill applied to the other (NaN) statistics.
    FT_use = len(FT_data) / total_pitches if total_pitches else 0.0
    FT_woba = np.mean(FT_data.woba_value)

    return [name, total_woba, FF_spin, CU_spin, FT_spin, FT_use, FT_woba]


def SpinRate(names, start_date, end_date, dictionary):
    """Pull spin-rate summaries for a list of pitchers from Statcast.

    Args:
        names: List of pitcher names. Anything other than a list yields an
            empty result.
        start_date: A single start date, or a list of start dates paired
            with ``names`` and ``end_date`` item by item.
        end_date: A single end date, or a list when ``start_date`` is one.
        dictionary: Mapping from player name to MLB id.

    Returns:
        A DataFrame with one row per pitcher: mean wOBA, mean spin rate for
        FF, CU/KC and FT/SI pitches, FT/SI usage share and FT/SI mean wOBA.
        Missing values are filled with 0.0.

    Note:
        All warnings are silenced process-wide as a side effect.
    """
    # import warnings filter and ignore warnings
    from warnings import simplefilter
    simplefilter(action='ignore', category=Warning)

    # Load packages for analysis
    import pandas as pd
    import pybaseball as pb

    columns = [
        'Name', 'Total wOBA', 'FF Spin', 'CU/KC Spin', 'FT/SI Spin',
        'FT/SI Use', 'FT/SI wOBA'
    ]
    player_dict = dictionary
    df_final = pd.DataFrame(columns=columns)

    if not isinstance(names, list):
        return df_final

    # Exactly one of the two modes runs: per-pitcher date ranges when the
    # dates are lists, otherwise one shared range for every pitcher.
    per_pitcher_dates = isinstance(start_date, list)
    if per_pitcher_dates:
        schedule = zip(names, start_date, end_date)
    else:
        schedule = ((name, start_date, end_date) for name in names)

    frames = [df_final]
    for name, sdt, edt in schedule:
        player_ID = player_dict[name]
        if not per_pitcher_dates and name == 'Will Smith':
            # Hard-coded id override, applied only in shared-date mode
            # (presumably to disambiguate the name; confirm before reuse).
            player_ID = 519293
        df_data = pb.statcast_pitcher(start_dt=sdt,
                                      end_dt=edt,
                                      player_id=player_ID)
        frames.append(
            pd.DataFrame([_spin_rate_row(name, df_data)], columns=columns))

    if len(frames) == 1:
        return df_final
    return pd.concat(frames, axis=0).fillna(0.0)
from pybaseball import statcast_pitcher
from pybaseball import playerid_lookup
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from keras.utils import to_categorical
from tensorflow import feature_column
from tensorflow.keras import layers
import tensorflow as tf

# Data preparation for one pitcher: download, trim columns, drop two pitch
# types and turn the base-runner columns into 0/1 flags.
pid = playerid_lookup('berrios', 'jose')["key_mlbam"][0]
print(pid)

# get all available data
data = statcast_pitcher('2017-03-01', '2019-10-10', player_id=pid)
data = data[[
    "pitch_type", "bat_score", "fld_score", "on_3b", "on_2b", "on_1b",
    "outs_when_up", "inning", "inning_topbot", "pitch_number", "p_throws",
    "balls", "strikes", "stand", "batter", "release_speed", "description"
]]

# Drop 'EP' and 'PO' pitches; copy so the edits below act on an independent
# frame rather than on a slice of the download.
data = data[~data.pitch_type.isin(['EP', 'PO'])].copy()

# An empty base is stored as NaN and an occupied one as a positive value.
# fillna needs no numpy import, which this script never performs.
base_columns = ["on_3b", "on_2b", "on_1b"]
data[base_columns] = data[base_columns].fillna(0)
for base in base_columns:
    data.loc[data[base] > 0, base] = 1
# Name corrections applied before the player id lookup, keyed by the name as
# it appears in the pitching_stats 'Name' column. Each value is
# (first-name override, last-name override); None keeps the default part.
# These were found by hand, so other seasons may need further entries.
_LOOKUP_NAME_FIXES = {
    'Zack Wheeler': ('Zach', None),
    'Matthew Boyd': ('Matt', None),
    'C.J. Wilson': ('c. j.', None),
    'R.A. Dickey': ('R. A.', None),
    'Jon Niese': ('Jonathon', None),
    'A.J. Burnett': ('A. J.', None),
    'Jorge De La Rosa': ('Jorge', 'De La Rosa'),
    'Rubby de la Rosa': ('Rubby', 'de la Rosa'),
    'Cole DeVries': (None, 'De Vries'),
    'Samuel Deduno': ('Sam', None),
    'JC Ramirez': ('J. C.', None),
    'Nathan Karns': ('Nate', None),
    'Daniel Ponce de Leon': (None, 'Ponce de Leon'),
    'Chi Chi Gonzalez': ('Chi Chi', 'Gonzalez'),
    'Josh A. Smith': ('Josh', 'Smith'),
    'Joel De La Cruz': (None, 'De La Cruz'),
}

# Pitch types observed in the data:
# ['SL' 'FF' 'CU' 'FT' 'CH' nan 'FC' 'KC' 'SI' 'PO' 'FS' 'EP' 'SC']
_SUMMARY_PITCH_TYPES = ['SL', 'FF', 'CU', 'FT', 'CH', 'FC', 'KC', 'SI', 'PO',
                        'FS', 'EP', 'SC', 'KN']
_SUMMARY_STATS = ['release_speed', 'release_pos_x', 'release_pos_z', 'pfx_x',
                  'pfx_z', 'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az',
                  'effective_speed', 'release_spin_rate']


def _lookup_name_parts(name):
    """Return the (first, last) strings to look ``name`` up with, or None."""
    parts = name.split(' ')
    if len(parts) < 2:
        return None
    # Initials such as 'J.A.' are looked up as 'J. A.'.
    first = parts[0].replace('.', '. ', 1)
    last = parts[1]
    fixed_first, fixed_last = _LOOKUP_NAME_FIXES.get(name, (None, None))
    if fixed_first is not None:
        first = fixed_first
    if fixed_last is not None:
        last = fixed_last
    return first, last


def _fetch_pitcher_season(name, year):
    """Download one pitcher's Statcast data for ``year``; None if not found."""
    name_parts = _lookup_name_parts(name)
    if name_parts is None:
        print(name)
        print("Concerning")
        return None
    first, last = name_parts

    player_id = playerid_lookup(last, first)
    print(year)
    # Keep only players whose listed career spans the requested season.
    player_id = player_id[player_id['mlb_played_first'] <= year]
    player_id = player_id[player_id['mlb_played_last'] >= year]
    print(player_id)
    print(len(player_id))
    if len(player_id) != 1:
        print(player_id)
        print(name)
        print("Concerning")
    if len(player_id) == 0:
        # Nothing to download; the caller skips this pitcher.
        return None
    # With several candidates the first one is used.
    return statcast_pitcher(str(year) + '-1-01', str(year) + '-12-31',
                            player_id['key_mlbam'].iloc[0])


def _zero_if_nan(value):
    """Return 0 in place of NaN, otherwise ``value`` unchanged."""
    return 0 if math.isnan(value) else value


def _pitch_type_summary(name, player):
    """Build the one-row std/mean/min/max summary frame for a pitcher."""
    row = {'Name': name}
    for pitch in _SUMMARY_PITCH_TYPES:
        pitches = player[player['pitch_type'] == pitch][_SUMMARY_STATS]
        for stat in _SUMMARY_STATS:
            values = pitches[stat]
            # A pitch type the pitcher never threw yields NaN; store 0.
            row[pitch + "_" + stat + '_std'] = _zero_if_nan(np.std(values) + 0)
            row[pitch + "_" + stat + '_mean'] = _zero_if_nan(np.mean(values))
            row[pitch + "_" + stat + '_min'] = _zero_if_nan(np.min(values) + 0)
            row[pitch + "_" + stat + '_max'] = _zero_if_nan(np.max(values) + 0)
    # Built in one go: adding hundreds of columns one by one fragments the
    # frame.
    return pd.DataFrame([row])


def get_data(year = 2018, minimum_starts = 5):
    """Write a per-pitch-type summary CSV for one season's starters.

    Pitchers with more than ``minimum_starts`` in the ``GS`` column are
    taken from ``pitching_stats``. Each pitcher's Statcast data is
    summarised into std/mean/min/max per pitch type and statistic, and the
    combined table is written to ``<year>.csv``. The season table and each
    pitcher's download are cached as CSV files under ``<year>/`` and reused
    on later runs.

    Pitchers that cannot be matched to a player id are reported on stdout
    and left out. Nothing is written when no pitcher could be summarised.

    Returns:
        None.
    """
    season = str(year)
    player_dir = os.path.join(season, 'player')
    os.makedirs(player_dir, exist_ok=True)

    stats_path = os.path.join(season, "Players_Stats_" + season + ".csv")
    if not os.path.exists(stats_path):
        player_stats = pitching_stats(year, year)
        player_stats = player_stats[player_stats['GS'] > minimum_starts]
        player_stats.to_csv(stats_path)
    else:
        player_stats = pd.read_csv(stats_path)

    summaries = []
    for name in player_stats['Name']:
        cache_path = os.path.join(player_dir, name + '-' + season + '.csv')
        if os.path.exists(cache_path):
            player = pd.read_csv(cache_path)
        else:
            player = _fetch_pitcher_season(name, year)
            if player is None:
                continue
            player.to_csv(cache_path)
        summaries.append(_pitch_type_summary(name, player))

    if not summaries:
        print("No pitcher data collected for " + season)
        return
    pd.concat(summaries).to_csv(season + ".csv")
# Made by Noah Mitchem for MLB Pitchers # Vertical pitch breaks seem off, don't know what other data can be used import plotly.graph_objs as go from plotly import tools from plotly.offline import plot import numpy as np from matplotlib import cm from pybaseball import playerid_lookup from pybaseball import statcast_pitcher file = statcast_pitcher("2019-03-25", "2019-10-01", playerid_lookup("scherzer", "max")["key_mlbam"][0]) def colorcode(speed): speed1 = int((speed - 50) * 4.3) co = np.array(cm.magma(speed1)) * 255 return "rgb(" + str(int(co[0])) + "," + str(int(co[1])) + "," + str( int(co[2])) + ")" data = [] data1 = [] pitchTrack = 0 breaks = 0 x = {} extremes = [] differentPitches = file["pitch_type"].unique().size totalPitches = file.index.size color = [ "rgb(102, 204, 0)", "rgb(0, 214, 214)", "rgb(204, 0, 0)", "rgb(255, 153, 0)", "rgb(153, 0, 255)"
def retrieve_data():
    """
    Retrieve pitcher and batter data from Statcast for the selected matchup
    and refresh the plot data sources.

    Updates the plot title, downloads and formats both players' data for
    the selected date range, and stores the results in module globals. When
    the pitcher faced the batter in that range, the pitch scatter, strike
    zone and both pitch-frequency plots are refreshed; otherwise only the
    warning label is set. The run button label is always restored to
    'Run', even when a step raises.
    """
    global p_dict
    global h_dict
    global data
    global data_cds
    global pitch_cds_p
    global pitches_p
    global pitcher_data
    global batter_data
    global sub_batter

    run_button.label = 'Running...'
    try:
        reset_data()

        # update plot title
        pitchername = pitcherselect.value.split(' -')[0]
        battername = hitterselect.value.split(' -')[0]
        plot.title.text = f'{pitchername} vs. {battername}'

        pitcher_id = p_dict[pitcherselect.value]
        hitter_id = h_dict[hitterselect.value]
        first_day = str(start_date.value)
        last_day = str(end_date.value)

        # all the data for the batter in the time frame
        batter_data = pitch_info(
            statcast_batter(first_day, last_day, hitter_id))

        # all data for the pitcher in the time frame
        pitcher_data = pitch_info(
            statcast_pitcher(first_day, last_day, pitcher_id))

        # filter to only the pitches thrown to the selected batter
        data = pitcher_data[pitcher_data['batter'] == hitter_id].copy()
        sub_batter = batter_data[batter_data['pitcher'] == pitcher_id].copy()

        if len(data) == 0:
            warning_label.text = 'No matchups in specified time frame'
            return

        warning_label.text = ''

        # assign an event name and a "balls, strikes" label to every pitch
        outcomes, counts = [], []
        for _, row in data.iterrows():
            outcomes.append(results(row['events'], row['description']))
            counts.append(f"{row['balls']}, {row['strikes']}")
        data['result'] = outcomes
        data['count'] = counts

        # update column data source
        data_cds.data = {
            'pitch': data['pitch_name'],
            'speed': data['release_speed'],
            'result': data['result'],
            'count': data['count'],
            'color': data['color'],
            'plate_x': data['plate_x'],
            'plate_z': data['plate_z']
        }

        # update strike zone: top and bottom are averaged over the matchup;
        # the sides are fixed at +/- 8.5 / 12
        new_top = data.sz_top.sum() / len(data.sz_top)
        new_bottom = data.sz_bot.sum() / len(data.sz_bot)
        strike_zone_cds.data = {
            'x': [-8.5 / 12, 8.5 / 12],
            'x_side1': [-8.5 / 12, -8.5 / 12],
            'x_side2': [8.5 / 12, 8.5 / 12],
            'top': [new_top, new_top],
            'bottom': [new_bottom, new_bottom],
            'side1': [new_top, new_bottom],
            'side2': [new_bottom, new_top]
        }

        # update pitch plots
        p_unique, p_matchup, p_overall = pitch_frequency(pitcher_data, data)
        pitches_p.x_range.factors = p_unique
        pitch_cds_p.data = {
            'pitches': p_unique,
            'matchup': p_matchup,
            'overall': p_overall
        }

        b_unique, b_matchup, b_overall = pitch_frequency(
            batter_data, sub_batter)
        pitches_b.x_range.factors = b_unique
        pitch_cds_b.data = {
            'pitches': b_unique,
            'matchup': b_matchup,
            'overall': b_overall
        }
    finally:
        # Never leave the button stuck on 'Running...'.
        run_button.label = 'Run'
def _prepare_pitch_sequence_frame(frame):
    """Encode pitches, attach previous-pitch columns and flag occupied bases."""
    # Keep only the pitch types this pitcher throws, then encode each pitch
    # to its integer code.
    frame = frame[frame['pitch_type'].isin(pitcher_pitches)]
    frame = frame.dropna(subset=['pitch_type'])
    frame['pitch_code'] = frame.apply(
        lambda row: get_pitch_code(row, pitcher_pitches), axis=1)

    frame = get_prev_pitch(frame)

    # The base columns hold a runner id or NaN; reduce them to 1/0 flags.
    for base in ('on_3b', 'on_2b', 'on_1b'):
        frame[base] = frame[base].fillna(value=0).astype(bool).astype(int)
    return frame


def get_data(first_name, last_name):
    """Load and prepare training and testing pitch data for a pitcher.

    Data is read from ``Data/<last>_<first>_train.csv`` and
    ``Data/<last>_<first>_test.csv`` when both exist. Otherwise it is
    downloaded for the named pitcher and written to those files. Training
    data covers 2015-01-01 to 2017-12-31 and testing data 2018-01-01 to
    2019-12-31.

    Returns:
        ``(train_data_input, train_data_result, test_data)``: the training
        feature columns, the training ``pitch_code`` column, and the test
        frame with features and ``pitch_code`` together. Rows with nulls
        are dropped.

    Raises:
        ValueError: If a download is needed and the name lookup is empty.
    """
    train_filename = 'Data/' + str(last_name) + "_" + str(
        first_name) + "_train.csv"
    test_filename = 'Data/' + str(last_name) + "_" + str(
        first_name) + "_test.csv"

    if os.path.isfile(train_filename) and os.path.isfile(test_filename):
        # If we've already gotten the data, read it in
        train_data = pd.read_csv(train_filename)
        test_data = pd.read_csv(test_filename)
    else:
        # Look up the requested pitcher so the cached files really contain
        # the player they are named after.
        lookup = playerid_lookup(last_name, first_name)
        if len(lookup) == 0:
            raise ValueError(
                f"No player found for {first_name} {last_name}")
        player_id = int(lookup['key_mlbam'].iloc[0])

        os.makedirs('Data', exist_ok=True)
        train_data = statcast_pitcher(start_dt='2015-01-01',
                                      end_dt='2017-12-31',
                                      player_id=player_id)
        train_data.to_csv(train_filename)
        test_data = statcast_pitcher(start_dt='2018-01-01',
                                     end_dt='2019-12-31',
                                     player_id=player_id)
        test_data.to_csv(test_filename)

    train_data = _prepare_pitch_sequence_frame(train_data)
    test_data = _prepare_pitch_sequence_frame(test_data)

    feature_columns = [
        'prev_pitch_3', 'prev_pitch_2', 'prev_pitch_1', 'balls', 'strikes',
        'stand', 'on_3b', 'on_2b', 'on_1b', 'outs_when_up', 'pitch_number'
    ]
    model_columns = feature_columns + ['pitch_code']

    # Drop nulls on features and target together so inputs and results stay
    # row-aligned, then split them apart.
    train_rows = train_data[model_columns].dropna()
    train_data_result = train_rows[['pitch_code']]
    train_data_input = train_rows[feature_columns]

    test_data = test_data[model_columns].dropna()
    return train_data_input, train_data_result, test_data