Exemplo n.º 1
0
def update_data(date, team):
    """Fetch one game's statcast data and serialize it as JSON.

    Pulls pitch-level rows for ``date``/``team`` via pybaseball's
    ``statcast`` scraper, attaches readable batter names resolved through
    ``playerid_reverse_lookup``, and returns the frame as a JSON string
    (stored as json in the user's browser session in a hidden div).
    """
    print("Loading data from statcast... ")
    games = statcast(start_dt=date, end_dt=date, team=team).rename(
        columns={"player_name": "pitcher_name"})

    print("Adding batter names... ")
    lookup = playerid_reverse_lookup(games["batter"].unique(),
                                     key_type="mlbam")
    full_name = (lookup["name_first"].str.capitalize()
                 + " "
                 + lookup["name_last"].str.capitalize())
    lookup["batter_name"] = full_name

    # Left-join so pitches whose batter id is unknown are kept.
    games = games.merge(
        lookup[["key_mlbam", "batter_name"]],
        how="left",
        left_on="batter",
        right_on="key_mlbam",
    )

    print("Done.")

    return games.to_json(date_format="iso", orient="split")
Exemplo n.º 2
0
def finalize(input_files):
    """Augment per-player CSVs with names and a total-base percentage.

    For each input csv, writes ``<name>_final.csv`` containing the player's
    name (resolved from the MLBAM id via ``playerid_reverse_lookup``),
    position, total bases, opportunities, and the rounded
    total_bases/opportunities percentage.  Rows without a player_id are
    skipped.

    Fixes over the original: the lookup result no longer rebinds the
    ``data`` list being iterated (confusing shadowing), the redundant
    ``.close()`` calls on ``with``-managed files are gone, and the
    fieldnames are a plain list instead of a dict of 1s.
    """
    fieldnames = ["last_name", "first_name", "position", "total_bases",
                  "opportunities", "percentage", "player_id"]
    for file in input_files:
        output_file = file[:-4] + "_final.csv"
        # NOTE(review): append mode means headers and rows accumulate
        # across repeated runs -- confirm that is intended before
        # switching to "w".
        with open(file) as file_r, open(output_file, 'a') as file_w:
            reader = csv.DictReader(file_r)
            writer = csv.DictWriter(file_w, fieldnames)
            writer.writeheader()

            for old_row in reader:
                # Skip rows that never received a player id.
                if old_row['player_id'] == "":
                    continue
                lookup_id = int(float(old_row['player_id']))
                player = playerid_reverse_lookup([lookup_id],
                                                 key_type="mlbam")

                new_row = {
                    'player_id': lookup_id,
                    'position': old_row['position'],
                    'last_name': player['name_last'][0],
                    'first_name': player['name_first'][0],
                    'total_bases': old_row['total_bases'],
                    'opportunities': old_row['opportunities'],
                }
                percentage = (int(new_row['total_bases'])
                              / int(new_row['opportunities']))
                new_row['percentage'] = round(percentage, 3)
                writer.writerow(new_row)
def make_efp_json(statcast):
    """Build a per-pitcher JSON blob of EFP scores keyed by MLBAM id.

    ``statcast`` is a raw statcast pitch DataFrame.  The pitcher frame and
    EFP series are each computed exactly once (the original recomputed
    both), joined on the pitcher index, and enriched with pitcher names
    from ``playerid_reverse_lookup``.
    """
    p_df = make_pitcher_df(statcast)
    efp_s = make_efp_series(p_df).rename("EFP")

    # The series index holds the pitcher MLBAM ids.
    pitcher_ids = playerid_reverse_lookup(list(efp_s.index))

    combined = p_df.merge(efp_s.to_frame(),
                          left_index=True,
                          right_index=True)
    with_ids = combined.merge(
        pitcher_ids[["name_last", "name_first", "key_mlbam"]],
        left_index=True,
        right_on="key_mlbam",
    ).set_index("key_mlbam", drop=True)

    return with_ids.to_json(orient="columns")
Exemplo n.º 4
0
    def pitcher(self, name, team):
        """Classify a pitcher's pitches with the fitted k-means models.

        Resolves the pitcher's MLBAM id from the FanGraphs table
        (``self.fgp``) by case-insensitive name + team match, pulls their
        2015-2019 statcast pitches, assigns every pitch to a cluster, and
        returns ``(pid, pitchdict, throws)`` where ``pitchdict`` maps
        cluster id -> percentage of pitches thrown (only clusters above a
        5% usage cutoff are kept).
        """
        Xcols = ['pfx_x', 'pfx_z', 'release_speed', 'release_spin_rate']

        fgp = self.fgp
        # Case-insensitive match on player name and team to get the
        # FanGraphs player id.
        player = fgp[(fgp.Name.str.lower() == name.lower())
                     & (fgp.Team.str.lower() == team.lower())].playerid
        pid = int(playerid_reverse_lookup(player, 'fangraphs').key_mlbam)
        pitch = statcast_pitcher(start_dt='2015-03-28',
                                 end_dt='2019-09-29',
                                 player_id=pid)
        # Handedness-specific scaler/model pair; anything that is not
        # purely right-handed data falls into the left-handed branch.
        if set(pitch.p_throws) == {'R'}:
            throws = 'R'
            scaler = self.scalerR
            kmeans = self.modelR
        else:
            throws = 'L'
            scaler = self.scalerL
            kmeans = self.modelL
        pitch.dropna(subset=Xcols, inplace=True)
        pitch.reset_index(drop=True, inplace=True)
        pitch['p_type'] = kmeans.predict(scaler.transform(pitch[Xcols]))
        pitchdict = {}
        for i in range(13):
            # Clusters 7 and 12 (RHP) and 0 and 4 (LHP) are merged --
            # presumably the same pitch type split across two clusters;
            # TODO confirm against the clustering notebook.
            if throws == 'R':
                if i == 7:
                    pitchernum = pitch[(pitch.p_type == 7) |
                                       (pitch.p_type == 12)]
                elif i == 12:
                    pitchernum = []
                else:
                    pitchernum = pitch[pitch.p_type == i]
            else:
                if i == 0:
                    pitchernum = pitch[(pitch.p_type == 0) |
                                       (pitch.p_type == 4)]
                elif i == 4:
                    pitchernum = []
                else:
                    pitchernum = pitch[pitch.p_type == i]
            # Share of this pitcher's pitches that fall in cluster i;
            # only clusters used more than 5% of the time are reported.
            cutoff = len(pitchernum) / len(pitch)
            if cutoff > (1 / 20):
                pitchdict[i] = round((cutoff * 100), 1)
        return pid, pitchdict, throws
Exemplo n.º 5
0
 def batter(self, name, team, throws='R'):
     """Summarize a batter's production against each pitch cluster.

     Resolves the batter's MLBAM id from the FanGraphs hitters table
     (``self.fgh``), pulls their 2015-2019 statcast at-bats, keeps only
     pitches from ``throws``-handed pitchers, labels each pitch with the
     fitted k-means model, and returns ``(pid, batdict)`` where
     ``batdict`` maps cluster id -> [pitch count, wOBA vs that cluster].
     """
     Xcols = ['pfx_x', 'pfx_z', 'release_speed', 'release_spin_rate']
     # Handedness-specific scaler/model pair.
     if throws == 'R':
         scaler = self.scalerR
         kmeans = self.modelR
     else:
         scaler = self.scalerL
         kmeans = self.modelL
     fgh = self.fgh
     # Case-insensitive name + team match to get the FanGraphs id.
     player = fgh[(fgh.Name.str.lower() == name.lower())
                  & (fgh.Team.str.lower() == team.lower())].playerid
     pid = int(playerid_reverse_lookup(player, 'fangraphs').key_mlbam)
     bat = statcast_batter(start_dt='2015-03-28',
                           end_dt='2019-09-29',
                           player_id=pid)
     bat.dropna(subset=Xcols, inplace=True)
     bat.reset_index(drop=True, inplace=True)
     bat = bat[bat.p_throws == throws]
     bat['p_type'] = kmeans.predict(scaler.transform(bat[Xcols]))
     batdict = {}
     for i in range(12):
         # Clusters 7/12 (vs RHP) and 0/4 (vs LHP) are merged -- presumably
         # the same pitch type split across two clusters; TODO confirm.
         # NOTE(review): the ``i == 12`` branch is unreachable with
         # range(12); the sibling ``pitcher`` method uses range(13).
         if throws == 'R':
             if i == 7:
                 batnum = bat[(bat.p_type == 7) | (bat.p_type == 12)]
             elif i == 12:
                 continue
             else:
                 batnum = bat[bat.p_type == i]
         else:
             if i == 0:
                 batnum = bat[(bat.p_type == 0) | (bat.p_type == 4)]
             elif i == 4:
                 continue
             else:
                 batnum = bat[bat.p_type == i]
         # [count, wOBA] where wOBA = sum(woba_value) / sum(woba_denom).
         batdict[i] = [len(batnum)]
         batdict[i] += [
             round((np.sum(batnum.woba_value) / np.sum(batnum.woba_denom)),
                   3)
         ]
     return pid, batdict
Exemplo n.º 6
0
    def calc_batting_fd_score(self,
                              start_date='2015-04-01',
                              end_date='2018-07-19',
                              preload=True,
                              write_csv=False,
                              path2017="",
                              path2018=""):
        """Merge bbref and statcast batting data and compute FanDuel scores.

        Parameters
        ----------
        start_date, end_date : str
            Window used both for the statcast pull and the final
            ``game_date`` row filter.
        preload, write_csv, path2017, path2018 :
            Forwarded to ``self.load_data`` to control the bbref load.

        Returns
        -------
        DataFrame of batting stats merged from both bbref and statcast
        sources, with a ``fd_score`` column and a ``roto_game_id`` key.

        Note: there are no dates for bbref data because the scraping system
        has what it has; as the dataset grows, these date variables can
        hone in on specific seasons.
        """
        # PART 1 - pull in bbref data and store as a df to be merged later
        batting_df, pitching_df = self.load_data(path2017=path2017,
                                                 path2018=path2018,
                                                 preload=preload,
                                                 write_csv=write_csv)

        # PART 2 - get statcast data, preferring the local cache
        try:
            print("Accessing statcast_cache...")
            print("If dates are missing try rebuilding cache...")
            statcast_input_frame = pd.read_csv('statcast_cache.csv')
        except Exception:  # cache missing/unreadable -> rebuild from source
            print("Getting raw statcast data...")
            statcast_input_frame = self.pull_raw_statcast_data(
                start_date=start_date, end_date=end_date)
            statcast_input_frame.to_csv('statcast_cache.csv')

        # Keep only events worth FanDuel points.
        # (bug fix: 'hit_by_pitch' was listed twice)
        events_worth_points = [
            'single', 'double', 'triple', 'walk', 'hit_by_pitch', 'home_run'
        ]
        statcast_df = statcast_input_frame[statcast_input_frame['events'].isin(
            events_worth_points)]

        # Gets a list of batter keys (unique prevents repeat occurrences)
        player_list = list(statcast_df['batter'].unique().astype(int))

        # Lookup keys to get each player's various ids (mlb, bbref, etc.)
        player_id_values = playerid_reverse_lookup(player_list,
                                                   key_type='mlbam')

        # Merge player keys to batter df based on key
        cols_to_merge = [
            'name_last', 'name_first', 'key_mlbam', 'key_bbref',
            'key_fangraphs', 'key_retro'
        ]
        statcast_df_2 = statcast_df.merge(player_id_values[cols_to_merge],
                                          how='inner',
                                          left_on='batter',
                                          right_on='key_mlbam')

        # Bring in stadium codes to use with "home team" to determine the
        # "stadium" where the game took place
        try:
            stadium_codes = pd.read_csv(self.key_join_path)
        except FileNotFoundError:
            print(
                "Couldn't find baseball_key_joiner.csv in the same directory.")
            # bug fix: re-raise instead of falling through to a NameError
            # on the undefined `stadium_codes` below
            raise

        # Merge stadium codes onto the statcast DF, on home team name
        statcast_df_3 = statcast_df_2.merge(stadium_codes,
                                            how='left',
                                            left_on='home_team',
                                            right_on='team_abbr')

        # Ad hoc basic key generation - could be extracted to a function.
        # Matches on game_id since statcast has no 'start_time' value.
        statcast_df_3['game_date'] = pd.to_datetime(statcast_df_3['game_date'])
        statcast_df_3['game_date'] = statcast_df_3['game_date'].astype(str)
        statcast_df_3['stadium'] = statcast_df_3['stadium'].astype(str)

        statcast_df_3['game_id'] = statcast_df_3['game_date'] + \
               statcast_df_3['stadium'] + \
               statcast_df_3['key_bbref'].astype(str)
        print("Aggregating data...")
        # Count events by game_id and event type, then unpack events via
        # unstack into their own columns
        batter_agg = statcast_df_3.groupby(['batter', 'home_team', 'game_date', 'game_id', 'events']).size() \
               .unstack(fill_value=0)
        batter_agg2 = batter_agg.reset_index()

        # Aggregate FanDuel-relevant events per player-game.
        # (bug fix: duplicate 'hit_by_pitch' dict key removed)
        batter_agg3 = batter_agg2.groupby(['batter', 'home_team', 'game_date', 'game_id']) \
                 .agg({'hit_by_pitch': 'sum',
                       'home_run': 'sum',
                       'single': 'sum',
                       'double': 'sum',
                       'triple': 'sum',
                       'walk': 'sum'})
        statcast_data = batter_agg3.reset_index()

        print("Merging bbref and statcast data...")
        # Merge statcast and bbref (many NaNs because statcast has an extra
        # year of data with no bbref values)
        batter_dataframe_final = batting_df.merge(statcast_data,
                                                  how='left',
                                                  left_on='game_id',
                                                  right_on='game_id')
        batter_dataframe_final.drop(columns=['home_team_y', 'game_date_y'],
                                    inplace=True)
        batter_dataframe_final.rename(columns={
            "home_team_x": "home_team",
            "game_date_x": "game_date"
        },
                                      inplace=True)

        # Score game performance
        batter_dataframe_final['fd_score'] = batter_dataframe_final.apply(
            self.fd_batting_score, axis=1)

        # NaNs after the bbref-statcast join mean a player had at least one
        # at-bat but no FD scoring event in that game -- that is zero FD
        # points, so fill with 0.
        batter_dataframe_final[[
            'hit_by_pitch', 'home_run', 'single', 'double', 'triple'
        ]] = batter_dataframe_final[[
            'hit_by_pitch', 'home_run', 'single', 'double', 'triple'
        ]].fillna(value=0)

        batter_dataframe_final = batter_dataframe_final[
            (batter_dataframe_final['game_date'] < end_date)
            & (batter_dataframe_final['game_date'] > start_date)]

        # Take walk values from BB if 'walk' is unfilled due to a failed
        # join between statcast and bbref
        batter_dataframe_final['walk'] = np.where(
            batter_dataframe_final['walk'].isnull(),
            batter_dataframe_final['BB'], batter_dataframe_final['walk'])

        batter_dataframe_final[
            'roto_game_id'] = batter_dataframe_final['game_date'].astype(
                str) + batter_dataframe_final['player'].astype(str)

        print("Batting FD Score calculated! Returning data..")
        return batter_dataframe_final
Exemplo n.º 7
0
    def calc_fd_scores_roto(self,
                            start_date='2015-04-01',
                            end_date='2018-07-19',
                            preload=True,
                            write_csv=False,
                            path2017="",
                            path2018=""):
        """Merge rotoguru FanDuel points onto bbref batting/pitching frames.

        Returns ``(batting_df, pitching_df)``; each frame gains
        ``FD_points`` and ``Pos`` columns joined on ``roto_game_id``
        (game_date + bbref id).  Pitchers are dropped from the batting
        frame, which has a separate model.

        NOTE(review): ``start_date``/``end_date`` are currently unused;
        rotoguru dates are in a nonstandard format and are normalized with
        ``self.clean_rotoguru_dates`` rather than filtered -- confirm
        whether a date filter was intended.
        """
        # PART 1 - pull in bbref data.
        # (bug fix: forward the caller's arguments to load_data; the
        # original hard-coded the defaults, silently ignoring them)
        batting_df, pitching_df = self.load_data(preload=preload,
                                                 write_csv=write_csv,
                                                 path2017=path2017,
                                                 path2018=path2018)
        # Batting and pitching frames must match on this new game_id
        batting_df[
            'roto_game_id'] = batting_df['game_date'] + batting_df['player']
        pitching_df[
            'roto_game_id'] = pitching_df['game_date'] + pitching_df['player']

        print("Loading rotoguru data..")

        try:
            rotoguru = pd.read_csv("roto_data_2015-2018.csv")
        except FileNotFoundError:
            print("Couldn't find the rotoguru csv!")
            # bug fix: re-raise instead of continuing into a NameError
            raise

        # Match rotoguru to baseball reference with different keys
        print("Getting bbref key to merge rotoguru and bbref data")
        unique_players = list(rotoguru['MLB_ID'].unique())
        lookup = playerid_reverse_lookup(unique_players)

        rotoguru = pd.merge(rotoguru,
                            lookup[['key_mlbam', 'key_bbref']],
                            left_on='MLB_ID',
                            right_on='key_mlbam')

        print("Cleaning up rotoguru dates..")
        # Make rotoguru dates match the bbref date format
        rotoguru['game_date'] = rotoguru.apply(self.clean_rotoguru_dates,
                                               axis=1)
        rotoguru.drop('Date', axis=1, inplace=True)

        print("Merging bbref and rotoguru data to get FD scores")
        # Unique id to merge rotoguru and bbref data
        rotoguru[
            'roto_game_id'] = rotoguru['game_date'] + rotoguru['key_bbref']

        print("Batting df pre merge: ", batting_df.shape)
        rotoguru = rotoguru[['roto_game_id', 'FD_points', 'Pos']]
        batting_df = pd.merge(batting_df, rotoguru, on='roto_game_id')
        print("Batting df post merge: ", batting_df.shape)

        print("Pitching df pre merge: ", pitching_df.shape)
        rotoguru = rotoguru[['roto_game_id', 'FD_points', 'Pos']]
        pitching_df = pd.merge(pitching_df, rotoguru, on='roto_game_id')
        print("Pitching df post merge: ", pitching_df.shape)

        # Remove pitchers from the batting frame -- they have their own df
        batting_df = batting_df[batting_df['Pos'] != 'P']
        batting_df.dropna(inplace=True)

        return batting_df, pitching_df
Exemplo n.º 8
0
def main():
    """Refresh the local at-bat pickle with new statcast data and push a
    per-player hit/AB log (plus upcoming schedule rows) to a Google Sheet.
    """
    # Local cache of prior at-bat data.
    pth = "/Users/irarickman/Google Drive/Data Science/Projects/MLB Projections/Batting Average"
    td = format(datetime.today(), "%Y-%m-%d")
    old_data = pd.read_pickle(pth + "/lastabs.pkl")
    old_data.game_date = pd.to_datetime(old_data.game_date,
                                        infer_datetime_format=True)
    prev_date = old_data.game_date.max()
    od = format(prev_date, "%Y-%m-%d")
    # Only pull new statcast data when the cache is stale.
    if od != td:
        new_d = statcast(od, td)
        new_data = new_d[new_d.events.notnull()]
        players_ids = playerid_reverse_lookup(new_data.batter.unique())
        id_df = players_ids[['name_last', 'name_first', 'key_mlbam']]
        new_names = new_data.merge(id_df,
                                   how='left',
                                   left_on='batter',
                                   right_on='key_mlbam')
        df = pd.concat([old_data, new_names])
    else:
        df = old_data
    df.drop_duplicates(inplace=True)
    df.to_pickle(pth + "/lastabs.pkl")
    # Flag hits; any event not in the exclusion list counts as an at-bat.
    df['hit'] = df.events.apply(
        lambda x: 1 if x in ["single", 'double', 'home_run', 'triple'] else 0)
    df['ab'] = df.events.apply(lambda x: 0 if x in [
        'walk', 'hit_by_pitch', "caught_stealing_2b",
        "pickoff_caught_stealing_2b", 'pickoff_1b', 'catcher_interf',
        'pickoff_caught_stealing_3b', 'pickoff_2b',
        'pickoff_caught_stealing_home', 'caught_stealing_3b',
        'caught_stealing_home', "sac_fly", 'sac_bunt', 'sac_fly_double_play',
        'sac_bunt_double_play'
    ] else 1)
    # Derive team/opponent/venue from the inning half: the batter hits in
    # the bottom half when playing at home.
    df['player_team'] = df.apply(lambda x: x.home_team
                                 if x.inning_topbot == "Bot" else x.away_team,
                                 axis=1)
    df['Opp'] = df.apply(lambda x: x.away_team
                         if x.inning_topbot == "Bot" else x.home_team,
                         axis=1)
    df['Place'] = df.apply(lambda x: "Home"
                           if x.inning_topbot == "Bot" else "Away",
                           axis=1)
    teams = df.player_team.unique()
    # statcast and bbref team abbreviations differ for these clubs.
    fixers = {"WSH": "WSN", "CWS": "CHW"}
    teams_fixed = [x if x not in fixers.keys() else fixers[x] for x in teams]

    # Pull each team's 2018 schedule to append upcoming-game rows.
    team_schedule = {}
    missed = []
    for t in teams_fixed:
        try:
            d = schedule_and_record(2018, t)
            # Strip doubleheader markers like "(1)" from the date string.
            d['fix_date'] = d.Date.str.replace("\(\d\)",
                                               "").str.strip() + " 2018"
            d['game_date'] = pd.to_datetime(d.fix_date.apply(
                lambda x: datetime.strptime(x, "%A, %b %d %Y")).apply(
                    lambda x: x.strftime("%Y-%m-%d")),
                                            infer_datetime_format=True)
            d['Place'] = d.Home_Away.apply(lambda x: "Home"
                                           if x == "Home" else "Away")
            # Keep only future games.
            d2 = d[d.game_date >= datetime.today()][[
                'Place', "Opp", "game_date"
            ]]
            team_schedule[t] = d2
        except ValueError:
            print(t)
            missed.append(t)

    df['name_last'] = df['name_last'].str.capitalize()
    df['name_first'] = df['name_first'].str.capitalize()
    df['player_name'] = df.name_first + " " + df.name_last
    sm_df = df[[
        'game_date', 'game_pk', 'hit', 'ab', 'Opp', 'Place', 'player_name',
        'player_team', 'key_mlbam'
    ]]
    sm_df.sort_values(['player_name', 'game_date', 'game_pk'], inplace=True)
    # Collapse to one row per player-game.
    trim_df = sm_df.groupby([
        'player_name', 'game_date', 'game_pk', 'Opp', 'Place', 'player_team',
        'key_mlbam'
    ]).sum().reset_index()

    def player_df(player, d=trim_df):
        # Append the player's team schedule (future games) to their history;
        # schedule rows have no hit/ab, so fill those with 0.
        temp = d[d.player_name == player]
        temp = temp.sort_values(['game_date']).reset_index(drop=True)
        tm = temp.loc[len(temp) - 1, 'player_team']
        if tm in fixers.keys():
            sched = team_schedule[fixers[tm]]
        else:
            sched = team_schedule[tm]
        tdf = pd.concat([temp, sched])
        tdf.ab.fillna(0, inplace=True)
        tdf.hit.fillna(0, inplace=True)
        tdf.player_name.fillna(player, inplace=True)
        tdf.player_team.fillna(tm, inplace=True)
        return tdf

    master_df = player_df(trim_df.player_name.unique()[0])
    for p in trim_df.player_name.unique()[1:]:
        got = player_df(p)
        master_df = pd.concat([master_df, got])
    master_df.game_date = master_df.game_date.apply(
        lambda x: format(x, "%Y-%m-%d"))

    ## now write to the google sheet

    # #authorization
    gc = pygsheets.authorize(outh_file='/Users/irarickman/client_secret.json')
    mlb = 'MLB At Bats'
    sh = gc.open(mlb)

    #select the first sheet
    wks = sh[0]

    wks.set_dataframe(master_df, (1, 1))
Exemplo n.º 9
0
from luigi import Task, LocalTarget, IntParameter, Parameter
from pybaseball import pitching_stats, batting_stats, playerid_reverse_lookup, schedule_and_record, home_games

# a list of mlbam ids
player_ids = [116539, 116541, 641728, 116540]

# find the names of the players in player_ids, along with their ids from other data sources
data = playerid_reverse_lookup(player_ids, key_type='mlbam')
# find their names and ids from other data sources
# NOTE(review): this second lookup rebinds ``data``, discarding the mlbam
# result above -- only the fangraphs lookup survives.
fg_ids = [826, 5417, 210, 1101]
data = playerid_reverse_lookup(fg_ids, key_type='fangraphs')


class GetData(Task):
    """Luigi task that downloads season-level pitching or batting stats.

    ``type`` selects the pybaseball fetcher ('pitching' or 'batting');
    the result is written to ``./data/external/<type>_data.csv``.
    """

    start_year = IntParameter(default=2015)
    end_year = IntParameter(default=2018)
    type = Parameter()

    # Maps the task's `type` parameter to the pybaseball fetch function.
    data_function = {'pitching': pitching_stats, 'batting': batting_stats}

    def requires(self):
        # No upstream tasks.
        return None

    def output(self):
        return LocalTarget(f'./data/external/{self.type}_data.csv')

    def run(self):
        fetch = self.data_function[str(self.type)]
        season_range = {'start_season': self.start_year,
                        'end_season': self.end_year}
        fetch(**season_range).to_csv(self.output().path)
Exemplo n.º 10
0
# Load the fitted kNN model for expected batting average.
# (bug fix: close the model file instead of leaking the handle)
with open('xBA_knn_model.pickle', 'rb') as model_file:
    clf = pickle.load(model_file)
# get 2019 batting data
data2019 = pd.read_csv('data/all_outcomes_2019.csv', index_col=0)
# remove players with fewer than 300 ABs; unique() already de-duplicates,
# so the original `pid not in AB300` membership test was redundant
AB300 = [pid for pid in data2019['batter'].unique()
         if data2019.loc[data2019['batter'] == pid].shape[0] >= 300]

# (bug fix: dead `cBA, xBA = [], []` list init removed -- both were
# immediately rebound to scalars inside the loop)
xBA_dict = {}
for i, pid in enumerate(AB300):
    print('Processing {:d}/{:d}'.format(i + 1, len(AB300)) + ' Player IDs...')
    # get player name from player_id
    plast, pfirst = playerid_reverse_lookup([pid],
                                            key_type='mlbam').iloc[0, :2]
    print('Player ID {} --> {} {}'.format(pid, pfirst, plast))
    # filter player data from league-wide batting data
    pdata = data2019.loc[data2019['batter'] == pid].copy()
    Xp, yp, dfp = pre_process(pdata)
    # calculate xBA from model
    predicted_outcomes = clf.predict(Xp)
    unique, counts = np.unique(predicted_outcomes, return_counts=True)
    d = dict(zip(unique, counts))
    # hit=1, out=0.  NOTE(review): raises KeyError if the model predicts
    # only one class for a player -- confirm both outcomes always occur.
    xBA = d[1] / (d[0] + d[1])
    # calculate standard BA
    cBA = calc_BA(dfp, from_file=False)
    xBA_dict[pid] = [pfirst, plast, round(cBA, 3), round(xBA, 3)]

# convert to dataframe and save results to csv
Exemplo n.º 11
0
    def rotoguru_features(self, batting=True):
        '''Build one-hot feature frames from Rotoguru data.

        Rotoguru contains data on weather, batting order and
        windspeed/direction.  Returns a one-hot-encoded batter frame when
        ``batting`` is True, otherwise a pitcher frame; either way the
        frame carries a ``roto_game_id`` key (game_date + bbref id) for
        joining.

        Raises FileNotFoundError when the rotoguru csv is missing.
        '''
        try:
            rotoguru = pd.read_csv("roto_data_2015-2018.csv")
        except FileNotFoundError:
            print("Couldn't find the rotoguru csv!")
            # bug fix: re-raise -- the original fell through and crashed
            # with a NameError on the undefined `rotoguru`
            raise

        # Match rotoguru to baseball reference with different keys
        print("Getting bbref key to merge rotoguru and bbref data")
        unique_players = list(rotoguru['MLB_ID'].unique())
        lookup = playerid_reverse_lookup(unique_players)

        rotoguru = pd.merge(rotoguru,
                            lookup[['key_mlbam', 'key_bbref']],
                            left_on='MLB_ID',
                            right_on='key_mlbam')

        print("Cleaning up rotoguru dates..")
        # Make rotoguru dates match the bbref date format
        rotoguru['game_date'] = rotoguru.apply(self.clean_rotoguru_dates,
                                               axis=1)
        rotoguru.drop('Date', axis=1, inplace=True)

        print("Merging bbref and rotoguru data to get FD scores")
        # Unique id to merge rotoguru and bbref data
        rotoguru[
            'roto_game_id'] = rotoguru['game_date'] + rotoguru['key_bbref']

        if batting:
            # Only certain relevant columns are kept
            batter_cols = ['Condition', 'Hand', 'FD_points', 'FD_salary',
                           'Gametime_ET', 'Home_Ump', 'H/A', 'Oppt',
                           'Oppt_pitch_Name', 'Oppt_pitch_MLB_ID',
                           'Oppt_pitch_hand', 'Order', 'Pos', 'Temp',
                           'W_dir', 'W_speed', 'roto_game_id']

            batters = rotoguru[rotoguru['Pos'] != 'P']
            batters = batters[batter_cols]

            # Order is a categorical batting-order slot, not a number
            batters['Order'] = batters['Order'].astype(str)

            ohe = ce.OneHotEncoder(handle_unknown='ignore', use_cat_names=True)
            batters_ohe = ohe.fit_transform(
                batters[['H/A', 'Condition', 'W_dir', 'Order']])
            # Drop redundant / NaN indicator columns
            batters_ohe.drop(
                ['H/A_a', 'Condition_nan', 'W_dir_nan', 'Order_nan'],
                axis=1,
                inplace=True)
            # Add in relevant game_id
            batters_ohe['roto_game_id'] = batters['roto_game_id']

            return batters_ohe
        else:  # return pitching df instead
            pitcher_cols = ['Condition', 'FD_points', 'FD_salary',
                            'Gametime_ET', 'Home_Ump', 'IP', 'H/A', 'Oppt',
                            'Oppt_pitch_Name', 'Oppt_pitch_MLB_ID',
                            'Oppt_pitch_hand', 'QS', 'Temp',
                            'W_dir', 'W_speed', 'roto_game_id']

            pitchers = rotoguru[rotoguru['Pos'] == 'P']
            pitchers = pitchers[pitcher_cols]

            ohe = ce.OneHotEncoder(handle_unknown='ignore', use_cat_names=True)
            pitchers_ohe = ohe.fit_transform(
                pitchers[['H/A', 'Condition', 'W_dir']])
            pitchers_ohe.drop(['H/A_a', 'Condition_nan', 'W_dir_nan'],
                              axis=1,
                              inplace=True)

            pitchers_ohe['roto_game_id'] = pitchers['roto_game_id']

            return pitchers_ohe
Exemplo n.º 12
0
    def stadium_batter_avg(self,
                           switch_cutoff=0.05,
                           preload=False,
                           filepath_statcast_cache='../statcast_cache.csv'):
        """Compute per-stadium batting averages split by batting hand.

        Classifies each batter as L, R, or switch ('S') from the statcast
        cache (a batter is a switch hitter when more than ``switch_cutoff``
        of their bats use the non-primary hand), caches the classification
        in ``batter_hand.csv``, and returns a frame of
        ``['game_id', 'stadium_batting_avg_<cutoff>']``.

        With ``preload=True`` the classification step is skipped and the
        existing ``batter_hand.csv`` cache is read directly.
        """
        batting_df = self.avg_df.copy()

        batting_column = 'batting_hand_' + str(switch_cutoff)

        if not preload:
            try:
                statcast_frame_raw = pd.read_csv(filepath_statcast_cache)
            except Exception:  # bad path / unreadable cache
                print(
                    "Could not locate statcast_cache.csv, please check filepath_statcast_cache value"
                )
                return ""

            # Count pitches seen per (batter, stand) combination.
            left_right_hand = statcast_frame_raw.groupby(['batter',
                                                          'stand']).size()

            left_right_hand = left_right_hand.reset_index()
            left_right_hand.rename(columns={0: "bat_count"}, inplace=True)

            # Convert to wide format with columns titled L / R (left- or
            # right-handed pitches received)
            lr_pivot = left_right_hand.pivot(columns='stand',
                                             index="batter",
                                             values='bat_count')
            lr_pivot = lr_pivot.reset_index()
            lr_pivot = lr_pivot.fillna(0)

            # Helper columns: pitches received for each hand and the share
            # of switch hitting each batter engaged in
            lr_pivot['primary_hand'] = np.where(lr_pivot['L'] > lr_pivot['R'],
                                                'L', 'R')
            lr_pivot['major_count'] = np.where(lr_pivot['L'] > lr_pivot['R'],
                                               lr_pivot['L'], lr_pivot['R'])
            lr_pivot['minor_count'] = np.where(lr_pivot['L'] > lr_pivot['R'],
                                               lr_pivot['R'], lr_pivot['L'])
            lr_pivot['switch_perc'] = (lr_pivot['minor_count'] /
                                       lr_pivot['major_count'])
            lr_pivot = lr_pivot.replace(np.inf, np.nan).fillna(0)

            # A player taking more than switch_cutoff of their bats with the
            # other hand is classified as a switch hitter
            lr_pivot[batting_column] = np.where(
                lr_pivot['switch_perc'] < switch_cutoff,
                lr_pivot['primary_hand'], "S")

            # (bug fix: a no-op `lr_pivot.sort_values(...)` whose result
            # was discarded has been removed)

            left_right = lr_pivot[['batter', batting_column]]

            # Lookup 'batter keys' to mlb keys
            player_list = left_right['batter'].tolist()

            # Lookup keys to get each player's various ids (mlb, bbref, etc.)
            player_id_values = playerid_reverse_lookup(player_list,
                                                       key_type='mlbam')

            # Merge player keys to batter df based on key
            cols_to_merge = [
                'name_last', 'name_first', 'key_mlbam', 'key_bbref',
                'key_fangraphs', 'key_retro'
            ]
            left_right_with_keys = left_right.merge(
                player_id_values[cols_to_merge],
                how='inner',
                left_on='batter',
                right_on='key_mlbam')

            # Cache
            print('Creating cache "batter_hand.csv" in current directory...')
            left_right_with_keys.to_csv('batter_hand.csv', index=False)

        # Load the cache
        left_right = pd.read_csv('batter_hand.csv')

        batting_df2 = batting_df.merge(
            left_right[[batting_column, 'key_bbref']],
            how="left",
            left_on="player",
            right_on="key_bbref")
        # bug fix: the positional `axis` argument to DataFrame.drop was
        # deprecated and removed in pandas 2.0 -- pass it by keyword
        batting_df2.drop('key_bbref', axis=1, inplace=True)

        # Group by stadium and batting hand.  Possible future expansion
        # here based on date
        stadium_hand_averages = batting_df2.groupby(
            ['stadium', batting_column])['batting_avg'].mean()
        stadium_hand_averages = stadium_hand_averages.reset_index()

        stadium_hand_averages.rename(
            {"batting_avg": 'stadium_batting_avg_' + str(switch_cutoff)},
            axis=1,
            inplace=True)

        # Bring the stadium averages back into the main DF so we link back
        # up with game_id
        batting_df3 = batting_df2.merge(stadium_hand_averages,
                                        how="left",
                                        left_on=["stadium", batting_column],
                                        right_on=["stadium", batting_column])

        return_frame = batting_df3[[
            'game_id', 'stadium_batting_avg_' + str(switch_cutoff)
        ]]

        return return_frame
def run_pull(start_date, yr=2021):
    """Pull statcast at-bat results, cache them locally, and publish a
    per-player game log (including upcoming schedule rows) to Google Sheets.

    Parameters
    ----------
    start_date : str
        'YYYY-MM-DD' date to start pulling from when no local cache exists.
    yr : int, default 2021
        Season year used when fetching each team's schedule.

    Side effects
    ------------
    Reads/writes ``/home/irarickman/data/lastabs.pkl`` and overwrites the
    first worksheet of the 'MLB At Bats' Google Sheet.  Returns None.
    """

    def pull_with_names(start, end):
        # Pull statcast rows that ended a plate appearance (events not null)
        # and attach batter first/last names via MLBAM id lookup.
        raw = statcast(start, end)
        finished = raw[raw.events.notnull()]
        ids = playerid_reverse_lookup(finished.batter.unique())
        id_df = ids[['name_last', 'name_first', 'key_mlbam']]
        return finished.merge(id_df,
                              how='left',
                              left_on='batter',
                              right_on='key_mlbam')

    pth = "/home/irarickman/data"
    # "Yesterday" in US/Eastern -- statcast data is complete through then.
    yd = (datetime.now(pytz.timezone('US/Eastern')) -
          timedelta(1)).strftime('%Y-%m-%d')

    if path.exists(pth + '/lastabs.pkl'):
        old_data = pd.read_pickle(pth + "/lastabs.pkl")
        old_data.game_date = pd.to_datetime(old_data.game_date,
                                            infer_datetime_format=True)
        od = old_data.game_date.max().strftime("%Y-%m-%d")
        if od == yd:
            # Cache is already current -- nothing to do.
            return
        ## if the entered date equals yesterday (which it will in the dag),
        ## we need to re-check the previous day's data to make sure we
        ## didn't miss anything; exact duplicates are dropped below
        df = pd.concat([old_data, pull_with_names(od, yd)])
    else:
        df = pull_with_names(start_date, yd)

    df.drop_duplicates(inplace=True)
    # Cache the raw merged pull; derived columns below are recomputed per run.
    df.to_pickle(pth + "/lastabs.pkl")

    # Per-event indicators: was it a hit, and does it count as an at-bat?
    hit_events = {"single", "double", "triple", "home_run"}
    df['hit'] = df.events.apply(lambda e: 1 if e in hit_events else 0)
    non_ab_events = {
        'walk', 'hit_by_pitch', "caught_stealing_2b",
        "pickoff_caught_stealing_2b", 'pickoff_1b', 'catcher_interf',
        'pickoff_caught_stealing_3b', 'pickoff_2b',
        'pickoff_caught_stealing_home', 'caught_stealing_3b',
        'caught_stealing_home', "sac_fly", 'sac_bunt', 'sac_fly_double_play',
        'sac_bunt_double_play'
    }
    df['ab'] = df.events.apply(lambda e: 0 if e in non_ab_events else 1)

    # The batting team is the home team during the bottom of the inning.
    df['player_team'] = df.apply(lambda x: x.home_team
                                 if x.inning_topbot == "Bot" else x.away_team,
                                 axis=1)
    df['Opp'] = df.apply(lambda x: x.away_team
                         if x.inning_topbot == "Bot" else x.home_team,
                         axis=1)
    df['Place'] = df.apply(lambda x: "Home"
                           if x.inning_topbot == "Bot" else "Away",
                           axis=1)

    # statcast and baseball-reference abbreviate these two teams differently.
    fixers = {"WSH": "WSN", "CWS": "CHW"}
    teams_fixed = [fixers.get(t, t) for t in df.player_team.unique()]

    # Remaining-season schedule (games after the last cached date) per team.
    team_schedule = {}
    missed = []
    for t in teams_fixed:
        try:
            d = schedule_and_record(yr, t)
            # Strip doubleheader markers like "(2)" before parsing dates.
            # regex=True is required: pandas >= 2.0 defaults to literal match.
            d['fix_date'] = d.Date.str.replace(
                r"\(\d\)", "", regex=True).str.strip() + " " + str(yr)
            d['game_date'] = pd.to_datetime(d.fix_date.apply(
                lambda x: datetime.strptime(x, "%A, %b %d %Y")).apply(
                    lambda x: x.strftime("%Y-%m-%d")),
                                            infer_datetime_format=True)
            d['Place'] = d.Home_Away.apply(lambda x: "Home"
                                           if x == "Home" else "Away")
            team_schedule[t] = d[d.game_date > df.game_date.max()][[
                'Place', "Opp", "game_date"
            ]]
        except ValueError:
            print(t)
            missed.append(t)

    df['name_last'] = df['name_last'].str.capitalize()
    df['name_first'] = df['name_first'].str.capitalize()
    df['player_name'] = df.name_first + " " + df.name_last

    # .copy() so the in-place sort doesn't hit a view of df
    # (SettingWithCopyWarning in the original).
    sm_df = df[[
        'game_date', 'game_pk', 'hit', 'ab', 'Opp', 'Place', 'player_name',
        'player_team', 'key_mlbam'
    ]].copy()
    sm_df.sort_values(['player_name', 'game_date', 'game_pk'], inplace=True)
    # One row per player-game with summed hits / at-bats.
    trim_df = sm_df.groupby([
        'player_name', 'game_date', 'game_pk', 'Opp', 'Place', 'player_team',
        'key_mlbam'
    ]).sum().reset_index()

    def player_df(player, d=trim_df):
        # Game log for one player plus his team's remaining schedule.
        temp = d[d.player_name == player]
        temp = temp.sort_values(['game_date']).reset_index(drop=True)
        tm = temp.loc[len(temp) - 1, 'player_team']  # most recent team
        sched = team_schedule[fixers.get(tm, tm)]
        tdf = pd.concat([temp, sched], sort=False)
        # Future games have no stats yet; fill identifying columns.
        tdf.ab.fillna(0, inplace=True)
        tdf.hit.fillna(0, inplace=True)
        tdf.player_name.fillna(player, inplace=True)
        tdf.player_team.fillna(tm, inplace=True)
        return tdf

    # Build all per-player frames first, then concatenate once
    # (the original loop re-concatenated the growing frame: O(n^2)).
    master_df = pd.concat(
        [player_df(p) for p in trim_df.player_name.unique()], sort=False)
    master_df.game_date = master_df.game_date.apply(
        lambda x: format(x, "%Y-%m-%d"))

    ## now write to the google sheet

    # #authorization
    gc = pygsheets.authorize(
        service_file='/home/irarickman/formal-thunder-186123-ab6b0fb6bc46.json'
    )
    sh = gc.open('MLB At Bats')

    #select the first sheet
    wks = sh[0]

    wks.set_dataframe(master_df, (1, 1))