示例#1
0
def pitchingFangraphsData(dateRange, stats):
    """Download FanGraphs pitching stats and keep only the requested columns.

    Args:
        dateRange: Two-item sequence of date strings. Only the first four
            characters of each (the year) are used: item 0 is the first
            season, item 1 the last.
        stats: Column names to keep. Names that do not occur in the
            downloaded table are silently ignored.

    Returns:
        DataFrame containing only the columns named in ``stats``, in the
        order they appear in the downloaded table.
    """
    # Only the year part of each date is passed to the download call.
    start = int(dateRange[0][:4])
    end = int(dateRange[1][:4])

    print("Gathering Data from Fangraphs")
    pitchStats = bball.pitching_stats(start, end)
    print("Data Gathering Complete")
    pitchStatsDF = pd.DataFrame(pitchStats)

    # Everything that was not asked for is dropped; a set makes the
    # membership test O(1) per column instead of rescanning `stats`.
    wanted = set(stats)
    drop = [col for col in pitchStatsDF.columns if col not in wanted]
    return pitchStatsDF.drop(columns=drop)
示例#2
0
def load_pitching_data():
    """Fetch the 2019 pitching table once per ``handVar`` value ('L', 'R').

    Each table is restricted to rows with at least 9 innings pitched
    (``IP >= 9``).

    Returns:
        Tuple ``(table_for_L, table_for_R)``.
    """
    # NOTE(review): handVar is presumably a handedness split - confirm
    # against the pitching_stats implementation this file imports.
    filtered = []
    for hand in ('L', 'R'):
        season = pitching_stats(2019,
                                end_season=None,
                                league='all',
                                qual=1,
                                ind=0,
                                handVar=hand)
        filtered.append(season[season['IP'] >= 9])
    pitching_2019_L, pitching_2019_R = filtered
    return pitching_2019_L, pitching_2019_R
def make_period_dicts(dictionary):
    """Download batting and pitching tables for every key of ``dictionary``.

    Each key is converted with ``int()`` and used as the season; both
    downloads pass ``qual=False``.

    Returns:
        Tuple ``(batter_df, pitcher_df)`` of dicts that map each original
        key to its batting / pitching table.
    """
    seasons = list(dictionary.keys())

    # All batting tables are fetched first, then all pitching tables.
    batter_df = {}
    for season in seasons:
        batter_df[season] = pyb.batting_stats(int(season), qual=False)

    pitcher_df = {}
    for season in seasons:
        pitcher_df[season] = pyb.pitching_stats(int(season), qual=False)

    return batter_df, pitcher_df
示例#4
0
def preprocess_data():
    """Build the Cy Young training table and write it to ``train_data.csv``.

    Steps:
      1. Read the per-league ballot files ``Data/CY_<league>_<season>.txt``
         for 2006-2015 and flag rank 1 as the winner (``CY == 1``).
      2. Download pitching stats for 2006 through ``last_season`` and pass
         them through ``create_var``.
      3. Outer-merge stats and ballots on ``Name``/``Season``; pitchers
         without a ballot row get ``CY == 0``.
      4. Mark the 2016 and 2017 winners by hand.
      5. Drop every column containing a missing value, drop
         ``Team``/``Dollars``/``Age Rng``, index by ``Season`` and write the
         result to ``<data_dir>/train_data.csv``.

    Relies on the module-level names ``qual``, ``last_season``, ``data_dir``
    and ``create_var``, none of which are defined in this function.
    """
    print("\nload cy young award winner data")
    ballots = []
    # Ballot files are available for 2006 through 2015.
    for season in range(2006, 2016):
        for league in ('AL', 'NL'):
            file_name = "CY_" + league + "_" + str(season) + ".txt"
            tmp = pd.read_table(os.path.join('Data', file_name), sep=",")
            tmp['Season'] = season
            tmp['League'] = league
            ballots.append(tmp)
    old_cy = pd.concat(ballots, axis=0)
    # The award winner is the row with Rank 1.
    old_cy['CY'] = [int(rank == 1) for rank in old_cy['Rank']]
    cy = old_cy[['Name', 'Season', 'CY']]

    # Start from 2006 because more advanced stats are recorded since then.
    print("\nstart downloading data")
    stats1 = pb.pitching_stats(start_season=2006, end_season=2010, qual=qual)
    stats2 = pb.pitching_stats(start_season=2011, end_season=last_season, qual=qual)
    df = create_var(pd.concat([stats1, stats2], axis=0))
    data = pd.merge(df, cy, on=['Name', 'Season'], how='outer')

    # Assign the filled column back: fillna(inplace=True) on a column
    # selection operates on a temporary under pandas copy-on-write and
    # would leave `data` unchanged.
    data['CY'] = data['CY'].fillna(0)

    # 2016 and 2017 winners are not covered by the ballot files.
    manual_winners = {
        2016: ['Max Scherzer', 'Rick Porcello'],
        2017: ['Max Scherzer', 'Corey Kluber'],
    }
    for season, names in manual_winners.items():
        is_winner = data['Name'].isin(names) & (data['Season'] == season)
        data.loc[is_winner, 'CY'] = 1

    print("\npreprocess data")
    # thresh == row count keeps only columns with no missing value at all.
    mod_data = data.dropna(thresh=data.shape[0], axis=1)
    # Season becomes the index so it can be used to split the data later.
    dout = mod_data.drop(['Team', 'Dollars', 'Age Rng'], axis=1).set_index('Season')

    print("\nwrite data to files\n")
    os.makedirs(data_dir, exist_ok=True)
    dout.to_csv(os.path.join(data_dir, 'train_data.csv'), index=True)
示例#5
0
def cmd_pitching_upload(start_year, end_year, db_username, db_password, db_hostname, db_name, db_tablename):
    """Download pitching stats for a season range and upload them to a table.

    Progress and the outcome are reported through ``click.echo``. Any
    exception raised while downloading, renaming columns or uploading is
    caught and reported instead of propagating.
    """
    click.echo('[[[ PULLING PITCHING DATAFRAME ]]]')

    engine = initdb_pitching(db_username, db_password, db_hostname, db_name, db_tablename)

    click.echo('Pulling data...')
    try:
        data = pitching_stats(start_year, end_year)
        # Strip '%' from the column names before handing the frame to upload_block.
        data.columns = data.columns.str.replace('%', '')
        upload_block(data, engine, db_tablename)
    except Exception as exc:
        click.echo("ERROR pulling down data - Error was = {}".format(exc))
        return

    click.echo('SUCCESS, pulled {} records'.format(data.shape[0]))
示例#6
0
def retrieve_stats(team, year):
    """Print a team's batting and pitching tables for one season.

    Batters are listed by plate appearances and pitchers by innings
    pitched, both in descending order, with the player name as the index.

    Args:
        team: Value matched against the ``Team`` column.
        year: Season passed to ``batting_stats`` and ``pitching_stats``.
    """
    batting = batting_stats(year)
    pitching = pitching_stats(year)

    batting_columns = ['Name', 'PA', 'WAR', 'wRC+', 'AVG', 'OBP', 'SLG', 'OPS', 'HR']
    team_batting = batting[batting['Team'] == team][batting_columns]
    team_batting = team_batting.sort_values('PA', ascending=False).set_index('Name')

    print("***** BATTING STATS *****")
    print(team_batting)
    print("")

    pitching_columns = ['Name', 'IP', 'WAR', 'FIP']
    team_pitching = pitching[pitching['Team'] == team][pitching_columns]
    team_pitching = team_pitching.sort_values('IP', ascending=False).set_index('Name')

    print("***** PITCHING STATS *****")
    print(team_pitching)
示例#7
0
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pybaseball as pyb
from pybaseball import statcast_batter
from pybaseball import playerid_lookup

# Show every row and column when a frame is printed, and turn off pandas'
# chained-assignment check (the option is set to None).
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("chained_assignment", None)

# One pitching table per season, 2018-2020.
pitchers_2018 = pyb.pitching_stats(2018)
pitchers_2019 = pyb.pitching_stats(2019)
pitchers_2020 = pyb.pitching_stats(2020)

# One batting table per season, 2018-2020.
df_2018 = pyb.batting_stats(2018)
df_2019 = pyb.batting_stats(2019)
df_2020 = pyb.batting_stats(2020)

# Disabled: restrict the 2019 tables to the Cardinals.
#cards_df = df_2019.loc[df_2019['Team'] == "Cardinals"]
#cards_pitchers = pitchers_2019.loc[pitchers_2019['Team'] == "Cardinals"]
# The two string literals below are inert expression statements: the first
# holds a disabled statcast_batter example, the second is a note describing
# the disabled print statements that follow.
"""
winker_stats = statcast_batter('2020-08-01', '2020-08-03', 608385)
print(winker_stats)
"""
"""
Sorting the Cardinals (2019) by WAR (top 30)
And ONLY showing the WAR and name columns.
"""
#print(cards_df.sort_values(by = 'WAR', ascending = False).head(30))
#print(cards_pitchers)
示例#8
0
# FanGraphs spellings that do not match the player-ID lookup table, keyed by
# the full FanGraphs name. Each value is (first_name, last_name); None keeps
# the default derived from the name itself. These were collected by hand, so
# other seasons may need further entries.
_LOOKUP_NAME_FIXES = {
    'Zack Wheeler': ('Zach', None),
    'Matthew Boyd': ('Matt', None),
    'C.J. Wilson': ('c. j.', None),
    'R.A. Dickey': ('R. A.', None),
    'Jon Niese': ('Jonathon', None),
    'A.J. Burnett': ('A. J.', None),
    'Jorge De La Rosa': ('Jorge', 'De La Rosa'),
    'Rubby de la Rosa': ('Rubby', 'de la Rosa'),
    'Cole DeVries': (None, 'De Vries'),
    'Samuel Deduno': ('Sam', None),
    'JC Ramirez': ('J. C.', None),
    'Nathan Karns': ('Nate', None),
    'Daniel Ponce de Leon': (None, 'Ponce de Leon'),
    'Chi Chi Gonzalez': ('Chi Chi', 'Gonzalez'),
    'Josh A. Smith': ('Josh', 'Smith'),
    'Joel De La Cruz': (None, 'De La Cruz'),
}

# Pitch-type codes that get their own group of summary columns.
_PITCH_TYPES = ['SL', 'FF', 'CU', 'FT', 'CH', 'FC', 'KC', 'SI', 'PO', 'FS', 'EP', 'SC', 'KN']

# Per-pitch measurements that are summarised for every pitch type.
_PITCH_MEASUREMENTS = ['release_speed', 'release_pos_x', 'release_pos_z', 'pfx_x', 'pfx_z',
                       'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'effective_speed',
                       'release_spin_rate']


def _lookup_name_parts(name):
    """Return the ``(first, last)`` spelling to use when looking up ``name``."""
    parts = name.split(' ')
    # "J.A." style initials are stored as "J. A." in the lookup table.
    first = parts[0].replace('.', '. ', 1)
    last = parts[1]
    fixed_first, fixed_last = _LOOKUP_NAME_FIXES.get(name, (None, None))
    if fixed_first is not None:
        first = fixed_first
    if fixed_last is not None:
        last = fixed_last
    return first, last


def _nan_to_zero(value):
    """Return 0 in place of NaN, otherwise ``value`` unchanged."""
    return 0 if math.isnan(value) else value


def _summarise_pitches(name, player):
    """Return a one-row frame of std/mean/min/max per pitch type and measurement."""
    row = {'Name': [name]}
    for pitch in _PITCH_TYPES:
        pitches = player[player['pitch_type'] == pitch][_PITCH_MEASUREMENTS]
        for stat in _PITCH_MEASUREMENTS:
            values = pitches[stat]
            # A pitch type the player never threw yields NaN, stored as 0.
            row[pitch + "_" + stat + '_std'] = [_nan_to_zero(np.std(values))]
            row[pitch + "_" + stat + '_mean'] = [_nan_to_zero(np.mean(values))]
            row[pitch + "_" + stat + '_min'] = [_nan_to_zero(np.min(values))]
            row[pitch + "_" + stat + '_max'] = [_nan_to_zero(np.max(values))]
    return pd.DataFrame(row)


def get_data(year = 2018, minimum_starts = 5):
    """Write ``<year>.csv`` with per-pitch-type summaries for every starter.

    Downloads are cached on disk and reused on later runs:
      * ``<year>/Players_Stats_<year>.csv`` - season pitching stats limited
        to pitchers with more than ``minimum_starts`` games started;
      * ``<year>/player/<name>-<year>.csv`` - pitch-level data per pitcher.

    For each pitcher and each known pitch type, the standard deviation,
    mean, minimum and maximum of every tracked measurement are stored in
    columns named ``<pitch>_<measurement>_<std|mean|min|max>``; a value that
    cannot be computed (no such pitches) is stored as 0.

    Args:
        year: Season to process.
        minimum_starts: Pitchers need strictly more games started than this.

    Raises:
        IndexError: If the ID lookup finds no player active in ``year`` for
            a pitcher whose pitch data is not cached yet.
    """
    season = str(year)
    stats_path = os.path.join(season, "Players_Stats_" + season + ".csv")
    player_dir = os.path.join(season, 'player')
    os.makedirs(player_dir, exist_ok=True)

    if not os.path.exists(stats_path):
        player_stats = pitching_stats(year, year)
        player_stats = player_stats[player_stats['GS'] > minimum_starts]
        player_stats.to_csv(stats_path)
    else:
        player_stats = pd.read_csv(stats_path)

    rows = []
    for name in player_stats['Name']:
        player_path = os.path.join(player_dir, name + '-' + season + '.csv')
        if not os.path.exists(player_path):
            first, last = _lookup_name_parts(name)
            player_id = playerid_lookup(last, first)
            print(year)
            # Several players can share a name; keep those active in `year`.
            player_id = player_id[player_id['mlb_played_first'] <= year]
            player_id = player_id[player_id['mlb_played_last'] >= year]

            print(player_id)
            print(len(player_id))
            if len(player_id) != 1:
                # Ambiguous or missing match: report it, then fall through
                # and use the first candidate.
                print(player_id)
                print(name)
                print("Concerning")

            player = statcast_pitcher(season + '-1-01', season + '-12-31',
                                      player_id['key_mlbam'].iloc[0])
            player.to_csv(player_path)
        else:
            player = pd.read_csv(player_path)

        rows.append(_summarise_pitches(name, player))

    # A single concat avoids re-copying the accumulated frame per player;
    # with no qualifying pitchers an empty table is still written.
    out = pd.concat(rows) if rows else pd.DataFrame(columns=['Name'])
    out.to_csv(season + ".csv")
示例#9
0
    #Create Empty set to use for export
    Result = []

    #Import Files
    # last three years of player data
    MarcelTable = pd.read_csv('data/marcel/MarcelTable.csv')

    # League Averages by Year
    lgAVG = pd.read_csv('data/marcel/lgAVG.csv')

    Year = lgAVG.index[lgAVG["Season_bat"] == 2019.0]
    Year = lgAVG[lgAVG['Season_bat'] == Year1].index.values.astype(int)[0]
    print(Year)


#marcelCalculations()

# Import pandas package
import pandas as pd
from pybaseball import pitching_stats

# Download the 2019 pitching table.
data = pitching_stats(2019)

# making data frame
#data = pd.read_csv("data/marcel/MarcelTable.csv")

# Print every column name of the table, one per line.
for col in data.columns:
    print(col)
示例#10
0
from pybaseball import pitching_stats
import pathlib, time

# Fifty seasons counting back from 2019, newest first, kept as strings.
years = [str(2019 - offset) for offset in range(50)]

# All CSV files go into ./pitching_status under the current directory.
folder_path = pathlib.Path.cwd() / 'pitching_status'
if not folder_path.exists():
    folder_path.mkdir()

for year in years:
    print(f'Start with {year}')
    filename = f'{year}_pitching_status.csv'
    file_path = folder_path / filename

    if not file_path.exists():
        # The season is passed to pitching_stats as a string.
        data = pitching_stats(year)
        data.to_csv(file_path, encoding='utf_8_sig')
        print(f'Done with {year}')
    else:
        # Seasons already on disk are skipped, so a rerun resumes the job.
        print(f'{year} already downloaded !')
示例#11
0
"""PyBaseball.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1BZ3IYkf_Kka410P_Hne0RCZRT84UWPFm
"""

# Notebook shell escape (IPython syntax, not plain Python): install pybaseball.
!pip install pybaseball

# Pitch-level Statcast data for 24-27 June 2017; show the first two rows.
from pybaseball import statcast
data = statcast(start_dt='2017-06-24', end_dt='2017-06-27')
data.head(2)

# Pitching stats for the 2012 and 2016 arguments; show the first rows.
# NOTE(review): presumably the seasons 2012 through 2016 - confirm against
# the pybaseball documentation.
from pybaseball import pitching_stats
data = pitching_stats(2012, 2016)
data.head()

from pybaseball import playerid_lookup
from pybaseball import statcast_pitcher
import pandas as pd
# Pitcher list; the MLBID column is passed to statcast_pitcher as the player ID.
csv = '2019pitchers.csv'
df = pd.read_csv(csv)
print(df)

import pandas as pd
# DataFrame.get_value was removed in pandas 1.0; .at is the label-based
# scalar accessor that replaces it.
frames = [statcast_pitcher('2019-03-27', '2019-11-01', df.at[0, 'MLBID'])]
# Rows 1..120 of the pitcher list; each pitcher's frame is reversed before
# it is appended (the first pitcher's frame is kept in download order).
for i in range(1, 121):
    data = statcast_pitcher('2019-03-27', '2019-11-01', df.at[i, 'MLBID'])
    data = data[::-1]
    frames.append(data)
# One concat at the end instead of re-copying the growing frame each pass.
alldata = pd.concat(frames)
from pybaseball import batting_stats
from pybaseball import pitching_stats

# Disabled: the matching batting download.
#hitter_data = batting_stats(2015, 2017)
# Pitching stats for the 2015 and 2017 arguments, printed in full.
pitcher_data = pitching_stats(2015, 2017)

print(pitcher_data)
示例#13
0
    'K/9+': 'K_9_plus',
    'BB/9+': 'BB_9_plus',
    'H/9+': 'H_9_plus',
    'HR/9+': 'HR_9_plus',
    'LOB%+': 'LOB_pct_plus',
    'WHIP+': 'WHIP_plus'
}

# Upload one table per (year, side) pair; the table is named
# "<side>_<year>" and prefixed with "fangraphs.".
for year in years:
    for side in sides:
        table_name = '_'.join((side, str(year)))
        table_id = "fangraphs." + table_name
        if side == 'batting':
            df = batting_stats(year, qual=0)
        elif side == 'pitching':
            df = pitching_stats(year, qual=0)
        df.rename(columns=renamed_columns, inplace=True)
        print('Loading ' + table_id)
        # Every column is declared as STRING in the uploaded schema.
        table_schema = [{'name': column, 'type': 'STRING'} for column in df.columns]
        pandas_gbq.to_gbq(df,
                          table_id,
                          project_id=project_id,
                          if_exists='replace',
                          table_schema=table_schema,
                          chunksize=10000)
        # try:
        #     if side == 'batting':
        #         df = batting_stats(year)
        #     elif side == 'pitching':
示例#14
0
def profile_pitching_stats():
    """Call ``pitching_stats(2019)`` and discard the result.

    Going by the name, this exists to be timed by a profiler.
    """
    pitching_stats(2019)
示例#15
0
def yearGrab(currentSeason):
    """Download one season of pybaseball data and write it to CSV files.

    Writes, under ``data/YearlyData/temp``: individual and team batting and
    pitching stats (``bstats``, ``team_bstats``, ``pstats``, ``team_pstats``),
    exit-velocity data (``statcast_exit_velocity``) and the ``fan_*`` copies
    of the individual and team tables. The top-prospects table goes to
    ``data/YearlyData/Top_Prospects.csv``.

    Tables requested with identical arguments (individual pitching, team
    batting, team pitching) are downloaded once and written to both of
    their destination files.

    Args:
        currentSeason: Season passed to every per-season pybaseball call.
    """
    import pybaseball
    from pybaseball import batting_stats
    from pybaseball import pitching_stats
    from pybaseball import statcast_batter_exitvelo_barrels
    from pybaseball import top_prospects

    # Batting Stats
    BattingStats_Year = batting_stats(currentSeason, qual=1)
    BattingStats_Year.to_csv('data/YearlyData/temp/bstats.csv')

    # Team Batting Stats
    BattingStats_Team_Year = pybaseball.team_batting(currentSeason)
    BattingStats_Team_Year.to_csv('data/YearlyData/temp/team_bstats.csv')

    # Pitching Stats
    PitchingStats_Year = pitching_stats(currentSeason)
    PitchingStats_Year.to_csv('data/YearlyData/temp/pstats.csv')

    print(str(currentSeason) + " : Stats Grab: Successful")

    # Team Pitching Stats
    PitchingStats_Team_Year = pybaseball.team_pitching(currentSeason)
    PitchingStats_Team_Year.to_csv('data/YearlyData/temp/team_pstats.csv')

    # Disabled: year standings.
    # if currentSeason >=1969:
    #     from pybaseball import standings
    #     # get the end-of-season division standings for each season
    #     Standings = standings(currentSeason)
    #     Standings.insert(0, 'Season', currentSeason)
    #     Standings.to_csv('data/YearlyData/temp/team_standings.csv')

    # Disabled: amateur draft data.
    # if currentSeason >= 1965:
    #
    #     from pybaseball import amateur_draft
    #
    #     # Get amateur Draft Results
    #     Number_Rounds = 30
    #     RoundRange = list(range(1, Number_Rounds + 1))
    #     for i in RoundRange:
    #         #Pull season draft data 1 round at a time
    #         Amateur_Draft = amateur_draft(currentSeason, i)
    #         #Add Draft Year
    #         Amateur_Draft.insert(0, 'Draft_Year', currentSeason)
    #
    #         # print(Amateur_Draft.head()) # Test
    #
    #         # Read file with all draft files so far
    #         A_Draft = pd.read_csv('data/YearlyData/temp/amateur_draft.csv')
    #         # Add New Round
    #         A_Draft = A_Draft.append(Amateur_Draft, ignore_index=True)
    #         #Save new round to File
    #         A_Draft.to_csv('data/YearlyData/temp/amateur_draft.csv', sep=',', index=False, encoding='utf-8')

    # Exit Velocity Data
    Exit_Velocity = statcast_batter_exitvelo_barrels(currentSeason)
    Exit_Velocity.insert(0, 'Season', currentSeason)
    Exit_Velocity.to_csv('data/YearlyData/temp/statcast_exit_velocity.csv')

    # FanGraphs data.
    # Individual batting is requested again because this call uses the
    # default qualification, unlike the qual=1 request above.
    fan_bat = pybaseball.batting_stats(currentSeason)
    fan_bat.to_csv('data/YearlyData/temp/fan_bat.csv')

    # The remaining fan_* files take the same arguments as the tables
    # already downloaded above, so those frames are reused.
    fan_pit = PitchingStats_Year
    fan_pit.to_csv('data/YearlyData/temp/fan_pit.csv')

    fan_team_bat = BattingStats_Team_Year
    fan_team_bat.to_csv('data/YearlyData/temp/fan_team_bat.csv')

    fan_team_pit = PitchingStats_Team_Year
    fan_team_pit.to_csv('data/YearlyData/temp/fan_team_pit.csv')

    # Get top overall prospects leaguewide
    topProspects = top_prospects()
    topProspects.to_csv('data/YearlyData/Top_Prospects.csv')
示例#16
0
def points(currentSeason):
    """Score every batter and pitcher of a season and write ranked CSV files.

    Batting points are a weighted sum of counting stats from
    ``batting_stats(currentSeason, qual=1)``; pitching points are a weighted
    sum of stats from ``pitching_stats(currentSeason)``. Both tables are
    sorted by points (descending) and written to ``data/bstats.csv`` and
    ``data/pstats.csv``.

    The weights (``R``, ``Single``, ``HR``, ``IP`` ...) are not defined in
    this function.
    NOTE(review): presumably pointAmountsBat() / pointAmountsPit() set them
    as module globals - confirm. If so, the call order matters, because
    names such as HR, BB, K and IBB are used by both formulas.
    """
    from pybaseball import batting_stats

    data = batting_stats(currentSeason, qual=1)

    pointAmountsBat()

    # NOTE: this local shadows the function's own name inside the body.
    points = ( ( R * data['R'] )
    + ( Single * data['1B'] )
    + ( Double * data['2B'] )
    + ( Triple * data['3B'] )
    + ( HR * data['HR'] )
    # Total bases: 1 per single, 2 per double, 3 per triple, 4 per home run.
    + ( TB * ( data['1B'] + (2 * data['2B']) + (3 * data['3B']) + (4 * data['HR']) ) )
    + ( RBI * data['RBI'] )
    + ( BB * data['BB'] )
    + ( K * data['SO'] )
    + ( SB * data['SB'] )
    + ( AB * data['AB'] )
    + ( Hits * data['H'])
    # Extra-base hits: doubles, triples and home runs.
    + ( XBH * ( data['2B'] + data['3B'] + data['HR']) )
    + ( IBB * data['IBB'] )
    + ( HBP * data['HBP'] )
    + ( CS * data['CS'] ) )


    # missing Sac and GWRBI and everything past CS

    # Points are rounded to whole numbers.
    data['Points'] = points
    data['Points'] = data['Points'].round(decimals=0)

    BattingStats = data[['Season', 'Name', 'Team', 'Age', 'Points', 'G', 'PA', 'AB', 'AVG', 'H', '1B', '2B', '3B', 'HR', 'R', 'RBI', 'BB', 'IBB', 'SO', 'HBP', 'SB', 'CS', 'Pitches']]

    # Highest score first, with a fresh 0..n-1 index.
    BattingStats = BattingStats.sort_values('Points', ascending=False)
    BattingStats = BattingStats.reset_index(drop=True)


    BattingStats.to_csv('data/bstats.csv')

    from pybaseball import pitching_stats

    pdata = pitching_stats(currentSeason)

    pointAmountsPit()

    ppoints = ( ( IP * pdata['IP'] )
    + ( ER * pdata['ER'] )
    + ( K * pdata['SO'] )
    + ( SO * pdata['ShO'] )
    + ( W * pdata['W'] )
    + ( L * pdata['L'] )
    + ( SV * pdata['SV'] )
    + ( BS * pdata['BS'] )
    + ( G * pdata['G'] )
    + ( GS * pdata['GS'] )
    + ( H * pdata['H'] )
    + ( RA * pdata['R'] )
    + ( HR * pdata['HR'] )
    + ( BB * pdata['BB'] )
    + ( HB * pdata['HBP'] )
    + ( IBB * pdata['IBB'] )
    + ( B * pdata['BK'] )
    #+ ( PKO * pdata['PKO'] )
    # NOTE(review): the QS weight is applied to the 'BK' column, the same
    # column as the line above - looks like a copy-paste slip; confirm which
    # column was intended.
    + ( QS * pdata['BK'] )
    + ( CG * pdata['CG'] )
    #+ ( NH * pdata['NH'] )
    #+ ( PG * pdata['PG'] )
    #+ ( BF * pdata['BF'] )
    + ( PC * pdata['Pitches'] )
    #+ ( SOP * pdata['SOP'] )
    #+ ( HD * pdata['HD'] )
    )

    # Points are rounded to whole numbers.
    pdata['Points'] = ppoints
    pdata['Points'] = pdata['Points'].round(decimals=0)

    PitchingStats = pdata[['Season', 'Name', 'Team', 'Age', 'Points', 'W', 'L', 'ERA', 'ER', 'WAR', 'G', 'GS', 'CG', 'ShO', 'SV', 'BS', 'IP', 'H', 'R', 'HR', 'BB', 'IBB', 'HBP', 'BK', 'SO', 'TBF', 'Pitches']]

    # Highest score first, with a fresh 0..n-1 index.
    PitchingStats = PitchingStats.sort_values('Points', ascending=False)
    PitchingStats = PitchingStats.reset_index(drop=True)

    PitchingStats.to_csv('data/pstats.csv')
示例#17
0
import json

currentDate = datetime.now()
currentMonth = f"{currentDate.month:02d}"
if currentDate.month <= 3:
    # January-March: use last year's tables and skip the 7-day window.
    currentYear = currentDate.year - 1
    currentDay = None
    recentDay = None
else:
    # April onwards: use this year's tables plus a window covering the
    # last 7 days, with both ends as ISO date strings.
    currentYear = currentDate.year
    currentDay = date.today()
    recentDay = str(currentDay - timedelta(days=7))
    currentDay = str(currentDay)

# Full-season tables for the selected year.
pitchingData = pitching_stats(currentYear)
battingData = batting_stats(currentYear)

if currentDay is not None:
    recentPitchingData = pitching_stats_range(recentDay, currentDay)
    recentBattingData = batting_stats_range(recentDay, currentDay)

    recentPitchingDataFile = open("../public/json/pitcherRankingsRecent.json", "w")
    recentPitchingDataFile.write(json.dumps(json.loads(recentPitchingData.reset_index().to_json(orient='index')), indent=2))
    recentPitchingDataFile.close()

    recentBattingDataFile = open("../public/json/batterRankingsRecent.json", "w")
    recentBattingDataFile.write(json.dumps(json.loads(recentBattingData.reset_index().to_json(orient='index')), indent=2))
    recentBattingDataFile.close()

else:
示例#18
0
## grammar of graphics module
from plotnine import *

## further modules
from datetime import date

#%% get data

today = date.today()  # today as reference

## first, standard pitching stats for last 5 years
end_1 = today.year
start_1 = today.year - 5  # year as integer
# NOTE(review): if pitching_stats treats both bounds as inclusive, this range
# spans six seasons (year-5 .. year), not the five the message below states -
# confirm against the pybaseball documentation.

pitching = pitching_stats(start_1, end_1)  # takes a while

# print a success message and summary stats of the data set
print()
print("Sucessfully sampled pitching stats of the last 5 years:")
print()
#list(pitching.columns) #list variable names
# The result is not assigned or printed; it is only displayed when this cell
# is run interactively.
pitching.describe()  # summary stats of pitching data frame

## then, more detailed pitching stats for last month

# Date one month before today. date.replace(month=...) alone raises
# ValueError when the day does not exist in the previous month (31 March ->
# "31 February") and keeps the current year in January (giving a date in the
# future). Stepping back from the 1st lands on the last day of the previous
# month with the right year; the day is then clamped to that month's length.
from datetime import timedelta

last_of_previous_month = today.replace(day=1) - timedelta(days=1)
one_month_ago = last_of_previous_month.replace(
    day=min(today.day, last_of_previous_month.day))