Exemplo n.º 1
0
def scrape_xxxfip():
    """Download the latest xxxFIP export from the Shiny app, archive it,
    merge in FanGraphs ids, and replace tracking.xxxfip in Postgres.

    Returns the final DataFrame with columns asof_date, fg_id, xxxFIP.
    """
    import glob
    import shutil

    # Trigger the CSV download from the Shiny app
    driver = selenium_utilities.start_driver(headless=True)
    driver.get("https://mlpitchquality.shinyapps.io/xxxfip_app/")
    time.sleep(5)  # let the Shiny app finish rendering
    input_submit = driver.find_element_by_id('download_but')
    input_submit.click()
    time.sleep(5)  # let the download complete
    driver.close()
    print('Closed driver, hopefully successfully scraped xxxfip data')

    # The most recently modified file in the download dir is our export;
    # the filename encodes the as-of date: xxxFIP_<...>_<YYYY-MM-DD>.csv
    dl_location = "/Users/andrewfelton/Downloads/docker/*.*"
    dl_file = max(glob.glob(dl_location), key=os.path.getmtime)
    filename = dl_file.split('/')[-1]
    asof_date = filename.split('_')[2].split('.')[0]
    if filename.split('_')[0] != 'xxxFIP':
        # raise instead of assert: asserts disappear under `python -O`
        raise ValueError('Unexpected download file: ' + dl_file)
    print('Downloaded file ' + dl_file)

    basepath = "/Users/andrewfelton/Documents/bb/bb-2021/data/xxxfip/"
    new_file = basepath + filename
    # shutil.move replaces the unchecked shell `mv` (which also broke on
    # paths containing spaces)
    shutil.move(dl_file, new_file)

    # Refresh the soft link (equivalent of `ln -sf`)
    ln_file = basepath + "xxxfip.csv"
    if os.path.lexists(ln_file):
        os.remove(ln_file)
    os.symlink(new_file, ln_file)

    xxxfip = pd.read_csv(ln_file)
    xxxfip.insert(0, 'asof_date', asof_date)

    # Merge in the FG IDs.  Name is the only available join key, so patch
    # the handful of known spelling mismatches by hand afterwards.
    xxxfip.rename(columns={
        'Name': 'name',
        'Batters Faced': 'batters_faced'
    },
                  inplace=True)
    names = player_names.get_player_names()
    xxxfip = xxxfip.merge(right=names[['name', 'fg_id']],
                          how='left',
                          on='name')
    manual_fg_ids = {
        'JT Brubaker': '17578',
        'Hyun Jin Ryu': '14444',
        'Kwang Hyun Kim': '27458',
    }
    for player, fg_id in manual_fg_ids.items():
        xxxfip.loc[xxxfip['name'] == player, 'fg_id'] = fg_id

    xxxfip = xxxfip[['asof_date', 'fg_id', 'xxxFIP']]
    # single connection (the original connected twice: top and bottom)
    bbdb = postgres.connect_to_bbdb()
    xxxfip.to_sql('xxxfip', bbdb, schema='tracking', if_exists='replace')

    return xxxfip
Exemplo n.º 2
0
def scrape_actuals(year=2021):
    """Scrape Baseball-Reference season stat tables (standard batting,
    standard/starter/reliever pitching) and replace the corresponding
    tables in the tracking schema.

    year : int -- season to scrape (default 2021).
    """
    bbdb = postgres.connect_to_bbdb()

    urlprefix = 'https://www.baseball-reference.com/leagues/MLB/'

    # (url suffix, page table id, destination table, columns to keep)
    jobs = [
        ('-standard-batting.shtml', 'players_standard_batting',
         'bbref_batting_standard',
         ['year', 'bbref_id', 'Name', 'Age', 'Tm', 'Lg', 'G', 'PA', 'AB',
          'R', 'H', 'HR', 'RBI', 'SB', 'CS', 'BB', 'SO', 'BA', 'OBP', 'SLG',
          'OPS', 'TB', 'GDP', 'HBP', 'SH', 'SF', 'IBB']),
        ('-standard-pitching.shtml', 'players_standard_pitching',
         'bbref_pitching_standard',
         ['year', 'bbref_id', 'Name', 'Age', 'Tm', 'Lg', 'W', 'L', 'ERA',
          'G', 'GS', 'GF', 'CG', 'SHO', 'SV', 'IP', 'H', 'R', 'ER', 'HR',
          'BB', 'IBB', 'SO', 'HBP', 'BK', 'WP', 'BF', 'FIP', 'WHIP', 'H9',
          'HR9', 'BB9', 'SO9']),
        ('-starter-pitching.shtml', 'players_starter_pitching',
         'bbref_pitching_starter',
         ['year', 'bbref_id', 'Name', 'Age', 'Tm', 'IP', 'G', 'GS', 'Wgs',
          'Lgs', 'ND', 'Wchp', 'Ltuf', 'Wtm', 'Ltm', 'Wlst', 'Lsv', 'CG',
          'SHO', 'QS', 'GmScA', 'Best', 'Wrst', 'BQR', 'BQS', 'sDR', 'lDR']),
        ('-reliever-pitching.shtml', 'players_reliever_pitching',
         'bbref_pitching_reliever',
         ['year', 'bbref_id', 'Name', 'Age', 'Tm', 'IP', 'G', 'GR', 'GF',
          'Wgr', 'Lgr', 'SVOpp', 'SV', 'BSv', 'SVSit', 'Hold', 'IR', 'IS',
          'aLI', 'LevHi', 'LevMd', 'LevLo', 'Ahd', 'Tie', 'Bhd', 'Runr']),
    ]

    for urlsuffix, tableid, tablename, cols in jobs:
        # BUG FIX: the original hard-coded 2021 in every call, silently
        # ignoring the `year` argument.
        scraped = scrape_bbref(urlprefix, year, urlsuffix, tableid)
        scraped[cols].to_sql(tablename, bbdb, schema='tracking',
                             if_exists='replace')
Exemplo n.º 3
0
def scrape_bp_pitching():
    """Download the Baseball Prospectus pitching leaderboard via Selenium,
    archive the CSV, merge in FanGraphs ids, and reload hist.bp_pitchers_raw.
    """
    import os
    import shutil
    from datetime import date
    import sys
    sys.path.append('python/general')
    import selenium_utilities
    sys.path.append('python/munging')
    import player_names
    import postgres
    import time
    import pandas as pd

    driver = selenium_utilities.start_driver()
    driver.get("https://www.baseballprospectus.com/leaderboards/pitching/")
    print('Arrived at ' + driver.current_url)

    # The export button has no id, so it is located by DOM position --
    # brittle if BP redesigns the page.
    dl_btn = driver.find_element_by_xpath(
        '//*[@id="app"]/div[2]/div/div[3]/div[1]/div[2]/button')
    dl_btn.click()
    time.sleep(4)  # give the browser time to finish the download

    today = date.today().strftime("%Y%m%d")
    basepath = "/Users/andrewfelton/Documents/bb/bb-2021"
    dl_file = "/Users/andrewfelton/Downloads/docker/bp_export_" + today + ".csv"

    if not os.path.isfile(dl_file):
        # raise instead of assert: asserts disappear under `python -O`
        raise FileNotFoundError('Expected download not found: ' + dl_file)

    new_file = basepath + "/data/bp/bp_pitchers_" + today + ".csv"
    # shutil.move replaces the unchecked, un-waited-for shell `mv`
    shutil.move(dl_file, new_file)

    # Refresh the soft link (equivalent of `ln -sf`)
    ln_file = basepath + "/data/bp/bp_pitchers.csv"
    if os.path.lexists(ln_file):
        os.remove(ln_file)
    os.symlink(new_file, ln_file)

    driver.close()
    #selenium_utilities.stop_selenium('bbsel')
    print("Finished scraping " + ln_file)

    bp_pitchers = pd.read_csv(ln_file)
    bp_pitchers.insert(0, 'asof_date', date.today().strftime('%Y-%m-%d'))
    # (no-op entries 'IP'->'IP' and 'DRA'->'DRA' removed from the map)
    bp_pitchers.rename(columns={
        "bpid": "bp_id",
        "mlbid": "mlb_id",
        '+/-': 'plus_minus',
        'DRA-': 'DRAminus',
        'Whiff%': 'whiff_pct',
        'K%': 'k_pct',
        'BB%': 'bb_pct',
        'K/9': 'k9',
        'BB/9': 'bb9',
        'HR%': 'hr_pct',
        'HR/9': 'hr9',
        'IFFB%': 'iffb_pct',
        'GB%': 'gb_pct',
        'LD%': 'ld_pct',
        'FB%': 'fb_pct',
        '1B': '_1B',
        '2B': '_2B',
        '3B': '_3B'
    },
                       inplace=True)
    # ids as strings so the merge key dtype matches player_names
    bp_pitchers[['bp_id']] = bp_pitchers[['bp_id']].astype("string")
    bp_pitchers[['mlb_id']] = bp_pitchers[['mlb_id']].astype("string")

    names = player_names.get_player_names()
    bp_pitchers = bp_pitchers.merge(names[['mlb_id', 'fg_id']],
                                    on='mlb_id',
                                    how='left')

    tablename = "bp_pitchers_raw"
    bbdb = postgres.connect_to_bbdb()

    # pg_tables rows are (schemaname, tablename, ...) -- index 1 is the name
    query_tables = "SELECT * FROM pg_catalog.pg_tables WHERE schemaname='hist';"
    tables_list = [row[1] for row in bbdb.execute(query_tables)]

    if tablename in tables_list:
        # TRUNCATE keeps the table definition/grants; the append below refills
        bbdb.execute('TRUNCATE TABLE hist.' + tablename)
    bp_pitchers[['fg_id', 'bp_id', 'IP', 'DRAminus', 'cFIP']]\
        .to_sql(tablename, bbdb, schema='hist', if_exists='append', index=False)
Exemplo n.º 4
0
def scrape_cm_draft(draft_num, gs=None, db=None):
    """Scrape a CouchManagers mock draft and optionally push the picks to a
    Google Sheet and/or the drafts schema in Postgres.

    draft_num : int or str -- CouchManagers draft number
    gs        : None, 'SoS', or 'Legacy' -- which Google Sheet to update
    db        : write drafts.cm_mock_<draft_num> when not None
    """
    import sys
    sys.path.append('python/general')
    import postgres
    sys.path.append('python/munging')
    import player_names
    import csv
    import pandas as pd
    import gspread
    import gspread_dataframe
    import requests

    if gs not in (None, 'SoS', 'Legacy'):
        # raise instead of assert: asserts disappear under `python -O`
        raise ValueError("gs must be None, 'SoS', or 'Legacy'")

    # normalize once -- the original crashed on int input at the print below
    draft_num = str(draft_num)
    #draft_num = '46233' # SoS D2 testing
    draft_url = ("https://www.couchmanagers.com/mock_drafts/csv/download.php"
                 "?draftnum=" + draft_num)
    print('Going to scrape draft ' + draft_num + ' from ' + draft_url)
    r = requests.get(draft_url).text.splitlines()
    print('Downloaded the .csv')

    # csv.reader handles quoted fields (including embedded commas) correctly,
    # unlike the previous manual split-on-comma / strip-quotes approach
    rows = list(csv.reader(r))
    colnames = rows.pop(0)
    mock = pd.DataFrame(rows, columns=colnames)

    names = player_names.get_player_names()

    # NOTE(review): 'ottid' is read from the CSV as text -- assumes
    # names['otto_id'] has the same dtype, otherwise the merge matches nothing
    mock = mock.merge(names[['name', 'fg_id', 'otto_id']],
                      how='left',
                      left_on='ottid',
                      right_on='otto_id')
    mock = mock[['Pick', 'Rd', 'Owner', 'name', 'fg_id']]
    print('Munged into mock draft format')

    if gs is not None:
        gc = gspread.service_account(filename='./bb-2021-2b810d2e3d25.json')
        bb2021 = gc.open("BB 2021 " + gs)
        sheettitle = "Mock " + draft_num
        # BUG FIX: the original `sheettitle in bb2021.worksheets() == False`
        # was a chained comparison (always False) and compared a string
        # against Worksheet objects; compare worksheet titles instead.
        if sheettitle not in [ws.title for ws in bb2021.worksheets()]:
            bb2021.add_worksheet(title=sheettitle, rows='1', cols='1')
        else:
            bb2021.values_clear(sheettitle + "!A:Z")
        gspread_dataframe.set_with_dataframe(bb2021.worksheet(sheettitle),
                                             mock)
        # (removed `combined.update` / `hitter_projections.update`: they
        # referenced the bound method without calling it -- pure no-ops)
        print('Updated Google sheet')

    if db is not None:
        bbdb = postgres.connect_to_bbdb()
        mock.to_sql('cm_mock_' + draft_num,
                    bbdb,
                    schema='drafts',
                    if_exists='replace')
        print('Updated database')
Exemplo n.º 5
0
        except IndexError:
            errors.append(tr)

    players = pd.DataFrame(data=players, columns=colnames)
    players.insert(0, 'year', year)

    # Close it down
    driver.close()
    return players


if (1==2):
    urlprefix = 'https://www.baseball-reference.com/leagues/MLB/'
    urlsuffix = '-appearances-fielding.shtml'
    tableid = 'players_players_appearances_fielding'
    bbdb = postgres.connect_to_bbdb()

    positions2020 = scrape_bbref(urlprefix, 2020, urlsuffix, tableid)
    positions2020.to_sql('positions', bbdb, schema='reference', if_exists='replace')
    for y in range(2010,2019):
        positions = scrape_bbref(urlprefix, y, urlsuffix, tableid)
        positions.to_sql('positions', bbdb, schema='reference', if_exists='append')

    urlprefix = 'https://www.baseball-reference.com/leagues/MLB/'
    urlsuffix = '-standard-batting.shtml'
    tableid = 'players_standard_batting'

    batting_2021 = scrape_bbref(urlprefix, 2021, urlsuffix, tableid)
    batting_2021.to_sql('positions', bbdb, schema='reference', if_exists='replace')

    urlsuffix = '-standard-pitching.shtml'