def manager_tendencies(year):
    """Download, process, and store manager tendency data for ``year``.

    Fetches each manager/team page concurrently, runs the processing step,
    then writes the aggregated tendencies (accumulated in the module-level
    ``stats`` dict) out in parallel.
    """
    driver_logger.log("\tStoring manager tendencies")
    print("storing manager tendencies")
    start_time = time.time()
    logger.log("Downloading " + str(year) + " manager tendencies || Timestamp: " +
               datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
    logger.log('\tMaking HTTP requests')
    db = DatabaseConnection(sandbox_mode)
    # Every (managerid, teamid) pairing active in the requested season.
    managers = db.read(
        'select manager_teams.managerid, manager_teams.teamid from manager_teams, manager_year where '
        'manager_year.year = ' + str(year) + ' and manager_year.mt_uniqueidentifier = manager_teams.'
        'mt_uniqueidentifier;')
    db.close()
    # Fetch all manager pages in parallel; load_url presumably caches pages
    # for process_manager_tendencies below -- TODO confirm.
    with ThreadPoolExecutor(os.cpu_count()) as executor:
        for manager in managers:
            executor.submit(load_url, manager[0], manager[1])
    logger.log('\t\tTime = ' + time_converter(time.time() - start_time))
    process_manager_tendencies(year)
    write_time = time.time()
    logger.log('\tWriting data to database')
    global stats
    # Persist only the manager/team entries that actually collected data.
    with ThreadPoolExecutor(os.cpu_count()) as executor2:
        for manager_team, tendencies in stats.items():
            if len(tendencies) > 0:
                executor2.submit(write_to_file, year, manager_team, tendencies)
    logger.log('\t\tTime = ' + time_converter(time.time() - write_time))
    total_time = time_converter(time.time() - start_time)
    driver_logger.log("\t\tTime = " + total_time)
    logger.log("Done storing manager tendencies: time = " + total_time + '\n\n')
def driver(day, month, year):
    """Run the full download pipeline for one calendar date."""
    date_label = str(month) + '/' + str(day) + '/' + str(year)
    driver_logger.log(date_label)
    driver_time = time.time()
    print('\n\n' + date_label)
    # Year-level steps that must run before the pitch-fx download.
    for step in (populate_teams_table, get_year_data, ballpark_and_manager_data,
                 league_standings, team_offensive_statistics,
                 team_defensive_statistics, batting_constructor,
                 pitching_constructor, fielding_constructor,
                 team_fielding_file_constructor,
                 team_pitching_rotation_constructor,
                 team_batting_order_constructor,
                 primary_and_secondary_positions,
                 determine_pitcher_roles_year):
        step(year)
    # Pitch-fx is the only step that needs the specific day.
    get_pitch_fx_data(year, month, day)
    # Remaining year-level steps, ending with consolidation.
    for step in (hitter_tendencies, pitcher_tendencies, manager_tendencies,
                 hitter_spray_chart_constructor,
                 pitcher_spray_chart_constructor, team_certainties,
                 consolidate_data):
        step(year)
    driver_logger.log('Time taken to download ' + date_label + ' data: ' +
                      time_converter(time.time() - driver_time) + '\n')
def hof_finder():
    """Scrape the Hall of Fame page and stamp induction years onto rows.

    Players update the ``players`` table and managers the ``managers``
    table; every other induction category is ignored.
    """
    print("adding HOF data")
    driver_logger.log("\tAdding HOF data")
    start_time = time.time()
    logger.log("Begin finding hall of famers || Timestamp: " +
               datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
    page = str(BeautifulSoup(urlopen('https://www.baseball-reference.com/awards/hof.shtml'), 'html.parser'))
    hof_table = page.split('<tbody>')[1].split('</tbody>')[0]
    # Maps an induction category to its (table, id column) pair.
    targets = {'Player': ('players', 'playerId'), 'Manager': ('managers', 'managerId')}
    db = DatabaseConnection(sandbox_mode)
    for row in hof_table.split('<tr>')[1:]:
        person = row.split('data-append-csv="')[1].split('"')[0]
        year = row.split('<a href="/awards/hof_')[1].split('.shtml')[0]
        induction_type = row.split('data-stat="category_hof">')[1].split('<')[0]
        target = targets.get(induction_type)
        if target is None:
            continue  # executives, umpires, etc. have no table here
        table_name, id_column = target
        db.write('update ' + table_name + ' set HOF = ' + str(year) +
                 ' where ' + id_column + ' = "' + person + '";')
    db.close()
    total_time = time_converter(time.time() - start_time)
    logger.log("Done finding hall of famers: time = " + total_time + '\n\n')
    driver_logger.log("\t\tTime = " + total_time)
def main(from_server, begin_year, end_year, frame=None):
    """Entry point for the yearly driver over [begin_year, end_year)."""
    print('\n')
    # Guard clause: reject impossible year ranges up front.
    if not end_year > begin_year >= 1876:
        print(
            'Begin year must be lower than End year, but cannot be lower than 1876.'
        )
        exit()
    driver_logger.log(
        'Begin Yearly Driver || Timestamp: ' + datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
    start_time = time.time()
    if not from_server:
        frame.withdraw()
    league_table_constructor()
    manager_table_constructor()
    years = list(range(begin_year, end_year))
    for year in years:
        driver(year)
    create_strike_zone()
    # Ranking/comparison/HOF passes only need the most recent season.
    rank_driver(years[-1])
    comparisons_driver(years[-1])
    hof_finder()
    clean_up_deadlocked_file()
    auto_migrate()
    driver_logger.log('Driver complete for year' + stringify_list(years) + ': time = ' +
                      time_converter(time.time() - start_time) + '\n\n\n')
def consolidate_data(year):
    """Merge per-player season data into one roster record per team-year."""
    driver_logger.log("\tConsolidating data")
    print("Consolidating data")
    start_time = time.time()
    logger.log("Consolidating team data || Timestamp: " +
               datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
    db = DatabaseConnection(sandbox_mode)
    team_year_rows = db.read('select ty_uniqueidentifier from team_years where year = ' + str(year) + ';')
    for team_year_row in team_year_rows:
        uid = team_year_row[0]
        team_timer = time.time()
        logger.log('\t' + db.read('select teamId from team_years where ty_uniqueidentifier = ' +
                                  str(uid) + ';')[0][0])
        # Assemble every consolidated view of this team-year before writing.
        roster = {
            'hitter_spots': consolidate_hitter_spots(uid),
            'player_positions': consolidate_player_positions(uid),
            'batter_stats': consolidate_player_stats(uid, 'batting', year),
            'pitcher_stats': consolidate_player_stats(uid, 'pitching', year),
            'fielder_stats': consolidate_player_stats(uid, 'fielding', year)
        }
        write_roster_info(uid, roster)
        logger.log('\t\tTime = ' + time_converter(time.time() - team_timer))
    db.close()
    total_time = time_converter(time.time() - start_time)
    logger.log("Done consolidating team data: Time = " + total_time + '\n\n')
    driver_logger.log("\t\tTime = " + total_time)
def populate_teams_table(year):
    """Insert every team that played in ``year`` into the teams table.

    Reads the season's team list from background/yearTeams.txt, drops the
    teamId index for faster bulk inserts, inserts (teamId, teamName) rows,
    then re-creates the index.
    """
    driver_logger.log('\tPopulating teams table')
    print("Populating teams table")
    start_time = time.time()
    logger.log('Begin populating teams table for ' + str(year) + ' || Timestamp: ' +
               datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
    with open(os.path.join("..", "background", "yearTeams.txt"), 'rt') as file:
        db = DatabaseConnection(sandbox_mode)
        db.write('ALTER TABLE teams DROP INDEX teamId;')
        for line in file:
            if str(year) in line:
                # Line format: year,<teamId;teamKey>,...,<tail> -- keep the team fields.
                temp_line = line.split(',')[1:-1]
                for team in temp_line:
                    team_id = team.split(';')[0]
                    # BUG FIX: the original .replace("'", "\'") was a no-op
                    # ("\'" == "'").  The name lands inside a double-quoted
                    # SQL literal, so escape backslashes and double quotes.
                    team_name = translate_team_name(team_id).replace("\\", "\\\\").replace('"', '\\"')
                    db.write('insert into teams (teamId, teamName) values ("' +
                             team_id + '", "' + team_name + '");')
                break
        db.write('ALTER TABLE teams ADD INDEX(teamId);')
        db.close()
    total_time = time.time() - start_time
    logger.log('Populating teams table completed: ' + time_converter(total_time))
    driver_logger.log('\t\tTime = ' + time_converter(total_time))
def team_certainties(year):
    """Aggregate per-player stat certainty into a team-level certainty.

    For each team-year, sums plate appearances attributable to uncertain
    player data (pa * (1 - certainty) per player) and stores the resulting
    team certainty fraction on the team_years row.
    """
    print('aggregating team statistic certainties')
    driver_logger.log("\tAggregating team statistic certainties")
    start_time = time.time()
    logger.log("Calculating team certainties || Timestamp: " +
               datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
    db = DatabaseConnection(sandbox_mode)
    # NOTE(review): both passes write the same team_years.certainty column,
    # so the batting pass is overwritten by the pitching pass -- confirm
    # that is intended.
    stat_types = ["batting", "pitching"]
    for stat_type in stat_types:
        ty_uids = db.read('select ty_uniqueidentifier, teamid from team_years where year = ' + str(year))
        for ty_uid in ty_uids:
            pau = 0  # plate appearances attributed to uncertain data
            player_list = list(db.read('select playerid from player_positions where ty_uniqueidentifier = ' +
                                       str(ty_uid[0]) + ';'))
            for player in player_list:
                pt_uid = db.read('select pt_uniqueidentifier from player_teams where playerid = "' + player[0] + '" and'
                                 ' teamid = "' + ty_uid[1] + '";')[0][0]
                try:
                    ent = db.read('select pa, certainty from player_' + stat_type + ' where year = ' + str(year) +
                                  ' and pt_uniqueidentifier = ' + str(pt_uid) + ';')
                    # Uncertain share of this player's plate appearances.
                    pau += int(ent[0][0]) - (int(ent[0][0]) * float(ent[0][1]))
                except IndexError:
                    continue  # no row of this stat type for the player
                except TypeError:
                    continue  # pa or certainty stored as NULL
            pa = int(db.read('select pa from team_years where ty_uniqueidentifier = ' + str(ty_uid[0]) + ';')[0][0])
            # NOTE(review): raises ZeroDivisionError when the team's pa is 0
            # -- confirm pa is always positive by this point.
            db.write('update team_years set certainty = ' + str((pa - pau) / pa) +
                     ' where ty_uniqueidentifier = ' + str(ty_uid[0]) + ';')
    db.close()
    total_time = time_converter(time.time() - start_time)
    logger.log("Done calculating team certainties: time = " + total_time + '\n\n')
    driver_logger.log("\t\tTime = " + total_time)
def clean_up_deadlocked_file():
    """Replay SQL statements parked in deadlocked.txt, then truncate it.

    Each line of the file is assumed to be one complete SQL statement that
    previously failed due to a deadlock.
    """
    driver_logger.log("\tCleaning up deadlocked records")
    with open(os.path.join("utilities", "deadlocked.txt"), 'r') as f:
        db = DatabaseConnection(sandbox_mode)
        for line in f:
            db.write(line)
        db.close()
    # BUG FIX: the original bound the useless result of open(...).close()
    # (always None) to an unused variable.  Open-for-write and close simply
    # truncates the file now that its statements have been replayed.
    open(os.path.join("utilities", "deadlocked.txt"), "w").close()
def auto_migrate():
    """Push every sandbox record (baseball and pitch-fx) to production."""
    import_driver_logger.log(
        "\tTransferring all sandbox data to production environment")
    # "All" under the True key selects the complete dataset for each store.
    migration_request = {
        'baseballData': {True: ["All"]},
        'pitch_fx': {True: ["All"]}
    }
    submit(migration_request, True, True)
def get_pitch_fx_data(year, month=None, day=None):
    """Download pitch-fx data for a whole season or for a single day.

    With only ``year`` given, walks every candidate date from opening day
    through November and downloads each day's data; with ``month`` and
    ``day``, fetches just that one date.  Pitch-fx data only exists from
    2008 onward.
    """
    if year < 2008:
        driver_logger.log("\tNo pitch fx data to download before 2008")
        return
    start_time = time.time()
    if month is None and day is None:
        # Season mode: fetch every day from opening day onward.
        driver_logger.log("\tFetching " + str(year) + " pitch fx data")
        print("Fetching " + str(year) + " pitch fx data")
        logger.log("Downloading pitch fx data for " + str(year) + " || Timestamp: " +
                   datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
        db = DatabaseConnection(sandbox_mode)
        # opening_day appears to be a 'MM-DD'-style string -- TODO confirm format.
        opening_day = db.read('select opening_day from years where year = ' + str(year) + ';')[0][0]
        db.close()
        for month in range(3, 12, 1):
            # if month > 11:
            if month >= int(opening_day.split('-')[0]):
                # NOTE(review): days 1-31 are attempted for every month;
                # get_day_data presumably tolerates nonexistent dates -- confirm.
                for day in range(1, 32, 1):
                    # if day > 14:
                    # Skip days before opening day within the opening month.
                    if month == int(
                            opening_day.split('-')[0]) and int(day) < int(
                                opening_day.split('-')[1]):
                        continue
                    # Zero-pad day and month to two digits.
                    if len(str(day)) == 1:
                        this_day = '0' + str(day)
                    else:
                        this_day = str(day)
                    if len(str(month)) == 1:
                        this_month = '0' + str(month)
                    else:
                        this_month = str(month)
                    get_day_data(this_day, this_month, str(year))
        logger.log("Done fetching " + str(year) + " pitch fx data: time = " +
                   time_converter(time.time() - start_time) + '\n\n\n\n')
        driver_logger.log("\t\tTime = " + time_converter(time.time() - start_time))
        aggregate_pitch_fx(year)
    else:
        # Single-day mode.
        driver_logger.log("\tFetching " + str(month) + "-" + str(day) + "-" + str(year) + " pitch fx data")
        print("Fetching " + str(month) + "-" + str(day) + "-" + str(year) + " pitch fx data")
        logger.log("Downloading pitch fx data for " + str(month) + "-" + str(day) + "-" + str(year) +
                   " || Timestamp: " + datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
        get_day_data(str(day), str(month), str(year))
        driver_logger.log("\t\tTime = " + time_converter(time.time() - start_time))
        aggregate_pitch_fx(year, month, day)
def team_defensive_statistics(year):
    """Download and store team-level defensive (pitching-against) stats.

    Scrapes the Baseball-Reference standard-pitching and batting-against
    pages for ``year`` and forwards the team rows to extract_data together
    with a page-column -> database-column mapping.
    """
    driver_logger.log("\tGathering team defensive statistics")
    print('Gathering team defensive statistics')
    start_time = time.time()
    logger.log('Downloading team defensive data for ' + str(year) + ' || Timestamp: ' +
               datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
    page1 = str(
        BeautifulSoup(
            urlopen("https://www.baseball-reference.com/leagues/MLB/" + str(year) +
                    "-standard-pitching.shtml"), "html.parser"))
    # The batting-against table does not exist for every season, so this
    # request is best-effort: any failure simply yields no rows.
    try:
        page2 = str(
            BeautifulSoup(
                urlopen("https://www.baseball-reference.com/leagues/MLB/" + str(year) +
                        "-batting-pitching.shtml"), "html.parser"))
        batting_against_rows = page2.split('Player Batting Against')[0].split('<h2>Team Batting Against')[1].\
            split('<tbody>')[1].split('</tbody>')[0].split('<tr>')
    except Exception:
        batting_against_rows = []
    standard_pitching_rows = page1.split('Player Standard Pitching')[0].split('<h2>Team Standard Pitching')[1].\
        split('<tbody>')[1].split('</tbody>')[0].split('<tr>')
    # Page stat name -> database column name.
    stats1 = {
        'R': 'RA', 'ER': 'ER', 'H': "HA", 'HR': 'HRA', 'BB': 'BBA',
        'HBP': 'HBPA', 'IBB': 'IBBA', 'SO': 'K', 'ERA': 'ERA', 'whip': 'WHIP'
    }
    stats2 = {
        'PA': 'PAA', 'AB': 'ABA', '2B': '2BA', '3B': '3BA',
        'batting_avg': 'BAA', 'onbase_perc': 'OBA', 'slugging_perc': 'SLGA',
        'onbase_plus_slugging': 'OPSA', 'batting_avg_bip': 'BABIPA'
    }
    extract_data(standard_pitching_rows, stats1, year)
    extract_data(batting_against_rows, stats2, year)
    total_time = time_converter(time.time() - start_time)
    # BUG FIX: corrected log typo "donwloading" -> "downloading".
    logger.log("Done downloading team defensive data for " + str(year) +
               ': time = ' + total_time + '\n\n')
    driver_logger.log('\t\tTime = ' + total_time)
def determine_pitcher_roles_year(year):
    """Classify each pitcher in ``year`` as SP/RP by start percentage.

    Replaces the generic 'P' entry in player_positions with 'SP', 'RP', or
    both (ordered by predominance) based on games started / games pitched.
    """
    driver_logger.log("\tDetermining Pitcher Roles")
    print("Determining Pitcher Roles")
    start_time = time.time()
    logger.log("Determining Pitcher Roles || Timestamp: " +
               datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
    db = DatabaseConnection(sandbox_mode)
    with ThreadPoolExecutor(os.cpu_count()) as executor:
        for pt_uid in db.read('select pt_uniqueidentifier from player_pitching where year = ' + str(year) + ';'):
            player_id_team_id = db.read('select playerid, teamid from player_teams where pt_uniqueidentifier=' +
                                        str(pt_uid[0]) + ';')[0]
            player_id = player_id_team_id[0]
            team_id = player_id_team_id[1]
            # 'TOT' rows are cross-team season aggregates; skip them.
            if team_id == 'TOT':
                continue
            ty_uid = str(db.read('select ty_uniqueidentifier from team_years where teamId = "' + team_id +
                                 '" and year = ' + str(year) + ';')[0][0])
            try:
                positions = db.read('select positions from player_positions where playerId = "' + player_id + '" and '
                                    'ty_uniqueidentifier = ' + ty_uid + ';')[0][0]
            except IndexError:
                continue  # no position record for this player/team-year
            update_positions = []
            if 'P' in positions:
                appearances_starts = db.read('select G, GS from player_pitching where pt_uniqueidentifier = ' +
                                             str(pt_uid[0]) + ' and year = ' + str(year) + ';')[0]
                appearances = appearances_starts[0]
                starts = appearances_starts[1]
                # BUG FIX: guard against ZeroDivisionError for a pitcher row
                # with zero recorded games; treat such a player as a reliever.
                start_percent = starts / appearances if appearances else 0
                if start_percent > 0.75:
                    role = ['SP']
                elif start_percent > 0.50:
                    role = ['SP', 'RP']
                elif start_percent > 0.25:
                    role = ['RP', 'SP']
                else:
                    role = ['RP']
                for position in positions.split(','):
                    if position == 'P':
                        update_positions += role
                    else:
                        update_positions.append(position)
                # BUG FIX: the original executor.submit(db.write(...)) ran the
                # write eagerly on this thread and submitted its return value
                # (None) as the "task".  Pass the callable plus its argument so
                # the executor actually performs the write.  NOTE(review):
                # assumes DatabaseConnection.write is thread-safe -- confirm.
                executor.submit(
                    db.write,
                    'update player_positions set positions = "' + ','.join(update_positions) +
                    '" where ty_uniqueidentifier = ' + ty_uid + ' and playerId = "' + player_id + '";')
    # The executor context exits (all writes finished) before the close.
    db.close()
    total_time = time_converter(time.time() - start_time)
    logger.log("Done: Time = " + total_time + '\n\n')
    driver_logger.log("\t\tTime = " + total_time)
def send_results():
    """Email the run's CSV and driver results to the configured recipient."""
    print('Emailing results')
    driver_logger.log('\tEmailing results\n\n\n\n')
    sender = config.MAIL_USERNAME
    recipient = config.MAIL_RECIPIENT
    pwd = config.MAIL_PASSWORD
    header = "To: " + recipient + '\nFrom: ' + sender + '\nSubject: Daily download results'
    # Standard STARTTLS handshake: hello, upgrade to TLS, hello again, login.
    smtp = smtplib.SMTP(config.MAIL_SERVER, config.MAIL_PORT)
    smtp.ehlo()
    smtp.starttls()
    smtp.ehlo()
    smtp.login(sender, pwd)
    message = header + '\n\n' + get_csv_results() + '\n' + get_driver_results()
    smtp.sendmail(sender, recipient, message)
    smtp.quit()
def team_offensive_statistics(year):
    """Download and store team-level offensive (batting) statistics.

    Scrapes the Baseball-Reference standard-batting page for ``year`` and
    forwards the team rows to extract_data with a page-column ->
    database-column mapping.
    """
    driver_logger.log("\tGathering team offensive statistics")
    print('Gathering team offensive statistics')
    start_time = time.time()
    logger.log('Downloading team offensive data for ' + str(year) + ' || Timestamp: ' +
               datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
    page = str(
        BeautifulSoup(
            urlopen("https://www.baseball-reference.com/leagues/MLB/" + str(year) +
                    "-standard-batting.shtml"), "html.parser"))
    # Page stat name -> database column name.
    stats = {
        'PA': 'PA', 'AB': 'AB', 'R': 'R', 'H': 'H', '2B': '2B', '3B': '3B',
        'HR': 'HR', 'RBI': 'RBI', 'SB': 'SB', 'CS': 'CS', 'BB': 'BB',
        'SO': 'SO', 'GIDP': 'GDP', 'HBP': 'HBP', 'SH': 'SH', 'SF': 'SF',
        'IBB': 'IBB', 'G': 'G', 'batting_avg': 'BA', 'onbase_perc': 'OBP',
        'slugging_perc': 'SLG', 'onbase_plus_slugging': 'OPS'
    }
    standard_batting_rows = page.split('Player Standard Batting')[0].split('<h2>Team Standard Batting')[1].\
        split('<tbody>')[1].split('</tbody>')[0].split('<tr>')
    extract_data(standard_batting_rows, stats, year)
    total_time = time_converter(time.time() - start_time)
    # BUG FIX: corrected log typo "donwloading" -> "downloading".
    logger.log("Done downloading team offensive data for " + str(year) +
               ': time = ' + total_time + '\n\n')
    driver_logger.log('\t\tTime = ' + total_time)
def manager_table_constructor():
    """Build the all-time managers table from baseball-reference.com.

    Scrapes the managers index page, drops the managerId index for bulk
    loading, queues one write_to_file call per manager row, then re-creates
    the index.
    """
    driver_logger.log('\tGathering manager data (all-time)')
    print("Gathering manager data (all-time)")
    start_time = time.time()
    logger.log('Begin populating teams table || Timestamp: ' +
               datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
    table = str(
        bs(
            urllib.request.urlopen(
                'https://www.baseball-reference.com/managers/'),
            'html.parser'))
    rows = table.split('<tr')
    db = DatabaseConnection(sandbox_mode=True)
    db.write('ALTER TABLE managers DROP INDEX managerId;')
    db.close()
    with ThreadPoolExecutor(os.cpu_count()) as executor:
        for row in rows:
            if '<td class="left" csk="' in row:
                this_row = row.split('</tr>')[0]
                try:
                    # NOTE(review): .replace("'", "\'") is a no-op
                    # ("\'" == "'"); kept to preserve existing output exactly.
                    manager_id = this_row.split(
                        '<a href="/managers/')[1].split('.shtml')[0].replace(
                            "'", "\'")
                    last_first = this_row.split('</tr>')[0].split(
                        '<td class="left" csk="')[1].split('"')[0]
                    last = last_first.split(',')[0].replace("'", "\'")
                    first = last_first.split(',')[1].replace("'", "\'")
                    wins = this_row.split('data-stat="W">')[1].split('<')[0]
                    loses = this_row.split('data-stat="L">')[1].split('<')[0]
                    executor.submit(
                        write_to_file, '"' + manager_id + '","' + last + '","' +
                        first + '",' + wins + ',' + loses)
                except (IndexError, AttributeError):
                    # BUG FIX: split()[1] on a malformed row raises IndexError,
                    # which the original 'except AttributeError' never caught.
                    continue
    db = DatabaseConnection(sandbox_mode=True)
    db.write('ALTER TABLE managers ADD INDEX(managerId);')
    db.close()
    total_time = time.time() - start_time
    logger.log('Constructing manager table completed: time = ' + time_converter(total_time))
    driver_logger.log('\t\tTime = ' + time_converter(total_time))
def primary_and_secondary_positions(year):
    """Determine each player's primary/secondary positions for ``year``.

    Uses up to 25 prior seasons of position data to decide which of a
    player's listed positions are primary vs secondary, then writes the
    result per player.
    """
    print("adding primary and secondary positions")
    driver_logger.log("\tAdding primary and secondary positions")
    start_time = time.time()
    logger.log("Downloading " + str(year) + " primary and secondary data || Timestamp: " +
               datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
    db = DatabaseConnection(sandbox_mode)
    logger.log("\tAssembling list of players")
    assembly_time = time.time()
    teams_from_year = db.read(
        "select TY_uniqueidentifier from team_years where year=" + str(year) + ';')
    # 25-season lookback window used for position history.
    teams_from_year_range = db.read(
        "select TY_uniqueidentifier from team_years where year between " +
        str(year - 25) + ' and ' + str(year) + ';')
    player_positions = []
    player_positions_range = []
    for team in teams_from_year:
        player_positions += db.read(
            'select playerId, positions from player_positions where ' +
            'TY_uniqueidentifier = ' + str(team[0]) + ';')
    for team in teams_from_year_range:
        player_positions_range += db.read(
            'select playerId, positions from player_positions where ' +
            'TY_uniqueidentifier = ' + str(team[0]) + ';')
    logger.log("\t\tTime = " + time_converter(time.time() - assembly_time))
    logger.log("\tDetermining positions")
    determination_time = time.time()
    for player in player_positions:
        player_position_string = get_player_positions(player, player_positions_range)
        player_positions_dict = determine_primary_position(player_position_string)
        # BUG FIX: dropped the original .replace("'", "\'") on the player id,
        # which was a no-op ("\'" == "'") and only suggested an escaping step
        # that never actually happened.
        write_to_file(player[0], player_positions_dict)
    db.close()
    logger.log("\t\tTime = " + time_converter(time.time() - determination_time))
    total_time = time_converter(time.time() - start_time)
    logger.log("Done downloading primary and secondary positions: time = " + total_time + '\n\n')
    driver_logger.log("\t\tTime = " + total_time)
def award_winner_driver(year):
    """Collect all award-winner data for a season and persist it.

    Gathers MVP/Cy Young, Rookie of the Year, Manager of the Year, Gold
    Glove, Silver Slugger, and Triple Crown winners, then (from 1933 on)
    the All-Star rosters -- except seasons without an All-Star game.
    """
    print("gathering award winner data")
    driver_logger.log("\tGathering award winner data")
    start_time = time.time()
    logger.log("Beginning award winner driver || Timestamp: " +
               datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
    results = [
        mvp_cy_young(year, logger),
        roy_gatherer(year, logger),
        moy_gatherer(year, logger)
    ]
    # These gatherers each return a pair (one entry per league).
    results.extend(gold_glove_winners(year, logger))
    results.extend(silver_slugger_winners(year, logger))
    results.extend(triple_crown_winners(year, logger))
    write_to_file(year, results)
    if year >= 1933:
        # 1945 (wartime) and 1959-1962 (two games) are handled specially.
        had_single_game = year not in (1945, 1959, 1960, 1961, 1962)
        all_star_finder(year, had_single_game, logger)
    total_time = time_converter(time.time() - start_time)
    logger.log("Award winner driver complete: time = " + total_time + '\n\n')
    driver_logger.log("\t\tTime = " + total_time)
def main(from_server, day, month, year, frame=None):
    """Entry point for the daily driver: validate the date, run, report."""
    print('\n')
    # Guard clause: bail out (with an email) on an impossible date.
    if not (0 < day <= 31 and 0 < month <= 12 and year >= 1876):
        print('Must enter a valid date.')
        send_results()
        exit()
    try:
        driver_logger.log('Begin Daily Driver || Timestamp: ' +
                          datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
        start_time = time.time()
        if not from_server:
            frame.withdraw()
        league_table_constructor()
        manager_table_constructor()
        driver(day, month, year)
        create_strike_zone()
        clean_up_deadlocked_file()
        auto_migrate()
        driver_logger.log('Driver complete for year ' + str(year) + ': time = ' +
                          time_converter(time.time() - start_time) + '\n')
    except Exception as e:
        # Record and email the failure, then propagate it to the caller.
        driver_logger.log("ERROR:\t" + str(e))
        send_results()
        raise e
def pitcher_spray_chart_constructor(year):
    """Build spray-chart data for every pitcher in ``year`` (1988 onward).

    Downloads chart data concurrently for pitchers whose infield numbers
    are still NULL; failed requests collected in the module-level
    ``bad_gateway_data`` list (presumably HTTP 502s -- confirm) are
    retried afterwards.
    """
    print("creating pitcher spray charts")
    start_time = time.time()
    global bad_gateway_data
    bad_gateway_data = []  # reset the shared retry list for this run
    logger.log("Downloading " + str(year) + " pitcher spray charts || Timestamp: " +
               datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
    if year >= 1988:
        driver_logger.log("\tCreating pitcher spray charts")
        db = DatabaseConnection(sandbox_mode)
        # Only pitchers not yet populated (pa_infield still NULL).
        pt_uid_players = set(
            db.read(
                'select PT_uniqueidentifier from player_pitching where year = ' +
                str(year) + ' and pa_infield is NULL;'))
        db.close()
        with ThreadPoolExecutor(os.cpu_count()) as executor:
            for ent in pt_uid_players:
                executor.submit(reduce_functionality, year, ent)
        driver_logger.log("\t\tTime = " + time_converter(time.time() - start_time))
    else:
        driver_logger.log("\tNo pitcher spray chart data before 1988")
        logger.log("\tNo spray pitcher chart data before 1988")
        return
    # Retry any requests that failed with a bad gateway during the pool run.
    if len(bad_gateway_data) > 0:
        revisit_bad_gateways(year, bad_gateway_data)
    logger.log("Done downloading pitcher spray charts: time = " +
               time_converter(time.time() - start_time) + '\n\n')
def league_table_constructor():
    """(Re)populate the leagues lookup table with all historical leagues."""
    driver_logger.log('\tPopulating leagues table (all-time)')
    print('Populating leagues table (all-time)')
    logger.log('Begin populating leagues table || Timestamp: ' +
               datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
    # Fixed historical set: abbreviation paired with full league name.
    league_names = (
        ('NL', 'National League'),
        ('AL', 'American League'),
        ('AA', 'American Association'),
        ('FL', 'Federal League'),
        ('PL', 'Players League'),
        ('UA', 'Union Association'),
        ('NA', 'National Association'),
    )
    conn = DatabaseConnection(sandbox_mode)
    # Drop the index during the bulk insert, then restore it.
    conn.write('ALTER TABLE leagues DROP INDEX leagueId;')
    for abbreviation, full_name in league_names:
        conn.write('insert into leagues (leagueId, leagueName) values ("' +
                   abbreviation + '", "' + full_name + '");')
    conn.write('ALTER TABLE leagues ADD INDEX(leagueId);')
    conn.close()
    logger.log('Populating leagues table completed\n\n')
    driver_logger.log('\t\tPopulating leagues table completed')
def team_fielding_file_constructor(year):
    """Download and organize team fielding-position data for ``year``.

    Reads the season's team list from background/yearTeams.txt, downloads
    each team page concurrently (cached in the module-level ``pages``),
    then writes the organized position data.
    """
    print('getting team fielding positions')
    driver_logger.log("\tGetting team fielding positions")
    start_time = time.time()
    global pages
    pages = {}  # shared page cache filled by load_url
    logger.log("Downloading " + str(year) + " team fielding positions || Timestamp: " +
               datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
    logger.log("\tDownloading team pages")
    # The data file lives at a different relative depth depending on where
    # the driver was launched from; try both locations.
    try:
        year_file = open(os.path.join("..", "background", "yearTeams.txt"), 'r')
    except FileNotFoundError:
        year_file = open(
            os.path.join("..", "..", "..", "background", "yearTeams.txt"), 'r')
    # BUG FIX: the original closed year_file only on the matching-year branch,
    # leaking the handle whenever no line matched; close it in a finally.
    try:
        with ThreadPoolExecutor(os.cpu_count()) as executor:
            for line in year_file:
                if str(year) in line:
                    temp_line = line.split(',')[1:-1]
                    for team in temp_line:
                        split_team = team.split(';')
                        # 'TOT' is a cross-team aggregate, not a real team.
                        if "TOT" not in split_team:
                            executor.submit(load_url, year, split_team[0], split_team[1])
                    break
    finally:
        year_file.close()
    logger.log("\t\tTime = " + time_converter(time.time() - start_time))
    logger.log("\tOrganizing team position data")
    write_time = time.time()
    write_to_file(year)
    logger.log("\t\tTime = " + time_converter(time.time() - write_time))
    total_time = time_converter(time.time() - start_time)
    logger.log("Done downloading team fielding data: time = " + total_time + '\n\n')
    driver_logger.log("\t\tTime = " + total_time)
def ballpark_and_manager_data(year):
    """Download ballpark and manager data for every team in ``year``.

    Reads the season's team list from background/yearTeams.txt, downloads
    each team page concurrently (cached in the module-level ``pages``),
    then computes/writes ballpark numbers and manager data per team.
    """
    driver_logger.log('\tGathering ballpark and manager data')
    print("Gathering ballpark and manager data")
    start_time = time.time()
    global pages
    pages = {}  # shared page cache filled by load_url
    logger.log('Beginning ballpark and manager data download for ' + str(year) + ' || Timestamp: ' +
               datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
    teams = {}
    with open(os.path.join("..", "background", "yearTeams.txt"), 'rt') as file:
        for line in file:
            if str(year) in line:
                temp_line = line.split(',')[1:-1]
                for team in temp_line:
                    temp_team = team.split(';')
                    # 'TOT' is a cross-team aggregate, not a real franchise.
                    if 'TOT' not in temp_team:
                        teams[temp_team[1]] = temp_team[0]
                break
    logger.log('Begin downloading team pages')
    download_time = time.time()
    with ThreadPoolExecutor(os.cpu_count()) as executor1:
        for team_key, team_id in teams.items():
            executor1.submit(load_url, year, team_key)
    logger.log('\tDone downloading team pages: time = ' + time_converter(time.time() - download_time))
    logger.log("Calculating and writing ballpark numbers and downloading images")
    calc_and_download_time = time.time()
    team_count = len(teams)
    with ThreadPoolExecutor(os.cpu_count()) as executor2:
        for team_key, team_id in teams.items():
            executor2.submit(gather_team_home_numbers, team_id, team_key, year, team_count)
            # break
    logger.log("\tDone calculating and writing ballpark numbers and downloading manager data: time = " +
               time_converter(time.time() - calc_and_download_time))
    total_time = time_converter(time.time() - start_time)
    logger.log('Ballpark and manager data download completed: time = ' + total_time + '\n\n')
    driver_logger.log('\t\tTime = ' + total_time)
def team_batting_order_constructor(year):
    """Download and organize team batting-order data for ``year`` (1908+).

    Reads the season's team list from background/yearTeams.txt, downloads
    each team page concurrently (cached in the module-level ``pages``),
    then organizes the batting orders via get_hitters.
    """
    if year < 1908:
        logger.log("\tNo team batting order data to download before 1908.")
        driver_logger.log(
            "\tNo team batting order data to download before 1908.")
        return
    print("getting team batting order data")
    driver_logger.log("\tGetting team batting order data")
    start_time = time.time()
    global pages
    pages = {}  # shared page cache filled by load_url
    logger.log("Downloading " + str(year) + " team batting order data || Timestamp: " +
               datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
    logger.log("\tDownloading team pages")
    # The data file lives at a different relative depth depending on where
    # the driver was launched from; try both locations.
    try:
        year_file = open(os.path.join("..", "background", "yearTeams.txt"), 'r')
    except FileNotFoundError:
        year_file = open(
            os.path.join("..", "..", "..", "background", "yearTeams.txt"), 'r')
    # BUG FIX: the original never closed year_file; close it in a finally so
    # the handle is released whether or not a matching line is found.
    try:
        with ThreadPoolExecutor(os.cpu_count()) as executor:
            for line in year_file:
                if str(year) in line:
                    temp_line = line.split(',')[1:-1]
                    for team in temp_line:
                        # 'TOT' is a cross-team aggregate, not a real team.
                        if "TOT" not in team:
                            executor.submit(load_url, year, team.split(';')[0], team.split(';')[1])
                    break
    finally:
        year_file.close()
    logger.log("\t\t\tTime = " + time_converter(time.time() - start_time))
    logger.log("\tOrganizing batting orders")
    write_time = time.time()
    get_hitters(year)
    logger.log("\t\t\tTime = " + time_converter(time.time() - write_time))
    total_time = time_converter(time.time() - start_time)
    logger.log("Done downloading team batting order data: time = " + total_time + '\n\n')
    driver_logger.log("\t\tTime = " + total_time)
def pitcher_tendencies(year):
    """Download and store per-pitcher pitch-tendency data for ``year``.

    Tendency data only exists from 1988 onward; for earlier seasons the
    tendency fields are filled via fill_fields instead.
    """
    print("storing pitcher tendencies")
    start_time = time.time()
    logger.log("Downloading " + str(year) + " pitcher tendencies || Timestamp: " +
               datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
    if year >= 1988:
        driver_logger.log("\tStoring pitcher tendencies")
        logger.log("\tDownloading data")
        prev_player_id = ""
        page = str(
            BeautifulSoup(
                urlopen('https://www.baseball-reference.com/leagues/MLB/' +
                        str(year) + '-pitches-pitching.shtml'), 'html.parser'))
        table = page.split('<h2>Player Pitching Pitches</h2>')[1].split(
            '<tbody>')[1].split('</tbody>')[0]
        rows = table.split('<tr')
        logger.log("\t\tTime = " + time_converter(time.time() - start_time))
        logger.log("\tFormatting data")
        format_time = time.time()
        stat_dictionary = {}
        for row in rows:
            # intermediate() parses one table row; the previous player id is
            # passed along, presumably so multi-row players are handled --
            # see its definition to confirm.
            player_id, temp_stats = intermediate(row, prev_player_id)
            if player_id is not None:
                stat_dictionary[player_id] = temp_stats
                prev_player_id = player_id
        for player_id, stats in stat_dictionary.items():
            write_to_file(year, player_id, stats)
        # Pitchers who faced no batters still need a tendencies row.
        fill_pitchers_with_0_pa(year)
        total_time = time_converter(time.time() - format_time)
        logger.log("\t\tTime = " + total_time)
        driver_logger.log("\t\tTime = " + total_time)
    else:
        driver_logger.log("\tNo pitcher tendency data before 1988")
        logger.log("\tNo pitcher tendency data before 1988")
        fill_fields(year)
    logger.log("Done storing pitcher tendencies: time = " +
               time_converter(time.time() - start_time) + '\n\n')
def league_standings(year):
    """Scrape and store league standings and playoff results for ``year``.

    Parses the Baseball-Reference standings page into division tables, the
    expanded overall standings, and the postseason table, then writes one
    standings row per team plus the playoff picture (or, pre-World-Series
    era, the league champions).
    """
    driver_logger.log("\tAdding to team_years (standings)")
    print("Adding to team_years (standings)")
    start_time = time.time()
    logger.log('Begin organizing league standings for ' + str(year) + ' || Timestamp: ' +
               datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
    page = str(
        BeautifulSoup(
            urlopen("https://www.baseball-reference.com/leagues/MLB/" +
                    str(year) + "-standings.shtml"), "html.parser"))
    try:
        playoffs = page.split('<h2>Postseason</h2>')[1].split(
            '</tbody></table>')[0]
    except IndexError:
        logger.log("\tNo playoffs in " + str(year))
        playoffs = ""
    # Division tables: the page lists each division header once per league
    # (index [1] = AL, [2] = NL); any missing table is simply skipped.
    divisions = {}
    if year != 1981:
        try:
            divisions['al_east'] = page.split('<h2>East Division</h2>')[1].split('<tbody>')[1].\
                split('</tbody></table>')[0]
        except IndexError:
            pass
        try:
            divisions['nl_east'] = page.split('<h2>East Division</h2>')[2].split('<tbody>')[1].\
                split('</tbody></table>')[0]
        except IndexError:
            pass
        try:
            divisions['al_central'] = page.split('<h2>Central Division</h2>')[1].split('<tbody>')[1].\
                split('</tbody></table>')[0]
        except IndexError:
            pass
        try:
            divisions['nl_central'] = page.split('<h2>Central Division</h2>')[2].split('<tbody>')[1].\
                split('</tbody></table>')[0]
        except IndexError:
            pass
        try:
            divisions['al_west'] = page.split('<h2>West Division</h2>')[1].split('<tbody>')[1].\
                split('</tbody></table>')[0]
        except IndexError:
            pass
        try:
            divisions['nl_west'] = page.split('<h2>West Division</h2>')[2].split('<tbody>')[1].\
                split('</tbody></table>')[0]
        except IndexError:
            pass
    else:
        # 1981 (strike season) uses "-- Overall" division headers.
        try:
            divisions['al_east'] = page.split('<h2>East Division -- Overall</h2>')[1].split('<tbody>')[1]\
                .split('</tbody>' + '</table>')[0]
        except IndexError:
            pass
        try:
            divisions['al_west'] = page.split('<h2>West Division -- Overall</h2>')[1].split('<tbody>')[1].\
                split('</tbody>' + '</table>')[0]
        except IndexError:
            pass
        try:
            divisions['nl_east'] = page.split('<h2>East Division -- Overall</h2>')[2].split('<tbody>')[1]\
                .split('</tbody>' + '</table>')[0]
        except IndexError:
            pass
        try:
            divisions['nl_west'] = page.split('<h2>West Division -- Overall</h2>')[2].split('<tbody>')[1].\
                split('</tbody>' + '</table>')[0]
        except IndexError:
            pass
    main_table = page.split('<div class="overthrow table_container" id="div_expanded_standings_overall">')[1].\
        split('<tbody>')[1].split('<tr class="league_average_table')[0].split('<tr')
    champs = {}
    for row in main_table:
        # Before the World Series era (and in 1904) record league champions
        # directly from the bolded league-leader rows.
        if year == 1904 or year < 1903:
            if 'data-stat="lg_ID" ><strong>' in row:
                champs[row.split('data-stat="lg_ID" ><strong>')[1].split('<')[0]] = \
                    translate_team_id(row.split('href="/teams/')[1].split('/')[0], year)
        try:
            team_key = row.split('/teams/')[1].split('/')[0]
            team_id = translate_team_id(team_key, year)
            # Divisional play began in 1969; earlier rows carry league only.
            if year > 1968:
                this_string = "'" + team_id + "'," + str(
                    year) + "," + get_league_division(divisions, team_key, year)
            else:
                this_string = "'" + team_id + "'," + str(
                    year) + "," + get_league_only(row)
            this_string += ',' + wins_loses(row)
            this_string += ',' + is_in_playoffs(playoffs, team_key, year)
        except IndexError:
            continue  # header/spacer rows have no /teams/ link
        write_to_db(this_string, team_id, year)
    if year == 1903 or year > 1904:  # the first world series (1903); didn't play a WS in 1904
        # Number of instances of each series per postseason.
        series = {
            'World Series': 1,
            'ALCS': 1,
            'NLCS': 1,
            'AL Division Series': 2,
            'NL Division Series': 2
        }
        # NOTE(review): 'NL Division Series' maps to 'alds', so its
        # playoff_picture keys collide with (and overwrite) the AL Division
        # Series entries -- looks like it should be 'nlds'; confirm against
        # what write_playoff_data expects before changing.
        abbreviation = {
            'World Series': 'ws',
            'ALCS': 'alcs',
            'NLCS': 'nlcs',
            'AL Division Series': 'alds',
            'NL Division Series': 'alds'
        }
        playoff_picture = {}
        for matchup, times in series.items():
            for instance in range(times):
                # First /teams/ link after the series header is the champ,
                # second is the runner-up; missing series yield None.
                try:
                    playoff_picture[abbreviation[matchup] + '_champ' +
                                    str(instance + 1)] = translate_team_id(
                                        playoffs.split('>' + matchup + '<')
                                        [1].split('a href="/teams/')[1].split(
                                            '/')[0], year)
                except IndexError:
                    playoff_picture[abbreviation[matchup] + '_champ' +
                                    str(instance + 1)] = None
                try:
                    playoff_picture[abbreviation[matchup] + '_runnerup' +
                                    str(instance + 1)] = translate_team_id(
                                        playoffs.split('>' + matchup + '<')
                                        [1].split('a href="/teams/')[2].split(
                                            '/')[0], year)
                except IndexError:
                    playoff_picture[abbreviation[matchup] + '_runnerup' +
                                    str(instance + 1)] = None
        write_playoff_data(year, playoff_picture)
    else:
        write_league_champs_non_ws(champs, year)
    total_time = time_converter(time.time() - start_time)
    logger.log('Done organizing league standings for ' + str(year) + ': time = ' + total_time + '\n\n')
    driver_logger.log("\t\tTime = " + total_time)
def create_strike_zone():
    """Derive an empirical strike zone from pitches taken for called strikes.

    For each axis (x, then y) the function walks outward from the median of
    all taken-strike coordinates until it finds three consecutive "sparse"
    intervals (fewer than ``threshold`` pitches), takes those as the strike
    boundaries, derives middle/meridian/ball lines from them, and writes the
    resulting point set to ``background/strike_zone.json``.
    """
    start_time = time.time()
    driver_logger.log('\tCreating Strike Zone')
    points = {}
    db = PitchFXDatabaseConnection(sandbox_mode)
    x_strikes = [
        x[0] for x in db.read(
            'select x from pitcher_pitches where x is not NULL and ball_strike = "strike" '
            'and swing_take = "take";')
    ]
    x_strikes.sort()
    y_strikes = [
        y[0] for y in db.read(
            'select y from pitcher_pitches where y is not NULL and ball_strike = "strike" '
            'and swing_take = "take";')
    ]
    y_strikes.sort()
    # FIX: the original iterated a dict keyed by the two medians
    # ({median(x): x_strikes, median(y): y_strikes}); if the medians were
    # ever equal the dict collapsed to one entry and an entire axis was
    # silently skipped.  Iterate explicit (orientation, data) pairs instead,
    # which also makes the 'x'-then-'y' ordering self-evident.
    for coordinate_orientation, coordinates in (('x', x_strikes), ('y', y_strikes)):
        median = stat.median(coordinates)
        threshold = 1000  # minimum pitch count for an interval to be "dense"
        for direction in ['positive', 'negative']:
            sparse_intervals = 0
            incrementer = 1 if direction == 'positive' else -1
            place_on_number_line = median + incrementer
            # A sparse interval is an interval with less than `threshold` data points.
            while sparse_intervals < 3:
                if coordinates.count(place_on_number_line) < threshold:
                    sparse_intervals += 1  # increment the number of sparse intervals
                else:
                    sparse_intervals = 0  # reset the number of sparse intervals
                place_on_number_line += incrementer  # move farther from the median in the appropriate direction
            # Back up over the three sparse intervals just consumed so the
            # boundary sits on the last dense interval.
            points[coordinate_orientation + ('_high_strike' if direction == 'positive' else '_low_strike')] = \
                place_on_number_line - (incrementer * 3)
        points[coordinate_orientation + '_middle'] = \
            (points[coordinate_orientation + '_low_strike'] +
             points[coordinate_orientation + '_high_strike']) / 2
        # Meridians cut the strike zone into thirds along this axis.
        for meridian, multiplier in {'_meridian_1': 1, '_meridian_2': 2}.items():
            points[coordinate_orientation + meridian] = \
                points[coordinate_orientation + '_low_strike'] + \
                ((points[coordinate_orientation + '_high_strike'] -
                  points[coordinate_orientation + '_low_strike']) / 3) * multiplier
        # Ball boundaries mirror the strike edges by one third-width:
        # low_ball below low_strike, high_ball above high_strike.
        for extreme, meridian in {'_low_ball': 1, '_high_ball': 2}.items():
            points[coordinate_orientation + extreme] = \
                points[coordinate_orientation + extreme[:-4] + 'strike'] + \
                (abs(points[coordinate_orientation + extreme[:-4] + 'strike'] -
                     points[coordinate_orientation + '_meridian_' + str(meridian)]) *
                 (-1 if 'low' in extreme else 1))
    db.close()
    # The relative location of background/ depends on the working directory
    # the driver was launched from, so fall back to the deeper path.
    try:
        with open(os.path.join("..", "background", "strike_zone.json"), "w") as strike_zone_file:
            json.dump(points, strike_zone_file, sort_keys=True, indent=4)
    except FileNotFoundError:
        with open(
                os.path.join("..", "..", "..", "background", "strike_zone.json"),
                "w") as strike_zone_file:
            json.dump(points, strike_zone_file, sort_keys=True, indent=4)
    driver_logger.log('\t\tTime = ' + time_converter(time.time() - start_time))
def comparisons_driver(most_recent_year: int) -> None:
    """Compute player and team similarity ("comparison") scores.

    Runs four sequential phases -- hitters, pitchers, team offense, team
    defense.  Each phase first gathers a pool of "possible comps" from the
    modern era (1998 onward), then scores every player/team from 1876 through
    ``most_recent_year`` against that pool.  Requires complete data back to
    1876 (checked via data_continuity) and a current year of at least 1998;
    otherwise logs the problem and returns without doing any work.
    """
    if most_recent_year < 1998 or not data_continuity(most_recent_year):
        driver_logger.log(
            "\tCannot make comparisons if there is not data continuity back to 1876 or the current year "
            "is prior to 1998")
        logger.log(
            "Cannot make comparisons if there is not data continuity back to 1876 or the current year is prior "
            "to 1998")
        return
    driver_logger.log("\tBeginning comparisons driver")
    start_time = time.time()
    logger.log("Beginning comparisons driver || Timestamp: " +
               datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
    # Pools of candidate comps.  Hitter/pitcher pools are nested per year
    # ({year: {"player;year": score}}); team pools are flat ({ty_uid: score}).
    possible_hitter_comps = {}
    possible_pitcher_comps = {}
    possible_offensive_comps = {}
    possible_defensive_comps = {}
    # ---- Phase 1: hitters ------------------------------------------------
    print('making hitter comparisons (overall)')
    driver_logger.log('\t\tMaking hitter comparisons (overall)')
    hc_time = time.time()
    logger.log("\tMaking hitter comparisons (overall)")
    logger.log('\t\tGathering list of possible comps')
    for year_to_compare in [
            year for year in range(most_recent_year, 1997, -1)
    ]:
        possible_hitter_comps[year_to_compare] = {}
        year_pa, year_totals = hitter_year_totals(year_to_compare, logger)
        for comp_hitter in gather_players(year_to_compare, "batting", True,
                                          logger):
            comp_hitter_pa, comp_stats = get_hitter_stats(
                comp_hitter, year_to_compare)
            # Only hitters with at least 300 PA qualify as possible comps.
            if comp_hitter_pa >= 300:
                possible_hitter_comps[year_to_compare][comp_hitter + ';' + str(year_to_compare)] =\
                    hitter_dr_calc(comp_hitter_pa, comp_stats, year_pa, year_totals)
    logger.log('\t\tBegin calculating comparisons scores')
    for comp_year in range(1876, most_recent_year + 1, 1):
        # Years with no recorded totals raise IndexError; skip them.
        try:
            year_pa, year_totals = hitter_year_totals(comp_year, logger)
        except IndexError:
            continue
        make_hitter_comparisons(
            gather_players(comp_year, "batting", False, logger), comp_year,
            possible_hitter_comps, year_pa, year_totals, logger)
    total_time = time_converter(time.time() - hc_time)
    logger.log("\t\tTime = " + total_time)
    driver_logger.log('\t\t\tTime = ' + total_time)
    # ---- Phase 2: pitchers -----------------------------------------------
    print('making pitcher comparisons (overall)')
    driver_logger.log('\t\tMaking pitcher comparisons (overall)')
    pc_time = time.time()
    logger.log("\tMaking pitcher comparisons (overall)")
    logger.log('\t\tGathering list of possible comps')
    # NOTE(review): the pitcher pool reaches back one year further than the
    # hitter pool (range stop 1996 vs 1997, i.e. includes 1997) -- confirm
    # this asymmetry is intentional.
    for year_to_compare in [
            year for year in range(most_recent_year, 1996, -1)
    ]:
        possible_pitcher_comps[year_to_compare] = {}
        year_pa, year_totals = pitcher_year_totals(year_to_compare, logger)
        for comp_pitcher in gather_players(year_to_compare, "pitching", True,
                                           logger):
            comp_pitcher_pa, comp_stats = get_pitcher_stats(
                comp_pitcher, year_to_compare)
            # Pitchers need at least 200 batters faced (presumably) to qualify.
            if comp_pitcher_pa >= 200:
                possible_pitcher_comps[year_to_compare][comp_pitcher + ';' + str(year_to_compare)] = \
                    pitcher_dr_calc(comp_pitcher_pa, comp_stats, year_pa, year_totals)
    logger.log('\t\tBegin calculating comparisons scores')
    for comp_year in range(1876, most_recent_year + 1, 1):
        try:
            year_pa, year_totals = pitcher_year_totals(comp_year, logger)
        except IndexError:
            continue
        make_pitcher_comparisons(
            gather_players(comp_year, "pitching", False, logger), comp_year,
            possible_pitcher_comps, year_pa, year_totals, logger)
    total_time = time_converter(time.time() - pc_time)
    logger.log("\t\tTime = " + total_time)
    driver_logger.log('\t\t\tTime = ' + total_time)
    # ---- Phase 3: team offense -------------------------------------------
    print('making team offensive comparisons')
    driver_logger.log('\t\tMaking team offensive comparisons')
    logger.log('\t\tGathering list of possible comps')
    oc_time = time.time()
    logger.log("\tMaking offensive comparisons (overall)")
    for year_to_compare in [
            year for year in range(most_recent_year, 1997, -1)
    ]:
        # NOTE(review): the gather phase baselines against hitter_year_totals
        # while the scoring loop below uses offensive_year_totals -- confirm
        # the two are meant to differ.
        year_pa, year_totals = hitter_year_totals(year_to_compare, logger)
        for comp_ty_uid in gather_teams(year_to_compare, logger):
            comp_team_pa, comp_stats = get_offensive_stats(comp_ty_uid)
            possible_offensive_comps[comp_ty_uid] = offensive_dr_calc(
                comp_team_pa, comp_stats, year_pa, year_totals)
    logger.log('\t\tBegin calculating comparisons scores')
    for comp_year in range(1876, most_recent_year + 1, 1):
        try:
            year_pa, year_totals = offensive_year_totals(comp_year, logger)
        except IndexError:
            continue
        make_offensive_comparisons(gather_teams(comp_year, logger),
                                   possible_offensive_comps, year_pa,
                                   year_totals, logger)
    total_time = time_converter(time.time() - oc_time)
    logger.log("\t\tTime = " + total_time)
    driver_logger.log('\t\t\tTime = ' + total_time)
    # ---- Phase 4: team defense -------------------------------------------
    print('making team defensive comparisons')
    driver_logger.log('\t\tMaking team defensive comparisons')
    logger.log('\t\tGathering list of possible comps')
    dc_time = time.time()
    logger.log("\tMaking defensive comparisons (overall)")
    for year_to_compare in [
            year for year in range(most_recent_year, 1997, -1)
    ]:
        year_pa, year_totals = defensive_year_totals(year_to_compare, logger)
        for comp_ty_uid in gather_teams(year_to_compare, logger):
            comp_team_pa, comp_stats = get_defensive_stats(comp_ty_uid)
            possible_defensive_comps[comp_ty_uid] = defensive_dr_calc(
                comp_team_pa, comp_stats, year_pa, year_totals)
    logger.log('\t\tBegin calculating comparisons scores')
    for comp_year in range(1876, most_recent_year + 1, 1):
        try:
            year_pa, year_totals = defensive_year_totals(comp_year, logger)
        except IndexError:
            continue
        make_defensive_comparisons(gather_teams(comp_year, logger),
                                   possible_defensive_comps, year_pa,
                                   year_totals, logger)
    total_time = time_converter(time.time() - dc_time)
    logger.log("\t\tTime = " + total_time)
    driver_logger.log('\t\t\tTime = ' + total_time)
    total_time = time_converter(time.time() - start_time)
    logger.log("Done making comparisons: time = " + total_time + '\n\n')
    driver_logger.log("\tDone making comparisons: time = " + total_time)
def get_year_data(year):
    """Download league-wide batting/pitching/fielding season totals.

    Registers ``year`` in the ``years`` table if absent, then fetches the
    three stat categories concurrently (results land in the module-level
    ``pages``/``strings`` dicts via load_url/assemble_stats) and writes them
    back to the database.
    """
    driver_logger.log('\tGathering year data')
    print("Gathering year data")
    start_time = time.time()
    # load_url and write_to_db communicate through these module globals,
    # keyed by stat category ("batting"/"pitching"/"fielding").
    global pages
    global strings
    pages = {}
    strings = {}
    logger.log('Beginning year_data download for ' + str(year) +
               ' || Timestamp: ' +
               datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
    # Maps site column names -> database column names per category.
    batting_list = {
        'PA': 'pa',
        'AB': 'ab',
        'R': 'r',
        'H': 'h',
        '2B': '2b',
        '3B': '3b',
        'HR': 'hr',
        'RBI': 'rbi',
        'SB': 'sb',
        'BB': 'bb',
        'SO': 'so',
        'batting_avg': 'ba',
        'onbase_perc': 'obp',
        'slugging_perc': 'slg',
        'onbase_plus_slugging': 'ops'
    }
    pitching_list = {
        'earned_run_avg': 'era',
        'SV': 'sv',
        'IP': 'ip',
        'ER': 'er',
        'whip': 'whip',
        'strikeouts_per_nine': 'k_9',
        'strikeouts_per_base_on_balls': 'k_bb'
    }
    fielding_list = {'E_def': 'e', 'fielding_perc': 'f_percent'}
    stat_list = {
        "batting": batting_list,
        "pitching": pitching_list,
        "fielding": fielding_list
    }
    db = DatabaseConnection(sandbox_mode)
    # Insert the year if it is not present yet; the index is dropped and
    # rebuilt around the insert (presumably to avoid a duplicate-key error).
    if not db.read('select * from years where year = ' + str(year) + ';'):
        db.write('alter table years drop index year;')
        db.write('insert into years (year) values (' + str(year) + ');')
        db.write('alter table years add index(year);')
    db.close()
    write_opening_day(year)
    download_start = time.time()
    logger.log("making HTTP requests for year data")
    # One worker per stat category; values are unused here, iterate keys only.
    with ThreadPoolExecutor(3) as executor1:
        for key in stat_list:
            executor1.submit(load_url, year, key)
    logger.log("\tdone making HTTP requests: time = " +
               time_converter(time.time() - download_start))
    for key, dictionary in stat_list.items():
        assemble_stats(key, dictionary, pages[key])
    write_start = time.time()
    logger.log("writing to database")
    with ThreadPoolExecutor(3) as executor2:
        for key in stat_list:
            executor2.submit(write_to_db, year, strings[key], key)
    logger.log("\tdone writing to database: time = " +
               time_converter(time.time() - write_start))
    total_time = time_converter(time.time() - start_time)
    logger.log('year_data download completed: time = ' + total_time + '\n\n')
    driver_logger.log('\t\tTime = ' + total_time)
def fielding_constructor(year):
    """Download fielder images, attributes, and per-team fielding stats.

    Scrapes the season's standard-fielding table, builds the module-level
    ``data`` dict ({player_id: {team: {index: row-info}}}), fetches each
    player's page concurrently, then writes attributes and team/stat rows
    to the database.
    """
    print('Downloading fielder images and attributes')
    driver_logger.log("\tDownloading fielder images and attributes")
    start_time = time.time()
    global data
    data = {}
    catcher_info = catcher_defense(year, logger)
    logger.log("Downloading fielder " + str(year) + " data || Timestamp: " +
               datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
    logger.log("\tAssembling list of players")
    table = str(BeautifulSoup(urlopen("https://www.baseball-reference.com/leagues/MLB/" + str(year) +
                                      "-standard-fielding.shtml"), 'html.parser')).\
        split('<table class="sortable stats_table" id')[1].split('<tbody>')[1].split('</tbody>')[0].split('<tr')
    for row in table:
        if 'data-stat="player" csk="' in row and 'data-append-csv="' in row:
            # NOTE(review): "\'" is just "'", so this replace is a no-op;
            # it was presumably meant to escape quotes ("\\'") for SQL --
            # confirm before changing, as downstream writes may rely on the
            # raw id.
            player_id = row.split('data-append-csv="')[1].split(
                '"')[0].replace("'", "\'")
            try:
                team = translate_team_id(
                    row.split('a href="/teams/')[1].split('/')[0], year)
                # 4-character ids mark franchise anomalies for this year.
                if len(team) == 4:
                    team = anomaly_team(year)
            except IndexError:
                # No team link: row is a multi-team total ("TOT").
                team = 'TOT'
            if player_id not in data:
                data[player_id] = {}
            if team not in data[player_id]:
                this_index = 1
                data[player_id][team] = {}
            else:
                # Repeated real team = another position row; repeated TOT
                # rows are duplicates and skipped.
                if team != 'TOT':
                    this_index = len(data[player_id][team]) + 1
                else:
                    continue
            data[player_id][team][this_index] = {
                'row': row.split('data-stat'),
                'temp_player': row.split('ata-stat="player" csk="')[1].split('" >')[0]
            }
    logger.log("\t\tDone assembling list of players")
    bulk_time = time.time()
    logger.log(
        "\tParsing player pages, downloading images, and extracting player attributes"
    )
    global temp_pages
    with ThreadPoolExecutor(os.cpu_count()) as executor:
        for player_id, dictionary in data.items():
            if len(temp_pages) == os.cpu_count():
                condense_pages()
            # FIX: was executor.submit(load_url(player_id)), which called
            # load_url synchronously on the main thread and submitted its
            # return value, defeating the thread pool.  Pass the callable
            # and its argument, matching every other submit in this module.
            # NOTE(review): with real concurrency the temp_pages length
            # check above can race with the workers -- verify condense_pages
            # tolerates that.
            executor.submit(load_url, player_id)
    condense_pages()
    write_player_attributes_to_db()
    with ThreadPoolExecutor(os.cpu_count()) as executor3:
        for player_id, dictionary in data.items():
            for team, dictionary2 in dictionary.items():
                for index, dictionary3 in dictionary2.items():
                    executor3.submit(intermediate, team, index, player_id)
    for player_id, dictionary in data.items():
        for team, dictionary2 in dictionary.items():
            # Players with no catcher data raise KeyError; fall back to {}.
            try:
                write_teams_and_stats(player_id, dictionary2, team, year,
                                      catcher_info[player_id])
            except KeyError:
                write_teams_and_stats(player_id, dictionary2, team, year, {})
    logger.log("\t\tTime = " + time_converter(time.time() - bulk_time))
    total_time = time_converter(time.time() - start_time)
    logger.log("Done downloading player images and attributes: time = " +
               total_time + '\n\n')
    driver_logger.log("\t\tTime = " + total_time)
def rank_driver(year):
    """Calculate per-year and all-time team offensive/defensive/overall ranks.

    First pass: for every season from ``year`` back to the oldest stored
    year, collect runs scored/allowed/differential and their standard
    deviations plus the World Series winner.  Second pass: feed those into
    team_ranker_ovr to produce the all-time rankings.
    """
    print("\n\ncalculating team ranks (year)")
    driver_logger.log("\tBeginning rank driver")
    start_time = time.time()
    logger.log("Beginning rank driver || Timestamp: " +
               datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
    logger.log("\tCalculating team ranks (year)")
    runs = {}
    allowed = {}
    difference = {}
    # Deviation dicts are keyed by str(year); runs/allowed/difference by int.
    standard_deviation_for = {}
    standard_deviation_against = {}
    standard_deviation_ovr = {}
    ws_winners = {}
    driver_logger.log("\t\tCalculating team ranks (year)")
    for data_year in range(year, get_oldest_year() - 1, -1):
        runs[data_year], allowed[data_year], difference[
            data_year] = team_ranker_year(data_year)
        # Each entry is a (team, total) pair; deviations are over the totals.
        standard_deviation_for[str(data_year)] = stdev(
            [team_runs_for[1] for team_runs_for in runs[data_year]])
        standard_deviation_against[str(data_year)] = stdev(
            [team_runs_against[1] for team_runs_against in allowed[data_year]])
        standard_deviation_ovr[str(data_year)] = stdev(
            [team_runs_diff[1] for team_runs_diff in difference[data_year]])
        ws_winners[data_year] = get_ws_winner(data_year)
    total_time = time_converter(time.time() - start_time)
    logger.log("\t\tTime = " + total_time)
    driver_logger.log("\t\t\tTime = " + total_time)
    second_time = time.time()
    driver_logger.log("\t\tCalculating team ranks (overall)")
    logger.log("\tCalculating team ranks (overall)")
    print("calculating team ranks (overall)")
    # (Removed dead code: the original built a `total_list` of all run totals
    # here that was never read afterwards.)
    # mean() accepts any iterable, so pass the dict views directly instead of
    # rebuilding lists with `[value for key, value in d.items()]`.
    average_deviation_for = mean(standard_deviation_for.values())
    average_deviation_against = mean(standard_deviation_against.values())
    average_deviation_diff = mean(standard_deviation_ovr.values())
    all_time_rpg = get_all_time_rpg()
    team_ranker_ovr(runs, True, "offRank_ovr", all_time_rpg,
                    standard_deviation_for, average_deviation_for)
    team_ranker_ovr(allowed, False, "defRank_ovr", all_time_rpg,
                    standard_deviation_against, average_deviation_against)
    team_ranker_ovr(difference, True, "ovrRank_ovr", all_time_rpg,
                    standard_deviation_ovr, average_deviation_diff,
                    ws_winners)
    second = time_converter(time.time() - second_time)
    logger.log("\t\tTime = " + second)
    driver_logger.log("\t\t\tTime = " + second)
    total_time = time_converter(time.time() - start_time)
    logger.log("Rank driver complete: time = " + total_time + '\n\n')
    driver_logger.log("\t\tRank driver time = " + total_time)