def resolve_player_id(player_num, year, team, player_type):
    """Translate a pitch-fx player number into a ``players.playerid`` value.

    The first/last name stored for ``player_num`` in ``players.xml`` is looked
    up in the ``players`` table.  When that exact name has no rows, the name
    produced by ``name_alterator`` is tried instead.  Exactly one matching row
    is returned directly; any other row count is handed to ``resolve_further``
    together with ``team``, ``year`` and ``player_type``.

    Returns:
        The resolved player id, or ``None`` when ``player_num`` is not present
        in ``players.xml`` or the lookup result is ``None``.
    """
    players_path = os.path.join("..", "..", "baseball-sync", "src", "import_data", "player_data", "pitch_fx", "xml",
                                "players.xml")
    wanted_id = str(player_num)
    for ent in minidom.parse(players_path).getElementsByTagName('player'):
        if ent.getAttribute('id') == wanted_id:
            last_name = ent.getAttribute('last')
            first_name = ent.getAttribute('first')
            break
    else:
        # Unknown player number: there is no name to search the database with.
        return None

    db = DatabaseConnection(sandbox_mode)
    try:
        pid = db.read('select playerid from players where lastName="' + last_name + '" and firstName="'
                      + first_name + '";')
        if len(pid) == 0:
            name = name_alterator(first_name, last_name)
            try:
                # The altered name is read as "<first>;<last>".
                parts = name.split(';')
                pid = db.read('select playerid from players where lastName = "' + parts[1]
                              + '" and firstName = "' + parts[0] + '";')
            except AttributeError:
                # name_alterator returned something without ``split``
                # (presumably None); that value becomes the lookup result.
                pid = name
    finally:
        db.close()

    if pid is None:
        return None
    if len(pid) == 1:
        return pid[0][0]
    return resolve_further(pid, team, year, player_type)
def populate_teams_table(year):
    """Insert the teams listed for ``year`` in ``yearTeams.txt`` into ``teams``.

    Only the first line of the file that contains ``str(year)`` is used.  Each
    comma-separated entry between that line's first and last field contributes
    one row; the team id is the text before the entry's first ``;`` and the
    team name comes from ``translate_team_name``.

    The ``teamId`` index is dropped before the inserts and is re-created even
    if an insert fails; the connection is always closed.
    """
    driver_logger.log('\tPopulating teams table')
    print("Populating teams table")
    start_time = time.time()
    logger.log('Begin populating teams table for ' + str(year) + ' || Timestamp: '
               + datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
    year_text = str(year)
    with open(os.path.join("..", "background", "yearTeams.txt"), 'rt') as file:
        db = DatabaseConnection(sandbox_mode)
        try:
            db.write('ALTER TABLE teams DROP INDEX teamId;')
            try:
                for line in file:
                    if year_text not in line:
                        continue
                    for team in line.split(',')[1:-1]:
                        team_id = team.split(';')[0]
                        db.write('insert into teams (teamId, teamName) values ("' + team_id + '", "'
                                 + translate_team_name(team_id) + '");')
                    break  # only the first matching line is processed
            finally:
                db.write('ALTER TABLE teams ADD INDEX(teamId);')
        finally:
            db.close()
    total_time = time.time() - start_time
    logger.log('Populating teams table completed: ' + time_converter(total_time))
    driver_logger.log('\t\tTime = ' + time_converter(total_time))
def retrieve_secondary_positions(self):
    """Return ``players.secondaryPositions`` for ``self.player_id``.

    Raises:
        IndexError: if the query returns no row for this player.
    """
    db = DatabaseConnection(sandbox_mode)
    try:
        return db.read('select secondaryPositions from players where playerId = "' + self.player_id + '";')[0][0]
    finally:
        # Close the connection even when the player has no row.
        db.close()
def pitcher_spray_chart_constructor(year):
    """Fan out ``reduce_functionality`` over pitchers lacking spray-chart data.

    Resets the module-level ``bad_gateway_data`` list, then, for seasons from
    1988 onward, submits one task per ``player_pitching`` row of ``year`` whose
    ``pa_infield`` is NULL.  Anything left in ``bad_gateway_data`` afterwards
    is passed to ``revisit_bad_gateways``.  Earlier seasons are only logged.
    """
    global bad_gateway_data
    print("creating pitcher spray charts")
    start_time = time.time()
    bad_gateway_data = []
    timestamp = datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S')
    logger.log("Downloading " + str(year) + " pitcher spray charts || Timestamp: " + timestamp)

    if year < 1988:
        driver_logger.log("\tNo pitcher spray chart data before 1988")
        logger.log("\tNo spray pitcher chart data before 1988")
        return

    driver_logger.log("\tCreating pitcher spray charts")
    connection = DatabaseConnection(sandbox_mode)
    pending_rows = set(connection.read('select PT_uniqueidentifier from player_pitching where year = '
                                       + str(year) + ' and pa_infield is NULL;'))
    connection.close()
    with ThreadPoolExecutor(os.cpu_count()) as pool:
        for pending_row in pending_rows:
            pool.submit(reduce_functionality, year, pending_row)
    driver_logger.log("\t\tTime = " + time_converter(time.time() - start_time))

    if bad_gateway_data:
        revisit_bad_gateways(year, bad_gateway_data)
    logger.log("Done downloading pitcher spray charts: time = " + time_converter(time.time() - start_time)
               + '\n\n')
def manager_tendencies(year):
    """Download, process and store manager tendencies for ``year``.

    Three phases: ``load_url`` is submitted once per (manager, team) pair that
    managed in ``year``; ``process_manager_tendencies`` then runs; finally one
    ``write_to_file`` task is submitted for every non-empty entry of the
    module-level ``stats`` mapping.  Timings go to both loggers.
    """
    global stats
    driver_logger.log("\tStoring manager tendencies")
    print("storing manager tendencies")
    start_time = time.time()
    timestamp = datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S')
    logger.log("Downloading " + str(year) + " manager tendencies || Timestamp: " + timestamp)
    logger.log('\tMaking HTTP requests')

    connection = DatabaseConnection(sandbox_mode)
    manager_rows = connection.read('select manager_teams.managerid, manager_teams.teamid from manager_teams, '
                                   'manager_year where manager_year.year = ' + str(year)
                                   + ' and manager_year.mt_uniqueidentifier = manager_teams.mt_uniqueidentifier;')
    connection.close()

    with ThreadPoolExecutor(os.cpu_count()) as download_pool:
        for manager_row in manager_rows:
            download_pool.submit(load_url, manager_row[0], manager_row[1])
    logger.log('\t\tTime = ' + time_converter(time.time() - start_time))

    process_manager_tendencies(year)

    write_time = time.time()
    logger.log('\tWriting data to database')
    with ThreadPoolExecutor(os.cpu_count()) as write_pool:
        for manager_team, tendencies in stats.items():
            if len(tendencies) == 0:
                continue
            write_pool.submit(write_to_file, year, manager_team, tendencies)
    logger.log('\t\tTime = ' + time_converter(time.time() - write_time))

    total_time = time_converter(time.time() - start_time)
    driver_logger.log("\t\tTime = " + total_time)
    logger.log("Done storing manager tendencies: time = " + total_time + '\n\n')
def get_pitcher_stats(pitcher, year):
    """Read a pitcher's plate appearances and counting stats for ``year``.

    Args:
        pitcher: ``player_teams.playerId`` of the pitcher.
        year: Season to read from ``player_pitching``.

    Returns:
        ``(pa, stats)`` where ``pa`` is an int and ``stats`` maps each of
        IP, H, 2B, 3B, HR, SO, BB, ER and SV to a float.  A stat whose lookup
        raises ``TypeError`` (for instance a NULL column) stays at 0.

    Raises:
        IndexError: if the pitcher has no ``player_pitching`` row for ``year``.
    """
    stats = {'IP': 0, 'H': 0, '2B': 0, '3B': 0, 'HR': 0, 'SO': 0, 'BB': 0, 'ER': 0, 'SV': 0}
    # Every query shares the same join and filter; only the column differs.
    from_where = (' from player_pitching, player_teams where player_pitching.PT_uniqueidentifier = '
                  'player_teams.PT_uniqueidentifier and player_teams.playerId = "' + pitcher
                  + '" and year = ' + str(year) + ';')
    db = DatabaseConnection(sandbox_mode)
    try:
        pa = int(db.read('select player_pitching.pa' + from_where)[0][0])
        for key in stats:
            try:
                stats[key] = float(db.read('select player_pitching.' + key + from_where)[0][0])
            except TypeError:
                continue
    finally:
        db.close()
    return pa, stats
def write_to_file(final_data, greater_than, field):
    """Write a cross-year rank into ``team_years.<field>``.

    ``final_data`` maps a year to a list of entries, where ``entry[0]`` is
    passed to ``translate_team_id`` and ``entry[1]`` is the value being
    compared.  On every pass the best *first* entry among all years is chosen
    (largest when ``greater_than`` is truthy, otherwise smallest; ties keep the
    year seen first), removed from its list, and, if its league is AL or NL,
    given the next rank number.

    ``final_data`` is consumed in place and is empty on return.

    NOTE(review): this only yields a global ordering if each year's list is
    already sorted best-first; confirm with the caller.
    """
    db = DatabaseConnection(sandbox_mode)
    try:
        counter = 0
        while len(final_data) > 0:
            target = None
            target_year = None
            for candidate_year, entries in final_data.items():
                head = entries[0]
                if target is None:
                    is_better = True
                elif greater_than:
                    is_better = head[1] > target[1]
                else:
                    is_better = head[1] < target[1]
                if is_better:
                    target, target_year = head, candidate_year

            team_id = translate_team_id(target[0], target_year)
            league = db.read('select league from team_years where teamId = "' + team_id + '" and year = '
                             + str(target_year) + ';')[0][0]
            if league.upper() in ['AL', 'NL']:
                counter += 1
                db.write('update team_years set ' + field + ' = ' + str(counter) + ' where teamId = "'
                         + team_id + '" and year = ' + str(target_year) + ';')

            # The chosen entry is always removed, ranked or not, so the loop terminates.
            del final_data[target_year][0]
            if len(final_data[target_year]) == 0:
                del final_data[target_year]
    finally:
        db.close()
def team_certainties(year):
    """Compute and store a ``certainty`` value for every team of ``year``.

    For each team, the "uncertain" plate appearances of its players
    (``pa - pa * certainty`` from ``player_<stat_type>``) are summed and the
    team value ``(team_pa - uncertain_pa) / team_pa`` is written to
    ``team_years.certainty``.  Players with no row, or with NULL values, for
    the stat type are skipped.  Teams whose ``pa`` is 0 are left untouched,
    because the ratio is undefined.

    NOTE(review): the batting and pitching passes write the same column, so
    the value left in the table is the one from the pitching pass. Confirm
    that this is intended.
    """
    print('aggregating team statistic certainties')
    driver_logger.log("\tAggregating team statistic certainties")
    start_time = time.time()
    logger.log("Calculating team certainties || Timestamp: "
               + datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
    db = DatabaseConnection(sandbox_mode)
    try:
        # The list of teams does not depend on the stat type, so read it once.
        team_rows = db.read('select ty_uniqueidentifier, teamid from team_years where year = ' + str(year))
        for stat_type in ("batting", "pitching"):
            for team_row in team_rows:
                ty_uid, team_id = team_row[0], team_row[1]
                uncertain_pa = 0
                for player in db.read('select playerid from player_positions where ty_uniqueidentifier = '
                                      + str(ty_uid) + ';'):
                    pt_uid = db.read('select pt_uniqueidentifier from player_teams where playerid = "'
                                     + player[0] + '" and teamid = "' + team_id + '";')[0][0]
                    try:
                        ent = db.read('select pa, certainty from player_' + stat_type + ' where year = '
                                      + str(year) + ' and pt_uniqueidentifier = ' + str(pt_uid) + ';')
                        player_pa = int(ent[0][0])
                        uncertain_pa += player_pa - (player_pa * float(ent[0][1]))
                    except (IndexError, TypeError):
                        continue
                pa = int(db.read('select pa from team_years where ty_uniqueidentifier = ' + str(ty_uid)
                                 + ';')[0][0])
                if pa == 0:
                    continue
                db.write('update team_years set certainty = ' + str((pa - uncertain_pa) / pa)
                         + ' where ty_uniqueidentifier = ' + str(ty_uid) + ';')
    finally:
        db.close()
    total_time = time_converter(time.time() - start_time)
    logger.log("Done calculating team certainties: time = " + total_time + '\n\n')
    driver_logger.log("\t\tTime = " + total_time)
def retrieve_team_info(self):
    """Return the parsed ``team_info`` literal for ``self.team_id`` / ``self.year``.

    The stored text is evaluated with ``literal_eval``.

    Raises:
        IndexError: if ``team_years`` has no row for this team and year.
    """
    db = DatabaseConnection(sandbox_mode)
    try:
        team_info = db.read('select team_info from team_years where teamId = "' + self.team_id
                            + '" and year = ' + str(self.year) + ';')[0][0]
    finally:
        db.close()
    return literal_eval(team_info)
def player_is_on_this_team(ty_uid, p_uid, player_type, year):
    """Tell whether a player's stat row belongs to the team behind ``ty_uid``.

    Args:
        ty_uid: ``team_years.ty_uniqueidentifier`` of the team.
        p_uid: Row whose first element is the ``p<x>_uniqueidentifier`` of the
            player's ``player_<player_type>`` record.
        player_type: Stat table suffix; its first letter also forms the
            id column name.
        year: Season of the stat record.

    Returns:
        ``True`` when the record's team is this team, or when the record's
        team is ``'TOT'`` and one of the player's ``player_teams`` rows for
        this team has a ``player_<player_type>`` row in ``year``.
    """
    stat_table = 'player_' + player_type
    uid_column = 'p' + player_type[0] + '_uniqueidentifier'
    db = DatabaseConnection(sandbox_mode=True)
    try:
        record_team = db.read('select teamId from player_teams where pt_uniqueidentifier = (select '
                              'pt_uniqueidentifier from ' + stat_table + ' where ' + uid_column + ' = '
                              + str(p_uid[0]) + ' and year = ' + str(year) + ');')[0][0]
        # Needed by both branches below, so it is read exactly once.
        this_team = db.read('select teamId from team_years where ty_uniqueidentifier = ' + str(ty_uid)
                            + ';')[0][0]
        if record_team == this_team:
            return True
        if record_team != 'TOT':
            return False
        for pt_uid in db.read('select pt_uniqueidentifier from player_teams where playerId = (select playerId '
                              'from player_teams where pt_uniqueidentifier = (select pt_uniqueidentifier from '
                              + stat_table + ' where year = ' + str(year) + ' and ' + uid_column + ' = '
                              + str(p_uid[0]) + '));'):
            has_stats = db.read('select count(*) from ' + stat_table + ' where year = ' + str(year)
                                + ' and pt_uniqueidentifier = ' + str(pt_uid[0]) + ';')[0][0] > 0
            if has_stats and db.read('select teamId from player_teams where pt_uniqueidentifier = '
                                     + str(pt_uid[0]) + ';')[0][0] == this_team:
                return True
        return False
    finally:
        db.close()
def write_pickoff(pitcher, team_id, year, base, attempts_successes):
    """Increment one pickoff counter on a pitcher's ``player_pitching`` row.

    Args:
        pitcher: ``player_teams.playerid`` of the pitcher.
        team_id: Team the pickoff occurred for; replaced by ``'TOT'`` when the
            pitcher has more than one ``player_pitching`` row in ``year``.
        year: Season of the row to update.
        base: Middle part of the column name ``pickoff_<base>_<kind>``.
        attempts_successes: Last part of that column name.

    A NULL counter becomes 1; otherwise it is incremented by one.

    Raises:
        IndexError: if no matching ``player_teams`` / ``player_pitching`` row exists.
    """
    column = 'pickoff_' + base + '_' + attempts_successes
    db = DatabaseConnection(sandbox_mode=True)
    try:
        # Count the pitcher's season rows through player_teams, which is the
        # table that holds the player id.
        # NOTE(review): assumes a multi-team season is stored as one row per
        # team plus a 'TOT' row; confirm against the loader.
        season_rows = db.read('select player_pitching.pp_uniqueidentifier from player_pitching, player_teams '
                              'where player_pitching.pt_uniqueidentifier = player_teams.pt_uniqueidentifier '
                              'and player_teams.playerid = "' + pitcher + '" and year = ' + str(year) + ';')
        if len(season_rows) > 1:
            team_id = 'TOT'
        pt_uid = db.read('select pt_uniqueidentifier from player_teams where playerid = "' + pitcher
                         + '" and teamid = "' + team_id + '";')[0][0]
        row_filter = ' where pt_uniqueidentifier = ' + str(pt_uid) + ' and year = ' + str(year) + ';'
        current = db.read('select ' + column + ' from player_pitching' + row_filter)[0][0]
        new_value = 1 if current is None else int(current) + 1
        db.write('update player_pitching set ' + column + ' = ' + str(new_value) + row_filter)
    finally:
        db.close()
def accept_post_request():
    """Handle a POST from the simulation form.

    When the form's ``post_id`` is 1, returns a JSON string describing the
    league structure for the submitted ``newest_year``.  Otherwise the away and
    home selections are parsed: in the text before ``.jpg`` the last four
    characters are the year and the three before them the team.  The
    ``year_info`` of both seasons is loaded and the result of ``simulation``
    is returned.
    """
    post_id = int(request.form.get('post_id'))
    if post_id == 1:
        print('form not submitted')
        new_year = request.form.get('newest_year')
        league_structure = get_league_structure(new_year)
        return json.dumps({
            'new_year': league_structure,
            'league_len': len(league_structure),
            'division_len': len(league_structure['nl']),
            'year': new_year
        })

    away_info = request.form.get('away_team')
    home_info = request.form.get('home_team')
    games = int(request.form.get('games'))
    away_stem = away_info.split('.jpg')[0]
    away_team, away_year = away_stem[-7:-4], int(away_stem[-4:])
    home_stem = home_info.split('.jpg')[0]
    home_team, home_year = home_stem[-7:-4], int(home_stem[-4:])

    db = DatabaseConnection(sandbox_mode=True)
    try:
        # Both years went through int() above, so only digits reach the SQL text.
        away_year_info = literal_eval(
            db.read('select year_info from years where year = ' + str(away_year) + ';')[0][0])
        home_year_info = literal_eval(
            db.read('select year_info from years where year = ' + str(home_year) + ';')[0][0])
    finally:
        db.close()
    return simulation(away_team, away_year, away_year_info, home_team, home_year, home_year_info, games)
def hof_finder():
    """Record Hall of Fame induction years for players and managers.

    Downloads the baseball-reference HOF page, takes the rows of its first
    ``<tbody>``, and for every row whose ``category_hof`` cell is ``Player`` or
    ``Manager`` writes the induction year into ``players.HOF`` or
    ``managers.HOF``.  Rows of any other category are ignored.

    Raises:
        IndexError: if the page or a row lacks one of the expected markers.
    """
    print("adding HOF data")
    driver_logger.log("\tAdding HOF data")
    start_time = time.time()
    logger.log("Begin finding hall of famers || Timestamp: "
               + datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
    # Release the HTTP response as soon as the page has been parsed.
    with urlopen('https://www.baseball-reference.com/awards/hof.shtml') as response:
        page = str(BeautifulSoup(response, 'html.parser'))
    hof_table = page.split('<tbody>')[1].split('</tbody>')[0]
    rows = hof_table.split('<tr>')[1:]
    # induction category -> (table, id column)
    targets = {'Player': ('players', 'playerId'), 'Manager': ('managers', 'managerId')}
    db = DatabaseConnection(sandbox_mode)
    try:
        for row in rows:
            person = row.split('data-append-csv="')[1].split('"')[0]
            year = row.split('<a href="/awards/hof_')[1].split('.shtml')[0]
            induction_type = row.split('data-stat="category_hof">')[1].split('<')[0]
            if induction_type not in targets:
                continue
            table, id_column = targets[induction_type]
            db.write('update ' + table + ' set HOF = ' + str(year) + ' where ' + id_column + ' = "'
                     + person + '";')
    finally:
        db.close()
    total_time = time_converter(time.time() - start_time)
    logger.log("Done finding hall of famers: time = " + total_time + '\n\n')
    driver_logger.log("\t\tTime = " + total_time)
def consolidate_data(year):
    """Build and store the consolidated roster info of every team in ``year``.

    For each ``team_years`` row of the season, the hitter spots, player
    positions and batting/pitching/fielding stats are gathered through the
    ``consolidate_*`` helpers and handed to ``write_roster_info``.  The team id
    and the time spent per team are logged.
    """
    driver_logger.log("\tConsolidating data")
    print("Consolidating data")
    start_time = time.time()
    logger.log("Consolidating team data || Timestamp: "
               + datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
    db = DatabaseConnection(sandbox_mode)
    try:
        # Fetch the team id together with the key instead of one query per team.
        for team_row in db.read('select ty_uniqueidentifier, teamId from team_years where year = '
                                + str(year) + ';'):
            ty_uid, team_id = team_row[0], team_row[1]
            team_start_time = time.time()
            logger.log('\t' + team_id)
            write_roster_info(
                ty_uid, {
                    'hitter_spots': consolidate_hitter_spots(ty_uid),
                    'player_positions': consolidate_player_positions(ty_uid),
                    'batter_stats': consolidate_player_stats(ty_uid, 'batting', year),
                    'pitcher_stats': consolidate_player_stats(ty_uid, 'pitching', year),
                    'fielder_stats': consolidate_player_stats(ty_uid, 'fielding', year)
                })
            logger.log('\t\tTime = ' + time_converter(time.time() - team_start_time))
    finally:
        db.close()
    total_time = time_converter(time.time() - start_time)
    logger.log("Done consolidating team data: Time = " + total_time + '\n\n')
    driver_logger.log("\t\tTime = " + total_time)
def get_runs_for(team, year):
    """Return ``team_years.r`` for ``team`` in ``year`` as an int.

    Raises:
        IndexError: if there is no ``team_years`` row for the team and year.
    """
    db = DatabaseConnection(sandbox_mode)
    try:
        return int(db.read('select r from team_years where teamid = "' + team + '" and year=' + str(year)
                           + ';')[0][0])
    finally:
        db.close()
def get_games(team, year):
    """Return ``team_years.g`` for ``team`` in ``year`` as an int.

    Raises:
        IndexError: if there is no ``team_years`` row for the team and year.
    """
    db = DatabaseConnection(sandbox_mode)
    try:
        return int(db.read('select g from team_years where teamid = "' + team + '" and year = ' + str(year)
                           + ';')[0][0])
    finally:
        db.close()
def write_to_file(year):
    """Store each player's position list for every team in ``pages``.

    ``pages`` maps a team id to an iterable of HTML row strings.  For each row
    carrying a ``data-append-csv`` player id, the ``pos_summary`` cell is split
    on ``-``; positions flagged by ``this_is_position_player_pitching`` or
    ``this_is_pitcher_playing_in_the_field`` are dropped, and the rest are
    stored comma-separated in ``player_positions`` (inserted when the player
    has no row for this team-year, updated otherwise).

    An ``IndexError`` while handling a team ends that team's processing; it is
    logged and the next team is handled.
    """
    for team_id, table in pages.items():
        logger.log("\t\tgathering and writing " + team_id + " positions")
        db = DatabaseConnection(sandbox_mode)
        try:
            # Same for every row of the team, so it is looked up once.
            ty_uid = str(db.read('select TY_uniqueidentifier from team_years where teamId = "' + team_id
                                 + '" and year = ' + str(year) + ';')[0][0])
            for row in table:
                if 'data-append-csv="' not in row:
                    continue
                player_id = row.split('data-append-csv="')[1].split('"')[0]
                position_summary = row.split('data-stat="pos_summary" >')[1].split('<')[0]
                if '-' in position_summary:
                    positions = position_summary.split('-')
                    kept = []
                    for position_index, position in enumerate(positions):
                        # don't give position players RP eligibility who threw mop-up innings
                        if this_is_position_player_pitching(player_id, positions, position_index, team_id,
                                                            year) or \
                                this_is_pitcher_playing_in_the_field(player_id, positions, position_index,
                                                                     team_id, year):
                            continue
                        kept.append(position)
                    # Joining the kept positions never leaves a dangling comma,
                    # whichever entries were dropped.
                    position_list = ','.join(kept)
                else:
                    position_list = position_summary
                quoted_id = '"' + player_id + '"'
                quoted_positions = '"' + position_list + '"'
                existing = db.read('select PPos_uniqueidentifier from player_positions where playerId='
                                   + quoted_id + ' and TY_uniqueidentifier = ' + ty_uid + ';')
                if len(existing) == 0:
                    db.write('insert into player_positions (PPos_uniqueidentifier, playerId, positions, '
                             'TY_uniqueidentifier) values (default, ' + quoted_id + ',' + quoted_positions
                             + ', ' + ty_uid + ');')
                else:
                    db.write('update player_positions set positions = ' + quoted_positions
                             + ' where playerId = ' + quoted_id + ' and TY_uniqueidentifier = ' + ty_uid + ';')
        except IndexError:
            # A row lacked an expected marker, or a lookup returned nothing.
            logger.log("\t\t\tincomplete position data for " + team_id + "; remaining rows skipped")
        finally:
            db.close()
def get_runs_against(team, year):
    """Return ``team_years.ra`` for ``team`` in ``year`` as an int.

    The value is also stored in the module-level ``runs_against`` variable.

    Raises:
        IndexError: if there is no ``team_years`` row for the team and year.
    """
    global runs_against
    db = DatabaseConnection(sandbox_mode)
    try:
        runs_against = int(db.read('select ra from team_years where teamid = "' + team + '" and year = '
                                   + str(year) + ';')[0][0])
    finally:
        db.close()
    return runs_against
def get_team_id(uid, player_type):
    """Return the ``teamId`` behind a ``player_<player_type>`` record.

    Args:
        uid: Value of the record's ``p<x>_uniqueidentifier`` column, where
            ``<x>`` is the first letter of ``player_type``.
        player_type: Stat table suffix.

    Raises:
        IndexError: if the lookup returns no row.
    """
    db = DatabaseConnection(sandbox_mode=True)
    try:
        return db.read('select teamId from player_teams where pt_uniqueidentifier = (select pt_uniqueidentifier '
                       'from player_' + player_type + ' where p' + player_type[0] + '_uniqueidentifier = '
                       + str(uid) + ')')[0][0]
    finally:
        db.close()
def retrieve_full_name(self):
    """Load ``self.first_name`` and ``self.last_name`` for ``self.player_id``.

    Raises:
        IndexError: if the ``players`` table has no row for this player.
    """
    db = DatabaseConnection(sandbox_mode)
    try:
        name = db.read('select firstName, lastName from players where playerId = "' + self.player_id + '";')[0]
    finally:
        db.close()
    self.first_name = name[0]
    self.last_name = name[1]
def clean_up_deadlocked_file():
    """Replay the statements saved in ``utilities/deadlocked.txt``, then empty it.

    Every line of the file is passed to ``db.write`` unchanged.  The file is
    truncated only after all lines were written without an exception.
    """
    driver_logger.log("\tCleaning up deadlocked records")
    deadlocked_path = os.path.join("utilities", "deadlocked.txt")
    with open(deadlocked_path, 'r') as f:
        db = DatabaseConnection(sandbox_mode)
        try:
            for line in f:
                db.write(line)
        finally:
            db.close()
    # Opening for writing truncates the file; nothing needs to be written.
    with open(deadlocked_path, "w"):
        pass
def consolidate_player_positions(ty_uid):
    """Return the stringified position roster of the team-year ``ty_uid``.

    Each ``player_positions`` row of the team-year becomes a
    ``playerId -> list of positions`` entry (the stored text split on commas);
    the mapping is rendered by ``stringify_player_positions``.
    """
    connection = DatabaseConnection(sandbox_mode=True)
    position_rows = connection.read('select playerId, positions from player_positions where '
                                    'ty_uniqueidentifier = ' + str(ty_uid) + ';')
    connection.close()
    return stringify_player_positions({row[0]: row[1].split(',') for row in position_rows})
def write_to_file(year, awards_dict_list):
    """Store award winners for ``year`` in the ``years`` table.

    Args:
        year: Season whose ``years`` row is updated.
        awards_dict_list: Any number of dicts mapping a ``years`` column name
            to the string stored in it.

    Nothing is written when all of the dicts are empty.
    """
    assignments = []
    for dictionary in awards_dict_list:
        for key, value in dictionary.items():
            assignments.append(key + ' = "' + value + '"')
    if not assignments:
        return
    db = DatabaseConnection(sandbox_mode)
    try:
        db.write('update years set ' + ', '.join(assignments) + ' where year = ' + str(year) + ';')
    finally:
        db.close()
def get_most_recent_year():
    """Return the largest ``year`` in the ``years`` table.

    Falls back to 1876 when the query fails or yields no usable value.
    """
    db = DatabaseConnection(sandbox_mode=False)
    try:
        return int(db.read("select year from years order by year desc limit 1;")[0][0])
    except Exception:
        # Best effort by design, but interpreter-exit signals such as
        # KeyboardInterrupt are no longer swallowed.
        return 1876
    finally:
        db.close()
def player_was_on_more_than_one_team(p_uid, player_type, year):
    """Tell whether a player's stat record for ``year`` is filed under ``'TOT'``.

    Args:
        p_uid: Row whose first element is the ``p<x>_uniqueidentifier`` of the
            ``player_<player_type>`` record.
        player_type: Stat table suffix; its first letter forms the id column.
        year: Season of the record.

    Raises:
        IndexError: if the lookup returns no row.
    """
    db = DatabaseConnection(sandbox_mode=True)
    try:
        record_team = db.read('select teamId from player_teams where pt_uniqueidentifier = (select '
                              'pt_uniqueidentifier from player_' + player_type + ' where p' + player_type[0]
                              + '_uniqueidentifier = ' + str(p_uid[0]) + ' and year = ' + str(year)
                              + ');')[0][0]
    finally:
        db.close()
    return record_team == 'TOT'
def manager_table_constructor():
    """Scrape the all-time manager list and queue one insert per manager.

    Downloads the baseball-reference managers index, and for every table row
    with a ``csk`` name cell extracts the manager id, last/first name, wins and
    losses, submitting them to ``write_to_file`` as one comma-separated value
    string.  Rows missing any expected marker are skipped.

    The ``managerId`` index is dropped beforehand and re-created afterwards,
    also when the scraping loop fails.
    """
    driver_logger.log('\tGathering manager data (all-time)')
    print("Gathering manager data (all-time)")
    start_time = time.time()
    logger.log('Begin constructing manager table || Timestamp: '
               + datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
    # Release the HTTP response as soon as the page has been parsed.
    with urllib.request.urlopen('https://www.baseball-reference.com/managers/') as response:
        table = str(bs(response, 'html.parser'))
    rows = table.split('<tr')

    db = DatabaseConnection(sandbox_mode=True)
    try:
        db.write('ALTER TABLE managers DROP INDEX managerId;')
    finally:
        db.close()

    try:
        with ThreadPoolExecutor(os.cpu_count()) as executor:
            for row in rows:
                if '<td class="left" csk="' not in row:
                    continue
                this_row = row.split('</tr>')[0]
                try:
                    manager_id = this_row.split('<a href="/managers/')[1].split('.shtml')[0]
                    name_parts = this_row.split('<td class="left" csk="')[1].split('"')[0].split(',')
                    last = name_parts[0]
                    first = name_parts[1]
                    wins = this_row.split('data-stat="W">')[1].split('<')[0]
                    loses = this_row.split('data-stat="L">')[1].split('<')[0]
                except (AttributeError, IndexError):
                    # A missing marker makes ``split(...)[1]`` raise IndexError.
                    continue
                executor.submit(write_to_file,
                                '"' + manager_id + '","' + last + '","' + first + '",' + wins + ',' + loses)
    finally:
        db = DatabaseConnection(sandbox_mode=True)
        try:
            db.write('ALTER TABLE managers ADD INDEX(managerId);')
        finally:
            db.close()

    total_time = time.time() - start_time
    logger.log('Constructing manager table completed: time = ' + time_converter(total_time))
    driver_logger.log('\t\tTime = ' + time_converter(total_time))
def data_continuity(most_recent_year):
    """Tell whether ``years`` has a row for every year from 1876 up to ``most_recent_year``.

    Years are checked from ``most_recent_year`` downwards and the check stops
    at the first missing year.

    Returns:
        ``False`` if any year in the range is missing, otherwise ``True``
        (including when ``most_recent_year`` is below 1876 and there is
        nothing to check).
    """
    continuous = True
    db = DatabaseConnection(sandbox_mode=True)
    try:
        for year in range(most_recent_year, 1875, -1):
            if len(db.read('select year from years where year = ' + str(year) + ';')) == 0:
                continuous = False
                break
    finally:
        db.close()
    return continuous
def gather_players(year, player_type, gather_all, logger):
    """List the player ids that have a ``player_<player_type>`` row in ``year``.

    Args:
        year: Season to query.
        player_type: Stat table suffix.
        gather_all: When truthy every player is returned; otherwise only those
            whose ``certainty`` lies strictly between 0.0 and 1.0.
        logger: Object whose ``log`` method receives a progress message.

    Returns:
        A list holding the first column of every matching row.
    """
    connection = DatabaseConnection(sandbox_mode)
    stat_table = "player_" + player_type
    base_query = ("select playerId from player_teams, " + stat_table + " where " + stat_table
                  + ".PT_uniqueidentifier = player_teams.PT_uniqueidentifier and " + stat_table + ".year = "
                  + str(year))
    if gather_all:
        message, query_tail = '\t\t\tGathering all players', ";"
    else:
        message, query_tail = ('\t\t\tGathering players that need comps',
                               " and certainty < 1.0 and certainty > 0.0;")
    logger.log(message)
    matching_rows = list(connection.read(base_query + query_tail))
    connection.close()
    return [row[0] for row in matching_rows]
def get_defensive_stats(ty_uid):
    """Read a team-year's plate appearances and defensive counting stats.

    Args:
        ty_uid: ``team_years.ty_uniqueidentifier`` of the team-year.

    Returns:
        ``(pa, stats)`` where ``pa`` is an int and ``stats`` maps each of
        ER, HA, 2BA, 3BA, HRA, BBA and K to an int.

    Raises:
        IndexError: if no ``team_years`` row matches ``ty_uid``.
        TypeError: if ``pa`` or any of the stat columns is NULL.
    """
    stat_columns = ('ER', 'HA', '2BA', '3BA', 'HRA', 'BBA', 'K')
    db = DatabaseConnection(sandbox_mode)
    try:
        # All values come from one row, so a single query fetches them.
        row = db.read('select pa, ' + ', '.join(stat_columns) + ' from team_years where ty_uniqueidentifier = '
                      + str(ty_uid) + ';')[0]
    finally:
        db.close()
    pa = int(row[0])
    stats = {column: int(value) for column, value in zip(stat_columns, row[1:])}
    return pa, stats
def write_to_file(team_data, year):
    """Update the ``team_years`` row of each team in ``team_data`` for ``year``.

    Args:
        team_data: Maps a team id to a dict of ``column -> SQL value text``.
            Columns whose value is the empty string are left out.
        year: Season of the rows to update.

    A team with no non-empty values is logged but not written, since an
    ``update`` without assignments is not valid SQL.
    """
    db = DatabaseConnection(sandbox_mode)
    try:
        for team, data in team_data.items():
            logger.log("\tWriting " + team + " data to database")
            assignments = [field + ' = ' + value for field, value in data.items() if value != '']
            if not assignments:
                continue
            db.write('update team_years set ' + ', '.join(assignments) + ' where teamid = "' + team
                     + '" and year = ' + str(year) + ';')
    finally:
        db.close()