def statistics_war(year):
    """Compute defensive WAR for every non-pitcher fielding line of a season.

    Reads ``statistics_fielding`` for ``year``, pro-rates each player's
    defensive component values and positional adjustment by innings played,
    and replaces the resulting rows in ``processed_compWAR_defensive``.
    """
    # 1450 innings is treated as a full season.
    full_season_inn = 1450

    fielding_sql = (
        "SELECT player_name, team_id, pos, inn "
        "FROM statistics_fielding WHERE year = %s; "
    ) % (year)

    war_rows = []
    for player_name, team_id, position, inn in db.query(fielding_sql):
        # Apostrophes are doubled so the name can be embedded in SQL.
        search_name = player_name.replace("'", "''")
        team_abb = db.lookupValues(
            "teams", ("team_id", "year",), (team_id, year),
            val="team_abb", operators=("=", "="))[0]

        # Pitchers get no defensive row (the team lookup above still runs).
        if position.lower() == 'p':
            continue

        rn_val, err_val, arm_val, pb_val = helper.get_def_values(
            search_name, position, year)
        defense = (float(inn) * (rn_val + err_val + arm_val + pb_val)
                   / full_season_inn)
        adj = float(helper.get_pos_adj(position.upper()))
        position_adj = adj * (float(inn) / full_season_inn)

        war_rows.append({
            'year': year,
            'player_name': player_name,
            'team_abb': team_abb,
            'position': position,
            'age': None,
            'pa': None,
            'inn': inn,
            'defense': defense,
            'position_adj': position_adj,
            'dWAR': (defense + position_adj) / 10,
        })

    if war_rows:
        db.insertRowDict(war_rows, 'processed_compWAR_defensive',
                         replace=True, insertMany=True, rid=0)
    db.conn.commit()
def calculate_war():
    """Derive park-adjusted hitting metrics for every ZiPS offensive line.

    Reads all rows of ``zips_offense``, computes PA, BABIP and the metrics
    returned by ``helper.get_zips_offensive_metrics`` (using the previous
    year's park factors and constants), and replaces the rows in
    ``zips_WAR_hitters_comp`` in batches of 1000.
    """
    batting_rows = db.query(
        "SELECT year, player_name, team_abb, ab, h, 2b, 3b, hr, bb, so, "
        "hbp, ibb, sh, sf, sb, cs FROM zips_offense "
    )

    hitters = []
    for (season, name, team, ab, h, doubles, triples, hr, bb, so,
         hbp, ibb, sh, sf, sb, cs) in batting_rows:
        pa = ab + bb + hbp + ibb + sh + sf
        all_walks = bb + ibb
        singles = h - doubles - triples - hr
        team = team.upper()

        # Park factors and league constants come from the prior season.
        pf = float(helper.get_park_factors(team, season - 1)) / float(100)
        # NOTE(review): the denominator is not guarded against zero.
        babip = float(h - hr) / float(ab + sh + sf - so - hr)

        metrics = helper.get_zips_offensive_metrics(
            season - 1, pf, pa, ab, all_walks, hbp,
            singles, doubles, triples, hr, sb, cs)
        (_ops, _woba, park_woba, ops_plus,
         _wrc, _wrc27, wrc_plus, _raa, _owar) = metrics

        hitters.append({
            'year': season,
            'player_name': name,
            'team_abb': team,
            'pf': pf,
            'pa': pa,
            'babip': babip,
            'OPS_plus': ops_plus,
            'park_wOBA': park_woba,
            'wRC_plus': wrc_plus,
        })

    # An empty list yields an empty range, so nothing is written.
    for start in range(0, len(hitters), 1000):
        db.insertRowDict(hitters[start:start + 1000], 'zips_WAR_hitters_comp',
                         insertMany=True, replace=True, rid=0, debug=1)
        db.conn.commit()
def initiate(yr, _type, player_mapper):
    """Load one historical ZiPS splits CSV into the ``zips_<_type>_splits`` table.

    ``player_mapper`` maps CSV player names to corrected names; names that
    are not in it are passed to ``helper.input_name`` and stored unchanged.
    """
    path = '/Users/connordog/Dropbox/Desktop_Files/Work_Things/CodeBase/Python_Scripts/Python_Projects/NSBL/ad_hoc/historical_csv_files/'
    csv_file = path + ('%s_zips_%s_splits.csv' % (yr, _type))
    print("%s %s" % (yr, _type))

    split_rows = []
    with codecs.open(csv_file, 'rb', encoding='utf-8', errors='ignore') as f:
        for line_no, row in enumerate(csv.reader(f)):
            # The first line of the file is the column header.
            if line_no == 0:
                continue

            # The CSV's own year column is ignored in favour of ``yr``.
            (_csv_year, player_name, vs_hand, ab, h, _2b, _3b, hr, rbi,
             bb, so, hbp, ibb, sh, sf) = row

            if player_name in player_mapper:
                player_name = player_mapper.get(player_name)
            else:
                helper.input_name(player_name)

            split_rows.append({
                "year": yr,
                "player_name": player_name,
                "vs_hand": vs_hand,
                "ab": ab,
                "h": h,
                "2b": _2b,
                "3b": _3b,
                "hr": hr,
                "rbi": rbi,
                "bb": bb,
                "so": so,
                "hbp": hbp,
                "ibb": ibb,
                "sh": sh,
                "sf": sf,
            })

    if split_rows:
        db.insertRowDict(split_rows, 'zips_%s_splits' % (_type),
                         replace=True, insertMany=True, rid=0)
    db.conn.commit()
def scrape_cur_standings():
    """Report whether the published standings differ from the stored ones.

    Scrapes the 'Divisional' section of the league standings page and, for
    each team row, compares wins + losses against the highest
    ``games_played`` stored in ``team_standings`` for that team and year.

    Returns:
        True if at least one team's games-played total differs from the
        stored value (a team with no stored row counts as 0 games played),
        otherwise False.
    """
    table_url = 'http://thensbl.com/orgstand.htm'
    tables = get_tables(table_url)

    prev_gp_qry = """SELECT ts.year , ts.team_name , MAX(ts.games_played) AS gp FROM team_standings ts WHERE 1 AND ts.team_name = '%s' AND ts.year = %s GROUP BY ts.team_name, ts.year"""

    standings_changed = False
    for table in tables:
        titles = table.find_all('tr', class_=re.compile('dmrptsecttitle'))
        for title in titles:
            if title.get_text() != 'Divisional':
                continue

            rows = table.find_all('tr', class_=re.compile('dmrptbody'))
            for row in rows:
                element = []
                for data in row:
                    if data.get_text() == ' ':
                        element.append(None)
                    else:
                        # strip removes leading/trailing white space
                        element.append(data.get_text().strip())

                year = element[0]
                team_location_name = element[1]
                wins = element[2]
                losses = element[3]
                if team_location_name is None:
                    continue

                full_name = helper.get_team_name(team_location_name, year)
                prev_gp = db.query(prev_gp_qry % (full_name, year))
                if not prev_gp:
                    # No stored standings for this team/year yet.
                    print("\n\nNEW SEASON!!!!!\n\n")
                    # Fix: this used to be `prev_gp == 0`, a no-op
                    # comparison that left prev_gp as the empty result.
                    prev_gp = 0
                else:
                    prev_gp = prev_gp[0][2]

                if int(wins) + int(losses) != prev_gp:
                    standings_changed = True

    return standings_changed
def input_data(ratings, sql_table, cats, year):
    """Turn scraped rows into dicts and replace them into ``sql_table``.

    ``cats`` names the column for each position of a row in ``ratings``;
    columns named 'foo' are discarded.  Summary rows ('Total', 'Other',
    blank) are dropped.  Rows without a usable player/team_abb pair but with
    a usable ``team_name`` are kept as team rows with the name expanded via
    ``helper.get_team_name``.
    """
    print('\t' + sql_table)

    # Placeholder values that mark a row as a summary/blank line.
    ignorable = ('Total', None, '', 'Other')

    records = []
    for player in ratings:
        record = {'year': year}
        for column, value in zip(cats, player):
            # any category we aren't interested in recording is marked foo
            if column == 'foo':
                continue
            if column == 'player_name' and value is not None:
                # Drop the '*' / '#' markers attached to names.
                value = value.replace('*', '').replace('#', '')
            record[column] = value

        is_player_row = (record.get("player_name") not in ignorable
                         and record.get("team_abb") not in ignorable)
        if is_player_row:
            records.append(record)
        elif record.get("team_name") not in ignorable:
            record['team_name'] = helper.get_team_name(
                record.get("team_name"), year)
            if sql_table == 'team_standings':
                record['games_played'] = (int(record.get('w'))
                                          + int(record.get('l')))
            records.append(record)

        # Names are registered even for rows that were not kept.
        if 'player_name' in record:
            helper.input_name(record.get('player_name'))

    if records:
        db.insertRowDict(records, sql_table, insertMany=True, rid=0,
                         replace=True)
    db.conn.commit()
def process_prospect_list(year, list_type, list_key):
    """Download one prospect list and store each player in ``fg_raw``.

    String values are stripped of non-ASCII characters, keys are lowercased
    (with '%' doubled), and players without a ``playername`` are skipped.
    A missing ``playerid`` is synthesised from the name and ``type``; a
    missing team becomes '--empty--'.  Each player is inserted and committed
    individually, then the function sleeps for ``sleep_time``.
    """
    list_url = base_url + ("%s%s" % (year, list_key))
    print("\n%s %s %s" % (year, list_type, list_url))

    prospects = getter.get_url_data(list_url, "json")
    for plr in prospects:
        entry = {'prospect_type': list_type}

        for ky, val in plr.items():
            key_lc = ky.lower()
            if type(val) in (str, unicode):
                ascii_val = "".join(ch for ch in val if ord(ch) < 128)
                if val != ascii_val and 'name' in key_lc:
                    # Flag names that lost characters in the ASCII strip.
                    print('\n\n\n\nUNICODE NAME!!!! - \n\t%s' % (val,))
                    print('\t%s \n\n\n\n' % (ascii_val,))
                if 'playerid' in key_lc:
                    val = ascii_val.replace(' ', '')
                else:
                    val = ascii_val
            entry[key_lc.replace("%", "%%")] = val

        if 'playername' not in entry or entry['playername'] == '':
            continue

        if 'playerid' not in entry or entry['playerid'] == '':
            name_part = str(entry['playername'].replace(' ', '')
                            .replace('*', '').replace(",", ""))
            type_part = str(entry['type'].replace(' ', ''))
            entry['playerid'] = name_part + '_' + type_part

        if 'team' not in entry or entry['team'] == '':
            entry['team'] = '--empty--'

        print('\t%s %s %s' % (year, list_key, entry['playername']))
        helper2.input_name(entry.get('playername'))
        db.insertRowDict(entry, 'fg_raw', insertMany=False, replace=True,
                         rid=0, debug=1)
        db.conn.commit()

    sleep(sleep_time)
def process_division(year):
    """Estimate each team's division-title probability.

    For both strength types and each of the six divisions, reads the latest
    ``__playoff_probabilities`` rows, multiplies the probabilities returned
    by ``get_probabilities`` against the team's division rivals, and hands
    the per-division results to ``adjust_probabilities`` for the
    'win_division' column.
    """
    print('division')

    strength_types = ('roster', 'projected')
    divisions = ('AL East', 'AL Central', 'AL West',
                 'NL East', 'NL Central', 'NL West')
    latest_rows_sql = (
        "SELECT team_abb, team_name, mean_W/162.0, var, year, games_played "
        "FROM __playoff_probabilities "
        "JOIN (SELECT team_abb, MAX(year) AS year, "
        "MAX(games_played) AS games_played "
        "FROM __playoff_probabilities GROUP BY team_abb, year) t2 "
        "USING (team_abb, year, games_played) "
        "WHERE strength_type = '%s' AND division = '%s' AND year = %s;"
    )

    for _type in strength_types:
        for div in divisions:
            div_dict = {}
            for row in db.query(latest_rows_sql % (_type, div, year)):
                # NB: ``year`` is rebound from the row, as before.
                (team_abb, team_name, strength_pct, var,
                 year, games_played) = row

                strength_pct = float(strength_pct)
                if games_played > 162:
                    # Rescale when more than 162 games have been played.
                    strength_pct = float(
                        (strength_pct * 162.0) / float(games_played))

                (division, div_teams,
                 conf_teams, non_conf_teams) = helper.get_division(
                     team_name, year)

                rival_probs = get_probabilities(
                    team_name, div_teams, strength_pct, games_played,
                    float(var), _type, year)[0]
                div_dict[team_name] = [
                    np.prod(rival_probs), 1.0, False, year, games_played]

            adjust_probabilities(div_dict, 'win_division', 1.0, _type)
def initiate(yr, _type, player_mapper):
    """Load one historical ZiPS CSV into the ``zips_<_type>`` table.

    ``_type`` selects the column layout: 'offense', 'pitching' or
    'defense'.  Rows of any other type are read but ignored.
    ``player_mapper`` maps CSV player names to corrected names; names that
    are not in it are passed to ``helper.input_name`` and stored unchanged.
    """
    path = '/Users/connordog/Dropbox/Desktop_Files/Work_Things/CodeBase/Python_Scripts/Python_Projects/NSBL/ad_hoc/historical_csv_files/'
    csv_file_ext = '%s_zips_%s.csv' % (yr, _type)
    csv_file = path + csv_file_ext
    print("%s %s %s" % (yr, _type, csv_file_ext))

    def resolve_name(raw_name):
        # Use the mapped name when there is one, else register the raw name.
        if raw_name in player_mapper:
            return player_mapper.get(raw_name)
        helper.input_name(raw_name)
        return raw_name

    loaded = []
    with codecs.open(csv_file, 'rb', encoding='utf-8', errors='ignore') as f:
        for line_no, row in enumerate(csv.reader(f)):
            # The first line of the file is the column header.
            if line_no == 0:
                continue

            if _type == 'offense':
                (_csv_year, player_name, team_abb, age, bats, g, ab, r, h,
                 _2b, _3b, hr, rbi, bb, so, hbp, sb, cs, sh, sf, ibb,
                 war) = row
                player_name = resolve_name(player_name)
                loaded.append({
                    "year": yr, "player_name": player_name,
                    "team_abb": team_abb, "age": age, "bats": bats,
                    "g": g, "ab": ab, "r": r, "h": h,
                    "2b": _2b, "3b": _3b, "hr": hr, "rbi": rbi,
                    "bb": bb, "so": so, "hbp": hbp, "sb": sb, "cs": cs,
                    "sh": sh, "sf": sf, "ibb": ibb, "zWAR": war,
                })

            elif _type == 'pitching':
                (_csv_year, player_name, team_abb, age, throws, w, l, era,
                 g, gs, ip, h, r, er, hr, bb, so, war) = row
                player_name = resolve_name(player_name)
                loaded.append({
                    "year": yr, "player_name": player_name,
                    "team_abb": team_abb, "age": age, "throws": throws,
                    "w": w, "l": l, "era": era, "g": g, "gs": gs,
                    "ip": ip, "h": h, "r": r, "er": er, "hr": hr,
                    "bb": bb, "so": so, "zWAR": war,
                })

            elif _type == 'defense':
                # The final CSV column is unused.
                (_csv_year, player_name, c_rn, c_er, _1b_rn, _1b_er,
                 _2b_rn, _2b_er, _3b_rn, _3b_er, ss_rn, ss_er,
                 lf_rn, lf_er, cf_rn, cf_er, rf_rn, rf_er,
                 c_arm, of_arm, pb, _unused) = row
                player_name = resolve_name(player_name)
                loaded.append({
                    "year": yr, "player_name": player_name,
                    "c_range": c_rn, "c_error": c_er,
                    "1b_range": _1b_rn, "1b_error": _1b_er,
                    "2b_range": _2b_rn, "2b_error": _2b_er,
                    "3b_range": _3b_rn, "3b_error": _3b_er,
                    "ss_range": ss_rn, "ss_error": ss_er,
                    "lf_range": lf_rn, "lf_error": lf_er,
                    "cf_range": cf_rn, "cf_error": cf_er,
                    "rf_range": rf_rn, "rf_error": rf_er,
                    "c_arm": c_arm, "of_arm": of_arm, "c_pb": pb,
                })

    if loaded:
        db.insertRowDict(loaded, 'zips_%s' % (_type),
                         replace=True, insertMany=True, rid=0)
    db.conn.commit()
def process_top_seed(year):
    """Estimate each team's probability of being its league's top seed.

    For both strength types and each league ('AL', 'NL'), reads the latest
    ``__playoff_probabilities`` rows, multiplies the probabilities returned
    by ``get_probabilities`` against the team's conference opponents, and
    hands the results (capped by the team's ``win_division`` value) to
    ``adjust_probabilities`` for the 'top_seed' column.
    """
    print("top seed")

    latest_rows_sql = (
        "SELECT team_abb, team_name, win_division, mean_W/162.0, var, year, "
        "games_played FROM __playoff_probabilities "
        "JOIN (SELECT team_abb, MAX(year) AS year, "
        "MAX(games_played) AS games_played "
        "FROM __playoff_probabilities GROUP BY team_abb, year) t2 "
        "USING (team_abb, year, games_played) "
        "WHERE strength_type = '%s' AND LEFT(division,2) = '%s' "
        "AND year = %s;"
    )

    for _type in ('roster', 'projected'):
        for conf in ('AL', 'NL'):
            top_dict = {}
            for team_row in db.query(latest_rows_sql % (_type, conf, year)):
                # NB: ``year`` is rebound from the row, as before.
                (team_abb, team_name, max_prob, strength_pct, var,
                 year, games_played) = team_row
                max_prob = float(max_prob)

                strength_pct = float(strength_pct)
                if games_played > 162:
                    # Rescale when more than 162 games have been played.
                    strength_pct = float(
                        (strength_pct * 162.0) / float(games_played))

                (division, div_teams,
                 conf_teams, non_conf_teams) = helper.get_division(
                     team_name, year)

                opponent_probs = get_probabilities(
                    team_name, conf_teams, strength_pct, games_played,
                    float(var), _type, year)[0]
                top_dict[team_name] = [
                    np.prod(opponent_probs), max_prob, False, year,
                    games_played]

            adjust_probabilities(top_dict, 'top_seed', 1.0, _type)
def process_urls(urls, year):
    """Scrape each team's ZiPS projection page into the zips_fangraphs_* tables.

    ``urls`` is a sequence of ``{team_abb: url}`` dicts.  Teams that already
    have rows in ``zips_fangraphs_batters_counting`` for ``year`` are
    skipped.  On each page, tables with at least 10 header cells are taken
    in order as batters-counting, batters-rate, pitchers-counting and
    pitchers-rate.
    """
    # The n-th qualifying table on a page goes to the n-th destination.
    destinations = {
        1: "zips_fangraphs_batters_counting",
        2: "zips_fangraphs_batters_rate",
        3: "zips_fangraphs_pitchers_counting",
        4: "zips_fangraphs_pitchers_rate",
    }
    # Applied in order to turn header text into usable column names.
    header_rewrites = (
        ('/', '_'),
        ('+', '_Plus'),
        ('-', '_Minus'),
        ('No. 1 Comp', 'Top_Comp'),
        ('%', '_Pct'),
    )
    loaded_teams_sql = "SELECT DISTINCT team_abb FROM zips_fangraphs_batters_counting WHERE year = %s"

    print(year)
    for teamcnt, team_pair in enumerate(urls):
        for tm, url in team_pair.items():
            print('\t%s %s - %s' % (str(teamcnt + 1), tm, url))

            # Re-queried per team so rows inserted during this run count.
            loaded_teams = [t[0] for t in db.query(loaded_teams_sql % (year))]
            if tm in loaded_teams:
                continue

            sleep(sleep_time)
            page = requests.get(url)
            team_soup = BeautifulSoup(page.content, "lxml")

            postmeta_divs = team_soup.find(class_="postmeta").findAll("div")
            post_date = parse(postmeta_divs[-1].getText()).strftime("%Y-%m-%d")

            tables = team_soup.findAll(
                "table",
                {"class": ["sortable", "sortable table-equal-width",
                           "table-equal-width"]})
            print(len(tables))
            if len(tables) == 0:
                # Fallback for pages whose tables lack those classes.
                tables = team_soup.findAll("table")[11:]
                print(len(tables))

            table_no = 0
            for table in tables:
                cats = []
                for cell in table.find("tr").findAll():
                    label = cell.getText()
                    for old, new in header_rewrites:
                        label = label.replace(old, new)
                    cats.append(label)

                if len(cats) < 10:
                    continue
                table_no += 1

                # Beyond the fourth table the previous destination is reused.
                if table_no in destinations:
                    db_table = destinations[table_no]
                print('\t\t' + db_table)

                entries = []
                for r in table.findAll("tr"):
                    if r.get("class") is not None:
                        continue
                    atts = r.findAll("td")
                    if atts == []:
                        continue

                    entry = {"year": year, "team_abb": tm,
                             "post_date": post_date}
                    for k, att in enumerate(atts):
                        # Keep only ASCII characters of the cell text.
                        entry[cats[k]] = "".join(
                            ch for ch in att.getText() if ord(ch) < 128)

                    if entry["Player"] != "":
                        helper.input_name(entry.get('Player'))
                        entries.append(entry)

                for start in range(0, len(entries), 1000):
                    db.insertRowDict(entries[start:start + 1000], db_table,
                                     insertMany=True, replace=True, rid=0,
                                     debug=1)
                    db.conn.commit()
def offensive_war(year):
    """Compute park-adjusted offensive metrics and oWAR for a season.

    Joins the three ``register_batting_*`` tables for ``year``, runs every
    line through ``helper.get_offensive_metrics`` and replaces the results
    in ``processed_compWAR_offensive``.
    """
    batting_sql = (
        "SELECT player_name, team_abb, position, age, pa, ab, "
        "(h-2b-3b-hr) as 1b, 2b, 3b, hr, r, rbi, bb, k, hbp, sb, cs, "
        "ops, babip FROM register_batting_primary "
        "JOIN register_batting_secondary "
        "USING (year, player_name, team_abb, position, age) "
        "JOIN register_batting_analytical "
        "USING (year, player_name, team_abb, position, age) "
        "WHERE year = %s; "
    ) % (year)

    war_rows = []
    for (player_name, team_abb, position, age, pa, ab, _1b, _2b, _3b, hr,
         r, rbi, bb, k, hbp, sb, cs, ops, babip) in db.query(batting_sql):
        # The stored team_abb keeps its original case; only the park-factor
        # lookup uses the upper-cased form.
        pf = float(helper.get_park_factors(team_abb.upper(), year)) / float(100)

        (_unused, wOBA, park_wOBA, OPS_plus, wrc, wrc27, wRC_plus,
         raa, oWAR) = helper.get_offensive_metrics(
             year, pf, pa, ab, bb, hbp, _1b, _2b, _3b, hr, sb, cs)

        war_rows.append({
            'year': year,
            'player_name': player_name,
            'team_abb': team_abb,
            'position': position,
            'age': age,
            'pa': pa,
            'pf': pf,
            'ops': ops,
            'babip': babip,
            'wOBA': wOBA,
            'park_wOBA': park_wOBA,
            'OPS_plus': OPS_plus,
            'wrc': wrc,
            'wRC_27': wrc27,
            'wRC_plus': wRC_plus,
            'raa': raa,
            'oWAR': oWAR,
        })

    if war_rows:
        db.insertRowDict(war_rows, 'processed_compWAR_offensive',
                         replace=True, insertMany=True, rid=0)
    db.conn.commit()
def calculate_war():
    """Derive rate stats, FIP and WAR figures for every ZiPS pitching line.

    Reads all rows of ``zips_pitching``, computes K/9, BB/9, HR/9, K/BB and
    FIP, obtains park-adjusted FIP/ERA values and WAR from
    ``helper.get_zips_pitching_metrics`` (using the previous year's
    constants), and replaces the rows in ``zips_WAR_pitchers`` in batches
    of 1000.
    """
    pitching_rows = db.query(
        "SELECT year, player_name, team_abb, age, g, gs, era, ip, h, r, er, "
        "bb, so, hr FROM zips_pitching "
    )

    # Sentinel used for a rate whose denominator is zero.
    undefined_rate = 99.0

    pitchers = []
    for (year, player_name, team_abb, age, g, gs, era, ip, h, r, er,
         bb, k, hr) in pitching_rows:
        print("%s %s" % (year, player_name))
        team_abb = team_abb.upper()
        pf = float(helper.get_park_factors(team_abb, year - 1)) / float(100)

        if ip == 0:
            k_9 = 0.0
            bb_9 = undefined_rate if bb > 0 else 0.0
            k_bb = undefined_rate if bb > 0 else 0.0
            hr_9 = undefined_rate if hr > 0 else 0.0
        else:
            innings = float(ip)
            k_9 = (float(k) / innings) * 9
            bb_9 = (float(bb) / innings) * 9
            hr_9 = (float(hr) / innings) * 9
            if bb != 0:
                k_bb = float(k) / float(bb)
            elif k > 0:
                k_bb = undefined_rate
            else:
                k_bb = 0.0

        fip_const = float(
            helper.get_zips_average_pitchers(year - 1, 'fip_const'))
        # NOTE(review): unlike the rate stats above, this division is not
        # guarded for ip == 0 -- confirm such rows cannot occur.
        FIP = ((13 * float(hr)) + (3 * float(bb)) - (2 * float(k))) / float(ip) + fip_const
        park_FIP, FIP_min, FIP_WAR = helper.get_zips_pitching_metrics(
            FIP, ip, year - 1, pf, g, gs, 'fip')

        park_ERA, ERA_min, ERA_WAR = helper.get_zips_pitching_metrics(
            float(era), ip, year - 1, pf, g, gs, 'era')

        pitchers.append({
            'year': year,
            'player_name': player_name,
            'team_abb': team_abb,
            'age': age,
            'pf': pf,
            'ip': ip,
            'k_9': k_9,
            'bb_9': bb_9,
            'k_bb': k_bb,
            'hr_9': hr_9,
            'FIP': FIP,
            'park_FIP': park_FIP,
            'FIP_minus': FIP_min,
            'FIP_WAR': FIP_WAR,
            # The raw ERA value is stored, not the float conversion.
            'ERA': era,
            'park_ERA': park_ERA,
            'ERA_minus': ERA_min,
            'ERA_WAR': ERA_WAR,
        })

    # An empty list yields an empty range, so nothing is written.
    for start in range(0, len(pitchers), 1000):
        db.insertRowDict(pitchers[start:start + 1000], 'zips_WAR_pitchers',
                         insertMany=True, replace=True, rid=0, debug=1)
        db.conn.commit()
def get_optimal_lineups(year, season_gp):
    """Build one ``__team_strength`` row per team from the optimal rosters.

    Combines each team's optimal pitching and lineup values (lineups
    weighted 25% vs. left-handers, 75% vs. right-handers) into a roster WAR,
    converts it to a roster win percentage, blends that with the team's
    current and pythagorean records to project the rest of the season, and
    writes the result (with variance estimates) to ``__team_strength``.

    Args:
        year: season used for team-name lookups and standing metrics.
        season_gp: stored unchanged in the ``season_gp`` column.
    """
    # Per-team values and variances; only lineups with a DH are considered.
    optimal_query = """SELECT team_abb, starter_val, bullpen_val, l.lineup_val AS lineup_vsL, r.lineup_val AS lineup_vsR, total_val + 0.25*(l.lineup_val) + 0.75*(r.lineup_val) AS roster_WAR, starter_var, bullpen_var, l.lineup_var AS vsL_var, r.lineup_var AS vsR_var, total_var + 0.25*l.lineup_var + 0.75*r.lineup_var AS roster_var FROM __optimal_pitching p JOIN __optimal_lineups l USING (team_abb) JOIN __optimal_lineups r USING (team_abb) WHERE l.vs_hand = 'l' AND r.vs_hand = 'r' AND l.dh_name IS NOT NULL AND r.dh_name IS NOT NULL ORDER BY team_abb ASC;"""
    # The same roster WAR expression summed over all teams.
    total_roster_war_query = """SELECT SUM(p.total_val + 0.25*(l.lineup_val) + 0.75*(r.lineup_val)) AS roster_WAR FROM __optimal_pitching p JOIN __optimal_lineups l USING (team_abb) JOIN __optimal_lineups r USING (team_abb) WHERE l.vs_hand = 'l' AND r.vs_hand = 'r' AND l.dh_name IS NOT NULL AND r.dh_name IS NOT NULL;"""
    # should be around ~1000
    total_roster_war = db.query(total_roster_war_query)[0][0]
    # Wins left for a replacement-level team once all roster WAR is removed
    # from 2430 total wins, split over 30 teams (presumably 30 teams x 162
    # games / 2 -- confirm).
    # should be around 48 (48-114 replacement level?)
    replacement_team_wins = (2430-float(total_roster_war))/30
    # should be around .300
    rep_team_win_pct = float(replacement_team_wins)/162
    optimal_res = db.query(optimal_query)
    for row in optimal_res:
        entry = {}
        team_abb, starter_val, bullpen_val, lu_vsL, lu_vsR, roster_WAR, starter_var, bullpen_var, vsL_var, vsR_var, roster_var = row
        # Map the abbreviation to the name used by the standings, then
        # re-derive team_abb from the standings' team name.
        mascot_name = helper.get_mascot_names(team_abb.upper(), year)
        team_name, games_played, rep_WAR, oWAR, dWAR, FIP_WAR, W, L, py_W, py_L = get_standing_metrics(year, mascot_name)
        team_abb = helper.get_team_abb(team_name, year)
        # (A disabled variant of the three lines above looked the names up
        # with year-1 instead of year.)
        games_played = float(games_played)
        # Roster wins = roster WAR on top of a replacement-level team, over
        # 162 games (or over games_played once that exceeds 162).
        if games_played > 162.0:
            roster_W = float(roster_WAR) + rep_team_win_pct*games_played
            roster_pct = roster_W/games_played
            ros_g = 0
        else:
            roster_W = float(roster_WAR) + rep_team_win_pct*162
            roster_pct = roster_W/162.0
            # Games remaining in the season.
            ros_g = 162-games_played
        try:
            w_pct = float(W)/float(W+L)
            py_pct = float(py_W)/float(py_W+py_L)
        except ZeroDivisionError:
            # No games (or no pythagorean record) yet: assume a .500 team.
            w_pct = 0.5
            py_pct = 0.5
        # Logistic weights for pythag% and win%:
        #   rest-of-season projected win% =
        #       (1-2w)*(roster%) + w*(pythag%) + w*(win%)
        #   where w = 0.25 / (1 + 20e^(-0.035*games_played)),
        # except that for the first 10 games w grows linearly instead.
        if games_played <= 10:
            current_weight = 0.0015*float(games_played)
        else:
            current_weight = 0.25 / (1 + 20*math.exp(-0.035*float(games_played)))
        # The observed percentages are floored at 0.25 before blending.
        ros_pct = (1-2*current_weight)*roster_pct + (current_weight)*max(py_pct, 0.25) + (current_weight)*max(w_pct, 0.25)
        ros_W = ros_pct*ros_g
        # Total team variance starts from the variance of the team
        # projection (built from each individual player's projection).
        total_roster_var = float(roster_var)
        # Then add a measure of variance based on the difference between
        # true-talent (pythag) record and observed record (see
        # /variance_research/Full Season Pythag Standings std research.png).
        total_roster_var += -0.0055021865*(ros_pct*162) + 3.4706743014
        # Finally add 5.0 wins to the STANDARD DEVIATION (not the variance).
        # Solving {std = sqrt(v), std + 5.0 = sqrt(v + c)} for c gives
        # c = 10*sqrt(v) + 25 (https://tinyurl.com/y8tk64ez).
        # NB: the 5.0 is a guess (~0.33 win per starter plus a little for
        # bench players and relievers) reflecting uncertainty, mostly from
        # defense, in the conversion from raw ZiPS to DMB WAR; it is hack-y
        # and should be cleaned up, or at least weighted more towards
        # defensive numbers than wOBA numbers.
        total_roster_var += 10*math.sqrt(total_roster_var) + 25
        projected_W = W + ros_W
        if games_played > 162.0:
            roster_L = games_played - roster_W
            projected_L = games_played - projected_W
            projected_pct = projected_W/games_played
        else:
            roster_L = 162.0 - roster_W
            projected_L = 162.0 - projected_W
            projected_pct = projected_W/162.0
        entry['team_abb'] = team_abb
        entry['team_name'] = team_name
        entry['year'] = year
        entry['season_gp'] = season_gp
        entry['games_played'] = games_played
        entry['starter_val'] = starter_val
        entry['bullpen_val'] = bullpen_val
        entry['vsR_val'] = lu_vsR
        entry['vsL_val'] = lu_vsL
        entry['roster_strength'] = roster_WAR
        entry['starter_var'] = starter_var
        entry['bullpen_var'] = bullpen_var
        entry['vsR_var'] = vsR_var
        entry['vsL_var'] = vsL_var
        entry['roster_var'] = roster_var
        entry['overall_var'] = total_roster_var
        entry['roster_W'] = roster_W
        entry['roster_L'] = roster_L
        entry['roster_pct'] = roster_pct
        entry['current_W'] = W
        entry['current_L'] = L
        entry['current_pct'] = w_pct
        entry['ros_W'] = ros_W
        entry['ros_L'] = ros_g - ros_W
        entry['ros_pct'] = ros_pct
        entry['projected_W'] = projected_W
        entry['projected_L'] = projected_L
        entry['projected_pct'] = projected_pct
        # Each team row is written and committed individually.
        db.insertRowDict(entry, '__team_strength', insertMany=False, replace=True, rid=0,debug=1)
        db.conn.commit()
def parse_prospect(rnk, year, prospect, team):
    """Convert one MLB prospect record into a flat row dict.

    Also stores the prospect's scouting grades (via ``process_grades``) in
    ``mlb_grades_hitters`` / ``mlb_grades_pitchers`` as a side effect.

    Args:
        rnk: the prospect's rank on the list.
        year: list year.
        prospect: dict for one prospect; must contain a ``player`` dict.
        team: list identifier; 'draft' and 'international' are kept as the
            prospect type, anything else is treated as 'professional'.

    Returns:
        dict with identity, birth date, physical, team, ETA, FV and blurb
        fields.  Birth fields are None when the birth date is missing or
        malformed.
    """
    prospect_type = team if team in ("draft", "international") else "professional"
    entry = {}

    def print_prospect_details(prospect):
        # Debug helper: dump the (possibly nested) prospect dict.
        def print_dict(k, v, lvl):
            indent = "\t" * (lvl - 1)
            if type(v) is dict:
                print("%s%s" % (indent, k))
                # Fix: recurse into ``v``; this used to iterate an
                # undefined name and raised NameError on nested dicts.
                for y, z in v.items():
                    print_dict(y, z, lvl + 1)
            else:
                # Keys are truncated/padded to exactly 20 characters.
                label = str(k)[:20].ljust(20)
                shown = "SOME LIST" if type(v) is list else v
                print("%s%s \t%s" % (indent, label, shown))

        for a, b in prospect.items():
            print_dict(a, b, 1)

    def process_grades(year, grades_id, grades, player_type, prospect_type):
        # Store one set of tool grades and return the 'overall' grade (FV),
        # or 0 when there is none.  ``player_type`` is currently unused.
        grade_entry = {"year": year, "grades_id": grades_id,
                       "prospect_type": prospect_type}
        # Grade names stored under their own column name.
        direct = ("fastball", "change", "curve", "slider", "cutter",
                  "splitter", "control", "hit", "power", "run", "arm",
                  "field")
        # Alternative grade names mapped onto the canonical column.
        aliases = {"speed": "run", "throw": "arm", "defense": "field"}

        fv = 0
        for g in grades:
            key = g.get("key")
            if key is None:
                continue
            key = key.lower().strip()
            value = g.get("value")
            if key == "overall":
                fv = value
            elif key in aliases:
                grade_entry[aliases[key]] = value
            elif key in direct:
                grade_entry[key] = value
            else:
                grade_entry["other"] = value

        if "hit" in grade_entry or "field" in grade_entry:
            grades_table = "mlb_grades_hitters"
        elif "control" in grade_entry or "fastball" in grade_entry:
            grades_table = "mlb_grades_pitchers"
        else:
            # Cannot tell hitter from pitcher: show the grades, store nothing.
            print("\n\n\n%s \n\n\n" % (grades,))
            return fv

        db.insertRowDict(grade_entry, grades_table, insertMany=False,
                         replace=True, rid=0, debug=1)
        db.conn.commit()
        return fv

    # print_prospect_details(prospect)

    player = prospect.get("player")
    mlb_id = player.get("id")
    fname = player.get("useName")
    lname = player.get("lastName")
    helper2.input_name(fname + " " + lname)
    fname, lname = helper.adjust_mlb_names(mlb_id, fname, lname)
    position = helper.adjust_mlb_positions(
        mlb_id, player.get("positionAbbreviation"))

    entry["year"] = year
    entry["rank"] = rnk
    entry["mlb_id"] = mlb_id
    entry["fname"] = fname
    entry["lname"] = lname
    entry["position"] = position

    # Fix: a missing (None) or short birth date used to either escape the
    # handler (AttributeError) or leave byear/bmonth/bday unbound, which
    # crashed with NameError below.  Fall back to None for all three.
    # NOTE(review): assumes helper.add_prospect tolerates None -- confirm.
    dob = player.get("birthDate")
    try:
        byear, bmonth, bday = dob.split("-")[:3]
    except (AttributeError, ValueError):
        byear = bmonth = bday = None
        print("\n\nNO BIRTHDAY %s %s %s \n\n" % (fname, lname, mlb_id))

    prospect_id = helper.add_prospect(mlb_id, fname, lname, byear, bmonth,
                                      bday, p_type=prospect_type)
    # Grades are keyed on the internal prospect id when one exists,
    # otherwise on the MLB id.
    if prospect_id == 0 or prospect_id is None:
        grades_id = mlb_id
    else:
        grades_id = prospect_id

    entry["birth_year"] = byear
    entry["birth_month"] = bmonth
    entry["birth_day"] = bday
    entry["prospect_id"] = prospect_id
    entry["grades_id"] = grades_id

    try:
        # Height arrives as feet'inches" and is stored in inches.
        height = player.get("height").replace("\"", "").split("'")
        height = int(height[0]) * 12 + int(height[1])
    except (IndexError, ValueError, AttributeError):
        height = None

    entry["bats"] = player.get("batSideCode")
    entry["throws"] = player.get("pitchHandCode")
    entry["weight"] = player.get("weight")
    entry["height"] = height

    try:
        team = player.get("currentTeam").get("parentOrgName")
    except AttributeError:
        team = None
    entry["team"] = team

    entry["college_commit"] = prospect.get("prospectSchoolCommitted")
    entry["eta"] = prospect.get("eta")

    hit_fv = None
    pitch_fv = None
    hit_grades = prospect.get("gradesHitting")
    if hit_grades is not None and hit_grades != []:
        hit_fv = process_grades(year, grades_id, hit_grades, "hit",
                                prospect_type)
    pitch_grades = prospect.get("gradesPitching")
    if pitch_grades is not None and pitch_grades != []:
        pitch_fv = process_grades(year, grades_id, pitch_grades, "pitch",
                                  prospect_type)

    # Highest available FV; None when neither grade set exists.  Spelled
    # out so it does not rely on ordering None against numbers.
    known_fvs = [v for v in (hit_fv, pitch_fv) if v is not None]
    entry["FV"] = max(known_fvs) if known_fvs else None

    # Markup stripped from blurb text, applied in this order.
    markup = (("<b>", ""), ("</b>", ""), ("<br />", ""), ("<p>", "\n"),
              ("</p>", ""), ("*", ""), ("<strong>", ""), ("</strong>", ""))

    # Fix: a missing prospectBio is treated as "no blurbs" instead of
    # crashing in sorted().
    blurbs = prospect.get("prospectBio") or []
    cleaned_blurbs = []
    for b in sorted(blurbs, key=lambda k: k["contentTitle"], reverse=True):
        text = b.get("contentText")
        if text is None:
            continue
        for old, new in markup:
            text = text.replace(old, new)
        blurbtext = str(b.get("contentTitle")) + text
        # Drop any non-ASCII characters.
        blurbtext = "".join([ch for ch in blurbtext if ord(ch) < 128])
        cleaned_blurbs.append(blurbtext)
    entry["blurb"] = "\n\n".join(cleaned_blurbs)

    return entry
def current_series(year, timestamp):
    """Store win probabilities for every series in the latest playoff bracket.

    For each row of the most recent ``__in_playoff_bracket`` update, computes
    the probability that ``team`` wins the series in exactly N games (the
    ``team_inN`` columns) and overall (``team_seriesProb``), then replaces
    the row stamped with ``timestamp``.
    """
    print('\tdetermining current series probabilities')

    games_sql = "SELECT IFNULL(SUM(IF(winning_team IS NOT NULL,1,0)),0) FROM __in_playoff_game_results WHERE year = %s;" % (year)
    total_playoff_games_played = db.query(games_sql)[0][0]

    bracket_sql = (
        "SELECT series_id, year, strength_type, team, opponent, "
        "series_wins, series_losses FROM __in_playoff_bracket "
        "WHERE update_time = (SELECT MAX(update_time) "
        "FROM __in_playoff_bracket) AND year = %s;"
    ) % (year)

    # Maximum number of games per series type.
    series_lengths = {'WC': 1, 'DS': 5, 'CS': 7, 'WS': 7}

    for row in db.query(bracket_sql):
        # NB: ``year`` is rebound from the row, as before.
        (series_id, year, strength_type, team, opponent,
         series_wins, series_losses) = row

        # The series type is the first two characters after the league
        # prefix has been removed.
        series_type = series_id.replace('AL', '').replace('NL', '')[:2]
        series_games = series_lengths.get(series_type)

        team_abb = helper.get_team_abb(team, year)
        oppn_abb = helper.get_team_abb(opponent, year)
        team_winProb = get_single_game_win_prob(team_abb, oppn_abb,
                                                strength_type, year)

        entry = {
            'update_time': timestamp,
            'series_id': series_id,
            'year': year,
            'team': team,
            'opponent': opponent,
            'series_wins': series_wins,
            'series_losses': series_losses,
            'strength_type': strength_type,
            'team_winProb': team_winProb,
            'total_playoff_games_played': total_playoff_games_played,
        }

        # Wins needed to take the series (integer division).
        clinch = series_games // 2 + 1
        team_won = series_wins == clinch
        team_lost = series_losses == clinch

        outcome_probs = []
        if team_won:
            outcome_probs.append(1)
            games_so_far = series_wins + series_losses
            if games_so_far > 2:
                entry['team_in' + str(games_so_far)] = 1
        if team_lost:
            outcome_probs.append(0)

        if not team_won and not team_lost:
            for end_game in range(clinch, series_games + 1 - series_losses):
                # Probability of reaching clinch-1 wins in the preceding
                # games, times the probability of winning the last one.
                team_in_N = BinomDist.pmf(
                    n=end_game - 1 - series_wins,
                    k=(series_games // 2 - series_wins),
                    p=team_winProb) * team_winProb
                outcome_probs.append(team_in_N)
                if end_game > 2:
                    entry['team_in' + str(end_game + series_losses)] = team_in_N

        entry['team_seriesProb'] = sum(outcome_probs)
        db.insertRowDict(entry, '__in_playoff_bracket', insertMany=False,
                         replace=True, rid=0, debug=1)
        db.conn.commit()
def process_wc2(year):
    """Estimate each team's probability of taking the second wild card.

    For both strength types and each league, every team's wc_2 probability
    is accumulated over the combinations returned by the "division winners"
    query, scaled by the chance the team has not already qualified
    (1 - (win_division + wc_1)), and passed to ``adjust_probabilities`` for
    the 'wc_2' column.
    """
    print "wc2"
    for _type in ('roster', 'projected'):
        print '\t', _type
        for conf in ('AL', 'NL'):
            # Latest row per team in this league; the third column is the
            # team's combined win_division + wc_1 probability.
            team_query = "SELECT team_abb, team_name, (win_division+wc_1), mean_W/162.0, var, year, games_played FROM __playoff_probabilities JOIN (SELECT team_abb, MAX(year) AS year, MAX(games_played) AS games_played FROM __playoff_probabilities GROUP BY team_abb, year) t2 USING (team_abb, year, games_played) WHERE strength_type = '%s' AND LEFT(division,2) = '%s'AND year = %s" % ( _type, conf, year)
            team_res = db.query(team_query)
            wc2_dict = {}
            for team_row in team_res:
                # NB: this rebinds the ``year`` parameter from the row.
                team_abb, team_name, po_prob, strength_pct, var, year, games_played = team_row
                print '\t\t', team_name
                # Rescale the strength when more than 162 games are played.
                if games_played > 162:
                    strength_pct = float( (float(strength_pct) * 162.0) / float(games_played))
                else:
                    strength_pct = float(strength_pct)
                division, div_teams, conf_teams, non_conf_teams = helper.get_division( team_name, year)
                # Every combination of one West (p1), one Central (p2) and
                # one East (p3) team plus a fourth, different team from the
                # same league (p4), none of them this team, together with
                # the product of their (win_division + wc_1) values.
                div_winners_qry = """SELECT p1.team_name, p2.team_name, p3.team_name, p4.team_name, (p1.win_division+p1.wc_1)*(p2.win_division+p2.wc_1)*(p3.win_division+p3.wc_1)*(p4.win_division+p4.wc_1) FROM __playoff_probabilities p1 JOIN __playoff_probabilities p2 JOIN __playoff_probabilities p3 JOIN __playoff_probabilities p4 JOIN (SELECT team_abb, MAX(year) AS year, MAX(games_played) AS games_played FROM __playoff_probabilities GROUP BY team_abb, year) t1 ON (p1.team_abb=t1.team_abb AND p1.year=t1.year AND p1.games_played=t1.games_played) JOIN (SELECT team_abb, MAX(YEAR) AS YEAR, MAX(games_played) AS games_played FROM __playoff_probabilities GROUP BY team_abb, year) t2 ON (p2.team_abb=t2.team_abb AND p2.year=t2.year AND p2.games_played=t2.games_played) JOIN (SELECT team_abb, MAX(YEAR) AS YEAR, MAX(games_played) AS games_played FROM __playoff_probabilities GROUP BY team_abb, year) t3 ON (p3.team_abb=t3.team_abb AND p3.year=t3.year AND p3.games_played=t3.games_played) JOIN (SELECT team_abb, MAX(YEAR) AS YEAR, MAX(games_played) AS games_played FROM __playoff_probabilities GROUP BY team_abb, year) t4 ON (p4.team_abb=t4.team_abb AND p4.year=t4.year AND p4.games_played=t4.games_played) WHERE 1 AND p1.strength_type = '%s' AND p2.strength_type = '%s' AND p3.strength_type = '%s' AND p4.strength_type = '%s' AND p1.division = '%s West' AND p2.division = '%s Central' AND p3.division = '%s East' AND LEFT(p4.division,2) = '%s' AND p1.team_name != '%s' AND p2.team_name != '%s' AND p3.team_name != '%s' AND p4.team_name != '%s' AND p1.team_name != p4.team_name AND p2.team_name != p4.team_name AND p3.team_name != p4.team_name AND p1.year = %s AND p2.year = %s AND p3.year = %s AND p4.year = %s;"""
                div_winners_query = div_winners_qry % ( _type, _type, _type, _type, conf, conf, conf, conf, team_name, team_name, team_name, team_name, year, year, year, year)
                div_winners_res = db.query(div_winners_query)
                wc2_pre_prob = float(0.0)
                for div_row in div_winners_res:
                    div1_team, div2_team, div3_team, div4_team, situation_prob = div_row
                    # Remaining competitors: conference teams not among the
                    # four teams of this combination.
                    set_teams = []
                    for tm in conf_teams:
                        if tm not in (div1_team, div2_team, div3_team, div4_team):
                            set_teams.append(tm)
                    win_wc2_prob = np.prod( get_probabilities(team_name, set_teams, strength_pct, games_played, float(var), _type, year)[0])
                    # Weight this combination by its own probability.
                    wc2_pre_prob += (float(situation_prob) * float(win_wc2_prob))
                # Scale by the chance the team has not already qualified
                # through its division or the first wild card.
                wc2_pre_prob = wc2_pre_prob * (1.0 - float(po_prob))
                wc2_dict[team_name] = [ wc2_pre_prob, (1.0 - float(po_prob)), False, year, games_played ]
            col_name = 'wc_2'
            adjust_probabilities(wc2_dict, col_name, 1.0, _type)
def register_war(year):
    """Compute defensive WAR components for the register batters of ``year``.

    Reads the joined register batting tables for ``year`` and writes one
    row per player to ``processed_compWAR_defensive`` (replace semantics).
    Before 2011 the defensive value is fixed at 0.0; from 2011 on it comes
    from ``helper.get_def_values``, scaled by plate appearances over 600.
    """
    roster_sql = (
        "SELECT player_name, team_abb, position, age, pa "
        "FROM register_batting_primary "
        "JOIN register_batting_secondary USING (year, player_name, team_abb, position, age) "
        "JOIN register_batting_analytical USING (year, player_name, team_abb, position, age) "
        "WHERE year = %s; "
    ) % (year)
    roster_rows = db.query(roster_sql)

    war_rows = []
    for player_name, team_abb, position, age, pa in roster_rows:
        pa = float(pa)

        # bats = helper.get_hand(player_name)
        # Names may end in a '*' or '#' marker; strip it for the lookup.
        if player_name[-1] in ('*', '#'):
            s_name = player_name[:-1]
        else:
            s_name = player_name

        if year < 2011:
            defense = 0.0
        else:
            # changes Travis d'Arnoud to Travis d''Arnoud
            search_name = s_name.replace("'", "''")
            rn_val, err_val, arm_val, pb_val = helper.get_def_values(
                search_name, position, year)
            # 600 pa is a full season
            defense = pa * (rn_val + err_val + arm_val + pb_val) / 600

        adj = float(helper.get_pos_adj(position.upper()))
        position_adj = adj * (pa / 600)

        war_rows.append({
            'year': year,
            'player_name': player_name,
            'team_abb': team_abb,
            'position': position,
            'age': age,
            'pa': pa,
            'inn': None,
            'defense': defense,
            'position_adj': position_adj,
            'dWAR': (defense + position_adj) / 10.0,
        })

    if war_rows:
        db.insertRowDict(war_rows, 'processed_compWAR_defensive',
                         replace=True, insertMany=True, rid=0)
    db.conn.commit()
def batters(year): player_q = """SELECT a.year , IFNULL(CONCAT(nm.right_fname, ' ', nm.right_lname), a.Player) AS player , a.team_abb , a.age , a.B as hand , a.PO , COALESCE(a.PA, c.PA) AS pa , a.ab , a.h , a.2b , a.3b , a.hr , a.bb , a.so , a.sb , a.cs , BA , OBP , SLG , BABIP , OPS_Plus , DEF , c.WAR , cv.yr1_WAR , cv.yr1_value , cv.yr2_WAR , cv.yr2_value , cv.yr3_WAR , cv.yr3_value , cv.yr4_WAR , cv.yr4_value , cv.yr5_WAR , cv.yr5_value , cv.yr6_WAR , cv.yr6_value , cv.yr7_WAR , cv.yr7_value , cv.yr8_WAR , cv.yr8_value FROM zips_fangraphs_batters_counting a JOIN( SELECT year , Player , MAX(post_date) AS post_date FROM zips_fangraphs_batters_counting WHERE 1 AND year = %s GROUP BY year, Player ) b USING (year,Player,post_date) LEFT JOIN zips_fangraphs_batters_rate c USING (year, Player, team_abb) LEFT JOIN name_mapper nm ON (1 AND a.Player = nm.wrong_name AND (nm.start_year IS NULL OR nm.start_year <= a.year) AND (nm.end_year IS NULL OR nm.end_year >= a.year) AND (nm.position = '' OR nm.position = a.PO) AND (nm.rl_team = '' OR nm.rl_team = a.team_abb) # AND (nm.nsbl_team = '' OR nm.nsbl_team = rbp.team_abb) ) LEFT JOIN name_mapper nm2 ON (nm.right_fname = nm2.right_fname AND nm.right_lname = nm2.right_lname AND (nm.start_year IS NULL OR nm.start_year = nm2.start_year) AND (nm.end_year IS NULL OR nm.end_year = nm2.end_year) AND (nm.position = '' OR nm.position = nm2.position) AND (nm.rl_team = '' OR nm.rl_team = nm2.rl_team) ) JOIN zips_FA_contract_value_batters cv ON (a.year = cv.year AND a.team_abb = cv.team_abb AND IFNULL(nm2.wrong_name, a.Player) = cv.Player ) ;""" player_qry = player_q % (year) # raw_input(player_qry) player_data = db.query(player_qry) entries = [] for row in player_data: entry = {} year, player_name, team_abb, age, hand, po, pa, ab, h, _2, _3, hr, bb, so, sb, cs, ba, obp, slg, babip, zOPS_Plus, DEF, WAR, yr1_WAR, yr1_value, yr2_WAR, yr2_value, yr3_WAR, yr3_value, yr4_WAR, yr4_value, yr5_WAR, yr5_value, yr6_WAR, yr6_value, yr7_WAR, yr7_value, 
yr8_WAR, yr8_value = row if pa is None: pa = ab + bb bb2 = bb hbp = 0 _1 = h - _2 - _3 - hr team_abb = team_abb.upper() pf = float(helper.get_park_factors(team_abb, year - 1)) / float(100) if po.lower() != 'c': scaledWAR = 600 * (float(WAR) / float(pa)) else: scaledWAR = 450 * (float(WAR) / float(pa)) ops, wOBA, park_wOBA, OPS_plus, wrc, wrc27, wRC_plus, raa, oWAR = helper.get_zips_offensive_metrics( year - 1, pf, pa, ab, bb2, hbp, _1, _2, _3, hr, sb, cs) entry['year'] = year entry['player_name'] = player_name entry['team_abb'] = team_abb entry['age'] = age entry['hand'] = hand entry['pos'] = po entry['pf'] = pf entry['pa'] = pa entry['ba'] = ba entry['obp'] = obp entry['slg'] = slg entry['zOPS_Plus'] = zOPS_Plus entry['DEF'] = DEF entry['zWAR'] = WAR entry['babip'] = babip entry['OPS_plus'] = OPS_plus entry['park_wOBA'] = park_wOBA entry['wRC_plus'] = wRC_plus entry['scaledWAR'] = scaledWAR entry['yr1_WAR'] = yr1_WAR entry['yr1_value'] = yr1_value entry['yr2_WAR'] = yr2_WAR entry['yr2_value'] = yr2_value entry['yr3_WAR'] = yr3_WAR entry['yr3_value'] = yr3_value entry['yr4_WAR'] = yr4_WAR entry['yr4_value'] = yr4_value entry['yr5_WAR'] = yr5_WAR entry['yr5_value'] = yr5_value entry['yr6_WAR'] = yr6_WAR entry['yr6_value'] = yr6_value entry['yr7_WAR'] = yr7_WAR entry['yr7_value'] = yr7_value entry['yr8_WAR'] = yr8_WAR entry['yr8_value'] = yr8_value entries.append(entry) table = 'zips_fangraphs_prep_FA_batters' print table if entries != []: for i in range(0, len(entries), 1000): db.insertRowDict(entries[i:i + 1000], table, insertMany=True, replace=True, rid=0, debug=1) db.conn.commit()
def process_players(player_list, year, season_gp, team_name, team_abb, date):
    """Turn one team's roster rows into ``excel_rosters`` records.

    ``player_list`` is a sequence of row lists. A row whose first cell is
    'Pitchers', 'Catchers', 'Infield' or 'Outfield' switches the current
    position code. A row is stored when its second cell is 'MLI' or when
    its third or fourth cell is a positive number. Rows that raise
    IndexError or ValueError while being read are skipped. ``team_name``
    is accepted but not used.
    """
    section_codes = {
        'Pitchers': 'p',
        'Catchers': 'c',
        'Infield': 'if',
        'Outfield': 'of',
    }
    counted_contracts = ('v', 'ce', '4th', '5th', '6th')

    roster_rows = []
    pos = ''
    for plr in player_list:
        if plr == []:
            continue

        # Section header rows change the position for the rows that follow.
        pos = section_codes.get(plr[0], pos)

        try:
            is_mli = plr[1] == 'MLI'
            has_money = is_mli or (
                (float(plr[2]) > 0 or float(plr[3]) > 0) and plr[2] != '')
            if not has_money:
                continue

            entered_name = plr[0]
            if pos == 'c' and entered_name == 'Smith, Will':
                entered_name = 'D. Smith, Will'
            player_name, first_name, last_name = name_parser(entered_name)

            contract_year = plr[1]
            if player_name == 'Max Stassi':
                contract_year = 'V'

            salary = 1.1 if is_mli else plr[2]
            expires = plr[3] if len(plr) >= 4 else 0
            opt = plr[4] if len(plr) >= 5 else ''
            ntc = plr[5] if len(plr) >= 6 else None

            if (contract_year.lower() in counted_contracts
                    or contract_year[-1] == 'G'):
                salary_counted = 'Y'
            else:
                salary_counted = 'N'

            entry = {
                'year': year,
                'gp': season_gp,
                'position': pos,
                'team_abb': team_abb,
                'date': date,
                'player_name': player_name,
                'fname': first_name,
                'lname': last_name,
                'entered_name': entered_name,
                'contract_year': contract_year,
                'salary': salary,
                'expires': expires,
                'opt': opt,
                'ntc': ntc,
                'salary_counted': salary_counted,
            }

            helper.input_name(entry.get('player_name'))
            roster_rows.append(entry)
        except (IndexError, ValueError):
            continue

    if roster_rows:
        db.insertRowDict(roster_rows, 'excel_rosters', replace=True,
                         insertMany=True, rid=0)
    db.conn.commit()
def scrape_prospects(year, prospect_lists): list_cnt = 0 for list_type in (prospect_lists): entries = [] if list_type not in ("rule5", "prospects", "pdp", "rhp", "lhp", "c", "1b", "2b", "3b", "ss", "of"): # if list_type in ("draft","int"): list_cnt += 1 ind_list = prospect_lists[list_type] i = 0 for player in ind_list: entry = {} i += 1 sleep(sleep_time) mlb_id = player["player_id"] player_url = player_base_url % (year, mlb_id) print list_cnt, year, list_type, i, "\t", str(mlb_id) print "\t\t", str(player_url) sleep(sleep_time) player_json = getter.get_url_data(player_url, "json") try: player_info = player_json["prospect_player"] except TypeError: print "\n\n**ERROR TAG** TYPE_ERROR", str(year), str( mlb_id), "\n\n" continue fname = player_info["player_first_name"] lname = player_info["player_last_name"] input_name = fname + ' ' + lname helper2.input_name(input_name) fname, lname = helper.adjust_mlb_names(mlb_id, fname, lname) position = player_info["positions"] position = helper.adjust_mlb_positions(mlb_id, position) entry["year"] = year entry["rank"] = i entry["mlb_id"] = mlb_id entry["fname"] = fname entry["lname"] = lname entry["position"] = position if list_type in ("int", "draft"): bats = player_info["bats"] throws = player_info["thrw"] try: height = player_info["height"].replace("\"", "").split("\"") height = int(height[0]) * 12 + int(height[1]) except (IndexError, ValueError, AttributeError): height = None weight = player_info["weight"] try: dob = player_info["birthdate"] byear = dob.split("/")[2] bmonth = dob.split("/")[0] bday = dob.split("/")[1] except IndexError: print '\n\nNO BIRTHDAY', fname, lname, mlb_id, "\n\n" continue byear, bmonth, bday = helper.adjust_mlb_birthdays( mlb_id, byear, bmonth, bday) prospect_id = helper.add_prospect(mlb_id, fname, lname, byear, bmonth, bday, p_type=list_type) else: info_url = player2_base_url % mlb_id print "\t\t", info_url sleep(sleep_time) info_json = getter.get_url_data(info_url, "json", 
json_unicode_convert=True) try: info_info = info_json["player_info"]["queryResults"][ "row"] except TypeError: print "\n\n**ERROR TAG** MLB_ERROR", str(year), str( mlb_id), str(fname), str(lname), "\n\n" continue dob = info_info["birth_date"] byear = dob.split("-")[0] bmonth = dob.split("-")[1] bday = dob.split("-")[2].split("T")[0] prospect_id = helper.add_prospect(mlb_id, fname, lname, byear, bmonth, bday, p_type="professional") try: bats = info_info["bats"] throws = info_info["throws"] height = int(info_info["height_feet"]) * 12 + int( info_info["height_inches"]) weight = int(info_info["weight"]) except UnicodeDecodeError: bats, throws, height, weight = (None, None, None, None) except ValueError: print "\n\n**ERROR TAG** MLB_ERROR", str(year), str( mlb_id), str(fname), str(lname), "\n\n" continue if prospect_id == 0 or prospect_id is None: grades_id = mlb_id else: grades_id = prospect_id entry["prospect_id"] = prospect_id entry["grades_id"] = grades_id entry["bats"] = bats entry["throws"] = throws entry["height"] = height entry["weight"] = weight entry["birth_year"] = byear entry["birth_month"] = bmonth entry["birth_day"] = bday entry["team"] = player["team_file_code"] drafted = player_info["drafted"] if list_type == "int": drafted = None try: sign_text = player_info["signed"] sign_value = sign_text.split(" - ")[1] signed = sign_value except IndexError: signed = "" try: signed = int(signed.replace("$", "").replace(",", "")) except ValueError: signed = None schoolcity = player_info["school"] gradecountry = player_info["year"] commit = None elif list_type == "draft": try: signed = player_info["preseason20"].replace( " ", "").replace(",", "").replace("$", "").split("-")[1] except (KeyError, IndexError): signed = player_info["signed"].replace( " ", "").replace(",", "").replace("$", "") try: signed = int(signed) except ValueError: signed = None schoolcity = player_info["school"] gradecountry = player_info["year"] commit = player_info["signed"] else: signed = 
player_info["signed"] schoolcity = None gradecountry = None commit = None entry["drafted"] = drafted entry["signed"] = signed entry["school_city"] = schoolcity entry["grade_country"] = gradecountry entry["college_commit"] = commit if list_type not in ("int", "draft"): eta = player_info["eta"] try: pre_top100 = player_info["preseason100"] except KeyError: pre_top100 = None else: pre_top100 = None eta = None entry["pre_top100"] = pre_top100 entry["eta"] = eta entry["twitter"] = player_info["twitter"] blurb = player_info["content"]["default"].replace( "<b>", "").replace("</b>", "").replace("<br />", "").replace( "<p>", "").replace("</p>", "").replace("*", "") entry["blurb"] = blurb try: overall_text = blurb.split("Overall")[1].split( '\n')[0].replace(':', '').replace(' ', '')[:8] if overall_text[0] not in (' ', ':', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'): raise IndexError try: text2 = overall_text.split('/')[1] except IndexError: text2 = overall_text.split('/')[-1] overall = int(filter(str.isdigit, text2[:2])) except IndexError: overall = 0 if overall < 20 and overall is not None: overall = overall * 10 entry["FV"] = overall entries.append(entry) if list_type == "draft": table = "mlb_prospects_draft" elif list_type == "int": table = "mlb_prospects_international" else: table = "mlb_prospects_professional" for e in entries: raw_input(e)
def pitchers(year): player_q = """SELECT a.year , IFNULL(CONCAT(nm.right_fname, ' ', nm.right_lname), a.Player) AS player , a.team_abb , a.age , T as hand , ERA , a.G , a.GS , IP , H , ER , HR , BB , SO , k_9 , bb_9 , hr_9 , bb_pct , k_pct , BABIP , ERA_Plus , ERA_minus , COALESCE(a.FIP, c.FIP) AS FIP , c.WAR , cv.yr1_WAR , cv.yr1_value , cv.yr2_WAR , cv.yr2_value , cv.yr3_WAR , cv.yr3_value , cv.yr4_WAR , cv.yr4_value , cv.yr5_WAR , cv.yr5_value , cv.yr6_WAR , cv.yr6_value , cv.yr7_WAR , cv.yr7_value , cv.yr8_WAR , cv.yr8_value FROM zips_fangraphs_pitchers_counting a JOIN( SELECT year , Player , MAX(post_date) AS post_date FROM zips_fangraphs_pitchers_counting WHERE 1 AND year = %s GROUP BY year, Player ) b USING (year,Player,post_date) LEFT JOIN zips_fangraphs_pitchers_rate c USING (year, Player, team_abb) LEFT JOIN name_mapper nm ON (1 AND a.Player = nm.wrong_name AND (nm.start_year IS NULL OR nm.start_year <= a.year) AND (nm.end_year IS NULL OR nm.end_year >= a.year) # AND (nm.position = '' OR nm.position = a.PO) AND (nm.rl_team = '' OR nm.rl_team = a.team_abb) # AND (nm.nsbl_team = '' OR nm.nsbl_team = rbp.team_abb) ) LEFT JOIN name_mapper nm2 ON (nm.right_fname = nm2.right_fname AND nm.right_lname = nm2.right_lname AND (nm.start_year IS NULL OR nm2.start_year = nm2.start_year) AND (nm.end_year IS NULL OR nm2.end_year = nm2.end_year) AND (nm.position = '' OR nm2.position = nm2.position) AND (nm.rl_team = '' OR nm2.rl_team = nm2.rl_team) ) JOIN zips_FA_contract_value_pitchers cv ON (a.year = cv.year AND a.team_abb = cv.team_abb AND IFNULL(nm2.wrong_name, a.Player) = cv.Player ) ;""" player_qry = player_q % (year) # raw_input(player_qry) player_data = db.query(player_qry) entries = [] for row in player_data: entry = {} year, player_name, team_abb, age, hand, era, g, gs, ip, h, er, hr, bb, k, k_9, bb_9, hr_9, bb_pct, k_pct, babip, zera_plus, zera_minus, zfip, zwar, yr1_WAR, yr1_value, yr2_WAR, yr2_value, yr3_WAR, yr3_value, yr4_WAR, yr4_value, yr5_WAR, yr5_value, 
yr6_WAR, yr6_value, yr7_WAR, yr7_value, yr8_WAR, yr8_value = row r = er if (gs >= 20 or float(gs) / float(g) > 0.8): pos = 'SP' else: pos = 'RP' team_abb = team_abb.upper() pf = float(helper.get_park_factors(team_abb, year - 1)) / float(100) if float(bb) == 0: if float(k) > 0: k_bb = 99.0 else: k_bb = 0.0 else: k_bb = (float(k) / float(bb)) fip_const = float( helper.get_zips_average_pitchers(year - 1, 'fip_const')) FIP = ((((13 * float(hr)) + (3 * float(bb)) - (2 * float(k))) / float(ip)) + fip_const) park_FIP, FIP_min, FIP_WAR = helper.get_zips_pitching_metrics( FIP, ip, year - 1, pf, g, gs, 'fip') ERA = float(era) park_ERA, ERA_min, ERA_WAR = helper.get_zips_pitching_metrics( ERA, ip, year - 1, pf, g, gs, 'era') if pos == 'SP': FIP_WAR = 32 * (float(FIP_WAR) / float(gs)) ERA_WAR = 32 * (float(ERA_WAR) / float(gs)) elif pos == 'RP': FIP_WAR = float(FIP_WAR) ERA_WAR = float(ERA_WAR) if k_pct is not None and bb_pct is not None: k_minus_bb_pct = float(k_pct) - float(bb_pct) else: k_minus_bb_pct = None entry['year'] = year entry['player_name'] = player_name entry['team_abb'] = team_abb entry['age'] = age entry['hand'] = hand entry['pos'] = pos entry['pf'] = pf entry['g'] = g entry['gs'] = gs entry['ip'] = ip entry['babip'] = babip entry['k_9'] = k_9 entry['bb_9'] = bb_9 entry['k_bb'] = k_bb entry['hr_9'] = hr_9 entry['k_pct'] = k_pct entry['bb_pct'] = bb_pct entry['k_minus_bb_pct'] = k_minus_bb_pct entry['zERA_plus'] = zera_plus entry['zERA_minus'] = zera_minus entry['zFIP'] = zfip entry['zWAR'] = zwar entry['FIP'] = FIP entry['park_FIP'] = park_FIP entry['FIP_minus'] = FIP_min entry['FIP_WAR'] = FIP_WAR entry['ERA'] = era entry['park_ERA'] = park_ERA entry['ERA_minus'] = ERA_min entry['ERA_WAR'] = ERA_WAR entry['yr1_WAR'] = yr1_WAR entry['yr1_value'] = yr1_value entry['yr2_WAR'] = yr2_WAR entry['yr2_value'] = yr2_value entry['yr3_WAR'] = yr3_WAR entry['yr3_value'] = yr3_value entry['yr4_WAR'] = yr4_WAR entry['yr4_value'] = yr4_value entry['yr5_WAR'] = yr5_WAR 
entry['yr5_value'] = yr5_value entry['yr6_WAR'] = yr6_WAR entry['yr6_value'] = yr6_value entry['yr7_WAR'] = yr7_WAR entry['yr7_value'] = yr7_value entry['yr8_WAR'] = yr8_WAR entry['yr8_value'] = yr8_value entries.append(entry) table = 'zips_fangraphs_prep_FA_pitchers' print table if entries != []: for i in range(0, len(entries), 1000): db.insertRowDict(entries[i:i + 1000], table, insertMany=True, replace=True, rid=0, debug=1) db.conn.commit()
def process_basic(year): print 'initial table setup' for _type in ('roster', 'projected'): basic_query = """SELECT team_abb, team_name, year, season_gp, games_played, current_W, current_L, overall_var, roster_W, roster_L, roster_pct, ros_W, ros_L, ros_pct, projected_W, projected_L, projected_pct FROM __team_strength t1 JOIN (SELECT team_abb, MAX(year) AS year, MAX(games_played) AS games_played FROM __team_strength GROUP BY team_abb, year) t2 USING (team_abb, year, games_played) WHERE year = %s;""" basic_query = basic_query % (year) # raw_input(basic_query) basic_res = db.query(basic_query) for basic_row in basic_res: entry = {} team_abb, team_name, year, season_gp, games_played, cur_W, cur_L, overall_var, roster_W, roster_L, roster_pct, ros_W, ros_L, ros_pct, projected_W, projected_L, projected_pct = basic_row games_played = float(games_played) games_remaining = float(max(0.0, 162.0 - games_played)) # linearly scaled variance (no variance at game 162, full variance at game 0) projected_var = max(0.001, float(overall_var) * (games_remaining / 162.0)) projected_std = max( 0.001, math.sqrt(float(overall_var)) * (games_remaining / 162.0)) division, div_teams, conf_teams, non_conf_teams = helper.get_division( team_name, year) if _type == 'roster': p_95 = float(roster_W) + 1.96 * math.sqrt(float(overall_var)) p_75 = float(roster_W) + 1.15 * math.sqrt(float(overall_var)) p_25 = float(roster_W) - 1.15 * math.sqrt(float(overall_var)) p_05 = float(roster_W) - 1.96 * math.sqrt(float(overall_var)) entry['team_abb'] = team_abb entry['team_name'] = team_name entry['year'] = year entry['season_gp'] = season_gp entry['games_played'] = games_played entry['division'] = division entry['strength_type'] = _type entry['strength_pct'] = roster_pct entry['var'] = overall_var entry['mean_W'] = roster_W entry['mean_L'] = roster_L entry['p_95'] = p_95 entry['p_75'] = p_75 entry['p_25'] = p_25 entry['p_05'] = p_05 elif _type == 'projected': p_95 = float(projected_W) + 1.96 * (projected_std) 
p_75 = float(projected_W) + 1.15 * (projected_std) p_25 = float(projected_W) - 1.15 * (projected_std) p_05 = float(projected_W) - 1.96 * (projected_std) entry['team_abb'] = team_abb entry['team_name'] = team_name entry['year'] = year entry['season_gp'] = season_gp entry['games_played'] = games_played entry['division'] = division entry['strength_type'] = _type entry['strength_pct'] = ros_pct entry['var'] = projected_var entry['mean_W'] = projected_W entry['mean_L'] = projected_L entry['p_95'] = p_95 entry['p_75'] = p_75 entry['p_25'] = p_25 entry['p_05'] = p_05 db.insertRowDict(entry, '__playoff_probabilities', insertMany=False, replace=True, rid=0, debug=1) db.conn.commit()
"zips_fangraphs_pitchers_counting": "a.Player", "zips_fangraphs_pitchers_rate": "a.Player", "zips_offense": "a.player_name", "zips_offense_splits": "a.player_name", "zips_pitching": "a.player_name", "zips_pitching_splits": "a.player_name", "mlb_prospects.fg_raw": "a.playerName", "mlb_prospects.minorleagueball_professional": "a.full_name", "mlb_prospects.mlb_prospects_draft": "CONCAT(a.fname, ' ', a.lname)", "mlb_prospects.mlb_prospects_international": "CONCAT(a.fname, ' ', a.lname)", "mlb_prospects.mlb_prospects_professional": "CONCAT(a.fname, ' ', a.lname)" } for k, v in table_dict.items(): print k qry = """ SELECT DISTINCT %s FROM %s a LEFT JOIN name_mapper nm ON (%s = nm.wrong_name) WHERE 1 AND nm.wrong_name IS NULL """ % (v, k, v) # raw_input(qry) names = db.query(qry) for name in names: helper.input_name(name[0])
def parse_player(player_text, year, team_abb): try: int(player_text[0:1]) except ValueError: # raw_input(player_text) return None try: full_name = player_text.split(")")[1].split(",")[0].strip() except IndexError: return None try: team_rank = player_text.split(")")[0].strip() except IndexError: team_rank = None try: position = player_text.split(",")[1].split(",")[0].split(";")[0].strip().split(" ")[0].split(".")[0].strip() except IndexError: position = None try: grade_base = player_text.upper().split("GRADE")[1].split(":")[0].split(".")[0].split(";")[0] grade = grade_base.replace("/BORDERLINE","/").replace("BORDERLINE","/").replace("//","/").replace(" ","").strip() except IndexError: # raw_input(player_text) grade_base, grade = None, None try: age = player_text.lower().split(" age")[1].split(",")[0].split(";")[0].split(":")[0].split("(")[0].strip() age = int(age) except (IndexError, ValueError): try: age = player_text.lower().split("age")[1].split(",")[0].split(";")[0].strip() age = int(age) except (IndexError, ValueError): age = 0 try: eta = player_text.lower().split(" eta")[1].split(".")[0].split(";")[0].split("(")[0].replace(":","").strip() except IndexError: eta = None entry = {"year":year, "team":team_abb} full_name, fname, lname = helper.adjust_minorleagueball_name(full_name, year, team_abb) est_birthyear = year - int(age) age = helper.adjust_minorleagueball_birthyear(full_name, year, team_abb, est_birthyear) position = helper.adjust_minorleagueball_position(full_name, year, team_abb, position) eta = helper.adjust_minorleagueball_eta(full_name, year, team_abb, eta) if grade is None: return None try: blurb = player_text.split("Grade"+grade_base+":")[1].strip() except (TypeError, IndexError): try: blurb = "Age " + player_text.split("Age")[1].strip() except (TypeError, IndexError): blurb = None try: grade_split = blurb.upper().split("BORDERLINE")[1].split(":")[0].split(".")[0].strip()[0:2].strip() if grade_split != "": grade = grade + "/" + grade_split except 
(IndexError, AttributeError): grade = grade grade = helper.adjust_minorleagueball_grade(full_name, year, team_abb, grade) if int(team_rank) == 31 and grade[0] in ("A", "B"): team_rank = 1 entry["team_rank"] = team_rank entry["full_name"] = full_name entry["position"] = position entry["age"] = age entry["grade"] = grade entry["eta"] = eta entry["fname"] = fname entry["lname"] = lname entry["blurb"] = blurb print "\t\t", team_rank, full_name, position, age, grade, eta helper2.input_name(entry.get('full_name')) db.insertRowDict(entry, "minorleagueball_professional", replace=True, debug=1) db.conn.commit()
def process(): print "processed_team_standings_advanced" table = 'processed_team_standings_advanced' db.query("TRUNCATE TABLE `" + table + "`") entries = [] teamWAR_qry = """SELECT year, team_abb, dWAR, oWAR, (replacement/10) as repWAR, FIP_WAR, ERA_WAR FROM processed_WAR_team """ team_WAR_list = db.query(teamWAR_qry) for team in team_WAR_list: year, team_abb, dWAR, oWAR, repWAR, FIP_WAR, ERA_WAR = team mascot_name = helper.get_mascot_names(team_abb.upper(), year) #a full season is ~17 replacement wins? repWAR = float(repWAR) pos_WAR = float(dWAR) + float(oWAR) + repWAR fWAR = pos_WAR + float(FIP_WAR) rWAR = pos_WAR + float(ERA_WAR) if team_abb == '': continue else: record_q = """SELECT year, team_name, games_played, w, l, rf, ra FROM team_standings WHERE team_name LIKE '%%%s%%' AND year = %s AND games_played = (SELECT MAX(games_played) FROM team_standings WHERE team_name LIKE '%%%s%%' AND year = %s) """ record_qry = record_q % (mascot_name, year, mascot_name, year) # raw_input(record_qry) record = db.query(record_qry)[0] year, team_name, games_played, w, l, rf, ra = record # http://www.had2know.com/sports/pythagorean-expectation-win-percentage-baseball.html pythag_x = ((float(rf) + float(ra)) / (float(w) + float(l)))**(float(0.285)) pythag_win_pct = (float(rf)**pythag_x) / ((float(rf)**pythag_x) + (float(ra)**pythag_x)) pythag_wins = (w + l) * pythag_win_pct pythag_losses = games_played - (pythag_wins) if year < 2017: rep_team_win_pct = 0.300 else: rep_team_win_pct = 0.325 rep_team_wins = rep_team_win_pct * games_played # f_wins = (pos_WAR/repWAR)*17.0 + float(FIP_WAR) + rep_team_wins # f_losses = games_played - (f_wins) # r_wins = (pos_WAR/repWAR)*17.0 + float(ERA_WAR) + rep_team_wins # r_losses = games_played - (r_wins) f_wins = fWAR + rep_team_wins f_losses = games_played - (f_wins) r_wins = rWAR + rep_team_wins r_losses = games_played - (r_wins) entry = { "year": year, "team_name": team_name, "games_played": games_played, "repWAR": repWAR, "oWAR": oWAR, 
"dWAR": dWAR, "FIP_WAR": FIP_WAR, "ERA_WAR": ERA_WAR, "RF": rf, "RA": ra, "f_Wins": f_wins, "f_Losses": f_losses, "r_Wins": r_wins, "r_Losses": r_losses, "py_Wins": pythag_wins, "py_Losses": pythag_losses, "W": w, "L": l } entries.append(entry) if entries != []: db.insertRowDict(entries, table, replace=True, insertMany=True, rid=0) db.conn.commit()
def process(curr_year): rosters_link = '/Users/connordog/Dropbox/Desktop_Files/Baseball/Rosters.xlsx' season_gp = db.query( "SELECT gs FROM processed_league_averages_pitching WHERE year = %s" % (curr_year)) if season_gp == (): season_gp = 0 else: season_gp = float(season_gp[0][0]) / 2 workbook = xlrd.open_workbook(rosters_link) # iterate through all team sheets for index in range(4, 34): team_name = workbook.sheet_names()[index] print team_name team_abbs, primary_abb = helper.get_team_abbs(team_name.upper()) entries = [] team_sheet = workbook.sheet_by_index(index) # get a maximum row for each sheet for row in range(1, 100): if team_sheet.cell(row, 1).value == 'Waived Players': max_row = row break position = '' for row in range(8, max_row): if team_sheet.cell(row, 1).value == 'Pitchers': position = 'p' if team_sheet.cell(row, 1).value == 'Catchers': position = 'c' if team_sheet.cell(row, 1).value == 'Infielders': position = 'if' if team_sheet.cell(row, 1).value == 'Outfielders': position = 'of' entered_name = team_sheet.cell(row, 1).value if position == 'c' and entered_name == 'Smith, Will': entered_name = 'D. 
Smith, Will' player_name, first_name, last_name = name_parser( entered_name, primary_abb) if team_sheet.cell( row, 2).value not in ('Year', '') and team_sheet.cell( row, 3).value not in ('Salary', ''): salary = team_sheet.cell(row, 3).value year = team_sheet.cell(row, 2).value expires = team_sheet.cell(row, 4).value opt = team_sheet.cell(row, 5).value NTC = team_sheet.cell(row, 8).value salary_counted = team_sheet.cell(row, 9).value entry = { 'year': curr_year, 'gp': season_gp, 'player_name': player_name, "fname": first_name, "lname": last_name, "team_abb": primary_abb, "position": position, "salary": salary, "contract_year": year, "expires": expires, "opt": opt, "NTC": NTC, "salary_counted": salary_counted, "entered_name": entered_name } # print entry entries.append(entry) if entries != []: db.insertRowDict(entries, 'excel_rosters', replace=True, insertMany=True, rid=0) db.conn.commit()
def pitching_war(year):
    """Compute rate stats and FIP/ERA based WAR for the pitchers of ``year``.

    Reads ``register_pitching_primary`` for ``year`` (innings are converted
    in SQL from the .1/.2 notation to thirds) and writes one row per
    pitcher to ``processed_WAR_pitchers`` (replace semantics). With zero
    innings the per-nine rates are 0.0 or capped at 99.0 and FIP is set
    to 99.99.
    """
    pitcher_sql = (
        "SELECT player_name, team_abb, position, age, g, gs, era, "
        "ROUND(ip) + (10 * (ip - ROUND(ip)) / 3) as ip, h, r, er, bb, k, hr "
        "FROM register_pitching_primary WHERE year = %s; "
    ) % (year)
    pitcher_rows = db.query(pitcher_sql)

    war_rows = []
    for (player_name, team_abb, position, age, g, gs, era, ip, h, r, er, bb,
         k, hr) in pitcher_rows:
        # The stored team_abb keeps its original case; only the park-factor
        # lookup uses the upper-cased form.
        entry = {
            'year': year,
            'player_name': player_name,
            'team_abb': team_abb,
            'position': position,
            'throws': None,
            'age': age,
            'ip': ip,
        }

        team_abb = team_abb.upper()
        pf = float(helper.get_park_factors(team_abb, year)) / float(100)
        entry['pf'] = pf

        no_innings = ip == 0
        if no_innings:
            k_9 = 0.0
            bb_9 = k_bb = 99.0 if bb > 0 else 0.0
            hr_9 = 99.0 if hr > 0 else 0.0
        else:
            k_9 = (float(k) / float(ip)) * 9
            bb_9 = (float(bb) / float(ip)) * 9
            hr_9 = (float(hr) / float(ip)) * 9
            if bb == 0:
                k_bb = 99.0 if k > 0 else 0.0
            else:
                k_bb = (float(k) / float(bb))

        entry['k_9'] = k_9
        entry['bb_9'] = bb_9
        entry['k_bb'] = k_bb
        entry['hr_9'] = hr_9

        fip_const = float(helper.get_league_average_pitchers(
            year, 'fip_const'))
        if no_innings:
            FIP = 99.99
        else:
            FIP = ((((13 * float(hr)) + (3 * float(bb)) - (2 * float(k))) /
                    float(ip)) + fip_const)
        entry['FIP'] = FIP

        park_FIP, FIP_min, FIP_WAR = helper.get_pitching_metrics(
            FIP, ip, year, pf, g, gs, 'fip')
        entry['park_FIP'] = park_FIP
        entry['FIP_minus'] = FIP_min
        entry['FIP_WAR'] = FIP_WAR

        ERA = float(era)
        entry['ERA'] = ERA
        park_ERA, ERA_min, ERA_WAR = helper.get_pitching_metrics(
            ERA, ip, year, pf, g, gs, 'era')
        entry['park_ERA'] = park_ERA
        entry['ERA_minus'] = ERA_min
        entry['ERA_WAR'] = ERA_WAR

        war_rows.append(entry)

    if war_rows:
        db.insertRowDict(war_rows, 'processed_WAR_pitchers', replace=True,
                         insertMany=True, rid=0)
    db.conn.commit()