def initiate(yr, _type, player_mapper): path = '/Users/connordog/Dropbox/Desktop_Files/Work_Things/CodeBase/Python_Scripts/Python_Projects/NSBL/ad_hoc/historical_csv_files/' csv_file = path + '%s_zips_%s_splits.csv' % (yr, _type) print yr, _type entries = [] with codecs.open(csv_file, 'rb', encoding='utf-8', errors='ignore') as f: mycsv = csv.reader(f) i = 0 for row in mycsv: if i == 0: i += 1 continue else: i += 1 year, player_name, vs_hand, ab, h, _2b, _3b, hr, rbi, bb, so, hbp, ibb, sh, sf = row if player_name in player_mapper: player_name = player_mapper.get(player_name) else: helper.input_name(player_name) entry = { "year": yr, "player_name": player_name, "vs_hand": vs_hand, "ab": ab, "h": h, "2b": _2b, "3b": _3b, "hr": hr, "rbi": rbi, "bb": bb, "so": so, "hbp": hbp, "ibb": ibb, "sh": sh, "sf": sf } entries.append(entry) table = 'zips_%s_splits' % (_type) if entries != []: db.insertRowDict(entries, table, replace=True, insertMany=True, rid=0) db.conn.commit()
def input_data(ratings, sql_table, cats, year): print '\t' + sql_table entries = [] for player in ratings: entry = {} entry['year'] = year for cat, val in zip(cats, player): # any category we aren't interested in recording, we mark as foo if cat != 'foo': # entry[cat] = val ##### if cat == 'player_name' and val is not None: entry[cat] = val.replace('*', '').replace('#', '') else: entry[cat] = val if (entry.get("player_name") not in ('Total', None, '', 'Other') and entry.get("team_abb") not in ('Total', None, '', 'Other')): entries.append(entry) elif entry.get("team_name") not in ('Total', None, '', 'Other'): full_name = helper.get_team_name(entry.get("team_name"), year) entry['team_name'] = full_name if sql_table == 'team_standings': entry['games_played'] = int(entry.get('w')) + int( entry.get('l')) entries.append(entry) if 'player_name' in entry: helper.input_name(entry.get('player_name')) # used for debugging # if entries != []: # for entry in entries[0:30]: # print '\t\t', # print entry # raw_input("") if entries != []: db.insertRowDict(entries, sql_table, insertMany=True, rid=0, replace=True) db.conn.commit()
def process_prospect_list(year, list_type, list_key): list_url = base_url + "%s%s" % (year, list_key) print "\n", year, list_type, list_url json = getter.get_url_data(list_url, "json") entries = [] for plr in json: entry = {'prospect_type': list_type} for ky, val in plr.items(): if type(val) in (str, unicode): val2 = "".join([i if ord(i) < 128 else "" for i in val]) if val != val2 and 'name' in ky.lower(): print '\n\n\n\nUNICODE NAME!!!! - \n\t', val print '\t', val2, '\n\n\n\n' if 'playerid' in ky.lower(): val = val2.replace(' ', '') else: val = val2 entry[ky.lower().replace("%", "%%")] = val if ('playername' not in entry or entry['playername'] == ''): continue if 'playerid' not in entry or entry['playerid'] == '': entry['playerid'] = str( entry['playername'].replace(' ', '').replace('*', '').replace( ",", "")) + '_' + str(entry['type'].replace(' ', '')) if 'team' not in entry or entry['team'] == '': entry['team'] = '--empty--' print '\t', year, list_key, entry['playername'] helper2.input_name(entry.get('playername')) db.insertRowDict(entry, 'fg_raw', insertMany=False, replace=True, rid=0, debug=1) db.conn.commit() sleep(sleep_time)
def initiate(yr, _type, player_mapper): path = '/Users/connordog/Dropbox/Desktop_Files/Work_Things/CodeBase/Python_Scripts/Python_Projects/NSBL/ad_hoc/historical_csv_files/' csv_file_ext = '%s_zips_%s.csv' % (yr, _type) csv_file = path+csv_file_ext print yr, _type, csv_file_ext entries = [] with codecs.open(csv_file, 'rb', encoding='utf-8', errors='ignore') as f: mycsv = csv.reader(f) i = 0 for row in mycsv: if i == 0: i += 1 continue else: i += 1 if _type == 'offense': year, player_name, team_abb, age, bats, g, ab, r, h, _2b, _3b, hr, rbi , bb, so , hbp, sb, cs, sh, sf, ibb, war = row if player_name in player_mapper: player_name = player_mapper.get(player_name) else: helper.input_name(player_name) # print player_name entry = {"year":yr, "player_name":player_name, "team_abb":team_abb, "age":age, "bats":bats, "g":g, "ab":ab, "r":r, "h":h, "2b":_2b, "3b":_3b, "hr":hr, "rbi":rbi, "bb":bb, "so":so, "hbp":hbp, "sb":sb, "cs":cs, "sh":sh, "sf":sf, "ibb":ibb, "zWAR":war} entries.append(entry) elif _type == 'pitching': year, player_name, team_abb, age, throws, w, l, era, g, gs, ip, h, r, er, hr, bb, so, war = row if player_name in player_mapper: player_name = player_mapper.get(player_name) else: helper.input_name(player_name) entry = {"year":yr, "player_name":player_name, "team_abb":team_abb, "age":age, "throws":throws, "w":w, "l":l, "era":era, "g":g, "gs":gs, "ip":ip, "h":h, "r":r, "er":er, "hr":hr, "bb":bb, "so":so, "zWAR":war} entries.append(entry) elif _type == 'defense': year, player_name, c_rn, c_er, _1b_rn, _1b_er, _2b_rn, _2b_er, _3b_rn, _3b_er, ss_rn, ss_er, lf_rn, lf_er, cf_rn, cf_er, rf_rn, rf_er, c_arm, of_arm, pb, FOO = row if player_name in player_mapper: player_name = player_mapper.get(player_name) else: helper.input_name(player_name) entry = {"year":yr, "player_name":player_name, "c_range":c_rn, "c_error":c_er, "1b_range":_1b_rn, "1b_error":_1b_er, "2b_range":_2b_rn, "2b_error":_2b_er, "3b_range":_3b_rn, "3b_error":_3b_er, "ss_range":ss_rn, "ss_error":ss_er, 
"lf_range":lf_rn, "lf_error":lf_er, "cf_range":cf_rn, "cf_error":cf_er, "rf_range":rf_rn, "rf_error":rf_er, "c_arm":c_arm, "of_arm":of_arm, "c_pb":pb} entries.append(entry) # print i, _type, player_name table = 'zips_%s' % (_type) if entries != []: db.insertRowDict(entries, table, replace=True, insertMany=True, rid=0) db.conn.commit()
def process_urls(urls, year): print year for teamcnt, team_pair in enumerate(urls): for tm, url in team_pair.items(): print '\t', str(teamcnt + 1), tm, '-', url tm_list = [] tm_query = db.query( "SELECT DISTINCT team_abb FROM zips_fangraphs_batters_counting WHERE year = %s" % (year)) for t in tm_query: tm_list.append(t[0]) if tm in tm_list: continue sleep(sleep_time) team_data = requests.get(url) team_soup = BeautifulSoup(team_data.content, "lxml") postmeta_date = team_soup.find( class_="postmeta").findAll("div")[-1].getText() post_date = parse(postmeta_date).strftime("%Y-%m-%d") tables = team_soup.findAll( "table", { "class": [ "sortable", "sortable table-equal-width", "table-equal-width" ] }) print len(tables) if len(tables) == 0: tables = team_soup.findAll("table")[11:] print len(tables) j = 0 for table in tables: # raw_input(table) headers = table.find("tr") # raw_input(headers) headers = headers.findAll() cats = [] for h in headers: cat = h.getText().replace('/', '_').replace( '+', '_Plus').replace('-', '_Minus').replace( 'No. 
1 Comp', 'Top_Comp').replace('%', '_Pct') cats.append(cat) if len(cats) < 10: continue else: j = j + 1 # raw_input(i) entries = [] if j == 1: db_table = "zips_fangraphs_batters_counting" elif j == 2: db_table = "zips_fangraphs_batters_rate" elif j == 3: db_table = "zips_fangraphs_pitchers_counting" elif j == 4: db_table = "zips_fangraphs_pitchers_rate" print '\t\t', db_table # print cats rows = table.findAll("tr") for r in rows: # print r # print r.get("class") # raw_input("") if r.get("class") is None: entry = {} entry["year"] = year entry["team_abb"] = tm entry["post_date"] = post_date atts = r.findAll("td") # raw_input(atts) if atts != []: for k, att in enumerate(atts): fld = att.getText() fld = "".join( [i if ord(i) < 128 else "" for i in fld]) entry[cats[k]] = fld # print '\t\t\t', entry if entry["Player"] != "": helper.input_name(entry.get('Player')) entries.append(entry) if entries != []: for i in range(0, len(entries), 1000): db.insertRowDict(entries[i:i + 1000], db_table, insertMany=True, replace=True, rid=0, debug=1) db.conn.commit()
def parse_player(player_text, year, team_abb): try: int(player_text[0:1]) except ValueError: # raw_input(player_text) return None try: full_name = player_text.split(")")[1].split(",")[0].strip() except IndexError: return None try: team_rank = player_text.split(")")[0].strip() except IndexError: team_rank = None try: position = player_text.split(",")[1].split(",")[0].split(";")[0].strip().split(" ")[0].split(".")[0].strip() except IndexError: position = None try: grade_base = player_text.upper().split("GRADE")[1].split(":")[0].split(".")[0].split(";")[0] grade = grade_base.replace("/BORDERLINE","/").replace("BORDERLINE","/").replace("//","/").replace(" ","").strip() except IndexError: # raw_input(player_text) grade_base, grade = None, None try: age = player_text.lower().split(" age")[1].split(",")[0].split(";")[0].split(":")[0].split("(")[0].strip() age = int(age) except (IndexError, ValueError): try: age = player_text.lower().split("age")[1].split(",")[0].split(";")[0].strip() age = int(age) except (IndexError, ValueError): age = 0 try: eta = player_text.lower().split(" eta")[1].split(".")[0].split(";")[0].split("(")[0].replace(":","").strip() except IndexError: eta = None entry = {"year":year, "team":team_abb} full_name, fname, lname = helper.adjust_minorleagueball_name(full_name, year, team_abb) est_birthyear = year - int(age) age = helper.adjust_minorleagueball_birthyear(full_name, year, team_abb, est_birthyear) position = helper.adjust_minorleagueball_position(full_name, year, team_abb, position) eta = helper.adjust_minorleagueball_eta(full_name, year, team_abb, eta) if grade is None: return None try: blurb = player_text.split("Grade"+grade_base+":")[1].strip() except (TypeError, IndexError): try: blurb = "Age " + player_text.split("Age")[1].strip() except (TypeError, IndexError): blurb = None try: grade_split = blurb.upper().split("BORDERLINE")[1].split(":")[0].split(".")[0].strip()[0:2].strip() if grade_split != "": grade = grade + "/" + grade_split except 
(IndexError, AttributeError): grade = grade grade = helper.adjust_minorleagueball_grade(full_name, year, team_abb, grade) if int(team_rank) == 31 and grade[0] in ("A", "B"): team_rank = 1 entry["team_rank"] = team_rank entry["full_name"] = full_name entry["position"] = position entry["age"] = age entry["grade"] = grade entry["eta"] = eta entry["fname"] = fname entry["lname"] = lname entry["blurb"] = blurb print "\t\t", team_rank, full_name, position, age, grade, eta helper2.input_name(entry.get('full_name')) db.insertRowDict(entry, "minorleagueball_professional", replace=True, debug=1) db.conn.commit()
"zips_fangraphs_pitchers_counting": "a.Player", "zips_fangraphs_pitchers_rate": "a.Player", "zips_offense": "a.player_name", "zips_offense_splits": "a.player_name", "zips_pitching": "a.player_name", "zips_pitching_splits": "a.player_name", "mlb_prospects.fg_raw": "a.playerName", "mlb_prospects.minorleagueball_professional": "a.full_name", "mlb_prospects.mlb_prospects_draft": "CONCAT(a.fname, ' ', a.lname)", "mlb_prospects.mlb_prospects_international": "CONCAT(a.fname, ' ', a.lname)", "mlb_prospects.mlb_prospects_professional": "CONCAT(a.fname, ' ', a.lname)" } for k, v in table_dict.items(): print k qry = """ SELECT DISTINCT %s FROM %s a LEFT JOIN name_mapper nm ON (%s = nm.wrong_name) WHERE 1 AND nm.wrong_name IS NULL """ % (v, k, v) # raw_input(qry) names = db.query(qry) for name in names: helper.input_name(name[0])
def process_players(player_list, year, season_gp, team_name, team_abb, date):
    """Convert roster sheet rows into ``excel_rosters`` rows and store them.

    Rows whose first cell is a section heading (Pitchers, Catchers, Infield,
    Outfield) switch the position applied to the rows that follow. A row is
    stored when its second cell is ``'MLI'`` or when its salary/expiry cells
    parse to a positive number. Any ``IndexError`` or ``ValueError`` raised
    while handling a row causes that row to be skipped.

    Args:
        player_list: Iterable of row lists laid out as
            ``[name, contract_year, salary, expires, opt, ntc]``; trailing
            cells may be absent.
        year: Stored on every row.
        season_gp: Stored on every row as ``gp``.
        team_name: Unused here.
        team_abb: Stored on every row.
        date: Stored on every row.
    """
    section_positions = (
        ('Pitchers', 'p'),
        ('Catchers', 'c'),
        ('Infield', 'if'),
        ('Outfield', 'of'),
    )
    counted_contracts = ('v', 'ce', '4th', '5th', '6th')

    entries = []
    pos = ''
    for plr in player_list:
        if plr == []:
            continue

        # a section heading switches the position for the rows below it
        for heading, code in section_positions:
            if plr[0] == heading:
                pos = code
                break

        try:
            is_mli = plr[1] == 'MLI'
            if not is_mli:
                has_money = float(plr[2]) > 0 or float(plr[3]) > 0
                if not (has_money and plr[2] != ''):
                    continue

            entered_name = plr[0]
            if pos == 'c' and entered_name == 'Smith, Will':
                entered_name = 'D. Smith, Will'
            player_name, first_name, last_name = name_parser(entered_name)

            contract_year = 'V' if player_name == 'Max Stassi' else plr[1]
            salary = 1.1 if is_mli else plr[2]
            expires = plr[3] if len(plr) >= 4 else 0
            opt = plr[4] if len(plr) >= 5 else ''
            ntc = plr[5] if len(plr) >= 6 else None

            if (contract_year.lower() in counted_contracts
                    or contract_year[-1] == 'G'):
                salary_counted = 'Y'
            else:
                salary_counted = 'N'

            entry = {
                'year': year,
                'gp': season_gp,
                'position': pos,
                'team_abb': team_abb,
                'date': date,
                'player_name': player_name,
                'fname': first_name,
                'lname': last_name,
                'entered_name': entered_name,
                'contract_year': contract_year,
                'salary': salary,
                'expires': expires,
                'opt': opt,
                'ntc': ntc,
                'salary_counted': salary_counted,
            }

            helper.input_name(player_name)
            entries.append(entry)
        except (IndexError, ValueError):
            continue

    if entries:
        db.insertRowDict(entries, 'excel_rosters', replace=True,
                         insertMany=True, rid=0)
    db.conn.commit()
def scrape_prospects(year, prospect_lists): list_cnt = 0 for list_type in (prospect_lists): entries = [] if list_type not in ("rule5", "prospects", "pdp", "rhp", "lhp", "c", "1b", "2b", "3b", "ss", "of"): # if list_type in ("draft","int"): list_cnt += 1 ind_list = prospect_lists[list_type] i = 0 for player in ind_list: entry = {} i += 1 sleep(sleep_time) mlb_id = player["player_id"] player_url = player_base_url % (year, mlb_id) print list_cnt, year, list_type, i, "\t", str(mlb_id) print "\t\t", str(player_url) sleep(sleep_time) player_json = getter.get_url_data(player_url, "json") try: player_info = player_json["prospect_player"] except TypeError: print "\n\n**ERROR TAG** TYPE_ERROR", str(year), str( mlb_id), "\n\n" continue fname = player_info["player_first_name"] lname = player_info["player_last_name"] input_name = fname + ' ' + lname helper2.input_name(input_name) fname, lname = helper.adjust_mlb_names(mlb_id, fname, lname) position = player_info["positions"] position = helper.adjust_mlb_positions(mlb_id, position) entry["year"] = year entry["rank"] = i entry["mlb_id"] = mlb_id entry["fname"] = fname entry["lname"] = lname entry["position"] = position if list_type in ("int", "draft"): bats = player_info["bats"] throws = player_info["thrw"] try: height = player_info["height"].replace("\"", "").split("\"") height = int(height[0]) * 12 + int(height[1]) except (IndexError, ValueError, AttributeError): height = None weight = player_info["weight"] try: dob = player_info["birthdate"] byear = dob.split("/")[2] bmonth = dob.split("/")[0] bday = dob.split("/")[1] except IndexError: print '\n\nNO BIRTHDAY', fname, lname, mlb_id, "\n\n" continue byear, bmonth, bday = helper.adjust_mlb_birthdays( mlb_id, byear, bmonth, bday) prospect_id = helper.add_prospect(mlb_id, fname, lname, byear, bmonth, bday, p_type=list_type) else: info_url = player2_base_url % mlb_id print "\t\t", info_url sleep(sleep_time) info_json = getter.get_url_data(info_url, "json", 
json_unicode_convert=True) try: info_info = info_json["player_info"]["queryResults"][ "row"] except TypeError: print "\n\n**ERROR TAG** MLB_ERROR", str(year), str( mlb_id), str(fname), str(lname), "\n\n" continue dob = info_info["birth_date"] byear = dob.split("-")[0] bmonth = dob.split("-")[1] bday = dob.split("-")[2].split("T")[0] prospect_id = helper.add_prospect(mlb_id, fname, lname, byear, bmonth, bday, p_type="professional") try: bats = info_info["bats"] throws = info_info["throws"] height = int(info_info["height_feet"]) * 12 + int( info_info["height_inches"]) weight = int(info_info["weight"]) except UnicodeDecodeError: bats, throws, height, weight = (None, None, None, None) except ValueError: print "\n\n**ERROR TAG** MLB_ERROR", str(year), str( mlb_id), str(fname), str(lname), "\n\n" continue if prospect_id == 0 or prospect_id is None: grades_id = mlb_id else: grades_id = prospect_id entry["prospect_id"] = prospect_id entry["grades_id"] = grades_id entry["bats"] = bats entry["throws"] = throws entry["height"] = height entry["weight"] = weight entry["birth_year"] = byear entry["birth_month"] = bmonth entry["birth_day"] = bday entry["team"] = player["team_file_code"] drafted = player_info["drafted"] if list_type == "int": drafted = None try: sign_text = player_info["signed"] sign_value = sign_text.split(" - ")[1] signed = sign_value except IndexError: signed = "" try: signed = int(signed.replace("$", "").replace(",", "")) except ValueError: signed = None schoolcity = player_info["school"] gradecountry = player_info["year"] commit = None elif list_type == "draft": try: signed = player_info["preseason20"].replace( " ", "").replace(",", "").replace("$", "").split("-")[1] except (KeyError, IndexError): signed = player_info["signed"].replace( " ", "").replace(",", "").replace("$", "") try: signed = int(signed) except ValueError: signed = None schoolcity = player_info["school"] gradecountry = player_info["year"] commit = player_info["signed"] else: signed = 
player_info["signed"] schoolcity = None gradecountry = None commit = None entry["drafted"] = drafted entry["signed"] = signed entry["school_city"] = schoolcity entry["grade_country"] = gradecountry entry["college_commit"] = commit if list_type not in ("int", "draft"): eta = player_info["eta"] try: pre_top100 = player_info["preseason100"] except KeyError: pre_top100 = None else: pre_top100 = None eta = None entry["pre_top100"] = pre_top100 entry["eta"] = eta entry["twitter"] = player_info["twitter"] blurb = player_info["content"]["default"].replace( "<b>", "").replace("</b>", "").replace("<br />", "").replace( "<p>", "").replace("</p>", "").replace("*", "") entry["blurb"] = blurb try: overall_text = blurb.split("Overall")[1].split( '\n')[0].replace(':', '').replace(' ', '')[:8] if overall_text[0] not in (' ', ':', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'): raise IndexError try: text2 = overall_text.split('/')[1] except IndexError: text2 = overall_text.split('/')[-1] overall = int(filter(str.isdigit, text2[:2])) except IndexError: overall = 0 if overall < 20 and overall is not None: overall = overall * 10 entry["FV"] = overall entries.append(entry) if list_type == "draft": table = "mlb_prospects_draft" elif list_type == "int": table = "mlb_prospects_international" else: table = "mlb_prospects_professional" for e in entries: raw_input(e)
def parse_prospect(rnk, year, prospect, team): prospect_type = (team if team in ("draft", "international") else "professional") entry = {} def print_prospect_details (prospect): def print_dict(k, v, lvl): for num in range(1, lvl): print "\t", if type(v) is dict: print k for y, z in j.items(): print_dict(y, z, lvl+1) else: print (str(k)[:20] if len(str(k)) > 20 else str(k).ljust(20)), "\t", ("SOME LIST" if type(v) is list else v) for a, b in prospect.items(): print_dict(a, b, 1) def process_grades(year, grades_id, grades, player_type, prospect_type): grade_entry = {"year":year, "grades_id":grades_id, "prospect_type":prospect_type} fv = 0 for g in grades: if g.get("key") is None: continue if g.get("key").lower().strip() == "overall": fv = g.get("value") elif g.get("key").lower().strip() not in ("fastball", "change", "curve", "slider", "cutter", "splitter", "control", "hit", "power", "run", "arm", "field", "speed", "throw", "defense"): grade_entry["other"] = g.get("value") else: if g.get("key").lower().strip() == "speed": grade_entry["run"] = g.get("value") elif g.get("key").lower().strip() == "throw": grade_entry["arm"] = g.get("value") elif g.get("key").lower().strip() == "defense": grade_entry["field"] = g.get("value") else: grade_entry[g.get("key").lower().strip()] = g.get("value") if "hit" in grade_entry or "field" in grade_entry: grades_table = "mlb_grades_hitters" elif "control" in grade_entry or "fastball" in grade_entry: grades_table = "mlb_grades_pitchers" else: print "\n\n\n", grades, "\n\n\n" return fv db.insertRowDict(grade_entry, grades_table, insertMany=False, replace=True, rid=0,debug=1) db.conn.commit() return fv # print_prospect_details(prospect) mlb_id = prospect.get("player").get("id") fname = prospect.get("player").get("useName") lname = prospect.get("player").get("lastName") input_name = fname + " " + lname helper2.input_name(input_name) fname, lname = helper.adjust_mlb_names(mlb_id, fname, lname) position = 
prospect.get("player").get("positionAbbreviation") position = helper.adjust_mlb_positions(mlb_id, position) entry["year"] = year entry["rank"] = rnk entry["mlb_id"] = mlb_id entry["fname"] = fname entry["lname"] = lname entry["position"] = position try: dob = prospect.get("player").get("birthDate") byear = dob.split("-")[0] bmonth = dob.split("-")[1] bday = dob.split("-")[2] except IndexError: print "\n\nNO BIRTHDAY", fname, lname, mlb_id, "\n\n" prospect_id = helper.add_prospect(mlb_id, fname, lname, byear, bmonth, bday, p_type=prospect_type) if prospect_id == 0 or prospect_id is None: grades_id = mlb_id else: grades_id = prospect_id entry["birth_year"] = byear entry["birth_month"] = bmonth entry["birth_day"] = bday entry["prospect_id"] = prospect_id entry["grades_id"] = grades_id bats = prospect.get("player").get("batSideCode") throws = prospect.get("player").get("pitchHandCode") weight = prospect.get("player").get("weight") try: height = prospect.get("player").get("height").replace("\"","").split("'") height = int(height[0])*12+int(height[1]) except (IndexError, ValueError, AttributeError): height = None entry["bats"] = bats entry["throws"] = throws entry["weight"] = weight entry["height"] = height try: team = prospect.get("player").get("currentTeam").get("parentOrgName") except (AttributeError): team = None entry["team"] = team commit = prospect.get("prospectSchoolCommitted") entry["college_commit"] = commit eta = prospect.get("eta") entry["eta"] = eta hit_fv = None pitch_fv = None if prospect.get("gradesHitting") is not None and prospect.get("gradesHitting") != []: hit_grades = prospect.get("gradesHitting") hit_fv = process_grades(year, grades_id, hit_grades, "hit", prospect_type) if prospect.get("gradesPitching") is not None and prospect.get("gradesPitching") != []: pitch_grades = prospect.get("gradesPitching") pitch_fv = process_grades(year, grades_id, pitch_grades, "pitch", prospect_type) fv = max(hit_fv, pitch_fv) entry["FV"] = fv blurbs = 
prospect.get("prospectBio") sorted_blurbs = sorted(blurbs, key=lambda k:k["contentTitle"], reverse=True) cleaned_blurbs = [] for i,b in enumerate(sorted_blurbs): if b.get("contentText") is None: sorted_blurbs[i] = None else: blurbtext = str(b.get("contentTitle")) + b.get("contentText").replace("<b>","").replace("</b>","").replace("<br />","").replace("<p>","\n").replace("</p>","").replace("*","").replace("<strong>","").replace("</strong>","") blurbtext = "".join([j if ord(j) < 128 else "" for j in blurbtext]) cleaned_blurbs.append(blurbtext) blurb = "\n\n".join(cleaned_blurbs) entry["blurb"] = blurb # raw_input(entry) return entry