def main(args): """Use this function to generate a JSON file of player profile data, for seeding our database. """ # init logging logging.basicConfig(filename='scrape_players_json.log', level=logging.DEBUG, format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p') logging.info('Starting player scrape...') # seasons to process, descending order years = [2015, 2014, 2013, 2012, 2011, 2010, 2009, 2008, 2007] all_players = [] for year in years: logging.info('Begin scraping page for year {}'.format(year)) # make a urllib url object for the year's player list page list_url = urllib.parse.urlparse('http://www.rugbyleagueproject.org/' 'seasons/nrl-{}/players.html'.format( year)) logging.info('Requesting {}'.format(list_url)) try: list_soup = saltytools.make_soup(list_url, USER_AGENT) except requests.exceptions.HTTPError as err: logging.exception(err) break # find all links to individual player detail pages player_links = list_soup.find_all(href=re.compile('players/\d+')) # load individual player detail pages for link in player_links: # dict to fill with player data player = {} player['uuid'] = str(uuid.uuid4()) # make url and parse player_url = urllib.parse.urlparse( 'http://www.rugbyleagueproject.org' + link['href']) logging.info('Requesting {}'.format(player_url)) try: player_soup = saltytools.make_soup(player_url, USER_AGENT) except requests.exceptions.HTTPError as err: logging.exception(err) break # process player name name = player_soup.h1.get_text() name = saltytools.normalise_caseless(name) player['name'] = name print(player['name']) logging.info('Player name: {}'.format(player['name'])) # process player DOB born_dt = player_soup.find('dt', text='Born') if born_dt is None: player['dob'] = None else: dob = born_dt.find_next_sibling('dd').get_text() player['dob'] = dateutil.parser.parse(dob).date().isoformat() print(player['dob']) logging.info('Player DOB: {}'.format(player['dob'])) # process player career nrl_year_td = player_soup.find_all('td', text=re.compile('NRL\s\d+')) years = [int(td.get_text().split()[1]) for td in nrl_year_td] teams = [td.find_previous_sibling('td').get_text() for td in nrl_year_td] career = dict(zip(years, teams)) # drop {year: team} pairs prior to 2007 as we don't care about # these matches, normalise the teams from years we do care about for year in list(career): if year < 2007: career.pop(year, None) else: career[year] = saltytools.process_team(career[year]) ''' for year, team in raw_career.items(): if year < 2007: career.pop(year, None) else: career[year] = saltytools.process_team(team) ''' player['career'] = career logging.info('Processed career') # Duplicate handling: # Because we process years in descending ordering, we don't # need to update the career information when we find the # player a second time. # The first occurance of the player will always be the # newest, and hence have the most up-to-date career info. is_duplicate = False for ex in all_players: if ex['name'] == player['name'] and ex['dob'] == player['dob']: is_duplicate = True break logging.info('Duplicate: {}'.format(is_duplicate)) if not is_duplicate: all_players.append(player) time.sleep(SLEEP_TIME) # break # end player list loop # break # end year loop all_players_asc = sorted(all_players, key=itemgetter('name')) with open('players.json', 'w') as outfile: json.dump(all_players_asc, outfile)
import argparse
from urllib.parse import urlparse

import saltysql
import saltytools

# USER_AGENT, VERSION and SQLITE_FILE are module-level constants, and
# MatchDataAfl, MatchDataRlp and UnknownDatasourceError are classes,
# all defined elsewhere.


def main(args):
    # handle options
    parser = argparse.ArgumentParser(description='''
        SaltyStats: the NRL stats scraper''')
    parser.add_argument('type', choices=['match'])
    parser.add_argument('store', choices=['csv', 'sqlite'])
    parser.add_argument('url')
    parser.add_argument('--verbose', '-v', action='count')
    parser.add_argument('--version', action='version', version=VERSION)
    args = parser.parse_args()
    print('Arguments: ', vars(args))

    # parse URL
    urlobj = urlparse(args.url)

    # parse HTML w/BeautifulSoup
    match_soup = saltytools.make_soup(urlobj, USER_AGENT)

    # parse URL for datasource
    kw = saltytools.datasource_kw(urlobj)

    # choose object type based on datasource URL
    if kw == 'afltables':
        match_data = MatchDataAfl(match_soup)
    elif kw == 'rlproject':
        match_data = MatchDataRlp(match_soup)
    else:
        raise UnknownDatasourceError(
            "Couldn't parse datasource from {url}. "
            "Is this site supported?".format(url=urlobj.geturl()))

    print('\n### HOME ###')
    print('team name (raw): ', match_data.home)
    print('team name: ', saltytools.process_team(match_data.home))
    print('team score: ', match_data.home_score)
    print('team scrums: ', match_data.home_scrums)
    print('team penalties: ', match_data.home_penalties)
    print('players: ', str(match_data.home_players))
    print('try scorers: ', str(match_data.home_tryscorers))
    print('goal scorers: ', str(match_data.home_goalscorers))
    print('field goal scorers: ', str(match_data.home_fgoalscorers))

    print('\n### AWAY ###')
    print('team name (raw): ', match_data.away)
    print('team name: ', saltytools.process_team(match_data.away))
    print('team score: ', match_data.away_score)
    print('team scrums: ', match_data.away_scrums)
    print('team penalties: ', match_data.away_penalties)
    print('players: ', str(match_data.away_players))
    print('try scorers: ', str(match_data.away_tryscorers))
    print('goal scorers: ', str(match_data.away_goalscorers))
    print('field goal scorers: ', str(match_data.away_fgoalscorers))

    print('\n### META ###')
    print('ref(s): ', match_data.referees)
    print('venue (raw): ', match_data.venue)
    print('venue: ', saltytools.process_venue(match_data.venue))
    print('crowd: ', match_data.crowd)
    print('date: ', match_data.date)
    print('time: ', match_data.time)
    print('round: ', match_data.round)

    print('\n### STRUCTURE ###')
    print('__repr__: ', match_data)

    print('\n### OUTPUT ###')
    if args.store == 'csv':
        filename = 'export/{} vs {} {}.saltystats_{}.csv'.format(
            match_data.home, match_data.away, match_data.date, VERSION)
        print('write csv...')
        match_data.write_csv(filename)
    elif args.store == 'sqlite':
        print('do sql stuffs...')

        # setup db
        print('Initialising database "{}"'.format(SQLITE_FILE))
        print('Result: {}'.format(saltysql.create_database(SQLITE_FILE)))

        # seed db
        print('Seeding players from players.json')
        print('Result: {}'.format(saltysql.seed_players('players.json',
                                                        SQLITE_FILE)))

        # ensure home, away, venue and round exist & get their IDs
        home_canonical = saltytools.process_team(match_data.home)
        print('Home ID: {}'.format(saltysql.insert_team(home_canonical,
                                                        SQLITE_FILE)))
        away_canonical = saltytools.process_team(match_data.away)
        print('Away ID: {}'.format(saltysql.insert_team(away_canonical,
                                                        SQLITE_FILE)))
        venue_canonical = saltytools.process_venue(match_data.venue)
        print('Venue ID: {}'.format(saltysql.insert_venue(venue_canonical,
                                                          SQLITE_FILE)))
        print('Round ID: {}'.format(saltysql.insert_round(
            match_data.round, match_data.date.year, SQLITE_FILE)))

        print('Home players:')
        # For player disambiguation, we must find each player's uuid in the
        # json data file & use it to find the correct player_id.
        # First look up by name. This will work in 95% of cases.
        # If we get a name clash, we need to disambiguate:
        # - For match datasources that include a DOB, we can compare the
        #   incoming DOB to the DOBs of the clashing players in the json
        #   data file.
        # - Otherwise, we can look at the clashing players' careers in the
        #   json data file and find which one was playing for the incoming
        #   player's team at the time of the match in question.
        # We could also query the db to find if multiple players exist with
        # the name.
        # What is the minimum we know about a player at this stage? His
        # name, and that he played for a certain team in a certain year;
        # we can parse the json data file & find the right guy with this
        # info. (A sketch of this lookup follows after main(), below.)
        print(saltytools.find_player_uuid(
            match_data.home_players[13], match_data.home,
            match_data.date.year, 'players.json'))
    return 0
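# ---------------------------------------------------------------------------
# A minimal sketch of the lookup described in the comments above. The real
# saltytools.find_player_uuid() isn't shown in this file; this illustration
# assumes the players.json layout written by scrape_players_json (name, dob,
# career={year: team}, uuid) and may differ from the actual helper.
# ---------------------------------------------------------------------------
import json


def find_player_uuid(name, team, year, json_path):
    """Return the uuid of the player matching name/team/year, or None.

    Look up by name first (the 95% case); on a name clash, fall back to
    career info and pick the candidate who played for `team` in `year`.
    """
    with open(json_path) as f:
        players = json.load(f)

    # stored names were normalised, so normalise the incoming name the
    # same way before comparing
    name = saltytools.normalise_caseless(name)
    candidates = [p for p in players if p['name'] == name]
    if len(candidates) == 1:
        return candidates[0]['uuid']

    # name clash: disambiguate via the {year: team} career pairs
    # (json round-trips dict keys to strings, hence str(year))
    team = saltytools.process_team(team)
    for p in candidates:
        if p['career'].get(str(year)) == team:
            return p['uuid']
    return None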