def insert_player(player_name, player_dob, json_uuid, filename):
    """Inserts a player into the database if that player does not already
    exist. Returns the player_id.

    Args:
        player_name: the player's full name.
        player_dob: the player's date of birth as an ISO format string.
        json_uuid: the UUID assigned to the player in the JSON seed file.
        filename: path to the SQLite database file.
    """
    conn = sqlite3.connect(filename)
    c = conn.cursor()
    # Normalise the name once so the INSERT and the lookup below agree.
    name = saltytools.normalise_caseless(player_name)
    values = (name, player_dob, json_uuid)
    # NULL lets SQLite assign the auto-incrementing player_id.
    c.execute('INSERT OR IGNORE INTO PLAYER VALUES (NULL, ?, ?, ?)', values)
    conn.commit()
    c.execute('''
        SELECT player_id
        FROM PLAYER
        WHERE player_name = ?''', (name, ))
    player_row = c.fetchone()
    player_id = player_row[0]
    conn.close()
    return player_id
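
# insert_player() assumes a PLAYER table with four columns in the order bound
# by the INSERT above: an auto-incrementing player_id plus name, DOB and UUID.
# The schema itself is not part of this section, so the helper below is only a
# minimal sketch of what that table might look like. The column names
# player_dob and json_uuid are hypothetical (taken from the parameter names);
# only player_id and player_name are confirmed by the SELECT. A UNIQUE
# constraint on player_name is assumed, since that is what would make
# INSERT OR IGNORE skip players that already exist.
def create_player_table(filename):
    """Hypothetical helper: create the PLAYER table insert_player() expects."""
    conn = sqlite3.connect(filename)
    conn.execute('''
        CREATE TABLE IF NOT EXISTS PLAYER (
            player_id   INTEGER PRIMARY KEY,
            player_name TEXT UNIQUE,
            player_dob  TEXT,
            json_uuid   TEXT
        )''')
    conn.commit()
    conn.close()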
def main(args):
    """Generate a JSON file of player profile data, for seeding our database.
    """
    # init logging
    logging.basicConfig(filename='scrape_players_json.log',
                        level=logging.DEBUG,
                        format='%(asctime)s %(message)s',
                        datefmt='%m/%d/%Y %I:%M:%S %p')
    logging.info('Starting player scrape...')

    # seasons to process, descending order
    years = [2015, 2014, 2013, 2012, 2011, 2010, 2009, 2008, 2007]
    all_players = []

    for year in years:
        logging.info('Begin scraping page for year {}'.format(year))

        # make a urllib url object for the year's player list page
        list_url = urllib.parse.urlparse(
            'http://www.rugbyleagueproject.org/'
            'seasons/nrl-{}/players.html'.format(year))
        logging.info('Requesting {}'.format(list_url))
        try:
            list_soup = saltytools.make_soup(list_url, USER_AGENT)
        except requests.exceptions.HTTPError as err:
            logging.exception(err)
            break

        # find all links to individual player detail pages
        player_links = list_soup.find_all(href=re.compile(r'players/\d+'))

        # load individual player detail pages
        for link in player_links:
            # dict to fill with player data
            player = {}
            player['uuid'] = str(uuid.uuid4())

            # make url and parse
            player_url = urllib.parse.urlparse(
                'http://www.rugbyleagueproject.org' + link['href'])
            logging.info('Requesting {}'.format(player_url))
            try:
                player_soup = saltytools.make_soup(player_url, USER_AGENT)
            except requests.exceptions.HTTPError as err:
                logging.exception(err)
                break

            # process player name
            name = player_soup.h1.get_text()
            name = saltytools.normalise_caseless(name)
            player['name'] = name
            print(player['name'])
            logging.info('Player name: {}'.format(player['name']))

            # process player DOB
            born_dt = player_soup.find('dt', text='Born')
            if born_dt is None:
                player['dob'] = None
            else:
                dob = born_dt.find_next_sibling('dd').get_text()
                player['dob'] = dateutil.parser.parse(dob).date().isoformat()
            print(player['dob'])
            logging.info('Player DOB: {}'.format(player['dob']))

            # process player career
            nrl_year_td = player_soup.find_all('td',
                                               text=re.compile(r'NRL\s\d+'))
            # note: named career_years to avoid shadowing the season list above
            career_years = [int(td.get_text().split()[1])
                            for td in nrl_year_td]
            teams = [td.find_previous_sibling('td').get_text()
                     for td in nrl_year_td]
            career = dict(zip(career_years, teams))

            # drop {year: team} pairs prior to 2007 as we don't care about
            # those matches; normalise the teams from years we do care about
            for career_year in list(career):
                if career_year < 2007:
                    career.pop(career_year, None)
                else:
                    career[career_year] = saltytools.process_team(
                        career[career_year])
            player['career'] = career
            logging.info('Processed career')

            # Duplicate handling:
            # Because we process years in descending order, we don't need to
            # update the career information when we find the player a second
            # time. The first occurrence of the player will always be the
            # newest, and hence have the most up-to-date career info.
            is_duplicate = False
            for ex in all_players:
                if ex['name'] == player['name'] and ex['dob'] == player['dob']:
                    is_duplicate = True
                    break
            logging.info('Duplicate: {}'.format(is_duplicate))
            if not is_duplicate:
                all_players.append(player)

            time.sleep(SLEEP_TIME)
            # break  # end player list loop
        # break  # end year loop

    all_players_asc = sorted(all_players, key=itemgetter('name'))
    with open('players.json', 'w') as outfile:
        json.dump(all_players_asc, outfile)
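
# main() relies on the module-level constants USER_AGENT and SLEEP_TIME and on
# the saltytools helper module, none of which appear in this section; they are
# presumably defined near the imports above. The guard below is a minimal
# sketch of a command-line entry point, assuming the script is meant to be run
# directly; it is an assumption, not part of the original source.
if __name__ == '__main__':
    import sys
    main(sys.argv)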