def parse_player_page(source, url):
    """Scrape one player page and persist the player plus all their matches.

    Parameters:
        source: raw HTML of the player page.
        url: URL of the page; stored on the Player record.

    Returns:
        A de-duplicated list of URLs to other player pages (for the crawler),
        or an empty list when the page carries no bio table.

    Side effects:
        Replaces any existing Player row of the same name and all Match rows
        involving that name, then inserts the freshly scraped data.
    """
    print('soupifying...')
    t0 = time.time()
    session = mysql_session(ensure_created=True)
    soup = BeautifulSoup(source, "lxml")
    t1 = time.time()
    print('Done in %s' % (t1 - t0,))
    t0 = t1

    print('scraping player...')
    # Scrape the player bio and build the Player object.
    p = Player()
    p.url = url
    bio = soup.find(id='bio')
    # NOTE: pages with no bio element or no bio table carry no player data --
    # skip them entirely.  (Guarding bio itself fixes an AttributeError when
    # the #bio element is missing.)
    if bio is None:
        return []
    table = bio.find('table')
    if table is None:
        return []
    cells = table.find_all('td')
    p.name, p.country = player_name_parse(cells[0].text)
    p.twitter = player_twitter_parse(cells[1])
    p.dob = date_parse(cells[2].text)
    p.plays = player_plays_parse(cells[3].text)
    p.atp_rank = player_atp_parse(cells[4].text)
    try:
        p.peak_atp_rank, p.peak_date = player_peak_atp_parse(cells[5].text)
    except AttributeError:
        # Best effort: some pages lack the peak-rank cell; leave fields unset.
        pass
    t1 = time.time()
    print('Done in %s' % (t1 - t0,))
    t0 = t1

    print('saving player...')
    # Commit the player to the database (populating p.id).
    # >> if a player row with this name already exists, replace it.
    # count() avoids materializing every matching row just to test existence.
    if session.query(Player).filter(Player.name == p.name).count() > 0:
        session.query(Player).filter(Player.name == p.name).delete()
    session.add(p)
    session.commit()
    # Delete this player's existing matches before re-inserting them below.
    session.query(Match).filter(
        or_(Match.p1_name == p.name, Match.p2_name == p.name)).delete()
    t1 = time.time()
    print('Done in %s' % (t1 - t0,))
    t0 = t1

    print('scraping matching...')
    # Scrape the matches table (first two rows are headers) and create the
    # corresponding Match objects.
    # >> also track the player-name URLs found in each row for the crawler.
    matches = soup.find(id='matches').find_all('tr')[2:]
    urls = []
    ms = []
    for match in matches:
        cells = match.find_all('td')
        m = Match()
        m.date = date_parse(cells[0].text)
        m.tournament = cells[1].text
        m.surface = cells[2].text
        m.tournament_round = match_round_parse(cells[3].text)
        m.p1_rank = match_rank_parse(cells[4].text)
        m.p2_rank = match_rank_parse(cells[5].text)
        m.p1_name, m.p1_id, m.p2_name, m.p2_id, m_urls = match_players_parse(
            cells[6], p.name, p.id)
        urls += m_urls
        session.add(m)
        ms.append(m)
    t1 = time.time()
    print('Done in %s' % (t1 - t0,))
    t0 = t1

    print('committing...')
    # Keep only unique URLs (links to other player pages for the crawler).
    urls = unique_urls(urls)
    # >> return urls for crawling
    session.commit()
    session.close()
    return urls
import csv
import os
import sys

import django

# Full path to your django project directory.
DJANGO_PROJECT = "/Users/Emanon805/Desktop/TIY/week4/day2/homework/arsenal-app/mysite/polls"
sys.path.append(DJANGO_PROJECT)
# BUG FIX: DJANGO_SETTINGS_MODULE must be set *before* django.setup(),
# otherwise setup() raises ImproperlyConfigured.
os.environ['DJANGO_SETTINGS_MODULE'] = 'settings'
django.setup()

# Django models can only be imported after setup(), so this import is
# deliberately below it.
from models import Player  # noqa: E402

PATH = 'invincibles.csv'

with open(PATH, 'r') as f:
    # Explicit fieldnames: the file's own header row is skipped below.
    reader_options = {
        'fieldnames': ('kit_num', 'name', 'pos', 'dob', 'nat',
                       'apps', 'goals', 'assists', 'mins'),
        'delimiter': ',',
    }
    reader = csv.DictReader(f, **reader_options)
    next(reader, None)  # skip the header row
    for row in reader:
        p = Player()
        print(p)
        # Copy each CSV column onto the model field of the same name.
        p.kit_num = row['kit_num']
        p.name = row['name']
        p.pos = row['pos']
        p.dob = row['dob']
        p.nat = row['nat']
        p.apps = row['apps']
        p.goals = row['goals']
        p.assists = row['assists']
        p.mins = row['mins']
        p.save()