def parse_player_page(source, url):

    print('soupifying...')
    t0 = time.time()

    session = mysql_session(ensure_created=True)
    soup = BeautifulSoup(source, "lxml")

    t1 = time.time()
    print('Done in %s' % (t1 - t0,))
    t0 = t1
    print('scraping player...')

    # scrape player bio and create player object
    p = Player()
    p.url = url
    bio = soup.find(id='bio')

    # NOTE: if there is no bio table, skip this page
    if bio is None or bio.find('table') is None:
        return []

    cells = bio.find('table').find_all('td')
    p.name, p.country = player_name_parse(cells[0].text)
    p.twitter = player_twitter_parse(cells[1])
    p.dob = date_parse(cells[2].text)
    p.plays = player_plays_parse(cells[3].text)
    p.atp_rank = player_atp_parse(cells[4].text)
    try:
        p.peak_atp_rank, p.peak_date = player_peak_atp_parse(cells[5].text)
    except AttributeError:
        # peak ranking may be missing, leave these fields unset
        pass

    t1 = time.time()
    print('Done in %s' % (t1 - t0,))
    t0 = t1
    print('saving player...')

    # commit player to database and get id
    # >> if the player already exists, replace it (delete is a no-op otherwise)
    session.query(Player).filter(Player.name == p.name).delete()
    session.add(p)
    session.commit()

    # delete matches first
    session.query(Match).filter(
        or_(Match.p1_name == p.name, Match.p2_name == p.name)).delete()

    t1 = time.time()
    print('Done in %s' % (t1 - t0,))
    t0 = t1
    print('scraping matches...')

    # scrape matches list and create corresponding list of match objects
    # >> also track player name urls
    matches = soup.find(id='matches').find_all('tr')[2:]
    urls = []
    ms = []
    for match in matches:
        cells = match.find_all('td')
        m = Match()
        m.date = date_parse(cells[0].text)
        m.tournament = cells[1].text
        m.surface = cells[2].text
        m.tournament_round = match_round_parse(cells[3].text)
        m.p1_rank = match_rank_parse(cells[4].text)
        m.p2_rank = match_rank_parse(cells[5].text)
        m.p1_name, m.p1_id, m.p2_name, m.p2_id, m_urls = match_players_parse(
            cells[6], p.name, p.id)
        urls += m_urls

        # check for uniqueness then add to db (uniqueness check currently disabled)
        # if session.query(Match).filter(and_(
        #         or_(and_(Match.p1_name == m.p1_name, Match.p2_name == m.p2_name),
        #             and_(Match.p1_name == m.p2_name, Match.p2_name == m.p1_name)),
        #         Match.date == m.date)).count() == 0:
        #     session.add(m)
        session.add(m)
        ms.append(m)

    t1 = time.time()
    print('Done in %s' % (t1 - t0,))
    t0 = t1
    print('committing...')

    # keep only unique urls (to other player pages for crawler)
    urls = unique_urls(urls)

    # >> return urls for crawling
    session.commit()
    session.close()
    return urls
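A minimal sketch of how parse_player_page might be driven by a crawler; the requests usage, seed URL, queue, and visited-set bookkeeping below are assumptions for illustration, not part of the original module.

import requests

def crawl(seed_url, limit=100):
    # breadth-first crawl over player pages, bounded by `limit`
    queue, visited = [seed_url], set()
    while queue and len(visited) < limit:
        url = queue.pop(0)
        if url in visited:
            continue
        visited.add(url)
        response = requests.get(url)
        # parse the page, persist the player and matches, collect new player urls
        new_urls = parse_player_page(response.text, url)
        queue.extend(u for u in new_urls if u not in visited)
    return visited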
Example #2
import csv
import os
import sys

import django

# Full path to your django project directory
djangoproject = "/Users/Emanon805/Desktop/TIY/week4/day2/homework/arsenal-app/mysite/polls"
sys.path.append(djangoproject)

# configure settings before django.setup() and before importing any models
os.environ['DJANGO_SETTINGS_MODULE'] = 'settings'
django.setup()

from models import Player


PATH = 'invincibles.csv'

with open(PATH, 'r') as f:
    categories = {
        'fieldnames': ('kit_num', 'name', 'pos', 'dob', 'nat', 'apps', 'goals', 'assists', 'mins'),
        'delimiter': ',',
    }
    reader = csv.DictReader(f, **categories)
    next(reader, None)  # skip the header row in the file

    for row in reader:
        p = Player()
        print(p)
        p.kit_num = row['kit_num']
        p.name = row['name']
        p.pos = row['pos']
        p.dob = row['dob']
        p.nat = row['nat']
        p.apps = row['apps']
        p.goals = row['goals']
        p.assists = row['assists']
        p.mins = row['mins']
        p.save()
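The import above assumes a Django Player model with fields matching the CSV columns. A minimal sketch of what that model might look like; the field types are guesses (dob is kept as text here since the raw CSV string is assigned directly).

from django.db import models

class Player(models.Model):
    # field names mirror the CSV columns read above
    kit_num = models.IntegerField()
    name = models.CharField(max_length=100)
    pos = models.CharField(max_length=20)
    dob = models.CharField(max_length=20)
    nat = models.CharField(max_length=50)
    apps = models.IntegerField()
    goals = models.IntegerField()
    assists = models.IntegerField()
    mins = models.IntegerField()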