import time


def espn_schedule():
    # Walk the 2003-2014 MLB seasons, sampling one day per week, and collect
    # ESPN game ids from the daily schedule pages.
    for year in [2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014]:
        time.sleep(30)
        for month in [3, 4, 5, 6, 7, 8, 9, 10]:
            for day in [1, 8, 15, 22, 29]:
                # The season only starts at the end of March, so skip the earlier March dates.
                if month == 3 and day != 29:
                    continue
                time.sleep(2)
                # Zero-pad into separate strings so the loop variables stay integers
                # and the month check above keeps working on later iterations.
                day_str = '%02d' % day
                month_str = '%02d' % month
                date = int('%s%s%s' % (year, month_str, day_str))
                browser = create_browser()
                page = browser.open('http://espn.go.com/mlb/schedule?date=%s' % date)
                html = parse_html(page.read())
                for tr in html.cssselect('.mod-content tr'):
                    for a in tr.cssselect('td a'):
                        link = a.get('href')
                        print link
                        try:
                            espn_id = int(link.split('?id=')[1])
                        except (IndexError, ValueError):
                            # Not a game link; try the next cell.
                            continue
                        print espn_id
                        get_or_create_espnid(espn_id=espn_id, date=date)
                        # Only the first game link per row is needed.
                        break
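# The helpers used above (create_browser, parse_html, get_or_create_espnid) are
# defined elsewhere in the scraper and are not shown here. A minimal sketch of
# what they might look like, assuming mechanize, lxml, and the classic
# ScraperWiki sqlite API; the exact behaviour of the real helpers may differ.

import mechanize
import lxml.html
import scraperwiki


def create_browser():
    # Assumed: a mechanize browser with a desktop User-agent and robots.txt ignored.
    browser = mechanize.Browser()
    browser.set_handle_robots(False)
    browser.addheaders = [('User-agent', 'Mozilla/5.0')]
    return browser


def parse_html(content):
    # Assumed: lxml is used so that .cssselect() is available on the result.
    return lxml.html.fromstring(content)


def get_or_create_espnid(espn_id, date):
    # Assumed: game ids are stored in the scraper's sqlite datastore, keyed on espn_id.
    scraperwiki.sqlite.save(unique_keys=['espn_id'],
                            data={'espn_id': espn_id, 'date': date})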
import urllib2

import scraperwiki


def main():
    global news_scraper
    # Present a browser-like User-agent and a Referer so the site serves the page.
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0'),
                         ('Referer', news_scraper.base_url)]
    urllib2.install_opener(opener)
    error_count = 0
    while error_count < 3:
        try:
            html = scraperwiki.scrape(news_scraper.base_url)
        except urllib2.URLError, e:
            print 'Cannot reach the server:',
            if hasattr(e, 'reason'):
                print e.reason
            elif hasattr(e, 'code'):
                print e.code
            error_count += 1
            # Retry; without this the code below would run with no page fetched.
            continue
        try:
            html = html.decode(news_scraper.page_encoding)
        except UnicodeDecodeError:
            # Decode word by word and drop whatever the page encoding cannot handle.
            encoded = ''
            for word in html.split(' '):
                try:
                    encoded += word.decode(news_scraper.page_encoding) + ' '
                except UnicodeDecodeError:
                    pass
            html = encoded.rstrip()
        num_of_article = scrape(parse_html(html))
        print 'Headline ,', num_of_article, 'article(s)'
        break
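# scrape() and the news_scraper config object are defined elsewhere. A rough,
# hypothetical sketch of the headline pass, assuming the article links can be
# reached with a CSS selector stored on news_scraper (headline_selector is a
# made-up attribute name, not part of the original scraper):

import scraperwiki


def scrape(html):
    # Assumed: html is an lxml tree, as produced by parse_html() above.
    count = 0
    for a in html.cssselect(news_scraper.headline_selector):
        scraperwiki.sqlite.save(unique_keys=['url'],
                                data={'url': a.get('href'),
                                      'title': a.text_content().strip()})
        count += 1
    return count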
import time
import urllib2

import scraperwiki


def main():
    global base_url, page_num_of_article, page_start, page_step, page_encoding, page_sleep, count
    count = 0
    # Resume from the last page scraped; -1 means this is a fresh run.
    last_page = scraperwiki.sqlite.get_var('last_page', -1)
    latest_article = None
    start_over = False
    if last_page == -1:
        last_page = page_start
        latest_article = scraperwiki.sqlite.get_var('latest_article', None)
        start_over = True
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0'),
                         ('Referer', base_url)]
    urllib2.install_opener(opener)
    error_count = 0
    num_of_article = page_num_of_article
    # Keep paging while each page yields a full page's worth of articles.
    while num_of_article == page_num_of_article:
        page_url = build_url(last_page)
        try:
            html = scraperwiki.scrape(page_url)
        except urllib2.URLError, e:
            print 'Cannot reach the server:',
            if hasattr(e, 'reason'):
                print e.reason
            elif hasattr(e, 'code'):
                print e.code
            error_count += 1
            if error_count < 3:
                continue
            else:
                break
        try:
            html = html.decode(page_encoding)
        except UnicodeDecodeError:
            # Decode word by word and drop whatever the page encoding cannot handle.
            encoded = ''
            for word in html.split(' '):
                try:
                    encoded += word.decode(page_encoding) + ' '
                except UnicodeDecodeError:
                    pass
            html = encoded.rstrip()
        num_of_article = scrape(parse_html(html), latest_article, start_over)
        # Report a 1-based page number when the site starts counting at offset 0.
        page = last_page / page_step
        if page_start == 0:
            page += 1
        scraperwiki.sqlite.save_var('last_page', last_page)
        print 'Page', page, ',', num_of_article, 'article(s)'
        last_page += page_step
        if not page_exists(html, last_page):
            break
        time.sleep(page_sleep)
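# build_url() and page_exists() are site-specific helpers defined elsewhere. A
# hedged sketch under the assumption that the site paginates with a numeric
# offset appended to base_url; the 'page=' parameter name is an assumption:

def build_url(offset):
    # Assumed: the listing page takes the page offset as a query parameter.
    return '%s?page=%s' % (base_url, offset)


def page_exists(html, offset):
    # Assumed: the next page is only worth fetching if the current listing links to it.
    return ('page=%s' % offset) in html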