def save(self, page):
    """
    Saves the given page dict (with 'url', 'data' and 'date' keys) as a
    Page object and returns it.
    """
    if not hasattr(self, 'seed'):
        self.seed = Seed.objects.get(schema__slug__exact=self.schema_name)
    p = Page.objects.create(
        seed=self.seed,
        url=page['url'],
        scraped_url=page['url'],
        html=page['data'],
        when_crawled=datetime.datetime.now(),
        is_article=True,
        is_pdf=True,
        is_printer_friendly=False,
        article_headline=self.get_headline(page),
        article_date=page['date'],
        has_addresses=None,
        when_geocoded=None,
        geocoded_by='',
        times_skipped=0,
    )
    save_locations_for_page(p)
    return p
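
# A sketch of the dict contract save() expects, inferred from the lookups
# above: 'url' feeds url/scraped_url, 'data' feeds html (the raw file
# contents), and 'date' feeds article_date. The values here are purely
# illustrative, not from this codebase.
_example_pdf_page = {
    'url': 'http://example.com/reports/2009-03.pdf',
    'data': '%PDF-1.4 ...',
    'date': datetime.date(2009, 3, 14),
}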
def save_page(self, unique_id):
    """
    Downloads the page with the given unique ID (possibly a numeric ID, or
    a URL) and saves it as a Page object. Returns the Page object, or None
    if the page couldn't be found.

    The page won't be retrieved/saved if it's already in the database. In
    this case, the existing Page object will be returned.
    """
    self.logger.debug('save_page(%s)', unique_id)
    retrieval_url = self.retrieval_url(unique_id)
    public_url = self.public_url(unique_id)
    try:
        p = Page.objects.get(seed__id=self.seed.id, url=public_url)
    except Page.DoesNotExist:
        pass
    else:
        self.logger.debug('Skipping already-saved URL %s', public_url)
        return p
    try:
        html = self.retriever.get_html(retrieval_url).strip()
    except (RetrievalError, UnicodeDecodeError):
        return None
    if not html:
        self.logger.debug('Got empty page for %s', retrieval_url)
        return None
    self.logger.debug('Got VALID page for %s', retrieval_url)
    m = self.date_headline_re.search(html)
    if not m:
        self.logger.debug('Could not find date/headline on %s', retrieval_url)
        return None
    article_date = m.group('article_date')
    article_headline = m.group('article_headline')
    try:
        article_date = parse_date(article_date, self.date_format)
    except ValueError:
        self.logger.debug('Got unparseable date %r on %s', article_date, retrieval_url)
        return None
    article_headline = strip_tags(article_headline)
    if len(article_headline) > 255:
        article_headline = article_headline[:252] + '...'
    p = Page.objects.create(
        seed=self.seed,
        url=public_url,
        scraped_url=retrieval_url,
        html=html,
        when_crawled=datetime.datetime.now(),
        is_article=True,
        is_pdf=False,
        is_printer_friendly=False,
        article_headline=article_headline,
        article_date=article_date,
        has_addresses=None,
        when_geocoded=None,
        geocoded_by='',
        times_skipped=0,
        robot_report='',
    )
    self.logger.debug('Created Page ID %s', p.id)
    save_locations_for_page(p)
    return p
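
# parse_date() is used above but not defined in this excerpt. A minimal
# sketch of its assumed behavior, based on the call site (a date string
# plus a strptime-style self.date_format in, a datetime.date out, and
# ValueError raised on unparseable input):
def _parse_date_sketch(date_string, date_format):
    # Hypothetical stand-in, not the real helper; the real implementation
    # may also normalize whitespace or try several formats.
    return datetime.datetime.strptime(date_string.strip(), date_format).date()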
class FeedUpdater(object):
    def __init__(self, seed, retriever, logger):
        self.seed = seed
        self.retriever = retriever
        self.logger = logger

    def update(self):
        try:
            feed = feedparser.parse(self.seed.url)
        except UnicodeDecodeError:
            self.logger.info('UnicodeDecodeError on %r', self.seed.url)
            return
        for entry in feed['entries']:
            # Prefer the original article link for FeedBurner/Pheedo-proxied
            # feeds.
            if 'feedburner_origlink' in entry:
                url = entry['feedburner_origlink']
            elif 'pheedo_origLink' in entry:
                url = entry['pheedo_origLink']
            elif 'link' in entry:
                url = entry['link']
            else:
                continue # Skip entries with no link.
            try:
                url = normalize_url(self.seed.base_url, url, self.seed.normalize_www)
            except Exception:
                self.logger.warning('Problem normalizing URL: %r, %r, %r',
                    self.seed.base_url, url, self.seed.normalize_www)
                continue
            if not url:
                self.logger.info('Skipping article with empty URL: %r, %r',
                    self.seed.base_url, url)
                continue
            if len(url) > 512:
                self.logger.warning('Skipping long URL %s', url)
                continue
            if entry.get('updated_parsed'):
                article_date = datetime.date(*entry['updated_parsed'][:3])
            else:
                article_date = None
            if article_date and article_date > datetime.date.today():
                # Skip articles in the future, because sometimes articles show
                # up in the feed before they show up on the site, and we don't
                # want to retrieve the article until it actually exists.
                self.logger.info('Skipping article_date %s, which is in the future',
                    article_date)
                continue
            url = self.normalize_url(url)
            try:
                title = entry['title']
            except KeyError:
                self.logger.debug('Skipping %s due to missing title', url)
                continue
            if not self.download_page(url, title):
                self.logger.debug('Skipping %s due to download_page()', url)
                continue

            # If we've already retrieved the page, there's no need to retrieve
            # it again.
            try:
                Page.objects.filter(url=url)[0]
            except IndexError:
                pass
            else:
                self.logger.debug('URL %s has already been retrieved', url)
                continue

            # If this seed contains the full content in the RSS feed <summary>,
            # then we just use it instead of downloading the contents.
            if self.seed.rss_full_entry:
                is_printer_friendly = False
                try:
                    html = entry['summary']
                except KeyError:
                    html = entry['description']
            else:
                is_printer_friendly = False
                html = None
                time.sleep(self.seed.delay)

                # First, try deducing the printer-friendly URL from the
                # article URL.
                print_url = self.get_printer_friendly_url(url)
                if print_url is not None:
                    try:
                        html = self.get_article_page(print_url)
                        is_printer_friendly = True
                    except Exception, e:
                        self.logger.info('Error retrieving deduced printer-friendly page %s: %s',
                            print_url, e)

                # If a printer-friendly page didn't exist, get the real page.
                if html is None:
                    try:
                        html = self.get_article_page(url)
                    except Exception, e:
                        self.logger.info('Error retrieving %s: %s', url, e)
                        continue

                # If a page was downloaded, try looking for a printer-friendly
                # link, and download that.
                print_page = self.get_printer_friendly_page(html, url)
                if print_page is not None:
                    is_printer_friendly = True
                    html = print_page

                new_html = self.scrape_article_from_page(html)
                if new_html is not None:
                    html = new_html
                if article_date is None:
                    article_date = self.scrape_article_date_from_page(html)

            if not html.strip():
                self.logger.debug('Got empty HTML page')
                continue
            article_headline = strip_tags(title)
            if len(article_headline) > 255:
                article_headline = article_headline[:252] + '...'
            p = Page.objects.create(
                seed=self.seed,
                url=url,
                scraped_url=(is_printer_friendly and print_url or url),
                html=html,
                when_crawled=datetime.datetime.now(),
                is_article=True,
                is_pdf=False,
                is_printer_friendly=is_printer_friendly,
                article_headline=article_headline,
                article_date=article_date,
                has_addresses=None,
                when_geocoded=None,
                geocoded_by='',
                times_skipped=0,
                robot_report='',
            )
            self.logger.info('Created %s story %r', self.seed.base_url, article_headline)
            save_locations_for_page(p)
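
# A usage sketch for FeedUpdater, e.g. from a cron script. Note that
# update() also calls hooks not defined above (download_page(),
# get_printer_friendly_url(), get_article_page(), scrape_article_from_page(),
# normalize_url(), scrape_article_date_from_page()), so a concrete subclass
# must supply them. All names below are hypothetical stand-ins.
def _update_one_feed_sketch(feed_url):
    import logging
    seed = Seed.objects.get(url=feed_url)  # assumes Seed.url stores the feed URL
    retriever = UnicodeRetriever()         # hypothetical retriever class
    updater = SomeConcreteFeedUpdater(     # hypothetical FeedUpdater subclass
        seed, retriever, logging.getLogger('eb.feedupdater'))
    updater.update()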
def save_page(self, unique_id):
    """
    Downloads the page with the given unique ID (possibly a numeric ID, or
    a URL) and saves it as a Page object. Returns the Page object, or None
    if the page couldn't be found.

    The page won't be retrieved/saved if it's already in the database. In
    this case, the existing Page object will be returned.
    """
    self.logger.debug('save_page(%s)', unique_id)
    retrieval_url = self.retrieval_url(unique_id)
    public_url = self.public_url(unique_id)
    try:
        p = Page.objects.get(seed__id=self.seed.id, url=public_url)
    except Page.DoesNotExist:
        pass
    else:
        self.logger.debug('Skipping already-saved URL %s', public_url)
        return p
    try:
        html = self.retriever.fetch_data(retrieval_url).strip()
    except (RetrievalError, UnicodeDecodeError):
        return None
    if not html:
        self.logger.debug('Got empty page for %s', retrieval_url)
        return None
    self.logger.debug('Got VALID page for %s', retrieval_url)
    m = self.date_headline_re.search(html)
    if not m:
        self.logger.debug('Could not find date/headline on %s', retrieval_url)
        return None
    article_date = m.group('article_date')
    article_headline = m.group('article_headline')
    try:
        article_date = parse_date(article_date, self.date_format)
    except ValueError:
        self.logger.debug('Got unparseable date %r on %s', article_date, retrieval_url)
        return None
    article_headline = strip_tags(article_headline)
    if len(article_headline) > 255:
        article_headline = article_headline[:252] + '...'
    p = Page.objects.create(
        seed=self.seed,
        url=public_url,
        scraped_url=retrieval_url,
        html=html,
        when_crawled=datetime.datetime.now(),
        is_article=True,
        is_pdf=False,
        is_printer_friendly=False,
        article_headline=article_headline,
        article_date=article_date,
        has_addresses=None,
        when_geocoded=None,
        geocoded_by='',
        times_skipped=0,
        robot_report='',
    )
    self.logger.debug('Created Page ID %s', p.id)
    save_locations_for_page(p)
    return p
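
# save_page() relies on four pieces of per-scraper configuration:
# retrieval_url(), public_url(), date_headline_re and date_format. A
# hypothetical subclass sketch, assuming numeric story IDs; the class name,
# URLs and regex below are illustrative, not from this codebase:
import re

class _ExampleIdScraperSketch(object):
    # Regex providing the named groups save_page() expects.
    date_headline_re = re.compile(
        r'<p class="date">(?P<article_date>[^<]*)</p>\s*'
        r'<h1>(?P<article_headline>.*?)</h1>')
    date_format = '%B %d, %Y'  # strptime format handed to parse_date()

    def retrieval_url(self, unique_id):
        # URL actually fetched (stored as scraped_url).
        return 'http://example.com/news/print/%s/' % unique_id

    def public_url(self, unique_id):
        # Canonical URL stored on the Page (used for dupe detection).
        return 'http://example.com/news/story/%s/' % unique_id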