Example #1
    def save(self, page):
        # Look up the Seed for this scraper's schema the first time save()
        # is called, and cache it on the instance.
        if not hasattr(self, 'seed'):
            self.seed = Seed.objects.get(schema__slug__exact=self.schema_name)
        # Store the document as a not-yet-geocoded article Page; is_pdf=True
        # marks the stored data as PDF content.
        p = Page.objects.create(
            seed=self.seed,
            url=page['url'],
            scraped_url=page['url'],
            html=page['data'],
            when_crawled=datetime.datetime.now(),
            is_article=True,
            is_pdf=True,
            is_printer_friendly=False,
            article_headline=self.get_headline(page),
            article_date=page['date'],
            has_addresses=None,
            when_geocoded=None,
            geocoded_by='',
            times_skipped=0,
        )
        save_locations_for_page(p)
        return p
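Every example in this listing ends with the same Page.objects.create() call, so it is worth keeping the implied Page model in mind. The sketch below is reconstructed purely from the keyword arguments used in these examples; the field types, lengths, and defaults are assumptions, not the real model definition.

# Hypothetical reconstruction of the Page model implied by the create() calls
# in these examples (old-style Django, matching the era of the code above).
# Field names come from the calls; types, lengths, and defaults are guesses.
from django.db import models

class Page(models.Model):
    seed = models.ForeignKey('Seed')               # the feed/site the page was crawled from
    url = models.URLField(max_length=512)          # public URL; the feed updater skips URLs over 512 chars
    scraped_url = models.URLField(max_length=512)  # URL actually fetched, e.g. a printer-friendly variant
    html = models.TextField()                      # raw page contents
    when_crawled = models.DateTimeField()
    is_article = models.BooleanField(default=False)
    is_pdf = models.BooleanField(default=False)
    is_printer_friendly = models.BooleanField(default=False)
    article_headline = models.CharField(max_length=255, blank=True)
    article_date = models.DateField(null=True, blank=True)
    has_addresses = models.NullBooleanField()      # None means "not yet examined for addresses"
    when_geocoded = models.DateTimeField(null=True, blank=True)
    geocoded_by = models.CharField(max_length=100, blank=True)
    times_skipped = models.IntegerField(default=0)
    robot_report = models.TextField(blank=True)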
Example #2
    def save_page(self, unique_id):
        """
        Downloads the page with the given unique ID (possibly a numeric ID, or
        a URL) and saves it as a Page object. Returns the Page object, or None
        if the page couldn't be found.

        The page won't be retrieved/saved if it's already in the database. In
        this case, the existing Page object will be returned.
        """
        self.logger.debug('save_page(%s)', unique_id)
        retrieval_url = self.retrieval_url(unique_id)
        public_url = self.public_url(unique_id)

        try:
            p = Page.objects.get(seed__id=self.seed.id, url=public_url)
        except Page.DoesNotExist:
            pass
        else:
            self.logger.debug('Skipping already-saved URL %s', public_url)
            return p

        try:
            html = self.retriever.get_html(retrieval_url).strip()
        except (RetrievalError, UnicodeDecodeError):
            return None
        if not html:
            self.logger.debug('Got empty page for %s', retrieval_url)
            return None
        self.logger.debug('Got VALID page for %s', retrieval_url)

        m = self.date_headline_re.search(html)
        if not m:
            self.logger.debug('Could not find date/headline on %s', retrieval_url)
            return None
        article_date = m.group('article_date')
        article_headline = m.group('article_headline')
        try:
            article_date = parse_date(article_date, self.date_format)
        except ValueError:
            self.logger.debug('Got unparseable date %r on %s', article_date, retrieval_url)
            return None
        article_headline = strip_tags(article_headline)
        if len(article_headline) > 255:
            article_headline = article_headline[:252] + '...'

        p = Page.objects.create(
            seed=self.seed,
            url=public_url,
            scraped_url=retrieval_url,
            html=html,
            when_crawled=datetime.datetime.now(),
            is_article=True,
            is_pdf=False,
            is_printer_friendly=False,
            article_headline=article_headline,
            article_date=article_date,
            has_addresses=None,
            when_geocoded=None,
            geocoded_by='',
            times_skipped=0,
            robot_report='',
        )
        self.logger.debug('Created Page ID %s', p.id)
        save_locations_for_page(p)
        return p
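Per the docstring, save_page() has a three-way contract: it returns the already-stored Page when the URL has been seen before, the newly created Page on success, and None when retrieval or date/headline parsing fails. A minimal caller sketch under that contract; MyCityScraper and iter_unique_ids() are invented names standing in for a concrete scraper subclass and its ID enumeration, not part of the examples.

# Hypothetical driver loop; MyCityScraper and iter_unique_ids() are assumed
# names, not part of the code above.
scraper = MyCityScraper()
for unique_id in scraper.iter_unique_ids():
    page = scraper.save_page(unique_id)
    if page is None:
        # Retrieval failed, the page was empty, or the date/headline regex
        # did not match; save_page() has already logged the reason.
        continue
    # page is either a freshly created Page or the previously saved one.
    print('%s (%s)' % (page.article_headline, page.article_date))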
Example #3
class FeedUpdater(object):
    def __init__(self, seed, retriever, logger):
        self.seed = seed
        self.retriever = retriever
        self.logger = logger

    def update(self):
        try:
            feed = feedparser.parse(self.seed.url)
        except UnicodeDecodeError:
            self.logger.info('UnicodeDecodeError on %r', self.seed.url)
            return
        for entry in feed['entries']:
            if 'feedburner_origlink' in entry:
                url = entry['feedburner_origlink']
            elif 'pheedo_origLink' in entry:
                url = entry['pheedo_origLink']
            elif 'link' in entry:
                url = entry['link']
            else:
                continue  # Skip entries with no link.

            try:
                url = normalize_url(self.seed.base_url, url,
                                    self.seed.normalize_www)
            except Exception:
                self.logger.warn('Problem normalizing URL: %r, %r, %r',
                                 self.seed.base_url, url,
                                 self.seed.normalize_www)
                continue

            if not url:
                self.logger.info('Skipping article with empty URL: %r, %r',
                                 self.seed.base_url, url)
                continue

            if len(url) > 512:
                self.logger.warning('Skipping long URL %s', url)
                continue

            article_date = entry.get('updated_parsed') and datetime.date(
                *entry['updated_parsed'][:3]) or None
            if article_date and article_date > datetime.date.today():
                # Skip articles in the future, because sometimes articles show
                # up in the feed before they show up on the site, and we don't
                # want to retrieve the article until it actually exists.
                self.logger.info(
                    'Skipping article_date %s, which is in the future',
                    article_date)
                continue

            url = self.normalize_url(url)

            try:
                title = entry['title']
            except KeyError:
                self.logger.debug('Skipping %s due to missing title', url)
                continue

            if not self.download_page(url, title):
                self.logger.debug('Skipping %s due to download_page()', url)
                continue

            # If we've already retrieved the page, there's no need to retrieve
            # it again.
            try:
                Page.objects.filter(url=url)[0]
            except IndexError:
                pass
            else:
                self.logger.debug('URL %s has already been retrieved', url)
                continue

            # If this seed contains the full content in the RSS feed <summary>,
            # then we just use it instead of downloading the contents.
            if self.seed.rss_full_entry:
                is_printer_friendly = False
                try:
                    html = entry['summary']
                except KeyError:
                    html = entry['description']
            else:
                is_printer_friendly = False
                html = None
                time.sleep(self.seed.delay)

                # First, try deducing for the printer-friendly page, given the URL.
                print_url = self.get_printer_friendly_url(url)
                if print_url is not None:
                    try:
                        html = self.get_article_page(print_url)
                        is_printer_friendly = True
                    except Exception as e:
                        self.logger.info(
                            'Error retrieving supposedly accurate printer-friendly page %s: %s',
                            print_url, e)

                # If a printer-friendly page didn't exist, get the real page.
                if html is None:
                    try:
                        html = self.get_article_page(url)
                    except Exception as e:
                        self.logger.info('Error retrieving %s: %s', url, e)
                        continue

                    # If a page was downloaded, try looking for a printer-friendly
                    # link, and download that.
                    print_page = self.get_printer_friendly_page(html, url)
                    if print_page is not None:
                        is_printer_friendly = True
                        html = print_page

                new_html = self.scrape_article_from_page(html)
                if new_html is not None:
                    html = new_html

                if article_date is None:
                    article_date = self.scrape_article_date_from_page(html)

            if not html.strip():
                self.logger.debug('Got empty HTML page')
                continue

            article_headline = strip_tags(title)
            if len(article_headline) > 255:
                article_headline = article_headline[:252] + '...'
            p = Page.objects.create(
                seed=self.seed,
                url=url,
                scraped_url=(is_printer_friendly and print_url or url),
                html=html,
                when_crawled=datetime.datetime.now(),
                is_article=True,
                is_pdf=False,
                is_printer_friendly=is_printer_friendly,
                article_headline=article_headline,
                article_date=article_date,
                has_addresses=None,
                when_geocoded=None,
                geocoded_by='',
                times_skipped=0,
                robot_report='',
            )
            self.logger.info('Created %s story %r', self.seed.base_url,
                             article_headline)
            save_locations_for_page(p)
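FeedUpdater takes all of its collaborators through the constructor, so running it for one seed amounts to wiring up a seed, a retriever, and a logger and calling update(). A minimal sketch of that wiring; the slug, logger name, and retriever object are illustrative placeholders, and only the FeedUpdater(seed, retriever, logger) signature comes from the example above.

import logging

# Hypothetical wiring; the slug, logger name, and retriever are placeholders.
seed = Seed.objects.get(schema__slug__exact='news-articles')
retriever = SomeRetriever()  # stand-in; the concrete retriever class is not shown in these examples
logger = logging.getLogger('feed_updater')
FeedUpdater(seed, retriever, logger).update()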
Example #4
    def save_page(self, unique_id):
        """
        Downloads the page with the given unique ID (possibly a numeric ID, or
        a URL) and saves it as a Page object. Returns the Page object, or None
        if the page couldn't be found.

        The page won't be retrieved/saved if it's already in the database. In
        this case, the existing Page object will be returned.
        """
        self.logger.debug('save_page(%s)', unique_id)
        retrieval_url = self.retrieval_url(unique_id)
        public_url = self.public_url(unique_id)

        try:
            p = Page.objects.get(seed__id=self.seed.id, url=public_url)
        except Page.DoesNotExist:
            pass
        else:
            self.logger.debug('Skipping already-saved URL %s', public_url)
            return p

        try:
            html = self.retriever.fetch_data(retrieval_url).strip()
        except (RetrievalError, UnicodeDecodeError):
            return None
        if not html:
            self.logger.debug('Got empty page for %s', retrieval_url)
            return None
        self.logger.debug('Got VALID page for %s', retrieval_url)

        m = self.date_headline_re.search(html)
        if not m:
            self.logger.debug('Could not find date/headline on %s',
                              retrieval_url)
            return None
        article_date = m.group('article_date')
        article_headline = m.group('article_headline')
        try:
            article_date = parse_date(article_date, self.date_format)
        except ValueError:
            self.logger.debug('Got unparseable date %r on %s', article_date,
                              retrieval_url)
            return None
        article_headline = strip_tags(article_headline)
        if len(article_headline) > 255:
            article_headline = article_headline[:252] + '...'

        p = Page.objects.create(
            seed=self.seed,
            url=public_url,
            scraped_url=retrieval_url,
            html=html,
            when_crawled=datetime.datetime.now(),
            is_article=True,
            is_pdf=False,
            is_printer_friendly=False,
            article_headline=article_headline,
            article_date=article_date,
            has_addresses=None,
            when_geocoded=None,
            geocoded_by='',
            times_skipped=0,
            robot_report='',
        )
        self.logger.debug('Created Page ID %s', p.id)
        save_locations_for_page(p)
        return p