Example #1
File: scraper.py  Project: haukurb/Reynir
    def scrape_root(self, root, helper):
        """ Scrape a root URL """

        t0 = time.time()
        # Fetch the root URL and scrape all child URLs that refer
        # to the same domain suffix and that we haven't seen before
        logging.info("Fetching root {0}".format(root.url))

        # Read the HTML document at the root URL
        html_doc = Fetcher.raw_fetch_url(root.url)
        if not html_doc:
            logging.warning("Unable to fetch root {0}".format(root.url))
            return

        # Parse the HTML document
        soup = Fetcher.make_soup(html_doc)

        # Obtain the set of child URLs to fetch
        fetch_set = Fetcher.children(root, soup)

        # Add the children whose URLs we don't already have to the
        # scraper articles table
        with SessionContext() as session:

            for url in fetch_set:

                if helper and helper.skip_url(url):
                    # The helper doesn't want this URL
                    continue

                # noinspection PyBroadException
                try:
                    article = ArticleRow(url=url, root_id=root.id)
                    # Leave article.scraped as NULL for later retrieval
                    session.add(article)
                    session.commit()
                except IntegrityError:
                    # Article URL already exists in database:
                    # roll back and continue
                    session.rollback()
                except Exception as e:
                    logging.warning(
                        "Roll back due to exception in scrape_root: {0}"
                        .format(e)
                    )
                    session.rollback()

        t1 = time.time()

        logging.info("Root scrape completed in {0:.2f} seconds".format(t1 - t0))
Example #2
    def urls2fetch(self, root, helper):
        """ Returns a set of URLs to fetch. If the scraper helper class has
            associated RSS feed URLs, these are used to acquire article URLs.
            Otherwise, the URLs are found by scraping the root website and
            searching for links to subpages. """
        fetch_set = set()
        feeds = None if helper is None else helper.feeds

        if feeds:

            for feed_url in feeds:
                logging.info("Fetching feed {0}".format(feed_url))
                try:
                    d = feedparser.parse(feed_url)
                except Exception as e:
                    logging.warning(
                        "Error fetching/parsing feed {0}: {1}".format(
                            feed_url, str(e)))
                    continue
                for entry in d.entries:
                    if entry.link and not helper.skip_rss_entry(entry):
                        fetch_set.add(entry.link)

        else:

            # Fetch the root URL and scrape all child URLs
            # that refer to the same domain suffix
            logging.info("Fetching root {0}".format(root.url))

            # Read the HTML document at the root URL
            html_doc = Fetcher.raw_fetch_url(root.url)
            if not html_doc:
                logging.warning("Unable to fetch root {0}".format(root.url))
                return fetch_set

            # Parse the HTML document
            soup = Fetcher.make_soup(html_doc)

            # Obtain the set of child URLs to fetch
            fetch_set = Fetcher.children(root, soup)

        return fetch_set
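urls2fetch() needs only two things from the helper: a feeds attribute listing RSS/Atom URLs and a skip_rss_entry() predicate. A hypothetical helper satisfying that interface might look like the sketch below; the feed URL and the filtering rule are made up for illustration and are not part of the original project.

    import feedparser

    class DemoHelper:
        # Hypothetical feed list; a real helper would point at a site's RSS feeds
        feeds = ["https://example.com/rss"]

        def skip_rss_entry(self, entry):
            # Illustrative rule: ignore entries that aren't article pages
            return "/podcast/" in entry.get("link", "")

    helper = DemoHelper()
    fetch_set = set()
    for feed_url in helper.feeds:
        d = feedparser.parse(feed_url)
        for entry in d.entries:
            if entry.get("link") and not helper.skip_rss_entry(entry):
                fetch_set.add(entry.link)
    print("{0} article URLs queued".format(len(fetch_set)))

One caveat: feedparser.parse() generally swallows fetch and parse errors, reporting them via the returned object's bozo flag, so the try/except in the original mainly guards against lower-level failures.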
Example #3
    def urls2fetch(self, root, helper):
        """ Returns a set of URLs to fetch. If the scraper helper class has
            associated RSS feed URLs, these are used to acquire article URLs.
            Otherwise, the URLs are found by scraping the root website and
            searching for links to subpages. """
        fetch_set = set()
        feeds = helper.feeds

        if feeds:
            for feed_url in feeds:
                logging.info("Fetching feed {0}".format(feed_url))
                try:
                    d = feedparser.parse(feed_url)
                except Exception as e:
                    logging.warning(
                        "Error fetching/parsing feed {0}: {1}".format(feed_url, str(e))
                    )
                    continue

                for entry in d.entries:
                    if entry.link and not helper.skip_rss_entry(entry):
                        fetch_set.add(entry.link)
        else:
            # Fetch the root URL and scrape all child URLs
            # that refer to the same domain suffix
            logging.info("Fetching root {0}".format(root.url))

            # Read the HTML document at the root URL
            html_doc = Fetcher.raw_fetch_url(root.url)
            if not html_doc:
                logging.warning("Unable to fetch root {0}".format(root.url))
                return fetch_set

            # Parse the HTML document
            soup = Fetcher.make_soup(html_doc)

            # Obtain the set of child URLs to fetch
            fetch_set = Fetcher.children(root, soup)

        return fetch_set
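Example #3 is a leaner variant of the same function: it drops Example #2's None guard, so it assumes a helper is always supplied (helper.feeds would otherwise raise AttributeError). Its fallback branch leans on the project's Fetcher internals; a rough stand-alone equivalent is sketched below, with requests and BeautifulSoup standing in for Fetcher.raw_fetch_url/make_soup, and a simple same-host check standing in for whatever filtering Fetcher.children() actually applies.

    from urllib.parse import urljoin, urlparse

    import requests
    from bs4 import BeautifulSoup

    def children(root_url):
        """ Return the set of same-host links found on the root page """
        resp = requests.get(root_url, timeout=10)
        if not resp.ok:
            return set()
        soup = BeautifulSoup(resp.text, "html.parser")
        root_host = urlparse(root_url).netloc
        fetch_set = set()
        for a in soup.find_all("a", href=True):
            url = urljoin(root_url, a["href"])  # resolve relative links
            if urlparse(url).netloc == root_host:
                fetch_set.add(url)
        return fetch_set

    print("{0} child URLs found".format(len(children("https://example.com/"))))

A production version would likely also normalize the collected URLs (strip fragments, canonicalize trailing slashes) before deduplicating them in the set.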