Example #1
    def scrape(self):
        """Iterates through a single results page and extracts bids.

        This is implemented as follows:
          1. Download the results page.
          2. Extract the bid identifiers from this page.
          3. Check which of those identifiers are not yet in our database.
          4. For each of the identifiers not yet in our database:
            4.1. Download the detail page for that identifier.
            4.2. Extract the fields we are interested in.
            4.3. Create a Bid object and store it in the database.
        """
        session = Session()
        page = self.scraper.get(self.results_url)
        bid_ids = self.scrape_results_page(page.content)
        log.info("Found bid ids: {}".format(bid_ids))
        new_ids = get_new_identifiers(session, bid_ids, self.get_site())
        arg_tuples = [(self.scrape_bid_page, bid_id) for bid_id in new_ids]
        bids = execute_parallel(arg_tuples)
        session.bulk_save_objects(bids)
        session.commit()
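
Both examples rely on two helpers whose definitions are not shown: execute_parallel and get_new_identifiers. As a rough sketch, execute_parallel could be a thin wrapper around concurrent.futures, assuming it takes (callable, argument) tuples and returns the results in submission order; the thread-pool approach and the max_workers parameter here are assumptions, not the original implementation:

    from concurrent.futures import ThreadPoolExecutor

    def execute_parallel(arg_tuples, max_workers=8):
        # Hypothetical implementation: the real helper is not shown above.
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [executor.submit(func, arg) for func, arg in arg_tuples]
            # result() re-raises any exception from a worker, so a failed
            # detail-page scrape aborts the whole batch.
            return [future.result() for future in futures]

Because future.result() re-raises worker exceptions, a failure in any scrape_bid_page call propagates to the caller, which matches the exception-propagation behavior described by the comment in Example #2 below.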
Example #2
    def scrape(self):
        """Iterates through all of Commbuys and extracts bids.

        This is implemented as follows, starting on the first results page:
          1. Download the results page.
          2. Extract the bid identifiers from this page.
          3. Check which of those identifiers are not yet in our database.
          4. For each of the identifiers not yet in our database:
            4.1. Download the detail page for that identifier.
            4.2. Extract the fields we are interested in.
            4.3. Create a Bid object and store it in the database.
          5. Go to the next page. Repeat from step #1.
        """
        current_page = 1
        session = Session()
        while True:
            page = self.scraper.post(self.results_url,
                                     data={
                                         'mode': 'navigation',
                                         'currentPage': current_page
                                     })
            bid_ids = self.scrape_results_page(page.content)
            log.info("Results page {} found bid ids: {}".format(
                current_page, bid_ids))
            if not bid_ids:
                log.info("Page {} has no results. Done scraping.".format(
                    current_page))
                break
            new_ids = get_new_identifiers(session, bid_ids, self.get_site())
            # Scrape the new bid ids in parallel. Any underlying exceptions
            # are allowed to propagate to the caller and will abort the
            # entire scraping process.
            arg_tuples = [(self.scrape_bid_page, bid_id) for bid_id in new_ids]
            bids = execute_parallel(arg_tuples)
            # Save all the new bids from this results page in one db call.
            session.bulk_save_objects(bids)
            session.commit()
            current_page += 1
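
get_new_identifiers is the other undefined helper. Since both examples use a SQLAlchemy session and persist Bid objects, a plausible sketch queries for the identifiers already stored for the given site and returns the rest; the Bid schema here (identifier and site columns) is a hypothetical stand-in for the real model:

    from sqlalchemy import Column, String
    from sqlalchemy.orm import declarative_base

    Base = declarative_base()

    class Bid(Base):
        # Hypothetical schema: the real Bid model is not shown in the examples.
        __tablename__ = 'bids'
        identifier = Column(String, primary_key=True)
        site = Column(String)

    def get_new_identifiers(session, bid_ids, site):
        """Returns the subset of bid_ids not yet stored for this site."""
        existing = {
            row.identifier
            for row in session.query(Bid.identifier).filter(
                Bid.site == site, Bid.identifier.in_(bid_ids))
        }
        # Preserve the order in which identifiers appeared on the results page.
        return [bid_id for bid_id in bid_ids if bid_id not in existing]

Keeping the already-seen identifiers in a set makes the membership check O(1) per identifier, and returning a list preserves page order, so new bids are scraped in the order they were listed.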