Example #1
0
    def initiatives(self) -> Generator[InitiativeImport, None, None]:
        """Yield an InitiativeImport stub for every initiative found on the
        paginated listing pages.

        Pages are fetched sequentially; iteration stops when the parser
        cannot build a tree for a page (assumed to be past the last page)
        or when the safety cap on page count is reached.

        Raises:
            ScrapeException: if anything fails while loading or parsing a
                listing page (original error attached as the cause).
        """
        max_pages = 100  # safety cap so a broken pager cannot loop forever
        page_counter = 1
        try:
            while page_counter < max_pages:
                list_page_url = self.config.list_endpoint + f"&page={page_counter}"

                # schemas: defines fields to be scraped
                # schema: fieldname:{xpath,all,cast,transform}
                schemas = {'initiatives':
                               {'xpath': '//a[@href and contains(@class, "postpreview-content")]',
                                'all': True,
                                # pass the bound method directly; wrapping it
                                # in a lambda added nothing
                                'transform': self.find_initiative_links}}

                # initialize TreeParser using url and schemas, returns html tree
                initiative_parser = TreeParser(list_page_url, None, schemas)
                if initiative_parser.tree is None:
                    # no parsable page -> we ran past the last listing page
                    break

                output = initiative_parser.apply_schemas()
                for uri in output['initiatives']:
                    yield InitiativeImport(source_uri=uri[0])

                page_counter += 1
        except Exception as ex:
            raise ScrapeException("Error loading list of initiatives") from ex
Example #2
0
    def test_should_handle_scrape_exception(self):
        """The batch must end up in FAILED state when listing raises."""
        failing_listing = MagicMock(
            side_effect=ScrapeException("Failed loading the list"))
        self.pf_source_mock.initiatives = failing_listing

        self.scraper.scrape()

        batch = self.scraper.get_current_batch()
        assert batch.state == BatchImportState.FAILED
Example #3
0
    def test_should_log_listing_exception(self):
        """A listing failure must be reported via logger.exception exactly once."""
        self.pf_source_mock.initiatives = MagicMock(
            side_effect=ScrapeException("Failed loading the list"))

        self.scraper.scrape()

        expected_message = "Error while reading list of initiatives"
        self.logger_mock.exception.assert_called_once_with(expected_message)
Example #4
0
    def scrape_collection_exception(self):
        # NOTE(review): no "test_" prefix and no assertions — this reads like
        # an arrange/act helper rather than a collected test case; confirm
        # whether it was meant to be named test_scrape_collection_exception.
        single_initiative = iter([InitiativeImport(source_uri="test/123")])
        self.pf_source_mock.initiatives = MagicMock(return_value=single_initiative)
        self.pf_source_mock.complete = Mock(side_effect=ScrapeException("Test"))

        self.scraper.scrape()
Example #5
0
    def test_should_log_item_exception(self):
        """A failure while completing one item must be logged with its uri."""
        initiative = InitiativeImport(source_uri="test/123")
        self.pf_source_mock.initiatives = MagicMock(return_value=iter([initiative]))
        self.pf_source_mock.complete = MagicMock(
            side_effect=ScrapeException("Failed loading item"))

        self.scraper.scrape()

        self.logger_mock.exception.assert_called_once_with(
            "Error while collecting initiative test/123")
Example #6
0
    def complete(self, initiative: InitiativeImport):
        """Enrich *initiative* in place with fields scraped from its detail
        page; fall back to the source's configured location when the page
        yields none.

        Raises:
            ScrapeException: wrapping any error raised during fetch or parse.
        """
        try:
            # Robots.txt mentions 10 secs crawl delay.
            time.sleep(10)

            metadata = self.item_parser.get_session_metadata(initiative.source_uri)
            scraped_fields = self.item_parser.apply_schemas(
                metadata=metadata, url=initiative.source_uri)
            for field_name, field_value in scraped_fields.items():
                setattr(initiative, field_name, field_value)

            # Default the location when the detail page did not provide one.
            if not initiative.location:
                initiative.location = self.config.location
        except Exception as ex:
            raise ScrapeException(f"Error scraping {initiative.source_uri}") from ex