    def start_requests(self):
        # Optional auth headers for documentation protected by
        # Cloudflare Access or Google Cloud IAP
        headers = None
        if os.getenv("CF_ACCESS_CLIENT_ID") and os.getenv(
                "CF_ACCESS_CLIENT_SECRET"):
            headers = {
                "CF-Access-Client-Id": os.getenv("CF_ACCESS_CLIENT_ID"),
                "CF-Access-Client-Secret": os.getenv("CF_ACCESS_CLIENT_SECRET")
            }
        elif os.getenv("IAP_AUTH_CLIENT_ID") and os.getenv(
                "IAP_AUTH_SERVICE_ACCOUNT_JSON"):
            iap_token = IAPAuth(
                client_id=os.getenv("IAP_AUTH_CLIENT_ID"),
                service_account_secret_dict=json.loads(
                    os.getenv("IAP_AUTH_SERVICE_ACCOUNT_JSON")),
            )(requests.Request()).headers["Authorization"]
            headers = {"Authorization": iap_token}

        # We crawl according to the sitemap
        for url in self.sitemap_urls:
            yield Request(url,
                          callback=self._parse_sitemap,
                          headers=headers,
                          meta={
                              "alternative_links":
                              DocumentationSpider.to_other_scheme(url)
                          },
                          errback=self.errback_alternative_link)
        # Redirection is neither an error (4xx status) nor a success (2xx) when dont_redirect=False, so we force it

        # We crawl the start URL in order to ensure we didn't miss anything (Even if we used the sitemap)
        for url in self.start_urls:
            yield Request(
                url,
                callback=self.parse_from_start_url
                if self.scrape_start_urls else self.parse,
                headers=headers,
                # If we want to crawl (default behavior) without scraping, we still need to let the
                # crawling spider acknowledge the content by parsing it with the built-in method
                meta={
                    "alternative_links":
                    DocumentationSpider.to_other_scheme(url)
                },
                errback=self.errback_alternative_link)
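
A minimal sketch of the environment this auth branch expects (the variable names come from the getenv calls above; the values are placeholders, not real credentials):

import os

# Cloudflare Access service-token credentials (placeholders)
os.environ["CF_ACCESS_CLIENT_ID"] = "<cf-access-client-id>"
os.environ["CF_ACCESS_CLIENT_SECRET"] = "<cf-access-client-secret>"

# Or, for documentation behind Google Cloud IAP instead of Cloudflare Access:
# os.environ["IAP_AUTH_CLIENT_ID"] = "<iap-oauth-client-id>"
# os.environ["IAP_AUTH_SERVICE_ACCOUNT_JSON"] = '{"type": "service_account", ...}'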
Example #2
def run_config(config):
    config = ConfigLoader(config)
    CustomDownloaderMiddleware.driver = config.driver
    DocumentationSpider.NB_INDEXED = 0

    strategy = DefaultStrategy(config)

    meilisearch_helper = MeiliSearchHelper(
        config.app_id,
        config.api_key,
        config.index_uid,
        config.custom_settings
    )

    root_module = 'src.' if __name__ == '__main__' else 'scraper.src.'
    DOWNLOADER_MIDDLEWARES_PATH = root_module + 'custom_downloader_middleware.' + CustomDownloaderMiddleware.__name__
    DOWNLOADER_CLIENTCONTEXTFACTORY = root_module + 'scrapy_patch.' + CustomContextFactory.__name__
    DUPEFILTER_CLASS_PATH = root_module + 'custom_dupefilter.' + CustomDupeFilter.__name__

    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en",
    }  # Defaults for scrapy https://docs.scrapy.org/en/latest/topics/settings.html#default-request-headers

    if os.getenv("CF_ACCESS_CLIENT_ID") and os.getenv("CF_ACCESS_CLIENT_SECRET"):
        headers.update(
            {
                "CF-Access-Client-Id": os.getenv("CF_ACCESS_CLIENT_ID"),
                "CF-Access-Client-Secret": os.getenv("CF_ACCESS_CLIENT_SECRET"),
            }
        )
    elif os.getenv("IAP_AUTH_CLIENT_ID") and os.getenv("IAP_AUTH_SERVICE_ACCOUNT_JSON"):
        iap_token = IAPAuth(
            client_id=os.getenv("IAP_AUTH_CLIENT_ID"),
            service_account_secret_dict=json.loads(
                os.getenv("IAP_AUTH_SERVICE_ACCOUNT_JSON")
            ),
        )(requests.Request()).headers["Authorization"]
        headers.update({"Authorization": iap_token})

    DEFAULT_REQUEST_HEADERS = headers

    process = CrawlerProcess({
        'LOG_ENABLED': '1',
        'LOG_LEVEL': 'ERROR',
        'USER_AGENT': config.user_agent,
        'DOWNLOADER_MIDDLEWARES': {DOWNLOADER_MIDDLEWARES_PATH: 900},
        # Needs to be > 600 so it runs after the RedirectMiddleware
        'DOWNLOADER_CLIENTCONTEXTFACTORY': DOWNLOADER_CLIENTCONTEXTFACTORY,
        'DUPEFILTER_USE_ANCHORS': config.use_anchors,
        # Use our custom dupefilter so deduplication is scheme-agnostic for the links provided
        'DUPEFILTER_CLASS': DUPEFILTER_CLASS_PATH,
        'DEFAULT_REQUEST_HEADERS': DEFAULT_REQUEST_HEADERS,
    })

    process.crawl(
        DocumentationSpider,
        config=config,
        meilisearch_helper=meilisearch_helper,
        strategy=strategy
    )

    process.start()
    process.stop()

    # Kill browser if needed
    BrowserHandler.destroy(config.driver)

    if len(config.extra_records) > 0:
        meilisearch_helper.add_records(config.extra_records, "Extra records", False)

    print("")

    if DocumentationSpider.NB_INDEXED > 0:
        # meilisearch_helper.commit_tmp_index()
        print('Nb hits: {}'.format(DocumentationSpider.NB_INDEXED))
        config.update_nb_hits_value(DocumentationSpider.NB_INDEXED)
    else:
        print('Crawling issue: nbHits 0 for ' + config.index_uid)
        # meilisearch_helper.report_crawling_issue()
        exit(EXIT_CODE_NO_RECORD)
    print("")