def start_requests(self):
    headers = None
    if os.getenv("CF_ACCESS_CLIENT_ID") and os.getenv("CF_ACCESS_CLIENT_SECRET"):
        headers = {
            "CF-Access-Client-Id": os.getenv("CF_ACCESS_CLIENT_ID"),
            "CF-Access-Client-Secret": os.getenv("CF_ACCESS_CLIENT_SECRET")
        }
    elif os.getenv("IAP_AUTH_CLIENT_ID") and os.getenv("IAP_AUTH_SERVICE_ACCOUNT_JSON"):
        iap_token = IAPAuth(
            client_id=os.getenv("IAP_AUTH_CLIENT_ID"),
            service_account_secret_dict=json.loads(
                os.getenv("IAP_AUTH_SERVICE_ACCOUNT_JSON")),
        )(requests.Request()).headers["Authorization"]
        headers = {"Authorization": iap_token}

    # We crawl according to the sitemap
    for url in self.sitemap_urls:
        yield Request(url,
                      callback=self._parse_sitemap,
                      headers=headers,
                      meta={
                          "alternative_links": DocumentationSpider.to_other_scheme(url)
                      },
                      errback=self.errback_alternative_link)

    # Redirection is neither an error (4XX status) nor a success (2XX) if dont_redirect=False, thus we force it
    # We crawl the start URL in order to ensure we didn't miss anything (even if we used the sitemap)
    for url in self.start_urls:
        yield Request(url,
                      callback=self.parse_from_start_url if self.scrape_start_urls else self.parse,
                      headers=headers,
                      # If we want to crawl (default behavior) without scraping, we still need to let the
                      # crawling spider acknowledge the content by parsing it with the built-in method
                      meta={
                          "alternative_links": DocumentationSpider.to_other_scheme(url)
                      },
                      errback=self.errback_alternative_link)
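# The one-liner `IAPAuth(...)(requests.Request()).headers["Authorization"]` above is
# compact, so here is the same pattern spelled out in isolation. This is a hedged
# sketch, not project code: it assumes the requests_iap package is installed and
# relies on the standard `requests` auth-callable protocol, where applying the auth
# object to a request mutates it and returns it with an Authorization header set.
# The helper name `_example_iap_authorization_header` is hypothetical.
def _example_iap_authorization_header():
    import json
    import os

    import requests
    from requests_iap import IAPAuth

    auth = IAPAuth(
        client_id=os.getenv("IAP_AUTH_CLIENT_ID"),
        service_account_secret_dict=json.loads(
            os.getenv("IAP_AUTH_SERVICE_ACCOUNT_JSON")),
    )
    # Applying the auth callable to a blank request returns that request
    # with the signed "Bearer <OIDC token>" header attached.
    signed_request = auth(requests.Request())
    return signed_request.headers["Authorization"]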
def run_config(config):
    config = ConfigLoader(config)
    CustomDownloaderMiddleware.driver = config.driver
    DocumentationSpider.NB_INDEXED = 0

    strategy = DefaultStrategy(config)

    meilisearch_helper = MeiliSearchHelper(
        config.app_id,
        config.api_key,
        config.index_uid,
        config.custom_settings
    )

    root_module = 'src.' if __name__ == '__main__' else 'scraper.src.'
    DOWNLOADER_MIDDLEWARES_PATH = root_module + 'custom_downloader_middleware.' + CustomDownloaderMiddleware.__name__
    DOWNLOADER_CLIENTCONTEXTFACTORY = root_module + 'scrapy_patch.' + CustomContextFactory.__name__
    DUPEFILTER_CLASS_PATH = root_module + 'custom_dupefilter.' + CustomDupeFilter.__name__

    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en",
    }  # Defaults for scrapy https://docs.scrapy.org/en/latest/topics/settings.html#default-request-headers

    if os.getenv("CF_ACCESS_CLIENT_ID") and os.getenv("CF_ACCESS_CLIENT_SECRET"):
        headers.update(
            {
                "CF-Access-Client-Id": os.getenv("CF_ACCESS_CLIENT_ID"),
                "CF-Access-Client-Secret": os.getenv("CF_ACCESS_CLIENT_SECRET"),
            }
        )
    elif os.getenv("IAP_AUTH_CLIENT_ID") and os.getenv("IAP_AUTH_SERVICE_ACCOUNT_JSON"):
        iap_token = IAPAuth(
            client_id=os.getenv("IAP_AUTH_CLIENT_ID"),
            service_account_secret_dict=json.loads(
                os.getenv("IAP_AUTH_SERVICE_ACCOUNT_JSON")
            ),
        )(requests.Request()).headers["Authorization"]
        headers.update({"Authorization": iap_token})

    DEFAULT_REQUEST_HEADERS = headers

    process = CrawlerProcess({
        'LOG_ENABLED': '1',
        'LOG_LEVEL': 'ERROR',
        'USER_AGENT': config.user_agent,
        'DOWNLOADER_MIDDLEWARES': {DOWNLOADER_MIDDLEWARES_PATH: 900},
        # Need to be > 600 to be after the redirectMiddleware
        'DOWNLOADER_CLIENTCONTEXTFACTORY': DOWNLOADER_CLIENTCONTEXTFACTORY,
        'DUPEFILTER_USE_ANCHORS': config.use_anchors,
        # Use our custom dupefilter in order to be scheme agnostic regarding link provided
        'DUPEFILTER_CLASS': DUPEFILTER_CLASS_PATH,
        'DEFAULT_REQUEST_HEADERS': DEFAULT_REQUEST_HEADERS,
    })

    process.crawl(
        DocumentationSpider,
        config=config,
        meilisearch_helper=meilisearch_helper,
        strategy=strategy
    )
    process.start()
    process.stop()

    # Kill browser if needed
    BrowserHandler.destroy(config.driver)

    if len(config.extra_records) > 0:
        meilisearch_helper.add_records(config.extra_records, "Extra records", False)

    print("")

    if DocumentationSpider.NB_INDEXED > 0:
        # meilisearch_helper.commit_tmp_index()
        print('Nb hits: {}'.format(DocumentationSpider.NB_INDEXED))
        config.update_nb_hits_value(DocumentationSpider.NB_INDEXED)
    else:
        print('Crawling issue: nbHits 0 for ' + config.index_uid)
        # meilisearch_helper.report_crawling_issue()
        exit(EXIT_CODE_NO_RECORD)
    print("")
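# A minimal entry-point sketch for run_config. This is an assumption for
# illustration, not the project's actual CLI: it presumes the scraper
# configuration (whatever ConfigLoader accepts, e.g. a JSON string or a path)
# is exposed through a CONFIG environment variable. The sketch is consistent
# with the `__name__ == '__main__'` check run_config performs when building
# its module paths, which implies the module is meant to be run directly.
if __name__ == '__main__':
    from os import environ

    run_config(environ['CONFIG'])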