def __init__(self, content_xpath=None):
    """Initializes the HTML converter.

    Args:
        content_xpath (etree.XPath): an XPath pointing to the relevant
            html portions. Defaults to `//body` for fetching all the
            contents of the page.
    """
    super().__init__()
    self.content_xpath = content_xpath
    if content_xpath is None:
        self.content_xpath = XPathResource("//body")
    self.head_xpath = XPathResource("//head")
    self.link_xpath = XPathResource("//*[@href or @src]")
    self.style_xpath = XPathResource(
        "//style[@type = 'text/css' and contains(text(), 'url(')]")
    self.base_xpath = XPathResource(
        "//head/base/@href",
        after=[utility.defer("__getitem__", 0)])
    # treat windows specially
    if utility.PLATFORM == "win32":
        self.pdfkit_config = pdfkit.configuration(
            wkhtmltopdf=utility.path_in_project("wkhtmltopdf", True))
    else:
        self.pdfkit_config = pdfkit.configuration()
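
# The `after` keyword used throughout these XPath resources chains
# post-processing callables onto the raw XPath result (e.g. picking the
# first match via `defer("__getitem__", 0)`). The following is a minimal,
# illustrative sketch of that mechanism; `SimpleXPathResource` and `defer`
# here are stand-ins, not the project's actual implementation.
from lxml import etree


def defer(method_name, *args, **kwargs):
    """Return a callable invoking `method_name` on its future argument."""
    def _deferred(obj):
        return getattr(obj, method_name)(*args, **kwargs)
    return _deferred


class SimpleXPathResource:
    """Evaluate an XPath, then pipe the result through `after` in order."""

    def __init__(self, xpath, after=None):
        self.xpath = etree.XPath(xpath)
        self.after = after or []

    def __call__(self, tree):
        result = self.xpath(tree)
        for func in self.after:
            result = func(result)
        return result


# Example: fetch the first base-href of a parsed document.
# base = SimpleXPathResource("//head/base/@href",
#                            after=[defer("__getitem__", 0)])
# base(html.fromstring(page_source))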
class EBAPlugin(BasePlugin):
    CWD = "https://eba.europa.eu"
    """Directory to use when localizing the relative paths."""

    source_name = "EBA"
    """Name that should be displayed as source."""

    entry_path = XPathResource(
        "//tr[contains(normalize-space(@class), 'results-row')]/td[1]")
    date_path = XPathResource(
        ".//span[contains(normalize-space(@class), 'result-date')]/text()",
        after=[ut.defer("__getitem__", 0), ut.defer("strip"),
               _convert_dates])
    title_path = XPathResource(
        """
        ./div[contains(normalize-space(@class), 'search-result-title')]
        /a[1]/text()
        """,
        after=[ut.defer("__getitem__", 0), ut.defer("strip")])
    doc_path = XPathResource(
        """
        ./div[contains(normalize-space(@class), 'search-result-title')]
        /a[1]/@href
        """,
        after=[
            ut.defer("__getitem__", 0),
            ut.curry(_make_resource_path, cwd=CWD)
        ])

    def __init__(self, elastic):
        super().__init__(elastic)
        self.entry_resource = PaginatedResource(URL_TEMPLATE)

    def find_entries(self, page):
        docs = []
        for entry in self.entry_path(page):
            doc = ut.SDA({}, "N/A")
            title = self.title_path(entry)
            if not title:
                continue
            doc["metadata.title"] = title
            logging.info(f"Found document: {title}.")
            dates = self.date_path(entry)
            doc["metadata.date"] = dates.get("Last update",
                                             dt.datetime.now())
            doc["metadata.date_original"] = dates.get("Publication date",
                                                      dt.datetime.now())
            doc["metadata.url"] = self.doc_path(entry)
            docs.append(doc.a_dict)
        return docs

    def process_document(self, document, **kwargs):
        # This is a stub which does nothing: EBA does not deliver any
        # additional metadata or a detail url, therefore only the docs
        # are considered.
        return document
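
# `_make_resource_path` and `_convert_dates` are module-level helpers not
# shown in this excerpt. The sketches below illustrate plausible behaviour
# inferred from their call sites: relative hrefs anchored on CWD, and
# labelled dates returned as a dict so that find_entries() can call
# dates.get("Last update", ...). Both are assumptions, including the
# "%d/%m/%Y" date format.
import datetime as dt
from urllib.parse import urljoin


def _make_resource_path_sketch(path, cwd):
    # Anchor a (possibly relative) href on the source's base URL.
    return urljoin(cwd, path)


def _convert_dates_sketch(text):
    # Split cell text like "Last update: 15/01/2020; Publication date:
    # 10/01/2020" into {label: datetime}, skipping unparsable parts.
    dates = {}
    for part in text.split(";"):
        label, _, raw = part.partition(":")
        try:
            dates[label.strip()] = dt.datetime.strptime(raw.strip(),
                                                        "%d/%m/%Y")
        except ValueError:
            continue
    return dates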
class BubaNotificationsPlugin(BasePlugin):
    CWD = "https://www.bundesbank.de"
    """Directory to use when localizing the relative paths."""

    source_name = "Deutsche Bundesbank Mitteilungen"
    """Name that should be displayed as source."""

    entry_path = XPathResource(
        "//li[contains(normalize-space(@class), 'resultlist__item')]")
    date_path = XPathResource(
        ".//span[contains(normalize-space(@class), 'teasable__date')]"
        "/text()",
        after=[ut.defer("__getitem__", 0), ut.defer("strip"),
               _convert_dates])
    title_path = XPathResource(
        ".//div[contains(normalize-space(@class), 'teasable__title')]"
        "/div[contains(normalize-space(@class), 'h2')]"
        "/text()[normalize-space()]",
        after=[ut.defer("__getitem__", 0), ut.defer("strip")])
    doc_path = XPathResource(
        ".//a[contains(normalize-space(@class), 'teasable__link')]/@href",
        after=[
            ut.defer("__getitem__", 0),
            ut.curry(_make_resource_path, cwd=CWD)
        ])

    def __init__(self, elastic):
        super().__init__(elastic)
        pre_filled_url = buba_state_fetcher(URL_TEMPLATE)
        self.entry_resource = PaginatedResource(pre_filled_url, min_page=0)

    def find_entries(self, page):
        docs = []
        for entry in self.entry_path(page):
            doc = ut.SDA({}, "N/A")
            title = self.title_path(entry)
            if not title:
                continue
            doc["metadata.title"] = title
            logging.info(f"Found document: {title}.")
            date = self.date_path(entry)
            doc["metadata.date"] = date
            doc["metadata.url"] = self.doc_path(entry)
            docs.append(doc.a_dict)
        return docs

    def process_document(self, document, **kwargs):
        # This is a stub which does nothing: the Bundesbank does not
        # deliver any additional metadata or a detail url, therefore only
        # the docs are considered.
        return document
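
# The Bundesbank module has its own `_convert_dates`, also outside this
# excerpt. Unlike the EBA variant it feeds doc["metadata.date"] directly,
# so it plausibly returns a single datetime; the German "%d.%m.%Y" format
# is an assumption.
import datetime as dt


def _convert_dates_sketch(date_string):
    # Parse a teaser date like "15.01.2020"; fall back to now().
    try:
        return dt.datetime.strptime(date_string, "%d.%m.%Y")
    except ValueError:
        return dt.datetime.now()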
class SearchPlugin(BasePlugin):
    entry_path = XPathResource("//div[@class = 'g']")
    date_path = XPathResource(
        """
        .//div[@class = 's']/div/span[@class = 'st']/span[@class = 'f']/text()
        """,
        after=[ut.defer("__getitem__", 0), _convert_date])
    doc_path = XPathResource(
        """
        .//div[@class = 'rc']/div[@class = 'r']/a/@href
        """,
        after=[ut.defer("__getitem__", 0)])
    title_path = XPathResource(
        """
        .//div[@class = 'rc']/div[@class = 'r']/a/h3/text()
        """,
        after=[ut.defer("__getitem__", 0)])

    def __init__(self, elastic, **search_args):
        # make sure the searches only retrieve 20 results.
        super().__init__(elastic, fetch_limit=20)
        search_id = search_args.get("search_id")
        search = self.elastic.get_search(search_id)
        self.source_name = search.get("name", search_id)
        self.entry_resource = PaginatedResource(
            URL_TEMPLATE.format(**_query_from_search(search)),
            min_page=0, page_step=10,
            headers={"User-Agent": USER_AGENT})

    def find_entries(self, page):
        docs = []
        for entry in self.entry_path(page):
            doc = ut.SDA({}, "N/A")
            doc["metadata.url"] = self.doc_path(entry)
            doc["metadata.date"] = self.date_path(entry)
            doc["metadata.title"] = self.title_path(entry)
            doc["metadata.crawl_date"] = ut.from_date()
            docs.append(doc.a_dict)
        return docs

    def process_document(self, document, **kwargs):
        return document
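
# `PaginatedResource` (used by every plugin) is defined elsewhere. Below
# is a minimal sketch of its assumed contract: substitute a page counter
# into a `{page}` placeholder and yield parsed trees until a request
# fails. The real class presumably also handles retries and fetch limits;
# the placeholder name and stopping rule are assumptions.
import requests
from lxml import html


class SimplePaginatedResource:
    """Illustrative stand-in for the project's PaginatedResource."""

    def __init__(self, url_template, min_page=1, max_page=None,
                 page_step=1, headers=None):
        self.url_template = url_template
        self.min_page = min_page
        self.max_page = max_page
        self.page_step = page_step
        self.headers = headers or {}

    def __iter__(self):
        page = self.min_page
        while self.max_page is None or page <= self.max_page:
            resp = requests.get(self.url_template.format(page=page),
                                headers=self.headers)
            if resp.status_code != 200:
                break
            yield html.fromstring(resp.content)
            page += self.page_step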
class BafinPlugin(BasePlugin):
    CWD = "https://www.bafin.de"
    """Directory to use when localizing the relative paths."""

    source_name = "BaFin"
    """Name that should be displayed as source."""

    entry_path = XPathResource(
        "//div[contains(normalize-space(@class), 'search-result ')]")
    date_path = XPathResource(
        """
        ./h3/span/span[@class = 'metadata']
        /span[contains(text(), 'Erscheinung')]
        /following-sibling::text()
        """,
        after=[ut.defer("__getitem__", 0), _convert_date])
    detail_path = XPathResource(
        "./h3/a/@href",
        after=[
            ut.defer("__getitem__", 0),
            ut.curry(_make_resource_path, cwd=CWD)
        ])
    title_path = XPathResource("string(./h3/a)", after=[_unescape_string])
    type_path = XPathResource(
        """
        normalize-space(
            ./h3/span/span[@class = 'metadata']
            /span[contains(text(), 'Format:')]
            /following-sibling::text()
        )
        """,
        after=[ut.defer("split", ", ")])
    topic_path = XPathResource(
        """
        normalize-space(
            ./h3/span/span[@class = 'thema']/a/text()
        )
        """,
        after=[ut.defer("split", ", ")])
    doc_path = XPathResource(
        "./ul[@class = 'links']/li[1]/a/@href",
        after=[
            ut.defer("__getitem__", 0),
            ut.curry(_make_resource_path, cwd=CWD)
        ])
    content_path = XPathResource("//div[@id = 'content']")
    connected_path = XPathResource(
        ".//a[contains(@class, 'RichTextIntLink ')]/@href")

    def __init__(self, elastic):
        super().__init__(elastic)
        # bafin needs a faked user-agent in the headers.
        self.entry_resource = PaginatedResource(URL_TEMPLATE)

    def find_entries(self, page):
        docs = []
        for entry in self.entry_path(page):
            doc = ut.SDA({}, "N/A")
            doc["metadata.date"] = self.date_path(entry)
            doc["metadata.title"] = self.title_path(entry)
            doc["metadata.detail_url"] = self.detail_path(entry)
            doc["metadata.url"] = self.doc_path(entry)
            if doc["metadata.url"] is None:
                doc["metadata.url"] = doc["metadata.detail_url"]
            doc["metadata.topic"] = self.topic_path(entry)
            doc["metadata.type"] = self.type_path(entry)
            docs.append(doc.a_dict)
        return docs

    def process_document(self, document, **kwargs):
        doc = ut.SDA(document)
        resp = self.url_fetcher(doc["metadata.detail_url"])
        tree = html.fromstring(resp.content)
        content = self.content_path(tree)
        doc["metadata.mentionned"] = [
            _make_resource_path(e, self.CWD)
            for e in self.connected_path(content)
        ]
        return doc.a_dict
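
# `_unescape_string` and `_convert_date` are again helpers outside this
# excerpt. Hedged sketches based on their call sites: the title comes from
# an XPath string() and may carry HTML entities; the date trails the
# German "Erscheinung" label. The ": %d.%m.%Y" shape is an assumption.
import datetime as dt
from html import unescape


def _unescape_string_sketch(value):
    # Decode HTML entities and trim whitespace.
    return unescape(value).strip()


def _convert_date_sketch(date_string):
    # Parse text like ": 15.01.2020" trailing the metadata label.
    return dt.datetime.strptime(date_string.strip(" :"), "%d.%m.%Y")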
"""This function is used to fill the `state` attribute of a url-template
with a valid value.

Author: Johannes Mueller <*****@*****.**>
"""
from urllib.parse import urlparse, urlunparse

from lxml import html
import string

from crawlers.plugin import _retry_connection, XPathResource

import utility as ut

# Define an XPath for the token:
state_xpath = XPathResource(
    "//form[@id = 'formSearchAction']/input[@name = 'state']/@value",
    after=[ut.defer("__getitem__", 0)]
)


def partial_string_format(template, *args, **kwargs):
    """This function partially replaces arguments in a template string.

    Arguments:
        template (str): a template string
        *args (list): the positional replacements passed to format().
            No effect on the partial logic, just here for completeness.
        **kwargs (dict): the keyword replacements passed to format().

    Returns:
        str: a preformatted template string
    """
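
# The body of partial_string_format is cut off in this excerpt. Below is a
# possible completion, plus a sketch of the state fetcher the Bundesbank
# plugin imports from this module. The _KeepMissing trick, the search URL,
# and the (url, method) signature of _retry_connection are assumptions.
class _KeepMissing(dict):
    """Leave unknown placeholders intact instead of raising KeyError."""

    def __missing__(self, key):
        return "{" + key + "}"


def partial_string_format_sketch(template, *args, **kwargs):
    # e.g. "{state}&page={page}" with state="abc" -> "abc&page={page}".
    return string.Formatter().vformat(template, args, _KeepMissing(kwargs))


def buba_state_fetcher_sketch(url_template):
    # Fetch the search form once, extract the `state` token via
    # state_xpath and pre-fill only that placeholder, leaving `{page}`
    # for PaginatedResource.
    resp = _retry_connection("https://www.bundesbank.de/de/suche", "get")
    tree = html.fromstring(resp.content)
    return partial_string_format_sketch(url_template,
                                        state=state_xpath(tree))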
class EurlexPlugin(BasePlugin):
    CWD = "https://eur-lex.europa.eu"
    """Directory to use when localizing the relative paths."""

    source_name = "EurLex"
    """Name that should be displayed as source."""

    entry_path = XPathResource("//div[@class = 'SearchResult']")
    date_path = XPathResource(
        """
        .//dl/dd[preceding-sibling::dt[contains(text(), 'Date') or
                                       contains(text(), 'Datum')]]/text()
        """,
        after=[ut.defer("__getitem__", 0), _convert_date])
    doc_path = XPathResource(
        """
        .//ul[contains(@class, 'SearchResultDoc')]/li
        /a[contains(@href, 'PDF') or contains(@href, 'HTML')]/@href
        """,
        after=[
            ut.defer("__getitem__", 0),
            ut.curry(_make_resource_path, cwd=CWD)
        ])
    title_path = XPathResource(
        ".//h2/a[@class = 'title']/text()",
        after=[ut.defer("__getitem__", 0)])
    detail_path = XPathResource(
        ".//h2/a[@class = 'title']/@href",
        after=[
            ut.defer("__getitem__", 0),
            ut.curry(_make_resource_path, cwd=CWD)
        ])
    num_of_docs_path = XPathResource(
        """
        count(
            //div[@id = 'textTabContent']/
            div[contains(@class, 'tabContent') and
                not(contains(@class, 'documentSeparator'))]
        )
        """)
    meta_path = XPathResource("//dl[contains(@class, 'NMetadata')]/dd")
    key_path = XPathResource(
        "normalize-space(./preceding-sibling::dt[1])",
        after=[ut.defer("strip", " .:,;!?-_#")])
    value_path = XPathResource(
        """
        normalize-space(
            string-join(
                ./text() | .//*[self::span[@lang]
                                or self::a[not(child::span)]
                                or self::i[not(child::span)]]/text(),
                "#"
            )
        )
        """,
        after=[ut.defer("strip", " .:,;!?-_#"), ut.defer("split", "#")])

    def __init__(self, elastic):
        super().__init__(elastic, initial=True)
        # TODO remove min_page tag.
        self.entry_resource = PaginatedResource(URL_TEMPLATE,
                                                min_page=2338,
                                                max_page=9999)
        # register a string-join function for the lxml XPath
        ns = etree.FunctionNamespace(None)
        ns["string-join"] = _string_join

    def find_entries(self, page):
        docs = []
        for entry in self.entry_path(page):
            doc = ut.SDA({}, "N/A")
            doc["metadata.url"] = self.doc_path(entry)
            doc["metadata.date"] = self.date_path(entry)
            doc["metadata.title"] = self.title_path(entry)
            doc["metadata.detail_url"] = self.detail_path(entry)
            docs.append(doc.a_dict)
        return docs

    def process_document(self, document, **kwargs):
        doc = ut.SDA(document)
        resp = self.url_fetcher(doc["metadata.detail_url"])
        tree = html.fromstring(resp.content)
        for entry in self.meta_path(tree):
            key = self.key_path(entry)
            value = self.value_path(entry)
            doc[f"metadata.{key}"] = value
        # if there is more than one document included, use the HTML version.
        if self.num_of_docs_path(tree) > 1:
            doc["metadata.url"] = _get_html_version(doc["metadata.url"])
        return doc.a_dict
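
# Two helpers referenced above live outside this excerpt. `_string_join`
# backs the string-join() XPath extension registered in __init__: lxml
# extension functions receive the evaluation context first, then the
# XPath arguments (a node-set arrives as a list of smart strings).
# `_get_html_version` is sketched on the assumption that EUR-Lex links
# differ only in the format segment of the URL; both bodies are inferred,
# not taken from the source.
def _string_join_sketch(context, strings, separator):
    # e.g. string-join(.//text(), "#") inside value_path above.
    return separator.join(strings)


def _get_html_version_sketch(url):
    # Assumed URL shape: .../TXT/PDF/?uri=... -> .../TXT/HTML/?uri=...
    return url.replace("/PDF/", "/HTML/")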