Example #1
    def __init__(self, content_xpath=None):
        """Initializes the HTML converter.

        Args:
            content_xpath (XPathResource): an XPath pointing to the relevant
                HTML portions. Defaults to `//body`, which fetches the
                entire content of the page.
        """
        super().__init__()

        if content_xpath is None:
            content_xpath = XPathResource("//body")
        self.content_xpath = content_xpath
        self.head_xpath = XPathResource("//head")
        self.link_xpath = XPathResource("//*[@href or @src]")
        self.style_xpath = XPathResource(
            "//style[@type = 'text/css' and contains(text(), 'url(')]")
        self.base_xpath = XPathResource(
            "//head/base/@href", after=[utility.defer("__getitem__", 0)])
        # On Windows, point pdfkit to the bundled wkhtmltopdf binary.
        if utility.PLATFORM == "win32":
            self.pdfkit_config = pdfkit.configuration(
                wkhtmltopdf=utility.path_in_project("wkhtmltopdf", True))
        else:
            self.pdfkit_config = pdfkit.configuration()
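
For illustration, a minimal usage sketch of the constructor above. The enclosing class is not shown in this excerpt, so `HtmlConverter` is a hypothetical name:

# `HtmlConverter` stands in for whatever class this __init__ belongs to.
converter = HtmlConverter(
    content_xpath=XPathResource("//div[@id = 'main']"))  # override the //body default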
Example #2
class EBAPlugin(BasePlugin):

    CWD = "https://eba.europa.eu"
    """Directory to use when localizing the relative paths."""

    source_name = "EBA"
    """Name that should be displayed as source."""

    entry_path = XPathResource(
        "//tr[contains(normalize-space(@class), 'results-row')]/td[1]")
    date_path = XPathResource(
        ".//span[contains(normalize-space(@class), 'result-date')]/text()",
        after=[ut.defer("__getitem__", 0),
               ut.defer("strip"), _convert_dates])
    title_path = XPathResource(
        """
        ./div[contains(normalize-space(@class), 'search-result-title')]
         /a[1]/text()
        """,
        after=[ut.defer("__getitem__", 0),
               ut.defer("strip")])
    doc_path = XPathResource("""
        ./div[contains(normalize-space(@class), 'search-result-title')]
         /a[1]/@href
        """,
                             after=[
                                 ut.defer("__getitem__", 0),
                                 ut.curry(_make_resource_path, cwd=CWD)
                             ])

    def __init__(self, elastic):
        super().__init__(elastic)
        self.entry_resource = PaginatedResource(URL_TEMPLATE)

    def find_entries(self, page):
        docs = []
        for entry in self.entry_path(page):
            doc = ut.SDA({}, "N/A")
            title = self.title_path(entry)
            if not title:
                continue
            doc["metadata.title"] = title
            logging.info(f"Found document: {title}.")
            dates = self.date_path(entry)
            doc["metadata.date"] = dates.get("Last update", dt.datetime.now())
            doc["metadata.date_original"] = dates.get("Publication date",
                                                      dt.datetime.now())
            doc["metadata.url"] = self.doc_path(entry)
            docs.append(doc.a_dict)

        return docs

    def process_document(self, document, **kwargs):
        # This is a stub that does nothing: EBA delivers neither additional
        # metadata nor a detail URL, so only the documents collected in
        # find_entries are considered.
        return document
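
`_convert_dates` is not shown in this excerpt. Judging from `find_entries`, it turns the stripped text of the result-date span into a dict keyed by labels such as "Publication date" and "Last update". A purely illustrative sketch; the labels and the date format are guesses, not the real eba.europa.eu markup:

import datetime as dt
import re

def _convert_dates(text):
    # Hypothetical: assumes text like
    # "Publication date: 12/01/2020 Last update: 15/03/2021".
    dates = {}
    for label, raw in re.findall(
            r"(Publication date|Last update):\s*(\d{2}/\d{2}/\d{4})", text):
        dates[label] = dt.datetime.strptime(raw, "%d/%m/%Y")
    return dates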
Example #3
class BubaNotificationsPlugin(BasePlugin):
    CWD = "https://www.bundesbank.de"
    """Directory to use when localizing the relative paths."""

    source_name = "Deutsche Bundesbank Mitteilungen"
    """Name that should be displayed as source."""

    entry_path = XPathResource(
        "//li[contains(normalize-space(@class), 'resultlist__item')]")

    date_path = XPathResource(
        ".//span[contains(normalize-space(@class), 'teasable__date')]/text()",
        after=[ut.defer("__getitem__", 0),
               ut.defer("strip"), _convert_dates])

    title_path = XPathResource(
        ".//div[contains(normalize-space(@class), 'teasable__title')]"
        "/div[contains(normalize-space(@class), 'h2')]"
        "/text()[normalize-space()]",
        after=[ut.defer("__getitem__", 0),
               ut.defer("strip")])

    doc_path = XPathResource(
        ".//a[contains(normalize-space(@class), 'teasable__link')]/@href",
        after=[
            ut.defer("__getitem__", 0),
            ut.curry(_make_resource_path, cwd=CWD)
        ])

    def __init__(self, elastic):
        super().__init__(elastic)
        pre_filled_url = buba_state_fetcher(URL_TEMPLATE)
        self.entry_resource = PaginatedResource(pre_filled_url, min_page=0)

    def find_entries(self, page):
        docs = []
        for entry in self.entry_path(page):
            doc = ut.SDA({}, "N/A")
            title = self.title_path(entry)
            if not title:
                continue
            doc["metadata.title"] = title
            logging.info(f"Found document: {title}.")
            date = self.date_path(entry)
            doc["metadata.date"] = date
            doc["metadata.url"] = self.doc_path(entry)
            docs.append(doc.a_dict)

        return docs

    def process_document(self, document, **kwargs):
        # This is a stub that does nothing: the Bundesbank listing delivers
        # neither additional metadata nor a detail URL, so only the documents
        # collected in find_entries are considered.
        return document
Example #4
class SearchPlugin(BasePlugin):

    entry_path = XPathResource("//div[@class = 'g']")
    date_path = XPathResource(
        """
        .//div[@class = 's']/div/span[@class = 'st']/span[@class = 'f']/text()
        """,
        after=[ut.defer("__getitem__", 0), _convert_date])
    doc_path = XPathResource("""
        .//div[@class = 'rc']/div[@class = 'r']/a/@href
        """,
                             after=[ut.defer("__getitem__", 0)])
    title_path = XPathResource("""
        .//div[@class = 'rc']/div[@class = 'r']/a/h3/text()
        """,
                               after=[ut.defer("__getitem__", 0)])

    def __init__(self, elastic, **search_args):
        # make sure the searches only retrieve 20 results.
        super().__init__(elastic, fetch_limit=20)
        search_id = search_args.get("search_id")
        search = self.elastic.get_search(search_id)
        self.source_name = search.get("name", search_id)

        self.entry_resource = PaginatedResource(
            URL_TEMPLATE.format(**_query_from_search(search)),
            min_page=0,  # Google result offsets start at 0...
            page_step=10,  # ...and advance by 10 results per page
            headers={"User-Agent": USER_AGENT})

    def find_entries(self, page):
        docs = []
        for entry in self.entry_path(page):
            doc = ut.SDA({}, "N/A")
            doc["metadata.url"] = self.doc_path(entry)
            doc["metadata.date"] = self.date_path(entry)
            doc["metadata.title"] = self.title_path(entry)
            doc["metadata.crawl_date"] = ut.from_date()
            docs.append(doc.a_dict)

        return docs

    def process_document(self, document, **kwargs):
        return document
Example #5
class BafinPlugin(BasePlugin):

    CWD = "https://www.bafin.de"
    """Directory to use when localizing the relative paths."""

    source_name = "BaFin"
    """Name that should be displayed as source."""

    entry_path = XPathResource(
        "//div[contains(normalize-space(@class), 'search-result ')]")
    date_path = XPathResource(
        """
        ./h3/span/span[@class = 'metadata']/span[contains(text(),
                                                 'Erscheinung')]
        /following-sibling::text()
        """,
        after=[ut.defer("__getitem__", 0), _convert_date])
    detail_path = XPathResource("./h3/a/@href",
                                after=[
                                    ut.defer("__getitem__", 0),
                                    ut.curry(_make_resource_path, cwd=CWD)
                                ])
    title_path = XPathResource("string(./h3/a)", after=[_unescape_string])
    type_path = XPathResource("""
        normalize-space(
            ./h3/span/span[@class = 'metadata']/span[contains(text(),
                                                    'Format:')]
            /following-sibling::text()
        )
        """,
                              after=[ut.defer("split", ", ")])
    topic_path = XPathResource("""
        normalize-space(
            ./h3/span/span[@class = 'thema']/a/text()
        )
        """,
                               after=[ut.defer("split", ", ")])
    doc_path = XPathResource("./ul[@class = 'links']/li[1]/a/@href",
                             after=[
                                 ut.defer("__getitem__", 0),
                                 ut.curry(_make_resource_path, cwd=CWD)
                             ])

    content_path = XPathResource("//div[@id = 'content']")

    connected_path = XPathResource(
        ".//a[contains(@class, 'RichTextIntLink ')]/@href")

    def __init__(self, elastic):
        super().__init__(elastic)
        # bafin needs a faked user-agent in the headers.
        self.entry_resource = PaginatedResource(URL_TEMPLATE)

    def find_entries(self, page):
        docs = []
        for entry in self.entry_path(page):
            doc = ut.SDA({}, "N/A")
            doc["metadata.date"] = self.date_path(entry)
            doc["metadata.title"] = self.title_path(entry)
            doc["metadata.detail_url"] = self.detail_path(entry)
            doc["metadata.url"] = self.doc_path(entry)
            if doc["metadata.url"] is None:
                doc["metadata.url"] = doc["metadata.detail_url"]
            doc["metadata.topic"] = self.topic_path(entry)
            doc["metadata.type"] = self.type_path(entry)
            docs.append(doc.a_dict)

        return docs

    def process_document(self, document, **kwargs):
        doc = ut.SDA(document)
        resp = self.url_fetcher(doc["metadata.detail_url"])
        tree = html.fromstring(resp.content)
        content = self.content_path(tree)
        doc["metadata.mentionned"] = [
            _make_resource_path(e, self.CWD)
            for e in self.connected_path(content)
        ]
        return doc.a_dict
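
`_make_resource_path` is used by every plugin above, always curried with the plugin's `CWD` base URL, and the `CWD` docstrings say it resolves relative paths. A minimal sketch of what it presumably does:

from urllib.parse import urljoin

def _make_resource_path(path, cwd):
    # Sketch: absolutize a relative href/src against the plugin's base URL;
    # the real helper may perform additional normalization.
    return urljoin(cwd, path)

# e.g. urljoin("https://www.bafin.de", "/dok/123") -> "https://www.bafin.de/dok/123"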
Example #6
This function is used to fill the `state` attribute of a URL template with
a valid value.

Author: Johannes Mueller <*****@*****.**>
"""
from urllib.parse import urlparse, urlunparse
from lxml import html
import string

from crawlers.plugin import _retry_connection, XPathResource
import utility as ut

# XPath extracting the `state` token from the search form:
state_xpath = XPathResource(
    "//form[@id = 'formSearchAction']/input[@name = 'state']/@value",
    after=[ut.defer("__getitem__", 0)]
)


def partial_string_format(template, *args, **kwargs):
    """Partially replaces arguments in a template string.

    Placeholders without a matching keyword are re-emitted verbatim, so
    the result can be formatted again later.

    Arguments:
        template (str): a template string
        *args (list): the positional replacements passed to format().
            No effect on the partial logic, just here for completeness.
        **kwargs (dict): the keyword replacements passed to format().

    Returns:
        str: a preformatted template string
    """
    class _KeepMissing(dict):
        # Sketch of the missing body: unknown placeholders are re-emitted
        # as "{key}" (note: format specs on missing fields are not kept).
        def __missing__(self, key):
            return "{" + key + "}"

    return string.Formatter().vformat(template, args, _KeepMissing(**kwargs))
Example #7
class EurlexPlugin(BasePlugin):

    CWD = "https://eur-lex.europa.eu"
    """Directory to use when localizing the relative paths."""

    source_name = "EurLex"
    """Name that should be displayed as source."""

    entry_path = XPathResource("//div[@class = 'SearchResult']")
    date_path = XPathResource(
        """
        .//dl/dd[preceding-sibling::dt[contains(text(), 'Date') or
                                       contains(text(), 'Datum')]]/text()
        """,
        after=[ut.defer("__getitem__", 0), _convert_date])
    doc_path = XPathResource("""
        .//ul[contains(@class, 'SearchResultDoc')]/li
        /a[contains(@href, 'PDF') or contains(@href, 'HTML')]/@href
        """,
                             after=[
                                 ut.defer("__getitem__", 0),
                                 ut.curry(_make_resource_path, cwd=CWD)
                             ])
    title_path = XPathResource(".//h2/a[@class = 'title']/text()",
                               after=[ut.defer("__getitem__", 0)])
    detail_path = XPathResource(".//h2/a[@class = 'title']/@href",
                                after=[
                                    ut.defer("__getitem__", 0),
                                    ut.curry(_make_resource_path, cwd=CWD)
                                ])

    num_of_docs_path = XPathResource("""
        count(
            //div[@id = 'textTabContent']/
            div[contains(@class, 'tabContent') and
                not(contains(@class, 'documentSeparator'))]
        )
        """)

    meta_path = XPathResource("//dl[contains(@class, 'NMetadata')]/dd")
    key_path = XPathResource("normalize-space(./preceding-sibling::dt[1])",
                             after=[ut.defer("strip", " .:,;!?-_#")])
    value_path = XPathResource(
        """
        normalize-space(
            string-join(
                ./text() | .//*[self::span[@lang] or
                                self::a[not(child::span)] or
                                self::i[not(child::span)]]/text(), "#"
            )
        )
        """,
        after=[ut.defer("strip", " .:,;!?-_#"),
               ut.defer("split", "#")])

    def __init__(self, elastic):
        super().__init__(elastic, initial=True)
        # TODO: remove the min_page argument.
        self.entry_resource = PaginatedResource(URL_TEMPLATE,
                                                min_page=2338,
                                                max_page=9999)
        # register a string-join function for the lxml XPath
        ns = etree.FunctionNamespace(None)
        ns["string-join"] = _string_join

    def find_entries(self, page):
        docs = []
        for entry in self.entry_path(page):
            doc = ut.SDA({}, "N/A")
            doc["metadata.url"] = self.doc_path(entry)
            doc["metadata.date"] = self.date_path(entry)
            doc["metadata.title"] = self.title_path(entry)
            doc["metadata.detail_url"] = self.detail_path(entry)
            docs.append(doc.a_dict)

        return docs

    def process_document(self, document, **kwargs):
        doc = ut.SDA(document)
        resp = self.url_fetcher(doc["metadata.detail_url"])
        tree = html.fromstring(resp.content)
        for entry in self.meta_path(tree):
            key = self.key_path(entry)
            value = self.value_path(entry)
            doc[f"metadata.{key}"] = value
        # if there is more than one document included, use the HTML version.
        if self.num_of_docs_path(tree) > 1:
            doc["metadata.url"] = _get_html_version(doc["metadata.url"])
        return doc.a_dict
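
`_string_join`, registered above as an XPath extension function, is not shown. lxml extension functions receive an evaluation context as their first argument, and node-set arguments arrive as lists of strings; a minimal sketch mirroring XPath 2.0's string-join():

def _string_join(context, strings, separator):
    # Join the string values of the node-set with the given separator.
    return separator.join(strings)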