Exemplo n.º 1
0
    def test_cssselect(self):
        """Verify cssselect() match counts and translator selection."""
        div, = lxml.html.fromstring(HTML).xpath('//div')

        def check(selector, expected_count, **kwargs):
            matches = div.cssselect(selector, **kwargs)
            self.assertEqual(len(matches), expected_count)

        check('div', 1)
        check('a', 2)
        check('em', 0)
        # Element names are case-insensitive in HTML
        check('DIV', 1)
        # ... but not in XHTML or XML
        check('DIV', 0, translator='xhtml')
        check('DIV', 0, translator='xml')

        # :contains() is case-insensitive in lxml
        check(':contains("link")', 2)  # div, a
        check(':contains("LInk")', 2)
        # Whatever the document language
        check(':contains("LInk")', 2, translator='xhtml')
        check(':contains("LInk")', 2, translator='xml')
        # ... but not in upstream cssselect
        import cssselect
        check(':contains("link")', 2, translator=cssselect.HTMLTranslator())
        check(':contains("LInk")', 0, translator=cssselect.HTMLTranslator())
Exemplo n.º 2
0
def main(argv):
    """Filter the CSS rules in ``argv[1]`` against the HTML file ``argv[2]``.

    Rules whose selectors match the document are printed to stdout;
    rejected rules are reported on stderr.  Exits the process when done.
    Returns 1 (without printing) when too few arguments are given.
    """
    if len(argv) < 3:
        return 1
    css = argv[1]
    html = argv[2]
    # Context managers close the file handles promptly (the previous
    # bare open(...).read() calls leaked the descriptors).
    with open(html) as f:
        doc = ht.document_fromstring(f.read())

    with open(css) as f:
        css_text = f.read()
    rules = parser().parseString(css_text)

    result_rules = []
    rejected_rules = []
    for r in rules:
        if check_rule(r, doc):
            result_rules.append(r)
            print(r.text(), end='')
        else:
            print('rejected:', r.text(exclude=False), file=sys.stderr)
            rejected_rules.append(r)
    print()

    print("rules before:\t", len(rules), file=sys.stderr)
    print("rules after:\t", len(result_rules), file=sys.stderr)

    sys.exit()
Exemplo n.º 3
0
def check_selector_list(sel, doc):
    """Return True if any selector in ``sel`` matches an element of ``doc``."""
    translator = cs.HTMLTranslator()

    # convert e.g. a:hover to a::hover (css3): a lone ':' becomes '::',
    # while existing '::' runs are left untouched.
    def fix_colons(match):
        return '::' if match.group(0) == ':' else match.group(0)

    for raw_selector in sel:
        candidate = re.sub(':+', fix_colons, raw_selector)
        try:
            for parsed in cs.parse(candidate):
                if doc.xpath(translator.selector_to_xpath(parsed)):
                    return True
        except cs.parser.SelectorSyntaxError:
            # probably unsupported @media selector
            # may still be matched by subrules' selectors
            # so just skip this selector
            pass
        except Exception as e:
            print(e, "; sel='{}'".format(candidate), file=sys.stderr)

    return False
Exemplo n.º 4
0
def download_assets(page, directory=None):
    """Download assets from the given page into a temporary directory.

    :param page: HTML source of the release page.
    :param directory: target directory; a temporary one is created when None.
    :return: the directory the assets were downloaded into.
    """
    print_(EMPTY, "Compiling list of assets to download...")
    try:
        expression = cssselect.HTMLTranslator().css_to_xpath(ASSET_SELECTOR)
    except cssselect.SelectorError:
        print_(FAIL)
        print_(INFO, "Invalid ASSET_SELECTOR configured.")
        # exit() raises SystemExit, so nothing after it runs (the old
        # `return` below it was unreachable dead code).
        exit(1)
    # Absolute URLs for every matched link, skipping the auto-generated
    # "Source code" archives.  e[0].text may be None, hence the `or ''`.
    asset_list = [
        "https://github.com" + e.get('href')
        for e in lxml.html.fromstring(page).xpath(expression)
        if "Source code" not in (e[0].text or '')
    ]
    print_(OK)

    tmp_dir = directory or tempfile.mkdtemp(prefix='intel-opencl-neo-')
    if directory is None:
        print_(DBUG, f"Temporary directory: {tmp_dir}")

    for asset in asset_list:
        download_asset(asset, tmp_dir)

    return tmp_dir
Exemplo n.º 5
0
    def __init__(self, *rules_files):
        """
        :param rules_files: path to rules files; when empty, the default
            AdGuard and EasyList filter lists are downloaded into adList/.
        """
        if not rules_files:
            rule_urls = [
                'https://filters.adtidy.org/extension/chromium/filters/2.txt',
                'https://easylist.to/easylist/easylist.txt'
            ]

            rules_files = [url.rpartition('/')[-1] for url in rule_urls]

            if not os.path.isdir("adList"):
                os.mkdir("adList")

            # download files containing rules
            for rule_url, rule_file in zip(rule_urls, rules_files):
                r = requests.get(rule_url)
                with open("adList/" + rule_file, 'w', encoding='utf-8') as f:
                    f.write(r.text)

        translator = cssselect.HTMLTranslator()
        self.rules = []

        for rules_file in rules_files:
            with open("adList/" + rules_file, 'r', encoding="utf-8") as f:
                for line in f:
                    # elemhide rules are prefixed by ## in the adblock filter
                    # syntax; only those lines are CSS selectors.  (The old
                    # check `line[:2] != '@@'` let every non-exception rule
                    # through, contradicting this comment.)
                    if line[:2] == '##':
                        try:
                            self.rules.append(translator.css_to_xpath(
                                line[2:]))
                        except cssselect.SelectorError:
                            # just skip bad selectors
                            pass

        n_thread = mp.cpu_count() * 2
        l_query = len(self.rules)
        # Split the rules evenly and OR-join each slice into one big
        # xpath query per worker thread.
        self.xpath_query_list = []
        for i in range(n_thread):
            start = int(i * l_query / n_thread)
            stop = int((i + 1) * l_query / n_thread)
            self.xpath_query_list.append('|'.join(self.rules[start:stop]))
Exemplo n.º 6
0
    def __init__(self, *rules_files):
        """Compile adblock elemhide rules into one combined XPath query.

        :param rules_files: one or more adblock filter file paths (required).
        :raises ValueError: when no file is given.
        """
        if not rules_files:
            raise ValueError("one or more rules_files required")

        translator = cssselect.HTMLTranslator()
        xpath_rules = []

        for path in rules_files:
            with open(path, 'r') as handle:
                for line in handle:
                    # elemhide rules are prefixed by ## in the adblock filter syntax
                    if not line.startswith('##'):
                        continue
                    try:
                        xpath_rules.append(translator.css_to_xpath(line[2:]))
                    except cssselect.SelectorError:
                        # just skip bad selectors
                        pass

        # create one large query by joining them the xpath | (or) operator
        self.xpath_query = '|'.join(xpath_rules)
Exemplo n.º 7
0
def advertisements(data, store):
    """Count EasyList-matched ad elements in data['doc'] into store."""

    with open('webcred/data/easylist.txt') as f:
        # elemhide rules are prefixed by ## in the filter syntax
        css_rules = [line[2:] for line in f if line[:2] == "##"]

    # convert css rules from filter list to xpath rules
    translator = cssselect.HTMLTranslator()
    xpath_rules = []
    for selector in css_rules:
        try:
            xpath_rules.append(translator.css_to_xpath(selector))
        except cssselect.SelectorError:
            # skip bad selectors
            pass

    # create one large query by joining the rules using the xpath OR operator
    combined_query = '|'.join(xpath_rules)

    store['advertisements'] = len(data['doc'].xpath(combined_query))
Exemplo n.º 8
0
def preprocess_stylesheet(device_media_type, base_url, rules, url_fetcher):
    """Do the work that can be done early on stylesheet, before they are
    in a document.

    Yields ``(rule, selector_list, declarations)`` tuples for the rules
    that apply to ``device_media_type``; ``@import`` and ``@media``
    rules are resolved recursively.

    (Only change from the original: ``LOGGER.warn`` — a deprecated alias
    — replaced with ``LOGGER.warning``.)
    """
    selector_to_xpath = cssselect.HTMLTranslator().selector_to_xpath
    for rule in rules:
        if not rule.at_keyword:
            declarations = list(
                preprocess_declarations(base_url, rule.declarations))
            if declarations:
                selector_string = rule.selector.as_css()
                try:
                    selector_list = []
                    for selector in cssselect.parse(selector_string):
                        xpath = selector_to_xpath(selector)
                        try:
                            lxml_xpath = lxml.etree.XPath(xpath)
                        except ValueError as exc:
                            # TODO: Some characters are not supported by lxml's
                            # XPath implementation (including control
                            # characters), but these characters are valid in
                            # the CSS2.1 specification.
                            raise cssselect.SelectorError(str(exc))
                        selector_list.append(
                            Selector((0, ) + selector.specificity(),
                                     selector.pseudo_element, lxml_xpath))
                    for selector in selector_list:
                        if selector.pseudo_element not in PSEUDO_ELEMENTS:
                            raise cssselect.ExpressionError(
                                'Unknown pseudo-element: %s' %
                                selector.pseudo_element)
                except cssselect.SelectorError as exc:
                    LOGGER.warning("Invalid or unsupported selector '%s', %s",
                                   selector_string, exc)
                    continue
                yield rule, selector_list, declarations

        elif rule.at_keyword == '@import':
            if not evaluate_media_query(rule.media, device_media_type):
                continue
            url = url_join(base_url, rule.uri, '@import at %s:%s', rule.line,
                           rule.column)
            if url is not None:
                try:
                    stylesheet = CSS(url=url,
                                     url_fetcher=url_fetcher,
                                     media_type=device_media_type)
                except URLFetchingError as exc:
                    LOGGER.warning('Failed to load stylesheet at %s : %s',
                                   url, exc)
                else:
                    # Inline the imported stylesheet's preprocessed rules.
                    for result in stylesheet.rules:
                        yield result

        elif rule.at_keyword == '@media':
            if not evaluate_media_query(rule.media, device_media_type):
                continue
            # Recurse into the rules nested in the @media block.
            for result in preprocess_stylesheet(device_media_type, base_url,
                                                rule.rules, url_fetcher):
                yield result

        elif rule.at_keyword == '@page':
            page_name, pseudo_class = rule.selector
            # TODO: support named pages (see CSS3 Paged Media)
            if page_name is not None:
                LOGGER.warning(
                    'Named pages are not supported yet, the whole '
                    '@page %s rule was ignored.',
                    page_name + (':' + pseudo_class if pseudo_class else ''))
                continue
            declarations = list(
                preprocess_declarations(base_url, rule.declarations))

            # Use a double lambda to have a closure that holds page_types
            match = (lambda page_types: lambda _document: page_types)(
                PAGE_PSEUDOCLASS_TARGETS[pseudo_class])
            specificity = rule.specificity

            if declarations:
                selector_list = [Selector(specificity, None, match)]
                yield rule, selector_list, declarations

            for margin_rule in rule.at_rules:
                declarations = list(
                    preprocess_declarations(base_url,
                                            margin_rule.declarations))
                if declarations:
                    selector_list = [
                        Selector(specificity, margin_rule.at_keyword, match)
                    ]
                    yield margin_rule, selector_list, declarations
Exemplo n.º 9
0
def css(self, selector):
    """Evaluate a CSS *selector* on this element via XPath translation."""
    translator = cssselect.HTMLTranslator()
    return self.xpath(translator.css_to_xpath(selector))
def preprocess_stylesheet(device_media_type, base_url, stylesheet_rules,
                          url_fetcher, rules, fonts, font_config):
    """Do the work that can be done early on stylesheet, before they are
    in a document.

    Walks the tinycss2 ``stylesheet_rules`` and collects results by
    mutating the output parameters: ``rules`` receives
    ``(rule, selector_list, declarations)`` tuples and ``fonts`` receives
    filenames of fonts registered through ``@font-face`` rules.
    """
    selector_to_xpath = cssselect.HTMLTranslator().selector_to_xpath
    for rule in stylesheet_rules:
        # Plain style rule: "selector { declarations }".
        if rule.type == 'qualified-rule':
            declarations = list(
                preprocess_declarations(
                    base_url, tinycss2.parse_declaration_list(rule.content)))
            if declarations:
                selector_string = tinycss2.serialize(rule.prelude)
                try:
                    selector_list = []
                    for selector in cssselect.parse(selector_string):
                        xpath = selector_to_xpath(selector)
                        try:
                            lxml_xpath = lxml.etree.XPath(xpath)
                        except ValueError as exc:
                            # TODO: Some characters are not supported by lxml's
                            # XPath implementation (including control
                            # characters), but these characters are valid in
                            # the CSS2.1 specification.
                            raise cssselect.SelectorError(str(exc))
                        selector_list.append(
                            Selector((0, ) + selector.specificity(),
                                     selector.pseudo_element, lxml_xpath))
                    for selector in selector_list:
                        if selector.pseudo_element not in PSEUDO_ELEMENTS:
                            raise cssselect.ExpressionError(
                                'Unknown pseudo-element: %s' %
                                selector.pseudo_element)
                except cssselect.SelectorError as exc:
                    # A bad selector invalidates the whole rule.
                    LOGGER.warning("Invalid or unsupported selector '%s', %s",
                                   selector_string, exc)
                    continue
                rules.append((rule, selector_list, declarations))

        elif rule.type == 'at-rule' and rule.at_keyword == 'import':
            tokens = remove_whitespace(rule.prelude)
            # First prelude token must be the imported URL.
            if tokens and tokens[0].type in ('url', 'string'):
                url = tokens[0].value
            else:
                continue
            media = parse_media_query(tokens[1:])
            if media is None:
                LOGGER.warning(
                    'Invalid media type "%s" '
                    'the whole @import rule was ignored at %s:%s.',
                    tinycss2.serialize(rule.prelude), rule.source_line,
                    rule.source_column)
            if not evaluate_media_query(media, device_media_type):
                continue
            url = url_join(base_url,
                           url,
                           allow_relative=False,
                           context='@import at %s:%s',
                           context_args=(rule.source_line, rule.source_column))
            if url is not None:
                try:
                    stylesheet = CSS(url=url,
                                     url_fetcher=url_fetcher,
                                     media_type=device_media_type,
                                     font_config=font_config)
                except URLFetchingError as exc:
                    LOGGER.warning('Failed to load stylesheet at %s : %s', url,
                                   exc)
                else:
                    # Inline the imported stylesheet's preprocessed rules.
                    for result in stylesheet.rules:
                        rules.append(result)

        elif rule.type == 'at-rule' and rule.at_keyword == 'media':
            media = parse_media_query(rule.prelude)
            if media is None:
                LOGGER.warning(
                    'Invalid media type "%s" '
                    'the whole @media rule was ignored at %s:%s.',
                    tinycss2.serialize(rule.prelude), rule.source_line,
                    rule.source_column)
                continue
            if not evaluate_media_query(media, device_media_type):
                continue
            # Recurse into the rules nested inside the @media block; they
            # are appended to the same output lists.
            content_rules = tinycss2.parse_rule_list(rule.content)
            preprocess_stylesheet(device_media_type, base_url, content_rules,
                                  url_fetcher, rules, fonts, font_config)

        elif rule.type == 'at-rule' and rule.at_keyword == 'page':
            tokens = remove_whitespace(rule.prelude)
            # TODO: support named pages (see CSS3 Paged Media)
            if not tokens:
                pseudo_class = None
                specificity = (0, 0)
            elif (len(tokens) == 2 and tokens[0].type == 'literal'
                  and tokens[0].value == ':' and tokens[1].type == 'ident'):
                pseudo_class = tokens[1].lower_value
                specificity = {
                    'first': (1, 0),
                    'blank': (1, 0),
                    'left': (0, 1),
                    'right': (0, 1),
                }.get(pseudo_class)
                if not specificity:
                    LOGGER.warning(
                        'Unknown @page pseudo-class "%s", '
                        'the whole @page rule was ignored '
                        'at %s:%s.', pseudo_class, rule.source_line,
                        rule.source_column)
                    continue
            else:
                LOGGER.warning(
                    'Unsupported @page selector "%s", '
                    'the whole @page rule was ignored at %s:%s.',
                    tinycss2.serialize(rule.prelude), rule.source_line,
                    rule.source_column)
                continue
            content = tinycss2.parse_declaration_list(rule.content)
            declarations = list(preprocess_declarations(base_url, content))

            # Use a double lambda to have a closure that holds page_types
            match = (lambda page_types: lambda _document: page_types)(
                PAGE_PSEUDOCLASS_TARGETS[pseudo_class])

            if declarations:
                selector_list = [Selector(specificity, None, match)]
                rules.append((rule, selector_list, declarations))

            # Margin at-rules (@top-left, @bottom-center, ...) nested in
            # the @page block.
            for margin_rule in content:
                if margin_rule.type != 'at-rule':
                    continue
                declarations = list(
                    preprocess_declarations(
                        base_url,
                        tinycss2.parse_declaration_list(margin_rule.content)))
                if declarations:
                    selector_list = [
                        Selector(specificity, '@' + margin_rule.at_keyword,
                                 match)
                    ]
                    rules.append((margin_rule, selector_list, declarations))

        elif rule.type == 'at-rule' and rule.at_keyword == 'font-face':
            content = tinycss2.parse_declaration_list(rule.content)
            rule_descriptors = dict(preprocess_descriptors(base_url, content))
            for key in ('src', 'font_family'):
                if key not in rule_descriptors:
                    LOGGER.warning(
                        "Missing %s descriptor in '@font-face' rule at %s:%s",
                        key.replace('_', '-'), rule.source_line,
                        rule.source_column)
                    break
            else:
                # for/else: runs only when no required descriptor was missing.
                if font_config is not None:
                    font_filename = font_config.add_font_face(
                        rule_descriptors, url_fetcher)
                    if font_filename:
                        fonts.append(font_filename)
Exemplo n.º 11
0
def css_to_xpath(css):
    """Translate the CSS selector string *css* to its XPath equivalent."""
    translator = cssselect.HTMLTranslator()
    return translator.css_to_xpath(css)
Exemplo n.º 12
0
def csspath(query):
    """Return the XPath expression equivalent to the CSS selector *query*."""
    translator = cssselect.HTMLTranslator()
    return translator.css_to_xpath(query)
Exemplo n.º 13
0
    def convert(self):
        """Remove HTML and PGDP marker from the text.

        Parses the transformation CSS in ``self.mycss`` with tinycss and
        applies each rule to the document tree in ``self.myfile.tree``.
        Returns an HTML fragment listing any CSS errors ("" when none).
        """

        # Matches literal "\uXXXX" escapes inside CSS string values.
        escaped_unicode_re = re.compile(r"\\u[0-9a-fA-F]{4}")

        def escaped_unicode(m):
            # Decode one "\uXXXX" escape; keep the raw text on failure.
            try:
                newstr = bytes(m.group(0), 'utf8').decode('unicode-escape')
            except Exception:
                newstr = m.group(0)

            return newstr

        def new_content(element):
            """Process the "content:" property.

            NOTE: reads ``val`` (the current declaration) from the
            enclosing loop's scope, so it must only be called while the
            declarations loop below is iterating.
            """
            retstr = ""
            for token in val.value:
                if token.type == "STRING":
                    # e.g. { content: "xyz" }
                    retstr += escaped_unicode_re.sub(escaped_unicode,
                                                     token.value)
                elif token.type == "FUNCTION":
                    if token.function_name == 'attr':
                        # e.g. { content: attr(title) }
                        retstr += element.attrib.get(token.content[0].value,
                                                     "")
                elif token.type == "IDENT":
                    if token.value == "content":
                        # Identity, e.g. { content: content }
                        retstr += element.text

            return retstr

        # Process each rule from our transformation CSS
        stylesheet = tinycss.make_parser().parse_stylesheet(self.mycss)
        property_errors = []
        for rule in stylesheet.rules:

            # Extract values we care about
            f_transform = None
            f_replace_with_attr = None
            #f_replace_regex = None
            f_text_replace = None
            f_element_func = None
            f_move = None

            for val in rule.declarations:

                if val.name == 'content':
                    # result depends on element and pseudo elements.
                    pass

                elif val.name == "text-transform":
                    if len(val.value) != 1:
                        property_errors += [(val.line, val.column,
                                             val.name + " takes 1 argument")]
                    else:
                        v = val.value[0].value
                        if v == "uppercase":
                            f_transform = lambda x: x.upper()
                        elif v == "lowercase":
                            f_transform = lambda x: x.lower()
                        elif v == "capitalize":
                            f_transform = lambda x: x.title()
                        else:
                            property_errors += [(
                                val.line, val.column, val.name +
                                " accepts only 'uppercase', 'lowercase' or 'capitalize'"
                            )]

                elif val.name == "_replace_with_attr":
                    # Late-binding closure over ``val``; safe because the
                    # lambda is applied before ``val`` is rebound.
                    f_replace_with_attr = lambda el: el.attrib[val.value[0].
                                                               value]

                elif val.name == "text-replace":
                    # Skip S (spaces) tokens.
                    values = [v for v in val.value if v.type != "S"]
                    if len(values) != 2:
                        property_errors += [
                            (val.line, val.column,
                             val.name + " takes 2 string arguments")
                        ]
                    else:
                        v1 = values[0].value
                        v2 = values[1].value
                        f_text_replace = lambda x: x.replace(v1, v2)

                elif val.name == "display":
                    # Support display none only. So ignore "none" argument.
                    f_element_func = clear_element

                elif val.name == "_graft":
                    values = [v for v in val.value if v.type != "S"]
                    if len(values) < 1:
                        property_errors += [
                            (val.line, val.column,
                             val.name + " takes at least one argument")
                        ]
                        continue
                    # Build the chain of element hops for the graft below.
                    f_move = []
                    for v in values:
                        print("[", v.value, "]")
                        if v.value == 'parent':
                            f_move.append(lambda el: el.getparent())
                        elif v.value == 'prev-sib':
                            f_move.append(lambda el: el.getprevious())
                        elif v.value == 'next-sib':
                            f_move.append(lambda el: el.getnext())
                        else:
                            property_errors += [
                                (val.line, val.column,
                                 val.name + " invalid value " + v.value)
                            ]
                            f_move = None
                            break

                    if not f_move:
                        continue


#                elif val.name == "_replace_regex":
#                    f_replace_regex = partial(re.sub, r"(\d)\u00A0(\d)", r"\1\2")
#                    f_replace_regex = partial(re.sub, val.value[0].value, val.value[1].value)

                else:
                    property_errors += [(val.line, val.column,
                                         "Unsupported property " + val.name)]
                    continue

                # Iterate through each selectors in the rule
                for selector in cssselect.parse(rule.selector.as_css()):

                    pseudo_element = selector.pseudo_element

                    xpath = cssselect.HTMLTranslator().selector_to_xpath(
                        selector)
                    find = etree.XPath(xpath)

                    # Find each matching element in the HTML/XHTML document
                    for element in find(self.myfile.tree):

                        # Replace text with content of an attribute.
                        if f_replace_with_attr:
                            element.text = f_replace_with_attr(element)

                        if val.name == 'content':
                            v_content = new_content(element)
                            if pseudo_element == "before":
                                element.text = v_content + (element.text or ''
                                                            )  # opening tag
                            elif pseudo_element == "after":
                                element.tail = v_content + (element.tail or ''
                                                            )  # closing tag
                            else:
                                # Replace all content
                                element.text = new_content(element)

                        if f_transform:
                            self.text_apply(element, f_transform)

                        if f_text_replace:
                            self.text_apply(element, f_text_replace)

                        if f_element_func:
                            f_element_func(element)

                        if f_move:
                            parent = element.getparent()
                            new = element
                            for f in f_move:
                                new = f(new)

                            # Move the tail to the sibling or the parent
                            if element.tail:
                                sibling = element.getprevious()
                                if sibling:
                                    sibling.tail = (sibling.tail
                                                    or "") + element.tail
                                else:
                                    parent.text = (parent.text
                                                   or "") + element.tail
                                element.tail = None

                            # Prune and graft
                            parent.remove(element)
                            new.append(element)

                    # if f_replace_regex and element.text:
                    #     element.text = f_replace_regex(element.text)

        css_errors = ""
        if stylesheet.errors or property_errors:
            # There is transformation CSS errors. If the default css
            # is included, take the offset into account.
            i = 0
            if self.args.css_no_default is False:
                i = DEFAULT_TRANSFORM_CSS.count('\n')
            css_errors = "<div class='error-border bbox'><p>Error(s) in the transformation CSS:</p><ul>"
            for err in stylesheet.errors:
                css_errors += "<li>{0},{1}: {2}</li>".format(
                    err.line - i, err.column, err.reason)
            for err in property_errors:
                css_errors += "<li>{0},{1}: {2}</li>".format(
                    err[0] - i, err[1], err[2])
            css_errors += "</ul>"

        return css_errors
Exemplo n.º 14
0
# -*- coding: utf-8 -*-

import urlparse

import cssselect
css = cssselect.HTMLTranslator().css_to_xpath

from wiseguy import html_tags as ht
from wiseguy.template import Transform


def stylesheet(href):
    """Build a <link rel="stylesheet"> element pointing at *href*."""
    attrs = {'rel': "stylesheet", 'type': "text/css", 'href': href}
    return ht.LINK(attrs)


def script(href):
    """Build a <script> element whose src attribute is *href*."""
    attrs = {'src': href}
    return ht.SCRIPT(attrs)


def add_stylesheet(href):
    """Transform that appends a stylesheet link for *href* to <head>."""
    def apply(template):
        return template.element.add("head", stylesheet(href))
    return Transform([], apply)


def add_script(href):
    """Transform that appends a script tag for *href* to <head>."""
    def apply(template):
        return template.element.add("head", script(href))
    return Transform([], apply)


_url_fixable_tags = set([
    ("link", "href"),
Exemplo n.º 15
0
    def convert(self):
        """Remove HTML and PGDP marker from the text.

        Applies the transformation CSS in ``self.mycss`` to the document
        in ``self.myfile.tree``.  NOTE(review): the method returns right
        after the CSS loop — everything below the bare ``return`` is
        unreachable dead code, apparently kept from an earlier revision.
        """

        # Process each rule from our transformation CSS
        stylesheet = tinycss.make_parser().parse_stylesheet(self.mycss)
        for rule in stylesheet.rules:

            # Extract values we care about
            v_content = None
            f_transform = None
            f_replace_with_attr = None
            f_replace_regex = None
            f_text_replace = None
            f_element_func = None

            for val in rule.declarations:

                if val.name == 'content':
                    v_content = val.value[0].value

                elif val.name == "text-transform":
                    v = val.value[0].value
                    if v == "uppercase":
                        f_transform = lambda x: x.upper()
                    elif v == "lowercase":
                        f_transform = lambda x: x.lower()
                    elif v == "capitalize":
                        f_transform = lambda x: x.title()

                elif val.name == "_replace_with_attr":
                    # Late-binding closure over ``val``; applied before
                    # ``val`` is rebound, so safe here.
                    f_replace_with_attr = lambda el: el.attrib[val.value[0].
                                                               value]

                elif val.name == "text-replace":
                    v1 = val.value[0].value
                    v2 = val.value[2].value
                    f_text_replace = lambda x: x.replace(v1, v2)

                elif val.name == "display":
                    # Support display none only. So ignore "none" argument.
                    f_element_func = clear_element

#                elif val.name == "_replace_regex":
#                    f_replace_regex = partial(re.sub, r"(\d)\u00A0(\d)", r"\1\2")
#                    f_replace_regex = partial(re.sub, val.value[0].value, val.value[1].value)

# Iterate through each selectors in the rule
                for selector in cssselect.parse(rule.selector.as_css()):

                    pseudo_element = selector.pseudo_element

                    xpath = cssselect.HTMLTranslator().selector_to_xpath(
                        selector)
                    find = etree.XPath(xpath)

                    # Find each matching element in the HTML/XHTML document
                    for element in find(self.myfile.tree):

                        # Replace text with content of an attribute.
                        if f_replace_with_attr:
                            element.text = f_replace_with_attr(element)

                        if pseudo_element == "before":
                            element.text = v_content + (element.text or ''
                                                        )  # opening tag
                        elif pseudo_element == "after":
                            element.tail = v_content + (element.tail or ''
                                                        )  # closing tag

                        if f_transform:
                            self.text_apply(element, f_transform)

                        if f_text_replace:
                            self.text_apply(element, f_text_replace)

                        if f_element_func:
                            f_element_func(element)

                    # if f_replace_regex and element.text:
                    #     element.text = f_replace_regex(element.text)

        return

        # NOTE(review): unreachable from here to the end of the method.
        # Transform footnote anchors to [..]
        find = etree.XPath("//a")
        for element in find(self.myfile.tree):
            href = element.attrib.get('href', None)
            if not href or not href.startswith("#Footnote_"):
                continue

            if element.text and not element.text.startswith('['):
                # Some PP have [xx], other have just xx for a page
                # number. Do not add [ ] if they are already there.
                element.text = '[' + (element.text or '')  # opening tag
                element.tail = ']' + (element.tail or '')  # closing tag

        # Add illustration tag, wherever we find it
        for figclass in ['figcenter', 'figleft', 'figright', 'caption']:
            find = etree.XPath(
                "//div[contains(concat(' ', normalize-space(@class), ' '), ' "
                + figclass + " ')]")
            for element in find(self.myfile.tree):
                if element.text and len(element.text) > 1:
                    element.text = '[Illustration:' + element.text  # opening tag
                else:
                    element.text = '[Illustration' + (element.text or ''
                                                      )  # opening tag
                element.tail = ']' + (element.tail or '')  # closing tag


#        for figclass in [ 'caption' ]:
#            find = etree.XPath("//p[contains(concat(' ', normalize-space(@class), ' '), ' " + figclass + " ')]")
#            for element in find(self.myfile.tree):
#                element.text = '[Illustration:' + (element.text or '')  # opening tag
#                element.tail = ']' + (element.tail or '') # closing tag

# Add sidenote tag
        # NOTE(review): `args` below is not `self.args`; it would raise
        # NameError if this dead code were ever reached — confirm intent.
        if args.with_sidenote_tags:
            for sntag in ['sidenote']:
                for find in [
                        "//p[contains(concat(' ', normalize-space(@class), ' '), ' "
                        + sntag + " ')]",
                        "//div[starts-with(@class, 'sidenote')]"
                ]:
                    for element in etree.XPath(find)(self.myfile.tree):
                        element.text = '[Sidenote:' + (element.text or ''
                                                       )  # opening tag
                        element.tail = ']' + (element.tail or ''
                                              )  # closing tag