Exemplo n.º 1
0
def transform_collapsibles(text):
    """Expand simple collapsible markup into the full collapsible HTML.

    Every ``div.collapsible-item`` directly under the fragment root that has
    both a ``div.collapsible-item-title`` and a ``div.collapsible-item-body``
    child gets its content replaced by the rendered
    ``a4ckeditor/collapsible_fragment.html`` template. Items missing either
    child are left as they are. Returns the serialized fragment.
    """
    root = parseFragment(text, container='div', treebuilder='etree',
                         namespaceHTMLElements=False)

    # Digits of the current timestamp, used as the prefix of generated ids.
    stamp = ''.join(ch for ch in str(time.time()) if ch.isdigit())

    items = root.findall('./div[@class="collapsible-item"]')
    for index, item in enumerate(items):
        heading = item.find('./div[@class="collapsible-item-title"]')
        content = item.find('./div[@class="collapsible-item-body"]')
        if heading is None or content is None:
            continue

        # Drop the marker classes before the parts are serialized into the
        # template context; the title becomes a span.
        heading.tag = 'span'
        heading.attrib.pop('class')
        content.tag = 'div'
        content.attrib.pop('class')

        context = {
            'id': 'a4ckeditor-collapsible-{}_{}'.format(stamp, index),
            'title': serialize(heading),
            'body': serialize(content),
        }
        rendered = render_to_string(
            'a4ckeditor/collapsible_fragment.html', context)

        # Replace the item's previous content with the rendered markup.
        item.clear()
        item.append(parseFragment(rendered, treebuilder='etree',
                                  namespaceHTMLElements=False))

    return serialize(root)
Exemplo n.º 2
0
    def parse_comments(self, root, raw):
        """Build the rendered description ("comments") for a product page.

        Args:
            root: parsed page tree; queried with ``xpath``.
            raw: raw page bytes, searched for the ``iframeContent`` script
                variable when the description is not in the tree.

        Returns:
            The concatenation of whatever description parts were found and
            rendered by ``self._render_comments``; ``''`` if none were.
        """
        # Import once, before any use. A function-scope import makes
        # ``html5lib`` local to the whole function, so importing it inside a
        # single branch left the name unbound on every other path and the
        # fallback parse below failed with UnboundLocalError.
        import html5lib

        ans = ''
        ns = tuple(self.selector('#bookDescription_feature_div noscript'))
        if ns:
            ns = ns[0]
            if len(ns) == 0 and ns.text:
                # html5lib parsed noscript as CDATA
                ns = html5lib.parseFragment('<div>%s</div>' % (ns.text), treebuilder='lxml', namespaceHTMLElements=False)[0]
            else:
                ns.tag = 'div'
            ans = self._render_comments(ns)
        else:
            desc = root.xpath('//div[@id="ps-content"]/div[@class="content"]')
            if desc:
                ans = self._render_comments(desc[0])

        desc = root.xpath('//div[@id="productDescription"]/*[@class="content"]')
        if desc:
            ans += self._render_comments(desc[0])
        else:
            # The description may instead be stored URL-encoded in a JS
            # variable inside a script tag; dig it out of the raw bytes.
            m = re.search(br'var\s+iframeContent\s*=\s*"([^"]+)"', raw)
            if m is not None:
                try:
                    text = unquote(m.group(1)).decode('utf-8')
                    nr = html5lib.parse(text, treebuilder='lxml', namespaceHTMLElements=False)
                    desc = nr.xpath('//div[@id="productDescription"]/*[@class="content"]')
                    if desc:
                        ans += self._render_comments(desc[0])
                except Exception as e:
                    # Best effort: a failure here only loses this extra part.
                    self.log.warn('Parsing of obfuscated product description failed with error: %s' % as_unicode(e))

        return ans
Exemplo n.º 3
0
    def load_html(fn):
        """Read an HTML fragment file and parse it under a ``div`` wrapper.

        Returns a ``(doc, dom)`` tuple: the raw file contents and an lxml
        ``div`` element holding the parsed nodes, with the XHTML namespace
        removed from every tag.
        """
        import warnings

        with open(fn) as handle:
            doc = handle.read()

        # The file holds a fragment, so parseFragment is used; it yields a
        # sequence of nodes that is gathered under one wrapper element below.
        # Parser warnings are silenced for the duration of the parse.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            nodes = html5lib.parseFragment(doc, treebuilder="lxml")

        dom = lxml.etree.Element("div")
        for item in nodes:
            dom.append(item)

        # Comments are not removed here. The original author noted that
        # xml_diff cannot handle them but that they already seem to be
        # stripped by the HTML sanitization.

        # Move every element out of the XHTML namespace so that the final
        # serialization is plain, namespace-free HTML.
        xhtml_prefix = "{http://www.w3.org/1999/xhtml}"
        for element in dom.xpath("//*"):
            element.tag = element.tag.replace(xhtml_prefix, "")

        return (doc, dom)
Exemplo n.º 4
0
def parse_harlowe_html(s):
    """
    Parse a string containing the HTML of a Twine game written using Harlowe.

    Args:
        s (str): The Harlowe source.

    Returns:
        (dict, list, OrderedDict): A dictionary of the attributes on the top-level tw-storydata
         element, a list of non-passage elements in the game (as etree.ElementTree.Element),
         and a dict whose keys are the passage's names and whose values are the
         corresponding HarlowePassage objects.

    Raises:
        RuntimeError: If the first element of the fragment is missing or is
         not the story tag.
    """
    passages = OrderedDict()  # So that we keep the original room order in source code
    other_elems = list()

    # The story uses HTML5 custom elements, and so requires an HTML5-aware parser
    fragment = html5lib.parseFragment(s, treebuilder='lxml', namespaceHTMLElements=False)

    # The lxml fragment is a plain list that can be empty and can contain bare
    # text (e.g. leading whitespace) next to elements. Pick the first item that
    # is an element, so that both cases end in the RuntimeError below rather
    # than an IndexError/AttributeError.
    story_elem = next((node for node in fragment if hasattr(node, 'tag')), None)
    if story_elem is None or story_elem.tag != _STORY_TAG:
        raise RuntimeError('No properly-formatted story tag ('+_STORY_TAG+') found')

    for elem in story_elem:
        if elem.tag == _PASSAGE_TAG:
            passage = HarlowePassage.from_element(elem)
            passages[passage.name] = passage
        else:
            other_elems.append(elem)

    return story_elem.attrib, other_elems, passages
Exemplo n.º 5
0
def warcToText(url):
    """Stream the archive at ``url`` and print the text of its HTML responses.

    Each record of type ``response`` whose HTTP ``Content-Type`` header is one
    of the accepted values is parsed with html5lib and its text content is
    printed. Records that fail to parse are counted rather than aborting the
    run. The failure and success counts are printed at the end.
    """
    # Exact Content-Type header values that are treated as HTML pages.
    html_content_types = frozenset((
        'text/html',
        'text/html; charset=UTF-8',
        'text/html; charset=utf-8',
        'text/html; charset=ISO-8859-1',
        'charset=iso-8859-1',
    ))

    fail = 0
    succeed = 0
    # request the url/warc.gz file
    resp = requests.get(url, stream=True)
    try:
        # iterate through the archive
        for record in ArchiveIterator(resp.raw, arc2warc=True):
            # only response records (which is the case for html pages) ...
            if record.rec_type != 'response':
                continue
            # ... that carry HTTP headers ...
            if record.http_headers is None:
                continue
            # ... with one of the accepted content types
            content_type = record.http_headers.get_header('Content-Type')
            if content_type not in html_content_types:
                continue
            try:
                html = record.content_stream().read()
                # from html to plain text
                html_parse = html5lib.parseFragment(html)
                s = ''.join(html_parse.itertext())
                print(s)
                succeed += 1
            except Exception:
                # Best effort: count the bad record and move on.
                fail += 1
    finally:
        # The response is streamed, so release the connection explicitly.
        resp.close()
    print('fail: %s'%(fail))
    print('succeed: %s'%(succeed))
Exemplo n.º 6
0
 def html(self):
     """Parse ``self.content`` with html5lib and return the parsed fragment.

     The imported module is also stored on ``self.html5lib``.

     Raises:
         ImproperlyConfigured: if an ImportError occurs while importing or
             using html5lib.
     """
     try:
         import html5lib
         self.html5lib = html5lib
         return html5lib.parseFragment(self.content)
     except ImportError as err:  # "as" form is valid on Python 2.6+ and 3
         raise ImproperlyConfigured("Error while importing html5lib: %s" % err)
Exemplo n.º 7
0
def truncate(html,
             truncated_message,
             suffix,
             max_entities=None,
             max_length=None):
    """Run ``html`` through ``TelegramTruncator`` and return the serialized result.

    ``html``, ``truncated_message`` and ``suffix`` are each parsed as HTML
    fragments and handed to the truncator as etree token streams, together
    with ``max_entities`` and ``max_length``. Newlines are stripped from both
    ends of the rendered output.
    """
    make_walker = html5lib.getTreeWalker('etree')

    def as_stream(markup):
        # Parse one fragment and wrap it in an etree tree walker.
        return make_walker(html5lib.parseFragment(markup, treebuilder='etree'))

    truncator = TelegramTruncator(
        as_stream(html),
        truncated_message=as_stream(truncated_message),
        suffix=as_stream(suffix),
        max_entities=max_entities,
        max_length=max_length)
    rendered = HTMLSerializer().render(truncator)
    return rendered.strip('\n')
Exemplo n.º 8
0
 def html(self):
     """Parse ``self.content`` with html5lib and return the parsed fragment.

     The imported module is also stored on ``self.html5lib``.

     Raises:
         ImproperlyConfigured: if an ImportError occurs while importing or
             using html5lib.
     """
     try:
         import html5lib
         self.html5lib = html5lib
         return html5lib.parseFragment(self.content)
     except ImportError as err:  # "as" form is valid on Python 2.6+ and 3
         raise ImproperlyConfigured("Error while importing html5lib: %s" % err)
Exemplo n.º 9
0
def sanitize_html(html):
    """
    Make the given HTML string safe to display in a Yarrharr page.

    The fragment is parsed, passed through a fixed sequence of token-stream
    filters, then through html5lib's sanitizer filter, and serialized.
    """
    tree = html5lib.parseFragment(html)
    serializer = html5lib.serializer.HTMLSerializer()
    stream = html5lib.getTreeWalker("etree")(tree)

    # Applied in this exact order; each stage wraps the previous stream.
    stages = (
        _strip_attrs,
        _drop_empty_tags,
        _ReplaceObjectFilter,
        _ElideFilter,
        _ReplaceYoutubeEmbedFilter,
        _ExtractTitleTextFilter,
        _adjust_links,
        _video_attrs,
        _wp_smileys,
    )
    for stage in stages:
        stream = stage(stream)

    # Elements allowed in addition to the sanitizer's own list.
    extra_allowed = frozenset([
        # https://github.com/html5lib/html5lib-python/pull/423
        (namespaces["html"], "summary"),
        # https://github.com/html5lib/html5lib-python/pull/395
        (namespaces["html"], "wbr"),
    ])
    stream = sanitizer.Filter(
        stream,
        allowed_elements=sanitizer.allowed_elements | extra_allowed,
    )
    return serializer.render(stream)
Exemplo n.º 10
0
def strip_style_and_script(input):
    """Parse ``input`` as a DOM fragment and re-serialize it through
    ``NoChildTagFilter`` configured with the ``script`` and ``style`` tags."""
    fragment = html5lib.parseFragment(input, treebuilder="dom")
    tokens = html5lib.getTreeWalker("dom")(fragment)
    serializer = html5lib.serializer.HTMLSerializer()
    filtered = NoChildTagFilter(tokens, ("script", "style"))
    return serializer.render(filtered)
Exemplo n.º 11
0
 def get_title(self):
     """Return the text of the first ``h1`` in ``self.content``, UTF-8 encoded.

     The text pieces of the heading are joined with single spaces. Returns
     None when the lookup raises AttributeError (e.g. there is no ``h1``).
     """
     document = parseFragment(
         self.content,
         treebuilder='etree',
         namespaceHTMLElements=False,
         encoding='utf-8',
     )
     try:
         pieces = document.find('.//h1').itertext()
         return ' '.join(pieces).encode('utf-8')
     except AttributeError:
         return None
Exemplo n.º 12
0
def test_sanitizer(expected, input):
    """Check that ``sanitize_html(input)`` equals ``expected`` after
    ``expected`` is parsed and re-serialized with the options below."""
    serializer_options = {
        'omit_optional_tags': False,
        'use_trailing_solidus': True,
        'space_before_trailing_solidus': False,
        'quote_attr_values': "always",
        'quote_char': '"',
        'alphabetical_attributes': True,
    }
    reference = serialize(parseFragment(expected), **serializer_options)
    assert reference == sanitize_html(input)
Exemplo n.º 13
0
def runSanitizerTest(_, expected, input):
    """Check that ``sanitize_html(input)`` equals ``expected`` after
    ``expected`` is parsed and re-serialized with the options below.

    The first positional argument is ignored.
    """
    serializer_options = {
        'omit_optional_tags': False,
        'use_trailing_solidus': True,
        'space_before_trailing_solidus': False,
        'quote_attr_values': "always",
        'quote_char': '"',
        'alphabetical_attributes': True,
    }
    reference = serialize(parseFragment(expected), **serializer_options)
    assert reference == sanitize_html(input)
Exemplo n.º 14
0
def clean_nl(string):
    """
    Normalize newlines around block-level elements so that nl2br can be
    applied to the result without producing stray line breaks.

    Falsy input is returned unchanged.
    """
    if not string:
        return string

    # Namespaced tags that are treated as block-level elements.
    block_tags = (
        '{http://www.w3.org/1999/xhtml}blockquote',
        '{http://www.w3.org/1999/xhtml}ol',
        '{http://www.w3.org/1999/xhtml}li',
        '{http://www.w3.org/1999/xhtml}ul',
    )

    def strip_newlines(node):
        # etree reminder: ``text`` is the text before the first child and
        # ``tail`` is the text after the element, before its next sibling.
        # Eg: "<div>text <b>children's text</b> children's tail</div> tail".
        is_block = node.tag in block_tags

        if is_block:
            text = node.text
            if text:
                # Leading newlines always go; trailing ones only when there
                # are no children (otherwise the last child's tail holds the
                # trailing newlines and is handled in the loop below).
                text = text.lstrip('\n')
                if len(node) == 0:
                    text = text.rstrip('\n')
                node.text = text

            # Drop a single newline right after the block element.
            tail = node.tail
            if tail and tail.startswith('\n'):
                node.tail = tail[1:]

        for child in node:
            # Inside a block, trailing newlines of each child's tail go
            # before the child itself is processed.
            if is_block and child.tail:
                child.tail = child.tail.rstrip('\n')
            strip_newlines(child)
        return node

    cleaned = strip_newlines(html5lib.parseFragment(string))

    # Serialize the cleaned tree back to html.
    make_walker = html5lib.treewalkers.getTreeWalker('etree')
    serializer = HTMLSerializer(quote_attr_values='always',
                                omit_optional_tags=False)
    return serializer.render(make_walker(cleaned))
Exemplo n.º 15
0
    def _process_post(self, post):
        """Return the list of fetchable resources referenced by ``post``.

        Photo posts yield one ``Image`` per photo, youtube/vimeo video posts
        yield a ``Video`` built from the embed code's ``src``, and text posts
        yield an ``UnknownResource`` per link plus an ``Image`` per image in
        the body. Other post types yield an empty list.
        """
        fetchables = []
        post_type = post['type']

        if post_type == 'photo':
            # The first alt size seems to be the biggest one.
            for photo in post['photos']:
                fetchables.append(Image(photo['alt_sizes'][0]['url'], self.sfh))

        elif post_type == 'video':
            # Video type is one of youtube, vimeo, unknown; source_url is only
            # present sometimes. Youtube and Vimeo are embedded with an iframe
            # whose src is the link to the video.
            # To do: download the video, perhaps with
            # https://github.com/NFicano/pytube for Youtube.
            video_type = post['video_type']
            if video_type in ('youtube', 'vimeo'):
                # Some videos have no player (possibly reblogs of videos), in
                # which case there is no embed code to parse.
                embed_code = post['player'][0]['embed_code']
                if embed_code:
                    player = html5lib.parseFragment(embed_code)
                    video_url = player[0].attrib['src']
                    # Vimeo omits http
                    if video_url.startswith("//"):
                        video_url = "http:" + video_url
                    if video_type == 'youtube':
                        video_cls = youtube.Video
                    else:
                        video_cls = vimeo.Video
                    fetchables.append(video_cls(video_url, self.sfh))

        elif post_type == 'text':
            body = html5lib.parseFragment(post['body'], namespaceHTMLElements=False)
            # Links first, then images.
            for anchor in body.findall(".//a[@href]"):
                fetchables.append(UnknownResource(anchor.attrib['href'], self.sfh))
            for image in body.findall(".//img[@src]"):
                fetchables.append(Image(image.attrib['src'], self.sfh))
            # TODO: Consider whether other elements should be parsed, and
            # test whether a markdown original arrives as html or markdown.
        # TODO: Other post types

        return fetchables
Exemplo n.º 16
0
def sanitize(string):
    """
    Return ``string`` parsed as an HTML fragment and re-serialized with
    html5lib's sanitizer enabled, so that malicious markup cannot break the
    page.
    """
    import html5lib

    fragment = html5lib.parseFragment(string)
    return html5lib.serialize(
        fragment,
        sanitize=True,
        omit_optional_tags=False,
        quote_attr_values='always',
    )
Exemplo n.º 17
0
 def test_linkify(self):
     """The ``linkify`` filter turns a bare URL into a single nofollow link."""
     tmpl = env.from_string('{{ "http://test.example.com"|linkify}}')
     rendered = tmpl.render()
     el = html5lib.parseFragment(rendered)
     # list(element) replaces Element.getchildren() (removed in Python 3.9);
     # assertEqual replaces the assertEquals alias (removed in Python 3.12).
     children = list(el)
     self.assertEqual(len(children), 1)
     el = children[0]
     self.assertEqual(el.tag, u'{http://www.w3.org/1999/xhtml}a')
     self.assertEqual(el.text, u'http://test.example.com')
     self.assertEqual(sorted(el.items()),
                      [(u'href', u'http://test.example.com'),
                       (u'rel', u'nofollow')])
Exemplo n.º 18
0
def typo_html(data, out=None):
    """Run ``TypoWalker`` over ``data`` parsed as an HTML fragment.

    Output is written to ``out``. When ``out`` is falsy, a fresh in-memory
    buffer is used and its contents are returned; otherwise nothing is
    returned.

    Raises:
        RuntimeError: if ``data`` is non-empty and not ``unicode``.
    """
    if data and not isinstance(data, unicode):
        raise RuntimeError("`typo_html` requires unicode")

    owns_buffer = not out
    if owns_buffer:
        out = cStringIO.StringIO()

    TypoWalker(html5lib.parseFragment(data), out)

    if owns_buffer:
        return out.getvalue()
Exemplo n.º 19
0
def sanitize_html(stream):
    """Parse ``stream`` as an HTML fragment and return it serialized with
    html5lib's sanitizer enabled."""
    serializer_options = {
        'sanitize': True,
        'omit_optional_tags': False,
        'use_trailing_solidus': True,
        'space_before_trailing_solidus': False,
        'quote_attr_values': "always",
        'quote_char': '"',
        'alphabetical_attributes': True,
    }
    return serialize(parseFragment(stream), **serializer_options)
Exemplo n.º 20
0
def obfuscate_emails(content):
    """Rewrite ``content._content`` in place through ``ObfuscateEmailsFilter``.

    ``contents.Static`` instances are left untouched.
    """
    if isinstance(content, contents.Static):
        return

    tree = html5lib.parseFragment(content._content, treebuilder="etree")
    tokens = html5lib.getTreeWalker("etree")(tree)
    filtered = ObfuscateEmailsFilter(tokens)
    serializer = html5lib.serializer.HTMLSerializer(
        quote_attr_values="always",
        omit_optional_tags=False,
    )
    content._content = serializer.render(filtered)
Exemplo n.º 21
0
    def request(self, **options):
        """
        Perform a remote theater program request and return the html5
        document with results.  You need to extract details yourself.

        ``options`` are URL-encoded and appended to ``self.base_url``. The
        response body is wrapped in a ``<div>`` before parsing, and the last
        item of the parsed fragment is returned.
        """

        fp = urlopen(self.base_url + urlencode(options))
        try:
            data = '<div>' + fp.read() + '</div>'
        finally:
            # Close the handle even if reading the response fails.
            fp.close()

        return html5.parseFragment(data, 'div', 'lxml', 'utf-8', False).pop()
Exemplo n.º 22
0
def typo_html(data, out=None):
    """Run ``TypoWalker`` over ``data`` parsed as an HTML fragment.

    Output is written to ``out``. When ``out`` is falsy, a fresh in-memory
    buffer is used and its contents are returned; otherwise nothing is
    returned.

    Raises:
        RuntimeError: if ``data`` is non-empty and not ``unicode``.
    """
    if data and not isinstance(data, unicode):
        raise RuntimeError("`typo_html` requires unicode")

    owns_buffer = not out
    if owns_buffer:
        out = cStringIO.StringIO()

    TypoWalker(html5lib.parseFragment(data), out)

    if owns_buffer:
        return out.getvalue()
Exemplo n.º 23
0
def clean_nl(string):
    """
    This will clean up newlines so that nl2br can properly be called on the
    cleaned text.

    Falsy input is returned unchanged; otherwise the string is parsed as an
    HTML fragment, newlines around block-level elements are trimmed, and the
    tree is serialized back to HTML.
    """

    html_blocks = [
        "{http://www.w3.org/1999/xhtml}blockquote",
        "{http://www.w3.org/1999/xhtml}ol",
        "{http://www.w3.org/1999/xhtml}li",
        "{http://www.w3.org/1999/xhtml}ul",
    ]

    if not string:
        return string

    def parse_html(tree):
        # In etree, a tag may have:
        # - some text content (piece of text before its first child)
        # - a tail (piece of text just after the tag, and before a sibling)
        # - children
        # Eg: "<div>text <b>children's text</b> children's tail</div> tail".

        # Strip new lines directly inside block level elements: first new lines
        # from the text, and:
        # - last new lines from the tail of the last child if there's children
        #   (done in the children loop below).
        # - or last new lines from the text itself.
        if tree.tag in html_blocks:
            if tree.text:
                tree.text = tree.text.lstrip("\n")
                if not len(tree):  # No children.
                    tree.text = tree.text.rstrip("\n")

            # Remove the first new line after a block level element.
            if tree.tail and tree.tail.startswith("\n"):
                tree.tail = tree.tail[1:]

        for child in tree:  # Recurse down the tree.
            if tree.tag in html_blocks:
                # Strip new lines directly inside block level elements: remove
                # the last new lines from the children's tails.
                if child.tail:
                    child.tail = child.tail.rstrip("\n")
            parse_html(child)
        return tree

    parse = parse_html(html5lib.parseFragment(string))

    # Serialize the parsed tree back to html.
    walker = html5lib.treewalkers.getTreeWalker("etree")
    stream = walker(parse)
    # "always" instead of True: html5lib 1.x only accepts "always", "spec" or
    # "legacy" and raises ValueError for anything else, while older releases
    # that treated the option as a boolean see a truthy value either way.
    serializer = HTMLSerializer(quote_attr_values="always", omit_optional_tags=False)
    return serializer.render(stream)
Exemplo n.º 24
0
def sanitize_html(stream):
    """Parse ``stream`` as an HTML fragment and return it serialized with
    html5lib's sanitizer enabled."""
    serializer_options = {
        'sanitize': True,
        'omit_optional_tags': False,
        'use_trailing_solidus': True,
        'space_before_trailing_solidus': False,
        'quote_attr_values': "always",
        'quote_char': '"',
        'alphabetical_attributes': True,
    }
    return serialize(parseFragment(stream), **serializer_options)
Exemplo n.º 25
0
 def test_linkify(self):
   """The ``linkify`` filter turns a bare URL into a single nofollow link."""
   tmpl = env.from_string('{{ "http://test.example.com"|linkify}}')
   rendered = tmpl.render()
   el = html5lib.parseFragment(rendered)
   # list(element) replaces Element.getchildren() (removed in Python 3.9);
   # assertEqual replaces the assertEquals alias (removed in Python 3.12).
   children = list(el)
   self.assertEqual(len(children), 1)
   el = children[0]
   self.assertEqual(el.tag, u'{http://www.w3.org/1999/xhtml}a')
   self.assertEqual(el.text, u'http://test.example.com')
   self.assertEqual(
     sorted(el.items()),
     [(u'href', u'http://test.example.com'), (u'rel', u'nofollow')]
   )
Exemplo n.º 26
0
    def _html_serialize(self, chunks, attributes, max_length):
        """Returns concatenated HTML code with SPAN tag.

        Args:
          chunks: The list of chunks to be processed. (ChunkList)
          attributes: Name-value pairs applied as attributes to every output
              SPAN tag. This method calls ``.items()`` on it, so it must be a
              mapping here. (dict)
          max_length: Maximum length of span enclosed chunk; a falsy value
              disables the limit. (int, optional)

        Returns:
          The organized HTML code. (str)
        """
        doc = ET.Element('span')
        for chunk in chunks:
            # len(doc) / doc[-1] replace Element.getchildren(), which was
            # removed in Python 3.9.
            last = doc[-1] if len(doc) else None
            if chunk.is_space():
                if last is not None:
                    last.tail = (last.tail or '') + ' '
                elif doc.text is not None:
                    # We want to preserve space in cases like "Hello 你好"
                    # But the space in " 你好" can be discarded.
                    doc.text += ' '
            elif chunk.has_cjk() and not (max_length
                                          and len(chunk.word) > max_length):
                ele = ET.Element('span')
                ele.text = chunk.word
                for k, v in attributes.items():
                    ele.attrib[k] = v
                doc.append(ele)
            else:
                # add word without span tag for non-CJK text (e.g. English)
                # by appending it after the last element, or to the root's
                # own text when there is no element yet
                if last is not None:
                    last.tail = (last.tail or '') + chunk.word
                else:
                    doc.text = (doc.text or '') + chunk.word
        result = ET.tostring(doc, encoding='utf-8').decode('utf-8')
        result = html5lib.serialize(html5lib.parseFragment(result),
                                    sanitize=True,
                                    quote_attr_values="always")
        return result
Exemplo n.º 27
0
def test_parse_fragment_etree():
    """
    Parsing a fragment with the etree builder yields a single
    ``DOCUMENT_FRAGMENT`` root element whose direct children are the parsed
    elements.
    """
    xhtml = "{http://www.w3.org/1999/xhtml}"
    fragment = parseFragment("<p>...</p><div>...</div>", treebuilder="etree")
    assert fragment.tag == 'DOCUMENT_FRAGMENT'

    p, div = fragment
    assert p.tag == xhtml + "p"
    assert p.text == "..."
    assert div.tag == xhtml + "div"
    assert div.text == "..."
Exemplo n.º 28
0
def truncate(html, length, killwords=False, end='...'):
    """
    Return ``html`` as markup, trimmed when its text is longer than
    ``length`` chars.

    ``killwords`` and ``end`` are handed to ``trim``; the original author
    notes that they are currently ignored.
    """
    tree = html5lib.parseFragment(html, encoding='utf-8')
    if text_length(tree) > length:
        short, _ = trim(tree, length, killwords, end)
        return jinja2.Markup(force_unicode(short.toxml()))
    # Short enough already: return the input markup untouched.
    return jinja2.Markup(html)
Exemplo n.º 29
0
def test_parse_fragment_lxml():
    """
    Parsing a fragment with the lxml builder yields a plain list holding the
    elements of the fragment.
    """
    xhtml = "{http://www.w3.org/1999/xhtml}"
    fragment = parseFragment("<p>...</p><div>...</div>", treebuilder="lxml")
    assert isinstance(fragment, list)

    p, div = fragment
    assert p.tag == xhtml + "p"
    assert p.text == "..."
    assert div.tag == xhtml + "div"
    assert div.text == "..."
Exemplo n.º 30
0
def test_linkify():
    """The ``linkify`` filter turns a bare URL into a single nofollow link."""
    tmpl = env.from_string('{{ "http://test.example.com"|linkify}}')
    rendered = tmpl.render()
    el = html5lib.parseFragment(rendered)
    # list(element) replaces Element.getchildren(), which was removed from
    # xml.etree.ElementTree in Python 3.9.
    children = list(el)
    assert len(children) == 1

    el = children[0]
    assert el.tag == "{http://www.w3.org/1999/xhtml}a"
    assert el.text == "http://test.example.com"
    assert sorted(el.items()) == [
        ("href", "http://test.example.com"),
        ("rel", "nofollow"),
    ]
Exemplo n.º 31
0
def test_linkify():
    """The ``linkify`` filter turns a bare URL into a single nofollow link."""
    tmpl = env.from_string('{{ "http://test.example.com"|linkify}}')
    rendered = tmpl.render()
    el = html5lib.parseFragment(rendered)
    # list(element) replaces Element.getchildren(), which was removed from
    # xml.etree.ElementTree in Python 3.9.
    children = list(el)
    assert len(children) == 1

    el = children[0]
    assert el.tag == "{http://www.w3.org/1999/xhtml}a"
    assert el.text == "http://test.example.com"
    assert sorted(el.items()) == [
        ("href", "http://test.example.com"),
        ("rel", "nofollow"),
    ]
Exemplo n.º 32
0
    def run(self, text):
        """Sanitize ``text`` with ``ForgeHTMLSanitizerFilter`` and return the HTML.

        This mirrors the steps of ``html5lib.serialize(parsed, sanitize=True)``
        but puts the custom filter in place of the standard sanitizer.
        """
        fragment = html5lib.parseFragment(text)
        walker_cls = html5lib.treewalkers.getTreeWalker("etree")
        # The custom step: our own filter instead of the stock one.
        tokens = ForgeHTMLSanitizerFilter(walker_cls(fragment))
        serializer = html5lib.serializer.HTMLSerializer()
        return serializer.render(tokens)
Exemplo n.º 33
0
 def get_excerpt(self):
     """
     Look in the lead text (``self.lead``) for the 'chapeau', the first
     paragraph, which can be used as a description.

     Returns the content of the first top-level ``<p>``: its leading text
     followed by its serialized child elements. Returns an empty string when
     there is no such paragraph.
     """
     dom = html5lib.parseFragment(self.lead, treebuilder="etree", namespaceHTMLElements=False)
     for el in dom:
         if el.tag == "p":
             head = el.text or ""
             # el.text does not return the entire text if you have <p>Text with <em>child</em> tags</p>
             # cf http://stackoverflow.com/a/380717
             # Iterate the element directly: Element.getchildren() was
             # removed in Python 3.9.
             # NOTE(review): ElementTree.tostring returns bytes on Python 3
             # unless given encoding="unicode" - confirm the target version.
             return "".join([head] + [ElementTree.tostring(e) for e in el])
     return u""
Exemplo n.º 34
0
def sanitize(string):
    """
    Return ``string`` parsed as an HTML fragment and re-serialized with
    html5lib's sanitizer enabled, so that malicious markup cannot break the
    page.
    """
    import html5lib

    fragment = html5lib.parseFragment(string)
    serializer_options = {
        'sanitize': True,
        'omit_optional_tags': False,
        'quote_attr_values': 'always',
    }
    return html5lib.serialize(fragment, **serializer_options)
Exemplo n.º 35
0
def preprocess(source):
    """Reduce an HTML fragment to its text with tidied whitespace.

    Newlines are removed, the result is stripped at both ends, and every run
    of two or more whitespace characters becomes a single space.

    Args:
      source (str): Input sentence.

    Returns:
      Preprocessed sentence. (str)
    """
    fragment = html5lib.parseFragment(source)
    raw_text = ET.tostring(fragment, encoding='utf-8', method='text')
    text = raw_text.decode('utf-8').replace(u'\n', u'').strip()
    return re.sub(r'\s\s+', u' ', text)
Exemplo n.º 36
0
    def test_attrib_no_toolbar(self,
                               name='form-0-code',
                               value='<html></html>'):
        """Without a toolbar the widget renders an editor wrapper holding
        exactly a loading ``div`` and a ``textarea``."""
        widget = django_ace.AceWidget(toolbar=False)
        markup = widget.render(name, value)

        fragment = html5lib.parseFragment(markup, namespaceHTMLElements=False)
        editor = fragment[0]
        self.assertEqual(len(editor), 2)
        self.assertEqual(editor.attrib['class'], 'django-ace-editor')

        placeholder, textarea = editor[0], editor[1]
        self.assertEqual(placeholder.tag, 'div')
        self.assertEqual(placeholder.attrib['class'],
                         'django-ace-widget loading')
        self.assertEqual(textarea.tag, 'textarea')
Exemplo n.º 37
0
def truncate(html, length, killwords=False, end='...'):
    """
    Return ``html`` as markup, trimmed when its text is longer than
    ``length`` chars.

    ``killwords`` and ``end`` are handed to ``trim``; the original author
    notes that they are currently ignored.

    ONLY USE FOR KNOWN-SAFE HTML.
    """
    tree = html5lib.parseFragment(html, encoding='utf-8')
    if text_length(tree) > length:
        short, _ = trim(tree, length, killwords, end)
        return jinja2.Markup(force_unicode(short.toxml()))
    # Short enough already: return the input markup untouched.
    return jinja2.Markup(html)
Exemplo n.º 38
0
Arquivo: tests.py Projeto: hwine/elmo
 def test_green(self):
     """``showrun`` on a fresh run renders one link with zeroed data
     attributes and the text 'green'."""
     run = Run()
     run.id = 1
     rendered = showrun(run)
     ok_(isinstance(rendered, SafeString))

     nodes = parseFragment(rendered).childNodes
     eq_(len(nodes), 1)

     link = nodes[0]
     expected_attrs = {
         'data-errors': '0',
         'data-total': '0',
         'data-missing': '0',
         'href': '/dashboard/compare?run=1',
         'data-warnings': '0',
     }
     eq_(link.attributes, expected_attrs)
     ok_('green' in link.childNodes[0].value)
Exemplo n.º 39
0
 def test_green(self):
     """``showrun`` on a fresh run renders one link with zeroed data
     attributes and the text 'green'."""
     run = Run()
     run.id = 1
     rendered = showrun(run)
     ok_(isinstance(rendered, SafeString))

     nodes = parseFragment(rendered).childNodes
     eq_(len(nodes), 1)

     link = nodes[0]
     expected_attrs = {
         'data-errors': '0',
         'data-total': '0',
         'data-missing': '0',
         'href': '/dashboard/compare?run=1',
         'data-warnings': '0',
     }
     eq_(link.attributes, expected_attrs)
     ok_('green' in link.childNodes[0].value)
Exemplo n.º 40
0
 def get(self, **kwargs):
     """Return a page (or its excerpt) in the requested format.

     Keyword arguments read here: ``_api_user``, ``id``, ``format``
     (default ``'object'``) and ``part`` (default ``'all'``).

     Returns:
         ``{'html': ...}`` for the ``html`` and ``div`` formats, otherwise
         ``{'page': ...}`` with the marshalled page.

     Raises:
         APISyntaxError: unknown ``format`` or ``part``.
         NotFound: the page cannot be fetched, or an excerpt was requested
             and the page has none.
         Gone: the page is not current.
     """
     user = kwargs.get('_api_user')
     fmt = kwargs.get('format', 'object')
     part = kwargs.get('part', 'all')
     if fmt not in self._valid_formats:
         raise APISyntaxError("Unknown format: {0}".format(fmt))
     if part not in self._valid_parts:
         raise APISyntaxError("Unknown part: {0}".format(part))
     try:
         page = confluence_session.getPageById(kwargs.get('id'))
     except RemoteException:
         raise NotFound('Page not found')
     require_access(user, auth.Permissions.READ, page.space)
     logger.debug("Access checked")
     page.short_url = confluence_session.make_short_url(page.shortcode)
     if not page.current:
         raise Gone(config.get('Text', 'deleted_article',
                               'Article deleted.'))
     # Make a copy so we don't clobber the class one
     marshal_fields = self._fields.copy()
     render_kwargs = {'page_id': page.id}
     if part == 'excerpt':
         if page.excerpt is None:
             raise NotFound('The article has no excerpt.')
         del marshal_fields['content']
         marshal_fields['excerpt'] = fields.String
         render_kwargs['content'] = page.excerpt
     # Why do we not simply always pass 'content' to renderContent,
     # and just decide between the page content or the excerpt?  Because
     # when a page is rendered by page_id alone, it can be (and is) cached.
     # when a page is rendered by arbitrary content, it is not.
     if fmt == 'html':
         return {'html': confluence_session.renderContent(**render_kwargs) }
     if fmt == 'div':
         html = confluence_session.renderContent(style='clean',
                                                 **render_kwargs)
         parsed = html5lib.parseFragment(html, treebuilder='etree',
                                         namespaceHTMLElements=False)
         # Make site-relative /confluence URLs absolute. Both lookups use a
         # '' default so that an <img> without src (like an <a> without
         # href) is skipped instead of raising AttributeError on None.
         for el in parsed.findall(".//img"):
             if el.get('src', '').startswith('/confluence'):
                 el.set('src', 'http://kb.mit.edu' + el.get('src'))
         for el in parsed.findall(".//a"):
             if el.get('href', '').startswith('/confluence'):
                 el.set('href', 'http://kb.mit.edu' + el.get('href'))
         cleaned = xmletree.tostring(parsed[0], method='html' )
         return {'html':  cleaned}
     return { 'page': marshal(page,
                              marshal_fields)}
Exemplo n.º 41
0
    def get_image(self):
        """
        Return the ``Image`` object matching the first image in the body text.

        The first ``<img>`` of ``self.body`` is looked up among the images
        whose original filename matches (case-insensitively) and whose url
        equals the path of the ``src``. Returns None when there is no image
        or no match.
        """
        dom = html5lib.parseFragment(self.body, treebuilder="etree", namespaceHTMLElements=False)
        images = dom.findall('.//img')
        if not images:
            return None

        # e.g. src  u'https://medor.coop/media/filer_public/cb/1b/cb1b0760-5931-4766-b062-6ea821ba33c6/gent-cropped.png'
        #      path u'/media/filer_public/cb/1b/cb1b0760-5931-4766-b062-6ea821ba33c6/gent-cropped.png'
        #      name u'gent-cropped.png'
        src = images[0].get('src')
        src_path = urlparse(src).path
        filename = basename(src_path)

        candidates = Image.objects.filter(original_filename__iexact=filename)
        return next(
            (candidate for candidate in candidates if candidate.url == src_path),
            None)
Exemplo n.º 42
0
def generate_slug(html):
    """Build a URL slug from an HTML fragment.

    The text of the first ``<h1>`` is used when the fragment has one,
    otherwise the text of the whole fragment.  That text is cut down by
    ``get_first_sentence``, transliterated with ``unidecode``, lower-cased,
    restricted to ASCII letters, digits, ``+``, ``-`` and spaces, and its
    words are joined with hyphens.  The result is returned UTF-8 encoded.
    """
    fragment = parseFragment(html, treebuilder='etree',
                             namespaceHTMLElements=False, encoding='utf-8')

    # Fall back to the complete fragment when there is no heading.
    heading = fragment.find('.//h1')
    text_source = fragment if heading is None else heading

    sentence = get_first_sentence(' '.join(text_source.itertext()))
    lowered = unidecode(sentence).lower()

    allowed = 'abcdefghijklmnopqrstuvwxyz' '1234567890+- '
    kept = ''.join(ch for ch in lowered if ch in allowed)

    slug = '-'.join(kept.split())
    return slug.encode('utf-8')
Exemplo n.º 43
0
 def test_green(self):
     """showrun() of a fresh run yields one link with zeroed counters and 'green' text."""
     run = Run()
     run.id = 1

     rendered = showrun(run)
     ok_(isinstance(rendered, SafeUnicode))

     # The rendered markup must consist of exactly one top-level node.
     top_level = list(parseFragment(rendered))
     eq_(len(top_level), 1)

     link = top_level[0]
     expected_attrs = {
         'data-errors': '0',
         'data-total': '0',
         'data-missing': '0',
         'href': '/dashboard/compare?run=1',
         'data-warnings': '0',
     }
     eq_(link.attrib, expected_attrs)

     label = link.text
     ok_('green' in label)
Exemplo n.º 44
0
 def test_green(self):
     """showrun() of a fresh run yields one link with zeroed counters and 'green' text."""
     run = Run()
     run.id = 1

     rendered = showrun(run)
     self.assertIsInstance(rendered, SafeUnicode)

     # The rendered markup must consist of exactly one top-level node.
     top_level = list(parseFragment(rendered))
     self.assertEqual(len(top_level), 1)

     link = top_level[0]
     expected_attrs = {
         'data-errors': '0',
         'data-total': '0',
         'data-missing': '0',
         'href': '/dashboard/compare?run=1',
         'data-warnings': '0',
     }
     self.assertDictEqual(link.attrib, expected_attrs)

     label = link.text
     self.assertIn('green', label)
Exemplo n.º 45
0
    def runtest(self):
        """Serialize the case's input with sanitizing on and compare it to the expected output."""
        source = self.test["input"]
        expected = self.test["output"]

        # Serializer settings used for every sanitizer test case.
        options = dict(
            sanitize=True,
            omit_optional_tags=False,
            use_trailing_solidus=True,
            space_before_trailing_solidus=False,
            quote_attr_values="always",
            quote_char="'",
            alphabetical_attributes=True)
        serialized = serialize(parseFragment(source), **options)

        report_lines = [
            "\n\nInput:", source,
            "\nExpected:", expected,
            "\nReceived:", serialized,
        ]
        errorMsg = "\n".join(report_lines)
        assert expected == serialized, errorMsg
Exemplo n.º 46
0
    def test_attrib_options(self, name='form-0-code', value='<html></html>'):
        """Options given to AceWidget show up as attributes of the rendered widget div."""
        options = dict(
            mode='html',
            theme='twilight',
            wordwrap=True,
            showinvisibles=True,
            minlines=8,
            maxlines=16,
            tabsize=4,
            fontsize=12,
        )
        rendered = django_ace.AceWidget(**options).render(name, value)

        fragment = html5lib.parseFragment(rendered, namespaceHTMLElements=False)
        container = fragment[0]

        # The widget div is the second child of the editor container.
        widget = container[1]
        self.assertEqual(widget.tag, 'div')

        attrs = widget.attrib
        expected_names = [
            'class',
            'style',
            'data-mode',
            'data-theme',
            'data-wordwrap',
            'data-minlines',
            'data-maxlines',
            'data-tabsize',
            'data-fontsize',
            'data-behaviours',
            'data-readonly',
            'data-showgutter',
            'data-showinvisibles',
            'data-showprintmargin',
            'data-usesofttabs',
            'data-use-worker',
        ]
        self.assertEqual(len(attrs.keys()), 16)
        self.assertEqual(sorted(attrs.keys()), sorted(expected_names))

        # Checked in a fixed order so the first failure reported is stable.
        expected_values = (
            ('data-mode', 'html'),
            ('data-theme', 'twilight'),
            ('data-wordwrap', ''),
            ('data-minlines', '8'),
            ('data-maxlines', '16'),
            ('data-tabsize', '4'),
            ('data-fontsize', '12'),
        )
        for attr_name, attr_value in expected_values:
            self.assertEqual(attrs[attr_name], attr_value)
Exemplo n.º 47
0
def sanitize_html(html):
    """
    Return a sanitized serialization of ``html`` for display in a Yarrharr page.

    The fragment is parsed, walked as an etree token stream, passed through
    the module's filters in a fixed order and rendered by a sanitizing
    html5lib serializer.
    """
    fragment = html5lib.parseFragment(html)
    serializer = html5lib.serializer.HTMLSerializer(sanitize=True)
    stream = html5lib.getTreeWalker('etree')(fragment)

    # Each stage wraps the previous stream, so the order is significant.
    pipeline = (
        _strip_attrs,
        _drop_empty_tags,
        _ReplaceObjectFilter,
        _ElideFilter,
        _ReplaceYoutubeEmbedFilter,
        _ExtractTitleTextFilter,
        _adjust_links,
        _video_attrs,
        _wp_smileys,
    )
    for stage in pipeline:
        stream = stage(stream)

    return serializer.render(stream)
Exemplo n.º 48
0
def typogrify(html):
    """Run an HTML fragment through the whitespace, medor and figures filters and re-serialize it."""
    # The etree tree builder is chosen on purpose: it is not affected by the
    # bug where text containing entities gets split into several adjacent
    # text nodes (tip from the html5lib maintainers).
    # See <https://github.com/html5lib/html5lib-python/issues/208>
    dom = html5lib.parseFragment(html, treebuilder="etree")
    tokens = html5lib.getTreeWalker("etree")(dom)

    # Filters are stacked in this order; each one wraps the previous stream.
    for filter_module in (whitespace, medor, figures):
        tokens = filter_module.Filter(tokens)

    serializer = html5lib.serializer.HTMLSerializer(
        quote_attr_values=True,
        omit_optional_tags=False)
    return serializer.render(tokens)
Exemplo n.º 49
0
    def runtest(self):
        """Check that sanitizing serialization of the input matches the expected output."""
        case = self.test
        markup = case["input"]
        expected = case["output"]

        serialized = serialize(
            parseFragment(markup),
            sanitize=True,
            omit_optional_tags=False,
            use_trailing_solidus=True,
            space_before_trailing_solidus=False,
            quote_attr_values="always",
            quote_char="'",
            alphabetical_attributes=True,
        )

        # Build the failure report up front, as a readable three-part message.
        errorMsg = "\n".join([
            "\n\nInput:", markup,
            "\nExpected:", expected,
            "\nReceived:", serialized,
        ])
        assert expected == serialized, errorMsg
Exemplo n.º 50
0
    def parse_comments(self, root, raw):
        """
        Return the rendered description from the first ``div.descrip``.

        :param root: parsed page; only its ``xpath`` method is used here.
        :param raw: unused; kept so the signature stays the same for callers.
        :returns: the result of ``self._render_comments`` for the description
            node, or ``''`` when the page has no ``div`` with class
            ``descrip``.
        """
        nodes = root.xpath('//div[@class="descrip"]')
        if not nodes:
            return ''

        ns = nodes[0]
        if len(ns) == 0 and ns.text:
            import html5lib
            # No child elements, only text: re-parse that text as HTML so any
            # markup it contains becomes real elements under a wrapper <div>.
            ns = html5lib.parseFragment('<div>%s</div>' % (ns.text),
                                        treebuilder='lxml',
                                        namespaceHTMLElements=False)[0]

        return self._render_comments(ns)
Exemplo n.º 51
0
    def _render_comments(self, desc):
        """
        Reduce a description node to sanitized comment text.

        The node is re-parsed with html5lib, narrowed to one candidate
        element when several are found, stripped of ``noscript`` elements,
        of elements with class ``seeAll``/``emptyClear`` or id
        ``collapsePS``/``expandPS`` and of link targets, serialized as text,
        tidied with a few substitutions and finally passed through calibre's
        ``sanitize_comments_html``.

        :param desc: node to render; it is read through ``self.totext``.
        :returns: whatever ``sanitize_comments_html`` returns for the
            cleaned-up text.
        """
        from calibre.library.comments import sanitize_comments_html
        import html5lib
        # html5lib parsed noscript as CDATA
        # Re-parse the node's text, with every 'textarea' substring replaced
        # by 'div', as the content of a wrapper <div> (lxml tree builder).
        # NOTE(review): self.totext presumably returns markup rather than
        # plain text, otherwise re-parsing would gain nothing -- confirm.

        desc = html5lib.parseFragment('<div>%s</div>' % (self.totext(desc).replace('textarea', 'div')), \
                                      treebuilder='lxml', namespaceHTMLElements=False)[0]
        # Look for elements whose text contains one of five Chinese section
        # headings (roughly: content summary, content recommendation,
        # editor's recommendation, content introduction, basic information)
        # and collect the <p>, <div> and <span> children of each such
        # element's parent.
        matches = desc.xpath('descendant::*[contains(text(), "内容提要") \
            or contains(text(), "内容推荐") or contains(text(), "编辑推荐") \
            or contains(text(), "内容简介") or contains(text(), "基本信息")]/../*[self::p or self::div or self::span]'
                             )

        # With several candidates, default to the last one but prefer the
        # first whose text length is strictly between 50 and 200.  With zero
        # or exactly one candidate, desc stays the whole wrapper <div>.
        if matches:
            if len(matches) > 1:
                desc = matches[-1]
                for item in matches:
                    content_len = len(self.totext(item))
                    if content_len > 50 and content_len < 200:
                        desc = item
                        break

        # Drop <noscript> elements below the chosen node.
        for c in desc.xpath('descendant::noscript'):
            c.getparent().remove(c)
        # Drop elements identified by these specific class names or ids.
        for c in desc.xpath('descendant::*[@class="seeAll" or'
                            ' @class="emptyClear" or @id="collapsePS" or'
                            ' @id="expandPS"]'):
            c.getparent().remove(c)
        # Turn links into plain spans: remove the href and rename the tag.
        for a in desc.xpath('descendant::a[@href]'):
            del a.attrib['href']
            a.tag = 'span'
        # Serialize with method='text' and strip surrounding whitespace.  The
        # `unicode` builtin only exists on Python 2.
        desc = self.tostring(desc, method='text', encoding=unicode).strip()
        # Encoding bug in Amazon data U+fffd (replacement char)
        # in some examples it is present in place of '
        desc = desc.replace('\ufffd', "'")
        # NOTE(review): after a method='text' serialization the string
        # presumably holds no markup, so the tag-oriented substitutions below
        # may be no-ops -- confirm against self.tostring.
        # remove all attributes from tags
        desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
        # Collapse whitespace
        desc = re.sub('\n+', '\n', desc)
        desc = re.sub(' +', ' ', desc)
        # Remove the notice about text referring to out of print editions
        desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc)
        # Remove comments
        desc = re.sub(r'(?s)<!--.*?-->', '', desc)
        return sanitize_comments_html(desc)
Exemplo n.º 52
0
    def test_attrib_default(self, name='form-0-code', value='<html></html>'):
        """A default AceWidget renders an editor div holding toolbar, widget and textarea."""
        rendered = django_ace.AceWidget().render(name, value)
        fragment = html5lib.parseFragment(rendered, namespaceHTMLElements=False)

        # Outer container.
        editor = fragment[0]
        self.assertEqual(len(editor), 3)
        self.assertEqual(editor.tag, 'div')
        self.assertEqual(editor.attrib['class'], 'django-ace-editor')

        # First child: toolbar with a single maximise/minimise link.
        toolbar = editor[0]
        self.assertEqual(toolbar.tag, 'div')
        self.assertEqual(sorted(toolbar.attrib.keys()), ['class', 'style'])
        self.assertEqual(toolbar.attrib['class'], 'django-ace-toolbar')
        self.assertEqual(toolbar.attrib['style'], 'width: 500px')
        toggle = toolbar[0]
        self.assertEqual(toggle.tag, 'a')
        self.assertEqual(sorted(toggle.attrib.keys()), ['class', 'href'])
        self.assertEqual(toggle.attrib['class'], 'django-ace-max_min')
        self.assertEqual(toggle.attrib['href'], './')

        # Second child: the widget div carrying the data-* configuration.
        widget = editor[1]
        self.assertEqual(widget.tag, 'div')
        widget_attrs = widget.attrib
        expected_names = [
            'class',
            'style',
            'data-behaviours',
            'data-readonly',
            'data-showgutter',
            'data-showprintmargin',
            'data-usesofttabs',
            'data-use-worker',
        ]
        self.assertEqual(len(widget_attrs.keys()), 8)
        self.assertEqual(sorted(widget_attrs.keys()), sorted(expected_names))
        expected_values = (
            ('class', 'django-ace-widget loading'),
            ('style', 'width:500px; height:300px'),
            ('data-showprintmargin', ''),
            ('data-usesofttabs', ''),
            ('data-use-worker', ''),
        )
        for attr_name, attr_value in expected_values:
            self.assertEqual(widget_attrs[attr_name], attr_value)

        # Third child: the textarea that holds the submitted value.
        textarea = editor[2]
        self.assertEqual(textarea.tag, 'textarea')
        self.assertEqual(textarea.attrib['name'], name)
        self.assertEqual(textarea.text, value)
Exemplo n.º 53
0
def find_iter(skeleton, document):
    """
    Lazily yield every element of ``document`` that matches ``skeleton``.

    Either argument may be passed as a string, in which case it is parsed
    with html5lib first.  A skeleton string must parse to exactly one root
    element, otherwise ``ValueError`` is raised.  Because this is a
    generator, parsing and that check only run once iteration starts.

    See `find_all` for details.
    """
    if is_string(document):
        document = html5lib.parse(document)

    if is_string(skeleton):
        roots = html5lib.parseFragment(skeleton)
        if len(roots) != 1:
            raise ValueError("Skeleton must have exactly one root element.")
        skeleton = roots[0]

    for candidate in document.iter():
        if not node_matches_bone(candidate, skeleton):
            continue
        yield candidate
Exemplo n.º 54
0
def clean_nl(string):
    """
    Tidy newlines around list and blockquote elements ahead of ``nl2br``.

    Newlines at the edges of text directly inside ``blockquote``, ``ol``,
    ``li`` or ``ul`` are stripped, and one leading newline is removed from
    text that follows such an element.  Falsy input is returned unchanged;
    an empty parse result yields ``''``.
    """
    if not string:
        return string

    block_tags = ['blockquote', 'ol', 'li', 'ul']
    text_node_type = 4

    def strip_newlines(parent):
        previous_name = ''
        for index, child in enumerate(parent.childNodes):
            if child.type != text_node_type:
                # Element: process its subtree, then re-insert the processed
                # node before itself and remove it, as the tree API requires.
                parent.insertBefore(strip_newlines(child), child)
                parent.removeChild(child)
            else:
                text = child.value

                # Text sitting directly inside a block element.
                if child.parent.name in block_tags:
                    text = text.strip('\n')

                # Text right after a block element loses one leading newline.
                if previous_name in block_tags and text.startswith('\n'):
                    text = text[1:]

                parent.childNodes[index].value = text

            previous_name = child.name
        return parent

    cleaned_tree = strip_newlines(html5lib.parseFragment(string))
    if not cleaned_tree.childNodes:
        # Nothing usable came out of the parser, e.g. because of bad markup.
        return ''

    tokens = html5lib.treewalkers.getTreeWalker('simpletree')(cleaned_tree)
    serializer = HTMLSerializer(quote_attr_values=True,
                                omit_optional_tags=False)
    return serializer.render(tokens)
Exemplo n.º 55
0
    def parse_comments(self, root):
        """
        Collect the rendered book description from ``root``.

        The ``noscript`` inside ``#bookDescription_feature_div`` is preferred,
        with ``#ps-content`` as the fallback; the rendering of the
        ``#productDescription`` content, when present, is appended.  Returns
        ``''`` when nothing is found.
        """
        noscripts = CSSSelect('#bookDescription_feature_div noscript')(root)
        if not noscripts:
            fallback = root.xpath('//div[@id="ps-content"]/div[@class="content"]')
            ans = self._render_comments(fallback[0]) if fallback else ''
        else:
            ns = noscripts[0]
            if len(ns) == 0 and ns.text:
                import html5lib
                # html5lib parsed noscript as CDATA, so its text is parsed
                # again as the content of a wrapper <div>.
                ns = html5lib.parseFragment(
                    '<div>%s</div>' % (ns.text),
                    treebuilder='lxml',
                    namespaceHTMLElements=False)[0]
            else:
                ns.tag = 'div'
            ans = self._render_comments(ns)

        product = root.xpath('//div[@id="productDescription"]/*[@class="content"]')
        if product:
            ans += self._render_comments(product[0])
        return ans
Exemplo n.º 56
0
def truncate(html, length, killwords=False, end='...'):
    """
    Return ``html`` cut down to at most ``length`` characters of text.

    Input whose text already fits is returned as-is, wrapped in
    ``jinja2.Markup``.  Otherwise the parsed tree is shortened by ``trim``
    and serialized back to HTML.  ``killwords`` and ``end`` are handed to
    ``trim`` unchanged; they are documented as currently being ignored.

    ONLY USE FOR KNOWN-SAFE HTML.
    """
    tree = html5lib.parseFragment(html)
    if text_length(tree) <= length:
        return jinja2.Markup(html)

    # Too long: shorten the tree, then turn it back into markup.
    shortened, _rest = trim(tree, length, killwords, end)

    tokens = html5lib.treewalkers.getTreeWalker('etree')(shortened)
    serializer = html5lib.serializer.htmlserializer.HTMLSerializer(
        quote_attr_values=True, omit_optional_tags=False)
    rendered = serializer.render(tokens)
    return jinja2.Markup(force_unicode(rendered))
Exemplo n.º 57
0
def clean_nl(string):
    """
    Tidy newlines around list and blockquote elements ahead of ``nl2br``.

    Newlines at the edges of text directly inside ``blockquote``, ``ol``,
    ``li`` or ``ul`` are stripped, and one leading newline is removed from
    text that follows such an element.  Falsy input is returned unchanged.
    """
    if not string:
        return string

    block_tags = ["blockquote", "ol", "li", "ul"]
    text_node_type = 4

    def strip_newlines(parent):
        previous_name = ""
        for index, child in enumerate(parent.childNodes):
            if child.type != text_node_type:
                # Element: process its subtree, then re-insert the processed
                # node before itself and remove it, as the tree API requires.
                parent.insertBefore(strip_newlines(child), child)
                parent.removeChild(child)
            else:
                text = child.value

                # Text sitting directly inside a block element.
                if child.parent.name in block_tags:
                    text = text.strip("\n")

                # Text right after a block element loses one leading newline.
                if previous_name in block_tags and text.startswith("\n"):
                    text = text[1:]

                parent.childNodes[index].value = text

            previous_name = child.name
        return parent

    cleaned_tree = strip_newlines(html5lib.parseFragment(string))

    tokens = html5lib.treewalkers.getTreeWalker("simpletree")(cleaned_tree)
    serializer = HTMLSerializer(quote_attr_values=True, omit_optional_tags=False)
    return serializer.render(tokens)
Exemplo n.º 58
0
def html_to_text(html):
    """
    Render an HTML string as plain text.

    Markup is discarded and only textual content survives: an image
    contributes its ``alt`` text (a picture emoji when it has none), the
    content of elements in ``_DROP_TAGS`` (non-visible tags such as
    ``<script>`` and ``<style>``) is skipped, and every other element
    contributes its text and that of its children.  Text attached to an
    element outside ``_NO_WHITESPACE_TAGS`` (i.e. `non-phrasing content
    <https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content>`_)
    is preceded by a single space.

    Runs matched by ``_WHITESPACE_RE`` are then replaced by one space, which
    approximates CSS's ``white-space: normal``
    `display rules <https://www.w3.org/TR/CSS2/text.html#white-space-model>`_,
    and leading and trailing whitespace is removed.

    :param str html: HTML string
    :returns: Plain text
    """
    root = html5lib.parseFragment(html)
    pieces = []

    def emit(text, spaced):
        # Append text, if there is any, optionally preceded by one space.
        if text is None:
            return
        if spaced:
            pieces.append(' ')
        pieces.append(text)

    def walk(node):
        spaced = node.tag not in _NO_WHITESPACE_TAGS
        if node.tag == _IMG_TAG:
            pieces.append(node.get('alt', '🖼️'))
        elif node.tag not in _DROP_TAGS:
            emit(node.text, spaced)
            for child in node:
                walk(child)
        # The tail follows the element, so it is kept even for dropped tags.
        emit(node.tail, spaced)

    walk(root)
    return _WHITESPACE_RE.sub(' ', ''.join(pieces)).strip()
Exemplo n.º 59
0
    def parse_comments(self, root):
        """
        Collect the rendered book description from ``root``.

        The ``noscript`` inside ``#bookDescription_feature_div`` is preferred,
        with ``#ps-content`` as the fallback; the rendering of the
        ``#productDescription`` content, when present, is appended.  Returns
        ``""`` when nothing is found.
        """
        noscripts = tuple(self.selector("#bookDescription_feature_div noscript"))
        if not noscripts:
            fallback = root.xpath('//div[@id="ps-content"]/div[@class="content"]')
            ans = self._render_comments(fallback[0]) if fallback else ""
        else:
            ns = noscripts[0]
            if len(ns) == 0 and ns.text:
                import html5lib

                # html5lib parsed noscript as CDATA, so its text is parsed
                # again as the content of a wrapper <div>.
                reparsed = html5lib.parseFragment(
                    "<div>%s</div>" % (ns.text), treebuilder="lxml", namespaceHTMLElements=False
                )
                ns = reparsed[0]
            else:
                ns.tag = "div"
            ans = self._render_comments(ns)

        product = root.xpath('//div[@id="productDescription"]/*[@class="content"]')
        if product:
            ans += self._render_comments(product[0])
        return ans