Exemplo n.º 1
0
    def test_detokenize_single(self):
        # Round trip: take the tags from the annotated tree, strip the
        # annotations with cleanup_tree(), re-tokenize the cleaned tree and
        # detokenize it with the saved tags.
        src_tree = self._load()
        orig_src_tree = deepcopy(src_tree)
        marker = b'__START_ORG__'

        tokenizer = HtmlTokenizer()
        # Only the tags of the annotated tree are used further down; the
        # tokens are recomputed from the cleaned tree.
        _, tags = tokenizer.tokenize_single(src_tree)
        new_tree = tokenizer.cleanup_tree(src_tree)

        # The source keeps its annotation markers, the cleaned tree has none.
        self.assertIn(marker, tostring(src_tree))
        self.assertNotIn(marker, tostring(new_tree))

        expected_clean = html_document_fromstring(UNANNOTATED_HTML)
        self.assertHtmlTreeEqual(new_tree, expected_clean)

        clean_tokens, _ = tokenizer.tokenize_single(new_tree)
        detokenized_tree = tokenizer.detokenize_single(clean_tokens, tags)
        self.assertIn(marker, tostring(detokenized_tree))

        # The restored tree must match the annotated fixture, the pristine
        # copy of the source and the source itself.
        expected_annotated = html_document_fromstring(ANNOTATED_HTML)
        for expected in (expected_annotated, orig_src_tree, src_tree):
            self.assertHtmlTreeEqual(detokenized_tree, expected)
Exemplo n.º 2
0
    def test_tokenize_scripts_and_styles(self):
        # <script> and <style> contents must not become tokens, yet both
        # elements have to stay in the tree and survive detokenization.
        html = b"""
        <html>
          <head>
            <script>function foo(){}</script>
            <style>
              body {
                color: "red"
              }
            </style>
          </head>
          <body>hello</body>
        </html>
        """

        tree = HtmlLoader().loadbytes(html)
        reference_tree = html_document_fromstring(html)

        tokenizer = HtmlTokenizer()
        html_tokens, tags = tokenizer.tokenize_single(tree)

        # the only token comes from <body>
        self.assertEqual(len(html_tokens), 1)
        only_token = html_tokens[0]
        self.assertEqual(only_token.tokens, ['hello'])
        self.assertEqual(only_token.elem.tag, 'body')

        # the loaded tree still equals a plain parse of the same markup
        self.assertHtmlTreeEqual(tree, reference_tree)

        # detokenizing gives back an equal tree
        restored_tree = tokenizer.detokenize_single(html_tokens, tags)
        self.assertHtmlTreeEqual(tree, restored_tree)
Exemplo n.º 3
0
    def test_tokenize_scripts_and_styles(self):
        # Script and style bodies are skipped by the tokenizer, but the
        # elements themselves are kept and restored on detokenization.
        html = b"""
        <html>
          <head>
            <script>function foo(){}</script>
            <style>
              body {
                color: "red"
              }
            </style>
          </head>
          <body>hello</body>
        </html>
        """

        loaded_tree = HtmlLoader().loadbytes(html)
        plain_tree = html_document_fromstring(html)

        tokenizer = HtmlTokenizer()
        html_tokens, tags = tokenizer.tokenize_single(loaded_tree)

        # exactly one token, taken from the <body> text
        self.assertEqual(len(html_tokens), 1)
        body_token = html_tokens[0]
        self.assertEqual(body_token.tokens, ['hello'])
        self.assertEqual(body_token.elem.tag, 'body')

        # <script> and <style> are still present in the loaded tree
        self.assertHtmlTreeEqual(loaded_tree, plain_tree)

        # and the tree can be rebuilt from tokens and tags
        rebuilt_tree = tokenizer.detokenize_single(html_tokens, tags)
        self.assertHtmlTreeEqual(loaded_tree, rebuilt_tree)
Exemplo n.º 4
0
    def test_dont_tokenize_nontext_nodes(self):
        # The body holds only a `<?xml ...?>` node and whitespace, so the
        # tokenizer is expected to return no tokens at all.
        html = b"""
          <body>
              <?xml version="1.0" encoding="UTF-8" standalone="no"?>
          </body>
        """

        parsed_tree = html_document_fromstring(html)
        html_tokens, _tags = HtmlTokenizer().tokenize_single(parsed_tree)
        self.assertEqual(len(html_tokens), 0)
Exemplo n.º 5
0
    def test_dont_tokenize_nontext_nodes(self):
        # No tokens are expected for a body that contains nothing but a
        # `<?xml ...?>` node surrounded by whitespace.
        html = b"""
          <body>
              <?xml version="1.0" encoding="UTF-8" standalone="no"?>
          </body>
        """

        document = html_document_fromstring(html)
        tokenization = HtmlTokenizer().tokenize_single(document)
        html_tokens, _ = tokenization
        self.assertEqual(len(html_tokens), 0)
Exemplo n.º 6
0
    def test_detokenize_preserve_commas(self):
        # Round trip over text containing "a, b": detokenizing the cleaned
        # tree with the original tags must give back the annotated tree.
        annotated_html = b"""
        <html>
          <body> __START_ORG__ hello __END_ORG__  a, b <a>world</a></body>
        </html>
        """

        annotated_tree = HtmlLoader().loadbytes(annotated_html)
        tokenizer = HtmlTokenizer()

        # tags come from the annotated tree ...
        _, tags = tokenizer.tokenize_single(annotated_tree)
        # ... tokens come from the tree with the annotations removed
        clean_tree = tokenizer.cleanup_tree(annotated_tree)
        clean_tokens, _ = tokenizer.tokenize_single(clean_tree)

        restored_tree = tokenizer.detokenize_single(clean_tokens, tags)
        self.assertHtmlTreeEqual(annotated_tree, restored_tree)
Exemplo n.º 7
0
    def test_detokenize_handle_unicode(self):
        # Same round trip as for plain ASCII input, but the body text
        # starts with a non-ASCII character.
        annotated_text = u"""
        <html>
          <body>Δ  __START_ORG__ hello __END_ORG__  a, b <a>world</a></body>
        </html>
        """
        annotated_html = bytes(annotated_text.encode('utf-8'))

        annotated_tree = HtmlLoader().loadbytes(annotated_html)
        tokenizer = HtmlTokenizer()

        # tags are taken from the annotated tree, tokens from the cleaned one
        _, tags = tokenizer.tokenize_single(annotated_tree)
        clean_tree = tokenizer.cleanup_tree(annotated_tree)
        clean_tokens, _ = tokenizer.tokenize_single(clean_tree)

        restored_tree = tokenizer.detokenize_single(clean_tokens, tags)
        self.assertHtmlTreeEqual(annotated_tree, restored_tree)
Exemplo n.º 8
0
    def test_tokenization_doesnt_alter_tree(self):
        # Tokenizing must leave the input tree equal to a copy made
        # beforehand.
        src_tree = self._load()
        snapshot = deepcopy(src_tree)

        tokenizer = HtmlTokenizer()
        tokenizer.tokenize_single(src_tree)

        self.assertHtmlTreeEqual(src_tree, snapshot)
Exemplo n.º 9
0
    def test_detokenize_single(self):
        # The tokens reference a tree (`.root`) without annotation markers;
        # detokenizing them with the tags brings the markers back.
        src_tree = self._load()
        orig_src_tree = deepcopy(src_tree)
        marker = b'__START_ORG__'

        tokenizer = HtmlTokenizer()
        html_tokens, tags = tokenizer.tokenize_single(src_tree)
        new_tree = html_tokens[0].root

        # markers stay in the source but are absent from the tokens' root
        self.assertIn(marker, tostring(src_tree))
        self.assertNotIn(marker, tostring(new_tree))

        expected_clean = html_document_fromstring(UNANNOTATED_HTML)
        self.assertHtmlTreeEqual(new_tree, expected_clean)

        detokenized_tree = tokenizer.detokenize_single(html_tokens, tags)
        self.assertIn(marker, tostring(detokenized_tree))

        # the result equals the annotated fixture, the pristine copy of the
        # source and the source itself
        expected_annotated = html_document_fromstring(ANNOTATED_HTML)
        for expected in (expected_annotated, orig_src_tree, src_tree):
            self.assertHtmlTreeEqual(detokenized_tree, expected)
Exemplo n.º 10
0
 def __init__(self, model, loader=None, html_tokenizer=None,
              entity_colors=None):
     """Store the model and its collaborators, creating defaults as needed.

     ``loader`` and ``html_tokenizer`` are replaced by a fresh
     ``HtmlLoader()`` / ``HtmlTokenizer()`` when they are falsy;
     ``entity_colors`` is replaced by ``EntityColors()`` only when it is
     ``None``.
     """
     self.model = model
     self.loader = loader if loader else HtmlLoader()
     self.html_tokenizer = (
         html_tokenizer if html_tokenizer else HtmlTokenizer()
     )
     # An explicit None check: a falsy but non-None value is kept as given.
     self.entity_colors = (
         EntityColors() if entity_colors is None else entity_colors
     )
Exemplo n.º 11
0
    def assertTokenizationWorks(self, tree):
        """Assert that tokenizing ``tree`` gives the expected tokens and tags."""
        expected_tokens = [
            u'Scrapinghub', u'Inc', u'has', u'an', u'office', u'in',
            u'Montevideo'
        ]
        expected_tags = [u'B-ORG', u'I-ORG', 'O', 'O', 'O', 'O', u'B-CITY']

        html_tokens, tags = HtmlTokenizer().tokenize_single(tree)

        # token texts first, then the tag sequence returned alongside them
        actual_tokens = [html_token.token for html_token in html_tokens]
        self.assertListEqual(actual_tokens, expected_tokens)
        self.assertListEqual(tags, expected_tags)
Exemplo n.º 12
0
    def test_detokenize_handle_unicode(self):
        # Tokenize/cleanup/detokenize round trip for markup whose body text
        # begins with a non-ASCII character.
        source_text = u"""
        <html>
          <body>Δ  __START_ORG__ hello __END_ORG__  a, b <a>world</a></body>
        </html>
        """
        annotated_html = bytes(source_text.encode('utf-8'))

        annotated_tree = HtmlLoader().loadbytes(annotated_html)
        tokenizer = HtmlTokenizer()

        # keep the tags of the annotated tree; its tokens are not needed
        _, tags = tokenizer.tokenize_single(annotated_tree)
        clean_tree = tokenizer.cleanup_tree(annotated_tree)
        tokens_without_markup, _ = tokenizer.tokenize_single(clean_tree)

        detokenized_tree = tokenizer.detokenize_single(
            tokens_without_markup, tags
        )
        self.assertHtmlTreeEqual(annotated_tree, detokenized_tree)
Exemplo n.º 13
0
    def test_detokenize_preserve_commas(self):
        # The "a, b" text must come through the tokenize/cleanup/detokenize
        # round trip so that the restored tree equals the annotated one.
        annotated_html = b"""
        <html>
          <body> __START_ORG__ hello __END_ORG__  a, b <a>world</a></body>
        </html>
        """

        annotated_tree = HtmlLoader().loadbytes(annotated_html)
        tokenizer = HtmlTokenizer()

        # keep the tags of the annotated tree; its tokens are not needed
        _, tags = tokenizer.tokenize_single(annotated_tree)
        clean_tree = tokenizer.cleanup_tree(annotated_tree)
        tokens_without_markup, _ = tokenizer.tokenize_single(clean_tree)

        detokenized_tree = tokenizer.detokenize_single(
            tokens_without_markup, tags
        )
        self.assertHtmlTreeEqual(annotated_tree, detokenized_tree)
Exemplo n.º 14
0
 def test_detokenize_single_empty(self):
     # With no tokens and no tags, detokenize_single() must return None.
     detokenized = HtmlTokenizer().detokenize_single([], [])
     self.assertIsNone(detokenized)
Exemplo n.º 15
0
    'noscript',
    'ol',
    'output',
    'p',
    'pre',
    'section',
    'table',
    'tfoot',
    'ul',
    'video',
    # not really block, but makes sense to include
    'li',
    'body',
}

# Module-level tokenizer instance, created once at import time and used by
# get_text_blocks() below.
html_tokenizer = HtmlTokenizer()


def get_text_blocks(tree):
    tokens, _ = html_tokenizer.tokenize_single(tree)
    text_blocks = []
    prev_parent = None
    current = []
    for token in tokens:
        parent = token.parent
        while parent.tag not in BLOCK_TAGS:
            parent = parent.getparent()
        if prev_parent is not None and prev_parent != parent:
            text_blocks.append((prev_parent.tag, ' '.join(current)))
            current = []
        current.append(token.token)