def test_detokenize_single(self):
    """Full round trip on the fixture document.

    ``cleanup_tree`` must strip annotation markers (returning a new,
    unannotated tree while leaving the source tree annotated), and
    ``detokenize_single`` must restore the original annotated document
    from the clean tokens plus the saved tags.
    """
    src_tree = self._load()
    orig_src_tree = deepcopy(src_tree)
    tokenizer = HtmlTokenizer()
    # Only the tags are needed from the annotated tree; tokens are
    # regenerated from the cleaned tree below.
    _, tags = tokenizer.tokenize_single(src_tree)
    new_tree = tokenizer.cleanup_tree(src_tree)

    # The source tree keeps its markers; the cleaned copy must not.
    self.assertIn(b'__START_ORG__', tostring(src_tree))
    self.assertNotIn(b'__START_ORG__', tostring(new_tree))
    self.assertHtmlTreeEqual(
        new_tree,
        html_document_fromstring(UNANNOTATED_HTML)
    )

    html_tokens, _ = tokenizer.tokenize_single(new_tree)
    detokenized_tree = tokenizer.detokenize_single(html_tokens, tags)

    # Detokenizing must reintroduce the annotation markers and
    # reproduce the original annotated document exactly.
    self.assertIn(b'__START_ORG__', tostring(detokenized_tree))
    self.assertHtmlTreeEqual(
        detokenized_tree,
        html_document_fromstring(ANNOTATED_HTML)
    )
    self.assertHtmlTreeEqual(detokenized_tree, orig_src_tree)
    self.assertHtmlTreeEqual(detokenized_tree, src_tree)
def test_detokenize_handle_unicode(self):
    """Round trip keeps non-ASCII text (a UTF-8 'Δ') intact.

    NOTE(review): an identical method name is defined again later in
    this file; the later definition silently shadows this one.
    """
    # str.encode() already returns bytes; the original wrapped it in a
    # redundant bytes(...) call.
    annotated_html = u"""
    <html>
    <body>Δ __START_ORG__ hello __END_ORG__ a, b <a>world</a></body>
    </html>
    """.encode('utf-8')
    annotated_tree = HtmlLoader().loadbytes(annotated_html)
    tokenizer = HtmlTokenizer()
    # Keep only the tags from the annotated pass.
    _, tags = tokenizer.tokenize_single(annotated_tree)
    clean_tree = tokenizer.cleanup_tree(annotated_tree)
    html_tokens, _ = tokenizer.tokenize_single(clean_tree)
    detokenized_tree = tokenizer.detokenize_single(html_tokens, tags)
    self.assertHtmlTreeEqual(annotated_tree, detokenized_tree)
def test_detokenize_preserve_commas(self):
    """Punctuation next to annotation markers survives the round trip.

    The source file defined this method twice, byte-for-byte identical;
    the second definition silently shadowed the first, so the duplicate
    is collapsed into a single method here.
    """
    annotated_html = b"""
    <html>
    <body> __START_ORG__ hello __END_ORG__ a, b <a>world</a></body>
    </html>
    """
    annotated_tree = HtmlLoader().loadbytes(annotated_html)
    tokenizer = HtmlTokenizer()
    # Only the tags are needed from the annotated pass; tokens are
    # regenerated from the cleaned tree below.
    _, tags = tokenizer.tokenize_single(annotated_tree)
    clean_tree = tokenizer.cleanup_tree(annotated_tree)
    html_tokens, _ = tokenizer.tokenize_single(clean_tree)
    detokenized_tree = tokenizer.detokenize_single(html_tokens, tags)
    self.assertHtmlTreeEqual(annotated_tree, detokenized_tree)
def test_detokenize_handle_unicode(self):
    """Round trip keeps non-ASCII text (a UTF-8 'Δ') intact.

    NOTE(review): this method name is also defined earlier in this
    file; this later definition shadows the earlier one — the
    duplicate should be removed.
    """
    # str.encode() already returns bytes; the original wrapped it in a
    # redundant bytes(...) call.
    annotated_html = u"""
    <html>
    <body>Δ __START_ORG__ hello __END_ORG__ a, b <a>world</a></body>
    </html>
    """.encode('utf-8')
    annotated_tree = HtmlLoader().loadbytes(annotated_html)
    tokenizer = HtmlTokenizer()
    # Keep only the tags from the annotated pass.
    _, tags = tokenizer.tokenize_single(annotated_tree)
    clean_tree = tokenizer.cleanup_tree(annotated_tree)
    html_tokens, _ = tokenizer.tokenize_single(clean_tree)
    detokenized_tree = tokenizer.detokenize_single(html_tokens, tags)
    self.assertHtmlTreeEqual(annotated_tree, detokenized_tree)
def test_detokenize_single(self):
    """Full round trip on the fixture document.

    NOTE(review): this method name is also defined earlier in this
    file with the same body; this later definition shadows the
    earlier one — the duplicate should be removed.
    """
    src_tree = self._load()
    orig_src_tree = deepcopy(src_tree)
    tokenizer = HtmlTokenizer()
    # Only the tags are needed from the annotated tree; tokens are
    # regenerated from the cleaned tree below.
    _, tags = tokenizer.tokenize_single(src_tree)
    new_tree = tokenizer.cleanup_tree(src_tree)

    # The source tree keeps its markers; the cleaned copy must not.
    self.assertIn(b'__START_ORG__', tostring(src_tree))
    self.assertNotIn(b'__START_ORG__', tostring(new_tree))
    self.assertHtmlTreeEqual(new_tree, html_document_fromstring(UNANNOTATED_HTML))

    html_tokens, _ = tokenizer.tokenize_single(new_tree)
    detokenized_tree = tokenizer.detokenize_single(html_tokens, tags)

    # Detokenizing must reintroduce the annotation markers and
    # reproduce the original annotated document exactly.
    self.assertIn(b'__START_ORG__', tostring(detokenized_tree))
    self.assertHtmlTreeEqual(detokenized_tree, html_document_fromstring(ANNOTATED_HTML))
    self.assertHtmlTreeEqual(detokenized_tree, orig_src_tree)
    self.assertHtmlTreeEqual(detokenized_tree, src_tree)