def test_detokenize_single(self):
    """Round-trip an annotated tree through tokenize / cleanup / detokenize.

    The tags come from tokenizing the annotated tree, the tokens come from
    tokenizing the cleaned-up tree; detokenizing the two together is expected
    to give back a tree equal to the annotated one.
    """
    start_marker = b'__START_ORG__'
    annotated = self._load()
    pristine = deepcopy(annotated)

    tok = HtmlTokenizer()
    # Only the tags of this first pass are used further down; the tokens are
    # recomputed from the cleaned tree.
    html_tokens, tags = tok.tokenize_single(annotated)
    cleaned = tok.cleanup_tree(annotated)

    # The loaded tree still carries the markers, the cleaned one does not.
    self.assertIn(start_marker, tostring(annotated))
    self.assertNotIn(start_marker, tostring(cleaned))
    unannotated_reference = html_document_fromstring(UNANNOTATED_HTML)
    self.assertHtmlTreeEqual(cleaned, unannotated_reference)

    html_tokens, _ = tok.tokenize_single(cleaned)
    restored = tok.detokenize_single(html_tokens, tags)

    self.assertIn(start_marker, tostring(restored))
    annotated_reference = html_document_fromstring(ANNOTATED_HTML)
    self.assertHtmlTreeEqual(restored, annotated_reference)
    self.assertHtmlTreeEqual(restored, pristine)
    self.assertHtmlTreeEqual(restored, annotated)
def test_wa_convert_ignore_comments(self):
    """An annotation spanning an HTML comment is expected to become two
    spans sharing ``wa-id="0"``, with the comment left between them.
    """
    source = (
        b' <html> <body> '
        b'__START_ORG__ a <!--comment--> b __END_ORG__ cool '
        b'</body> </html> '
    )
    expected = (
        r' <html> <body> '
        r'<span class="WebAnnotator_ORG" '
        r'style="color:#000000; background-color:#33CCFF;" '
        r'wa-id="0" wa-subtypes="" wa-type="ORG"> a </span> '
        r'<!--comment--> '
        r'<span class="WebAnnotator_ORG" '
        r'style="color:#000000; background-color:#33CCFF;" '
        r'wa-id="0" wa-subtypes="" wa-type="ORG"> b </span> '
        r'cool </body> '
        r'<wa-color id="WA-color-0" bg="#33CCFF" fg="#000000" '
        r'class="WebAnnotator_ORG" type="ORG"></wa-color> '
        r'</html> '
    )

    converted = webannotator.to_webannotator(html_document_fromstring(source))
    self.assertHtmlEqual(tostring(converted), expected)
def test_tokenize_scripts_and_styles(self):
    """Tokenizing a page with ``<script>``/``<style>`` is expected to yield
    only the body text while keeping those elements in the tree.
    """
    html = (
        b' <html> <head> '
        b'<script>function foo(){}</script> '
        b'<style> body { color: "red" } </style> '
        b'</head> <body>hello</body> </html> '
    )
    loaded = HtmlLoader().loadbytes(html)
    reference = html_document_fromstring(html)

    # no tokens are expected for the contents of <script> and <style>
    tok = HtmlTokenizer()
    html_tokens, tags = tok.tokenize_single(loaded)
    self.assertEqual(len(html_tokens), 1)
    only_token = html_tokens[0]
    self.assertEqual(only_token.tokens, ['hello'])
    self.assertEqual(only_token.elem.tag, 'body')

    # ... yet the <script> and <style> elements themselves are preserved
    self.assertHtmlTreeEqual(loaded, reference)

    # ... and detokenizing gives the same tree back
    restored = tok.detokenize_single(html_tokens, tags)
    self.assertHtmlTreeEqual(loaded, restored)
def test_wa_title(self):
    """``apply_wa_title`` is expected to move the ``<wa-title>`` contents
    into ``<title>`` and drop the ``<wa-title>`` element.
    """
    before = (
        b' <html> '
        b'<head><title>Foo</title></head> '
        b'<body>contents</body> '
        b'<wa-title><b>hello</b>, world</wa-title> '
        b'</html> '
    )
    after = (
        b' <html> '
        b'<head><title><b>hello</b>, world</title></head> '
        b'<body>contents</body> '
        b'</html> '
    )

    doc = html_document_fromstring(before)
    webannotator.apply_wa_title(doc)  # the tree is compared after this call
    self.assertHtmlTreeEqual(doc, html_document_fromstring(after))
def test_wa_convert_ignore_comments(self):
    """Convert an annotation that contains an HTML comment.

    The expected markup wraps the text on each side of the comment in its
    own ORG span (same ``wa-id``) and keeps the comment itself unwrapped.
    """
    annotated_html = (
        b' <html> <body> '
        b'__START_ORG__ a <!--comment--> b __END_ORG__ cool '
        b'</body> </html> '
    )
    doc = html_document_fromstring(annotated_html)
    wa_doc = webannotator.to_webannotator(doc)
    serialized = tostring(wa_doc)

    wanted = (
        r' <html> <body> '
        r'<span class="WebAnnotator_ORG" '
        r'style="color:#000000; background-color:#33CCFF;" '
        r'wa-id="0" wa-subtypes="" wa-type="ORG"> a </span> '
        r'<!--comment--> '
        r'<span class="WebAnnotator_ORG" '
        r'style="color:#000000; background-color:#33CCFF;" '
        r'wa-id="0" wa-subtypes="" wa-type="ORG"> b </span> '
        r'cool </body> '
        r'<wa-color id="WA-color-0" bg="#33CCFF" fg="#000000" '
        r'class="WebAnnotator_ORG" type="ORG"></wa-color> '
        r'</html> '
    )
    self.assertHtmlEqual(serialized, wanted)
def loadbytes(self, data):
    """Build a tree from ``data`` and return the result of cleaning it.

    The tree is created with ``self.encoding``, passed through
    ``_fix_title``, has its entities collected and processed, and is
    finally handed to ``_cleanup_tree``.
    """
    # Cleaning is deliberately the last step: running custom cleaners
    # earlier could strip the WebAnnotator markup before it is processed.
    doc = html_document_fromstring(data, encoding=self.encoding)
    self._fix_title(doc)
    self._process_entities(self._get_entities(doc))
    return self._cleanup_tree(doc)
def loadbytes(self, data):
    """Build a tree from ``data`` and return the result of cleaning it.

    The tree is created with ``self.encoding``, gets ``apply_wa_title``
    applied, is optionally pruned, has its entities collected and processed,
    and is finally handed to ``_cleanup_tree``.
    """
    # Cleaning is deliberately the last step: running custom cleaners
    # earlier could strip the WebAnnotator markup before it is processed.
    doc = html_document_fromstring(data, encoding=self.encoding)
    webannotator.apply_wa_title(doc)

    # NOTE(review): assumes only the pruning step is guarded by
    # ``known_entities`` -- confirm against the original indentation.
    if self.known_entities:
        self._prune_tags(doc)

    self._process_entities(self._get_entities(doc))
    return self._cleanup_tree(doc)
def test_wa_convert_inner(self):
    """Convert annotations that cross inline elements and sit in the title.

    The ORG annotation spans text and ``<b>`` children; the expected output
    wraps every text piece in its own span with ``wa-id="1"``.  The PER
    annotation inside ``<title>`` is expected to appear in a separate
    ``<wa-title>`` element while ``<title>`` keeps plain text.
    """
    source = (
        b' <html> <head> '
        b'<title> __START_PER__ Hello! __END_PER__ world!</title> '
        b'</head> <body> <p> '
        b'__START_ORG__ Scrapinghub <b>Inc has</b>an <b>office</b>in Montevideo '
        b'__END_ORG__ cool '
        b'</p> </body> </html> '
    )

    # Opening tag shared by all five ORG spans of the expected markup.
    org_open = (
        '<span class="WebAnnotator_ORG" '
        'style="color:#000000; background-color:#FF0000;" '
        'wa-id="1" wa-subtypes="" wa-type="ORG">'
    )
    expected = ''.join([
        ' <html> <head> <title>Hello! world!</title> </head> <body> <p> ',
        org_open, ' Scrapinghub </span> ',
        '<b> ', org_open, ' Inc has </span> </b> ',
        org_open, ' an </span> ',
        '<b> ', org_open, ' office </span> </b> ',
        org_open, ' in Montevideo </span> ',
        'cool </p> </body> ',
        '<wa-color id="WA-color-0" bg="#33CCFF" fg="#000000" ',
        'class="WebAnnotator_PER" type="PER"></wa-color> ',
        '<wa-color id="WA-color-1" bg="#FF0000" fg="#000000" ',
        'class="WebAnnotator_ORG" type="ORG"></wa-color> ',
        '<wa-title style="box-shadow:0 0 1em black;',
        'border:2px solid blue;padding:0.5em;"> ',
        '<span class="WebAnnotator_PER" ',
        'style="color:#000000; background-color:#33CCFF;" ',
        'wa-id="0" wa-subtypes="" wa-type="PER">Hello!</span> world! ',
        '</wa-title> </html> ',
    ])

    converted = webannotator.to_webannotator(html_document_fromstring(source))
    self.assertHtmlEqual(tostring(converted), expected)
def test_baseurl_exists(self):
    """With a ``<base>`` element already present, converting with an
    explicit ``url`` is expected to leave the markup equal to the input.
    """
    html = (
        b' <html> '
        b'<head><base href="http://example.com/foo"/></head> '
        b'<body><p>hello</p></body> '
        b'</html> '
    )
    doc = html_document_fromstring(html)
    converted = webannotator.to_webannotator(
        doc,
        url='http://example.com/bar',
    )
    serialized = tostring(converted)
    self.assertHtmlEqual(serialized, html)
def test_wa_convert_inner(self):
    """Check conversion of an ORG annotation that crosses ``<b>`` elements
    together with a PER annotation placed inside ``<title>``.

    Expected: one ORG span (``wa-id="1"``) per text piece, a plain-text
    ``<title>``, and the annotated title inside ``<wa-title>``.
    """
    annotated_html = (
        b' <html> <head> '
        b'<title> __START_PER__ Hello! __END_PER__ world!</title> '
        b'</head> <body> <p> '
        b'__START_ORG__ Scrapinghub <b>Inc has</b>an <b>office</b>in Montevideo '
        b'__END_ORG__ cool '
        b'</p> </body> </html> '
    )
    doc = html_document_fromstring(annotated_html)
    wa_doc = webannotator.to_webannotator(doc)
    serialized = tostring(wa_doc)

    # The same opening tag precedes every ORG text piece.
    span = (
        '<span class="WebAnnotator_ORG" '
        'style="color:#000000; background-color:#FF0000;" '
        'wa-id="1" wa-subtypes="" wa-type="ORG">'
    )
    head_part = ' <html> <head> <title>Hello! world!</title> </head> '
    body_part = ''.join([
        '<body> <p> ',
        span, ' Scrapinghub </span> ',
        '<b> ', span, ' Inc has </span> </b> ',
        span, ' an </span> ',
        '<b> ', span, ' office </span> </b> ',
        span, ' in Montevideo </span> ',
        'cool </p> </body> ',
    ])
    colors_part = (
        '<wa-color id="WA-color-0" bg="#33CCFF" fg="#000000" '
        'class="WebAnnotator_PER" type="PER"></wa-color> '
        '<wa-color id="WA-color-1" bg="#FF0000" fg="#000000" '
        'class="WebAnnotator_ORG" type="ORG"></wa-color> '
    )
    title_part = (
        '<wa-title style="box-shadow:0 0 1em black;'
        'border:2px solid blue;padding:0.5em;"> '
        '<span class="WebAnnotator_PER" '
        'style="color:#000000; background-color:#33CCFF;" '
        'wa-id="0" wa-subtypes="" wa-type="PER">Hello!</span> world! '
        '</wa-title> </html> '
    )
    wanted = head_part + body_part + colors_part + title_part

    self.assertHtmlEqual(serialized, wanted)
def from_htmlbytes(cls, html_bytes, encoding=None):
    """Build a colour mapping from the ``<wa-color>`` elements of a page.

    Starts from an empty ``cls()`` and, for every element matched by
    ``//wa-color``, stores ``(fg, bg, index)`` under the element's ``type``
    attribute.  ``index`` is the integer that follows the ``WA-color-``
    prefix of the element's ``id``.
    """
    id_prefix = "WA-color-"
    mapping = cls()
    document = html_document_fromstring(html_bytes, encoding=encoding)

    for node in document.xpath('//wa-color'):
        # The prefix check is case-insensitive; the slice below only relies
        # on the prefix length.
        assert node.get('id').lower().startswith('wa-color-')
        index = int(node.get('id')[len(id_prefix):])
        mapping[node.get('type')] = (node.get('fg'), node.get('bg'), index)

    return mapping
def test_dont_tokenize_nontext_nodes(self):
    """A body holding only a ``<?xml ...?>`` node is expected to produce
    no tokens at all.
    """
    html = (
        b' <body> '
        b'<?xml version="1.0" encoding="UTF-8" standalone="no"?> '
        b'</body> '
    )
    doc = html_document_fromstring(html)
    html_tokens, _ = HtmlTokenizer().tokenize_single(doc)
    token_count = len(html_tokens)
    self.assertEqual(token_count, 0)
def test_detokenize_single(self):
    """Tokenize an annotated tree and detokenize it again.

    The tree reachable through the first token's ``root`` is expected to be
    free of annotation markers, and detokenizing is expected to give back a
    tree equal to the annotated one.
    """
    start_marker = b'__START_ORG__'
    annotated = self._load()
    pristine = deepcopy(annotated)

    tok = HtmlTokenizer()
    html_tokens, tags = tok.tokenize_single(annotated)
    token_root = html_tokens[0].root

    # The loaded tree still carries the markers, the tokens' tree does not.
    self.assertIn(start_marker, tostring(annotated))
    self.assertNotIn(start_marker, tostring(token_root))
    unannotated_reference = html_document_fromstring(UNANNOTATED_HTML)
    self.assertHtmlTreeEqual(token_root, unannotated_reference)

    restored = tok.detokenize_single(html_tokens, tags)
    self.assertIn(start_marker, tostring(restored))
    annotated_reference = html_document_fromstring(ANNOTATED_HTML)
    self.assertHtmlTreeEqual(restored, annotated_reference)
    self.assertHtmlTreeEqual(restored, pristine)
    self.assertHtmlTreeEqual(restored, annotated)
def test_handle_nonxml_attributes(self):
    """An attribute named ``like:layout`` is expected to survive conversion:
    the converted markup must equal the input.
    """
    html = (
        b' <html> <body> '
        b'<a class="addthis_button_facebook_like" like:layout="button_count"> '
        b'</body> </html> '
    )
    doc = html_document_fromstring(html)
    serialized = tostring(webannotator.to_webannotator(doc))
    self.assertHtmlEqual(serialized, html)
def test_wa_convert_no_title(self):
    """Convert a document that has no ``<title>``.

    The expected markup holds the ORG span and its ``<wa-color>`` entry and
    contains no ``<wa-title>`` element.
    """
    source = (
        b' <html><body><p> '
        b'__START_ORG__ Scrapinghub __END_ORG__ '
        b'</p></body></html> '
    )
    expected = (
        br' <html> <body> <p> '
        br'<span class="WebAnnotator_ORG" '
        br'style="color:#000000; background-color:#33CCFF;" '
        br'wa-id="0" wa-subtypes="" wa-type="ORG">Scrapinghub</span> '
        br'</p> </body> '
        br'<wa-color bg="#33CCFF" class="WebAnnotator_ORG" fg="#000000" '
        br'id="WA-color-0" type="ORG"></wa-color> '
        br'</html> '
    )

    converted = webannotator.to_webannotator(html_document_fromstring(source))
    self.assertHtmlEqual(tostring(converted), expected)
def test_wa_convert_no_title(self):
    """A title-less document is expected to convert to markup with the ORG
    span and its ``<wa-color>`` entry, and without any ``<wa-title>``.
    """
    annotated_html = (
        b' <html><body><p> '
        b'__START_ORG__ Scrapinghub __END_ORG__ '
        b'</p></body></html> '
    )
    doc = html_document_fromstring(annotated_html)
    wa_doc = webannotator.to_webannotator(doc)
    serialized = tostring(wa_doc)

    wanted = (
        r' <html> <body> <p> '
        r'<span class="WebAnnotator_ORG" '
        r'style="color:#000000; background-color:#33CCFF;" '
        r'wa-id="0" wa-subtypes="" wa-type="ORG">Scrapinghub</span> '
        r'</p> </body> '
        r'<wa-color bg="#33CCFF" class="WebAnnotator_ORG" fg="#000000" '
        r'id="WA-color-0" type="ORG"></wa-color> '
        r'</html> '
    )
    self.assertHtmlEqual(serialized, wanted)
def test_wa_convert_crosstitle(self):
    """Convert an annotation that starts in ``<title>`` and ends in
    ``<body>``.

    Expected: ``<title>`` keeps plain text, the body part is wrapped in a
    span with ``wa-id="1"``, and ``<wa-title>`` carries both title spans
    (``wa-id`` 0 and 1).
    """
    source = (
        b' <html> <head> '
        b'<title> __START_ORG__ a __END_ORG__ b __START_ORG__ a </title> '
        b'</head> <body> a __END_ORG__ a </body> </html> '
    )
    expected = (
        r' <html> <head> <title> a b a </title> </head> <body> '
        r'<span class="WebAnnotator_ORG" '
        r'style="color:#000000; background-color:#33CCFF;" '
        r'wa-id="1" wa-subtypes="" wa-type="ORG"> a </span> '
        r'a </body> '
        r'<wa-color id="WA-color-0" bg="#33CCFF" fg="#000000" '
        r'class="WebAnnotator_ORG" type="ORG"></wa-color> '
        r'<wa-title style="box-shadow:0 0 1em black;'
        r'border:2px solid blue;padding:0.5em;"> '
        r'<span class="WebAnnotator_ORG" '
        r'style="color:#000000; background-color:#33CCFF;" '
        r'wa-id="0" wa-subtypes="" wa-type="ORG"> a </span> '
        r'b '
        r'<span class="WebAnnotator_ORG" '
        r'style="color:#000000; background-color:#33CCFF;" '
        r'wa-id="1" wa-subtypes="" wa-type="ORG"> a </span> '
        r'</wa-title> </html> '
    )

    converted = webannotator.to_webannotator(html_document_fromstring(source))
    self.assertHtmlEqual(tostring(converted), expected)
def test_wa_convert_crosstitle(self):
    """Check conversion when a second ORG annotation opens inside
    ``<title>`` and closes inside ``<body>``.

    The expected markup has a plain-text ``<title>``, one body span with
    ``wa-id="1"``, and a ``<wa-title>`` holding spans with ``wa-id`` 0 and 1.
    """
    annotated_html = (
        b' <html> <head> '
        b'<title> __START_ORG__ a __END_ORG__ b __START_ORG__ a </title> '
        b'</head> <body> a __END_ORG__ a </body> </html> '
    )
    doc = html_document_fromstring(annotated_html)
    wa_doc = webannotator.to_webannotator(doc)
    serialized = tostring(wa_doc)

    # Shared attribute run of every ORG span; only ``wa-id`` differs.
    span_head = (
        '<span class="WebAnnotator_ORG" '
        'style="color:#000000; background-color:#33CCFF;" '
    )
    first_span = span_head + 'wa-id="0" wa-subtypes="" wa-type="ORG"> a </span> '
    second_span = span_head + 'wa-id="1" wa-subtypes="" wa-type="ORG"> a </span> '
    wanted = ''.join([
        ' <html> <head> <title> a b a </title> </head> <body> ',
        second_span,
        'a </body> ',
        '<wa-color id="WA-color-0" bg="#33CCFF" fg="#000000" ',
        'class="WebAnnotator_ORG" type="ORG"></wa-color> ',
        '<wa-title style="box-shadow:0 0 1em black;',
        'border:2px solid blue;padding:0.5em;"> ',
        first_span,
        'b ',
        second_span,
        '</wa-title> </html> ',
    ])

    self.assertHtmlEqual(serialized, wanted)
def loadbytes(self, data):
    """Build a tree from ``data`` using ``self.encoding`` and return what
    ``self.cleaner.clean_html`` produces for it.
    """
    parsed = html_document_fromstring(data, self.encoding)
    cleaned = self.cleaner.clean_html(parsed)
    return cleaned
def assertApplyWaTitle(self, source, result):
    """Assert that ``apply_wa_title`` turns ``source`` into ``result``.

    ``source`` is parsed and modified by ``webannotator.apply_wa_title``;
    the outcome is compared, as a tree, with the parsed ``result``.
    """
    actual = html_document_fromstring(source)
    webannotator.apply_wa_title(actual)  # return value is not used
    wanted = html_document_fromstring(result)
    self.assertHtmlTreeEqual(actual, wanted)