def test_extraction_tag_caption_td_th(self):
    """Verify that table text (caption, th and td) is extracted as units."""
    parser = html.htmlfile()
    # Markup borrowed from http://www.w3schools.com/tags/tag_caption.asp
    markup = (
        " <table> <caption>Monthly savings</caption>"
        " <tr> <th>Month</th> <th>Savings</th> </tr>"
        " <tr> <td>January</td> <td>$100</td> </tr>"
        " </table>"
    )
    parsed = parser.parsestring(markup)
    print(parsed.units[0].source)

    # One unit per caption/header/cell, in document order.
    expected_sources = [
        "Monthly savings",
        "Month",
        "Savings",
        "January",
        "$100",
    ]
    assert len(parsed.units) == len(expected_sources)
    for position, expected in enumerate(expected_sources):
        assert parsed.units[position].source == expected
def test_extraction_attr_title(self):
    """Verify that ``title`` attribute values are extracted as units."""
    parser = html.htmlfile()

    # Markup borrowed from http://www.w3schools.com/tags/att_global_title.asp
    parsed = parser.parsestring(
        ' <p><abbr title="World Health Organization">WHO</abbr> was founded in 1948.</p>'
        ' <p title="Free Web tutorials">W3Schools.com</p>'
    )
    print(parsed.units[0].source)
    # FIXME: this is not ideal - the title is extracted on its own (unit 0)
    # and is still embedded in the paragraph unit (unit 1). We need to either
    # drop title= once it has been extracted or not extract it earlier.
    expected_sources = [
        "World Health Organization",
        '<abbr title="World Health Organization">WHO</abbr> was founded in 1948.',
        "Free Web tutorials",
        "W3Schools.com",
    ]
    assert len(parsed.units) == len(expected_sources)
    for position, expected in enumerate(expected_sources):
        assert parsed.units[position].source == expected

    # Markup borrowed from http://www.netmechanic.com/news/vol6/html_no1.htm
    table_markup = ' <table width="100" border="2" title="Henry Jacobs Camp summer 2003 schedule"> '
    parsed = parser.parsestring(table_markup)
    assert len(parsed.units) == 1
    assert parsed.units[0].source == "Henry Jacobs Camp summer 2003 schedule"

    # FIXME: the anchor case below doesn't extract as expected, so it is not
    # asserted here. Parsing
    #   <a href="page1.html" title="HS Jacobs - a UAHC camp in Utica, MS">Henry S. Jacobs Camp</a>
    # should give two units: "HS Jacobs - a UAHC camp in Utica, MS" followed
    # by "Henry S. Jacobs Camp".

    form_markup = ' <form name="application" title="Henry Jacobs camper application" method=" " action=" "> '
    parsed = parser.parsestring(form_markup)
    assert len(parsed.units) == 1
    assert parsed.units[0].source == "Henry Jacobs camper application"
def mergestore(self, inputstore, templatetext, includefuzzy):
    """Re-render an HTML template through this object's ``lookup`` callback.

    The input store and the fuzzy flag are stored on the instance, and the
    store is indexed, before the template is parsed with ``self.lookup`` as
    the parser callback.

    :param inputstore: store kept on ``self.inputstore``; its ``makeindex()``
        is called before parsing.
    :param templatetext: passed to ``html.htmlfile`` as ``inputfile``.
    :param includefuzzy: kept on ``self.includefuzzy``.
    :return: the ``filesrc`` attribute of the parsed template.
    """
    self.inputstore = inputstore
    self.inputstore.makeindex()
    self.includefuzzy = includefuzzy
    return html.htmlfile(inputfile=templatetext, callback=self.lookup).filesrc
def test_escaping_script_and_pre(self):
    """<script> and <pre> can contain < and > and these should not be interpreted as tags"""
    h = html.htmlfile()
    store = h.parsestring("<p>We are here</p><script>Some </tag>like data<script></p>")
    # Call form of print: the bare ``print x`` statement is a SyntaxError on
    # Python 3, while a single parenthesised argument also works on Python 2.
    print(store.units[0].source)
    # Only the paragraph should become a unit; the script body must not.
    assert len(store.units) == 1
def test_extraction_attr_title(self):
    """Verify that ``title`` attribute values are extracted as units."""
    parser = html.htmlfile()

    # Markup borrowed from http://www.w3schools.com/tags/att_global_title.asp
    parsed = parser.parsestring(
        ' <p><abbr title="World Health Organization">WHO</abbr> was founded in 1948.</p>'
        ' <p title="Free Web tutorials">W3Schools.com</p>'
    )
    print(parsed.units[0].source)
    expected_sources = [
        '<abbr title="World Health Organization">WHO</abbr> was founded in 1948.',
        "Free Web tutorials",
        "W3Schools.com",
    ]
    assert len(parsed.units) == len(expected_sources)
    for position, expected in enumerate(expected_sources):
        assert parsed.units[position].source == expected

    # Markup borrowed from http://www.netmechanic.com/news/vol6/html_no1.htm
    table_markup = ' <table width="100" border="2" title="Henry Jacobs Camp summer 2003 schedule"> '
    parsed = parser.parsestring(table_markup)
    assert len(parsed.units) == 1
    assert parsed.units[0].source == "Henry Jacobs Camp summer 2003 schedule"

    # An anchor wrapped in a <div>: title first, then the link text.
    anchor_markup = (
        ' <div><a href="page1.html" title="HS Jacobs - a UAHC camp in Utica, MS">'
        "Henry S. Jacobs Camp</a></div> "
    )
    parsed = parser.parsestring(anchor_markup)
    assert len(parsed.units) == 2
    assert parsed.units[0].source == "HS Jacobs - a UAHC camp in Utica, MS"
    assert parsed.units[1].source == "Henry S. Jacobs Camp"

    form_markup = ' <form name="application" title="Henry Jacobs camper application" method=" " action=" "> '
    parsed = parser.parsestring(form_markup)
    assert len(parsed.units) == 1
    assert parsed.units[0].source == "Henry Jacobs camper application"
def test_strip_html_with_pi():
    """Verify that ``strip_html`` removes tags whose attributes hold ``<?...?>`` blocks."""
    parser = html.htmlfile()
    samples = (
        '<a href="<?$var?>">Something</a>',
        # This one has raw < and > inside the <?...?> block.
        '<a href="<?=($a < $b ? $foo : ($b > c ? $bar : $cat))?>">Something</a>',
    )
    for sample in samples:
        escaped = parser.pi_escape(sample)
        assert html.strip_html(escaped) == "Something"
def test_escaping_script_and_pre(self):
    """Verify that < and > inside <script> (and <pre>) are not read as tags.

    The markup used here only exercises ``<script>``.
    """
    parser = html.htmlfile()
    markup = "<p>We are here</p><script>Some </tag>like data<script></p>"
    parsed = parser.parsestring(markup)
    print(parsed.units[0].source)
    # Only the paragraph is expected to become a unit.
    unit_count = len(parsed.units)
    assert unit_count == 1
def convertfile_inner(inputfile, outputstore, keepcomments):
    """Copy every unit parsed from an HTML file into ``outputstore``.

    :param inputfile: passed to ``html.htmlfile`` as ``inputfile``.
    :param outputstore: store that receives one new unit per HTML unit, via
        ``addsourceunit``.
    :param keepcomments: when true, each HTML unit's notes are attached to the
        new unit as a "developer" note.
    """
    parsed_html = html.htmlfile(inputfile=inputfile)
    for html_unit in parsed_html.units:
        new_unit = outputstore.addsourceunit(html_unit.source)
        new_unit.addlocations(html_unit.getlocations())
        if not keepcomments:
            continue
        new_unit.addnote(html_unit.getnotes(), "developer")
def test_extraction_attr_alt(self):
    """Verify that the ``alt`` attribute value is extracted as a unit."""
    parser = html.htmlfile()
    # Markup borrowed from http://www.netmechanic.com/news/vol6/html_no1.htm
    markup = (
        ' <img src="cafeteria.jpg" height="200" width="200"'
        ' alt="UAHC campers enjoy a meal in the camp cafeteria"> '
    )
    parsed = parser.parsestring(markup)
    units = parsed.units
    assert len(units) == 1
    assert units[0].source == "UAHC campers enjoy a meal in the camp cafeteria"
def test_guess_encoding():
    """Verify that the charset named in a <meta> header is returned."""
    parser = html.htmlfile()

    single_meta = b'<META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=UTF-8">'
    assert parser.guess_encoding(single_meta) == "UTF-8"

    # A fuller document head that declares the charset twice, with different
    # letter case.
    full_head = (
        b'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" '
        b'"http://www.w3.org/TR/REC-html40/loose.dtd">'
        b"<html><head>"
        b'<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">'
        b'<!-- base href="http://home.online.no/~rut-aane/linux.html" -->'
        b'<link rel="shortcut icon" href="http://home.online.no/~rut-aane/peng16x16a.gif">'
        b'<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">'
        b'<meta name="Description" content="Linux newbie stuff and a little about Watching TV under Linux">'
        b'<meta name="MSSmartTagsPreventParsing" content="TRUE">'
        b'<meta name="GENERATOR" content="Mozilla/4.7 [en] (X11; I; Linux 2.2.5-15 i586) [Netscape]">'
        b"<title>Some Linux for beginners</title>"
        b'<style type="text/css">'
    )
    assert parser.guess_encoding(full_head) == "iso-8859-1"
def convertfile(storefile):
    """Build a PO store from the units of an HTML file-like object.

    :param storefile: object whose ``read()`` result is parsed as HTML.
    :return: a ``pofile`` with one unit per HTML unit, carrying its locations
        and a "developer" note, after ``removeduplicates("msgctxt")``.
    """
    po_store = pofile()
    # The parser is given a fake input file with a blank filename.
    fake_input = BytesIOMode("", storefile.read())
    parsed_html = htmlfile(includeuntaggeddata=False, inputfile=fake_input)
    for html_unit in parsed_html.units:
        po_unit = po_store.addsourceunit(html_unit.source)
        po_unit.addlocations(html_unit.getlocations())
        po_unit.addnote(html_unit.getnotes(), "developer")
    po_store.removeduplicates("msgctxt")
    return po_store
def test_extraction_pre_code():
    """Check that we can preserve lines in the <pre> tag"""
    h = html.htmlfile()
    # The line breaks are spelled out as explicit "\n" escapes so the fixture
    # cannot lose them to whitespace reformatting; the assertion below depends
    # on the <pre> content really spanning three lines.
    markup = (
        "\n<pre><code>\n"
        "this is\n"
        "a multiline\n"
        "pre tag\n"
        "</code></pre>\n"
    )
    store = h.parsestring(markup)
    assert len(store.units) == 1
    assert store.units[0].source == "this is\na multiline\npre tag"
def test_extraction_tag_figcaption(self):
    """Verify that <figcaption> text is extracted as a unit."""
    parser = html.htmlfile()
    # Markup borrowed from http://www.w3schools.com/tags/tag_figcaption.asp
    markup = (
        " <figure>"
        ' <img src="img_pulpit.jpg" alt="The Pulpit Rock" width="304" height="228">'
        " <figcaption>Fig1. - A view of the pulpit rock in Norway.</figcaption>"
        " </figure>"
    )
    parsed = parser.parsestring(markup)
    print(parsed.units[0].source)

    # The image's alt text comes first, then the caption.
    assert len(parsed.units) == 2
    alt_unit, caption_unit = parsed.units[0], parsed.units[1]
    assert alt_unit.source == "The Pulpit Rock"
    assert caption_unit.source == "Fig1. - A view of the pulpit rock in Norway."
def convertfile(self, inputfile, filename, includeuntagged=False, duplicatestyle="msgctxt", keepcomments=False):
    """Build a PO store from the units of an HTML file.

    :param inputfile: passed to ``html.htmlfile`` as ``inputfile``.
    :param filename: accepted but not used in this method.
    :param includeuntagged: forwarded to the parser as ``includeuntaggeddata``.
    :param duplicatestyle: forwarded to ``removeduplicates`` on the result.
    :param keepcomments: when true, each HTML unit's notes are attached to the
        new unit as a "developer" note.
    :return: the populated ``po.pofile``.
    """
    po_store = po.pofile()
    parsed_html = html.htmlfile(
        includeuntaggeddata=includeuntagged,
        inputfile=inputfile,
    )
    for html_unit in parsed_html.units:
        po_unit = po_store.addsourceunit(html_unit.source)
        po_unit.addlocations(html_unit.getlocations())
        if not keepcomments:
            continue
        po_unit.addnote(html_unit.getnotes(), "developer")
    po_store.removeduplicates(duplicatestyle)
    return po_store
def convertfile(storefile, template_store):
    """Build a PO store from an HTML file, optionally paired with a template.

    Without a template store, each HTML unit becomes a unit whose source and
    target are both the HTML text. With one, the unit's source comes from the
    template unit found for the joined locations, the HTML text becomes the
    target, and HTML units with no template match are skipped.

    :param storefile: object whose ``read()`` result is parsed as HTML.
    :param template_store: falsy for a source file; otherwise an object
        providing ``find_unit_mono``.
    :return: the populated ``pofile`` after ``removeduplicates("msgctxt")``.
    """
    po_store = pofile()
    # The parser is given a fake input file with a blank filename.
    parsed_html = htmlfile(inputfile=BytesIOMode("", storefile.read()))
    for html_unit in parsed_html.units:
        locations = html_unit.getlocations()
        if template_store:
            # Translation: look up the matching unit in the template.
            template = template_store.find_unit_mono("".join(locations))
            if template is None:
                # Skip locations not present in the source HTML file.
                continue
            source_text = template.source
        else:
            # Source file: the HTML text is also the source.
            source_text = html_unit.source
        po_unit = po_store.addsourceunit(source_text)
        po_unit.target = html_unit.source
        po_unit.addlocations(html_unit.getlocations())
        po_unit.addnote(html_unit.getnotes(), "developer")
    po_store.removeduplicates("msgctxt")
    return po_store
def test_extraction_tag_caption_td_th(self):
    """Verify that table text (caption, th and td) is extracted as units."""
    parser = html.htmlfile()
    # Markup borrowed from http://www.w3schools.com/tags/tag_caption.asp
    caption = " <caption>Monthly savings</caption>"
    header_row = " <tr> <th>Month</th> <th>Savings</th> </tr>"
    data_row = " <tr> <td>January</td> <td>$100</td> </tr>"
    markup = " <table>" + caption + header_row + data_row + " </table>"

    parsed = parser.parsestring(markup)
    print(parsed.units[0].source)

    # One unit per caption/header/cell, in document order.
    units = parsed.units
    assert len(units) == 5
    expected_by_index = {
        0: "Monthly savings",
        1: "Month",
        2: "Savings",
        3: "January",
        4: "$100",
    }
    for index in sorted(expected_by_index):
        assert units[index].source == expected_by_index[index]
def test_extraction_attr_title(self):
    """Verify that ``title`` attribute values are extracted as units."""
    parser = html.htmlfile()

    # Markup borrowed from http://www.w3schools.com/tags/att_global_title.asp
    abbr_paragraph = ' <p><abbr title="World Health Organization">WHO</abbr> was founded in 1948.</p>'
    titled_paragraph = ' <p title="Free Web tutorials">W3Schools.com</p>'
    parsed = parser.parsestring(abbr_paragraph + titled_paragraph)
    print(parsed.units[0].source)
    units = parsed.units
    assert len(units) == 4
    assert units[0].source == "World Health Organization"
    # FIXME: this is not ideal - the title was already extracted as unit 0 but
    # is still embedded here. We need to either drop title= once it has been
    # extracted or not extract it earlier.
    assert units[1].source == (
        '<abbr title="World Health Organization">WHO</abbr> was founded in 1948.'
    )
    assert units[2].source == "Free Web tutorials"
    assert units[3].source == "W3Schools.com"

    # Markup borrowed from http://www.netmechanic.com/news/vol6/html_no1.htm
    parsed = parser.parsestring(
        ' <table width="100" border="2" title="Henry Jacobs Camp summer 2003 schedule"> '
    )
    units = parsed.units
    assert len(units) == 1
    assert units[0].source == "Henry Jacobs Camp summer 2003 schedule"

    # FIXME: the anchor case below doesn't extract as expected, so it is not
    # asserted here. Parsing
    #   <a href="page1.html" title="HS Jacobs - a UAHC camp in Utica, MS">Henry S. Jacobs Camp</a>
    # should give two units: "HS Jacobs - a UAHC camp in Utica, MS" followed
    # by "Henry S. Jacobs Camp".

    parsed = parser.parsestring(
        ' <form name="application" title="Henry Jacobs camper application" method=" " action=" "> '
    )
    units = parsed.units
    assert len(units) == 1
    assert units[0].source == "Henry Jacobs camper application"
def strip_html(self, str):
    """Return the unit sources parsed from ``str``, one per line.

    :param str: markup handed to ``html.htmlfile().parsestring``. The name
        shadows the builtin but is kept because callers may pass it by keyword.
    :return: the ``source`` of every parsed unit, joined with newlines.
    """
    parsed = html.htmlfile().parsestring(str)
    sources = []
    for unit in parsed.units:
        sources.append(unit.source)
    return "\n".join(sources)
def test_self_closing_tags(self):
    """Verify that unclosed <img>/<br> tags inside a heading give one unit."""
    parser = html.htmlfile()
    markup = "<h3>Some text <img><br><img></h3>"
    parsed = parser.parsestring(markup)
    unit_count = len(parsed.units)
    assert unit_count == 1
def test_pi_escaping():
    """Verify that ``pi_escape`` rewrites < and > inside a ``<?...?>`` block."""
    parser = html.htmlfile()
    raw = '<a href="<?=($a < $b ? $foo : ($b > c ? $bar : $cat))?>">'
    # Only the comparison operators inside the <?...?> block are replaced.
    expected = '<a href="<?=($a %lt; $b ? $foo : ($b %gt; c ? $bar : $cat))?>">'
    escaped = parser.pi_escape(raw)
    assert escaped == expected
def test_strip_html_with_pi():
    """Verify that ``strip_html`` removes tags whose attributes hold ``<?...?>`` blocks."""
    parser = html.htmlfile()

    simple_pi = parser.pi_escape('<a href="<?$var?>">Something</a>')
    assert html.strip_html(simple_pi) == "Something"

    # This one has raw < and > inside the <?...?> block.
    comparison_pi = parser.pi_escape(
        '<a href="<?=($a < $b ? $foo : ($b > c ? $bar : $cat))?>">Something</a>'
    )
    assert html.strip_html(comparison_pi) == "Something"
def test_guess_encoding():
    """Verify that the charset named in a <meta> header is returned."""
    parser = html.htmlfile()
    cases = [
        (
            '<META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=UTF-8">',
            "UTF-8",
        ),
        (
            # A fuller document head that declares the charset twice, with
            # different letter case.
            '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" '
            '"http://www.w3.org/TR/REC-html40/loose.dtd">'
            "<html><head>"
            '<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">'
            '<!-- base href="http://home.online.no/~rut-aane/linux.html" -->'
            '<link rel="shortcut icon" href="http://home.online.no/~rut-aane/peng16x16a.gif">'
            '<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">'
            '<meta name="Description" content="Linux newbie stuff and a little about Watching TV under Linux">'
            '<meta name="MSSmartTagsPreventParsing" content="TRUE">'
            '<meta name="GENERATOR" content="Mozilla/4.7 [en] (X11; I; Linux 2.2.5-15 i586) [Netscape]">'
            "<title>Some Linux for beginners</title>"
            '<style type="text/css">',
            "iso-8859-1",
        ),
    ]
    for markup, expected_encoding in cases:
        assert parser.guess_encoding(markup) == expected_encoding
def test_pi_escaping():
    """Verify that ``pi_escape`` rewrites < and > inside a ``<?...?>`` block."""
    parser = html.htmlfile()
    # Maps raw markup to the escaped form expected back.
    expectations = {
        '<a href="<?=($a < $b ? $foo : ($b > c ? $bar : $cat))?>">':
            '<a href="<?=($a %lt; $b ? $foo : ($b %gt; c ? $bar : $cat))?>">',
    }
    for raw, expected in expectations.items():
        assert parser.pi_escape(raw) == expected