def testIgnoreTraditionalBr(self):
    """A bare <br> that follows a space adds no extra whitespace to the paragraph text."""
    html = u"<html><body><p>Kissaa on ruokittava <br>huolella.</p></body></html>"
    expected = [(SEGMENT_TYPE_PARAGRAPH, u"Kissaa on ruokittava huolella.")]
    # assertEqual is the supported name; assertEquals is a deprecated alias.
    self.assertEqual(expected, parseHtml(html))
def checkPage(url, dictionary, clientIp, requestHeaders):
    """Fetch the page at *url* and return its spell-check report as HTML.

    Arguments:
        url: URL of the page; it is UTF-8 encoded for logging and fetching
            and passed through ``escape`` when echoed in the output.
        dictionary: key into the module-level ``_voikko`` mapping.
        clientIp, requestHeaders: passed through to ``getHtmlSafely``.

    Returns ``u""`` when *dictionary* is not in ``_voikko``, an error
    message when ``HttpException`` is raised, and otherwise a heading line
    followed by one ``<p>`` element per parsed segment that wraps the
    ``doSpell`` output for that segment.
    """
    log("checkPage: " + url.encode("UTF-8"))
    if dictionary not in _voikko:
        return u""
    v = _voikko[dictionary]
    try:
        html = getHtmlSafely(url.encode('UTF-8'), clientIp, requestHeaders)
        segments = parseHtml(html)
        # Collect the pieces and join once instead of re-building the
        # whole result string on every segment.
        parts = [u"Analyysi sivusta " + escape(url) + u"<br />"]
        v.setAcceptUnfinishedParagraphsInGc(True)
        for segment in segments:
            segmentType = segment[0]
            checkGrammar = True
            if segmentType == SEGMENT_TYPE_HEADING:
                v.setAcceptTitlesInGc(True)
                v.setAcceptBulletedListsInGc(False)
                segmentClass = u"webvoikkoH"
            elif segmentType == SEGMENT_TYPE_LIST_ITEM:
                v.setAcceptTitlesInGc(False)
                v.setAcceptBulletedListsInGc(True)
                segmentClass = u"webvoikkoLi"
            elif segmentType == SEGMENT_TYPE_PARAGRAPH:
                v.setAcceptTitlesInGc(False)
                v.setAcceptBulletedListsInGc(False)
                segmentClass = u"webvoikkoP"
            else:
                # SEGMENT_TYPE_OTHER. An unrecognised type is rendered the
                # same way; it used to leave segmentClass as None and fail
                # with TypeError in the concatenation below.
                checkGrammar = False
                segmentClass = u"webvoikkoO"
            parts.append(u"<p class='" + segmentClass + u"'>"
                         + doSpell(segment[1], v, checkGrammar) + u"</p>")
        return u"".join(parts)
    except HttpException as e:
        # "except ... as" is accepted by Python 2.6+ as well as Python 3.
        return u"Sivua %s ei voitu hakea: %s" % (escape(url), e.parameter)
def testUnclosedTdIsLikeClosedTd(self):
    """Table cells without closing tags each produce their own OTHER segment."""
    html = u"<html><body><table><tr><th>kissa<td>koira<td>poni</tr></table></body></html>"
    expected = [
        (SEGMENT_TYPE_OTHER, u"kissa"),
        (SEGMENT_TYPE_OTHER, u"koira"),
        (SEGMENT_TYPE_OTHER, u"poni"),
    ]
    # assertEqual is the supported name; assertEquals is a deprecated alias.
    self.assertEqual(expected, parseHtml(html))
def testLineFeedIsJustSpace(self):
    """LF, CRLF and CR between words each come out as a single space."""
    html = u"<html><body><p>Kissaa\non\r\nruokittava\rhuolella.</p></body></html>"
    expected = [(SEGMENT_TYPE_PARAGRAPH, u"Kissaa on ruokittava huolella.")]
    # assertEqual is the supported name; assertEquals is a deprecated alias.
    self.assertEqual(expected, parseHtml(html))
def testExtraWhitespaceIsRemoved(self):
    """Leading, trailing and doubled whitespace is dropped from paragraph text."""
    html = u"<html><body><p>\tKissaa on \rruokittava huolella. </p></body></html>"
    expected = [(SEGMENT_TYPE_PARAGRAPH, u"Kissaa on ruokittava huolella.")]
    # assertEqual is the supported name; assertEquals is a deprecated alias.
    self.assertEqual(expected, parseHtml(html))
def testH1WithinPClosesP(self):
    """An <h1> inside a <p> ends the paragraph; text after the heading is OTHER."""
    html = u"<html><body><p>Kissa<h1>Koira</h1>jotain muuta</p></body></html>"
    expected = [
        (SEGMENT_TYPE_PARAGRAPH, u"Kissa"),
        (SEGMENT_TYPE_HEADING, u"Koira"),
        (SEGMENT_TYPE_OTHER, u"jotain muuta"),
    ]
    # assertEqual is the supported name; assertEquals is a deprecated alias.
    self.assertEqual(expected, parseHtml(html))
def testBrIsWhitespace(self):
    """A <br/> between two words separates them with a single space."""
    html = u"<html><body><p>Kissaa on ruokittava<br/>huolella.</p></body></html>"
    expected = [(SEGMENT_TYPE_PARAGRAPH, u"Kissaa on ruokittava huolella.")]
    # assertEqual is the supported name; assertEquals is a deprecated alias.
    self.assertEqual(expected, parseHtml(html))
def checkPage(url, dictionary, clientIp, requestHeaders, checkForMaybeErrors):
    """Fetch the page at *url* and return its spell-check report as HTML.

    Arguments:
        url: URL of the page; it is UTF-8 encoded for logging and fetching
            and passed through ``escape`` when echoed in the output.
        dictionary: key into the module-level ``_voikko`` mapping.
        clientIp, requestHeaders: passed through to ``getHtmlSafely``.
        checkForMaybeErrors: passed through unchanged to ``doSpell``.

    Returns ``u""`` when *dictionary* is not in ``_voikko``, an error
    message when ``HttpException`` is raised, and otherwise a heading line
    followed by one ``<p>`` element per parsed segment that wraps the
    ``doSpell`` output for that segment.
    """
    log("checkPage: " + url.encode("UTF-8"))
    if dictionary not in _voikko:
        return u""
    v = _voikko[dictionary]
    try:
        html = getHtmlSafely(url.encode('UTF-8'), clientIp, requestHeaders)
        segments = parseHtml(html)
        # Collect the pieces and join once instead of re-building the
        # whole result string on every segment.
        parts = [u"Analyysi sivusta " + escape(url) + u"<br />"]
        v.setAcceptUnfinishedParagraphsInGc(True)
        for segment in segments:
            segmentType = segment[0]
            checkGrammar = True
            if segmentType == SEGMENT_TYPE_HEADING:
                v.setAcceptTitlesInGc(True)
                v.setAcceptBulletedListsInGc(False)
                segmentClass = u"webvoikkoH"
            elif segmentType == SEGMENT_TYPE_LIST_ITEM:
                v.setAcceptTitlesInGc(False)
                v.setAcceptBulletedListsInGc(True)
                segmentClass = u"webvoikkoLi"
            elif segmentType == SEGMENT_TYPE_PARAGRAPH:
                v.setAcceptTitlesInGc(False)
                v.setAcceptBulletedListsInGc(False)
                segmentClass = u"webvoikkoP"
            else:
                # SEGMENT_TYPE_OTHER. An unrecognised type is rendered the
                # same way; it used to leave segmentClass as None and fail
                # with TypeError in the concatenation below.
                checkGrammar = False
                segmentClass = u"webvoikkoO"
            parts.append(u"<p class='" + segmentClass + u"'>"
                         + doSpell(segment[1], v, checkGrammar, checkForMaybeErrors)
                         + u"</p>")
        return u"".join(parts)
    except HttpException as e:
        # "except ... as" is accepted by Python 2.6+ as well as Python 3.
        return u"Sivua %s ei voitu hakea: %s" % (escape(url), e.parameter)
def testUnclosedP(self):
    """Paragraphs left unclosed still produce one PARAGRAPH segment each."""
    html = u"<html><body><p>kissa<p>koira<div><p>hevonen</div></body></html>"
    expected = [
        (SEGMENT_TYPE_PARAGRAPH, u"kissa"),
        (SEGMENT_TYPE_PARAGRAPH, u"koira"),
        (SEGMENT_TYPE_PARAGRAPH, u"hevonen"),
    ]
    # assertEqual is the supported name; assertEquals is a deprecated alias.
    self.assertEqual(expected, parseHtml(html))
def testH1WithinPClosesP(self):
    """A heading opened inside a paragraph closes it; trailing text becomes OTHER."""
    html = u"<html><body><p>Kissa<h1>Koira</h1>jotain muuta</p></body></html>"
    expected = [
        (SEGMENT_TYPE_PARAGRAPH, u"Kissa"),
        (SEGMENT_TYPE_HEADING, u"Koira"),
        (SEGMENT_TYPE_OTHER, u"jotain muuta"),
    ]
    # assertEqual is the supported name; assertEquals is a deprecated alias.
    self.assertEqual(expected, parseHtml(html))
def testLineFeedIsJustSpace(self):
    """Line breaks (LF, CRLF, CR) inside a paragraph are treated as single spaces."""
    html = u"<html><body><p>Kissaa\non\r\nruokittava\rhuolella.</p></body></html>"
    expected = [(SEGMENT_TYPE_PARAGRAPH, u"Kissaa on ruokittava huolella.")]
    # assertEqual is the supported name; assertEquals is a deprecated alias.
    self.assertEqual(expected, parseHtml(html))
def testBrIsWhitespace(self):
    """Words separated only by <br/> are separated by one space in the output."""
    html = u"<html><body><p>Kissaa on ruokittava<br/>huolella.</p></body></html>"
    expected = [(SEGMENT_TYPE_PARAGRAPH, u"Kissaa on ruokittava huolella.")]
    # assertEqual is the supported name; assertEquals is a deprecated alias.
    self.assertEqual(expected, parseHtml(html))
def testIgnoreTraditionalBr(self):
    """An unclosed <br> after a space leaves just that one space in the text."""
    html = u"<html><body><p>Kissaa on ruokittava <br>huolella.</p></body></html>"
    expected = [(SEGMENT_TYPE_PARAGRAPH, u"Kissaa on ruokittava huolella.")]
    # assertEqual is the supported name; assertEquals is a deprecated alias.
    self.assertEqual(expected, parseHtml(html))
def testScriptsAreStripped(self):
    """The content of a <script> following a paragraph produces no segment."""
    html = u"<html><body><p>Kissaa on ruokittava.</p><script>lksjdf</script></body></html>"
    expected = [(SEGMENT_TYPE_PARAGRAPH, u"Kissaa on ruokittava.")]
    # assertEqual is the supported name; assertEquals is a deprecated alias.
    self.assertEqual(expected, parseHtml(html))
def assertParseError(self, html, lineno, offset):
    """Assert that parsing *html* raises HTMLParseError at the given position.

    *lineno* and *offset* are compared with the attributes of the same
    names on the raised exception.
    """
    try:
        parseHtml(html)
    except HTMLParseError as e:
        # assertEqual is the supported name; assertEquals is a deprecated alias.
        self.assertEqual(lineno, e.lineno)
        self.assertEqual(offset, e.offset)
    else:
        # Without this branch the helper passed silently whenever the
        # parser accepted the input, so it could never detect a missing
        # error.
        self.fail("parseHtml did not raise HTMLParseError")
def testUnknownEntityIsAssumedToBeJustText(self):
    """A bare ampersand and an unknown entity reference are kept as text.

    The expected text has no ';' after '&kissa'.
    """
    html = u"<html><body><p>Kissa & koira ja &kissa;</p></body></html>"
    expected = [(SEGMENT_TYPE_PARAGRAPH, u"Kissa & koira ja &kissa")]
    # assertEqual is the supported name; assertEquals is a deprecated alias.
    self.assertEqual(expected, parseHtml(html))
def testStrayTdCloseTag(self):
    """A </td> with no matching open tag does not affect the parsed segments."""
    html = u"<html><body><p>kissa</p></td></body></html>"
    expected = [(SEGMENT_TYPE_PARAGRAPH, u"kissa")]
    # assertEqual is the supported name; assertEquals is a deprecated alias.
    self.assertEqual(expected, parseHtml(html))
def testNonAscii(self):
    """Non-ASCII characters in a heading are returned unchanged."""
    html = u"<html><body><h1>Eläinlääkärissä käynti €</h1></body></html>"
    expected = [(SEGMENT_TYPE_HEADING, u"Eläinlääkärissä käynti €")]
    # assertEqual is the supported name; assertEquals is a deprecated alias.
    self.assertEqual(expected, parseHtml(html))
def testUnderlineInducesNoSpace(self):
    """A <u> element in the middle of a word does not split the word."""
    html = u"<html><body><h1>Libre<u>Office</u></h1></body></html>"
    expected = [(SEGMENT_TYPE_HEADING, u"LibreOffice")]
    # assertEqual is the supported name; assertEquals is a deprecated alias.
    self.assertEqual(expected, parseHtml(html))
def testCiteIsJustText(self):
    """Text inside <cite> stays part of the surrounding paragraph."""
    html = u"<html><body><p>Kissaa on <cite>ruokittava</cite>.</p></body></html>"
    expected = [(SEGMENT_TYPE_PARAGRAPH, u"Kissaa on ruokittava.")]
    # assertEqual is the supported name; assertEquals is a deprecated alias.
    self.assertEqual(expected, parseHtml(html))
def testTableWithinP(self):
    """A table opened inside <p> gives OTHER for the cell and for the text after it."""
    html = u"<html><body><p><table><tr><td>sdsd</td></tr></table>ruokittava.</p></body></html>"
    expected = [
        (SEGMENT_TYPE_OTHER, u"sdsd"),
        (SEGMENT_TYPE_OTHER, u"ruokittava."),
    ]
    # assertEqual is the supported name; assertEquals is a deprecated alias.
    self.assertEqual(expected, parseHtml(html))
def testTablesAreIgnored(self):
    """A table after a paragraph yields its cell text as an OTHER segment.

    NOTE(review): despite the test name, the expected result does contain
    the table cell text; it is only classified as OTHER.
    """
    html = u"<html><body><p>Kissaa on ruokittava.</p><table><tr><td>sdsd</td></tr></table></body></html>"
    expected = [
        (SEGMENT_TYPE_PARAGRAPH, u"Kissaa on ruokittava."),
        (SEGMENT_TYPE_OTHER, u"sdsd"),
    ]
    # assertEqual is the supported name; assertEquals is a deprecated alias.
    self.assertEqual(expected, parseHtml(html))
def testScriptsWithinPIsIgnoredAndContentStripped(self):
    """A <script> inside a paragraph is dropped without breaking the paragraph."""
    html = u"<html><body><p>Kissaa on <script>aksldj</script>ruokittava.</p></body></html>"
    expected = [(SEGMENT_TYPE_PARAGRAPH, u"Kissaa on ruokittava.")]
    # assertEqual is the supported name; assertEquals is a deprecated alias.
    self.assertEqual(expected, parseHtml(html))
def testUnknownEntityIsAssumedToBeJustText(self):
    """An unescaped '&' and an unrecognised entity reference come out as plain text.

    The expected text has no ';' after '&kissa'.
    """
    html = u"<html><body><p>Kissa & koira ja &kissa;</p></body></html>"
    expected = [(SEGMENT_TYPE_PARAGRAPH, u"Kissa & koira ja &kissa")]
    # assertEqual is the supported name; assertEquals is a deprecated alias.
    self.assertEqual(expected, parseHtml(html))
def testUnderlineInducesNoSpace(self):
    """Underlining part of a word inserts no space into the heading text."""
    html = u"<html><body><h1>Libre<u>Office</u></h1></body></html>"
    expected = [(SEGMENT_TYPE_HEADING, u"LibreOffice")]
    # assertEqual is the supported name; assertEquals is a deprecated alias.
    self.assertEqual(expected, parseHtml(html))
def testParseEmptyDocument(self):
    """A document with empty <head> and <body> produces no segments."""
    html = u"<html><head></head><body></body></html>"
    result = parseHtml(html)
    # failUnless is a deprecated alias; assertEqual also reports the
    # actual length when the check fails.
    self.assertEqual(0, len(result))
def testTextWithinBody(self):
    """Text placed directly inside <body> is reported as an OTHER segment."""
    html = u"<html><body>kissa</body></html>"
    expected = [(SEGMENT_TYPE_OTHER, u"kissa")]
    # assertEqual is the supported name; assertEquals is a deprecated alias.
    self.assertEqual(expected, parseHtml(html))
def testHexCharacterReferences(self):
    """A hexadecimal character reference is decoded to the character it names."""
    # The input previously held a literal "ä", so no character reference
    # was parsed at all; &#xe4; is the hexadecimal reference for U+00E4.
    html = u"<html><body><h1>&#xe4;</h1></body></html>"
    expected = [(SEGMENT_TYPE_HEADING, u"ä")]
    # assertEqual is the supported name; assertEquals is a deprecated alias.
    self.assertEqual(expected, parseHtml(html))
def testParseTitle(self):
    """The document <title> is reported as an OTHER segment."""
    html = u"<html><head><title>kissa</title></head><body></body></html>"
    expected = [(SEGMENT_TYPE_OTHER, u"kissa")]
    # assertEqual is the supported name; assertEquals is a deprecated alias.
    self.assertEqual(expected, parseHtml(html))
def testParseParagraph(self):
    """A single <p> produces one PARAGRAPH segment with its text."""
    html = u"<html><body><p>Kissaa on ruokittava huolella.</p></body></html>"
    expected = [(SEGMENT_TYPE_PARAGRAPH, u"Kissaa on ruokittava huolella.")]
    # assertEqual is the supported name; assertEquals is a deprecated alias.
    self.assertEqual(expected, parseHtml(html))
def testAnyClosingTagIsIgnoredIfOpenTagStackHasNoSuchTag(self):
    """A </style> that was never opened does not disturb the rest of the parse."""
    html = u"<html><head></style></head><body><p>kissa</p></body></html>"
    expected = [(SEGMENT_TYPE_PARAGRAPH, u"kissa")]
    # assertEqual is the supported name; assertEquals is a deprecated alias.
    self.assertEqual(expected, parseHtml(html))
def testIgnoreImages(self):
    """An <img> inside a paragraph contributes nothing to the paragraph text."""
    html = u"<html><body><p>Kissaa <img src='cat.jpg'>on ruokittava.</p></body></html>"
    expected = [(SEGMENT_TYPE_PARAGRAPH, u"Kissaa on ruokittava.")]
    # assertEqual is the supported name; assertEquals is a deprecated alias.
    self.assertEqual(expected, parseHtml(html))
def testPWithinBlockquote(self):
    """A <p> nested in <blockquote> is still reported as a PARAGRAPH segment."""
    html = u"<html><body><blockquote><p>Kissa</p></blockquote></body></html>"
    expected = [(SEGMENT_TYPE_PARAGRAPH, u"Kissa")]
    # assertEqual is the supported name; assertEquals is a deprecated alias.
    self.assertEqual(expected, parseHtml(html))
def testExtraWhitespaceIsRemoved(self):
    """Surrounding whitespace is trimmed and whitespace runs collapse to one space."""
    html = u"<html><body><p>\tKissaa on \rruokittava huolella. </p></body></html>"
    expected = [(SEGMENT_TYPE_PARAGRAPH, u"Kissaa on ruokittava huolella.")]
    # assertEqual is the supported name; assertEquals is a deprecated alias.
    self.assertEqual(expected, parseHtml(html))
def testParseTitle(self):
    """Text of the <title> element comes back as a single OTHER segment."""
    html = u"<html><head><title>kissa</title></head><body></body></html>"
    expected = [(SEGMENT_TYPE_OTHER, u"kissa")]
    # assertEqual is the supported name; assertEquals is a deprecated alias.
    self.assertEqual(expected, parseHtml(html))
def testScriptsAreStripped(self):
    """Script content after a paragraph does not appear in the result."""
    html = u"<html><body><p>Kissaa on ruokittava.</p><script>lksjdf</script></body></html>"
    expected = [(SEGMENT_TYPE_PARAGRAPH, u"Kissaa on ruokittava.")]
    # assertEqual is the supported name; assertEquals is a deprecated alias.
    self.assertEqual(expected, parseHtml(html))
def testHeadMayRemainUnfinished(self):
    """A document whose <head> is never closed parses to an empty segment list."""
    html = u"<html><head></html>"
    # assertEqual is the supported name; assertEquals is a deprecated alias.
    self.assertEqual([], parseHtml(html))
def testParseHeader(self):
    """An <h1> produces one HEADING segment with its text."""
    html = u"<html><body><h1>Kissan ruokkiminen</h1></body></html>"
    expected = [(SEGMENT_TYPE_HEADING, u"Kissan ruokkiminen")]
    # assertEqual is the supported name; assertEquals is a deprecated alias.
    self.assertEqual(expected, parseHtml(html))
def testParseListItemsWithinEm(self):
    """An <em> inside a list item does not split the item's text."""
    html = u"<html><body><ul><li>kis<em>sa</em></li></ul></body></html>"
    expected = [(SEGMENT_TYPE_LIST_ITEM, u"kissa")]
    # assertEqual is the supported name; assertEquals is a deprecated alias.
    self.assertEqual(expected, parseHtml(html))
def testTableWithinP(self):
    """With a table nested in <p>, both the cell and the trailing text are OTHER."""
    html = u"<html><body><p><table><tr><td>sdsd</td></tr></table>ruokittava.</p></body></html>"
    expected = [
        (SEGMENT_TYPE_OTHER, u"sdsd"),
        (SEGMENT_TYPE_OTHER, u"ruokittava."),
    ]
    # assertEqual is the supported name; assertEquals is a deprecated alias.
    self.assertEqual(expected, parseHtml(html))
def testParseNestedLists(self):
    """Text of an outer item that holds a nested list is OTHER; the inner item is LIST_ITEM."""
    html = u"<html><body><ul><li>kissa<ul><li>koira</li></ul></li></ul></body></html>"
    expected = [
        (SEGMENT_TYPE_OTHER, u"kissa"),
        (SEGMENT_TYPE_LIST_ITEM, u"koira"),
    ]
    # assertEqual is the supported name; assertEquals is a deprecated alias.
    self.assertEqual(expected, parseHtml(html))
def testStrongIsJustText(self):
    """Text inside <strong> stays part of the surrounding paragraph."""
    html = u"<html><body><p>Kissaa on <strong>ruokittava</strong>.</p></body></html>"
    expected = [(SEGMENT_TYPE_PARAGRAPH, u"Kissaa on ruokittava.")]
    # assertEqual is the supported name; assertEquals is a deprecated alias.
    self.assertEqual(expected, parseHtml(html))
def testParseDefinitionLists(self):
    """Both <dt> and <dd> of a definition list are reported as LIST_ITEM segments."""
    html = u"<html><body><dl><dt>kissa</dt><dd>jalo eläin</dd></dl></body></html>"
    expected = [
        (SEGMENT_TYPE_LIST_ITEM, u"kissa"),
        (SEGMENT_TYPE_LIST_ITEM, u"jalo eläin"),
    ]
    # assertEqual is the supported name; assertEquals is a deprecated alias.
    self.assertEqual(expected, parseHtml(html))
def testNonAscii(self):
    """Heading text with non-ASCII characters comes back character for character."""
    html = u"<html><body><h1>Eläinlääkärissä käynti €</h1></body></html>"
    expected = [(SEGMENT_TYPE_HEADING, u"Eläinlääkärissä käynti €")]
    # assertEqual is the supported name; assertEquals is a deprecated alias.
    self.assertEqual(expected, parseHtml(html))
def testParseHeader(self):
    """The text of an <h1> is returned as a single HEADING segment."""
    html = u"<html><body><h1>Kissan ruokkiminen</h1></body></html>"
    expected = [(SEGMENT_TYPE_HEADING, u"Kissan ruokkiminen")]
    # assertEqual is the supported name; assertEquals is a deprecated alias.
    self.assertEqual(expected, parseHtml(html))
def testParseNestedLists(self):
    """In a nested list the outer item's own text is OTHER and the inner item is LIST_ITEM."""
    html = u"<html><body><ul><li>kissa<ul><li>koira</li></ul></li></ul></body></html>"
    expected = [
        (SEGMENT_TYPE_OTHER, u"kissa"),
        (SEGMENT_TYPE_LIST_ITEM, u"koira"),
    ]
    # assertEqual is the supported name; assertEquals is a deprecated alias.
    self.assertEqual(expected, parseHtml(html))
def testAnyClosingTagIsIgnoredIfOpenTagStackHasNoSuchTag(self):
    """A closing tag with no matching open tag (here </style>) has no effect."""
    html = u"<html><head></style></head><body><p>kissa</p></body></html>"
    expected = [(SEGMENT_TYPE_PARAGRAPH, u"kissa")]
    # assertEqual is the supported name; assertEquals is a deprecated alias.
    self.assertEqual(expected, parseHtml(html))
def testClearBeforeParagraph(self):
    """Title text and a following paragraph come out as two separate segments."""
    html = u"<html><head><title>koira</title></head><body><p>kissa</p></body></html>"
    expected = [
        (SEGMENT_TYPE_OTHER, u"koira"),
        (SEGMENT_TYPE_PARAGRAPH, u"kissa"),
    ]
    # assertEqual is the supported name; assertEquals is a deprecated alias.
    self.assertEqual(expected, parseHtml(html))
def testScriptsWithinPIsIgnoredAndContentStripped(self):
    """Script content in the middle of a paragraph is removed; the paragraph stays whole."""
    html = u"<html><body><p>Kissaa on <script>aksldj</script>ruokittava.</p></body></html>"
    expected = [(SEGMENT_TYPE_PARAGRAPH, u"Kissaa on ruokittava.")]
    # assertEqual is the supported name; assertEquals is a deprecated alias.
    self.assertEqual(expected, parseHtml(html))
def testParseListItemsWithinEm(self):
    """Emphasised text inside <li> is joined to the rest of the list item."""
    html = u"<html><body><ul><li>kis<em>sa</em></li></ul></body></html>"
    expected = [(SEGMENT_TYPE_LIST_ITEM, u"kissa")]
    # assertEqual is the supported name; assertEquals is a deprecated alias.
    self.assertEqual(expected, parseHtml(html))
def testParseParagraph(self):
    """The text of a lone <p> is returned as a single PARAGRAPH segment."""
    html = u"<html><body><p>Kissaa on ruokittava huolella.</p></body></html>"
    expected = [(SEGMENT_TYPE_PARAGRAPH, u"Kissaa on ruokittava huolella.")]
    # assertEqual is the supported name; assertEquals is a deprecated alias.
    self.assertEqual(expected, parseHtml(html))
def testParseDefinitionLists(self):
    """The term and the description of a <dl> each become a LIST_ITEM segment."""
    html = u"<html><body><dl><dt>kissa</dt><dd>jalo eläin</dd></dl></body></html>"
    expected = [
        (SEGMENT_TYPE_LIST_ITEM, u"kissa"),
        (SEGMENT_TYPE_LIST_ITEM, u"jalo eläin"),
    ]
    # assertEqual is the supported name; assertEquals is a deprecated alias.
    self.assertEqual(expected, parseHtml(html))
def testIgnoreImages(self):
    """An image tag within a paragraph leaves the paragraph text unaffected."""
    html = u"<html><body><p>Kissaa <img src='cat.jpg'>on ruokittava.</p></body></html>"
    expected = [(SEGMENT_TYPE_PARAGRAPH, u"Kissaa on ruokittava.")]
    # assertEqual is the supported name; assertEquals is a deprecated alias.
    self.assertEqual(expected, parseHtml(html))