def testDistill(self):

        # Distill a basic HTML document that uses every supported tag, then
        # check which markup is dropped, which is kept and how the rest is
        # rewritten.

        self.fp = rspreader.openlog(testpath / "basictags.html")  # have all tags supported
        result = distillML.distill(self.fp, self.buf, {})
        self.assertEqual(0, result)

        s = self.buf.getvalue()

        # these tags should be filtered
        filtered = (
            "<html",
            "<head",
            "<title",
            "<body",
            "<font",
            "<b>",
            "<em",
            "<pre>",
            "<blockquote>",
            "<div",
            "<span",
            "<table",
            "<tr",
            "<td",
            "<form",
            "<img",
            "<a",
            "</html>",
        )
        for tag in filtered:
            self.assertEqual(-1, s.find(tag), "%s should have been filtered" % tag)

        # these tags should be present
        kept = (
            "<h1>",
            "<h2>",
            "<h3>",
            "<h4>",
            "<h5>",
            "<h6>",
            "<p>",
            "<ul>",
            "<ol>",
            "<li>",
            "<br>",
            "<hr>",
        )
        for tag in kept:
            # membership test: a match at offset 0 still counts as present
            self.assertTrue(tag in s, "%s should be present" % tag)

        # these are some other transformed data
        transformed = (
            "h1-Sample HTML",
            "[fill your name]",  # <form>
            "[*]",
            "[ ]",
            "(*)",
            "( )",
            "[***]",
            "Lorem",  # <textarea>
            "[button]",
            "[submit]",
            "[reset]",
            "[go]",
            "[a picture]",  # <img>
            u'<&amp;,&lt;, ,",&gt;>',  # entities
        )
        for text in transformed:
            self.assertTrue(text in s, "%r not found in distilled output" % (text,))
    def testDistill(self):

        # Distill a basic HTML document that uses every supported tag, then
        # check which markup is dropped, which is kept and how the rest is
        # rewritten.

        self.fp = rspreader.openlog(testdir + 'basictags.html')  # have all tags supported
        result = distillML.distill(self.fp, self.buf, {})
        self.assertEqual(0, result)

        s = self.buf.getvalue()

        # these tags should be filtered
        filtered = (
            '<html', '<head', '<title', '<body', '<font', '<b>',
            '<em', '<pre>', '<blockquote>', '<div', '<span', '<table',
            '<tr', '<td', '<form', '<img', '<a', '</html>',
        )
        for tag in filtered:
            self.assertEqual(-1, s.find(tag), '%s should have been filtered' % tag)

        # these tags should be present
        kept = (
            '<h1>', '<h2>', '<h3>', '<h4>', '<h5>', '<h6>',
            '<p>', '<ul>', '<ol>', '<li>', '<br>', '<hr>',
        )
        for tag in kept:
            # membership test: a match at offset 0 still counts as present
            self.assertTrue(tag in s, '%s should be present' % tag)

        # these are some other transformed data
        transformed = (
            'h1-Sample HTML',
            '[fill your name]',         # <form>
            '[*]',
            '[ ]',
            '(*)',
            '( )',
            '[***]',
            'Lorem',                    # <textarea>
            '[button]',
            '[submit]',
            '[reset]',
            '[go]',
            '[a picture]',              # <img>
            u'<&amp;,&lt;, ,",&gt;>',   # entities
        )
        for text in transformed:
            self.assertTrue(text in s, '%r not found in distilled output' % (text,))
    def transformDoc(self, inpath, outpath):
        """ Parse a message log file. Filter unwanted documents and write
            the distilled form of the others to outpath.

            A document is skipped when it is flagged as discard, when its
            status is outside 200-299, when its content type is neither
            'html' nor 'txt', or when the distiller returns a non-zero
            result. The file at outpath is only left in place when the
            transform succeeds: it is removed when the distiller rejects
            the document and also when distilling raises.

            @param inpath: path of the message log to read.
            @param outpath: path of the distilled file to write.
            @return whether the document is transformed.
        """

        mtime = os.path.getmtime(inpath)
        dt = datetime.datetime.utcfromtimestamp(mtime)
        timestamp = _formatTimestamp(dt)

        inname = os.path.split(inpath)[1]
        transformed = False

        rfile = open(inpath, 'rb')
        try:
            minfo = messagelog.MessageInfo.parseMessageLog(rfile)
            if minfo.discard:
                # these should be filtered in the logging phase, but double
                # check here, perhaps for logs collected from other sources.
                log.info('discard %s %s - %s', inname, minfo.flags, minfo.req_path)
                return False

            meta = _extract_meta(minfo, timestamp)

            # simple filtering
            if (minfo.status < 200) or (300 <= minfo.status):
                return False
            if minfo.ctype != 'html' and minfo.ctype != 'txt':
                return False

            # hand the content reader the log from its beginning
            rfile.seek(0)
            contentFp = rspreader.ContentReader(rfile, inpath)

            wfile = open(outpath, 'wb')
            try:
                try:
                    if minfo.ctype == 'html':
                        result = distillML.distill(contentFp, wfile, meta=meta)
                    else:
                        result = distillML.distillTxt(contentFp, wfile, meta=meta)
                finally:
                    wfile.close()

                if result != 0:
                    log.info('discard %s %s - %s', inname, str(result), minfo.req_path)
                else:
                    transformed = True
            finally:
                # wfile is closed by now; drop the output unless the
                # transform succeeded, including when distilling raised.
                if not transformed:
                    os.remove(outpath)

        finally:
            rfile.close()

        if not transformed:
            return False

        filename = os.path.split(outpath)[1]
        log.debug('transformed %s (%s) - %s', filename, meta.get('encoding','?'), minfo.req_path)
        return True
    def testMeta(self):

        # Check basic meta data parsing

        self.fp = rspreader.openlog(testdir + 'basictags.html')
        meta = {}
        result = distillML.distill(self.fp, self.buf, meta)

        # fail with a clear message if distilling itself did not succeed,
        # rather than with a KeyError on the meta lookups below
        self.assertEqual(0, result)

        self.assertEqual(u'Basic HTML Sample Document', meta['title'])
        self.assertEqual(u'Description: this sample contains all basic HTML tags the converter understands', meta['description'])
        self.assertEqual(u'basic HTML, sample', meta['keywords'])

        # the three entries checked above plus one this test does not inspect
        self.assertEqual(4, len(meta))
    def testParserError(self):

        # The fixture contains a malformed declaration; distill is expected
        # to report PARSE_ERROR as the first element of its result.
        bad_markup = '<! -- this is bad -->'

        self.fp = rspreader.openlog(testdir + 'malformed_html.mlog')

        # make sure the offending markup really is in the test data
        head = self.fp.read(1024)
        position = head.find(bad_markup)
        self.assert_(position > 0)
        self.fp.seek(0)

        outcome = distillML.distill(self.fp, self.buf, {})
        code = outcome[0]
        self.assertEqual(distillML.PARSE_ERROR, code)
    def testMetaVariations(self):

        # See meta_variations.html for variations of attributes formatting

        self.fp = rspreader.openlog(testdir + 'meta_variations.html')
        meta = {}
        distillML.distill(self.fp, self.buf, meta)
        self.assertEqual(u'word1 word2 word3', meta['title'])            # title span multiple lines
        self.assertEqual(u'word1 & word2 <word3>', meta['description'])  # all cap 'DESCRIPTION'; HTML encoding decoded; attr span lines
        # 'in' replaces the deprecated dict.has_key()
        self.assertTrue('keywords' not in meta)
        self.assertEqual(3, len(meta))
    def testMetaVariations(self):

        # See meta_variations.html for variations of attributes formatting

        self.fp = rspreader.openlog(testpath / "meta_variations.html")
        meta = {}
        distillML.distill(self.fp, self.buf, meta)
        self.assertEqual(u"word1 word2 word3", meta["title"])  # title span multiple lines
        self.assertEqual(
            u"word1 & word2 <word3>", meta["description"]
        )  # all cap 'DESCRIPTION'; HTML encoding decoded; attr span lines
        # 'in' replaces the deprecated dict.has_key()
        self.assertTrue("keywords" not in meta)
        self.assertEqual(3, len(meta))
    def testAttrEncodingProblem(self):
        """Bad HTML found in http://news.bbc.co.uk/"""

        # The <b> inside the quoted attribute value ought to have been
        # written as &lt;b&gt;. No workaround is attempted for now; the
        # test only requires that distilling still reports success.
        markup = (
            '<html><body>\n'
            '<p>filler.filler.filler.filler.filler.filler.filler</p>\n'
            '<a onmouseover="ChangeText(\'<b>Back to previous</b>\');">text</a>\n'
            '</body></html>'
        )

        source = StringIO.StringIO(markup)
        status = distillML.distill(source, self.buf, {})
        self.assertEqual(0, status)
        distilled = self.buf.getvalue()
    def testMeta(self):

        # Check basic meta data parsing

        self.fp = rspreader.openlog(testpath / "basictags.html")
        meta = {}
        result = distillML.distill(self.fp, self.buf, meta)

        # fail with a clear message if distilling itself did not succeed,
        # rather than with a KeyError on the meta lookups below
        self.assertEqual(0, result)

        self.assertEqual(u"Basic HTML Sample Document", meta["title"])
        self.assertEqual(
            u"Description: this sample contains all basic HTML tags the converter understands", meta["description"]
        )
        self.assertEqual(u"basic HTML, sample", meta["keywords"])

        # the three entries checked above plus one this test does not inspect
        self.assertEqual(4, len(meta))
    def testParseCrazyTitleProblem(self):

        # Test problem in parsing a missing <title>
        doc = """<html><head>hello</title></head>
<body>
<p>filler.filler.filler.filler.filler.filler.filler</p>
</body></html>"""

        meta = {}
        result = distillML.distill(StringIO.StringIO(doc), self.buf, meta)
        self.assertEqual(0, result)

        s = self.buf.getvalue()
        self.assert_(not meta.has_key('title'))     # no title
        self.assert_(s.find('filler') >= 0)         # but sort of getting rest of data
    def testParseEmptyTagProblem(self):
        """Test problem in parsing <br/>"""

        # sgmllib.SGMLParser in various versions of Python has trouble
        # parsing <br/>. Writing <br /> with a space was suggested as a
        # workaround, but that is not an option for documents fetched
        # from the web.
        markup = (
            '<html><body>\n'
            '<p>filler.filler.filler.filler.filler.filler.filler</p>\n'
            '<p>abc<br/>def</p>\n'
            '</body></html>'
        )

        source = StringIO.StringIO(markup)
        status = distillML.distill(source, self.buf, {})
        self.assertEqual(0, status)

        distilled = self.buf.getvalue()

        br_at = distilled.find('abc<br>')
        self.assert_(br_at > 0)

        # a '>' left over from the preceding <br/> would be the symptom
        stray_at = distilled.find('>def')
        self.assert_(stray_at < 0)
    def _get_snapshot_content(self, item):
        """ Return the content parsed back from the distilled snapshot of item.

            The snapshot is looked up under the 'weblibsnapshot' path as
            '<item.id>.mhtml', or '_.mhtml' when item.id is -1.

            @return the content part of the distilled snapshot; '' when the
                snapshot file does not exist or the archive yields no
                response for its root URI.
        """
        # TODO: refactor
        if item.id == -1:
            filename = '_.mhtml'
        else:
            filename = '%s.mhtml' % item.id
        spath = cfg.getpath('weblibsnapshot')/filename
        if not spath.exists():
            return ''

        contentBuf = StringIO.StringIO()
        fp = spath.open('rb')       # TODO: move to weblib? getSnapshotFile()?
        try:
            lwa = mhtml.LoadedWebArchive(fp)
            resp = lwa.fetch_uri(lwa.root_uri)
            if not resp:
                return ''

            # TODO: lucene_logic: use to docid is confusing with lucene's internal docid?
            # TODO: mind content-type, encoding, framed objects??

            # resp goes to the distiller unread. Reading it here first (as
            # was done before, into an unused variable) would presumably
            # leave nothing for distill to consume.
            meta = {}
            distillML.distill(resp, contentBuf, meta=meta)
        finally:
            # do not leak the snapshot file handle, whatever happens above
            fp.close()

        contentBuf.seek(0)
        # TODO: what's the deal with writeHeader?
        meta, content = distillparse.parseDistillML(contentBuf, writeHeader=None)
        return content
    def testJavascript(self):
        # Each entry pairs a fixture under js/ with the exact result that
        # distill is expected to return for it.
        expectations = [
            ('js/doc_write_html.js', (distillML.JS, u'document.write(')),
            ('js/function.js', (distillML.JS, u'function YADopenWindow(x){')),
            ('js/ibHtml1=.js', (distillML.JS, u'ibHtml1="')),
            ('js/var_with_html.js', (distillML.JS, u'var pophtml =')),
            ('js/small1.js', (distillML.NON_HTML, 'unknown')),
            ('js/small2.js', (distillML.NON_HTML, 'unknown')),
        ]

        for relpath, expected in expectations:
            self.fp = rspreader.openlog(testdir + relpath)
            outcome = distillML.distill(self.fp, self.buf, {})
            self.assertEqual(expected, outcome)
    def testJavascript(self):
        # Each entry pairs a fixture under js/ with the exact result that
        # distill is expected to return for it.
        expectations = [
            ("js/doc_write_html.js", (distillML.JS, u"document.write(")),
            ("js/function.js", (distillML.JS, u"function YADopenWindow(x){")),
            ("js/ibHtml1=.js", (distillML.JS, u'ibHtml1="')),
            ("js/var_with_html.js", (distillML.JS, u"var pophtml =")),
            ("js/small1.js", (distillML.NON_HTML, "unknown")),
            ("js/small2.js", (distillML.NON_HTML, "unknown")),
        ]

        for relpath, expected in expectations:
            self.fp = rspreader.openlog(testpath / relpath)
            outcome = distillML.distill(self.fp, self.buf, {})
            self.assertEqual(expected, outcome)
 def testLowvisible(self):
    # The fixture is expected to be rejected with LOWVISIBLE as the first
    # element of the distill result.
    logname = 'lowvisible(doubleclick).mlog'
    self.fp = rspreader.openlog(testdir + logname)
    outcome = distillML.distill(self.fp, self.buf, {})
    code = outcome[0]
    self.assertEqual(distillML.LOWVISIBLE, code)
 def testCSS(self):
     # A stylesheet fixture is expected to come back as NON_HTML / 'unknown'.
     expected = (distillML.NON_HTML, 'unknown')
     self.fp = rspreader.openlog(testdir + 'main.css')
     outcome = distillML.distill(self.fp, self.buf, {})
     self.assertEqual(expected, outcome)
 def testFrameset(self):
     # The frameset fixture is expected to yield FRAMESET as the first
     # element of the distill result.
     self.fp = rspreader.openlog(testdir + 'frameset.html')
     outcome = distillML.distill(self.fp, self.buf, {})
     code = outcome[0]
     self.assertEqual(distillML.FRAMESET, code)
 def testMagicFiltered(self):
     # The GIF fixture is expected to be reported as NON_HTML together with
     # the 'image/gif' type.
     expected = (distillML.NON_HTML, 'image/gif')
     self.fp = rspreader.openlog(testdir + 'gif.qlog')
     outcome = distillML.distill(self.fp, self.buf, {})
     self.assertEqual(expected, outcome)
 def testDomainFiltered(self):
     # With an empty input stream and only a uri in the meta data, distill
     # is expected to return EXDOMAIN with the matched domain suffix.
     meta = {"uri": "http://x.googlesyndication.com/"}
     expected = (distillML.EXDOMAIN, ".googlesyndication.com")
     self.fp = StringIO.StringIO()
     outcome = distillML.distill(self.fp, self.buf, meta)
     self.assertEqual(expected, outcome)
 def testMagicFiltered(self):
     # The GIF fixture is expected to be reported as NON_HTML together with
     # the "image/gif" type.
     expected = (distillML.NON_HTML, "image/gif")
     self.fp = rspreader.openlog(testpath / "gif.qlog")
     outcome = distillML.distill(self.fp, self.buf, {})
     self.assertEqual(expected, outcome)
 def testDomainFiltered(self):
     # With an empty input stream and only a uri in the meta data, distill
     # is expected to return EXDOMAIN with the matched domain suffix.
     meta = {'uri': 'http://x.googlesyndication.com/'}
     expected = (distillML.EXDOMAIN, '.googlesyndication.com')
     self.fp = StringIO.StringIO()
     outcome = distillML.distill(self.fp, self.buf, meta)
     self.assertEqual(expected, outcome)
 def testCSS(self):
     # A stylesheet fixture is expected to come back as NON_HTML / "unknown".
     expected = (distillML.NON_HTML, "unknown")
     self.fp = rspreader.openlog(testpath / "main.css")
     outcome = distillML.distill(self.fp, self.buf, {})
     self.assertEqual(expected, outcome)