def testDistillTxt(self):
    """
    Distill the plain text message log 'plaintext.mlog' and check the output.

    distillTxt is expected to return 0, and patterns_tester.checkPatterns
    is expected to return a falsy value for the distilled buffer.
    NOTE(review): presumably checkPatterns returns the first pattern it
    could not find - confirm against patterns_tester.
    """
    self.fp = rspreader.openlog(testdir + 'plaintext.mlog')
    result = distillML.distillTxt(self.fp, self.buf, {})
    self.assertEqual(0, result)

    # check content; rewind first because distillTxt has written to self.buf
    self.buf.seek(0)
    p = patterns_tester.checkPatterns(
        self.buf,
        ['Copyright', 'All rights reserved.', 'OF SUCH DAMAGE.'],
    )
    # assertFalse replaces the deprecated assert_ alias; (p,) keeps the
    # message formatting safe even if p happens to be a tuple.
    self.assertFalse(p, 'unexpected: %s' % (p,))
def testDistillTxt(self):
    """
    Distill the plain text message log "plaintext.mlog" and check the output.

    distillTxt is expected to return 0, and patterns_tester.checkStrings
    is expected to return a falsy value for the distilled text.
    NOTE(review): presumably checkStrings returns the first string it
    could not find - confirm against patterns_tester.
    """
    self.fp = rspreader.openlog(testpath / "plaintext.mlog")
    result = distillML.distillTxt(self.fp, self.buf, {})
    self.assertEqual(0, result)

    # check content; rewind first because distillTxt has written to self.buf
    self.buf.seek(0)
    p = patterns_tester.checkStrings(
        self.buf.read(),
        ["Copyright", "All rights reserved.", "OF SUCH DAMAGE."],
    )
    # assertFalse replaces the deprecated assert_ alias; (p,) keeps the
    # message formatting safe even if p happens to be a tuple.
    self.assertFalse(p, "unexpected: %s" % (p,))
def transformDoc(self, inpath, outpath):
    """
    Parse a message log file, filter out unwanted documents and transform
    the remaining ones.

    The file specified by outpath is only left on disk when the
    transformation succeeds. Output of a discarded document, or of a
    distill call that raised, is removed.

    @param inpath path of the message log file to read.
    @param outpath path of the transformed document to write.
    @return whether the document is transformed.
    """
    mtime = os.path.getmtime(inpath)
    dt = datetime.datetime.utcfromtimestamp(mtime)
    timestamp = _formatTimestamp(dt)
    inname = os.path.basename(inpath)

    rfile = open(inpath, 'rb')
    try:
        minfo = messagelog.MessageInfo.parseMessageLog(rfile)
        if minfo.discard:
            # these should be filtered in logging phase, but double
            # check here perhaps for logs collected from other sources.
            log.info('discard %s %s - %s', inname, minfo.flags, minfo.req_path)
            return False

        meta = _extract_meta(minfo, timestamp)

        # simple filtering: keep only status 200-299 and html/txt content
        if (minfo.status < 200) or (300 <= minfo.status):
            return False
        if minfo.ctype not in ('html', 'txt'):
            return False

        # rewind so the content reader starts from the beginning of the log
        # (parseMessageLog has presumably advanced the file position)
        rfile.seek(0)
        contentFp = rspreader.ContentReader(rfile, inpath)

        discard = False
        distilled = False   # stays False if the distill call raises
        wfile = open(outpath, 'wb')
        try:
            if minfo.ctype == 'html':
                result = distillML.distill(contentFp, wfile, meta=meta)
            else:
                result = distillML.distillTxt(contentFp, wfile, meta=meta)
            distilled = True
            if result != 0:
                log.info('discard %s %s - %s', inname, str(result), minfo.req_path)
                discard = True
        finally:
            wfile.close()
            if not distilled:
                # The distill call raised: do not leave a partial output
                # behind. Removal is best effort so that a failure here
                # does not mask the original exception.
                try:
                    os.remove(outpath)
                except OSError:
                    log.info('cannot remove partial output %s', outpath)
    finally:
        rfile.close()

    if discard:
        os.remove(outpath)  # remove unwanted output
        return False

    filename = os.path.basename(outpath)
    log.debug('transformed %s (%s) - %s', filename, meta.get('encoding','?'), minfo.req_path)
    return True
def test_big5_txt(self):
    """
    Distill the big5 encoded text file 'ah_ying.txt'.

    The charset is supplied through the 'content-type' meta entry.
    distillTxt is expected to return 0, to record 'big5 [HTTP]' as the
    meta 'encoding', and to write UTF-8 output that contains the expected
    content taken from self.test_data.
    """
    # open() instead of the Python 2 only file() builtin
    self.fp = open(testdir + 'ah_ying.txt', 'rb')
    # title is only needed by the disabled assertion below
    title, content = self.test_data[19:21]
    self.meta['content-type'] = 'text/plain; charset=big5'
    result = distillML.distillTxt(self.fp, self.buf, self.meta)
    self.assertEqual(0, result)
    self.assertEqual(self.meta['encoding'], 'big5 [HTTP]')
    #self.assertEqual(self.meta['title'], title)
    s = self.buf.getvalue().decode('utf8')
    # NOTE(review): `> 0` also rejects a match at offset 0 - confirm this
    # is intended (output presumably starts with something else).
    self.assertTrue(s.find(content) > 0)
def test_big5_txt(self):
    """
    Distill the big5 encoded text file "ah_ying.txt".

    The charset is supplied through the "content-type" meta entry.
    distillTxt is expected to return 0, to record "big5 [HTTP]" as the
    meta "encoding", and to write UTF-8 output that contains the expected
    content taken from self.test_data.
    """
    # open() instead of the Python 2 only file() builtin
    self.fp = open(testpath / "ah_ying.txt", "rb")
    # title is only needed by the disabled assertion below
    title, content = self.test_data[19:21]
    self.meta["content-type"] = "text/plain; charset=big5"
    result = distillML.distillTxt(self.fp, self.buf, self.meta)
    self.assertEqual(0, result)
    self.assertEqual(self.meta["encoding"], "big5 [HTTP]")
    # self.assertEqual(self.meta['title'], title)
    s = self.buf.getvalue().decode("utf8")
    # NOTE(review): `> 0` also rejects a match at offset 0 - confirm this
    # is intended (output presumably starts with something else).
    self.assertTrue(s.find(content) > 0)
def testMagicFilteredTxt(self):
    """
    Wrong media type text/plain.

    The log 'favicon.ico_text(nutch).mlog' is fed to distillTxt, which is
    expected to reject it with the NON_HTML code paired with the media
    type 'image/vnd.microsoft.icon'.
    """
    logpath = testdir + 'favicon.ico_text(nutch).mlog'
    self.fp = rspreader.openlog(logpath)

    outcome = distillML.distillTxt(self.fp, self.buf, {})

    expected = (distillML.NON_HTML, 'image/vnd.microsoft.icon')
    self.assertEqual(expected, outcome)
def testDomainFilteredTxt(self):
    """
    A document whose meta 'uri' is 'http://x.googlesyndication.com/' is
    expected to be rejected by distillTxt with the EXDOMAIN code paired
    with '.googlesyndication.com'. The input stream is empty.
    """
    self.fp = StringIO.StringIO()
    meta = {'uri':'http://x.googlesyndication.com/'}

    outcome = distillML.distillTxt(self.fp, self.buf, meta)

    expected = (distillML.EXDOMAIN, '.googlesyndication.com')
    self.assertEqual(expected, outcome)