def testParseMeta(self):
    """Header lines are parsed into the meta dictionary."""
    raw = ' header1 : value1 \nHEADER2:value2\n\n'
    stream = StringIO.StringIO(raw)
    meta, content = distillparse.parseDistillML(stream)

    self.assertEqual(2, len(meta))

    # 'header1': the extra spaces around name and value are trimmed.
    # 'header2': the upper-case header name is turned into lower case.
    expectations = (
        ('value1', 'header1'),
        ('value2', 'header2'),
    )
    for expected_value, header_name in expectations:
        self.assertEqual(expected_value, meta[header_name])
def indexDoc(self, path):
    """Archive and index the distilled document stored at *path*.

    Returns False when the search for the document's uri reports that it
    is already archived (the file is then discarded), and True once the
    document has been added to both the archive and the index.

    The 'uri' header is looked up with ``meta['uri']``, so a document
    without one raises and is neither archived nor indexed.
    """
    fp = open(path, 'rb')
    try:
        basename = os.path.split(path)[1]
        meta, content = distillparse.parseDistillML(fp, distillparse.writeHeader)

        # No uri -> the lookup raises and this document is discarded.
        uri = meta['uri']

        # Consult the index first: skip documents that are already there.
        archived = self._searchForArchived(uri, meta)
        if archived:
            log.info('discard %s archived(%s) - %s' % (basename, archived, uri))
            return False

        # Store the raw file in the archive; rewind because the parser
        # has already consumed the stream.
        fp.seek(0)
        doc_id = docarchive.idCounter.getNewId()
        self.arcHandler.add_document(doc_id, fp)

        # Make the document searchable.
        self.writer.addDocument(doc_id, meta, content)

        # Track freshly added documents; an existing entry for the same
        # uri is overwritten by the new one.
        self.freshdocs[uri] = meta

        log.info('%s -> %s' % (basename, doc_id))
    finally:
        fp.close()

    return True
def reindex(dbdoc, beginId, endId, index_path):
    """Rebuild the index at *index_path* from archived documents.

    Every document id in the half-open range [beginId, endId) is read from
    the archive, parsed and added to a new index writer. Ids with no
    archived document (the archive read raises KeyError) are skipped.
    Progress is printed whenever ``i % NOTIFY_INTERVAL == 1``, and the
    index is optimized once all documents have been added.

    *dbdoc* is accepted for interface compatibility; it is not used here.

    The writer and the archive handler are closed even when parsing or
    indexing raises, so a failed run does not leave them open.
    """
    ah = docarchive.ArchiveHandler('r')
    try:
        writer = lucene_logic.Writer(index_path)
        try:
            writer.writer.minMergeDocs = 1000

            for i in xrange(beginId, endId):
                docid = '%09d' % i
                if i % NOTIFY_INTERVAL == 1:
                    print('%s Reindexing %09d' % (datetime.datetime.now(), i))

                zfile, filename = ah._open(docid)
                try:
                    data = zfile.read(filename)
                except KeyError:
                    continue    # skip holes

                fp = StringIO.StringIO(data)
                meta, content = distillparse.parseDistillML(fp, distillparse.writeHeader)
                writer.addDocument(docid, meta, content)

            print('%s optimizing' % datetime.datetime.now())
            writer.optimize()
        finally:
            # Close the writer on failure as well as on success.
            writer.close()
    finally:
        ah.close()
def testParseTagSpanBuffer(self):
    """Tags are still handled when they span a buffer boundary."""
    empty_header = '\n'

    # The parser is run with a bufsize of 10, and the body is laid out so
    # that its tags span the boundary of a 10-character buffer.
    body = 'abcdef<item>ghijk</li>lmn'

    stream = StringIO.StringIO(empty_header + body)
    meta, content = distillparse.parseDistillML(stream, bufsize=10)

    # <item> is kept while </li> is removed.
    self.assertEqual('abcdef<item>ghijklmn', content)
def _get_snapshot_content(self, item):
    """Return the distilled content of the snapshot saved for *item*.

    The snapshot is looked up under the 'weblibsnapshot' path as
    '<id>.mhtml', or '_.mhtml' when ``item.id`` is -1. An empty string is
    returned when that file does not exist or when the web archive yields
    no response for its root uri.

    The snapshot file is closed before returning, on every path.
    """
    # TODO: refactor
    if item.id == -1:
        filename = '_.mhtml'
    else:
        filename = '%s.mhtml' % item.id

    spath = cfg.getpath('weblibsnapshot')/filename
    if not spath.exists():
        return ''

    # TODO: check file exist, move to weblib? getSnapshotFile()?
    fp = spath.open('rb')
    try:
        lwa = mhtml.LoadedWebArchive(fp)
        resp = lwa.fetch_uri(lwa.root_uri)
        if not resp:
            return ''

        # TODO: lucene_logic: use to docid is confusing with lucene's internal docid?
        # TODO: mind content-type, encoding, framed objects??

        # NOTE(review): the data read here is discarded, and the same resp
        # is handed to distill() right below. Confirm whether this read is
        # required or whether it drains resp before distill() sees it.
        resp.read()

        meta = {}
        contentBuf = StringIO.StringIO()
        distillML.distill(resp, contentBuf, meta=meta)
    finally:
        # The archive is no longer needed once the content has been
        # distilled into contentBuf (or an early return was taken).
        fp.close()

    contentBuf.seek(0)
    # TODO: what's the deal with writeHeader?
    meta, content = distillparse.parseDistillML(contentBuf, writeHeader=None)
    return content
self.docid = self.doc.get('docid' ) self.date = self.doc.get('date' ) self.uri = self.doc.get('uri' ) # title & description are filled at hightlight() def highlight(self, analyzer, highlighter): maxNumFragmentsRequired = 2 try: fp = docarchive.get_document(self.docid) except Exception, e: # maybe the index is outdate to refer to some non-exist file log.exception('Unable to get "%s"' % self.docid) else: meta, content = distillparse.parseDistillML(fp) tokenStream = analyzer.tokenStream('content', StringIO.StringIO(content)) self.description = highlighter.getBestFragments(tokenStream, content, maxNumFragmentsRequired, "...") self.title = meta.get('title','') def parseQuery(phrase): query = QueryParser.parse(phrase, "content", StandardAnalyzer()) return query MAXRESULT = 1000 def sortHits(hits, maxDoc): """ Return list of (adj score, id, doc, original score) """
def testParseTags(self):
    """<h1> markup is stripped from the content while <item> stays."""
    empty_header = '\n'
    markup = '<item><h1>*</h1></item>'

    stream = StringIO.StringIO(empty_header + markup)
    meta, content = distillparse.parseDistillML(stream)

    self.assertEqual('<item>*</item>', content)
def testParse0(self):
    """A minimal file (just an empty header) gives no meta and no content."""
    minimal = '\n'      # the empty header section
    stream = StringIO.StringIO(minimal)

    meta, content = distillparse.parseDistillML(stream)

    self.assertEqual(0, len(meta))
    self.assertEqual('', content)
def testParse00(self):
    """An empty file (invalid, as it lacks the header section) still parses."""
    stream = StringIO.StringIO('')

    meta, content = distillparse.parseDistillML(stream)

    self.assertEqual(0, len(meta))
    self.assertEqual('', content)