def __init__(self, lucene_results, searcher, compressor, query):
    """Record the search hits and prepare the highlighter for them.

    Stores ``lucene_results``, ``searcher`` and ``compressor`` on the
    instance, builds a ``lucene.Highlighter`` scored against ``query``,
    creates the analyzer, and caches the number of results to expose.
    """
    self.lucene_results = lucene_results
    self.searcher = searcher
    self.compressor = compressor

    # The highlighter marks the terms that matched ``query``.
    self.highlighter = lucene.Highlighter(lucene.QueryScorer(query))
    self.analyzer = BlogCorpusAnalyzer()

    # Fragment size of 10000: presumably large enough that a whole
    # document comes back as a single fragment -- TODO confirm.
    self.highlighter.setTextFragmenter(lucene.SimpleFragmenter(10000))

    # Never report more hits than the MAX_RESULTS cap.
    self.length = min(self.lucene_results.totalHits, MAX_RESULTS)
class SearchResults(object):
    """Sequence-like wrapper around Lucene hits yielding ``Result`` objects.

    Supports ``len()``, integer indexing and slicing. Each document is
    fetched and highlighted only when it is asked for.
    """

    def __init__(self, lucene_results, searcher, compressor, query):
        """Record the search hits and prepare the highlighter for them.

        Args:
            lucene_results: hits object; ``totalHits`` and ``scoreDocs``
                are read from it.
            searcher: object whose ``doc(doc_id)`` returns a stored document.
            compressor: object with a ``decompress`` method, or ``None``
                when the text is read from the plain ``contents`` field.
            query: query passed to ``lucene.QueryScorer`` for highlighting.
        """
        self.lucene_results = lucene_results
        self.searcher = searcher
        self.compressor = compressor

        scorer = lucene.QueryScorer(query)
        self.highlighter = lucene.Highlighter(scorer)
        self.analyzer = BlogCorpusAnalyzer()

        # Fragment size of 10000: presumably large enough that a whole
        # document comes back as a single fragment -- TODO confirm.
        fragmenter = lucene.SimpleFragmenter(10000)
        self.highlighter.setTextFragmenter(fragmenter)

        # Never report more hits than the MAX_RESULTS cap.
        self.length = min(self.lucene_results.totalHits, MAX_RESULTS)

    def __len__(self):
        """Return the number of results exposed (capped at MAX_RESULTS)."""
        return self.length

    def uncompress_contents(self, doc):
        """Return the decompressed value of the document's ``compressed`` field."""
        unicode_str = doc.getField('compressed').stringValue()
        # HACK: the field is not indexed as a binary field, so the stored
        # value comes back as a unicode string and has to be turned back
        # into a byte string before decompressing. Fix the indexing instead.
        normal_str = ''.join(chr(ord(x)) for x in unicode_str.encode('utf8'))
        return self.compressor.decompress(normal_str)

    def _error_result(self):
        """Build the placeholder ``Result`` used when highlighting fails."""
        # TODO: fix this ridiculous thing (sentinel instead of a real error).
        return Result(['ERROR', 'ERROR', 'ERROR'], 1, 2)

    def get_doc(self, number):
        """Fetch hit ``number`` and return it as a highlighted ``Result``.

        The document text is highlighted, split on whitespace, and each
        token is reduced to the part before its first ``@`` (or the second
        part when the first is empty or just the ``<B>`` marker).

        Args:
            number: index into ``lucene_results.scoreDocs``.

        Returns:
            ``Result(words, start, end)`` where ``start``/``end`` are the
            inclusive token indices of the first contiguous run of
            highlighted tokens, or the ``ERROR`` placeholder result when
            nothing could be highlighted.
        """
        scoredoc = self.lucene_results.scoreDocs[number]
        doc = self.searcher.doc(scoredoc.doc)
        if self.compressor is not None:
            contents = self.uncompress_contents(doc)
        else:
            contents = doc.getField('contents').stringValue()

        token_stream = self.analyzer.tokenStream(
            'f', lucene.StringReader(contents))
        highlighted = self.highlighter.getBestFragment(token_stream, contents)
        if highlighted is None:
            return self._error_result()

        actual_words = []
        highlighted_words = []
        for index, word in enumerate(highlighted.split()):
            parts = word.split('@')
            # A token that starts with '@' or '<B>@' has nothing useful in
            # its first part, so fall back to the second one.
            wordform = parts[0] if parts[0] not in ('', '<B>') else parts[1]
            if word.startswith('<B>'):
                highlighted_words.append(index)
            actual_words.append(wordform.replace('<B>', ''))

        # A fragment without any token starting with '<B>' used to crash
        # with IndexError below; treat it like a failed highlight instead.
        if not highlighted_words:
            return self._error_result()

        # Take only the first contiguous stretch of highlighted words.
        start = highlighted_words[0]
        end = start
        for index in highlighted_words[1:]:
            if index != end + 1:
                break
            end = index
        return Result(actual_words, start, end)

    def __getitem__(self, item):
        """Return one result, or a list of results for a slice.

        For a slice, bounds are normalised against ``len(self)`` so open
        ended, negative, stepped and over-long slices behave like they do
        on a list. Results whose ``before`` attribute contains ``'ERROR'``
        (presumably the placeholder from a failed highlight) are dropped
        from slices; integer indexing returns whatever ``get_doc`` gives.
        """
        if isinstance(item, slice):
            start, stop, step = item.indices(len(self))
            results = []
            for i in range(start, stop, step):
                d = self.get_doc(i)
                if 'ERROR' not in d.before:
                    results.append(d)
            return results
        return self.get_doc(item)