def get_snippet(self, hit, normal=lambda x: x, highlight=lambda x: x,
                maxsents=3, maxchars=100, maxlr=20, default_snippet_sentid=0):
    """Build a display snippet for a search hit.

    Normally assumes that self.iter() was already called, so the sentence
    contexts for this hit are stored in self.snippets.  When they are not,
    falls back to [default_snippet_sentid].

    hit -- an (idx, docid) pair identifying the document.
    normal -- callable used to decorate non-matching text.
    highlight -- callable used to decorate matching text.
    maxsents -- max sentences gathered per context position.
    maxchars -- rough upper bound on snippet length.
    maxlr -- max characters of left/right context around a match.
    default_snippet_sentid -- sentence id used when no contexts exist.

    Returns (loc, mtime, title, snippet).
    Raises KeyError if the document info cannot be looked up.
    """
    (idx, docid) = hit
    # Fix: the original wrapped this call in `try: ... except KeyError: raise`,
    # a no-op wrapper; the KeyError propagates to the caller either way.
    (loc, mtime) = idx_docid2info(idx, docid)
    try:
        # Sentence 0 serves as the document title, if present.
        title = idx_sent(idx, docid, 0)
    except KeyError:
        title = None
    contexts = self.snippets.get(hit, [default_snippet_sentid])
    snippet = u''
    sentid0 = None
    for sentid in sorted(contexts):
        # Avoid repeating the same sentence position.
        if sentid0 == sentid:
            continue
        # For each position, we take up to maxsents sentences.
        text = ''
        try:
            for (i, s) in enumerate(idx_sents(idx, docid, sentid)):
                if text:
                    text += ' '
                text += s
                if maxsents <= i + 1 or maxchars <= len(text):
                    break
        except KeyError:
            # Missing sentence data: proceed with whatever was gathered.
            pass
        x = self.matched_range(text)
        if len(x) == 1:
            # No highlight (no pattern specified): plain truncated text.
            snippet += normal(text[:maxchars]) + u'...'
        else:
            # Highlight the matched parts.  matched_range() yields an
            # alternating list of (state, substring) pairs; with at least
            # one match it has a left edge, one or more middles, and a
            # right edge.
            assert 3 <= len(x)
            # Prepend the leftmost context (clipped to maxlr chars).
            (state, left) = x[0]
            if not state:
                snippet += u'... ' + normal(left[-maxlr:])
            for (state, s) in x[1:-1]:
                if not s:
                    continue
                if state:
                    snippet += highlight(s)
                else:
                    snippet += normal(s)
            # Append the rightmost context (clipped to maxlr chars).
            (state, right) = x[-1]
            if not state:
                snippet += normal(right[:maxlr]) + u'...'
        # Stop once there is no room left for another meaningful context.
        if maxchars - len(snippet) < maxlr:
            break
        sentid0 = sentid
    return (loc, mtime, title, snippet)
def get_snippet(self, hit, normal=lambda x:x, highlight=lambda x:x,
                maxsents=3, maxchars=100, maxlr=20, default_snippet_sentid=0):
    """Return (loc, mtime, title, snippet) for the given (idx, docid) hit.

    Contexts are expected to be populated in self.snippets by a prior
    self.iter() call; otherwise [default_snippet_sentid] is used.  The
    snippet concatenates up to maxsents sentences per context position,
    decorated through normal()/highlight(), capped near maxchars with at
    most maxlr characters of context on either side of a match.
    """
    (idx, docid) = hit
    try:
        (loc, mtime) = idx_docid2info(idx, docid)
    except KeyError:
        # Document info is mandatory; let the caller see the failure.
        raise
    try:
        title = idx_sent(idx, docid, 0)
    except KeyError:
        title = None
    contexts = self.snippets.get(hit, [default_snippet_sentid])
    snippet = u''
    prev_sentid = None
    for sentid in sorted(contexts):
        # Skip a position we already rendered.
        if prev_sentid == sentid:
            continue
        # Collect up to maxsents sentences starting at this position.
        chunk = ''
        try:
            for (nseen, sent) in enumerate(idx_sents(idx, docid, sentid)):
                chunk = chunk + ' ' + sent if chunk else sent
                if nseen + 1 >= maxsents or len(chunk) >= maxchars:
                    break
        except KeyError:
            pass
        ranges = self.matched_range(chunk)
        if len(ranges) == 1:
            # No pattern was specified: emit the plain, truncated text.
            snippet += normal(chunk[:maxchars]) + u'...'
        else:
            # At least one match: alternating (state, substring) pairs
            # with distinct left and right edges.
            assert 3 <= len(ranges)
            (matched, left) = ranges[0]
            if not matched:
                # Leftmost context, clipped from the right.
                snippet += u'... ' + normal(left[-maxlr:])
            for (matched, part) in ranges[1:-1]:
                if part:
                    snippet += highlight(part) if matched else normal(part)
            (matched, right) = ranges[-1]
            if not matched:
                # Rightmost context, clipped from the left.
                snippet += normal(right[:maxlr]) + u'...'
        # No room for another context worth showing? Stop here.
        if maxchars - len(snippet) < maxlr:
            break
        prev_sentid = sentid
    return (loc, mtime, title, snippet)
def start_iter(self):
    """Iterate over the search results, yielding accepted (idx, docid) hits.

    Each candidate from self.get_docids() is first screened by
    self.doc_preds (if any); a predicate may accept (>0) or reject (<0)
    immediately, or leave the decision open (0).  Undecided candidates
    are then filtered by their context patterns; matching sentence ids
    are recorded in self.snippets.  Accepted hits are appended to
    self.found_docs and yielded.  Raises SearchTimeout when a single
    document's processing exceeds self.timeout.
    """
    from time import time
    for (idx, docid, contexts) in self.get_docids():
        # Timestamp for the per-document timeout check below.
        started = 0
        if self.timeout:
            started = time()
        # verdict < 0: rejected immediately.
        # verdict > 0: accepted immediately.
        # verdict = 0: undecided (further examination required).
        verdict = 0
        if self.doc_preds:
            # Apply the document predicates against the document location.
            try:
                (loc, _) = idx_docid2info(idx, docid)
                for pred in self.doc_preds:
                    verdict = pred(loc)
                    if verdict:
                        break
            except KeyError:
                # Unknown document info: leave the verdict undecided.
                pass
        hit = (idx, docid)
        self.narrowed += 1
        if verdict == 0:
            # contexts is a list of ([sentid], regpat) pairs; each sentid
            # list is stored in descending order in the index file.
            # Collect one matching sentid per pattern; unless ALL patterns
            # match (or self.disjunctive is set), the document is dropped.
            matched_sentids = []
            for (sentids, pat) in contexts:
                # Put the sentids into ascending order (in place).
                sentids.reverse()
                for sentid in sentids:
                    try:
                        sent = idx_sent(idx, docid, sentid)
                    except KeyError:
                        continue
                    if not pat or pat.search(sent):
                        matched_sentids.append(sentid)
                        break
                else:
                    # This pattern matched nowhere.
                    if not self.disjunctive:
                        verdict = -1
                        break
            else:
                # Every pattern was examined without a conjunctive failure.
                if matched_sentids:
                    verdict = 1
            self.snippets[hit] = matched_sentids
        if 0 < verdict:
            self.found_docs.append(hit)
            yield hit
        # Abort if the specified time has passed for this document.
        if self.timeout and started + self.timeout <= time():
            raise SearchTimeout(self)
    return
def start_iter(self):
    """Iterate over the search results, yielding (idx, docid) hits.

    Candidates come from self.get_docids().  Document predicates
    (self.doc_preds) may accept or reject a document outright; otherwise
    the context patterns decide, recording matching sentence ids in
    self.snippets for later snippet building.  Accepted hits are also
    appended to self.found_docs.  Raises SearchTimeout when a single
    document takes longer than self.timeout seconds.
    """
    from time import time
    for (idx, docid, contexts) in self.get_docids():
        # Start time for the per-document timeout check at loop bottom.
        t0 = 0
        if self.timeout:
            t0 = time()
        pol = 0
        if self.doc_preds:
            # Apply the document predicates to the document location;
            # the first nonzero result wins.
            try:
                (loc, _) = idx_docid2info(idx, docid)
                for pred in self.doc_preds:
                    pol = pred(loc)
                    if pol:
                        break
            except KeyError:
                # No document info available: leave pol undecided.
                pass
        # pol < 0: rejected immediately.
        # pol > 0: accepted immediately.
        # pol = 0: undecided (further examination required).
        hit = (idx, docid)
        self.narrowed += 1
        if pol == 0:
            # contexts (a list of sentids) is stored in descending order in an index file.
            filtered = []
            # Receives a list of pairs of sentids and regexp patterns:
            # [([sentid], regpat), ...] and builds a sentid list that
            # actually matches the patterns.  Unless ALL the patterns
            # match (or self.disjunctive is set), it yields nothing.
            for (sentids, pat) in contexts:
                # Make the list ascending (NOTE: mutates contexts in place).
                sentids.reverse()
                for sentid in sentids:
                    try:
                        sent = idx_sent(idx, docid, sentid)
                    except KeyError:
                        continue
                    if not pat or pat.search(sent):
                        # First matching sentence for this pattern.
                        filtered.append(sentid)
                        break
                else:
                    # Pattern matched no sentence: fatal in conjunctive mode.
                    if not self.disjunctive:
                        pol = -1
                        break
            else:
                # All patterns examined without conjunctive failure;
                # accept if anything matched.
                if filtered:
                    pol = 1
            self.snippets[hit] = filtered
        if 0 < pol:
            self.found_docs.append(hit)
            yield hit
        # Abort if the specified time is passed.
        if self.timeout and t0 + self.timeout <= time():
            raise SearchTimeout(self)
    return