def shorten_excerpt(content, terms):
    """
    Collapse the stretches of an excerpt that are far from any matched term.

    The excerpt is split on whitespace and walked word by word. A word is
    kept while it sits at most EXCERPT_MATCH_WINDOW_SIZE positions after
    the most recent word matching one of the compiled terms; the window
    initially starts at position 0, so the head of the excerpt is kept
    even without a match. Each run of words falling outside the window is
    replaced by a single '...'. Naive implementation.

    Returns the kept words joined by single spaces.
    """
    patterns = compile_terms(terms)
    kept_words = []
    last_match_position = 0
    in_gap = False

    for position, word in enumerate(content.split()):
        # A matched term re-opens the window and ends the current gap.
        if any(pattern.match(word) for pattern in patterns):
            last_match_position = position
            in_gap = False

        if position - last_match_position <= EXCERPT_MATCH_WINDOW_SIZE:
            kept_words.append(word)
        elif not in_gap:
            # Emit the ellipsis only once per gap; the remaining words of
            # the gap are dropped silently.
            kept_words.append('...')
            in_gap = True

    return ' '.join(kept_words)
def highlight(content, terms):
    """
    Highlight the occurrences of the terms in HTML, using BeautifulSoup.

    Each compiled term is given a 1-based index following its order in
    the compiled list. For every text node returned by
    ``soup.find_all(text=term)`` whose parent tag is not listed in
    PROTECTED_MARKUPS, each match of the term is replaced by
    HIGHLIGHTING_PATTERN formatted with that index and the matched text.

    Returns the serialized soup when at least one text node has been
    rewritten, otherwise the original ``content`` untouched.
    """
    soup = BeautifulSoup(content)
    changed = False

    for index, term in enumerate(compile_terms(terms), 1):

        def wrap_match(match):
            # ``index`` is read while ``term.sub`` runs, i.e. during the
            # current iteration, so it is always this term's index.
            return HIGHLIGHTING_PATTERN % {
                'index': index, 'term': match.group(0)}

        for node in soup.find_all(text=term):
            if node.parent.name not in PROTECTED_MARKUPS:
                highlighted = term.sub(wrap_match, node)
                node.replace_with(BeautifulSoup(highlighted))
                changed = True

        # Re-parse the whole document before handling the next term,
        # because the substitution doesn't rebuild the document tree.
        soup = BeautifulSoup(str(soup))

    return str(soup) if changed else content
def generate_term_positions(splitted_content, terms):
    """
    Collect the word positions at which each query term is matched.

    The terms are sorted, then compiled with ``compile_terms``. The words
    of ``splitted_content`` are scanned in order and every word is
    credited to the first compiled term matching it.

    Returns a list of lists: one sub-list of word indexes per term, in
    sorted-term order, with the terms that never matched left out.
    """
    patterns = compile_terms(sorted(terms))
    # One bucket per compiled pattern, so buckets and patterns always
    # line up, whatever the size or type of the ``terms`` argument.
    positions = [[] for _ in patterns]

    for word_index, word in enumerate(splitted_content):
        # enumerate() hands over the bucket index directly instead of
        # searching the pattern list again on every hit.
        for slot, pattern in enumerate(patterns):
            if pattern.match(word):
                positions[slot].append(word_index)
                # Only the first matching term gets the word.
                break

    return [found for found in positions if found]
def test_compile_terms(self):
    # The pattern compiled from 'toto' must match the upper-case 'TOTO'.
    compiled = compile_terms(['toto', 'titi'])
    first_pattern = compiled[0]
    self.assertTrue(first_pattern.match('TOTO'))