Пример #1
0
def shorten_excerpt(content, terms):
    """
    Compress an overly long excerpt by dropping runs of words that fall
    outside a fixed-size window around the matched query terms, inserting
    a single '...' placeholder for each dropped run.

    Naive single-pass implementation over whitespace-split words.
    """
    kept_words = []
    last_match_index = 0
    in_gap = False
    compiled = compile_terms(terms)

    for index, word in enumerate(content.split()):
        # A fresh term match resets the window and ends any active gap.
        if any(pattern.match(word) for pattern in compiled):
            last_match_index = index
            in_gap = False

        # Too far from the last matched term: drop this word.
        if index - last_match_index > EXCERPT_MATCH_WINDOW_SIZE:
            # Emit the ellipsis only once at the start of a dropped run.
            if not in_gap:
                kept_words.append('...')
                in_gap = True
            continue

        kept_words.append(word)

    return ' '.join(kept_words)
Пример #2
0
def highlight(content, terms):
    """
    Wrap every occurrence of each query term in the HTML `content` with
    the highlighting markup, skipping text nodes inside protected tags.

    Returns the updated HTML string, or the original `content` unchanged
    when no term matched anywhere.
    """
    index = 1
    update_content = False
    soup = BeautifulSoup(content)
    terms = compile_terms(terms)

    for term in terms:
        # Bind the current index as a default argument so the replacement
        # callback neither shadows this function's name nor relies on
        # late binding of the loop variable; define it once per term
        # instead of once per matched text node.
        def _wrap_match(match, index=index):
            return HIGHLIGHTING_PATTERN % {
                'index': index, 'term': match.group(0)}

        for text in soup.find_all(text=term):
            # Never rewrite text inside protected markup.
            if text.parent.name in PROTECTED_MARKUPS:
                continue

            new_text = term.sub(_wrap_match, text)
            text.replace_with(BeautifulSoup(new_text))
            update_content = True
        # Reload the entire soup, because substitution
        # doesn't rebuild the document tree.
        soup = BeautifulSoup(str(soup))
        index += 1
    if update_content:
        return str(soup)
    return content
Пример #3
0
def generate_term_positions(splitted_content, terms):
    """
    Iterates over the words in the corpus and stores the locations of
    each matched query term. This data is structured as a list of lists,
    where each sub-list contains all of the positions for a matched query
    term (terms are considered in sorted order; terms that never match
    are dropped from the result).
    """
    terms = compile_terms(sorted(terms))
    # Size the accumulator from the compiled list itself, so the two
    # structures cannot fall out of step.
    positions = [[] for _ in terms]

    for word_index, word in enumerate(splitted_content):
        for term_index, term in enumerate(terms):
            if term.match(word):
                # enumerate gives the slot index directly, avoiding the
                # O(n) list scan that `terms.index(term)` performed per
                # matched word.
                positions[term_index].append(word_index)
                # A word is credited to the first matching term only.
                break

    return [term_positions for term_positions in positions if term_positions]
Пример #4
0
 def test_compile_terms(self):
     """Compiled terms must match case-insensitively."""
     compiled = compile_terms(['toto', 'titi'])
     self.assertTrue(compiled[0].match('TOTO'))
Пример #5
0
 def test_compile_terms(self):
     # compile_terms should yield patterns that match regardless of case.
     self.assertTrue(
         compile_terms(['toto', 'titi'])[0].match('TOTO'))