Exemplo n.º 1
0
Arquivo: nlp.py Projeto: XI-lab/axel
def collocations(index, cutoff=2):
    """
    Extract collocations from n-gram index.

    Bigrams (keys consisting of exactly two whitespace-separated words) whose
    count is strictly greater than ``cutoff`` are selected from ``index``,
    cleaned up with the filters of an ``AbstractCollocationFinder`` and then
    handed, together with the full index, to ``_generate_possible_ngrams``
    and ``_update_ngram_counts``.

    :param index: n-gram index; keys are n-gram strings whose words are
        separated by whitespace, values are their counts
    :type index: dict
    :param cutoff: bigrams whose count is not greater than this value are
        ignored
    :return: the value produced by ``_update_ngram_counts``
    :rtype: list

    NOTE(review): the return type is taken from the original docstring; the
    real type is whatever ``_update_ngram_counts`` returns -- confirm.
    """

    def filter_punkt(word):
        # Truthy (a match object) when _PUNKT_RE matches at the start of the
        # word -- presumably a punctuation pattern, defined elsewhere.
        return _PUNKT_RE.match(word)

    def filter_len(word):
        # Words shorter than 3 characters are rejected unless they are
        # all upper case.
        return len(word) < 3 and not word.isupper()

    # Keep only bigrams whose frequency is above the cutoff.  The count is
    # checked first so rare n-grams are skipped without being split, and each
    # key is split only once.  ``items()`` works on both Python 2 and 3.
    bigram_index = {}
    for ngram, count in index.items():
        if not count > cutoff:
            continue
        words = tuple(ngram.split())
        if len(words) == 2:
            bigram_index[words] = count

    # Get abstract finder because we already have index
    # (no word frequency distribution is supplied, hence ``None``).
    finder = AbstractCollocationFinder(None, bigram_index)
    # remove collocations made of 2 equal words
    finder.apply_ngram_filter(lambda x, y: x == y)
    # remove weird collocations: both words match _DIGIT_RE
    finder.apply_ngram_filter(lambda x, y: _DIGIT_RE.match(x) and _DIGIT_RE.match(y))
    # remove bigrams containing punctuation, too-short words or stopwords
    finder.apply_word_filter(filter_punkt)
    finder.apply_word_filter(filter_len)
    finder.apply_word_filter(lambda w: w in _STOPWORDS)

    # Bigram counts left in the finder after the filters above
    # (dict-like, per the original author's type hint).
    filtered_collocs = finder.ngram_fd

    # generate possible n-grams from the surviving bigrams and refresh their
    # counts against the full index
    filtered_collocs = _update_ngram_counts(_generate_possible_ngrams(filtered_collocs, index),
                                            index)
    return filtered_collocs