Пример #1
0
def paragraph_features(wiki_title):
    """Compute text features for the wiki page named ``wiki_title``.

    The page is fetched with ``get_page`` and reduced to text with
    ``extract_text``. The text is tokenized into words and sentences with
    NLTK, and each feature is a count divided by the number of word tokens
    (or, for sentence length, word tokens divided by sentence tokens).

    Side effect: the extracted text is stored in the module-level global
    ``text_time`` (per the original comment, used by ``time_estimation()``).

    Args:
        wiki_title: title handed to ``get_page``.

    Returns:
        A dict mapping feature name to value, or ``None`` when the page
        cannot be fetched or its text yields no word or sentence tokens
        (the ratios are undefined in that case).
    """
    global text_time

    page = get_page(wiki_title)
    if not page:
        return None
    text = extract_text(page)

    text_time = text  # used for time_estimation()

    word_tokens = nltk.word_tokenize(text)
    sent_tokens = nltk.sent_tokenize(text)

    # Every feature below divides by a token count. Empty text would raise
    # ZeroDivisionError, so report "no features" the same way a missing
    # page is reported.
    if not word_tokens or not sent_tokens:
        return None

    word_count = len(word_tokens)
    sent_count = len(sent_tokens)

    return {
        "ave syllables/word": num_syllables(word_tokens) / word_count,
        "ave sentence length": word_count / sent_count,
        "ave word length": character_count(word_tokens) / word_count,
        "percent common words": common_count(word_tokens) / word_count,
        "percent stop words": stopword_count(word_tokens) / word_count,
        "hapax legomenon": plugins.hapax_find(word_tokens) / word_count,
        "acronym count": plugins.avg_acronym_count(word_tokens) / word_count,
        "percent numbers": plugins.number_freq(word_tokens) / word_count,
    }
Пример #2
0
def paragraph_features_page(page):
    """Compute text features for an already-fetched ``page``.

    The page is reduced to text with ``extract_text``. The text is
    tokenized into words and sentences with NLTK, and each feature is a
    count divided by the number of word tokens (or, for sentence length,
    word tokens divided by sentence tokens).

    Args:
        page: page object accepted by ``extract_text``.

    Returns:
        A dict mapping feature name to value, or ``None`` when the text
        yields no word or sentence tokens (the ratios are undefined in
        that case).
    """
    text = extract_text(page)
    word_tokens = nltk.word_tokenize(text)
    sent_tokens = nltk.sent_tokenize(text)

    # Every feature below divides by a token count. Empty text would raise
    # ZeroDivisionError, so signal "no features" with None instead.
    if not word_tokens or not sent_tokens:
        return None

    word_count = len(word_tokens)
    sent_count = len(sent_tokens)

    return {
        "ave syllables/word": num_syllables(word_tokens) / word_count,
        "ave sentence length": word_count / sent_count,
        "ave word length": character_count(word_tokens) / word_count,
        "percent common words": common_count(word_tokens) / word_count,
        "percent stop words": stopword_count(word_tokens) / word_count,
        "hapax legomenon": plugins.hapax_find(word_tokens) / word_count,
        "acronym count": plugins.avg_acronym_count(word_tokens) / word_count,
        "percent numbers": plugins.number_freq(word_tokens) / word_count,
    }