import nltk

# get_page, extract_text, num_syllables, character_count, common_count,
# stopword_count, and the plugins module are defined elsewhere in this module.

def paragraph_features(wiki_title):
    """Compute per-word and per-sentence readability features for the
    Wikipedia article with the given title; return None if the page
    cannot be fetched."""
    features = {}
    page = get_page(wiki_title)
    if not page:
        return None
    text = extract_text(page)
    global text_time
    text_time = text  # saved for time_estimation()
    word_tokens = nltk.word_tokenize(text)
    sent_tokens = nltk.sent_tokenize(text)
    features["ave syllables/word"] = num_syllables(word_tokens) / len(word_tokens)
    features["ave sentence length"] = len(word_tokens) / len(sent_tokens)
    features["ave word length"] = character_count(word_tokens) / len(word_tokens)
    features["percent common words"] = common_count(word_tokens) / len(word_tokens)
    features["percent stop words"] = stopword_count(word_tokens) / len(word_tokens)
    features["hapax legomenon"] = plugins.hapax_find(word_tokens) / len(word_tokens)
    features["acronym count"] = plugins.avg_acronym_count(word_tokens) / len(word_tokens)
    features["percent numbers"] = plugins.number_freq(word_tokens) / len(word_tokens)
    return features
def paragraph_features_page(page):
    """Compute the same readability features as paragraph_features(),
    but directly from an already-fetched page object."""
    features = {}
    text = extract_text(page)
    word_tokens = nltk.word_tokenize(text)
    sent_tokens = nltk.sent_tokenize(text)
    features["ave syllables/word"] = num_syllables(word_tokens) / len(word_tokens)
    features["ave sentence length"] = len(word_tokens) / len(sent_tokens)
    features["ave word length"] = character_count(word_tokens) / len(word_tokens)
    features["percent common words"] = common_count(word_tokens) / len(word_tokens)
    features["percent stop words"] = stopword_count(word_tokens) / len(word_tokens)
    features["hapax legomenon"] = plugins.hapax_find(word_tokens) / len(word_tokens)
    features["acronym count"] = plugins.avg_acronym_count(word_tokens) / len(word_tokens)
    features["percent numbers"] = plugins.number_freq(word_tokens) / len(word_tokens)
    return features
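# Minimal usage sketch (illustrative, not part of the original code): the
# article title "Readability" is a hypothetical example, and this assumes
# get_page() can fetch that page and NLTK's punkt tokenizer data is installed.
if __name__ == "__main__":
    feats = paragraph_features("Readability")
    if feats:
        for name, value in sorted(feats.items()):
            print(f"{name}: {value:.4f}")
    else:
        print("page not found")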