import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

# NLTK data needed by the tokenizers, stopword list, and POS tagger:
#   nltk.download("punkt"), nltk.download("stopwords"),
#   nltk.download("averaged_perceptron_tagger")
# read_all_essays, read_books, spellcheck_errors, EXCLUDED_CLEAN_WORDS, and
# the POS-tag sets (NOUNS, VERBS, ADJECTIVES, ADVERBS) are assumed to come
# from the project's own modules.


def get_avg_sentence_length():
    """Return the average sentence length (words per sentence) of each essay."""
    counts = []
    for essay in read_all_essays():
        sentences = sent_tokenize(essay)
        words = word_tokenize(essay)
        counts.append(len(words) / len(sentences))
    return np.array(counts)


def spellcheck_essays():
    """Return the number of spelling errors found in each essay."""
    errors = []
    # Build the stopword set once instead of per essay.
    stop_words = set(stopwords.words("english"))
    for essay in read_all_essays():
        words = word_tokenize(essay)
        words_cleaned = [
            word for word in words
            if word.isalpha()
            and word not in stop_words
            and word not in EXCLUDED_CLEAN_WORDS
        ]
        errors.append(spellcheck_errors(words_cleaned))
    return np.array(errors)


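# `spellcheck_errors` is defined elsewhere in the project. The sketch below is
# a hypothetical stand-in (kept commented out so it does not shadow the real
# helper) showing one way it could count misspellings, here with the
# third-party `pyspellchecker` package -- an assumption, not necessarily what
# this project actually uses.
#
# from spellchecker import SpellChecker
#
# def spellcheck_errors(words):
#     """Count words the spellchecker does not recognize."""
#     spell = SpellChecker()
#     return len(spell.unknown(words))

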
def train_model(self):
    # Method of the model class; tokenize_text and before_word_frequency are
    # defined on the same class elsewhere in the project.
    # "before_1" is the word immediately before the word being predicted,
    # "before_2" the word before that:
    #   "before_2" "before_1" missing_word "after_1" "after_2"
    self.before_words_freq = {"before_1": {}, "before_2": {}}
    for essay in read_all_essays():
        essay_tokenized = self.tokenize_text(essay)
        self.before_word_frequency(essay_tokenized)
    for book in read_books():
        book_tokenized = self.tokenize_text(book)
        self.before_word_frequency(book_tokenized)


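# The sketch below is an assumption about what `before_word_frequency` does
# (kept commented out so it does not shadow the real method): it tallies, for
# each token, how often each word appears one and two positions before it,
# matching the window described in the comment above.
#
# def before_word_frequency(self, tokens):
#     """Tally how often each word appears 1 and 2 positions before a token."""
#     for i, word in enumerate(tokens):
#         if i >= 1:
#             freq = self.before_words_freq["before_1"].setdefault(word, {})
#             freq[tokens[i - 1]] = freq.get(tokens[i - 1], 0) + 1
#         if i >= 2:
#             freq = self.before_words_freq["before_2"].setdefault(word, {})
#             freq[tokens[i - 2]] = freq.get(tokens[i - 2], 0) + 1

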
def clean_data():
    """Stem each essay and strip stopwords and non-alphabetic tokens."""
    cleaned_essays = []
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words("english"))
    for essay in read_all_essays():
        words = word_tokenize(essay)
        words_cleaned = [
            stemmer.stem(word) for word in words
            if word.isalpha()
            and word not in stop_words
            and word not in EXCLUDED_CLEAN_WORDS
        ]
        cleaned_essays.append(" ".join(words_cleaned))
    return cleaned_essays


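# Hedged usage sketch (not from the original code): clean_data() returns
# space-joined, stemmed strings, which is the shape scikit-learn's text
# vectorizers expect. scikit-learn is an assumption of this sketch only.
def tfidf_features_example():
    from sklearn.feature_extraction.text import TfidfVectorizer

    vectorizer = TfidfVectorizer()
    # One row per essay, one column per stemmed term.
    return vectorizer.fit_transform(clean_data())

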
def count_word_occurences():
    """Build a corpus-wide frequency table of stemmed words."""
    corpus = {}
    # lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words("english"))
    for essay in read_all_essays():
        words = word_tokenize(essay)
        words_cleaned = [
            stemmer.stem(word) for word in words
            if word.isalpha() and word not in stop_words
        ]
        for word in words_cleaned:
            corpus[word] = corpus.get(word, 0) + 1
    return corpus


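# Equivalent sketch delegating the tallying to the standard library; the
# function name is illustrative, not from the original code. It returns a
# collections.Counter, which is a dict subclass, so callers can treat the
# result the same way as count_word_occurences().
def count_word_occurences_counter():
    from collections import Counter

    stemmer = PorterStemmer()
    stop_words = set(stopwords.words("english"))
    corpus = Counter()
    for essay in read_all_essays():
        corpus.update(
            stemmer.stem(word)
            for word in word_tokenize(essay)
            if word.isalpha() and word not in stop_words
        )
    return corpus

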
def preprocess_structure_data():
    """Build per-essay structural features (POS and punctuation rates)."""
    df = pd.DataFrame()
    noun_counts = []
    verb_counts = []
    adjective_counts = []
    adverb_counts = []
    comma_counts = []
    period_counts = []
    for essay in read_all_essays():
        # Reset the POS counters for each essay.
        noun_cnt = verb_cnt = adjective_cnt = adverb_cnt = 0
        essay_tokenized = word_tokenize(essay)
        comma_cnt = essay_tokenized.count(",")
        period_cnt = essay_tokenized.count(".")
        sentence_cnt = len(sent_tokenize(essay))
        pos_tags = nltk.pos_tag(essay_tokenized)
        for word, tag in pos_tags:
            if tag in NOUNS:
                noun_cnt += 1
            elif tag in ADVERBS:
                adverb_cnt += 1
            elif tag in ADJECTIVES:
                adjective_cnt += 1
            elif tag in VERBS:
                verb_cnt += 1
        # POS and comma counts are normalized per sentence, period counts
        # per token.
        noun_counts.append(noun_cnt / sentence_cnt)
        verb_counts.append(verb_cnt / sentence_cnt)
        adjective_counts.append(adjective_cnt / sentence_cnt)
        adverb_counts.append(adverb_cnt / sentence_cnt)
        period_counts.append(period_cnt / len(essay_tokenized))
        comma_counts.append(comma_cnt / sentence_cnt)
    df["noun_count_avg"] = np.array(noun_counts)
    df["verb_count_avg"] = np.array(verb_counts)
    df["adjective_count_avg"] = np.array(adjective_counts)
    df["adverb_count_avg"] = np.array(adverb_counts)
    df["avg_comma_count_per_sentence"] = np.array(comma_counts)
    # df["avg_period_count"] = np.array(period_counts)
    # df["sentence_count_avg"] = get_sentence_count()
    # df["spelling_errors"] = spellcheck_essays()
    return df


def words_count():
    """Return the total word count of each essay."""
    counts = []
    for essay in read_all_essays():
        counts.append(len(word_tokenize(essay)))
    return counts


def sentences_count():
    """Return the number of sentences in each essay."""
    counts = []
    for essay in read_all_essays():
        counts.append(len(sent_tokenize(essay)))
    return counts


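# Hedged sketch of how these extractors could be combined: every function
# above iterates read_all_essays() in the same order, so their outputs line
# up row-for-row. The function and column names here are illustrative, not
# taken from the original code.
def build_feature_frame():
    df = preprocess_structure_data()
    df["avg_sentence_length"] = get_avg_sentence_length()
    df["word_count"] = np.array(words_count())
    df["sentence_count"] = np.array(sentences_count())
    df["spelling_errors"] = spellcheck_essays()
    return df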