Exemplo n.º 1
0
def get_avg_sentence_length():
    """Return a NumPy array with each essay's average sentence length in words.

    The average is len(word tokens) / len(sentence tokens) per essay.

    Returns:
        np.ndarray of one float per essay (0.0 for essays with no sentences).
    """
    averages = []
    for essay in read_all_essays():
        sentences = sent_tokenize(essay)
        words = word_tokenize(essay)
        # A blank/whitespace-only essay tokenizes to zero sentences; report
        # 0.0 instead of raising ZeroDivisionError.
        averages.append(len(words) / len(sentences) if sentences else 0.0)

    return np.array(averages)
Exemplo n.º 2
0
def spellcheck_essays():
    """Return a NumPy array with the spellcheck-error count of each essay.

    Tokens are filtered to alphabetic words that are neither English stopwords
    nor in EXCLUDED_CLEAN_WORDS before being passed to spellcheck_errors().

    Returns:
        np.ndarray with one error count per essay.
    """
    # Hoist loop invariants: the stopword list never changes between essays,
    # and set membership is O(1) versus O(n) on a list.
    stop_words = set(stopwords.words('english'))
    excluded = set(EXCLUDED_CLEAN_WORDS)

    errors = []
    for essay in read_all_essays():
        words_cleaned = [
            word for word in word_tokenize(essay)
            if word.isalpha() and word not in stop_words and word not in excluded
        ]
        errors.append(spellcheck_errors(words_cleaned))
    return np.array(errors)
Exemplo n.º 3
0
    def train_model(self):
        """Build the before-word frequency tables from all essays and books.

        Window layout: "before_2" "before_1" missing_word "after_1" "after_2",
        where "before_1" is the word immediately preceding the predicted word
        and "before_2" the one before that.
        """
        self.before_words_freq = {"before_1": {}, "before_2": {}}

        # Essays first, then books — both go through the same
        # tokenize-then-count step.
        for corpus in (read_all_essays(), read_books()):
            for text in corpus:
                tokens = self.tokenize_text(text)
                self.before_word_frequency(tokens)
Exemplo n.º 4
0
def clean_data():
    """Return the essays as cleaned strings, one per essay.

    Each essay is reduced to its stemmed, alphabetic, non-stopword,
    non-excluded tokens, joined back into a single space-separated string.

    Returns:
        list[str] with one cleaned essay per input essay.
    """
    stemmer = PorterStemmer()
    # Hoist loop invariants: fetch the stopword list once and use sets for
    # O(1) membership tests instead of O(n) list scans per token.
    stop_words = set(stopwords.words('english'))
    excluded = set(EXCLUDED_CLEAN_WORDS)

    cleaned_essays = []
    for essay in read_all_essays():
        words_cleaned = [
            stemmer.stem(word) for word in word_tokenize(essay)
            if word.isalpha() and word not in stop_words and word not in excluded
        ]
        cleaned_essays.append(" ".join(words_cleaned))
    return cleaned_essays
def count_word_occurences():
    """Count stem occurrences across all essays.

    Tokens are kept if alphabetic and not an English stopword, then stemmed;
    the result maps each stem to its total frequency over the whole corpus.

    Returns:
        dict[str, int] mapping stem -> occurrence count.
    """
    corpus = {}
    # lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    # Loop invariant hoisted: build the stopword set once, O(1) lookups.
    stop_words = set(stopwords.words('english'))
    for essay in read_all_essays():
        for word in word_tokenize(essay):
            if word.isalpha() and word not in stop_words:
                stem = stemmer.stem(word)
                # dict.get replaces the `not in corpus.keys()` double lookup.
                corpus[stem] = corpus.get(stem, 0) + 1
    return corpus
Exemplo n.º 6
0
def preprocess_structure_data():
    """Build a DataFrame of per-essay structural features.

    For every essay: noun/verb/adjective/adverb POS-tag counts and the comma
    count, each normalised by the essay's sentence count.

    Returns:
        pd.DataFrame with one row per essay and the feature columns below.
    """
    noun_counts = []
    verb_counts = []
    adjective_counts = []
    adverb_counts = []
    comma_counts = []
    period_counts = []  # computed but its column is currently disabled below

    for essay in read_all_essays():
        essay_tokenized = word_tokenize(essay)
        comma_cnt = essay_tokenized.count(",")
        period_cnt = essay_tokenized.count(".")
        sentence_cnt = len(sent_tokenize(essay))
        pos_tags = nltk.pos_tag(essay_tokenized)

        # Per-essay counters. The original used running counters reset at the
        # bottom of the loop, which silently breaks if a new category is added
        # without also resetting it; fresh locals per iteration avoid that.
        noun_cnt = verb_cnt = adjective_cnt = adverb_cnt = 0
        for _word, tag in pos_tags:
            if tag in NOUNS:
                noun_cnt += 1
            elif tag in ADVERBS:
                adverb_cnt += 1
            elif tag in ADJECTIVES:
                adjective_cnt += 1
            elif tag in VERBS:
                verb_cnt += 1

        noun_counts.append(noun_cnt / sentence_cnt)
        verb_counts.append(verb_cnt / sentence_cnt)
        adjective_counts.append(adjective_cnt / sentence_cnt)
        adverb_counts.append(adverb_cnt / sentence_cnt)
        period_counts.append(period_cnt / len(essay_tokenized))
        comma_counts.append(comma_cnt / sentence_cnt)

    df = pd.DataFrame()
    df["noun_count_avg"] = np.array(noun_counts)
    df["verb_count_avg"] = np.array(verb_counts)
    df["adjective_count_avg"] = np.array(adjective_counts)
    df["adverb_count_avg"] = np.array(adverb_counts)
    df["avg_comma_count_per_sentence"] = np.array(comma_counts)
    # df["avg_period_count"] = np.array(period_counts)
    #df["sentence_count_avg"] = get_sentence_count()
    #df["spelling_errors"] = spellcheck_essays()

    return df
def words_count():
    """Return a list with the word-token count of each essay."""
    return [len(word_tokenize(essay)) for essay in read_all_essays()]
def sentences_count():
    """Return a list with the sentence count of each essay."""
    return [len(sent_tokenize(essay)) for essay in read_all_essays()]