def create_features(self, answer):
    """Build the feature row for one new student answer and store it.

    The answer is run through ``sf.remove_stopwords``, ``sf.stem_sentence``
    and ``sf.order_sentence``; the resulting variants are compared against
    the same variants of ``self.teacher_answer``. The row is appended to
    ``self.new_answers`` and the unigram/bigram/trigram entity-extraction
    helpers are then applied to the whole ``self.new_answers`` table.

    NOTE: this produces more features than the project documentation lists;
    some may have been dropped from the docs during feature selection.

    Args:
        answer: The student's answer text.

    Returns:
        None. The result is stored on ``self.new_answers``.
    """
    # Local import so this block does not depend on a module-level alias.
    import pandas as pd

    # Teacher answer: stopwords removed -> stemmed -> ordered.
    a_stopwords = sf.remove_stopwords(self.teacher_answer)
    a_stemmed = sf.stem_sentence(a_stopwords)
    a_stemmed_ordered = sf.order_sentence(a_stemmed)
    teacher_answers = [a_stemmed, a_stemmed_ordered]

    # Key insertion order determines the column order of the appended row.
    log = {
        'student_answer': answer,
        'teacher_answer': self.teacher_answer,
        'q_answer': answer,
    }
    log['q_stopwords'] = sf.remove_stopwords(answer)
    # Stem the stopword-free text so the student variants go through the
    # same steps as the teacher variants they are compared with.
    log['q_stemmed'] = sf.stem_sentence(log['q_stopwords'])
    log['q_stem_ordered'] = sf.order_sentence(log['q_stemmed'])

    # Might need to defer scaling until just before modeling.
    # NOTE(review): the count is taken on the raw answer; confirm that
    # self.word_scaler was fit on raw-answer counts and not on the
    # stemmed/ordered text.
    log['wordcount'] = sf.scale_column(self.word_scaler, sf.word_count(answer))

    # Similarity on the stemmed sentences.
    log['stem_g_similarity'] = sf.generic_similarity(log['q_stemmed'], a_stemmed)
    log['stem_j_similarity'] = sf.jaccard_similarity(log['q_stemmed'], a_stemmed)
    log['stem_c_similarity'] = sf.cosine_similarity(log['q_stemmed'], a_stemmed)

    # Similarity on the stemmed + ordered sentences.
    log['stem_ordered_g_similarity'] = sf.generic_similarity(
        log['q_stem_ordered'], a_stemmed_ordered)
    log['stem_ordered_j_similarity'] = sf.jaccard_similarity(
        log['q_stem_ordered'], a_stemmed_ordered)
    log['stem_ordered_c_similarity'] = sf.cosine_similarity(
        log['q_stem_ordered'], a_stemmed_ordered)

    # Append the new answer. DataFrame.append was removed in pandas 2.0;
    # concatenating a one-row frame works on old and new versions alike.
    self.new_answers = pd.concat(
        [self.new_answers, pd.DataFrame([log])],
        ignore_index=True,
    )

    # Entity extraction: each sentence column is paired with the teacher
    # answer variant that went through the same preprocessing.
    types_of_sentences = ['q_stemmed', 'q_stem_ordered']
    for sent_type, teach_ans in zip(types_of_sentences, teacher_answers):
        self.new_answers = sf.unigram_entity_extraction(
            self.new_answers, sent_type, sent_type, teach_ans)
        self.new_answers = sf.bigram_entity_extraction(
            self.new_answers, sent_type, sent_type, teach_ans)
        self.new_answers = sf.trigram_entity_extraction(
            self.new_answers, sent_type, sent_type, teach_ans)
def generate_features(file):
    """Read an answers CSV and derive text and similarity features from it.

    The reference answer is taken from row 0, column 1 of the file and the
    student answers from the ``student_answer`` column. Each student answer
    is run through ``stfu.remove_stopwords``, ``stfu.stem_sentence`` and
    ``stfu.order_sentence``, then compared with the same variants of the
    reference answer.

    Args:
        file: Path or buffer accepted by ``pd.read_csv``.

    Returns:
        A 2-tuple of DataFrame slices: the columns up to and including
        ``q_stem_ordered``, and the columns from ``wordcount`` onwards.
    """
    og = pd.read_csv(file)

    # Reference answer (row 0, column 1) and its preprocessed variants.
    answer = og.iloc[0, 1]
    a_stopwords = stfu.remove_stopwords(answer)
    a_stemmed = stfu.stem_sentence(a_stopwords)
    a_stemmed_ordered = stfu.order_sentence(a_stemmed)
    teacher_answers = [a_stemmed, a_stemmed_ordered]

    # NOTE(review): this assigns the FIRST row's student answer to every
    # row; confirm the broadcast is intended rather than a per-row copy.
    og['q_answer'] = og.student_answer.values[0]
    og['q_stopwords'] = og.student_answer.apply(stfu.remove_stopwords)
    og['q_stemmed'] = og.q_stopwords.apply(stfu.stem_sentence)
    og['q_stem_ordered'] = og.q_stemmed.apply(stfu.order_sentence)

    # Word count of the stemmed + ordered text, min-max scaled over this
    # file. The fitted scaler is local and is not returned.
    og['wordcount'] = og.q_stem_ordered.apply(stfu.word_count)
    sc = MinMaxScaler()
    og['wordcount'] = sc.fit_transform(og['wordcount'].values.reshape(-1, 1))

    # Similarity on the stemmed sentences.
    og['stem_g_similarity'] = og.q_stemmed.apply(
        lambda x: stfu.generic_similarity(x, a_stemmed))
    og['stem_j_similarity'] = og.q_stemmed.apply(
        lambda x: stfu.jaccard_similarity(x, a_stemmed))
    og['stem_c_similarity'] = og.q_stemmed.apply(
        lambda x: stfu.cosine_similarity(x, a_stemmed))

    # Similarity on the stemmed + ordered sentences.
    og['stem_ordered_g_similarity'] = og.q_stem_ordered.apply(
        lambda x: stfu.generic_similarity(x, a_stemmed_ordered))
    og['stem_ordered_j_similarity'] = og.q_stem_ordered.apply(
        lambda x: stfu.jaccard_similarity(x, a_stemmed_ordered))
    og['stem_ordered_c_similarity'] = og.q_stem_ordered.apply(
        lambda x: stfu.cosine_similarity(x, a_stemmed_ordered))

    # Entity extraction: each sentence column is paired with the reference
    # answer variant that went through the same preprocessing.
    types_of_sentences = ['q_stemmed', 'q_stem_ordered']
    for sent_type, teach_ans in zip(types_of_sentences, teacher_answers):
        og = stfu.unigram_entity_extraction(og, sent_type, sent_type, teach_ans)
        og = stfu.bigram_entity_extraction(og, sent_type, sent_type, teach_ans)
        og = stfu.trigram_entity_extraction(og, sent_type, sent_type, teach_ans)

    # Label-based slices: text columns first, numeric features second.
    return og.loc[:, :'q_stem_ordered'], og.loc[:, 'wordcount':]