Пример #1
0
    def create_features(self, answer):
        """Build the feature row for a new student answer and store it.

        The teacher answer (``self.teacher_answer``) and the student
        ``answer`` are passed through the same ``sf`` helpers
        (``remove_stopwords`` -> ``stem_sentence`` -> ``order_sentence``) and
        compared with the generic, Jaccard and cosine similarity helpers.
        The resulting row is appended to ``self.new_answers`` and the
        unigram/bigram/trigram entity-extraction helpers are then applied to
        the whole frame.

        NOTE: this produces more features than the project documentation
        lists; some may have been dropped during feature selection.

        Args:
            answer: The raw student answer text.

        Returns:
            None. ``self.new_answers`` is rebound to the updated frame.
        """
        # Local import: keeps this method self-contained regardless of which
        # aliases the enclosing module already imports.
        import pandas as pd

        # Teacher answer, pre-processed once.
        a_stopwords = sf.remove_stopwords(self.teacher_answer)
        a_stemmed = sf.stem_sentence(a_stopwords)
        a_stemmed_ordered = sf.order_sentence(a_stemmed)
        teacher_answers = [
            a_stemmed,
            a_stemmed_ordered,
        ]

        # Student answer in its successive pre-processed versions.
        log = dict()
        log['student_answer'] = answer
        log['teacher_answer'] = self.teacher_answer
        log['q_answer'] = answer
        log['q_stopwords'] = sf.remove_stopwords(answer)
        # Stem the stop-word-free text so the student answer goes through the
        # same pipeline as the teacher answer it is compared against.
        log['q_stemmed'] = sf.stem_sentence(log['q_stopwords'])
        log['q_stem_ordered'] = sf.order_sentence(log['q_stemmed'])

        # Scaled word count. Scaling might need to be deferred until just
        # before modeling.
        # NOTE(review): the count is taken on the raw answer -- confirm that
        # self.word_scaler was fitted on raw-answer counts as well.
        log['wordcount'] = sf.scale_column(
            self.word_scaler, sf.word_count(answer))

        # Similarity of the stemmed sentences.
        log['stem_g_similarity'] = sf.generic_similarity(
            log['q_stemmed'], a_stemmed)
        log['stem_j_similarity'] = sf.jaccard_similarity(
            log['q_stemmed'], a_stemmed)
        log['stem_c_similarity'] = sf.cosine_similarity(
            log['q_stemmed'], a_stemmed)

        # Similarity of the stemmed + ordered sentences.
        log['stem_ordered_g_similarity'] = sf.generic_similarity(
            log['q_stem_ordered'], a_stemmed_ordered)
        log['stem_ordered_j_similarity'] = sf.jaccard_similarity(
            log['q_stem_ordered'], a_stemmed_ordered)
        log['stem_ordered_c_similarity'] = sf.cosine_similarity(
            log['q_stem_ordered'], a_stemmed_ordered)

        # Append the new row. DataFrame.append was removed in pandas 2.0;
        # pd.concat works on both old and new pandas versions.
        self.new_answers = pd.concat(
            [self.new_answers, pd.DataFrame([log])],
            ignore_index=True,
        )

        # Entity extraction: each sentence column is paired with the teacher
        # answer that went through the same pre-processing.
        types_of_sentences = [
            'q_stemmed',
            'q_stem_ordered',
        ]
        extractors = (
            sf.unigram_entity_extraction,
            sf.bigram_entity_extraction,
            sf.trigram_entity_extraction,
        )

        for sent_type, teach_ans in zip(types_of_sentences, teacher_answers):
            for extract in extractors:
                self.new_answers = extract(
                    self.new_answers, sent_type, sent_type, teach_ans)
Пример #2
0
def generate_features(file):
    """Read a CSV of student answers and derive text-similarity features.

    The reference ("teacher") answer is read from the first row, second
    column of the CSV. It and every ``student_answer`` are passed through the
    ``stfu`` helpers (``remove_stopwords`` -> ``stem_sentence`` ->
    ``order_sentence``), compared with the generic, Jaccard and cosine
    similarity helpers, and finally run through the unigram/bigram/trigram
    entity-extraction helpers.

    Args:
        file: Anything ``pd.read_csv`` accepts. The data must contain a
            ``student_answer`` column and at least one row.

    Returns:
        A 2-tuple of DataFrames sliced by column label: the columns up to and
        including ``q_stem_ordered``, and the columns from ``wordcount``
        onward.
    """
    og = pd.read_csv(file)

    # Reference answer and its pre-processed versions.
    answer = og.iloc[0, 1]
    a_stemmed = stfu.stem_sentence(stfu.remove_stopwords(answer))
    a_stemmed_ordered = stfu.order_sentence(a_stemmed)
    teacher_answers = [
        a_stemmed,
        a_stemmed_ordered,
    ]

    # Column insertion order matters: the label slices in the return
    # statement rely on 'q_stem_ordered' directly preceding 'wordcount'.
    # Keep each row's own answer (a scalar here would broadcast the first
    # student's answer to every row).
    og['q_answer'] = og['student_answer']
    og['q_stopwords'] = og['student_answer'].apply(stfu.remove_stopwords)
    og['q_stemmed'] = og['q_stopwords'].apply(stfu.stem_sentence)
    og['q_stem_ordered'] = og['q_stemmed'].apply(stfu.order_sentence)

    # Word count, min-max scaled with a scaler fitted on this data.
    og['wordcount'] = og['q_stem_ordered'].apply(stfu.word_count)
    sc = MinMaxScaler()
    og['wordcount'] = sc.fit_transform(og['wordcount'].values.reshape(-1, 1))

    # Similarity of the stemmed sentences.
    og['stem_g_similarity'] = og['q_stemmed'].apply(
        lambda x: stfu.generic_similarity(x, a_stemmed))
    og['stem_j_similarity'] = og['q_stemmed'].apply(
        lambda x: stfu.jaccard_similarity(x, a_stemmed))
    og['stem_c_similarity'] = og['q_stemmed'].apply(
        lambda x: stfu.cosine_similarity(x, a_stemmed))

    # Similarity of the stemmed + ordered sentences.
    og['stem_ordered_g_similarity'] = og['q_stem_ordered'].apply(
        lambda x: stfu.generic_similarity(x, a_stemmed_ordered))
    og['stem_ordered_j_similarity'] = og['q_stem_ordered'].apply(
        lambda x: stfu.jaccard_similarity(x, a_stemmed_ordered))
    og['stem_ordered_c_similarity'] = og['q_stem_ordered'].apply(
        lambda x: stfu.cosine_similarity(x, a_stemmed_ordered))

    # Entity extraction: each sentence column is paired with the reference
    # answer that went through the same pre-processing.
    types_of_sentences = [
        'q_stemmed',
        'q_stem_ordered',
    ]
    extractors = (
        stfu.unigram_entity_extraction,
        stfu.bigram_entity_extraction,
        stfu.trigram_entity_extraction,
    )

    for sent_type, teach_ans in zip(types_of_sentences, teacher_answers):
        for extract in extractors:
            og = extract(og, sent_type, sent_type, teach_ans)

    return og.loc[:, :'q_stem_ordered'], og.loc[:, 'wordcount':]