def VPNN(tag_sen):
    """Replace each 'VPNN'-tagged word of a sentence with a random synonym.

    Args:
        tag_sen: Indexable pair ``(sentence, tags)`` where ``sentence`` is a
            string and ``tags['VPNN']`` is the iterable of words to
            substitute.

    Returns:
        The sentence with every listed word that has at least one synonym
        replaced by one synonym picked at random. Words for which
        ``wordnet.get_synonyms`` returns nothing are left unchanged.
    """
    new_sen = tag_sen[0]
    words = tag_sen[1]['VPNN']
    for word in words:
        # Look the synonyms up once and reuse the result for both the
        # emptiness check and the random pick.
        synonyms = wordnet.get_synonyms(word)
        if not synonyms:
            continue
        new_word = random.sample(synonyms, 1)[0]
        # NOTE(review): str.replace substitutes every occurrence of `word`,
        # including occurrences inside longer words -- confirm this is the
        # intended behaviour.
        new_sen = new_sen.replace(word, new_word)
    return new_sen
示例#2
0
def make_thesaurus(file_path):
    """
    Build a thesaurus of word counts from the corpus file at 'file_path'.

    Returns dict of counters 'thesaurus', where
    thesaurus[word] = { synonym1: 4, syn2: 8, syn3: 1, ... }

    Every accepted corpus word is counted under its own entry and under the
    entry of each synonym returned by wn.get_synonyms(word). Noun synonyms
    are pluralized when the corpus word equals its own plural form; verb
    synonyms are conjugated to the first tense reported by tenses(word).

    After the corpus pass, the thesaurus is passed through _add_mappings
    together with a mapping-file path derived from 'file_path' (corpus
    folder/tag swapped for the mapping folder/tag from config); the value
    returned by _add_mappings is what this function returns.
    """
    # Passing the Counter class itself (instead of a lambda wrapping it)
    # keeps the defaultdict picklable.
    thesaurus = defaultdict(Counter)

    # Compiled once; raw string so that "\s" is not treated as a (invalid)
    # string escape.
    blank = re.compile(r"^[\s]*$")

    with open(file_path, "r") as f:
        for line in f:

            # Ignore repeated book title headers
            if _is_title(line):
                continue

            parsed = parse(line)

            for tagged_word in parsed.split()[0]:
                word = tagged_word[0].strip().lower()
                pos = tagged_word[1][0]  # first letter of the POS tag

                # Reject non-ASCII characters. Byte strings are decoded (as
                # before); text strings, which have no .decode on Python 3,
                # are validated by attempting an ASCII encode instead.
                try:
                    if isinstance(word, bytes):
                        word = word.decode("ascii")
                    else:
                        word.encode("ascii")
                except (UnicodeDecodeError, UnicodeEncodeError):
                    continue

                # Reject empty / whitespace-only tokens
                if blank.match(word):
                    continue

                # Increment word count of word w
                thesaurus[word][word] += 1

                # The inflection applied to synonyms depends only on the
                # corpus word, so work it out once rather than per synonym.
                word_is_plural = False
                has_tense = False
                verb_tense = None
                if pos == "N":
                    word_is_plural = (word == pluralize(word))
                elif pos == "V":
                    word_tenses = tenses(word)
                    if word_tenses:
                        has_tense = True
                        verb_tense = word_tenses[0][0]

                # Retrieve syn = synonym[w], add to thesaurus[syn]
                for syn in wn.get_synonyms(word):
                    syn = syn.name().split(".")[0]

                    if word_is_plural:
                        # plural noun: file the count under the plural synonym
                        key = pluralize(syn)
                    elif has_tense:
                        # verb: conjugate the synonym to the word's tense
                        key = conjugate(syn, tense=verb_tense)
                    else:
                        key = syn
                    thesaurus[key][word] += 1

    # Update thesaurus with mappings, if map_file exists
    file_path = file_path.replace(config.CORPUS_FOLDER, config.MAPPING_FOLDER)
    map_file = file_path.replace(config.CORP_TAG, config.MAP_TAG)
    thesaurus = _add_mappings(map_file, thesaurus)

    return thesaurus
示例#3
0
    def CalCosDist(self, ans_sentencelist, std_sen):
        """Find the student sentence most similar to a standard sentence.

        Similarity is the cosine between the standard term-frequency vector
        (std_sen['KeySVec']) and the student vector (stu_sen['StuSVec']),
        accumulated over the terms of the standard sentence only. When a
        standard term has zero frequency in the student sentence and term
        expansion is enabled, the term returned by find_most_freq_term among
        its synonyms (or, failing that, its ancestor terms) is used instead,
        with its student frequency multiplied by a scale factor.

        Side effects:
            - stu_sen['ExpTerms'] is set on every scored student sentence
              when term expansion is enabled.
            - match_sen['Selected'] is set to True when
              self.only_match_sentence_once is enabled and a match was found.

        Args:
            ans_sentencelist: iterable of student sentence dicts (keys used
                here: 'StuSVec', 'No', 'Selected').
            std_sen: standard sentence dict (key used here: 'KeySVec').

        Returns:
            Tuple (max_cos, match_sen); match_sen is None when no sentence
            scored above zero.
        """
        debug_print("Answer.CalCosDist%s" % str((ans_sentencelist, std_sen)),
                    level=6)
        match_sen = None
        max_cos = 0
        apply_term_expansion = (self.apply_synonym_expansion
                                or self.apply_ancestor_expansion)
        std_vec = std_sen['KeySVec']
        all_std_words = std_vec.keys()
        for stu_sen in ans_sentencelist:
            # Make sure student sentence not already matched
            # TODO: Rework the already-matched check to be in terms of words not sentences (e.g., in case student just gives one long sentence).
            # Note: uses "in" rather than dict.has_key, which no longer exists in Python 3.
            if self.only_match_sentence_once and 'Selected' in stu_sen:
                debug_print("Ignoring already matched sentence %s" %
                            stu_sen['No'])
                continue
            stu_vec = stu_sen['StuSVec']
            # Compute measure for current sentence
            q, s, qs = 0, 0, 0
            exp_terms = []
            for word in all_std_words:
                # If a standard word doesn't occur in the student sentence, then apply term expansion
                # by checking for most frequent synonym and/or ancestor term that does occur.
                # Note: Ancestor terms might be too general, so not checked if synonym found.
                # Also, expansions omit standard terms to avoid counting evidence twice.
                # TODO: Scale ancestor weight by degree of generality.
                std_freq = std_vec[word]
                stu_freq = stu_vec.get(word, 0)
                std_word = word
                if (stu_freq == 0) and apply_term_expansion:
                    stu_word = std_word
                    scale_factor = 1.0
                    # Check synonyms (e.g., attorney for lawyer), excluding words in standard
                    if self.apply_synonym_expansion:
                        debug_print(
                            "Checking for synonym of standard term '%s' among student terms"
                            % std_word,
                            level=5)
                        synonyms = list_difference(
                            wordnet.get_synonyms(std_word), all_std_words)
                        exp_word = find_most_freq_term(synonyms, stu_vec)
                        if exp_word and (exp_word != stu_word):
                            # Note: Uses frequency from student vector for synonym term
                            stu_word = exp_word
                            scale_factor = self.synonym_scale_factor
                            debug_print(
                                "Using (student) synonym '%s' to match (standard) word '%s'"
                                % (exp_word, std_word),
                                level=4)
                    # Check ancestors (e.g., professional for lawyer), excluding words in standard
                    if self.apply_ancestor_expansion and (stu_word == std_word):
                        debug_print(
                            "Checking for ancestor of standard term '%s' among student terms"
                            % std_word,
                            level=5)
                        ancestors = list_difference(
                            wordnet.get_hypernym_terms(
                                std_word, self.max_ancestor_links),
                            all_std_words)
                        exp_word = find_most_freq_term(ancestors, stu_vec)
                        if exp_word and (exp_word != stu_word):
                            # As before, uses frequency from student vector for expansion term.
                            # Prefer a dedicated ancestor weight when the instance defines one;
                            # otherwise keep the previous behaviour of reusing the synonym weight.
                            stu_word = exp_word
                            scale_factor = getattr(self, 'ancestor_scale_factor',
                                                   self.synonym_scale_factor)
                            debug_print(
                                "Using (student) ancestor term '%s' to match (standard) word '%s'"
                                % (exp_word, std_word),
                                level=4)
                    # Update frequency and make note of expansion for posthoc diagnosis
                    if stu_word != std_word:
                        stu_freq = stu_vec[stu_word] * scale_factor
                        debug_print("Scaled frequency score from %f to %f" %
                                    (stu_vec[stu_word], stu_freq),
                                    level=7)
                        exp_terms.append(std_word + "->" + stu_word)
                # Do component-wise update
                debug_print("deltas: q=%f s=%f qs=%f" %
                            (std_freq * std_freq, stu_freq * stu_freq,
                             std_freq * stu_freq),
                            level=6)
                q += std_freq * std_freq
                s += stu_freq * stu_freq
                qs += std_freq * stu_freq
                debug_print("q=%f s=%f qs=%f" % (q, s, qs), level=7)
            # A zero-length vector on either side has no defined cosine; score it 0
            if q == 0 or s == 0:
                qs_cos = 0
            else:
                qs_cos = qs / (math.sqrt(q * s))
            if apply_term_expansion:
                stu_sen['ExpTerms'] = exp_terms

            # Update max score; sentences without any positive-frequency term never match
            has_terms = any(stu_vec[term] > 0 for term in stu_vec)
            if qs_cos > max_cos and has_terms:
                max_cos = qs_cos
                match_sen = stu_sen
        if self.only_match_sentence_once and match_sen:
            match_sen['Selected'] = True
        debug_print("Answer.CalCosDist(%s,_) => %s" %
                    (str(ans_sentencelist), str((max_cos, match_sen))),
                    level=6)
        return max_cos, match_sen
示例#4
0
 # Excerpt of a quiz-generation loop: for each candidate
 # (document, sentence, gap word) tuple, rank vocabulary words by distance to
 # the gap word and filter out unsuitable distractor candidates.
 # NOTE(review): the excerpt is truncated -- the increment of quiz_count and
 # the code that fills `distractors` / `sim_gd` are not visible here.
 quiz_count = 0
 for (doc_idx, sent_idx, gap_idx, sim_ds,
      sim_sg) in quiz_doc_sent_gap_tuple:
     # Stop once the requested number of quizzes has been reached
     if quiz_count == args.num_quiz:
         break
     # Euclidean distance from every row of wword to the gap word's row.
     # Despite the name `sim`, smaller values mean closer words.
     vgap = wword[gap_idx]
     sim = (wword - vgap)**2
     sim = np.sum(sim, axis=1)
     sim = np.sqrt(sim)
     # POS-tag the raw sentence and take the tag of the first token that
     # matches the gap word case-insensitively (IndexError if none matches)
     pos_list = nltk.pos_tag(
         word_tokenize(sentwordrawdic[index2sent[sent_idx]]))
     gap_pos = [
         pos for (word, pos) in pos_list
         if word.lower() == index2word[gap_idx]
     ][0]
     # Lexical relatives of the gap word; how they are used is not visible
     # in this excerpt
     synonyms = wn.get_synonyms(index2word[gap_idx])
     same_lexname_words = wn.get_same_lexname_words(index2word[gap_idx])
     distractors = []
     sim_gd = []
     # Walk candidate words from nearest to farthest (argsort is ascending)
     for i in sim.argsort():
         # Skip words whose distance is undefined
         if np.isnan(sim[i]):
             continue
         # Skip the padding and end-of-sentence marker tokens
         if index2word[i] == '#PAD_WORD#' or index2word[i] == '#EOS#':
             continue
         # Skip the gap word itself
         if gap_idx == i:
             continue
         # Skip hand-picked stopwords
         if index2word[i].lower() in hand_stopwords:
             continue
         # Exclude the words that belong to the same sentence as the gap
         if index2word[i] in sentworddic[index2sent[sent_idx]]:
             continue
示例#5
0
    def CalCosDist(self, ans_sentencelist, std_sen):
        """Find the student sentence most similar to a standard sentence.

        Similarity is the cosine between the standard term-frequency vector
        (std_sen['KeySVec']) and the student frequency vector, accumulated
        over the terms of the standard sentence only. When a standard term
        has zero frequency in the student sentence and term expansion is
        enabled, the term returned by find_most_freq_term among its synonyms
        (or, failing that, its ancestor terms) is used instead, with its
        student frequency multiplied by the corresponding scale factor.

        When self.only_match_word_tokens_once is enabled, frequencies are
        read from a per-sentence working copy ('StuSVecTemp') rather than
        from 'StuSVec', and the words that produced the best match are zeroed
        in that copy so they are not counted again by later calls.

        Side effects:
            - stu_sen['StuSVecTemp'] is created on first sight of a sentence
              (word-token mode only).
            - stu_sen['ExpTerms'] is set on every scored student sentence
              when term expansion is enabled.
            - match_sen['Selected'] is set to True when
              self.only_match_sentence_once is enabled and a match was found.

        Args:
            ans_sentencelist: iterable of student sentence dicts (keys used
                here: 'StuSVec', 'StuSVecTemp', 'No', 'Selected').
            std_sen: standard sentence dict (key used here: 'KeySVec').

        Returns:
            Tuple (max_cos, match_sen, best_matching_stu_words); match_sen is
            None and the word list empty when no sentence scored above zero.
        """
        debug_print("Answer.CalCosDist%s" % str((ans_sentencelist, std_sen)), level=6)
        match_sen = None
        max_cos = 0
        best_matching_stu_words = []
        apply_term_expansion = (self.apply_synonym_expansion or self.apply_ancestor_expansion)
        std_vec = std_sen['KeySVec']
        all_std_words = std_vec.keys()
        # Setup the hash key to use for looking up student frequencies
        stu_freq_master_key = 'StuSVec'
        stu_freq_lookup_key = 'StuSVecTemp' if self.only_match_word_tokens_once else stu_freq_master_key

        for stu_sen in ans_sentencelist:
            # Create bookkeeping hash when sentence encountered first time during single-word-token matching
            # Note: new temp hash used (e.g., 'StuSVecTemp'), which shadows the input version during calculations.
            # Note: uses "in" rather than dict.has_key, which no longer exists in Python 3.
            if self.only_match_word_tokens_once and (stu_freq_lookup_key not in stu_sen):
                # Shallow copy, so zeroing matched words never touches the master vector
                stu_sen[stu_freq_lookup_key] = dict(stu_sen[stu_freq_master_key])
            stu_vec = stu_sen[stu_freq_lookup_key]
            debug_print("stu_sen[stu_freq_lookup_key] (len=%d): %s" % (len(stu_vec), stu_vec), 6)
            # Sanity check: the working copy must cover the same terms as the master vector
            assert(len(stu_vec) == len(stu_sen[stu_freq_master_key]))
            # Make sure student sentence not already matched
            # TODO: Rework the already-matched check to be in terms of words not sentences (e.g., in case student just gives one long sentence).
            if self.only_match_sentence_once and 'Selected' in stu_sen:
                debug_print("Ignoring already matched sentence %s" % stu_sen['No'], 4)
                continue
            # Compute measure for current sentence
            q, s, qs = 0, 0, 0
            exp_terms = []
            matching_stu_words = []
            for word in all_std_words:
                # If a standard word doesn't occur in the student sentence, then apply term expansion
                # by checking for most frequent synonym and/or ancestor term that does occur.
                # Note: Ancestor terms might be too general, so not checked if synonym found.
                # Also, expansions omit standard terms to avoid counting evidence twice.
                # TODO: Scale ancestor weight by degree of generality.
                std_freq = std_vec[word]
                stu_freq = stu_vec.get(word, 0)
                std_word = word
                stu_word = std_word
                if (stu_freq == 0) and apply_term_expansion:
                    scale_factor = 1.0
                    # Check synonyms (e.g., attorney for lawyer), excluding words in standard
                    if self.apply_synonym_expansion:
                        debug_print("Checking for synonym of standard term '%s' among student terms" % std_word, level=5)
                        synonyms = list_difference(wordnet.get_synonyms(std_word), all_std_words)
                        exp_word = find_most_freq_term(synonyms, stu_vec)
                        if exp_word and (exp_word != stu_word):
                            # Note: Uses frequency from student vector for synonym term
                            stu_word = exp_word
                            scale_factor = self.synonym_scale_factor
                            debug_print("Using (student) synonym '%s' to match (standard) word '%s'" % (exp_word, std_word), level=4)
                    # Check ancestors (e.g., professional for lawyer), excluding words in standard
                    if self.apply_ancestor_expansion and (stu_word == std_word):
                        debug_print("Checking for ancestor of standard term '%s' among student terms" % std_word, level=5)
                        ancestors = list_difference(wordnet.get_hypernym_terms(std_word), all_std_words)
                        exp_word = find_most_freq_term(ancestors, stu_vec)
                        if exp_word and (exp_word != stu_word):
                            # As before, uses frequency from student vector for expansion term
                            stu_word = exp_word
                            scale_factor = self.ancestor_scale_factor
                            debug_print("Using (student) ancestor term '%s' to match (standard) word '%s'" % (exp_word, std_word), level=4)
                    # Update frequency and make note of expansion for posthoc diagnosis
                    if stu_word != std_word:
                        stu_freq = stu_vec[stu_word] * scale_factor
                        debug_print("Scaled frequency score from %f to %f" % (stu_vec[stu_word], stu_freq), level=7)
                        exp_terms.append(std_word + "->" + stu_word)
                # Do component-wise update
                debug_print("deltas: q=%f s=%f qs=%f w=%s" % (std_freq * std_freq, stu_freq * stu_freq, std_freq * stu_freq, word), level=6)
                q += std_freq * std_freq
                s += stu_freq * stu_freq
                qs += std_freq * stu_freq
                debug_print("q=%f s=%f qs=%f" % (q, s, qs),level=7)
                # Remember which student word contributed to the score for this term
                if std_freq * stu_freq > 0:
                    matching_stu_words.append(stu_word)
            # A zero-length vector on either side has no defined cosine; score it 0
            if q == 0 or s == 0:
                qs_cos = 0
            else:
                qs_cos = qs / (math.sqrt(q * s))
            if apply_term_expansion:
                stu_sen['ExpTerms'] = exp_terms

            # Update max score; sentences without any positive-frequency term never match
            has_terms = any(stu_vec[term] > 0 for term in stu_vec)
            if qs_cos > max_cos and has_terms:
                max_cos = qs_cos
                match_sen = stu_sen
                best_matching_stu_words = matching_stu_words
        # Optionally, remove sentences or individual words matched from further consideration
        if match_sen:
            if self.only_match_sentence_once:
                match_sen['Selected'] = True
            if self.only_match_word_tokens_once:
                for word in best_matching_stu_words:
                    match_sen[stu_freq_lookup_key][word] = 0
        debug_print("Answer.CalCosDist(%s,_) => %s" % (str(ans_sentencelist), str((max_cos, match_sen, best_matching_stu_words))), level=6)
        return max_cos, match_sen, best_matching_stu_words