def VPNN(tag_sen):
    """Replace each 'VPNN'-tagged word of a sentence with a random synonym.

    Args:
        tag_sen: Indexable pair. ``tag_sen[0]`` is the sentence string and
            ``tag_sen[1]`` is a mapping whose ``'VPNN'`` entry lists the
            words to substitute.

    Returns:
        The sentence in which every occurrence of each listed word that has
        at least one synonym is replaced by one randomly chosen synonym.
        Words without synonyms are left untouched.
    """
    new_sen = tag_sen[0]
    for word in tag_sen[1]['VPNN']:
        # Look the synonyms up once per word (the lookup used to run twice)
        # and materialise them as a list: random.sample() needs a sequence
        # and rejects sets on Python 3.11+.
        synonyms = list(wordnet.get_synonyms(word))
        if not synonyms:
            continue
        replacement = random.sample(synonyms, 1)[0]
        # Plain substring replacement: every occurrence of `word` is swapped.
        new_sen = new_sen.replace(word, replacement)
    return new_sen
def make_thesaurus(file_path):
    """Build a thesaurus of word counts from a corpus file.

    Each non-title line is parsed and the tagged words of its first parsed
    sentence are processed. Every non-empty ASCII word is counted under its
    own entry and under the entry of each of its synonyms. Noun synonyms are
    pluralized when the corpus word equals its own plural form; verb synonyms
    are conjugated to the first tense reported for the corpus word.

    Args:
        file_path: Path of the corpus file. The mapping file path is derived
            from it by replacing config.CORPUS_FOLDER with
            config.MAPPING_FOLDER and config.CORP_TAG with config.MAP_TAG.

    Returns:
        Whatever ``_add_mappings(map_file, thesaurus)`` returns. The
        thesaurus handed to it is a defaultdict of Counters in which
        ``thesaurus[key][word]`` is the number of times the corpus word
        ``word`` was seen, ``key`` being the word itself or one of its
        (inflected) synonyms.
    """
    thesaurus = defaultdict(Counter)
    with open(file_path, "r") as f:
        for line in f:
            # Ignore repeated book title headers
            if _is_title(line):
                continue

            sentences = parse(line).split()
            if not sentences:
                # Nothing was parsed from this line, so nothing to count.
                continue

            # NOTE(review): only the first parsed sentence of a line is used,
            # as before - confirm that later sentences are meant to be skipped.
            for tagged_word in sentences[0]:
                word = tagged_word[0].strip().lower()
                pos = tagged_word[1][0]  # first letter of the POS tag

                # Reject empty tokens (the word is already stripped, so a
                # whitespace-only token is empty at this point).
                if not word:
                    continue

                # Reject non-ASCII words. Written so it works for byte and
                # text strings alike; str.decode() alone does not exist on
                # Python 3 text strings.
                try:
                    if isinstance(word, bytes):
                        word = word.decode("ascii")
                    else:
                        word = word.encode("ascii").decode("ascii")
                except (UnicodeDecodeError, UnicodeEncodeError):
                    continue

                # Increment word count of the word itself
                thesaurus[word][word] += 1

                # Decide once per word how its synonyms must be inflected;
                # neither check depends on the individual synonym.
                pluralize_syn = pos == "N" and word == pluralize(word)
                word_tenses = tenses(word) if pos == "V" else None

                # Count the word under each of its synonyms
                for syn in wn.get_synonyms(word):
                    lemma = syn.name().split(".")[0]
                    if pluralize_syn:
                        # Plural noun: file it under the plural synonym
                        key = pluralize(lemma)
                    elif word_tenses:
                        # Verb with a known tense: conjugate the synonym
                        key = conjugate(lemma, tense=word_tenses[0][0])
                    else:
                        key = lemma
                    thesaurus[key][word] += 1

    # Update thesaurus with mappings, if map_file exists
    file_path = file_path.replace(config.CORPUS_FOLDER, config.MAPPING_FOLDER)
    map_file = file_path.replace(config.CORP_TAG, config.MAP_TAG)
    thesaurus = _add_mappings(map_file, thesaurus)
    return thesaurus
def CalCosDist(self, ans_sentencelist, std_sen):
    """Find the student sentence that best matches a standard sentence.

    For every candidate student sentence a cosine-style score is computed
    over the words of the standard sentence. When a standard word is absent
    from the student sentence and term expansion is enabled, the most
    frequent synonym (or, failing that, ancestor term) found in the student
    sentence is used instead, with its frequency scaled down.

    Note that all three sums (q, s, qs) run over the standard sentence's
    words only; student words that match nothing do not enter the score.

    Args:
        ans_sentencelist: Iterable of student sentence dicts. Each is read
            via 'StuSVec' (word -> frequency) and 'No' (used in debug
            output). 'ExpTerms' is written when term expansion is enabled,
            and 'Selected' is set on the winner when
            ``self.only_match_sentence_once`` is true.
        std_sen: Standard sentence dict with 'KeySVec' (word -> frequency).

    Returns:
        Tuple ``(max_cos, match_sen)``: the best score and the student
        sentence dict that achieved it, or ``(0, None)`` when no sentence
        scored above zero.
    """
    debug_print("Answer.CalCosDist%s" % str((ans_sentencelist, std_sen)), level=6)
    match_sen = None
    max_cos = 0
    apply_term_expansion = (self.apply_synonym_expansion or self.apply_ancestor_expansion)
    std_vec = std_sen['KeySVec']
    all_std_words = std_vec.keys()

    for stu_sen in ans_sentencelist:
        # Make sure student sentence not already matched
        # TODO: Rework the already-matched check to be in terms of words not
        # sentences (e.g., in case student just gives one long sentence).
        if self.only_match_sentence_once and 'Selected' in stu_sen:
            debug_print("Ignoring already matched sentence %s" % stu_sen['No'])
            continue

        # Compute measure for current sentence
        stu_vec = stu_sen['StuSVec']
        q, s, qs = 0, 0, 0
        exp_terms = []
        for std_word in all_std_words:
            std_freq = std_vec[std_word]
            stu_freq = stu_vec[std_word] if std_word in stu_vec else 0

            # If a standard word doesn't occur in the student sentence, then
            # apply term expansion by checking for the most frequent synonym
            # and/or ancestor term that does occur.
            # Note: Ancestor terms might be too general, so not checked if
            # synonym found. Also, expansions omit standard terms to avoid
            # counting evidence twice.
            # TODO: Scale ancestor weight by degree of generality.
            if (stu_freq == 0) and apply_term_expansion:
                stu_word = std_word
                scale_factor = 1.0

                # Check synonyms (e.g., attorney for lawyer), excluding words in standard
                if self.apply_synonym_expansion:
                    debug_print("Checking for synonym of standard term '%s' among student terms" % std_word, level=5)
                    synonyms = list_difference(wordnet.get_synonyms(std_word), all_std_words)
                    exp_word = find_most_freq_term(synonyms, stu_vec)
                    if exp_word and (exp_word != stu_word):
                        # Note: Uses frequency from student vector for synonym term
                        stu_word = exp_word
                        scale_factor = self.synonym_scale_factor
                        debug_print("Using (student) synonym '%s' to match (standard) word '%s'" % (exp_word, std_word), level=4)

                # Check ancestors (e.g., professional for lawyer), excluding words in standard
                if self.apply_ancestor_expansion and (stu_word == std_word):
                    debug_print("Checking for ancestor of standard term '%s' among student terms" % std_word, level=5)
                    ancestors = list_difference(wordnet.get_hypernym_terms(std_word, self.max_ancestor_links), all_std_words)
                    exp_word = find_most_freq_term(ancestors, stu_vec)
                    if exp_word and (exp_word != stu_word):
                        # As before, uses frequency from student vector for expansion term.
                        # Prefer a dedicated ancestor factor when the object defines
                        # one; otherwise fall back to the synonym factor, which is
                        # what this branch always used before.
                        stu_word = exp_word
                        scale_factor = getattr(self, 'ancestor_scale_factor', self.synonym_scale_factor)
                        debug_print("Using (student) ancestor term '%s' to match (standard) word '%s'" % (exp_word, std_word), level=4)

                # Update frequency and make note of expansion for posthoc diagnosis.
                # Kept inside the expansion branch: stu_word is only bound here, so
                # a value left over from a previous word can never be picked up.
                if stu_word != std_word:
                    stu_freq = stu_vec[stu_word] * scale_factor
                    debug_print("Scaled frequency score from %f to %f" % (stu_vec[stu_word], stu_freq), level=7)
                    exp_terms.append(std_word + "->" + stu_word)

            # Do component-wise update
            debug_print("deltas: q=%f s=%f qs=%f" % (std_freq * std_freq, stu_freq * stu_freq, std_freq * stu_freq), level=6)
            q += std_freq * std_freq
            s += stu_freq * stu_freq
            qs += std_freq * stu_freq
            debug_print("q=%f s=%f qs=%f" % (q, s, qs), level=7)

        # A zero norm on either side means there is nothing to compare
        if q == 0 or s == 0:
            qs_cos = 0
        else:
            qs_cos = qs / (math.sqrt(q * s))
        if apply_term_expansion:
            stu_sen['ExpTerms'] = exp_terms

        # Update max score; the sentence must contain at least one word with
        # a positive frequency to be eligible.
        has_stu_words = any(stu_vec[word] > 0 for word in stu_vec)
        if qs_cos > max_cos and has_stu_words:
            max_cos = qs_cos
            match_sen = stu_sen

    # Optionally remove the matched sentence from further consideration
    if self.only_match_sentence_once and match_sen:
        match_sen['Selected'] = True
    debug_print("Answer.CalCosDist(%s,_) => %s" % (str(ans_sentencelist), str((max_cos, match_sen))), level=6)
    return max_cos, match_sen
# Walk the (document, sentence, gap) candidates and stop once
# args.num_quiz quizzes have been counted.
quiz_count = 0
for (doc_idx, sent_idx, gap_idx, sim_ds, sim_sg) in quiz_doc_sent_gap_tuple:
    if quiz_count == args.num_quiz:
        break

    # Euclidean distance between the gap word's vector and each row of wword
    vgap = wword[gap_idx]
    sim = np.sqrt(np.sum((wword - vgap) ** 2, axis=1))

    # POS tag of the first token in the raw sentence that equals the gap word
    pos_list = nltk.pos_tag(
        word_tokenize(sentwordrawdic[index2sent[sent_idx]]))
    gap_pos = [
        pos for (word, pos) in pos_list
        if word.lower() == index2word[gap_idx]
    ][0]

    synonyms = wn.get_synonyms(index2word[gap_idx])
    same_lexname_words = wn.get_same_lexname_words(index2word[gap_idx])

    distractors = []
    sim_gd = []
    # Visit candidate words from the closest to the farthest
    for i in sim.argsort():
        # Skip, in this order: undefined distances, padding / end-of-sentence
        # markers, the gap word itself, stopwords, and words that belong to
        # the same sentence as the gap.
        if (np.isnan(sim[i])
                or index2word[i] in ('#PAD_WORD#', '#EOS#')
                or gap_idx == i
                or index2word[i].lower() in hand_stopwords
                or index2word[i] in sentworddic[index2sent[sent_idx]]):
            continue
def CalCosDist(self, ans_sentencelist, std_sen):
    """Find the student sentence that best matches a standard sentence.

    For every candidate student sentence a cosine-style score is computed
    over the words of the standard sentence. When a standard word is absent
    from the student sentence and term expansion is enabled, the most
    frequent synonym (or, failing that, ancestor term) found in the student
    sentence is used instead, with its frequency scaled down.

    Note that all three sums (q, s, qs) run over the standard sentence's
    words only; student words that match nothing do not enter the score.

    Args:
        ans_sentencelist: Iterable of student sentence dicts. Each is read
            via 'StuSVec' (word -> frequency) and 'No' (used in debug
            output). 'ExpTerms' is written when term expansion is enabled.
            When ``self.only_match_word_tokens_once`` is true a working copy
            of 'StuSVec' is kept under 'StuSVecTemp' and used for all
            frequency lookups. On the winning sentence 'Selected' is set
            when ``self.only_match_sentence_once`` is true, and the matched
            words are zeroed in the working copy when
            ``self.only_match_word_tokens_once`` is true.
        std_sen: Standard sentence dict with 'KeySVec' (word -> frequency).

    Returns:
        Tuple ``(max_cos, match_sen, best_matching_stu_words)``: the best
        score, the student sentence dict that achieved it, and the student
        words that contributed to that score; ``(0, None, [])`` when no
        sentence scored above zero.
    """
    debug_print("Answer.CalCosDist%s" % str((ans_sentencelist, std_sen)), level=6)
    match_sen = None
    max_cos = 0
    best_matching_stu_words = []
    apply_term_expansion = (self.apply_synonym_expansion or self.apply_ancestor_expansion)
    std_vec = std_sen['KeySVec']
    all_std_words = std_vec.keys()

    # Setup the hash key to use for looking up student frequencies
    stu_freq_master_key = 'StuSVec'
    stu_freq_lookup_key = 'StuSVecTemp' if self.only_match_word_tokens_once else stu_freq_master_key

    for stu_sen in ans_sentencelist:
        # Create bookkeeping hash when sentence encountered first time during
        # single-word-token matching.
        # Note: new temp hash used (e.g., 'StuSVecTemp'), which shadows the
        # input version during calculations.
        if self.only_match_word_tokens_once and (stu_freq_lookup_key not in stu_sen):
            stu_sen[stu_freq_lookup_key] = dict(stu_sen[stu_freq_master_key])
            debug_print("stu_sen[stu_freq_lookup_key] (len=%d): %s" % (len(stu_sen[stu_freq_lookup_key]), stu_sen[stu_freq_lookup_key]), 6)

        # Make sure student sentence not already matched
        # TODO: Rework the already-matched check to be in terms of words not
        # sentences (e.g., in case student just gives one long sentence).
        if self.only_match_sentence_once and 'Selected' in stu_sen:
            debug_print("Ignoring already matched sentence %s" % stu_sen['No'], 4)
            continue

        # Compute measure for current sentence
        stu_vec = stu_sen[stu_freq_lookup_key]
        q, s, qs = 0, 0, 0
        exp_terms = []
        matching_stu_words = []
        for std_word in all_std_words:
            std_freq = std_vec[std_word]
            stu_freq = stu_vec[std_word] if std_word in stu_vec else 0
            stu_word = std_word

            # If a standard word doesn't occur in the student sentence, then
            # apply term expansion by checking for the most frequent synonym
            # and/or ancestor term that does occur.
            # Note: Ancestor terms might be too general, so not checked if
            # synonym found. Also, expansions omit standard terms to avoid
            # counting evidence twice.
            # TODO: Scale ancestor weight by degree of generality.
            if (stu_freq == 0) and apply_term_expansion:
                scale_factor = 1.0

                # Check synonyms (e.g., attorney for lawyer), excluding words in standard
                if self.apply_synonym_expansion:
                    debug_print("Checking for synonym of standard term '%s' among student terms" % std_word, level=5)
                    synonyms = list_difference(wordnet.get_synonyms(std_word), all_std_words)
                    exp_word = find_most_freq_term(synonyms, stu_vec)
                    if exp_word and (exp_word != stu_word):
                        # Note: Uses frequency from student vector for synonym term
                        stu_word = exp_word
                        scale_factor = self.synonym_scale_factor
                        debug_print("Using (student) synonym '%s' to match (standard) word '%s'" % (exp_word, std_word), level=4)

                # Check ancestors (e.g., professional for lawyer), excluding words in standard
                if self.apply_ancestor_expansion and (stu_word == std_word):
                    debug_print("Checking for ancestor of standard term '%s' among student terms" % std_word, level=5)
                    ancestors = list_difference(wordnet.get_hypernym_terms(std_word), all_std_words)
                    exp_word = find_most_freq_term(ancestors, stu_vec)
                    if exp_word and (exp_word != stu_word):
                        # As before, uses frequency from student vector for expansion term
                        stu_word = exp_word
                        scale_factor = self.ancestor_scale_factor
                        debug_print("Using (student) ancestor term '%s' to match (standard) word '%s'" % (exp_word, std_word), level=4)

                # Update frequency and make note of expansion for posthoc diagnosis
                if stu_word != std_word:
                    stu_freq = stu_vec[stu_word] * scale_factor
                    debug_print("Scaled frequency score from %f to %f" % (stu_vec[stu_word], stu_freq), level=7)
                    exp_terms.append(std_word + "->" + stu_word)

            # Do component-wise update
            debug_print("deltas: q=%f s=%f qs=%f w=%s" % (std_freq * std_freq, stu_freq * stu_freq, std_freq * stu_freq, std_word), level=6)
            q += std_freq * std_freq
            s += stu_freq * stu_freq
            qs += std_freq * stu_freq
            debug_print("q=%f s=%f qs=%f" % (q, s, qs), level=7)
            # Remember which student word produced a non-zero contribution
            if std_freq * stu_freq > 0:
                matching_stu_words.append(stu_word)

        # A zero norm on either side means there is nothing to compare
        if q == 0 or s == 0:
            qs_cos = 0
        else:
            qs_cos = qs / (math.sqrt(q * s))
        if apply_term_expansion:
            stu_sen['ExpTerms'] = exp_terms

        # Update max score; the sentence must still contain at least one word
        # with a positive frequency to be eligible.
        has_stu_words = any(stu_vec[word] > 0 for word in stu_vec)
        if qs_cos > max_cos and has_stu_words:
            max_cos = qs_cos
            match_sen = stu_sen
            best_matching_stu_words = matching_stu_words

    # Optionally, remove sentences or individual words matched from further consideration
    if match_sen:
        if self.only_match_sentence_once:
            match_sen['Selected'] = True
        if self.only_match_word_tokens_once:
            for word in best_matching_stu_words:
                match_sen[stu_freq_lookup_key][word] = 0
    debug_print("Answer.CalCosDist(%s,_) => %s" % (str(ans_sentencelist), str((max_cos, match_sen, best_matching_stu_words))), level=6)
    return max_cos, match_sen, best_matching_stu_words