def run_test_data(trigram_prob_index, unigram_prob_index, fivegram_count_index,
                  quadgram_count_index, trigram_count_index, bigram_count_index):
    """Print n-gram based correction suggestions for the phrases file.

    For every phrase read from ``../TrainData/phrases.tsv`` each word is
    looked up with ``build.search(word, 0)``.  When that lookup does not
    return exactly one match, every candidate produced by ``build.get_cands``
    is substituted into the phrase at the word's position and the resulting
    sentence is scored with ``find_prob_sentence_all_grams``.  For each such
    word two lines are printed: the best sentence with its score, and the
    four highest-scoring ``(sentence, score)`` pairs.

    Args:
        trigram_prob_index: Not used by this function; kept so existing
            callers keep working.
        unigram_prob_index: Not used by this function; kept so existing
            callers keep working.
        fivegram_count_index: Passed through to the sentence scorer.
        quadgram_count_index: Passed through to the sentence scorer.
        trigram_count_index: Passed through to the sentence scorer.
        bigram_count_index: Passed through to the sentence scorer.
    """
    # Index structure built by the word checker.
    ngram_words = build.buildDict()

    with open('../TrainData/phrases.tsv') as f:
        lines = f.read().splitlines()

    for line in lines:
        # NOTE(review): the file is named .tsv but the line is split on a
        # single space, which keeps only the first space-delimited token.
        # Confirm the intended delimiter.
        phrase = line.split(' ')[0]
        words = extract_words(phrase)

        for pos, word in enumerate(words):
            # Per the original author's note, build.search(word, 0) returns
            # the dictionary words at edit distance 0; exactly one hit is
            # treated as "spelled correctly, nothing to do".
            if len(build.search(word, 0)) == 1:
                continue

            # For now the confusion set is whatever the index structure
            # returns for this word.
            confusion_set = build.get_cands(
                build.candidate_from_ngrams(ngram_words, word, build.NGRAM_N),
                word)

            results = []
            # Starting from 0 means a candidate is only selected when its
            # score is strictly positive; otherwise [] and 0 are printed.
            max_score = 0
            max_sentence = []
            for confused_pair in confusion_set:
                sentence = list(words)
                sentence[pos] = confused_pair[0]
                score1 = find_prob_sentence_all_grams(
                    sentence, pos, fivegram_count_index, quadgram_count_index,
                    trigram_count_index, bigram_count_index)
                results.append((sentence, score1))
                if score1 > max_score:
                    max_score = score1
                    max_sentence = sentence

            # Single-argument print calls produce the same output as the
            # Python 2 print statement and also parse on Python 3.
            print("%s %s" % (max_sentence, max_score))
            print(sorted(results, key=lambda x: x[1], reverse=True)[0:4])
def run_test_data():
    """Print context- and n-gram-based correction suggestions per phrase.

    For every phrase read from ``../TrainData/phrases.tsv`` each word is
    looked up with ``build.search(word, 0)``.  When that lookup does not
    return exactly one match, each candidate from ``build.get_cands`` that is
    a key of the context table returned by ``learnContext()`` is evaluated in
    three ways:

    * ``rank``  - number of the word's context words that appear in the
      candidate's context table entry;
    * ``prob``  - product of ``count / cnt`` over those matching context
      words (tracked, but not printed by the code visible here);
    * ``score`` - ``ngram.find_prob_sentence_all_grams`` on the phrase with
      the candidate substituted in.

    Three lines are printed per such word: the best n-gram sentence and its
    score, the original word with the best-ranked candidate and its rank,
    and the sentence maximising ``rank + score`` with that total.
    """
    # Index structure built by the word checker.
    ngram_words = build.buildDict()
    (wordToContextWords, totalCount) = learnContext()

    with open('../TrainData/phrases.tsv') as f:
        lines = f.read().splitlines()

    for line in lines:
        # NOTE(review): the file is named .tsv but the line is split on a
        # single space, which keeps only the first space-delimited token.
        # Confirm the intended delimiter.
        phrase = line.split(' ')[0]
        words = extract_words(phrase)

        for pos, word in enumerate(words):
            # Per the original author's note, build.search(word, 0) returns
            # the dictionary words at edit distance 0; exactly one hit is
            # treated as "spelled correctly, nothing to do".
            if len(build.search(word, 0)) == 1:
                continue

            context_words = find_context_words(pos, words)
            # For now the confusion set is whatever the index structure
            # returns for this word.
            confusion_set = build.get_cands(
                build.candidate_from_ngrams(ngram_words, word, build.NGRAM_N),
                word)

            # Reset the picks for every misspelled word so that a word with
            # no ranked candidate does not report the suggestion chosen for
            # an earlier word of the same phrase.
            chosen_word = ""
            chosen_word_rank = ""
            max_prob = 0
            max_rank = 0
            max_score = 0
            max_total = 0
            max_sentence = []
            total_sentence = []

            for confused_pair in confusion_set:
                confused_word = confused_pair[0]
                # Direct dict membership: avoids building and scanning the
                # key list that .keys() returns on Python 2.
                if confused_word not in wordToContextWords:
                    continue
                context_counts = wordToContextWords[confused_word]

                # For each context word of the phrase that is present in the
                # candidate's table, multiply the probability and bump the
                # rank.  Unimportant words are not pruned here (the paper
                # does so because its confusion sets are known in advance).
                prob = 1
                rank = 0
                for context_word in context_words:
                    if context_word in context_counts:
                        prob *= (context_counts[context_word]
                                 / float(context_counts['cnt']))
                        rank += 1

                sentence = list(words)
                sentence[pos] = confused_word
                score1 = ngram.find_prob_sentence_all_grams(
                    sentence, pos, ngram.fivegram_count_index,
                    ngram.quadgram_count_index, ngram.trigram_count_index,
                    ngram.bigram_count_index)

                if score1 > max_score:
                    max_score = score1
                    max_sentence = sentence

                # prob == 1 means no context word matched, so the candidate
                # carries no probability evidence and is skipped.
                if prob != 1 and prob > max_prob:
                    max_prob = prob
                    chosen_word = confused_word

                if rank > max_rank:
                    max_rank = rank
                    chosen_word_rank = confused_word

                if rank + score1 > max_total:
                    max_total = rank + score1
                    total_sentence = sentence

            # Single-argument print calls produce the same output as the
            # Python 2 print statement and also parse on Python 3.
            print("%s %s" % (max_sentence, max_score))
            print("%s %s %s" % (word, chosen_word_rank, max_rank))
            print("%s %s" % (total_sentence, max_total))