Exemplo n.º 1
0
def run_test_data(trigram_prob_index,unigram_prob_index,fivegram_count_index,quadgram_count_index,trigram_count_index,bigram_count_index):

	ngram_words = build.buildDict() # Get the index structure build from word checker
	start = time.time()
	with open('../TrainData/phrases.tsv') as f:
		lines = f.read().splitlines()
		for line in lines:
			phrase = line.split('  ')[0]
			words = extract_words(phrase)
			pos = 0
			is_first_word = True
			for word in words:
				results = []
				# Search in UNIX dictionary (indexed as a trie). It returns a list of words at edit distance 0.
				if len(build.search(word,0)) != 1:
					# need to predict change in word
					# For now , obtain confusion set as the set returned from index structure
					
					confusion_set = build.get_cands(build.candidate_from_ngrams(ngram_words,word,build.NGRAM_N),word)
					max_score = 0
					max_sentence = []
					for confused_pair in confusion_set:
						confused_word = confused_pair[0]
						sentence  = list(words)
						sentence[pos] = confused_word
						#print sentence
						#score1 = find_prob_of_sentence(sentence,trigram_prob_index,unigram_prob_index)
						score1 = find_prob_sentence_all_grams(sentence,pos,fivegram_count_index,quadgram_count_index,trigram_count_index,bigram_count_index)
						
						results.append((sentence,score1))
						if score1 > max_score:
							max_score = score1
							max_sentence = sentence
					print max_sentence,max_score
					print sorted(results,key=lambda x: x[1],reverse=True)[0:4]
					

				pos +=1
Exemplo n.º 2
0
def run_test_data():

    ngram_words = build.buildDict() # Get the index structure build from word checker
    (wordToContextWords,totalCount) = learnContext()

    with open('../TrainData/phrases.tsv') as f:
        lines = f.read().splitlines()
        for line in lines:
            phrase = line.split('  ')[0]
            words = extract_words(phrase)
            pos = 0
            chosen_word = ""
            chosen_word_rank = ""
            for word in words:

                # Search in UNIX dictionary (indexed as a trie). It returns a list of words at edit distance 0.
                if len(build.search(word,0)) != 1 :
                    # need to predict change in word
                    context_words = find_context_words(pos,words)

                    # For now , obtain confusion set as the set returned from index structure
                    confusion_set = build.get_cands(build.candidate_from_ngrams(ngram_words,word,build.NGRAM_N),word)
                    max_prob = 0
                    max_rank = 0
                    max_score = 0
                    max_total = 0
                    max_sentence = []
                    total_sentence = []

                    for confused_pair in confusion_set:
                        confused_word = confused_pair[0]

                        if confused_word not in wordToContextWords.keys():
                            continue

                        prob = 1
                        rank = 0
                        # For each context word in the phrase that is present in the dict, multiply the prob. or add rank
                        # Here, I haven't removed the un important words as they do in the paper. They do it as they know
                        # the confusion sets before hand.
                        for context_word in context_words:
                            if context_word in wordToContextWords[confused_word].keys():
                                prob *=  wordToContextWords[confused_word][context_word]/float(wordToContextWords[confused_word]['cnt'])
                                rank += 1



                        sentence  = list(words)
                        sentence[pos] = confused_word
                        #print sentence
                        #score1 = find_prob_of_sentence(sentence,trigram_prob_index,unigram_prob_index)
                        score1 = ngram.find_prob_sentence_all_grams(sentence,pos,ngram.fivegram_count_index,ngram.quadgram_count_index,ngram.trigram_count_index,ngram.bigram_count_index)

                        if score1 > max_score:
                            max_score = score1
                            max_sentence = sentence



                        # Find word with max prob/ rank

                        if prob != 1 and prob > max_prob:
                            max_prob = prob
                            chosen_word = confused_word

                        #print word,confused_word,rank

                        if rank > max_rank:
                            #print word,confused_word,rank
                            max_rank = rank
                            chosen_word_rank = confused_word

                        if rank+score1 > max_total:
                            max_total = rank+score1
                            total_sentence= sentence
                        #elif rank == max_rank:
                            #print confused_word
                    #print word,chosen_word_rank,max_rank    
                    print max_sentence,max_score                    
                    print word,chosen_word_rank,max_rank
                    print total_sentence,max_total    
                pos +=1