def good_turing_interpolation():
    """Print an interpolated trigram/bigram/unigram score for each line of English.txt.

    For every line of ``English.txt`` three products are computed, one per
    n-gram order, by multiplying ``float(calculate_prob(...))`` over the
    tokens returned by the matching ``*_generator`` helper.  The products are
    combined as ``0.5 * trigram + 0.3 * bigram + 0.2 * unigram`` and printed
    as ``line <n> : <score>`` after a ``Probabilities`` header.

    Reads the module-level ``trigram_dict``/``bigram_dict``/``unigram_dict``
    and ``trigram_len``/``bigram_len``/``unigram_len``; returns ``None``.

    NOTE(review): this file contains a second, later definition named
    ``good_turing_interpolation``; at import time the later one replaces
    this one.
    """
    def count_of_counts(ngram_counts):
        # Map each value stored in the dict to the number of keys carrying it.
        # Iteration order is irrelevant for a tally, so no sorting is needed.
        bins = {}
        for count in ngram_counts.values():
            bins[count] = bins.get(count, 0) + 1
        return bins

    # One row per n-gram order:
    # (interpolation weight, tokenizer, count-of-counts, n-gram dict, *_len).
    models = (
        (0.5, trigram_generator, count_of_counts(trigram_dict),
         trigram_dict, trigram_len),
        (0.3, bigram_generator, count_of_counts(bigram_dict),
         bigram_dict, bigram_len),
        (0.2, unigram_generator, count_of_counts(unigram_dict),
         unigram_dict, unigram_len),
    )

    # The context manager guarantees the file is closed even if a helper raises.
    with open('English.txt', 'r') as f:
        # Single-argument print(...) behaves the same as the Python 2 print
        # statement and as the Python 3 print function.
        print('Probabilities')
        for line_no, line in enumerate(f, 1):
            final_prob = 0.0
            for weight, tokenize, bins, ngram_dict, ngram_len in models:
                probability = 1.0
                # The generators return (token list, token dict); only the
                # list is used here.
                tokens, _ = tokenize(line)
                for token in tokens:
                    probability *= float(
                        calculate_prob(token, bins, ngram_dict, ngram_len))
                final_prob += weight * probability
            # '%s' applies str() to each value, matching what
            # "print 'line', i, ':', final_prob" emitted.
            print('line %s : %s' % (line_no, final_prob))
def good_turing_smoothing():
    """Print one trigram-based score per line of English.txt.

    Builds a table mapping each value in the module-level ``trigram_dict`` to
    the number of keys that carry it, then, for every line of
    ``English.txt``, multiplies ``float(calculate_good_next(...))`` over the
    tokens returned by ``add_tri(line)`` and prints the product.

    Returns ``None``.
    """
    # Tally how many trigrams share each stored value; order does not matter
    # for a tally, so the dict is walked unsorted.
    bins_tri = {}
    for count in trigram_dict.values():
        bins_tri[count] = bins_tri.get(count, 0) + 1

    # The context manager guarantees the file is closed even if a helper raises.
    with open("English.txt", "r") as op:
        for line in op:
            # Kept as the int 1 so a line that yields no tokens still prints
            # "1", exactly as before.
            prob = 1
            # add_tri returns a pair; only the token list is used here.
            tokens, _ = add_tri(line)
            for token in tokens:
                prob = prob * float(
                    calculate_good_next(bins_tri, token, trigram_len,
                                        trigram_dict))
            # Single-argument print(...) is valid and equivalent in both
            # Python 2 and Python 3.
            print(prob)
def good_turing():
    """Print a numbered trigram-based score for each line of English.txt.

    Builds a table mapping each value in the module-level ``trigram_dict`` to
    the number of keys that carry it.  After printing a ``Probabilities``
    header, each line of ``English.txt`` is tokenised with
    ``trigram_generator`` and the product of ``float(calculate_prob(...))``
    over its tokens is printed as ``line <n> : <score>``.

    Returns ``None``.
    """
    # Tally how many trigrams share each stored value; order does not matter
    # for a tally, so the dict is walked unsorted.
    bins = {}
    for count in trigram_dict.values():
        bins[count] = bins.get(count, 0) + 1

    # The context manager guarantees the file is closed even if a helper raises.
    with open('English.txt', 'r') as f:
        # Single-argument print(...) behaves the same as the Python 2 print
        # statement and as the Python 3 print function.
        print('Probabilities')
        for line_no, line in enumerate(f, 1):
            probability = 1.0
            # trigram_generator returns (token list, token dict); only the
            # list is used here.
            tokens, _ = trigram_generator(line)
            for token in tokens:
                probability *= float(
                    calculate_prob(token, bins, trigram_dict, trigram_len))
            # '%s' applies str() to each value, matching what
            # "print 'line', i, ':', probability" emitted.
            print('line %s : %s' % (line_no, probability))
def good_turing_interpolation():
    """Print an interpolated trigram/bigram/unigram score per line of English.txt.

    For every line of ``English.txt`` three products are computed, one per
    n-gram order, by multiplying ``float(calculate_good_next(...))`` over the
    tokens returned by ``add_tri``, ``add_bi`` and ``add_uni``.  The products
    are combined as ``0.5 * trigram + 0.3 * bigram + 0.2 * unigram`` and the
    sum is printed on its own line.

    Reads the module-level ``trigram_dict``/``bigram_dict``/``unigram_dict``
    and ``trigram_len``/``bigram_len``/``unigram_len``; returns ``None``.

    NOTE(review): an earlier function in this file has the same name; this
    definition is the one that remains bound after the module is loaded.
    """
    def count_of_counts(ngram_counts):
        # Map each value stored in the dict to the number of keys carrying it.
        # Iteration order is irrelevant for a tally, so no sorting is needed.
        bins = {}
        for count in ngram_counts.values():
            bins[count] = bins.get(count, 0) + 1
        return bins

    # One row per n-gram order:
    # (interpolation weight, tokenizer, count-of-counts, *_len, n-gram dict).
    models = (
        (0.5, add_tri, count_of_counts(trigram_dict),
         trigram_len, trigram_dict),
        (0.3, add_bi, count_of_counts(bigram_dict),
         bigram_len, bigram_dict),
        (0.2, add_uni, count_of_counts(unigram_dict),
         unigram_len, unigram_dict),
    )

    # The context manager guarantees the file is closed even if a helper raises.
    with open("English.txt", "r") as op:
        for line in op:
            prob_add = 0
            for weight, tokenize, bins, ngram_len, ngram_dict in models:
                prob = 1
                # The add_* helpers return a pair; only the token list is
                # used here.
                tokens, _ = tokenize(line)
                for token in tokens:
                    prob = prob * float(
                        calculate_good_next(bins, token, ngram_len,
                                            ngram_dict))
                prob_add += weight * prob
            # Single-argument print(...) is valid and equivalent in both
            # Python 2 and Python 3.
            print(prob_add)