Example #1
def smooth(tagged_list, prime=0):
    # tagged_list is tagged (word, tag) data; keep only the words.
    brown_words = remove_tags(tagged_list)

    if prime == 0:
        # No smoothing requested: return the plain word sequence.
        return brown_words

    if prime == 1:
        # n-gram counts over the tag sequence (x_prime) and over the words,
        # combined through the word-tag dictionary.
        x_prime = remove_words(tagged_list)
        U, B, T = ax.countNgrams(x_prime, 0)
        word_tag_dict = word_tag(tagged_list)

        Uy, By, Ty = ax.countNgrams(brown_words, 0)
        bigram_dict = prime_bigram(By, word_tag_dict)
        trigram_dict = prime_trigram(Ty, word_tag_dict)

        full_data = [U, bigram_dict, trigram_dict]
        return entropy(full_data, 1)

    # Any other value of prime: same pipeline with the joint (x, y) variants.
    x_prime = remove_words(tagged_list)
    word_tag_dict = word_tag(tagged_list)
    U, B, T = ax.countNgrams(x_prime, 0)
    Uy, By, Ty = ax.countNgrams(brown_words, 0)
    xy_bigram_dict = xy_prime_bigram(By, word_tag_dict)
    xy_trigram_dict = xy_prime_trigram(Ty, word_tag_dict)
    full_data = [U, xy_bigram_dict, xy_trigram_dict]
    return entropy(full_data, 1)
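
The helpers that smooth() relies on (remove_tags, remove_words, word_tag) are not shown in this snippet. Below is a minimal sketch of what they plausibly do, assuming tagged_list is a sequence of (word, tag) pairs as in the Brown-corpus examples further down; the bodies are hypothetical reconstructions, only the names and call sites come from the code above.

def remove_tags(tagged_list):
    # Keep only the words of a (word, tag) sequence.
    return [word for word, tag in tagged_list]

def remove_words(tagged_list):
    # Keep only the tags of a (word, tag) sequence.
    return [tag for word, tag in tagged_list]

def word_tag(tagged_list):
    # Map each word to the set of tags it occurs with.
    word_tag_dict = {}
    for word, tag in tagged_list:
        word_tag_dict.setdefault(word, set()).add(tag)
    return word_tag_dict
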
def get_trigram_entropy(words, start, end):
  # The word count is taken to be the end index (counting is assumed to start at 0).
  total_words = end
  U, B, T = auxiliar.countNgrams(words, start, end)

  u_prob = helper.unigram_prob(U, float(total_words))
  b_prob, b_posibilities = helper.bigram_prob(B, U)
  t_prob, t_posibilities = helper.trigram_prob(T, B)

  t_h = helper.trigram_model(U, b_posibilities, t_posibilities, u_prob, b_prob, t_prob)
  return t_h
def compute_models():
  file = 'corpora/en.txt'
  words = auxiliar.getWordsFromFile(file)
  total_words = float(len(words))

  # Unigram, bigram and trigram counts over the whole corpus.
  U, B, T = auxiliar.countNgrams(words, 0, 0)

  u_prob = helper.unigram_prob(U, total_words)
  b_prob, b_posibilities = helper.bigram_prob(B, U)
  t_prob, t_posibilities = helper.trigram_prob(T, B)

  # Entropy of the unigram, bigram and trigram models.
  u_h = helper.unigram_model(U, u_prob)
  b_h = helper.bigram_model(U, b_posibilities, u_prob, b_prob)
  t_h = helper.trigram_model(U, b_posibilities, t_posibilities, u_prob, b_prob, t_prob)

  return u_h, b_h, t_h
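
The helper module itself is not shown. For reference, a minimal sketch of the unigram entropy that helper.unigram_model presumably computes (an assumption): H = -sum over w of p(w) * log2 p(w), with p(w) taken from the unigram counts U.

import math

def unigram_entropy_sketch(unigram_counts):
    # H = -sum over w of p(w) * log2 p(w), with p(w) = count(w) / total.
    total = float(sum(unigram_counts.values()))
    return -sum((c / total) * math.log2(c / total)
                for c in unigram_counts.values())
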
Example #4
def entropy(word_list, flag=0):
    # flag == 0: word_list is a plain word sequence, count its n-grams here.
    # Otherwise word_list is already a precomputed (U, B, T) triple of
    # unigram/bigram/trigram dictionaries.
    if flag == 0:
        U, B, T = ax.countNgrams(word_list, 0)
    else:
        U, B, T = word_list

    # Unigram entropy.
    freq_uni = freq_count(U)
    uni_prob = prob_x(U, freq_uni)
    h_uni_gram = uni_entropy(uni_prob)

    # Bigram (conditional) entropy.
    bi_prob = prob_yx(B, U)
    h_bi_gram = bi_entropy(bi_prob, uni_prob)

    # Trigram (conditional) entropy.
    tri_prob = prob_zxy(T, B)
    h_tri_gram = tri_entropy(tri_prob, bi_prob, uni_prob)

    return (h_uni_gram, h_bi_gram, h_tri_gram)
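
A small usage sketch of the two calling modes of entropy(); the word list is made up, ax.countNgrams is the counter used throughout these examples, and the freq_count / prob_x / uni_entropy helpers are assumed to be defined in the same module.

words = ['the', 'cat', 'sat', 'on', 'the', 'mat']
h_uni, h_bi, h_tri = entropy(words)              # flag == 0: counts n-grams itself
precomputed = ax.countNgrams(words, 0)
h_uni2, h_bi2, h_tri2 = entropy(precomputed, 1)  # flag != 0: reuse (U, B, T)
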
Example #5
def get_brown_tri(tagged_data):
	# Strip the tags from a tagged sequence, keeping only the words.
	words_without_tag = []
	for word, tag in tagged_data:
		words_without_tag.append(word)
	return words_without_tag

def get_brown_tags_uni(tagged_data):
	# Collect the tags; note that only the first element of tagged_data
	# (tagged_data[0]) is scanned here.
	tags = []
	for word, tag in tagged_data[0]:
		tags.append(tag)
	return tags


# Get Data
# Count unigrams, bigrams and trigrams over the English word list
# (enWords is presumably loaded earlier in the file).
n_grams_en = ax.countNgrams(enWords, 0)

# Unigram entropy.
uni_gram_dict = filter_data(n_grams_en, 1)
freq_uni = freq(uni_gram_dict)
uni_prob = prob_x(uni_gram_dict, freq_uni)
H_uni = uni_entropy(uni_prob)

# Bigram (conditional) entropy.
bi_gram_dict = filter_data(n_grams_en, 2)
bi_prob = prob_yx(bi_gram_dict, uni_gram_dict)
H_bi = bi_entropy(bi_prob, uni_prob)

# Trigram (conditional) entropy.
tri_gram_dict = filter_data(n_grams_en, 3)
tri_prob = prob_zxy(tri_gram_dict, bi_gram_dict)
H_tri = tri_entropy(tri_prob, bi_prob, uni_prob)
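
The probability/entropy helpers used above are not shown. As a point of reference, a sketch of the conditional bigram entropy that prob_yx and bi_entropy together presumably compute (an assumption): H(Y|X) = -sum over (x, y) of p(x, y) * log2 p(y | x).

import math

def bigram_entropy_sketch(bigram_counts, unigram_counts):
    # H(Y|X) = -sum over (x, y) of p(x, y) * log2 p(y | x)
    total_bigrams = float(sum(bigram_counts.values()))
    h = 0.0
    for (x, y), c_xy in bigram_counts.items():
        p_xy = c_xy / total_bigrams             # joint probability p(x, y)
        p_y_given_x = c_xy / unigram_counts[x]  # conditional probability p(y | x)
        h -= p_xy * math.log2(p_y_given_x)
    return h
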
Example #6
File: LM.py  Project: misakss/Portfolio
        # Increment the bigram count for the (previous, current) pair; the
        # xIsTag / yIsTag indices select which field of each item forms the key.
        bigram_key = (l[i - 1][xIsTag], l[i][yIsTag])
        if bigram_key not in B:
            B[bigram_key] = 1
        else:
            B[bigram_key] += 1
        # Increment the trigram count for the last three positions.
        trigram_key = (l[i - 2][xIsTag], l[i - 1][yIsTag], l[i][0])
        if trigram_key not in T:
            T[trigram_key] = 1
        else:
            T[trigram_key] += 1
    return (U, B, T)
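
The head of countNgrams (the loop setup and the unigram counting) is cut off in this snippet. Below is a self-contained sketch of a counter in the same spirit, with hypothetical parameter names, for readers who want a runnable counterpart.

def count_ngrams_sketch(tokens):
    # Count unigrams, bigrams and trigrams over a flat token sequence.
    U, B, T = {}, {}, {}
    for i, tok in enumerate(tokens):
        U[tok] = U.get(tok, 0) + 1
        if i >= 1:
            bigram = (tokens[i - 1], tok)
            B[bigram] = B.get(bigram, 0) + 1
        if i >= 2:
            trigram = (tokens[i - 2], tokens[i - 1], tok)
            T[trigram] = T.get(trigram, 0) + 1
    return (U, B, T)
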


#------------------------------------MAIN-------------------------------

words = getWordsFromFile('en.txt')

(unicount, bicount, tricount) = countNgrams(words, 0)

print("unigram")
print(unigramEntropy(unicount, bicount, tricount, len(words)))
print("perplexity unigram")
print(math.pow(2, unigramEntropy(unicount, bicount, tricount, len(words))))
print("bigram")
print(bigramEntropy(unicount, bicount, tricount, len(words)))
print("perplexity bigram")
print(math.pow(2, bigramEntropy(unicount, bicount, tricount, len(words))))
print("trigram")
print(trigramEntropy(unicount, bicount, tricount, len(words)))
print("perplexity trigram")
print(math.pow(2, trigramEntropy(unicount, bicount, tricount, len(words))))

print("(x,y,z), words, full")