Пример #1
0
    def __init__(self, n, tra_filename):
        """Read a training file and collect n-gram statistics from it.

        Each line of ``TRA_SET_ROOT + tra_filename`` is handled as one
        sentence: its newline is removed, the string is padded with
        ``<s>`` / ``</s>`` via ``nltk.pad_sequence`` and the resulting symbol
        sequence is turned into n-grams and (n-1)-grams.

        :param n: order of the n-grams to collect.
        :param tra_filename: file name, appended to ``TRA_SET_ROOT``.

        Depending on the module-level ``gl_smoothing_type`` either an NLTK
        Kneser-Ney distribution is built, or the continuation tables used by
        the "improved" Kneser-Ney variant are filled in.  The latter unpacks
        every n-gram into exactly three symbols.
        NOTE(review): that branch therefore looks usable only with n == 3 --
        confirm against callers.
        """
        self.tra_filename = tra_filename
        self.n = n

        self.tra_letter_list = []
        self.ngram_list = []
        self.n1gram_list = []

        with open(TRA_SET_ROOT + tra_filename, "r") as training_file:
            for raw_line in training_file:
                stripped = raw_line.replace('\n', '')
                padded_symbols = list(
                    nltk.pad_sequence(stripped,
                                      self.n,
                                      pad_left=True,
                                      pad_right=True,
                                      left_pad_symbol='<s>',
                                      right_pad_symbol='</s>'))

                self.tra_letter_list.extend(padded_symbols)
                self.ngram_list.extend(nltk.ngrams(padded_symbols, self.n))
                self.n1gram_list.extend(
                    nltk.ngrams(padded_symbols, self.n - 1))

        # Number of distinct symbols seen (padding symbols included).
        self.V = len(set(self.tra_letter_list))
        self.ngram_cfd = nltk.FreqDist(self.ngram_list)
        self.n1gram_cfd = nltk.FreqDist(self.n1gram_list)

        if gl_smoothing_type == SmoothingType.NLTK_KNESER_NEY:
            self.kneser_ney_prob_dist = nltk.KneserNeyProbDist(
                self.ngram_cfd, bins=None, discount=KNESER_NEY_DISCOUNT)

        if gl_smoothing_type == SmoothingType.IMPROVED_KNESER_NEY:
            # Frequencies of the single symbols.
            self.n2gram_cfd = nltk.FreqDist(self.tra_letter_list)

            # For every two-symbol context: total count of the n-grams that
            # start with it, and how many distinct n-grams start with it.
            count_after_pair = self.letter_count_after_n1gram = {}
            types_after_pair = self.letter_typenum_after_n1gram = {}
            # The same two tables, keyed by a single leading symbol.
            count_after_symbol = self.letter_count_after_n2gram = {}
            types_after_symbol = self.letter_typenum_after_n2gram = {}

            for (w0, w1, w2), freq in self.ngram_cfd.items():
                pair = (w0, w1)
                count_after_pair[pair] = count_after_pair.get(pair, 0) + freq
                types_after_pair[pair] = types_after_pair.get(pair, 0) + 1

            for (w0, w1), freq in self.n1gram_cfd.items():
                count_after_symbol[w0] = count_after_symbol.get(w0, 0) + freq
                types_after_symbol[w0] = types_after_symbol.get(w0, 0) + 1
Пример #2
0
def kneser_ney(tri_grams):
    """Return Kneser-Ney probabilities for the trigrams in ``tri_grams``.

    :param tri_grams: mapping from trigram to its count; it is iterated for
        its keys and indexed for the counts.
    :return: plain dict mapping every sample of the fitted
        ``nltk.KneserNeyProbDist`` to its probability.
    """
    counts = nltk.probability.FreqDist()
    for gram in tri_grams:
        counts[gram] = tri_grams[gram]

    smoothed = nltk.KneserNeyProbDist(counts)
    return {gram: smoothed.prob(gram) for gram in smoothed.samples()}
Пример #3
0
    def getDictionaryProb(self):
        """Return a dict mapping each sample to its Kneser-Ney probability.

        The distribution is fitted on ``self.TrigramProb`` with
        ``nltk.KneserNeyProbDist``.
        """
        # nltk.LaplaceProbDist(self.TrigramProb) was the alternative tried
        # here before Kneser-Ney.
        smoothed = nltk.KneserNeyProbDist(self.TrigramProb)
        return {
            sample: smoothed.prob(sample)
            for sample in smoothed.samples()
        }
Пример #4
0
def biKneserNeyBackup(nGram, palabra1, palabra2):
    """Print Kneser-Ney probabilities of trigrams starting with two words.

    Fits ``nltk.KneserNeyProbDist`` on the frequencies of ``nGram`` and walks
    over at most the first 51 samples.  Each sample whose first two elements
    equal ``palabra1`` and ``palabra2`` is printed together with its
    probability; the running sum of the matched probabilities is printed
    after every inspected sample.  Nothing is returned.
    """
    model = nltk.KneserNeyProbDist(nltk.FreqDist(nGram))
    running_total = 0

    for inspected, gram in enumerate(model.samples(), start=1):
        if gram[0] == palabra1 and gram[1] == palabra2:
            gram_prob = model.prob(gram)
            running_total += gram_prob
            print("{0}:{1}".format(gram, gram_prob))
        print(running_total)
        # Stop once more than 50 samples have been looked at.
        if inspected > 50:
            break
Пример #5
0
def biKneserNey(nGram, palabra1, palabra2):
    """Return the most probable third word after ``palabra1 palabra2``.

    A Kneser-Ney distribution is fitted on the frequencies of ``nGram``.
    Among the samples whose first two elements are ``palabra1`` and
    ``palabra2``, the third element with the highest probability is returned
    (the first one encountered wins on ties).  When no sample matches, the
    sentinel string ``'[END] no mas iteraciones'`` is returned instead.
    """
    model = nltk.KneserNeyProbDist(nltk.FreqDist(nGram))

    candidates = {
        gram[2]: model.prob(gram)
        for gram in model.samples()
        if gram[0] == palabra1 and gram[1] == palabra2
    }
    if not candidates:
        return '[END] no mas iteraciones'

    best_word, _ = max(candidates.items(), key=operator.itemgetter(1))
    return best_word
Пример #6
0
def kneser_ney_smoothing(kneser_train, kneser_input):
    """
    Score every input against every training language with Kneser-Ney.

    The Kneser-Ney technique works only for trigrams and makes use of
    KneserNeyProbDist() to train on the training data.  For each input the
    perplexity under each language model is computed; every time a language
    beats the best perplexity seen so far for that input, a tab-separated
    line (input key, language, perplexity, n) is printed.

    :param kneser_train: A dictionary of training data consisting of trigrams,
        keyed by language.

    :param kneser_input: A dictionary of input data consisting of trigrams,
        keyed by input identifier.

    Nothing is returned; results are only printed.
    """
    import math

    # The models depend only on the training data, so fit each one once
    # instead of once per (input, language) pair.
    trained_models = [
        (train_key,
         nltk.KneserNeyProbDist(nltk.FreqDist(kneser_train[train_key]),
                                bins=None,
                                discount=0.75))
        for train_key in kneser_train
    ]

    for input_key in kneser_input:
        input_ngrams = kneser_input[input_key]
        ngram_count = len(input_ngrams)
        if ngram_count == 0:
            # Perplexity is undefined for an empty input (it used to raise
            # ZeroDivisionError here); skip it.
            continue

        lowest_perplexity = None
        result_lang = None

        for train_key, model in trained_models:
            # Accumulate log-probabilities: multiplying many small
            # probabilities underflows to 0.0 and then fails on the
            # negative power.
            log_prob_sum = 0.0
            for ngram in input_ngrams:
                prob_kn = model.prob(ngram)
                if prob_kn == 0:
                    # Fixed fallback probability for trigrams the model
                    # gives no mass to (value kept from the original code).
                    prob_kn = 0.1
                log_prob_sum += math.log(prob_kn)

            # Equivalent to (prod p) ** (-1 / N).  The float sum also avoids
            # Python 2 floor division, which turned -1/N into -1.
            perplexity = math.exp(-log_prob_sum / ngram_count)

            if lowest_perplexity is None or perplexity < lowest_perplexity:
                lowest_perplexity = perplexity
                result_lang = train_key
                # NOTE(review): `n` is not defined in this function; it is
                # presumably a module-level global -- confirm.
                print(str(input_key) + "\t" + str(result_lang) + "\t" +
                      str(perplexity) + "\t" + str(n))
Пример #7
0
def triKneserNey(nGram, palabra1, palabra2, palabra3):
    """Print the Kneser-Ney probability of one trigram and return the mode.

    Fits ``nltk.KneserNeyProbDist`` on the frequencies of ``nGram`` and walks
    over at most the first 51 samples.  Each sample equal to
    ``(palabra1, palabra2, palabra3)`` is printed with its probability; the
    running sum of matched probabilities is printed after every inspected
    sample.

    :return: whatever ``KneserNeyProbDist.max()`` yields for the fitted
        distribution.
    """
    fdist = nltk.FreqDist(nGram)
    kneser_ney = nltk.KneserNeyProbDist(fdist)

    prob_sum = 0

    for limiter, i in enumerate(kneser_ney.samples(), start=1):
        # The first comparison used to read `[0] == palabra1`, a list literal
        # that never equals a word, so no sample could ever match.
        if i[0] == palabra1 and i[1] == palabra2 and i[2] == palabra3:
            prob = kneser_ney.prob(i)
            prob_sum += prob
            # Two placeholders for two arguments; the former "{0}:{1}:{2}"
            # would raise IndexError as soon as a sample matched.
            print("{0}:{1}".format(i, prob))
        print(prob_sum)
        # Stop once more than 50 samples have been looked at.
        if limiter > 50:
            break
    return kneser_ney.max()
Пример #8
0
# Unigram counts over the whole token stream.
dist_ugs = Counter(token for token in all_tokens)

# Frequency tables for bigrams, trigrams and 4-grams of the same tokens.
bgs = nltk.bigrams(all_tokens)
dist_bgs = nltk.FreqDist(bgs)
tgs = nltk.trigrams(all_tokens)
dist_tgs = nltk.FreqDist(tgs)
fgs = nltk.ngrams(all_tokens, 4)
dist_fgs = nltk.FreqDist(fgs)
del all_tokens  # everything below works from the frequency tables only

# prob_table_bi[w1][w2] = count(w1, w2) / count(w1)
prob_table_bi = defaultdict(dict)
for key, value in dist_bgs.items():
    head, tail = key
    prob_table_bi[head][tail] = value / dist_ugs[head]
del dist_bgs

# prob_table_kn2[(w1, w2)][w3] = Kneser-Ney probability of (w1, w2, w3)
kn = nltk.KneserNeyProbDist(dist_tgs)
prob_table_kn2 = defaultdict(dict)
for gram in kn.samples():
    context, follower = gram[:2], gram[2]
    prob_table_kn2[context][follower] = kn.prob(gram)
del kn

# Keep only the 4 most probable continuations per context.  This replaces
# each inner dict with a list of (word, probability) pairs, best first.
for table in (prob_table_kn2, prob_table_bi):
    for key, value in table.items():
        ranked = sorted(value.items(), key=lambda pair: pair[1], reverse=True)
        table[key] = ranked[:4]
#prob_table_f = defaultdict(dict)
#for key,value in dist_fgs.items():
Пример #9
0
def kneserNeyProbDist(freqDist):
    """Return an ``nltk.KneserNeyProbDist`` fitted on ``freqDist``."""
    smoothed = nltk.KneserNeyProbDist(freqDist)
    return smoothed
Пример #10
0
    :param line_list:
    :return:
    '''
    start1, start2, end1, end2 = '<start-tag1>', '<start-tag2>', '</start-tag1>', '</start-tag2>'
    line_list.insert(0, start2)
    line_list.insert(0, start1)
    line_list.append(end1)
    line_list.append(end2)
    for index in range(2, len(line_list)):
        first = line_list[index - 2]
        second = line_list[index - 1]
        current = line_list[index]
        freq_dist[(first, second, current)] += 1

    for x in freq_dist.items():
        w, y, z = x[0]
        print w, y, z


# Feed every line of every data file through preprocessing() and into the
# trigram counter.
for path in data_path:
    with open(path, 'r') as file:
        # iter(callable, sentinel) keeps calling readline() until it returns
        # the empty string, i.e. until end of file.
        for line in iter(file.readline, ''):
            arr_line = preprocessing(line.strip())
            build_tri_gram(arr_line)

# Fit the Kneser-Ney distribution on the accumulated counts and persist it.
kneser_ney = nltk.KneserNeyProbDist(freq_dist)
cPickle.dump(kneser_ney, dict_bi_gram_saver)
Пример #11
0
 def __init__(self, *args):
     """Initialise the base class with ``*args`` and fit a Kneser-Ney model.

     NOTE(review): ``self.ngrams`` is presumably set by the base-class
     initialiser -- confirm against the parent class.
     """
     super(KneserNeyModel, self).__init__(*args)
     # Kneser-Ney distribution over whatever ``self.ngrams`` holds once the
     # base initialiser has run.
     self.model = nltk.KneserNeyProbDist(self.ngrams)
Пример #12
0
# Unigram statistics of the training corpus.
freq_1gram = nltk.FreqDist(train_corpus)
len_brown = len(train_corpus)
vocab = len(set(train_corpus))

# Every trigram of the corpus, padded on both sides with <s> / </s>.
trigram = list(
    ngrams(train_corpus,
           3,
           pad_left=True,
           pad_right=True,
           left_pad_symbol='<s>',
           right_pad_symbol="</s>"))

# Each trigram (w1, w2, w3) re-shaped as ((w1, w2), w3): a two-word condition
# paired with the word that follows it.
trigrams_as_bigrams = [((w1, w2), w3) for w1, w2, w3 in trigram]
cfreq_3gram = nltk.ConditionalFreqDist(trigrams_as_bigrams)
cprob_3gram = nltk.KneserNeyProbDist(nltk.FreqDist(trigram))


def trigram_prob(w1, w2, w3):
    """Return the probability ``cprob_3gram`` assigns to ``(w1, w2, w3)``."""
    candidate = (w1, w2, w3)
    return cprob_3gram.prob(candidate)


def entropy(n, text):
    """Return the mean of ``logprob(token, context)`` over ``text``.

    ``text`` (a list of tokens) is wrapped in one ``"<s>"`` and one
    ``"</s>"``.  Starting at index ``n - 1`` of the wrapped list, every token
    is scored with the ``n - 1`` tokens preceding it as context, and the
    scores are averaged over the number of scored positions.

    NOTE(review): the sign and log base of the result depend on ``logprob``,
    which is defined elsewhere -- confirm.
    """
    padded = ["<s>"] + text + ["</s>"]
    scored_positions = range(n - 1, len(padded))
    total = sum(
        (logprob(padded[pos], padded[pos - n + 1:pos])
         for pos in scored_positions),
        0.0,
    )
    return total / float(len(padded) - (n - 1))
Пример #13
0
# Drop the training trigrams listed in `remove`.
for k in remove:
    del trigram_freq[k]

# Flatten the per-sentence token lists of the dev set into one token stream.
dev_sentences_tokenized_trigram_flattened = []
for sublist in dev_sent_tokenized_trigram:
    dev_sentences_tokenized_trigram_flattened.extend(sublist)

trigrams_dev = list(nltk.trigrams(dev_sentences_tokenized_trigram_flattened))
trigram_dev_freq = nltk.FreqDist(trigrams_dev)

# Discard dev trigrams whose last element is one of the start markers.
remove = [k for k in trigram_dev_freq if k[2] in ('start1', 'start2')]
for k in remove:
    del trigram_dev_freq[k]

# Expression statements whose results are discarded; kept because they still
# execute (and could raise) exactly as before.
len(trigrams_dev)
len(dev_sentences_tokenized_trigram_flattened)

sum_prob = 0
trigram_cnt = 0
g = 0  # number of dev trigrams the model assigns probability 0
kn_tri = nltk.KneserNeyProbDist(trigram_freq)
kn_tri.samples()
kn_tri.max()

for itm in trigrams_dev:
    itm_prob = kn_tri.prob(itm)
    if itm_prob == 0:
        # A zero probability has no logarithm: count it instead of adding it.
        g += 1
    else:
        sum_prob += math.log2(itm_prob)
    trigram_cnt += 1

# Average negative log2-probability over all dev trigrams (zero-probability
# ones contribute nothing to the sum but are counted in the denominator).
HC = -sum_prob / trigram_cnt
perpl = math.pow(2, HC)

print("Cross Entropy: {0:.3f}".format(HC))
print("perplexity: {0:.3f}".format(perpl))
print("g: {0:.3f}".format(g))
g / len(trigrams_dev)
    knownCPPFile = open(knownCpp)
    knownCPPString = ""
    for line in knownCPPFile:
        knownCPPString += line

    # print(knownCPPString)
    knownCPPGram = ngrams(knownCPPString.split(' '), 3)
    knownCPPHashFreq = nltk.FreqDist(knownCPPGram)

    # cppMaxGram = max(knownCPPHashFreq, key=knownCPPHashFreq.get)
    # print(cppMaxGram, knownCPPHashFreq[cppMaxGram])

    #############################################################################################
    # Section 2: to calculate trigram Probability
    #############################################################################################
    kneserJava = nltk.KneserNeyProbDist(knownJavaHashFreq)
    kneserCPP = nltk.KneserNeyProbDist(knownCPPHashFreq)

    kneserJavaHash = convertProbListToHash(kneserJava)
    kneserCPPHash = convertProbListToHash(kneserCPP)

    cpp = 0
    java = 0
    totalCppWithTag = 0
    totalJavaWithTag = 0
    totalJavaTags = 0
    totalCppTags = 0
    totalEval = 0

    resultsFile = open('Results.txt', 'a')
    codeFile = open('Code.txt', 'a')