def probabilityDis(self):
    # per-tag transition (next-tag) and emission (word) distributions, Witten-Bell smoothed
    tagsDis = {}
    wordsDis = {}
    for tag in self.uTags:
        tags = []
        words = []
        for i in range(len(self.tags)):
            if tag == self.tags[i]:
                words.append(self.words[i])
                if i < (len(self.tags) - 1):
                    tags.append(self.tags[i + 1])
        tagsDis[tag] = WittenBellProbDist(FreqDist(tags), bins=1e5)
        wordsDis[tag] = WittenBellProbDist(FreqDist(words), bins=1e5)
    return tagsDis, wordsDis
def transition_using_witten_bell_smoothing(tags_bigram):
    smoothed = {}
    distinct_tags = set([t for (t, _) in tags_bigram])
    for tag1 in distinct_tags:
        tag2 = [t2 for (t1, t2) in tags_bigram if t1 == tag1]
        smoothed[tag1] = WittenBellProbDist(FreqDist(tag2), bins=1e5)
    return smoothed
def build_emis(tagset, wrd_tag_pairs):
    smoothed = {}
    for tag in tagset:
        ws = [w for (w, t) in wrd_tag_pairs if t == tag]
        smoothed[tag] = WittenBellProbDist(FreqDist(ws), bins=1e5)
    return smoothed
def build_trans(tagset, tbigrams):
    smoothed = {}
    for tag in tagset:
        transitions = [t for (o, t) in tbigrams if o == tag]
        smoothed[tag] = WittenBellProbDist(FreqDist(transitions), bins=1e5)
    return smoothed
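# A minimal usage sketch for build_emis/build_trans above, showing the expected input shapes;
# the toy wrd_tag_pairs, tag_seq and tbigrams values are illustrative placeholders, not part of the original code.
from nltk import FreqDist, WittenBellProbDist, bigrams

wrd_tag_pairs = [('the', 'DET'), ('cat', 'NOUN'), ('sleeps', 'VERB')]
tag_seq = [t for (_, t) in wrd_tag_pairs]
tbigrams = list(bigrams(tag_seq))          # [('DET', 'NOUN'), ('NOUN', 'VERB')]
tagset = set(tag_seq)

emis = build_emis(tagset, wrd_tag_pairs)   # P(word | tag)
trans = build_trans(tagset, tbigrams)      # P(next tag | tag)
print(emis['NOUN'].prob('cat'), trans['DET'].prob('NOUN'))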
def calculate_transition_prob_for_POS_tags(self):
    tags_in_sentence = self.reformat_sentences()
    # materialise the tag bigrams once so they can be reused for every tag
    tag_bigrams = list(nltk.bigrams(tags_in_sentence))
    for tag in self.tag_type_set:
        next_tags = [t2 for (t1, t2) in tag_bigrams if t1 == tag]
        self.transition_prob_for_tag[tag] = WittenBellProbDist(FreqDist(next_tags), bins=1e6)
def getTransitionProb(sm, sents, tagset):
    # P(nextTag|prevTag) = transitionProb[prevTag].prob(nextTag)
    transition = []
    for s in sents:
        tags = [t for (w, t) in s]
        transition += ngrams(tags, 2)
    transitionProb = {}
    for tag in tagset:
        nextTags = [nextTag for (prevTag, nextTag) in transition if prevTag == tag]
        if sm == "no":
            transitionProb[tag] = LidstoneProbDist(FreqDist(nextTags), 0, bins=1e5)
        elif sm == "laplace":
            transitionProb[tag] = LidstoneProbDist(FreqDist(nextTags), 1, bins=1e5)
        elif sm == "goodturing":
            transitionProb[tag] = SimpleGoodTuringProbDist(FreqDist(nextTags), bins=1e5)
        else:
            transitionProb[tag] = WittenBellProbDist(FreqDist(nextTags), bins=1e5)
    return transitionProb
def emission_using_witten_bell_smoothing(word_tag_tuple):
    smoothed = {}
    tags = set([t for (_, t) in word_tag_tuple])
    for tag in tags:
        words = [w for (w, t) in word_tag_tuple if t == tag]
        smoothed[tag] = WittenBellProbDist(FreqDist(words), bins=1e5)
    return smoothed
def createEmissionProbabilities(self):
    smoothed = {}
    for tag in self.uniqueTags:
        words = [w for (w, t) in self.emitted if t == tag]
        smoothed[tag] = WittenBellProbDist(FreqDist(words), bins=1e5)
    self.emissionProbability = smoothed
def createTransitionProbabilities(self):
    smoothed = {}
    # the "words" here are in fact the tags that follow tag t
    for tag in self.uniqueTags:
        words = [w for (t, w) in self.tagsTupples if t == tag]
        smoothed[tag] = WittenBellProbDist(FreqDist(words), bins=1e5)
    self.transitionProbability = smoothed
def setProbDistributions(self):
    tag_dist = {}
    word_dist = {}
    lenOfTags = len(self.tags)
    for t in self.uniqueTags:
        tagList = []
        wordList = []
        for i in range(lenOfTags):
            if self.tags[i] == t:
                wordList.append(self.words[i])
                if i < (lenOfTags - 1):
                    tagList.append(self.tags[i + 1])
        tag_dist[t] = WittenBellProbDist(FreqDist(tagList), bins=1e5)
        word_dist[t] = WittenBellProbDist(FreqDist(wordList), bins=1e5)
    return word_dist, tag_dist
def getEmissionProb(self, sents, tagset):
    # P(word|tag) = emissionProb[tag].prob(word)
    emission = []
    for s in sents:
        # lowercase every word so case variants are treated the same way
        emission += [(w.lower(), t) for (w, t) in s]
    emissionProb = {}
    for tag in tagset:
        words = [w for (w, t) in emission if t == tag]
        emissionProb[tag] = WittenBellProbDist(FreqDist(words), bins=1e5)
    return emissionProb
def getTransitionProb(self, sents, tagset):
    # P(nextTag|prevTag) = transitionProb[prevTag].prob(nextTag)
    transition = []
    for s in sents:
        tags = [t for (w, t) in s]
        transition += ngrams(tags, 2)
    transitionProb = {}
    for tag in tagset:
        nextTags = [nextTag for (prevTag, nextTag) in transition if prevTag == tag]
        transitionProb[tag] = WittenBellProbDist(FreqDist(nextTags), bins=1e5)
    return transitionProb
def transition_probabilities():
    '''
    Computes the Witten-Bell-smoothed transition probabilities for the tags/states
    and returns the tag set.
    '''
    bigrams = []
    for tg in taglist:
        bigrams += ngrams(tg, 2)
    tag_s = set([t for (t, _) in bigrams])
    for tg in tag_s:
        current_tag = [ct for (t, ct) in bigrams if t == tg]
        smoothed_transition_prob[tg] = WittenBellProbDist(FreqDist(current_tag), bins=1e5)
    return tag_s
def getEmissionProb(sm, sents, tagset):
    # P(word|tag) = emissionProb[tag].prob(word)
    emission = []
    for s in sents:
        emission += [(w.lower(), t) for (w, t) in s]
    emissionProb = {}
    for tag in tagset:
        words = [w for (w, t) in emission if t == tag]
        if sm == "no":
            emissionProb[tag] = LidstoneProbDist(FreqDist(words), 0, bins=1e5)
        elif sm == "laplace":
            emissionProb[tag] = LidstoneProbDist(FreqDist(words), 1, bins=1e5)
        elif sm == "goodturing":
            emissionProb[tag] = SimpleGoodTuringProbDist(FreqDist(words), bins=1e5)
        else:
            emissionProb[tag] = WittenBellProbDist(FreqDist(words), bins=1e5)
    return emissionProb
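# A minimal usage sketch for the sm-parameterised getTransitionProb/getEmissionProb pair above,
# assuming both functions are in scope at module level; the two-sentence toy corpus and the
# tagset are illustrative placeholders, not part of the original code.
from nltk import FreqDist
from nltk.util import ngrams
from nltk.probability import LidstoneProbDist, SimpleGoodTuringProbDist, WittenBellProbDist

sents = [[('The', 'DET'), ('cat', 'NOUN'), ('sleeps', 'VERB')],
         [('A', 'DET'), ('dog', 'NOUN'), ('barks', 'VERB')]]
tagset = {'DET', 'NOUN', 'VERB'}

trans = getTransitionProb('wittenbell', sents, tagset)   # any unrecognised string selects Witten-Bell
emis = getEmissionProb('laplace', sents, tagset)
print(trans['DET'].prob('NOUN'))   # P(NOUN | DET)
print(emis['NOUN'].prob('cat'))    # P('cat' | NOUN); words are lowercased before counting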
def get_word_emission_probabilities(training_set: list, training_tags: list, bins: int = 10000000) -> dict:
    """
    Calculates the word emission probabilities (between the current POS tag and the current word)
    by smoothing their frequency distribution using the Witten-Bell probability distribution estimate.

    :param training_tags: the tags in the training set.
    :param training_set: the training set tokens.
    :param bins: the number of bins to use when smoothing.
    :return: the smoothed word emission probabilities (probability of a word given its POS tag).
    """
    emission_probabilities = dict()
    tags = remove_list_duplicates(training_tags)
    for tag in tags:
        words = list()
        for sentence in training_set:
            for (w, t) in sentence:
                if t == tag:
                    words.append(w)
        emission_probabilities[tag] = WittenBellProbDist(FreqDist(words), bins=bins)
    return emission_probabilities
def get_tag_transition_probabilities(training_set: list, training_tags: list, bins: int = 100) -> dict:
    """
    Calculates the tag transition probabilities (between the current POS tag and the next POS tag)
    by smoothing their frequency distribution using the Witten-Bell probability distribution estimate.

    :param training_tags: the tags in the training set.
    :param training_set: the training set tokens.
    :param bins: the number of bins to use when smoothing.
    :return: the smoothed tag transition probabilities (probability of a POS tag following another POS tag).
    """
    transition_probabilities = dict()
    tags = remove_list_duplicates(training_tags)
    for tag in tags:
        next_tags = list()
        for sentence in training_set:
            for i in range(0, len(sentence) - 1):
                if tag == sentence[i][1]:
                    next_tags.append(sentence[i + 1][1])
        transition_probabilities[tag] = WittenBellProbDist(FreqDist(next_tags), bins=bins)
    return transition_probabilities
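# A minimal usage sketch for the two typed functions above. remove_list_duplicates is not shown
# in these snippets, so an order-preserving stand-in is assumed here; the toy training_set is
# illustrative only.
from nltk import FreqDist, WittenBellProbDist

def remove_list_duplicates(items: list) -> list:
    # assumed helper: drop duplicates while preserving order
    return list(dict.fromkeys(items))

training_set = [[('The', 'DET'), ('cat', 'NOUN'), ('sleeps', 'VERB')],
                [('A', 'DET'), ('dog', 'NOUN'), ('barks', 'VERB')]]
training_tags = [t for sentence in training_set for (_, t) in sentence]

emission = get_word_emission_probabilities(training_set, training_tags)
transition = get_tag_transition_probabilities(training_set, training_tags)
print(emission['NOUN'].prob('cat'))      # P('cat' | NOUN)
print(transition['DET'].prob('NOUN'))    # P(NOUN | DET)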
def smoothed_observation_likelihood(tag):
    # cfd_tagwords[tag] is a frequency distribution of the words observed with this tag
    return WittenBellProbDist(cfd_tagwords[tag], bins=1e5)
def calculate_emission_prob(self):
    # P(word | tag), Witten-Bell smoothed, for every tag in the tag set
    for tag in self.tag_type_set:
        words = [w for (w, t) in self.unified_sentences if t == tag]
        self.emission_prob[tag] = WittenBellProbDist(FreqDist(words), bins=1e6)
# ----------------------------------------- Relative Frequencies with Smoothing ----------------------------------------
print_heading("Smoothing & Training")

smoothed_transition_prob = {}

# Preparing to also add smoothing for <s> and </s>
tag_pairs_with_start_words = tag_pairs.copy()
for pair in start_word_bigrams:
    tag_pairs_with_start_words.append(pair)

# Smoothing transition probabilities
# all transitions (q1, q2) are stored in tag_pairs
tags2 = set([t2 for (_, t2) in tag_pairs])
for tag in tags2:
    tags1 = [t1 for (t1, t2) in tag_pairs_with_start_words if t2 == tag]
    smoothed_transition_prob[tag] = WittenBellProbDist(FreqDist(tags1), bins=1e5)

# Smoothed probabilities for </s> as t2
end_tags = [t for (w, t) in end_words]
smoothed_transition_prob['</s>'] = WittenBellProbDist(FreqDist(end_tags), bins=1e5)

print("Transition probability test:")
print("alpha('DET', 'NOUN') " + str(smoothed_transition_prob['NOUN'].prob('DET')))
print("alpha('DET', 'DET') " + str(smoothed_transition_prob['DET'].prob('DET')))
print("alpha('.', '</s>') " + str(smoothed_transition_prob['</s>'].prob('.')))
print("alpha('<s>', 'DET') " + str(smoothed_transition_prob['DET'].prob('<s>')))
from nltk import FreqDist, WittenBellProbDist

emissions = [('N', 'apple'), ('N', 'apple'), ('N', 'banana'), ('Adj', 'apple'), ('V', 'sing')]
smoothed = {}
tags = set([t for (t, _) in emissions])
for tag in tags:
    words = [w for (t, w) in emissions if t == tag]
    smoothed[tag] = WittenBellProbDist(FreqDist(words), bins=1e5)
print('probability of N -> apple is', smoothed['N'].prob('apple'))
print('probability of N -> banana is', smoothed['N'].prob('banana'))
print('probability of N -> peach is', smoothed['N'].prob('peach'))
print('probability of V -> sing is', smoothed['V'].prob('sing'))
print('probability of V -> walk is', smoothed['V'].prob('walk'))
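# A short note on what the example above should print. With NLTK's Witten-Bell estimator, a seen
# word gets roughly c / (N + T) and an unseen word gets T / (Z * (N + T)), where N is the token
# count for the tag, T the number of distinct words seen with it, and Z = bins - T. For 'N' above
# (N=3, T=2) that gives about P(apple)=0.4, P(banana)=0.2 and P(peach)~4e-06; for 'V' (N=1, T=1)
# about P(sing)=0.5 and P(walk)~5e-06. The large bins value (1e5) spreads the reserved mass thinly,
# so unseen words receive a small but non-zero probability instead of zero.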
def emission_probabilities():
    tag_s = set([t for (_, t) in word_and_tag])
    for tg in tag_s:
        # lowercase every word so case variants share counts, for better accuracy
        words = [w.lower() for (w, t) in word_and_tag if t == tg]
        smoothed_emission_prob[tg] = WittenBellProbDist(FreqDist(words), bins=1e5)
def smoothed_transition_probability(tag):
    # cfd_tags[tag] is a frequency distribution over the tags paired with this tag in the transition counts
    return WittenBellProbDist(cfd_tags[tag], bins=1e5)
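# A sketch of how the cfd_tagwords / cfd_tags conditional frequency distributions used by the two
# helpers above might be built. The construction is an assumption inferred from their usage, and
# tagged_sents is a placeholder for the real training sentences.
from nltk import ConditionalFreqDist, bigrams

tagged_sents = [[('The', 'DET'), ('cat', 'NOUN'), ('sleeps', 'VERB')]]   # placeholder corpus

# tag -> words emitted with that tag
cfd_tagwords = ConditionalFreqDist((t, w) for sent in tagged_sents for (w, t) in sent)
# tag -> tags that follow it
cfd_tags = ConditionalFreqDist(pair for sent in tagged_sents
                               for pair in bigrams([t for (_, t) in sent]))

print(smoothed_observation_likelihood('NOUN').prob('cat'))
print(smoothed_transition_probability('DET').prob('NOUN'))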