예제 #1
0
 def probabilityDis(self):
     """Build Witten-Bell smoothed transition and emission tables.

     For every tag in ``self.uTags`` the parallel sequences ``self.tags``
     and ``self.words`` are scanned. The word at each position carrying
     that tag is collected, together with the tag at the following
     position (the final position has no successor, so it contributes a
     word only).

     Returns:
         A ``(tagsDis, wordsDis)`` tuple of dicts keyed by tag, where
         ``tagsDis[tag]`` is built from the following tags and
         ``wordsDis[tag]`` from the words seen with that tag.
     """
     tagsDis, wordsDis = {}, {}
     for tag in self.uTags:
         final = len(self.tags) - 1
         # Positions in the corpus that carry the current tag.
         hits = [i for i, seen in enumerate(self.tags) if tag == seen]
         emitted = [self.words[i] for i in hits]
         followers = [self.tags[i + 1] for i in hits if i < final]
         tagsDis[tag] = WittenBellProbDist(FreqDist(followers), bins=1e5)
         wordsDis[tag] = WittenBellProbDist(FreqDist(emitted), bins=1e5)
     return tagsDis, wordsDis
def transition_using_witten_bell_smoothing(tags_bigram):
    """Return Witten-Bell smoothed next-tag distributions keyed by first tag.

    Args:
        tags_bigram: iterable of ``(tag1, tag2)`` pairs. It is consumed
            exactly once, so a one-shot iterator (such as the generator
            returned by ``nltk.bigrams``) works as well as a list.

    Returns:
        A dict mapping every distinct ``tag1`` to
        ``WittenBellProbDist(FreqDist(<tag2 values following tag1>), bins=1e5)``.
    """
    # Group successors in a single pass rather than rescanning the whole
    # bigram collection once per distinct tag.
    followers = {}
    for tag1, tag2 in tags_bigram:
        followers.setdefault(tag1, []).append(tag2)

    return {
        tag1: WittenBellProbDist(FreqDist(next_tags), bins=1e5)
        for tag1, next_tags in followers.items()
    }
예제 #3
0
def build_emis(tagset, wrd_tag_pairs):
    """Build Witten-Bell smoothed emission distributions for each tag.

    Args:
        tagset: iterable of tags to produce a distribution for.
        wrd_tag_pairs: iterable of ``(word, tag)`` pairs. It is read only
            once, so a generator is acceptable.

    Returns:
        A dict mapping each tag in ``tagset`` to
        ``WittenBellProbDist(FreqDist(<words seen with that tag>), bins=1e5)``.
        A tag that never occurs in ``wrd_tag_pairs`` gets a distribution
        built from an empty ``FreqDist``.
    """
    # One pass over the corpus instead of one pass per tag.
    words_by_tag = {}
    for word, tag in wrd_tag_pairs:
        words_by_tag.setdefault(tag, []).append(word)

    smoothed = {}
    for tag in tagset:
        counts = FreqDist(words_by_tag.get(tag, []))
        smoothed[tag] = WittenBellProbDist(counts, bins=1e5)

    return smoothed
예제 #4
0
def build_trans(tagset, tbigrams):
    """Build Witten-Bell smoothed transition distributions for each tag.

    Args:
        tagset: iterable of tags to produce a distribution for.
        tbigrams: iterable of ``(origin_tag, target_tag)`` pairs. It is
            read only once, so the generator returned by ``nltk.bigrams``
            can be passed directly.

    Returns:
        A dict mapping each tag in ``tagset`` to
        ``WittenBellProbDist(FreqDist(<tags that followed it>), bins=1e5)``.
        A tag that never appears as an origin gets a distribution built
        from an empty ``FreqDist``.
    """
    # One pass over the bigrams instead of one pass per tag.
    targets_by_origin = {}
    for origin, target in tbigrams:
        targets_by_origin.setdefault(origin, []).append(target)

    smoothed = {}
    for tag in tagset:
        counts = FreqDist(targets_by_origin.get(tag, []))
        smoothed[tag] = WittenBellProbDist(counts, bins=1e5)

    return smoothed
예제 #5
0
    def calculate_transition_prob_for_POS_tags(self):
        """Fill ``self.transition_prob_for_tag`` with smoothed next-tag tables.

        The sequence returned by ``self.reformat_sentences()`` is turned into
        bigrams once; for every tag in ``self.tag_type_set`` the second
        elements of the bigrams whose first element equals that tag are
        counted and wrapped in ``WittenBellProbDist(..., bins=1e6)``.

        NOTE(review): presumably ``reformat_sentences()`` yields a flat tag
        sequence -- confirm against its definition.
        """
        tags_in_sentence = self.reformat_sentences()

        # Build the bigram stream a single time and group successors by their
        # preceding tag, instead of regenerating and rescanning it per tag.
        followers = {}
        for prev_tag, next_tag in nltk.bigrams(tags_in_sentence):
            followers.setdefault(prev_tag, []).append(next_tag)

        for tag in self.tag_type_set:
            next_tags = followers.get(tag, [])
            self.transition_prob_for_tag[tag] = WittenBellProbDist(FreqDist(next_tags), bins=1e6)
예제 #6
0
def getTransitionProb(sm, sents, tagset):
    """Estimate next-tag distributions for every tag in ``tagset``.

    Usage: P(nextTag|prevTag) = transitionProb[prevTag].prob(nextTag)

    Args:
        sm: smoothing selector. ``"no"`` uses ``LidstoneProbDist`` with
            gamma 0, ``"laplace"`` uses ``LidstoneProbDist`` with gamma 1,
            ``"goodturing"`` uses ``SimpleGoodTuringProbDist``; any other
            value falls back to ``WittenBellProbDist``.
        sents: iterable of sentences, each a sequence of ``(word, tag)``.
        tagset: iterable of tags to build a distribution for.

    Returns:
        A dict mapping each tag to its probability distribution.
    """
    # Collect tag bigrams sentence by sentence so that no bigram spans a
    # sentence boundary.
    transition = []
    for s in sents:
        transition.extend(ngrams([t for (w, t) in s], 2))

    def smooth(counts):
        # Wrap a frequency distribution with the estimator selected by ``sm``.
        if sm == "no":
            return LidstoneProbDist(counts, 0, bins=1e5)
        if sm == "laplace":
            return LidstoneProbDist(counts, 1, bins=1e5)
        if sm == "goodturing":
            return SimpleGoodTuringProbDist(counts, bins=1e5)
        return WittenBellProbDist(counts, bins=1e5)

    transitionProb = {}
    for tag in tagset:
        followers = [nxt for (prev, nxt) in transition if prev == tag]
        transitionProb[tag] = smooth(FreqDist(followers))
    return transitionProb
def emission_using_witten_bell_smoothing(word_tag_tuple):
    """Return Witten-Bell smoothed word distributions keyed by tag.

    Args:
        word_tag_tuple: iterable of ``(word, tag)`` pairs. It is consumed
            exactly once, so a generator is acceptable.

    Returns:
        A dict mapping every distinct tag to
        ``WittenBellProbDist(FreqDist(<words seen with that tag>), bins=1e5)``.
    """
    # Group words by tag in a single pass rather than rescanning the whole
    # collection once per distinct tag.
    words_by_tag = {}
    for word, tag in word_tag_tuple:
        words_by_tag.setdefault(tag, []).append(word)

    return {
        tag: WittenBellProbDist(FreqDist(words), bins=1e5)
        for tag, words in words_by_tag.items()
    }
예제 #8
0
 def createEmissionProbabilities(self):
     """Populate ``self.emissionProbability`` with per-tag word distributions.

     ``self.emitted`` is read as an iterable of ``(word, tag)`` pairs. For
     every tag in ``self.uniqueTags`` the words paired with that tag are
     counted and wrapped in ``WittenBellProbDist(..., bins=1e5)``. A tag
     with no matching pair gets a distribution over an empty ``FreqDist``.
     """
     # Single pass over the emissions instead of one full scan per tag.
     words_by_tag = {}
     for word, tag in self.emitted:
         words_by_tag.setdefault(tag, []).append(word)

     self.emissionProbability = {
         tag: WittenBellProbDist(FreqDist(words_by_tag.get(tag, [])), bins=1e5)
         for tag in self.uniqueTags
     }
예제 #9
0
 def createTransitionProbabilities(self):
     """Populate ``self.transitionProbability`` with per-tag next-tag tables.

     ``self.tagsTupples`` is read as a collection of ``(tag, following)``
     pairs. For every tag in ``self.uniqueTags`` the second elements of the
     pairs whose first element equals that tag are counted and wrapped in
     ``WittenBellProbDist(..., bins=1e5)``.
     """
     self.transitionProbability = {
         tag: WittenBellProbDist(
             FreqDist([following for (prev, following) in self.tagsTupples if prev == tag]),
             bins=1e5,
         )
         for tag in self.uniqueTags
     }
예제 #10
0
    def setProbDistributions(self):
        """Build Witten-Bell smoothed emission and transition tables.

        For every tag in ``self.uniqueTags`` the parallel sequences
        ``self.tags`` and ``self.words`` are scanned over indices
        ``0 .. len(self.tags) - 2``. The word at each matching index is
        collected, and the tag at the next index is collected when the
        index is below ``len(self.tags) - 2``.

        NOTE(review): the final position is therefore never counted as an
        emission, and the transition into the final tag is never counted.
        Confirm whether the last entry is a sentinel or this is an
        off-by-one.

        Returns:
            A ``(word_dist, tag_dist)`` tuple of dicts keyed by tag.
        """
        word_dist, tag_dist = {}, {}

        for t in self.uniqueTags:
            total = len(self.tags)
            # Indices (excluding the last one) whose tag equals ``t``.
            hits = [i for i in range(total - 1) if self.tags[i] == t]
            emitted = [self.words[i] for i in hits]
            followers = [self.tags[i + 1] for i in hits if i < (total - 2)]
            tag_dist[t] = WittenBellProbDist(FreqDist(followers), bins=1e5)
            word_dist[t] = WittenBellProbDist(FreqDist(emitted), bins=1e5)

        return word_dist, tag_dist
예제 #11
0
    def getEmissionProb(self, sents, tagset):
        """Estimate Witten-Bell smoothed word distributions per tag.

        Usage: P(word|tag) = emissionProb[tag].prob(word)

        Args:
            sents: iterable of sentences, each a sequence of ``(word, tag)``.
            tagset: iterable of tags to build a distribution for.

        Returns:
            A dict mapping each tag to
            ``WittenBellProbDist(FreqDist(<lower-cased words>), bins=1e5)``.
        """
        # Lower-case every word so upper- and lower-case forms share counts.
        emission = [(w.lower(), t) for s in sents for (w, t) in s]

        emissionProb = {}
        for tag in tagset:
            counts = FreqDist([w for (w, t) in emission if t == tag])
            emissionProb[tag] = WittenBellProbDist(counts, bins=1e5)
        return emissionProb
예제 #12
0
    def getTransitionProb(self, sents, tagset):
        """Estimate Witten-Bell smoothed next-tag distributions per tag.

        Usage: P(nextTag|prevTag) = transitionProb[prevTag].prob(nextTag)

        Args:
            sents: iterable of sentences, each a sequence of ``(word, tag)``.
            tagset: iterable of tags to build a distribution for.

        Returns:
            A dict mapping each tag to
            ``WittenBellProbDist(FreqDist(<following tags>), bins=1e5)``.
        """
        # Bigrams are produced per sentence, so none spans a sentence boundary.
        transition = []
        for s in sents:
            transition.extend(ngrams([t for (w, t) in s], 2))

        return {
            tag: WittenBellProbDist(
                FreqDist([nxt for (prev, nxt) in transition if prev == tag]),
                bins=1e5,
            )
            for tag in tagset
        }
예제 #13
0
def transition_probabilities():
    '''
    Compute Witten-Bell smoothed transition distributions for the tags/states.

    Reads the module-level ``taglist`` (an iterable of tag sequences), forms
    the bigrams of each sequence, and stores one distribution per leading tag
    in the module-level ``smoothed_transition_prob`` mapping.

    Returns the set of tags that occur as the first element of some bigram.
    '''
    bigrams = [pair for tg in taglist for pair in ngrams(tg, 2)]
    tag_s = {first for (first, _) in bigrams}

    for tg in tag_s:
        followers = [nxt for (prev, nxt) in bigrams if prev == tg]
        smoothed_transition_prob[tg] = WittenBellProbDist(FreqDist(followers), bins=1e5)
    return tag_s
예제 #14
0
def getEmissionProb(sm, sents, tagset):
    """Estimate word distributions for every tag in ``tagset``.

    Usage: P(word|tag) = emissionProb[tag].prob(word)

    Args:
        sm: smoothing selector. ``"no"`` uses ``LidstoneProbDist`` with
            gamma 0, ``"laplace"`` uses ``LidstoneProbDist`` with gamma 1,
            ``"goodturing"`` uses ``SimpleGoodTuringProbDist``; any other
            value falls back to ``WittenBellProbDist``.
        sents: iterable of sentences, each a sequence of ``(word, tag)``.
        tagset: iterable of tags to build a distribution for.

    Returns:
        A dict mapping each tag to its probability distribution over
        lower-cased words.
    """
    emission = [(w.lower(), t) for s in sents for (w, t) in s]

    def smooth(counts):
        # Wrap a frequency distribution with the estimator selected by ``sm``.
        if sm == "no":
            return LidstoneProbDist(counts, 0, bins=1e5)
        if sm == "laplace":
            return LidstoneProbDist(counts, 1, bins=1e5)
        if sm == "goodturing":
            return SimpleGoodTuringProbDist(counts, bins=1e5)
        return WittenBellProbDist(counts, bins=1e5)

    emissionProb = {}
    for tag in tagset:
        matching = [w for (w, t) in emission if t == tag]
        emissionProb[tag] = smooth(FreqDist(matching))
    return emissionProb
def get_word_emission_probabilities(training_set: list,
                                    training_tags: list,
                                    bins: int = 10000000) -> dict:
    """
    Build the word emission table: for each POS tag, a Witten-Bell smoothed
    distribution over the words observed with that tag.
    :param training_set: the training sentences, each a sequence of (word, tag) pairs.
    :param training_tags: the tags in the training set (duplicates are removed
        with ``remove_list_duplicates`` before use).
    :param bins: the ``bins`` value handed to ``WittenBellProbDist``.
    :return: a dict mapping each tag to its smoothed word distribution.
    """
    return {
        tag: WittenBellProbDist(
            FreqDist([
                w
                for sentence in training_set
                for (w, t) in sentence
                if t == tag
            ]),
            bins=bins,
        )
        for tag in remove_list_duplicates(training_tags)
    }
def get_tag_transition_probabilities(training_set: list,
                                     training_tags: list,
                                     bins: int = 100) -> dict:
    """
    Build the tag transition table: for each POS tag, a Witten-Bell smoothed
    distribution over the tags that immediately follow it within a sentence.
    :param training_set: the training sentences, each a sequence of (word, tag) pairs.
    :param training_tags: the tags in the training set (duplicates are removed
        with ``remove_list_duplicates`` before use).
    :param bins: the ``bins`` value handed to ``WittenBellProbDist``.
    :return: a dict mapping each tag to its smoothed next-tag distribution.
    """
    tags = remove_list_duplicates(training_tags)

    return {
        tag: WittenBellProbDist(
            FreqDist([
                sentence[i + 1][1]
                for sentence in training_set
                # Stop one short of the end: the last token has no successor.
                for i in range(0, len(sentence) - 1)
                if tag == sentence[i][1]
            ]),
            bins=bins,
        )
        for tag in tags
    }
def smoothed_observation_likelihood(tag):
    """Return ``WittenBellProbDist`` over ``cfd_tagwords[tag]`` with bins=1e5.

    ``cfd_tagwords`` is a module-level mapping indexed by tag; the entry for
    ``tag`` is passed to the estimator as its frequency distribution.
    """
    word_counts = cfd_tagwords[tag]
    return WittenBellProbDist(word_counts, bins=1e5)
예제 #18
0
    def calculate_emission_prob(self):
        """Fill ``self.emission_prob`` with a smoothed word table per tag.

        ``self.unified_sentences`` is read as a collection of ``(word, tag)``
        pairs. For every tag in ``self.tag_type_set`` the words paired with
        that tag are counted and wrapped in ``WittenBellProbDist(..., bins=1e6)``.
        """
        for tag in self.tag_type_set:
            emitted = []
            for (w, t) in self.unified_sentences:
                if t == tag:
                    emitted.append(w)
            self.emission_prob[tag] = WittenBellProbDist(FreqDist(emitted), bins=1e6)
예제 #19
0
# ----------------------------------------- Relative Frequencies with Smoothing ----------------------------------------
# Builds `smoothed_transition_prob`, a dict keyed by the SECOND tag (t2) of each
# tag pair; each value is a Witten-Bell smoothed distribution over the FIRST
# tags (t1) seen before it. A lookup therefore reads
# smoothed_transition_prob[t2].prob(t1).
# NOTE(review): this conditions on the following tag rather than the preceding
# one -- confirm that this direction is what the decoder expects.
print_heading("Smoothing & Training")

smoothed_transition_prob = {}

# Extend a copy of the tag pairs with the sentence-start bigrams so that
# start-of-sentence pairs are counted too (tag_pairs itself is left unmodified).
# The '</s>' entry is handled separately further down.
tag_pairs_with_start_words = tag_pairs.copy()
for pair in start_word_bigrams:
    tag_pairs_with_start_words.append(pair)

# Smoothing transition probabilities
# all transitions (q1, q2) are stored in tag_pairs
# The key set comes from tag_pairs only; the counts come from the extended
# list, so the start-of-sentence pairs contribute to each distribution.
tags2 = set([t2 for (_, t2) in tag_pairs])
for tag in tags2:
    # First tags of every pair whose second tag is `tag`.
    tags1 = [t1 for (t1, t2) in tag_pairs_with_start_words if t2 == tag]
    smoothed_transition_prob[tag] = WittenBellProbDist(FreqDist(tags1),
                                                       bins=1e5)

# Smoothed probabilities for </s> as t2
# end_words is read as (word, tag) pairs; only the tags are counted.
end_tags = [t for (w, t) in end_words]
smoothed_transition_prob['</s>'] = WittenBellProbDist(FreqDist(end_tags),
                                                      bins=1e5)

# Spot checks: each label is written (t1, t2), the lookup is [t2].prob(t1).
print("Transition probability test:")
print("alpha('DET', 'NOUN') " +
      str(smoothed_transition_prob['NOUN'].prob('DET')))
print("alpha('DET', 'DET') " +
      str(smoothed_transition_prob['DET'].prob('DET')))
print("alpha('.', '</s>') " + str(smoothed_transition_prob['</s>'].prob('.')))
print("alpha('<s>', 'DET') " +
      str(smoothed_transition_prob['DET'].prob('<s>')))
예제 #20
0
from nltk import FreqDist, WittenBellProbDist

# Toy (tag, word) observations used to demonstrate Witten-Bell smoothing.
emissions = [
    ('N', 'apple'),
    ('N', 'apple'),
    ('N', 'banana'),
    ('Adj', 'apple'),
    ('V', 'sing'),
]

# One smoothed word distribution per distinct tag.
smoothed = {}
tags = {t for (t, _) in emissions}
for tag in tags:
    words = [w for (t, w) in emissions if t == tag]
    counts = FreqDist(words)
    smoothed[tag] = WittenBellProbDist(counts, bins=1e5)

# Seen words ('apple', 'banana', 'sing') versus words never observed with the
# tag ('peach', 'walk').
print('probability of N -> apple is', smoothed['N'].prob('apple'))
print('probability of N -> banana is', smoothed['N'].prob('banana'))
print('probability of N -> peach is', smoothed['N'].prob('peach'))
print('probability of V -> sing is', smoothed['V'].prob('sing'))
print('probability of V -> walk is', smoothed['V'].prob('walk'))
예제 #21
0
def emission_probabilities():
    """Store a Witten-Bell smoothed word distribution for every tag.

    Reads the module-level ``word_and_tag`` collection of ``(word, tag)``
    pairs and writes one ``WittenBellProbDist(..., bins=1e5)`` per distinct
    tag into the module-level ``smoothed_emission_prob`` mapping. Words are
    counted exactly as they appear (no case folding is applied here).
    """
    tag_s = {t for (_, t) in word_and_tag}
    for tg in tag_s:
        emitted = []
        for (w, t) in word_and_tag:
            if t == tg:
                emitted.append(w)
        smoothed_emission_prob[tg] = WittenBellProbDist(FreqDist(emitted), bins=1e5)
def smoothed_transition_probability(tag):
    """Return ``WittenBellProbDist`` over ``cfd_tags[tag]`` with bins=1e5.

    ``cfd_tags`` is a module-level mapping indexed by tag; the entry for
    ``tag`` is passed to the estimator as its frequency distribution.
    """
    tag_counts = cfd_tags[tag]
    return WittenBellProbDist(tag_counts, bins=1e5)