Example No. 1
# assumes the usual NLTK imports, e.g.:
#   from nltk import sent_tokenize, word_tokenize, ngrams
#   from nltk.probability import ConditionalFreqDist, ConditionalProbDist, LaplaceProbDist
# plus module-level constants CORPORA (list of file paths), N_VAL (n-gram order) and N_MINUS1 (N_VAL - 1)
def train_model():
    """Create ngram model from Project Gutenberg texts"""
    text = ''
    for corpus in CORPORA:
        with open(corpus, 'r') as file_:
            text += file_.read().replace('\n', ' ')  # use a space so words at line breaks are not fused

    sents = sent_tokenize(text.lower())
    tokens = []
    # add START and END markers around each sentence
    for sent in sents:
        sent = 'START ' + sent + ' END'
        tokens += word_tokenize(sent)

    ngrams_ = tuple(ngrams(tokens, N_VAL))

    # bigram frequency distribution
    bi_cfdist = ConditionalFreqDist((ngram[0], ngram[:2]) for ngram in ngrams_)

    # bigram probability distribution
    bi_cpdist = ConditionalProbDist(bi_cfdist, LaplaceProbDist)

    # conditional frequency distribution
    cfdist = ConditionalFreqDist(
        (ngram[:N_MINUS1], ngram) for ngram in ngrams_)

    # conditional probability
    cpdist = ConditionalProbDist(cfdist, LaplaceProbDist)

    return bi_cpdist, cpdist
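
A quick usage sketch of the two distributions returned above (assuming N_VAL = 3, so each condition of cpdist is a 2-token tuple; the example words are purely illustrative and must actually occur in the training corpora):

bi_cpdist, cpdist = train_model()

# probability of the bigram ('of', 'the') given the single-token condition 'of'
print(bi_cpdist['of'].prob(('of', 'the')))

# probability of a full trigram given its 2-token context
print(cpdist[('of', 'the')].prob(('of', 'the', 'people')))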
Example No. 2
    def train_supervised(self, labelled_sequences, **kwargs):
        """
        Supervised training maximising the joint probability of the symbol and
        state sequences. This is done via collecting frequencies of
        transitions between states, symbol observations while within each
        state and which states start a sentence. These frequency distributions
        are then normalised into probability estimates, which can be
        smoothed if desired.

        @return: the trained model
        @rtype: HiddenMarkovModelTagger
        @param labelled_sequences: the training data, a set of
            labelled sequences of observations
        @type labelled_sequences: list
        @param kwargs: may include an 'estimator' parameter, a function taking
            a C{FreqDist} and a number of bins and returning a C{ProbDistI};
            otherwise a MLE estimate is used
        """

        # default to the MLE estimate
        estimator = kwargs.get('estimator')
        if estimator is None:
            estimator = lambda fdist, bins: MLEProbDist(fdist)

        # count occurrences of starting states, transitions out of each state
        # and output symbols observed in each state
        starting = FreqDist()
        transitions = ConditionalFreqDist()
        outputs = ConditionalFreqDist()
        for sequence in labelled_sequences:
            lasts = None
            for token in sequence:
                state = token[_TAG]
                symbol = token[_TEXT]
                if lasts is None:
                    starting.inc(state)  # FreqDist.inc() is the old NLTK 2 API; in NLTK 3+ this would be starting[state] += 1
                else:
                    transitions[lasts].inc(state)
                outputs[state].inc(symbol)
                lasts = state

                # update the state and symbol lists
                if state not in self._states:
                    self._states.append(state)
                if symbol not in self._symbols:
                    self._symbols.append(symbol)

        # create probability distributions (with smoothing)
        N = len(self._states)
        pi = estimator(starting, N)
        A = ConditionalProbDist(transitions, estimator, False, N)
        B = ConditionalProbDist(outputs, estimator, False, len(self._symbols))
                               
        return HiddenMarkovModelTagger(self._symbols, self._states, A, B, pi)
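
For context, a minimal sketch of how a tagger is typically trained through this method via NLTK's HiddenMarkovModelTrainer, passing a smoothed estimator instead of the MLE default (the treebank slice and the Lidstone gamma are illustrative; the corpus requires nltk.download('treebank')):

from nltk.corpus import treebank
from nltk.tag.hmm import HiddenMarkovModelTrainer
from nltk.probability import LidstoneProbDist

train_data = treebank.tagged_sents()[:3000]   # a list of [(word, tag), ...] sentences
trainer = HiddenMarkovModelTrainer()
tagger = trainer.train_supervised(
    train_data,
    estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))  # smoothed instead of plain MLE
print(tagger.tag("Today the markets were quiet .".split()))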
Example No. 3
    def train(self):
        """ Construct the conditional frequencies and probabilities """
        #extract tags from sentences

        tags = [tag for (_, tag) in self.tagged_sents]
        self.replaceUnique()
        self.emission_frequencies = ConditionalFreqDist(
            [tup[::-1] for tup in self.tagged_sents])
        self.tagset_size = len(self.emission_frequencies.conditions())

        # emission - probability of a word given a tag
        # e.g. the probability that a VB is realised as 'race'
        self.emission_probabilities = ConditionalProbDist(
            self.emission_frequencies, MLEProbDist)
        self.transition_frequencies = ConditionalFreqDist(bigrams(tags))
        self.transition_probabilities = ConditionalProbDist(
            self.transition_frequencies, MLEProbDist)
        self.word_tag_frequencies = ConditionalFreqDist(self.tagged_sents)
Example No. 4
def language_model(collection):
    from nltk import ConditionalProbDist
    from nltk import ConditionalFreqDist
    from nltk import bigrams
    from nltk import MLEProbDist
    words = tokenize_collection(collection)
    freq_model = ConditionalFreqDist(bigrams(words))
    prob_model = ConditionalProbDist(freq_model, MLEProbDist)
    return prob_model
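
A minimal sketch of how the returned bigram model can be queried (tokenize_collection is project-specific, so a plain whitespace split stands in for it here):

from nltk import bigrams, ConditionalFreqDist, ConditionalProbDist, MLEProbDist

words = "the cat sat on the mat . the cat ran .".split()
freq_model = ConditionalFreqDist(bigrams(words))
prob_model = ConditionalProbDist(freq_model, MLEProbDist)

print(prob_model['the'].prob('cat'))        # P(cat | the) = 2/3 under MLE
print(list(prob_model['the'].samples()))    # words observed after 'the'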
Example No. 5
    def __init__(self):
        """Initializes the del_probs and ins_probs variables to empty MLE probability distributions,
        and the sub_probs to an empty conditional probability distribution."""
        # an MLE probability distribution representing how likely each character is to be deleted
        self.del_probs = MLEProbDist(FreqDist())
        # an MLE probability distribution representing how likely each character is to be inserted
        self.ins_probs = MLEProbDist(FreqDist())
        # a conditional probability distribution representing how likely a given character is to be
        # replaced by another character
        self.sub_probs = ConditionalProbDist(ConditionalFreqDist(), MLEProbDist)
Example No. 6
    def __init__(self):
        """
        on MLEProbDist
        The maximum likelihood estimate for the probability distribution of the experiment used to generate
        a frequency distribution. The “maximum likelihood estimate” approximates the probability of each sample
        as the frequency of that sample in the frequency distribution.
        """
        with open(connCompsJSON, 'r') as s:
            source = load(s)

        print('Creating the ligature model from: {}'.format(connCompsJSON))

        _bigrams = toNGrams(source.values(), isClean=True)
        _trigrams = [((first, sec), third) for first, sec, third in toNGrams(
            source.values(), n=3, isClean=True)]

        # Conditional Frequency distributions
        self.cfdBigrams = ConditionalFreqDist(_bigrams)
        self.cfdTrigrams = ConditionalFreqDist(_trigrams)

        # Conditional Probability distributions
        self.cpdBigrams = ConditionalProbDist(self.cfdBigrams, MLEProbDist)
        self.cpdTrigrams = ConditionalProbDist(self.cfdTrigrams, MLEProbDist)
        del _bigrams, _trigrams
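
To make the MLE point from the docstring concrete, here is a small self-contained sketch using the same ((first, second), third) shaping as _trigrams above, but on made-up character data:

from nltk import trigrams
from nltk.probability import ConditionalFreqDist, ConditionalProbDist, MLEProbDist

chars = list("abracadabra")
pairs = [((first, second), third) for first, second, third in trigrams(chars)]

cfd = ConditionalFreqDist(pairs)
cpd = ConditionalProbDist(cfd, MLEProbDist)

# MLE: P(third | first, second) = count(first, second, third) / count(first, second)
print(cpd[('a', 'b')].prob('r'))   # ('a', 'b') is always followed by 'r' here, so 1.0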
Example No. 7
    def build_bi_con_prob_dist(self, sentence_list):
        """
        Returns a conditional probability distribution for
        the bigrams in a list of sentences
        """
        bgrams = [bigram for sublist in
                  [bigrams(['<s>'] + word_tokenize(sent.lower()))
                   for sent in sentence_list]
                  for bigram in sublist]

        bi_cfreq_dist = ConditionalFreqDist(bgrams)
        bi_cprob_dist = ConditionalProbDist(bi_cfreq_dist, MLEProbDist)

        return bi_cprob_dist
Example No. 8
    def train_costs(self, alignments):
        """Given a list of character alignments, uses it to estimate the likelihood of different types of errors."""
        # find all of the deletions, insertions, and substitutions in the alignment list
        deletions = []
        insertions = []
        substitutions = []
        for alignment in alignments:
            fromChar = alignment[0]
            toChar = alignment[1]
            if ((fromChar == toChar) or (fromChar != '%' and toChar != '%')):
                substitutions.append(alignment)
            elif fromChar == '%':
                insertions.append(toChar)
            else:  # toChar == '%'
                deletions.append(fromChar)

        # use the results above to update the probability distributions in del_probs, ins_probs, and sub_probs
        self.del_probs = MLEProbDist(FreqDist(deletions))
        self.ins_probs = MLEProbDist(FreqDist(insertions))
        self.sub_probs = ConditionalProbDist(
            ConditionalFreqDist([(pair[0], pair[1])
                                 for pair in substitutions]), MLEProbDist)
        return
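
A self-contained sketch of the same estimation on a toy alignment list, where '%' marks the empty character as assumed above (the alignment pairs are made up):

from nltk.probability import FreqDist, ConditionalFreqDist, ConditionalProbDist, MLEProbDist

# (fromChar, toChar) pairs; '%' on the left is an insertion, '%' on the right a deletion
alignments = [('a', 'a'), ('a', 'e'), ('%', 'h'), ('t', '%'), ('a', 'e')]

deletions = [f for f, t in alignments if t == '%']
insertions = [t for f, t in alignments if f == '%']
substitutions = [(f, t) for f, t in alignments if f != '%' and t != '%']

del_probs = MLEProbDist(FreqDist(deletions))
ins_probs = MLEProbDist(FreqDist(insertions))
sub_probs = ConditionalProbDist(ConditionalFreqDist(substitutions), MLEProbDist)

print(sub_probs['a'].prob('e'))   # 'a' aligned to 'e' in 2 of its 3 substitution pairs -> 2/3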
Example No. 9
    def __init__(self, corpus, n):
        # corpus: the training corpus for the tagger, formatted as
        #         [[('Hello', 'NNP'), ('world', 'NN'), ('!', '.')], [...], ...]
        # n: the n of the n-gram language model

        # The POS-tagging task is defined as:
        # 1. transition: an n-gram model over the tags
        # 2. emission: P(word | tag)
        # 3. initial distribution: P('START') = 1.0

        # preprocess the corpus: add start and end markers to every sentence
        brown_tags_words = []
        for sent in corpus:
            brown_tags_words.append(('START', 'START'))
            brown_tags_words.extend([(tag[:2], word) for word, tag in sent])
            brown_tags_words.append(('END', 'END'))

        # estimate the emission model from the corpus - conditional frequency counts
        cfd_tagwords = ConditionalFreqDist(brown_tags_words)
        # P(word | tag): condition = tag, sample = word
        cpd_tagwords = ConditionalProbDist(cfd_tagwords, MLEProbDist)
        emission = {
            tag:
            {word: cpd_tagwords[tag].prob(word)
             for word in cfd_tagwords[tag]}
            for tag in cpd_tagwords
        }

        # estimate the transition model from the corpus - delegated to the n-gram model
        tags = [[tag for _, tag in sent] for sent in corpus]
        transition = Transition(ngram(tags, n))

        # initial distribution - every sentence begins with START, with probability 1.0
        initial_distribution = {('START', ): 1.0}

        # construct the POS tagger via the HMM base class
        HMM.__init__(self, initial_distribution, transition, emission, n)
Example No. 10
# assumes: import nltk
#          from nltk import FreqDist, MLEProbDist, ConditionalFreqDist, ConditionalProbDist
def build_language_models(corpus_words):
    unigram = FreqDist(corpus_words)
    unigram_prob = MLEProbDist(unigram)
    bigram = ConditionalFreqDist(nltk.bigrams(corpus_words))
    bigram_prob = ConditionalProbDist(bigram, MLEProbDist)

    def lm_1(words):
        p = 1.0
        for w in words:
            p = p * unigram_prob.prob(w)
        return p

    def lm_2(words):
        p = 1.0
        previous_word = None
        for w in words:
            if previous_word is None:
                p *= unigram_prob.prob(w)
            else:
                p *= bigram_prob[previous_word].prob(w)
            previous_word = w
        return p

    return lm_1, lm_2
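
A quick usage sketch, assuming build_language_models above is in scope together with import nltk and the FreqDist / MLEProbDist / ConditionalFreqDist / ConditionalProbDist imports it relies on:

corpus_words = "the cat sat on the mat".split()
lm_1, lm_2 = build_language_models(corpus_words)

print(lm_1(['the', 'cat']))   # unigram model: P(the) * P(cat) = (2/6) * (1/6)
print(lm_2(['the', 'cat']))   # bigram model:  P(the) * P(cat | the) = (2/6) * (1/2)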
Example No. 11
class HMMTagger(object):
    global START_TAG
    START_TAG = "<s>"
    global END_TAG
    END_TAG = "</s>"
    global UNK
    UNK = "UNK"

    def __init__(self, training_sents, n=2, smoothing=None):
        self.n = n
        self.smoothing = smoothing
        self.tagged_sents = self.addStartAndEndMarkers(
            training_sents)  # this takes a lot of time
        self.train()  # this takes almost 4 seconds

    def train(self):
        """ Construct the conditional frequencies and probabilities """
        #extract tags from sentences

        tags = [tag for (_, tag) in self.tagged_sents]
        self.replaceUnique()
        self.emission_frequencies = ConditionalFreqDist(
            [tup[::-1] for tup in self.tagged_sents])
        self.tagset_size = len(self.emission_frequencies.conditions())

        # emission - probability of a word given a tag
        # e.g. the probability that a VB is realised as 'race'
        self.emission_probabilities = ConditionalProbDist(
            self.emission_frequencies, MLEProbDist)
        self.transition_frequencies = ConditionalFreqDist(bigrams(tags))
        self.transition_probabilities = ConditionalProbDist(
            self.transition_frequencies, MLEProbDist)
        self.word_tag_frequencies = ConditionalFreqDist(self.tagged_sents)

    def replaceUnique(self):
        """ Replaces unique words with the UNK label """
        word_frequencies = FreqDist([word for (word, _) in self.tagged_sents])
        self.lexicon_size = len(word_frequencies)
        hap = set(word_frequencies.hapaxes())
        res = [(UNK, tag) if word in hap else (word, tag)
               for (word, tag) in self.tagged_sents]
        self.tagged_sents = res

    def addStartAndEndMarkers(self, training_sents):
        """ returns a flat list of tokens """
        res = []
        for sent in training_sents:
            res += [(START_TAG, START_TAG)]
            res += sent
            res += [(END_TAG, END_TAG)]
        return res

    def get_transition_probability(self, prev_tag, tag):
        """ Returns probability of prev_tag being followed by tag.
		 Performs smoothing if specified in the command line."""
        if self.smoothing == "LAP":
            prev_tag_count = self.transition_frequencies[prev_tag].N()
            bigram_count = self.transition_frequencies[prev_tag].freq(
                tag) * prev_tag_count
            return (bigram_count + 1) / (1.0 * prev_tag_count +
                                         self.lexicon_size)
        else:
            return self.transition_probabilities[prev_tag].prob(tag)

    def viterbi_col(self, word, prev=None):
        """ General algorithm for a viterbi table column.
		This is only called once for every word. """
        vit = {}
        back = {}
        for tag in self.word_tag_frequencies[word].keys():
            if tag != START_TAG:
                if prev:

                    best_prev_tag = self.get_prev_tag(tag, prev, word)
                    transition_prob = self.get_transition_probability(
                        best_prev_tag, tag)
                    vit[tag] = (prev[best_prev_tag] * transition_prob *
                                self.emission_probabilities[tag].prob(word))
                    back[tag] = best_prev_tag

                else:
                    transition_prob = self.get_transition_probability(
                        START_TAG, tag)
                    vit[tag] = transition_prob * self.emission_probabilities[
                        tag].prob(word)
                    back[tag] = START_TAG

        return (vit, back)

    def viterbi(self, words_to_tag):
        """ Viterbi algorithm """
        # res[i] maps each state q to the probability of the best path that reaches q
        # after scanning the input up to position i
        res = []
        backpointers = []  # a list of dicts of back pointers, one per input position
        for wordindex in range(len(words_to_tag)):
            current_word = words_to_tag[wordindex]
            if self.is_unknown(current_word):
                current_word = UNK
            if wordindex == 0:
                vit, back = self.viterbi_col(current_word)
            else:
                vit, back = self.viterbi_col(current_word, res[-1])

            res.append(vit)
            backpointers.append(back)

        prev = res[-1]
        backpointers.reverse()
        return self.construct_solution(backpointers, prev)

    def is_unknown(self, word):
        """ Checks if the word is unknown """
        for tag in set(self.emission_probabilities.conditions()):
            pr = self.emission_probabilities[tag]
            if pr.prob(word) > 0:
                return False
        return True

    def construct_solution(self, back, prev):
        """ Constructs solution by following the back pointers on a ready viterbi table """
        current_best_tag = self.get_prev_tag(END_TAG, prev)
        best_seq = [END_TAG, current_best_tag]
        for p in back:
            to_append = p[current_best_tag]
            best_seq.append(to_append)
            current_best_tag = p[current_best_tag]
        best_seq.reverse()
        return best_seq

    def get_prev_tag(self, tag, prev, curr_word=None):
        """ Finds a previous tag A for the current tag B s.t. the probability of AB was the highest
		for the current word.
		Called for every word and every tag """
        best_prev = next(iter(prev))  # start from an arbitrary tag so we never return None
        best_prob = 0.0
        for prevtag in prev.keys():
            # find the maximum probability
            prob = prev[prevtag] * self.transition_probabilities[prevtag].prob(
                tag)

            if curr_word:
                prob *= self.emission_probabilities[tag].prob(curr_word)

            if prob > best_prob:
                best_prob = prob
                best_prev = prevtag

        return best_prev

    def tag_sents(self, test_sents):
        """Tag the given text sentence by sentence"""
        res = []
        for sent in test_sents:
            res.append(self.viterbi(sent)[1:-1])  # remove start and end tags
        return res
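
A minimal usage sketch for the class above, assuming it and its NLTK imports are in scope; the Brown news slice and the sample sentence are illustrative (the corpus requires nltk.download('brown')):

from nltk.corpus import brown

training_sents = brown.tagged_sents(categories='news')[:2000]
tagger = HMMTagger(training_sents, n=2, smoothing="LAP")

# tag_sents takes a list of token lists and returns one tag sequence per sentence
print(tagger.tag_sents([['The', 'jury', 'said', 'the', 'election', 'was', 'conducted']]))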
Example No. 12
    def run(self):
        cfd = ConditionalFreqDist(
            (tuple(self.data_set[i:i + self.n - 1]), self.data_set[i + self.n - 1])
            for i in range(len(self.data_set) - self.n + 1))
        lidstone_estimator = lambda fd: LidstoneProbDist(fd, self.gamma, fd.B() + 1)
        cpd = ConditionalProbDist(cfd, lidstone_estimator)
        self.model = cpd
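
The same sliding-window (context, next-item) construction and Lidstone estimator, as a self-contained sketch outside the class (n = 3 and gamma = 0.1 are illustrative):

from nltk.probability import ConditionalFreqDist, ConditionalProbDist, LidstoneProbDist

data_set = list("mississippi")
n, gamma = 3, 0.1

cfd = ConditionalFreqDist(
    (tuple(data_set[i:i + n - 1]), data_set[i + n - 1])
    for i in range(len(data_set) - n + 1))
cpd = ConditionalProbDist(cfd, lambda fd: LidstoneProbDist(fd, gamma, fd.B() + 1))

print(cpd[('s', 's')].prob('i'))   # both 'ss' contexts are followed by 'i'; smoothed to 2.1/2.2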
Example No. 13
    def freq2prob(self, freq_dist):
        num_bins = max([freq_dist[w].B() for w in freq_dist] + [1])
        prob = ConditionalProbDist(freq_dist, LaplaceProbDist, num_bins)
        return prob
    return tags_words


'''
test and train without UNK tag
'''
tags_words_train = add_start_end(0, 8700)
tags_words_test = add_start_end(8701, 9201)
words_train = ([w for (_, w) in tags_words_train])
words_test = ([w for (_, w) in tags_words_test])
tags_train = ([t for (t, _) in tags_words_train])
tags_test = ([t for (t, _) in tags_words_test])
distinct_tags = set(tags_train)
# calculating transition probability
cfd_tags = ConditionalFreqDist(nltk.bigrams(tags_train))
cpd_tags = ConditionalProbDist(cfd_tags, MLEProbDist)
# calculating observation likelihood
cfd_tagwords = ConditionalFreqDist(tags_words_train)
cpd_tagwords = ConditionalProbDist(cfd_tagwords, MLEProbDist)
backpointer = find_tag_for_sentences(words_test)
accuracy_without_UNK_tag = calculate_accuracy(tags_test, backpointer)
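
For orientation, a tiny self-contained sketch of how cpd_tags and cpd_tagwords are typically combined when scoring a candidate tag for a word (toy data rather than the actual training split):

import nltk
from nltk import ConditionalFreqDist, ConditionalProbDist, MLEProbDist

toy = [('START', 'START'), ('DT', 'the'), ('NN', 'dog'), ('VB', 'runs'), ('END', 'END')]
toy_cpd_tags = ConditionalProbDist(ConditionalFreqDist(nltk.bigrams([t for t, _ in toy])), MLEProbDist)
toy_cpd_tagwords = ConditionalProbDist(ConditionalFreqDist(toy), MLEProbDist)

# score of tagging 'dog' as NN immediately after a DT: transition * emission
print(toy_cpd_tags['DT'].prob('NN') * toy_cpd_tagwords['NN'].prob('dog'))   # 1.0 * 1.0 on this toy data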
'''
test and train with UNK-CAP tag
'''
tags_words_train = add_start_end(0, 8700)
tags_words_test = add_start_end(8701, 9201)
tags_words_train = replace_with_UNKCAP(tags_words_train)
tags_words_test = replace_with_UNKCAP(tags_words_test)
words_train = ([w for (_, w) in tags_words_train])
words_test = ([w for (_, w) in tags_words_test])
tags_train = ([t for (t, _) in tags_words_train])