def train_model():
    """Create ngram model from Project Gutenberg texts"""
    text = ''
    for corpus in CORPORA:
        with open(corpus, 'r') as file_:
            text += file_.read().replace('\n', '')

    sents = sent_tokenize(text.lower())

    tokens = []
    # append START and END tokens to each sentence
    for sent in sents:
        sent = 'START ' + sent + ' END'
        tokens += word_tokenize(sent)

    ngrams_ = tuple(ngrams(tokens, N_VAL))

    # bigram frequency distribution
    bi_cfdist = ConditionalFreqDist((ngram[0], ngram[:2]) for ngram in ngrams_)

    # bigram probability distribution
    bi_cpdist = ConditionalProbDist(bi_cfdist, LaplaceProbDist)

    # conditional frequency distribution
    cfdist = ConditionalFreqDist((ngram[:N_MINUS1], ngram) for ngram in ngrams_)

    # conditional probability
    cpdist = ConditionalProbDist(cfdist, LaplaceProbDist)

    return bi_cpdist, cpdist
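A minimal usage sketch, assuming CORPORA, N_VAL and N_MINUS1 are module-level constants and the nltk helpers (sent_tokenize, word_tokenize, ngrams, ConditionalFreqDist, ConditionalProbDist, LaplaceProbDist) are imported; the constant values and file name below are illustrative only:

# Illustrative only: the corpus path and n-gram order are assumptions.
CORPORA = ['austen-emma.txt']   # local Gutenberg text files (assumed)
N_VAL = 3                       # n-gram order (assumed)
N_MINUS1 = N_VAL - 1

bi_cpdist, cpdist = train_model()
# Laplace-smoothed probability of the bigram ('START', 'the') given left token 'START'
print(bi_cpdist['START'].prob(('START', 'the')))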
def train_supervised(self, labelled_sequences, **kwargs):
    """
    Supervised training maximising the joint probability of the symbol and
    state sequences. This is done via collecting frequencies of
    transitions between states, symbol observations while within each
    state and which states start a sentence. These frequency distributions
    are then normalised into probability estimates, which can be smoothed
    if desired.

    @return: the trained model
    @rtype: HiddenMarkovModelTagger
    @param labelled_sequences: the training data, a set of
        labelled sequences of observations
    @type labelled_sequences: list
    @param kwargs: may include an 'estimator' parameter, a function taking
        a C{FreqDist} and a number of bins and returning a C{ProbDistI};
        otherwise a MLE estimate is used
    """

    # default to the MLE estimate
    estimator = kwargs.get('estimator')
    if estimator is None:
        estimator = lambda fdist, bins: MLEProbDist(fdist)

    # count occurrences of starting states, transitions out of each state
    # and output symbols observed in each state
    starting = FreqDist()
    transitions = ConditionalFreqDist()
    outputs = ConditionalFreqDist()
    for sequence in labelled_sequences:
        lasts = None
        for token in sequence:
            state = token[_TAG]
            symbol = token[_TEXT]
            if lasts is None:
                starting.inc(state)
            else:
                transitions[lasts].inc(state)
            outputs[state].inc(symbol)
            lasts = state

            # update the state and symbol lists
            if state not in self._states:
                self._states.append(state)
            if symbol not in self._symbols:
                self._symbols.append(symbol)

    # create probability distributions (with smoothing)
    N = len(self._states)
    pi = estimator(starting, N)
    A = ConditionalProbDist(transitions, estimator, False, N)
    B = ConditionalProbDist(outputs, estimator, False, len(self._symbols))

    return HiddenMarkovModelTagger(self._symbols, self._states, A, B, pi)
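This appears to be NLTK's HiddenMarkovModelTrainer.train_supervised (an older release, given the FreqDist.inc calls). A hedged usage sketch against NLTK's public API, using the bundled treebank sample; the slice size is arbitrary:

# Illustrative only: train an HMM tagger on a small slice of the Penn Treebank sample.
from nltk.corpus import treebank
from nltk.tag.hmm import HiddenMarkovModelTrainer

trainer = HiddenMarkovModelTrainer()
tagger = trainer.train_supervised(treebank.tagged_sents()[:200])
print(tagger.tag(['the', 'cat', 'sat']))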
def train(self):
    """ Construct the conditional frequencies and probabilities """
    # extract tags from sentences
    tags = [tag for (_, tag) in self.tagged_sents]

    self.replaceUnique()

    self.emission_frequencies = ConditionalFreqDist(
        [tup[::-1] for tup in self.tagged_sents])
    self.tagset_size = len(self.emission_frequencies.conditions())

    # emission - probability that a certain tag is a certain word
    # e.g. probability that a VB is 'race'
    self.emission_probabilities = ConditionalProbDist(
        self.emission_frequencies, MLEProbDist)

    self.transition_frequencies = ConditionalFreqDist(bigrams(tags))
    self.transition_probabilities = ConditionalProbDist(
        self.transition_frequencies, MLEProbDist)

    self.word_tag_frequencies = ConditionalFreqDist(self.tagged_sents)
def language_model(collection):
    from nltk import ConditionalProbDist
    from nltk import ConditionalFreqDist
    from nltk import bigrams
    from nltk import MLEProbDist

    words = tokenize_collection(collection)
    freq_model = ConditionalFreqDist(bigrams(words))
    prob_model = ConditionalProbDist(freq_model, MLEProbDist)
    return prob_model
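A sketch of how the returned bigram model might be queried, assuming tokenize_collection flattens the collection into a single token list; the sample texts are invented:

# Illustrative only: query the MLE distribution P(next_word | word).
prob_model = language_model(["the cat sat on the mat", "the dog sat"])
print(prob_model['the'].prob('cat'))   # relative frequency of 'cat' after 'the'
print(list(prob_model['the'].samples()))   # words observed after 'the'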
def __init__(self):
    """Initializes the del_probs and ins_probs variables to empty MLE
    probability distributions, and the sub_probs to an empty conditional
    probability distribution."""

    # an MLE probability distribution representing how likely each character is to be deleted
    self.del_probs = MLEProbDist(FreqDist())
    # an MLE probability distribution representing how likely each character is to be inserted
    self.ins_probs = MLEProbDist(FreqDist())
    # a conditional probability distribution representing how likely a given
    # character is to be replaced by another character
    self.sub_probs = ConditionalProbDist(ConditionalFreqDist(), MLEProbDist)
def __init__(self):
    """
    on MLEProbDist
    The maximum likelihood estimate for the probability distribution of the
    experiment used to generate a frequency distribution. The “maximum
    likelihood estimate” approximates the probability of each sample as the
    frequency of that sample in the frequency distribution.
    """
    with open(connCompsJSON, 'r') as s:
        source = load(s)
    print('Creating the ligature model from: {}'.format(connCompsJSON))

    _bigrams = toNGrams(source.values(), isClean=True)
    _trigrams = [((first, sec), third)
                 for first, sec, third in toNGrams(source.values(), n=3, isClean=True)]

    # Conditional Frequency distributions
    self.cfdBigrams = ConditionalFreqDist(_bigrams)
    self.cfdTrigrams = ConditionalFreqDist(_trigrams)

    # Conditional Probability distributions
    self.cpdBigrams = ConditionalProbDist(self.cfdBigrams, MLEProbDist)
    self.cpdTrigrams = ConditionalProbDist(self.cfdTrigrams, MLEProbDist)

    del _bigrams, _trigrams
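The docstring's point, that the maximum likelihood estimate approximates each probability by its relative frequency, in a tiny worked example (independent of connCompsJSON; the counts are made up):

# Under MLE, P(sample) = count(sample) / N; for counts {a: 3, b: 1}:
from nltk.probability import FreqDist, MLEProbDist
fd = FreqDist({'a': 3, 'b': 1})
print(MLEProbDist(fd).prob('a'))   # 0.75 == 3 / 4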
def build_bi_con_prob_dist(self, sentence_list):
    """ Returns a conditional probability distribution for the bigrams in a
    list of sentences """
    bgrams = [bigram
              for sublist in [bigrams(['<s>'] + word_tokenize(sent.lower()))
                              for sent in sentence_list]
              for bigram in sublist]
    bi_cfreq_dist = ConditionalFreqDist(bgrams)
    bi_cprob_dist = ConditionalProbDist(bi_cfreq_dist, MLEProbDist)
    return bi_cprob_dist
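A possible call, assuming nltk's word_tokenize, bigrams, ConditionalFreqDist, ConditionalProbDist and MLEProbDist are in scope; 'builder' stands in for an instance of the (unnamed) enclosing class and the sentences are invented:

# Illustrative only.
dist = builder.build_bi_con_prob_dist(["The cat sat.", "The cat ran."])
print(dist['<s>'].prob('the'))   # both sentences start with 'the' -> 1.0
print(dist['the'].prob('cat'))   # 'cat' always follows 'the' here -> 1.0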
def train_costs(self, alignments):
    """Given a list of character alignments, uses it to estimate the
    likelihood of different types of errors."""

    # find all of the deletions, insertions, and substitutions in the alignment list
    deletions = []
    insertions = []
    substitutions = []
    for alignment in alignments:
        fromChar = alignment[0]
        toChar = alignment[1]
        if (fromChar == toChar) or (fromChar != '%' and toChar != '%'):
            substitutions.append(alignment)
        elif fromChar == '%':
            insertions.append(toChar)
        else:  # toChar == '%'
            deletions.append(fromChar)

    # use the results above to update the probability distributions in
    # del_probs, ins_probs, and sub_probs
    self.del_probs = MLEProbDist(FreqDist(deletions))
    self.ins_probs = MLEProbDist(FreqDist(insertions))
    self.sub_probs = ConditionalProbDist(
        ConditionalFreqDist([(pair[0], pair[1]) for pair in substitutions]),
        MLEProbDist)
    return
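A sketch of how the trained cost distributions might be queried afterwards; 'aligner' is a stand-in for an instance of the enclosing class, the '%' gap symbol follows the convention in train_costs, and the alignments are made up:

# Illustrative only: '%' marks a gap, so ('%', 'x') is an insertion of 'x'
# and ('x', '%') is a deletion of 'x'.
aligner.train_costs([('a', 'a'), ('a', 'e'), ('%', 'h'), ('t', '%')])
print(aligner.del_probs.prob('t'))        # how likely 't' is to be deleted
print(aligner.ins_probs.prob('h'))        # how likely 'h' is to be inserted
print(aligner.sub_probs['a'].prob('e'))   # how likely 'a' is replaced by 'e'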
def __init__(self, corpus, n):
    # corpus: the corpus used to train the tagger, in the format
    #   [[('Hello', 'NNP'), ('world', 'NN'), ('!', '.')], [...], ...]
    # n: the n in the n-gram language model
    #
    # The POS-tagging task is defined as:
    # 1. transition: an n-gram model over tags
    # 2. emission: P(word | pos)
    # 3. initial distribution: P('START') = 1.0

    # preprocess the corpus: add start and end markers to every sentence
    brown_tags_words = []
    for sent in corpus:
        brown_tags_words.append(('START', 'START'))
        brown_tags_words.extend([(tag[:2], word) for word, tag in sent])
        brown_tags_words.append(('END', 'END'))

    # estimate the emission from the corpus - conditional probabilities from counts
    cfd_tagwords = ConditionalFreqDist(brown_tags_words)
    # P(W = word | condition = pos)
    cpd_tagwords = ConditionalProbDist(cfd_tagwords, MLEProbDist)
    emission = {
        tag: {word: cpd_tagwords[tag].prob(word) for word in cfd_tagwords[tag]}
        for tag in cpd_tagwords
    }

    # estimate the transition from the corpus - via the n-gram model
    tags = [[tag for _, tag in sent] for sent in corpus]
    transition = Transition(ngram(tags, n))

    # define the initial distribution - sentences start with START with probability 1.0
    initial_distribution = {('START',): 1.0}

    # build the POS tagger
    HMM.__init__(self, initial_distribution, transition, emission, n)
def build_language_models(corpus_words):
    unigram = FreqDist(corpus_words)
    unigram_prob = MLEProbDist(unigram)
    bigram = ConditionalFreqDist(nltk.bigrams(corpus_words))
    bigram_prob = ConditionalProbDist(bigram, MLEProbDist)

    def lm_1(words):
        p = 1.0
        for w in words:
            p = p * unigram_prob.prob(w)
        return p

    def lm_2(words):
        p = 1.0
        previous_word = None
        for w in words:
            if previous_word is None:
                p *= unigram_prob.prob(w)
            else:
                p *= bigram_prob[previous_word].prob(w)
            previous_word = w
        return p

    return lm_1, lm_2
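Possible usage of the two returned scoring functions, assuming nltk plus FreqDist, MLEProbDist, ConditionalFreqDist and ConditionalProbDist are imported; the corpus and test phrase are invented:

# Illustrative only.
corpus_words = ['the', 'cat', 'sat', 'on', 'the', 'mat']
lm_1, lm_2 = build_language_models(corpus_words)
print(lm_1(['the', 'cat']))   # product of unigram probabilities
print(lm_2(['the', 'cat']))   # unigram for the first word, bigrams afterwards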
class HMMTagger(object):
    global START_TAG
    START_TAG = "<s>"
    global END_TAG
    END_TAG = "</s>"
    global UNK
    UNK = "UNK"

    def __init__(self, training_sents, n=2, smoothing=None):
        self.n = n
        self.smoothing = smoothing
        self.tagged_sents = self.addStartAndEndMarkers(
            training_sents)  # this takes a lot of time
        self.train()  # this takes almost 4 seconds

    def train(self):
        """ Construct the conditional frequencies and probabilities """
        # extract tags from sentences
        tags = [tag for (_, tag) in self.tagged_sents]

        self.replaceUnique()

        self.emission_frequencies = ConditionalFreqDist(
            [tup[::-1] for tup in self.tagged_sents])
        self.tagset_size = len(self.emission_frequencies.conditions())

        # emission - probability that a certain tag is a certain word
        # e.g. probability that a VB is 'race'
        self.emission_probabilities = ConditionalProbDist(
            self.emission_frequencies, MLEProbDist)

        self.transition_frequencies = ConditionalFreqDist(bigrams(tags))
        self.transition_probabilities = ConditionalProbDist(
            self.transition_frequencies, MLEProbDist)

        self.word_tag_frequencies = ConditionalFreqDist(self.tagged_sents)

    def replaceUnique(self):
        """ Replaces unique words with the UNK label """
        word_frequencies = FreqDist([word for (word, _) in self.tagged_sents])
        self.lexicon_size = len(word_frequencies)
        hap = set(word_frequencies.hapaxes())
        res = [(UNK, tag) if word in hap else (word, tag)
               for (word, tag) in self.tagged_sents]
        self.tagged_sents = res

    def addStartAndEndMarkers(self, training_sents):
        """ Returns a flat list of tokens """
        res = []
        for sent in training_sents:
            res += [(START_TAG, START_TAG)]
            res += sent
            res += [(END_TAG, END_TAG)]
        return res

    def get_transition_probability(self, prev_tag, tag):
        """ Returns probability of prev_tag being followed by tag.
        Performs smoothing if specified in the command line. """
        if self.smoothing == "LAP":
            prev_tag_count = self.transition_frequencies[prev_tag].N()
            bigram_count = self.transition_frequencies[prev_tag].freq(
                tag) * prev_tag_count
            return (bigram_count + 1) / (1.0 * prev_tag_count + self.lexicon_size)
        else:
            return self.transition_probabilities[prev_tag].prob(tag)

    def viterbi_col(self, word, prev=None):
        """ General algorithm for a viterbi table column.
        This is only called once for every word.
        """
        vit = {}
        back = {}
        for tag in self.word_tag_frequencies[word].keys():
            if tag != START_TAG:
                if prev:
                    best_prev_tag = self.get_prev_tag(tag, prev, word)
                    transition_prob = self.get_transition_probability(
                        best_prev_tag, tag)
                    vit[tag] = (prev[best_prev_tag] * transition_prob *
                                self.emission_probabilities[tag].prob(word))
                    back[tag] = best_prev_tag
                else:
                    transition_prob = self.get_transition_probability(
                        START_TAG, tag)
                    vit[tag] = (transition_prob *
                                self.emission_probabilities[tag].prob(word))
                    back[tag] = START_TAG
        return (vit, back)

    def viterbi(self, words_to_tag):
        """ Viterbi algorithm """
        # a list of dicts denoting the probability of the best path to get to
        # state q after scanning the input up to position i
        res = []
        backpointers = []  # a list of dicts
        for wordindex in range(len(words_to_tag)):
            current_word = words_to_tag[wordindex]
            if self.is_unknown(current_word):
                current_word = UNK

            if wordindex == 0:
                vit, back = self.viterbi_col(current_word)
            else:
                vit, back = self.viterbi_col(current_word, res[-1])

            res.append(vit)
            backpointers.append(back)

        prev = res[-1]
        backpointers.reverse()
        return self.construct_solution(backpointers, prev)

    def is_unknown(self, word):
        """ Checks if the word is unknown """
        for tag in set(self.emission_probabilities.conditions()):
            pr = self.emission_probabilities[tag]
            if pr.prob(word) > 0:
                return False
        return True

    def construct_solution(self, back, prev):
        """ Constructs the solution by following the back pointers on a
        completed viterbi table """
        current_best_tag = self.get_prev_tag(END_TAG, prev)
        best_seq = [END_TAG, current_best_tag]
        for p in back:
            to_append = p[current_best_tag]
            best_seq.append(to_append)
            current_best_tag = p[current_best_tag]
        best_seq.reverse()
        return best_seq

    def get_prev_tag(self, tag, prev, curr_word=None):
        """ Finds a previous tag A for the current tag B s.t. the probability
        of AB was the highest for the current word.
        Called for every word and every tag. """
        # assign at least something to avoid a None exception
        best_prev = next(iter(prev.keys()))
        best_prob = 0.0
        # find the maximum probability
        for prevtag in prev.keys():
            prob = prev[prevtag] * self.transition_probabilities[prevtag].prob(tag)
            if curr_word:
                prob *= self.emission_probabilities[tag].prob(curr_word)
            if prob > best_prob:
                best_prob = prob
                best_prev = prevtag
        return best_prev

    def tag_sents(self, test_sents):
        """Tag the given text sentence by sentence"""
        res = []
        for sent in test_sents:
            res.append(self.viterbi(sent)[1:-1])  # remove start and end tags
        return res
def run(self):
    cfd = ConditionalFreqDist(
        (tuple(self.data_set[i: i + self.n - 1]), self.data_set[i + self.n - 1])
        for i in range(len(self.data_set) - self.n + 1))
    lidstone_estimator = lambda fd: LidstoneProbDist(fd, self.gamma, fd.B() + 1)
    cpd = ConditionalProbDist(cfd, lidstone_estimator)
    self.model = cpd
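A sketch of how the Lidstone-smoothed model stored in self.model might be consulted, assuming data_set is a flat token list, n the n-gram order and gamma the additive constant; setting the attributes directly here only mimics whatever the real constructor does:

# Illustrative only: with n = 3, the condition is the (n-1)-token history.
trainer.n, trainer.gamma = 3, 0.1
trainer.data_set = ['a', 'b', 'c', 'a', 'b', 'd']
trainer.run()
print(trainer.model[('a', 'b')].prob('c'))   # Lidstone-smoothed P(c | a, b)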
def freq2prob(self, freq_dist):
    num_bins = max([freq_dist[w].B() for w in freq_dist] + [1])
    prob = ConditionalProbDist(freq_dist, LaplaceProbDist, num_bins)
    return prob
    return tags_words


''' test and train without UNK tag '''
tags_words_train = add_start_end(0, 8700)
tags_words_test = add_start_end(8701, 9201)

words_train = [w for (_, w) in tags_words_train]
words_test = [w for (_, w) in tags_words_test]
tags_train = [t for (t, _) in tags_words_train]
tags_test = [t for (t, _) in tags_words_test]
distinct_tags = set(tags_train)

# calculating transition probability
cfd_tags = ConditionalFreqDist(nltk.bigrams(tags_train))
cpd_tags = ConditionalProbDist(cfd_tags, MLEProbDist)

# calculating observation likelihood
cfd_tagwords = ConditionalFreqDist(tags_words_train)
cpd_tagwords = ConditionalProbDist(cfd_tagwords, MLEProbDist)

backpointer = find_tag_for_sentences(words_test)
accuracy_without_UNK_tag = calculate_accuracy(tags_test, backpointer)

''' test and train with UNK-CAP tag '''
tags_words_train = add_start_end(0, 8700)
tags_words_test = add_start_end(8701, 9201)

tags_words_train = replace_with_UNKCAP(tags_words_train)
tags_words_test = replace_with_UNKCAP(tags_words_test)

words_train = [w for (_, w) in tags_words_train]
words_test = [w for (_, w) in tags_words_test]
tags_train = [t for (t, _) in tags_words_train]