def __init__(self): """Initializes the del_probs and ins_probs variables to empty MLE probability distributions, and the sub_probs to an empty conditional probability distribution.""" self.del_probs = MLEProbDist( FreqDist() ) # a MLE probability distribution representing how likely each character is to be deleted self.ins_probs = MLEProbDist( FreqDist() ) # a MLE probability distribution representing how likely each character is to be inserted self.sub_probs = ConditionalProbDist( ConditionalFreqDist(), MLEProbDist ) # a Conditional Probability Distribution representing how likely a given character is to be replaced by another character
from nltk.probability import FreqDist, MLEProbDist
from nltk.util import bigrams

def recompute_cluster_dists(text, cluster_descr):
    # unigram distribution over clusters
    c_freqs = FreqDist()
    for c in text.clusters(cluster_descr):
        c_freqs.inc(c)
    c_dist = MLEProbDist(c_freqs)
    # bigram distribution over adjacent cluster pairs
    c_bi_freqs = FreqDist()
    for bi_c in bigrams(text.clusters(cluster_descr)):
        c_bi_freqs.inc(bi_c)
    c_bi_dist = MLEProbDist(c_bi_freqs)
    return c_dist, c_bi_dist
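Since text.clusters() is an API specific to the surrounding project, here is a minimal sketch of the same two distributions built from a plain list of made-up cluster labels, reusing the imports above:

labels = ['CV', 'V', 'CV', 'CVC', 'V', 'CV']
c_dist = MLEProbDist(FreqDist(labels))
c_bi_dist = MLEProbDist(FreqDist(bigrams(labels)))
print(c_dist.prob('CV'))            # 0.5: relative frequency of 'CV'
print(c_bi_dist.prob(('CV', 'V')))  # relative frequency of the ('CV', 'V') bigram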
def __init__(self, source, gen_func=lambda x: x):
    self.dictionary = Dictionary([gen_func(source)])
    self.gen_func = gen_func
    self.source = source
    self.word_freqs = FreqDist()
    for word in self.words():
        self.word_freqs.inc(word)
    self.word_dist = MLEProbDist(self.word_freqs)
def train_supervised(self, labelled_sequences, **kwargs):
    """
    Supervised training maximising the joint probability of the symbol and
    state sequences. This is done via collecting frequencies of
    transitions between states, symbol observations while within each
    state and which states start a sentence. These frequency distributions
    are then normalised into probability estimates, which can be smoothed
    if desired.

    @return: the trained model
    @rtype: HiddenMarkovModelTagger
    @param labelled_sequences: the training data, a set of
        labelled sequences of observations
    @type labelled_sequences: list
    @param kwargs: may include an 'estimator' parameter, a function taking
        a C{FreqDist} and a number of bins and returning a C{ProbDistI};
        otherwise a MLE estimate is used
    """

    # default to the MLE estimate
    estimator = kwargs.get('estimator')
    if estimator == None:
        estimator = lambda fdist, bins: MLEProbDist(fdist)

    # count occurrences of starting states, transitions out of each state
    # and output symbols observed in each state
    starting = FreqDist()
    transitions = ConditionalFreqDist()
    outputs = ConditionalFreqDist()
    for sequence in labelled_sequences:
        lasts = None
        for token in sequence:
            state = token[_TAG]
            symbol = token[_TEXT]
            if lasts == None:
                starting.inc(state)
            else:
                transitions[lasts].inc(state)
            outputs[state].inc(symbol)
            lasts = state

            # update the state and symbol lists
            if state not in self._states:
                self._states.append(state)
            if symbol not in self._symbols:
                self._symbols.append(symbol)

    # create probability distributions (with smoothing)
    N = len(self._states)
    pi = estimator(starting, N)
    A = ConditionalProbDist(transitions, estimator, False, N)
    B = ConditionalProbDist(outputs, estimator, False, len(self._symbols))

    return HiddenMarkovModelTagger(self._symbols, self._states, A, B, pi)
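This is the supervised trainer from NLTK's HMM module, so the 'estimator' keyword is the hook for smoothing. A usage sketch against the public API (the toy tagged sentences are made up for illustration; the Lidstone lambda follows the signature described in the docstring):

from nltk.probability import LidstoneProbDist
from nltk.tag.hmm import HiddenMarkovModelTrainer

# two toy tagged sentences in the (word, tag) token format consumed above
train_data = [[('the', 'DET'), ('dog', 'NN'), ('barks', 'VB')],
              [('a', 'DET'), ('cat', 'NN'), ('sleeps', 'VB')]]
trainer = HiddenMarkovModelTrainer()
model_mle = trainer.train_supervised(train_data)  # defaults to the MLE estimate
model_smoothed = trainer.train_supervised(
    train_data, estimator=lambda fdist, bins: LidstoneProbDist(fdist, 0.1, bins))
print(model_smoothed.tag(['the', 'cat', 'barks']))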
import nltk
from nltk.probability import (FreqDist, MLEProbDist,
                              ConditionalFreqDist, ConditionalProbDist)

def build_language_models(corpus_words):
    unigram = FreqDist(corpus_words)
    unigram_prob = MLEProbDist(unigram)
    bigram = ConditionalFreqDist(nltk.bigrams(corpus_words))
    bigram_prob = ConditionalProbDist(bigram, MLEProbDist)

    def lm_1(words):
        # unigram model: product of the individual word probabilities
        p = 1.0
        for w in words:
            p = p * unigram_prob.prob(w)
        return p

    def lm_2(words):
        # bigram model: unigram probability for the first word,
        # conditional probabilities for the rest
        p = 1.0
        previous_word = None
        for w in words:
            if previous_word is None:
                p *= unigram_prob.prob(w)
            else:
                p *= bigram_prob[previous_word].prob(w)
            previous_word = w
        return p

    return lm_1, lm_2
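A quick check of the two returned models on a toy corpus (the sentence is made up for illustration):

corpus_words = "the cat sat on the mat the cat slept".split()
lm_1, lm_2 = build_language_models(corpus_words)
print(lm_1(['the', 'cat']))  # P(the) * P(cat)
print(lm_2(['the', 'cat']))  # P(the) * P(cat | the)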
import numpy as np
from nltk.probability import (FreqDist, MLEProbDist,
                              ConditionalFreqDist, ConditionalProbDist)

class EditDistanceFinder():
    def __init__(self):
        """Initializes the del_probs and ins_probs variables to empty MLE
        probability distributions, and the sub_probs to an empty conditional
        probability distribution."""
        # an MLE probability distribution representing how likely each character is to be deleted
        self.del_probs = MLEProbDist(FreqDist())
        # an MLE probability distribution representing how likely each character is to be inserted
        self.ins_probs = MLEProbDist(FreqDist())
        # a conditional probability distribution representing how likely a given
        # character is to be replaced by another character
        self.sub_probs = ConditionalProbDist(ConditionalFreqDist(), MLEProbDist)

    def ins_cost(self, x):
        """Given a single character as input, returns a cost (between 0 and 1)
        of inserting that character."""
        ins_prob = self.ins_probs.prob(x)
        return float(1 - ins_prob)

    def del_cost(self, x):
        """Given a single character as input, returns a cost (between 0 and 1)
        of deleting that character."""
        del_prob = self.del_probs.prob(x)
        return float(1 - del_prob)

    def sub_cost(self, x, y):
        """Given two characters as input, returns a cost (between 0 and 2)
        of substituting the first character (x) with the second character (y)."""
        if x == y:
            return 0.0
        else:
            # note the order: probability of x being replaced by y
            return 2.0 * (1.0 - float(self.sub_probs[x].prob(y)))

    def align(self, start, end):
        """Given two words, returns a distance (as a float) and the corresponding
        character alignments (as a list of tuples of characters)."""
        numRows = len(start) + 1
        numColumns = len(end) + 1
        dptable = np.array(([[0] * numColumns] * numRows), dtype=object)
        # each cell in the dp table will consist of
        # (cost, char before modification, char after modification);
        # e.g. if the last action was to delete 'a' at a resulting cost of 10: (10, 'a', '%')
        # the table is filled bottom-up: row numRows-1 corresponds to the empty prefix of start

        # base cases
        dptable[numRows - 1, 0] = (0.0, '%', '%')
        ## fill in the bottom row (insertions only)
        for i in range(1, numColumns):
            char = end[i - 1]
            cost = dptable[numRows - 1, i - 1][0] + self.ins_cost(char)
            dptable[numRows - 1, i] = (cost, '%', char)
        ## fill in the first column (deletions only)
        for j in range(numRows - 2, -1, -1):
            char = start[numRows - j - 2]
            cost = dptable[j + 1, 0][0] + self.del_cost(char)
            dptable[j, 0] = (cost, char, '%')

        # fill in the rest of the table
        newStart = "%" + start
        newEnd = "%" + end
        for row in range(numRows - 2, -1, -1):
            for col in range(1, numColumns):
                sub_cost = dptable[row + 1][col - 1][0] + self.sub_cost(
                    newStart[len(newStart) - row - 1], newEnd[col])
                del_cost = dptable[row + 1][col][0] + self.del_cost(
                    newStart[len(newStart) - row - 1])
                ins_cost = dptable[row][col - 1][0] + self.ins_cost(newEnd[col])
                min_cost = min(sub_cost, del_cost, ins_cost)
                # find the move with the least cost and set fromChar and toChar accordingly
                if sub_cost == min_cost:
                    fromChar = newStart[len(newStart) - row - 1]
                    toChar = newEnd[col]
                elif del_cost == min_cost:
                    fromChar = newStart[len(newStart) - row - 1]
                    toChar = "%"
                elif ins_cost == min_cost:
                    fromChar = "%"
                    toChar = newEnd[col]
                dptable[row, col] = (min_cost, fromChar, toChar)

        # backtrace
        row = 0
        col = numColumns - 1
        path = []
        while (row != numRows - 1 or col != 0):
            fromChar = dptable[row][col][1]
            toChar = dptable[row][col][2]
            path.insert(0, (fromChar, toChar))
            # trace the last action and move to the prior cell
            ## if the prior move was a substitution
            if (fromChar == toChar) or (fromChar != '%' and toChar != '%'):
                row += 1
                col -= 1
            ## if the prior move was an insertion
            elif (fromChar == '%'):
                col -= 1
            ## if the prior move was a deletion
            else:
                row += 1
        return (dptable[0, numColumns - 1][0], path)

    def show_alignment(self, alignment):
        """Takes the alignments returned by align and prints them in a
        friendly way (the caller supplies an align result)."""
        string1 = [a[0] for a in alignment]
        string2 = [a[1] for a in alignment]
        print("String1:", ' '.join(string1))
        print("String2:", ' '.join(string2))
        return

    def train(self, file):
        """Given a file name, reads in the file and splits it into a list of
        tuples, e.g. [(misspelling1, correctspelling1), (misspelling2,
        correctspelling2), ...], then repeatedly calls train_alignments and
        train_costs until the model converges."""
        pairs = [(pair[0], pair[1]) for pair in [
            sentence.strip('\n').split(',')
            for sentence in open(file).readlines()
        ]]
        prior = None
        converged = False
        while not converged:
            print("Converging...")
            alignments = self.train_alignments(pairs)
            self.train_costs(alignments)
            # check for convergence
            if alignments == prior:
                converged = True
            prior = alignments
        return

    def train_alignments(self, misspellings):
        """Given a list of misspellings like the one returned by train, calls
        align on each of the (misspelling, correctspelling) pairs, and returns
        a single list with all of the character alignments from all of the
        pairs."""
        align_list = []
        for i in range(len(misspellings)):
            align_list += self.align(misspellings[i][0], misspellings[i][1])[1]
        return align_list

    def train_costs(self, alignments):
        """Given a list of character alignments, uses it to estimate the
        likelihood of different types of errors."""
        # find all of the deletions, insertions, and substitutions in the alignment list
        deletions = []
        insertions = []
        substitutions = []
        for alignment in alignments:
            fromChar = alignment[0]
            toChar = alignment[1]
            if ((fromChar == toChar) or (fromChar != '%' and toChar != '%')):
                substitutions.append(alignment)
            elif fromChar == '%':
                insertions.append(toChar)
            else:  # toChar == '%'
                deletions.append(fromChar)
        # use the results above to update the probability distributions in
        # del_probs, ins_probs, and sub_probs
        self.del_probs = MLEProbDist(FreqDist(deletions))
        self.ins_probs = MLEProbDist(FreqDist(insertions))
        self.sub_probs = ConditionalProbDist(
            ConditionalFreqDist([(pair[0], pair[1]) for pair in substitutions]),
            MLEProbDist)
        return
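With the full class in place, a hypothetical end-to-end run might look like the following; the training file name is an assumption (one misspelling,correction pair per line), and since '%' serves as the gap symbol, the input words must not contain it:

finder = EditDistanceFinder()
finder.train('misspellings.txt')  # hypothetical CSV: one "wrong,right" pair per line
dist, alignment = finder.align('cmoputer', 'computer')
print(dist)                       # weighted edit distance as a float
finder.show_alignment(alignment)  # prints the two aligned character strings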
dataRaw_tokens_nopunct = [
    word for word in word_tokenize(dataRaw) if re.search("\w", word)
]
# drop the 's' and '/s' sentence-boundary markers (a comprehension removes
# every occurrence safely, unlike calling remove() while iterating)
dataRaw_tokens_nopunct = [
    elem for elem in dataRaw_tokens_nopunct if elem not in ('s', '/s')
]
dataRaw_fdist = FreqDist(dataRaw_tokens_nopunct)
##xx = dataRaw_fdist.most_common()
vocabRaw_tokens_nopunct = [
    word for word in word_tokenize(vocabRaw) if re.search("\w", word)
]
# calculate the probability distribution
dataRaw_pdist = MLEProbDist(dataRaw_fdist)
#yy = [(x, dataRaw_pdist.prob(x)) for x in dataRaw_pdist.samples()]
#yy = (aa, dataRaw_pdist.prob(aa))
# probability of each vocabulary word
wordPos = [(x, dataRaw_pdist.prob(x)) for x in vocabRaw_tokens_nopunct]
# probability of UNK: whatever mass the vocabulary words do not cover
KPos = 0
for y in vocabRaw_tokens_nopunct:
    KPos += dataRaw_pdist.prob(y)
UNKPos = [('UNK', (1 - KPos))]
wordPos.append(UNKPos[0])
#print(wordPos)
#print('UNK, ', UNKPos)
def tag_prob(self):
    return MLEProbDist(self.t_freq)
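Assuming self.t_freq is a tag FreqDist maintained elsewhere in the (unshown) class, the returned distribution can be queried like any ProbDistI; the names below are hypothetical:

dist = tagger.tag_prob()  # 'tagger' stands in for an instance of the enclosing class
print(dist.prob('NN'))    # MLE probability of the 'NN' tag
print(dist.max())         # the most frequent tag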
finalOutput.append("UNK") # append UNK as value def printOutContent(input): for a, b in input: # prints out tuple contents print(a + ":" + str(b), end=" ") # on the same line theData = finalOutput # UNIGRAM fdist1 = FreqDist(theData) + FreqDist({"UNK": 0}) # initialises frequency distribution and adds a frequency of 0 for UNK # however unseen events get a value of zero and don't get smoothed... # Unsmoothed unSmoothed = MLEProbDist(fdist1) # initialises probability distribution unSmoothProb = [(x, unSmoothed.prob(x)) for x in unSmoothed.samples()] # Smoothed Smoothed = LaplaceProbDist(fdist1) SmoothedProb = [(x, Smoothed.prob(x)) for x in Smoothed.samples()] # QUESTION 5 # BIGRAM bigram = list(nltk.ngrams(theData, 2)) fdist2 = FreqDist(bigram) # Unsmoothed unSmoothedBigram = MLEProbDist(fdist2) unSmoothedBigramProb = [(x, unSmoothedBigram.prob(x)) for x in unSmoothedBigram.samples()]
from __future__ import print_function
from nltk.metrics import *

reference = 'DET NN VB DET JJ NN NN IN DET NN'.split()
test = 'DET VB VB DET NN NN NN IN DET NN'.split()
print(accuracy(reference, test))

reference_set = set(reference)
test_set = set(test)
print(precision(reference_set, test_set))
print(recall(reference_set, test_set))
print(f_measure(reference_set, test_set))

from nltk import FreqDist, MLEProbDist
pdist1 = MLEProbDist(FreqDist("aldjfalskfjaldsf"))
pdist2 = MLEProbDist(FreqDist("aldjfalssjjlldss"))
print(log_likelihood(['a', 'd'], [pdist1, pdist2]))

print(edit_distance("rain", "shine"))

s1 = set([1, 2, 3, 4])
s2 = set([3, 4, 5])
print(binary_distance(s1, s2))
print(jaccard_distance(s1, s2))
print(masi_distance(s1, s2))

print(spearman_correlation({'e': 1, 't': 2, 'a': 3}, {'e': 1, 'a': 2, 't': 3}))

s1 = "000100000010"
s2 = "000010000100"
from nltk.probability import FreqDist, MLEProbDist

def mle_of_tags(samples):
    # deduplicate so the same rule isn't listed n times; easier to use later
    set_samples = set(samples)
    mle_samples = MLEProbDist(FreqDist(samples))
    probs = {sample: mle_samples.prob(sample) for sample in set_samples}
    return probs
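For example, on a toy tag list (data made up for illustration), reusing the imports above:

tags = ['NN', 'DET', 'NN', 'VB', 'NN']
print(mle_of_tags(tags))  # {'NN': 0.6, 'DET': 0.2, 'VB': 0.2} (key order may vary)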