def refine(self, labeled_featuresets, entropy_cutoff, depth_cutoff,
           support_cutoff, binary=False, feature_values=None, verbose=False):
    if len(labeled_featuresets) <= support_cutoff:
        return
    if self._fname is None:
        return
    if depth_cutoff <= 0:
        return
    for fval in self._decisions:
        fval_featuresets = [(featureset, label)
                            for (featureset, label) in labeled_featuresets
                            if featureset.get(self._fname) == fval]

        label_freqs = FreqDist(label for (featureset, label) in fval_featuresets)
        if entropy(MLEProbDist(label_freqs)) > entropy_cutoff:
            self._decisions[fval] = DecisionTreeClassifier.train(
                fval_featuresets, entropy_cutoff, depth_cutoff,
                support_cutoff, binary, feature_values, verbose)
    if self._default is not None:
        default_featuresets = [(featureset, label)
                               for (featureset, label) in labeled_featuresets
                               if featureset.get(self._fname) not in self._decisions]
        label_freqs = FreqDist(label for (featureset, label) in default_featuresets)
        if entropy(MLEProbDist(label_freqs)) > entropy_cutoff:
            self._default = DecisionTreeClassifier.train(
                default_featuresets, entropy_cutoff, depth_cutoff,
                support_cutoff, binary, feature_values, verbose)
def __init__(self, freqdist, bins=None):
    MLEProbDist.__init__(self, freqdist, bins)
    self._probarray = np.zeros((len(freqdist),))
    self._probmap = {}
    for i, item in enumerate(freqdist.keys()):
        self._probarray[i] = freqdist.freq(item)
        self._probmap[i] = item
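A minimal, self-contained sketch of how such a cached-array wrapper could be used for sampling; the class name CachedMLEProbDist and the generate override below are assumptions for illustration, not part of the original code.

# Sketch (assumed API): a hypothetical MLEProbDist subclass that caches
# relative frequencies in a numpy array so samples can be drawn with numpy.
import numpy as np
from nltk.probability import FreqDist, MLEProbDist

class CachedMLEProbDist(MLEProbDist):  # hypothetical name
    def __init__(self, freqdist, bins=None):
        MLEProbDist.__init__(self, freqdist, bins)
        self._probarray = np.zeros((len(freqdist),))
        self._probmap = {}
        for i, item in enumerate(freqdist.keys()):
            self._probarray[i] = freqdist.freq(item)
            self._probmap[i] = item

    def generate(self):
        # draw an index according to the cached probabilities
        idx = np.random.choice(len(self._probarray), p=self._probarray)
        return self._probmap[idx]

fd = FreqDist('abracadabra')
pd = CachedMLEProbDist(fd)
print(pd.prob('a'), pd.generate())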
def generate_alternative(self, n):
    """ Generate n words using a more complicated algorithm """
    generated_tags = []
    generated_lemmas = []
    generated_words = []

    # Incrementally generate (tag, lemma) pairs
    for i in range(n):
        tag_choice = None  # Start with nothing
        # Loop through n-grams of grammar
        size = 2 * self._n
        while size > 2:
            tag_choices = self._tags_ngram.backoff_search(
                generated_tags, backoff_limit=2,
                predicate=lambda tag: True, start_n=size)
            # Determine valid lemmas in context with these tag choices
            tag_to_lemma = {}
            if tag_choices is not None:
                for tag, _ in tag_choices.items():
                    # For each tag, find valid lemmas in context with that tag
                    lemma = self._lemmas_ngram.choose_word(
                        generated_lemmas, backoff_limit=2,
                        predicate=lambda lemma: lemma in self._tag_lemmas[tag])
                    if lemma is not None:
                        tag_to_lemma[tag] = lemma
            if len(tag_to_lemma) > 1:
                # We have found valid (tag, lemma) pairs
                tag_probdist = MLEProbDist(FreqDist(
                    {tag: freq for tag, freq in tag_choices.items()
                     if tag in tag_to_lemma}))
                tag_choice = tag_probdist.generate()     # Randomly select the tag
                lemma_choice = tag_to_lemma[tag_choice]  # Set the lemma
                break
            size -= 1  # Lower to smaller n-gram for more tag choices
        if tag_choice is None:
            # We still didn't find a valid (tag, lemma) pair, fallback
            tag_choice = MLEProbDist(tag_choices).generate()
            lemma_choice = MLEProbDist(self._tag_lemmas[tag_choice]).generate()
        generated_tags.append(tag_choice)
        generated_lemmas.append(lemma_choice)

    # Generate all words based on (tag, lemma) pairs
    for (tag, lemma) in zip(generated_tags, generated_lemmas):
        # Search for and choose word with correct lemma/tag
        choices = self._words_ngram.backoff_search(
            generated_words, backoff_limit=2,
            predicate=lambda word: word in self._tag_lemma_words[(tag, lemma)])
        if choices is None:
            # Could not find a good word, choose from list
            choices = self._tag_lemma_words[(tag, lemma)]
        generated_words.append(MLEProbDist(choices).generate())

    return list(self._word_ids.transform_ids(generated_words))
def add_individual(number_individuals, res_address, diagnosis):
    total_individuals = []
    new_address = res_address.sample(number_individuals).to_dict('records')
    for idx in xrange(number_individuals):
        diagnosis_freq_dist = FreqDist(diagnosis)
        diagnosis_prob_dist = MLEProbDist(diagnosis_freq_dist)
        diagnosis_random = diagnosis_prob_dist.generate()
        full_address = (new_address[idx]['ADDR_FULL'] + '|' +
                        new_address[idx]['CTYNAME'] + '|' +
                        new_address[idx]['ZIP5'])
        gender, age = get_gender_age(new_address[idx])
        new_individual = {'Date_Inf': current_date,
                          'Gender': gender,
                          'Age': age,
                          'Census_Tract': new_address[idx]['GEOID'],
                          'Address': full_address,
                          'LON': new_address[idx]['LON'],
                          'LAT': new_address[idx]['LAT'],
                          'Diagnosis': diagnosis_random}
        total_individuals.append(new_individual)
    return pd.DataFrame.from_records(total_individuals)
def get_gender_age(full_address):
    GEOID = full_address['GEOID']
    try:
        age_gender_dist = KC_age_gender.loc[[GEOID]].loc[:, 'M0-4':'F85-120']
        age_gender_freq_dist = FreqDist(age_gender_dist)
        age_gender_prob_dist_age_gender = MLEProbDist(age_gender_freq_dist)
        age_gender_random = age_gender_prob_dist_age_gender.generate()
        gender = age_gender_random[0]
        age = age_gender_random[1:]
        return gender, age
    except:
        return np.nan, np.nan
def gen_sent(ngram):
    global lis
    # "n" contains the ngram number
    n = lis[1]
    # number of required sentences is stored in sent_num
    sent_num = lis[2]
    i = 0
    for i in range(sent_num):
        j = True
        # we are using this window to go through the sentence with n-1 previous
        # words stored in the window
        window = []
        sent = ""
        for size in range(n - 1):
            window.append('<start>')
        while j == True:
            tup_win = tuple(window)
            if tup_win not in ngram.keys():
                sys.exit("We don't have a start line")
            # FreqDist and MLEProbDist transform the frequencies into probabilities
            # by computing (item freq / sum of frequencies)
            freq_dist = FreqDist(ngram[tup_win])
            # prob_dist.generate() takes the frequency distribution and generates
            # a random token according to the distribution
            prob_dist = MLEProbDist(freq_dist)
            next_w = prob_dist.generate()
            # the following condition is used to detect the end of line
            if next_w == "." or next_w == "?" or next_w == "!":
                j = False
                sent += next_w
                continue
            # We'd like to make sure the apostrophe token has no space before or after it,
            # as well as the beginning of the line
            elif (next_w == "m" or next_w == "s" or next_w == "re" or next_w == ","
                  or next_w == "’" or next_w == "ve" or next_w == "t"
                  or tup_win[-1] == '<start>'):
                sent += next_w
            else:
                sent += " %s" % next_w
            # moving the window forward by popping and appending
            window.pop(0)
            window.append(next_w)
        print("\nSentence %s:\n%s" % (i + 1, sent))
def _estimator(fdist, bins):
    """
    Default estimator function using an MLEProbDist.
    """
    # can't be an instance method of NgramModel as they
    # can't be pickled either.
    return MLEProbDist(fdist)
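For context, a minimal sketch (assuming NLTK 3.x) of the basic FreqDist-to-MLEProbDist pattern these estimator helpers wrap, and the two-argument (fdist, bins) form expected by ConditionalProbDist; the toy data below is invented for illustration.

# Minimal sketch, assuming NLTK 3.x: MLEProbDist assigns each sample its
# relative frequency, and ConditionalProbDist accepts a (fdist, bins)
# callable as its estimator, with bins passed as an extra argument.
from nltk.probability import (FreqDist, MLEProbDist,
                              ConditionalFreqDist, ConditionalProbDist)

fd = FreqDist(['the', 'cat', 'the', 'dog'])
pd = MLEProbDist(fd)
print(pd.prob('the'))   # 0.5 = 2/4
print(pd.max())         # 'the'

cfd = ConditionalFreqDist([('DET', 'the'), ('DET', 'a'), ('NOUN', 'cat')])
cpd = ConditionalProbDist(cfd, lambda fdist, bins: MLEProbDist(fdist), len(cfd))
print(cpd['DET'].prob('the'))   # 0.5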
def TransitionsGenerate(AddCorpus, train_p, tagger, estimator):
    # recalculate the transition matrix using train plain text + additional corpus
    if estimator is None:
        estimator = lambda fdist, bins: MLEProbDist(fdist)
    data = train_p
    data.extend(AddCorpus)
    print(type(data))
    for s in data:
        s = list(s)
    N = len(tagger._states)
    transitions = ConditionalFreqDist()
    for sentence in data:
        lasts = None
        sentence = list(sentence.strip('\n'))
        for character in sentence:
            state = character
            if lasts is not None:
                transitions[lasts][state] += 1
            lasts = state
    A = ConditionalProbDist(transitions, estimator, N)
    return A
def train_supervised(self, labelled_sequences, **kwargs):
    """
    Supervised training maximising the joint probability of the symbol and
    state sequences. This is done via collecting frequencies of
    transitions between states, symbol observations while within each
    state and which states start a sentence. These frequency distributions
    are then normalised into probability estimates, which can be
    smoothed if desired.

    :return: the trained model
    :rtype: HiddenMarkovModelTagger
    :param labelled_sequences: the training data, a set of
        labelled sequences of observations
    :type labelled_sequences: list
    :param kwargs: may include an 'estimator' parameter, a function taking
        a FreqDist and a number of bins and returning a CProbDistI;
        otherwise a MLE estimate is used
    """
    # default to the MLE estimate
    estimator = kwargs.get('estimator')
    if estimator is None:
        estimator = lambda fdist, bins: MLEProbDist(fdist)

    # count occurrences of starting states, transitions out of each state
    # and output symbols observed in each state
    known_symbols = set(self._symbols)
    known_states = set(self._states)

    starting = FreqDist()
    transitions = ConditionalFreqDist()
    outputs = ConditionalFreqDist()
    for sequence in labelled_sequences:
        lasts = None
        for token in sequence:
            state = token[_TAG]
            symbol = token[_TEXT]
            if lasts is None:
                starting.inc(state)
            else:
                transitions[lasts].inc(state)
            outputs[state].inc(symbol)
            lasts = state

            # update the state and symbol lists
            if state not in known_states:
                self._states.append(state)
                known_states.add(state)
            if symbol not in known_symbols:
                self._symbols.append(symbol)
                known_symbols.add(symbol)

    # create probability distributions (with smoothing)
    N = len(self._states)
    pi = estimator(starting, N)
    A = ConditionalProbDist(transitions, estimator, N)
    B = ConditionalProbDist(outputs, estimator, len(self._symbols))

    return HiddenMarkovModelTagger(self._symbols, self._states, A, B, pi)
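Note that the .inc() calls above are the old NLTK 2.x counting API. A hedged sketch of the same counting step under NLTK 3.x, where FreqDist and ConditionalFreqDist behave like Counters (assumption: only the counting calls change, the rest of the training procedure is unchanged):

# Sketch of the equivalent counting step in NLTK 3.x (toy labelled data,
# invented for illustration).
from nltk.probability import FreqDist, ConditionalFreqDist

starting = FreqDist()
transitions = ConditionalFreqDist()
outputs = ConditionalFreqDist()

for sequence in [[('the', 'DET'), ('cat', 'NOUN')]]:
    lasts = None
    for symbol, state in sequence:
        if lasts is None:
            starting[state] += 1            # was: starting.inc(state)
        else:
            transitions[lasts][state] += 1  # was: transitions[lasts].inc(state)
        outputs[state][symbol] += 1         # was: outputs[state].inc(symbol)
        lasts = state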
def __init__(self, n, train, estimator=None):
    """
    Creates an ngram language model to capture patterns in n consecutive
    words of training text. An estimator smooths the probabilities derived
    from the text and may allow generation of ngrams not seen during training.

    @param n: the order of the language model (ngram size)
    @type n: C{int}
    @param train: the training text
    @type train: C{list} of C{list} of C{string}
    @param estimator: a function for generating a probability distribution
    @type estimator: a function that takes a C{ConditionalFreqDist} and
        returns a C{ConditionalProbDist}
    """
    self._n = n

    if estimator is None:
        estimator = lambda fdist, bins: MLEProbDist(fdist)

    cfd = ConditionalFreqDist()
    self._ngrams = set()
    self._prefix = ('',) * (n - 1)

    for ngram in ingrams(chain(self._prefix, train), n):
        self._ngrams.add(ngram)
        context = tuple(ngram[:-1])
        token = ngram[-1]
        cfd[context].inc(token)

    self._model = ConditionalProbDist(cfd, estimator, len(cfd))

    # recursively construct the lower-order models
    if n > 1:
        self._backoff = NgramModel(n - 1, train, estimator)
def trainModelLM(laplace, symbols, train_output, train_transition):
    extra_set = []
    for i in symbols:
        for j in symbols:
            extra_set.append((i, j))

    transition = suppleText(train_transition)
    initial = []
    output = []
    for i in range(len(train_output)):
        initial.append(train_output[i][0][1])
        for j in range(len(train_output[i])):
            output.append(train_output[i][j])

    if laplace:
        transition += extra_set
        initial += symbols
        output += extra_set

    transition_cfd = ConditionalFreqDist(transition)
    transition_cqd = ConditionalProbDist(transition_cfd, MLEProbDist)
    inital_cfd = FreqDist(initial)
    initial_cqd = MLEProbDist(inital_cfd)
    output_cfd = ConditionalFreqDist(output)
    output_cqd = ConditionalProbDist(output_cfd, MLEProbDist)

    model = hmm.HiddenMarkovModelTagger(symbols=symbols, states=symbols,
                                        transitions=transition_cqd,
                                        outputs=output_cqd,
                                        priors=initial_cqd)
    return model
def plot_word_dist_as_cloud(word_dist, file_name=None, plot=False):
    prob_dist = MLEProbDist(word_dist)
    viz_dict = {}
    for word_tuple in word_dist:
        string = ' '.join(word_tuple)
        viz_dict[string] = prob_dist.prob(word_tuple)
    wordcloud = WordCloud(max_words=100).generate_from_frequencies(viz_dict)
    if file_name is not None:
        wordcloud.to_file("img/" + file_name + ".png")
    if plot:
        plt.figure()
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
        plt.show()
def train_supervised2(trainer, labelled_sequences, plain_sequences, estimator=None):
    _TAG = 1
    _TEXT = 0
    if estimator is None:
        estimator = lambda fdist, bins: MLEProbDist(fdist)

    # count occurrences of starting states, transitions out of each state
    # and output symbols observed in each state
    known_symbols = set(trainer._symbols)
    known_states = set(trainer._states)

    starting = FreqDist()
    transitions = ConditionalFreqDist()
    outputs = ConditionalFreqDist()

    # =================code added to supplement transition matrix====================
    for sequence in plain_sequences:
        lasts = None
        for token in sequence:
            if lasts is None:
                pass
            else:
                transitions[lasts][token] += 1
            lasts = token
            if token not in known_states:
                trainer._states.append(token)
                known_states.add(token)
    # ================================end============================================

    for sequence in labelled_sequences:
        lasts = None
        for token in sequence:
            state = token[_TAG]
            symbol = token[_TEXT]
            if lasts is None:
                starting[state] += 1
            else:
                transitions[lasts][state] += 1
            outputs[state][symbol] += 1
            lasts = state

            # update the state and symbol lists
            if state not in known_states:
                trainer._states.append(state)
                known_states.add(state)
            if symbol not in known_symbols:
                trainer._symbols.append(symbol)
                known_symbols.add(symbol)

    # create probability distributions (with smoothing)
    N = len(trainer._states)
    pi = estimator(starting, N)
    A = ConditionalProbDist(transitions, estimator, N)
    B = ConditionalProbDist(outputs, estimator, len(trainer._symbols))

    return HiddenMarkovModelTagger(trainer._symbols, trainer._states, A, B, pi)
def _generate_one_predicated(self, context, backoff_limit, predicate):
    context = tuple(context)[1 - self._n:]
    choices = self.backoff_search(context, backoff_limit, predicate)  # Possible tokens
    if choices is not None:
        return MLEProbDist(choices).generate()
    else:
        return None
def _generate_one(self, context, backoff_limit):
    context = tuple(context)[1 - self._n:]
    while context not in self and len(context) >= backoff_limit:
        context = context[1:]
    if context in self:
        return MLEProbDist(self[context]).generate()  # Select from possible tokens
    else:
        return None
def sentence_generator(gramFreq, numofsentences):
    i = 0
    for i in range(numofsentences):
        sentenceGen = True
        sentencelist = ()
        generateSentence = ""
        for size in range(int(ngrams) - 1):
            sentencelist += ('<start>',)
        while sentenceGen == True:
            token_dict = {}
            for index, val in ngrams_frequency.items():
                index2 = index[:-1]
                if index2 == sentencelist:
                    token_dict.update({index[-1]: val})
            # generating frequency using the function
            frequencyDistribution = FreqDist(token_dict)
            # generating probability using the function
            probabilityDistribution = MLEProbDist(frequencyDistribution)
            # predicting the next word
            next_word = probabilityDistribution.generate()
            # words having ".,?,!"
            if next_word == "." or next_word == "?" or next_word == "!":
                sentenceGen = False
                generateSentence += next_word
                continue
            # words having , '
            elif next_word == "," or next_word == "’":
                generateSentence += next_word
            else:
                generateSentence += " %s" % next_word
            if len(sentencelist) != 0:
                my_list = list(sentencelist)
                my_list.pop(0)
                my_list.append(next_word)
                sentencelist = tuple(my_list)
        # Display sentences
        print("\nSentence %s: %s" % (i + 1, generateSentence))
def gen_sentence(ngram):
    global arg
    i = 0
    # n in ngrams
    n = arg[1]
    # number of sentences to generate
    m = arg[2]
    for i in range(m):
        j = True
        table = []
        sentence = ""
        for size in range(n - 1):
            table.append('<START>')
        while j == True:
            tuple_table = tuple(table)
            if tuple_table not in ngram.keys():
                # when start is not available
                sys.exit("No start line!")
            # generating frequency
            frequency = FreqDist(ngram[tuple_table])
            # generating probability
            probability = MLEProbDist(frequency)
            # predicting the next word
            pred_word = probability.generate()
            # words having ".,?,!"
            if pred_word == "." or pred_word == "?" or pred_word == "!":
                j = False
                sentence += pred_word
                continue
            # words having , ' or START tag
            elif (pred_word == "," or pred_word == "’"
                  or tuple_table[-1] == '<START>'):
                sentence += pred_word
            else:
                sentence += " %s" % pred_word
            table.pop(0)
            table.append(pred_word)
        # Display sentences
        print("\nSentence %s:\n%s" % (i + 1, sentence))
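A self-contained sketch of the core sampling step these sentence generators rely on, assuming NLTK 3.x; the toy bigram table below is invented for illustration.

# Minimal sketch: draw the next word from MLE probabilities over the
# counts observed after a given context (toy data, for illustration only).
from nltk.probability import FreqDist, MLEProbDist

bigram_counts = {('<START>',): {'the': 3, 'a': 1}}   # invented toy table
context = ('<START>',)
freq = FreqDist(bigram_counts[context])   # FreqDist accepts a count mapping
next_word = MLEProbDist(freq).generate()  # 'the' with prob 0.75, 'a' with 0.25
print(next_word)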
def __init__(self, corpus, n, estimator=None):
    if estimator is None:
        estimator = lambda fdist, bins: MLEProbDist(fdist)
    bi = []
    self._l = []
    for tree in corpus[:n]:
        ts = tree.leaves()
        sent = ['START'] + ts
        bi += nltk.bigrams(sent)
        self._l.append(len(sent))
    cfd = ConditionalFreqDist(bi)
    self._model = ConditionalProbDist(cfd, estimator, len(cfd))
def generate(self, n):
    """ Generate n words using copied grammar, generated lemmas, and words based on lemmas """
    start = random.randint(n, len(self._tags) - n)
    generated_tags = self._tags[start:start + n]  # Copy a random section of POS tags for grammar

    # Generate sequence of lemmas based off of grammar
    generated_lemmas = []
    for tag in generated_tags:
        # Search for and choose a lemma with correct tag
        choice = self._lemmas_ngram.choose_word(
            generated_lemmas, backoff_limit=2,
            predicate=lambda lemma: lemma in self._tag_lemmas[tag])
        if choice is None:
            # Could not find a good lemma for current POS tag, choose from list
            choice = MLEProbDist(self._tag_lemmas[tag]).generate()
        generated_lemmas.append(choice)

    # Generate sequence of words based off of lemmas and grammar
    generated_words = []
    for (tag, lemma) in zip(generated_tags, generated_lemmas):
        # Search for and choose word with correct lemma/tag
        choices = self._words_ngram.backoff_search(
            generated_words, backoff_limit=2,
            predicate=lambda word: word in self._tag_lemma_words[(tag, lemma)])
        if choices is None:
            # Could not find a good word, choose from list
            choices = self._tag_lemma_words[(tag, lemma)]
        generated_words.append(MLEProbDist(choices).generate())

    return list(self._word_ids.transform_ids(generated_words))
def validate_pcfg_generate(grammar):
    pd = makeLhrProbDict(grammar)
    productions = []
    cfd = ConditionalFreqDist()

    for i in np.arange(1000):
        tree = pcfg_generate(grammar)
        productions += tree.productions()

    for p in productions:
        cfd[p.lhs()].inc(p.rhs())

    for c in cfd.conditions():
        p = MLEProbDist(cfd[c])
        q = pd[c]
        div = KL_Divergence(p, q)
        print "KL_Divergence for %s = %f" % (c, div)
def train_transitions(labelled_sequences, additional_transitions, estimator=None):
    # default to the MLE estimate
    if estimator is None:
        estimator = lambda fdist, bins: MLEProbDist(fdist)

    # count occurrences of starting states, transitions out of each state
    # and output symbols observed in each state
    known_symbols = []
    known_states = []

    starting = FreqDist()
    transitions = ConditionalFreqDist()
    outputs = ConditionalFreqDist()
    for sequence in labelled_sequences:
        lasts = None
        for token in sequence:
            state = token[0]
            symbol = token[1]
            if lasts is None:
                starting[state] += 1
            else:
                transitions[lasts][state] += 1
            outputs[state][symbol] += 1
            lasts = state

            # update the state and symbol lists
            if state not in known_states:
                known_states.append(state)
            if symbol not in known_symbols:
                known_symbols.append(symbol)

    # create probability distributions (with smoothing)
    N = len(known_states)
    pi = estimator(starting, N)
    A = ConditionalProbDist(
        ConditionalFreqDist.__add__(transitions, additional_transitions),
        estimator, N)
    B = ConditionalProbDist(outputs, estimator, len(known_symbols))

    return hmm.HiddenMarkovModelTagger(known_states, known_symbols, A, B, pi)
def doesnt_work(self, y):
    """
    Code adapted from NLTK implementation of supervised training in HMMs
    """
    estimator = lambda fdist, bins: MLEProbDist(fdist)

    transitions = ConditionalFreqDist()
    outputs = ConditionalFreqDist()
    for sequence in y:
        lasts = None
        for state in sequence:
            if lasts is not None:
                transitions[lasts][state] += 1
            lasts = state

    N = self.number_of_states + 2
    model = ConditionalProbDist(transitions, estimator, N)
    return model
def main():
    DEBUG = 1
    depRelFile = open(sys.argv[1], 'r')        # file with dep rel tuples
    ReadDictFromFile(sys.argv[2], lemma_dict)  # lemma file
    modelFile = open(sys.argv[3], 'w')
    if len(sys.argv) == 5:
        DEBUG = int(sys.argv[4])
    print "---Done loading lemma file---"

    print "---Computing CDF....---"
    incompletePairs = ComputeFreqDist(depRelFile)
    print "---Done computing CDF---"
    print "incomplete pairs: ", incompletePairs

    if DEBUG:
        print "Info about F(arg)"
        print "unique samples: ", argFD.B()
        print "total seen samples: ", argFD.N()
        print "top arg:", argFD.max()
        print "count for support: ", argFD['support']

        print "Info about CFD(arg|rel,vb)"
        print "unique conditions seen: ", len(argVbRelCFD.conditions())
        print "total seen samples", argVbRelCFD.N()
        top_CFD1 = sorted(argVbRelCFD[('dobj', 'enjoy')].items(),
                          key=operator.itemgetter(1), reverse=True)[:10]
        print "all dobj,enjoy: ", argVbRelCFD[('dobj', 'enjoy')].N()
        print "top dobj for enjoy:\n", top_CFD1

        print "Info about CFD(arg|vb)"
        print "unique conditions seen: ", len(argVbCFD.conditions())
        print "total seen samples", argVbCFD.N()
        top_CFD2 = sorted(argVbCFD['enjoy'].items(),
                          key=operator.itemgetter(1), reverse=True)[:10]
        print "all enjoy: ", argVbCFD['enjoy'].N()
        print "top arg for enjoy:\n", top_CFD2

    print "---Computing MLE PDFs....---"
    argVbRelPDF = ConditionalProbDist(argVbRelCFD, MLEProbDist)
    argVbPDF = ConditionalProbDist(argVbCFD, MLEProbDist)
    argPDF = MLEProbDist(argFD)

    # I'm not sure here whether Types is equivalent to argVbRelCFD.conditions() or unique condition+arg
    # !!!!! lambda should be for each history P(a|v); T = count of unique (v, a) pairs starting with v:
    # for each condition v -> CFD[v].B() -> how many unique arguments I've seen after this condition
    print "---Computing Witten-Bell smoothed PDFs....---"
    # for unseen pairs we multiply the backoff_weight with the probability of the backoff model
    # e.g. if c(rel,vb,arg)=0 and c(vb,arg)>0 then
    # P(arg|rel,vb) = argRelVbPDFWB_backoff_weights[(rel,vb)] * argVbPDFWB[vb].prob(arg)
    argPDFWB, backoff_uniform = ComputeWBArg(argPDF)
    argVbPDFWB, argVbPDFWB_backoff_weights, countArgVB = ComputeWBVbArg(argVbPDF, argPDFWB)
    argRelVbPDFWB, argRelVbPDFWB_backoff_weights, countRelVbArg = ComputeWBRelVbArg(argVbRelPDF, argVbPDFWB)

    if DEBUG:
        print "P(support|dobs,enjoy)"
        print argVbRelPDF[('dobj', 'enjoy')].prob('support')
        print argRelVbPDFWB[('dobj', 'enjoy')]['support']
        print "No args following (dobj,enjoy)", argVbRelCFD[('dobj', 'enjoy')].B()
        print "P(support|enjoy)"
        print argVbPDF['enjoy'].prob('support')
        print argVbPDFWB['enjoy']['support']
        print "P(support)"
        print argPDF.prob('support')
        print argPDFWB['support']

    WriteToArpaFormat(modelFile, len(argPDFWB), countArgVB, countRelVbArg,
                      argPDFWB, argVbPDFWB, argRelVbPDFWB, backoff_uniform,
                      argVbPDFWB_backoff_weights, argRelVbPDFWB_backoff_weights)

    if DEBUG:
        # print sorted(argVbPDFWB['enjoy'], key=operator.itemgetter(1), reverse=True)[:5]
        # [('enjoy','support')]
        for condition in argVbPDFWB.keys()[:10]:
            sum1 = 0
            sum2 = 0
            for prob in argVbPDFWB[condition].values():
                sum1 += prob
            for arg in argVbCFD[condition].items():
                sum2 += argVbPDF[condition].prob(arg[0])
            print "total prob: ", sum1, sum2
        print "P_WB(support|dobj, enjoy)"
        print argRelVbPDFWB[('dobj', 'enjoy')]['support']
        for condition in argRelVbPDFWB.keys()[:10]:
            sum = 0
            for prob in argRelVbPDFWB[condition].values():
                sum += prob
            print "total prob: ", sum
tokenized_text = len(nltk.sent_tokenize(inputFile))
print(sent_tokenize(inputFile))
print("tokenized text: ", tokenized_text, "\n")

tokenized_text = nltk.word_tokenize(inputFile)
tokenized_text = [word.lower() for word in tokenized_text if word.isalpha()]
print("Lower cased text: ", tokenized_text)
print("Word Count: ", len(tokenized_text), "\n")

freq_dist_uni = nltk.FreqDist(tokenized_text)
print("Most common 10 unigram: ", freq_dist_uni.most_common(10), "\n",
      "least common 3 words: ", freq_dist_uni.most_common()[-3:], "\n")

prob_distArray = []
prob_dist_uni = MLEProbDist(freq_dist_uni)
for s in prob_dist_uni.samples():
    prob_distArray.append(prob_dist_uni.prob(s))
i = 0
for lim in freq_dist_uni.most_common(10):
    print(lim, prob_distArray[i])
    i += 1

elep = ELEProbDist(freq_dist_uni)
for s in elep.samples():
    prob_distArray.append(elep.prob(s))
i = 0
for lim in freq_dist_uni.most_common(10):
    print(lim, prob_distArray[i], "\n")
    i += 1
def main():
    parser = argparse.ArgumentParser(description='Text decipher options')
    parser.add_argument('cipher_folder', help='cipher data folder')
    parser.add_argument('--laplace', '-laplace', action='store_true',
                        default=False, help='Laplace Smoothing')
    parser.add_argument('--langmod', '-lm', action='store_true',
                        default=False, help='Improved decoder')
    args = parser.parse_args()
    cipher_folder = args.cipher_folder
    laplace = args.laplace
    langmod = args.langmod

    number_of_supp_lines = 100  # the more lines the slower the code!

    train_data, test_data, train_plain = get_data(cipher_folder)
    preprocess_supp_data()
    supp_data = read_preprocessed_supp_data(number_of_supp_lines)
    for line in train_plain:
        # this is so later we have all the transitions in the same place
        supp_data.extend(list(line))

    if laplace:
        smoothing = LaplaceProbDist
    else:
        smoothing = MLEProbDist

    trainer = hmm.HiddenMarkovModelTrainer()
    decoder = trainer.train_supervised(train_data, smoothing)
    # decoder_supp = trainer_supp.train_unsupervised(supp_data, update_outputs=False, model=decoder)
    # because there's a bug in train_unsupervised (although I found out how to fix it!),
    # I will have to do this manually....
    # code copied from the nltk train_supervised method
    # here, we are updating the transition data to include our supplemental data
    if langmod:
        states = decoder._states
        symbols = decoder._symbols
        outputs = decoder._outputs
        priors = decoder._priors
        starting = FreqDist()  # declaring
        # declaring; this is why we needed all the transitions in the same place
        transitions = ConditionalFreqDist()
        for item in supp_data:
            for sequence in supp_data:
                lasts = None
                for state in sequence:
                    if lasts is None:
                        starting[state] += 1
                    else:
                        transitions[lasts][state] += 1
                    lasts = state
        if laplace:
            estimator = LaplaceProbDist
        else:
            # getting this straight from the source code
            estimator = lambda fdist, bins: MLEProbDist(fdist)
        N = len(states)
        pi = estimator(starting, N)
        A = ConditionalProbDist(transitions, estimator, N)
        # the conditional PD for outputs is actually already defined by our previously
        # trained model; we don't have new ones!
        decoder = HiddenMarkovModelTagger(symbols, states, A, outputs, pi)

    print(decoder.test(test_data))
    for sent in test_data:
        print "".join([y[1] for y in decoder.tag([x[0] for x in sent])])
# from nltk.token import *
from nltk.tokenizer import WhitespaceTokenizer
from nltk.probability import FreqDist
from nltk.probability import MLEProbDist
from nltk.draw.plot import Plot

freq_dist = FreqDist()
corpus = Token(TEXT=open('dados/may2001_pdf.torto').read())
WhitespaceTokenizer().tokenize(corpus)
for token in corpus['SUBTOKENS']:
    freq_dist.inc(token['TEXT'])
prob_dist = MLEProbDist(freq_dist)

# P(x) = freq(x)
prob_dist.prob('the')
freq_dist.freq('the')

#
# Estimating the probability distribution for roll2
#
import random
from nltk.token import *
from nltk.tokenizer import WhitespaceTokenizer
from nltk.probability import FreqDist
from nltk.probability import MLEProbDist
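The snippet above uses the old pre-NLTK-2 Token/tokenizer API. A hedged sketch of the same frequency/probability computation with the current NLTK interface, assuming plain str.split() is an acceptable stand-in for the old WhitespaceTokenizer:

# Sketch of the same computation with the modern NLTK API (assumption:
# whitespace splitting replaces WhitespaceTokenizer; file path from above).
from nltk.probability import FreqDist, MLEProbDist

text = open('dados/may2001_pdf.torto').read()
freq_dist = FreqDist(text.split())
prob_dist = MLEProbDist(freq_dist)  # P(x) = freq(x)
print(prob_dist.prob('the'))
print(freq_dist.freq('the'))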
def estimator(fdist, bins):
    return MLEProbDist(fdist)

# count occurrences of starting states, transitions out of each state
# and output symbols observed in each state
known_symbols = set(self._symbols)
def ml_estimator(freqdist):
    return MLEProbDist(freqdist)
def train_supervised(self, labelled_sequences, extra_data=False, estimator=None):
    # This is copied from HiddenMarkovModelTrainer
    if estimator is None:
        estimator = lambda fdist, bins: MLEProbDist(fdist)

    # count occurrences of starting states, transitions out of each state
    # and output symbols observed in each state
    known_symbols = set(self._symbols)
    known_states = set(self._states)

    starting = FreqDist()
    transitions = ConditionalFreqDist()
    outputs = ConditionalFreqDist()
    for sequence in labelled_sequences:
        lasts = None
        for token in sequence:
            state = token[1]
            symbol = token[0]
            if lasts is None:
                starting[state] += 1
            else:
                transitions[lasts][state] += 1
            outputs[state][symbol] += 1
            lasts = state

            # update the state and symbol lists
            if state not in known_states:
                self._states.append(state)
                known_states.add(state)
            if symbol not in known_symbols:
                self._symbols.append(symbol)
                known_symbols.add(symbol)

    if extra_data:
        print('-' * 20)
        print("Using extra data to calculate transition probability")
        sent = ""
        for word in tqdm(treebank.words()):
            if word == '.':
                sent = sent[:-1] + word
                lasts = None
                for c in sent:
                    if c in list(string.ascii_lowercase) + [' ', ',', '.']:
                        if lasts is not None:
                            transitions[lasts][c] += 1
                        lasts = c
                sent = ""
            elif word == ',':
                sent = sent[:-1] + word + ' '
            else:
                sent += word + ' '

    # create probability distributions (with smoothing)
    N = len(self._states)
    pi = estimator(starting, N)
    A = ConditionalProbDist(transitions, estimator, N)
    B = ConditionalProbDist(outputs, estimator, len(self._symbols))

    return hmm.HiddenMarkovModelTagger(self._symbols, self._states, A, B, pi)
# transition prob
for row in X_train:
    lasts = None
    for ch in list(row):
        if lasts is not None:
            transitional[lasts][ch] += 1
        lasts = ch

# emission prob
for row in sequences:
    for pair in row:
        emissional[pair[1]][pair[0]] += 1

if improved_laplace:
    print("################## Laplace ####################### \n")
    estimator = nltk.probability.LaplaceProbDist
else:
    estimator = lambda fdist, bins: MLEProbDist(fdist)

N = len(symbols)
PI = estimator(Pi, N)
A = ConditionalProbDist(transitional, estimator, N)
B = ConditionalProbDist(emissional, estimator, N)

tagger = HiddenMarkovModelTagger(states, symbols, A, B, PI)

print("\n ################## C{} Decryption Results #######################".format(int(i)))
for row in test_cipher:
    print(tagger.best_path(row))

print("\n ################## C{} Accuracy Results #######################".format(int(i)))
print(tagger.test(tester))
def compute_kl_divergence(mle_dist1, mle_dist2):
    ans = 0
    for p in mle_dist1.freqdist():
        for q in mle_dist2.freqdist():
            if p.rhs() == q.rhs():
                ans += p.prob() * math.log(p.prob() / q.prob())
    return ans


for lhs in lhs_of_prods:
    prods = [
        ProbabilisticProduction(prod.lhs(), prod.rhs(), prob=prod.prob())
        for prod in productions_corpus if str(prod.lhs()) == lhs
    ]
    prods_for_toy_pcfg2 = [
        ProbabilisticProduction(prod.lhs(), prod.rhs(), prob=prod.prob())
        for prod in productions_toy_pcfg2 if str(prod.lhs()) == lhs
    ]
    if len(prods):
        MLE_prob_dist = MLEProbDist(FreqDist(prods))
    if len(prods_for_toy_pcfg2):
        MLE_prob_dist_for_toy_pcfg2 = MLEProbDist(FreqDist(prods_for_toy_pcfg2))
    if not (len(prods) and len(prods_for_toy_pcfg2)):
        print('skipping {} because this nt does not appear in both cases'.format(lhs))
    else:
        print('this is the KL-Divergence {} for this lhs {}'.format(
            compute_kl_divergence(MLE_prob_dist, MLE_prob_dist_for_toy_pcfg2), lhs))
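For comparison, a generic hedged sketch of KL divergence between two MLEProbDist objects computed directly over their samples with .samples() and .prob(); the kl_divergence helper and toy data below are illustrative assumptions, not part of the original code.

# Hedged sketch: KL(P || Q) summed over P's samples, skipping samples where
# either probability is zero (under MLE, q == 0 makes the term undefined).
import math
from nltk.probability import FreqDist, MLEProbDist

def kl_divergence(p_dist, q_dist):
    total = 0.0
    for sample in p_dist.samples():
        p = p_dist.prob(sample)
        q = q_dist.prob(sample)
        if p > 0 and q > 0:
            total += p * math.log(p / q)
    return total

P = MLEProbDist(FreqDist(['a', 'a', 'b']))
Q = MLEProbDist(FreqDist(['a', 'b', 'b']))
print(kl_divergence(P, Q))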