def train(cipher_train_data, plain_train_data, cipher_test_data, plain_test_data, laplace_smoothing=False):
    train_data = prepare_data(cipher_train_data, plain_train_data)
    trainer = hmm.HiddenMarkovModelTrainer(states=_ALPHABET, symbols=_ALPHABET)
    # Trainer uses MLE by default
    if laplace_smoothing:
        tagger = trainer.train_supervised(train_data, estimator=LaplaceProbDist)
    else:
        tagger = trainer.train_supervised(train_data)

    correct = 0
    total = 0
    # Test model
    for s_cipher, s_plain in zip(cipher_test_data, plain_test_data):
        cipher, decoded = list(zip(*tagger.tag([c for c in s_cipher])))
        decoded = "".join(decoded)
        cipher = "".join(cipher)
        print("\n")
        print(f"Cipher     - {cipher}")
        print(f"Plain      - {s_plain}")
        print(f"Prediction - {decoded}")
        for c_decoded, c_plain in zip(decoded, s_plain):
            if c_decoded == c_plain:
                correct += 1
        total += len(decoded)
    print(f"\n>>> Accuracy {correct/total}")
def HMM(data, symbols, tag_set, verbose=True):
    '''
    HMM(data, symbols, tag_set, verbose) -> model, prediction, report (dict).

    Keyword arguments:
    data: see preprocessing.py
    symbols: list of the input class labels
    tag_set: list of the output class labels
    For the data structure, see preprocessing.py
    '''
    trainer = hmm.HiddenMarkovModelTrainer(tag_set, symbols)
    tagger = trainer.train_supervised(
        data.y_train,
        estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins),
    )
    y_pred = []
    for sentence in data.x_test:
        y_pred.append(tagger.tag(sentence))
    # Unlike the test or evaluate functions from the same suite, tag() requires
    # a list of symbols, not tuples of (symbol, tag).
    y_pred = [[tup[1] for tup in sentence] for sentence in y_pred]
    print('HMM Results:')
    print(gen_rep_flat(data, y_pred, False))
    return tagger, y_pred, gen_rep_flat(data, y_pred, True)
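# A minimal, self-contained sketch (not from the original source; data is
# illustrative) of the input-shape difference noted above: train_supervised()
# and evaluate() take (symbol, tag) pairs, while tag() takes a bare symbol list.
from nltk.tag import hmm

_train = [[('the', 'DT'), ('dog', 'NN'), ('barks', 'VBZ')],
          [('the', 'DT'), ('cat', 'NN'), ('sleeps', 'VBZ')]]
_tagger = hmm.HiddenMarkovModelTrainer().train_supervised(_train)
print(_tagger.tag(['the', 'dog', 'sleeps']))   # bare symbols in, (symbol, tag) pairs out
print(_tagger.evaluate(_train))                # gold (symbol, tag) pairs in, accuracy out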
def ie_preprocess(document):
    # t0..t3 (backoff taggers) and brown_a are assumed to be defined at module level
    print(document)
    sentences = nltk.sent_tokenize(document)
    # print(sentences)
    trigram_tagger = nltk.TrigramTagger(brown_a, cutoff=0)
    sentences = [nltk.word_tokenize(sent) for sent in sentences]

    print("\nDefault tagger")
    x = [t0.tag(sent) for sent in sentences]
    print(x)
    print("\nUnigram tagger")
    x = [t1.tag(sent) for sent in sentences]
    print(x)
    print("\nBigram tagger")
    x = [t2.tag(sent) for sent in sentences]
    print(x)
    print("\nTrigram tagger")
    x = [t3.tag(sent) for sent in sentences]
    print(x)
    print("\n")

    trainer = hmm.HiddenMarkovModelTrainer()
    train_data = treebank.tagged_sents()[:3000]
    tagger = trainer.train_supervised(train_data)
    print(tagger)
    print("\nHMM tagger")
    x = [tagger.tag(sent) for sent in sentences]
    print(x)

    print("\nPOS Tag")
    sentences = [nltk.pos_tag(sent) for sent in sentences]
    print(sentences)
    return sentences
def __init__(self): """ Constructor. """ super().__init__() self.trainer = hmm.HiddenMarkovModelTrainer() self.tagger = None
def main():
    # load corpus
    tokens, labels, sentences = read_corpus()
    trainer = h.HiddenMarkovModelTrainer(labels, tokens)

    # load a previously saved model, if one exists
    hmm = None
    try:
        with open("hmm_pretrain.pkl", 'rb') as model:
            hmm = pickle.load(model)
    except (OSError, pickle.UnpicklingError):
        pass

    # training model (continuing from the loaded one, if any)
    hmm = trainer.train_unsupervised(list(sentences), max_iterations=2, model=hmm)

    # save model
    with open("hmm_pretrain.pkl", 'wb') as model:
        pickle.dump(hmm, model)

    # test the trained model
    hmm.test(list(sentences[:10]), verbose=True)
def __init__(self, train_sents, sent_dict):
    '''train_sents entries are in the form [((w, pos_tag), iob_tag), ...]'''
    train_set = []
    tag_set = []
    symbols = []
    self.stemmer = LancasterStemmer()
    self.just_pos = False
    self.use_pos = False
    for tagged_sent in train_sents:
        example = []
        for wd_pos, tag in tagged_sent:
            tag_set.append(tag)
            if self.just_pos:
                symb = wd_pos[1]
            elif self.use_pos:
                # symb = wd_pos[0] + wd_pos[1]
                symb = self.stemmer.stem(wd_pos[0]) + wd_pos[1]
            else:
                symb = self.stemmer.stem(wd_pos[0])
            symbols.append(symb)
            example.append((symb, tag))
        train_set.append(example)
    trainer = hmm.HiddenMarkovModelTrainer(list(set(tag_set)), list(set(symbols)))
    self.hmm = trainer.train_supervised(train_set)
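# A minimal sketch (illustrative example, not from the original source) of how
# one train_sents entry is mapped to (symbol, tag) pairs above: with the default
# flags (just_pos=False, use_pos=False), each word is reduced to its Lancaster
# stem and paired with its IOB tag.
from nltk.stem import LancasterStemmer

_sent = [(('Barack', 'NNP'), 'B-PER'), (('Obama', 'NNP'), 'I-PER'), (('spoke', 'VBD'), 'O')]
_stemmer = LancasterStemmer()
print([(_stemmer.stem(w), iob) for (w, _pos), iob in _sent])
# -> lowercased stems paired with IOB tags; exact stem forms depend on the stemmer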
def trainModel(train_sent, laplace, symbols):
    if laplace:
        estimator = LaplaceProbDist
    else:
        estimator = MLEProbDist
    trainer = hmm.HiddenMarkovModelTrainer(symbols=symbols)
    model = trainer.train(labeled_sequences=train_sent, estimator=estimator)
    return model
def hmm_base(path):
    train_corpus = load_files(p=path)
    test_corpus = load_files(p=path, mode="test")
    trainer = hmm.HiddenMarkovModelTrainer()
    tagger = trainer.train_supervised(train_corpus)
    res = tagger.evaluate(test_corpus)  # accuracy
    print("test accuracy {}".format(res))
def hmm_tagging(train, test):
    """
    Train an HMM for prediction. Did not work so well, especially when not
    using the replacement method for really rare words (freq <= 2).
    :param train: Annotated training data
    :param test: Annotated test data for evaluation
    :return:
    """
    trainer = hmm.HiddenMarkovModelTrainer()
    tagger = trainer.train_supervised(train)
    print(tagger.evaluate(test))
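# A minimal sketch (an assumption, not the author's code) of the rare-word
# replacement the docstring refers to: words below a frequency threshold in the
# training data are mapped to an '<UNK>' symbol before training, so the HMM has
# a usable emission distribution for unseen words at test time.
from collections import Counter

def replace_rare(tagged_sents, min_freq=3, unk='<UNK>'):
    freqs = Counter(w for sent in tagged_sents for w, _ in sent)
    return [[(w if freqs[w] >= min_freq else unk, t) for w, t in sent]
            for sent in tagged_sents]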
def validacion_cruzada(classificador, train_data, k):
    kf = KFold(k, shuffle=True)
    scores = []
    for train_idx, test_idx in kf.split(train_data):
        # KFold with shuffle=True yields non-contiguous indices, so index
        # element-wise rather than slicing between the first and last index
        fold_train = [train_data[i] for i in train_idx]
        fold_test = [train_data[i] for i in test_idx]
        if isinstance(classificador, hmm.HiddenMarkovModelTrainer):
            model = classificador.train_supervised(fold_train)
            scores.append(model.evaluate(fold_test))
        else:
            classificador.train(fold_train)
            scores.append(classificador.evaluate(fold_test))
    return scores
def hmm_laplace(path):
    train_corpus = load_files(p=path)
    test_corpus = load_files(p=path, mode="test")

    def est(fd, bins):
        # Lidstone with gamma = 1 is Laplace (add-one) smoothing
        return LidstoneProbDist(fd, 1, bins)

    trainer = hmm.HiddenMarkovModelTrainer()
    tagger = trainer.train_supervised(train_corpus, estimator=est)
    # print(test_corpus[0])
    res = tagger.evaluate(test_corpus)  # accuracy
    print("test accuracy {}".format(res))
def train(self, file):
    """
    Trains the Diacritic Restorer on the training set from the given file
    using the HMM of n-grams.

    :param file: path to file with training set (sentences with diacritics,
        ideally detokenized)
    :type file: str
    :return: self for further use
    :rtype: BaseDiacriticsRestorer
    """
    buffer = CorpusNgramBuffer(file, self.n)
    self.tagger = hmm.HiddenMarkovModelTrainer().train_supervised(buffer)
    buffer.close()
    milestone("training done: ")
    return self
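# A minimal sketch (CorpusNgramBuffer's exact output format is not shown here,
# so this is an assumption) of how diacritic restoration fits the HMM tagging
# interface: the observed symbols are diacritic-stripped units and the hidden
# states are the original diacritized units, i.e. pairs of (stripped, original).
import unicodedata

def strip_diacritics(s):
    # Decompose to NFD and drop combining marks (category Mn)
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != 'Mn')

_pairs = [(strip_diacritics(ch), ch) for ch in 'příliš']
print(_pairs)  # [('p', 'p'), ('r', 'ř'), ('i', 'í'), ('l', 'l'), ('i', 'i'), ('s', 'š')]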
def test():
    files = loadFiles(sys.argv[1])
    labeled_data = label(files["train_data"].read(), files["train_txt"].read())
    trainer = hmm.HiddenMarkovModelTrainer()
    # fall back to the default estimator when no "-laplace" flag is given,
    # so tagger is always defined
    if len(sys.argv) > 2 and sys.argv[2] == "-laplace":
        tagger = trainer.train_supervised(labeled_data, LaplaceProbDist)
    else:
        tagger = trainer.train_supervised(labeled_data)
    test_data = testPrep(files["test_data"].read())
    comparison = testPrep(files["test_txt"].read())
    results = 0
    for element in test_data:
        output = tagger.tag(element)
        results += accuracy(output, comparison[0])
        comparison.pop(0)
    return results / len(test_data)
def fit(self, data):
    """
    Fits a tagging model to object's data based on object's tagger name
    :return: a tagger object
    """
    tagger = None
    self.X = data
    if self.tagger_type == 'hmm':
        # Set up a trainer with default (None) values and train on the data
        trainer = hmm.HiddenMarkovModelTrainer()
        tagger = trainer.train_supervised(data)
    elif self.tagger_type == 'crf':
        trainer = CRFTagger()
        trainer.train(self.train_data, 'model.crf.tagger')
        tagger = trainer
    self.tagger = tagger
    return tagger
def ie_preprocess(data):
    # train_data is assumed to be defined at module level
    trainer = hmm.HiddenMarkovModelTrainer()
    tagger = trainer.train_supervised(train_data)
    print(tagger)
    return tagger.tag(data.split())
train_data = treebank.tagged_sents()[:3900]

# # Loading the previously trained HMM model
#
# Structure of the training data. Keep in mind that the convention differs from
# UPOS, since the dataset is old and therefore follows another convention. The
# algorithm works with any convention.

# In[ ]:

train_data

# Pre-built HMM in NLTK

# In[ ]:

from nltk.tag import hmm
tagger = hmm.HiddenMarkovModelTrainer().train_supervised(train_data)
tagger

# In[ ]:

tagger.tag("Pierre Vinken will get old".split())

# Training accuracy

# In[ ]:

tagger.evaluate(treebank.tagged_sents()[:3900])

# ## Practice exercise
#
# **Objective:** Train an HMM using the `hmm.HiddenMarkovModelTrainer()` class
# on the `UD_Spanish_AnCora` dataset.
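# A minimal sketch for the exercise (the file path is an assumption; AnCora is
# distributed in CoNLL-U format, where column 2 is FORM and column 4 is UPOS):

# In[ ]:

def read_conllu(path):
    sents, sent = [], []
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:                      # blank line ends a sentence
                if sent:
                    sents.append(sent)
                    sent = []
            elif not line.startswith('#'):    # skip comment lines
                cols = line.split('\t')
                if cols[0].isdigit():         # skip multiword ranges like "1-2"
                    sent.append((cols[1], cols[3]))   # (FORM, UPOS)
    if sent:
        sents.append(sent)
    return sents

ancora_train = read_conllu('UD_Spanish-AnCora/es_ancora-ud-train.conllu')
ancora_tagger = hmm.HiddenMarkovModelTrainer().train_supervised(ancora_train)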
            tag_prevtag = key + '|' + key2
            if tag_prevtag in transitionProbdict:
                if viterbiProb[tag_row2, col-1] > 0:
                    possible_probs.append(
                        viterbiProb[tag_row2, col-1]
                        * transitionProbdict[tag_prevtag]
                        * emissionProbdict[word_tag])
        ## Now pick the maximum of the candidates for this column
        viterbiProb[tag_row, col] = max(possible_probs)

    ## Build the tag sequence (word and tag)
    res = []
    for i, p in enumerate(seq):
        for tag in tagStateDict.keys():
            if tagStateDict[tag] == np.argmax(viterbiProb[:, i]):
                res.append((p, tag))
    return res

vector = ViterbiTags('el mundo es pequeño')
# print(vector)

## Training can now be done directly with NLTK
train_data = treebank.tagged_sents()[:3900]
# print(train_data)
tagger = hmm.HiddenMarkovModelTrainer().train_supervised(train_data)
test = tagger.tag('Pierre Vinken will get old'.split())
print(test)
check = tagger.evaluate(train_data)
print(check)
    y = train_y_characters[i]
    train_data.append((x, y))
train_data = [train_data]

for i in range(0, len(test_x_characters)):
    x = test_x_characters[i]
    y = test_y_characters[i]
    test_data.append((x, y))
test_data = [test_data]

## Without segmentation
## The processed 3rd cipher cannot use this line
states = symbols = x_list
hmm_trainer = hmm.HiddenMarkovModelTrainer(states, symbols)

## Laplace smoothing
if laplace_improved:
    hmm_model = hmm_trainer.train_supervised(train_data, estimator=pb.LaplaceProbDist)
else:
    hmm_model = hmm_trainer.train_supervised(train_data)

## Try different functions
result = hmm_model.best_path(test_x_characters)
result_acc = hmm_model.evaluate(test_data)

results = ''.join(result)
def train_tagger(self, train_data):
    """ Trains an HMM POS tagger """
    self.trainer = hmm.HiddenMarkovModelTrainer()
    self.tagger = self.trainer.train_supervised(train_data)
def __init__(self, labeled_sequences, substitute):
    self.substitute = substitute
    trainer = hmm.HiddenMarkovModelTrainer()
    self.my_tagger = trainer.train_supervised(self.process_lines(labeled_sequences))
def main():
    file_path = r'fi-ud-train.pos-tagged.txt'
    tagged_sents = read_tagged_sents(file_path)
    random.shuffle(tagged_sents)
    # Copy the 5 first sentences so that the words that will be replaced with
    # '<UNK>' can be used when printing
    ref_sents = copy.deepcopy(tagged_sents[:5])
    size = int(len(tagged_sents) * 0.1)
    train_set, test_set = tagged_sents[size:], tagged_sents[:size]

    # Make 2 list variables that consist only of (w, t) tuples so the word
    # amount is easier to count
    train_set_words = list(itertools.chain.from_iterable(train_set))
    test_set_words = list(itertools.chain.from_iterable(test_set))

    # Frequencies of words in the train_set
    train_set_wt_freqs = Counter(train_set_words)

    # Go through train_set and change words with frequencies below 3 to '<UNK>'
    for i, sent in enumerate(train_set):
        for j, (word, tag) in enumerate(sent):
            if train_set_wt_freqs[(word, tag)] < 3:
                sent[j] = ('<UNK>', tag)
        if i > 500:  # For the 500 first sentences
            break

    unk_words = []
    # Go through test_set and change words that don't appear in the train_set
    # into '<UNK>'
    for sent in test_set:
        for i, (word, tag) in enumerate(sent):
            if (word, tag) not in train_set_words:
                unk_words.append((word, tag))
                sent[i] = ('<UNK>', tag)

    UNK_rel_freq = len(unk_words) / len(test_set_words)
    print("Relative frequency of unknown words in the test set: {}\n".format(
        UNK_rel_freq))

    trainer = hmm.HiddenMarkovModelTrainer()
    tagger = trainer.train_supervised(train_set)
    print("HMM based POS tagger's accuracy: {}".format(
        tagger.evaluate(test_set)))

    # List of the 5 first sentences in the test_set
    print_sents = [[word for word, tag in tagged_sent]
                   for tagged_sent in tagged_sents[:5]]

    print('\n\n5 sentences tagged by the ConsecutivePosTagger:\n')
    for i, sent in enumerate(print_sents):
        print("Sentence {}:".format(i + 1))
        tagged_sent = tagger.tag(sent)
        # Add the actual word in front of the possible '<UNK>'
        for j, (word, tag) in enumerate(tagged_sent):
            ref_word = ref_sents[i][j][0]
            tagged_sent[j] = (re.sub(r'(<UNK>)', ref_word + r'\1', word), tag)
        print(tagged_sent, "\n")
def main():
    parser = argparse.ArgumentParser(description='Text decipher options')
    parser.add_argument('cipher_folder', help='cipher data folder')
    parser.add_argument('--laplace', '-laplace', action='store_true', default=False,
                        help='Laplace Smoothing')
    parser.add_argument('--langmod', '-lm', action='store_true', default=False,
                        help='Improved decoder')
    args = parser.parse_args()
    cipher_folder = args.cipher_folder
    laplace = args.laplace
    langmod = args.langmod
    number_of_supp_lines = 100  # the more lines, the slower the code!

    train_data, test_data, train_plain = get_data(cipher_folder)
    preprocess_supp_data()
    supp_data = read_preprocessed_supp_data(number_of_supp_lines)
    for line in train_plain:
        # this is so later we have all the transitions in the same place
        supp_data.extend(list(line))

    if laplace:
        smoothing = LaplaceProbDist
    else:
        smoothing = MLEProbDist

    trainer = hmm.HiddenMarkovModelTrainer()
    decoder = trainer.train_supervised(train_data, smoothing)
    # decoder_supp = trainer_supp.train_unsupervised(supp_data, update_outputs=False, model=decoder)
    # Because there's a bug in train_unsupervised (although I found out how to
    # fix it!), I will have to do this manually... Code copied from the NLTK
    # train_supervised method. Here we update the transition data to include
    # our supplemental data.
    if langmod:
        states = decoder._states
        symbols = decoder._symbols
        outputs = decoder._outputs
        priors = decoder._priors
        starting = FreqDist()
        # why we needed all the transitions in the same place
        transitions = ConditionalFreqDist()
        for sequence in supp_data:
            lasts = None
            for state in sequence:
                if lasts is None:
                    starting[state] += 1
                else:
                    transitions[lasts][state] += 1
                lasts = state
        if laplace:
            estimator = LaplaceProbDist
        else:
            # getting this straight from the source code
            estimator = lambda fdist, bins: MLEProbDist(fdist)
        N = len(states)
        pi = estimator(starting, N)
        A = ConditionalProbDist(transitions, estimator, N)
        # The output distributions are already defined by our previously
        # trained model; we don't have new ones!
        decoder = HiddenMarkovModelTagger(symbols, states, A, outputs, pi)

    decoder.test(test_data)
    for sent in test_data:
        print("".join([y[1] for y in decoder.tag([x[0] for x in sent])]))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-laplace', help="adds laplace smoothing", action="store_true")
    parser.add_argument(
        '-lm',
        help="informs the character transitions in english using extra-text",
        action='store_true')
    parser.add_argument('cipher', help='select the cipher that you want to decode', type=str)
    args = parser.parse_args()

    path_to_directory = os.path.abspath(os.path.curdir)
    path_to_cipher = os.path.join(path_to_directory, args.cipher)

    if args.laplace:
        estimation_method = LaplaceProbDist
    else:
        estimation_method = MLEProbDist

    cipher_test, cipher_train, plaintext_test, plaintext_train = read_in_ciphers(
        path_to_cipher)

    training_set = []
    for i in range(len(cipher_train)):
        training_units = turn_training_observation_into_nltk_format(
            cipher_train[i], plaintext_train[i])
        training_set.append(training_units)

    hidden_markov_trainer = hmm.HiddenMarkovModelTrainer()
    tagger = hidden_markov_trainer.train_supervised(
        training_set, estimator=estimation_method)

    if args.lm:
        with open('frankenstein_ulysses_hrtofdarkness.txt', 'rb') as f:
            extra_text = f.read()
        # work on the printable repr of the bytes and strip escape sequences
        extra_text = str(extra_text)
        extra_text = extra_text.replace(r'\r', '')
        extra_text = extra_text.replace(r'\n', '')
        extra_text = extra_text.replace(r'\x', '')
        extra_text = clean_additional_text(extra_text)
        additional_text_transitions = find_transition_frequency(extra_text)
        original_text_transitions = find_transition_frequency(
            ''.join(plaintext_train))
        combined_transition_frequency = (additional_text_transitions
                                         + original_text_transitions)
        tagger._transitions = ConditionalProbDist(
            combined_transition_frequency, estimation_method,
            len(combined_transition_frequency.keys()))

    test_set = turn_test_cipher_into_nltk_format(cipher_test)
    predictions = [tagger.tag(test_sentence) for test_sentence in test_set]
    predicted_sequence = extract_predicted_sequence(predictions)
    recomposed_sentences = [''.join(sentence) for sentence in predicted_sequence]

    print('\n')
    print('These sentences were decoded using the hidden markov model: \n')
    for sentence in recomposed_sentences:
        print(sentence)
        print('\n')

    whole_text_accuracy = find_total_accuracy(recomposed_sentences, plaintext_test)
    print('The accuracy for the whole text was %s' % whole_text_accuracy)
plain_path = cipher_folder + '/train_plain.txt'
cipher_path = cipher_folder + '/train_cipher.txt'
cipher_train = get_text(cipher_path)
plain_train = get_text(plain_path)
# format the training data
train_data = format_data(cipher_train, plain_train)

# test data
testc_path = cipher_folder + '/test_cipher.txt'
testp_path = cipher_folder + '/test_plain.txt'
testc = get_text(testc_path)
testp = get_text(testp_path)
# format the test data
test_data = format_data(testc, testp)

trainer = hmm.HiddenMarkovModelTrainer()
# Laplace estimator
my_estimator = lambda fdist, bins: LaplaceProbDist(fdist, bins)
if args.laplace_smoothing:
    if args.supplement:
        tagger = train_supervised2(trainer, train_data, extra_text(),
                                   estimator=my_estimator)
    else:
        tagger = trainer.train_supervised(train_data, estimator=my_estimator)
else:
    if args.supplement:
current_sentence = []
test = []
for i in range(0, len(test_file) - 1):
    current_line = test_file[i]
    word_tag = current_line.split('\t\t')
    words.add(word_tag[0])
    tags.add(word_tag[1])
    current_sentence.append((word_tag[0], word_tag[1]))
    if word_tag[0] == '.':
        test.append(current_sentence)
        current_sentence = []

tags = list(tags)
words = list(words)
trainer = hmm.HiddenMarkovModelTrainer(tags, words)
# tagger = trainer.train_supervised(train, estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))
# tagger = trainer.train_supervised(train, estimator=lambda fd, bins: MLEProbDist(fd))
tagger = trainer.train_supervised(
    train, estimator=lambda fd, bins: SimpleGoodTuringProbDist(fd, bins))
# tagger = trainer.train_supervised(train, estimator=lambda fd, bins: WittenBellProbDist(fd, bins))
# tagger = trainer.train_supervised(train, estimator=lambda fd, bins: KneserNeyProbDist(fd, bins))
print("here")

predicted = []
real = []
for i in range(0, len(test) - 1):
    current = list(zip(*test[i]))
    tagged = tagger.tag(list(current[0]))
    current_tags = list(list(zip(*tagged))[1])
    predicted += current_tags